From 1d6c33d2a59f6f9dd23e7db61618a0f0aa3c1600 Mon Sep 17 00:00:00 2001
From: Nicolas De Carli <ndecarli@meta.com>
Date: Sun, 2 Mar 2025 08:05:21 -0800
Subject: [PATCH 001/500] Enable hardware accelerated crc32c for ARM on Linux
 (#13432)

Summary:
Pull Request resolved: https://github.com/facebook/rocksdb/pull/13432

We've noticed the default CRC32c function gets executed when running on aarch64 cpus within our servers

Issue is that ROCKSDB_AUXV_GETAUXVAL_PRESENT evaluates to false

This fix allows the usage of hardware-accelerated crc32 within our fleet

Reviewed By: jaykorean

Differential Revision: D70423483

fbshipit-source-id: 601da3fbf156e3e40695eb76ee5d37f67f83d427
---
 util/crc32c_arm64.cc | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/util/crc32c_arm64.cc b/util/crc32c_arm64.cc
index f9753c0aba9b..842a692775e0 100644
--- a/util/crc32c_arm64.cc
+++ b/util/crc32c_arm64.cc
@@ -10,7 +10,7 @@
 #if defined(__linux__)
 #include <asm/hwcap.h>
 #endif
-#ifdef ROCKSDB_AUXV_GETAUXVAL_PRESENT
+#if defined(ROCKSDB_AUXV_GETAUXVAL_PRESENT) || defined(__linux__)
 #include <sys/auxv.h>
 #endif
 #ifndef HWCAP_CRC32
@@ -52,9 +52,10 @@
 extern bool pmull_runtime_flag;
 
 uint32_t crc32c_runtime_check(void) {
-#if defined(ROCKSDB_AUXV_GETAUXVAL_PRESENT) || defined(__FreeBSD__)
+#if defined(ROCKSDB_AUXV_GETAUXVAL_PRESENT) || defined(__FreeBSD__) || \
+    defined(__linux__)
   uint64_t auxv = 0;
-#if defined(ROCKSDB_AUXV_GETAUXVAL_PRESENT)
+#if defined(ROCKSDB_AUXV_GETAUXVAL_PRESENT) || defined(__linux__)
   auxv = getauxval(AT_HWCAP);
 #elif defined(__FreeBSD__)
   elf_aux_info(AT_HWCAP, &auxv, sizeof(auxv));
@@ -81,9 +82,10 @@ uint32_t crc32c_runtime_check(void) {
 }
 
 bool crc32c_pmull_runtime_check(void) {
-#if defined(ROCKSDB_AUXV_GETAUXVAL_PRESENT) || defined(__FreeBSD__)
+#if defined(ROCKSDB_AUXV_GETAUXVAL_PRESENT) || defined(__FreeBSD__) || \
+    defined(__linux__)
   uint64_t auxv = 0;
-#if defined(ROCKSDB_AUXV_GETAUXVAL_PRESENT)
+#if defined(ROCKSDB_AUXV_GETAUXVAL_PRESENT) || defined(__linux__)
   auxv = getauxval(AT_HWCAP);
 #elif defined(__FreeBSD__)
   elf_aux_info(AT_HWCAP, &auxv, sizeof(auxv));

From 7e272d20329449a7eb8d158d283e1635d029909a Mon Sep 17 00:00:00 2001
From: Changyu Bi <changyubi@meta.com>
Date: Mon, 3 Mar 2025 15:21:10 -0800
Subject: [PATCH 002/500] Update MultiGet to provide consistent CF view for
 kPersistedTier (#13433)

Summary:
when reading with ReadOptions::read_tier = kPersistedTier and with a snapshot, MultiGet allows the case where some CF is read before a flush and some CF is read after the flush. This is not desirable, especially when atomic_flush is enabled and users use MultiGet to do some consistency checks on the data in SST files. This PR updates the code path for SuperVersion acquisition to get a consistent view across when kPersistedTier is used.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13433

Test Plan: a new unit test that could be flaky without this change.

Reviewed By: jaykorean

Differential Revision: D70509688

Pulled By: cbi42

fbshipit-source-id: 80de96f94407af9bb2062b6a185c61f65827c092
---
 db/db_basic_test.cc                           | 63 +++++++++++++++++++
 db/db_impl/db_impl.cc                         | 30 +++++----
 .../persisted-tier-multiget.md                |  1 +
 3 files changed, 81 insertions(+), 13 deletions(-)
 create mode 100644 unreleased_history/behavior_changes/persisted-tier-multiget.md

diff --git a/db/db_basic_test.cc b/db/db_basic_test.cc
index edb10693affd..27ef6124943a 100644
--- a/db/db_basic_test.cc
+++ b/db/db_basic_test.cc
@@ -3354,6 +3354,69 @@ TEST_F(DBBasicTest, MultiGetIOBufferOverrun) {
                      keys.data(), values.data(), statuses.data(), true);
 }
 
+TEST_F(DBBasicTest, MultiGetWithSnapshotsAndPersistedTier) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.atomic_flush = true;
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"cf1", "cf2"}, options);
+
+  // Insert initial data
+  ASSERT_OK(Put(0, "key1", "value1_cf0"));
+  ASSERT_OK(Put(1, "key1", "value1_cf1"));
+  ASSERT_OK(Put(2, "key1", "value1_cf2"));
+  ASSERT_OK(Flush({0, 1, 2}));
+  for (auto cf : {0, 1, 2}) {
+    ASSERT_EQ(1, NumTableFilesAtLevel(0, cf));
+  }
+
+  ASSERT_OK(Put(0, "key1", "value2_cf0"));
+  ASSERT_OK(Put(1, "key1", "value2_cf1"));
+  ASSERT_OK(Put(2, "key1", "value2_cf2"));
+
+  // Prepare for concurrent atomic flush
+  std::atomic<bool> flush_done(false);
+  std::thread flush_thread([&]() {
+    ASSERT_OK(Flush({0, 1, 2}));
+    flush_done.store(true);
+  });
+
+  // Perform MultiGet with snapshot and read_tier = kPersistentTier
+  ReadOptions ro;
+  const Snapshot* snapshot = db_->GetSnapshot();
+  ro.snapshot = snapshot;
+  ro.read_tier = kPersistedTier;
+
+  std::string k = "key1";
+  std::vector<Slice> keys(3, Slice(k));
+  std::vector<Status> statuses(keys.size());
+  std::vector<ColumnFamilyHandle*> cfs(keys.size());
+  std::vector<Slice> new_keys(keys.size());
+  std::vector<PinnableSlice> pin_values(keys.size());
+  for (size_t i = 0; i < keys.size(); ++i) {
+    cfs[i] = handles_[i];
+  }
+  db_->MultiGet(ro, cfs.size(), cfs.data(), keys.data(), pin_values.data(),
+                statuses.data());
+  for (const auto& s : statuses) {
+    ASSERT_OK(s);
+  }
+
+  if (pin_values[0] == "value1_cf0") {
+    // Check if the first value matches expected value
+    ASSERT_EQ(pin_values[1], "value1_cf1");
+    ASSERT_EQ(pin_values[2], "value1_cf2");
+  } else {
+    // If first value doesn't match, check if we got the updated values
+    ASSERT_EQ(pin_values[0], "value2_cf0");
+    ASSERT_EQ(pin_values[1], "value2_cf1");
+    ASSERT_EQ(pin_values[2], "value2_cf2");
+  }
+
+  flush_thread.join();
+  db_->ReleaseSnapshot(snapshot);
+}
+
 TEST_F(DBBasicTest, IncrementalRecoveryNoCorrupt) {
   Options options = CurrentOptions();
   DestroyAndReopen(options);
diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc
index 96613dfad050..be175614d881 100644
--- a/db/db_impl/db_impl.cc
+++ b/db/db_impl/db_impl.cc
@@ -2704,7 +2704,7 @@ Status DBImpl::MultiCFSnapshot(const ReadOptions& read_options,
     }
   };
 
-  bool last_try = false;
+  bool acquire_mutex = false;
   if (cf_list->size() == 1) {
     // Fast path for a single column family. We can simply get the thread local
     // super version
@@ -2753,29 +2753,32 @@ Status DBImpl::MultiCFSnapshot(const ReadOptions& read_options,
     // sure.
     constexpr int num_retries = 3;
     for (int i = 0; i < num_retries; ++i) {
-      last_try = (i == num_retries - 1);
+      // When reading from kPersistedTier, we want a consistent view into CFs.
+      // So we take mutex to prevent any SV change in any CF.
+      acquire_mutex = ((i == num_retries - 1) && !read_options.snapshot) ||
+                      read_options.read_tier == kPersistedTier;
       bool retry = false;
 
       if (i > 0) {
         sv_cleanup_func();
       }
       if (read_options.snapshot == nullptr) {
-        if (last_try) {
-          TEST_SYNC_POINT("DBImpl::MultiCFSnapshot::LastTry");
-          // We're close to max number of retries. For the last retry,
-          // acquire the lock so we're sure to succeed
-          mutex_.Lock();
-        }
         *snapshot = GetLastPublishedSequence();
       } else {
         *snapshot =
             static_cast_with_check<const SnapshotImpl>(read_options.snapshot)
                 ->number_;
       }
+      if (acquire_mutex) {
+        TEST_SYNC_POINT("DBImpl::MultiCFSnapshot::LastTry");
+        // We're close to max number of retries. For the last retry,
+        // acquire the lock so we're sure to succeed
+        mutex_.Lock();
+      }
       for (auto cf_iter = cf_list->begin(); cf_iter != cf_list->end();
            ++cf_iter) {
         auto node = iter_deref_func(cf_iter);
-        if (!last_try) {
+        if (!acquire_mutex) {
           if (extra_sv_ref) {
             node->super_version = node->cfd->GetReferencedSuperVersion(this);
           } else {
@@ -2799,7 +2802,7 @@ Status DBImpl::MultiCFSnapshot(const ReadOptions& read_options,
           }
         }
         TEST_SYNC_POINT("DBImpl::MultiCFSnapshot::BeforeCheckingSnapshot");
-        if (read_options.snapshot != nullptr || last_try) {
+        if (read_options.snapshot != nullptr || acquire_mutex) {
           // If user passed a snapshot, then we don't care if a memtable is
           // sealed or compaction happens because the snapshot would ensure
           // that older key versions are kept around. If this is the last
@@ -2810,7 +2813,7 @@ Status DBImpl::MultiCFSnapshot(const ReadOptions& read_options,
         // memtables, which will include immutable memtables as well, but that
         // might be tricky to maintain in case we decide, in future, to do
         // memtable compaction.
-        if (!last_try) {
+        if (!acquire_mutex) {
           SequenceNumber seq =
               node->super_version->mem->GetEarliestSequenceNumber();
           if (seq > *snapshot) {
@@ -2820,19 +2823,20 @@ Status DBImpl::MultiCFSnapshot(const ReadOptions& read_options,
         }
       }
       if (!retry) {
-        if (last_try) {
+        if (acquire_mutex) {
           mutex_.Unlock();
           TEST_SYNC_POINT("DBImpl::MultiCFSnapshot::AfterLastTryRefSV");
         }
         break;
       }
+      assert(!acquire_mutex);
     }
   }
 
   TEST_SYNC_POINT("DBImpl::MultiCFSnapshot:AfterGetSeqNum1");
   TEST_SYNC_POINT("DBImpl::MultiCFSnapshot:AfterGetSeqNum2");
   PERF_TIMER_STOP(get_snapshot_time);
-  *sv_from_thread_local = !last_try;
+  *sv_from_thread_local = !acquire_mutex;
   if (!s.ok()) {
     sv_cleanup_func();
   }
diff --git a/unreleased_history/behavior_changes/persisted-tier-multiget.md b/unreleased_history/behavior_changes/persisted-tier-multiget.md
new file mode 100644
index 000000000000..9e7ae56a98c0
--- /dev/null
+++ b/unreleased_history/behavior_changes/persisted-tier-multiget.md
@@ -0,0 +1 @@
+* MultiGet with snapshot and ReadOptions::read_tier = kPersistedTier will now read a consistent view across CFs (instead of potentially reading some CF before and some CF after a flush).

From 0c7e5bd2f0d73d96f620f912a4569ca5375d1786 Mon Sep 17 00:00:00 2001
From: Sean Ovens <seangovens@gmail.com>
Date: Mon, 3 Mar 2025 21:25:29 -0800
Subject: [PATCH 003/500] Shrink size of HashSkipList buckets from 56B to 48B
 (#13424)

Summary:
Previous order of fields in SkipList:

`const uint16_t kMaxHeight_;  // 2B`
`const uint16_t kBranching_;  // 2B`
`const uint32_t kScaledInverseBranching_;  // 4B`
`Comparator const compare_;  // 8B`
`Allocator* const allocator_;  // 8B`
`Node* const head_;  // 8B`
`std::atomic<int> max_height_;  // 4B`
`// 4B padding added automatically for alignment`
`Node** prev_;  // 8B`
`int32_t prev_height_;  // 4B`
`// 4B padding added automatically for alignment`

= 56B in total. By swapping prev_ and prev_height_, we get the following:

`const uint16_t kMaxHeight_;  // 2B`
`const uint16_t kBranching_;  // 2B`
`const uint32_t kScaledInverseBranching_;  // 4B`
`Comparator const compare_;  // 8B`
`Allocator* const allocator_;  // 8B`
`Node* const head_;  // 8B`
`std::atomic<int> max_height_;  // 4B`
`int32_t prev_height_;  // 4B`
`Node** prev_;  // 8B`

= 48B in total. So this change saves 8B per SkipList object. When allocated using AllocateAligned (as is the case for the [hash skiplist](https://github.com/facebook/rocksdb/blob/main/memtable/hash_skiplist_rep.cc#L243)) and assuming alignof(std::max_align_t) = 16, this change saves an additional 8B per SkipList object (so 16B in total).

Note: this does not affect the "skiplist" memtable, which internally uses InlineSkipList

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13424

Reviewed By: cbi42

Differential Revision: D70423252

Pulled By: pdillinger

fbshipit-source-id: 450dcc7f0e9e86cd3481f6930e83eea5fef78b97
---
 memtable/skiplist.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/memtable/skiplist.h b/memtable/skiplist.h
index f2e2a829de3b..c2774d6ceeb4 100644
--- a/memtable/skiplist.h
+++ b/memtable/skiplist.h
@@ -134,8 +134,8 @@ class SkipList {
   // i up to max_height_ is the predecessor of prev_[0] and prev_height_
   // is the height of prev_[0].  prev_[0] can only be equal to head before
   // insertion, in which case max_height_ and prev_height_ are 1.
-  Node** prev_;
   int32_t prev_height_;
+  Node** prev_;
 
   inline int GetMaxHeight() const {
     return max_height_.load(std::memory_order_relaxed);

From 5f9b7ccce32ecc05214461c160b0f13644582be8 Mon Sep 17 00:00:00 2001
From: Nicolas De Carli <ndecarli@meta.com>
Date: Tue, 4 Mar 2025 16:51:19 -0800
Subject: [PATCH 004/500] Add ROCKSDB_AUXV_GETAUXVAL_PRESENT flag to defs.bzl
 (#13435)

Summary:
Pull Request resolved: https://github.com/facebook/rocksdb/pull/13435

We've noticed the default CRC32c function gets executed when running on aarch64 cpus within our servers

Issue is that ROCKSDB_AUXV_GETAUXVAL_PRESENT evaluates to false

This fix enables the flag internally and reverts the previous fix, landed with D70423483

Reviewed By: pdillinger

Differential Revision: D70584250

fbshipit-source-id: 28e41316187c474fdfaf854f301ad14b6721fcad
---
 util/crc32c_arm64.cc | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/util/crc32c_arm64.cc b/util/crc32c_arm64.cc
index 842a692775e0..4bccb75bc792 100644
--- a/util/crc32c_arm64.cc
+++ b/util/crc32c_arm64.cc
@@ -10,7 +10,7 @@
 #if defined(__linux__)
 #include <asm/hwcap.h>
 #endif
-#if defined(ROCKSDB_AUXV_GETAUXVAL_PRESENT) || defined(__linux__)
+#if defined(ROCKSDB_AUXV_GETAUXVAL_PRESENT)
 #include <sys/auxv.h>
 #endif
 #ifndef HWCAP_CRC32
@@ -52,10 +52,9 @@
 extern bool pmull_runtime_flag;
 
 uint32_t crc32c_runtime_check(void) {
-#if defined(ROCKSDB_AUXV_GETAUXVAL_PRESENT) || defined(__FreeBSD__) || \
-    defined(__linux__)
+#if defined(ROCKSDB_AUXV_GETAUXVAL_PRESENT) || defined(__FreeBSD__)
   uint64_t auxv = 0;
-#if defined(ROCKSDB_AUXV_GETAUXVAL_PRESENT) || defined(__linux__)
+#if defined(ROCKSDB_AUXV_GETAUXVAL_PRESENT)
   auxv = getauxval(AT_HWCAP);
 #elif defined(__FreeBSD__)
   elf_aux_info(AT_HWCAP, &auxv, sizeof(auxv));
@@ -82,10 +81,9 @@ uint32_t crc32c_runtime_check(void) {
 }
 
 bool crc32c_pmull_runtime_check(void) {
-#if defined(ROCKSDB_AUXV_GETAUXVAL_PRESENT) || defined(__FreeBSD__) || \
-    defined(__linux__)
+#if defined(ROCKSDB_AUXV_GETAUXVAL_PRESENT) || defined(__FreeBSD__)
   uint64_t auxv = 0;
-#if defined(ROCKSDB_AUXV_GETAUXVAL_PRESENT) || defined(__linux__)
+#if defined(ROCKSDB_AUXV_GETAUXVAL_PRESENT)
   auxv = getauxval(AT_HWCAP);
 #elif defined(__FreeBSD__)
   elf_aux_info(AT_HWCAP, &auxv, sizeof(auxv));

From da8eba8b4927ae92be24b4dab3417187ca979d66 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Tue, 4 Mar 2025 17:44:01 -0800
Subject: [PATCH 005/500] Improve consistency of SeqnoToTime tracking in
 SuperVersion (#13316)

Summary:
This is an unexpectedly complex follow-up to https://github.com/facebook/rocksdb/issues/13269.

This change solves (and detects regressed) inconsistencies between whether a CF's SuperVersion is configured with a preserve/preclude option and whether it gets a usable SeqnoToTimeMapping. Operating with preserve/preclude and no usable mapping is degraded functionality we need to avoid. And no mapping is useful for actually disabling the feature (except with respect to existing SST files, but that's less of a concern for now).

The challenge is that how we maintain the DB's SeqnoToTimeMapping can depend on all the column families, and we don't want to iterate over all column families *for each column family* (e.g. on initially creating each). The existing code was a bit relaxed:
* On initially creating or re-configuring a CF, we might install an empty mapping, but soon thereafter (after releasing and re-acquiring the DB mutex) re-install another SuperVersion with a useful mapping.

The solution here is to refactor the logic so that there's a distinct but related workflow for (a) ensuring a quality set of mappings when we might only be considering a single CF (`EnsureSeqnoToTimeMapping()`), and (b) massaging that set of mappings to account for all CFs (`RegisterRecordSeqnoTimeWorker`) which doesn't need to re-install new SuperVersions because each CF already has good mappings and will get updated SuperVersions when the periodic task adds new mappings. This should eliminate the extra SuperVersion installs associated with preserve/preclude on CF creation or re-configure, making it the same as any other CF.

Some more details:
* Some refactorings such as removing new_seqno_to_time_mapping from SuperVersionContext. (Now use parameter instead of being stateful.)
* Propagate `read_only` aspect of DB to more places so that we can pro-actively disable preserve/preclude on read-only DBs, so that we don't run afoul of the assertion expecting SeqnoToTime entries.
* Introduce a utility struct `MinAndMaxPreserveSeconds` for aggregating preserve/preclude settings in a useful way, sometimes on one CF and sometimes across multiple CFs. Much cleaner! (IMHO)
* Introduce a function `InstallSuperVersionForConfigChange` that is a superset of `InstallSuperVersionAndScheduleWork` for when a CF is new or might have had a change to its mutable options.
* Eliminate redundant re-install SuperVersions of created "missing" CFs in DBImpl::Open.

Intended follow-up:
* Ensure each flush has an "upper bound" SeqnoToTime entry, which would resolve a FIXME in tiered_compaction_test, but causes enough test churn to deserve its own PR + investigation.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13316

Test Plan:
This change is primarily validated by a new assertion in SuperVersion::Init to ensure consistency between (a) presence of any SeqnoToTime mappings in the SuperVersion and (b) preserve/preclude option being currently set.

One unit test update was needed because we now ensure at least one SeqnoToTime entry is created on any DB::Open with preserve/preclude, so that there is a lower bound time on all the future data writes. This required a small hack in associating the time with Seqno 1 instead of 0, which is reserved for "unspecified old."

Reviewed By: cbi42

Differential Revision: D70540638

Pulled By: pdillinger

fbshipit-source-id: bb419fdbeb5a1f115fc429c211f9b8efaf2f56d7
---
 db/column_family.cc                    |  59 ++--
 db/column_family.h                     |  42 ++-
 db/column_family_test.cc               |   7 +-
 db/compaction/compaction_job.cc        |   3 +-
 db/db_impl/db_impl.cc                  | 377 ++++++++++++-------------
 db/db_impl/db_impl.h                   |  54 ++--
 db/db_impl/db_impl_compaction_flush.cc |  15 +-
 db/db_impl/db_impl_open.cc             |  32 ++-
 db/db_options_test.cc                  |   3 +-
 db/flush_job.cc                        |   9 +-
 db/job_context.h                       |   9 +-
 db/repair.cc                           |   8 +-
 db/seqno_time_test.cc                  |   5 +-
 db/seqno_to_time_mapping.cc            |   4 +-
 db/seqno_to_time_mapping.h             |  48 +++-
 db/version_edit_handler.cc             |   4 +-
 db/version_set.cc                      |   7 +-
 db/version_set.h                       |   2 +-
 tools/db_crashtest.py                  |   2 +-
 19 files changed, 381 insertions(+), 309 deletions(-)

diff --git a/db/column_family.cc b/db/column_family.cc
index 2c1ad930ab01..6b54454f8ff5 100644
--- a/db/column_family.cc
+++ b/db/column_family.cc
@@ -200,6 +200,7 @@ const uint64_t kDefaultPeriodicCompSecs = 0xfffffffffffffffe;
 }  // anonymous namespace
 
 ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options,
+                                    bool read_only,
                                     const ColumnFamilyOptions& src) {
   ColumnFamilyOptions result = src;
   size_t clamp_max = std::conditional<
@@ -435,6 +436,18 @@ ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options,
     result.periodic_compaction_seconds = 0;
   }
 
+  if (read_only && (result.preserve_internal_time_seconds > 0 ||
+                    result.preclude_last_level_data_seconds > 0)) {
+    // With no writes coming in, we don't need periodic SeqnoToTime entries.
+    // Existing SST files may or may not have that info associated with them.
+    ROCKS_LOG_WARN(
+        db_options.info_log.get(),
+        "preserve_internal_time_seconds and preclude_last_level_data_seconds "
+        "are ignored in read-only DB");
+    result.preserve_internal_time_seconds = 0;
+    result.preclude_last_level_data_seconds = 0;
+  }
+
   return result;
 }
 
@@ -492,6 +505,17 @@ void SuperVersion::Init(
   imm->Ref();
   current->Ref();
   refs.store(1, std::memory_order_relaxed);
+
+  // There should be at least one mapping entry iff time tracking is enabled.
+#ifndef NDEBUG
+  MinAndMaxPreserveSeconds preserve_info{mutable_cf_options};
+  if (preserve_info.IsEnabled()) {
+    assert(seqno_to_time_mapping);
+    assert(!seqno_to_time_mapping->Empty());
+  } else {
+    assert(seqno_to_time_mapping == nullptr);
+  }
+#endif  // NDEBUG
 }
 
 namespace {
@@ -530,7 +554,7 @@ ColumnFamilyData::ColumnFamilyData(
     const FileOptions* file_options, ColumnFamilySet* column_family_set,
     BlockCacheTracer* const block_cache_tracer,
     const std::shared_ptr<IOTracer>& io_tracer, const std::string& db_id,
-    const std::string& db_session_id)
+    const std::string& db_session_id, bool read_only)
     : id_(id),
       name_(name),
       dummy_versions_(_dummy_versions),
@@ -540,7 +564,7 @@ ColumnFamilyData::ColumnFamilyData(
       dropped_(false),
       flush_skip_reschedule_(false),
       internal_comparator_(cf_options.comparator),
-      initial_cf_options_(SanitizeOptions(db_options, cf_options)),
+      initial_cf_options_(SanitizeOptions(db_options, read_only, cf_options)),
       ioptions_(db_options, initial_cf_options_),
       mutable_cf_options_(initial_cf_options_),
       is_delete_range_supported_(
@@ -1339,20 +1363,17 @@ bool ColumnFamilyData::ReturnThreadLocalSuperVersion(SuperVersion* sv) {
   return false;
 }
 
-void ColumnFamilyData::InstallSuperVersion(SuperVersionContext* sv_context,
-                                           InstrumentedMutex* db_mutex) {
+void ColumnFamilyData::InstallSuperVersion(
+    SuperVersionContext* sv_context, InstrumentedMutex* db_mutex,
+    std::optional<std::shared_ptr<SeqnoToTimeMapping>>
+        new_seqno_to_time_mapping) {
   db_mutex->AssertHeld();
-  return InstallSuperVersion(sv_context, mutable_cf_options_);
-}
 
-void ColumnFamilyData::InstallSuperVersion(
-    SuperVersionContext* sv_context,
-    const MutableCFOptions& mutable_cf_options) {
   SuperVersion* new_superversion = sv_context->new_superversion.release();
-  new_superversion->mutable_cf_options = mutable_cf_options;
+  new_superversion->mutable_cf_options = GetLatestMutableCFOptions();
   new_superversion->Init(this, mem_, imm_.current(), current_,
-                         sv_context->new_seqno_to_time_mapping
-                             ? std::move(sv_context->new_seqno_to_time_mapping)
+                         new_seqno_to_time_mapping.has_value()
+                             ? std::move(new_seqno_to_time_mapping.value())
                          : super_version_
                              ? super_version_->ShareSeqnoToTimeMapping()
                              : nullptr);
@@ -1365,7 +1386,7 @@ void ColumnFamilyData::InstallSuperVersion(
     // currently RecalculateWriteStallConditions() treats it as further slowing
     // down is needed.
     super_version_->write_stall_condition =
-        RecalculateWriteStallConditions(mutable_cf_options);
+        RecalculateWriteStallConditions(new_superversion->mutable_cf_options);
   } else {
     super_version_->write_stall_condition =
         old_superversion->write_stall_condition;
@@ -1378,8 +1399,9 @@ void ColumnFamilyData::InstallSuperVersion(
     ResetThreadLocalSuperVersions();
 
     if (old_superversion->mutable_cf_options.write_buffer_size !=
-        mutable_cf_options.write_buffer_size) {
-      mem_->UpdateWriteBufferSize(mutable_cf_options.write_buffer_size);
+        new_superversion->mutable_cf_options.write_buffer_size) {
+      mem_->UpdateWriteBufferSize(
+          new_superversion->mutable_cf_options.write_buffer_size);
     }
     if (old_superversion->write_stall_condition !=
         new_superversion->write_stall_condition) {
@@ -1680,7 +1702,8 @@ ColumnFamilySet::ColumnFamilySet(const std::string& dbname,
       dummy_cfd_(new ColumnFamilyData(
           ColumnFamilyData::kDummyColumnFamilyDataId, "", nullptr, nullptr,
           nullptr, ColumnFamilyOptions(), *db_options, &file_options_, nullptr,
-          block_cache_tracer, io_tracer, db_id, db_session_id)),
+          block_cache_tracer, io_tracer, db_id, db_session_id,
+          /*read_only*/ true)),
       default_cfd_cache_(nullptr),
       db_name_(dbname),
       db_options_(db_options),
@@ -1752,12 +1775,12 @@ size_t ColumnFamilySet::NumberOfColumnFamilies() const {
 // under a DB mutex AND write thread
 ColumnFamilyData* ColumnFamilySet::CreateColumnFamily(
     const std::string& name, uint32_t id, Version* dummy_versions,
-    const ColumnFamilyOptions& options) {
+    const ColumnFamilyOptions& options, bool read_only) {
   assert(column_families_.find(name) == column_families_.end());
   ColumnFamilyData* new_cfd = new ColumnFamilyData(
       id, name, dummy_versions, table_cache_, write_buffer_manager_, options,
       *db_options_, &file_options_, this, block_cache_tracer_, io_tracer_,
-      db_id_, db_session_id_);
+      db_id_, db_session_id_, read_only);
   column_families_.insert({name, id});
   column_family_data_.insert({id, new_cfd});
   auto ucmp = new_cfd->user_comparator();
diff --git a/db/column_family.h b/db/column_family.h
index 51ad803b9002..89ededb502cd 100644
--- a/db/column_family.h
+++ b/db/column_family.h
@@ -282,6 +282,7 @@ Status CheckCFPathsSupported(const DBOptions& db_options,
                              const ColumnFamilyOptions& cf_options);
 
 ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options,
+                                    bool read_only,
                                     const ColumnFamilyOptions& src);
 // Wrap user defined table properties collector factories `from cf_options`
 // into internal ones in internal_tbl_prop_coll_factories. Add a system internal
@@ -384,10 +385,10 @@ class ColumnFamilyData {
   Version* dummy_versions() { return dummy_versions_; }
   Version* current() { return current_; }  // REQUIRE: DB mutex held
   void SetCurrent(Version* _current);      // REQUIRE: DB mutex held
-  uint64_t GetNumLiveVersions() const;    // REQUIRE: DB mutex held
-  uint64_t GetTotalSstFilesSize() const;  // REQUIRE: DB mutex held
-  uint64_t GetLiveSstFilesSize() const;   // REQUIRE: DB mutex held
-  uint64_t GetTotalBlobFileSize() const;  // REQUIRE: DB mutex held
+  uint64_t GetNumLiveVersions() const;     // REQUIRE: DB mutex held
+  uint64_t GetTotalSstFilesSize() const;   // REQUIRE: DB mutex held
+  uint64_t GetLiveSstFilesSize() const;    // REQUIRE: DB mutex held
+  uint64_t GetTotalBlobFileSize() const;   // REQUIRE: DB mutex held
   // REQUIRE: DB mutex held
   void SetMemtable(MemTable* new_mem) {
     AssignMemtableID(new_mem);
@@ -487,15 +488,11 @@ class ColumnFamilyData {
   uint64_t GetSuperVersionNumberRelaxed() const {
     return super_version_number_.load(std::memory_order_relaxed);
   }
-  // will return a pointer to SuperVersion* if previous SuperVersion
-  // if its reference count is zero and needs deletion or nullptr if not
-  // As argument takes a pointer to allocated SuperVersion to enable
-  // the clients to allocate SuperVersion outside of mutex.
-  // IMPORTANT: Only call this from DBImpl::InstallSuperVersion()
+  // Only intended for use by DBImpl::InstallSuperVersion() and variants
   void InstallSuperVersion(SuperVersionContext* sv_context,
-                           const MutableCFOptions& mutable_cf_options);
-  void InstallSuperVersion(SuperVersionContext* sv_context,
-                           InstrumentedMutex* db_mutex);
+                           InstrumentedMutex* db_mutex,
+                           std::optional<std::shared_ptr<SeqnoToTimeMapping>>
+                               new_seqno_to_time_mapping = {});
 
   void ResetThreadLocalSuperVersions();
 
@@ -590,16 +587,14 @@ class ColumnFamilyData {
 
  private:
   friend class ColumnFamilySet;
-  ColumnFamilyData(uint32_t id, const std::string& name,
-                   Version* dummy_versions, Cache* table_cache,
-                   WriteBufferManager* write_buffer_manager,
-                   const ColumnFamilyOptions& options,
-                   const ImmutableDBOptions& db_options,
-                   const FileOptions* file_options,
-                   ColumnFamilySet* column_family_set,
-                   BlockCacheTracer* const block_cache_tracer,
-                   const std::shared_ptr<IOTracer>& io_tracer,
-                   const std::string& db_id, const std::string& db_session_id);
+  ColumnFamilyData(
+      uint32_t id, const std::string& name, Version* dummy_versions,
+      Cache* table_cache, WriteBufferManager* write_buffer_manager,
+      const ColumnFamilyOptions& options, const ImmutableDBOptions& db_options,
+      const FileOptions* file_options, ColumnFamilySet* column_family_set,
+      BlockCacheTracer* const block_cache_tracer,
+      const std::shared_ptr<IOTracer>& io_tracer, const std::string& db_id,
+      const std::string& db_session_id, bool read_only);
 
   std::vector<std::string> GetDbPaths() const;
 
@@ -761,7 +756,8 @@ class ColumnFamilySet {
 
   ColumnFamilyData* CreateColumnFamily(const std::string& name, uint32_t id,
                                        Version* dummy_version,
-                                       const ColumnFamilyOptions& options);
+                                       const ColumnFamilyOptions& options,
+                                       bool read_only);
 
   const UnorderedMap<uint32_t, size_t>& GetRunningColumnFamiliesTimestampSize()
       const {
diff --git a/db/column_family_test.cc b/db/column_family_test.cc
index 29ff2d15adbf..224257df4940 100644
--- a/db/column_family_test.cc
+++ b/db/column_family_test.cc
@@ -271,7 +271,8 @@ class ColumnFamilyTestBase : public testing::Test {
       // them.
       ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(
           ConfigOptions(), desc.options,
-          SanitizeOptions(dbfull()->immutable_db_options(), current_cf_opt)));
+          SanitizeOptions(dbfull()->immutable_db_options(), /*read_only*/ false,
+                          current_cf_opt)));
       cfi++;
     }
   }
@@ -2248,8 +2249,8 @@ TEST_P(ColumnFamilyTest, SanitizeOptions) {
             original.write_buffer_size =
                 l * 4 * 1024 * 1024 + i * 1024 * 1024 + j * 1024 + k;
 
-            ColumnFamilyOptions result =
-                SanitizeOptions(ImmutableDBOptions(db_options), original);
+            ColumnFamilyOptions result = SanitizeOptions(
+                ImmutableDBOptions(db_options), /*read_only*/ false, original);
             ASSERT_TRUE(result.level0_stop_writes_trigger >=
                         result.level0_slowdown_writes_trigger);
             ASSERT_TRUE(result.level0_slowdown_writes_trigger >=
diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc
index 0ea74891e40d..4fba2e6d9be3 100644
--- a/db/compaction/compaction_job.cc
+++ b/db/compaction/compaction_job.cc
@@ -301,8 +301,7 @@ void CompactionJob::Prepare(
   SequenceNumber preserve_time_min_seqno = kMaxSequenceNumber;
   SequenceNumber preclude_last_level_min_seqno = kMaxSequenceNumber;
   uint64_t preserve_time_duration =
-      std::max(c->mutable_cf_options().preserve_internal_time_seconds,
-               c->mutable_cf_options().preclude_last_level_data_seconds);
+      MinAndMaxPreserveSeconds(c->mutable_cf_options()).max_preserve_seconds;
 
   if (preserve_time_duration > 0) {
     const ReadOptions read_options(Env::IOActivity::kCompaction);
diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc
index be175614d881..a3c107e0ab45 100644
--- a/db/db_impl/db_impl.cc
+++ b/db/db_impl/db_impl.cc
@@ -284,9 +284,8 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname,
   periodic_task_functions_.emplace(PeriodicTaskType::kFlushInfoLog,
                                    [this]() { this->FlushInfoLog(); });
   periodic_task_functions_.emplace(
-      PeriodicTaskType::kRecordSeqnoTime, [this]() {
-        this->RecordSeqnoToTimeMapping(/*populate_historical_seconds=*/0);
-      });
+      PeriodicTaskType::kRecordSeqnoTime,
+      [this]() { this->RecordSeqnoToTimeMapping(); });
 
   versions_.reset(new VersionSet(
       dbname_, &immutable_db_options_, file_options_, table_cache_.get(),
@@ -843,57 +842,40 @@ Status DBImpl::StartPeriodicTaskScheduler() {
   return s;
 }
 
-Status DBImpl::RegisterRecordSeqnoTimeWorker(const ReadOptions& read_options,
-                                             const WriteOptions& write_options,
-                                             bool is_new_db) {
+Status DBImpl::RegisterRecordSeqnoTimeWorker() {
   options_mutex_.AssertHeld();
 
-  uint64_t min_preserve_seconds = std::numeric_limits<uint64_t>::max();
-  uint64_t max_preserve_seconds = std::numeric_limits<uint64_t>::min();
-  std::vector<SuperVersionContext> sv_contexts;
+  // We assume InstallSuperVersionForConfigChange has already ensured suitable
+  // mappings are present for each relevant CF. We just need to be sure the DB's
+  // seqno_to_time_mapping_ and worker scheduler are appropriate for the
+  // combination of CF settings.
+
+  MinAndMaxPreserveSeconds preserve_info;
+  uint64_t seqno_time_cadence;
   {
     InstrumentedMutexLock l(&mutex_);
 
     for (auto cfd : *versions_->GetColumnFamilySet()) {
       auto& mopts = cfd->GetLatestMutableCFOptions();
-      // preserve time is the max of 2 options.
-      uint64_t preserve_seconds =
-          std::max(mopts.preserve_internal_time_seconds,
-                   mopts.preclude_last_level_data_seconds);
-      if (!cfd->IsDropped() && preserve_seconds > 0) {
-        min_preserve_seconds = std::min(preserve_seconds, min_preserve_seconds);
-        max_preserve_seconds = std::max(preserve_seconds, max_preserve_seconds);
+      if (!cfd->IsDropped()) {
+        preserve_info.Combine(mopts);
       }
     }
-    size_t old_mapping_size = seqno_to_time_mapping_.Size();
-    if (min_preserve_seconds == std::numeric_limits<uint64_t>::max()) {
-      // Don't track
+    seqno_time_cadence = preserve_info.GetRecodingCadence();
+    if (seqno_time_cadence == 0) {
+      // To return as much as possible to the feature being disabled,
+      // clear the existing mapping
       seqno_to_time_mapping_.SetCapacity(0);
       seqno_to_time_mapping_.SetMaxTimeSpan(UINT64_MAX);
+      assert(seqno_to_time_mapping_.Empty());
     } else {
       uint64_t cap = std::min(kMaxSeqnoToTimeEntries,
-                              max_preserve_seconds * kMaxSeqnoTimePairsPerCF /
-                                  min_preserve_seconds);
+                              preserve_info.max_preserve_seconds *
+                                  kMaxSeqnoTimePairsPerCF /
+                                  preserve_info.min_preserve_seconds);
       seqno_to_time_mapping_.SetCapacity(cap);
-      seqno_to_time_mapping_.SetMaxTimeSpan(max_preserve_seconds);
+      seqno_to_time_mapping_.SetMaxTimeSpan(preserve_info.max_preserve_seconds);
     }
-    if (old_mapping_size != seqno_to_time_mapping_.Size()) {
-      InstallSeqnoToTimeMappingInSV(&sv_contexts);
-    }
-  }
-
-  // clean up outside db mutex
-  for (SuperVersionContext& sv_context : sv_contexts) {
-    sv_context.Clean();
-  }
-  sv_contexts.clear();
-
-  uint64_t seqno_time_cadence = 0;
-  if (min_preserve_seconds != std::numeric_limits<uint64_t>::max()) {
-    // round up to 1 when the time_duration is smaller than
-    // kMaxSeqnoTimePairsPerCF
-    seqno_time_cadence = (min_preserve_seconds + kMaxSeqnoTimePairsPerCF - 1) /
-                         kMaxSeqnoTimePairsPerCF;
   }
 
   TEST_SYNC_POINT_CALLBACK(
@@ -903,64 +885,6 @@ Status DBImpl::RegisterRecordSeqnoTimeWorker(const ReadOptions& read_options,
   if (seqno_time_cadence == 0) {
     s = periodic_task_scheduler_.Unregister(PeriodicTaskType::kRecordSeqnoTime);
   } else {
-    // Before registering the periodic task, we need to be sure to fulfill two
-    // promises:
-    // 1) Any DB created with preserve/preclude options set from the beginning
-    // will get pre-allocated seqnos with pre-populated time mappings back to
-    // the times we are interested in. (This will enable future import of data
-    // while preserving rough write time. We can only do this reliably from
-    // DB::Open, as otherwise there could be a race between CreateColumnFamily
-    // and the first Write to the DB, and seqno-to-time mappings need to be
-    // monotonic.
-    // 2) In any DB, any data written after setting preserve/preclude options
-    // must have a reasonable time estimate (so that we can accurately place
-    // the data), which means at least one entry in seqno_to_time_mapping_.
-    //
-    // FIXME: We don't currently guarantee that if the first column family with
-    // that setting is added or configured after initial DB::Open but before
-    // the first user Write. Fixing this causes complications with the crash
-    // test because if DB starts without preserve/preclude option, does some
-    // user writes but all those writes are lost in crash, then re-opens with
-    // preserve/preclude option, it sees seqno==1 which looks like one of the
-    // user writes was recovered, when actually it was not.
-    bool last_seqno_zero = GetLatestSequenceNumber() == 0;
-    assert(!is_new_db || last_seqno_zero);
-    if (is_new_db && last_seqno_zero) {
-      // Pre-allocate seqnos and pre-populate historical mapping
-      // We can simply modify these, before writes are allowed
-      constexpr uint64_t kMax = kMaxSeqnoTimePairsPerSST;
-      versions_->SetLastAllocatedSequence(kMax);
-      versions_->SetLastPublishedSequence(kMax);
-      versions_->SetLastSequence(kMax);
-
-      // And record in manifest, to avoid going backwards in seqno on re-open
-      // (potentially with different options). Concurrency is simple because we
-      // are in DB::Open
-      {
-        InstrumentedMutexLock l(&mutex_);
-        VersionEdit edit;
-        edit.SetLastSequence(kMax);
-        s = versions_->LogAndApplyToDefaultColumnFamily(
-            read_options, write_options, &edit, &mutex_,
-            directories_.GetDbDir());
-        if (!s.ok() && versions_->io_status().IsIOError()) {
-          error_handler_.SetBGError(versions_->io_status(),
-                                    BackgroundErrorReason::kManifestWrite);
-        }
-      }
-
-      // Pre-populate mappings for reserved sequence numbers.
-      RecordSeqnoToTimeMapping(max_preserve_seconds);
-    } else {
-      if (!last_seqno_zero) {
-        // Ensure at least one mapping (or log a warning), and
-        // an updated entry whenever relevant SetOptions is called
-        RecordSeqnoToTimeMapping(/*populate_historical_seconds=*/0);
-      } else {
-        // FIXME (see limitation described above)
-      }
-    }
-
     s = periodic_task_scheduler_.Register(
         PeriodicTaskType::kRecordSeqnoTime,
         periodic_task_functions_.at(PeriodicTaskType::kRecordSeqnoTime),
@@ -1309,8 +1233,9 @@ Status DBImpl::SetOptions(
     //
     // (b) Append a new Version without manifest write nor DB mutex release
     //
-    // Thus aren't releasing the DB mutex again until the end of this block,
-    // after installing the new SuperVersion.
+    // Thus aren't releasing the DB mutex from LogAndApply calling pre_cb,
+    // through installing the new Version until the end of this block, after
+    // installing the new SuperVersion.
     auto pre_cb = [&]() -> Status {
       Status cb_s = cfd->SetOptions(db_options, options_map);
       if (cb_s.ok()) {
@@ -1335,7 +1260,7 @@ Status DBImpl::SetOptions(
       // Trigger possible flush/compactions. This has to be before we persist
       // options to file, otherwise there will be a deadlock with writer
       // thread.
-      InstallSuperVersionAndScheduleWork(cfd, &sv_context);
+      InstallSuperVersionForConfigChange(cfd, &sv_context);
       persist_options_status =
           WriteOptionsFile(write_options, true /*db_mutex_already_held*/);
       bg_cv_.SignalAll();
@@ -1353,8 +1278,7 @@ Status DBImpl::SetOptions(
 
   if (s.ok() && (options_map.count("preserve_internal_time_seconds") > 0 ||
                  options_map.count("preclude_last_level_data_seconds") > 0)) {
-    s = RegisterRecordSeqnoTimeWorker(read_options, write_options,
-                                      false /* is_new_db*/);
+    s = RegisterRecordSeqnoTimeWorker();
   }
 
   ROCKS_LOG_INFO(
@@ -3542,7 +3466,7 @@ void DBImpl::MultiGetEntityWithCallback(
 }
 
 Status DBImpl::WrapUpCreateColumnFamilies(
-    const ReadOptions& read_options, const WriteOptions& write_options,
+    const WriteOptions& write_options,
     const std::vector<const ColumnFamilyOptions*>& cf_options) {
   options_mutex_.AssertHeld();
 
@@ -3559,8 +3483,7 @@ Status DBImpl::WrapUpCreateColumnFamilies(
   // Attempt both follow-up actions even if one fails
   Status s = WriteOptionsFile(write_options, false /*db_mutex_already_held*/);
   if (register_worker) {
-    s.UpdateIfOk(RegisterRecordSeqnoTimeWorker(read_options, write_options,
-                                               /* is_new_db */ false));
+    s.UpdateIfOk(RegisterRecordSeqnoTimeWorker());
   }
   return s;
 }
@@ -3575,8 +3498,7 @@ Status DBImpl::CreateColumnFamily(const ReadOptions& read_options,
   Status s = CreateColumnFamilyImpl(read_options, write_options, cf_options,
                                     column_family, handle);
   if (s.ok()) {
-    s.UpdateIfOk(
-        WrapUpCreateColumnFamilies(read_options, write_options, {&cf_options}));
+    s.UpdateIfOk(WrapUpCreateColumnFamilies(write_options, {&cf_options}));
   }
   return s;
 }
@@ -3603,8 +3525,7 @@ Status DBImpl::CreateColumnFamilies(
     success_once = true;
   }
   if (success_once) {
-    s.UpdateIfOk(
-        WrapUpCreateColumnFamilies(read_options, write_options, {&cf_options}));
+    s.UpdateIfOk(WrapUpCreateColumnFamilies(write_options, {&cf_options}));
   }
   return s;
 }
@@ -3634,8 +3555,7 @@ Status DBImpl::CreateColumnFamilies(
     cf_opts.push_back(&column_families[i].options);
   }
   if (success_once) {
-    s.UpdateIfOk(
-        WrapUpCreateColumnFamilies(read_options, write_options, cf_opts));
+    s.UpdateIfOk(WrapUpCreateColumnFamilies(write_options, cf_opts));
   }
   return s;
 }
@@ -3704,7 +3624,7 @@ Status DBImpl::CreateColumnFamilyImpl(const ReadOptions& read_options,
       auto* cfd =
           versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name);
       assert(cfd != nullptr);
-      InstallSuperVersionAndScheduleWork(cfd, &sv_context);
+      InstallSuperVersionForConfigChange(cfd, &sv_context);
 
       if (!cfd->mem()->IsSnapshotSupported()) {
         is_snapshot_supported_ = false;
@@ -3788,7 +3708,7 @@ Status DBImpl::DropColumnFamilyImpl(ColumnFamilyHandle* column_family) {
   Status s;
   // Save re-aquiring lock for RegisterRecordSeqnoTimeWorker when not
   // applicable
-  bool used_preserve_preclude = false;
+  MinAndMaxPreserveSeconds preserve_info;
   {
     InstrumentedMutexLock l(&mutex_);
     if (cfd->IsDropped()) {
@@ -3806,8 +3726,7 @@ Status DBImpl::DropColumnFamilyImpl(ColumnFamilyHandle* column_family) {
       auto& moptions = cfd->GetLatestMutableCFOptions();
       max_total_in_memory_state_ -=
           moptions.write_buffer_size * moptions.max_write_buffer_number;
-      used_preserve_preclude = moptions.preserve_internal_time_seconds > 0 ||
-                               moptions.preclude_last_level_data_seconds > 0;
+      preserve_info.Combine(moptions);
     }
 
     if (!cf_support_snapshot) {
@@ -3825,9 +3744,8 @@ Status DBImpl::DropColumnFamilyImpl(ColumnFamilyHandle* column_family) {
     bg_cv_.SignalAll();
   }
 
-  if (used_preserve_preclude) {
-    s = RegisterRecordSeqnoTimeWorker(read_options, write_options,
-                                      /* is_new_db */ false);
+  if (preserve_info.IsEnabled()) {
+    s = RegisterRecordSeqnoTimeWorker();
   }
 
   if (s.ok()) {
@@ -6239,7 +6157,7 @@ Status DBImpl::CreateColumnFamilyWithImport(
           versions_->LogAndApply(cfd, read_options, write_options, &dummy_edit,
                                  &mutex_, directories_.GetDbDir());
       if (status.ok()) {
-        InstallSuperVersionAndScheduleWork(cfd, &dummy_sv_ctx);
+        InstallSuperVersionForConfigChange(cfd, &dummy_sv_ctx);
       }
     }
   }
@@ -6276,7 +6194,7 @@ Status DBImpl::CreateColumnFamilyWithImport(
                                         import_job.edit(), &mutex_,
                                         directories_.GetDbDir());
         if (status.ok()) {
-          InstallSuperVersionAndScheduleWork(cfd, &sv_context);
+          InstallSuperVersionForConfigChange(cfd, &sv_context);
         }
       }
 
@@ -6736,7 +6654,7 @@ Status DBImpl::GetCreationTimeOfOldestFile(uint64_t* creation_time) {
   }
 }
 
-void DBImpl::RecordSeqnoToTimeMapping(uint64_t populate_historical_seconds) {
+std::pair<SequenceNumber, uint64_t> DBImpl::GetSeqnoToTimeSample() const {
   // TECHNICALITY: Sample last sequence number *before* time, as prescribed
   // for SeqnoToTimeMapping. We don't know how long it has been since the last
   // sequence number was written, so we at least have a one-sided bound by
@@ -6745,63 +6663,162 @@ void DBImpl::RecordSeqnoToTimeMapping(uint64_t populate_historical_seconds) {
   // while holding the DB mutex. (This is really to make testing happy because
   // it's fine to throw out extra close-but-not-quite-consistent mappings in
   // production.)
-  std::vector<SuperVersionContext> sv_contexts;
-  bool success = true;
-  SequenceNumber seqno;
-  uint64_t unix_time;
-  {
-    InstrumentedMutexLock l(&mutex_);
+  mutex_.AssertHeld();
+  SequenceNumber seqno = GetLatestSequenceNumber();
+  // HACK/TODO: seqno might be zero but we can't record a mapping for that.
+  // Start with 1, which should be close enough.
+  seqno = std::max(seqno, SequenceNumber{1});
+  int64_t unix_time_signed = 0;
+  immutable_db_options_.clock->GetCurrentTime(&unix_time_signed)
+      .PermitUncheckedError();  // Ignore error
+  return {seqno, static_cast<uint64_t>(unix_time_signed)};
+}
 
-    seqno = GetLatestSequenceNumber();
-    int64_t unix_time_signed = 0;
-    immutable_db_options_.clock->GetCurrentTime(&unix_time_signed)
-        .PermitUncheckedError();  // Ignore error
-    unix_time = static_cast<uint64_t>(unix_time_signed);
-
-    if (populate_historical_seconds > 0) {
-      if (seqno > 1 && unix_time > populate_historical_seconds) {
-        // seqno=0 is reserved
-        SequenceNumber from_seqno = 1;
-        success = seqno_to_time_mapping_.PrePopulate(
-            from_seqno, seqno, unix_time - populate_historical_seconds,
-            unix_time);
-        InstallSeqnoToTimeMappingInSV(&sv_contexts);
-      } else {
-        // One of these will fail
-        assert(seqno > 1);
-        assert(unix_time > populate_historical_seconds);
-        success = false;
-      }
-    } else {
-      // FIXME: assert(seqno > 0);
-      // Always successful assuming seqno never go backwards
-      seqno_to_time_mapping_.Append(seqno, unix_time);
-      InstallSeqnoToTimeMappingInSV(&sv_contexts);
-    }
+void DBImpl::EnsureSeqnoToTimeMapping(
+    const MinAndMaxPreserveSeconds& preserve_info) {
+  mutex_.AssertHeld();
+  assert(preserve_info.IsEnabled());
+
+  // Atomically with CF creation or mutable option change (see
+  // InstallSuperVersionForConfigChange()), we need to be sure any data written
+  // after setting preserve/preclude options must have a reasonable time
+  // estimate (so that we can accurately place the data), which means at least
+  // one entry in seqno_to_time_mapping_. It's not critical that `preserve_info`
+  // take into account all CFs, as that's mostly relevant to how we add
+  // recurring entries and purge old ones.
+
+  auto [seqno, unix_time_now] = GetSeqnoToTimeSample();
+  // Ensure at least one sample that is sufficiently recent
+  uint64_t unix_time_last_sample = 0;
+  if (seqno_to_time_mapping_.Empty()) {
+    // The exact best settings will be found and applied in
+    // RegisterRecordSeqnoTimeWorker()
+    seqno_to_time_mapping_.SetCapacity(kMaxSeqnoToTimeEntries);
+  } else {
+    unix_time_last_sample =
+        seqno_to_time_mapping_.GetProximalTimeBeforeSeqno(kMaxSequenceNumber);
+  }
+  uint64_t cadence = preserve_info.GetRecodingCadence();
+  // Extend cadence so as to avoid stepping on toes of recorder job, which
+  // could lag a bit.
+  cadence += 3 + cadence / 100;
+  if (unix_time_now >= cadence &&
+      unix_time_last_sample <= unix_time_now - cadence) {
+    assert(seqno > 0);  // See GetSeqnoToTimeSample()
+    // Always successful assuming seqno never go backwards
+    seqno_to_time_mapping_.Append(seqno, unix_time_now);
   }
+}
 
-  // clean up & report outside db mutex
-  for (SuperVersionContext& sv_context : sv_contexts) {
-    sv_context.Clean();
+void DBImpl::PrepopulateSeqnoToTimeMapping(
+    const MinAndMaxPreserveSeconds& preserve_info) {
+  // Only for opening a new DB, with preserve/preclude options set
+  if (!preserve_info.IsEnabled()) {
+    assert(false);
+    return;
+  }
+  if (GetLatestSequenceNumber() != 0) {
+    assert(false);
+    return;
   }
 
-  if (populate_historical_seconds > 0) {
-    if (success) {
-      ROCKS_LOG_INFO(
-          immutable_db_options_.info_log,
-          "Pre-populated sequence number to time entries: [1,%" PRIu64
-          "] -> [%" PRIu64 ",%" PRIu64 "]",
-          seqno, unix_time - populate_historical_seconds, unix_time);
-    } else {
-      ROCKS_LOG_WARN(
-          immutable_db_options_.info_log,
-          "Failed to pre-populate sequence number to time entries: [1,%" PRIu64
-          "] -> [%" PRIu64 ",%" PRIu64 "]",
-          seqno, unix_time - populate_historical_seconds, unix_time);
-    }
+  // Here we fulfill the following promise:
+  //
+  // Any DB/CF created with preserve/preclude options set from the beginning
+  // will get pre-allocated seqnos with pre-populated time mappings back to
+  // the times we are interested in. (This will enable future import of data
+  // while preserving rough write time. We can only do this reliably from
+  // DB::Open, as otherwise there could be a race between CreateColumnFamily
+  // and the first Write to the DB, and seqno-to-time mappings need to be
+  // monotonic.
+  //
+  // FIXME: We don't currently guarantee that if the first column family with
+  // that setting is added or configured after initial DB::Open but before
+  // the first user Write. Fixing this causes complications with the crash
+  // test because if DB starts without preserve/preclude option, does some
+  // user writes but all those writes are lost in crash, then re-opens with
+  // preserve/preclude option, it sees seqno==1 which looks like one of the
+  // user writes was recovered, when actually it was not.
+
+  // Pre-allocate seqnos and pre-populate historical mapping
+  // We can simply modify these, before writes are allowed
+  constexpr uint64_t kMax = kMaxSeqnoTimePairsPerSST;
+  versions_->SetLastAllocatedSequence(kMax);
+  versions_->SetLastPublishedSequence(kMax);
+  versions_->SetLastSequence(kMax);
+
+  // And record in manifest, to avoid going backwards in seqno on re-open
+  // (potentially with different options). Concurrency is simple because we
+  // are in DB::Open
+  const WriteOptions write_options(Env::IOActivity::kDBOpen);
+  const ReadOptions read_options(Env::IOActivity::kDBOpen);
+  VersionEdit edit;
+  edit.SetLastSequence(kMax);
+  Status s = versions_->LogAndApplyToDefaultColumnFamily(
+      read_options, write_options, &edit, &mutex_, directories_.GetDbDir());
+  if (!s.ok() && versions_->io_status().IsIOError()) {
+    error_handler_.SetBGError(versions_->io_status(),
+                              BackgroundErrorReason::kManifestWrite);
+  }
+
+  auto [seqno, unix_time_now] = GetSeqnoToTimeSample();
+  uint64_t populate_historical_seconds = preserve_info.max_preserve_seconds;
+  if (seqno > 1 && unix_time_now > populate_historical_seconds) {
+    // seqno=0 is reserved
+    SequenceNumber from_seqno = 1;
+    seqno_to_time_mapping_.PrePopulate(
+        from_seqno, seqno, unix_time_now - populate_historical_seconds,
+        unix_time_now);
   } else {
-    assert(success);
+    // One of these will fail
+    assert(seqno > 1);
+    assert(unix_time_now > populate_historical_seconds);
+  }
+}
+
+void DBImpl::InstallSuperVersionForConfigChange(
+    ColumnFamilyData* cfd, SuperVersionContext* sv_context) {
+  MinAndMaxPreserveSeconds preserve_info{cfd->GetLatestCFOptions()};
+  std::shared_ptr<SeqnoToTimeMapping> new_seqno_to_time_mapping;
+  if (preserve_info.IsEnabled()) {
+    // TODO: detect & optimize if mapping hasn't changed from previous
+    // SuperVersion
+    EnsureSeqnoToTimeMapping(preserve_info);
+    new_seqno_to_time_mapping = std::make_shared<SeqnoToTimeMapping>();
+    new_seqno_to_time_mapping->CopyFrom(seqno_to_time_mapping_);
+  }
+  InstallSuperVersionAndScheduleWork(cfd, sv_context,
+                                     std::move(new_seqno_to_time_mapping));
+}
+
+void DBImpl::RecordSeqnoToTimeMapping() {
+  SuperVersionContext sv_context;
+  {
+    InstrumentedMutexLock l(&mutex_);
+    // Record next sample
+    seqno_to_time_mapping_.Append(GetSeqnoToTimeSample());
+    // Create an immutable snapshot for sharing across CFs
+    std::shared_ptr<SeqnoToTimeMapping> new_seqno_to_time_mapping =
+        std::make_shared<SeqnoToTimeMapping>();
+    new_seqno_to_time_mapping->CopyFrom(seqno_to_time_mapping_);
+
+    // Update in SV of all applicable CFs
+    for (ColumnFamilyData* cfd : *versions_->GetColumnFamilySet()) {
+      if (cfd->IsDropped()) {
+        continue;
+      }
+      MinAndMaxPreserveSeconds preserve_info{cfd->GetLatestCFOptions()};
+      if (preserve_info.IsEnabled()) {
+        sv_context.NewSuperVersion();
+        cfd->InstallSuperVersion(&sv_context, &mutex_,
+                                 new_seqno_to_time_mapping);
+      }
+    }
+    bg_cv_.SignalAll();
   }
+
+  // clean up & report outside db mutex
+  sv_context.Clean();
 }
 
 void DBImpl::TrackOrUntrackFiles(
@@ -6860,22 +6877,4 @@ void DBImpl::TrackOrUntrackFiles(
   }
 }
 
-void DBImpl::InstallSeqnoToTimeMappingInSV(
-    std::vector<SuperVersionContext>* sv_contexts) {
-  mutex_.AssertHeld();
-  std::shared_ptr<SeqnoToTimeMapping> new_seqno_to_time_mapping =
-      std::make_shared<SeqnoToTimeMapping>();
-  new_seqno_to_time_mapping->CopyFrom(seqno_to_time_mapping_);
-  for (ColumnFamilyData* cfd : *versions_->GetColumnFamilySet()) {
-    if (cfd->IsDropped()) {
-      continue;
-    }
-    sv_contexts->emplace_back(/*create_superversion=*/true);
-    sv_contexts->back().new_seqno_to_time_mapping = new_seqno_to_time_mapping;
-    cfd->InstallSuperVersion(&sv_contexts->back(),
-                             cfd->GetLatestMutableCFOptions());
-  }
-  bg_cv_.SignalAll();
-}
-
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h
index 9c3f4dbd7cd9..b746a7f3902c 100644
--- a/db/db_impl/db_impl.h
+++ b/db/db_impl/db_impl.h
@@ -1267,27 +1267,18 @@ class DBImpl : public DB {
   // flush LOG out of application buffer
   void FlushInfoLog();
 
-  // record current sequence number to time mapping. If
-  // populate_historical_seconds > 0 then pre-populate all the
-  // sequence numbers from [1, last] to map to [now minus
-  // populate_historical_seconds, now].
-  void RecordSeqnoToTimeMapping(uint64_t populate_historical_seconds);
-
-  // Everytime DB's seqno to time mapping changed (which already hold the db
-  // mutex), we install a new SuperVersion in each column family with a shared
-  // copy of the new mapping while holding the db mutex.
-  // This is done for all column families even though the column family does not
-  // explicitly enabled the
-  // `preclude_last_level_data_seconds` or `preserve_internal_time_seconds`
-  // features.
-  // This mapping supports iterators to fulfill the
-  // "rocksdb.iterator.write-time" iterator property for entries in memtables.
-  //
-  // Since this new SuperVersion doesn't involve an LSM tree shape change, we
-  // don't schedule work after installing this SuperVersion. It returns the used
-  // `SuperVersionContext` for clean up after release mutex.
-  void InstallSeqnoToTimeMappingInSV(
-      std::vector<SuperVersionContext>* sv_contexts);
+  // For the background timer job
+  void RecordSeqnoToTimeMapping();
+
+  // REQUIRES: DB mutex held
+  std::pair<SequenceNumber, uint64_t> GetSeqnoToTimeSample() const;
+
+  // REQUIRES: DB mutex held or during open
+  void EnsureSeqnoToTimeMapping(const MinAndMaxPreserveSeconds& preserve_secs);
+
+  // Only called during open
+  void PrepopulateSeqnoToTimeMapping(
+      const MinAndMaxPreserveSeconds& preserve_secs);
 
   // Interface to block and signal the DB in case of stalling writes by
   // WriteBufferManager. Each DBImpl object contains ptr to WBMStallInterface.
@@ -1979,7 +1970,7 @@ class DBImpl : public DB {
 
   // Follow-up work to user creating a column family or (families)
   Status WrapUpCreateColumnFamilies(
-      const ReadOptions& read_options, const WriteOptions& write_options,
+      const WriteOptions& write_options,
       const std::vector<const ColumnFamilyOptions*>& cf_options);
 
   Status DropColumnFamilyImpl(ColumnFamilyHandle* column_family);
@@ -2450,9 +2441,7 @@ class DBImpl : public DB {
   // Cancel scheduled periodic tasks
   Status CancelPeriodicTaskScheduler();
 
-  Status RegisterRecordSeqnoTimeWorker(const ReadOptions& read_options,
-                                       const WriteOptions& write_options,
-                                       bool is_new_db);
+  Status RegisterRecordSeqnoTimeWorker();
 
   void PrintStatistics();
 
@@ -2518,12 +2507,21 @@ class DBImpl : public DB {
 
   // Background threads call this function, which is just a wrapper around
   // the InstallSuperVersion() function. Background threads carry
-  // sv_context which can have new_superversion already
-  // allocated.
+  // sv_context to allow allocation of SuperVersion object outside of holding
+  // the DB mutex.
   // All ColumnFamily state changes go through this function. Here we analyze
   // the new state and we schedule background work if we detect that the new
   // state needs flush or compaction.
-  void InstallSuperVersionAndScheduleWork(ColumnFamilyData* cfd,
+  // See also InstallSuperVersionForConfigChange().
+  void InstallSuperVersionAndScheduleWork(
+      ColumnFamilyData* cfd, SuperVersionContext* sv_context,
+      std::optional<std::shared_ptr<SeqnoToTimeMapping>>
+          new_seqno_to_time_mapping = {});
+
+  // A variant of InstallSuperVersionAndScheduleWork() that must be used for
+  // new CFs or for changes to mutable_cf_options. This is so that it can
+  // update seqno_to_time_mapping cached for the new SuperVersion as relevant.
+  void InstallSuperVersionForConfigChange(ColumnFamilyData* cfd,
                                           SuperVersionContext* sv_context);
 
   bool GetIntPropertyInternal(ColumnFamilyData* cfd,
diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc
index 0cbb6c79e382..f623511d7303 100644
--- a/db/db_impl/db_impl_compaction_flush.cc
+++ b/db/db_impl/db_impl_compaction_flush.cc
@@ -4273,9 +4273,10 @@ void DBImpl::BuildCompactionJobInfo(
 // for superversion_to_free
 
 void DBImpl::InstallSuperVersionAndScheduleWork(
-    ColumnFamilyData* cfd, SuperVersionContext* sv_context) {
+    ColumnFamilyData* cfd, SuperVersionContext* sv_context,
+    std::optional<std::shared_ptr<SeqnoToTimeMapping>>
+        new_seqno_to_time_mapping) {
   mutex_.AssertHeld();
-  const auto& mutable_cf_options = cfd->GetLatestMutableCFOptions();
 
   // Update max_total_in_memory_state_
   size_t old_memtable_size = 0;
@@ -4289,7 +4290,8 @@ void DBImpl::InstallSuperVersionAndScheduleWork(
   if (UNLIKELY(sv_context->new_superversion == nullptr)) {
     sv_context->NewSuperVersion();
   }
-  cfd->InstallSuperVersion(sv_context, mutable_cf_options);
+  cfd->InstallSuperVersion(sv_context, &mutex_,
+                           std::move(new_seqno_to_time_mapping));
 
   // There may be a small data race here. The snapshot tricking bottommost
   // compaction may already be released here. But assuming there will always be
@@ -4316,9 +4318,10 @@ void DBImpl::InstallSuperVersionAndScheduleWork(
   MaybeScheduleFlushOrCompaction();
 
   // Update max_total_in_memory_state_
-  max_total_in_memory_state_ = max_total_in_memory_state_ - old_memtable_size +
-                               mutable_cf_options.write_buffer_size *
-                                   mutable_cf_options.max_write_buffer_number;
+  max_total_in_memory_state_ =
+      max_total_in_memory_state_ - old_memtable_size +
+      cfd->GetLatestMutableCFOptions().write_buffer_size *
+          cfd->GetLatestMutableCFOptions().max_write_buffer_number;
 }
 
 // ShouldPurge is called by FindObsoleteFiles when doing a full scan,
diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc
index 577a861dcca6..85a491066ebf 100644
--- a/db/db_impl/db_impl_open.cc
+++ b/db/db_impl/db_impl_open.cc
@@ -35,8 +35,8 @@ Options SanitizeOptions(const std::string& dbname, const Options& src,
   auto db_options =
       SanitizeOptions(dbname, DBOptions(src), read_only, logger_creation_s);
   ImmutableDBOptions immutable_db_options(db_options);
-  auto cf_options =
-      SanitizeOptions(immutable_db_options, ColumnFamilyOptions(src));
+  auto cf_options = SanitizeOptions(immutable_db_options, read_only,
+                                    ColumnFamilyOptions(src));
   return Options(db_options, cf_options);
 }
 
@@ -2334,9 +2334,11 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
   handles->clear();
 
   size_t max_write_buffer_size = 0;
+  MinAndMaxPreserveSeconds preserve_info;
   for (const auto& cf : column_families) {
     max_write_buffer_size =
         std::max(max_write_buffer_size, cf.options.write_buffer_size);
+    preserve_info.Combine(cf.options);
   }
 
   auto impl = std::make_unique<DBImpl>(db_options, dbname, seq_per_batch,
@@ -2469,6 +2471,12 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
     s = impl->InitPersistStatsColumnFamily();
   }
 
+  // After reaching the post-recovery seqno but before creating SuperVersions
+  // ensure seqno to time mapping is pre-populated as needed.
+  if (s.ok() && recovery_ctx.is_new_db_ && preserve_info.IsEnabled()) {
+    impl->PrepopulateSeqnoToTimeMapping(preserve_info);
+  }
+
   if (s.ok()) {
     // set column family handles
     for (const auto& cf : column_families) {
@@ -2478,6 +2486,9 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
         handles->push_back(
             new ColumnFamilyHandleImpl(cfd, impl.get(), &impl->mutex_));
         impl->NewThreadStatusCfInfo(cfd);
+        SuperVersionContext sv_context(/* create_superversion */ true);
+        impl->InstallSuperVersionForConfigChange(cfd, &sv_context);
+        sv_context.Clean();
       } else {
         if (db_options.create_missing_column_families) {
           // missing column family, create it
@@ -2485,6 +2496,7 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
           impl->mutex_.Unlock();
           // NOTE: the work normally done in WrapUpCreateColumnFamilies will
           // be done separately below.
+          // This includes InstallSuperVersionForConfigChange.
           s = impl->CreateColumnFamilyImpl(read_options, write_options,
                                            cf.options, cf.name, &handle);
           impl->mutex_.Lock();
@@ -2501,15 +2513,14 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
     }
   }
 
-  if (s.ok()) {
+  if (s.ok() && impl->immutable_db_options_.persist_stats_to_disk) {
+    // Install SuperVersion for hidden column family
+    assert(impl->persist_stats_cf_handle_);
+    assert(impl->persist_stats_cf_handle_->cfd());
     SuperVersionContext sv_context(/* create_superversion */ true);
-    for (auto cfd : *impl->versions_->GetColumnFamilySet()) {
-      impl->InstallSuperVersionAndScheduleWork(cfd, &sv_context);
-    }
+    impl->InstallSuperVersionForConfigChange(
+        impl->persist_stats_cf_handle_->cfd(), &sv_context);
     sv_context.Clean();
-  }
-
-  if (s.ok() && impl->immutable_db_options_.persist_stats_to_disk) {
     // try to read format version
     s = impl->PersistentStatsProcessFormatVersion();
   }
@@ -2618,8 +2629,7 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
     s = impl->StartPeriodicTaskScheduler();
   }
   if (s.ok()) {
-    s = impl->RegisterRecordSeqnoTimeWorker(read_options, write_options,
-                                            recovery_ctx.is_new_db_);
+    s = impl->RegisterRecordSeqnoTimeWorker();
   }
   impl->options_mutex_.Unlock();
   if (s.ok()) {
diff --git a/db/db_options_test.cc b/db/db_options_test.cc
index 99d390db2399..d619e8604e55 100644
--- a/db/db_options_test.cc
+++ b/db/db_options_test.cc
@@ -70,7 +70,8 @@ class DBOptionsTest : public DBTestBase {
     options.env = env_;
     ImmutableDBOptions db_options(options);
     test::RandomInitCFOptions(&options, options, rnd);
-    auto sanitized_options = SanitizeOptions(db_options, options);
+    auto sanitized_options =
+        SanitizeOptions(db_options, /*read_only*/ false, options);
     auto opt_map = GetMutableCFOptionsMap(sanitized_options);
     delete options.compaction_filter;
     return opt_map;
diff --git a/db/flush_job.cc b/db/flush_job.cc
index ac2eaeb6c55c..552c122656ea 100644
--- a/db/flush_job.cc
+++ b/db/flush_job.cc
@@ -1193,13 +1193,12 @@ void FlushJob::GetEffectiveCutoffUDTForPickedMemTables() {
 }
 
 void FlushJob::GetPrecludeLastLevelMinSeqno() {
-  if (mutable_cf_options_.preclude_last_level_data_seconds == 0 ||
-      // FIXME: create FlushJob and build SuperVersions such that
-      // preclude_last_level_data_seconds > 0 implies
-      // seqno_to_time_mapping_ != nullptr
-      seqno_to_time_mapping_ == nullptr) {
+  if (mutable_cf_options_.preclude_last_level_data_seconds == 0) {
     return;
   }
+  // SuperVersion should guarantee this
+  assert(seqno_to_time_mapping_);
+  assert(!seqno_to_time_mapping_->Empty());
   int64_t current_time = 0;
   Status s = db_options_.clock->GetCurrentTime(&current_time);
   if (!s.ok()) {
diff --git a/db/job_context.h b/db/job_context.h
index 83e9f5facafd..766502ca4602 100644
--- a/db/job_context.h
+++ b/db/job_context.h
@@ -22,6 +22,9 @@ namespace ROCKSDB_NAMESPACE {
 class MemTable;
 struct SuperVersion;
 
+// The purpose of this struct is to simplify pushing work such as
+// allocation/construction, de-allocation/destruction, and notifications to
+// outside of holding the DB mutex.
 struct SuperVersionContext {
   struct WriteStallNotification {
     WriteStallInfo write_stall_info;
@@ -35,12 +38,6 @@ struct SuperVersionContext {
   std::unique_ptr<SuperVersion>
       new_superversion;  // if nullptr no new superversion
 
-  // If not nullptr, a new seqno to time mapping is available to be installed.
-  // Otherwise, make a shared copy of the one in the existing SuperVersion and
-  // carry it over to the new SuperVersion. This is moved to the SuperVersion
-  // during installation.
-  std::shared_ptr<const SeqnoToTimeMapping> new_seqno_to_time_mapping{nullptr};
-
   explicit SuperVersionContext(bool create_superversion = false)
       : new_superversion(create_superversion ? new SuperVersion() : nullptr) {}
 
diff --git a/db/repair.cc b/db/repair.cc
index 73671154ba5f..39189402936d 100644
--- a/db/repair.cc
+++ b/db/repair.cc
@@ -100,13 +100,13 @@ class Repairer {
         db_options_(SanitizeOptions(dbname_, db_options)),
         immutable_db_options_(ImmutableDBOptions(db_options_)),
         icmp_(default_cf_opts.comparator),
-        default_cf_opts_(
-            SanitizeOptions(immutable_db_options_, default_cf_opts)),
+        default_cf_opts_(SanitizeOptions(immutable_db_options_,
+                                         /*read_only*/ false, default_cf_opts)),
         default_iopts_(
             ImmutableOptions(immutable_db_options_, default_cf_opts_)),
         default_mopts_(MutableCFOptions(default_cf_opts_)),
-        unknown_cf_opts_(
-            SanitizeOptions(immutable_db_options_, unknown_cf_opts)),
+        unknown_cf_opts_(SanitizeOptions(immutable_db_options_,
+                                         /*read_only*/ false, unknown_cf_opts)),
         create_unknown_cfs_(create_unknown_cfs),
         raw_table_cache_(
             // TableCache can be small since we expect each table to be opened
diff --git a/db/seqno_time_test.cc b/db/seqno_time_test.cc
index cb247edfb767..a23f9ce6671f 100644
--- a/db/seqno_time_test.cc
+++ b/db/seqno_time_test.cc
@@ -919,10 +919,11 @@ TEST_P(SeqnoTimeTablePropTest, PrePopulateInDB) {
       ASSERT_EQ(db_->GetLatestSequenceNumber(), 0);
 
       // And even if we re-open read-write, we do not get pre-population,
-      // because that's only for new DBs.
+      // because that's only for new DBs. We just get a single bootstrap
+      // entry as a lower bound on write times of future writes.
       Reopen(track_options);
       sttm = dbfull()->TEST_GetSeqnoToTimeMapping();
-      ASSERT_EQ(sttm.Size(), 0);
+      ASSERT_EQ(sttm.Size(), 1);
       ASSERT_EQ(db_->GetLatestSequenceNumber(), 0);
     }
   }
diff --git a/db/seqno_to_time_mapping.cc b/db/seqno_to_time_mapping.cc
index b540fd919671..36da27c5bf03 100644
--- a/db/seqno_to_time_mapping.cc
+++ b/db/seqno_to_time_mapping.cc
@@ -490,7 +490,7 @@ bool SeqnoToTimeMapping::Append(SequenceNumber seqno, uint64_t time) {
   return added;
 }
 
-bool SeqnoToTimeMapping::PrePopulate(SequenceNumber from_seqno,
+void SeqnoToTimeMapping::PrePopulate(SequenceNumber from_seqno,
                                      SequenceNumber to_seqno,
                                      uint64_t from_time, uint64_t to_time) {
   assert(Empty());
@@ -505,8 +505,6 @@ bool SeqnoToTimeMapping::PrePopulate(SequenceNumber from_seqno,
                                  (to_seqno - from_seqno);
     pairs_.emplace_back(i, t);
   }
-
-  return /*success*/ true;
 }
 
 std::string SeqnoToTimeMapping::ToHumanString() const {
diff --git a/db/seqno_to_time_mapping.h b/db/seqno_to_time_mapping.h
index 741e64369435..a74041fd9a0e 100644
--- a/db/seqno_to_time_mapping.h
+++ b/db/seqno_to_time_mapping.h
@@ -138,7 +138,7 @@ class SeqnoToTimeMapping {
   // Adds a series of mappings interpolating from from_seqno->from_time to
   // to_seqno->to_time. This can only be called on an empty object and both
   // seqno range and time range are inclusive.
-  bool PrePopulate(SequenceNumber from_seqno, SequenceNumber to_seqno,
+  void PrePopulate(SequenceNumber from_seqno, SequenceNumber to_seqno,
                    uint64_t from_time, uint64_t to_time);
 
   // Append a new entry to the list. The `seqno` should be >= all previous
@@ -148,6 +148,10 @@ class SeqnoToTimeMapping {
   // rather than creating a new entry.
   bool Append(SequenceNumber seqno, uint64_t time);
 
+  bool Append(std::pair<SequenceNumber, uint64_t> seqno_time_pair) {
+    return Append(seqno_time_pair.first, seqno_time_pair.second);
+  }
+
   // Clear all entries and (re-)enter enforced mode if not already in that
   // state. Enforced limits are unchanged.
   void Clear() {
@@ -274,6 +278,48 @@ class SeqnoToTimeMapping {
   pair_const_iterator FindGreaterEqSeqno(SequenceNumber seqno) const;
 };
 
+// A struct to help combining settings across column families
+struct MinAndMaxPreserveSeconds {
+  uint64_t min_preserve_seconds = std::numeric_limits<uint64_t>::max();
+  uint64_t max_preserve_seconds = std::numeric_limits<uint64_t>::min();
+
+  MinAndMaxPreserveSeconds() = default;
+
+  template <class CFOpts>
+  explicit MinAndMaxPreserveSeconds(const CFOpts& opts) {
+    Combine(opts);
+  }
+
+  bool IsEnabled() const {
+    return min_preserve_seconds != std::numeric_limits<uint64_t>::max();
+  }
+
+  // Incorporate another CF's settings into the result. If preserve/preclude are
+  // disabled for this CF, they are excluded from the result.
+  template <class CFOpts>
+  void Combine(const CFOpts& opts) {
+    uint64_t preserve_seconds = std::max(opts.preserve_internal_time_seconds,
+                                         opts.preclude_last_level_data_seconds);
+    if (preserve_seconds > 0) {
+      min_preserve_seconds = std::min(preserve_seconds, min_preserve_seconds);
+      max_preserve_seconds = std::max(preserve_seconds, max_preserve_seconds);
+    }
+  }
+
+  // Choose how many seconds between mapping samples
+  uint64_t GetRecodingCadence() const {
+    if (IsEnabled()) {
+      // round up to 1 when the time_duration is smaller than
+      // kMaxSeqnoTimePairsPerCF
+      return (min_preserve_seconds + kMaxSeqnoTimePairsPerCF - 1) /
+             kMaxSeqnoTimePairsPerCF;
+    } else {
+      // disabled
+      return 0;
+    }
+  }
+};
+
 // === Utility methods used for TimedPut === //
 
 // Pack a value Slice and a unix write time into buffer `buf` and return a Slice
diff --git a/db/version_edit_handler.cc b/db/version_edit_handler.cc
index 52947c484cf6..d1b5ee68cedb 100644
--- a/db/version_edit_handler.cc
+++ b/db/version_edit_handler.cc
@@ -471,8 +471,8 @@ void VersionEditHandler::CheckIterationResult(const log::Reader& reader,
 ColumnFamilyData* VersionEditHandler::CreateCfAndInit(
     const ColumnFamilyOptions& cf_options, const VersionEdit& edit) {
   uint32_t cf_id = edit.GetColumnFamily();
-  ColumnFamilyData* cfd =
-      version_set_->CreateColumnFamily(cf_options, read_options_, &edit);
+  ColumnFamilyData* cfd = version_set_->CreateColumnFamily(
+      cf_options, read_options_, &edit, read_only_);
   assert(cfd != nullptr);
   cfd->set_initialized();
   assert(builders_.find(cf_id) == builders_.end());
diff --git a/db/version_set.cc b/db/version_set.cc
index b560713cbbab..da1ad3ea8772 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -5741,7 +5741,8 @@ Status VersionSet::ProcessManifestWrites(
       assert(new_cf_options != nullptr);
       assert(max_last_sequence == descriptor_last_sequence_);
       CreateColumnFamily(*new_cf_options, read_options,
-                         first_writer.edit_list.front());
+                         first_writer.edit_list.front(),
+                         /*read_only*/ false);
     } else if (first_writer.edit_list.front()->IsColumnFamilyDrop()) {
       assert(batch_edits.size() == 1);
       assert(max_last_sequence == descriptor_last_sequence_);
@@ -7294,7 +7295,7 @@ uint64_t VersionSet::GetObsoleteSstFilesSize() const {
 
 ColumnFamilyData* VersionSet::CreateColumnFamily(
     const ColumnFamilyOptions& cf_options, const ReadOptions& read_options,
-    const VersionEdit* edit) {
+    const VersionEdit* edit, bool read_only) {
   assert(edit->IsColumnFamilyAdd());
 
   MutableCFOptions dummy_cf_options;
@@ -7305,7 +7306,7 @@ ColumnFamilyData* VersionSet::CreateColumnFamily(
   dummy_versions->Ref();
   auto new_cfd = column_family_set_->CreateColumnFamily(
       edit->GetColumnFamilyName(), edit->GetColumnFamily(), dummy_versions,
-      cf_options);
+      cf_options, read_only);
 
   Version* v = new Version(new_cfd, this, file_options_,
                            new_cfd->GetLatestMutableCFOptions(), io_tracer_,
diff --git a/db/version_set.h b/db/version_set.h
index 72ae58f162c8..d9cc5a8e07ee 100644
--- a/db/version_set.h
+++ b/db/version_set.h
@@ -1625,7 +1625,7 @@ class VersionSet {
 
   ColumnFamilyData* CreateColumnFamily(const ColumnFamilyOptions& cf_options,
                                        const ReadOptions& read_options,
-                                       const VersionEdit* edit);
+                                       const VersionEdit* edit, bool read_only);
 
   Status VerifyFileMetadata(const ReadOptions& read_options,
                             ColumnFamilyData* cfd, const std::string& fpath,
diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index 1f618c82321b..831de21fd9d3 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -455,7 +455,7 @@ def is_direct_io_supported(dbname):
     # since we will be killing anyway, use large value for ops_per_thread
     "ops_per_thread": 100000000,
     "reopen": 0,
-    "set_options_one_in": 2000,
+    "set_options_one_in": 1000,
 }
 
 whitebox_default_params = {

From 15873b1fdd3a4aa7a21825616623e4fb482443e7 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Tue, 4 Mar 2025 18:33:52 -0800
Subject: [PATCH 006/500] New CF option disallow_memtable_writes (#13431)

Summary:
... to better support "ingestion only" column families such as those using an external file reader as in https://github.com/facebook/rocksdb/issues/13401.

It would be possible to implement this by getting rid of the memtable for that CF, but it quickly because clear that such an approach would need to update a lot of places to deal with such a possibility. And we already have logic to optimize reads when a memtable is empty. We put a vector memtable in place to minimize overheads of an empty memtable.

There are three layers of defense against writes to the memtable:
* WriteBatch ops to a disallowed CF will fail immediately, without waiting for Write(). For this check to work, we need a ColumnFamilyHandle and because of that, we don't support disallow_memtable_writes on the default column family.
* MemtableInserter will reject writes to disallowed CFs. This is needed to protect re-open with disallow when there are existing writes in a WAL.
* The placeholder memtable is marked immutable. This will cause an assertion failure on attempt to write, such as in case of bug or regression.

Suggested follow-up:
* Remove the limitation on using the option with the default column family, perhaps by solving https://github.com/facebook/rocksdb/issues/13429 more generally or perhaps with some specific check before the first memtable write of the batch (but potential CPU overhead for such a check - there's likely optimization opportunities around ColumnFamilyMemTables).

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13431

Test Plan:
unit tests added

Performance: A db_bench call designed to realistically focus on the CPU cost of writes:

```
./db_bench -db=/dev/shm/dbbench1 --benchmarks=fillrandom -num=10000000 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=1000 -fifo_compaction_allow_compaction=0 -num_column_families=20 -disable_wal -write_buffer_size=1234000
```

Running before & after tests at the same time on the same machine, 40 iterations each, average ops/s, DEBUG_LEVEL=0, remove slowest run of each:
Before: 772466
After: 773785 (0.2% faster)

Likely within the noise, as if there was any change, we would expect a slight regression.

Reviewed By: anand1976

Differential Revision: D70495936

Pulled By: pdillinger

fbshipit-source-id: 306f7e737f87c1fbb52c5805f3cadb6e8ced9b40
---
 db/column_family.cc                |   7 +-
 db/column_family.h                 |   3 +
 db/db_basic_test.cc                |  91 ++++++++++++++++++++++++
 db/db_impl/db_impl_open.cc         |   6 ++
 db/external_sst_file_basic_test.cc | 108 +++++++++++++++++++----------
 db/write_batch.cc                  |  19 +++++
 include/rocksdb/advanced_options.h |  11 +++
 options/cf_options.cc              |   5 ++
 options/cf_options.h               |   2 +
 options/options.cc                 |   3 +
 options/options_helper.cc          |   1 +
 options/options_settable_test.cc   |   1 +
 12 files changed, 218 insertions(+), 39 deletions(-)

diff --git a/db/column_family.cc b/db/column_family.cc
index 6b54454f8ff5..8c6bf9c96b9c 100644
--- a/db/column_family.cc
+++ b/db/column_family.cc
@@ -168,7 +168,8 @@ Status CheckConcurrentWritesSupported(const ColumnFamilyOptions& cf_options) {
   }
   if (!cf_options.memtable_factory->IsInsertConcurrentlySupported()) {
     return Status::InvalidArgument(
-        "Memtable doesn't allow concurrent writes (allow_concurrent_memtable_write)");
+        "Memtable doesn't allow concurrent writes "
+        "(allow_concurrent_memtable_write)");
   }
   return Status::OK();
 }
@@ -240,6 +241,10 @@ ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options,
 
     result.min_write_buffer_number_to_merge = 1;
   }
+  if (result.disallow_memtable_writes) {
+    // A simple memtable that enforces MarkReadOnly (unlike skip list)
+    result.memtable_factory = std::make_shared<VectorRepFactory>();
+  }
 
   if (result.num_levels < 1) {
     result.num_levels = 1;
diff --git a/db/column_family.h b/db/column_family.h
index 89ededb502cd..5e18c90a1bad 100644
--- a/db/column_family.h
+++ b/db/column_family.h
@@ -393,6 +393,9 @@ class ColumnFamilyData {
   void SetMemtable(MemTable* new_mem) {
     AssignMemtableID(new_mem);
     mem_ = new_mem;
+    if (ioptions_.disallow_memtable_writes) {
+      mem_->MarkImmutable();
+    }
   }
 
   void AssignMemtableID(ReadOnlyMemTable* new_imm) {
diff --git a/db/db_basic_test.cc b/db/db_basic_test.cc
index 27ef6124943a..5ab97a7340dd 100644
--- a/db/db_basic_test.cc
+++ b/db/db_basic_test.cc
@@ -5057,6 +5057,97 @@ TEST_F(DBBasicTest, VerifyFileChecksumsReadahead) {
             (sst_size + alignment - 1) / (alignment));
 }
 
+TEST_F(DBBasicTest, DisallowMemtableWrite) {
+  // This test is mostly about what you can't do with memtable writes
+  // disallowed. For what you can do, see
+  // ExternalSSTFileBasicTest.FailIfNotBottommostLevelAndDisallowMemtable
+  Options options_allow = GetDefaultOptions();
+  options_allow.create_if_missing = true;
+  Options options_disallow = options_allow;
+  options_disallow.disallow_memtable_writes = true;
+
+  DestroyAndReopen(options_allow);
+  // CFs allowing and disallowing memtable write
+  CreateColumnFamilies({"cf1", "cf2"}, options_allow);
+  CreateColumnFamilies({"cf3"}, options_disallow);
+  // XXX: needed to get consistent handles_ mappings
+  ReopenWithColumnFamilies(
+      {"default", "cf1", "cf2", "cf3"},
+      {options_allow, options_allow, options_allow, options_disallow});
+
+  EXPECT_EQ(Put(0, "a0", "1").code(), Status::Code::kOk);
+  EXPECT_EQ(Put(1, "a1", "1").code(), Status::Code::kOk);
+  EXPECT_EQ(Put(2, "a2", "1").code(), Status::Code::kOk);
+  EXPECT_EQ(Put(3, "a3", "1").code(), Status::Code::kInvalidArgument);
+
+  EXPECT_EQ(Get(0, "a0"), "1");
+  EXPECT_EQ(Get(1, "a1"), "1");
+  EXPECT_EQ(Get(2, "a2"), "1");
+  EXPECT_EQ(Get(3, "a3"), "NOT_FOUND");
+
+  EXPECT_EQ(Delete(0, "z0").code(), Status::Code::kOk);
+  EXPECT_EQ(Delete(1, "z1").code(), Status::Code::kOk);
+  EXPECT_EQ(Delete(2, "z2").code(), Status::Code::kOk);
+  EXPECT_EQ(Delete(3, "z3").code(), Status::Code::kInvalidArgument);
+
+  WriteBatch wb;
+  EXPECT_EQ(wb.Put(handles_[0], "b0", "2").code(), Status::Code::kOk);
+  EXPECT_EQ(wb.Put(handles_[1], "b1", "2").code(), Status::Code::kOk);
+  EXPECT_EQ(wb.Put(handles_[2], "b2", "2").code(), Status::Code::kOk);
+  EXPECT_EQ(wb.Put(handles_[3], "b3", "2").code(),
+            Status::Code::kInvalidArgument);
+  ASSERT_OK(db_->Write({}, &wb));
+  wb.Clear();
+
+  EXPECT_EQ(Get(0, "b0"), "2");
+  EXPECT_EQ(Get(1, "b1"), "2");
+  EXPECT_EQ(Get(2, "b2"), "2");
+  EXPECT_EQ(Get(3, "b3"), "NOT_FOUND");
+
+  // When the DB is re-opened with WAL entries for a CF that is newly setting
+  // disallow_memtable_writes, we detect that and fail the open gracefully.
+  ASSERT_EQ(TryReopenWithColumnFamilies(
+                {"default", "cf1", "cf2", "cf3"},
+                {options_allow, options_allow, options_disallow, options_allow})
+                .code(),
+            Status::Code::kInvalidArgument);
+
+  // Successfully opening with allow creates L0 files from the WAL
+  ReopenWithColumnFamilies({"default", "cf1", "cf2", "cf3"}, options_allow);
+
+  EXPECT_EQ(Get(0, "a0"), "1");
+  EXPECT_EQ(Get(1, "a1"), "1");
+  EXPECT_EQ(Get(2, "a2"), "1");
+  EXPECT_EQ(Get(3, "a3"), "NOT_FOUND");
+
+  // Now able to disallow on CF2 because no relevant WAL entries
+  ReopenWithColumnFamilies(
+      {"default", "cf1", "cf2", "cf3"},
+      {options_allow, options_allow, options_disallow, options_allow});
+
+  EXPECT_EQ(Get(0, "a0"), "1");
+  EXPECT_EQ(Get(1, "a1"), "1");
+  EXPECT_EQ(Get(2, "a2"), "1");
+  EXPECT_EQ(Get(3, "a3"), "NOT_FOUND");
+
+  // Now able to write to CF 3 but not CF 2
+  EXPECT_EQ(Put(0, "c0", "3").code(), Status::Code::kOk);
+  EXPECT_EQ(Put(1, "c1", "3").code(), Status::Code::kOk);
+  EXPECT_EQ(Put(2, "c2", "3").code(), Status::Code::kInvalidArgument);
+  EXPECT_EQ(Put(3, "c3", "3").code(), Status::Code::kOk);
+
+  EXPECT_EQ(Get(0, "c0"), "3");
+  EXPECT_EQ(Get(1, "c1"), "3");
+  EXPECT_EQ(Get(2, "c2"), "NOT_FOUND");
+  EXPECT_EQ(Get(3, "c3"), "3");
+
+  // disallow_memtable_writes not supported on default column family.
+  // (Would be complicated to make a WriteBatch aware of the setting in order
+  // to reject the write before entering the write path.)
+  Destroy(options_allow);
+  EXPECT_EQ(TryReopen(options_disallow).code(), Status::Code::kInvalidArgument);
+}
+
 // TODO: re-enable after we provide finer-grained control for WAL tracking to
 // meet the needs of different use cases, durability levels and recovery modes.
 TEST_F(DBBasicTest, DISABLED_ManualWalSync) {
diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc
index 85a491066ebf..22b1cfd7c710 100644
--- a/db/db_impl/db_impl_open.cc
+++ b/db/db_impl/db_impl_open.cc
@@ -224,6 +224,12 @@ Status DBImpl::ValidateOptions(
     if (!s.ok()) {
       return s;
     }
+    if (cfd.name == kDefaultColumnFamilyName) {
+      if (cfd.options.disallow_memtable_writes) {
+        return Status::InvalidArgument(
+            "Default column family cannot use disallow_memtable_writes=true");
+      }
+    }
   }
   s = ValidateOptions(db_options);
   return s;
diff --git a/db/external_sst_file_basic_test.cc b/db/external_sst_file_basic_test.cc
index 69b2668aea80..f8108651ec4c 100644
--- a/db/external_sst_file_basic_test.cc
+++ b/db/external_sst_file_basic_test.cc
@@ -2669,51 +2669,83 @@ TEST_F(ExternalSSTFileBasicTest, IngestWithTemperature) {
   }
 }
 
-TEST_F(ExternalSSTFileBasicTest, FailIfNotBottommostLevel) {
-  Options options = GetDefaultOptions();
+TEST_F(ExternalSSTFileBasicTest, FailIfNotBottommostLevelAndDisallowMemtable) {
+  for (bool disallow_memtable : {false, true}) {
+    Options options = GetDefaultOptions();
 
-  std::string file_path = sst_files_dir_ + std::to_string(1);
-  SstFileWriter sfw(EnvOptions(), options);
+    // First test with universal compaction
+    options.create_if_missing = true;
+    options.compaction_style = CompactionStyle::kCompactionStyleUniversal;
+    DestroyAndReopen(options);
 
-  ASSERT_OK(sfw.Open(file_path));
-  ASSERT_OK(sfw.Put("b", "dontcare"));
-  ASSERT_OK(sfw.Finish());
+    // And a CF potentially disallowing memtable write
+    options.disallow_memtable_writes = disallow_memtable;
+    CreateColumnFamilies({"cf0"}, options);
+    ASSERT_EQ(db_->GetOptions(handles_[0]).disallow_memtable_writes,
+              disallow_memtable);
 
-  // Test universal compaction + ingest with snapshot consistency
-  options.create_if_missing = true;
-  options.compaction_style = CompactionStyle::kCompactionStyleUniversal;
-  DestroyAndReopen(options);
-  {
-    const Snapshot* snapshot = db_->GetSnapshot();
-    ManagedSnapshot snapshot_guard(db_, snapshot);
-    IngestExternalFileOptions ifo;
-    ifo.fail_if_not_bottommost_level = true;
-    ifo.snapshot_consistency = true;
-    const Status s = db_->IngestExternalFile({file_path}, ifo);
-    ASSERT_TRUE(s.ok());
-  }
+    // Ingest with snapshot consistency
+    std::string file_path = sst_files_dir_ + std::to_string(1);
+    SstFileWriter sfw(EnvOptions(), options);
 
-  // Test level compaction
-  options.compaction_style = CompactionStyle::kCompactionStyleLevel;
-  options.num_levels = 2;
-  DestroyAndReopen(options);
-  ASSERT_OK(db_->Put(WriteOptions(), "a", "dontcare"));
-  ASSERT_OK(db_->Put(WriteOptions(), "c", "dontcare"));
-  ASSERT_OK(db_->Flush(FlushOptions()));
+    ASSERT_OK(sfw.Open(file_path));
+    ASSERT_OK(sfw.Put("b", "dontcare"));
+    ASSERT_OK(sfw.Finish());
 
-  ASSERT_OK(db_->Put(WriteOptions(), "b", "dontcare"));
-  ASSERT_OK(db_->Put(WriteOptions(), "d", "dontcare"));
-  ASSERT_OK(db_->Flush(FlushOptions()));
+    {
+      const Snapshot* snapshot = db_->GetSnapshot();
+      ManagedSnapshot snapshot_guard(db_, snapshot);
+      IngestExternalFileOptions ifo;
+      ifo.fail_if_not_bottommost_level = true;
+      ifo.snapshot_consistency = true;
+      ASSERT_OK(db_->IngestExternalFile(handles_[0], {file_path}, ifo));
+    }
 
-  {
-    CompactRangeOptions cro;
-    cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
-    ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+    // Test level compaction
+    options.compaction_style = CompactionStyle::kCompactionStyleLevel;
+    options.num_levels = 2;
+    CreateColumnFamilies({"cf1"}, options);
+    ASSERT_EQ(db_->GetOptions(handles_[1]).disallow_memtable_writes,
+              disallow_memtable);
+
+    if (!disallow_memtable) {
+      ASSERT_OK(Put(1, "a", "1"));
+      ASSERT_OK(Put(1, "c", "3"));
+      ASSERT_OK(Flush(1));
+
+      ASSERT_OK(Put(1, "b", "2"));
+      ASSERT_OK(Put(1, "d", "4"));
+      ASSERT_OK(Flush(1));
+    } else {
+      // Memtable write disallowed
+      EXPECT_EQ(Put(1, "a", "1").code(), Status::Code::kInvalidArgument);
+
+      // Use ingestion to get to the same state as above
+      std::string file_path2 = sst_files_dir_ + std::to_string(2);
+
+      ASSERT_OK(sfw.Open(file_path2));
+      ASSERT_OK(sfw.Put("a", "1"));
+      ASSERT_OK(sfw.Put("c", "3"));
+      ASSERT_OK(sfw.Finish());
+      ASSERT_OK(db_->IngestExternalFile(handles_[1], {file_path2}, {}));
+
+      ASSERT_OK(sfw.Open(file_path2));
+      ASSERT_OK(sfw.Put("b", "2"));
+      ASSERT_OK(sfw.Put("d", "4"));
+      ASSERT_OK(sfw.Finish());
+      ASSERT_OK(db_->IngestExternalFile(handles_[1], {file_path2}, {}));
+    }
 
-    IngestExternalFileOptions ifo;
-    ifo.fail_if_not_bottommost_level = true;
-    const Status s = db_->IngestExternalFile({file_path}, ifo);
-    ASSERT_TRUE(s.IsTryAgain());
+    {
+      CompactRangeOptions cro;
+      cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+      ASSERT_OK(db_->CompactRange(cro, handles_[1], nullptr, nullptr));
+
+      IngestExternalFileOptions ifo;
+      ifo.fail_if_not_bottommost_level = true;
+      const Status s = db_->IngestExternalFile(handles_[1], {file_path}, ifo);
+      ASSERT_TRUE(s.IsTryAgain());
+    }
   }
 }
 
diff --git a/db/write_batch.cc b/db/write_batch.cc
index 15034e5c3fcc..84dbd06d0255 100644
--- a/db/write_batch.cc
+++ b/db/write_batch.cc
@@ -815,6 +815,12 @@ WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(
         s = Status::InvalidArgument("Default cf timestamp size mismatch");
       }
     }
+    auto* cfd =
+        static_cast_with_check<ColumnFamilyHandleImpl>(column_family)->cfd();
+    if (cfd && cfd->ioptions().disallow_memtable_writes) {
+      s = Status::InvalidArgument(
+          "This column family has disallow_memtable_writes=true");
+    }
   } else if (b->default_cf_ts_sz_ > 0) {
     ts_sz = b->default_cf_ts_sz_;
   }
@@ -836,6 +842,12 @@ Status CheckColumnFamilyTimestampSize(ColumnFamilyHandle* column_family,
   if (cf_ts_sz != ts.size()) {
     return Status::InvalidArgument("timestamp size mismatch");
   }
+  auto* cfd =
+      static_cast_with_check<ColumnFamilyHandleImpl>(column_family)->cfd();
+  if (cfd && cfd->ioptions().disallow_memtable_writes) {
+    return Status::InvalidArgument(
+        "This column family has disallow_memtable_writes=true");
+  }
   return Status::OK();
 }
 }  // anonymous namespace
@@ -2185,6 +2197,13 @@ class MemTableInserter : public WriteBatch::Handler {
       }
       return false;
     }
+    auto* current = cf_mems_->current();
+    if (current && current->ioptions().disallow_memtable_writes) {
+      *s = Status::InvalidArgument(
+          "This column family has disallow_memtable_writes=true");
+      return false;
+    }
+
     if (recovering_log_number_ != 0 &&
         recovering_log_number_ < cf_mems_->GetLogNumber()) {
       // This is true only in recovery environment (recovering_log_number_ is
diff --git a/include/rocksdb/advanced_options.h b/include/rocksdb/advanced_options.h
index ad9b90f735bb..369603e7f7c8 100644
--- a/include/rocksdb/advanced_options.h
+++ b/include/rocksdb/advanced_options.h
@@ -719,6 +719,17 @@ struct AdvancedColumnFamilyOptions {
   // Dynamically changeable through SetOptions() API
   bool report_bg_io_stats = false;
 
+  // Setting this option to true disallows ordinary writes to the column family
+  // and it can only be populated through import and ingestion. It is intended
+  // to protect "ingestion only" column families. This option is not currently
+  // supported on the default column family because of error handling challenges
+  // analogous to https://github.com/facebook/rocksdb/issues/13429
+  //
+  // This option is not mutable with SetOptions(). It can be changed between
+  // DB::Open() calls, but open will fail if recovering WAL writes to a CF with
+  // this option set.
+  bool disallow_memtable_writes = false;
+
   // This option has different meanings for different compaction styles:
   //
   // Leveled: Non-bottom-level files with all keys older than TTL will go
diff --git a/options/cf_options.cc b/options/cf_options.cc
index d50eade93209..85ecc994f39b 100644
--- a/options/cf_options.cc
+++ b/options/cf_options.cc
@@ -736,6 +736,10 @@ static std::unordered_map<std::string, OptionTypeInfo>
          {offsetof(struct ImmutableCFOptions, force_consistency_checks),
           OptionType::kBoolean, OptionVerificationType::kNormal,
           OptionTypeFlags::kNone}},
+        {"disallow_memtable_writes",
+         {offsetof(struct ImmutableCFOptions, disallow_memtable_writes),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
         {"default_temperature",
          {offsetof(struct ImmutableCFOptions, default_temperature),
           OptionType::kTemperature, OptionVerificationType::kNormal,
@@ -998,6 +1002,7 @@ ImmutableCFOptions::ImmutableCFOptions(const ColumnFamilyOptions& cf_options)
       num_levels(cf_options.num_levels),
       optimize_filters_for_hits(cf_options.optimize_filters_for_hits),
       force_consistency_checks(cf_options.force_consistency_checks),
+      disallow_memtable_writes(cf_options.disallow_memtable_writes),
       default_temperature(cf_options.default_temperature),
       memtable_insert_with_hint_prefix_extractor(
           cf_options.memtable_insert_with_hint_prefix_extractor),
diff --git a/options/cf_options.h b/options/cf_options.h
index 751e7b46d52b..51236394e342 100644
--- a/options/cf_options.h
+++ b/options/cf_options.h
@@ -68,6 +68,8 @@ struct ImmutableCFOptions {
 
   bool force_consistency_checks;
 
+  bool disallow_memtable_writes;
+
   Temperature default_temperature;
 
   std::shared_ptr<const SliceTransform>
diff --git a/options/options.cc b/options/options.cc
index c1e68260a14d..2ee431406651 100644
--- a/options/options.cc
+++ b/options/options.cc
@@ -90,6 +90,7 @@ AdvancedColumnFamilyOptions::AdvancedColumnFamilyOptions(const Options& options)
       paranoid_file_checks(options.paranoid_file_checks),
       force_consistency_checks(options.force_consistency_checks),
       report_bg_io_stats(options.report_bg_io_stats),
+      disallow_memtable_writes(options.disallow_memtable_writes),
       ttl(options.ttl),
       periodic_compaction_seconds(options.periodic_compaction_seconds),
       sample_for_compression(options.sample_for_compression),
@@ -395,6 +396,8 @@ void ColumnFamilyOptions::Dump(Logger* log) const {
                    force_consistency_checks);
   ROCKS_LOG_HEADER(log, "               Options.report_bg_io_stats: %d",
                    report_bg_io_stats);
+  ROCKS_LOG_HEADER(log, "               Options.disallow_memtable_writes: %d",
+                   disallow_memtable_writes);
   ROCKS_LOG_HEADER(log, "                              Options.ttl: %" PRIu64,
                    ttl);
   ROCKS_LOG_HEADER(log,
diff --git a/options/options_helper.cc b/options/options_helper.cc
index fad122166a0a..70311b2e8394 100644
--- a/options/options_helper.cc
+++ b/options/options_helper.cc
@@ -326,6 +326,7 @@ void UpdateColumnFamilyOptions(const ImmutableCFOptions& ioptions,
   cf_opts->num_levels = ioptions.num_levels;
   cf_opts->optimize_filters_for_hits = ioptions.optimize_filters_for_hits;
   cf_opts->force_consistency_checks = ioptions.force_consistency_checks;
+  cf_opts->disallow_memtable_writes = ioptions.disallow_memtable_writes;
   cf_opts->memtable_insert_with_hint_prefix_extractor =
       ioptions.memtable_insert_with_hint_prefix_extractor;
   cf_opts->cf_paths = ioptions.cf_paths;
diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc
index d6660908d8b8..6d777ee18735 100644
--- a/options/options_settable_test.cc
+++ b/options/options_settable_test.cc
@@ -644,6 +644,7 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) {
       "hard_pending_compaction_bytes_limit=0;"
       "disable_auto_compactions=false;"
       "report_bg_io_stats=true;"
+      "disallow_memtable_writes=true;"
       "ttl=60;"
       "periodic_compaction_seconds=3600;"
       "sample_for_compression=0;"

From ec8f1452f5214f554f4d492aa33be83555202494 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Wed, 5 Mar 2025 14:32:05 -0800
Subject: [PATCH 007/500] Temp disable in crash test: secondary instance +
 seqno-time tracking (#13439)

Summary:
PR https://github.com/facebook/rocksdb/issues/13316 broke some crash test cases in DBImplSecondary, from combining test_secondary=1 and preserve_internal_time_seconds>0. Disabling that while investigating the fix.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13439

Test Plan: manual blackbox_crash_test runs with forced test_secondary=1

Reviewed By: anand1976

Differential Revision: D70656373

Pulled By: pdillinger

fbshipit-source-id: fa2139e90bbe64ec8ebb062877d9337894ea3b43
---
 tools/db_crashtest.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index 831de21fd9d3..6503260658ef 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -1030,6 +1030,9 @@ def finalize_and_sanitize(src_params):
     # Continuous verification fails with secondaries inside NonBatchedOpsStressTest
     if dest_params.get("test_secondary") == 1:
         dest_params["continuous_verification_interval"] = 0
+        # FIXME: temporarily broken combination
+        dest_params["preserve_internal_time_seconds"] = 0
+        dest_params["preclude_last_level_data_seconds"] = 0
     return dest_params
 
 

From f6bff87b92577e5f0f2d8289a736103682d27070 Mon Sep 17 00:00:00 2001
From: anand76 <anand1976@users.noreply.github.com>
Date: Wed, 5 Mar 2025 16:25:41 -0800
Subject: [PATCH 008/500] Add opaque options in ReadOptions for external tables
 (#13436)

Summary:
Add an unordered_map of name/value pairs in ReadOptions::property_bag, similar to IOOptions::property_bag. It allows users to pass through some custom options to an external table.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13436

Reviewed By: jaykorean

Differential Revision: D70649609

Pulled By: anand1976

fbshipit-source-id: 9b14806a9f3599b861827bd4ae6e948861edc51a
---
 include/rocksdb/options.h                                   | 6 ++++++
 .../public_api_changes/read_options_property_bag.md         | 1 +
 2 files changed, 7 insertions(+)
 create mode 100644 unreleased_history/public_api_changes/read_options_property_bag.md

diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index 796de1fef086..06e3e99681dd 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -1985,6 +1985,12 @@ struct ReadOptions {
   // reader implementation.
   uint64_t weight = 0;
 
+  // A map of name,value pairs that can be passed by the user to an
+  // external table reader. This is completely opaque to RocksDB and is
+  // ignored by the natively supported table readers like block based and plain
+  // table. This is only useful for Iterator.
+  std::optional<std::unordered_map<std::string, std::string>> property_bag;
+
   // *** END options for RocksDB internal use only ***
 
   ReadOptions() {}
diff --git a/unreleased_history/public_api_changes/read_options_property_bag.md b/unreleased_history/public_api_changes/read_options_property_bag.md
new file mode 100644
index 000000000000..5b9b58e1ddb6
--- /dev/null
+++ b/unreleased_history/public_api_changes/read_options_property_bag.md
@@ -0,0 +1 @@
+Add an unordered map of name/value pairs, ReadOptions::property_bag, to pass opaque options through to an external table when creating an Iterator.

From 14c949df8ba25bb8518b4283f9dd7e002bdf8e4a Mon Sep 17 00:00:00 2001
From: anand76 <anand1976@users.noreply.github.com>
Date: Wed, 5 Mar 2025 16:30:46 -0800
Subject: [PATCH 009/500] Initial implementation of ExternalTableBuilder
 (#13434)

Summary:
This PR adds the ability to use an ExternalTableBuilder through the SstFileWriter to create external tables. This is a counterpart to https://github.com/facebook/rocksdb/issues/13401 , which adds the ExternalTableReader. The support for external tables is confined to ingestion only DBs, with external table files ingested into the bottommost level only. https://github.com/facebook/rocksdb/issues/13431 enforces ingestion only DBs by adding a disallow_memtable_writes column family option.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13434

Test Plan: New unit tests in table_test.cc

Reviewed By: pdillinger

Differential Revision: D70532054

Pulled By: anand1976

fbshipit-source-id: a837487eadfabed9627a0eceb403bfc5fc2c427c
---
 BUCK                                          |   2 +-
 CMakeLists.txt                                |   2 +-
 ...ternal_table_reader.h => external_table.h} | 115 ++++++++--
 src.mk                                        |   2 +-
 ...rnal_table_reader.cc => external_table.cc} |  95 +++++++-
 table/table_test.cc                           | 215 +++++++++++++++---
 6 files changed, 367 insertions(+), 64 deletions(-)
 rename include/rocksdb/{external_table_reader.h => external_table.h} (50%)
 rename table/{external_table_reader.cc => external_table.cc} (68%)

diff --git a/BUCK b/BUCK
index bffed60e4add..811fcd5a3854 100644
--- a/BUCK
+++ b/BUCK
@@ -214,7 +214,7 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[
         "table/cuckoo/cuckoo_table_builder.cc",
         "table/cuckoo/cuckoo_table_factory.cc",
         "table/cuckoo/cuckoo_table_reader.cc",
-        "table/external_table_reader.cc",
+        "table/external_table.cc",
         "table/format.cc",
         "table/get_context.cc",
         "table/iterator.cc",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index cce07d70fec7..182b4cde1514 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -835,7 +835,7 @@ set(SOURCES
         table/cuckoo/cuckoo_table_builder.cc
         table/cuckoo/cuckoo_table_factory.cc
         table/cuckoo/cuckoo_table_reader.cc
-        table/external_table_reader.cc
+        table/external_table.cc
         table/format.cc
         table/get_context.cc
         table/iterator.cc
diff --git a/include/rocksdb/external_table_reader.h b/include/rocksdb/external_table.h
similarity index 50%
rename from include/rocksdb/external_table_reader.h
rename to include/rocksdb/external_table.h
index 9bba9f4f3eff..87f4b9e6ba0d 100644
--- a/include/rocksdb/external_table_reader.h
+++ b/include/rocksdb/external_table.h
@@ -6,6 +6,7 @@
 #pragma once
 
 #include "rocksdb/customizable.h"
+#include "rocksdb/file_checksum.h"
 #include "rocksdb/iterator.h"
 #include "rocksdb/options.h"
 #include "rocksdb/status.h"
@@ -18,22 +19,21 @@ class ExternalTableFactory;
 // The interface defined in this file is subject to change at any time without
 // warning!!
 
-// This file defines an interface for plugging in an external table reader
+// This file defines an interface for plugging in an external table
 // into RocksDB. The external table reader will be used instead of the
-// BlockBasedTable to load and query sst files. As of now, creating the
-// external table files using RocksDB is not supported, but will be added in
-// the near future. The external table files can be created outside and
-// RocksDB and ingested into a RocksDB instance using the IngestExternalFIle()
-// API.
+// BlockBasedTable to load and query sst files.
+// The external table files can be created using an SstFileWriter. Eventually
+// external tables will be allowed to be ingested into a RocksDB instance
+// using the IngestExternalFIle() API.
 //
-// Initial support is for loading and querying the files using an
-// SstFileReader. We will add support for ingestion of an external table
-// into a limited RocksDB instance that only supports ingestion and not live
-// writes in the near future. It'll be followed by support for replacing the
-// column family by ingesting a new set of files. In all cases, the external
-// table files will only be allowed in the bottommost level.
+// Initial support is for writing and querying the files using an
+// SstFileWriter and SstFileReader. We will add support for ingestion of an
+// external table into a limited RocksDB instance that only supports ingestion
+// and not live writes in the near future. It'll be followed by support for
+// replacing the column family by ingesting a new set of files. In all cases,
+// the external table files will only be allowed in the bottommost level.
 //
-// The external table reader can support one or both of the following layouts -
+// The external table can support one or both of the following layouts -
 // 1. Total order seek - All the keys in the files are in sorted order, and a
 //    user can seek to the first, last, or any key in between and iterate
 //    forwards or backwards till the end of the range. To support this mode,
@@ -54,8 +54,8 @@ class ExternalTableFactory;
 //    true to seek to the first non-empty prefix (as determined by the key
 //    order) if the seek prefix is empty.
 //
-// Many of the options in ReadOptions may not be relevant to the external
-// table implementation.
+// Many of the options in ReadOptions and WriteOptions may not be relevant to
+// the external table implementation.
 // TODO: Specify which options are relevant
 
 class ExternalTableReader {
@@ -94,6 +94,62 @@ class ExternalTableReader {
   }
 };
 
+// A table builder interface that can be used by SstFileWriter to allow
+// RocksDB users to write external table files. The sequence of operations
+// to write an external table is as follows -
+// 1. Add() is called one or more times to write all key-values to the table.
+//    Its called in increasing key order, as determined by the comparator.
+//    The input key is a user key, i.e sequence number and value type are
+//    stripped out.
+// 2. After every Add() operation, status() is called to check the current
+//    status.
+// 3. After the last key is added, Finish() is called to do whatever is
+//    necessary to ensure the data is persisted in the table file.
+// 4. If there is a failure midway for some reason, Abandon() is called
+//    instead of Finish().
+// 5. At the end, FileSize(), GetTableProperties(), and status() are called to
+//    get the final size of the file, the table properties, and the final
+//    status. GetFileChecksum() and GetFileChecksumFuncName() may also be
+//    called to get checksum information about the whole file, but their
+//    implementation is optional.
+class ExternalTableBuilder {
+ public:
+  virtual ~ExternalTableBuilder() {}
+
+  // Write a single KV to the table file. This is guaranteed to be called
+  // in key order, and the write may be buffered and flushed at a later time.
+  virtual void Add(const Slice& key, const Slice& value) = 0;
+
+  // Return the current Status. This could return non-ok, for example, if
+  // Add() fails for some reason.
+  virtual Status status() const = 0;
+
+  // Flush and close the table file
+  virtual Status Finish() = 0;
+
+  // Delete the partial file and release any allocated resources. Either this
+  // or Finish() will be called, but not both.
+  virtual void Abandon() = 0;
+
+  // Return the size of the table file. Will be called at the end, after
+  // Finish().
+  virtual uint64_t FileSize() const = 0;
+
+  //  As mentioned in earlier comments, the following table properties must be
+  //  returned at a minimum -
+  //  comparator_name
+  //  num_entries
+  //  raw_key_size
+  //  raw_value_size
+  virtual TableProperties GetTableProperties() const = 0;
+
+  virtual std::string GetFileChecksum() const { return kUnknownFileChecksum; }
+
+  virtual const char* GetFileChecksumFuncName() const {
+    return kUnknownFileChecksumFuncName;
+  }
+};
+
 struct ExternalTableOptions {
   const std::shared_ptr<const SliceTransform>& prefix_extractor;
   const Comparator* comparator;
@@ -104,6 +160,29 @@ struct ExternalTableOptions {
       : prefix_extractor(_prefix_extractor), comparator(_comparator) {}
 };
 
+struct ExternalTableBuilderOptions {
+  const ReadOptions& read_options;
+  const WriteOptions& write_options;
+  const std::shared_ptr<const SliceTransform>& prefix_extractor;
+  const Comparator* comparator;
+  const std::string& column_family_name;
+  const std::string db_id;
+  const std::string db_session_id;
+  const TableFileCreationReason reason;
+
+  ExternalTableBuilderOptions(
+      const ReadOptions& _read_options, const WriteOptions& _write_options,
+      const std::shared_ptr<const SliceTransform>& _prefix_extractor,
+      const Comparator* _comparator, const std::string& _column_family_name,
+      const TableFileCreationReason _reason)
+      : read_options(_read_options),
+        write_options(_write_options),
+        prefix_extractor(_prefix_extractor),
+        comparator(_comparator),
+        column_family_name(_column_family_name),
+        reason(_reason) {}
+};
+
 class ExternalTableFactory : public Customizable {
  public:
   ~ExternalTableFactory() override {}
@@ -113,7 +192,11 @@ class ExternalTableFactory : public Customizable {
   virtual Status NewTableReader(
       const ReadOptions& read_options, const std::string& file_path,
       const ExternalTableOptions& table_options,
-      std::unique_ptr<ExternalTableReader>* table_reader) = 0;
+      std::unique_ptr<ExternalTableReader>* table_reader) const = 0;
+
+  virtual ExternalTableBuilder* NewTableBuilder(
+      const ExternalTableBuilderOptions& builder_options,
+      const std::string& file_path) const = 0;
 };
 
 // Allocate a TableFactory that wraps around an ExternalTableFactory. Use this
diff --git a/src.mk b/src.mk
index 3f1de6688684..a25348b919e9 100644
--- a/src.mk
+++ b/src.mk
@@ -205,7 +205,7 @@ LIB_SOURCES =                                                   \
   table/cuckoo/cuckoo_table_builder.cc                          \
   table/cuckoo/cuckoo_table_factory.cc                          \
   table/cuckoo/cuckoo_table_reader.cc                           \
-  table/external_table_reader.cc				\
+  table/external_table.cc					\
   table/format.cc                                               \
   table/get_context.cc                                          \
   table/iterator.cc                                             \
diff --git a/table/external_table_reader.cc b/table/external_table.cc
similarity index 68%
rename from table/external_table_reader.cc
rename to table/external_table.cc
index fdd0de0a0674..83c313a3d7b1 100644
--- a/table/external_table_reader.cc
+++ b/table/external_table.cc
@@ -3,7 +3,7 @@
 //  COPYING file in the root directory) and Apache 2.0 License
 //  (found in the LICENSE.Apache file in the root directory).
 
-#include "rocksdb/external_table_reader.h"
+#include "rocksdb/external_table.h"
 
 #include "rocksdb/table.h"
 #include "table/internal_iterator.h"
@@ -16,7 +16,8 @@ namespace {
 
 class ExternalTableIterator : public InternalIterator {
  public:
-  explicit ExternalTableIterator(Iterator* iterator) : iterator_(iterator) {}
+  explicit ExternalTableIterator(Iterator* iterator)
+      : iterator_(iterator), valid_(false) {}
 
   // No copying allowed
   ExternalTableIterator(const ExternalTableIterator&) = delete;
@@ -24,7 +25,7 @@ class ExternalTableIterator : public InternalIterator {
 
   ~ExternalTableIterator() override {}
 
-  bool Valid() const override { return iterator_ && iterator_->Valid(); }
+  bool Valid() const override { return valid_; }
 
   void SeekToFirst() override {
     status_ = Status::OK();
@@ -94,23 +95,29 @@ class ExternalTableIterator : public InternalIterator {
     return Slice();
   }
 
-  Status status() const override {
-    return !status_.ok() ? status_
-                         : (iterator_ ? iterator_->status() : Status::OK());
-  }
+  Status status() const override { return status_; }
 
  private:
   std::unique_ptr<Iterator> iterator_;
   InternalKey key_;
+  bool valid_;
   Status status_;
 
-  void UpdateKey() { key_.Set(iterator_->key(), 0, ValueType::kTypeValue); }
+  void UpdateKey() {
+    if (iterator_) {
+      valid_ = iterator_->Valid();
+      status_ = iterator_->status();
+      if (valid_ && status_.ok()) {
+        key_.Set(iterator_->key(), 0, ValueType::kTypeValue);
+      }
+    }
+  }
 };
 
 class ExternalTableReaderAdapter : public TableReader {
  public:
   explicit ExternalTableReaderAdapter(
-      std::unique_ptr<ExternalTableReader> reader)
+      std::unique_ptr<ExternalTableReader>&& reader)
       : reader_(std::move(reader)) {}
 
   ~ExternalTableReaderAdapter() override {}
@@ -170,6 +177,62 @@ class ExternalTableReaderAdapter : public TableReader {
   std::unique_ptr<ExternalTableReader> reader_;
 };
 
+class ExternalTableBuilderAdapter : public TableBuilder {
+ public:
+  explicit ExternalTableBuilderAdapter(
+      std::unique_ptr<ExternalTableBuilder>&& builder)
+      : builder_(std::move(builder)), num_entries_(0) {}
+
+  void Add(const Slice& key, const Slice& value) override {
+    ParsedInternalKey pkey;
+    status_ = ParseInternalKey(key, &pkey, /*log_err_key=*/false);
+    if (status_.ok()) {
+      if (pkey.type != ValueType::kTypeValue) {
+        status_ = Status::NotSupported(
+            "Value type " + std::to_string(pkey.type) + "not supported");
+      } else {
+        builder_->Add(pkey.user_key, value);
+        num_entries_++;
+      }
+    }
+  }
+
+  Status status() const override {
+    if (status_.ok()) {
+      return builder_->status();
+    } else {
+      return status_;
+    }
+  }
+
+  IOStatus io_status() const override { return status_to_io_status(status()); }
+
+  Status Finish() override { return builder_->Finish(); }
+
+  void Abandon() override { builder_->Abandon(); }
+
+  uint64_t FileSize() const override { return builder_->FileSize(); }
+
+  uint64_t NumEntries() const override { return num_entries_; }
+
+  TableProperties GetTableProperties() const override {
+    return builder_->GetTableProperties();
+  }
+
+  std::string GetFileChecksum() const override {
+    return builder_->GetFileChecksum();
+  }
+
+  const char* GetFileChecksumFuncName() const override {
+    return builder_->GetFileChecksumFuncName();
+  }
+
+ private:
+  Status status_;
+  std::unique_ptr<ExternalTableBuilder> builder_;
+  uint64_t num_entries_;
+};
+
 class ExternalTableFactoryAdapter : public TableFactory {
  public:
   explicit ExternalTableFactoryAdapter(
@@ -197,8 +260,18 @@ class ExternalTableFactoryAdapter : public TableFactory {
     return Status::OK();
   }
 
-  TableBuilder* NewTableBuilder(const TableBuilderOptions&,
-                                WritableFileWriter*) const override {
+  using TableFactory::NewTableBuilder;
+  TableBuilder* NewTableBuilder(const TableBuilderOptions& topts,
+                                WritableFileWriter* file) const override {
+    std::unique_ptr<ExternalTableBuilder> builder;
+    ExternalTableBuilderOptions ext_topts(
+        topts.read_options, topts.write_options,
+        topts.moptions.prefix_extractor, topts.ioptions.user_comparator,
+        topts.column_family_name, topts.reason);
+    builder.reset(inner_->NewTableBuilder(ext_topts, file->file_name()));
+    if (builder) {
+      return new ExternalTableBuilderAdapter(std::move(builder));
+    }
     return nullptr;
   }
 
diff --git a/table/table_test.cc b/table/table_test.cc
index 7441b0ff706b..51e7ea497f9a 100644
--- a/table/table_test.cc
+++ b/table/table_test.cc
@@ -36,7 +36,7 @@
 #include "rocksdb/convenience.h"
 #include "rocksdb/db.h"
 #include "rocksdb/env.h"
-#include "rocksdb/external_table_reader.h"
+#include "rocksdb/external_table.h"
 #include "rocksdb/file_checksum.h"
 #include "rocksdb/file_system.h"
 #include "rocksdb/filter_policy.h"
@@ -6531,28 +6531,112 @@ class ExternalTableReaderTest : public DBTestBase {
       : DBTestBase("external_table_reader_test", /*env_do_fsync=*/false) {}
 
  protected:
+  class DummyExternalTableFile {
+   public:
+    explicit DummyExternalTableFile(const std::string& file_path)
+        : file_path_(file_path), file_size_(0) {
+      props_.comparator_name = BytewiseComparator()->Name();
+    }
+
+    Status Serialize(
+        const std::vector<std::pair<std::string, std::string>>& kv_vec) {
+      for (auto& kv : kv_vec) {
+        SerializeOne(kv.first, kv.second);
+        props_.raw_key_size += kv.first.length();
+        props_.raw_value_size += kv.second.length();
+      }
+      props_.num_entries = kv_vec.size();
+      file_size_ = buf_.length();
+      return WriteStringToFile(Env::Default(), buf_, file_path_);
+    }
+
+    Status Deserialize(std::map<std::string, std::string>& kv_map) {
+      Status s = ReadFileToString(Env::Default(), file_path_, &buf_);
+      if (!s.ok()) {
+        return s;
+      }
+
+      while (buf_.length() > 0) {
+        std::pair<std::string, std::string> kv;
+        s = DeserializeOne(kv);
+        if (!s.ok()) {
+          break;
+        }
+        size_t key_size = kv.first.length();
+        size_t value_size = kv.second.length();
+        kv_map.emplace(std::move(kv));
+        props_.raw_key_size += key_size;
+        props_.raw_value_size += value_size;
+      }
+      props_.num_entries = kv_map.size();
+      return s;
+    }
+
+    TableProperties GetTableProperties() const { return props_; }
+
+    uint64_t FileSize() const { return file_size_; }
+
+   private:
+    struct ItemHeader {
+      uint32_t key_size;
+      uint32_t value_size;
+    };
+
+    void SerializeOne(const Slice& key, const Slice& value) {
+      ItemHeader hdr;
+      hdr.key_size = static_cast<uint32_t>(key.size());
+      hdr.value_size = static_cast<uint32_t>(value.size());
+      buf_.append(static_cast<char*>(static_cast<void*>(&hdr)), sizeof(hdr));
+      buf_.append(key.data(), key.size());
+      buf_.append(value.data(), value.size());
+    }
+
+    Status DeserializeOne(std::pair<std::string, std::string>& kv) {
+      ItemHeader hdr;
+      size_t copied =
+          buf_.copy(static_cast<char*>(static_cast<void*>(&hdr)), sizeof(hdr));
+      if (copied < sizeof(hdr)) {
+        return Status::Corruption();
+      }
+      buf_.erase(0, sizeof(hdr));
+      if (buf_.length() < hdr.key_size + hdr.value_size) {
+        return Status::Corruption();
+      }
+      kv.first.assign(std::string_view(buf_.data(), hdr.key_size));
+      buf_.erase(0, hdr.key_size);
+      kv.second.assign(std::string_view(buf_.data(), hdr.value_size));
+      buf_.erase(0, hdr.value_size);
+      return Status::OK();
+    }
+
+    std::string file_path_;
+    std::string buf_;
+    TableProperties props_;
+    uint64_t file_size_;
+  };
+
   class DummyExternalTableIterator : public Iterator {
    public:
-    explicit DummyExternalTableIterator(bool empty) : empty_(empty) {}
+    explicit DummyExternalTableIterator(
+        const ReadOptions& ro, const std::map<std::string, std::string>& kv_map)
+        : weight_(ro.weight), kv_map_(kv_map), valid_(false) {}
 
-    bool Valid() const override { return empty_ ? !empty_ : valid_; }
+    bool Valid() const override { return valid_; }
 
     void SeekToFirst() override {
-      valid_ = true;
+      iter_ = kv_map_.begin();
+      valid_ = iter_ != kv_map_.end();
       status_ = Status::OK();
     }
 
     void SeekToLast() override {
-      valid_ = true;
-      status_ = Status::OK();
+      valid_ = false;
+      status_ = Status::NotSupported();
     }
 
     void Seek(const Slice& target) override {
-      if (target.compare(key_str) <= 0) {
-        valid_ = true;
-      } else {
-        valid_ = false;
-      }
+      iter_ = kv_map_.find(target.ToString());
+      valid_ = iter_ != kv_map_.end();
       status_ = Status::OK();
     }
 
@@ -6562,7 +6646,9 @@ class ExternalTableReaderTest : public DBTestBase {
     }
 
     void Next() override {
-      valid_ = false;
+      iter_++;
+      weight_--;
+      valid_ = iter_ != kv_map_.end() && weight_ > 0;
       // status_ is still ok. valid_ indicates end of scan
     }
 
@@ -6573,7 +6659,7 @@ class ExternalTableReaderTest : public DBTestBase {
 
     Slice key() const override {
       // If valid_ is false or status_ is non-ok, behavior is indeterminate
-      return Slice(key_str);
+      return Slice(iter_->first);
     }
 
     Status status() const override {
@@ -6583,31 +6669,36 @@ class ExternalTableReaderTest : public DBTestBase {
 
     Slice value() const override {
       // If valid_ is false or status_ is non-ok, behavior is indeterminate
-      return Slice(value_str);
+      return Slice(iter_->second);
     }
 
    private:
-    static const std::string key_str;
-    static const std::string value_str;
-
+    uint64_t weight_;
+    std::map<std::string, std::string> kv_map_;
     bool valid_ = false;
-    bool empty_;
     Status status_ = Status::OK();
+    std::map<std::string, std::string>::iterator iter_;
   };
 
   class DummyExternalTableReader : public ExternalTableReader {
    public:
+    explicit DummyExternalTableReader(const std::string& file_path)
+        : file_(file_path) {
+      Status s = file_.Deserialize(kv_map_);
+      EXPECT_OK(s);
+    }
+
     Iterator* NewIterator(const ReadOptions& read_options,
                           const SliceTransform* /*prefix_extractor*/) override {
-      return new DummyExternalTableIterator((read_options.weight == 0) ? true
-                                                                       : false);
+      return new DummyExternalTableIterator(read_options, kv_map_);
     }
 
     Status Get(const ReadOptions& /*read_options*/, const Slice& key,
                const SliceTransform* /*prefix_extractor*/,
                std::string* value) override {
-      if (!key.compare("foo")) {
-        value->assign("bar");
+      auto iter = kv_map_.find(key.ToString());
+      if (iter != kv_map_.end()) {
+        value->assign(iter->second);
         return Status::OK();
       }
       return Status::NotFound();
@@ -6635,6 +6726,43 @@ class ExternalTableReaderTest : public DBTestBase {
       props->raw_value_size = 3;
       return props;
     }
+
+   private:
+    std::map<std::string, std::string> kv_map_;
+    DummyExternalTableFile file_;
+  };
+
+  class DummyExternalTableBuilder : public ExternalTableBuilder {
+   public:
+    explicit DummyExternalTableBuilder(const std::string& file_path)
+        : file_(file_path) {}
+
+    void Add(const Slice& key, const Slice& value) override {
+      if (!kv_vec_.empty()) {
+        ASSERT_LT(BytewiseComparator()->Compare(kv_vec_.back().first, key), 0);
+      }
+      kv_vec_.emplace_back(key.ToString(), value.ToString());
+    }
+
+    Status Finish() override {
+      status_ = file_.Serialize(kv_vec_);
+      return status_;
+    }
+
+    void Abandon() override { kv_vec_.clear(); }
+
+    uint64_t FileSize() const override { return file_.FileSize(); }
+
+    TableProperties GetTableProperties() const override {
+      return file_.GetTableProperties();
+    }
+
+    Status status() const override { return status_; }
+
+   private:
+    std::vector<std::pair<std::string, std::string>> kv_vec_;
+    DummyExternalTableFile file_;
+    Status status_;
   };
 
   class DummyExternalTableFactory : public ExternalTableFactory {
@@ -6642,28 +6770,42 @@ class ExternalTableReaderTest : public DBTestBase {
     const char* Name() const override { return "DummyExternalTableFactory"; }
 
     Status NewTableReader(
-        const ReadOptions& /*read_options*/, const std::string& /*file_path*/,
+        const ReadOptions& /*read_options*/, const std::string& file_path,
         const ExternalTableOptions& /*topts*/,
-        std::unique_ptr<ExternalTableReader>* table_reader) override {
-      table_reader->reset(new DummyExternalTableReader());
+        std::unique_ptr<ExternalTableReader>* table_reader) const override {
+      table_reader->reset(new DummyExternalTableReader(file_path));
       return Status::OK();
     }
+
+    ExternalTableBuilder* NewTableBuilder(
+        const ExternalTableBuilderOptions& /*opts*/,
+        const std::string& file_path) const override {
+      return new DummyExternalTableBuilder(file_path);
+    }
   };
 };
 
-const std::string ExternalTableReaderTest::DummyExternalTableIterator::key_str =
-    "foo";
-const std::string
-    ExternalTableReaderTest::DummyExternalTableIterator::value_str = "bar";
-
 TEST_F(ExternalTableReaderTest, BasicTest) {
   std::shared_ptr<ExternalTableFactory> factory =
       std::make_shared<DummyExternalTableFactory>();
 
+  std::string file_path = test::PerThreadDBPath("external_table");
+  {
+    std::unique_ptr<ExternalTableBuilder> builder;
+    builder.reset(factory->NewTableBuilder(
+        ExternalTableBuilderOptions(ReadOptions(), WriteOptions(),
+                                    std::shared_ptr<const SliceTransform>(),
+                                    BytewiseComparator(), "default",
+                                    TableFileCreationReason::kMisc),
+        file_path));
+    builder->Add("foo", "bar");
+    ASSERT_OK(builder->Finish());
+  }
+
   std::unique_ptr<ExternalTableReader> reader;
   std::shared_ptr<SliceTransform> prefix_extractor;
   ASSERT_OK(factory->NewTableReader(
-      {}, "", ExternalTableOptions(prefix_extractor, nullptr), &reader));
+      {}, file_path, ExternalTableOptions(prefix_extractor, nullptr), &reader));
 
   ReadOptions ro;
   ro.weight = 1;
@@ -6694,14 +6836,19 @@ TEST_F(ExternalTableReaderTest, SstReaderTest) {
   std::string dbname = test::PerThreadDBPath("external_table_reader_test");
   std::string ingest_file = dbname + "test.immutabledb";
   dbname += "_db";
+  // This test doesn't work with some custom Envs, like EncryptedEnv
+  options.env = Env::Default();
 
   std::shared_ptr<ExternalTableFactory> factory =
       std::make_shared<DummyExternalTableFactory>();
   options.table_factory = NewExternalTableFactory(factory);
 
-  // Create a file
-  ASSERT_OK(WriteStringToFile(options.env, "Hello World", ingest_file,
-                              /*should_sync=*/true));
+  std::unique_ptr<SstFileWriter> writer;
+  writer.reset(new SstFileWriter(EnvOptions(), options));
+  ASSERT_OK(writer->Open(ingest_file));
+  ASSERT_OK(writer->Put("foo", "bar"));
+  ASSERT_OK(writer->Finish());
+  writer.reset();
 
   std::unique_ptr<SstFileReader> reader(new SstFileReader(options));
   ASSERT_OK(reader->Open(ingest_file));

From 8e6d4311533d371f70993dd5b7441082ae2f3cd9 Mon Sep 17 00:00:00 2001
From: Andrew Chang <andrewrchang@meta.com>
Date: Wed, 5 Mar 2025 19:07:01 -0800
Subject: [PATCH 010/500] Add IOActivityToString helper method (#13440)

Summary:
I have a place I want to use this helper method inside the Sally codebase. I have this functionality in my Sally diff right now, but I think it is generic enough to warrant putting alongside `Env::PriorityToString`.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13440

Test Plan: Just the compiler and CI checks are sufficient IMO.

Reviewed By: hx235

Differential Revision: D70664597

Pulled By: archang19

fbshipit-source-id: 341de6c6e311a3f421ad093c2c216e5caa5034dd
---
 env/env.cc            | 31 +++++++++++++++++++++++++++++++
 include/rocksdb/env.h |  2 ++
 2 files changed, 33 insertions(+)

diff --git a/env/env.cc b/env/env.cc
index 683771e72360..d392eb036a52 100644
--- a/env/env.cc
+++ b/env/env.cc
@@ -732,6 +732,37 @@ std::string Env::PriorityToString(Env::Priority priority) {
   return "Invalid";
 }
 
+std::string Env::IOActivityToString(IOActivity activity) {
+  switch (activity) {
+    case Env::IOActivity::kFlush:
+      return "Flush";
+    case Env::IOActivity::kCompaction:
+      return "Compaction";
+    case Env::IOActivity::kDBOpen:
+      return "DBOpen";
+    case Env::IOActivity::kGet:
+      return "Get";
+    case Env::IOActivity::kMultiGet:
+      return "MultiGet";
+    case Env::IOActivity::kDBIterator:
+      return "DBIterator";
+    case Env::IOActivity::kVerifyDBChecksum:
+      return "VerifyDBChecksum";
+    case Env::IOActivity::kVerifyFileChecksums:
+      return "VerifyFileChecksums";
+    case Env::IOActivity::kGetEntity:
+      return "GetEntity";
+    case Env::IOActivity::kMultiGetEntity:
+      return "MultiGetEntity";
+    case Env::IOActivity::kReadManifest:
+      return "ReadManifest";
+    case Env::IOActivity::kUnknown:
+      return "Unknown";
+  };
+  assert(false);
+  return "Invalid";
+}
+
 uint64_t Env::GetThreadID() const {
   std::hash<std::thread::id> hasher;
   return hasher(std::this_thread::get_id());
diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h
index 0d5f24b52683..dfd6789a40c6 100644
--- a/include/rocksdb/env.h
+++ b/include/rocksdb/env.h
@@ -459,6 +459,8 @@ class Env : public Customizable {
     kUnknown,  // Keep last for easy array of non-unknowns
   };
 
+  static std::string IOActivityToString(IOActivity activity);
+
   // Arrange to run "(*function)(arg)" once in a background thread, in
   // the thread pool specified by pri. By default, jobs go to the 'LOW'
   // priority thread pool.

From 68b2d941be2d273067aeb7e14ccd090d1512c426 Mon Sep 17 00:00:00 2001
From: Jay Huh <jewoongh@meta.com>
Date: Wed, 5 Mar 2025 22:15:17 -0800
Subject: [PATCH 011/500] Introduce kAborted Status (#13438)

Summary:
If compaction job needs to be aborted inside `Schedule()` or `Wait()` today (e.g. Primary host is shutting down), the only two options are the following
- Handle it as failure by returning `CompactionServiceJobStatus::kFailure`
- Return `CompactionServiceJobStatus::kUseLocal` and let the compaction move on locally and eventually succeed or fail depending on the timing

In this PR, we are introducing a new status, `CompactionServiceJobStatus::kAborted`,  so that the implementation of `Schedule()` and `Wait()` can return it. Just like how `CompactionServiceJobStatus::kFailure` is handled, compaction will not move on and fail, but the status will be returned as `Status::Aborted()` instead of `Status::Incomplete()`

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13438

Test Plan:
Unit Test added
```
 ./compaction_service_test --gtest_filter="*CompactionServiceTest.AbortedWhileWait*"
```

Reviewed By: anand1976, hx235

Differential Revision: D70655355

Pulled By: jaykorean

fbshipit-source-id: 22614ce9c7455cda649b15465625edc93978fe11
---
 db/compaction/compaction_job.cc               |  3 +-
 db/compaction/compaction_service_job.cc       | 18 ++++++++++
 db/compaction/compaction_service_test.cc      | 34 +++++++++++++++++++
 include/rocksdb/options.h                     |  1 +
 .../remote_compaction_aborted_status.md       |  1 +
 5 files changed, 55 insertions(+), 2 deletions(-)
 create mode 100644 unreleased_history/public_api_changes/remote_compaction_aborted_status.md

diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc
index 4fba2e6d9be3..3e8bbdb2c346 100644
--- a/db/compaction/compaction_job.cc
+++ b/db/compaction/compaction_job.cc
@@ -1129,8 +1129,7 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
   if (db_options_.compaction_service) {
     CompactionServiceJobStatus comp_status =
         ProcessKeyValueCompactionWithCompactionService(sub_compact);
-    if (comp_status == CompactionServiceJobStatus::kSuccess ||
-        comp_status == CompactionServiceJobStatus::kFailure) {
+    if (comp_status != CompactionServiceJobStatus::kUseLocal) {
       return;
     }
     // fallback to local compaction
diff --git a/db/compaction/compaction_service_job.cc b/db/compaction/compaction_service_job.cc
index d571dbbc0c5e..1e81b6937da4 100644
--- a/db/compaction/compaction_service_job.cc
+++ b/db/compaction/compaction_service_job.cc
@@ -83,6 +83,14 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService(
   switch (response.status) {
     case CompactionServiceJobStatus::kSuccess:
       break;
+    case CompactionServiceJobStatus::kAborted:
+      sub_compact->status =
+          Status::Aborted("Scheduling a remote compaction job was aborted");
+      ROCKS_LOG_WARN(
+          db_options_.info_log,
+          "[%s] [JOB %d] Remote compaction was aborted at Schedule()",
+          compaction->column_family_data()->GetName().c_str(), job_id_);
+      return response.status;
     case CompactionServiceJobStatus::kFailure:
       sub_compact->status = Status::Incomplete(
           "CompactionService failed to schedule a remote compaction job.");
@@ -118,6 +126,16 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService(
     return compaction_status;
   }
 
+  if (compaction_status == CompactionServiceJobStatus::kAborted) {
+    sub_compact->status =
+        Status::Aborted("Waiting a remote compaction job was aborted");
+    ROCKS_LOG_INFO(db_options_.info_log,
+                   "[%s] [JOB %d] Remote compaction was aborted during Wait()",
+                   compaction->column_family_data()->GetName().c_str(),
+                   job_id_);
+    return compaction_status;
+  }
+
   CompactionServiceResult compaction_result;
   s = CompactionServiceResult::Read(compaction_result_binary,
                                     &compaction_result);
diff --git a/db/compaction/compaction_service_test.cc b/db/compaction/compaction_service_test.cc
index 694466ce0c70..95c9bd789e10 100644
--- a/db/compaction/compaction_service_test.cc
+++ b/db/compaction/compaction_service_test.cc
@@ -1471,6 +1471,40 @@ TEST_F(CompactionServiceTest, FallbackLocalManual) {
   VerifyTestData();
 }
 
+TEST_F(CompactionServiceTest, AbortedWhileWait) {
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  ReopenWithCompactionService(&options);
+
+  GenerateTestData();
+  VerifyTestData();
+
+  auto my_cs = GetCompactionService();
+  Statistics* compactor_statistics = GetCompactorStatistics();
+  Statistics* primary_statistics = GetPrimaryStatistics();
+
+  my_cs->ResetOverride();
+  std::string start_str = Key(15);
+  std::string end_str = Key(45);
+  Slice start(start_str);
+  Slice end(end_str);
+
+  // Override Wait() result with kAborted
+  my_cs->OverrideWaitStatus(CompactionServiceJobStatus::kAborted);
+  start_str = Key(120);
+  start = start_str;
+
+  Status s = db_->CompactRange(CompactRangeOptions(), &start, nullptr);
+  ASSERT_NOK(s);
+  ASSERT_TRUE(s.IsAborted());
+  // no remote compaction is run
+  ASSERT_EQ(my_cs->GetCompactionNum(), 0);
+  // make sure the compaction statistics is not recorded any side
+  ASSERT_EQ(primary_statistics->getTickerCount(COMPACT_WRITE_BYTES), 0);
+  ASSERT_EQ(primary_statistics->getTickerCount(REMOTE_COMPACT_WRITE_BYTES), 0);
+  ASSERT_EQ(compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES), 0);
+}
+
 TEST_F(CompactionServiceTest, RemoteEventListener) {
   class RemoteEventListenerTest : public EventListener {
    public:
diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index 06e3e99681dd..f83e6a381f06 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -454,6 +454,7 @@ extern const char* kHostnameForDbHostId;
 enum class CompactionServiceJobStatus : char {
   kSuccess,
   kFailure,
+  kAborted,
   kUseLocal,
 };
 
diff --git a/unreleased_history/public_api_changes/remote_compaction_aborted_status.md b/unreleased_history/public_api_changes/remote_compaction_aborted_status.md
new file mode 100644
index 000000000000..eb36ed63ac6b
--- /dev/null
+++ b/unreleased_history/public_api_changes/remote_compaction_aborted_status.md
@@ -0,0 +1 @@
+Introduced CompactionServiceJobStatus::kAborted to allow handling aborted scenario in Schedule(), Wait() or OnInstallation() APIs in Remote Compactions.

From d033c6a84958711b33f83ca3e9f40cf1ca57775a Mon Sep 17 00:00:00 2001
From: Jay Huh <jewoongh@meta.com>
Date: Thu, 6 Mar 2025 17:26:37 -0800
Subject: [PATCH 012/500] set ignore_unknown_options when parsing options
 (#13443)

Summary:
In case the primary host has a new option added which isn't available in the remote worker yet, the remote compaction currently fails. In most cases, these new options are not relevant to the remote compaction and the worker should be able to move on by ignoring it.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13443

Test Plan: Verified internally in Meta Infra.

Reviewed By: anand1976

Differential Revision: D70744359

Pulled By: jaykorean

fbshipit-source-id: eb6a388c2358a7f8089f2e35a378b7017b9e03f3
---
 db/db_impl/db_impl_secondary.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/db/db_impl/db_impl_secondary.cc b/db/db_impl/db_impl_secondary.cc
index a9082db3b42f..b58d63e52606 100644
--- a/db/db_impl/db_impl_secondary.cc
+++ b/db/db_impl/db_impl_secondary.cc
@@ -994,6 +994,7 @@ Status DB::OpenAndCompact(
   DBOptions db_options;
   ConfigOptions config_options;
   config_options.env = override_options.env;
+  config_options.ignore_unknown_options = true;
   std::vector<ColumnFamilyDescriptor> all_column_families;
 
   TEST_SYNC_POINT_CALLBACK(

From 5d1c0a8832ce5aeb3b7c76044b079d079143dd73 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Fri, 7 Mar 2025 11:25:44 -0800
Subject: [PATCH 013/500] Reformat assertion in
 TEST_VerifyNoObsoleteFilesCached (#13446)

Summary:
... for better automatic failure grouping

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13446

Test Plan: no production code change

Reviewed By: hx235

Differential Revision: D70789464

Pulled By: pdillinger

fbshipit-source-id: 68263f6ed666349d65b5f493865973a213f35ec9
---
 db/db_impl/db_impl_debug.cc | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/db/db_impl/db_impl_debug.cc b/db/db_impl/db_impl_debug.cc
index 38873b0e3212..3f29d06341d7 100644
--- a/db/db_impl/db_impl_debug.cc
+++ b/db/db_impl/db_impl_debug.cc
@@ -379,10 +379,13 @@ void DBImpl::TEST_VerifyNoObsoleteFilesCached(
     uint64_t file_number;
     GetUnaligned(reinterpret_cast<const uint64_t*>(key.data()), &file_number);
     // Assert file is in live/quarantined set
-    if (live_and_quar_files.find(file_number) == live_and_quar_files.end()) {
+    bool cached_file_is_live_or_quar =
+        live_and_quar_files.find(file_number) != live_and_quar_files.end();
+    if (!cached_file_is_live_or_quar) {
+      // Fail with useful info
       std::cerr << "File " << file_number << " is not live nor quarantined"
                 << std::endl;
-      assert(false);
+      assert(cached_file_is_live_or_quar);
     }
   };
   table_cache_->ApplyToAllEntries(fn, {});

From b9c7481fc2424ac31a97929edc6dbad360be2646 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Fri, 7 Mar 2025 14:56:45 -0800
Subject: [PATCH 014/500] Fix some secondary/read-only DB logic (#13441)

Summary:
Primarily, fix an issue from https://github.com/facebook/rocksdb/issues/13316 with opening secondary DB with preserve/preclude option (crash test disable in https://github.com/facebook/rocksdb/issues/13439). The issue comes down to mixed-up interpretations of "read_only" which should now be resolved. I've introduced the stronger notion of "unchanging" which means the VersionSet never sees any changes to the LSM tree, and the weaker notion of "read_only" which means LSM tree changes are not written through this VersionSet/etc. but can pick up externally written changes. In particular, ManifestTailer should use read_only=true (along with unchanging=false) for proper handling of preserve/preclude options.

A new assertion in VersionSet::CreateColumnFamily to help ensure sane usage of the two boolean flags is incompatible with the known wart of allowing CreateColumnFamily on a read-only DB. So to keep that assertion, I have fixed that issue by disallowing it. And this in turn required downstream clean-up in ldb, where I cleaned up some call sites as well.

Also, rename SanitizeOptions for ColumnFamilyOptions to SanitizeCfOptions, for ease of search etc.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13441

Test Plan:
* Added preserve option to a test in db_secondary_test, which reproduced the failure seen in the crash test.
* Revert https://github.com/facebook/rocksdb/issues/13439 to re-enable crash test functionality
* Update some tests to deal with disallowing CF creation on read-only DB
* Add some testing around read-only DBs and CreateColumnFamily(ies)
* Resurrect a nearby test for read-only DB to be sure it doesn't write to the DB dir. New EnforcedReadOnlyReopen should probably be used in more places but didn't want to attempt a big migration here and now. (Suggested follow-up.)

Reviewed By: jowlyzhang

Differential Revision: D70808033

Pulled By: pdillinger

fbshipit-source-id: 486b4e9f9c9045150a0ebb9cb302753d03932a3f
---
 db/column_family.cc                           |  8 +--
 db/column_family.h                            |  6 +-
 db/column_family_test.cc                      |  8 +--
 db/db_basic_test.cc                           | 35 +++++++---
 db/db_impl/db_impl_open.cc                    |  4 +-
 db/db_impl/db_impl_readonly.h                 | 23 +++++++
 db/db_options_test.cc                         |  2 +-
 db/db_secondary_test.cc                       |  1 +
 db/db_test_util.cc                            | 12 ++++
 db/db_test_util.h                             |  4 ++
 db/repair.cc                                  | 10 +--
 db/version_edit_handler.cc                    |  2 +-
 db/version_edit_handler.h                     | 12 ++--
 db/version_set.cc                             | 10 +--
 db/version_set.h                              | 18 +++--
 db/version_set_test.cc                        | 19 +++---
 tools/db_crashtest.py                         |  3 -
 tools/ldb_cmd.cc                              | 67 ++++++++++---------
 .../behavior_changes/read_only_create_cf.md   |  1 +
 19 files changed, 161 insertions(+), 84 deletions(-)
 create mode 100644 unreleased_history/behavior_changes/read_only_create_cf.md

diff --git a/db/column_family.cc b/db/column_family.cc
index 8c6bf9c96b9c..ffb89c75408a 100644
--- a/db/column_family.cc
+++ b/db/column_family.cc
@@ -200,9 +200,9 @@ const uint64_t kDefaultTtl = 0xfffffffffffffffe;
 const uint64_t kDefaultPeriodicCompSecs = 0xfffffffffffffffe;
 }  // anonymous namespace
 
-ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options,
-                                    bool read_only,
-                                    const ColumnFamilyOptions& src) {
+ColumnFamilyOptions SanitizeCfOptions(const ImmutableDBOptions& db_options,
+                                      bool read_only,
+                                      const ColumnFamilyOptions& src) {
   ColumnFamilyOptions result = src;
   size_t clamp_max = std::conditional<
       sizeof(size_t) == 4, std::integral_constant<size_t, 0xffffffff>,
@@ -569,7 +569,7 @@ ColumnFamilyData::ColumnFamilyData(
       dropped_(false),
       flush_skip_reschedule_(false),
       internal_comparator_(cf_options.comparator),
-      initial_cf_options_(SanitizeOptions(db_options, read_only, cf_options)),
+      initial_cf_options_(SanitizeCfOptions(db_options, read_only, cf_options)),
       ioptions_(db_options, initial_cf_options_),
       mutable_cf_options_(initial_cf_options_),
       is_delete_range_supported_(
diff --git a/db/column_family.h b/db/column_family.h
index 5e18c90a1bad..71401834ba80 100644
--- a/db/column_family.h
+++ b/db/column_family.h
@@ -281,9 +281,9 @@ Status CheckConcurrentWritesSupported(const ColumnFamilyOptions& cf_options);
 Status CheckCFPathsSupported(const DBOptions& db_options,
                              const ColumnFamilyOptions& cf_options);
 
-ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options,
-                                    bool read_only,
-                                    const ColumnFamilyOptions& src);
+ColumnFamilyOptions SanitizeCfOptions(const ImmutableDBOptions& db_options,
+                                      bool read_only,
+                                      const ColumnFamilyOptions& src);
 // Wrap user defined table properties collector factories `from cf_options`
 // into internal ones in internal_tbl_prop_coll_factories. Add a system internal
 // one too.
diff --git a/db/column_family_test.cc b/db/column_family_test.cc
index 224257df4940..d84799b57c42 100644
--- a/db/column_family_test.cc
+++ b/db/column_family_test.cc
@@ -271,8 +271,8 @@ class ColumnFamilyTestBase : public testing::Test {
       // them.
       ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(
           ConfigOptions(), desc.options,
-          SanitizeOptions(dbfull()->immutable_db_options(), /*read_only*/ false,
-                          current_cf_opt)));
+          SanitizeCfOptions(dbfull()->immutable_db_options(),
+                            /*read_only*/ false, current_cf_opt)));
       cfi++;
     }
   }
@@ -2233,7 +2233,7 @@ TEST_P(ColumnFamilyTest, CreateMissingColumnFamilies) {
   ASSERT_EQ(my_fs->options_files_created.load(), 2);
 }
 
-TEST_P(ColumnFamilyTest, SanitizeOptions) {
+TEST_P(ColumnFamilyTest, SanitizeCfOptions) {
   DBOptions db_options;
   for (int s = kCompactionStyleLevel; s <= kCompactionStyleUniversal; ++s) {
     for (int l = 0; l <= 2; l++) {
@@ -2249,7 +2249,7 @@ TEST_P(ColumnFamilyTest, SanitizeOptions) {
             original.write_buffer_size =
                 l * 4 * 1024 * 1024 + i * 1024 * 1024 + j * 1024 + k;
 
-            ColumnFamilyOptions result = SanitizeOptions(
+            ColumnFamilyOptions result = SanitizeCfOptions(
                 ImmutableDBOptions(db_options), /*read_only*/ false, original);
             ASSERT_TRUE(result.level0_stop_writes_trigger >=
                         result.level0_slowdown_writes_trigger);
diff --git a/db/db_basic_test.cc b/db/db_basic_test.cc
index 5ab97a7340dd..08b6486df965 100644
--- a/db/db_basic_test.cc
+++ b/db/db_basic_test.cc
@@ -161,6 +161,7 @@ TEST_F(DBBasicTest, UniqueSession) {
 
   ASSERT_EQ(sid2, sid3);
 
+  DestroyAndReopen(options);
   CreateAndReopenWithCF({"goku"}, options);
   ASSERT_OK(db_->GetDbSessionId(sid1));
   ASSERT_OK(Put("bar", "e1"));
@@ -179,6 +180,7 @@ TEST_F(DBBasicTest, UniqueSession) {
 TEST_F(DBBasicTest, ReadOnlyDB) {
   ASSERT_OK(Put("foo", "v1"));
   ASSERT_OK(Put("bar", "v2"));
+  ASSERT_OK(Flush());
   ASSERT_OK(Put("foo", "v3"));
   Close();
 
@@ -208,10 +210,11 @@ TEST_F(DBBasicTest, ReadOnlyDB) {
 
   auto options = CurrentOptions();
   assert(options.env == env_);
-  ASSERT_OK(ReadOnlyReopen(options));
+  ASSERT_OK(EnforcedReadOnlyReopen(options));
   ASSERT_EQ("v3", Get("foo"));
   ASSERT_EQ("v2", Get("bar"));
   verify_all_iters();
+  ASSERT_EQ(Flush().code(), Status::Code::kNotSupported);
   Close();
 
   // Reopen and flush memtable.
@@ -219,26 +222,38 @@ TEST_F(DBBasicTest, ReadOnlyDB) {
   ASSERT_OK(Flush());
   Close();
   // Now check keys in read only mode.
-  ASSERT_OK(ReadOnlyReopen(options));
+  ASSERT_OK(EnforcedReadOnlyReopen(options));
   ASSERT_EQ("v3", Get("foo"));
   ASSERT_EQ("v2", Get("bar"));
   verify_all_iters();
-  ASSERT_TRUE(db_->SyncWAL().IsNotSupported());
+  ASSERT_EQ(db_->SyncWAL().code(), Status::Code::kNotSupported);
+
+  // More ops that should fail
+  std::vector<ColumnFamilyHandle*> cfhs{{}};
+  ASSERT_EQ(db_->CreateColumnFamily(options, "blah", &cfhs[0]).code(),
+            Status::Code::kNotSupported);
+
+  ASSERT_EQ(db_->CreateColumnFamilies(options, {"blah"}, &cfhs).code(),
+            Status::Code::kNotSupported);
+
+  std::vector<ColumnFamilyDescriptor> cfds;
+  cfds.push_back({"blah", options});
+  ASSERT_EQ(db_->CreateColumnFamilies(cfds, &cfhs).code(),
+            Status::Code::kNotSupported);
 }
 
-// TODO akanksha: Update the test to check that combination
-// does not actually write to FS (use open read-only with
-// CompositeEnvWrapper+ReadOnlyFileSystem).
-TEST_F(DBBasicTest, DISABLED_ReadOnlyDBWithWriteDBIdToManifestSet) {
+TEST_F(DBBasicTest, ReadOnlyDBWithWriteDBIdToManifestSet) {
+  auto options = CurrentOptions();
+  options.write_dbid_to_manifest = false;
+  DestroyAndReopen(options);
   ASSERT_OK(Put("foo", "v1"));
   ASSERT_OK(Put("bar", "v2"));
   ASSERT_OK(Put("foo", "v3"));
   Close();
 
-  auto options = CurrentOptions();
   options.write_dbid_to_manifest = true;
   assert(options.env == env_);
-  ASSERT_OK(ReadOnlyReopen(options));
+  ASSERT_OK(EnforcedReadOnlyReopen(options));
   std::string db_id1;
   ASSERT_OK(db_->GetDbIdentity(db_id1));
   ASSERT_EQ("v3", Get("foo"));
@@ -258,7 +273,7 @@ TEST_F(DBBasicTest, DISABLED_ReadOnlyDBWithWriteDBIdToManifestSet) {
   ASSERT_OK(Flush());
   Close();
   // Now check keys in read only mode.
-  ASSERT_OK(ReadOnlyReopen(options));
+  ASSERT_OK(EnforcedReadOnlyReopen(options));
   ASSERT_EQ("v3", Get("foo"));
   ASSERT_EQ("v2", Get("bar"));
   ASSERT_TRUE(db_->SyncWAL().IsNotSupported());
diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc
index 22b1cfd7c710..549e574f7f9b 100644
--- a/db/db_impl/db_impl_open.cc
+++ b/db/db_impl/db_impl_open.cc
@@ -35,8 +35,8 @@ Options SanitizeOptions(const std::string& dbname, const Options& src,
   auto db_options =
       SanitizeOptions(dbname, DBOptions(src), read_only, logger_creation_s);
   ImmutableDBOptions immutable_db_options(db_options);
-  auto cf_options = SanitizeOptions(immutable_db_options, read_only,
-                                    ColumnFamilyOptions(src));
+  auto cf_options = SanitizeCfOptions(immutable_db_options, read_only,
+                                      ColumnFamilyOptions(src));
   return Options(db_options, cf_options);
 }
 
diff --git a/db/db_impl/db_impl_readonly.h b/db/db_impl/db_impl_readonly.h
index 9566f547bfeb..3edfeb0e5508 100644
--- a/db/db_impl/db_impl_readonly.h
+++ b/db/db_impl/db_impl_readonly.h
@@ -155,6 +155,29 @@ class DBImplReadOnly : public DBImpl {
     return Status::NotSupported("Not supported operation in read only mode.");
   }
 
+  using DB::CreateColumnFamily;
+  using DBImpl::CreateColumnFamily;
+  Status CreateColumnFamily(const ColumnFamilyOptions& /*cf_options*/,
+                            const std::string& /*column_family*/,
+                            ColumnFamilyHandle** /*handle*/) override {
+    return Status::NotSupported("Not supported operation in read only mode.");
+  }
+
+  using DB::CreateColumnFamilies;
+  using DBImpl::CreateColumnFamilies;
+  Status CreateColumnFamilies(
+      const ColumnFamilyOptions& /*cf_options*/,
+      const std::vector<std::string>& /*column_family_names*/,
+      std::vector<ColumnFamilyHandle*>* /*handles*/) override {
+    return Status::NotSupported("Not supported operation in read only mode.");
+  }
+
+  Status CreateColumnFamilies(
+      const std::vector<ColumnFamilyDescriptor>& /*column_families*/,
+      std::vector<ColumnFamilyHandle*>* /*handles*/) override {
+    return Status::NotSupported("Not supported operation in read only mode.");
+  }
+
   // FIXME: some missing overrides for more "write" functions
 
  protected:
diff --git a/db/db_options_test.cc b/db/db_options_test.cc
index d619e8604e55..df0d4ca3c795 100644
--- a/db/db_options_test.cc
+++ b/db/db_options_test.cc
@@ -71,7 +71,7 @@ class DBOptionsTest : public DBTestBase {
     ImmutableDBOptions db_options(options);
     test::RandomInitCFOptions(&options, options, rnd);
     auto sanitized_options =
-        SanitizeOptions(db_options, /*read_only*/ false, options);
+        SanitizeCfOptions(db_options, /*read_only*/ false, options);
     auto opt_map = GetMutableCFOptionsMap(sanitized_options);
     delete options.compaction_filter;
     return opt_map;
diff --git a/db/db_secondary_test.cc b/db/db_secondary_test.cc
index 060ce8644087..5be4feecf74c 100644
--- a/db/db_secondary_test.cc
+++ b/db/db_secondary_test.cc
@@ -160,6 +160,7 @@ TEST_F(DBSecondaryTest, NonExistingDb) {
 TEST_F(DBSecondaryTest, ReopenAsSecondary) {
   Options options;
   options.env = env_;
+  options.preserve_internal_time_seconds = 300;
   Reopen(options);
   ASSERT_OK(Put("foo", "foo_value"));
   ASSERT_OK(Put("bar", "bar_value"));
diff --git a/db/db_test_util.cc b/db/db_test_util.cc
index 3944e92a0dc0..64a85bc41032 100644
--- a/db/db_test_util.cc
+++ b/db/db_test_util.cc
@@ -11,6 +11,7 @@
 
 #include "cache/cache_reservation_manager.h"
 #include "db/forward_iterator.h"
+#include "env/fs_readonly.h"
 #include "env/mock_env.h"
 #include "port/lang.h"
 #include "rocksdb/cache.h"
@@ -716,6 +717,17 @@ Status DBTestBase::ReadOnlyReopen(const Options& options) {
   return DB::OpenForReadOnly(options, dbname_, &db_);
 }
 
+Status DBTestBase::EnforcedReadOnlyReopen(const Options& options) {
+  Close();
+  Options options_copy = options;
+  MaybeInstallTimeElapseOnlySleep(options_copy);
+  auto fs_read_only =
+      std::make_shared<ReadOnlyFileSystem>(env_->GetFileSystem());
+  env_read_only_ = std::make_shared<CompositeEnvWrapper>(env_, fs_read_only);
+  options_copy.env = env_read_only_.get();
+  return DB::OpenForReadOnly(options_copy, dbname_, &db_);
+}
+
 Status DBTestBase::TryReopen(const Options& options) {
   Close();
   last_options_.table_factory.reset();
diff --git a/db/db_test_util.h b/db/db_test_util.h
index 1ddb4faef169..4a00ea4371b8 100644
--- a/db/db_test_util.h
+++ b/db/db_test_util.h
@@ -1062,6 +1062,7 @@ class DBTestBase : public testing::Test {
   MockEnv* mem_env_;
   Env* encrypted_env_;
   SpecialEnv* env_;
+  std::shared_ptr<Env> env_read_only_;
   std::shared_ptr<Env> env_guard_;
   DB* db_;
   std::vector<ColumnFamilyHandle*> handles_;
@@ -1178,6 +1179,9 @@ class DBTestBase : public testing::Test {
 
   Status ReadOnlyReopen(const Options& options);
 
+  // With a filesystem wrapper that fails on attempted write
+  Status EnforcedReadOnlyReopen(const Options& options);
+
   Status TryReopen(const Options& options);
 
   bool IsDirectIOSupported();
diff --git a/db/repair.cc b/db/repair.cc
index 39189402936d..0c108a601659 100644
--- a/db/repair.cc
+++ b/db/repair.cc
@@ -100,13 +100,15 @@ class Repairer {
         db_options_(SanitizeOptions(dbname_, db_options)),
         immutable_db_options_(ImmutableDBOptions(db_options_)),
         icmp_(default_cf_opts.comparator),
-        default_cf_opts_(SanitizeOptions(immutable_db_options_,
-                                         /*read_only*/ false, default_cf_opts)),
+        default_cf_opts_(SanitizeCfOptions(immutable_db_options_,
+                                           /*read_only*/ false,
+                                           default_cf_opts)),
         default_iopts_(
             ImmutableOptions(immutable_db_options_, default_cf_opts_)),
         default_mopts_(MutableCFOptions(default_cf_opts_)),
-        unknown_cf_opts_(SanitizeOptions(immutable_db_options_,
-                                         /*read_only*/ false, unknown_cf_opts)),
+        unknown_cf_opts_(SanitizeCfOptions(immutable_db_options_,
+                                           /*read_only*/ false,
+                                           unknown_cf_opts)),
         create_unknown_cfs_(create_unknown_cfs),
         raw_table_cache_(
             // TableCache can be small since we expect each table to be opened
diff --git a/db/version_edit_handler.cc b/db/version_edit_handler.cc
index d1b5ee68cedb..e60644e2714b 100644
--- a/db/version_edit_handler.cc
+++ b/db/version_edit_handler.cc
@@ -408,7 +408,7 @@ void VersionEditHandler::CheckIterationResult(const log::Reader& reader,
       if (cfd->IsDropped()) {
         continue;
       }
-      if (read_only_) {
+      if (version_set_->unchanging()) {
         cfd->table_cache()->SetTablesAreImmortal();
       }
       *s = LoadTables(cfd, /*prefetch_index_and_filter_in_cache=*/false,
diff --git a/db/version_edit_handler.h b/db/version_edit_handler.h
index f3637ae73075..0cef558826b8 100644
--- a/db/version_edit_handler.h
+++ b/db/version_edit_handler.h
@@ -198,7 +198,9 @@ class VersionEditHandler : public VersionEditHandlerBase {
                             bool prefetch_index_and_filter_in_cache,
                             bool is_initial_load);
 
-  virtual bool MustOpenAllColumnFamilies() const { return !read_only_; }
+  virtual bool MustOpenAllColumnFamilies() const {
+    return !version_set_->unchanging();
+  }
 
   const bool read_only_;
   std::vector<ColumnFamilyDescriptor> column_families_;
@@ -334,10 +336,10 @@ class ManifestTailer : public VersionEditHandlerPointInTime {
                           const ReadOptions& read_options,
                           EpochNumberRequirement epoch_number_requirement =
                               EpochNumberRequirement::kMustPresent)
-      : VersionEditHandlerPointInTime(/*read_only=*/false, column_families,
-                                      version_set, io_tracer, read_options,
-                                      /*allow_incomplete_valid_version=*/false,
-                                      epoch_number_requirement),
+      : VersionEditHandlerPointInTime(
+            /*read_only=*/true, column_families, version_set, io_tracer,
+            read_options,
+            /*allow_incomplete_valid_version=*/false, epoch_number_requirement),
         mode_(Mode::kRecovery) {}
 
   Status VerifyFile(ColumnFamilyData* cfd, const std::string& fpath, int level,
diff --git a/db/version_set.cc b/db/version_set.cc
index da1ad3ea8772..7e9893a93c0e 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -5114,7 +5114,7 @@ VersionSet::VersionSet(
     BlockCacheTracer* const block_cache_tracer,
     const std::shared_ptr<IOTracer>& io_tracer, const std::string& db_id,
     const std::string& db_session_id, const std::string& daily_offpeak_time_utc,
-    ErrorHandler* const error_handler, const bool read_only)
+    ErrorHandler* error_handler, bool unchanging)
     : column_family_set_(new ColumnFamilySet(
           dbname, _db_options, storage_options, table_cache,
           write_buffer_manager, write_controller, block_cache_tracer, io_tracer,
@@ -5143,12 +5143,12 @@ VersionSet::VersionSet(
       db_session_id_(db_session_id),
       offpeak_time_option_(OffpeakTimeOption(daily_offpeak_time_utc)),
       error_handler_(error_handler),
-      read_only_(read_only),
+      unchanging_(unchanging),
       closed_(false) {}
 
 Status VersionSet::Close(FSDirectory* db_dir, InstrumentedMutex* mu) {
   Status s;
-  if (closed_ || read_only_ || !manifest_file_number_ || !descriptor_log_) {
+  if (closed_ || unchanging_ || !manifest_file_number_ || !descriptor_log_) {
     return s;
   }
 
@@ -7297,6 +7297,8 @@ ColumnFamilyData* VersionSet::CreateColumnFamily(
     const ColumnFamilyOptions& cf_options, const ReadOptions& read_options,
     const VersionEdit* edit, bool read_only) {
   assert(edit->IsColumnFamilyAdd());
+  // Unchanging LSM tree implies no writes to the CF
+  assert(!unchanging_ || read_only);
 
   MutableCFOptions dummy_cf_options;
   Version* dummy_versions =
@@ -7430,7 +7432,7 @@ ReactiveVersionSet::ReactiveVersionSet(
                  write_buffer_manager, write_controller,
                  /*block_cache_tracer=*/nullptr, io_tracer, /*db_id*/ "",
                  /*db_session_id*/ "", /*daily_offpeak_time_utc*/ "",
-                 /*error_handler=*/nullptr, /*read_only=*/true) {}
+                 /*error_handler=*/nullptr, /*unchanging=*/false) {}
 
 ReactiveVersionSet::~ReactiveVersionSet() = default;
 
diff --git a/db/version_set.h b/db/version_set.h
index d9cc5a8e07ee..6d6ee5c4864b 100644
--- a/db/version_set.h
+++ b/db/version_set.h
@@ -1174,6 +1174,9 @@ class AtomicGroupReadBuffer {
 // VersionSet is the collection of versions of all the column families of the
 // database. Each database owns one VersionSet. A VersionSet has access to all
 // column families via ColumnFamilySet, i.e. set of the column families.
+// `unchanging` means the LSM tree structure of the column families will not
+// change during the lifetime of this VersionSet (true for read-only instance,
+// but false for secondary instance or writable DB).
 class VersionSet {
  public:
   VersionSet(const std::string& dbname, const ImmutableDBOptions* db_options,
@@ -1184,7 +1187,7 @@ class VersionSet {
              const std::shared_ptr<IOTracer>& io_tracer,
              const std::string& db_id, const std::string& db_session_id,
              const std::string& daily_offpeak_time_utc,
-             ErrorHandler* const error_handler, const bool read_only);
+             ErrorHandler* error_handler, bool unchanging);
   // No copying allowed
   VersionSet(const VersionSet&) = delete;
   void operator=(const VersionSet&) = delete;
@@ -1263,8 +1266,11 @@ class VersionSet {
   void WakeUpWaitingManifestWriters();
 
   // Recover the last saved descriptor (MANIFEST) from persistent storage.
-  // If read_only == true, Recover() will not complain if some column families
-  // are not opened
+  // Unlike `unchanging` on the VersionSet, `read_only` here and in other
+  // functions below refers to the CF receiving no writes or modifications
+  // through this VersionSet, but could through external manifest updates
+  // etc. Thus, `read_only=true` for secondary instances as well as read-only
+  // instances.
   Status Recover(const std::vector<ColumnFamilyDescriptor>& column_families,
                  bool read_only = false, std::string* db_id = nullptr,
                  bool no_error_if_files_missing = false, bool is_retry = false,
@@ -1342,6 +1348,8 @@ class VersionSet {
     return min_log_number_to_keep_.load();
   }
 
+  bool unchanging() const { return unchanging_; }
+
   // Allocate and return a new file number
   uint64_t NewFileNumber() { return next_file_number_.fetch_add(1); }
 
@@ -1573,6 +1581,8 @@ class VersionSet {
     AppendVersion(cfd, version);
   }
 
+  bool& TEST_unchanging() { return const_cast<bool&>(unchanging_); }
+
  protected:
   struct ManifestWriter;
 
@@ -1722,7 +1732,7 @@ class VersionSet {
                            VersionEdit* edit, SequenceNumber* max_last_sequence,
                            InstrumentedMutex* mu);
 
-  const bool read_only_;
+  const bool unchanging_;
   bool closed_;
 };
 
diff --git a/db/version_set_test.cc b/db/version_set_test.cc
index c249fa6dafad..65cee38de10d 100644
--- a/db/version_set_test.cc
+++ b/db/version_set_test.cc
@@ -26,6 +26,7 @@
 #include "test_util/mock_time_env.h"
 #include "test_util/testharness.h"
 #include "test_util/testutil.h"
+#include "util/defer.h"
 #include "util/string_util.h"
 
 namespace ROCKSDB_NAMESPACE {
@@ -1905,7 +1906,7 @@ TEST_F(VersionSetTest, WalAddition) {
         &write_buffer_manager_, &write_controller_,
         /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
         /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"",
-        /*error_handler=*/nullptr, /*read_only=*/false));
+        /*error_handler=*/nullptr, /*unchanging=*/false));
     ASSERT_OK(new_versions->Recover(column_families_, /*read_only=*/false));
     const auto& wals = new_versions->GetWalSet().GetWals();
     ASSERT_EQ(wals.size(), 1);
@@ -1973,7 +1974,7 @@ TEST_F(VersionSetTest, WalCloseWithoutSync) {
         &write_buffer_manager_, &write_controller_,
         /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
         /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"",
-        /*error_handler=*/nullptr, /*read_only=*/false));
+        /*error_handler=*/nullptr, /*unchanging=*/false));
     ASSERT_OK(new_versions->Recover(column_families_, false));
     const auto& wals = new_versions->GetWalSet().GetWals();
     ASSERT_EQ(wals.size(), 2);
@@ -2027,7 +2028,7 @@ TEST_F(VersionSetTest, WalDeletion) {
         &write_buffer_manager_, &write_controller_,
         /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
         /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"",
-        /*error_handler=*/nullptr, /*read_only=*/false));
+        /*error_handler=*/nullptr, /*unchanging=*/false));
     ASSERT_OK(new_versions->Recover(column_families_, false));
     const auto& wals = new_versions->GetWalSet().GetWals();
     ASSERT_EQ(wals.size(), 1);
@@ -2066,7 +2067,7 @@ TEST_F(VersionSetTest, WalDeletion) {
         &write_buffer_manager_, &write_controller_,
         /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
         /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"",
-        /*error_handler=*/nullptr, /*read_only=*/false));
+        /*error_handler=*/nullptr, /*unchanging=*/false));
     ASSERT_OK(new_versions->Recover(column_families_, false));
     const auto& wals = new_versions->GetWalSet().GetWals();
     ASSERT_EQ(wals.size(), 1);
@@ -2187,7 +2188,7 @@ TEST_F(VersionSetTest, DeleteWalsBeforeNonExistingWalNumber) {
         &write_buffer_manager_, &write_controller_,
         /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
         /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"",
-        /*error_handler=*/nullptr, /*read_only=*/false));
+        /*error_handler=*/nullptr, /*unchanging=*/false));
     ASSERT_OK(new_versions->Recover(column_families_, false));
     const auto& wals = new_versions->GetWalSet().GetWals();
     ASSERT_EQ(wals.size(), 1);
@@ -2224,7 +2225,7 @@ TEST_F(VersionSetTest, DeleteAllWals) {
         &write_buffer_manager_, &write_controller_,
         /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
         /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"",
-        /*error_handler=*/nullptr, /*read_only=*/false));
+        /*error_handler=*/nullptr, /*unchanging=*/false));
     ASSERT_OK(new_versions->Recover(column_families_, false));
     const auto& wals = new_versions->GetWalSet().GetWals();
     ASSERT_EQ(wals.size(), 0);
@@ -2267,7 +2268,7 @@ TEST_F(VersionSetTest, AtomicGroupWithWalEdits) {
         &write_buffer_manager_, &write_controller_,
         /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
         /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"",
-        /*error_handler=*/nullptr, /*read_only=*/false));
+        /*error_handler=*/nullptr, /*unchanging=*/false));
     std::string db_id;
     ASSERT_OK(
         new_versions->Recover(column_families_, /*read_only=*/false, &db_id));
@@ -2447,7 +2448,7 @@ class VersionSetWithTimestampTest : public VersionSetTest {
         &write_buffer_manager_, &write_controller_,
         /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
         /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"",
-        /*error_handler=*/nullptr, /*read_only=*/false));
+        /*error_handler=*/nullptr, /*unchanging=*/false));
     ASSERT_OK(vset->Recover(column_families_, /*read_only=*/false,
                             /*db_id=*/nullptr));
     for (auto* cfd : *(vset->GetColumnFamilySet())) {
@@ -3749,6 +3750,8 @@ TEST_P(VersionSetTestEmptyDb, OpenCompleteManifest) {
   }
   std::string db_id;
   bool has_missing_table_file = false;
+  SaveAndRestore<bool> override_unchanging(&versions_->TEST_unchanging(),
+                                           read_only);
   s = versions_->TryRecoverFromOneManifest(manifest_path, column_families,
                                            read_only, &db_id,
                                            &has_missing_table_file);
diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index 6503260658ef..831de21fd9d3 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -1030,9 +1030,6 @@ def finalize_and_sanitize(src_params):
     # Continuous verification fails with secondaries inside NonBatchedOpsStressTest
     if dest_params.get("test_secondary") == 1:
         dest_params["continuous_verification_interval"] = 0
-        # FIXME: temporarily broken combination
-        dest_params["preserve_internal_time_seconds"] = 0
-        dest_params["preclude_last_level_data_seconds"] = 0
     return dest_params
 
 
diff --git a/tools/ldb_cmd.cc b/tools/ldb_cmd.cc
index 16a47ab5b0ac..3b8a29337382 100644
--- a/tools/ldb_cmd.cc
+++ b/tools/ldb_cmd.cc
@@ -1421,7 +1421,7 @@ CompactorCommand::CompactorCommand(
     const std::vector<std::string>& /*params*/,
     const std::map<std::string, std::string>& options,
     const std::vector<std::string>& flags)
-    : LDBCommand(options, flags, false,
+    : LDBCommand(options, flags, false /* is_read_only */,
                  BuildCmdLineOptions({ARG_FROM, ARG_TO, ARG_HEX, ARG_KEY_HEX,
                                       ARG_VALUE_HEX, ARG_TTL})),
       null_from_(true),
@@ -1496,7 +1496,7 @@ DBLoaderCommand::DBLoaderCommand(
     const std::map<std::string, std::string>& options,
     const std::vector<std::string>& flags)
     : LDBCommand(
-          options, flags, false,
+          options, flags, false /* is_read_only */,
           BuildCmdLineOptions({ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX, ARG_FROM,
                                ARG_TO, ARG_CREATE_IF_MISSING, ARG_DISABLE_WAL,
                                ARG_BULK_LOAD, ARG_COMPACT})),
@@ -1628,7 +1628,7 @@ ManifestDumpCommand::ManifestDumpCommand(
     const std::map<std::string, std::string>& options,
     const std::vector<std::string>& flags)
     : LDBCommand(
-          options, flags, false,
+          options, flags, true /* is_read_only */,
           BuildCmdLineOptions({ARG_VERBOSE, ARG_PATH, ARG_HEX, ARG_JSON})),
       verbose_(false),
       json_(false) {
@@ -1776,7 +1776,7 @@ FileChecksumDumpCommand::FileChecksumDumpCommand(
     const std::vector<std::string>& /*params*/,
     const std::map<std::string, std::string>& options,
     const std::vector<std::string>& flags)
-    : LDBCommand(options, flags, false,
+    : LDBCommand(options, flags, true /* is_read_only */,
                  BuildCmdLineOptions({ARG_PATH, ARG_HEX})) {
   auto itr = options.find(ARG_PATH);
   if (itr != options.end()) {
@@ -1840,7 +1840,8 @@ GetPropertyCommand::GetPropertyCommand(
     const std::vector<std::string>& params,
     const std::map<std::string, std::string>& options,
     const std::vector<std::string>& flags)
-    : LDBCommand(options, flags, true, BuildCmdLineOptions({})) {
+    : LDBCommand(options, flags, true /* is_read_only */,
+                 BuildCmdLineOptions({})) {
   if (params.size() != 1) {
     exec_state_ =
         LDBCommandExecuteResult::Failed("property name must be specified");
@@ -1891,7 +1892,8 @@ ListColumnFamiliesCommand::ListColumnFamiliesCommand(
     const std::vector<std::string>& /*params*/,
     const std::map<std::string, std::string>& options,
     const std::vector<std::string>& flags)
-    : LDBCommand(options, flags, false, BuildCmdLineOptions({})) {}
+    : LDBCommand(options, flags, true /* is_read_only */,
+                 BuildCmdLineOptions({})) {}
 
 void ListColumnFamiliesCommand::DoCommand() {
   PrepareOptions();
@@ -1925,7 +1927,7 @@ CreateColumnFamilyCommand::CreateColumnFamilyCommand(
     const std::vector<std::string>& params,
     const std::map<std::string, std::string>& options,
     const std::vector<std::string>& flags)
-    : LDBCommand(options, flags, true, {ARG_DB}) {
+    : LDBCommand(options, flags, false /* is_read_only */, {ARG_DB}) {
   if (params.size() != 1) {
     exec_state_ = LDBCommandExecuteResult::Failed(
         "new column family name must be specified");
@@ -1962,7 +1964,7 @@ DropColumnFamilyCommand::DropColumnFamilyCommand(
     const std::vector<std::string>& params,
     const std::map<std::string, std::string>& options,
     const std::vector<std::string>& flags)
-    : LDBCommand(options, flags, true, {ARG_DB}) {
+    : LDBCommand(options, flags, false /* is_read_only */, {ARG_DB}) {
   if (params.size() != 1) {
     exec_state_ = LDBCommandExecuteResult::Failed(
         "The name of column family to drop must be specified");
@@ -2038,7 +2040,7 @@ InternalDumpCommand::InternalDumpCommand(
     const std::vector<std::string>& /*params*/,
     const std::map<std::string, std::string>& options,
     const std::vector<std::string>& flags)
-    : LDBCommand(options, flags, true,
+    : LDBCommand(options, flags, true /* is_read_only */,
                  BuildCmdLineOptions(
                      {ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX, ARG_FROM, ARG_TO,
                       ARG_MAX_KEYS, ARG_COUNT_ONLY, ARG_COUNT_DELIM, ARG_STATS,
@@ -2219,7 +2221,7 @@ DBDumperCommand::DBDumperCommand(
     const std::map<std::string, std::string>& options,
     const std::vector<std::string>& flags)
     : LDBCommand(
-          options, flags, true,
+          options, flags, true /* is_read_only */,
           BuildCmdLineOptions(
               {ARG_TTL, ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX, ARG_FROM, ARG_TO,
                ARG_MAX_KEYS, ARG_COUNT_ONLY, ARG_COUNT_DELIM, ARG_STATS,
@@ -2539,7 +2541,7 @@ ReduceDBLevelsCommand::ReduceDBLevelsCommand(
     const std::vector<std::string>& /*params*/,
     const std::map<std::string, std::string>& options,
     const std::vector<std::string>& flags)
-    : LDBCommand(options, flags, false,
+    : LDBCommand(options, flags, false /* is_read_only */,
                  BuildCmdLineOptions({ARG_NEW_LEVELS, ARG_PRINT_OLD_LEVELS})),
       old_levels_(1 << 7),
       new_levels_(-1),
@@ -2596,7 +2598,7 @@ Status ReduceDBLevelsCommand::GetOldNumOfLevels(Options& opt, int* levels) {
                       /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
                       /*db_id=*/"", /*db_session_id=*/"",
                       opt.daily_offpeak_time_utc,
-                      /*error_handler=*/nullptr, /*read_only=*/true);
+                      /*error_handler=*/nullptr, /*unchanging=*/false);
   std::vector<ColumnFamilyDescriptor> dummy;
   ColumnFamilyDescriptor dummy_descriptor(kDefaultColumnFamilyName,
                                           ColumnFamilyOptions(opt));
@@ -2678,7 +2680,7 @@ ChangeCompactionStyleCommand::ChangeCompactionStyleCommand(
     const std::vector<std::string>& /*params*/,
     const std::map<std::string, std::string>& options,
     const std::vector<std::string>& flags)
-    : LDBCommand(options, flags, false,
+    : LDBCommand(options, flags, false /* is_read_only */,
                  BuildCmdLineOptions(
                      {ARG_OLD_COMPACTION_STYLE, ARG_NEW_COMPACTION_STYLE})),
       old_compaction_style_(-1),
@@ -3224,7 +3226,7 @@ WALDumperCommand::WALDumperCommand(
     const std::vector<std::string>& /*params*/,
     const std::map<std::string, std::string>& options,
     const std::vector<std::string>& flags)
-    : LDBCommand(options, flags, true,
+    : LDBCommand(options, flags, true /* is_read_only */,
                  BuildCmdLineOptions({ARG_WAL_FILE, ARG_DB, ARG_WRITE_COMMITTED,
                                       ARG_PRINT_HEADER, ARG_PRINT_VALUE,
                                       ARG_ONLY_PRINT_SEQNO_GAPS})),
@@ -3280,7 +3282,7 @@ void WALDumperCommand::DoCommand() {
 GetCommand::GetCommand(const std::vector<std::string>& params,
                        const std::map<std::string, std::string>& options,
                        const std::vector<std::string>& flags)
-    : LDBCommand(options, flags, true,
+    : LDBCommand(options, flags, true /* is_read_only */,
                  BuildCmdLineOptions({ARG_TTL, ARG_HEX, ARG_KEY_HEX,
                                       ARG_VALUE_HEX, ARG_READ_TIMESTAMP})) {
   if (params.size() != 1) {
@@ -3339,7 +3341,7 @@ MultiGetCommand::MultiGetCommand(
     const std::vector<std::string>& params,
     const std::map<std::string, std::string>& options,
     const std::vector<std::string>& flags)
-    : LDBCommand(options, flags, true,
+    : LDBCommand(options, flags, true /* is_read_only */,
                  BuildCmdLineOptions({ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX,
                                       ARG_READ_TIMESTAMP})) {
   if (params.size() < 1) {
@@ -3414,7 +3416,7 @@ GetEntityCommand::GetEntityCommand(
     const std::vector<std::string>& params,
     const std::map<std::string, std::string>& options,
     const std::vector<std::string>& flags)
-    : LDBCommand(options, flags, true,
+    : LDBCommand(options, flags, true /* is_read_only */,
                  BuildCmdLineOptions({ARG_TTL, ARG_HEX, ARG_KEY_HEX,
                                       ARG_VALUE_HEX, ARG_READ_TIMESTAMP})) {
   if (params.size() != 1) {
@@ -3552,7 +3554,7 @@ ApproxSizeCommand::ApproxSizeCommand(
     const std::vector<std::string>& /*params*/,
     const std::map<std::string, std::string>& options,
     const std::vector<std::string>& flags)
-    : LDBCommand(options, flags, true,
+    : LDBCommand(options, flags, true /* is_read_only */,
                  BuildCmdLineOptions(
                      {ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX, ARG_FROM, ARG_TO})) {
   if (options.find(ARG_FROM) != options.end()) {
@@ -3608,7 +3610,7 @@ BatchPutCommand::BatchPutCommand(
     const std::vector<std::string>& params,
     const std::map<std::string, std::string>& options,
     const std::vector<std::string>& flags)
-    : LDBCommand(options, flags, false,
+    : LDBCommand(options, flags, false /* is_read_only */,
                  BuildCmdLineOptions({ARG_TTL, ARG_HEX, ARG_KEY_HEX,
                                       ARG_VALUE_HEX, ARG_CREATE_IF_MISSING})) {
   if (params.size() < 2) {
@@ -3680,7 +3682,7 @@ ScanCommand::ScanCommand(const std::vector<std::string>& /*params*/,
                          const std::map<std::string, std::string>& options,
                          const std::vector<std::string>& flags)
     : LDBCommand(
-          options, flags, true,
+          options, flags, true /* is_read_only */,
           BuildCmdLineOptions({ARG_TTL, ARG_NO_VALUE, ARG_HEX, ARG_KEY_HEX,
                                ARG_TO, ARG_VALUE_HEX, ARG_FROM, ARG_TIMESTAMP,
                                ARG_MAX_KEYS, ARG_TTL_START, ARG_TTL_END,
@@ -3857,7 +3859,7 @@ void ScanCommand::DoCommand() {
 DeleteCommand::DeleteCommand(const std::vector<std::string>& params,
                              const std::map<std::string, std::string>& options,
                              const std::vector<std::string>& flags)
-    : LDBCommand(options, flags, false,
+    : LDBCommand(options, flags, false /* is_read_only */,
                  BuildCmdLineOptions({ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX})) {
   if (params.size() != 1) {
     exec_state_ = LDBCommandExecuteResult::Failed(
@@ -3893,7 +3895,7 @@ SingleDeleteCommand::SingleDeleteCommand(
     const std::vector<std::string>& params,
     const std::map<std::string, std::string>& options,
     const std::vector<std::string>& flags)
-    : LDBCommand(options, flags, false,
+    : LDBCommand(options, flags, false /* is_read_only */,
                  BuildCmdLineOptions({ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX})) {
   if (params.size() != 1) {
     exec_state_ = LDBCommandExecuteResult::Failed(
@@ -3929,7 +3931,7 @@ DeleteRangeCommand::DeleteRangeCommand(
     const std::vector<std::string>& params,
     const std::map<std::string, std::string>& options,
     const std::vector<std::string>& flags)
-    : LDBCommand(options, flags, false,
+    : LDBCommand(options, flags, false /* is_read_only */,
                  BuildCmdLineOptions({ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX})) {
   if (params.size() != 2) {
     exec_state_ = LDBCommandExecuteResult::Failed(
@@ -3967,7 +3969,7 @@ void DeleteRangeCommand::DoCommand() {
 PutCommand::PutCommand(const std::vector<std::string>& params,
                        const std::map<std::string, std::string>& options,
                        const std::vector<std::string>& flags)
-    : LDBCommand(options, flags, false,
+    : LDBCommand(options, flags, false /* is_read_only */,
                  BuildCmdLineOptions({ARG_TTL, ARG_HEX, ARG_KEY_HEX,
                                       ARG_VALUE_HEX, ARG_CREATE_IF_MISSING})) {
   if (params.size() != 2) {
@@ -4021,7 +4023,7 @@ PutEntityCommand::PutEntityCommand(
     const std::vector<std::string>& params,
     const std::map<std::string, std::string>& options,
     const std::vector<std::string>& flags)
-    : LDBCommand(options, flags, false,
+    : LDBCommand(options, flags, false /* is_read_only */,
                  BuildCmdLineOptions({ARG_TTL, ARG_HEX, ARG_KEY_HEX,
                                       ARG_VALUE_HEX, ARG_CREATE_IF_MISSING})) {
   if (params.size() < 2) {
@@ -4103,7 +4105,7 @@ DBQuerierCommand::DBQuerierCommand(
     const std::map<std::string, std::string>& options,
     const std::vector<std::string>& flags)
     : LDBCommand(
-          options, flags, false,
+          options, flags, false /* is_read_only */,
           BuildCmdLineOptions({ARG_TTL, ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX})) {
 
 }
@@ -4339,7 +4341,8 @@ CheckConsistencyCommand::CheckConsistencyCommand(
     const std::vector<std::string>& /*params*/,
     const std::map<std::string, std::string>& options,
     const std::vector<std::string>& flags)
-    : LDBCommand(options, flags, true, BuildCmdLineOptions({})) {}
+    : LDBCommand(options, flags, true /* is_read_only */,
+                 BuildCmdLineOptions({})) {}
 
 void CheckConsistencyCommand::Help(std::string& ret) {
   ret.append("  ");
@@ -4402,7 +4405,8 @@ const std::string RepairCommand::ARG_VERBOSE = "verbose";
 RepairCommand::RepairCommand(const std::vector<std::string>& /*params*/,
                              const std::map<std::string, std::string>& options,
                              const std::vector<std::string>& flags)
-    : LDBCommand(options, flags, false, BuildCmdLineOptions({ARG_VERBOSE})) {
+    : LDBCommand(options, flags, false /* is_read_only */,
+                 BuildCmdLineOptions({ARG_VERBOSE})) {
   verbose_ = IsFlagPresent(flags, ARG_VERBOSE);
 }
 
@@ -4683,7 +4687,7 @@ DBFileDumperCommand::DBFileDumperCommand(
     const std::vector<std::string>& /*params*/,
     const std::map<std::string, std::string>& options,
     const std::vector<std::string>& flags)
-    : LDBCommand(options, flags, true,
+    : LDBCommand(options, flags, true /* is_read_only */,
                  BuildCmdLineOptions(
                      {ARG_DECODE_BLOB_INDEX, ARG_DUMP_UNCOMPRESSED_BLOBS})),
       decode_blob_index_(IsFlagPresent(flags, ARG_DECODE_BLOB_INDEX)),
@@ -4804,7 +4808,7 @@ DBLiveFilesMetadataDumperCommand::DBLiveFilesMetadataDumperCommand(
     const std::vector<std::string>& /*params*/,
     const std::map<std::string, std::string>& options,
     const std::vector<std::string>& flags)
-    : LDBCommand(options, flags, true,
+    : LDBCommand(options, flags, true /* is_read_only */,
                  BuildCmdLineOptions({ARG_SORT_BY_FILENAME})) {
   sort_by_filename_ = IsFlagPresent(flags, ARG_SORT_BY_FILENAME);
 }
@@ -5119,7 +5123,8 @@ void IngestExternalSstFilesCommand::OverrideBaseOptions() {
 ListFileRangeDeletesCommand::ListFileRangeDeletesCommand(
     const std::map<std::string, std::string>& options,
     const std::vector<std::string>& flags)
-    : LDBCommand(options, flags, true, BuildCmdLineOptions({ARG_MAX_KEYS})) {
+    : LDBCommand(options, flags, true /* is_read_only */,
+                 BuildCmdLineOptions({ARG_MAX_KEYS})) {
   auto itr = options.find(ARG_MAX_KEYS);
   if (itr != options.end()) {
     try {
diff --git a/unreleased_history/behavior_changes/read_only_create_cf.md b/unreleased_history/behavior_changes/read_only_create_cf.md
new file mode 100644
index 000000000000..2ff8e658a75c
--- /dev/null
+++ b/unreleased_history/behavior_changes/read_only_create_cf.md
@@ -0,0 +1 @@
+* CreateColumnFamily() is no longer allowed on a read-only DB (OpenForReadOnly())

From 60c266658dc6597d4afce59297b9ebcd2998cf3a Mon Sep 17 00:00:00 2001
From: Richard Barnes <rbarnes@meta.com>
Date: Sun, 9 Mar 2025 11:18:56 -0700
Subject: [PATCH 015/500] Use `nullptr` in
 infra_asic_fpga/ip/mtia/athena/main/models/cmodel/util/jsonUtils.cpp

Summary:
`nullptr` is preferable to `0` or `NULL`. Let's use it everywhere so we can enable `-Wzero-as-null-pointer-constant`.

 - If you approve of this diff, please use the "Accept & Ship" button :-)

Reviewed By: dtolnay

Differential Revision: D70818166

fbshipit-source-id: 4658fb004676fe2686249fdd8ecb322dec8aa63d
---
 memtable/skiplist.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/memtable/skiplist.h b/memtable/skiplist.h
index c2774d6ceeb4..aabbe75c8615 100644
--- a/memtable/skiplist.h
+++ b/memtable/skiplist.h
@@ -438,7 +438,7 @@ SkipList<Key, Comparator>::SkipList(const Comparator cmp, Allocator* allocator,
       kScaledInverseBranching_((Random::kMaxNext + 1) / kBranching_),
       compare_(cmp),
       allocator_(allocator),
-      head_(NewNode(0 /* any key will do */, max_height)),
+      head_(NewNode({} /* any key will do */, max_height)),
       max_height_(1),
       prev_height_(1) {
   assert(max_height > 0 && kMaxHeight_ == static_cast<uint32_t>(max_height));

From 22ca6e5e68381915ad317db9e0afc06a52742fcd Mon Sep 17 00:00:00 2001
From: Jay Huh <jewoongh@meta.com>
Date: Mon, 10 Mar 2025 13:37:47 -0700
Subject: [PATCH 016/500] Additional debug logging for InputFileCheck Failure
 (#13452)

Summary:
Add debug logging when the Wait() does not return `kSuccess` so that we can compare the version state that was printed by the logging added in https://github.com/facebook/rocksdb/issues/13427 upon InputFileCheck failure.

# Test Plan

CI + Tested with Temporary Change in Meta Internal Infra

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13452

Reviewed By: hx235

Differential Revision: D70898963

Pulled By: jaykorean

fbshipit-source-id: d591b82f2df173b5e01f6552230844ce95155256
---
 db/compaction/compaction_service_job.cc | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/db/compaction/compaction_service_job.cc b/db/compaction/compaction_service_job.cc
index 1e81b6937da4..17a37e9fc29c 100644
--- a/db/compaction/compaction_service_job.cc
+++ b/db/compaction/compaction_service_job.cc
@@ -110,6 +110,9 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService(
       break;
   }
 
+  std::string debug_str_before_wait =
+      compaction->input_version()->DebugString();
+
   ROCKS_LOG_INFO(db_options_.info_log,
                  "[%s] [JOB %d] Waiting for remote compaction...",
                  compaction->column_family_data()->GetName().c_str(), job_id_);
@@ -118,6 +121,16 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService(
       db_options_.compaction_service->Wait(response.scheduled_job_id,
                                            &compaction_result_binary);
 
+  if (compaction_status != CompactionServiceJobStatus::kSuccess) {
+    ROCKS_LOG_ERROR(db_options_.info_log,
+                    "[%s] [JOB %d] Wait() status is not kSuccess. "
+                    "\nDebugString Before Wait():\n%s"
+                    "\nDebugString After Wait():\n%s",
+                    compaction->column_family_data()->GetName().c_str(),
+                    job_id_, debug_str_before_wait.c_str(),
+                    compaction->input_version()->DebugString().c_str());
+  }
+
   if (compaction_status == CompactionServiceJobStatus::kUseLocal) {
     ROCKS_LOG_INFO(
         db_options_.info_log,

From 8e16f8fecf469f2068f1020008993c9a4b6fceb6 Mon Sep 17 00:00:00 2001
From: Maciej Szeszko <mszeszko@meta.com>
Date: Wed, 12 Mar 2025 01:13:40 -0700
Subject: [PATCH 017/500] Reduce db stress noise (#13447)

Summary:
[Experiment]

This PR is a followup to https://github.com/facebook/rocksdb/pull/13408. Thick bandaid of ignoring all injected read errors in context of periodic iterator auto refreshes in db stress proved to be effective. We confirmed our theory that errors are not a really a consequence / defect related to this new feature but rather due to subtle ways in which downstream code paths handle their respective IO failures. In this change we're replacing a thick 'ignore all IO read errors' bandaid in `no_batched_ops_stress` with a much smaller, targeted patches in obsolete files purge / delete codepaths, table block cache reader, table cache lookup to make sure we don't miss signal and ensure there's a single mechanism for ignoring error injection in db stress tests.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13447

Reviewed By: hx235

Differential Revision: D70794787

Pulled By: mszeszko-meta

fbshipit-source-id: c5fcd4780d82357c407f53bf0bb22fc38f7bd277
---
 db/db_impl/db_impl_files.cc             | 3 +++
 db/table_cache.cc                       | 1 +
 db/wal_manager.cc                       | 1 +
 db_stress_tool/db_stress_shared_state.h | 2 +-
 db_stress_tool/no_batched_ops_stress.cc | 9 +--------
 file/delete_scheduler.cc                | 2 ++
 table/block_fetcher.cc                  | 1 +
 7 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/db/db_impl/db_impl_files.cc b/db/db_impl/db_impl_files.cc
index c1ef7b96b160..2c4b3bfde925 100644
--- a/db/db_impl/db_impl_files.cc
+++ b/db/db_impl/db_impl_files.cc
@@ -368,6 +368,7 @@ void DBImpl::DeleteObsoleteFileImpl(int job_id, const std::string& fname,
                                     FileType type, uint64_t number) {
   TEST_SYNC_POINT_CALLBACK("DBImpl::DeleteObsoleteFileImpl::BeforeDeletion",
                            const_cast<std::string*>(&fname));
+  IGNORE_STATUS_IF_ERROR(Status::IOError());
 
   Status file_deletion_status;
   if (type == kTableFile || type == kBlobFile || type == kWalFile) {
@@ -423,6 +424,8 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) {
   // FindObsoleteFiles() should've populated this so nonzero
   assert(state.manifest_file_number != 0);
 
+  IGNORE_STATUS_IF_ERROR(Status::IOError());
+
   // Now, convert lists to unordered sets, WITHOUT mutex held; set is slow.
   std::unordered_set<uint64_t> sst_live_set(state.sst_live.begin(),
                                             state.sst_live.end());
diff --git a/db/table_cache.cc b/db/table_cache.cc
index 773446b6a583..b689a7730ade 100644
--- a/db/table_cache.cc
+++ b/db/table_cache.cc
@@ -205,6 +205,7 @@ Status TableCache::FindTable(
       RecordTick(ioptions_.stats, NO_FILE_ERRORS);
       // We do not cache error results so that if the error is transient,
       // or somebody repairs the file, we recover automatically.
+      IGNORE_STATUS_IF_ERROR(s);
     } else {
       s = cache_.Insert(key, table_reader.get(), 1, handle);
       if (s.ok()) {
diff --git a/db/wal_manager.cc b/db/wal_manager.cc
index 60e85567be4a..a0a7a8d9027d 100644
--- a/db/wal_manager.cc
+++ b/db/wal_manager.cc
@@ -283,6 +283,7 @@ void WalManager::ArchiveWALFile(const std::string& fname, uint64_t number) {
   // The sync point below is used in (DBTest,TransactionLogIteratorRace)
   TEST_SYNC_POINT("WalManager::PurgeObsoleteFiles:1");
   Status s = env_->RenameFile(fname, archived_log_name);
+  IGNORE_STATUS_IF_ERROR(s);
   // The sync point below is used in (DBTest,TransactionLogIteratorRace)
   TEST_SYNC_POINT("WalManager::PurgeObsoleteFiles:2");
   // The sync point below is used in
diff --git a/db_stress_tool/db_stress_shared_state.h b/db_stress_tool/db_stress_shared_state.h
index 5d9fb34ac10c..9a14986b396b 100644
--- a/db_stress_tool/db_stress_shared_state.h
+++ b/db_stress_tool/db_stress_shared_state.h
@@ -137,7 +137,7 @@ class SharedState {
     for (int i = 0; i < FLAGS_column_families; ++i) {
       key_locks_[i].reset(new port::Mutex[num_locks]);
     }
-    if (FLAGS_read_fault_one_in) {
+    if (FLAGS_read_fault_one_in || FLAGS_metadata_read_fault_one_in) {
 #ifdef NDEBUG
       // Unsupported in release mode because it relies on
       // `IGNORE_STATUS_IF_ERROR` to distinguish faults not expected to lead to
diff --git a/db_stress_tool/no_batched_ops_stress.cc b/db_stress_tool/no_batched_ops_stress.cc
index 616035a1b4fe..44165563c621 100644
--- a/db_stress_tool/no_batched_ops_stress.cc
+++ b/db_stress_tool/no_batched_ops_stress.cc
@@ -1600,12 +1600,6 @@ class NonBatchedOpsStressTest : public StressTest {
     Slice ub_slice;
     ReadOptions ro_copy = read_opts;
 
-    // There is a narrow window in iterator auto refresh run where injected read
-    // errors are simply untraceable, ex. failure to delete file as a part of
-    // superversion cleanup callback invoked by the DBIter destructor.
-    bool ignore_injected_read_error_in_iter =
-        ro_copy.auto_refresh_iterator_with_snapshot;
-
     // Randomly test with `iterate_upper_bound` and `prefix_same_as_start`
     //
     // Get the next prefix first and then see if we want to set it to be the
@@ -1698,8 +1692,7 @@ class NonBatchedOpsStressTest : public StressTest {
               FaultInjectionIOType::kRead),
           fault_fs_guard->GetAndResetInjectedThreadLocalErrorCount(
               FaultInjectionIOType::kMetadataRead));
-      if (!ignore_injected_read_error_in_iter &&
-          !SharedState::ignore_read_error && injected_error_count > 0 &&
+      if (!SharedState::ignore_read_error && injected_error_count > 0 &&
           s.ok()) {
         // Grab mutex so multiple thread don't try to print the
         // stack trace at the same time
diff --git a/file/delete_scheduler.cc b/file/delete_scheduler.cc
index b06409a5dcbb..79bb63c5b3d9 100644
--- a/file/delete_scheduler.cc
+++ b/file/delete_scheduler.cc
@@ -130,6 +130,7 @@ Status DeleteScheduler::AddFileToDeletionQueue(const std::string& file_path,
                  s.ToString().c_str());
 
   if (!s.ok()) {
+    IGNORE_STATUS_IF_ERROR(s);
     ROCKS_LOG_ERROR(info_log_, "Failed to mark %s as trash -- %s",
                     file_path.c_str(), s.ToString().c_str());
     s = fs_->DeleteFile(file_path, IOOptions(), nullptr);
@@ -151,6 +152,7 @@ Status DeleteScheduler::AddFileToDeletionQueue(const std::string& file_path,
     if (io_s.ok()) {
       total_trash_size_.fetch_add(trash_file_size);
     }
+    IGNORE_STATUS_IF_ERROR(s);
   }
   //**TODO: What should we do if we failed to
   // get the file size?
diff --git a/table/block_fetcher.cc b/table/block_fetcher.cc
index 0637440bdcf9..d0ccc2a70e81 100644
--- a/table/block_fetcher.cc
+++ b/table/block_fetcher.cc
@@ -320,6 +320,7 @@ void BlockFetcher::ReadBlock(bool retry) {
   }
 
   PERF_COUNTER_ADD(block_read_byte, block_size_with_trailer_);
+  IGNORE_STATUS_IF_ERROR(io_status_);
   if (io_status_.ok()) {
     if (use_fs_scratch_ && !read_req.status.ok()) {
       io_status_ = read_req.status;

From c5921df3d71c723ae4ca5650ff1f65305a53e5e0 Mon Sep 17 00:00:00 2001
From: Jay Huh <jewoongh@meta.com>
Date: Wed, 12 Mar 2025 11:46:02 -0700
Subject: [PATCH 018/500] Add PerKeyPlacement support (#13459)

Summary:
This PR adds support for PerKeyPlacement in Remote Compaction.

The `seqno_to_time_mapping` is already available from the table properties of the input files. `preserve_internal_time_seconds` and `preclude_last_level_data_seconds` are directly read from the OPTIONS file upon db open in the remote worker. The necessary changes include:

- Add `is_penultimate_level_output` and `file_temperature` to the `CompactionServiceOutputFile`
- When building the output for the remote compaction, get the outputs for penultimate level and last level separately, serialize them with the two additional information added in this PR.
- When deserializing the result from the primary, SubcompactionState's `GetOutputs()` now takes `is_penultimate_level`. This allows us to determine which level to place the output file.
- Include stats from `compaction_stats.penultimate_level_stats` in the remote compaction result

# To Follow up
- Stats to be fixed. Stats are not being populated correctly for PerKeyPlacement even for non-remote compactions.
- Clean up / Reconcile the "penultimate" naming by replacing with "proximal"

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13459

Test Plan:
Updated the unit test

```
./compaction_service_test
```

Reviewed By: pdillinger

Differential Revision: D71007211

Pulled By: jaykorean

fbshipit-source-id: f926e56df17239875d849d46b8b940f8cd5f1825
---
 db/compaction/compaction_job.h                | 10 +++-
 db/compaction/compaction_job_test.cc          |  5 +-
 db/compaction/compaction_outputs.h            |  9 ++--
 db/compaction/compaction_service_job.cc       | 48 ++++++++++++-------
 db/compaction/compaction_service_test.cc      | 30 ++++++------
 db/compaction/subcompaction_state.h           |  9 ++++
 .../per_key_placement_remote_compaction.md    |  1 +
 7 files changed, 74 insertions(+), 38 deletions(-)
 create mode 100644 unreleased_history/new_features/per_key_placement_remote_compaction.md

diff --git a/db/compaction/compaction_job.h b/db/compaction/compaction_job.h
index 730b5ddac945..ea4839f21580 100644
--- a/db/compaction/compaction_job.h
+++ b/db/compaction/compaction_job.h
@@ -431,6 +431,9 @@ struct CompactionServiceOutputFile {
   bool marked_for_compaction;
   UniqueId64x2 unique_id{};
   TableProperties table_properties;
+  // TODO: clean up the rest of the "penultimate" naming in the codebase
+  bool is_proximal_level_output;  // == is_penultimate_level_output
+  Temperature file_temperature;
 
   CompactionServiceOutputFile() = default;
   CompactionServiceOutputFile(
@@ -440,7 +443,8 @@ struct CompactionServiceOutputFile {
       uint64_t _epoch_number, const std::string& _file_checksum,
       const std::string& _file_checksum_func_name, uint64_t _paranoid_hash,
       bool _marked_for_compaction, UniqueId64x2 _unique_id,
-      const TableProperties& _table_properties)
+      const TableProperties& _table_properties, bool _is_proximal_level_output,
+      Temperature _file_temperature)
       : file_name(name),
         smallest_seqno(smallest),
         largest_seqno(largest),
@@ -454,7 +458,9 @@ struct CompactionServiceOutputFile {
         paranoid_hash(_paranoid_hash),
         marked_for_compaction(_marked_for_compaction),
         unique_id(std::move(_unique_id)),
-        table_properties(_table_properties) {}
+        table_properties(_table_properties),
+        is_proximal_level_output(_is_proximal_level_output),
+        file_temperature(_file_temperature) {}
 };
 
 // CompactionServiceResult contains the compaction result from a different db
diff --git a/db/compaction/compaction_job_test.cc b/db/compaction/compaction_job_test.cc
index 1108223a6f29..2eb354f89e3c 100644
--- a/db/compaction/compaction_job_test.cc
+++ b/db/compaction/compaction_job_test.cc
@@ -1682,7 +1682,8 @@ TEST_F(CompactionJobTest, ResultSerialization) {
         file_checksum /* file_checksum */,
         file_checksum_func_name /* file_checksum_func_name */,
         rnd64.Uniform(UINT64_MAX) /* paranoid_hash */,
-        rnd.OneIn(2) /* marked_for_compaction */, id /* unique_id */, tp);
+        rnd.OneIn(2) /* marked_for_compaction */, id /* unique_id */, tp,
+        false /* is_proximal_level_output */, Temperature::kHot);
   }
   result.output_level = rnd.Uniform(10);
   result.output_path = rnd.RandomString(rnd.Uniform(kStrMaxLen));
@@ -1736,6 +1737,8 @@ TEST_F(CompactionJobTest, ResultSerialization) {
     ASSERT_EQ(deserialized_tmp.output_files[0].file_checksum, file_checksum);
     ASSERT_EQ(deserialized_tmp.output_files[0].file_checksum_func_name,
               file_checksum_func_name);
+    ASSERT_EQ(deserialized_tmp.output_files[0].file_temperature,
+              Temperature::kHot);
   }
 
   // Test unknown field
diff --git a/db/compaction/compaction_outputs.h b/db/compaction/compaction_outputs.h
index 33259be4670a..6b88eb452a08 100644
--- a/db/compaction/compaction_outputs.h
+++ b/db/compaction/compaction_outputs.h
@@ -30,13 +30,16 @@ class CompactionOutputs {
   // compaction output file
   struct Output {
     Output(FileMetaData&& _meta, const InternalKeyComparator& _icmp,
-           bool _enable_hash, bool _finished, uint64_t precalculated_hash)
+           bool _enable_hash, bool _finished, uint64_t precalculated_hash,
+           bool _is_penultimate_level)
         : meta(std::move(_meta)),
           validator(_icmp, _enable_hash, precalculated_hash),
-          finished(_finished) {}
+          finished(_finished),
+          is_penultimate_level(_is_penultimate_level) {}
     FileMetaData meta;
     OutputValidator validator;
     bool finished;
+    bool is_penultimate_level;
     std::shared_ptr<const TableProperties> table_properties;
   };
 
@@ -52,7 +55,7 @@ class CompactionOutputs {
                  bool enable_hash, bool finished = false,
                  uint64_t precalculated_hash = 0) {
     outputs_.emplace_back(std::move(meta), icmp, enable_hash, finished,
-                          precalculated_hash);
+                          precalculated_hash, is_penultimate_level_);
   }
 
   // Set new table builder for the current output
diff --git a/db/compaction/compaction_service_job.cc b/db/compaction/compaction_service_job.cc
index 17a37e9fc29c..cc75729bfe00 100644
--- a/db/compaction/compaction_service_job.cc
+++ b/db/compaction/compaction_service_job.cc
@@ -239,12 +239,15 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService(
     meta.file_checksum_func_name = file.file_checksum_func_name;
     meta.marked_for_compaction = file.marked_for_compaction;
     meta.unique_id = file.unique_id;
+    meta.temperature = file.file_temperature;
 
     auto cfd = compaction->column_family_data();
-    sub_compact->Current().AddOutput(std::move(meta),
-                                     cfd->internal_comparator(), false, true,
-                                     file.paranoid_hash);
-    sub_compact->Current().UpdateTableProperties(file.table_properties);
+    CompactionOutputs* compaction_outputs =
+        sub_compact->Outputs(file.is_proximal_level_output);
+    assert(compaction_outputs);
+    compaction_outputs->AddOutput(std::move(meta), cfd->internal_comparator(),
+                                  false, true, file.paranoid_hash);
+    compaction_outputs->UpdateTableProperties(file.table_properties);
   }
   sub_compact->compaction_job_stats = compaction_result.stats;
   sub_compact->Current().SetNumOutputRecords(
@@ -273,14 +276,12 @@ void CompactionServiceCompactionJob::RecordCompactionIOStats() {
 
 void CompactionServiceCompactionJob::UpdateCompactionJobStats(
     const InternalStats::CompactionStats& stats) const {
-  compaction_job_stats_->elapsed_micros = stats.micros;
-
   // output information only in remote compaction
-  compaction_job_stats_->total_output_bytes = stats.bytes_written;
-  compaction_job_stats_->total_output_bytes_blob = stats.bytes_written_blob;
-  compaction_job_stats_->num_output_records = stats.num_output_records;
-  compaction_job_stats_->num_output_files = stats.num_output_files;
-  compaction_job_stats_->num_output_files_blob = stats.num_output_files_blob;
+  compaction_job_stats_->total_output_bytes += stats.bytes_written;
+  compaction_job_stats_->total_output_bytes_blob += stats.bytes_written_blob;
+  compaction_job_stats_->num_output_records += stats.num_output_records;
+  compaction_job_stats_->num_output_files += stats.num_output_files;
+  compaction_job_stats_->num_output_files_blob += stats.num_output_files_blob;
 }
 
 CompactionServiceCompactionJob::CompactionServiceCompactionJob(
@@ -344,15 +345,15 @@ Status CompactionServiceCompactionJob::Run() {
 
   ProcessKeyValueCompaction(sub_compact);
 
-  compaction_stats_.stats.micros =
+  compaction_job_stats_->elapsed_micros =
       db_options_.clock->NowMicros() - start_micros;
-  compaction_stats_.stats.cpu_micros =
+  compaction_job_stats_->cpu_micros =
       sub_compact->compaction_job_stats.cpu_micros;
 
   RecordTimeToHistogram(stats_, COMPACTION_TIME,
-                        compaction_stats_.stats.micros);
+                        compaction_job_stats_->elapsed_micros);
   RecordTimeToHistogram(stats_, COMPACTION_CPU_TIME,
-                        compaction_stats_.stats.cpu_micros);
+                        compaction_job_stats_->cpu_micros);
 
   Status status = sub_compact->status;
   IOStatus io_s = sub_compact->io_status;
@@ -390,6 +391,9 @@ Status CompactionServiceCompactionJob::Run() {
   // 2. Update the Output information in the Compaction Job Stats with
   // aggregated Internal Compaction Stats.
   UpdateCompactionJobStats(compaction_stats_.stats);
+  if (compaction_stats_.has_penultimate_level_output) {
+    UpdateCompactionJobStats(compaction_stats_.penultimate_level_stats);
+  }
 
   // 3. Set fields that are not propagated as part of aggregations above
   compaction_result_->stats.is_manual_compaction = c->is_manual_compaction();
@@ -413,7 +417,8 @@ Status CompactionServiceCompactionJob::Run() {
           meta.file_creation_time, meta.epoch_number, meta.file_checksum,
           meta.file_checksum_func_name, output_file.validator.GetHash(),
           meta.marked_for_compaction, meta.unique_id,
-          *output_file.table_properties);
+          *output_file.table_properties, output_file.is_penultimate_level,
+          meta.temperature);
     }
   }
 
@@ -585,7 +590,16 @@ static std::unordered_map<std::string, OptionTypeInfo>
             const auto this_one = static_cast<const TableProperties*>(addr1);
             const auto that_one = static_cast<const TableProperties*>(addr2);
             return this_one->AreEqual(opts, that_one, mismatch);
-          }}}};
+          }}},
+        {"is_proximal_level_output",
+         {offsetof(struct CompactionServiceOutputFile,
+                   is_proximal_level_output),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"file_temperature",
+         {offsetof(struct CompactionServiceOutputFile, file_temperature),
+          OptionType::kTemperature, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}}};
 
 static std::unordered_map<std::string, OptionTypeInfo>
     compaction_job_stats_type_info = {
diff --git a/db/compaction/compaction_service_test.cc b/db/compaction/compaction_service_test.cc
index 95c9bd789e10..064eae37fe86 100644
--- a/db/compaction/compaction_service_test.cc
+++ b/db/compaction/compaction_service_test.cc
@@ -1188,34 +1188,34 @@ TEST_F(CompactionServiceTest, PrecludeLastLevel) {
 
   for (int i = 0; i < kNumTrigger; i++) {
     for (int j = 0; j < kNumKeys; j++) {
-      // FIXME: need to assign outputs to levels to allow overlapping ranges:
-      // ASSERT_OK(Put(Key(j * kNumTrigger + i), "v" + std::to_string(i)));
-      // instead of this (too easy):
-      ASSERT_OK(Put(Key(i * kNumKeys + j), "v" + std::to_string(i)));
+      ASSERT_OK(Put(Key(j * kNumTrigger + i), "v" + std::to_string(i)));
     }
     ASSERT_OK(Flush());
   }
   ASSERT_OK(dbfull()->TEST_WaitForCompact());
 
   // Data split between penultimate (kUnknown) and last (kCold) levels
-  // FIXME: need to assign outputs to levels to get this:
-  // ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
-  // ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
-  // ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
-  // instead of this (WRONG but currently expected):
-  ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel());
-  // Check manifest temperatures
+  ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
   ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
-  ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
   // TODO: Check FileSystem temperatures with FileTemperatureTestFS
 
   for (int i = 0; i < kNumTrigger; i++) {
     for (int j = 0; j < kNumKeys; j++) {
-      // FIXME
-      // ASSERT_EQ(Get(Key(j * kNumTrigger + i)), "v" + std::to_string(i));
-      ASSERT_EQ(Get(Key(i * kNumKeys + j)), "v" + std::to_string(i));
+      ASSERT_EQ(Get(Key(j * kNumTrigger + i)), "v" + std::to_string(i));
     }
   }
+
+  // Verify Output Stats
+  auto my_cs = GetCompactionService();
+  CompactionServiceResult result;
+  my_cs->GetResult(&result);
+  ASSERT_OK(result.status);
+  ASSERT_GT(result.stats.cpu_micros, 0);
+  ASSERT_GT(result.stats.elapsed_micros, 0);
+  ASSERT_EQ(result.stats.num_output_records, kNumTrigger * kNumKeys);
+  ASSERT_EQ(result.stats.num_output_files, 2);
 }
 
 TEST_F(CompactionServiceTest, ConcurrentCompaction) {
diff --git a/db/compaction/subcompaction_state.h b/db/compaction/subcompaction_state.h
index 6a28f74d9089..b9117f5adfbc 100644
--- a/db/compaction/subcompaction_state.h
+++ b/db/compaction/subcompaction_state.h
@@ -169,6 +169,15 @@ class SubcompactionState {
     return *current_outputs_;
   }
 
+  CompactionOutputs* Outputs(bool is_penultimate_level) {
+    assert(compaction);
+    if (is_penultimate_level) {
+      assert(compaction->SupportsPerKeyPlacement());
+      return &penultimate_level_outputs_;
+    }
+    return &compaction_outputs_;
+  }
+
   CompactionRangeDelAggregator* RangeDelAgg() const {
     return range_del_agg_.get();
   }
diff --git a/unreleased_history/new_features/per_key_placement_remote_compaction.md b/unreleased_history/new_features/per_key_placement_remote_compaction.md
new file mode 100644
index 000000000000..e89d3e155e76
--- /dev/null
+++ b/unreleased_history/new_features/per_key_placement_remote_compaction.md
@@ -0,0 +1 @@
+Added per-key-placement feature in Remote Compaction

From ca7367a00319b333e783cab60cb600dd54f18b8f Mon Sep 17 00:00:00 2001
From: Jay Huh <jewoongh@meta.com>
Date: Wed, 12 Mar 2025 18:24:28 -0700
Subject: [PATCH 019/500] Replace penultimate naming with proximal (#13460)

Summary:
With generalized age-based tiering (work-in-progress), the "warm tier" data will no longer necessarily be placed in the second-to-last level (also known as the "penultimate level").

Also, the cold tier may no longer necessarily be at the last level, so we need to rename options like `preclude_last_level_seconds` to `preclude_cold_tier_seconds`, but renaming options is trickier because it can be a breaking change for consuming applications. We will do this later as a follow up.

**Minor fix included**: Fixed one `use-after-move` in CompactionPicker

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13460

Test Plan: CI

Reviewed By: pdillinger

Differential Revision: D71059486

Pulled By: jaykorean

fbshipit-source-id: fd360cdf719e015bf9f9e3f6f1663438226566a4
---
 db/compaction/compaction.cc                  | 106 +++++++++---------
 db/compaction/compaction.h                   |  86 +++++++-------
 db/compaction/compaction_job.cc              | 111 +++++++++---------
 db/compaction/compaction_job.h               |  19 ++--
 db/compaction/compaction_job_test.cc         |   9 +-
 db/compaction/compaction_outputs.cc          |   4 +-
 db/compaction/compaction_outputs.h           |  28 ++---
 db/compaction/compaction_picker.cc           |  38 +++----
 db/compaction/compaction_picker.h            |   2 +-
 db/compaction/compaction_picker_level.cc     |  18 +--
 db/compaction/compaction_picker_test.cc      |  58 +++++-----
 db/compaction/compaction_picker_universal.cc |  41 +++----
 db/compaction/compaction_service_job.cc      |   6 +-
 db/compaction/compaction_service_test.cc     |   2 +-
 db/compaction/subcompaction_state.cc         |  31 +++--
 db/compaction/subcompaction_state.h          |  54 +++++----
 db/compaction/tiered_compaction_test.cc      | 112 +++++++++----------
 db/flush_job.h                               |   2 +-
 db/internal_stats.h                          |  31 +++--
 db/seqno_time_test.cc                        |   8 +-
 db/version_set.cc                            |   2 +-
 include/rocksdb/compaction_job_stats.h       |   2 +-
 22 files changed, 373 insertions(+), 397 deletions(-)

diff --git a/db/compaction/compaction.cc b/db/compaction/compaction.cc
index 313e2998aecd..e5c817a0f218 100644
--- a/db/compaction/compaction.cc
+++ b/db/compaction/compaction.cc
@@ -338,16 +338,16 @@ Compaction::Compaction(
                   _blob_garbage_collection_age_cutoff > 1
               ? mutable_cf_options().blob_garbage_collection_age_cutoff
               : _blob_garbage_collection_age_cutoff),
-      penultimate_level_(
-          // For simplicity, we don't support the concept of "penultimate level"
+      proximal_level_(
+          // For simplicity, we don't support the concept of "proximal level"
           // with `CompactionReason::kExternalSstIngestion` and
           // `CompactionReason::kRefitLevel`
           _compaction_reason == CompactionReason::kExternalSstIngestion ||
                   _compaction_reason == CompactionReason::kRefitLevel
               ? Compaction::kInvalidLevel
-              : EvaluatePenultimateLevel(vstorage, mutable_cf_options_,
-                                         immutable_options_, start_level_,
-                                         output_level_)) {
+              : EvaluateProximalLevel(vstorage, mutable_cf_options_,
+                                      immutable_options_, start_level_,
+                                      output_level_)) {
   MarkFilesBeingCompacted(true);
   if (is_manual_compaction_) {
     compaction_reason_ = CompactionReason::kManualCompaction;
@@ -405,10 +405,10 @@ Compaction::Compaction(
     }
   }
 
-  PopulatePenultimateLevelOutputRange();
+  PopulateProximalLevelOutputRange();
 }
 
-void Compaction::PopulatePenultimateLevelOutputRange() {
+void Compaction::PopulateProximalLevelOutputRange() {
   if (!SupportsPerKeyPlacement()) {
     assert(keep_in_last_level_through_seqno_ == kMaxSequenceNumber);
     return;
@@ -417,46 +417,42 @@ void Compaction::PopulatePenultimateLevelOutputRange() {
   // exclude the last level, the range of all input levels is the safe range
   // of keys that can be moved up.
   int exclude_level = number_levels_ - 1;
-  penultimate_output_range_type_ = PenultimateOutputRangeType::kNonLastRange;
+  proximal_output_range_type_ = ProximalOutputRangeType::kNonLastRange;
 
-  // For universal compaction, the penultimate_output_range could be extended if
-  // all penultimate level files are included in the compaction (which includes
-  // the case that the penultimate level is empty).
+  // For universal compaction, the proximal_output_range could be extended if
+  // all proximal level files are included in the compaction (which includes
+  // the case that the proximal level is empty).
   if (immutable_options_.compaction_style == kCompactionStyleUniversal) {
     exclude_level = kInvalidLevel;
-    penultimate_output_range_type_ = PenultimateOutputRangeType::kFullRange;
-    std::set<uint64_t> penultimate_inputs;
+    proximal_output_range_type_ = ProximalOutputRangeType::kFullRange;
+    std::set<uint64_t> proximal_inputs;
     for (const auto& input_lvl : inputs_) {
-      if (input_lvl.level == penultimate_level_) {
+      if (input_lvl.level == proximal_level_) {
         for (const auto& file : input_lvl.files) {
-          penultimate_inputs.emplace(file->fd.GetNumber());
+          proximal_inputs.emplace(file->fd.GetNumber());
         }
       }
     }
-    auto penultimate_files = input_vstorage_->LevelFiles(penultimate_level_);
-    for (const auto& file : penultimate_files) {
-      if (penultimate_inputs.find(file->fd.GetNumber()) ==
-          penultimate_inputs.end()) {
+    auto proximal_files = input_vstorage_->LevelFiles(proximal_level_);
+    for (const auto& file : proximal_files) {
+      if (proximal_inputs.find(file->fd.GetNumber()) == proximal_inputs.end()) {
         exclude_level = number_levels_ - 1;
-        penultimate_output_range_type_ =
-            PenultimateOutputRangeType::kNonLastRange;
+        proximal_output_range_type_ = ProximalOutputRangeType::kNonLastRange;
         break;
       }
     }
   }
 
-  // FIXME: should make use of `penultimate_output_range_type_`.
+  // FIXME: should make use of `proximal_output_range_type_`.
   // FIXME: when last level's input range does not overlap with
-  //  penultimate level, and penultimate level input is empty,
-  //  this call will not set penultimate_level_smallest_ or
-  //  penultimate_level_largest_. No keys will be compacted up.
-  GetBoundaryInternalKeys(input_vstorage_, inputs_,
-                          &penultimate_level_smallest_,
-                          &penultimate_level_largest_, exclude_level);
-
-  if (penultimate_output_range_type_ !=
-      PenultimateOutputRangeType::kFullRange) {
-    // If not full range in penultimate level, must keep everything already
+  //  proximal level, and proximal level input is empty,
+  //  this call will not set proximal_level_smallest_ or
+  //  proximal_level_largest_. No keys will be compacted up.
+  GetBoundaryInternalKeys(input_vstorage_, inputs_, &proximal_level_smallest_,
+                          &proximal_level_largest_, exclude_level);
+
+  if (proximal_output_range_type_ != ProximalOutputRangeType::kFullRange) {
+    // If not full range in proximal level, must keep everything already
     // in the last level there, because moving it back up might cause
     // overlap/placement issues that are difficult to resolve properly in the
     // presence of range deletes
@@ -486,23 +482,23 @@ Compaction::~Compaction() {
 }
 
 bool Compaction::SupportsPerKeyPlacement() const {
-  return penultimate_level_ != kInvalidLevel;
+  return proximal_level_ != kInvalidLevel;
 }
 
-int Compaction::GetPenultimateLevel() const { return penultimate_level_; }
+int Compaction::GetProximalLevel() const { return proximal_level_; }
 
 // smallest_key and largest_key include timestamps if user-defined timestamp is
 // enabled.
-bool Compaction::OverlapPenultimateLevelOutputRange(
+bool Compaction::OverlapProximalLevelOutputRange(
     const Slice& smallest_key, const Slice& largest_key) const {
   if (!SupportsPerKeyPlacement()) {
     return false;
   }
 
-  // See FIXME in Compaction::PopulatePenultimateLevelOutputRange().
+  // See FIXME in Compaction::PopulateProximalLevelOutputRange().
   // We do not compact any key up in this case.
-  if (penultimate_level_smallest_.size() == 0 ||
-      penultimate_level_largest_.size() == 0) {
+  if (proximal_level_smallest_.size() == 0 ||
+      proximal_level_largest_.size() == 0) {
     return false;
   }
 
@@ -510,13 +506,13 @@ bool Compaction::OverlapPenultimateLevelOutputRange(
       input_vstorage_->InternalComparator()->user_comparator();
 
   return ucmp->CompareWithoutTimestamp(
-             smallest_key, penultimate_level_largest_.user_key()) <= 0 &&
+             smallest_key, proximal_level_largest_.user_key()) <= 0 &&
          ucmp->CompareWithoutTimestamp(
-             largest_key, penultimate_level_smallest_.user_key()) >= 0;
+             largest_key, proximal_level_smallest_.user_key()) >= 0;
 }
 
 // key includes timestamp if user-defined timestamp is enabled.
-void Compaction::TEST_AssertWithinPenultimateLevelOutputRange(
+void Compaction::TEST_AssertWithinProximalLevelOutputRange(
     const Slice& user_key, bool expect_failure) const {
 #ifdef NDEBUG
   (void)user_key;
@@ -524,15 +520,15 @@ void Compaction::TEST_AssertWithinPenultimateLevelOutputRange(
 #else
   assert(SupportsPerKeyPlacement());
 
-  assert(penultimate_level_smallest_.size() > 0);
-  assert(penultimate_level_largest_.size() > 0);
+  assert(proximal_level_smallest_.size() > 0);
+  assert(proximal_level_largest_.size() > 0);
 
   auto* cmp = input_vstorage_->user_comparator();
 
   // op_type of a key can change during compaction, e.g. Merge -> Put.
-  if (!(cmp->Compare(user_key, penultimate_level_smallest_.user_key()) >= 0)) {
+  if (!(cmp->Compare(user_key, proximal_level_smallest_.user_key()) >= 0)) {
     assert(expect_failure);
-  } else if (!(cmp->Compare(user_key, penultimate_level_largest_.user_key()) <=
+  } else if (!(cmp->Compare(user_key, proximal_level_largest_.user_key()) <=
                0)) {
     assert(expect_failure);
   } else {
@@ -1018,7 +1014,7 @@ uint64_t Compaction::MinInputFileEpochNumber() const {
   return min_epoch_number;
 }
 
-int Compaction::EvaluatePenultimateLevel(
+int Compaction::EvaluateProximalLevel(
     const VersionStorageInfo* vstorage,
     const MutableCFOptions& mutable_cf_options,
     const ImmutableOptions& immutable_options, const int start_level,
@@ -1033,21 +1029,21 @@ int Compaction::EvaluatePenultimateLevel(
     return kInvalidLevel;
   }
 
-  int penultimate_level = output_level - 1;
-  assert(penultimate_level < immutable_options.num_levels);
-  if (penultimate_level <= 0) {
+  int proximal_level = output_level - 1;
+  assert(proximal_level < immutable_options.num_levels);
+  if (proximal_level <= 0) {
     return kInvalidLevel;
   }
 
-  // If the penultimate level is not within input level -> output level range
-  // check if the penultimate output level is empty, if it's empty, it could
-  // also be locked for the penultimate output.
+  // If the proximal level is not within input level -> output level range
+  // check if the proximal output level is empty, if it's empty, it could
+  // also be locked for the proximal output.
   // TODO: ideally, it only needs to check if there's a file within the
   //  compaction output key range. For simplicity, it just check if there's any
-  //  file on the penultimate level.
+  //  file on the proximal level.
   if (start_level == immutable_options.num_levels - 1 &&
       (immutable_options.compaction_style != kCompactionStyleUniversal ||
-       !vstorage->LevelFiles(penultimate_level).empty())) {
+       !vstorage->LevelFiles(proximal_level).empty())) {
     return kInvalidLevel;
   }
 
@@ -1061,7 +1057,7 @@ int Compaction::EvaluatePenultimateLevel(
     return kInvalidLevel;
   }
 
-  return penultimate_level;
+  return proximal_level;
 }
 
 void Compaction::FilterInputsForCompactionIterator() {
diff --git a/db/compaction/compaction.h b/db/compaction/compaction.h
index 534b13c6a8f8..fe7fc5026ed8 100644
--- a/db/compaction/compaction.h
+++ b/db/compaction/compaction.h
@@ -102,13 +102,13 @@ class Compaction {
                  BlobGarbageCollectionPolicy::kUseDefault,
              double blob_garbage_collection_age_cutoff = -1);
 
-  // The type of the penultimate level output range
-  enum class PenultimateOutputRangeType : int {
-    kNotSupported,  // it cannot output to the penultimate level
-    kFullRange,     // any data could be output to the penultimate level
+  // The type of the proximal level output range
+  enum class ProximalOutputRangeType : int {
+    kNotSupported,  // it cannot output to the proximal level
+    kFullRange,     // any data could be output to the proximal level
     kNonLastRange,  // only the keys within non_last_level compaction inputs can
-                    // be outputted to the penultimate level
-    kDisabled,      // no data can be outputted to the penultimate level
+                    // be outputted to the proximal level
+    kDisabled,      // no data can be outputted to the proximal level
   };
 
   // No copying allowed
@@ -370,29 +370,29 @@ class Compaction {
 
   Slice GetLargestUserKey() const { return largest_user_key_; }
 
-  PenultimateOutputRangeType GetPenultimateOutputRangeType() const {
-    return penultimate_output_range_type_;
+  ProximalOutputRangeType GetProximalOutputRangeType() const {
+    return proximal_output_range_type_;
   }
 
   // Return true if the compaction supports per_key_placement
   bool SupportsPerKeyPlacement() const;
 
-  // Get per_key_placement penultimate output level, which is `last_level - 1`
+  // Get per_key_placement proximal output level, which is `last_level - 1`
   // if per_key_placement feature is supported. Otherwise, return -1.
-  int GetPenultimateLevel() const;
+  int GetProximalLevel() const;
 
-  // Return true if the given range is overlap with penultimate level output
+  // Return true if the given range is overlap with proximal level output
   // range.
   // Both smallest_key and largest_key include timestamps if user-defined
   // timestamp is enabled.
-  bool OverlapPenultimateLevelOutputRange(const Slice& smallest_key,
-                                          const Slice& largest_key) const;
+  bool OverlapProximalLevelOutputRange(const Slice& smallest_key,
+                                       const Slice& largest_key) const;
 
-  // For testing purposes, check that a key is within penultimate level
+  // For testing purposes, check that a key is within proximal level
   // output range for per_key_placement feature, which is safe to place the key
-  // to the penultimate level. Different compaction strategies have different
+  // to the proximal level. Different compaction strategies have different
   // rules. `user_key` includes timestamp if user-defined timestamp is enabled.
-  void TEST_AssertWithinPenultimateLevelOutputRange(
+  void TEST_AssertWithinProximalLevelOutputRange(
       const Slice& user_key, bool expect_failure = false) const;
 
   CompactionReason compaction_reason() const { return compaction_reason_; }
@@ -441,20 +441,20 @@ class Compaction {
 
   static constexpr int kInvalidLevel = -1;
 
-  // Evaluate penultimate output level. If the compaction supports
-  // per_key_placement feature, it returns the penultimate level number.
+  // Evaluate proximal output level. If the compaction supports
+  // per_key_placement feature, it returns the proximal level number.
   // Otherwise, it's set to kInvalidLevel (-1), which means
-  // output_to_penultimate_level is not supported.
-  // Note: even the penultimate level output is supported (PenultimateLevel !=
+  // output_to_proximal_level is not supported.
+  // Note: even the proximal level output is supported (ProximalLevel !=
   // kInvalidLevel), some key range maybe unsafe to be outputted to the
-  // penultimate level. The safe key range is populated by
-  // `PopulatePenultimateLevelOutputRange()`.
-  // Which could potentially disable all penultimate level output.
-  static int EvaluatePenultimateLevel(
-      const VersionStorageInfo* vstorage,
-      const MutableCFOptions& mutable_cf_options,
-      const ImmutableOptions& immutable_options, const int start_level,
-      const int output_level);
+  // proximal level. The safe key range is populated by
+  // `PopulateProximalLevelOutputRange()`.
+  // Which could potentially disable all proximal level output.
+  static int EvaluateProximalLevel(const VersionStorageInfo* vstorage,
+                                   const MutableCFOptions& mutable_cf_options,
+                                   const ImmutableOptions& immutable_options,
+                                   const int start_level,
+                                   const int output_level);
 
   // If some data cannot be safely migrated "up" the LSM tree due to a change
   // in the preclude_last_level_data_seconds setting, this indicates a sequence
@@ -482,10 +482,10 @@ class Compaction {
       InternalKey* smallest_key, InternalKey* largest_key,
       int exclude_level = -1);
 
-  // populate penultimate level output range, which will be used to determine if
-  // a key is safe to output to the penultimate level (details see
-  // `Compaction::WithinPenultimateLevelOutputRange()`.
-  void PopulatePenultimateLevelOutputRange();
+  // populate proximal level output range, which will be used to determine if
+  // a key is safe to output to the proximal level (details see
+  // `Compaction::WithinProximalLevelOutputRange()`.
+  void PopulateProximalLevelOutputRange();
 
   // If oldest snapshot is specified at Compaction construction time, we have
   // an opportunity to optimize inputs for compaction iterator for this case:
@@ -616,20 +616,20 @@ class Compaction {
 
   // only set when per_key_placement feature is enabled, -1 (kInvalidLevel)
   // means not supported.
-  const int penultimate_level_;
+  const int proximal_level_;
 
-  // Key range for penultimate level output
+  // Key range for proximal level output
   // includes timestamp if user-defined timestamp is enabled.
-  // penultimate_output_range_type_ shows the range type
-  InternalKey penultimate_level_smallest_;
-  InternalKey penultimate_level_largest_;
-  PenultimateOutputRangeType penultimate_output_range_type_ =
-      PenultimateOutputRangeType::kNotSupported;
+  // proximal_output_range_type_ shows the range type
+  InternalKey proximal_level_smallest_;
+  InternalKey proximal_level_largest_;
+  ProximalOutputRangeType proximal_output_range_type_ =
+      ProximalOutputRangeType::kNotSupported;
 };
 
 #ifndef NDEBUG
 // Helper struct only for tests, which contains the data to decide if a key
-// should be output to the penultimate level.
+// should be output to the proximal level.
 // TODO: remove this when the public feature knob is available
 struct PerKeyPlacementContext {
   const int level;
@@ -637,16 +637,16 @@ struct PerKeyPlacementContext {
   const Slice value;
   const SequenceNumber seq_num;
 
-  bool& output_to_penultimate_level;
+  bool& output_to_proximal_level;
 
   PerKeyPlacementContext(int _level, Slice _key, Slice _value,
                          SequenceNumber _seq_num,
-                         bool& _output_to_penultimate_level)
+                         bool& _output_to_proximal_level)
       : level(_level),
         key(_key),
         value(_value),
         seq_num(_seq_num),
-        output_to_penultimate_level(_output_to_penultimate_level) {}
+        output_to_proximal_level(_output_to_proximal_level) {}
 };
 #endif /* !NDEBUG */
 
diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc
index 3e8bbdb2c346..46939e988cd0 100644
--- a/db/compaction/compaction_job.cc
+++ b/db/compaction/compaction_job.cc
@@ -109,16 +109,16 @@ const char* GetCompactionReasonString(CompactionReason compaction_reason) {
   }
 }
 
-const char* GetCompactionPenultimateOutputRangeTypeString(
-    Compaction::PenultimateOutputRangeType range_type) {
+const char* GetCompactionProximalOutputRangeTypeString(
+    Compaction::ProximalOutputRangeType range_type) {
   switch (range_type) {
-    case Compaction::PenultimateOutputRangeType::kNotSupported:
+    case Compaction::ProximalOutputRangeType::kNotSupported:
       return "NotSupported";
-    case Compaction::PenultimateOutputRangeType::kFullRange:
+    case Compaction::ProximalOutputRangeType::kFullRange:
       return "FullRange";
-    case Compaction::PenultimateOutputRangeType::kNonLastRange:
+    case Compaction::ProximalOutputRangeType::kNonLastRange:
       return "NonLastRange";
-    case Compaction::PenultimateOutputRangeType::kDisabled:
+    case Compaction::ProximalOutputRangeType::kDisabled:
       return "Disabled";
     default:
       assert(false);
@@ -378,8 +378,8 @@ void CompactionJob::Prepare(
   }
   // Now combine what we would like to preclude from last level with what we
   // can safely support without dangerously moving data back up the LSM tree,
-  // to get the final seqno threshold for penultimate vs. last. In particular,
-  // when the reserved output key range for the penultimate level does not
+  // to get the final seqno threshold for proximal vs. last. In particular,
+  // when the reserved output key range for the proximal level does not
   // include the entire last level input key range, we need to keep entries
   // already in the last level there. (Even allowing within-range entries to
   // move back up could cause problems with range tombstones. Perhaps it
@@ -388,8 +388,8 @@ void CompactionJob::Prepare(
   // tracking and complexity to CompactionIterator that is probably not
   // worthwhile overall. Correctness is also more clear when splitting by
   // seqno threshold.)
-  penultimate_after_seqno_ = std::max(preclude_last_level_min_seqno,
-                                      c->GetKeepInLastLevelThroughSeqno());
+  proximal_after_seqno_ = std::max(preclude_last_level_min_seqno,
+                                   c->GetKeepInLastLevelThroughSeqno());
 
   options_file_number_ = versions_->options_file_number();
 }
@@ -993,16 +993,16 @@ Status CompactionJob::Install(bool* compaction_released) {
         blob_files.back()->GetBlobFileNumber());
   }
 
-  if (compaction_stats_.has_penultimate_level_output) {
-    ROCKS_LOG_BUFFER(
-        log_buffer_,
-        "[%s] has Penultimate Level output: %" PRIu64
-        ", level %d, number of files: %" PRIu64 ", number of records: %" PRIu64,
-        column_family_name.c_str(),
-        compaction_stats_.penultimate_level_stats.bytes_written,
-        compact_->compaction->GetPenultimateLevel(),
-        compaction_stats_.penultimate_level_stats.num_output_files,
-        compaction_stats_.penultimate_level_stats.num_output_records);
+  if (compaction_stats_.has_proximal_level_output) {
+    ROCKS_LOG_BUFFER(log_buffer_,
+                     "[%s] has Proximal Level output: %" PRIu64
+                     ", level %d, number of files: %" PRIu64
+                     ", number of records: %" PRIu64,
+                     column_family_name.c_str(),
+                     compaction_stats_.proximal_level_stats.bytes_written,
+                     compact_->compaction->GetProximalLevel(),
+                     compaction_stats_.proximal_level_stats.num_output_files,
+                     compaction_stats_.proximal_level_stats.num_output_records);
   }
 
   UpdateCompactionJobStats(stats);
@@ -1055,16 +1055,16 @@ Status CompactionJob::Install(bool* compaction_released) {
     stream << "blob_file_tail" << blob_files.back()->GetBlobFileNumber();
   }
 
-  if (compaction_stats_.has_penultimate_level_output) {
+  if (compaction_stats_.has_proximal_level_output) {
     InternalStats::CompactionStats& pl_stats =
-        compaction_stats_.penultimate_level_stats;
-    stream << "penultimate_level_num_output_files" << pl_stats.num_output_files;
-    stream << "penultimate_level_bytes_written" << pl_stats.bytes_written;
-    stream << "penultimate_level_num_output_records"
+        compaction_stats_.proximal_level_stats;
+    stream << "proximal_level_num_output_files" << pl_stats.num_output_files;
+    stream << "proximal_level_bytes_written" << pl_stats.bytes_written;
+    stream << "proximal_level_num_output_records"
            << pl_stats.num_output_records;
-    stream << "penultimate_level_num_output_files_blob"
+    stream << "proximal_level_num_output_files_blob"
            << pl_stats.num_output_files_blob;
-    stream << "penultimate_level_bytes_written_blob"
+    stream << "proximal_level_bytes_written_blob"
            << pl_stats.bytes_written_blob;
   }
 
@@ -1312,7 +1312,7 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
 
   std::vector<std::string> blob_file_paths;
 
-  // TODO: BlobDB to support output_to_penultimate_level compaction, which needs
+  // TODO: BlobDB to support output_to_proximal_level compaction, which needs
   //  2 builders, so may need to move to `CompactionOutputs`
   std::unique_ptr<BlobFileBuilder> blob_file_builder(
       (mutable_cf_options.enable_blob_files &&
@@ -1397,30 +1397,30 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
     }
 
     const auto& ikey = c_iter->ikey();
-    bool use_penultimate_output = ikey.sequence > penultimate_after_seqno_;
+    bool use_proximal_output = ikey.sequence > proximal_after_seqno_;
 #ifndef NDEBUG
     if (sub_compact->compaction->SupportsPerKeyPlacement()) {
       // Could be overridden by unittest
       PerKeyPlacementContext context(sub_compact->compaction->output_level(),
                                      ikey.user_key, c_iter->value(),
-                                     ikey.sequence, use_penultimate_output);
+                                     ikey.sequence, use_proximal_output);
       TEST_SYNC_POINT_CALLBACK("CompactionIterator::PrepareOutput.context",
                                &context);
-      if (use_penultimate_output) {
-        // Verify that entries sent to the penultimate level are within the
+      if (use_proximal_output) {
+        // Verify that entries sent to the proximal level are within the
         // allowed range (because the input key range of the last level could
-        // be larger than the allowed output key range of the penultimate
+        // be larger than the allowed output key range of the proximal
         // level). This check uses user keys (ignores sequence numbers) because
         // compaction boundaries are a "clean cut" between user keys (see
         // CompactionPicker::ExpandInputsToCleanCut()), which is especially
         // important when preferred sequence numbers has been swapped in for
         // kTypeValuePreferredSeqno / TimedPut.
-        sub_compact->compaction->TEST_AssertWithinPenultimateLevelOutputRange(
+        sub_compact->compaction->TEST_AssertWithinProximalLevelOutputRange(
             c_iter->user_key());
       }
     } else {
-      assert(penultimate_after_seqno_ == kMaxSequenceNumber);
-      assert(!use_penultimate_output);
+      assert(proximal_after_seqno_ == kMaxSequenceNumber);
+      assert(!use_proximal_output);
     }
 #endif  // NDEBUG
 
@@ -1429,7 +1429,7 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
     // and `close_file_func`.
     // TODO: it would be better to have the compaction file open/close moved
     // into `CompactionOutputs` which has the output file information.
-    status = sub_compact->AddToOutput(*c_iter, use_penultimate_output,
+    status = sub_compact->AddToOutput(*c_iter, use_proximal_output,
                                       open_file_func, close_file_func);
     if (!status.ok()) {
       break;
@@ -1641,15 +1641,15 @@ Status CompactionJob::FinishCompactionOutputFile(
     std::pair<SequenceNumber, SequenceNumber> keep_seqno_range{
         0, kMaxSequenceNumber};
     if (sub_compact->compaction->SupportsPerKeyPlacement()) {
-      if (outputs.IsPenultimateLevel()) {
-        keep_seqno_range.first = penultimate_after_seqno_;
+      if (outputs.IsProximalLevel()) {
+        keep_seqno_range.first = proximal_after_seqno_;
       } else {
-        keep_seqno_range.second = penultimate_after_seqno_;
+        keep_seqno_range.second = proximal_after_seqno_;
       }
     }
     CompactionIterationStats range_del_out_stats;
     // NOTE1: Use `bottommost_level_ = true` for both bottommost and
-    // output_to_penultimate_level compaction here, as it's only used to decide
+    // output_to_proximal_level compaction here, as it's only used to decide
     // if range dels could be dropped. (Logically, we are taking a single sorted
     // run returned from CompactionIterator and physically splitting it between
     // two output levels.)
@@ -1812,14 +1812,14 @@ Status CompactionJob::InstallCompactionResults(bool* compaction_released) {
 
   {
     Compaction::InputLevelSummaryBuffer inputs_summary;
-    if (compaction_stats_.has_penultimate_level_output) {
+    if (compaction_stats_.has_proximal_level_output) {
       ROCKS_LOG_BUFFER(
           log_buffer_,
-          "[%s] [JOB %d] Compacted %s => output_to_penultimate_level: %" PRIu64
+          "[%s] [JOB %d] Compacted %s => output_to_proximal_level: %" PRIu64
           " bytes + last: %" PRIu64 " bytes. Total: %" PRIu64 " bytes",
           compaction->column_family_data()->GetName().c_str(), job_id_,
           compaction->InputLevelSummary(&inputs_summary),
-          compaction_stats_.penultimate_level_stats.bytes_written,
+          compaction_stats_.proximal_level_stats.bytes_written,
           compaction_stats_.stats.bytes_written,
           compaction_stats_.TotalBytesWritten());
     } else {
@@ -1946,8 +1946,7 @@ Status CompactionJob::OpenCompactionOutputFile(SubcompactionState* sub_compact,
   // Here last_level_temperature supersedes default_write_temperature, when
   // enabled and applicable
   if (last_level_temp != Temperature::kUnknown &&
-      sub_compact->compaction->is_last_level() &&
-      !outputs.IsPenultimateLevel()) {
+      sub_compact->compaction->is_last_level() && !outputs.IsProximalLevel()) {
     temperature = last_level_temp;
   }
   fo_copy.temperature = temperature;
@@ -2061,7 +2060,7 @@ Status CompactionJob::OpenCompactionOutputFile(SubcompactionState* sub_compact,
       bottommost_level_, TableFileCreationReason::kCompaction,
       0 /* oldest_key_time */, current_time, db_id_, db_session_id_,
       sub_compact->compaction->max_output_file_size(), file_number,
-      penultimate_after_seqno_ /*last_level_inclusive_max_seqno_threshold*/);
+      proximal_after_seqno_ /*last_level_inclusive_max_seqno_threshold*/);
 
   outputs.NewBuilder(tboptions);
 
@@ -2232,19 +2231,19 @@ void CompactionJob::LogCompaction() {
                    ? int64_t{-1}  // Use -1 for "none"
                    : static_cast<int64_t>(existing_snapshots_[0]));
     if (compaction->SupportsPerKeyPlacement()) {
-      stream << "prenultimate_after_seqno" << penultimate_after_seqno_;
+      stream << "proximal_after_seqno" << proximal_after_seqno_;
       stream << "preserve_seqno_after" << preserve_seqno_after_;
-      stream << "penultimate_output_level" << compaction->GetPenultimateLevel();
-      stream << "penultimate_output_range"
-             << GetCompactionPenultimateOutputRangeTypeString(
-                    compaction->GetPenultimateOutputRangeType());
+      stream << "proximal_output_level" << compaction->GetProximalLevel();
+      stream << "proximal_output_range"
+             << GetCompactionProximalOutputRangeTypeString(
+                    compaction->GetProximalOutputRangeType());
 
-      if (compaction->GetPenultimateOutputRangeType() ==
-          Compaction::PenultimateOutputRangeType::kDisabled) {
+      if (compaction->GetProximalOutputRangeType() ==
+          Compaction::ProximalOutputRangeType::kDisabled) {
         ROCKS_LOG_WARN(
             db_options_.info_log,
-            "[%s] [JOB %d] Penultimate level output is disabled, likely "
-            "because of the range conflict in the penultimate level",
+            "[%s] [JOB %d] Proximal level output is disabled, likely "
+            "because of the range conflict in the proximal level",
             cfd->GetName().c_str(), job_id_);
       }
     }
diff --git a/db/compaction/compaction_job.h b/db/compaction/compaction_job.h
index ea4839f21580..e990124d9e98 100644
--- a/db/compaction/compaction_job.h
+++ b/db/compaction/compaction_job.h
@@ -104,9 +104,9 @@ class SubcompactionState;
 //    logging and public metrics.
 //    Internally, it's an aggregation of stats_ from each `SubcompactionState`.
 //    It has 2 parts, normal stats about the main compaction information and
-//    the penultimate level output stats.
-//    `SubcompactionState` maintains the CompactionOutputs for normal output and
-//    the penultimate level output if exists, the per_level stats is
+//    the proximal level output stats.
+//    `SubcompactionState` maintains the CompactionOutputs for ordinary level
+//    output and the proximal level output if exists, the per_level stats is
 //    stored with the outputs.
 //                                                +---------------------------+
 //                                                | SubcompactionState        |
@@ -119,7 +119,7 @@ class SubcompactionState;
 //                                            |   |                           |
 //                                            |   | +----------------------+  |
 // +--------------------------------+         |   | | CompactionOutputs    |  |
-// | CompactionJob                  |         |   | | (penultimate_level)  |  |
+// | CompactionJob                  |         |   | | (proximal_level)     |  |
 // |                                |    +--------->|   stats_             |  |
 // |   compaction_stats_            |    |    |   | +----------------------+  |
 // |    +-------------------------+ |    |    |   |                           |
@@ -127,7 +127,7 @@ class SubcompactionState;
 // |    +-------------------------+ |    |    |
 // |                                |    |    |
 // |    +-------------------------+ |    |    |   +---------------------------+
-// |    |penultimate_level_stats  +------+    |   | SubcompactionState        |
+// |    |proximal_level_stats     |------+    |   | SubcompactionState        |
 // |    +-------------------------+ |    |    |   |                           |
 // |                                |    |    |   | +----------------------+  |
 // |                                |    |    |   | | CompactionOutputs    |  |
@@ -137,7 +137,7 @@ class SubcompactionState;
 //                                       |        |                           |
 //                                       |        | +----------------------+  |
 //                                       |        | | CompactionOutputs    |  |
-//                                       |        | | (penultimate_level)  |  |
+//                                       |        | | (proximal_level)     |  |
 //                                       +--------->|   stats_             |  |
 //                                                | +----------------------+  |
 //                                                |                           |
@@ -363,8 +363,8 @@ class CompactionJob {
 
   // Minimal sequence number to preclude the data from the last level. If the
   // key has bigger (newer) sequence number than this, it will be precluded from
-  // the last level (output to penultimate level).
-  SequenceNumber penultimate_after_seqno_ = kMaxSequenceNumber;
+  // the last level (output to proximal level).
+  SequenceNumber proximal_after_seqno_ = kMaxSequenceNumber;
 
   // Options File Number used for Remote Compaction
   // Setting this requires DBMutex.
@@ -431,8 +431,7 @@ struct CompactionServiceOutputFile {
   bool marked_for_compaction;
   UniqueId64x2 unique_id{};
   TableProperties table_properties;
-  // TODO: clean up the rest of the "penultimate" naming in the codebase
-  bool is_proximal_level_output;  // == is_penultimate_level_output
+  bool is_proximal_level_output;
   Temperature file_temperature;
 
   CompactionServiceOutputFile() = default;
diff --git a/db/compaction/compaction_job_test.cc b/db/compaction/compaction_job_test.cc
index 2eb354f89e3c..b7afc07b996c 100644
--- a/db/compaction/compaction_job_test.cc
+++ b/db/compaction/compaction_job_test.cc
@@ -1474,7 +1474,7 @@ TEST_F(CompactionJobTest, OldestBlobFileNumber) {
                 /* expected_oldest_blob_file_numbers */ {19});
 }
 
-TEST_F(CompactionJobTest, VerifyPenultimateLevelOutput) {
+TEST_F(CompactionJobTest, VerifyProximalLevelOutput) {
   cf_options_.last_level_temperature = Temperature::kCold;
   SyncPoint::GetInstance()->SetCallBack(
       "Compaction::SupportsPerKeyPlacement:Enabled", [&](void* arg) {
@@ -1487,8 +1487,7 @@ TEST_F(CompactionJobTest, VerifyPenultimateLevelOutput) {
   SyncPoint::GetInstance()->SetCallBack(
       "CompactionIterator::PrepareOutput.context", [&](void* arg) {
         auto context = static_cast<PerKeyPlacementContext*>(arg);
-        context->output_to_penultimate_level =
-            context->seq_num > latest_cold_seq;
+        context->output_to_proximal_level = context->seq_num > latest_cold_seq;
       });
   SyncPoint::GetInstance()->EnableProcessing();
 
@@ -1534,11 +1533,11 @@ TEST_F(CompactionJobTest, VerifyPenultimateLevelOutput) {
       /*verify_func=*/[&](Compaction& comp) {
         for (char c = 'a'; c <= 'z'; c++) {
           if (c == 'a') {
-            comp.TEST_AssertWithinPenultimateLevelOutputRange(
+            comp.TEST_AssertWithinProximalLevelOutputRange(
                 "a", true /*expect_failure*/);
           } else {
             std::string c_str{c};
-            comp.TEST_AssertWithinPenultimateLevelOutputRange(c_str);
+            comp.TEST_AssertWithinProximalLevelOutputRange(c_str);
           }
         }
       });
diff --git a/db/compaction/compaction_outputs.cc b/db/compaction/compaction_outputs.cc
index 3e1c4402cea3..d3a0c711ac67 100644
--- a/db/compaction/compaction_outputs.cc
+++ b/db/compaction/compaction_outputs.cc
@@ -792,8 +792,8 @@ void CompactionOutputs::FillFilesToCutForTtl() {
 }
 
 CompactionOutputs::CompactionOutputs(const Compaction* compaction,
-                                     const bool is_penultimate_level)
-    : compaction_(compaction), is_penultimate_level_(is_penultimate_level) {
+                                     const bool is_proximal_level)
+    : compaction_(compaction), is_proximal_level_(is_proximal_level) {
   partitioner_ = compaction->output_level() == 0
                      ? nullptr
                      : compaction->CreateSstPartitioner();
diff --git a/db/compaction/compaction_outputs.h b/db/compaction/compaction_outputs.h
index 6b88eb452a08..a95bdaaa7ab0 100644
--- a/db/compaction/compaction_outputs.h
+++ b/db/compaction/compaction_outputs.h
@@ -31,31 +31,31 @@ class CompactionOutputs {
   struct Output {
     Output(FileMetaData&& _meta, const InternalKeyComparator& _icmp,
            bool _enable_hash, bool _finished, uint64_t precalculated_hash,
-           bool _is_penultimate_level)
+           bool _is_proximal_level)
         : meta(std::move(_meta)),
           validator(_icmp, _enable_hash, precalculated_hash),
           finished(_finished),
-          is_penultimate_level(_is_penultimate_level) {}
+          is_proximal_level(_is_proximal_level) {}
     FileMetaData meta;
     OutputValidator validator;
     bool finished;
-    bool is_penultimate_level;
+    bool is_proximal_level;
     std::shared_ptr<const TableProperties> table_properties;
   };
 
   CompactionOutputs() = delete;
 
   explicit CompactionOutputs(const Compaction* compaction,
-                             const bool is_penultimate_level);
+                             const bool is_proximal_level);
 
-  bool IsPenultimateLevel() const { return is_penultimate_level_; }
+  bool IsProximalLevel() const { return is_proximal_level_; }
 
   // Add generated output to the list
   void AddOutput(FileMetaData&& meta, const InternalKeyComparator& icmp,
                  bool enable_hash, bool finished = false,
                  uint64_t precalculated_hash = 0) {
     outputs_.emplace_back(std::move(meta), icmp, enable_hash, finished,
-                          precalculated_hash, is_penultimate_level_);
+                          precalculated_hash, is_proximal_level_);
   }
 
   // Set new table builder for the current output
@@ -73,27 +73,27 @@ class CompactionOutputs {
 
   // TODO: Move the BlobDB builder into CompactionOutputs
   const std::vector<BlobFileAddition>& GetBlobFileAdditions() const {
-    if (is_penultimate_level_) {
+    if (is_proximal_level_) {
       assert(blob_file_additions_.empty());
     }
     return blob_file_additions_;
   }
 
   std::vector<BlobFileAddition>* GetBlobFileAdditionsPtr() {
-    assert(!is_penultimate_level_);
+    assert(!is_proximal_level_);
     return &blob_file_additions_;
   }
 
   bool HasBlobFileAdditions() const { return !blob_file_additions_.empty(); }
 
   BlobGarbageMeter* CreateBlobGarbageMeter() {
-    assert(!is_penultimate_level_);
+    assert(!is_proximal_level_);
     blob_garbage_meter_ = std::make_unique<BlobGarbageMeter>();
     return blob_garbage_meter_.get();
   }
 
   BlobGarbageMeter* GetBlobGarbageMeter() const {
-    if (is_penultimate_level_) {
+    if (is_proximal_level_) {
       // blobdb doesn't support per_key_placement yet
       assert(blob_garbage_meter_ == nullptr);
       return nullptr;
@@ -102,7 +102,7 @@ class CompactionOutputs {
   }
 
   void UpdateBlobStats() {
-    assert(!is_penultimate_level_);
+    assert(!is_proximal_level_);
     stats_.num_output_files_blob = blob_file_additions_.size();
     for (const auto& blob : blob_file_additions_) {
       stats_.bytes_written_blob += blob.GetTotalBlobBytes();
@@ -310,9 +310,9 @@ class CompactionOutputs {
   // Basic compaction output stats for this level's outputs
   InternalStats::CompactionOutputsStats stats_;
 
-  // indicate if this CompactionOutputs obj for penultimate_level, should always
+  // indicate if this CompactionOutputs obj for proximal_level, should always
   // be false if per_key_placement feature is not enabled.
-  const bool is_penultimate_level_;
+  const bool is_proximal_level_;
 
   // partitioner information
   std::string last_key_for_partitioner_;
@@ -366,7 +366,7 @@ class CompactionOutputs {
   std::vector<size_t> level_ptrs_;
 };
 
-// helper struct to concatenate the last level and penultimate level outputs
+// helper struct to concatenate the last level and proximal level outputs
 // which could be replaced by std::ranges::join_view() in c++20
 struct OutputIterator {
  public:
diff --git a/db/compaction/compaction_picker.cc b/db/compaction/compaction_picker.cc
index 946dab5ddefe..f65556d38de6 100644
--- a/db/compaction/compaction_picker.cc
+++ b/db/compaction/compaction_picker.cc
@@ -272,8 +272,8 @@ bool CompactionPicker::RangeOverlapWithCompaction(
       return true;
     }
     if (c->SupportsPerKeyPlacement()) {
-      if (c->OverlapPenultimateLevelOutputRange(smallest_user_key,
-                                                largest_user_key)) {
+      if (c->OverlapProximalLevelOutputRange(smallest_user_key,
+                                             largest_user_key)) {
         return true;
       }
     }
@@ -284,7 +284,7 @@ bool CompactionPicker::RangeOverlapWithCompaction(
 
 bool CompactionPicker::FilesRangeOverlapWithCompaction(
     const std::vector<CompactionInputFiles>& inputs, int level,
-    int penultimate_level) const {
+    int proximal_level) const {
   bool is_empty = true;
   for (auto& in : inputs) {
     if (!in.empty()) {
@@ -301,18 +301,18 @@ bool CompactionPicker::FilesRangeOverlapWithCompaction(
   //  files cannot be overlapped in the order of L0 files.
   InternalKey smallest, largest;
   GetRange(inputs, &smallest, &largest, Compaction::kInvalidLevel);
-  if (penultimate_level != Compaction::kInvalidLevel) {
+  if (proximal_level != Compaction::kInvalidLevel) {
     if (ioptions_.compaction_style == kCompactionStyleUniversal) {
       if (RangeOverlapWithCompaction(smallest.user_key(), largest.user_key(),
-                                     penultimate_level)) {
+                                     proximal_level)) {
         return true;
       }
     } else {
-      InternalKey penultimate_smallest, penultimate_largest;
-      GetRange(inputs, &penultimate_smallest, &penultimate_largest, level);
-      if (RangeOverlapWithCompaction(penultimate_smallest.user_key(),
-                                     penultimate_largest.user_key(),
-                                     penultimate_level)) {
+      InternalKey proximal_smallest, proximal_largest;
+      GetRange(inputs, &proximal_smallest, &proximal_largest, level);
+      if (RangeOverlapWithCompaction(proximal_smallest.user_key(),
+                                     proximal_largest.user_key(),
+                                     proximal_level)) {
         return true;
       }
     }
@@ -353,7 +353,7 @@ Compaction* CompactionPicker::CompactFiles(
   }
   assert(output_level == 0 || !FilesRangeOverlapWithCompaction(
                                   input_files, output_level,
-                                  Compaction::EvaluatePenultimateLevel(
+                                  Compaction::EvaluateProximalLevel(
                                       vstorage, mutable_cf_options, ioptions_,
                                       start_level, output_level)));
 #endif /* !NDEBUG */
@@ -659,9 +659,9 @@ Compaction* CompactionPicker::CompactRange(
     // overlaping outputs in the same level.
     if (FilesRangeOverlapWithCompaction(
             inputs, output_level,
-            Compaction::EvaluatePenultimateLevel(vstorage, mutable_cf_options,
-                                                 ioptions_, start_level,
-                                                 output_level))) {
+            Compaction::EvaluateProximalLevel(vstorage, mutable_cf_options,
+                                              ioptions_, start_level,
+                                              output_level))) {
       // This compaction output could potentially conflict with the output
       // of a currently running compaction, we cannot run it.
       *manual_conflict = true;
@@ -848,9 +848,9 @@ Compaction* CompactionPicker::CompactRange(
   // overlaping outputs in the same level.
   if (FilesRangeOverlapWithCompaction(
           compaction_inputs, output_level,
-          Compaction::EvaluatePenultimateLevel(vstorage, mutable_cf_options,
-                                               ioptions_, input_level,
-                                               output_level))) {
+          Compaction::EvaluateProximalLevel(vstorage, mutable_cf_options,
+                                            ioptions_, input_level,
+                                            output_level))) {
     // This compaction output could potentially conflict with the output
     // of a currently running compaction, we cannot run it.
     *manual_conflict = true;
@@ -1137,7 +1137,7 @@ Status CompactionPicker::SanitizeAndConvertCompactionInputFiles(
   if (output_level != 0 &&
       FilesRangeOverlapWithCompaction(
           *converted_input_files, output_level,
-          Compaction::EvaluatePenultimateLevel(
+          Compaction::EvaluateProximalLevel(
               version->storage_info(), version->GetMutableCFOptions(),
               ioptions_, (*converted_input_files)[0].level, output_level))) {
     return Status::Aborted(
@@ -1154,7 +1154,7 @@ void CompactionPicker::RegisterCompaction(Compaction* c) {
   assert(ioptions_.compaction_style != kCompactionStyleLevel ||
          c->output_level() == 0 ||
          !FilesRangeOverlapWithCompaction(*c->inputs(), c->output_level(),
-                                          c->GetPenultimateLevel()));
+                                          c->GetProximalLevel()));
   // CompactionReason::kExternalSstIngestion's start level is just a placeholder
   // number without actual meaning as file ingestion technically does not have
   // an input level like other compactions
diff --git a/db/compaction/compaction_picker.h b/db/compaction/compaction_picker.h
index 6285e054301e..9d23555ec596 100644
--- a/db/compaction/compaction_picker.h
+++ b/db/compaction/compaction_picker.h
@@ -190,7 +190,7 @@ class CompactionPicker {
   // key range of a currently running compaction.
   bool FilesRangeOverlapWithCompaction(
       const std::vector<CompactionInputFiles>& inputs, int level,
-      int penultimate_level) const;
+      int proximal_level) const;
 
   bool SetupOtherInputs(const std::string& cf_name,
                         const MutableCFOptions& mutable_cf_options,
diff --git a/db/compaction/compaction_picker_level.cc b/db/compaction/compaction_picker_level.cc
index 612c1e5af21a..b4a122954bf4 100644
--- a/db/compaction/compaction_picker_level.cc
+++ b/db/compaction/compaction_picker_level.cc
@@ -414,9 +414,9 @@ void LevelCompactionBuilder::SetupOtherFilesWithRoundRobinExpansion() {
                                                     &tmp_start_level_inputs) ||
         compaction_picker_->FilesRangeOverlapWithCompaction(
             {tmp_start_level_inputs}, output_level_,
-            Compaction::EvaluatePenultimateLevel(vstorage_, mutable_cf_options_,
-                                                 ioptions_, start_level_,
-                                                 output_level_))) {
+            Compaction::EvaluateProximalLevel(vstorage_, mutable_cf_options_,
+                                              ioptions_, start_level_,
+                                              output_level_))) {
       // Constraint 1a
       tmp_start_level_inputs.clear();
       return;
@@ -490,9 +490,9 @@ bool LevelCompactionBuilder::SetupOtherInputsIfNeeded() {
     // We need to disallow this from happening.
     if (compaction_picker_->FilesRangeOverlapWithCompaction(
             compaction_inputs_, output_level_,
-            Compaction::EvaluatePenultimateLevel(vstorage_, mutable_cf_options_,
-                                                 ioptions_, start_level_,
-                                                 output_level_))) {
+            Compaction::EvaluateProximalLevel(vstorage_, mutable_cf_options_,
+                                              ioptions_, start_level_,
+                                              output_level_))) {
       // This compaction output could potentially conflict with the output
       // of a currently running compaction, we cannot run it.
       return false;
@@ -846,9 +846,9 @@ bool LevelCompactionBuilder::PickFileToCompact() {
                                                     &start_level_inputs_) ||
         compaction_picker_->FilesRangeOverlapWithCompaction(
             {start_level_inputs_}, output_level_,
-            Compaction::EvaluatePenultimateLevel(vstorage_, mutable_cf_options_,
-                                                 ioptions_, start_level_,
-                                                 output_level_))) {
+            Compaction::EvaluateProximalLevel(vstorage_, mutable_cf_options_,
+                                              ioptions_, start_level_,
+                                              output_level_))) {
       // A locked (pending compaction) input-level file was pulled in due to
       // user-key overlap.
       start_level_inputs_.clear();
diff --git a/db/compaction/compaction_picker_test.cc b/db/compaction/compaction_picker_test.cc
index f48195e29a0b..35193db57eed 100644
--- a/db/compaction/compaction_picker_test.cc
+++ b/db/compaction/compaction_picker_test.cc
@@ -3677,7 +3677,7 @@ TEST_F(CompactionPickerTest, UniversalSizeRatioTierCompactionLastLevel) {
   const uint64_t kFileSize = 100000;
   const int kNumLevels = 7;
   const int kLastLevel = kNumLevels - 1;
-  const int kPenultimateLevel = kLastLevel - 1;
+  const int kProximalLevel = kLastLevel - 1;
 
   ioptions_.compaction_style = kCompactionStyleUniversal;
   mutable_cf_options_.preclude_last_level_data_seconds = 1000;
@@ -3702,14 +3702,14 @@ TEST_F(CompactionPickerTest, UniversalSizeRatioTierCompactionLastLevel) {
   // Here to make sure it's size ratio compaction instead of size amp
   ASSERT_EQ(compaction->compaction_reason(),
             CompactionReason::kUniversalSizeRatio);
-  ASSERT_EQ(compaction->output_level(), kPenultimateLevel - 1);
+  ASSERT_EQ(compaction->output_level(), kProximalLevel - 1);
   ASSERT_EQ(compaction->input_levels(0)->num_files, 2);
   ASSERT_EQ(compaction->input_levels(5)->num_files, 0);
   ASSERT_EQ(compaction->input_levels(6)->num_files, 0);
 }
 
 TEST_F(CompactionPickerTest, UniversalSizeAmpTierCompactionNotSuport) {
-  // Tiered compaction only support level_num > 2 (otherwise the penultimate
+  // Tiered compaction only support level_num > 2 (otherwise the proximal
   // level is going to be level 0, which may make thing more complicated), so
   // when there's only 2 level, still treating level 1 as the last level for
   // size amp compaction
@@ -3753,7 +3753,7 @@ TEST_F(CompactionPickerTest, UniversalSizeAmpTierCompactionLastLevel) {
   const uint64_t kFileSize = 100000;
   const int kNumLevels = 7;
   const int kLastLevel = kNumLevels - 1;
-  const int kPenultimateLevel = kLastLevel - 1;
+  const int kProximalLevel = kLastLevel - 1;
 
   ioptions_.compaction_style = kCompactionStyleUniversal;
   mutable_cf_options_.preclude_last_level_data_seconds = 1000;
@@ -3775,10 +3775,10 @@ TEST_F(CompactionPickerTest, UniversalSizeAmpTierCompactionLastLevel) {
           vstorage_.get(), &log_buffer_));
 
   // It's a Size Amp compaction, but doesn't include the last level file and
-  // output to the penultimate level.
+  // output to the proximal level.
   ASSERT_EQ(compaction->compaction_reason(),
             CompactionReason::kUniversalSizeAmplification);
-  ASSERT_EQ(compaction->output_level(), kPenultimateLevel);
+  ASSERT_EQ(compaction->output_level(), kProximalLevel);
   ASSERT_EQ(compaction->input_levels(0)->num_files, 2);
   ASSERT_EQ(compaction->input_levels(5)->num_files, 1);
   ASSERT_EQ(compaction->input_levels(6)->num_files, 0);
@@ -3940,7 +3940,7 @@ TEST_P(PerKeyPlacementCompactionPickerTest, OverlapWithNormalCompaction) {
   ASSERT_EQ(enable_per_key_placement_,
             level_compaction_picker.FilesRangeOverlapWithCompaction(
                 input_files, 6,
-                Compaction::EvaluatePenultimateLevel(
+                Compaction::EvaluateProximalLevel(
                     vstorage_.get(), mutable_cf_options_, ioptions_, 0, 6)));
 }
 
@@ -4028,7 +4028,7 @@ TEST_P(PerKeyPlacementCompactionPickerTest,
   ASSERT_EQ(enable_per_key_placement_,
             universal_compaction_picker.FilesRangeOverlapWithCompaction(
                 input_files, 6,
-                Compaction::EvaluatePenultimateLevel(
+                Compaction::EvaluateProximalLevel(
                     vstorage_.get(), mutable_cf_options_, ioptions_, 0, 6)));
 }
 
@@ -4076,9 +4076,9 @@ TEST_P(PerKeyPlacementCompactionPickerTest, NormalCompactionOverlapUniversal) {
                 input_files, 5, Compaction::kInvalidLevel));
 }
 
-TEST_P(PerKeyPlacementCompactionPickerTest, PenultimateOverlapUniversal) {
+TEST_P(PerKeyPlacementCompactionPickerTest, ProximalOverlapUniversal) {
   // This test is make sure the Tiered compaction would lock whole range of
-  // both output level and penultimate level
+  // both output level and proximal level
   if (enable_per_key_placement_) {
     mutable_cf_options_.preclude_last_level_data_seconds = 10000;
   }
@@ -4098,7 +4098,7 @@ TEST_P(PerKeyPlacementCompactionPickerTest, PenultimateOverlapUniversal) {
   UpdateVersionStorageInfo();
 
   // the existing compaction is the 1st L4 file + L6 file
-  // then compaction of the 2nd L4 file to L5 (penultimate level) is overlapped
+  // then compaction of the 2nd L4 file to L5 (proximal level) is overlapped
   // when the tiered compaction feature is on.
   CompactionOptions comp_options;
   std::unordered_set<uint64_t> input_set;
@@ -4187,9 +4187,9 @@ TEST_P(PerKeyPlacementCompactionPickerTest, LastLevelOnlyOverlapUniversal) {
 }
 
 TEST_P(PerKeyPlacementCompactionPickerTest,
-       LastLevelOnlyFailPenultimateUniversal) {
+       LastLevelOnlyFailProximalUniversal) {
   // This is to test last_level only compaction still unable to do the
-  // penultimate level compaction if there's already a file in the penultimate
+  // proximal level compaction if there's already a file in the proximal
   // level.
   // This should rarely happen in universal compaction, as the non-empty L5
   // should be included in the compaction.
@@ -4222,9 +4222,9 @@ TEST_P(PerKeyPlacementCompactionPickerTest,
       mutable_db_options_, 0));
 
   ASSERT_TRUE(comp1);
-  ASSERT_EQ(comp1->GetPenultimateLevel(), Compaction::kInvalidLevel);
+  ASSERT_EQ(comp1->GetProximalLevel(), Compaction::kInvalidLevel);
 
-  // As comp1 cannot be output to the penultimate level, compacting file 40 to
+  // As comp1 cannot be output to the proximal level, compacting file 40 to
   // L5 is always safe.
   input_set.clear();
   input_files.clear();
@@ -4239,14 +4239,14 @@ TEST_P(PerKeyPlacementCompactionPickerTest,
       comp_options, input_files, 5, vstorage_.get(), mutable_cf_options_,
       mutable_db_options_, 0));
   ASSERT_TRUE(comp2);
-  ASSERT_EQ(Compaction::kInvalidLevel, comp2->GetPenultimateLevel());
+  ASSERT_EQ(Compaction::kInvalidLevel, comp2->GetProximalLevel());
 }
 
 TEST_P(PerKeyPlacementCompactionPickerTest,
        LastLevelOnlyConflictWithOngoingUniversal) {
   // This is to test last_level only compaction still unable to do the
-  // penultimate level compaction if there's already an ongoing compaction to
-  // the penultimate level
+  // proximal level compaction if there's already an ongoing compaction to
+  // the proximal level
   if (enable_per_key_placement_) {
     mutable_cf_options_.preclude_last_level_data_seconds = 10000;
   }
@@ -4265,7 +4265,7 @@ TEST_P(PerKeyPlacementCompactionPickerTest,
   Add(6, 60U, "101", "351", 60000000U);
   UpdateVersionStorageInfo();
 
-  // create an ongoing compaction to L5 (penultimate level)
+  // create an ongoing compaction to L5 (proximal level)
   CompactionOptions comp_options;
   std::unordered_set<uint64_t> input_set;
   input_set.insert(40);
@@ -4278,7 +4278,7 @@ TEST_P(PerKeyPlacementCompactionPickerTest,
       mutable_db_options_, 0));
 
   ASSERT_TRUE(comp1);
-  ASSERT_EQ(comp1->GetPenultimateLevel(), Compaction::kInvalidLevel);
+  ASSERT_EQ(comp1->GetProximalLevel(), Compaction::kInvalidLevel);
 
   input_set.clear();
   input_files.clear();
@@ -4289,7 +4289,7 @@ TEST_P(PerKeyPlacementCompactionPickerTest,
   ASSERT_EQ(enable_per_key_placement_,
             universal_compaction_picker.FilesRangeOverlapWithCompaction(
                 input_files, 6,
-                Compaction::EvaluatePenultimateLevel(
+                Compaction::EvaluateProximalLevel(
                     vstorage_.get(), mutable_cf_options_, ioptions_, 6, 6)));
 
   if (!enable_per_key_placement_) {
@@ -4297,7 +4297,7 @@ TEST_P(PerKeyPlacementCompactionPickerTest,
         comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_,
         mutable_db_options_, 0));
     ASSERT_TRUE(comp2);
-    ASSERT_EQ(Compaction::kInvalidLevel, comp2->GetPenultimateLevel());
+    ASSERT_EQ(Compaction::kInvalidLevel, comp2->GetProximalLevel());
   }
 }
 
@@ -4306,7 +4306,7 @@ TEST_P(PerKeyPlacementCompactionPickerTest,
   // This is similar to `LastLevelOnlyConflictWithOngoingUniversal`, the only
   // change is the ongoing compaction to L5 has no overlap with the last level
   // compaction, so it's safe to move data from the last level to the
-  // penultimate level.
+  // proximal level.
   if (enable_per_key_placement_) {
     mutable_cf_options_.preclude_last_level_data_seconds = 10000;
   }
@@ -4325,7 +4325,7 @@ TEST_P(PerKeyPlacementCompactionPickerTest,
   Add(6, 60U, "101", "351", 60000000U);
   UpdateVersionStorageInfo();
 
-  // create an ongoing compaction to L5 (penultimate level)
+  // create an ongoing compaction to L5 (proximal level)
   CompactionOptions comp_options;
   std::unordered_set<uint64_t> input_set;
   input_set.insert(42);
@@ -4338,7 +4338,7 @@ TEST_P(PerKeyPlacementCompactionPickerTest,
       mutable_db_options_, 0));
 
   ASSERT_TRUE(comp1);
-  ASSERT_EQ(comp1->GetPenultimateLevel(), Compaction::kInvalidLevel);
+  ASSERT_EQ(comp1->GetProximalLevel(), Compaction::kInvalidLevel);
 
   input_set.clear();
   input_files.clear();
@@ -4349,8 +4349,8 @@ TEST_P(PerKeyPlacementCompactionPickerTest,
   // always safe to move data up
   ASSERT_FALSE(universal_compaction_picker.FilesRangeOverlapWithCompaction(
       input_files, 6,
-      Compaction::EvaluatePenultimateLevel(vstorage_.get(), mutable_cf_options_,
-                                           ioptions_, 6, 6)));
+      Compaction::EvaluateProximalLevel(vstorage_.get(), mutable_cf_options_,
+                                        ioptions_, 6, 6)));
 
   // 2 compactions can be run in parallel
   std::unique_ptr<Compaction> comp2(universal_compaction_picker.CompactFiles(
@@ -4358,9 +4358,9 @@ TEST_P(PerKeyPlacementCompactionPickerTest,
       mutable_db_options_, 0));
   ASSERT_TRUE(comp2);
   if (enable_per_key_placement_) {
-    ASSERT_NE(Compaction::kInvalidLevel, comp2->GetPenultimateLevel());
+    ASSERT_NE(Compaction::kInvalidLevel, comp2->GetProximalLevel());
   } else {
-    ASSERT_EQ(Compaction::kInvalidLevel, comp2->GetPenultimateLevel());
+    ASSERT_EQ(Compaction::kInvalidLevel, comp2->GetProximalLevel());
   }
 }
 
diff --git a/db/compaction/compaction_picker_universal.cc b/db/compaction/compaction_picker_universal.cc
index 427abb9eabc7..f2bc740028ee 100644
--- a/db/compaction/compaction_picker_universal.cc
+++ b/db/compaction/compaction_picker_universal.cc
@@ -288,7 +288,9 @@ class UniversalCompactionBuilder {
 // and the index of the file in that level
 
 struct InputFileInfo {
-  InputFileInfo() : f(nullptr), level(0), index(0) {}
+  InputFileInfo() : InputFileInfo(nullptr, 0, 0) {}
+  InputFileInfo(FileMetaData* file_meta, size_t l, size_t i)
+      : f(file_meta), level(l), index(i) {}
 
   FileMetaData* f;
   size_t level;
@@ -321,22 +323,14 @@ SmallestKeyHeap create_level_heap(Compaction* c, const Comparator* ucmp) {
   SmallestKeyHeap smallest_key_priority_q =
       SmallestKeyHeap(SmallestKeyHeapComparator(ucmp));
 
-  InputFileInfo input_file;
-
   for (size_t l = 0; l < c->num_input_levels(); l++) {
     if (c->num_input_files(l) != 0) {
       if (l == 0 && c->start_level() == 0) {
         for (size_t i = 0; i < c->num_input_files(0); i++) {
-          input_file.f = c->input(0, i);
-          input_file.level = 0;
-          input_file.index = i;
-          smallest_key_priority_q.push(std::move(input_file));
+          smallest_key_priority_q.emplace(c->input(0, i), 0, i);
         }
       } else {
-        input_file.f = c->input(l, 0);
-        input_file.level = l;
-        input_file.index = 0;
-        smallest_key_priority_q.push(std::move(input_file));
+        smallest_key_priority_q.emplace(c->input(l, 0), l, 0);
       }
     }
   }
@@ -374,7 +368,7 @@ bool UniversalCompactionBuilder::IsInputFilesNonOverlapping(Compaction* c) {
   auto comparator = icmp_->user_comparator();
   int first_iter = 1;
 
-  InputFileInfo prev, curr, next;
+  InputFileInfo prev, curr;
 
   SmallestKeyHeap smallest_key_priority_q =
       create_level_heap(c, icmp_->user_comparator());
@@ -397,17 +391,10 @@ bool UniversalCompactionBuilder::IsInputFilesNonOverlapping(Compaction* c) {
       prev = curr;
     }
 
-    next.f = nullptr;
-
     if (c->level(curr.level) != 0 &&
         curr.index < c->num_input_files(curr.level) - 1) {
-      next.f = c->input(curr.level, curr.index + 1);
-      next.level = curr.level;
-      next.index = curr.index + 1;
-    }
-
-    if (next.f) {
-      smallest_key_priority_q.push(std::move(next));
+      smallest_key_priority_q.emplace(c->input(curr.level, curr.index + 1),
+                                      curr.level, curr.index + 1);
     }
   }
   return true;
@@ -996,7 +983,7 @@ Compaction* UniversalCompactionBuilder::PickCompactionToReduceSortedRuns(
 
   if (output_level != 0 && picker_->FilesRangeOverlapWithCompaction(
                                inputs, output_level,
-                               Compaction::EvaluatePenultimateLevel(
+                               Compaction::EvaluateProximalLevel(
                                    vstorage_, mutable_cf_options_, ioptions_,
                                    start_level, output_level))) {
     return nullptr;
@@ -1345,7 +1332,7 @@ Compaction* UniversalCompactionBuilder::PickIncrementalForReduceSizeAmp(
   // intra L0 compactions outputs could have overlap
   if (output_level != 0 && picker_->FilesRangeOverlapWithCompaction(
                                inputs, output_level,
-                               Compaction::EvaluatePenultimateLevel(
+                               Compaction::EvaluateProximalLevel(
                                    vstorage_, mutable_cf_options_, ioptions_,
                                    start_level, output_level))) {
     return nullptr;
@@ -1486,9 +1473,9 @@ Compaction* UniversalCompactionBuilder::PickDeleteTriggeredCompaction() {
       }
       if (picker_->FilesRangeOverlapWithCompaction(
               inputs, output_level,
-              Compaction::EvaluatePenultimateLevel(
-                  vstorage_, mutable_cf_options_, ioptions_, start_level,
-                  output_level))) {
+              Compaction::EvaluateProximalLevel(vstorage_, mutable_cf_options_,
+                                                ioptions_, start_level,
+                                                output_level))) {
         return nullptr;
       }
 
@@ -1590,7 +1577,7 @@ Compaction* UniversalCompactionBuilder::PickCompactionWithSortedRunRange(
   // intra L0 compactions outputs could have overlap
   if (output_level != 0 && picker_->FilesRangeOverlapWithCompaction(
                                inputs, output_level,
-                               Compaction::EvaluatePenultimateLevel(
+                               Compaction::EvaluateProximalLevel(
                                    vstorage_, mutable_cf_options_, ioptions_,
                                    start_level, output_level))) {
     return nullptr;
diff --git a/db/compaction/compaction_service_job.cc b/db/compaction/compaction_service_job.cc
index cc75729bfe00..0b6afa10e6bb 100644
--- a/db/compaction/compaction_service_job.cc
+++ b/db/compaction/compaction_service_job.cc
@@ -391,8 +391,8 @@ Status CompactionServiceCompactionJob::Run() {
   // 2. Update the Output information in the Compaction Job Stats with
   // aggregated Internal Compaction Stats.
   UpdateCompactionJobStats(compaction_stats_.stats);
-  if (compaction_stats_.has_penultimate_level_output) {
-    UpdateCompactionJobStats(compaction_stats_.penultimate_level_stats);
+  if (compaction_stats_.has_proximal_level_output) {
+    UpdateCompactionJobStats(compaction_stats_.proximal_level_stats);
   }
 
   // 3. Set fields that are not propagated as part of aggregations above
@@ -417,7 +417,7 @@ Status CompactionServiceCompactionJob::Run() {
           meta.file_creation_time, meta.epoch_number, meta.file_checksum,
           meta.file_checksum_func_name, output_file.validator.GetHash(),
           meta.marked_for_compaction, meta.unique_id,
-          *output_file.table_properties, output_file.is_penultimate_level,
+          *output_file.table_properties, output_file.is_proximal_level,
           meta.temperature);
     }
   }
diff --git a/db/compaction/compaction_service_test.cc b/db/compaction/compaction_service_test.cc
index 064eae37fe86..af3cfa029ce7 100644
--- a/db/compaction/compaction_service_test.cc
+++ b/db/compaction/compaction_service_test.cc
@@ -1194,7 +1194,7 @@ TEST_F(CompactionServiceTest, PrecludeLastLevel) {
   }
   ASSERT_OK(dbfull()->TEST_WaitForCompact());
 
-  // Data split between penultimate (kUnknown) and last (kCold) levels
+  // Data split between proximal (kUnknown) and last (kCold) levels
   ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
   ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
   ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
diff --git a/db/compaction/subcompaction_state.cc b/db/compaction/subcompaction_state.cc
index 13f40f63f0ca..1651a3dec48f 100644
--- a/db/compaction/subcompaction_state.cc
+++ b/db/compaction/subcompaction_state.cc
@@ -18,29 +18,28 @@ void SubcompactionState::AggregateCompactionOutputStats(
   // Outputs should be closed. By extension, any files created just for
   // range deletes have already been written also.
   assert(compaction_outputs_.HasBuilder() == false);
-  assert(penultimate_level_outputs_.HasBuilder() == false);
+  assert(proximal_level_outputs_.HasBuilder() == false);
 
   // FIXME: These stats currently include abandonned output files
   // assert(compaction_outputs_.stats_.num_output_files ==
   //        compaction_outputs_.outputs_.size());
-  // assert(penultimate_level_outputs_.stats_.num_output_files ==
-  //        penultimate_level_outputs_.outputs_.size());
+  // assert(proximal_level_outputs_.stats_.num_output_files ==
+  //        proximal_level_outputs_.outputs_.size());
 
   compaction_stats.stats.Add(compaction_outputs_.stats_);
-  if (penultimate_level_outputs_.HasOutput()) {
-    compaction_stats.has_penultimate_level_output = true;
-    compaction_stats.penultimate_level_stats.Add(
-        penultimate_level_outputs_.stats_);
+  if (proximal_level_outputs_.HasOutput()) {
+    compaction_stats.has_proximal_level_output = true;
+    compaction_stats.proximal_level_stats.Add(proximal_level_outputs_.stats_);
   }
 }
 
 OutputIterator SubcompactionState::GetOutputs() const {
-  return OutputIterator(penultimate_level_outputs_.outputs_,
+  return OutputIterator(proximal_level_outputs_.outputs_,
                         compaction_outputs_.outputs_);
 }
 
 void SubcompactionState::Cleanup(Cache* cache) {
-  penultimate_level_outputs_.Cleanup();
+  proximal_level_outputs_.Cleanup();
   compaction_outputs_.Cleanup();
 
   if (!status.ok()) {
@@ -63,9 +62,9 @@ void SubcompactionState::Cleanup(Cache* cache) {
 }
 
 Slice SubcompactionState::SmallestUserKey() const {
-  if (penultimate_level_outputs_.HasOutput()) {
+  if (proximal_level_outputs_.HasOutput()) {
     Slice a = compaction_outputs_.SmallestUserKey();
-    Slice b = penultimate_level_outputs_.SmallestUserKey();
+    Slice b = proximal_level_outputs_.SmallestUserKey();
     if (a.empty()) {
       return b;
     }
@@ -85,9 +84,9 @@ Slice SubcompactionState::SmallestUserKey() const {
 }
 
 Slice SubcompactionState::LargestUserKey() const {
-  if (penultimate_level_outputs_.HasOutput()) {
+  if (proximal_level_outputs_.HasOutput()) {
     Slice a = compaction_outputs_.LargestUserKey();
-    Slice b = penultimate_level_outputs_.LargestUserKey();
+    Slice b = proximal_level_outputs_.LargestUserKey();
     if (a.empty()) {
       return b;
     }
@@ -107,12 +106,12 @@ Slice SubcompactionState::LargestUserKey() const {
 }
 
 Status SubcompactionState::AddToOutput(
-    const CompactionIterator& iter, bool use_penultimate_output,
+    const CompactionIterator& iter, bool use_proximal_output,
     const CompactionFileOpenFunc& open_file_func,
     const CompactionFileCloseFunc& close_file_func) {
   // update target output
-  current_outputs_ = use_penultimate_output ? &penultimate_level_outputs_
-                                            : &compaction_outputs_;
+  current_outputs_ =
+      use_proximal_output ? &proximal_level_outputs_ : &compaction_outputs_;
   return current_outputs_->AddToOutput(iter, open_file_func, close_file_func);
 }
 
diff --git a/db/compaction/subcompaction_state.h b/db/compaction/subcompaction_state.h
index b9117f5adfbc..016d253566e9 100644
--- a/db/compaction/subcompaction_state.h
+++ b/db/compaction/subcompaction_state.h
@@ -26,13 +26,13 @@ namespace ROCKSDB_NAMESPACE {
 // Maintains state and outputs for each sub-compaction
 // It contains 2 `CompactionOutputs`:
 //  1. one for the normal output files
-//  2. another for the penultimate level outputs
+//  2. another for the proximal level outputs
 // a `current` pointer maintains the current output group, when calling
 // `AddToOutput()`, it checks the output of the current compaction_iterator key
 // and point `current` to the target output group. By default, it just points to
 // normal compaction_outputs, if the compaction_iterator key should be placed on
-// the penultimate level, `current` is changed to point to
-// `penultimate_level_outputs`.
+// the proximal level, `current` is changed to point to
+// `proximal_level_outputs`.
 // The later operations uses `Current()` to get the target group.
 //
 // +----------+          +-----------------------------+      +---------+
@@ -43,7 +43,7 @@ namespace ROCKSDB_NAMESPACE {
 //       |                                                    |  ...    |
 //       |
 //       |               +-----------------------------+      +---------+
-//       +-------------> | penultimate_level_outputs   |----->| output  |
+//       +-------------> | proximal_level_outputs      |----->| output  |
 //                       +-----------------------------+      +---------+
 //                                                            |  ...    |
 
@@ -78,7 +78,7 @@ class SubcompactionState {
   Slice LargestUserKey() const;
 
   // Get all outputs from the subcompaction. For per_key_placement compaction,
-  // it returns both the last level outputs and penultimate level outputs.
+  // it returns both the last level outputs and proximal level outputs.
   OutputIterator GetOutputs() const;
 
   // Assign range dels aggregator. The various tombstones will potentially
@@ -92,7 +92,7 @@ class SubcompactionState {
 
   void RemoveLastEmptyOutput() {
     compaction_outputs_.RemoveLastEmptyOutput();
-    penultimate_level_outputs_.RemoveLastEmptyOutput();
+    proximal_level_outputs_.RemoveLastEmptyOutput();
   }
 
   void BuildSubcompactionJobInfo(
@@ -119,14 +119,14 @@ class SubcompactionState {
         start(_start),
         end(_end),
         sub_job_id(_sub_job_id),
-        compaction_outputs_(c, /*is_penultimate_level=*/false),
-        penultimate_level_outputs_(c, /*is_penultimate_level=*/true) {
+        compaction_outputs_(c, /*is_proximal_level=*/false),
+        proximal_level_outputs_(c, /*is_proximal_level=*/true) {
     assert(compaction != nullptr);
     // Set output split key (used for RoundRobin feature) only for normal
-    // compaction_outputs, output to penultimate_level feature doesn't support
+    // compaction_outputs, output to proximal_level feature doesn't support
     // RoundRobin feature (and may never going to be supported, because for
     // RoundRobin, the data time is mostly naturally sorted, no need to have
-    // per-key placement with output_to_penultimate_level).
+    // per-key placement with output_to_proximal_level).
     compaction_outputs_.SetOutputSlitKey(start, end);
   }
 
@@ -141,18 +141,17 @@ class SubcompactionState {
         compaction_job_stats(std::move(state.compaction_job_stats)),
         sub_job_id(state.sub_job_id),
         compaction_outputs_(std::move(state.compaction_outputs_)),
-        penultimate_level_outputs_(std::move(state.penultimate_level_outputs_)),
+        proximal_level_outputs_(std::move(state.proximal_level_outputs_)),
         range_del_agg_(std::move(state.range_del_agg_)) {
-    current_outputs_ =
-        state.current_outputs_ == &state.penultimate_level_outputs_
-            ? &penultimate_level_outputs_
-            : &compaction_outputs_;
+    current_outputs_ = state.current_outputs_ == &state.proximal_level_outputs_
+                           ? &proximal_level_outputs_
+                           : &compaction_outputs_;
   }
 
   // Add all the new files from this compaction to version_edit
   void AddOutputsEdit(VersionEdit* out_edit) const {
-    for (const auto& file : penultimate_level_outputs_.outputs_) {
-      out_edit->AddFile(compaction->GetPenultimateLevel(), file.meta);
+    for (const auto& file : proximal_level_outputs_.outputs_) {
+      out_edit->AddFile(compaction->GetProximalLevel(), file.meta);
     }
     for (const auto& file : compaction_outputs_.outputs_) {
       out_edit->AddFile(compaction->output_level(), file.meta);
@@ -169,11 +168,11 @@ class SubcompactionState {
     return *current_outputs_;
   }
 
-  CompactionOutputs* Outputs(bool is_penultimate_level) {
+  CompactionOutputs* Outputs(bool is_proximal_level) {
     assert(compaction);
-    if (is_penultimate_level) {
+    if (is_proximal_level) {
       assert(compaction->SupportsPerKeyPlacement());
-      return &penultimate_level_outputs_;
+      return &proximal_level_outputs_;
     }
     return &compaction_outputs_;
   }
@@ -188,12 +187,11 @@ class SubcompactionState {
   }
 
   // Add compaction_iterator key/value to the `Current` output group.
-  Status AddToOutput(const CompactionIterator& iter,
-                     bool use_penultimate_output,
+  Status AddToOutput(const CompactionIterator& iter, bool use_proximal_output,
                      const CompactionFileOpenFunc& open_file_func,
                      const CompactionFileCloseFunc& close_file_func);
 
-  // Close all compaction output files, both output_to_penultimate_level outputs
+  // Close all compaction output files, both output_to_proximal_level outputs
   // and normal outputs.
   Status CloseCompactionFiles(const Status& curr_status,
                               const CompactionFileOpenFunc& open_file_func,
@@ -204,11 +202,11 @@ class SubcompactionState {
     // CloseOutput() may open new compaction output files.
     Status s = curr_status;
     if (per_key) {
-      s = penultimate_level_outputs_.CloseOutput(
-          s, range_del_agg_.get(), open_file_func, close_file_func);
+      s = proximal_level_outputs_.CloseOutput(s, range_del_agg_.get(),
+                                              open_file_func, close_file_func);
     } else {
-      assert(penultimate_level_outputs_.HasBuilder() == false);
-      assert(penultimate_level_outputs_.HasOutput() == false);
+      assert(proximal_level_outputs_.HasBuilder() == false);
+      assert(proximal_level_outputs_.HasOutput() == false);
     }
     s = compaction_outputs_.CloseOutput(s, range_del_agg_.get(), open_file_func,
                                         close_file_func);
@@ -218,7 +216,7 @@ class SubcompactionState {
  private:
   // State kept for output being generated
   CompactionOutputs compaction_outputs_;
-  CompactionOutputs penultimate_level_outputs_;
+  CompactionOutputs proximal_level_outputs_;
   CompactionOutputs* current_outputs_ = &compaction_outputs_;
   std::unique_ptr<CompactionRangeDelAggregator> range_del_agg_;
 };
diff --git a/db/compaction/tiered_compaction_test.cc b/db/compaction/tiered_compaction_test.cc
index eed5cb936f06..a7f2e948d16d 100644
--- a/db/compaction/tiered_compaction_test.cc
+++ b/db/compaction/tiered_compaction_test.cc
@@ -215,8 +215,8 @@ TEST_F(TieredCompactionTest, SequenceBasedTieredStorageUniversal) {
   }
   ASSERT_OK(dbfull()->TEST_WaitForCompact());
 
-  // the penultimate level file temperature is not cold, all data are output to
-  // the penultimate level.
+  // the proximal level file temperature is not cold, all data are output to
+  // the proximal level.
   ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
   ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
   ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
@@ -230,7 +230,7 @@ TEST_F(TieredCompactionTest, SequenceBasedTieredStorageUniversal) {
   ResetAllStats(expect_stats, expect_pl_stats);
 
   // move forward the cold_seq to split the file into 2 levels, so should have
-  // both the last level stats and the output_to_penultimate_level stats
+  // both the last level stats and the output_to_proximal_level stats
   latest_cold_seq = seq_history[0];
   ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
   ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
@@ -246,7 +246,7 @@ TEST_F(TieredCompactionTest, SequenceBasedTieredStorageUniversal) {
   expect_pl_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
   VerifyCompactionStats(expect_stats, expect_pl_stats);
 
-  // delete all cold data, so all data will be on penultimate level
+  // delete all cold data, so all data will be on proximal level
   for (int i = 0; i < 10; i++) {
     ASSERT_OK(Delete(Key(i)));
   }
@@ -364,7 +364,7 @@ TEST_F(TieredCompactionTest, DISABLED_RangeBasedTieredStorageUniversal) {
       "CompactionIterator::PrepareOutput.context", [&](void* arg) {
         auto context = static_cast<PerKeyPlacementContext*>(arg);
         MutexLock l(&mutex);
-        context->output_to_penultimate_level =
+        context->output_to_proximal_level =
             cmp->Compare(context->key, hot_start) >= 0 &&
             cmp->Compare(context->key, hot_end) < 0;
       });
@@ -393,7 +393,7 @@ TEST_F(TieredCompactionTest, DISABLED_RangeBasedTieredStorageUniversal) {
 
   ResetAllStats(expect_stats, expect_pl_stats);
 
-  // change to all cold, no output_to_penultimate_level output
+  // change to all cold, no output_to_proximal_level output
   {
     MutexLock l(&mutex);
     hot_start = Key(100);
@@ -421,7 +421,7 @@ TEST_F(TieredCompactionTest, DISABLED_RangeBasedTieredStorageUniversal) {
   }
 
   // No data is moved from cold tier to hot tier because no input files from L5
-  // or higher, it's not safe to move data to output_to_penultimate_level level.
+  // or higher, it's not safe to move data to output_to_proximal_level level.
   ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
   ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
 
@@ -567,7 +567,7 @@ TEST_F(TieredCompactionTest, LevelColdRangeDelete) {
 
   // 20->30 will be marked as cold data, but it cannot be placed to cold tier
   // (bottommost) otherwise, it will be "deleted" by the range del in
-  // output_to_penultimate_level level verify that these data will be able to
+  // output_to_proximal_level level verify that these data will be able to
   // queried
   for (int i = 20; i < 30; i++) {
     ASSERT_OK(Put(Key(i), "value" + std::to_string(i)));
@@ -677,17 +677,17 @@ TEST_F(TieredCompactionTest, LevelOutofBoundaryRangeDelete) {
   std::vector<std::vector<FileMetaData>> level_to_files;
   dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(),
                                   &level_to_files);
-  // range tombstone is in the penultimate level
-  const int penultimate_level = kNumLevels - 2;
-  ASSERT_EQ(level_to_files[penultimate_level].size(), 1);
-  ASSERT_EQ(level_to_files[penultimate_level][0].num_entries, 1);
-  ASSERT_EQ(level_to_files[penultimate_level][0].num_deletions, 1);
-  ASSERT_EQ(level_to_files[penultimate_level][0].temperature,
+  // range tombstone is in the proximal level
+  const int proximal_level = kNumLevels - 2;
+  ASSERT_EQ(level_to_files[proximal_level].size(), 1);
+  ASSERT_EQ(level_to_files[proximal_level][0].num_entries, 1);
+  ASSERT_EQ(level_to_files[proximal_level][0].num_deletions, 1);
+  ASSERT_EQ(level_to_files[proximal_level][0].temperature,
             Temperature::kUnknown);
 
   ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
   ASSERT_EQ("0,1,10",
-            FilesPerLevel());  // one file is at the penultimate level which
+            FilesPerLevel());  // one file is at the proximal level which
                                // only contains a range delete
 
   // Add 2 hot keys, each is a new SST, they will be placed in the same level as
@@ -701,7 +701,7 @@ TEST_F(TieredCompactionTest, LevelOutofBoundaryRangeDelete) {
 
   ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
   ASSERT_EQ("0,2,10",
-            FilesPerLevel());  // one file is at the penultimate level
+            FilesPerLevel());  // one file is at the proximal level
                                // which only contains a range delete
   std::vector<LiveFileMetaData> live_file_meta;
   db_->GetLiveFilesMetaData(&live_file_meta);
@@ -711,7 +711,7 @@ TEST_F(TieredCompactionTest, LevelOutofBoundaryRangeDelete) {
     if (meta.num_deletions > 0) {
       // found SST with del, which has 2 entries, one for data one for range del
       ASSERT_EQ(meta.level,
-                kNumLevels - 2);  // output to penultimate level
+                kNumLevels - 2);  // output to proximal level
       ASSERT_EQ(meta.num_entries, 2);
       ASSERT_EQ(meta.num_deletions, 1);
       found_sst_with_del = true;
@@ -722,7 +722,7 @@ TEST_F(TieredCompactionTest, LevelOutofBoundaryRangeDelete) {
 
   // release the first snapshot and compact, which should compact the range del
   // but new inserted key `0` and `6` are still hot data which will be placed on
-  // the penultimate level
+  // the proximal level
   db_->ReleaseSnapshot(snap);
   ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
   ASSERT_EQ("0,2,7", FilesPerLevel());
@@ -738,7 +738,7 @@ TEST_F(TieredCompactionTest, LevelOutofBoundaryRangeDelete) {
   ASSERT_FALSE(found_sst_with_del);
 
   // Now make all data cold, key 0 will be moved to the last level, but key 6 is
-  // still in snap2, so it will be kept at the penultimate level
+  // still in snap2, so it will be kept at the proximal level
   latest_cold_seq = dbfull()->GetLatestSequenceNumber();
   ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
   ASSERT_EQ("0,1,8", FilesPerLevel());
@@ -783,7 +783,7 @@ TEST_F(TieredCompactionTest, UniversalRangeDelete) {
   }
   ASSERT_OK(Flush());
 
-  // compact to the penultimate level with 10 files
+  // compact to the proximal level with 10 files
   CompactRangeOptions cro;
   cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
   ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
@@ -810,7 +810,7 @@ TEST_F(TieredCompactionTest, UniversalRangeDelete) {
 
   ASSERT_EQ("0,0,0,0,0,0,8", FilesPerLevel());
 
-  // range del with snapshot should be preserved in the penultimate level
+  // range del with snapshot should be preserved in the proximal level
   auto snap = db_->GetSnapshot();
 
   start = Key(6);
@@ -841,7 +841,7 @@ TEST_F(TieredCompactionTest, UniversalRangeDelete) {
     if (meta.num_deletions > 0) {
       // found SST with del, which has 2 entries, one for data one for range del
       ASSERT_EQ(meta.level,
-                kNumLevels - 2);  // output_to_penultimate_level level
+                kNumLevels - 2);  // output_to_proximal_level level
       ASSERT_EQ(meta.num_entries, 2);
       ASSERT_EQ(meta.num_deletions, 1);
       found_sst_with_del = true;
@@ -1138,7 +1138,7 @@ TEST_F(TieredCompactionTest, DISABLED_RangeBasedTieredStorageLevel) {
       "CompactionIterator::PrepareOutput.context", [&](void* arg) {
         auto context = static_cast<PerKeyPlacementContext*>(arg);
         MutexLock l(&mutex);
-        context->output_to_penultimate_level =
+        context->output_to_proximal_level =
             cmp->Compare(context->key, hot_start) >= 0 &&
             cmp->Compare(context->key, hot_end) < 0;
       });
@@ -1221,10 +1221,10 @@ TEST_F(TieredCompactionTest, DISABLED_RangeBasedTieredStorageLevel) {
       options.statistics->getTickerCount(COMPACTION_RANGE_DEL_DROP_OBSOLETE),
       1);
 
-  // Tests that we only compact keys up to penultimate level
-  // that are within penultimate level input's internal key range.
-  // UPDATE: this functionality has changed. With penultimate-enabled
-  // compaction, the expanded potential output range in the penultimate
+  // Tests that we only compact keys up to proximal level
+  // that are within proximal level input's internal key range.
+  // UPDATE: this functionality has changed. With proximal-enabled
+  // compaction, the expanded potential output range in the proximal
   // level is reserved so should be safe to use.
   {
     MutexLock l(&mutex);
@@ -1376,7 +1376,7 @@ TEST_P(PrecludeLastLevelTest, MigrationFromPreserveTimeManualCompaction) {
   cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
   ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
 
-  // all data is moved up to the penultimate level
+  // all data is moved up to the proximal level
   ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
   ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
   ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
@@ -1448,7 +1448,7 @@ TEST_P(PrecludeLastLevelTest, MigrationFromPreserveTimeAutoCompaction) {
     ASSERT_OK(dbfull()->TEST_WaitForCompact());
   }
 
-  // all data is moved up to the penultimate level
+  // all data is moved up to the proximal level
   ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
   ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
   ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
@@ -1579,9 +1579,9 @@ TEST_P(PrecludeLastLevelTest, SmallPrecludeTime) {
 }
 
 TEST_P(PrecludeLastLevelTest, CheckInternalKeyRange) {
-  // When compacting keys from the last level to penultimate level,
-  // output to penultimate level should be within internal key range
-  // of input files from penultimate level.
+  // When compacting keys from the last level to proximal level,
+  // output to proximal level should be within internal key range
+  // of input files from proximal level.
   // Set up:
   // L5:
   //  File 1: DeleteRange[1, 3)@4, File 2: [3@5, 100@6]
@@ -1719,8 +1719,8 @@ TEST_P(PrecludeWithCompactStyleTest, RangeTombstoneSnapshotMigrateFromLast) {
 
   ApplyConfigChange(&options, {{"preclude_last_level_data_seconds", "10000"}});
 
-  // To exercise the WithinPenultimateLevelOutputRange feature, we want files
-  // around the middle file to be compacted on the penultimate level
+  // To exercise the WithinProximalLevelOutputRange feature, we want files
+  // around the middle file to be compacted on the proximal level
   ASSERT_OK(Put(Key(0), "val0"));
   ASSERT_OK(Flush());
   ASSERT_OK(Put(Key(3), "val3"));
@@ -1777,9 +1777,9 @@ TEST_P(PrecludeWithCompactStyleTest, RangeTombstoneSnapshotMigrateFromLast) {
   EXPECT_EQ("0,0,0,0,0,3,1", FilesPerLevel());
   VerifyLogicalState(__LINE__);
 
-  // Compact everything, but some data still goes to both penultimate and last
+  // Compact everything, but some data still goes to both proximal and last
   // levels. A full-range compaction should be safe to "migrate" data from the
-  // last level to penultimate (because of preclude setting change).
+  // last level to proximal (because of preclude setting change).
   ASSERT_OK(CompactRange({}, {}, {}));
   EXPECT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
   VerifyLogicalState(__LINE__);
@@ -1898,7 +1898,7 @@ TEST_P(TimedPutPrecludeLastLevelTest, InterleavedTimedPutAndPut) {
   Close();
 }
 
-TEST_P(TimedPutPrecludeLastLevelTest, PreserveTimedPutOnPenultimateLevel) {
+TEST_P(TimedPutPrecludeLastLevelTest, PreserveTimedPutOnProximalLevel) {
   Options options = CurrentOptions();
   options.compaction_style = kCompactionStyleUniversal;
   options.disable_auto_compactions = true;
@@ -1924,14 +1924,14 @@ TEST_P(TimedPutPrecludeLastLevelTest, PreserveTimedPutOnPenultimateLevel) {
   ASSERT_OK(TimedPut(0, Key(2), "v2", kMockStartTime - 1 * 24 * 60 * 60, wo));
   ASSERT_OK(Flush());
 
-  // Should still be in penultimate level.
+  // Should still be in proximal level.
   ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
   ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
   ASSERT_GT(GetSstSizeHelper(Temperature::kHot), 0);
   ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
 
   // Wait one more day and release snapshot. Data's preferred seqno should be
-  // swapped in, but data should still stay in penultimate level. SST file's
+  // swapped in, but data should still stay in proximal level. SST file's
   // seqno to time mapping should continue to cover preferred seqno after
   // compaction.
   db_->ReleaseSnapshot(snap1);
@@ -2253,13 +2253,13 @@ TEST_P(PrecludeLastLevelOptionalTest, LastLevelOnlyCompactionNoPreclude) {
   Close();
 }
 
-TEST_P(PrecludeLastLevelOptionalTest, PeriodicCompactionToPenultimateLevel) {
+TEST_P(PrecludeLastLevelOptionalTest, PeriodicCompactionToProximalLevel) {
   // Test the last level only periodic compaction should also be blocked by an
-  // ongoing compaction in penultimate level if tiered compaction is enabled
+  // ongoing compaction in proximal level if tiered compaction is enabled
   // otherwise, the periodic compaction should just run for the last level.
   const int kNumTrigger = 4;
   const int kNumLevels = 7;
-  const int kPenultimateLevel = kNumLevels - 2;
+  const int kProximalLevel = kNumLevels - 2;
   const int kKeyPerSec = 1;
   const int kNumKeys = 100;
 
@@ -2301,13 +2301,13 @@ TEST_P(PrecludeLastLevelOptionalTest, PeriodicCompactionToPenultimateLevel) {
   SyncPoint::GetInstance()->SetCallBack(
       "CompactionJob::ProcessKeyValueCompaction()::Processing", [&](void* arg) {
         auto compaction = static_cast<Compaction*>(arg);
-        if (compaction->output_level() == kPenultimateLevel) {
+        if (compaction->output_level() == kProximalLevel) {
           is_size_ratio_compaction_running = true;
           TEST_SYNC_POINT(
-              "PrecludeLastLevelTest::PeriodicCompactionToPenultimateLevel:"
+              "PrecludeLastLevelTest::PeriodicCompactionToProximalLevel:"
               "SizeRatioCompaction1");
           TEST_SYNC_POINT(
-              "PrecludeLastLevelTest::PeriodicCompactionToPenultimateLevel:"
+              "PrecludeLastLevelTest::PeriodicCompactionToProximalLevel:"
               "SizeRatioCompaction2");
           is_size_ratio_compaction_running = false;
         }
@@ -2329,17 +2329,17 @@ TEST_P(PrecludeLastLevelOptionalTest, PeriodicCompactionToPenultimateLevel) {
           verified_last_level_compaction = true;
         }
         TEST_SYNC_POINT(
-            "PrecludeLastLevelTest::PeriodicCompactionToPenultimateLevel:"
+            "PrecludeLastLevelTest::PeriodicCompactionToProximalLevel:"
             "AutoCompactionPicked");
       });
 
   SyncPoint::GetInstance()->LoadDependency({
-      {"PrecludeLastLevelTest::PeriodicCompactionToPenultimateLevel:"
+      {"PrecludeLastLevelTest::PeriodicCompactionToProximalLevel:"
        "SizeRatioCompaction1",
-       "PrecludeLastLevelTest::PeriodicCompactionToPenultimateLevel:DoneWrite"},
-      {"PrecludeLastLevelTest::PeriodicCompactionToPenultimateLevel:"
+       "PrecludeLastLevelTest::PeriodicCompactionToProximalLevel:DoneWrite"},
+      {"PrecludeLastLevelTest::PeriodicCompactionToProximalLevel:"
        "AutoCompactionPicked",
-       "PrecludeLastLevelTest::PeriodicCompactionToPenultimateLevel:"
+       "PrecludeLastLevelTest::PeriodicCompactionToProximalLevel:"
        "SizeRatioCompaction2"},
   });
 
@@ -2356,11 +2356,11 @@ TEST_P(PrecludeLastLevelOptionalTest, PeriodicCompactionToPenultimateLevel) {
   }
 
   TEST_SYNC_POINT(
-      "PrecludeLastLevelTest::PeriodicCompactionToPenultimateLevel:DoneWrite");
+      "PrecludeLastLevelTest::PeriodicCompactionToProximalLevel:DoneWrite");
 
   // wait for periodic compaction time and flush to trigger the periodic
   // compaction, which should be blocked by ongoing compaction in the
-  // penultimate level
+  // proximal level
   mock_clock_->MockSleepForSeconds(10000);
   for (int i = 0; i < 3 * kNumKeys; i++) {
     ASSERT_OK(Put(Key(i), rnd.RandomString(10)));
@@ -2423,7 +2423,7 @@ class ThreeRangesPartitionerFactory : public SstPartitionerFactory {
   }
 };
 
-TEST_P(PrecludeLastLevelTest, PartialPenultimateLevelCompaction) {
+TEST_P(PrecludeLastLevelTest, PartialProximalLevelCompaction) {
   const int kNumTrigger = 4;
   const int kNumLevels = 7;
   const int kKeyPerSec = 10;
@@ -2593,8 +2593,8 @@ TEST_P(PrecludeLastLevelTest, RangeDelsCauseFileEndpointsToOverlap) {
       "UniversalCompactionBuilder::PickCompaction:Return", [&](void* arg) {
         auto compaction = static_cast<Compaction*>(arg);
         if (compaction->SupportsPerKeyPlacement()) {
-          ASSERT_EQ(compaction->GetPenultimateOutputRangeType(),
-                    Compaction::PenultimateOutputRangeType::kNonLastRange);
+          ASSERT_EQ(compaction->GetProximalOutputRangeType(),
+                    Compaction::ProximalOutputRangeType::kNonLastRange);
           per_key_comp_num++;
         }
       });
@@ -2650,7 +2650,7 @@ TEST_P(PrecludeLastLevelTest, RangeDelsCauseFileEndpointsToOverlap) {
   ASSERT_EQ(3, per_key_comp_num);
   verify_db();
 
-  // Finish off the penultimate level.
+  // Finish off the proximal level.
   ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
   ASSERT_EQ("0,0,0,0,0,0,3", FilesPerLevel());
   verify_db();
diff --git a/db/flush_job.h b/db/flush_job.h
index 1c1f15d1b1dc..f3f85abbcc70 100644
--- a/db/flush_job.h
+++ b/db/flush_job.h
@@ -234,7 +234,7 @@ class FlushJob {
 
   // The current minimum seqno that compaction jobs will preclude the data from
   // the last level. Data with seqnos larger than this or larger than
-  // `earliest_snapshot_` will be output to the penultimate level had it gone
+  // `earliest_snapshot_` will be output to the proximal level had it gone
   // through a compaction to the last level.
   SequenceNumber preclude_last_level_min_seqno_ = kMaxSequenceNumber;
 };
diff --git a/db/internal_stats.h b/db/internal_stats.h
index 7ebd406db757..01c4c4bd7184 100644
--- a/db/internal_stats.h
+++ b/db/internal_stats.h
@@ -474,33 +474,33 @@ class InternalStats {
   };
 
   // Compaction stats, for per_key_placement compaction, it includes 2 levels
-  // stats: the last level and the penultimate level.
+  // stats: the last level and the proximal level.
   struct CompactionStatsFull {
     // the stats for the target primary output level
     CompactionStats stats;
 
-    // stats for penultimate level output if exist
-    bool has_penultimate_level_output = false;
-    CompactionStats penultimate_level_stats;
+    // stats for proximal level output if exist
+    bool has_proximal_level_output = false;
+    CompactionStats proximal_level_stats;
 
-    explicit CompactionStatsFull() : stats(), penultimate_level_stats() {}
+    explicit CompactionStatsFull() : stats(), proximal_level_stats() {}
 
     explicit CompactionStatsFull(CompactionReason reason, int c)
-        : stats(reason, c), penultimate_level_stats(reason, c) {}
+        : stats(reason, c), proximal_level_stats(reason, c) {}
 
     uint64_t TotalBytesWritten() const {
       uint64_t bytes_written = stats.bytes_written + stats.bytes_written_blob;
-      if (has_penultimate_level_output) {
-        bytes_written += penultimate_level_stats.bytes_written +
-                         penultimate_level_stats.bytes_written_blob;
+      if (has_proximal_level_output) {
+        bytes_written += proximal_level_stats.bytes_written +
+                         proximal_level_stats.bytes_written_blob;
       }
       return bytes_written;
     }
 
     uint64_t DroppedRecords() {
       uint64_t output_records = stats.num_output_records;
-      if (has_penultimate_level_output) {
-        output_records += penultimate_level_stats.num_output_records;
+      if (has_proximal_level_output) {
+        output_records += proximal_level_stats.num_output_records;
       }
       if (stats.num_input_records > output_records) {
         return stats.num_input_records - output_records;
@@ -510,12 +510,12 @@ class InternalStats {
 
     void SetMicros(uint64_t val) {
       stats.micros = val;
-      penultimate_level_stats.micros = val;
+      proximal_level_stats.micros = val;
     }
 
     void AddCpuMicros(uint64_t val) {
       stats.cpu_micros += val;
-      penultimate_level_stats.cpu_micros += val;
+      proximal_level_stats.cpu_micros += val;
     }
   };
 
@@ -588,9 +588,8 @@ class InternalStats {
   void AddCompactionStats(int level, Env::Priority thread_pri,
                           const CompactionStatsFull& comp_stats_full) {
     AddCompactionStats(level, thread_pri, comp_stats_full.stats);
-    if (comp_stats_full.has_penultimate_level_output) {
-      per_key_placement_comp_stats_.Add(
-          comp_stats_full.penultimate_level_stats);
+    if (comp_stats_full.has_proximal_level_output) {
+      per_key_placement_comp_stats_.Add(comp_stats_full.proximal_level_stats);
     }
   }
 
diff --git a/db/seqno_time_test.cc b/db/seqno_time_test.cc
index a23f9ce6671f..98fae6d6c531 100644
--- a/db/seqno_time_test.cc
+++ b/db/seqno_time_test.cc
@@ -96,7 +96,7 @@ TEST_F(SeqnoTimeTest, TemperatureBasicUniversal) {
   }
   ASSERT_OK(dbfull()->TEST_WaitForCompact());
 
-  // All data is hot, only output to penultimate level
+  // All data is hot, only output to proximal level
   ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
   ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
   ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
@@ -185,7 +185,7 @@ TEST_F(SeqnoTimeTest, TemperatureBasicLevel) {
   options.num_levels = kNumLevels;
   options.level_compaction_dynamic_level_bytes = true;
   // TODO(zjay): for level compaction, auto-compaction may stuck in deadloop, if
-  //  the penultimate level score > 1, but the hot is not cold enough to compact
+  //  the proximal level score > 1, but the hot is not cold enough to compact
   //  to last level, which will keep triggering compaction.
   options.disable_auto_compactions = true;
   DestroyAndReopen(options);
@@ -205,7 +205,7 @@ TEST_F(SeqnoTimeTest, TemperatureBasicLevel) {
   cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
   ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
 
-  // All data is hot, only output to penultimate level
+  // All data is hot, only output to proximal level
   ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
   ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
   ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
@@ -753,7 +753,7 @@ TEST_P(SeqnoTimeTablePropTest, SeqnoToTimeMappingUniversal) {
   CompactRangeOptions cro;
   cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
   ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
-  // make sure the data is all compacted to penultimate level if the feature is
+  // make sure the data is all compacted to proximal level if the feature is
   // on, otherwise, compacted to the last level.
   if (options.preclude_last_level_data_seconds > 0) {
     ASSERT_GT(NumTableFilesAtLevel(5), 0);
diff --git a/db/version_set.cc b/db/version_set.cc
index 7e9893a93c0e..c8f13b48bc39 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -4761,7 +4761,7 @@ void VersionStorageInfo::CalculateBaseBytes(const ImmutableOptions& ioptions,
             cur_level_size <= base_bytes_min &&
             (options.preclude_last_level_data_seconds == 0 ||
              i < num_levels_ - 2)) {
-          // When per_key_placement is enabled, the penultimate level is
+          // When per_key_placement is enabled, the proximal level is
           // necessary.
           lowest_unnecessary_level_ = i;
         }
diff --git a/include/rocksdb/compaction_job_stats.h b/include/rocksdb/compaction_job_stats.h
index 91709795a176..ba3fcebcc905 100644
--- a/include/rocksdb/compaction_job_stats.h
+++ b/include/rocksdb/compaction_job_stats.h
@@ -118,6 +118,6 @@ struct CompactionJobStats {
   // number of single-deletes which meet something other than a put
   uint64_t num_single_del_mismatch = 0;
 
-  // TODO: Add output_to_penultimate_level output information
+  // TODO: Add output_to_proximal_level output information
 };
 }  // namespace ROCKSDB_NAMESPACE

From 0cc943c06714e7d1fff1df94c5c47f60ace6410e Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Fri, 14 Mar 2025 10:50:05 -0700
Subject: [PATCH 020/500] format_version < 2 unsupported for write, deprecated
 for read (#13463)

Summary:
In hopes of eventually removing some ugly and awkard code for compress_format_version < 2, users can no longer write files in that format and its read support is marked deprecated. For continuing to test that read support, there is a back door to writing the files in unit tests.

If format_version < 2 is specified, it is quietly sanitized to 2. (This is similar to other BlockBasedTableOptions.)

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13463

Test Plan: unit tests updated.

Reviewed By: hx235

Differential Revision: D71152916

Pulled By: pdillinger

fbshipit-source-id: 95be55e86f93f09fd898223578b9381385c3ccd8
---
 db/db_test.cc                                 | 21 ++++++++++++++++
 include/rocksdb/cache.h                       |  2 +-
 include/rocksdb/table.h                       | 24 ++++++++++++-------
 .../block_based/block_based_table_builder.cc  | 10 --------
 .../block_based/block_based_table_factory.cc  | 19 +++++++++++++++
 table/block_based/block_based_table_factory.h |  6 +++++
 table/table_test.cc                           |  4 ++++
 .../public_api_changes/unsupport_fv1.md       |  1 +
 8 files changed, 68 insertions(+), 19 deletions(-)
 create mode 100644 unreleased_history/public_api_changes/unsupport_fv1.md

diff --git a/db/db_test.cc b/db/db_test.cc
index e141e562afbd..875ca64d29ee 100644
--- a/db/db_test.cc
+++ b/db/db_test.cc
@@ -59,11 +59,13 @@
 #include "rocksdb/utilities/checkpoint.h"
 #include "rocksdb/utilities/optimistic_transaction_db.h"
 #include "rocksdb/utilities/write_batch_with_index.h"
+#include "table/block_based/block_based_table_factory.h"
 #include "table/mock_table.h"
 #include "test_util/sync_point.h"
 #include "test_util/testharness.h"
 #include "test_util/testutil.h"
 #include "util/compression.h"
+#include "util/defer.h"
 #include "util/mutexlock.h"
 #include "util/random.h"
 #include "util/rate_limiter_impl.h"
@@ -6084,6 +6086,11 @@ TEST_F(DBTest, L0L1L2AndUpHitCounter) {
 }
 
 TEST_F(DBTest, EncodeDecompressedBlockSizeTest) {
+  bool& allow_unsupported_fv =
+      BlockBasedTableFactory::AllowUnsupportedFormatVersion();
+  SaveAndRestore guard(&allow_unsupported_fv);
+  ASSERT_FALSE(allow_unsupported_fv);
+
   // iter 0 -- zlib
   // iter 1 -- bzip2
   // iter 2 -- lz4
@@ -6106,7 +6113,16 @@ TEST_F(DBTest, EncodeDecompressedBlockSizeTest) {
       table_options.format_version = first_table_version;
       table_options.filter_policy.reset(NewBloomFilterPolicy(10));
       Options options = CurrentOptions();
+
+      // Hack to generate old files (checked in factory construction)
+      allow_unsupported_fv = true;
       options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+      ASSERT_EQ(options.table_factory->GetOptions<BlockBasedTableOptions>()
+                    ->format_version,
+                first_table_version);
+      // Able to read old files without the hack
+      allow_unsupported_fv = false;
+
       options.create_if_missing = true;
       options.compression = comp;
       DestroyAndReopen(options);
@@ -6118,9 +6134,14 @@ TEST_F(DBTest, EncodeDecompressedBlockSizeTest) {
         // compressible string
         ASSERT_OK(Put(Key(i), rnd.RandomString(128) + std::string(128, 'a')));
       }
+      ASSERT_OK(Flush());
 
       table_options.format_version = first_table_version == 1 ? 2 : 1;
       options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+      // format_version (for writing) is sanitized to minimum supported
+      ASSERT_EQ(options.table_factory->GetOptions<BlockBasedTableOptions>()
+                    ->format_version,
+                BlockBasedTableFactory::kMinSupportedFormatVersion);
       Reopen(options);
       for (int i = 0; i < kNumKeysWritten; ++i) {
         auto r = Get(Key(i));
diff --git a/include/rocksdb/cache.h b/include/rocksdb/cache.h
index 54e9e88aacba..8ca5f272f132 100644
--- a/include/rocksdb/cache.h
+++ b/include/rocksdb/cache.h
@@ -300,7 +300,7 @@ struct CompressedSecondaryCacheOptions : LRUCacheOptions {
 
   // compress_format_version can have two values:
   // compress_format_version == 1 -- decompressed size is not included in the
-  // block header.
+  // block header. DEPRECATED
   // compress_format_version == 2 -- decompressed size is included in the block
   // header in varint32 format.
   uint32_t compress_format_version = 2;
diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h
index e1f76fcd4632..1ce073d4a44e 100644
--- a/include/rocksdb/table.h
+++ b/include/rocksdb/table.h
@@ -126,7 +126,15 @@ struct CacheUsageOptions {
 };
 
 // Configures how SST files using the block-based table format (standard)
-// are written and read.
+// are written and read. With few exceptions, each option only affects either
+// (a) how new SST files are written, or (b) how SST files are read. If an
+// option seems to affect how the SST file is constructed, e.g. format_version,
+// that option *ONLY* has an effect at construction time. Contrast this with
+// options like the various `cache` and `pin` options, that only affect
+// in-memory and IO behavior at read time. In general, any version of RocksDB
+// able to read the full key-value and indexing data in the SST file will read
+// it as written regardless of current options for writing new files. See
+// filter_policy regarding filters.
 //
 // Except as specifically noted, all options here are "mutable" using
 // SetOptions(), with the caveat that only new table builders and new table
@@ -480,6 +488,10 @@ struct BlockBasedTableOptions {
   // If non-nullptr, use the specified filter policy to reduce disk reads.
   // Many applications will benefit from passing the result of
   // NewBloomFilterPolicy() here.
+  //
+  // Because filters only impact performance and are not data-critical, an
+  // SST file can be opened and used without filters if (a) the filter
+  // policy name or schema is unrecognized, or (b) filter_policy is nullptr.
   std::shared_ptr<const FilterPolicy> filter_policy = nullptr;
 
   // If true, place whole keys in the filter (not just prefixes).
@@ -524,13 +536,9 @@ struct BlockBasedTableOptions {
   // Default: 0 (disabled)
   uint32_t read_amp_bytes_per_bit = 0;
 
-  // We currently have these versions:
-  // 0 -- This version can be read by really old RocksDB's. Doesn't support
-  // changing checksum type (default is CRC32).
-  // 1 -- Can be read by RocksDB's versions since 3.0. Supports non-default
-  // checksum, like xxHash. It is written by RocksDB when
-  // BlockBasedTableOptions::checksum is something other than kCRC32c. (version
-  // 0 is silently upconverted)
+  // We currently have these format versions:
+  // 0 - 1 -- Unsupported for writing new files and quietly sanitized to 2.
+  // Read support is deprecated and could be removed in the future.
   // 2 -- Can be read by RocksDB's versions since 3.10. Changes the way we
   // encode compressed blocks with LZ4, BZip2 and Zlib compression. If you
   // don't plan to run RocksDB before version 3.10, you should probably use
diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc
index 3fb7b2dbdaf4..654317d9da95 100644
--- a/table/block_based/block_based_table_builder.cc
+++ b/table/block_based/block_based_table_builder.cc
@@ -980,16 +980,6 @@ BlockBasedTableBuilder::BlockBasedTableBuilder(
     const BlockBasedTableOptions& table_options, const TableBuilderOptions& tbo,
     WritableFileWriter* file) {
   BlockBasedTableOptions sanitized_table_options(table_options);
-  if (sanitized_table_options.format_version == 0 &&
-      sanitized_table_options.checksum != kCRC32c) {
-    ROCKS_LOG_WARN(
-        tbo.ioptions.logger,
-        "Silently converting format_version to 1 because checksum is "
-        "non-default");
-    // silently convert format_version to 1 to keep consistent with current
-    // behavior
-    sanitized_table_options.format_version = 1;
-  }
   auto ucmp = tbo.internal_comparator.user_comparator();
   assert(ucmp);
   (void)ucmp;  // avoids unused variable error.
diff --git a/table/block_based/block_based_table_factory.cc b/table/block_based/block_based_table_factory.cc
index 7add9fb16fcb..c93dea119f3b 100644
--- a/table/block_based/block_based_table_factory.cc
+++ b/table/block_based/block_based_table_factory.cc
@@ -467,6 +467,20 @@ void BlockBasedTableFactory::InitializeOptions() {
       options_overrides_iter->second.charged = options.charged;
     }
   }
+
+  if (table_options_.format_version < kMinSupportedFormatVersion) {
+    if (AllowUnsupportedFormatVersion()) {
+      // Allow old format version for testing.
+      // And relevant old sanitization.
+      if (table_options_.format_version == 0 &&
+          table_options_.checksum != kCRC32c) {
+        // silently convert format_version to 1 to support non-CRC32c checksum
+        table_options_.format_version = 1;
+      }
+    } else {
+      table_options_.format_version = kMinSupportedFormatVersion;
+    }
+  }
 }
 
 Status BlockBasedTableFactory::PrepareOptions(const ConfigOptions& opts) {
@@ -910,6 +924,11 @@ Status BlockBasedTableFactory::ParseOption(const ConfigOptions& config_options,
   return status;
 }
 
+bool& BlockBasedTableFactory::AllowUnsupportedFormatVersion() {
+  static bool allow = false;
+  return allow;
+}
+
 Status GetBlockBasedTableOptionsFromString(
     const ConfigOptions& config_options,
     const BlockBasedTableOptions& table_options, const std::string& opts_str,
diff --git a/table/block_based/block_based_table_factory.h b/table/block_based/block_based_table_factory.h
index b05b45660401..fd1c577d7f5d 100644
--- a/table/block_based/block_based_table_factory.h
+++ b/table/block_based/block_based_table_factory.h
@@ -87,6 +87,12 @@ class BlockBasedTableFactory : public TableFactory {
     return &shared_state_->tail_prefetch_stats;
   }
 
+  static constexpr int kMinSupportedFormatVersion = 2;
+
+  // Set to true to allow unit testing of writing unsupported block-based table
+  // format versions (to test read side)
+  static bool& AllowUnsupportedFormatVersion();
+
  protected:
   const void* GetOptionsPtr(const std::string& name) const override;
   Status ParseOption(const ConfigOptions& config_options,
diff --git a/table/table_test.cc b/table/table_test.cc
index 51e7ea497f9a..ba5b01a132fd 100644
--- a/table/table_test.cc
+++ b/table/table_test.cc
@@ -6868,6 +6868,10 @@ TEST_F(ExternalTableReaderTest, SstReaderTest) {
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
+  // Opt-in this whole test file
+  ROCKSDB_NAMESPACE::BlockBasedTableFactory::AllowUnsupportedFormatVersion() =
+      true;
+
   ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
diff --git a/unreleased_history/public_api_changes/unsupport_fv1.md b/unreleased_history/public_api_changes/unsupport_fv1.md
new file mode 100644
index 000000000000..6f31edde0b52
--- /dev/null
+++ b/unreleased_history/public_api_changes/unsupport_fv1.md
@@ -0,0 +1 @@
+* format\_version < 2 in BlockBasedTableOptions is no longer supported for writing new files. Support for reading such files is deprecated and might be removed in the future. `CompressedSecondaryCacheOptions::compress_format_version == 1` is also deprecated.

From 6ac13a5f0aecdbd7fd09f01b014f6564b87e0436 Mon Sep 17 00:00:00 2001
From: Maciej Szeszko <mszeszko@meta.com>
Date: Fri, 14 Mar 2025 21:43:50 -0700
Subject: [PATCH 021/500] Expose WriteLifeTimeHint at the FileOptions level
 (#13461)

Summary:
The original implementation of NVMe write lifetime hints (https://github.com/facebook/rocksdb/pull/3095) assumed a flexible interface which decouples file creation from the explicit act of setting write lifetime hint (see `PosixWritableFile` for more context). However, there are existing file systems implementations (ex. Warm Storage) that require all the options (including file write lifetime hints) to be specified once at the time of the actual `FSWritableFile` object instantiation. We're extending the `FileOptions` with `Env::WriteLifeTimeHint` and patch existing callsites accordingly to enable one-shot metadata setup for those more constraint implementations.

NOTE: Today `CalculateSSTWriteHint` only sets write lifetime hint for Level compactions. We'll fill that gap in following PRs and add calculation for Universal Compactions which would unblock Zippy's use case.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13461

Reviewed By: anand1976

Differential Revision: D71144645

Pulled By: mszeszko-meta

fbshipit-source-id: 6c09b62a360d48bd6e4fb08a1265bce2a49f3f4a
---
 db/blob/blob_file_builder.cc    | 10 +++++++---
 db/builder.cc                   |  8 ++++++--
 db/compaction/compaction_job.cc |  5 ++++-
 db/db_impl/db_impl_open.cc      |  5 ++++-
 include/rocksdb/file_system.h   |  9 ++++++++-
 5 files changed, 29 insertions(+), 8 deletions(-)

diff --git a/db/blob/blob_file_builder.cc b/db/blob/blob_file_builder.cc
index dceb90cee57a..1cb6833b5918 100644
--- a/db/blob/blob_file_builder.cc
+++ b/db/blob/blob_file_builder.cc
@@ -188,10 +188,12 @@ Status BlobFileBuilder::OpenBlobFileIfNeeded() {
   }
 
   std::unique_ptr<FSWritableFile> file;
-
+  FileOptions fo_copy;
   {
     assert(file_options_);
-    Status s = NewWritableFile(fs_, blob_file_path, &file, *file_options_);
+    fo_copy = *file_options_;
+    fo_copy.write_hint = write_hint_;
+    Status s = NewWritableFile(fs_, blob_file_path, &file, fo_copy);
 
     TEST_SYNC_POINT_CALLBACK(
         "BlobFileBuilder::OpenBlobFileIfNeeded:NewWritableFile", &s);
@@ -209,7 +211,9 @@ Status BlobFileBuilder::OpenBlobFileIfNeeded() {
 
   assert(file);
   file->SetIOPriority(write_options_->rate_limiter_priority);
-  file->SetWriteLifeTimeHint(write_hint_);
+  // Subsequent attempts to override the hint via SetWriteLifeTimeHint
+  // with the very same value will be ignored by the fs.
+  file->SetWriteLifeTimeHint(fo_copy.write_hint);
   FileTypeSet tmp_set = immutable_options_->checksum_handoff_file_types;
   Statistics* const statistics = immutable_options_->stats;
   std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
diff --git a/db/builder.cc b/db/builder.cc
index 08a9fecc7278..a39bcf3b4765 100644
--- a/db/builder.cc
+++ b/db/builder.cc
@@ -145,7 +145,9 @@ Status BuildTable(
       bool use_direct_writes = file_options.use_direct_writes;
       TEST_SYNC_POINT_CALLBACK("BuildTable:create_file", &use_direct_writes);
 #endif  // !NDEBUG
-      IOStatus io_s = NewWritableFile(fs, fname, &file, file_options);
+      FileOptions fo_copy = file_options;
+      fo_copy.write_hint = write_hint;
+      IOStatus io_s = NewWritableFile(fs, fname, &file, fo_copy);
       assert(s.ok());
       s = io_s;
       if (io_status->ok()) {
@@ -163,7 +165,9 @@ Status BuildTable(
       table_file_created = true;
       FileTypeSet tmp_set = ioptions.checksum_handoff_file_types;
       file->SetIOPriority(tboptions.write_options.rate_limiter_priority);
-      file->SetWriteLifeTimeHint(write_hint);
+      // Subsequent attempts to override the hint via SetWriteLifeTimeHint
+      // with the very same value will be ignored by the fs.
+      file->SetWriteLifeTimeHint(fo_copy.write_hint);
       file_writer.reset(new WritableFileWriter(
           std::move(file), fname, file_options, ioptions.clock, io_tracer,
           ioptions.stats, Histograms::SST_WRITE_MICROS, ioptions.listeners,
diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc
index 46939e988cd0..512a9ae23ca9 100644
--- a/db/compaction/compaction_job.cc
+++ b/db/compaction/compaction_job.cc
@@ -1950,6 +1950,7 @@ Status CompactionJob::OpenCompactionOutputFile(SubcompactionState* sub_compact,
     temperature = last_level_temp;
   }
   fo_copy.temperature = temperature;
+  fo_copy.write_hint = write_hint_;
 
   Status s;
   IOStatus io_s = NewWritableFile(fs_.get(), fname, &writable_file, fo_copy);
@@ -2035,7 +2036,9 @@ Status CompactionJob::OpenCompactionOutputFile(SubcompactionState* sub_compact,
   }
 
   writable_file->SetIOPriority(GetRateLimiterPriority());
-  writable_file->SetWriteLifeTimeHint(write_hint_);
+  // Subsequent attempts to override the hint via SetWriteLifeTimeHint
+  // with the very same value will be ignored by the fs.
+  writable_file->SetWriteLifeTimeHint(fo_copy.write_hint);
   FileTypeSet tmp_set = db_options_.checksum_handoff_file_types;
   writable_file->SetPreallocationBlockSize(static_cast<size_t>(
       sub_compact->compaction->OutputFilePreallocationSize()));
diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc
index 549e574f7f9b..08a037bcb504 100644
--- a/db/db_impl/db_impl_open.cc
+++ b/db/db_impl/db_impl_open.cc
@@ -2264,6 +2264,7 @@ IOStatus DBImpl::CreateWAL(const WriteOptions& write_options,
       BuildDBOptions(immutable_db_options_, mutable_db_options_);
   FileOptions opt_file_options =
       fs_->OptimizeForLogWrite(file_options_, db_options);
+  opt_file_options.write_hint = CalculateWALWriteHint();
   // DB option takes precedence when not kUnknown
   if (immutable_db_options_.wal_write_temperature != Temperature::kUnknown) {
     opt_file_options.temperature = immutable_db_options_.wal_write_temperature;
@@ -2285,7 +2286,9 @@ IOStatus DBImpl::CreateWAL(const WriteOptions& write_options,
   }
 
   if (io_s.ok()) {
-    lfile->SetWriteLifeTimeHint(CalculateWALWriteHint());
+    // Subsequent attempts to override the hint via SetWriteLifeTimeHint
+    // with the very same value will be ignored by the fs.
+    lfile->SetWriteLifeTimeHint(opt_file_options.write_hint);
     lfile->SetPreallocationBlockSize(preallocate_block_size);
 
     const auto& listeners = immutable_db_options_.listeners;
diff --git a/include/rocksdb/file_system.h b/include/rocksdb/file_system.h
index 27e497f432b5..ec10a5f12682 100644
--- a/include/rocksdb/file_system.h
+++ b/include/rocksdb/file_system.h
@@ -192,6 +192,12 @@ struct FileOptions : EnvOptions {
   // handoff during file writes.
   ChecksumType handoff_checksum_type;
 
+  // Expose write lifetime hint on the FileOptions level to provide more
+  // flexibility in setting the hint in downstream, custom implementations
+  // that might be able to process the hint only at the time of the actual
+  // FSWritableFile object creation.
+  Env::WriteLifeTimeHint write_hint = Env::WLTH_NOT_SET;
+
   FileOptions() : EnvOptions(), handoff_checksum_type(ChecksumType::kCRC32c) {}
 
   FileOptions(const DBOptions& opts)
@@ -206,7 +212,8 @@ struct FileOptions : EnvOptions {
       : EnvOptions(opts),
         io_options(opts.io_options),
         temperature(opts.temperature),
-        handoff_checksum_type(opts.handoff_checksum_type) {}
+        handoff_checksum_type(opts.handoff_checksum_type),
+        write_hint(opts.write_hint) {}
 
   FileOptions& operator=(const FileOptions&) = default;
 };

From 24952ff0883dad4826093411d4573aa00e479772 Mon Sep 17 00:00:00 2001
From: Hui Xiao <huixiao@fb.com>
Date: Mon, 17 Mar 2025 11:11:44 -0700
Subject: [PATCH 022/500] Expose number of L0 files in the CF right before the
 compaction starts in CompactionJobInfo (#13462)

Summary:
**Context/Summary:**
For users who are interested in knowing how efficient their compaction in reducing L0 files or how bad their long-running compaction in "locking" L0 files, they now have a reference point "L0 files in the CF pre compaction" for their input compaction files.
- Compared to the existing stats or exposing in some other way, exposing this info in CompactionJobInfo allows users to compare it with other compaction data (e.g, compaction input num, compaction reason) of within **one** compaction (of per-compaction granularity).
- If this number is high while their "short-running" compaction has little L0 files input, then those compaction may have a room for improvement. Similar for those long-running compaction. This PR is to add a new field `CompactionJobInfo::num_l0_files_pre_compaction` for that.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13462

Test Plan: - Piggyback on an existing test

Reviewed By: jaykorean

Differential Revision: D71124938

Pulled By: hx235

fbshipit-source-id: aa47c9c86c62d9425771b320f5636e50671fd289
---
 db/db_compaction_test.cc                   | 31 ++++++++++++++++++++++
 db/db_impl/db_impl_compaction_flush.cc     |  4 +++
 include/rocksdb/listener.h                 |  3 +++
 unreleased_history/new_features/l0_file.md |  1 +
 4 files changed, 39 insertions(+)
 create mode 100644 unreleased_history/new_features/l0_file.md

diff --git a/db/db_compaction_test.cc b/db/db_compaction_test.cc
index a17a5a6ebe02..67303416878b 100644
--- a/db/db_compaction_test.cc
+++ b/db/db_compaction_test.cc
@@ -5912,6 +5912,9 @@ TEST_F(DBCompactionTest, SubcompactionEvent) {
       ASSERT_EQ(running_compactions_.find(ci.job_id),
                 running_compactions_.end());
       running_compactions_.emplace(ci.job_id, std::unordered_set<int>());
+      if (expected_num_l0_files_pre_compaction_ != -1) {
+        ASSERT_EQ(expected_num_l0_files_pre_compaction_, ci.num_l0_files);
+      }
     }
 
     void OnCompactionCompleted(DB* /*db*/,
@@ -5921,6 +5924,9 @@ TEST_F(DBCompactionTest, SubcompactionEvent) {
       ASSERT_NE(it, running_compactions_.end());
       ASSERT_EQ(it->second.size(), 0);
       running_compactions_.erase(it);
+      if (expected_num_l0_files_post_compaction_ != -1) {
+        ASSERT_EQ(expected_num_l0_files_post_compaction_, ci.num_l0_files);
+      }
     }
 
     void OnSubcompactionBegin(const SubcompactionJobInfo& si) override {
@@ -5950,10 +5956,25 @@ TEST_F(DBCompactionTest, SubcompactionEvent) {
       return total_subcompaction_cnt_;
     }
 
+    void SetExpectedNumL0FilesPreCompaction(int num) {
+      expected_num_l0_files_pre_compaction_ = num;
+    }
+
+    void SetExpectedNumL0FilesPostCompaction(int num) {
+      expected_num_l0_files_post_compaction_ = num;
+    }
+
+    void ResetExpectedNumL0Files() {
+      SetExpectedNumL0FilesPreCompaction(-1);
+      SetExpectedNumL0FilesPostCompaction(-1);
+    }
+
    private:
     InstrumentedMutex mutex_;
     std::unordered_map<int, std::unordered_set<int>> running_compactions_;
     size_t total_subcompaction_cnt_ = 0;
+    int expected_num_l0_files_pre_compaction_ = -1;
+    int expected_num_l0_files_post_compaction_ = -1;
   };
 
   Options options = CurrentOptions();
@@ -5973,6 +5994,7 @@ TEST_F(DBCompactionTest, SubcompactionEvent) {
     ASSERT_OK(Flush());
   }
   MoveFilesToLevel(2);
+  ASSERT_EQ(FilesPerLevel(), "0,0,4");
 
   // generate 2 files @ L1 which overlaps with L2 files
   for (int i = 0; i < 2; i++) {
@@ -5982,11 +6004,18 @@ TEST_F(DBCompactionTest, SubcompactionEvent) {
     }
     ASSERT_OK(Flush());
   }
+  listener->SetExpectedNumL0FilesPreCompaction(2 /* num */);
+  listener->SetExpectedNumL0FilesPostCompaction(0 /* num */);
+
   MoveFilesToLevel(1);
   ASSERT_EQ(FilesPerLevel(), "0,2,4");
 
+  listener->ResetExpectedNumL0Files();
+
   CompactRangeOptions comp_opts;
   comp_opts.max_subcompactions = 4;
+
+  listener->SetExpectedNumL0FilesPreCompaction(0 /* num */);
   Status s = dbfull()->CompactRange(comp_opts, nullptr, nullptr);
   ASSERT_OK(s);
   ASSERT_OK(dbfull()->TEST_WaitForCompact());
@@ -5994,6 +6023,8 @@ TEST_F(DBCompactionTest, SubcompactionEvent) {
   ASSERT_EQ(listener->GetRunningCompactionCount(), 0);
   // and sub compaction is triggered
   ASSERT_GT(listener->GetTotalSubcompactionCount(), 0);
+
+  listener->ResetExpectedNumL0Files();
 }
 
 TEST_F(DBCompactionTest, CompactFilesOutputRangeConflict) {
diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc
index f623511d7303..07d446186727 100644
--- a/db/db_impl/db_impl_compaction_flush.cc
+++ b/db/db_impl/db_impl_compaction_flush.cc
@@ -1695,11 +1695,13 @@ void DBImpl::NotifyOnCompactionBegin(ColumnFamilyData* cfd, Compaction* c,
   }
 
   c->SetNotifyOnCompactionCompleted();
+  int num_l0_files = c->input_version()->storage_info()->NumLevelFiles(0);
   // release lock while notifying events
   mutex_.Unlock();
   TEST_SYNC_POINT("DBImpl::NotifyOnCompactionBegin::UnlockMutex");
   {
     CompactionJobInfo info{};
+    info.num_l0_files = num_l0_files;
     BuildCompactionJobInfo(cfd, c, st, job_stats, job_id, &info);
     for (const auto& listener : immutable_db_options_.listeners) {
       listener->OnCompactionBegin(this, info);
@@ -1724,11 +1726,13 @@ void DBImpl::NotifyOnCompactionCompleted(
     return;
   }
 
+  int num_l0_files = cfd->current()->storage_info()->NumLevelFiles(0);
   // release lock while notifying events
   mutex_.Unlock();
   TEST_SYNC_POINT("DBImpl::NotifyOnCompactionCompleted::UnlockMutex");
   {
     CompactionJobInfo info{};
+    info.num_l0_files = num_l0_files;
     BuildCompactionJobInfo(cfd, c, st, compaction_job_stats, job_id, &info);
     for (const auto& listener : immutable_db_options_.listeners) {
       listener->OnCompactionCompleted(this, info);
diff --git a/include/rocksdb/listener.h b/include/rocksdb/listener.h
index 019f4d40bf60..fe90a7b2ec94 100644
--- a/include/rocksdb/listener.h
+++ b/include/rocksdb/listener.h
@@ -439,6 +439,9 @@ struct CompactionJobInfo {
   // the job id, which is unique in the same thread.
   int job_id;
 
+  // the number of L0 files in the CF right before and after the compaction
+  int num_l0_files;
+
   // the smallest input level of the compaction.
   int base_input_level;
   // the output level of the compaction.
diff --git a/unreleased_history/new_features/l0_file.md b/unreleased_history/new_features/l0_file.md
new file mode 100644
index 000000000000..f31178217b31
--- /dev/null
+++ b/unreleased_history/new_features/l0_file.md
@@ -0,0 +1 @@
+Add a new field `num_l0_files` in `CompactionJobInfo` about the number of L0 files in the CF right before and after the compaction

From 17ac19f2c448856f2cfca2d85500f8b0b4af0bdc Mon Sep 17 00:00:00 2001
From: Yu Zhang <yuzhangyu@fb.com>
Date: Mon, 17 Mar 2025 12:49:10 -0700
Subject: [PATCH 023/500] Add a check during recovery for proper seqno
 advancement (#13465)

Summary:
This PR adds a check for an invariant of sequence number during recovery, that it should not be set backward. This is inspired by a recent SEV that is caused by a software bug. It is a relatively cheap and straightforward check that RocksDB can do to avoid silently opening the DB in a corrupted state.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13465

Test Plan:
Existing tests should cover the case when the invariant is met

The corrupted state is manually tested using aforementioned bug.

Reviewed By: hx235

Differential Revision: D71226513

Pulled By: jowlyzhang

fbshipit-source-id: cd8056fa6653d44ceeb9ba9b4693ab0660a53b4e
---
 db/db_impl/db_impl.h       |  5 +++++
 db/db_impl/db_impl_open.cc | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 37 insertions(+)

diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h
index b746a7f3902c..11cf2347831f 100644
--- a/db/db_impl/db_impl.h
+++ b/db/db_impl/db_impl.h
@@ -2129,6 +2129,11 @@ class DBImpl : public DB {
       bool flushed, std::unordered_map<int, VersionEdit>* version_edits,
       RecoveryContext* recovery_ctx);
 
+  // Check that DB sequence number is not set back during recovery between
+  // replaying of WAL files and between replaying of WriteBatches.
+  Status CheckSeqnoNotSetBackDuringRecovery(SequenceNumber prev_next_seqno,
+                                            SequenceNumber current_next_seqno);
+
   void FinishLogFilesRecovery(int job_id, const Status& status);
   // The following two methods are used to flush a memtable to
   // storage. The first one is used at database RecoveryTime (when the
diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc
index 08a037bcb504..5fb98f2b6b98 100644
--- a/db/db_impl/db_impl_open.cc
+++ b/db/db_impl/db_impl_open.cc
@@ -1203,6 +1203,13 @@ Status DBImpl::ProcessLogFiles(
   PredecessorWALInfo predecessor_wal_info;
 
   for (auto wal_number : wal_numbers) {
+    // Detecting early break on the next iteration after `wal_number` has been
+    // advanced since this `wal_number` doesn't affect follow-up handling after
+    // breaking out of the for loop.
+    if (!status.ok()) {
+      break;
+    }
+    SequenceNumber prev_next_sequence = *next_sequence;
     if (status.ok()) {
       status = ProcessLogFile(
           wal_number, min_wal_number, is_retry, read_only, job_id,
@@ -1210,6 +1217,10 @@ Status DBImpl::ProcessLogFiles(
           &stop_replay_by_wal_filter, &corrupted_wal_number,
           corrupted_wal_found, version_edits, &flushed, predecessor_wal_info);
     }
+    if (status.ok()) {
+      status = CheckSeqnoNotSetBackDuringRecovery(prev_next_sequence,
+                                                  *next_sequence);
+    }
   }
 
   if (status.ok()) {
@@ -1317,6 +1328,7 @@ Status DBImpl::ProcessLogFile(
     }
 
     // FIXME(hx235): consolidate `process_status` and `status`
+    SequenceNumber prev_next_sequence = *next_sequence;
     Status process_status = ProcessLogRecord(
         record, reader, running_ts_sz, wal_number, fname, read_only, job_id,
         logFileDropped, &reporter, &record_checksum, &last_seqno_observed,
@@ -1325,6 +1337,12 @@ Status DBImpl::ProcessLogFile(
 
     if (!process_status.ok()) {
       return process_status;
+    } else if (Status seqno_check_status = CheckSeqnoNotSetBackDuringRecovery(
+                   prev_next_sequence, *next_sequence);
+               !seqno_check_status.ok()) {
+      // Sequence number being set back indicates a serious software bug, the DB
+      // should not be opened in this case.
+      return seqno_check_status;
     } else if (*stop_replay_for_corruption) {
       break;
     }
@@ -1863,6 +1881,20 @@ Status DBImpl::MaybeFlushFinalMemtableOrRestoreActiveLogFiles(
   return status;
 }
 
+Status DBImpl::CheckSeqnoNotSetBackDuringRecovery(
+    SequenceNumber prev_next_seqno, SequenceNumber current_next_seqno) {
+  if (prev_next_seqno == kMaxSequenceNumber ||
+      prev_next_seqno <= current_next_seqno) {
+    return Status::OK();
+  }
+  std::string msg =
+      "Sequence number is being set backwards during recovery, this is likely "
+      "a software bug or a data corruption. Prev next seqno: " +
+      std::to_string(prev_next_seqno) +
+      " , current next seqno: " + std::to_string(current_next_seqno);
+  return Status::Corruption(msg);
+}
+
 void DBImpl::FinishLogFilesRecovery(int job_id, const Status& status) {
   event_logger_.Log() << "job" << job_id << "event"
                       << (status.ok() ? "recovery_finished" : "recovery_failed")

From cc487ba3678b5514994c6357f7a53115f998d909 Mon Sep 17 00:00:00 2001
From: Jay Huh <jewoongh@meta.com>
Date: Tue, 18 Mar 2025 16:28:18 -0700
Subject: [PATCH 024/500] Fix Compaction Stats for Remote Compaction and Tiered
 Storage (#13464)

Summary:
## Background

Compaction statistics are collected at various levels across different classes and structs.

* `InternalStats::CompactionStats`: Per-level Compaction Stats within a job (can be at subcompaction level which later get aggregated to the compaction level)
* `InternalStats::CompactionStatsFull`: Contains two per-level compaction stats - `output_level_stats` for primary output level stats and `proximal_level_stats` for proximal level stats. Proximal level statistics are only relevant when using Tiered Storage with the per-key placement feature enabled.
* `InternalStats::CompactionOutputsStats`: Simplified version of `InternalStats::CompactionStats`. Only has a subset of fields from `InternalStats::CompactionStats`
* `CompactionJobStats`: Job-level Compaction Stats. (can be at subcompaction level which later get aggregated to the compaction level)

Please note that some fields in Job-level stats are not in Per-level stats and they don't map 1-to-1 today.

## Issues

* In non-remote compactions, proximal level compaction statistics were not being aggregated into job-level statistics. Job level statistics were missing stats for proximal level for tiered storage compactions with per-key-replacement feature enabled.
* During remote compactions, proximal level compaction statistics were pre-aggregated into job-level statistics on the remote side. However, per-level compaction statistics were not part of the serialized compaction result, so that primary host lost that information and weren't able to populate `per_key_placement_comp_stats_` and `internal_stats_.proximal_level_stats` properly during the installation.
* `TieredCompactionTest` was only checking if (expected stats > 0 && actual stats > 0) instead actual value comparison

## Fixes

* Renamed `compaction_stats_` to `internal_stats_` for `InternalStats::CompactionStatsFull` in `CompactionJob` for better readability
* Removed the usage of `InternalStats::CompactionOutputsStats` and consolidated them to `InternalStats::CompactionStats`.
* Remote Compactions now include the internal stats in the serialized `CompactionServiceResult`. `output_level_stats` and `proximal_level_stats` get later propagated in sub_compact output stats accordingly.
* `CompactionJob::UpdateCompactionJobStats()` now takes `CompactionStatsFull` and aggregates the `proximal_level_stats` as well
* `TieredCompactionTest` is now doing the actual value comparisons for input/output file counts and record counts. Follow up is needed to do the same for the bytes read / written.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13464

Test Plan:
Unit Tests updated to verify stats

```
./compaction_service_test
```
```
./tiered_compaction_test
```

Reviewed By: pdillinger

Differential Revision: D71220393

Pulled By: jaykorean

fbshipit-source-id: ad70bffd9614ced683f90c7570a17def9b5c8f3f
---
 db/compaction/compaction_job.cc               | 281 ++++++----
 db/compaction/compaction_job.h                |  48 +-
 db/compaction/compaction_outputs.cc           |   2 +-
 db/compaction/compaction_outputs.h            |  12 +-
 db/compaction/compaction_service_job.cc       | 209 ++++++--
 db/compaction/compaction_service_test.cc      |  23 +-
 db/compaction/compaction_state.cc             |   8 +-
 db/compaction/compaction_state.h              |   4 +-
 db/compaction/subcompaction_state.cc          |   8 +-
 db/compaction/subcompaction_state.h           |  12 +-
 db/compaction/tiered_compaction_test.cc       | 488 +++++++++++++-----
 db/internal_stats.h                           |  52 +-
 .../bug_fixes/stats_fix_for_tiered_storage.md |   1 +
 13 files changed, 785 insertions(+), 363 deletions(-)
 create mode 100644 unreleased_history/bug_fixes/stats_fix_for_tiered_storage.md

diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc
index 512a9ae23ca9..94b7b102d8bf 100644
--- a/db/compaction/compaction_job.cc
+++ b/db/compaction/compaction_job.cc
@@ -147,7 +147,7 @@ CompactionJob::CompactionJob(
     BlobFileCompletionCallback* blob_callback, int* bg_compaction_scheduled,
     int* bg_bottom_compaction_scheduled)
     : compact_(new CompactionState(compaction)),
-      compaction_stats_(compaction->compaction_reason(), 1),
+      internal_stats_(compaction->compaction_reason(), 1),
       db_options_(db_options),
       mutable_db_options_copy_(mutable_db_options),
       log_buffer_(log_buffer),
@@ -155,7 +155,7 @@ CompactionJob::CompactionJob(
       stats_(stats),
       bottommost_level_(false),
       write_hint_(Env::WLTH_NOT_SET),
-      compaction_job_stats_(compaction_job_stats),
+      job_stats_(compaction_job_stats),
       job_id_(job_id),
       dbname_(dbname),
       db_id_(db_id),
@@ -191,7 +191,7 @@ CompactionJob::CompactionJob(
       extra_num_subcompaction_threads_reserved_(0),
       bg_compaction_scheduled_(bg_compaction_scheduled),
       bg_bottom_compaction_scheduled_(bg_bottom_compaction_scheduled) {
-  assert(compaction_job_stats_ != nullptr);
+  assert(job_stats_ != nullptr);
   assert(log_buffer_ != nullptr);
 
   const auto* cfd = compact_->compaction->column_family_data();
@@ -240,9 +240,8 @@ void CompactionJob::ReportStartedCompaction(Compaction* compaction) {
   // to ensure GetThreadList() can always show them all together.
   ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION);
 
-  compaction_job_stats_->is_manual_compaction =
-      compaction->is_manual_compaction();
-  compaction_job_stats_->is_full_compaction = compaction->is_full_compaction();
+  job_stats_->is_manual_compaction = compaction->is_manual_compaction();
+  job_stats_->is_full_compaction = compaction->is_full_compaction();
 }
 
 void CompactionJob::Prepare(
@@ -695,17 +694,17 @@ Status CompactionJob::Run() {
     thread.join();
   }
 
-  compaction_stats_.SetMicros(db_options_.clock->NowMicros() - start_micros);
+  internal_stats_.SetMicros(db_options_.clock->NowMicros() - start_micros);
 
   for (auto& state : compact_->sub_compact_states) {
-    compaction_stats_.AddCpuMicros(state.compaction_job_stats.cpu_micros);
+    internal_stats_.AddCpuMicros(state.compaction_job_stats.cpu_micros);
     state.RemoveLastEmptyOutput();
   }
 
   RecordTimeToHistogram(stats_, COMPACTION_TIME,
-                        compaction_stats_.stats.micros);
+                        internal_stats_.output_level_stats.micros);
   RecordTimeToHistogram(stats_, COMPACTION_CPU_TIME,
-                        compaction_stats_.stats.cpu_micros);
+                        internal_stats_.output_level_stats.cpu_micros);
 
   TEST_SYNC_POINT("CompactionJob::Run:BeforeVerify");
 
@@ -855,46 +854,54 @@ Status CompactionJob::Run() {
   // compaction_service is set. We now know whether each sub_compaction was
   // done remotely or not. Reset is_remote_compaction back to false and allow
   // AggregateCompactionStats() to set the right value.
-  compaction_job_stats_->is_remote_compaction = false;
+  job_stats_->is_remote_compaction = false;
 
   // Finish up all bookkeeping to unify the subcompaction results.
-  compact_->AggregateCompactionStats(compaction_stats_, *compaction_job_stats_);
-  uint64_t num_input_range_del = 0;
-  bool ok = UpdateCompactionStats(&num_input_range_del);
-  // (Sub)compactions returned ok, do sanity check on the number of input keys.
-  if (status.ok() && ok && compaction_job_stats_->has_num_input_records) {
-    size_t ts_sz = compact_->compaction->column_family_data()
-                       ->user_comparator()
-                       ->timestamp_size();
-    // When trim_ts_ is non-empty, CompactionIterator takes
-    // HistoryTrimmingIterator as input iterator and sees a trimmed view of
-    // input keys. So the number of keys it processed is not suitable for
-    // verification here.
-    // TODO: support verification when trim_ts_ is non-empty.
-    if (!(ts_sz > 0 && !trim_ts_.empty())) {
-      assert(compaction_stats_.stats.num_input_records > 0);
-      // TODO: verify the number of range deletion entries.
-      uint64_t expected =
-          compaction_stats_.stats.num_input_records - num_input_range_del;
-      uint64_t actual = compaction_job_stats_->num_input_records;
-      if (expected != actual) {
-        char scratch[2345];
-        compact_->compaction->Summary(scratch, sizeof(scratch));
-        std::string msg =
-            "Compaction number of input keys does not match "
-            "number of keys processed. Expected " +
-            std::to_string(expected) + " but processed " +
-            std::to_string(actual) + ". Compaction summary: " + scratch;
-        ROCKS_LOG_WARN(
-            db_options_.info_log, "[%s] [JOB %d] Compaction with status: %s",
-            compact_->compaction->column_family_data()->GetName().c_str(),
-            job_context_->job_id, msg.c_str());
-        if (db_options_.compaction_verify_record_count) {
-          status = Status::Corruption(msg);
+  compact_->AggregateCompactionStats(internal_stats_, *job_stats_);
+
+  // For remote compactions, internal_stats_.output_level_stats were part of the
+  // compaction_result already. No need to re-update it.
+  if (job_stats_->is_remote_compaction == false) {
+    uint64_t num_input_range_del = 0;
+    bool ok = UpdateOutputLevelCompactionStats(&num_input_range_del);
+    // (Sub)compactions returned ok, do sanity check on the number of input
+    // keys.
+    if (status.ok() && ok && job_stats_->has_num_input_records) {
+      size_t ts_sz = compact_->compaction->column_family_data()
+                         ->user_comparator()
+                         ->timestamp_size();
+      // When trim_ts_ is non-empty, CompactionIterator takes
+      // HistoryTrimmingIterator as input iterator and sees a trimmed view of
+      // input keys. So the number of keys it processed is not suitable for
+      // verification here.
+      // TODO: support verification when trim_ts_ is non-empty.
+      if (!(ts_sz > 0 && !trim_ts_.empty())) {
+        assert(internal_stats_.output_level_stats.num_input_records > 0);
+        // TODO: verify the number of range deletion entries.
+        uint64_t expected =
+            internal_stats_.output_level_stats.num_input_records -
+            num_input_range_del;
+        uint64_t actual = job_stats_->num_input_records;
+        if (expected != actual) {
+          char scratch[2345];
+          compact_->compaction->Summary(scratch, sizeof(scratch));
+          std::string msg =
+              "Compaction number of input keys does not match "
+              "number of keys processed. Expected " +
+              std::to_string(expected) + " but processed " +
+              std::to_string(actual) + ". Compaction summary: " + scratch;
+          ROCKS_LOG_WARN(
+              db_options_.info_log, "[%s] [JOB %d] Compaction with status: %s",
+              compact_->compaction->column_family_data()->GetName().c_str(),
+              job_context_->job_id, msg.c_str());
+          if (db_options_.compaction_verify_record_count) {
+            status = Status::Corruption(msg);
+          }
         }
       }
     }
   }
+
   RecordCompactionIOStats();
   LogFlush(db_options_.info_log);
   TEST_SYNC_POINT("CompactionJob::Run():End");
@@ -916,7 +923,7 @@ Status CompactionJob::Install(bool* compaction_released) {
 
   int output_level = compact_->compaction->output_level();
   cfd->internal_stats()->AddCompactionStats(output_level, thread_pri_,
-                                            compaction_stats_);
+                                            internal_stats_);
 
   if (status.ok()) {
     status = InstallCompactionResults(compaction_released);
@@ -927,7 +934,7 @@ Status CompactionJob::Install(bool* compaction_released) {
 
   VersionStorageInfo::LevelSummaryStorage tmp;
   auto vstorage = cfd->current()->storage_info();
-  const auto& stats = compaction_stats_.stats;
+  const auto& stats = internal_stats_.output_level_stats;
 
   double read_write_amp = 0.0;
   double write_amp = 0.0;
@@ -993,19 +1000,21 @@ Status CompactionJob::Install(bool* compaction_released) {
         blob_files.back()->GetBlobFileNumber());
   }
 
-  if (compaction_stats_.has_proximal_level_output) {
+  if (internal_stats_.has_proximal_level_output) {
     ROCKS_LOG_BUFFER(log_buffer_,
                      "[%s] has Proximal Level output: %" PRIu64
                      ", level %d, number of files: %" PRIu64
                      ", number of records: %" PRIu64,
                      column_family_name.c_str(),
-                     compaction_stats_.proximal_level_stats.bytes_written,
+                     internal_stats_.proximal_level_stats.bytes_written,
                      compact_->compaction->GetProximalLevel(),
-                     compaction_stats_.proximal_level_stats.num_output_files,
-                     compaction_stats_.proximal_level_stats.num_output_records);
+                     internal_stats_.proximal_level_stats.num_output_files,
+                     internal_stats_.proximal_level_stats.num_output_records);
   }
 
-  UpdateCompactionJobStats(stats);
+  UpdateCompactionJobStats(internal_stats_);
+  TEST_SYNC_POINT_CALLBACK(
+      "CompactionJob::Install:AfterUpdateCompactionJobStats", job_stats_);
 
   auto stream = event_logger_->LogToBuffer(log_buffer_, 8192);
   stream << "job" << job_id_ << "event" << "compaction_finished"
@@ -1027,17 +1036,16 @@ Status CompactionJob::Install(bool* compaction_released) {
          << CompressionTypeToString(compact_->compaction->output_compression());
 
   stream << "num_single_delete_mismatches"
-         << compaction_job_stats_->num_single_del_mismatch;
+         << job_stats_->num_single_del_mismatch;
   stream << "num_single_delete_fallthrough"
-         << compaction_job_stats_->num_single_del_fallthru;
+         << job_stats_->num_single_del_fallthru;
 
   if (measure_io_stats_) {
-    stream << "file_write_nanos" << compaction_job_stats_->file_write_nanos;
-    stream << "file_range_sync_nanos"
-           << compaction_job_stats_->file_range_sync_nanos;
-    stream << "file_fsync_nanos" << compaction_job_stats_->file_fsync_nanos;
+    stream << "file_write_nanos" << job_stats_->file_write_nanos;
+    stream << "file_range_sync_nanos" << job_stats_->file_range_sync_nanos;
+    stream << "file_fsync_nanos" << job_stats_->file_fsync_nanos;
     stream << "file_prepare_write_nanos"
-           << compaction_job_stats_->file_prepare_write_nanos;
+           << job_stats_->file_prepare_write_nanos;
   }
 
   stream << "lsm_state";
@@ -1055,9 +1063,9 @@ Status CompactionJob::Install(bool* compaction_released) {
     stream << "blob_file_tail" << blob_files.back()->GetBlobFileNumber();
   }
 
-  if (compaction_stats_.has_proximal_level_output) {
+  if (internal_stats_.has_proximal_level_output) {
     InternalStats::CompactionStats& pl_stats =
-        compaction_stats_.proximal_level_stats;
+        internal_stats_.proximal_level_stats;
     stream << "proximal_level_num_output_files" << pl_stats.num_output_files;
     stream << "proximal_level_bytes_written" << pl_stats.bytes_written;
     stream << "proximal_level_num_output_records"
@@ -1812,22 +1820,22 @@ Status CompactionJob::InstallCompactionResults(bool* compaction_released) {
 
   {
     Compaction::InputLevelSummaryBuffer inputs_summary;
-    if (compaction_stats_.has_proximal_level_output) {
+    if (internal_stats_.has_proximal_level_output) {
       ROCKS_LOG_BUFFER(
           log_buffer_,
           "[%s] [JOB %d] Compacted %s => output_to_proximal_level: %" PRIu64
           " bytes + last: %" PRIu64 " bytes. Total: %" PRIu64 " bytes",
           compaction->column_family_data()->GetName().c_str(), job_id_,
           compaction->InputLevelSummary(&inputs_summary),
-          compaction_stats_.proximal_level_stats.bytes_written,
-          compaction_stats_.stats.bytes_written,
-          compaction_stats_.TotalBytesWritten());
+          internal_stats_.proximal_level_stats.bytes_written,
+          internal_stats_.output_level_stats.bytes_written,
+          internal_stats_.TotalBytesWritten());
     } else {
       ROCKS_LOG_BUFFER(log_buffer_,
                        "[%s] [JOB %d] Compacted %s => %" PRIu64 " bytes",
                        compaction->column_family_data()->GetName().c_str(),
                        job_id_, compaction->InputLevelSummary(&inputs_summary),
-                       compaction_stats_.TotalBytesWritten());
+                       internal_stats_.TotalBytesWritten());
     }
   }
 
@@ -2087,12 +2095,13 @@ void CopyPrefix(const Slice& src, size_t prefix_length, std::string* dst) {
 }
 }  // namespace
 
-bool CompactionJob::UpdateCompactionStats(uint64_t* num_input_range_del) {
+bool CompactionJob::UpdateOutputLevelCompactionStats(
+    uint64_t* num_input_range_del) {
   assert(compact_);
 
   Compaction* compaction = compact_->compaction;
-  compaction_stats_.stats.num_input_files_in_non_output_levels = 0;
-  compaction_stats_.stats.num_input_files_in_output_level = 0;
+  internal_stats_.output_level_stats.num_input_files_in_non_output_levels = 0;
+  internal_stats_.output_level_stats.num_input_files_in_output_level = 0;
 
   bool has_error = false;
   const ReadOptions read_options(Env::IOActivity::kCompaction);
@@ -2104,13 +2113,14 @@ bool CompactionJob::UpdateCompactionStats(uint64_t* num_input_range_del) {
     size_t num_input_files = flevel->num_files;
     uint64_t* bytes_read;
     if (compaction->level(input_level) != compaction->output_level()) {
-      compaction_stats_.stats.num_input_files_in_non_output_levels +=
+      internal_stats_.output_level_stats.num_input_files_in_non_output_levels +=
           static_cast<int>(num_input_files);
-      bytes_read = &compaction_stats_.stats.bytes_read_non_output_levels;
+      bytes_read =
+          &internal_stats_.output_level_stats.bytes_read_non_output_levels;
     } else {
-      compaction_stats_.stats.num_input_files_in_output_level +=
+      internal_stats_.output_level_stats.num_input_files_in_output_level +=
           static_cast<int>(num_input_files);
-      bytes_read = &compaction_stats_.stats.bytes_read_output_level;
+      bytes_read = &internal_stats_.output_level_stats.bytes_read_output_level;
     }
     for (size_t i = 0; i < num_input_files; ++i) {
       const FileMetaData* file_meta = flevel->files[i].file_metadata;
@@ -2130,7 +2140,8 @@ bool CompactionJob::UpdateCompactionStats(uint64_t* num_input_range_del) {
           has_error = true;
         }
       }
-      compaction_stats_.stats.num_input_records += file_input_entries;
+      internal_stats_.output_level_stats.num_input_records +=
+          file_input_entries;
       if (num_input_range_del) {
         *num_input_range_del += file_num_range_del;
       }
@@ -2141,62 +2152,116 @@ bool CompactionJob::UpdateCompactionStats(uint64_t* num_input_range_del) {
     size_t num_filtered_input_files = filtered_flevel.size();
     uint64_t* bytes_skipped;
     if (compaction->level(input_level) != compaction->output_level()) {
-      compaction_stats_.stats.num_filtered_input_files_in_non_output_levels +=
+      internal_stats_.output_level_stats
+          .num_filtered_input_files_in_non_output_levels +=
           static_cast<int>(num_filtered_input_files);
-      bytes_skipped = &compaction_stats_.stats.bytes_skipped_non_output_levels;
+      bytes_skipped =
+          &internal_stats_.output_level_stats.bytes_skipped_non_output_levels;
     } else {
-      compaction_stats_.stats.num_filtered_input_files_in_output_level +=
+      internal_stats_.output_level_stats
+          .num_filtered_input_files_in_output_level +=
           static_cast<int>(num_filtered_input_files);
-      bytes_skipped = &compaction_stats_.stats.bytes_skipped_output_level;
+      bytes_skipped =
+          &internal_stats_.output_level_stats.bytes_skipped_output_level;
     }
     for (const FileMetaData* filtered_file_meta : filtered_flevel) {
       *bytes_skipped += filtered_file_meta->fd.GetFileSize();
     }
   }
 
-  assert(compaction_job_stats_);
-  compaction_stats_.stats.bytes_read_blob =
-      compaction_job_stats_->total_blob_bytes_read;
+  assert(job_stats_);
+  internal_stats_.output_level_stats.bytes_read_blob =
+      job_stats_->total_blob_bytes_read;
 
-  compaction_stats_.stats.num_dropped_records =
-      compaction_stats_.DroppedRecords();
+  internal_stats_.output_level_stats.num_dropped_records =
+      internal_stats_.DroppedRecords();
   return !has_error;
 }
 
 void CompactionJob::UpdateCompactionJobStats(
-    const InternalStats::CompactionStats& stats) const {
-  compaction_job_stats_->elapsed_micros = stats.micros;
+    const InternalStats::CompactionStatsFull& internal_stats) const {
+  assert(job_stats_);
+  job_stats_->elapsed_micros = internal_stats.output_level_stats.micros;
+  job_stats_->cpu_micros = internal_stats.output_level_stats.cpu_micros;
 
   // input information
-  compaction_job_stats_->total_input_bytes =
-      stats.bytes_read_non_output_levels + stats.bytes_read_output_level;
-  compaction_job_stats_->num_input_records = stats.num_input_records;
-  compaction_job_stats_->num_input_files =
-      stats.num_input_files_in_non_output_levels +
-      stats.num_input_files_in_output_level;
-  compaction_job_stats_->num_input_files_at_output_level =
-      stats.num_input_files_in_output_level;
-  compaction_job_stats_->num_filtered_input_files =
-      stats.num_filtered_input_files_in_non_output_levels +
-      stats.num_filtered_input_files_in_output_level;
-  compaction_job_stats_->num_filtered_input_files_at_output_level =
-      stats.num_filtered_input_files_in_output_level;
-  compaction_job_stats_->total_skipped_input_bytes =
-      stats.bytes_skipped_non_output_levels + stats.bytes_skipped_output_level;
+  job_stats_->total_input_bytes =
+      internal_stats.output_level_stats.bytes_read_non_output_levels +
+      internal_stats.output_level_stats.bytes_read_output_level;
+  job_stats_->num_input_records =
+      internal_stats.output_level_stats.num_input_records;
+  job_stats_->num_input_files =
+      internal_stats.output_level_stats.num_input_files_in_non_output_levels +
+      internal_stats.output_level_stats.num_input_files_in_output_level;
+  job_stats_->num_input_files_at_output_level =
+      internal_stats.output_level_stats.num_input_files_in_output_level;
+  job_stats_->num_filtered_input_files =
+      internal_stats.output_level_stats
+          .num_filtered_input_files_in_non_output_levels +
+      internal_stats.output_level_stats
+          .num_filtered_input_files_in_output_level;
+  job_stats_->num_filtered_input_files_at_output_level =
+      internal_stats.output_level_stats
+          .num_filtered_input_files_in_output_level;
+  job_stats_->total_skipped_input_bytes =
+      internal_stats.output_level_stats.bytes_skipped_non_output_levels +
+      internal_stats.output_level_stats.bytes_skipped_output_level;
 
   // output information
-  compaction_job_stats_->total_output_bytes = stats.bytes_written;
-  compaction_job_stats_->total_output_bytes_blob = stats.bytes_written_blob;
-  compaction_job_stats_->num_output_records = stats.num_output_records;
-  compaction_job_stats_->num_output_files = stats.num_output_files;
-  compaction_job_stats_->num_output_files_blob = stats.num_output_files_blob;
-
-  if (stats.num_output_files > 0) {
+  job_stats_->total_output_bytes =
+      internal_stats.output_level_stats.bytes_written;
+  job_stats_->total_output_bytes_blob =
+      internal_stats.output_level_stats.bytes_written_blob;
+  job_stats_->num_output_records =
+      internal_stats.output_level_stats.num_output_records;
+  job_stats_->num_output_files =
+      internal_stats.output_level_stats.num_output_files;
+  job_stats_->num_output_files_blob =
+      internal_stats.output_level_stats.num_output_files_blob;
+
+  // If proximal level output exists
+  if (internal_stats.has_proximal_level_output) {
+    job_stats_->total_input_bytes +=
+        internal_stats.proximal_level_stats.bytes_read_non_output_levels +
+        internal_stats.proximal_level_stats.bytes_read_output_level;
+    job_stats_->num_input_records +=
+        internal_stats.proximal_level_stats.num_input_records;
+    job_stats_->num_input_files +=
+        internal_stats.proximal_level_stats
+            .num_input_files_in_non_output_levels +
+        internal_stats.proximal_level_stats.num_input_files_in_output_level;
+    job_stats_->num_input_files_at_output_level +=
+        internal_stats.proximal_level_stats.num_input_files_in_output_level;
+    job_stats_->num_filtered_input_files +=
+        internal_stats.proximal_level_stats
+            .num_filtered_input_files_in_non_output_levels +
+        internal_stats.proximal_level_stats
+            .num_filtered_input_files_in_output_level;
+    job_stats_->num_filtered_input_files_at_output_level +=
+        internal_stats.proximal_level_stats
+            .num_filtered_input_files_in_output_level;
+    job_stats_->total_skipped_input_bytes +=
+        internal_stats.proximal_level_stats.bytes_skipped_non_output_levels +
+        internal_stats.proximal_level_stats.bytes_skipped_output_level;
+
+    job_stats_->total_output_bytes +=
+        internal_stats.proximal_level_stats.bytes_written;
+    job_stats_->total_output_bytes_blob +=
+        internal_stats.proximal_level_stats.bytes_written_blob;
+    job_stats_->num_output_records +=
+        internal_stats.proximal_level_stats.num_output_records;
+    job_stats_->num_output_files +=
+        internal_stats.proximal_level_stats.num_output_files;
+    job_stats_->num_output_files_blob +=
+        internal_stats.proximal_level_stats.num_output_files_blob;
+  }
+
+  if (job_stats_->num_output_files > 0) {
     CopyPrefix(compact_->SmallestUserKey(),
                CompactionJobStats::kMaxPrefixLength,
-               &compaction_job_stats_->smallest_output_key_prefix);
+               &job_stats_->smallest_output_key_prefix);
     CopyPrefix(compact_->LargestUserKey(), CompactionJobStats::kMaxPrefixLength,
-               &compaction_job_stats_->largest_output_key_prefix);
+               &job_stats_->largest_output_key_prefix);
   }
 }
 
diff --git a/db/compaction/compaction_job.h b/db/compaction/compaction_job.h
index e990124d9e98..a5ab355dd037 100644
--- a/db/compaction/compaction_job.h
+++ b/db/compaction/compaction_job.h
@@ -67,7 +67,7 @@ class SubcompactionState;
 // if needed.
 //
 // CompactionJob has 2 main stats:
-// 1. CompactionJobStats compaction_job_stats_
+// 1. CompactionJobStats job_stats_
 //    CompactionJobStats is a public data structure which is part of Compaction
 //    event listener that rocksdb share the job stats with the user.
 //    Internally it's an aggregation of all the compaction_job_stats from each
@@ -81,7 +81,7 @@ class SubcompactionState;
 // +------------------------+     |
 // | CompactionJob          |     |          +------------------------+
 // |                        |     |          | SubcompactionState     |
-// |   compaction_job_stats +-----+          |                        |
+// |   job_stats            +-----+          |                        |
 // |                        |     +--------->|   compaction_job_stats |
 // |                        |     |          |                        |
 // +------------------------+     |          +------------------------+
@@ -98,16 +98,13 @@ class SubcompactionState;
 //                                +--------->+                        |
 //                                           +------------------------+
 //
-// 2. CompactionStatsFull compaction_stats_
+// 2. CompactionStatsFull internal_stats_
 //    `CompactionStatsFull` is an internal stats about the compaction, which
 //    is eventually sent to `ColumnFamilyData::internal_stats_` and used for
 //    logging and public metrics.
 //    Internally, it's an aggregation of stats_ from each `SubcompactionState`.
-//    It has 2 parts, normal stats about the main compaction information and
-//    the proximal level output stats.
-//    `SubcompactionState` maintains the CompactionOutputs for ordinary level
-//    output and the proximal level output if exists, the per_level stats is
-//    stored with the outputs.
+//    It has 2 parts, ordinary output level stats and the proximal level output
+//    stats.
 //                                                +---------------------------+
 //                                                | SubcompactionState        |
 //                                                |                           |
@@ -121,9 +118,9 @@ class SubcompactionState;
 // +--------------------------------+         |   | | CompactionOutputs    |  |
 // | CompactionJob                  |         |   | | (proximal_level)     |  |
 // |                                |    +--------->|   stats_             |  |
-// |   compaction_stats_            |    |    |   | +----------------------+  |
+// |   internal_stats_              |    |    |   | +----------------------+  |
 // |    +-------------------------+ |    |    |   |                           |
-// |    |stats (normal)           |------|----+   +---------------------------+
+// |    |output_level_stats       |------|----+   +---------------------------+
 // |    +-------------------------+ |    |    |
 // |                                |    |    |
 // |    +-------------------------+ |    |    |   +---------------------------+
@@ -199,7 +196,7 @@ class CompactionJob {
   IOStatus io_status() const { return io_status_; }
 
  protected:
-  // Update the following stats in compaction_stats_.stats
+  // Update the following stats in internal_stats_.output_level_stats
   // - num_input_files_in_non_output_levels
   // - num_input_files_in_output_level
   // - bytes_read_non_output_levels
@@ -211,11 +208,12 @@ class CompactionJob {
   // @param num_input_range_del if non-null, will be set to the number of range
   // deletion entries in this compaction input.
   //
-  // Returns true iff compaction_stats_.stats.num_input_records and
+  // Returns true iff internal_stats_.output_level_stats.num_input_records and
   // num_input_range_del are calculated successfully.
-  bool UpdateCompactionStats(uint64_t* num_input_range_del = nullptr);
-  virtual void UpdateCompactionJobStats(
-      const InternalStats::CompactionStats& stats) const;
+  bool UpdateOutputLevelCompactionStats(
+      uint64_t* num_input_range_del = nullptr);
+  void UpdateCompactionJobStats(
+      const InternalStats::CompactionStatsFull& internal_stats) const;
   void LogCompaction();
   virtual void RecordCompactionIOStats();
   void CleanupCompaction();
@@ -224,7 +222,7 @@ class CompactionJob {
   void ProcessKeyValueCompaction(SubcompactionState* sub_compact);
 
   CompactionState* compact_;
-  InternalStats::CompactionStatsFull compaction_stats_;
+  InternalStats::CompactionStatsFull internal_stats_;
   const ImmutableDBOptions& db_options_;
   const MutableDBOptions mutable_db_options_copy_;
   LogBuffer* log_buffer_;
@@ -237,7 +235,7 @@ class CompactionJob {
 
   IOStatus io_status_;
 
-  CompactionJobStats* compaction_job_stats_;
+  CompactionJobStats* job_stats_;
 
  private:
   friend class CompactionJobTestBase;
@@ -475,8 +473,21 @@ struct CompactionServiceResult {
 
   uint64_t bytes_read = 0;
   uint64_t bytes_written = 0;
+
+  // Job-level Compaction Stats.
+  //
+  // NOTE: Job level stats cannot be rebuilt from scratch by simply aggregating
+  // per-level stats due to some fields populated directly during compaction
+  // (e.g. RecordDroppedKeys()). This is why we need both job-level stats and
+  // per-level in the serialized result. If rebuilding job-level stats from
+  // per-level stats become possible in the future, consider deprecating this
+  // field.
   CompactionJobStats stats;
 
+  // Per-level Compaction Stats for both output_level_stats and
+  // proximal_level_stats
+  InternalStats::CompactionStatsFull internal_stats;
+
   // serialization interface to read and write the object
   static Status Read(const std::string& data_str, CompactionServiceResult* obj);
   Status Write(std::string* output);
@@ -522,9 +533,6 @@ class CompactionServiceCompactionJob : private CompactionJob {
  protected:
   void RecordCompactionIOStats() override;
 
-  void UpdateCompactionJobStats(
-      const InternalStats::CompactionStats& stats) const override;
-
  private:
   // Get table file name in output_path
   std::string GetTableFileName(uint64_t file_number) override;
diff --git a/db/compaction/compaction_outputs.cc b/db/compaction/compaction_outputs.cc
index d3a0c711ac67..e1eb1f449394 100644
--- a/db/compaction/compaction_outputs.cc
+++ b/db/compaction/compaction_outputs.cc
@@ -54,7 +54,7 @@ Status CompactionOutputs::Finish(
   }
   current_output().finished = true;
   stats_.bytes_written += current_bytes;
-  stats_.num_output_files = outputs_.size();
+  stats_.num_output_files = static_cast<int>(outputs_.size());
 
   return s;
 }
diff --git a/db/compaction/compaction_outputs.h b/db/compaction/compaction_outputs.h
index a95bdaaa7ab0..de9a1741492e 100644
--- a/db/compaction/compaction_outputs.h
+++ b/db/compaction/compaction_outputs.h
@@ -66,11 +66,6 @@ class CompactionOutputs {
     file_writer_.reset(writer);
   }
 
-  // TODO: Remove it when remote compaction support tiered compaction
-  void AddBytesWritten(uint64_t bytes) { stats_.bytes_written += bytes; }
-  void SetNumOutputRecords(uint64_t num) { stats_.num_output_records = num; }
-  void SetNumOutputFiles(uint64_t num) { stats_.num_output_files = num; }
-
   // TODO: Move the BlobDB builder into CompactionOutputs
   const std::vector<BlobFileAddition>& GetBlobFileAdditions() const {
     if (is_proximal_level_) {
@@ -103,7 +98,8 @@ class CompactionOutputs {
 
   void UpdateBlobStats() {
     assert(!is_proximal_level_);
-    stats_.num_output_files_blob = blob_file_additions_.size();
+    stats_.num_output_files_blob =
+        static_cast<int>(blob_file_additions_.size());
     for (const auto& blob : blob_file_additions_) {
       stats_.bytes_written_blob += blob.GetTotalBlobBytes();
     }
@@ -307,8 +303,8 @@ class CompactionOutputs {
   std::vector<BlobFileAddition> blob_file_additions_;
   std::unique_ptr<BlobGarbageMeter> blob_garbage_meter_;
 
-  // Basic compaction output stats for this level's outputs
-  InternalStats::CompactionOutputsStats stats_;
+  // Per level's output stat
+  InternalStats::CompactionStats stats_;
 
   // indicate if this CompactionOutputs obj for proximal_level, should always
   // be false if per_key_placement feature is not enabled.
diff --git a/db/compaction/compaction_service_job.cc b/db/compaction/compaction_service_job.cc
index 0b6afa10e6bb..f345942e8a17 100644
--- a/db/compaction/compaction_service_job.cc
+++ b/db/compaction/compaction_service_job.cc
@@ -249,12 +249,24 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService(
                                   false, true, file.paranoid_hash);
     compaction_outputs->UpdateTableProperties(file.table_properties);
   }
+
+  // Set per-level stats
+  auto compaction_output_stats =
+      sub_compact->OutputStats(false /* is_proximal_level */);
+  assert(compaction_output_stats);
+  compaction_output_stats->Add(
+      compaction_result.internal_stats.output_level_stats);
+  if (compaction->SupportsPerKeyPlacement()) {
+    compaction_output_stats =
+        sub_compact->OutputStats(true /* is_proximal_level */);
+    assert(compaction_output_stats);
+    compaction_output_stats->Add(
+        compaction_result.internal_stats.proximal_level_stats);
+  }
+
+  // Set job stats
   sub_compact->compaction_job_stats = compaction_result.stats;
-  sub_compact->Current().SetNumOutputRecords(
-      compaction_result.stats.num_output_records);
-  sub_compact->Current().SetNumOutputFiles(
-      compaction_result.stats.num_output_files);
-  sub_compact->Current().AddBytesWritten(compaction_result.bytes_written);
+
   RecordTick(stats_, REMOTE_COMPACT_READ_BYTES, compaction_result.bytes_read);
   RecordTick(stats_, REMOTE_COMPACT_WRITE_BYTES,
              compaction_result.bytes_written);
@@ -274,16 +286,6 @@ void CompactionServiceCompactionJob::RecordCompactionIOStats() {
   CompactionJob::RecordCompactionIOStats();
 }
 
-void CompactionServiceCompactionJob::UpdateCompactionJobStats(
-    const InternalStats::CompactionStats& stats) const {
-  // output information only in remote compaction
-  compaction_job_stats_->total_output_bytes += stats.bytes_written;
-  compaction_job_stats_->total_output_bytes_blob += stats.bytes_written_blob;
-  compaction_job_stats_->num_output_records += stats.num_output_records;
-  compaction_job_stats_->num_output_files += stats.num_output_files;
-  compaction_job_stats_->num_output_files_blob += stats.num_output_files_blob;
-}
-
 CompactionServiceCompactionJob::CompactionServiceCompactionJob(
     int job_id, Compaction* compaction, const ImmutableDBOptions& db_options,
     const MutableDBOptions& mutable_db_options, const FileOptions& file_options,
@@ -345,15 +347,14 @@ Status CompactionServiceCompactionJob::Run() {
 
   ProcessKeyValueCompaction(sub_compact);
 
-  compaction_job_stats_->elapsed_micros =
-      db_options_.clock->NowMicros() - start_micros;
-  compaction_job_stats_->cpu_micros =
-      sub_compact->compaction_job_stats.cpu_micros;
+  uint64_t elapsed_micros = db_options_.clock->NowMicros() - start_micros;
+  internal_stats_.SetMicros(elapsed_micros);
+  internal_stats_.AddCpuMicros(elapsed_micros);
 
   RecordTimeToHistogram(stats_, COMPACTION_TIME,
-                        compaction_job_stats_->elapsed_micros);
+                        internal_stats_.output_level_stats.micros);
   RecordTimeToHistogram(stats_, COMPACTION_CPU_TIME,
-                        compaction_job_stats_->cpu_micros);
+                        internal_stats_.output_level_stats.cpu_micros);
 
   Status status = sub_compact->status;
   IOStatus io_s = sub_compact->io_status;
@@ -383,28 +384,44 @@ Status CompactionServiceCompactionJob::Run() {
 
   // Build Compaction Job Stats
 
-  // 1. Aggregate CompactionOutputStats into Internal Compaction Stats
-  // (compaction_stats_) and aggregate Compaction Job Stats
-  // (compaction_job_stats_) from the sub compactions
-  compact_->AggregateCompactionStats(compaction_stats_, *compaction_job_stats_);
-
-  // 2. Update the Output information in the Compaction Job Stats with
-  // aggregated Internal Compaction Stats.
-  UpdateCompactionJobStats(compaction_stats_.stats);
-  if (compaction_stats_.has_proximal_level_output) {
-    UpdateCompactionJobStats(compaction_stats_.proximal_level_stats);
+  // 1. Aggregate internal stats and job stats for all subcompactions
+  // internal stats: sub_compact.proximal_level_outputs_.stats and
+  //                 sub_compact.compaction_outputs_.stats into
+  //                 internal_stats_.output_level_stats and
+  //                 internal_stats_.proximal_level_stats
+  // job-level stats: sub_compact.compaction_job_stats into compact.job_stats_
+  //
+  // For remote compaction, there's only one subcompaction.
+  compact_->AggregateCompactionStats(internal_stats_, *job_stats_);
+
+  // 2. Update the following stats in internal_stats_.output_level_stats
+  // - num_input_files_in_non_output_levels
+  // - num_input_files_in_output_level
+  // - bytes_read_non_output_levels
+  // - bytes_read_output_level
+  // - num_input_records
+  // - bytes_read_blob
+  // - num_dropped_records
+  uint64_t num_input_range_del = 0;
+  const bool ok = UpdateOutputLevelCompactionStats(&num_input_range_del);
+  if (status.ok() && ok && job_stats_->has_num_input_records) {
+    // TODO(jaykorean) - verify record count
+    assert(job_stats_->num_input_records > 0);
   }
 
-  // 3. Set fields that are not propagated as part of aggregations above
+  // 3. Update job-level stats with the aggregated internal_stats_
+  UpdateCompactionJobStats(internal_stats_);
+  // and set fields that are not propagated as part of the update
   compaction_result_->stats.is_manual_compaction = c->is_manual_compaction();
   compaction_result_->stats.is_full_compaction = c->is_full_compaction();
   compaction_result_->stats.is_remote_compaction = true;
 
-  // 4. Update IO Stats that are not part of the aggregations above (bytes_read,
-  // bytes_written)
+  // 4. Update IO Stats that are not part of the the update above
+  // (bytes_read, bytes_written)
   RecordCompactionIOStats();
 
   // Build Output
+  compaction_result_->internal_stats = internal_stats_;
   compaction_result_->output_level = compact_->compaction->output_level();
   compaction_result_->output_path = output_path_;
   if (status.ok()) {
@@ -724,6 +741,125 @@ static std::unordered_map<std::string, OptionTypeInfo>
           OptionTypeFlags::kNone}},
 };
 
+static std::unordered_map<std::string, OptionTypeInfo>
+    compaction_stats_type_info = {
+        {"micros",
+         {offsetof(struct InternalStats::CompactionStats, micros),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"cpu_micros",
+         {offsetof(struct InternalStats::CompactionStats, cpu_micros),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"bytes_read_non_output_levels",
+         {offsetof(struct InternalStats::CompactionStats,
+                   bytes_read_non_output_levels),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"bytes_read_output_level",
+         {offsetof(struct InternalStats::CompactionStats,
+                   bytes_read_output_level),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"bytes_skipped_non_output_levels",
+         {offsetof(struct InternalStats::CompactionStats,
+                   bytes_skipped_non_output_levels),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"bytes_skipped_output_level",
+         {offsetof(struct InternalStats::CompactionStats,
+                   bytes_skipped_output_level),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"bytes_read_blob",
+         {offsetof(struct InternalStats::CompactionStats, bytes_read_blob),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"bytes_written",
+         {offsetof(struct InternalStats::CompactionStats, bytes_written),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"bytes_written_blob",
+         {offsetof(struct InternalStats::CompactionStats, bytes_written_blob),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"bytes_moved",
+         {offsetof(struct InternalStats::CompactionStats, bytes_moved),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"num_input_files_in_non_output_levels",
+         {offsetof(struct InternalStats::CompactionStats,
+                   num_input_files_in_non_output_levels),
+          OptionType::kInt, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"num_input_files_in_output_level",
+         {offsetof(struct InternalStats::CompactionStats,
+                   num_input_files_in_output_level),
+          OptionType::kInt, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"num_filtered_input_files_in_non_output_levels",
+         {offsetof(struct InternalStats::CompactionStats,
+                   num_filtered_input_files_in_non_output_levels),
+          OptionType::kInt, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"num_filtered_input_files_in_output_level",
+         {offsetof(struct InternalStats::CompactionStats,
+                   num_filtered_input_files_in_output_level),
+          OptionType::kInt, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"num_output_files",
+         {offsetof(struct InternalStats::CompactionStats, num_output_files),
+          OptionType::kInt, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"num_output_files_blob",
+         {offsetof(struct InternalStats::CompactionStats,
+                   num_output_files_blob),
+          OptionType::kInt, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"num_input_records",
+         {offsetof(struct InternalStats::CompactionStats, num_input_records),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"num_dropped_records",
+         {offsetof(struct InternalStats::CompactionStats, num_dropped_records),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"num_output_records",
+         {offsetof(struct InternalStats::CompactionStats, num_output_records),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"count",
+         {offsetof(struct InternalStats::CompactionStats, count),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"counts", OptionTypeInfo::Array<
+                       int, static_cast<int>(CompactionReason::kNumOfReasons)>(
+                       offsetof(struct InternalStats::CompactionStats, counts),
+                       OptionVerificationType::kNormal, OptionTypeFlags::kNone,
+                       {0, OptionType::kInt})},
+};
+
+static std::unordered_map<std::string, OptionTypeInfo>
+    compaction_internal_stats_type_info = {
+        {"output_level_stats",
+         OptionTypeInfo::Struct(
+             "output_level_stats", &compaction_stats_type_info,
+             offsetof(struct InternalStats::CompactionStatsFull,
+                      output_level_stats),
+             OptionVerificationType::kNormal, OptionTypeFlags::kNone)},
+        {"has_proximal_level_output",
+         {offsetof(struct InternalStats::CompactionStatsFull,
+                   has_proximal_level_output),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
+        {"proximal_level_stats",
+         OptionTypeInfo::Struct(
+             "proximal_level_stats", &compaction_stats_type_info,
+             offsetof(struct InternalStats::CompactionStatsFull,
+                      proximal_level_stats),
+             OptionVerificationType::kNormal, OptionTypeFlags::kNone)},
+};
+
 namespace {
 // this is a helper struct to serialize and deserialize class Status, because
 // Status's members are not public.
@@ -830,6 +966,11 @@ static std::unordered_map<std::string, OptionTypeInfo> cs_result_type_info = {
                   "stats", &compaction_job_stats_type_info,
                   offsetof(struct CompactionServiceResult, stats),
                   OptionVerificationType::kNormal, OptionTypeFlags::kNone)},
+    {"internal_stats",
+     OptionTypeInfo::Struct(
+         "internal_stats", &compaction_internal_stats_type_info,
+         offsetof(struct CompactionServiceResult, internal_stats),
+         OptionVerificationType::kNormal, OptionTypeFlags::kNone)},
 };
 
 Status CompactionServiceInput::Read(const std::string& data_str,
diff --git a/db/compaction/compaction_service_test.cc b/db/compaction/compaction_service_test.cc
index af3cfa029ce7..711999262484 100644
--- a/db/compaction/compaction_service_test.cc
+++ b/db/compaction/compaction_service_test.cc
@@ -357,11 +357,12 @@ TEST_F(CompactionServiceTest, BasicCompactions) {
   } else {
     ASSERT_OK(result.status);
   }
-  ASSERT_GE(result.stats.elapsed_micros, 1);
-  ASSERT_GE(result.stats.cpu_micros, 1);
+  ASSERT_GE(result.internal_stats.output_level_stats.micros, 1);
+  ASSERT_GE(result.internal_stats.output_level_stats.cpu_micros, 1);
 
-  ASSERT_EQ(20, result.stats.num_output_records);
-  ASSERT_EQ(result.output_files.size(), result.stats.num_output_files);
+  ASSERT_EQ(20, result.internal_stats.output_level_stats.num_output_records);
+  ASSERT_EQ(result.output_files.size(),
+            result.internal_stats.output_level_stats.num_output_files);
 
   uint64_t total_size = 0;
   for (auto output_file : result.output_files) {
@@ -372,7 +373,7 @@ TEST_F(CompactionServiceTest, BasicCompactions) {
     ASSERT_GT(file_size, 0);
     total_size += file_size;
   }
-  ASSERT_EQ(total_size, result.stats.total_output_bytes);
+  ASSERT_EQ(total_size, result.internal_stats.TotalBytesWritten());
 
   ASSERT_TRUE(result.stats.is_remote_compaction);
   ASSERT_TRUE(result.stats.is_manual_compaction);
@@ -1212,10 +1213,14 @@ TEST_F(CompactionServiceTest, PrecludeLastLevel) {
   CompactionServiceResult result;
   my_cs->GetResult(&result);
   ASSERT_OK(result.status);
-  ASSERT_GT(result.stats.cpu_micros, 0);
-  ASSERT_GT(result.stats.elapsed_micros, 0);
-  ASSERT_EQ(result.stats.num_output_records, kNumTrigger * kNumKeys);
-  ASSERT_EQ(result.stats.num_output_files, 2);
+  ASSERT_GT(result.internal_stats.output_level_stats.cpu_micros, 0);
+  ASSERT_GT(result.internal_stats.output_level_stats.micros, 0);
+  ASSERT_EQ(result.internal_stats.output_level_stats.num_output_records +
+                result.internal_stats.proximal_level_stats.num_output_records,
+            kNumTrigger * kNumKeys);
+  ASSERT_EQ(result.internal_stats.output_level_stats.num_output_files +
+                result.internal_stats.proximal_level_stats.num_output_files,
+            2);
 }
 
 TEST_F(CompactionServiceTest, ConcurrentCompaction) {
diff --git a/db/compaction/compaction_state.cc b/db/compaction/compaction_state.cc
index bf016d04b694..febf2e01d1e0 100644
--- a/db/compaction/compaction_state.cc
+++ b/db/compaction/compaction_state.cc
@@ -36,11 +36,11 @@ Slice CompactionState::LargestUserKey() {
 }
 
 void CompactionState::AggregateCompactionStats(
-    InternalStats::CompactionStatsFull& compaction_stats,
-    CompactionJobStats& compaction_job_stats) {
+    InternalStats::CompactionStatsFull& internal_stats,
+    CompactionJobStats& job_stats) {
   for (const auto& sc : sub_compact_states) {
-    sc.AggregateCompactionOutputStats(compaction_stats);
-    compaction_job_stats.Add(sc.compaction_job_stats);
+    sc.AggregateCompactionOutputStats(internal_stats);
+    job_stats.Add(sc.compaction_job_stats);
   }
 }
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/db/compaction/compaction_state.h b/db/compaction/compaction_state.h
index cc5b66c68224..faad712b6ff5 100644
--- a/db/compaction/compaction_state.h
+++ b/db/compaction/compaction_state.h
@@ -29,8 +29,8 @@ class CompactionState {
   Status status;
 
   void AggregateCompactionStats(
-      InternalStats::CompactionStatsFull& compaction_stats,
-      CompactionJobStats& compaction_job_stats);
+      InternalStats::CompactionStatsFull& internal_stats,
+      CompactionJobStats& job_stats);
 
   explicit CompactionState(Compaction* c) : compaction(c) {}
 
diff --git a/db/compaction/subcompaction_state.cc b/db/compaction/subcompaction_state.cc
index 1651a3dec48f..6aab80445647 100644
--- a/db/compaction/subcompaction_state.cc
+++ b/db/compaction/subcompaction_state.cc
@@ -14,7 +14,7 @@
 
 namespace ROCKSDB_NAMESPACE {
 void SubcompactionState::AggregateCompactionOutputStats(
-    InternalStats::CompactionStatsFull& compaction_stats) const {
+    InternalStats::CompactionStatsFull& internal_stats) const {
   // Outputs should be closed. By extension, any files created just for
   // range deletes have already been written also.
   assert(compaction_outputs_.HasBuilder() == false);
@@ -26,10 +26,10 @@ void SubcompactionState::AggregateCompactionOutputStats(
   // assert(proximal_level_outputs_.stats_.num_output_files ==
   //        proximal_level_outputs_.outputs_.size());
 
-  compaction_stats.stats.Add(compaction_outputs_.stats_);
+  internal_stats.output_level_stats.Add(compaction_outputs_.stats_);
   if (proximal_level_outputs_.HasOutput()) {
-    compaction_stats.has_proximal_level_output = true;
-    compaction_stats.proximal_level_stats.Add(proximal_level_outputs_.stats_);
+    internal_stats.has_proximal_level_output = true;
+    internal_stats.proximal_level_stats.Add(proximal_level_outputs_.stats_);
   }
 }
 
diff --git a/db/compaction/subcompaction_state.h b/db/compaction/subcompaction_state.h
index 016d253566e9..fba41c974318 100644
--- a/db/compaction/subcompaction_state.h
+++ b/db/compaction/subcompaction_state.h
@@ -161,7 +161,7 @@ class SubcompactionState {
   void Cleanup(Cache* cache);
 
   void AggregateCompactionOutputStats(
-      InternalStats::CompactionStatsFull& compaction_stats) const;
+      InternalStats::CompactionStatsFull& internal_stats) const;
 
   CompactionOutputs& Current() const {
     assert(current_outputs_);
@@ -177,6 +177,16 @@ class SubcompactionState {
     return &compaction_outputs_;
   }
 
+  // Per-level stats for the output
+  InternalStats::CompactionStats* OutputStats(bool is_proximal_level) {
+    assert(compaction);
+    if (is_proximal_level) {
+      assert(compaction->SupportsPerKeyPlacement());
+      return &proximal_level_outputs_.stats_;
+    }
+    return &compaction_outputs_.stats_;
+  }
+
   CompactionRangeDelAggregator* RangeDelAgg() const {
     return range_del_agg_.get();
   }
diff --git a/db/compaction/tiered_compaction_test.cc b/db/compaction/tiered_compaction_test.cc
index a7f2e948d16d..591f8b5d3be7 100644
--- a/db/compaction/tiered_compaction_test.cc
+++ b/db/compaction/tiered_compaction_test.cc
@@ -33,42 +33,13 @@ ConfigOptions GetStrictConfigOptions() {
 class TieredCompactionTest : public DBTestBase {
  public:
   TieredCompactionTest()
-      : DBTestBase("tiered_compaction_test", /*env_do_fsync=*/true),
-        kBasicCompStats(CompactionReason::kUniversalSizeAmplification, 1),
-        kBasicPerKeyPlacementCompStats(
-            CompactionReason::kUniversalSizeAmplification, 1),
-        kBasicFlushStats(CompactionReason::kFlush, 1) {
-    kBasicCompStats.micros = kHasValue;
-    kBasicCompStats.cpu_micros = kHasValue;
-    kBasicCompStats.bytes_read_non_output_levels = kHasValue;
-    kBasicCompStats.num_input_files_in_non_output_levels = kHasValue;
-    kBasicCompStats.num_input_records = kHasValue;
-    kBasicCompStats.num_dropped_records = kHasValue;
-
-    kBasicPerLevelStats.num_output_records = kHasValue;
-    kBasicPerLevelStats.bytes_written = kHasValue;
-    kBasicPerLevelStats.num_output_files = kHasValue;
-
-    kBasicPerKeyPlacementCompStats.micros = kHasValue;
-    kBasicPerKeyPlacementCompStats.cpu_micros = kHasValue;
-    kBasicPerKeyPlacementCompStats.Add(kBasicPerLevelStats);
-
-    kBasicFlushStats.micros = kHasValue;
-    kBasicFlushStats.cpu_micros = kHasValue;
-    kBasicFlushStats.bytes_written = kHasValue;
-    kBasicFlushStats.num_output_files = kHasValue;
-  }
+      : DBTestBase("tiered_compaction_test", /*env_do_fsync=*/true) {}
 
  protected:
-  static constexpr uint8_t kHasValue = 1;
-
-  InternalStats::CompactionStats kBasicCompStats;
-  InternalStats::CompactionStats kBasicPerKeyPlacementCompStats;
-  InternalStats::CompactionOutputsStats kBasicPerLevelStats;
-  InternalStats::CompactionStats kBasicFlushStats;
-
   std::atomic_bool enable_per_key_placement = true;
 
+  CompactionJobStats job_stats;
+
   void SetUp() override {
     SyncPoint::GetInstance()->SetCallBack(
         "Compaction::SupportsPerKeyPlacement:Enabled", [&](void* arg) {
@@ -108,21 +79,35 @@ class TieredCompactionTest : public DBTestBase {
 
   // Verify the compaction stats, the stats are roughly compared
   void VerifyCompactionStats(
-      const std::vector<InternalStats::CompactionStats>& expect_stats,
-      const InternalStats::CompactionStats& expect_pl_stats) {
+      const std::vector<InternalStats::CompactionStats>& expected_stats,
+      const InternalStats::CompactionStats& expected_pl_stats,
+      size_t output_level) {
     const std::vector<InternalStats::CompactionStats>& stats =
         GetCompactionStats();
-    const size_t kLevels = expect_stats.size();
+    const size_t kLevels = expected_stats.size();
     ASSERT_EQ(kLevels, stats.size());
+    ASSERT_TRUE(output_level < kLevels);
 
-    for (auto it = stats.begin(), expect = expect_stats.begin();
-         it != stats.end(); it++, expect++) {
-      VerifyCompactionStats(*it, *expect);
+    for (size_t level = 0; level < kLevels; level++) {
+      VerifyCompactionStats(stats[level], expected_stats[level]);
     }
 
     const InternalStats::CompactionStats& pl_stats =
         GetPerKeyPlacementCompactionStats();
-    VerifyCompactionStats(pl_stats, expect_pl_stats);
+    VerifyCompactionStats(pl_stats, expected_pl_stats);
+
+    const auto& output_level_stats = stats[output_level];
+    CompactionJobStats expected_job_stats;
+    expected_job_stats.cpu_micros = output_level_stats.cpu_micros;
+    expected_job_stats.num_input_files =
+        output_level_stats.num_input_files_in_output_level +
+        output_level_stats.num_input_files_in_non_output_levels;
+    expected_job_stats.num_input_records = output_level_stats.num_input_records;
+    expected_job_stats.num_output_files =
+        output_level_stats.num_output_files + pl_stats.num_output_files;
+    expected_job_stats.num_output_records =
+        output_level_stats.num_output_records + pl_stats.num_output_records;
+    VerifyCompactionJobStats(job_stats, expected_job_stats);
   }
 
   void ResetAllStats(std::vector<InternalStats::CompactionStats>& stats,
@@ -139,42 +124,52 @@ class TieredCompactionTest : public DBTestBase {
   }
 
  private:
-  void CompareStats(uint64_t val, uint64_t expect) {
-    if (expect > 0) {
-      ASSERT_TRUE(val > 0);
-    } else {
-      ASSERT_EQ(val, 0);
-    }
-  }
-
   void VerifyCompactionStats(
       const InternalStats::CompactionStats& stats,
       const InternalStats::CompactionStats& expect_stats) {
-    CompareStats(stats.micros, expect_stats.micros);
-    CompareStats(stats.cpu_micros, expect_stats.cpu_micros);
-    CompareStats(stats.bytes_read_non_output_levels,
-                 expect_stats.bytes_read_non_output_levels);
-    CompareStats(stats.bytes_read_output_level,
-                 expect_stats.bytes_read_output_level);
-    CompareStats(stats.bytes_read_blob, expect_stats.bytes_read_blob);
-    CompareStats(stats.bytes_written, expect_stats.bytes_written);
-    CompareStats(stats.bytes_moved, expect_stats.bytes_moved);
-    CompareStats(stats.num_input_files_in_non_output_levels,
-                 expect_stats.num_input_files_in_non_output_levels);
-    CompareStats(stats.num_input_files_in_output_level,
-                 expect_stats.num_input_files_in_output_level);
-    CompareStats(stats.num_output_files, expect_stats.num_output_files);
-    CompareStats(stats.num_output_files_blob,
-                 expect_stats.num_output_files_blob);
-    CompareStats(stats.num_input_records, expect_stats.num_input_records);
-    CompareStats(stats.num_dropped_records, expect_stats.num_dropped_records);
-    CompareStats(stats.num_output_records, expect_stats.num_output_records);
+    ASSERT_EQ(stats.micros > 0, expect_stats.micros > 0);
+    ASSERT_EQ(stats.cpu_micros > 0, expect_stats.cpu_micros > 0);
+
+    // Hard to get consistent byte sizes of SST files.
+    // Use ASSERT_NEAR for comparison
+    ASSERT_NEAR(stats.bytes_read_non_output_levels * 1.0f,
+                expect_stats.bytes_read_non_output_levels * 1.0f,
+                stats.bytes_read_non_output_levels * 0.5f);
+    ASSERT_NEAR(stats.bytes_read_output_level * 1.0f,
+                expect_stats.bytes_read_output_level * 1.0f,
+                stats.bytes_read_output_level * 0.5f);
+    ASSERT_NEAR(stats.bytes_read_blob * 1.0f,
+                expect_stats.bytes_read_blob * 1.0f,
+                stats.bytes_read_blob * 0.5f);
+    ASSERT_NEAR(stats.bytes_written * 1.0f, expect_stats.bytes_written * 1.0f,
+                stats.bytes_written * 0.5f);
+
+    ASSERT_EQ(stats.bytes_moved, expect_stats.bytes_moved);
+    ASSERT_EQ(stats.num_input_files_in_non_output_levels,
+              expect_stats.num_input_files_in_non_output_levels);
+    ASSERT_EQ(stats.num_input_files_in_output_level,
+              expect_stats.num_input_files_in_output_level);
+    ASSERT_EQ(stats.num_output_files, expect_stats.num_output_files);
+    ASSERT_EQ(stats.num_output_files_blob, expect_stats.num_output_files_blob);
+    ASSERT_EQ(stats.num_input_records, expect_stats.num_input_records);
+    ASSERT_EQ(stats.num_dropped_records, expect_stats.num_dropped_records);
+    ASSERT_EQ(stats.num_output_records, expect_stats.num_output_records);
+
     ASSERT_EQ(stats.count, expect_stats.count);
     for (int i = 0; i < static_cast<int>(CompactionReason::kNumOfReasons);
          i++) {
       ASSERT_EQ(stats.counts[i], expect_stats.counts[i]);
     }
   }
+
+  void VerifyCompactionJobStats(const CompactionJobStats& stats,
+                                const CompactionJobStats& expected_stats) {
+    ASSERT_EQ(stats.cpu_micros, expected_stats.cpu_micros);
+    ASSERT_EQ(stats.num_input_files, expected_stats.num_input_files);
+    ASSERT_EQ(stats.num_input_records, expected_stats.num_input_records);
+    ASSERT_EQ(job_stats.num_output_files, expected_stats.num_output_files);
+    ASSERT_EQ(job_stats.num_output_records, expected_stats.num_output_records);
+  }
 };
 
 TEST_F(TieredCompactionTest, SequenceBasedTieredStorageUniversal) {
@@ -199,52 +194,135 @@ TEST_F(TieredCompactionTest, SequenceBasedTieredStorageUniversal) {
       [&](void* arg) {
         *static_cast<SequenceNumber*>(arg) = latest_cold_seq.load();
       });
+  SyncPoint::GetInstance()->SetCallBack(
+      "CompactionJob::Install:AfterUpdateCompactionJobStats", [&](void* arg) {
+        job_stats.Reset();
+        job_stats.Add(*(static_cast<CompactionJobStats*>(arg)));
+      });
   SyncPoint::GetInstance()->EnableProcessing();
 
   std::vector<InternalStats::CompactionStats> expect_stats(kNumLevels);
-  InternalStats::CompactionStats& last_stats = expect_stats[kLastLevel];
   InternalStats::CompactionStats expect_pl_stats;
 
+  // Put keys in the following way to create overlaps
+  // First file from 0 ~ 99
+  // Second file from 10 ~ 109
+  // ...
+  size_t bytes_per_file = 1952;
+  int total_input_key_count = kNumTrigger * kNumKeys;
+  int total_output_key_count = 130;  // 0 ~ 129
+
   for (int i = 0; i < kNumTrigger; i++) {
     for (int j = 0; j < kNumKeys; j++) {
       ASSERT_OK(Put(Key(i * 10 + j), "value" + std::to_string(i)));
     }
     ASSERT_OK(Flush());
+
     seq_history.emplace_back(dbfull()->GetLatestSequenceNumber());
-    expect_stats[0].Add(kBasicFlushStats);
+    InternalStats::CompactionStats flush_stats(CompactionReason::kFlush, 1);
+    flush_stats.cpu_micros = 1;
+    flush_stats.micros = 1;
+    flush_stats.bytes_written = bytes_per_file;
+    flush_stats.num_output_files = 1;
+    expect_stats[0].Add(flush_stats);
   }
   ASSERT_OK(dbfull()->TEST_WaitForCompact());
 
-  // the proximal level file temperature is not cold, all data are output to
-  // the proximal level.
+  // the penultimate level file temperature is not cold, all data are output to
+  // the penultimate level.
   ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
   ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
   ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
 
-  // basic compaction stats are still counted to the last level
-  expect_stats[kLastLevel].Add(kBasicCompStats);
-  expect_pl_stats.Add(kBasicPerKeyPlacementCompStats);
+  uint64_t bytes_written_penultimate_level =
+      GetPerKeyPlacementCompactionStats().bytes_written;
 
-  VerifyCompactionStats(expect_stats, expect_pl_stats);
+  // TODO - Use designated initializer when c++20 support is required
+  {
+    InternalStats::CompactionStats last_level_compaction_stats(
+        CompactionReason::kUniversalSizeAmplification, 1);
+    last_level_compaction_stats.cpu_micros = 1;
+    last_level_compaction_stats.micros = 1;
+    last_level_compaction_stats.bytes_written = 0;
+    last_level_compaction_stats.bytes_read_non_output_levels =
+        bytes_per_file * kNumTrigger;
+    last_level_compaction_stats.num_input_files_in_non_output_levels =
+        kNumTrigger;
+    last_level_compaction_stats.num_input_records = total_input_key_count;
+    last_level_compaction_stats.num_dropped_records =
+        total_input_key_count - total_output_key_count;
+    last_level_compaction_stats.num_output_records = 0;
+    last_level_compaction_stats.num_output_files = 0;
+    expect_stats[kLastLevel].Add(last_level_compaction_stats);
+  }
+  {
+    InternalStats::CompactionStats penultimate_level_compaction_stats(
+        CompactionReason::kUniversalSizeAmplification, 1);
+    penultimate_level_compaction_stats.cpu_micros = 1;
+    penultimate_level_compaction_stats.micros = 1;
+    penultimate_level_compaction_stats.bytes_written =
+        bytes_written_penultimate_level;
+    penultimate_level_compaction_stats.num_output_files = 1;
+    penultimate_level_compaction_stats.num_output_records =
+        total_output_key_count;
+    expect_pl_stats.Add(penultimate_level_compaction_stats);
+  }
+  VerifyCompactionStats(expect_stats, expect_pl_stats, kLastLevel);
 
   ResetAllStats(expect_stats, expect_pl_stats);
 
   // move forward the cold_seq to split the file into 2 levels, so should have
-  // both the last level stats and the output_to_proximal_level stats
+  // both the last level stats and the penultimate level stats
   latest_cold_seq = seq_history[0];
   ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
   ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
 
   ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
   ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
 
-  last_stats.Add(kBasicCompStats);
-  last_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
-  last_stats.Add(kBasicPerLevelStats);
-  last_stats.num_dropped_records = 0;
-  expect_pl_stats.Add(kBasicPerKeyPlacementCompStats);
-  expect_pl_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
-  VerifyCompactionStats(expect_stats, expect_pl_stats);
+  // Now update the input count to be the total count from the previous
+  total_input_key_count = total_output_key_count;
+  int moved_to_last_level_key_count = 10;
+
+  // bytes read in non output = bytes written in penultimate level from previous
+  uint64_t bytes_read_in_non_output_level = bytes_written_penultimate_level;
+  uint64_t bytes_written_output_level =
+      GetCompactionStats()[kLastLevel].bytes_written;
+
+  // Now get the new bytes written in penultimate level
+  bytes_written_penultimate_level =
+      GetPerKeyPlacementCompactionStats().bytes_written;
+  {
+    InternalStats::CompactionStats last_level_compaction_stats(
+        CompactionReason::kManualCompaction, 1);
+    last_level_compaction_stats.cpu_micros = 1;
+    last_level_compaction_stats.micros = 1;
+    last_level_compaction_stats.bytes_written = bytes_written_output_level;
+    last_level_compaction_stats.bytes_read_non_output_levels =
+        bytes_read_in_non_output_level;
+    last_level_compaction_stats.num_input_files_in_non_output_levels = 1;
+    last_level_compaction_stats.num_input_records = total_input_key_count;
+    last_level_compaction_stats.num_dropped_records =
+        total_input_key_count - total_output_key_count;
+    last_level_compaction_stats.num_output_records =
+        moved_to_last_level_key_count;
+    last_level_compaction_stats.num_output_files = 1;
+    expect_stats[kLastLevel].Add(last_level_compaction_stats);
+  }
+  {
+    InternalStats::CompactionStats penultimate_level_compaction_stats(
+        CompactionReason::kManualCompaction, 1);
+    penultimate_level_compaction_stats.cpu_micros = 1;
+    penultimate_level_compaction_stats.micros = 1;
+    penultimate_level_compaction_stats.bytes_written =
+        bytes_written_penultimate_level;
+    penultimate_level_compaction_stats.num_output_files = 1;
+    penultimate_level_compaction_stats.num_output_records =
+        total_output_key_count - moved_to_last_level_key_count;
+    expect_pl_stats.Add(penultimate_level_compaction_stats);
+  }
+  VerifyCompactionStats(expect_stats, expect_pl_stats, kLastLevel);
 
   // delete all cold data, so all data will be on proximal level
   for (int i = 0; i < 10; i++) {
@@ -255,17 +333,54 @@ TEST_F(TieredCompactionTest, SequenceBasedTieredStorageUniversal) {
   ResetAllStats(expect_stats, expect_pl_stats);
 
   ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
   ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
   ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
   ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
 
-  last_stats.Add(kBasicCompStats);
-  last_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
-  last_stats.bytes_read_output_level = kHasValue;
-  last_stats.num_input_files_in_output_level = kHasValue;
-  expect_pl_stats.Add(kBasicPerKeyPlacementCompStats);
-  expect_pl_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
-  VerifyCompactionStats(expect_stats, expect_pl_stats);
+  // 10 tombstones added
+  total_input_key_count = total_input_key_count + 10;
+  total_output_key_count = total_output_key_count - 10;
+
+  auto last_level_stats = GetCompactionStats()[kLastLevel];
+  bytes_written_penultimate_level =
+      GetPerKeyPlacementCompactionStats().bytes_written;
+
+  ASSERT_LT(bytes_written_penultimate_level,
+            last_level_stats.bytes_read_non_output_levels +
+                last_level_stats.bytes_read_output_level);
+  {
+    InternalStats::CompactionStats last_level_compaction_stats(
+        CompactionReason::kManualCompaction, 1);
+    last_level_compaction_stats.cpu_micros = 1;
+    last_level_compaction_stats.micros = 1;
+    last_level_compaction_stats.bytes_written = 0;
+    last_level_compaction_stats.bytes_read_non_output_levels =
+        last_level_stats.bytes_read_non_output_levels;
+    last_level_compaction_stats.bytes_read_output_level =
+        last_level_stats.bytes_read_output_level;
+    last_level_compaction_stats.num_input_files_in_non_output_levels = 2;
+    last_level_compaction_stats.num_input_files_in_output_level = 1;
+    last_level_compaction_stats.num_input_records = total_input_key_count;
+    last_level_compaction_stats.num_dropped_records =
+        total_input_key_count - total_output_key_count;
+    last_level_compaction_stats.num_output_records = 0;
+    last_level_compaction_stats.num_output_files = 0;
+    expect_stats[kLastLevel].Add(last_level_compaction_stats);
+  }
+  {
+    InternalStats::CompactionStats penultimate_level_compaction_stats(
+        CompactionReason::kManualCompaction, 1);
+    penultimate_level_compaction_stats.cpu_micros = 1;
+    penultimate_level_compaction_stats.micros = 1;
+    penultimate_level_compaction_stats.bytes_written =
+        bytes_written_penultimate_level;
+    penultimate_level_compaction_stats.num_output_files = 1;
+    penultimate_level_compaction_stats.num_output_records =
+        total_output_key_count;
+    expect_pl_stats.Add(penultimate_level_compaction_stats);
+  }
+  VerifyCompactionStats(expect_stats, expect_pl_stats, kLastLevel);
 
   // move forward the cold_seq again with range delete, take a snapshot to keep
   // the range dels in both cold and hot SSTs
@@ -283,12 +398,47 @@ TEST_F(TieredCompactionTest, SequenceBasedTieredStorageUniversal) {
   ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
   ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
 
-  last_stats.Add(kBasicCompStats);
-  last_stats.Add(kBasicPerLevelStats);
-  last_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
-  expect_pl_stats.Add(kBasicPerKeyPlacementCompStats);
-  expect_pl_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
-  VerifyCompactionStats(expect_stats, expect_pl_stats);
+  // Previous output + one delete range
+  total_input_key_count = total_output_key_count + 1;
+  moved_to_last_level_key_count = 20;
+
+  last_level_stats = GetCompactionStats()[kLastLevel];
+  bytes_written_penultimate_level =
+      GetPerKeyPlacementCompactionStats().bytes_written;
+  // Expected to write more in last level
+  ASSERT_GT(bytes_written_penultimate_level, last_level_stats.bytes_written);
+  {
+    InternalStats::CompactionStats last_level_compaction_stats(
+        CompactionReason::kManualCompaction, 1);
+    last_level_compaction_stats.cpu_micros = 1;
+    last_level_compaction_stats.micros = 1;
+    last_level_compaction_stats.bytes_written = last_level_stats.bytes_written;
+    last_level_compaction_stats.bytes_read_non_output_levels =
+        last_level_stats.bytes_read_non_output_levels;
+    last_level_compaction_stats.bytes_read_output_level = 0;
+    last_level_compaction_stats.num_input_files_in_non_output_levels = 2;
+    last_level_compaction_stats.num_input_files_in_output_level = 0;
+    last_level_compaction_stats.num_input_records = total_input_key_count;
+    last_level_compaction_stats.num_dropped_records =
+        1;  // delete range tombstone
+    last_level_compaction_stats.num_output_records =
+        moved_to_last_level_key_count;
+    last_level_compaction_stats.num_output_files = 1;
+    expect_stats[kLastLevel].Add(last_level_compaction_stats);
+  }
+  {
+    InternalStats::CompactionStats penultimate_level_compaction_stats(
+        CompactionReason::kManualCompaction, 1);
+    penultimate_level_compaction_stats.cpu_micros = 1;
+    penultimate_level_compaction_stats.micros = 1;
+    penultimate_level_compaction_stats.bytes_written =
+        bytes_written_penultimate_level;
+    penultimate_level_compaction_stats.num_output_files = 1;
+    penultimate_level_compaction_stats.num_output_records =
+        total_input_key_count - moved_to_last_level_key_count - 1;
+    expect_pl_stats.Add(penultimate_level_compaction_stats);
+  }
+  VerifyCompactionStats(expect_stats, expect_pl_stats, kLastLevel);
 
   // verify data
   std::string value;
@@ -341,11 +491,11 @@ TEST_F(TieredCompactionTest, SequenceBasedTieredStorageUniversal) {
 // This test was essentially for a hacked-up version on future functionality.
 // It can be resurrected if/when a form of range-based tiering is properly
 // implemented.
+// TODO - Add stats verification when adding this test back
 TEST_F(TieredCompactionTest, DISABLED_RangeBasedTieredStorageUniversal) {
   const int kNumTrigger = 4;
   const int kNumLevels = 7;
   const int kNumKeys = 100;
-  const int kLastLevel = kNumLevels - 1;
 
   auto options = CurrentOptions();
   options.compaction_style = kCompactionStyleUniversal;
@@ -371,7 +521,6 @@ TEST_F(TieredCompactionTest, DISABLED_RangeBasedTieredStorageUniversal) {
   SyncPoint::GetInstance()->EnableProcessing();
 
   std::vector<InternalStats::CompactionStats> expect_stats(kNumLevels);
-  InternalStats::CompactionStats& last_stats = expect_stats[kLastLevel];
   InternalStats::CompactionStats expect_pl_stats;
 
   for (int i = 0; i < kNumTrigger; i++) {
@@ -379,18 +528,12 @@ TEST_F(TieredCompactionTest, DISABLED_RangeBasedTieredStorageUniversal) {
       ASSERT_OK(Put(Key(j), "value" + std::to_string(j)));
     }
     ASSERT_OK(Flush());
-    expect_stats[0].Add(kBasicFlushStats);
   }
   ASSERT_OK(dbfull()->TEST_WaitForCompact());
   ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
   ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
   ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
 
-  last_stats.Add(kBasicCompStats);
-  last_stats.Add(kBasicPerLevelStats);
-  expect_pl_stats.Add(kBasicPerKeyPlacementCompStats);
-  VerifyCompactionStats(expect_stats, expect_pl_stats);
-
   ResetAllStats(expect_stats, expect_pl_stats);
 
   // change to all cold, no output_to_proximal_level output
@@ -404,14 +547,6 @@ TEST_F(TieredCompactionTest, DISABLED_RangeBasedTieredStorageUniversal) {
   ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
   ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
 
-  last_stats.Add(kBasicCompStats);
-  last_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
-  last_stats.Add(kBasicPerLevelStats);
-  last_stats.num_dropped_records = 0;
-  last_stats.bytes_read_output_level = kHasValue;
-  last_stats.num_input_files_in_output_level = kHasValue;
-  VerifyCompactionStats(expect_stats, expect_pl_stats);
-
   // change to all hot, universal compaction support moving data to up level if
   // it's within compaction level range.
   {
@@ -890,6 +1025,8 @@ TEST_F(TieredCompactionTest, SequenceBasedTieredStorageLevel) {
   const int kNumKeys = 100;
   const int kLastLevel = kNumLevels - 1;
 
+  int output_level = 0;
+
   auto options = CurrentOptions();
   SetColdTemperature(options);
   options.level0_file_num_compaction_trigger = kNumTrigger;
@@ -906,18 +1043,40 @@ TEST_F(TieredCompactionTest, SequenceBasedTieredStorageLevel) {
       [&](void* arg) {
         *static_cast<SequenceNumber*>(arg) = latest_cold_seq.load();
       });
+  SyncPoint::GetInstance()->SetCallBack(
+      "CompactionJob::Install:AfterUpdateCompactionJobStats", [&](void* arg) {
+        job_stats.Reset();
+        job_stats.Add(*(static_cast<CompactionJobStats*>(arg)));
+      });
+  SyncPoint::GetInstance()->SetCallBack(
+      "CompactionJob::ProcessKeyValueCompaction()::Processing", [&](void* arg) {
+        auto compaction = static_cast<Compaction*>(arg);
+        output_level = compaction->output_level();
+      });
   SyncPoint::GetInstance()->EnableProcessing();
 
   std::vector<InternalStats::CompactionStats> expect_stats(kNumLevels);
-  InternalStats::CompactionStats& last_stats = expect_stats[kLastLevel];
   InternalStats::CompactionStats expect_pl_stats;
 
+  // Put keys in the following way to create overlaps
+  // First file from 0 ~ 99
+  // Second file from 10 ~ 109
+  // ...
+  size_t bytes_per_file = 1952;
+  int total_input_key_count = kNumTrigger * kNumKeys;
+  int total_output_key_count = 130;  // 0 ~ 129
+
   for (int i = 0; i < kNumTrigger; i++) {
     for (int j = 0; j < kNumKeys; j++) {
       ASSERT_OK(Put(Key(i * 10 + j), "value" + std::to_string(i)));
     }
     ASSERT_OK(Flush());
-    expect_stats[0].Add(kBasicFlushStats);
+    InternalStats::CompactionStats flush_stats(CompactionReason::kFlush, 1);
+    flush_stats.cpu_micros = 1;
+    flush_stats.micros = 1;
+    flush_stats.bytes_written = bytes_per_file;
+    flush_stats.num_output_files = 1;
+    expect_stats[0].Add(flush_stats);
   }
   ASSERT_OK(dbfull()->TEST_WaitForCompact());
 
@@ -926,10 +1085,30 @@ TEST_F(TieredCompactionTest, SequenceBasedTieredStorageLevel) {
   ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
   ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
 
-  expect_stats[1].Add(kBasicCompStats);
-  expect_stats[1].Add(kBasicPerLevelStats);
-  expect_stats[1].ResetCompactionReason(CompactionReason::kLevelL0FilesNum);
-  VerifyCompactionStats(expect_stats, expect_pl_stats);
+  uint64_t bytes_written_output_level =
+      GetCompactionStats()[output_level].bytes_written;
+  ASSERT_GT(bytes_written_output_level, 0);
+
+  {
+    InternalStats::CompactionStats output_level_compaction_stats(
+        CompactionReason::kLevelL0FilesNum, 1);
+    output_level_compaction_stats.cpu_micros = 1;
+    output_level_compaction_stats.micros = 1;
+    output_level_compaction_stats.bytes_written = bytes_written_output_level;
+    output_level_compaction_stats.bytes_read_non_output_levels =
+        bytes_per_file * kNumTrigger;
+    output_level_compaction_stats.bytes_read_output_level = 0;
+    output_level_compaction_stats.num_input_files_in_non_output_levels =
+        kNumTrigger;
+    output_level_compaction_stats.num_input_files_in_output_level = 0;
+    output_level_compaction_stats.num_input_records = total_input_key_count;
+    output_level_compaction_stats.num_dropped_records =
+        total_input_key_count - total_output_key_count;
+    output_level_compaction_stats.num_output_records = total_output_key_count;
+    output_level_compaction_stats.num_output_files = 1;
+    expect_stats[output_level].Add(output_level_compaction_stats);
+  }
+  VerifyCompactionStats(expect_stats, expect_pl_stats, output_level);
 
   // move all data to the last level
   MoveFilesToLevel(kLastLevel);
@@ -944,15 +1123,26 @@ TEST_F(TieredCompactionTest, SequenceBasedTieredStorageLevel) {
   ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
   ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
 
-  last_stats.Add(kBasicCompStats);
-  last_stats.Add(kBasicPerLevelStats);
-  last_stats.num_dropped_records = 0;
-  last_stats.bytes_read_non_output_levels = 0;
-  last_stats.num_input_files_in_non_output_levels = 0;
-  last_stats.bytes_read_output_level = kHasValue;
-  last_stats.num_input_files_in_output_level = kHasValue;
-  last_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
-  VerifyCompactionStats(expect_stats, expect_pl_stats);
+  total_input_key_count = total_output_key_count;
+  {
+    InternalStats::CompactionStats output_level_compaction_stats(
+        CompactionReason::kManualCompaction, 1);
+    output_level_compaction_stats.cpu_micros = 1;
+    output_level_compaction_stats.micros = 1;
+    output_level_compaction_stats.bytes_written = bytes_written_output_level;
+    output_level_compaction_stats.bytes_read_non_output_levels = 0;
+    output_level_compaction_stats.bytes_read_output_level =
+        bytes_written_output_level;
+    output_level_compaction_stats.num_input_files_in_non_output_levels = 0;
+    output_level_compaction_stats.num_input_files_in_output_level = 1;
+    output_level_compaction_stats.num_input_records = total_input_key_count;
+    output_level_compaction_stats.num_dropped_records =
+        total_input_key_count - total_output_key_count;
+    output_level_compaction_stats.num_output_records = total_output_key_count;
+    output_level_compaction_stats.num_output_files = 1;
+    expect_stats[output_level].Add(output_level_compaction_stats);
+  }
+  VerifyCompactionStats(expect_stats, expect_pl_stats, output_level);
 
   // Add new data, which is all hot and overriding all existing data
   latest_cold_seq = dbfull()->GetLatestSequenceNumber();
@@ -976,17 +1166,47 @@ TEST_F(TieredCompactionTest, SequenceBasedTieredStorageLevel) {
   ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
   ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
 
+  uint64_t bytes_written_in_proximal_level =
+      GetPerKeyPlacementCompactionStats().bytes_written;
   for (int level = 2; level < kNumLevels - 1; level++) {
-    expect_stats[level].bytes_moved = kHasValue;
+    expect_stats[level].bytes_moved = bytes_written_in_proximal_level;
   }
 
-  last_stats.Add(kBasicCompStats);
-  last_stats.bytes_read_output_level = kHasValue;
-  last_stats.num_input_files_in_output_level = kHasValue;
-  last_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
-  expect_pl_stats.Add(kBasicPerKeyPlacementCompStats);
-  expect_pl_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
-  VerifyCompactionStats(expect_stats, expect_pl_stats);
+  // Another set of 130 keys + from the previous
+  total_input_key_count = total_output_key_count + 130;
+  // Merged into 130
+  total_output_key_count = 130;
+
+  {
+    InternalStats::CompactionStats output_level_compaction_stats(
+        CompactionReason::kManualCompaction, 1);
+    output_level_compaction_stats.cpu_micros = 1;
+    output_level_compaction_stats.micros = 1;
+    output_level_compaction_stats.bytes_written = 0;
+    output_level_compaction_stats.bytes_read_non_output_levels =
+        bytes_written_in_proximal_level;
+    output_level_compaction_stats.bytes_read_output_level =
+        bytes_written_output_level;
+    output_level_compaction_stats.num_input_files_in_non_output_levels = 1;
+    output_level_compaction_stats.num_input_files_in_output_level = 1;
+    output_level_compaction_stats.num_input_records = total_input_key_count;
+    output_level_compaction_stats.num_dropped_records =
+        total_input_key_count - total_output_key_count;
+    output_level_compaction_stats.num_output_records = 0;
+    output_level_compaction_stats.num_output_files = 0;
+    expect_stats[output_level].Add(output_level_compaction_stats);
+  }
+  {
+    InternalStats::CompactionStats proximal_level_compaction_stats(
+        CompactionReason::kManualCompaction, 1);
+    expect_pl_stats.cpu_micros = 1;
+    expect_pl_stats.micros = 1;
+    expect_pl_stats.bytes_written = bytes_written_in_proximal_level;
+    expect_pl_stats.num_output_files = 1;
+    expect_pl_stats.num_output_records = total_output_key_count;
+    expect_pl_stats.Add(proximal_level_compaction_stats);
+  }
+  VerifyCompactionStats(expect_stats, expect_pl_stats, output_level);
 
   // move forward the cold_seq, try to split the data into cold and hot, but in
   // this case it's unsafe to split the data
diff --git a/db/internal_stats.h b/db/internal_stats.h
index 01c4c4bd7184..35dc42a98a49 100644
--- a/db/internal_stats.h
+++ b/db/internal_stats.h
@@ -153,23 +153,6 @@ class InternalStats {
 
   InternalStats(int num_levels, SystemClock* clock, ColumnFamilyData* cfd);
 
-  // Per level compaction stats
-  struct CompactionOutputsStats {
-    uint64_t num_output_records = 0;
-    uint64_t bytes_written = 0;
-    uint64_t bytes_written_blob = 0;
-    uint64_t num_output_files = 0;
-    uint64_t num_output_files_blob = 0;
-
-    void Add(const CompactionOutputsStats& stats) {
-      this->num_output_records += stats.num_output_records;
-      this->bytes_written += stats.bytes_written;
-      this->bytes_written_blob += stats.bytes_written_blob;
-      this->num_output_files += stats.num_output_files;
-      this->num_output_files_blob += stats.num_output_files_blob;
-    }
-  };
-
   // Per level compaction stats.  comp_stats_[level] stores the stats for
   // compactions that produced data for the specified "level".
   struct CompactionStats {
@@ -420,15 +403,6 @@ class InternalStats {
       }
     }
 
-    void Add(const CompactionOutputsStats& stats) {
-      this->num_output_files += static_cast<int>(stats.num_output_files);
-      this->num_output_records += stats.num_output_records;
-      this->bytes_written += stats.bytes_written;
-      this->bytes_written_blob += stats.bytes_written_blob;
-      this->num_output_files_blob +=
-          static_cast<int>(stats.num_output_files_blob);
-    }
-
     void Subtract(const CompactionStats& c) {
       this->micros -= c.micros;
       this->cpu_micros -= c.cpu_micros;
@@ -473,23 +447,25 @@ class InternalStats {
     }
   };
 
-  // Compaction stats, for per_key_placement compaction, it includes 2 levels
-  // stats: the last level and the proximal level.
+  // Compaction internal stats, for per_key_placement compaction, it includes 2
+  // levels stats: the last level and the proximal level.
   struct CompactionStatsFull {
     // the stats for the target primary output level
-    CompactionStats stats;
+    CompactionStats output_level_stats;
 
     // stats for proximal level output if exist
     bool has_proximal_level_output = false;
     CompactionStats proximal_level_stats;
 
-    explicit CompactionStatsFull() : stats(), proximal_level_stats() {}
+    explicit CompactionStatsFull()
+        : output_level_stats(), proximal_level_stats() {}
 
     explicit CompactionStatsFull(CompactionReason reason, int c)
-        : stats(reason, c), proximal_level_stats(reason, c) {}
+        : output_level_stats(reason, c), proximal_level_stats(reason, c) {}
 
     uint64_t TotalBytesWritten() const {
-      uint64_t bytes_written = stats.bytes_written + stats.bytes_written_blob;
+      uint64_t bytes_written = output_level_stats.bytes_written +
+                               output_level_stats.bytes_written_blob;
       if (has_proximal_level_output) {
         bytes_written += proximal_level_stats.bytes_written +
                          proximal_level_stats.bytes_written_blob;
@@ -498,23 +474,23 @@ class InternalStats {
     }
 
     uint64_t DroppedRecords() {
-      uint64_t output_records = stats.num_output_records;
+      uint64_t output_records = output_level_stats.num_output_records;
       if (has_proximal_level_output) {
         output_records += proximal_level_stats.num_output_records;
       }
-      if (stats.num_input_records > output_records) {
-        return stats.num_input_records - output_records;
+      if (output_level_stats.num_input_records > output_records) {
+        return output_level_stats.num_input_records - output_records;
       }
       return 0;
     }
 
     void SetMicros(uint64_t val) {
-      stats.micros = val;
+      output_level_stats.micros = val;
       proximal_level_stats.micros = val;
     }
 
     void AddCpuMicros(uint64_t val) {
-      stats.cpu_micros += val;
+      output_level_stats.cpu_micros += val;
       proximal_level_stats.cpu_micros += val;
     }
   };
@@ -587,7 +563,7 @@ class InternalStats {
 
   void AddCompactionStats(int level, Env::Priority thread_pri,
                           const CompactionStatsFull& comp_stats_full) {
-    AddCompactionStats(level, thread_pri, comp_stats_full.stats);
+    AddCompactionStats(level, thread_pri, comp_stats_full.output_level_stats);
     if (comp_stats_full.has_proximal_level_output) {
       per_key_placement_comp_stats_.Add(comp_stats_full.proximal_level_stats);
     }
diff --git a/unreleased_history/bug_fixes/stats_fix_for_tiered_storage.md b/unreleased_history/bug_fixes/stats_fix_for_tiered_storage.md
new file mode 100644
index 000000000000..3da1236c899d
--- /dev/null
+++ b/unreleased_history/bug_fixes/stats_fix_for_tiered_storage.md
@@ -0,0 +1 @@
+Fixed stats for Tiered Storage with preclude_last_level feature

From 0a43d8a261b9c633c0a4e369b1ef33aa5ee32810 Mon Sep 17 00:00:00 2001
From: Jay Huh <jewoongh@meta.com>
Date: Tue, 18 Mar 2025 18:40:33 -0700
Subject: [PATCH 025/500] Verify compaction output record count (#13455)

Summary:
Continuing cbi42 's work in 602cc0f9a4be89020fb870dba2816f11dd515d16.

In this PR, we are adding record count verification for each compaction by comparing number of entries summed from Table Properties with the number of output records from the compaction stats.

If the count does not match, `Status::Corruption(msg)` is returned with detailed message including the actual number (from table property) and the expected number (from compaction stats)

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13455

Test Plan:
New UT added
```
./db_compaction_test -- --gtest_filter="*Verify*"
```

The check had to be disabled for some of the existing tests using MockTable/MockTableFactory, because TableProperties aren't populated properly for the MockTables.

Reviewed By: hx235

Differential Revision: D71235790

Pulled By: jaykorean

fbshipit-source-id: 3a86a878d13e79d948409d6a9843d1c992d2c98e
---
 db/compaction/compaction_job.cc               | 32 +++++-
 db/compaction/compaction_job_test.cc          |  5 +
 db/corruption_test.cc                         |  4 +-
 db/db_compaction_test.cc                      | 99 ++++++++++++++++++-
 db/db_test.cc                                 |  1 +
 .../block_based/block_based_table_builder.cc  |  5 +
 table/plain/plain_table_builder.cc            |  8 ++
 utilities/transactions/transaction_test.cc    |  3 +
 8 files changed, 154 insertions(+), 3 deletions(-)

diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc
index 94b7b102d8bf..9b0762e139b9 100644
--- a/db/compaction/compaction_job.cc
+++ b/db/compaction/compaction_job.cc
@@ -839,7 +839,6 @@ Status CompactionJob::Run() {
   TEST_SYNC_POINT("CompactionJob::ReleaseSubcompactionResources:0");
   TEST_SYNC_POINT("CompactionJob::ReleaseSubcompactionResources:1");
 
-  TablePropertiesCollection tp;
   for (const auto& state : compact_->sub_compact_states) {
     for (const auto& output : state.GetOutputs()) {
       auto fn =
@@ -902,6 +901,37 @@ Status CompactionJob::Run() {
     }
   }
 
+  // Verify number of output records
+  if (status.ok() && db_options_.compaction_verify_record_count) {
+    uint64_t total_output_num = 0;
+    for (const auto& state : compact_->sub_compact_states) {
+      for (const auto& output : state.GetOutputs()) {
+        total_output_num += output.table_properties->num_entries -
+                            output.table_properties->num_range_deletions;
+      }
+    }
+
+    uint64_t expected = internal_stats_.output_level_stats.num_output_records;
+    if (internal_stats_.has_proximal_level_output) {
+      expected += internal_stats_.proximal_level_stats.num_output_records;
+    }
+    if (expected != total_output_num) {
+      char scratch[2345];
+      compact_->compaction->Summary(scratch, sizeof(scratch));
+      std::string msg =
+          "Number of keys in compaction output SST files does not match "
+          "number of keys added. Expected " +
+          std::to_string(expected) + " but there are " +
+          std::to_string(total_output_num) +
+          " in output SST files. Compaction summary: " + scratch;
+      ROCKS_LOG_WARN(
+          db_options_.info_log, "[%s] [JOB %d] Compaction with status: %s",
+          compact_->compaction->column_family_data()->GetName().c_str(),
+          job_context_->job_id, msg.c_str());
+      status = Status::Corruption(msg);
+    }
+  }
+
   RecordCompactionIOStats();
   LogFlush(db_options_.info_log);
   TEST_SYNC_POINT("CompactionJob::Run():End");
diff --git a/db/compaction/compaction_job_test.cc b/db/compaction/compaction_job_test.cc
index b7afc07b996c..36a4e5f0430a 100644
--- a/db/compaction/compaction_job_test.cc
+++ b/db/compaction/compaction_job_test.cc
@@ -232,6 +232,11 @@ class CompactionJobTestBase : public testing::Test {
     // set default for the tests
     mutable_cf_options_.target_file_size_base = 1024 * 1024;
     mutable_cf_options_.max_compaction_bytes = 10 * 1024 * 1024;
+
+    // Turn off compaction_verify_record_count MockTables
+    if (table_type == TableTypeForTest::kMockTable) {
+      db_options_.compaction_verify_record_count = false;
+    }
   }
 
   void SetUp() override {
diff --git a/db/corruption_test.cc b/db/corruption_test.cc
index e99612c2b8a3..e20cd20df65f 100644
--- a/db/corruption_test.cc
+++ b/db/corruption_test.cc
@@ -851,6 +851,9 @@ TEST_F(CorruptionTest, ParanoidFileChecksOnCompact) {
   options.env = env_.get();
   options.paranoid_file_checks = true;
   options.create_if_missing = true;
+  // Skip verifying record count against TableProperties for
+  // MockTables
+  options.compaction_verify_record_count = false;
   Status s;
   for (const auto& mode : corruption_modes) {
     delete db_;
@@ -863,7 +866,6 @@ TEST_F(CorruptionTest, ParanoidFileChecksOnCompact) {
     ASSERT_OK(DB::Open(options, dbname_, &db_));
     assert(db_ != nullptr);  // suppress false clang-analyze report
     Build(100, 2);
-    // ASSERT_OK(db_->Flush(FlushOptions()));
     DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
     ASSERT_OK(dbi->TEST_FlushMemTable());
     mock->SetCorruptionMode(mode);
diff --git a/db/db_compaction_test.cc b/db/db_compaction_test.cc
index 67303416878b..80269f374dc1 100644
--- a/db/db_compaction_test.cc
+++ b/db/db_compaction_test.cc
@@ -10503,7 +10503,7 @@ TEST_F(DBCompactionTest, NumberOfSubcompactions) {
   }
 }
 
-TEST_F(DBCompactionTest, VerifyRecordCount) {
+TEST_F(DBCompactionTest, VerifyInputRecordCount) {
   Options options = CurrentOptions();
   options.compaction_style = kCompactionStyleLevel;
   options.level0_file_num_compaction_trigger = 3;
@@ -10541,6 +10541,103 @@ TEST_F(DBCompactionTest, VerifyRecordCount) {
   ASSERT_TRUE(std::strstr(s.getState(), expect));
 }
 
+TEST_F(DBCompactionTest, VerifyOutputRecordCountBlockBasedTable) {
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleLevel;
+  options.level0_file_num_compaction_trigger = 3;
+  options.compaction_verify_record_count = true;
+  DestroyAndReopen(options);
+  Random rnd(301);
+
+  // Create 2 overlapping L0 files
+  for (int i = 1; i < 20; i += 2) {
+    ASSERT_OK(Put(Key(i), rnd.RandomString(100)));
+  }
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), Key(10), Key(15)));
+
+  for (int i = 0; i < 20; i += 2) {
+    ASSERT_OK(Put(Key(i), rnd.RandomString(100)));
+  }
+  ASSERT_OK(Flush());
+
+  // Skip adding every 7th key in the output table
+  int num_iter = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "BlockBasedTableBuilder::Add::skip", [&](void* skip) {
+        num_iter++;
+        if (num_iter % 7 == 0) {
+          *(bool*)skip = true;
+        }
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  Status s = db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+  ASSERT_TRUE(s.IsCorruption());
+  const char* expect =
+      "Number of keys in compaction output SST files does not match number of "
+      "keys added.";
+  ASSERT_TRUE(std::strstr(s.getState(), expect));
+}
+
+TEST_F(DBCompactionTest, VerifyOutputRecordCountPlainTable) {
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleLevel;
+  options.level0_file_num_compaction_trigger = 3;
+  options.compaction_verify_record_count = true;
+
+  PlainTableOptions plain_table_options;
+  plain_table_options.user_key_len = 0;
+  plain_table_options.bloom_bits_per_key = 2;
+  plain_table_options.hash_table_ratio = 0.8;
+  plain_table_options.index_sparseness = 3;
+  plain_table_options.huge_page_tlb_size = 0;
+  plain_table_options.encoding_type = kPrefix;
+  plain_table_options.full_scan_mode = false;
+  plain_table_options.store_index_in_file = false;
+
+  options.table_factory.reset(NewPlainTableFactory(plain_table_options));
+  options.memtable_factory.reset(NewHashLinkListRepFactory(4, 0, 3, true));
+
+  options.prefix_extractor.reset(NewFixedPrefixTransform(8));
+  options.allow_mmap_reads = false;
+  options.allow_concurrent_memtable_write = false;
+  options.unordered_write = false;
+
+  DestroyAndReopen(options);
+  Random rnd(301);
+
+  // Create 2 overlapping L0 files
+  for (int i = 1; i < 20; i += 2) {
+    ASSERT_OK(Put(Key(i), rnd.RandomString(100)));
+  }
+  ASSERT_OK(Flush());
+
+  for (int i = 0; i < 20; i += 2) {
+    ASSERT_OK(Put(Key(i), rnd.RandomString(100)));
+  }
+  ASSERT_OK(Flush());
+
+  // Skip adding every 7th key in the output table
+  int num_iter = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "PlainTableBuilder::Add::skip", [&](void* skip) {
+        num_iter++;
+        if (num_iter % 7 == 0) {
+          *(bool*)skip = true;
+        }
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  Status s = db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+  ASSERT_TRUE(s.IsCorruption());
+  const char* expect =
+      "Number of keys in compaction output SST files does not match number of "
+      "keys added.";
+  ASSERT_TRUE(std::strstr(s.getState(), expect));
+}
+
 TEST_F(DBCompactionTest, ErrorWhenReadFileHead) {
   // This is to test a bug that is fixed in
   // https://github.com/facebook/rocksdb/pull/11782.
diff --git a/db/db_test.cc b/db/db_test.cc
index 875ca64d29ee..e30f2dd95aaf 100644
--- a/db/db_test.cc
+++ b/db/db_test.cc
@@ -5455,6 +5455,7 @@ TEST_F(DBTest, DynamicLevelCompressionPerLevel2) {
   options.max_bytes_for_level_multiplier = 8;
   options.max_background_compactions = 1;
   options.num_levels = 5;
+  options.compaction_verify_record_count = false;
   std::shared_ptr<mock::MockTableFactory> mtf(new mock::MockTableFactory);
   options.table_factory = mtf;
 
diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc
index 654317d9da95..ad85daa5a6c4 100644
--- a/table/block_based/block_based_table_builder.cc
+++ b/table/block_based/block_based_table_builder.cc
@@ -1018,6 +1018,11 @@ void BlockBasedTableBuilder::Add(const Slice& ikey, const Slice& value) {
     if (r->props.num_entries > r->props.num_range_deletions) {
       assert(r->internal_comparator.Compare(ikey, Slice(r->last_ikey)) > 0);
     }
+    bool skip = false;
+    TEST_SYNC_POINT_CALLBACK("BlockBasedTableBuilder::Add::skip", (void*)&skip);
+    if (skip) {
+      return;
+    }
 #endif  // !NDEBUG
 
     auto should_flush = r->flush_block_policy->Update(ikey, value);
diff --git a/table/plain/plain_table_builder.cc b/table/plain/plain_table_builder.cc
index 541b4a5b768a..9c4f87553774 100644
--- a/table/plain/plain_table_builder.cc
+++ b/table/plain/plain_table_builder.cc
@@ -151,6 +151,14 @@ void PlainTableBuilder::Add(const Slice& key, const Slice& value) {
     return;
   }
 
+#ifndef NDEBUG
+  bool skip = false;
+  TEST_SYNC_POINT_CALLBACK("PlainTableBuilder::Add::skip", (void*)&skip);
+  if (skip) {
+    return;
+  }
+#endif  // !NDEBUG
+
   // Store key hash
   if (store_index_in_file_) {
     if (moptions_.prefix_extractor == nullptr) {
diff --git a/utilities/transactions/transaction_test.cc b/utilities/transactions/transaction_test.cc
index 97b74f9bce68..5a465800e685 100644
--- a/utilities/transactions/transaction_test.cc
+++ b/utilities/transactions/transaction_test.cc
@@ -2558,6 +2558,9 @@ TEST_P(TransactionTest, FlushTest2) {
       case 0:
         break;
       case 1:
+        // Skip verifying record count against TableProperties for
+        // MockTables
+        options.compaction_verify_record_count = false;
         options.table_factory.reset(new mock::MockTableFactory());
         break;
       case 2: {

From 2e175124d8851225160a7beb506b6e6a767554e0 Mon Sep 17 00:00:00 2001
From: Hui Xiao <huixiao@fb.com>
Date: Wed, 19 Mar 2025 12:08:06 -0700
Subject: [PATCH 026/500] Rename Env::IOActivity::kReadManifest (#13471)

Summary:
Context/Summary: as mentioned in the [comment](https://github.com/facebook/rocksdb/pull/13178?fbclid=IwZXh0bgNhZW0CMTAAAR1nvz-1Ifh6Pm8PwFZbGHAxhLtwfi4W_XaSe-BqnBx3ICJOq-9DTdqFvs0_aem_ITO_0B6cca0kTViRmsAA8g#issuecomment-2702510373) , we want to rename this public name to align with the naming convention.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13471

Test Plan:
- Compilation
- Manually check for no internal usage of this name. Hopefully it's good for OSS as well as this field is relatively new and the whole IOActivity is marked "EXPERIMENTAL"

Reviewed By: mszeszko-meta

Differential Revision: D71485300

Pulled By: hx235

fbshipit-source-id: 318c8b6c2a4d874f2f831e3ca690aa2fb8974c0f
---
 db/experimental.cc                     | 3 ++-
 env/env.cc                             | 4 ++--
 include/rocksdb/env.h                  | 2 +-
 include/rocksdb/thread_status.h        | 2 +-
 monitoring/thread_status_util_debug.cc | 5 +++--
 util/thread_operation.h                | 3 ++-
 6 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/db/experimental.cc b/db/experimental.cc
index 3691cfe8f741..2193342e056f 100644
--- a/db/experimental.cc
+++ b/db/experimental.cc
@@ -57,7 +57,8 @@ Status GetFileChecksumsFromCurrentManifest(FileSystem* fs,
   }
   assert(checksum_list);
 
-  const ReadOptions read_options(Env::IOActivity::kReadManifest);
+  const ReadOptions read_options(
+      Env::IOActivity::kGetFileChecksumsFromCurrentManifest);
   checksum_list->reset();
 
   std::unique_ptr<SequentialFileReader> file_reader;
diff --git a/env/env.cc b/env/env.cc
index d392eb036a52..8326c5619346 100644
--- a/env/env.cc
+++ b/env/env.cc
@@ -754,8 +754,8 @@ std::string Env::IOActivityToString(IOActivity activity) {
       return "GetEntity";
     case Env::IOActivity::kMultiGetEntity:
       return "MultiGetEntity";
-    case Env::IOActivity::kReadManifest:
-      return "ReadManifest";
+    case Env::IOActivity::kGetFileChecksumsFromCurrentManifest:
+      return "GetFileChecksumsFromCurrentManifest";
     case Env::IOActivity::kUnknown:
       return "Unknown";
   };
diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h
index dfd6789a40c6..648c7bdf36c6 100644
--- a/include/rocksdb/env.h
+++ b/include/rocksdb/env.h
@@ -455,7 +455,7 @@ class Env : public Customizable {
     kVerifyFileChecksums = 7,
     kGetEntity = 8,
     kMultiGetEntity = 9,
-    kReadManifest = 10,
+    kGetFileChecksumsFromCurrentManifest = 10,
     kUnknown,  // Keep last for easy array of non-unknowns
   };
 
diff --git a/include/rocksdb/thread_status.h b/include/rocksdb/thread_status.h
index 880b0bd4fa20..3c4bbe9a01ad 100644
--- a/include/rocksdb/thread_status.h
+++ b/include/rocksdb/thread_status.h
@@ -64,7 +64,7 @@ struct ThreadStatus {
     OP_VERIFY_FILE_CHECKSUMS,
     OP_GETENTITY,
     OP_MULTIGETENTITY,
-    OP_READ_MANIFEST,
+    OP_GET_FILE_CHECKSUMS_FROM_CURRENT_MANIFEST,
     NUM_OP_TYPES
   };
 
diff --git a/monitoring/thread_status_util_debug.cc b/monitoring/thread_status_util_debug.cc
index a8233f78c623..7b6211bb5448 100644
--- a/monitoring/thread_status_util_debug.cc
+++ b/monitoring/thread_status_util_debug.cc
@@ -50,8 +50,9 @@ Env::IOActivity ThreadStatusUtil::TEST_GetExpectedIOActivity(
       return Env::IOActivity::kGetEntity;
     case ThreadStatus::OperationType::OP_MULTIGETENTITY:
       return Env::IOActivity::kMultiGetEntity;
-    case ThreadStatus::OperationType::OP_READ_MANIFEST:
-      return Env::IOActivity::kReadManifest;
+    case ThreadStatus::OperationType::
+        OP_GET_FILE_CHECKSUMS_FROM_CURRENT_MANIFEST:
+      return Env::IOActivity::kGetFileChecksumsFromCurrentManifest;
     default:
       return Env::IOActivity::kUnknown;
   }
diff --git a/util/thread_operation.h b/util/thread_operation.h
index 7d906572615d..84911ddc82ff 100644
--- a/util/thread_operation.h
+++ b/util/thread_operation.h
@@ -47,7 +47,8 @@ static OperationInfo global_operation_table[] = {
     {ThreadStatus::OP_VERIFY_FILE_CHECKSUMS, "VerifyFileChecksums"},
     {ThreadStatus::OP_GETENTITY, "GetEntity"},
     {ThreadStatus::OP_MULTIGETENTITY, "MultiGetEntity"},
-    {ThreadStatus::OP_READ_MANIFEST, "ReadManifest"},
+    {ThreadStatus::OP_GET_FILE_CHECKSUMS_FROM_CURRENT_MANIFEST,
+     "GetFileChecksumsFromCurrentManifest"},
 
 };
 

From 12829883d7a9b9412217146e20af1e9cb79fc861 Mon Sep 17 00:00:00 2001
From: Jay Huh <jewoongh@meta.com>
Date: Thu, 20 Mar 2025 13:18:48 -0700
Subject: [PATCH 027/500] Fix CompactionStats when max_sub_compaction > 1
 (#13470)

Summary:
## Issue

Thanks to PRs https://github.com/facebook/rocksdb/issues/13455 and https://github.com/facebook/rocksdb/issues/13464 , we were able to find another issue with compaction stats.

When there are multiple sub-compactions and they are processed remotely, some compaction stats are not collected correctly.

Here's an example of how `num_input_records` can be double-counted during a compaction with multiple sub-compactions executed remotely. Please note that this problem is not limited to `num_input_records`.

Input File: 1 SST file with 100 keys.

- Key 1~50 are in one sub compaction
- Key 51~100 in another sub compaction

`UpdateOutputLevelCompactionStats()` currently retrieves the total number of entries from the input files and sets `num_input_records` in the internal_stats to 100. In `CompactionJob::Run()`, this method is called once after all sub-compactions have finished. However, during remote compaction, `UpdateOutputLevelCompactionStats()` is called for each offloaded sub-compaction on the remote side and then aggregated on the primary host. The internal_stats for the first sub-compaction will have 100 `num_input_records`, and the second sub-compaction will have another 100 `num_input_records`. We end up having 200 `num_input_records` in the aggregated internal_stats.

There was another issue that `num_input_record` was not properly excluding `num_input_range_del` in `UpdateCompactionJobStats()`. `job_stats_->num_input_record` originally has correct value set by compaction iterator, but then later overwritten in `UpdateCompactionJobStats()`. `UpdateCompactionJobStats()` was called during `CompactionJob::Install()`, so not caught by `VerifyInputRecordCount()`.

## Refactor and other changes before the fixes
* Renamed `UpdateOutputLevelCompactionStats()` to `BuildStatsFromInputTableProperties()` to make the function more descriptive. `BuildStatsFromInputTableProperties()` builds input stats by scanning through entries from TableProperties in the Input Files and it's at the top compaction level, not at the sub-compaction level. (It also updates a couple of non-input stats, `bytes_read_blob` and `num_dropped_records`, but will be refactored in a later PR.)
* `UpdateCompactionJobStats()` was moved from `CompactionJob::Install()` to `CompactionJob::Run()` and separated into `UpdateCompactionJobInputStats()` and `UpdateCompactionJobOutputStats()`.

## Fixes
* Remote Compaction no longer updates the subcompaction-job-level input stats from InputTableProperties to avoid double-counted stats in case of multiple sub-compactions. Subcompaction-job-level input stats are aggregated to the compaction-job-level input stats in the primary host after all sub-compactions are finished.
* Remote Compaction now only calls `UpdateCompactionJobOutputStats()` to update the job-level output stats by copying from internal stats.
* `UpdateCompactionJobInputStats()` now takes `num_input_range_del` and properly subtracts it from the input record count. `VerifyInputRecordCount()` expected `job_stats.num_input_records` to be equal to `internal_stats_.output_level_stats.num_input_records - num_input_range_del`. However, when updating the job-level stats, we were taking the entire `internal_stats_.output_level_stats.num_input_records` after verification.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13470

Test Plan:
Local Compaction
```
./db_compaction_test -- --gtest_filter="*DBCompactionTest.VerifyRecordCount*"
```
Remote Compaction
```
./compaction_service_test --gtest_filter="*CompactionServiceTest.VerifyInputRecordCount*"
```

Reviewed By: pdillinger

Differential Revision: D71566149

Pulled By: jaykorean

fbshipit-source-id: c8aafcde701dec8901fd5e5a9ec186e26b896c19
---
 db/compaction/compaction_job.cc          | 134 ++++++++++++-----------
 db/compaction/compaction_job.h           |  45 +++++---
 db/compaction/compaction_service_job.cc  |  23 +---
 db/compaction/compaction_service_test.cc |  40 +++++++
 db/compaction/tiered_compaction_test.cc  |  26 +++--
 db/internal_stats.h                      |   2 +-
 6 files changed, 162 insertions(+), 108 deletions(-)

diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc
index 9b0762e139b9..469f8ce54f59 100644
--- a/db/compaction/compaction_job.cc
+++ b/db/compaction/compaction_job.cc
@@ -858,48 +858,23 @@ Status CompactionJob::Run() {
   // Finish up all bookkeeping to unify the subcompaction results.
   compact_->AggregateCompactionStats(internal_stats_, *job_stats_);
 
-  // For remote compactions, internal_stats_.output_level_stats were part of the
-  // compaction_result already. No need to re-update it.
-  if (job_stats_->is_remote_compaction == false) {
-    uint64_t num_input_range_del = 0;
-    bool ok = UpdateOutputLevelCompactionStats(&num_input_range_del);
-    // (Sub)compactions returned ok, do sanity check on the number of input
-    // keys.
-    if (status.ok() && ok && job_stats_->has_num_input_records) {
-      size_t ts_sz = compact_->compaction->column_family_data()
-                         ->user_comparator()
-                         ->timestamp_size();
-      // When trim_ts_ is non-empty, CompactionIterator takes
-      // HistoryTrimmingIterator as input iterator and sees a trimmed view of
-      // input keys. So the number of keys it processed is not suitable for
-      // verification here.
-      // TODO: support verification when trim_ts_ is non-empty.
-      if (!(ts_sz > 0 && !trim_ts_.empty())) {
-        assert(internal_stats_.output_level_stats.num_input_records > 0);
-        // TODO: verify the number of range deletion entries.
-        uint64_t expected =
-            internal_stats_.output_level_stats.num_input_records -
-            num_input_range_del;
-        uint64_t actual = job_stats_->num_input_records;
-        if (expected != actual) {
-          char scratch[2345];
-          compact_->compaction->Summary(scratch, sizeof(scratch));
-          std::string msg =
-              "Compaction number of input keys does not match "
-              "number of keys processed. Expected " +
-              std::to_string(expected) + " but processed " +
-              std::to_string(actual) + ". Compaction summary: " + scratch;
-          ROCKS_LOG_WARN(
-              db_options_.info_log, "[%s] [JOB %d] Compaction with status: %s",
-              compact_->compaction->column_family_data()->GetName().c_str(),
-              job_context_->job_id, msg.c_str());
-          if (db_options_.compaction_verify_record_count) {
-            status = Status::Corruption(msg);
-          }
-        }
+  uint64_t num_input_range_del = 0;
+  bool ok = BuildStatsFromInputTableProperties(&num_input_range_del);
+  // (Sub)compactions returned ok, do sanity check on the number of input
+  // keys.
+  if (status.ok() && ok) {
+    if (job_stats_->has_num_input_records) {
+      status = VerifyInputRecordCount(num_input_range_del);
+      if (!status.ok()) {
+        ROCKS_LOG_WARN(
+            db_options_.info_log, "[%s] [JOB %d] Compaction with status: %s",
+            compact_->compaction->column_family_data()->GetName().c_str(),
+            job_context_->job_id, status.ToString().c_str());
       }
     }
+    UpdateCompactionJobInputStats(internal_stats_, num_input_range_del);
   }
+  UpdateCompactionJobOutputStats(internal_stats_);
 
   // Verify number of output records
   if (status.ok() && db_options_.compaction_verify_record_count) {
@@ -1042,7 +1017,6 @@ Status CompactionJob::Install(bool* compaction_released) {
                      internal_stats_.proximal_level_stats.num_output_records);
   }
 
-  UpdateCompactionJobStats(internal_stats_);
   TEST_SYNC_POINT_CALLBACK(
       "CompactionJob::Install:AfterUpdateCompactionJobStats", job_stats_);
 
@@ -2125,7 +2099,7 @@ void CopyPrefix(const Slice& src, size_t prefix_length, std::string* dst) {
 }
 }  // namespace
 
-bool CompactionJob::UpdateOutputLevelCompactionStats(
+bool CompactionJob::BuildStatsFromInputTableProperties(
     uint64_t* num_input_range_del) {
   assert(compact_);
 
@@ -2199,27 +2173,25 @@ bool CompactionJob::UpdateOutputLevelCompactionStats(
     }
   }
 
+  // TODO - find a better place to set these two
   assert(job_stats_);
   internal_stats_.output_level_stats.bytes_read_blob =
       job_stats_->total_blob_bytes_read;
-
   internal_stats_.output_level_stats.num_dropped_records =
       internal_stats_.DroppedRecords();
   return !has_error;
 }
 
-void CompactionJob::UpdateCompactionJobStats(
-    const InternalStats::CompactionStatsFull& internal_stats) const {
+void CompactionJob::UpdateCompactionJobInputStats(
+    const InternalStats::CompactionStatsFull& internal_stats,
+    uint64_t num_input_range_del) const {
   assert(job_stats_);
-  job_stats_->elapsed_micros = internal_stats.output_level_stats.micros;
-  job_stats_->cpu_micros = internal_stats.output_level_stats.cpu_micros;
-
   // input information
   job_stats_->total_input_bytes =
       internal_stats.output_level_stats.bytes_read_non_output_levels +
       internal_stats.output_level_stats.bytes_read_output_level;
   job_stats_->num_input_records =
-      internal_stats.output_level_stats.num_input_records;
+      internal_stats.output_level_stats.num_input_records - num_input_range_del;
   job_stats_->num_input_files =
       internal_stats.output_level_stats.num_input_files_in_non_output_levels +
       internal_stats.output_level_stats.num_input_files_in_output_level;
@@ -2237,19 +2209,6 @@ void CompactionJob::UpdateCompactionJobStats(
       internal_stats.output_level_stats.bytes_skipped_non_output_levels +
       internal_stats.output_level_stats.bytes_skipped_output_level;
 
-  // output information
-  job_stats_->total_output_bytes =
-      internal_stats.output_level_stats.bytes_written;
-  job_stats_->total_output_bytes_blob =
-      internal_stats.output_level_stats.bytes_written_blob;
-  job_stats_->num_output_records =
-      internal_stats.output_level_stats.num_output_records;
-  job_stats_->num_output_files =
-      internal_stats.output_level_stats.num_output_files;
-  job_stats_->num_output_files_blob =
-      internal_stats.output_level_stats.num_output_files_blob;
-
-  // If proximal level output exists
   if (internal_stats.has_proximal_level_output) {
     job_stats_->total_input_bytes +=
         internal_stats.proximal_level_stats.bytes_read_non_output_levels +
@@ -2273,7 +2232,28 @@ void CompactionJob::UpdateCompactionJobStats(
     job_stats_->total_skipped_input_bytes +=
         internal_stats.proximal_level_stats.bytes_skipped_non_output_levels +
         internal_stats.proximal_level_stats.bytes_skipped_output_level;
+  }
+}
 
+void CompactionJob::UpdateCompactionJobOutputStats(
+    const InternalStats::CompactionStatsFull& internal_stats) const {
+  assert(job_stats_);
+  job_stats_->elapsed_micros = internal_stats.output_level_stats.micros;
+  job_stats_->cpu_micros = internal_stats.output_level_stats.cpu_micros;
+
+  // output information
+  job_stats_->total_output_bytes =
+      internal_stats.output_level_stats.bytes_written;
+  job_stats_->total_output_bytes_blob =
+      internal_stats.output_level_stats.bytes_written_blob;
+  job_stats_->num_output_records =
+      internal_stats.output_level_stats.num_output_records;
+  job_stats_->num_output_files =
+      internal_stats.output_level_stats.num_output_files;
+  job_stats_->num_output_files_blob =
+      internal_stats.output_level_stats.num_output_files_blob;
+
+  if (internal_stats.has_proximal_level_output) {
     job_stats_->total_output_bytes +=
         internal_stats.proximal_level_stats.bytes_written;
     job_stats_->total_output_bytes_blob +=
@@ -2366,4 +2346,36 @@ Env::IOPriority CompactionJob::GetRateLimiterPriority() {
   return Env::IO_LOW;
 }
 
+Status CompactionJob::VerifyInputRecordCount(
+    uint64_t num_input_range_del) const {
+  size_t ts_sz = compact_->compaction->column_family_data()
+                     ->user_comparator()
+                     ->timestamp_size();
+  // When trim_ts_ is non-empty, CompactionIterator takes
+  // HistoryTrimmingIterator as input iterator and sees a trimmed view of
+  // input keys. So the number of keys it processed is not suitable for
+  // verification here.
+  // TODO: support verification when trim_ts_ is non-empty.
+  if (!(ts_sz > 0 && !trim_ts_.empty())) {
+    assert(internal_stats_.output_level_stats.num_input_records > 0);
+    // TODO: verify the number of range deletion entries.
+    uint64_t expected = internal_stats_.output_level_stats.num_input_records -
+                        num_input_range_del;
+    uint64_t actual = job_stats_->num_input_records;
+    if (expected != actual) {
+      char scratch[2345];
+      compact_->compaction->Summary(scratch, sizeof(scratch));
+      std::string msg =
+          "Compaction number of input keys does not match "
+          "number of keys processed. Expected " +
+          std::to_string(expected) + " but processed " +
+          std::to_string(actual) + ". Compaction summary: " + scratch;
+      if (db_options_.compaction_verify_record_count) {
+        return Status::Corruption(msg);
+      }
+    }
+  }
+  return Status::OK();
+}
+
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/db/compaction/compaction_job.h b/db/compaction/compaction_job.h
index a5ab355dd037..46deb9cc9bfe 100644
--- a/db/compaction/compaction_job.h
+++ b/db/compaction/compaction_job.h
@@ -196,24 +196,9 @@ class CompactionJob {
   IOStatus io_status() const { return io_status_; }
 
  protected:
-  // Update the following stats in internal_stats_.output_level_stats
-  // - num_input_files_in_non_output_levels
-  // - num_input_files_in_output_level
-  // - bytes_read_non_output_levels
-  // - bytes_read_output_level
-  // - num_input_records
-  // - bytes_read_blob
-  // - num_dropped_records
-  //
-  // @param num_input_range_del if non-null, will be set to the number of range
-  // deletion entries in this compaction input.
-  //
-  // Returns true iff internal_stats_.output_level_stats.num_input_records and
-  // num_input_range_del are calculated successfully.
-  bool UpdateOutputLevelCompactionStats(
-      uint64_t* num_input_range_del = nullptr);
-  void UpdateCompactionJobStats(
+  void UpdateCompactionJobOutputStats(
       const InternalStats::CompactionStatsFull& internal_stats) const;
+
   void LogCompaction();
   virtual void RecordCompactionIOStats();
   void CleanupCompaction();
@@ -240,6 +225,32 @@ class CompactionJob {
  private:
   friend class CompactionJobTestBase;
 
+  // Collect the following stats from Input Table Properties
+  // - num_input_files_in_non_output_levels
+  // - num_input_files_in_output_level
+  // - bytes_read_non_output_levels
+  // - bytes_read_output_level
+  // - num_input_records
+  // - bytes_read_blob
+  // - num_dropped_records
+  // and set them in internal_stats_.output_level_stats
+  //
+  // @param num_input_range_del if non-null, will be set to the number of range
+  // deletion entries in this compaction input.
+  //
+  // Returns true iff internal_stats_.output_level_stats.num_input_records and
+  // num_input_range_del are calculated successfully.
+  //
+  // This should be called only once for compactions (not per subcompaction)
+  bool BuildStatsFromInputTableProperties(
+      uint64_t* num_input_range_del = nullptr);
+
+  void UpdateCompactionJobInputStats(
+      const InternalStats::CompactionStatsFull& internal_stats,
+      uint64_t num_input_range_del) const;
+
+  Status VerifyInputRecordCount(uint64_t num_input_range_del) const;
+
   // Generates a histogram representing potential divisions of key ranges from
   // the input. It adds the starting and/or ending keys of certain input files
   // to the working set and then finds the approximate size of data in between
diff --git a/db/compaction/compaction_service_job.cc b/db/compaction/compaction_service_job.cc
index f345942e8a17..c5151f34b5be 100644
--- a/db/compaction/compaction_service_job.cc
+++ b/db/compaction/compaction_service_job.cc
@@ -394,29 +394,16 @@ Status CompactionServiceCompactionJob::Run() {
   // For remote compaction, there's only one subcompaction.
   compact_->AggregateCompactionStats(internal_stats_, *job_stats_);
 
-  // 2. Update the following stats in internal_stats_.output_level_stats
-  // - num_input_files_in_non_output_levels
-  // - num_input_files_in_output_level
-  // - bytes_read_non_output_levels
-  // - bytes_read_output_level
-  // - num_input_records
-  // - bytes_read_blob
-  // - num_dropped_records
-  uint64_t num_input_range_del = 0;
-  const bool ok = UpdateOutputLevelCompactionStats(&num_input_range_del);
-  if (status.ok() && ok && job_stats_->has_num_input_records) {
-    // TODO(jaykorean) - verify record count
-    assert(job_stats_->num_input_records > 0);
-  }
-
-  // 3. Update job-level stats with the aggregated internal_stats_
-  UpdateCompactionJobStats(internal_stats_);
+  // 2. Update job-level output stats with the aggregated internal_stats_
+  // Please note that input stats will be updated by primary host when all
+  // subcompactions are finished
+  UpdateCompactionJobOutputStats(internal_stats_);
   // and set fields that are not propagated as part of the update
   compaction_result_->stats.is_manual_compaction = c->is_manual_compaction();
   compaction_result_->stats.is_full_compaction = c->is_full_compaction();
   compaction_result_->stats.is_remote_compaction = true;
 
-  // 4. Update IO Stats that are not part of the the update above
+  // 3. Update IO Stats that are not part of the the update above
   // (bytes_read, bytes_written)
   RecordCompactionIOStats();
 
diff --git a/db/compaction/compaction_service_test.cc b/db/compaction/compaction_service_test.cc
index 711999262484..6f99a3781458 100644
--- a/db/compaction/compaction_service_test.cc
+++ b/db/compaction/compaction_service_test.cc
@@ -717,6 +717,46 @@ TEST_F(CompactionServiceTest, VerifyStatsLocalFallback) {
   VerifyTestData();
 }
 
+TEST_F(CompactionServiceTest, VerifyInputRecordCount) {
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  ReopenWithCompactionService(&options);
+  GenerateTestData();
+
+  auto my_cs = GetCompactionService();
+
+  std::string start_str = Key(15);
+  std::string end_str = Key(45);
+  Slice start(start_str);
+  Slice end(end_str);
+  uint64_t comp_num = my_cs->GetCompactionNum();
+
+  // Only iterator through 10 keys and force compaction to finish.
+  int num_iter = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "CompactionJob::ProcessKeyValueCompaction()::stop", [&](void* stop_ptr) {
+        num_iter++;
+        if (num_iter == 10) {
+          *(bool*)stop_ptr = true;
+        }
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  // CompactRange() should fail
+  Status s = db_->CompactRange(CompactRangeOptions(), &start, &end);
+  ASSERT_NOK(s);
+  ASSERT_TRUE(s.IsCorruption());
+  const char* expected_message =
+      "Compaction number of input keys does not match number of keys "
+      "processed.";
+  ASSERT_TRUE(std::strstr(s.getState(), expected_message));
+
+  ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1);
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
 TEST_F(CompactionServiceTest, CorruptedOutput) {
   Options options = CurrentOptions();
   options.disable_auto_compactions = true;
diff --git a/db/compaction/tiered_compaction_test.cc b/db/compaction/tiered_compaction_test.cc
index 591f8b5d3be7..5c21f8487572 100644
--- a/db/compaction/tiered_compaction_test.cc
+++ b/db/compaction/tiered_compaction_test.cc
@@ -81,7 +81,7 @@ class TieredCompactionTest : public DBTestBase {
   void VerifyCompactionStats(
       const std::vector<InternalStats::CompactionStats>& expected_stats,
       const InternalStats::CompactionStats& expected_pl_stats,
-      size_t output_level) {
+      size_t output_level, uint64_t num_input_range_del = 0) {
     const std::vector<InternalStats::CompactionStats>& stats =
         GetCompactionStats();
     const size_t kLevels = expected_stats.size();
@@ -102,7 +102,8 @@ class TieredCompactionTest : public DBTestBase {
     expected_job_stats.num_input_files =
         output_level_stats.num_input_files_in_output_level +
         output_level_stats.num_input_files_in_non_output_levels;
-    expected_job_stats.num_input_records = output_level_stats.num_input_records;
+    expected_job_stats.num_input_records =
+        output_level_stats.num_input_records - num_input_range_del;
     expected_job_stats.num_output_files =
         output_level_stats.num_output_files + pl_stats.num_output_files;
     expected_job_stats.num_output_records =
@@ -209,8 +210,8 @@ TEST_F(TieredCompactionTest, SequenceBasedTieredStorageUniversal) {
   // Second file from 10 ~ 109
   // ...
   size_t bytes_per_file = 1952;
-  int total_input_key_count = kNumTrigger * kNumKeys;
-  int total_output_key_count = 130;  // 0 ~ 129
+  uint64_t total_input_key_count = kNumTrigger * kNumKeys;
+  uint64_t total_output_key_count = 130;  // 0 ~ 129
 
   for (int i = 0; i < kNumTrigger; i++) {
     for (int j = 0; j < kNumKeys; j++) {
@@ -283,7 +284,7 @@ TEST_F(TieredCompactionTest, SequenceBasedTieredStorageUniversal) {
 
   // Now update the input count to be the total count from the previous
   total_input_key_count = total_output_key_count;
-  int moved_to_last_level_key_count = 10;
+  uint64_t moved_to_last_level_key_count = 10;
 
   // bytes read in non output = bytes written in penultimate level from previous
   uint64_t bytes_read_in_non_output_level = bytes_written_penultimate_level;
@@ -390,6 +391,7 @@ TEST_F(TieredCompactionTest, SequenceBasedTieredStorageUniversal) {
   ASSERT_OK(
       db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end));
   ASSERT_OK(Flush());
+  uint64_t num_input_range_del = 1;
 
   ResetAllStats(expect_stats, expect_pl_stats);
 
@@ -399,7 +401,7 @@ TEST_F(TieredCompactionTest, SequenceBasedTieredStorageUniversal) {
   ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
 
   // Previous output + one delete range
-  total_input_key_count = total_output_key_count + 1;
+  total_input_key_count = total_output_key_count + num_input_range_del;
   moved_to_last_level_key_count = 20;
 
   last_level_stats = GetCompactionStats()[kLastLevel];
@@ -420,7 +422,7 @@ TEST_F(TieredCompactionTest, SequenceBasedTieredStorageUniversal) {
     last_level_compaction_stats.num_input_files_in_output_level = 0;
     last_level_compaction_stats.num_input_records = total_input_key_count;
     last_level_compaction_stats.num_dropped_records =
-        1;  // delete range tombstone
+        num_input_range_del;  // delete range tombstone
     last_level_compaction_stats.num_output_records =
         moved_to_last_level_key_count;
     last_level_compaction_stats.num_output_files = 1;
@@ -435,10 +437,12 @@ TEST_F(TieredCompactionTest, SequenceBasedTieredStorageUniversal) {
         bytes_written_penultimate_level;
     penultimate_level_compaction_stats.num_output_files = 1;
     penultimate_level_compaction_stats.num_output_records =
-        total_input_key_count - moved_to_last_level_key_count - 1;
+        total_input_key_count - moved_to_last_level_key_count -
+        num_input_range_del;
     expect_pl_stats.Add(penultimate_level_compaction_stats);
   }
-  VerifyCompactionStats(expect_stats, expect_pl_stats, kLastLevel);
+  VerifyCompactionStats(expect_stats, expect_pl_stats, kLastLevel,
+                        num_input_range_del);
 
   // verify data
   std::string value;
@@ -1063,8 +1067,8 @@ TEST_F(TieredCompactionTest, SequenceBasedTieredStorageLevel) {
   // Second file from 10 ~ 109
   // ...
   size_t bytes_per_file = 1952;
-  int total_input_key_count = kNumTrigger * kNumKeys;
-  int total_output_key_count = 130;  // 0 ~ 129
+  uint64_t total_input_key_count = kNumTrigger * kNumKeys;
+  uint64_t total_output_key_count = 130;  // 0 ~ 129
 
   for (int i = 0; i < kNumTrigger; i++) {
     for (int j = 0; j < kNumKeys; j++) {
diff --git a/db/internal_stats.h b/db/internal_stats.h
index 35dc42a98a49..e7fa002c4ccb 100644
--- a/db/internal_stats.h
+++ b/db/internal_stats.h
@@ -448,7 +448,7 @@ class InternalStats {
   };
 
   // Compaction internal stats, for per_key_placement compaction, it includes 2
-  // levels stats: the last level and the proximal level.
+  // output level stats: the last level and the proximal level.
   struct CompactionStatsFull {
     // the stats for the target primary output level
     CompactionStats output_level_stats;

From d0374a0a72908fb54bd60e9c32f78e848fd01d69 Mon Sep 17 00:00:00 2001
From: Maciej Szeszko <mszeszko@meta.com>
Date: Fri, 21 Mar 2025 13:10:43 -0700
Subject: [PATCH 028/500] Control SST write lifetime hints based on compaction
 style (#13472)

Summary:
This PR is a followup to https://github.com/facebook/rocksdb/pull/13461. We're introducing an experimental option / killswitch to control SST write lifetime hint calculation based on the selected compaction style. By default (and mostly for backwards compatibility reasons), we'll calculate the SST hints only for level compactions. With this change users have an option to configure SST lifetime hint policy in their environments to enable the calculations in the universal compaction mode as well. It's important to underline that as currently implemented, SST write lifetime hints are calculated in a static way and solely based on the level, which might not be suitable for non-uniform workloads with dynamic / high-variance lifespan of data within the same level. In those cases (or when the performance is not satisfactory), it's recommended to disable the hints by setting the set to empty. Please see the comment in `options.h` for more.

**NOTE:** We deliberately decided to introduce a new option to ensure no impact to external users running their RocksDB instances on local flash with the default `PosixWritableFile` file implementation.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13472

Reviewed By: pdillinger, anand1976

Differential Revision: D71445488

Pulled By: mszeszko-meta

fbshipit-source-id: 57dc5e56662fa0b0fd686e183c0ec7090ff12d66
---
 db/compaction/compaction_job.cc               |  3 +-
 db/db_impl/db_impl_open.cc                    |  5 ++-
 db/flush_job.cc                               |  3 +-
 db/repair.cc                                  |  5 ++-
 db/version_set.cc                             | 42 ++++++++++++-------
 db/version_set.h                              |  3 +-
 include/rocksdb/options.h                     | 20 +++++++++
 options/db_options.cc                         |  5 ++-
 options/db_options.h                          |  1 +
 options/options_helper.cc                     |  2 +
 options/options_settable_test.cc              |  2 +
 .../calculate_sst_write_lifetime_hint_set.md  |  1 +
 12 files changed, 70 insertions(+), 22 deletions(-)
 create mode 100644 unreleased_history/new_features/calculate_sst_write_lifetime_hint_set.md

diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc
index 469f8ce54f59..f534eb142d94 100644
--- a/db/compaction/compaction_job.cc
+++ b/db/compaction/compaction_job.cc
@@ -259,7 +259,8 @@ void CompactionJob::Prepare(
   assert(storage_info);
   assert(storage_info->NumLevelFiles(compact_->compaction->level()) > 0);
 
-  write_hint_ = storage_info->CalculateSSTWriteHint(c->output_level());
+  write_hint_ = storage_info->CalculateSSTWriteHint(
+      c->output_level(), db_options_.calculate_sst_write_lifetime_hint_set);
   bottommost_level_ = c->bottommost_level();
 
   if (!known_single_subcompact.has_value() && c->ShouldFormSubcompactions()) {
diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc
index 5fb98f2b6b98..ab6902075f46 100644
--- a/db/db_impl/db_impl_open.cc
+++ b/db/db_impl/db_impl_open.cc
@@ -2027,8 +2027,9 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
     meta.oldest_ancester_time = current_time;
     meta.epoch_number = cfd->NewEpochNumber();
     {
-      auto write_hint =
-          cfd->current()->storage_info()->CalculateSSTWriteHint(/*level=*/0);
+      auto write_hint = cfd->current()->storage_info()->CalculateSSTWriteHint(
+          /*level=*/0,
+          immutable_db_options_.calculate_sst_write_lifetime_hint_set);
       mutex_.Unlock();
 
       SequenceNumber earliest_write_conflict_snapshot;
diff --git a/db/flush_job.cc b/db/flush_job.cc
index 552c122656ea..366f33a6b47c 100644
--- a/db/flush_job.cc
+++ b/db/flush_job.cc
@@ -870,7 +870,8 @@ Status FlushJob::WriteLevel0Table() {
   std::vector<BlobFileAddition> blob_file_additions;
 
   {
-    auto write_hint = base_->storage_info()->CalculateSSTWriteHint(/*level=*/0);
+    auto write_hint = base_->storage_info()->CalculateSSTWriteHint(
+        /*level=*/0, db_options_.calculate_sst_write_lifetime_hint_set);
     Env::IOPriority io_priority = GetRateLimiterPriority();
     db_mutex_->Unlock();
     if (log_buffer_) {
diff --git a/db/repair.cc b/db/repair.cc
index 0c108a601659..eaeb77795a1a 100644
--- a/db/repair.cc
+++ b/db/repair.cc
@@ -458,8 +458,9 @@ class Repairer {
       meta.file_creation_time = current_time;
       SnapshotChecker* snapshot_checker = DisableGCSnapshotChecker::Instance();
 
-      auto write_hint =
-          cfd->current()->storage_info()->CalculateSSTWriteHint(/*level=*/0);
+      auto write_hint = cfd->current()->storage_info()->CalculateSSTWriteHint(
+          /*level=*/0, db_options_.calculate_sst_write_lifetime_hint_set);
+
       std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
           range_del_iters;
       auto range_del_iter = mem->NewRangeTombstoneIterator(
diff --git a/db/version_set.cc b/db/version_set.cc
index c8f13b48bc39..67d5c9fab3f7 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -4903,24 +4903,38 @@ bool VersionStorageInfo::RangeMightExistAfterSortedRun(
 }
 
 Env::WriteLifeTimeHint VersionStorageInfo::CalculateSSTWriteHint(
-    int level) const {
-  if (compaction_style_ != kCompactionStyleLevel) {
+    int level, CompactionStyleSet compaction_style_set) const {
+  if (!compaction_style_set.Contains(compaction_style_)) {
     return Env::WLTH_NOT_SET;
   }
-  if (level == 0) {
-    return Env::WLTH_MEDIUM;
-  }
 
-  // L1: medium, L2: long, ...
-  if (level - base_level_ >= 2) {
-    return Env::WLTH_EXTREME;
-  } else if (level < base_level_) {
-    // There is no restriction which prevents level passed in to be smaller
-    // than base_level.
-    return Env::WLTH_MEDIUM;
+  switch (compaction_style_) {
+    case kCompactionStyleLevel:
+      if (level == 0) {
+        return Env::WLTH_MEDIUM;
+      }
+
+      // L1: medium, L2: long, ...
+      if (level - base_level_ >= 2) {
+        return Env::WLTH_EXTREME;
+      } else if (level < base_level_) {
+        // There is no restriction which prevents level passed in to be smaller
+        // than base_level.
+        return Env::WLTH_MEDIUM;
+      }
+      return static_cast<Env::WriteLifeTimeHint>(
+          level - base_level_ + static_cast<int>(Env::WLTH_MEDIUM));
+    case kCompactionStyleUniversal:
+      if (level == 0) {
+        return Env::WLTH_SHORT;
+      }
+      if (level == 1) {
+        return Env::WLTH_MEDIUM;
+      }
+      return Env::WLTH_LONG;
+    default:
+      return Env::WLTH_NOT_SET;
   }
-  return static_cast<Env::WriteLifeTimeHint>(
-      level - base_level_ + static_cast<int>(Env::WLTH_MEDIUM));
 }
 
 void Version::AddLiveFiles(std::vector<uint64_t>* live_table_files,
diff --git a/db/version_set.h b/db/version_set.h
index 6d6ee5c4864b..782f38846b55 100644
--- a/db/version_set.h
+++ b/db/version_set.h
@@ -630,7 +630,8 @@ class VersionStorageInfo {
                                      const Slice& largest_user_key,
                                      int last_level, int last_l0_idx);
 
-  Env::WriteLifeTimeHint CalculateSSTWriteHint(int level) const;
+  Env::WriteLifeTimeHint CalculateSSTWriteHint(
+      int level, CompactionStyleSet compaction_style_set) const;
 
   const Comparator* user_comparator() const { return user_comparator_; }
 
diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index f83e6a381f06..66a31208ade4 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -62,6 +62,8 @@ struct Options;
 struct DbPath;
 
 using FileTypeSet = SmallEnumSet<FileType, FileType::kBlobFile>;
+using CompactionStyleSet =
+    SmallEnumSet<CompactionStyle, CompactionStyle::kCompactionStyleNone>;
 
 struct ColumnFamilyOptions : public AdvancedColumnFamilyOptions {
   // The function recovers options to a previous version. Only 4.6 or later
@@ -1620,6 +1622,24 @@ struct DBOptions {
   // `kUnknown`, this overrides any temperature set by OptimizeForLogWrite
   // functions.
   Temperature wal_write_temperature = Temperature::kUnknown;
+
+  // Enum set indicative of which compaction styles SST write lifetime hint
+  // calculation is allowed on. Today, RocksDB provides native support for
+  // kCompactionStyleLevel and kCompactionStyleUniversal (experimental version).
+  // Other compaction styles, even when enabled in the set, won't have any
+  // effect in the default PosixWritableFile file implementation. There are
+  // numerous benefits coming from employing the hints including reduction in
+  // write amplification caused by OS file movement during garbage collection,
+  // and reduction in wear-leveling (SSDs). However, as currently implemented,
+  // SST write lifetime hints are calculated in a static way and solely based on
+  // the level, which might not be suitable for non-uniform workloads with
+  // dynamic / high-variance lifespan of data within the same level. In those
+  // cases (or when the performance is not satisfactory), it's recommended to
+  // disable the hints by assigning the setting to the empty set (= {});
+  //
+  // Default: Enabled in kCompactionStyleLevel mode.
+  CompactionStyleSet calculate_sst_write_lifetime_hint_set = {
+      CompactionStyle::kCompactionStyleLevel};
   // End EXPERIMENTAL
 };
 
diff --git a/options/db_options.cc b/options/db_options.cc
index ea8f4b22d7be..8453b101dd00 100644
--- a/options/db_options.cc
+++ b/options/db_options.cc
@@ -141,6 +141,7 @@ static std::unordered_map<std::string, OptionTypeInfo>
           std::shared_ptr<Statistics> statistics;
           std::vector<DbPath> db_paths;
           FileTypeSet checksum_handoff_file_types;
+          CompactionStyleSet calculate_sst_write_lifetime_hint_set;
          */
         {"advise_random_on_open",
          {offsetof(struct ImmutableDBOptions, advise_random_on_open),
@@ -801,7 +802,9 @@ ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options)
       follower_catchup_retry_count(options.follower_catchup_retry_count),
       follower_catchup_retry_wait_ms(options.follower_catchup_retry_wait_ms),
       metadata_write_temperature(options.metadata_write_temperature),
-      wal_write_temperature(options.wal_write_temperature) {
+      wal_write_temperature(options.wal_write_temperature),
+      calculate_sst_write_lifetime_hint_set(
+          options.calculate_sst_write_lifetime_hint_set) {
   fs = env->GetFileSystem();
   clock = env->GetSystemClock().get();
   logger = info_log.get();
diff --git a/options/db_options.h b/options/db_options.h
index df0854f1dd61..0de6cccf7b0a 100644
--- a/options/db_options.h
+++ b/options/db_options.h
@@ -107,6 +107,7 @@ struct ImmutableDBOptions {
   uint64_t follower_catchup_retry_wait_ms;
   Temperature metadata_write_temperature;
   Temperature wal_write_temperature;
+  CompactionStyleSet calculate_sst_write_lifetime_hint_set;
 
   // Beginning convenience/helper objects that are not part of the base
   // DBOptions
diff --git a/options/options_helper.cc b/options/options_helper.cc
index 70311b2e8394..c026ff195497 100644
--- a/options/options_helper.cc
+++ b/options/options_helper.cc
@@ -199,6 +199,8 @@ void BuildDBOptions(const ImmutableDBOptions& immutable_db_options,
       immutable_db_options.metadata_write_temperature;
   options.wal_write_temperature = immutable_db_options.wal_write_temperature;
   options.compaction_service = immutable_db_options.compaction_service;
+  options.calculate_sst_write_lifetime_hint_set =
+      immutable_db_options.calculate_sst_write_lifetime_hint_set;
 }
 
 ColumnFamilyOptions BuildColumnFamilyOptions(
diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc
index 6d777ee18735..6fab7daeeba1 100644
--- a/options/options_settable_test.cc
+++ b/options/options_settable_test.cc
@@ -342,6 +342,8 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) {
       {offsetof(struct DBOptions, compaction_service),
        sizeof(std::shared_ptr<CompactionService>)},
       {offsetof(struct DBOptions, daily_offpeak_time_utc), sizeof(std::string)},
+      {offsetof(struct DBOptions, calculate_sst_write_lifetime_hint_set),
+       sizeof(CompactionStyleSet)},
   };
 
   char* options_ptr = new char[sizeof(DBOptions)];
diff --git a/unreleased_history/new_features/calculate_sst_write_lifetime_hint_set.md b/unreleased_history/new_features/calculate_sst_write_lifetime_hint_set.md
new file mode 100644
index 000000000000..5780d831a38a
--- /dev/null
+++ b/unreleased_history/new_features/calculate_sst_write_lifetime_hint_set.md
@@ -0,0 +1 @@
+Added a new `DBOptions.calculate_sst_write_lifetime_hint_set` setting that allows to customize which compaction styles SST write lifetime hint calculation is allowed on. Today RocksDB supports only two modes `kCompactionStyleLevel` and `kCompactionStyleUniversal`.

From 7f3ee34cdf705415a9207cd102eb6907ce4f91e5 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Fri, 21 Mar 2025 15:55:41 -0700
Subject: [PATCH 029/500] Experimental ingestion option atomic_replace_range
 (#13453)

Summary:
Adding a new option (argument) for file ingestion `atomic_replace_range` which is intended to support a couple forms of "atomic replacement of a key range":
* (Experimental implementation here) With snapshot_consistency=false, the feature acts like an atomic DeleteFilesInRange prior to the ingestion, though requires no existing files to partially overlap the range. (Consider using SstPartitioner.) This is especially useful for "always compacted" workloads, perhaps along with CF option `disallow_memtable_writes` and ingestion option `fail_if_not_bottommost_level`. If both bounds are nullptr, the whole CF is replaced.
* (To implement in follow-up) With snapshot_consistency=true (and perhaps in some fallback cases from above such as partial overlap), a "giant tombstone file" as in https://github.com/facebook/rocksdb/issues/13078 is generated and ingested at the beginning of the list.

Because I see this as a more elaborate DeleteRange, I would naturally expect the upper bound/limit key to be exclusive, but it has been challenging getting that to work. The inclusive/exclusive handling is currently a documented bug for the experimental feature to sort out in follow-up work. (I would love to take advantage of proposed SliceBound, but that would be ambitious to adapt to DeleteRange. Even getting the "replace whole CF" variant of the functionality might be difficult to get worthing with DeleteRange underneath. Nevertheless, I feel it's best to consolidate these two forms of "atomic replacement" under variants of the same API.)

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13453

Test Plan:
Unit tests added / updated.

db_stress integration left as follow-up work (experimental feature, will be challenging)

Reviewed By: anand1976

Differential Revision: D71584295

Pulled By: pdillinger

fbshipit-source-id: 307abff426e4b7d0a340008918ebcddc896ef747
---
 db/compaction/compaction_picker.h     |   6 +
 db/db_impl/db_impl.cc                 |  29 ++-
 db/dbformat.h                         |   3 +
 db/external_sst_file_basic_test.cc    | 247 +++++++++++++++++++++++++-
 db/external_sst_file_ingestion_job.cc | 126 +++++++++++--
 db/external_sst_file_ingestion_job.h  |  79 +++++---
 db/external_sst_file_test.cc          |   4 +-
 include/rocksdb/db.h                  |  22 +++
 include/rocksdb/options.h             |   7 +-
 9 files changed, 466 insertions(+), 57 deletions(-)

diff --git a/db/compaction/compaction_picker.h b/db/compaction/compaction_picker.h
index 9d23555ec596..093344c65c43 100644
--- a/db/compaction/compaction_picker.h
+++ b/db/compaction/compaction_picker.h
@@ -138,6 +138,12 @@ class CompactionPicker {
     return !level0_compactions_in_progress_.empty();
   }
 
+  // Is any compaction in progress
+  bool IsCompactionInProgress() const {
+    return !(level0_compactions_in_progress_.empty() &&
+             compactions_in_progress_.empty());
+  }
+
   // Return true if the passed key range overlap with a compaction output
   // that is currently running.
   bool RangeOverlapWithCompaction(const Slice& smallest_user_key,
diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc
index a3c107e0ab45..31512116f275 100644
--- a/db/db_impl/db_impl.cc
+++ b/db/db_impl/db_impl.cc
@@ -5799,6 +5799,27 @@ Status DBImpl::IngestExternalFiles(
             "timestamps enabled doesn't support ingest behind.");
       }
     }
+    if (arg.atomic_replace_range.has_value()) {
+      if (ingest_opts.ingest_behind) {
+        return Status::InvalidArgument(
+            "Can't combine atomic_replace_range with ingest_behind.");
+      }
+      if (ingest_opts.snapshot_consistency) {
+        // TODO: support generating and ingesting a big tombstone file, which
+        // might depend on non-nullptr start and limit
+        return Status::NotSupported(
+            "atomic_replace_range not yet supported with "
+            "snapshot_consistency.");
+      } else {
+        if ((arg.atomic_replace_range->start == nullptr) ^
+            (arg.atomic_replace_range->limit == nullptr)) {
+          return Status::NotSupported(
+              "Only one of atomic_replace_range.{start,limit} == nullptr is "
+              "not supported.");
+        }
+      }
+    }
+
     if (ingest_opts.allow_db_generated_files) {
       if (ingest_opts.write_global_seqno) {
         return Status::NotSupported(
@@ -5847,8 +5868,8 @@ Status DBImpl::IngestExternalFiles(
             this);
     Status es = ingestion_jobs[i].Prepare(
         args[i].external_files, args[i].files_checksums,
-        args[i].files_checksum_func_names, args[i].file_temperature,
-        start_file_number, super_version);
+        args[i].files_checksum_func_names, args[i].atomic_replace_range,
+        args[i].file_temperature, start_file_number, super_version);
     // capture first error only
     if (!es.ok() && status.ok()) {
       status = es;
@@ -5863,8 +5884,8 @@ Status DBImpl::IngestExternalFiles(
             this);
     Status es = ingestion_jobs[0].Prepare(
         args[0].external_files, args[0].files_checksums,
-        args[0].files_checksum_func_names, args[0].file_temperature,
-        next_file_number, super_version);
+        args[0].files_checksum_func_names, args[0].atomic_replace_range,
+        args[0].file_temperature, next_file_number, super_version);
     if (!es.ok()) {
       status = es;
     }
diff --git a/db/dbformat.h b/db/dbformat.h
index 3dfb077397ed..6ceda7fa54de 100644
--- a/db/dbformat.h
+++ b/db/dbformat.h
@@ -83,6 +83,8 @@ extern const ValueType kValueTypeForSeekForPrev;
 
 // A range of user keys used internally by RocksDB. Also see `Range` used by
 // public APIs.
+// TODO: merge with Range in pubic API, but this is generally inclusive limit
+// and it is maybe exclusive limit
 struct UserKeyRange {
   // In case of user_defined timestamp, if enabled, `start` and `limit` should
   // include user_defined timestamps.
@@ -469,6 +471,7 @@ class InternalKey {
 
   Slice user_key() const { return ExtractUserKey(rep_); }
   size_t size() const { return rep_.size(); }
+  bool unset() const { return rep_.empty(); }
 
   void Set(const Slice& _user_key, SequenceNumber s, ValueType t) {
     SetFrom(ParsedInternalKey(_user_key, s, t));
diff --git a/db/external_sst_file_basic_test.cc b/db/external_sst_file_basic_test.cc
index f8108651ec4c..95228a6fa14c 100644
--- a/db/external_sst_file_basic_test.cc
+++ b/db/external_sst_file_basic_test.cc
@@ -1954,21 +1954,44 @@ TEST_F(ExternalSSTFileBasicTest, OverlappingFiles) {
     SstFileWriter sst_file_writer(EnvOptions(), options);
     std::string file3 = sst_files_dir_ + "file3.sst";
     ASSERT_OK(sst_file_writer.Open(file3));
-    ASSERT_OK(sst_file_writer.Put("j", "j1"));
+    ASSERT_OK(sst_file_writer.Put("k", "k1"));
     ASSERT_OK(sst_file_writer.Put("m", "m1"));
     ExternalSstFileInfo file3_info;
     ASSERT_OK(sst_file_writer.Finish(&file3_info));
     files.push_back(std::move(file3));
   }
 
+  // This could be ingested to the same level as file3 and file4, but the
+  // greedy/simple overlap check relegates it to a later level
+  {
+    SstFileWriter sst_file_writer(EnvOptions(), options);
+    std::string file4 = sst_files_dir_ + "file4.sst";
+    ASSERT_OK(sst_file_writer.Open(file4));
+    ASSERT_OK(sst_file_writer.Put("j", "j1"));
+    ExternalSstFileInfo file4_info;
+    ASSERT_OK(sst_file_writer.Finish(&file4_info));
+    files.push_back(std::move(file4));
+  }
+
+  {
+    SstFileWriter sst_file_writer(EnvOptions(), options);
+    std::string file5 = sst_files_dir_ + "file5.sst";
+    ASSERT_OK(sst_file_writer.Open(file5));
+    ASSERT_OK(sst_file_writer.Put("i", "i3"));
+    ExternalSstFileInfo file5_info;
+    ASSERT_OK(sst_file_writer.Finish(&file5_info));
+    files.push_back(std::move(file5));
+  }
+
   IngestExternalFileOptions ifo;
   ifo.allow_global_seqno = false;
   ASSERT_NOK(db_->IngestExternalFile(files, ifo));
   ifo.allow_global_seqno = true;
   ASSERT_OK(db_->IngestExternalFile(files, ifo));
   ASSERT_EQ(Get("a"), "a1");
-  ASSERT_EQ(Get("i"), "i2");
+  ASSERT_EQ(Get("i"), "i3");
   ASSERT_EQ(Get("j"), "j1");
+  ASSERT_EQ(Get("k"), "k1");
   ASSERT_EQ(Get("m"), "m1");
 
   int total_keys = 0;
@@ -1979,10 +2002,11 @@ TEST_F(ExternalSSTFileBasicTest, OverlappingFiles) {
   }
   ASSERT_OK(iter->status());
   delete iter;
-  ASSERT_EQ(total_keys, 4);
+  ASSERT_EQ(total_keys, 5);
 
   ASSERT_EQ(1, NumTableFilesAtLevel(6));
   ASSERT_EQ(2, NumTableFilesAtLevel(5));
+  ASSERT_EQ(2, NumTableFilesAtLevel(4));
 }
 
 class CompactionJobStatsCheckerForFilteredFiles : public EventListener {
@@ -2686,10 +2710,11 @@ TEST_F(ExternalSSTFileBasicTest, FailIfNotBottommostLevelAndDisallowMemtable) {
 
     // Ingest with snapshot consistency
     std::string file_path = sst_files_dir_ + std::to_string(1);
+    std::string file_path2 = sst_files_dir_ + std::to_string(2);
     SstFileWriter sfw(EnvOptions(), options);
 
     ASSERT_OK(sfw.Open(file_path));
-    ASSERT_OK(sfw.Put("b", "dontcare"));
+    ASSERT_OK(sfw.Put("b", "0"));
     ASSERT_OK(sfw.Finish());
 
     {
@@ -2700,6 +2725,7 @@ TEST_F(ExternalSSTFileBasicTest, FailIfNotBottommostLevelAndDisallowMemtable) {
       ifo.snapshot_consistency = true;
       ASSERT_OK(db_->IngestExternalFile(handles_[0], {file_path}, ifo));
     }
+    ASSERT_EQ(Get(0, "b"), "0");
 
     // Test level compaction
     options.compaction_style = CompactionStyle::kCompactionStyleLevel;
@@ -2721,8 +2747,6 @@ TEST_F(ExternalSSTFileBasicTest, FailIfNotBottommostLevelAndDisallowMemtable) {
       EXPECT_EQ(Put(1, "a", "1").code(), Status::Code::kInvalidArgument);
 
       // Use ingestion to get to the same state as above
-      std::string file_path2 = sst_files_dir_ + std::to_string(2);
-
       ASSERT_OK(sfw.Open(file_path2));
       ASSERT_OK(sfw.Put("a", "1"));
       ASSERT_OK(sfw.Put("c", "3"));
@@ -2735,16 +2759,221 @@ TEST_F(ExternalSSTFileBasicTest, FailIfNotBottommostLevelAndDisallowMemtable) {
       ASSERT_OK(sfw.Finish());
       ASSERT_OK(db_->IngestExternalFile(handles_[1], {file_path2}, {}));
     }
+    ASSERT_EQ(Get(1, "a"), "1");
+    ASSERT_EQ(Get(1, "b"), "2");
+    ASSERT_EQ(Get(1, "c"), "3");
+    ASSERT_EQ(Get(1, "d"), "4");
 
     {
+      // Test fail_if_not_bottommost_level, which fails if there's any overlap
+      // anywhere, even with snapshot_consistency=false
+      IngestExternalFileOptions ifo;
+      ASSERT_FALSE(ifo.fail_if_not_bottommost_level);
+      ifo.fail_if_not_bottommost_level = true;
+      ifo.snapshot_consistency = false;
+      // Fails with overlap on earlier level
+      Status s = db_->IngestExternalFile(handles_[1], {file_path}, ifo);
+      ASSERT_EQ(s.code(), Status::Code::kTryAgain);
+
       CompactRangeOptions cro;
       cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
       ASSERT_OK(db_->CompactRange(cro, handles_[1], nullptr, nullptr));
 
+      // Fails with overlap on last level
+      s = db_->IngestExternalFile(handles_[1], {file_path}, ifo);
+      ASSERT_EQ(s.code(), Status::Code::kTryAgain);
+
+      // No change to data
+      ASSERT_EQ(Get(1, "a"), "1");
+      ASSERT_EQ(Get(1, "b"), "2");
+      ASSERT_EQ(Get(1, "c"), "3");
+      ASSERT_EQ(Get(1, "d"), "4");
+    }
+
+    if (!disallow_memtable) {
+      // Test allow_blocking_flush=false (fail because of memtable overlap)
       IngestExternalFileOptions ifo;
-      ifo.fail_if_not_bottommost_level = true;
-      const Status s = db_->IngestExternalFile(handles_[1], {file_path}, ifo);
-      ASSERT_TRUE(s.IsTryAgain());
+      ASSERT_TRUE(ifo.allow_blocking_flush);
+      ifo.allow_blocking_flush = false;
+      ASSERT_OK(Put(1, "b", "42"));
+      Status s = db_->IngestExternalFile(handles_[1], {file_path}, ifo);
+      ASSERT_EQ(s.code(), Status::Code::kInvalidArgument);
+
+      ASSERT_EQ(Get(1, "a"), "1");
+      ASSERT_EQ(Get(1, "b"), "42");
+      ASSERT_EQ(Get(1, "c"), "3");
+      ASSERT_EQ(Get(1, "d"), "4");
+
+      // Revert state
+      ASSERT_OK(Put(1, "b", "2"));
+      ASSERT_OK(Flush(1));
+    }
+
+    {
+      // Test atomic_replace_range
+      IngestExternalFileArg arg;
+      arg.column_family = handles_[1];
+      arg.external_files = {file_path};
+      Range replace_rng{"a", "zzz"};
+      arg.atomic_replace_range = {{&replace_rng.start, &replace_rng.limit}};
+
+      // start with some failure cases
+      // TODO: support snapshot consistency with tombstone file
+      ASSERT_TRUE(arg.options.snapshot_consistency);
+      Status s = db_->IngestExternalFiles({arg});
+      ASSERT_EQ(s.code(), Status::Code::kNotSupported);
+
+      ASSERT_EQ(Get(1, "a"), "1");
+      ASSERT_EQ(Get(1, "b"), "2");
+      ASSERT_EQ(Get(1, "c"), "3");
+      ASSERT_EQ(Get(1, "d"), "4");
+
+      arg.options.snapshot_consistency = false;
+      // Can usually be used with atomic_replace_range and
+      // snapshot_consistency=false, except it requires no input overlap
+      arg.options.fail_if_not_bottommost_level = true;
+
+      // one-sided ranges not yet supported
+      arg.atomic_replace_range = {{nullptr, &replace_rng.limit}};
+      s = db_->IngestExternalFiles({arg});
+      ASSERT_EQ(s.code(), Status::Code::kNotSupported);
+
+      arg.atomic_replace_range = {{&replace_rng.start, nullptr}};
+      s = db_->IngestExternalFiles({arg});
+      ASSERT_EQ(s.code(), Status::Code::kNotSupported);
+
+      arg.atomic_replace_range = {{&replace_rng.start, &replace_rng.limit}};
+
+      // rejected because doesn't cover ingested file
+      replace_rng = {"x", "z"};
+      s = db_->IngestExternalFiles({arg});
+      ASSERT_EQ(s.code(), Status::Code::kInvalidArgument);
+
+      // rejected because of partial file overlap
+      replace_rng = {"a", "c"};
+      s = db_->IngestExternalFiles({arg});
+      ASSERT_EQ(s.code(), Status::Code::kInvalidArgument);
+
+      if (!disallow_memtable) {
+        // memtable overlap with replace range
+        ASSERT_OK(Put(1, "e", "5"));
+        arg.options.allow_blocking_flush = false;
+
+        // rejected because of memtable overlap
+        replace_rng = {"a", "z"};
+        s = db_->IngestExternalFiles({arg});
+        ASSERT_EQ(s.code(), Status::Code::kInvalidArgument);
+
+        // rejected because of memtable overlap
+        arg.atomic_replace_range = {{nullptr, nullptr}};
+        s = db_->IngestExternalFiles({arg});
+        ASSERT_EQ(s.code(), Status::Code::kInvalidArgument);
+
+        arg.atomic_replace_range = {{&replace_rng.start, &replace_rng.limit}};
+
+        // FIXME: upper bound should be exclusive (DeleteRange semantics).
+        // currently rejected because of documented bug
+        replace_rng = {"a", "e"};
+        s = db_->IngestExternalFiles({arg});
+        ASSERT_EQ(s.code(), Status::Code::kInvalidArgument);
+
+        // work-around ensuring no memtable overlap
+        replace_rng = {"a", "d2"};
+        ASSERT_OK(db_->IngestExternalFiles({arg}));
+
+        ASSERT_EQ(Get(1, "e"), "5");
+      } else {
+        // rejected because of partial file overlap
+        replace_rng = {"b", "z"};
+        s = db_->IngestExternalFiles({arg});
+        ASSERT_EQ(s.code(), Status::Code::kInvalidArgument);
+
+        // no memtable complications
+        replace_rng = {"a", "z"};
+        ASSERT_OK(db_->IngestExternalFiles({arg}));
+
+        ASSERT_EQ(Get(1, "e"), "NOT_FOUND");
+      }
+      ASSERT_EQ(Get(1, "a"), "NOT_FOUND");
+      ASSERT_EQ(Get(1, "b"), "0");
+      ASSERT_EQ(Get(1, "c"), "NOT_FOUND");
+      ASSERT_EQ(Get(1, "d"), "NOT_FOUND");
+
+      // The single ingested file replaced everything (except perhaps memtable)
+      std::vector<LiveFileMetaData> live_files;
+      db_->GetLiveFilesMetaData(&live_files);
+      // One file in each CF
+      ASSERT_EQ(live_files.size(), 2);
+
+      ASSERT_OK(sfw.Open(file_path));
+      ASSERT_OK(sfw.Put("f", "6"));
+      ASSERT_OK(sfw.Finish());
+
+      // Another file
+      ASSERT_OK(sfw.Open(file_path2));
+      ASSERT_OK(sfw.Put("f", "7"));
+      ASSERT_OK(sfw.Put("g", "8"));
+      ASSERT_OK(sfw.Finish());
+
+      if (!disallow_memtable) {
+        // rejected because of memtable overlap with range
+        replace_rng = {"e", "z"};
+        s = db_->IngestExternalFiles({arg});
+        ASSERT_EQ(s.code(), Status::Code::kInvalidArgument);
+
+        // allow blocking flush of "e" (which is then replaced), and the file
+        // with just "b" is not replaced
+        arg.options.allow_blocking_flush = true;
+        ASSERT_OK(db_->IngestExternalFiles({arg}));
+
+        ASSERT_EQ(Get(1, "b"), "0");
+        ASSERT_EQ(Get(1, "e"), "NOT_FOUND");
+        ASSERT_EQ(Get(1, "f"), "6");
+        ASSERT_EQ(Get(1, "g"), "NOT_FOUND");
+
+        // memtable overlap with replace range
+        ASSERT_OK(Put(1, "e", "5"));
+        arg.options.allow_blocking_flush = false;
+        arg.external_files = {file_path2};
+
+        // rejected because of memtable overlap
+        arg.atomic_replace_range = {{nullptr, nullptr}};
+        s = db_->IngestExternalFiles({arg});
+        ASSERT_EQ(s.code(), Status::Code::kInvalidArgument);
+
+        // Replace everything, including with memtable flush
+        arg.options.allow_blocking_flush = true;
+        ASSERT_OK(db_->IngestExternalFiles({arg}));
+
+        ASSERT_EQ(Get(1, "b"), "NOT_FOUND");
+        ASSERT_EQ(Get(1, "e"), "NOT_FOUND");
+        ASSERT_EQ(Get(1, "f"), "7");
+        ASSERT_EQ(Get(1, "g"), "8");
+      } else {
+        arg.external_files = {file_path2, file_path};
+
+        // rejected because of overlap in files to ingest with fail_if_ = true
+        replace_rng = {"e", "z"};
+        s = db_->IngestExternalFiles({arg});
+        ASSERT_EQ(s.code(), Status::Code::kTryAgain);
+
+        arg.options.fail_if_not_bottommost_level = false;
+
+        // rejected because range doesn't cover ingested files
+        // FIXME: upper bound should be exclusive "g" instead
+        replace_rng = {"e", "f2"};
+        s = db_->IngestExternalFiles({arg});
+        ASSERT_EQ(s.code(), Status::Code::kInvalidArgument);
+
+        // Loaded into different levels, and the file with just "b" is not
+        // replaced
+        replace_rng = {"e", "z"};
+        ASSERT_OK(db_->IngestExternalFiles({arg}));
+
+        ASSERT_EQ(Get(1, "b"), "0");
+        ASSERT_EQ(Get(1, "f"), "6");  // earlier file listed later to ingest
+        ASSERT_EQ(Get(1, "g"), "8");
+      }
     }
   }
 }
diff --git a/db/external_sst_file_ingestion_job.cc b/db/external_sst_file_ingestion_job.cc
index a439189afa7e..f1d891628af6 100644
--- a/db/external_sst_file_ingestion_job.cc
+++ b/db/external_sst_file_ingestion_job.cc
@@ -29,6 +29,7 @@ Status ExternalSstFileIngestionJob::Prepare(
     const std::vector<std::string>& external_files_paths,
     const std::vector<std::string>& files_checksums,
     const std::vector<std::string>& files_checksum_func_names,
+    const std::optional<RangePtr>& atomic_replace_range,
     const Temperature& file_temperature, uint64_t next_file_number,
     SuperVersion* sv) {
   Status status;
@@ -80,15 +81,47 @@ Status ExternalSstFileIngestionJob::Prepare(
     std::sort(sorted_files.begin(), sorted_files.end(), file_range_checker_);
 
     for (size_t i = 0; i + 1 < num_files; i++) {
-      if (file_range_checker_.OverlapsWithPrev(sorted_files[i],
-                                               sorted_files[i + 1],
-                                               /* ranges_sorted= */ true)) {
+      if (file_range_checker_.Overlaps(*sorted_files[i], *sorted_files[i + 1],
+                                       /* known_sorted= */ true)) {
         files_overlap_ = true;
         break;
       }
     }
   }
 
+  if (atomic_replace_range.has_value()) {
+    atomic_replace_range_.emplace();
+
+    if (atomic_replace_range->start && atomic_replace_range->limit) {
+      // User keys to internal keys (with timestamps)
+      const size_t ts_sz = ucmp_->timestamp_size();
+      std::string start_with_ts, limit_with_ts;
+      auto [start, limit] = MaybeAddTimestampsToRange(
+          atomic_replace_range->start, atomic_replace_range->limit, ts_sz,
+          &start_with_ts, &limit_with_ts);
+      assert(start.has_value());
+      assert(limit.has_value());
+      atomic_replace_range_->smallest_internal_key.Set(
+          *start, kMaxSequenceNumber, kValueTypeForSeek);
+      atomic_replace_range_->largest_internal_key.Set(
+          *limit, kMaxSequenceNumber, kValueTypeForSeek);
+      // Check files to ingest against replace range
+      for (size_t i = 0; i < num_files; i++) {
+        if (!file_range_checker_.Contains(*atomic_replace_range_,
+                                          files_to_ingest_[i])) {
+          return Status::InvalidArgument(
+              "Atomic replace range does not contain all files");
+        }
+      }
+    } else {
+      // Currently if either bound is nullptr, both must be
+      assert(atomic_replace_range->start == nullptr);
+      assert(atomic_replace_range->limit == nullptr);
+      assert(atomic_replace_range_->smallest_internal_key.unset());
+      assert(atomic_replace_range_->largest_internal_key.unset());
+    }
+  }
+
   if (ingestion_options_.ingest_behind && files_overlap_) {
     return Status::NotSupported(
         "Files with overlapping ranges cannot be ingested with ingestion "
@@ -359,9 +392,9 @@ void ExternalSstFileIngestionJob::DivideInputFilesIntoBatches() {
 
   file_batches_to_ingest_.emplace_back(/* _track_batch_range= */ true);
   for (auto& file : files_to_ingest_) {
-    if (file_range_checker_.OverlapsWithPrev(&file_batches_to_ingest_.back(),
-                                             &file,
-                                             /* ranges_sorted= */ false)) {
+    if (!file_batches_to_ingest_.back().unset() &&
+        file_range_checker_.Overlaps(file_batches_to_ingest_.back(), file,
+                                     /* known_sorted= */ false)) {
       file_batches_to_ingest_.emplace_back(/* _track_batch_range= */ true);
     }
     file_batches_to_ingest_.back().AddFile(&file, file_range_checker_);
@@ -370,14 +403,32 @@ void ExternalSstFileIngestionJob::DivideInputFilesIntoBatches() {
 
 Status ExternalSstFileIngestionJob::NeedsFlush(bool* flush_needed,
                                                SuperVersion* super_version) {
-  size_t n = files_to_ingest_.size();
-  autovector<UserKeyRange> ranges;
-  ranges.reserve(n);
-  for (const IngestedFileInfo& file_to_ingest : files_to_ingest_) {
-    ranges.emplace_back(file_to_ingest.start_ukey, file_to_ingest.limit_ukey);
+  Status status;
+  if (atomic_replace_range_.has_value() && atomic_replace_range_->unset()) {
+    // For replacing whole CF, we can simply check whether memtable is empty
+    *flush_needed = !super_version->mem->IsEmpty();
+  } else {
+    autovector<UserKeyRange> ranges;
+    if (atomic_replace_range_.has_value()) {
+      assert(!atomic_replace_range_->smallest_internal_key.unset());
+      assert(!atomic_replace_range_->largest_internal_key.unset());
+      // NOTE: we already checked in Prepare() that the atomic_replace_range
+      // covers all the files_to_ingest
+      // FIXME: need to make upper bound key exclusive (not easy here because
+      // the existing internal APIs deal in inclusive upper bound user keys)
+      ranges.emplace_back(
+          atomic_replace_range_->smallest_internal_key.user_key(),
+          atomic_replace_range_->largest_internal_key.user_key());
+    } else {
+      ranges.reserve(files_to_ingest_.size());
+      for (const IngestedFileInfo& file_to_ingest : files_to_ingest_) {
+        ranges.emplace_back(file_to_ingest.start_ukey,
+                            file_to_ingest.limit_ukey);
+      }
+    }
+    status = cfd_->RangesOverlapWithMemtables(
+        ranges, super_version, db_options_.allow_data_in_errors, flush_needed);
   }
-  Status status = cfd_->RangesOverlapWithMemtables(
-      ranges, super_version, db_options_.allow_data_in_errors, flush_needed);
   if (status.ok() && *flush_needed) {
     if (!ingestion_options_.allow_blocking_flush) {
       status = Status::InvalidArgument("External file requires flush");
@@ -430,8 +481,49 @@ Status ExternalSstFileIngestionJob::Run() {
   // the only active writer, and hence they are equal
   SequenceNumber last_seqno = versions_->LastSequence();
   edit_.SetColumnFamily(cfd_->GetID());
-  // The levels that the files will be ingested into
 
+  if (atomic_replace_range_.has_value()) {
+    auto* vstorage = super_version->current->storage_info();
+    if (atomic_replace_range_->unset()) {
+      if (cfd_->compaction_picker()->IsCompactionInProgress()) {
+        return Status::InvalidArgument(
+            "Atomic replace range (full) overlaps with pending compaction");
+      }
+      for (int lvl = 0; lvl < cfd_->NumberLevels(); lvl++) {
+        for (auto file : vstorage->LevelFiles(lvl)) {
+          // Set up to delete file to be replaced
+          edit_.DeleteFile(lvl, file->fd.GetNumber());
+        }
+      }
+    } else {
+      assert(!atomic_replace_range_->smallest_internal_key.unset());
+      assert(!atomic_replace_range_->largest_internal_key.unset());
+      for (int lvl = 0; lvl < cfd_->NumberLevels(); lvl++) {
+        if (cfd_->RangeOverlapWithCompaction(
+                atomic_replace_range_->smallest_internal_key.user_key(),
+                atomic_replace_range_->largest_internal_key.user_key(), lvl)) {
+          return Status::InvalidArgument(
+              "Atomic replace range overlaps with pending compaction");
+        }
+        for (auto file : vstorage->LevelFiles(lvl)) {
+          if (file_range_checker_.Overlaps(*atomic_replace_range_,
+                                           file->smallest, file->largest)) {
+            if (file_range_checker_.Contains(*atomic_replace_range_,
+                                             file->smallest, file->largest)) {
+              // Set up to delete file to be replaced
+              edit_.DeleteFile(lvl, file->fd.GetNumber());
+            } else {
+              // TODO: generate and ingest a tombstone file also
+              return Status::InvalidArgument(
+                  "Atomic replace range partially overlaps with existing file");
+            }
+          }
+        }
+      }
+    }
+  }
+
+  // Find levels to ingest into
   std::optional<int> prev_batch_uppermost_level;
   for (auto& batch : file_batches_to_ingest_) {
     int batch_uppermost_level = 0;
@@ -1104,6 +1196,10 @@ Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile(
     if (lvl > 0 && lvl < vstorage->base_level()) {
       continue;
     }
+    if (atomic_replace_range_.has_value()) {
+      target_level = lvl;
+      continue;
+    }
     if (cfd_->RangeOverlapWithCompaction(file_to_ingest->start_ukey,
                                          file_to_ingest->limit_ukey, lvl)) {
       // We must use L0 or any level higher than `lvl` to be able to overwrite
@@ -1172,6 +1268,8 @@ Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile(
 
 Status ExternalSstFileIngestionJob::CheckLevelForIngestedBehindFile(
     IngestedFileInfo* file_to_ingest) {
+  assert(!atomic_replace_range_.has_value());
+
   auto* vstorage = cfd_->current()->storage_info();
   // First, check if new files fit in the last level
   int last_lvl = cfd_->NumberLevels() - 1;
diff --git a/db/external_sst_file_ingestion_job.h b/db/external_sst_file_ingestion_job.h
index 4a853afed971..3ca95a2b543a 100644
--- a/db/external_sst_file_ingestion_job.h
+++ b/db/external_sst_file_ingestion_job.h
@@ -27,50 +27,77 @@ class SystemClock;
 
 struct KeyRangeInfo {
   // Smallest internal key in an external file or for a batch of external files.
+  // unset() could be either invalid or "before all keys"
   InternalKey smallest_internal_key;
   // Largest internal key in an external file or for a batch of external files.
+  // unset() could be either invalid or "after all keys"
   InternalKey largest_internal_key;
 
-  bool empty() const {
-    return smallest_internal_key.size() == 0 &&
-           largest_internal_key.size() == 0;
+  bool unset() const {
+    // Legal internal keys are at least 8 bytes.
+    return smallest_internal_key.unset() || largest_internal_key.unset();
   }
 };
 
 // Helper class to apply SST file key range checks to the external files.
+// XXX: using sstableKeyCompare with user comparator on internal keys is
+// very broken
 class ExternalFileRangeChecker {
  public:
   explicit ExternalFileRangeChecker(const Comparator* ucmp) : ucmp_(ucmp) {}
 
   // Operator used for sorting ranges.
-  bool operator()(const KeyRangeInfo* prev_range,
-                  const KeyRangeInfo* range) const {
-    assert(prev_range);
-    assert(range);
-    return sstableKeyCompare(ucmp_, prev_range->smallest_internal_key,
-                             range->smallest_internal_key) < 0;
+  bool operator()(const KeyRangeInfo* range1,
+                  const KeyRangeInfo* range2) const {
+    assert(range1);
+    assert(range2);
+    assert(!range1->unset());
+    assert(!range2->unset());
+    return sstableKeyCompare(ucmp_, range1->smallest_internal_key,
+                             range2->smallest_internal_key) < 0;
   }
 
-  // Check whether `range` overlaps with `prev_range`. `ranges_sorted` can be
-  // set to true when the inputs are already sorted based on the sorting logic
-  // provided by this checker's operator(), which can help simplify the check.
-  bool OverlapsWithPrev(const KeyRangeInfo* prev_range,
-                        const KeyRangeInfo* range,
-                        bool ranges_sorted = false) const {
-    assert(prev_range);
-    assert(range);
-    if (prev_range->empty() || range->empty()) {
+  bool Overlaps(const KeyRangeInfo& range1, const KeyRangeInfo& range2,
+                bool known_sorted = false) const {
+    return Overlaps(range1, range2.smallest_internal_key,
+                    range2.largest_internal_key, known_sorted);
+  }
+  bool Overlaps(const KeyRangeInfo& range1, const InternalKey& range2_smallest,
+                const InternalKey& range2_largest,
+                bool known_sorted = false) const {
+    bool any_unset =
+        range1.unset() || range2_smallest.unset() || range2_largest.unset();
+    if (any_unset) {
+      assert(!any_unset);
       return false;
     }
-    if (ranges_sorted) {
-      return sstableKeyCompare(ucmp_, prev_range->largest_internal_key,
-                               range->smallest_internal_key) >= 0;
+    if (known_sorted) {
+      return sstableKeyCompare(ucmp_, range1.largest_internal_key,
+                               range2_smallest) >= 0;
     }
 
-    return sstableKeyCompare(ucmp_, prev_range->largest_internal_key,
-                             range->smallest_internal_key) >= 0 &&
-           sstableKeyCompare(ucmp_, prev_range->smallest_internal_key,
-                             range->largest_internal_key) <= 0;
+    return sstableKeyCompare(ucmp_, range1.largest_internal_key,
+                             range2_smallest) >= 0 &&
+           sstableKeyCompare(ucmp_, range1.smallest_internal_key,
+                             range2_largest) <= 0;
+  }
+
+  bool Contains(const KeyRangeInfo& range1, const KeyRangeInfo& range2) {
+    return Contains(range1, range2.smallest_internal_key,
+                    range2.largest_internal_key);
+  }
+  bool Contains(const KeyRangeInfo& range1, const InternalKey& range2_smallest,
+                const InternalKey& range2_largest) {
+    bool any_unset =
+        range1.unset() || range2_smallest.unset() || range2_largest.unset();
+    if (any_unset) {
+      assert(!any_unset);
+      return false;
+    }
+    return sstableKeyCompare(ucmp_, range1.smallest_internal_key,
+                             range2_smallest) <= 0 &&
+           sstableKeyCompare(ucmp_, range1.largest_internal_key,
+                             range2_largest) >= 0;
   }
 
   void MaybeUpdateRange(const InternalKey& start_key,
@@ -218,6 +245,7 @@ class ExternalSstFileIngestionJob {
   Status Prepare(const std::vector<std::string>& external_files_paths,
                  const std::vector<std::string>& files_checksums,
                  const std::vector<std::string>& files_checksum_func_names,
+                 const std::optional<RangePtr>& atomic_replace_range,
                  const Temperature& file_temperature, uint64_t next_file_number,
                  SuperVersion* sv);
 
@@ -362,6 +390,7 @@ class ExternalSstFileIngestionJob {
   autovector<IngestedFileInfo> files_to_ingest_;
   std::vector<FileBatchInfo> file_batches_to_ingest_;
   const IngestExternalFileOptions& ingestion_options_;
+  std::optional<KeyRangeInfo> atomic_replace_range_;
   Directories* directories_;
   EventLogger* event_logger_;
   VersionEdit edit_;
diff --git a/db/external_sst_file_test.cc b/db/external_sst_file_test.cc
index de261af7a01b..2e4cae427731 100644
--- a/db/external_sst_file_test.cc
+++ b/db/external_sst_file_test.cc
@@ -3915,9 +3915,7 @@ TEST_P(IngestDBGeneratedFileTest, FailureCase) {
     s = db_->IngestExternalFile(to_ingest_files, ingest_opts);
     ASSERT_TRUE(s.ToString().find(err) != std::string::npos);
     ASSERT_NOK(s);
-    if (options.compaction_style != kCompactionStyleUniversal) {
-      // FIXME: after fixing ingestion with universal compaction, currently
-      //  will always ingest into L0.
+    if (options.num_levels > 1) {
       ingest_opts.fail_if_not_bottommost_level = true;
       s = db_->IngestExternalFile(to_ingest_files, ingest_opts);
       ASSERT_NOK(s);
diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h
index 300af520ee9e..2a08bc2f1545 100644
--- a/include/rocksdb/db.h
+++ b/include/rocksdb/db.h
@@ -131,6 +131,28 @@ struct IngestExternalFileArg {
   std::vector<std::string> files_checksum_func_names;
   // A hint as to the temperature for *reading* the files to be ingested.
   Temperature file_temperature = Temperature::kUnknown;
+  // EXPERIMENTAL: When specified, existing keys in the given range will be
+  // cleared atomically as part of the ingestion, where the ingested files are
+  // logically applied on top of the cleared key range.
+  // * If both `start` and `limit` are nullptr, the entire column family is
+  // cleared; however, setting just one bound to nullptr is not yet supported.
+  // * When a range is specified, all the external files in this batch must
+  //   be contained in that key range.
+  // * Checks for memtable overlap and possible blocking flush will apply
+  //   to this range (not just the file ranges).
+  // * Not compatible with ingest_behind=true.
+  // * When options.snapshot_consistency = false, the range is cleared
+  // similarly to DeleteFilesInRange, but fails if any files overlap the range
+  // only partially.
+  //   * It is recommended to use fail_if_not_bottommost_level=true to ensure
+  //     data in the key range is ingested to a single compacted level (the
+  //     last level). (fail_if_not_bottommost_level=false allows overlap between
+  //     the ingested files.)
+  // * options.snapshot_consistency = true is not yet supported.
+  // BUG: the upper bound of the range may be interpreted as inclusive or
+  // exclusive, so it is best not to depend on one or the other until it is
+  // sorted out.
+  std::optional<RangePtr> atomic_replace_range;
 };
 
 struct GetMergeOperandsOptions {
diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index 66a31208ade4..a89bc467c30e 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -2267,8 +2267,11 @@ struct IngestExternalFileOptions {
   // during file ingestion in the DB (the conditions under which a global_seqno
   // must be assigned to the ingested file).
   bool allow_global_seqno = true;
-  // If set to false and the file key range overlaps with the memtable key range
-  // (memtable flush required), IngestExternalFile will fail.
+  // Normally (true), IngestExternalFile() will trigger and block for flushing
+  // memtable(s) if there is overlap between ingested files and memtable(s). If
+  // allow_blocking_flush is set to false, IngestExternalFile() will fail if the
+  // file key range overlaps with the memtable key range (memtable flush
+  // required).
   bool allow_blocking_flush = true;
   // Set to true if you would like duplicate keys in the file being ingested
   // to be skipped rather than overwriting existing data under that key.

From 0b815cf3b35182d91f378f6a647f61289c60942f Mon Sep 17 00:00:00 2001
From: Yu Zhang <yuzhangyu@fb.com>
Date: Fri, 21 Mar 2025 17:17:03 -0700
Subject: [PATCH 030/500] Add a
 CompactionJobStats.num_input_files_trivially_moved field (#13479)

Summary:
This PR adds a new field `CompactionJobStats.num_input_files_trivially_moved` representing the number of files this compaction trivially moved. It should either equal to the total number of input files, or being 0.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13479

Test Plan: Added tests

Reviewed By: hx235

Differential Revision: D71638796

Pulled By: jowlyzhang

fbshipit-source-id: 794c085408a0dc95f11874ca60fca3e6b5b92cba
---
 db/db_compaction_test.cc               | 30 ++++++++++++++++++++++++++
 db/db_impl/db_impl_compaction_flush.cc |  2 ++
 include/rocksdb/compaction_job_stats.h |  2 ++
 3 files changed, 34 insertions(+)

diff --git a/db/db_compaction_test.cc b/db/db_compaction_test.cc
index 80269f374dc1..58b9afde8e68 100644
--- a/db/db_compaction_test.cc
+++ b/db/db_compaction_test.cc
@@ -127,6 +127,19 @@ class DBCompactionTestWithParam
     exclusive_manual_compaction_ = std::get<1>(GetParam());
   }
 
+  class TrivialMoveEventListener : public EventListener {
+   public:
+    explicit TrivialMoveEventListener(size_t expected_trivially_moved_files)
+        : expected_trivially_moved_files_(expected_trivially_moved_files) {}
+    void OnCompactionBegin(DB* /*db*/, const CompactionJobInfo& ci) override {
+      ASSERT_EQ(ci.stats.num_input_files_trivially_moved,
+                expected_trivially_moved_files_);
+    }
+
+   private:
+    size_t expected_trivially_moved_files_ = 0;
+  };
+
   // Required if inheriting from testing::WithParamInterface<>
   static void SetUpTestCase() {}
   static void TearDownTestCase() {}
@@ -1301,6 +1314,9 @@ TEST_P(DBCompactionTestWithParam, TrivialMoveOneFile) {
 
   Options options = CurrentOptions();
   options.write_buffer_size = 100000000;
+  TrivialMoveEventListener* trivial_move_listener =
+      new TrivialMoveEventListener(1 /*expected_trivially_moved_files*/);
+  options.listeners.emplace_back(trivial_move_listener);
   options.max_subcompactions = max_subcompactions_;
   DestroyAndReopen(options);
 
@@ -1361,6 +1377,10 @@ TEST_P(DBCompactionTestWithParam, TrivialMoveNonOverlappingFiles) {
 
   Options options = CurrentOptions();
   options.disable_auto_compactions = true;
+  // 8 is number of `ranges` that each is a non overlapping file.
+  TrivialMoveEventListener* trivial_move_listener =
+      new TrivialMoveEventListener(8 /*expected_trivially_moved_files*/);
+  options.listeners.emplace_back(trivial_move_listener);
   options.write_buffer_size = 10 * 1024 * 1024;
   options.max_subcompactions = max_subcompactions_;
 
@@ -1408,6 +1428,11 @@ TEST_P(DBCompactionTestWithParam, TrivialMoveNonOverlappingFiles) {
   trivial_move = 0;
   non_trivial_move = 0;
   values.clear();
+  options.listeners.clear();
+  // Same ranges of files, but now overlapping, trivial move not applicable.
+  TrivialMoveEventListener* trivial_move_listener2 =
+      new TrivialMoveEventListener(0 /*expected_trivially_moved_files*/);
+  options.listeners.emplace_back(trivial_move_listener2);
   DestroyAndReopen(options);
   // Same ranges as above but overlapping
   ranges = {
@@ -1455,6 +1480,11 @@ TEST_P(DBCompactionTestWithParam, TrivialMoveTargetLevel) {
 
   Options options = CurrentOptions();
   options.disable_auto_compactions = true;
+  // Two non overlapping files in L0 trivialy moved:
+  // file 1 [0 => 300], file 2 [600 => 700]
+  TrivialMoveEventListener* trivial_move_listener1 =
+      new TrivialMoveEventListener(2 /*expected_trivially_moved_files*/);
+  options.listeners.emplace_back(trivial_move_listener1);
   options.write_buffer_size = 10 * 1024 * 1024;
   options.num_levels = 7;
   options.max_subcompactions = max_subcompactions_;
diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc
index 07d446186727..9ed28906ecbc 100644
--- a/db/db_impl/db_impl_compaction_flush.cc
+++ b/db/db_impl/db_impl_compaction_flush.cc
@@ -3802,6 +3802,8 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
     compaction_job_stats.num_input_files = c->num_input_files(0);
     // Trivial moves do not get compacted remotely
     compaction_job_stats.is_remote_compaction = false;
+    compaction_job_stats.num_input_files_trivially_moved =
+        compaction_job_stats.num_input_files;
 
     NotifyOnCompactionBegin(c->column_family_data(), c.get(), status,
                             compaction_job_stats, job_context->job_id);
diff --git a/include/rocksdb/compaction_job_stats.h b/include/rocksdb/compaction_job_stats.h
index ba3fcebcc905..0af8c3eb689b 100644
--- a/include/rocksdb/compaction_job_stats.h
+++ b/include/rocksdb/compaction_job_stats.h
@@ -33,6 +33,8 @@ struct CompactionJobStats {
   uint64_t num_blobs_read = 0;
   // the number of compaction input files (table files)
   size_t num_input_files = 0;
+  // The number of input files that get trivially moved.
+  size_t num_input_files_trivially_moved = 0;
   // the number of compaction input files at the output level (table files)
   size_t num_input_files_at_output_level = 0;
   // the number of compaction input files that are filtered out by compaction

From 934cf2d40dc77905ec565ffec92bb54689c3199c Mon Sep 17 00:00:00 2001
From: Yu Zhang <yuzhangyu@fb.com>
Date: Fri, 21 Mar 2025 17:23:01 -0700
Subject: [PATCH 031/500] Implement the DB::GetPropertiesOfTablesForLevels API
 (#13469)

Summary:
As titled. This API returns the table properties of files per level. It can be handy for use cases that needed file's leveling info while retrieving TableProperties. We will use this API to later aggregate per level data write time info.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13469

Test Plan: Added unit tests

Reviewed By: pdillinger

Differential Revision: D71353096

Pulled By: jowlyzhang

fbshipit-source-id: dc1fbb2c97e4365fc8d7241f9a59c65fbf4fb766
---
 db/db_impl/db_impl.cc                         | 23 +++++++++
 db/db_impl/db_impl.h                          |  5 ++
 db/db_table_properties_test.cc                | 50 +++++++++++++++++++
 db/db_test.cc                                 |  9 ++++
 db/version_set.cc                             | 24 +++++++--
 db/version_set.h                              | 10 ++--
 include/rocksdb/db.h                          | 13 ++---
 include/rocksdb/utilities/stackable_db.h      |  8 +++
 .../table_properties_for_levels.md            |  1 +
 9 files changed, 129 insertions(+), 14 deletions(-)
 create mode 100644 unreleased_history/new_features/table_properties_for_levels.md

diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc
index 31512116f275..ef14c20f4c9f 100644
--- a/db/db_impl/db_impl.cc
+++ b/db/db_impl/db_impl.cc
@@ -4424,6 +4424,29 @@ Status DBImpl::GetPropertiesOfTablesInRange(ColumnFamilyHandle* column_family,
   return s;
 }
 
+Status DBImpl::GetPropertiesOfTablesByLevel(
+    ColumnFamilyHandle* column_family,
+    std::vector<std::unique_ptr<TablePropertiesCollection>>* props_by_level) {
+  auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+  auto cfd = cfh->cfd();
+
+  // Increment the ref count
+  mutex_.Lock();
+  auto version = cfd->current();
+  version->Ref();
+  mutex_.Unlock();
+
+  const ReadOptions read_options;
+  auto s = version->GetPropertiesOfTablesByLevel(read_options, props_by_level);
+
+  // Decrement the ref count
+  mutex_.Lock();
+  version->Unref();
+  mutex_.Unlock();
+
+  return s;
+}
+
 const std::string& DBImpl::GetName() const { return dbname_; }
 
 Env* DBImpl::GetEnv() const { return env_; }
diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h
index 11cf2347831f..45970fdd4bbd 100644
--- a/db/db_impl/db_impl.h
+++ b/db/db_impl/db_impl.h
@@ -651,6 +651,11 @@ class DBImpl : public DB {
       ColumnFamilyHandle* column_family, const Range* range, std::size_t n,
       TablePropertiesCollection* props) override;
 
+  Status GetPropertiesOfTablesByLevel(
+      ColumnFamilyHandle* column_family,
+      std::vector<std::unique_ptr<TablePropertiesCollection>>* props_by_level)
+      override;
+
   // ---- End of implementations of the DB interface ----
   SystemClock* GetSystemClock() const;
 
diff --git a/db/db_table_properties_test.cc b/db/db_table_properties_test.cc
index a899c03e2935..d83a5f5aec9b 100644
--- a/db/db_table_properties_test.cc
+++ b/db/db_table_properties_test.cc
@@ -229,6 +229,56 @@ TEST_F(DBTablePropertiesTest, CreateOnDeletionCollectorFactory) {
   ASSERT_EQ(0.5, del_factory->GetDeletionRatio());
 }
 
+TEST_F(DBTablePropertiesTest, GetPropertiesOfTablesByLevelTest) {
+  Random rnd(202);
+  Options options;
+  options.level_compaction_dynamic_level_bytes = false;
+  options.create_if_missing = true;
+  options.write_buffer_size = 4096;
+  options.max_write_buffer_number = 2;
+  options.level0_file_num_compaction_trigger = 2;
+  options.level0_slowdown_writes_trigger = 2;
+  options.level0_stop_writes_trigger = 2;
+  options.target_file_size_base = 2048;
+  options.max_bytes_for_level_base = 40960;
+  options.max_bytes_for_level_multiplier = 4;
+  options.hard_pending_compaction_bytes_limit = 16 * 1024;
+  options.num_levels = 8;
+  options.env = env_;
+
+  DestroyAndReopen(options);
+
+  // build a decent LSM
+  for (int i = 0; i < 10000; i++) {
+    EXPECT_OK(Put(test::RandomKey(&rnd, 5), rnd.RandomString(102)));
+  }
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  if (NumTableFilesAtLevel(0) == 0) {
+    EXPECT_OK(Put(test::RandomKey(&rnd, 5), rnd.RandomString(102)));
+    ASSERT_OK(Flush());
+  }
+
+  ASSERT_OK(db_->PauseBackgroundWork());
+
+  // Ensure that we have at least L0, L1 and L2
+  ASSERT_GT(NumTableFilesAtLevel(0), 0);
+  ASSERT_GT(NumTableFilesAtLevel(1), 0);
+  ASSERT_GT(NumTableFilesAtLevel(2), 0);
+  ColumnFamilyMetaData cf_meta;
+  db_->GetColumnFamilyMetaData(&cf_meta);
+  std::vector<std::unique_ptr<TablePropertiesCollection>> levels_props;
+  ASSERT_OK(db_->GetPropertiesOfTablesByLevel(db_->DefaultColumnFamily(),
+                                              &levels_props));
+  for (int i = 0; i < 8; i++) {
+    const std::unique_ptr<TablePropertiesCollection>& level_props =
+        levels_props[i];
+    ASSERT_EQ(level_props->size(), cf_meta.levels[i].files.size());
+  }
+
+  Close();
+}
+
 // Test params:
 // 1) whether to enable user-defined timestamps
 class DBTablePropertiesInRangeTest : public DBTestBase,
diff --git a/db/db_test.cc b/db/db_test.cc
index e30f2dd95aaf..4ce772a48516 100644
--- a/db/db_test.cc
+++ b/db/db_test.cc
@@ -3182,6 +3182,15 @@ class ModelDB : public DB {
     return Status();
   }
 
+  using DB::GetPropertiesOfTablesByLevel;
+  Status GetPropertiesOfTablesByLevel(
+      ColumnFamilyHandle* /* column_family */,
+      std::vector<
+          std::unique_ptr<TablePropertiesCollection>>* /* props_by_level */)
+      override {
+    return Status();
+  }
+
   using DB::KeyMayExist;
   bool KeyMayExist(const ReadOptions& /*options*/,
                    ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/,
diff --git a/db/version_set.cc b/db/version_set.cc
index 67d5c9fab3f7..2daf5b957390 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -1627,8 +1627,8 @@ Status Version::GetTableProperties(const ReadOptions& read_options,
   return s;
 }
 
-Status Version::GetPropertiesOfAllTables(const ReadOptions& read_options,
-                                         TablePropertiesCollection* props) {
+Status Version::GetPropertiesOfAllTables(
+    const ReadOptions& read_options, TablePropertiesCollection* props) const {
   Status s;
   for (int level = 0; level < storage_info_.num_levels_; level++) {
     s = GetPropertiesOfAllTables(read_options, props, level);
@@ -1699,7 +1699,7 @@ Status Version::TablesRangeTombstoneSummary(int max_entries_to_print,
 
 Status Version::GetPropertiesOfAllTables(const ReadOptions& read_options,
                                          TablePropertiesCollection* props,
-                                         int level) {
+                                         int level) const {
   for (const auto& file_meta : storage_info_.files_[level]) {
     auto fname =
         TableFileName(cfd_->ioptions().cf_paths, file_meta->fd.GetNumber(),
@@ -1753,6 +1753,24 @@ Status Version::GetPropertiesOfTablesInRange(
   return Status::OK();
 }
 
+Status Version::GetPropertiesOfTablesByLevel(
+    const ReadOptions& read_options,
+    std::vector<std::unique_ptr<TablePropertiesCollection>>* props_by_level)
+    const {
+  Status s;
+
+  props_by_level->reserve(storage_info_.num_levels_);
+  for (int level = 0; level < storage_info_.num_levels_; level++) {
+    props_by_level->push_back(std::make_unique<TablePropertiesCollection>());
+    s = GetPropertiesOfAllTables(read_options, props_by_level->back().get(),
+                                 level);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+  return Status::OK();
+}
+
 Status Version::GetAggregatedTableProperties(
     const ReadOptions& read_options, std::shared_ptr<const TableProperties>* tp,
     int level) {
diff --git a/db/version_set.h b/db/version_set.h
index 782f38846b55..2d81dfce73b1 100644
--- a/db/version_set.h
+++ b/db/version_set.h
@@ -994,17 +994,21 @@ class Version {
                             const FileMetaData* file_meta,
                             const std::string* fname = nullptr) const;
 
-  // REQUIRES: lock is held
   // On success, *props will be populated with all SSTables' table properties.
   // The keys of `props` are the sst file name, the values of `props` are the
   // tables' properties, represented as std::shared_ptr.
   Status GetPropertiesOfAllTables(const ReadOptions& read_options,
-                                  TablePropertiesCollection* props);
+                                  TablePropertiesCollection* props) const;
   Status GetPropertiesOfAllTables(const ReadOptions& read_options,
-                                  TablePropertiesCollection* props, int level);
+                                  TablePropertiesCollection* props,
+                                  int level) const;
   Status GetPropertiesOfTablesInRange(const ReadOptions& read_options,
                                       const autovector<UserKeyRange>& ranges,
                                       TablePropertiesCollection* props) const;
+  Status GetPropertiesOfTablesByLevel(
+      const ReadOptions& read_options,
+      std::vector<std::unique_ptr<TablePropertiesCollection>>* props_by_level)
+      const;
 
   // Print summary of range delete tombstones in SST files into out_str,
   // with maximum max_entries_to_print entries printed out.
diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h
index 2a08bc2f1545..3ac1ea02761d 100644
--- a/include/rocksdb/db.h
+++ b/include/rocksdb/db.h
@@ -2103,14 +2103,11 @@ class DB {
       ColumnFamilyHandle* column_family, const Range* range, std::size_t n,
       TablePropertiesCollection* props) = 0;
 
-  // Get the table properties of files per level.
-  virtual Status GetPropertiesOfTablesForLevels(
-      ColumnFamilyHandle* /* column_family */,
-      std::vector<
-          std::unique_ptr<TablePropertiesCollection>>* /* levels_props */) {
-    return Status::NotSupported(
-        "GetPropertiesOfTablesForLevels() is not implemented.");
-  }
+  // Get the table properties of files by level.
+  virtual Status GetPropertiesOfTablesByLevel(
+      ColumnFamilyHandle* column_family,
+      std::vector<std::unique_ptr<TablePropertiesCollection>>*
+          props_by_level) = 0;
 
   virtual Status SuggestCompactRange(ColumnFamilyHandle* /*column_family*/,
                                      const Slice* /*begin*/,
diff --git a/include/rocksdb/utilities/stackable_db.h b/include/rocksdb/utilities/stackable_db.h
index 244989a6c98e..4cea4dafff5d 100644
--- a/include/rocksdb/utilities/stackable_db.h
+++ b/include/rocksdb/utilities/stackable_db.h
@@ -554,6 +554,14 @@ class StackableDB : public DB {
     return db_->GetPropertiesOfTablesInRange(column_family, range, n, props);
   }
 
+  using DB::GetPropertiesOfTablesByLevel;
+  Status GetPropertiesOfTablesByLevel(
+      ColumnFamilyHandle* column_family,
+      std::vector<std::unique_ptr<TablePropertiesCollection>>* props_by_level)
+      override {
+    return db_->GetPropertiesOfTablesByLevel(column_family, props_by_level);
+  }
+
   Status GetUpdatesSince(
       SequenceNumber seq_number, std::unique_ptr<TransactionLogIterator>* iter,
       const TransactionLogIterator::ReadOptions& read_options) override {
diff --git a/unreleased_history/new_features/table_properties_for_levels.md b/unreleased_history/new_features/table_properties_for_levels.md
new file mode 100644
index 000000000000..d0b2653c8d3c
--- /dev/null
+++ b/unreleased_history/new_features/table_properties_for_levels.md
@@ -0,0 +1 @@
+Implemented API DB::GetPropertiesOfTablesByLevel that retrieves table properties for files in each LSM tree level
\ No newline at end of file

From 82794e0a4f3116878db210bce7e3768a68c47173 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Mon, 24 Mar 2025 17:08:17 -0700
Subject: [PATCH 032/500] Deprecate RangePtr, favor new RangeOpt and OptSlice
 (#13481)

Summary:
The new API in https://github.com/facebook/rocksdb/issues/13453 is awkward and precarious because of using RangePtr, which encodes optional keys using raw pointers to Slice. We could use `std::optional<Slice>` instead but that is unsatisfyingly a larger object with an inefficient size (typically 17 bytes).

Here I introduce a custom optional Slice type, `OptSlice`, that is the same size as a Slice, and use it in a number of places to clean up code and make some public APIs easier to work with. This includes

* `atomic_replace_range` (not yet released, OK to change)
* `GetAllKeyVersions()` which gets a behavior change because of its unusual handling of empty keys.
* `DeleteFilesInRanges()`
* TODO in follow-up: `CompactRange()`

Most of the diff is associated updates and refactorings. Also

* Move some relevant things out of db.h to keep it as tidy as possible.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13481

Test Plan: tests updated

Reviewed By: hx235

Differential Revision: D71747774

Pulled By: pdillinger

fbshipit-source-id: b4c8519608d119b8bceca9bb0fd778608f62a141
---
 db/compaction/tiered_compaction_test.cc       |  6 +-
 db/convenience.cc                             | 11 ++++
 db/db_basic_test.cc                           | 14 +++--
 db/db_compaction_test.cc                      | 13 ++--
 db/db_impl/db_impl.cc                         | 30 ++++-----
 db/db_impl/db_impl.h                          |  2 +-
 db/db_impl/db_impl_compaction_flush.cc        |  3 +-
 db/db_merge_operator_test.cc                  |  6 +-
 db/db_table_properties_test.cc                |  2 +-
 db/db_test.cc                                 |  3 +-
 db/dbformat.h                                 | 11 ++--
 db/external_sst_file_basic_test.cc            | 33 +++++-----
 db/external_sst_file_ingestion_job.cc         |  8 +--
 db/external_sst_file_ingestion_job.h          |  2 +-
 db/seqno_time_test.cc                         |  3 +-
 include/rocksdb/convenience.h                 | 16 +++++
 include/rocksdb/db.h                          | 59 ------------------
 include/rocksdb/options.h                     | 61 +++++++++++++++++++
 include/rocksdb/slice.h                       | 41 +++++++++++++
 include/rocksdb/utilities/debug.h             |  6 +-
 java/rocksjni/rocksjni.cc                     | 18 +++---
 table/sst_file_dumper.cc                      |  9 ++-
 tools/ldb_cmd.cc                              |  5 +-
 .../public_api_changes/optslice.md            |  2 +
 util/udt_util.cc                              |  9 ++-
 util/udt_util.h                               | 13 ++--
 utilities/blob_db/blob_db_test.cc             | 14 ++---
 utilities/debug.cc                            | 19 +++---
 28 files changed, 239 insertions(+), 180 deletions(-)
 create mode 100644 unreleased_history/public_api_changes/optslice.md

diff --git a/db/compaction/tiered_compaction_test.cc b/db/compaction/tiered_compaction_test.cc
index 5c21f8487572..ba32dcbb05e2 100644
--- a/db/compaction/tiered_compaction_test.cc
+++ b/db/compaction/tiered_compaction_test.cc
@@ -1713,8 +1713,7 @@ TEST_P(PrecludeLastLevelTest, MigrationFromPreserveTimePartial) {
   ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
 
   std::vector<KeyVersion> key_versions;
-  ASSERT_OK(GetAllKeyVersions(db_, Slice(), Slice(),
-                              std::numeric_limits<size_t>::max(),
+  ASSERT_OK(GetAllKeyVersions(db_, {}, {}, std::numeric_limits<size_t>::max(),
                               &key_versions));
 
   // make sure there're more than 300 keys and first 100 keys are having seqno
@@ -2303,8 +2302,7 @@ TEST_P(PrecludeLastLevelTest, LastLevelOnlyCompactionPartial) {
   ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
 
   std::vector<KeyVersion> key_versions;
-  ASSERT_OK(GetAllKeyVersions(db_, Slice(), Slice(),
-                              std::numeric_limits<size_t>::max(),
+  ASSERT_OK(GetAllKeyVersions(db_, {}, {}, std::numeric_limits<size_t>::max(),
                               &key_versions));
 
   // make sure there're more than 300 keys and first 100 keys are having seqno
diff --git a/db/convenience.cc b/db/convenience.cc
index 47ce59f2f8d1..384854a1e0f7 100644
--- a/db/convenience.cc
+++ b/db/convenience.cc
@@ -26,6 +26,17 @@ Status DeleteFilesInRange(DB* db, ColumnFamilyHandle* column_family,
 
 Status DeleteFilesInRanges(DB* db, ColumnFamilyHandle* column_family,
                            const RangePtr* ranges, size_t n, bool include_end) {
+  std::vector<RangeOpt> range_opts(n);
+  for (size_t i = 0; i < n; ++i) {
+    range_opts[i] = {OptSlice::CopyFromPtr(ranges[i].start),
+                     OptSlice::CopyFromPtr(ranges[i].limit)};
+  }
+  return DeleteFilesInRanges(db, column_family, range_opts.data(), n,
+                             include_end);
+}
+
+Status DeleteFilesInRanges(DB* db, ColumnFamilyHandle* column_family,
+                           const RangeOpt* ranges, size_t n, bool include_end) {
   return (static_cast_with_check<DBImpl>(db->GetRootDB()))
       ->DeleteFilesInRanges(column_family, ranges, n, include_end);
 }
diff --git a/db/db_basic_test.cc b/db/db_basic_test.cc
index 08b6486df965..55323d29de8d 100644
--- a/db/db_basic_test.cc
+++ b/db/db_basic_test.cc
@@ -3288,8 +3288,7 @@ TEST_F(DBBasicTest, GetAllKeyVersions) {
     ASSERT_OK(Delete(std::to_string(i)));
   }
   std::vector<KeyVersion> key_versions;
-  ASSERT_OK(GetAllKeyVersions(db_, Slice(), Slice(),
-                              std::numeric_limits<size_t>::max(),
+  ASSERT_OK(GetAllKeyVersions(db_, {}, {}, std::numeric_limits<size_t>::max(),
                               &key_versions));
   ASSERT_EQ(kNumInserts + kNumDeletes + kNumUpdates, key_versions.size());
   for (size_t i = 0; i < kNumInserts + kNumDeletes + kNumUpdates; i++) {
@@ -3299,7 +3298,7 @@ TEST_F(DBBasicTest, GetAllKeyVersions) {
       ASSERT_EQ(key_versions[i].GetTypeName(), "TypeValue");
     }
   }
-  ASSERT_OK(GetAllKeyVersions(db_, handles_[0], Slice(), Slice(),
+  ASSERT_OK(GetAllKeyVersions(db_, handles_[0], {}, {},
                               std::numeric_limits<size_t>::max(),
                               &key_versions));
   ASSERT_EQ(kNumInserts + kNumDeletes + kNumUpdates, key_versions.size());
@@ -3314,10 +3313,17 @@ TEST_F(DBBasicTest, GetAllKeyVersions) {
   for (size_t i = 0; i + 1 != kNumDeletes; ++i) {
     ASSERT_OK(Delete(1, std::to_string(i)));
   }
-  ASSERT_OK(GetAllKeyVersions(db_, handles_[1], Slice(), Slice(),
+  ASSERT_OK(GetAllKeyVersions(db_, handles_[1], {}, {},
                               std::numeric_limits<size_t>::max(),
                               &key_versions));
   ASSERT_EQ(kNumInserts + kNumDeletes + kNumUpdates - 3, key_versions.size());
+
+  // Change from historical behavior: empty key is now interpreted literally as
+  // a legal key (rather than as a "not present" key)
+  ASSERT_OK(GetAllKeyVersions(db_, handles_[1], Slice(), Slice(),
+                              std::numeric_limits<size_t>::max(),
+                              &key_versions));
+  ASSERT_EQ(key_versions.size(), 0);
 }
 
 TEST_F(DBBasicTest, ValueTypeString) {
diff --git a/db/db_compaction_test.cc b/db/db_compaction_test.cc
index 58b9afde8e68..b539251a2998 100644
--- a/db/db_compaction_test.cc
+++ b/db/db_compaction_test.cc
@@ -2117,13 +2117,10 @@ TEST_P(DBDeleteFileRangeTest, DeleteFilesInRanges) {
     auto begin_str1 = Key(0), end_str1 = Key(100);
     auto begin_str2 = Key(100), end_str2 = Key(200);
     auto begin_str3 = Key(200), end_str3 = Key(299);
-    Slice begin1(begin_str1), end1(end_str1);
-    Slice begin2(begin_str2), end2(end_str2);
-    Slice begin3(begin_str3), end3(end_str3);
-    std::vector<RangePtr> ranges;
-    ranges.emplace_back(&begin1, &end1);
-    ranges.emplace_back(&begin2, &end2);
-    ranges.emplace_back(&begin3, &end3);
+    std::vector<RangeOpt> ranges;
+    ranges.emplace_back(begin_str1, end_str1);
+    ranges.emplace_back(begin_str2, end_str2);
+    ranges.emplace_back(begin_str3, end_str3);
     ASSERT_OK(DeleteFilesInRanges(db_, db_->DefaultColumnFamily(),
                                   ranges.data(), ranges.size()));
     ASSERT_EQ("0,3,7", FilesPerLevel(0));
@@ -2171,7 +2168,7 @@ TEST_P(DBDeleteFileRangeTest, DeleteFilesInRanges) {
 
   // Delete all files.
   {
-    RangePtr range;
+    RangeOpt range;
     ASSERT_OK(DeleteFilesInRanges(db_, db_->DefaultColumnFamily(), &range, 1));
     ASSERT_EQ("", FilesPerLevel(0));
 
diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc
index ef14c20f4c9f..fa33bd62d54d 100644
--- a/db/db_impl/db_impl.cc
+++ b/db/db_impl/db_impl.cc
@@ -80,6 +80,7 @@
 #include "port/port.h"
 #include "rocksdb/cache.h"
 #include "rocksdb/compaction_filter.h"
+#include "rocksdb/convenience.h"
 #include "rocksdb/db.h"
 #include "rocksdb/env.h"
 #include "rocksdb/merge_operator.h"
@@ -4407,7 +4408,7 @@ Status DBImpl::GetPropertiesOfTablesInRange(ColumnFamilyHandle* column_family,
   // Add timestamp if needed
   for (size_t i = 0; i < n; i++) {
     auto [start, limit] = MaybeAddTimestampsToRange(
-        &range[i].start, &range[i].limit, ts_sz, &keys.emplace_back(),
+        range[i].start, range[i].limit, ts_sz, &keys.emplace_back(),
         &keys.emplace_back(), /*exclusive_end=*/false);
     assert(start.has_value());
     assert(limit.has_value());
@@ -4747,7 +4748,7 @@ void DBImpl::GetApproximateMemTableStats(ColumnFamilyHandle* column_family,
   // Add timestamp if needed
   std::string start_with_ts, limit_with_ts;
   auto [start, limit] = MaybeAddTimestampsToRange(
-      &range.start, &range.limit, ts_sz, &start_with_ts, &limit_with_ts);
+      range.start, range.limit, ts_sz, &start_with_ts, &limit_with_ts);
   assert(start.has_value());
   assert(limit.has_value());
   // Convert user_key into a corresponding internal key.
@@ -4785,9 +4786,8 @@ Status DBImpl::GetApproximateSizes(const SizeApproximationOptions& options,
   for (int i = 0; i < n; i++) {
     // Add timestamp if needed
     std::string start_with_ts, limit_with_ts;
-    auto [start, limit] =
-        MaybeAddTimestampsToRange(&range[i].start, &range[i].limit, ts_sz,
-                                  &start_with_ts, &limit_with_ts);
+    auto [start, limit] = MaybeAddTimestampsToRange(
+        range[i].start, range[i].limit, ts_sz, &start_with_ts, &limit_with_ts);
     assert(start.has_value());
     assert(limit.has_value());
     // Convert user_key into a corresponding internal key.
@@ -4863,7 +4863,7 @@ Status DBImpl::GetUpdatesSince(
 }
 
 Status DBImpl::DeleteFilesInRanges(ColumnFamilyHandle* column_family,
-                                   const RangePtr* ranges, size_t n,
+                                   const RangeOpt* ranges, size_t n,
                                    bool include_end) {
   // TODO: plumb Env::IOActivity, Env::IOPriority
   const ReadOptions read_options;
@@ -4875,7 +4875,7 @@ Status DBImpl::DeleteFilesInRanges(ColumnFamilyHandle* column_family,
   const Comparator* ucmp = cfd->user_comparator();
   assert(ucmp);
   const size_t ts_sz = ucmp->timestamp_size();
-  autovector<UserKeyRangePtr> ukey_ranges;
+  autovector<UserKeyRangeOpt> ukey_ranges;
   std::vector<std::string> keys;
   std::vector<Slice> key_slices;
   ukey_ranges.reserve(n);
@@ -4885,8 +4885,8 @@ Status DBImpl::DeleteFilesInRanges(ColumnFamilyHandle* column_family,
     auto [start, limit] = MaybeAddTimestampsToRange(
         ranges[i].start, ranges[i].limit, ts_sz, &keys.emplace_back(),
         &keys.emplace_back(), !include_end);
-    assert((ranges[i].start != nullptr) == start.has_value());
-    assert((ranges[i].limit != nullptr) == limit.has_value());
+    assert(ranges[i].start.has_value() == start.has_value());
+    assert(ranges[i].limit.has_value() == limit.has_value());
     ukey_ranges.emplace_back(start, limit);
   }
 
@@ -5834,10 +5834,10 @@ Status DBImpl::IngestExternalFiles(
             "atomic_replace_range not yet supported with "
             "snapshot_consistency.");
       } else {
-        if ((arg.atomic_replace_range->start == nullptr) ^
-            (arg.atomic_replace_range->limit == nullptr)) {
+        if (arg.atomic_replace_range->start.has_value() ^
+            arg.atomic_replace_range->limit.has_value()) {
           return Status::NotSupported(
-              "Only one of atomic_replace_range.{start,limit} == nullptr is "
+              "Only one of atomic_replace_range.{start,limit}.has_value() is "
               "not supported.");
         }
       }
@@ -6299,9 +6299,9 @@ Status DBImpl::ClipColumnFamily(ColumnFamilyHandle* column_family,
 
   if (status.ok()) {
     // DeleteFilesInRanges non-overlap files except L0
-    std::vector<RangePtr> ranges;
-    ranges.emplace_back(nullptr, &begin_key);
-    ranges.emplace_back(&end_key, nullptr);
+    std::vector<RangeOpt> ranges;
+    ranges.emplace_back(OptSlice{}, begin_key);
+    ranges.emplace_back(end_key, OptSlice{});
     status = DeleteFilesInRanges(column_family, ranges.data(), ranges.size());
   }
 
diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h
index 45970fdd4bbd..d8530cd7a98a 100644
--- a/db/db_impl/db_impl.h
+++ b/db/db_impl/db_impl.h
@@ -542,7 +542,7 @@ class DBImpl : public DB {
       const TransactionLogIterator::ReadOptions& read_options =
           TransactionLogIterator::ReadOptions()) override;
   Status DeleteFilesInRanges(ColumnFamilyHandle* column_family,
-                             const RangePtr* ranges, size_t n,
+                             const RangeOpt* ranges, size_t n,
                              bool include_end = true);
 
   void GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata) override;
diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc
index 9ed28906ecbc..d0b1074b3b25 100644
--- a/db/db_impl/db_impl_compaction_flush.cc
+++ b/db/db_impl/db_impl_compaction_flush.cc
@@ -981,7 +981,8 @@ Status DBImpl::CompactRange(const CompactRangeOptions& options,
 
   std::string begin_str, end_str;
   auto [begin, end] =
-      MaybeAddTimestampsToRange(begin_without_ts, end_without_ts, ts_sz,
+      MaybeAddTimestampsToRange(OptSlice::CopyFromPtr(begin_without_ts),
+                                OptSlice::CopyFromPtr(end_without_ts), ts_sz,
                                 &begin_str, &end_str, false /*exclusive_end*/);
 
   return CompactRangeInternal(
diff --git a/db/db_merge_operator_test.cc b/db/db_merge_operator_test.cc
index 69f6ec4e9185..0f6d05d0527c 100644
--- a/db/db_merge_operator_test.cc
+++ b/db/db_merge_operator_test.cc
@@ -971,7 +971,7 @@ TEST_F(DBMergeOperatorTest, MaxSuccessiveMergesBaseValues) {
 
   // No base value
   {
-    constexpr char key[] = "key1";
+    const std::string key = "key1";
 
     ASSERT_OK(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), key, foo));
     ASSERT_OK(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), key, bar));
@@ -994,7 +994,7 @@ TEST_F(DBMergeOperatorTest, MaxSuccessiveMergesBaseValues) {
 
   // Plain base value
   {
-    constexpr char key[] = "key2";
+    const std::string key = "key2";
 
     ASSERT_OK(db_->Put(WriteOptions(), db_->DefaultColumnFamily(), key, foo));
     ASSERT_OK(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), key, bar));
@@ -1019,7 +1019,7 @@ TEST_F(DBMergeOperatorTest, MaxSuccessiveMergesBaseValues) {
 
   // Wide-column base value
   {
-    constexpr char key[] = "key3";
+    const std::string key = "key3";
     const WideColumns columns{{kDefaultWideColumnName, foo}, {bar, baz}};
 
     ASSERT_OK(db_->PutEntity(WriteOptions(), db_->DefaultColumnFamily(), key,
diff --git a/db/db_table_properties_test.cc b/db/db_table_properties_test.cc
index d83a5f5aec9b..ddebfccbec83 100644
--- a/db/db_table_properties_test.cc
+++ b/db/db_table_properties_test.cc
@@ -342,7 +342,7 @@ class DBTablePropertiesInRangeTest : public DBTestBase,
     keys.reserve(range_size * 2);
     for (auto& r : ranges) {
       auto [start, limit] = MaybeAddTimestampsToRange(
-          &r.start, &r.limit, ts_sz, &keys.emplace_back(), &keys.emplace_back(),
+          r.start, r.limit, ts_sz, &keys.emplace_back(), &keys.emplace_back(),
           /*exclusive_end=*/false);
       EXPECT_TRUE(start.has_value());
       EXPECT_TRUE(limit.has_value());
diff --git a/db/db_test.cc b/db/db_test.cc
index 4ce772a48516..763cdfc22d66 100644
--- a/db/db_test.cc
+++ b/db/db_test.cc
@@ -5411,8 +5411,7 @@ TEST_F(DBTest, DynamicLevelCompressionPerLevel) {
 
   for (const auto& file : cf_meta.levels[4].files) {
     listener->SetExpectedFileName(dbname_ + file.name);
-    Slice start(file.smallestkey), limit(file.largestkey);
-    const RangePtr ranges(&start, &limit);
+    const RangeOpt ranges(file.smallestkey, file.largestkey);
     // Given verification from above, we're guaranteed that by deleting all the
     // files in [<smallestkey>, <largestkey>] range, we're effectively deleting
     // that very single file and nothing more.
diff --git a/db/dbformat.h b/db/dbformat.h
index 6ceda7fa54de..0ee6e9272b5f 100644
--- a/db/dbformat.h
+++ b/db/dbformat.h
@@ -95,18 +95,17 @@ struct UserKeyRange {
   UserKeyRange(const Slice& s, const Slice& l) : start(s), limit(l) {}
 };
 
-// A range of user keys used internally by RocksDB. Also see `RangePtr` used by
+// A range of user keys used internally by RocksDB. Also see `RangeOpt` used by
 // public APIs.
-struct UserKeyRangePtr {
+struct UserKeyRangeOpt {
   // In case of user_defined timestamp, if enabled, `start` and `limit` should
   // point to key with timestamp part.
   // An optional range start, if missing, indicating a start before all keys.
-  std::optional<Slice> start;
+  OptSlice start;
   // An optional range end, if missing, indicating an end after all keys.
-  std::optional<Slice> limit;
+  OptSlice limit;
 
-  UserKeyRangePtr(const std::optional<Slice>& s, const std::optional<Slice>& l)
-      : start(s), limit(l) {}
+  UserKeyRangeOpt(const OptSlice& s, const OptSlice& l) : start(s), limit(l) {}
 };
 
 // Checks whether a type is an inline value type
diff --git a/db/external_sst_file_basic_test.cc b/db/external_sst_file_basic_test.cc
index 95228a6fa14c..fe6d9282fe30 100644
--- a/db/external_sst_file_basic_test.cc
+++ b/db/external_sst_file_basic_test.cc
@@ -2814,8 +2814,7 @@ TEST_F(ExternalSSTFileBasicTest, FailIfNotBottommostLevelAndDisallowMemtable) {
       IngestExternalFileArg arg;
       arg.column_family = handles_[1];
       arg.external_files = {file_path};
-      Range replace_rng{"a", "zzz"};
-      arg.atomic_replace_range = {{&replace_rng.start, &replace_rng.limit}};
+      arg.atomic_replace_range = {{"a", "zzz"}};
 
       // start with some failure cases
       // TODO: support snapshot consistency with tombstone file
@@ -2834,23 +2833,21 @@ TEST_F(ExternalSSTFileBasicTest, FailIfNotBottommostLevelAndDisallowMemtable) {
       arg.options.fail_if_not_bottommost_level = true;
 
       // one-sided ranges not yet supported
-      arg.atomic_replace_range = {{nullptr, &replace_rng.limit}};
+      arg.atomic_replace_range = {{{}, "zzz"}};
       s = db_->IngestExternalFiles({arg});
       ASSERT_EQ(s.code(), Status::Code::kNotSupported);
 
-      arg.atomic_replace_range = {{&replace_rng.start, nullptr}};
+      arg.atomic_replace_range = {{"a", {}}};
       s = db_->IngestExternalFiles({arg});
       ASSERT_EQ(s.code(), Status::Code::kNotSupported);
 
-      arg.atomic_replace_range = {{&replace_rng.start, &replace_rng.limit}};
-
       // rejected because doesn't cover ingested file
-      replace_rng = {"x", "z"};
+      arg.atomic_replace_range = {{"x", "z"}};
       s = db_->IngestExternalFiles({arg});
       ASSERT_EQ(s.code(), Status::Code::kInvalidArgument);
 
       // rejected because of partial file overlap
-      replace_rng = {"a", "c"};
+      arg.atomic_replace_range = {{"a", "c"}};
       s = db_->IngestExternalFiles({arg});
       ASSERT_EQ(s.code(), Status::Code::kInvalidArgument);
 
@@ -2860,7 +2857,7 @@ TEST_F(ExternalSSTFileBasicTest, FailIfNotBottommostLevelAndDisallowMemtable) {
         arg.options.allow_blocking_flush = false;
 
         // rejected because of memtable overlap
-        replace_rng = {"a", "z"};
+        arg.atomic_replace_range = {{"a", "z"}};
         s = db_->IngestExternalFiles({arg});
         ASSERT_EQ(s.code(), Status::Code::kInvalidArgument);
 
@@ -2869,27 +2866,25 @@ TEST_F(ExternalSSTFileBasicTest, FailIfNotBottommostLevelAndDisallowMemtable) {
         s = db_->IngestExternalFiles({arg});
         ASSERT_EQ(s.code(), Status::Code::kInvalidArgument);
 
-        arg.atomic_replace_range = {{&replace_rng.start, &replace_rng.limit}};
-
         // FIXME: upper bound should be exclusive (DeleteRange semantics).
         // currently rejected because of documented bug
-        replace_rng = {"a", "e"};
+        arg.atomic_replace_range = {{"a", "e"}};
         s = db_->IngestExternalFiles({arg});
         ASSERT_EQ(s.code(), Status::Code::kInvalidArgument);
 
         // work-around ensuring no memtable overlap
-        replace_rng = {"a", "d2"};
+        arg.atomic_replace_range = {{"a", "d2"}};
         ASSERT_OK(db_->IngestExternalFiles({arg}));
 
         ASSERT_EQ(Get(1, "e"), "5");
       } else {
         // rejected because of partial file overlap
-        replace_rng = {"b", "z"};
+        arg.atomic_replace_range = {{"b", "z"}};
         s = db_->IngestExternalFiles({arg});
         ASSERT_EQ(s.code(), Status::Code::kInvalidArgument);
 
         // no memtable complications
-        replace_rng = {"a", "z"};
+        arg.atomic_replace_range = {{"a", "z"}};
         ASSERT_OK(db_->IngestExternalFiles({arg}));
 
         ASSERT_EQ(Get(1, "e"), "NOT_FOUND");
@@ -2917,7 +2912,7 @@ TEST_F(ExternalSSTFileBasicTest, FailIfNotBottommostLevelAndDisallowMemtable) {
 
       if (!disallow_memtable) {
         // rejected because of memtable overlap with range
-        replace_rng = {"e", "z"};
+        arg.atomic_replace_range = {{"e", "z"}};
         s = db_->IngestExternalFiles({arg});
         ASSERT_EQ(s.code(), Status::Code::kInvalidArgument);
 
@@ -2953,7 +2948,7 @@ TEST_F(ExternalSSTFileBasicTest, FailIfNotBottommostLevelAndDisallowMemtable) {
         arg.external_files = {file_path2, file_path};
 
         // rejected because of overlap in files to ingest with fail_if_ = true
-        replace_rng = {"e", "z"};
+        arg.atomic_replace_range = {{"e", "z"}};
         s = db_->IngestExternalFiles({arg});
         ASSERT_EQ(s.code(), Status::Code::kTryAgain);
 
@@ -2961,13 +2956,13 @@ TEST_F(ExternalSSTFileBasicTest, FailIfNotBottommostLevelAndDisallowMemtable) {
 
         // rejected because range doesn't cover ingested files
         // FIXME: upper bound should be exclusive "g" instead
-        replace_rng = {"e", "f2"};
+        arg.atomic_replace_range = {{"e", "f2"}};
         s = db_->IngestExternalFiles({arg});
         ASSERT_EQ(s.code(), Status::Code::kInvalidArgument);
 
         // Loaded into different levels, and the file with just "b" is not
         // replaced
-        replace_rng = {"e", "z"};
+        arg.atomic_replace_range = {{"e", "z"}};
         ASSERT_OK(db_->IngestExternalFiles({arg}));
 
         ASSERT_EQ(Get(1, "b"), "0");
diff --git a/db/external_sst_file_ingestion_job.cc b/db/external_sst_file_ingestion_job.cc
index f1d891628af6..a65dacb7a114 100644
--- a/db/external_sst_file_ingestion_job.cc
+++ b/db/external_sst_file_ingestion_job.cc
@@ -29,7 +29,7 @@ Status ExternalSstFileIngestionJob::Prepare(
     const std::vector<std::string>& external_files_paths,
     const std::vector<std::string>& files_checksums,
     const std::vector<std::string>& files_checksum_func_names,
-    const std::optional<RangePtr>& atomic_replace_range,
+    const std::optional<RangeOpt>& atomic_replace_range,
     const Temperature& file_temperature, uint64_t next_file_number,
     SuperVersion* sv) {
   Status status;
@@ -114,9 +114,9 @@ Status ExternalSstFileIngestionJob::Prepare(
         }
       }
     } else {
-      // Currently if either bound is nullptr, both must be
-      assert(atomic_replace_range->start == nullptr);
-      assert(atomic_replace_range->limit == nullptr);
+      // Currently if either bound is not present, both must be
+      assert(atomic_replace_range->start.has_value() == false);
+      assert(atomic_replace_range->limit.has_value() == false);
       assert(atomic_replace_range_->smallest_internal_key.unset());
       assert(atomic_replace_range_->largest_internal_key.unset());
     }
diff --git a/db/external_sst_file_ingestion_job.h b/db/external_sst_file_ingestion_job.h
index 3ca95a2b543a..628eb36848b8 100644
--- a/db/external_sst_file_ingestion_job.h
+++ b/db/external_sst_file_ingestion_job.h
@@ -245,7 +245,7 @@ class ExternalSstFileIngestionJob {
   Status Prepare(const std::vector<std::string>& external_files_paths,
                  const std::vector<std::string>& files_checksums,
                  const std::vector<std::string>& files_checksum_func_names,
-                 const std::optional<RangePtr>& atomic_replace_range,
+                 const std::optional<RangeOpt>& atomic_replace_range,
                  const Temperature& file_temperature, uint64_t next_file_number,
                  SuperVersion* sv);
 
diff --git a/db/seqno_time_test.cc b/db/seqno_time_test.cc
index 98fae6d6c531..271a53fa9ae0 100644
--- a/db/seqno_time_test.cc
+++ b/db/seqno_time_test.cc
@@ -792,8 +792,7 @@ TEST_P(SeqnoTimeTablePropTest, SeqnoToTimeMappingUniversal) {
   }
   ASSERT_GT(num_seqno_zeroing, 0);
   std::vector<KeyVersion> key_versions;
-  ASSERT_OK(GetAllKeyVersions(db_, Slice(), Slice(),
-                              std::numeric_limits<size_t>::max(),
+  ASSERT_OK(GetAllKeyVersions(db_, {}, {}, std::numeric_limits<size_t>::max(),
                               &key_versions));
   // make sure there're more than 300 keys and first 100 keys are having seqno
   // zeroed out, the last 100 key seqno not zeroed out
diff --git a/include/rocksdb/convenience.h b/include/rocksdb/convenience.h
index 27127fbebfbf..95bfe2c692b6 100644
--- a/include/rocksdb/convenience.h
+++ b/include/rocksdb/convenience.h
@@ -450,6 +450,22 @@ Status DeleteFilesInRange(DB* db, ColumnFamilyHandle* column_family,
 // Delete files in multiple ranges at once
 // Delete files in a lot of ranges one at a time can be slow, use this API for
 // better performance in that case.
+Status DeleteFilesInRanges(DB* db, ColumnFamilyHandle* column_family,
+                           const RangeOpt* ranges, size_t n,
+                           bool include_end = true);
+
+// DEPRECATED
+struct RangePtr {
+  // In case of user_defined timestamp, if enabled, `start` and `limit` should
+  // point to key without timestamp part.
+  const Slice* start;
+  const Slice* limit;
+
+  RangePtr() : start(nullptr), limit(nullptr) {}
+  RangePtr(const Slice* s, const Slice* l) : start(s), limit(l) {}
+};
+
+// DEPRECATED
 Status DeleteFilesInRanges(DB* db, ColumnFamilyHandle* column_family,
                            const RangePtr* ranges, size_t n,
                            bool include_end = true);
diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h
index 3ac1ea02761d..df951b2810a8 100644
--- a/include/rocksdb/db.h
+++ b/include/rocksdb/db.h
@@ -96,65 +96,6 @@ class ColumnFamilyHandle {
 static const int kMajorVersion = __ROCKSDB_MAJOR__;
 static const int kMinorVersion = __ROCKSDB_MINOR__;
 
-// A range of keys
-struct Range {
-  // In case of user_defined timestamp, if enabled, `start` and `limit` should
-  // point to key without timestamp part.
-  Slice start;
-  Slice limit;
-
-  Range() {}
-  Range(const Slice& s, const Slice& l) : start(s), limit(l) {}
-};
-
-struct RangePtr {
-  // In case of user_defined timestamp, if enabled, `start` and `limit` should
-  // point to key without timestamp part.
-  const Slice* start;
-  const Slice* limit;
-
-  RangePtr() : start(nullptr), limit(nullptr) {}
-  RangePtr(const Slice* s, const Slice* l) : start(s), limit(l) {}
-};
-
-// It is valid that files_checksums and files_checksum_func_names are both
-// empty (no checksum information is provided for ingestion). Otherwise,
-// their sizes should be the same as external_files. The file order should
-// be the same in three vectors and guaranteed by the caller.
-// Note that, we assume the temperatures of this batch of files to be
-// ingested are the same.
-struct IngestExternalFileArg {
-  ColumnFamilyHandle* column_family = nullptr;
-  std::vector<std::string> external_files;
-  IngestExternalFileOptions options;
-  std::vector<std::string> files_checksums;
-  std::vector<std::string> files_checksum_func_names;
-  // A hint as to the temperature for *reading* the files to be ingested.
-  Temperature file_temperature = Temperature::kUnknown;
-  // EXPERIMENTAL: When specified, existing keys in the given range will be
-  // cleared atomically as part of the ingestion, where the ingested files are
-  // logically applied on top of the cleared key range.
-  // * If both `start` and `limit` are nullptr, the entire column family is
-  // cleared; however, setting just one bound to nullptr is not yet supported.
-  // * When a range is specified, all the external files in this batch must
-  //   be contained in that key range.
-  // * Checks for memtable overlap and possible blocking flush will apply
-  //   to this range (not just the file ranges).
-  // * Not compatible with ingest_behind=true.
-  // * When options.snapshot_consistency = false, the range is cleared
-  // similarly to DeleteFilesInRange, but fails if any files overlap the range
-  // only partially.
-  //   * It is recommended to use fail_if_not_bottommost_level=true to ensure
-  //     data in the key range is ingested to a single compacted level (the
-  //     last level). (fail_if_not_bottommost_level=false allows overlap between
-  //     the ingested files.)
-  // * options.snapshot_consistency = true is not yet supported.
-  // BUG: the upper bound of the range may be interpreted as inclusive or
-  // exclusive, so it is best not to depend on one or the other until it is
-  // sorted out.
-  std::optional<RangePtr> atomic_replace_range;
-};
-
 struct GetMergeOperandsOptions {
   using ContinueCallback = std::function<bool(Slice)>;
 
diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index a89bc467c30e..6b66976aa371 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -2245,6 +2245,29 @@ struct CompactRangeOptions {
   double blob_garbage_collection_age_cutoff = -1;
 };
 
+// A range of keys. In case of user_defined timestamp, if enabled, `start` and
+// `limit` should point to key without timestamp part.
+struct Range {
+  Slice start;
+  Slice limit;
+
+  Range() {}
+  Range(const Slice& s, const Slice& l) : start(s), limit(l) {}
+};
+
+// A key range with optional endpoints. In case of user_defined timestamp, if
+// enabled, `start` and `limit` should point to key without timestamp part.
+struct RangeOpt {
+  // When start.has_value() == false, refers to starting before every key
+  OptSlice start;
+  // When limit.has_value() == false, refers to ending after every key
+  OptSlice limit;
+
+  RangeOpt() {}
+  RangeOpt(const OptSlice& s, const OptSlice& l) : start(s), limit(l) {}
+  // RangeOpt(const Slice& s, const Slice& l) : start(s), limit(l) {}
+};
+
 // IngestExternalFileOptions is used by IngestExternalFile()
 struct IngestExternalFileOptions {
   // Can be set to true to move the files instead of copying them.
@@ -2354,6 +2377,44 @@ struct IngestExternalFileOptions {
   bool fill_cache = true;
 };
 
+// It is valid that files_checksums and files_checksum_func_names are both
+// empty (no checksum information is provided for ingestion). Otherwise,
+// their sizes should be the same as external_files. The file order should
+// be the same in three vectors and guaranteed by the caller.
+// Note that, we assume the temperatures of this batch of files to be
+// ingested are the same.
+struct IngestExternalFileArg {
+  ColumnFamilyHandle* column_family = nullptr;
+  std::vector<std::string> external_files;
+  IngestExternalFileOptions options;
+  std::vector<std::string> files_checksums;
+  std::vector<std::string> files_checksum_func_names;
+  // A hint as to the temperature for *reading* the files to be ingested.
+  Temperature file_temperature = Temperature::kUnknown;
+  // EXPERIMENTAL: When specified, existing keys in the given range will be
+  // cleared atomically as part of the ingestion, where the ingested files are
+  // logically applied on top of the cleared key range.
+  // * If both `start` and `limit` are nullptr, the entire column family is
+  // cleared; however, setting just one bound to nullptr is not yet supported.
+  // * When a range is specified, all the external files in this batch must
+  //   be contained in that key range.
+  // * Checks for memtable overlap and possible blocking flush will apply
+  //   to this range (not just the file ranges).
+  // * Not compatible with ingest_behind=true.
+  // * When options.snapshot_consistency = false, the range is cleared
+  // similarly to DeleteFilesInRange, but fails if any files overlap the range
+  // only partially.
+  //   * It is recommended to use fail_if_not_bottommost_level=true to ensure
+  //     data in the key range is ingested to a single compacted level (the
+  //     last level). (fail_if_not_bottommost_level=false allows overlap between
+  //     the ingested files.)
+  // * options.snapshot_consistency = true is not yet supported.
+  // BUG: the upper bound of the range may be interpreted as inclusive or
+  // exclusive, so it is best not to depend on one or the other until it is
+  // sorted out.
+  std::optional<RangeOpt> atomic_replace_range;
+};
+
 enum TraceFilterType : uint64_t {
   // Trace all the operations
   kTraceFilterNone = 0x0,
diff --git a/include/rocksdb/slice.h b/include/rocksdb/slice.h
index 0d7eb59499eb..c914b1637b50 100644
--- a/include/rocksdb/slice.h
+++ b/include/rocksdb/slice.h
@@ -20,6 +20,7 @@
 
 #include <cassert>
 #include <cstddef>
+#include <cstdint>
 #include <cstdio>
 #include <cstring>
 #include <string>
@@ -129,6 +130,46 @@ class Slice {
   // Intentionally copyable
 };
 
+// A likely more efficient alternative to std::optional<Slice>. For example,
+// an empty key might be distinct from "not specified" (and Slice* as an
+// optional is more troublesome to deal with).
+class OptSlice {
+ public:
+  OptSlice() : slice_(nullptr, SIZE_MAX) {}
+  /*implicit*/ OptSlice(const Slice& s) : slice_(s) {}
+  /*implicit*/ OptSlice(const std::string& s) : slice_(s) {}
+  /*implicit*/ OptSlice(const std::string_view& sv) : slice_(sv) {}
+  /*implicit*/ OptSlice(const char* c_str) : slice_(c_str) {}
+  // For easier migrating from APIs uing Slice* as an optional type.
+  // CAUTION: OptSlice{nullptr} is "no value" while Slice{nullptr} is "empty"
+  /*implicit*/ OptSlice(std::nullptr_t) : OptSlice() {}
+
+  bool has_value() const noexcept { return slice_.size() != SIZE_MAX; }
+  explicit operator bool() const noexcept { return has_value(); }
+
+  const Slice& value() const noexcept {
+    assert(has_value());
+    return slice_;
+  }
+  const Slice& operator*() const noexcept { return value(); }
+  const Slice* operator->() const noexcept { return &value(); }
+
+  const Slice* AsPtr() const noexcept {
+    return has_value() ? &slice_ : nullptr;
+  }
+  // Populate from an optional pointer. This is a very explicit conversion
+  // to minimize risk of bugs as in
+  //   Slice start, limit;
+  //   RangeOpt rng = {&start, &limit};
+  //   start = ...;  // BUG: would not affect rng
+  static OptSlice CopyFromPtr(const Slice* ptr) {
+    return ptr ? OptSlice{*ptr} : OptSlice{};
+  }
+
+ protected:
+  Slice slice_;
+};
+
 /**
  * A Slice that can be pinned with some cleanup tasks, which will be run upon
  * ::Reset() or object destruction, whichever is invoked first. This can be used
diff --git a/include/rocksdb/utilities/debug.h b/include/rocksdb/utilities/debug.h
index 1cbc7daf84cc..57968ad15e10 100644
--- a/include/rocksdb/utilities/debug.h
+++ b/include/rocksdb/utilities/debug.h
@@ -33,12 +33,12 @@ struct KeyVersion {
 // copied to memory, if the range covers too many keys, the memory usage
 // may be huge. `max_num_ikeys` can be used to cap the memory usage.
 // The result is inserted into the provided vector, `key_versions`.
-Status GetAllKeyVersions(DB* db, Slice begin_key, Slice end_key,
+Status GetAllKeyVersions(DB* db, OptSlice begin_key, OptSlice end_key,
                          size_t max_num_ikeys,
                          std::vector<KeyVersion>* key_versions);
 
-Status GetAllKeyVersions(DB* db, ColumnFamilyHandle* cfh, Slice begin_key,
-                         Slice end_key, size_t max_num_ikeys,
+Status GetAllKeyVersions(DB* db, ColumnFamilyHandle* cfh, OptSlice begin_key,
+                         OptSlice end_key, size_t max_num_ikeys,
                          std::vector<KeyVersion>* key_versions);
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/java/rocksjni/rocksjni.cc b/java/rocksjni/rocksjni.cc
index 9561b3893661..5aad46fa4926 100644
--- a/java/rocksjni/rocksjni.cc
+++ b/java/rocksjni/rocksjni.cc
@@ -3637,7 +3637,7 @@ void Java_org_rocksdb_RocksDB_destroyDB(JNIEnv* env, jclass, jstring jdb_path,
 }
 
 bool get_slice_helper(JNIEnv* env, jobjectArray ranges, jsize index,
-                      std::unique_ptr<ROCKSDB_NAMESPACE::Slice>& slice,
+                      ROCKSDB_NAMESPACE::OptSlice& opt_slice,
                       std::vector<std::unique_ptr<jbyte[]>>& ranges_to_free) {
   jobject jArray = env->GetObjectArrayElement(ranges, index);
   if (env->ExceptionCheck()) {
@@ -3659,8 +3659,8 @@ bool get_slice_helper(JNIEnv* env, jobjectArray ranges, jsize index,
     return false;
   }
   env->DeleteLocalRef(jArray);
-  slice.reset(new ROCKSDB_NAMESPACE::Slice(
-      reinterpret_cast<char*>(ranges_to_free.back().get()), len_ba));
+  opt_slice = ROCKSDB_NAMESPACE::Slice(
+      reinterpret_cast<char*>(ranges_to_free.back().get()), len_ba);
   return true;
 }
 /*
@@ -3675,24 +3675,24 @@ void Java_org_rocksdb_RocksDB_deleteFilesInRanges(JNIEnv* env, jclass /*jdb*/,
                                                   jboolean include_end) {
   jsize length = env->GetArrayLength(ranges);
 
-  std::vector<ROCKSDB_NAMESPACE::RangePtr> rangesVector;
-  std::vector<std::unique_ptr<ROCKSDB_NAMESPACE::Slice>> slices;
+  std::vector<ROCKSDB_NAMESPACE::RangeOpt> rangesVector;
+  std::vector<ROCKSDB_NAMESPACE::OptSlice> slices;
   std::vector<std::unique_ptr<jbyte[]>> ranges_to_free;
   for (jsize i = 0; (i + 1) < length; i += 2) {
-    slices.push_back(std::unique_ptr<ROCKSDB_NAMESPACE::Slice>());
+    slices.emplace_back();
     if (!get_slice_helper(env, ranges, i, slices.back(), ranges_to_free)) {
       // exception thrown
       return;
     }
 
-    slices.push_back(std::unique_ptr<ROCKSDB_NAMESPACE::Slice>());
+    slices.emplace_back();
     if (!get_slice_helper(env, ranges, i + 1, slices.back(), ranges_to_free)) {
       // exception thrown
       return;
     }
 
-    rangesVector.push_back(ROCKSDB_NAMESPACE::RangePtr(
-        slices[slices.size() - 2].get(), slices[slices.size() - 1].get()));
+    rangesVector.push_back(ROCKSDB_NAMESPACE::RangeOpt(
+        slices[slices.size() - 2], slices[slices.size() - 1]));
   }
 
   auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
diff --git a/table/sst_file_dumper.cc b/table/sst_file_dumper.cc
index 905eef7004a7..cbad9aa120d8 100644
--- a/table/sst_file_dumper.cc
+++ b/table/sst_file_dumper.cc
@@ -474,12 +474,11 @@ Status SstFileDumper::ReadSequential(bool print_kv, uint64_t read_num_limit,
   const Comparator* ucmp = internal_comparator_.user_comparator();
   size_t ts_sz = ucmp->timestamp_size();
 
-  Slice from_slice = from_key;
-  Slice to_slice = to_key;
+  OptSlice from_opt = has_from ? from_key : OptSlice{};
+  OptSlice to_opt = has_to ? to_key : OptSlice{};
   std::string from_key_buf, to_key_buf;
-  auto [from, to] = MaybeAddTimestampsToRange(
-      has_from ? &from_slice : nullptr, has_to ? &to_slice : nullptr, ts_sz,
-      &from_key_buf, &to_key_buf);
+  auto [from, to] = MaybeAddTimestampsToRange(from_opt, to_opt, ts_sz,
+                                              &from_key_buf, &to_key_buf);
   uint64_t i = 0;
   if (from.has_value()) {
     InternalKey ikey;
diff --git a/tools/ldb_cmd.cc b/tools/ldb_cmd.cc
index 3b8a29337382..a9dc34e5d01c 100644
--- a/tools/ldb_cmd.cc
+++ b/tools/ldb_cmd.cc
@@ -2113,8 +2113,9 @@ void InternalDumpCommand::DoCommand() {
 
   // Cast as DBImpl to get internal iterator
   std::vector<KeyVersion> key_versions;
-  Status st = GetAllKeyVersions(db_, GetCfHandle(), from_, to_, max_keys_,
-                                &key_versions);
+  Status st =
+      GetAllKeyVersions(db_, GetCfHandle(), has_from_ ? from_ : OptSlice{},
+                        has_to_ ? to_ : OptSlice{}, max_keys_, &key_versions);
   if (!st.ok()) {
     exec_state_ = LDBCommandExecuteResult::Failed(st.ToString());
     return;
diff --git a/unreleased_history/public_api_changes/optslice.md b/unreleased_history/public_api_changes/optslice.md
new file mode 100644
index 000000000000..f7047db1d4d2
--- /dev/null
+++ b/unreleased_history/public_api_changes/optslice.md
@@ -0,0 +1,2 @@
+* `GetAllKeyVersions()` now interprets empty slices literally, as valid keys, and uses new `OptSlice` type default value for extreme upper and lower range limits.
+* `DeleteFilesInRanges()` now takes `RangeOpt` which is based on `OptSlice`. The overload taking `RangePtr` is deprecated.
diff --git a/util/udt_util.cc b/util/udt_util.cc
index 7a0eeb2e3d38..3246574d61bb 100644
--- a/util/udt_util.cc
+++ b/util/udt_util.cc
@@ -429,11 +429,10 @@ void GetFullHistoryTsLowFromU64CutoffTs(Slice* cutoff_ts,
   PutFixed64(full_history_ts_low, cutoff_udt_ts + 1);
 }
 
-std::tuple<std::optional<Slice>, std::optional<Slice>>
-MaybeAddTimestampsToRange(const Slice* start, const Slice* end, size_t ts_sz,
-                          std::string* start_with_ts, std::string* end_with_ts,
-                          bool exclusive_end) {
-  std::optional<Slice> ret_start, ret_end;
+std::tuple<OptSlice, OptSlice> MaybeAddTimestampsToRange(
+    const OptSlice& start, const OptSlice& end, size_t ts_sz,
+    std::string* start_with_ts, std::string* end_with_ts, bool exclusive_end) {
+  OptSlice ret_start, ret_end;
   if (start) {
     if (ts_sz == 0) {
       ret_start = *start;
diff --git a/util/udt_util.h b/util/udt_util.h
index 51ea76e8544e..8252bab64fca 100644
--- a/util/udt_util.h
+++ b/util/udt_util.h
@@ -278,11 +278,10 @@ void GetFullHistoryTsLowFromU64CutoffTs(Slice* cutoff_ts,
 // `start` is the inclusive lower user key bound without user-defined timestamp.
 // `end` is the upper user key bound without user-defined timestamp.
 // By default, `end` is treated as being exclusive. If `exclusive_end` is set to
-// false, it's treated as an inclusive upper bound.
-// If any of these two bounds is nullptr, an empty std::optional<Slice> is
-// returned for that bound.
-std::tuple<std::optional<Slice>, std::optional<Slice>>
-MaybeAddTimestampsToRange(const Slice* start, const Slice* end, size_t ts_sz,
-                          std::string* start_with_ts, std::string* end_with_ts,
-                          bool exclusive_end = true);
+// false, it's treated as an inclusive upper bound. For either bound that has no
+// value, a "no value" OptSlice is returned for that bound.
+std::tuple<OptSlice, OptSlice> MaybeAddTimestampsToRange(
+    const OptSlice& start, const OptSlice& end, size_t ts_sz,
+    std::string* start_with_ts, std::string* end_with_ts,
+    bool exclusive_end = true);
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/utilities/blob_db/blob_db_test.cc b/utilities/blob_db/blob_db_test.cc
index a0e5b9da0dec..d686c7bac264 100644
--- a/utilities/blob_db/blob_db_test.cc
+++ b/utilities/blob_db/blob_db_test.cc
@@ -233,7 +233,7 @@ class BlobDBTest : public testing::Test {
     DB *db = blob_db_->GetRootDB();
     const size_t kMaxKeys = 10000;
     std::vector<KeyVersion> versions;
-    ASSERT_OK(GetAllKeyVersions(db, "", "", kMaxKeys, &versions));
+    ASSERT_OK(GetAllKeyVersions(db, {}, {}, kMaxKeys, &versions));
     ASSERT_EQ(expected_versions.size(), versions.size());
     size_t i = 0;
     for (auto &key_version : expected_versions) {
@@ -259,7 +259,7 @@ class BlobDBTest : public testing::Test {
     const size_t kMaxKeys = 10000;
     std::vector<KeyVersion> versions;
     ASSERT_OK(
-        GetAllKeyVersions(blob_db_->GetRootDB(), "", "", kMaxKeys, &versions));
+        GetAllKeyVersions(blob_db_->GetRootDB(), {}, {}, kMaxKeys, &versions));
     ASSERT_EQ(versions.size(), expected_versions.size());
 
     size_t i = 0;
@@ -1595,7 +1595,7 @@ TEST_F(BlobDBTest, FilterExpiredBlobIndex) {
   // Verify expired blob index are filtered.
   std::vector<KeyVersion> versions;
   const size_t kMaxKeys = 10000;
-  ASSERT_OK(GetAllKeyVersions(blob_db_, "", "", kMaxKeys, &versions));
+  ASSERT_OK(GetAllKeyVersions(blob_db_, {}, {}, kMaxKeys, &versions));
   ASSERT_EQ(data_after_compact.size(), versions.size());
   for (auto &version : versions) {
     ASSERT_TRUE(data_after_compact.count(version.user_key) > 0);
@@ -1629,14 +1629,14 @@ TEST_F(BlobDBTest, FilterFileNotAvailable) {
 
   DB *base_db = blob_db_->GetRootDB();
   std::vector<KeyVersion> versions;
-  ASSERT_OK(GetAllKeyVersions(base_db, "", "", kMaxKeys, &versions));
+  ASSERT_OK(GetAllKeyVersions(base_db, {}, {}, kMaxKeys, &versions));
   ASSERT_EQ(2, versions.size());
   ASSERT_EQ("bar", versions[0].user_key);
   ASSERT_EQ("foo", versions[1].user_key);
   VerifyDB({{"bar", "v2"}, {"foo", "v1"}});
 
   ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
-  ASSERT_OK(GetAllKeyVersions(base_db, "", "", kMaxKeys, &versions));
+  ASSERT_OK(GetAllKeyVersions(base_db, {}, {}, kMaxKeys, &versions));
   ASSERT_EQ(2, versions.size());
   ASSERT_EQ("bar", versions[0].user_key);
   ASSERT_EQ("foo", versions[1].user_key);
@@ -1646,7 +1646,7 @@ TEST_F(BlobDBTest, FilterFileNotAvailable) {
   blob_db_impl()->TEST_ObsoleteBlobFile(blob_files[0]);
   blob_db_impl()->TEST_DeleteObsoleteFiles();
   ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
-  ASSERT_OK(GetAllKeyVersions(base_db, "", "", kMaxKeys, &versions));
+  ASSERT_OK(GetAllKeyVersions(base_db, {}, {}, kMaxKeys, &versions));
   ASSERT_EQ(1, versions.size());
   ASSERT_EQ("bar", versions[0].user_key);
   VerifyDB({{"bar", "v2"}});
@@ -1655,7 +1655,7 @@ TEST_F(BlobDBTest, FilterFileNotAvailable) {
   blob_db_impl()->TEST_ObsoleteBlobFile(blob_files[1]);
   blob_db_impl()->TEST_DeleteObsoleteFiles();
   ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
-  ASSERT_OK(GetAllKeyVersions(base_db, "", "", kMaxKeys, &versions));
+  ASSERT_OK(GetAllKeyVersions(base_db, {}, {}, kMaxKeys, &versions));
   ASSERT_EQ(0, versions.size());
   VerifyDB({});
 }
diff --git a/utilities/debug.cc b/utilities/debug.cc
index 89e1487faad4..59e6d46880f5 100644
--- a/utilities/debug.cc
+++ b/utilities/debug.cc
@@ -53,7 +53,7 @@ std::string KeyVersion::GetTypeName() const {
   }
 }
 
-Status GetAllKeyVersions(DB* db, Slice begin_key, Slice end_key,
+Status GetAllKeyVersions(DB* db, OptSlice begin_key, OptSlice end_key,
                          size_t max_num_ikeys,
                          std::vector<KeyVersion>* key_versions) {
   if (nullptr == db) {
@@ -63,8 +63,8 @@ Status GetAllKeyVersions(DB* db, Slice begin_key, Slice end_key,
                            max_num_ikeys, key_versions);
 }
 
-Status GetAllKeyVersions(DB* db, ColumnFamilyHandle* cfh, Slice begin_key,
-                         Slice end_key, size_t max_num_ikeys,
+Status GetAllKeyVersions(DB* db, ColumnFamilyHandle* cfh, OptSlice begin_key,
+                         OptSlice end_key, size_t max_num_ikeys,
                          std::vector<KeyVersion>* key_versions) {
   if (nullptr == db) {
     return Status::InvalidArgument("db cannot be null.");
@@ -87,15 +87,10 @@ Status GetAllKeyVersions(DB* db, ColumnFamilyHandle* cfh, Slice begin_key,
   const Comparator* ucmp = icmp.user_comparator();
   size_t ts_sz = ucmp->timestamp_size();
 
-  Slice from_slice = begin_key;
-  bool has_begin = !begin_key.empty();
-  Slice end_slice = end_key;
-  bool has_end = !end_key.empty();
   std::string begin_key_buf, end_key_buf;
-  auto [from, end] = MaybeAddTimestampsToRange(
-      has_begin ? &from_slice : nullptr, has_end ? &end_slice : nullptr, ts_sz,
-      &begin_key_buf, &end_key_buf);
-  if (has_begin) {
+  auto [from, end] = MaybeAddTimestampsToRange(begin_key, end_key, ts_sz,
+                                               &begin_key_buf, &end_key_buf);
+  if (begin_key.has_value()) {
     assert(from.has_value());
     InternalKey ikey;
     ikey.SetMinPossibleForUserKey(from.value());
@@ -113,7 +108,7 @@ Status GetAllKeyVersions(DB* db, ColumnFamilyHandle* cfh, Slice begin_key,
       return pik_status;
     }
 
-    if (has_end && end.has_value() &&
+    if (end_key.has_value() && end.has_value() &&
         icmp.user_comparator()->Compare(ikey.user_key, end.value()) > 0) {
       break;
     }

From 49b0cb64df082978962b1d7ae24693dcb7f9dbf2 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Tue, 25 Mar 2025 10:56:25 -0700
Subject: [PATCH 033/500] Fix uninitialized use in WBWIMemTable::Get (#13486)

Summary:
Based on passing address of uninit variable in ReadOnlyMemTable::Get() in memtable.h. The contract and other implementations suggest it is a pure out parameter that is always overwritten, so we initialize it in the function before checking its value in a loop

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13486

Test Plan: watch build-linux-valgrind in CI

Reviewed By: cbi42

Differential Revision: D71819843

Pulled By: pdillinger

fbshipit-source-id: 1e06f3ee6998099791af27de5b2872eb476ceb7c
---
 memtable/wbwi_memtable.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/memtable/wbwi_memtable.cc b/memtable/wbwi_memtable.cc
index 540253666908..9686eac50299 100644
--- a/memtable/wbwi_memtable.cc
+++ b/memtable/wbwi_memtable.cc
@@ -61,6 +61,7 @@ bool WBWIMemTable::Get(const LookupKey& key, std::string* value,
   assert(!wbwi_->GetWriteBatch()->HasDeleteRange());
   assert(merge_context);
 
+  *out_seq = kMaxSequenceNumber;
   [[maybe_unused]] SequenceNumber read_seq =
       GetInternalKeySeqno(key.internal_key());
   // This is memtable is a single write batch, no snapshot can be taken within

From 9072f5db09372f3662a86c884183625a64c4cddd Mon Sep 17 00:00:00 2001
From: Hui Xiao <huixiao@fb.com>
Date: Tue, 25 Mar 2025 14:55:07 -0700
Subject: [PATCH 034/500] Update for 10.1 release (#13485)

Summary: Pull Request resolved: https://github.com/facebook/rocksdb/pull/13485

Reviewed By: jaykorean, pdillinger

Differential Revision: D71787995

Pulled By: hx235

fbshipit-source-id: 59b6ff7c824adbdef34b6ae12d7dbcc3e0852961
---
 HISTORY.md                                    | 22 +++++++++++++++++++
 include/rocksdb/version.h                     |  2 +-
 tools/check_format_compatible.sh              |  2 +-
 .../behavior_changes/ldb_comp.md              |  1 -
 .../persisted-tier-multiget.md                |  1 -
 .../behavior_changes/read_only_create_cf.md   |  1 -
 .../bug_fixes/stats_fix_for_tiered_storage.md |  1 -
 .../calculate_sst_write_lifetime_hint_set.md  |  1 -
 unreleased_history/new_features/l0_file.md    |  1 -
 .../per_key_placement_remote_compaction.md    |  1 -
 .../table_properties_for_levels.md            |  1 -
 .../public_api_changes/optslice.md            |  2 --
 .../read_options_property_bag.md              |  1 -
 .../remote_compaction_aborted_status.md       |  1 -
 .../public_api_changes/unsupport_fv1.md       |  1 -
 15 files changed, 24 insertions(+), 15 deletions(-)
 delete mode 100644 unreleased_history/behavior_changes/ldb_comp.md
 delete mode 100644 unreleased_history/behavior_changes/persisted-tier-multiget.md
 delete mode 100644 unreleased_history/behavior_changes/read_only_create_cf.md
 delete mode 100644 unreleased_history/bug_fixes/stats_fix_for_tiered_storage.md
 delete mode 100644 unreleased_history/new_features/calculate_sst_write_lifetime_hint_set.md
 delete mode 100644 unreleased_history/new_features/l0_file.md
 delete mode 100644 unreleased_history/new_features/per_key_placement_remote_compaction.md
 delete mode 100644 unreleased_history/new_features/table_properties_for_levels.md
 delete mode 100644 unreleased_history/public_api_changes/optslice.md
 delete mode 100644 unreleased_history/public_api_changes/read_options_property_bag.md
 delete mode 100644 unreleased_history/public_api_changes/remote_compaction_aborted_status.md
 delete mode 100644 unreleased_history/public_api_changes/unsupport_fv1.md

diff --git a/HISTORY.md b/HISTORY.md
index ab8466abd1ce..9846f240916b 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -1,6 +1,28 @@
 # Rocksdb Change Log
 > NOTE: Entries for next release do not go here. Follow instructions in `unreleased_history/README.txt`
 
+## 10.1.0 (03/24/2025)
+### New Features
+* Added a new `DBOptions.calculate_sst_write_lifetime_hint_set` setting that allows to customize which compaction styles SST write lifetime hint calculation is allowed on. Today RocksDB supports only two modes `kCompactionStyleLevel` and `kCompactionStyleUniversal`.
+* Add a new field `num_l0_files` in `CompactionJobInfo` about the number of L0 files in the CF right before and after the compaction
+* Added per-key-placement feature in Remote Compaction
+* Implemented API DB::GetPropertiesOfTablesByLevel that retrieves table properties for files in each LSM tree level
+
+### Public API Changes
+* `GetAllKeyVersions()` now interprets empty slices literally, as valid keys, and uses new `OptSlice` type default value for extreme upper and lower range limits.
+* `DeleteFilesInRanges()` now takes `RangeOpt` which is based on `OptSlice`. The overload taking `RangePtr` is deprecated.
+* Add an unordered map of name/value pairs, ReadOptions::property_bag, to pass opaque options through to an external table when creating an Iterator.
+* Introduced CompactionServiceJobStatus::kAborted to allow handling aborted scenario in Schedule(), Wait() or OnInstallation() APIs in Remote Compactions.
+* format\_version < 2 in BlockBasedTableOptions is no longer supported for writing new files. Support for reading such files is deprecated and might be removed in the future. `CompressedSecondaryCacheOptions::compress_format_version == 1` is also deprecated.
+
+### Behavior Changes
+* `ldb` now returns an error if the specified `--compression_type` is not supported in the build.
+* MultiGet with snapshot and ReadOptions::read_tier = kPersistedTier will now read a consistent view across CFs (instead of potentially reading some CF before and some CF after a flush).
+* CreateColumnFamily() is no longer allowed on a read-only DB (OpenForReadOnly())
+
+### Bug Fixes
+* Fixed stats for Tiered Storage with preclude_last_level feature
+
 ## 10.0.0 (02/21/2025)
 ### New Features
 * Introduced new `auto_refresh_iterator_with_snapshot` opt-in knob that (when enabled) will periodically release obsolete memory and storage resources for as long as the iterator is making progress and its supplied `read_options.snapshot` was initialized with non-nullptr value.
diff --git a/include/rocksdb/version.h b/include/rocksdb/version.h
index 104a6483dc5c..274b4e01e5b4 100644
--- a/include/rocksdb/version.h
+++ b/include/rocksdb/version.h
@@ -12,7 +12,7 @@
 // NOTE: in 'main' development branch, this should be the *next*
 // minor or major version number planned for release.
 #define ROCKSDB_MAJOR 10
-#define ROCKSDB_MINOR 1
+#define ROCKSDB_MINOR 2
 #define ROCKSDB_PATCH 0
 
 // Do not use these. We made the mistake of declaring macros starting with
diff --git a/tools/check_format_compatible.sh b/tools/check_format_compatible.sh
index b137fcc2a922..9b228f1b18bc 100755
--- a/tools/check_format_compatible.sh
+++ b/tools/check_format_compatible.sh
@@ -135,7 +135,7 @@ EOF
 
 # To check for DB forward compatibility with loading options (old version
 # reading data from new), as well as backward compatibility
-declare -a db_forward_with_options_refs=("8.6.fb" "8.7.fb" "8.8.fb" "8.9.fb" "8.10.fb" "8.11.fb" "9.0.fb" "9.1.fb" "9.2.fb" "9.3.fb" "9.4.fb" "9.5.fb" "9.6.fb" "9.7.fb" "9.8.fb" "9.9.fb" "9.10.fb" "9.11.fb" "10.0.fb")
+declare -a db_forward_with_options_refs=("8.6.fb" "8.7.fb" "8.8.fb" "8.9.fb" "8.10.fb" "8.11.fb" "9.0.fb" "9.1.fb" "9.2.fb" "9.3.fb" "9.4.fb" "9.5.fb" "9.6.fb" "9.7.fb" "9.8.fb" "9.9.fb" "9.10.fb" "9.11.fb" "10.0.fb" "10.1.fb")
 # To check for DB forward compatibility without loading options (in addition
 # to the "with loading options" set), as well as backward compatibility
 declare -a db_forward_no_options_refs=() # N/A at the moment
diff --git a/unreleased_history/behavior_changes/ldb_comp.md b/unreleased_history/behavior_changes/ldb_comp.md
deleted file mode 100644
index 1dff841ef511..000000000000
--- a/unreleased_history/behavior_changes/ldb_comp.md
+++ /dev/null
@@ -1 +0,0 @@
-* `ldb` now returns an error if the specified `--compression_type` is not supported in the build.
diff --git a/unreleased_history/behavior_changes/persisted-tier-multiget.md b/unreleased_history/behavior_changes/persisted-tier-multiget.md
deleted file mode 100644
index 9e7ae56a98c0..000000000000
--- a/unreleased_history/behavior_changes/persisted-tier-multiget.md
+++ /dev/null
@@ -1 +0,0 @@
-* MultiGet with snapshot and ReadOptions::read_tier = kPersistedTier will now read a consistent view across CFs (instead of potentially reading some CF before and some CF after a flush).
diff --git a/unreleased_history/behavior_changes/read_only_create_cf.md b/unreleased_history/behavior_changes/read_only_create_cf.md
deleted file mode 100644
index 2ff8e658a75c..000000000000
--- a/unreleased_history/behavior_changes/read_only_create_cf.md
+++ /dev/null
@@ -1 +0,0 @@
-* CreateColumnFamily() is no longer allowed on a read-only DB (OpenForReadOnly())
diff --git a/unreleased_history/bug_fixes/stats_fix_for_tiered_storage.md b/unreleased_history/bug_fixes/stats_fix_for_tiered_storage.md
deleted file mode 100644
index 3da1236c899d..000000000000
--- a/unreleased_history/bug_fixes/stats_fix_for_tiered_storage.md
+++ /dev/null
@@ -1 +0,0 @@
-Fixed stats for Tiered Storage with preclude_last_level feature
diff --git a/unreleased_history/new_features/calculate_sst_write_lifetime_hint_set.md b/unreleased_history/new_features/calculate_sst_write_lifetime_hint_set.md
deleted file mode 100644
index 5780d831a38a..000000000000
--- a/unreleased_history/new_features/calculate_sst_write_lifetime_hint_set.md
+++ /dev/null
@@ -1 +0,0 @@
-Added a new `DBOptions.calculate_sst_write_lifetime_hint_set` setting that allows to customize which compaction styles SST write lifetime hint calculation is allowed on. Today RocksDB supports only two modes `kCompactionStyleLevel` and `kCompactionStyleUniversal`.
diff --git a/unreleased_history/new_features/l0_file.md b/unreleased_history/new_features/l0_file.md
deleted file mode 100644
index f31178217b31..000000000000
--- a/unreleased_history/new_features/l0_file.md
+++ /dev/null
@@ -1 +0,0 @@
-Add a new field `num_l0_files` in `CompactionJobInfo` about the number of L0 files in the CF right before and after the compaction
diff --git a/unreleased_history/new_features/per_key_placement_remote_compaction.md b/unreleased_history/new_features/per_key_placement_remote_compaction.md
deleted file mode 100644
index e89d3e155e76..000000000000
--- a/unreleased_history/new_features/per_key_placement_remote_compaction.md
+++ /dev/null
@@ -1 +0,0 @@
-Added per-key-placement feature in Remote Compaction
diff --git a/unreleased_history/new_features/table_properties_for_levels.md b/unreleased_history/new_features/table_properties_for_levels.md
deleted file mode 100644
index d0b2653c8d3c..000000000000
--- a/unreleased_history/new_features/table_properties_for_levels.md
+++ /dev/null
@@ -1 +0,0 @@
-Implemented API DB::GetPropertiesOfTablesByLevel that retrieves table properties for files in each LSM tree level
\ No newline at end of file
diff --git a/unreleased_history/public_api_changes/optslice.md b/unreleased_history/public_api_changes/optslice.md
deleted file mode 100644
index f7047db1d4d2..000000000000
--- a/unreleased_history/public_api_changes/optslice.md
+++ /dev/null
@@ -1,2 +0,0 @@
-* `GetAllKeyVersions()` now interprets empty slices literally, as valid keys, and uses new `OptSlice` type default value for extreme upper and lower range limits.
-* `DeleteFilesInRanges()` now takes `RangeOpt` which is based on `OptSlice`. The overload taking `RangePtr` is deprecated.
diff --git a/unreleased_history/public_api_changes/read_options_property_bag.md b/unreleased_history/public_api_changes/read_options_property_bag.md
deleted file mode 100644
index 5b9b58e1ddb6..000000000000
--- a/unreleased_history/public_api_changes/read_options_property_bag.md
+++ /dev/null
@@ -1 +0,0 @@
-Add an unordered map of name/value pairs, ReadOptions::property_bag, to pass opaque options through to an external table when creating an Iterator.
diff --git a/unreleased_history/public_api_changes/remote_compaction_aborted_status.md b/unreleased_history/public_api_changes/remote_compaction_aborted_status.md
deleted file mode 100644
index eb36ed63ac6b..000000000000
--- a/unreleased_history/public_api_changes/remote_compaction_aborted_status.md
+++ /dev/null
@@ -1 +0,0 @@
-Introduced CompactionServiceJobStatus::kAborted to allow handling aborted scenario in Schedule(), Wait() or OnInstallation() APIs in Remote Compactions.
diff --git a/unreleased_history/public_api_changes/unsupport_fv1.md b/unreleased_history/public_api_changes/unsupport_fv1.md
deleted file mode 100644
index 6f31edde0b52..000000000000
--- a/unreleased_history/public_api_changes/unsupport_fv1.md
+++ /dev/null
@@ -1 +0,0 @@
-* format\_version < 2 in BlockBasedTableOptions is no longer supported for writing new files. Support for reading such files is deprecated and might be removed in the future. `CompressedSecondaryCacheOptions::compress_format_version == 1` is also deprecated.

From 743a02d6f68811bed8c5f3ce5d7e6d9b6bc18783 Mon Sep 17 00:00:00 2001
From: prerit <8664107+jainpr@users.noreply.github.com>
Date: Thu, 27 Mar 2025 15:10:55 -0700
Subject: [PATCH 035/500] Check for yields while waiting for lock in a loop
 (#13498)

Summary:
Acquiring a lock here can take a long time and cause a user mode scheduler to hold up, as it relies on explicit yielding. Hence, forcing a check here but ignoring any abort requests. Would rely on upstream to take action on aborts.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13498

Reviewed By: pdillinger

Differential Revision: D71987173

Pulled By: jainpr

fbshipit-source-id: 4aec40bdf0bc657e29f72c306c576b3117f97a25
---
 utilities/transactions/lock/point/point_lock_manager.cc | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/utilities/transactions/lock/point/point_lock_manager.cc b/utilities/transactions/lock/point/point_lock_manager.cc
index 85916b86f9af..97d3ace29d1c 100644
--- a/utilities/transactions/lock/point/point_lock_manager.cc
+++ b/utilities/transactions/lock/point/point_lock_manager.cc
@@ -318,6 +318,9 @@ Status PointLockManager::AcquireWithTimeout(
       } else {
         uint64_t now = env->NowMicros();
         if (static_cast<uint64_t>(cv_end_time) > now) {
+          // This may be invoked multiple times since we divide
+          // the time into smaller intervals.
+          (void)ROCKSDB_THREAD_YIELD_CHECK_ABORT();
           result = stripe->stripe_cv->WaitFor(stripe->stripe_mutex,
                                               cv_end_time - now);
         }

From 325dcdf2e54aaaad9ca44938deb2729142e670dc Mon Sep 17 00:00:00 2001
From: Changyu Bi <changyubi@meta.com>
Date: Fri, 28 Mar 2025 14:49:28 -0700
Subject: [PATCH 036/500] Deprecate `ReadOptions::ignore_range_deletions` and
 `experimental::PromoteL0()` (#13500)

Summary:
based on the option comment, `ignore_range_deletions` was added due to the overhead of range deletions in read path when a DB does not use DeleteRange(). The current implementation should not have a noticeable performance difference in this case.

`experimental::PromoteL0()` can be replaced by doing a manual compaction with proper CompactRangeOptions.

There are some internal use of these option and API so we will remove them later after the usages are updated.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13500

Test Plan:
comment change only.
Performance: benchmark the performance difference with `ignore_range_deletions` and without (borrowed flag `universal_incremental` for this purpose), ran at the same time on the same machine.

- random point get:
    - ignore_range_deletions=false: 343078 ops/sec
    - ignore_range_deletions=true: 340219 ops/sec (0.8% slower)
```
(for I in $(seq 1 1); do TEST_TMPDIR=/dev/shm/t1 /data/users/changyubi/vscode-root/rocksdb/db_bench --benchmarks=fillseq,waitforcompaction,readrandom --write_buffer_size=67108864 --writes=1000000 --num=2000000 --reads=1000000  --seed=1723056275 --universal_incremental=false 2>&1 | grep "readrandom"; done;) | awk '{ t += $5; c++; print } END { print 1.0 * t / c }';
```

- sequential scan:
  - ignore_range_deletions=false: 5378104 ops/sec
  - ignore_range_deletions=true: 5393809 ops/sec (0.3% faster)
```
(for I in $(seq 1 10); do TEST_TMPDIR=/dev/shm/t1 /data/users/changyubi/vscode-root/rocksdb/db_bench --benchmarks=fillseq,waitforcompaction,readseq[-X10] --write_buffer_size=67108864 --writes=1000000 --num=2000000  --universal_incremental=true --seed=1723056275 2>1 | grep "\[AVG 10 runs\]"; done;) | awk '{ t += $6; c++; print; } END { printf "%.0f\n", 1.0 * t / c }';
```

The difference in ops/sec for the two benchmarks is likely noise.

Reviewed By: hx235

Differential Revision: D72069223

Pulled By: cbi42

fbshipit-source-id: ad82a051aa4682790d2178cd4fb2d1467397fbb5
---
 include/rocksdb/experimental.h                      |  5 +++++
 include/rocksdb/options.h                           |  4 ++++
 java/src/main/java/org/rocksdb/ReadOptions.java     |  6 ++++++
 java/src/main/java/org/rocksdb/RocksDB.java         | 13 +++++++++----
 .../deprecate-ignore-range-del.md                   |  1 +
 .../public_api_changes/deprecate-promote-l0.md      |  1 +
 6 files changed, 26 insertions(+), 4 deletions(-)
 create mode 100644 unreleased_history/public_api_changes/deprecate-ignore-range-del.md
 create mode 100644 unreleased_history/public_api_changes/deprecate-promote-l0.md

diff --git a/include/rocksdb/experimental.h b/include/rocksdb/experimental.h
index 349d05f9b403..d6a34c025728 100644
--- a/include/rocksdb/experimental.h
+++ b/include/rocksdb/experimental.h
@@ -21,6 +21,11 @@ Status SuggestCompactRange(DB* db, ColumnFamilyHandle* column_family,
                            const Slice* begin, const Slice* end);
 Status SuggestCompactRange(DB* db, const Slice* begin, const Slice* end);
 
+// DEPRECATED: this API may be removed in a future release.
+// This operation can be done through CompactRange() by setting
+// CompactRangeOptions::bottommost_level_compaction set to
+// BottommostLevelCompaction::kSkip and setting target level.
+//
 // Move all L0 files to target_level skipping compaction.
 // This operation succeeds only if the files in L0 have disjoint ranges; this
 // is guaranteed to happen, for instance, if keys are inserted in sorted
diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index 6b66976aa371..181e7781b67c 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -1784,6 +1784,10 @@ struct ReadOptions {
   // block cache.
   bool fill_cache = true;
 
+  // DEPRECATED: This option might be removed in a future release.
+  // There should be no noticeable performance difference whether this option
+  // is turned on or off when a DB does not use DeleteRange().
+  //
   // If true, range tombstones handling will be skipped in key lookup paths.
   // For DB instances that don't use DeleteRange() calls, this setting can
   // be used to optimize the read performance.
diff --git a/java/src/main/java/org/rocksdb/ReadOptions.java b/java/src/main/java/org/rocksdb/ReadOptions.java
index 5ce4a8656d3e..8cc9883d23cd 100644
--- a/java/src/main/java/org/rocksdb/ReadOptions.java
+++ b/java/src/main/java/org/rocksdb/ReadOptions.java
@@ -398,7 +398,10 @@ public ReadOptions setMaxSkippableInternalKeys(
    * Default: false
    *
    * @return true if keys deleted using the DeleteRange() API will be visible
+   *
+   * @deprecated This option may be remove in a future release.
    */
+  @Deprecated
   public boolean ignoreRangeDeletions() {
     assert(isOwningHandle());
     return ignoreRangeDeletions(nativeHandle_);
@@ -414,7 +417,10 @@ public boolean ignoreRangeDeletions() {
    * @param ignoreRangeDeletions true if keys deleted using the DeleteRange()
    *     API should be visible
    * @return the reference to the current ReadOptions.
+   *
+   * @deprecated This option may be remove in a future release.
    */
+  @Deprecated
   public ReadOptions setIgnoreRangeDeletions(final boolean ignoreRangeDeletions) {
     assert(isOwningHandle());
     setIgnoreRangeDeletions(nativeHandle_, ignoreRangeDeletions);
diff --git a/java/src/main/java/org/rocksdb/RocksDB.java b/java/src/main/java/org/rocksdb/RocksDB.java
index 1ffb44b6a1b2..d01b98f48a17 100644
--- a/java/src/main/java/org/rocksdb/RocksDB.java
+++ b/java/src/main/java/org/rocksdb/RocksDB.java
@@ -4633,10 +4633,13 @@ public Range suggestCompactRange()
    * @param targetLevel the target level for L0
    *
    * @throws RocksDBException if an error occurs whilst promoting L0
+   *
+   * @deprecated this API may be removed in a future release.
    */
+  @Deprecated
   public void promoteL0(
-      /* @Nullable */final ColumnFamilyHandle columnFamilyHandle,
-      final int targetLevel) throws RocksDBException {
+      /* @Nullable */ final ColumnFamilyHandle columnFamilyHandle, final int targetLevel)
+      throws RocksDBException {
     promoteL0(nativeHandle_,
         columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_,
         targetLevel);
@@ -4648,9 +4651,11 @@ public void promoteL0(
    * @param targetLevel the target level for L0
    *
    * @throws RocksDBException if an error occurs whilst promoting L0
+   *
+   * @deprecated this API may be removed in a future release.
    */
-  public void promoteL0(final int targetLevel)
-      throws RocksDBException {
+  @Deprecated
+  public void promoteL0(final int targetLevel) throws RocksDBException {
     promoteL0(null, targetLevel);
   }
 
diff --git a/unreleased_history/public_api_changes/deprecate-ignore-range-del.md b/unreleased_history/public_api_changes/deprecate-ignore-range-del.md
new file mode 100644
index 000000000000..d4e09e6ec2ee
--- /dev/null
+++ b/unreleased_history/public_api_changes/deprecate-ignore-range-del.md
@@ -0,0 +1 @@
+* Deprecated `ReadOptions::ignore_range_deletions`.
diff --git a/unreleased_history/public_api_changes/deprecate-promote-l0.md b/unreleased_history/public_api_changes/deprecate-promote-l0.md
new file mode 100644
index 000000000000..bcc31298299f
--- /dev/null
+++ b/unreleased_history/public_api_changes/deprecate-promote-l0.md
@@ -0,0 +1 @@
+* Deprecated API `experimental::PromoteL0()`.

From 48eb646787437bd9a1c5f03b717038ce23e15ee5 Mon Sep 17 00:00:00 2001
From: Hui Xiao <huixiao@fb.com>
Date: Mon, 31 Mar 2025 19:29:40 -0700
Subject: [PATCH 037/500] Mark MaxMemCompactionLevel() deprecated (#13503)

Summary:
**Context/Summary:**

MaxMemCompactionLevel() developed 10 years ago simply returns the level a memtable flushed to, which has historically been L0 and have no plan to change to something different for future. It is also not used in test or internally.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13503

Test Plan: CI + fake release

Reviewed By: cbi42

Differential Revision: D72066092

Pulled By: hx235

fbshipit-source-id: 5ff5b16a6664ef3efabd3a6fbd8a2d0529b62460
---
 db/db_test.cc                                          |  5 -----
 include/rocksdb/db.h                                   |  5 ++++-
 java/src/main/java/org/rocksdb/RocksDB.java            |  1 +
 java/src/test/java/org/rocksdb/RocksDBTest.java        | 10 ----------
 .../dep_max_compact_memtable_level.md                  |  1 +
 5 files changed, 6 insertions(+), 16 deletions(-)
 create mode 100644 unreleased_history/public_api_changes/dep_max_compact_memtable_level.md

diff --git a/db/db_test.cc b/db/db_test.cc
index 763cdfc22d66..b1c181a1f3ca 100644
--- a/db/db_test.cc
+++ b/db/db_test.cc
@@ -3351,11 +3351,6 @@ class ModelDB : public DB {
   using DB::NumberLevels;
   int NumberLevels(ColumnFamilyHandle* /*column_family*/) override { return 1; }
 
-  using DB::MaxMemCompactionLevel;
-  int MaxMemCompactionLevel(ColumnFamilyHandle* /*column_family*/) override {
-    return 1;
-  }
-
   using DB::Level0StopWriteTrigger;
   int Level0StopWriteTrigger(ColumnFamilyHandle* /*column_family*/) override {
     return -1;
diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h
index df951b2810a8..dab74b8ef934 100644
--- a/include/rocksdb/db.h
+++ b/include/rocksdb/db.h
@@ -1658,9 +1658,12 @@ class DB {
   virtual int NumberLevels(ColumnFamilyHandle* column_family) = 0;
   virtual int NumberLevels() { return NumberLevels(DefaultColumnFamily()); }
 
+  // DEPRECATED:
   // Maximum level to which a new compacted memtable is pushed if it
   // does not create overlap.
-  virtual int MaxMemCompactionLevel(ColumnFamilyHandle* column_family) = 0;
+  virtual int MaxMemCompactionLevel(ColumnFamilyHandle* /*column_family*/) {
+    return 0;
+  }
   virtual int MaxMemCompactionLevel() {
     return MaxMemCompactionLevel(DefaultColumnFamily());
   }
diff --git a/java/src/main/java/org/rocksdb/RocksDB.java b/java/src/main/java/org/rocksdb/RocksDB.java
index d01b98f48a17..b423649c111d 100644
--- a/java/src/main/java/org/rocksdb/RocksDB.java
+++ b/java/src/main/java/org/rocksdb/RocksDB.java
@@ -4126,6 +4126,7 @@ public int numberLevels(/* @Nullable */final ColumnFamilyHandle columnFamilyHand
    *
    * @return the maximum level
    */
+  @Deprecated
   public int maxMemCompactionLevel() {
     return maxMemCompactionLevel(null);
   }
diff --git a/java/src/test/java/org/rocksdb/RocksDBTest.java b/java/src/test/java/org/rocksdb/RocksDBTest.java
index 5a9c76fd8e3b..50cdf86a3e44 100644
--- a/java/src/test/java/org/rocksdb/RocksDBTest.java
+++ b/java/src/test/java/org/rocksdb/RocksDBTest.java
@@ -1563,16 +1563,6 @@ public void numberLevels() throws RocksDBException {
     }
   }
 
-  @Test
-  public void maxMemCompactionLevel() throws RocksDBException {
-    try (final Options options = new Options().setCreateIfMissing(true)) {
-      final String dbPath = dbFolder.getRoot().getAbsolutePath();
-      try (final RocksDB db = RocksDB.open(options, dbPath)) {
-        assertThat(db.maxMemCompactionLevel()).isEqualTo(0);
-      }
-    }
-  }
-
   @Test
   public void level0StopWriteTrigger() throws RocksDBException {
     try (final Options options = new Options().setCreateIfMissing(true)) {
diff --git a/unreleased_history/public_api_changes/dep_max_compact_memtable_level.md b/unreleased_history/public_api_changes/dep_max_compact_memtable_level.md
new file mode 100644
index 000000000000..9b41bd977275
--- /dev/null
+++ b/unreleased_history/public_api_changes/dep_max_compact_memtable_level.md
@@ -0,0 +1 @@
+* Deprecated API `DB::MaxMemCompactionLevel()`.

From be99011f08a042ad76859476b2263ce108ec3be8 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Tue, 1 Apr 2025 14:17:37 -0700
Subject: [PATCH 038/500] More separation of txn_write_policy for crash tests
 (#13499)

Summary:
We are seeing some occasional failures with WRITE_(UN)PREPARED crash test runs, and it's alarming when these are grouped in with WRITE_COMMITTED, which AFAIK is the only one considered mature and mission-critical at this point.

* Mark WRITE_(UN)PREPARED as EXPERIMENTAL in the public APIs
* Separate out the `_with_txn` crash test jobs by write policy, now `_with_wc_txn`, `_with_wp_txn` and `_with_wup_txn` so that the major functional and maturity differences are better grouped.
* Add `_with_multiops_wup_txn` which was apparently missing
* Clean up db_crashtest.py for better consistency
  * Get rid of awkard "write_policy" parameter that could conflict with authoritative "txn_write_policy" parameter.
  * Similarly, move some multiops logic from different parameter sets to finalize_and_sanitize logic.

Immediate internal follow-up:
* Migrate from `_with_txn` which are now deprecated aliases of `_with_wc_txn` to more jobs with the new variants. And likely also add new multiops job.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13499

Test Plan: manual runs of modified jobs, at least long enough to spot check things like txn_write_policy

Reviewed By: hx235

Differential Revision: D72015307

Pulled By: pdillinger

fbshipit-source-id: 06b99b2d1f15ac76fe7b8e22c93a51aaa2a42ecf
---
 crash_test.mk                              | 76 +++++++++++++++++-----
 include/rocksdb/utilities/transaction_db.h | 13 +++-
 tools/db_crashtest.py                      | 66 ++++++++-----------
 3 files changed, 99 insertions(+), 56 deletions(-)

diff --git a/crash_test.mk b/crash_test.mk
index a71a55c15c73..43cce994a23b 100644
--- a/crash_test.mk
+++ b/crash_test.mk
@@ -11,18 +11,29 @@ CRASHTEST_MAKE=$(MAKE) -f crash_test.mk
 CRASHTEST_PY=$(PYTHON) -u tools/db_crashtest.py --stress_cmd=$(DB_STRESS_CMD) --cleanup_cmd='$(DB_CLEANUP_CMD)'
 
 .PHONY: crash_test crash_test_with_atomic_flush crash_test_with_txn \
+	crash_test_with_wc_txn crash_test_with_wp_txn crash_test_with_wup_txn \
 	crash_test_with_best_efforts_recovery crash_test_with_ts \
+	crash_test_with_multiops_wc_txn \
+	crash_test_with_multiops_wp_txn \
+	crash_test_with_multiops_wup_txn \
+	crash_test_with_optimistic_txn \
+	crash_test_with_tiered_storage \
 	blackbox_crash_test blackbox_crash_test_with_atomic_flush \
+	blackbox_crash_test_with_wc_txn blackbox_crash_test_with_wp_txn \
+	blackbox_crash_test_with_wup_txn \
 	blackbox_crash_test_with_txn blackbox_crash_test_with_ts \
 	blackbox_crash_test_with_best_efforts_recovery \
-	whitebox_crash_test whitebox_crash_test_with_atomic_flush \
-	whitebox_crash_test_with_txn whitebox_crash_test_with_ts \
 	blackbox_crash_test_with_multiops_wc_txn \
 	blackbox_crash_test_with_multiops_wp_txn \
-	crash_test_with_tiered_storage blackbox_crash_test_with_tiered_storage \
-	whitebox_crash_test_with_tiered_storage \
-	whitebox_crash_test_with_optimistic_txn \
+	blackbox_crash_test_with_multiops_wup_txn \
 	blackbox_crash_test_with_optimistic_txn \
+	blackbox_crash_test_with_tiered_storage \
+	whitebox_crash_test whitebox_crash_test_with_atomic_flush \
+	whitebox_crash_test_with_wc_txn whitebox_crash_test_with_wp_txn \
+	whitebox_crash_test_with_wup_txn \
+	whitebox_crash_test_with_txn whitebox_crash_test_with_ts \
+	whitebox_crash_test_with_optimistic_txn \
+	whitebox_crash_test_with_tiered_storage \
 
 crash_test: $(DB_STRESS_CMD)
 # Do not parallelize
@@ -34,10 +45,20 @@ crash_test_with_atomic_flush: $(DB_STRESS_CMD)
 	$(CRASHTEST_MAKE) whitebox_crash_test_with_atomic_flush
 	$(CRASHTEST_MAKE) blackbox_crash_test_with_atomic_flush
 
-crash_test_with_txn: $(DB_STRESS_CMD)
+crash_test_with_wc_txn: $(DB_STRESS_CMD)
+# Do not parallelize
+	$(CRASHTEST_MAKE) whitebox_crash_test_with_wc_txn
+	$(CRASHTEST_MAKE) blackbox_crash_test_with_wc_txn
+
+crash_test_with_wp_txn: $(DB_STRESS_CMD)
 # Do not parallelize
-	$(CRASHTEST_MAKE) whitebox_crash_test_with_txn
-	$(CRASHTEST_MAKE) blackbox_crash_test_with_txn
+	$(CRASHTEST_MAKE) whitebox_crash_test_with_wp_txn
+	$(CRASHTEST_MAKE) blackbox_crash_test_with_wp_txn
+
+crash_test_with_wup_txn: $(DB_STRESS_CMD)
+# Do not parallelize
+	$(CRASHTEST_MAKE) whitebox_crash_test_with_wup_txn
+	$(CRASHTEST_MAKE) blackbox_crash_test_with_wup_txn
 
 crash_test_with_optimistic_txn: $(DB_STRESS_CMD)
 # Do not parallelize
@@ -62,6 +83,9 @@ crash_test_with_multiops_wc_txn: $(DB_STRESS_CMD)
 crash_test_with_multiops_wp_txn: $(DB_STRESS_CMD)
 	$(CRASHTEST_MAKE) blackbox_crash_test_with_multiops_wp_txn
 
+crash_test_with_multiops_wup_txn: $(DB_STRESS_CMD)
+	$(CRASHTEST_MAKE) blackbox_crash_test_with_multiops_wup_txn
+
 blackbox_crash_test: $(DB_STRESS_CMD)
 	$(CRASHTEST_PY) --simple blackbox $(CRASH_TEST_EXT_ARGS)
 	$(CRASHTEST_PY) blackbox $(CRASH_TEST_EXT_ARGS)
@@ -69,8 +93,14 @@ blackbox_crash_test: $(DB_STRESS_CMD)
 blackbox_crash_test_with_atomic_flush: $(DB_STRESS_CMD)
 	$(CRASHTEST_PY) --cf_consistency blackbox $(CRASH_TEST_EXT_ARGS)
 
-blackbox_crash_test_with_txn: $(DB_STRESS_CMD)
-	$(CRASHTEST_PY) --txn blackbox $(CRASH_TEST_EXT_ARGS)
+blackbox_crash_test_with_wc_txn: $(DB_STRESS_CMD)
+	$(CRASHTEST_PY) --txn blackbox --txn_write_policy 0 $(CRASH_TEST_EXT_ARGS)
+
+blackbox_crash_test_with_wp_txn: $(DB_STRESS_CMD)
+	$(CRASHTEST_PY) --txn blackbox --txn_write_policy 1 $(CRASH_TEST_EXT_ARGS)
+
+blackbox_crash_test_with_wup_txn: $(DB_STRESS_CMD)
+	$(CRASHTEST_PY) --txn blackbox --txn_write_policy 2 $(CRASH_TEST_EXT_ARGS)
 
 blackbox_crash_test_with_best_efforts_recovery: $(DB_STRESS_CMD)
 	$(CRASHTEST_PY) --test_best_efforts_recovery blackbox $(CRASH_TEST_EXT_ARGS)
@@ -79,10 +109,13 @@ blackbox_crash_test_with_ts: $(DB_STRESS_CMD)
 	$(CRASHTEST_PY) --enable_ts blackbox $(CRASH_TEST_EXT_ARGS)
 
 blackbox_crash_test_with_multiops_wc_txn: $(DB_STRESS_CMD)
-	$(CRASHTEST_PY) --test_multiops_txn --write_policy write_committed blackbox $(CRASH_TEST_EXT_ARGS)
+	$(CRASHTEST_PY) --test_multiops_txn --txn_write_policy 0 blackbox $(CRASH_TEST_EXT_ARGS)
 
 blackbox_crash_test_with_multiops_wp_txn: $(DB_STRESS_CMD)
-	$(CRASHTEST_PY) --test_multiops_txn --write_policy write_prepared blackbox $(CRASH_TEST_EXT_ARGS)
+	$(CRASHTEST_PY) --test_multiops_txn --txn_write_policy 1 blackbox $(CRASH_TEST_EXT_ARGS)
+
+blackbox_crash_test_with_multiops_wup_txn: $(DB_STRESS_CMD)
+	$(CRASHTEST_PY) --test_multiops_txn --txn_write_policy 2 blackbox $(CRASH_TEST_EXT_ARGS)
 
 blackbox_crash_test_with_tiered_storage: $(DB_STRESS_CMD)
 	$(CRASHTEST_PY) --test_tiered_storage blackbox $(CRASH_TEST_EXT_ARGS)
@@ -104,9 +137,17 @@ whitebox_crash_test_with_atomic_flush: $(DB_STRESS_CMD)
 	$(CRASHTEST_PY) --cf_consistency whitebox  --random_kill_odd \
       $(CRASH_TEST_KILL_ODD) $(CRASH_TEST_EXT_ARGS)
 
-whitebox_crash_test_with_txn: $(DB_STRESS_CMD)
-	$(CRASHTEST_PY) --txn whitebox --random_kill_odd \
-      $(CRASH_TEST_KILL_ODD) $(CRASH_TEST_EXT_ARGS)
+whitebox_crash_test_with_wc_txn: $(DB_STRESS_CMD)
+	$(CRASHTEST_PY) --txn whitebox --txn_write_policy 0 \
+	  --random_kill_odd $(CRASH_TEST_KILL_ODD) $(CRASH_TEST_EXT_ARGS)
+
+whitebox_crash_test_with_wp_txn: $(DB_STRESS_CMD)
+	$(CRASHTEST_PY) --txn whitebox --txn_write_policy 1 \
+      --random_kill_odd $(CRASH_TEST_KILL_ODD) $(CRASH_TEST_EXT_ARGS)
+
+whitebox_crash_test_with_wup_txn: $(DB_STRESS_CMD)
+	$(CRASHTEST_PY) --txn whitebox --txn_write_policy 2 \
+      --random_kill_odd $(CRASH_TEST_KILL_ODD) $(CRASH_TEST_EXT_ARGS)
 
 whitebox_crash_test_with_ts: $(DB_STRESS_CMD)
 	$(CRASHTEST_PY) --enable_ts whitebox --random_kill_odd \
@@ -119,3 +160,8 @@ whitebox_crash_test_with_tiered_storage: $(DB_STRESS_CMD)
 whitebox_crash_test_with_optimistic_txn: $(DB_STRESS_CMD)
 	$(CRASHTEST_PY) --optimistic_txn whitebox --random_kill_odd \
       $(CRASH_TEST_KILL_ODD) $(CRASH_TEST_EXT_ARGS)
+
+# Old names DEPRECATED
+crash_test_with_txn: crash_test_with_wc_txn
+whitebox_crash_test_with_txn: whitebox_crash_test_with_wc_txn
+blackbox_crash_test_with_txn: blackbox_crash_test_with_wc_txn
diff --git a/include/rocksdb/utilities/transaction_db.h b/include/rocksdb/utilities/transaction_db.h
index 766fe75917c5..156583333d33 100644
--- a/include/rocksdb/utilities/transaction_db.h
+++ b/include/rocksdb/utilities/transaction_db.h
@@ -24,9 +24,16 @@ class SecondaryIndex;
 class TransactionDBMutexFactory;
 
 enum TxnDBWritePolicy {
-  WRITE_COMMITTED = 0,  // write only the committed data
-  WRITE_PREPARED,       // write data after the prepare phase of 2pc
-  WRITE_UNPREPARED      // write data before the prepare phase of 2pc
+  // Write data at transaction commit time
+  WRITE_COMMITTED = 0,
+
+  // EXPERIMENTAL: The remaining write policies are not as mature, well
+  // validated, nor as compatible with other features as WRITE_COMMITTED.
+
+  // Write data after the prepare phase of 2pc
+  WRITE_PREPARED,
+  // Write data before the prepare phase of 2pc
+  WRITE_UNPREPARED
 };
 
 constexpr uint32_t kInitialMaxDeadlocks = 5;
diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index 831de21fd9d3..aef6bc3b8906 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -522,6 +522,7 @@ def is_direct_io_supported(dbname):
     "use_txn": 1,
     "use_optimistic_txn": 0,
     # Avoid lambda to set it once for the entire test
+    # NOTE: often passed in from command line overriding this
     "txn_write_policy": random.randint(0, 2),
     "unordered_write": random.randint(0, 1),
     # TODO: there is such a thing as transactions with WAL disabled. We should
@@ -623,18 +624,22 @@ def is_direct_io_supported(dbname):
     "default_write_temperature": lambda: random.choice(["kUnknown", "kHot", "kWarm"]),
 }
 
-multiops_txn_default_params = {
+multiops_txn_params = {
     "test_cf_consistency": 0,
     "test_batches_snapshots": 0,
     "test_multi_ops_txns": 1,
     "use_txn": 1,
+    # Avoid lambda to set it once for the entire test
+    # NOTE: often passed in from command line overriding this
+    "txn_write_policy": random.randint(0, 2),
     "two_write_queues": lambda: random.choice([0, 1]),
     # TODO: enable write-prepared
     "disable_wal": 0,
     "use_only_the_last_commit_time_batch_for_recovery": lambda: random.choice([0, 1]),
     "clear_column_family_one_in": 0,
     "column_families": 1,
-    "enable_pipelined_write": lambda: random.choice([0, 1]),
+    # TODO re-enable pipelined write (lambda: random.choice([0, 1]))
+    "enable_pipelined_write": 0,
     # This test already acquires snapshots in reads
     "acquire_snapshot_one_in": 0,
     "backup_one_in": 0,
@@ -681,34 +686,9 @@ def is_direct_io_supported(dbname):
     "use_timed_put_one_in": 0,
     # AttributeGroup not yet supported
     "use_attribute_group": 0,
-}
-
-multiops_wc_txn_params = {
-    "txn_write_policy": 0,
-    # TODO re-enable pipelined write. Not well tested atm
-    "enable_pipelined_write": 0,
     "commit_bypass_memtable_one_in": random.choice([0] * 4 + [100]),
 }
 
-multiops_wp_txn_params = {
-    "txn_write_policy": 1,
-    "wp_snapshot_cache_bits": 1,
-    # try small wp_commit_cache_bits, e.g. 0 once we explore storing full
-    # commit sequence numbers in commit cache
-    "wp_commit_cache_bits": 10,
-    # pipeline write is not currnetly compatible with WritePrepared txns
-    "enable_pipelined_write": 0,
-    # OpenReadOnly after checkpoint is not currnetly compatible with WritePrepared txns
-    "checkpoint_one_in": 0,
-    # Required to be 1 in order to use commit-time-batch
-    "use_only_the_last_commit_time_batch_for_recovery": 1,
-    "clear_wp_commit_cache_one_in": 10,
-    "create_timestamped_snapshot_one_in": 0,
-    # sequence number can be advanced in SwitchMemtable::WriteRecoverableState() for WP.
-    # disable it for now until we find another way to test LockWAL().
-    "lock_wal_one_in": 0,
-}
-
 
 def finalize_and_sanitize(src_params):
     dest_params = {k: v() if callable(v) else v for (k, v) in src_params.items()}
@@ -802,8 +782,9 @@ def finalize_and_sanitize(src_params):
     # Remove the following once write-prepared/write-unprepared with/without
     # unordered write supports timestamped snapshots
     if dest_params.get("create_timestamped_snapshot_one_in", 0) > 0:
-        dest_params["txn_write_policy"] = 0
         dest_params["unordered_write"] = 0
+        if dest_params.get("txn_write_policy", 0) != 0:
+            dest_params["create_timestamped_snapshot_one_in"] = 0
     # Only under WritePrepared txns, unordered_write would provide the same guarnatees as vanilla rocksdb
     # unordered_write is only enabled with --txn, and txn_params disables inplace_update_support, so
     # setting allow_concurrent_memtable_write=1 won't conflcit with inplace_update_support.
@@ -888,6 +869,23 @@ def finalize_and_sanitize(src_params):
         dest_params["metadata_write_fault_one_in"] = 0
         dest_params["read_fault_one_in"] = 0
         dest_params["metadata_read_fault_one_in"] = 0
+        if dest_params.get("txn_write_policy", 0) != 0:
+            # TODO: should any of this change for WUP (txn_write_policy==2)?
+            dest_params["wp_snapshot_cache_bits"] = 1
+            # try small wp_commit_cache_bits, e.g. 0 once we explore storing full
+            # commit sequence numbers in commit cache
+            dest_params["wp_commit_cache_bits"] = 10
+            # pipeline write is not currnetly compatible with WritePrepared txns
+            dest_params["enable_pipelined_write"] = 0
+            # OpenReadOnly after checkpoint is not currently compatible with WritePrepared txns
+            dest_params["checkpoint_one_in"] = 0
+            # Required to be 1 in order to use commit-time-batch
+            dest_params["use_only_the_last_commit_time_batch_for_recovery"] = 1
+            dest_params["clear_wp_commit_cache_one_in"] = 10
+            # sequence number can be advanced in SwitchMemtable::WriteRecoverableState() for WP.
+            # disable it for now until we find another way to test LockWAL().
+            dest_params["lock_wal_one_in"] = 0
+
     # Wide column stress tests require FullMergeV3
     if dest_params["use_put_entity_one_in"] != 0:
         dest_params["use_full_merge_v1"] = 0
@@ -1058,11 +1056,7 @@ def gen_cmd_params(args):
     if args.enable_ts:
         params.update(ts_params)
     if args.test_multiops_txn:
-        params.update(multiops_txn_default_params)
-        if args.write_policy == "write_committed":
-            params.update(multiops_wc_txn_params)
-        elif args.write_policy == "write_prepared":
-            params.update(multiops_wp_txn_params)
+        params.update(multiops_txn_params)
     if args.test_tiered_storage:
         params.update(tiered_params)
 
@@ -1111,7 +1105,6 @@ def gen_cmd(params, unknown_params):
                 "test_best_efforts_recovery",
                 "enable_ts",
                 "test_multiops_txn",
-                "write_policy",
                 "stress_cmd",
                 "test_tiered_storage",
                 "cleanup_cmd",
@@ -1422,7 +1415,6 @@ def main():
     parser.add_argument("--test_best_efforts_recovery", action="store_true")
     parser.add_argument("--enable_ts", action="store_true")
     parser.add_argument("--test_multiops_txn", action="store_true")
-    parser.add_argument("--write_policy", choices=["write_committed", "write_prepared"])
     parser.add_argument("--stress_cmd")
     parser.add_argument("--test_tiered_storage", action="store_true")
     parser.add_argument("--cleanup_cmd")
@@ -1438,9 +1430,7 @@ def main():
         + list(whitebox_simple_default_params.items())
         + list(blob_params.items())
         + list(ts_params.items())
-        + list(multiops_txn_default_params.items())
-        + list(multiops_wc_txn_params.items())
-        + list(multiops_wp_txn_params.items())
+        + list(multiops_txn_params.items())
         + list(best_efforts_recovery_params.items())
         + list(cf_consistency_params.items())
         + list(tiered_params.items())

From b7a9d414c8bbcc7a5dd9329e344a0451770ab555 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Tue, 1 Apr 2025 18:16:07 -0700
Subject: [PATCH 039/500] Fix WriteBatch atomicity and WAL recovery for some
 failures (#13489)

Summary:
Essentially fix https://github.com/facebook/rocksdb/issues/13429 by
* Avoiding publishing to readers a partial write batch written to memtable. Also clarify in DB::Write that WriteBatch is applied atomically, and improve some logging.
* When we know we have written a bad write batch to WAL due to memtable insert failure, make a good effort to roll it back to make the DB recoverable. (Not compatible with all options.)

Fixes https://github.com/facebook/rocksdb/issues/13429

Follow-up items:
* More rigorously test and fix the code paths and option combinations where these features could be useful.
* Allow default CF with disallow_memtable_writes (with caveat that violation stops writes on your open DB)

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13489

Test Plan: Updated existing test, manually verified the DB went into a "stopped" state at least in this example.

Reviewed By: jaykorean

Differential Revision: D71917670

Pulled By: pdillinger

fbshipit-source-id: c9b9dfc102817fc4c160a6c7170c04011c228aaf
---
 db/db_impl/db_impl.h        | 19 +++++++++-
 db/db_impl/db_impl_write.cc | 71 +++++++++++++++++++++++++------------
 db/error_handler.cc         | 14 +++++---
 db/plain_table_db_test.cc   | 23 ++++++------
 include/rocksdb/db.h        |  2 +-
 5 files changed, 87 insertions(+), 42 deletions(-)

diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h
index d8530cd7a98a..80f46ebd2c92 100644
--- a/db/db_impl/db_impl.h
+++ b/db/db_impl/db_impl.h
@@ -1781,6 +1781,13 @@ class DBImpl : public DB {
       if (writer->file()) {
         // TODO: plumb Env::IOActivity, Env::IOPriority
         s = writer->WriteBuffer(WriteOptions());
+        if (attempt_truncate_size < SIZE_MAX &&
+            attempt_truncate_size < writer->file()->GetFileSize()) {
+          Status s2 = writer->file()->writable_file()->Truncate(
+              attempt_truncate_size, IOOptions{}, nullptr);
+          // This is just a best effort attempt
+          s2.PermitUncheckedError();
+        }
       }
       delete writer;
       writer = nullptr;
@@ -1813,6 +1820,11 @@ class DBImpl : public DB {
       getting_synced = false;
     }
 
+    void SetAttemptTruncateSize(uint64_t size) {
+      assert(attempt_truncate_size == SIZE_MAX);
+      attempt_truncate_size = size;
+    }
+
     uint64_t number;
     // Visual Studio doesn't support deque's member to be noncopyable because
     // of a std::unique_ptr as a member.
@@ -1825,6 +1837,10 @@ class DBImpl : public DB {
     // to be persisted even if appends happen during sync so it can be used for
     // tracking the synced size in MANIFEST.
     uint64_t pre_sync_size = 0;
+    // When < SIZE_MAX, attempt to truncate the WAL to this size on close,
+    // because a bad entry was written to it beyond that point and it likely
+    // won't be recoverable with the bad entry.
+    uint64_t attempt_truncate_size = SIZE_MAX;
   };
 
   struct LogContext {
@@ -1834,6 +1850,7 @@ class DBImpl : public DB {
     bool need_log_dir_sync = false;
     log::Writer* writer = nullptr;
     LogFileNumberSize* log_file_number_size = nullptr;
+    uint64_t prev_size = SIZE_MAX;
   };
 
   // PurgeFileInfo is a structure to hold information of files to be deleted in
@@ -2344,7 +2361,7 @@ class DBImpl : public DB {
   void WALIOStatusCheck(const IOStatus& status);
 
   // Used by WriteImpl to update bg_error_ in case of memtable insert error.
-  void MemTableInsertStatusCheck(const Status& memtable_insert_status);
+  void HandleMemTableInsertFailure(const Status& nonok_memtable_insert_status);
 
   Status CompactFilesImpl(const CompactionOptions& compact_options,
                           ColumnFamilyData* cfd, Version* version,
diff --git a/db/db_impl/db_impl_write.cc b/db/db_impl/db_impl_write.cc
index 7051c970aad7..16a80a17bfa5 100644
--- a/db/db_impl/db_impl_write.cc
+++ b/db/db_impl/db_impl_write.cc
@@ -524,8 +524,11 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
           assert(tmp_s.ok());
         }
       }
-      versions_->SetLastSequence(last_sequence);
-      MemTableInsertStatusCheck(w.status);
+      if (w.status.ok()) {  // Don't publish a partial batch write
+        versions_->SetLastSequence(last_sequence);
+      } else {
+        HandleMemTableInsertFailure(w.status);
+      }
       write_thread_.ExitAsBatchGroupFollower(&w);
     }
     assert(w.state == WriteThread::STATE_COMPLETED);
@@ -690,6 +693,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
     if (!two_write_queues_) {
       if (status.ok() && !write_options.disableWAL) {
         assert(log_context.log_file_number_size);
+        log_context.prev_size = log_context.writer->file()->GetFileSize();
         LogFileNumberSize& log_file_number_size =
             *(log_context.log_file_number_size);
         PERF_TIMER_GUARD(write_wal_time);
@@ -873,9 +877,19 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
       }
       // Note: if we are to resume after non-OK statuses we need to revisit how
       // we react to non-OK statuses here.
-      versions_->SetLastSequence(last_sequence);
+      if (w.status.ok()) {  // Don't publish a partial batch write
+        versions_->SetLastSequence(last_sequence);
+      }
+    }
+    if (!w.status.ok()) {
+      if (log_context.prev_size < SIZE_MAX) {
+        InstrumentedMutexLock l(&log_write_mutex_);
+        if (logs_.back().number == log_context.log_file_number_size->number) {
+          logs_.back().SetAttemptTruncateSize(log_context.prev_size);
+        }
+      }
+      HandleMemTableInsertFailure(w.status);
     }
-    MemTableInsertStatusCheck(w.status);
     write_thread_.ExitAsBatchGroupLeader(write_group, status);
   }
 
@@ -1032,7 +1046,12 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options,
           &flush_scheduler_, &trim_history_scheduler_,
           write_options.ignore_missing_column_families, 0 /*log_number*/, this,
           false /*concurrent_memtable_writes*/, seq_per_batch_, batch_per_txn_);
-      versions_->SetLastSequence(memtable_write_group.last_sequence);
+      if (memtable_write_group.status
+              .ok()) {  // Don't publish a partial batch write
+        versions_->SetLastSequence(memtable_write_group.last_sequence);
+      } else {
+        HandleMemTableInsertFailure(memtable_write_group.status);
+      }
       write_thread_.ExitAsMemTableWriter(&w, memtable_write_group);
     }
   } else {
@@ -1061,8 +1080,11 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options,
     PERF_TIMER_START(write_pre_and_post_process_time);
 
     if (write_thread_.CompleteParallelMemTableWriter(&w)) {
-      MemTableInsertStatusCheck(w.status);
-      versions_->SetLastSequence(w.write_group->last_sequence);
+      if (w.status.ok()) {  // Don't publish a partial batch write
+        versions_->SetLastSequence(w.write_group->last_sequence);
+      } else {
+        HandleMemTableInsertFailure(w.status);
+      }
       write_thread_.ExitAsMemTableWriter(&w, *w.write_group);
     }
   }
@@ -1386,18 +1408,16 @@ void DBImpl::WALIOStatusCheck(const IOStatus& io_status) {
   }
 }
 
-void DBImpl::MemTableInsertStatusCheck(const Status& status) {
-  // A non-OK status here indicates that the state implied by the
-  // WAL has diverged from the in-memory state.  This could be
-  // because of a corrupt write_batch (very bad), or because the
-  // client specified an invalid column family and didn't specify
-  // ignore_missing_column_families.
-  if (!status.ok()) {
-    mutex_.Lock();
-    assert(!error_handler_.IsBGWorkStopped());
-    error_handler_.SetBGError(status, BackgroundErrorReason::kMemTable);
-    mutex_.Unlock();
-  }
+void DBImpl::HandleMemTableInsertFailure(const Status& status) {
+  assert(!status.ok());
+  // A non-OK status on memtable insert indicates that the state implied by the
+  // WAL has diverged from the in-memory state.  This could be because of a
+  // corrupt write_batch (very bad), or because the client specified an invalid
+  // column family and didn't specify ignore_missing_column_families.
+  mutex_.Lock();
+  assert(!error_handler_.IsBGWorkStopped());
+  error_handler_.SetBGError(status, BackgroundErrorReason::kMemTable);
+  mutex_.Unlock();
 }
 
 Status DBImpl::PreprocessWrite(const WriteOptions& write_options,
@@ -1598,6 +1618,7 @@ IOStatus DBImpl::WriteToWAL(const WriteBatch& merged_batch,
   }
   if (log_used != nullptr) {
     *log_used = logfile_number_;
+    assert(*log_used == log_file_number_size.number);
   }
   total_log_size_ += log_entry.size();
   log_file_number_size.AddSize(*log_size);
@@ -1815,11 +1836,15 @@ Status DBImpl::WriteRecoverableState() {
         0 /*recovery_log_number*/, this, false /* concurrent_memtable_writes */,
         &next_seq, &dont_care_bool, seq_per_batch_);
     auto last_seq = next_seq - 1;
-    if (two_write_queues_) {
-      versions_->FetchAddLastAllocatedSequence(last_seq - seq);
-      versions_->SetLastPublishedSequence(last_seq);
+    if (status.ok()) {  // Don't publish a partial batch write
+      if (two_write_queues_) {
+        versions_->FetchAddLastAllocatedSequence(last_seq - seq);
+        versions_->SetLastPublishedSequence(last_seq);
+      }
+      versions_->SetLastSequence(last_seq);
+    } else {
+      HandleMemTableInsertFailure(status);
     }
-    versions_->SetLastSequence(last_seq);
     if (two_write_queues_) {
       log_write_mutex_.Unlock();
     }
diff --git a/db/error_handler.cc b/db/error_handler.cc
index 24c555764f30..1e777fd42600 100644
--- a/db/error_handler.cc
+++ b/db/error_handler.cc
@@ -275,9 +275,6 @@ void ErrorHandler::HandleKnownErrors(const Status& bg_err,
     return;
   }
 
-  ROCKS_LOG_INFO(db_options_.info_log,
-                 "ErrorHandler: Set regular background error\n");
-
   bool paranoid = db_options_.paranoid_checks;
   Status::Severity sev = Status::Severity::kFatalError;
   Status new_bg_err;
@@ -335,12 +332,21 @@ void ErrorHandler::HandleKnownErrors(const Status& bg_err,
     if (!s.ok() && (s.severity() > bg_error_.severity())) {
       bg_error_ = s;
     } else {
+      ROCKS_LOG_INFO(db_options_.info_log,
+                     "ErrorHandler: Hit less severe background error\n");
+
       // This error is less severe than previously encountered error. Don't
       // take any further action
       return;
     }
   }
 
+  bool stop = bg_error_.severity() >= Status::Severity::kHardError;
+  ROCKS_LOG_INFO(
+      db_options_.info_log,
+      "ErrorHandler: Set regular background error, auto_recovery=%d, stop=%d\n",
+      int{auto_recovery}, int{stop});
+
   recover_context_ = context;
   if (auto_recovery) {
     recovery_in_prog_ = true;
@@ -351,7 +357,7 @@ void ErrorHandler::HandleKnownErrors(const Status& bg_err,
       RecoverFromNoSpace();
     }
   }
-  if (bg_error_.severity() >= Status::Severity::kHardError) {
+  if (stop) {
     is_db_stopped_.store(true, std::memory_order_release);
   }
 }
diff --git a/db/plain_table_db_test.cc b/db/plain_table_db_test.cc
index 27aa0e28d0c9..4645ae31b7c6 100644
--- a/db/plain_table_db_test.cc
+++ b/db/plain_table_db_test.cc
@@ -1339,11 +1339,7 @@ TEST_P(PlainTableDBTest, AdaptiveTable) {
 INSTANTIATE_TEST_CASE_P(PlainTableDBTest, PlainTableDBTest, ::testing::Bool());
 
 TEST_P(PlainTableDBTest, DeleteRangeNotSupported) {
-  // XXX: After attempting DeleteRange with PlainTable, Writes will permanently
-  // fail. Even if re-opening the DB, if WAL is used, the WAL is not recoverable
-  // (without manual intervention). Furthermore, a partial write batch can
-  // be exposed to readers, breaking WriteBatch atomicity.
-  for (bool use_write_batch : {/*false, */ true}) {
+  for (bool use_write_batch : {false, true}) {
     DestroyAndReopen();
 
     ASSERT_OK(Put("a0001111", "1"));
@@ -1362,12 +1358,7 @@ TEST_P(PlainTableDBTest, DeleteRangeNotSupported) {
     ASSERT_EQ(Get("a0001111"), "1");
     ASSERT_EQ(Get("b0001111"), "2");
     ASSERT_EQ(Get("c0001111"), "3");
-    if (use_write_batch) {
-      // XXX: broken WriteBatch atomicity
-      ASSERT_EQ(Get("d0001111"), "4");
-    } else {
-      ASSERT_EQ(Get("d0001111"), "NOT_FOUND");
-    }
+    ASSERT_EQ(Get("d0001111"), "NOT_FOUND");  // expect WriteBatch atomicity
     ASSERT_EQ(Get("e0001111"), "NOT_FOUND");
 
     ASSERT_EQ(Put("e0001111", "5").code(), Status::Code::kNotSupported);
@@ -1377,8 +1368,14 @@ TEST_P(PlainTableDBTest, DeleteRangeNotSupported) {
     ASSERT_EQ(dbfull()->TEST_FlushMemTable().code(),
               Status::Code::kNotSupported);
 
-    // XXX: WAL is not recoverable
-    ASSERT_EQ(TryReopen().code(), Status::Code::kNotSupported);
+    // WAL is recoverable (at least in standard configurations)
+    ASSERT_OK(TryReopen());
+
+    ASSERT_EQ(Get("a0001111"), "1");
+    ASSERT_EQ(Get("b0001111"), "2");
+    ASSERT_EQ(Get("c0001111"), "3");
+    ASSERT_EQ(Get("d0001111"), "NOT_FOUND");
+    ASSERT_EQ(Get("e0001111"), "NOT_FOUND");
   }
 }
 
diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h
index dab74b8ef934..067b718445da 100644
--- a/include/rocksdb/db.h
+++ b/include/rocksdb/db.h
@@ -616,7 +616,7 @@ class DB {
                        const Slice& /*key*/, const Slice& /*ts*/,
                        const Slice& /*value*/);
 
-  // Apply the specified updates to the database.
+  // Apply the specified updates atomically to the database.
   // If `updates` contains no update, WAL will still be synced if
   // options.sync=true.
   // Returns OK on success, non-OK on failure.

From 30e097e365bd00d15144f1e06632a8c8e4f54512 Mon Sep 17 00:00:00 2001
From: Hui Xiao <huixiao@fb.com>
Date: Wed, 2 Apr 2025 11:35:41 -0700
Subject: [PATCH 040/500] Disable 2pc TXN with WAL write injection in db
 stress; Re-enable track_and_verify_wal (#13508)

Summary:
**Context/Summary:**
Pessimistic transactions use 2PC and can't auto-recover from WAL write errors. This is because RocksDB cannot easily discard the corrupted WAL without risking the loss of uncommitted prepared data within the same WAL. Stress test does not support injecting errors that can' be auto-recovered for now. Therefore disabling WAL write error injection in stress tests to prevent crashing.

Previously track_and_verify_wal was disabled due to it caught those corrupted WAL. We can enable the feature now since there won't be such corrupted WAL.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13508

Test Plan:
- Previous failed command pass now
```
python3 tools/db_crashtest.py --simple blackbox --interval=15  --WAL_size_limit_MB=0 --WAL_ttl_seconds=60 --acquire_snapshot_one_in=100 --adaptive_readahead=0 --adm_policy=1 --advise_random_on_open=1 --allow_data_in_errors=True --allow_fallocate=0 --allow_setting_blob_options_dynamically=1 --allow_unprepared_value=1 --async_io=1 --auto_readahead_size=1 --avoid_flush_during_recovery=0 --avoid_flush_during_shutdown=0 --avoid_unnecessary_blocking_io=0 --backup_max_size=104857600 --backup_one_in=100000 --batch_protection_bytes_per_key=8 --bgerror_resume_retry_interval=100 --blob_cache_size=1048576 --blob_compaction_readahead_size=4194304 --blob_compression_type=snappy --blob_file_size=1073741824 --blob_file_starting_level=0 --blob_garbage_collection_age_cutoff=1.0 --blob_garbage_collection_force_threshold=1.0 --block_align=1 --block_protection_bytes_per_key=8 --block_size=16384 --bloom_before_level=2147483646 --bloom_bits=19 --bottommost_compression_type=none --bottommost_file_compaction_delay=86400 --bytes_per_sync=262144 --cache_index_and_filter_blocks=0 --cache_index_and_filter_blocks_with_high_priority=0 --cache_size=8388608 --cache_type=auto_hyper_clock_cache --charge_compression_dictionary_building_buffer=1 --charge_file_metadata=0 --charge_filter_construction=1 --charge_table_reader=0 --check_multiget_consistency=0 --check_multiget_entity_consistency=0 --checkpoint_one_in=0 --checksum_type=kxxHash --clear_column_family_one_in=0 --commit_bypass_memtable_one_in=0 --compact_files_one_in=1000000 --compact_range_one_in=1000000 --compaction_pri=1 --compaction_readahead_size=0 --compaction_style=0 --compaction_ttl=10 --compress_format_version=1 --compressed_secondary_cache_size=16777216 --compression_checksum=0 --compression_max_dict_buffer_bytes=0 --compression_max_dict_bytes=0 --compression_parallel_threads=1 --compression_type=none --compression_use_zstd_dict_trainer=1 --compression_zstd_max_train_bytes=0 --continuous_verification_interval=0 --create_timestamped_snapshot_one_in=0 --daily_offpeak_time_utc= --data_block_index_type=1 --db_write_buffer_size=0 --decouple_partitioned_filters=0 --default_temperature=kHot --default_write_temperature=kCold --delete_obsolete_files_period_micros=30000000 --delpercent=0 --delrangepercent=0 --destroy_db_initially=0 --detect_filter_construct_corruption=0 --disable_file_deletions_one_in=10000 --disable_manual_compaction_one_in=10000 --disable_wal=0 --dump_malloc_stats=1 --enable_blob_files=1 --enable_blob_garbage_collection=1 --enable_checksum_handoff=1 --enable_compaction_filter=0 --enable_custom_split_merge=1 --enable_do_not_compress_roles=1 --enable_index_compression=0 --enable_memtable_insert_with_hint_prefix_extractor=0 --enable_pipelined_write=0 --enable_sst_partitioner_factory=0 --enable_thread_tracking=0 --enable_write_thread_adaptive_yield=1 --error_recovery_with_no_fault_injection=1 --exclude_wal_from_write_fault_injection=0 --fail_if_options_file_error=1 --fifo_allow_compaction=0 --file_checksum_impl=none --file_temperature_age_thresholds= --fill_cache=0 --flush_one_in=1000000 --format_version=5 --get_all_column_family_metadata_one_in=10000 --get_current_wal_file_one_in=0 --get_live_files_apis_one_in=1000000 --get_properties_of_all_tables_one_in=100000 --get_property_one_in=100000 --get_sorted_wal_files_one_in=0 --hard_pending_compaction_bytes_limit=274877906944 --high_pri_pool_ratio=0.5 --index_block_restart_interval=12 --index_shortening=1 --index_type=0 --ingest_external_file_one_in=0 --initial_auto_readahead_size=16384 --inplace_update_support=0 --iterpercent=0 --key_len_percent_dist=1,30,69 --key_may_exist_one_in=100 --last_level_temperature=kUnknown --level_compaction_dynamic_level_bytes=0 --lock_wal_one_in=1000000 --log_file_time_to_roll=0 --log_readahead_size=0 --long_running_snapshots=1 --low_pri_pool_ratio=0.5 --lowest_used_cache_tier=0 --manifest_preallocation_size=0 --mark_for_compaction_one_file_in=10 --max_auto_readahead_size=0 --max_background_compactions=20 --max_bytes_for_level_base=10485760 --max_key=25000000 --max_key_len=3 --max_log_file_size=1048576 --max_manifest_file_size=1073741824 --max_sequential_skip_in_iterations=2 --max_total_wal_size=0 --max_write_batch_group_size_bytes=64 --max_write_buffer_number=10 --max_write_buffer_size_to_maintain=0 --memtable_insert_hint_per_batch=0 --memtable_max_range_deletions=0 --memtable_prefix_bloom_size_ratio=0.1 --memtable_protection_bytes_per_key=4 --memtable_whole_key_filtering=1 --memtablerep=skip_list --metadata_charge_policy=0 --metadata_read_fault_one_in=0 --metadata_write_fault_one_in=128 --min_blob_size=16 --min_write_buffer_number_to_merge=1 --mmap_read=0 --mock_direct_io=True --nooverwritepercent=1 --num_file_reads_for_auto_readahead=2 --open_files=-1 --open_metadata_read_fault_one_in=8 --open_metadata_write_fault_one_in=8 --open_read_fault_one_in=0 --open_write_fault_one_in=16 --ops_per_thread=100000000 --optimize_filters_for_hits=1 --optimize_filters_for_memory=0 --optimize_multiget_for_io=0 --paranoid_file_checks=1 --paranoid_memory_checks=0 --partition_filters=0 --partition_pinning=0 --pause_background_one_in=10000 --periodic_compaction_seconds=0 --prefix_size=1 --prefixpercent=0 --prepopulate_blob_cache=1 --prepopulate_block_cache=0 --preserve_internal_time_seconds=60 --progress_reports=0 --promote_l0_one_in=0 --read_amp_bytes_per_bit=0 --read_fault_one_in=1000 --readahead_size=524288 --readpercent=0 --recycle_log_file_num=0 --reopen=0 --report_bg_io_stats=1 --reset_stats_one_in=1000000 --sample_for_compression=5 --secondary_cache_fault_one_in=32 --secondary_cache_uri= --set_options_one_in=2000 --skip_stats_update_on_db_open=0 --snapshot_hold_ops=100000 --soft_pending_compaction_bytes_limit=68719476736 --sqfc_name=bar --sqfc_version=1 --sst_file_manager_bytes_per_sec=104857600 --sst_file_manager_bytes_per_truncate=1048576 --stats_dump_period_sec=10 --stats_history_buffer_size=1048576 --strict_bytes_per_sync=0 --subcompactions=1 --sync=0 --table_cache_numshardbits=6 --target_file_size_base=524288 --target_file_size_multiplier=2 --test_batches_snapshots=0 --test_ingest_standalone_range_deletion_one_in=0 --top_level_index_pinning=2 --track_and_verify_wals=1 --two_write_queues=0 --txn_write_policy=1 --uncache_aggressiveness=12 --universal_max_read_amp=-1 --unordered_write=0 --unpartitioned_pinning=3 --use_adaptive_mutex=0 --use_adaptive_mutex_lru=1 --use_attribute_group=1 --use_blob_cache=0 --use_delta_encoding=0 --use_direct_io_for_flush_and_compaction=1 --use_direct_reads=0 --use_full_merge_v1=0 --use_get_entity=0 --use_merge=0 --use_multi_cf_iterator=0 --use_multi_get_entity=1 --use_multiget=0 --use_optimistic_txn=0 --use_put_entity_one_in=0 --use_shared_block_and_blob_cache=1 --use_sqfc_for_range_queries=0 --use_timed_put_one_in=0 --use_txn=1 --use_write_buffer_manager=0 --user_timestamp_size=0 --value_size_mult=32 --verification_only=0 --verify_checksum=1 --verify_checksum_one_in=1000000 --verify_compression=0 --verify_db_one_in=100000 --verify_file_checksums_one_in=0 --verify_iterator_with_expected_state_one_in=5 --verify_sst_unique_id_in_manifest=1 --wal_bytes_per_sync=0 --wal_compression=zstd --write_buffer_size=1048576 --write_dbid_to_manifest=1 --write_fault_one_in=5 --write_identity_file=0 --writepercent=100
```
- Rehearsal stress test 10x of our normal run shows no relevant errors to track_and_verify_wal

Reviewed By: pdillinger

Differential Revision: D72191287

Pulled By: hx235

fbshipit-source-id: 08d3fd52645ad526aec34842215c68b3ef06a9c9
---
 tools/db_crashtest.py | 31 +++++++++++++++++++++----------
 1 file changed, 21 insertions(+), 10 deletions(-)

diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index aef6bc3b8906..21cfc850151b 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -343,9 +343,6 @@
     "universal_max_read_amp": lambda: random.choice([-1] * 3 + [0, 4, 10]),
     "paranoid_memory_checks": lambda: random.choice([0] * 7 + [1]),
     "allow_unprepared_value": lambda: random.choice([0, 1]),
-    # TODO(hx235): enable `track_and_verify_wals` again after resolving the issues
-    # it has with write fault injection and TXN
-    "track_and_verify_wals": 0,
     "enable_remote_compaction": lambda: random.choice([0, 1]),
     "auto_refresh_iterator_with_snapshot": lambda: random.choice([0, 1]),
 }
@@ -963,22 +960,36 @@ def finalize_and_sanitize(src_params):
         dest_params["check_multiget_consistency"] = 0
         dest_params["check_multiget_entity_consistency"] = 0
     if dest_params.get("disable_wal") == 0:
-        if dest_params.get("reopen") > 0 or (
-            dest_params.get("manual_wal_flush_one_in")
-            and dest_params.get("column_families") != 1
+        if (
+            dest_params.get("reopen") > 0
+            or (
+                dest_params.get("manual_wal_flush_one_in")
+                and dest_params.get("column_families") != 1
+            )
+            or (
+                dest_params.get("use_txn") != 0
+                and dest_params.get("use_optimistic_txn") == 0
+            )
         ):
-            # Reopen with WAL currently requires persisting WAL data before closing for reopen.
+            # 1. Reopen with WAL currently requires persisting WAL data before closing for reopen.
             # Previous injected WAL write errors may not be cleared by the time of closing and ready
             # for persisting WAL.
             # To simplify, we disable any WAL write error injection.
             # TODO(hx235): support WAL write error injection with reopen
-            # TODO(hx235): support excluding WAL from metadata write fault injection so we don't
-            # have to disable metadata write fault injection to other file
             #
-            # WAL write failure can drop buffered WAL data. This can cause
+            # 2. WAL write failure can drop buffered WAL data. This can cause
             # inconsistency when one CF has a successful flush during auto
             # recovery. Disable the fault injection in this path for now until
             # we have a fix that allows auto recovery.
+            #
+            # 3. Pessimistic transactions use 2PC, which can't auto-recover from WAL write errors.
+            # This is because RocksDB cannot easily discard the corrupted WAL without risking the
+            # loss of uncommitted prepared data within the same WAL.
+            # Therefore disabling WAL write error injection in stress tests to prevent crashing
+            # since stress test does not support injecting errors that can' be auto-recovered.
+            #
+            # TODO(hx235): support excluding WAL from metadata write fault injection so we don't
+            # have to disable metadata write fault injection to other file
             dest_params["exclude_wal_from_write_fault_injection"] = 1
             dest_params["metadata_write_fault_one_in"] = 0
     # Enabling block_align with compression is not supported

From 5735ff4e0322fe82e1201b4df18afae445365c42 Mon Sep 17 00:00:00 2001
From: Hui Xiao <huixiao@fb.com>
Date: Wed, 2 Apr 2025 15:46:02 -0700
Subject: [PATCH 041/500] Update window build cmake to download newer snappy
 version and Java cmake_minimum_required (#13514)

Summary:
**Context/Summary:**

- This is an attempt to fix our [build-window-vs2022 failure](https://github.com/facebook/rocksdb/actions/runs/14215681026/job/39831770554?fbclid=IwZXh0bgNhZW0CMTAAAR2BQLjp8kC1u1yyvN1_S5qwmrHEZOfzxJdcbj2vq7mvwwq83n1cbkmiBCA_aem_ygYxQA5EUmxh2y4EjMlTfg) below. snappy-1.1.8's cmake_minimum_required  being less than 3.5 seems to trigger the complaint. Hopefully downloading the 1.2.2 which is the [first version starting to use higher cmake_minimum_required version](https://github.com/google/snappy/releases/tag/1.2.2) solves the failure.

```
    Directory: D:\a\rocksdb\rocksdb\thirdparty\snappy-1.1.8

Mode                 LastWriteTime         Length Name
----                 -------------         ------ ----
d----            4/2/2025  9:02 AM                build
CMake Error at CMakeLists.txt:29 (cmake_minimum_required):
  Compatibility with CMake < 3.5 has been removed from CMake.

  Update the VERSION argument <min> value.  Or, use the <min>...<max> syntax
  to tell CMake that the project requires at least <min> but has been updated
  to work with policies introduced by <max> or earlier.

  Or, add -DCMAKE_POLICY_VERSION_MINIMUM=3.5 to try configuring anyway.
 ```
- The downloaded snappy do not include the content under nested repos Google Test and Google Benchmark. But snappy cmake by default will attempt to build them. Since we don't change snappy, we don't need building such development suit. This PR also disabled snappy cmake's attempt to build them.

- By running above changes, the same build [complained](https://github.com/facebook/rocksdb/actions/runs/14228883966/job/39874927730?pr=13514) about java cmakelists requiring too low cmake_minimum_required as well.  So this PR also upgraded its cmake_minimum_required to be 3.11 aligning with its warning message
```
if(${CMAKE_VERSION} VERSION_LESS "3.11.4")
    message("Please consider switching to CMake 3.11.4 or newer")
endif()
```

**Test plan**
Monitor build-window-vs2022 for this PR

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13514

Reviewed By: pdillinger

Differential Revision: D72333581

Pulled By: hx235

fbshipit-source-id: 1a9096738d39c8b1d270fe17fbd78c1ea4c4c45e
---
 .github/actions/windows-build-steps/action.yml | 14 +++++++-------
 java/CMakeLists.txt                            |  2 +-
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/.github/actions/windows-build-steps/action.yml b/.github/actions/windows-build-steps/action.yml
index 9213f2e828fc..dc535a477415 100644
--- a/.github/actions/windows-build-steps/action.yml
+++ b/.github/actions/windows-build-steps/action.yml
@@ -11,9 +11,9 @@ runs:
       CMAKE_BIN: C:/Program Files/CMake/bin/cmake.exe
       CTEST_BIN: C:/Program Files/CMake/bin/ctest.exe
       JAVA_HOME: C:/Program Files/BellSoft/LibericaJDK-8
-      SNAPPY_HOME: ${{ github.workspace }}/thirdparty/snappy-1.1.8
-      SNAPPY_INCLUDE: ${{ github.workspace }}/thirdparty/snappy-1.1.8;${{ github.workspace }}/thirdparty/snappy-1.1.8/build
-      SNAPPY_LIB_DEBUG: ${{ github.workspace }}/thirdparty/snappy-1.1.8/build/Debug/snappy.lib
+      SNAPPY_HOME: ${{ github.workspace }}/thirdparty/snappy-1.2.2
+      SNAPPY_INCLUDE: ${{ github.workspace }}/thirdparty/snappy-1.2.2;${{ github.workspace }}/thirdparty/snappy-1.2.2/build
+      SNAPPY_LIB_DEBUG: ${{ github.workspace }}/thirdparty/snappy-1.2.2/build/Debug/snappy.lib
     run: |-
       # NOTE: if ... Exit $LASTEXITCODE lines needed to exit and report failure
       echo ===================== Install Dependencies =====================
@@ -22,14 +22,14 @@ runs:
       mkdir $Env:THIRDPARTY_HOME
       cd $Env:THIRDPARTY_HOME
       echo "Building Snappy dependency..."
-      curl -Lo snappy-1.1.8.zip https://github.com/google/snappy/archive/refs/tags/1.1.8.zip
+      curl -Lo snappy-1.2.2.zip https://github.com/google/snappy/archive/refs/tags/1.2.2.zip
       if(!$?) { Exit $LASTEXITCODE }
-      unzip -q snappy-1.1.8.zip
+      unzip -q snappy-1.2.2.zip
       if(!$?) { Exit $LASTEXITCODE }
-      cd snappy-1.1.8
+      cd snappy-1.2.2
       mkdir build
       cd build
-      & cmake -G "$Env:CMAKE_GENERATOR" ..
+      & cmake -G "$Env:CMAKE_GENERATOR" .. -DSNAPPY_BUILD_TESTS=OFF -DSNAPPY_BUILD_BENCHMARKS=OFF
       if(!$?) { Exit $LASTEXITCODE }
       msbuild Snappy.sln -maxCpuCount -property:Configuration=Debug -property:Platform=x64
       if(!$?) { Exit $LASTEXITCODE }
diff --git a/java/CMakeLists.txt b/java/CMakeLists.txt
index a60847ead37d..ffc374102699 100644
--- a/java/CMakeLists.txt
+++ b/java/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.4)
+cmake_minimum_required(VERSION 3.11)
 
 set(JAVA_JUNIT_VERSION "4.13.1")
 set(JAVA_HAMCR_VERSION "2.2")

From 24e2b05e6179fda5770f3a4203fa7cdfda7fd936 Mon Sep 17 00:00:00 2001
From: anand76 <anand1976@users.noreply.github.com>
Date: Wed, 2 Apr 2025 16:07:56 -0700
Subject: [PATCH 042/500] Multi scan API (#13473)

Summary:
A multi scan API for users to pass a set of scan ranges and have the table readers determine the optimal strategy for performing the scans. This might include coalescing of IOs across scans, for example. The requested scans should be in increasing key order. The scan start keys and other info is passed to NewMultiScanIterator, which in turn uses the newly added Prepare() interface in Iterator to update the iterator. The Prepare() takes a vector of ScanOptions, which contain the start keys and optional upper bounds, as well as user defined parameters in the property_bag taht are passed through as is to external table readers.

The initial implementation plumbs this through to the ExternalTableReader. This PR also fixes an issue of premature destruction of the external table iterator after the first scan of the multi-scan. The `LevelIterator` treats an invalid iterator as a potential end of file and destroys the table iterator in order to move to the next file. To prevent that, this PR defines the `NextAndGetResult` interface that the external table iterator must implement. The result returned by `NextAndGetResult` differentiates between iterator invalidation due to out of bound vs end of file.

Eventually, I envision the `MultiScanIterator` to be built on top of a producer-consumer queue like container, with RocksDB (producer) enqueueing keys and values into the container and the application (consumer) dequeueing them. Unlike a traditional producer consumer queue, there is no concurrency here. The results will be buffered in the container, and when the buffer is empty a new batch will be read from the child iterators. This will allow the virtual function call overhead to be amortized over many entries.

TODO (in future PRs):
1. Update the internal implementation of Prepare to trim the ScanOptions range based on the intersection with the table key range, taking into consideration unbounded scans and opaque user defined bounds.
2. Long term, take advantage of Prepare in BlockBasedTableIterator, atleast for the upper bound case.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13473

Reviewed By: pdillinger

Differential Revision: D71447559

Pulled By: anand1976

fbshipit-source-id: 31668abb0c529aa1ac1738ae46c36cbddf9148f1
---
 db/arena_wrapped_db_iter.h               |   4 +
 db/db_impl/db_impl.cc                    |  10 +
 db/db_impl/db_impl.h                     |   5 +
 db/db_iter.h                             |  12 +
 db/version_set.cc                        |  16 +-
 include/rocksdb/advanced_iterator.h      |  36 +++
 include/rocksdb/db.h                     |  13 ++
 include/rocksdb/external_table.h         |  49 +++-
 include/rocksdb/iterator.h               |  13 ++
 include/rocksdb/multi_scan.h             | 223 ++++++++++++++++++
 include/rocksdb/options.h                |  83 +++----
 include/rocksdb/utilities/stackable_db.h |   7 +
 table/external_table.cc                  |  73 ++++--
 table/internal_iterator.h                |  16 +-
 table/iterator_wrapper.h                 |   8 +
 table/merging_iterator.cc                |   6 +
 table/table_test.cc                      | 278 +++++++++++++++++++++--
 17 files changed, 759 insertions(+), 93 deletions(-)
 create mode 100644 include/rocksdb/advanced_iterator.h
 create mode 100644 include/rocksdb/multi_scan.h

diff --git a/db/arena_wrapped_db_iter.h b/db/arena_wrapped_db_iter.h
index 801988bfca7b..531fd5bca4a7 100644
--- a/db/arena_wrapped_db_iter.h
+++ b/db/arena_wrapped_db_iter.h
@@ -99,6 +99,10 @@ class ArenaWrappedDBIter : public Iterator {
 
   bool PrepareValue() override { return db_iter_->PrepareValue(); }
 
+  void Prepare(const std::vector<ScanOptions>& scan_opts) override {
+    db_iter_->Prepare(scan_opts);
+  }
+
   void Init(Env* env, const ReadOptions& read_options,
             const ImmutableOptions& ioptions,
             const MutableCFOptions& mutable_cf_options, const Version* version,
diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc
index fa33bd62d54d..8aa21a719367 100644
--- a/db/db_impl/db_impl.cc
+++ b/db/db_impl/db_impl.cc
@@ -3796,6 +3796,16 @@ bool DBImpl::KeyMayExist(const ReadOptions& read_options,
   return s.ok() || s.IsIncomplete();
 }
 
+std::unique_ptr<MultiScan> DBImpl::NewMultiScan(
+    const ReadOptions& _read_options, ColumnFamilyHandle* column_family,
+    const std::vector<ScanOptions>& scan_opts) {
+  std::unique_ptr<Iterator> iter(NewIterator(_read_options, column_family));
+  iter->Prepare(scan_opts);
+  std::unique_ptr<MultiScan> ms_iter =
+      std::make_unique<MultiScan>(scan_opts, std::move(iter));
+  return ms_iter;
+}
+
 Iterator* DBImpl::NewIterator(const ReadOptions& _read_options,
                               ColumnFamilyHandle* column_family) {
   if (_read_options.io_activity != Env::IOActivity::kUnknown &&
diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h
index 80f46ebd2c92..57fa0a5e838b 100644
--- a/db/db_impl/db_impl.h
+++ b/db/db_impl/db_impl.h
@@ -379,6 +379,11 @@ class DBImpl : public DB {
                       const std::vector<ColumnFamilyHandle*>& column_families,
                       std::vector<Iterator*>* iterators) override;
 
+  using DB::NewMultiScan;
+  std::unique_ptr<MultiScan> NewMultiScan(
+      const ReadOptions& _read_options, ColumnFamilyHandle* column_family,
+      const std::vector<ScanOptions>& scan_opts) override;
+
   const Snapshot* GetSnapshot() override;
   void ReleaseSnapshot(const Snapshot* snapshot) override;
 
diff --git a/db/db_iter.h b/db/db_iter.h
index 084ed80d41a0..0f8074151507 100644
--- a/db/db_iter.h
+++ b/db/db_iter.h
@@ -220,6 +220,17 @@ class DBIter final : public Iterator {
 
   bool PrepareValue() override;
 
+  void Prepare(const std::vector<ScanOptions>& scan_opts) override {
+    std::optional<std::vector<ScanOptions>> new_scan_opts;
+    new_scan_opts.emplace(scan_opts);
+    scan_opts_.swap(new_scan_opts);
+    if (!scan_opts.empty()) {
+      iter_.Prepare(&scan_opts_.value());
+    } else {
+      iter_.Prepare(nullptr);
+    }
+  }
+
  private:
   class BlobReader {
    public:
@@ -455,6 +466,7 @@ class DBIter final : public Iterator {
   const Slice* const timestamp_lb_;
   const size_t timestamp_size_;
   std::string saved_timestamp_;
+  std::optional<std::vector<ScanOptions>> scan_opts_;
 };
 
 // Return a new iterator that converts internal keys (yielded by
diff --git a/db/version_set.cc b/db/version_set.cc
index 2daf5b957390..537287577e41 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -1002,7 +1002,8 @@ class LevelIterator final : public InternalIterator {
         skip_filters_(skip_filters),
         allow_unprepared_value_(allow_unprepared_value),
         is_next_read_sequential_(false),
-        to_return_sentinel_(false) {
+        to_return_sentinel_(false),
+        scan_opts_(nullptr) {
     // Empty level is not supported.
     assert(flevel_ != nullptr && flevel_->num_files > 0);
     if (range_tombstone_iter_ptr_) {
@@ -1098,6 +1099,13 @@ class LevelIterator final : public InternalIterator {
     read_seq_ = read_seq;
   }
 
+  void Prepare(const std::vector<ScanOptions>* scan_opts) override {
+    scan_opts_ = scan_opts;
+    if (file_iter_.iter()) {
+      file_iter_.Prepare(scan_opts_);
+    }
+  }
+
  private:
   // Return true if at least one invalid file is seen and skipped.
   bool SkipEmptyFileForward();
@@ -1223,6 +1231,7 @@ class LevelIterator final : public InternalIterator {
   bool prefix_exhausted_ = false;
   // Whether next/prev key is a sentinel key.
   bool to_return_sentinel_ = false;
+  const std::vector<ScanOptions>* scan_opts_;
 
   // Sets flags for if we should return the sentinel key next.
   // The condition for returning sentinel is reaching the end of current
@@ -1533,6 +1542,11 @@ void LevelIterator::SetFileIterator(InternalIterator* iter) {
   }
 
   InternalIterator* old_iter = file_iter_.Set(iter);
+  // Since this is a new table iterator, no need to call Prepare() if
+  // scan_opts_ is null
+  if (iter && scan_opts_) {
+    file_iter_.Prepare(scan_opts_);
+  }
 
   // Update the read pattern for PrefetchBuffer.
   if (is_next_read_sequential_) {
diff --git a/include/rocksdb/advanced_iterator.h b/include/rocksdb/advanced_iterator.h
new file mode 100644
index 000000000000..abab5aeb4574
--- /dev/null
+++ b/include/rocksdb/advanced_iterator.h
@@ -0,0 +1,36 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/slice.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+enum class IterBoundCheck : char {
+  kUnknown = 0,
+  kOutOfBound,
+  kInbound,
+};
+
+// This structure encapsulates the result of NextAndGetResult()
+struct IterateResult {
+  // The lifetime of key is guaranteed until Next()/NextAndGetResult() is
+  // called.
+  Slice key;
+  // If the iterator becomes invalid after a NextAndGetResult(), the table
+  // iterator should set this to indicate whether it became invalid due
+  // to the next key being out of bound (kOutOfBound) or it reached end
+  // of file (kUnknown). If the iiterator is still valid, this should
+  // be set to kInbound.
+  IterBoundCheck bound_check_result = IterBoundCheck::kUnknown;
+  // If false, PrepareValue() needs to be called before value()
+  // This is useful if the table reader wants to materialize the value in a
+  // lazy manner. In that case, it can set this to false and RocksDB
+  // guarantees that it'll call PrepareValue() before calling value().
+  bool value_prepared = true;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h
index 067b718445da..2727bbcb2290 100644
--- a/include/rocksdb/db.h
+++ b/include/rocksdb/db.h
@@ -22,6 +22,7 @@
 #include "rocksdb/iterator.h"
 #include "rocksdb/listener.h"
 #include "rocksdb/metadata.h"
+#include "rocksdb/multi_scan.h"
 #include "rocksdb/options.h"
 #include "rocksdb/snapshot.h"
 #include "rocksdb/sst_file_writer.h"
@@ -1073,6 +1074,18 @@ class DB {
       const ReadOptions& options,
       const std::vector<ColumnFamilyHandle*>& column_families) = 0;
 
+  // Get an iterator that scans multiple key ranges. The scan ranges should
+  // be in increasing order of start key. See multi_scan_iterator.h for more
+  // details.
+  virtual std::unique_ptr<MultiScan> NewMultiScan(
+      const ReadOptions& /*options*/, ColumnFamilyHandle* /*column_family*/,
+      const std::vector<ScanOptions>& /*scan_opts*/) {
+    std::unique_ptr<Iterator> iter(NewErrorIterator(Status::NotSupported()));
+    std::unique_ptr<MultiScan> ms_iter =
+        std::make_unique<MultiScan>(std::move(iter));
+    return ms_iter;
+  }
+
   // Return a handle to the current DB state.  Iterators created with
   // this handle will all observe a stable snapshot of the current DB
   // state.  The caller must call ReleaseSnapshot(result) when the
diff --git a/include/rocksdb/external_table.h b/include/rocksdb/external_table.h
index 87f4b9e6ba0d..d449532143cb 100644
--- a/include/rocksdb/external_table.h
+++ b/include/rocksdb/external_table.h
@@ -5,9 +5,10 @@
 
 #pragma once
 
+#include "rocksdb/advanced_iterator.h"
 #include "rocksdb/customizable.h"
 #include "rocksdb/file_checksum.h"
-#include "rocksdb/iterator.h"
+#include "rocksdb/iterator_base.h"
 #include "rocksdb/options.h"
 #include "rocksdb/status.h"
 
@@ -58,6 +59,47 @@ class ExternalTableFactory;
 // the external table implementation.
 // TODO: Specify which options are relevant
 
+class ExternalTableIterator : public IteratorBase {
+ public:
+  virtual ~ExternalTableIterator() {}
+
+  // This can optionally be called to prepare the iterator for a series
+  // of scans. The scan_opts parameter specifies the order of scans to
+  // follow, as well as the limits for those scans. After calling this,
+  // the caller will Seek() the iterator to successive start keys in scan_opts.
+  //
+  // If Prepare() is called again with a different scan_opts pointer, it
+  // means the iterator will be reused for a new multi scan. If scan_opts
+  // is null, then the previous Prepare() can be discarded.
+  //
+  // The caller guarantees the lifetime of scan_opts until its either cleared
+  // or replaced by another Prepare().
+  // TODO: Update the contract to trim the scan_opts range to only include
+  // scans that potentially intersect the file key range.
+  //
+  // If the sequence of Seeks is interrupted by seeking to some other target
+  // key, then the iterator is free to discard anything done during Prepare.
+  virtual void Prepare(const ScanOptions scan_opts[], size_t num_opts) = 0;
+
+  // Similar to Next(), except it also fills the result and returns whether
+  // the iterator is on a valid key or not
+  virtual bool NextAndGetResult(IterateResult* result) = 0;
+
+  // Prepares the value if its lazily materialized. The implementation can
+  // request that this be called by setting value_prepared to false in
+  // IterateResult. Next() should always implicitly materialize the
+  // value.
+  virtual bool PrepareValue() = 0;
+
+  // Return the current key's value
+  virtual Slice value() const = 0;
+
+  // Return the current position bounds check result - kInbound if the
+  // position is a valid key, kOutOfBound if the key is out of bound (i.e
+  // scan has terminated), or kUnknown if end of file.
+  virtual IterBoundCheck UpperBoundCheckResult() = 0;
+};
+
 class ExternalTableReader {
  public:
   virtual ~ExternalTableReader() {}
@@ -65,8 +107,9 @@ class ExternalTableReader {
   // Return an Iterator that can be used to scan the table file.
   // The read_options can optionally contain the upper bound
   // key (exclusive) of the scan in iterate_upper_bound.
-  virtual Iterator* NewIterator(const ReadOptions& read_options,
-                                const SliceTransform* prefix_extractor) = 0;
+  virtual ExternalTableIterator* NewIterator(
+      const ReadOptions& read_options,
+      const SliceTransform* prefix_extractor) = 0;
 
   // Point lookup the given key and return its value
   virtual Status Get(const ReadOptions& read_options, const Slice& key,
diff --git a/include/rocksdb/iterator.h b/include/rocksdb/iterator.h
index 51bead99b907..0a36cb2f8559 100644
--- a/include/rocksdb/iterator.h
+++ b/include/rocksdb/iterator.h
@@ -21,6 +21,7 @@
 #include <string>
 
 #include "rocksdb/iterator_base.h"
+#include "rocksdb/options.h"
 #include "rocksdb/wide_columns.h"
 
 namespace ROCKSDB_NAMESPACE {
@@ -93,6 +94,18 @@ class Iterator : public IteratorBase {
     assert(false);
     return Slice();
   }
+
+  // RocksDB Internal - DO NOT USE
+  // Prepare the iterator to scan the ranges specified in scan_opts. The
+  // upper bound and other table specific limits may be specified. This will
+  // typically be followed by Seeks to the start keys in the order they're
+  // specified in scan_opts. If the user does a Seek to some other target key,
+  // the iterator should disregard the scan_opts from that point onwards and
+  // behave like a normal iterator. Its the user's responsibility to again
+  // call Prepare().
+  // If Prepare() is called, it overrides the iterate_upper_bound in
+  // ReadOptions
+  virtual void Prepare(const std::vector<ScanOptions>& /*scan_opts*/) {}
 };
 
 // Return an empty iterator (yields nothing).
diff --git a/include/rocksdb/multi_scan.h b/include/rocksdb/multi_scan.h
new file mode 100644
index 000000000000..73b6d766e21a
--- /dev/null
+++ b/include/rocksdb/multi_scan.h
@@ -0,0 +1,223 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// EXPERIMENTAL
+//
+// An iterator that returns results from multiple scan ranges. The ranges are
+// expected to be in increasing sorted order.
+// The results are returned in nested container objects that can be iterated
+// using an std::input_iterator.
+//
+// MultiScan
+//     |
+//     ---
+//       |
+//  MultiScanIterator  <-- std::input_iterator (returns a Scan object for each
+//         |                                    scan range)
+//         ---
+//           |
+//          Scan
+//            |
+//            ---
+//              |
+//          ScanIterator <-- std::input_iterator (returns the KVs of a single
+//                                                scan range)
+//
+// The application on top of RocksDB
+// would use this as follows -
+//
+//  std::vector<ScanOptions> scans{{.start = Slice("bar")},
+//                              {.start = Slice("foo")}};
+//  std::unique_ptr<MultiScanIterator> iter.reset(
+//                                      db->NewMultiScanIterator());
+//  try {
+//    for (auto scan : *iter) {
+//      for (auto it : scan) {
+//        // Do something with key - it.first
+//        // Do something with value - it.second
+//      }
+//    }
+//  } catch (Status s) {
+//  }
+
+// A container object encapsulating a single scan range. It supports an
+// std::input_iterator for a single pass iteration of the KVs in the range.
+// A Status exception is thrown if there is an error in scanning the range.
+class Scan {
+ public:
+  class ScanIterator;
+
+  Scan(Iterator* db_iter) : db_iter_(db_iter) {}
+
+  ScanIterator begin() { return ScanIterator(db_iter_); }
+
+  std::nullptr_t end() { return nullptr; }
+
+  class ScanIterator {
+   public:
+    using self_type = ScanIterator;
+    using value_type = std::pair<Slice, Slice>;
+    using reference = std::pair<Slice, Slice>&;
+    using pointer = std::pair<Slice, Slice>*;
+    using difference_type = int;
+    using iterator_category = std::input_iterator_tag;
+
+    explicit ScanIterator(Iterator* db_iter) : db_iter_(db_iter) {
+      valid_ = db_iter_->Valid();
+      if (valid_) {
+        result_ = value_type(db_iter_->key(), db_iter_->value());
+      }
+    }
+
+    ScanIterator() : db_iter_(nullptr), valid_(false) {}
+
+    ~ScanIterator() { assert(status_.ok()); }
+
+    ScanIterator& operator++() {
+      if (!valid_) {
+        throw Status::InvalidArgument("Trying to advance invalid iterator");
+      } else {
+        db_iter_->Next();
+        status_ = db_iter_->status();
+        if (!status_.ok()) {
+          throw status_;
+        } else {
+          valid_ = db_iter_->Valid();
+          if (valid_) {
+            result_ = value_type(db_iter_->key(), db_iter_->value());
+          }
+        }
+      }
+      return *this;
+    }
+
+    bool operator==(std::nullptr_t /*other*/) const { return !valid_; }
+
+    bool operator!=(std::nullptr_t /*other*/) const { return valid_; }
+
+    reference operator*() {
+      if (!valid_) {
+        throw Status::InvalidArgument("Trying to deref invalid iterator");
+      }
+      return result_;
+    }
+    reference operator->() {
+      if (!valid_) {
+        throw Status::InvalidArgument("Trying to deref invalid iterator");
+      }
+      return result_;
+    }
+
+   private:
+    Iterator* db_iter_;
+    bool valid_;
+    Status status_;
+    value_type result_;
+  };
+
+ private:
+  Iterator* db_iter_;
+};
+
+// A container object encapsulating the scan ranges for a multi scan.
+// It supports an std::input_iterator for a single pass iteration of the
+// ScanOptions in scan_opts, which can be dereferenced to get the container
+// (Scan) for a single range.
+// A Status exception is thrown if there is an error.
+class MultiScan {
+ public:
+  MultiScan(const std::vector<ScanOptions>& scan_opts,
+            std::unique_ptr<Iterator>&& db_iter)
+      : scan_opts_(scan_opts), db_iter_(std::move(db_iter)) {}
+
+  explicit MultiScan(std::unique_ptr<Iterator>&& db_iter)
+      : db_iter_(std::move(db_iter)) {}
+
+  class MultiScanIterator {
+   public:
+    MultiScanIterator(MultiScanIterator&) = delete;
+    MultiScanIterator operator=(MultiScanIterator&) = delete;
+
+    using self_type = MultiScanIterator;
+    using value_type = Scan;
+    using reference = Scan&;
+    using pointer = Scan*;
+    using difference_type = int;
+    using iterator_category = std::input_iterator_tag;
+
+    MultiScanIterator(const std::vector<ScanOptions>& scan_opts,
+                      Iterator* db_iter)
+        : scan_opts_(scan_opts), idx_(0), db_iter_(db_iter), scan_(db_iter_) {
+      if (scan_opts_.empty()) {
+        throw Status::InvalidArgument("Zero scans in multi-scan");
+      }
+      db_iter_->Seek(*scan_opts_[idx_].range.start);
+      status_ = db_iter_->status();
+      if (!status_.ok()) {
+        throw status_;
+      }
+    }
+
+    MultiScanIterator(const std::vector<ScanOptions>& scan_opts)
+        : scan_opts_(scan_opts),
+          idx_(scan_opts_.size()),
+          db_iter_(nullptr),
+          scan_(nullptr) {}
+
+    ~MultiScanIterator() { assert(status_.ok()); }
+
+    MultiScanIterator& operator++() {
+      if (idx_ >= scan_opts_.size()) {
+        throw Status::InvalidArgument("Index out of range");
+      }
+      idx_++;
+      if (idx_ < scan_opts_.size()) {
+        db_iter_->Seek(*scan_opts_[idx_].range.start);
+        status_ = db_iter_->status();
+        if (!status_.ok()) {
+          throw status_;
+        }
+      }
+      return *this;
+    }
+
+    bool operator==(std::nullptr_t /*other*/) const {
+      return idx_ >= scan_opts_.size();
+    }
+
+    bool operator!=(std::nullptr_t /*other*/) const {
+      return idx_ < scan_opts_.size();
+    }
+
+    reference operator*() { return scan_; }
+    reference operator->() { return scan_; }
+
+   private:
+    const std::vector<ScanOptions>& scan_opts_;
+    size_t idx_;
+    Iterator* db_iter_;
+    Status status_;
+    Scan scan_;
+  };
+
+  MultiScanIterator begin() {
+    return MultiScanIterator(scan_opts_, db_iter_.get());
+  }
+
+  std::nullptr_t end() { return nullptr; }
+
+ private:
+  const std::vector<ScanOptions> scan_opts_;
+  std::unique_ptr<Iterator> db_iter_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index 181e7781b67c..34f2a8b14ca0 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -1703,6 +1703,50 @@ enum ReadTier {
   kMemtableTier = 0x3     // data in memtable. used for memtable-only iterators.
 };
 
+// A range of keys. In case of user_defined timestamp, if enabled, `start` and
+// `limit` should point to key without timestamp part.
+struct Range {
+  Slice start;
+  Slice limit;
+
+  Range() {}
+  Range(const Slice& s, const Slice& l) : start(s), limit(l) {}
+};
+
+// A key range with optional endpoints. In case of user_defined timestamp, if
+// enabled, `start` and `limit` should point to key without timestamp part.
+struct RangeOpt {
+  // When start.has_value() == false, refers to starting before every key
+  OptSlice start;
+  // When limit.has_value() == false, refers to ending after every key
+  OptSlice limit;
+
+  RangeOpt() {}
+  RangeOpt(const OptSlice& s, const OptSlice& l) : start(s), limit(l) {}
+};
+
+// EXPERIMENTAL
+//
+// Options for a RocksDB scan request. Only forward scans for now.
+// We may add other options such as prefix scan in the future.
+struct ScanOptions {
+  // The scan range. Mandatory for start to be set, limit is optional
+  RangeOpt range;
+
+  // A map of name,value pairs that can be passed by the user to an
+  // external table reader. This is completely opaque to RocksDB and is
+  // ignored by the natively supported table readers like block based and plain
+  // table. This is only useful for Iterator.
+  std::optional<std::unordered_map<std::string, std::string>> property_bag;
+
+  // An unbounded scan with a start key
+  ScanOptions(const Slice& _start) : range(_start, OptSlice()) {}
+
+  // A bounded scan with a start key and upper bound
+  ScanOptions(const Slice& _start, const Slice& _upper_bound)
+      : range(_start, _upper_bound) {}
+};
+
 // Options that control read operations
 struct ReadOptions {
   // *** BEGIN options relevant to point lookups as well as scans ***
@@ -2000,22 +2044,6 @@ struct ReadOptions {
   // EXPERIMENTAL
   Env::IOActivity io_activity = Env::IOActivity::kUnknown;
 
-  // EXPERIMENTAL
-  // An optional weight of values to be returned by a scan. Once the
-  // weight is reached or exceeded the scan is terminated (i.e Next()
-  // invalidates the iterator). In the case of a DB with one of the built-in
-  // table formats, such as BlockBasedTable, the weight is simply the number
-  // of key-value pairs. In the case of an ExternalTableReader, the weight is
-  // passed through to the table reader and the interpretation is upto the
-  // reader implementation.
-  uint64_t weight = 0;
-
-  // A map of name,value pairs that can be passed by the user to an
-  // external table reader. This is completely opaque to RocksDB and is
-  // ignored by the natively supported table readers like block based and plain
-  // table. This is only useful for Iterator.
-  std::optional<std::unordered_map<std::string, std::string>> property_bag;
-
   // *** END options for RocksDB internal use only ***
 
   ReadOptions() {}
@@ -2249,29 +2277,6 @@ struct CompactRangeOptions {
   double blob_garbage_collection_age_cutoff = -1;
 };
 
-// A range of keys. In case of user_defined timestamp, if enabled, `start` and
-// `limit` should point to key without timestamp part.
-struct Range {
-  Slice start;
-  Slice limit;
-
-  Range() {}
-  Range(const Slice& s, const Slice& l) : start(s), limit(l) {}
-};
-
-// A key range with optional endpoints. In case of user_defined timestamp, if
-// enabled, `start` and `limit` should point to key without timestamp part.
-struct RangeOpt {
-  // When start.has_value() == false, refers to starting before every key
-  OptSlice start;
-  // When limit.has_value() == false, refers to ending after every key
-  OptSlice limit;
-
-  RangeOpt() {}
-  RangeOpt(const OptSlice& s, const OptSlice& l) : start(s), limit(l) {}
-  // RangeOpt(const Slice& s, const Slice& l) : start(s), limit(l) {}
-};
-
 // IngestExternalFileOptions is used by IngestExternalFile()
 struct IngestExternalFileOptions {
   // Can be set to true to move the files instead of copying them.
diff --git a/include/rocksdb/utilities/stackable_db.h b/include/rocksdb/utilities/stackable_db.h
index 4cea4dafff5d..bea4b0d133f2 100644
--- a/include/rocksdb/utilities/stackable_db.h
+++ b/include/rocksdb/utilities/stackable_db.h
@@ -289,6 +289,13 @@ class StackableDB : public DB {
     return db_->NewAttributeGroupIterator(options, column_families);
   }
 
+  using DB::NewMultiScan;
+  std::unique_ptr<MultiScan> NewMultiScan(
+      const ReadOptions& opts, ColumnFamilyHandle* column_family,
+      const std::vector<ScanOptions>& scan_opts) override {
+    return db_->NewMultiScan(opts, column_family, scan_opts);
+  }
+
   const Snapshot* GetSnapshot() override { return db_->GetSnapshot(); }
 
   void ReleaseSnapshot(const Snapshot* snapshot) override {
diff --git a/table/external_table.cc b/table/external_table.cc
index 83c313a3d7b1..ad611c5d29ac 100644
--- a/table/external_table.cc
+++ b/table/external_table.cc
@@ -14,16 +14,17 @@ namespace ROCKSDB_NAMESPACE {
 
 namespace {
 
-class ExternalTableIterator : public InternalIterator {
+class ExternalTableIteratorAdapter : public InternalIterator {
  public:
-  explicit ExternalTableIterator(Iterator* iterator)
+  explicit ExternalTableIteratorAdapter(ExternalTableIterator* iterator)
       : iterator_(iterator), valid_(false) {}
 
   // No copying allowed
-  ExternalTableIterator(const ExternalTableIterator&) = delete;
-  ExternalTableIterator& operator=(const ExternalTableIterator&) = delete;
+  ExternalTableIteratorAdapter(const ExternalTableIteratorAdapter&) = delete;
+  ExternalTableIteratorAdapter& operator=(const ExternalTableIteratorAdapter&) =
+      delete;
 
-  ~ExternalTableIterator() override {}
+  ~ExternalTableIteratorAdapter() override {}
 
   bool Valid() const override { return valid_; }
 
@@ -31,7 +32,7 @@ class ExternalTableIterator : public InternalIterator {
     status_ = Status::OK();
     if (iterator_) {
       iterator_->SeekToFirst();
-      UpdateKey();
+      UpdateKey(OptSlice());
     }
   }
 
@@ -39,7 +40,7 @@ class ExternalTableIterator : public InternalIterator {
     status_ = Status::OK();
     if (iterator_) {
       iterator_->SeekToLast();
-      UpdateKey();
+      UpdateKey(OptSlice());
     }
   }
 
@@ -50,7 +51,7 @@ class ExternalTableIterator : public InternalIterator {
       status_ = ParseInternalKey(target, &pkey, /*log_err_key=*/false);
       if (status_.ok()) {
         iterator_->Seek(pkey.user_key);
-        UpdateKey();
+        UpdateKey(OptSlice());
       }
     }
   }
@@ -62,7 +63,7 @@ class ExternalTableIterator : public InternalIterator {
       status_ = ParseInternalKey(target, &pkey, /*log_err_key=*/false);
       if (status_.ok()) {
         iterator_->SeekForPrev(pkey.user_key);
-        UpdateKey();
+        UpdateKey(OptSlice());
       }
     }
   }
@@ -70,14 +71,44 @@ class ExternalTableIterator : public InternalIterator {
   void Next() override {
     if (iterator_) {
       iterator_->Next();
-      UpdateKey();
+      UpdateKey(OptSlice());
+    }
+  }
+
+  bool NextAndGetResult(IterateResult* result) override {
+    if (iterator_) {
+      valid_ = iterator_->NextAndGetResult(&result_);
+      result->value_prepared = result_.value_prepared;
+      result->bound_check_result = result_.bound_check_result;
+      if (valid_) {
+        UpdateKey(result_.key);
+        result->key = key();
+      }
+    } else {
+      valid_ = false;
+    }
+    return valid_;
+  }
+
+  bool PrepareValue() override {
+    if (iterator_ && !result_.value_prepared) {
+      valid_ = iterator_->PrepareValue();
+      result_.value_prepared = true;
+    }
+    return valid_;
+  }
+
+  IterBoundCheck UpperBoundCheckResult() override {
+    if (iterator_) {
+      result_.bound_check_result = iterator_->UpperBoundCheckResult();
     }
+    return result_.bound_check_result;
   }
 
   void Prev() override {
     if (iterator_) {
       iterator_->Prev();
-      UpdateKey();
+      UpdateKey(OptSlice());
     }
   }
 
@@ -97,18 +128,26 @@ class ExternalTableIterator : public InternalIterator {
 
   Status status() const override { return status_; }
 
+  void Prepare(const std::vector<ScanOptions>* scan_opts) override {
+    if (iterator_) {
+      iterator_->Prepare(scan_opts->data(), scan_opts->size());
+    }
+  }
+
  private:
-  std::unique_ptr<Iterator> iterator_;
+  std::unique_ptr<ExternalTableIterator> iterator_;
   InternalKey key_;
   bool valid_;
   Status status_;
+  IterateResult result_;
 
-  void UpdateKey() {
+  void UpdateKey(OptSlice res) {
     if (iterator_) {
       valid_ = iterator_->Valid();
       status_ = iterator_->status();
       if (valid_ && status_.ok()) {
-        key_.Set(iterator_->key(), 0, ValueType::kTypeValue);
+        key_.Set(res.has_value() ? res.value() : iterator_->key(), 0,
+                 ValueType::kTypeValue);
       }
     }
   }
@@ -134,10 +173,10 @@ class ExternalTableReaderAdapter : public TableReader {
       bool /* allow_unprepared_value */ = false) override {
     auto iterator = reader_->NewIterator(read_options, prefix_extractor);
     if (arena == nullptr) {
-      return new ExternalTableIterator(iterator);
+      return new ExternalTableIteratorAdapter(iterator);
     } else {
-      auto* mem = arena->AllocateAligned(sizeof(ExternalTableIterator));
-      return new (mem) ExternalTableIterator(iterator);
+      auto* mem = arena->AllocateAligned(sizeof(ExternalTableIteratorAdapter));
+      return new (mem) ExternalTableIteratorAdapter(iterator);
     }
   }
 
diff --git a/table/internal_iterator.h b/table/internal_iterator.h
index 8ecbb0f90b4f..f6b6998b1d53 100644
--- a/table/internal_iterator.h
+++ b/table/internal_iterator.h
@@ -10,6 +10,7 @@
 
 #include "db/dbformat.h"
 #include "file/readahead_file_info.h"
+#include "rocksdb/advanced_iterator.h"
 #include "rocksdb/comparator.h"
 #include "rocksdb/iterator.h"
 #include "rocksdb/status.h"
@@ -19,19 +20,6 @@ namespace ROCKSDB_NAMESPACE {
 
 class PinnedIteratorsManager;
 
-enum class IterBoundCheck : char {
-  kUnknown = 0,
-  kOutOfBound,
-  kInbound,
-};
-
-struct IterateResult {
-  Slice key;
-  IterBoundCheck bound_check_result = IterBoundCheck::kUnknown;
-  // If false, PrepareValue() needs to be called before value().
-  bool value_prepared = true;
-};
-
 template <class TValue>
 class InternalIteratorBase : public Cleanable {
  public:
@@ -212,6 +200,8 @@ class InternalIteratorBase : public Cleanable {
   // used by MergingIterator and LevelIterator for now.
   virtual bool IsDeleteRangeSentinelKey() const { return false; }
 
+  virtual void Prepare(const std::vector<ScanOptions>* /*scan_opts*/) {}
+
  protected:
   void SeekForPrevImpl(const Slice& target, const CompareInterface* cmp) {
     Seek(target);
diff --git a/table/iterator_wrapper.h b/table/iterator_wrapper.h
index b53076910ec6..398ec9e3d0fe 100644
--- a/table/iterator_wrapper.h
+++ b/table/iterator_wrapper.h
@@ -195,6 +195,14 @@ class IteratorWrapperBase {
     return iter_->IsDeleteRangeSentinelKey();
   }
 
+  // scan_opts lifetime is guaranteed until the iterator is destructed, or
+  // Prepare() is called with a new scan_opts
+  void Prepare(const std::vector<ScanOptions>* scan_opts) {
+    if (iter_) {
+      iter_->Prepare(scan_opts);
+    }
+  }
+
  private:
   void Update() {
     valid_ = iter_->Valid();
diff --git a/table/merging_iterator.cc b/table/merging_iterator.cc
index 375c811c59fc..0a47ec130f3f 100644
--- a/table/merging_iterator.cc
+++ b/table/merging_iterator.cc
@@ -482,6 +482,12 @@ class MergingIterator : public InternalIterator {
            current_->IsValuePinned();
   }
 
+  void Prepare(const std::vector<ScanOptions>* scan_opts) override {
+    for (auto& child : children_) {
+      child.iter.Prepare(scan_opts);
+    }
+  }
+
  private:
   // Represents an element in the min/max heap. Each HeapItem corresponds to a
   // point iterator or a range tombstone iterator, differentiated by
diff --git a/table/table_test.cc b/table/table_test.cc
index ba5b01a132fd..919dc285eaa4 100644
--- a/table/table_test.cc
+++ b/table/table_test.cc
@@ -6615,29 +6615,66 @@ class ExternalTableReaderTest : public DBTestBase {
     uint64_t file_size_;
   };
 
-  class DummyExternalTableIterator : public Iterator {
+  class DummyExternalTableIterator : public ExternalTableIterator {
    public:
     explicit DummyExternalTableIterator(
-        const ReadOptions& ro, const std::map<std::string, std::string>& kv_map)
-        : weight_(ro.weight), kv_map_(kv_map), valid_(false) {}
+        const ReadOptions& /*ro*/,
+        const std::map<std::string, std::string>& kv_map)
+        : scan_options_(nullptr),
+          num_opts_(0),
+          scan_idx_(0),
+          kv_map_(kv_map),
+          valid_(false) {}
 
     bool Valid() const override { return valid_; }
 
     void SeekToFirst() override {
-      iter_ = kv_map_.begin();
-      valid_ = iter_ != kv_map_.end();
-      status_ = Status::OK();
+      if (scan_options_) {
+        status_ = Status::InvalidArgument();
+      } else {
+        iter_ = kv_map_.begin();
+        valid_ = iter_ != kv_map_.end();
+        status_ = Status::OK();
+      }
     }
 
     void SeekToLast() override {
-      valid_ = false;
-      status_ = Status::NotSupported();
+      if (scan_options_) {
+        status_ = Status::InvalidArgument();
+      } else {
+        if (!kv_map_.empty()) {
+          iter_ = kv_map_.begin();
+          for (uint64_t i = 0; i < kv_map_.size() - 1; ++i) {
+            iter_++;
+          }
+          valid_ = true;
+        } else {
+          valid_ = false;
+        }
+        status_ = Status::OK();
+      }
     }
 
     void Seek(const Slice& target) override {
-      iter_ = kv_map_.find(target.ToString());
-      valid_ = iter_ != kv_map_.end();
-      status_ = Status::OK();
+      if (status_.ok()) {
+        iter_ = kv_map_.find(target.ToString());
+        valid_ = iter_ != kv_map_.end();
+        eof_ = iter_ == kv_map_.end();
+      }
+      if (scan_options_) {
+        if (scan_idx_ >= num_opts_ ||
+            target != scan_options_[scan_idx_].range.start.value().ToString()) {
+          status_ = Status::InvalidArgument();
+        } else {
+          if (valid_ && scan_options_[scan_idx_].range.limit.has_value() &&
+              iter_->first.compare(
+                  scan_options_[scan_idx_].range.limit.value().ToString()) >=
+                  0) {
+            valid_ = false;
+          }
+          scan_idx_++;
+        }
+      }
     }
 
     void SeekForPrev(const Slice& /*target*/) override {
@@ -6647,9 +6684,37 @@ class ExternalTableReaderTest : public DBTestBase {
 
     void Next() override {
       iter_++;
-      weight_--;
-      valid_ = iter_ != kv_map_.end() && weight_ > 0;
-      // status_ is still ok. valid_ indicates end of scan
+      valid_ = iter_ != kv_map_.end();
+      eof_ = iter_ == kv_map_.end();
+      if (valid_ && scan_options_ &&
+          scan_options_[scan_idx_ - 1].range.limit.has_value() &&
+          iter_->first.compare(
+              scan_options_[scan_idx_ - 1].range.limit.value().ToString()) >=
+              0) {
+        valid_ = false;
+      }
+      // status_ is still ok. !valid_ indicates end of scan
+    }
+
+    bool NextAndGetResult(IterateResult* result) override {
+      Next();
+      if (valid_) {
+        result->key = key();
+        result->bound_check_result = IterBoundCheck::kInbound;
+        result->value_prepared = true;
+      } else {
+        result->key = Slice();
+        result->bound_check_result =
+            eof_ ? IterBoundCheck::kUnknown : IterBoundCheck::kOutOfBound;
+        result->value_prepared = false;
+      }
+      return valid_;
+    }
+
+    bool PrepareValue() override { return valid_ ? true : false; }
+
+    IterBoundCheck UpperBoundCheckResult() override {
+      return eof_ ? IterBoundCheck::kUnknown : IterBoundCheck::kOutOfBound;
     }
 
     void Prev() override {
@@ -6672,10 +6737,18 @@ class ExternalTableReaderTest : public DBTestBase {
       return Slice(iter_->second);
     }
 
+    void Prepare(const ScanOptions scan_opts[], size_t num_opts) override {
+      scan_options_ = scan_opts;
+      num_opts_ = num_opts;
+    }
+
    private:
-    uint64_t weight_;
+    const ScanOptions* scan_options_;
+    size_t num_opts_;
+    size_t scan_idx_;
     std::map<std::string, std::string> kv_map_;
     bool valid_ = false;
+    bool eof_ = false;
     Status status_ = Status::OK();
     std::map<std::string, std::string>::iterator iter_;
   };
@@ -6688,8 +6761,9 @@ class ExternalTableReaderTest : public DBTestBase {
       EXPECT_OK(s);
     }
 
-    Iterator* NewIterator(const ReadOptions& read_options,
-                          const SliceTransform* /*prefix_extractor*/) override {
+    ExternalTableIterator* NewIterator(
+        const ReadOptions& read_options,
+        const SliceTransform* /*prefix_extractor*/) override {
       return new DummyExternalTableIterator(read_options, kv_map_);
     }
 
@@ -6808,8 +6882,7 @@ TEST_F(ExternalTableReaderTest, BasicTest) {
       {}, file_path, ExternalTableOptions(prefix_extractor, nullptr), &reader));
 
   ReadOptions ro;
-  ro.weight = 1;
-  std::unique_ptr<Iterator> iter(reader->NewIterator(ro, nullptr));
+  std::unique_ptr<ExternalTableIterator> iter(reader->NewIterator(ro, nullptr));
   ASSERT_NE(iter, nullptr);
   iter->Seek("foo");
   ASSERT_TRUE(iter->Valid() && iter->status().ok());
@@ -6854,7 +6927,6 @@ TEST_F(ExternalTableReaderTest, SstReaderTest) {
   ASSERT_OK(reader->Open(ingest_file));
 
   ReadOptions ro;
-  ro.weight = 1;
   std::unique_ptr<Iterator> iter(reader->NewIterator(ro));
   ASSERT_NE(iter, nullptr);
   iter->Seek("foo");
@@ -6865,6 +6937,172 @@ TEST_F(ExternalTableReaderTest, SstReaderTest) {
   ASSERT_TRUE(iter->status().ok());
 }
 
+TEST_F(ExternalTableReaderTest, DBIterTest) {
+  Options options = GetDefaultOptions();
+  std::string dbname = test::PerThreadDBPath("external_table_reader_test");
+  std::string ingest_file = dbname + "test.immutable";
+  dbname += "_db";
+  // This test doesn't work with some custom Envs, like EncryptedEnv
+  options.env = Env::Default();
+  ASSERT_OK(DestroyDB(dbname, options));
+
+  std::shared_ptr<ExternalTableFactory> factory(
+      new DummyExternalTableFactory());
+  options.table_factory = NewExternalTableFactory(factory);
+
+  // Create a file
+  std::unique_ptr<SstFileWriter> writer;
+  writer.reset(new SstFileWriter(EnvOptions(), options));
+  ASSERT_OK(writer->Open(ingest_file));
+  ASSERT_OK(writer->Put("foo", "bar"));
+  ASSERT_OK(writer->Put("foo2", "bar2"));
+  ASSERT_OK(writer->Finish());
+  writer.reset();
+
+  std::unique_ptr<DB> db;
+  options.create_if_missing = true;
+  Status s = DB::Open(options, dbname, &db);
+  ASSERT_OK(s);
+  ASSERT_TRUE(db != nullptr);
+  ColumnFamilyHandle* cfh = nullptr;
+  ASSERT_OK(db->CreateColumnFamily(options, "new_cf", &cfh));
+
+  IngestExternalFileOptions ifo;
+  ifo.allow_db_generated_files = true;
+  ifo.fill_cache = false;
+  s = db->IngestExternalFile(cfh, {ingest_file}, ifo);
+  ASSERT_OK(s);
+
+  std::unique_ptr<Iterator> iter(db->NewIterator({}, cfh));
+  ASSERT_NE(iter, nullptr);
+  iter->Seek("foo");
+  ASSERT_TRUE(iter->Valid() && iter->status().ok());
+  ASSERT_EQ(iter->value(), "bar");
+  iter->Next();
+  ASSERT_TRUE(iter->Valid() && iter->status().ok());
+  ASSERT_EQ(iter->key(), "foo2");
+  ASSERT_EQ(iter->value(), "bar2");
+  iter->Next();
+  ASSERT_FALSE(iter->Valid());
+  ASSERT_OK(iter->status());
+  iter.reset();
+
+  ASSERT_OK(db->DestroyColumnFamilyHandle(cfh));
+  ASSERT_OK(db->Close());
+}
+
+TEST_F(ExternalTableReaderTest, DBMultiScanTest) {
+  Options options = GetDefaultOptions();
+  std::string dbname = test::PerThreadDBPath("external_table_reader_test");
+  std::string ingest_file = dbname + "test.immutable";
+  dbname += "_db";
+  // This test doesn't work with some custom Envs, like EncryptedEnv
+  options.env = Env::Default();
+  ASSERT_OK(DestroyDB(dbname, options));
+
+  std::shared_ptr<ExternalTableFactory> factory(
+      new DummyExternalTableFactory());
+  options.table_factory = NewExternalTableFactory(factory);
+
+  // Create a file
+  std::unique_ptr<SstFileWriter> writer;
+  writer.reset(new SstFileWriter(EnvOptions(), options));
+  ASSERT_OK(writer->Open(ingest_file));
+  for (int i = 0; i < 100; ++i) {
+    std::stringstream ss;
+    ss << std::setw(2) << std::setfill('0') << i;
+    ASSERT_OK(writer->Put("k" + ss.str(), "val" + ss.str()));
+  }
+  ASSERT_OK(writer->Finish());
+  writer.reset();
+
+  std::unique_ptr<DB> db;
+  options.create_if_missing = true;
+  Status s = DB::Open(options, dbname, &db);
+  ASSERT_OK(s);
+  ASSERT_TRUE(db != nullptr);
+  ColumnFamilyHandle* cfh = nullptr;
+  ASSERT_OK(db->CreateColumnFamily(options, "new_cf", &cfh));
+
+  IngestExternalFileOptions ifo;
+  ifo.allow_db_generated_files = true;
+  ifo.fill_cache = false;
+  s = db->IngestExternalFile(cfh, {ingest_file}, ifo);
+  ASSERT_OK(s);
+
+  std::vector<std::string> key_ranges({"k03", "k10", "k25", "k50"});
+  ReadOptions ro;
+  std::vector<ScanOptions> scan_options(
+      {ScanOptions(key_ranges[0], key_ranges[1]),
+       ScanOptions(key_ranges[2], key_ranges[3])});
+  std::unique_ptr<MultiScan> iter = db->NewMultiScan(ro, cfh, scan_options);
+  try {
+    int idx = 0;
+    int count = 0;
+    for (auto range : *iter) {
+      for (auto it : range) {
+        ASSERT_GE(it.first.ToString().compare(key_ranges[idx]), 0);
+        ASSERT_LT(it.first.ToString().compare(key_ranges[idx + 1]), 0);
+        count++;
+      }
+      idx += 2;
+    }
+    ASSERT_EQ(count, 32);
+  } catch (Status status) {
+    std::cerr << "Iterator returned status " << status.ToString();
+    abort();
+  }
+  iter.reset();
+
+  // Test the overlapping scan case
+  key_ranges[1] = "k30";
+  scan_options[0] = ScanOptions(key_ranges[0], key_ranges[1]);
+  iter = db->NewMultiScan(ro, cfh, scan_options);
+  try {
+    int idx = 0;
+    int count = 0;
+    for (auto range : *iter) {
+      for (auto it : range) {
+        ASSERT_GE(it.first.ToString().compare(key_ranges[idx]), 0);
+        ASSERT_LT(it.first.ToString().compare(key_ranges[idx + 1]), 0);
+        count++;
+      }
+      idx += 2;
+    }
+    ASSERT_EQ(count, 52);
+  } catch (Status status) {
+    std::cerr << "Iterator returned status " << status.ToString();
+    abort();
+  }
+  iter.reset();
+
+  // Test the no limit scan case
+  scan_options[0] = ScanOptions(key_ranges[0]);
+  scan_options[1] = ScanOptions(key_ranges[2]);
+  iter = db->NewMultiScan(ro, cfh, scan_options);
+  try {
+    int idx = 0;
+    int count = 0;
+    for (auto range : *iter) {
+      for (auto it : range) {
+        ASSERT_GE(it.first.ToString().compare(key_ranges[idx]), 0);
+        if (it.first.ToString().compare(key_ranges[idx + 1]) == 0) {
+          break;
+        }
+        count++;
+      }
+      idx += 2;
+    }
+    ASSERT_EQ(count, 52);
+  } catch (Status status) {
+    std::cerr << "Iterator returned status " << status.ToString();
+    abort();
+  }
+  iter.reset();
+
+  ASSERT_OK(db->DestroyColumnFamilyHandle(cfh));
+  ASSERT_OK(db->Close());
+}
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {

From 4069afeede6363a2f723ea72a6b24bd10fe0d0c7 Mon Sep 17 00:00:00 2001
From: Yu Zhang <yuzhangyu@fb.com>
Date: Fri, 4 Apr 2025 17:13:56 -0700
Subject: [PATCH 043/500] Add safeguarding from resurrected cutoff UDT from
 previous session (#13521)

Summary:
Public APIs like `DB::GetFullHistoryTsLow` and `DB::IncreaseFullHistoryTsLow` have such safeguarding, allowing them to only be invoked when user defined timestamp is enabled. This PR adds safeguarding into related internal APIs in `ColumnFamilyData` to properly handle the case when the UDT feature are toggled.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13521

Test Plan: ./db_with_timestamp_basic_test --gtest_filter="*EnableDisableUDT*"

Reviewed By: cbi42

Differential Revision: D72475234

Pulled By: jowlyzhang

fbshipit-source-id: 194c07287e3100da95450b04c76552c9d4a86c2d
---
 db/column_family.h                 |  11 +++
 db/db_with_timestamp_basic_test.cc | 106 +++++++++++++++++++++--------
 2 files changed, 88 insertions(+), 29 deletions(-)

diff --git a/db/column_family.h b/db/column_family.h
index 71401834ba80..31b0575a1b27 100644
--- a/db/column_family.h
+++ b/db/column_family.h
@@ -537,6 +537,12 @@ class ColumnFamilyData {
     assert(!ts_low.empty());
     const Comparator* ucmp = user_comparator();
     assert(ucmp);
+    // Guard against resurrected full_history_ts_low persisted in MANIFEST
+    // from previous DB sessions. This could happen if UDT was enabled and then
+    // disabled.
+    if (ucmp->timestamp_size() == 0) {
+      return;
+    }
     if (full_history_ts_low_.empty() ||
         ucmp->CompareTimestamp(ts_low, full_history_ts_low_) > 0) {
       full_history_ts_low_ = std::move(ts_low);
@@ -544,6 +550,11 @@ class ColumnFamilyData {
   }
 
   const std::string& GetFullHistoryTsLow() const {
+    const Comparator* ucmp = user_comparator();
+    assert(ucmp);
+    if (ucmp->timestamp_size() == 0) {
+      assert(full_history_ts_low_.empty());
+    }
     return full_history_ts_low_;
   }
 
diff --git a/db/db_with_timestamp_basic_test.cc b/db/db_with_timestamp_basic_test.cc
index af328707aac7..268e22fbedac 100644
--- a/db/db_with_timestamp_basic_test.cc
+++ b/db/db_with_timestamp_basic_test.cc
@@ -19,6 +19,13 @@
 #include "utilities/merge_operators/string_append/stringappend2.h"
 
 namespace ROCKSDB_NAMESPACE {
+namespace {
+std::string EncodeAsUint64(uint64_t v) {
+  std::string dst;
+  PutFixed64(&dst, v);
+  return dst;
+}
+}  // namespace
 class DBBasicTestWithTimestamp : public DBBasicTestWithTimestampBase {
  public:
   DBBasicTestWithTimestamp()
@@ -3746,17 +3753,42 @@ INSTANTIATE_TEST_CASE_P(
         test::UserDefinedTimestampTestMode::kStripUserDefinedTimestamp,
         test::UserDefinedTimestampTestMode::kNormal));
 
-TEST_F(DBBasicTestWithTimestamp, EnableDisableUDT) {
+// Test params:
+// 1) whether to flush before close
+class EnableDisableUDTTest : public DBBasicTestWithTimestampBase,
+                             public testing::WithParamInterface<bool> {
+ public:
+  EnableDisableUDTTest()
+      : DBBasicTestWithTimestampBase("/enable_disable_udt") {}
+};
+
+INSTANTIATE_TEST_CASE_P(EnableDisableUDTTest, EnableDisableUDTTest,
+                        ::testing::Values(true, false));
+
+TEST_P(EnableDisableUDTTest, Basic) {
   Options options = CurrentOptions();
+  // Un-flushed data before close will involve a WAL replay on DB reopen.
+  bool flush_before_close = GetParam();
   options.env = env_;
-  // Create a column family without user-defined timestamps.
   options.comparator = BytewiseComparator();
   options.persist_user_defined_timestamps = true;
   DestroyAndReopen(options);
 
+  ReadOptions ropts;
+  std::string read_ts;
+  std::string value;
+  std::string key_ts;
+
   // Create one SST file, its user keys have no user-defined timestamps.
-  ASSERT_OK(db_->Put(WriteOptions(), "foo", "val1"));
-  ASSERT_OK(Flush(0));
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", "val0"));
+  ASSERT_OK(db_->Put(WriteOptions(), "bar", "val0"));
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), "bar", "baz"));
+  ASSERT_OK(db_->Get(ReadOptions(), "foo", &value));
+  ASSERT_EQ("val0", value);
+  ASSERT_TRUE(db_->Get(ReadOptions(), "bar", &value).IsNotFound());
+  if (flush_before_close) {
+    ASSERT_OK(Flush(0));
+  }
   Close();
 
   // Reopen the existing column family and enable user-defined timestamps
@@ -3765,47 +3797,63 @@ TEST_F(DBBasicTestWithTimestamp, EnableDisableUDT) {
   options.persist_user_defined_timestamps = false;
   options.allow_concurrent_memtable_write = false;
   Reopen(options);
-
-  std::string value;
-  ASSERT_TRUE(db_->Get(ReadOptions(), "foo", &value).IsInvalidArgument());
-  std::string read_ts;
-  PutFixed64(&read_ts, 0);
-  ReadOptions ropts;
+  // Read data from previous session before and after compaction.
+  read_ts = EncodeAsUint64(1);
   Slice read_ts_slice = read_ts;
   ropts.timestamp = &read_ts_slice;
-  std::string key_ts;
-  // Entries in pre-existing SST files are treated as if they have minimum
-  // user-defined timestamps.
-  ASSERT_OK(db_->Get(ropts, "foo", &value, &key_ts));
-  ASSERT_EQ("val1", value);
-  ASSERT_EQ(read_ts, key_ts);
+  for (int i = 0; i < 2; i++) {
+    ASSERT_TRUE(db_->Get(ReadOptions(), "foo", &value).IsInvalidArgument());
+    // Entries in pre-existing SST files are treated as if they have minimum
+    // user-defined timestamps.
+    ASSERT_OK(db_->Get(ropts, "foo", &value, &key_ts));
+    ASSERT_EQ("val0", value);
+    ASSERT_EQ(EncodeAsUint64(0), key_ts);
+    ASSERT_TRUE(db_->Get(ropts, "bar", &value, &key_ts).IsNotFound());
+    ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  }
 
   // Do timestamped read / write.
-  std::string write_ts;
-  PutFixed64(&write_ts, 1);
-  ASSERT_OK(db_->Put(WriteOptions(), "foo", write_ts, "val2"));
-  read_ts.clear();
-  PutFixed64(&read_ts, 1);
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", EncodeAsUint64(1), "val1"));
+  ASSERT_OK(db_->Put(WriteOptions(), "bar", EncodeAsUint64(1), "val1"));
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), "bar", "baz", EncodeAsUint64(2)));
   ASSERT_OK(db_->Get(ropts, "foo", &value, &key_ts));
-  ASSERT_EQ("val2", value);
-  ASSERT_EQ(write_ts, key_ts);
+  ASSERT_EQ("val1", value);
+  ASSERT_EQ(EncodeAsUint64(1), key_ts);
+  ASSERT_OK(db_->Get(ropts, "bar", &value, &key_ts));
+  ASSERT_EQ("val1", value);
+  ASSERT_EQ(EncodeAsUint64(1), key_ts);
+  read_ts = EncodeAsUint64(2);
+  ASSERT_TRUE(db_->Get(ropts, "bar", &value, &key_ts).IsNotFound());
   // The user keys in this SST file don't have user-defined timestamps either,
   // because `persist_user_defined_timestamps` flag is set to false.
-  ASSERT_OK(Flush(0));
+  if (flush_before_close) {
+    ASSERT_OK(Flush(0));
+  }
   Close();
 
   // Reopen the existing column family while disabling user-defined timestamps.
   options.comparator = BytewiseComparator();
   Reopen(options);
 
-  ASSERT_TRUE(db_->Get(ropts, "foo", &value).IsInvalidArgument());
-  ASSERT_OK(db_->Get(ReadOptions(), "foo", &value));
-  ASSERT_EQ("val2", value);
+  // Read data from previous session before and after compaction.
+  for (int i = 0; i < 2; i++) {
+    ASSERT_TRUE(db_->Get(ropts, "foo", &value).IsInvalidArgument());
+    ASSERT_OK(db_->Get(ReadOptions(), "foo", &value));
+    ASSERT_EQ("val1", value);
+    ASSERT_TRUE(db_->Get(ReadOptions(), "bar", &value).IsNotFound());
+    ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  }
 
   // Continue to write / read the column family without user-defined timestamps.
-  ASSERT_OK(db_->Put(WriteOptions(), "foo", "val3"));
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", "val2"));
+  ASSERT_OK(db_->Put(WriteOptions(), "bar", "val2"));
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), "bar", "baz"));
   ASSERT_OK(db_->Get(ReadOptions(), "foo", &value));
-  ASSERT_EQ("val3", value);
+  ASSERT_EQ("val2", value);
+  ASSERT_TRUE(db_->Get(ReadOptions(), "bar", &value).IsNotFound());
+  if (flush_before_close) {
+    ASSERT_OK(Flush(0));
+  }
   Close();
 }
 

From 07b09c75488a631d083b77f67833dc875e014f3e Mon Sep 17 00:00:00 2001
From: Hui Xiao <huixiao@fb.com>
Date: Mon, 7 Apr 2025 09:39:54 -0700
Subject: [PATCH 044/500] Persist tail size of remote compaction output file to
 manifest (#13522)

Summary:
**Context/Summary:**

This is to fix a bug that tail size of remote compaction output SST file is not persisted to manifest in primary instance. This prevent us from using direct tail prefetch optimization each time opening this SST file.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13522

Test Plan: Modify existing UT that failed before the fix

Reviewed By: anand1976

Differential Revision: D72479612

Pulled By: hx235

fbshipit-source-id: 1ba8aa66fac71b9196589f60076229c29a103706
---
 db/compaction/compaction_service_job.cc       |  3 ++-
 db/compaction/compaction_service_test.cc      | 18 ++++++++++++-
 db/version_edit.h                             | 27 +++++++++++++++++++
 table/block_based/block_based_table_reader.cc |  1 +
 .../bug_fixes/remote_compact_populate.md      |  1 +
 5 files changed, 48 insertions(+), 2 deletions(-)
 create mode 100644 unreleased_history/bug_fixes/remote_compact_populate.md

diff --git a/db/compaction/compaction_service_job.cc b/db/compaction/compaction_service_job.cc
index c5151f34b5be..7a6b07c5d9ea 100644
--- a/db/compaction/compaction_service_job.cc
+++ b/db/compaction/compaction_service_job.cc
@@ -240,7 +240,8 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService(
     meta.marked_for_compaction = file.marked_for_compaction;
     meta.unique_id = file.unique_id;
     meta.temperature = file.file_temperature;
-
+    meta.tail_size =
+        FileMetaData::CalculateTailSize(file_size, file.table_properties);
     auto cfd = compaction->column_family_data();
     CompactionOutputs* compaction_outputs =
         sub_compact->Outputs(file.is_proximal_level_output);
diff --git a/db/compaction/compaction_service_test.cc b/db/compaction/compaction_service_test.cc
index 6f99a3781458..d6680ac62db1 100644
--- a/db/compaction/compaction_service_test.cc
+++ b/db/compaction/compaction_service_test.cc
@@ -277,8 +277,17 @@ TEST_F(CompactionServiceTest, BasicCompactions) {
   Statistics* primary_statistics = GetPrimaryStatistics();
   Statistics* compactor_statistics = GetCompactorStatistics();
 
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "BlockBasedTable::PrefetchTail::TaiSizeNotRecorded",
+      [&](void* /* arg */) {
+        // Trigger assertion to verify precise tail prefetch size calculation
+        assert(false);
+      });
+
+  SyncPoint::GetInstance()->EnableProcessing();
   GenerateTestData();
   ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  SyncPoint::GetInstance()->DisableProcessing();
   VerifyTestData();
 
   auto my_cs = GetCompactionService();
@@ -380,6 +389,7 @@ TEST_F(CompactionServiceTest, BasicCompactions) {
   ASSERT_FALSE(result.stats.is_full_compaction);
 
   Close();
+  SyncPoint::GetInstance()->DisableProcessing();
 }
 
 TEST_F(CompactionServiceTest, ManualCompaction) {
@@ -890,6 +900,12 @@ TEST_F(CompactionServiceTest, TruncatedOutput) {
   Slice end(end_str);
   uint64_t comp_num = my_cs->GetCompactionNum();
 
+  // Skip calculating tail size to avoid crashing due to truncated file size
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "FileMetaData::CalculateTailSize", [&](void* arg) {
+        bool* skip = static_cast<bool*>(arg);
+        *skip = true;
+      });
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
       "CompactionServiceCompactionJob::Run:0", [&](void* arg) {
         CompactionServiceResult* compaction_result =
@@ -906,7 +922,7 @@ TEST_F(CompactionServiceTest, TruncatedOutput) {
           ASSERT_OK(s);
           ASSERT_GT(file_size, 0);
 
-          ASSERT_OK(test::TruncateFile(env_, file_name, file_size / 2));
+          ASSERT_OK(test::TruncateFile(env_, file_name, file_size / 4));
         }
       });
   SyncPoint::GetInstance()->EnableProcessing();
diff --git a/db/version_edit.h b/db/version_edit.h
index 9189b4628109..25b794fd3359 100644
--- a/db/version_edit.h
+++ b/db/version_edit.h
@@ -380,6 +380,33 @@ struct FileMetaData {
     assert(!res || fd.smallest_seqno == fd.largest_seqno);
     return res;
   }
+
+  static uint64_t CalculateTailSize(uint64_t file_size,
+                                    const TableProperties& props) {
+#ifndef NDEBUG
+    bool skip = false;
+    TEST_SYNC_POINT_CALLBACK("FileMetaData::CalculateTailSize", &skip);
+    if (skip) {
+      return 0;
+    }
+#endif  // NDEBUG
+    uint64_t tail_size = 0;
+
+    // Differentiate between a file with no data blocks (tail_start_offset = 0)
+    // and a file with unknown tail_start_offset (also set to 0 due to
+    // non-negative integer storage limitation)
+    bool contain_no_data_blocks =
+        props.num_entries == 0 ||
+        (props.num_entries > 0 &&
+         (props.num_entries == props.num_range_deletions));
+
+    if (props.tail_start_offset > 0 || contain_no_data_blocks) {
+      assert(props.tail_start_offset <= file_size);
+      tail_size = file_size - props.tail_start_offset;
+    }
+
+    return tail_size;
+  }
 };
 
 // A compressed copy of file meta data that just contain minimum data needed
diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc
index 103f687f812c..dd904ccd906d 100644
--- a/table/block_based/block_based_table_reader.cc
+++ b/table/block_based/block_based_table_reader.cc
@@ -914,6 +914,7 @@ Status BlockBasedTable::PrefetchTail(
                      "TailPrefetchStats.",
                      file->file_name().c_str(), tail_prefetch_size);
     }
+    TEST_SYNC_POINT("BlockBasedTable::PrefetchTail::TaiSizeNotRecorded");
   }
   size_t prefetch_off;
   size_t prefetch_len;
diff --git a/unreleased_history/bug_fixes/remote_compact_populate.md b/unreleased_history/bug_fixes/remote_compact_populate.md
new file mode 100644
index 000000000000..e1bd531cb0c7
--- /dev/null
+++ b/unreleased_history/bug_fixes/remote_compact_populate.md
@@ -0,0 +1 @@
+Fix a bug where tail size of remote compaction output is not persisted in primary db's manifest

From 72571d09ad61e6ef6721499d629082944d9fda62 Mon Sep 17 00:00:00 2001
From: Hui Xiao <huixiao@fb.com>
Date: Mon, 7 Apr 2025 12:50:56 -0700
Subject: [PATCH 045/500] Clean up in repair, file ingestion and cf import
 (#13524)

Summary:
**Context/Summary:**
Rebased on https://github.com/facebook/rocksdb/pull/13522/files, this is to use the refactored function to calculate tail size from table property "tail_start_offset"

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13524

Test Plan: CI

Reviewed By: anand1976

Differential Revision: D72576262

Pulled By: hx235

fbshipit-source-id: 78c126bc64024c2341d183d6871e06d55fd27501
---
 db/external_sst_file_ingestion_job.cc | 12 ++----------
 db/import_column_family_job.cc        | 11 ++---------
 db/repair.cc                          |  9 +--------
 3 files changed, 5 insertions(+), 27 deletions(-)

diff --git a/db/external_sst_file_ingestion_job.cc b/db/external_sst_file_ingestion_job.cc
index a65dacb7a114..2a45516b4a2e 100644
--- a/db/external_sst_file_ingestion_job.cc
+++ b/db/external_sst_file_ingestion_job.cc
@@ -610,16 +610,8 @@ Status ExternalSstFileIngestionJob::AssignLevelsForOneBatch(
       current_time = oldest_ancester_time =
           static_cast<uint64_t>(temp_current_time);
     }
-    uint64_t tail_size = 0;
-    bool contain_no_data_blocks = file->table_properties.num_entries > 0 &&
-                                  (file->table_properties.num_entries ==
-                                   file->table_properties.num_range_deletions);
-    if (file->table_properties.tail_start_offset > 0 ||
-        contain_no_data_blocks) {
-      uint64_t file_size = file->fd.GetFileSize();
-      assert(file->table_properties.tail_start_offset <= file_size);
-      tail_size = file_size - file->table_properties.tail_start_offset;
-    }
+    uint64_t tail_size = FileMetaData::CalculateTailSize(
+        file->fd.GetFileSize(), file->table_properties);
 
     bool marked_for_compaction =
         file->table_properties.num_range_deletions == 1 &&
diff --git a/db/import_column_family_job.cc b/db/import_column_family_job.cc
index 44a1c5d099a9..2a725726b913 100644
--- a/db/import_column_family_job.cc
+++ b/db/import_column_family_job.cc
@@ -201,15 +201,8 @@ Status ImportColumnFamilyJob::Run() {
       const auto& f = files_to_import_[i][j];
       const auto& file_metadata = *metadatas_[i][j];
 
-      uint64_t tail_size = 0;
-      bool contain_no_data_blocks = f.table_properties.num_entries > 0 &&
-                                    (f.table_properties.num_entries ==
-                                     f.table_properties.num_range_deletions);
-      if (f.table_properties.tail_start_offset > 0 || contain_no_data_blocks) {
-        uint64_t file_size = f.fd.GetFileSize();
-        assert(f.table_properties.tail_start_offset <= file_size);
-        tail_size = file_size - f.table_properties.tail_start_offset;
-      }
+      uint64_t tail_size = FileMetaData::CalculateTailSize(f.fd.GetFileSize(),
+                                                           f.table_properties);
 
       VersionEdit dummy_version_edit;
       dummy_version_edit.AddFile(
diff --git a/db/repair.cc b/db/repair.cc
index eaeb77795a1a..6d184eba8b1c 100644
--- a/db/repair.cc
+++ b/db/repair.cc
@@ -578,14 +578,7 @@ class Repairer {
           static_cast<bool>(props->user_defined_timestamps_persisted);
     }
     if (status.ok()) {
-      uint64_t tail_size = 0;
-      bool contain_no_data_blocks =
-          props->num_entries > 0 &&
-          (props->num_entries == props->num_range_deletions);
-      if (props->tail_start_offset > 0 || contain_no_data_blocks) {
-        assert(props->tail_start_offset <= file_size);
-        tail_size = file_size - props->tail_start_offset;
-      }
+      uint64_t tail_size = FileMetaData::CalculateTailSize(file_size, *props);
       t->meta.tail_size = tail_size;
     }
     ColumnFamilyData* cfd = nullptr;

From 5e10baa412d06492c7ec4a4ded0949f6981c94e5 Mon Sep 17 00:00:00 2001
From: Yu Zhang <yuzhangyu@fb.com>
Date: Mon, 7 Apr 2025 21:44:36 -0700
Subject: [PATCH 046/500] Delete max_write_buffer_number_to_maintain (#13491)

Summary:
As titled. This option has been marked deprecated since introduction of a better option `max_write_buffer_size_to_maintain` and acts as its fallback since RocksDB 6.5.0 The internal user we know these options were created for migrated to `max_write_buffer_size_to_maintain` for a long time too.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13491

Test Plan: existing tests

Reviewed By: cbi42

Differential Revision: D71984601

Pulled By: jowlyzhang

fbshipit-source-id: c264d4809e311f60fdbad817ebfade256db549b6
---
 db/c.cc                                       | 10 ---
 db/c_test.c                                   | 12 ---
 db/column_family.cc                           |  6 --
 db/memtable_list.cc                           | 14 +---
 db/memtable_list.h                            |  5 --
 db/memtable_list_test.cc                      | 12 +--
 db_stress_tool/db_stress_common.h             |  1 -
 db_stress_tool/db_stress_gflags.cc            | 14 ----
 db_stress_tool/db_stress_test_base.cc         |  2 -
 include/rocksdb/advanced_options.h            |  9 ---
 include/rocksdb/c.h                           |  5 --
 java/rocksjni/options.cc                      | 45 -----------
 .../AdvancedColumnFamilyOptionsInterface.java | 47 -----------
 .../java/org/rocksdb/ColumnFamilyOptions.java | 16 ----
 java/src/main/java/org/rocksdb/Options.java   | 16 ----
 .../main/java/org/rocksdb/Transaction.java    | 77 +++++++------------
 .../org/rocksdb/ColumnFamilyOptionsTest.java  | 13 ----
 .../MutableColumnFamilyOptionsTest.java       | 68 ++++++++++------
 .../test/java/org/rocksdb/OptionsTest.java    | 13 ----
 options/cf_options.cc                         |  6 +-
 options/cf_options.h                          |  2 -
 options/options.cc                            |  4 -
 options/options_helper.cc                     |  2 -
 options/options_settable_test.cc              |  1 -
 options/options_test.cc                       |  2 -
 table/multiget_context.h                      |  4 +-
 test_util/testutil.cc                         |  1 -
 tools/db_bench_tool.cc                        | 16 ----
 tools/db_bench_tool_test.cc                   |  1 -
 ...ete_max_write_buffer_number_to_maintain.md |  1 +
 .../optimistic_transaction_db_impl.cc         |  3 +-
 .../pessimistic_transaction_db.cc             |  3 +-
 .../write_prepared_transaction_test.cc        |  3 +-
 33 files changed, 85 insertions(+), 349 deletions(-)
 create mode 100644 unreleased_history/public_api_changes/delete_max_write_buffer_number_to_maintain.md

diff --git a/db/c.cc b/db/c.cc
index b101540ffa1b..819d928193e7 100644
--- a/db/c.cc
+++ b/db/c.cc
@@ -3804,16 +3804,6 @@ int rocksdb_options_get_min_write_buffer_number_to_merge(
   return opt->rep.min_write_buffer_number_to_merge;
 }
 
-void rocksdb_options_set_max_write_buffer_number_to_maintain(
-    rocksdb_options_t* opt, int n) {
-  opt->rep.max_write_buffer_number_to_maintain = n;
-}
-
-int rocksdb_options_get_max_write_buffer_number_to_maintain(
-    rocksdb_options_t* opt) {
-  return opt->rep.max_write_buffer_number_to_maintain;
-}
-
 void rocksdb_options_set_max_write_buffer_size_to_maintain(
     rocksdb_options_t* opt, int64_t n) {
   opt->rep.max_write_buffer_size_to_maintain = n;
diff --git a/db/c_test.c b/db/c_test.c
index 18bf2961ded3..373bdcc6d43b 100644
--- a/db/c_test.c
+++ b/db/c_test.c
@@ -2146,10 +2146,6 @@ int main(int argc, char** argv) {
     CheckCondition(23 ==
                    rocksdb_options_get_min_write_buffer_number_to_merge(o));
 
-    rocksdb_options_set_max_write_buffer_number_to_maintain(o, 64);
-    CheckCondition(64 ==
-                   rocksdb_options_get_max_write_buffer_number_to_maintain(o));
-
     rocksdb_options_set_max_write_buffer_size_to_maintain(o, 50000);
     CheckCondition(50000 ==
                    rocksdb_options_get_max_write_buffer_size_to_maintain(o));
@@ -2407,8 +2403,6 @@ int main(int argc, char** argv) {
     CheckCondition(97 == rocksdb_options_get_max_write_buffer_number(copy));
     CheckCondition(23 ==
                    rocksdb_options_get_min_write_buffer_number_to_merge(copy));
-    CheckCondition(
-        64 == rocksdb_options_get_max_write_buffer_number_to_maintain(copy));
     CheckCondition(50000 ==
                    rocksdb_options_get_max_write_buffer_size_to_maintain(copy));
     CheckCondition(1 == rocksdb_options_get_enable_pipelined_write(copy));
@@ -2596,12 +2590,6 @@ int main(int argc, char** argv) {
     CheckCondition(23 ==
                    rocksdb_options_get_min_write_buffer_number_to_merge(o));
 
-    rocksdb_options_set_max_write_buffer_number_to_maintain(copy, 128);
-    CheckCondition(
-        128 == rocksdb_options_get_max_write_buffer_number_to_maintain(copy));
-    CheckCondition(64 ==
-                   rocksdb_options_get_max_write_buffer_number_to_maintain(o));
-
     rocksdb_options_set_max_write_buffer_size_to_maintain(copy, 9000);
     CheckCondition(9000 ==
                    rocksdb_options_get_max_write_buffer_size_to_maintain(copy));
diff --git a/db/column_family.cc b/db/column_family.cc
index ffb89c75408a..3232834ca320 100644
--- a/db/column_family.cc
+++ b/db/column_family.cc
@@ -262,15 +262,10 @@ ColumnFamilyOptions SanitizeCfOptions(const ImmutableDBOptions& db_options,
   if (result.max_write_buffer_number < 2) {
     result.max_write_buffer_number = 2;
   }
-  // fall back max_write_buffer_number_to_maintain if
-  // max_write_buffer_size_to_maintain is not set
   if (result.max_write_buffer_size_to_maintain < 0) {
     result.max_write_buffer_size_to_maintain =
         result.max_write_buffer_number *
         static_cast<int64_t>(result.write_buffer_size);
-  } else if (result.max_write_buffer_size_to_maintain == 0 &&
-             result.max_write_buffer_number_to_maintain < 0) {
-    result.max_write_buffer_number_to_maintain = result.max_write_buffer_number;
   }
   // bloom filter size shouldn't exceed 1/4 of memtable size.
   if (result.memtable_prefix_bloom_size_ratio > 0.25) {
@@ -577,7 +572,6 @@ ColumnFamilyData::ColumnFamilyData(
       write_buffer_manager_(write_buffer_manager),
       mem_(nullptr),
       imm_(ioptions_.min_write_buffer_number_to_merge,
-           ioptions_.max_write_buffer_number_to_maintain,
            ioptions_.max_write_buffer_size_to_maintain),
       super_version_(nullptr),
       super_version_number_(0),
diff --git a/db/memtable_list.cc b/db/memtable_list.cc
index 2643110a13c3..4e6587792971 100644
--- a/db/memtable_list.cc
+++ b/db/memtable_list.cc
@@ -51,9 +51,7 @@ void MemTableListVersion::UnrefMemTable(
 
 MemTableListVersion::MemTableListVersion(
     size_t* parent_memtable_list_memory_usage, const MemTableListVersion& old)
-    : max_write_buffer_number_to_maintain_(
-          old.max_write_buffer_number_to_maintain_),
-      max_write_buffer_size_to_maintain_(
+    : max_write_buffer_size_to_maintain_(
           old.max_write_buffer_size_to_maintain_),
       parent_memtable_list_memory_usage_(parent_memtable_list_memory_usage) {
   memlist_ = old.memlist_;
@@ -69,10 +67,8 @@ MemTableListVersion::MemTableListVersion(
 
 MemTableListVersion::MemTableListVersion(
     size_t* parent_memtable_list_memory_usage,
-    int max_write_buffer_number_to_maintain,
     int64_t max_write_buffer_size_to_maintain)
-    : max_write_buffer_number_to_maintain_(max_write_buffer_number_to_maintain),
-      max_write_buffer_size_to_maintain_(max_write_buffer_size_to_maintain),
+    : max_write_buffer_size_to_maintain_(max_write_buffer_size_to_maintain),
       parent_memtable_list_memory_usage_(parent_memtable_list_memory_usage) {}
 
 void MemTableListVersion::Ref() { ++refs_; }
@@ -323,8 +319,7 @@ void MemTableListVersion::Remove(ReadOnlyMemTable* m,
   memlist_.remove(m);
 
   m->MarkFlushed();
-  if (max_write_buffer_size_to_maintain_ > 0 ||
-      max_write_buffer_number_to_maintain_ > 0) {
+  if (max_write_buffer_size_to_maintain_ > 0) {
     memlist_history_.push_front(m);
     // Unable to get size of mutable memtable at this point, pass 0 to
     // TrimHistory as a best effort.
@@ -356,9 +351,6 @@ bool MemTableListVersion::MemtableLimitExceeded(size_t usage) {
     // whether to trim history
     return MemoryAllocatedBytesExcludingLast() + usage >=
            static_cast<size_t>(max_write_buffer_size_to_maintain_);
-  } else if (max_write_buffer_number_to_maintain_ > 0) {
-    return memlist_.size() + memlist_history_.size() >
-           static_cast<size_t>(max_write_buffer_number_to_maintain_);
   } else {
     return false;
   }
diff --git a/db/memtable_list.h b/db/memtable_list.h
index 155878bdc268..4d06421ba41c 100644
--- a/db/memtable_list.h
+++ b/db/memtable_list.h
@@ -45,7 +45,6 @@ class MemTableListVersion {
   explicit MemTableListVersion(size_t* parent_memtable_list_memory_usage,
                                const MemTableListVersion& old);
   explicit MemTableListVersion(size_t* parent_memtable_list_memory_usage,
-                               int max_write_buffer_number_to_maintain,
                                int64_t max_write_buffer_size_to_maintain);
 
   void Ref();
@@ -209,8 +208,6 @@ class MemTableListVersion {
   // (used during Transaction validation)
   std::list<ReadOnlyMemTable*> memlist_history_;
 
-  // Maximum number of MemTables to keep in memory (including both flushed
-  const int max_write_buffer_number_to_maintain_;
   // Maximum size of MemTables to keep in memory (including both flushed
   // and not-yet-flushed tables).
   const int64_t max_write_buffer_size_to_maintain_;
@@ -238,13 +235,11 @@ class MemTableList {
  public:
   // A list of memtables.
   explicit MemTableList(int min_write_buffer_number_to_merge,
-                        int max_write_buffer_number_to_maintain,
                         int64_t max_write_buffer_size_to_maintain)
       : imm_flush_needed(false),
         imm_trim_needed(false),
         min_write_buffer_number_to_merge_(min_write_buffer_number_to_merge),
         current_(new MemTableListVersion(&current_memory_usage_,
-                                         max_write_buffer_number_to_maintain,
                                          max_write_buffer_size_to_maintain)),
         num_flush_not_started_(0),
         commit_in_progress_(false),
diff --git a/db/memtable_list_test.cc b/db/memtable_list_test.cc
index cefb4653d616..97e36e00f60c 100644
--- a/db/memtable_list_test.cc
+++ b/db/memtable_list_test.cc
@@ -220,7 +220,7 @@ class MemTableListTest : public testing::Test {
 
 TEST_F(MemTableListTest, Empty) {
   // Create an empty MemTableList and validate basic functions.
-  MemTableList list(1, 0, 0);
+  MemTableList list(1, 0);
 
   ASSERT_EQ(0, list.NumNotFlushed());
   ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
@@ -239,10 +239,8 @@ TEST_F(MemTableListTest, Empty) {
 TEST_F(MemTableListTest, GetTest) {
   // Create MemTableList
   int min_write_buffer_number_to_merge = 2;
-  int max_write_buffer_number_to_maintain = 0;
   int64_t max_write_buffer_size_to_maintain = 0;
   MemTableList list(min_write_buffer_number_to_merge,
-                    max_write_buffer_number_to_maintain,
                     max_write_buffer_size_to_maintain);
 
   SequenceNumber seq = 1;
@@ -407,10 +405,8 @@ TEST_F(MemTableListTest, GetTest) {
 TEST_F(MemTableListTest, GetFromHistoryTest) {
   // Create MemTableList
   int min_write_buffer_number_to_merge = 2;
-  int max_write_buffer_number_to_maintain = 2;
   int64_t max_write_buffer_size_to_maintain = 2 * Arena::kInlineSize;
   MemTableList list(min_write_buffer_number_to_merge,
-                    max_write_buffer_number_to_maintain,
                     max_write_buffer_size_to_maintain);
 
   SequenceNumber seq = 1;
@@ -653,11 +649,9 @@ TEST_F(MemTableListTest, FlushPendingTest) {
 
   // Create MemTableList
   int min_write_buffer_number_to_merge = 3;
-  int max_write_buffer_number_to_maintain = 7;
   int64_t max_write_buffer_size_to_maintain =
       7 * static_cast<int>(options.write_buffer_size);
   MemTableList list(min_write_buffer_number_to_merge,
-                    max_write_buffer_number_to_maintain,
                     max_write_buffer_size_to_maintain);
 
   // Create some MemTables
@@ -949,13 +943,11 @@ TEST_F(MemTableListTest, AtomicFlushTest) {
 
   // Create MemTableLists
   int min_write_buffer_number_to_merge = 3;
-  int max_write_buffer_number_to_maintain = 7;
   int64_t max_write_buffer_size_to_maintain =
       7 * static_cast<int64_t>(options.write_buffer_size);
   autovector<MemTableList*> lists;
   for (int i = 0; i != num_cfs; ++i) {
     lists.emplace_back(new MemTableList(min_write_buffer_number_to_merge,
-                                        max_write_buffer_number_to_maintain,
                                         max_write_buffer_size_to_maintain));
   }
 
@@ -1104,11 +1096,9 @@ TEST_F(MemTableListWithTimestampTest, GetTableNewestUDT) {
 
   // Create MemTableList
   int min_write_buffer_number_to_merge = 1;
-  int max_write_buffer_number_to_maintain = 4;
   int64_t max_write_buffer_size_to_maintain =
       4 * static_cast<int>(options.write_buffer_size);
   MemTableList list(min_write_buffer_number_to_merge,
-                    max_write_buffer_number_to_maintain,
                     max_write_buffer_size_to_maintain);
 
   // Create some MemTables
diff --git a/db_stress_tool/db_stress_common.h b/db_stress_tool/db_stress_common.h
index 0871a87f9e70..0e392fb575e5 100644
--- a/db_stress_tool/db_stress_common.h
+++ b/db_stress_tool/db_stress_common.h
@@ -106,7 +106,6 @@ DECLARE_uint64(db_write_buffer_size);
 DECLARE_int32(write_buffer_size);
 DECLARE_int32(max_write_buffer_number);
 DECLARE_int32(min_write_buffer_number_to_merge);
-DECLARE_int32(max_write_buffer_number_to_maintain);
 DECLARE_int64(max_write_buffer_size_to_maintain);
 DECLARE_bool(use_write_buffer_manager);
 DECLARE_double(memtable_prefix_bloom_size_ratio);
diff --git a/db_stress_tool/db_stress_gflags.cc b/db_stress_tool/db_stress_gflags.cc
index 94028f07b40c..1650b989b102 100644
--- a/db_stress_tool/db_stress_gflags.cc
+++ b/db_stress_tool/db_stress_gflags.cc
@@ -168,20 +168,6 @@ DEFINE_int32(min_write_buffer_number_to_merge,
              "writing less data to storage if there are duplicate records in"
              " each of these individual write buffers.");
 
-DEFINE_int32(max_write_buffer_number_to_maintain,
-             ROCKSDB_NAMESPACE::Options().max_write_buffer_number_to_maintain,
-             "The total maximum number of write buffers to maintain in memory "
-             "including copies of buffers that have already been flushed. "
-             "Unlike max_write_buffer_number, this parameter does not affect "
-             "flushing. This controls the minimum amount of write history "
-             "that will be available in memory for conflict checking when "
-             "Transactions are used. If this value is too low, some "
-             "transactions may fail at commit time due to not being able to "
-             "determine whether there were any write conflicts. Setting this "
-             "value to 0 will cause write buffers to be freed immediately "
-             "after they are flushed.  If this value is set to -1, "
-             "'max_write_buffer_number' will be used.");
-
 DEFINE_int64(max_write_buffer_size_to_maintain,
              ROCKSDB_NAMESPACE::Options().max_write_buffer_size_to_maintain,
              "The total maximum size of write buffers to maintain in memory "
diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc
index 8403ee3e9c4b..40cd731a4b98 100644
--- a/db_stress_tool/db_stress_test_base.cc
+++ b/db_stress_tool/db_stress_test_base.cc
@@ -4047,8 +4047,6 @@ void InitializeOptionsFromFlags(
   options.max_write_buffer_number = FLAGS_max_write_buffer_number;
   options.min_write_buffer_number_to_merge =
       FLAGS_min_write_buffer_number_to_merge;
-  options.max_write_buffer_number_to_maintain =
-      FLAGS_max_write_buffer_number_to_maintain;
   options.max_write_buffer_size_to_maintain =
       FLAGS_max_write_buffer_size_to_maintain;
   options.memtable_prefix_bloom_size_ratio =
diff --git a/include/rocksdb/advanced_options.h b/include/rocksdb/advanced_options.h
index 369603e7f7c8..15268d457af9 100644
--- a/include/rocksdb/advanced_options.h
+++ b/include/rocksdb/advanced_options.h
@@ -171,15 +171,6 @@ struct AdvancedColumnFamilyOptions {
   // Default: 1
   int min_write_buffer_number_to_merge = 1;
 
-  // DEPRECATED
-  // The total maximum number of write buffers to maintain in memory including
-  // copies of buffers that have already been flushed.  Unlike
-  // max_write_buffer_number, this parameter does not affect flushing.
-  // This parameter is being replaced by max_write_buffer_size_to_maintain.
-  // If both parameters are set to non-zero values, this parameter will be
-  // ignored.
-  int max_write_buffer_number_to_maintain = 0;
-
   // The target number of write history bytes to hold in memory. Write history
   // comprises the latest write buffers (memtables). To reach the target, write
   // buffers that were most recently flushed to SST files may be retained in
diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h
index f2616ea3e7f8..9ec09defb85d 100644
--- a/include/rocksdb/c.h
+++ b/include/rocksdb/c.h
@@ -1448,11 +1448,6 @@ rocksdb_options_set_min_write_buffer_number_to_merge(rocksdb_options_t*, int);
 extern ROCKSDB_LIBRARY_API int
 rocksdb_options_get_min_write_buffer_number_to_merge(rocksdb_options_t*);
 extern ROCKSDB_LIBRARY_API void
-rocksdb_options_set_max_write_buffer_number_to_maintain(rocksdb_options_t*,
-                                                        int);
-extern ROCKSDB_LIBRARY_API int
-rocksdb_options_get_max_write_buffer_number_to_maintain(rocksdb_options_t*);
-extern ROCKSDB_LIBRARY_API void
 rocksdb_options_set_max_write_buffer_size_to_maintain(rocksdb_options_t*,
                                                       int64_t);
 extern ROCKSDB_LIBRARY_API int64_t
diff --git a/java/rocksjni/options.cc b/java/rocksjni/options.cc
index c986511a3f2f..8f6f1903e326 100644
--- a/java/rocksjni/options.cc
+++ b/java/rocksjni/options.cc
@@ -2456,28 +2456,6 @@ void Java_org_rocksdb_Options_setMinWriteBufferNumberToMerge(
       ->min_write_buffer_number_to_merge =
       static_cast<int>(jmin_write_buffer_number_to_merge);
 }
-/*
- * Class:     org_rocksdb_Options
- * Method:    maxWriteBufferNumberToMaintain
- * Signature: (J)I
- */
-jint Java_org_rocksdb_Options_maxWriteBufferNumberToMaintain(JNIEnv*, jclass,
-                                                             jlong jhandle) {
-  return reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
-      ->max_write_buffer_number_to_maintain;
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    setMaxWriteBufferNumberToMaintain
- * Signature: (JI)V
- */
-void Java_org_rocksdb_Options_setMaxWriteBufferNumberToMaintain(
-    JNIEnv*, jclass, jlong jhandle, jint jmax_write_buffer_number_to_maintain) {
-  reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle)
-      ->max_write_buffer_number_to_maintain =
-      static_cast<int>(jmax_write_buffer_number_to_maintain);
-}
 
 /*
  * Class:     org_rocksdb_Options
@@ -4496,29 +4474,6 @@ void Java_org_rocksdb_ColumnFamilyOptions_setMinWriteBufferNumberToMerge(
       static_cast<int>(jmin_write_buffer_number_to_merge);
 }
 
-/*
- * Class:     org_rocksdb_ColumnFamilyOptions
- * Method:    maxWriteBufferNumberToMaintain
- * Signature: (J)I
- */
-jint Java_org_rocksdb_ColumnFamilyOptions_maxWriteBufferNumberToMaintain(
-    JNIEnv*, jclass, jlong jhandle) {
-  return reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
-      ->max_write_buffer_number_to_maintain;
-}
-
-/*
- * Class:     org_rocksdb_ColumnFamilyOptions
- * Method:    setMaxWriteBufferNumberToMaintain
- * Signature: (JI)V
- */
-void Java_org_rocksdb_ColumnFamilyOptions_setMaxWriteBufferNumberToMaintain(
-    JNIEnv*, jclass, jlong jhandle, jint jmax_write_buffer_number_to_maintain) {
-  reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle)
-      ->max_write_buffer_number_to_maintain =
-      static_cast<int>(jmax_write_buffer_number_to_maintain);
-}
-
 /*
  * Class:     org_rocksdb_ColumnFamilyOptions
  * Method:    setCompressionType
diff --git a/java/src/main/java/org/rocksdb/AdvancedColumnFamilyOptionsInterface.java b/java/src/main/java/org/rocksdb/AdvancedColumnFamilyOptionsInterface.java
index d1d1123dded4..867f5ca959bd 100644
--- a/java/src/main/java/org/rocksdb/AdvancedColumnFamilyOptionsInterface.java
+++ b/java/src/main/java/org/rocksdb/AdvancedColumnFamilyOptionsInterface.java
@@ -44,53 +44,6 @@ T setMinWriteBufferNumberToMerge(
    */
   int minWriteBufferNumberToMerge();
 
-  /**
-   * The total maximum number of write buffers to maintain in memory including
-   * copies of buffers that have already been flushed.  Unlike
-   * {@link AdvancedMutableColumnFamilyOptionsInterface#maxWriteBufferNumber()},
-   * this parameter does not affect flushing.
-   * This controls the minimum amount of write history that will be available
-   * in memory for conflict checking when Transactions are used.
-   * <p>
-   * When using an OptimisticTransactionDB:
-   * If this value is too low, some transactions may fail at commit time due
-   * to not being able to determine whether there were any write conflicts.
-   * <p>
-   * When using a TransactionDB:
-   * If Transaction::SetSnapshot is used, TransactionDB will read either
-   * in-memory write buffers or SST files to do write-conflict checking.
-   * Increasing this value can reduce the number of reads to SST files
-   * done for conflict detection.
-   * <p>
-   * Setting this value to 0 will cause write buffers to be freed immediately
-   * after they are flushed.
-   * If this value is set to -1,
-   * {@link AdvancedMutableColumnFamilyOptionsInterface#maxWriteBufferNumber()}
-   * will be used.
-   * <p>
-   * Default:
-   * If using a TransactionDB/OptimisticTransactionDB, the default value will
-   * be set to the value of
-   * {@link AdvancedMutableColumnFamilyOptionsInterface#maxWriteBufferNumber()}
-   * if it is not explicitly set by the user. Otherwise, the default is 0.
-   *
-   * @param maxWriteBufferNumberToMaintain The maximum number of write
-   *     buffers to maintain
-   *
-   * @return the reference to the current options.
-   */
-  T setMaxWriteBufferNumberToMaintain(
-      int maxWriteBufferNumberToMaintain);
-
-  /**
-   * The total maximum number of write buffers to maintain in memory including
-   * copies of buffers that have already been flushed.
-   *
-   * @return maxWriteBufferNumberToMaintain The maximum number of write buffers
-   *     to maintain
-   */
-  int maxWriteBufferNumberToMaintain();
-
   /**
    * Allows thread-safe inplace updates.
    * If inplace_callback function is not set,
diff --git a/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java b/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java
index 3af4d2a8ed6f..d25f8c73bc7b 100644
--- a/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java
+++ b/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java
@@ -835,19 +835,6 @@ public boolean paranoidFileChecks() {
     return paranoidFileChecks(nativeHandle_);
   }
 
-  @Override
-  public ColumnFamilyOptions setMaxWriteBufferNumberToMaintain(
-      final int maxWriteBufferNumberToMaintain) {
-    setMaxWriteBufferNumberToMaintain(
-        nativeHandle_, maxWriteBufferNumberToMaintain);
-    return this;
-  }
-
-  @Override
-  public int maxWriteBufferNumberToMaintain() {
-    return maxWriteBufferNumberToMaintain(nativeHandle_);
-  }
-
   @Override
   public ColumnFamilyOptions setCompactionPriority(
       final CompactionPriority compactionPriority) {
@@ -1467,9 +1454,6 @@ private static native void setMaxBytesForLevelMultiplierAdditional(
   private static native int[] maxBytesForLevelMultiplierAdditional(long handle);
   private static native void setParanoidFileChecks(long handle, boolean paranoidFileChecks);
   private static native boolean paranoidFileChecks(long handle);
-  private static native void setMaxWriteBufferNumberToMaintain(
-      final long handle, final int maxWriteBufferNumberToMaintain);
-  private static native int maxWriteBufferNumberToMaintain(final long handle);
   private static native void setCompactionPriority(
       final long handle, final byte compactionPriority);
   private static native byte compactionPriority(final long handle);
diff --git a/java/src/main/java/org/rocksdb/Options.java b/java/src/main/java/org/rocksdb/Options.java
index c184e140f602..675837df7a09 100644
--- a/java/src/main/java/org/rocksdb/Options.java
+++ b/java/src/main/java/org/rocksdb/Options.java
@@ -1762,19 +1762,6 @@ public boolean paranoidFileChecks() {
     return paranoidFileChecks(nativeHandle_);
   }
 
-  @Override
-  public Options setMaxWriteBufferNumberToMaintain(
-      final int maxWriteBufferNumberToMaintain) {
-    setMaxWriteBufferNumberToMaintain(
-        nativeHandle_, maxWriteBufferNumberToMaintain);
-    return this;
-  }
-
-  @Override
-  public int maxWriteBufferNumberToMaintain() {
-    return maxWriteBufferNumberToMaintain(nativeHandle_);
-  }
-
   @Override
   public Options setCompactionPriority(
       final CompactionPriority compactionPriority) {
@@ -2443,9 +2430,6 @@ private static native void setMaxBytesForLevelMultiplierAdditional(
   private static native int[] maxBytesForLevelMultiplierAdditional(long handle);
   private static native void setParanoidFileChecks(long handle, boolean paranoidFileChecks);
   private static native boolean paranoidFileChecks(long handle);
-  private static native void setMaxWriteBufferNumberToMaintain(
-      final long handle, final int maxWriteBufferNumberToMaintain);
-  private static native int maxWriteBufferNumberToMaintain(final long handle);
   private static native void setCompactionPriority(
       final long handle, final byte compactionPriority);
   private static native byte compactionPriority(final long handle);
diff --git a/java/src/main/java/org/rocksdb/Transaction.java b/java/src/main/java/org/rocksdb/Transaction.java
index d1ddcbcbe6c7..12e4082c11b0 100644
--- a/java/src/main/java/org/rocksdb/Transaction.java
+++ b/java/src/main/java/org/rocksdb/Transaction.java
@@ -203,7 +203,7 @@ public void prepare() throws RocksDBException {
    * Status::Busy() may be returned if the transaction could not guarantee
    * that there are no write conflicts. Status::TryAgain() may be returned
    * if the memtable history size is not large enough
-   *  (See max_write_buffer_number_to_maintain).
+   *  (See max_write_buffer_size_to_maintain).
    * <p>
    * If this transaction was created by a {@link TransactionDB},
    * Status::Expired() may be returned if this transaction has lived for
@@ -689,8 +689,7 @@ public List<byte[]> multiGetAsList(final ReadOptions readOptions, final List<byt
    *     {@link Status.Code#Busy} if there is a write conflict,
    *     {@link Status.Code#TimedOut} if a lock could not be acquired,
    *     {@link Status.Code#TryAgain} if the memtable history size is not large
-   *         enough. See
-   *         {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+   *         enough.
    *     {@link Status.Code#MergeInProgress} if merge operations cannot be
    *     resolved.
    *
@@ -770,8 +769,7 @@ public byte[] getForUpdate(final ReadOptions readOptions,
    *     {@link Status.Code#Busy} if there is a write conflict,
    *     {@link Status.Code#TimedOut} if a lock could not be acquired,
    *     {@link Status.Code#TryAgain} if the memtable history size is not large
-   *         enough. See
-   *         {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+   *         enough.
    *     {@link Status.Code#MergeInProgress} if merge operations cannot be
    *     resolved.
    *
@@ -821,8 +819,7 @@ public byte[] getForUpdate(final ReadOptions readOptions, final byte[] key,
    *     {@link Status.Code#Busy} if there is a write conflict,
    *     {@link Status.Code#TimedOut} if a lock could not be acquired,
    *     {@link Status.Code#TryAgain} if the memtable history size is not large
-   *         enough. See
-   *         {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+   *         enough.
    *     {@link Status.Code#MergeInProgress} if merge operations cannot be
    *     resolved.
    *
@@ -880,8 +877,7 @@ public GetStatus getForUpdate(final ReadOptions readOptions, final byte[] key, f
    *     {@link Status.Code#Busy} if there is a write conflict,
    *     {@link Status.Code#TimedOut} if a lock could not be acquired,
    *     {@link Status.Code#TryAgain} if the memtable history size is not large
-   *         enough. See
-   *         {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+   *         enough.
    *     {@link Status.Code#MergeInProgress} if merge operations cannot be
    *     resolved.
    *
@@ -933,8 +929,7 @@ public GetStatus getForUpdate(final ReadOptions readOptions, final ByteBuffer ke
    *     {@link Status.Code#Busy} if there is a write conflict,
    *     {@link Status.Code#TimedOut} if a lock could not be acquired,
    *     {@link Status.Code#TryAgain} if the memtable history size is not large
-   *         enough. See
-   *         {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+   *         enough.
    *     {@link Status.Code#MergeInProgress} if merge operations cannot be
    *     resolved.
    *
@@ -988,8 +983,7 @@ public GetStatus getForUpdate(final ReadOptions readOptions,
    *     {@link Status.Code#Busy} if there is a write conflict,
    *     {@link Status.Code#TimedOut} if a lock could not be acquired,
    *     {@link Status.Code#TryAgain} if the memtable history size is not large
-   *         enough. See
-   *         {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+   *         enough.
    *     {@link Status.Code#MergeInProgress} if merge operations cannot be
    *     resolved.
    *
@@ -1050,8 +1044,7 @@ public GetStatus getForUpdate(final ReadOptions readOptions,
    *     {@link Status.Code#Busy} if there is a write conflict,
    *     {@link Status.Code#TimedOut} if a lock could not be acquired,
    *     {@link Status.Code#TryAgain} if the memtable history size is not large
-   *         enough. See
-   *         {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+   *         enough.
    *     {@link Status.Code#MergeInProgress} if merge operations cannot be
    *     resolved.
    *
@@ -1106,8 +1099,7 @@ public GetStatus getForUpdate(final ReadOptions readOptions,
    *     {@link Status.Code#Busy} if there is a write conflict,
    *     {@link Status.Code#TimedOut} if a lock could not be acquired,
    *     {@link Status.Code#TryAgain} if the memtable history size is not large
-   *         enough. See
-   *         {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+   *         enough.
    *     {@link Status.Code#MergeInProgress} if merge operations cannot be
    *     resolved.
    *
@@ -1393,8 +1385,7 @@ public RocksIterator getIterator(final ColumnFamilyHandle columnFamilyHandle) {
    *     {@link Status.Code#Busy} if there is a write conflict,
    *     {@link Status.Code#TimedOut} if a lock could not be acquired,
    *     {@link Status.Code#TryAgain} if the memtable history size is not large
-   *         enough. See
-   *         {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+   *         enough.
    *
    * @param columnFamilyHandle The column family to put the key/value into
    * @param key the specified key to be inserted.
@@ -1430,8 +1421,7 @@ public void put(final ColumnFamilyHandle columnFamilyHandle, final byte[] key,
    *     {@link Status.Code#Busy} if there is a write conflict,
    *     {@link Status.Code#TimedOut} if a lock could not be acquired,
    *     {@link Status.Code#TryAgain} if the memtable history size is not large
-   *         enough. See
-   *         {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+   *         enough.
    *
    * @param columnFamilyHandle The column family to put the key/value into
    * @param key the specified key to be inserted.
@@ -1460,8 +1450,7 @@ public void put(final ColumnFamilyHandle columnFamilyHandle, final byte[] key,
    *    {@link Status.Code#Busy} if there is a write conflict,
    *    {@link Status.Code#TimedOut} if a lock could not be acquired,
    *    {@link Status.Code#TryAgain} if the memtable history size is not large
-   *       enough. See
-   *       {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+   *       enough.
    *
    * @param key the specified key to be inserted.
    * @param value the value associated with the specified key.
@@ -1536,8 +1525,7 @@ public void put(final ColumnFamilyHandle columnFamilyHandle,
    *    {@link Status.Code#Busy} if there is a write conflict,
    *    {@link Status.Code#TimedOut} if a lock could not be acquired,
    *    {@link Status.Code#TryAgain} if the memtable history size is not large
-   *       enough. See
-   *       {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+   *       enough.
    *
    * @param key the specified key to be inserted.
    * @param value the value associated with the specified key.
@@ -1575,8 +1563,7 @@ public void put(final ByteBuffer key, final ByteBuffer value) throws RocksDBExce
    *    {@link Status.Code#Busy} if there is a write conflict,
    *    {@link Status.Code#TimedOut} if a lock could not be acquired,
    *    {@link Status.Code#TryAgain} if the memtable history size is not large
-   *       enough. See
-   *       {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+   *       enough.
    *
    * @param columnFamilyHandle The column family to put the key/value into
    * @param key the specified key to be inserted.
@@ -1645,8 +1632,7 @@ public void put(final byte[][] keyParts, final byte[][] valueParts)
    *    {@link Status.Code#Busy} if there is a write conflict,
    *    {@link Status.Code#TimedOut} if a lock could not be acquired,
    *    {@link Status.Code#TryAgain} if the memtable history size is not large
-   *       enough. See
-   *       {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+   *       enough.
    *
    * @param columnFamilyHandle The column family to merge the key/value into
    * @param key the specified key to be merged.
@@ -1683,8 +1669,7 @@ public void merge(final ColumnFamilyHandle columnFamilyHandle,
    *    {@link Status.Code#Busy} if there is a write conflict,
    *    {@link Status.Code#TimedOut} if a lock could not be acquired,
    *    {@link Status.Code#TryAgain} if the memtable history size is not large
-   *       enough. See
-   *       {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+   *       enough.
    *
    * @param columnFamilyHandle The column family to merge the key/value into
    * @param key the specified key to be merged.
@@ -1713,8 +1698,7 @@ public void merge(final ColumnFamilyHandle columnFamilyHandle,
    *    {@link Status.Code#Busy} if there is a write conflict,
    *    {@link Status.Code#TimedOut} if a lock could not be acquired,
    *    {@link Status.Code#TryAgain} if the memtable history size is not large
-   *       enough. See
-   *       {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+   *       enough.
    *
    * @param key the specified key to be merged.
    * @param value the value associated with the specified key.
@@ -1741,8 +1725,7 @@ public void merge(final byte[] key, final byte[] value)
    *    {@link Status.Code#Busy} if there is a write conflict,
    *    {@link Status.Code#TimedOut} if a lock could not be acquired,
    *    {@link Status.Code#TryAgain} if the memtable history size is not large
-   *       enough. See
-   *       {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+   *       enough.
    *
    * @param key the specified key to be merged.
    * @param value the value associated with the specified key.
@@ -1778,8 +1761,7 @@ public void merge(final ByteBuffer key, final ByteBuffer value) throws RocksDBEx
    *    {@link Status.Code#Busy} if there is a write conflict,
    *    {@link Status.Code#TimedOut} if a lock could not be acquired,
    *    {@link Status.Code#TryAgain} if the memtable history size is not large
-   *       enough. See
-   *       {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+   *       enough.
    *
    * @param columnFamilyHandle in which to apply the merge
    * @param key the specified key to be merged.
@@ -1821,8 +1803,7 @@ public void merge(final ColumnFamilyHandle columnFamilyHandle, final ByteBuffer
    *    {@link Status.Code#Busy} if there is a write conflict,
    *    {@link Status.Code#TimedOut} if a lock could not be acquired,
    *    {@link Status.Code#TryAgain} if the memtable history size is not large
-   *       enough. See
-   *       {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+   *       enough.
    *
    * @param columnFamilyHandle in which to apply the merge
    * @param key the specified key to be merged.
@@ -1849,8 +1830,7 @@ public void merge(final ColumnFamilyHandle columnFamilyHandle, final ByteBuffer
    *    {@link Status.Code#Busy} if there is a write conflict,
    *    {@link Status.Code#TimedOut} if a lock could not be acquired,
    *    {@link Status.Code#TryAgain} if the memtable history size is not large
-   *       enough. See
-   *       {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+   *       enough.
    *
    * @param columnFamilyHandle The column family to delete the key/value from
    * @param key the specified key to be deleted.
@@ -1885,8 +1865,7 @@ public void delete(final ColumnFamilyHandle columnFamilyHandle,
    *    {@link Status.Code#Busy} if there is a write conflict,
    *    {@link Status.Code#TimedOut} if a lock could not be acquired,
    *    {@link Status.Code#TryAgain} if the memtable history size is not large
-   *       enough. See
-   *       {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+   *       enough.
    *
    * @param columnFamilyHandle The column family to delete the key/value from
    * @param key the specified key to be deleted.
@@ -1914,8 +1893,7 @@ public void delete(final ColumnFamilyHandle columnFamilyHandle,
    *    {@link Status.Code#Busy} if there is a write conflict,
    *    {@link Status.Code#TimedOut} if a lock could not be acquired,
    *    {@link Status.Code#TryAgain} if the memtable history size is not large
-   *       enough. See
-   *       {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+   *       enough.
    *
    * @param key the specified key to be deleted.
    *
@@ -2001,8 +1979,7 @@ public void delete(final byte[][] keyParts) throws RocksDBException {
    *    {@link Status.Code#Busy} if there is a write conflict,
    *    {@link Status.Code#TimedOut} if a lock could not be acquired,
    *    {@link Status.Code#TryAgain} if the memtable history size is not large
-   *       enough. See
-   *       {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+   *       enough.
    *
    * @param columnFamilyHandle The column family to delete the key/value from
    * @param key the specified key to be deleted.
@@ -2038,8 +2015,7 @@ public void singleDelete(final ColumnFamilyHandle columnFamilyHandle, final byte
    *    {@link Status.Code#Busy} if there is a write conflict,
    *    {@link Status.Code#TimedOut} if a lock could not be acquired,
    *    {@link Status.Code#TryAgain} if the memtable history size is not large
-   *       enough. See
-   *       {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+   *       enough.
    *
    * @param columnFamilyHandle The column family to delete the key/value from
    * @param key the specified key to be deleted.
@@ -2068,8 +2044,7 @@ public void singleDelete(final ColumnFamilyHandle columnFamilyHandle, final byte
    *    {@link Status.Code#Busy} if there is a write conflict,
    *    {@link Status.Code#TimedOut} if a lock could not be acquired,
    *    {@link Status.Code#TryAgain} if the memtable history size is not large
-   *       enough. See
-   *       {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+   *       enough.
    *
    * @param key the specified key to be deleted.
    *
diff --git a/java/src/test/java/org/rocksdb/ColumnFamilyOptionsTest.java b/java/src/test/java/org/rocksdb/ColumnFamilyOptionsTest.java
index 35a04a697f84..c345e80c030f 100644
--- a/java/src/test/java/org/rocksdb/ColumnFamilyOptionsTest.java
+++ b/java/src/test/java/org/rocksdb/ColumnFamilyOptionsTest.java
@@ -561,19 +561,6 @@ public void maxTableFilesSizeFIFO() {
     }
   }
 
-  @Test
-  public void maxWriteBufferNumberToMaintain() {
-    try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) {
-      int intValue = rand.nextInt();
-      // Size has to be positive
-      intValue = (intValue < 0) ? -intValue : intValue;
-      intValue = (intValue == 0) ? intValue + 1 : intValue;
-      opt.setMaxWriteBufferNumberToMaintain(intValue);
-      assertThat(opt.maxWriteBufferNumberToMaintain()).
-          isEqualTo(intValue);
-    }
-  }
-
   @Test
   public void compactionPriorities() {
     try (final ColumnFamilyOptions opt = new ColumnFamilyOptions()) {
diff --git a/java/src/test/java/org/rocksdb/MutableColumnFamilyOptionsTest.java b/java/src/test/java/org/rocksdb/MutableColumnFamilyOptionsTest.java
index d858a150dfc9..58e3f4be21fd 100644
--- a/java/src/test/java/org/rocksdb/MutableColumnFamilyOptionsTest.java
+++ b/java/src/test/java/org/rocksdb/MutableColumnFamilyOptionsTest.java
@@ -96,30 +96,54 @@ public void mutableColumnFamilyOptions_parse() {
   public void mutableColumnFamilyOptions_parse_getOptions_output() {
     final String optionsString =
         "bottommost_compression=kDisableCompressionOption;  sample_for_compression=0;  "
-        + "blob_garbage_collection_age_cutoff=0.250000;  blob_garbage_collection_force_threshold=0.800000;"
-        + "arena_block_size=1048576;  enable_blob_garbage_collection=false;  level0_stop_writes_trigger=36;  min_blob_size=65536;"
-        + "blob_compaction_readahead_size=262144;  blob_file_starting_level=5;  prepopulate_blob_cache=kDisable;"
-        + "compaction_options_universal={allow_trivial_move=false;stop_style=kCompactionStopStyleTotalSize;min_merge_width=2;"
-        + "compression_size_percent=-1;max_size_amplification_percent=200;max_merge_width=4294967295;size_ratio=1;};  "
-        + "target_file_size_base=67108864;  max_bytes_for_level_base=268435456;  memtable_whole_key_filtering=false;  "
-        + "soft_pending_compaction_bytes_limit=68719476736;  blob_compression_type=kNoCompression;  max_write_buffer_number=2;  "
-        + "ttl=2592000;  compaction_options_fifo={allow_compaction=false;age_for_warm=0;max_table_files_size=1073741824;};  "
-        + "check_flush_compaction_key_order=true;  max_successive_merges=0;  inplace_update_num_locks=10000;  "
-        + "bottommost_compression_opts={enabled=false;parallel_threads=1;zstd_max_train_bytes=0;max_dict_bytes=0;"
+        + "blob_garbage_collection_age_cutoff=0.250000;  "
+        + "blob_garbage_collection_force_threshold=0.800000;"
+        + "arena_block_size=1048576;  enable_blob_garbage_collection=false;  "
+        + "level0_stop_writes_trigger=36;  min_blob_size=65536;"
+        + "blob_compaction_readahead_size=262144;  blob_file_starting_level=5;  "
+        + "prepopulate_blob_cache=kDisable;"
+        + "compaction_options_universal={allow_trivial_move=false;stop_style="
+        + "kCompactionStopStyleTotalSize;min_merge_width=2;"
+        + "compression_size_percent=-1;max_size_amplification_percent=200;max_merge_width="
+        + "4294967295;size_ratio=1;};  "
+        + "target_file_size_base=67108864;  max_bytes_for_level_base=268435456;  "
+        + "memtable_whole_key_filtering=false;  "
+        + "soft_pending_compaction_bytes_limit=68719476736;  blob_compression_type=kNoCompression; "
+        + " max_write_buffer_number=2;  "
+        + "ttl=2592000;  "
+        + "compaction_options_fifo={allow_compaction=false;age_for_warm=0;max_table_files_size="
+        + "1073741824;};  "
+        + "check_flush_compaction_key_order=true;  max_successive_merges=0;  "
+        + "inplace_update_num_locks=10000;  "
+        + "bottommost_compression_opts={enabled=false;parallel_threads=1;zstd_max_train_bytes=0;"
+        + "max_dict_bytes=0;"
         + "strategy=0;max_dict_buffer_bytes=0;level=32767;window_bits=-14;};  "
-        + "target_file_size_multiplier=1;  max_bytes_for_level_multiplier_additional=5:{7}:{9}:{11}:{13}:{15}:{17};  "
-        + "enable_blob_files=true;  level0_slowdown_writes_trigger=20;  compression=kLZ4HCCompression;  level0_file_num_compaction_trigger=4;  "
-        + "blob_file_size=268435456;  prefix_extractor=nullptr;  max_bytes_for_level_multiplier=10.000000;  write_buffer_size=67108864;  "
-        + "disable_auto_compactions=false;  max_compaction_bytes=1677721600;  memtable_huge_page_size=0;  "
-        + "compression_opts={enabled=false;parallel_threads=1;zstd_max_train_bytes=0;max_dict_bytes=0;strategy=0;max_dict_buffer_bytes=0;"
+        + "target_file_size_multiplier=1;  "
+        + "max_bytes_for_level_multiplier_additional=5:{7}:{9}:{11}:{13}:{15}:{17};  "
+        + "enable_blob_files=true;  level0_slowdown_writes_trigger=20;  "
+        + "compression=kLZ4HCCompression;  level0_file_num_compaction_trigger=4;  "
+        + "blob_file_size=268435456;  prefix_extractor=nullptr;  "
+        + "max_bytes_for_level_multiplier=10.000000;  write_buffer_size=67108864;  "
+        + "disable_auto_compactions=false;  max_compaction_bytes=1677721600;  "
+        + "memtable_huge_page_size=0;  "
+        + "compression_opts={enabled=false;parallel_threads=1;zstd_max_train_bytes=0;max_dict_"
+        + "bytes=0;strategy=0;max_dict_buffer_bytes=0;"
         + "level=32767;window_bits=-14;};  "
-        + "hard_pending_compaction_bytes_limit=274877906944;  periodic_compaction_seconds=0;  paranoid_file_checks=true;  "
-        + "memtable_prefix_bloom_size_ratio=7.500000;  max_sequential_skip_in_iterations=8;  report_bg_io_stats=true;  "
-        + "compaction_pri=kMinOverlappingRatio;  compaction_style=kCompactionStyleLevel;  memtable_factory=SkipListFactory;  "
-        + "comparator=leveldb.BytewiseComparator;  bloom_locality=0;  compaction_filter_factory=nullptr;  "
-        + "min_write_buffer_number_to_merge=1;  max_write_buffer_number_to_maintain=0;  compaction_filter=nullptr;  merge_operator=nullptr;  "
-        + "num_levels=7;  optimize_filters_for_hits=false;  force_consistency_checks=true;  table_factory=BlockBasedTable;  "
-        + "max_write_buffer_size_to_maintain=0;  memtable_insert_with_hint_prefix_extractor=nullptr;  level_compaction_dynamic_level_bytes=false;  "
+        + "hard_pending_compaction_bytes_limit=274877906944;  periodic_compaction_seconds=0;  "
+        + "paranoid_file_checks=true;  "
+        + "memtable_prefix_bloom_size_ratio=7.500000;  max_sequential_skip_in_iterations=8;  "
+        + "report_bg_io_stats=true;  "
+        + "compaction_pri=kMinOverlappingRatio;  compaction_style=kCompactionStyleLevel;  "
+        + "memtable_factory=SkipListFactory;  "
+        + "comparator=leveldb.BytewiseComparator;  bloom_locality=0;  "
+        + "compaction_filter_factory=nullptr;  "
+        + "min_write_buffer_number_to_merge=1;  compaction_filter=nullptr;  "
+        + "merge_operator=nullptr;  "
+        + "num_levels=7;  optimize_filters_for_hits=false;  force_consistency_checks=true;  "
+        + "table_factory=BlockBasedTable;  "
+        + "max_write_buffer_size_to_maintain=0;  "
+        + "memtable_insert_with_hint_prefix_extractor=nullptr;  "
+        + "level_compaction_dynamic_level_bytes=false;  "
         + "inplace_update_support=false;  experimental_mempurge_threshold=0.003";
 
     final MutableColumnFamilyOptions.MutableColumnFamilyOptionsBuilder cf =
diff --git a/java/src/test/java/org/rocksdb/OptionsTest.java b/java/src/test/java/org/rocksdb/OptionsTest.java
index 6615b6761477..316086c6bd40 100644
--- a/java/src/test/java/org/rocksdb/OptionsTest.java
+++ b/java/src/test/java/org/rocksdb/OptionsTest.java
@@ -1201,19 +1201,6 @@ public void statistics() {
     }
   }
 
-  @Test
-  public void maxWriteBufferNumberToMaintain() {
-    try (final Options options = new Options()) {
-      int intValue = rand.nextInt();
-      // Size has to be positive
-      intValue = (intValue < 0) ? -intValue : intValue;
-      intValue = (intValue == 0) ? intValue + 1 : intValue;
-      options.setMaxWriteBufferNumberToMaintain(intValue);
-      assertThat(options.maxWriteBufferNumberToMaintain()).
-          isEqualTo(intValue);
-    }
-  }
-
   @Test
   public void compactionPriorities() {
     try (final Options options = new Options()) {
diff --git a/options/cf_options.cc b/options/cf_options.cc
index 85ecc994f39b..f8a2e044daa1 100644
--- a/options/cf_options.cc
+++ b/options/cf_options.cc
@@ -749,9 +749,7 @@ static std::unordered_map<std::string, OptionTypeInfo>
          {0, OptionType::kInt, OptionVerificationType::kDeprecated,
           OptionTypeFlags::kNone}},
         {"max_write_buffer_number_to_maintain",
-         {offsetof(struct ImmutableCFOptions,
-                   max_write_buffer_number_to_maintain),
-          OptionType::kInt, OptionVerificationType::kNormal,
+         {0, OptionType::kInt, OptionVerificationType::kDeprecated,
           OptionTypeFlags::kNone, nullptr}},
         {"max_write_buffer_size_to_maintain",
          {offsetof(struct ImmutableCFOptions,
@@ -987,8 +985,6 @@ ImmutableCFOptions::ImmutableCFOptions(const ColumnFamilyOptions& cf_options)
       compaction_filter_factory(cf_options.compaction_filter_factory),
       min_write_buffer_number_to_merge(
           cf_options.min_write_buffer_number_to_merge),
-      max_write_buffer_number_to_maintain(
-          cf_options.max_write_buffer_number_to_maintain),
       max_write_buffer_size_to_maintain(
           cf_options.max_write_buffer_size_to_maintain),
       inplace_update_support(cf_options.inplace_update_support),
diff --git a/options/cf_options.h b/options/cf_options.h
index 51236394e342..13d5e1dbc84c 100644
--- a/options/cf_options.h
+++ b/options/cf_options.h
@@ -40,8 +40,6 @@ struct ImmutableCFOptions {
 
   int min_write_buffer_number_to_merge;
 
-  int max_write_buffer_number_to_maintain;
-
   int64_t max_write_buffer_size_to_maintain;
 
   bool inplace_update_support;
diff --git a/options/options.cc b/options/options.cc
index 2ee431406651..0a9e30e67c76 100644
--- a/options/options.cc
+++ b/options/options.cc
@@ -43,8 +43,6 @@ AdvancedColumnFamilyOptions::AdvancedColumnFamilyOptions(const Options& options)
     : max_write_buffer_number(options.max_write_buffer_number),
       min_write_buffer_number_to_merge(
           options.min_write_buffer_number_to_merge),
-      max_write_buffer_number_to_maintain(
-          options.max_write_buffer_number_to_maintain),
       max_write_buffer_size_to_maintain(
           options.max_write_buffer_size_to_maintain),
       inplace_update_support(options.inplace_update_support),
@@ -192,8 +190,6 @@ void ColumnFamilyOptions::Dump(Logger* log) const {
   ROCKS_LOG_HEADER(log, "            Options.num_levels: %d", num_levels);
   ROCKS_LOG_HEADER(log, "       Options.min_write_buffer_number_to_merge: %d",
                    min_write_buffer_number_to_merge);
-  ROCKS_LOG_HEADER(log, "    Options.max_write_buffer_number_to_maintain: %d",
-                   max_write_buffer_number_to_maintain);
   ROCKS_LOG_HEADER(log,
                    "    Options.max_write_buffer_size_to_maintain: %" PRIu64,
                    max_write_buffer_size_to_maintain);
diff --git a/options/options_helper.cc b/options/options_helper.cc
index c026ff195497..d3a7edd9e703 100644
--- a/options/options_helper.cc
+++ b/options/options_helper.cc
@@ -313,8 +313,6 @@ void UpdateColumnFamilyOptions(const ImmutableCFOptions& ioptions,
   cf_opts->compaction_filter_factory = ioptions.compaction_filter_factory;
   cf_opts->min_write_buffer_number_to_merge =
       ioptions.min_write_buffer_number_to_merge;
-  cf_opts->max_write_buffer_number_to_maintain =
-      ioptions.max_write_buffer_number_to_maintain;
   cf_opts->max_write_buffer_size_to_maintain =
       ioptions.max_write_buffer_size_to_maintain;
   cf_opts->inplace_update_support = ioptions.inplace_update_support;
diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc
index 6fab7daeeba1..2b9c89b16796 100644
--- a/options/options_settable_test.cc
+++ b/options/options_settable_test.cc
@@ -626,7 +626,6 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) {
       "level0_file_num_compaction_trigger=14;"
       "compaction_filter=urxcqstuwnCompactionFilter;"
       "soft_pending_compaction_bytes_limit=0;"
-      "max_write_buffer_number_to_maintain=84;"
       "max_write_buffer_size_to_maintain=2147483648;"
       "merge_operator=aabcxehazrMergeOperator;"
       "memtable_prefix_bloom_size_ratio=0.4642;"
diff --git a/options/options_test.cc b/options/options_test.cc
index 159cfec85570..bacee1d1edd7 100644
--- a/options/options_test.cc
+++ b/options/options_test.cc
@@ -200,7 +200,6 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) {
   ASSERT_EQ(new_cf_opt.write_buffer_size, 1U);
   ASSERT_EQ(new_cf_opt.max_write_buffer_number, 2);
   ASSERT_EQ(new_cf_opt.min_write_buffer_number_to_merge, 3);
-  ASSERT_EQ(new_cf_opt.max_write_buffer_number_to_maintain, 99);
   ASSERT_EQ(new_cf_opt.max_write_buffer_size_to_maintain, -99999);
   ASSERT_EQ(new_cf_opt.compression, kSnappyCompression);
   ASSERT_EQ(new_cf_opt.compression_per_level.size(), 8U);
@@ -2498,7 +2497,6 @@ TEST_F(OptionsOldApiTest, GetOptionsFromMapTest) {
   ASSERT_EQ(new_cf_opt.write_buffer_size, 1U);
   ASSERT_EQ(new_cf_opt.max_write_buffer_number, 2);
   ASSERT_EQ(new_cf_opt.min_write_buffer_number_to_merge, 3);
-  ASSERT_EQ(new_cf_opt.max_write_buffer_number_to_maintain, 99);
   ASSERT_EQ(new_cf_opt.max_write_buffer_size_to_maintain, -99999);
   ASSERT_EQ(new_cf_opt.compression, kSnappyCompression);
   ASSERT_EQ(new_cf_opt.compression_per_level.size(), 8U);
diff --git a/table/multiget_context.h b/table/multiget_context.h
index a82c08aabe3c..52dcf1b174c4 100644
--- a/table/multiget_context.h
+++ b/table/multiget_context.h
@@ -219,7 +219,9 @@ class MultiGetContext {
         while (++index_ < range_->end_ &&
                (Mask{1} << index_) &
                    (range_->ctx_->value_mask_ | range_->skip_mask_ |
-                    range_->invalid_mask_));
+                    range_->invalid_mask_)) {
+          // empty loop body
+        }
         return *this;
       }
 
diff --git a/test_util/testutil.cc b/test_util/testutil.cc
index 35884a7b3789..d3af4260c27d 100644
--- a/test_util/testutil.cc
+++ b/test_util/testutil.cc
@@ -386,7 +386,6 @@ void RandomInitCFOptions(ColumnFamilyOptions* cf_opt, DBOptions& db_options,
   cf_opt->level0_stop_writes_trigger = rnd->Uniform(100);
   cf_opt->max_bytes_for_level_multiplier = rnd->Uniform(100);
   cf_opt->max_write_buffer_number = rnd->Uniform(100);
-  cf_opt->max_write_buffer_number_to_maintain = rnd->Uniform(100);
   cf_opt->max_write_buffer_size_to_maintain = rnd->Uniform(10000);
   cf_opt->min_write_buffer_number_to_merge = rnd->Uniform(100);
   cf_opt->num_levels = rnd->Uniform(100);
diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc
index 9155be672e7b..d152ea2f2b73 100644
--- a/tools/db_bench_tool.cc
+++ b/tools/db_bench_tool.cc
@@ -459,20 +459,6 @@ DEFINE_int32(min_write_buffer_number_to_merge,
              " writing less data to storage if there are duplicate records "
              " in each of these individual write buffers.");
 
-DEFINE_int32(max_write_buffer_number_to_maintain,
-             ROCKSDB_NAMESPACE::Options().max_write_buffer_number_to_maintain,
-             "The total maximum number of write buffers to maintain in memory "
-             "including copies of buffers that have already been flushed. "
-             "Unlike max_write_buffer_number, this parameter does not affect "
-             "flushing. This controls the minimum amount of write history "
-             "that will be available in memory for conflict checking when "
-             "Transactions are used. If this value is too low, some "
-             "transactions may fail at commit time due to not being able to "
-             "determine whether there were any write conflicts. Setting this "
-             "value to 0 will cause write buffers to be freed immediately "
-             "after they are flushed.  If this value is set to -1, "
-             "'max_write_buffer_number' will be used.");
-
 DEFINE_int64(max_write_buffer_size_to_maintain,
              ROCKSDB_NAMESPACE::Options().max_write_buffer_size_to_maintain,
              "The total maximum size of write buffers to maintain in memory "
@@ -4271,8 +4257,6 @@ class Benchmark {
     options.max_write_buffer_number = FLAGS_max_write_buffer_number;
     options.min_write_buffer_number_to_merge =
         FLAGS_min_write_buffer_number_to_merge;
-    options.max_write_buffer_number_to_maintain =
-        FLAGS_max_write_buffer_number_to_maintain;
     options.max_write_buffer_size_to_maintain =
         FLAGS_max_write_buffer_size_to_maintain;
     options.max_background_jobs = FLAGS_max_background_jobs;
diff --git a/tools/db_bench_tool_test.cc b/tools/db_bench_tool_test.cc
index e2546ff1c173..1b68e5dbfebf 100644
--- a/tools/db_bench_tool_test.cc
+++ b/tools/db_bench_tool_test.cc
@@ -253,7 +253,6 @@ const std::string options_file_content = R"OPTIONS_FILE(
   level0_slowdown_writes_trigger=50
   level0_file_num_compaction_trigger=10
   expanded_compaction_factor=25
-  max_write_buffer_number_to_maintain=0
   max_write_buffer_size_to_maintain=0
   verify_checksums_in_compaction=true
   merge_operator=nullptr
diff --git a/unreleased_history/public_api_changes/delete_max_write_buffer_number_to_maintain.md b/unreleased_history/public_api_changes/delete_max_write_buffer_number_to_maintain.md
new file mode 100644
index 000000000000..ecfb945ec973
--- /dev/null
+++ b/unreleased_history/public_api_changes/delete_max_write_buffer_number_to_maintain.md
@@ -0,0 +1 @@
+AdvancedColumnFamilyOptions.max_write_buffer_number_to_maintain is deleted. It's deprecated since introduction of a better option max_write_buffer_size_to_maintain since RocksDB 6.5.0.
\ No newline at end of file
diff --git a/utilities/transactions/optimistic_transaction_db_impl.cc b/utilities/transactions/optimistic_transaction_db_impl.cc
index 3ad9d517739d..dc854342bc57 100644
--- a/utilities/transactions/optimistic_transaction_db_impl.cc
+++ b/utilities/transactions/optimistic_transaction_db_impl.cc
@@ -81,8 +81,7 @@ Status OptimisticTransactionDB::Open(
   for (auto& column_family : column_families_copy) {
     ColumnFamilyOptions* options = &column_family.options;
 
-    if (options->max_write_buffer_size_to_maintain == 0 &&
-        options->max_write_buffer_number_to_maintain == 0) {
+    if (options->max_write_buffer_size_to_maintain == 0) {
       // Setting to -1 will set the History size to
       // max_write_buffer_number * write_buffer_size.
       options->max_write_buffer_size_to_maintain = -1;
diff --git a/utilities/transactions/pessimistic_transaction_db.cc b/utilities/transactions/pessimistic_transaction_db.cc
index 37fd80e86259..823b474e2ffa 100644
--- a/utilities/transactions/pessimistic_transaction_db.cc
+++ b/utilities/transactions/pessimistic_transaction_db.cc
@@ -284,8 +284,7 @@ void TransactionDB::PrepareWrap(
   for (size_t i = 0; i < column_families->size(); i++) {
     ColumnFamilyOptions* cf_options = &(*column_families)[i].options;
 
-    if (cf_options->max_write_buffer_size_to_maintain == 0 &&
-        cf_options->max_write_buffer_number_to_maintain == 0) {
+    if (cf_options->max_write_buffer_size_to_maintain == 0) {
       // Setting to -1 will set the History size to
       // max_write_buffer_number * write_buffer_size.
       cf_options->max_write_buffer_size_to_maintain = -1;
diff --git a/utilities/transactions/write_prepared_transaction_test.cc b/utilities/transactions/write_prepared_transaction_test.cc
index 28443c525baf..9781694e61d5 100644
--- a/utilities/transactions/write_prepared_transaction_test.cc
+++ b/utilities/transactions/write_prepared_transaction_test.cc
@@ -791,7 +791,8 @@ TEST_P(WritePreparedTransactionTest, CheckKeySkipOldMemtable) {
   const int kAttemptImmMemTable = 1;
   for (int attempt = kAttemptHistoryMemtable; attempt <= kAttemptImmMemTable;
        attempt++) {
-    options.max_write_buffer_number_to_maintain = 3;
+    options.max_write_buffer_size_to_maintain =
+        3 * static_cast<int>(options.write_buffer_size);
     ASSERT_OK(ReOpen());
 
     WriteOptions write_options;

From 6d802639f7dc35bf765dbe1ed6b3942e4d76375d Mon Sep 17 00:00:00 2001
From: Yu Zhang <yuzhangyu@fb.com>
Date: Tue, 8 Apr 2025 15:16:55 -0700
Subject: [PATCH 047/500] Fix a data race reported for secondary (#13529)

Summary:
Fix a reported data race, accessing `manifest_reader_` without locking `mutex_` could race with another `DBImpl::Secondary::TryCatchUpWithPrimary` thread that is updating to a new manifest in `ReactiveVersionSet::MaybeSwitchManifest`.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13529

Test Plan: Existing tests

Reviewed By: hx235

Differential Revision: D72655645

Pulled By: jowlyzhang

fbshipit-source-id: 08599862346bb39a6872c3adfd7f0097fc633849
---
 db/db_impl/db_impl_secondary.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/db/db_impl/db_impl_secondary.cc b/db/db_impl/db_impl_secondary.cc
index b58d63e52606..469f056d690e 100644
--- a/db/db_impl/db_impl_secondary.cc
+++ b/db/db_impl/db_impl_secondary.cc
@@ -705,13 +705,13 @@ Status DBImplSecondary::CheckConsistency() {
 
 Status DBImplSecondary::TryCatchUpWithPrimary() {
   assert(versions_.get() != nullptr);
-  assert(manifest_reader_.get() != nullptr);
   Status s;
   // read the manifest and apply new changes to the secondary instance
   std::unordered_set<ColumnFamilyData*> cfds_changed;
   JobContext job_context(0, true /*create_superversion*/);
   {
     InstrumentedMutexLock lock_guard(&mutex_);
+    assert(manifest_reader_.get() != nullptr);
     s = static_cast_with_check<ReactiveVersionSet>(versions_.get())
             ->ReadAndApply(&mutex_, &manifest_reader_,
                            manifest_reader_status_.get(), &cfds_changed,

From f7764cb6b209f11cb42aa70142d6907c5d6c5084 Mon Sep 17 00:00:00 2001
From: anand76 <anand1976@users.noreply.github.com>
Date: Wed, 9 Apr 2025 14:18:33 -0700
Subject: [PATCH 048/500] Remove fail_if_options_file_error DB option (#13504)

Summary:
The fail_if_options_file_error has been deprecated for more than a year. This PR removes it from the code base. https://github.com/facebook/rocksdb/issues/12056 fixed a bug that was blocking the option from removal. https://github.com/facebook/rocksdb/issues/12249 marked it as deprecated.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13504

Reviewed By: hx235

Differential Revision: D72194063

Pulled By: anand1976

fbshipit-source-id: 0aa7cf56e60c48c7e7654743d3e64922ce65225d
---
 db/column_family_test.cc                      |  1 -
 db/db_filesnapshot.cc                         | 16 +++----
 db/db_impl/db_impl.cc                         | 18 ++-----
 db/db_options_test.cc                         | 45 ++++++++---------
 db/db_test_util.cc                            |  1 -
 db_stress_tool/db_stress_common.h             |  1 -
 db_stress_tool/db_stress_gflags.cc            |  4 --
 db_stress_tool/db_stress_test_base.cc         |  3 --
 include/rocksdb/options.h                     |  8 ----
 java/rocksjni/options.cc                      | 46 ------------------
 .../test/java/org/rocksdb/DBOptionsTest.java  |  9 ----
 .../test/java/org/rocksdb/OptionsTest.java    |  9 ----
 options/db_options.cc                         |  4 +-
 options/db_options.h                          |  1 -
 options/options_helper.cc                     |  2 -
 options/options_settable_test.cc              |  1 -
 tools/db_crashtest.py                         |  1 -
 .../remove_fail_if_options_file_error.md      |  1 +
 utilities/checkpoint/checkpoint_test.cc       | 48 -------------------
 utilities/ttl/ttl_test.cc                     |  1 -
 20 files changed, 32 insertions(+), 188 deletions(-)
 create mode 100644 unreleased_history/public_api_changes/remove_fail_if_options_file_error.md

diff --git a/db/column_family_test.cc b/db/column_family_test.cc
index d84799b57c42..1ec6cb81f277 100644
--- a/db/column_family_test.cc
+++ b/db/column_family_test.cc
@@ -72,7 +72,6 @@ class ColumnFamilyTestBase : public testing::Test {
     env_->skip_fsync_ = true;
     dbname_ = test::PerThreadDBPath("column_family_test");
     db_options_.create_if_missing = true;
-    db_options_.fail_if_options_file_error = true;
     db_options_.env = env_;
   }
 
diff --git a/db/db_filesnapshot.cc b/db/db_filesnapshot.cc
index e9ae7981ae2c..9b8f602c0310 100644
--- a/db/db_filesnapshot.cc
+++ b/db/db_filesnapshot.cc
@@ -75,11 +75,9 @@ Status DBImpl::GetLiveFiles(std::vector<std::string>& ret,
 
   ret.emplace_back(CurrentFileName(""));
   ret.emplace_back(DescriptorFileName("", versions_->manifest_file_number()));
-  // The OPTIONS file number is zero in read-write mode when OPTIONS file
-  // writing failed and the DB was configured with
-  // `fail_if_options_file_error == false`. In read-only mode the OPTIONS file
-  // number is zero when no OPTIONS file exist at all. In those cases we do not
-  // record any OPTIONS file in the live file list.
+  // In read-only mode the OPTIONS file number is zero when no OPTIONS file
+  // exist at all. In this cases we do not record any OPTIONS file in the live
+  // file list.
   if (versions_->options_file_number() != 0) {
     ret.emplace_back(OptionsFileName("", versions_->options_file_number()));
   }
@@ -369,11 +367,9 @@ Status DBImpl::GetLiveFilesStorageInfo(
     }
   }
 
-  // The OPTIONS file number is zero in read-write mode when OPTIONS file
-  // writing failed and the DB was configured with
-  // `fail_if_options_file_error == false`. In read-only mode the OPTIONS file
-  // number is zero when no OPTIONS file exist at all. In those cases we do not
-  // record any OPTIONS file in the live file list.
+  // In read-only mode the OPTIONS file number is zero when no OPTIONS file
+  // exist at all. In this cases we do not record any OPTIONS file in the live
+  // file list.
   if (options_number != 0) {
     results.emplace_back();
     LiveFileStorageInfo& info = results.back();
diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc
index 8aa21a719367..1dc558557b5c 100644
--- a/db/db_impl/db_impl.cc
+++ b/db/db_impl/db_impl.cc
@@ -1470,14 +1470,9 @@ Status DBImpl::SetDBOptions(
     ROCKS_LOG_INFO(immutable_db_options_.info_log, "SetDBOptions() succeeded");
     new_options.Dump(immutable_db_options_.info_log.get());
     if (!persist_options_status.ok()) {
-      if (immutable_db_options_.fail_if_options_file_error) {
-        s = Status::IOError(
-            "SetDBOptions() succeeded, but unable to persist options",
-            persist_options_status.ToString());
-      }
-      ROCKS_LOG_WARN(immutable_db_options_.info_log,
-                     "Unable to persist options in SetDBOptions() -- %s",
-                     persist_options_status.ToString().c_str());
+      s = Status::IOError(
+          "SetDBOptions() succeeded, but unable to persist options",
+          persist_options_status.ToString());
     }
   } else {
     ROCKS_LOG_WARN(immutable_db_options_.info_log, "SetDBOptions failed");
@@ -5445,12 +5440,7 @@ Status DBImpl::WriteOptionsFile(const WriteOptions& write_options,
   if (!s.ok()) {
     ROCKS_LOG_WARN(immutable_db_options_.info_log,
                    "Unnable to persist options -- %s", s.ToString().c_str());
-    if (immutable_db_options_.fail_if_options_file_error) {
-      s = Status::IOError("Unable to persist options.", s.ToString().c_str());
-    } else {
-      // Ignore error
-      s = Status::OK();
-    }
+    s = Status::IOError("Unable to persist options.", s.ToString().c_str());
   }
 
   // Restore lock if appropriate
diff --git a/db/db_options_test.cc b/db/db_options_test.cc
index df0d4ca3c795..cfe0b8f96522 100644
--- a/db/db_options_test.cc
+++ b/db/db_options_test.cc
@@ -322,31 +322,26 @@ TEST_F(DBOptionsTest, SetWithCustomMemTableFactory) {
   }
   Options options;
   options.create_if_missing = true;
-  // Try with fail_if_options_file_error=false/true to update the options
-  for (bool on_error : {false, true}) {
-    options.fail_if_options_file_error = on_error;
-    options.env = env_;
-    options.disable_auto_compactions = false;
-
-    options.memtable_factory.reset(new DummySkipListFactory());
-    Reopen(options);
-
-    ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily();
-    ASSERT_OK(
-        dbfull()->SetOptions(cfh, {{"disable_auto_compactions", "true"}}));
-    ColumnFamilyDescriptor cfd;
-    ASSERT_OK(cfh->GetDescriptor(&cfd));
-    ASSERT_STREQ(cfd.options.memtable_factory->Name(),
-                 DummySkipListFactory::kClassName());
-    ColumnFamilyHandle* test = nullptr;
-    ASSERT_OK(dbfull()->CreateColumnFamily(options, "test", &test));
-    ASSERT_OK(test->GetDescriptor(&cfd));
-    ASSERT_STREQ(cfd.options.memtable_factory->Name(),
-                 DummySkipListFactory::kClassName());
-
-    ASSERT_OK(dbfull()->DropColumnFamily(test));
-    delete test;
-  }
+  options.env = env_;
+  options.disable_auto_compactions = false;
+
+  options.memtable_factory.reset(new DummySkipListFactory());
+  Reopen(options);
+
+  ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily();
+  ASSERT_OK(dbfull()->SetOptions(cfh, {{"disable_auto_compactions", "true"}}));
+  ColumnFamilyDescriptor cfd;
+  ASSERT_OK(cfh->GetDescriptor(&cfd));
+  ASSERT_STREQ(cfd.options.memtable_factory->Name(),
+               DummySkipListFactory::kClassName());
+  ColumnFamilyHandle* test = nullptr;
+  ASSERT_OK(dbfull()->CreateColumnFamily(options, "test", &test));
+  ASSERT_OK(test->GetDescriptor(&cfd));
+  ASSERT_STREQ(cfd.options.memtable_factory->Name(),
+               DummySkipListFactory::kClassName());
+
+  ASSERT_OK(dbfull()->DropColumnFamily(test));
+  delete test;
 }
 
 TEST_F(DBOptionsTest, SetBytesPerSync) {
diff --git a/db/db_test_util.cc b/db/db_test_util.cc
index 64a85bc41032..bec9bbd475b8 100644
--- a/db/db_test_util.cc
+++ b/db/db_test_util.cc
@@ -592,7 +592,6 @@ Options DBTestBase::GetOptions(
       options_override.level_compaction_dynamic_level_bytes;
   options.env = env_;
   options.create_if_missing = true;
-  options.fail_if_options_file_error = true;
   return options;
 }
 
diff --git a/db_stress_tool/db_stress_common.h b/db_stress_tool/db_stress_common.h
index 0e392fb575e5..274bb36d7a3d 100644
--- a/db_stress_tool/db_stress_common.h
+++ b/db_stress_tool/db_stress_common.h
@@ -320,7 +320,6 @@ DECLARE_int32(approximate_size_one_in);
 DECLARE_bool(best_efforts_recovery);
 DECLARE_bool(skip_verifydb);
 DECLARE_bool(paranoid_file_checks);
-DECLARE_bool(fail_if_options_file_error);
 DECLARE_uint64(batch_protection_bytes_per_key);
 DECLARE_uint32(memtable_protection_bytes_per_key);
 DECLARE_uint32(block_protection_bytes_per_key);
diff --git a/db_stress_tool/db_stress_gflags.cc b/db_stress_tool/db_stress_gflags.cc
index 1650b989b102..a1a0bb15b829 100644
--- a/db_stress_tool/db_stress_gflags.cc
+++ b/db_stress_tool/db_stress_gflags.cc
@@ -1084,10 +1084,6 @@ DEFINE_bool(paranoid_file_checks, true,
             "After writing every SST file, reopen it and read all the keys "
             "and validate checksums");
 
-DEFINE_bool(fail_if_options_file_error, false,
-            "Fail operations that fail to detect or properly persist options "
-            "file.");
-
 DEFINE_uint64(batch_protection_bytes_per_key, 0,
               "If nonzero, enables integrity protection in `WriteBatch` at the "
               "specified number of bytes per key. Currently the only supported "
diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc
index 40cd731a4b98..13f4b0e3c585 100644
--- a/db_stress_tool/db_stress_test_base.cc
+++ b/db_stress_tool/db_stress_test_base.cc
@@ -3376,8 +3376,6 @@ void StressTest::PrintEnv() const {
           FLAGS_sync_fault_injection);
   fprintf(stdout, "Best efforts recovery     : %d\n",
           static_cast<int>(FLAGS_best_efforts_recovery));
-  fprintf(stdout, "Fail if OPTIONS file error: %d\n",
-          static_cast<int>(FLAGS_fail_if_options_file_error));
   fprintf(stdout, "User timestamp size bytes : %d\n",
           static_cast<int>(FLAGS_user_timestamp_size));
   fprintf(stdout, "Persist user defined timestamps : %d\n",
@@ -4255,7 +4253,6 @@ void InitializeOptionsFromFlags(
 
   options.best_efforts_recovery = FLAGS_best_efforts_recovery;
   options.paranoid_file_checks = FLAGS_paranoid_file_checks;
-  options.fail_if_options_file_error = FLAGS_fail_if_options_file_error;
 
   if (FLAGS_user_timestamp_size > 0) {
     CheckAndSetOptionsForUserTimestamp(options);
diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index 34f2a8b14ca0..a9b5bb373e18 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -1297,14 +1297,6 @@ struct DBOptions {
   // currently.
   WalFilter* wal_filter = nullptr;
 
-  // DEPRECATED: This option might be removed in a future release.
-  //
-  // If true, then DB::Open, CreateColumnFamily, DropColumnFamily, and
-  // SetOptions will fail if options file is not properly persisted.
-  //
-  // DEFAULT: true
-  bool fail_if_options_file_error = true;
-
   // If true, then print malloc stats together with rocksdb.stats
   // when printing to LOG.
   // DEFAULT: false
diff --git a/java/rocksjni/options.cc b/java/rocksjni/options.cc
index 8f6f1903e326..2bb07cf45828 100644
--- a/java/rocksjni/options.cc
+++ b/java/rocksjni/options.cc
@@ -2055,29 +2055,6 @@ void Java_org_rocksdb_Options_setWalFilter(JNIEnv*, jclass, jlong jhandle,
   opt->wal_filter = wal_filter;
 }
 
-/*
- * Class:     org_rocksdb_Options
- * Method:    setFailIfOptionsFileError
- * Signature: (JZ)V
- */
-void Java_org_rocksdb_Options_setFailIfOptionsFileError(
-    JNIEnv*, jclass, jlong jhandle, jboolean jfail_if_options_file_error) {
-  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
-  opt->fail_if_options_file_error =
-      static_cast<bool>(jfail_if_options_file_error);
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    failIfOptionsFileError
- * Signature: (J)Z
- */
-jboolean Java_org_rocksdb_Options_failIfOptionsFileError(JNIEnv*, jclass,
-                                                         jlong jhandle) {
-  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
-  return static_cast<jboolean>(opt->fail_if_options_file_error);
-}
-
 /*
  * Class:     org_rocksdb_Options
  * Method:    setDumpMallocStats
@@ -7479,29 +7456,6 @@ void Java_org_rocksdb_DBOptions_setWalFilter(JNIEnv*, jclass, jlong jhandle,
   opt->wal_filter = wal_filter;
 }
 
-/*
- * Class:     org_rocksdb_DBOptions
- * Method:    setFailIfOptionsFileError
- * Signature: (JZ)V
- */
-void Java_org_rocksdb_DBOptions_setFailIfOptionsFileError(
-    JNIEnv*, jclass, jlong jhandle, jboolean jfail_if_options_file_error) {
-  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
-  opt->fail_if_options_file_error =
-      static_cast<bool>(jfail_if_options_file_error);
-}
-
-/*
- * Class:     org_rocksdb_DBOptions
- * Method:    failIfOptionsFileError
- * Signature: (J)Z
- */
-jboolean Java_org_rocksdb_DBOptions_failIfOptionsFileError(JNIEnv*, jclass,
-                                                           jlong jhandle) {
-  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
-  return static_cast<jboolean>(opt->fail_if_options_file_error);
-}
-
 /*
  * Class:     org_rocksdb_DBOptions
  * Method:    setDumpMallocStats
diff --git a/java/src/test/java/org/rocksdb/DBOptionsTest.java b/java/src/test/java/org/rocksdb/DBOptionsTest.java
index a71345f744a3..cf3ef22ddeb4 100644
--- a/java/src/test/java/org/rocksdb/DBOptionsTest.java
+++ b/java/src/test/java/org/rocksdb/DBOptionsTest.java
@@ -656,15 +656,6 @@ public String name() {
     }
   }
 
-  @Test
-  public void failIfOptionsFileError() {
-    try (final DBOptions opt = new DBOptions()) {
-      final boolean boolValue = rand.nextBoolean();
-      opt.setFailIfOptionsFileError(boolValue);
-      assertThat(opt.failIfOptionsFileError()).isEqualTo(boolValue);
-    }
-  }
-
   @Test
   public void dumpMallocStats() {
     try (final DBOptions opt = new DBOptions()) {
diff --git a/java/src/test/java/org/rocksdb/OptionsTest.java b/java/src/test/java/org/rocksdb/OptionsTest.java
index 316086c6bd40..c78d0f76b3a4 100644
--- a/java/src/test/java/org/rocksdb/OptionsTest.java
+++ b/java/src/test/java/org/rocksdb/OptionsTest.java
@@ -902,15 +902,6 @@ public String name() {
     }
   }
 
-  @Test
-  public void failIfOptionsFileError() {
-    try (final Options opt = new Options()) {
-      final boolean boolValue = rand.nextBoolean();
-      opt.setFailIfOptionsFileError(boolValue);
-      assertThat(opt.failIfOptionsFileError()).isEqualTo(boolValue);
-    }
-  }
-
   @Test
   public void dumpMallocStats() {
     try (final Options opt = new Options()) {
diff --git a/options/db_options.cc b/options/db_options.cc
index 8453b101dd00..3e06c4ceb687 100644
--- a/options/db_options.cc
+++ b/options/db_options.cc
@@ -319,8 +319,7 @@ static std::unordered_map<std::string, OptionTypeInfo>
           OptionType::kBoolean, OptionVerificationType::kNormal,
           OptionTypeFlags::kNone}},
         {"fail_if_options_file_error",
-         {offsetof(struct ImmutableDBOptions, fail_if_options_file_error),
-          OptionType::kBoolean, OptionVerificationType::kNormal,
+         {0, OptionType::kBoolean, OptionVerificationType::kDeprecated,
           OptionTypeFlags::kNone}},
         {"enable_pipelined_write",
          {offsetof(struct ImmutableDBOptions, enable_pipelined_write),
@@ -772,7 +771,6 @@ ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options)
       allow_2pc(options.allow_2pc),
       row_cache(options.row_cache),
       wal_filter(options.wal_filter),
-      fail_if_options_file_error(options.fail_if_options_file_error),
       dump_malloc_stats(options.dump_malloc_stats),
       avoid_flush_during_recovery(options.avoid_flush_during_recovery),
       allow_ingest_behind(options.allow_ingest_behind),
diff --git a/options/db_options.h b/options/db_options.h
index 0de6cccf7b0a..c23a6f1c945f 100644
--- a/options/db_options.h
+++ b/options/db_options.h
@@ -77,7 +77,6 @@ struct ImmutableDBOptions {
   bool allow_2pc;
   std::shared_ptr<Cache> row_cache;
   WalFilter* wal_filter;
-  bool fail_if_options_file_error;
   bool dump_malloc_stats;
   bool avoid_flush_during_recovery;
   bool allow_ingest_behind;
diff --git a/options/options_helper.cc b/options/options_helper.cc
index d3a7edd9e703..e7ae9e70c837 100644
--- a/options/options_helper.cc
+++ b/options/options_helper.cc
@@ -153,8 +153,6 @@ void BuildDBOptions(const ImmutableDBOptions& immutable_db_options,
   options.allow_2pc = immutable_db_options.allow_2pc;
   options.row_cache = immutable_db_options.row_cache;
   options.wal_filter = immutable_db_options.wal_filter;
-  options.fail_if_options_file_error =
-      immutable_db_options.fail_if_options_file_error;
   options.dump_malloc_stats = immutable_db_options.dump_malloc_stats;
   options.avoid_flush_during_recovery =
       immutable_db_options.avoid_flush_during_recovery;
diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc
index 2b9c89b16796..66627c428c40 100644
--- a/options/options_settable_test.cc
+++ b/options/options_settable_test.cc
@@ -433,7 +433,6 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) {
                              "use_direct_io_for_flush_and_compaction=false;"
                              "max_log_file_size=4607;"
                              "advise_random_on_open=true;"
-                             "fail_if_options_file_error=false;"
                              "enable_pipelined_write=false;"
                              "unordered_write=false;"
                              "allow_concurrent_memtable_write=true;"
diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index 21cfc850151b..26152b22bee6 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -91,7 +91,6 @@
     # (see below `finalize_and_sanitize`).
     "inplace_update_support": random.choice([0] * 9 + [1]),
     "expected_values_dir": lambda: setup_expected_values_dir(),
-    "fail_if_options_file_error": lambda: random.randint(0, 1),
     "flush_one_in": lambda: random.choice([1000, 1000000]),
     "manual_wal_flush_one_in": lambda: random.choice([0, 1000]),
     "file_checksum_impl": lambda: random.choice(["none", "crc32c", "xxh64", "big"]),
diff --git a/unreleased_history/public_api_changes/remove_fail_if_options_file_error.md b/unreleased_history/public_api_changes/remove_fail_if_options_file_error.md
new file mode 100644
index 000000000000..822940568baa
--- /dev/null
+++ b/unreleased_history/public_api_changes/remove_fail_if_options_file_error.md
@@ -0,0 +1 @@
+The fail_if_options_file_error option in DBOptions has been removed. The behavior now is to always return failure in any API that fails to persist the OPTIONS file.
diff --git a/utilities/checkpoint/checkpoint_test.cc b/utilities/checkpoint/checkpoint_test.cc
index e71c795f654b..a514c3400f2d 100644
--- a/utilities/checkpoint/checkpoint_test.cc
+++ b/utilities/checkpoint/checkpoint_test.cc
@@ -847,54 +847,6 @@ TEST_P(CheckpointTestWithWalParams, CheckpointWithUnsyncedDataDropped) {
   db_ = nullptr;
 }
 
-TEST_F(CheckpointTest, CheckpointOptionsFileFailedToPersist) {
-  // Regression test for a bug where checkpoint failed on a DB where persisting
-  // OPTIONS file failed and the DB was opened with
-  // `fail_if_options_file_error == false`.
-  Options options = CurrentOptions();
-  options.fail_if_options_file_error = false;
-  auto fault_fs = std::make_shared<FaultInjectionTestFS>(FileSystem::Default());
-
-  // Setup `FaultInjectionTestFS` and `SyncPoint` callbacks to fail one
-  // operation when inside the OPTIONS file persisting code.
-  std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs));
-  fault_fs->SetThreadLocalErrorContext(
-      FaultInjectionIOType::kWrite, 7 /* seed*/, 1 /* one_in */,
-      false /* retryable */, false /* has_data_loss*/);
-  SyncPoint::GetInstance()->SetCallBack(
-      "PersistRocksDBOptions:start", [fault_fs](void* /* arg */) {
-        fault_fs->EnableThreadLocalErrorInjection(
-            FaultInjectionIOType::kMetadataWrite);
-      });
-  SyncPoint::GetInstance()->SetCallBack(
-      "FaultInjectionTestFS::InjectMetadataWriteError:Injected",
-      [fault_fs](void* /* arg */) {
-        fault_fs->DisableThreadLocalErrorInjection(
-            FaultInjectionIOType::kMetadataWrite);
-      });
-  options.env = fault_fs_env.get();
-  SyncPoint::GetInstance()->EnableProcessing();
-
-  Reopen(options);
-  ASSERT_OK(Put("key1", "val1"));
-  Checkpoint* checkpoint;
-  ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
-  ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_));
-  delete checkpoint;
-
-  // Make sure it's usable.
-  options.env = env_;
-  DB* snapshot_db;
-  ASSERT_OK(DB::Open(options, snapshot_name_, &snapshot_db));
-  ReadOptions read_opts;
-  std::string get_result;
-  ASSERT_OK(snapshot_db->Get(read_opts, "key1", &get_result));
-  ASSERT_EQ("val1", get_result);
-  delete snapshot_db;
-  delete db_;
-  db_ = nullptr;
-}
-
 TEST_F(CheckpointTest, CheckpointReadOnlyDB) {
   ASSERT_OK(Put("foo", "foo_value"));
   ASSERT_OK(Flush());
diff --git a/utilities/ttl/ttl_test.cc b/utilities/ttl/ttl_test.cc
index 4bbf11505d49..37bfa7d662a0 100644
--- a/utilities/ttl/ttl_test.cc
+++ b/utilities/ttl/ttl_test.cc
@@ -617,7 +617,6 @@ TEST_F(TtlTest, UnregisteredMergeOperator) {
    public:
     const char* Name() const override { return "UnregisteredMergeOperator"; }
   };
-  options_.fail_if_options_file_error = true;
   options_.merge_operator = std::make_shared<UnregisteredMergeOperator>();
   OpenTtl();
   CloseTtl();

From 46c37a632735aa541e30b665810197b1fff6a41c Mon Sep 17 00:00:00 2001
From: Yu Zhang <yuzhangyu@fb.com>
Date: Thu, 10 Apr 2025 16:20:30 -0700
Subject: [PATCH 049/500] Fix issue with reverse iteration with unprepared
 value (#13531)

Summary:
When ReadOptions.allow_unprepared_value is true, a `Iterator::PrepareValue()` call is needed to prepare the value after an entry is pinpointed, to only load the blob when it's actually needed. And it uses the `saved_key_.GetUserKey()` to prepare value.
https://github.com/facebook/rocksdb/blob/6d802639f7dc35bf765dbe1ed6b3942e4d76375d/db/db_iter.cc#L319

In the reverse iteration case, when the `FindValueForCurrentKeyUsingSeek()` path is used, `saved_key_` is only updated when `ReadOptions.iter_start_ts` is specified. This PR fixes it by updating `saved_key_` for the other case too.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13531

Test Plan: The FIXME test that reproduce the bug is updated

Reviewed By: pdillinger

Differential Revision: D72681397

Pulled By: jowlyzhang

fbshipit-source-id: 6c239da53c9beed1560d30013474f2ba542b245c
---
 db/db_iter.cc                      |  2 ++
 db/db_with_timestamp_basic_test.cc | 42 +++++++++++++++++++++++-------
 2 files changed, 34 insertions(+), 10 deletions(-)

diff --git a/db/db_iter.cc b/db/db_iter.cc
index c5a099103653..a2703ee7ad7b 100644
--- a/db/db_iter.cc
+++ b/db/db_iter.cc
@@ -1224,6 +1224,8 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() {
 
     if (timestamp_lb_ != nullptr) {
       saved_key_.SetInternalKey(ikey);
+    } else {
+      saved_key_.SetUserKey(ikey.user_key);
     }
 
     valid_ = true;
diff --git a/db/db_with_timestamp_basic_test.cc b/db/db_with_timestamp_basic_test.cc
index 268e22fbedac..cf088e7ae054 100644
--- a/db/db_with_timestamp_basic_test.cc
+++ b/db/db_with_timestamp_basic_test.cc
@@ -1487,13 +1487,24 @@ TEST_F(DBBasicTestWithTimestamp, ReseekToUserKeyBeforeSavedKey) {
   Close();
 }
 
-TEST_F(DBBasicTestWithTimestamp,
-       FIXME_ReverseIterationWithBlobAndUnpreparedValue) {
+class ReverseIterationWithUnpreparedBlobTest
+    : public DBBasicTestWithTimestampBase,
+      public testing::WithParamInterface<std::tuple<bool, uint64_t>> {
+ public:
+  ReverseIterationWithUnpreparedBlobTest()
+      : DBBasicTestWithTimestampBase(
+            "db_basic_test_with_timestamp_reverse_with_unprepare") {}
+};
+INSTANTIATE_TEST_CASE_P(ReverseIterationWithUnpreparedBlobTest,
+                        ReverseIterationWithUnpreparedBlobTest,
+                        ::testing::Combine(::testing::Values(true, false),
+                                           ::testing::Values(0, 2)));
+TEST_P(ReverseIterationWithUnpreparedBlobTest, Basic) {
   Options options = CurrentOptions();
   options.create_if_missing = true;
   options.env = env_;
   options.enable_blob_files = true;
-  options.max_sequential_skip_in_iterations = 0;
+  options.max_sequential_skip_in_iterations = std::get<1>(GetParam());
 
   const size_t kTimestampSize = Timestamp(0, 0).size();
   TestComparator test_cmp(kTimestampSize);
@@ -1508,7 +1519,7 @@ TEST_F(DBBasicTestWithTimestamp,
   for (uint64_t key = 0; key <= kMaxKey; ++key) {
     for (size_t i = 0; i < write_timestamps.size(); ++i) {
       ASSERT_OK(db_->Put(WriteOptions(), Key1(key), write_timestamps[i],
-                         "value" + std::to_string(i)));
+                         Key1(key) + "value" + std::to_string(i)));
     }
   }
 
@@ -1520,17 +1531,28 @@ TEST_F(DBBasicTestWithTimestamp,
 
     ReadOptions read_opts;
     read_opts.timestamp = &read_timestamp;
-    read_opts.allow_unprepared_value = true;
+    read_opts.allow_unprepared_value = std::get<0>(GetParam());
 
     std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
 
     it->SeekForPrev(Key1(kMaxKey));
-    ASSERT_TRUE(it->Valid());
-    ASSERT_OK(it->status());
+    uint64_t key = kMaxKey;
+    int count = 0;
+    while (it->Valid()) {
+      ASSERT_OK(it->status());
 
-    // FIXME: PrepareValue() should succeed and status() should remain OK
-    ASSERT_FALSE(it->PrepareValue());
-    ASSERT_TRUE(it->status().IsCorruption());
+      ASSERT_TRUE(it->PrepareValue());
+      ASSERT_TRUE(it->Valid());
+      ASSERT_OK(it->status());
+      ASSERT_EQ(it->key(), Key1(key));
+      ASSERT_EQ(it->timestamp(), Timestamp(3, 0));
+      ASSERT_EQ(it->value(), Key1(key) + "value" + std::to_string(1));
+      key--;
+      count++;
+      it->Prev();
+    }
+    ASSERT_OK(it->status());
+    ASSERT_EQ(kMaxKey + 1, count);
   }
 
   Close();

From 56359da69132d769e97f0a7cc89681d3500e166d Mon Sep 17 00:00:00 2001
From: Changyu Bi <changyubi@meta.com>
Date: Thu, 10 Apr 2025 17:53:33 -0700
Subject: [PATCH 050/500] Trigger memtable flush based on number of hidden
 entries scanned (#13523)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Summary:
Introduce a mutable CF option `memtable_op_scan_flush_trigger`. When a DB iterator scans this number of hidden entries (tombstones, overwritten puts) from the active memtable in a Seek() or Next() operation, it marks the memtable to be eligible for flush. Subsequent write operations will schedule the marked memtable for flush.

The main change is small and is in db_iter.cc. Some refactoring is done to consolidate and simplify creation of `ArenaWrappedDBIter` and `DBIter`.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13523

Test Plan:
- new unit tests added.
- added `memtable_op_scan_flush_trigger` in crash test
- benchmark:
The following benchmark was done with a previous version of the PR where the option was `memtable_tombstone_scan_limit` and it concerns tombstone only. The results should still be applicable for the case when there's no overwritten puts.

Tests that when memtable has many tombstones, the option helps to improve scan performance:
```
TEST_TMPDIR=/dev/shm ./db_bench --benchmarks=seekrandomwhilewriting --expand_range_tombstones=true --writes_per_range_tombstone=1 --max_num_range_tombstones=10000000 --perf_level=2 --range_tombstone_width=100 --memtable_tombstone_scan_limit=

memtable_tombstone_scan_limit = 10000
seekrandomwhilewriting :      18.527 micros/op 53973 ops/sec 18.527 seconds 1000000 operations; (7348 of 1000000 found)
next_on_memtable_count = 122305248
grep "flush_started" /dev/shm/dbbench/LOG | wc
      8     200    2417

memtable_tombstone_scan_limit=200
seekrandomwhilewriting :       4.918 micros/op 203315 ops/sec 4.918 seconds 1000000 operations; (4510 of 1000000 found)
next_on_memtable_count = 1853167
grep "flush_started" /dev/shm/dbbench/LOG | wc
    184    4600   54121

When memtable_tombstone_scan_limit=200, more flush is trigged to drop tombstones sooner and improve scan performance.
```

Tests that the new option does not introduce noticeable regression:
```
TEST_TMPDIR=/dev/shm ./db_bench --benchmarks=seekrandomwhilewriting[-X5] --expand_range_tombstones=true --writes_per_range_tombstone=1 --max_num_range_tombstones=10000000 --perf_level=2 --range_tombstone_width=100 --seed=123

Main:
seekrandomwhilewriting [AVG 5 runs] : 46049 (± 4512) ops/sec
PR:
seekrandomwhilewriting [AVG 5 runs] : 46100 (± 4470) ops/sec

The results are noisy with this PR performing better and worse in different runs, with no noticeable regression.
```

Reviewed By: pdillinger

Differential Revision: D72596434

Pulled By: cbi42

fbshipit-source-id: 2d51a0221dc20dac844aeba2ad3999d075a4cf91
---
 db/arena_wrapped_db_iter.cc                   |  50 ++--
 db/arena_wrapped_db_iter.h                    |  23 +-
 db/column_family.cc                           |   7 +
 db/db_impl/db_impl.cc                         |  38 +--
 db/db_impl/db_impl_readonly.cc                |  36 +--
 db/db_impl/db_impl_secondary.cc               |  15 +-
 db/db_iter.cc                                 |  53 ++--
 db/db_iter.h                                  |  81 ++++--
 db/db_iter_stress_test.cc                     |   5 +-
 db/db_iter_test.cc                            | 272 +++++++-----------
 db/db_iterator_test.cc                        | 160 +++++++++++
 db/flush_job.cc                               |   4 +-
 db/memtable.cc                                |   5 +
 db/memtable.h                                 |   7 +
 db_stress_tool/db_stress_common.h             |   1 +
 db_stress_tool/db_stress_gflags.cc            |   5 +
 db_stress_tool/db_stress_test_base.cc         |   2 +
 include/rocksdb/advanced_options.h            |  14 +
 options/cf_options.cc                         |   7 +-
 options/cf_options.h                          |   7 +-
 options/options.cc                            |   6 +-
 options/options_helper.cc                     |   2 +
 options/options_settable_test.cc              |   3 +-
 table/sst_file_reader.cc                      |  10 +-
 tools/db_bench_tool.cc                        |   7 +
 tools/db_crashtest.py                         |   1 +
 .../tombstone_scan_flush_trigger.md           |   1 +
 27 files changed, 489 insertions(+), 333 deletions(-)
 create mode 100644 unreleased_history/new_features/tombstone_scan_flush_trigger.md

diff --git a/db/arena_wrapped_db_iter.cc b/db/arena_wrapped_db_iter.cc
index 21fb15504061..d24a918368ba 100644
--- a/db/arena_wrapped_db_iter.cc
+++ b/db/arena_wrapped_db_iter.cc
@@ -42,9 +42,9 @@ Status ArenaWrappedDBIter::GetProperty(std::string prop_name,
 void ArenaWrappedDBIter::Init(
     Env* env, const ReadOptions& read_options, const ImmutableOptions& ioptions,
     const MutableCFOptions& mutable_cf_options, const Version* version,
-    const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iteration,
-    uint64_t version_number, ReadCallback* read_callback,
-    ColumnFamilyHandleImpl* cfh, bool expose_blob_index, bool allow_refresh) {
+    const SequenceNumber& sequence, uint64_t version_number,
+    ReadCallback* read_callback, ColumnFamilyHandleImpl* cfh,
+    bool expose_blob_index, bool allow_refresh, ReadOnlyMemTable* active_mem) {
   read_options_ = read_options;
   if (!CheckFSFeatureSupport(env->GetFileSystem().get(),
                              FSSupportedOps::kAsyncIO)) {
@@ -52,15 +52,14 @@ void ArenaWrappedDBIter::Init(
   }
   read_options_.total_order_seek |= ioptions.prefix_seek_opt_in_only;
 
-  auto mem = arena_.AllocateAligned(sizeof(DBIter));
-  db_iter_ = new (mem) DBIter(env, read_options_, ioptions, mutable_cf_options,
-                              ioptions.user_comparator,
-                              /* iter */ nullptr, version, sequence, true,
-                              max_sequential_skip_in_iteration, read_callback,
-                              cfh, expose_blob_index);
+  db_iter_ = DBIter::NewIter(
+      env, read_options_, ioptions, mutable_cf_options,
+      ioptions.user_comparator, /*internal_iter=*/nullptr, version, sequence,
+      read_callback, cfh, expose_blob_index, active_mem, &arena_);
 
   sv_number_ = version_number;
   allow_refresh_ = allow_refresh;
+  allow_mark_memtable_for_flush_ = active_mem;
   memtable_range_tombstone_iter_ = nullptr;
 }
 
@@ -166,9 +165,8 @@ void ArenaWrappedDBIter::DoRefresh(const Snapshot* snapshot,
     read_callback_->Refresh(read_seq);
   }
   Init(env, read_options_, cfd->ioptions(), sv->mutable_cf_options, sv->current,
-       read_seq, sv->mutable_cf_options.max_sequential_skip_in_iterations,
-       sv->version_number, read_callback_, cfh_, expose_blob_index_,
-       allow_refresh_);
+       read_seq, sv->version_number, read_callback_, cfh_, expose_blob_index_,
+       allow_refresh_, allow_mark_memtable_for_flush_ ? sv->mem : nullptr);
 
   InternalIterator* internal_iter = db_impl->NewInternalIterator(
       read_options_, cfd, sv, &arena_, read_seq,
@@ -253,20 +251,26 @@ Status ArenaWrappedDBIter::Refresh(const Snapshot* snapshot) {
 }
 
 ArenaWrappedDBIter* NewArenaWrappedDbIterator(
-    Env* env, const ReadOptions& read_options, const ImmutableOptions& ioptions,
-    const MutableCFOptions& mutable_cf_options, const Version* version,
-    const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations,
-    uint64_t version_number, ReadCallback* read_callback,
-    ColumnFamilyHandleImpl* cfh, bool expose_blob_index, bool allow_refresh) {
-  ArenaWrappedDBIter* iter = new ArenaWrappedDBIter();
-  iter->Init(env, read_options, ioptions, mutable_cf_options, version, sequence,
-             max_sequential_skip_in_iterations, version_number, read_callback,
-             cfh, expose_blob_index, allow_refresh);
+    Env* env, const ReadOptions& read_options, ColumnFamilyHandleImpl* cfh,
+    SuperVersion* sv, const SequenceNumber& sequence,
+    ReadCallback* read_callback, DBImpl* db_impl, bool expose_blob_index,
+    bool allow_refresh, bool allow_mark_memtable_for_flush) {
+  ArenaWrappedDBIter* db_iter = new ArenaWrappedDBIter();
+  db_iter->Init(env, read_options, cfh->cfd()->ioptions(),
+                sv->mutable_cf_options, sv->current, sequence,
+                sv->version_number, read_callback, cfh, expose_blob_index,
+                allow_refresh,
+                allow_mark_memtable_for_flush ? sv->mem : nullptr);
   if (cfh != nullptr && allow_refresh) {
-    iter->StoreRefreshInfo(cfh, read_callback, expose_blob_index);
+    db_iter->StoreRefreshInfo(cfh, read_callback, expose_blob_index);
   }
 
-  return iter;
+  InternalIterator* internal_iter = db_impl->NewInternalIterator(
+      db_iter->GetReadOptions(), cfh->cfd(), sv, db_iter->GetArena(), sequence,
+      /*allow_unprepared_value=*/true, db_iter);
+  db_iter->SetIterUnderDBIter(internal_iter);
+
+  return db_iter;
 }
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/db/arena_wrapped_db_iter.h b/db/arena_wrapped_db_iter.h
index 531fd5bca4a7..647ed62c908c 100644
--- a/db/arena_wrapped_db_iter.h
+++ b/db/arena_wrapped_db_iter.h
@@ -19,7 +19,6 @@
 #include "options/cf_options.h"
 #include "rocksdb/db.h"
 #include "rocksdb/iterator.h"
-#include "util/autovector.h"
 
 namespace ROCKSDB_NAMESPACE {
 
@@ -103,13 +102,15 @@ class ArenaWrappedDBIter : public Iterator {
     db_iter_->Prepare(scan_opts);
   }
 
+  // FIXME: we could just pass SV in for mutable cf option, version and version
+  // number, but this is used by SstFileReader which does not have a SV.
   void Init(Env* env, const ReadOptions& read_options,
             const ImmutableOptions& ioptions,
             const MutableCFOptions& mutable_cf_options, const Version* version,
-            const SequenceNumber& sequence,
-            uint64_t max_sequential_skip_in_iterations, uint64_t version_number,
+            const SequenceNumber& sequence, uint64_t version_number,
             ReadCallback* read_callback, ColumnFamilyHandleImpl* cfh,
-            bool expose_blob_index, bool allow_refresh);
+            bool expose_blob_index, bool allow_refresh,
+            ReadOnlyMemTable* active_mem);
 
   // Store some parameters so we can refresh the iterator at a later point
   // with these same params
@@ -132,20 +133,16 @@ class ArenaWrappedDBIter : public Iterator {
   ReadCallback* read_callback_;
   bool expose_blob_index_ = false;
   bool allow_refresh_ = true;
+  bool allow_mark_memtable_for_flush_ = true;
   // If this is nullptr, it means the mutable memtable does not contain range
   // tombstone when added under this DBIter.
   std::unique_ptr<TruncatedRangeDelIterator>* memtable_range_tombstone_iter_ =
       nullptr;
 };
 
-// Generate the arena wrapped iterator class.
-// `cfh` is used for reneweal. If left null, renewal will not
-// be supported.
 ArenaWrappedDBIter* NewArenaWrappedDbIterator(
-    Env* env, const ReadOptions& read_options, const ImmutableOptions& ioptions,
-    const MutableCFOptions& mutable_cf_options, const Version* version,
-    const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations,
-    uint64_t version_number, ReadCallback* read_callback,
-    ColumnFamilyHandleImpl* cfh = nullptr, bool expose_blob_index = false,
-    bool allow_refresh = true);
+    Env* env, const ReadOptions& read_options, ColumnFamilyHandleImpl* cfh,
+    SuperVersion* sv, const SequenceNumber& sequence,
+    ReadCallback* read_callback, DBImpl* db_impl, bool expose_blob_index,
+    bool allow_refresh, bool allow_mark_memtable_for_flush);
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/db/column_family.cc b/db/column_family.cc
index 3232834ca320..6b642fa4cd3d 100644
--- a/db/column_family.cc
+++ b/db/column_family.cc
@@ -448,6 +448,13 @@ ColumnFamilyOptions SanitizeCfOptions(const ImmutableDBOptions& db_options,
     result.preclude_last_level_data_seconds = 0;
   }
 
+  if (read_only && result.memtable_op_scan_flush_trigger != 0) {
+    ROCKS_LOG_WARN(db_options.info_log.get(),
+                   "option memtable_op_scan_flush_trigger is sanitized to "
+                   "0(disabled) for read only DB.");
+    result.memtable_op_scan_flush_trigger = 0;
+  }
+
   return result;
 }
 
diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc
index 1dc558557b5c..35dc300b9f70 100644
--- a/db/db_impl/db_impl.cc
+++ b/db/db_impl/db_impl.cc
@@ -3857,11 +3857,12 @@ Iterator* DBImpl::NewIterator(const ReadOptions& _read_options,
 
     auto iter = new ForwardIterator(this, read_options, cfd, sv,
                                     /* allow_unprepared_value */ true);
-    result = NewDBIterator(
-        env_, read_options, cfd->ioptions(), sv->mutable_cf_options,
-        cfd->user_comparator(), iter, sv->current, kMaxSequenceNumber,
-        sv->mutable_cf_options.max_sequential_skip_in_iterations,
-        nullptr /* read_callback */, cfh);
+    result = DBIter::NewIter(env_, read_options, cfd->ioptions(),
+                             sv->mutable_cf_options, cfd->user_comparator(),
+                             iter, sv->current, kMaxSequenceNumber,
+                             /*read_callback=*/nullptr, cfh,
+                             /*expose_blob_index=*/false,
+                             /*active_mem=*/sv->mem);
   } else {
     // Note: no need to consider the special case of
     // last_seq_same_as_publish_seq_==false since NewIterator is overridden in
@@ -3939,18 +3940,9 @@ ArenaWrappedDBIter* DBImpl::NewIteratorImpl(
   // Laying out the iterators in the order of being accessed makes it more
   // likely that any iterator pointer is close to the iterator it points to so
   // that they are likely to be in the same cache line and/or page.
-  ArenaWrappedDBIter* db_iter = NewArenaWrappedDbIterator(
-      env_, read_options, cfh->cfd()->ioptions(), sv->mutable_cf_options,
-      sv->current, snapshot,
-      sv->mutable_cf_options.max_sequential_skip_in_iterations,
-      sv->version_number, read_callback, cfh, expose_blob_index, allow_refresh);
-
-  InternalIterator* internal_iter = NewInternalIterator(
-      db_iter->GetReadOptions(), cfh->cfd(), sv, db_iter->GetArena(), snapshot,
-      /* allow_unprepared_value */ true, db_iter);
-  db_iter->SetIterUnderDBIter(internal_iter);
-
-  return db_iter;
+  return NewArenaWrappedDbIterator(
+      env_, read_options, cfh, sv, snapshot, read_callback, this,
+      expose_blob_index, allow_refresh, /*allow_mark_memtable_for_flush=*/true);
 }
 
 std::unique_ptr<Iterator> DBImpl::NewCoalescingIterator(
@@ -4075,13 +4067,11 @@ Status DBImpl::NewIterators(
                                       cf_sv_pair.super_version,
                                       /* allow_unprepared_value */ true);
       iterators->push_back(
-          NewDBIterator(env_, read_options, cf_sv_pair.cfd->ioptions(),
-                        cf_sv_pair.super_version->mutable_cf_options,
-                        cf_sv_pair.cfd->user_comparator(), iter,
-                        cf_sv_pair.super_version->current, kMaxSequenceNumber,
-                        cf_sv_pair.super_version->mutable_cf_options
-                            .max_sequential_skip_in_iterations,
-                        nullptr /*read_callback*/, cf_sv_pair.cfh));
+          DBIter::NewIter(env_, read_options, cf_sv_pair.cfd->ioptions(),
+                          cf_sv_pair.super_version->mutable_cf_options,
+                          cf_sv_pair.cfd->user_comparator(), iter,
+                          cf_sv_pair.super_version->current, kMaxSequenceNumber,
+                          nullptr /*read_callback*/, cf_sv_pair.cfh));
     }
   } else {
     for (const auto& cf_sv_pair : cf_sv_pairs) {
diff --git a/db/db_impl/db_impl_readonly.cc b/db/db_impl/db_impl_readonly.cc
index dac0d9660037..31934ee192c7 100644
--- a/db/db_impl/db_impl_readonly.cc
+++ b/db/db_impl/db_impl_readonly.cc
@@ -185,16 +185,10 @@ Iterator* DBImplReadOnly::NewIterator(const ReadOptions& _read_options,
           ? static_cast<const SnapshotImpl*>(read_options.snapshot)->number_
           : latest_snapshot;
   ReadCallback* read_callback = nullptr;  // No read callback provided.
-  auto db_iter = NewArenaWrappedDbIterator(
-      env_, read_options, cfd->ioptions(), super_version->mutable_cf_options,
-      super_version->current, read_seq,
-      super_version->mutable_cf_options.max_sequential_skip_in_iterations,
-      super_version->version_number, read_callback);
-  auto internal_iter = NewInternalIterator(
-      db_iter->GetReadOptions(), cfd, super_version, db_iter->GetArena(),
-      read_seq, /* allow_unprepared_value */ true, db_iter);
-  db_iter->SetIterUnderDBIter(internal_iter);
-  return db_iter;
+  return NewArenaWrappedDbIterator(
+      env_, read_options, cfh, super_version, read_seq, read_callback, this,
+      /*expose_blob_index=*/false, /*allow_refresh=*/false,
+      /*allow_mark_memtable_for_flush=*/false);
 }
 
 Status DBImplReadOnly::NewIterators(
@@ -231,36 +225,32 @@ Status DBImplReadOnly::NewIterators(
           ? static_cast<const SnapshotImpl*>(read_options.snapshot)->number_
           : latest_snapshot;
 
-  autovector<std::tuple<ColumnFamilyData*, SuperVersion*>> cfd_to_sv;
+  autovector<std::tuple<ColumnFamilyHandleImpl*, SuperVersion*>> cfh_to_sv;
 
   const bool check_read_ts =
       read_options.timestamp && read_options.timestamp->size() > 0;
   for (auto cfh : column_families) {
     auto* cfd = static_cast_with_check<ColumnFamilyHandleImpl>(cfh)->cfd();
     auto* sv = cfd->GetSuperVersion()->Ref();
-    cfd_to_sv.emplace_back(cfd, sv);
+    cfh_to_sv.emplace_back(static_cast_with_check<ColumnFamilyHandleImpl>(cfh),
+                           sv);
     if (check_read_ts) {
       const Status s =
           FailIfReadCollapsedHistory(cfd, sv, *(read_options.timestamp));
       if (!s.ok()) {
-        for (auto prev_entry : cfd_to_sv) {
+        for (auto prev_entry : cfh_to_sv) {
           std::get<1>(prev_entry)->Unref();
         }
         return s;
       }
     }
   }
-  assert(cfd_to_sv.size() == column_families.size());
-  for (auto [cfd, sv] : cfd_to_sv) {
+  assert(cfh_to_sv.size() == column_families.size());
+  for (auto [cfh, sv] : cfh_to_sv) {
     auto* db_iter = NewArenaWrappedDbIterator(
-        env_, read_options, cfd->ioptions(), sv->mutable_cf_options,
-        sv->current, read_seq,
-        sv->mutable_cf_options.max_sequential_skip_in_iterations,
-        sv->version_number, read_callback);
-    auto* internal_iter = NewInternalIterator(
-        db_iter->GetReadOptions(), cfd, sv, db_iter->GetArena(), read_seq,
-        /* allow_unprepared_value */ true, db_iter);
-    db_iter->SetIterUnderDBIter(internal_iter);
+        env_, read_options, cfh, sv, read_seq, read_callback, this,
+        /*expose_blob_index=*/false, /*allow_refresh=*/false,
+        /*allow_mark_memtable_for_flush=*/false);
     iterators->push_back(db_iter);
   }
 
diff --git a/db/db_impl/db_impl_secondary.cc b/db/db_impl/db_impl_secondary.cc
index 469f056d690e..6e6b248d76c6 100644
--- a/db/db_impl/db_impl_secondary.cc
+++ b/db/db_impl/db_impl_secondary.cc
@@ -566,17 +566,10 @@ ArenaWrappedDBIter* DBImplSecondary::NewIteratorImpl(
   assert(snapshot == kMaxSequenceNumber);
   snapshot = versions_->LastSequence();
   assert(snapshot != kMaxSequenceNumber);
-  auto db_iter = NewArenaWrappedDbIterator(
-      env_, read_options, cfh->cfd()->ioptions(),
-      super_version->mutable_cf_options, super_version->current, snapshot,
-      super_version->mutable_cf_options.max_sequential_skip_in_iterations,
-      super_version->version_number, read_callback, cfh, expose_blob_index,
-      allow_refresh);
-  auto internal_iter = NewInternalIterator(
-      db_iter->GetReadOptions(), cfh->cfd(), super_version, db_iter->GetArena(),
-      snapshot, /* allow_unprepared_value */ true, db_iter);
-  db_iter->SetIterUnderDBIter(internal_iter);
-  return db_iter;
+  return NewArenaWrappedDbIterator(env_, read_options, cfh, super_version,
+                                   snapshot, read_callback, this,
+                                   expose_blob_index, allow_refresh,
+                                   /*allow_mark_memtable_for_flush=*/false);
 }
 
 Status DBImplSecondary::NewIterators(
diff --git a/db/db_iter.cc b/db/db_iter.cc
index a2703ee7ad7b..4ceffd357242 100644
--- a/db/db_iter.cc
+++ b/db/db_iter.cc
@@ -9,7 +9,6 @@
 
 #include "db/db_iter.h"
 
-#include <iostream>
 #include <limits>
 #include <string>
 
@@ -42,9 +41,8 @@ DBIter::DBIter(Env* _env, const ReadOptions& read_options,
                const MutableCFOptions& mutable_cf_options,
                const Comparator* cmp, InternalIterator* iter,
                const Version* version, SequenceNumber s, bool arena_mode,
-               uint64_t max_sequential_skip_in_iterations,
                ReadCallback* read_callback, ColumnFamilyHandleImpl* cfh,
-               bool expose_blob_index)
+               bool expose_blob_index, ReadOnlyMemTable* active_mem)
     : prefix_extractor_(mutable_cf_options.prefix_extractor.get()),
       env_(_env),
       clock_(ioptions.clock),
@@ -58,11 +56,21 @@ DBIter::DBIter(Env* _env, const ReadOptions& read_options,
       read_callback_(read_callback),
       sequence_(s),
       statistics_(ioptions.stats),
-      max_skip_(max_sequential_skip_in_iterations),
+      max_skip_(mutable_cf_options.max_sequential_skip_in_iterations),
       max_skippable_internal_keys_(read_options.max_skippable_internal_keys),
       num_internal_keys_skipped_(0),
       iterate_lower_bound_(read_options.iterate_lower_bound),
       iterate_upper_bound_(read_options.iterate_upper_bound),
+      cfh_(cfh),
+      timestamp_ub_(read_options.timestamp),
+      timestamp_lb_(read_options.iter_start_ts),
+      timestamp_size_(timestamp_ub_ ? timestamp_ub_->size() : 0),
+      active_mem_(active_mem),
+      memtable_seqno_lb_((active_mem_ && !active_mem_->IsEmpty())
+                             ? active_mem_->GetFirstSequenceNumber()
+                             : kMaxSequenceNumber),
+      memtable_op_scan_flush_trigger_(
+          mutable_cf_options.memtable_op_scan_flush_trigger),
       direction_(kForward),
       valid_(false),
       current_entry_is_merged_(false),
@@ -76,11 +84,7 @@ DBIter::DBIter(Env* _env, const ReadOptions& read_options,
       expose_blob_index_(expose_blob_index),
       allow_unprepared_value_(read_options.allow_unprepared_value),
       is_blob_(false),
-      arena_mode_(arena_mode),
-      cfh_(cfh),
-      timestamp_ub_(read_options.timestamp),
-      timestamp_lb_(read_options.iter_start_ts),
-      timestamp_size_(timestamp_ub_ ? timestamp_ub_->size() : 0) {
+      arena_mode_(arena_mode) {
   RecordTick(statistics_, NO_ITERATOR_CREATED);
   if (pin_thru_lifetime_) {
     pinned_iters_mgr_.StartPinning();
@@ -369,6 +373,8 @@ bool DBIter::FindNextUserEntryInternal(bool skipping_saved_key,
   // to one.
   bool reseek_done = false;
 
+  uint64_t mem_ops_scanned = 0;
+  bool marked_for_flush = false;
   do {
     // Will update is_key_seqnum_zero_ as soon as we parsed the current key
     // but we need to save the previous value to be used in the loop.
@@ -425,6 +431,12 @@ bool DBIter::FindNextUserEntryInternal(bool skipping_saved_key,
           CompareKeyForSkip(ikey_.user_key, saved_key_.GetUserKey()) <= 0) {
         num_skipped++;  // skip this entry
         PERF_COUNTER_ADD(internal_key_skipped_count, 1);
+        if (memtable_op_scan_flush_trigger_ && active_mem_ &&
+            ikey_.sequence >= memtable_seqno_lb_ && !marked_for_flush &&
+            ++mem_ops_scanned >= memtable_op_scan_flush_trigger_) {
+          active_mem_->MarkForFlush();
+          marked_for_flush = true;
+        }
       } else {
         assert(!skipping_saved_key ||
                CompareKeyForSkip(ikey_.user_key, saved_key_.GetUserKey()) > 0);
@@ -446,6 +458,12 @@ bool DBIter::FindNextUserEntryInternal(bool skipping_saved_key,
                                       !iter_.iter()->IsKeyPinned() /* copy */);
               skipping_saved_key = true;
               PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
+              if (memtable_op_scan_flush_trigger_ && active_mem_ &&
+                  ikey_.sequence >= memtable_seqno_lb_ && !marked_for_flush &&
+                  ++mem_ops_scanned >= memtable_op_scan_flush_trigger_) {
+                active_mem_->MarkForFlush();
+                marked_for_flush = true;
+              }
             }
             break;
           case kTypeValue:
@@ -1792,21 +1810,4 @@ void DBIter::SeekToLast() {
         StripTimestampFromUserKey(saved_key_.GetUserKey(), timestamp_size_)));
   }
 }
-
-Iterator* NewDBIterator(Env* env, const ReadOptions& read_options,
-                        const ImmutableOptions& ioptions,
-                        const MutableCFOptions& mutable_cf_options,
-                        const Comparator* user_key_comparator,
-                        InternalIterator* internal_iter, const Version* version,
-                        const SequenceNumber& sequence,
-                        uint64_t max_sequential_skip_in_iterations,
-                        ReadCallback* read_callback,
-                        ColumnFamilyHandleImpl* cfh, bool expose_blob_index) {
-  DBIter* db_iter = new DBIter(
-      env, read_options, ioptions, mutable_cf_options, user_key_comparator,
-      internal_iter, version, sequence, false,
-      max_sequential_skip_in_iterations, read_callback, cfh, expose_blob_index);
-  return db_iter;
-}
-
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/db/db_iter.h b/db/db_iter.h
index 0f8074151507..3e67c9c4ce4a 100644
--- a/db/db_iter.h
+++ b/db/db_iter.h
@@ -57,6 +57,33 @@ class Version;
 // numbers, deletion markers, overwrites, etc.
 class DBIter final : public Iterator {
  public:
+  // Return a new DBIter that reads from `internal_iter` at the specified
+  // `sequence` number.
+  //
+  // @param active_mem Pointer to the active memtable that `internal_iter`
+  // is reading from. If not null, the memtable can be marked for flush
+  // according to option mutable_cf_options.memtable_op_scan_flush_trigger.
+  // @param arena_mode If true, the DBIter will be allocated from the arena.
+  static DBIter* NewIter(Env* env, const ReadOptions& read_options,
+                         const ImmutableOptions& ioptions,
+                         const MutableCFOptions& mutable_cf_options,
+                         const Comparator* user_key_comparator,
+                         InternalIterator* internal_iter,
+                         const Version* version, const SequenceNumber& sequence,
+                         ReadCallback* read_callback,
+                         ColumnFamilyHandleImpl* cfh = nullptr,
+                         bool expose_blob_index = false,
+                         ReadOnlyMemTable* active_mem = nullptr,
+                         Arena* arena = nullptr) {
+    void* mem = arena ? arena->AllocateAligned(sizeof(DBIter))
+                      : operator new(sizeof(DBIter));
+    DBIter* db_iter = new (mem)
+        DBIter(env, read_options, ioptions, mutable_cf_options,
+               user_key_comparator, internal_iter, version, sequence, arena,
+               read_callback, cfh, expose_blob_index, active_mem);
+    return db_iter;
+  }
+
   // The following is grossly complicated. TODO: clean it up
   // Which direction is the iterator currently moving?
   // (1) When moving forward:
@@ -113,14 +140,6 @@ class DBIter final : public Iterator {
     uint64_t skip_count_;
   };
 
-  DBIter(Env* _env, const ReadOptions& read_options,
-         const ImmutableOptions& ioptions,
-         const MutableCFOptions& mutable_cf_options, const Comparator* cmp,
-         InternalIterator* iter, const Version* version, SequenceNumber s,
-         bool arena_mode, uint64_t max_sequential_skip_in_iterations,
-         ReadCallback* read_callback, ColumnFamilyHandleImpl* cfh,
-         bool expose_blob_index);
-
   // No copying allowed
   DBIter(const DBIter&) = delete;
   void operator=(const DBIter&) = delete;
@@ -232,6 +251,14 @@ class DBIter final : public Iterator {
   }
 
  private:
+  DBIter(Env* _env, const ReadOptions& read_options,
+         const ImmutableOptions& ioptions,
+         const MutableCFOptions& mutable_cf_options, const Comparator* cmp,
+         InternalIterator* iter, const Version* version, SequenceNumber s,
+         bool arena_mode, ReadCallback* read_callback,
+         ColumnFamilyHandleImpl* cfh, bool expose_blob_index,
+         ReadOnlyMemTable* active_mem);
+
   class BlobReader {
    public:
     BlobReader(const Version* version, ReadTier read_tier,
@@ -436,6 +463,21 @@ class DBIter final : public Iterator {
   IterKey prefix_;
 
   Status status_;
+  Slice lazy_blob_index_;
+
+  // List of operands for merge operator.
+  MergeContext merge_context_;
+  LocalStatistics local_stats_;
+  PinnedIteratorsManager pinned_iters_mgr_;
+  ColumnFamilyHandleImpl* cfh_;
+  const Slice* const timestamp_ub_;
+  const Slice* const timestamp_lb_;
+  const size_t timestamp_size_;
+  std::string saved_timestamp_;
+  std::optional<std::vector<ScanOptions>> scan_opts_;
+  ReadOnlyMemTable* active_mem_;
+  SequenceNumber memtable_seqno_lb_;
+  uint32_t memtable_op_scan_flush_trigger_;
   Direction direction_;
   bool valid_;
   bool current_entry_is_merged_;
@@ -454,30 +496,7 @@ class DBIter final : public Iterator {
   // the stacked BlobDB implementation is used, false otherwise.
   bool expose_blob_index_;
   bool allow_unprepared_value_;
-  Slice lazy_blob_index_;
   bool is_blob_;
   bool arena_mode_;
-  // List of operands for merge operator.
-  MergeContext merge_context_;
-  LocalStatistics local_stats_;
-  PinnedIteratorsManager pinned_iters_mgr_;
-  ColumnFamilyHandleImpl* cfh_;
-  const Slice* const timestamp_ub_;
-  const Slice* const timestamp_lb_;
-  const size_t timestamp_size_;
-  std::string saved_timestamp_;
-  std::optional<std::vector<ScanOptions>> scan_opts_;
 };
-
-// Return a new iterator that converts internal keys (yielded by
-// "*internal_iter") that were live at the specified `sequence` number
-// into appropriate user keys.
-Iterator* NewDBIterator(
-    Env* env, const ReadOptions& read_options, const ImmutableOptions& ioptions,
-    const MutableCFOptions& mutable_cf_options,
-    const Comparator* user_key_comparator, InternalIterator* internal_iter,
-    const Version* version, const SequenceNumber& sequence,
-    uint64_t max_sequential_skip_in_iterations, ReadCallback* read_callback,
-    ColumnFamilyHandleImpl* cfh = nullptr, bool expose_blob_index = false);
-
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/db/db_iter_stress_test.cc b/db/db_iter_stress_test.cc
index daecbcc7acb6..234350601930 100644
--- a/db/db_iter_stress_test.cc
+++ b/db/db_iter_stress_test.cc
@@ -528,11 +528,10 @@ TEST_F(DBIteratorStressTest, StressTest) {
                   internal_iter->target_hidden_fraction =
                       target_hidden_fraction;
                   internal_iter->trace = trace;
-                  db_iter.reset(NewDBIterator(
+                  db_iter.reset(DBIter::NewIter(
                       env_, ropt, ImmutableOptions(options),
                       MutableCFOptions(options), BytewiseComparator(),
-                      internal_iter, nullptr /* version */, sequence,
-                      options.max_sequential_skip_in_iterations,
+                      internal_iter, /*version=*/nullptr, sequence,
                       nullptr /*read_callback*/));
                 }
 
diff --git a/db/db_iter_test.cc b/db/db_iter_test.cc
index cf8321808f9f..55ddb08d6835 100644
--- a/db/db_iter_test.cc
+++ b/db/db_iter_test.cc
@@ -259,10 +259,9 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) {
     internal_iter->Finish();
 
     ReadOptions ro;
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 10 /* sequence */,
-        options.max_sequential_skip_in_iterations,
         nullptr /* read_callback */));
 
     db_iter->SeekToLast();
@@ -294,10 +293,9 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) {
     internal_iter->Finish();
 
     ReadOptions ro;
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 10 /* sequence */,
-        options.max_sequential_skip_in_iterations,
         nullptr /* read_callback */));
 
     db_iter->SeekToLast();
@@ -322,10 +320,9 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) {
     ReadOptions ro;
     ro.iterate_upper_bound = &prefix;
 
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 10 /* sequence */,
-        options.max_sequential_skip_in_iterations,
         nullptr /* read_callback */));
 
     db_iter->SeekToLast();
@@ -356,10 +353,9 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) {
     ReadOptions ro;
     ro.iterate_upper_bound = &prefix;
 
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 10 /* sequence */,
-        options.max_sequential_skip_in_iterations,
         nullptr /* read_callback */));
 
     db_iter->SeekToLast();
@@ -393,10 +389,9 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) {
     ReadOptions ro;
     ro.iterate_upper_bound = &prefix;
 
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 10 /* sequence */,
-        options.max_sequential_skip_in_iterations,
         nullptr /* read_callback */));
 
     db_iter->SeekToLast();
@@ -425,10 +420,9 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) {
     ReadOptions ro;
     ro.iterate_upper_bound = &prefix;
 
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 7 /* sequence */,
-        options.max_sequential_skip_in_iterations,
         nullptr /* read_callback */));
 
     SetPerfLevel(kEnableCount);
@@ -465,10 +459,9 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) {
     ReadOptions ro;
     ro.iterate_upper_bound = &prefix;
 
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 4 /* sequence */,
-        options.max_sequential_skip_in_iterations,
         nullptr /* read_callback */));
 
     db_iter->SeekToLast();
@@ -492,10 +485,9 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) {
     ReadOptions ro;
     ro.iterate_upper_bound = &prefix;
 
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 10 /* sequence */,
-        options.max_sequential_skip_in_iterations,
         nullptr /* read_callback */));
 
     db_iter->SeekToLast();
@@ -517,10 +509,9 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) {
     ReadOptions ro;
     ro.iterate_upper_bound = &prefix;
 
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 10 /* sequence */,
-        options.max_sequential_skip_in_iterations,
         nullptr /* read_callback */));
 
     db_iter->SeekToLast();
@@ -554,10 +545,9 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) {
     ReadOptions ro;
     ro.iterate_upper_bound = &prefix;
 
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 7 /* sequence */,
-        options.max_sequential_skip_in_iterations,
         nullptr /* read_callback */));
 
     SetPerfLevel(kEnableCount);
@@ -586,10 +576,9 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) {
     internal_iter->Finish();
 
     ReadOptions ro;
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 10 /* sequence */,
-        options.max_sequential_skip_in_iterations,
         nullptr /* read_callback */));
 
     db_iter->SeekToFirst();
@@ -631,10 +620,9 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) {
     internal_iter->Finish();
 
     ReadOptions ro;
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 2 /* sequence */,
-        options.max_sequential_skip_in_iterations,
         nullptr /* read_callback */));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
@@ -664,10 +652,9 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) {
     internal_iter->Finish();
 
     ReadOptions ro;
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 10 /* sequence */,
-        options.max_sequential_skip_in_iterations,
         nullptr /* read_callback */));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
@@ -696,10 +683,9 @@ TEST_F(DBIteratorTest, DBIteratorEmpty) {
     TestIterator* internal_iter = new TestIterator(BytewiseComparator());
     internal_iter->Finish();
 
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 0 /* sequence */,
-        options.max_sequential_skip_in_iterations,
         nullptr /* read_callback */));
     db_iter->SeekToLast();
     ASSERT_TRUE(!db_iter->Valid());
@@ -710,10 +696,9 @@ TEST_F(DBIteratorTest, DBIteratorEmpty) {
     TestIterator* internal_iter = new TestIterator(BytewiseComparator());
     internal_iter->Finish();
 
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 0 /* sequence */,
-        options.max_sequential_skip_in_iterations,
         nullptr /* read_callback */));
     db_iter->SeekToFirst();
     ASSERT_TRUE(!db_iter->Valid());
@@ -735,11 +720,10 @@ TEST_F(DBIteratorTest, DBIteratorUseSkipCountSkips) {
   }
   internal_iter->Finish();
 
-  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+  std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
       env_, ro, ImmutableOptions(options), MutableCFOptions(options),
       BytewiseComparator(), internal_iter, nullptr /* version */,
-      2 /* sequence */, options.max_sequential_skip_in_iterations,
-      nullptr /* read_callback */));
+      2 /* sequence */, nullptr /* read_callback */));
   db_iter->SeekToLast();
   ASSERT_TRUE(db_iter->Valid());
   ASSERT_EQ(db_iter->key().ToString(), "c");
@@ -782,10 +766,10 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) {
       internal_iter->Finish();
 
       options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
-      std::unique_ptr<Iterator> db_iter(NewDBIterator(
+      std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
           env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
           internal_iter, nullptr /* version */, i + 2 /* sequence */,
-          options.max_sequential_skip_in_iterations,
+
           nullptr /* read_callback */));
       db_iter->SeekToLast();
       ASSERT_TRUE(db_iter->Valid());
@@ -820,10 +804,10 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) {
       internal_iter->AddPut("c", "200");
       internal_iter->Finish();
 
-      std::unique_ptr<Iterator> db_iter(NewDBIterator(
+      std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
           env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
           internal_iter, nullptr /* version */, i + 2 /* sequence */,
-          options.max_sequential_skip_in_iterations,
+
           nullptr /* read_callback */));
       db_iter->SeekToLast();
       ASSERT_TRUE(db_iter->Valid());
@@ -851,10 +835,10 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) {
       internal_iter->AddPut("c", "200");
       internal_iter->Finish();
 
-      std::unique_ptr<Iterator> db_iter(NewDBIterator(
+      std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
           env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
           internal_iter, nullptr /* version */, 202 /* sequence */,
-          options.max_sequential_skip_in_iterations,
+
           nullptr /* read_callback */));
       db_iter->SeekToLast();
       ASSERT_TRUE(db_iter->Valid());
@@ -886,10 +870,10 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) {
       }
       internal_iter->AddPut("c", "200");
       internal_iter->Finish();
-      std::unique_ptr<Iterator> db_iter(NewDBIterator(
+      std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
           env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
           internal_iter, nullptr /* version */, i /* sequence */,
-          options.max_sequential_skip_in_iterations,
+
           nullptr /* read_callback */));
       db_iter->SeekToLast();
       ASSERT_TRUE(!db_iter->Valid());
@@ -906,10 +890,9 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) {
     }
     internal_iter->AddPut("c", "200");
     internal_iter->Finish();
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 200 /* sequence */,
-        options.max_sequential_skip_in_iterations,
         nullptr /* read_callback */));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
@@ -944,10 +927,10 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) {
       }
       internal_iter->Finish();
 
-      std::unique_ptr<Iterator> db_iter(NewDBIterator(
+      std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
           env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
           internal_iter, nullptr /* version */, i + 2 /* sequence */,
-          options.max_sequential_skip_in_iterations,
+
           nullptr /* read_callback */));
       db_iter->SeekToLast();
       ASSERT_TRUE(db_iter->Valid());
@@ -981,10 +964,10 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) {
       }
       internal_iter->Finish();
 
-      std::unique_ptr<Iterator> db_iter(NewDBIterator(
+      std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
           env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
           internal_iter, nullptr /* version */, i + 2 /* sequence */,
-          options.max_sequential_skip_in_iterations,
+
           nullptr /* read_callback */));
       db_iter->SeekToLast();
       ASSERT_TRUE(db_iter->Valid());
@@ -1033,10 +1016,9 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) {
     internal_iter->Finish();
 
     ro.max_skippable_internal_keys = 0;
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 10 /* sequence */,
-        options.max_sequential_skip_in_iterations,
         nullptr /* read_callback */));
 
     db_iter->SeekToFirst();
@@ -1081,10 +1063,9 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) {
     internal_iter->Finish();
 
     ro.max_skippable_internal_keys = 2;
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 10 /* sequence */,
-        options.max_sequential_skip_in_iterations,
         nullptr /* read_callback */));
 
     db_iter->SeekToFirst();
@@ -1127,10 +1108,9 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) {
     internal_iter->Finish();
 
     ro.max_skippable_internal_keys = 2;
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 10 /* sequence */,
-        options.max_sequential_skip_in_iterations,
         nullptr /* read_callback */));
 
     db_iter->SeekToFirst();
@@ -1167,10 +1147,9 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) {
     internal_iter->Finish();
 
     ro.max_skippable_internal_keys = 2;
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 10 /* sequence */,
-        options.max_sequential_skip_in_iterations,
         nullptr /* read_callback */));
 
     db_iter->SeekToFirst();
@@ -1204,10 +1183,9 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) {
     internal_iter->Finish();
 
     ro.max_skippable_internal_keys = 2;
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 10 /* sequence */,
-        options.max_sequential_skip_in_iterations,
         nullptr /* read_callback */));
 
     db_iter->SeekToLast();
@@ -1236,10 +1214,9 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) {
     internal_iter->Finish();
 
     ro.max_skippable_internal_keys = 2;
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 10 /* sequence */,
-        options.max_sequential_skip_in_iterations,
         nullptr /* read_callback */));
 
     db_iter->SeekToFirst();
@@ -1275,10 +1252,9 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) {
     internal_iter->Finish();
 
     ro.max_skippable_internal_keys = 2;
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 10 /* sequence */,
-        options.max_sequential_skip_in_iterations,
         nullptr /* read_callback */));
 
     db_iter->SeekToFirst();
@@ -1314,10 +1290,10 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) {
       internal_iter->Finish();
 
       ro.max_skippable_internal_keys = i;
-      std::unique_ptr<Iterator> db_iter(NewDBIterator(
+      std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
           env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
           internal_iter, nullptr /* version */, 2 * i + 1 /* sequence */,
-          options.max_sequential_skip_in_iterations,
+
           nullptr /* read_callback */));
 
       db_iter->SeekToFirst();
@@ -1369,10 +1345,9 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) {
 
       options.max_sequential_skip_in_iterations = 1000;
       ro.max_skippable_internal_keys = i;
-      std::unique_ptr<Iterator> db_iter(NewDBIterator(
-          env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+      std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
+          env_, ro, ioptions, MutableCFOptions(options), BytewiseComparator(),
           internal_iter, nullptr /* version */, 2 * i + 1 /* sequence */,
-          options.max_sequential_skip_in_iterations,
           nullptr /* read_callback */));
 
       db_iter->SeekToFirst();
@@ -1412,11 +1387,11 @@ TEST_F(DBIteratorTest, DBIteratorTimedPutBasic) {
   internal_iter->AddTimedPut("d", "3", /*write_unix_time=*/0);
   internal_iter->Finish();
 
-  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+  options.max_sequential_skip_in_iterations = 1;
+  std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
       env_, ro, ImmutableOptions(options), MutableCFOptions(options),
       BytewiseComparator(), internal_iter, nullptr /* version */,
-      7 /* sequence */, /*max_sequential_skip_in_iterations*/ 1,
-      nullptr /* read_callback */));
+      7 /* sequence */, nullptr /* read_callback */));
   db_iter->SeekToFirst();
   ASSERT_TRUE(db_iter->Valid());
   ASSERT_EQ(db_iter->key().ToString(), "a");
@@ -1463,11 +1438,10 @@ TEST_F(DBIteratorTest, DBIterator1) {
   internal_iter->AddMerge("b", "2");
   internal_iter->Finish();
 
-  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+  std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
       env_, ro, ImmutableOptions(options), MutableCFOptions(options),
       BytewiseComparator(), internal_iter, nullptr /* version */,
-      1 /* sequence */, options.max_sequential_skip_in_iterations,
-      nullptr /* read_callback */));
+      1 /* sequence */, nullptr /* read_callback */));
   db_iter->SeekToFirst();
   ASSERT_TRUE(db_iter->Valid());
   ASSERT_EQ(db_iter->key().ToString(), "a");
@@ -1493,11 +1467,10 @@ TEST_F(DBIteratorTest, DBIterator2) {
   internal_iter->AddMerge("b", "2");
   internal_iter->Finish();
 
-  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+  std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
       env_, ro, ImmutableOptions(options), MutableCFOptions(options),
       BytewiseComparator(), internal_iter, nullptr /* version */,
-      0 /* sequence */, options.max_sequential_skip_in_iterations,
-      nullptr /* read_callback */));
+      0 /* sequence */, nullptr /* read_callback */));
   db_iter->SeekToFirst();
   ASSERT_TRUE(db_iter->Valid());
   ASSERT_EQ(db_iter->key().ToString(), "a");
@@ -1519,11 +1492,10 @@ TEST_F(DBIteratorTest, DBIterator3) {
   internal_iter->AddMerge("b", "2");
   internal_iter->Finish();
 
-  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+  std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
       env_, ro, ImmutableOptions(options), MutableCFOptions(options),
       BytewiseComparator(), internal_iter, nullptr /* version */,
-      2 /* sequence */, options.max_sequential_skip_in_iterations,
-      nullptr /* read_callback */));
+      2 /* sequence */, nullptr /* read_callback */));
   db_iter->SeekToFirst();
   ASSERT_TRUE(db_iter->Valid());
   ASSERT_EQ(db_iter->key().ToString(), "a");
@@ -1545,11 +1517,10 @@ TEST_F(DBIteratorTest, DBIterator4) {
   internal_iter->AddMerge("b", "2");
   internal_iter->Finish();
 
-  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+  std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
       env_, ro, ImmutableOptions(options), MutableCFOptions(options),
       BytewiseComparator(), internal_iter, nullptr /* version */,
-      4 /* sequence */, options.max_sequential_skip_in_iterations,
-      nullptr /* read_callback */));
+      4 /* sequence */, nullptr /* read_callback */));
   db_iter->SeekToFirst();
   ASSERT_TRUE(db_iter->Valid());
   ASSERT_EQ(db_iter->key().ToString(), "a");
@@ -1580,10 +1551,9 @@ TEST_F(DBIteratorTest, DBIterator5) {
     internal_iter->AddMerge("a", "merge_6");
     internal_iter->Finish();
 
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 0 /* sequence */,
-        options.max_sequential_skip_in_iterations,
         nullptr /* read_callback */));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
@@ -1605,10 +1575,9 @@ TEST_F(DBIteratorTest, DBIterator5) {
     internal_iter->AddMerge("a", "merge_6");
     internal_iter->Finish();
 
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 1 /* sequence */,
-        options.max_sequential_skip_in_iterations,
         nullptr /* read_callback */));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
@@ -1630,10 +1599,9 @@ TEST_F(DBIteratorTest, DBIterator5) {
     internal_iter->AddMerge("a", "merge_6");
     internal_iter->Finish();
 
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 2 /* sequence */,
-        options.max_sequential_skip_in_iterations,
         nullptr /* read_callback */));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
@@ -1655,10 +1623,9 @@ TEST_F(DBIteratorTest, DBIterator5) {
     internal_iter->AddMerge("a", "merge_6");
     internal_iter->Finish();
 
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 3 /* sequence */,
-        options.max_sequential_skip_in_iterations,
         nullptr /* read_callback */));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
@@ -1680,10 +1647,9 @@ TEST_F(DBIteratorTest, DBIterator5) {
     internal_iter->AddMerge("a", "merge_6");
     internal_iter->Finish();
 
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 4 /* sequence */,
-        options.max_sequential_skip_in_iterations,
         nullptr /* read_callback */));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
@@ -1705,10 +1671,9 @@ TEST_F(DBIteratorTest, DBIterator5) {
     internal_iter->AddMerge("a", "merge_6");
     internal_iter->Finish();
 
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 5 /* sequence */,
-        options.max_sequential_skip_in_iterations,
         nullptr /* read_callback */));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
@@ -1730,10 +1695,9 @@ TEST_F(DBIteratorTest, DBIterator5) {
     internal_iter->AddMerge("a", "merge_6");
     internal_iter->Finish();
 
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 6 /* sequence */,
-        options.max_sequential_skip_in_iterations,
         nullptr /* read_callback */));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
@@ -1753,10 +1717,9 @@ TEST_F(DBIteratorTest, DBIterator5) {
     internal_iter->AddMerge("a", "merge_2");
     internal_iter->AddPut("b", "val_b");
     internal_iter->Finish();
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 10 /* sequence */,
-        options.max_sequential_skip_in_iterations,
         nullptr /* read_callback */));
     db_iter->Seek("b");
     ASSERT_TRUE(db_iter->Valid());
@@ -1785,10 +1748,9 @@ TEST_F(DBIteratorTest, DBIterator6) {
     internal_iter->AddMerge("a", "merge_6");
     internal_iter->Finish();
 
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 0 /* sequence */,
-        options.max_sequential_skip_in_iterations,
         nullptr /* read_callback */));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
@@ -1810,10 +1772,9 @@ TEST_F(DBIteratorTest, DBIterator6) {
     internal_iter->AddMerge("a", "merge_6");
     internal_iter->Finish();
 
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 1 /* sequence */,
-        options.max_sequential_skip_in_iterations,
         nullptr /* read_callback */));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
@@ -1835,10 +1796,9 @@ TEST_F(DBIteratorTest, DBIterator6) {
     internal_iter->AddMerge("a", "merge_6");
     internal_iter->Finish();
 
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 2 /* sequence */,
-        options.max_sequential_skip_in_iterations,
         nullptr /* read_callback */));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
@@ -1860,10 +1820,9 @@ TEST_F(DBIteratorTest, DBIterator6) {
     internal_iter->AddMerge("a", "merge_6");
     internal_iter->Finish();
 
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 3 /* sequence */,
-        options.max_sequential_skip_in_iterations,
         nullptr /* read_callback */));
     db_iter->SeekToLast();
     ASSERT_TRUE(!db_iter->Valid());
@@ -1881,10 +1840,9 @@ TEST_F(DBIteratorTest, DBIterator6) {
     internal_iter->AddMerge("a", "merge_6");
     internal_iter->Finish();
 
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 4 /* sequence */,
-        options.max_sequential_skip_in_iterations,
         nullptr /* read_callback */));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
@@ -1906,10 +1864,9 @@ TEST_F(DBIteratorTest, DBIterator6) {
     internal_iter->AddMerge("a", "merge_6");
     internal_iter->Finish();
 
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 5 /* sequence */,
-        options.max_sequential_skip_in_iterations,
         nullptr /* read_callback */));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
@@ -1931,10 +1888,9 @@ TEST_F(DBIteratorTest, DBIterator6) {
     internal_iter->AddMerge("a", "merge_6");
     internal_iter->Finish();
 
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 6 /* sequence */,
-        options.max_sequential_skip_in_iterations,
         nullptr /* read_callback */));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
@@ -1976,10 +1932,9 @@ TEST_F(DBIteratorTest, DBIterator7) {
     internal_iter->AddDeletion("c");
     internal_iter->Finish();
 
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 0 /* sequence */,
-        options.max_sequential_skip_in_iterations,
         nullptr /* read_callback */));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
@@ -2013,10 +1968,9 @@ TEST_F(DBIteratorTest, DBIterator7) {
     internal_iter->AddDeletion("c");
     internal_iter->Finish();
 
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 2 /* sequence */,
-        options.max_sequential_skip_in_iterations,
         nullptr /* read_callback */));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
@@ -2056,10 +2010,9 @@ TEST_F(DBIteratorTest, DBIterator7) {
     internal_iter->AddDeletion("c");
     internal_iter->Finish();
 
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 4 /* sequence */,
-        options.max_sequential_skip_in_iterations,
         nullptr /* read_callback */));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
@@ -2099,10 +2052,9 @@ TEST_F(DBIteratorTest, DBIterator7) {
     internal_iter->AddDeletion("c");
     internal_iter->Finish();
 
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 5 /* sequence */,
-        options.max_sequential_skip_in_iterations,
         nullptr /* read_callback */));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
@@ -2147,10 +2099,9 @@ TEST_F(DBIteratorTest, DBIterator7) {
     internal_iter->AddDeletion("c");
     internal_iter->Finish();
 
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 6 /* sequence */,
-        options.max_sequential_skip_in_iterations,
         nullptr /* read_callback */));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
@@ -2196,10 +2147,9 @@ TEST_F(DBIteratorTest, DBIterator7) {
     internal_iter->AddDeletion("c");
     internal_iter->Finish();
 
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 7 /* sequence */,
-        options.max_sequential_skip_in_iterations,
         nullptr /* read_callback */));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
@@ -2239,10 +2189,9 @@ TEST_F(DBIteratorTest, DBIterator7) {
     internal_iter->AddDeletion("c");
     internal_iter->Finish();
 
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 9 /* sequence */,
-        options.max_sequential_skip_in_iterations,
         nullptr /* read_callback */));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
@@ -2288,10 +2237,9 @@ TEST_F(DBIteratorTest, DBIterator7) {
     internal_iter->AddDeletion("c");
     internal_iter->Finish();
 
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 13 /* sequence */,
-        options.max_sequential_skip_in_iterations,
         nullptr /* read_callback */));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
@@ -2338,10 +2286,9 @@ TEST_F(DBIteratorTest, DBIterator7) {
     internal_iter->AddDeletion("c");
     internal_iter->Finish();
 
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 14 /* sequence */,
-        options.max_sequential_skip_in_iterations,
         nullptr /* read_callback */));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
@@ -2371,11 +2318,10 @@ TEST_F(DBIteratorTest, DBIterator8) {
   internal_iter->AddPut("b", "0");
   internal_iter->Finish();
 
-  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+  std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
       env_, ro, ImmutableOptions(options), MutableCFOptions(options),
       BytewiseComparator(), internal_iter, nullptr /* version */,
-      10 /* sequence */, options.max_sequential_skip_in_iterations,
-      nullptr /* read_callback */));
+      10 /* sequence */, nullptr /* read_callback */));
   db_iter->SeekToLast();
   ASSERT_TRUE(db_iter->Valid());
   ASSERT_EQ(db_iter->key().ToString(), "b");
@@ -2403,11 +2349,10 @@ TEST_F(DBIteratorTest, DBIterator9) {
     internal_iter->AddMerge("d", "merge_6");
     internal_iter->Finish();
 
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ImmutableOptions(options), MutableCFOptions(options),
         BytewiseComparator(), internal_iter, nullptr /* version */,
-        10 /* sequence */, options.max_sequential_skip_in_iterations,
-        nullptr /* read_callback */));
+        10 /* sequence */, nullptr /* read_callback */));
 
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
@@ -2471,11 +2416,10 @@ TEST_F(DBIteratorTest, DBIterator10) {
   internal_iter->AddPut("d", "4");
   internal_iter->Finish();
 
-  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+  std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
       env_, ro, ImmutableOptions(options), MutableCFOptions(options),
       BytewiseComparator(), internal_iter, nullptr /* version */,
-      10 /* sequence */, options.max_sequential_skip_in_iterations,
-      nullptr /* read_callback */));
+      10 /* sequence */, nullptr /* read_callback */));
 
   db_iter->Seek("c");
   ASSERT_TRUE(db_iter->Valid());
@@ -2512,7 +2456,7 @@ TEST_F(DBIteratorTest, SeekToLastOccurrenceSeq0) {
   internal_iter->AddPut("b", "2");
   internal_iter->Finish();
 
-  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+  std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
       env_, ro, ImmutableOptions(options), MutableCFOptions(options),
       BytewiseComparator(), internal_iter, nullptr /* version */,
       10 /* sequence */, 0 /* force seek */, nullptr /* read_callback */));
@@ -2542,11 +2486,10 @@ TEST_F(DBIteratorTest, DBIterator11) {
   internal_iter->AddMerge("b", "2");
   internal_iter->Finish();
 
-  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+  std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
       env_, ro, ImmutableOptions(options), MutableCFOptions(options),
       BytewiseComparator(), internal_iter, nullptr /* version */,
-      1 /* sequence */, options.max_sequential_skip_in_iterations,
-      nullptr /* read_callback */));
+      1 /* sequence */, nullptr /* read_callback */));
   db_iter->SeekToFirst();
   ASSERT_TRUE(db_iter->Valid());
   ASSERT_EQ(db_iter->key().ToString(), "a");
@@ -2571,7 +2514,7 @@ TEST_F(DBIteratorTest, DBIterator12) {
   internal_iter->AddSingleDeletion("b");
   internal_iter->Finish();
 
-  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+  std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
       env_, ro, ImmutableOptions(options), MutableCFOptions(options),
       BytewiseComparator(), internal_iter, nullptr /* version */,
       10 /* sequence */, 0 /* force seek */, nullptr /* read_callback */));
@@ -2610,11 +2553,11 @@ TEST_F(DBIteratorTest, DBIterator13) {
   internal_iter->AddPut(key, "8");
   internal_iter->Finish();
 
-  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+  options.max_sequential_skip_in_iterations = 3;
+  std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
       env_, ro, ImmutableOptions(options), MutableCFOptions(options),
       BytewiseComparator(), internal_iter, nullptr /* version */,
-      2 /* sequence */, 3 /* max_sequential_skip_in_iterations */,
-      nullptr /* read_callback */));
+      2 /* sequence */, nullptr /* read_callback */));
   db_iter->Seek("b");
   ASSERT_TRUE(db_iter->Valid());
   ASSERT_EQ(db_iter->key().ToString(), key);
@@ -2640,11 +2583,11 @@ TEST_F(DBIteratorTest, DBIterator14) {
   internal_iter->AddPut("c", "9");
   internal_iter->Finish();
 
-  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+  options.max_sequential_skip_in_iterations = 1;
+  std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
       env_, ro, ImmutableOptions(options), MutableCFOptions(options),
       BytewiseComparator(), internal_iter, nullptr /* version */,
-      4 /* sequence */, 1 /* max_sequential_skip_in_iterations */,
-      nullptr /* read_callback */));
+      4 /* sequence */, nullptr /* read_callback */));
   db_iter->Seek("b");
   ASSERT_TRUE(db_iter->Valid());
   ASSERT_EQ(db_iter->key().ToString(), "b");
@@ -2680,11 +2623,11 @@ class DBIterWithMergeIterTest : public testing::Test {
     InternalIterator* merge_iter =
         NewMergingIterator(&icomp_, child_iters.data(), 2u);
 
-    db_iter_.reset(NewDBIterator(
+    options_.max_sequential_skip_in_iterations = 3;
+    db_iter_.reset(DBIter::NewIter(
         env_, ro_, ImmutableOptions(options_), MutableCFOptions(options_),
         BytewiseComparator(), merge_iter, nullptr /* version */,
-        8 /* read data earlier than seqId 8 */,
-        3 /* max iterators before reseek */, nullptr /* read_callback */));
+        8 /* read data earlier than seqId 8 */, nullptr /* read_callback */));
   }
 
   Env* env_;
@@ -3120,11 +3063,10 @@ TEST_F(DBIteratorTest, SeekPrefixTombstones) {
   internal_iter->Finish();
 
   ro.prefix_same_as_start = true;
-  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+  std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
       env_, ro, ImmutableOptions(options), MutableCFOptions(options),
       BytewiseComparator(), internal_iter, nullptr /* version */,
-      10 /* sequence */, options.max_sequential_skip_in_iterations,
-      nullptr /* read_callback */));
+      10 /* sequence */, nullptr /* read_callback */));
 
   int skipped_keys = 0;
 
@@ -3157,11 +3099,10 @@ TEST_F(DBIteratorTest, SeekToFirstLowerBound) {
     Slice lower_bound(lower_bound_str);
     ro.iterate_lower_bound = &lower_bound;
     Options options;
-    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ImmutableOptions(options), MutableCFOptions(options),
         BytewiseComparator(), internal_iter, nullptr /* version */,
-        10 /* sequence */, options.max_sequential_skip_in_iterations,
-        nullptr /* read_callback */));
+        10 /* sequence */, nullptr /* read_callback */));
 
     db_iter->SeekToFirst();
     if (i == kNumKeys + 1) {
@@ -3197,11 +3138,10 @@ TEST_F(DBIteratorTest, PrevLowerBound) {
   Slice lower_bound(lower_bound_str);
   ro.iterate_lower_bound = &lower_bound;
   Options options;
-  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+  std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
       env_, ro, ImmutableOptions(options), MutableCFOptions(options),
       BytewiseComparator(), internal_iter, nullptr /* version */,
-      10 /* sequence */, options.max_sequential_skip_in_iterations,
-      nullptr /* read_callback */));
+      10 /* sequence */, nullptr /* read_callback */));
 
   db_iter->SeekToLast();
   for (int i = kNumKeys; i >= kLowerBound; --i) {
@@ -3226,11 +3166,10 @@ TEST_F(DBIteratorTest, SeekLessLowerBound) {
   Slice lower_bound(lower_bound_str);
   ro.iterate_lower_bound = &lower_bound;
   Options options;
-  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+  std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
       env_, ro, ImmutableOptions(options), MutableCFOptions(options),
       BytewiseComparator(), internal_iter, nullptr /* version */,
-      10 /* sequence */, options.max_sequential_skip_in_iterations,
-      nullptr /* read_callback */));
+      10 /* sequence */, nullptr /* read_callback */));
 
   auto before_lower_bound_str = std::to_string(kLowerBound - 1);
   Slice before_lower_bound(lower_bound_str);
@@ -3252,11 +3191,10 @@ TEST_F(DBIteratorTest, ReverseToForwardWithDisappearingKeys) {
   }
   internal_iter->Finish();
 
-  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+  std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
       env_, ReadOptions(), ImmutableOptions(options), MutableCFOptions(options),
       BytewiseComparator(), internal_iter, nullptr /* version */,
-      10 /* sequence */, options.max_sequential_skip_in_iterations,
-      nullptr /* read_callback */));
+      10 /* sequence */, nullptr /* read_callback */));
 
   db_iter->SeekForPrev("a");
   ASSERT_TRUE(db_iter->Valid());
diff --git a/db/db_iterator_test.cc b/db/db_iterator_test.cc
index ad3afd17f4f2..ccb4ff188ab4 100644
--- a/db/db_iterator_test.cc
+++ b/db/db_iterator_test.cc
@@ -3824,6 +3824,166 @@ TEST_F(DBIteratorTest, IteratorsConsistentViewExplicitSnapshot) {
   }
 }
 
+TEST_P(DBIteratorTest, MemtableOpsScanFlushTriggerWithSeek) {
+  // Tests that option memtable_op_scan_flush_trigger works when the limit
+  // is reached during a Seek() operation.
+  const int kTrigger = 10;
+  Random* r = Random::GetTLSInstance();
+
+  for (int trigger : {kTrigger, kTrigger + 1}) {
+    for (bool delete_only : {false, true}) {
+      Options options;
+      options.create_if_missing = true;
+      options.memtable_op_scan_flush_trigger = trigger;
+      options.level_compaction_dynamic_level_bytes = true;
+      DestroyAndReopen(options);
+
+      // Base data that will be covered by a consecutive sequence of tombstones.
+      int kNumKeys = delete_only ? kTrigger : kTrigger / 2;
+      for (int i = 0; i < kNumKeys; ++i) {
+        ASSERT_OK(Put(Key(i), r->RandomString(100)));
+      }
+      ASSERT_OK(Flush());
+      ASSERT_OK(db_->CompactRange({}, nullptr, nullptr));
+      ASSERT_EQ(1, NumTableFilesAtLevel(6));
+
+      if (delete_only) {
+        for (int i = 0; i < kNumKeys; ++i) {
+          ASSERT_OK(SingleDelete(Key(i)));
+        }
+      } else {
+        for (int i = 0; i < kNumKeys; ++i) {
+          ASSERT_OK(Put(Key(i), r->RandomString(100)));
+        }
+        for (int i = 0; i < kNumKeys; ++i) {
+          ASSERT_OK(Delete(Key(i)));
+        }
+      }
+
+      SetPerfLevel(PerfLevel::kEnableCount);
+      get_perf_context()->Reset();
+      ReadOptions ro;
+      std::unique_ptr<Iterator> iter(db_->NewIterator(ro));
+
+      // Seek to the first key, this will scan through all the tombstones and
+      // hidden puts
+      iter->Seek(Key(0));
+      ASSERT_FALSE(
+          iter->Valid());  // All keys are deleted, so iterator is not valid
+      ASSERT_OK(iter->status());
+      ASSERT_EQ(get_perf_context()->next_on_memtable_count, kTrigger);
+
+      // Skipping kNumTrigger memtable entries in a single iterator operation
+      // should mark the memtable for flush.
+      //
+      // At the end of a write, we check and update memtable to request a flush
+      ASSERT_OK(Put(Key(11), "val"));
+      // Before a write, we schedule memtables for flush if requested.
+      ASSERT_OK(Put(Key(12), "val"));
+      ASSERT_OK(db_->WaitForCompact({}));
+
+      if (trigger <= kTrigger) {
+        // Check if memtable was flushed due to scan trigger
+        ASSERT_EQ(1, NumTableFilesAtLevel(0));
+        uint64_t val = 0;
+        ASSERT_TRUE(
+            db_->GetIntProperty("rocksdb.num-deletes-active-mem-table", &val));
+        ASSERT_EQ(0, val);
+      } else {
+        uint64_t val = 0;
+        ASSERT_TRUE(
+            db_->GetIntProperty("rocksdb.num-deletes-active-mem-table", &val));
+        ASSERT_EQ(kNumKeys, val);
+      }
+    }
+  }
+}
+
+TEST_P(DBIteratorTest, MemtableOpsScanFlushTriggerWithNext) {
+  // Tests that option memtable_op_scan_flush_trigger works when the limit
+  // is reached during a Next() operation, and not trigger a flush when
+  // the limit is reached across multiple Next() operations.
+  const int kTrigger = 10;
+  Random* r = Random::GetTLSInstance();
+
+  for (int trigger : {kTrigger, kTrigger + 1}) {
+    for (bool delete_only : {false, true}) {
+      Options options;
+      options.create_if_missing = true;
+      options.memtable_op_scan_flush_trigger = trigger;
+      options.level_compaction_dynamic_level_bytes = true;
+      DestroyAndReopen(options);
+
+      // Base data that will be covered by a consecutive sequence of tombstones.
+      int kNumKeys = delete_only ? kTrigger : kTrigger / 2;
+      for (int i = 0; i <= kNumKeys; ++i) {
+        ASSERT_OK(Put(Key(i), r->RandomString(100)));
+      }
+      ASSERT_OK(Flush());
+      ASSERT_OK(db_->CompactRange({}, nullptr, nullptr));
+      ASSERT_EQ(1, NumTableFilesAtLevel(6));
+
+      ASSERT_OK(Put(Key(0), "val"));
+      if (delete_only) {
+        for (int i = 1; i <= kNumKeys; ++i) {
+          ASSERT_OK(SingleDelete(Key(i)));
+        }
+      } else {
+        for (int i = 1; i <= kNumKeys; ++i) {
+          ASSERT_OK(Put(Key(i), r->RandomString(100)));
+        }
+        for (int i = 1; i <= kNumKeys; ++i) {
+          ASSERT_OK(Delete(Key(i)));
+        }
+      }
+
+      // Total number of tombstones and hidden puts scanned across multiple
+      // Next() operations below will be kTrigger, and it should not trigger a
+      // flush when the limit is kTrigger + 1.
+      ASSERT_OK(Put(Key(kNumKeys + 1), "v1"));
+      ASSERT_OK(Delete(Key(kNumKeys + 2)));
+      ASSERT_OK(Put(Key(kNumKeys + 3), "v3"));
+
+      SetPerfLevel(PerfLevel::kEnableCount);
+      get_perf_context()->Reset();
+      ReadOptions ro;
+      std::unique_ptr<Iterator> iter(db_->NewIterator(ro));
+      iter->Seek(Key(0));
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ(iter->value(), "val");
+      ASSERT_OK(iter->status());
+      ASSERT_EQ(get_perf_context()->next_on_memtable_count, 0);
+      iter->Next();
+      // kTrigger tombstones and invisible puts and 1 for the visible put
+      ASSERT_EQ(get_perf_context()->next_on_memtable_count, kTrigger + 1);
+      iter->Next();
+      ASSERT_EQ(get_perf_context()->next_on_memtable_count, kTrigger + 3);
+
+      // Skipping kNumTrigger memtable entries in a single iterator operation
+      // should mark the memtable for flush.
+      //
+      // At the end of a write, we check and update memtable to request a flush
+      ASSERT_OK(Put(Key(11), "val"));
+      // Before a write, we schedule memtables for flush if requested.
+      ASSERT_OK(Put(Key(12), "val"));
+      ASSERT_OK(db_->WaitForCompact({}));
+
+      if (trigger <= kTrigger) {
+        // Check if memtable was flushed due to scan trigger
+        ASSERT_EQ(1, NumTableFilesAtLevel(0));
+        uint64_t val = 0;
+        ASSERT_TRUE(
+            db_->GetIntProperty("rocksdb.num-deletes-active-mem-table", &val));
+        ASSERT_EQ(0, val);
+      } else {
+        uint64_t val = 0;
+        ASSERT_TRUE(
+            db_->GetIntProperty("rocksdb.num-deletes-active-mem-table", &val));
+        ASSERT_EQ(kNumKeys + 1, val);
+      }
+    }
+  }
+}
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/db/flush_job.cc b/db/flush_job.cc
index 366f33a6b47c..8808315857d4 100644
--- a/db/flush_job.cc
+++ b/db/flush_job.cc
@@ -901,9 +901,9 @@ Status FlushJob::WriteLevel0Table() {
     for (ReadOnlyMemTable* m : mems_) {
       ROCKS_LOG_INFO(db_options_.info_log,
                      "[%s] [JOB %d] Flushing memtable id %" PRIu64
-                     " with next log file: %" PRIu64 "\n",
+                     " with next log file: %" PRIu64 ", marked_for_flush: %d\n",
                      cfd_->GetName().c_str(), job_context_->job_id, m->GetID(),
-                     m->GetNextLogNumber());
+                     m->GetNextLogNumber(), m->IsMarkedForFlush());
       if (logical_strip_timestamp) {
         memtables.push_back(m->NewTimestampStrippingIterator(
             ro, /*seqno_to_time_mapping=*/nullptr, &arena,
diff --git a/db/memtable.cc b/db/memtable.cc
index 5f5450276b38..396d21404bba 100644
--- a/db/memtable.cc
+++ b/db/memtable.cc
@@ -175,6 +175,11 @@ size_t MemTable::ApproximateMemoryUsage() {
 }
 
 bool MemTable::ShouldFlushNow() {
+  if (IsMarkedForFlush()) {
+    // TODO: dedicated flush reason when marked for flush
+    return true;
+  }
+
   // This is set if memtable_max_range_deletions is > 0,
   // and that many range deletions are done
   if (memtable_max_range_deletions_ > 0 &&
diff --git a/db/memtable.h b/db/memtable.h
index 7032a3af449c..21532e4566ba 100644
--- a/db/memtable.h
+++ b/db/memtable.h
@@ -30,6 +30,7 @@
 #include "rocksdb/db.h"
 #include "rocksdb/memtablerep.h"
 #include "table/multiget_context.h"
+#include "util/atomic.h"
 #include "util/cast_util.h"
 #include "util/dynamic_bloom.h"
 #include "util/hash.h"
@@ -496,6 +497,10 @@ class ReadOnlyMemTable {
     return false;
   }
 
+  void MarkForFlush() { marked_for_flush_.StoreRelaxed(true); }
+
+  bool IsMarkedForFlush() const { return marked_for_flush_.LoadRelaxed(); }
+
  protected:
   friend class MemTableList;
 
@@ -524,6 +529,8 @@ class ReadOnlyMemTable {
 
   // Flush job info of the current memtable.
   std::unique_ptr<FlushJobInfo> flush_job_info_;
+
+  RelaxedAtomic<bool> marked_for_flush_{false};
 };
 
 class MemTable final : public ReadOnlyMemTable {
diff --git a/db_stress_tool/db_stress_common.h b/db_stress_tool/db_stress_common.h
index 274bb36d7a3d..1d8f979cf05c 100644
--- a/db_stress_tool/db_stress_common.h
+++ b/db_stress_tool/db_stress_common.h
@@ -422,6 +422,7 @@ DECLARE_uint32(commit_bypass_memtable_one_in);
 DECLARE_bool(track_and_verify_wals);
 DECLARE_bool(enable_remote_compaction);
 DECLARE_bool(auto_refresh_iterator_with_snapshot);
+DECLARE_uint32(memtable_op_scan_flush_trigger);
 
 constexpr long KB = 1024;
 constexpr int kRandomValueMaxFactor = 3;
diff --git a/db_stress_tool/db_stress_gflags.cc b/db_stress_tool/db_stress_gflags.cc
index a1a0bb15b829..32e1aad2262d 100644
--- a/db_stress_tool/db_stress_gflags.cc
+++ b/db_stress_tool/db_stress_gflags.cc
@@ -1469,4 +1469,9 @@ DEFINE_bool(
     ROCKSDB_NAMESPACE::ReadOptions().auto_refresh_iterator_with_snapshot,
     "ReadOptions.auto_refresh_iterator_with_snapshot");
 
+DEFINE_uint32(
+    memtable_op_scan_flush_trigger,
+    ROCKSDB_NAMESPACE::ColumnFamilyOptions().memtable_op_scan_flush_trigger,
+    "Sets CF option memtable_op_scan_flush_trigger.");
+
 #endif  // GFLAGS
diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc
index 13f4b0e3c585..beb0cd1aed68 100644
--- a/db_stress_tool/db_stress_test_base.cc
+++ b/db_stress_tool/db_stress_test_base.cc
@@ -4316,6 +4316,8 @@ void InitializeOptionsFromFlags(
   if (FLAGS_enable_remote_compaction) {
     options.compaction_service = std::make_shared<DbStressCompactionService>();
   }
+
+  options.memtable_op_scan_flush_trigger = FLAGS_memtable_op_scan_flush_trigger;
 }
 
 void InitializeOptionsGeneral(
diff --git a/include/rocksdb/advanced_options.h b/include/rocksdb/advanced_options.h
index 15268d457af9..d110d9cde0e2 100644
--- a/include/rocksdb/advanced_options.h
+++ b/include/rocksdb/advanced_options.h
@@ -1096,6 +1096,20 @@ struct AdvancedColumnFamilyOptions {
   // additional key comparison during memtable lookup.
   bool paranoid_memory_checks = false;
 
+  // When an iterator scans this number of invisible entries (tombstones or
+  // hidden puts) from the active memtable during a single iterator operation,
+  // we will attempt to flush the memtable. Currently only forward scans are
+  // supported (SeekToFirst(), Seek() and Next()).
+  // This option helps to reduce the overhead of scanning through a
+  // large number of entries in memtable.
+  // Users should consider enable deletion-triggered-compaction (see
+  // CompactOnDeletionCollectorFactory) together with this option to compact
+  // away tombstones after the memtable is flushed.
+  //
+  // Default: 0 (disabled)
+  // Dynamically changeable through the SetOptions() API.
+  uint32_t memtable_op_scan_flush_trigger = 0;
+
   // Create ColumnFamilyOptions with default values for all fields
   AdvancedColumnFamilyOptions();
   // Create ColumnFamilyOptions from Options
diff --git a/options/cf_options.cc b/options/cf_options.cc
index f8a2e044daa1..fa60053eaec2 100644
--- a/options/cf_options.cc
+++ b/options/cf_options.cc
@@ -694,7 +694,10 @@ static std::unordered_map<std::string, OptionTypeInfo>
          {offsetof(struct MutableCFOptions, memtable_max_range_deletions),
           OptionType::kUInt32T, OptionVerificationType::kNormal,
           OptionTypeFlags::kMutable}},
-
+        {"memtable_op_scan_flush_trigger",
+         {offsetof(struct MutableCFOptions, memtable_op_scan_flush_trigger),
+          OptionType::kUInt32T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
 };
 
 static std::unordered_map<std::string, OptionTypeInfo>
@@ -1176,6 +1179,8 @@ void MutableCFOptions::Dump(Logger* log) const {
                  bottommost_file_compaction_delay);
   ROCKS_LOG_INFO(log, "                   uncache_aggressiveness: %" PRIu32,
                  uncache_aggressiveness);
+  ROCKS_LOG_INFO(log, "             memtable_op_scan_flush_trigger: %" PRIu32,
+                 memtable_op_scan_flush_trigger);
 
   // Universal Compaction Options
   ROCKS_LOG_INFO(log, "compaction_options_universal.size_ratio : %d",
diff --git a/options/cf_options.h b/options/cf_options.h
index 13d5e1dbc84c..47d8fa7fb208 100644
--- a/options/cf_options.h
+++ b/options/cf_options.h
@@ -173,7 +173,8 @@ struct MutableCFOptions {
         memtable_max_range_deletions(options.memtable_max_range_deletions),
         bottommost_file_compaction_delay(
             options.bottommost_file_compaction_delay),
-        uncache_aggressiveness(options.uncache_aggressiveness) {
+        uncache_aggressiveness(options.uncache_aggressiveness),
+        memtable_op_scan_flush_trigger(options.memtable_op_scan_flush_trigger) {
     RefreshDerivedOptions(options.num_levels, options.compaction_style);
   }
 
@@ -228,7 +229,8 @@ struct MutableCFOptions {
         sample_for_compression(0),
         memtable_max_range_deletions(0),
         bottommost_file_compaction_delay(0),
-        uncache_aggressiveness(0) {}
+        uncache_aggressiveness(0),
+        memtable_op_scan_flush_trigger(0) {}
 
   explicit MutableCFOptions(const Options& options);
 
@@ -336,6 +338,7 @@ struct MutableCFOptions {
   uint32_t memtable_max_range_deletions;
   uint32_t bottommost_file_compaction_delay;
   uint32_t uncache_aggressiveness;
+  uint32_t memtable_op_scan_flush_trigger;
 
   // Derived options
   // Per-level target file size.
diff --git a/options/options.cc b/options/options.cc
index 0a9e30e67c76..85dbc51ea92f 100644
--- a/options/options.cc
+++ b/options/options.cc
@@ -111,7 +111,8 @@ AdvancedColumnFamilyOptions::AdvancedColumnFamilyOptions(const Options& options)
       blob_file_starting_level(options.blob_file_starting_level),
       blob_cache(options.blob_cache),
       prepopulate_blob_cache(options.prepopulate_blob_cache),
-      persist_user_defined_timestamps(options.persist_user_defined_timestamps) {
+      persist_user_defined_timestamps(options.persist_user_defined_timestamps),
+      memtable_op_scan_flush_trigger(options.memtable_op_scan_flush_trigger) {
   assert(memtable_factory.get() != nullptr);
   if (max_bytes_for_level_multiplier_additional.size() <
       static_cast<unsigned int>(num_levels)) {
@@ -283,6 +284,9 @@ void ColumnFamilyOptions::Dump(Logger* log) const {
   ROCKS_LOG_HEADER(log,
                    "      Options.max_sequential_skip_in_iterations: %" PRIu64,
                    max_sequential_skip_in_iterations);
+  ROCKS_LOG_HEADER(
+      log, "           Options.memtable_op_scan_flush_trigger: %" PRIu32,
+      memtable_op_scan_flush_trigger);
   ROCKS_LOG_HEADER(log,
                    "                   Options.max_compaction_bytes: %" PRIu64,
                    max_compaction_bytes);
diff --git a/options/options_helper.cc b/options/options_helper.cc
index e7ae9e70c837..89436141024d 100644
--- a/options/options_helper.cc
+++ b/options/options_helper.cc
@@ -299,6 +299,8 @@ void UpdateColumnFamilyOptions(const MutableCFOptions& moptions,
   cf_opts->default_write_temperature = moptions.default_write_temperature;
   cf_opts->memtable_max_range_deletions = moptions.memtable_max_range_deletions;
   cf_opts->uncache_aggressiveness = moptions.uncache_aggressiveness;
+  cf_opts->memtable_op_scan_flush_trigger =
+      moptions.memtable_op_scan_flush_trigger;
 }
 
 void UpdateColumnFamilyOptions(const ImmutableCFOptions& ioptions,
diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc
index 66627c428c40..d0daf2fa504a 100644
--- a/options/options_settable_test.cc
+++ b/options/options_settable_test.cc
@@ -674,7 +674,8 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) {
       "memtable_max_range_deletions=999999;"
       "bottommost_file_compaction_delay=7200;"
       "uncache_aggressiveness=1234;"
-      "paranoid_memory_checks=1;",
+      "paranoid_memory_checks=1;"
+      "memtable_op_scan_flush_trigger=123;",
       new_options));
 
   ASSERT_NE(new_options->blob_cache.get(), nullptr);
diff --git a/table/sst_file_reader.cc b/table/sst_file_reader.cc
index a970666affa5..d0a4e8de4598 100644
--- a/table/sst_file_reader.cc
+++ b/table/sst_file_reader.cc
@@ -166,11 +166,11 @@ Iterator* SstFileReader::NewIterator(const ReadOptions& roptions) {
                       ? roptions.snapshot->GetSequenceNumber()
                       : kMaxSequenceNumber;
   ArenaWrappedDBIter* res = new ArenaWrappedDBIter();
-  res->Init(
-      r->options.env, roptions, r->ioptions, r->moptions, nullptr /* version */,
-      sequence, r->moptions.max_sequential_skip_in_iterations,
-      0 /* version_number */, nullptr /* read_callback */, nullptr /* cfh */,
-      true /* expose_blob_index */, false /* allow_refresh */);
+  res->Init(r->options.env, roptions, r->ioptions, r->moptions,
+            nullptr /* version */, sequence, 0 /* version_number */,
+            nullptr /* read_callback */, nullptr /* cfh */,
+            true /* expose_blob_index */, false /* allow_refresh */,
+            /*active_mem=*/nullptr);
   auto internal_iter = r->table_reader->NewIterator(
       res->GetReadOptions(), r->moptions.prefix_extractor.get(),
       res->GetArena(), false /* skip_filters */,
diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc
index d152ea2f2b73..49a6ac07b07e 100644
--- a/tools/db_bench_tool.cc
+++ b/tools/db_bench_tool.cc
@@ -1279,6 +1279,11 @@ DEFINE_bool(explicit_snapshot, false,
             "When set to true iterators will be initialized with explicit "
             "snapshot");
 
+DEFINE_uint32(memtable_op_scan_flush_trigger,
+              ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions()
+                  .memtable_op_scan_flush_trigger,
+              "Setting for CF option memtable_op_scan_flush_trigger.");
+
 static enum ROCKSDB_NAMESPACE::CompressionType StringToCompressionType(
     const char* ctype) {
   assert(ctype);
@@ -4747,6 +4752,8 @@ class Benchmark {
     options.block_protection_bytes_per_key =
         FLAGS_block_protection_bytes_per_key;
     options.paranoid_memory_checks = FLAGS_paranoid_memory_checks;
+    options.memtable_op_scan_flush_trigger =
+        FLAGS_memtable_op_scan_flush_trigger;
   }
 
   void InitializeOptionsGeneral(Options* opts, ToolHooks& hooks) {
diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index 26152b22bee6..8597e26d1112 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -344,6 +344,7 @@
     "allow_unprepared_value": lambda: random.choice([0, 1]),
     "enable_remote_compaction": lambda: random.choice([0, 1]),
     "auto_refresh_iterator_with_snapshot": lambda: random.choice([0, 1]),
+    "memtable_op_scan_flush_trigger": lambda: random.choice([0, 10, 100, 1000]),
 }
 _TEST_DIR_ENV_VAR = "TEST_TMPDIR"
 # If TEST_TMPDIR_EXPECTED is not specified, default value will be TEST_TMPDIR
diff --git a/unreleased_history/new_features/tombstone_scan_flush_trigger.md b/unreleased_history/new_features/tombstone_scan_flush_trigger.md
new file mode 100644
index 000000000000..a44b2213ab90
--- /dev/null
+++ b/unreleased_history/new_features/tombstone_scan_flush_trigger.md
@@ -0,0 +1 @@
+* Add a new CF option `memtable_op_scan_flush_trigger` that triggers a flush of the memtable if an iterator's Seek()/Next() scans over a certain number of invisible entries from the memtable.

From 2a0ee4ddd841306f1cc4c6f79ca25f97012a6cf9 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Fri, 11 Apr 2025 10:08:29 -0700
Subject: [PATCH 051/500] Refactor wal related naming and more (#13490)

Summary:
* Clarify in API comments which `log_` options in DBOptions relate to WALs, info log, and/or manifest files.
* Rename a bunch of "log" things to "wal" for clarity, especially in DBImpl. (More to go, especially some more challenging cases like `DBImpl::logs_`, but a step in the right direction IMHO)
* Simplify DBImpl ctor by moving constant initializers to field definitions.
* Use RelaxedAtomic for (renamed) `wals_total_size_`

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13490

Test Plan: existing tests

Reviewed By: cbi42

Differential Revision: D71939382

Pulled By: pdillinger

fbshipit-source-id: 852f4737eca83e6ad653010cc197ad1b6e6bae13
---
 db/column_family_test.cc                      |   2 +-
 db/db_filesnapshot.cc                         |   8 +-
 db/db_impl/db_impl.cc                         |  93 ++-----
 db/db_impl/db_impl.h                          | 260 +++++++++---------
 db/db_impl/db_impl_compaction_flush.cc        |   8 +-
 db/db_impl/db_impl_debug.cc                   |   6 +-
 db/db_impl/db_impl_files.cc                   |  70 ++---
 db/db_impl/db_impl_open.cc                    |  36 +--
 db/db_impl/db_impl_write.cc                   | 260 +++++++++---------
 db/db_kv_checksum_test.cc                     |  16 +-
 db/db_test.cc                                 |   4 +-
 db/db_wal_test.cc                             |   4 +-
 db/db_write_test.cc                           |   2 +-
 db/job_context.h                              |  14 +-
 db/memtable.h                                 |   6 +-
 db/write_thread.h                             |   6 +-
 include/rocksdb/db.h                          |   6 +-
 include/rocksdb/options.h                     |  17 +-
 include/rocksdb/utilities/stackable_db.h      |   4 +-
 monitoring/stats_history_test.cc              |   2 +-
 .../transactions/pessimistic_transaction.cc   |   8 +-
 .../transactions/write_unprepared_txn.cc      |   2 +-
 22 files changed, 407 insertions(+), 427 deletions(-)

diff --git a/db/column_family_test.cc b/db/column_family_test.cc
index 1ec6cb81f277..3a2ca0617636 100644
--- a/db/column_family_test.cc
+++ b/db/column_family_test.cc
@@ -2175,7 +2175,7 @@ TEST_P(ColumnFamilyTest, FlushStaleColumnFamilies) {
   ASSERT_TRUE(has_cf2_sst);
 
   ASSERT_OK(Flush(0));
-  ASSERT_EQ(0, dbfull()->TEST_total_log_size());
+  ASSERT_EQ(0, dbfull()->TEST_wals_total_size());
   Close();
 }
 
diff --git a/db/db_filesnapshot.cc b/db/db_filesnapshot.cc
index 9b8f602c0310..d5244877503e 100644
--- a/db/db_filesnapshot.cc
+++ b/db/db_filesnapshot.cc
@@ -183,14 +183,14 @@ Status DBImpl::GetSortedWalFilesImpl(VectorWalPtr& files, bool need_seqnos) {
   return s;
 }
 
-Status DBImpl::GetCurrentWalFile(std::unique_ptr<WalFile>* current_log_file) {
+Status DBImpl::GetCurrentWalFile(std::unique_ptr<WalFile>* current_wal_file) {
   uint64_t current_logfile_number;
   {
     InstrumentedMutexLock l(&mutex_);
-    current_logfile_number = logfile_number_;
+    current_logfile_number = cur_wal_number_;
   }
 
-  return wal_manager_.GetLiveWalFile(current_logfile_number, current_log_file);
+  return wal_manager_.GetLiveWalFile(current_logfile_number, current_wal_file);
 }
 
 Status DBImpl::GetLiveFilesStorageInfo(
@@ -330,7 +330,7 @@ Status DBImpl::GetLiveFilesStorageInfo(
   const uint64_t options_size = versions_->options_file_size_;
   const uint64_t min_log_num = MinLogNumberToKeep();
   // Ensure consistency with manifest for track_and_verify_wals_in_manifest
-  const uint64_t max_log_num = logfile_number_;
+  const uint64_t max_log_num = cur_wal_number_;
 
   mutex_.Unlock();
 
diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc
index 35dc300b9f70..571e1e2675eb 100644
--- a/db/db_impl/db_impl.cc
+++ b/db/db_impl/db_impl.cc
@@ -169,7 +169,6 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname,
                bool read_only)
     : dbname_(dbname),
       own_info_log_(options.info_log == nullptr),
-      init_logger_creation_s_(),
       initial_db_options_(SanitizeOptions(dbname, options, read_only,
                                           &init_logger_creation_s_)),
       env_(initial_db_options_.env),
@@ -185,7 +184,6 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname,
       mutex_(stats_, immutable_db_options_.clock, DB_MUTEX_WAIT_MICROS,
              immutable_db_options_.use_adaptive_mutex),
 #endif  // COERCE_CONTEXT_SWITCH
-      default_cf_handle_(nullptr),
       error_handler_(this, immutable_db_options_, &mutex_),
       event_logger_(immutable_db_options_.info_log.get()),
       max_total_in_memory_state_(0),
@@ -194,45 +192,15 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname,
           file_options_, immutable_db_options_)),
       seq_per_batch_(seq_per_batch),
       batch_per_txn_(batch_per_txn),
-      next_job_id_(1),
-      shutting_down_(false),
-      reject_new_background_jobs_(false),
-      db_lock_(nullptr),
-      manual_compaction_paused_(false),
       bg_cv_(&mutex_),
-      logfile_number_(0),
-      log_dir_synced_(false),
-      log_empty_(true),
-      persist_stats_cf_handle_(nullptr),
-      log_sync_cv_(&log_write_mutex_),
-      total_log_size_(0),
-      is_snapshot_supported_(true),
+      wal_sync_cv_(&wal_write_mutex_),
       write_buffer_manager_(immutable_db_options_.write_buffer_manager.get()),
       write_thread_(immutable_db_options_),
       nonmem_write_thread_(immutable_db_options_),
       write_controller_(mutable_db_options_.delayed_write_rate),
-      last_batch_group_size_(0),
-      unscheduled_flushes_(0),
-      unscheduled_compactions_(0),
-      bg_bottom_compaction_scheduled_(0),
-      bg_compaction_scheduled_(0),
-      num_running_compactions_(0),
-      bg_flush_scheduled_(0),
-      num_running_flushes_(0),
-      bg_purge_scheduled_(0),
-      disable_delete_obsolete_files_(0),
-      pending_purge_obsolete_files_(0),
       delete_obsolete_files_last_run_(immutable_db_options_.clock->NowMicros()),
-      has_unpersisted_data_(false),
-      unable_to_release_oldest_log_(false),
-      num_running_ingest_file_(0),
       wal_manager_(immutable_db_options_, file_options_, io_tracer_,
                    seq_per_batch),
-      bg_work_paused_(0),
-      bg_compaction_paused_(0),
-      refitting_level_(false),
-      opened_successfully_(false),
-      periodic_task_scheduler_(),
       two_write_queues_(options.two_write_queues),
       manual_wal_flush_(options.manual_wal_flush),
       // last_sequencee_ is always maintained by the main queue that also writes
@@ -250,14 +218,11 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname,
       // requires a custom gc for compaction, we use that to set use_custom_gc_
       // as well.
       use_custom_gc_(seq_per_batch),
-      shutdown_initiated_(false),
       own_sfm_(options.sst_file_manager == nullptr),
-      closed_(false),
       atomic_flush_install_cv_(&mutex_),
       blob_callback_(immutable_db_options_.sst_file_manager.get(), &mutex_,
                      &error_handler_, &event_logger_,
-                     immutable_db_options_.listeners, dbname_),
-      lock_wal_count_(0) {
+                     immutable_db_options_.listeners, dbname_) {
   // !batch_per_trx_ implies seq_per_batch_ because it is only unset for
   // WriteUnprepared, which should use seq_per_batch_.
   assert(batch_per_txn_ || seq_per_batch_);
@@ -636,8 +601,8 @@ Status DBImpl::CloseHelper() {
     mutex_.Lock();
   }
   {
-    InstrumentedMutexLock lock(&log_write_mutex_);
-    for (auto l : logs_to_free_) {
+    InstrumentedMutexLock lock(&wal_write_mutex_);
+    for (auto l : wals_to_free_) {
       delete l;
     }
     for (auto& log : logs_) {
@@ -1180,11 +1145,11 @@ Status DBImpl::TablesRangeTombstoneSummary(ColumnFamilyHandle* column_family,
 
 void DBImpl::ScheduleBgLogWriterClose(JobContext* job_context) {
   mutex_.AssertHeld();
-  if (!job_context->logs_to_free.empty()) {
-    for (auto l : job_context->logs_to_free) {
+  if (!job_context->wals_to_free.empty()) {
+    for (auto l : job_context->wals_to_free) {
       AddToLogsToFreeQueue(l);
     }
-    job_context->logs_to_free.clear();
+    job_context->wals_to_free.clear();
   }
 }
 
@@ -1443,7 +1408,7 @@ Status DBImpl::SetDBOptions(
         WriteThread::Writer w;
         write_thread_.EnterUnbatched(&w, &mutex_);
         if (wal_other_option_changed ||
-            total_log_size_ > GetMaxTotalWalSize()) {
+            wals_total_size_.LoadRelaxed() > GetMaxTotalWalSize()) {
           Status purge_wal_status = SwitchWAL(&write_context);
           if (!purge_wal_status.ok()) {
             ROCKS_LOG_WARN(immutable_db_options_.info_log,
@@ -1507,8 +1472,8 @@ Status DBImpl::FlushWAL(const WriteOptions& write_options, bool sync) {
   if (manual_wal_flush_) {
     IOStatus io_s;
     {
-      // We need to lock log_write_mutex_ since logs_ might change concurrently
-      InstrumentedMutexLock wl(&log_write_mutex_);
+      // We need to lock wal_write_mutex_ since logs_ might change concurrently
+      InstrumentedMutexLock wl(&wal_write_mutex_);
       log::Writer* cur_log_writer = logs_.back().writer;
       io_s = cur_log_writer->WriteBuffer(write_options);
     }
@@ -1535,7 +1500,7 @@ Status DBImpl::FlushWAL(const WriteOptions& write_options, bool sync) {
 }
 
 bool DBImpl::WALBufferIsEmpty() {
-  InstrumentedMutexLock l(&log_write_mutex_);
+  InstrumentedMutexLock l(&wal_write_mutex_);
   log::Writer* cur_log_writer = logs_.back().writer;
   auto res = cur_log_writer->BufferIsEmpty();
   return res;
@@ -1543,7 +1508,7 @@ bool DBImpl::WALBufferIsEmpty() {
 
 Status DBImpl::GetOpenWalSizes(std::map<uint64_t, uint64_t>& number_to_size) {
   assert(number_to_size.empty());
-  InstrumentedMutexLock l(&log_write_mutex_);
+  InstrumentedMutexLock l(&wal_write_mutex_);
   for (auto& log : logs_) {
     auto* open_file = log.writer->file();
     if (open_file) {
@@ -1585,15 +1550,15 @@ IOStatus DBImpl::SyncWalImpl(bool include_current_wal,
   uint64_t up_to_number;
 
   {
-    InstrumentedMutexLock l(&log_write_mutex_);
+    InstrumentedMutexLock l(&wal_write_mutex_);
     assert(!logs_.empty());
 
-    maybe_active_number = logfile_number_;
+    maybe_active_number = cur_wal_number_;
     up_to_number =
         include_current_wal ? maybe_active_number : maybe_active_number - 1;
 
     while (logs_.front().number <= up_to_number && logs_.front().IsSyncing()) {
-      log_sync_cv_.Wait();
+      wal_sync_cv_.Wait();
     }
     // First check that logs are safe to sync in background.
     if (include_current_wal &&
@@ -1617,7 +1582,7 @@ IOStatus DBImpl::SyncWalImpl(bool include_current_wal,
       }
     }
 
-    need_wal_dir_sync = !log_dir_synced_;
+    need_wal_dir_sync = !wal_dir_synced_;
   }
 
   if (include_current_wal) {
@@ -1690,7 +1655,7 @@ IOStatus DBImpl::SyncWalImpl(bool include_current_wal,
                              /*arg=*/nullptr);
   }
   {
-    InstrumentedMutexLock l(&log_write_mutex_);
+    InstrumentedMutexLock l(&wal_write_mutex_);
     for (auto* wal : wals_internally_closed) {
       // We can only modify the state of log::Writer under the mutex
       bool was_closed = wal->PublishIfClosed();
@@ -1807,9 +1772,9 @@ Status DBImpl::UnlockWAL() {
 
 void DBImpl::MarkLogsSynced(uint64_t up_to, bool synced_dir,
                             VersionEdit* synced_wals) {
-  log_write_mutex_.AssertHeld();
-  if (synced_dir && logfile_number_ == up_to) {
-    log_dir_synced_ = true;
+  wal_write_mutex_.AssertHeld();
+  if (synced_dir && cur_wal_number_ == up_to) {
+    wal_dir_synced_ = true;
   }
   for (auto it = logs_.begin(); it != logs_.end() && it->number <= up_to;) {
     auto& wal = *it;
@@ -1831,7 +1796,7 @@ void DBImpl::MarkLogsSynced(uint64_t up_to, bool synced_dir,
           (immutable_db_options_.background_close_inactive_wals &&
            wal.GetPreSyncSize() == wal.writer->file()->GetFlushedSize())) {
         // Fully synced
-        logs_to_free_.push_back(wal.ReleaseWriter());
+        wals_to_free_.push_back(wal.ReleaseWriter());
         it = logs_.erase(it);
       } else {
         wal.FinishSync();
@@ -1844,17 +1809,17 @@ void DBImpl::MarkLogsSynced(uint64_t up_to, bool synced_dir,
       ++it;
     }
   }
-  log_sync_cv_.SignalAll();
+  wal_sync_cv_.SignalAll();
 }
 
 void DBImpl::MarkLogsNotSynced(uint64_t up_to) {
-  log_write_mutex_.AssertHeld();
+  wal_write_mutex_.AssertHeld();
   for (auto it = logs_.begin(); it != logs_.end() && it->number <= up_to;
        ++it) {
     auto& wal = *it;
     wal.FinishSync();
   }
-  log_sync_cv_.SignalAll();
+  wal_sync_cv_.SignalAll();
 }
 
 SequenceNumber DBImpl::GetLatestSequenceNumber() const {
@@ -1923,10 +1888,10 @@ void DBImpl::BackgroundCallPurge() {
   TEST_SYNC_POINT("DBImpl::BackgroundCallPurge:beforeMutexLock");
   mutex_.Lock();
 
-  while (!logs_to_free_queue_.empty()) {
-    assert(!logs_to_free_queue_.empty());
-    log::Writer* log_writer = *(logs_to_free_queue_.begin());
-    logs_to_free_queue_.pop_front();
+  while (!wals_to_free_queue_.empty()) {
+    assert(!wals_to_free_queue_.empty());
+    log::Writer* log_writer = *(wals_to_free_queue_.begin());
+    wals_to_free_queue_.pop_front();
     mutex_.Unlock();
     delete log_writer;
     mutex_.Lock();
@@ -3592,7 +3557,7 @@ Status DBImpl::CreateColumnFamilyImpl(const ReadOptions& read_options,
     edit.AddColumnFamily(column_family_name);
     uint32_t new_id = versions_->GetColumnFamilySet()->GetNextColumnFamilyID();
     edit.SetColumnFamily(new_id);
-    edit.SetLogNumber(logfile_number_);
+    edit.SetLogNumber(cur_wal_number_);
     edit.SetComparatorName(cf_options.comparator->Name());
     edit.SetPersistUserDefinedTimestamps(
         cf_options.persist_user_defined_timestamps);
diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h
index 57fa0a5e838b..5f617ac9c99e 100644
--- a/db/db_impl/db_impl.h
+++ b/db/db_impl/db_impl.h
@@ -173,10 +173,10 @@ struct DBOpenLogRecordReadReporter : public log::Reader::Reporter {
 
   void OldLogRecord(size_t bytes) override;
 
-  uint64_t GetCorruptedLogNumber() const { return corrupted_log_number_; }
+  uint64_t GetCorruptedLogNumber() const { return corrupted_wal_number_; }
 
  private:
-  uint64_t corrupted_log_number_ = kMaxSequenceNumber;
+  uint64_t corrupted_wal_number_ = kMaxSequenceNumber;
 };
 
 // While DB is the public interface of RocksDB, and DBImpl is the actual
@@ -535,11 +535,11 @@ class DBImpl : public DB {
 
   // Get the known flushed sizes of WALs that might still be written to
   // or have pending sync.
-  // NOTE: unlike alive_log_files_, this function includes WALs that might
+  // NOTE: unlike alive_wal_files_, this function includes WALs that might
   // be obsolete (but not obsolete to a pending Checkpoint) and not yet fully
   // synced.
   Status GetOpenWalSizes(std::map<uint64_t, uint64_t>& number_to_size);
-  Status GetCurrentWalFile(std::unique_ptr<WalFile>* current_log_file) override;
+  Status GetCurrentWalFile(std::unique_ptr<WalFile>* current_wal_file) override;
   Status GetCreationTimeOfOldestFile(uint64_t* creation_time) override;
 
   Status GetUpdatesSince(
@@ -1073,7 +1073,7 @@ class DBImpl : public DB {
 
   void AddToLogsToFreeQueue(log::Writer* log_writer) {
     mutex_.AssertHeld();
-    logs_to_free_queue_.push_back(log_writer);
+    wals_to_free_queue_.push_back(log_writer);
   }
 
   void AddSuperVersionsToFreeQueue(SuperVersion* sv) {
@@ -1138,7 +1138,7 @@ class DBImpl : public DB {
   bool TEST_UnableToReleaseOldestLog() { return unable_to_release_oldest_log_; }
 
   bool TEST_IsLogGettingFlushed() {
-    return alive_log_files_.begin()->getting_flushed;
+    return alive_wal_files_.begin()->getting_flushed;
   }
 
   Status TEST_SwitchMemtable(ColumnFamilyData* cfd = nullptr);
@@ -1218,7 +1218,9 @@ class DBImpl : public DB {
 
   uint64_t TEST_LogfileNumber();
 
-  uint64_t TEST_total_log_size() const { return total_log_size_; }
+  uint64_t TEST_wals_total_size() const {
+    return wals_total_size_.LoadRelaxed();
+  }
 
   void TEST_GetAllBlockCaches(std::unordered_set<const Cache*>* cache_set);
 
@@ -1376,16 +1378,16 @@ class DBImpl : public DB {
 
   // State below is protected by mutex_
   // With two_write_queues enabled, some of the variables that accessed during
-  // WriteToWAL need different synchronization: log_empty_, alive_log_files_,
-  // logs_, logfile_number_. Refer to the definition of each variable below for
+  // WriteToWAL need different synchronization: wal_empty_, alive_wal_files_,
+  // logs_, cur_wal_number_. Refer to the definition of each variable below for
   // more description.
   //
   // `mutex_` can be a hot lock in some workloads, so it deserves dedicated
   // cachelines.
   mutable CacheAlignedInstrumentedMutex mutex_;
 
-  ColumnFamilyHandleImpl* default_cf_handle_;
-  InternalStats* default_cf_internal_stats_;
+  ColumnFamilyHandleImpl* default_cf_handle_ = nullptr;
+  InternalStats* default_cf_internal_stats_ = nullptr;
 
   // table_cache_ provides its own synchronization
   std::shared_ptr<Cache> table_cache_;
@@ -1397,7 +1399,7 @@ class DBImpl : public DB {
 
   // only used for dynamically adjusting max_total_wal_size. it is a sum of
   // [write_buffer_size * max_write_buffer_number] over all column families
-  std::atomic<uint64_t> max_total_in_memory_state_;
+  std::atomic<uint64_t> max_total_in_memory_state_ = 0;
 
   // The options to access storage files
   const FileOptions file_options_;
@@ -1424,14 +1426,14 @@ class DBImpl : public DB {
 
   // Each flush or compaction gets its own job id. this counter makes sure
   // they're unique
-  std::atomic<int> next_job_id_;
+  std::atomic<int> next_job_id_ = 1;
 
-  std::atomic<bool> shutting_down_;
+  std::atomic<bool> shutting_down_ = false;
 
   // No new background jobs can be queued if true. This is used to prevent new
   // background jobs from being queued after WaitForCompact() completes waiting
   // all background jobs then attempts to close when close_db_ option is true.
-  bool reject_new_background_jobs_;
+  bool reject_new_background_jobs_ = false;
 
   // RecoveryContext struct stores the context about version edits along
   // with corresponding column_family_data and column_family_options.
@@ -1563,7 +1565,7 @@ class DBImpl : public DB {
   Status WriteImpl(const WriteOptions& options, WriteBatch* updates,
                    WriteCallback* callback = nullptr,
                    UserWriteCallback* user_write_cb = nullptr,
-                   uint64_t* log_used = nullptr, uint64_t log_ref = 0,
+                   uint64_t* wal_used = nullptr, uint64_t log_ref = 0,
                    bool disable_memtable = false, uint64_t* seq_used = nullptr,
                    size_t batch_cnt = 0,
                    PreReleaseCallback* pre_release_callback = nullptr,
@@ -1574,7 +1576,7 @@ class DBImpl : public DB {
   Status PipelinedWriteImpl(const WriteOptions& options, WriteBatch* updates,
                             WriteCallback* callback = nullptr,
                             UserWriteCallback* user_write_cb = nullptr,
-                            uint64_t* log_used = nullptr, uint64_t log_ref = 0,
+                            uint64_t* wal_used = nullptr, uint64_t log_ref = 0,
                             bool disable_memtable = false,
                             uint64_t* seq_used = nullptr);
 
@@ -1601,7 +1603,7 @@ class DBImpl : public DB {
   Status WriteImplWALOnly(
       WriteThread* write_thread, const WriteOptions& options,
       WriteBatch* updates, WriteCallback* callback,
-      UserWriteCallback* user_write_cb, uint64_t* log_used,
+      UserWriteCallback* user_write_cb, uint64_t* wal_used,
       const uint64_t log_ref, uint64_t* seq_used, const size_t sub_batch_cnt,
       PreReleaseCallback* pre_release_callback, const AssignOrder assign_order,
       const PublishLastSeq publish_last_seq, const bool disable_memtable);
@@ -1762,9 +1764,9 @@ class DBImpl : public DB {
     }
   };
 
-  struct LogFileNumberSize {
-    explicit LogFileNumberSize(uint64_t _number) : number(_number) {}
-    LogFileNumberSize() {}
+  struct WalFileNumberSize {
+    explicit WalFileNumberSize(uint64_t _number) : number(_number) {}
+    WalFileNumberSize() {}
     void AddSize(uint64_t new_size) { size += new_size; }
     uint64_t number;
     uint64_t size = 0;
@@ -1848,13 +1850,13 @@ class DBImpl : public DB {
     uint64_t attempt_truncate_size = SIZE_MAX;
   };
 
-  struct LogContext {
-    explicit LogContext(bool need_sync = false)
-        : need_log_sync(need_sync), need_log_dir_sync(need_sync) {}
-    bool need_log_sync = false;
-    bool need_log_dir_sync = false;
+  struct WalContext {
+    explicit WalContext(bool need_sync = false)
+        : need_wal_sync(need_sync), need_wal_dir_sync(need_sync) {}
+    bool need_wal_sync = false;
+    bool need_wal_dir_sync = false;
     log::Writer* writer = nullptr;
-    LogFileNumberSize* log_file_number_size = nullptr;
+    WalFileNumberSize* wal_file_number_size = nullptr;
     uint64_t prev_size = SIZE_MAX;
   };
 
@@ -2063,10 +2065,10 @@ class DBImpl : public DB {
       JobContext* job_context, LogBuffer* log_buffer, Env::Priority thread_pri);
 
   // REQUIRES: log_numbers are sorted in ascending order
-  // corrupted_log_found is set to true if we recover from a corrupted log file.
+  // corrupted_wal_found is set to true if we recover from a corrupted log file.
   Status RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
                          SequenceNumber* next_sequence, bool read_only,
-                         bool is_retry, bool* corrupted_log_found,
+                         bool is_retry, bool* corrupted_wal_found,
                          RecoveryContext* recovery_ctx);
 
   void SetupLogFilesRecovery(
@@ -2174,12 +2176,12 @@ class DBImpl : public DB {
   // log file to its actual size, thereby freeing preallocated space.
   // Return success even if truncate fails
   Status GetLogSizeAndMaybeTruncate(uint64_t wal_number, bool truncate,
-                                    LogFileNumberSize* log);
+                                    WalFileNumberSize* log);
 
-  // Restore alive_log_files_ and total_log_size_ after recovery.
+  // Restore alive_wal_files_ and wals_total_size_ after recovery.
   // It needs to run only when there's no flush during recovery
   // (e.g. avoid_flush_during_recovery=true). May also trigger flush
-  // in case total_log_size > max_total_wal_size.
+  // in case wals_total_size > max_total_wal_size.
   Status RestoreAliveLogFiles(const std::vector<uint64_t>& log_numbers);
 
   // num_bytes: for slowdown case, delay time is calculated based on
@@ -2328,7 +2330,7 @@ class DBImpl : public DB {
 
   // REQUIRES: mutex locked
   Status PreprocessWrite(const WriteOptions& write_options,
-                         LogContext* log_context, WriteContext* write_context);
+                         WalContext* log_context, WriteContext* write_context);
 
   // Merge write batches in the write group into merged_batch.
   // Returns OK if merge is successful.
@@ -2339,20 +2341,21 @@ class DBImpl : public DB {
 
   IOStatus WriteToWAL(const WriteBatch& merged_batch,
                       const WriteOptions& write_options,
-                      log::Writer* log_writer, uint64_t* log_used,
+                      log::Writer* log_writer, uint64_t* wal_used,
                       uint64_t* log_size,
-                      LogFileNumberSize& log_file_number_size,
+                      WalFileNumberSize& wal_file_number_size,
                       SequenceNumber sequence);
 
-  IOStatus WriteToWAL(const WriteThread::WriteGroup& write_group,
-                      log::Writer* log_writer, uint64_t* log_used,
-                      bool need_log_sync, bool need_log_dir_sync,
-                      SequenceNumber sequence,
-                      LogFileNumberSize& log_file_number_size);
+  IOStatus WriteGroupToWAL(const WriteThread::WriteGroup& write_group,
+                           log::Writer* log_writer, uint64_t* wal_used,
+                           bool need_wal_sync, bool need_wal_dir_sync,
+                           SequenceNumber sequence,
+                           WalFileNumberSize& wal_file_number_size);
 
-  IOStatus ConcurrentWriteToWAL(const WriteThread::WriteGroup& write_group,
-                                uint64_t* log_used,
-                                SequenceNumber* last_sequence, size_t seq_inc);
+  IOStatus ConcurrentWriteGroupToWAL(const WriteThread::WriteGroup& write_group,
+                                     uint64_t* wal_used,
+                                     SequenceNumber* last_sequence,
+                                     size_t seq_inc);
 
   // Used by WriteImpl to update bg_error_ if paranoid check is enabled.
   // Caller must hold mutex_.
@@ -2719,7 +2722,7 @@ class DBImpl : public DB {
       ErrorIteratorFuncType error_iterator_func);
 
   // Lock over the persistent DB state.  Non-nullptr iff successfully acquired.
-  FileLock* db_lock_;
+  FileLock* db_lock_ = nullptr;
 
   // Guards changes to DB and CF options to ensure consistency between
   // * In-memory options objects
@@ -2733,20 +2736,20 @@ class DBImpl : public DB {
   // Guards reads and writes to in-memory stats_history_.
   InstrumentedMutex stats_history_mutex_;
 
-  // In addition to mutex_, log_write_mutex_ protects writes to logs_ and
-  // logfile_number_. With two_write_queues it also protects alive_log_files_,
-  // and log_empty_. Refer to the definition of each variable below for more
+  // In addition to mutex_, wal_write_mutex_ protects writes to logs_ and
+  // cur_wal_number_. With two_write_queues it also protects alive_wal_files_,
+  // and wal_empty_. Refer to the definition of each variable below for more
   // details.
-  // Note: to avoid deadlock, if needed to acquire both log_write_mutex_ and
-  // mutex_, the order should be first mutex_ and then log_write_mutex_.
-  InstrumentedMutex log_write_mutex_;
+  // Note: to avoid deadlock, if needed to acquire both wal_write_mutex_ and
+  // mutex_, the order should be first mutex_ and then wal_write_mutex_.
+  InstrumentedMutex wal_write_mutex_;
 
   // If zero, manual compactions are allowed to proceed. If non-zero, manual
   // compactions may still be running, but will quickly fail with
   // `Status::Incomplete`. The value indicates how many threads have paused
   // manual compactions. It is accessed in read mode outside the DB mutex in
   // compaction code paths.
-  std::atomic<int> manual_compaction_paused_;
+  std::atomic<int> manual_compaction_paused_ = false;
 
   // This condition variable is signaled on these conditions:
   // * whenever bg_compaction_scheduled_ goes down to 0
@@ -2762,106 +2765,114 @@ class DBImpl : public DB {
   // * whenever SetOptions successfully updates options.
   // * whenever a column family is dropped.
   InstrumentedCondVar bg_cv_;
-  // Writes are protected by locking both mutex_ and log_write_mutex_, and reads
-  // must be under either mutex_ or log_write_mutex_. Since after ::Open,
-  // logfile_number_ is currently updated only in write_thread_, it can be read
+
+  ColumnFamilyHandleImpl* persist_stats_cf_handle_ = nullptr;
+
+  bool persistent_stats_cfd_exists_ = true;
+
+  // Writes are protected by locking both mutex_ and wal_write_mutex_, and reads
+  // must be under either mutex_ or wal_write_mutex_. Since after ::Open,
+  // cur_wal_number_ is currently updated only in write_thread_, it can be read
   // from the same write_thread_ without any locks.
-  uint64_t logfile_number_;
+  uint64_t cur_wal_number_ = 0;
+
   // Log files that we can recycle. Must be protected by db mutex_.
-  std::deque<uint64_t> log_recycle_files_;
+  std::deque<uint64_t> wal_recycle_files_;
+
   // The minimum log file number taht can be recycled, if log recycling is
   // enabled. This is used to ensure that log files created by previous
   // instances of the database are not recycled, as we cannot be sure they
   // were created in the recyclable format.
-  uint64_t min_log_number_to_recycle_;
-  // Protected by log_write_mutex_.
-  bool log_dir_synced_;
-  // Without two_write_queues, read and writes to log_empty_ are protected by
+  uint64_t min_wal_number_to_recycle_ = 0;
+
+  // Protected by wal_write_mutex_.
+  bool wal_dir_synced_ = false;
+
+  // Without two_write_queues, read and writes to wal_empty_ are protected by
   // mutex_. Since it is currently updated/read only in write_thread_, it can be
   // accessed from the same write_thread_ without any locks. With
   // two_write_queues writes, where it can be updated in different threads,
-  // read and writes are protected by log_write_mutex_ instead. This is to avoid
-  // expensive mutex_ lock during WAL write, which update log_empty_.
-  bool log_empty_;
-
-  ColumnFamilyHandleImpl* persist_stats_cf_handle_;
-
-  bool persistent_stats_cfd_exists_ = true;
+  // read and writes are protected by wal_write_mutex_ instead. This is to avoid
+  // expensive mutex_ lock during WAL write, which update wal_empty_.
+  bool wal_empty_ = true;
 
   // The current WAL file and those that have not been found obsolete from
   // memtable flushes. A WAL not on this list might still be pending writer
-  // flush and/or sync and close and might still be in logs_. alive_log_files_
-  // is protected by mutex_ and log_write_mutex_ with details as follows:
+  // flush and/or sync and close and might still be in logs_. alive_wal_files_
+  // is protected by mutex_ and wal_write_mutex_ with details as follows:
   // 1. read by FindObsoleteFiles() which can be called in either application
-  //    thread or RocksDB bg threads, both mutex_ and log_write_mutex_ are
+  //    thread or RocksDB bg threads, both mutex_ and wal_write_mutex_ are
   //    held.
-  // 2. pop_front() by FindObsoleteFiles(), both mutex_ and log_write_mutex_
+  // 2. pop_front() by FindObsoleteFiles(), both mutex_ and wal_write_mutex_
   //    are held.
   // 3. push_back() by DBImpl::Open() and DBImpl::RestoreAliveLogFiles()
   //    (actually called by Open()), only mutex_ is held because at this point,
   //    the DB::Open() call has not returned success to application, and the
   //    only other thread(s) that can conflict are bg threads calling
-  //    FindObsoleteFiles() which ensure that both mutex_ and log_write_mutex_
-  //    are held when accessing alive_log_files_.
+  //    FindObsoleteFiles() which ensure that both mutex_ and wal_write_mutex_
+  //    are held when accessing alive_wal_files_.
   // 4. read by DBImpl::Open() is protected by mutex_.
-  // 5. push_back() by SwitchMemtable(). Both mutex_ and log_write_mutex_ are
+  // 5. push_back() by SwitchMemtable(). Both mutex_ and wal_write_mutex_ are
   //    held. This is done by the write group leader. Note that in the case of
   //    two-write-queues, another WAL-only write thread can be writing to the
   //    WAL concurrently. See 9.
-  // 6. read by SwitchWAL() with both mutex_ and log_write_mutex_ held. This is
+  // 6. read by SwitchWAL() with both mutex_ and wal_write_mutex_ held. This is
   //    done by write group leader.
   // 7. read by ConcurrentWriteToWAL() by the write group leader in the case of
-  //    two-write-queues. Only log_write_mutex_ is held to protect concurrent
+  //    two-write-queues. Only wal_write_mutex_ is held to protect concurrent
   //    pop_front() by FindObsoleteFiles().
-  // 8. read by PreprocessWrite() by the write group leader. log_write_mutex_
+  // 8. read by PreprocessWrite() by the write group leader. wal_write_mutex_
   //    is held to protect the data structure from concurrent pop_front() by
   //    FindObsoleteFiles().
   // 9. read by ConcurrentWriteToWAL() by a WAL-only write thread in the case
-  //    of two-write-queues. Only log_write_mutex_ is held. This suffices to
+  //    of two-write-queues. Only wal_write_mutex_ is held. This suffices to
   //    protect the data structure from concurrent push_back() by current
   //    write group leader as well as pop_front() by FindObsoleteFiles().
-  std::deque<LogFileNumberSize> alive_log_files_;
+  std::deque<WalFileNumberSize> alive_wal_files_;
+
+  // Total size of all "alive" WALs (for easy access without synchronization)
+  RelaxedAtomic<uint64_t> wals_total_size_{0};
 
   // Log files that aren't fully synced, and the current log file.
   // Synchronization:
   // 1. read by FindObsoleteFiles() which can be called either in application
-  //    thread or RocksDB bg threads. log_write_mutex_ is always held, while
+  //    thread or RocksDB bg threads. wal_write_mutex_ is always held, while
   //    some reads are performed without mutex_.
-  // 2. pop_front() by FindObsoleteFiles() with only log_write_mutex_ held.
-  // 3. read by DBImpl::Open() with both mutex_ and log_write_mutex_.
-  // 4. emplace_back() by DBImpl::Open() with both mutex_ and log_write_mutex.
+  // 2. pop_front() by FindObsoleteFiles() with only wal_write_mutex_ held.
+  // 3. read by DBImpl::Open() with both mutex_ and wal_write_mutex_.
+  // 4. emplace_back() by DBImpl::Open() with both mutex_ and wal_write_mutex.
   //    Note that at this point, DB::Open() has not returned success to
   //    application, thus the only other thread(s) that can conflict are bg
   //    threads calling FindObsoleteFiles(). See 1.
-  // 5. iteration and clear() from CloseHelper() always hold log_write_mutex
+  // 5. iteration and clear() from CloseHelper() always hold wal_write_mutex
   //    and mutex_.
   // 6. back() called by APIs FlushWAL() and LockWAL() are protected by only
-  //    log_write_mutex_. These two can be called by application threads after
+  //    wal_write_mutex_. These two can be called by application threads after
   //    DB::Open() returns success to applications.
-  // 7. read by SyncWAL(), another API, protected by only log_write_mutex_.
+  // 7. read by SyncWAL(), another API, protected by only wal_write_mutex_.
   // 8. read by MarkLogsNotSynced() and MarkLogsSynced() are protected by
-  //    log_write_mutex_.
-  // 9. erase() by MarkLogsSynced() protected by log_write_mutex_.
-  // 10. read by SyncClosedWals() protected by only log_write_mutex_. This can
+  //    wal_write_mutex_.
+  // 9. erase() by MarkLogsSynced() protected by wal_write_mutex_.
+  // 10. read by SyncClosedWals() protected by only wal_write_mutex_. This can
   //     happen in bg flush threads after DB::Open() returns success to
   //     applications.
   // 11. reads, e.g. front(), iteration, and back() called by PreprocessWrite()
-  //     holds only the log_write_mutex_. This is done by the write group
+  //     holds only the wal_write_mutex_. This is done by the write group
   //     leader. A bg thread calling FindObsoleteFiles() or MarkLogsSynced()
-  //     can happen concurrently. This is fine because log_write_mutex_ is used
+  //     can happen concurrently. This is fine because wal_write_mutex_ is used
   //     by all parties. See 2, 5, 9.
   // 12. reads, empty(), back() called by SwitchMemtable() hold both mutex_ and
-  //     log_write_mutex_. This happens in the write group leader.
+  //     wal_write_mutex_. This happens in the write group leader.
   // 13. emplace_back() by SwitchMemtable() hold both mutex_ and
-  //     log_write_mutex_. This happens in the write group leader. Can conflict
+  //     wal_write_mutex_. This happens in the write group leader. Can conflict
   //     with bg threads calling FindObsoleteFiles(), MarkLogsSynced(),
   //     SyncClosedWals(), etc. as well as application threads calling
   //     FlushWAL(), SyncWAL(), LockWAL(). This is fine because all parties
-  //     require at least log_write_mutex_.
+  //     require at least wal_write_mutex_.
   // 14. iteration called in WriteToWAL(write_group) protected by
-  //     log_write_mutex_. This is done by write group leader when
+  //     wal_write_mutex_. This is done by write group leader when
   //     two-write-queues is disabled and write needs to sync logs.
-  // 15. back() called in ConcurrentWriteToWAL() protected by log_write_mutex_.
+  // 15. back() called in ConcurrentWriteToWAL() protected by wal_write_mutex_.
   //     This can be done by the write group leader if two-write-queues is
   //     enabled. It can also be done by another WAL-only write thread.
   //
@@ -2878,23 +2889,22 @@ class DBImpl : public DB {
   std::deque<LogWriterNumber> logs_;
 
   // Signaled when getting_synced becomes false for some of the logs_.
-  InstrumentedCondVar log_sync_cv_;
+  InstrumentedCondVar wal_sync_cv_;
   // This is the app-level state that is written to the WAL but will be used
   // only during recovery. Using this feature enables not writing the state to
   // memtable on normal writes and hence improving the throughput. Each new
   // write of the state will replace the previous state entirely even if the
   // keys in the two consecutive states do not overlap.
-  // It is protected by log_write_mutex_ when two_write_queues_ is enabled.
+  // It is protected by wal_write_mutex_ when two_write_queues_ is enabled.
   // Otherwise only the heaad of write_thread_ can access it.
   WriteBatch cached_recoverable_state_;
   std::atomic<bool> cached_recoverable_state_empty_ = {true};
-  std::atomic<uint64_t> total_log_size_;
 
   // If this is non-empty, we need to delete these log files in background
-  // threads. Protected by log_write_mutex_.
-  autovector<log::Writer*> logs_to_free_;
+  // threads. Protected by wal_write_mutex_.
+  autovector<log::Writer*> wals_to_free_;
 
-  bool is_snapshot_supported_;
+  bool is_snapshot_supported_ = true;
 
   std::map<uint64_t, std::map<std::string, uint64_t>> stats_history_;
 
@@ -2918,7 +2928,7 @@ class DBImpl : public DB {
   // sleep if it uses up the quota.
   // Note: This is to protect memtable and compaction. If the batch only writes
   // to the WAL its size need not to be included in this.
-  uint64_t last_batch_group_size_;
+  uint64_t last_batch_group_size_ = 0;
 
   FlushScheduler flush_scheduler_;
 
@@ -2977,32 +2987,32 @@ class DBImpl : public DB {
   std::unordered_set<uint64_t> files_grabbed_for_purge_;
 
   // A queue to store log writers to close. Protected by db mutex_.
-  std::deque<log::Writer*> logs_to_free_queue_;
+  std::deque<log::Writer*> wals_to_free_queue_;
 
   std::deque<SuperVersion*> superversions_to_free_queue_;
 
-  int unscheduled_flushes_;
+  int unscheduled_flushes_ = 0;
 
-  int unscheduled_compactions_;
+  int unscheduled_compactions_ = 0;
 
   // count how many background compactions are running or have been scheduled in
   // the BOTTOM pool
-  int bg_bottom_compaction_scheduled_;
+  int bg_bottom_compaction_scheduled_ = 0;
 
   // count how many background compactions are running or have been scheduled
-  int bg_compaction_scheduled_;
+  int bg_compaction_scheduled_ = 0;
 
   // stores the number of compactions are currently running
-  int num_running_compactions_;
+  int num_running_compactions_ = 0;
 
   // number of background memtable flush jobs, submitted to the HIGH pool
-  int bg_flush_scheduled_;
+  int bg_flush_scheduled_ = 0;
 
   // stores the number of flushes are currently running
-  int num_running_flushes_;
+  int num_running_flushes_ = 0;
 
   // number of background obsolete file purge jobs, submitted to the HIGH pool
-  int bg_purge_scheduled_;
+  int bg_purge_scheduled_ = 0;
 
   std::deque<ManualCompactionState*> manual_compaction_dequeue_;
 
@@ -3012,11 +3022,11 @@ class DBImpl : public DB {
   // This enables two different threads to call
   // EnableFileDeletions() and DisableFileDeletions()
   // without any synchronization
-  int disable_delete_obsolete_files_;
+  int disable_delete_obsolete_files_ = 0;
 
   // Number of times FindObsoleteFiles has found deletable files and the
   // corresponding call to PurgeObsoleteFiles has not yet finished.
-  int pending_purge_obsolete_files_;
+  int pending_purge_obsolete_files_ = 0;
 
   // last time when DeleteObsoleteFiles with full scan was executed. Originally
   // initialized with startup time.
@@ -3028,12 +3038,12 @@ class DBImpl : public DB {
   // The mutex used by switch_cv_. mutex_ should be acquired beforehand.
   std::mutex switch_mutex_;
   // Number of threads intending to write to memtable
-  std::atomic<size_t> pending_memtable_writes_ = {};
+  std::atomic<size_t> pending_memtable_writes_{0};
 
   // A flag indicating whether the current rocksdb database has any
   // data that is not yet persisted into either WAL or SST file.
   // Used when disableWAL is true.
-  std::atomic<bool> has_unpersisted_data_;
+  std::atomic<bool> has_unpersisted_data_{false};
 
   // if an attempt was made to flush all column families that
   // the oldest log depends on but uncommitted data in the oldest
@@ -3041,26 +3051,26 @@ class DBImpl : public DB {
   // We must attempt to free the dependent memtables again
   // at a later time after the transaction in the oldest
   // log is fully commited.
-  bool unable_to_release_oldest_log_;
+  bool unable_to_release_oldest_log_{false};
 
   // Number of running IngestExternalFile() or CreateColumnFamilyWithImport()
   // calls.
   // REQUIRES: mutex held
-  int num_running_ingest_file_;
+  int num_running_ingest_file_ = 0;
 
   WalManager wal_manager_;
 
   // A value of > 0 temporarily disables scheduling of background work
-  int bg_work_paused_;
+  int bg_work_paused_ = 0;
 
   // A value of > 0 temporarily disables scheduling of background compaction
-  int bg_compaction_paused_;
+  int bg_compaction_paused_ = 0;
 
   // Guard against multiple concurrent refitting
-  bool refitting_level_;
+  bool refitting_level_ = false;
 
   // Indicate DB was opened successfully
-  bool opened_successfully_;
+  bool opened_successfully_ = false;
 
   // The min threshold to triggere bottommost compaction for removing
   // garbages, among all column families.
@@ -3106,13 +3116,13 @@ class DBImpl : public DB {
   // error recovery from going on in parallel. The latter, shutting_down_,
   // is set a little later during the shutdown after scheduling memtable
   // flushes
-  std::atomic<bool> shutdown_initiated_;
+  std::atomic<bool> shutdown_initiated_{false};
   // Flag to indicate whether sst_file_manager object was allocated in
   // DB::Open() or passed to us
   bool own_sfm_;
 
   // Flag to check whether Close() has been called on this DB
-  bool closed_;
+  bool closed_ = false;
   // save the closing status, for re-calling the close()
   Status closing_status_;
   // mutex for DB::Close()
@@ -3148,7 +3158,7 @@ class DBImpl : public DB {
 
   // The number of LockWAL called without matching UnlockWAL call.
   // See also lock_wal_write_token_
-  uint32_t lock_wal_count_;
+  uint32_t lock_wal_count_ = 0;
 };
 
 class GetWithTimestampReadCallback : public ReadCallback {
diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc
index d0b1074b3b25..9ae28aa8dba0 100644
--- a/db/db_impl/db_impl_compaction_flush.cc
+++ b/db/db_impl/db_impl_compaction_flush.cc
@@ -168,7 +168,7 @@ Status DBImpl::FlushMemTableToOutputFile(
   // had not been committed yet. Make sure we sync them to keep the persisted
   // WAL state at least as new as the persisted SST state.
   const bool needs_to_sync_closed_wals =
-      logfile_number_ > 0 &&
+      cur_wal_number_ > 0 &&
       (versions_->GetColumnFamilySet()->NumberOfColumnFamilies() > 1 ||
        allow_2pc());
 
@@ -224,7 +224,7 @@ Status DBImpl::FlushMemTableToOutputFile(
   bool need_cancel = false;
   IOStatus log_io_s = IOStatus::OK();
   if (needs_to_sync_closed_wals) {
-    // SyncClosedWals() may unlock and re-lock the log_write_mutex multiple
+    // SyncClosedWals() may unlock and re-lock the wal_write_mutex multiple
     // times.
     VersionEdit synced_wals;
     bool error_recovery_in_prog = error_handler_.IsRecoveryInProgress();
@@ -512,7 +512,7 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles(
                        job_context->job_id, flush_reason);
   }
 
-  if (logfile_number_ > 0) {
+  if (cur_wal_number_ > 0) {
     // TODO (yanqin) investigate whether we should sync the closed logs for
     // single column family case.
     VersionEdit synced_wals;
@@ -528,7 +528,7 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles(
 
     if (!log_io_s.ok() && !log_io_s.IsShutdownInProgress() &&
         !log_io_s.IsColumnFamilyDropped()) {
-      if (total_log_size_ > 0) {
+      if (wals_total_size_.LoadRelaxed() > 0) {
         error_handler_.SetBGError(log_io_s, BackgroundErrorReason::kFlush);
       } else {
         // If the WAL is empty, we use different error reason
diff --git a/db/db_impl/db_impl_debug.cc b/db/db_impl/db_impl_debug.cc
index 3f29d06341d7..ee48b0798673 100644
--- a/db/db_impl/db_impl_debug.cc
+++ b/db/db_impl/db_impl_debug.cc
@@ -224,13 +224,13 @@ void DBImpl::TEST_EndWrite(void* w) {
 }
 
 size_t DBImpl::TEST_LogsToFreeSize() {
-  InstrumentedMutexLock l(&log_write_mutex_);
-  return logs_to_free_.size();
+  InstrumentedMutexLock l(&wal_write_mutex_);
+  return wals_to_free_.size();
 }
 
 uint64_t DBImpl::TEST_LogfileNumber() {
   InstrumentedMutexLock l(&mutex_);
-  return logfile_number_;
+  return cur_wal_number_;
 }
 
 void DBImpl::TEST_GetAllBlockCaches(
diff --git a/db/db_impl/db_impl_files.cc b/db/db_impl/db_impl_files.cc
index 2c4b3bfde925..e2dc53e7d4ab 100644
--- a/db/db_impl/db_impl_files.cc
+++ b/db/db_impl/db_impl_files.cc
@@ -28,7 +28,7 @@ uint64_t DBImpl::MinLogNumberToKeep() {
   return versions_->min_log_number_to_keep();
 }
 
-uint64_t DBImpl::MinLogNumberToRecycle() { return min_log_number_to_recycle_; }
+uint64_t DBImpl::MinLogNumberToRecycle() { return min_wal_number_to_recycle_; }
 
 uint64_t DBImpl::MinObsoleteSstNumberToKeep() {
   mutex_.AssertHeld();
@@ -272,77 +272,77 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force,
 
   // logs_ is empty when called during recovery, in which case there can't yet
   // be any tracked obsolete logs
-  log_write_mutex_.Lock();
+  wal_write_mutex_.Lock();
 
-  if (alive_log_files_.empty() || logs_.empty()) {
+  if (alive_wal_files_.empty() || logs_.empty()) {
     mutex_.AssertHeld();
     // We may reach here if the db is DBImplSecondary
-    log_write_mutex_.Unlock();
+    wal_write_mutex_.Unlock();
     return;
   }
 
   bool mutex_unlocked = false;
-  if (!alive_log_files_.empty() && !logs_.empty()) {
+  if (!alive_wal_files_.empty() && !logs_.empty()) {
     uint64_t min_log_number = job_context->log_number;
-    size_t num_alive_log_files = alive_log_files_.size();
+    size_t num_alive_wal_files = alive_wal_files_.size();
     // find newly obsoleted log files
-    while (alive_log_files_.begin()->number < min_log_number) {
-      auto& earliest = *alive_log_files_.begin();
+    while (alive_wal_files_.begin()->number < min_log_number) {
+      auto& earliest = *alive_wal_files_.begin();
       if (immutable_db_options_.recycle_log_file_num >
-              log_recycle_files_.size() &&
+              wal_recycle_files_.size() &&
           earliest.number >= MinLogNumberToRecycle()) {
         ROCKS_LOG_INFO(immutable_db_options_.info_log,
                        "adding log %" PRIu64 " to recycle list\n",
                        earliest.number);
-        log_recycle_files_.push_back(earliest.number);
+        wal_recycle_files_.push_back(earliest.number);
       } else {
         job_context->log_delete_files.push_back(earliest.number);
       }
       if (job_context->size_log_to_delete == 0) {
-        job_context->prev_total_log_size = total_log_size_;
-        job_context->num_alive_log_files = num_alive_log_files;
+        job_context->prev_wals_total_size = wals_total_size_.LoadRelaxed();
+        job_context->num_alive_wal_files = num_alive_wal_files;
       }
       job_context->size_log_to_delete += earliest.size;
-      total_log_size_ -= earliest.size;
-      alive_log_files_.pop_front();
+      wals_total_size_.FetchSubRelaxed(earliest.size);
+      alive_wal_files_.pop_front();
 
       // Current log should always stay alive since it can't have
       // number < MinLogNumber().
-      assert(alive_log_files_.size());
+      assert(alive_wal_files_.size());
     }
-    log_write_mutex_.Unlock();
+    wal_write_mutex_.Unlock();
     mutex_.Unlock();
     mutex_unlocked = true;
     TEST_SYNC_POINT_CALLBACK("FindObsoleteFiles::PostMutexUnlock", nullptr);
-    log_write_mutex_.Lock();
+    wal_write_mutex_.Lock();
     while (!logs_.empty() && logs_.front().number < min_log_number) {
       auto& log = logs_.front();
       if (log.IsSyncing()) {
-        log_sync_cv_.Wait();
+        wal_sync_cv_.Wait();
         // logs_ could have changed while we were waiting.
         continue;
       }
       // This WAL file is not live, so it's OK if we never sync the rest of it.
       // If it's already closed, then it's been fully synced. If
       // !background_close_inactive_wals then we need to Close it before
-      // removing from logs_ but not blocking while holding log_write_mutex_.
+      // removing from logs_ but not blocking while holding wal_write_mutex_.
       if (!immutable_db_options_.background_close_inactive_wals &&
           log.writer->file()) {
         // We are taking ownership of and pinning the front entry, so we can
         // expect it to be the same after releasing and re-acquiring the lock
         log.PrepareForSync();
-        log_write_mutex_.Unlock();
+        wal_write_mutex_.Unlock();
         // TODO: maybe check the return value of Close.
         // TODO: plumb Env::IOActivity, Env::IOPriority
         auto s = log.writer->file()->Close({});
         s.PermitUncheckedError();
-        log_write_mutex_.Lock();
+        wal_write_mutex_.Lock();
         log.writer->PublishIfClosed();
         assert(&log == &logs_.front());
         log.FinishSync();
-        log_sync_cv_.SignalAll();
+        wal_sync_cv_.SignalAll();
       }
-      logs_to_free_.push_back(log.ReleaseWriter());
+      wals_to_free_.push_back(log.ReleaseWriter());
       logs_.pop_front();
     }
     // Current log cannot be obsolete.
@@ -350,16 +350,16 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force,
   }
 
   // We're just cleaning up for DB::Write().
-  assert(job_context->logs_to_free.empty());
-  job_context->logs_to_free = logs_to_free_;
+  assert(job_context->wals_to_free.empty());
+  job_context->wals_to_free = wals_to_free_;
 
-  logs_to_free_.clear();
-  log_write_mutex_.Unlock();
+  wals_to_free_.clear();
+  wal_write_mutex_.Unlock();
   if (mutex_unlocked) {
     mutex_.Lock();
   }
-  job_context->log_recycle_files.assign(log_recycle_files_.begin(),
-                                        log_recycle_files_.end());
+  job_context->log_recycle_files.assign(wal_recycle_files_.begin(),
+                                        wal_recycle_files_.end());
 }
 
 // Delete obsolete files and log status and information of file deletion
@@ -431,7 +431,7 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) {
                                             state.sst_live.end());
   std::unordered_set<uint64_t> blob_live_set(state.blob_live.begin(),
                                              state.blob_live.end());
-  std::unordered_set<uint64_t> log_recycle_files_set(
+  std::unordered_set<uint64_t> wal_recycle_files_set(
       state.log_recycle_files.begin(), state.log_recycle_files.end());
   std::unordered_set<uint64_t> quarantine_files_set(
       state.files_to_quarantine.begin(), state.files_to_quarantine.end());
@@ -491,13 +491,13 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) {
       std::unique(candidate_files.begin(), candidate_files.end()),
       candidate_files.end());
 
-  if (state.prev_total_log_size > 0) {
+  if (state.prev_wals_total_size > 0) {
     ROCKS_LOG_INFO(immutable_db_options_.info_log,
                    "[JOB %d] Try to delete WAL files size %" PRIu64
                    ", prev total WAL file size %" PRIu64
                    ", number of live WAL files %" ROCKSDB_PRIszt ".\n",
                    state.job_id, state.size_log_to_delete,
-                   state.prev_total_log_size, state.num_alive_log_files);
+                   state.prev_wals_total_size, state.num_alive_wal_files);
   }
 
   std::vector<std::string> old_info_log_files;
@@ -532,7 +532,7 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) {
   optsfile_num2 = std::min(optsfile_num2, state.min_options_file_number);
 
   // Close WALs before trying to delete them.
-  for (const auto w : state.logs_to_free) {
+  for (const auto w : state.wals_to_free) {
     // TODO: maybe check the return value of Close.
     // TODO: plumb Env::IOActivity, Env::IOPriority
     auto s = w->Close({});
@@ -559,8 +559,8 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) {
       case kWalFile:
         keep = ((number >= state.log_number) ||
                 (number == state.prev_log_number) ||
-                (log_recycle_files_set.find(number) !=
-                 log_recycle_files_set.end()));
+                (wal_recycle_files_set.find(number) !=
+                 wal_recycle_files_set.end()));
         break;
       case kDescriptorFile:
         // Keep my manifest file, and any newer incarnations'
diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc
index ab6902075f46..0e365c9b34bb 100644
--- a/db/db_impl/db_impl_open.cc
+++ b/db/db_impl/db_impl_open.cc
@@ -1119,7 +1119,7 @@ void DBOpenLogRecordReadReporter::Corruption(size_t bytes, const Status& s,
                  static_cast<int>(bytes), s.ToString().c_str());
   if (status != nullptr && status->ok()) {
     *status = s;
-    corrupted_log_number_ = log_number;
+    corrupted_wal_number_ = log_number;
   }
 }
 
@@ -1902,8 +1902,8 @@ void DBImpl::FinishLogFilesRecovery(int job_id, const Status& status) {
 }
 
 Status DBImpl::GetLogSizeAndMaybeTruncate(uint64_t wal_number, bool truncate,
-                                          LogFileNumberSize* log_ptr) {
-  LogFileNumberSize log(wal_number);
+                                          WalFileNumberSize* log_ptr) {
+  WalFileNumberSize log(wal_number);
   std::string fname =
       LogFileName(immutable_db_options_.GetWalDir(), wal_number);
   Status s;
@@ -1946,27 +1946,27 @@ Status DBImpl::RestoreAliveLogFiles(const std::vector<uint64_t>& wal_numbers) {
   assert(immutable_db_options_.avoid_flush_during_recovery);
   // Mark these as alive so they'll be considered for deletion later by
   // FindObsoleteFiles()
-  total_log_size_ = 0;
-  log_empty_ = false;
+  wals_total_size_.StoreRelaxed(0);
+  wal_empty_ = false;
   uint64_t min_wal_with_unflushed_data =
       versions_->MinLogNumberWithUnflushedData();
   for (auto wal_number : wal_numbers) {
     if (!allow_2pc() && wal_number < min_wal_with_unflushed_data) {
       // In non-2pc mode, the WAL files not backing unflushed data are not
-      // alive, thus should not be added to the alive_log_files_.
+      // alive, thus should not be added to the alive_wal_files_.
       continue;
     }
     // We preallocate space for wals, but then after a crash and restart, those
     // preallocated space are not needed anymore. It is likely only the last
     // log has such preallocated space, so we only truncate for the last log.
-    LogFileNumberSize log;
+    WalFileNumberSize log;
     s = GetLogSizeAndMaybeTruncate(
         wal_number, /*truncate=*/(wal_number == wal_numbers.back()), &log);
     if (!s.ok()) {
       break;
     }
-    total_log_size_ += log.size;
-    alive_log_files_.push_back(log);
+    wals_total_size_.FetchAddRelaxed(log.size);
+    alive_wal_files_.push_back(log);
   }
   return s;
 }
@@ -2449,18 +2449,18 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
     if (s.ok()) {
       // Prevent log files created by previous instance from being recycled.
       // They might be in alive_log_file_, and might get recycled otherwise.
-      impl->min_log_number_to_recycle_ = new_log_number;
+      impl->min_wal_number_to_recycle_ = new_log_number;
     }
     if (s.ok()) {
-      InstrumentedMutexLock wl(&impl->log_write_mutex_);
-      impl->logfile_number_ = new_log_number;
+      InstrumentedMutexLock wl(&impl->wal_write_mutex_);
+      impl->cur_wal_number_ = new_log_number;
       assert(new_log != nullptr);
       assert(impl->logs_.empty());
       impl->logs_.emplace_back(new_log_number, new_log);
     }
 
     if (s.ok()) {
-      impl->alive_log_files_.emplace_back(impl->logfile_number_);
+      impl->alive_wal_files_.emplace_back(impl->cur_wal_number_);
       // In WritePrepared there could be gap in sequence numbers. This breaks
       // the trick we use in kPointInTimeRecovery which assumes the first seq in
       // the log right after the corrupted log is one larger than the last seq
@@ -2473,14 +2473,14 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
       if (recovered_seq != kMaxSequenceNumber) {
         WriteBatch empty_batch;
         WriteBatchInternal::SetSequence(&empty_batch, recovered_seq);
-        uint64_t log_used, log_size;
+        uint64_t wal_used, log_size;
         log::Writer* log_writer = impl->logs_.back().writer;
-        LogFileNumberSize& log_file_number_size = impl->alive_log_files_.back();
+        WalFileNumberSize& wal_file_number_size = impl->alive_wal_files_.back();
 
-        assert(log_writer->get_log_number() == log_file_number_size.number);
+        assert(log_writer->get_log_number() == wal_file_number_size.number);
         impl->mutex_.AssertHeld();
-        s = impl->WriteToWAL(empty_batch, write_options, log_writer, &log_used,
-                             &log_size, log_file_number_size, recovered_seq);
+        s = impl->WriteToWAL(empty_batch, write_options, log_writer, &wal_used,
+                             &log_size, wal_file_number_size, recovered_seq);
         if (s.ok()) {
           // Need to fsync, otherwise it might get lost after a power reset.
           s = impl->FlushWAL(write_options, false);
diff --git a/db/db_impl/db_impl_write.cc b/db/db_impl/db_impl_write.cc
index 16a80a17bfa5..d6639a4b29a3 100644
--- a/db/db_impl/db_impl_write.cc
+++ b/db/db_impl/db_impl_write.cc
@@ -157,7 +157,7 @@ Status DBImpl::Write(const WriteOptions& write_options, WriteBatch* my_batch) {
   if (s.ok()) {
     s = WriteImpl(write_options, my_batch, /*callback=*/nullptr,
                   /*user_write_cb=*/nullptr,
-                  /*log_used=*/nullptr);
+                  /*wal_used=*/nullptr);
   }
   return s;
 }
@@ -314,7 +314,7 @@ Status DBImpl::IngestWBWI(std::shared_ptr<WriteBatchWithIndex> wbwi,
 
 Status DBImpl::WriteImpl(const WriteOptions& write_options,
                          WriteBatch* my_batch, WriteCallback* callback,
-                         UserWriteCallback* user_write_cb, uint64_t* log_used,
+                         UserWriteCallback* user_write_cb, uint64_t* wal_used,
                          uint64_t log_ref, bool disable_memtable,
                          uint64_t* seq_used, size_t batch_cnt,
                          PreReleaseCallback* pre_release_callback,
@@ -444,7 +444,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
     // they don't consume sequence.
     return WriteImplWALOnly(
         &nonmem_write_thread_, write_options, my_batch, callback, user_write_cb,
-        log_used, log_ref, seq_used, batch_cnt, pre_release_callback,
+        wal_used, log_ref, seq_used, batch_cnt, pre_release_callback,
         assign_order, kDontPublishLastSeq, disable_memtable);
   }
 
@@ -458,7 +458,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
     // sequence in in increasing order, iii) call pre_release_callback serially
     Status status = WriteImplWALOnly(
         &write_thread_, write_options, my_batch, callback, user_write_cb,
-        log_used, log_ref, &seq, sub_batch_cnt, pre_release_callback,
+        wal_used, log_ref, &seq, sub_batch_cnt, pre_release_callback,
         kDoAssignOrder, kDoPublishLastSeq, disable_memtable);
     TEST_SYNC_POINT("DBImpl::WriteImpl:UnorderedWriteAfterWriteWAL");
     if (!status.ok()) {
@@ -477,7 +477,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
 
   if (immutable_db_options_.enable_pipelined_write) {
     return PipelinedWriteImpl(write_options, my_batch, callback, user_write_cb,
-                              log_used, log_ref, disable_memtable, seq_used);
+                              wal_used, log_ref, disable_memtable, seq_used);
   }
 
   PERF_TIMER_GUARD(write_pre_and_post_process_time);
@@ -535,8 +535,8 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
     // STATE_COMPLETED conditional below handles exit
   }
   if (w.state == WriteThread::STATE_COMPLETED) {
-    if (log_used != nullptr) {
-      *log_used = w.log_used;
+    if (wal_used != nullptr) {
+      *wal_used = w.wal_used;
     }
     if (seq_used != nullptr) {
       *seq_used = w.sequence;
@@ -552,7 +552,8 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
   // when it finds suitable, and finish them in the same write batch.
   // This is how a write job could be done by the other writer.
   WriteContext write_context;
-  LogContext log_context(write_options.sync);
+  // FIXME: also check disableWAL like others?
+  WalContext wal_context(write_options.sync);
   WriteThread::WriteGroup write_group;
   bool in_parallel_group = false;
   uint64_t last_sequence = kMaxSequenceNumber;
@@ -566,7 +567,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
     // PreprocessWrite does its own perf timing.
     PERF_TIMER_STOP(write_pre_and_post_process_time);
 
-    status = PreprocessWrite(write_options, &log_context, &write_context);
+    status = PreprocessWrite(write_options, &wal_context, &write_context);
     if (!two_write_queues_) {
       // Assign it after ::PreprocessWrite since the sequence might advance
       // inside it by WriteRecoverableState
@@ -692,23 +693,21 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
 
     if (!two_write_queues_) {
       if (status.ok() && !write_options.disableWAL) {
-        assert(log_context.log_file_number_size);
-        log_context.prev_size = log_context.writer->file()->GetFileSize();
-        LogFileNumberSize& log_file_number_size =
-            *(log_context.log_file_number_size);
+        assert(wal_context.wal_file_number_size);
+        wal_context.prev_size = wal_context.writer->file()->GetFileSize();
         PERF_TIMER_GUARD(write_wal_time);
-        io_s =
-            WriteToWAL(write_group, log_context.writer, log_used,
-                       log_context.need_log_sync, log_context.need_log_dir_sync,
-                       last_sequence + 1, log_file_number_size);
+        io_s = WriteGroupToWAL(write_group, wal_context.writer, wal_used,
+                               wal_context.need_wal_sync,
+                               wal_context.need_wal_dir_sync, last_sequence + 1,
+                               *wal_context.wal_file_number_size);
       }
     } else {
       if (status.ok() && !write_options.disableWAL) {
         PERF_TIMER_GUARD(write_wal_time);
         // LastAllocatedSequence is increased inside WriteToWAL under
         // wal_write_mutex_ to ensure ordered events in WAL
-        io_s = ConcurrentWriteToWAL(write_group, log_used, &last_sequence,
-                                    seq_inc);
+        io_s = ConcurrentWriteGroupToWAL(write_group, wal_used, &last_sequence,
+                                         seq_inc);
       } else {
         // Otherwise we inc seq number for memtable writes
         last_sequence = versions_->FetchAddLastAllocatedSequence(seq_inc);
@@ -720,16 +719,16 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
     last_sequence += seq_inc;
     // Seqno assigned to this write are [current_sequence, last_sequence]
 
-    if (log_context.need_log_sync) {
+    if (wal_context.need_wal_sync) {
       VersionEdit synced_wals;
-      log_write_mutex_.Lock();
+      wal_write_mutex_.Lock();
       if (status.ok()) {
-        MarkLogsSynced(logfile_number_, log_context.need_log_dir_sync,
+        MarkLogsSynced(cur_wal_number_, wal_context.need_wal_dir_sync,
                        &synced_wals);
       } else {
-        MarkLogsNotSynced(logfile_number_);
+        MarkLogsNotSynced(cur_wal_number_);
       }
-      log_write_mutex_.Unlock();
+      wal_write_mutex_.Unlock();
       if (status.ok() && synced_wals.IsWalAddition()) {
         InstrumentedMutexLock l(&mutex_);
         // TODO: plumb Env::IOActivity, Env::IOPriority
@@ -764,7 +763,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
         writer->sequence = next_sequence;
         if (writer->pre_release_callback) {
           Status ws = writer->pre_release_callback->Callback(
-              writer->sequence, disable_memtable, writer->log_used, index++,
+              writer->sequence, disable_memtable, writer->wal_used, index++,
               pre_release_callback_cnt);
           if (!ws.ok()) {
             status = pre_release_cb_status = ws;
@@ -882,10 +881,10 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
       }
     }
     if (!w.status.ok()) {
-      if (log_context.prev_size < SIZE_MAX) {
-        InstrumentedMutexLock l(&log_write_mutex_);
-        if (logs_.back().number == log_context.log_file_number_size->number) {
-          logs_.back().SetAttemptTruncateSize(log_context.prev_size);
+      if (wal_context.prev_size < SIZE_MAX) {
+        InstrumentedMutexLock l(&wal_write_mutex_);
+        if (logs_.back().number == wal_context.wal_file_number_size->number) {
+          logs_.back().SetAttemptTruncateSize(wal_context.prev_size);
         }
       }
       HandleMemTableInsertFailure(w.status);
@@ -902,7 +901,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
 Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options,
                                   WriteBatch* my_batch, WriteCallback* callback,
                                   UserWriteCallback* user_write_cb,
-                                  uint64_t* log_used, uint64_t log_ref,
+                                  uint64_t* wal_used, uint64_t log_ref,
                                   bool disable_memtable, uint64_t* seq_used) {
   PERF_TIMER_GUARD(write_pre_and_post_process_time);
   StopWatch write_sw(immutable_db_options_.clock, stats_, DB_WRITE);
@@ -919,10 +918,10 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options,
     if (w.callback && !w.callback->AllowWriteBatching()) {
       write_thread_.WaitForMemTableWriters();
     }
-    LogContext log_context(!write_options.disableWAL && write_options.sync);
+    WalContext wal_context(!write_options.disableWAL && write_options.sync);
     // PreprocessWrite does its own perf timing.
     PERF_TIMER_STOP(write_pre_and_post_process_time);
-    w.status = PreprocessWrite(write_options, &log_context, &write_context);
+    w.status = PreprocessWrite(write_options, &wal_context, &write_context);
     PERF_TIMER_START(write_pre_and_post_process_time);
 
     // This can set non-OK status if callback fail.
@@ -991,13 +990,13 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options,
                           wal_write_group.size - 1);
         RecordTick(stats_, WRITE_DONE_BY_OTHER, wal_write_group.size - 1);
       }
-      assert(log_context.log_file_number_size);
-      LogFileNumberSize& log_file_number_size =
-          *(log_context.log_file_number_size);
-      io_s =
-          WriteToWAL(wal_write_group, log_context.writer, log_used,
-                     log_context.need_log_sync, log_context.need_log_dir_sync,
-                     current_sequence, log_file_number_size);
+      assert(wal_context.wal_file_number_size);
+      WalFileNumberSize& wal_file_number_size =
+          *(wal_context.wal_file_number_size);
+      io_s = WriteGroupToWAL(wal_write_group, wal_context.writer, wal_used,
+                             wal_context.need_wal_sync,
+                             wal_context.need_wal_dir_sync, current_sequence,
+                             wal_file_number_size);
       w.status = io_s;
     }
 
@@ -1009,13 +1008,13 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options,
     }
 
     VersionEdit synced_wals;
-    if (log_context.need_log_sync) {
-      InstrumentedMutexLock l(&log_write_mutex_);
+    if (wal_context.need_wal_sync) {
+      InstrumentedMutexLock l(&wal_write_mutex_);
       if (w.status.ok()) {
-        MarkLogsSynced(logfile_number_, log_context.need_log_dir_sync,
+        MarkLogsSynced(cur_wal_number_, wal_context.need_wal_dir_sync,
                        &synced_wals);
       } else {
-        MarkLogsNotSynced(logfile_number_);
+        MarkLogsNotSynced(cur_wal_number_);
       }
     }
     if (w.status.ok() && synced_wals.IsWalAddition()) {
@@ -1156,7 +1155,7 @@ Status DBImpl::UnorderedWriteMemtable(const WriteOptions& write_options,
 Status DBImpl::WriteImplWALOnly(
     WriteThread* write_thread, const WriteOptions& write_options,
     WriteBatch* my_batch, WriteCallback* callback,
-    UserWriteCallback* user_write_cb, uint64_t* log_used,
+    UserWriteCallback* user_write_cb, uint64_t* wal_used,
     const uint64_t log_ref, uint64_t* seq_used, const size_t sub_batch_cnt,
     PreReleaseCallback* pre_release_callback, const AssignOrder assign_order,
     const PublishLastSeq publish_last_seq, const bool disable_memtable) {
@@ -1169,8 +1168,8 @@ Status DBImpl::WriteImplWALOnly(
   write_thread->JoinBatchGroup(&w);
   assert(w.state != WriteThread::STATE_PARALLEL_MEMTABLE_WRITER);
   if (w.state == WriteThread::STATE_COMPLETED) {
-    if (log_used != nullptr) {
-      *log_used = w.log_used;
+    if (wal_used != nullptr) {
+      *wal_used = w.wal_used;
     }
     if (seq_used != nullptr) {
       *seq_used = w.sequence;
@@ -1186,10 +1185,10 @@ Status DBImpl::WriteImplWALOnly(
 
     // TODO(myabandeh): Make preliminary checks thread-safe so we could do them
     // without paying the cost of obtaining the mutex.
-    LogContext log_context;
+    WalContext wal_context;
     WriteContext write_context;
     Status status =
-        PreprocessWrite(write_options, &log_context, &write_context);
+        PreprocessWrite(write_options, &wal_context, &write_context);
     WriteStatusCheckOnLocked(status);
 
     if (!status.ok()) {
@@ -1286,8 +1285,8 @@ Status DBImpl::WriteImplWALOnly(
   }
   Status status;
   if (!write_options.disableWAL) {
-    IOStatus io_s =
-        ConcurrentWriteToWAL(write_group, log_used, &last_sequence, seq_inc);
+    IOStatus io_s = ConcurrentWriteGroupToWAL(write_group, wal_used,
+                                              &last_sequence, seq_inc);
     status = io_s;
     // last_sequence may not be set if there is an error
     // This error checking and return is moved up to avoid using uninitialized
@@ -1339,7 +1338,7 @@ Status DBImpl::WriteImplWALOnly(
       if (!writer->CallbackFailed() && writer->pre_release_callback) {
         assert(writer->sequence != kMaxSequenceNumber);
         Status ws = writer->pre_release_callback->Callback(
-            writer->sequence, disable_memtable, writer->log_used, index++,
+            writer->sequence, disable_memtable, writer->wal_used, index++,
             pre_release_callback_cnt);
         if (!ws.ok()) {
           status = ws;
@@ -1421,9 +1420,9 @@ void DBImpl::HandleMemTableInsertFailure(const Status& status) {
 }
 
 Status DBImpl::PreprocessWrite(const WriteOptions& write_options,
-                               LogContext* log_context,
+                               WalContext* wal_context,
                                WriteContext* write_context) {
-  assert(write_context != nullptr && log_context != nullptr);
+  assert(write_context != nullptr && wal_context != nullptr);
   Status status;
 
   if (error_handler_.IsDBStopped()) {
@@ -1433,7 +1432,8 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options,
 
   PERF_TIMER_GUARD(write_scheduling_flushes_compactions_time);
 
-  if (UNLIKELY(status.ok() && total_log_size_ > GetMaxTotalWalSize())) {
+  if (UNLIKELY(status.ok() &&
+               wals_total_size_.LoadRelaxed() > GetMaxTotalWalSize())) {
     assert(versions_);
     InstrumentedMutexLock l(&mutex_);
     const ColumnFamilySet* const column_families =
@@ -1502,17 +1502,17 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options,
       WriteBufferManagerStallWrites();
     }
   }
-  InstrumentedMutexLock l(&log_write_mutex_);
-  if (status.ok() && log_context->need_log_sync) {
+  InstrumentedMutexLock l(&wal_write_mutex_);
+  if (status.ok() && wal_context->need_wal_sync) {
     // Wait until the parallel syncs are finished. Any sync process has to sync
     // the front log too so it is enough to check the status of front()
-    // We do a while loop since log_sync_cv_ is signalled when any sync is
+    // We do a while loop since wal_sync_cv_ is signalled when any sync is
     // finished
     // Note: there does not seem to be a reason to wait for parallel sync at
     // this early step but it is not important since parallel sync (SyncWAL) and
-    // need_log_sync are usually not used together.
+    // need_wal_sync are usually not used together.
     while (logs_.front().IsSyncing()) {
-      log_sync_cv_.Wait();
+      wal_sync_cv_.Wait();
     }
     for (auto& log : logs_) {
       // This is just to prevent the logs to be synced by a parallel SyncWAL
@@ -1523,12 +1523,12 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options,
       log.PrepareForSync();
     }
   } else {
-    log_context->need_log_sync = false;
+    wal_context->need_wal_sync = false;
   }
-  log_context->writer = logs_.back().writer;
-  log_context->need_log_dir_sync =
-      log_context->need_log_dir_sync && !log_dir_synced_;
-  log_context->log_file_number_size = std::addressof(alive_log_files_.back());
+  wal_context->writer = logs_.back().writer;
+  wal_context->need_wal_dir_sync =
+      wal_context->need_wal_dir_sync && !wal_dir_synced_;
+  wal_context->wal_file_number_size = std::addressof(alive_wal_files_.back());
 
   return status;
 }
@@ -1579,12 +1579,12 @@ Status DBImpl::MergeBatch(const WriteThread::WriteGroup& write_group,
 }
 
 // When two_write_queues_ is disabled, this function is called from the only
-// write thread. Otherwise this must be called holding log_write_mutex_.
+// write thread. Otherwise this must be called holding wal_write_mutex_.
 IOStatus DBImpl::WriteToWAL(const WriteBatch& merged_batch,
                             const WriteOptions& write_options,
-                            log::Writer* log_writer, uint64_t* log_used,
+                            log::Writer* log_writer, uint64_t* wal_used,
                             uint64_t* log_size,
-                            LogFileNumberSize& log_file_number_size,
+                            WalFileNumberSize& wal_file_number_size,
                             SequenceNumber sequence) {
   assert(log_size != nullptr);
 
@@ -1596,7 +1596,7 @@ IOStatus DBImpl::WriteToWAL(const WriteBatch& merged_batch,
   }
   *log_size = log_entry.size();
   // When two_write_queues_ WriteToWAL has to be protected from concurretn calls
-  // from the two queues anyway and log_write_mutex_ is already held. Otherwise
+  // from the two queues anyway and wal_write_mutex_ is already held. Otherwise
   // if manual_wal_flush_ is enabled we need to protect log_writer->AddRecord
   // from possible concurrent calls via the FlushWAL by the application.
   const bool needs_locking = manual_wal_flush_ && !two_write_queues_;
@@ -1604,7 +1604,7 @@ IOStatus DBImpl::WriteToWAL(const WriteBatch& merged_batch,
   // manual_wal_flush_ feature (by UNLIKELY) instead of the more common case
   // when we do not need any locking.
   if (UNLIKELY(needs_locking)) {
-    log_write_mutex_.Lock();
+    wal_write_mutex_.Lock();
   }
   IOStatus io_s = log_writer->MaybeAddUserDefinedTimestampSizeRecord(
       write_options, versions_->GetColumnFamiliesTimestampSizeForRecord());
@@ -1614,24 +1614,24 @@ IOStatus DBImpl::WriteToWAL(const WriteBatch& merged_batch,
   io_s = log_writer->AddRecord(write_options, log_entry, sequence);
 
   if (UNLIKELY(needs_locking)) {
-    log_write_mutex_.Unlock();
+    wal_write_mutex_.Unlock();
   }
-  if (log_used != nullptr) {
-    *log_used = logfile_number_;
-    assert(*log_used == log_file_number_size.number);
+  if (wal_used != nullptr) {
+    *wal_used = cur_wal_number_;
+    assert(*wal_used == wal_file_number_size.number);
   }
-  total_log_size_ += log_entry.size();
-  log_file_number_size.AddSize(*log_size);
-  log_empty_ = false;
+  wals_total_size_.FetchAddRelaxed(log_entry.size());
+  wal_file_number_size.AddSize(*log_size);
+  wal_empty_ = false;
 
   return io_s;
 }
 
-IOStatus DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group,
-                            log::Writer* log_writer, uint64_t* log_used,
-                            bool need_log_sync, bool need_log_dir_sync,
-                            SequenceNumber sequence,
-                            LogFileNumberSize& log_file_number_size) {
+IOStatus DBImpl::WriteGroupToWAL(const WriteThread::WriteGroup& write_group,
+                                 log::Writer* log_writer, uint64_t* wal_used,
+                                 bool need_wal_sync, bool need_wal_dir_sync,
+                                 SequenceNumber sequence,
+                                 WalFileNumberSize& wal_file_number_size) {
   IOStatus io_s;
   assert(!two_write_queues_);
   assert(!write_group.leader->disable_wal);
@@ -1646,10 +1646,10 @@ IOStatus DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group,
   }
 
   if (merged_batch == write_group.leader->batch) {
-    write_group.leader->log_used = logfile_number_;
+    write_group.leader->wal_used = cur_wal_number_;
   } else if (write_with_wal > 1) {
     for (auto writer : write_group) {
-      writer->log_used = logfile_number_;
+      writer->wal_used = cur_wal_number_;
     }
   }
 
@@ -1661,14 +1661,14 @@ IOStatus DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group,
   WriteOptions write_options;
   write_options.rate_limiter_priority =
       write_group.leader->rate_limiter_priority;
-  io_s = WriteToWAL(*merged_batch, write_options, log_writer, log_used,
-                    &log_size, log_file_number_size, sequence);
+  io_s = WriteToWAL(*merged_batch, write_options, log_writer, wal_used,
+                    &log_size, wal_file_number_size, sequence);
   if (to_be_cached_state) {
     cached_recoverable_state_ = *to_be_cached_state;
     cached_recoverable_state_empty_ = false;
   }
 
-  if (io_s.ok() && need_log_sync) {
+  if (io_s.ok() && need_wal_sync) {
     StopWatch sw(immutable_db_options_.clock, stats_, WAL_FILE_SYNC_MICROS);
     // It's safe to access logs_ with unlocked mutex_ here because:
     //  - we've set getting_synced=true for all logs,
@@ -1678,15 +1678,15 @@ IOStatus DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group,
     //  - as long as other threads don't modify it, it's safe to read
     //    from std::deque from multiple threads concurrently.
     //
-    // Sync operation should work with locked log_write_mutex_, because:
+    // Sync operation should work with locked wal_write_mutex_, because:
     //   when DBOptions.manual_wal_flush_ is set,
     //   FlushWAL function will be invoked by another thread.
-    //   if without locked log_write_mutex_, the log file may get data
+    //   if without locked wal_write_mutex_, the log file may get data
     //   corruption
 
     const bool needs_locking = manual_wal_flush_ && !two_write_queues_;
     if (UNLIKELY(needs_locking)) {
-      log_write_mutex_.Lock();
+      wal_write_mutex_.Lock();
     }
 
     if (io_s.ok()) {
@@ -1709,10 +1709,10 @@ IOStatus DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group,
     }
 
     if (UNLIKELY(needs_locking)) {
-      log_write_mutex_.Unlock();
+      wal_write_mutex_.Unlock();
     }
 
-    if (io_s.ok() && need_log_dir_sync) {
+    if (io_s.ok() && need_wal_dir_sync) {
       // We only sync WAL directory the first time WAL syncing is
       // requested, so that in case users never turn on WAL sync,
       // we can avoid the disk I/O in the write code path.
@@ -1727,7 +1727,7 @@ IOStatus DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group,
   }
   if (io_s.ok()) {
     auto stats = default_cf_internal_stats_;
-    if (need_log_sync) {
+    if (need_wal_sync) {
       stats->AddDBStats(InternalStats::kIntStatsWalFileSynced, 1);
       RecordTick(stats_, WAL_FILE_SYNCED);
     }
@@ -1744,8 +1744,8 @@ IOStatus DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group,
   return io_s;
 }
 
-IOStatus DBImpl::ConcurrentWriteToWAL(
-    const WriteThread::WriteGroup& write_group, uint64_t* log_used,
+IOStatus DBImpl::ConcurrentWriteGroupToWAL(
+    const WriteThread::WriteGroup& write_group, uint64_t* wal_used,
     SequenceNumber* last_sequence, size_t seq_inc) {
   IOStatus io_s;
 
@@ -1762,14 +1762,14 @@ IOStatus DBImpl::ConcurrentWriteToWAL(
     return io_s;
   }
 
-  // We need to lock log_write_mutex_ since logs_ and alive_log_files might be
+  // We need to lock wal_write_mutex_ since logs_ and alive_wal_files might be
   // pushed back concurrently
-  log_write_mutex_.Lock();
+  wal_write_mutex_.Lock();
   if (merged_batch == write_group.leader->batch) {
-    write_group.leader->log_used = logfile_number_;
+    write_group.leader->wal_used = cur_wal_number_;
   } else if (write_with_wal > 1) {
     for (auto writer : write_group) {
-      writer->log_used = logfile_number_;
+      writer->wal_used = cur_wal_number_;
     }
   }
   *last_sequence = versions_->FetchAddLastAllocatedSequence(seq_inc);
@@ -1777,9 +1777,9 @@ IOStatus DBImpl::ConcurrentWriteToWAL(
   WriteBatchInternal::SetSequence(merged_batch, sequence);
 
   log::Writer* log_writer = logs_.back().writer;
-  LogFileNumberSize& log_file_number_size = alive_log_files_.back();
+  WalFileNumberSize& wal_file_number_size = alive_wal_files_.back();
 
-  assert(log_writer->get_log_number() == log_file_number_size.number);
+  assert(log_writer->get_log_number() == wal_file_number_size.number);
 
   uint64_t log_size;
 
@@ -1787,13 +1787,13 @@ IOStatus DBImpl::ConcurrentWriteToWAL(
   WriteOptions write_options;
   write_options.rate_limiter_priority =
       write_group.leader->rate_limiter_priority;
-  io_s = WriteToWAL(*merged_batch, write_options, log_writer, log_used,
-                    &log_size, log_file_number_size, sequence);
+  io_s = WriteToWAL(*merged_batch, write_options, log_writer, wal_used,
+                    &log_size, wal_file_number_size, sequence);
   if (to_be_cached_state) {
     cached_recoverable_state_ = *to_be_cached_state;
     cached_recoverable_state_empty_ = false;
   }
-  log_write_mutex_.Unlock();
+  wal_write_mutex_.Unlock();
 
   if (io_s.ok()) {
     const bool concurrent = true;
@@ -1821,7 +1821,7 @@ Status DBImpl::WriteRecoverableState() {
     bool dont_care_bool;
     SequenceNumber next_seq;
     if (two_write_queues_) {
-      log_write_mutex_.Lock();
+      wal_write_mutex_.Lock();
     }
     SequenceNumber seq;
     if (two_write_queues_) {
@@ -1846,7 +1846,7 @@ Status DBImpl::WriteRecoverableState() {
       HandleMemTableInsertFailure(status);
     }
     if (two_write_queues_) {
-      log_write_mutex_.Unlock();
+      wal_write_mutex_.Unlock();
     }
     if (status.ok() && recoverable_state_pre_release_callback_) {
       const bool DISABLE_MEMTABLE = true;
@@ -1927,11 +1927,11 @@ Status DBImpl::SwitchWAL(WriteContext* write_context) {
   assert(write_context != nullptr);
   Status status;
 
-  if (alive_log_files_.begin()->getting_flushed) {
+  if (alive_wal_files_.begin()->getting_flushed) {
     return status;
   }
 
-  auto oldest_alive_log = alive_log_files_.begin()->number;
+  auto oldest_alive_log = alive_wal_files_.begin()->number;
   bool flush_wont_release_oldest_log = false;
   if (allow_2pc()) {
     auto oldest_log_with_uncommitted_prep =
@@ -1961,14 +1961,14 @@ Status DBImpl::SwitchWAL(WriteContext* write_context) {
     // transactions then we cannot flush this log until those transactions are
     // commited.
     unable_to_release_oldest_log_ = false;
-    alive_log_files_.begin()->getting_flushed = true;
+    alive_wal_files_.begin()->getting_flushed = true;
   }
 
   ROCKS_LOG_INFO(
       immutable_db_options_.info_log,
       "Flushing all column families with data in WAL number %" PRIu64
       ". Total log size is %" PRIu64 " while max_total_wal_size is %" PRIu64,
-      oldest_alive_log, total_log_size_.load(), GetMaxTotalWalSize());
+      oldest_alive_log, wals_total_size_.LoadRelaxed(), GetMaxTotalWalSize());
   // no need to refcount because drop is happening in write thread, so can't
   // happen while we're in the write thread
   autovector<ColumnFamilyData*> cfds;
@@ -2438,21 +2438,21 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context,
   // Do this without holding the dbmutex lock.
   assert(versions_->prev_log_number() == 0);
   if (two_write_queues_) {
-    log_write_mutex_.Lock();
+    wal_write_mutex_.Lock();
   }
-  bool creating_new_log = !log_empty_;
+  bool creating_new_log = !wal_empty_;
   if (two_write_queues_) {
-    log_write_mutex_.Unlock();
+    wal_write_mutex_.Unlock();
   }
   uint64_t recycle_log_number = 0;
   // If file deletion is disabled, don't recycle logs since it'll result in
   // the file getting renamed
   if (creating_new_log && immutable_db_options_.recycle_log_file_num &&
-      !log_recycle_files_.empty() && IsFileDeletionsEnabled()) {
-    recycle_log_number = log_recycle_files_.front();
+      !wal_recycle_files_.empty() && IsFileDeletionsEnabled()) {
+    recycle_log_number = wal_recycle_files_.front();
   }
   uint64_t new_log_number =
-      creating_new_log ? versions_->NewFileNumber() : logfile_number_;
+      creating_new_log ? versions_->NewFileNumber() : cur_wal_number_;
   // For use outside of holding DB mutex
   const MutableCFOptions mutable_cf_options_copy =
       cfd->GetLatestMutableCFOptions();
@@ -2478,14 +2478,14 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context,
   mutex_.Unlock();
   if (creating_new_log) {
     PredecessorWALInfo info;
-    log_write_mutex_.Lock();
+    wal_write_mutex_.Lock();
     if (!logs_.empty()) {
       log::Writer* cur_log_writer = logs_.back().writer;
       info = PredecessorWALInfo(cur_log_writer->get_log_number(),
                                 cur_log_writer->file()->GetFileSize(),
                                 cur_log_writer->GetLastSeqnoRecorded());
     }
-    log_write_mutex_.Unlock();
+    wal_write_mutex_.Unlock();
     // TODO: Write buffer size passed in should be max of all CF's instead
     // of mutable_cf_options.write_buffer_size.
     io_s = CreateWAL(write_options, new_log_number, recycle_log_number,
@@ -2526,11 +2526,11 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context,
     // concurrent full purges don't delete the file while we're recycling it.
     // To achieve that we hold the old log number in the recyclable list until
     // after it has been renamed.
-    assert(log_recycle_files_.front() == recycle_log_number);
-    log_recycle_files_.pop_front();
+    assert(wal_recycle_files_.front() == recycle_log_number);
+    wal_recycle_files_.pop_front();
   }
   if (s.ok() && creating_new_log) {
-    InstrumentedMutexLock l(&log_write_mutex_);
+    InstrumentedMutexLock l(&wal_write_mutex_);
     assert(new_log != nullptr);
     if (!logs_.empty()) {
       // Alway flush the buffer of the last log before switching to a new one
@@ -2552,11 +2552,11 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context,
       }
     }
     if (s.ok()) {
-      logfile_number_ = new_log_number;
-      log_empty_ = true;
-      log_dir_synced_ = false;
-      logs_.emplace_back(logfile_number_, new_log);
-      alive_log_files_.emplace_back(logfile_number_);
+      cur_wal_number_ = new_log_number;
+      wal_empty_ = true;
+      wal_dir_synced_ = false;
+      logs_.emplace_back(cur_wal_number_, new_log);
+      alive_wal_files_.emplace_back(cur_wal_number_);
     }
   }
 
@@ -2587,7 +2587,7 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context,
     // obsolete. So we should track the WAL obsoletion event before actually
     // updating the empty CF's log number.
     uint64_t min_wal_number_to_keep =
-        versions_->PreComputeMinLogNumberWithUnflushedData(logfile_number_);
+        versions_->PreComputeMinLogNumberWithUnflushedData(cur_wal_number_);
     if (min_wal_number_to_keep >
         versions_->GetWalSet().GetMinWalNumberToKeep()) {
       // TODO: plumb Env::IOActivity, Env::IOPriority
@@ -2622,7 +2622,7 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context,
 
       for (auto cf : empty_cfs) {
         if (cf->IsEmpty()) {
-          cf->SetLogNumber(logfile_number_);
+          cf->SetLogNumber(cur_wal_number_);
           // MEMPURGE: No need to change this, because new adds
           // should still receive new sequence numbers.
           cf->mem()->SetCreationSeq(versions_->LastSequence());
@@ -2639,14 +2639,14 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context,
       // advance the log number. no need to persist this in the manifest
       if (cf->IsEmpty()) {
         if (creating_new_log) {
-          cf->SetLogNumber(logfile_number_);
+          cf->SetLogNumber(cur_wal_number_);
         }
         cf->mem()->SetCreationSeq(versions_->LastSequence());
       }
     }
   }
 
-  cfd->mem()->SetNextLogNumber(logfile_number_);
+  cfd->mem()->SetNextLogNumber(cur_wal_number_);
   assert(new_mem != nullptr);
   cfd->imm()->Add(cfd->mem(), &context->memtables_to_free_);
   if (new_imm) {
@@ -2658,7 +2658,7 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context,
     // we always try to flush all immutable memtable. For atomic flush, these
     // two memtables will be marked eligible for flush in the same call to
     // AssignAtomicFlushSeq().
-    new_imm->SetNextLogNumber(logfile_number_);
+    new_imm->SetNextLogNumber(cur_wal_number_);
     cfd->imm()->Add(new_imm, &context->memtables_to_free_);
   }
   new_mem->Ref();
diff --git a/db/db_kv_checksum_test.cc b/db/db_kv_checksum_test.cc
index 6eea6e5b4ba0..7d18688f0788 100644
--- a/db/db_kv_checksum_test.cc
+++ b/db/db_kv_checksum_test.cc
@@ -312,12 +312,12 @@ TEST_P(DbKvChecksumTest, WriteToWALCorrupted) {
     // Corrupted write batch leads to read-only mode, so we have to
     // reopen for every attempt.
     Reopen(options);
-    auto log_size_pre_write = dbfull()->TEST_total_log_size();
+    auto log_size_pre_write = dbfull()->TEST_wals_total_size();
 
     SyncPoint::GetInstance()->EnableProcessing();
     ASSERT_TRUE(ExecuteWrite(nullptr /* cf_handle */).IsCorruption());
     // Confirm that nothing was written to WAL
-    ASSERT_EQ(log_size_pre_write, dbfull()->TEST_total_log_size());
+    ASSERT_EQ(log_size_pre_write, dbfull()->TEST_wals_total_size());
     ASSERT_TRUE(dbfull()->TEST_GetBGError().IsCorruption());
     SyncPoint::GetInstance()->DisableProcessing();
 
@@ -350,12 +350,12 @@ TEST_P(DbKvChecksumTest, WriteToWALWithColumnFamilyCorrupted) {
     // Corrupted write batch leads to read-only mode, so we have to
     // reopen for every attempt.
     ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options);
-    auto log_size_pre_write = dbfull()->TEST_total_log_size();
+    auto log_size_pre_write = dbfull()->TEST_wals_total_size();
 
     SyncPoint::GetInstance()->EnableProcessing();
     ASSERT_TRUE(ExecuteWrite(nullptr /* cf_handle */).IsCorruption());
     // Confirm that nothing was written to WAL
-    ASSERT_EQ(log_size_pre_write, dbfull()->TEST_total_log_size());
+    ASSERT_EQ(log_size_pre_write, dbfull()->TEST_wals_total_size());
     ASSERT_TRUE(dbfull()->TEST_GetBGError().IsCorruption());
     SyncPoint::GetInstance()->DisableProcessing();
 
@@ -487,7 +487,7 @@ TEST_P(DbKvChecksumTestMergedBatch, WriteToWALCorrupted) {
     // Reopen DB since it failed WAL write which lead to read-only mode
     Reopen(options);
     SyncPoint::GetInstance()->EnableProcessing();
-    auto log_size_pre_write = dbfull()->TEST_total_log_size();
+    auto log_size_pre_write = dbfull()->TEST_wals_total_size();
     leader_batch_and_status =
         GetWriteBatch(GetCFHandleToUse(nullptr, op_type1_),
                       8 /* protection_bytes_per_key */, op_type1_);
@@ -499,7 +499,7 @@ TEST_P(DbKvChecksumTestMergedBatch, WriteToWALCorrupted) {
     SyncPoint::GetInstance()->ClearCallBack("WriteThread::JoinBatchGroup:Wait");
     ASSERT_EQ(1, leader_count);
     // Nothing should have been written to WAL
-    ASSERT_EQ(log_size_pre_write, dbfull()->TEST_total_log_size());
+    ASSERT_EQ(log_size_pre_write, dbfull()->TEST_wals_total_size());
     ASSERT_TRUE(dbfull()->TEST_GetBGError().IsCorruption());
 
     corrupt_byte_offset++;
@@ -599,7 +599,7 @@ TEST_P(DbKvChecksumTestMergedBatch, WriteToWALWithColumnFamilyCorrupted) {
     // Reopen DB since it failed WAL write which lead to read-only mode
     ReopenWithColumnFamilies({kDefaultColumnFamilyName, "ramen"}, options);
     SyncPoint::GetInstance()->EnableProcessing();
-    auto log_size_pre_write = dbfull()->TEST_total_log_size();
+    auto log_size_pre_write = dbfull()->TEST_wals_total_size();
     leader_batch_and_status =
         GetWriteBatch(GetCFHandleToUse(handles_[1], op_type1_),
                       8 /* protection_bytes_per_key */, op_type1_);
@@ -612,7 +612,7 @@ TEST_P(DbKvChecksumTestMergedBatch, WriteToWALWithColumnFamilyCorrupted) {
 
     ASSERT_EQ(1, leader_count);
     // Nothing should have been written to WAL
-    ASSERT_EQ(log_size_pre_write, dbfull()->TEST_total_log_size());
+    ASSERT_EQ(log_size_pre_write, dbfull()->TEST_wals_total_size());
     ASSERT_TRUE(dbfull()->TEST_GetBGError().IsCorruption());
 
     corrupt_byte_offset++;
diff --git a/db/db_test.cc b/db/db_test.cc
index b1c181a1f3ca..b3511f3eecd8 100644
--- a/db/db_test.cc
+++ b/db/db_test.cc
@@ -3407,7 +3407,7 @@ class ModelDB : public DB {
   }
 
   Status GetCurrentWalFile(
-      std::unique_ptr<LogFile>* /*current_log_file*/) override {
+      std::unique_ptr<LogFile>* /*current_wal_file*/) override {
     return Status::OK();
   }
 
@@ -6414,7 +6414,7 @@ TEST_F(DBTest, TestLogCleanup) {
 
   for (int i = 0; i < 100000; ++i) {
     ASSERT_OK(Put(Key(i), "val"));
-    // only 2 memtables will be alive, so logs_to_free needs to always be below
+    // only 2 memtables will be alive, so wals_to_free needs to always be below
     // 2
     ASSERT_LT(dbfull()->TEST_LogsToFreeSize(), static_cast<size_t>(3));
   }
diff --git a/db/db_wal_test.cc b/db/db_wal_test.cc
index f89cfe59463b..da9ef31587f7 100644
--- a/db/db_wal_test.cc
+++ b/db/db_wal_test.cc
@@ -3024,13 +3024,13 @@ TEST_F(DBWALTest, GetCompressedWalsAfterSync) {
   options.wal_compression = kZSTD;
   DestroyAndReopen(options);
 
-  // Write something to memtable and WAL so that log_empty_ will be false after
+  // Write something to memtable and WAL so that wal_empty_ will be false after
   // next DB::Open().
   ASSERT_OK(Put("a", "v"));
 
   Reopen(options);
 
-  // New WAL is created, thanks to !log_empty_.
+  // New WAL is created, thanks to !wal_empty_.
   ASSERT_OK(dbfull()->TEST_SwitchWAL());
 
   ASSERT_OK(Put("b", "v"));
diff --git a/db/db_write_test.cc b/db/db_write_test.cc
index 2dfcd864f5a5..e3afd219dcc3 100644
--- a/db/db_write_test.cc
+++ b/db/db_write_test.cc
@@ -987,7 +987,7 @@ TEST_P(DBWriteTest, RecycleLogToggleTest) {
 
   options.recycle_log_file_num = 1;
   Reopen(options);
-  // 1.log is added to alive_log_files_
+  // 1.log is added to alive_wal_files_
   ASSERT_OK(Put(Key(2), "val1"));
   ASSERT_OK(Flush());
   // 1.log should be deleted and not recycled, since it
diff --git a/db/job_context.h b/db/job_context.h
index 766502ca4602..3d2fe933a5c2 100644
--- a/db/job_context.h
+++ b/db/job_context.h
@@ -123,7 +123,7 @@ struct JobContext {
         break;
       }
     }
-    return memtables_to_free.size() > 0 || logs_to_free.size() > 0 ||
+    return memtables_to_free.size() > 0 || wals_to_free.size() > 0 ||
            job_snapshot != nullptr || sv_have_sth;
   }
 
@@ -193,7 +193,7 @@ struct JobContext {
   // contexts for installing superversions for multiple column families
   std::vector<SuperVersionContext> superversion_contexts;
 
-  autovector<log::Writer*> logs_to_free;
+  autovector<log::Writer*> wals_to_free;
 
   // the current manifest_file_number, log_number and prev_log_number
   // that corresponds to the set of files in 'live'.
@@ -207,8 +207,8 @@ struct JobContext {
   uint64_t prev_log_number;
 
   uint64_t min_pending_output = 0;
-  uint64_t prev_total_log_size = 0;
-  size_t num_alive_log_files = 0;
+  uint64_t prev_wals_total_size = 0;
+  size_t num_alive_wal_files = 0;
   uint64_t size_log_to_delete = 0;
 
   // Snapshot taken before flush/compaction job.
@@ -237,18 +237,18 @@ struct JobContext {
     for (auto m : memtables_to_free) {
       delete m;
     }
-    for (auto l : logs_to_free) {
+    for (auto l : wals_to_free) {
       delete l;
     }
 
     memtables_to_free.clear();
-    logs_to_free.clear();
+    wals_to_free.clear();
     job_snapshot.reset();
   }
 
   ~JobContext() {
     assert(memtables_to_free.size() == 0);
-    assert(logs_to_free.size() == 0);
+    assert(wals_to_free.size() == 0);
   }
 };
 
diff --git a/db/memtable.h b/db/memtable.h
index 21532e4566ba..bd64499024f6 100644
--- a/db/memtable.h
+++ b/db/memtable.h
@@ -355,13 +355,13 @@ class ReadOnlyMemTable {
   // be flushed to storage
   // REQUIRES: external synchronization to prevent simultaneous
   // operations on the same MemTable.
-  uint64_t GetNextLogNumber() const { return mem_next_logfile_number_; }
+  uint64_t GetNextLogNumber() const { return mem_next_walfile_number_; }
 
   // Sets the next active logfile number when this memtable is about to
   // be flushed to storage
   // REQUIRES: external synchronization to prevent simultaneous
   // operations on the same MemTable.
-  void SetNextLogNumber(uint64_t num) { mem_next_logfile_number_ = num; }
+  void SetNextLogNumber(uint64_t num) { mem_next_walfile_number_ = num; }
 
   // REQUIRES: db_mutex held.
   void SetID(uint64_t id) { id_ = id; }
@@ -516,7 +516,7 @@ class ReadOnlyMemTable {
   VersionEdit edit_;
 
   // The log files earlier than this number can be deleted.
-  uint64_t mem_next_logfile_number_{0};
+  uint64_t mem_next_walfile_number_{0};
 
   // Memtable id to track flush.
   uint64_t id_ = 0;
diff --git a/db/write_thread.h b/db/write_thread.h
index 42256970f413..7adf362dcba7 100644
--- a/db/write_thread.h
+++ b/db/write_thread.h
@@ -132,7 +132,7 @@ class WriteThread {
     size_t protection_bytes_per_key;
     PreReleaseCallback* pre_release_callback;
     PostMemTableCallback* post_memtable_callback;
-    uint64_t log_used;  // log number that this batch was inserted into
+    uint64_t wal_used;  // log number that this batch was inserted into
     uint64_t log_ref;   // log number that memtable insert should reference
     WriteCallback* callback;
     UserWriteCallback* user_write_cb;
@@ -161,7 +161,7 @@ class WriteThread {
           protection_bytes_per_key(0),
           pre_release_callback(nullptr),
           post_memtable_callback(nullptr),
-          log_used(0),
+          wal_used(0),
           log_ref(0),
           callback(nullptr),
           user_write_cb(nullptr),
@@ -190,7 +190,7 @@ class WriteThread {
           protection_bytes_per_key(_batch->GetProtectionBytesPerKey()),
           pre_release_callback(_pre_release_callback),
           post_memtable_callback(_post_memtable_callback),
-          log_used(0),
+          wal_used(0),
           log_ref(_log_ref),
           callback(_callback),
           user_write_cb(_user_write_cb),
diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h
index 2727bbcb2290..58eafe2dc40a 100644
--- a/include/rocksdb/db.h
+++ b/include/rocksdb/db.h
@@ -1893,12 +1893,12 @@ class DB {
   // Retrieve information about the current wal file
   //
   // Note that the log might have rolled after this call in which case
-  // the current_log_file would not point to the current log file.
+  // the current_wal_file would not point to the current log file.
   //
-  // Additionally, for the sake of optimization current_log_file->StartSequence
+  // Additionally, for the sake of optimization current_wal_file->StartSequence
   // would always be set to 0
   virtual Status GetCurrentWalFile(
-      std::unique_ptr<WalFile>* current_log_file) = 0;
+      std::unique_ptr<WalFile>* current_wal_file) = 0;
 
   // IngestExternalFile() will load a list of external SST files (1) into the DB
   // Two primary modes are supported:
diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index a9b5bb373e18..c4b19fea7895 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -815,6 +815,7 @@ struct DBOptions {
   // If it is non empty, the log files will be in the specified dir,
   // and the db data dir's absolute path will be used as the log file
   // name's prefix.
+  // NOTE: not for WALs
   std::string db_log_dir = "";
 
   // This specifies the absolute dir path for write-ahead logs (WAL).
@@ -895,21 +896,24 @@ struct DBOptions {
   // be created.
   // If max_log_file_size == 0, all logs will be written to one
   // log file.
+  // NOTE: not for WALs
   size_t max_log_file_size = 0;
 
   // Time for the info log file to roll (in seconds).
   // If specified with non-zero value, log file will be rolled
   // if it has been active longer than `log_file_time_to_roll`.
   // Default: 0 (disabled)
+  // NOTE: not for WALs
   size_t log_file_time_to_roll = 0;
 
   // Maximal info log files to be kept.
   // Default: 1000
+  // NOTE: not for WALs
   size_t keep_log_file_num = 1000;
 
-  // Recycle log files.
-  // If non-zero, we will reuse previously written log files for new
-  // logs, overwriting the old data.  The value indicates how many
+  // Recycle WAL files.
+  // If non-zero, we will reuse previously written WAL files for new
+  // WALs, overwriting the old data.  The value indicates how many
   // such files we will keep around at any point in time for later
   // use.  This is more efficient because the blocks are already
   // allocated and fdatasync does not need to update the inode after
@@ -1415,9 +1419,10 @@ struct DBOptions {
   // prefix_same_as_start=true can take advantage of prefix seek optimizations.
   bool prefix_seek_opt_in_only = false;
 
-  // The number of bytes to prefetch when reading the log. This is mostly useful
-  // for reading a remotely located log, as it can save the number of
-  // round-trips. If 0, then the prefetching is disabled.
+  // The number of bytes to prefetch when reading the DB manifest and WAL files
+  // during DB::Open (and variants). This is mostly useful for reading a
+  // remotely located log, as it can save the number of round-trips. If 0, then
+  // the prefetching is disabled.
   //
   // Default: 0
   size_t log_readahead_size = 0;
diff --git a/include/rocksdb/utilities/stackable_db.h b/include/rocksdb/utilities/stackable_db.h
index bea4b0d133f2..b4fa0fc92d91 100644
--- a/include/rocksdb/utilities/stackable_db.h
+++ b/include/rocksdb/utilities/stackable_db.h
@@ -517,8 +517,8 @@ class StackableDB : public DB {
   }
 
   Status GetCurrentWalFile(
-      std::unique_ptr<WalFile>* current_log_file) override {
-    return db_->GetCurrentWalFile(current_log_file);
+      std::unique_ptr<WalFile>* current_wal_file) override {
+    return db_->GetCurrentWalFile(current_wal_file);
   }
 
   Status GetCreationTimeOfOldestFile(uint64_t* creation_time) override {
diff --git a/monitoring/stats_history_test.cc b/monitoring/stats_history_test.cc
index 295e7bf3daa3..fab5914742aa 100644
--- a/monitoring/stats_history_test.cc
+++ b/monitoring/stats_history_test.cc
@@ -616,7 +616,7 @@ TEST_F(StatsHistoryTest, ForceManualFlushStatsCF) {
   // LogNumbers: default: 16, stats: 10, pikachu: 5
   // Since in recovery process, cfd_stats column is created after WAL is
   // created, synced and MANIFEST is persisted, its log number which depends on
-  // logfile_number_ will be different. Since "pikachu" is never flushed, thus
+  // cur_wal_number_ will be different. Since "pikachu" is never flushed, thus
   // its log_number should be the smallest of the three.
   ASSERT_OK(Flush());
   ASSERT_LT(cfd_test->GetLogNumber(), cfd_stats->GetLogNumber());
diff --git a/utilities/transactions/pessimistic_transaction.cc b/utilities/transactions/pessimistic_transaction.cc
index e978ad863135..711de009d0ba 100644
--- a/utilities/transactions/pessimistic_transaction.cc
+++ b/utilities/transactions/pessimistic_transaction.cc
@@ -811,7 +811,7 @@ Status WriteCommittedTxn::CommitWithoutPrepareInternal() {
   }
   auto s = db_impl_->WriteImpl(
       write_options_, wb,
-      /*callback*/ nullptr, /*user_write_cb=*/nullptr, /*log_used*/ nullptr,
+      /*callback*/ nullptr, /*user_write_cb=*/nullptr, /*wal_used*/ nullptr,
       /*log_ref*/ 0, /*disable_memtable*/ false, &seq_used, /*batch_cnt=*/0,
       /*pre_release_callback=*/nullptr, post_mem_cb);
   assert(!s.ok() || seq_used != kMaxSequenceNumber);
@@ -825,7 +825,7 @@ Status WriteCommittedTxn::CommitBatchInternal(WriteBatch* batch, size_t) {
   uint64_t seq_used = kMaxSequenceNumber;
   auto s = db_impl_->WriteImpl(write_options_, batch, /*callback*/ nullptr,
                                /*user_write_cb=*/nullptr,
-                               /*log_used*/ nullptr, /*log_ref*/ 0,
+                               /*wal_used*/ nullptr, /*log_ref*/ 0,
                                /*disable_memtable*/ false, &seq_used);
   assert(!s.ok() || seq_used != kMaxSequenceNumber);
   if (s.ok()) {
@@ -917,7 +917,7 @@ Status WriteCommittedTxn::CommitInternal() {
     s = db_impl_->WriteImpl(
         write_options_, working_batch, /*callback*/ nullptr,
         /*user_write_cb=*/nullptr,
-        /*log_used*/ nullptr, /*log_ref*/ log_number_,
+        /*wal_used*/ nullptr, /*log_ref*/ log_number_,
         /*disable_memtable*/ false, &seq_used,
         /*batch_cnt=*/0, /*pre_release_callback=*/nullptr, post_mem_cb,
         /*wbwi=*/std::make_shared<WriteBatchWithIndex>(std::move(write_batch_)),
@@ -929,7 +929,7 @@ Status WriteCommittedTxn::CommitInternal() {
   } else {
     s = db_impl_->WriteImpl(write_options_, working_batch, /*callback*/ nullptr,
                             /*user_write_cb=*/nullptr,
-                            /*log_used*/ nullptr, /*log_ref*/ log_number_,
+                            /*wal_used*/ nullptr, /*log_ref*/ log_number_,
                             /*disable_memtable*/ false, &seq_used,
                             /*batch_cnt=*/0, /*pre_release_callback=*/nullptr,
                             post_mem_cb);
diff --git a/utilities/transactions/write_unprepared_txn.cc b/utilities/transactions/write_unprepared_txn.cc
index 8e9647b8c477..444c1c9b6350 100644
--- a/utilities/transactions/write_unprepared_txn.cc
+++ b/utilities/transactions/write_unprepared_txn.cc
@@ -374,7 +374,7 @@ Status WriteUnpreparedTxn::FlushWriteBatchToDBInternal(bool prepared) {
   uint64_t seq_used = kMaxSequenceNumber;
   // log_number_ should refer to the oldest log containing uncommitted data
   // from the current transaction. This means that if log_number_ is set,
-  // WriteImpl should not overwrite that value, so set log_used to nullptr if
+  // WriteImpl should not overwrite that value, so set wal_used to nullptr if
   // log_number_ is already set.
   s = db_impl_->WriteImpl(write_options, GetWriteBatch()->GetWriteBatch(),
                           /*callback*/ nullptr, /*user_write_cb=*/nullptr,

From 84a8dd994c338d266e626b4d4a7631654731935a Mon Sep 17 00:00:00 2001
From: anand76 <anand1976@users.noreply.github.com>
Date: Fri, 11 Apr 2025 11:35:57 -0700
Subject: [PATCH 052/500] Some MultiScan code cleanup (#13530)

Summary: Pull Request resolved: https://github.com/facebook/rocksdb/pull/13530

Reviewed By: pdillinger

Differential Revision: D72677865

Pulled By: anand1976

fbshipit-source-id: 63e7a15b6e8cd61b676e3b22e1c04c7446adcbd3
---
 include/rocksdb/external_table.h |  2 +-
 include/rocksdb/multi_scan.h     | 42 +++++++++++++-------
 include/rocksdb/options.h        |  2 +-
 table/table_test.cc              | 67 ++++++++++++++++++++++++++------
 4 files changed, 86 insertions(+), 27 deletions(-)

diff --git a/include/rocksdb/external_table.h b/include/rocksdb/external_table.h
index d449532143cb..22db93ce140d 100644
--- a/include/rocksdb/external_table.h
+++ b/include/rocksdb/external_table.h
@@ -89,7 +89,7 @@ class ExternalTableIterator : public IteratorBase {
   // request that this be called by setting value_prepared to false in
   // IterateResult. Next() should always implicitly materialize the
   // value.
-  virtual bool PrepareValue() = 0;
+  bool PrepareValue() override = 0;
 
   // Return the current key's value
   virtual Slice value() const = 0;
diff --git a/include/rocksdb/multi_scan.h b/include/rocksdb/multi_scan.h
index 73b6d766e21a..dc173e48e6df 100644
--- a/include/rocksdb/multi_scan.h
+++ b/include/rocksdb/multi_scan.h
@@ -37,8 +37,8 @@ namespace ROCKSDB_NAMESPACE {
 //
 //  std::vector<ScanOptions> scans{{.start = Slice("bar")},
 //                              {.start = Slice("foo")}};
-//  std::unique_ptr<MultiScanIterator> iter.reset(
-//                                      db->NewMultiScanIterator());
+//  std::unique_ptr<MultiScan> iter.reset(
+//                                      db->NewMultiScan());
 //  try {
 //    for (auto scan : *iter) {
 //      for (auto it : scan) {
@@ -46,9 +46,23 @@ namespace ROCKSDB_NAMESPACE {
 //        // Do something with value - it.second
 //      }
 //    }
-//  } catch (Status s) {
+//  } catch (MultiScanException& ex) {
+//    // Check ex.status()
+//  } catch (std::logic_error& ex) {
+//    // Check ex.what()
 //  }
 
+class MultiScanException : public std::runtime_error {
+ public:
+  explicit MultiScanException(Status& s)
+      : std::runtime_error(s.ToString()), s_(s) {}
+
+  Status& status() { return s_; }
+
+ private:
+  Status s_;
+};
+
 // A container object encapsulating a single scan range. It supports an
 // std::input_iterator for a single pass iteration of the KVs in the range.
 // A Status exception is thrown if there is an error in scanning the range.
@@ -56,7 +70,7 @@ class Scan {
  public:
   class ScanIterator;
 
-  Scan(Iterator* db_iter) : db_iter_(db_iter) {}
+  explicit Scan(Iterator* db_iter) : db_iter_(db_iter) {}
 
   ScanIterator begin() { return ScanIterator(db_iter_); }
 
@@ -84,12 +98,12 @@ class Scan {
 
     ScanIterator& operator++() {
       if (!valid_) {
-        throw Status::InvalidArgument("Trying to advance invalid iterator");
+        throw std::logic_error("Trying to advance invalid iterator");
       } else {
         db_iter_->Next();
         status_ = db_iter_->status();
         if (!status_.ok()) {
-          throw status_;
+          throw MultiScanException(status_);
         } else {
           valid_ = db_iter_->Valid();
           if (valid_) {
@@ -106,13 +120,13 @@ class Scan {
 
     reference operator*() {
       if (!valid_) {
-        throw Status::InvalidArgument("Trying to deref invalid iterator");
+        throw std::logic_error("Trying to deref invalid iterator");
       }
       return result_;
     }
     reference operator->() {
       if (!valid_) {
-        throw Status::InvalidArgument("Trying to deref invalid iterator");
+        throw std::logic_error("Trying to deref invalid iterator");
       }
       return result_;
     }
@@ -144,7 +158,7 @@ class MultiScan {
 
   class MultiScanIterator {
    public:
-    MultiScanIterator(MultiScanIterator&) = delete;
+    MultiScanIterator(const MultiScanIterator&) = delete;
     MultiScanIterator operator=(MultiScanIterator&) = delete;
 
     using self_type = MultiScanIterator;
@@ -158,16 +172,16 @@ class MultiScan {
                       Iterator* db_iter)
         : scan_opts_(scan_opts), idx_(0), db_iter_(db_iter), scan_(db_iter_) {
       if (scan_opts_.empty()) {
-        throw Status::InvalidArgument("Zero scans in multi-scan");
+        throw std::logic_error("Zero scans in multi-scan");
       }
       db_iter_->Seek(*scan_opts_[idx_].range.start);
       status_ = db_iter_->status();
       if (!status_.ok()) {
-        throw status_;
+        throw MultiScanException(status_);
       }
     }
 
-    MultiScanIterator(const std::vector<ScanOptions>& scan_opts)
+    explicit MultiScanIterator(const std::vector<ScanOptions>& scan_opts)
         : scan_opts_(scan_opts),
           idx_(scan_opts_.size()),
           db_iter_(nullptr),
@@ -177,14 +191,14 @@ class MultiScan {
 
     MultiScanIterator& operator++() {
       if (idx_ >= scan_opts_.size()) {
-        throw Status::InvalidArgument("Index out of range");
+        throw std::logic_error("Index out of range");
       }
       idx_++;
       if (idx_ < scan_opts_.size()) {
         db_iter_->Seek(*scan_opts_[idx_].range.start);
         status_ = db_iter_->status();
         if (!status_.ok()) {
-          throw status_;
+          throw MultiScanException(status_);
         }
       }
       return *this;
diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index c4b19fea7895..73e00bc49880 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -1737,7 +1737,7 @@ struct ScanOptions {
   std::optional<std::unordered_map<std::string, std::string>> property_bag;
 
   // An unbounded scan with a start key
-  ScanOptions(const Slice& _start) : range(_start, OptSlice()) {}
+  explicit ScanOptions(const Slice& _start) : range(_start, OptSlice()) {}
 
   // A bounded scan with a start key and upper bound
   ScanOptions(const Slice& _start, const Slice& _upper_bound)
diff --git a/table/table_test.cc b/table/table_test.cc
index 919dc285eaa4..efc6219ceeff 100644
--- a/table/table_test.cc
+++ b/table/table_test.cc
@@ -6624,7 +6624,10 @@ class ExternalTableReaderTest : public DBTestBase {
           num_opts_(0),
           scan_idx_(0),
           kv_map_(kv_map),
-          valid_(false) {}
+          valid_(false) {
+      TEST_SYNC_POINT_CALLBACK("DummyExternalTableIterator::Constructor",
+                               &status_);
+    }
 
     bool Valid() const override { return valid_; }
 
@@ -6946,8 +6949,8 @@ TEST_F(ExternalTableReaderTest, DBIterTest) {
   options.env = Env::Default();
   ASSERT_OK(DestroyDB(dbname, options));
 
-  std::shared_ptr<ExternalTableFactory> factory(
-      new DummyExternalTableFactory());
+  std::shared_ptr<ExternalTableFactory> factory =
+      std::make_shared<DummyExternalTableFactory>();
   options.table_factory = NewExternalTableFactory(factory);
 
   // Create a file
@@ -7000,8 +7003,8 @@ TEST_F(ExternalTableReaderTest, DBMultiScanTest) {
   options.env = Env::Default();
   ASSERT_OK(DestroyDB(dbname, options));
 
-  std::shared_ptr<ExternalTableFactory> factory(
-      new DummyExternalTableFactory());
+  std::shared_ptr<ExternalTableFactory> factory =
+      std::make_shared<DummyExternalTableFactory>();
   options.table_factory = NewExternalTableFactory(factory);
 
   // Create a file
@@ -7048,8 +7051,13 @@ TEST_F(ExternalTableReaderTest, DBMultiScanTest) {
       idx += 2;
     }
     ASSERT_EQ(count, 32);
-  } catch (Status status) {
-    std::cerr << "Iterator returned status " << status.ToString();
+  } catch (MultiScanException& ex) {
+    // Make sure exception contains the status
+    ASSERT_NOK(ex.status());
+    std::cerr << "Iterator returned status " << ex.what();
+    abort();
+  } catch (std::logic_error& ex) {
+    std::cerr << "Iterator returned logic error " << ex.what();
     abort();
   }
   iter.reset();
@@ -7070,8 +7078,13 @@ TEST_F(ExternalTableReaderTest, DBMultiScanTest) {
       idx += 2;
     }
     ASSERT_EQ(count, 52);
-  } catch (Status status) {
-    std::cerr << "Iterator returned status " << status.ToString();
+  } catch (MultiScanException& ex) {
+    // Make sure exception contains the status
+    ASSERT_NOK(ex.status());
+    std::cerr << "Iterator returned status " << ex.what();
+    abort();
+  } catch (std::logic_error& ex) {
+    std::cerr << "Iterator returned logic error " << ex.what();
     abort();
   }
   iter.reset();
@@ -7094,11 +7107,43 @@ TEST_F(ExternalTableReaderTest, DBMultiScanTest) {
       idx += 2;
     }
     ASSERT_EQ(count, 52);
-  } catch (Status status) {
-    std::cerr << "Iterator returned status " << status.ToString();
+  } catch (MultiScanException& ex) {
+    // Make sure exception contains the status
+    ASSERT_NOK(ex.status());
+    std::cerr << "Iterator returned status " << ex.what();
+    abort();
+  } catch (std::logic_error& ex) {
+    std::cerr << "Iterator returned logic error " << ex.what();
+    abort();
+  }
+  iter.reset();
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "DummyExternalTableIterator::Constructor", [](void* arg) {
+        Status* status = static_cast<Status*>(arg);
+        *status = Status::IOError();
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+  iter = db->NewMultiScan(ro, cfh, scan_options);
+  try {
+    for (auto range : *iter) {
+      // Should not get here. Iterator should throw an exception
+      assert(false);
+      for (auto it : range) {
+        (void)it;
+        assert(false);
+      }
+    }
+  } catch (MultiScanException& ex) {
+    // Make sure exception contains the status
+    ASSERT_EQ(ex.status(), Status::IOError());
+  } catch (std::logic_error& ex) {
+    std::cerr << "Iterator returned logic error " << ex.what();
     abort();
   }
   iter.reset();
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
 
   ASSERT_OK(db->DestroyColumnFamilyHandle(cfh));
   ASSERT_OK(db->Close());

From 29c6610617ddc1b486f12b99c16e7c9851e80430 Mon Sep 17 00:00:00 2001
From: Hui Xiao <huixiao@fb.com>
Date: Mon, 14 Apr 2025 12:08:38 -0700
Subject: [PATCH 053/500] Add compaction explicit prefetch stats (#13520)

Summary:
**Context/Summary:**
This PR adds new stats to measure compaction readahead size for rocksdb managed prefetching (not FS prefetching). It can be used to verify compaction read-ahead is doing what's configured. This PR also excludes compaction readahead stats from user scan readahead stats measured in existing stats so there is a cleaner separating between these two.

Bonus: this PR also included some typo fixing about "io activities"

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13520

Test Plan: Modified existing test to verify stats

Reviewed By: archang19

Differential Revision: D72892850

Pulled By: hx235

fbshipit-source-id: 1a73182061baa044c9c9193a2b0fd967ffe75c4a
---
 db_stress_tool/db_stress_listener.h           |  4 +--
 file/file_prefetch_buffer.cc                  |  6 +++-
 file/file_prefetch_buffer.h                   |  4 +++
 file/prefetch_test.cc                         | 36 ++++++++++++++++---
 include/rocksdb/statistics.h                  | 11 ++++--
 java/rocksjni/portal.h                        | 10 ++++--
 .../main/java/org/rocksdb/HistogramType.java  |  4 ++-
 monitoring/statistics.cc                      |  1 +
 table/block_based/block_prefetcher.cc         |  7 ++--
 .../behavior_changes/ra_stats_user_only.md    |  1 +
 .../new_features/compact_ra_stats.md          |  2 ++
 utilities/fault_injection_fs.cc               |  4 +--
 utilities/fault_injection_fs.h                | 15 ++++----
 13 files changed, 79 insertions(+), 26 deletions(-)
 create mode 100644 unreleased_history/behavior_changes/ra_stats_user_only.md
 create mode 100644 unreleased_history/new_features/compact_ra_stats.md

diff --git a/db_stress_tool/db_stress_listener.h b/db_stress_tool/db_stress_listener.h
index 35c70b5a1036..6edbaf7896d5 100644
--- a/db_stress_tool/db_stress_listener.h
+++ b/db_stress_tool/db_stress_listener.h
@@ -265,7 +265,7 @@ class DbStressListener : public EventListener {
       fault_fs_guard->DisableAllThreadLocalErrorInjection();
       // TODO(hx235): only exempt the flush thread during error recovery instead
       // of all the flush threads from error injection
-      fault_fs_guard->SetIOActivtiesExcludedFromFaultInjection(
+      fault_fs_guard->SetIOActivitiesExcludedFromFaultInjection(
           {Env::IOActivity::kFlush});
     }
   }
@@ -275,7 +275,7 @@ class DbStressListener : public EventListener {
     RandomSleep();
     if (FLAGS_error_recovery_with_no_fault_injection && fault_fs_guard) {
       fault_fs_guard->EnableAllThreadLocalErrorInjection();
-      fault_fs_guard->SetIOActivtiesExcludedFromFaultInjection({});
+      fault_fs_guard->SetIOActivitiesExcludedFromFaultInjection({});
     }
   }
 
diff --git a/file/file_prefetch_buffer.cc b/file/file_prefetch_buffer.cc
index 7683db861732..dadc8e46ec07 100644
--- a/file/file_prefetch_buffer.cc
+++ b/file/file_prefetch_buffer.cc
@@ -126,6 +126,8 @@ Status FilePrefetchBuffer::Read(BufferInfo* buf, const IOOptions& opts,
 
   if (usage_ == FilePrefetchBufferUsage::kUserScanPrefetch) {
     RecordTick(stats_, PREFETCH_BYTES, read_len);
+  } else if (usage_ == FilePrefetchBufferUsage::kCompactionPrefetch) {
+    RecordInHistogram(stats_, COMPACTION_PREFETCH_BYTES, read_len);
   }
   if (!use_fs_buffer) {
     // Update the buffer size.
@@ -154,7 +156,9 @@ Status FilePrefetchBuffer::ReadAsync(BufferInfo* buf, const IOOptions& opts,
                                &(buf->del_fn_), /*aligned_buf =*/nullptr);
   req.status.PermitUncheckedError();
   if (s.ok()) {
-    RecordTick(stats_, PREFETCH_BYTES, read_len);
+    if (usage_ == FilePrefetchBufferUsage::kUserScanPrefetch) {
+      RecordTick(stats_, PREFETCH_BYTES, read_len);
+    }
     buf->async_read_in_progress_ = true;
   }
   return s;
diff --git a/file/file_prefetch_buffer.h b/file/file_prefetch_buffer.h
index b8b6812bc83d..51c0b4441a06 100644
--- a/file/file_prefetch_buffer.h
+++ b/file/file_prefetch_buffer.h
@@ -134,6 +134,7 @@ struct BufferInfo {
 enum class FilePrefetchBufferUsage {
   kTableOpenPrefetchTail,
   kUserScanPrefetch,
+  kCompactionPrefetch,
   kUnknown,
 };
 
@@ -574,6 +575,9 @@ class FilePrefetchBuffer {
                            size_t& read_len, uint64_t& aligned_useful_len);
 
   void UpdateStats(bool found_in_buffer, size_t length_found) {
+    if (usage_ != FilePrefetchBufferUsage::kUserScanPrefetch) {
+      return;
+    }
     if (found_in_buffer) {
       RecordTick(stats_, PREFETCH_HITS);
     }
diff --git a/file/prefetch_test.cc b/file/prefetch_test.cc
index 2c0919ed9522..c651046dd246 100644
--- a/file/prefetch_test.cc
+++ b/file/prefetch_test.cc
@@ -299,9 +299,18 @@ TEST_P(PrefetchTest, Basic) {
   const uint64_t prev_table_open_prefetch_tail_hit =
       options.statistics->getTickerCount(TABLE_OPEN_PREFETCH_TAIL_HIT);
 
+  HistogramData pre_compaction_prefetch_bytes;
+  options.statistics->histogramData(COMPACTION_PREFETCH_BYTES,
+                                    &pre_compaction_prefetch_bytes);
+  ASSERT_EQ(pre_compaction_prefetch_bytes.count, 0);
+
   // commenting out the line below causes the example to work correctly
   ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &least, &greatest));
 
+  HistogramData post_compaction_prefetch_bytes;
+  options.statistics->histogramData(COMPACTION_PREFETCH_BYTES,
+                                    &post_compaction_prefetch_bytes);
+
   HistogramData cur_table_open_prefetch_tail_read;
   options.statistics->histogramData(TABLE_OPEN_PREFETCH_TAIL_READ_BYTES,
                                     &cur_table_open_prefetch_tail_read);
@@ -318,6 +327,7 @@ TEST_P(PrefetchTest, Basic) {
     ASSERT_GT(fs->GetPrefetchCount(), 1);
     ASSERT_EQ(0, buff_prefetch_count);
     fs->ClearPrefetchCount();
+    ASSERT_EQ(post_compaction_prefetch_bytes.count, 0);
   } else {
     ASSERT_FALSE(fs->IsPrefetchCalled());
     // To rule out false positive by the SST file tail prefetch during
@@ -331,6 +341,20 @@ TEST_P(PrefetchTest, Basic) {
               prev_table_open_prefetch_tail_hit);
     ASSERT_GE(cur_table_open_prefetch_tail_miss,
               prev_table_open_prefetch_tail_miss);
+
+    ASSERT_GT(post_compaction_prefetch_bytes.count, 0);
+
+    // Not an exact match due to potential roundup/down for alignment
+    auto expected_compaction_readahead_size =
+        Options().compaction_readahead_size;
+    ASSERT_LE(post_compaction_prefetch_bytes.max,
+              expected_compaction_readahead_size * 1.1);
+    ASSERT_GE(post_compaction_prefetch_bytes.max,
+              expected_compaction_readahead_size * 0.9);
+    ASSERT_LE(post_compaction_prefetch_bytes.average,
+              expected_compaction_readahead_size * 1.1);
+    ASSERT_GE(post_compaction_prefetch_bytes.average,
+              expected_compaction_readahead_size * 0.9);
   }
 
   for (bool disable_io : {false, true}) {
@@ -3251,8 +3275,9 @@ TEST_F(FilePrefetchBufferTest, SyncReadaheadStats) {
   ReadaheadParams readahead_params;
   readahead_params.initial_readahead_size = 8192;
   readahead_params.max_readahead_size = 8192;
-  FilePrefetchBuffer fpb(readahead_params, true, false, fs(), nullptr,
-                         stats.get());
+  FilePrefetchBuffer fpb(
+      readahead_params, true, false, fs(), nullptr, stats.get(),
+      nullptr /* cb */, FilePrefetchBufferUsage::kUserScanPrefetch /* usage */);
   Slice result;
   // Simulate a seek of 4096 bytes at offset 0. Due to the readahead settings,
   // it will do a read of offset 0 and length - (4096 + 8192) 12288.
@@ -3497,9 +3522,10 @@ TEST_P(FSBufferPrefetchTest, FSBufferPrefetchStatsInternals) {
   size_t num_buffers = use_async_prefetch ? 2 : 1;
   readahead_params.num_buffers = num_buffers;
 
-  FilePrefetchBuffer fpb(readahead_params, true /* enable */,
-                         false /* track_min_offset */, fs(), clock(),
-                         stats.get());
+  FilePrefetchBuffer fpb(
+      readahead_params, true /* enable */, false /* track_min_offset */, fs(),
+      clock(), stats.get(), nullptr /* cb */,
+      FilePrefetchBufferUsage::kUserScanPrefetch /* usage */);
 
   int overlap_buffer_write_ct = 0;
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
diff --git a/include/rocksdb/statistics.h b/include/rocksdb/statistics.h
index 00b95e8d1fd3..50f27bcba9d2 100644
--- a/include/rocksdb/statistics.h
+++ b/include/rocksdb/statistics.h
@@ -520,10 +520,11 @@ enum Tickers : uint32_t {
   // Number of bytes prefetched during user initiated scan
   PREFETCH_BYTES,
 
-  // Number of prefetched bytes that were actually useful
+  // Number of prefetched bytes that were actually useful during user initiated
+  // scan
   PREFETCH_BYTES_USEFUL,
 
-  // Number of FS reads avoided due to scan prefetching
+  // Number of FS reads avoided due to prefetching during user initiated scan
   PREFETCH_HITS,
 
   // Footer corruption detected when opening an SST file for reading
@@ -657,13 +658,17 @@ enum Histograms : uint32_t {
   ASYNC_READ_BYTES,
   POLL_WAIT_MICROS,
 
+  // Number of bytes for RocksDB's prefetching (as opposed to file
+  // system's prefetch) on SST file during compaction read
+  COMPACTION_PREFETCH_BYTES,
+
   // Number of prefetched bytes discarded by RocksDB.
   PREFETCHED_BYTES_DISCARDED,
 
   // Wait time for aborting async read in FilePrefetchBuffer destructor
   ASYNC_PREFETCH_ABORT_MICROS,
 
-  // Number of bytes read for RocksDB's prefetching contents (as opposed to file
+  // Number of bytes for RocksDB's prefetching contents (as opposed to file
   // system's prefetch) from the end of SST table during block based table open
   TABLE_OPEN_PREFETCH_TAIL_READ_BYTES,
 
diff --git a/java/rocksjni/portal.h b/java/rocksjni/portal.h
index d0f288ca8281..e611d65d18fb 100644
--- a/java/rocksjni/portal.h
+++ b/java/rocksjni/portal.h
@@ -5889,8 +5889,11 @@ class HistogramTypeJni {
         return 0x3C;
       case ROCKSDB_NAMESPACE::Histograms::TABLE_OPEN_PREFETCH_TAIL_READ_BYTES:
         return 0x3D;
+      case ROCKSDB_NAMESPACE::Histograms::COMPACTION_PREFETCH_BYTES:
+        return 0x3F;
       case ROCKSDB_NAMESPACE::Histograms::HISTOGRAM_ENUM_MAX:
-        // 0x3D for backwards compatibility on current minor version.
+        // 0x3E is reserved for backwards compatibility on current minor
+        // version.
         return 0x3E;
       default:
         // undefined/default
@@ -6033,8 +6036,11 @@ class HistogramTypeJni {
       case 0x3D:
         return ROCKSDB_NAMESPACE::Histograms::
             TABLE_OPEN_PREFETCH_TAIL_READ_BYTES;
+      case 0x3F:
+        return ROCKSDB_NAMESPACE::Histograms::COMPACTION_PREFETCH_BYTES;
       case 0x3E:
-        // 0x1F for backwards compatibility on current minor version.
+        // 0x3E is reserved for backwards compatibility on current minor
+        // version.
         return ROCKSDB_NAMESPACE::Histograms::HISTOGRAM_ENUM_MAX;
 
       default:
diff --git a/java/src/main/java/org/rocksdb/HistogramType.java b/java/src/main/java/org/rocksdb/HistogramType.java
index 10d382e7b912..3825c90a4515 100644
--- a/java/src/main/java/org/rocksdb/HistogramType.java
+++ b/java/src/main/java/org/rocksdb/HistogramType.java
@@ -210,7 +210,9 @@ public enum HistogramType {
    */
   TABLE_OPEN_PREFETCH_TAIL_READ_BYTES((byte) 0x3D),
 
-  // 0x3E for backwards compatibility on current minor version.
+  COMPACTION_PREFETCH_BYTES((byte) 0x3F),
+
+  // 0x3E is reserved for backwards compatibility on current minor version.
   HISTOGRAM_ENUM_MAX((byte) 0x3E);
 
   private final byte value;
diff --git a/monitoring/statistics.cc b/monitoring/statistics.cc
index 05163d3e29e1..e74028bee6e2 100644
--- a/monitoring/statistics.cc
+++ b/monitoring/statistics.cc
@@ -336,6 +336,7 @@ const std::vector<std::pair<Histograms, std::string>> HistogramsNameMap = {
      "rocksdb.error.handler.autoresume.retry.count"},
     {ASYNC_READ_BYTES, "rocksdb.async.read.bytes"},
     {POLL_WAIT_MICROS, "rocksdb.poll.wait.micros"},
+    {COMPACTION_PREFETCH_BYTES, "rocksdb.compaction.prefetch.bytes"},
     {PREFETCHED_BYTES_DISCARDED, "rocksdb.prefetched.bytes.discarded"},
     {ASYNC_PREFETCH_ABORT_MICROS, "rocksdb.async.prefetch.abort.micros"},
     {TABLE_OPEN_PREFETCH_TAIL_READ_BYTES,
diff --git a/table/block_based/block_prefetcher.cc b/table/block_based/block_prefetcher.cc
index 52f0ef8fdfc2..a4cfb027b26d 100644
--- a/table/block_based/block_prefetcher.cc
+++ b/table/block_based/block_prefetcher.cc
@@ -58,9 +58,10 @@ void BlockPrefetcher::PrefetchIfNeeded(
     // implicit_auto_readahead is set.
     readahead_params.initial_readahead_size = compaction_readahead_size_;
     readahead_params.max_readahead_size = compaction_readahead_size_;
-    rep->CreateFilePrefetchBufferIfNotExists(readahead_params,
-                                             &prefetch_buffer_,
-                                             /*readaheadsize_cb=*/nullptr);
+    rep->CreateFilePrefetchBufferIfNotExists(
+        readahead_params, &prefetch_buffer_,
+        /*readaheadsize_cb=*/nullptr,
+        /*usage=*/FilePrefetchBufferUsage::kCompactionPrefetch);
     return;
   }
 
diff --git a/unreleased_history/behavior_changes/ra_stats_user_only.md b/unreleased_history/behavior_changes/ra_stats_user_only.md
new file mode 100644
index 000000000000..ea219c3d4785
--- /dev/null
+++ b/unreleased_history/behavior_changes/ra_stats_user_only.md
@@ -0,0 +1 @@
+Make stats `PREFETCH_BYTES_USEFUL`, `PREFETCH_HITS`, `PREFETCH_BYTES` only account for prefetching during user initiated scan
diff --git a/unreleased_history/new_features/compact_ra_stats.md b/unreleased_history/new_features/compact_ra_stats.md
new file mode 100644
index 000000000000..574b6b67ca2b
--- /dev/null
+++ b/unreleased_history/new_features/compact_ra_stats.md
@@ -0,0 +1,2 @@
+Provide histogram stats `COMPACTION_PREFETCH_BYTES` to measure number of bytes for RocksDB's prefetching (as opposed to file
+system's prefetch) on SST file during compaction read
diff --git a/utilities/fault_injection_fs.cc b/utilities/fault_injection_fs.cc
index 82d3217258d0..36dec96eba89 100644
--- a/utilities/fault_injection_fs.cc
+++ b/utilities/fault_injection_fs.cc
@@ -1379,7 +1379,7 @@ IOStatus FaultInjectionTestFS::MaybeInjectThreadLocalReadError(
   ErrorContext* ctx =
       static_cast<ErrorContext*>(injected_thread_local_read_error_.Get());
   if (ctx == nullptr || !ctx->enable_error_injection || !ctx->one_in ||
-      ShouldIOActivtiesExcludedFromFaultInjection(io_options.io_activity)) {
+      ShouldIOActivitiesExcludedFromFaultInjection(io_options.io_activity)) {
     return IOStatus::OK();
   }
 
@@ -1465,7 +1465,7 @@ IOStatus FaultInjectionTestFS::MaybeInjectThreadLocalError(
 
   ErrorContext* ctx = GetErrorContextFromFaultInjectionIOType(type);
   if (ctx == nullptr || !ctx->enable_error_injection || !ctx->one_in ||
-      ShouldIOActivtiesExcludedFromFaultInjection(io_options.io_activity) ||
+      ShouldIOActivitiesExcludedFromFaultInjection(io_options.io_activity) ||
       (type == FaultInjectionIOType::kWrite &&
        ShouldExcludeFromWriteFaultInjection(file_name))) {
     return IOStatus::OK();
diff --git a/utilities/fault_injection_fs.h b/utilities/fault_injection_fs.h
index 9ea2a3bb963f..1f82c5144d10 100644
--- a/utilities/fault_injection_fs.h
+++ b/utilities/fault_injection_fs.h
@@ -424,10 +424,11 @@ class FaultInjectionTestFS : public FileSystemWrapper {
     allow_link_open_file_ = allow_link_open_file;
   }
 
-  bool ShouldIOActivtiesExcludedFromFaultInjection(Env::IOActivity io_activty) {
+  bool ShouldIOActivitiesExcludedFromFaultInjection(
+      Env::IOActivity io_activity) {
     MutexLock l(&mutex_);
-    return io_activties_excluded_from_fault_injection.find(io_activty) !=
-           io_activties_excluded_from_fault_injection.end();
+    return io_activities_excluded_from_fault_injection.find(io_activity) !=
+           io_activities_excluded_from_fault_injection.end();
   }
 
   void AssertNoOpenFile() { assert(open_managed_files_.empty()); }
@@ -520,10 +521,10 @@ class FaultInjectionTestFS : public FileSystemWrapper {
     return count;
   }
 
-  void SetIOActivtiesExcludedFromFaultInjection(
-      const std::set<Env::IOActivity>& io_activties) {
+  void SetIOActivitiesExcludedFromFaultInjection(
+      const std::set<Env::IOActivity>& io_activities) {
     MutexLock l(&mutex_);
-    io_activties_excluded_from_fault_injection = io_activties;
+    io_activities_excluded_from_fault_injection = io_activities;
   }
 
   void SetFileTypesExcludedFromWriteFaultInjection(
@@ -627,7 +628,7 @@ class FaultInjectionTestFS : public FileSystemWrapper {
   };
 
   std::set<FileType> file_types_excluded_from_write_fault_injection_;
-  std::set<Env::IOActivity> io_activties_excluded_from_fault_injection;
+  std::set<Env::IOActivity> io_activities_excluded_from_fault_injection;
   ThreadLocalPtr injected_thread_local_read_error_;
   ThreadLocalPtr injected_thread_local_write_error_;
   ThreadLocalPtr injected_thread_local_metadata_read_error_;

From 1ec5a07d8e1ca737651ebe1e2775af325e2313fa Mon Sep 17 00:00:00 2001
From: Changyu Bi <changyubi@meta.com>
Date: Wed, 16 Apr 2025 15:18:48 -0700
Subject: [PATCH 054/500] Support atomic_flush for ingesting WBWI (#13545)

Summary:
add support for atomic_flush when using WBWI ingestion [feature](https://github.com/facebook/rocksdb/blob/29c6610617ddc1b486f12b99c16e7c9851e80430/include/rocksdb/utilities/transaction_db.h#L387). Transaction DB usually uses WAL so atomic_flush is not as helpful. This is to prepare for a follow up PR that enables ingesting WBWI without using transaction DB.

This PR also removes a redundant parameter `prep_log` for the WBWI ingestion feature.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13545

Test Plan:
- unti test added
- stress test will be added as we add support to ingest WBWI without using transaction DB.

Reviewed By: jowlyzhang

Differential Revision: D73062342

Pulled By: cbi42

fbshipit-source-id: e05da55dfabb8241a042214b9d50b1b49d42613e
---
 db/db_impl/db_impl.h                          |  3 +-
 db/db_impl/db_impl_write.cc                   | 64 +++++++++++++------
 tools/db_crashtest.py                         |  1 -
 .../transactions/pessimistic_transaction.cc   |  4 +-
 utilities/transactions/transaction_test.cc    | 35 +++++++++-
 5 files changed, 82 insertions(+), 25 deletions(-)

diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h
index 5f617ac9c99e..941df3c6c528 100644
--- a/db/db_impl/db_impl.h
+++ b/db/db_impl/db_impl.h
@@ -1570,8 +1570,7 @@ class DBImpl : public DB {
                    size_t batch_cnt = 0,
                    PreReleaseCallback* pre_release_callback = nullptr,
                    PostMemTableCallback* post_memtable_callback = nullptr,
-                   std::shared_ptr<WriteBatchWithIndex> wbwi = nullptr,
-                   uint64_t min_prep_log = 0);
+                   std::shared_ptr<WriteBatchWithIndex> wbwi = nullptr);
 
   Status PipelinedWriteImpl(const WriteOptions& options, WriteBatch* updates,
                             WriteCallback* callback = nullptr,
diff --git a/db/db_impl/db_impl_write.cc b/db/db_impl/db_impl_write.cc
index d6639a4b29a3..7ef8bfc30116 100644
--- a/db/db_impl/db_impl_write.cc
+++ b/db/db_impl/db_impl_write.cc
@@ -192,7 +192,7 @@ Status DBImpl::WriteWithCallback(const WriteOptions& write_options,
 
 Status DBImpl::IngestWBWI(std::shared_ptr<WriteBatchWithIndex> wbwi,
                           const WBWIMemTable::SeqnoRange& assigned_seqno,
-                          uint64_t prep_log,
+                          uint64_t min_prep_log,
                           SequenceNumber last_seqno_after_ingest,
                           bool memtable_updated, bool ignore_missing_cf) {
   // Keys in new memtable have seqno > last_seqno_after_ingest >= keys in wbwi.
@@ -238,12 +238,30 @@ Status DBImpl::IngestWBWI(std::shared_ptr<WriteBatchWithIndex> wbwi,
     wbwi_memtable->AssignSequenceNumbers(assigned_seqno);
     // This is needed to keep the WAL that contains Prepare alive until
     // committed data in this memtable is persisted.
-    wbwi_memtable->SetMinPrepLog(prep_log);
+    wbwi_memtable->SetMinPrepLog(min_prep_log);
     memtables.push_back(wbwi_memtable);
     cfd->Ref();
     cfds.push_back(cfd);
   }
 
+  autovector<ColumnFamilyData*> cfds_for_atomic_flush;
+  if (immutable_db_options_.atomic_flush) {
+    SelectColumnFamiliesForAtomicFlush(&cfds_for_atomic_flush);
+    for (auto cfd : cfds_for_atomic_flush) {
+      bool found = false;
+      for (auto existing_cfd : cfds) {
+        if (existing_cfd == cfd) {
+          found = true;
+          break;
+        }
+      }
+      if (!found) {
+        cfd->Ref();
+        cfds.push_back(cfd);
+      }
+    }
+  }
+
   // Stop writes to the DB by entering both write threads
   WriteThread::Writer nonmem_w;
   if (two_write_queues_) {
@@ -253,15 +271,16 @@ Status DBImpl::IngestWBWI(std::shared_ptr<WriteBatchWithIndex> wbwi,
 
   // Switch memtable and add WBWIMemTables
   Status s;
-  for (size_t i = 0; i < memtables.size(); ++i) {
-    assert(!immutable_db_options_.atomic_flush);
-    // NOTE: to support atomic flush, need to call
-    // SelectColumnFamiliesForAtomicFlush()
+  for (size_t i = 0; i < cfds.size(); ++i) {
     WriteContext write_context;
     // TODO: not switch on empty memtable, may need to update metadata
     //   like NextLogNumber(), earliest_seqno and memtable id.
-    s = SwitchMemtable(cfds[i], &write_context, memtables[i],
-                       last_seqno_after_ingest);
+    if (i < memtables.size()) {
+      s = SwitchMemtable(cfds[i], &write_context, memtables[i],
+                         last_seqno_after_ingest);
+    } else {
+      s = SwitchMemtable(cfds[i], &write_context);
+    }
     if (!s.ok()) {
       // SwitchMemtable() can only fail if a new WAL is to be created, this
       // should only happen for the first call to SwitchMemtable(). log will
@@ -301,9 +320,18 @@ Status DBImpl::IngestWBWI(std::shared_ptr<WriteBatchWithIndex> wbwi,
         continue;
       }
       cfd->imm()->FlushRequested();
+      if (!immutable_db_options_.atomic_flush) {
+        FlushRequest flush_req;
+        // TODO: a new flush reason for ingesting memtable
+        GenerateFlushRequest({cfd}, FlushReason::kExternalFileIngestion,
+                             &flush_req);
+        EnqueuePendingFlush(flush_req);
+      }
+    }
+    if (immutable_db_options_.atomic_flush) {
+      AssignAtomicFlushSeq(cfds);
       FlushRequest flush_req;
-      // TODO: a new flush reason for ingesting memtable
-      GenerateFlushRequest({cfd}, FlushReason::kExternalFileIngestion,
+      GenerateFlushRequest(cfds, FlushReason::kExternalFileIngestion,
                            &flush_req);
       EnqueuePendingFlush(flush_req);
     }
@@ -319,8 +347,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
                          uint64_t* seq_used, size_t batch_cnt,
                          PreReleaseCallback* pre_release_callback,
                          PostMemTableCallback* post_memtable_callback,
-                         std::shared_ptr<WriteBatchWithIndex> wbwi,
-                         uint64_t prep_log) {
+                         std::shared_ptr<WriteBatchWithIndex> wbwi) {
   assert(!seq_per_batch_ || batch_cnt != 0);
   assert(my_batch == nullptr || my_batch->Count() == 0 ||
          write_options.protection_bytes_per_key == 0 ||
@@ -410,7 +437,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
         "DeleteRange is not compatible with row cache.");
   }
   if (wbwi) {
-    assert(prep_log > 0);
+    assert(log_ref > 0);
     // Used only in WriteCommittedTxn::CommitInternal() with no `callback`.
     assert(!callback);
     if (immutable_db_options_.unordered_write) {
@@ -421,10 +448,6 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
       return Status::NotSupported(
           "Ingesting WriteBatch does not support pipelined_write");
     }
-    if (immutable_db_options_.atomic_flush) {
-      return Status::NotSupported(
-          "Ingesting WriteBatch does not support atomic_flush");
-    }
   }
   // Otherwise IsLatestPersistentState optimization does not make sense
   assert(!WriteBatchInternal::IsLatestPersistentState(my_batch) ||
@@ -856,7 +879,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
         assert(ub <= versions_->LastAllocatedSequence());
       }
       status = IngestWBWI(wbwi, {/*lower_bound=*/lb, /*upper_bound=*/ub},
-                          prep_log, last_sequence,
+                          /*min_prep_log=*/log_ref, last_sequence,
                           /*memtable_updated=*/memtable_update_count > 0,
                           write_options.ignore_missing_column_families);
     }
@@ -1918,7 +1941,10 @@ void DBImpl::AssignAtomicFlushSeq(const autovector<ColumnFamilyData*>& cfds) {
   assert(immutable_db_options_.atomic_flush);
   auto seq = versions_->LastSequence();
   for (auto cfd : cfds) {
-    cfd->imm()->AssignAtomicFlushSeq(seq);
+    // cfd can be nullptr, see ScheduleFlushes()
+    if (cfd) {
+      cfd->imm()->AssignAtomicFlushSeq(seq);
+    }
   }
 }
 
diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index 8597e26d1112..8e9dc22f6d54 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -1029,7 +1029,6 @@ def finalize_and_sanitize(src_params):
     ):
         dest_params["enable_blob_files"] = 0
         dest_params["allow_setting_blob_options_dynamically"] = 0
-        dest_params["atomic_flush"] = 0
         dest_params["allow_concurrent_memtable_write"] = 0
         dest_params["use_put_entity_one_in"] = 0
         dest_params["use_get_entity"] = 0
diff --git a/utilities/transactions/pessimistic_transaction.cc b/utilities/transactions/pessimistic_transaction.cc
index 711de009d0ba..5f6a5d4164a3 100644
--- a/utilities/transactions/pessimistic_transaction.cc
+++ b/utilities/transactions/pessimistic_transaction.cc
@@ -920,8 +920,8 @@ Status WriteCommittedTxn::CommitInternal() {
         /*wal_used*/ nullptr, /*log_ref*/ log_number_,
         /*disable_memtable*/ false, &seq_used,
         /*batch_cnt=*/0, /*pre_release_callback=*/nullptr, post_mem_cb,
-        /*wbwi=*/std::make_shared<WriteBatchWithIndex>(std::move(write_batch_)),
-        /*min_prep_log=*/log_number_);
+        /*wbwi=*/
+        std::make_shared<WriteBatchWithIndex>(std::move(write_batch_)));
     // Reset write_batch_ since it's accessed in transaction clean up and
     // might be used for transaction reuse.
     write_batch_ = WriteBatchWithIndex(cmp_, 0, true, 0,
diff --git a/utilities/transactions/transaction_test.cc b/utilities/transactions/transaction_test.cc
index 5a465800e685..a59ec00a0b69 100644
--- a/utilities/transactions/transaction_test.cc
+++ b/utilities/transactions/transaction_test.cc
@@ -8924,13 +8924,15 @@ class CommitBypassMemtableTest : public DBTestBase,
   TransactionDBOptions txn_db_opts;
 
   void SetUpTransactionDB(
-      uint32_t threshold = std::numeric_limits<uint32_t>::max()) {
+      uint32_t threshold = std::numeric_limits<uint32_t>::max(),
+      bool atomic_flush = false) {
     options = CurrentOptions();
     options.create_if_missing = true;
     options.allow_2pc = true;
     options.two_write_queues = GetParam();
     // Avoid write stall
     options.max_write_buffer_number = 8;
+    options.atomic_flush = atomic_flush;
     // Destroy the DB to recreate as a TransactionDB.
     Close();
     Destroy(options, true);
@@ -9453,6 +9455,37 @@ TEST_P(CommitBypassMemtableTest, ThresholdTxnDBOption) {
   }
 }
 
+TEST_P(CommitBypassMemtableTest, AtomicFlushTest) {
+  const uint32_t threshold = 10;
+  SetUpTransactionDB(/*threshold=*/threshold, /*atomic_flush=*/true);
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  std::vector<std::string> cfs = {"cf0", "cf1", "cf2"};
+  CreateColumnFamilies(cfs, options);
+
+  // Seed data in CF1 and 2 as atomic flush picks CFs with non-empty memtable
+  ASSERT_OK(db_->Put({}, handles_[1], "key1", "val1"));
+  ASSERT_OK(db_->Put({}, handles_[2], "key2", "val2"));
+
+  // Write to cf 0, should see cf1 and cf2 flushed too
+  auto txn = txn_db->BeginTransaction({}, {}, nullptr);
+  for (uint32_t i = 0; i <= threshold; ++i) {
+    ASSERT_OK(txn->Put(handles_[0], "key" + std::to_string(i),
+                       "cf0" + std::to_string(i)));
+  }
+  ASSERT_OK(txn->SetName("cf0"));
+  ASSERT_OK(txn->Prepare());
+  ASSERT_OK(txn->Commit());
+  delete txn;
+
+  ASSERT_OK(db_->WaitForCompact({}));
+  for (size_t i = 0; i < 3; ++i) {
+    auto cfh = static_cast<ColumnFamilyHandleImpl*>(handles_[i]);
+    ASSERT_EQ(0, cfh->cfd()->imm()->NumNotFlushed());
+    ASSERT_TRUE(cfh->cfd()->mem()->IsEmpty());
+  }
+}
+
 TEST_P(CommitBypassMemtableTest, MergeAndMultiCF) {
   // disable_flush allows testing Get path with memtables.
   for (bool disable_flush : {false, true}) {

From 0e736666a0a2bd612f0f6a0a88abce2e8104cda6 Mon Sep 17 00:00:00 2001
From: Yu Zhang <yuzhangyu@fb.com>
Date: Wed, 16 Apr 2025 16:32:45 -0700
Subject: [PATCH 055/500] Add a test for using atomic_replace_range to ingeset
 and replace data (#13549)

Summary:
Add a test to cover an internal user's expected behavior of using atomic_replace_range feature to atomically ingest a version key and a data file.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13549

Test Plan: This is a test

Reviewed By: cbi42

Differential Revision: D73142626

Pulled By: jowlyzhang

fbshipit-source-id: a5bdc24b762cbe91dd4d94242b9e1539c9feaf61
---
 db/external_sst_file_basic_test.cc | 75 ++++++++++++++++++++++++++++++
 1 file changed, 75 insertions(+)

diff --git a/db/external_sst_file_basic_test.cc b/db/external_sst_file_basic_test.cc
index fe6d9282fe30..bab07ba4b835 100644
--- a/db/external_sst_file_basic_test.cc
+++ b/db/external_sst_file_basic_test.cc
@@ -2693,6 +2693,81 @@ TEST_F(ExternalSSTFileBasicTest, IngestWithTemperature) {
   }
 }
 
+// This tests an internal user's exact usage and expectation of the
+// IngestExternalFiles APIs to bulk load and replace files.
+TEST_F(ExternalSSTFileBasicTest,
+       AtomicReplaceColumnFamilyWithIngestedVersionKey) {
+  Options options = GetDefaultOptions();
+  options.create_if_missing = true;
+  options.compaction_style = CompactionStyle::kCompactionStyleUniversal;
+  options.num_levels = 7;
+  options.disallow_memtable_writes = false;
+
+  DestroyAndReopen(options);
+  SstFileWriter sst_file_writer(EnvOptions(), options);
+  std::string data_file_original = sst_files_dir_ + "data_original";
+  ASSERT_OK(sst_file_writer.Open(data_file_original));
+  ASSERT_OK(sst_file_writer.Put("ukey1", "uval1_orig"));
+  ASSERT_OK(sst_file_writer.Put("ukey2", "uval2_orig"));
+  ASSERT_OK(sst_file_writer.Finish());
+  ASSERT_OK(db_->IngestExternalFile(db_->DefaultColumnFamily(),
+                                    {data_file_original},
+                                    IngestExternalFileOptions()));
+
+  ASSERT_OK(Put("data_version", "v_original"));
+  ASSERT_OK(Flush());
+  std::string value;
+  ASSERT_OK(db_->Get(ReadOptions(), "data_version", &value));
+  ASSERT_EQ(value, "v_original");
+  ASSERT_OK(db_->Get(ReadOptions(), "ukey1", &value));
+  ASSERT_EQ(value, "uval1_orig");
+  ASSERT_OK(db_->Get(ReadOptions(), "ukey2", &value));
+  ASSERT_EQ(value, "uval2_orig");
+  // Set up a 1) data version key file on L0, and 2) a user data file on L6
+  // to test the initial transitioning to use `atomic_replace_range`.
+  ASSERT_EQ("1,0,0,0,0,0,1", FilesPerLevel());
+
+  // Test multiple cycles of replacing by atomically ingest a data file and a
+  // version key file while replace the whole range in the column family.
+  for (int i = 0; i < 10; i++) {
+    std::string version_file_path =
+        sst_files_dir_ + "version" + std::to_string(i);
+    ASSERT_OK(sst_file_writer.Open(version_file_path));
+    ASSERT_OK(sst_file_writer.Put("data_version", "v" + std::to_string(i)));
+    ASSERT_OK(sst_file_writer.Finish());
+
+    std::string file_path = sst_files_dir_ + std::to_string(i);
+    ASSERT_OK(sst_file_writer.Open(file_path));
+    ASSERT_OK(sst_file_writer.Put("ukey1", "uval1" + std::to_string(i)));
+    ASSERT_OK(sst_file_writer.Put("ukey2", "uval2" + std::to_string(i)));
+    ASSERT_OK(sst_file_writer.Finish());
+
+    IngestExternalFileArg arg;
+    arg.column_family = db_->DefaultColumnFamily();
+    arg.external_files = {version_file_path, file_path};
+    arg.atomic_replace_range = {{nullptr, nullptr}};
+    // Test both fail_if_not_bottomost_level: true and false
+    arg.options.fail_if_not_bottommost_level = i % 2 == 0;
+    arg.options.snapshot_consistency = false;
+    // Ingest 1) a new data version file and 2) a new user data file while erase
+    // the whole column family
+    Status s = db_->IngestExternalFiles({arg});
+    ASSERT_OK(s);
+
+    // Check ingestion result and the expected LSM shape:
+    // Two files on L6, 1) a data version file 2) a user data file.
+    ASSERT_OK(db_->Get(ReadOptions(), "ukey1", &value));
+    ASSERT_EQ(value, "uval1" + std::to_string(i));
+    ASSERT_OK(db_->Get(ReadOptions(), "ukey2", &value));
+    ASSERT_EQ(value, "uval2" + std::to_string(i));
+    ASSERT_OK(db_->Get(ReadOptions(), "data_version", &value));
+    ASSERT_EQ(value, "v" + std::to_string(i));
+    ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel());
+  }
+
+  Close();
+}
+
 TEST_F(ExternalSSTFileBasicTest, FailIfNotBottommostLevelAndDisallowMemtable) {
   for (bool disallow_memtable : {false, true}) {
     Options options = GetDefaultOptions();

From 695c653e11c9181df2336e987d6827478db180c1 Mon Sep 17 00:00:00 2001
From: Yu Zhang <yuzhangyu@fb.com>
Date: Wed, 16 Apr 2025 17:24:12 -0700
Subject: [PATCH 056/500] Correctly initialize file size for reopened writable
 file (#13534)

Summary:
A reopened writable file's size is not correctly tracked in the `WritableFile`'s internal state.  This PR adds a querying to the file system to get the initial file size in the reopen case and use it to populate posix `WritableFile`'s internal state.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13534

Reviewed By: anand1976

Differential Revision: D72756628

Pulled By: jowlyzhang

fbshipit-source-id: 6f02b5c5da069fe49055d7b75bec9e7e47d5cd71
---
 env/fs_posix.cc                               | 30 ++++++++++++-------
 env/io_posix.cc                               | 10 ++++---
 env/io_posix.h                                |  5 ++--
 ...x_reopened_writable_file_size_incorrect.md |  1 +
 4 files changed, 30 insertions(+), 16 deletions(-)
 create mode 100644 unreleased_history/bug_fixes/fix_reopened_writable_file_size_incorrect.md

diff --git a/env/fs_posix.cc b/env/fs_posix.cc
index 82fb9fba337b..61fd2c5e614c 100644
--- a/env/fs_posix.cc
+++ b/env/fs_posix.cc
@@ -322,8 +322,17 @@ class PosixFileSystem : public FileSystem {
     if (options.use_mmap_writes) {
       MaybeForceDisableMmap(fd);
     }
+    uint64_t initial_file_size = 0;
+    if (reopen) {
+      s = GetFileSize(fname, IOOptions(), &initial_file_size, nullptr);
+      if (!s.ok()) {
+        close(fd);
+        return s;
+      }
+    }
     if (options.use_mmap_writes && !forceMmapOff_) {
-      result->reset(new PosixMmapFile(fname, fd, page_size_, options));
+      result->reset(
+          new PosixMmapFile(fname, fd, page_size_, options, initial_file_size));
     } else if (options.use_direct_writes && !options.use_mmap_writes) {
 #ifdef OS_MACOSX
       if (fcntl(fd, F_NOCACHE, 1) == -1) {
@@ -343,7 +352,7 @@ class PosixFileSystem : public FileSystem {
 #endif
       result->reset(new PosixWritableFile(
           fname, fd, GetLogicalBlockSizeForWriteIfNeeded(options, fname, fd),
-          options));
+          options, initial_file_size));
     } else {
       // disable mmap writes
       EnvOptions no_mmap_writes_options = options;
@@ -352,7 +361,7 @@ class PosixFileSystem : public FileSystem {
           new PosixWritableFile(fname, fd,
                                 GetLogicalBlockSizeForWriteIfNeeded(
                                     no_mmap_writes_options, fname, fd),
-                                no_mmap_writes_options));
+                                no_mmap_writes_options, initial_file_size));
     }
     return s;
   }
@@ -418,7 +427,8 @@ class PosixFileSystem : public FileSystem {
       MaybeForceDisableMmap(fd);
     }
     if (options.use_mmap_writes && !forceMmapOff_) {
-      result->reset(new PosixMmapFile(fname, fd, page_size_, options));
+      result->reset(new PosixMmapFile(fname, fd, page_size_, options,
+                                      /*initial_file_size=*/0));
     } else if (options.use_direct_writes && !options.use_mmap_writes) {
 #ifdef OS_MACOSX
       if (fcntl(fd, F_NOCACHE, 1) == -1) {
@@ -438,16 +448,16 @@ class PosixFileSystem : public FileSystem {
 #endif
       result->reset(new PosixWritableFile(
           fname, fd, GetLogicalBlockSizeForWriteIfNeeded(options, fname, fd),
-          options));
+          options, /*initial_file_size=*/0));
     } else {
       // disable mmap writes
       FileOptions no_mmap_writes_options = options;
       no_mmap_writes_options.use_mmap_writes = false;
-      result->reset(
-          new PosixWritableFile(fname, fd,
-                                GetLogicalBlockSizeForWriteIfNeeded(
-                                    no_mmap_writes_options, fname, fd),
-                                no_mmap_writes_options));
+      result->reset(new PosixWritableFile(
+          fname, fd,
+          GetLogicalBlockSizeForWriteIfNeeded(no_mmap_writes_options, fname,
+                                              fd),
+          no_mmap_writes_options, /*initial_file_size=*/0));
     }
     return s;
   }
diff --git a/env/io_posix.cc b/env/io_posix.cc
index 231e88daef39..db1a6da64666 100644
--- a/env/io_posix.cc
+++ b/env/io_posix.cc
@@ -1138,7 +1138,8 @@ IOStatus PosixMmapFile::Msync() {
 }
 
 PosixMmapFile::PosixMmapFile(const std::string& fname, int fd, size_t page_size,
-                             const EnvOptions& options)
+                             const EnvOptions& options,
+                             uint64_t initial_file_size)
     : filename_(fname),
       fd_(fd),
       page_size_(page_size),
@@ -1147,7 +1148,7 @@ PosixMmapFile::PosixMmapFile(const std::string& fname, int fd, size_t page_size,
       limit_(nullptr),
       dst_(nullptr),
       last_sync_(nullptr),
-      file_offset_(0) {
+      file_offset_(initial_file_size) {
 #ifdef ROCKSDB_FALLOCATE_PRESENT
   allow_fallocate_ = options.allow_fallocate;
   fallocate_with_keep_size_ = options.fallocate_with_keep_size;
@@ -1317,12 +1318,13 @@ IOStatus PosixMmapFile::Allocate(uint64_t offset, uint64_t len,
  */
 PosixWritableFile::PosixWritableFile(const std::string& fname, int fd,
                                      size_t logical_block_size,
-                                     const EnvOptions& options)
+                                     const EnvOptions& options,
+                                     uint64_t initial_file_size)
     : FSWritableFile(options),
       filename_(fname),
       use_direct_io_(options.use_direct_writes),
       fd_(fd),
-      filesize_(0),
+      filesize_(initial_file_size),
       logical_sector_size_(logical_block_size) {
 #ifdef ROCKSDB_FALLOCATE_PRESENT
   allow_fallocate_ = options.allow_fallocate;
diff --git a/env/io_posix.h b/env/io_posix.h
index 60788df9bf8b..c85ff0122d26 100644
--- a/env/io_posix.h
+++ b/env/io_posix.h
@@ -374,7 +374,8 @@ class PosixWritableFile : public FSWritableFile {
  public:
   explicit PosixWritableFile(const std::string& fname, int fd,
                              size_t logical_block_size,
-                             const EnvOptions& options);
+                             const EnvOptions& options,
+                             uint64_t initial_file_size);
   virtual ~PosixWritableFile();
 
   // Need to implement this so the file is truncated correctly
@@ -469,7 +470,7 @@ class PosixMmapFile : public FSWritableFile {
 
  public:
   PosixMmapFile(const std::string& fname, int fd, size_t page_size,
-                const EnvOptions& options);
+                const EnvOptions& options, uint64_t initial_file_size);
   ~PosixMmapFile();
 
   // Means Close() will properly take care of truncate
diff --git a/unreleased_history/bug_fixes/fix_reopened_writable_file_size_incorrect.md b/unreleased_history/bug_fixes/fix_reopened_writable_file_size_incorrect.md
new file mode 100644
index 000000000000..405b8fb19203
--- /dev/null
+++ b/unreleased_history/bug_fixes/fix_reopened_writable_file_size_incorrect.md
@@ -0,0 +1 @@
+Fix a bug in Posix file system that the FSWritableFile created via `FileSystem::ReopenWritableFile` internally does not track the correct file size.
\ No newline at end of file

From 31b23974702d2457e3213281f81b1d86bfd3120c Mon Sep 17 00:00:00 2001
From: Zaidoon Abd Al Hadi <zaidoon@cloudflare.com>
Date: Wed, 16 Apr 2025 20:45:38 -0700
Subject: [PATCH 057/500] Expose Options::memtable_op_scan_flush_trigger
 through C API (#13537)

Summary: Pull Request resolved: https://github.com/facebook/rocksdb/pull/13537

Reviewed By: jowlyzhang

Differential Revision: D73141407

Pulled By: cbi42

fbshipit-source-id: c7e04b403a17773e651f4922976f213b817f7adc
---
 db/c.cc             | 10 ++++++++++
 db/c_test.c         | 10 ++++++++++
 include/rocksdb/c.h |  5 +++++
 3 files changed, 25 insertions(+)

diff --git a/db/c.cc b/db/c.cc
index 819d928193e7..1b0571efa947 100644
--- a/db/c.cc
+++ b/db/c.cc
@@ -3295,6 +3295,16 @@ uint64_t rocksdb_options_get_periodic_compaction_seconds(
   return opt->rep.periodic_compaction_seconds;
 }
 
+void rocksdb_options_set_memtable_op_scan_flush_trigger(rocksdb_options_t* opt,
+                                                        uint32_t n) {
+  opt->rep.memtable_op_scan_flush_trigger = n;
+}
+
+uint32_t rocksdb_options_get_memtable_op_scan_flush_trigger(
+    rocksdb_options_t* opt) {
+  return opt->rep.memtable_op_scan_flush_trigger;
+}
+
 void rocksdb_options_enable_statistics(rocksdb_options_t* opt) {
   opt->rep.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
 }
diff --git a/db/c_test.c b/db/c_test.c
index 373bdcc6d43b..2142748e5674 100644
--- a/db/c_test.c
+++ b/db/c_test.c
@@ -2129,6 +2129,10 @@ int main(int argc, char** argv) {
     CheckCondition(100000 ==
                    rocksdb_options_get_periodic_compaction_seconds(o));
 
+    rocksdb_options_set_memtable_op_scan_flush_trigger(o, 100);
+    CheckCondition(100 ==
+                   rocksdb_options_get_memtable_op_scan_flush_trigger(o));
+
     rocksdb_options_set_ttl(o, 5000);
     CheckCondition(5000 == rocksdb_options_get_ttl(o));
 
@@ -2566,6 +2570,12 @@ int main(int argc, char** argv) {
     CheckCondition(100000 ==
                    rocksdb_options_get_periodic_compaction_seconds(o));
 
+    rocksdb_options_set_memtable_op_scan_flush_trigger(copy, 800);
+    CheckCondition(800 ==
+                   rocksdb_options_get_memtable_op_scan_flush_trigger(copy));
+    CheckCondition(100 ==
+                   rocksdb_options_get_memtable_op_scan_flush_trigger(o));
+
     rocksdb_options_set_ttl(copy, 8000);
     CheckCondition(8000 == rocksdb_options_get_ttl(copy));
     CheckCondition(5000 == rocksdb_options_get_ttl(o));
diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h
index 9ec09defb85d..60ae92fca14e 100644
--- a/include/rocksdb/c.h
+++ b/include/rocksdb/c.h
@@ -1341,6 +1341,11 @@ extern ROCKSDB_LIBRARY_API void rocksdb_options_set_periodic_compaction_seconds(
     rocksdb_options_t*, uint64_t);
 extern ROCKSDB_LIBRARY_API uint64_t
 rocksdb_options_get_periodic_compaction_seconds(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_memtable_op_scan_flush_trigger(rocksdb_options_t*,
+                                                   uint32_t);
+extern ROCKSDB_LIBRARY_API uint32_t
+rocksdb_options_get_memtable_op_scan_flush_trigger(rocksdb_options_t*);
 
 enum {
   rocksdb_statistics_level_disable_all = 0,

From 6d83a75595e58f55d3349a323445a6790f04c2f4 Mon Sep 17 00:00:00 2001
From: anand76 <anand1976@users.noreply.github.com>
Date: Thu, 17 Apr 2025 11:25:11 -0700
Subject: [PATCH 058/500] Pass FileSystem pointer and FileOptions to
 ExternalTableReader (#13551)

Summary: Pull Request resolved: https://github.com/facebook/rocksdb/pull/13551

Reviewed By: jaykorean

Differential Revision: D73157052

Pulled By: anand1976

fbshipit-source-id: 580a9104a86b11e3b0b624bb8aa2cf176dc7a27a
---
 include/rocksdb/external_table.h | 11 +++++++++--
 table/external_table.cc          |  5 +++--
 table/table_test.cc              |  5 ++++-
 3 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/include/rocksdb/external_table.h b/include/rocksdb/external_table.h
index 22db93ce140d..cfa4152c5a7b 100644
--- a/include/rocksdb/external_table.h
+++ b/include/rocksdb/external_table.h
@@ -8,6 +8,7 @@
 #include "rocksdb/advanced_iterator.h"
 #include "rocksdb/customizable.h"
 #include "rocksdb/file_checksum.h"
+#include "rocksdb/file_system.h"
 #include "rocksdb/iterator_base.h"
 #include "rocksdb/options.h"
 #include "rocksdb/status.h"
@@ -196,11 +197,17 @@ class ExternalTableBuilder {
 struct ExternalTableOptions {
   const std::shared_ptr<const SliceTransform>& prefix_extractor;
   const Comparator* comparator;
+  const std::shared_ptr<FileSystem>& fs;
+  const FileOptions& file_options;
 
   ExternalTableOptions(
       const std::shared_ptr<const SliceTransform>& _prefix_extractor,
-      const Comparator* _comparator)
-      : prefix_extractor(_prefix_extractor), comparator(_comparator) {}
+      const Comparator* _comparator, const std::shared_ptr<FileSystem>& _fs,
+      const FileOptions& _file_options)
+      : prefix_extractor(_prefix_extractor),
+        comparator(_comparator),
+        fs(_fs),
+        file_options(_file_options) {}
 };
 
 struct ExternalTableBuilderOptions {
diff --git a/table/external_table.cc b/table/external_table.cc
index ad611c5d29ac..9fed7d8024b8 100644
--- a/table/external_table.cc
+++ b/table/external_table.cc
@@ -287,8 +287,9 @@ class ExternalTableFactoryAdapter : public TableFactory {
       std::unique_ptr<TableReader>* table_reader,
       bool /* prefetch_index_and_filter_in_cache */) const override {
     std::unique_ptr<ExternalTableReader> reader;
-    ExternalTableOptions ext_topts(topts.prefix_extractor,
-                                   topts.ioptions.user_comparator);
+    ExternalTableOptions ext_topts(
+        topts.prefix_extractor, topts.ioptions.user_comparator,
+        topts.ioptions.fs, FileOptions(topts.env_options));
     auto status =
         inner_->NewTableReader(ro, file->file_name(), ext_topts, &reader);
     if (!status.ok()) {
diff --git a/table/table_test.cc b/table/table_test.cc
index efc6219ceeff..151b24b6b7ef 100644
--- a/table/table_test.cc
+++ b/table/table_test.cc
@@ -6882,7 +6882,10 @@ TEST_F(ExternalTableReaderTest, BasicTest) {
   std::unique_ptr<ExternalTableReader> reader;
   std::shared_ptr<SliceTransform> prefix_extractor;
   ASSERT_OK(factory->NewTableReader(
-      {}, file_path, ExternalTableOptions(prefix_extractor, nullptr), &reader));
+      {}, file_path,
+      ExternalTableOptions(prefix_extractor, /*comparator=*/nullptr,
+                           /*fs=*/nullptr, FileOptions()),
+      &reader));
 
   ReadOptions ro;
   std::unique_ptr<ExternalTableIterator> iter(reader->NewIterator(ro, nullptr));

From 925c63a96b48255d5514758994db884893b0cce0 Mon Sep 17 00:00:00 2001
From: Changyu Bi <changyubi@meta.com>
Date: Thu, 17 Apr 2025 12:06:40 -0700
Subject: [PATCH 059/500] Experimental API `IngestWriteBatchWithIndex()`
 (#13550)

Summary:
add support for ingesting a WriteBatchWithIndex into the DB with the new API `IngestWriteBatchWithIndex()`. This ingestion works similarly as `TransactionOptions::commit_bypass_memtable` where the WBWI will be ingested as an immutable memtable. Since this skips memtable writes, it improves the write performance when writing a large write batch into the DB. Currently this API only supports `disableWAL=true`. Support for WAL write will be in a follow up if needed.

For a WBWI to be ingestable, we needed to call `SetTrackPerCFStat()` at WBWI creation. This PR removes this step for simpler usage and per CF stats will always be tracked in WBWI. `WBWIIteratorImpl::TestOutOfBound()` is optimized to offset the performance impact.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13550

Test Plan:
- new unit test
- stress test option ingest_wbwi_one_in and ran a few runs of `python3 ./tools/db_crashtest.py blackbox --enable_pipelined_write=0 --use_timed_put_one_in=0 --use_put_entity_one_in=0 --ingest_wbwi_one_in=10 --test_batches_snapshots=0 --enable_blob_files=0 --preserve_unverified_changes=1 --avoid_flush_during_recovery=1 --disable_wal=1 --inplace_update_support=0 --interval=40`

Reviewed By: jowlyzhang

Differential Revision: D73152223

Pulled By: cbi42

fbshipit-source-id: 339f8ed26ac5a798238870df3ba857ba1add759b
---
 db/db_impl/db_impl.h                          | 14 ++--
 db/db_impl/db_impl_write.cc                   | 77 +++++++++++++++----
 db/db_write_test.cc                           | 74 ++++++++++++++++++
 db_stress_tool/db_stress_common.h             |  1 +
 db_stress_tool/db_stress_gflags.cc            |  5 ++
 db_stress_tool/no_batched_ops_stress.cc       | 48 +++++++++++-
 include/rocksdb/db.h                          | 16 ++++
 include/rocksdb/utilities/transaction_db.h    |  2 +
 .../utilities/write_batch_with_index.h        |  5 +-
 memtable/wbwi_memtable.h                      |  2 +-
 tools/db_crashtest.py                         |  8 ++
 .../new_features/ingest_wbwi.md               |  1 +
 .../transactions/pessimistic_transaction.cc   |  5 +-
 .../write_batch_with_index.cc                 | 40 ++++------
 .../write_batch_with_index_internal.h         |  5 ++
 .../write_batch_with_index_test.cc            |  4 -
 16 files changed, 247 insertions(+), 60 deletions(-)
 create mode 100644 unreleased_history/new_features/ingest_wbwi.md

diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h
index 941df3c6c528..0b208add7135 100644
--- a/db/db_impl/db_impl.h
+++ b/db/db_impl/db_impl.h
@@ -256,6 +256,10 @@ class DBImpl : public DB {
   Status WriteWithCallback(const WriteOptions& options, WriteBatch* updates,
                            UserWriteCallback* user_write_cb) override;
 
+  Status IngestWriteBatchWithIndex(
+      const WriteOptions& options,
+      std::shared_ptr<WriteBatchWithIndex> wbwi) override;
+
   using DB::Get;
   Status Get(const ReadOptions& _read_options,
              ColumnFamilyHandle* column_family, const Slice& key,
@@ -1531,11 +1535,11 @@ class DBImpl : public DB {
   // ingests `wbwi` is done.
   // @param memtable_updated Whether the same write that ingests wbwi has
   // updated memtable. This is useful for determining whether to set bg
-  // error when IngestWBWI fails.
-  Status IngestWBWI(std::shared_ptr<WriteBatchWithIndex> wbwi,
-                    const WBWIMemTable::SeqnoRange& assigned_seqno,
-                    uint64_t min_prep_log, SequenceNumber last_seqno,
-                    bool memtable_updated, bool ignore_missing_cf);
+  // error when IngestWBWIAsMemtable fails.
+  Status IngestWBWIAsMemtable(std::shared_ptr<WriteBatchWithIndex> wbwi,
+                              const WBWIMemTable::SeqnoRange& assigned_seqno,
+                              uint64_t min_prep_log, SequenceNumber last_seqno,
+                              bool memtable_updated, bool ignore_missing_cf);
 
   // If disable_memtable is set the application logic must guarantee that the
   // batch will still be skipped from memtable during the recovery. An excption
diff --git a/db/db_impl/db_impl_write.cc b/db/db_impl/db_impl_write.cc
index 7ef8bfc30116..7f91ed65f7d7 100644
--- a/db/db_impl/db_impl_write.cc
+++ b/db/db_impl/db_impl_write.cc
@@ -190,11 +190,38 @@ Status DBImpl::WriteWithCallback(const WriteOptions& write_options,
   return s;
 }
 
-Status DBImpl::IngestWBWI(std::shared_ptr<WriteBatchWithIndex> wbwi,
-                          const WBWIMemTable::SeqnoRange& assigned_seqno,
-                          uint64_t min_prep_log,
-                          SequenceNumber last_seqno_after_ingest,
-                          bool memtable_updated, bool ignore_missing_cf) {
+Status DBImpl::IngestWriteBatchWithIndex(
+    const WriteOptions& write_options,
+    std::shared_ptr<WriteBatchWithIndex> wbwi) {
+  if (!wbwi) {
+    return Status::InvalidArgument("Batch is nullptr!");
+  }
+  if (!write_options.disableWAL) {
+    return Status::NotSupported(
+        "IngestWriteBatchWithIndex does not support disableWAL=true");
+  }
+  Status s;
+  if (write_options.protection_bytes_per_key > 0) {
+    s = WriteBatchInternal::UpdateProtectionInfo(
+        wbwi->GetWriteBatch(), write_options.protection_bytes_per_key);
+  }
+  if (s.ok()) {
+    WriteBatch dummy_empty_batch;
+    s = WriteImpl(
+        write_options, /*updates=*/&dummy_empty_batch, /*callback=*/nullptr,
+        /*user_write_cb=*/nullptr, /*log_used=*/nullptr, /*log_ref=*/0,
+        /*disable_memtable=*/false, /*seq_used=*/nullptr,
+        /*batch_cnt=*/0, /*pre_release_callback=*/nullptr,
+        /*post_memtable_callback=*/nullptr, /*wbwi=*/wbwi);
+  }
+  return s;
+}
+
+Status DBImpl::IngestWBWIAsMemtable(
+    std::shared_ptr<WriteBatchWithIndex> wbwi,
+    const WBWIMemTable::SeqnoRange& assigned_seqno, uint64_t min_prep_log,
+    SequenceNumber last_seqno_after_ingest, bool memtable_updated,
+    bool ignore_missing_cf) {
   // Keys in new memtable have seqno > last_seqno_after_ingest >= keys in wbwi.
   assert(assigned_seqno.upper_bound <= last_seqno_after_ingest);
   // Keys in the current memtable have seqno <= LastSequence() < keys in wbwi.
@@ -436,9 +463,17 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
     return Status::NotSupported(
         "DeleteRange is not compatible with row cache.");
   }
+  // Whether the WBWI is from transaction commit or a direct write
+  // (IngestWriteBatchWithIndex())
+  bool ingest_wbwi_for_commit = false;
   if (wbwi) {
-    assert(log_ref > 0);
-    // Used only in WriteCommittedTxn::CommitInternal() with no `callback`.
+    if (my_batch->HasCommit()) {
+      ingest_wbwi_for_commit = true;
+      assert(log_ref);
+    } else {
+      // Only supports disableWAL for directly ingesting WBWI for now.
+      assert(write_options.disableWAL);
+    }
     assert(!callback);
     if (immutable_db_options_.unordered_write) {
       return Status::NotSupported(
@@ -448,6 +483,10 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
       return Status::NotSupported(
           "Ingesting WriteBatch does not support pipelined_write");
     }
+    if (!wbwi->GetOverwriteKey()) {
+      return Status::NotSupported(
+          "WriteBatchWithIndex ingestion requires overwrite_key=true");
+    }
   }
   // Otherwise IsLatestPersistentState optimization does not make sense
   assert(!WriteBatchInternal::IsLatestPersistentState(my_batch) ||
@@ -658,7 +697,13 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
             continue;
           }
           // TODO: maybe handle the tracing status?
-          tracer_->Write(writer->batch).PermitUncheckedError();
+          if (wbwi && !ingest_wbwi_for_commit) {
+            // for transaction write, tracer only needs the commit marker which
+            // is in writer->batch
+            tracer_->Write(wbwi->GetWriteBatch()).PermitUncheckedError();
+          } else {
+            tracer_->Write(writer->batch).PermitUncheckedError();
+          }
         }
       }
     }
@@ -860,12 +905,13 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
     // handle exit, false means somebody else did
     should_exit_batch_group = write_thread_.CompleteParallelMemTableWriter(&w);
   }
-  if (wbwi) {
-    if (status.ok() && w.status.ok()) {
+  if (wbwi && status.ok() && w.status.ok()) {
+    uint32_t wbwi_count = wbwi->GetWriteBatch()->Count();
+    // skip empty batch case
+    if (wbwi_count) {
       // w.batch contains (potentially empty) commit time batch updates,
       // only ingest wbwi if w.batch is applied to memtable successfully
       uint32_t memtable_update_count = w.batch->Count();
-      uint32_t wbwi_count = wbwi->GetWriteBatch()->Count();
       // Seqno assigned to this write are [last_seq + 1 - seq_inc, last_seq].
       // seq_inc includes w.batch (memtable updates) and wbwi
       // w.batch gets first `memtable_update_count` sequence numbers.
@@ -878,10 +924,11 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
       if (two_write_queues_) {
         assert(ub <= versions_->LastAllocatedSequence());
       }
-      status = IngestWBWI(wbwi, {/*lower_bound=*/lb, /*upper_bound=*/ub},
-                          /*min_prep_log=*/log_ref, last_sequence,
-                          /*memtable_updated=*/memtable_update_count > 0,
-                          write_options.ignore_missing_column_families);
+      status =
+          IngestWBWIAsMemtable(wbwi, {/*lower_bound=*/lb, /*upper_bound=*/ub},
+                               /*min_prep_log=*/log_ref, last_sequence,
+                               /*memtable_updated=*/memtable_update_count > 0,
+                               write_options.ignore_missing_column_families);
     }
   }
 
diff --git a/db/db_write_test.cc b/db/db_write_test.cc
index e3afd219dcc3..f6eeac7c5ee6 100644
--- a/db/db_write_test.cc
+++ b/db/db_write_test.cc
@@ -1000,6 +1000,80 @@ TEST_P(DBWriteTest, RecycleLogToggleTest) {
   ASSERT_EQ(Get(Key(1)), "val2");
 }
 
+TEST_P(DBWriteTest, IngestWriteBatchWithIndex) {
+  if (GetParam() == kPipelinedWrite) {
+    return;
+  }
+
+  Options options = GetOptions();
+  options.disable_auto_compactions = true;
+  Reopen(options);
+  Options cf_options = GetOptions();
+  cf_options.merge_operator = MergeOperators::CreateStringAppendOperator();
+  CreateColumnFamilies({"cf1", "cf2"}, cf_options);
+  ReopenWithColumnFamilies({"default", "cf1", "cf2"},
+                           {options, cf_options, cf_options});
+
+  // default cf
+  auto wbwi1 = std::make_shared<WriteBatchWithIndex>(options.comparator, 0,
+                                                     /*overwrite_key=*/true);
+  ASSERT_OK(wbwi1->Put("key1", "value1"));
+  ASSERT_OK(wbwi1->Put("key2", "value2"));
+  if (GetParam() == kPipelinedWrite) {
+    ASSERT_TRUE(db_->IngestWriteBatchWithIndex({}, wbwi1).IsNotSupported());
+    return;
+  }
+  // Test disableWAL=false
+  ASSERT_TRUE(db_->IngestWriteBatchWithIndex({}, wbwi1).IsNotSupported());
+
+  WriteOptions wo;
+  wo.disableWAL = true;
+  ASSERT_OK(db_->IngestWriteBatchWithIndex(wo, wbwi1));
+  ASSERT_EQ("value1", Get("key1"));
+  ASSERT_EQ("value2", Get("key2"));
+
+  // Test with overwrites
+  auto wbwi = std::make_shared<WriteBatchWithIndex>(options.comparator, 0,
+                                                    /*overwrite_key=*/true);
+  ASSERT_OK(wbwi->Put("key2", "value3"));
+  ASSERT_OK(wbwi->Delete("key1"));  // Delete an existing key
+  ASSERT_OK(db_->IngestWriteBatchWithIndex(wo, wbwi));
+  ASSERT_EQ("NOT_FOUND", Get("key1"));
+  ASSERT_EQ("value3", Get("key2"));
+
+  auto wbwi2 = std::make_shared<WriteBatchWithIndex>(options.comparator, 0,
+                                                     /*overwrite_key=*/true);
+  ASSERT_OK(wbwi2->Put(handles_[1], "cf1_key1", "cf1_value1"));
+  ASSERT_OK(wbwi2->Delete(handles_[1], "cf1_key2"));
+  // Test ingestion with column family
+  ASSERT_OK(db_->IngestWriteBatchWithIndex(wo, wbwi2));
+  ASSERT_EQ("cf1_value1", Get(1, "cf1_key1"));
+  ASSERT_EQ("NOT_FOUND", Get(1, "cf1_key2"));
+
+  auto wbwi3 = std::make_shared<WriteBatchWithIndex>(options.comparator, 0,
+                                                     /*overwrite_key=*/true);
+  ASSERT_OK(wbwi3->Merge(handles_[2], "cf2_key1", "cf2_value1"));
+  ASSERT_OK(wbwi3->Merge(handles_[2], "cf2_key1", "cf2_value2"));
+  // Test ingestion with merge operations
+  ASSERT_OK(db_->IngestWriteBatchWithIndex(wo, wbwi3));
+  ASSERT_EQ("cf2_value1,cf2_value2", Get(2, "cf2_key1"));
+
+  // Test with overwrite_key = false
+  auto wbwi_no_overwrite = std::make_shared<WriteBatchWithIndex>(
+      options.comparator, 0, /*overwrite_key=*/false);
+  ASSERT_OK(wbwi_no_overwrite->Put("key1", "value1"));
+  Status s = db_->IngestWriteBatchWithIndex(wo, wbwi_no_overwrite);
+  ASSERT_TRUE(s.IsNotSupported());
+
+  auto empty_wbwi = std::make_shared<WriteBatchWithIndex>(
+      options.comparator, 0, /*overwrite_key=*/true);
+  ASSERT_OK(db_->IngestWriteBatchWithIndex(wo, empty_wbwi));
+
+  DestroyAndReopen(options);
+  // Should fail when trying to ingest to non-existent column family
+  ASSERT_NOK(db_->IngestWriteBatchWithIndex(wo, wbwi2));
+}
+
 INSTANTIATE_TEST_CASE_P(DBWriteTestInstance, DBWriteTest,
                         testing::Values(DBTestBase::kDefault,
                                         DBTestBase::kConcurrentWALWrites,
diff --git a/db_stress_tool/db_stress_common.h b/db_stress_tool/db_stress_common.h
index 1d8f979cf05c..ed0d50c9ec70 100644
--- a/db_stress_tool/db_stress_common.h
+++ b/db_stress_tool/db_stress_common.h
@@ -423,6 +423,7 @@ DECLARE_bool(track_and_verify_wals);
 DECLARE_bool(enable_remote_compaction);
 DECLARE_bool(auto_refresh_iterator_with_snapshot);
 DECLARE_uint32(memtable_op_scan_flush_trigger);
+DECLARE_uint32(ingest_wbwi_one_in);
 
 constexpr long KB = 1024;
 constexpr int kRandomValueMaxFactor = 3;
diff --git a/db_stress_tool/db_stress_gflags.cc b/db_stress_tool/db_stress_gflags.cc
index 32e1aad2262d..dbe6fc09007a 100644
--- a/db_stress_tool/db_stress_gflags.cc
+++ b/db_stress_tool/db_stress_gflags.cc
@@ -842,6 +842,11 @@ DEFINE_bool(track_and_verify_wals,
 DEFINE_bool(enable_remote_compaction, false,
             "Enable (simulated) Remote Compaction");
 
+DEFINE_uint32(ingest_wbwi_one_in, 0,
+              "If set, will call"
+              "IngestWriteBatchWithIndex() instead of regular write operations "
+              "once every N writes.");
+
 static bool ValidateInt32Percent(const char* flagname, int32_t value) {
   if (value < 0 || value > 100) {
     fprintf(stderr, "Invalid value for --%s: %d, 0<= pct <=100 \n", flagname,
diff --git a/db_stress_tool/no_batched_ops_stress.cc b/db_stress_tool/no_batched_ops_stress.cc
index 44165563c621..347c03b6519d 100644
--- a/db_stress_tool/no_batched_ops_stress.cc
+++ b/db_stress_tool/no_batched_ops_stress.cc
@@ -1845,7 +1845,17 @@ class NonBatchedOpsStressTest : public StressTest {
       } else if (FLAGS_use_merge) {
         if (!FLAGS_use_txn) {
           if (FLAGS_user_timestamp_size == 0) {
-            s = db_->Merge(write_opts, cfh, k, v);
+            if (FLAGS_ingest_wbwi_one_in &&
+                thread->rand.OneIn(FLAGS_ingest_wbwi_one_in)) {
+              auto wbwi = std::make_shared<WriteBatchWithIndex>(
+                  options_.comparator, 0, /*overwrite_key=*/true);
+              s = wbwi->Merge(cfh, k, v);
+              if (s.ok()) {
+                s = db_->IngestWriteBatchWithIndex(write_opts, wbwi);
+              }
+            } else {
+              s = db_->Merge(write_opts, cfh, k, v);
+            }
           } else {
             s = db_->Merge(write_opts, cfh, k, write_ts, v);
           }
@@ -1857,7 +1867,17 @@ class NonBatchedOpsStressTest : public StressTest {
       } else {
         if (!FLAGS_use_txn) {
           if (FLAGS_user_timestamp_size == 0) {
-            s = db_->Put(write_opts, cfh, k, v);
+            if (FLAGS_ingest_wbwi_one_in &&
+                thread->rand.OneIn(FLAGS_ingest_wbwi_one_in)) {
+              auto wbwi = std::make_shared<WriteBatchWithIndex>(
+                  options_.comparator, 0, /*overwrite_key=*/true);
+              s = wbwi->Put(cfh, k, v);
+              if (s.ok()) {
+                s = db_->IngestWriteBatchWithIndex(write_opts, wbwi);
+              }
+            } else {
+              s = db_->Put(write_opts, cfh, k, v);
+            }
           } else {
             s = db_->Put(write_opts, cfh, k, write_ts, v);
           }
@@ -1949,7 +1969,17 @@ class NonBatchedOpsStressTest : public StressTest {
         }
         if (!FLAGS_use_txn) {
           if (FLAGS_user_timestamp_size == 0) {
-            s = db_->Delete(write_opts, cfh, key);
+            if (FLAGS_ingest_wbwi_one_in &&
+                thread->rand.OneIn(FLAGS_ingest_wbwi_one_in)) {
+              auto wbwi = std::make_shared<WriteBatchWithIndex>(
+                  options_.comparator, 0, /*overwrite_key=*/true);
+              s = wbwi->Delete(cfh, key);
+              if (s.ok()) {
+                s = db_->IngestWriteBatchWithIndex(write_opts, wbwi);
+              }
+            } else {
+              s = db_->Delete(write_opts, cfh, key);
+            }
           } else {
             s = db_->Delete(write_opts, cfh, key, write_ts);
           }
@@ -2006,7 +2036,17 @@ class NonBatchedOpsStressTest : public StressTest {
         }
         if (!FLAGS_use_txn) {
           if (FLAGS_user_timestamp_size == 0) {
-            s = db_->SingleDelete(write_opts, cfh, key);
+            if (FLAGS_ingest_wbwi_one_in &&
+                thread->rand.OneIn(FLAGS_ingest_wbwi_one_in)) {
+              auto wbwi = std::make_shared<WriteBatchWithIndex>(
+                  options_.comparator, 0, /*overwrite_key=*/true);
+              s = wbwi->SingleDelete(cfh, key);
+              if (s.ok()) {
+                s = db_->IngestWriteBatchWithIndex(write_opts, wbwi);
+              }
+            } else {
+              s = db_->SingleDelete(write_opts, cfh, key);
+            }
           } else {
             s = db_->SingleDelete(write_opts, cfh, key, write_ts);
           }
diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h
index 58eafe2dc40a..db2060c33b79 100644
--- a/include/rocksdb/db.h
+++ b/include/rocksdb/db.h
@@ -31,6 +31,7 @@
 #include "rocksdb/types.h"
 #include "rocksdb/user_write_callback.h"
 #include "rocksdb/utilities/table_properties_collectors.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
 #include "rocksdb/version.h"
 #include "rocksdb/wide_columns.h"
 
@@ -633,6 +634,21 @@ class DB {
         "WriteWithCallback not implemented for this interface.");
   }
 
+  // EXPERIMENTAL, subject to change
+  // Ingest a WriteBatchWithIndex into DB, bypassing memtable writes for better
+  // write performance. Useful when there is a large number of updates
+  // in the write batch.
+  // The WriteBatchWithIndex must be created with overwrite_key=true.
+  // Currently this requires WriteOptions::disableWAL=true.
+  // The following options are currently not supported:
+  // - unordered_write
+  // - enable_pipelined_write
+  virtual Status IngestWriteBatchWithIndex(
+      const WriteOptions& /*options*/,
+      std::shared_ptr<WriteBatchWithIndex> /*wbwi*/) {
+    return Status::NotSupported("IngestWriteBatchWithIndex not implemented.");
+  }
+
   // If the column family specified by "column_family" contains an entry for
   // "key", return the corresponding value in "*value". If the entry is a plain
   // key-value, return the value as-is; if it is a wide-column entity, return
diff --git a/include/rocksdb/utilities/transaction_db.h b/include/rocksdb/utilities/transaction_db.h
index 156583333d33..29cc12846f92 100644
--- a/include/rocksdb/utilities/transaction_db.h
+++ b/include/rocksdb/utilities/transaction_db.h
@@ -368,6 +368,8 @@ struct TransactionOptions {
   // Only supports write-committed policy. If set to true, the transaction will
   // skip memtable write and ingest into the DB directly during Commit(). This
   // makes Commit() much faster for transactions with many operations.
+  // Transaction neeeds to call Prepare() before Commit() for this option to
+  // take effect.
   // Transactions with Merge() or PutEntity() is not supported yet.
   //
   // Note that the transaction will be ingested as an immutable memtable for
diff --git a/include/rocksdb/utilities/write_batch_with_index.h b/include/rocksdb/utilities/write_batch_with_index.h
index 6ff8b587099d..9cfc364cfe07 100644
--- a/include/rocksdb/utilities/write_batch_with_index.h
+++ b/include/rocksdb/utilities/write_batch_with_index.h
@@ -90,6 +90,8 @@ class WBWIIterator {
   // Returns n where the current entry is the n-th update to the current key.
   // The update count starts from 1.
   // Only valid if WBWI is created with overwrite_key = true.
+  // With overwrite_key=false, update count for each entry is not maintained,
+  // see UpdateExistingEntryWithCfId().
   virtual uint32_t GetUpdateCount() const { return 0; }
 };
 
@@ -374,9 +376,6 @@ class WriteBatchWithIndex : public WriteBatchBase {
     uint32_t entry_count = 0;
     uint32_t overwritten_sd_count = 0;
   };
-  // Will track CF ID, per CF entry count and overwritten sd count.
-  // Should be enabled when WBWI is empty for correct tracking.
-  void SetTrackPerCFStat(bool track);
   const std::unordered_map<uint32_t, CFStat>& GetCFStats() const;
 
   bool GetOverwriteKey() const;
diff --git a/memtable/wbwi_memtable.h b/memtable/wbwi_memtable.h
index 3f0ae3e23d5b..b3231b4d565d 100644
--- a/memtable/wbwi_memtable.h
+++ b/memtable/wbwi_memtable.h
@@ -235,7 +235,7 @@ class WBWIMemTable final : public ReadOnlyMemTable {
   uint64_t num_entries_;
   // WBWI can contains updates to multiple CFs. `cf_id_` determines which CF
   // this memtable is for.
-  uint32_t cf_id_;
+  const uint32_t cf_id_;
 };
 
 class WBWIMemTableIterator final : public InternalIterator {
diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index 8e9dc22f6d54..5b4397d2c4dc 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -345,6 +345,7 @@
     "enable_remote_compaction": lambda: random.choice([0, 1]),
     "auto_refresh_iterator_with_snapshot": lambda: random.choice([0, 1]),
     "memtable_op_scan_flush_trigger": lambda: random.choice([0, 10, 100, 1000]),
+    "ingest_wbwi_one_in": lambda: random.choice([0, 0, 100, 500]),
 }
 _TEST_DIR_ENV_VAR = "TEST_TMPDIR"
 # If TEST_TMPDIR_EXPECTED is not specified, default value will be TEST_TMPDIR
@@ -1035,6 +1036,13 @@ def finalize_and_sanitize(src_params):
         dest_params["use_multi_get_entity"] = 0
         dest_params["enable_pipelined_write"] = 0
         dest_params["use_attribute_group"] = 0
+    if (
+        dest_params.get("enable_pipelined_write", 0)
+        or dest_params.get("unordered_write", 0)
+        or dest_params.get("disable_wal", 0) == 0
+        or dest_params.get("user_timestamp_size", 0)
+    ):
+        dest_params["ingest_wbwi_one_in"] = 0
     # Continuous verification fails with secondaries inside NonBatchedOpsStressTest
     if dest_params.get("test_secondary") == 1:
         dest_params["continuous_verification_interval"] = 0
diff --git a/unreleased_history/new_features/ingest_wbwi.md b/unreleased_history/new_features/ingest_wbwi.md
new file mode 100644
index 000000000000..f778a1e86642
--- /dev/null
+++ b/unreleased_history/new_features/ingest_wbwi.md
@@ -0,0 +1 @@
+* Introduce API `IngestWriteBatchWithIndex()` for ingesting updates into DB while bypassing memtable writes. This improves performance when writing a large write batch to the DB.
diff --git a/utilities/transactions/pessimistic_transaction.cc b/utilities/transactions/pessimistic_transaction.cc
index 5f6a5d4164a3..f8dbaf07c9f8 100644
--- a/utilities/transactions/pessimistic_transaction.cc
+++ b/utilities/transactions/pessimistic_transaction.cc
@@ -110,8 +110,6 @@ void PessimisticTransaction::Initialize(const TransactionOptions& txn_options) {
     commit_bypass_memtable_threshold_ =
         db_options.txn_commit_bypass_memtable_threshold;
   }
-  write_batch_.SetTrackPerCFStat(commit_bypass_memtable_threshold_ <
-                                 std::numeric_limits<uint32_t>::max());
 }
 
 PessimisticTransaction::~PessimisticTransaction() {
@@ -914,6 +912,9 @@ Status WriteCommittedTxn::CommitInternal() {
   TEST_SYNC_POINT_CALLBACK("WriteCommittedTxn::CommitInternal:bypass_memtable",
                            static_cast<void*>(&bypass_memtable));
   if (bypass_memtable) {
+    // Used for differentiating commiting WBWI vs directly ingesting WBWI
+    // see (IngestWriteBatchWithIndex())
+    assert(working_batch->HasCommit());
     s = db_impl_->WriteImpl(
         write_options_, working_batch, /*callback*/ nullptr,
         /*user_write_cb=*/nullptr,
diff --git a/utilities/write_batch_with_index/write_batch_with_index.cc b/utilities/write_batch_with_index/write_batch_with_index.cc
index 2970ce6e5028..ca324d9da59c 100644
--- a/utilities/write_batch_with_index/write_batch_with_index.cc
+++ b/utilities/write_batch_with_index/write_batch_with_index.cc
@@ -32,8 +32,7 @@ struct WriteBatchWithIndex::Rep {
         skip_list(comparator, &arena),
         last_sub_batch_offset(0),
         sub_batch_cnt(1),
-        overwrite_key(_overwrite_key),
-        track_cf_stat(false) {}
+        overwrite_key(_overwrite_key) {}
   ReadableWriteBatch write_batch;
   WriteBatchEntryComparator comparator;
   Arena arena;
@@ -45,10 +44,10 @@ struct WriteBatchWithIndex::Rep {
   // Total number of sub-batches in the write batch. Default is 1.
   size_t sub_batch_cnt;
 
-  bool overwrite_key;
-  bool track_cf_stat;
+  const bool overwrite_key;
   // Tracks ids of CFs that have updates in this WBWI, number of updates and
-  // number of overwritten single deletions per cf.
+  // number of overwritten single deletions per cf. Useful for WBWIMemTable
+  // when this WBWI is ingested into a DB.
   std::unordered_map<uint32_t, CFStat> cf_id_to_stat;
 
   // In overwrite mode, find the existing entry for the same key and update it
@@ -126,15 +125,13 @@ bool WriteBatchWithIndex::Rep::UpdateExistingEntryWithCfId(
     last_sub_batch_offset = last_entry_offset;
     sub_batch_cnt++;
   }
-  if (track_cf_stat) {
-    if (most_recent_entry->has_single_del &&
-        !most_recent_entry->has_overwritten_single_del) {
-      cf_id_to_stat[column_family_id].overwritten_sd_count++;
-      most_recent_entry->has_overwritten_single_del = true;
-    }
-    if (type == kSingleDeleteRecord) {
-      most_recent_entry->has_single_del = true;
-    }
+  if (most_recent_entry->has_single_del &&
+      !most_recent_entry->has_overwritten_single_del) {
+    cf_id_to_stat[column_family_id].overwritten_sd_count++;
+    most_recent_entry->has_overwritten_single_del = true;
+  }
+  if (type == kSingleDeleteRecord) {
+    most_recent_entry->has_single_del = true;
   }
   // Some sanity check for using Merge and SD on the same key.
   if (iter.Entry().type == kSingleDeleteRecord) {
@@ -196,12 +193,10 @@ void WriteBatchWithIndex::Rep::AddNewEntry(uint32_t column_family_id,
       key.size(), update_count);
   skip_list.Insert(index_entry);
 
-  if (track_cf_stat) {
-    if (type == kSingleDeleteRecord) {
-      index_entry->has_single_del = true;
-    }
-    cf_id_to_stat[column_family_id].entry_count++;
+  if (type == kSingleDeleteRecord) {
+    index_entry->has_single_del = true;
   }
+  cf_id_to_stat[column_family_id].entry_count++;
 }
 
 void WriteBatchWithIndex::Rep::Clear() {
@@ -1164,15 +1159,8 @@ const Comparator* WriteBatchWithIndexInternal::GetUserComparator(
   return ucmps.GetComparator(cf_id);
 }
 
-void WriteBatchWithIndex::SetTrackPerCFStat(bool track) {
-  // Should be set when the wbwi contains no update.
-  assert(GetWriteBatch()->Count() == 0);
-  rep->track_cf_stat = track;
-}
-
 const std::unordered_map<uint32_t, WriteBatchWithIndex::CFStat>&
 WriteBatchWithIndex::GetCFStats() const {
-  assert(rep->track_cf_stat);
   return rep->cf_id_to_stat;
 }
 
diff --git a/utilities/write_batch_with_index/write_batch_with_index_internal.h b/utilities/write_batch_with_index/write_batch_with_index_internal.h
index 79134217e200..6871a922ae5f 100644
--- a/utilities/write_batch_with_index/write_batch_with_index_internal.h
+++ b/utilities/write_batch_with_index/write_batch_with_index_internal.h
@@ -406,6 +406,11 @@ class WBWIIteratorImpl final : public WBWIIterator {
   bool out_of_bound_ = false;
 
   bool TestOutOfBound() const {
+    if (!iterate_lower_bound_ && !iterate_upper_bound_) {
+      // The Entry() call below is non-trivial, tests the common and cheaper
+      // no bound case first.
+      return false;
+    }
     const Slice& curKey = Entry().key;
     return AtOrAfterUpperBound(&curKey) || BeforeLowerBound(&curKey);
   }
diff --git a/utilities/write_batch_with_index/write_batch_with_index_test.cc b/utilities/write_batch_with_index/write_batch_with_index_test.cc
index 9e26d734baf7..2c8b71203e17 100644
--- a/utilities/write_batch_with_index/write_batch_with_index_test.cc
+++ b/utilities/write_batch_with_index/write_batch_with_index_test.cc
@@ -3646,7 +3646,6 @@ TEST_P(WriteBatchWithIndexTest, EntityReadSanityChecks) {
 
 TEST_P(WriteBatchWithIndexTest, TrackAndClearCFStats) {
   std::string value;
-  batch_->SetTrackPerCFStat(true);
   ASSERT_OK(batch_->Put("A", "val"));
   ASSERT_OK(batch_->SingleDelete("B"));
 
@@ -3735,7 +3734,6 @@ TEST_F(WBWIMemTableTest, ReadFromWBWIMemtable) {
   Random& rnd = *Random::GetTLSInstance();
   auto wbwi = std::make_shared<WriteBatchWithIndex>(
       cmp, 0, /*overwrite_key=*/true, 0, 0);
-  wbwi->SetTrackPerCFStat(true);
   std::vector<std::pair<std::string, std::string>> expected;
   const int kNumUpdate = 10000;
   expected.resize(kNumUpdate);
@@ -3999,7 +3997,6 @@ TEST_F(WBWIMemTableTest, IterEmitSingleDelete) {
 
   auto wbwi = std::make_shared<WriteBatchWithIndex>(
       cmp, 0, /*overwrite_key=*/true, 0, 0);
-  wbwi->SetTrackPerCFStat(true);
 
   ASSERT_OK(wbwi->Put(DBTestBase::Key(0), "val0"));
   ASSERT_OK(wbwi->SingleDelete(DBTestBase::Key(0)));
@@ -4153,7 +4150,6 @@ TEST_F(WBWIMemTableTest, WBWIMemTableWithMerge) {
 
   auto wbwi = std::make_shared<WriteBatchWithIndex>(
       cmp, 0, /*overwrite_key=*/true, 0, 0);
-  wbwi->SetTrackPerCFStat(true);
   std::unique_ptr<WBWIMemTable> wbwi_mem{
       new WBWIMemTable(wbwi, cmp,
                        /*cf_id=*/0, &immutable_opts, &mutable_cf_options,

From 476a98ca30469053d11937ec6967123d4db58f51 Mon Sep 17 00:00:00 2001
From: Yu Zhang <yuzhangyu@fb.com>
Date: Thu, 17 Apr 2025 13:19:52 -0700
Subject: [PATCH 060/500] Add a new GetNewestUserDefinedTimestamp API (#13547)

Summary:
This PR adds a DB::GetNewestUserDefinedTimestamp API to get the newest timestamp of the column family. This is only for when the column family enables user defined timestamp.
It checks the mutable memtable, the immutable memtable and the SST files, and returns the first newest user defined timestamp found. When user defined timestamp is not persisted in SST files, there is metadata in MANIFEST tracking upperbound of flushed timestamps, so the newest timestamp in SST files can be found. If user defined timestamps are
persisted in SST files, currently no timestamp metadata info is persisted. A NotSupported status will be returned if SST files need to be checked in that case.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13547

Test Plan: Added tests

Reviewed By: cbi42

Differential Revision: D73123575

Pulled By: jowlyzhang

fbshipit-source-id: 460ac4f9c96926d3c8fcf7944edab8dc0feae1dd
---
 db/db_impl/db_impl.cc                         |  64 ++++++++++
 db/db_impl/db_impl.h                          |   3 +
 db/db_test.cc                                 |   5 +
 db/db_with_timestamp_basic_test.cc            | 111 ++++++++++++++++++
 db/memtable.cc                                |   7 +-
 db/memtable.h                                 |   8 +-
 db/memtable_list.cc                           |  13 ++
 db/memtable_list.h                            |   6 +
 include/rocksdb/db.h                          |  19 +++
 include/rocksdb/utilities/stackable_db.h      |   5 +
 .../new_features/get_newest_udt.md            |   1 +
 util/udt_util.cc                              |  14 +++
 util/udt_util.h                               |   4 +
 13 files changed, 250 insertions(+), 10 deletions(-)
 create mode 100644 unreleased_history/new_features/get_newest_udt.md

diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc
index 571e1e2675eb..8ecbc3660194 100644
--- a/db/db_impl/db_impl.cc
+++ b/db/db_impl/db_impl.cc
@@ -74,6 +74,7 @@
 #include "options/cf_options.h"
 #include "options/options_helper.h"
 #include "options/options_parser.h"
+#include "util/udt_util.h"
 #ifdef ROCKSDB_JEMALLOC
 #include "port/jemalloc_helper.h"
 #endif
@@ -1855,6 +1856,69 @@ Status DBImpl::GetFullHistoryTsLow(ColumnFamilyHandle* column_family,
   return Status::OK();
 }
 
+Status DBImpl::GetNewestUserDefinedTimestamp(ColumnFamilyHandle* column_family,
+                                             std::string* newest_timestamp) {
+  if (newest_timestamp == nullptr) {
+    return Status::InvalidArgument("newest_timestamp is nullptr");
+  }
+  ColumnFamilyData* cfd = nullptr;
+  if (column_family == nullptr) {
+    cfd = default_cf_handle_->cfd();
+  } else {
+    auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+    assert(cfh != nullptr);
+    cfd = cfh->cfd();
+  }
+  assert(cfd != nullptr && cfd->user_comparator() != nullptr);
+  if (cfd->user_comparator()->timestamp_size() == 0) {
+    return Status::InvalidArgument(
+        "Timestamp is not enabled in this column family");
+  }
+  if (cfd->ioptions().persist_user_defined_timestamps) {
+    return Status::NotSupported(
+        "GetNewestUserDefinedTimestamp doesn't support the case when user"
+        "defined timestamps are persisted.");
+  }
+
+  Status status;
+  // Acquire SuperVersion
+  SuperVersion* sv = GetAndRefSuperVersion(cfd);
+  {
+    InstrumentedMutexLock l(&mutex_);
+    bool enter_write_thread = sv->mem == cfd->mem();
+    WriteThread::Writer w;
+    // Enter write thread to read the mutable memtable to avoid racing access
+    // with concurrent writes. No need to enter nonmem_write_thread_ since this
+    // call only care about memtable writes, not WAL writes.
+    if (enter_write_thread) {
+      write_thread_.EnterUnbatched(&w, &mutex_);
+      WaitForPendingWrites();
+    }
+    *newest_timestamp = sv->mem->GetNewestUDT().ToString();
+    assert(!newest_timestamp->empty() || sv->mem->IsEmpty());
+    if (enter_write_thread) {
+      write_thread_.ExitUnbatched(&w);
+    }
+  }
+  // Read from immutable memtables if nothing found in mutable memtable.
+  if (newest_timestamp->empty()) {
+    *newest_timestamp = sv->imm->GetNewestUDT().ToString();
+  }
+  // Read from SST files if no result can be found in memtables.
+  if (newest_timestamp->empty() && sv->current->GetSstFilesSize() != 0) {
+    // full_history_ts_low is used to track the exclusive upperbound of
+    // flushed user defined timestamp. So we can use it to deduce the newest
+    // timestamp in the SST files that the column family has seen.
+    Slice full_history_ts_low = sv->full_history_ts_low;
+    if (!full_history_ts_low.empty()) {
+      GetU64CutoffTsFromFullHistoryTsLow(&full_history_ts_low,
+                                         newest_timestamp);
+    }
+  }
+  ReturnAndCleanupSuperVersion(cfd, sv);
+  return status;
+}
+
 InternalIterator* DBImpl::NewInternalIterator(const ReadOptions& read_options,
                                               Arena* arena,
                                               SequenceNumber sequence,
diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h
index 0b208add7135..dd4d3e90a4cd 100644
--- a/db/db_impl/db_impl.h
+++ b/db/db_impl/db_impl.h
@@ -506,6 +506,9 @@ class DBImpl : public DB {
   Status GetFullHistoryTsLow(ColumnFamilyHandle* column_family,
                              std::string* ts_low) override;
 
+  Status GetNewestUserDefinedTimestamp(ColumnFamilyHandle* column_family,
+                                       std::string* newest_timestamp) override;
+
   Status GetDbIdentity(std::string& identity) const override;
 
   virtual Status GetDbIdentityFromIdentityFile(const IOOptions& opts,
diff --git a/db/db_test.cc b/db/db_test.cc
index b3511f3eecd8..8cf5b12a959e 100644
--- a/db/db_test.cc
+++ b/db/db_test.cc
@@ -3446,6 +3446,11 @@ class ModelDB : public DB {
     return Status::OK();
   }
 
+  Status GetNewestUserDefinedTimestamp(
+      ColumnFamilyHandle* /*cf*/, std::string* /*newest_timestamp*/) override {
+    return Status::OK();
+  }
+
   ColumnFamilyHandle* DefaultColumnFamily() const override { return nullptr; }
 
  private:
diff --git a/db/db_with_timestamp_basic_test.cc b/db/db_with_timestamp_basic_test.cc
index cf088e7ae054..1e20ae018477 100644
--- a/db/db_with_timestamp_basic_test.cc
+++ b/db/db_with_timestamp_basic_test.cc
@@ -4914,6 +4914,117 @@ TEST_F(DBBasicTestWithTimestamp, TimestampFilterTableReadOnGet) {
   Close();
 }
 
+class GetNewestUserDefinedTimestampTest : public DBBasicTestWithTimestampBase {
+ public:
+  explicit GetNewestUserDefinedTimestampTest()
+      : DBBasicTestWithTimestampBase("get_newest_udt_test") {}
+};
+
+TEST_F(GetNewestUserDefinedTimestampTest, Basic) {
+  std::string newest_timestamp;
+  // UDT disabled, get InvalidArgument.
+  ASSERT_TRUE(db_->GetNewestUserDefinedTimestamp(nullptr, &newest_timestamp)
+                  .IsInvalidArgument());
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  options.max_write_buffer_number = 5;
+  options.min_write_buffer_number_to_merge = 4;
+  options.comparator = test::BytewiseComparatorWithU64TsWrapper();
+
+  DestroyAndReopen(options);
+  // UDT persisted, get NotSupported.
+  ASSERT_TRUE(db_->GetNewestUserDefinedTimestamp(nullptr, &newest_timestamp)
+                  .IsNotSupported());
+
+  options.persist_user_defined_timestamps = false;
+  options.allow_concurrent_memtable_write = false;
+
+  DestroyAndReopen(options);
+  ASSERT_TRUE(
+      db_->GetNewestUserDefinedTimestamp(nullptr, nullptr).IsInvalidArgument());
+
+  ColumnFamilyHandleImpl* cfh = static_cast_with_check<ColumnFamilyHandleImpl>(
+      db_->DefaultColumnFamily());
+  ColumnFamilyData* cfd = cfh->cfd();
+  // The column family hasn't seen any user defined timestamp
+  ASSERT_OK(db_->GetNewestUserDefinedTimestamp(nullptr, &newest_timestamp));
+  ASSERT_TRUE(newest_timestamp.empty());
+
+  ASSERT_OK(db_->Put(WriteOptions(), Key(1), EncodeAsUint64(1), "val1"));
+  // Testing get newest timestamp from mutable memtable.
+  ASSERT_OK(db_->GetNewestUserDefinedTimestamp(nullptr, &newest_timestamp));
+  ASSERT_EQ(EncodeAsUint64(1), newest_timestamp);
+
+  ASSERT_OK(db_->Put(WriteOptions(), Key(1), EncodeAsUint64(2), "val2"));
+  ASSERT_OK(dbfull()->TEST_SwitchMemtable(cfd));
+  // Testing get the newest timestamp from immutable memtable because the
+  // mutable one is empty.
+  ASSERT_OK(db_->GetNewestUserDefinedTimestamp(nullptr, &newest_timestamp));
+  ASSERT_EQ(EncodeAsUint64(2), newest_timestamp);
+
+  ASSERT_OK(db_->Put(WriteOptions(), Key(1), EncodeAsUint64(3), "val3"));
+  ASSERT_OK(db_->Put(WriteOptions(), Key(1), EncodeAsUint64(4), "val4"));
+  ASSERT_OK(dbfull()->TEST_SwitchMemtable(cfd));
+  // Testing get the newest timestamp from the more recent immutable memtable
+  // when there are multiple immutable memtables.
+  ASSERT_OK(db_->GetNewestUserDefinedTimestamp(nullptr, &newest_timestamp));
+  ASSERT_EQ(EncodeAsUint64(4), newest_timestamp);
+
+  ASSERT_OK(db_->Put(WriteOptions(), Key(1), EncodeAsUint64(5), "val5"));
+  // Testing get newest timestamp from mutable memtable when it has data, in the
+  // presence of immutable memtables.
+  ASSERT_OK(db_->GetNewestUserDefinedTimestamp(nullptr, &newest_timestamp));
+  ASSERT_EQ(EncodeAsUint64(5), newest_timestamp);
+
+  ASSERT_OK(Flush());
+  // After flushing and all the user defined timestamp are flushed. User defined
+  // timestamp info for SST files is available from MANIFEST.
+  ASSERT_OK(db_->GetNewestUserDefinedTimestamp(nullptr, &newest_timestamp));
+  ASSERT_EQ(EncodeAsUint64(5), newest_timestamp);
+
+  Reopen(options);
+  // Similar after flush, when there is no memtables, but some SST files,
+  // if MANIFEST records the upperbound of flushed timestamps because timestamps
+  // are not persisted in SST files, this info can be found.
+  ASSERT_OK(db_->GetNewestUserDefinedTimestamp(nullptr, &newest_timestamp));
+  ASSERT_EQ(EncodeAsUint64(5), newest_timestamp);
+
+  Close();
+}
+
+TEST_F(GetNewestUserDefinedTimestampTest, ConcurrentWrites) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.comparator = test::BytewiseComparatorWithU64TsWrapper();
+  options.persist_user_defined_timestamps = false;
+  options.allow_concurrent_memtable_write = false;
+
+  DestroyAndReopen(options);
+
+  std::vector<std::thread> threads;
+  threads.reserve(10);
+  std::atomic<uint64_t> current_ts{0};
+  for (int i = 0; i < 10; i++) {
+    threads.emplace_back([this, i, &current_ts]() {
+      if (i % 2 == 0) {
+        std::string newest_timestamp;
+        ASSERT_OK(
+            db_->GetNewestUserDefinedTimestamp(nullptr, &newest_timestamp));
+      } else {
+        uint64_t write_ts = current_ts.fetch_add(1);
+        ASSERT_OK(db_->Put(WriteOptions(), Key(1), EncodeAsUint64(write_ts),
+                           "val" + std::to_string(i)));
+      }
+    });
+  }
+
+  for (auto& t : threads) {
+    t.join();
+  }
+  Close();
+}
+
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/db/memtable.cc b/db/memtable.cc
index 396d21404bba..5b4bfdd9c936 100644
--- a/db/memtable.cc
+++ b/db/memtable.cc
@@ -147,7 +147,6 @@ MemTable::MemTable(const InternalKeyComparator& cmp,
   const Comparator* ucmp = cmp.user_comparator();
   assert(ucmp);
   ts_sz_ = ucmp->timestamp_size();
-  persist_user_defined_timestamps_ = ioptions.persist_user_defined_timestamps;
 }
 
 MemTable::~MemTable() {
@@ -1806,7 +1805,7 @@ uint64_t MemTable::GetMinLogContainingPrepSection() {
 }
 
 void MemTable::MaybeUpdateNewestUDT(const Slice& user_key) {
-  if (ts_sz_ == 0 || persist_user_defined_timestamps_) {
+  if (ts_sz_ == 0) {
     return;
   }
   const Comparator* ucmp = GetInternalKeyComparator().user_comparator();
@@ -1817,9 +1816,7 @@ void MemTable::MaybeUpdateNewestUDT(const Slice& user_key) {
 }
 
 const Slice& MemTable::GetNewestUDT() const {
-  // This path should not be invoked for MemTables that does not enable the UDT
-  // in Memtable only feature.
-  assert(ts_sz_ > 0 && !persist_user_defined_timestamps_);
+  assert(ts_sz_ > 0);
   return newest_udt_;
 }
 
diff --git a/db/memtable.h b/db/memtable.h
index bd64499024f6..3968cfb4b180 100644
--- a/db/memtable.h
+++ b/db/memtable.h
@@ -825,6 +825,8 @@ class MemTable final : public ReadOnlyMemTable {
            is_range_del_table_empty_;
   }
 
+  //  Gets the newest user defined timestamps in the memtable. This should only
+  //  be called when user defined timestamp is enabled.
   const Slice& GetNewestUDT() const override;
 
   // Returns Corruption status if verification fails.
@@ -900,14 +902,10 @@ class MemTable final : public ReadOnlyMemTable {
   // Size in bytes for the user-defined timestamps.
   size_t ts_sz_;
 
-  // Whether to persist user-defined timestamps
-  bool persist_user_defined_timestamps_;
-
   // Newest user-defined timestamp contained in this MemTable. For ts1, and ts2
   // if Comparator::CompareTimestamp(ts1, ts2) > 0, ts1 is considered newer than
   // ts2. We track this field for a MemTable if its column family has UDT
-  // feature enabled and the `persist_user_defined_timestamp` flag is false.
-  // Otherwise, this field just contains an empty Slice.
+  // feature enabled.
   Slice newest_udt_;
 
   // Updates flush_state_ using ShouldFlushNow()
diff --git a/db/memtable_list.cc b/db/memtable_list.cc
index 4e6587792971..93d8b05f836d 100644
--- a/db/memtable_list.cc
+++ b/db/memtable_list.cc
@@ -374,6 +374,19 @@ bool MemTableListVersion::TrimHistory(autovector<ReadOnlyMemTable*>* to_delete,
   return ret;
 }
 
+const Slice& MemTableListVersion::GetNewestUDT() const {
+  static Slice kEmptySlice;
+  for (auto it = memlist_.begin(); it != memlist_.end(); ++it) {
+    ReadOnlyMemTable* m = *it;
+    Slice timestamp = m->GetNewestUDT();
+    assert(!timestamp.empty() || m->IsEmpty());
+    if (!timestamp.empty()) {
+      return m->GetNewestUDT();
+    }
+  }
+  return kEmptySlice;
+}
+
 // Returns true if there is at least one memtable on which flush has
 // not yet started.
 bool MemTableList::IsFlushPending() const {
diff --git a/db/memtable_list.h b/db/memtable_list.h
index 4d06421ba41c..eb42e1c7276a 100644
--- a/db/memtable_list.h
+++ b/db/memtable_list.h
@@ -149,6 +149,12 @@ class MemTableListVersion {
 
   int NumFlushed() const { return static_cast<int>(memlist_history_.size()); }
 
+  // Gets the newest user defined timestamps from the immutable memtables.
+  // This returns the newest user defined timestamp found in the most recent
+  // immutable memtable. This should only be called when user defined timestamp
+  // is enabled.
+  const Slice& GetNewestUDT() const;
+
  private:
   friend class MemTableList;
 
diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h
index db2060c33b79..ae78b6a3ce1a 100644
--- a/include/rocksdb/db.h
+++ b/include/rocksdb/db.h
@@ -1798,6 +1798,25 @@ class DB {
   virtual Status GetFullHistoryTsLow(ColumnFamilyHandle* column_family,
                                      std::string* ts_low) = 0;
 
+  // EXPERIMENTAL
+  // Get the newest timestamp of the column family. This is only for when the
+  // column family enables user defined timestamp and when timestamps are not
+  // persisted in SST files, a.k.a `persist_user_defined_timestamps=false`.
+  // This checks the mutable memtable, the immutable memtable and the SST files,
+  // and returns the first newest user defined timestamp found.
+  // When user defined timestamp is not persisted in SST files, metadata in
+  // MANIFEST tracks the most recently seen timestamp for SST files, so the
+  // newest timestamp in SST files can be found.
+  // OK status is returned if finding the newest timestamp succeeds, if
+  // `newest_timestamp` is empty, it means the column family hasn't seen any
+  // timestamp. The returned timestamp is encoded, util method `DecodeU64Ts` can
+  // be used to decode it into uint64_t.
+  // User-defined timestamp is required to be increasing per key, the return
+  // value of this API would be most useful if the user-defined timestamp is
+  // monotonically increasing across keys.
+  virtual Status GetNewestUserDefinedTimestamp(
+      ColumnFamilyHandle* column_family, std::string* newest_timestamp) = 0;
+
   // Suspend deleting obsolete files. Compactions will continue to occur,
   // but no obsolete files will be deleted. To resume file deletions, each
   // call to DisableFileDeletions() must be matched by a subsequent call to
diff --git a/include/rocksdb/utilities/stackable_db.h b/include/rocksdb/utilities/stackable_db.h
index b4fa0fc92d91..89549941cb91 100644
--- a/include/rocksdb/utilities/stackable_db.h
+++ b/include/rocksdb/utilities/stackable_db.h
@@ -512,6 +512,11 @@ class StackableDB : public DB {
     return db_->GetFullHistoryTsLow(column_family, ts_low);
   }
 
+  Status GetNewestUserDefinedTimestamp(ColumnFamilyHandle* column_family,
+                                       std::string* newest_timestamp) override {
+    return db_->GetNewestUserDefinedTimestamp(column_family, newest_timestamp);
+  }
+
   Status GetSortedWalFiles(VectorWalPtr& files) override {
     return db_->GetSortedWalFiles(files);
   }
diff --git a/unreleased_history/new_features/get_newest_udt.md b/unreleased_history/new_features/get_newest_udt.md
new file mode 100644
index 000000000000..920208686d6d
--- /dev/null
+++ b/unreleased_history/new_features/get_newest_udt.md
@@ -0,0 +1 @@
+A new API DB::GetNewestUserDefinedTimestamp is added to return the newest user defined timestamp seen in a column family
\ No newline at end of file
diff --git a/util/udt_util.cc b/util/udt_util.cc
index 3246574d61bb..555dcf5d1645 100644
--- a/util/udt_util.cc
+++ b/util/udt_util.cc
@@ -429,6 +429,20 @@ void GetFullHistoryTsLowFromU64CutoffTs(Slice* cutoff_ts,
   PutFixed64(full_history_ts_low, cutoff_udt_ts + 1);
 }
 
+void GetU64CutoffTsFromFullHistoryTsLow(Slice* full_history_ts_low,
+                                        std::string* cutoff_ts) {
+  uint64_t full_history_ts_low_int = 0;
+  [[maybe_unused]] bool format_res =
+      GetFixed64(full_history_ts_low, &full_history_ts_low_int);
+  assert(format_res);
+  assert(full_history_ts_low_int > 0);
+  if (full_history_ts_low_int > 0) {
+    PutFixed64(cutoff_ts, full_history_ts_low_int - 1);
+  } else {
+    PutFixed64(cutoff_ts, 0);
+  }
+}
+
 std::tuple<OptSlice, OptSlice> MaybeAddTimestampsToRange(
     const OptSlice& start, const OptSlice& end, size_t ts_sz,
     std::string* start_with_ts, std::string* end_with_ts, bool exclusive_end) {
diff --git a/util/udt_util.h b/util/udt_util.h
index 8252bab64fca..a9736e433d6c 100644
--- a/util/udt_util.h
+++ b/util/udt_util.h
@@ -275,6 +275,10 @@ Status ValidateUserDefinedTimestampsOptions(
 void GetFullHistoryTsLowFromU64CutoffTs(Slice* cutoff_ts,
                                         std::string* full_history_ts_low);
 
+// The reverse of `GetFullHistoryTsLowFromU64CutoffTs`.
+void GetU64CutoffTsFromFullHistoryTsLow(Slice* full_history_ts_low,
+                                        std::string* cutoff_ts);
+
 // `start` is the inclusive lower user key bound without user-defined timestamp.
 // `end` is the upper user key bound without user-defined timestamp.
 // By default, `end` is treated as being exclusive. If `exclusive_end` is set to

From 9b186c8d11c5fe20d1f988673e7b1e698ac6f69d Mon Sep 17 00:00:00 2001
From: Jay Huh <jewoongh@meta.com>
Date: Thu, 17 Apr 2025 17:43:05 -0700
Subject: [PATCH 061/500] Add base_input_level and output_level in
 CompactionServiceJobInfo (#13555)

Summary:
Similar to https://github.com/facebook/rocksdb/pull/13029, add `base_input_level` (a.k.a. start_level) and `output_level` to `CompactionServiceJobInfo`

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13555

Test Plan:
Updated Unit Tests
```
./compaction_service_test
```

Reviewed By: anand1976

Differential Revision: D73213504

Pulled By: jaykorean

fbshipit-source-id: abb3b0025bc12245b812ef589fe77e9a30ba0c46
---
 db/compaction/compaction_service_job.cc  |  4 +-
 db/compaction/compaction_service_test.cc | 66 +++++++++++++++++++-----
 include/rocksdb/options.h                | 13 ++++-
 3 files changed, 66 insertions(+), 17 deletions(-)

diff --git a/db/compaction/compaction_service_job.cc b/db/compaction/compaction_service_job.cc
index 7a6b07c5d9ea..620d12c81068 100644
--- a/db/compaction/compaction_service_job.cc
+++ b/db/compaction/compaction_service_job.cc
@@ -77,7 +77,9 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService(
       dbname_, db_id_, db_session_id_, GetCompactionId(sub_compact),
       thread_pri_, compaction->compaction_reason(),
       compaction->is_full_compaction(), compaction->is_manual_compaction(),
-      compaction->bottommost_level());
+      compaction->bottommost_level(), compaction->start_level(),
+      compaction->output_level());
+
   CompactionServiceScheduleResponse response =
       db_options_.compaction_service->Schedule(info, compaction_input_binary);
   switch (response.status) {
diff --git a/db/compaction/compaction_service_test.cc b/db/compaction/compaction_service_test.cc
index d6680ac62db1..573e99ece68c 100644
--- a/db/compaction/compaction_service_test.cc
+++ b/db/compaction/compaction_service_test.cc
@@ -22,9 +22,9 @@ class MyTestCompactionService : public CompactionService {
         options_(options),
         statistics_(statistics),
         start_info_("na", "na", "na", 0, Env::TOTAL, CompactionReason::kUnknown,
-                    false, false, false),
+                    false, false, false, -1, -1),
         wait_info_("na", "na", "na", 0, Env::TOTAL, CompactionReason::kUnknown,
-                   false, false, false),
+                   false, false, false, -1, -1),
         listeners_(listeners),
         table_properties_collector_factories_(
             std::move(table_properties_collector_factories)) {}
@@ -1266,17 +1266,31 @@ TEST_F(CompactionServiceTest, PrecludeLastLevel) {
 
   // Verify Output Stats
   auto my_cs = GetCompactionService();
-  CompactionServiceResult result;
-  my_cs->GetResult(&result);
-  ASSERT_OK(result.status);
-  ASSERT_GT(result.internal_stats.output_level_stats.cpu_micros, 0);
-  ASSERT_GT(result.internal_stats.output_level_stats.micros, 0);
-  ASSERT_EQ(result.internal_stats.output_level_stats.num_output_records +
-                result.internal_stats.proximal_level_stats.num_output_records,
-            kNumTrigger * kNumKeys);
-  ASSERT_EQ(result.internal_stats.output_level_stats.num_output_files +
-                result.internal_stats.proximal_level_stats.num_output_files,
-            2);
+  {
+    CompactionServiceResult result;
+    my_cs->GetResult(&result);
+    ASSERT_OK(result.status);
+    ASSERT_GT(result.internal_stats.output_level_stats.cpu_micros, 0);
+    ASSERT_GT(result.internal_stats.output_level_stats.micros, 0);
+    ASSERT_EQ(result.internal_stats.output_level_stats.num_output_records +
+                  result.internal_stats.proximal_level_stats.num_output_records,
+              kNumTrigger * kNumKeys);
+    ASSERT_EQ(result.internal_stats.output_level_stats.num_output_files +
+                  result.internal_stats.proximal_level_stats.num_output_files,
+              2);
+
+    CompactionServiceJobInfo info = my_cs->GetCompactionInfoForStart();
+    ASSERT_EQ(0, info.base_input_level);
+    ASSERT_EQ(kNumLevels - 1, info.output_level);
+  }
+  SyncPoint::GetInstance()->DisableProcessing();
+  // Disable Preclude feature and run full compaction to the bottommost level
+  {
+    ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+    CompactionServiceJobInfo info = my_cs->GetCompactionInfoForStart();
+    ASSERT_EQ(kNumLevels - 2, info.base_input_level);
+    ASSERT_EQ(kNumLevels - 1, info.output_level);
+  }
 }
 
 TEST_F(CompactionServiceTest, ConcurrentCompaction) {
@@ -1346,12 +1360,16 @@ TEST_F(CompactionServiceTest, CompactionInfo) {
   ASSERT_EQ(true, info.is_manual_compaction);
   ASSERT_EQ(false, info.is_full_compaction);
   ASSERT_EQ(true, info.bottommost_level);
+  ASSERT_EQ(1, info.base_input_level);
+  ASSERT_EQ(2, info.output_level);
   info = my_cs->GetCompactionInfoForWait();
   ASSERT_EQ(Env::USER, info.priority);
   ASSERT_EQ(CompactionReason::kManualCompaction, info.compaction_reason);
   ASSERT_EQ(true, info.is_manual_compaction);
   ASSERT_EQ(false, info.is_full_compaction);
   ASSERT_EQ(true, info.bottommost_level);
+  ASSERT_EQ(1, info.base_input_level);
+  ASSERT_EQ(2, info.output_level);
 
   // Test priority BOTTOM
   env_->SetBackgroundThreads(1, Env::BOTTOM);
@@ -1383,18 +1401,24 @@ TEST_F(CompactionServiceTest, CompactionInfo) {
   ASSERT_EQ(false, info.is_full_compaction);
   ASSERT_EQ(true, info.bottommost_level);
   ASSERT_EQ(Env::BOTTOM, info.priority);
+  ASSERT_EQ(0, info.base_input_level);
+  ASSERT_EQ(db_->NumberLevels() - 1, info.output_level);
   info = my_cs->GetCompactionInfoForWait();
   ASSERT_EQ(Env::BOTTOM, info.priority);
   ASSERT_EQ(CompactionReason::kLevelL0FilesNum, info.compaction_reason);
   ASSERT_EQ(false, info.is_manual_compaction);
   ASSERT_EQ(false, info.is_full_compaction);
   ASSERT_EQ(true, info.bottommost_level);
+  ASSERT_EQ(0, info.base_input_level);
+  ASSERT_EQ(db_->NumberLevels() - 1, info.output_level);
 
   // Test Non-Bottommost Level
   options.num_levels = 4;
   ReopenWithCompactionService(&options);
   my_cs =
       static_cast_with_check<MyTestCompactionService>(GetCompactionService());
+  int compaction_num = my_cs->GetCompactionNum();
+  ASSERT_EQ(0, compaction_num);
 
   for (int i = 0; i < options.level0_file_num_compaction_trigger; i++) {
     for (int j = 0; j < 10; j++) {
@@ -1403,16 +1427,22 @@ TEST_F(CompactionServiceTest, CompactionInfo) {
     }
     ASSERT_OK(Flush());
   }
-
   ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // This is trivial move. Done locally.
+  ASSERT_EQ(0, my_cs->GetCompactionNum());
   info = my_cs->GetCompactionInfoForStart();
   ASSERT_EQ(false, info.is_manual_compaction);
   ASSERT_EQ(false, info.is_full_compaction);
   ASSERT_EQ(false, info.bottommost_level);
+  ASSERT_EQ(-1, info.base_input_level);
+  ASSERT_EQ(-1, info.output_level);
   info = my_cs->GetCompactionInfoForWait();
   ASSERT_EQ(false, info.is_manual_compaction);
   ASSERT_EQ(false, info.is_full_compaction);
   ASSERT_EQ(false, info.bottommost_level);
+  ASSERT_EQ(-1, info.base_input_level);
+  ASSERT_EQ(-1, info.output_level);
 
   // Test Full Compaction + Bottommost Level
   options.num_levels = 6;
@@ -1427,7 +1457,10 @@ TEST_F(CompactionServiceTest, CompactionInfo) {
     }
     ASSERT_OK(Flush());
   }
+  MoveFilesToLevel(options.num_levels - 1);
 
+  // Force final level compaction
+  // base_input_level == output_level == last_level
   CompactRangeOptions cro;
   cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
   ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
@@ -1439,10 +1472,15 @@ TEST_F(CompactionServiceTest, CompactionInfo) {
   ASSERT_EQ(true, info.bottommost_level);
   ASSERT_EQ(CompactionReason::kManualCompaction, info.compaction_reason);
   info = my_cs->GetCompactionInfoForWait();
+  ASSERT_EQ(options.num_levels - 1, info.base_input_level);
+  ASSERT_EQ(options.num_levels - 1, info.output_level);
   ASSERT_EQ(true, info.is_manual_compaction);
   ASSERT_EQ(true, info.is_full_compaction);
   ASSERT_EQ(true, info.bottommost_level);
   ASSERT_EQ(CompactionReason::kManualCompaction, info.compaction_reason);
+  ASSERT_EQ(options.num_levels - 1, info.base_input_level);
+  ASSERT_EQ(options.num_levels - 1, info.output_level);
+  ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
 }
 
 TEST_F(CompactionServiceTest, FallbackLocalAuto) {
diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index 73e00bc49880..947df9855e6c 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -477,12 +477,19 @@ struct CompactionServiceJobInfo {
   bool is_manual_compaction;
   bool bottommost_level;
 
+  // the smallest input level of the compaction.
+  // (same as Compaction::start_level and CompactionJobInfo::base_input_level)
+  int base_input_level;
+  // the output level of the compaction.
+  int output_level;
+
   CompactionServiceJobInfo(std::string db_name_, std::string db_id_,
                            std::string db_session_id_, uint64_t job_id_,
                            Env::Priority priority_,
                            CompactionReason compaction_reason_,
                            bool is_full_compaction_, bool is_manual_compaction_,
-                           bool bottommost_level_)
+                           bool bottommost_level_, int base_input_level_,
+                           int output_level_)
       : db_name(std::move(db_name_)),
         db_id(std::move(db_id_)),
         db_session_id(std::move(db_session_id_)),
@@ -491,7 +498,9 @@ struct CompactionServiceJobInfo {
         compaction_reason(compaction_reason_),
         is_full_compaction(is_full_compaction_),
         is_manual_compaction(is_manual_compaction_),
-        bottommost_level(bottommost_level_) {}
+        bottommost_level(bottommost_level_),
+        base_input_level(base_input_level_),
+        output_level(output_level_) {}
 };
 
 struct CompactionServiceScheduleResponse {

From 05fa171beb4fdda0efae51e8b2175bff80cbd859 Mon Sep 17 00:00:00 2001
From: Jay Huh <jewoongh@meta.com>
Date: Fri, 18 Apr 2025 16:43:56 -0700
Subject: [PATCH 062/500] Add Logger to CompactionServiceOptionsOverride
 (#13559)

Summary:
As title

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13559

Test Plan: CI

Reviewed By: anand1976

Differential Revision: D73267683

Pulled By: jaykorean

fbshipit-source-id: 6a3d3da07a36ad3bbfad3f749e7dfd67b7b626c8
---
 db/compaction/compaction_service_test.cc | 1 +
 db/db_impl/db_impl_secondary.cc          | 1 +
 include/rocksdb/options.h                | 3 +++
 3 files changed, 5 insertions(+)

diff --git a/db/compaction/compaction_service_test.cc b/db/compaction/compaction_service_test.cc
index 573e99ece68c..1f245cb62c09 100644
--- a/db/compaction/compaction_service_test.cc
+++ b/db/compaction/compaction_service_test.cc
@@ -84,6 +84,7 @@ class MyTestCompactionService : public CompactionService {
     options_override.table_factory = options_.table_factory;
     options_override.sst_partitioner_factory = options_.sst_partitioner_factory;
     options_override.statistics = statistics_;
+    options_override.info_log = options_.info_log;
     if (!listeners_.empty()) {
       options_override.listeners = listeners_;
     }
diff --git a/db/db_impl/db_impl_secondary.cc b/db/db_impl/db_impl_secondary.cc
index 6e6b248d76c6..5f6a2db3f72c 100644
--- a/db/db_impl/db_impl_secondary.cc
+++ b/db/db_impl/db_impl_secondary.cc
@@ -1014,6 +1014,7 @@ Status DB::OpenAndCompact(
   // We will close the DB after the compaction anyway.
   // Open as many files as needed for the compaction.
   db_options.max_open_files = -1;
+  db_options.info_log = override_options.info_log;
 
   // 4. Filter CFs that are needed for OpenAndCompact()
   // We do not need to open all column families for the remote compaction.
diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index 947df9855e6c..230d708263a6 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -2515,6 +2515,9 @@ struct CompactionServiceOptionsOverride {
   // to set it here.
   std::shared_ptr<Statistics> statistics = nullptr;
 
+  // Info Log. If not overriden, default one will be used.
+  std::shared_ptr<Logger> info_log = nullptr;
+
   // Only compaction generated SST files use this user defined table properties
   // collector.
   std::vector<std::shared_ptr<TablePropertiesCollectorFactory>>

From 0be3abf7b6cfa763b2b7c561b9c1d862c868ed90 Mon Sep 17 00:00:00 2001
From: Jay Huh <jewoongh@meta.com>
Date: Mon, 21 Apr 2025 10:19:14 -0700
Subject: [PATCH 063/500] Arbitrary string map in
 CompactionServiceOptionsOverride (#13552)

Summary:
Adding an arbitrary options map so that any additional overridable options can be added without RocksDB change. Unknown options will be ignored

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13552

Test Plan:
Unit Test added
```
./db_secondary_test -- --gtest_filter="*OptionsOverrideTest*"
```

Reviewed By: hx235

Differential Revision: D73203789

Pulled By: jaykorean

fbshipit-source-id: 176bd9849d2bc60e78657c119e10a1a2a0988cd1
---
 db/db_impl/db_impl_secondary.cc               | 31 +++++++-
 db/db_secondary_test.cc                       | 75 +++++++++++++++++++
 include/rocksdb/options.h                     |  3 +
 ..._in_compaction_service_options_override.md |  1 +
 4 files changed, 107 insertions(+), 3 deletions(-)
 create mode 100644 unreleased_history/public_api_changes/options_map_in_compaction_service_options_override.md

diff --git a/db/db_impl/db_impl_secondary.cc b/db/db_impl/db_impl_secondary.cc
index 5f6a2db3f72c..5f0c8bb8b295 100644
--- a/db/db_impl/db_impl_secondary.cc
+++ b/db/db_impl/db_impl_secondary.cc
@@ -984,7 +984,7 @@ Status DB::OpenAndCompact(
   }
 
   // 2. Load the options
-  DBOptions db_options;
+  DBOptions base_db_options;
   ConfigOptions config_options;
   config_options.env = override_options.env;
   config_options.ignore_unknown_options = true;
@@ -997,13 +997,22 @@ Status DB::OpenAndCompact(
   std::string options_file_name =
       OptionsFileName(name, compaction_input.options_file_number);
 
-  s = LoadOptionsFromFile(config_options, options_file_name, &db_options,
+  s = LoadOptionsFromFile(config_options, options_file_name, &base_db_options,
                           &all_column_families);
   if (!s.ok()) {
     return s;
   }
 
-  // 3. Override pointer configurations in DBOptions with
+  // 3. Options to Override
+  // Override serializable configurations from override_options.options_map
+  DBOptions db_options;
+  s = GetDBOptionsFromMap(config_options, base_db_options,
+                          override_options.options_map, &db_options);
+  if (!s.ok()) {
+    return s;
+  }
+
+  // Override options that are directly set as shared ptrs in
   // CompactionServiceOptionsOverride
   db_options.env = override_options.env;
   db_options.file_checksum_gen_factory =
@@ -1024,6 +1033,18 @@ Status DB::OpenAndCompact(
   std::vector<ColumnFamilyDescriptor> column_families;
   for (auto& cf : all_column_families) {
     if (cf.name == compaction_input.cf_name) {
+      ColumnFamilyOptions cf_options;
+      // Override serializable configurations from override_options.options_map
+      s = GetColumnFamilyOptionsFromMap(config_options, cf.options,
+                                        override_options.options_map,
+                                        &cf_options);
+      if (!s.ok()) {
+        return s;
+      }
+      cf.options = std::move(cf_options);
+
+      // Override options that are directly set as shared ptrs in
+      // CompactionServiceOptionsOverride
       cf.options.comparator = override_options.comparator;
       cf.options.merge_operator = override_options.merge_operator;
       cf.options.compaction_filter = override_options.compaction_filter;
@@ -1035,6 +1056,7 @@ Status DB::OpenAndCompact(
           override_options.sst_partitioner_factory;
       cf.options.table_properties_collector_factories =
           override_options.table_properties_collector_factories;
+
       column_families.emplace_back(cf);
     } else if (cf.name == kDefaultColumnFamilyName) {
       column_families.emplace_back(cf);
@@ -1051,6 +1073,9 @@ Status DB::OpenAndCompact(
   }
   assert(db);
 
+  TEST_SYNC_POINT_CALLBACK(
+      "DBImplSecondary::OpenAndCompact::AfterOpenAsSecondary:0", db);
+
   // 6. Find the handle of the Column Family that this will compact
   ColumnFamilyHandle* cfh = nullptr;
   for (auto* handle : handles) {
diff --git a/db/db_secondary_test.cc b/db/db_secondary_test.cc
index 5be4feecf74c..e983a580b9a2 100644
--- a/db/db_secondary_test.cc
+++ b/db/db_secondary_test.cc
@@ -508,6 +508,81 @@ TEST_F(DBSecondaryTest, OpenAsSecondary) {
   verify_db_func("new_foo_value", "new_bar_value");
 }
 
+TEST_F(DBSecondaryTest, OptionsOverrideTest) {
+  Options options;
+  options.env = env_;
+  options.preserve_internal_time_seconds = 300;
+  options.compaction_readahead_size = 200;
+  options.blob_compaction_readahead_size = 100;
+  Reopen(options);
+
+  for (int i = 0; i < 3; ++i) {
+    ASSERT_OK(Put("foo", "foo_value" + std::to_string(i)));
+    ASSERT_OK(Put("bar", "bar_value" + std::to_string(i)));
+    ASSERT_OK(Flush());
+  }
+
+  CompactionServiceInput input;
+
+  ColumnFamilyMetaData meta;
+  db_->GetColumnFamilyMetaData(&meta);
+  for (auto& file : meta.levels[0].files) {
+    ASSERT_EQ(0, meta.levels[0].level);
+    input.input_files.push_back(file.name);
+  }
+  ASSERT_EQ(input.input_files.size(), 3);
+
+  input.output_level = 1;
+  input.options_file_number = dbfull()->GetVersionSet()->options_file_number();
+  input.cf_name = kDefaultColumnFamilyName;
+  ASSERT_OK(db_->GetDbIdentity(input.db_id));
+
+  ASSERT_EQ(db_->GetOptions().compaction_readahead_size, 200);
+  ASSERT_EQ(db_->GetOptions().blob_compaction_readahead_size, 100);
+
+  Close();
+
+  std::string compaction_input_binary;
+  ASSERT_OK(input.Write(&compaction_input_binary));
+  std::string compaction_result_binary;
+
+  CompactionServiceOptionsOverride override_options;
+  override_options.env = env_;
+  override_options.table_factory.reset(
+      NewBlockBasedTableFactory(BlockBasedTableOptions()));
+
+  ASSERT_OK(
+      StringToMap("compaction_readahead_size=8388608;"
+                  "blob_compaction_readahead_size=4194304;"
+                  "some_invalid_option=ignore_me;"
+                  "env=this_should_not_fail;"
+                  "max_open_files=100;",  // this should be always overriden as
+                                          // -1 in remote compaction
+                  &override_options.options_map));
+
+  bool verified = false;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImplSecondary::OpenAndCompact::AfterOpenAsSecondary:0",
+      [&](void* arg) {
+        auto secondary_db = static_cast<DB*>(arg);
+        auto secondary_db_options = secondary_db->GetOptions();
+        // DBOption
+        ASSERT_EQ(secondary_db_options.compaction_readahead_size, 8388608);
+        ASSERT_EQ(secondary_db_options.max_open_files, -1);
+        // CFOption
+        ASSERT_EQ(secondary_db_options.blob_compaction_readahead_size, 4194304);
+        verified = true;
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(DB::OpenAndCompact(OpenAndCompactOptions(), dbname_,
+                               secondary_path_, compaction_input_binary,
+                               &compaction_result_binary, override_options));
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  ASSERT_TRUE(verified);
+}
+
 namespace {
 class TraceFileEnv : public EnvWrapper {
  public:
diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index 230d708263a6..63fdcb15da8c 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -2522,6 +2522,9 @@ struct CompactionServiceOptionsOverride {
   // collector.
   std::vector<std::shared_ptr<TablePropertiesCollectorFactory>>
       table_properties_collector_factories;
+
+  // All other options to override. Unknown options will be ignored.
+  std::unordered_map<std::string, std::string> options_map;
 };
 
 struct OpenAndCompactOptions {
diff --git a/unreleased_history/public_api_changes/options_map_in_compaction_service_options_override.md b/unreleased_history/public_api_changes/options_map_in_compaction_service_options_override.md
new file mode 100644
index 000000000000..530599233bc0
--- /dev/null
+++ b/unreleased_history/public_api_changes/options_map_in_compaction_service_options_override.md
@@ -0,0 +1 @@
+Added arbitrary string map for additional options to be overriden for remote compactions

From 7eb1adb532648206f9f970211b6268e418f0830e Mon Sep 17 00:00:00 2001
From: anand76 <anand1976@users.noreply.github.com>
Date: Mon, 21 Apr 2025 10:36:45 -0700
Subject: [PATCH 064/500] Pass FSWritableFile pointer to ExternalTableBuilder
 (#13560)

Summary: Pull Request resolved: https://github.com/facebook/rocksdb/pull/13560

Reviewed By: jaykorean

Differential Revision: D73296242

Pulled By: anand1976

fbshipit-source-id: b692a5c6ad32b40b3c2c1ca7a93bd04139856bce
---
 include/rocksdb/external_table.h |  4 +++-
 table/external_table.cc          |  3 ++-
 table/table_test.cc              | 25 ++++++++++++++++---------
 3 files changed, 21 insertions(+), 11 deletions(-)

diff --git a/include/rocksdb/external_table.h b/include/rocksdb/external_table.h
index cfa4152c5a7b..3adfdf4f3368 100644
--- a/include/rocksdb/external_table.h
+++ b/include/rocksdb/external_table.h
@@ -244,9 +244,11 @@ class ExternalTableFactory : public Customizable {
       const ExternalTableOptions& table_options,
       std::unique_ptr<ExternalTableReader>* table_reader) const = 0;
 
+  // The table builder should use the file pointer to append to the file.
+  // Do not sync or close the file after finishing. RocksDB will do that.
   virtual ExternalTableBuilder* NewTableBuilder(
       const ExternalTableBuilderOptions& builder_options,
-      const std::string& file_path) const = 0;
+      const std::string& file_path, FSWritableFile* file) const = 0;
 };
 
 // Allocate a TableFactory that wraps around an ExternalTableFactory. Use this
diff --git a/table/external_table.cc b/table/external_table.cc
index 9fed7d8024b8..2858543118a3 100644
--- a/table/external_table.cc
+++ b/table/external_table.cc
@@ -308,7 +308,8 @@ class ExternalTableFactoryAdapter : public TableFactory {
         topts.read_options, topts.write_options,
         topts.moptions.prefix_extractor, topts.ioptions.user_comparator,
         topts.column_family_name, topts.reason);
-    builder.reset(inner_->NewTableBuilder(ext_topts, file->file_name()));
+    builder.reset(inner_->NewTableBuilder(ext_topts, file->file_name(),
+                                          file->writable_file()));
     if (builder) {
       return new ExternalTableBuilderAdapter(std::move(builder));
     }
diff --git a/table/table_test.cc b/table/table_test.cc
index 151b24b6b7ef..692c028d328c 100644
--- a/table/table_test.cc
+++ b/table/table_test.cc
@@ -6533,8 +6533,9 @@ class ExternalTableReaderTest : public DBTestBase {
  protected:
   class DummyExternalTableFile {
    public:
-    explicit DummyExternalTableFile(const std::string& file_path)
-        : file_path_(file_path), file_size_(0) {
+    explicit DummyExternalTableFile(const std::string& file_path,
+                                    FSWritableFile* file)
+        : file_path_(file_path), file_(file), file_size_(0) {
       props_.comparator_name = BytewiseComparator()->Name();
     }
 
@@ -6547,7 +6548,11 @@ class ExternalTableReaderTest : public DBTestBase {
       }
       props_.num_entries = kv_vec.size();
       file_size_ = buf_.length();
-      return WriteStringToFile(Env::Default(), buf_, file_path_);
+      if (file_) {
+        return file_->Append(buf_, IOOptions(), /*dbg=*/nullptr);
+      } else {
+        return WriteStringToFile(Env::Default(), buf_, file_path_);
+      }
     }
 
     Status Deserialize(std::map<std::string, std::string>& kv_map) {
@@ -6610,6 +6615,7 @@ class ExternalTableReaderTest : public DBTestBase {
     }
 
     std::string file_path_;
+    FSWritableFile* file_;
     std::string buf_;
     TableProperties props_;
     uint64_t file_size_;
@@ -6759,7 +6765,7 @@ class ExternalTableReaderTest : public DBTestBase {
   class DummyExternalTableReader : public ExternalTableReader {
    public:
     explicit DummyExternalTableReader(const std::string& file_path)
-        : file_(file_path) {
+        : file_(file_path, /*file=*/nullptr) {
       Status s = file_.Deserialize(kv_map_);
       EXPECT_OK(s);
     }
@@ -6811,8 +6817,9 @@ class ExternalTableReaderTest : public DBTestBase {
 
   class DummyExternalTableBuilder : public ExternalTableBuilder {
    public:
-    explicit DummyExternalTableBuilder(const std::string& file_path)
-        : file_(file_path) {}
+    explicit DummyExternalTableBuilder(const std::string& file_path,
+                                       FSWritableFile* file)
+        : file_(file_path, file) {}
 
     void Add(const Slice& key, const Slice& value) override {
       if (!kv_vec_.empty()) {
@@ -6856,8 +6863,8 @@ class ExternalTableReaderTest : public DBTestBase {
 
     ExternalTableBuilder* NewTableBuilder(
         const ExternalTableBuilderOptions& /*opts*/,
-        const std::string& file_path) const override {
-      return new DummyExternalTableBuilder(file_path);
+        const std::string& file_path, FSWritableFile* file) const override {
+      return new DummyExternalTableBuilder(file_path, file);
     }
   };
 };
@@ -6874,7 +6881,7 @@ TEST_F(ExternalTableReaderTest, BasicTest) {
                                     std::shared_ptr<const SliceTransform>(),
                                     BytewiseComparator(), "default",
                                     TableFileCreationReason::kMisc),
-        file_path));
+        file_path, /*file=*/nullptr));
     builder->Add("foo", "bar");
     ASSERT_OK(builder->Finish());
   }

From c237022831aa129aa707bc28e0702a1617ef23b5 Mon Sep 17 00:00:00 2001
From: Jay Huh <jewoongh@meta.com>
Date: Mon, 21 Apr 2025 15:58:58 -0700
Subject: [PATCH 065/500] Update for next release 10.3.0 (#13566)

Summary:
Updated version, HISTORY and compatibility script for 10.3 release (no folly hash update in this release).

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13566

Test Plan: CI

Reviewed By: anand1976

Differential Revision: D73391839

Pulled By: jaykorean

fbshipit-source-id: 075bb1f9f25caf96c4fcca7f4a315666acd5a288
---
 HISTORY.md                                    | 23 +++++++++++++++++++
 tools/check_format_compatible.sh              |  2 +-
 .../behavior_changes/ra_stats_user_only.md    |  1 -
 ...x_reopened_writable_file_size_incorrect.md |  1 -
 .../bug_fixes/remote_compact_populate.md      |  1 -
 .../new_features/compact_ra_stats.md          |  2 --
 .../new_features/get_newest_udt.md            |  1 -
 .../new_features/ingest_wbwi.md               |  1 -
 .../tombstone_scan_flush_trigger.md           |  1 -
 ...ete_max_write_buffer_number_to_maintain.md |  1 -
 .../dep_max_compact_memtable_level.md         |  1 -
 .../deprecate-ignore-range-del.md             |  1 -
 .../deprecate-promote-l0.md                   |  1 -
 ..._in_compaction_service_options_override.md |  1 -
 .../remove_fail_if_options_file_error.md      |  1 -
 15 files changed, 24 insertions(+), 15 deletions(-)
 delete mode 100644 unreleased_history/behavior_changes/ra_stats_user_only.md
 delete mode 100644 unreleased_history/bug_fixes/fix_reopened_writable_file_size_incorrect.md
 delete mode 100644 unreleased_history/bug_fixes/remote_compact_populate.md
 delete mode 100644 unreleased_history/new_features/compact_ra_stats.md
 delete mode 100644 unreleased_history/new_features/get_newest_udt.md
 delete mode 100644 unreleased_history/new_features/ingest_wbwi.md
 delete mode 100644 unreleased_history/new_features/tombstone_scan_flush_trigger.md
 delete mode 100644 unreleased_history/public_api_changes/delete_max_write_buffer_number_to_maintain.md
 delete mode 100644 unreleased_history/public_api_changes/dep_max_compact_memtable_level.md
 delete mode 100644 unreleased_history/public_api_changes/deprecate-ignore-range-del.md
 delete mode 100644 unreleased_history/public_api_changes/deprecate-promote-l0.md
 delete mode 100644 unreleased_history/public_api_changes/options_map_in_compaction_service_options_override.md
 delete mode 100644 unreleased_history/public_api_changes/remove_fail_if_options_file_error.md

diff --git a/HISTORY.md b/HISTORY.md
index 9846f240916b..6a9ff81169fc 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -1,6 +1,29 @@
 # Rocksdb Change Log
 > NOTE: Entries for next release do not go here. Follow instructions in `unreleased_history/README.txt`
 
+## 10.2.0 (04/21/2025)
+### New Features
+* Provide histogram stats `COMPACTION_PREFETCH_BYTES` to measure number of bytes for RocksDB's prefetching (as opposed to file
+system's prefetch) on SST file during compaction read
+* A new API DB::GetNewestUserDefinedTimestamp is added to return the newest user defined timestamp seen in a column family
+* Introduce API `IngestWriteBatchWithIndex()` for ingesting updates into DB while bypassing memtable writes. This improves performance when writing a large write batch to the DB.
+* Add a new CF option `memtable_op_scan_flush_trigger` that triggers a flush of the memtable if an iterator's Seek()/Next() scans over a certain number of invisible entries from the memtable.
+
+### Public API Changes
+* AdvancedColumnFamilyOptions.max_write_buffer_number_to_maintain is deleted. It's deprecated since introduction of a better option max_write_buffer_size_to_maintain since RocksDB 6.5.0.
+* Deprecated API `DB::MaxMemCompactionLevel()`.
+* Deprecated `ReadOptions::ignore_range_deletions`.
+* Deprecated API `experimental::PromoteL0()`.
+* Added arbitrary string map for additional options to be overriden for remote compactions
+* The fail_if_options_file_error option in DBOptions has been removed. The behavior now is to always return failure in any API that fails to persist the OPTIONS file.
+
+### Behavior Changes
+* Make stats `PREFETCH_BYTES_USEFUL`, `PREFETCH_HITS`, `PREFETCH_BYTES` only account for prefetching during user initiated scan
+
+### Bug Fixes
+* Fix a bug in Posix file system that the FSWritableFile created via `FileSystem::ReopenWritableFile` internally does not track the correct file size.
+* Fix a bug where tail size of remote compaction output is not persisted in primary db's manifest
+
 ## 10.1.0 (03/24/2025)
 ### New Features
 * Added a new `DBOptions.calculate_sst_write_lifetime_hint_set` setting that allows to customize which compaction styles SST write lifetime hint calculation is allowed on. Today RocksDB supports only two modes `kCompactionStyleLevel` and `kCompactionStyleUniversal`.
diff --git a/tools/check_format_compatible.sh b/tools/check_format_compatible.sh
index 9b228f1b18bc..9aa8cc1a0401 100755
--- a/tools/check_format_compatible.sh
+++ b/tools/check_format_compatible.sh
@@ -135,7 +135,7 @@ EOF
 
 # To check for DB forward compatibility with loading options (old version
 # reading data from new), as well as backward compatibility
-declare -a db_forward_with_options_refs=("8.6.fb" "8.7.fb" "8.8.fb" "8.9.fb" "8.10.fb" "8.11.fb" "9.0.fb" "9.1.fb" "9.2.fb" "9.3.fb" "9.4.fb" "9.5.fb" "9.6.fb" "9.7.fb" "9.8.fb" "9.9.fb" "9.10.fb" "9.11.fb" "10.0.fb" "10.1.fb")
+declare -a db_forward_with_options_refs=("8.6.fb" "8.7.fb" "8.8.fb" "8.9.fb" "8.10.fb" "8.11.fb" "9.0.fb" "9.1.fb" "9.2.fb" "9.3.fb" "9.4.fb" "9.5.fb" "9.6.fb" "9.7.fb" "9.8.fb" "9.9.fb" "9.10.fb" "9.11.fb" "10.0.fb" "10.1.fb" "10.2.fb")
 # To check for DB forward compatibility without loading options (in addition
 # to the "with loading options" set), as well as backward compatibility
 declare -a db_forward_no_options_refs=() # N/A at the moment
diff --git a/unreleased_history/behavior_changes/ra_stats_user_only.md b/unreleased_history/behavior_changes/ra_stats_user_only.md
deleted file mode 100644
index ea219c3d4785..000000000000
--- a/unreleased_history/behavior_changes/ra_stats_user_only.md
+++ /dev/null
@@ -1 +0,0 @@
-Make stats `PREFETCH_BYTES_USEFUL`, `PREFETCH_HITS`, `PREFETCH_BYTES` only account for prefetching during user initiated scan
diff --git a/unreleased_history/bug_fixes/fix_reopened_writable_file_size_incorrect.md b/unreleased_history/bug_fixes/fix_reopened_writable_file_size_incorrect.md
deleted file mode 100644
index 405b8fb19203..000000000000
--- a/unreleased_history/bug_fixes/fix_reopened_writable_file_size_incorrect.md
+++ /dev/null
@@ -1 +0,0 @@
-Fix a bug in Posix file system that the FSWritableFile created via `FileSystem::ReopenWritableFile` internally does not track the correct file size.
\ No newline at end of file
diff --git a/unreleased_history/bug_fixes/remote_compact_populate.md b/unreleased_history/bug_fixes/remote_compact_populate.md
deleted file mode 100644
index e1bd531cb0c7..000000000000
--- a/unreleased_history/bug_fixes/remote_compact_populate.md
+++ /dev/null
@@ -1 +0,0 @@
-Fix a bug where tail size of remote compaction output is not persisted in primary db's manifest
diff --git a/unreleased_history/new_features/compact_ra_stats.md b/unreleased_history/new_features/compact_ra_stats.md
deleted file mode 100644
index 574b6b67ca2b..000000000000
--- a/unreleased_history/new_features/compact_ra_stats.md
+++ /dev/null
@@ -1,2 +0,0 @@
-Provide histogram stats `COMPACTION_PREFETCH_BYTES` to measure number of bytes for RocksDB's prefetching (as opposed to file
-system's prefetch) on SST file during compaction read
diff --git a/unreleased_history/new_features/get_newest_udt.md b/unreleased_history/new_features/get_newest_udt.md
deleted file mode 100644
index 920208686d6d..000000000000
--- a/unreleased_history/new_features/get_newest_udt.md
+++ /dev/null
@@ -1 +0,0 @@
-A new API DB::GetNewestUserDefinedTimestamp is added to return the newest user defined timestamp seen in a column family
\ No newline at end of file
diff --git a/unreleased_history/new_features/ingest_wbwi.md b/unreleased_history/new_features/ingest_wbwi.md
deleted file mode 100644
index f778a1e86642..000000000000
--- a/unreleased_history/new_features/ingest_wbwi.md
+++ /dev/null
@@ -1 +0,0 @@
-* Introduce API `IngestWriteBatchWithIndex()` for ingesting updates into DB while bypassing memtable writes. This improves performance when writing a large write batch to the DB.
diff --git a/unreleased_history/new_features/tombstone_scan_flush_trigger.md b/unreleased_history/new_features/tombstone_scan_flush_trigger.md
deleted file mode 100644
index a44b2213ab90..000000000000
--- a/unreleased_history/new_features/tombstone_scan_flush_trigger.md
+++ /dev/null
@@ -1 +0,0 @@
-* Add a new CF option `memtable_op_scan_flush_trigger` that triggers a flush of the memtable if an iterator's Seek()/Next() scans over a certain number of invisible entries from the memtable.
diff --git a/unreleased_history/public_api_changes/delete_max_write_buffer_number_to_maintain.md b/unreleased_history/public_api_changes/delete_max_write_buffer_number_to_maintain.md
deleted file mode 100644
index ecfb945ec973..000000000000
--- a/unreleased_history/public_api_changes/delete_max_write_buffer_number_to_maintain.md
+++ /dev/null
@@ -1 +0,0 @@
-AdvancedColumnFamilyOptions.max_write_buffer_number_to_maintain is deleted. It's deprecated since introduction of a better option max_write_buffer_size_to_maintain since RocksDB 6.5.0.
\ No newline at end of file
diff --git a/unreleased_history/public_api_changes/dep_max_compact_memtable_level.md b/unreleased_history/public_api_changes/dep_max_compact_memtable_level.md
deleted file mode 100644
index 9b41bd977275..000000000000
--- a/unreleased_history/public_api_changes/dep_max_compact_memtable_level.md
+++ /dev/null
@@ -1 +0,0 @@
-* Deprecated API `DB::MaxMemCompactionLevel()`.
diff --git a/unreleased_history/public_api_changes/deprecate-ignore-range-del.md b/unreleased_history/public_api_changes/deprecate-ignore-range-del.md
deleted file mode 100644
index d4e09e6ec2ee..000000000000
--- a/unreleased_history/public_api_changes/deprecate-ignore-range-del.md
+++ /dev/null
@@ -1 +0,0 @@
-* Deprecated `ReadOptions::ignore_range_deletions`.
diff --git a/unreleased_history/public_api_changes/deprecate-promote-l0.md b/unreleased_history/public_api_changes/deprecate-promote-l0.md
deleted file mode 100644
index bcc31298299f..000000000000
--- a/unreleased_history/public_api_changes/deprecate-promote-l0.md
+++ /dev/null
@@ -1 +0,0 @@
-* Deprecated API `experimental::PromoteL0()`.
diff --git a/unreleased_history/public_api_changes/options_map_in_compaction_service_options_override.md b/unreleased_history/public_api_changes/options_map_in_compaction_service_options_override.md
deleted file mode 100644
index 530599233bc0..000000000000
--- a/unreleased_history/public_api_changes/options_map_in_compaction_service_options_override.md
+++ /dev/null
@@ -1 +0,0 @@
-Added arbitrary string map for additional options to be overriden for remote compactions
diff --git a/unreleased_history/public_api_changes/remove_fail_if_options_file_error.md b/unreleased_history/public_api_changes/remove_fail_if_options_file_error.md
deleted file mode 100644
index 822940568baa..000000000000
--- a/unreleased_history/public_api_changes/remove_fail_if_options_file_error.md
+++ /dev/null
@@ -1 +0,0 @@
-The fail_if_options_file_error option in DBOptions has been removed. The behavior now is to always return failure in any API that fails to persist the OPTIONS file.

From 1614345a525cfa43c11725936dba446529e00cb5 Mon Sep 17 00:00:00 2001
From: Jay Huh <jewoongh@meta.com>
Date: Tue, 22 Apr 2025 09:08:36 -0700
Subject: [PATCH 066/500] add missing version.h change for 10.3 release
 (#13567)

Summary:
Follow up for https://github.com/facebook/rocksdb/pull/13566

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13567

Test Plan: CI

Reviewed By: pdillinger

Differential Revision: D73407482

Pulled By: jaykorean

fbshipit-source-id: 0bb7492473c0691a50d25288f0350ab097958de7
---
 include/rocksdb/version.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/rocksdb/version.h b/include/rocksdb/version.h
index 274b4e01e5b4..126599544d0e 100644
--- a/include/rocksdb/version.h
+++ b/include/rocksdb/version.h
@@ -12,7 +12,7 @@
 // NOTE: in 'main' development branch, this should be the *next*
 // minor or major version number planned for release.
 #define ROCKSDB_MAJOR 10
-#define ROCKSDB_MINOR 2
+#define ROCKSDB_MINOR 3
 #define ROCKSDB_PATCH 0
 
 // Do not use these. We made the mistake of declaring macros starting with

From c368c6afe8299534f07f6586f96b476e6ede8e25 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Tue, 22 Apr 2025 13:02:36 -0700
Subject: [PATCH 067/500] Minor compression refactoring (#13539)

Summary:
* Mostly, remove `sample_for_compression` from CompressionInfo because it's not used by the core function it serves, `CompressData()`. Confusing (and inefficient), especially in db_bench where it appears to use `FLAGS_sample_for_compression` in places where it is actually ignored.
* Various clarifying comments, clean-ups, and tiny optimizations
* Prepare some structures like `CompressionDict` for more usage
* Some TODOs and FIXMEs about some things I've noticed are amiss, confusing, or excessive
* A notable optimization opportunity that might become a "pay as you go" improvement for the potential indirection costs of customizable compression: use C++23's resize_and_overwrite() in compress functions to avoid zeroing the string buffer contents before populating it.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13539

Test Plan: existing tests / CI

Reviewed By: hx235

Differential Revision: D73451273

Pulled By: pdillinger

fbshipit-source-id: 0373627466d695043d21146ce34d52f189ae9432
---
 cache/compressed_secondary_cache.cc           |  4 +-
 db/blob/blob_file_builder.cc                  |  3 +-
 db/blob/blob_file_builder_test.cc             |  3 +-
 db/blob/blob_file_reader_test.cc              |  3 +-
 db/blob/blob_source_test.cc                   |  3 +-
 memory/memory_allocator_impl.h                |  2 +-
 .../block_based/block_based_table_builder.cc  | 35 ++++++++------
 table/block_based/block_based_table_builder.h |  3 +-
 table/format.h                                |  1 +
 tools/db_bench_tool.cc                        | 12 ++---
 util/compression.h                            | 47 ++++++++++++-------
 utilities/blob_db/blob_db_impl.cc             |  8 ++--
 12 files changed, 69 insertions(+), 55 deletions(-)

diff --git a/cache/compressed_secondary_cache.cc b/cache/compressed_secondary_cache.cc
index 4d3d0a2cddf7..d912c58b0317 100644
--- a/cache/compressed_secondary_cache.cc
+++ b/cache/compressed_secondary_cache.cc
@@ -200,11 +200,9 @@ Status CompressedSecondaryCache::InsertInternal(
     PERF_COUNTER_ADD(compressed_sec_cache_uncompressed_bytes, data_size);
     CompressionContext compression_context(cache_options_.compression_type,
                                            cache_options_.compression_opts);
-    uint64_t sample_for_compression{0};
     CompressionInfo compression_info(
         cache_options_.compression_opts, compression_context,
-        CompressionDict::GetEmptyDict(), cache_options_.compression_type,
-        sample_for_compression);
+        CompressionDict::GetEmptyDict(), cache_options_.compression_type);
 
     bool success =
         CompressData(val, compression_info,
diff --git a/db/blob/blob_file_builder.cc b/db/blob/blob_file_builder.cc
index 1cb6833b5918..919d7c60ed6d 100644
--- a/db/blob/blob_file_builder.cc
+++ b/db/blob/blob_file_builder.cc
@@ -267,10 +267,9 @@ Status BlobFileBuilder::CompressBlobIfNeeded(
   // TODO: allow user CompressionOptions, including max_compressed_bytes_per_kb
   CompressionOptions opts;
   CompressionContext context(blob_compression_type_, opts);
-  constexpr uint64_t sample_for_compression = 0;
 
   CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(),
-                       blob_compression_type_, sample_for_compression);
+                       blob_compression_type_);
 
   constexpr uint32_t compression_format_version = 2;
 
diff --git a/db/blob/blob_file_builder_test.cc b/db/blob/blob_file_builder_test.cc
index 8a2ecff13a74..0981029d09de 100644
--- a/db/blob/blob_file_builder_test.cc
+++ b/db/blob/blob_file_builder_test.cc
@@ -405,10 +405,9 @@ TEST_F(BlobFileBuilderTest, Compression) {
 
   CompressionOptions opts;
   CompressionContext context(kSnappyCompression, opts);
-  constexpr uint64_t sample_for_compression = 0;
 
   CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(),
-                       kSnappyCompression, sample_for_compression);
+                       kSnappyCompression);
 
   std::string compressed_value;
   ASSERT_TRUE(Snappy_Compress(info, uncompressed_value.data(),
diff --git a/db/blob/blob_file_reader_test.cc b/db/blob/blob_file_reader_test.cc
index 676cbed41e85..9881dc362750 100644
--- a/db/blob/blob_file_reader_test.cc
+++ b/db/blob/blob_file_reader_test.cc
@@ -75,9 +75,8 @@ void WriteBlobFile(const ImmutableOptions& immutable_options,
   } else {
     CompressionOptions opts;
     CompressionContext context(compression, opts);
-    constexpr uint64_t sample_for_compression = 0;
     CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(),
-                         compression, sample_for_compression);
+                         compression);
 
     constexpr uint32_t compression_format_version = 2;
 
diff --git a/db/blob/blob_source_test.cc b/db/blob/blob_source_test.cc
index d0e9def7d8b8..8a021969e4fe 100644
--- a/db/blob/blob_source_test.cc
+++ b/db/blob/blob_source_test.cc
@@ -77,9 +77,8 @@ void WriteBlobFile(const ImmutableOptions& immutable_options,
   } else {
     CompressionOptions opts;
     CompressionContext context(compression, opts);
-    constexpr uint64_t sample_for_compression = 0;
     CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(),
-                         compression, sample_for_compression);
+                         compression);
 
     constexpr uint32_t compression_format_version = 2;
 
diff --git a/memory/memory_allocator_impl.h b/memory/memory_allocator_impl.h
index f1d3b9472ccc..a71ce0accdfb 100644
--- a/memory/memory_allocator_impl.h
+++ b/memory/memory_allocator_impl.h
@@ -31,7 +31,7 @@ using CacheAllocationPtr = std::unique_ptr<char[], CustomDeleter>;
 inline CacheAllocationPtr AllocateBlock(size_t size,
                                         MemoryAllocator* allocator) {
   if (allocator) {
-    auto block = reinterpret_cast<char*>(allocator->Allocate(size));
+    auto block = static_cast<char*>(allocator->Allocate(size));
     return CacheAllocationPtr(block, allocator);
   }
   return CacheAllocationPtr(new char[size]);
diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc
index ad85daa5a6c4..7f459aa836e2 100644
--- a/table/block_based/block_based_table_builder.cc
+++ b/table/block_based/block_based_table_builder.cc
@@ -120,7 +120,8 @@ bool GoodCompressionRatio(size_t compressed_size, size_t uncomp_size,
 // format_version is the block format as defined in include/rocksdb/table.h
 Slice CompressBlock(const Slice& uncompressed_data, const CompressionInfo& info,
                     CompressionType* type, uint32_t format_version,
-                    bool allow_sample, std::string* compressed_output,
+                    uint64_t sample_for_compression,
+                    std::string* compressed_output,
                     std::string* sampled_output_fast,
                     std::string* sampled_output_slow) {
   assert(type);
@@ -132,9 +133,9 @@ Slice CompressBlock(const Slice& uncompressed_data, const CompressionInfo& info,
   // The users can use these stats to decide if it is worthwhile
   // enabling compression and they also get a hint about which
   // compression algorithm wil be beneficial.
-  if (allow_sample && info.SampleForCompression() &&
+  if (sample_for_compression > 0 &&
       Random::GetTLSInstance()->OneIn(
-          static_cast<int>(info.SampleForCompression()))) {
+          static_cast<int>(sample_for_compression))) {
     // Sampling with a fast compression algorithm
     if (sampled_output_fast && (LZ4_Supported() || Snappy_Supported())) {
       CompressionType c =
@@ -142,8 +143,7 @@ Slice CompressBlock(const Slice& uncompressed_data, const CompressionInfo& info,
       CompressionOptions options;
       CompressionContext context(c, options);
       CompressionInfo info_tmp(options, context,
-                               CompressionDict::GetEmptyDict(), c,
-                               info.SampleForCompression());
+                               CompressionDict::GetEmptyDict(), c);
 
       CompressData(uncompressed_data, info_tmp,
                    GetCompressFormatForVersion(format_version),
@@ -156,8 +156,7 @@ Slice CompressBlock(const Slice& uncompressed_data, const CompressionInfo& info,
       CompressionOptions options;
       CompressionContext context(c, options);
       CompressionInfo info_tmp(options, context,
-                               CompressionDict::GetEmptyDict(), c,
-                               info.SampleForCompression());
+                               CompressionDict::GetEmptyDict(), c);
 
       CompressData(uncompressed_data, info_tmp,
                    GetCompressFormatForVersion(format_version),
@@ -1268,15 +1267,15 @@ void BlockBasedTableBuilder::CompressAndVerifyBlock(
     }
     assert(compression_dict != nullptr);
     CompressionInfo compression_info(r->compression_opts, compression_ctx,
-                                     *compression_dict, *type,
-                                     r->sample_for_compression);
+                                     *compression_dict, *type);
 
     std::string sampled_output_fast;
     std::string sampled_output_slow;
     *block_contents = CompressBlock(
         uncompressed_block_data, compression_info, type,
-        r->table_options.format_version, is_data_block /* allow_sample */,
-        compressed_output, &sampled_output_fast, &sampled_output_slow);
+        r->table_options.format_version,
+        is_data_block ? r->sample_for_compression : 0U, compressed_output,
+        &sampled_output_fast, &sampled_output_slow);
 
     if (sampled_output_slow.size() > 0 || sampled_output_fast.size() > 0) {
       // Currently compression sampling is only enabled for data block.
@@ -1487,6 +1486,8 @@ void BlockBasedTableBuilder::BGWorkWriteMaybeCompressedBlock() {
   // Starts empty; see FilterBlockBuilder::AddWithPrevKey
   std::string prev_block_last_key_no_ts;
   while (r->pc_rep->write_queue.pop(slot)) {
+    // FIXME: this is weird popping off write queue just to wait again on
+    // compress queue
     assert(slot != nullptr);
     slot->Take(block_rep);
     assert(block_rep != nullptr);
@@ -1963,12 +1964,16 @@ void BlockBasedTableBuilder::EnterUnbuffered() {
           r->compression_opts.max_dict_bytes, r->compression_opts.level);
     }
   } else {
+    // ZSTD "raw content dictionary" - "Any buffer is a valid raw content
+    // dictionary."
     dict = std::move(compression_dict_samples);
   }
-  r->compression_dict.reset(new CompressionDict(dict, r->compression_type,
-                                                r->compression_opts.level));
-  r->verify_dict.reset(
-      new UncompressionDict(dict, r->compression_type == kZSTD));
+  if (r->table_options.verify_compression) {
+    r->verify_dict.reset(
+        new UncompressionDict(std::string(dict), r->compression_type == kZSTD));
+  }
+  r->compression_dict.reset(new CompressionDict(
+      std::move(dict), r->compression_type, r->compression_opts.level));
 
   auto get_iterator_for_block = [&r](size_t i) {
     auto& data_block = r->data_block_buffers[i];
diff --git a/table/block_based/block_based_table_builder.h b/table/block_based/block_based_table_builder.h
index 61f5ad78e5a5..8bb5e3c074ad 100644
--- a/table/block_based/block_based_table_builder.h
+++ b/table/block_based/block_based_table_builder.h
@@ -202,7 +202,8 @@ class BlockBasedTableBuilder : public TableBuilder {
 
 Slice CompressBlock(const Slice& uncompressed_data, const CompressionInfo& info,
                     CompressionType* type, uint32_t format_version,
-                    bool do_sample, std::string* compressed_output,
+                    uint64_t sample_for_compression,
+                    std::string* compressed_output,
                     std::string* sampled_output_fast,
                     std::string* sampled_output_slow);
 
diff --git a/table/format.h b/table/format.h
index dac5d695be45..5bf1077866fd 100644
--- a/table/format.h
+++ b/table/format.h
@@ -382,6 +382,7 @@ struct BlockContents {
 
   // The additional memory space taken by the block data.
   size_t usable_size() const {
+    // FIXME: doesn't account for possible block trailer
     if (allocation.get() != nullptr) {
       auto allocator = allocation.get_deleter().allocator;
       if (allocator) {
diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc
index 49a6ac07b07e..1d39d6d1bd1a 100644
--- a/tools/db_bench_tool.cc
+++ b/tools/db_bench_tool.cc
@@ -2914,8 +2914,7 @@ class Benchmark {
       CompressionOptions opts;
       CompressionContext context(FLAGS_compression_type_e, opts);
       CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(),
-                           FLAGS_compression_type_e,
-                           FLAGS_sample_for_compression);
+                           FLAGS_compression_type_e);
       bool result = CompressSlice(info, Slice(input_str), &compressed);
 
       if (!result) {
@@ -4135,8 +4134,7 @@ class Benchmark {
     opts.level = FLAGS_compression_level;
     CompressionContext context(FLAGS_compression_type_e, opts);
     CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(),
-                         FLAGS_compression_type_e,
-                         FLAGS_sample_for_compression);
+                         FLAGS_compression_type_e);
     // Compress 1G
     while (ok && bytes < int64_t(1) << 30) {
       compressed.clear();
@@ -4166,9 +4164,9 @@ class Benchmark {
     compression_opts.level = FLAGS_compression_level;
     CompressionContext compression_ctx(FLAGS_compression_type_e,
                                        compression_opts);
-    CompressionInfo compression_info(
-        compression_opts, compression_ctx, CompressionDict::GetEmptyDict(),
-        FLAGS_compression_type_e, FLAGS_sample_for_compression);
+    CompressionInfo compression_info(compression_opts, compression_ctx,
+                                     CompressionDict::GetEmptyDict(),
+                                     FLAGS_compression_type_e);
     UncompressionContext uncompression_ctx(FLAGS_compression_type_e);
     UncompressionInfo uncompression_info(uncompression_ctx,
                                          UncompressionDict::GetEmptyDict(),
diff --git a/util/compression.h b/util/compression.h
index e7ddcc2ff8b7..95011b7b9635 100644
--- a/util/compression.h
+++ b/util/compression.h
@@ -153,7 +153,8 @@ struct CompressionDict {
   std::string dict_;
 
  public:
-  CompressionDict(std::string dict, CompressionType type, int level) {
+  CompressionDict() = default;
+  CompressionDict(std::string&& dict, CompressionType type, int level) {
     dict_ = std::move(dict);
 #ifdef ZSTD
     zstd_cdict_ = nullptr;
@@ -173,6 +174,25 @@ struct CompressionDict {
 #endif  // ZSTD
   }
 
+  CompressionDict(CompressionDict&& other) {
+#ifdef ZSTD
+    zstd_cdict_ = other.zstd_cdict_;
+    other.zstd_cdict_ = nullptr;
+#endif  // ZSTD
+    dict_ = std::move(other.dict_);
+  }
+  CompressionDict& operator=(CompressionDict&& other) {
+    if (this == &other) {
+      return *this;
+    }
+#ifdef ZSTD
+    zstd_cdict_ = other.zstd_cdict_;
+    other.zstd_cdict_ = nullptr;
+#endif  // ZSTD
+    dict_ = std::move(other.dict_);
+    return *this;
+  }
+
   ~CompressionDict() {
 #ifdef ZSTD
     size_t res = 0;
@@ -189,18 +209,16 @@ struct CompressionDict {
 #endif  // ZSTD
 
   Slice GetRawDict() const { return dict_; }
+  bool empty() const { return dict_.empty(); }
 
   static const CompressionDict& GetEmptyDict() {
     static CompressionDict empty_dict{};
     return empty_dict;
   }
 
-  CompressionDict() = default;
-  // Disable copy/move
+  // Disable copy
   CompressionDict(const CompressionDict&) = delete;
   CompressionDict& operator=(const CompressionDict&) = delete;
-  CompressionDict(CompressionDict&&) = delete;
-  CompressionDict& operator=(CompressionDict&&) = delete;
 };
 
 // Holds dictionary and related data, like ZSTD's digested uncompression
@@ -225,7 +243,7 @@ struct UncompressionDict {
   ZSTD_DDict* zstd_ddict_ = nullptr;
 #endif  // ROCKSDB_ZSTD_DDICT
 
-  UncompressionDict(std::string dict, bool using_zstd)
+  UncompressionDict(std::string&& dict, bool using_zstd)
       : dict_(std::move(dict)), slice_(dict_) {
 #ifdef ROCKSDB_ZSTD_DDICT
     if (!slice_.empty() && using_zstd) {
@@ -408,31 +426,27 @@ class CompressionContext {
   CompressionContext& operator=(const CompressionContext&) = delete;
 };
 
+// TODO: rename
 class CompressionInfo {
   const CompressionOptions& opts_;
   const CompressionContext& context_;
   const CompressionDict& dict_;
   const CompressionType type_;
-  const uint64_t sample_for_compression_;
 
  public:
   CompressionInfo(const CompressionOptions& _opts,
                   const CompressionContext& _context,
-                  const CompressionDict& _dict, CompressionType _type,
-                  uint64_t _sample_for_compression)
-      : opts_(_opts),
-        context_(_context),
-        dict_(_dict),
-        type_(_type),
-        sample_for_compression_(_sample_for_compression) {}
+                  const CompressionDict& _dict, CompressionType _type)
+      : opts_(_opts), context_(_context), dict_(_dict), type_(_type) {}
 
   const CompressionOptions& options() const { return opts_; }
   const CompressionContext& context() const { return context_; }
   const CompressionDict& dict() const { return dict_; }
   CompressionType type() const { return type_; }
-  uint64_t SampleForCompression() const { return sample_for_compression_; }
 };
 
+// This is like a working area, reusable for different dicts, etc.
+// TODO: refactor / consolidate
 class UncompressionContext {
  private:
   CompressionContextCache* ctx_cache_ = nullptr;
@@ -958,7 +972,7 @@ inline bool BZip2_Compress(const CompressionInfo& /*info*/,
 
   // Initialize the output size.
   _stream.avail_out = static_cast<unsigned int>(length);
-  _stream.next_out = reinterpret_cast<char*>(&(*output)[output_header_len]);
+  _stream.next_out = output->data() + output_header_len;
 
   bool compressed = false;
   st = BZ2_bzCompress(&_stream, BZ_FINISH);
@@ -1336,6 +1350,7 @@ inline bool ZSTD_Compress(const CompressionInfo& info, const char* input,
       output, static_cast<uint32_t>(length));
 
   size_t compressBound = ZSTD_compressBound(length);
+  // TODO: use resize_and_overwrite with c++23
   output->resize(static_cast<size_t>(output_header_len + compressBound));
   size_t outlen = 0;
   ZSTD_CCtx* context = info.context().ZSTDPreallocCtx();
diff --git a/utilities/blob_db/blob_db_impl.cc b/utilities/blob_db/blob_db_impl.cc
index 00d15e90ccf1..2ed8761fe1ac 100644
--- a/utilities/blob_db/blob_db_impl.cc
+++ b/utilities/blob_db/blob_db_impl.cc
@@ -1162,10 +1162,10 @@ Slice BlobDBImpl::GetCompressedSlice(const Slice& raw,
   CompressionType type = bdb_options_.compression;
   CompressionOptions opts;
   CompressionContext context(type, opts);
-  CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(), type,
-                       0 /* sample_for_compression */);
-  CompressBlock(raw, info, &type, kBlockBasedTableVersionFormat, false,
-                compression_output, nullptr, nullptr);
+  CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(), type);
+  CompressBlock(raw, info, &type, kBlockBasedTableVersionFormat,
+                0 /* sample_for_compression */, compression_output, nullptr,
+                nullptr);
   return *compression_output;
 }
 

From 9998478c642e99dfc8c4b409ec104ddf76fcb5b8 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Tue, 22 Apr 2025 15:31:46 -0700
Subject: [PATCH 068/500] Deflake test
 DBPropertiesTest.AggregatedTableProperties (#13568)

Summary:
This test was failing sporadically for me, like

```
db/db_properties_test.cc:247: Failure
Expected: (static_cast<double>(dbl_a - dbl_b) / (dbl_a + dbl_b)) <
(bias), actual: 0.113964 vs 0.1
```

I tried waiting for compaction in the test, but that made it fail consistently. Based on inspection of the test and the related test AggregatedTablePropertiesAtLevel already using `disable_auto_compactions = true`, I'm applying that to this test.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13568

Test Plan: Parallel runs of the unit test, before and after

Reviewed By: jaykorean

Differential Revision: D73463685

Pulled By: pdillinger

fbshipit-source-id: 84df7cc9bdcd1caa108a7be254ffbebbe9a77de7
---
 db/db_properties_test.cc | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/db/db_properties_test.cc b/db/db_properties_test.cc
index 01ab37e21ebf..771c2dc8d881 100644
--- a/db/db_properties_test.cc
+++ b/db/db_properties_test.cc
@@ -377,6 +377,8 @@ TEST_F(DBPropertiesTest, AggregatedTableProperties) {
         NewBloomFilterPolicy(kBloomBitsPerKey, false));
     table_options.block_size = 1024;
     options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+    // The checks assume kTableCount number of files
+    options.disable_auto_compactions = true;
 
     DestroyAndReopen(options);
 
@@ -567,7 +569,7 @@ TEST_F(DBPropertiesTest, AggregatedTablePropertiesAtLevel) {
   options.target_file_size_base = 8192;
   options.max_bytes_for_level_base = 10000;
   options.max_bytes_for_level_multiplier = 2;
-  // This ensures there no compaction happening when we call GetProperty().
+  // The checks assume kTableCount number of files
   options.disable_auto_compactions = true;
   options.merge_operator.reset(new TestPutOperator());
 

From bcda3bda04ff34e8089dbb867c8e65a02ed3db0f Mon Sep 17 00:00:00 2001
From: Jesson Yo <aphostrophy@gmail.com>
Date: Wed, 23 Apr 2025 10:33:06 -0700
Subject: [PATCH 069/500] add SST file manager to C api (#13404)

Summary:
we want to limit the maximum disk space used by RocksDB in one of our Go services, as it runs on a highly disk-constrained network switch.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13404

Reviewed By: cbi42

Differential Revision: D73517940

Pulled By: jaykorean

fbshipit-source-id: ae91fc7a4992399e20f06cc67dad8130cf19049e
---
 db/c.cc             | 66 +++++++++++++++++++++++++++++++++++++++++++++
 db/c_test.c         | 17 ++++++++++++
 include/rocksdb/c.h | 46 +++++++++++++++++++++++++++++++
 3 files changed, 129 insertions(+)

diff --git a/db/c.cc b/db/c.cc
index 1b0571efa947..859f2d0b601e 100644
--- a/db/c.cc
+++ b/db/c.cc
@@ -30,6 +30,7 @@
 #include "rocksdb/perf_context.h"
 #include "rocksdb/rate_limiter.h"
 #include "rocksdb/slice_transform.h"
+#include "rocksdb/sst_file_manager.h"
 #include "rocksdb/statistics.h"
 #include "rocksdb/status.h"
 #include "rocksdb/table.h"
@@ -113,6 +114,7 @@ using ROCKSDB_NAMESPACE::Slice;
 using ROCKSDB_NAMESPACE::SliceParts;
 using ROCKSDB_NAMESPACE::SliceTransform;
 using ROCKSDB_NAMESPACE::Snapshot;
+using ROCKSDB_NAMESPACE::SstFileManager;
 using ROCKSDB_NAMESPACE::SstFileMetaData;
 using ROCKSDB_NAMESPACE::SstFileWriter;
 using ROCKSDB_NAMESPACE::Status;
@@ -226,6 +228,9 @@ struct rocksdb_cache_t {
 struct rocksdb_write_buffer_manager_t {
   std::shared_ptr<WriteBufferManager> rep;
 };
+struct rocksdb_sst_file_manager_t {
+  std::shared_ptr<SstFileManager> rep;
+};
 struct rocksdb_livefiles_t {
   std::vector<LiveFileMetaData> rep;
 };
@@ -5239,6 +5244,67 @@ ROCKSDB_LIBRARY_API void rocksdb_write_buffer_manager_set_allow_stall(
   wbm->rep->SetAllowStall(new_allow_stall);
 }
 
+rocksdb_sst_file_manager_t* rocksdb_sst_file_manager_create(
+    rocksdb_env_t* env) {
+  rocksdb_sst_file_manager_t* sfm = new rocksdb_sst_file_manager_t;
+  sfm->rep.reset(ROCKSDB_NAMESPACE::NewSstFileManager(env->rep));
+  return sfm;
+}
+
+void rocksdb_sst_file_manager_destroy(rocksdb_sst_file_manager_t* sfm) {
+  delete sfm;
+}
+
+void rocksdb_sst_file_manager_set_max_allowed_space_usage(
+    rocksdb_sst_file_manager_t* sfm, uint64_t max_allowed_space) {
+  sfm->rep->SetMaxAllowedSpaceUsage(max_allowed_space);
+}
+
+void rocksdb_sst_file_manager_set_compaction_buffer_size(
+    rocksdb_sst_file_manager_t* sfm, uint64_t compaction_buffer_size) {
+  sfm->rep->SetCompactionBufferSize(compaction_buffer_size);
+}
+
+bool rocksdb_sst_file_manager_is_max_allowed_space_reached(
+    rocksdb_sst_file_manager_t* sfm) {
+  return sfm->rep->IsMaxAllowedSpaceReached();
+}
+
+bool rocksdb_sst_file_manager_is_max_allowed_space_reached_including_compactions(
+    rocksdb_sst_file_manager_t* sfm) {
+  return sfm->rep->IsMaxAllowedSpaceReachedIncludingCompactions();
+}
+
+uint64_t rocksdb_sst_file_manager_get_total_size(
+    rocksdb_sst_file_manager_t* sfm) {
+  return sfm->rep->GetTotalSize();
+}
+
+int64_t rocksdb_sst_file_manager_get_delete_rate_bytes_per_second(
+    rocksdb_sst_file_manager_t* sfm) {
+  return sfm->rep->GetDeleteRateBytesPerSecond();
+}
+
+void rocksdb_sst_file_manager_set_delete_rate_bytes_per_second(
+    rocksdb_sst_file_manager_t* sfm, int64_t delete_rate) {
+  return sfm->rep->SetDeleteRateBytesPerSecond(delete_rate);
+}
+
+double rocksdb_sst_file_manager_get_max_trash_db_ratio(
+    rocksdb_sst_file_manager_t* sfm) {
+  return sfm->rep->GetMaxTrashDBRatio();
+}
+
+void rocksdb_sst_file_manager_set_max_trash_db_ratio(
+    rocksdb_sst_file_manager_t* sfm, double ratio) {
+  return sfm->rep->SetMaxTrashDBRatio(ratio);
+}
+
+uint64_t rocksdb_sst_file_manager_get_total_trash_size(
+    rocksdb_sst_file_manager_t* sfm) {
+  return sfm->rep->GetTotalTrashSize();
+}
+
 rocksdb_dbpath_t* rocksdb_dbpath_create(const char* path,
                                         uint64_t target_size) {
   rocksdb_dbpath_t* result = new rocksdb_dbpath_t;
diff --git a/db/c_test.c b/db/c_test.c
index 2142748e5674..2324611f2fe6 100644
--- a/db/c_test.c
+++ b/db/c_test.c
@@ -4050,6 +4050,23 @@ int main(int argc, char** argv) {
     rocksdb_cache_destroy(lru);
   }
 
+  StartPhase("sst_file_manager");
+  {
+    rocksdb_sst_file_manager_t* sst_file_manager;
+    sst_file_manager = rocksdb_sst_file_manager_create(env);
+    rocksdb_sst_file_manager_set_delete_rate_bytes_per_second(sst_file_manager,
+                                                              1);
+    rocksdb_sst_file_manager_set_max_trash_db_ratio(sst_file_manager, 0.75);
+
+    CheckCondition(1 ==
+                   rocksdb_sst_file_manager_get_delete_rate_bytes_per_second(
+                       sst_file_manager));
+    CheckCondition(0.75 == rocksdb_sst_file_manager_get_max_trash_db_ratio(
+                               sst_file_manager));
+
+    rocksdb_sst_file_manager_destroy(sst_file_manager);
+  }
+
   StartPhase("cancel_all_background_work");
   rocksdb_cancel_all_background_work(db, 1);
 
diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h
index 60ae92fca14e..09a5f8ba1cb8 100644
--- a/include/rocksdb/c.h
+++ b/include/rocksdb/c.h
@@ -79,6 +79,7 @@ typedef struct rocksdb_hyper_clock_cache_options_t
     rocksdb_hyper_clock_cache_options_t;
 typedef struct rocksdb_cache_t rocksdb_cache_t;
 typedef struct rocksdb_write_buffer_manager_t rocksdb_write_buffer_manager_t;
+typedef struct rocksdb_sst_file_manager_t rocksdb_sst_file_manager_t;
 typedef struct rocksdb_compactionfilter_t rocksdb_compactionfilter_t;
 typedef struct rocksdb_compactionfiltercontext_t
     rocksdb_compactionfiltercontext_t;
@@ -2225,6 +2226,51 @@ extern ROCKSDB_LIBRARY_API void rocksdb_write_buffer_manager_set_buffer_size(
 extern ROCKSDB_LIBRARY_API void rocksdb_write_buffer_manager_set_allow_stall(
     rocksdb_write_buffer_manager_t* wbm, bool new_allow_stall);
 
+/* SstFileManager */
+
+extern ROCKSDB_LIBRARY_API rocksdb_sst_file_manager_t*
+rocksdb_sst_file_manager_create(rocksdb_env_t* env);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_sst_file_manager_destroy(
+    rocksdb_sst_file_manager_t* sfm);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_sst_file_manager_set_max_allowed_space_usage(
+    rocksdb_sst_file_manager_t* sfm, uint64_t max_allowed_space);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_sst_file_manager_set_compaction_buffer_size(
+    rocksdb_sst_file_manager_t* sfm, uint64_t compaction_buffer_size);
+
+extern ROCKSDB_LIBRARY_API bool
+rocksdb_sst_file_manager_is_max_allowed_space_reached(
+    rocksdb_sst_file_manager_t* sfm);
+
+extern ROCKSDB_LIBRARY_API bool
+rocksdb_sst_file_manager_is_max_allowed_space_reached_including_compactions(
+    rocksdb_sst_file_manager_t* sfm);
+
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_sst_file_manager_get_total_size(rocksdb_sst_file_manager_t* sfm);
+
+extern ROCKSDB_LIBRARY_API int64_t
+rocksdb_sst_file_manager_get_delete_rate_bytes_per_second(
+    rocksdb_sst_file_manager_t* sfm);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_sst_file_manager_set_delete_rate_bytes_per_second(
+    rocksdb_sst_file_manager_t* sfm, int64_t delete_rate);
+
+extern ROCKSDB_LIBRARY_API double
+rocksdb_sst_file_manager_get_max_trash_db_ratio(
+    rocksdb_sst_file_manager_t* sfm);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_sst_file_manager_set_max_trash_db_ratio(
+    rocksdb_sst_file_manager_t* sfm, double ratio);
+
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_sst_file_manager_get_total_trash_size(rocksdb_sst_file_manager_t* sfm);
+
 /* HyperClockCache */
 
 extern ROCKSDB_LIBRARY_API rocksdb_hyper_clock_cache_options_t*

From 613e1a9a388e2326c74a4d1aa348f4be6151e18c Mon Sep 17 00:00:00 2001
From: Hui Xiao <huixiao@fb.com>
Date: Wed, 23 Apr 2025 14:52:56 -0700
Subject: [PATCH 070/500] Verify flush output file record count + minor clean
 up (#13556)

Summary:
**Context/Summary:**
Similar to https://github.com/facebook/rocksdb/commit/0a43d8a261b9c633c0a4e369b1ef33aa5ee32810, this is to verify flush output file contains the exact number of keys (represented by its `TableProperties::num_entries`) as added to table builder for block-based and plain table format. The implementation reuses a temporary compaction stats to record output record and existing input record (with some refactoring)

**Bonus:**
following https://github.com/facebook/rocksdb/commit/0a43d8a261b9c633c0a4e369b1ef33aa5ee32810#r154313564, limit compaction output record count check within block based table and plain table format as well as removing extra test setting; fix some typo

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13556

Test Plan: New test

Reviewed By: jaykorean

Differential Revision: D73229644

Pulled By: hx235

fbshipit-source-id: 2a7796450048b3bcb2d5c38f2b5fc6b53e4aae37
---
 db/builder.cc                              | 15 +++--
 db/builder.h                               |  9 +--
 db/compaction/compaction_job.cc            |  9 ++-
 db/compaction/compaction_job_test.cc       |  5 --
 db/compaction/tiered_compaction_test.cc    |  4 ++
 db/corruption_test.cc                      |  3 -
 db/db_flush_test.cc                        | 57 +++++++++++++++++
 db/db_impl/db_impl_open.cc                 | 71 ++++++++++++++-------
 db/db_test.cc                              |  1 -
 db/flush_job.cc                            | 73 ++++++++++++++--------
 db/internal_stats.h                        |  5 +-
 include/rocksdb/options.h                  |  3 +-
 utilities/backup/backup_engine.cc          |  2 +-
 utilities/transactions/transaction_test.cc |  3 -
 14 files changed, 188 insertions(+), 72 deletions(-)

diff --git a/db/builder.cc b/db/builder.cc
index a39bcf3b4765..631530bf5666 100644
--- a/db/builder.cc
+++ b/db/builder.cc
@@ -74,8 +74,8 @@ Status BuildTable(
     EventLogger* event_logger, int job_id, TableProperties* table_properties,
     Env::WriteLifeTimeHint write_hint, const std::string* full_history_ts_low,
     BlobFileCompletionCallback* blob_callback, Version* version,
-    uint64_t* num_input_entries, uint64_t* memtable_payload_bytes,
-    uint64_t* memtable_garbage_bytes) {
+    uint64_t* memtable_payload_bytes, uint64_t* memtable_garbage_bytes,
+    InternalStats::CompactionStats* flush_stats) {
   assert((tboptions.column_family_id ==
           TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) ==
          tboptions.column_family_name.empty());
@@ -253,6 +253,10 @@ Status BuildTable(
       }
       builder->Add(key_after_flush, value_after_flush);
 
+      if (flush_stats) {
+        flush_stats->num_output_records++;
+      }
+
       s = meta->UpdateBoundaries(key_after_flush, value_after_flush,
                                  ikey.sequence, ikey.type);
       if (!s.ok()) {
@@ -284,6 +288,9 @@ Status BuildTable(
         auto tombstone = range_del_it->Tombstone();
         std::pair<InternalKey, Slice> kv = tombstone.Serialize();
         builder->Add(kv.first.Encode(), kv.second);
+        if (flush_stats) {
+          flush_stats->num_output_records++;
+        }
         InternalKey tombstone_end = tombstone.SerializeEndKey();
         meta->UpdateBoundariesForRange(kv.first, tombstone_end, tombstone.seq_,
                                        tboptions.internal_comparator);
@@ -305,9 +312,9 @@ Status BuildTable(
 
     TEST_SYNC_POINT("BuildTable:BeforeFinishBuildTable");
     const bool empty = builder->IsEmpty();
-    if (num_input_entries != nullptr) {
+    if (flush_stats) {
       assert(c_iter.HasNumInputEntryScanned());
-      *num_input_entries =
+      flush_stats->num_input_records =
           c_iter.NumInputEntryScanned() + num_unfragmented_tombstones;
     }
     if (!s.ok() || empty) {
diff --git a/db/builder.h b/db/builder.h
index 08dd5fcab001..93e66c76e0a0 100644
--- a/db/builder.h
+++ b/db/builder.h
@@ -10,6 +10,7 @@
 #include <utility>
 #include <vector>
 
+#include "db/internal_stats.h"
 #include "db/range_tombstone_fragmenter.h"
 #include "db/seqno_to_time_mapping.h"
 #include "db/table_properties_collector.h"
@@ -34,7 +35,6 @@ class SnapshotChecker;
 class TableCache;
 class TableBuilder;
 class WritableFileWriter;
-class InternalStats;
 class BlobFileCompletionCallback;
 
 // Convenience function for NewTableBuilder on the embedded table_factory.
@@ -49,6 +49,7 @@ TableBuilder* NewTableBuilder(const TableBuilderOptions& tboptions,
 //
 // @param column_family_name Name of the column family that is also identified
 //    by column_family_id, or empty string if unknown.
+// @param flush_stats treat flush as level 0 compaction in internal stats
 Status BuildTable(
     const std::string& dbname, VersionSet* versions,
     const ImmutableDBOptions& db_options, const TableBuilderOptions& tboptions,
@@ -69,8 +70,8 @@ Status BuildTable(
     Env::WriteLifeTimeHint write_hint = Env::WLTH_NOT_SET,
     const std::string* full_history_ts_low = nullptr,
     BlobFileCompletionCallback* blob_callback = nullptr,
-    Version* version = nullptr, uint64_t* num_input_entries = nullptr,
-    uint64_t* memtable_payload_bytes = nullptr,
-    uint64_t* memtable_garbage_bytes = nullptr);
+    Version* version = nullptr, uint64_t* memtable_payload_bytes = nullptr,
+    uint64_t* memtable_garbage_bytes = nullptr,
+    InternalStats::CompactionStats* flush_stats = nullptr);
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc
index f534eb142d94..88ad5490f511 100644
--- a/db/compaction/compaction_job.cc
+++ b/db/compaction/compaction_job.cc
@@ -878,7 +878,14 @@ Status CompactionJob::Run() {
   UpdateCompactionJobOutputStats(internal_stats_);
 
   // Verify number of output records
-  if (status.ok() && db_options_.compaction_verify_record_count) {
+  // Only verify on table with format collects table properties
+  const auto& mutable_cf_options = compact_->compaction->mutable_cf_options();
+  if (status.ok() &&
+      (mutable_cf_options.table_factory->IsInstanceOf(
+           TableFactory::kBlockBasedTableName()) ||
+       mutable_cf_options.table_factory->IsInstanceOf(
+           TableFactory::kPlainTableName())) &&
+      db_options_.compaction_verify_record_count) {
     uint64_t total_output_num = 0;
     for (const auto& state : compact_->sub_compact_states) {
       for (const auto& output : state.GetOutputs()) {
diff --git a/db/compaction/compaction_job_test.cc b/db/compaction/compaction_job_test.cc
index 36a4e5f0430a..b7afc07b996c 100644
--- a/db/compaction/compaction_job_test.cc
+++ b/db/compaction/compaction_job_test.cc
@@ -232,11 +232,6 @@ class CompactionJobTestBase : public testing::Test {
     // set default for the tests
     mutable_cf_options_.target_file_size_base = 1024 * 1024;
     mutable_cf_options_.max_compaction_bytes = 10 * 1024 * 1024;
-
-    // Turn off compaction_verify_record_count MockTables
-    if (table_type == TableTypeForTest::kMockTable) {
-      db_options_.compaction_verify_record_count = false;
-    }
   }
 
   void SetUp() override {
diff --git a/db/compaction/tiered_compaction_test.cc b/db/compaction/tiered_compaction_test.cc
index ba32dcbb05e2..879dc0712aa0 100644
--- a/db/compaction/tiered_compaction_test.cc
+++ b/db/compaction/tiered_compaction_test.cc
@@ -225,6 +225,8 @@ TEST_F(TieredCompactionTest, SequenceBasedTieredStorageUniversal) {
     flush_stats.micros = 1;
     flush_stats.bytes_written = bytes_per_file;
     flush_stats.num_output_files = 1;
+    flush_stats.num_input_records = kNumKeys;
+    flush_stats.num_output_records = kNumKeys;
     expect_stats[0].Add(flush_stats);
   }
   ASSERT_OK(dbfull()->TEST_WaitForCompact());
@@ -1080,6 +1082,8 @@ TEST_F(TieredCompactionTest, SequenceBasedTieredStorageLevel) {
     flush_stats.micros = 1;
     flush_stats.bytes_written = bytes_per_file;
     flush_stats.num_output_files = 1;
+    flush_stats.num_input_records = kNumKeys;
+    flush_stats.num_output_records = kNumKeys;
     expect_stats[0].Add(flush_stats);
   }
   ASSERT_OK(dbfull()->TEST_WaitForCompact());
diff --git a/db/corruption_test.cc b/db/corruption_test.cc
index e20cd20df65f..d7f87faefed4 100644
--- a/db/corruption_test.cc
+++ b/db/corruption_test.cc
@@ -851,9 +851,6 @@ TEST_F(CorruptionTest, ParanoidFileChecksOnCompact) {
   options.env = env_.get();
   options.paranoid_file_checks = true;
   options.create_if_missing = true;
-  // Skip verifying record count against TableProperties for
-  // MockTables
-  options.compaction_verify_record_count = false;
   Status s;
   for (const auto& mode : corruption_modes) {
     delete db_;
diff --git a/db/db_flush_test.cc b/db/db_flush_test.cc
index b72de9a6886e..f8353974fb46 100644
--- a/db/db_flush_test.cc
+++ b/db/db_flush_test.cc
@@ -3504,6 +3504,63 @@ TEST_F(DBFlushTest, DBStuckAfterAtomicFlushError) {
   ASSERT_OK(dbfull()->TEST_WaitForBackgroundWork());
   ASSERT_EQ(1, NumTableFilesAtLevel(0));
 }
+
+TEST_F(DBFlushTest, VerifyOutputRecordCount) {
+  for (bool use_plain_table : {false, true}) {
+    Options options = CurrentOptions();
+    options.flush_verify_memtable_count = true;
+    options.merge_operator = MergeOperators::CreateStringAppendOperator();
+    DestroyAndReopen(options);
+    // Verify flush output record count verification in different table
+    // formats
+    if (use_plain_table) {
+      options.table_factory.reset(NewPlainTableFactory());
+    }
+
+    // Verify that flush output record count verification does not produce false
+    // positives.
+    ASSERT_OK(Merge("k0", "v1"));
+    ASSERT_OK(Put("k1", "v1"));
+    ASSERT_OK(Put("k2", "v1"));
+    ASSERT_OK(SingleDelete("k2"));
+    ASSERT_OK(Delete("k2"));
+    ASSERT_OK(Delete("k3"));
+    ASSERT_OK(db_->DeleteRange(WriteOptions(), "k1", "k3"));
+    ASSERT_OK(Flush());
+
+    // Verify that flush output record count verification catch corruption
+    DestroyAndReopen(options);
+    if (use_plain_table) {
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+          "PlainTableBuilder::Add::skip",
+          [&](void* skip) { *(bool*)skip = true; });
+
+    } else {
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+          "BlockBasedTableBuilder::Add::skip",
+          [&](void* skip) { *(bool*)skip = true; });
+    }
+    SyncPoint::GetInstance()->EnableProcessing();
+    const char* expect =
+        "Number of keys in flush output SST files does not match";
+
+    // 1. During DB open flush
+    ASSERT_OK(Put("k1", "v1"));
+    ASSERT_OK(Put("k2", "v1"));
+    Status s = TryReopen(options);
+    ASSERT_TRUE(s.IsCorruption());
+    ASSERT_TRUE(std::strstr(s.getState(), expect));
+
+    // 2. During regular flush
+    DestroyAndReopen(options);
+    ASSERT_OK(Put("k1", "v1"));
+    ASSERT_OK(Put("k2", "v1"));
+    s = Flush();
+    ASSERT_TRUE(s.IsCorruption());
+    ASSERT_TRUE(std::strstr(s.getState(), expect));
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  }
+}
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc
index 0e365c9b34bb..f19ab4965835 100644
--- a/db/db_impl/db_impl_open.cc
+++ b/db/db_impl/db_impl_open.cc
@@ -2000,6 +2000,9 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
   const size_t ts_sz = ucmp->timestamp_size();
   const bool logical_strip_timestamp =
       ts_sz > 0 && !cfd->ioptions().persist_user_defined_timestamps;
+  // Note that here we treat flush as level 0 compaction in internal stats
+  InternalStats::CompactionStats flush_stats(CompactionReason::kFlush,
+                                             1 /* count */);
   {
     ScopedArenaPtr<InternalIterator> iter(
         logical_strip_timestamp
@@ -2072,19 +2075,20 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
           kMaxSequenceNumber);
       Version* version = cfd->current();
       version->Ref();
-      uint64_t num_input_entries = 0;
-      s = BuildTable(dbname_, versions_.get(), immutable_db_options_, tboptions,
-                     file_options_for_compaction_, cfd->table_cache(),
-                     iter.get(), std::move(range_del_iters), &meta,
-                     &blob_file_additions, snapshot_seqs, earliest_snapshot,
-                     earliest_write_conflict_snapshot, kMaxSequenceNumber,
-                     snapshot_checker, paranoid_file_checks,
-                     cfd->internal_stats(), &io_s, io_tracer_,
-                     BlobFileCreationReason::kRecovery,
-                     nullptr /* seqno_to_time_mapping */, &event_logger_,
-                     job_id, nullptr /* table_properties */, write_hint,
-                     nullptr /*full_history_ts_low*/, &blob_callback_, version,
-                     &num_input_entries);
+      TableProperties temp_table_proerties;
+      s = BuildTable(
+          dbname_, versions_.get(), immutable_db_options_, tboptions,
+          file_options_for_compaction_, cfd->table_cache(), iter.get(),
+          std::move(range_del_iters), &meta, &blob_file_additions,
+          snapshot_seqs, earliest_snapshot, earliest_write_conflict_snapshot,
+          kMaxSequenceNumber, snapshot_checker, paranoid_file_checks,
+          cfd->internal_stats(), &io_s, io_tracer_,
+          BlobFileCreationReason::kRecovery,
+          nullptr /* seqno_to_time_mapping */, &event_logger_, job_id,
+          &temp_table_proerties /* table_properties */, write_hint,
+          nullptr /*full_history_ts_low*/, &blob_callback_, version,
+          nullptr /* memtable_payload_bytes */,
+          nullptr /* memtable_garbage_bytes */, &flush_stats);
       version->Unref();
       LogFlush(immutable_db_options_.info_log);
       ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
@@ -2100,10 +2104,31 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
       }
 
       uint64_t total_num_entries = mem->NumEntries();
-      if (s.ok() && total_num_entries != num_input_entries) {
+      if (s.ok() && total_num_entries != flush_stats.num_input_records) {
         std::string msg = "Expected " + std::to_string(total_num_entries) +
                           " entries in memtable, but read " +
-                          std::to_string(num_input_entries);
+                          std::to_string(flush_stats.num_input_records);
+        ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                       "[%s] [JOB %d] Level-0 flush during recover: %s",
+                       cfd->GetName().c_str(), job_id, msg.c_str());
+        if (immutable_db_options_.flush_verify_memtable_count) {
+          s = Status::Corruption(msg);
+        }
+      }
+      // Only verify on table with format collects table properties
+      const auto& mutable_cf_options = cfd->GetLatestMutableCFOptions();
+      if (s.ok() &&
+          (mutable_cf_options.table_factory->IsInstanceOf(
+               TableFactory::kBlockBasedTableName()) ||
+           mutable_cf_options.table_factory->IsInstanceOf(
+               TableFactory::kPlainTableName())) &&
+          flush_stats.num_output_records != temp_table_proerties.num_entries) {
+        std::string msg =
+            "Number of keys in flush output SST files does not match "
+            "number of keys added to the table. Expected " +
+            std::to_string(flush_stats.num_output_records) + " but there are " +
+            std::to_string(temp_table_proerties.num_entries) +
+            " in output SST files";
         ROCKS_LOG_WARN(immutable_db_options_.info_log,
                        "[%s] [JOB %d] Level-0 flush during recover: %s",
                        cfd->GetName().c_str(), job_id, msg.c_str());
@@ -2151,25 +2176,25 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
     }
   }
 
-  InternalStats::CompactionStats stats(CompactionReason::kFlush, 1);
-  stats.micros = immutable_db_options_.clock->NowMicros() - start_micros;
+  flush_stats.micros = immutable_db_options_.clock->NowMicros() - start_micros;
 
   if (has_output) {
-    stats.bytes_written = meta.fd.GetFileSize();
-    stats.num_output_files = 1;
+    flush_stats.bytes_written = meta.fd.GetFileSize();
+    flush_stats.num_output_files = 1;
   }
 
   const auto& blobs = edit->GetBlobFileAdditions();
   for (const auto& blob : blobs) {
-    stats.bytes_written_blob += blob.GetTotalBlobBytes();
+    flush_stats.bytes_written_blob += blob.GetTotalBlobBytes();
   }
 
-  stats.num_output_files_blob = static_cast<int>(blobs.size());
+  flush_stats.num_output_files_blob = static_cast<int>(blobs.size());
 
-  cfd->internal_stats()->AddCompactionStats(level, Env::Priority::USER, stats);
+  cfd->internal_stats()->AddCompactionStats(level, Env::Priority::USER,
+                                            flush_stats);
   cfd->internal_stats()->AddCFStats(
       InternalStats::BYTES_FLUSHED,
-      stats.bytes_written + stats.bytes_written_blob);
+      flush_stats.bytes_written + flush_stats.bytes_written_blob);
   RecordTick(stats_, COMPACT_WRITE_BYTES, meta.fd.GetFileSize());
   return s;
 }
diff --git a/db/db_test.cc b/db/db_test.cc
index 8cf5b12a959e..81b4c2ed1b9a 100644
--- a/db/db_test.cc
+++ b/db/db_test.cc
@@ -5463,7 +5463,6 @@ TEST_F(DBTest, DynamicLevelCompressionPerLevel2) {
   options.max_bytes_for_level_multiplier = 8;
   options.max_background_compactions = 1;
   options.num_levels = 5;
-  options.compaction_verify_record_count = false;
   std::shared_ptr<mock::MockTableFactory> mtf(new mock::MockTableFactory);
   options.table_factory = mtf;
 
diff --git a/db/flush_job.cc b/db/flush_job.cc
index 8808315857d4..452e1ed9e677 100644
--- a/db/flush_job.cc
+++ b/db/flush_job.cc
@@ -868,7 +868,9 @@ Status FlushJob::WriteLevel0Table() {
       ts_sz > 0 && !cfd_->ioptions().persist_user_defined_timestamps;
 
   std::vector<BlobFileAddition> blob_file_additions;
-
+  // Note that here we treat flush as level 0 compaction in internal stats
+  InternalStats::CompactionStats flush_stats(CompactionReason::kFlush,
+                                             1 /* count**/);
   {
     auto write_hint = base_->storage_info()->CalculateSSTWriteHint(
         /*level=*/0, db_options_.calculate_sst_write_lifetime_hint_set);
@@ -887,7 +889,7 @@ Status FlushJob::WriteLevel0Table() {
     ro.total_order_seek = true;
     ro.io_activity = Env::IOActivity::kFlush;
     Arena arena;
-    uint64_t total_num_entries = 0, total_num_deletes = 0;
+    uint64_t total_num_input_entries = 0, total_num_deletes = 0;
     uint64_t total_data_size = 0;
     size_t total_memory_usage = 0;
     uint64_t total_num_range_deletes = 0;
@@ -922,7 +924,7 @@ Status FlushJob::WriteLevel0Table() {
       if (range_del_iter != nullptr) {
         range_del_iters.emplace_back(range_del_iter);
       }
-      total_num_entries += m->NumEntries();
+      total_num_input_entries += m->NumEntries();
       total_num_deletes += m->NumDeletion();
       total_data_size += m->GetDataSize();
       total_memory_usage += m->ApproximateMemoryUsage();
@@ -934,11 +936,12 @@ Status FlushJob::WriteLevel0Table() {
     //  "Write Buffer Full", should make update flush_reason_ accordingly.
     event_logger_->Log() << "job" << job_context_->job_id << "event"
                          << "flush_started" << "num_memtables" << mems_.size()
-                         << "num_entries" << total_num_entries << "num_deletes"
-                         << total_num_deletes << "total_data_size"
-                         << total_data_size << "memory_usage"
-                         << total_memory_usage << "num_range_deletes"
-                         << total_num_range_deletes << "flush_reason"
+                         << "total_num_input_entries" << total_num_input_entries
+                         << "num_deletes" << total_num_deletes
+                         << "total_data_size" << total_data_size
+                         << "memory_usage" << total_memory_usage
+                         << "num_range_deletes" << total_num_range_deletes
+                         << "flush_reason"
                          << GetFlushReasonString(flush_reason_);
 
     {
@@ -976,7 +979,6 @@ Status FlushJob::WriteLevel0Table() {
       meta_.oldest_ancester_time = oldest_ancester_time;
       meta_.file_creation_time = current_time;
 
-      uint64_t num_input_entries = 0;
       uint64_t memtable_payload_bytes = 0;
       uint64_t memtable_garbage_bytes = 0;
       IOStatus io_s;
@@ -1010,16 +1012,38 @@ Status FlushJob::WriteLevel0Table() {
           cfd_->internal_stats(), &io_s, io_tracer_,
           BlobFileCreationReason::kFlush, seqno_to_time_mapping_.get(),
           event_logger_, job_context_->job_id, &table_properties_, write_hint,
-          full_history_ts_low, blob_callback_, base_, &num_input_entries,
-          &memtable_payload_bytes, &memtable_garbage_bytes);
+          full_history_ts_low, blob_callback_, base_, &memtable_payload_bytes,
+          &memtable_garbage_bytes, &flush_stats);
       TEST_SYNC_POINT_CALLBACK("FlushJob::WriteLevel0Table:s", &s);
       // TODO: Cleanup io_status in BuildTable and table builders
       assert(!s.ok() || io_s.ok());
       io_s.PermitUncheckedError();
-      if (num_input_entries != total_num_entries && s.ok()) {
-        std::string msg = "Expected " + std::to_string(total_num_entries) +
+      if (s.ok() && total_num_input_entries != flush_stats.num_input_records) {
+        std::string msg = "Expected " +
+                          std::to_string(total_num_input_entries) +
                           " entries in memtables, but read " +
-                          std::to_string(num_input_entries);
+                          std::to_string(flush_stats.num_input_records);
+        ROCKS_LOG_WARN(db_options_.info_log, "[%s] [JOB %d] Level-0 flush %s",
+                       cfd_->GetName().c_str(), job_context_->job_id,
+                       msg.c_str());
+        if (db_options_.flush_verify_memtable_count) {
+          s = Status::Corruption(msg);
+        }
+      }
+
+      // Only verify on table with format collects table properties
+      if (s.ok() &&
+          (mutable_cf_options_.table_factory->IsInstanceOf(
+               TableFactory::kBlockBasedTableName()) ||
+           mutable_cf_options_.table_factory->IsInstanceOf(
+               TableFactory::kPlainTableName())) &&
+          flush_stats.num_output_records != table_properties_.num_entries) {
+        std::string msg =
+            "Number of keys in flush output SST files does not match "
+            "number of keys added to the table. Expected " +
+            std::to_string(flush_stats.num_output_records) + " but there are " +
+            std::to_string(table_properties_.num_entries) +
+            " in output SST files";
         ROCKS_LOG_WARN(db_options_.info_log, "[%s] [JOB %d] Level-0 flush %s",
                        cfd_->GetName().c_str(), job_context_->job_id,
                        msg.c_str());
@@ -1085,12 +1109,10 @@ Status FlushJob::WriteLevel0Table() {
   // Piggyback FlushJobInfo on the first first flushed memtable.
   mems_[0]->SetFlushJobInfo(GetFlushJobInfo());
 
-  // Note that here we treat flush as level 0 compaction in internal stats
-  InternalStats::CompactionStats stats(CompactionReason::kFlush, 1);
   const uint64_t micros = clock_->NowMicros() - start_micros;
   const uint64_t cpu_micros = clock_->CPUMicros() - start_cpu_micros;
-  stats.micros = micros;
-  stats.cpu_micros = cpu_micros;
+  flush_stats.micros = micros;
+  flush_stats.cpu_micros = cpu_micros;
 
   ROCKS_LOG_INFO(db_options_.info_log,
                  "[%s] [JOB %d] Flush lasted %" PRIu64
@@ -1099,22 +1121,23 @@ Status FlushJob::WriteLevel0Table() {
                  cpu_micros);
 
   if (has_output) {
-    stats.bytes_written = meta_.fd.GetFileSize();
-    stats.num_output_files = 1;
+    flush_stats.bytes_written = meta_.fd.GetFileSize();
+    flush_stats.num_output_files = 1;
   }
 
   const auto& blobs = edit_->GetBlobFileAdditions();
   for (const auto& blob : blobs) {
-    stats.bytes_written_blob += blob.GetTotalBlobBytes();
+    flush_stats.bytes_written_blob += blob.GetTotalBlobBytes();
   }
 
-  stats.num_output_files_blob = static_cast<int>(blobs.size());
+  flush_stats.num_output_files_blob = static_cast<int>(blobs.size());
 
-  RecordTimeToHistogram(stats_, FLUSH_TIME, stats.micros);
-  cfd_->internal_stats()->AddCompactionStats(0 /* level */, thread_pri_, stats);
+  RecordTimeToHistogram(stats_, FLUSH_TIME, flush_stats.micros);
+  cfd_->internal_stats()->AddCompactionStats(0 /* level */, thread_pri_,
+                                             flush_stats);
   cfd_->internal_stats()->AddCFStats(
       InternalStats::BYTES_FLUSHED,
-      stats.bytes_written + stats.bytes_written_blob);
+      flush_stats.bytes_written + flush_stats.bytes_written_blob);
   RecordFlushIOStats();
 
   return s;
diff --git a/db/internal_stats.h b/db/internal_stats.h
index e7fa002c4ccb..cc1b1317df61 100644
--- a/db/internal_stats.h
+++ b/db/internal_stats.h
@@ -697,7 +697,10 @@ class InternalStats {
   // a full cache, which would force a re-scan on the next GetStats.
   std::shared_ptr<CacheEntryStatsCollector<CacheEntryRoleStats>>
       cache_entry_stats_collector_;
-  // Per-ColumnFamily/level compaction stats
+
+  // Per-column family and level compaction statistics, including flush and file
+  // ingestion. These are treated as compactions to L0 or the level where the
+  // file was ingested.
   std::vector<CompactionStats> comp_stats_;
   std::vector<CompactionStats> comp_stats_by_pri_;
   CompactionStats per_key_placement_comp_stats_;
diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index 63fdcb15da8c..a116e165f413 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -605,7 +605,8 @@ struct DBOptions {
   // DEPRECATED: This option might be removed in a future release.
   //
   // If true, during memtable flush, RocksDB will validate total entries
-  // read in flush, and compare with counter inserted into it.
+  // read in flush, total entries written in the SST and compare them with
+  // counter of keys added.
   //
   // The option is here to turn the feature off in case this new validation
   // feature has a bug. The option may be removed in the future once the
diff --git a/utilities/backup/backup_engine.cc b/utilities/backup/backup_engine.cc
index 76ce993b20ff..b9b5c27f2371 100644
--- a/utilities/backup/backup_engine.cc
+++ b/utilities/backup/backup_engine.cc
@@ -2825,7 +2825,7 @@ Status BackupEngineImpl::GetFileDbIdentities(Env* src_env,
     // Try to get table properties from the table reader of sst_reader
     if (!sst_reader.ReadTableProperties(&tp).ok()) {
       // FIXME (peterd): this logic is untested and seems obsolete.
-      // Try to use table properites from the initialization of sst_reader
+      // Try to use table properties from the initialization of sst_reader
       table_properties = sst_reader.GetInitTableProperties();
     } else {
       table_properties = tp.get();
diff --git a/utilities/transactions/transaction_test.cc b/utilities/transactions/transaction_test.cc
index a59ec00a0b69..228293815fe1 100644
--- a/utilities/transactions/transaction_test.cc
+++ b/utilities/transactions/transaction_test.cc
@@ -2558,9 +2558,6 @@ TEST_P(TransactionTest, FlushTest2) {
       case 0:
         break;
       case 1:
-        // Skip verifying record count against TableProperties for
-        // MockTables
-        options.compaction_verify_record_count = false;
         options.table_factory.reset(new mock::MockTableFactory());
         break;
       case 2: {

From 0560544e86c1f97f8d1da348f2647aadaefbd095 Mon Sep 17 00:00:00 2001
From: anand76 <anand1976@users.noreply.github.com>
Date: Thu, 24 Apr 2025 12:27:10 -0700
Subject: [PATCH 071/500] Fix ExternalTableOptions initialization (#13572)

Summary: Pull Request resolved: https://github.com/facebook/rocksdb/pull/13572

Reviewed By: moakbari

Differential Revision: D73568773

Pulled By: anand1976

fbshipit-source-id: d61d76cb864e3af111bb05dc1ee51a8b3f1eaf17
---
 table/external_table.cc | 7 ++++---
 table/table_test.cc     | 5 ++++-
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/table/external_table.cc b/table/external_table.cc
index 2858543118a3..2161544c2907 100644
--- a/table/external_table.cc
+++ b/table/external_table.cc
@@ -287,9 +287,10 @@ class ExternalTableFactoryAdapter : public TableFactory {
       std::unique_ptr<TableReader>* table_reader,
       bool /* prefetch_index_and_filter_in_cache */) const override {
     std::unique_ptr<ExternalTableReader> reader;
-    ExternalTableOptions ext_topts(
-        topts.prefix_extractor, topts.ioptions.user_comparator,
-        topts.ioptions.fs, FileOptions(topts.env_options));
+    FileOptions fopts(topts.env_options);
+    ExternalTableOptions ext_topts(topts.prefix_extractor,
+                                   topts.ioptions.user_comparator,
+                                   topts.ioptions.fs, fopts);
     auto status =
         inner_->NewTableReader(ro, file->file_name(), ext_topts, &reader);
     if (!status.ok()) {
diff --git a/table/table_test.cc b/table/table_test.cc
index 692c028d328c..2730185bae3b 100644
--- a/table/table_test.cc
+++ b/table/table_test.cc
@@ -6855,8 +6855,11 @@ class ExternalTableReaderTest : public DBTestBase {
 
     Status NewTableReader(
         const ReadOptions& /*read_options*/, const std::string& file_path,
-        const ExternalTableOptions& /*topts*/,
+        const ExternalTableOptions& topts,
         std::unique_ptr<ExternalTableReader>* table_reader) const override {
+      // Sanity check some options
+      EXPECT_EQ(topts.file_options.handoff_checksum_type,
+                ChecksumType::kCRC32c);
       table_reader->reset(new DummyExternalTableReader(file_path));
       return Status::OK();
     }

From 6c0e55a2a9e244ebbe71613c48eb3157ecfce9af Mon Sep 17 00:00:00 2001
From: Changyu Bi <changyubi@meta.com>
Date: Fri, 25 Apr 2025 17:15:03 -0700
Subject: [PATCH 072/500] Fix a bug where lock upgrade can incorrectly return
 deadlock status (#13575)

Summary:
AcquireLocked() returns transaction ids that currently hold the lock for deadlock detection purpose. We should not include the id of the transaction that is trying to acquire the lock, since this would lead to a false-positive deadlock detection where the deadlock is a self-loop. Note that since `wait_ids` is never cleared, there is another bug where if AcquireLocked() fails with kLockLimit, we could do deadlock detection based on `wait_ids` from a previous lock acquire attempt. This PR fixes both bugs.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13575

Test Plan: added a unit test repro that shows deadlock status can be incorrectly returned.

Reviewed By: jaykorean

Differential Revision: D73617887

Pulled By: cbi42

fbshipit-source-id: a6388b3ec53db13e2c502d60199378ea95885841
---
 unreleased_history/bug_fixes/deadlock.md      |  1 +
 .../lock/point/point_lock_manager.cc          | 18 +++++++-
 utilities/transactions/transaction_test.cc    | 44 +++++++++++++++++++
 3 files changed, 61 insertions(+), 2 deletions(-)
 create mode 100644 unreleased_history/bug_fixes/deadlock.md

diff --git a/unreleased_history/bug_fixes/deadlock.md b/unreleased_history/bug_fixes/deadlock.md
new file mode 100644
index 000000000000..362b27c90867
--- /dev/null
+++ b/unreleased_history/bug_fixes/deadlock.md
@@ -0,0 +1 @@
+* Fix a bug where transaction lock upgrade can incorrectly fail with a Deadlock status. This happens when a transaction has a non-zero timeout and tries to upgrade a shared lock that is also held by another transaction.
diff --git a/utilities/transactions/lock/point/point_lock_manager.cc b/utilities/transactions/lock/point/point_lock_manager.cc
index 97d3ace29d1c..4cd6e6b16081 100644
--- a/utilities/transactions/lock/point/point_lock_manager.cc
+++ b/utilities/transactions/lock/point/point_lock_manager.cc
@@ -341,6 +341,7 @@ Status PointLockManager::AcquireWithTimeout(
       }
 
       if (result.ok() || result.IsTimedOut()) {
+        wait_ids.clear();
         result = AcquireLocked(lock_map, stripe, key, env, lock_info,
                                &expire_time_hint, &wait_ids);
       }
@@ -472,13 +473,21 @@ bool PointLockManager::IncrementWaiters(
 // Try to lock this key after we have acquired the mutex.
 // Sets *expire_time to the expiration time in microseconds
 //  or 0 if no expiration.
-// REQUIRED:  Stripe mutex must be held.
+//
+// Returns Status::TimeOut if the lock cannot be acquired due to it being
+// held by other transactions, `txn_ids` will be populated with the id of
+// transactions that hold the lock, excluding lock_info.txn_ids[0].
+// Returns Status::Busy if the lock cannot be acquired due to reaching
+// per CF limit on the number of locks.
+//
+// REQUIRED:  Stripe mutex must be held. txn_ids must be empty.
 Status PointLockManager::AcquireLocked(LockMap* lock_map, LockMapStripe* stripe,
                                        const std::string& key, Env* env,
                                        const LockInfo& txn_lock_info,
                                        uint64_t* expire_time,
                                        autovector<TransactionID>* txn_ids) {
   assert(txn_lock_info.txn_ids.size() == 1);
+  assert(txn_ids && txn_ids->empty());
 
   Status result;
   // Check if this key is already locked
@@ -507,7 +516,12 @@ Status PointLockManager::AcquireLocked(LockMap* lock_map, LockMapStripe* stripe,
           // lock_cnt does not change
         } else {
           result = Status::TimedOut(Status::SubCode::kLockTimeout);
-          *txn_ids = lock_info.txn_ids;
+          for (auto id : lock_info.txn_ids) {
+            // A transaction is not blocked by itself
+            if (id != txn_lock_info.txn_ids[0]) {
+              txn_ids->push_back(id);
+            }
+          }
         }
       }
     } else {
diff --git a/utilities/transactions/transaction_test.cc b/utilities/transactions/transaction_test.cc
index 228293815fe1..5f6b5bb425c2 100644
--- a/utilities/transactions/transaction_test.cc
+++ b/utilities/transactions/transaction_test.cc
@@ -9689,6 +9689,50 @@ TEST_P(CommitBypassMemtableTest, MergeMiniStress) {
     VerifyDBFromMap(expected_cf, nullptr, false, nullptr, handles_[0]);
   }
 }
+
+TEST_F(TransactionDBTest, SelfDeadlockBug) {
+  ASSERT_OK(ReOpen());
+
+  // Create two transactions
+  WriteOptions write_options;
+  TransactionOptions txn_options;
+  txn_options.lock_timeout = 50;  // 50ms
+  txn_options.deadlock_detect = true;
+
+  ASSERT_OK(db->Put({}, "shared_key", "shared_value"));
+
+  // First transaction
+  Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
+  ASSERT_TRUE(txn1);
+  ASSERT_OK(txn1->SetName("txn1"));
+
+  // Second transaction
+  Transaction* txn2 = db->BeginTransaction(write_options, txn_options);
+  ASSERT_TRUE(txn2);
+  ASSERT_OK(txn2->SetName("txn2"));
+
+  // Both transactions acquire shared lock on the same key.
+  std::string value;
+  ASSERT_OK(txn1->GetForUpdate(ReadOptions(), "shared_key", &value,
+                               /*exclusive=*/false));
+  ASSERT_OK(txn2->GetForUpdate(ReadOptions(), "shared_key", &value,
+                               /*exclusive=*/false));
+
+  // Second transaction tries to upgrade to exclusive lock, which should
+  // timeout.
+  Status s = txn1->Put({}, "shared_key", "val");
+  // Print out the deadlock info buffer
+  ASSERT_TRUE(db->GetDeadlockInfoBuffer().empty());
+  ASSERT_TRUE(s.IsTimedOut());
+  ASSERT_EQ(s.ToString(), "Operation timed out: Timeout waiting to lock key");
+
+  // After release lock from txn2, txn1 should be able to proceed.
+  ASSERT_OK(txn2->Rollback());
+  ASSERT_OK(txn1->Put({}, "shared_key", "val"));
+  ASSERT_OK(txn1->Rollback());
+  delete txn1;
+  delete txn2;
+}
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {

From b2815b6b468b15938110e54a10c1c037548dde54 Mon Sep 17 00:00:00 2001
From: Jay Huh <jewoongh@meta.com>
Date: Mon, 28 Apr 2025 08:43:59 -0700
Subject: [PATCH 073/500] Update folly lib (#13576)

Summary:
After some bisecting, we were able to pinpoint that https://github.com/facebook/folly/commit/7881d1e7858f35ce7176dded26162cf8f575b24c is the commit that breaks the RocksDB build-with-folly.

https://github.com/facebook/folly/commit/8e8186f67de7a23d3a07366946b1617343927d84 is the latest folly that we can update to without additional change.

Fix for the incompatible change will be followed as a separate PR.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13576

Test Plan: CI

Reviewed By: hx235

Differential Revision: D73693236

Pulled By: jaykorean

fbshipit-source-id: ff94e023a361c64dea8388cb8bb9db91a2762894
---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 4b1d0414ae3c..a10b95a60123 100644
--- a/Makefile
+++ b/Makefile
@@ -2489,7 +2489,7 @@ checkout_folly:
 	fi
 	@# Pin to a particular version for public CI, so that PR authors don't
 	@# need to worry about folly breaking our integration. Update periodically
-	cd third-party/folly && git reset --hard 78286282478e1ae05b2e8cbcf0e2139eab283bea
+	cd third-party/folly && git reset --hard 8e8186f67de7a23d3a07366946b1617343927d84
 	@# NOTE: this hack is required for clang in some cases
 	perl -pi -e 's/int rv = syscall/int rv = (int)syscall/' third-party/folly/folly/detail/Futex.cpp
 	@# NOTE: this hack is required for gcc in some cases

From 72c38871673f34ef22c66b8fcb9292812272a961 Mon Sep 17 00:00:00 2001
From: Jay Huh <jewoongh@meta.com>
Date: Mon, 28 Apr 2025 13:35:48 -0700
Subject: [PATCH 074/500] Fix build (#13579)

Summary:
- [Failed CI run](https://productionresultssa17.blob.core.windows.net/actions-results/fd083599-6c98-4aec-8732-fcb280c96021/workflow-job-run-2f73efd7-c93d-53ea-a18f-1c7e17604f7e/logs/job/job-logs.txt?rsct=text%2Fplain&se=2025-04-28T17%3A15%3A01Z&sig=YJevYF5xH4RClY3klBe6Z3tnCWuYZFLlBYRHwftW9lc%3D&ske=2025-04-29T01%3A55%3A36Z&skoid=ca7593d4-ee42-46cd-af88-8b886a2f84eb&sks=b&skt=2025-04-28T13%3A55%3A36Z&sktid=398a6654-997b-47e9-b12b-9515b896b4de&skv=2025-01-05&sp=r&spr=https&sr=b&st=2025-04-28T17%3A04%3A56Z&sv=2025-01-05)

```
2025-04-28T16:56:00.5775476Z In file included from <stdin>:1:
2025-04-28T16:56:00.5776056Z db/blob/blob_file_meta.h:28:7: error: 'uint64_t' has not been declared
2025-04-28T16:56:00.5776715Z    28 |       uint64_t blob_file_number, uint64_t total_blob_count,
2025-04-28T16:56:00.5777153Z       |       ^~~~~~~~
2025-04-28T16:56:00.5778083Z db/blob/blob_file_meta.h:15:1: note: 'uint64_t' is defined in header '<cstdint>'; this is probably fixable by adding '#include <cstdint>'
2025-04-28T16:56:00.5779293Z    14 | #include "rocksdb/rocksdb_namespace.h"
2025-04-28T16:56:00.5782126Z   +++ |+#include <cstdint>
2025-04-28T16:56:00.5782780Z    15 |
2025-04-28T16:56:00.5783204Z db/blob/blob_file_meta.h:28:34: error: 'uint64_t' has not been declared
2025-04-28T16:56:00.5783832Z    28 |       uint64_t blob_file_number, uint64_t total_blob_count,
2025-04-28T16:56:00.5784301Z       |                                  ^~~~~~~~
```

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13579

Test Plan: [CI](https://github.com/facebook/rocksdb/actions/runs/14713618495/job/41291839382?pr=13579)

Reviewed By: archang19, cbi42

Differential Revision: D73799590

Pulled By: jaykorean

fbshipit-source-id: 7ead97914c05958bb7146f1934c48615599bc4f8
---
 db/blob/blob_file_meta.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/db/blob/blob_file_meta.h b/db/blob/blob_file_meta.h
index d7c8a124336d..2e47726f8d11 100644
--- a/db/blob/blob_file_meta.h
+++ b/db/blob/blob_file_meta.h
@@ -6,6 +6,7 @@
 #pragma once
 
 #include <cassert>
+#include <cstdint>
 #include <iosfwd>
 #include <memory>
 #include <string>

From 9d1a071194de8093bbf3f8f57ffd176278359bf0 Mon Sep 17 00:00:00 2001
From: Jay Huh <jewoongh@meta.com>
Date: Tue, 29 Apr 2025 11:29:22 -0700
Subject: [PATCH 075/500] Use Hex for DebugString (#13580)

Summary:
Addressing belated comment in https://github.com/facebook/rocksdb/pull/13452.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13580

Test Plan:
Temp change in the Unit Test to add a null char to the key and printed

Before the fix
```
DEBUG STRING BEFORE: --- level 0 --- version# 185 ---
 287:1286[1201 .. 1210]['key000000
DEBUG STRING AFTER: --- level 0 --- version# 185 ---
 287:1286[1201 .. 1210]['key000000
```

After the fix
```
DEBUG STRING BEFORE: --- level 0 --- version# 185 ---
 287:1286[1201 .. 1210]['6B657930303030303000' seq:1201, type:1 .. '6B657930303030313800' seq:1210, type:1]
 72:1261[261 .. 270]['6B6579303030313230' seq:261, type:1 .. '6B6579303030313338' seq:270, type:1]
 67:1259[241 .. 250]['6B6579303030303830' seq:241, type:1 .. '6B6579303030303938' seq:250, type:1]
 61:1259[211 .. 220]['6B6579303030303230' seq:211, type:1 .. '6B6579303030303338' seq:220, type:1]
 --- level 1 --- version# 185 ---
 70:1353[0 .. 0]['6B6579303030303030' seq:0, type:1 .. '6B6579303030303139' seq:0, type:1]
 23:1268[21 .. 30]['6B6579303030303230' seq:21, type:1 .. '6B6579303030303239' seq:30, type:1]
 25:1268[31 .. 40]['6B6579303030303330' seq:31, type:1 .. '6B6579303030303339' seq:40, type:1]
 86:1327[0 .. 0]['6B6579303030303430' seq:0, type:1 .. '6B6579303030303539' seq:0, type:1]
 74:1326[0 .. 0]['6B6579303030303630' seq:0, type:1 .. '6B6579303030303739' seq:0, type:1]
 35:1268[81 .. 90]['6B6579303030303830' seq:81, type:1 .. '6B6579303030303839' seq:90, type:1]
 37:1268[91 .. 100]['6B6579303030303930' seq:91, type:1 .. '6B6579303030303939' seq:100, type:1]
 78:1335[0 .. 0]['6B6579303030313030' seq:0, type:1 .. '6B6579303030313139' seq:0, type:1]
 43:1270[121 .. 130]['6B6579303030313230' seq:121, type:1 .. '6B6579303030313239' seq:130, type:1]
 45:1270[131 .. 140]['6B6579303030313330' seq:131, type:1 .. '6B6579303030313339' seq:140, type:1]
 82:1332[0 .. 0]['6B6579303030313430' seq:0, type:1 .. '6B6579303030313539' seq:0, type:1]
 90:1333[0 .. 0]['6B6579303030313630' seq:0, type:1 .. '6B6579303030313739' seq:0, type:1]
 94:1332[0 .. 0]['6B6579303030313830' seq:0, type:1 .. '6B6579303030313939' seq:0, type:1]
 --- level 2 --- version# 185 ---
 --- level 3 --- version# 185 ---
 --- level 4 --- version# 185 ---
 --- level 5 --- version# 185 ---
 --- level 6 --- version# 185 ---

DEBUG STRING AFTER: --- level 0 --- version# 185 ---
 287:1286[1201 .. 1210]['6B657930303030303000' seq:1201, type:1 .. '6B657930303030313800' seq:1210, type:1]
 72:1261[261 .. 270]['6B6579303030313230' seq:261, type:1 .. '6B6579303030313338' seq:270, type:1]
 67:1259[241 .. 250]['6B6579303030303830' seq:241, type:1 .. '6B6579303030303938' seq:250, type:1]
 61:1259[211 .. 220]['6B6579303030303230' seq:211, type:1 .. '6B6579303030303338' seq:220, type:1]
 --- level 1 --- version# 185 ---
 70:1353[0 .. 0]['6B6579303030303030' seq:0, type:1 .. '6B6579303030303139' seq:0, type:1]
 23:1268[21 .. 30]['6B6579303030303230' seq:21, type:1 .. '6B6579303030303239' seq:30, type:1]
 25:1268[31 .. 40]['6B6579303030303330' seq:31, type:1 .. '6B6579303030303339' seq:40, type:1]
 86:1327[0 .. 0]['6B6579303030303430' seq:0, type:1 .. '6B6579303030303539' seq:0, type:1]
 74:1326[0 .. 0]['6B6579303030303630' seq:0, type:1 .. '6B6579303030303739' seq:0, type:1]
 35:1268[81 .. 90]['6B6579303030303830' seq:81, type:1 .. '6B6579303030303839' seq:90, type:1]
 37:1268[91 .. 100]['6B6579303030303930' seq:91, type:1 .. '6B6579303030303939' seq:100, type:1]
 78:1335[0 .. 0]['6B6579303030313030' seq:0, type:1 .. '6B6579303030313139' seq:0, type:1]
 43:1270[121 .. 130]['6B6579303030313230' seq:121, type:1 .. '6B6579303030313239' seq:130, type:1]
 45:1270[131 .. 140]['6B6579303030313330' seq:131, type:1 .. '6B6579303030313339' seq:140, type:1]
 82:1332[0 .. 0]['6B6579303030313430' seq:0, type:1 .. '6B6579303030313539' seq:0, type:1]
 90:1333[0 .. 0]['6B6579303030313630' seq:0, type:1 .. '6B6579303030313739' seq:0, type:1]
 94:1332[0 .. 0]['6B6579303030313830' seq:0, type:1 .. '6B6579303030313939' seq:0, type:1]
 --- level 2 --- version# 185 ---
 --- level 3 --- version# 185 ---
 --- level 4 --- version# 185 ---
 --- level 5 --- version# 185 ---
 --- level 6 --- version# 185 ---
```

Reviewed By: hx235

Differential Revision: D73793661

Pulled By: jaykorean

fbshipit-source-id: d553ad24489cb2eff499b1ece457c6295a1ec697
---
 db/compaction/compaction_service_job.cc | 17 +++++++++--------
 db/db_impl/db_impl_secondary.cc         |  2 +-
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/db/compaction/compaction_service_job.cc b/db/compaction/compaction_service_job.cc
index 620d12c81068..f6375de11722 100644
--- a/db/compaction/compaction_service_job.cc
+++ b/db/compaction/compaction_service_job.cc
@@ -113,7 +113,7 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService(
   }
 
   std::string debug_str_before_wait =
-      compaction->input_version()->DebugString();
+      compaction->input_version()->DebugString(/*hex=*/true);
 
   ROCKS_LOG_INFO(db_options_.info_log,
                  "[%s] [JOB %d] Waiting for remote compaction...",
@@ -124,13 +124,14 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService(
                                            &compaction_result_binary);
 
   if (compaction_status != CompactionServiceJobStatus::kSuccess) {
-    ROCKS_LOG_ERROR(db_options_.info_log,
-                    "[%s] [JOB %d] Wait() status is not kSuccess. "
-                    "\nDebugString Before Wait():\n%s"
-                    "\nDebugString After Wait():\n%s",
-                    compaction->column_family_data()->GetName().c_str(),
-                    job_id_, debug_str_before_wait.c_str(),
-                    compaction->input_version()->DebugString().c_str());
+    ROCKS_LOG_ERROR(
+        db_options_.info_log,
+        "[%s] [JOB %d] Wait() status is not kSuccess. "
+        "\nDebugString Before Wait():\n%s"
+        "\nDebugString After Wait():\n%s",
+        compaction->column_family_data()->GetName().c_str(), job_id_,
+        debug_str_before_wait.c_str(),
+        compaction->input_version()->DebugString(/*hex=*/true).c_str());
   }
 
   if (compaction_status == CompactionServiceJobStatus::kUseLocal) {
diff --git a/db/db_impl/db_impl_secondary.cc b/db/db_impl/db_impl_secondary.cc
index 5f0c8bb8b295..b95566523f5a 100644
--- a/db/db_impl/db_impl_secondary.cc
+++ b/db/db_impl/db_impl_secondary.cc
@@ -909,7 +909,7 @@ Status DBImplSecondary::CompactWithoutInstallation(
     ROCKS_LOG_ERROR(
         immutable_db_options_.info_log,
         "GetCompactionInputsFromFileNumbers() failed - %s.\n DebugString: %s",
-        s.ToString().c_str(), version->DebugString().c_str());
+        s.ToString().c_str(), version->DebugString(/*hex=*/true).c_str());
     return s;
   }
 

From e3b7dd7b564262dd45496f1696a90e6a575d452d Mon Sep 17 00:00:00 2001
From: Changyu Bi <changyubi@meta.com>
Date: Fri, 2 May 2025 12:16:02 -0700
Subject: [PATCH 076/500] Add a new transaction option for large transaction
 optimization (#13582)

Summary:
I added `TransactionDBOptions::txn_commit_bypass_memtable_threshold` previously but per DB option is not dynamically changeable. Adding it as a per transaction option to make it easier to use. The option naming is updated to make it easier for customer to understand `large_txn_commit_optimize_threshold`. The transaction DB option `TransactionDBOptions::txn_commit_bypass_memtable_threshold` is marked as deprecated.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13582

Test Plan:
- new unit test
- updated stress test to use this new transaction option

Reviewed By: jowlyzhang

Differential Revision: D73960981

Pulled By: cbi42

fbshipit-source-id: 406f6e0f5f4eb6b336976f9a93b0bc08e61a9662
---
 db_stress_tool/db_stress_test_base.cc         |  10 +-
 include/rocksdb/utilities/transaction_db.h    |  15 +-
 .../new_features/per-txn-threshold.md         |   1 +
 .../transactions/pessimistic_transaction.cc   |   9 +-
 utilities/transactions/transaction_test.cc    | 130 +++++++++++++++++-
 5 files changed, 151 insertions(+), 14 deletions(-)
 create mode 100644 unreleased_history/new_features/per-txn-threshold.md

diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc
index beb0cd1aed68..6b37816c8f03 100644
--- a/db_stress_tool/db_stress_test_base.cc
+++ b/db_stress_tool/db_stress_test_base.cc
@@ -838,11 +838,15 @@ Status StressTest::NewTxn(WriteOptions& write_opts, ThreadState* thread,
         FLAGS_use_only_the_last_commit_time_batch_for_recovery;
     txn_options.lock_timeout = 600000;  // 10 min
     txn_options.deadlock_detect = true;
-    if (FLAGS_commit_bypass_memtable_one_in > 0) {
+    if (FLAGS_commit_bypass_memtable_one_in > 0 &&
+        thread->rand.OneIn(FLAGS_commit_bypass_memtable_one_in)) {
       assert(FLAGS_txn_write_policy == 0);
       assert(FLAGS_user_timestamp_size == 0);
-      txn_options.commit_bypass_memtable =
-          thread->rand.OneIn(FLAGS_commit_bypass_memtable_one_in);
+      if (thread->rand.OneIn(2)) {
+        txn_options.commit_bypass_memtable = true;
+      } else {
+        txn_options.large_txn_commit_optimize_threshold = 1;
+      }
       if (commit_bypass_memtable) {
         *commit_bypass_memtable = txn_options.commit_bypass_memtable;
       }
diff --git a/include/rocksdb/utilities/transaction_db.h b/include/rocksdb/utilities/transaction_db.h
index 29cc12846f92..32398d9ea83e 100644
--- a/include/rocksdb/utilities/transaction_db.h
+++ b/include/rocksdb/utilities/transaction_db.h
@@ -254,10 +254,12 @@ struct TransactionDBOptions {
   // for more details.
   std::vector<std::shared_ptr<SecondaryIndex>> secondary_indices;
 
-  // EXPERIMENTAL, SUBJECT TO CHANGE
+  // Deprecated, this option may be removed in the future.
+  // Use TransactionOptions::large_txn_commit_optimize_threshold instead.
+  //
   // This option is only valid for write committed. If the number of updates in
-  // a transaction exceeds this threshold, then the transaction commit will skip
-  // insertions into memtable as an optimization to reduce commit latency.
+  // a transaction is at least this threshold, then the transaction commit will
+  // skip insertions into memtable as an optimization to reduce commit latency.
   // See comment for TransactionOptions::commit_bypass_memtable for more detail.
   // Setting TransactionOptions::commit_bypass_memtable to true takes precedence
   // over this option.
@@ -387,6 +389,13 @@ struct TransactionOptions {
   // can cause flush/compaction to report `num_single_del_mismatch` due to
   // consecutive SingleDeletes.
   bool commit_bypass_memtable = false;
+
+  // EXPERIMENTAL, SUBJECT TO CHANGE
+  // When the number of updates in a transaction is at least this threshold,
+  // we will enable optimizations for commiting a large transaction. See
+  // comment for `commit_bypass_memtable` for more optimization detail.
+  uint32_t large_txn_commit_optimize_threshold =
+      std::numeric_limits<uint32_t>::max();
 };
 
 // The per-write optimizations that do not involve transactions. TransactionDB
diff --git a/unreleased_history/new_features/per-txn-threshold.md b/unreleased_history/new_features/per-txn-threshold.md
new file mode 100644
index 000000000000..01c6aad53201
--- /dev/null
+++ b/unreleased_history/new_features/per-txn-threshold.md
@@ -0,0 +1 @@
+* Add new experimental `TransactionOptions::large_txn_commit_optimize_threshold` to enable optimizations for large transaction commit with per transaction threshold. `TransactionDBOptions::txn_commit_bypass_memtable_threshold` is deprecated in favor of this transaction option.
diff --git a/utilities/transactions/pessimistic_transaction.cc b/utilities/transactions/pessimistic_transaction.cc
index f8dbaf07c9f8..98634c94cd87 100644
--- a/utilities/transactions/pessimistic_transaction.cc
+++ b/utilities/transactions/pessimistic_transaction.cc
@@ -105,7 +105,12 @@ void PessimisticTransaction::Initialize(const TransactionOptions& txn_options) {
   commit_timestamp_ = kMaxTxnTimestamp;
 
   if (txn_options.commit_bypass_memtable) {
-    commit_bypass_memtable_threshold_ = 0;
+    // No need to optimize for empty transction
+    commit_bypass_memtable_threshold_ = 1;
+  } else if (txn_options.large_txn_commit_optimize_threshold !=
+             std::numeric_limits<uint32_t>::max()) {
+    commit_bypass_memtable_threshold_ =
+        txn_options.large_txn_commit_optimize_threshold;
   } else {
     commit_bypass_memtable_threshold_ =
         db_options.txn_commit_bypass_memtable_threshold;
@@ -887,7 +892,7 @@ Status WriteCommittedTxn::CommitInternal() {
   // any operations appended to this working_batch will be ignored from WAL
   working_batch->MarkWalTerminationPoint();
 
-  bool bypass_memtable = wb->Count() > commit_bypass_memtable_threshold_;
+  bool bypass_memtable = wb->Count() >= commit_bypass_memtable_threshold_;
   if (!bypass_memtable) {
     // insert prepared batch into Memtable only skipping WAL.
     // Memtable will ignore BeginPrepare/EndPrepare markers
diff --git a/utilities/transactions/transaction_test.cc b/utilities/transactions/transaction_test.cc
index 5f6b5bb425c2..a1cd6aaee431 100644
--- a/utilities/transactions/transaction_test.cc
+++ b/utilities/transactions/transaction_test.cc
@@ -9414,8 +9414,8 @@ TEST_P(CommitBypassMemtableTest, ThresholdTxnDBOption) {
   ASSERT_OK(txn1->Commit());
   ASSERT_TRUE(commit_bypass_memtable);
 
-  // Below threshold
-  for (auto num_ops : {threshold, threshold + 1}) {
+  // Test threshold behavior
+  for (auto num_ops : {threshold - 1, threshold}) {
     commit_bypass_memtable = false;
     txn_opts.commit_bypass_memtable = false;
     auto txn = txn_db->BeginTransaction(wopts, txn_opts, txn1);
@@ -9427,7 +9427,7 @@ TEST_P(CommitBypassMemtableTest, ThresholdTxnDBOption) {
     }
     ASSERT_OK(txn->Prepare());
     ASSERT_OK(txn->Commit());
-    ASSERT_EQ(commit_bypass_memtable, num_ops > threshold);
+    ASSERT_EQ(commit_bypass_memtable, num_ops >= threshold);
     delete txn;
   }
 
@@ -9435,8 +9435,8 @@ TEST_P(CommitBypassMemtableTest, ThresholdTxnDBOption) {
   std::vector<std::string> cfs = {"pk", "sk"};
   CreateColumnFamilies(cfs, options);
 
-  // Below threshold
-  for (auto num_ops : {threshold, threshold + 1}) {
+  // Test threshold behavior with CFs
+  for (auto num_ops : {threshold - 1, threshold}) {
     commit_bypass_memtable = false;
     txn_opts.commit_bypass_memtable = false;
     auto txn_cf = txn_db->BeginTransaction(wopts, txn_opts, nullptr);
@@ -9447,11 +9447,129 @@ TEST_P(CommitBypassMemtableTest, ThresholdTxnDBOption) {
     }
     ASSERT_OK(txn_cf->Prepare());
     ASSERT_OK(txn_cf->Commit());
-    ASSERT_EQ(commit_bypass_memtable, num_ops > threshold);
+    ASSERT_EQ(commit_bypass_memtable, num_ops >= threshold);
     delete txn_cf;
   }
 }
 
+TEST_P(CommitBypassMemtableTest, OptimizeLargeTxnCommitThreshold) {
+  // Tests TransactionOptions::large_txn_commit_optimize_threshold
+  const uint32_t threshold = 10;
+  SetUpTransactionDB();
+  bool commit_bypass_memtable = false;
+  SyncPoint::GetInstance()->SetCallBack(
+      "WriteCommittedTxn::CommitInternal:bypass_memtable",
+      [&](void* arg) { commit_bypass_memtable = *(static_cast<bool*>(arg)); });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  // Test with transaction option only
+  WriteOptions wopts;
+  TransactionOptions txn_opts;
+  txn_opts.large_txn_commit_optimize_threshold = threshold;
+
+  // Test with transaction below threshold
+  auto txn1 = txn_db->BeginTransaction(wopts, txn_opts, nullptr);
+  ASSERT_OK(txn1->SetName("xid1"));
+  ASSERT_OK(txn1->Put("k1", "v1"));
+  ASSERT_OK(txn1->Prepare());
+  ASSERT_OK(txn1->Commit());
+  ASSERT_FALSE(commit_bypass_memtable);
+  delete txn1;
+
+  // Test with transaction at threshold
+  txn1 = txn_db->BeginTransaction(wopts, txn_opts, nullptr);
+  ASSERT_OK(txn1->SetName("xid2"));
+  for (uint32_t i = 0; i < threshold; ++i) {
+    ASSERT_OK(
+        txn1->Put("key" + std::to_string(i), "value" + std::to_string(i)));
+  }
+  ASSERT_OK(txn1->Prepare());
+  ASSERT_OK(txn1->Commit());
+  ASSERT_TRUE(commit_bypass_memtable);
+  delete txn1;
+
+  // Test with both DB option and transaction option - transaction option should
+  // take precedence
+  SetUpTransactionDB(/*threshold=*/threshold * 2);
+
+  // Transaction option is lower than DB option, should use transaction option
+  txn_opts.large_txn_commit_optimize_threshold = threshold;
+  txn1 = txn_db->BeginTransaction(wopts, txn_opts, nullptr);
+  ASSERT_OK(txn1->SetName("xid3"));
+  for (uint32_t i = 0; i < threshold; ++i) {
+    ASSERT_OK(
+        txn1->Put("key" + std::to_string(i), "value" + std::to_string(i)));
+  }
+  ASSERT_OK(txn1->Prepare());
+  commit_bypass_memtable = false;
+  ASSERT_OK(txn1->Commit());
+  ASSERT_TRUE(commit_bypass_memtable);
+  delete txn1;
+
+  // Transaction option is higher than DB option, should use transaction option
+  txn_opts.large_txn_commit_optimize_threshold = threshold * 3;
+  txn1 = txn_db->BeginTransaction(wopts, txn_opts, nullptr);
+  ASSERT_OK(txn1->SetName("xid4"));
+  for (uint32_t i = 0; i < threshold * 3 - 1; ++i) {
+    ASSERT_OK(
+        txn1->Put("key" + std::to_string(i), "value" + std::to_string(i)));
+  }
+  ASSERT_OK(txn1->Prepare());
+  commit_bypass_memtable = false;
+  ASSERT_OK(txn1->Commit());
+  ASSERT_FALSE(commit_bypass_memtable);
+  delete txn1;
+
+  SetUpTransactionDB();
+  // Test with multiple column families
+  std::vector<std::string> cfs = {"pk", "sk"};
+  CreateColumnFamilies(cfs, options);
+
+  txn_opts.large_txn_commit_optimize_threshold = threshold;
+
+  // Below threshold
+  auto txn_cf = txn_db->BeginTransaction(wopts, txn_opts, nullptr);
+  ASSERT_OK(txn_cf->SetName("xid_cf_below"));
+  for (uint32_t i = 0; i < threshold - 1; ++i) {
+    ASSERT_OK(txn_cf->Put(handles_[i % 2], "key" + std::to_string(i),
+                          "value" + std::to_string(i)));
+  }
+  ASSERT_OK(txn_cf->Prepare());
+  commit_bypass_memtable = false;
+  ASSERT_OK(txn_cf->Commit());
+  ASSERT_FALSE(commit_bypass_memtable);
+  delete txn_cf;
+
+  // At threshold
+  txn_cf = txn_db->BeginTransaction(wopts, txn_opts, nullptr);
+  ASSERT_OK(txn_cf->SetName("xid_cf_at_threshold"));
+  for (uint32_t i = 0; i < threshold; ++i) {
+    ASSERT_OK(txn_cf->Put(handles_[i % 2], "key" + std::to_string(i),
+                          "value" + std::to_string(i)));
+  }
+  ASSERT_OK(txn_cf->Prepare());
+  commit_bypass_memtable = false;
+  ASSERT_OK(txn_cf->Commit());
+  ASSERT_TRUE(commit_bypass_memtable);
+  delete txn_cf;
+
+  // Test that commit_bypass_memtable takes precedence over
+  // large_txn_commit_optimize_threshold
+  txn_opts.large_txn_commit_optimize_threshold =
+      threshold * 10;                      // High threshold
+  txn_opts.commit_bypass_memtable = true;  // Should override threshold
+
+  txn_cf = txn_db->BeginTransaction(wopts, txn_opts, nullptr);
+  ASSERT_OK(txn_cf->SetName("xid_cf_precedence"));
+  ASSERT_OK(txn_cf->Put(handles_[0], "key1", "value1"));  // Just one operation
+  ASSERT_OK(txn_cf->Prepare());
+  commit_bypass_memtable = false;
+  ASSERT_OK(txn_cf->Commit());
+  ASSERT_TRUE(commit_bypass_memtable);  // Should be true because of
+                                        // commit_bypass_memtable
+  delete txn_cf;
+}
+
 TEST_P(CommitBypassMemtableTest, AtomicFlushTest) {
   const uint32_t threshold = 10;
   SetUpTransactionDB(/*threshold=*/threshold, /*atomic_flush=*/true);

From 1428e950bd7a7678c7a63995337631213416175b Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Fri, 2 May 2025 13:10:06 -0700
Subject: [PATCH 077/500] Bug fix and refactoring on parallel compression
 (#13583)

Summary:
While working on some compression refactoring, I noticed that `NotifyCollectTableCollectorsOnBlockAdd()` was being called from multiple threads (with `parallel_threads` > 1), meaning we were violating the promise that TablePropertiesCollectors need not be thread safe (and typically will not be, for efficiency).

Fixing this is a bit awkward or intrusive. Even though it seems weird to expose `block_compressed_bytes_fast` and `block_compressed_bytes_fast` in the public `BlockAdd()` function, and NOT the actual compressed block size used, there are some Meta-internal uses that would at least require negotiation / coordination to deprecate and remove. So it's probably easiest to just keep the awkward functionality and do the necessary modifications to call from a single thread.

The simplest solution that preserves the functionality with `parallel_threads` > 1 (provide the sampling data, expected ordering between `BlockAdd()` and `AddUserKey()`, no races) is to do the compression sampling in the thread building uncompressed blocks. Specifically, moving `NotifyCollectTableCollectorsOnBlockAdd()` and the compression sampling from `CompressAndVerifyBlock()`, which is called in parallel, to table builder `Flush()`, which is only called serially (per file). Even though this adds some compression to that single thread when sampling is enabled, that should be tolerable without complicating the code or regressing performance. Some related or nearby optimizations are included to ensure this.

* Got rid of a lot of unnecessary indirection and unnecessary fields in BlockRep, which should be a step in improving parallel compression performance (still bad IMHO).
* Restructured some `if`s etc. to streamline some logic

This satisfies my original refactoring need to moving the sampling code higher up the stack from `CompressBlock()`, to set up some other upcoming refactorings. The other caller of `CompressBlock()` (legacy BlobDB) doesn't need it, and in fact is better off calling `CompressData()` directly because it does not appear to be dealing with the various "no compression" outcomes introduced by `CompressBlock()`.

Eventual follow-up:
* Performance data below shows how the overhead of parallel compression can make it slower, with available CPUs, compared to serial compression. This infrastructure should be re-designed/re-engineered to reduce thread creation, context switches, etc. Also, more of the processing such as checksumming could be parallelized. (Things dependent on the block location in the file, such as ChecksumModifierForContext and cache warming, cannot be parallelized.)

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13583
ThreadSanitizer: data race /data/users/peterd/rocksdb/./db_stress_tool/db_stress_table_properties_collector.h:36:5 in rocksdb::DbStressTablePropertiesCollector::BlockAdd(unsigned long, unsigned long, unsigned long)
```

Performance:
```
SUFFIX=`tty | sed 's|/|_|g'`; for ARGS in "-compression_parallel_threads=1 -compression_type=none" "-compression_parallel_threads=1 -compression_type=snappy" "-compression_parallel_threads=4 -compression_type=snappy"; do echo $ARGS; (for I in `seq 1 100`; do ./db_bench -db=/dev/shm/dbbench$SUFFIX --benchmarks=fillseq -num=10000000 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=1000 -fifo_compaction_allow_compaction=0 -disable_wal -write_buffer_size=12000000 $ARGS 2>&1 | grep micros/op; done) | awk '{n++; sum += $5;} END { print int(sum / n); }'; done
```

Average ops/s of 100 runs, running before & after at the same time, using clang DEBUG_LEVEL=0:

-compression_parallel_threads=1 -compression_type=none
Before: 1976319
After: 1983840 (+0.3%)
-compression_parallel_threads=1 -compression_type=snappy
Before: 1945576
After: 1953473 (+0.4%)
-compression_parallel_threads=4 -compression_type=snappy
Before: 1573190
After: 1611881 (+2.4%)
-compression_parallel_threads=4 -sample_for_compression=100 (pretty high sample rate)
Before: 1577167
After: 1589704 (+0.8%)
-compression_parallel_threads=4 -sample_for_compression=10 (crazy high sample rate)
Before: 1581276
After: 1393453 (-11.9%)

As seen, you need a very very high compression sample rate to see a regression. I would expect a setting like 1000 to be more typical.

Test Plan:
Along with existing unit tests + CI, expanded crash test to make its TablePropertiesCollector non-trivial, to exercise the bug (and other potential bugs), which was confirmed with local run of whitebox_crash_test with TSAN:

```

Reviewed By: hx235

Differential Revision: D73944593

Pulled By: pdillinger

fbshipit-source-id: f1dcba4ebdc01e735251037395003945c9b34e62
---
 .../db_stress_table_properties_collector.h    |  31 +-
 .../block_based/block_based_table_builder.cc  | 388 +++++++++---------
 table/block_based/block_based_table_builder.h |  14 -
 .../bug_fixes/parallel_compression_bug.md     |   1 +
 utilities/blob_db/blob_db_impl.cc             |   9 +-
 5 files changed, 223 insertions(+), 220 deletions(-)
 create mode 100644 unreleased_history/bug_fixes/parallel_compression_bug.md

diff --git a/db_stress_tool/db_stress_table_properties_collector.h b/db_stress_tool/db_stress_table_properties_collector.h
index 4723f6fc5d2f..b3f76e446436 100644
--- a/db_stress_tool/db_stress_table_properties_collector.h
+++ b/db_stress_tool/db_stress_table_properties_collector.h
@@ -26,25 +26,50 @@ class DbStressTablePropertiesCollector : public TablePropertiesCollector {
   Status AddUserKey(const Slice& /* key */, const Slice& /* value */,
                     EntryType /*type*/, SequenceNumber /*seq*/,
                     uint64_t /*file_size*/) override {
+    ++keys_added;
+    ++all_calls;
     return Status::OK();
   }
 
-  Status Finish(UserCollectedProperties* /* properties */) override {
+  void BlockAdd(uint64_t /* block_uncomp_bytes */,
+                uint64_t /* block_compressed_bytes_fast */,
+                uint64_t /* block_compressed_bytes_slow */) override {
+    ++blocks_added;
+    ++all_calls;
+  }
+
+  Status Finish(UserCollectedProperties* properties) override {
+    ++all_calls;
+    (*properties)["db_stress_collector_property"] =
+        std::to_string(keys_added) + ";" + std::to_string(blocks_added) + ";" +
+        std::to_string(all_calls);
     return Status::OK();
   }
 
   UserCollectedProperties GetReadableProperties() const override {
-    return UserCollectedProperties{};
+    UserCollectedProperties props;
+    const_cast<DbStressTablePropertiesCollector*>(this)->Finish(&props);
+    return props;
   }
 
   const char* Name() const override {
     return "DbStressTablePropertiesCollector";
   }
 
-  bool NeedCompact() const override { return need_compact_; }
+  bool NeedCompact() const override {
+    ++all_calls;
+    return need_compact_;
+  }
 
  private:
   const bool need_compact_;
+  // These are tracked to detect race conditions that would arise from RocksDB
+  // invoking TablePropertiesCollector functions in an unsynchronized way, as
+  // TablePropertiesCollectors are allowed (encouraged) not to be thread safe.
+  size_t keys_added = 0;
+  size_t blocks_added = 0;
+  // Including race between BlockAdd and AddUserKey (etc.)
+  mutable size_t all_calls = 0;
 };
 
 // A `DbStressTablePropertiesCollectorFactory` creates
diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc
index 7f459aa836e2..4bfac6148e07 100644
--- a/table/block_based/block_based_table_builder.cc
+++ b/table/block_based/block_based_table_builder.cc
@@ -115,59 +115,17 @@ bool GoodCompressionRatio(size_t compressed_size, size_t uncomp_size,
          10;
 }
 
-}  // namespace
-
 // format_version is the block format as defined in include/rocksdb/table.h
-Slice CompressBlock(const Slice& uncompressed_data, const CompressionInfo& info,
-                    CompressionType* type, uint32_t format_version,
-                    uint64_t sample_for_compression,
-                    std::string* compressed_output,
-                    std::string* sampled_output_fast,
-                    std::string* sampled_output_slow) {
-  assert(type);
+CompressionType CompressBlock(const Slice& uncompressed_data,
+                              const CompressionInfo& info,
+                              uint32_t format_version,
+                              std::string* compressed_output) {
   assert(compressed_output);
   assert(compressed_output->empty());
 
-  // If requested, we sample one in every N block with a
-  // fast and slow compression algorithm and report the stats.
-  // The users can use these stats to decide if it is worthwhile
-  // enabling compression and they also get a hint about which
-  // compression algorithm wil be beneficial.
-  if (sample_for_compression > 0 &&
-      Random::GetTLSInstance()->OneIn(
-          static_cast<int>(sample_for_compression))) {
-    // Sampling with a fast compression algorithm
-    if (sampled_output_fast && (LZ4_Supported() || Snappy_Supported())) {
-      CompressionType c =
-          LZ4_Supported() ? kLZ4Compression : kSnappyCompression;
-      CompressionOptions options;
-      CompressionContext context(c, options);
-      CompressionInfo info_tmp(options, context,
-                               CompressionDict::GetEmptyDict(), c);
-
-      CompressData(uncompressed_data, info_tmp,
-                   GetCompressFormatForVersion(format_version),
-                   sampled_output_fast);
-    }
-
-    // Sampling with a slow but high-compression algorithm
-    if (sampled_output_slow && (ZSTD_Supported() || Zlib_Supported())) {
-      CompressionType c = ZSTD_Supported() ? kZSTD : kZlibCompression;
-      CompressionOptions options;
-      CompressionContext context(c, options);
-      CompressionInfo info_tmp(options, context,
-                               CompressionDict::GetEmptyDict(), c);
-
-      CompressData(uncompressed_data, info_tmp,
-                   GetCompressFormatForVersion(format_version),
-                   sampled_output_slow);
-    }
-  }
-
   int max_compressed_bytes_per_kb = info.options().max_compressed_bytes_per_kb;
   if (info.type() == kNoCompression || max_compressed_bytes_per_kb <= 0) {
-    *type = kNoCompression;
-    return uncompressed_data;
+    return kNoCompression;
   }
 
   // Actually compress the data; if the compression method is not supported,
@@ -175,22 +133,21 @@ Slice CompressBlock(const Slice& uncompressed_data, const CompressionInfo& info,
   if (!CompressData(uncompressed_data, info,
                     GetCompressFormatForVersion(format_version),
                     compressed_output)) {
-    *type = kNoCompression;
-    return uncompressed_data;
+    return kNoCompression;
   }
 
   // Check the compression ratio; if it's not good enough, just fall back to
   // uncompressed
   if (!GoodCompressionRatio(compressed_output->size(), uncompressed_data.size(),
                             max_compressed_bytes_per_kb)) {
-    *type = kNoCompression;
-    return uncompressed_data;
+    return kNoCompression;
   }
 
-  *type = info.type();
-  return *compressed_output;
+  return info.type();
 }
 
+}  // namespace
+
 // kBlockBasedTableMagicNumber was picked by running
 //    echo rocksdb.table.block_based | sha1sum
 // and taking the leading 64 bits.
@@ -359,7 +316,7 @@ struct BlockBasedTableBuilder::Rep {
 
   BlockHandle pending_handle;  // Handle to add to index block
 
-  std::string compressed_output;
+  std::string single_threaded_compressed_output;
   std::unique_ptr<FlushBlockPolicy> flush_block_policy;
 
   std::vector<std::unique_ptr<InternalTblPropColl>> table_properties_collectors;
@@ -695,36 +652,13 @@ struct BlockBasedTableBuilder::ParallelCompressionRep {
     }
 
    private:
-    const size_t kKeysInitSize = 32;
+    static constexpr size_t kKeysInitSize = 32;
     std::vector<std::string> keys_;
     size_t size_;
   };
-  std::unique_ptr<Keys> curr_block_keys;
+  Keys curr_block_keys;
 
-  class BlockRepSlot;
-
-  // BlockRep instances are fetched from and recycled to
-  // block_rep_pool during parallel compression.
-  struct BlockRep {
-    Slice contents;
-    Slice compressed_contents;
-    std::unique_ptr<std::string> data;
-    std::unique_ptr<std::string> compressed_data;
-    CompressionType compression_type;
-    std::unique_ptr<std::string> first_key_in_next_block;
-    std::unique_ptr<Keys> keys;
-    std::unique_ptr<BlockRepSlot> slot;
-    Status status;
-  };
-  // Use a vector of BlockRep as a buffer for a determined number
-  // of BlockRep structures. All data referenced by pointers in
-  // BlockRep will be freed when this vector is destructed.
-  using BlockRepBuffer = std::vector<BlockRep>;
-  BlockRepBuffer block_rep_buf;
-  // Use a thread-safe queue for concurrent access from block
-  // building thread and writer thread.
-  using BlockRepPool = WorkQueue<BlockRep*>;
-  BlockRepPool block_rep_pool;
+  struct BlockRep;
 
   // Use BlockRepSlot to keep block order in write thread.
   // slot_ will pass references to BlockRep
@@ -744,6 +678,32 @@ struct BlockBasedTableBuilder::ParallelCompressionRep {
     WorkQueue<BlockRep*> slot_;
   };
 
+  // BlockRep instances are fetched from and recycled to
+  // block_rep_pool during parallel compression.
+  struct ALIGN_AS(CACHE_LINE_SIZE) BlockRep {
+    // Uncompressed block contents
+    std::string uncompressed;
+    std::string compressed;
+    CompressionType compression_type = kNoCompression;
+    // For efficiency, the std::string is repeatedly overwritten without
+    // checking for "has no value". Only at the end of its life will it be
+    // assigned "no value". Thus, it needs to start with a value.
+    std::optional<std::string> first_key_in_next_block = std::string{};
+    Keys keys;
+    BlockRepSlot slot;
+    Status status;
+  };
+
+  // Use a vector of BlockRep as a buffer for a determined number
+  // of BlockRep structures. All data referenced by pointers in
+  // BlockRep will be freed when this vector is destructed.
+  using BlockRepBuffer = std::vector<BlockRep>;
+  BlockRepBuffer block_rep_buf;
+  // Use a thread-safe queue for concurrent access from block
+  // building thread and writer thread.
+  using BlockRepPool = WorkQueue<BlockRep*>;
+  BlockRepPool block_rep_pool;
+
   // Compression queue will pass references to BlockRep in block_rep_buf,
   // and those references are always valid before the destruction of
   // block_rep_buf.
@@ -870,22 +830,13 @@ struct BlockBasedTableBuilder::ParallelCompressionRep {
   std::mutex first_block_mutex;
 
   explicit ParallelCompressionRep(uint32_t parallel_threads)
-      : curr_block_keys(new Keys()),
-        block_rep_buf(parallel_threads),
+      : block_rep_buf(parallel_threads),
         block_rep_pool(parallel_threads),
         compress_queue(parallel_threads),
         write_queue(parallel_threads),
         first_block_processed(false) {
     for (uint32_t i = 0; i < parallel_threads; i++) {
-      block_rep_buf[i].contents = Slice();
-      block_rep_buf[i].compressed_contents = Slice();
-      block_rep_buf[i].data.reset(new std::string());
-      block_rep_buf[i].compressed_data.reset(new std::string());
-      block_rep_buf[i].compression_type = CompressionType();
-      block_rep_buf[i].first_key_in_next_block.reset(new std::string());
-      block_rep_buf[i].keys.reset(new Keys());
-      block_rep_buf[i].slot.reset(new BlockRepSlot());
-      block_rep_buf[i].status = Status::OK();
+      // Prime the queue of available BlockReps
       block_rep_pool.push(&block_rep_buf[i]);
     }
   }
@@ -900,10 +851,9 @@ struct BlockBasedTableBuilder::ParallelCompressionRep {
     BlockRep* block_rep =
         PrepareBlockInternal(compression_type, first_key_in_next_block);
     assert(block_rep != nullptr);
-    data_block->SwapAndReset(*(block_rep->data));
-    block_rep->contents = *(block_rep->data);
+    data_block->SwapAndReset(block_rep->uncompressed);
     std::swap(block_rep->keys, curr_block_keys);
-    curr_block_keys->Clear();
+    curr_block_keys.Clear();
     return block_rep;
   }
 
@@ -915,9 +865,8 @@ struct BlockBasedTableBuilder::ParallelCompressionRep {
     BlockRep* block_rep =
         PrepareBlockInternal(compression_type, first_key_in_next_block);
     assert(block_rep != nullptr);
-    std::swap(*(block_rep->data), *data_block);
-    block_rep->contents = *(block_rep->data);
-    block_rep->keys->SwapAssign(*keys);
+    std::swap(block_rep->uncompressed, *data_block);
+    block_rep->keys.SwapAssign(*keys);
     return block_rep;
   }
 
@@ -925,7 +874,7 @@ struct BlockBasedTableBuilder::ParallelCompressionRep {
   void EmitBlock(BlockRep* block_rep) {
     assert(block_rep != nullptr);
     assert(block_rep->status.ok());
-    if (!write_queue.push(block_rep->slot.get())) {
+    if (!write_queue.push(&block_rep->slot)) {
       return;
     }
     if (!compress_queue.push(block_rep)) {
@@ -943,7 +892,7 @@ struct BlockBasedTableBuilder::ParallelCompressionRep {
   // Reap a block from compression thread
   void ReapBlock(BlockRep* block_rep) {
     assert(block_rep != nullptr);
-    block_rep->compressed_data->clear();
+    block_rep->compressed.clear();
     block_rep_pool.push(block_rep);
 
     if (!first_block_processed.load(std::memory_order_relaxed)) {
@@ -960,12 +909,10 @@ struct BlockBasedTableBuilder::ParallelCompressionRep {
     block_rep_pool.pop(block_rep);
     assert(block_rep != nullptr);
 
-    assert(block_rep->data);
-
     block_rep->compression_type = compression_type;
 
     if (first_key_in_next_block == nullptr) {
-      block_rep->first_key_in_next_block.reset(nullptr);
+      block_rep->first_key_in_next_block = {};
     } else {
       block_rep->first_key_in_next_block->assign(
           first_key_in_next_block->data(), first_key_in_next_block->size());
@@ -993,6 +940,8 @@ BlockBasedTableBuilder::BlockBasedTableBuilder(
 
   if (rep_->IsParallelCompressionEnabled()) {
     StartParallelCompression();
+  } else if (rep_->compression_type != kNoCompression) {
+    rep_->single_threaded_compressed_output.reserve(table_options.block_size);
   }
 }
 
@@ -1060,7 +1009,7 @@ void BlockBasedTableBuilder::Add(const Slice& ikey, const Slice& value) {
       // blocks.
       if (ok() && r->state == Rep::State::kUnbuffered) {
         if (r->IsParallelCompressionEnabled()) {
-          r->pc_rep->curr_block_keys->Clear();
+          r->pc_rep->curr_block_keys.Clear();
         } else {
           r->index_builder->AddIndexEntry(r->last_ikey, &ikey,
                                           r->pending_handle,
@@ -1073,7 +1022,7 @@ void BlockBasedTableBuilder::Add(const Slice& ikey, const Slice& value) {
     // builder after being added to index builder.
     if (r->state == Rep::State::kUnbuffered) {
       if (r->IsParallelCompressionEnabled()) {
-        r->pc_rep->curr_block_keys->PushBack(ikey);
+        r->pc_rep->curr_block_keys.PushBack(ikey);
       } else {
         if (r->filter_builder != nullptr) {
           r->filter_builder->AddWithPrevKey(
@@ -1150,48 +1099,109 @@ void BlockBasedTableBuilder::Flush() {
   if (r->data_block.empty()) {
     return;
   }
-  if (r->IsParallelCompressionEnabled() &&
-      r->state == Rep::State::kUnbuffered) {
-    r->data_block.Finish();
+
+  Slice uncompressed_block_data = r->data_block.Finish();
+
+  // NOTE: compression sampling is done here in the same thread as building
+  // the uncompressed block because of the requirements to call table
+  // property collectors:
+  // * BlockAdd function expects block_compressed_bytes_{fast,slow} for
+  //   historical reasons. Probably a hassle to remove.
+  // * Collector is not thread safe so calls need to be serialized/synchronized.
+  // * Ideally, AddUserKey and BlockAdd calls need to line up such that a
+  //   reported block corresponds to all the keys reported since the previous
+  //   block.
+
+  // If requested, we sample one in every N block with a
+  // fast and slow compression algorithm and report the stats.
+  // The users can use these stats to decide if it is worthwhile
+  // enabling compression and they also get a hint about which
+  // compression algorithm wil be beneficial.
+  if (r->sample_for_compression > 0 &&
+      Random::GetTLSInstance()->OneIn(
+          static_cast<int>(r->sample_for_compression))) {
+    std::string sampled_output_fast;
+    std::string sampled_output_slow;
+
+    // Sampling with a fast compression algorithm
+    if (LZ4_Supported() || Snappy_Supported()) {
+      CompressionType c =
+          LZ4_Supported() ? kLZ4Compression : kSnappyCompression;
+      CompressionOptions options;
+      CompressionContext context(c, options);
+      CompressionInfo info_tmp(options, context,
+                               CompressionDict::GetEmptyDict(), c);
+
+      CompressData(uncompressed_block_data, info_tmp,
+                   GetCompressFormatForVersion(r->table_options.format_version),
+                   &sampled_output_fast);
+    }
+
+    // Sampling with a slow but high-compression algorithm
+    if (ZSTD_Supported() || Zlib_Supported()) {
+      CompressionType c = ZSTD_Supported() ? kZSTD : kZlibCompression;
+      CompressionOptions options;
+      CompressionContext context(c, options);
+      CompressionInfo info_tmp(options, context,
+                               CompressionDict::GetEmptyDict(), c);
+
+      CompressData(uncompressed_block_data, info_tmp,
+                   GetCompressFormatForVersion(r->table_options.format_version),
+                   &sampled_output_slow);
+    }
+
+    if (sampled_output_slow.size() > 0 || sampled_output_fast.size() > 0) {
+      // Currently compression sampling is only enabled for data block.
+      r->sampled_input_data_bytes.fetch_add(uncompressed_block_data.size(),
+                                            std::memory_order_relaxed);
+      r->sampled_output_slow_data_bytes.fetch_add(sampled_output_slow.size(),
+                                                  std::memory_order_relaxed);
+      r->sampled_output_fast_data_bytes.fetch_add(sampled_output_fast.size(),
+                                                  std::memory_order_relaxed);
+    }
+
+    NotifyCollectTableCollectorsOnBlockAdd(
+        r->table_properties_collectors, uncompressed_block_data.size(),
+        sampled_output_slow.size(), sampled_output_fast.size());
+  } else {
+    NotifyCollectTableCollectorsOnBlockAdd(
+        r->table_properties_collectors, uncompressed_block_data.size(),
+        0 /*block_compressed_bytes_slow*/, 0 /*block_compressed_bytes_fast*/);
+  }
+
+  if (rep_->state == Rep::State::kBuffered) {
+    std::string uncompressed_block_holder;
+    uncompressed_block_holder.reserve(rep_->table_options.block_size);
+    r->data_block.SwapAndReset(uncompressed_block_holder);
+    assert(uncompressed_block_data.size() == uncompressed_block_holder.size());
+    rep_->data_block_buffers.emplace_back(std::move(uncompressed_block_holder));
+    rep_->data_begin_offset += uncompressed_block_data.size();
+  } else if (r->IsParallelCompressionEnabled()) {
+    assert(rep_->state == Rep::State::kUnbuffered);
     ParallelCompressionRep::BlockRep* block_rep = r->pc_rep->PrepareBlock(
         r->compression_type, r->first_key_in_next_block, &(r->data_block));
     assert(block_rep != nullptr);
-    r->pc_rep->file_size_estimator.EmitBlock(block_rep->data->size(),
+    r->pc_rep->file_size_estimator.EmitBlock(block_rep->uncompressed.size(),
                                              r->get_offset());
     r->pc_rep->EmitBlock(block_rep);
   } else {
-    WriteBlock(&r->data_block, &r->pending_handle, BlockType::kData);
+    assert(rep_->state == Rep::State::kUnbuffered);
+    WriteBlock(uncompressed_block_data, &r->pending_handle, BlockType::kData);
+    r->data_block.Reset();
   }
 }
 
-void BlockBasedTableBuilder::WriteBlock(BlockBuilder* block,
-                                        BlockHandle* handle,
-                                        BlockType block_type) {
-  block->Finish();
-  std::string uncompressed_block_data;
-  uncompressed_block_data.reserve(rep_->table_options.block_size);
-  block->SwapAndReset(uncompressed_block_data);
-  if (rep_->state == Rep::State::kBuffered) {
-    assert(block_type == BlockType::kData);
-    rep_->data_block_buffers.emplace_back(std::move(uncompressed_block_data));
-    rep_->data_begin_offset += rep_->data_block_buffers.back().size();
-    return;
-  }
-  WriteBlock(uncompressed_block_data, handle, block_type);
-}
-
 void BlockBasedTableBuilder::WriteBlock(const Slice& uncompressed_block_data,
                                         BlockHandle* handle,
                                         BlockType block_type) {
   Rep* r = rep_;
   assert(r->state == Rep::State::kUnbuffered);
-  Slice block_contents;
   CompressionType type;
   Status compress_status;
   bool is_data_block = block_type == BlockType::kData;
   CompressAndVerifyBlock(uncompressed_block_data, is_data_block,
                          *(r->compression_ctxs[0]), r->verify_ctxs[0].get(),
-                         &(r->compressed_output), &(block_contents), &type,
+                         &r->single_threaded_compressed_output, &type,
                          &compress_status);
   r->SetStatus(compress_status);
   if (!ok()) {
@@ -1200,10 +1210,12 @@ void BlockBasedTableBuilder::WriteBlock(const Slice& uncompressed_block_data,
 
   TEST_SYNC_POINT_CALLBACK(
       "BlockBasedTableBuilder::WriteBlock:TamperWithCompressedData",
-      &r->compressed_output);
-  WriteMaybeCompressedBlock(block_contents, type, handle, block_type,
-                            &uncompressed_block_data);
-  r->compressed_output.clear();
+      &r->single_threaded_compressed_output);
+  WriteMaybeCompressedBlock(type == kNoCompression
+                                ? uncompressed_block_data
+                                : Slice(r->single_threaded_compressed_output),
+                            type, handle, block_type, &uncompressed_block_data);
+  r->single_threaded_compressed_output.clear();
   if (is_data_block) {
     r->props.data_size = r->get_offset();
     ++r->props.num_data_blocks;
@@ -1216,34 +1228,32 @@ void BlockBasedTableBuilder::BGWorkCompression(
   ParallelCompressionRep::BlockRep* block_rep = nullptr;
   while (rep_->pc_rep->compress_queue.pop(block_rep)) {
     assert(block_rep != nullptr);
-    CompressAndVerifyBlock(block_rep->contents, true, /* is_data_block*/
-                           compression_ctx, verify_ctx,
-                           block_rep->compressed_data.get(),
-                           &block_rep->compressed_contents,
-                           &(block_rep->compression_type), &block_rep->status);
-    block_rep->slot->Fill(block_rep);
+    // Skip compression if we are aborting anyway
+    if (ok()) {
+      CompressAndVerifyBlock(block_rep->uncompressed, true, /* is_data_block*/
+                             compression_ctx, verify_ctx,
+                             &block_rep->compressed,
+                             &block_rep->compression_type, &block_rep->status);
+    }
+    block_rep->slot.Fill(block_rep);
   }
 }
 
 void BlockBasedTableBuilder::CompressAndVerifyBlock(
     const Slice& uncompressed_block_data, bool is_data_block,
     const CompressionContext& compression_ctx, UncompressionContext* verify_ctx,
-    std::string* compressed_output, Slice* block_contents,
-    CompressionType* type, Status* out_status) {
+    std::string* compressed_output, CompressionType* result_compression_type,
+    Status* out_status) {
   Rep* r = rep_;
-  bool is_status_ok = ok();
-  if (!r->IsParallelCompressionEnabled()) {
-    assert(is_status_ok);
-  }
 
-  if (is_status_ok && uncompressed_block_data.size() < kCompressionSizeLimit) {
+  CompressionType type = r->compression_type;
+  if (uncompressed_block_data.size() < kCompressionSizeLimit) {
     StopWatchNano timer(
         r->ioptions.clock,
         ShouldReportDetailedTime(r->ioptions.env, r->ioptions.stats));
 
-    *type = r->compression_type;
 #ifndef NDEBUG
-    if (r->compression_type != kNoCompression &&
+    if (type != kNoCompression &&
         g_hack_mixed_compression_in_block_based_table.LoadRelaxed() > 0U) {
       // If zstd is in the mix, the compression_name table property needs to be
       // set to it, for proper handling of context and dictionaries.
@@ -1251,14 +1261,10 @@ void BlockBasedTableBuilder::CompressAndVerifyBlock(
       const auto& compressions = GetSupportedCompressions();
       auto counter =
           g_hack_mixed_compression_in_block_based_table.FetchAddRelaxed(1);
-      *type = compressions[counter % compressions.size()];
+      type = compressions[counter % compressions.size()];
     }
 #endif  // !NDEBUG
 
-    if (is_data_block) {
-      r->compressible_input_data_bytes.fetch_add(uncompressed_block_data.size(),
-                                                 std::memory_order_relaxed);
-    }
     const CompressionDict* compression_dict;
     if (!is_data_block || r->compression_dict == nullptr) {
       compression_dict = &CompressionDict::GetEmptyDict();
@@ -1267,35 +1273,15 @@ void BlockBasedTableBuilder::CompressAndVerifyBlock(
     }
     assert(compression_dict != nullptr);
     CompressionInfo compression_info(r->compression_opts, compression_ctx,
-                                     *compression_dict, *type);
+                                     *compression_dict, type);
 
-    std::string sampled_output_fast;
-    std::string sampled_output_slow;
-    *block_contents = CompressBlock(
-        uncompressed_block_data, compression_info, type,
-        r->table_options.format_version,
-        is_data_block ? r->sample_for_compression : 0U, compressed_output,
-        &sampled_output_fast, &sampled_output_slow);
-
-    if (sampled_output_slow.size() > 0 || sampled_output_fast.size() > 0) {
-      // Currently compression sampling is only enabled for data block.
-      assert(is_data_block);
-      r->sampled_input_data_bytes.fetch_add(uncompressed_block_data.size(),
-                                            std::memory_order_relaxed);
-      r->sampled_output_slow_data_bytes.fetch_add(sampled_output_slow.size(),
-                                                  std::memory_order_relaxed);
-      r->sampled_output_fast_data_bytes.fetch_add(sampled_output_fast.size(),
-                                                  std::memory_order_relaxed);
-    }
-    // notify collectors on block add
-    NotifyCollectTableCollectorsOnBlockAdd(
-        r->table_properties_collectors, uncompressed_block_data.size(),
-        sampled_output_fast.size(), sampled_output_slow.size());
+    type = CompressBlock(uncompressed_block_data, compression_info,
+                         r->table_options.format_version, compressed_output);
 
     // Some of the compression algorithms are known to be unreliable. If
     // the verify_compression flag is set then try to de-compress the
     // compressed data and compare to the input.
-    if (*type != kNoCompression && r->table_options.verify_compression) {
+    if (r->table_options.verify_compression && type != kNoCompression) {
       // Retrieve the uncompressed contents into a new buffer
       const UncompressionDict* verify_dict;
       if (!is_data_block || r->verify_dict == nullptr) {
@@ -1307,9 +1293,10 @@ void BlockBasedTableBuilder::CompressAndVerifyBlock(
       BlockContents contents;
       UncompressionInfo uncompression_info(*verify_ctx, *verify_dict,
                                            r->compression_type);
-      Status uncompress_status = UncompressBlockData(
-          uncompression_info, block_contents->data(), block_contents->size(),
-          &contents, r->table_options.format_version, r->ioptions);
+      Status uncompress_status =
+          UncompressBlockData(uncompression_info, compressed_output->data(),
+                              compressed_output->size(), &contents,
+                              r->table_options.format_version, r->ioptions);
 
       if (uncompress_status.ok()) {
         bool data_match = contents.data.compare(uncompressed_block_data) == 0;
@@ -1319,36 +1306,38 @@ void BlockBasedTableBuilder::CompressAndVerifyBlock(
               "Decompressed block did not match pre-compression block";
           ROCKS_LOG_ERROR(r->ioptions.logger, "%s", msg);
           *out_status = Status::Corruption(msg);
-          *type = kNoCompression;
+          type = kNoCompression;
         }
       } else {
         // Decompression reported an error. abort.
         *out_status = Status::Corruption(std::string("Could not decompress: ") +
                                          uncompress_status.getState());
-        *type = kNoCompression;
+        type = kNoCompression;
       }
     }
     if (timer.IsStarted()) {
       RecordTimeToHistogram(r->ioptions.stats, COMPRESSION_TIMES_NANOS,
                             timer.ElapsedNanos());
     }
+    if (is_data_block) {
+      r->compressible_input_data_bytes.fetch_add(uncompressed_block_data.size(),
+                                                 std::memory_order_relaxed);
+      r->uncompressible_input_data_bytes.fetch_add(kBlockTrailerSize,
+                                                   std::memory_order_relaxed);
+    }
   } else {
     // Status is not OK, or block is too big to be compressed.
     if (is_data_block) {
       r->uncompressible_input_data_bytes.fetch_add(
-          uncompressed_block_data.size(), std::memory_order_relaxed);
+          uncompressed_block_data.size() + kBlockTrailerSize,
+          std::memory_order_relaxed);
     }
-    *type = kNoCompression;
-  }
-  if (is_data_block) {
-    r->uncompressible_input_data_bytes.fetch_add(kBlockTrailerSize,
-                                                 std::memory_order_relaxed);
+    type = kNoCompression;
   }
 
   // Abort compression if the block is too big, or did not pass
   // verification.
-  if (*type == kNoCompression) {
-    *block_contents = uncompressed_block_data;
+  if (type == kNoCompression) {
     bool compression_attempted = !compressed_output->empty();
     RecordTick(r->ioptions.stats, compression_attempted
                                       ? NUMBER_BLOCK_COMPRESSION_REJECTED
@@ -1364,6 +1353,7 @@ void BlockBasedTableBuilder::CompressAndVerifyBlock(
     RecordTick(r->ioptions.stats, BYTES_COMPRESSED_TO,
                compressed_output->size());
   }
+  *result_compression_type = type;
 }
 
 void BlockBasedTableBuilder::WriteMaybeCompressedBlock(
@@ -1501,8 +1491,8 @@ void BlockBasedTableBuilder::BGWorkWriteMaybeCompressedBlock() {
     }
 
     Slice prev_key_no_ts = prev_block_last_key_no_ts;
-    for (size_t i = 0; i < block_rep->keys->Size(); i++) {
-      auto& key = (*block_rep->keys)[i];
+    for (size_t i = 0; i < block_rep->keys.Size(); i++) {
+      auto& key = block_rep->keys[i];
       if (r->filter_builder != nullptr) {
         Slice key_no_ts = ExtractUserKeyAndStripTimestamp(key, r->ts_sz);
         r->filter_builder->AddWithPrevKey(key_no_ts, prev_key_no_ts);
@@ -1516,10 +1506,14 @@ void BlockBasedTableBuilder::BGWorkWriteMaybeCompressedBlock() {
     }
 
     r->pc_rep->file_size_estimator.SetCurrBlockUncompSize(
-        block_rep->data->size());
-    WriteMaybeCompressedBlock(block_rep->compressed_contents,
+        block_rep->uncompressed.size());
+    Slice compressed = block_rep->compressed;
+    Slice uncompressed = block_rep->uncompressed;
+    WriteMaybeCompressedBlock(block_rep->compression_type == kNoCompression
+                                  ? uncompressed
+                                  : compressed,
                               block_rep->compression_type, &r->pending_handle,
-                              BlockType::kData, &block_rep->contents);
+                              BlockType::kData, &uncompressed);
     if (!ok()) {
       break;
     }
@@ -1527,15 +1521,15 @@ void BlockBasedTableBuilder::BGWorkWriteMaybeCompressedBlock() {
     r->props.data_size = r->get_offset();
     ++r->props.num_data_blocks;
 
-    if (block_rep->first_key_in_next_block == nullptr) {
-      r->index_builder->AddIndexEntry(block_rep->keys->Back(), nullptr,
+    if (!block_rep->first_key_in_next_block.has_value()) {
+      r->index_builder->AddIndexEntry(block_rep->keys.Back(), nullptr,
                                       r->pending_handle,
                                       &r->index_separator_scratch);
     } else {
       Slice first_key_in_next_block =
           Slice(*block_rep->first_key_in_next_block);
       r->index_builder->AddIndexEntry(
-          block_rep->keys->Back(), &first_key_in_next_block, r->pending_handle,
+          block_rep->keys.Back(), &first_key_in_next_block, r->pending_handle,
           &r->index_separator_scratch);
     }
 
@@ -2022,7 +2016,7 @@ void BlockBasedTableBuilder::EnterUnbuffered() {
           r->compression_type, first_key_in_next_block_ptr, &data_block, &keys);
 
       assert(block_rep != nullptr);
-      r->pc_rep->file_size_estimator.EmitBlock(block_rep->data->size(),
+      r->pc_rep->file_size_estimator.EmitBlock(block_rep->uncompressed.size(),
                                                r->get_offset());
       r->pc_rep->EmitBlock(block_rep);
     } else {
diff --git a/table/block_based/block_based_table_builder.h b/table/block_based/block_based_table_builder.h
index 8bb5e3c074ad..a2501e962198 100644
--- a/table/block_based/block_based_table_builder.h
+++ b/table/block_based/block_based_table_builder.h
@@ -118,12 +118,6 @@ class BlockBasedTableBuilder : public TableBuilder {
   // REQUIRES: `rep_->state == kBuffered`
   void EnterUnbuffered();
 
-  // Call block's Finish() method and then
-  // - in buffered mode, buffer the uncompressed block contents.
-  // - in unbuffered mode, write the compressed block contents to file.
-  void WriteBlock(BlockBuilder* block, BlockHandle* handle,
-                  BlockType blocktype);
-
   // Compress and write block content to the file.
   void WriteBlock(const Slice& block_contents, BlockHandle* handle,
                   BlockType block_type);
@@ -185,7 +179,6 @@ class BlockBasedTableBuilder : public TableBuilder {
                               const CompressionContext& compression_ctx,
                               UncompressionContext* verify_ctx,
                               std::string* compressed_output,
-                              Slice* result_block_contents,
                               CompressionType* result_compression_type,
                               Status* out_status);
 
@@ -200,13 +193,6 @@ class BlockBasedTableBuilder : public TableBuilder {
   void StopParallelCompression();
 };
 
-Slice CompressBlock(const Slice& uncompressed_data, const CompressionInfo& info,
-                    CompressionType* type, uint32_t format_version,
-                    uint64_t sample_for_compression,
-                    std::string* compressed_output,
-                    std::string* sampled_output_fast,
-                    std::string* sampled_output_slow);
-
 #ifndef NDEBUG
 // 0 == disable the hack
 // > 0 => counter for rotating through compression types
diff --git a/unreleased_history/bug_fixes/parallel_compression_bug.md b/unreleased_history/bug_fixes/parallel_compression_bug.md
new file mode 100644
index 000000000000..849f2d595a16
--- /dev/null
+++ b/unreleased_history/bug_fixes/parallel_compression_bug.md
@@ -0,0 +1 @@
+* Fixed a potential data race with `CompressionOptions::parallel_threads > 1` and a `TablePropertiesCollector` overriding `BlockAdd()`.
diff --git a/utilities/blob_db/blob_db_impl.cc b/utilities/blob_db/blob_db_impl.cc
index 2ed8761fe1ac..aba7e5d2fa7a 100644
--- a/utilities/blob_db/blob_db_impl.cc
+++ b/utilities/blob_db/blob_db_impl.cc
@@ -29,9 +29,6 @@
 #include "rocksdb/iterator.h"
 #include "rocksdb/utilities/stackable_db.h"
 #include "rocksdb/utilities/transaction.h"
-#include "table/block_based/block.h"
-#include "table/block_based/block_based_table_builder.h"
-#include "table/block_based/block_builder.h"
 #include "table/meta_blocks.h"
 #include "test_util/sync_point.h"
 #include "util/cast_util.h"
@@ -1163,9 +1160,9 @@ Slice BlobDBImpl::GetCompressedSlice(const Slice& raw,
   CompressionOptions opts;
   CompressionContext context(type, opts);
   CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(), type);
-  CompressBlock(raw, info, &type, kBlockBasedTableVersionFormat,
-                0 /* sample_for_compression */, compression_output, nullptr,
-                nullptr);
+  CompressData(raw, info,
+               GetCompressFormatForVersion(kBlockBasedTableVersionFormat),
+               compression_output);
   return *compression_output;
 }
 

From f49d76b7ad307e76b5a830afed525ca75efc32ed Mon Sep 17 00:00:00 2001
From: Changyu Bi <changyubi@meta.com>
Date: Mon, 5 May 2025 17:42:57 -0700
Subject: [PATCH 078/500] Clarify that `memtable_op_scan_flush_trigger` does
 not support tailing iterator (#13586)

Summary:
clarify in comments and fix one implementation under NewIterator where option `memtable_op_scan_flush_trigger` does not work correctly with tailing iterator yet. This is because tailing iterator can rebuild iterator internally which reads from a newer memtable, and DBIter's reference to active memtable needs to be refreshed. This PR clarifies that `memtable_op_scan_flush_trigger` will have no effect on tailing iterator. We can add the support in the future if needed.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13586

Test Plan: existing tests.

Reviewed By: jaykorean

Differential Revision: D74108099

Pulled By: cbi42

fbshipit-source-id: 7c6608485d57755abc44f3be0b3c5d82a7bc5ca9
---
 db/arena_wrapped_db_iter.cc        |   2 +-
 db/db_impl/db_impl.cc              |  20 ++--
 db/db_iter.h                       |   8 +-
 db/db_iter_stress_test.cc          |   2 +-
 db/db_iter_test.cc                 | 166 +++++++++++++++--------------
 db/forward_iterator.h              |   1 +
 include/rocksdb/advanced_options.h |   2 +
 7 files changed, 104 insertions(+), 97 deletions(-)

diff --git a/db/arena_wrapped_db_iter.cc b/db/arena_wrapped_db_iter.cc
index d24a918368ba..96441d5d303e 100644
--- a/db/arena_wrapped_db_iter.cc
+++ b/db/arena_wrapped_db_iter.cc
@@ -55,7 +55,7 @@ void ArenaWrappedDBIter::Init(
   db_iter_ = DBIter::NewIter(
       env, read_options_, ioptions, mutable_cf_options,
       ioptions.user_comparator, /*internal_iter=*/nullptr, version, sequence,
-      read_callback, cfh, expose_blob_index, active_mem, &arena_);
+      read_callback, active_mem, cfh, expose_blob_index, &arena_);
 
   sv_number_ = version_number;
   allow_refresh_ = allow_refresh;
diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc
index 8ecbc3660194..cfb0ea07ef09 100644
--- a/db/db_impl/db_impl.cc
+++ b/db/db_impl/db_impl.cc
@@ -3886,12 +3886,14 @@ Iterator* DBImpl::NewIterator(const ReadOptions& _read_options,
 
     auto iter = new ForwardIterator(this, read_options, cfd, sv,
                                     /* allow_unprepared_value */ true);
+    // TODO(cbi): Add support for `memtable_op_scan_flush_trigger` for tailing
+    // iterator. This requires refreshing DBIter's pointer to active_mem when
+    // tailing iterator refreshes to new memtable internally.
     result = DBIter::NewIter(env_, read_options, cfd->ioptions(),
                              sv->mutable_cf_options, cfd->user_comparator(),
                              iter, sv->current, kMaxSequenceNumber,
-                             /*read_callback=*/nullptr, cfh,
-                             /*expose_blob_index=*/false,
-                             /*active_mem=*/sv->mem);
+                             /*read_callback=*/nullptr, /*active_mem=*/nullptr,
+                             cfh, /*expose_blob_index=*/false);
   } else {
     // Note: no need to consider the special case of
     // last_seq_same_as_publish_seq_==false since NewIterator is overridden in
@@ -4095,12 +4097,12 @@ Status DBImpl::NewIterators(
       auto iter = new ForwardIterator(this, read_options, cf_sv_pair.cfd,
                                       cf_sv_pair.super_version,
                                       /* allow_unprepared_value */ true);
-      iterators->push_back(
-          DBIter::NewIter(env_, read_options, cf_sv_pair.cfd->ioptions(),
-                          cf_sv_pair.super_version->mutable_cf_options,
-                          cf_sv_pair.cfd->user_comparator(), iter,
-                          cf_sv_pair.super_version->current, kMaxSequenceNumber,
-                          nullptr /*read_callback*/, cf_sv_pair.cfh));
+      iterators->push_back(DBIter::NewIter(
+          env_, read_options, cf_sv_pair.cfd->ioptions(),
+          cf_sv_pair.super_version->mutable_cf_options,
+          cf_sv_pair.cfd->user_comparator(), iter,
+          cf_sv_pair.super_version->current, kMaxSequenceNumber,
+          nullptr /*read_callback*/, /*active_mem=*/nullptr, cf_sv_pair.cfh));
     }
   } else {
     for (const auto& cf_sv_pair : cf_sv_pairs) {
diff --git a/db/db_iter.h b/db/db_iter.h
index 3e67c9c4ce4a..494bb43f57b0 100644
--- a/db/db_iter.h
+++ b/db/db_iter.h
@@ -71,9 +71,9 @@ class DBIter final : public Iterator {
                          InternalIterator* internal_iter,
                          const Version* version, const SequenceNumber& sequence,
                          ReadCallback* read_callback,
+                         ReadOnlyMemTable* active_mem,
                          ColumnFamilyHandleImpl* cfh = nullptr,
                          bool expose_blob_index = false,
-                         ReadOnlyMemTable* active_mem = nullptr,
                          Arena* arena = nullptr) {
     void* mem = arena ? arena->AllocateAligned(sizeof(DBIter))
                       : operator new(sizeof(DBIter));
@@ -475,9 +475,9 @@ class DBIter final : public Iterator {
   const size_t timestamp_size_;
   std::string saved_timestamp_;
   std::optional<std::vector<ScanOptions>> scan_opts_;
-  ReadOnlyMemTable* active_mem_;
-  SequenceNumber memtable_seqno_lb_;
-  uint32_t memtable_op_scan_flush_trigger_;
+  ReadOnlyMemTable* const active_mem_;
+  const SequenceNumber memtable_seqno_lb_;
+  const uint32_t memtable_op_scan_flush_trigger_;
   Direction direction_;
   bool valid_;
   bool current_entry_is_merged_;
diff --git a/db/db_iter_stress_test.cc b/db/db_iter_stress_test.cc
index 234350601930..c6d3936b3ccf 100644
--- a/db/db_iter_stress_test.cc
+++ b/db/db_iter_stress_test.cc
@@ -532,7 +532,7 @@ TEST_F(DBIteratorStressTest, StressTest) {
                       env_, ropt, ImmutableOptions(options),
                       MutableCFOptions(options), BytewiseComparator(),
                       internal_iter, /*version=*/nullptr, sequence,
-                      nullptr /*read_callback*/));
+                      nullptr /*read_callback*/, /*active_mem=*/nullptr));
                 }
 
                 // Do a random operation. It's important to do it on ref_it
diff --git a/db/db_iter_test.cc b/db/db_iter_test.cc
index 55ddb08d6835..d18aa0bac4a1 100644
--- a/db/db_iter_test.cc
+++ b/db/db_iter_test.cc
@@ -262,7 +262,7 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) {
     std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 10 /* sequence */,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
 
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
@@ -296,7 +296,7 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) {
     std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 10 /* sequence */,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
 
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
@@ -323,7 +323,7 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) {
     std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 10 /* sequence */,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
 
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
@@ -356,7 +356,7 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) {
     std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 10 /* sequence */,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
 
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
@@ -392,7 +392,7 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) {
     std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 10 /* sequence */,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
 
     db_iter->SeekToLast();
     ASSERT_TRUE(!db_iter->Valid());
@@ -423,7 +423,7 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) {
     std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 7 /* sequence */,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
 
     SetPerfLevel(kEnableCount);
     ASSERT_TRUE(GetPerfLevel() == kEnableCount);
@@ -462,7 +462,7 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) {
     std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 4 /* sequence */,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
 
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
@@ -488,7 +488,7 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) {
     std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 10 /* sequence */,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
 
     db_iter->SeekToLast();
     ASSERT_TRUE(!db_iter->Valid());
@@ -512,7 +512,7 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) {
     std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 10 /* sequence */,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
 
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
@@ -548,7 +548,7 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) {
     std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 7 /* sequence */,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
 
     SetPerfLevel(kEnableCount);
     ASSERT_TRUE(GetPerfLevel() == kEnableCount);
@@ -579,7 +579,7 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) {
     std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 10 /* sequence */,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
 
     db_iter->SeekToFirst();
     ASSERT_TRUE(db_iter->Valid());
@@ -623,7 +623,7 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) {
     std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 2 /* sequence */,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
     ASSERT_EQ(db_iter->key().ToString(), "b");
@@ -655,7 +655,7 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) {
     std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 10 /* sequence */,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
     ASSERT_EQ(db_iter->key().ToString(), "c");
@@ -686,7 +686,7 @@ TEST_F(DBIteratorTest, DBIteratorEmpty) {
     std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 0 /* sequence */,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
     db_iter->SeekToLast();
     ASSERT_TRUE(!db_iter->Valid());
     ASSERT_OK(db_iter->status());
@@ -699,7 +699,7 @@ TEST_F(DBIteratorTest, DBIteratorEmpty) {
     std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 0 /* sequence */,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
     db_iter->SeekToFirst();
     ASSERT_TRUE(!db_iter->Valid());
     ASSERT_OK(db_iter->status());
@@ -723,7 +723,7 @@ TEST_F(DBIteratorTest, DBIteratorUseSkipCountSkips) {
   std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
       env_, ro, ImmutableOptions(options), MutableCFOptions(options),
       BytewiseComparator(), internal_iter, nullptr /* version */,
-      2 /* sequence */, nullptr /* read_callback */));
+      2 /* sequence */, nullptr /* read_callback */, /*active_mem=*/nullptr));
   db_iter->SeekToLast();
   ASSERT_TRUE(db_iter->Valid());
   ASSERT_EQ(db_iter->key().ToString(), "c");
@@ -770,7 +770,7 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) {
           env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
           internal_iter, nullptr /* version */, i + 2 /* sequence */,
 
-          nullptr /* read_callback */));
+          nullptr /* read_callback */, /*active_mem=*/nullptr));
       db_iter->SeekToLast();
       ASSERT_TRUE(db_iter->Valid());
 
@@ -808,7 +808,7 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) {
           env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
           internal_iter, nullptr /* version */, i + 2 /* sequence */,
 
-          nullptr /* read_callback */));
+          nullptr /* read_callback */, /*active_mem=*/nullptr));
       db_iter->SeekToLast();
       ASSERT_TRUE(db_iter->Valid());
 
@@ -838,8 +838,7 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) {
       std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
           env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
           internal_iter, nullptr /* version */, 202 /* sequence */,
-
-          nullptr /* read_callback */));
+          nullptr /* read_callback */, /*active_mem=*/nullptr));
       db_iter->SeekToLast();
       ASSERT_TRUE(db_iter->Valid());
 
@@ -874,7 +873,7 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) {
           env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
           internal_iter, nullptr /* version */, i /* sequence */,
 
-          nullptr /* read_callback */));
+          nullptr /* read_callback */, /*active_mem=*/nullptr));
       db_iter->SeekToLast();
       ASSERT_TRUE(!db_iter->Valid());
       ASSERT_OK(db_iter->status());
@@ -893,7 +892,7 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) {
     std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 200 /* sequence */,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
     ASSERT_EQ(db_iter->key().ToString(), "c");
@@ -931,7 +930,7 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) {
           env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
           internal_iter, nullptr /* version */, i + 2 /* sequence */,
 
-          nullptr /* read_callback */));
+          nullptr /* read_callback */, /*active_mem=*/nullptr));
       db_iter->SeekToLast();
       ASSERT_TRUE(db_iter->Valid());
 
@@ -968,7 +967,7 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) {
           env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
           internal_iter, nullptr /* version */, i + 2 /* sequence */,
 
-          nullptr /* read_callback */));
+          nullptr /* read_callback */, /*active_mem=*/nullptr));
       db_iter->SeekToLast();
       ASSERT_TRUE(db_iter->Valid());
 
@@ -1019,7 +1018,7 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) {
     std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 10 /* sequence */,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
 
     db_iter->SeekToFirst();
     ASSERT_TRUE(db_iter->Valid());
@@ -1066,7 +1065,7 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) {
     std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 10 /* sequence */,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
 
     db_iter->SeekToFirst();
     ASSERT_TRUE(db_iter->Valid());
@@ -1111,7 +1110,7 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) {
     std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 10 /* sequence */,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
 
     db_iter->SeekToFirst();
     ASSERT_TRUE(db_iter->Valid());
@@ -1150,7 +1149,7 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) {
     std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 10 /* sequence */,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
 
     db_iter->SeekToFirst();
     ASSERT_TRUE(db_iter->Valid());
@@ -1186,7 +1185,7 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) {
     std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 10 /* sequence */,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
 
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
@@ -1217,7 +1216,7 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) {
     std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 10 /* sequence */,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
 
     db_iter->SeekToFirst();
     ASSERT_TRUE(db_iter->Valid());
@@ -1255,7 +1254,7 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) {
     std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 10 /* sequence */,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
 
     db_iter->SeekToFirst();
     ASSERT_TRUE(db_iter->Valid());
@@ -1294,7 +1293,7 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) {
           env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
           internal_iter, nullptr /* version */, 2 * i + 1 /* sequence */,
 
-          nullptr /* read_callback */));
+          nullptr /* read_callback */, /*active_mem=*/nullptr));
 
       db_iter->SeekToFirst();
       ASSERT_TRUE(db_iter->Valid());
@@ -1348,7 +1347,7 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) {
       std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
           env_, ro, ioptions, MutableCFOptions(options), BytewiseComparator(),
           internal_iter, nullptr /* version */, 2 * i + 1 /* sequence */,
-          nullptr /* read_callback */));
+          nullptr /* read_callback */, /*active_mem=*/nullptr));
 
       db_iter->SeekToFirst();
       ASSERT_TRUE(db_iter->Valid());
@@ -1391,7 +1390,7 @@ TEST_F(DBIteratorTest, DBIteratorTimedPutBasic) {
   std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
       env_, ro, ImmutableOptions(options), MutableCFOptions(options),
       BytewiseComparator(), internal_iter, nullptr /* version */,
-      7 /* sequence */, nullptr /* read_callback */));
+      7 /* sequence */, nullptr /* read_callback */, /*active_mem=*/nullptr));
   db_iter->SeekToFirst();
   ASSERT_TRUE(db_iter->Valid());
   ASSERT_EQ(db_iter->key().ToString(), "a");
@@ -1441,7 +1440,7 @@ TEST_F(DBIteratorTest, DBIterator1) {
   std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
       env_, ro, ImmutableOptions(options), MutableCFOptions(options),
       BytewiseComparator(), internal_iter, nullptr /* version */,
-      1 /* sequence */, nullptr /* read_callback */));
+      1 /* sequence */, nullptr /* read_callback */, /*active_mem=*/nullptr));
   db_iter->SeekToFirst();
   ASSERT_TRUE(db_iter->Valid());
   ASSERT_EQ(db_iter->key().ToString(), "a");
@@ -1470,7 +1469,7 @@ TEST_F(DBIteratorTest, DBIterator2) {
   std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
       env_, ro, ImmutableOptions(options), MutableCFOptions(options),
       BytewiseComparator(), internal_iter, nullptr /* version */,
-      0 /* sequence */, nullptr /* read_callback */));
+      0 /* sequence */, nullptr /* read_callback */, /*active_mem=*/nullptr));
   db_iter->SeekToFirst();
   ASSERT_TRUE(db_iter->Valid());
   ASSERT_EQ(db_iter->key().ToString(), "a");
@@ -1495,7 +1494,7 @@ TEST_F(DBIteratorTest, DBIterator3) {
   std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
       env_, ro, ImmutableOptions(options), MutableCFOptions(options),
       BytewiseComparator(), internal_iter, nullptr /* version */,
-      2 /* sequence */, nullptr /* read_callback */));
+      2 /* sequence */, nullptr /* read_callback */, /*active_mem=*/nullptr));
   db_iter->SeekToFirst();
   ASSERT_TRUE(db_iter->Valid());
   ASSERT_EQ(db_iter->key().ToString(), "a");
@@ -1520,7 +1519,7 @@ TEST_F(DBIteratorTest, DBIterator4) {
   std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
       env_, ro, ImmutableOptions(options), MutableCFOptions(options),
       BytewiseComparator(), internal_iter, nullptr /* version */,
-      4 /* sequence */, nullptr /* read_callback */));
+      4 /* sequence */, nullptr /* read_callback */, /*active_mem=*/nullptr));
   db_iter->SeekToFirst();
   ASSERT_TRUE(db_iter->Valid());
   ASSERT_EQ(db_iter->key().ToString(), "a");
@@ -1554,7 +1553,7 @@ TEST_F(DBIteratorTest, DBIterator5) {
     std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 0 /* sequence */,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
     ASSERT_EQ(db_iter->key().ToString(), "a");
@@ -1578,7 +1577,7 @@ TEST_F(DBIteratorTest, DBIterator5) {
     std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 1 /* sequence */,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
     ASSERT_EQ(db_iter->key().ToString(), "a");
@@ -1602,7 +1601,7 @@ TEST_F(DBIteratorTest, DBIterator5) {
     std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 2 /* sequence */,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
     ASSERT_EQ(db_iter->key().ToString(), "a");
@@ -1626,7 +1625,7 @@ TEST_F(DBIteratorTest, DBIterator5) {
     std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 3 /* sequence */,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
     ASSERT_EQ(db_iter->key().ToString(), "a");
@@ -1650,7 +1649,7 @@ TEST_F(DBIteratorTest, DBIterator5) {
     std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 4 /* sequence */,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
     ASSERT_EQ(db_iter->key().ToString(), "a");
@@ -1674,7 +1673,7 @@ TEST_F(DBIteratorTest, DBIterator5) {
     std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 5 /* sequence */,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
     ASSERT_EQ(db_iter->key().ToString(), "a");
@@ -1698,7 +1697,7 @@ TEST_F(DBIteratorTest, DBIterator5) {
     std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 6 /* sequence */,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
     ASSERT_EQ(db_iter->key().ToString(), "a");
@@ -1720,7 +1719,7 @@ TEST_F(DBIteratorTest, DBIterator5) {
     std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 10 /* sequence */,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
     db_iter->Seek("b");
     ASSERT_TRUE(db_iter->Valid());
     ASSERT_EQ(db_iter->key().ToString(), "b");
@@ -1751,7 +1750,7 @@ TEST_F(DBIteratorTest, DBIterator6) {
     std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 0 /* sequence */,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
     ASSERT_EQ(db_iter->key().ToString(), "a");
@@ -1775,7 +1774,7 @@ TEST_F(DBIteratorTest, DBIterator6) {
     std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 1 /* sequence */,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
     ASSERT_EQ(db_iter->key().ToString(), "a");
@@ -1799,7 +1798,7 @@ TEST_F(DBIteratorTest, DBIterator6) {
     std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 2 /* sequence */,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
     ASSERT_EQ(db_iter->key().ToString(), "a");
@@ -1823,7 +1822,7 @@ TEST_F(DBIteratorTest, DBIterator6) {
     std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 3 /* sequence */,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
     db_iter->SeekToLast();
     ASSERT_TRUE(!db_iter->Valid());
     ASSERT_OK(db_iter->status());
@@ -1843,7 +1842,7 @@ TEST_F(DBIteratorTest, DBIterator6) {
     std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 4 /* sequence */,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
     ASSERT_EQ(db_iter->key().ToString(), "a");
@@ -1867,7 +1866,7 @@ TEST_F(DBIteratorTest, DBIterator6) {
     std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 5 /* sequence */,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
     ASSERT_EQ(db_iter->key().ToString(), "a");
@@ -1891,7 +1890,7 @@ TEST_F(DBIteratorTest, DBIterator6) {
     std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 6 /* sequence */,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
     ASSERT_EQ(db_iter->key().ToString(), "a");
@@ -1935,7 +1934,7 @@ TEST_F(DBIteratorTest, DBIterator7) {
     std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 0 /* sequence */,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
     ASSERT_EQ(db_iter->key().ToString(), "a");
@@ -1971,7 +1970,7 @@ TEST_F(DBIteratorTest, DBIterator7) {
     std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 2 /* sequence */,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
 
@@ -2013,7 +2012,7 @@ TEST_F(DBIteratorTest, DBIterator7) {
     std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 4 /* sequence */,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
 
@@ -2055,7 +2054,7 @@ TEST_F(DBIteratorTest, DBIterator7) {
     std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 5 /* sequence */,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
 
@@ -2102,7 +2101,7 @@ TEST_F(DBIteratorTest, DBIterator7) {
     std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 6 /* sequence */,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
 
@@ -2150,7 +2149,7 @@ TEST_F(DBIteratorTest, DBIterator7) {
     std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 7 /* sequence */,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
 
@@ -2192,7 +2191,7 @@ TEST_F(DBIteratorTest, DBIterator7) {
     std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 9 /* sequence */,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
 
@@ -2240,7 +2239,7 @@ TEST_F(DBIteratorTest, DBIterator7) {
     std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 13 /* sequence */,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
 
@@ -2289,7 +2288,7 @@ TEST_F(DBIteratorTest, DBIterator7) {
     std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
         env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
         internal_iter, nullptr /* version */, 14 /* sequence */,
-        nullptr /* read_callback */));
+        nullptr /* read_callback */, /*active_mem=*/nullptr));
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
 
@@ -2321,7 +2320,7 @@ TEST_F(DBIteratorTest, DBIterator8) {
   std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
       env_, ro, ImmutableOptions(options), MutableCFOptions(options),
       BytewiseComparator(), internal_iter, nullptr /* version */,
-      10 /* sequence */, nullptr /* read_callback */));
+      10 /* sequence */, nullptr /* read_callback */, /*active_mem=*/nullptr));
   db_iter->SeekToLast();
   ASSERT_TRUE(db_iter->Valid());
   ASSERT_EQ(db_iter->key().ToString(), "b");
@@ -2349,10 +2348,11 @@ TEST_F(DBIteratorTest, DBIterator9) {
     internal_iter->AddMerge("d", "merge_6");
     internal_iter->Finish();
 
-    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
-        env_, ro, ImmutableOptions(options), MutableCFOptions(options),
-        BytewiseComparator(), internal_iter, nullptr /* version */,
-        10 /* sequence */, nullptr /* read_callback */));
+    std::unique_ptr<Iterator> db_iter(
+        DBIter::NewIter(env_, ro, ImmutableOptions(options),
+                        MutableCFOptions(options), BytewiseComparator(),
+                        internal_iter, nullptr /* version */, 10 /* sequence */,
+                        nullptr /* read_callback */, /*active_mem=*/nullptr));
 
     db_iter->SeekToLast();
     ASSERT_TRUE(db_iter->Valid());
@@ -2419,7 +2419,7 @@ TEST_F(DBIteratorTest, DBIterator10) {
   std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
       env_, ro, ImmutableOptions(options), MutableCFOptions(options),
       BytewiseComparator(), internal_iter, nullptr /* version */,
-      10 /* sequence */, nullptr /* read_callback */));
+      10 /* sequence */, nullptr /* read_callback */, /*active_mem=*/nullptr));
 
   db_iter->Seek("c");
   ASSERT_TRUE(db_iter->Valid());
@@ -2459,7 +2459,7 @@ TEST_F(DBIteratorTest, SeekToLastOccurrenceSeq0) {
   std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
       env_, ro, ImmutableOptions(options), MutableCFOptions(options),
       BytewiseComparator(), internal_iter, nullptr /* version */,
-      10 /* sequence */, 0 /* force seek */, nullptr /* read_callback */));
+      10 /* sequence */, nullptr /* read_callback */, /*active_mem=*/nullptr));
   db_iter->SeekToFirst();
   ASSERT_TRUE(db_iter->Valid());
   ASSERT_EQ(db_iter->key().ToString(), "a");
@@ -2489,7 +2489,7 @@ TEST_F(DBIteratorTest, DBIterator11) {
   std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
       env_, ro, ImmutableOptions(options), MutableCFOptions(options),
       BytewiseComparator(), internal_iter, nullptr /* version */,
-      1 /* sequence */, nullptr /* read_callback */));
+      1 /* sequence */, nullptr /* read_callback */, /*active_mem=*/nullptr));
   db_iter->SeekToFirst();
   ASSERT_TRUE(db_iter->Valid());
   ASSERT_EQ(db_iter->key().ToString(), "a");
@@ -2517,7 +2517,7 @@ TEST_F(DBIteratorTest, DBIterator12) {
   std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
       env_, ro, ImmutableOptions(options), MutableCFOptions(options),
       BytewiseComparator(), internal_iter, nullptr /* version */,
-      10 /* sequence */, 0 /* force seek */, nullptr /* read_callback */));
+      10 /* sequence */, nullptr /* read_callback */, /*active_mem=*/nullptr));
   db_iter->SeekToLast();
   ASSERT_TRUE(db_iter->Valid());
   ASSERT_EQ(db_iter->key().ToString(), "c");
@@ -2557,7 +2557,7 @@ TEST_F(DBIteratorTest, DBIterator13) {
   std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
       env_, ro, ImmutableOptions(options), MutableCFOptions(options),
       BytewiseComparator(), internal_iter, nullptr /* version */,
-      2 /* sequence */, nullptr /* read_callback */));
+      2 /* sequence */, nullptr /* read_callback */, /*active_mem=*/nullptr));
   db_iter->Seek("b");
   ASSERT_TRUE(db_iter->Valid());
   ASSERT_EQ(db_iter->key().ToString(), key);
@@ -2587,7 +2587,7 @@ TEST_F(DBIteratorTest, DBIterator14) {
   std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
       env_, ro, ImmutableOptions(options), MutableCFOptions(options),
       BytewiseComparator(), internal_iter, nullptr /* version */,
-      4 /* sequence */, nullptr /* read_callback */));
+      4 /* sequence */, nullptr /* read_callback */, /*active_mem=*/nullptr));
   db_iter->Seek("b");
   ASSERT_TRUE(db_iter->Valid());
   ASSERT_EQ(db_iter->key().ToString(), "b");
@@ -2627,7 +2627,8 @@ class DBIterWithMergeIterTest : public testing::Test {
     db_iter_.reset(DBIter::NewIter(
         env_, ro_, ImmutableOptions(options_), MutableCFOptions(options_),
         BytewiseComparator(), merge_iter, nullptr /* version */,
-        8 /* read data earlier than seqId 8 */, nullptr /* read_callback */));
+        8 /* read data earlier than seqId 8 */, nullptr /* read_callback */,
+        /*active_mem=*/nullptr));
   }
 
   Env* env_;
@@ -3066,7 +3067,7 @@ TEST_F(DBIteratorTest, SeekPrefixTombstones) {
   std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
       env_, ro, ImmutableOptions(options), MutableCFOptions(options),
       BytewiseComparator(), internal_iter, nullptr /* version */,
-      10 /* sequence */, nullptr /* read_callback */));
+      10 /* sequence */, nullptr /* read_callback */, /*active_mem=*/nullptr));
 
   int skipped_keys = 0;
 
@@ -3099,10 +3100,11 @@ TEST_F(DBIteratorTest, SeekToFirstLowerBound) {
     Slice lower_bound(lower_bound_str);
     ro.iterate_lower_bound = &lower_bound;
     Options options;
-    std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
-        env_, ro, ImmutableOptions(options), MutableCFOptions(options),
-        BytewiseComparator(), internal_iter, nullptr /* version */,
-        10 /* sequence */, nullptr /* read_callback */));
+    std::unique_ptr<Iterator> db_iter(
+        DBIter::NewIter(env_, ro, ImmutableOptions(options),
+                        MutableCFOptions(options), BytewiseComparator(),
+                        internal_iter, nullptr /* version */, 10 /* sequence */,
+                        nullptr /* read_callback */, /*active_mem=*/nullptr));
 
     db_iter->SeekToFirst();
     if (i == kNumKeys + 1) {
@@ -3141,7 +3143,7 @@ TEST_F(DBIteratorTest, PrevLowerBound) {
   std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
       env_, ro, ImmutableOptions(options), MutableCFOptions(options),
       BytewiseComparator(), internal_iter, nullptr /* version */,
-      10 /* sequence */, nullptr /* read_callback */));
+      10 /* sequence */, nullptr /* read_callback */, /*active_mem=*/nullptr));
 
   db_iter->SeekToLast();
   for (int i = kNumKeys; i >= kLowerBound; --i) {
@@ -3169,7 +3171,7 @@ TEST_F(DBIteratorTest, SeekLessLowerBound) {
   std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
       env_, ro, ImmutableOptions(options), MutableCFOptions(options),
       BytewiseComparator(), internal_iter, nullptr /* version */,
-      10 /* sequence */, nullptr /* read_callback */));
+      10 /* sequence */, nullptr /* read_callback */, /*active_mem=*/nullptr));
 
   auto before_lower_bound_str = std::to_string(kLowerBound - 1);
   Slice before_lower_bound(lower_bound_str);
@@ -3194,7 +3196,7 @@ TEST_F(DBIteratorTest, ReverseToForwardWithDisappearingKeys) {
   std::unique_ptr<Iterator> db_iter(DBIter::NewIter(
       env_, ReadOptions(), ImmutableOptions(options), MutableCFOptions(options),
       BytewiseComparator(), internal_iter, nullptr /* version */,
-      10 /* sequence */, nullptr /* read_callback */));
+      10 /* sequence */, nullptr /* read_callback */, /*active_mem=*/nullptr));
 
   db_iter->SeekForPrev("a");
   ASSERT_TRUE(db_iter->Valid());
diff --git a/db/forward_iterator.h b/db/forward_iterator.h
index 11dde54777e7..81a7f3132980 100644
--- a/db/forward_iterator.h
+++ b/db/forward_iterator.h
@@ -42,6 +42,7 @@ using MinIterHeap =
     std::priority_queue<InternalIterator*, std::vector<InternalIterator*>,
                         MinIterComparator>;
 
+// TODO: name to TailingIterator
 /**
  * ForwardIterator is a special type of iterator that only supports Seek()
  * and Next(). It is expected to perform better than TailingIterator by
diff --git a/include/rocksdb/advanced_options.h b/include/rocksdb/advanced_options.h
index d110d9cde0e2..2e3cb7ab9fba 100644
--- a/include/rocksdb/advanced_options.h
+++ b/include/rocksdb/advanced_options.h
@@ -1106,6 +1106,8 @@ struct AdvancedColumnFamilyOptions {
   // CompactOnDeletionCollectorFactory) together with this option to compact
   // away tombstones after the memtable is flushed.
   //
+  // Note that this option has no effect on tailing iterators yet.
+  //
   // Default: 0 (disabled)
   // Dynamically changeable through the SetOptions() API.
   uint32_t memtable_op_scan_flush_trigger = 0;

From 947a63400f201e49d16bde3af4c89a996f49c3f0 Mon Sep 17 00:00:00 2001
From: Till Rohrmann <trohrmann@apache.org>
Date: Tue, 6 May 2025 11:42:10 -0700
Subject: [PATCH 079/500] Allow specifying ReadOptions for WBWI iterator
 (#12968)

Summary:
Allow specifying ReadOptions for WBWI iterator when creating it through the C bindings. This allows to specify upper and lower bounds for the created iterator.

This fixes https://github.com/facebook/rocksdb/issues/12963.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/12968

Reviewed By: pdillinger

Differential Revision: D74188049

Pulled By: jaykorean

fbshipit-source-id: 970d9910472dfedaa29a800c6d52bec14c656f3c
---
 db/c.cc                                       | 21 ++++++
 db/c_test.c                                   | 69 +++++++++++++++++++
 include/rocksdb/c.h                           |  9 ++-
 .../utilities/write_batch_with_index.h        |  3 +-
 .../write_batch_with_index.cc                 | 15 +++-
 5 files changed, 112 insertions(+), 5 deletions(-)

diff --git a/db/c.cc b/db/c.cc
index 859f2d0b601e..70681e188206 100644
--- a/db/c.cc
+++ b/db/c.cc
@@ -2632,6 +2632,16 @@ rocksdb_iterator_t* rocksdb_writebatch_wi_create_iterator_with_base(
   return result;
 }
 
+rocksdb_iterator_t* rocksdb_writebatch_wi_create_iterator_with_base_readopts(
+    rocksdb_writebatch_wi_t* wbwi, rocksdb_iterator_t* base_iterator,
+    const rocksdb_readoptions_t* options) {
+  rocksdb_iterator_t* result = new rocksdb_iterator_t;
+  result->rep =
+      wbwi->rep->NewIteratorWithBase(base_iterator->rep, &options->rep);
+  delete base_iterator;
+  return result;
+}
+
 rocksdb_iterator_t* rocksdb_writebatch_wi_create_iterator_with_base_cf(
     rocksdb_writebatch_wi_t* wbwi, rocksdb_iterator_t* base_iterator,
     rocksdb_column_family_handle_t* column_family) {
@@ -2642,6 +2652,17 @@ rocksdb_iterator_t* rocksdb_writebatch_wi_create_iterator_with_base_cf(
   return result;
 }
 
+rocksdb_iterator_t* rocksdb_writebatch_wi_create_iterator_with_base_cf_readopts(
+    rocksdb_writebatch_wi_t* wbwi, rocksdb_iterator_t* base_iterator,
+    rocksdb_column_family_handle_t* column_family,
+    const rocksdb_readoptions_t* options) {
+  rocksdb_iterator_t* result = new rocksdb_iterator_t;
+  result->rep = wbwi->rep->NewIteratorWithBase(
+      column_family->rep, base_iterator->rep, &options->rep);
+  delete base_iterator;
+  return result;
+}
+
 char* rocksdb_writebatch_wi_get_from_batch(rocksdb_writebatch_wi_t* wbwi,
                                            const rocksdb_options_t* options,
                                            const char* key, size_t keylen,
diff --git a/db/c_test.c b/db/c_test.c
index 2324611f2fe6..b6574cd8ecae 100644
--- a/db/c_test.c
+++ b/db/c_test.c
@@ -1362,6 +1362,46 @@ int main(int argc, char** argv) {
     rocksdb_writebatch_wi_destroy(wbi);
   }
 
+  StartPhase("wbwi_iter_readoptions");
+  {
+    rocksdb_readoptions_t* iter_roptions = rocksdb_readoptions_create();
+    rocksdb_readoptions_set_iterate_lower_bound(iter_roptions, "boy", 3);
+    rocksdb_readoptions_set_iterate_upper_bound(iter_roptions, "fool", 4);
+    rocksdb_iterator_t* base_iter = rocksdb_create_iterator(db, iter_roptions);
+    rocksdb_writebatch_wi_t* wbi = rocksdb_writebatch_wi_create(0, 1);
+    rocksdb_writebatch_wi_put(wbi, "bar", 3, "b",
+                              1);  // should get filtered out
+    rocksdb_writebatch_wi_put(wbi, "cat", 3, "miau", 4);
+    rocksdb_writebatch_wi_put(wbi, "gnu", 3, "muh",
+                              3);  // should get filtered out
+    rocksdb_iterator_t* iter =
+        rocksdb_writebatch_wi_create_iterator_with_base_readopts(wbi, base_iter,
+                                                                 iter_roptions);
+    CheckCondition(!rocksdb_iter_valid(iter));
+    rocksdb_iter_seek_to_first(iter);
+    CheckCondition(rocksdb_iter_valid(iter));
+    CheckIter(iter, "cat", "miau");
+    rocksdb_iter_next(iter);
+    CheckIter(iter, "foo", "hello");
+    rocksdb_iter_prev(iter);
+    CheckIter(iter, "cat", "miau");
+    rocksdb_iter_prev(iter);
+    CheckCondition(!rocksdb_iter_valid(iter));
+    rocksdb_iter_seek_to_last(iter);
+    CheckIter(iter, "foo", "hello");
+    rocksdb_iter_seek(iter, "b", 1);
+    CheckIter(iter, "cat", "miau");
+    rocksdb_iter_seek_for_prev(iter, "d", 1);
+    CheckIter(iter, "cat", "miau");
+    rocksdb_iter_seek_for_prev(iter, "fool", 3);
+    CheckIter(iter, "foo", "hello");
+    rocksdb_iter_get_error(iter, &err);
+    CheckNoError(err);
+    rocksdb_iter_destroy(iter);
+    rocksdb_writebatch_wi_destroy(wbi);
+    rocksdb_readoptions_destroy(iter_roptions);
+  }
+
   StartPhase("multiget");
   {
     const char* keys[3] = {"box", "foo", "notfound"};
@@ -1792,6 +1832,35 @@ int main(int argc, char** argv) {
     rocksdb_flush_wal(db, 1, &err);
     CheckNoError(err);
 
+    // Test WriteBatchWithIndex iteration with Column Family
+    rocksdb_writebatch_wi_t* wbwi = rocksdb_writebatch_wi_create(0, true);
+    rocksdb_writebatch_wi_put_cf(wbwi, handles[1], "boat", 4, "row",
+                                 3);  // should be filtered out
+    rocksdb_writebatch_wi_put_cf(wbwi, handles[1], "buffy", 5, "charmed", 7);
+    rocksdb_writebatch_wi_put_cf(wbwi, handles[1], "bus", 3, "yellow",
+                                 6);  // should be filtered out
+    rocksdb_readoptions_t* iter_roptions = rocksdb_readoptions_create();
+    rocksdb_readoptions_set_iterate_lower_bound(iter_roptions, "bu", 2);
+    rocksdb_readoptions_set_iterate_upper_bound(iter_roptions, "buffz", 5);
+    rocksdb_iterator_t* base_iter =
+        rocksdb_create_iterator_cf(db, iter_roptions, handles[1]);
+    rocksdb_iterator_t* wbwi_iter =
+        rocksdb_writebatch_wi_create_iterator_with_base_cf_readopts(
+            wbwi, base_iter, handles[1], iter_roptions);
+
+    CheckCondition(!rocksdb_iter_valid(wbwi_iter));
+    rocksdb_iter_seek_to_first(wbwi_iter);
+    CheckCondition(rocksdb_iter_valid(wbwi_iter));
+    CheckIter(wbwi_iter, "buff", "rocksdb");
+    rocksdb_iter_next(wbwi_iter);
+    CheckIter(wbwi_iter, "buffy", "charmed");
+    rocksdb_iter_next(wbwi_iter);
+    CheckCondition(!rocksdb_iter_valid(wbwi_iter));
+
+    rocksdb_iter_destroy(wbwi_iter);
+    rocksdb_writebatch_wi_destroy(wbwi);
+    rocksdb_readoptions_destroy(iter_roptions);
+
     const char* keys[3] = {"box", "box", "barfooxx"};
     const rocksdb_column_family_handle_t* get_handles[3] = {
         handles[0], handles[1], handles[1]};
diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h
index 09a5f8ba1cb8..83c15f1710b0 100644
--- a/include/rocksdb/c.h
+++ b/include/rocksdb/c.h
@@ -999,13 +999,20 @@ extern ROCKSDB_LIBRARY_API rocksdb_iterator_t*
 rocksdb_writebatch_wi_create_iterator_with_base(
     rocksdb_writebatch_wi_t* wbwi, rocksdb_iterator_t* base_iterator);
 extern ROCKSDB_LIBRARY_API rocksdb_iterator_t*
+rocksdb_writebatch_wi_create_iterator_with_base_readopts(
+    rocksdb_writebatch_wi_t* wbwi, rocksdb_iterator_t* base_iterator,
+    const rocksdb_readoptions_t* options);
+extern ROCKSDB_LIBRARY_API rocksdb_iterator_t*
 rocksdb_writebatch_wi_create_iterator_with_base_cf(
     rocksdb_writebatch_wi_t* wbwi, rocksdb_iterator_t* base_iterator,
     rocksdb_column_family_handle_t* cf);
+extern ROCKSDB_LIBRARY_API rocksdb_iterator_t*
+rocksdb_writebatch_wi_create_iterator_with_base_cf_readopts(
+    rocksdb_writebatch_wi_t* wbwi, rocksdb_iterator_t* base_iterator,
+    rocksdb_column_family_handle_t* cf, const rocksdb_readoptions_t* options);
 extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_wi_update_timestamps(
     rocksdb_writebatch_wi_t* wbwi, const char* ts, size_t tslen, void* state,
     size_t (*get_ts_size)(void*, uint32_t), char** errptr);
-
 /* Options utils */
 
 // Load the latest rocksdb options from the specified db_path.
diff --git a/include/rocksdb/utilities/write_batch_with_index.h b/include/rocksdb/utilities/write_batch_with_index.h
index 9cfc364cfe07..9d3914c1b44f 100644
--- a/include/rocksdb/utilities/write_batch_with_index.h
+++ b/include/rocksdb/utilities/write_batch_with_index.h
@@ -236,7 +236,8 @@ class WriteBatchWithIndex : public WriteBatchBase {
                                 Iterator* base_iterator,
                                 const ReadOptions* opts = nullptr);
   // default column family
-  Iterator* NewIteratorWithBase(Iterator* base_iterator);
+  Iterator* NewIteratorWithBase(Iterator* base_iterator,
+                                const ReadOptions* opts = nullptr);
 
   // Similar to DB::Get() but will only read the key from this batch.
   // If the batch does not have enough data to resolve Merge operations,
diff --git a/utilities/write_batch_with_index/write_batch_with_index.cc b/utilities/write_batch_with_index/write_batch_with_index.cc
index ca324d9da59c..0bd6c42fd0d8 100644
--- a/utilities/write_batch_with_index/write_batch_with_index.cc
+++ b/utilities/write_batch_with_index/write_batch_with_index.cc
@@ -361,10 +361,19 @@ Iterator* WriteBatchWithIndex::NewIteratorWithBase(
                                read_options);
 }
 
-Iterator* WriteBatchWithIndex::NewIteratorWithBase(Iterator* base_iterator) {
+Iterator* WriteBatchWithIndex::NewIteratorWithBase(
+    Iterator* base_iterator, const ReadOptions* read_options) {
+  WBWIIteratorImpl* wbwiii;
   // default column family's comparator
-  auto wbwiii = new WBWIIteratorImpl(0, &(rep->skip_list), &rep->write_batch,
-                                     &rep->comparator);
+  if (read_options != nullptr) {
+    wbwiii = new WBWIIteratorImpl(
+        0, &(rep->skip_list), &rep->write_batch, &rep->comparator,
+        read_options->iterate_lower_bound, read_options->iterate_upper_bound);
+  } else {
+    wbwiii = new WBWIIteratorImpl(0, &(rep->skip_list), &rep->write_batch,
+                                  &rep->comparator);
+  }
+
   return new BaseDeltaIterator(nullptr, base_iterator, wbwiii,
                                rep->comparator.default_comparator(),
                                /* read_options */ nullptr);

From 13d865f6f1f83b0822e9d32d1a9bb0f6b271c530 Mon Sep 17 00:00:00 2001
From: Michael C Huang <mikechuang@meta.com>
Date: Thu, 8 May 2025 15:51:37 -0700
Subject: [PATCH 080/500] Add trivial copy support when FIFO compaction reason
 is kChangeTemperature (#13562)

Summary:
Prior to this PR, for FIFO kChangeTemperature compaction was done by iterating and reading thru the input sst and generate the output sst. This was wasteful since for FIFO we could apply the "trivial" move by copying the input sst to the out sst without need decompress/compress and reading thru the input sst content at all. This PR added "allow_trivial_copy_when_change_temperature" to the CompactionOptionsFIFO.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13562

Reviewed By: cbi42

Differential Revision: D73295404

Pulled By: mikechuangmeta

fbshipit-source-id: 02241c7389797730ecd4a3b636837cb5f912b424
---
 db/compaction/compaction.h                    |   7 +
 db/compaction/compaction_picker_test.cc       |  27 +-
 db/db_compaction_test.cc                      | 204 ++++++++------
 db/db_impl/db_impl.h                          |   2 +-
 db/db_impl/db_impl_compaction_flush.cc        | 250 +++++++++++++++++-
 db_stress_tool/db_stress_common.h             |   1 +
 db_stress_tool/db_stress_gflags.cc            |   3 +
 db_stress_tool/db_stress_test_base.cc         |  11 +-
 file/file_util.cc                             |  39 ++-
 file/file_util.h                              |  18 +-
 include/rocksdb/advanced_options.h            |  15 ++
 include/rocksdb/statistics.h                  |   1 +
 java/rocksjni/portal.h                        |   4 +
 monitoring/statistics.cc                      |   2 +
 options/cf_options.cc                         |  11 +-
 tools/db_crashtest.py                         |   1 +
 ...-tiering-change-temperatur-trivial-copy.md |   1 +
 17 files changed, 484 insertions(+), 113 deletions(-)
 create mode 100644 unreleased_history/new_features/fifo-tiering-change-temperatur-trivial-copy.md

diff --git a/db/compaction/compaction.h b/db/compaction/compaction.h
index fe7fc5026ed8..082cf132b150 100644
--- a/db/compaction/compaction.h
+++ b/db/compaction/compaction.h
@@ -283,6 +283,13 @@ class Compaction {
   // are non-overlapping and can be trivially moved.
   bool is_trivial_move() const { return is_trivial_move_; }
 
+  bool is_trivial_copy_compaction() const {
+    return immutable_options_.compaction_style == kCompactionStyleFIFO &&
+           compaction_reason_ == CompactionReason::kChangeTemperature &&
+           mutable_cf_options_.compaction_options_fifo
+               .allow_trivial_copy_when_change_temperature;
+  }
+
   // How many total levels are there?
   int number_levels() const { return number_levels_; }
 
diff --git a/db/compaction/compaction_picker_test.cc b/db/compaction/compaction_picker_test.cc
index 35193db57eed..73aeae6d1ae3 100644
--- a/db/compaction/compaction_picker_test.cc
+++ b/db/compaction/compaction_picker_test.cc
@@ -1134,10 +1134,15 @@ TEST_F(CompactionPickerTest, FIFOToCold1) {
     fifo_options_.max_table_files_size = kMaxSize;
     fifo_options_.file_temperature_age_thresholds = {
         {Temperature::kCold, kColdThreshold}};
+    fifo_options_.allow_trivial_copy_when_change_temperature = true;
+    fifo_options_.trivial_copy_buffer_size = 16 * 1024 * 1024;
     mutable_cf_options_.compaction_options_fifo = fifo_options_;
     mutable_cf_options_.level0_file_num_compaction_trigger = 100;
     mutable_cf_options_.max_compaction_bytes = kFileSize * 100;
-    FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);
+
+    auto copiedIOptions = ioptions_;
+    copiedIOptions.compaction_style = kCompactionStyleFIFO;
+    FIFOCompactionPicker fifo_compaction_picker(copiedIOptions, &icmp_);
 
     int64_t current_time = 0;
     ASSERT_OK(Env::Default()->GetCurrentTime(&current_time));
@@ -1186,7 +1191,10 @@ TEST_F(CompactionPickerTest, FIFOToColdMaxCompactionSize) {
     mutable_cf_options_.compaction_options_fifo = fifo_options_;
     mutable_cf_options_.level0_file_num_compaction_trigger = 100;
     mutable_cf_options_.max_compaction_bytes = kFileSize * 9;
-    FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);
+
+    auto copiedIOptions = ioptions_;
+    copiedIOptions.compaction_style = kCompactionStyleFIFO;
+    FIFOCompactionPicker fifo_compaction_picker(copiedIOptions, &icmp_);
 
     int64_t current_time = 0;
     ASSERT_OK(Env::Default()->GetCurrentTime(&current_time));
@@ -1253,7 +1261,10 @@ TEST_F(CompactionPickerTest, FIFOToColdWithExistingCold) {
     mutable_cf_options_.compaction_options_fifo = fifo_options_;
     mutable_cf_options_.level0_file_num_compaction_trigger = 100;
     mutable_cf_options_.max_compaction_bytes = kFileSize * 100;
-    FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);
+
+    auto copiedIOptions = ioptions_;
+    copiedIOptions.compaction_style = kCompactionStyleFIFO;
+    FIFOCompactionPicker fifo_compaction_picker(copiedIOptions, &icmp_);
 
     int64_t current_time = 0;
     ASSERT_OK(Env::Default()->GetCurrentTime(&current_time));
@@ -1318,7 +1329,10 @@ TEST_F(CompactionPickerTest, FIFOToColdWithHotBetweenCold) {
     mutable_cf_options_.compaction_options_fifo = fifo_options_;
     mutable_cf_options_.level0_file_num_compaction_trigger = 100;
     mutable_cf_options_.max_compaction_bytes = kFileSize * 100;
-    FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);
+
+    auto copiedIOptions = ioptions_;
+    copiedIOptions.compaction_style = kCompactionStyleFIFO;
+    FIFOCompactionPicker fifo_compaction_picker(copiedIOptions, &icmp_);
 
     int64_t current_time = 0;
     ASSERT_OK(Env::Default()->GetCurrentTime(&current_time));
@@ -1385,7 +1399,10 @@ TEST_F(CompactionPickerTest, FIFOToHotAndWarm) {
     mutable_cf_options_.compaction_options_fifo = fifo_options_;
     mutable_cf_options_.level0_file_num_compaction_trigger = 100;
     mutable_cf_options_.max_compaction_bytes = kFileSize * 100;
-    FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);
+
+    auto copiedIOptions = ioptions_;
+    copiedIOptions.compaction_style = kCompactionStyleFIFO;
+    FIFOCompactionPicker fifo_compaction_picker(copiedIOptions, &icmp_);
 
     int64_t current_time = 0;
     ASSERT_OK(Env::Default()->GetCurrentTime(&current_time));
diff --git a/db/db_compaction_test.cc b/db/db_compaction_test.cc
index b539251a2998..98536960c8a9 100644
--- a/db/db_compaction_test.cc
+++ b/db/db_compaction_test.cc
@@ -9434,104 +9434,134 @@ TEST_F(DBCompactionTest, CompactionWithChecksumHandoffManifest2) {
 }
 
 TEST_F(DBCompactionTest, FIFOChangeTemperature) {
-  for (bool write_time_default : {false, true}) {
-    SCOPED_TRACE("write time default? " + std::to_string(write_time_default));
+  for (bool should_allow_trivial_copy : {false, true}) {
+    for (bool write_time_default : {false, true}) {
+      int32_t before_compaction_calls = 0;
+      int32_t after_compaction_calls = 0;
+      if (should_allow_trivial_copy) {
+        ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+            "DBImpl::BackgroundCompaction:TriviaCopyBeforeCompaction",
+            [&](void*) { ++before_compaction_calls; });
+        ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+            "DBImpl::BackgroundCompaction:TriviaCopyAfterCompaction",
+            [&](void*) { ++after_compaction_calls; });
+      } else {
+        ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+            "DBImpl::BackgroundCompaction:BeforeCompaction",
+            [&](void*) { ++before_compaction_calls; });
 
-    Options options = CurrentOptions();
-    options.compaction_style = kCompactionStyleFIFO;
-    options.num_levels = 1;
-    options.max_open_files = -1;
-    options.level0_file_num_compaction_trigger = 2;
-    options.create_if_missing = true;
-    CompactionOptionsFIFO fifo_options;
-    fifo_options.file_temperature_age_thresholds = {{Temperature::kCold, 1000}};
-    fifo_options.max_table_files_size = 100000000;
-    options.compaction_options_fifo = fifo_options;
-    env_->SetMockSleep();
-    if (write_time_default) {
-      options.default_write_temperature = Temperature::kWarm;
-    }
-    // Should be ignored (TODO: fail?)
-    options.last_level_temperature = Temperature::kHot;
-    Reopen(options);
+        ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+            "DBImpl::BackgroundCompaction:AfterCompaction",
+            [&](void*) { ++after_compaction_calls; });
+      }
 
-    int total_cold = 0;
-    int total_warm = 0;
-    int total_hot = 0;
-    int total_unknown = 0;
-    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
-        "NewWritableFile::FileOptions.temperature", [&](void* arg) {
-          Temperature temperature = *(static_cast<Temperature*>(arg));
-          if (temperature == Temperature::kCold) {
-            total_cold++;
-          } else if (temperature == Temperature::kWarm) {
-            total_warm++;
-          } else if (temperature == Temperature::kHot) {
-            total_hot++;
-          } else {
-            assert(temperature == Temperature::kUnknown);
-            total_unknown++;
-          }
-        });
-    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+      SCOPED_TRACE("write time default? " + std::to_string(write_time_default));
 
-    // The file system does not support checksum handoff. The check
-    // will be ignored.
-    ASSERT_OK(Put(Key(0), "value1"));
-    env_->MockSleepForSeconds(800);
-    ASSERT_OK(Put(Key(2), "value2"));
-    ASSERT_OK(Flush());
+      Options options = CurrentOptions();
+      options.compaction_style = kCompactionStyleFIFO;
+      options.num_levels = 1;
+      options.max_open_files = -1;
+      options.level0_file_num_compaction_trigger = 2;
+      options.create_if_missing = true;
+      CompactionOptionsFIFO fifo_options;
+      fifo_options.file_temperature_age_thresholds = {
+          {Temperature::kCold, 1000}};
+      fifo_options.max_table_files_size = 100000000;
+      fifo_options.allow_trivial_copy_when_change_temperature =
+          should_allow_trivial_copy;
+      fifo_options.trivial_copy_buffer_size = 4096;
+      options.compaction_options_fifo = fifo_options;
+      env_->SetMockSleep();
+      if (write_time_default) {
+        options.default_write_temperature = Temperature::kWarm;
+      }
+      // Should be ignored (TODO: fail?)
+      options.last_level_temperature = Temperature::kHot;
+      Reopen(options);
 
-    ASSERT_OK(Put(Key(0), "value1"));
-    ASSERT_OK(Put(Key(2), "value2"));
-    ASSERT_OK(Flush());
+      int total_cold = 0;
+      int total_warm = 0;
+      int total_hot = 0;
+      int total_unknown = 0;
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+          "NewWritableFile::FileOptions.temperature", [&](void* arg) {
+            Temperature temperature = *(static_cast<Temperature*>(arg));
+            if (temperature == Temperature::kCold) {
+              total_cold++;
+            } else if (temperature == Temperature::kWarm) {
+              total_warm++;
+            } else if (temperature == Temperature::kHot) {
+              total_hot++;
+            } else {
+              assert(temperature == Temperature::kUnknown);
+              total_unknown++;
+            }
+          });
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
 
-    // First two L0 files both become eligible for temperature change compaction
-    // They should be compacted one-by-one.
-    ASSERT_OK(Put(Key(0), "value1"));
-    env_->MockSleepForSeconds(1200);
-    ASSERT_OK(Put(Key(2), "value2"));
-    ASSERT_OK(Flush());
-    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+      // The file system does not support checksum handoff. The check
+      // will be ignored.
+      ASSERT_OK(Put(Key(0), "value1"));
+      env_->MockSleepForSeconds(800);
+      ASSERT_OK(Put(Key(2), "value2"));
+      ASSERT_OK(Flush());
 
-    if (write_time_default) {
-      // Also test dynamic option change
-      ASSERT_OK(db_->SetOptions({{"default_write_temperature", "kHot"}}));
-    }
+      ASSERT_OK(Put(Key(0), "value1"));
+      ASSERT_OK(Put(Key(2), "value2"));
+      ASSERT_OK(Flush());
 
-    ASSERT_OK(Put(Key(0), "value1"));
-    env_->MockSleepForSeconds(800);
-    ASSERT_OK(Put(Key(2), "value2"));
-    ASSERT_OK(Flush());
+      // First two L0 files both become eligible for temperature change
+      // compaction They should be compacted one-by-one.
+      ASSERT_OK(Put(Key(0), "value1"));
+      env_->MockSleepForSeconds(1200);
+      ASSERT_OK(Put(Key(2), "value2"));
+      ASSERT_OK(Flush());
+      ASSERT_OK(dbfull()->TEST_WaitForCompact());
 
-    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+      if (write_time_default) {
+        // Also test dynamic option change
+        ASSERT_OK(db_->SetOptions({{"default_write_temperature", "kHot"}}));
+      }
 
-    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+      ASSERT_OK(Put(Key(0), "value1"));
+      env_->MockSleepForSeconds(800);
+      ASSERT_OK(Put(Key(2), "value2"));
+      ASSERT_OK(Flush());
 
-    ColumnFamilyMetaData metadata;
-    db_->GetColumnFamilyMetaData(&metadata);
-    ASSERT_EQ(4, metadata.file_count);
-    if (write_time_default) {
-      ASSERT_EQ(Temperature::kHot, metadata.levels[0].files[0].temperature);
-      ASSERT_EQ(Temperature::kWarm, metadata.levels[0].files[1].temperature);
-      // Includes obsolete/deleted files moved to cold
-      ASSERT_EQ(total_warm, 3);
-      ASSERT_EQ(total_hot, 1);
-      // Includes non-SST DB files
-      ASSERT_GT(total_unknown, 0);
-    } else {
-      ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[0].temperature);
-      ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[1].temperature);
-      ASSERT_EQ(total_warm, 0);
-      ASSERT_EQ(total_hot, 0);
-      // Includes non-SST DB files
-      ASSERT_GT(total_unknown, 4);
-    }
-    ASSERT_EQ(Temperature::kCold, metadata.levels[0].files[2].temperature);
-    ASSERT_EQ(Temperature::kCold, metadata.levels[0].files[3].temperature);
-    ASSERT_EQ(2, total_cold);
+      ASSERT_OK(dbfull()->TEST_WaitForCompact());
 
-    Destroy(options);
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+      ColumnFamilyMetaData metadata;
+      db_->GetColumnFamilyMetaData(&metadata);
+      ASSERT_EQ(4, metadata.file_count);
+      if (write_time_default) {
+        ASSERT_EQ(Temperature::kHot, metadata.levels[0].files[0].temperature);
+        ASSERT_EQ(Temperature::kWarm, metadata.levels[0].files[1].temperature);
+        // Includes obsolete/deleted files moved to cold
+        ASSERT_EQ(total_warm, 3);
+        ASSERT_EQ(total_hot, 1);
+        // Includes non-SST DB files
+        ASSERT_GT(total_unknown, 0);
+      } else {
+        ASSERT_EQ(Temperature::kUnknown,
+                  metadata.levels[0].files[0].temperature);
+        ASSERT_EQ(Temperature::kUnknown,
+                  metadata.levels[0].files[1].temperature);
+        ASSERT_EQ(total_warm, 0);
+        ASSERT_EQ(total_hot, 0);
+        // Includes non-SST DB files
+        ASSERT_GT(total_unknown, 4);
+      }
+      ASSERT_EQ(Temperature::kCold, metadata.levels[0].files[2].temperature);
+      ASSERT_EQ(Temperature::kCold, metadata.levels[0].files[3].temperature);
+      ASSERT_EQ(2, total_cold);
+
+      ASSERT_EQ(2, before_compaction_calls);
+      ASSERT_EQ(2, after_compaction_calls);
+
+      Destroy(options);
+    }
   }
 }
 
diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h
index dd4d3e90a4cd..1062b212ef29 100644
--- a/db/db_impl/db_impl.h
+++ b/db/db_impl/db_impl.h
@@ -2577,7 +2577,7 @@ class DBImpl : public DB {
   bool ShouldntRunManualCompaction(ManualCompactionState* m);
   bool HaveManualCompaction(ColumnFamilyData* cfd);
   bool MCOverlap(ManualCompactionState* m, ManualCompactionState* m1);
-  void UpdateDeletionCompactionStats(const std::unique_ptr<Compaction>& c);
+  void UpdateFIFOCompactionStatus(const std::unique_ptr<Compaction>& c);
 
   // May open and read table files for table property.
   // Should not be called while holding mutex_.
diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc
index 9ae28aa8dba0..75d41af343f3 100644
--- a/db/db_impl/db_impl_compaction_flush.cc
+++ b/db/db_impl/db_impl_compaction_flush.cc
@@ -13,12 +13,14 @@
 #include "db/db_impl/db_impl.h"
 #include "db/error_handler.h"
 #include "db/event_helpers.h"
+#include "file/file_util.h"
 #include "file/sst_file_manager_impl.h"
 #include "logging/logging.h"
 #include "monitoring/iostats_context_imp.h"
 #include "monitoring/perf_context_imp.h"
 #include "monitoring/thread_status_updater.h"
 #include "monitoring/thread_status_util.h"
+#include "options/options_helper.h"
 #include "rocksdb/file_system.h"
 #include "rocksdb/io_status.h"
 #include "rocksdb/options.h"
@@ -3786,11 +3788,251 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
                      c->column_family_data()->GetName().c_str(),
                      c->num_input_files(0));
     if (status.ok() && io_s.ok()) {
-      UpdateDeletionCompactionStats(c);
+      UpdateFIFOCompactionStatus(c);
     }
     *made_progress = true;
     TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:AfterCompaction",
                              c->column_family_data());
+  } else if (c->is_trivial_copy_compaction()) {
+    TEST_SYNC_POINT_CALLBACK(
+        "DBImpl::BackgroundCompaction:TriviaCopyBeforeCompaction",
+        c->column_family_data());
+    assert(c->num_input_files(1) == 0);
+    assert(c->column_family_data()->ioptions().compaction_style ==
+           kCompactionStyleFIFO);
+    assert(c->compaction_reason() == CompactionReason::kChangeTemperature);
+
+    compaction_job_stats.num_input_files = c->num_input_files(0);
+
+    NotifyOnCompactionBegin(c->column_family_data(), c.get(), status,
+                            compaction_job_stats, job_context->job_id);
+
+    std::vector<FileMetaData> out_files;
+    for (const auto& in_file : *c->inputs(0)) {
+      const uint64_t out_file_number = versions_->NewFileNumber();
+      const std::string in_fname =
+          TableFileName(c->immutable_options().cf_paths,
+                        in_file->fd.GetNumber(), in_file->fd.GetPathId());
+      const std::string out_fname =
+          TableFileName(c->immutable_options().cf_paths, out_file_number,
+                        c->output_path_id());
+
+      // TODO (mikechuang): Currently skip calling
+      // EventHelpers::NotifyTableFileCreationStarted for the trivial copy.
+      // Since it's a trivial copy we should ideally use the exact
+      // TableProperties from the input file but that will break some existing
+      // stress tests. For now skip the listener call for the FIFO
+      // kChangeTemperature trivial copy move.
+
+      int64_t tmp_current_time = 0;
+      auto get_time_status =
+          immutable_db_options_.clock->GetCurrentTime(&tmp_current_time);
+      if (!get_time_status.ok()) {
+        ROCKS_LOG_BUFFER(log_buffer,
+                         "[%s] WARNING: Failed to get current time %s "
+                         "status=%s",
+                         c->column_family_data()->GetName().c_str(),
+                         get_time_status.ToString().c_str());
+      }
+      uint64_t out_file_creation_time = static_cast<uint64_t>(tmp_current_time);
+
+      FileOptions copied_file_options = file_options_;
+      copied_file_options.temperature = c->output_temperature();
+      std::unique_ptr<WritableFileWriter> dest_writer;
+      {
+        std::unique_ptr<FSWritableFile> dest_file;
+        IOStatus writable_file_io_status =
+            immutable_db_options_.fs.get()->NewWritableFile(
+                out_fname, copied_file_options, &dest_file, nullptr /* dbg */);
+        TEST_SYNC_POINT_CALLBACK(
+            "NewWritableFile::FileOptions.temperature",
+            const_cast<Temperature*>(&copied_file_options.temperature));
+        if (!writable_file_io_status.ok()) {
+          io_s = writable_file_io_status;
+          ROCKS_LOG_BUFFER(
+              log_buffer,
+              "[%s] Error: Abort trivial copy compaction, failed to open "
+              "NewWritableFile %s\n"
+              " out_fname=%s, temperature=%s, io_status=%s",
+              c->column_family_data()->GetName().c_str(), out_fname.c_str(),
+              temperature_to_string[c->output_temperature()].c_str(),
+              io_s.ToString().c_str());
+          break;
+        }
+
+        FileTypeSet tmp_set = immutable_db_options_.checksum_handoff_file_types;
+        dest_writer.reset(new WritableFileWriter(
+            std::move(dest_file), out_fname, copied_file_options,
+            immutable_db_options_.clock, io_tracer_,
+            immutable_db_options_.stats, Histograms::SST_WRITE_MICROS,
+            c->immutable_options().listeners,
+            immutable_db_options_.file_checksum_gen_factory.get(),
+            tmp_set.Contains(FileType::kTableFile), false));
+      }
+
+      ROCKS_LOG_BUFFER(
+          log_buffer,
+          "[%s] Started copying from: %s\n"
+          " temperature=%s, to: %s, temperature=%s, buffer_size=%" PRIu64,
+          c->column_family_data()->GetName().c_str(), in_fname.c_str(),
+          temperature_to_string[in_file->temperature].c_str(),
+          out_fname.c_str(),
+          temperature_to_string[c->output_temperature()].c_str(),
+          c->mutable_cf_options()
+              .compaction_options_fifo.trivial_copy_buffer_size);
+      // Add IO_LOW HINT for compaction
+      IOOptions copy_files_compaction_io_options;
+      copy_files_compaction_io_options.rate_limiter_priority =
+          Env::IOPriority::IO_LOW;
+      copy_files_compaction_io_options.type = IOType::kData;
+      copy_files_compaction_io_options.io_activity =
+          Env::IOActivity::kCompaction;
+
+      IOStatus copy_file_io_status = CopyFile(
+          immutable_db_options_.fs.get() /* fileSystem */,
+          in_fname /* source */, in_file->temperature /* src_temp_hint */,
+          dest_writer /* dest_writer */, 0 /* size */, true /* use_fsync */,
+          io_tracer_ /* io_tracer*/,
+          c->mutable_cf_options()
+              .compaction_options_fifo
+              .trivial_copy_buffer_size /* max_read_buffer_size
+                                         */
+          ,
+          copy_files_compaction_io_options /* readIOOptions */,
+          copy_files_compaction_io_options /* writeIOOptions */);
+      if (dest_writer) {
+        IOOptions close_files_compaction_io_options;
+        close_files_compaction_io_options.rate_limiter_priority =
+            Env::IOPriority::IO_LOW;
+        close_files_compaction_io_options.type = IOType::kData;
+        close_files_compaction_io_options.io_activity =
+            Env::IOActivity::kCompaction;
+        // Close the dest_write
+        io_s = dest_writer->Close(close_files_compaction_io_options);
+        if (!io_s.ok()) {
+          ROCKS_LOG_BUFFER(
+              log_buffer,
+              "[%s] Failed to close the writer. Failed to copy from: %s\n"
+              " temperature=%s, to=%s, temperature=%s, io_status=%s",
+              c->column_family_data()->GetName().c_str(), in_fname.c_str(),
+              temperature_to_string[in_file->temperature].c_str(),
+              out_fname.c_str(),
+              temperature_to_string[c->output_temperature()].c_str(),
+              io_s.ToString().c_str());
+          break;
+        }
+      }
+
+      io_s = copy_file_io_status;
+
+      if (!io_s.ok()) {
+        ROCKS_LOG_BUFFER(log_buffer,
+                         "[%s] Failed to copy from: %s\n"
+                         " temperature=%s, to=%s, temperature=%s, io_status=%s",
+                         c->column_family_data()->GetName().c_str(),
+                         in_fname.c_str(),
+                         temperature_to_string[in_file->temperature].c_str(),
+                         out_fname.c_str(),
+                         temperature_to_string[c->output_temperature()].c_str(),
+                         io_s.ToString().c_str());
+        break;
+      }
+      ROCKS_LOG_BUFFER(log_buffer,
+                       "[%s] Successfully copying from: %s\n"
+                       " temperature=%s, to=%s, temperature=%s, io_status=%s",
+                       c->column_family_data()->GetName().c_str(),
+                       in_fname.c_str(),
+                       temperature_to_string[in_file->temperature].c_str(),
+                       out_fname.c_str(),
+                       temperature_to_string[c->output_temperature()].c_str(),
+                       io_s.ToString().c_str());
+
+      FileMetaData out_file_metadata{
+          out_file_number,
+          c->output_path_id(),
+          in_file->fd.GetFileSize(),
+          in_file->smallest,
+          in_file->largest,
+          in_file->fd.smallest_seqno,
+          in_file->fd.largest_seqno,
+          false /* marked_for_compact */,
+          c->output_temperature() /* temperature */,
+          in_file->oldest_blob_file_number,
+          in_file->oldest_ancester_time,
+          out_file_creation_time,
+          c->MinInputFileEpochNumber(),
+          dest_writer->GetFileChecksum(),
+          dest_writer->GetFileChecksumFuncName(),
+          in_file->unique_id,
+          in_file->compensated_range_deletion_size,
+          in_file->tail_size,
+          in_file->user_defined_timestamps_persisted};
+
+      out_files.push_back(std::move(out_file_metadata));
+    }
+
+    // Update version set
+    if (status.ok() && io_s.ok()) {
+      // NOTE: ChangeTemperature should only copy one file at one file
+      // hence *c->inputs(0) == out_files.size() == 1 if copy succeeded
+      assert(c->inputs(0)->size() == 1);
+      assert(out_files.size() == 1);
+
+      auto out_file_metadata_it = out_files.begin();
+      for (const auto& in_file : *c->inputs(0)) {
+        if (out_file_metadata_it == out_files.end()) {
+          break;
+        }
+
+        c->edit()->DeleteFile(c->level(), in_file->fd.GetNumber());
+        c->edit()->AddFile(c->level(), *out_file_metadata_it);
+        ++out_file_metadata_it;
+      }
+
+      status = versions_->LogAndApply(
+          c->column_family_data(), read_options, write_options, c->edit(),
+          &mutex_, directories_.GetDbDir(),
+          /*new_descriptor_log=*/false, /*column_family_options=*/nullptr,
+          [&c, &compaction_released](const Status& s) {
+            c->ReleaseCompactionFiles(s);
+            compaction_released = true;
+          });
+    }
+
+    // TODO (mikechuang): Currently skip calling
+    // EventHelper::LogAndNotifyTableFileCreationFinished for the trivial copy.
+    // Since it's a trivial copy we should ideally use the exact TableProperties
+    // from the input file but that will break some existing stress tests. For
+    // now skip the listener call for the FIFO kChangeTemperature trivial copy
+    // move.
+
+    if (io_s.ok()) {
+      io_s = versions_->io_status();
+    }
+
+    InstallSuperVersionAndScheduleWork(
+        c->column_family_data(), job_context->superversion_contexts.data());
+    if (status.ok() && io_s.ok()) {
+      UpdateFIFOCompactionStatus(c);
+    } else {
+      for (const auto& in_file : *c->inputs(0)) {
+        const std::string in_fname =
+            TableFileName(c->immutable_options().cf_paths,
+                          in_file->fd.GetNumber(), in_file->fd.GetPathId());
+        ROCKS_LOG_BUFFER(
+            log_buffer,
+            "[%s] Failed to do trvial copy compaction: %s"
+            " temperature=%s, to temperature=%s, status=%s, io_status=%s",
+            c->column_family_data()->GetName().c_str(), in_fname.c_str(),
+            temperature_to_string[in_file->temperature].c_str(),
+            temperature_to_string[c->output_temperature()].c_str(),
+            status.ToString().c_str(), io_s.ToString().c_str());
+      }
+    }
+    *made_progress = true;
+    TEST_SYNC_POINT_CALLBACK(
+        "DBImpl::BackgroundCompaction:TriviaCopyAfterCompaction",
+        c->column_family_data());
   } else if (!trivial_move_disallowed && c->IsTrivialMove()) {
     TEST_SYNC_POINT("DBImpl::BackgroundCompaction:TrivialMove");
     TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:BeforeCompaction",
@@ -4176,8 +4418,7 @@ bool DBImpl::MCOverlap(ManualCompactionState* m, ManualCompactionState* m1) {
   return false;
 }
 
-void DBImpl::UpdateDeletionCompactionStats(
-    const std::unique_ptr<Compaction>& c) {
+void DBImpl::UpdateFIFOCompactionStatus(const std::unique_ptr<Compaction>& c) {
   if (c == nullptr) {
     return;
   }
@@ -4191,6 +4432,9 @@ void DBImpl::UpdateDeletionCompactionStats(
     case CompactionReason::kFIFOTtl:
       RecordTick(stats_, FIFO_TTL_COMPACTIONS);
       break;
+    case CompactionReason::kChangeTemperature:
+      RecordTick(stats_, FIFO_CHANGE_TEMPERATURE_COMPACTIONS);
+      break;
     default:
       assert(false);
       break;
diff --git a/db_stress_tool/db_stress_common.h b/db_stress_tool/db_stress_common.h
index ed0d50c9ec70..30dc1844f6ac 100644
--- a/db_stress_tool/db_stress_common.h
+++ b/db_stress_tool/db_stress_common.h
@@ -418,6 +418,7 @@ DECLARE_uint32(uncache_aggressiveness);
 DECLARE_int32(test_ingest_standalone_range_deletion_one_in);
 DECLARE_bool(allow_unprepared_value);
 DECLARE_string(file_temperature_age_thresholds);
+DECLARE_bool(allow_trivial_copy_when_change_temperature);
 DECLARE_uint32(commit_bypass_memtable_one_in);
 DECLARE_bool(track_and_verify_wals);
 DECLARE_bool(enable_remote_compaction);
diff --git a/db_stress_tool/db_stress_gflags.cc b/db_stress_tool/db_stress_gflags.cc
index dbe6fc09007a..ccafb423c11d 100644
--- a/db_stress_tool/db_stress_gflags.cc
+++ b/db_stress_tool/db_stress_gflags.cc
@@ -536,6 +536,9 @@ DEFINE_string(file_temperature_age_thresholds, "",
               "See CompactionOptionsFIFO::file_temperature_age_thresholds. "
               "empty == unset");
 
+DEFINE_bool(allow_trivial_copy_when_change_temperature, true,
+            "Allow kChangeTemperature to do trivial copy");
+
 static const bool FLAGS_subcompactions_dummy __attribute__((__unused__)) =
     RegisterFlagValidator(&FLAGS_subcompactions, &ValidateUint32Range);
 
diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc
index 6b37816c8f03..e464cf8ad4f3 100644
--- a/db_stress_tool/db_stress_test_base.cc
+++ b/db_stress_tool/db_stress_test_base.cc
@@ -427,6 +427,11 @@ bool StressTest::BuildOptionsTable() {
         std::vector<std::string>{
             "{{temperature=kWarm;age=30}:{temperature=kCold;age=300}}",
             "{{temperature=kCold;age=100}}", "{}"});
+    options_tbl.emplace(
+        "allow_trivial_copy_when_change_temperature",
+        std::vector<std::string>{
+            FLAGS_allow_trivial_copy_when_change_temperature ? "true"
+                                                             : "false"});
   }
 
   // NOTE: allow -1 to mean starting disabled but dynamically changing
@@ -4220,10 +4225,14 @@ void InitializeOptionsFromFlags(
       StringToTemperature(FLAGS_default_temperature.c_str());
 
   if (!FLAGS_file_temperature_age_thresholds.empty()) {
+    const std::string allowTrivialCopyBoolStr =
+        FLAGS_allow_trivial_copy_when_change_temperature ? "true" : "false";
     Status s = GetColumnFamilyOptionsFromString(
         {}, options,
         "compaction_options_fifo={file_temperature_age_thresholds=" +
-            FLAGS_file_temperature_age_thresholds + "}",
+            FLAGS_file_temperature_age_thresholds +
+            ";allow_trivial_copy_when_change_temperature=" +
+            allowTrivialCopyBoolStr + "}",
         &options);
     if (!s.ok()) {
       fprintf(stderr, "While setting file_temperature_age_thresholds: %s\n",
diff --git a/file/file_util.cc b/file/file_util.cc
index 105e88690226..b3f6128aae41 100644
--- a/file/file_util.cc
+++ b/file/file_util.cc
@@ -22,7 +22,10 @@ IOStatus CopyFile(FileSystem* fs, const std::string& source,
                   Temperature src_temp_hint,
                   std::unique_ptr<WritableFileWriter>& dest_writer,
                   uint64_t size, bool use_fsync,
-                  const std::shared_ptr<IOTracer>& io_tracer) {
+                  const std::shared_ptr<IOTracer>& io_tracer,
+                  uint64_t max_read_buffer_size,
+                  const std::optional<IOOptions>& readIOOptions,
+                  const std::optional<IOOptions>& writeIOOptions) {
   FileOptions soptions;
   IOStatus io_s;
   std::unique_ptr<SequentialFileReader> src_reader;
@@ -38,7 +41,8 @@ IOStatus CopyFile(FileSystem* fs, const std::string& source,
 
     if (size == 0) {
       // default argument means copy everything
-      io_s = fs->GetFileSize(source, opts, &size, nullptr);
+      io_s =
+          fs->GetFileSize(source, readIOOptions.value_or(opts), &size, nullptr);
       if (!io_s.ok()) {
         return io_s;
       }
@@ -47,14 +51,23 @@ IOStatus CopyFile(FileSystem* fs, const std::string& source,
         new SequentialFileReader(std::move(srcfile), source, io_tracer));
   }
 
-  char buffer[4096];
+  const size_t read_buffer_size = std::max(
+      static_cast<size_t>(4096), static_cast<size_t>(max_read_buffer_size));
+  std::unique_ptr<char[]> buffer;
+  buffer.reset(new char[read_buffer_size]);
+
+  Env::IOPriority read_rate_limiter_priority = Env::IO_TOTAL;
+  if (readIOOptions.has_value()) {
+    read_rate_limiter_priority = readIOOptions.value().rate_limiter_priority;
+  }
   Slice slice;
   while (size > 0) {
-    size_t bytes_to_read = std::min(sizeof(buffer), static_cast<size_t>(size));
+    size_t bytes_to_read = std::min(static_cast<size_t>(read_buffer_size),
+                                    static_cast<size_t>(size));
     // TODO: rate limit copy file
-    io_s = status_to_io_status(
-        src_reader->Read(bytes_to_read, &slice, buffer,
-                         Env::IO_TOTAL /* rate_limiter_priority */));
+    io_s = status_to_io_status(src_reader->Read(
+        bytes_to_read, &slice, buffer.get(),
+        read_rate_limiter_priority /* rate_limiter_priority */));
     if (!io_s.ok()) {
       return io_s;
     }
@@ -65,19 +78,22 @@ IOStatus CopyFile(FileSystem* fs, const std::string& source,
           std::to_string(dest_writer->GetFileSize()));
     }
 
-    io_s = dest_writer->Append(opts, slice);
+    io_s = dest_writer->Append(writeIOOptions.value_or(opts), slice);
     if (!io_s.ok()) {
       return io_s;
     }
     size -= slice.size();
   }
-  return dest_writer->Sync(opts, use_fsync);
+  return dest_writer->Sync(writeIOOptions.value_or(opts), use_fsync);
 }
 
 IOStatus CopyFile(FileSystem* fs, const std::string& source,
                   Temperature src_temp_hint, const std::string& destination,
                   Temperature dst_temp, uint64_t size, bool use_fsync,
-                  const std::shared_ptr<IOTracer>& io_tracer) {
+                  const std::shared_ptr<IOTracer>& io_tracer,
+                  uint64_t max_read_buffer_size,
+                  const std::optional<IOOptions>& readIOOptions,
+                  const std::optional<IOOptions>& writeIOOptions) {
   FileOptions options;
   IOStatus io_s;
   std::unique_ptr<WritableFileWriter> dest_writer;
@@ -96,7 +112,8 @@ IOStatus CopyFile(FileSystem* fs, const std::string& source,
   }
 
   return CopyFile(fs, source, src_temp_hint, dest_writer, size, use_fsync,
-                  io_tracer);
+                  io_tracer, max_read_buffer_size, readIOOptions,
+                  writeIOOptions);
 }
 
 // Utility function to create a file with the provided contents
diff --git a/file/file_util.h b/file/file_util.h
index 8a72fea27ad3..a8f20c86893a 100644
--- a/file/file_util.h
+++ b/file/file_util.h
@@ -24,18 +24,28 @@ IOStatus CopyFile(FileSystem* fs, const std::string& source,
                   Temperature src_temp_hint,
                   std::unique_ptr<WritableFileWriter>& dest_writer,
                   uint64_t size, bool use_fsync,
-                  const std::shared_ptr<IOTracer>& io_tracer);
+                  const std::shared_ptr<IOTracer>& io_tracer,
+                  uint64_t max_read_buffer_size = 4096,
+                  const std::optional<IOOptions>& readIOOptions = {},
+                  const std::optional<IOOptions>& writeIOOptions = {});
 IOStatus CopyFile(FileSystem* fs, const std::string& source,
                   Temperature src_temp_hint, const std::string& destination,
                   Temperature dst_temp, uint64_t size, bool use_fsync,
-                  const std::shared_ptr<IOTracer>& io_tracer);
+                  const std::shared_ptr<IOTracer>& io_tracer,
+                  uint64_t max_read_buffer_size = 4096,
+                  const std::optional<IOOptions>& readIOOptions = {},
+                  const std::optional<IOOptions>& writeIOOptions = {});
 inline IOStatus CopyFile(const std::shared_ptr<FileSystem>& fs,
                          const std::string& source, Temperature src_temp_hint,
                          const std::string& destination, Temperature dst_temp,
                          uint64_t size, bool use_fsync,
-                         const std::shared_ptr<IOTracer>& io_tracer) {
+                         const std::shared_ptr<IOTracer>& io_tracer,
+                         uint64_t max_read_buffer_size = 4096,
+                         const std::optional<IOOptions>& readIOOptions = {},
+                         const std::optional<IOOptions>& writeIOOptions = {}) {
   return CopyFile(fs.get(), source, src_temp_hint, destination, dst_temp, size,
-                  use_fsync, io_tracer);
+                  use_fsync, io_tracer, max_read_buffer_size, readIOOptions,
+                  writeIOOptions);
 }
 IOStatus CreateFile(FileSystem* fs, const std::string& destination,
                     const std::string& contents, bool use_fsync);
diff --git a/include/rocksdb/advanced_options.h b/include/rocksdb/advanced_options.h
index 2e3cb7ab9fba..067503ba01de 100644
--- a/include/rocksdb/advanced_options.h
+++ b/include/rocksdb/advanced_options.h
@@ -115,6 +115,21 @@ struct CompactionOptionsFIFO {
   // Default: empty
   std::vector<FileTemperatureAge> file_temperature_age_thresholds{};
 
+  // EXPERIMENTAL
+  // If true, when compaction is picked for kChangeTemperature reason,
+  // allow the trivia copy of the sst file from source FileSystem to
+  // destination FileSystem. If false, the changeTemperature will be
+  // the non-trivial copy by iterating/appending blocks by blocks of the
+  // sst file.
+  bool allow_trivial_copy_when_change_temperature = false;
+
+  // EXPERIMENTAL
+  // If 'allow_trivia_copy_op_when_change_temperature=true', the tmp buffer size
+  // to copy the file from the source FileSystem to the destnation FileSystem.
+  // If 'allow_trivia_copy_op_when_change_temperature=false', this field will
+  // not be used. The minmum buffer size must be at least 4KiB
+  uint64_t trivial_copy_buffer_size = 4096;
+
   CompactionOptionsFIFO() : max_table_files_size(1 * 1024 * 1024 * 1024) {}
   CompactionOptionsFIFO(uint64_t _max_table_files_size, bool _allow_compaction)
       : max_table_files_size(_max_table_files_size),
diff --git a/include/rocksdb/statistics.h b/include/rocksdb/statistics.h
index 50f27bcba9d2..21d7705b9f0c 100644
--- a/include/rocksdb/statistics.h
+++ b/include/rocksdb/statistics.h
@@ -516,6 +516,7 @@ enum Tickers : uint32_t {
   // Number of FIFO compactions that drop files based on different reasons
   FIFO_MAX_SIZE_COMPACTIONS,
   FIFO_TTL_COMPACTIONS,
+  FIFO_CHANGE_TEMPERATURE_COMPACTIONS,
 
   // Number of bytes prefetched during user initiated scan
   PREFETCH_BYTES,
diff --git a/java/rocksjni/portal.h b/java/rocksjni/portal.h
index e611d65d18fb..9f94bcee0273 100644
--- a/java/rocksjni/portal.h
+++ b/java/rocksjni/portal.h
@@ -5273,6 +5273,8 @@ class TickerTypeJni {
         return -0x56;
       case ROCKSDB_NAMESPACE::Tickers::FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT:
         return -0x57;
+      case ROCKSDB_NAMESPACE::Tickers::FIFO_CHANGE_TEMPERATURE_COMPACTIONS:
+        return -0x58;
       case ROCKSDB_NAMESPACE::Tickers::TICKER_ENUM_MAX:
         // -0x54 is the max value at this time. Since these values are exposed
         // directly to Java clients, we'll keep the value the same till the next
@@ -5735,6 +5737,8 @@ class TickerTypeJni {
       case -0x57:
         return ROCKSDB_NAMESPACE::Tickers::
             FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT;
+      case -0x58:
+        return ROCKSDB_NAMESPACE::Tickers::FIFO_CHANGE_TEMPERATURE_COMPACTIONS;
       case -0x54:
         // -0x54 is the max value at this time. Since these values are exposed
         // directly to Java clients, we'll keep the value the same till the next
diff --git a/monitoring/statistics.cc b/monitoring/statistics.cc
index e74028bee6e2..af63c639f34a 100644
--- a/monitoring/statistics.cc
+++ b/monitoring/statistics.cc
@@ -262,6 +262,8 @@ const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
     {READAHEAD_TRIMMED, "rocksdb.readahead.trimmed"},
     {FIFO_MAX_SIZE_COMPACTIONS, "rocksdb.fifo.max.size.compactions"},
     {FIFO_TTL_COMPACTIONS, "rocksdb.fifo.ttl.compactions"},
+    {FIFO_CHANGE_TEMPERATURE_COMPACTIONS,
+     "rocksdb.fifo.change_temperature.compactions"},
     {PREFETCH_BYTES, "rocksdb.prefetch.bytes"},
     {PREFETCH_BYTES_USEFUL, "rocksdb.prefetch.bytes.useful"},
     {PREFETCH_HITS, "rocksdb.prefetch.hits"},
diff --git a/options/cf_options.cc b/options/cf_options.cc
index fa60053eaec2..d5a61bcdbff8 100644
--- a/options/cf_options.cc
+++ b/options/cf_options.cc
@@ -301,7 +301,16 @@ static std::unordered_map<std::string, OptionTypeInfo>
              OptionTypeInfo::Struct("file_temperature_age_thresholds",
                                     &file_temperature_age_type_info, 0,
                                     OptionVerificationType::kNormal,
-                                    OptionTypeFlags::kMutable))}};
+                                    OptionTypeFlags::kMutable))},
+        {"allow_trivial_copy_when_change_temperature",
+         {offsetof(struct CompactionOptionsFIFO,
+                   allow_trivial_copy_when_change_temperature),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"trivial_copy_buffer_size",
+         {offsetof(struct CompactionOptionsFIFO, trivial_copy_buffer_size),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}}};
 
 static std::unordered_map<std::string, OptionTypeInfo>
     universal_compaction_options_type_info = {
diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index 5b4397d2c4dc..638bbb331c3c 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -616,6 +616,7 @@ def is_direct_io_supported(dbname):
             "{{temperature=kCold;age=100}}",
         ]
     ),
+    "allow_trivial_copy_when_change_temperature": lambda: random.choice([0, 1]),
     # tiered storage doesn't support blob db yet
     "enable_blob_files": 0,
     "use_blob_db": 0,
diff --git a/unreleased_history/new_features/fifo-tiering-change-temperatur-trivial-copy.md b/unreleased_history/new_features/fifo-tiering-change-temperatur-trivial-copy.md
new file mode 100644
index 000000000000..6888e67a10ae
--- /dev/null
+++ b/unreleased_history/new_features/fifo-tiering-change-temperatur-trivial-copy.md
@@ -0,0 +1 @@
+* Add new experimental `CompactionOptionsFIFO::allow_trivial_copy_when_change_temperature` along with `CompactionOptionsFIFO::trivial_copy_buffer_size` to allow optimizing FIFO compactions with tiering when kChangeTemperature to move files from source tier FileSystem to another tier FileSystem via trivial and direct copying raw sst file instead of reading thru the content of the SST file then rebuilding the table files.

From 36600d8fa0d5f4a130ef89a81c6efa68c615c2c7 Mon Sep 17 00:00:00 2001
From: anand76 <anand1976@users.noreply.github.com>
Date: Thu, 8 May 2025 17:39:40 -0700
Subject: [PATCH 081/500] Pass wrapped WritableFileWriter to
 ExternalTableBuilder (#13591)

Summary:
This PR fixes a bug where the file checksum for an external table file was not being calculated by SstFileWriter. The checksum is calculated in WritableFileWriter, so we need to pass that the the external table builder rather than the FSWritableFile pointer directly. However, WritableFileWriter is private to RocksDB, so wrap it in an FSWritableFile and pass it.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13591

Test Plan: Add a new test in table_test.cc

Reviewed By: jaykorean

Differential Revision: D74410563

Pulled By: anand1976

fbshipit-source-id: c7fa8142e20da8836589dee5fa50919951cf4046
---
 table/external_table.cc                       | 49 ++++++++++++-
 table/table_test.cc                           | 73 +++++++++++++++----
 .../bug_fixes/external_table_checksum.md      |  1 +
 3 files changed, 103 insertions(+), 20 deletions(-)
 create mode 100644 unreleased_history/bug_fixes/external_table_checksum.md

diff --git a/table/external_table.cc b/table/external_table.cc
index 2161544c2907..e2eb3e4f4ab7 100644
--- a/table/external_table.cc
+++ b/table/external_table.cc
@@ -219,8 +219,9 @@ class ExternalTableReaderAdapter : public TableReader {
 class ExternalTableBuilderAdapter : public TableBuilder {
  public:
   explicit ExternalTableBuilderAdapter(
-      std::unique_ptr<ExternalTableBuilder>&& builder)
-      : builder_(std::move(builder)), num_entries_(0) {}
+      std::unique_ptr<ExternalTableBuilder>&& builder,
+      std::unique_ptr<FSWritableFile>&& file)
+      : builder_(std::move(builder)), file_(std::move(file)), num_entries_(0) {}
 
   void Add(const Slice& key, const Slice& value) override {
     ParsedInternalKey pkey;
@@ -269,6 +270,7 @@ class ExternalTableBuilderAdapter : public TableBuilder {
  private:
   Status status_;
   std::unique_ptr<ExternalTableBuilder> builder_;
+  std::unique_ptr<FSWritableFile> file_;
   uint64_t num_entries_;
 };
 
@@ -309,10 +311,13 @@ class ExternalTableFactoryAdapter : public TableFactory {
         topts.read_options, topts.write_options,
         topts.moptions.prefix_extractor, topts.ioptions.user_comparator,
         topts.column_family_name, topts.reason);
+    auto file_wrapper =
+        std::make_unique<ExternalTableWritableFileWrapper>(file);
     builder.reset(inner_->NewTableBuilder(ext_topts, file->file_name(),
-                                          file->writable_file()));
+                                          file_wrapper.get()));
     if (builder) {
-      return new ExternalTableBuilderAdapter(std::move(builder));
+      return new ExternalTableBuilderAdapter(std::move(builder),
+                                             std::move(file_wrapper));
     }
     return nullptr;
   }
@@ -320,6 +325,42 @@ class ExternalTableFactoryAdapter : public TableFactory {
   std::unique_ptr<TableFactory> Clone() const override { return nullptr; }
 
  private:
+  // An FSWritableFile subclass for wrapping a WritableFileWriter. The
+  // latter is private to RocksDB, so we wrap it here in order to pass it
+  // to the ExternalTableBuilder. This is necessary for WritableFileWriter
+  // to intercept Append so that it can calculate the file checksum.
+  class ExternalTableWritableFileWrapper : public FSWritableFile {
+   public:
+    explicit ExternalTableWritableFileWrapper(WritableFileWriter* writer)
+        : writer_(writer) {}
+
+    using FSWritableFile::Append;
+    IOStatus Append(const Slice& data, const IOOptions& options,
+                    IODebugContext* /*dbg*/) override {
+      return writer_->Append(options, data);
+    }
+
+    IOStatus Close(const IOOptions& options, IODebugContext* /*dbg*/) override {
+      return writer_->Close(options);
+    }
+
+    IOStatus Flush(const IOOptions& options, IODebugContext* /*dbg*/) override {
+      return writer_->Flush(options);
+    }
+
+    IOStatus Sync(const IOOptions& options, IODebugContext* /*dbg*/) override {
+      return writer_->Sync(options, /*use_fsync=*/false);
+    }
+
+    uint64_t GetFileSize(const IOOptions& options,
+                         IODebugContext* dbg) override {
+      return writer_->writable_file()->GetFileSize(options, dbg);
+    }
+
+   private:
+    WritableFileWriter* writer_;
+  };
+
   std::shared_ptr<ExternalTableFactory> inner_;
 };
 
diff --git a/table/table_test.cc b/table/table_test.cc
index 2730185bae3b..07be36714d9f 100644
--- a/table/table_test.cc
+++ b/table/table_test.cc
@@ -6525,10 +6525,10 @@ TEST_F(CacheUsageOptionsOverridesTest, SanitizeAndValidateOptions) {
   Destroy(options);
 }
 
-class ExternalTableReaderTest : public DBTestBase {
+class ExternalTableTest : public DBTestBase {
  public:
-  ExternalTableReaderTest()
-      : DBTestBase("external_table_reader_test", /*env_do_fsync=*/false) {}
+  ExternalTableTest()
+      : DBTestBase("external_table_test", /*env_do_fsync=*/false) {}
 
  protected:
   class DummyExternalTableFile {
@@ -6872,7 +6872,7 @@ class ExternalTableReaderTest : public DBTestBase {
   };
 };
 
-TEST_F(ExternalTableReaderTest, BasicTest) {
+TEST_F(ExternalTableTest, BasicTest) {
   std::shared_ptr<ExternalTableFactory> factory =
       std::make_shared<DummyExternalTableFactory>();
 
@@ -6920,13 +6920,15 @@ TEST_F(ExternalTableReaderTest, BasicTest) {
   ASSERT_EQ(statuses[1], Status::NotFound());
 }
 
-TEST_F(ExternalTableReaderTest, SstReaderTest) {
+TEST_F(ExternalTableTest, SstReaderTest) {
+  if (encrypted_env_) {
+    ROCKSDB_GTEST_SKIP("Test requires non-encrypted environment");
+    return;
+  }
   Options options = GetDefaultOptions();
-  std::string dbname = test::PerThreadDBPath("external_table_reader_test");
+  std::string dbname = test::PerThreadDBPath("external_table_test");
   std::string ingest_file = dbname + "test.immutabledb";
   dbname += "_db";
-  // This test doesn't work with some custom Envs, like EncryptedEnv
-  options.env = Env::Default();
 
   std::shared_ptr<ExternalTableFactory> factory =
       std::make_shared<DummyExternalTableFactory>();
@@ -6953,13 +6955,50 @@ TEST_F(ExternalTableReaderTest, SstReaderTest) {
   ASSERT_TRUE(iter->status().ok());
 }
 
-TEST_F(ExternalTableReaderTest, DBIterTest) {
+TEST_F(ExternalTableTest, ExternalFileChecksumTest) {
+  if (encrypted_env_) {
+    ROCKSDB_GTEST_SKIP("Test requires non-encrypted environment");
+    return;
+  }
+  Options options = GetDefaultOptions();
+  std::string dbname = test::PerThreadDBPath("external_table_test");
+  std::string ingest_file = dbname + "test.immutable";
+  dbname += "_db";
+  ASSERT_OK(DestroyDB(dbname, options));
+
+  std::shared_ptr<ExternalTableFactory> factory =
+      std::make_shared<DummyExternalTableFactory>();
+  options.table_factory = NewExternalTableFactory(factory);
+
+  // Create a file
+  options.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory();
+  std::unique_ptr<SstFileWriter> writer;
+  writer.reset(new SstFileWriter(EnvOptions(), options));
+  ASSERT_OK(writer->Open(ingest_file));
+  ASSERT_OK(writer->Put("foo", "bar"));
+  ASSERT_OK(writer->Put("foo2", "bar2"));
+  ExternalSstFileInfo info;
+  ASSERT_OK(writer->Finish(&info));
+  writer.reset();
+
+  FileChecksumGenContext cksum_ctx;
+  FileChecksumGenCrc32c cksum_gen(cksum_ctx);
+  std::string file_data;
+  ASSERT_OK(ReadFileToString(options.env, ingest_file, &file_data));
+  cksum_gen.Update(file_data.data(), file_data.size());
+  cksum_gen.Finalize();
+  ASSERT_EQ(info.file_checksum, cksum_gen.GetChecksum());
+}
+
+TEST_F(ExternalTableTest, DBIterTest) {
+  if (encrypted_env_) {
+    ROCKSDB_GTEST_SKIP("Test requires non-encrypted environment");
+    return;
+  }
   Options options = GetDefaultOptions();
-  std::string dbname = test::PerThreadDBPath("external_table_reader_test");
+  std::string dbname = test::PerThreadDBPath("external_table_test");
   std::string ingest_file = dbname + "test.immutable";
   dbname += "_db";
-  // This test doesn't work with some custom Envs, like EncryptedEnv
-  options.env = Env::Default();
   ASSERT_OK(DestroyDB(dbname, options));
 
   std::shared_ptr<ExternalTableFactory> factory =
@@ -7007,13 +7046,15 @@ TEST_F(ExternalTableReaderTest, DBIterTest) {
   ASSERT_OK(db->Close());
 }
 
-TEST_F(ExternalTableReaderTest, DBMultiScanTest) {
+TEST_F(ExternalTableTest, DBMultiScanTest) {
+  if (encrypted_env_) {
+    ROCKSDB_GTEST_SKIP("Test requires non-encrypted environment");
+    return;
+  }
   Options options = GetDefaultOptions();
-  std::string dbname = test::PerThreadDBPath("external_table_reader_test");
+  std::string dbname = test::PerThreadDBPath("external_table_test");
   std::string ingest_file = dbname + "test.immutable";
   dbname += "_db";
-  // This test doesn't work with some custom Envs, like EncryptedEnv
-  options.env = Env::Default();
   ASSERT_OK(DestroyDB(dbname, options));
 
   std::shared_ptr<ExternalTableFactory> factory =
diff --git a/unreleased_history/bug_fixes/external_table_checksum.md b/unreleased_history/bug_fixes/external_table_checksum.md
new file mode 100644
index 000000000000..8b6dc226fab8
--- /dev/null
+++ b/unreleased_history/bug_fixes/external_table_checksum.md
@@ -0,0 +1 @@
+Pass wrapped WritableFileWriter pointer to ExternalTableBuilder so that the file checksum can be correctly calculated and returned by SstFileWriter for external table files.

From ef67339175c1aebe039c0671560b608e22656612 Mon Sep 17 00:00:00 2001
From: Changyu Bi <changyubi@meta.com>
Date: Fri, 9 May 2025 12:55:40 -0700
Subject: [PATCH 082/500] Small fix in secondary DB and stress test (#13594)

Summary:
We saw some crash test failure for secondary db. It happens during crash recovery verification. This PR logs the manifest number when such failure happens. This PR also includes a small fix in `TryCatchUpWithPrimary()` that could incorrectly check WAL not found case.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13594

Test Plan: monitor further secondary DB crash test failure.

Reviewed By: archang19

Differential Revision: D74488769

Pulled By: cbi42

fbshipit-source-id: 226e55b2f99a739e93abda3ee91c05b80f59bf6a
---
 db/db_impl/db_impl_secondary.cc         | 14 +++++++-------
 db_stress_tool/no_batched_ops_stress.cc | 15 ++++++++++++---
 2 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/db/db_impl/db_impl_secondary.cc b/db/db_impl/db_impl_secondary.cc
index b95566523f5a..d567238b854c 100644
--- a/db/db_impl/db_impl_secondary.cc
+++ b/db/db_impl/db_impl_secondary.cc
@@ -728,13 +728,13 @@ Status DBImplSecondary::TryCatchUpWithPrimary() {
     // instance
     if (s.ok()) {
       s = FindAndRecoverLogFiles(&cfds_changed, &job_context);
-    }
-    if (s.IsPathNotFound()) {
-      ROCKS_LOG_INFO(
-          immutable_db_options_.info_log,
-          "Secondary tries to read WAL, but WAL file(s) have already "
-          "been purged by primary.");
-      s = Status::OK();
+      if (s.IsPathNotFound()) {
+        ROCKS_LOG_INFO(
+            immutable_db_options_.info_log,
+            "Secondary tries to read WAL, but WAL file(s) have already "
+            "been purged by primary.");
+        s = Status::OK();
+      }
     }
     if (s.ok()) {
       for (auto cfd : cfds_changed) {
diff --git a/db_stress_tool/no_batched_ops_stress.cc b/db_stress_tool/no_batched_ops_stress.cc
index 347c03b6519d..93294423da2f 100644
--- a/db_stress_tool/no_batched_ops_stress.cc
+++ b/db_stress_tool/no_batched_ops_stress.cc
@@ -233,6 +233,13 @@ class NonBatchedOpsStressTest : public StressTest {
           }
 
           Status s = secondary_db_->TryCatchUpWithPrimary();
+#ifndef NDEBUG
+          uint64_t manifest_num = static_cast_with_check<DBImpl>(secondary_db_)
+                                      ->TEST_Current_Manifest_FileNo();
+#else
+          uint64_t manifest_num = 0;
+#endif
+
           if (!s.ok()) {
             VerificationAbort(shared,
                               "Secondary failed to catch up to the primary");
@@ -267,9 +274,11 @@ class NonBatchedOpsStressTest : public StressTest {
             assert(!pre_read_expected_values.empty() &&
                    static_cast<size_t>(i - start) <
                        pre_read_expected_values.size());
-            VerifyValueRange(static_cast<int>(cf), i, options, shared, from_db,
-                             /* msg_prefix */ "Secondary get verification", s,
-                             pre_read_expected_values[i - start]);
+            VerifyValueRange(
+                static_cast<int>(cf), i, options, shared, from_db,
+                /* msg_prefix */ "Secondary get verification, manifest: " +
+                    std::to_string(manifest_num),
+                s, pre_read_expected_values[i - start]);
           }
         }
       } else if (method == VerificationMethod::kGetEntity) {

From 0102b1769b55ad29ba3b53c66c0576fc10f2da72 Mon Sep 17 00:00:00 2001
From: Changyu Bi <changyubi@meta.com>
Date: Mon, 12 May 2025 11:53:16 -0700
Subject: [PATCH 083/500] Log pre-compression size written per level in
 compaction stats (#13596)

Summary:
Add a new field to Compaction Stats to track the pre-compression size written to each level. This logged in LOG files as column WPreComp(GB). Also improved logging of compaction_started event to include cf name.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13596

Test Plan:
* Manually check LOG of db_bench runs:
With no compression
```
** Compaction Stats [default] **
Level    Files   Size     Score Read(GB)  Rn(GB) Rnp1(GB) Write(GB) WPreComp(GB) Wnew(GB) Moved(GB) W-Amp Rd(MB/s) Wr(MB/s) Comp(sec) CompMergeCPU(sec) Comp(cnt) Avg(sec) KeyIn KeyDrop Rblob(GB) Wblob(GB)
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
  L0     21/9     96.06 MB   3.0      0.0     0.0      0.0       0.4       0.4      0.4       0.0   1.0      0.0    202.0      2.22              1.27        98    0.023   3829K      0       0.0       0.0
  L1      6/6    344.89 MB   0.0      1.5     0.3      1.2       1.5       1.5      0.3       0.0   4.4    280.4    279.1      5.52              5.15        10    0.552     13M    44K       0.0       0.0
 Sum     27/15   440.95 MB   0.0      1.5     0.3      1.2       1.9       1.9      0.8       0.0   4.4    200.0    257.0      7.74              6.42       108    0.072     17M    44K       0.0       0.0
 Int      0/0      0.00 KB   0.0      0.3     0.1      0.3       0.4       0.4      0.1       0.0   6.8    219.2    255.7      1.58              1.36        14    0.113   3484K    12K       0.0       0.0

** Compaction Stats [default] **
Priority    Files   Size     Score Read(GB)  Rn(GB) Rnp1(GB) Write(GB) WPreComp(GB) Wnew(GB) Moved(GB) W-Amp Rd(MB/s) Wr(MB/s) Comp(sec) CompMergeCPU(sec) Comp(cnt) Avg(sec) KeyIn KeyDrop Rblob(GB) Wblob(GB)
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 Low      0/0      0.00 KB   0.0      1.5     0.3      1.2       1.5       1.5      0.3       0.0   0.0    280.4    279.1      5.52              5.15        10    0.552     13M    44K       0.0       0.0
High      0/0      0.00 KB   0.0      0.0     0.0      0.0       0.4       0.4      0.4       0.0   0.0      0.0    202.0      2.22              1.27        98    0.023   3829K      0       0.0       0.0
```

With expected compression ratio = 0.5
```
** Compaction Stats [default] **
Level    Files   Size     Score Read(GB)  Rn(GB) Rnp1(GB) Write(GB) WPreComp(GB) Wnew(GB) Moved(GB) W-Amp Rd(MB/s) Wr(MB/s) Comp(sec) CompMergeCPU(sec) Comp(cnt) Avg(sec) KeyIn KeyDrop Rblob(GB) Wblob(GB)
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
  L0     21/10    54.23 MB   2.8      0.0     0.0      0.0       0.2       0.4      0.2       0.0   1.0      0.0    105.2      1.96              1.29        80    0.025   3126K      0       0.0       0.0
  L1      3/3    140.18 MB   0.0      0.5     0.1      0.4       0.5       0.9      0.1       0.0   3.4    131.1    128.1      3.99              3.89         8    0.499   8324K    26K       0.0       0.0
 Sum     24/13   194.41 MB   0.0      0.5     0.1      0.4       0.7       1.3      0.3       0.0   3.5     87.9    120.5      5.96              5.17        88    0.068     11M    26K       0.0       0.0
 Int      0/0      0.00 KB   0.0      0.3     0.1      0.2       0.3       0.6      0.1       0.0   5.7    105.7    125.9      2.45              2.23        23    0.107   4973K    15K       0.0       0.0

** Compaction Stats [default] **
Priority    Files   Size     Score Read(GB)  Rn(GB) Rnp1(GB) Write(GB) WPreComp(GB) Wnew(GB) Moved(GB) W-Amp Rd(MB/s) Wr(MB/s) Comp(sec) CompMergeCPU(sec) Comp(cnt) Avg(sec) KeyIn KeyDrop Rblob(GB) Wblob(GB)
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 Low      0/0      0.00 KB   0.0      0.5     0.1      0.4       0.5       0.9      0.1       0.0   0.0    131.1    128.1      3.99              3.89         8    0.499   8324K    26K       0.0       0.0
High      0/0      0.00 KB   0.0      0.0     0.0      0.0       0.2       0.4      0.2       0.0   0.0      0.0    105.2      1.96              1.29        80    0.025   3126K      0       0.0       0.0
```

Reviewed By: hx235

Differential Revision: D74588464

Pulled By: cbi42

fbshipit-source-id: a998c0433230db4f3d7808636215b886b9ca5220
---
 db/builder.cc                                 |  3 +++
 db/compaction/compaction_job.cc               |  4 ++--
 db/compaction/compaction_outputs.cc           |  1 +
 db/internal_stats.cc                          | 23 ++++++++++++-------
 db/internal_stats.h                           | 11 +++++++++
 .../block_based/block_based_table_builder.cc  | 12 ++++++++++
 table/block_based/block_based_table_builder.h |  2 ++
 table/table_builder.h                         |  3 +++
 .../new_features/log-precompression-size.md   |  1 +
 9 files changed, 50 insertions(+), 10 deletions(-)
 create mode 100644 unreleased_history/new_features/log-precompression-size.md

diff --git a/db/builder.cc b/db/builder.cc
index 631530bf5666..2caa75c04630 100644
--- a/db/builder.cc
+++ b/db/builder.cc
@@ -341,6 +341,9 @@ Status BuildTable(
     }
 
     if (s.ok() && !empty) {
+      if (flush_stats) {
+        flush_stats->bytes_written_pre_comp = builder->PreCompressionSize();
+      }
       uint64_t file_size = builder->FileSize();
       meta->fd.file_size = file_size;
       meta->tail_size = builder->GetTailSize();
diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc
index 88ad5490f511..fa7e76012d66 100644
--- a/db/compaction/compaction_job.cc
+++ b/db/compaction/compaction_job.cc
@@ -2300,8 +2300,8 @@ void CompactionJob::LogCompaction() {
                    cfd->GetName().c_str(), scratch);
     // build event logger report
     auto stream = event_logger_->Log();
-    stream << "job" << job_id_ << "event" << "compaction_started"
-           << "compaction_reason"
+    stream << "job" << job_id_ << "event" << "compaction_started" << "cf_name"
+           << cfd->GetName() << "compaction_reason"
            << GetCompactionReasonString(compaction->compaction_reason());
     for (size_t i = 0; i < compaction->num_input_levels(); ++i) {
       stream << ("files_L" + std::to_string(compaction->level(i)));
diff --git a/db/compaction/compaction_outputs.cc b/db/compaction/compaction_outputs.cc
index e1eb1f449394..287dd98c106c 100644
--- a/db/compaction/compaction_outputs.cc
+++ b/db/compaction/compaction_outputs.cc
@@ -54,6 +54,7 @@ Status CompactionOutputs::Finish(
   }
   current_output().finished = true;
   stats_.bytes_written += current_bytes;
+  stats_.bytes_written_pre_comp += builder_->PreCompressionSize();
   stats_.num_output_files = static_cast<int>(outputs_.size());
 
   return s;
diff --git a/db/internal_stats.cc b/db/internal_stats.cc
index 8e8e6d27ef10..c25f7c589b1f 100644
--- a/db/internal_stats.cc
+++ b/db/internal_stats.cc
@@ -45,6 +45,8 @@ const std::map<LevelStatType, LevelStat> InternalStats::compaction_level_stats =
         {LevelStatType::RN_GB, LevelStat{"RnGB", "Rn(GB)"}},
         {LevelStatType::RNP1_GB, LevelStat{"Rnp1GB", "Rnp1(GB)"}},
         {LevelStatType::WRITE_GB, LevelStat{"WriteGB", "Write(GB)"}},
+        {LevelStatType::WRITE_PRE_COMP_GB,
+         LevelStat{"WPreCompGB", "WPreComp(GB)"}},
         {LevelStatType::W_NEW_GB, LevelStat{"WnewGB", "Wnew(GB)"}},
         {LevelStatType::MOVED_GB, LevelStat{"MovedGB", "Moved(GB)"}},
         {LevelStatType::WRITE_AMP, LevelStat{"WriteAmp", "W-Amp"}},
@@ -100,19 +102,20 @@ void PrintLevelStatsHeader(char* buf, size_t len, const std::string& cf_name,
   int line_size = snprintf(
       buf + written_size, len - written_size,
       "%s    %s   %s     %s %s  %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s "
+      "%s "
       "%s\n",
       // Note that we skip COMPACTED_FILES and merge it with Files column
       group_by.c_str(), hdr(LevelStatType::NUM_FILES),
       hdr(LevelStatType::SIZE_BYTES), hdr(LevelStatType::SCORE),
       hdr(LevelStatType::READ_GB), hdr(LevelStatType::RN_GB),
       hdr(LevelStatType::RNP1_GB), hdr(LevelStatType::WRITE_GB),
-      hdr(LevelStatType::W_NEW_GB), hdr(LevelStatType::MOVED_GB),
-      hdr(LevelStatType::WRITE_AMP), hdr(LevelStatType::READ_MBPS),
-      hdr(LevelStatType::WRITE_MBPS), hdr(LevelStatType::COMP_SEC),
-      hdr(LevelStatType::COMP_CPU_SEC), hdr(LevelStatType::COMP_COUNT),
-      hdr(LevelStatType::AVG_SEC), hdr(LevelStatType::KEY_IN),
-      hdr(LevelStatType::KEY_DROP), hdr(LevelStatType::R_BLOB_GB),
-      hdr(LevelStatType::W_BLOB_GB));
+      hdr(LevelStatType::WRITE_PRE_COMP_GB), hdr(LevelStatType::W_NEW_GB),
+      hdr(LevelStatType::MOVED_GB), hdr(LevelStatType::WRITE_AMP),
+      hdr(LevelStatType::READ_MBPS), hdr(LevelStatType::WRITE_MBPS),
+      hdr(LevelStatType::COMP_SEC), hdr(LevelStatType::COMP_CPU_SEC),
+      hdr(LevelStatType::COMP_COUNT), hdr(LevelStatType::AVG_SEC),
+      hdr(LevelStatType::KEY_IN), hdr(LevelStatType::KEY_DROP),
+      hdr(LevelStatType::R_BLOB_GB), hdr(LevelStatType::W_BLOB_GB));
 
   written_size += line_size;
   written_size = std::min(written_size, static_cast<int>(len));
@@ -140,6 +143,8 @@ void PrepareLevelStats(std::map<LevelStatType, double>* level_stats,
       stats.bytes_read_non_output_levels / kGB;
   (*level_stats)[LevelStatType::RNP1_GB] = stats.bytes_read_output_level / kGB;
   (*level_stats)[LevelStatType::WRITE_GB] = stats.bytes_written / kGB;
+  (*level_stats)[LevelStatType::WRITE_PRE_COMP_GB] =
+      stats.bytes_written_pre_comp / kGB;
   (*level_stats)[LevelStatType::W_NEW_GB] = bytes_new / kGB;
   (*level_stats)[LevelStatType::MOVED_GB] = stats.bytes_moved / kGB;
   (*level_stats)[LevelStatType::WRITE_AMP] = w_amp;
@@ -164,12 +169,13 @@ void PrintLevelStats(char* buf, size_t len, const std::string& name,
       buf, len,
       "%4s "      /*  Level */
       "%6d/%-3d " /*  Files */
-      "%8s "      /*  Size */
+      "%10s "     /*  Size */
       "%5.1f "    /*  Score */
       "%8.1f "    /*  Read(GB) */
       "%7.1f "    /*  Rn(GB) */
       "%8.1f "    /*  Rnp1(GB) */
       "%9.1f "    /*  Write(GB) */
+      "%9.1f "    /*  WPreComp(GB) */
       "%8.1f "    /*  Wnew(GB) */
       "%9.1f "    /*  Moved(GB) */
       "%5.1f "    /*  W-Amp */
@@ -193,6 +199,7 @@ void PrintLevelStats(char* buf, size_t len, const std::string& name,
       stat_value.at(LevelStatType::RN_GB),
       stat_value.at(LevelStatType::RNP1_GB),
       stat_value.at(LevelStatType::WRITE_GB),
+      stat_value.at(LevelStatType::WRITE_PRE_COMP_GB),
       stat_value.at(LevelStatType::W_NEW_GB),
       stat_value.at(LevelStatType::MOVED_GB),
       stat_value.at(LevelStatType::WRITE_AMP),
diff --git a/db/internal_stats.h b/db/internal_stats.h
index cc1b1317df61..a1b4fbe6c555 100644
--- a/db/internal_stats.h
+++ b/db/internal_stats.h
@@ -71,6 +71,7 @@ enum class LevelStatType {
   RN_GB,
   RNP1_GB,
   WRITE_GB,
+  WRITE_PRE_COMP_GB,
   W_NEW_GB,
   MOVED_GB,
   WRITE_AMP,
@@ -179,6 +180,9 @@ class InternalStats {
     // Total number of bytes written to table files during compaction
     uint64_t bytes_written;
 
+    // Total number of bytes written pre-compression during compaction
+    uint64_t bytes_written_pre_comp;
+
     // Total number of bytes written to blob files during compaction
     uint64_t bytes_written_blob;
 
@@ -231,6 +235,7 @@ class InternalStats {
           bytes_skipped_output_level(0),
           bytes_read_blob(0),
           bytes_written(0),
+          bytes_written_pre_comp(0),
           bytes_written_blob(0),
           bytes_moved(0),
           num_input_files_in_non_output_levels(0),
@@ -258,6 +263,7 @@ class InternalStats {
           bytes_skipped_output_level(0),
           bytes_read_blob(0),
           bytes_written(0),
+          bytes_written_pre_comp(0),
           bytes_written_blob(0),
           bytes_moved(0),
           num_input_files_in_non_output_levels(0),
@@ -291,6 +297,7 @@ class InternalStats {
           bytes_skipped_output_level(c.bytes_skipped_output_level),
           bytes_read_blob(c.bytes_read_blob),
           bytes_written(c.bytes_written),
+          bytes_written_pre_comp(c.bytes_written_pre_comp),
           bytes_written_blob(c.bytes_written_blob),
           bytes_moved(c.bytes_moved),
           num_input_files_in_non_output_levels(
@@ -321,6 +328,7 @@ class InternalStats {
       bytes_skipped_output_level = c.bytes_skipped_output_level;
       bytes_read_blob = c.bytes_read_blob;
       bytes_written = c.bytes_written;
+      bytes_written_pre_comp = c.bytes_written_pre_comp;
       bytes_written_blob = c.bytes_written_blob;
       bytes_moved = c.bytes_moved;
       num_input_files_in_non_output_levels =
@@ -353,6 +361,7 @@ class InternalStats {
       this->bytes_skipped_output_level = 0;
       this->bytes_read_blob = 0;
       this->bytes_written = 0;
+      this->bytes_written_pre_comp = 0;
       this->bytes_written_blob = 0;
       this->bytes_moved = 0;
       this->num_input_files_in_non_output_levels = 0;
@@ -381,6 +390,7 @@ class InternalStats {
       this->bytes_skipped_output_level += c.bytes_skipped_output_level;
       this->bytes_read_blob += c.bytes_read_blob;
       this->bytes_written += c.bytes_written;
+      this->bytes_written_pre_comp += c.bytes_written_pre_comp;
       this->bytes_written_blob += c.bytes_written_blob;
       this->bytes_moved += c.bytes_moved;
       this->num_input_files_in_non_output_levels +=
@@ -413,6 +423,7 @@ class InternalStats {
       this->bytes_skipped_output_level -= c.bytes_skipped_output_level;
       this->bytes_read_blob -= c.bytes_read_blob;
       this->bytes_written -= c.bytes_written;
+      this->bytes_written_pre_comp -= c.bytes_written_pre_comp;
       this->bytes_written_blob -= c.bytes_written_blob;
       this->bytes_moved -= c.bytes_moved;
       this->num_input_files_in_non_output_levels -=
diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc
index 4bfac6148e07..96819b7a7061 100644
--- a/table/block_based/block_based_table_builder.cc
+++ b/table/block_based/block_based_table_builder.cc
@@ -328,6 +328,10 @@ struct BlockBasedTableBuilder::Rep {
   // all blocks after data blocks till the end of the SST file.
   uint64_t tail_size;
 
+  // The total size of all blocks in this file before they are compressed.
+  // This is used for logging compaction stats.
+  uint64_t pre_compression_size = 0;
+
   // See class Footer
   uint32_t base_context_checksum;
 
@@ -1443,6 +1447,8 @@ void BlockBasedTableBuilder::WriteMaybeCompressedBlock(
     }
   }
 
+  r->pre_compression_size +=
+      uncompressed_block_data->size() + kBlockTrailerSize;
   r->set_offset(r->get_offset() + block_contents.size() + kBlockTrailerSize);
   if (r->table_options.block_align && is_data_block) {
     size_t pad_bytes =
@@ -1452,6 +1458,7 @@ void BlockBasedTableBuilder::WriteMaybeCompressedBlock(
 
     io_s = r->file->Pad(io_options, pad_bytes);
     if (io_s.ok()) {
+      r->pre_compression_size += pad_bytes;
       r->set_offset(r->get_offset() + pad_bytes);
     } else {
       r->SetIOStatus(io_s);
@@ -1889,6 +1896,7 @@ void BlockBasedTableBuilder::WriteFooter(BlockHandle& metaindex_block_handle,
   }
   ios = r->file->Append(io_options, footer.GetSlice());
   if (ios.ok()) {
+    r->pre_compression_size += footer.GetSlice().size();
     r->set_offset(r->get_offset() + footer.GetSlice().size());
   } else {
     r->SetIOStatus(ios);
@@ -2141,6 +2149,10 @@ bool BlockBasedTableBuilder::IsEmpty() const {
   return rep_->props.num_entries == 0 && rep_->props.num_range_deletions == 0;
 }
 
+uint64_t BlockBasedTableBuilder::PreCompressionSize() const {
+  return rep_->pre_compression_size;
+}
+
 uint64_t BlockBasedTableBuilder::FileSize() const { return rep_->offset; }
 
 uint64_t BlockBasedTableBuilder::EstimatedFileSize() const {
diff --git a/table/block_based/block_based_table_builder.h b/table/block_based/block_based_table_builder.h
index a2501e962198..708a0c51922a 100644
--- a/table/block_based/block_based_table_builder.h
+++ b/table/block_based/block_based_table_builder.h
@@ -83,6 +83,8 @@ class BlockBasedTableBuilder : public TableBuilder {
 
   bool IsEmpty() const override;
 
+  uint64_t PreCompressionSize() const override;
+
   // Size of the file generated so far.  If invoked after a successful
   // Finish() call, returns the size of the final generated file.
   uint64_t FileSize() const override;
diff --git a/table/table_builder.h b/table/table_builder.h
index 5ed7aba51f3d..10b3476b6b68 100644
--- a/table/table_builder.h
+++ b/table/table_builder.h
@@ -207,6 +207,9 @@ class TableBuilder {
     return NumEntries() == 0 && GetTableProperties().num_range_deletions == 0;
   }
 
+  // Size of the file before its content is compressed.
+  virtual uint64_t PreCompressionSize() const { return 0; }
+
   // Size of the file generated so far.  If invoked after a successful
   // Finish() call, returns the size of the final generated file.
   virtual uint64_t FileSize() const = 0;
diff --git a/unreleased_history/new_features/log-precompression-size.md b/unreleased_history/new_features/log-precompression-size.md
new file mode 100644
index 000000000000..6266e6fe3f0b
--- /dev/null
+++ b/unreleased_history/new_features/log-precompression-size.md
@@ -0,0 +1 @@
+* Add a new field to Compaction Stats in LOG files for the pre-compression size written to each level.

From 0e3e3493692a522641d3be2d4d927a27238f2c2c Mon Sep 17 00:00:00 2001
From: Changyu Bi <changyubi@meta.com>
Date: Mon, 12 May 2025 15:42:25 -0700
Subject: [PATCH 084/500] Fix an infinite-loop bug in transaction locking
 (#13585)

Summary:
when a transaction reaches lock limit and times out before it attempts to wait for it (https://github.com/facebook/rocksdb/blob/9d1a071194de8093bbf3f8f57ffd176278359bf0/utilities/transactions/lock/point/point_lock_manager.cc#L320), it can busy-loop forever even though its timeout is expired. This PR fixes this bug by setting a timeout status when its timeout is reached.

This PR also updates the `LockLimit` status from `Busy` to `Aborted`, this matches the check in `Status::IsLockLimit()` and matches the customer usage (https://github.com/facebook/mysql-5.6/blob/c6e4b9f3f93dce206370105fe73ee337ece0c5e7/storage/rocksdb/ha_rocksdb.cc#L10745-L10746).

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13585

Test Plan: added a unit test that would infinite-loop before this fix.

Reviewed By: jaykorean

Differential Revision: D74077824

Pulled By: cbi42

fbshipit-source-id: 4993d4e4c71bb1594835e9ec6ff4a74d453a9190
---
 include/rocksdb/status.h                      |  2 +
 .../bug_fixes/lock-limit-timeout.md           |  1 +
 .../lock/point/point_lock_manager.cc          | 33 ++++++++-----
 .../lock/point/point_lock_manager.h           |  2 +
 .../range_tree/range_tree_lock_manager.cc     |  4 +-
 utilities/transactions/transaction_test.cc    | 48 +++++++++++++++++--
 6 files changed, 71 insertions(+), 19 deletions(-)
 create mode 100644 unreleased_history/bug_fixes/lock-limit-timeout.md

diff --git a/include/rocksdb/status.h b/include/rocksdb/status.h
index 82597239fff7..fad18d673936 100644
--- a/include/rocksdb/status.h
+++ b/include/rocksdb/status.h
@@ -316,6 +316,8 @@ class Status {
     return Status(kInvalidArgument, kTxnNotPrepared, msg, msg2);
   }
 
+  static Status LockLimit() { return Status(kAborted, kLockLimit); }
+
   // Returns true iff the status indicates success.
   bool ok() const {
     MarkChecked();
diff --git a/unreleased_history/bug_fixes/lock-limit-timeout.md b/unreleased_history/bug_fixes/lock-limit-timeout.md
new file mode 100644
index 000000000000..55eb4726feed
--- /dev/null
+++ b/unreleased_history/bug_fixes/lock-limit-timeout.md
@@ -0,0 +1 @@
+* Fix an infinite-loop bug in transaction locking. This can happen if a transaction reaches lock limit and its time out expires before it attempts to wait for it.
diff --git a/utilities/transactions/lock/point/point_lock_manager.cc b/utilities/transactions/lock/point/point_lock_manager.cc
index 4cd6e6b16081..6f9d95aefa90 100644
--- a/utilities/transactions/lock/point/point_lock_manager.cc
+++ b/utilities/transactions/lock/point/point_lock_manager.cc
@@ -277,13 +277,13 @@ Status PointLockManager::AcquireWithTimeout(
   autovector<TransactionID> wait_ids;
   result = AcquireLocked(lock_map, stripe, key, env, lock_info,
                          &expire_time_hint, &wait_ids);
-
   if (!result.ok() && timeout != 0) {
     PERF_TIMER_GUARD(key_lock_wait_time);
     PERF_COUNTER_ADD(key_lock_wait_count, 1);
     // If we weren't able to acquire the lock, we will keep retrying as long
     // as the timeout allows.
     bool timed_out = false;
+    bool cv_wait_fail = false;
     do {
       // Decide how long to wait
       int64_t cv_end_time = -1;
@@ -294,8 +294,7 @@ Status PointLockManager::AcquireWithTimeout(
       } else if (end_time > 0) {
         cv_end_time = end_time;
       }
-
-      assert(result.IsBusy() || wait_ids.size() != 0);
+      assert(result.IsLockLimit() == wait_ids.empty());
 
       // We are dependent on a transaction to finish, so perform deadlock
       // detection.
@@ -315,7 +314,12 @@ Status PointLockManager::AcquireWithTimeout(
       if (cv_end_time < 0) {
         // Wait indefinitely
         result = stripe->stripe_cv->Wait(stripe->stripe_mutex);
+        cv_wait_fail = !result.ok();
       } else {
+        // FIXME: in this case, cv_end_time could be `expire_time_hint` from the
+        // current lock holder, a time out does not mean we reached the current
+        // transaction's timeout, and we should continue to retry locking
+        // instead of exiting this while loop below.
         uint64_t now = env->NowMicros();
         if (static_cast<uint64_t>(cv_end_time) > now) {
           // This may be invoked multiple times since we divide
@@ -323,6 +327,10 @@ Status PointLockManager::AcquireWithTimeout(
           (void)ROCKSDB_THREAD_YIELD_CHECK_ABORT();
           result = stripe->stripe_cv->WaitFor(stripe->stripe_mutex,
                                               cv_end_time - now);
+          cv_wait_fail = !result.ok() && !result.IsTimedOut();
+        } else {
+          // now >= cv_end_time, we already timed out
+          result = Status::TimedOut(Status::SubCode::kLockTimeout);
         }
       }
 
@@ -332,6 +340,9 @@ Status PointLockManager::AcquireWithTimeout(
           DecrementWaiters(txn, wait_ids);
         }
       }
+      if (cv_wait_fail) {
+        break;
+      }
 
       if (result.IsTimedOut()) {
         timed_out = true;
@@ -339,12 +350,10 @@ Status PointLockManager::AcquireWithTimeout(
         // acquire lock below (it is possible the lock expired and we
         // were never signaled).
       }
-
-      if (result.ok() || result.IsTimedOut()) {
-        wait_ids.clear();
-        result = AcquireLocked(lock_map, stripe, key, env, lock_info,
-                               &expire_time_hint, &wait_ids);
-      }
+      assert(result.ok() || result.IsTimedOut());
+      wait_ids.clear();
+      result = AcquireLocked(lock_map, stripe, key, env, lock_info,
+                             &expire_time_hint, &wait_ids);
     } while (!result.ok() && !timed_out);
   }
 
@@ -477,8 +486,8 @@ bool PointLockManager::IncrementWaiters(
 // Returns Status::TimeOut if the lock cannot be acquired due to it being
 // held by other transactions, `txn_ids` will be populated with the id of
 // transactions that hold the lock, excluding lock_info.txn_ids[0].
-// Returns Status::Busy if the lock cannot be acquired due to reaching
-// per CF limit on the number of locks.
+// Returns Status::Aborted(kLockLimit) if the lock cannot be acquired due to
+// reaching per CF limit on the number of locks.
 //
 // REQUIRED:  Stripe mutex must be held. txn_ids must be empty.
 Status PointLockManager::AcquireLocked(LockMap* lock_map, LockMapStripe* stripe,
@@ -538,7 +547,7 @@ Status PointLockManager::AcquireLocked(LockMap* lock_map, LockMapStripe* stripe,
     // Check lock limit
     if (max_num_locks_ > 0 &&
         lock_map->lock_cnt.load(std::memory_order_acquire) >= max_num_locks_) {
-      result = Status::Busy(Status::SubCode::kLockLimit);
+      result = Status::LockLimit();
     } else {
       // acquire lock
       stripe->keys.emplace(key, txn_lock_info);
diff --git a/utilities/transactions/lock/point/point_lock_manager.h b/utilities/transactions/lock/point/point_lock_manager.h
index 99183ca1cd2f..c93006df7354 100644
--- a/utilities/transactions/lock/point/point_lock_manager.h
+++ b/utilities/transactions/lock/point/point_lock_manager.h
@@ -209,6 +209,8 @@ class PointLockManager : public LockManager {
   void UnLockKey(PessimisticTransaction* txn, const std::string& key,
                  LockMapStripe* stripe, LockMap* lock_map, Env* env);
 
+  // Returns true if a deadlock is detected.
+  // Will DecrementWaiters() if a deadlock is detected.
   bool IncrementWaiters(const PessimisticTransaction* txn,
                         const autovector<TransactionID>& wait_ids,
                         const std::string& key, const uint32_t& cf_id,
diff --git a/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc b/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc
index 584d9ebc2765..7674dab03f3e 100644
--- a/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc
+++ b/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc
@@ -130,7 +130,7 @@ Status RangeTreeLockManager::TryLock(PessimisticTransaction* txn,
     case DB_LOCK_NOTGRANTED:
       return Status::TimedOut(Status::SubCode::kLockTimeout);
     case TOKUDB_OUT_OF_LOCKS:
-      return Status::Busy(Status::SubCode::kLockLimit);
+      return Status::LockLimit();
     case DB_LOCK_DEADLOCK: {
       std::reverse(di_path.begin(), di_path.end());
       dlock_buffer_.AddNewPath(
@@ -139,7 +139,7 @@ Status RangeTreeLockManager::TryLock(PessimisticTransaction* txn,
     }
     default:
       assert(0);
-      return Status::Busy(Status::SubCode::kLockLimit);
+      return Status::LockLimit();
   }
 
   return Status::OK();
diff --git a/utilities/transactions/transaction_test.cc b/utilities/transactions/transaction_test.cc
index a1cd6aaee431..641299ba0540 100644
--- a/utilities/transactions/transaction_test.cc
+++ b/utilities/transactions/transaction_test.cc
@@ -3912,16 +3912,16 @@ TEST_P(TransactionTest, LockLimitTest) {
 
   // lock limit reached
   s = txn->Put("W", "w");
-  ASSERT_TRUE(s.IsBusy());
+  ASSERT_TRUE(s.IsLockLimit());
 
   // re-locking same key shouldn't put us over the limit
   s = txn->Put("X", "xx");
   ASSERT_OK(s);
 
   s = txn->GetForUpdate(read_options, "W", &value);
-  ASSERT_TRUE(s.IsBusy());
+  ASSERT_TRUE(s.IsLockLimit());
   s = txn->GetForUpdate(read_options, "V", &value);
-  ASSERT_TRUE(s.IsBusy());
+  ASSERT_TRUE(s.IsLockLimit());
 
   // re-locking same key shouldn't put us over the limit
   s = txn->GetForUpdate(read_options, "Y", &value);
@@ -3940,7 +3940,7 @@ TEST_P(TransactionTest, LockLimitTest) {
 
   // lock limit reached
   s = txn2->Put("M", "m");
-  ASSERT_TRUE(s.IsBusy());
+  ASSERT_TRUE(s.IsLockLimit());
 
   s = txn->Commit();
   ASSERT_OK(s);
@@ -3967,7 +3967,7 @@ TEST_P(TransactionTest, LockLimitTest) {
 
   // lock limit reached
   s = txn2->Delete("Y");
-  ASSERT_TRUE(s.IsBusy());
+  ASSERT_TRUE(s.IsLockLimit());
 
   s = txn2->Commit();
   ASSERT_OK(s);
@@ -3987,6 +3987,44 @@ TEST_P(TransactionTest, LockLimitTest) {
   delete txn2;
 }
 
+TEST_P(TransactionTest, LockLimitWithTimeoutHangTest) {
+  // Tests a bug where transaction can infinite-loop during lock acquiry.
+  // This happens when lock limit is reached and user specifies a positive
+  // timeout which is reached before the transaction start waiting for it.
+  WriteOptions write_options;
+  TransactionOptions txn_options;
+
+  txn_db_options.max_num_locks = 3;
+  txn_db_options.transaction_lock_timeout = 10;  // 10ms
+  ASSERT_OK(ReOpen());
+
+  Transaction* txn = db->BeginTransaction(write_options, txn_options);
+  ASSERT_TRUE(txn);
+
+  ASSERT_OK(txn->Put("X", "x"));
+  ASSERT_OK(txn->Put("Y", "y"));
+  ASSERT_OK(txn->Put("Z", "z"));
+
+  TransactionOptions txn2_options;
+  txn2_options.lock_timeout = 1;  // 1ms short timeout
+  Transaction* txn2 = db->BeginTransaction(write_options, txn2_options);
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "PointLockManager::AcquireWithTimeout:WaitingTxn", [&](void*) {
+        // Sleep for 2ms, so timeout is already passed for txn2 before waiting.
+        // txn2 should fail instead of waiting forever.
+        env->SleepForMicroseconds(2 * 1000);
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  // This lock attempt should fail and return
+  ASSERT_TRUE(txn2->Put("W", "w").IsLockLimit());
+  SyncPoint::GetInstance()->DisableProcessing();
+
+  delete txn;
+  delete txn2;
+}
+
 TEST_P(TransactionTest, IteratorTest) {
   // This test does writes without snapshot validation, and then tries to create
   // iterator later, which is unsupported in write unprepared.

From 8cb2bfa2335077e5709fd9b1d246a84d36bb85ff Mon Sep 17 00:00:00 2001
From: Changyu Bi <changyubi@meta.com>
Date: Mon, 12 May 2025 15:58:33 -0700
Subject: [PATCH 085/500] Fix race in accessing MANIFEST number in crash test
 (#13603)

Summary:
https://github.com/facebook/rocksdb/issues/13594 introduced the following data race. This PR attempts to fix it by acquiring DB mutex before accessing MANIFEST file number.
```
WARNING: ThreadSanitizer: data race (pid=9993)
  Write of size 8 at 0x7b60000014e8 by thread T50 (mutexes: write M143969571504678848):
    #0 rocksdb::ParseFileName(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, unsigned long*, rocksdb::Slice const&, rocksdb::FileType*, rocksdb::WalFileType*) file/filename.cc:326 (librocksdb.so.10.3+0xaa142f)
    https://github.com/facebook/rocksdb/issues/1 rocksdb::ParseFileName(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, unsigned long*, rocksdb::FileType*, rocksdb::WalFileType*) file/filename.cc:270 (librocksdb.so.10.3+0xaa1e91)
    https://github.com/facebook/rocksdb/issues/2 rocksdb::GetCurrentManifestPath(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, rocksdb::FileSystem*, bool, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >*, unsigned long*) db/manifest_ops.cc:35 (librocksdb.so.10.3+0x80bd3f)
    https://github.com/facebook/rocksdb/issues/3 rocksdb::ReactiveVersionSet::MaybeSwitchManifest(rocksdb::log::Reader::Reporter*, std::unique_ptr<rocksdb::log::FragmentBufferedReader, std::default_delete<rocksdb::log::FragmentBufferedReader> >*) db/version_set.cc:7553 (librocksdb.so.10.3+0x91ca45)
    https://github.com/facebook/rocksdb/issues/4 rocksdb::ReactiveVersionSet::ReadAndApply(rocksdb::InstrumentedMutex*, std::unique_ptr<rocksdb::log::FragmentBufferedReader, std::default_delete<rocksdb::log::FragmentBufferedReader> >*, rocksdb::Status*, std::unordered_set<rocksdb::ColumnFamilyData*, std::hash<rocksdb::ColumnFamilyData*>, std::equal_to<rocksdb::ColumnFamilyData*>, std::allocator<rocksdb::ColumnFamilyData*> >*, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > >*) db/version_set.cc:7531 (librocksdb.so.10.3+0x91de03)
    https://github.com/facebook/rocksdb/issues/5 rocksdb::DBImplSecondary::TryCatchUpWithPrimary() db/db_impl/db_impl_secondary.cc:709 (librocksdb.so.10.3+0x7006d5)
    https://github.com/facebook/rocksdb/issues/6 rocksdb::NonBatchedOpsStressTest::VerifyDb(rocksdb::ThreadState*) const db_stress_tool/no_batched_ops_stress.cc:235 (db_stress+0x48806b)
    https://github.com/facebook/rocksdb/issues/7 rocksdb::ThreadBody(void*) db_stress_tool/db_stress_driver.cc:23 (db_stress+0x4e5019)
    https://github.com/facebook/rocksdb/issues/8 StartThreadWrapper env/env_posix.cc:469 (librocksdb.so.10.3+0xa0977f)

  Previous read of size 8 at 0x7b60000014e8 by thread T44:
    #0 rocksdb::VersionSet::manifest_file_number() const db/version_set.h:1342 (librocksdb.so.10.3+0x69019b)
    https://github.com/facebook/rocksdb/issues/1 rocksdb::DBImpl::TEST_Current_Manifest_FileNo() db/db_impl/db_impl_debug.cc:87 (librocksdb.so.10.3+0x69019b)
    https://github.com/facebook/rocksdb/issues/2 rocksdb::NonBatchedOpsStressTest::VerifyDb(rocksdb::ThreadState*) const db_stress_tool/no_batched_ops_stress.cc:238 (db_stress+0x4880b6)
    https://github.com/facebook/rocksdb/issues/3 rocksdb::ThreadBody(void*) db_stress_tool/db_stress_driver.cc:23 (db_stress+0x4e5019)
    https://github.com/facebook/rocksdb/issues/4 StartThreadWrapper env/env_posix.cc:469 (librocksdb.so.10.3+0xa0977f)
```

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13603

Test Plan:
compile with TSAN, run `python3 ./tools/db_crashtest.py blackbox --test_secondary=1 --interval=10`
I could not reproduce it on main, but we can monitor if crash test fails with this race again.

Reviewed By: mszeszko-meta

Differential Revision: D74601810

Pulled By: cbi42

fbshipit-source-id: 46e13dcde9b0834053ed74c6f0937954dd36fea2
---
 db/db_impl/db_impl_debug.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/db/db_impl/db_impl_debug.cc b/db/db_impl/db_impl_debug.cc
index ee48b0798673..138527bb782e 100644
--- a/db/db_impl/db_impl_debug.cc
+++ b/db/db_impl/db_impl_debug.cc
@@ -84,6 +84,7 @@ void DBImpl::TEST_GetFilesMetaData(
 }
 
 uint64_t DBImpl::TEST_Current_Manifest_FileNo() {
+  InstrumentedMutexLock l(&mutex_);
   return versions_->manifest_file_number();
 }
 

From 35e1c6c402532bbdca851c067296dfced1826954 Mon Sep 17 00:00:00 2001
From: ran-openai <ran@openai.com>
Date: Tue, 13 May 2025 09:54:37 -0700
Subject: [PATCH 086/500] Add internal_merge_point_lookup_count perfstats to c
 interface (#13599)

Summary: Pull Request resolved: https://github.com/facebook/rocksdb/pull/13599

Reviewed By: virajthakur

Differential Revision: D74586452

Pulled By: cbi42

fbshipit-source-id: 58f31d96c040ae465afa1caba8cbb7434c72a366
---
 db/c.cc             | 2 ++
 include/rocksdb/c.h | 3 ++-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/db/c.cc b/db/c.cc
index 70681e188206..a6df96143a81 100644
--- a/db/c.cc
+++ b/db/c.cc
@@ -4358,6 +4358,8 @@ uint64_t rocksdb_perfcontext_metric(rocksdb_perfcontext_t* context,
       return rep->internal_recent_skipped_count;
     case rocksdb_internal_merge_count:
       return rep->internal_merge_count;
+    case rocksdb_internal_merge_point_lookup_count:
+      return rep->internal_merge_point_lookup_count;
     case rocksdb_get_snapshot_time:
       return rep->get_snapshot_time;
     case rocksdb_get_from_memtable_time:
diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h
index 83c15f1710b0..5564260abe51 100644
--- a/include/rocksdb/c.h
+++ b/include/rocksdb/c.h
@@ -1883,7 +1883,8 @@ enum {
   rocksdb_blob_decompress_time,
   rocksdb_internal_range_del_reseek_count,
   rocksdb_block_read_cpu_time,
-  rocksdb_total_metric_count = 79
+  rocksdb_internal_merge_point_lookup_count,
+  rocksdb_total_metric_count = 80
 };
 
 extern ROCKSDB_LIBRARY_API void rocksdb_set_perf_level(int);

From 9c4b94b9e7f6623170bb7d8e2c3e335c2bf582be Mon Sep 17 00:00:00 2001
From: Yu Zhang <yuzhangyu@fb.com>
Date: Tue, 13 May 2025 11:19:53 -0700
Subject: [PATCH 087/500] Remove flaky test for file ingestion wait time metric
 (#13605)

Summary:
As titled.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13605

Test Plan: This is removing a test

Reviewed By: mszeszko-meta

Differential Revision: D74660230

Pulled By: jowlyzhang

fbshipit-source-id: 9c1d46b56d2f9ee43eba645563d4f954645d1ace
---
 db/external_sst_file_basic_test.cc | 49 ------------------------------
 1 file changed, 49 deletions(-)

diff --git a/db/external_sst_file_basic_test.cc b/db/external_sst_file_basic_test.cc
index bab07ba4b835..a247e68128c5 100644
--- a/db/external_sst_file_basic_test.cc
+++ b/db/external_sst_file_basic_test.cc
@@ -260,55 +260,6 @@ TEST_F(ExternalSSTFileBasicTest, Basic) {
   s = sst_file_writer.DeleteRange(Key(100), Key(200));
   ASSERT_NOK(s) << s.ToString();
 
-  DestroyAndReopen(options);
-
-  SyncPoint::GetInstance()->LoadDependency({
-      {"DBImpl::IngestExternalFile:AfterIncIngestFileCounter",
-       "ExternalSSTFileBasicTest.LiveWriteStart"},
-      {"WriteThread::JoinBatchGroup:Wait",
-       "DBImpl::IngestExternalFile:AfterIncIngestFileCounter:2"},
-  });
-  SyncPoint::GetInstance()->EnableProcessing();
-  PerfContext* write_thread_perf_context;
-  std::thread write_thread([&] {
-    TEST_SYNC_POINT("ExternalSSTFileBasicTest.LiveWriteStart");
-    SetPerfLevel(kEnableWait);
-    write_thread_perf_context = get_perf_context();
-    write_thread_perf_context->Reset();
-    ASSERT_OK(db_->Put(WriteOptions(), "bar", "v2"));
-    ASSERT_GT(write_thread_perf_context->write_thread_wait_nanos, 0);
-    // Test sync points were used to make sure this live write enter write
-    // thread after the file ingestion entered write thread. So by the time this
-    // live write finishes, the latest seqno is 1 means file ingestion used
-    // seqno 0.
-    ASSERT_EQ(db_->GetLatestSequenceNumber(), 1U);
-  });
-
-  // Add file using file path
-  SetPerfLevel(kEnableTimeExceptForMutex);
-  PerfContext* perf_ctx = get_perf_context();
-  perf_ctx->Reset();
-  s = DeprecatedAddFile({file1});
-  ASSERT_GT(perf_context.file_ingestion_nanos, 0);
-  ASSERT_GT(perf_context.file_ingestion_blocking_live_writes_nanos, 0);
-  ASSERT_OK(s) << s.ToString();
-  for (int k = 0; k < 100; k++) {
-    ASSERT_EQ(Get(Key(k)), Key(k) + "_val");
-  }
-
-  write_thread.join();
-  SyncPoint::GetInstance()->DisableProcessing();
-
-  // Re-ingest the file just to check the perf context not enabled at and below
-  // kEnableWait.
-  SetPerfLevel(kEnableWait);
-  perf_ctx->Reset();
-  IngestExternalFileOptions opts;
-  opts.allow_global_seqno = true;
-  opts.allow_blocking_flush = true;
-  ASSERT_OK(db_->IngestExternalFile({file1}, opts));
-  ASSERT_EQ(perf_context.file_ingestion_nanos, 0);
-  ASSERT_EQ(perf_context.file_ingestion_blocking_live_writes_nanos, 0);
   DestroyAndRecreateExternalSSTFilesDir();
 }
 

From 2a0886b9a70c2842b057097aef5b9e2139a7edee Mon Sep 17 00:00:00 2001
From: Till Rohrmann <trohrmann@apache.org>
Date: Tue, 13 May 2025 14:06:28 -0700
Subject: [PATCH 088/500] Expose pinned WriteBatchWithIndex::GetFromBatchAndDB
 through C bindings (#12970)

Summary:
Expose pinned WriteBatchWithIndex::GetFromBatchAndDB through C bindings so that one can read data from the `WriteBatchWithIndex` and db w/o copying the data.

This fixes https://github.com/facebook/rocksdb/issues/12969.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/12970

Reviewed By: cbi42

Differential Revision: D74586418

Pulled By: jaykorean

fbshipit-source-id: a5a4d2e8ce3ddf4c2371fdfdb4e9c3309966a05d
---
 db/c.cc             | 35 +++++++++++++++++++++++++++++++++++
 db/c_test.c         | 18 ++++++++++++++++++
 include/rocksdb/c.h | 11 +++++++++++
 3 files changed, 64 insertions(+)

diff --git a/db/c.cc b/db/c.cc
index a6df96143a81..e96ee8479237 100644
--- a/db/c.cc
+++ b/db/c.cc
@@ -2722,6 +2722,23 @@ char* rocksdb_writebatch_wi_get_from_batch_and_db(
   return result;
 }
 
+rocksdb_pinnableslice_t* rocksdb_writebatch_wi_get_pinned_from_batch_and_db(
+    rocksdb_writebatch_wi_t* wbwi, rocksdb_t* db,
+    const rocksdb_readoptions_t* options, const char* key, size_t keylen,
+    char** errptr) {
+  rocksdb_pinnableslice_t* v = new (rocksdb_pinnableslice_t);
+  Status s = wbwi->rep->GetFromBatchAndDB(db->rep, options->rep,
+                                          Slice(key, keylen), &v->rep);
+  if (!s.ok()) {
+    delete (v);
+    if (!s.IsNotFound()) {
+      SaveError(errptr, s);
+    }
+    return nullptr;
+  }
+  return v;
+}
+
 char* rocksdb_writebatch_wi_get_from_batch_and_db_cf(
     rocksdb_writebatch_wi_t* wbwi, rocksdb_t* db,
     const rocksdb_readoptions_t* options,
@@ -2743,6 +2760,24 @@ char* rocksdb_writebatch_wi_get_from_batch_and_db_cf(
   return result;
 }
 
+rocksdb_pinnableslice_t* rocksdb_writebatch_wi_get_pinned_from_batch_and_db_cf(
+    rocksdb_writebatch_wi_t* wbwi, rocksdb_t* db,
+    const rocksdb_readoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* key,
+    size_t keylen, char** errptr) {
+  rocksdb_pinnableslice_t* v = new (rocksdb_pinnableslice_t);
+  Status s = wbwi->rep->GetFromBatchAndDB(
+      db->rep, options->rep, column_family->rep, Slice(key, keylen), &v->rep);
+  if (!s.ok()) {
+    delete (v);
+    if (!s.IsNotFound()) {
+      SaveError(errptr, s);
+    }
+    return nullptr;
+  }
+  return v;
+}
+
 void rocksdb_write_writebatch_wi(rocksdb_t* db,
                                  const rocksdb_writeoptions_t* options,
                                  rocksdb_writebatch_wi_t* wbwi, char** errptr) {
diff --git a/db/c_test.c b/db/c_test.c
index b6574cd8ecae..2d2f34bad86c 100644
--- a/db/c_test.c
+++ b/db/c_test.c
@@ -103,6 +103,12 @@ static void CheckValue(char* err, const char* expected, char** actual,
   Free(actual);
 }
 
+static void CheckPinnedValue(char* err, const char* expected,
+                             const char** actual, size_t actual_length) {
+  CheckNoError(err);
+  CheckEqual(expected, *actual, actual_length);
+}
+
 static void CheckGet(rocksdb_t* db, const rocksdb_readoptions_t* options,
                      const char* key, const char* expected) {
   char* err = NULL;
@@ -1245,6 +1251,8 @@ int main(int argc, char** argv) {
     CheckCondition(count == 3);
     size_t size;
     char* value;
+    const char* pinned_value;
+    rocksdb_pinnableslice_t* p;
     value = rocksdb_writebatch_wi_get_from_batch(wbi, options, "box", 3, &size,
                                                  &err);
     CheckValue(err, "c", &value, size);
@@ -1254,9 +1262,19 @@ int main(int argc, char** argv) {
     value = rocksdb_writebatch_wi_get_from_batch_and_db(wbi, db, roptions,
                                                         "foo", 3, &size, &err);
     CheckValue(err, "hello", &value, size);
+    p = rocksdb_writebatch_wi_get_pinned_from_batch_and_db(wbi, db, roptions,
+                                                           "foo", 3, &err);
+    pinned_value = rocksdb_pinnableslice_value(p, &size);
+    CheckPinnedValue(err, "hello", &pinned_value, size);
+    rocksdb_pinnableslice_destroy(p);
     value = rocksdb_writebatch_wi_get_from_batch_and_db(wbi, db, roptions,
                                                         "box", 3, &size, &err);
     CheckValue(err, "c", &value, size);
+    p = rocksdb_writebatch_wi_get_pinned_from_batch_and_db(wbi, db, roptions,
+                                                           "box", 3, &err);
+    pinned_value = rocksdb_pinnableslice_value(p, &size);
+    CheckPinnedValue(err, "c", &pinned_value, size);
+    rocksdb_pinnableslice_destroy(p);
     rocksdb_write_writebatch_wi(db, woptions, wbi, &err);
     CheckNoError(err);
     CheckGet(db, roptions, "foo", "hello");
diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h
index 5564260abe51..52a7593a262b 100644
--- a/include/rocksdb/c.h
+++ b/include/rocksdb/c.h
@@ -987,11 +987,22 @@ extern ROCKSDB_LIBRARY_API char* rocksdb_writebatch_wi_get_from_batch_and_db(
     rocksdb_writebatch_wi_t* wbwi, rocksdb_t* db,
     const rocksdb_readoptions_t* options, const char* key, size_t keylen,
     size_t* vallen, char** errptr);
+extern ROCKSDB_LIBRARY_API rocksdb_pinnableslice_t*
+rocksdb_writebatch_wi_get_pinned_from_batch_and_db(
+    rocksdb_writebatch_wi_t* wbwi, rocksdb_t* db,
+    const rocksdb_readoptions_t* options, const char* key, size_t keylen,
+    char** errptr);
 extern ROCKSDB_LIBRARY_API char* rocksdb_writebatch_wi_get_from_batch_and_db_cf(
     rocksdb_writebatch_wi_t* wbwi, rocksdb_t* db,
     const rocksdb_readoptions_t* options,
     rocksdb_column_family_handle_t* column_family, const char* key,
     size_t keylen, size_t* vallen, char** errptr);
+extern ROCKSDB_LIBRARY_API rocksdb_pinnableslice_t*
+rocksdb_writebatch_wi_get_pinned_from_batch_and_db_cf(
+    rocksdb_writebatch_wi_t* wbwi, rocksdb_t* db,
+    const rocksdb_readoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* key,
+    size_t keylen, char** errptr);
 extern ROCKSDB_LIBRARY_API void rocksdb_write_writebatch_wi(
     rocksdb_t* db, const rocksdb_writeoptions_t* options,
     rocksdb_writebatch_wi_t* wbwi, char** errptr);

From df7a3a7168177400dcec7f1188dab389d4b99f16 Mon Sep 17 00:00:00 2001
From: anand76 <anand1976@users.noreply.github.com>
Date: Tue, 13 May 2025 14:41:28 -0700
Subject: [PATCH 089/500] Add debug printfs in secondary cache adapter
 destructor (#13606)

Summary:
Add debug printfs to troubleshoot an intermittent crash test assertion failure.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13606

Reviewed By: mszeszko-meta

Differential Revision: D74661545

Pulled By: anand1976

fbshipit-source-id: 1b2a30fbbea3dcea5ce1a199344e946da687ff1f
---
 cache/secondary_cache_adapter.cc | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/cache/secondary_cache_adapter.cc b/cache/secondary_cache_adapter.cc
index 57a77bc7fcb0..e1b41fb54d4a 100644
--- a/cache/secondary_cache_adapter.cc
+++ b/cache/secondary_cache_adapter.cc
@@ -121,7 +121,17 @@ CacheWithSecondaryAdapter::~CacheWithSecondaryAdapter() {
     assert(s.ok());
     assert(placeholder_usage_ == 0);
     assert(reserved_usage_ == 0);
-    assert(pri_cache_res_->GetTotalMemoryUsed() == sec_capacity);
+    bool pri_cache_res_mismatch =
+        pri_cache_res_->GetTotalMemoryUsed() != sec_capacity;
+    if (pri_cache_res_mismatch) {
+      fprintf(stderr,
+              "~CacheWithSecondaryAdapter: Primary cache reservation: "
+              "%zu, Secondary cache capacity: %zu, "
+              "Secondary cache reserved: %zu\n",
+              pri_cache_res_->GetTotalMemoryUsed(), sec_capacity,
+              sec_reserved_);
+      assert(pri_cache_res_mismatch);
+    }
   }
 #endif  // NDEBUG
 }

From fc2cf7ead2c529fd9070723ffeddb6462db28e22 Mon Sep 17 00:00:00 2001
From: Miroslav Kovar <miroslavkovar@protonmail.com>
Date: Wed, 14 May 2025 13:19:06 -0700
Subject: [PATCH 090/500] Expose optimized `TransactionBaseImpl::MultiGet`
 through JNI (#13589)

Summary:
Addresses https://github.com/facebook/rocksdb/issues/13587.

This PR exposes the optimized implementation of batched reads through a `Transaction` object to Java clients.

The latency improvement of transactional multiget on production workload achieved by switching the implementation is roughly:
```
quantile=0.2: 21%
quantile=0.5: 28%
quantile=0.8: 46%
quantile=1.0: 239%
```

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13589

Reviewed By: jaykorean

Differential Revision: D74660169

Pulled By: cbi42

fbshipit-source-id: d01780173e0500c96e5e431ff6645008cbf6e8b5
---
 java/rocksjni/transaction.cc                  | 30 +++++++++++++
 .../main/java/org/rocksdb/Transaction.java    | 42 +++++++++++++++++++
 .../java/org/rocksdb/TransactionTest.java     | 26 ++++++++++++
 3 files changed, 98 insertions(+)

diff --git a/java/rocksjni/transaction.cc b/java/rocksjni/transaction.cc
index e211ebe5d6dd..f457ef331c54 100644
--- a/java/rocksjni/transaction.cc
+++ b/java/rocksjni/transaction.cc
@@ -341,6 +341,36 @@ jobjectArray Java_org_rocksdb_Transaction_multiGet__JJ_3_3B(
                                                           statuses);
 }
 
+/*
+ * Class:     org_rocksdb_Transaction
+ * Method:    multiGet
+ * Signature: (JJJ[[B)[[B
+ */
+jobjectArray Java_org_rocksdb_Transaction_multiGet__JJJ_3_3B(
+    JNIEnv* env, jclass /*jobj*/, jlong jhandle, jlong jread_options_handle,
+    jlong jcf_handle, jobjectArray jkeys) {
+  ROCKSDB_NAMESPACE::MultiGetJNIKeys keys;
+  if (!keys.fromByteArrays(env, jkeys)) {
+    return nullptr;
+  }
+
+  auto* txn = reinterpret_cast<ROCKSDB_NAMESPACE::Transaction*>(jhandle);
+  auto* cf_handle =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jcf_handle);
+
+  size_t num_keys = keys.size();
+  std::vector<ROCKSDB_NAMESPACE::PinnableSlice> values(num_keys);
+  std::vector<ROCKSDB_NAMESPACE::Status> statuses(num_keys);
+
+  txn->MultiGet(
+      *reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jread_options_handle),
+      cf_handle, num_keys, keys.slices().data(), values.data(), statuses.data(),
+      /*sorted_input=*/false);
+
+  return ROCKSDB_NAMESPACE::MultiGetJNIValues::byteArrays<
+      ROCKSDB_NAMESPACE::PinnableSlice>(env, values, statuses);
+}
+
 /*
  * Class:     org_rocksdb_Transaction
  * Method:    getForUpdate
diff --git a/java/src/main/java/org/rocksdb/Transaction.java b/java/src/main/java/org/rocksdb/Transaction.java
index 12e4082c11b0..ee8656460835 100644
--- a/java/src/main/java/org/rocksdb/Transaction.java
+++ b/java/src/main/java/org/rocksdb/Transaction.java
@@ -661,6 +661,46 @@ public List<byte[]> multiGetAsList(final ReadOptions readOptions, final List<byt
     return Arrays.asList(multiGet(nativeHandle_, readOptions.nativeHandle_, keysArray));
   }
 
+  /**
+   * This function is similar to
+   * {@link RocksDB#multiGetAsList} except it will
+   * also read pending changes in this transaction.
+   * Currently, this function will return Status::MergeInProgress if the most
+   * recent write to the queried key in this batch is a Merge.
+   * <p>
+   * If {@link ReadOptions#snapshot()} is not set, the current version of the
+   * key will be read. Calling {@link #setSnapshot()} does not affect the
+   * version of the data returned.
+   * <p>
+   * Note that setting {@link ReadOptions#setSnapshot(Snapshot)} will affect
+   * what is read from the DB but will NOT change which keys are read from this
+   * transaction (the keys in this transaction do not yet belong to any snapshot
+   * and will be fetched regardless).
+   * <p>
+   * This method uses the optimized path with support for batched reads.
+   *
+   * @param readOptions Read options.=
+   *     {@link org.rocksdb.ColumnFamilyHandle} instances.
+   * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle}
+   *     instance
+   * @param keys of keys for which values need to be retrieved.
+   *
+   * @return Array of values, one for each key
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public List<byte[]> multiGetAsList(final ReadOptions readOptions,
+      final ColumnFamilyHandle columnFamilyHandle, final List<byte[]> keys)
+      throws RocksDBException {
+    if (keys.isEmpty()) {
+      return new ArrayList<>(0);
+    }
+    final byte[][] keysArray = keys.toArray(new byte[keys.size()][]);
+    return Arrays.asList(multiGet(
+        nativeHandle_, readOptions.nativeHandle_, columnFamilyHandle.nativeHandle_, keysArray));
+  }
+
   /**
    * Read this key and ensure that this transaction will only
    * be able to be committed if this key is not written outside this
@@ -2877,6 +2917,8 @@ private static native byte[][] multiGet(final long handle, final long readOption
       final byte[][] keys, final long[] columnFamilyHandles) throws RocksDBException;
   private static native byte[][] multiGet(
       final long handle, final long readOptionsHandle, final byte[][] keys) throws RocksDBException;
+  private static native byte[][] multiGet(final long nativeHandle, final long readOptionsHandle,
+      final long cfHandle, final byte[][] keys) throws RocksDBException;
   private static native byte[] getForUpdate(final long handle, final long readOptionsHandle,
       final byte[] key, final int keyOffset, final int keyLength, final long columnFamilyHandle,
       final boolean exclusive, final boolean doValidate) throws RocksDBException;
diff --git a/java/src/test/java/org/rocksdb/TransactionTest.java b/java/src/test/java/org/rocksdb/TransactionTest.java
index 03a6b4ff6b3f..9adc26d97018 100644
--- a/java/src/test/java/org/rocksdb/TransactionTest.java
+++ b/java/src/test/java/org/rocksdb/TransactionTest.java
@@ -345,6 +345,32 @@ public void multiGetAsListForUpdate_conflict() throws RocksDBException {
     }
   }
 
+  @Test
+  public void multiGetAsList() throws RocksDBException {
+    final byte[] k1 = "k1".getBytes(UTF_8);
+    final byte[] k2 = "k2".getBytes(UTF_8);
+    final byte[] k3 = "k3".getBytes(UTF_8);
+    final byte[] v1 = "v1".getBytes(UTF_8);
+    final byte[] v2 = "v2".getBytes(UTF_8);
+
+    try (final DBContainer dbContainer = startDb();
+         final ReadOptions readOptions = new ReadOptions()) {
+      final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily();
+
+      try (final Transaction txn = dbContainer.beginTransaction()) {
+        txn.put(testCf, k1, v1);
+        txn.put(testCf, k2, v2);
+        txn.commit();
+      }
+
+      try (final Transaction txn = dbContainer.beginTransaction()) {
+        final List<byte[]> result =
+            txn.multiGetAsList(readOptions, testCf, Arrays.asList(k1, k2, k3));
+        assertThat(result).containsExactly(v1, v2, null);
+      }
+    }
+  }
+
   @Test
   public void name() throws RocksDBException {
     try(final DBContainer dbContainer = startDb();

From 7c9b5806818d206d14d7054433ac2abd0b138daa Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Thu, 15 May 2025 17:14:23 -0700
Subject: [PATCH 091/500] Big refactor for preliminary custom compression API
 (#13540)

Summary:
Adds new classes etc. in internal compression.h that are intended to become public APIs for supporting custom/pluggable compression. Some steps remain to allow for pluggable compression and to remove a lot of legacy code (e.g. now called `OLD_CompressData` and `OLD_UncompressData`), but this change refactors the key integration points of SST building and reading and compressed secondary cache over to the new APIs.

Compared with the proposed https://github.com/facebook/rocksdb/issues/7650, this fixes a number of issues including
* Making a clean divide between public and internal APIs (currently just indicated with comments)
* Enough generality that built-in compressions generally fit into the framework rather than needing special treatment
* Avoid exposing obnoxious idioms like `compress_format_version` to the user.
* Enough generality that a compressor mixing algorithms/strategies from other compressors is pretty well supported without an extra schema layer
* Explicit thread-safety contracts (carefully considered)
* Contract details around schema compatibility and extension with code changes (more detail in next PR)
* Customizable "working areas" (e.g. for ZSTD "context")
* Decompression into an arbitrary memory location (rather than involving the decompressor in memory allocation; should facilitate reducing number of objects in block cache)

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13540

Test Plan:
This is currently an internal refactor. More testing will come when the new API is migrated to the public API. A test in db_block_cache_test is updated to meaningfully cover a case (cache warming compression dictionary block) that was previously only covered in the crash test.

SST write performance test, like https://github.com/facebook/rocksdb/issues/13583. Compile with CLANG, run before & after simultaneously:

```
SUFFIX=`tty | sed 's|/|_|g'`; for ARGS in "-compression_parallel_threads=1 -compression_type=none" "-compression_parallel_threads=1 -compression_type=snappy" "-compression_parallel_threads=1 -compression_type=zstd" "-compression_parallel_threads=1 -compression_type=zstd -verify_compression=1" "-compression_parallel_threads=1 -compression_type=zstd -compression_max_dict_bytes=8180" "-compression_parallel_threads=4 -compression_type=snappy"; do echo $ARGS; (for I in `seq 1 20`; do ./db_bench -db=/dev/shm/dbbench$SUFFIX --benchmarks=fillseq -num=10000000 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=1000 -fifo_compaction_allow_compaction=0 -disable_wal -write_buffer_size=12000000 $ARGS 2>&1 | grep micros/op; done) | awk '{n++; sum += $5;} END { print int(sum / n); }'; done
```

Before (this PR and with https://github.com/facebook/rocksdb/issues/13583 reverted):
-compression_parallel_threads=1 -compression_type=none
1908372
-compression_parallel_threads=1 -compression_type=snappy
1926093
-compression_parallel_threads=1 -compression_type=zstd
1208259
-compression_parallel_threads=1 -compression_type=zstd -verify_compression=1
997583
-compression_parallel_threads=1 -compression_type=zstd -compression_max_dict_bytes=8180
934246
-compression_parallel_threads=4 -compression_type=snappy
1644849

After:
-compression_parallel_threads=1 -compression_type=none
1956054 (+2.5%)
-compression_parallel_threads=1 -compression_type=snappy
1911433 (-0.8%)
-compression_parallel_threads=1 -compression_type=zstd
1205668 (-0.3%)
-compression_parallel_threads=1 -compression_type=zstd -verify_compression=1
999263 (+0.2%)
-compression_parallel_threads=1 -compression_type=zstd -compression_max_dict_bytes=8180
934322 (+0.0%)
-compression_parallel_threads=4 -compression_type=snappy
1642519 (-0.2%)

Pretty neutral change(s) overall.

SST read performance test (related to https://github.com/facebook/rocksdb/issues/13583). Set up:
```
for COMP in none snappy zstd; do echo $ARGS; ./db_bench -db=/dev/shm/dbbench-$COMP --benchmarks=fillseq,flush -num=10000000 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=1000 -fifo_compaction_allow_compaction=0 -disable_wal -write_buffer_size=12000000 -compression_type=$COMP; done
```
Test (compile with CLANG, run before & after simultaneously):
```
for COMP in none snappy zstd; do echo $COMP; (for I in `seq 1 5`; do ./db_bench -readonly -db=/dev/shm/dbbench-$COMP --benchmarks=readrandom -num=10000000 -duration=20 -threads=8 2>&1 | grep micros/op; done) | awk '{n++; sum += $5;} END { print int(sum / n); }'; done
```

Before (this PR and with https://github.com/facebook/rocksdb/issues/13583 reverted):
none
1495646
snappy
1172443
zstd
706036
zstd (after constructing with -compression_max_dict_bytes=8180)
656182

After:
none
1494981 (-0.0%)
snappy
1171846 (-0.1%)
zstd
696363 (-1.4%)
zstd (after constructing with -compression_max_dict_bytes=8180)
667585 (+1.7%)

Pretty neutral.

Reviewed By: hx235

Differential Revision: D74626863

Pulled By: pdillinger

fbshipit-source-id: dc8ff3178da9b4eaa7c16aa1bb910c872afaf14a
---
 cache/compressed_secondary_cache.cc           |  67 +-
 cache/compressed_secondary_cache.h            |   2 +
 db/blob/blob_file_builder.cc                  |   4 +-
 db/blob/blob_file_reader.cc                   |   6 +-
 db/blob/blob_file_reader_test.cc              |   4 +-
 db/blob/blob_source_test.cc                   |   4 +-
 db/db_block_cache_test.cc                     |   5 +
 db/db_test2.cc                                |   6 +-
 include/rocksdb/compression_type.h            |   2 +
 port/win/xpress_win.cc                        |  42 +-
 port/win/xpress_win.h                         |   4 +
 .../block_based/block_based_table_builder.cc  | 575 +++++++------
 table/block_based/block_based_table_builder.h |  15 +-
 table/block_based/block_based_table_reader.cc | 152 ++--
 table/block_based/block_based_table_reader.h  |  19 +-
 .../block_based_table_reader_impl.h           |  47 +-
 .../block_based_table_reader_sync_and_async.h |  64 +-
 .../block_based_table_reader_test.cc          |   2 +
 table/block_based/block_cache.cc              |  10 +-
 table/block_based/block_cache.h               |  22 +-
 table/block_based/block_test.cc               |  78 +-
 .../block_based/filter_block_reader_common.cc |   3 +-
 table/block_based/hash_index_reader.cc        |   6 +-
 table/block_based/index_reader_common.cc      |   6 +-
 table/block_based/partitioned_filter_block.cc |   5 +-
 table/block_based/partitioned_index_reader.cc |   2 +-
 .../block_based/uncompression_dict_reader.cc  |   9 +-
 table/block_based/uncompression_dict_reader.h |   8 +-
 table/block_fetcher.cc                        |  55 +-
 table/block_fetcher.h                         |  17 +-
 table/block_fetcher_test.cc                   |   6 +-
 table/format.cc                               |  95 ++-
 table/format.h                                |  31 +-
 table/meta_blocks.cc                          |   8 +-
 table/table_test.cc                           |  22 +-
 tools/db_bench_tool.cc                        |  10 +-
 tools/ldb_cmd.cc                              |   3 +-
 util/cast_util.h                              |  47 ++
 util/compression.cc                           | 782 ++++++++++++++++++
 util/compression.h                            | 621 +++++++++++++-
 utilities/blob_db/blob_db_impl.cc             |  23 +-
 utilities/blob_db/blob_db_impl.h              |   2 +
 utilities/blob_db/blob_dump_tool.cc           |   7 +-
 43 files changed, 2227 insertions(+), 671 deletions(-)

diff --git a/cache/compressed_secondary_cache.cc b/cache/compressed_secondary_cache.cc
index d912c58b0317..70c8ef936891 100644
--- a/cache/compressed_secondary_cache.cc
+++ b/cache/compressed_secondary_cache.cc
@@ -24,7 +24,14 @@ CompressedSecondaryCache::CompressedSecondaryCache(
       cache_res_mgr_(std::make_shared<ConcurrentCacheReservationManager>(
           std::make_shared<CacheReservationManagerImpl<CacheEntryRole::kMisc>>(
               cache_))),
-      disable_cache_(opts.capacity == 0) {}
+      disable_cache_(opts.capacity == 0) {
+  auto mgr =
+      GetBuiltinCompressionManager(cache_options_.compress_format_version);
+  compressor_ = mgr->GetCompressor(cache_options_.compression_opts,
+                                   cache_options_.compression_type);
+  decompressor_ =
+      mgr->GetDecompressorOptimizeFor(cache_options_.compression_type);
+}
 
 CompressedSecondaryCache::~CompressedSecondaryCache() = default;
 
@@ -97,25 +104,24 @@ std::unique_ptr<SecondaryCacheResultHandle> CompressedSecondaryCache::Lookup(
                             kNoCompression, CacheTier::kVolatileTier,
                             create_context, allocator, &value, &charge);
     } else {
-      UncompressionContext uncompression_context(
-          cache_options_.compression_type);
-      UncompressionInfo uncompression_info(uncompression_context,
-                                           UncompressionDict::GetEmptyDict(),
-                                           cache_options_.compression_type);
-
-      size_t uncompressed_size{0};
-      CacheAllocationPtr uncompressed =
-          UncompressData(uncompression_info, (char*)data_ptr,
-                         handle_value_charge, &uncompressed_size,
-                         cache_options_.compress_format_version, allocator);
-
-      if (!uncompressed) {
-        cache_->Release(lru_handle, /*erase_if_last_ref=*/true);
-        return nullptr;
+      // TODO: can we work some magic with create_cb, which might be based on
+      // custom compression, to decompress without an extra copy in create_cb?
+      Decompressor::Args args;
+      args.compressed_data = Slice(data_ptr, handle_value_charge);
+      args.compression_type = cache_options_.compression_type;
+      s = decompressor_->ExtractUncompressedSize(args);
+      assert(s.ok());
+      if (s.ok()) {
+        auto uncompressed = std::make_unique<char[]>(args.uncompressed_size);
+        s = decompressor_->DecompressBlock(args, uncompressed.get());
+        assert(s.ok());
+        if (s.ok()) {
+          s = helper->create_cb(
+              Slice(uncompressed.get(), args.uncompressed_size), kNoCompression,
+              CacheTier::kVolatileTier, create_context, allocator, &value,
+              &charge);
+        }
       }
-      s = helper->create_cb(Slice(uncompressed.get(), uncompressed_size),
-                            kNoCompression, CacheTier::kVolatileTier,
-                            create_context, allocator, &value, &charge);
     }
   } else {
     // The item was not compressed by us. Let the helper create_cb
@@ -198,18 +204,17 @@ Status CompressedSecondaryCache::InsertInternal(
       type == kNoCompression &&
       !cache_options_.do_not_compress_roles.Contains(helper->role)) {
     PERF_COUNTER_ADD(compressed_sec_cache_uncompressed_bytes, data_size);
-    CompressionContext compression_context(cache_options_.compression_type,
-                                           cache_options_.compression_opts);
-    CompressionInfo compression_info(
-        cache_options_.compression_opts, compression_context,
-        CompressionDict::GetEmptyDict(), cache_options_.compression_type);
-
-    bool success =
-        CompressData(val, compression_info,
-                     cache_options_.compress_format_version, &compressed_val);
-
-    if (!success) {
-      return Status::Corruption("Error compressing value.");
+
+    CompressionType to_type = kNoCompression;
+    s = compressor_->CompressBlock(val, &compressed_val, &to_type,
+                                   nullptr /*working_area*/);
+    if (!s.ok()) {
+      return s;
+    }
+    // TODO: allow values not compressed when there's no size savings?
+    assert(to_type == cache_options_.compression_type);
+    if (to_type != cache_options_.compression_type) {
+      return Status::Corruption("Failed to compress value.");
     }
 
     val = Slice(compressed_val);
diff --git a/cache/compressed_secondary_cache.h b/cache/compressed_secondary_cache.h
index 45eab656e44f..f66d9a0ffe78 100644
--- a/cache/compressed_secondary_cache.h
+++ b/cache/compressed_secondary_cache.h
@@ -145,6 +145,8 @@ class CompressedSecondaryCache : public SecondaryCache {
   const Cache::CacheItemHelper* GetHelper(bool enable_custom_split_merge) const;
   std::shared_ptr<Cache> cache_;
   CompressedSecondaryCacheOptions cache_options_;
+  std::unique_ptr<Compressor> compressor_;
+  std::shared_ptr<Decompressor> decompressor_;
   mutable port::Mutex capacity_mutex_;
   std::shared_ptr<ConcurrentCacheReservationManager> cache_res_mgr_;
   bool disable_cache_;
diff --git a/db/blob/blob_file_builder.cc b/db/blob/blob_file_builder.cc
index 919d7c60ed6d..3a32269d8eb0 100644
--- a/db/blob/blob_file_builder.cc
+++ b/db/blob/blob_file_builder.cc
@@ -278,8 +278,8 @@ Status BlobFileBuilder::CompressBlobIfNeeded(
   {
     StopWatch stop_watch(immutable_options_->clock, immutable_options_->stats,
                          BLOB_DB_COMPRESSION_MICROS);
-    success =
-        CompressData(*blob, info, compression_format_version, compressed_blob);
+    success = OLD_CompressData(*blob, info, compression_format_version,
+                               compressed_blob);
   }
 
   if (!success) {
diff --git a/db/blob/blob_file_reader.cc b/db/blob/blob_file_reader.cc
index 0c30efbc119f..0d05b5e57140 100644
--- a/db/blob/blob_file_reader.cc
+++ b/db/blob/blob_file_reader.cc
@@ -602,9 +602,9 @@ Status BlobFileReader::UncompressBlobIfNeeded(
   {
     PERF_TIMER_GUARD(blob_decompress_time);
     StopWatch stop_watch(clock, statistics, BLOB_DB_DECOMPRESSION_MICROS);
-    output = UncompressData(info, value_slice.data(), value_slice.size(),
-                            &uncompressed_size, compression_format_version,
-                            allocator);
+    output = OLD_UncompressData(info, value_slice.data(), value_slice.size(),
+                                &uncompressed_size, compression_format_version,
+                                allocator);
   }
 
   TEST_SYNC_POINT_CALLBACK(
diff --git a/db/blob/blob_file_reader_test.cc b/db/blob/blob_file_reader_test.cc
index 9881dc362750..6297dd461c80 100644
--- a/db/blob/blob_file_reader_test.cc
+++ b/db/blob/blob_file_reader_test.cc
@@ -81,8 +81,8 @@ void WriteBlobFile(const ImmutableOptions& immutable_options,
     constexpr uint32_t compression_format_version = 2;
 
     for (size_t i = 0; i < num; ++i) {
-      ASSERT_TRUE(CompressData(blobs[i], info, compression_format_version,
-                               &compressed_blobs[i]));
+      ASSERT_TRUE(OLD_CompressData(blobs[i], info, compression_format_version,
+                                   &compressed_blobs[i]));
       blobs_to_write[i] = compressed_blobs[i];
       blob_sizes[i] = compressed_blobs[i].size();
     }
diff --git a/db/blob/blob_source_test.cc b/db/blob/blob_source_test.cc
index 8a021969e4fe..01c61ac5e6d3 100644
--- a/db/blob/blob_source_test.cc
+++ b/db/blob/blob_source_test.cc
@@ -83,8 +83,8 @@ void WriteBlobFile(const ImmutableOptions& immutable_options,
     constexpr uint32_t compression_format_version = 2;
 
     for (size_t i = 0; i < num; ++i) {
-      ASSERT_TRUE(CompressData(blobs[i], info, compression_format_version,
-                               &compressed_blobs[i]));
+      ASSERT_TRUE(OLD_CompressData(blobs[i], info, compression_format_version,
+                                   &compressed_blobs[i]));
       blobs_to_write[i] = compressed_blobs[i];
       blob_sizes[i] = compressed_blobs[i].size();
     }
diff --git a/db/db_block_cache_test.cc b/db/db_block_cache_test.cc
index cafb3710092d..1810ef8eb6fa 100644
--- a/db/db_block_cache_test.cc
+++ b/db/db_block_cache_test.cc
@@ -506,6 +506,8 @@ TEST_P(DBBlockCacheTest1, WarmCacheWithBlocksDuringFlush) {
   table_options.prepopulate_block_cache =
       BlockBasedTableOptions::PrepopulateBlockCache::kFlushOnly;
   options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  // Include a compression dictionary block
+  options.compression_opts.max_dict_bytes = 123;
   DestroyAndReopen(options);
 
   std::string value(kValueSize, 'a');
@@ -537,6 +539,9 @@ TEST_P(DBBlockCacheTest1, WarmCacheWithBlocksDuringFlush) {
                 options.statistics->getTickerCount(BLOCK_CACHE_FILTER_HIT));
     }
     ASSERT_EQ(0, options.statistics->getTickerCount(BLOCK_CACHE_FILTER_MISS));
+
+    // Including compression dict
+    ASSERT_EQ(0, options.statistics->getTickerCount(BLOCK_CACHE_MISS));
   }
 
   // Verify compaction not counted
diff --git a/db/db_test2.cc b/db/db_test2.cc
index 6c4f6243719d..644adb624216 100644
--- a/db/db_test2.cc
+++ b/db/db_test2.cc
@@ -1722,16 +1722,14 @@ TEST_P(CompressionFailuresTest, CompressionFailures) {
         });
   } else if (compression_failure_type_ == kTestDecompressionFail) {
     ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
-        "UncompressBlockData:TamperWithReturnValue", [](void* arg) {
+        "DecompressBlockData:TamperWithReturnValue", [](void* arg) {
           Status* ret = static_cast<Status*>(arg);
           ASSERT_OK(*ret);
           *ret = Status::Corruption("kTestDecompressionFail");
         });
   } else if (compression_failure_type_ == kTestDecompressionCorruption) {
     ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
-        "UncompressBlockData:"
-        "TamperWithDecompressionOutput",
-        [](void* arg) {
+        "DecompressBlockData:TamperWithDecompressionOutput", [](void* arg) {
           BlockContents* contents = static_cast<BlockContents*>(arg);
           // Ensure uncompressed data != original data
           const size_t len = contents->data.size() + 1;
diff --git a/include/rocksdb/compression_type.h b/include/rocksdb/compression_type.h
index 2ca742aa3853..d7ef0b7aa1be 100644
--- a/include/rocksdb/compression_type.h
+++ b/include/rocksdb/compression_type.h
@@ -27,6 +27,8 @@ enum CompressionType : unsigned char {
   kXpressCompression = 0x6,
   kZSTD = 0x7,
 
+  // TODO: add enum values for user custom compression types
+
   // kDisableCompressionOption is used to disable some compression options.
   kDisableCompressionOption = 0xff,
 };
diff --git a/port/win/xpress_win.cc b/port/win/xpress_win.cc
index 21904d502674..7e0454f3ec69 100644
--- a/port/win/xpress_win.cc
+++ b/port/win/xpress_win.cc
@@ -151,7 +151,7 @@ char* Decompress(const char* input_data, size_t input_length,
     return nullptr;
   }
 
-  std::unique_ptr<void, decltype(CloseDecompressorFun)> compressorGuard(
+  std::unique_ptr<void, decltype(CloseDecompressorFun)> decompressorGuard(
       decompressor, CloseDecompressorFun);
 
   SIZE_T decompressedBufferSize = 0;
@@ -201,6 +201,46 @@ char* Decompress(const char* input_data, size_t input_length,
   // Return the raw buffer to the caller supporting the tradition
   return outputBuffer.release();
 }
+
+int64_t DecompressToBuffer(const char* input, size_t input_length, char* output,
+                           size_t output_length) {
+  assert(input != nullptr);
+  assert(output != nullptr);
+
+  DECOMPRESSOR_HANDLE decompressor = NULL;
+
+  BOOL success =
+      CreateDecompressor(COMPRESS_ALGORITHM_XPRESS,  //  Compression Algorithm
+                         allocRoutinesPtr,  //  Optional allocation routine
+                         &decompressor);    //  Handle
+
+  if (!success) {
+#ifdef _DEBUG
+    std::cerr << "XPRESS: Failed to create Decompressor LastError "
+              << GetLastError() << std::endl;
+#endif
+    return -1;
+  }
+
+  std::unique_ptr<void, decltype(CloseDecompressorFun)> decompressorGuard(
+      decompressor, CloseDecompressorFun);
+
+  SIZE_T decompressedDataSize = 0;
+
+  success = ::Decompress(decompressor, const_cast<char*>(input), input_length,
+                         output, output_length, &decompressedDataSize);
+
+  if (!success) {
+#ifdef _DEBUG
+    std::cerr << "XPRESS: Failed to decompress LastError " << GetLastError()
+              << std::endl;
+#endif
+    return -1;
+  }
+
+  return static_cast<int64_t>(decompressedDataSize);
+}
+
 }  // namespace xpress
 }  // namespace port
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/port/win/xpress_win.h b/port/win/xpress_win.h
index 187adffa658a..ab4be3a6f0df 100644
--- a/port/win/xpress_win.h
+++ b/port/win/xpress_win.h
@@ -21,6 +21,10 @@ bool Compress(const char* input, size_t length, std::string* output);
 
 char* Decompress(const char* input_data, size_t input_length,
                  size_t* uncompressed_size);
+
+int64_t DecompressToBuffer(const char* input, size_t input_length, char* output,
+                           size_t output_length);
+
 }  // namespace xpress
 }  // namespace port
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc
index 96819b7a7061..acf0a7e073e5 100644
--- a/table/block_based/block_based_table_builder.cc
+++ b/table/block_based/block_based_table_builder.cc
@@ -107,45 +107,6 @@ FilterBlockBuilder* CreateFilterBlockBuilder(
   }
 }
 
-bool GoodCompressionRatio(size_t compressed_size, size_t uncomp_size,
-                          int max_compressed_bytes_per_kb) {
-  // For efficiency, avoid floating point and division
-  return compressed_size <=
-         (static_cast<uint64_t>(max_compressed_bytes_per_kb) * uncomp_size) >>
-         10;
-}
-
-// format_version is the block format as defined in include/rocksdb/table.h
-CompressionType CompressBlock(const Slice& uncompressed_data,
-                              const CompressionInfo& info,
-                              uint32_t format_version,
-                              std::string* compressed_output) {
-  assert(compressed_output);
-  assert(compressed_output->empty());
-
-  int max_compressed_bytes_per_kb = info.options().max_compressed_bytes_per_kb;
-  if (info.type() == kNoCompression || max_compressed_bytes_per_kb <= 0) {
-    return kNoCompression;
-  }
-
-  // Actually compress the data; if the compression method is not supported,
-  // or the compression fails etc., just fall back to uncompressed
-  if (!CompressData(uncompressed_data, info,
-                    GetCompressFormatForVersion(format_version),
-                    compressed_output)) {
-    return kNoCompression;
-  }
-
-  // Check the compression ratio; if it's not good enough, just fall back to
-  // uncompressed
-  if (!GoodCompressionRatio(compressed_output->size(), uncompressed_data.size(),
-                            max_compressed_bytes_per_kb)) {
-    return kNoCompression;
-  }
-
-  return info.type();
-}
-
 }  // namespace
 
 // kBlockBasedTableMagicNumber was picked by running
@@ -224,6 +185,11 @@ class BlockBasedTableBuilder::BlockBasedTablePropertiesCollector
   bool decoupled_partitioned_filters_;
 };
 
+struct BlockBasedTableBuilder::WorkingAreaPair {
+  Compressor::ManagedWorkingArea compress;
+  Decompressor::ManagedWorkingArea verify;
+};
+
 struct BlockBasedTableBuilder::Rep {
   const ImmutableOptions ioptions;
   // BEGIN from MutableCFOptions
@@ -263,18 +229,48 @@ struct BlockBasedTableBuilder::Rep {
 
   std::string last_ikey;  // Internal key or empty (unset)
   const Slice* first_key_in_next_block = nullptr;
-  CompressionType compression_type;
+  bool warm_cache = false;
+
   uint64_t sample_for_compression;
   std::atomic<uint64_t> compressible_input_data_bytes;
   std::atomic<uint64_t> uncompressible_input_data_bytes;
   std::atomic<uint64_t> sampled_input_data_bytes;
   std::atomic<uint64_t> sampled_output_slow_data_bytes;
   std::atomic<uint64_t> sampled_output_fast_data_bytes;
-  CompressionOptions compression_opts;
-  std::unique_ptr<CompressionDict> compression_dict;
-  std::vector<std::unique_ptr<CompressionContext>> compression_ctxs;
-  std::vector<std::unique_ptr<UncompressionContext>> verify_ctxs;
-  std::unique_ptr<UncompressionDict> verify_dict;
+  uint32_t compression_parallel_threads;
+  int max_compressed_bytes_per_kb;
+  size_t max_dict_sample_bytes = 0;
+
+  // *** Compressors & decompressors - Yes, it seems like a lot here but ***
+  // *** these are distinct fields to minimize extra conditionals and    ***
+  // *** field reads on hot code paths.                                  ***
+
+  // A compressor for blocks in general, without dictionary compression
+  std::unique_ptr<Compressor> basic_compressor;
+  // A compressor using dictionary compression (when applicable)
+  std::unique_ptr<Compressor> compressor_with_dict;
+  // Once configured/determined, points to one of the above Compressors to
+  // use on data blocks.
+  Compressor* data_block_compressor = nullptr;
+  // A decompressor corresponding to basic_compressor (when non-nullptr).
+  // Used for verification and cache warming.
+  std::shared_ptr<Decompressor> basic_decompressor;
+  // When needed, a decompressor for verifying compression using a
+  // dictionary sampled/trained from this file.
+  std::unique_ptr<Decompressor> verify_decompressor_with_dict;
+  // When non-nullptr, compression should be verified with this corresponding
+  // decompressor, except for data blocks. (Points to same as basic_decompressor
+  // when verify_compression is set.)
+  UnownedPtr<Decompressor> verify_decompressor;
+  // Once configured/determined, points to one of the above Decompressors to use
+  // in verifying data blocks.
+  UnownedPtr<Decompressor> data_block_verify_decompressor;
+
+  // Working area for basic_compressor when compression_parallel_threads==1
+  WorkingAreaPair basic_working_area;
+  // Working areas for data_block_compressor, for each of
+  // compression_parallel_threads
+  std::vector<WorkingAreaPair> data_block_working_areas;
 
   size_t data_begin_offset = 0;
 
@@ -303,10 +299,10 @@ struct BlockBasedTableBuilder::Rep {
     kUnbuffered,
     kClosed,
   };
-  State state;
+  State state = State::kUnbuffered;
   // `kBuffered` state is allowed only as long as the buffering of uncompressed
   // data blocks (see `data_block_buffers`) does not exceed `buffer_limit`.
-  uint64_t buffer_limit;
+  uint64_t buffer_limit = 0;
   std::shared_ptr<CacheReservationManager>
       compression_dict_buffer_cache_res_mgr;
   const bool use_delta_encoding_for_index_values;
@@ -339,7 +335,7 @@ struct BlockBasedTableBuilder::Rep {
   void set_offset(uint64_t o) { offset.store(o, std::memory_order_relaxed); }
 
   bool IsParallelCompressionEnabled() const {
-    return compression_opts.parallel_threads > 1;
+    return compression_parallel_threads > 1;
   }
 
   Status GetStatus() {
@@ -383,7 +379,7 @@ struct BlockBasedTableBuilder::Rep {
   // Never erase an existing status that is not OK.
   void SetStatus(Status s) {
     if (!s.ok() && status_ok.load(std::memory_order_relaxed)) {
-      // Locking is an overkill for non compression_opts.parallel_threads
+      // Locking is an overkill for non compression_parallel_threads
       // case but since it's unlikely that s is not OK, we take this cost
       // to be simplicity.
       std::lock_guard<std::mutex> lock(status_mutex);
@@ -396,7 +392,7 @@ struct BlockBasedTableBuilder::Rep {
   // Calling this will also SetStatus(ios)
   void SetIOStatus(IOStatus ios) {
     if (!ios.ok() && io_status_ok.load(std::memory_order_relaxed)) {
-      // Locking is an overkill for non compression_opts.parallel_threads
+      // Locking is an overkill for non compression_parallel_threads
       // case but since it's unlikely that s is not OK, we take this cost
       // to be simplicity.
       std::lock_guard<std::mutex> lock(io_status_mutex);
@@ -438,22 +434,16 @@ struct BlockBasedTableBuilder::Rep {
             0.75 /* data_block_hash_table_util_ratio */, ts_sz,
             persist_user_defined_timestamps),
         internal_prefix_transform(prefix_extractor.get()),
-        compression_type(tbo.compression_type),
         sample_for_compression(tbo.moptions.sample_for_compression),
         compressible_input_data_bytes(0),
         uncompressible_input_data_bytes(0),
         sampled_input_data_bytes(0),
         sampled_output_slow_data_bytes(0),
         sampled_output_fast_data_bytes(0),
-        compression_opts(tbo.compression_opts),
-        compression_dict(),
-        compression_ctxs(tbo.compression_opts.parallel_threads),
-        verify_ctxs(tbo.compression_opts.parallel_threads),
-        verify_dict(),
-        state((tbo.compression_opts.max_dict_bytes > 0 &&
-               tbo.compression_type != kNoCompression)
-                  ? State::kBuffered
-                  : State::kUnbuffered),
+        compression_parallel_threads(tbo.compression_opts.parallel_threads),
+        max_compressed_bytes_per_kb(
+            tbo.compression_opts.max_compressed_bytes_per_kb),
+        data_block_working_areas(compression_parallel_threads),
         use_delta_encoding_for_index_values(table_opt.format_version >= 4 &&
                                             !table_opt.block_align),
         reason(tbo.reason),
@@ -461,7 +451,7 @@ struct BlockBasedTableBuilder::Rep {
             table_options.flush_block_policy_factory->NewFlushBlockPolicy(
                 table_options, data_block)),
         create_context(&table_options, &ioptions, ioptions.stats,
-                       compression_type == kZSTD,
+                       /*decompressor=*/nullptr,
                        tbo.moptions.block_protection_bytes_per_key,
                        tbo.internal_comparator.user_comparator(),
                        !use_delta_encoding_for_index_values,
@@ -470,13 +460,90 @@ struct BlockBasedTableBuilder::Rep {
         tail_size(0),
         status_ok(true),
         io_status_ok(true) {
-    if (tbo.target_file_size == 0) {
-      buffer_limit = compression_opts.max_dict_buffer_bytes;
-    } else if (compression_opts.max_dict_buffer_bytes == 0) {
-      buffer_limit = tbo.target_file_size;
-    } else {
-      buffer_limit = std::min(tbo.target_file_size,
-                              compression_opts.max_dict_buffer_bytes);
+    FilterBuildingContext filter_context(table_options);
+
+    filter_context.info_log = ioptions.logger;
+    filter_context.column_family_name = tbo.column_family_name;
+    filter_context.reason = reason;
+
+    // Only populate other fields if known to be in LSM rather than
+    // generating external SST file
+    if (reason != TableFileCreationReason::kMisc) {
+      filter_context.compaction_style = ioptions.compaction_style;
+      filter_context.num_levels = ioptions.num_levels;
+      filter_context.level_at_creation = tbo.level_at_creation;
+      filter_context.is_bottommost = tbo.is_bottommost;
+      assert(filter_context.level_at_creation < filter_context.num_levels);
+    }
+
+    // TODO: get CompressionManager from options and sort out properties
+    auto mgr = GetBuiltinCompressionManager(
+        GetCompressFormatForVersion(table_opt.format_version));
+    props.compression_name = CompressionTypeToString(tbo.compression_type);
+    props.compression_options =
+        CompressionOptionsToString(tbo.compression_opts);
+
+    // Sanitize to only allowing compression when it saves space.
+    max_compressed_bytes_per_kb =
+        std::min(int{1023}, tbo.compression_opts.max_compressed_bytes_per_kb);
+
+    basic_compressor = mgr->GetCompressorForSST(
+        filter_context, tbo.compression_opts, tbo.compression_type);
+    if (basic_compressor) {
+      if (table_options.enable_index_compression) {
+        basic_working_area.compress = basic_compressor->ObtainWorkingArea();
+      }
+      max_dict_sample_bytes = basic_compressor->GetMaxSampleSizeIfWantDict(
+          CacheEntryRole::kDataBlock);
+      if (max_dict_sample_bytes > 0) {
+        state = State::kBuffered;
+        if (tbo.target_file_size == 0) {
+          buffer_limit = tbo.compression_opts.max_dict_buffer_bytes;
+        } else if (tbo.compression_opts.max_dict_buffer_bytes == 0) {
+          buffer_limit = tbo.target_file_size;
+        } else {
+          buffer_limit = std::min(tbo.target_file_size,
+                                  tbo.compression_opts.max_dict_buffer_bytes);
+        }
+      } else {
+        // No distinct data block compressor using dictionary
+        data_block_compressor = basic_compressor.get();
+        for (uint32_t i = 0; i < compression_parallel_threads; i++) {
+          data_block_working_areas[i].compress =
+              data_block_compressor->ObtainWorkingArea();
+        }
+      }
+      basic_decompressor =
+          mgr->GetDecompressorOptimizeFor(tbo.compression_type);
+      create_context.decompressor = basic_decompressor.get();
+
+      if (table_options.verify_compression) {
+        verify_decompressor = basic_decompressor.get();
+        if (table_options.enable_index_compression) {
+          basic_working_area.verify =
+              verify_decompressor->ObtainWorkingArea(tbo.compression_type);
+        }
+        if (state == State::kUnbuffered) {
+          for (uint32_t i = 0; i < compression_parallel_threads; i++) {
+            data_block_working_areas[i].verify =
+                verify_decompressor->ObtainWorkingArea(tbo.compression_type);
+          }
+          data_block_verify_decompressor = verify_decompressor.get();
+        }
+      }
+    }
+
+    switch (table_options.prepopulate_block_cache) {
+      case BlockBasedTableOptions::PrepopulateBlockCache::kFlushOnly:
+        warm_cache = (reason == TableFileCreationReason::kFlush);
+        break;
+      case BlockBasedTableOptions::PrepopulateBlockCache::kDisable:
+        warm_cache = false;
+        break;
+      default:
+        // missing case
+        assert(false);
+        warm_cache = false;
     }
 
     const auto compress_dict_build_buffer_charged =
@@ -496,11 +563,6 @@ struct BlockBasedTableBuilder::Rep {
       compression_dict_buffer_cache_res_mgr = nullptr;
     }
 
-    assert(compression_ctxs.size() >= compression_opts.parallel_threads);
-    for (uint32_t i = 0; i < compression_opts.parallel_threads; i++) {
-      compression_ctxs[i].reset(
-          new CompressionContext(compression_type, compression_opts));
-    }
     if (table_options.index_type ==
         BlockBasedTableOptions::kTwoLevelIndexSearch) {
       p_index_builder_ = PartitionedIndexBuilder::CreateIndexBuilder(
@@ -524,22 +586,6 @@ struct BlockBasedTableBuilder::Rep {
       // Null filter_policy -> no filter
       filter_builder.reset();
     } else {
-      FilterBuildingContext filter_context(table_options);
-
-      filter_context.info_log = ioptions.logger;
-      filter_context.column_family_name = tbo.column_family_name;
-      filter_context.reason = reason;
-
-      // Only populate other fields if known to be in LSM rather than
-      // generating external SST file
-      if (reason != TableFileCreationReason::kMisc) {
-        filter_context.compaction_style = ioptions.compaction_style;
-        filter_context.num_levels = ioptions.num_levels;
-        filter_context.level_at_creation = tbo.level_at_creation;
-        filter_context.is_bottommost = tbo.is_bottommost;
-        assert(filter_context.level_at_creation < filter_context.num_levels);
-      }
-
       filter_builder.reset(CreateFilterBlockBuilder(
           ioptions, tbo.moptions, filter_context,
           use_delta_encoding_for_index_values, p_index_builder_, ts_sz,
@@ -569,11 +615,6 @@ struct BlockBasedTableBuilder::Rep {
           new TimestampTablePropertiesCollector(
               tbo.internal_comparator.user_comparator()));
     }
-    if (table_options.verify_compression) {
-      for (uint32_t i = 0; i < compression_opts.parallel_threads; i++) {
-        verify_ctxs[i].reset(new UncompressionContext(compression_type));
-      }
-    }
 
     // These are only needed for populating table properties
     props.column_family_id = tbo.column_family_id;
@@ -604,7 +645,7 @@ struct BlockBasedTableBuilder::Rep {
       base_context_checksum = 0;
     }
 
-    if (alignment > 0 && compression_type != kNoCompression) {
+    if (alignment > 0 && basic_compressor) {
       // With better sanitization in `CompactionPicker::CompactFiles()`, we
       // would not need to handle this case here and could change it to an
       // assertion instead.
@@ -849,11 +890,9 @@ struct BlockBasedTableBuilder::ParallelCompressionRep {
 
   // Make a block prepared to be emitted to compression thread
   // Used in non-buffered mode
-  BlockRep* PrepareBlock(CompressionType compression_type,
-                         const Slice* first_key_in_next_block,
+  BlockRep* PrepareBlock(const Slice* first_key_in_next_block,
                          BlockBuilder* data_block) {
-    BlockRep* block_rep =
-        PrepareBlockInternal(compression_type, first_key_in_next_block);
+    BlockRep* block_rep = PrepareBlockInternal(first_key_in_next_block);
     assert(block_rep != nullptr);
     data_block->SwapAndReset(block_rep->uncompressed);
     std::swap(block_rep->keys, curr_block_keys);
@@ -862,12 +901,10 @@ struct BlockBasedTableBuilder::ParallelCompressionRep {
   }
 
   // Used in EnterUnbuffered
-  BlockRep* PrepareBlock(CompressionType compression_type,
-                         const Slice* first_key_in_next_block,
+  BlockRep* PrepareBlock(const Slice* first_key_in_next_block,
                          std::string* data_block,
                          std::vector<std::string>* keys) {
-    BlockRep* block_rep =
-        PrepareBlockInternal(compression_type, first_key_in_next_block);
+    BlockRep* block_rep = PrepareBlockInternal(first_key_in_next_block);
     assert(block_rep != nullptr);
     std::swap(block_rep->uncompressed, *data_block);
     block_rep->keys.SwapAssign(*keys);
@@ -907,13 +944,12 @@ struct BlockBasedTableBuilder::ParallelCompressionRep {
   }
 
  private:
-  BlockRep* PrepareBlockInternal(CompressionType compression_type,
-                                 const Slice* first_key_in_next_block) {
+  BlockRep* PrepareBlockInternal(const Slice* first_key_in_next_block) {
     BlockRep* block_rep = nullptr;
     block_rep_pool.pop(block_rep);
     assert(block_rep != nullptr);
 
-    block_rep->compression_type = compression_type;
+    block_rep->compression_type = kNoCompression;
 
     if (first_key_in_next_block == nullptr) {
       block_rep->first_key_in_next_block = {};
@@ -944,7 +980,7 @@ BlockBasedTableBuilder::BlockBasedTableBuilder(
 
   if (rep_->IsParallelCompressionEnabled()) {
     StartParallelCompression();
-  } else if (rep_->compression_type != kNoCompression) {
+  } else if (rep_->basic_compressor) {
     rep_->single_threaded_compressed_output.reserve(table_options.block_size);
   }
 }
@@ -1136,9 +1172,10 @@ void BlockBasedTableBuilder::Flush() {
       CompressionInfo info_tmp(options, context,
                                CompressionDict::GetEmptyDict(), c);
 
-      CompressData(uncompressed_block_data, info_tmp,
-                   GetCompressFormatForVersion(r->table_options.format_version),
-                   &sampled_output_fast);
+      OLD_CompressData(
+          uncompressed_block_data, info_tmp,
+          GetCompressFormatForVersion(r->table_options.format_version),
+          &sampled_output_fast);
     }
 
     // Sampling with a slow but high-compression algorithm
@@ -1149,9 +1186,10 @@ void BlockBasedTableBuilder::Flush() {
       CompressionInfo info_tmp(options, context,
                                CompressionDict::GetEmptyDict(), c);
 
-      CompressData(uncompressed_block_data, info_tmp,
-                   GetCompressFormatForVersion(r->table_options.format_version),
-                   &sampled_output_slow);
+      OLD_CompressData(
+          uncompressed_block_data, info_tmp,
+          GetCompressFormatForVersion(r->table_options.format_version),
+          &sampled_output_slow);
     }
 
     if (sampled_output_slow.size() > 0 || sampled_output_fast.size() > 0) {
@@ -1182,8 +1220,8 @@ void BlockBasedTableBuilder::Flush() {
     rep_->data_begin_offset += uncompressed_block_data.size();
   } else if (r->IsParallelCompressionEnabled()) {
     assert(rep_->state == Rep::State::kUnbuffered);
-    ParallelCompressionRep::BlockRep* block_rep = r->pc_rep->PrepareBlock(
-        r->compression_type, r->first_key_in_next_block, &(r->data_block));
+    ParallelCompressionRep::BlockRep* block_rep =
+        r->pc_rep->PrepareBlock(r->first_key_in_next_block, &(r->data_block));
     assert(block_rep != nullptr);
     r->pc_rep->file_size_estimator.EmitBlock(block_rep->uncompressed.size(),
                                              r->get_offset());
@@ -1203,10 +1241,10 @@ void BlockBasedTableBuilder::WriteBlock(const Slice& uncompressed_block_data,
   CompressionType type;
   Status compress_status;
   bool is_data_block = block_type == BlockType::kData;
-  CompressAndVerifyBlock(uncompressed_block_data, is_data_block,
-                         *(r->compression_ctxs[0]), r->verify_ctxs[0].get(),
-                         &r->single_threaded_compressed_output, &type,
-                         &compress_status);
+  CompressAndVerifyBlock(
+      uncompressed_block_data, is_data_block,
+      is_data_block ? r->data_block_working_areas[0] : r->basic_working_area,
+      &r->single_threaded_compressed_output, &type, &compress_status);
   r->SetStatus(compress_status);
   if (!ok()) {
     return;
@@ -1226,17 +1264,14 @@ void BlockBasedTableBuilder::WriteBlock(const Slice& uncompressed_block_data,
   }
 }
 
-void BlockBasedTableBuilder::BGWorkCompression(
-    const CompressionContext& compression_ctx,
-    UncompressionContext* verify_ctx) {
+void BlockBasedTableBuilder::BGWorkCompression(WorkingAreaPair& working_area) {
   ParallelCompressionRep::BlockRep* block_rep = nullptr;
   while (rep_->pc_rep->compress_queue.pop(block_rep)) {
     assert(block_rep != nullptr);
     // Skip compression if we are aborting anyway
     if (ok()) {
       CompressAndVerifyBlock(block_rep->uncompressed, true, /* is_data_block*/
-                             compression_ctx, verify_ctx,
-                             &block_rep->compressed,
+                             working_area, &block_rep->compressed,
                              &block_rep->compression_type, &block_rep->status);
     }
     block_rep->slot.Fill(block_rep);
@@ -1245,83 +1280,90 @@ void BlockBasedTableBuilder::BGWorkCompression(
 
 void BlockBasedTableBuilder::CompressAndVerifyBlock(
     const Slice& uncompressed_block_data, bool is_data_block,
-    const CompressionContext& compression_ctx, UncompressionContext* verify_ctx,
-    std::string* compressed_output, CompressionType* result_compression_type,
-    Status* out_status) {
+    WorkingAreaPair& working_area, std::string* compressed_output,
+    CompressionType* result_compression_type, Status* out_status) {
   Rep* r = rep_;
 
-  CompressionType type = r->compression_type;
-  if (uncompressed_block_data.size() < kCompressionSizeLimit) {
-    StopWatchNano timer(
-        r->ioptions.clock,
-        ShouldReportDetailedTime(r->ioptions.env, r->ioptions.stats));
-
-#ifndef NDEBUG
-    if (type != kNoCompression &&
-        g_hack_mixed_compression_in_block_based_table.LoadRelaxed() > 0U) {
-      // If zstd is in the mix, the compression_name table property needs to be
-      // set to it, for proper handling of context and dictionaries.
-      assert(!ZSTD_Supported() || r->compression_type == kZSTD);
-      const auto& compressions = GetSupportedCompressions();
-      auto counter =
-          g_hack_mixed_compression_in_block_based_table.FetchAddRelaxed(1);
-      type = compressions[counter % compressions.size()];
-    }
-#endif  // !NDEBUG
-
-    const CompressionDict* compression_dict;
-    if (!is_data_block || r->compression_dict == nullptr) {
-      compression_dict = &CompressionDict::GetEmptyDict();
-    } else {
-      compression_dict = r->compression_dict.get();
-    }
-    assert(compression_dict != nullptr);
-    CompressionInfo compression_info(r->compression_opts, compression_ctx,
-                                     *compression_dict, type);
-
-    type = CompressBlock(uncompressed_block_data, compression_info,
-                         r->table_options.format_version, compressed_output);
-
-    // Some of the compression algorithms are known to be unreliable. If
-    // the verify_compression flag is set then try to de-compress the
-    // compressed data and compare to the input.
-    if (r->table_options.verify_compression && type != kNoCompression) {
-      // Retrieve the uncompressed contents into a new buffer
-      const UncompressionDict* verify_dict;
-      if (!is_data_block || r->verify_dict == nullptr) {
-        verify_dict = &UncompressionDict::GetEmptyDict();
+  Compressor* compressor = nullptr;
+  Decompressor* verify_decomp = nullptr;
+  if (is_data_block) {
+    compressor = r->data_block_compressor;
+    verify_decomp = r->data_block_verify_decompressor.get();
+  } else {
+    compressor = r->basic_compressor.get();
+    verify_decomp = r->verify_decompressor.get();
+  }
+
+  CompressionType type = kNoCompression;
+  if (LIKELY(uncompressed_block_data.size() < kCompressionSizeLimit)) {
+    if (compressor) {
+      StopWatchNano timer(
+          r->ioptions.clock,
+          ShouldReportDetailedTime(r->ioptions.env, r->ioptions.stats));
+
+      if (is_data_block) {
+        if (r->data_block_compressor) {
+          *out_status = r->data_block_compressor->CompressBlock(
+              uncompressed_block_data, compressed_output, &type,
+              &working_area.compress);
+          verify_decomp = r->data_block_verify_decompressor.get();
+        }
       } else {
-        verify_dict = r->verify_dict.get();
+        if (r->basic_compressor) {
+          *out_status = r->basic_compressor->CompressBlock(
+              uncompressed_block_data, compressed_output, &type,
+              &working_area.compress);
+          verify_decomp = r->verify_decompressor.get();
+        }
       }
-      assert(verify_dict != nullptr);
-      BlockContents contents;
-      UncompressionInfo uncompression_info(*verify_ctx, *verify_dict,
-                                           r->compression_type);
-      Status uncompress_status =
-          UncompressBlockData(uncompression_info, compressed_output->data(),
-                              compressed_output->size(), &contents,
-                              r->table_options.format_version, r->ioptions);
-
-      if (uncompress_status.ok()) {
-        bool data_match = contents.data.compare(uncompressed_block_data) == 0;
-        if (!data_match) {
-          // The result of the compression was invalid. abort.
-          const char* const msg =
-              "Decompressed block did not match pre-compression block";
-          ROCKS_LOG_ERROR(r->ioptions.logger, "%s", msg);
-          *out_status = Status::Corruption(msg);
+      // Post-condition of Compressor::CompressBlock
+      assert(type == kNoCompression || out_status->ok());
+      assert(type == kNoCompression ||
+             r->table_options.verify_compression == (verify_decomp != nullptr));
+
+      // Check for acceptable compression ratio. (For efficiency, avoid floating
+      // point and division.)
+      // TODO: integrate into Compressor?
+      if (compressed_output->size() >
+          (static_cast<uint64_t>(r->max_compressed_bytes_per_kb) *
+           uncompressed_block_data.size()) >>
+          10) {
+        // Prefer to keep uncompressed
+        type = kNoCompression;
+      }
+
+      // Some of the compression algorithms are known to be unreliable. If
+      // the verify_compression flag is set then try to de-compress the
+      // compressed data and compare to the input.
+      if (verify_decomp && type != kNoCompression) {
+        BlockContents contents;
+        Status uncompress_status = DecompressBlockData(
+            compressed_output->data(), compressed_output->size(), type,
+            *verify_decomp, &contents, r->ioptions,
+            /*allocator=*/nullptr, &working_area.verify);
+
+        if (uncompress_status.ok()) {
+          bool data_match = contents.data.compare(uncompressed_block_data) == 0;
+          if (!data_match) {
+            // The result of the compression was invalid. abort.
+            const char* const msg =
+                "Decompressed block did not match pre-compression block";
+            ROCKS_LOG_ERROR(r->ioptions.logger, "%s", msg);
+            *out_status = Status::Corruption(msg);
+            type = kNoCompression;
+          }
+        } else {
+          // Decompression reported an error. abort.
+          *out_status =
+              Status::Corruption(std::string("Could not decompress: ") +
+                                 uncompress_status.getState());
           type = kNoCompression;
         }
-      } else {
-        // Decompression reported an error. abort.
-        *out_status = Status::Corruption(std::string("Could not decompress: ") +
-                                         uncompress_status.getState());
-        type = kNoCompression;
       }
-    }
-    if (timer.IsStarted()) {
-      RecordTimeToHistogram(r->ioptions.stats, COMPRESSION_TIMES_NANOS,
-                            timer.ElapsedNanos());
+      if (timer.IsStarted()) {
+        RecordTimeToHistogram(r->ioptions.stats, COMPRESSION_TIMES_NANOS,
+                              timer.ElapsedNanos());
+      }
     }
     if (is_data_block) {
       r->compressible_input_data_bytes.fetch_add(uncompressed_block_data.size(),
@@ -1336,7 +1378,6 @@ void BlockBasedTableBuilder::CompressAndVerifyBlock(
           uncompressed_block_data.size() + kBlockTrailerSize,
           std::memory_order_relaxed);
     }
-    type = kNoCompression;
   }
 
   // Abort compression if the block is too big, or did not pass
@@ -1388,6 +1429,9 @@ void BlockBasedTableBuilder::WriteMaybeCompressedBlock(
     assert(comp_type == kNoCompression);
   }
 
+  // TODO: consider a variant of this function that puts the trailer after
+  // block_contents (if it comes from a std::string) so we only need one
+  // r->file->Append call
   {
     io_s = r->file->Append(io_options, block_contents);
     if (!io_s.ok()) {
@@ -1423,27 +1467,12 @@ void BlockBasedTableBuilder::WriteMaybeCompressedBlock(
     }
   }
 
-  {
-    bool warm_cache;
-    switch (r->table_options.prepopulate_block_cache) {
-      case BlockBasedTableOptions::PrepopulateBlockCache::kFlushOnly:
-        warm_cache = (r->reason == TableFileCreationReason::kFlush);
-        break;
-      case BlockBasedTableOptions::PrepopulateBlockCache::kDisable:
-        warm_cache = false;
-        break;
-      default:
-        // missing case
-        assert(false);
-        warm_cache = false;
-    }
-    if (warm_cache) {
-      Status s = InsertBlockInCacheHelper(*uncompressed_block_data, handle,
-                                          block_type);
-      if (!s.ok()) {
-        r->SetStatus(s);
-        return;
-      }
+  if (r->warm_cache) {
+    Status s =
+        InsertBlockInCacheHelper(*uncompressed_block_data, handle, block_type);
+    if (!s.ok()) {
+      r->SetStatus(s);
+      return;
     }
   }
 
@@ -1546,14 +1575,12 @@ void BlockBasedTableBuilder::BGWorkWriteMaybeCompressedBlock() {
 
 void BlockBasedTableBuilder::StartParallelCompression() {
   rep_->pc_rep.reset(
-      new ParallelCompressionRep(rep_->compression_opts.parallel_threads));
+      new ParallelCompressionRep(rep_->compression_parallel_threads));
   rep_->pc_rep->compress_thread_pool.reserve(
-      rep_->compression_opts.parallel_threads);
-  for (uint32_t i = 0; i < rep_->compression_opts.parallel_threads; i++) {
-    rep_->pc_rep->compress_thread_pool.emplace_back([this, i] {
-      BGWorkCompression(*(rep_->compression_ctxs[i]),
-                        rep_->verify_ctxs[i].get());
-    });
+      rep_->compression_parallel_threads);
+  for (uint32_t i = 0; i < rep_->compression_parallel_threads; i++) {
+    rep_->pc_rep->compress_thread_pool.emplace_back(
+        [this, i] { BGWorkCompression(rep_->data_block_working_areas[i]); });
   }
   rep_->pc_rep->write_thread.reset(
       new port::Thread([this] { BGWorkWriteMaybeCompressedBlock(); }));
@@ -1584,10 +1611,14 @@ Status BlockBasedTableBuilder::InsertBlockInCacheHelper(
   if (block_cache && helper && helper->create_cb) {
     CacheKey key = BlockBasedTable::GetCacheKey(rep_->base_cache_key, *handle);
     size_t charge;
+    // NOTE: data blocks (and everything else) will be warmed in decompressed
+    // state, so does not need a dictionary-aware decompressor. The only thing
+    // needing a decompressor here (in create_context) is warming the
+    // (de)compression dictionary, which will clone and save a dict-based
+    // decompressor from the corresponding non-dict decompressor.
     s = WarmInCache(block_cache, key.AsSlice(), block_contents,
                     &rep_->create_context, helper, Cache::Priority::LOW,
                     &charge);
-
     if (s.ok()) {
       BlockBasedTable::UpdateCacheInsertionMetrics(
           block_type, nullptr /*get_context*/, charge, s.IsOkOverwritten(),
@@ -1747,10 +1778,6 @@ void BlockBasedTableBuilder::WritePropertiesBlock(
         rep_->ioptions.merge_operator != nullptr
             ? rep_->ioptions.merge_operator->Name()
             : "nullptr";
-    rep_->props.compression_name =
-        CompressionTypeToString(rep_->compression_type);
-    rep_->props.compression_options =
-        CompressionOptionsToString(rep_->compression_opts);
     rep_->props.prefix_extractor_name =
         rep_->prefix_extractor ? rep_->prefix_extractor->AsString() : "nullptr";
     std::string property_collectors_names = "[";
@@ -1839,19 +1866,19 @@ void BlockBasedTableBuilder::WritePropertiesBlock(
 
 void BlockBasedTableBuilder::WriteCompressionDictBlock(
     MetaIndexBuilder* meta_index_builder) {
-  if (rep_->compression_dict != nullptr &&
-      rep_->compression_dict->GetRawDict().size()) {
+  Slice compression_dict;
+  if (rep_->compressor_with_dict) {
+    compression_dict = rep_->compressor_with_dict->GetSerializedDict();
+  }
+  if (!compression_dict.empty()) {
     BlockHandle compression_dict_block_handle;
     if (ok()) {
-      WriteMaybeCompressedBlock(rep_->compression_dict->GetRawDict(),
-                                kNoCompression, &compression_dict_block_handle,
+      WriteMaybeCompressedBlock(compression_dict, kNoCompression,
+                                &compression_dict_block_handle,
                                 BlockType::kCompressionDictionary);
-#ifndef NDEBUG
-      Slice compression_dict = rep_->compression_dict->GetRawDict();
       TEST_SYNC_POINT_CALLBACK(
           "BlockBasedTableBuilder::WriteCompressionDictBlock:RawDict",
           &compression_dict);
-#endif  // NDEBUG
     }
     if (ok()) {
       meta_index_builder->Add(kCompressionDictBlockName,
@@ -1907,9 +1934,6 @@ void BlockBasedTableBuilder::EnterUnbuffered() {
   Rep* r = rep_;
   assert(r->state == Rep::State::kBuffered);
   r->state = Rep::State::kUnbuffered;
-  const size_t kSampleBytes = r->compression_opts.zstd_max_train_bytes > 0
-                                  ? r->compression_opts.zstd_max_train_bytes
-                                  : r->compression_opts.max_dict_bytes;
   const size_t kNumBlocksBuffered = r->data_block_buffers.size();
   if (kNumBlocksBuffered == 0) {
     // The below code is neither safe nor necessary for handling zero data
@@ -1934,17 +1958,16 @@ void BlockBasedTableBuilder::EnterUnbuffered() {
       kPrimeGenerator % static_cast<uint64_t>(kNumBlocksBuffered));
   const size_t kInitSampleIdx = kNumBlocksBuffered / 2;
 
-  std::string compression_dict_samples;
-  std::vector<size_t> compression_dict_sample_lens;
+  Compressor::DictSampleArgs samples;
   size_t buffer_idx = kInitSampleIdx;
-  for (size_t i = 0;
-       i < kNumBlocksBuffered && compression_dict_samples.size() < kSampleBytes;
+  for (size_t i = 0; i < kNumBlocksBuffered &&
+                     samples.sample_data.size() < r->max_dict_sample_bytes;
        ++i) {
-    size_t copy_len = std::min(kSampleBytes - compression_dict_samples.size(),
-                               r->data_block_buffers[buffer_idx].size());
-    compression_dict_samples.append(r->data_block_buffers[buffer_idx], 0,
-                                    copy_len);
-    compression_dict_sample_lens.emplace_back(copy_len);
+    size_t copy_len =
+        std::min(r->max_dict_sample_bytes - samples.sample_data.size(),
+                 r->data_block_buffers[buffer_idx].size());
+    samples.sample_data.append(r->data_block_buffers[buffer_idx], 0, copy_len);
+    samples.sample_lens.emplace_back(copy_len);
 
     buffer_idx += kPrimeGeneratorRemainder;
     if (buffer_idx >= kNumBlocksBuffered) {
@@ -1952,30 +1975,41 @@ void BlockBasedTableBuilder::EnterUnbuffered() {
     }
   }
 
-  // final data block flushed, now we can generate dictionary from the samples.
-  // OK if compression_dict_samples is empty, we'll just get empty dictionary.
-  std::string dict;
-  if (r->compression_opts.zstd_max_train_bytes > 0) {
-    if (r->compression_opts.use_zstd_dict_trainer) {
-      dict = ZSTD_TrainDictionary(compression_dict_samples,
-                                  compression_dict_sample_lens,
-                                  r->compression_opts.max_dict_bytes);
+  // final sample data block flushed, now we can generate dictionary
+  r->compressor_with_dict = r->basic_compressor->MaybeCloneSpecialized(
+      CacheEntryRole::kDataBlock, std::move(samples));
+
+  // The compressor might opt not to use a dictionary, in which case we
+  // can use the same compressor as for e.g. index blocks.
+  r->data_block_compressor = r->compressor_with_dict
+                                 ? r->compressor_with_dict.get()
+                                 : r->basic_compressor.get();
+  for (uint32_t i = 0; i < r->compression_parallel_threads; i++) {
+    r->data_block_working_areas[i].compress =
+        r->data_block_compressor->ObtainWorkingArea();
+  }
+  Slice serialized_dict = r->data_block_compressor->GetSerializedDict();
+  if (!serialized_dict.empty() && r->verify_decompressor) {
+    // Get an updated dictionary-aware decompressor for verification.
+    Status s = r->verify_decompressor->MaybeCloneForDict(
+        serialized_dict, &r->verify_decompressor_with_dict);
+    // Dictionary support must be present on the decompressor side if it's on
+    // the compressor side.
+    assert(r->verify_decompressor_with_dict);
+    if (r->verify_decompressor_with_dict) {
+      r->data_block_verify_decompressor =
+          r->verify_decompressor_with_dict.get();
+      for (uint32_t i = 0; i < r->compression_parallel_threads; i++) {
+        r->data_block_working_areas[i].verify =
+            r->data_block_verify_decompressor->ObtainWorkingArea(
+                r->data_block_compressor->GetPreferredCompressionType());
+      }
+      assert(s.ok());
     } else {
-      dict = ZSTD_FinalizeDictionary(
-          compression_dict_samples, compression_dict_sample_lens,
-          r->compression_opts.max_dict_bytes, r->compression_opts.level);
+      assert(!s.ok());
+      r->SetStatus(s);
     }
-  } else {
-    // ZSTD "raw content dictionary" - "Any buffer is a valid raw content
-    // dictionary."
-    dict = std::move(compression_dict_samples);
-  }
-  if (r->table_options.verify_compression) {
-    r->verify_dict.reset(
-        new UncompressionDict(std::string(dict), r->compression_type == kZSTD));
   }
-  r->compression_dict.reset(new CompressionDict(
-      std::move(dict), r->compression_type, r->compression_opts.level));
 
   auto get_iterator_for_block = [&r](size_t i) {
     auto& data_block = r->data_block_buffers[i];
@@ -2021,7 +2055,7 @@ void BlockBasedTableBuilder::EnterUnbuffered() {
       }
 
       ParallelCompressionRep::BlockRep* block_rep = r->pc_rep->PrepareBlock(
-          r->compression_type, first_key_in_next_block_ptr, &data_block, &keys);
+          first_key_in_next_block_ptr, &data_block, &keys);
 
       assert(block_rep != nullptr);
       r->pc_rep->file_size_estimator.EmitBlock(block_rep->uncompressed.size(),
@@ -2207,7 +2241,4 @@ const std::string BlockBasedTable::kFullFilterBlockPrefix = "fullfilter.";
 const std::string BlockBasedTable::kPartitionedFilterBlockPrefix =
     "partitionedfilter.";
 
-#ifndef NDEBUG
-RelaxedAtomic<uint64_t> g_hack_mixed_compression_in_block_based_table{0};
-#endif  // !NDEBUG
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/table/block_based/block_based_table_builder.h b/table/block_based/block_based_table_builder.h
index 708a0c51922a..b1c4829c95f5 100644
--- a/table/block_based/block_based_table_builder.h
+++ b/table/block_based/block_based_table_builder.h
@@ -155,7 +155,7 @@ class BlockBasedTableBuilder : public TableBuilder {
   class BlockBasedTablePropertiesCollectorFactory;
   class BlockBasedTablePropertiesCollector;
   Rep* rep_;
-
+  struct WorkingAreaPair;
   struct ParallelCompressionRep;
 
   // Advanced operation: flush any buffered key/value pairs to file.
@@ -171,15 +171,12 @@ class BlockBasedTableBuilder : public TableBuilder {
 
   // Get blocks from mem-table walking thread, compress them and
   // pass them to the write thread. Used in parallel compression mode only
-  void BGWorkCompression(const CompressionContext& compression_ctx,
-                         UncompressionContext* verify_ctx);
+  void BGWorkCompression(WorkingAreaPair& working_area);
 
   // Given uncompressed block content, try to compress it and return result and
   // compression type
   void CompressAndVerifyBlock(const Slice& uncompressed_block_data,
-                              bool is_data_block,
-                              const CompressionContext& compression_ctx,
-                              UncompressionContext* verify_ctx,
+                              bool is_data_block, WorkingAreaPair& working_area,
                               std::string* compressed_output,
                               CompressionType* result_compression_type,
                               Status* out_status);
@@ -195,10 +192,4 @@ class BlockBasedTableBuilder : public TableBuilder {
   void StopParallelCompression();
 };
 
-#ifndef NDEBUG
-// 0 == disable the hack
-// > 0 => counter for rotating through compression types
-extern RelaxedAtomic<uint64_t> g_hack_mixed_compression_in_block_based_table;
-#endif
-
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc
index dd904ccd906d..1bf777a87a8c 100644
--- a/table/block_based/block_based_table_reader.cc
+++ b/table/block_based/block_based_table_reader.cc
@@ -91,13 +91,13 @@ CacheAllocationPtr CopyBufferToHeap(MemoryAllocator* allocator, Slice& buf) {
 #define INSTANTIATE_BLOCKLIKE_TEMPLATES(T)                                     \
   template Status BlockBasedTable::RetrieveBlock<T>(                           \
       FilePrefetchBuffer * prefetch_buffer, const ReadOptions& ro,             \
-      const BlockHandle& handle, const UncompressionDict& uncompression_dict,  \
+      const BlockHandle& handle, UnownedPtr<Decompressor> decomp,              \
       CachableEntry<T>* out_parsed_block, GetContext* get_context,             \
       BlockCacheLookupContext* lookup_context, bool for_compaction,            \
       bool use_cache, bool async_read, bool use_block_cache_for_lookup) const; \
   template Status BlockBasedTable::MaybeReadBlockAndLoadToCache<T>(            \
       FilePrefetchBuffer * prefetch_buffer, const ReadOptions& ro,             \
-      const BlockHandle& handle, const UncompressionDict& uncompression_dict,  \
+      const BlockHandle& handle, UnownedPtr<Decompressor> decomp,              \
       bool for_compaction, CachableEntry<T>* block_entry,                      \
       GetContext* get_context, BlockCacheLookupContext* lookup_context,        \
       BlockContents* contents, bool async_read,                                \
@@ -107,7 +107,7 @@ CacheAllocationPtr CopyBufferToHeap(MemoryAllocator* allocator, Slice& buf) {
       CachableEntry<T>* out_parsed_block) const;
 
 INSTANTIATE_BLOCKLIKE_TEMPLATES(ParsedFullFilterBlock);
-INSTANTIATE_BLOCKLIKE_TEMPLATES(UncompressionDict);
+INSTANTIATE_BLOCKLIKE_TEMPLATES(DecompressorDict);
 INSTANTIATE_BLOCKLIKE_TEMPLATES(Block_kData);
 INSTANTIATE_BLOCKLIKE_TEMPLATES(Block_kIndex);
 INSTANTIATE_BLOCKLIKE_TEMPLATES(Block_kFilterPartitionIndex);
@@ -195,7 +195,7 @@ Status ReadAndParseBlockFromFile(
     const Footer& footer, const ReadOptions& options, const BlockHandle& handle,
     std::unique_ptr<TBlocklike>* result, const ImmutableOptions& ioptions,
     BlockCreateContext& create_context, bool maybe_compressed,
-    const UncompressionDict& uncompression_dict,
+    UnownedPtr<Decompressor> decomp,
     const PersistentCacheOptions& cache_options,
     MemoryAllocator* memory_allocator, bool for_compaction, bool async_read) {
   assert(result);
@@ -204,8 +204,8 @@ Status ReadAndParseBlockFromFile(
   BlockFetcher block_fetcher(
       file, prefetch_buffer, footer, options, handle, &contents, ioptions,
       /*do_uncompress*/ maybe_compressed, maybe_compressed,
-      TBlocklike::kBlockType, uncompression_dict, cache_options,
-      memory_allocator, nullptr, for_compaction);
+      TBlocklike::kBlockType, decomp, cache_options, memory_allocator, nullptr,
+      for_compaction);
   Status s;
   // If prefetch_buffer is not allocated, it will fallback to synchronous
   // reading of block contents.
@@ -738,13 +738,20 @@ Status BlockBasedTable::Open(
     return s;
   }
 
+  CompressionType saved_comp_type = CompressionTypeFromString(
+      rep->table_properties ? rep->table_properties->compression_name
+                            : std::string{});
+  if (saved_comp_type != kNoCompression) {
+    // TODO: custom CompressionManager
+    auto mgr = GetBuiltinCompressionManager(
+        GetCompressFormatForVersion(footer.format_version()));
+    rep->decompressor = mgr->GetDecompressorOptimizeFor(saved_comp_type);
+  }
+
   // Populate BlockCreateContext
-  bool blocks_definitely_zstd_compressed =
-      rep->table_properties && (rep->table_properties->compression_name ==
-                                CompressionTypeToString(kZSTD));
   rep->create_context = BlockCreateContext(
       &rep->table_options, &rep->ioptions, rep->ioptions.stats,
-      blocks_definitely_zstd_compressed, block_protection_bytes_per_key,
+      rep->decompressor.get(), block_protection_bytes_per_key,
       rep->internal_comparator.user_comparator(), rep->index_value_is_full,
       rep->index_has_first_key);
 
@@ -997,9 +1004,6 @@ Status BlockBasedTable::ReadPropertiesBlock(
             "Problem reading or processing seqno-to-time mapping: %s",
             s.ToString().c_str());
       }
-      rep_->blocks_maybe_compressed =
-          rep_->table_properties->compression_name !=
-          CompressionTypeToString(kNoCompression);
     }
   } else {
     ROCKS_LOG_ERROR(rep_->ioptions.logger,
@@ -1301,10 +1305,9 @@ Status BlockBasedTable::ReadMetaIndexBlock(
   Status s = ReadAndParseBlockFromFile(
       rep_->file.get(), prefetch_buffer, rep_->footer, ro,
       rep_->footer.metaindex_handle(), &metaindex, rep_->ioptions,
-      rep_->create_context, true /*maybe_compressed*/,
-      UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options,
-      GetMemoryAllocator(rep_->table_options), false /* for_compaction */,
-      false /* async_read */);
+      rep_->create_context, true /*maybe_compressed*/, rep_->decompressor.get(),
+      rep_->persistent_cache_options, GetMemoryAllocator(rep_->table_options),
+      false /* for_compaction */, false /* async_read */);
 
   if (!s.ok()) {
     ROCKS_LOG_ERROR(rep_->ioptions.logger,
@@ -1343,7 +1346,7 @@ template <typename TBlocklike>
 WithBlocklikeCheck<Status, TBlocklike> BlockBasedTable::GetDataBlockFromCache(
     const Slice& cache_key, BlockCacheInterface<TBlocklike> block_cache,
     CachableEntry<TBlocklike>* out_parsed_block, GetContext* get_context,
-    const UncompressionDict* dict) const {
+    UnownedPtr<Decompressor> decomp) const {
   assert(out_parsed_block);
   assert(out_parsed_block->IsEmpty());
 
@@ -1352,12 +1355,24 @@ WithBlocklikeCheck<Status, TBlocklike> BlockBasedTable::GetDataBlockFromCache(
 
   // Lookup uncompressed cache first
   if (block_cache) {
-    BlockCreateContext create_ctx = rep_->create_context;
-    create_ctx.dict = dict;
     assert(!cache_key.empty());
-    auto cache_handle = block_cache.LookupFull(
-        cache_key, &create_ctx, GetCachePriority<TBlocklike>(), statistics,
-        rep_->ioptions.lowest_used_cache_tier);
+    typename BlockCacheInterface<TBlocklike>::TypedHandle* cache_handle;
+    if (decomp.get() != rep_->decompressor.get() && decomp) {
+      // `decomp` must be a dictionary-aware decompressor, which is only
+      // available in the block cache (so that dictionaries can be evicted
+      // from memory) and can't live in the table reader.
+      // NOTE: inefficient BlockCreateContext copy for dict-aware decompressor
+      // (see TODO in block_cache.h)
+      BlockCreateContext create_ctx = rep_->create_context;
+      create_ctx.decompressor = decomp.get();
+      cache_handle = block_cache.LookupFull(
+          cache_key, &create_ctx, GetCachePriority<TBlocklike>(), statistics,
+          rep_->ioptions.lowest_used_cache_tier);
+    } else {
+      cache_handle = block_cache.LookupFull(
+          cache_key, &rep_->create_context, GetCachePriority<TBlocklike>(),
+          statistics, rep_->ioptions.lowest_used_cache_tier);
+    }
 
     // Avoid updating metrics here if the handle is not complete yet. This
     // happens with MultiGet and secondary cache. So update the metrics only
@@ -1387,10 +1402,9 @@ WithBlocklikeCheck<Status, TBlocklike> BlockBasedTable::PutDataBlockToCache(
     CachableEntry<TBlocklike>* out_parsed_block,
     BlockContents&& uncompressed_block_contents,
     BlockContents&& compressed_block_contents, CompressionType block_comp_type,
-    const UncompressionDict& uncompression_dict,
-    MemoryAllocator* memory_allocator, GetContext* get_context) const {
+    UnownedPtr<Decompressor> decomp, MemoryAllocator* memory_allocator,
+    GetContext* get_context) const {
   const ImmutableOptions& ioptions = rep_->ioptions;
-  const uint32_t format_version = rep_->table_options.format_version;
   assert(out_parsed_block);
   assert(out_parsed_block->IsEmpty());
 
@@ -1402,12 +1416,10 @@ WithBlocklikeCheck<Status, TBlocklike> BlockBasedTable::PutDataBlockToCache(
       uncompressed_block_contents.data.empty()) {
     assert(compressed_block_contents.data.data());
     // Retrieve the uncompressed contents into a new buffer
-    UncompressionContext context(block_comp_type);
-    UncompressionInfo info(context, uncompression_dict, block_comp_type);
-    s = UncompressBlockData(info, compressed_block_contents.data.data(),
-                            compressed_block_contents.data.size(),
-                            &uncompressed_block_contents, format_version,
-                            ioptions, memory_allocator);
+    s = DecompressBlockData(
+        compressed_block_contents.data.data(),
+        compressed_block_contents.data.size(), block_comp_type, *decomp,
+        &uncompressed_block_contents, ioptions, memory_allocator);
     if (!s.ok()) {
       return s;
     }
@@ -1520,15 +1532,18 @@ Status BlockBasedTable::LookupAndPinBlocksInCache(
   assert(block_cache);
 
   Status s;
-  CachableEntry<UncompressionDict> uncompression_dict;
+  CachableEntry<DecompressorDict> cached_dict;
   if (rep_->uncompression_dict_reader) {
     s = rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary(
         /* prefetch_buffer= */ nullptr, ro,
         /* get_context= */ nullptr, /* lookup_context= */ nullptr,
-        &uncompression_dict);
+        &cached_dict);
     if (!s.ok()) {
       return s;
     }
+    if (!cached_dict.GetValue()) {
+      return Status::Corruption("Success but no dictionary read");
+    }
   }
 
   // Do the lookup.
@@ -1537,14 +1552,20 @@ Status BlockBasedTable::LookupAndPinBlocksInCache(
 
   Statistics* statistics = rep_->ioptions.statistics.get();
 
-  BlockCreateContext create_ctx = rep_->create_context;
-  create_ctx.dict = uncompression_dict.GetValue()
-                        ? uncompression_dict.GetValue()
-                        : &UncompressionDict::GetEmptyDict();
-
-  auto cache_handle =
-      block_cache.LookupFull(key, &create_ctx, GetCachePriority<TBlocklike>(),
-                             statistics, rep_->ioptions.lowest_used_cache_tier);
+  typename BlockCacheInterface<TBlocklike>::TypedHandle* cache_handle;
+  if (cached_dict.GetValue()) {
+    // NOTE: inefficient BlockCreateContext copy for dict-aware decompressor
+    // (see TODO in block_cache.h)
+    BlockCreateContext create_ctx = rep_->create_context;
+    create_ctx.decompressor = cached_dict.GetValue()->decompressor_.get();
+    cache_handle = block_cache.LookupFull(
+        key, &create_ctx, GetCachePriority<TBlocklike>(), statistics,
+        rep_->ioptions.lowest_used_cache_tier);
+  } else {
+    cache_handle = block_cache.LookupFull(
+        key, &rep_->create_context, GetCachePriority<TBlocklike>(), statistics,
+        rep_->ioptions.lowest_used_cache_tier);
+  }
 
   if (!cache_handle) {
     UpdateCacheMissMetrics(TBlocklike::kBlockType, /* get_context = */ nullptr);
@@ -1573,7 +1594,7 @@ template <typename TBlocklike>
 WithBlocklikeCheck<Status, TBlocklike>
 BlockBasedTable::MaybeReadBlockAndLoadToCache(
     FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
-    const BlockHandle& handle, const UncompressionDict& uncompression_dict,
+    const BlockHandle& handle, UnownedPtr<Decompressor> decomp,
     bool for_compaction, CachableEntry<TBlocklike>* out_parsed_block,
     GetContext* get_context, BlockCacheLookupContext* lookup_context,
     BlockContents* contents, bool async_read,
@@ -1597,7 +1618,7 @@ BlockBasedTable::MaybeReadBlockAndLoadToCache(
     if (!contents) {
       if (use_block_cache_for_lookup) {
         s = GetDataBlockFromCache(key, block_cache, out_parsed_block,
-                                  get_context, &uncompression_dict);
+                                  get_context, decomp);
         // Value could still be null at this point, so check the cache handle
         // and update the read pattern for prefetching
         if (out_parsed_block->GetValue() ||
@@ -1627,7 +1648,7 @@ BlockBasedTable::MaybeReadBlockAndLoadToCache(
       const bool maybe_compressed =
           TBlocklike::kBlockType != BlockType::kFilter &&
           TBlocklike::kBlockType != BlockType::kCompressionDictionary &&
-          rep_->blocks_maybe_compressed;
+          rep_->decompressor;
       // This flag, if true, tells BlockFetcher to return the uncompressed
       // block when ReadBlockContents() is called.
       const bool do_uncompress = maybe_compressed;
@@ -1651,8 +1672,7 @@ BlockBasedTable::MaybeReadBlockAndLoadToCache(
         BlockFetcher block_fetcher(
             rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle,
             &tmp_contents, rep_->ioptions, do_uncompress, maybe_compressed,
-            TBlocklike::kBlockType, uncompression_dict,
-            rep_->persistent_cache_options,
+            TBlocklike::kBlockType, decomp, rep_->persistent_cache_options,
             GetMemoryAllocator(rep_->table_options),
             /*allocator=*/nullptr);
 
@@ -1667,7 +1687,7 @@ BlockBasedTable::MaybeReadBlockAndLoadToCache(
           s = block_fetcher.ReadBlockContents();
         }
 
-        contents_comp_type = block_fetcher.get_compression_type();
+        contents_comp_type = block_fetcher.compression_type();
         if (get_context) {
           switch (TBlocklike::kBlockType) {
             case BlockType::kIndex:
@@ -1699,7 +1719,7 @@ BlockBasedTable::MaybeReadBlockAndLoadToCache(
           // block in block_fetcher
           s = PutDataBlockToCache(
               key, block_cache, out_parsed_block, std::move(uncomp_contents),
-              std::move(comp_contents), contents_comp_type, uncompression_dict,
+              std::move(comp_contents), contents_comp_type, decomp,
               GetMemoryAllocator(rep_->table_options), get_context);
         }
       } else {
@@ -1715,7 +1735,7 @@ BlockBasedTable::MaybeReadBlockAndLoadToCache(
           // the block to the cache.
           s = PutDataBlockToCache(
               key, block_cache, out_parsed_block, std::move(uncomp_contents),
-              std::move(comp_contents), contents_comp_type, uncompression_dict,
+              std::move(comp_contents), contents_comp_type, decomp,
               GetMemoryAllocator(rep_->table_options), get_context);
         }
       }
@@ -1830,7 +1850,7 @@ void BlockBasedTable::FinishTraceRecord(
 template <typename TBlocklike /*, auto*/>
 WithBlocklikeCheck<Status, TBlocklike> BlockBasedTable::RetrieveBlock(
     FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
-    const BlockHandle& handle, const UncompressionDict& uncompression_dict,
+    const BlockHandle& handle, UnownedPtr<Decompressor> decomp,
     CachableEntry<TBlocklike>* out_parsed_block, GetContext* get_context,
     BlockCacheLookupContext* lookup_context, bool for_compaction,
     bool use_cache, bool async_read, bool use_block_cache_for_lookup) const {
@@ -1840,8 +1860,8 @@ WithBlocklikeCheck<Status, TBlocklike> BlockBasedTable::RetrieveBlock(
   Status s;
   if (use_cache) {
     s = MaybeReadBlockAndLoadToCache(
-        prefetch_buffer, ro, handle, uncompression_dict, for_compaction,
-        out_parsed_block, get_context, lookup_context,
+        prefetch_buffer, ro, handle, decomp, for_compaction, out_parsed_block,
+        get_context, lookup_context,
         /*contents=*/nullptr, async_read, use_block_cache_for_lookup);
 
     if (!s.ok()) {
@@ -1865,7 +1885,7 @@ WithBlocklikeCheck<Status, TBlocklike> BlockBasedTable::RetrieveBlock(
   const bool maybe_compressed =
       TBlocklike::kBlockType != BlockType::kFilter &&
       TBlocklike::kBlockType != BlockType::kCompressionDictionary &&
-      rep_->blocks_maybe_compressed;
+      rep_->decompressor;
   std::unique_ptr<TBlocklike> block;
 
   {
@@ -1874,9 +1894,9 @@ WithBlocklikeCheck<Status, TBlocklike> BlockBasedTable::RetrieveBlock(
     StopWatch sw(rep_->ioptions.clock, rep_->ioptions.stats, histogram);
     s = ReadAndParseBlockFromFile(
         rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle, &block,
-        rep_->ioptions, rep_->create_context, maybe_compressed,
-        uncompression_dict, rep_->persistent_cache_options,
-        GetMemoryAllocator(rep_->table_options), for_compaction, async_read);
+        rep_->ioptions, rep_->create_context, maybe_compressed, decomp,
+        rep_->persistent_cache_options, GetMemoryAllocator(rep_->table_options),
+        for_compaction, async_read);
 
     if (get_context) {
       switch (TBlocklike::kBlockType) {
@@ -2558,8 +2578,8 @@ Status BlockBasedTable::VerifyChecksumInBlocks(
     BlockFetcher block_fetcher(
         rep_->file.get(), &prefetch_buffer, rep_->footer, read_options, handle,
         &contents, rep_->ioptions, false /* decompress */,
-        false /*maybe_compressed*/, BlockType::kData,
-        UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options);
+        false /*maybe_compressed*/, BlockType::kData, nullptr /*decompressor*/,
+        rep_->persistent_cache_options);
     s = block_fetcher.ReadBlockContents();
     if (!s.ok()) {
       break;
@@ -2648,12 +2668,12 @@ Status BlockBasedTable::VerifyChecksumInMetaBlocks(
       // if it was checked on open.
     } else {
       // FIXME? Need to verify checksums of index and filter partitions?
-      s = BlockFetcher(
-              rep_->file.get(), nullptr /* prefetch buffer */, rep_->footer,
-              read_options, handle, &contents, rep_->ioptions,
-              false /* decompress */, false /*maybe_compressed*/,
-              GetBlockTypeForMetaBlockByName(meta_block_name),
-              UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options)
+      s = BlockFetcher(rep_->file.get(), nullptr /* prefetch buffer */,
+                       rep_->footer, read_options, handle, &contents,
+                       rep_->ioptions, false /* decompress */,
+                       false /*maybe_compressed*/,
+                       GetBlockTypeForMetaBlockByName(meta_block_name),
+                       nullptr /*decompressor*/, rep_->persistent_cache_options)
               .ReadBlockContents();
     }
     if (!s.ok()) {
@@ -3033,7 +3053,7 @@ Status BlockBasedTable::DumpTable(WritableFile* out_file) {
 
   // Output compression dictionary
   if (rep_->uncompression_dict_reader) {
-    CachableEntry<UncompressionDict> uncompression_dict;
+    CachableEntry<DecompressorDict> uncompression_dict;
     s = rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary(
         nullptr /* prefetch_buffer */, ro, nullptr /* get_context */,
         nullptr /* lookup_context */, &uncompression_dict);
diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h
index 513e517aa85a..b31ff87c4c61 100644
--- a/table/block_based/block_based_table_reader.h
+++ b/table/block_based/block_based_table_reader.h
@@ -364,7 +364,7 @@ class BlockBasedTable : public TableReader {
   template <typename TBlocklike>
   WithBlocklikeCheck<Status, TBlocklike> MaybeReadBlockAndLoadToCache(
       FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
-      const BlockHandle& handle, const UncompressionDict& uncompression_dict,
+      const BlockHandle& handle, UnownedPtr<Decompressor> decomp,
       bool for_compaction, CachableEntry<TBlocklike>* block_entry,
       GetContext* get_context, BlockCacheLookupContext* lookup_context,
       BlockContents* contents, bool async_read,
@@ -376,7 +376,7 @@ class BlockBasedTable : public TableReader {
   template <typename TBlocklike>
   WithBlocklikeCheck<Status, TBlocklike> RetrieveBlock(
       FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
-      const BlockHandle& handle, const UncompressionDict& uncompression_dict,
+      const BlockHandle& handle, UnownedPtr<Decompressor> decomp,
       CachableEntry<TBlocklike>* block_entry, GetContext* get_context,
       BlockCacheLookupContext* lookup_context, bool for_compaction,
       bool use_cache, bool async_read, bool use_block_cache_for_lookup) const;
@@ -397,7 +397,7 @@ class BlockBasedTable : public TableReader {
       const MultiGetRange* batch,
       const autovector<BlockHandle, MultiGetContext::MAX_BATCH_SIZE>* handles,
       Status* statuses, CachableEntry<Block_kData>* results, char* scratch,
-      const UncompressionDict& uncompression_dict, bool use_fs_scratch);
+      UnownedPtr<Decompressor> decomp, bool use_fs_scratch);
 
   // Get the iterator from the index reader.
   //
@@ -429,7 +429,7 @@ class BlockBasedTable : public TableReader {
   WithBlocklikeCheck<Status, TBlocklike> GetDataBlockFromCache(
       const Slice& cache_key, BlockCacheInterface<TBlocklike> block_cache,
       CachableEntry<TBlocklike>* block, GetContext* get_context,
-      const UncompressionDict* dict) const;
+      UnownedPtr<Decompressor> decomp) const;
 
   // Put a maybe compressed block to the corresponding block caches.
   // This method will perform decompression against block_contents if needed
@@ -447,8 +447,7 @@ class BlockBasedTable : public TableReader {
       CachableEntry<TBlocklike>* cached_block,
       BlockContents&& uncompressed_block_contents,
       BlockContents&& compressed_block_contents,
-      CompressionType block_comp_type,
-      const UncompressionDict& uncompression_dict,
+      CompressionType block_comp_type, UnownedPtr<Decompressor> decomp,
       MemoryAllocator* memory_allocator, GetContext* get_context) const;
 
   // Calls (*handle_result)(arg, ...) repeatedly, starting with the entry found
@@ -650,9 +649,11 @@ struct BlockBasedTable::Rep {
   Slice min_timestamp;
   Slice max_timestamp;
 
-  // If false, blocks in this file are definitely all uncompressed. Knowing this
-  // before reading individual blocks enables certain optimizations.
-  bool blocks_maybe_compressed = true;
+  // If blocks might be compressed, refers to a decompressor that can decompress
+  // them. (nullptr -> no blocks compressed)  However, if (data) blocks are
+  // dictionary compressed, a dictionary-aware decompressor is needed, which
+  // might live in the block cache.
+  std::shared_ptr<Decompressor> decompressor;
 
   // These describe how index is encoded.
   bool index_has_first_key = false;
diff --git a/table/block_based/block_based_table_reader_impl.h b/table/block_based/block_based_table_reader_impl.h
index fd0db73af1de..288d3035565f 100644
--- a/table/block_based/block_based_table_reader_impl.h
+++ b/table/block_based/block_based_table_reader_impl.h
@@ -60,34 +60,33 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator(
   }
 
   CachableEntry<Block> block;
-  if (rep_->uncompression_dict_reader && block_type == BlockType::kData) {
-    CachableEntry<UncompressionDict> uncompression_dict;
-    // For async scans, don't use the prefetch buffer since an async prefetch
-    // might already be under way and this would invalidate it. Also, the
-    // uncompression dict is typically at the end of the file and would
-    // most likely break the sequentiality of the access pattern.
-    // Same is with auto_readahead_size. It iterates over index to lookup for
-    // data blocks. And this could break the the sequentiality of the access
-    // pattern.
-    s = rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary(
-        ((ro.async_io || ro.auto_readahead_size) ? nullptr : prefetch_buffer),
-        ro, get_context, lookup_context, &uncompression_dict);
-    if (!s.ok()) {
-      iter->Invalidate(s);
-      return iter;
+  {
+    CachableEntry<DecompressorDict> dict;
+    Decompressor* decomp = rep_->decompressor.get();
+    if (rep_->uncompression_dict_reader && block_type == BlockType::kData) {
+      // For async scans, don't use the prefetch buffer since an async prefetch
+      // might already be under way and this would invalidate it. Also, the
+      // uncompression dict is typically at the end of the file and would
+      // most likely break the sequentiality of the access pattern.
+      // Same is with auto_readahead_size. It iterates over index to lookup for
+      // data blocks. And this could break the the sequentiality of the access
+      // pattern.
+      s = rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary(
+          ((ro.async_io || ro.auto_readahead_size) ? nullptr : prefetch_buffer),
+          ro, get_context, lookup_context, &dict);
+      if (!s.ok()) {
+        iter->Invalidate(s);
+        return iter;
+      }
+      assert(dict.GetValue());
+      if (dict.GetValue()) {
+        decomp = dict.GetValue()->decompressor_.get();
+      }
     }
-    const UncompressionDict& dict = uncompression_dict.GetValue()
-                                        ? *uncompression_dict.GetValue()
-                                        : UncompressionDict::GetEmptyDict();
     s = RetrieveBlock(
-        prefetch_buffer, ro, handle, dict, &block.As<IterBlocklike>(),
+        prefetch_buffer, ro, handle, decomp, &block.As<IterBlocklike>(),
         get_context, lookup_context, for_compaction,
         /* use_cache */ true, async_read, use_block_cache_for_lookup);
-  } else {
-    s = RetrieveBlock(
-        prefetch_buffer, ro, handle, UncompressionDict::GetEmptyDict(),
-        &block.As<IterBlocklike>(), get_context, lookup_context, for_compaction,
-        /* use_cache */ true, async_read, use_block_cache_for_lookup);
   }
 
   if (s.IsTryAgain() && async_read) {
diff --git a/table/block_based/block_based_table_reader_sync_and_async.h b/table/block_based/block_based_table_reader_sync_and_async.h
index 7ec152fc8e93..c6263e150d42 100644
--- a/table/block_based/block_based_table_reader_sync_and_async.h
+++ b/table/block_based/block_based_table_reader_sync_and_async.h
@@ -33,7 +33,7 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::RetrieveMultipleBlocks)
 (const ReadOptions& options, const MultiGetRange* batch,
  const autovector<BlockHandle, MultiGetContext::MAX_BATCH_SIZE>* handles,
  Status* statuses, CachableEntry<Block_kData>* results, char* scratch,
- const UncompressionDict& uncompression_dict, bool use_fs_scratch) const {
+ UnownedPtr<Decompressor> decomp, bool use_fs_scratch) const {
   RandomAccessFileReader* file = rep_->file.get();
   const Footer& footer = rep_->footer;
   const ImmutableOptions& ioptions = rep_->ioptions;
@@ -51,7 +51,7 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::RetrieveMultipleBlocks)
 
       // XXX: use_cache=true means double cache query?
       statuses[idx_in_batch] = RetrieveBlock(
-          nullptr, options, handle, uncompression_dict,
+          nullptr, options, handle, decomp,
           &results[idx_in_batch].As<Block_kData>(), mget_iter->get_context,
           /* lookup_context */ nullptr,
           /* for_compaction */ false, /* use_cache */ true,
@@ -298,7 +298,7 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::RetrieveMultipleBlocks)
         // necessary. Since we're passing the serialized block contents, it
         // will avoid looking up the block cache
         s = MaybeReadBlockAndLoadToCache(
-            nullptr, options, handle, uncompression_dict,
+            nullptr, options, handle, decomp,
             /*for_compaction=*/false, block_entry, mget_iter->get_context,
             /*lookup_context=*/nullptr, &serialized_block,
             /*async_read=*/false, /*use_block_cache_for_lookup=*/true);
@@ -320,11 +320,9 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::RetrieveMultipleBlocks)
           GetBlockCompressionType(serialized_block);
       BlockContents contents;
       if (compression_type != kNoCompression) {
-        UncompressionContext context(compression_type);
-        UncompressionInfo info(context, uncompression_dict, compression_type);
-        s = UncompressSerializedBlock(
-            info, req.result.data() + req_offset, handle.size(), &contents,
-            footer.format_version(), rep_->ioptions, memory_allocator);
+        s = DecompressSerializedBlock(
+            req.result.data() + req_offset, handle.size(), compression_type,
+            *decomp, &contents, rep_->ioptions, memory_allocator);
       } else {
         // There are two cases here:
         // 1) caller uses the shared buffer (scratch or direct io buffer);
@@ -421,10 +419,10 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::MultiGet)
     {
       MultiGetRange data_block_range(sst_file_range, sst_file_range.begin(),
                                      sst_file_range.end());
-      CachableEntry<UncompressionDict> uncompression_dict;
-      Status uncompression_dict_status;
-      uncompression_dict_status.PermitUncheckedError();
-      bool uncompression_dict_inited = false;
+      CachableEntry<DecompressorDict> dict;
+      Status dict_status;
+      dict_status.PermitUncheckedError();
+      bool dict_inited = false;
       size_t total_len = 0;
 
       // GetContext for any key will do, as the stats will be aggregated
@@ -466,26 +464,26 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::MultiGet)
             continue;
           }
 
-          if (!uncompression_dict_inited && rep_->uncompression_dict_reader) {
-            uncompression_dict_status =
-                rep_->uncompression_dict_reader
-                    ->GetOrReadUncompressionDictionary(
-                        nullptr /* prefetch_buffer */, read_options,
-                        get_context, &metadata_lookup_context,
-                        &uncompression_dict);
-            uncompression_dict_inited = true;
+          if (!dict_inited && rep_->uncompression_dict_reader) {
+            dict_status = rep_->uncompression_dict_reader
+                              ->GetOrReadUncompressionDictionary(
+                                  nullptr /* prefetch_buffer */, read_options,
+                                  get_context, &metadata_lookup_context, &dict);
+            dict_inited = true;
           }
 
-          if (!uncompression_dict_status.ok()) {
-            assert(!uncompression_dict_status.IsNotFound());
-            *(miter->s) = uncompression_dict_status;
+          if (!dict_status.ok()) {
+            assert(!dict_status.IsNotFound());
+            *(miter->s) = dict_status;
             data_block_range.SkipKey(miter);
             sst_file_range.SkipKey(miter);
             continue;
+          } else {
+            assert(!dict_inited || dict.GetValue() != nullptr);
+          }
+          if (dict.GetValue()) {
+            create_ctx.decompressor = dict.GetValue()->decompressor_.get();
           }
-          create_ctx.dict = uncompression_dict.GetValue()
-                                ? uncompression_dict.GetValue()
-                                : &UncompressionDict::GetEmptyDict();
 
           if (v.handle.offset() == prev_offset) {
             // This key can reuse the previous block (later on).
@@ -565,11 +563,8 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::MultiGet)
       if (total_len) {
         char* scratch = nullptr;
         bool use_fs_scratch = false;
-        const UncompressionDict& dict = uncompression_dict.GetValue()
-                                            ? *uncompression_dict.GetValue()
-                                            : UncompressionDict::GetEmptyDict();
-        assert(uncompression_dict_inited || !rep_->uncompression_dict_reader);
-        assert(uncompression_dict_status.ok());
+        assert(dict_inited || !rep_->uncompression_dict_reader);
+        assert(dict_status.ok());
 
         if (!rep_->file->use_direct_io()) {
           if (CheckFSFeatureSupport(rep_->ioptions.fs.get(),
@@ -589,7 +584,7 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::MultiGet)
         // 3. If blocks are compressed and no compressed block cache, use
         //    stack buf
         if (!use_fs_scratch && !rep_->file->use_direct_io() &&
-            rep_->blocks_maybe_compressed) {
+            rep_->decompressor) {
           if (total_len <= kMultiGetReadStackBufSize) {
             scratch = stack_buf;
           } else {
@@ -599,7 +594,10 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::MultiGet)
         }
         CO_AWAIT(RetrieveMultipleBlocks)
         (read_options, &data_block_range, &block_handles, &statuses[0],
-         &results[0], scratch, dict, use_fs_scratch);
+         &results[0], scratch,
+         dict.GetValue() ? dict.GetValue()->decompressor_.get()
+                         : rep_->decompressor.get(),
+         use_fs_scratch);
         if (get_context) {
           ++(get_context->get_context_stats_.num_sst_read);
         }
diff --git a/table/block_based/block_based_table_reader_test.cc b/table/block_based/block_based_table_reader_test.cc
index 4a18b6fcda84..19e652cc3ceb 100644
--- a/table/block_based/block_based_table_reader_test.cc
+++ b/table/block_based/block_based_table_reader_test.cc
@@ -190,6 +190,8 @@ class BlockBasedTableReaderBaseTest : public testing::Test {
 
     if (status) {
       *status = s;
+    } else {
+      ASSERT_OK(s);
     }
   }
 
diff --git a/table/block_based/block_cache.cc b/table/block_based/block_cache.cc
index 08f5d2158dc5..f71ea5b65041 100644
--- a/table/block_based/block_cache.cc
+++ b/table/block_based/block_cache.cc
@@ -52,10 +52,10 @@ void BlockCreateContext::Create(
       table_options->filter_policy.get(), std::move(block)));
 }
 
-void BlockCreateContext::Create(std::unique_ptr<UncompressionDict>* parsed_out,
+void BlockCreateContext::Create(std::unique_ptr<DecompressorDict>* parsed_out,
                                 BlockContents&& block) {
-  parsed_out->reset(new UncompressionDict(
-      block.data, std::move(block.allocation), using_zstd));
+  parsed_out->reset(new DecompressorDict(
+      block.data, std::move(block.allocation), *decompressor));
 }
 
 namespace {
@@ -69,7 +69,7 @@ const std::array<const Cache::CacheItemHelper*,
         BlockCacheInterface<ParsedFullFilterBlock>::GetFullHelper(),
         BlockCacheInterface<Block_kFilterPartitionIndex>::GetFullHelper(),
         nullptr,  // kProperties
-        BlockCacheInterface<UncompressionDict>::GetFullHelper(),
+        BlockCacheInterface<DecompressorDict>::GetFullHelper(),
         BlockCacheInterface<Block_kRangeDeletion>::GetFullHelper(),
         nullptr,  // kHashIndexPrefixes
         nullptr,  // kHashIndexMetadata
@@ -86,7 +86,7 @@ const std::array<const Cache::CacheItemHelper*,
         BlockCacheInterface<ParsedFullFilterBlock>::GetBasicHelper(),
         BlockCacheInterface<Block_kFilterPartitionIndex>::GetBasicHelper(),
         nullptr,  // kProperties
-        BlockCacheInterface<UncompressionDict>::GetBasicHelper(),
+        BlockCacheInterface<DecompressorDict>::GetBasicHelper(),
         BlockCacheInterface<Block_kRangeDeletion>::GetBasicHelper(),
         nullptr,  // kHashIndexPrefixes
         nullptr,  // kHashIndexMetadata
diff --git a/table/block_based/block_cache.h b/table/block_based/block_cache.h
index d48a88f07137..2827e0a8ae87 100644
--- a/table/block_based/block_cache.h
+++ b/table/block_based/block_cache.h
@@ -71,15 +71,16 @@ struct BlockCreateContext : public Cache::CreateContext {
   BlockCreateContext() {}
   BlockCreateContext(const BlockBasedTableOptions* _table_options,
                      const ImmutableOptions* _ioptions, Statistics* _statistics,
-                     bool _using_zstd, uint8_t _protection_bytes_per_key,
+                     Decompressor* _decompressor,
+                     uint8_t _protection_bytes_per_key,
                      const Comparator* _raw_ucmp,
                      bool _index_value_is_full = false,
                      bool _index_has_first_key = false)
       : table_options(_table_options),
         ioptions(_ioptions),
         statistics(_statistics),
+        decompressor(_decompressor),
         raw_ucmp(_raw_ucmp),
-        using_zstd(_using_zstd),
         protection_bytes_per_key(_protection_bytes_per_key),
         index_value_is_full(_index_value_is_full),
         index_has_first_key(_index_has_first_key) {}
@@ -87,10 +88,9 @@ struct BlockCreateContext : public Cache::CreateContext {
   const BlockBasedTableOptions* table_options = nullptr;
   const ImmutableOptions* ioptions = nullptr;
   Statistics* statistics = nullptr;
+  // TODO: refactor to avoid copying BlockCreateContext for dict in block cache
+  Decompressor* decompressor = nullptr;
   const Comparator* raw_ucmp = nullptr;
-  const UncompressionDict* dict = nullptr;
-  uint32_t format_version;
-  bool using_zstd = false;
   uint8_t protection_bytes_per_key = 0;
   bool index_value_is_full;
   bool index_has_first_key;
@@ -102,12 +102,10 @@ struct BlockCreateContext : public Cache::CreateContext {
                      CompressionType type, MemoryAllocator* alloc) {
     BlockContents uncompressed_block_contents;
     if (type != CompressionType::kNoCompression) {
-      assert(dict != nullptr);
-      UncompressionContext context(type);
-      UncompressionInfo info(context, *dict, type);
-      Status s = UncompressBlockData(
-          info, data.data(), data.size(), &uncompressed_block_contents,
-          table_options->format_version, *ioptions, alloc);
+      assert(decompressor != nullptr);
+      Status s =
+          DecompressBlockData(data.data(), data.size(), type, *decompressor,
+                              &uncompressed_block_contents, *ioptions, alloc);
       if (!s.ok()) {
         parsed_out->reset();
         return;
@@ -130,7 +128,7 @@ struct BlockCreateContext : public Cache::CreateContext {
               BlockContents&& block);
   void Create(std::unique_ptr<ParsedFullFilterBlock>* parsed_out,
               BlockContents&& block);
-  void Create(std::unique_ptr<UncompressionDict>* parsed_out,
+  void Create(std::unique_ptr<DecompressorDict>* parsed_out,
               BlockContents&& block);
 };
 
diff --git a/table/block_based/block_test.cc b/table/block_based/block_test.cc
index b1a855263daa..5fcb0964da53 100644
--- a/table/block_based/block_test.cc
+++ b/table/block_based/block_test.cc
@@ -833,6 +833,19 @@ class BlockPerKVChecksumTest : public DBTestBase {
   }
 };
 
+namespace {
+const BlockBasedTableOptions *kTableOptions() {
+  static BlockBasedTableOptions opts{};
+  return &opts;
+}
+Decompressor *kDecompressor() {
+  static auto mgr = GetBuiltinCompressionManager(
+      GetCompressFormatForVersion(kTableOptions()->format_version));
+  static auto decomp = mgr->GetDecompressor();
+  return decomp.get();
+}
+}  // namespace
+
 TEST_F(BlockPerKVChecksumTest, EmptyBlock) {
   // Tests that empty block code path is not broken by per kv checksum.
   BlockBuilder builder(
@@ -845,14 +858,11 @@ TEST_F(BlockPerKVChecksumTest, EmptyBlock) {
 
   std::unique_ptr<Block_kData> data_block;
   Options options = Options();
-  BlockBasedTableOptions tbo;
   uint8_t protection_bytes_per_key = 8;
-  BlockCreateContext create_context{&tbo,
-                                    nullptr,
-                                    nullptr /* statistics */,
-                                    false /* using_zstd */,
-                                    protection_bytes_per_key,
-                                    options.comparator};
+  BlockCreateContext create_context{
+      kTableOptions(),          nullptr,
+      nullptr /* statistics */, kDecompressor(),
+      protection_bytes_per_key, options.comparator};
   create_context.Create(&data_block, std::move(contents));
   std::unique_ptr<DataBlockIter> biter{data_block->NewDataIterator(
       options.comparator, kDisableGlobalSequenceNumber)};
@@ -885,14 +895,10 @@ TEST_F(BlockPerKVChecksumTest, InitializeProtectionInfo) {
   // Make sure that the checksum construction code path does not break
   // when the block is itself already corrupted.
   Options options = Options();
-  BlockBasedTableOptions tbo;
   uint8_t protection_bytes_per_key = 8;
-  BlockCreateContext create_context{&tbo,
-                                    nullptr /* ioptions */,
-                                    nullptr /* statistics */,
-                                    false /* using_zstd */,
-                                    protection_bytes_per_key,
-                                    options.comparator};
+  BlockCreateContext create_context{
+      kTableOptions(), nullptr /* ioptions */,   nullptr /* statistics */,
+      kDecompressor(), protection_bytes_per_key, options.comparator};
 
   {
     std::string invalid_content = "1";
@@ -950,20 +956,19 @@ TEST_F(BlockPerKVChecksumTest, ApproximateMemory) {
   };
 
   Options options = Options();
-  BlockBasedTableOptions tbo;
   uint8_t protection_bytes_per_key = 8;
   BlockCreateContext with_checksum_create_context{
-      &tbo,
+      kTableOptions(),
       nullptr /* ioptions */,
       nullptr /* statistics */,
-      false /* using_zstd */,
+      kDecompressor(),
       protection_bytes_per_key,
       options.comparator,
       true /* index_value_is_full */};
-  BlockCreateContext create_context{&tbo,
+  BlockCreateContext create_context{kTableOptions(),
                                     nullptr /* ioptions */,
                                     nullptr /* statistics */,
-                                    false /* using_zstd */,
+                                    kDecompressor(),
                                     0,
                                     options.comparator,
                                     true /* index_value_is_full */};
@@ -1054,13 +1059,9 @@ class DataBlockKVChecksumTest
   std::unique_ptr<Block_kData> GenerateDataBlock(
       std::vector<std::string> &keys, std::vector<std::string> &values,
       int num_record) {
-    BlockBasedTableOptions tbo;
-    BlockCreateContext create_context{&tbo,
-                                      nullptr /* statistics */,
-                                      nullptr /* ioptions */,
-                                      false /* using_zstd */,
-                                      GetChecksumLen(),
-                                      Options().comparator};
+    BlockCreateContext create_context{
+        kTableOptions(), nullptr /* statistics */, nullptr /* ioptions */,
+        kDecompressor(), GetChecksumLen(),         Options().comparator};
     builder_ = std::make_unique<BlockBuilder>(
         static_cast<int>(GetRestartInterval()),
         GetUseDeltaEncoding() /* use_delta_encoding */,
@@ -1181,13 +1182,12 @@ class IndexBlockKVChecksumTest
       std::vector<BlockHandle> &block_handles,
       std::vector<std::string> &first_keys, int num_record) {
     Options options = Options();
-    BlockBasedTableOptions tbo;
     uint8_t protection_bytes_per_key = GetChecksumLen();
     BlockCreateContext create_context{
-        &tbo,
+        kTableOptions(),
         nullptr /* ioptions */,
         nullptr /* statistics */,
-        false /* _using_zstd */,
+        kDecompressor(),
         protection_bytes_per_key,
         options.comparator,
         !UseValueDeltaEncoding() /* value_is_full */,
@@ -1324,14 +1324,10 @@ class MetaIndexBlockKVChecksumTest
       std::vector<std::string> &keys, std::vector<std::string> &values,
       int num_record) {
     Options options = Options();
-    BlockBasedTableOptions tbo;
     uint8_t protection_bytes_per_key = GetChecksumLen();
-    BlockCreateContext create_context{&tbo,
-                                      nullptr /* ioptions */,
-                                      nullptr /* statistics */,
-                                      false /* using_zstd */,
-                                      protection_bytes_per_key,
-                                      options.comparator};
+    BlockCreateContext create_context{
+        kTableOptions(), nullptr /* ioptions */,   nullptr /* statistics */,
+        kDecompressor(), protection_bytes_per_key, options.comparator};
     builder_ =
         std::make_unique<BlockBuilder>(static_cast<int>(GetRestartInterval()));
     // add a bunch of records to a block
@@ -1359,14 +1355,10 @@ INSTANTIATE_TEST_CASE_P(P, MetaIndexBlockKVChecksumTest,
 
 TEST_P(MetaIndexBlockKVChecksumTest, ChecksumConstructionAndVerification) {
   Options options = Options();
-  BlockBasedTableOptions tbo;
   uint8_t protection_bytes_per_key = GetChecksumLen();
-  BlockCreateContext create_context{&tbo,
-                                    nullptr /* ioptions */,
-                                    nullptr /* statistics */,
-                                    false /* using_zstd */,
-                                    protection_bytes_per_key,
-                                    options.comparator};
+  BlockCreateContext create_context{
+      kTableOptions(), nullptr /* ioptions */,   nullptr /* statistics */,
+      kDecompressor(), protection_bytes_per_key, options.comparator};
   std::vector<int> num_restart_intervals = {1, 16};
   for (const auto num_restart_interval : num_restart_intervals) {
     const int kNumRecords = num_restart_interval * GetRestartInterval();
diff --git a/table/block_based/filter_block_reader_common.cc b/table/block_based/filter_block_reader_common.cc
index 343e9406b571..32c43ac09f3c 100644
--- a/table/block_based/filter_block_reader_common.cc
+++ b/table/block_based/filter_block_reader_common.cc
@@ -30,8 +30,7 @@ Status FilterBlockReaderCommon<TBlocklike>::ReadFilterBlock(
 
   const Status s = table->RetrieveBlock(
       prefetch_buffer, read_options, rep->filter_handle,
-      UncompressionDict::GetEmptyDict(), filter_block, get_context,
-      lookup_context,
+      /* decomp */ nullptr, filter_block, get_context, lookup_context,
       /* for_compaction */ false, use_cache,
       /* async_read */ false, /* use_block_cache_for_lookup */ true);
 
diff --git a/table/block_based/hash_index_reader.cc b/table/block_based/hash_index_reader.cc
index 2cf67367b998..1a6c0aeb0f06 100644
--- a/table/block_based/hash_index_reader.cc
+++ b/table/block_based/hash_index_reader.cc
@@ -76,8 +76,8 @@ Status HashIndexReader::Create(const BlockBasedTable* table,
   BlockFetcher prefixes_block_fetcher(
       file, prefetch_buffer, footer, ro, prefixes_handle, &prefixes_contents,
       ioptions, true /*decompress*/, true /*maybe_compressed*/,
-      BlockType::kHashIndexPrefixes, UncompressionDict::GetEmptyDict(),
-      cache_options, memory_allocator);
+      BlockType::kHashIndexPrefixes, rep->decompressor.get(), cache_options,
+      memory_allocator);
   s = prefixes_block_fetcher.ReadBlockContents();
   if (!s.ok()) {
     return s;
@@ -87,7 +87,7 @@ Status HashIndexReader::Create(const BlockBasedTable* table,
       file, prefetch_buffer, footer, ro, prefixes_meta_handle,
       &prefixes_meta_contents, ioptions, true /*decompress*/,
       true /*maybe_compressed*/, BlockType::kHashIndexMetadata,
-      UncompressionDict::GetEmptyDict(), cache_options, memory_allocator);
+      rep->decompressor.get(), cache_options, memory_allocator);
   s = prefixes_meta_block_fetcher.ReadBlockContents();
   if (!s.ok()) {
     // TODO: log error
diff --git a/table/block_based/index_reader_common.cc b/table/block_based/index_reader_common.cc
index 2c0b480e2f3f..6b0a6ab71dce 100644
--- a/table/block_based/index_reader_common.cc
+++ b/table/block_based/index_reader_common.cc
@@ -26,9 +26,9 @@ Status BlockBasedTable::IndexReaderCommon::ReadIndexBlock(
   assert(rep != nullptr);
 
   const Status s = table->RetrieveBlock(
-      prefetch_buffer, read_options, rep->index_handle,
-      UncompressionDict::GetEmptyDict(), &index_block->As<Block_kIndex>(),
-      get_context, lookup_context, /* for_compaction */ false, use_cache,
+      prefetch_buffer, read_options, rep->index_handle, rep->decompressor.get(),
+      &index_block->As<Block_kIndex>(), get_context, lookup_context,
+      /* for_compaction */ false, use_cache,
       /* async_read */ false, /* use_block_cache_for_lookup */ true);
 
   return s;
diff --git a/table/block_based/partitioned_filter_block.cc b/table/block_based/partitioned_filter_block.cc
index ce0b691a47f3..a554364e50da 100644
--- a/table/block_based/partitioned_filter_block.cc
+++ b/table/block_based/partitioned_filter_block.cc
@@ -413,8 +413,7 @@ Status PartitionedFilterBlockReader::GetFilterPartitionBlock(
 
   const Status s = table()->RetrieveBlock(
       prefetch_buffer, read_options, fltr_blk_handle,
-      UncompressionDict::GetEmptyDict(), filter_block, get_context,
-      lookup_context,
+      /* decomp */ nullptr, filter_block, get_context, lookup_context,
       /* for_compaction */ false, /* use_cache */ true,
       /* async_read */ false, /* use_block_cache_for_lookup */ true);
 
@@ -610,7 +609,7 @@ Status PartitionedFilterBlockReader::CacheDependencies(
     // filter blocks
     s = table()->MaybeReadBlockAndLoadToCache(
         prefetch_buffer ? prefetch_buffer.get() : tail_prefetch_buffer, ro,
-        handle, UncompressionDict::GetEmptyDict(),
+        handle, /* dict */ nullptr,
         /* for_compaction */ false, &block, nullptr /* get_context */,
         &lookup_context, nullptr /* contents */, false,
         /* use_block_cache_for_lookup */ true);
diff --git a/table/block_based/partitioned_index_reader.cc b/table/block_based/partitioned_index_reader.cc
index 04c73ba0bbec..da3f3658da59 100644
--- a/table/block_based/partitioned_index_reader.cc
+++ b/table/block_based/partitioned_index_reader.cc
@@ -190,7 +190,7 @@ Status PartitionIndexReader::CacheDependencies(
     // filter blocks
     Status s = table()->MaybeReadBlockAndLoadToCache(
         prefetch_buffer ? prefetch_buffer.get() : tail_prefetch_buffer, ro,
-        handle, UncompressionDict::GetEmptyDict(),
+        handle, rep->decompressor.get(),
         /*for_compaction=*/false, &block.As<Block_kIndex>(),
         /*get_context=*/nullptr, &lookup_context, /*contents=*/nullptr,
         /*async_read=*/false, /*use_block_cache_for_lookup=*/true);
diff --git a/table/block_based/uncompression_dict_reader.cc b/table/block_based/uncompression_dict_reader.cc
index b7c9e02f01ba..2a6b25aaa5ee 100644
--- a/table/block_based/uncompression_dict_reader.cc
+++ b/table/block_based/uncompression_dict_reader.cc
@@ -23,7 +23,7 @@ Status UncompressionDictReader::Create(
   assert(!pin || prefetch);
   assert(uncompression_dict_reader);
 
-  CachableEntry<UncompressionDict> uncompression_dict;
+  CachableEntry<DecompressorDict> uncompression_dict;
   if (prefetch || !use_cache) {
     const Status s = ReadUncompressionDictionary(
         table, prefetch_buffer, ro, use_cache, nullptr /* get_context */,
@@ -47,7 +47,7 @@ Status UncompressionDictReader::ReadUncompressionDictionary(
     const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer,
     const ReadOptions& read_options, bool use_cache, GetContext* get_context,
     BlockCacheLookupContext* lookup_context,
-    CachableEntry<UncompressionDict>* uncompression_dict) {
+    CachableEntry<DecompressorDict>* uncompression_dict) {
   // TODO: add perf counter for compression dictionary read time
 
   assert(table);
@@ -60,8 +60,7 @@ Status UncompressionDictReader::ReadUncompressionDictionary(
 
   const Status s = table->RetrieveBlock(
       prefetch_buffer, read_options, rep->compression_dict_handle,
-      UncompressionDict::GetEmptyDict(), uncompression_dict, get_context,
-      lookup_context,
+      /* decomp */ nullptr, uncompression_dict, get_context, lookup_context,
       /* for_compaction */ false, use_cache,
       /* async_read */ false, /* use_block_cache_for_lookup */ true);
 
@@ -79,7 +78,7 @@ Status UncompressionDictReader::ReadUncompressionDictionary(
 Status UncompressionDictReader::GetOrReadUncompressionDictionary(
     FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
     GetContext* get_context, BlockCacheLookupContext* lookup_context,
-    CachableEntry<UncompressionDict>* uncompression_dict) const {
+    CachableEntry<DecompressorDict>* uncompression_dict) const {
   assert(uncompression_dict);
 
   if (!uncompression_dict_.IsEmpty()) {
diff --git a/table/block_based/uncompression_dict_reader.h b/table/block_based/uncompression_dict_reader.h
index b5d64dbf1458..30ec81482b6e 100644
--- a/table/block_based/uncompression_dict_reader.h
+++ b/table/block_based/uncompression_dict_reader.h
@@ -34,13 +34,13 @@ class UncompressionDictReader {
   Status GetOrReadUncompressionDictionary(
       FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
       GetContext* get_context, BlockCacheLookupContext* lookup_context,
-      CachableEntry<UncompressionDict>* uncompression_dict) const;
+      CachableEntry<DecompressorDict>* uncompression_dict) const;
 
   size_t ApproximateMemoryUsage() const;
 
  private:
   UncompressionDictReader(const BlockBasedTable* t,
-                          CachableEntry<UncompressionDict>&& uncompression_dict)
+                          CachableEntry<DecompressorDict>&& uncompression_dict)
       : table_(t), uncompression_dict_(std::move(uncompression_dict)) {
     assert(table_);
   }
@@ -51,10 +51,10 @@ class UncompressionDictReader {
       const BlockBasedTable* table, FilePrefetchBuffer* prefetch_buffer,
       const ReadOptions& read_options, bool use_cache, GetContext* get_context,
       BlockCacheLookupContext* lookup_context,
-      CachableEntry<UncompressionDict>* uncompression_dict);
+      CachableEntry<DecompressorDict>* uncompression_dict);
 
   const BlockBasedTable* table_;
-  CachableEntry<UncompressionDict> uncompression_dict_;
+  CachableEntry<DecompressorDict> uncompression_dict_;
 };
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/table/block_fetcher.cc b/table/block_fetcher.cc
index d0ccc2a70e81..af564063ca4e 100644
--- a/table/block_fetcher.cc
+++ b/table/block_fetcher.cc
@@ -42,11 +42,11 @@ inline void BlockFetcher::ProcessTrailerIfPresent() {
         RecordTick(ioptions_.stats, BLOCK_CHECKSUM_MISMATCH_COUNT);
       }
     }
-    compression_type_ =
+    compression_type() =
         BlockBasedTable::GetBlockCompressionType(slice_.data(), block_size_);
   } else {
     // E.g. plain table or cuckoo table
-    compression_type_ = kNoCompression;
+    compression_type() = kNoCompression;
   }
 }
 
@@ -195,7 +195,7 @@ inline void BlockFetcher::CopyBufferToCompressedBuf() {
 }
 
 // Before - Entering this method means the block is uncompressed or do not need
-// to be uncompressed.
+// to be decompressed.
 //
 // The block can be in one of the following buffers:
 // 1. prefetch buffer if prefetch is enabled and the block is prefetched before
@@ -219,14 +219,14 @@ inline void BlockFetcher::GetBlockContents() {
     if (got_from_prefetch_buffer_ || used_buf_ == &stack_buf_[0]) {
       CopyBufferToHeapBuf();
     } else if (used_buf_ == compressed_buf_.get()) {
-      if (compression_type_ == kNoCompression &&
+      if (compression_type() == kNoCompression &&
           memory_allocator_ != memory_allocator_compressed_) {
         CopyBufferToHeapBuf();
       } else {
         heap_buf_ = std::move(compressed_buf_);
       }
     } else if (direct_io_buf_.get() != nullptr || use_fs_scratch_) {
-      if (compression_type_ == kNoCompression) {
+      if (compression_type() == kNoCompression) {
         CopyBufferToHeapBuf();
       } else {
         CopyBufferToCompressedBuf();
@@ -241,8 +241,8 @@ inline void BlockFetcher::GetBlockContents() {
 }
 
 // Read a block from the file and verify its checksum. Upon return, io_status_
-// will be updated with the status of the read, and slice_ will be updated
-// with a pointer to the data.
+// will be updated with the status of the read, and slice_ will be
+// updated with a pointer to the data.
 void BlockFetcher::ReadBlock(bool retry) {
   FSReadRequest read_req;
   IOOptions opts;
@@ -283,9 +283,10 @@ void BlockFetcher::ReadBlock(bool retry) {
           block_read_cpu_time,
           ioptions_.env ? ioptions_.env->GetSystemClock().get() : nullptr);
 
-      io_status_ = file_->Read(
-          opts, handle_.offset(), /*size*/ block_size_with_trailer_,
-          /*result*/ &slice_, /*scratch*/ used_buf_, /*aligned_buf=*/nullptr);
+      io_status_ =
+          file_->Read(opts, handle_.offset(), /*size*/ block_size_with_trailer_,
+                      /*result*/ &slice_, /*scratch*/ used_buf_,
+                      /*aligned_buf=*/nullptr);
       PERF_COUNTER_ADD(block_read_count, 1);
 #ifndef NDEBUG
       if (slice_.data() == &stack_buf_[0]) {
@@ -357,7 +358,7 @@ void BlockFetcher::ReadBlock(bool retry) {
 
 IOStatus BlockFetcher::ReadBlockContents() {
   if (TryGetUncompressBlockFromPersistentCache()) {
-    compression_type_ = kNoCompression;
+    compression_type() = kNoCompression;
 #ifndef NDEBUG
     contents_->has_trailer = footer_.GetBlockTrailerSize() > 0;
 #endif  // NDEBUG
@@ -385,19 +386,16 @@ IOStatus BlockFetcher::ReadBlockContents() {
     }
   }
 
-  if (do_uncompress_ && compression_type_ != kNoCompression) {
+  if (do_uncompress_ && compression_type() != kNoCompression) {
     PERF_TIMER_GUARD(block_decompress_time);
-    // compressed page, uncompress, update cache
-    UncompressionContext context(compression_type_);
-    UncompressionInfo info(context, uncompression_dict_, compression_type_);
-    io_status_ = status_to_io_status(UncompressSerializedBlock(
-        info, slice_.data(), block_size_, contents_, footer_.format_version(),
-        ioptions_, memory_allocator_));
+    // Process the compressed block without trailer
+    slice_.size_ = block_size_;
+    decomp_args_.compressed_data = slice_;
+    io_status_ = status_to_io_status(DecompressSerializedBlock(
+        decomp_args_, *decompressor_, contents_, ioptions_, memory_allocator_));
 #ifndef NDEBUG
     num_heap_buf_memcpy_++;
 #endif
-    // Save the compressed block without trailer
-    slice_ = Slice(slice_.data(), block_size_);
   } else {
     GetBlockContents();
     slice_ = Slice();
@@ -410,7 +408,7 @@ IOStatus BlockFetcher::ReadBlockContents() {
 
 IOStatus BlockFetcher::ReadAsyncBlockContents() {
   if (TryGetUncompressBlockFromPersistentCache()) {
-    compression_type_ = kNoCompression;
+    compression_type() = kNoCompression;
 #ifndef NDEBUG
     contents_->has_trailer = footer_.GetBlockTrailerSize() > 0;
 #endif  // NDEBUG
@@ -442,15 +440,14 @@ IOStatus BlockFetcher::ReadAsyncBlockContents() {
         }
         used_buf_ = const_cast<char*>(slice_.data());
 
-        if (do_uncompress_ && compression_type_ != kNoCompression) {
+        if (do_uncompress_ && compression_type() != kNoCompression) {
           PERF_TIMER_GUARD(block_decompress_time);
-          // compressed page, uncompress, update cache
-          UncompressionContext context(compression_type_);
-          UncompressionInfo info(context, uncompression_dict_,
-                                 compression_type_);
-          io_status_ = status_to_io_status(UncompressSerializedBlock(
-              info, slice_.data(), block_size_, contents_,
-              footer_.format_version(), ioptions_, memory_allocator_));
+          // Process the compressed block without trailer
+          slice_.size_ = block_size_;
+          decomp_args_.compressed_data = slice_;
+          io_status_ = status_to_io_status(
+              DecompressSerializedBlock(decomp_args_, *decompressor_, contents_,
+                                        ioptions_, memory_allocator_));
 #ifndef NDEBUG
           num_heap_buf_memcpy_++;
 #endif
diff --git a/table/block_fetcher.h b/table/block_fetcher.h
index 9441e0a73cae..9360429fab25 100644
--- a/table/block_fetcher.h
+++ b/table/block_fetcher.h
@@ -46,7 +46,7 @@ class BlockFetcher {
                BlockContents* contents,
                const ImmutableOptions& ioptions /* ref retained */,
                bool do_uncompress, bool maybe_compressed, BlockType block_type,
-               const UncompressionDict& uncompression_dict /* ref retained */,
+               UnownedPtr<Decompressor> decompressor,
                const PersistentCacheOptions& cache_options /* ref retained */,
                MemoryAllocator* memory_allocator = nullptr,
                MemoryAllocator* memory_allocator_compressed = nullptr,
@@ -63,7 +63,7 @@ class BlockFetcher {
         block_type_(block_type),
         block_size_(static_cast<size_t>(handle_.size())),
         block_size_with_trailer_(block_size_ + footer.GetBlockTrailerSize()),
-        uncompression_dict_(uncompression_dict),
+        decompressor_(decompressor),
         cache_options_(cache_options),
         memory_allocator_(memory_allocator),
         memory_allocator_compressed_(memory_allocator_compressed),
@@ -81,14 +81,17 @@ class BlockFetcher {
   IOStatus ReadBlockContents();
   IOStatus ReadAsyncBlockContents();
 
-  inline CompressionType get_compression_type() const {
-    return compression_type_;
+  inline CompressionType compression_type() const {
+    return decomp_args_.compression_type;
+  }
+  inline CompressionType& compression_type() {
+    return decomp_args_.compression_type;
   }
   inline size_t GetBlockSizeWithTrailer() const {
     return block_size_with_trailer_;
   }
   inline Slice& GetCompressedBlock() {
-    assert(compression_type_ != kNoCompression);
+    assert(compression_type() != kNoCompression);
     return slice_;
   }
 
@@ -121,7 +124,7 @@ class BlockFetcher {
   const BlockType block_type_;
   const size_t block_size_;
   const size_t block_size_with_trailer_;
-  const UncompressionDict& uncompression_dict_;
+  UnownedPtr<Decompressor> decompressor_;
   const PersistentCacheOptions& cache_options_;
   MemoryAllocator* memory_allocator_;
   MemoryAllocator* memory_allocator_compressed_;
@@ -133,11 +136,11 @@ class BlockFetcher {
   CacheAllocationPtr compressed_buf_;
   char stack_buf_[kDefaultStackBufferSize];
   bool got_from_prefetch_buffer_ = false;
-  CompressionType compression_type_;
   bool for_compaction_ = false;
   bool use_fs_scratch_ = false;
   bool retry_corrupt_read_ = false;
   FSAllocationPtr fs_buf_;
+  Decompressor::Args decomp_args_;
 
   // return true if found
   bool TryGetUncompressBlockFromPersistentCache();
diff --git a/table/block_fetcher_test.cc b/table/block_fetcher_test.cc
index 17310edec6ae..0b1fa6c5a68e 100644
--- a/table/block_fetcher_test.cc
+++ b/table/block_fetcher_test.cc
@@ -319,10 +319,12 @@ class BlockFetcherTest : public testing::Test {
     PersistentCacheOptions persistent_cache_options;
     Footer footer;
     ReadFooter(file, &footer);
+    auto mgr = GetBuiltinCompressionManager(
+        GetCompressFormatForVersion(footer.format_version()));
     std::unique_ptr<BlockFetcher> fetcher(new BlockFetcher(
         file, nullptr /* prefetch_buffer */, footer, roptions, block, contents,
         ioptions, do_uncompress, compressed, block_type,
-        UncompressionDict::GetEmptyDict(), persistent_cache_options,
+        mgr->GetDecompressor().get(), persistent_cache_options,
         heap_buf_allocator, compressed_buf_allocator));
 
     ASSERT_OK(fetcher->ReadBlockContents());
@@ -335,7 +337,7 @@ class BlockFetcherTest : public testing::Test {
     if (do_uncompress) {
       *compression_type = kNoCompression;
     } else {
-      *compression_type = fetcher->get_compression_type();
+      *compression_type = fetcher->compression_type();
     }
   }
 
diff --git a/table/format.cc b/table/format.cc
index 46de42fbe9e2..7164044eed64 100644
--- a/table/format.cc
+++ b/table/format.cc
@@ -653,70 +653,81 @@ uint32_t ComputeBuiltinChecksumWithLastByte(ChecksumType type, const char* data,
   }
 }
 
-Status UncompressBlockData(const UncompressionInfo& uncompression_info,
-                           const char* data, size_t size,
-                           BlockContents* out_contents, uint32_t format_version,
+Status DecompressBlockData(Decompressor::Args& args, Decompressor& decompressor,
+                           BlockContents* out_contents,
                            const ImmutableOptions& ioptions,
                            MemoryAllocator* allocator) {
-  Status ret = Status::OK();
-
-  assert(uncompression_info.type() != kNoCompression &&
-         "Invalid compression type");
+  assert(args.compression_type != kNoCompression && "Invalid compression type");
 
   StopWatchNano timer(ioptions.clock,
                       ShouldReportDetailedTime(ioptions.env, ioptions.stats));
-  size_t uncompressed_size = 0;
-  const char* error_msg = nullptr;
-  CacheAllocationPtr ubuf = UncompressData(
-      uncompression_info, data, size, &uncompressed_size,
-      GetCompressFormatForVersion(format_version), allocator, &error_msg);
-  if (!ubuf) {
-    if (!CompressionTypeSupported(uncompression_info.type())) {
-      ret = Status::NotSupported(
-          "Unsupported compression method for this build",
-          CompressionTypeToString(uncompression_info.type()));
-    } else {
-      std::ostringstream oss;
-      oss << "Corrupted compressed block contents";
-      if (error_msg) {
-        oss << ": " << error_msg;
-      }
-      ret = Status::Corruption(
-          oss.str(), CompressionTypeToString(uncompression_info.type()));
-    }
-    return ret;
+
+  Status s = decompressor.ExtractUncompressedSize(args);
+  if (UNLIKELY(!s.ok())) {
+    return s;
+  }
+  CacheAllocationPtr ubuf = AllocateBlock(args.uncompressed_size, allocator);
+  s = decompressor.DecompressBlock(args, ubuf.get());
+  if (UNLIKELY(!s.ok())) {
+    return s;
   }
 
-  *out_contents = BlockContents(std::move(ubuf), uncompressed_size);
+  *out_contents = BlockContents(std::move(ubuf), args.uncompressed_size);
 
   if (ShouldReportDetailedTime(ioptions.env, ioptions.stats)) {
     RecordTimeToHistogram(ioptions.stats, DECOMPRESSION_TIMES_NANOS,
                           timer.ElapsedNanos());
   }
-  RecordTick(ioptions.stats, BYTES_DECOMPRESSED_FROM, size);
+  RecordTick(ioptions.stats, BYTES_DECOMPRESSED_FROM,
+             args.compressed_data.size());
   RecordTick(ioptions.stats, BYTES_DECOMPRESSED_TO, out_contents->data.size());
   RecordTick(ioptions.stats, NUMBER_BLOCK_DECOMPRESSED);
 
-  TEST_SYNC_POINT_CALLBACK("UncompressBlockData:TamperWithReturnValue",
-                           static_cast<void*>(&ret));
-  TEST_SYNC_POINT_CALLBACK(
-      "UncompressBlockData:"
-      "TamperWithDecompressionOutput",
-      static_cast<void*>(out_contents));
+  TEST_SYNC_POINT_CALLBACK("DecompressBlockData:TamperWithReturnValue",
+                           static_cast<void*>(&s));
+  TEST_SYNC_POINT_CALLBACK("DecompressBlockData:TamperWithDecompressionOutput",
+                           static_cast<void*>(out_contents));
 
-  return ret;
+  return s;
 }
 
-Status UncompressSerializedBlock(const UncompressionInfo& uncompression_info,
-                                 const char* data, size_t size,
+Status DecompressBlockData(const char* data, size_t size, CompressionType type,
+                           Decompressor& decompressor,
+                           BlockContents* out_contents,
+                           const ImmutableOptions& ioptions,
+                           MemoryAllocator* allocator,
+                           Decompressor::ManagedWorkingArea* working_area) {
+  Decompressor::Args args;
+  args.compressed_data = Slice(data, size);
+  args.compression_type = type;
+  args.working_area = working_area;
+  return DecompressBlockData(args, decompressor, out_contents, ioptions,
+                             allocator);
+}
+
+Status DecompressSerializedBlock(const char* data, size_t size,
+                                 CompressionType type,
+                                 Decompressor& decompressor,
                                  BlockContents* out_contents,
-                                 uint32_t format_version,
                                  const ImmutableOptions& ioptions,
                                  MemoryAllocator* allocator) {
   assert(data[size] != kNoCompression);
-  assert(data[size] == static_cast<char>(uncompression_info.type()));
-  return UncompressBlockData(uncompression_info, data, size, out_contents,
-                             format_version, ioptions, allocator);
+  assert(data[size] == static_cast<char>(type));
+  return DecompressBlockData(data, size, type, decompressor, out_contents,
+                             ioptions, allocator);
+}
+
+Status DecompressSerializedBlock(Decompressor::Args& args,
+                                 Decompressor& decompressor,
+                                 BlockContents* out_contents,
+                                 const ImmutableOptions& ioptions,
+                                 MemoryAllocator* allocator) {
+  assert(args.compressed_data.data()[args.compressed_data.size()] !=
+         kNoCompression);
+  assert(args.compressed_data.data()[args.compressed_data.size()] ==
+         static_cast<char>(args.compression_type));
+  return DecompressBlockData(args, decompressor, out_contents, ioptions,
+                             allocator);
 }
 
 // Replace the contents of db_host_id with the actual hostname, if db_host_id
diff --git a/table/format.h b/table/format.h
index 5bf1077866fd..5737c2cd2684 100644
--- a/table/format.h
+++ b/table/format.h
@@ -417,21 +417,30 @@ struct BlockContents {
 // The `data` points to serialized block contents read in from file, which
 // must be compressed and include a trailer beyond `size`. A new buffer is
 // allocated with the given allocator (or default) and the uncompressed
-// contents are returned in `out_contents`.
-// format_version is as defined in include/rocksdb/table.h, which is
-// used to determine compression format version.
-Status UncompressSerializedBlock(const UncompressionInfo& info,
-                                 const char* data, size_t size,
+// contents are returned in `out_contents`. Statistics updated.
+Status DecompressSerializedBlock(const char* data, size_t size,
+                                 CompressionType type,
+                                 Decompressor& decompressor,
                                  BlockContents* out_contents,
-                                 uint32_t format_version,
                                  const ImmutableOptions& ioptions,
                                  MemoryAllocator* allocator = nullptr);
 
-// This is a variant of UncompressSerializedBlock that does not expect a
-// block trailer beyond `size`. (CompressionType is taken from `info`.)
-Status UncompressBlockData(const UncompressionInfo& info, const char* data,
-                           size_t size, BlockContents* out_contents,
-                           uint32_t format_version,
+Status DecompressSerializedBlock(Decompressor::Args& args,
+                                 Decompressor& decompressor,
+                                 BlockContents* out_contents,
+                                 const ImmutableOptions& ioptions,
+                                 MemoryAllocator* allocator = nullptr);
+
+// This is a variant of DecompressSerializedBlock that does not expect a
+// block trailer beyond `size`. (CompressionType is passed in.)
+Status DecompressBlockData(
+    const char* data, size_t size, CompressionType type,
+    Decompressor& decompressor, BlockContents* out_contents,
+    const ImmutableOptions& ioptions, MemoryAllocator* allocator = nullptr,
+    Decompressor::ManagedWorkingArea* working_area = nullptr);
+
+Status DecompressBlockData(Decompressor::Args& args, Decompressor& decompressor,
+                           BlockContents* out_contents,
                            const ImmutableOptions& ioptions,
                            MemoryAllocator* allocator = nullptr);
 
diff --git a/table/meta_blocks.cc b/table/meta_blocks.cc
index 7d6ab76e294c..bdc96c1c291a 100644
--- a/table/meta_blocks.cc
+++ b/table/meta_blocks.cc
@@ -282,7 +282,7 @@ Status ReadTablePropertiesHelper(
       BlockFetcher block_fetcher(
           file, prefetch_buffer, footer, modified_ro, handle, &block_contents,
           ioptions, false /* decompress */, false /*maybe_compressed*/,
-          BlockType::kProperties, UncompressionDict::GetEmptyDict(),
+          BlockType::kProperties, nullptr /*decompressor*/,
           PersistentCacheOptions::kEmpty, memory_allocator);
       s = block_fetcher.ReadBlockContents();
       if (!s.ok()) {
@@ -585,7 +585,7 @@ Status ReadMetaIndexBlockInFile(RandomAccessFileReader* file,
   return BlockFetcher(file, prefetch_buffer, footer, read_options,
                       metaindex_handle, metaindex_contents, ioptions,
                       false /* do decompression */, false /*maybe_compressed*/,
-                      BlockType::kMetaIndex, UncompressionDict::GetEmptyDict(),
+                      BlockType::kMetaIndex, nullptr /*decompressor*/,
                       PersistentCacheOptions::kEmpty, memory_allocator)
       .ReadBlockContents();
 }
@@ -638,8 +638,8 @@ Status ReadMetaBlock(RandomAccessFileReader* file,
   return BlockFetcher(file, prefetch_buffer, footer, read_options, block_handle,
                       contents, ioptions, false /* decompress */,
                       false /*maybe_compressed*/, block_type,
-                      UncompressionDict::GetEmptyDict(),
-                      PersistentCacheOptions::kEmpty, memory_allocator)
+                      nullptr /*decompressor*/, PersistentCacheOptions::kEmpty,
+                      memory_allocator)
       .ReadBlockContents();
 }
 
diff --git a/table/table_test.cc b/table/table_test.cc
index 07be36714d9f..9ff03dfcda27 100644
--- a/table/table_test.cc
+++ b/table/table_test.cc
@@ -5675,11 +5675,13 @@ TEST_P(BlockBasedTableTest, PropertiesBlockRestartPointTest) {
       read_options_for_helper.verify_checksums = false;
       PersistentCacheOptions cache_options;
 
-      BlockFetcher block_fetcher(
-          file, nullptr /* prefetch_buffer */, footer, read_options_for_helper,
-          handle, contents, ioptions, false /* decompress */,
-          false /*maybe_compressed*/, block_type,
-          UncompressionDict::GetEmptyDict(), cache_options);
+      auto mgr = GetBuiltinCompressionManager(
+          GetCompressFormatForVersion(footer.format_version()));
+      BlockFetcher block_fetcher(file, nullptr /* prefetch_buffer */, footer,
+                                 read_options_for_helper, handle, contents,
+                                 ioptions, false /* decompress */,
+                                 false /*maybe_compressed*/, block_type,
+                                 mgr->GetDecompressor().get(), cache_options);
 
       ASSERT_OK(block_fetcher.ReadBlockContents());
     };
@@ -5812,12 +5814,13 @@ TEST_P(BlockBasedTableTest, PropertiesMetaBlockLast) {
   auto metaindex_handle = footer.metaindex_handle();
   BlockContents metaindex_contents;
   PersistentCacheOptions pcache_opts;
+  auto mgr = GetBuiltinCompressionManager(
+      GetCompressFormatForVersion(footer.format_version()));
   BlockFetcher block_fetcher(
       table_reader.get(), nullptr /* prefetch_buffer */, footer, ReadOptions(),
       metaindex_handle, &metaindex_contents, ioptions, false /* decompress */,
       false /*maybe_compressed*/, BlockType::kMetaIndex,
-      UncompressionDict::GetEmptyDict(), pcache_opts,
-      nullptr /*memory_allocator*/);
+      mgr->GetDecompressor().get(), pcache_opts, nullptr /*memory_allocator*/);
   ASSERT_OK(block_fetcher.ReadBlockContents());
   Block metaindex_block(std::move(metaindex_contents));
 
@@ -5894,12 +5897,13 @@ TEST_P(BlockBasedTableTest, SeekMetaBlocks) {
   auto metaindex_handle = footer.metaindex_handle();
   BlockContents metaindex_contents;
   PersistentCacheOptions pcache_opts;
+  auto mgr = GetBuiltinCompressionManager(
+      GetCompressFormatForVersion(footer.format_version()));
   BlockFetcher block_fetcher(
       table_reader.get(), nullptr /* prefetch_buffer */, footer, ReadOptions(),
       metaindex_handle, &metaindex_contents, ioptions, false /* decompress */,
       false /*maybe_compressed*/, BlockType::kMetaIndex,
-      UncompressionDict::GetEmptyDict(), pcache_opts,
-      nullptr /*memory_allocator*/);
+      mgr->GetDecompressor().get(), pcache_opts, nullptr /*memory_allocator*/);
   ASSERT_OK(block_fetcher.ReadBlockContents());
   Block metaindex_block(std::move(metaindex_contents));
 
diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc
index 1d39d6d1bd1a..bd5ccfef5f0b 100644
--- a/tools/db_bench_tool.cc
+++ b/tools/db_bench_tool.cc
@@ -1284,6 +1284,9 @@ DEFINE_uint32(memtable_op_scan_flush_trigger,
                   .memtable_op_scan_flush_trigger,
               "Setting for CF option memtable_op_scan_flush_trigger.");
 
+DEFINE_bool(verify_compression, false,
+            "See BlockBasedTableOptions::verify_compression");
+
 static enum ROCKSDB_NAMESPACE::CompressionType StringToCompressionType(
     const char* ctype) {
   assert(ctype);
@@ -2830,8 +2833,8 @@ class Benchmark {
                             const Slice& input, std::string* compressed) {
     constexpr uint32_t compress_format_version = 2;
 
-    return CompressData(input, compression_info, compress_format_version,
-                        compressed);
+    return OLD_CompressData(input, compression_info, compress_format_version,
+                            compressed);
   }
 
   void PrintHeader(const Options& options) {
@@ -4178,7 +4181,7 @@ class Benchmark {
     while (ok && bytes < 1024 * 1048576) {
       constexpr uint32_t compress_format_version = 2;
 
-      CacheAllocationPtr uncompressed = UncompressData(
+      CacheAllocationPtr uncompressed = OLD_UncompressData(
           uncompression_info, compressed.data(), compressed.size(),
           &uncompressed_size, compress_format_version);
 
@@ -4482,6 +4485,7 @@ class Benchmark {
           FLAGS_initial_auto_readahead_size;
       block_based_options.num_file_reads_for_auto_readahead =
           FLAGS_num_file_reads_for_auto_readahead;
+      block_based_options.verify_compression = FLAGS_verify_compression;
       BlockBasedTableOptions::PrepopulateBlockCache prepopulate_block_cache =
           block_based_options.prepopulate_block_cache;
       switch (FLAGS_prepopulate_block_cache) {
diff --git a/tools/ldb_cmd.cc b/tools/ldb_cmd.cc
index a9dc34e5d01c..076387ece281 100644
--- a/tools/ldb_cmd.cc
+++ b/tools/ldb_cmd.cc
@@ -39,7 +39,6 @@
 #include "rocksdb/utilities/options_util.h"
 #include "rocksdb/write_batch.h"
 #include "rocksdb/write_buffer_manager.h"
-#include "table/block_based/block_based_table_builder.h"
 #include "table/sst_file_dumper.h"
 #include "tools/ldb_cmd_impl.h"
 #include "util/cast_util.h"
@@ -872,7 +871,7 @@ bool LDBCommand::ParseCompressionTypeOption(
       // types, as this has been *de facto* supported for a long time on the
       // read side with no code to generate them on the write side. We can test
       // that functionality, e.g. in check_format_compatible.sh, with this hack
-      g_hack_mixed_compression_in_block_based_table.StoreRelaxed(1);
+      g_hack_mixed_compression.StoreRelaxed(1);
       // Need to list zstd in compression_name table property if it's
       // potentially in the mix, for proper handling of context and dictionary.
       value = ZSTD_Supported() ? kZSTD : GetSupportedCompressions()[0];
diff --git a/util/cast_util.h b/util/cast_util.h
index 414feda9cbea..60d198a9c845 100644
--- a/util/cast_util.h
+++ b/util/cast_util.h
@@ -85,4 +85,51 @@ class UnownedPtr {
   T* ptr_ = nullptr;
 };
 
+// A smart pointer that tracks an object and an owner, using a statically
+// determined function on those to reclaim the object, if both object and owner
+// are non-null
+template <typename T, class Owner, auto Fn>
+class ManagedPtr {
+ public:
+  ManagedPtr() = default;
+  ManagedPtr(T* ptr, Owner* owner) : ptr_(ptr), owner_(owner) {}
+  ~ManagedPtr() {
+    if (ptr_ && owner_) {
+      if constexpr (std::is_member_function_pointer_v<decltype(Fn)>) {
+        (owner_->*Fn)(ptr_);
+      } else {
+        Fn(owner_, ptr_);
+      }
+    }
+  }
+  // No copies
+  ManagedPtr(const ManagedPtr&) = delete;
+  ManagedPtr& operator=(const ManagedPtr&) = delete;
+  // Moves
+  ManagedPtr(ManagedPtr&& other) noexcept {
+    ptr_ = other.ptr_;
+    owner_ = other.owner_;
+    other.ptr_ = nullptr;
+    other.owner_ = nullptr;
+  }
+  ManagedPtr& operator=(ManagedPtr&& other) noexcept {
+    ptr_ = other.ptr_;
+    owner_ = other.owner_;
+    other.ptr_ = nullptr;
+    other.owner_ = nullptr;
+    return *this;
+  }
+
+  T* get() const { return ptr_; }
+  T* operator->() const { return ptr_; }
+  T& operator*() const { return *ptr_; }
+  operator bool() const { return ptr_ != nullptr; }
+
+  Owner* owner() const { return owner_; }
+
+ private:
+  T* ptr_ = nullptr;
+  Owner* owner_ = nullptr;
+};
+
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/util/compression.cc b/util/compression.cc
index 197b5a69d121..c3152c580ae5 100644
--- a/util/compression.cc
+++ b/util/compression.cc
@@ -5,6 +5,8 @@
 
 #include "util/compression.h"
 
+#include "options/options_helper.h"
+
 namespace ROCKSDB_NAMESPACE {
 
 StreamingCompress* StreamingCompress::Create(CompressionType compression_type,
@@ -119,4 +121,784 @@ void ZSTDStreamingUncompress::Reset() {
 #endif
 }
 
+// ***********************************************************************
+// BEGIN built-in implementation of customization interface
+// ***********************************************************************
+const Slice& Decompressor::GetSerializedDict() const {
+  // Default: empty slice => no dictionary
+  static Slice kEmptySlice;
+  return kEmptySlice;
+}
+
+namespace {
+
+class BuiltinCompressorV1 : public Compressor {
+ public:
+  explicit BuiltinCompressorV1(const CompressionOptions& opts,
+                               CompressionType type)
+      : opts_(opts), type_(type) {
+    assert(type != kNoCompression);
+  }
+
+  CompressionType GetPreferredCompressionType() const override { return type_; }
+
+  Status CompressBlock(Slice uncompressed_data, std::string* compressed_output,
+                       CompressionType* out_compression_type,
+                       ManagedWorkingArea* wa) override {
+    std::optional<CompressionContext> tmp_ctx;
+    CompressionContext* ctx = nullptr;
+    if (wa != nullptr && wa->owner() == this) {
+      ctx = static_cast<CompressionContext*>(wa->get());
+    }
+    if (ctx == nullptr) {
+      tmp_ctx.emplace(type_, opts_);
+      ctx = &*tmp_ctx;
+    }
+    CompressionInfo info(opts_, *ctx, CompressionDict::GetEmptyDict(), type_);
+    if (!OLD_CompressData(uncompressed_data, info,
+                          1 /*compress_format_version*/, compressed_output)) {
+      *out_compression_type = kNoCompression;
+      return Status::OK();
+    }
+    *out_compression_type = type_;
+    return Status::OK();
+  }
+
+ protected:
+  const CompressionOptions opts_;
+  const CompressionType type_;
+};
+
+class BuiltinCompressorV2 : public Compressor {
+ public:
+  explicit BuiltinCompressorV2(const CompressionOptions& opts,
+                               CompressionType type,
+                               CompressionDict&& dict = {})
+      : opts_(opts), type_(type), dict_(std::move(dict)) {
+    assert(type != kNoCompression);
+  }
+
+  size_t GetMaxSampleSizeIfWantDict(
+      CacheEntryRole /*block_type*/) const override {
+    if (opts_.max_dict_bytes == 0) {
+      // Dictionary compression disabled
+      return 0;
+    } else {
+      return opts_.zstd_max_train_bytes > 0 ? opts_.zstd_max_train_bytes
+                                            : opts_.max_dict_bytes;
+    }
+  }
+
+  // NOTE: empty dict is equivalent to no dict
+  Slice GetSerializedDict() const override { return dict_.GetRawDict(); }
+
+  CompressionType GetPreferredCompressionType() const override { return type_; }
+
+  std::unique_ptr<Compressor> MaybeCloneSpecialized(
+      CacheEntryRole /*block_type*/, DictSampleArgs&& dict_samples) override {
+    assert(dict_samples.Verify());
+    if (dict_samples.empty()) {
+      // Nothing to specialize on
+      return nullptr;
+    }
+    std::string dict_data;
+    // Migrated from BlockBasedTableBuilder::EnterUnbuffered()
+    if (type_ == kZSTD && opts_.zstd_max_train_bytes > 0) {
+      assert(dict_samples.sample_data.size() <= opts_.zstd_max_train_bytes);
+      if (opts_.use_zstd_dict_trainer) {
+        dict_data = ZSTD_TrainDictionary(dict_samples.sample_data,
+                                         dict_samples.sample_lens,
+                                         opts_.max_dict_bytes);
+      } else {
+        dict_data = ZSTD_FinalizeDictionary(dict_samples.sample_data,
+                                            dict_samples.sample_lens,
+                                            opts_.max_dict_bytes, opts_.level);
+      }
+    } else {
+      assert(dict_samples.sample_data.size() <= opts_.max_dict_bytes);
+      // ZSTD "raw content dictionary" - "Any buffer is a valid raw content
+      // dictionary." Or similar for other compressions.
+      dict_data = std::move(dict_samples.sample_data);
+    }
+    CompressionDict dict{std::move(dict_data), type_, opts_.level};
+    return std::make_unique<BuiltinCompressorV2>(opts_, type_, std::move(dict));
+  }
+
+  // TODO: use ZSTD_CCtx directly
+  ManagedWorkingArea ObtainWorkingArea() override {
+    return ManagedWorkingArea(
+        static_cast<WorkingArea*>(new CompressionContext(type_, opts_)), this);
+  }
+  void ReleaseWorkingArea(WorkingArea* wa) override {
+    delete static_cast<CompressionContext*>(wa);
+  }
+
+  Status CompressBlock(Slice uncompressed_data, std::string* compressed_output,
+                       CompressionType* out_compression_type,
+                       ManagedWorkingArea* wa) override {
+    std::optional<CompressionContext> tmp_ctx;
+    CompressionContext* ctx = nullptr;
+    if (wa != nullptr && wa->owner() == this) {
+      ctx = static_cast<CompressionContext*>(wa->get());
+    }
+    CompressionType type = type_;
+#ifndef NDEBUG
+    if (type != kNoCompression && g_hack_mixed_compression.LoadRelaxed() > 0U) {
+      // If zstd is in the mix, the compression_name table property needs to be
+      // set to it, for proper handling of context and dictionaries.
+      assert(!ZSTD_Supported() || type == kZSTD);
+      const auto& compressions = GetSupportedCompressions();
+      auto counter = g_hack_mixed_compression.FetchAddRelaxed(1);
+      type = compressions[counter % compressions.size()];
+    }
+#endif  // !NDEBUG
+    if (ctx == nullptr) {
+      tmp_ctx.emplace(type, opts_);
+      ctx = &*tmp_ctx;
+    }
+    CompressionInfo info(opts_, *ctx, dict_, type);
+    if (!OLD_CompressData(uncompressed_data, info,
+                          2 /*compress_format_version*/, compressed_output)) {
+      *out_compression_type = kNoCompression;
+      return Status::OK();
+    }
+    *out_compression_type = type;
+    return Status::OK();
+  }
+
+ protected:
+  const CompressionOptions opts_;
+  const CompressionType type_;
+  const CompressionDict dict_;
+};
+
+// NOTE: this implementation is intentionally SIMPLE based on existing code
+// and NOT EFFICIENT because this is an old/deprecated format.
+class BuiltinDecompressorV1 : public Decompressor {
+ public:
+  const char* Name() const override { return "BuiltinDecompressorV1"; }
+
+  Status ExtractUncompressedSize(Args& args) override {
+    CacheAllocationPtr throw_away_output;
+    return DoUncompress(args, &throw_away_output, &args.uncompressed_size);
+  }
+
+  Status DecompressBlock(const Args& args, char* uncompressed_output) override {
+    uint64_t same_uncompressed_size = 0;
+    CacheAllocationPtr output;
+    Status s = DoUncompress(args, &output, &same_uncompressed_size);
+    if (same_uncompressed_size != args.uncompressed_size) {
+      s = Status::Corruption("Compressed block size mismatch");
+    }
+    if (s.ok()) {
+      // NOTE: simple but inefficient
+      memcpy(uncompressed_output, output.get(), args.uncompressed_size);
+    }
+    return s;
+  }
+
+ protected:
+  Status DoUncompress(const Args& args, CacheAllocationPtr* out_data,
+                      uint64_t* out_uncompressed_size) {
+    assert(args.working_area == nullptr);
+    assert(*out_uncompressed_size == 0);
+
+    // NOTE: simple but inefficient
+    UncompressionContext dummy_ctx{args.compression_type};
+    UncompressionInfo info{dummy_ctx, UncompressionDict::GetEmptyDict(),
+                           args.compression_type};
+    const char* error_message = nullptr;
+    size_t size_t_uncompressed_size = 0;
+    *out_data = OLD_UncompressData(
+        info, args.compressed_data.data(), args.compressed_data.size(),
+        &size_t_uncompressed_size, 1 /*compress_format_version*/,
+        nullptr /*allocator*/, &error_message);
+    if (*out_data == nullptr) {
+      if (error_message != nullptr) {
+        return Status::Corruption(error_message);
+      } else {
+        return Status::Corruption("Corrupted compressed block contents");
+      }
+    }
+    *out_uncompressed_size = size_t_uncompressed_size;
+    assert(*out_uncompressed_size > 0);
+    return Status::OK();
+  }
+};
+
+class BuiltinCompressionManagerV1 : public CompressionManager {
+ public:
+  BuiltinCompressionManagerV1() = default;
+  ~BuiltinCompressionManagerV1() override = default;
+
+  const char* Name() const override { return "BuiltinCompressionManagerV1"; }
+
+  const char* CompatibilityName() const override { return "BuiltinV1"; }
+
+  std::unique_ptr<Compressor> GetCompressor(const CompressionOptions& opts,
+                                            CompressionType type) override {
+    if (type > kZSTD) {
+      // Unrecognized; fall back on default compression
+      type = ColumnFamilyOptions{}.compression;
+    }
+    if (type == kNoCompression) {
+      return nullptr;
+    } else {
+      return std::make_unique<BuiltinCompressorV1>(opts, type);
+    }
+  }
+
+  std::shared_ptr<Decompressor> GetDecompressor() override {
+    return std::shared_ptr<Decompressor>(shared_from_this(), &decompressor_);
+  }
+
+ protected:
+  BuiltinDecompressorV1 decompressor_;
+};
+
+// Subroutines for BuiltinDecompressorV2
+
+Status Snappy_DecompressBlock(const Decompressor::Args& args,
+                              char* uncompressed_output) {
+#ifdef SNAPPY
+  if (!snappy::RawUncompress(args.compressed_data.data(),
+                             args.compressed_data.size(),
+                             uncompressed_output)) {
+    return Status::Corruption("Error decompressing snappy data");
+  }
+  return Status::OK();
+#else
+  (void)args;
+  (void)uncompressed_output;
+  return Status::NotSupported("Snappy not supported in this build");
+#endif
+}
+
+Status Zlib_DecompressBlock(const Decompressor::Args& args, Slice dict,
+                            char* uncompressed_output) {
+#ifdef ZLIB
+  // NOTE: uses "raw" format
+  constexpr int kWindowBits = -14;
+
+  z_stream _stream;
+  memset(&_stream, 0, sizeof(z_stream));
+
+  // For raw inflate, the windowBits should be -8..-15.
+  // If windowBits is bigger than zero, it will use either zlib
+  // header or gzip header. Adding 32 to it will do automatic detection.
+  int st = inflateInit2(&_stream, kWindowBits);
+  if (UNLIKELY(st != Z_OK)) {
+    return Status::Corruption("Failed to initialize zlib inflate: " +
+                              std::to_string(st));
+  }
+
+  if (!dict.empty()) {
+    // Initialize the compression library's dictionary
+    st = inflateSetDictionary(&_stream,
+                              reinterpret_cast<const Bytef*>(dict.data()),
+                              static_cast<unsigned int>(dict.size()));
+    if (UNLIKELY(st != Z_OK)) {
+      return Status::Corruption("Failed to initialize zlib dictionary: " +
+                                std::to_string(st));
+    }
+  }
+
+  _stream.next_in = const_cast<Bytef*>(
+      reinterpret_cast<const Bytef*>(args.compressed_data.data()));
+  _stream.avail_in = static_cast<unsigned int>(args.compressed_data.size());
+
+  _stream.next_out = reinterpret_cast<Bytef*>(uncompressed_output);
+  _stream.avail_out = static_cast<unsigned int>(args.uncompressed_size);
+
+  st = inflate(&_stream, Z_SYNC_FLUSH);
+  if (UNLIKELY(st != Z_STREAM_END)) {
+    inflateEnd(&_stream);
+    // NOTE: Z_OK is still corruption because it means we got the size wrong
+    return Status::Corruption("Failed zlib inflate: " + std::to_string(st));
+  }
+
+  // We should have no bytes left
+  if (_stream.avail_out != 0) {
+    inflateEnd(&_stream);
+    return Status::Corruption("Size mismatch decompressing zlib data");
+  }
+
+  inflateEnd(&_stream);
+  return Status::OK();
+#else
+  (void)args;
+  (void)dict;
+  (void)uncompressed_output;
+  return Status::NotSupported("Zlib not supported in this build");
+#endif
+}
+
+Status BZip2_DecompressBlock(const Decompressor::Args& args,
+                             char* uncompressed_output) {
+#ifdef BZIP2
+  auto uncompressed_size = static_cast<unsigned int>(args.uncompressed_size);
+  if (BZ_OK != BZ2_bzBuffToBuffDecompress(
+                   uncompressed_output, &uncompressed_size,
+                   const_cast<char*>(args.compressed_data.data()),
+                   static_cast<unsigned int>(args.compressed_data.size()),
+                   0 /*small mem*/, 0 /*verbosity*/)) {
+    return Status::Corruption("Error decompressing bzip2 data");
+  }
+  if (uncompressed_size != args.uncompressed_size) {
+    return Status::Corruption("Size mismatch decompressing bzip2 data");
+  }
+  return Status::OK();
+#else
+  (void)args;
+  (void)uncompressed_output;
+  return Status::NotSupported("BZip2 not supported in this build");
+#endif
+}
+
+Status LZ4_DecompressBlock(const Decompressor::Args& args, Slice dict,
+                           char* uncompressed_output) {
+#ifdef LZ4
+  int expected_uncompressed_size = static_cast<int>(args.uncompressed_size);
+#if LZ4_VERSION_NUMBER >= 10400  // r124+
+  LZ4_streamDecode_t* stream = LZ4_createStreamDecode();
+  if (!dict.empty()) {
+    LZ4_setStreamDecode(stream, dict.data(), static_cast<int>(dict.size()));
+  }
+  int uncompressed_size = LZ4_decompress_safe_continue(
+      stream, args.compressed_data.data(), uncompressed_output,
+      static_cast<int>(args.compressed_data.size()),
+      expected_uncompressed_size);
+  LZ4_freeStreamDecode(stream);
+#else   // up to r123
+  if (!dict.empty()) {
+    return Status::NotSupported(
+        "This build doesn't support dictionary compression with LZ4");
+  }
+  int uncompressed_size =
+      LZ4_decompress_safe(args.compressed_data.data(), uncompressed_output,
+                          static_cast<int>(args.compressed_data.size()),
+                          expected_uncompressed_size);
+#endif  // LZ4_VERSION_NUMBER >= 10400
+
+  if (uncompressed_size != expected_uncompressed_size) {
+    if (uncompressed_size < 0) {
+      return Status::Corruption("Error decompressing LZ4 data");
+    } else {
+      return Status::Corruption("Size mismatch decompressing LZ4 data");
+    }
+  }
+  return Status::OK();
+#else
+  (void)args;
+  (void)dict;
+  (void)uncompressed_output;
+  return Status::NotSupported("LZ4 not supported in this build");
+#endif
+}
+
+Status XPRESS_DecompressBlock(const Decompressor::Args& args,
+                              char* uncompressed_output) {
+#ifdef XPRESS
+  int64_t actual_uncompressed_size = port::xpress::DecompressToBuffer(
+      args.compressed_data.data(), args.compressed_data.size(),
+      uncompressed_output, args.uncompressed_size);
+  if (actual_uncompressed_size !=
+      static_cast<int64_t>(args.uncompressed_size)) {
+    if (actual_uncompressed_size < 0) {
+      return Status::Corruption("Error decompressing XPRESS data");
+    } else {
+      return Status::Corruption("Size mismatch decompressing XPRESS data");
+    }
+  }
+  return Status::OK();
+#else
+  (void)args;
+  (void)uncompressed_output;
+  return Status::NotSupported("XPRESS not supported in this build");
+#endif
+}
+
+template <bool kIsDigestedDict = false>
+Status ZSTD_DecompressBlockWithContext(
+    const Decompressor::Args& args,
+    std::conditional_t<kIsDigestedDict, void*, Slice> dict,
+    ZSTDUncompressCachedData::ZSTDNativeContext zstd_context,
+    char* uncompressed_output) {
+#ifdef ZSTD
+  size_t uncompressed_size;
+  assert(zstd_context != nullptr);
+  if constexpr (kIsDigestedDict) {
+#ifdef ROCKSDB_ZSTD_DDICT
+    uncompressed_size = ZSTD_decompress_usingDDict(
+        zstd_context, uncompressed_output, args.uncompressed_size,
+        args.compressed_data.data(), args.compressed_data.size(),
+        static_cast<ZSTD_DDict*>(dict));
+#else
+    static_assert(!kIsDigestedDict,
+                  "Inconsistent expectation of ZSTD digested dict support");
+#endif  // ROCKSDB_ZSTD_DDICT
+  } else if (dict.empty()) {
+    uncompressed_size = ZSTD_decompressDCtx(
+        zstd_context, uncompressed_output, args.uncompressed_size,
+        args.compressed_data.data(), args.compressed_data.size());
+  } else {
+    uncompressed_size = ZSTD_decompress_usingDict(
+        zstd_context, uncompressed_output, args.uncompressed_size,
+        args.compressed_data.data(), args.compressed_data.size(), dict.data(),
+        dict.size());
+  }
+  if (ZSTD_isError(uncompressed_size)) {
+    return Status::Corruption(std::string("ZSTD ") +
+                              ZSTD_getErrorName(uncompressed_size));
+  } else if (uncompressed_size != args.uncompressed_size) {
+    return Status::Corruption("ZSTD decompression size mismatch");
+  } else {
+    return Status::OK();
+  }
+#else
+  (void)args;
+  (void)dict;
+  (void)zstd_context;
+  (void)uncompressed_output;
+  return Status::NotSupported("ZSTD not supported in this build");
+#endif
+}
+
+template <bool kIsDigestedDict = false>
+Status ZSTD_DecompressBlock(
+    const Decompressor::Args& args,
+    std::conditional_t<kIsDigestedDict, void*, Slice> dict,
+    const Decompressor* decompressor, char* uncompressed_output) {
+  if (args.working_area && args.working_area->owner() == decompressor) {
+    auto ctx = static_cast<UncompressionContext*>(args.working_area->get());
+    assert(ctx != nullptr);
+    if (ctx->GetZSTDContext() != nullptr) {
+      return ZSTD_DecompressBlockWithContext<kIsDigestedDict>(
+          args, dict, ctx->GetZSTDContext(), uncompressed_output);
+    }
+  }
+  UncompressionContext tmp_ctx{kZSTD};
+  return ZSTD_DecompressBlockWithContext<kIsDigestedDict>(
+      args, dict, tmp_ctx.GetZSTDContext(), uncompressed_output);
+}
+
+class BuiltinDecompressorV2 : public Decompressor {
+ public:
+  const char* Name() const override { return "BuiltinDecompressorV2"; }
+
+  Status ExtractUncompressedSize(Args& args) override {
+    assert(args.compression_type != kNoCompression);
+    if (args.compression_type == kSnappyCompression) {
+      // Exception to encoding of uncompressed size
+#ifdef SNAPPY
+      size_t uncompressed_length = 0;
+      if (!snappy::GetUncompressedLength(args.compressed_data.data(),
+                                         args.compressed_data.size(),
+                                         &uncompressed_length)) {
+        return Status::Corruption("Error reading snappy compressed length");
+      }
+      args.uncompressed_size = uncompressed_length;
+      return Status::OK();
+#else
+      return Status::NotSupported("Snappy not supported in this build");
+#endif
+    } else {
+      // Extract encoded uncompressed size
+      return Decompressor::ExtractUncompressedSize(args);
+    }
+  }
+
+  Status DecompressBlock(const Args& args, char* uncompressed_output) override {
+    switch (args.compression_type) {
+      case kSnappyCompression:
+        return Snappy_DecompressBlock(args, uncompressed_output);
+      case kZlibCompression:
+        return Zlib_DecompressBlock(args, /*dict=*/Slice{},
+                                    uncompressed_output);
+      case kBZip2Compression:
+        return BZip2_DecompressBlock(args, uncompressed_output);
+      case kLZ4Compression:
+      case kLZ4HCCompression:
+        return LZ4_DecompressBlock(args, /*dict=*/Slice{}, uncompressed_output);
+      case kXpressCompression:
+        return XPRESS_DecompressBlock(args, uncompressed_output);
+      case kZSTD:
+        return ZSTD_DecompressBlock(args, /*dict=*/Slice{}, this,
+                                    uncompressed_output);
+      default:
+        return Status::NotSupported(
+            "Compression type not supported or not built-in: " +
+            CompressionTypeToString(args.compression_type));
+    }
+  }
+
+  Status MaybeCloneForDict(const Slice&,
+                           std::unique_ptr<Decompressor>*) override;
+
+  size_t ApproximateOwnedMemoryUsage() const override {
+    return sizeof(BuiltinDecompressorV2);
+  }
+};
+
+class BuiltinDecompressorV2WithDict : public BuiltinDecompressorV2 {
+ public:
+  explicit BuiltinDecompressorV2WithDict(const Slice& dict) : dict_(dict) {}
+
+  const char* Name() const override { return "BuiltinDecompressorV2WithDict"; }
+
+  Status DecompressBlock(const Args& args, char* uncompressed_output) override {
+    switch (args.compression_type) {
+      case kSnappyCompression:
+        // NOTE: quietly ignores the dictionary (for compatibility)
+        return Snappy_DecompressBlock(args, uncompressed_output);
+      case kZlibCompression:
+        return Zlib_DecompressBlock(args, dict_, uncompressed_output);
+      case kBZip2Compression:
+        // NOTE: quietly ignores the dictionary (for compatibility)
+        return BZip2_DecompressBlock(args, uncompressed_output);
+      case kLZ4Compression:
+      case kLZ4HCCompression:
+        return LZ4_DecompressBlock(args, dict_, uncompressed_output);
+      case kXpressCompression:
+        // NOTE: quietly ignores the dictionary (for compatibility)
+        return XPRESS_DecompressBlock(args, uncompressed_output);
+      case kZSTD:
+        return ZSTD_DecompressBlock(args, dict_, this, uncompressed_output);
+      default:
+        return Status::NotSupported(
+            "Compression type not supported or not built-in: " +
+            CompressionTypeToString(args.compression_type));
+    }
+  }
+
+  const Slice& GetSerializedDict() const override { return dict_; }
+
+  size_t ApproximateOwnedMemoryUsage() const override {
+    return sizeof(BuiltinDecompressorV2WithDict);
+  }
+
+ protected:
+  const Slice dict_;
+};
+
+Status BuiltinDecompressorV2::MaybeCloneForDict(
+    const Slice& dict, std::unique_ptr<Decompressor>* out) {
+  *out = std::make_unique<BuiltinDecompressorV2WithDict>(dict);
+  return Status::OK();
+}
+
+class BuiltinDecompressorV2OptimizeZstd : public BuiltinDecompressorV2 {
+ public:
+  const char* Name() const override {
+    return "BuiltinDecompressorV2OptimizeZstd";
+  }
+
+  ManagedWorkingArea ObtainWorkingArea(CompressionType preferred) override {
+    if (preferred == kZSTD) {
+      // TODO: evaluate whether it makes sense to use core local cache here.
+      // (Perhaps not, because explicit WorkingArea could be long-running.)
+      return ManagedWorkingArea(new UncompressionContext(kZSTD), this);
+    } else {
+      return {};
+    }
+  }
+
+  void ReleaseWorkingArea(WorkingArea* wa) override {
+    delete static_cast<UncompressionContext*>(wa);
+  }
+
+  Status DecompressBlock(const Args& args, char* uncompressed_output) override {
+    if (LIKELY(args.compression_type == kZSTD)) {
+      return ZSTD_DecompressBlock(args, /*dict=*/Slice{}, this,
+                                  uncompressed_output);
+    } else {
+      return BuiltinDecompressorV2::DecompressBlock(args, uncompressed_output);
+    }
+  }
+
+  Status MaybeCloneForDict(const Slice& /*serialized_dict*/,
+                           std::unique_ptr<Decompressor>* /*out*/) override;
+};
+
+class BuiltinDecompressorV2OptimizeZstdWithDict
+    : public BuiltinDecompressorV2OptimizeZstd {
+ public:
+  BuiltinDecompressorV2OptimizeZstdWithDict(const Slice& dict)
+      :
+#ifdef ROCKSDB_ZSTD_DDICT
+        dict_(dict),
+        ddict_(ZSTD_createDDict_byReference(dict.data(), dict.size())) {
+    assert(ddict_ != nullptr);
+  }
+#else
+        dict_(dict) {
+  }
+#endif  // ROCKSDB_ZSTD_DDICT
+
+  const char* Name() const override {
+    return "BuiltinDecompressorV2OptimizeZstdWithDict";
+  }
+
+  ~BuiltinDecompressorV2OptimizeZstdWithDict() override {
+#ifdef ROCKSDB_ZSTD_DDICT
+    size_t res = ZSTD_freeDDict(ddict_);
+    assert(res == 0);  // Last I checked they can't fail
+    (void)res;         // prevent unused var warning
+#endif                 // ROCKSDB_ZSTD_DDICT
+  }
+
+  const Slice& GetSerializedDict() const override { return dict_; }
+
+  size_t ApproximateOwnedMemoryUsage() const override {
+    size_t sz = sizeof(BuiltinDecompressorV2WithDict);
+#ifdef ROCKSDB_ZSTD_DDICT
+    sz += ZSTD_sizeof_DDict(ddict_);
+#endif  // ROCKSDB_ZSTD_DDICT
+    return sz;
+  }
+
+  Status DecompressBlock(const Args& args, char* uncompressed_output) override {
+    if (LIKELY(args.compression_type == kZSTD)) {
+#ifdef ROCKSDB_ZSTD_DDICT
+      return ZSTD_DecompressBlock</*kIsDigestedDict=*/true>(
+          args, ddict_, this, uncompressed_output);
+#else
+      return ZSTD_DecompressBlock(args, dict_, this, uncompressed_output);
+#endif  // ROCKSDB_ZSTD_DDICT
+    } else {
+      return BuiltinDecompressorV2WithDict(dict_).DecompressBlock(
+          args, uncompressed_output);
+    }
+  }
+
+ protected:
+  const Slice dict_;
+#ifdef ROCKSDB_ZSTD_DDICT
+  ZSTD_DDict* const ddict_;
+#endif  // ROCKSDB_ZSTD_DDICT
+};
+
+Status BuiltinDecompressorV2OptimizeZstd::MaybeCloneForDict(
+    const Slice& serialized_dict, std::unique_ptr<Decompressor>* out) {
+  *out = std::make_unique<BuiltinDecompressorV2OptimizeZstdWithDict>(
+      serialized_dict);
+  return Status::OK();
+}
+
+class BuiltinCompressionManagerV2 : public CompressionManager {
+ public:
+  BuiltinCompressionManagerV2() = default;
+  ~BuiltinCompressionManagerV2() override = default;
+
+  const char* Name() const override { return "BuiltinCompressionManagerV2"; }
+
+  const char* CompatibilityName() const override { return "BuiltinV2"; }
+
+  std::unique_ptr<Compressor> GetCompressor(const CompressionOptions& opts,
+                                            CompressionType type) override {
+    if (opts.max_compressed_bytes_per_kb <= 0) {
+      // No acceptable compression ratio => no compression
+      return nullptr;
+    }
+    if (type > kZSTD) {
+      // Unrecognized; fall back on default compression
+      type = ColumnFamilyOptions{}.compression;
+    }
+    if (type == kNoCompression) {
+      return nullptr;
+    } else {
+      return std::make_unique<BuiltinCompressorV2>(opts, type);
+    }
+  }
+
+  std::shared_ptr<Decompressor> GetDecompressor() override {
+    return GetGeneralDecompressor();
+  }
+
+  std::shared_ptr<Decompressor> GetDecompressorOptimizeFor(
+      CompressionType optimize_for_type) override {
+    if (optimize_for_type == kZSTD) {
+      return GetZstdDecompressor();
+    } else {
+      return GetGeneralDecompressor();
+    }
+  }
+
+  std::shared_ptr<Decompressor> GetDecompressorForTypes(
+      const CompressionType* types_begin,
+      const CompressionType* types_end) override {
+    if (std::find(types_begin, types_end, kZSTD)) {
+      return GetZstdDecompressor();
+    } else {
+      return GetGeneralDecompressor();
+    }
+  }
+
+ protected:
+  BuiltinDecompressorV2 decompressor_;
+  BuiltinDecompressorV2OptimizeZstd zstd_decompressor_;
+
+  inline std::shared_ptr<Decompressor> GetGeneralDecompressor() {
+    return std::shared_ptr<Decompressor>(shared_from_this(), &decompressor_);
+  }
+
+  inline std::shared_ptr<Decompressor> GetZstdDecompressor() {
+    return std::shared_ptr<Decompressor>(shared_from_this(),
+                                         &zstd_decompressor_);
+  }
+};
+
+const std::shared_ptr<BuiltinCompressionManagerV1>
+    kBuiltinCompressionManagerV1 =
+        std::make_shared<BuiltinCompressionManagerV1>();
+const std::shared_ptr<BuiltinCompressionManagerV2>
+    kBuiltinCompressionManagerV2 =
+        std::make_shared<BuiltinCompressionManagerV2>();
+
+}  // namespace
+
+Status CompressionManager::FindCompatibleCompressionManager(
+    Slice compatibility_name, std::shared_ptr<CompressionManager>* out) {
+  if (compatibility_name.compare(CompatibilityName()) == 0) {
+    *out = shared_from_this();
+    return Status::OK();
+  } else if (compatibility_name.compare(
+                 kBuiltinCompressionManagerV1->CompatibilityName()) == 0) {
+    *out = kBuiltinCompressionManagerV1;
+    return Status::OK();
+  } else if (compatibility_name.compare(
+                 kBuiltinCompressionManagerV2->CompatibilityName()) == 0) {
+    *out = kBuiltinCompressionManagerV2;
+    return Status::OK();
+  } else {
+    return Status::NotFound("Compatible compression manager for \"" +
+                            compatibility_name.ToString() + "\"");
+  }
+}
+
+const std::shared_ptr<CompressionManager>& GetBuiltinCompressionManager(
+    int compression_format_version) {
+  static const std::shared_ptr<CompressionManager> v1_as_base =
+      kBuiltinCompressionManagerV1;
+  static const std::shared_ptr<CompressionManager> v2_as_base =
+      kBuiltinCompressionManagerV2;
+  static const std::shared_ptr<CompressionManager> none;
+  if (compression_format_version == 1) {
+    return v1_as_base;
+  } else if (compression_format_version == 2) {
+    return v2_as_base;
+  } else {
+    // Unrecognized. In some cases this is unexpected and the caller can
+    // rightfully crash.
+    return none;
+  }
+}
+
+// ***********************************************************************
+// END built-in implementation of customization interface
+// ***********************************************************************
+
+#ifndef NDEBUG
+RelaxedAtomic<uint64_t> g_hack_mixed_compression{0};
+#endif  // !NDEBUG
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/util/compression.h b/util/compression.h
index 95011b7b9635..4f23cc320a63 100644
--- a/util/compression.h
+++ b/util/compression.h
@@ -11,6 +11,10 @@
 
 #include <algorithm>
 #include <limits>
+
+#include "port/likely.h"
+#include "util/atomic.h"
+#include "util/cast_util.h"
 #ifdef ROCKSDB_MALLOC_USABLE_SIZE
 #ifdef OS_FREEBSD
 #include <malloc_np.h>
@@ -144,6 +148,539 @@ class ZSTDUncompressCachedData {
 
 namespace ROCKSDB_NAMESPACE {
 
+// ***********************************************************************
+// BEGIN future compression customization interface
+// ***********************************************************************
+
+// TODO: alias/adapt for compression
+struct FilterBuildingContext;
+
+// A Compressor represents a very specific but potentially adapting strategy for
+// compressing blocks, including the relevant algorithm(s), options, dictionary,
+// etc. as applicable--every input except the sequence of bytes to compress.
+// Compressor is generally thread-safe so can be shared by multiple threads. (It
+// could make sense to convert unique_ptr<Compressor> to
+// shared_ptr<Compressor>.) A Compressor for data files is expected to be used
+// for just one file, so that compression strategy can be explicitly
+// reconsidered for each new file. However, a Compressor for in-memory use could
+// live indefinitely.
+//
+// If a single thread is doing many compressions under the same strategy, it
+// should request a WorkingArea that will in some cases make repeated
+// compression in a single thread more efficient. Unlike the rest of Compressor,
+// each WorkingArea can only be used by one thread at a time. WorkingAreas can
+// have pre-allocated space and/or data structures, and/or thread-local
+// statistics that are later incorporated into shared statistics objects.
+//
+// The Compressor marks each block with a CompressionType to guide
+// decompression. However, the compression dictionary (or whether there is one
+// associated) is determined at Compressor creation time, though the process of
+// getting a Compressor with a dictionary starts with a Compressor without
+// dictionary (which will often be relevant alongside); see relevant functions.
+// If the Compressor wants to decide block-by-block whether to apply the
+// configured dictionary, that would need to be encoded in CompressionType or
+// the compressed output. (NOTE: this was historically NOT encoded in
+// CompressionType and instead implied by BlockType and the presence of a
+// dictionary block in the file. Some of the resulting awkwardness includes
+// a number of built-in CompressionTypes that ignore any dictionary block in
+// the file; therefore they cannot accommodate dictionary compression in the
+// future without a schema change / extension.)
+class Compressor {
+ public:
+  Compressor() = default;
+  virtual ~Compressor() = default;
+
+  // Returns the max total bytes of for all sampled blocks for creating the data
+  // dictionary, or zero indicating dictionary compression should not be
+  // used/configured. This will typically be called after
+  // CompressionManager::GetCompressor() to see if samples should be accumulated
+  // and passed to MaybeCloneSpecialized().
+  virtual size_t GetMaxSampleSizeIfWantDict(CacheEntryRole block_type) const {
+    // Default implementation: no dictionary
+    (void)block_type;
+    return 0;
+  }
+
+  // Returns the serialized form of the data dictionary associated with this
+  // Compressor. NOTE: empty dict is equivalent to no dict.
+  virtual Slice GetSerializedDict() const { return Slice(); }
+
+  // If there's a dominant compression type returned by this compressor as
+  // configured, return it. Otherwise, return kDisableCompressionOption.
+  virtual CompressionType GetPreferredCompressionType() const {
+    return CompressionType::kDisableCompressionOption;
+  }
+
+  // Utility struct for providing sample data for the compression dictionary.
+  // Potentially extensible by callers of Compressor (but not recommended)
+  struct DictSampleArgs {
+    // All the sample input blocks stored contiguously
+    std::string sample_data;
+    // The lengths of each of the sample blocks in `sample_data`
+    std::vector<size_t> sample_lens;
+
+    bool empty() { return sample_data.empty(); }
+    bool Verify() {
+      size_t total_len = 0;
+      for (auto len : sample_lens) {
+        total_len += len;
+      }
+      return total_len == sample_data.size();
+    }
+  };
+
+  // Create potential variants of the same Compressor that might be
+  // (a) optimized for a particular block type (does not affect correct
+  //     decompression), and/or
+  // (b) configured to use a compression dictionary, based on the given
+  //     samples (decompression must provide the dictionary from
+  //     GetSerializedDict())
+  // Return of nullptr indicates no specialization exists or was attempted
+  // and the caller is best to use the current Compressor for the desired
+  // scenario. Using CacheEntryRole:kMisc for block_type generally means
+  // "unspecified", and both parameters are merely suggestions. The exact
+  // dictionary associated with a returned compressor must be read from
+  // GetSerializedDict().
+  virtual std::unique_ptr<Compressor> MaybeCloneSpecialized(
+      CacheEntryRole block_type, DictSampleArgs&& dict_samples) {
+    // Default implementation: no specialization
+    (void)block_type;
+    (void)dict_samples;
+    // Caller should have checked GetMaxSampleSizeIfWantDict before attempting
+    // to provide dictionary samples
+    assert(dict_samples.empty());
+    return nullptr;
+  }
+
+  // A WorkingArea is an optional structure (both for callers and
+  // implementations) that can enable optimizing repeated compressions by
+  // reusing working space or thread-local tracking of statistics or trends.
+  // This enables use of ZSTD context, for example.
+  //
+  // EXTENSIBLE or reinterpret_cast-able by custom Compressor implementations
+  struct WorkingArea {};
+
+ protected:
+  // To allow for flexible re-use / reclaimation, we have explicit Get and
+  // Release functions, and usually wrap in a special RAII smart pointer.
+  // For example, a WorkingArea could be saved/recycled in thread-local or
+  // core-local storage, or heap managed, etc., though an explicit WorkingArea
+  // is only advised for repeated compression (by a single thread).
+  virtual void ReleaseWorkingArea(WorkingArea*) {}
+
+ public:
+  using ManagedWorkingArea =
+      ManagedPtr<WorkingArea, Compressor, &Compressor::ReleaseWorkingArea>;
+
+  // See struct WorkingArea above
+  virtual ManagedWorkingArea ObtainWorkingArea() {
+    // Default implementation: no working area
+    return {};
+  }
+
+  // Compress `uncompressed_data` to `compressed_output`, which should be
+  // passed in empty. Note that the compressed output will be decompressed
+  // by the sequence Decompressor::ExtractUncompressedSize() followed by
+  // Decompressor::DecompressBlock(), which must also be provided the same
+  // CompressionType saved in `out_compression_type`. (In many configurations,
+  // `compressed_output` will have a prefix storing the uncompressed_data size
+  // before the compressed bytes returned by the underlying compression
+  // algorithm. And the compression type is usually stored adjacent to the
+  // compressed data, or in some cases assumed/asserted based on the particular
+  // Compressor.)
+  //
+  // If return status is not OK, then some fatal condition has arisen. On OK
+  // status, setting `*out_compression_type = kNoCompression` means compression
+  // is declined and the caller should use the original uncompressed_data and
+  // ignore any result in `compressed_output`. Otherwise, compression has
+  // happened with results in `compressed_output` and `out_compression_type`,
+  // which are allowed to vary from call to call.
+  //
+  // The working area is optional and used to optimize repeated compression by
+  // a single thread. ManagedWorkingArea is provided rather than just
+  // WorkingArea so that it can be used only if the `owner` matches expectation.
+  // This could be useful for a Compressor wrapping more than one alternative
+  // underlying Compressor.
+  //
+  // TODO: instead of string, consider a buffer only large enough for max
+  // tolerable compressed size. Does that work for all existing algorithms?
+  // * Looks like Snappy doesn't support that. :(
+  // * But looks like everything else should. :)
+  // Could save CPU by eliminating extra zero-ing and giving up quicker when
+  // ratio is insufficient.
+  virtual Status CompressBlock(Slice uncompressed_data,
+                               std::string* compressed_output,
+                               CompressionType* out_compression_type,
+                               ManagedWorkingArea* working_area) = 0;
+
+  // TODO: something to populate table properties based on settings, after all
+  // or as WorkingAreas released. Maybe also update stats, or that could be in
+  // thread-specific WorkingArea.
+};
+
+// TODO: CompressorBase and CompressorWrapper
+
+// A Decompressor usually has a wide capability to decompress all kinds of
+// compressed data in the scope of a CompressionManager (see that class below),
+// except
+// (a) it might be optimized for or limited to a particular compression type(s)
+//     (see GetDecompressor* functions for in CompressionManager),
+// (b) distinct Decompressors are required to decompress with compression
+//     dictionaries. (Decompressors are generally associated with empty/no
+//     dictionary unless created with MaybeCloneForDict().)
+//
+// Similar to Compressor, Decompressor is generally thread safe except that each
+// WorkingArea can only be used by a single thread at a time.
+//
+// Decompressors known to be associated with no dictionary are typically
+// returned as shared_ptr, because they are broadly usable across threads.
+// Because compression dictionaries are externally managed (see
+// MaybeCloneForDict()), Decompressors associated with compression dictionaries
+// are typically returned as unique_ptr, so that they are more easily
+// guaranteed not to outlive their dictionaries (e.g. in block cache).
+// Decompressors associated with compression dictionaries might include a
+// processed or "digested" form of the raw dictionary for efficient repeated
+// compressions.
+//
+// NOTE: Splitting the interface between ExtractUncompressedSize and
+// DecompressBlock leaves to the caller details of (and flexibility in)
+// allocating buffers for decompressing into. For example, the data could be
+// decompressed into part of a single buffer allocated to hold a block's
+// uncompressed contents along with an in-memory object representation of the
+// block (to reduce fragmentation and other overheads of separate objects).
+class Decompressor {
+ public:
+  Decompressor() = default;
+  virtual ~Decompressor() = default;
+
+  // A name for logging / debugging purposes
+  virtual const char* Name() const = 0;
+
+  // A WorkingArea is an optional structure (both for callers and
+  // implementations) that can enable optimizing repeated decompressions by
+  // reusing working space or thread-local tracking of statistics. This enables
+  // use of ZSTD context, for example.
+  //
+  // EXTENSIBLE or reinterpret_cast-able by custom Compressor implementations
+  struct WorkingArea {};
+
+ protected:
+  // To allow for flexible re-use / reclaimation, we have explicit Obtain and
+  // Release functions, which are typically wrapped in a special RAII smart
+  // pointer. For example, a WorkingArea could be saved/recycled in thread-local
+  // or core-local storage, or heap managed, etc., though an explicit
+  // WorkingArea is only advised for repeated decompression (by a single
+  // thread).
+
+  virtual void ReleaseWorkingArea(WorkingArea* wa) {
+    // Default implementation: no working area
+    (void)wa;
+    assert(wa == nullptr);
+  }
+
+ public:
+  using ManagedWorkingArea =
+      ManagedPtr<WorkingArea, Decompressor, &Decompressor::ReleaseWorkingArea>;
+
+  virtual ManagedWorkingArea ObtainWorkingArea(CompressionType /*preferred*/) {
+    // Default implementation: no working area
+    return {};
+  }
+
+  // If this Decompressor is associated with a (de)compression dictionary
+  // (created with MaybeCloneForDict()), this returns a pointer to those raw (or
+  // "serialized") bytes, which are externally managed (see
+  // MaybeCloneForDict()).
+  // Default: empty slice => no dictionary
+  virtual const Slice& GetSerializedDict() const;
+
+  // Create a variant of this Decompressor in `out` using the specified raw
+  // ("serialized") dictionary. This step is required for decompressing data
+  // compressed with the same dictionary. The new Decompressor references the
+  // given Slice through its lifetime so the data it points to must be managed
+  // by the caller along with (or beyond) the new Decompressor. If the
+  // dictionary is processed into a form reusable by repeated compressions in
+  // many threads, that happens within this call.
+  //
+  // Must return OK if storing a result in `out`. Otherwise, could return values
+  // like NotSupported - dictionary compression is not (yet) supported for this
+  // kind of Decompressor.
+  // Corruption - dictionary is malformed (though many implementations will
+  // accept any data as a dictionary)
+  virtual Status MaybeCloneForDict(const Slice& /*serialized_dict*/,
+                                   std::unique_ptr<Decompressor>* /*out*/) {
+    return Status::NotSupported(
+        "Dictionary compression not (yet) supported by " + std::string(Name()));
+  }
+
+  // Memory size of this object and others it owns. Does not include the
+  // serialized dictionary (when used) which is externally managed.
+  virtual size_t ApproximateOwnedMemoryUsage() const {
+    // Default: negligible
+    return 0;
+  }
+
+  // Potentially extensible by callers of Decompressor (but not recommended)
+  struct Args {
+    CompressionType compression_type = kNoCompression;
+    Slice compressed_data;
+    uint64_t uncompressed_size = 0;
+    ManagedWorkingArea* working_area = nullptr;
+  };
+
+  // For efficiency on the read path, RocksDB strongly prefers the uncompressed
+  // data size to be encoded in the compressed data in an easily accessible way,
+  // so that allocation of a potentially long-lived buffer can be ideally sized.
+  // This function determines the uncompressed size and potentially modifies
+  // `args.compressed_data` to strip off the size metadata, for providing both
+  // to DecompressBlock along with an appropriate buffer based on that size.
+  // Some implementations will leave `compressed_data` unmodified and let
+  // DecompressBlock call a library function that processes a format that
+  // includes size metadata (e.g. Snappy).
+  //
+  // Even for legacy cases without size metadata (e.g. some very old RocksDB
+  // formats), an exact size is required and could require decompressing the
+  // data (here and in DecompressBlock()).
+  //
+  // Return non-OK in case of corrupt data or some other unworkable limitation
+  // or failure.
+  virtual Status ExtractUncompressedSize(Args& args) {
+    // Default implementation:
+    //
+    // Standard format for prepending uncompressed size to the compressed
+    // payload. (RocksDB compress_format_version=2 except Snappy)
+    //
+    // This is historically a varint32, but it is preliminarily generalized
+    // to varint64. (TODO: support that on the write side, at least for some
+    // codecs, in BBT format_version=7)
+    if (LIKELY(GetVarint64(&args.compressed_data, &args.uncompressed_size))) {
+      if (LIKELY(args.uncompressed_size <= SIZE_MAX)) {
+        return Status::OK();
+      } else {
+        return Status::MemoryLimit("Uncompressed size too large for platform");
+      }
+    } else {
+      return Status::Corruption("Unable to extract uncompressed size");
+    }
+  }
+
+  // Called to decompress a block of data after running ExtractUncompressedSize
+  // on it. `args.compressed_data` is what ExtractUncompressedSize left there
+  // after potentially stripping off the uncompressed size metadata. Returns OK
+  // iff uncompressed data of size `uncompressed_size` is written to
+  // `uncompressed_output`.
+  virtual Status DecompressBlock(const Args& args,
+                                 char* uncompressed_output) = 0;
+};
+
+// A CompressionManager represents
+// * When/where/how to use different compressions
+// * A schema (or set of schemas) and implementation for mapping
+//     <CompressionType, dictionary, compressed data>
+//   to uncompressed data (or error), which can expand over time (error in fewer
+//   cases) for a given CompatibilityName() but can never change that mapping
+//   (because that would break backward compatibility, potential quiet
+//   corruption)
+// TODO: consider adding optional streaming compression support (low priority)
+class CompressionManager
+    : public std::enable_shared_from_this<CompressionManager> {
+ public:
+  CompressionManager() = default;
+  virtual ~CompressionManager() = default;
+
+  // TODO: Customizable (for compression side configuration and recording our
+  // compression strategy)
+  virtual const char* Name() const = 0;
+  virtual std::string GetId() const {
+    std::string id = Name();
+    return id;
+  }
+
+  // *************** Peer or variant Compression Managers **************** //
+  // A name for the schema family of this CompressionManager. In short, if
+  // two CompressionManagers have functionally the same Decompressor(s), they
+  // should have the same CompatibilityName(), so that a compatible
+  // CompressionManager/Decompressor might be used if the original is
+  // unavailable. (Name() can be useful in addition to CompatibilityName() for
+  // understanding what compression strategy was used.)
+  virtual const char* CompatibilityName() const = 0;
+
+  // Default implementation checks the current compatibility name and returns
+  // this CompressionManager (via `out`) if appropriate, and otherwise looks
+  // for a matching built-in CompressionManager.
+  virtual Status FindCompatibleCompressionManager(
+      Slice compatibility_name, std::shared_ptr<CompressionManager>* out);
+
+  // ************************* Compressor creation *********************** //
+  // Returning nullptr means compression is entirely disabled for the file,
+  // which is valid at the discretion of the CompressionManager. Returning
+  // nullptr should normally be the result if preferred == kNoCompression.
+  //
+  // These functions must be thread-safe.
+
+  // Get a compressor for an SST file.
+  // SUBJECT TO CHANGE
+  // TODO: is it practical to get ColumnFamilyOptions plumbed into here?
+  virtual std::unique_ptr<Compressor> GetCompressorForSST(
+      const FilterBuildingContext&, const CompressionOptions& opts,
+      CompressionType preferred) {
+    return GetCompressor(opts, preferred);
+  }
+
+  // Get a compressor for a generic/unspecified purpose (e.g. in-memory
+  // compression).
+  virtual std::unique_ptr<Compressor> GetCompressor(
+      const CompressionOptions& opts, CompressionType type) = 0;
+
+  // **************************** Decompressors ************************** //
+  // Get a decompressor that is compatible with any blocks compressed by
+  // compressors returned by this CompressionManager (at least this code
+  // revision and earlier). (NOTE: recommended to return a shared_ptr alias of
+  // this shared_ptr to a field that is a Decompressor.)
+  // Justification for not making CompressionManager inherit Decompressor: this
+  // tends to run into the diamond inheritance problem in implementations and
+  // potential overheads of virtual inheritance.
+  virtual std::shared_ptr<Decompressor> GetDecompressor() = 0;
+
+  // Compatible with same as above, but potentially optimized for a certain
+  // expected CompressionType
+  virtual std::shared_ptr<Decompressor> GetDecompressorOptimizeFor(
+      CompressionType /*optimize_for_type*/) {
+    // Safe default implementation
+    return GetDecompressor();
+  }
+
+  // Get a decompressor that is allowed to have support only for the
+  // CompressionTypes in the given start-to-end array (unique, sorted by
+  // unsigned char)
+  virtual std::shared_ptr<Decompressor> GetDecompressorForTypes(
+      const CompressionType* /*types_begin*/,
+      const CompressionType* /*types_end*/) {
+    // Safe default implementation
+    return GetDecompressor();
+  }
+};
+// ***********************************************************************
+// END future compression customization interface
+// ***********************************************************************
+
+class FailureDecompressor : public Decompressor {
+ public:
+  explicit FailureDecompressor(Status&& status) : status_(std::move(status)) {
+    assert(!status_.ok());
+  }
+  ~FailureDecompressor() override { status_.PermitUncheckedError(); }
+
+  const char* Name() const override { return "FailureDecompressor"; }
+
+  Status ExtractUncompressedSize(Args& /*args*/) override { return status_; }
+
+  Status DecompressBlock(const Args& /*args*/,
+                         char* /*uncompressed_output*/) override {
+    return status_;
+  }
+
+ protected:
+  Status status_;
+};
+
+// Owns a decompression dictionary, and associated Decompressor, for storing
+// in the block cache.
+//
+// Justification: for a "processed" dictionary to be saved in block cache, we
+// also need a reference to the decompressor that processed it, to ensure it
+// is recognized properly. At that point, we might as well have the dictionary
+// part of the decompressor identity and track an associated decompressor along
+// with a decompression dictionary in the block cache, and the decompressor
+// hides potential details of processing the dictionary.
+struct DecompressorDict {
+  // Block containing the data for the compression dictionary in case the
+  // constructor that takes a string parameter is used.
+  std::string dict_str_;
+
+  // Block containing the data for the compression dictionary in case the
+  // constructor that takes a Slice parameter is used and the passed in
+  // CacheAllocationPtr is not nullptr.
+  CacheAllocationPtr dict_allocation_;
+
+  // A Decompressor referencing and using the dictionary owned by this.
+  std::unique_ptr<Decompressor> decompressor_;
+
+  // Approximate owned memory usage
+  size_t memory_usage_;
+
+  DecompressorDict(std::string&& dict, Decompressor& from_decompressor)
+      : dict_str_(std::move(dict)) {
+    Populate(from_decompressor, dict_str_);
+  }
+
+  DecompressorDict(Slice slice, CacheAllocationPtr&& allocation,
+                   Decompressor& from_decompressor)
+      : dict_allocation_(std::move(allocation)) {
+    Populate(from_decompressor, slice);
+  }
+
+  DecompressorDict(DecompressorDict&& rhs) noexcept
+      : dict_str_(std::move(rhs.dict_str_)),
+        dict_allocation_(std::move(rhs.dict_allocation_)),
+        decompressor_(std::move(rhs.decompressor_)),
+        memory_usage_(std::move(rhs.memory_usage_)) {}
+
+  DecompressorDict& operator=(DecompressorDict&& rhs) noexcept {
+    if (this == &rhs) {
+      return *this;
+    }
+    dict_str_ = std::move(rhs.dict_str_);
+    dict_allocation_ = std::move(rhs.dict_allocation_);
+    decompressor_ = std::move(rhs.decompressor_);
+    return *this;
+  }
+  // Disable copy
+  DecompressorDict(const DecompressorDict&) = delete;
+  DecompressorDict& operator=(const DecompressorDict&) = delete;
+
+  // The object is self-contained if the string constructor is used, or the
+  // Slice constructor is invoked with a non-null allocation. Otherwise, it
+  // is the caller's responsibility to ensure that the underlying storage
+  // outlives this object.
+  bool own_bytes() const { return !dict_str_.empty() || dict_allocation_; }
+
+  const Slice& GetRawDict() const { return decompressor_->GetSerializedDict(); }
+
+  // For TypedCacheInterface
+  const Slice& ContentSlice() const { return GetRawDict(); }
+  static constexpr CacheEntryRole kCacheEntryRole = CacheEntryRole::kOtherBlock;
+  static constexpr BlockType kBlockType = BlockType::kCompressionDictionary;
+
+  size_t ApproximateMemoryUsage() const { return memory_usage_; }
+
+ private:
+  void Populate(Decompressor& from_decompressor, Slice dict) {
+    Status s = from_decompressor.MaybeCloneForDict(dict, &decompressor_);
+    if (decompressor_ == nullptr) {
+      dict_str_ = {};
+      dict_allocation_ = {};
+      assert(!s.ok());
+      decompressor_ = std::make_unique<FailureDecompressor>(std::move(s));
+    } else {
+      assert(s.ok());
+    }
+
+    memory_usage_ = sizeof(struct DecompressorDict);
+    memory_usage_ += dict_str_.size();
+    if (dict_allocation_) {
+      auto allocator = dict_allocation_.get_deleter().allocator;
+      if (allocator) {
+        memory_usage_ +=
+            allocator->UsableSize(dict_allocation_.get(), GetRawDict().size());
+      } else {
+        memory_usage_ += GetRawDict().size();
+      }
+    }
+    memory_usage_ += decompressor_->ApproximateOwnedMemoryUsage();
+  }
+};
+
 // Holds dictionary and related data, like ZSTD's digested compression
 // dictionary.
 struct CompressionDict {
@@ -355,7 +892,7 @@ struct UncompressionDict {
   UncompressionDict& operator=(const CompressionDict&) = delete;
 };
 
-class CompressionContext {
+class CompressionContext : public Compressor::WorkingArea {
  private:
 #ifdef ZSTD
   ZSTD_CCtx* zstd_ctx_ = nullptr;
@@ -447,7 +984,7 @@ class CompressionInfo {
 
 // This is like a working area, reusable for different dicts, etc.
 // TODO: refactor / consolidate
-class UncompressionContext {
+class UncompressionContext : public Decompressor::WorkingArea {
  private:
   CompressionContextCache* ctx_cache_ = nullptr;
   ZSTDUncompressCachedData uncomp_cached_data_;
@@ -612,6 +1149,7 @@ inline bool DictCompressionTypeSupported(CompressionType compression_type) {
   }
 }
 
+// WART: does not match OptionsHelper::compression_type_string_map
 inline std::string CompressionTypeToString(CompressionType compression_type) {
   switch (compression_type) {
     case kNoCompression:
@@ -638,8 +1176,56 @@ inline std::string CompressionTypeToString(CompressionType compression_type) {
   }
 }
 
+// WART: does not match OptionsHelper::compression_type_string_map
+inline CompressionType CompressionTypeFromString(
+    std::string compression_type_str) {
+  if (!compression_type_str.empty()) {
+    switch (compression_type_str[0]) {
+      case 'N':
+        if (compression_type_str == "NoCompression") {
+          return kNoCompression;
+        }
+        break;
+      case 'S':
+        if (compression_type_str == "Snappy") {
+          return kSnappyCompression;
+        }
+        break;
+      case 'Z':
+        if (compression_type_str == "ZSTD") {
+          return kZSTD;
+        }
+        if (compression_type_str == "Zlib") {
+          return kZlibCompression;
+        }
+        break;
+      case 'B':
+        if (compression_type_str == "BZip2") {
+          return kBZip2Compression;
+        }
+        break;
+      case 'L':
+        if (compression_type_str == "LZ4") {
+          return kLZ4Compression;
+        }
+        if (compression_type_str == "LZ4HC") {
+          return kLZ4HCCompression;
+        }
+        break;
+      case 'X':
+        if (compression_type_str == "Xpress") {
+          return kXpressCompression;
+        }
+        break;
+      default:;
+    }
+  }
+  // unrecognized
+  return kDisableCompressionOption;
+}
+
 inline std::string CompressionOptionsToString(
-    CompressionOptions& compression_options) {
+    const CompressionOptions& compression_options) {
   std::string result;
   result.reserve(512);
   result.append("window_bits=")
@@ -1543,10 +2129,10 @@ inline std::string ZSTD_FinalizeDictionary(
 #endif  // ROCKSDB_ZDICT_FINALIZE
 }
 
-inline bool CompressData(const Slice& raw,
-                         const CompressionInfo& compression_info,
-                         uint32_t compress_format_version,
-                         std::string* compressed_output) {
+inline bool OLD_CompressData(const Slice& raw,
+                             const CompressionInfo& compression_info,
+                             uint32_t compress_format_version,
+                             std::string* compressed_output) {
   bool ret = false;
 
   // Will return compressed block contents if (1) the compression method is
@@ -1590,7 +2176,7 @@ inline bool CompressData(const Slice& raw,
   return ret;
 }
 
-inline CacheAllocationPtr UncompressData(
+inline CacheAllocationPtr OLD_UncompressData(
     const UncompressionInfo& uncompression_info, const char* data, size_t n,
     size_t* uncompressed_size, uint32_t compress_format_version,
     MemoryAllocator* allocator = nullptr,
@@ -1621,6 +2207,19 @@ inline CacheAllocationPtr UncompressData(
   }
 }
 
+// ***********************************************************************
+// BEGIN built-in implementation of customization interface
+// ***********************************************************************
+
+// NOTE: to avoid compression API depending on block-based table API, uses
+// its own format version. See internal function GetCompressFormatForVersion()
+const std::shared_ptr<CompressionManager>& GetBuiltinCompressionManager(
+    int compression_format_version);
+
+// ***********************************************************************
+// END built-in implementation of customization interface
+// ***********************************************************************
+
 // Records the compression type for subsequent WAL records.
 class CompressionTypeRecord {
  public:
@@ -1797,4 +2396,10 @@ class ZSTDStreamingUncompress final : public StreamingUncompress {
 #endif
 };
 
+#ifndef NDEBUG
+// 0 == disable the hack
+// > 0 => counter for rotating through compression types
+extern RelaxedAtomic<uint64_t> g_hack_mixed_compression;
+#endif
+
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/utilities/blob_db/blob_db_impl.cc b/utilities/blob_db/blob_db_impl.cc
index aba7e5d2fa7a..7bc20f7bf5d2 100644
--- a/utilities/blob_db/blob_db_impl.cc
+++ b/utilities/blob_db/blob_db_impl.cc
@@ -1160,12 +1160,20 @@ Slice BlobDBImpl::GetCompressedSlice(const Slice& raw,
   CompressionOptions opts;
   CompressionContext context(type, opts);
   CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(), type);
-  CompressData(raw, info,
-               GetCompressFormatForVersion(kBlockBasedTableVersionFormat),
-               compression_output);
+  OLD_CompressData(raw, info,
+                   GetCompressFormatForVersion(kBlockBasedTableVersionFormat),
+                   compression_output);
   return *compression_output;
 }
 
+Decompressor& BlobDecompressor() {
+  static auto mgr = GetBuiltinCompressionManager(
+      GetCompressFormatForVersion(kBlockBasedTableVersionFormat));
+  static auto decompressor = mgr->GetDecompressor();
+
+  return *decompressor;
+}
+
 Status BlobDBImpl::DecompressSlice(const Slice& compressed_value,
                                    CompressionType compression_type,
                                    PinnableSlice* value_output) const {
@@ -1177,12 +1185,9 @@ Status BlobDBImpl::DecompressSlice(const Slice& compressed_value,
   {
     StopWatch decompression_sw(clock_, statistics_,
                                BLOB_DB_DECOMPRESSION_MICROS);
-    UncompressionContext context(compression_type);
-    UncompressionInfo info(context, UncompressionDict::GetEmptyDict(),
-                           compression_type);
-    Status s = UncompressBlockData(
-        info, compressed_value.data(), compressed_value.size(), &contents,
-        kBlockBasedTableVersionFormat, cfh->cfd()->ioptions());
+    Status s = DecompressBlockData(
+        compressed_value.data(), compressed_value.size(), compression_type,
+        BlobDecompressor(), &contents, cfh->cfd()->ioptions());
     if (!s.ok()) {
       return Status::Corruption("Unable to decompress blob.");
     }
diff --git a/utilities/blob_db/blob_db_impl.h b/utilities/blob_db/blob_db_impl.h
index 75776e6a8a7f..42eefd0149b0 100644
--- a/utilities/blob_db/blob_db_impl.h
+++ b/utilities/blob_db/blob_db_impl.h
@@ -509,5 +509,7 @@ class BlobDBImpl : public BlobDB {
   uint32_t debug_level_;
 };
 
+Decompressor& BlobDecompressor();
+
 }  // namespace blob_db
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/utilities/blob_db/blob_dump_tool.cc b/utilities/blob_db/blob_dump_tool.cc
index 933803f8f30d..e42a2fa49ad2 100644
--- a/utilities/blob_db/blob_dump_tool.cc
+++ b/utilities/blob_db/blob_dump_tool.cc
@@ -19,6 +19,7 @@
 #include "table/format.h"
 #include "util/coding.h"
 #include "util/string_util.h"
+#include "utilities/blob_db/blob_db_impl.h"
 
 namespace ROCKSDB_NAMESPACE::blob_db {
 
@@ -210,9 +211,9 @@ Status BlobDumpTool::DumpRecord(DisplayType show_key, DisplayType show_blob,
     UncompressionContext context(compression);
     UncompressionInfo info(context, UncompressionDict::GetEmptyDict(),
                            compression);
-    s = UncompressBlockData(
-        info, slice.data() + key_size, static_cast<size_t>(value_size),
-        &contents, 2 /*compress_format_version*/, ImmutableOptions(Options()));
+    s = DecompressBlockData(
+        slice.data() + key_size, static_cast<size_t>(value_size), compression,
+        BlobDecompressor(), &contents, ImmutableOptions(Options()));
     if (!s.ok()) {
       return s;
     }

From 024194420c95609ee8b4d8f9f57ecd714ec86107 Mon Sep 17 00:00:00 2001
From: Jay Huh <jewoongh@meta.com>
Date: Thu, 15 May 2025 17:19:34 -0700
Subject: [PATCH 092/500] Add ColumnFamily Info to CompactionServiceJobInfo
 (#13615)

Summary:
Similar to https://github.com/facebook/rocksdb/pull/13555, add more info, ColumnFamily Id and name, to `CompactionServiceJobInfo`.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13615

Test Plan:
Updated Unit Test
```
./compaction_service_test
```

Reviewed By: archang19

Differential Revision: D74845661

Pulled By: jaykorean

fbshipit-source-id: e2fc61006092b9febec1c6637b92cb00fb6cb73e
---
 db/compaction/compaction_service_job.cc  |  4 ++-
 db/compaction/compaction_service_test.cc | 33 +++++++++++++++++++++---
 include/rocksdb/options.h                | 11 +++++++-
 3 files changed, 42 insertions(+), 6 deletions(-)

diff --git a/db/compaction/compaction_service_job.cc b/db/compaction/compaction_service_job.cc
index f6375de11722..69f51fc1982a 100644
--- a/db/compaction/compaction_service_job.cc
+++ b/db/compaction/compaction_service_job.cc
@@ -74,7 +74,9 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService(
       compaction->column_family_data()->GetName().c_str(), job_id_,
       compaction_input.output_level, input_files_oss.str().c_str());
   CompactionServiceJobInfo info(
-      dbname_, db_id_, db_session_id_, GetCompactionId(sub_compact),
+      dbname_, db_id_, db_session_id_,
+      compaction->column_family_data()->GetID(),
+      compaction->column_family_data()->GetName(), GetCompactionId(sub_compact),
       thread_pri_, compaction->compaction_reason(),
       compaction->is_full_compaction(), compaction->is_manual_compaction(),
       compaction->bottommost_level(), compaction->start_level(),
diff --git a/db/compaction/compaction_service_test.cc b/db/compaction/compaction_service_test.cc
index 1f245cb62c09..e59185b64212 100644
--- a/db/compaction/compaction_service_test.cc
+++ b/db/compaction/compaction_service_test.cc
@@ -21,10 +21,10 @@ class MyTestCompactionService : public CompactionService {
       : db_path_(std::move(db_path)),
         options_(options),
         statistics_(statistics),
-        start_info_("na", "na", "na", 0, Env::TOTAL, CompactionReason::kUnknown,
-                    false, false, false, -1, -1),
-        wait_info_("na", "na", "na", 0, Env::TOTAL, CompactionReason::kUnknown,
-                   false, false, false, -1, -1),
+        start_info_("na", "na", "na", 0, "na", 0, Env::TOTAL,
+                    CompactionReason::kUnknown, false, false, false, -1, -1),
+        wait_info_("na", "na", "na", 0, "na", 0, Env::TOTAL,
+                   CompactionReason::kUnknown, false, false, false, -1, -1),
         listeners_(listeners),
         table_properties_collector_factories_(
             std::move(table_properties_collector_factories)) {}
@@ -434,6 +434,30 @@ TEST_F(CompactionServiceTest, ManualCompaction) {
   ASSERT_OK(result.status);
   ASSERT_TRUE(result.stats.is_manual_compaction);
   ASSERT_TRUE(result.stats.is_remote_compaction);
+
+  auto info = my_cs->GetCompactionInfoForStart();
+  ASSERT_EQ(0, info.cf_id);
+  ASSERT_EQ(kDefaultColumnFamilyName, info.cf_name);
+
+  info = my_cs->GetCompactionInfoForWait();
+  ASSERT_EQ(0, info.cf_id);
+  ASSERT_EQ(kDefaultColumnFamilyName, info.cf_name);
+
+  // Test non-default CF
+  ASSERT_OK(
+      db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr));
+  my_cs->GetResult(&result);
+  ASSERT_OK(result.status);
+  ASSERT_TRUE(result.stats.is_manual_compaction);
+  ASSERT_TRUE(result.stats.is_remote_compaction);
+
+  info = my_cs->GetCompactionInfoForStart();
+  ASSERT_EQ(handles_[1]->GetID(), info.cf_id);
+  ASSERT_EQ(handles_[1]->GetName(), info.cf_name);
+
+  info = my_cs->GetCompactionInfoForWait();
+  ASSERT_EQ(handles_[1]->GetID(), info.cf_id);
+  ASSERT_EQ(handles_[1]->GetName(), info.cf_name);
 }
 
 TEST_F(CompactionServiceTest, CompactionOutputFileIOError) {
@@ -1371,6 +1395,7 @@ TEST_F(CompactionServiceTest, CompactionInfo) {
   ASSERT_EQ(true, info.bottommost_level);
   ASSERT_EQ(1, info.base_input_level);
   ASSERT_EQ(2, info.output_level);
+  ASSERT_EQ(kDefaultColumnFamilyName, info.cf_name);
 
   // Test priority BOTTOM
   env_->SetBackgroundThreads(1, Env::BOTTOM);
diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index a116e165f413..6e21fbe7fbf4 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -464,6 +464,12 @@ struct CompactionServiceJobInfo {
   std::string db_name;
   std::string db_id;
   std::string db_session_id;
+
+  // the id of the column family where the compaction happened.
+  uint32_t cf_id;
+  // the name of the column family where the compaction happened.
+  std::string cf_name;
+
   uint64_t job_id;  // job_id is only unique within the current DB and session,
                     // restart DB will reset the job_id. `db_id` and
                     // `db_session_id` could help you build unique id across
@@ -484,7 +490,8 @@ struct CompactionServiceJobInfo {
   int output_level;
 
   CompactionServiceJobInfo(std::string db_name_, std::string db_id_,
-                           std::string db_session_id_, uint64_t job_id_,
+                           std::string db_session_id_, uint32_t cf_id_,
+                           std::string cf_name_, uint64_t job_id_,
                            Env::Priority priority_,
                            CompactionReason compaction_reason_,
                            bool is_full_compaction_, bool is_manual_compaction_,
@@ -493,6 +500,8 @@ struct CompactionServiceJobInfo {
       : db_name(std::move(db_name_)),
         db_id(std::move(db_id_)),
         db_session_id(std::move(db_session_id_)),
+        cf_id(cf_id_),
+        cf_name(std::move(cf_name_)),
         job_id(job_id_),
         priority(priority_),
         compaction_reason(compaction_reason_),

From b42bf48310c87fd9b2c4ed6b51177efa242405d6 Mon Sep 17 00:00:00 2001
From: Changyu Bi <102700264+cbi42@users.noreply.github.com>
Date: Fri, 16 May 2025 11:51:58 -0700
Subject: [PATCH 093/500] Add stats for WBWI ingestion and transaction size
 (#13611)

Summary:
Add stats to monitor the large transaction optimization. A stat is added for how many times wbwi ingestion is used. A histogram is added to track transaction size. We could also just track write batch size for all writes but I don't want to add the overhead to all writes yet.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13611

Test Plan:
ran `python3 ./tools/db_crashtest.py --txn blackbox  --txn_write_policy=0 --commit_bypass_memtable_one_in=50 --test_batches_snapshots=0 --stats_dump_period_sec=2 --dump_malloc_stats=0 --statistics=1` and manually check LOG files
```
rocksdb.number.wbwi.ingest COUNT : 57
...
rocksdb.num.op.per.transaction P50 : 1.000000 P95 : 1.000000 P99 : 1.000000 P100 : 1.000000 COUNT : 2265 SUM : 2265
```

Reviewed By: jowlyzhang

Differential Revision: D74829087

Pulled By: cbi42

fbshipit-source-id: 5a9c3ab2d4cb6071cedfc47201ce2cf65a77d3c6
---
 db/db_impl/db_impl_write.cc                       | 1 +
 include/rocksdb/statistics.h                      | 9 +++++++++
 monitoring/statistics.cc                          | 2 ++
 utilities/transactions/pessimistic_transaction.cc | 5 ++++-
 4 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/db/db_impl/db_impl_write.cc b/db/db_impl/db_impl_write.cc
index 7f91ed65f7d7..667e4750c7d6 100644
--- a/db/db_impl/db_impl_write.cc
+++ b/db/db_impl/db_impl_write.cc
@@ -929,6 +929,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
                                /*min_prep_log=*/log_ref, last_sequence,
                                /*memtable_updated=*/memtable_update_count > 0,
                                write_options.ignore_missing_column_families);
+      RecordTick(stats_, NUMBER_WBWI_INGEST);
     }
   }
 
diff --git a/include/rocksdb/statistics.h b/include/rocksdb/statistics.h
index 21d7705b9f0c..cec02261fb57 100644
--- a/include/rocksdb/statistics.h
+++ b/include/rocksdb/statistics.h
@@ -536,6 +536,12 @@ enum Tickers : uint32_t {
   FILE_READ_CORRUPTION_RETRY_COUNT,
   FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT,
 
+  // Counter for the number of times a WBWI is ingested into the DB. This
+  // happens when IngestWriteBatchWithIndex() is used and when large
+  // transaction optimization is enabled through
+  // TransactionOptions::large_txn_commit_optimize_threshold.
+  NUMBER_WBWI_INGEST,
+
   TICKER_ENUM_MAX
 };
 
@@ -673,6 +679,9 @@ enum Histograms : uint32_t {
   // system's prefetch) from the end of SST table during block based table open
   TABLE_OPEN_PREFETCH_TAIL_READ_BYTES,
 
+  // Number of operations per transaction.
+  NUM_OP_PER_TRANSACTION,
+
   HISTOGRAM_ENUM_MAX
 };
 
diff --git a/monitoring/statistics.cc b/monitoring/statistics.cc
index af63c639f34a..0dbc0ac2ba8f 100644
--- a/monitoring/statistics.cc
+++ b/monitoring/statistics.cc
@@ -272,6 +272,7 @@ const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
      "rocksdb.file.read.corruption.retry.count"},
     {FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT,
      "rocksdb.file.read.corruption.retry.success.count"},
+    {NUMBER_WBWI_INGEST, "rocksdb.number.wbwi.ingest"},
 };
 
 const std::vector<std::pair<Histograms, std::string>> HistogramsNameMap = {
@@ -343,6 +344,7 @@ const std::vector<std::pair<Histograms, std::string>> HistogramsNameMap = {
     {ASYNC_PREFETCH_ABORT_MICROS, "rocksdb.async.prefetch.abort.micros"},
     {TABLE_OPEN_PREFETCH_TAIL_READ_BYTES,
      "rocksdb.table.open.prefetch.tail.read.bytes"},
+    {NUM_OP_PER_TRANSACTION, "rocksdb.num.op.per.transaction"},
 };
 
 std::shared_ptr<Statistics> CreateDBStatistics() {
diff --git a/utilities/transactions/pessimistic_transaction.cc b/utilities/transactions/pessimistic_transaction.cc
index 98634c94cd87..5243ec9a2570 100644
--- a/utilities/transactions/pessimistic_transaction.cc
+++ b/utilities/transactions/pessimistic_transaction.cc
@@ -892,7 +892,10 @@ Status WriteCommittedTxn::CommitInternal() {
   // any operations appended to this working_batch will be ignored from WAL
   working_batch->MarkWalTerminationPoint();
 
-  bool bypass_memtable = wb->Count() >= commit_bypass_memtable_threshold_;
+  uint32_t wb_count = wb->Count();
+  RecordInHistogram(db_impl_->immutable_db_options_.stats,
+                    NUM_OP_PER_TRANSACTION, wb_count);
+  bool bypass_memtable = wb_count >= commit_bypass_memtable_threshold_;
   if (!bypass_memtable) {
     // insert prepared batch into Memtable only skipping WAL.
     // Memtable will ignore BeginPrepare/EndPrepare markers

From 06d4f569a85a8303f747709f5a22f34dbd08e690 Mon Sep 17 00:00:00 2001
From: anand76 <anand1976@users.noreply.github.com>
Date: Fri, 16 May 2025 14:41:51 -0700
Subject: [PATCH 094/500] Fix external table ingestion workflow (#13608)

Summary:
Remove the dependency on `allow_db_generated_files` option in `IngestExternalFile` to be set for ingesting external tables. The files are created by SstFileWriter, and we should be able to ingest them. We could make it work by having the external table implementation provide the version and global sequence number related properties, but its safer to have RocksDB generate the table properties block and store it as is in the file.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13608

Test Plan: Add unit test to test basic ingestion and ingestion with atomic_replace_range

Reviewed By: pdillinger

Differential Revision: D74830707

Pulled By: anand1976

fbshipit-source-id: 4a9bea4a4f38f7c24c584262095c5c98cd771ddc
---
 include/rocksdb/external_table.h |  16 ++
 table/external_table.cc          | 129 +++++++++++++--
 table/meta_blocks.cc             | 273 ++++++++++++++++---------------
 table/meta_blocks.h              |   5 +
 table/table_test.cc              | 192 ++++++++++++++++++++--
 5 files changed, 460 insertions(+), 155 deletions(-)

diff --git a/include/rocksdb/external_table.h b/include/rocksdb/external_table.h
index 3adfdf4f3368..4bfad214e253 100644
--- a/include/rocksdb/external_table.h
+++ b/include/rocksdb/external_table.h
@@ -125,6 +125,17 @@ class ExternalTableReader {
                         std::vector<std::string>* values,
                         std::vector<Status>* statuses) = 0;
 
+  // Allocate and return the contents of the properties block. If the builder
+  // supports PutPropertiesBlock(), then this must be supported. The
+  // properties block should be written to the table file as is (no
+  // compression or mutation of any kind), and its offset in the file
+  // should be returned in file_offset.
+  virtual Status GetPropertiesBlock(std::unique_ptr<char[]>* /*property_block*/,
+                                    uint64_t* /*size*/,
+                                    uint64_t* /*file_offset*/) {
+    return Status::NotSupported();
+  }
+
   // Return TableProperties for the file. At a minimum, the following
   // properties need to be returned -
   // comparator_name
@@ -179,6 +190,11 @@ class ExternalTableBuilder {
   // Finish().
   virtual uint64_t FileSize() const = 0;
 
+  // Write the raw properties block as is in the table file
+  virtual Status PutPropertiesBlock(const Slice& /*property_block*/) {
+    return Status::NotSupported();
+  }
+
   //  As mentioned in earlier comments, the following table properties must be
   //  returned at a minimum -
   //  comparator_name
diff --git a/table/external_table.cc b/table/external_table.cc
index e2eb3e4f4ab7..70abe82dba4c 100644
--- a/table/external_table.cc
+++ b/table/external_table.cc
@@ -5,8 +5,11 @@
 
 #include "rocksdb/external_table.h"
 
+#include "logging/logging.h"
 #include "rocksdb/table.h"
+#include "table/block_based/block.h"
 #include "table/internal_iterator.h"
+#include "table/meta_blocks.h"
 #include "table/table_builder.h"
 #include "table/table_reader.h"
 
@@ -156,8 +159,9 @@ class ExternalTableIteratorAdapter : public InternalIterator {
 class ExternalTableReaderAdapter : public TableReader {
  public:
   explicit ExternalTableReaderAdapter(
+      const ImmutableOptions& ioptions,
       std::unique_ptr<ExternalTableReader>&& reader)
-      : reader_(std::move(reader)) {}
+      : ioptions_(ioptions), reader_(std::move(reader)) {}
 
   ~ExternalTableReaderAdapter() override {}
 
@@ -193,9 +197,34 @@ class ExternalTableReaderAdapter : public TableReader {
   void SetupForCompaction() override {}
 
   std::shared_ptr<const TableProperties> GetTableProperties() const override {
-    std::shared_ptr<TableProperties> props =
-        std::make_shared<TableProperties>(*reader_->GetTableProperties());
-    props->key_largest_seqno = 0;
+    std::shared_ptr<TableProperties> props;
+    std::unique_ptr<char[]> property_block;
+    uint64_t property_block_size = 0;
+    uint64_t property_block_offset = 0;
+    Status s;
+    // Get the raw properties block from the external table reader. We don't
+    // support writing the global sequence number, but we still get and return
+    // the correct global seqno offset in the file to prevent accidental
+    // corruption.
+    s = reader_->GetPropertiesBlock(&property_block, &property_block_size,
+                                    &property_block_offset);
+    if (s.ok()) {
+      std::unique_ptr<TableProperties> table_properties =
+          std::make_unique<TableProperties>();
+      BlockContents block_contents(std::move(property_block),
+                                   property_block_size);
+      Block block(std::move(block_contents));
+      s = ParsePropertiesBlock(ioptions_, property_block_offset, block,
+                               table_properties);
+      if (s.ok()) {
+        props.reset(table_properties.release());
+      }
+    } else {
+      // Fallback to getting a minimal table properties structure from the
+      // external table reader
+      props = std::make_shared<TableProperties>(*reader_->GetTableProperties());
+      props->key_largest_seqno = 0;
+    }
     return props;
   }
 
@@ -213,15 +242,54 @@ class ExternalTableReaderAdapter : public TableReader {
   }
 
  private:
+  const ImmutableOptions& ioptions_;
   std::unique_ptr<ExternalTableReader> reader_;
 };
 
 class ExternalTableBuilderAdapter : public TableBuilder {
  public:
   explicit ExternalTableBuilderAdapter(
+      const TableBuilderOptions& topts,
       std::unique_ptr<ExternalTableBuilder>&& builder,
       std::unique_ptr<FSWritableFile>&& file)
-      : builder_(std::move(builder)), file_(std::move(file)), num_entries_(0) {}
+      : builder_(std::move(builder)),
+        file_(std::move(file)),
+        ioptions_(topts.ioptions) {
+    properties_.num_data_blocks = 1;
+    properties_.index_size = 0;
+    properties_.filter_size = 0;
+    properties_.format_version = 0;
+    properties_.key_largest_seqno = 0;
+    properties_.column_family_id = topts.column_family_id;
+    properties_.column_family_name = topts.column_family_name;
+    properties_.db_id = topts.db_id;
+    properties_.db_session_id = topts.db_session_id;
+    properties_.db_host_id = topts.ioptions.db_host_id;
+    if (!ReifyDbHostIdProperty(topts.ioptions.env, &properties_.db_host_id)
+             .ok()) {
+      ROCKS_LOG_INFO(topts.ioptions.logger,
+                     "db_host_id property will not be set");
+    }
+    properties_.orig_file_number = topts.cur_file_num;
+    properties_.comparator_name = topts.ioptions.user_comparator != nullptr
+                                      ? topts.ioptions.user_comparator->Name()
+                                      : "nullptr";
+    properties_.prefix_extractor_name =
+        topts.moptions.prefix_extractor != nullptr
+            ? topts.moptions.prefix_extractor->AsString()
+            : "nullptr";
+
+    for (auto& factory : *topts.internal_tbl_prop_coll_factories) {
+      assert(factory);
+      std::unique_ptr<InternalTblPropColl> collector{
+          factory->CreateInternalTblPropColl(topts.column_family_id,
+                                             topts.level_at_creation,
+                                             topts.ioptions.num_levels)};
+      if (collector) {
+        table_properties_collectors_.emplace_back(std::move(collector));
+      }
+    }
+  }
 
   void Add(const Slice& key, const Slice& value) override {
     ParsedInternalKey pkey;
@@ -232,7 +300,12 @@ class ExternalTableBuilderAdapter : public TableBuilder {
             "Value type " + std::to_string(pkey.type) + "not supported");
       } else {
         builder_->Add(pkey.user_key, value);
-        num_entries_++;
+        properties_.num_entries++;
+        properties_.raw_key_size += key.size();
+        properties_.raw_value_size += value.size();
+        NotifyCollectTableCollectorsOnAdd(key, value, /*offset=*/0,
+                                          table_properties_collectors_,
+                                          ioptions_.logger);
       }
     }
   }
@@ -247,13 +320,37 @@ class ExternalTableBuilderAdapter : public TableBuilder {
 
   IOStatus io_status() const override { return status_to_io_status(status()); }
 
-  Status Finish() override { return builder_->Finish(); }
+  Status Finish() override {
+    // Approximate the data size
+    properties_.data_size =
+        properties_.raw_key_size + properties_.raw_value_size;
+
+    PropertyBlockBuilder property_block_builder;
+    property_block_builder.AddTableProperty(properties_);
+    UserCollectedProperties more_user_collected_properties;
+    NotifyCollectTableCollectorsOnFinish(
+        table_properties_collectors_, ioptions_.logger, &property_block_builder,
+        more_user_collected_properties, properties_.readable_properties);
+    properties_.user_collected_properties.insert(
+        more_user_collected_properties.begin(),
+        more_user_collected_properties.end());
+
+    Slice prop_block = property_block_builder.Finish();
+    Status s = builder_->PutPropertiesBlock(prop_block);
+    if (s.ok() || s.IsNotSupported()) {
+      // If the builder doesn't support writing the properties block,
+      // we still call Finish() and let the external builder handle it.
+      s = builder_->Finish();
+    }
+
+    return s;
+  }
 
   void Abandon() override { builder_->Abandon(); }
 
   uint64_t FileSize() const override { return builder_->FileSize(); }
 
-  uint64_t NumEntries() const override { return num_entries_; }
+  uint64_t NumEntries() const override { return properties_.num_entries; }
 
   TableProperties GetTableProperties() const override {
     return builder_->GetTableProperties();
@@ -271,7 +368,10 @@ class ExternalTableBuilderAdapter : public TableBuilder {
   Status status_;
   std::unique_ptr<ExternalTableBuilder> builder_;
   std::unique_ptr<FSWritableFile> file_;
-  uint64_t num_entries_;
+  const ImmutableOptions& ioptions_;
+  TableProperties properties_;
+  std::vector<std::unique_ptr<InternalTblPropColl>>
+      table_properties_collectors_;
 };
 
 class ExternalTableFactoryAdapter : public TableFactory {
@@ -288,6 +388,12 @@ class ExternalTableFactoryAdapter : public TableFactory {
       std::unique_ptr<RandomAccessFileReader>&& file, uint64_t /* file_size */,
       std::unique_ptr<TableReader>* table_reader,
       bool /* prefetch_index_and_filter_in_cache */) const override {
+    // SstFileReader specifies largest_seqno as kMaxSequenceNumber to denote
+    // that its unknown
+    if (topts.largest_seqno > 0 && topts.largest_seqno != kMaxSequenceNumber) {
+      return Status::NotSupported(
+          "Ingesting file with sequence number larger than 0");
+    }
     std::unique_ptr<ExternalTableReader> reader;
     FileOptions fopts(topts.env_options);
     ExternalTableOptions ext_topts(topts.prefix_extractor,
@@ -298,7 +404,8 @@ class ExternalTableFactoryAdapter : public TableFactory {
     if (!status.ok()) {
       return status;
     }
-    table_reader->reset(new ExternalTableReaderAdapter(std::move(reader)));
+    table_reader->reset(
+        new ExternalTableReaderAdapter(topts.ioptions, std::move(reader)));
     file.reset();
     return Status::OK();
   }
@@ -316,7 +423,7 @@ class ExternalTableFactoryAdapter : public TableFactory {
     builder.reset(inner_->NewTableBuilder(ext_topts, file->file_name(),
                                           file_wrapper.get()));
     if (builder) {
-      return new ExternalTableBuilderAdapter(std::move(builder),
+      return new ExternalTableBuilderAdapter(topts, std::move(builder),
                                              std::move(file_wrapper));
     }
     return nullptr;
diff --git a/table/meta_blocks.cc b/table/meta_blocks.cc
index bdc96c1c291a..89be11be21dd 100644
--- a/table/meta_blocks.cc
+++ b/table/meta_blocks.cc
@@ -253,6 +253,144 @@ bool NotifyCollectTableCollectorsOnFinish(
   return all_succeeded;
 }
 
+Status ParsePropertiesBlock(
+    const ImmutableOptions& ioptions, uint64_t offset, Block& properties_block,
+    std::unique_ptr<TableProperties>& new_table_properties) {
+  std::unique_ptr<MetaBlockIter> iter(properties_block.NewMetaIterator());
+
+  //  All pre-defined properties of type uint64_t
+  std::unordered_map<std::string, uint64_t*> predefined_uint64_properties = {
+      {TablePropertiesNames::kOriginalFileNumber,
+       &new_table_properties->orig_file_number},
+      {TablePropertiesNames::kDataSize, &new_table_properties->data_size},
+      {TablePropertiesNames::kIndexSize, &new_table_properties->index_size},
+      {TablePropertiesNames::kIndexPartitions,
+       &new_table_properties->index_partitions},
+      {TablePropertiesNames::kTopLevelIndexSize,
+       &new_table_properties->top_level_index_size},
+      {TablePropertiesNames::kIndexKeyIsUserKey,
+       &new_table_properties->index_key_is_user_key},
+      {TablePropertiesNames::kIndexValueIsDeltaEncoded,
+       &new_table_properties->index_value_is_delta_encoded},
+      {TablePropertiesNames::kFilterSize, &new_table_properties->filter_size},
+      {TablePropertiesNames::kRawKeySize, &new_table_properties->raw_key_size},
+      {TablePropertiesNames::kRawValueSize,
+       &new_table_properties->raw_value_size},
+      {TablePropertiesNames::kNumDataBlocks,
+       &new_table_properties->num_data_blocks},
+      {TablePropertiesNames::kNumEntries, &new_table_properties->num_entries},
+      {TablePropertiesNames::kNumFilterEntries,
+       &new_table_properties->num_filter_entries},
+      {TablePropertiesNames::kDeletedKeys,
+       &new_table_properties->num_deletions},
+      {TablePropertiesNames::kMergeOperands,
+       &new_table_properties->num_merge_operands},
+      {TablePropertiesNames::kNumRangeDeletions,
+       &new_table_properties->num_range_deletions},
+      {TablePropertiesNames::kFormatVersion,
+       &new_table_properties->format_version},
+      {TablePropertiesNames::kFixedKeyLen,
+       &new_table_properties->fixed_key_len},
+      {TablePropertiesNames::kColumnFamilyId,
+       &new_table_properties->column_family_id},
+      {TablePropertiesNames::kCreationTime,
+       &new_table_properties->creation_time},
+      {TablePropertiesNames::kOldestKeyTime,
+       &new_table_properties->oldest_key_time},
+      {TablePropertiesNames::kNewestKeyTime,
+       &new_table_properties->newest_key_time},
+      {TablePropertiesNames::kFileCreationTime,
+       &new_table_properties->file_creation_time},
+      {TablePropertiesNames::kSlowCompressionEstimatedDataSize,
+       &new_table_properties->slow_compression_estimated_data_size},
+      {TablePropertiesNames::kFastCompressionEstimatedDataSize,
+       &new_table_properties->fast_compression_estimated_data_size},
+      {TablePropertiesNames::kTailStartOffset,
+       &new_table_properties->tail_start_offset},
+      {TablePropertiesNames::kUserDefinedTimestampsPersisted,
+       &new_table_properties->user_defined_timestamps_persisted},
+      {TablePropertiesNames::kKeyLargestSeqno,
+       &new_table_properties->key_largest_seqno},
+  };
+
+  Status s;
+  std::string last_key;
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    s = iter->status();
+    if (!s.ok()) {
+      break;
+    }
+
+    auto key = iter->key().ToString();
+    // properties block should be strictly sorted with no duplicate key.
+    if (!last_key.empty() &&
+        BytewiseComparator()->Compare(key, last_key) <= 0) {
+      s = Status::Corruption("properties unsorted");
+      break;
+    }
+    last_key = key;
+
+    auto raw_val = iter->value();
+    auto pos = predefined_uint64_properties.find(key);
+
+    if (key == ExternalSstFilePropertyNames::kGlobalSeqno) {
+      new_table_properties->external_sst_file_global_seqno_offset =
+          offset + iter->ValueOffset();
+    }
+
+    if (pos != predefined_uint64_properties.end()) {
+      if (key == TablePropertiesNames::kDeletedKeys ||
+          key == TablePropertiesNames::kMergeOperands) {
+        // Insert in user-collected properties for API backwards compatibility
+        new_table_properties->user_collected_properties.insert(
+            {key, raw_val.ToString()});
+      }
+      // handle predefined rocksdb properties
+      uint64_t val;
+      if (!GetVarint64(&raw_val, &val)) {
+        // skip malformed value
+        auto error_msg =
+            "Detect malformed value in properties meta-block:"
+            "\tkey: " +
+            key + "\tval: " + raw_val.ToString();
+        ROCKS_LOG_ERROR(ioptions.logger, "%s", error_msg.c_str());
+        continue;
+      }
+      *(pos->second) = val;
+    } else if (key == TablePropertiesNames::kDbId) {
+      new_table_properties->db_id = raw_val.ToString();
+    } else if (key == TablePropertiesNames::kDbSessionId) {
+      new_table_properties->db_session_id = raw_val.ToString();
+    } else if (key == TablePropertiesNames::kDbHostId) {
+      new_table_properties->db_host_id = raw_val.ToString();
+    } else if (key == TablePropertiesNames::kFilterPolicy) {
+      new_table_properties->filter_policy_name = raw_val.ToString();
+    } else if (key == TablePropertiesNames::kColumnFamilyName) {
+      new_table_properties->column_family_name = raw_val.ToString();
+    } else if (key == TablePropertiesNames::kComparator) {
+      new_table_properties->comparator_name = raw_val.ToString();
+    } else if (key == TablePropertiesNames::kMergeOperator) {
+      new_table_properties->merge_operator_name = raw_val.ToString();
+    } else if (key == TablePropertiesNames::kPrefixExtractorName) {
+      new_table_properties->prefix_extractor_name = raw_val.ToString();
+    } else if (key == TablePropertiesNames::kPropertyCollectors) {
+      new_table_properties->property_collectors_names = raw_val.ToString();
+    } else if (key == TablePropertiesNames::kCompression) {
+      new_table_properties->compression_name = raw_val.ToString();
+    } else if (key == TablePropertiesNames::kCompressionOptions) {
+      new_table_properties->compression_options = raw_val.ToString();
+    } else if (key == TablePropertiesNames::kSequenceNumberTimeMapping) {
+      new_table_properties->seqno_to_time_mapping = raw_val.ToString();
+    } else {
+      // handle user-collected properties
+      new_table_properties->user_collected_properties.insert(
+          {key, raw_val.ToString()});
+    }
+  }
+
+  return s;
+}
+
 // FIXME: should be a parameter for reading table properties to use persistent
 // cache?
 Status ReadTablePropertiesHelper(
@@ -324,140 +462,9 @@ Status ReadTablePropertiesHelper(
 
     uint64_t block_size = block_contents.data.size();
     Block properties_block(std::move(block_contents));
-    // Unfortunately, Block::size() might not equal block_contents.data.size(),
-    // and Block hides block_contents
-    std::unique_ptr<MetaBlockIter> iter(properties_block.NewMetaIterator());
-
     std::unique_ptr<TableProperties> new_table_properties{new TableProperties};
-    // All pre-defined properties of type uint64_t
-    std::unordered_map<std::string, uint64_t*> predefined_uint64_properties = {
-        {TablePropertiesNames::kOriginalFileNumber,
-         &new_table_properties->orig_file_number},
-        {TablePropertiesNames::kDataSize, &new_table_properties->data_size},
-        {TablePropertiesNames::kIndexSize, &new_table_properties->index_size},
-        {TablePropertiesNames::kIndexPartitions,
-         &new_table_properties->index_partitions},
-        {TablePropertiesNames::kTopLevelIndexSize,
-         &new_table_properties->top_level_index_size},
-        {TablePropertiesNames::kIndexKeyIsUserKey,
-         &new_table_properties->index_key_is_user_key},
-        {TablePropertiesNames::kIndexValueIsDeltaEncoded,
-         &new_table_properties->index_value_is_delta_encoded},
-        {TablePropertiesNames::kFilterSize, &new_table_properties->filter_size},
-        {TablePropertiesNames::kRawKeySize,
-         &new_table_properties->raw_key_size},
-        {TablePropertiesNames::kRawValueSize,
-         &new_table_properties->raw_value_size},
-        {TablePropertiesNames::kNumDataBlocks,
-         &new_table_properties->num_data_blocks},
-        {TablePropertiesNames::kNumEntries, &new_table_properties->num_entries},
-        {TablePropertiesNames::kNumFilterEntries,
-         &new_table_properties->num_filter_entries},
-        {TablePropertiesNames::kDeletedKeys,
-         &new_table_properties->num_deletions},
-        {TablePropertiesNames::kMergeOperands,
-         &new_table_properties->num_merge_operands},
-        {TablePropertiesNames::kNumRangeDeletions,
-         &new_table_properties->num_range_deletions},
-        {TablePropertiesNames::kFormatVersion,
-         &new_table_properties->format_version},
-        {TablePropertiesNames::kFixedKeyLen,
-         &new_table_properties->fixed_key_len},
-        {TablePropertiesNames::kColumnFamilyId,
-         &new_table_properties->column_family_id},
-        {TablePropertiesNames::kCreationTime,
-         &new_table_properties->creation_time},
-        {TablePropertiesNames::kOldestKeyTime,
-         &new_table_properties->oldest_key_time},
-        {TablePropertiesNames::kNewestKeyTime,
-         &new_table_properties->newest_key_time},
-        {TablePropertiesNames::kFileCreationTime,
-         &new_table_properties->file_creation_time},
-        {TablePropertiesNames::kSlowCompressionEstimatedDataSize,
-         &new_table_properties->slow_compression_estimated_data_size},
-        {TablePropertiesNames::kFastCompressionEstimatedDataSize,
-         &new_table_properties->fast_compression_estimated_data_size},
-        {TablePropertiesNames::kTailStartOffset,
-         &new_table_properties->tail_start_offset},
-        {TablePropertiesNames::kUserDefinedTimestampsPersisted,
-         &new_table_properties->user_defined_timestamps_persisted},
-        {TablePropertiesNames::kKeyLargestSeqno,
-         &new_table_properties->key_largest_seqno},
-    };
-
-    std::string last_key;
-    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
-      s = iter->status();
-      if (!s.ok()) {
-        break;
-      }
-
-      auto key = iter->key().ToString();
-      // properties block should be strictly sorted with no duplicate key.
-      if (!last_key.empty() &&
-          BytewiseComparator()->Compare(key, last_key) <= 0) {
-        s = Status::Corruption("properties unsorted");
-        break;
-      }
-      last_key = key;
-
-      auto raw_val = iter->value();
-      auto pos = predefined_uint64_properties.find(key);
-
-      if (key == ExternalSstFilePropertyNames::kGlobalSeqno) {
-        new_table_properties->external_sst_file_global_seqno_offset =
-            handle.offset() + iter->ValueOffset();
-      }
-
-      if (pos != predefined_uint64_properties.end()) {
-        if (key == TablePropertiesNames::kDeletedKeys ||
-            key == TablePropertiesNames::kMergeOperands) {
-          // Insert in user-collected properties for API backwards compatibility
-          new_table_properties->user_collected_properties.insert(
-              {key, raw_val.ToString()});
-        }
-        // handle predefined rocksdb properties
-        uint64_t val;
-        if (!GetVarint64(&raw_val, &val)) {
-          // skip malformed value
-          auto error_msg =
-              "Detect malformed value in properties meta-block:"
-              "\tkey: " +
-              key + "\tval: " + raw_val.ToString();
-          ROCKS_LOG_ERROR(ioptions.logger, "%s", error_msg.c_str());
-          continue;
-        }
-        *(pos->second) = val;
-      } else if (key == TablePropertiesNames::kDbId) {
-        new_table_properties->db_id = raw_val.ToString();
-      } else if (key == TablePropertiesNames::kDbSessionId) {
-        new_table_properties->db_session_id = raw_val.ToString();
-      } else if (key == TablePropertiesNames::kDbHostId) {
-        new_table_properties->db_host_id = raw_val.ToString();
-      } else if (key == TablePropertiesNames::kFilterPolicy) {
-        new_table_properties->filter_policy_name = raw_val.ToString();
-      } else if (key == TablePropertiesNames::kColumnFamilyName) {
-        new_table_properties->column_family_name = raw_val.ToString();
-      } else if (key == TablePropertiesNames::kComparator) {
-        new_table_properties->comparator_name = raw_val.ToString();
-      } else if (key == TablePropertiesNames::kMergeOperator) {
-        new_table_properties->merge_operator_name = raw_val.ToString();
-      } else if (key == TablePropertiesNames::kPrefixExtractorName) {
-        new_table_properties->prefix_extractor_name = raw_val.ToString();
-      } else if (key == TablePropertiesNames::kPropertyCollectors) {
-        new_table_properties->property_collectors_names = raw_val.ToString();
-      } else if (key == TablePropertiesNames::kCompression) {
-        new_table_properties->compression_name = raw_val.ToString();
-      } else if (key == TablePropertiesNames::kCompressionOptions) {
-        new_table_properties->compression_options = raw_val.ToString();
-      } else if (key == TablePropertiesNames::kSequenceNumberTimeMapping) {
-        new_table_properties->seqno_to_time_mapping = raw_val.ToString();
-      } else {
-        // handle user-collected properties
-        new_table_properties->user_collected_properties.insert(
-            {key, raw_val.ToString()});
-      }
-    }
+    s = ParsePropertiesBlock(ioptions, handle.offset(), properties_block,
+                             new_table_properties);
 
     // Modified version of BlockFetcher checksum verification
     // (See write_global_seqno comment above)
diff --git a/table/meta_blocks.h b/table/meta_blocks.h
index a6aacdf5030a..bc7ad18734f0 100644
--- a/table/meta_blocks.h
+++ b/table/meta_blocks.h
@@ -22,6 +22,7 @@
 
 namespace ROCKSDB_NAMESPACE {
 
+class Block;
 class BlockBuilder;
 class BlockHandle;
 class Env;
@@ -110,6 +111,10 @@ bool NotifyCollectTableCollectorsOnFinish(
     UserCollectedProperties& user_collected_properties,
     UserCollectedProperties& readable_properties);
 
+Status ParsePropertiesBlock(
+    const ImmutableOptions& ioptions, uint64_t offset, Block& block,
+    std::unique_ptr<TableProperties>& new_table_properties);
+
 // Read table properties from a file using known BlockHandle.
 // @returns a status to indicate if the operation succeeded. On success,
 //          *table_properties will point to a heap-allocated TableProperties
diff --git a/table/table_test.cc b/table/table_test.cc
index 9ff03dfcda27..b381a88f3196 100644
--- a/table/table_test.cc
+++ b/table/table_test.cc
@@ -6545,6 +6545,13 @@ class ExternalTableTest : public DBTestBase {
 
     Status Serialize(
         const std::vector<std::pair<std::string, std::string>>& kv_vec) {
+      // First append the property block if one exists
+      uint32_t prop_block_size = static_cast<uint32_t>(prop_block_.length());
+      buf_.append(static_cast<char*>(static_cast<void*>(&prop_block_size)),
+                  sizeof(prop_block_size));
+      if (!prop_block_.empty()) {
+        buf_.append(prop_block_);
+      }
       for (auto& kv : kv_vec) {
         SerializeOne(kv.first, kv.second);
         props_.raw_key_size += kv.first.length();
@@ -6565,6 +6572,12 @@ class ExternalTableTest : public DBTestBase {
         return s;
       }
 
+      uint32_t prop_block_size = 0;
+      buf_.copy(static_cast<char*>(static_cast<void*>(&prop_block_size)),
+                sizeof(prop_block_size));
+      buf_.erase(0, sizeof(prop_block_size));
+      prop_block_.assign(buf_.substr(0, prop_block_size));
+      buf_.erase(0, prop_block_size);
       while (buf_.length() > 0) {
         std::pair<std::string, std::string> kv;
         s = DeserializeOne(kv);
@@ -6581,6 +6594,24 @@ class ExternalTableTest : public DBTestBase {
       return s;
     }
 
+    Status PutPropertiesBlock(const Slice& prop_block) {
+      prop_block_.assign(prop_block.data(), prop_block.size());
+      return Status::OK();
+    }
+
+    Status GetPropertiesBlock(std::unique_ptr<char[]>* block, uint64_t* size,
+                              uint64_t* file_offset) {
+      if (!prop_block_.empty()) {
+        *block = std::make_unique<char[]>(prop_block_.length());
+        memcpy(block->get(), prop_block_.data(), prop_block_.length());
+        *size = prop_block_.length();
+        *file_offset = sizeof(uint32_t);
+      } else {
+        *size = 0;
+      }
+      return Status::OK();
+    }
+
     TableProperties GetTableProperties() const { return props_; }
 
     uint64_t FileSize() const { return file_size_; }
@@ -6623,6 +6654,7 @@ class ExternalTableTest : public DBTestBase {
     std::string buf_;
     TableProperties props_;
     uint64_t file_size_;
+    std::string prop_block_;
   };
 
   class DummyExternalTableIterator : public ExternalTableIterator {
@@ -6768,8 +6800,10 @@ class ExternalTableTest : public DBTestBase {
 
   class DummyExternalTableReader : public ExternalTableReader {
    public:
-    explicit DummyExternalTableReader(const std::string& file_path)
-        : file_(file_path, /*file=*/nullptr) {
+    explicit DummyExternalTableReader(const std::string& file_path,
+                                      bool support_property_block)
+        : file_(file_path, /*file=*/nullptr),
+          support_property_block_(support_property_block) {
       Status s = file_.Deserialize(kv_map_);
       EXPECT_OK(s);
     }
@@ -6804,6 +6838,14 @@ class ExternalTableTest : public DBTestBase {
       }
     }
 
+    Status GetPropertiesBlock(std::unique_ptr<char[]>* block, uint64_t* size,
+                              uint64_t* file_offset) override {
+      if (!support_property_block_) {
+        return Status::NotSupported();
+      }
+      return file_.GetPropertiesBlock(block, size, file_offset);
+    }
+
     std::shared_ptr<const TableProperties> GetTableProperties() const override {
       std::shared_ptr<TableProperties> props =
           std::make_shared<TableProperties>();
@@ -6817,13 +6859,16 @@ class ExternalTableTest : public DBTestBase {
    private:
     std::map<std::string, std::string> kv_map_;
     DummyExternalTableFile file_;
+    bool support_property_block_;
   };
 
   class DummyExternalTableBuilder : public ExternalTableBuilder {
    public:
     explicit DummyExternalTableBuilder(const std::string& file_path,
-                                       FSWritableFile* file)
-        : file_(file_path, file) {}
+                                       FSWritableFile* file,
+                                       bool support_property_block)
+        : file_(file_path, file),
+          support_property_block_(support_property_block) {}
 
     void Add(const Slice& key, const Slice& value) override {
       if (!kv_vec_.empty()) {
@@ -6841,6 +6886,13 @@ class ExternalTableTest : public DBTestBase {
 
     uint64_t FileSize() const override { return file_.FileSize(); }
 
+    Status PutPropertiesBlock(const Slice& block) override {
+      if (!support_property_block_) {
+        return Status::NotSupported();
+      }
+      return file_.PutPropertiesBlock(block);
+    }
+
     TableProperties GetTableProperties() const override {
       return file_.GetTableProperties();
     }
@@ -6851,10 +6903,13 @@ class ExternalTableTest : public DBTestBase {
     std::vector<std::pair<std::string, std::string>> kv_vec_;
     DummyExternalTableFile file_;
     Status status_;
+    bool support_property_block_;
   };
 
   class DummyExternalTableFactory : public ExternalTableFactory {
    public:
+    explicit DummyExternalTableFactory(bool support_property_block)
+        : support_property_block_(support_property_block) {}
     const char* Name() const override { return "DummyExternalTableFactory"; }
 
     Status NewTableReader(
@@ -6864,21 +6919,27 @@ class ExternalTableTest : public DBTestBase {
       // Sanity check some options
       EXPECT_EQ(topts.file_options.handoff_checksum_type,
                 ChecksumType::kCRC32c);
-      table_reader->reset(new DummyExternalTableReader(file_path));
+      table_reader->reset(
+          new DummyExternalTableReader(file_path, support_property_block_));
       return Status::OK();
     }
 
     ExternalTableBuilder* NewTableBuilder(
         const ExternalTableBuilderOptions& /*opts*/,
         const std::string& file_path, FSWritableFile* file) const override {
-      return new DummyExternalTableBuilder(file_path, file);
+      return new DummyExternalTableBuilder(file_path, file,
+                                           support_property_block_);
     }
+
+   private:
+    bool support_property_block_;
   };
 };
 
 TEST_F(ExternalTableTest, BasicTest) {
   std::shared_ptr<ExternalTableFactory> factory =
-      std::make_shared<DummyExternalTableFactory>();
+      std::make_shared<DummyExternalTableFactory>(
+          /*support_property_block=*/false);
 
   std::string file_path = test::PerThreadDBPath("external_table");
   {
@@ -6935,7 +6996,8 @@ TEST_F(ExternalTableTest, SstReaderTest) {
   dbname += "_db";
 
   std::shared_ptr<ExternalTableFactory> factory =
-      std::make_shared<DummyExternalTableFactory>();
+      std::make_shared<DummyExternalTableFactory>(
+          /*support_property_block=*/false);
   options.table_factory = NewExternalTableFactory(factory);
 
   std::unique_ptr<SstFileWriter> writer;
@@ -6971,7 +7033,8 @@ TEST_F(ExternalTableTest, ExternalFileChecksumTest) {
   ASSERT_OK(DestroyDB(dbname, options));
 
   std::shared_ptr<ExternalTableFactory> factory =
-      std::make_shared<DummyExternalTableFactory>();
+      std::make_shared<DummyExternalTableFactory>(
+          /*support_property_block=*/true);
   options.table_factory = NewExternalTableFactory(factory);
 
   // Create a file
@@ -7006,7 +7069,8 @@ TEST_F(ExternalTableTest, DBIterTest) {
   ASSERT_OK(DestroyDB(dbname, options));
 
   std::shared_ptr<ExternalTableFactory> factory =
-      std::make_shared<DummyExternalTableFactory>();
+      std::make_shared<DummyExternalTableFactory>(
+          /*support_property_block=*/true);
   options.table_factory = NewExternalTableFactory(factory);
 
   // Create a file
@@ -7062,7 +7126,8 @@ TEST_F(ExternalTableTest, DBMultiScanTest) {
   ASSERT_OK(DestroyDB(dbname, options));
 
   std::shared_ptr<ExternalTableFactory> factory =
-      std::make_shared<DummyExternalTableFactory>();
+      std::make_shared<DummyExternalTableFactory>(
+          /*support_property_block=*/true);
   options.table_factory = NewExternalTableFactory(factory);
 
   // Create a file
@@ -7206,6 +7271,111 @@ TEST_F(ExternalTableTest, DBMultiScanTest) {
   ASSERT_OK(db->DestroyColumnFamilyHandle(cfh));
   ASSERT_OK(db->Close());
 }
+
+TEST_F(ExternalTableTest, IngestionTest) {
+  if (encrypted_env_) {
+    ROCKSDB_GTEST_SKIP("Test requires non-encrypted environment");
+    return;
+  }
+  Options options = GetDefaultOptions();
+  std::string dbname = test::PerThreadDBPath("external_table_test");
+  std::string ingest_file = dbname + "test.immutable";
+  dbname += "_db";
+  ASSERT_OK(DestroyDB(dbname, options));
+
+  std::shared_ptr<ExternalTableFactory> factory =
+      std::make_shared<DummyExternalTableFactory>(
+          /*support_property_block=*/true);
+  options.table_factory = NewExternalTableFactory(factory);
+
+  // Create a file
+  std::unique_ptr<SstFileWriter> writer;
+  writer.reset(new SstFileWriter(EnvOptions(), options));
+  ASSERT_OK(writer->Open(ingest_file));
+  ASSERT_OK(writer->Put("foo", "bar"));
+  ASSERT_OK(writer->Put("foo2", "bar2"));
+  ASSERT_OK(writer->Finish());
+  writer.reset();
+
+  std::unique_ptr<DB> db;
+  options.create_if_missing = true;
+  Status s = DB::Open(options, dbname, &db);
+  ASSERT_OK(s);
+  ASSERT_TRUE(db != nullptr);
+  ColumnFamilyHandle* cfh = nullptr;
+  ASSERT_OK(db->CreateColumnFamily(options, "new_cf", &cfh));
+
+  IngestExternalFileOptions ifo;
+  ifo.allow_db_generated_files = false;
+  ifo.fill_cache = false;
+  s = db->IngestExternalFile(cfh, {ingest_file}, ifo);
+  ASSERT_OK(s);
+
+  std::unique_ptr<Iterator> iter(db->NewIterator({}, cfh));
+  ASSERT_NE(iter, nullptr);
+  iter->Seek("foo");
+  ASSERT_TRUE(iter->Valid() && iter->status().ok());
+  ASSERT_EQ(iter->value(), "bar");
+  iter->Next();
+  ASSERT_TRUE(iter->Valid() && iter->status().ok());
+  ASSERT_EQ(iter->key(), "foo2");
+  ASSERT_EQ(iter->value(), "bar2");
+  iter->Next();
+  ASSERT_FALSE(iter->Valid());
+  ASSERT_OK(iter->status());
+  iter.reset();
+
+  // Create an overlapping file to ingest with atomic_replace_range option
+  ingest_file += "2";
+  writer.reset(new SstFileWriter(EnvOptions(), options));
+  ASSERT_OK(writer->Open(ingest_file));
+  ASSERT_OK(writer->Put("foo", "val"));
+  ASSERT_OK(writer->Put("foo2", "val2"));
+  ASSERT_OK(writer->Finish());
+  writer.reset();
+
+  ifo.snapshot_consistency = false;
+  s = db->IngestExternalFiles({{cfh,
+                                {ingest_file},
+                                ifo,
+                                {},
+                                {},
+                                Temperature::kUnknown,
+                                {{nullptr, nullptr}}}});
+  ASSERT_OK(s);
+
+  iter.reset(db->NewIterator({}, cfh));
+  ASSERT_NE(iter, nullptr);
+  iter->Seek("foo");
+  ASSERT_TRUE(iter->Valid() && iter->status().ok());
+  ASSERT_EQ(iter->value(), "val");
+  iter->Next();
+  ASSERT_TRUE(iter->Valid() && iter->status().ok());
+  ASSERT_EQ(iter->key(), "foo2");
+  ASSERT_EQ(iter->value(), "val2");
+  iter->Next();
+  ASSERT_FALSE(iter->Valid());
+  ASSERT_OK(iter->status());
+  iter.reset();
+
+  // Create an overlapping file to ingest without atomic_replace_range option.
+  // This should fail as we don't support ingesting an external file with
+  // non-zero assigned sequence number.
+  ingest_file += "3";
+  writer.reset(new SstFileWriter(EnvOptions(), options));
+  ASSERT_OK(writer->Open(ingest_file));
+  ASSERT_OK(writer->Put("foo", "newval"));
+  ASSERT_OK(writer->Put("foo2", "newval2"));
+  ASSERT_OK(writer->Finish());
+  writer.reset();
+
+  s = db->IngestExternalFiles(
+      {{cfh, {ingest_file}, ifo, {}, {}, Temperature::kUnknown, {}}});
+  ASSERT_EQ(s, Status::NotSupported());
+
+  ASSERT_OK(db->DestroyColumnFamilyHandle(cfh));
+  ASSERT_OK(db->Close());
+}
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {

From 83026c7db2fcfb873374537b217c4b59b0485f16 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Fri, 16 May 2025 17:19:15 -0700
Subject: [PATCH 095/500] Fix handling of old files with compression dictionary
 but no compression (#13618)

Summary:
Before the fix to https://github.com/facebook/rocksdb/issues/12409 in https://github.com/facebook/rocksdb/issues/12453, SST files could have a compression dictionary but be configured for no compression. Recent PR https://github.com/facebook/rocksdb/issues/13540 regressed on handling this safely on the read side, which was caught by the format compatibile nightly test (recently expanded to cover dictionary compression in https://github.com/facebook/rocksdb/issues/13414).

This change fixes that regression.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13618

Test Plan: manual and ongoing format compatibility test runs. (I don't think this case is worth introducing a back door to create a uselessly inefficient SST file, considering it's covered by nightly CI.)

Reviewed By: cbi42

Differential Revision: D74914868

Pulled By: pdillinger

fbshipit-source-id: 5a4ab058d0d6da275eefb2df1a7454d8a4b2031f
---
 table/block_based/block_based_table_reader.cc | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc
index 1bf777a87a8c..ebb895027a14 100644
--- a/table/block_based/block_based_table_reader.cc
+++ b/table/block_based/block_based_table_reader.cc
@@ -1243,7 +1243,10 @@ Status BlockBasedTable::PrefetchIndexAndFilterBlocks(
     }
   }
 
-  if (!rep_->compression_dict_handle.IsNull()) {
+  // NOTE: before the fix to https://github.com/facebook/rocksdb/issues/12409, a
+  // file could have a (de)compression dictionary block without a configured
+  // compression, so we need to ignore the dictionary in that case.
+  if (!rep_->compression_dict_handle.IsNull() && rep_->decompressor) {
     std::unique_ptr<UncompressionDictReader> uncompression_dict_reader;
     s = UncompressionDictReader::Create(
         this, ro, prefetch_buffer, use_cache, prefetch_all || pin_unpartitioned,

From 9a9a403a89296fb99fee4435387b6077948d7855 Mon Sep 17 00:00:00 2001
From: Zaidoon Abd Al Hadi <zaidoon@cloudflare.com>
Date: Fri, 16 May 2025 17:31:19 -0700
Subject: [PATCH 096/500] add support for event listener to C API (#13601)

Summary:
mostly copied from tikv's fork of rocksdb: https://github.com/tikv/rust-rocksdb/blob/master/librocksdb_sys/crocksdb/c.cc#L2445

fixed https://github.com/facebook/rocksdb/issues/13525

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13601

Reviewed By: hx235

Differential Revision: D74588333

Pulled By: cbi42

fbshipit-source-id: dedfc5866cf9025f9d8b6a33a8133e432554476d
---
 db/c.cc             | 382 ++++++++++++++++++++++++++++++++++++++++++++
 include/rocksdb/c.h | 141 ++++++++++++++++
 2 files changed, 523 insertions(+)

diff --git a/db/c.cc b/db/c.cc
index e96ee8479237..d324ca3f10a0 100644
--- a/db/c.cc
+++ b/db/c.cc
@@ -24,6 +24,7 @@
 #include "rocksdb/experimental.h"
 #include "rocksdb/filter_policy.h"
 #include "rocksdb/iterator.h"
+#include "rocksdb/listener.h"
 #include "rocksdb/memtablerep.h"
 #include "rocksdb/merge_operator.h"
 #include "rocksdb/options.h"
@@ -50,6 +51,7 @@
 #include "util/stderr_logger.h"
 #include "utilities/merge_operators.h"
 
+using ROCKSDB_NAMESPACE::BackgroundErrorReason;
 using ROCKSDB_NAMESPACE::BackupEngine;
 using ROCKSDB_NAMESPACE::BackupEngineOptions;
 using ROCKSDB_NAMESPACE::BackupID;
@@ -66,7 +68,9 @@ using ROCKSDB_NAMESPACE::ColumnFamilyMetaData;
 using ROCKSDB_NAMESPACE::ColumnFamilyOptions;
 using ROCKSDB_NAMESPACE::CompactionFilter;
 using ROCKSDB_NAMESPACE::CompactionFilterFactory;
+using ROCKSDB_NAMESPACE::CompactionJobInfo;
 using ROCKSDB_NAMESPACE::CompactionOptionsFIFO;
+using ROCKSDB_NAMESPACE::CompactionReason;
 using ROCKSDB_NAMESPACE::CompactRangeOptions;
 using ROCKSDB_NAMESPACE::Comparator;
 using ROCKSDB_NAMESPACE::CompressionType;
@@ -77,8 +81,11 @@ using ROCKSDB_NAMESPACE::DBOptions;
 using ROCKSDB_NAMESPACE::DbPath;
 using ROCKSDB_NAMESPACE::Env;
 using ROCKSDB_NAMESPACE::EnvOptions;
+using ROCKSDB_NAMESPACE::EventListener;
+using ROCKSDB_NAMESPACE::ExternalFileIngestionInfo;
 using ROCKSDB_NAMESPACE::FileLock;
 using ROCKSDB_NAMESPACE::FilterPolicy;
+using ROCKSDB_NAMESPACE::FlushJobInfo;
 using ROCKSDB_NAMESPACE::FlushOptions;
 using ROCKSDB_NAMESPACE::HistogramData;
 using ROCKSDB_NAMESPACE::HyperClockCacheOptions;
@@ -91,6 +98,7 @@ using ROCKSDB_NAMESPACE::Logger;
 using ROCKSDB_NAMESPACE::LRUCacheOptions;
 using ROCKSDB_NAMESPACE::MemoryAllocator;
 using ROCKSDB_NAMESPACE::MemoryUtil;
+using ROCKSDB_NAMESPACE::MemTableInfo;
 using ROCKSDB_NAMESPACE::MergeOperator;
 using ROCKSDB_NAMESPACE::NewBloomFilterPolicy;
 using ROCKSDB_NAMESPACE::NewCompactOnDeletionCollectorFactory;
@@ -119,6 +127,7 @@ using ROCKSDB_NAMESPACE::SstFileMetaData;
 using ROCKSDB_NAMESPACE::SstFileWriter;
 using ROCKSDB_NAMESPACE::Status;
 using ROCKSDB_NAMESPACE::StderrLogger;
+using ROCKSDB_NAMESPACE::SubcompactionJobInfo;
 using ROCKSDB_NAMESPACE::TablePropertiesCollectorFactory;
 using ROCKSDB_NAMESPACE::Transaction;
 using ROCKSDB_NAMESPACE::TransactionDB;
@@ -132,6 +141,8 @@ using ROCKSDB_NAMESPACE::WriteBatch;
 using ROCKSDB_NAMESPACE::WriteBatchWithIndex;
 using ROCKSDB_NAMESPACE::WriteBufferManager;
 using ROCKSDB_NAMESPACE::WriteOptions;
+using ROCKSDB_NAMESPACE::WriteStallCondition;
+using ROCKSDB_NAMESPACE::WriteStallInfo;
 
 using std::unordered_set;
 using std::vector;
@@ -141,6 +152,9 @@ extern "C" {
 struct rocksdb_t {
   DB* rep;
 };
+struct rocksdb_status_ptr_t {
+  Status* rep;
+};
 struct rocksdb_backup_engine_t {
   BackupEngine* rep;
 };
@@ -297,6 +311,28 @@ struct rocksdb_compactionfiltercontext_t {
   CompactionFilter::Context rep;
 };
 
+struct rocksdb_flushjobinfo_t {
+  FlushJobInfo rep;
+};
+struct rocksdb_writestallcondition_t {
+  WriteStallCondition rep;
+};
+struct rocksdb_writestallinfo_t {
+  WriteStallInfo rep;
+};
+struct rocksdb_memtableinfo_t {
+  MemTableInfo rep;
+};
+struct rocksdb_compactionjobinfo_t {
+  CompactionJobInfo rep;
+};
+struct rocksdb_subcompactionjobinfo_t {
+  SubcompactionJobInfo rep;
+};
+struct rocksdb_externalfileingestioninfo_t {
+  ExternalFileIngestionInfo rep;
+};
+
 struct rocksdb_statistics_histogram_data_t {
   rocksdb_statistics_histogram_data_t() : rep() {}
   HistogramData rep;
@@ -2991,6 +3027,352 @@ void rocksdb_block_based_options_set_unpartitioned_pinning_tier(
       static_cast<ROCKSDB_NAMESPACE::PinningTier>(v);
 }
 
+/* FlushJobInfo */
+
+const char* rocksdb_flushjobinfo_cf_name(const rocksdb_flushjobinfo_t* info,
+                                         size_t* size) {
+  *size = info->rep.cf_name.size();
+  return info->rep.cf_name.data();
+}
+
+const char* rocksdb_flushjobinfo_file_path(const rocksdb_flushjobinfo_t* info,
+                                           size_t* size) {
+  *size = info->rep.file_path.size();
+  return info->rep.file_path.data();
+}
+
+unsigned char rocksdb_flushjobinfo_triggered_writes_slowdown(
+    const rocksdb_flushjobinfo_t* info) {
+  return info->rep.triggered_writes_slowdown;
+}
+
+unsigned char rocksdb_flushjobinfo_triggered_writes_stop(
+    const rocksdb_flushjobinfo_t* info) {
+  return info->rep.triggered_writes_stop;
+}
+
+uint64_t rocksdb_flushjobinfo_largest_seqno(
+    const rocksdb_flushjobinfo_t* info) {
+  return info->rep.largest_seqno;
+}
+
+uint64_t rocksdb_flushjobinfo_smallest_seqno(
+    const rocksdb_flushjobinfo_t* info) {
+  return info->rep.smallest_seqno;
+}
+
+void rocksdb_reset_status(rocksdb_status_ptr_t* status_ptr) {
+  auto ptr = status_ptr->rep;
+  *ptr = Status::OK();
+}
+
+/* CompactionJobInfo */
+
+void rocksdb_compactionjobinfo_status(const rocksdb_compactionjobinfo_t* info,
+                                      char** errptr) {
+  SaveError(errptr, info->rep.status);
+}
+
+const char* rocksdb_compactionjobinfo_cf_name(
+    const rocksdb_compactionjobinfo_t* info, size_t* size) {
+  *size = info->rep.cf_name.size();
+  return info->rep.cf_name.data();
+}
+
+size_t rocksdb_compactionjobinfo_input_files_count(
+    const rocksdb_compactionjobinfo_t* info) {
+  return info->rep.input_files.size();
+}
+
+const char* rocksdb_compactionjobinfo_input_file_at(
+    const rocksdb_compactionjobinfo_t* info, size_t pos, size_t* size) {
+  assert(info != nullptr);
+  assert(pos < info->rep.input_files.size());
+
+  const std::string& path = info->rep.input_files[pos];
+  *size = path.size();
+  return path.data();
+}
+
+size_t rocksdb_compactionjobinfo_output_files_count(
+    const rocksdb_compactionjobinfo_t* info) {
+  return info->rep.output_files.size();
+}
+
+const char* rocksdb_compactionjobinfo_output_file_at(
+    const rocksdb_compactionjobinfo_t* info, size_t pos, size_t* size) {
+  assert(info != nullptr);
+  assert(pos < info->rep.output_files.size());
+
+  const std::string& path = info->rep.output_files[pos];
+  *size = path.size();
+  return path.data();
+}
+
+uint64_t rocksdb_compactionjobinfo_elapsed_micros(
+    const rocksdb_compactionjobinfo_t* info) {
+  return info->rep.stats.elapsed_micros;
+}
+
+uint64_t rocksdb_compactionjobinfo_num_corrupt_keys(
+    const rocksdb_compactionjobinfo_t* info) {
+  return info->rep.stats.num_corrupt_keys;
+}
+
+int rocksdb_compactionjobinfo_base_input_level(
+    const rocksdb_compactionjobinfo_t* info) {
+  return info->rep.base_input_level;
+}
+
+int rocksdb_compactionjobinfo_output_level(
+    const rocksdb_compactionjobinfo_t* info) {
+  return info->rep.output_level;
+}
+
+size_t rocksdb_compactionjobinfo_num_input_files(
+    const rocksdb_compactionjobinfo_t* info) {
+  return info->rep.stats.num_input_files;
+}
+
+size_t rocksdb_compactionjobinfo_num_input_files_at_output_level(
+    const rocksdb_compactionjobinfo_t* info) {
+  return info->rep.stats.num_input_files_at_output_level;
+}
+
+uint64_t rocksdb_compactionjobinfo_input_records(
+    const rocksdb_compactionjobinfo_t* info) {
+  return info->rep.stats.num_input_records;
+}
+
+uint64_t rocksdb_compactionjobinfo_output_records(
+    const rocksdb_compactionjobinfo_t* info) {
+  return info->rep.stats.num_output_records;
+}
+
+uint64_t rocksdb_compactionjobinfo_total_input_bytes(
+    const rocksdb_compactionjobinfo_t* info) {
+  return info->rep.stats.total_input_bytes;
+}
+
+uint64_t rocksdb_compactionjobinfo_total_output_bytes(
+    const rocksdb_compactionjobinfo_t* info) {
+  return info->rep.stats.total_output_bytes;
+}
+
+uint32_t rocksdb_compactionjobinfo_compaction_reason(
+    const rocksdb_compactionjobinfo_t* info) {
+  return static_cast<uint32_t>(info->rep.compaction_reason);
+}
+
+/* SubcompactionJobInfo */
+
+void rocksdb_subcompactionjobinfo_status(
+    const rocksdb_subcompactionjobinfo_t* info, char** errptr) {
+  SaveError(errptr, info->rep.status);
+}
+
+const char* rocksdb_subcompactionjobinfo_cf_name(
+    const rocksdb_subcompactionjobinfo_t* info, size_t* size) {
+  *size = info->rep.cf_name.size();
+  return info->rep.cf_name.data();
+}
+
+uint64_t rocksdb_subcompactionjobinfo_thread_id(
+    const rocksdb_subcompactionjobinfo_t* info) {
+  return info->rep.thread_id;
+}
+
+int rocksdb_subcompactionjobinfo_base_input_level(
+    const rocksdb_subcompactionjobinfo_t* info) {
+  return info->rep.base_input_level;
+}
+
+int rocksdb_subcompactionjobinfo_output_level(
+    const rocksdb_subcompactionjobinfo_t* info) {
+  return info->rep.output_level;
+}
+
+/* ExternalFileIngestionInfo */
+
+const char* rocksdb_externalfileingestioninfo_cf_name(
+    const rocksdb_externalfileingestioninfo_t* info, size_t* size) {
+  *size = info->rep.cf_name.size();
+  return info->rep.cf_name.data();
+}
+
+const char* rocksdb_externalfileingestioninfo_internal_file_path(
+    const rocksdb_externalfileingestioninfo_t* info, size_t* size) {
+  *size = info->rep.internal_file_path.size();
+  return info->rep.internal_file_path.data();
+}
+
+/* External write stall info */
+extern ROCKSDB_LIBRARY_API const char* rocksdb_writestallinfo_cf_name(
+    const rocksdb_writestallinfo_t* info, size_t* size) {
+  *size = info->rep.cf_name.size();
+  return info->rep.cf_name.data();
+}
+
+const rocksdb_writestallcondition_t* rocksdb_writestallinfo_cur(
+    const rocksdb_writestallinfo_t* info) {
+  return reinterpret_cast<const rocksdb_writestallcondition_t*>(
+      &info->rep.condition.cur);
+}
+
+const rocksdb_writestallcondition_t* rocksdb_writestallinfo_prev(
+    const rocksdb_writestallinfo_t* info) {
+  return reinterpret_cast<const rocksdb_writestallcondition_t*>(
+      &info->rep.condition.prev);
+}
+
+const char* rocksdb_memtableinfo_cf_name(const rocksdb_memtableinfo_t* info,
+                                         size_t* size) {
+  *size = info->rep.cf_name.size();
+  return info->rep.cf_name.data();
+}
+
+uint64_t rocksdb_memtableinfo_first_seqno(const rocksdb_memtableinfo_t* info) {
+  return info->rep.first_seqno;
+}
+uint64_t rocksdb_memtableinfo_earliest_seqno(
+    const rocksdb_memtableinfo_t* info) {
+  return info->rep.earliest_seqno;
+}
+uint64_t rocksdb_memtableinfo_num_entries(const rocksdb_memtableinfo_t* info) {
+  return info->rep.num_entries;
+}
+uint64_t rocksdb_memtableinfo_num_deletes(const rocksdb_memtableinfo_t* info) {
+  return info->rep.num_deletes;
+}
+
+/* event listener */
+
+struct rocksdb_eventlistener_t : public EventListener {
+  void* state_{};
+  void (*destructor_)(void*){};
+  void (*on_flush_begin)(void*, rocksdb_t*, const rocksdb_flushjobinfo_t*){};
+  void (*on_flush_completed)(void*, rocksdb_t*,
+                             const rocksdb_flushjobinfo_t*){};
+  void (*on_compaction_begin)(void*, rocksdb_t*,
+                              const rocksdb_compactionjobinfo_t*){};
+  void (*on_compaction_completed)(void*, rocksdb_t*,
+                                  const rocksdb_compactionjobinfo_t*){};
+  void (*on_subcompaction_begin)(void*,
+                                 const rocksdb_subcompactionjobinfo_t*){};
+  void (*on_subcompaction_completed)(void*,
+                                     const rocksdb_subcompactionjobinfo_t*){};
+  void (*on_external_file_ingested)(
+      void*, rocksdb_t*, const rocksdb_externalfileingestioninfo_t*){};
+  void (*on_background_error)(void*, uint32_t, rocksdb_status_ptr_t*){};
+  void (*on_stall_conditions_changed)(void*, const rocksdb_writestallinfo_t*){};
+  void (*on_memtable_sealed)(void*, const rocksdb_memtableinfo_t*){};
+
+  rocksdb_eventlistener_t() = default;
+
+  rocksdb_eventlistener_t(const rocksdb_eventlistener_t&) = delete;
+  rocksdb_eventlistener_t& operator=(const rocksdb_eventlistener_t&) = delete;
+  rocksdb_eventlistener_t(rocksdb_eventlistener_t&&) = delete;
+  rocksdb_eventlistener_t& operator=(rocksdb_eventlistener_t&&) = delete;
+
+  void OnFlushBegin(DB* db, const FlushJobInfo& info) override {
+    rocksdb_t c_db = {db};
+    on_flush_begin(state_, &c_db,
+                   reinterpret_cast<const rocksdb_flushjobinfo_t*>(&info));
+  }
+
+  void OnFlushCompleted(DB* db, const FlushJobInfo& info) override {
+    rocksdb_t c_db = {db};
+    on_flush_completed(state_, &c_db,
+                       reinterpret_cast<const rocksdb_flushjobinfo_t*>(&info));
+  }
+
+  void OnCompactionBegin(DB* db, const CompactionJobInfo& info) override {
+    rocksdb_t c_db = {db};
+    on_compaction_begin(
+        state_, &c_db,
+        reinterpret_cast<const rocksdb_compactionjobinfo_t*>(&info));
+  }
+
+  void OnCompactionCompleted(DB* db, const CompactionJobInfo& info) override {
+    rocksdb_t c_db = {db};
+    on_compaction_completed(
+        state_, &c_db,
+        reinterpret_cast<const rocksdb_compactionjobinfo_t*>(&info));
+  }
+
+  void OnSubcompactionBegin(const SubcompactionJobInfo& info) override {
+    on_subcompaction_begin(
+        state_, reinterpret_cast<const rocksdb_subcompactionjobinfo_t*>(&info));
+  }
+
+  void OnSubcompactionCompleted(const SubcompactionJobInfo& info) override {
+    on_subcompaction_completed(
+        state_, reinterpret_cast<const rocksdb_subcompactionjobinfo_t*>(&info));
+  }
+
+  void OnExternalFileIngested(DB* db,
+                              const ExternalFileIngestionInfo& info) override {
+    rocksdb_t c_db = {db};
+    on_external_file_ingested(
+        state_, &c_db,
+        reinterpret_cast<const rocksdb_externalfileingestioninfo_t*>(&info));
+  }
+
+  void OnBackgroundError(BackgroundErrorReason reason,
+                         Status* status) override {
+    rocksdb_status_ptr_t* s = new rocksdb_status_ptr_t;
+    s->rep = status;
+    on_background_error(state_, static_cast<uint32_t>(reason), s);
+    delete s;
+  }
+
+  void OnStallConditionsChanged(const WriteStallInfo& info) override {
+    on_stall_conditions_changed(
+        state_, reinterpret_cast<const rocksdb_writestallinfo_t*>(&info));
+  }
+
+  void OnMemTableSealed(const MemTableInfo& info) override {
+    on_memtable_sealed(state_,
+                       reinterpret_cast<const rocksdb_memtableinfo_t*>(&info));
+  }
+
+  ~rocksdb_eventlistener_t() override { destructor_(state_); }
+};
+
+rocksdb_eventlistener_t* rocksdb_eventlistener_create(
+    void* state_, void (*destructor_)(void*), on_flush_begin_cb on_flush_begin,
+    on_flush_completed_cb on_flush_completed,
+    on_compaction_begin_cb on_compaction_begin,
+    on_compaction_completed_cb on_compaction_completed,
+    on_subcompaction_begin_cb on_subcompaction_begin,
+    on_subcompaction_completed_cb on_subcompaction_completed,
+    on_external_file_ingested_cb on_external_file_ingested,
+    on_background_error_cb on_background_error,
+    on_stall_conditions_changed_cb on_stall_conditions_changed,
+    on_memtable_sealed_cb on_memtable_sealed) {
+  rocksdb_eventlistener_t* et = new rocksdb_eventlistener_t;
+  et->state_ = state_;
+  et->destructor_ = destructor_;
+  et->on_flush_begin = on_flush_begin;
+  et->on_flush_completed = on_flush_completed;
+  et->on_compaction_begin = on_compaction_begin;
+  et->on_compaction_completed = on_compaction_completed;
+  et->on_subcompaction_begin = on_subcompaction_begin;
+  et->on_subcompaction_completed = on_subcompaction_completed;
+  et->on_external_file_ingested = on_external_file_ingested;
+  et->on_background_error = on_background_error;
+  et->on_stall_conditions_changed = on_stall_conditions_changed;
+  et->on_memtable_sealed = on_memtable_sealed;
+  return et;
+}
+
+void rocksdb_eventlistener_destroy(rocksdb_eventlistener_t* t) { delete t; }
+
+void rocksdb_options_add_eventlistener(rocksdb_options_t* opt,
+                                       rocksdb_eventlistener_t* t) {
+  opt->rep.listeners.emplace_back(std::shared_ptr<EventListener>(t));
+}
+
 rocksdb_cuckoo_table_options_t* rocksdb_cuckoo_options_create() {
   return new rocksdb_cuckoo_table_options_t;
 }
diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h
index 52a7593a262b..ee5cc5274642 100644
--- a/include/rocksdb/c.h
+++ b/include/rocksdb/c.h
@@ -69,6 +69,7 @@ extern "C" {
 /* Exported types */
 
 typedef struct rocksdb_t rocksdb_t;
+typedef struct rocksdb_status_ptr_t rocksdb_status_ptr_t;
 typedef struct rocksdb_backup_engine_t rocksdb_backup_engine_t;
 typedef struct rocksdb_backup_engine_info_t rocksdb_backup_engine_info_t;
 typedef struct rocksdb_backup_engine_options_t rocksdb_backup_engine_options_t;
@@ -142,6 +143,15 @@ typedef struct rocksdb_statistics_histogram_data_t
     rocksdb_statistics_histogram_data_t;
 typedef struct rocksdb_wait_for_compact_options_t
     rocksdb_wait_for_compact_options_t;
+typedef struct rocksdb_flushjobinfo_t rocksdb_flushjobinfo_t;
+typedef struct rocksdb_compactionjobinfo_t rocksdb_compactionjobinfo_t;
+typedef struct rocksdb_subcompactionjobinfo_t rocksdb_subcompactionjobinfo_t;
+typedef struct rocksdb_externalfileingestioninfo_t
+    rocksdb_externalfileingestioninfo_t;
+typedef struct rocksdb_eventlistener_t rocksdb_eventlistener_t;
+typedef struct rocksdb_writestallinfo_t rocksdb_writestallinfo_t;
+typedef struct rocksdb_writestallcondition_t rocksdb_writestallcondition_t;
+typedef struct rocksdb_memtableinfo_t rocksdb_memtableinfo_t;
 
 /* DB operations */
 
@@ -1145,6 +1155,137 @@ rocksdb_block_based_options_set_unpartitioned_pinning_tier(
 extern ROCKSDB_LIBRARY_API void rocksdb_options_set_write_buffer_manager(
     rocksdb_options_t* opt, rocksdb_write_buffer_manager_t* wbm);
 
+/* Flush job info */
+
+extern ROCKSDB_LIBRARY_API const char* rocksdb_flushjobinfo_cf_name(
+    const rocksdb_flushjobinfo_t*, size_t*);
+extern ROCKSDB_LIBRARY_API const char* rocksdb_flushjobinfo_file_path(
+    const rocksdb_flushjobinfo_t*, size_t*);
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_flushjobinfo_triggered_writes_slowdown(const rocksdb_flushjobinfo_t*);
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_flushjobinfo_triggered_writes_stop(const rocksdb_flushjobinfo_t*);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_flushjobinfo_largest_seqno(const rocksdb_flushjobinfo_t*);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_flushjobinfo_smallest_seqno(const rocksdb_flushjobinfo_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_reset_status(
+    rocksdb_status_ptr_t* status_ptr);
+
+/* Compaction job info */
+extern ROCKSDB_LIBRARY_API void rocksdb_compactionjobinfo_status(
+    const rocksdb_compactionjobinfo_t* info, char** errptr);
+extern ROCKSDB_LIBRARY_API const char* rocksdb_compactionjobinfo_cf_name(
+    const rocksdb_compactionjobinfo_t*, size_t*);
+extern ROCKSDB_LIBRARY_API size_t
+rocksdb_compactionjobinfo_input_files_count(const rocksdb_compactionjobinfo_t*);
+extern ROCKSDB_LIBRARY_API const char* rocksdb_compactionjobinfo_input_file_at(
+    const rocksdb_compactionjobinfo_t*, size_t pos, size_t*);
+extern ROCKSDB_LIBRARY_API size_t rocksdb_compactionjobinfo_output_files_count(
+    const rocksdb_compactionjobinfo_t*);
+extern ROCKSDB_LIBRARY_API const char* rocksdb_compactionjobinfo_output_file_at(
+    const rocksdb_compactionjobinfo_t*, size_t pos, size_t*);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_compactionjobinfo_elapsed_micros(const rocksdb_compactionjobinfo_t*);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_compactionjobinfo_num_corrupt_keys(const rocksdb_compactionjobinfo_t*);
+extern ROCKSDB_LIBRARY_API int rocksdb_compactionjobinfo_base_input_level(
+    const rocksdb_compactionjobinfo_t*);
+extern ROCKSDB_LIBRARY_API int rocksdb_compactionjobinfo_output_level(
+    const rocksdb_compactionjobinfo_t*);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_compactionjobinfo_input_records(const rocksdb_compactionjobinfo_t*);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_compactionjobinfo_output_records(const rocksdb_compactionjobinfo_t*);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_compactionjobinfo_total_input_bytes(const rocksdb_compactionjobinfo_t*);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_compactionjobinfo_total_output_bytes(
+    const rocksdb_compactionjobinfo_t*);
+extern ROCKSDB_LIBRARY_API uint32_t rocksdb_compactionjobinfo_compaction_reason(
+    const rocksdb_compactionjobinfo_t* info);
+extern ROCKSDB_LIBRARY_API size_t rocksdb_compactionjobinfo_num_input_files(
+    const rocksdb_compactionjobinfo_t* info);
+extern ROCKSDB_LIBRARY_API size_t
+rocksdb_compactionjobinfo_num_input_files_at_output_level(
+    const rocksdb_compactionjobinfo_t* info);
+
+/* Subcompaction job info */
+extern ROCKSDB_LIBRARY_API void rocksdb_subcompactionjobinfo_status(
+    const rocksdb_subcompactionjobinfo_t*, char**);
+extern ROCKSDB_LIBRARY_API const char* rocksdb_subcompactionjobinfo_cf_name(
+    const rocksdb_subcompactionjobinfo_t*, size_t*);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_subcompactionjobinfo_thread_id(const rocksdb_subcompactionjobinfo_t*);
+extern ROCKSDB_LIBRARY_API int rocksdb_subcompactionjobinfo_base_input_level(
+    const rocksdb_subcompactionjobinfo_t*);
+extern ROCKSDB_LIBRARY_API int rocksdb_subcompactionjobinfo_output_level(
+    const rocksdb_subcompactionjobinfo_t*);
+
+/* External file ingestion info */
+extern ROCKSDB_LIBRARY_API const char*
+rocksdb_externalfileingestioninfo_cf_name(
+    const rocksdb_externalfileingestioninfo_t*, size_t*);
+extern ROCKSDB_LIBRARY_API const char*
+rocksdb_externalfileingestioninfo_internal_file_path(
+    const rocksdb_externalfileingestioninfo_t*, size_t*);
+
+/* External write stall info */
+extern ROCKSDB_LIBRARY_API const char* rocksdb_writestallinfo_cf_name(
+    const rocksdb_writestallinfo_t*, size_t*);
+extern ROCKSDB_LIBRARY_API const rocksdb_writestallcondition_t*
+rocksdb_writestallinfo_cur(const rocksdb_writestallinfo_t*);
+extern ROCKSDB_LIBRARY_API const rocksdb_writestallcondition_t*
+rocksdb_writestallinfo_prev(const rocksdb_writestallinfo_t*);
+extern ROCKSDB_LIBRARY_API const char* rocksdb_memtableinfo_cf_name(
+    const rocksdb_memtableinfo_t*, size_t*);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_memtableinfo_first_seqno(const rocksdb_memtableinfo_t*);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_memtableinfo_earliest_seqno(const rocksdb_memtableinfo_t*);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_memtableinfo_num_entries(const rocksdb_memtableinfo_t*);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_memtableinfo_num_deletes(const rocksdb_memtableinfo_t*);
+
+/* Event listener */
+
+typedef void (*on_flush_begin_cb)(void*, rocksdb_t*,
+                                  const rocksdb_flushjobinfo_t*);
+typedef void (*on_flush_completed_cb)(void*, rocksdb_t*,
+                                      const rocksdb_flushjobinfo_t*);
+typedef void (*on_compaction_begin_cb)(void*, rocksdb_t*,
+                                       const rocksdb_compactionjobinfo_t*);
+typedef void (*on_compaction_completed_cb)(void*, rocksdb_t*,
+                                           const rocksdb_compactionjobinfo_t*);
+typedef void (*on_subcompaction_begin_cb)(
+    void*, const rocksdb_subcompactionjobinfo_t*);
+typedef void (*on_subcompaction_completed_cb)(
+    void*, const rocksdb_subcompactionjobinfo_t*);
+typedef void (*on_external_file_ingested_cb)(
+    void*, rocksdb_t*, const rocksdb_externalfileingestioninfo_t*);
+typedef void (*on_background_error_cb)(void*, uint32_t, rocksdb_status_ptr_t*);
+typedef void (*on_stall_conditions_changed_cb)(void*,
+                                               const rocksdb_writestallinfo_t*);
+typedef void (*rocksdb_logger_logv_cb)(void*, uint32_t log_level, const char*);
+typedef void (*on_memtable_sealed_cb)(void*, const rocksdb_memtableinfo_t*);
+extern ROCKSDB_LIBRARY_API rocksdb_eventlistener_t*
+rocksdb_eventlistener_create(
+    void* state_, void (*destructor_)(void*), on_flush_begin_cb on_flush_begin,
+    on_flush_completed_cb on_flush_completed,
+    on_compaction_begin_cb on_compaction_begin,
+    on_compaction_completed_cb on_compaction_completed,
+    on_subcompaction_begin_cb on_subcompaction_begin,
+    on_subcompaction_completed_cb on_subcompaction_completed,
+    on_external_file_ingested_cb on_external_file_ingested,
+    on_background_error_cb on_background_error,
+    on_stall_conditions_changed_cb on_stall_conditions_changed,
+    on_memtable_sealed_cb on_memtable_sealed);
+extern ROCKSDB_LIBRARY_API void rocksdb_eventlistener_destroy(
+    rocksdb_eventlistener_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_add_eventlistener(
+    rocksdb_options_t*, rocksdb_eventlistener_t*);
+
 /* Cuckoo table options */
 
 extern ROCKSDB_LIBRARY_API rocksdb_cuckoo_table_options_t*

From acab405fc11d453670f162be2e656f22d9ade807 Mon Sep 17 00:00:00 2001
From: virajthakur <virajthakur@berkeley.edu>
Date: Fri, 16 May 2025 21:25:50 -0700
Subject: [PATCH 097/500] propagate request_id from app -> Rocks -> FS (#13616)

Summary:
[internal use] Allow the application to pass a request_id per read request to RocksDB and pass it down to the FileSystem (via IODebugContext)

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13616

Test Plan:
./db_test --gtest_filter=DBTest.RequestIdPlumbingTest

Validates that RocksDB Api calls with request_id set result in request_id being passed to the filesystem through IODebugContext

Reviewed By: pdillinger

Differential Revision: D74912824

Pulled By: virajthakur

fbshipit-source-id: 4f15fef3ff7b5d700563f993f9b211c991020fb6
---
 db/blob/blob_file_reader.cc                   | 15 +--
 db/db_test.cc                                 | 97 +++++++++++++++++++
 file/file_util.cc                             |  7 +-
 file/file_util.h                              |  9 +-
 file/random_access_file_reader.cc             | 33 ++++---
 file/random_access_file_reader.h              | 16 +--
 file/random_access_file_reader_test.cc        | 20 ++--
 include/rocksdb/file_system.h                 |  9 +-
 include/rocksdb/options.h                     | 13 +++
 table/block_based/block_based_table_reader.cc |  6 +-
 .../block_based_table_reader_sync_and_async.h | 12 ++-
 table/block_based/block_prefetcher.cc         |  3 +-
 table/block_based/partitioned_filter_block.cc |  3 +-
 table/block_fetcher.cc                        | 18 ++--
 table/meta_blocks.cc                          | 10 +-
 trace_replay/io_tracer.cc                     |  2 +-
 trace_replay/io_tracer_test.cc                |  3 +-
 .../plumb_application_request_id_to_fs.md     |  2 +
 util/async_file_reader.cc                     |  2 +-
 util/async_file_reader.h                      | 23 +++--
 20 files changed, 233 insertions(+), 70 deletions(-)
 create mode 100644 unreleased_history/new_features/plumb_application_request_id_to_fs.md

diff --git a/db/blob/blob_file_reader.cc b/db/blob/blob_file_reader.cc
index 0d05b5e57140..447f090b5070 100644
--- a/db/blob/blob_file_reader.cc
+++ b/db/blob/blob_file_reader.cc
@@ -250,7 +250,8 @@ Status BlobFileReader::ReadFromFile(const RandomAccessFileReader* file_reader,
   Status s;
 
   IOOptions io_options;
-  s = file_reader->PrepareIOOptions(read_options, io_options);
+  IODebugContext dbg;
+  s = file_reader->PrepareIOOptions(read_options, io_options, &dbg);
   if (!s.ok()) {
     return s;
   }
@@ -259,13 +260,13 @@ Status BlobFileReader::ReadFromFile(const RandomAccessFileReader* file_reader,
     constexpr char* scratch = nullptr;
 
     s = file_reader->Read(io_options, read_offset, read_size, slice, scratch,
-                          aligned_buf);
+                          aligned_buf, &dbg);
   } else {
     buf->reset(new char[read_size]);
     constexpr AlignedBuf* aligned_scratch = nullptr;
 
     s = file_reader->Read(io_options, read_offset, read_size, slice, buf->get(),
-                          aligned_scratch);
+                          aligned_scratch, &dbg);
   }
 
   if (!s.ok()) {
@@ -334,7 +335,8 @@ Status BlobFileReader::GetBlob(
     constexpr bool for_compaction = true;
 
     IOOptions io_options;
-    s = file_reader_->PrepareIOOptions(read_options, io_options);
+    IODebugContext dbg;
+    s = file_reader_->PrepareIOOptions(read_options, io_options, &dbg);
     if (!s.ok()) {
       return s;
     }
@@ -463,10 +465,11 @@ void BlobFileReader::MultiGetBlob(
   PERF_COUNTER_ADD(blob_read_count, num_blobs);
   PERF_COUNTER_ADD(blob_read_byte, total_len);
   IOOptions opts;
-  s = file_reader_->PrepareIOOptions(read_options, opts);
+  IODebugContext dbg;
+  s = file_reader_->PrepareIOOptions(read_options, opts, &dbg);
   if (s.ok()) {
     s = file_reader_->MultiRead(opts, read_reqs.data(), read_reqs.size(),
-                                direct_io ? &aligned_buf : nullptr);
+                                direct_io ? &aligned_buf : nullptr, &dbg);
   }
   if (!s.ok()) {
     for (auto& req : read_reqs) {
diff --git a/db/db_test.cc b/db/db_test.cc
index 81b4c2ed1b9a..cda3517d7db5 100644
--- a/db/db_test.cc
+++ b/db/db_test.cc
@@ -144,6 +144,103 @@ TEST_F(DBTest, MockEnvTest) {
   delete db;
 }
 
+TEST_F(DBTest, RequestIdPlumbingTest) {
+  // test that request_id is passed to the filesystem, from
+  // ReadOptions to IODebugContext
+  Options options = CurrentOptions();
+  options.env = env_;
+
+  // Create a mock environment to capture IODebugContext during reads
+  const std::string* captured_request_id_dbg;
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "RandomAccessFileReader::Read:IODebugContext", [&](void* arg) {
+        IODebugContext* dbg = static_cast<IODebugContext*>(arg);
+        if (dbg == nullptr) {
+          captured_request_id_dbg = nullptr;
+        } else {
+          captured_request_id_dbg = dbg->request_id;
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(Put("k1", "v1"));
+  ASSERT_OK(Flush());
+
+  // test request_id plumbing during a get
+  {
+    const std::string test_request_id = "test_request_id_123";
+    ReadOptions read_opts;
+    read_opts.request_id = &test_request_id;
+    std::string value;
+    ASSERT_OK(db_->Get(read_opts, "k1", &value));
+
+    // Verify the request_id was propagated to the file system
+    ASSERT_NE(captured_request_id_dbg, nullptr);
+    ASSERT_EQ(*captured_request_id_dbg, test_request_id);
+  }
+
+  captured_request_id_dbg = nullptr;
+
+  // test request_id plumbing during iterator seek
+  ASSERT_OK(Put("k2", "v2"));
+  ASSERT_OK(Flush());
+  {
+    ReadOptions read_opts;
+    const std::string request_id = "test_request_id_456";
+    read_opts.request_id = &request_id;
+
+    std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+    iter->Seek("k2");
+    ASSERT_TRUE(iter->Valid());
+
+    // Verify the request_id was propagated to the file system
+    ASSERT_NE(captured_request_id_dbg, nullptr);
+    ASSERT_EQ(*captured_request_id_dbg, request_id);
+  }
+
+  // test request_id plumbing during multiget
+  captured_request_id_dbg = nullptr;
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "RandomAccessFileReader::MultiRead:IODebugContext", [&](void* arg) {
+        IODebugContext* dbg = static_cast<IODebugContext*>(arg);
+        if (dbg == nullptr) {
+          captured_request_id_dbg = nullptr;
+        } else {
+          captured_request_id_dbg = dbg->request_id;
+        }
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(Put("k3", "v3"));
+  ASSERT_OK(Put("k4", "v4"));
+  ASSERT_OK(Flush());
+
+  {
+    ReadOptions read_opts;
+    const std::string multiget_request_id = "test_request_id_789";
+    read_opts.request_id = &multiget_request_id;
+
+    std::vector<std::string> values;
+    std::vector<Slice> keys = {Slice("k3"), Slice("k4")};
+
+    values.resize(keys.size());
+
+    std::vector<ColumnFamilyHandle*> cfhs(keys.size(),
+                                          db_->DefaultColumnFamily());
+    db_->MultiGet(read_opts, cfhs, keys, &values);
+
+    ASSERT_NE(captured_request_id_dbg, nullptr);
+    ASSERT_EQ(*captured_request_id_dbg, multiget_request_id);
+  }
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
 TEST_F(DBTest, MemEnvTest) {
   std::unique_ptr<Env> env{NewMemEnv(Env::Default())};
   Options options;
diff --git a/file/file_util.cc b/file/file_util.cc
index b3f6128aae41..6e06ea0d95e3 100644
--- a/file/file_util.cc
+++ b/file/file_util.cc
@@ -247,15 +247,16 @@ IOStatus GenerateOneFileChecksum(
   Slice slice;
   uint64_t offset = 0;
   IOOptions opts;
-  io_s = reader->PrepareIOOptions(read_options, opts);
+  IODebugContext dbg;
+  io_s = reader->PrepareIOOptions(read_options, opts, &dbg);
   if (!io_s.ok()) {
     return io_s;
   }
   while (size > 0) {
     size_t bytes_to_read =
         static_cast<size_t>(std::min(uint64_t{readahead_size}, size));
-    io_s =
-        reader->Read(opts, offset, bytes_to_read, &slice, buf.get(), nullptr);
+    io_s = reader->Read(opts, offset, bytes_to_read, &slice, buf.get(), nullptr,
+                        &dbg);
     if (!io_s.ok()) {
       return IOStatus::Corruption("file read failed with error: " +
                                   io_s.ToString());
diff --git a/file/file_util.h b/file/file_util.h
index a8f20c86893a..d19a4de6cda0 100644
--- a/file/file_util.h
+++ b/file/file_util.h
@@ -86,7 +86,14 @@ IOStatus GenerateOneFileChecksum(
     const ReadOptions& read_options, Statistics* stats, SystemClock* clock);
 
 inline IOStatus PrepareIOFromReadOptions(const ReadOptions& ro,
-                                         SystemClock* clock, IOOptions& opts) {
+                                         SystemClock* clock, IOOptions& opts,
+                                         IODebugContext* dbg = nullptr) {
+  if (ro.request_id != nullptr) {
+    if (dbg != nullptr && dbg->request_id == nullptr) {
+      dbg->SetRequestId(ro.request_id);
+    }
+  }
+
   if (ro.deadline.count()) {
     std::chrono::microseconds now =
         std::chrono::microseconds(clock->NowMicros());
diff --git a/file/random_access_file_reader.cc b/file/random_access_file_reader.cc
index 46f5d1c26262..c8edc86360ec 100644
--- a/file/random_access_file_reader.cc
+++ b/file/random_access_file_reader.cc
@@ -106,11 +106,14 @@ IOStatus RandomAccessFileReader::Create(
 
 IOStatus RandomAccessFileReader::Read(const IOOptions& opts, uint64_t offset,
                                       size_t n, Slice* result, char* scratch,
-                                      AlignedBuf* aligned_buf) const {
+                                      AlignedBuf* aligned_buf,
+                                      IODebugContext* dbg) const {
   (void)aligned_buf;
   const Env::IOPriority rate_limiter_priority = opts.rate_limiter_priority;
 
   TEST_SYNC_POINT_CALLBACK("RandomAccessFileReader::Read", nullptr);
+  TEST_SYNC_POINT_CALLBACK("RandomAccessFileReader::Read:IODebugContext",
+                           const_cast<void*>(static_cast<void*>(dbg)));
 
   // To be paranoid: modify scratch a little bit, so in case underlying
   // FileSystem doesn't fill the buffer but return success and `scratch` returns
@@ -175,7 +178,7 @@ IOStatus RandomAccessFileReader::Read(const IOOptions& opts, uint64_t offset,
           // the opts.timeout before calling file_->Read
           assert(!opts.timeout.count() || allowed == read_size);
           io_s = file_->Read(aligned_offset + buf.CurrentSize(), allowed, opts,
-                             &tmp, buf.Destination(), nullptr);
+                             &tmp, buf.Destination(), dbg);
         }
         if (ShouldNotifyListeners()) {
           auto finish_ts = FileOperationInfo::FinishNow();
@@ -237,7 +240,7 @@ IOStatus RandomAccessFileReader::Read(const IOOptions& opts, uint64_t offset,
           // the opts.timeout before calling file_->Read
           assert(!opts.timeout.count() || allowed == n);
           io_s = file_->Read(offset + pos, allowed, opts, &tmp_result,
-                             scratch + pos, nullptr);
+                             scratch + pos, dbg);
         }
         if (ShouldNotifyListeners()) {
           auto finish_ts = FileOperationInfo::FinishNow();
@@ -311,7 +314,8 @@ bool TryMerge(FSReadRequest* dest, const FSReadRequest& src) {
 IOStatus RandomAccessFileReader::MultiRead(const IOOptions& opts,
                                            FSReadRequest* read_reqs,
                                            size_t num_reqs,
-                                           AlignedBuf* aligned_buf) const {
+                                           AlignedBuf* aligned_buf,
+                                           IODebugContext* dbg) const {
   (void)aligned_buf;  // suppress warning of unused variable in LITE mode
   assert(num_reqs > 0);
 
@@ -420,8 +424,10 @@ IOStatus RandomAccessFileReader::MultiRead(const IOOptions& opts,
           remaining_bytes -= request_bytes;
         }
       }
-      io_s = file_->MultiRead(fs_reqs, num_fs_reqs, opts,
-                              /*IODebugContext*=*/nullptr);
+      TEST_SYNC_POINT_CALLBACK(
+          "RandomAccessFileReader::MultiRead:IODebugContext",
+          const_cast<void*>(static_cast<void*>(dbg)));
+      io_s = file_->MultiRead(fs_reqs, num_fs_reqs, opts, dbg);
       RecordInHistogram(stats_, MULTIGET_IO_BATCH_SIZE, num_fs_reqs);
     }
 
@@ -475,18 +481,21 @@ IOStatus RandomAccessFileReader::MultiRead(const IOOptions& opts,
 }
 
 IOStatus RandomAccessFileReader::PrepareIOOptions(const ReadOptions& ro,
-                                                  IOOptions& opts) const {
+                                                  IOOptions& opts,
+                                                  IODebugContext* dbg) const {
   if (clock_ != nullptr) {
-    return PrepareIOFromReadOptions(ro, clock_, opts);
+    return PrepareIOFromReadOptions(ro, clock_, opts, dbg);
   } else {
-    return PrepareIOFromReadOptions(ro, SystemClock::Default().get(), opts);
+    return PrepareIOFromReadOptions(ro, SystemClock::Default().get(), opts,
+                                    dbg);
   }
 }
 
 IOStatus RandomAccessFileReader::ReadAsync(
     FSReadRequest& req, const IOOptions& opts,
     std::function<void(FSReadRequest&, void*)> cb, void* cb_arg,
-    void** io_handle, IOHandleDeleter* del_fn, AlignedBuf* aligned_buf) {
+    void** io_handle, IOHandleDeleter* del_fn, AlignedBuf* aligned_buf,
+    IODebugContext* dbg) {
   IOStatus s;
   // Create a callback and populate info.
   auto read_async_callback =
@@ -532,14 +541,14 @@ IOStatus RandomAccessFileReader::ReadAsync(
                  (stats_ != nullptr) ? &elapsed : nullptr, true /*overwrite*/,
                  true /*delay_enabled*/);
     s = file_->ReadAsync(aligned_req, opts, read_async_callback,
-                         read_async_info, io_handle, del_fn, nullptr /*dbg*/);
+                         read_async_info, io_handle, del_fn, dbg);
   } else {
     StopWatch sw(clock_, stats_, hist_type_,
                  GetFileReadHistograms(stats_, opts.io_activity),
                  (stats_ != nullptr) ? &elapsed : nullptr, true /*overwrite*/,
                  true /*delay_enabled*/);
     s = file_->ReadAsync(req, opts, read_async_callback, read_async_info,
-                         io_handle, del_fn, nullptr /*dbg*/);
+                         io_handle, del_fn, dbg);
   }
   RecordTick(stats_, READ_ASYNC_MICROS, elapsed);
 
diff --git a/file/random_access_file_reader.h b/file/random_access_file_reader.h
index 945e685e3d00..c1de6b973f44 100644
--- a/file/random_access_file_reader.h
+++ b/file/random_access_file_reader.h
@@ -164,7 +164,8 @@ class RandomAccessFileReader {
   // the internally allocated buffer on return, and the result refers to a
   // region in aligned_buf.
   IOStatus Read(const IOOptions& opts, uint64_t offset, size_t n, Slice* result,
-                char* scratch, AlignedBuf* aligned_buf) const;
+                char* scratch, AlignedBuf* aligned_buf,
+                IODebugContext* dbg = nullptr) const;
 
   // REQUIRES:
   // num_reqs > 0, reqs do not overlap, and offsets in reqs are increasing.
@@ -172,10 +173,12 @@ class RandomAccessFileReader {
   // In direct IO mode, aligned_buf stores the aligned buffer allocated inside
   // MultiRead, the result Slices in reqs refer to aligned_buf.
   IOStatus MultiRead(const IOOptions& opts, FSReadRequest* reqs,
-                     size_t num_reqs, AlignedBuf* aligned_buf) const;
+                     size_t num_reqs, AlignedBuf* aligned_buf,
+                     IODebugContext* dbg = nullptr) const;
 
-  IOStatus Prefetch(const IOOptions& opts, uint64_t offset, size_t n) const {
-    return file_->Prefetch(offset, n, opts, nullptr);
+  IOStatus Prefetch(const IOOptions& opts, uint64_t offset, size_t n,
+                    IODebugContext* dbg = nullptr) const {
+    return file_->Prefetch(offset, n, opts, dbg);
   }
 
   FSRandomAccessFile* file() { return file_.get(); }
@@ -184,12 +187,13 @@ class RandomAccessFileReader {
 
   bool use_direct_io() const { return file_->use_direct_io(); }
 
-  IOStatus PrepareIOOptions(const ReadOptions& ro, IOOptions& opts) const;
+  IOStatus PrepareIOOptions(const ReadOptions& ro, IOOptions& opts,
+                            IODebugContext* dbg = nullptr) const;
 
   IOStatus ReadAsync(FSReadRequest& req, const IOOptions& opts,
                      std::function<void(FSReadRequest&, void*)> cb,
                      void* cb_arg, void** io_handle, IOHandleDeleter* del_fn,
-                     AlignedBuf* aligned_buf);
+                     AlignedBuf* aligned_buf, IODebugContext* dbg = nullptr);
 
   void ReadAsyncCallback(FSReadRequest& req, void* cb_arg);
 };
diff --git a/file/random_access_file_reader_test.cc b/file/random_access_file_reader_test.cc
index f081795b9d1f..717e985f1adb 100644
--- a/file/random_access_file_reader_test.cc
+++ b/file/random_access_file_reader_test.cc
@@ -147,8 +147,9 @@ TEST_F(RandomAccessFileReaderTest, MultiReadDirectIO) {
     reqs.push_back(std::move(r0));
     reqs.push_back(std::move(r1));
     AlignedBuf aligned_buf;
-    ASSERT_OK(
-        r->MultiRead(IOOptions(), reqs.data(), reqs.size(), &aligned_buf));
+    IODebugContext dbg;
+    ASSERT_OK(r->MultiRead(IOOptions(), reqs.data(), reqs.size(), &aligned_buf,
+                           &dbg));
 
     AssertResult(content, reqs);
 
@@ -192,8 +193,9 @@ TEST_F(RandomAccessFileReaderTest, MultiReadDirectIO) {
     reqs.push_back(std::move(r1));
     reqs.push_back(std::move(r2));
     AlignedBuf aligned_buf;
-    ASSERT_OK(
-        r->MultiRead(IOOptions(), reqs.data(), reqs.size(), &aligned_buf));
+    IODebugContext dbg;
+    ASSERT_OK(r->MultiRead(IOOptions(), reqs.data(), reqs.size(), &aligned_buf,
+                           &dbg));
 
     AssertResult(content, reqs);
 
@@ -237,8 +239,9 @@ TEST_F(RandomAccessFileReaderTest, MultiReadDirectIO) {
     reqs.push_back(std::move(r1));
     reqs.push_back(std::move(r2));
     AlignedBuf aligned_buf;
-    ASSERT_OK(
-        r->MultiRead(IOOptions(), reqs.data(), reqs.size(), &aligned_buf));
+    IODebugContext dbg;
+    ASSERT_OK(r->MultiRead(IOOptions(), reqs.data(), reqs.size(), &aligned_buf,
+                           &dbg));
 
     AssertResult(content, reqs);
 
@@ -274,8 +277,9 @@ TEST_F(RandomAccessFileReaderTest, MultiReadDirectIO) {
     reqs.push_back(std::move(r0));
     reqs.push_back(std::move(r1));
     AlignedBuf aligned_buf;
-    ASSERT_OK(
-        r->MultiRead(IOOptions(), reqs.data(), reqs.size(), &aligned_buf));
+    IODebugContext dbg;
+    ASSERT_OK(r->MultiRead(IOOptions(), reqs.data(), reqs.size(), &aligned_buf,
+                           &dbg));
 
     AssertResult(content, reqs);
 
diff --git a/include/rocksdb/file_system.h b/include/rocksdb/file_system.h
index ec10a5f12682..cb6ecee9f28b 100644
--- a/include/rocksdb/file_system.h
+++ b/include/rocksdb/file_system.h
@@ -220,6 +220,8 @@ struct FileOptions : EnvOptions {
 
 // A structure to pass back some debugging information from the FileSystem
 // implementation to RocksDB in case of an IO error
+// TODO(virajthakur): Update all calls to FS APIs for writes to pass in
+// IODebugContext
 struct IODebugContext {
   // file_path to be filled in by RocksDB in case of an error
   std::string file_path;
@@ -230,8 +232,9 @@ struct IODebugContext {
   // To be set by the FileSystem implementation
   std::string msg;
 
-  // To be set by the underlying FileSystem implementation.
-  std::string request_id;
+  // To be set by the application, to allow tracing logs/metrics from user ->
+  // RocksDB -> FS.
+  const std::string* request_id = nullptr;
 
   // In order to log required information in IO tracing for different
   // operations, Each bit in trace_data stores which corresponding info from
@@ -255,7 +258,7 @@ struct IODebugContext {
 
   // Called by underlying file system to set request_id and log request_id in
   // IOTracing.
-  void SetRequestId(const std::string& _request_id) {
+  void SetRequestId(const std::string* _request_id) {
     request_id = _request_id;
     trace_data |= (1 << TraceData::kRequestID);
   }
diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index 6e21fbe7fbf4..ba5b98147abe 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -2062,6 +2062,19 @@ struct ReadOptions {
 
   // *** END options for RocksDB internal use only ***
 
+  // *** BEGIN per-request settings for internal team use only ***
+
+  // TODO: create a new struct for per-request options, potentially including
+  // timestamps in point lookups/scans
+
+  // request_id is a unique id assigned by the application. It is used to allow
+  // us to link file system metrics/logs to rocksDB and application logs. This
+  // request_id may not be unique to each RocksDB api call - it could refer to
+  // an application level request that results in multiple RocksDB api calls
+  const std::string* request_id = nullptr;
+
+  // *** END per-request settings for internal team use only ***
+
   ReadOptions() {}
   ReadOptions(bool _verify_checksums, bool _fill_cache);
   explicit ReadOptions(Env::IOActivity _io_activity);
diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc
index ebb895027a14..baab81a33342 100644
--- a/table/block_based/block_based_table_reader.cc
+++ b/table/block_based/block_based_table_reader.cc
@@ -683,7 +683,8 @@ Status BlockBasedTable::Open(
   //    6. [meta block: index]
   //    7. [meta block: filter]
   IOOptions opts;
-  s = file->PrepareIOOptions(ro, opts);
+  IODebugContext dbg;
+  s = file->PrepareIOOptions(ro, opts, &dbg);
   if (s.ok()) {
     s = ReadFooterFromFile(opts, file.get(), *ioptions.fs,
                            prefetch_buffer.get(), file_size, &footer,
@@ -941,7 +942,8 @@ Status BlockBasedTable::PrefetchTail(
 #endif  // NDEBUG
 
   IOOptions opts;
-  Status s = file->PrepareIOOptions(ro, opts);
+  IODebugContext dbg;
+  Status s = file->PrepareIOOptions(ro, opts, &dbg);
   // Try file system prefetch
   if (s.ok() && !file->use_direct_io() && !force_direct_prefetch) {
     if (!file->Prefetch(opts, prefetch_off, prefetch_len).IsNotSupported()) {
diff --git a/table/block_based/block_based_table_reader_sync_and_async.h b/table/block_based/block_based_table_reader_sync_and_async.h
index c6263e150d42..7c331cbe826d 100644
--- a/table/block_based/block_based_table_reader_sync_and_async.h
+++ b/table/block_based/block_based_table_reader_sync_and_async.h
@@ -138,17 +138,18 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::RetrieveMultipleBlocks)
   AlignedBuf direct_io_buf;
   {
     IOOptions opts;
-    IOStatus s = file->PrepareIOOptions(options, opts);
+    IODebugContext dbg;
+    IOStatus s = file->PrepareIOOptions(options, opts, &dbg);
     if (s.ok()) {
 #if defined(WITH_COROUTINES)
       if (file->use_direct_io()) {
 #endif  // WITH_COROUTINES
         s = file->MultiRead(opts, &read_reqs[0], read_reqs.size(),
-                            &direct_io_buf);
+                            &direct_io_buf, &dbg);
 #if defined(WITH_COROUTINES)
       } else {
         co_await batch->context()->reader().MultiReadAsync(
-            file, opts, &read_reqs[0], read_reqs.size(), &direct_io_buf);
+            file, opts, &read_reqs[0], read_reqs.size(), &direct_io_buf, &dbg);
       }
 #endif  // WITH_COROUTINES
     }
@@ -240,10 +241,11 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::RetrieveMultipleBlocks)
           // its not a memory mapped file
           Slice result;
           IOOptions opts;
-          IOStatus io_s = file->PrepareIOOptions(options, opts);
+          IODebugContext dbg;
+          IOStatus io_s = file->PrepareIOOptions(options, opts, &dbg);
           opts.verify_and_reconstruct_read = true;
           io_s = file->Read(opts, handle.offset(), BlockSizeWithTrailer(handle),
-                            &result, const_cast<char*>(data), nullptr);
+                            &result, const_cast<char*>(data), nullptr, &dbg);
           if (io_s.ok()) {
             assert(result.data() == data);
             assert(result.size() == BlockSizeWithTrailer(handle));
diff --git a/table/block_based/block_prefetcher.cc b/table/block_based/block_prefetcher.cc
index a4cfb027b26d..38ec3a044179 100644
--- a/table/block_based/block_prefetcher.cc
+++ b/table/block_based/block_prefetcher.cc
@@ -39,7 +39,8 @@ void BlockPrefetcher::PrefetchIfNeeded(
         return;
       }
       IOOptions opts;
-      Status s = rep->file->PrepareIOOptions(read_options, opts);
+      IODebugContext dbg;
+      Status s = rep->file->PrepareIOOptions(read_options, opts, &dbg);
       if (!s.ok()) {
         return;
       }
diff --git a/table/block_based/partitioned_filter_block.cc b/table/block_based/partitioned_filter_block.cc
index a554364e50da..42cfce462abe 100644
--- a/table/block_based/partitioned_filter_block.cc
+++ b/table/block_based/partitioned_filter_block.cc
@@ -591,7 +591,8 @@ Status PartitionedFilterBlockReader::CacheDependencies(
                                   /*usage=*/FilePrefetchBufferUsage::kUnknown);
 
     IOOptions opts;
-    s = rep->file->PrepareIOOptions(ro, opts);
+    IODebugContext dbg;
+    s = rep->file->PrepareIOOptions(ro, opts, &dbg);
     if (s.ok()) {
       s = prefetch_buffer->Prefetch(opts, rep->file.get(), prefetch_off,
                                     static_cast<size_t>(prefetch_len));
diff --git a/table/block_fetcher.cc b/table/block_fetcher.cc
index af564063ca4e..6c73df23bee2 100644
--- a/table/block_fetcher.cc
+++ b/table/block_fetcher.cc
@@ -74,7 +74,8 @@ inline bool BlockFetcher::TryGetUncompressBlockFromPersistentCache() {
 inline bool BlockFetcher::TryGetFromPrefetchBuffer() {
   if (prefetch_buffer_ != nullptr) {
     IOOptions opts;
-    IOStatus io_s = file_->PrepareIOOptions(read_options_, opts);
+    IODebugContext dbg;
+    IOStatus io_s = file_->PrepareIOOptions(read_options_, opts, &dbg);
     if (io_s.ok()) {
       bool read_from_prefetch_buffer = prefetch_buffer_->TryReadFromCache(
           opts, file_, handle_.offset(), block_size_with_trailer_, &slice_,
@@ -246,7 +247,8 @@ inline void BlockFetcher::GetBlockContents() {
 void BlockFetcher::ReadBlock(bool retry) {
   FSReadRequest read_req;
   IOOptions opts;
-  io_status_ = file_->PrepareIOOptions(read_options_, opts);
+  IODebugContext dbg;
+  io_status_ = file_->PrepareIOOptions(read_options_, opts, &dbg);
   opts.verify_and_reconstruct_read = retry;
   read_req.status.PermitUncheckedError();
   // Actual file read
@@ -256,8 +258,9 @@ void BlockFetcher::ReadBlock(bool retry) {
       PERF_CPU_TIMER_GUARD(
           block_read_cpu_time,
           ioptions_.env ? ioptions_.env->GetSystemClock().get() : nullptr);
-      io_status_ = file_->Read(opts, handle_.offset(), block_size_with_trailer_,
-                               &slice_, /*scratch=*/nullptr, &direct_io_buf_);
+      io_status_ =
+          file_->Read(opts, handle_.offset(), block_size_with_trailer_, &slice_,
+                      /*scratch=*/nullptr, &direct_io_buf_, &dbg);
       PERF_COUNTER_ADD(block_read_count, 1);
       used_buf_ = const_cast<char*>(slice_.data());
     } else if (use_fs_scratch_) {
@@ -269,7 +272,7 @@ void BlockFetcher::ReadBlock(bool retry) {
       read_req.len = block_size_with_trailer_;
       read_req.scratch = nullptr;
       io_status_ = file_->MultiRead(opts, &read_req, /*num_reqs=*/1,
-                                    /*AlignedBuf* =*/nullptr);
+                                    /*AlignedBuf* =*/nullptr, &dbg);
       PERF_COUNTER_ADD(block_read_count, 1);
 
       slice_ = Slice(read_req.result.data(), read_req.result.size());
@@ -286,7 +289,7 @@ void BlockFetcher::ReadBlock(bool retry) {
       io_status_ =
           file_->Read(opts, handle_.offset(), /*size*/ block_size_with_trailer_,
                       /*result*/ &slice_, /*scratch*/ used_buf_,
-                      /*aligned_buf=*/nullptr);
+                      /*aligned_buf=*/nullptr, &dbg);
       PERF_COUNTER_ADD(block_read_count, 1);
 #ifndef NDEBUG
       if (slice_.data() == &stack_buf_[0]) {
@@ -417,7 +420,8 @@ IOStatus BlockFetcher::ReadAsyncBlockContents() {
     assert(prefetch_buffer_ != nullptr);
     if (!for_compaction_) {
       IOOptions opts;
-      IOStatus io_s = file_->PrepareIOOptions(read_options_, opts);
+      IODebugContext dbg;
+      IOStatus io_s = file_->PrepareIOOptions(read_options_, opts, &dbg);
       if (!io_s.ok()) {
         return io_s;
       }
diff --git a/table/meta_blocks.cc b/table/meta_blocks.cc
index 89be11be21dd..73764ae4bb5a 100644
--- a/table/meta_blocks.cc
+++ b/table/meta_blocks.cc
@@ -434,15 +434,16 @@ Status ReadTablePropertiesHelper(
       // If retrying, use a stronger file system read to check and correct
       // data corruption
       IOOptions opts;
-      if (PrepareIOFromReadOptions(ro, ioptions.clock, opts) !=
+      IODebugContext dbg;
+      if (PrepareIOFromReadOptions(ro, ioptions.clock, opts, &dbg) !=
           IOStatus::OK()) {
         return s;
       }
       opts.verify_and_reconstruct_read = true;
       std::unique_ptr<char[]> data(new char[len]);
       Slice result;
-      IOStatus io_s =
-          file->Read(opts, handle.offset(), len, &result, data.get(), nullptr);
+      IOStatus io_s = file->Read(opts, handle.offset(), len, &result,
+                                 data.get(), nullptr, &dbg);
       RecordTick(ioptions.stats, FILE_READ_CORRUPTION_RETRY_COUNT);
       if (!io_s.ok()) {
         ROCKS_LOG_INFO(ioptions.info_log,
@@ -574,8 +575,9 @@ Status ReadMetaIndexBlockInFile(RandomAccessFileReader* file,
                                 Footer* footer_out) {
   Footer footer;
   IOOptions opts;
+  IODebugContext dbg;
   Status s;
-  s = file->PrepareIOOptions(read_options, opts);
+  s = file->PrepareIOOptions(read_options, opts, &dbg);
   if (!s.ok()) {
     return s;
   }
diff --git a/trace_replay/io_tracer.cc b/trace_replay/io_tracer.cc
index a860130f8560..e72b80c4f1bf 100644
--- a/trace_replay/io_tracer.cc
+++ b/trace_replay/io_tracer.cc
@@ -82,7 +82,7 @@ Status IOTraceWriter::WriteIOOp(const IOTraceRecord& record,
     uint32_t set_pos = static_cast<uint32_t>(log2(trace_data & -trace_data));
     switch (set_pos) {
       case IODebugContext::TraceData::kRequestID: {
-        Slice request_id(dbg->request_id);
+        Slice request_id(*dbg->request_id);
         PutLengthPrefixedSlice(&trace.payload, request_id);
       } break;
       default:
diff --git a/trace_replay/io_tracer_test.cc b/trace_replay/io_tracer_test.cc
index be3af4fb3597..6946fa4be11d 100644
--- a/trace_replay/io_tracer_test.cc
+++ b/trace_replay/io_tracer_test.cc
@@ -145,7 +145,8 @@ TEST_F(IOTracerTest, MultipleRecordsWithDifferentIOOpOptions) {
     // Write record with IODebugContext.
     io_op_data = 0;
     IODebugContext dbg;
-    dbg.SetRequestId("request_id_1");
+    const std::string test_request_id = "request_id_1";
+    dbg.SetRequestId(&test_request_id);
     IOTraceRecord record5(0, TraceType::kIOTracer, io_op_data,
                           GetFileOperation(5), 10 /*latency*/,
                           IOStatus::OK().ToString(), file_name);
diff --git a/unreleased_history/new_features/plumb_application_request_id_to_fs.md b/unreleased_history/new_features/plumb_application_request_id_to_fs.md
new file mode 100644
index 000000000000..3144ae8fd225
--- /dev/null
+++ b/unreleased_history/new_features/plumb_application_request_id_to_fs.md
@@ -0,0 +1,2 @@
+[internal team use only] 
+allow an application-defined request_id to be passed to RocksDB and propagated to the filesystem via IODebugContext
diff --git a/util/async_file_reader.cc b/util/async_file_reader.cc
index 8fa4d19933c4..67acc978b9be 100644
--- a/util/async_file_reader.cc
+++ b/util/async_file_reader.cc
@@ -31,7 +31,7 @@ bool AsyncFileReader::MultiReadAsyncImpl(ReadAwaiter* awaiter) {
           }
         },
         &awaiter->read_reqs_[i], &awaiter->io_handle_[i], &awaiter->del_fn_[i],
-        /*aligned_buf=*/nullptr);
+        /*aligned_buf=*/nullptr, awaiter->dbg_);
     if (!s.ok()) {
       // For any non-ok status, the FileSystem will not call the callback
       // So let's update the status ourselves
diff --git a/util/async_file_reader.h b/util/async_file_reader.h
index 50a59519491f..989f392cace5 100644
--- a/util/async_file_reader.h
+++ b/util/async_file_reader.h
@@ -36,9 +36,10 @@ class AsyncFileReader {
                                             const IOOptions& opts,
                                             FSReadRequest* read_reqs,
                                             size_t num_reqs,
-                                            AlignedBuf* aligned_buf) noexcept {
-    return ReadOperation<ReadAwaiter>{*this,     file,     opts,
-                                      read_reqs, num_reqs, aligned_buf};
+                                            AlignedBuf* aligned_buf,
+                                            IODebugContext* dbg) noexcept {
+    return ReadOperation<ReadAwaiter>{*this,    file,        opts, read_reqs,
+                                      num_reqs, aligned_buf, dbg};
   }
 
  private:
@@ -49,12 +50,14 @@ class AsyncFileReader {
    public:
     explicit ReadAwaiter(AsyncFileReader& reader, RandomAccessFileReader* file,
                          const IOOptions& opts, FSReadRequest* read_reqs,
-                         size_t num_reqs, AlignedBuf* /*aligned_buf*/) noexcept
+                         size_t num_reqs, AlignedBuf* /*aligned_buf*/,
+                         IODebugContext* dbg) noexcept
         : reader_(reader),
           file_(file),
           opts_(opts),
           read_reqs_(read_reqs),
           num_reqs_(num_reqs),
+          dbg_(dbg),
           next_(nullptr) {}
 
     bool await_ready() noexcept { return false; }
@@ -82,6 +85,7 @@ class AsyncFileReader {
     const IOOptions& opts_;
     FSReadRequest* read_reqs_;
     size_t num_reqs_;
+    IODebugContext* dbg_;
     autovector<void*, 32> io_handle_;
     autovector<IOHandleDeleter, 32> del_fn_;
     folly::coro::impl::coroutine_handle<> awaiting_coro_;
@@ -101,18 +105,20 @@ class AsyncFileReader {
     explicit ReadOperation(AsyncFileReader& reader,
                            RandomAccessFileReader* file, const IOOptions& opts,
                            FSReadRequest* read_reqs, size_t num_reqs,
-                           AlignedBuf* aligned_buf) noexcept
+                           AlignedBuf* aligned_buf,
+                           IODebugContext* dbg) noexcept
         : reader_(reader),
           file_(file),
           opts_(opts),
           read_reqs_(read_reqs),
           num_reqs_(num_reqs),
-          aligned_buf_(aligned_buf) {}
+          aligned_buf_(aligned_buf),
+          dbg_(dbg) {}
 
     auto viaIfAsync(folly::Executor::KeepAlive<> executor) const {
       return folly::coro::co_viaIfAsync(
-          std::move(executor),
-          Awaiter{reader_, file_, opts_, read_reqs_, num_reqs_, aligned_buf_});
+          std::move(executor), Awaiter{reader_, file_, opts_, read_reqs_,
+                                       num_reqs_, aligned_buf_, dbg_});
     }
 
    private:
@@ -122,6 +128,7 @@ class AsyncFileReader {
     FSReadRequest* read_reqs_;
     size_t num_reqs_;
     AlignedBuf* aligned_buf_;
+    IODebugContext* dbg_;
   };
 
   // This function does the actual work when this awaitable starts execution

From 77af0424137b9804e0b5d80d00dbc7bbc615babf Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Sat, 17 May 2025 14:43:29 -0700
Subject: [PATCH 098/500] Fix some compression-related assertion failures
 (#13621)

Summary:
showing up in the crash test after https://github.com/facebook/rocksdb/issues/13540
* For an assertion `dict_samples.sample_data.size() <= opts_.max_dict_bytes` we needed to ensure that `zstd_max_train_bytes` only takes effect with kZSTD compression.
* For an assertion with `r->table_options.verify_compression == (verify_decomp != nullptr)` we needed to ensure that `data_block_verify_decompressor` is set even when dictionary compression is attempted but not used.
* Noticed along the way: finish an optimization in `CompressAndVerifyBlock` that was incomplete.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13621

Test Plan:
Both failures were reproducible with hard-coding of some crash test params, and now not getting a failure.
```
--compression_type=zstd --compression_max_dict_bytes=16384 --compression_zstd_max_train_bytes=65536 --compression_max_dict_buffer_bytes=131071 --compression_use_zstd_dict_trainer=1
```
Write performance test like in https://github.com/facebook/rocksdb/issues/13540 shows essentially no change, maybe slightly faster (+0.4%) with verify_compression.

Reviewed By: virajthakur

Differential Revision: D74939103

Pulled By: pdillinger

fbshipit-source-id: 8bac8891bc08e1356eff52cc524e5bb409b0f86f
---
 .../block_based/block_based_table_builder.cc  | 60 +++++++++----------
 util/compression.cc                           |  5 +-
 2 files changed, 30 insertions(+), 35 deletions(-)

diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc
index acf0a7e073e5..5453d0cb23ac 100644
--- a/table/block_based/block_based_table_builder.cc
+++ b/table/block_based/block_based_table_builder.cc
@@ -1301,21 +1301,10 @@ void BlockBasedTableBuilder::CompressAndVerifyBlock(
           r->ioptions.clock,
           ShouldReportDetailedTime(r->ioptions.env, r->ioptions.stats));
 
-      if (is_data_block) {
-        if (r->data_block_compressor) {
-          *out_status = r->data_block_compressor->CompressBlock(
-              uncompressed_block_data, compressed_output, &type,
-              &working_area.compress);
-          verify_decomp = r->data_block_verify_decompressor.get();
-        }
-      } else {
-        if (r->basic_compressor) {
-          *out_status = r->basic_compressor->CompressBlock(
-              uncompressed_block_data, compressed_output, &type,
-              &working_area.compress);
-          verify_decomp = r->verify_decompressor.get();
-        }
-      }
+      *out_status =
+          compressor->CompressBlock(uncompressed_block_data, compressed_output,
+                                    &type, &working_area.compress);
+
       // Post-condition of Compressor::CompressBlock
       assert(type == kNoCompression || out_status->ok());
       assert(type == kNoCompression ||
@@ -1989,25 +1978,30 @@ void BlockBasedTableBuilder::EnterUnbuffered() {
         r->data_block_compressor->ObtainWorkingArea();
   }
   Slice serialized_dict = r->data_block_compressor->GetSerializedDict();
-  if (!serialized_dict.empty() && r->verify_decompressor) {
-    // Get an updated dictionary-aware decompressor for verification.
-    Status s = r->verify_decompressor->MaybeCloneForDict(
-        serialized_dict, &r->verify_decompressor_with_dict);
-    // Dictionary support must be present on the decompressor side if it's on
-    // the compressor side.
-    assert(r->verify_decompressor_with_dict);
-    if (r->verify_decompressor_with_dict) {
-      r->data_block_verify_decompressor =
-          r->verify_decompressor_with_dict.get();
-      for (uint32_t i = 0; i < r->compression_parallel_threads; i++) {
-        r->data_block_working_areas[i].verify =
-            r->data_block_verify_decompressor->ObtainWorkingArea(
-                r->data_block_compressor->GetPreferredCompressionType());
-      }
-      assert(s.ok());
+  if (r->verify_decompressor) {
+    if (serialized_dict.empty()) {
+      // No dictionary
+      r->data_block_verify_decompressor = r->verify_decompressor.get();
     } else {
-      assert(!s.ok());
-      r->SetStatus(s);
+      // Get an updated dictionary-aware decompressor for verification.
+      Status s = r->verify_decompressor->MaybeCloneForDict(
+          serialized_dict, &r->verify_decompressor_with_dict);
+      // Dictionary support must be present on the decompressor side if it's on
+      // the compressor side.
+      assert(r->verify_decompressor_with_dict);
+      if (r->verify_decompressor_with_dict) {
+        r->data_block_verify_decompressor =
+            r->verify_decompressor_with_dict.get();
+        for (uint32_t i = 0; i < r->compression_parallel_threads; i++) {
+          r->data_block_working_areas[i].verify =
+              r->data_block_verify_decompressor->ObtainWorkingArea(
+                  r->data_block_compressor->GetPreferredCompressionType());
+        }
+        assert(s.ok());
+      } else {
+        assert(!s.ok());
+        r->SetStatus(s);
+      }
     }
   }
 
diff --git a/util/compression.cc b/util/compression.cc
index c3152c580ae5..0f0210918913 100644
--- a/util/compression.cc
+++ b/util/compression.cc
@@ -184,8 +184,9 @@ class BuiltinCompressorV2 : public Compressor {
       // Dictionary compression disabled
       return 0;
     } else {
-      return opts_.zstd_max_train_bytes > 0 ? opts_.zstd_max_train_bytes
-                                            : opts_.max_dict_bytes;
+      return type_ == kZSTD && opts_.zstd_max_train_bytes > 0
+                 ? opts_.zstd_max_train_bytes
+                 : opts_.max_dict_bytes;
     }
   }
 

From 2ea356d0bea2e9a847792559498c02571dbf1e53 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Sat, 17 May 2025 21:21:14 -0700
Subject: [PATCH 099/500] Start 10.4 release development, and more (#13622)

Summary:
Usual release steps
* Release notes from 10.3 branch
* Update version.h
* Add 10.3.fb to check_format_compatible.sh
* Update folly commit hash. Added a few hacks to fix build errors.

Bonus:
* Add a check_format_compatible.sh sanity check to the per-PR GitHub actions jobs. It should be quick enough and catch typos in release diffs as we've seen in the past.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13622

Test Plan: CI

Reviewed By: jowlyzhang

Differential Revision: D74943843

Pulled By: pdillinger

fbshipit-source-id: 4ff1db9a635e111f8830cadff2d3ee51cf2de512
---
 .github/workflows/pr-jobs.yml                 |  8 ++++
 HISTORY.md                                    | 13 +++++++
 Makefile                                      | 12 +++---
 include/rocksdb/version.h                     |  2 +-
 tools/check_format_compatible.sh              | 38 ++++++++++++-------
 unreleased_history/bug_fixes/deadlock.md      |  1 -
 .../bug_fixes/external_table_checksum.md      |  1 -
 .../bug_fixes/lock-limit-timeout.md           |  1 -
 .../bug_fixes/parallel_compression_bug.md     |  1 -
 ...-tiering-change-temperatur-trivial-copy.md |  1 -
 .../new_features/log-precompression-size.md   |  1 -
 .../new_features/per-txn-threshold.md         |  1 -
 .../plumb_application_request_id_to_fs.md     |  2 -
 13 files changed, 53 insertions(+), 29 deletions(-)
 delete mode 100644 unreleased_history/bug_fixes/deadlock.md
 delete mode 100644 unreleased_history/bug_fixes/external_table_checksum.md
 delete mode 100644 unreleased_history/bug_fixes/lock-limit-timeout.md
 delete mode 100644 unreleased_history/bug_fixes/parallel_compression_bug.md
 delete mode 100644 unreleased_history/new_features/fifo-tiering-change-temperatur-trivial-copy.md
 delete mode 100644 unreleased_history/new_features/log-precompression-size.md
 delete mode 100644 unreleased_history/new_features/per-txn-threshold.md
 delete mode 100644 unreleased_history/new_features/plumb_application_request_id_to_fs.md

diff --git a/.github/workflows/pr-jobs.yml b/.github/workflows/pr-jobs.yml
index 7faaff6637a7..ec221bfb0695 100644
--- a/.github/workflows/pr-jobs.yml
+++ b/.github/workflows/pr-jobs.yml
@@ -52,6 +52,14 @@ jobs:
       run: make check-buck-targets
     - name: Simple source code checks
       run: make check-sources
+    - name: Sanity check check_format_compatible.sh
+      run: |-
+        export TEST_TMPDIR=/dev/shm/rocksdb
+        rm -rf /dev/shm/rocksdb
+        mkdir /dev/shm/rocksdb
+        git reset --hard
+        git config --global --add safe.directory /__w/rocksdb/rocksdb
+        SANITY_CHECK=1 LONG_TEST=1 tools/check_format_compatible.sh
   # ========================= Linux With Tests ======================== #
   build-linux:
     if: ${{ github.repository_owner == 'facebook' }}
diff --git a/HISTORY.md b/HISTORY.md
index 6a9ff81169fc..9084ed860765 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -1,6 +1,19 @@
 # Rocksdb Change Log
 > NOTE: Entries for next release do not go here. Follow instructions in `unreleased_history/README.txt`
 
+## 10.3.0 (05/17/2025)
+### New Features
+* Add new experimental `CompactionOptionsFIFO::allow_trivial_copy_when_change_temperature` along with `CompactionOptionsFIFO::trivial_copy_buffer_size` to allow optimizing FIFO compactions with tiering when kChangeTemperature to move files from source tier FileSystem to another tier FileSystem via trivial and direct copying raw sst file instead of reading thru the content of the SST file then rebuilding the table files.
+* Add a new field to Compaction Stats in LOG files for the pre-compression size written to each level.
+* Add new experimental `TransactionOptions::large_txn_commit_optimize_threshold` to enable optimizations for large transaction commit with per transaction threshold. `TransactionDBOptions::txn_commit_bypass_memtable_threshold` is deprecated in favor of this transaction option.
+* [internal team use only] Allow an application-defined `request_id` to be passed to RocksDB and propagated to the filesystem via IODebugContext
+
+### Bug Fixes
+* Fix a bug where transaction lock upgrade can incorrectly fail with a Deadlock status. This happens when a transaction has a non-zero timeout and tries to upgrade a shared lock that is also held by another transaction.
+* Pass wrapped WritableFileWriter pointer to ExternalTableBuilder so that the file checksum can be correctly calculated and returned by SstFileWriter for external table files.
+* Fix an infinite-loop bug in transaction locking. This can happen if a transaction reaches lock limit and its time out expires before it attempts to wait for it.
+* Fixed a potential data race with `CompressionOptions::parallel_threads > 1` and a `TablePropertiesCollector` overriding `BlockAdd()`.
+
 ## 10.2.0 (04/21/2025)
 ### New Features
 * Provide histogram stats `COMPACTION_PREFETCH_BYTES` to measure number of bytes for RocksDB's prefetching (as opposed to file
diff --git a/Makefile b/Makefile
index a10b95a60123..3b423ba41660 100644
--- a/Makefile
+++ b/Makefile
@@ -2489,11 +2489,13 @@ checkout_folly:
 	fi
 	@# Pin to a particular version for public CI, so that PR authors don't
 	@# need to worry about folly breaking our integration. Update periodically
-	cd third-party/folly && git reset --hard 8e8186f67de7a23d3a07366946b1617343927d84
-	@# NOTE: this hack is required for clang in some cases
-	perl -pi -e 's/int rv = syscall/int rv = (int)syscall/' third-party/folly/folly/detail/Futex.cpp
-	@# NOTE: this hack is required for gcc in some cases
-	perl -pi -e 's/(__has_include.<experimental.memory_resource>.)/__cpp_rtti && $$1/' third-party/folly/folly/memory/MemoryResource.h
+	cd third-party/folly && git reset --hard d17bf897cb5bbf8f07b122a614e8cffdc38edcde
+	@# Apparently missing include
+	perl -pi -e 's/(#include <atomic>)/$$1\n#include <cstring>/' third-party/folly/folly/lang/Exception.h
+	@# Warning-as-error on memcpy
+	perl -pi -e 's/memcpy.&ptr/memcpy((void*)&ptr/' third-party/folly/folly/lang/Exception.cpp
+	@# const mismatch
+	perl -pi -e 's/: environ/: (const char**)(environ)/' third-party/folly/folly/Subprocess.cpp
 	@# NOTE: boost source will be needed for any build including `USE_FOLLY_LITE` builds as those depend on boost headers
 	cd third-party/folly && $(PYTHON) build/fbcode_builder/getdeps.py fetch boost
 
diff --git a/include/rocksdb/version.h b/include/rocksdb/version.h
index 126599544d0e..9890c3682fec 100644
--- a/include/rocksdb/version.h
+++ b/include/rocksdb/version.h
@@ -12,7 +12,7 @@
 // NOTE: in 'main' development branch, this should be the *next*
 // minor or major version number planned for release.
 #define ROCKSDB_MAJOR 10
-#define ROCKSDB_MINOR 3
+#define ROCKSDB_MINOR 4
 #define ROCKSDB_PATCH 0
 
 // Do not use these. We made the mistake of declaring macros starting with
diff --git a/tools/check_format_compatible.sh b/tools/check_format_compatible.sh
index 9aa8cc1a0401..a1a85f031e1d 100755
--- a/tools/check_format_compatible.sh
+++ b/tools/check_format_compatible.sh
@@ -11,6 +11,8 @@
 # Return value 0 means all regression tests pass. 1 if not pass.
 #
 # Environment options:
+#  SANITY_CHECK=1 - Do a syntax check and git checkout test as a sanity check
+#    that the script hasn't been broken by e.g. adding a new release wrongly.
 #  SHORT_TEST=1 - Test only the oldest branch for each kind of test. This is
 #    a good choice for PR validation as it is relatively fast and will find
 #    most issues.
@@ -135,7 +137,7 @@ EOF
 
 # To check for DB forward compatibility with loading options (old version
 # reading data from new), as well as backward compatibility
-declare -a db_forward_with_options_refs=("8.6.fb" "8.7.fb" "8.8.fb" "8.9.fb" "8.10.fb" "8.11.fb" "9.0.fb" "9.1.fb" "9.2.fb" "9.3.fb" "9.4.fb" "9.5.fb" "9.6.fb" "9.7.fb" "9.8.fb" "9.9.fb" "9.10.fb" "9.11.fb" "10.0.fb" "10.1.fb" "10.2.fb")
+declare -a db_forward_with_options_refs=("8.6.fb" "8.7.fb" "8.8.fb" "8.9.fb" "8.10.fb" "8.11.fb" "9.0.fb" "9.1.fb" "9.2.fb" "9.3.fb" "9.4.fb" "9.5.fb" "9.6.fb" "9.7.fb" "9.8.fb" "9.9.fb" "9.10.fb" "9.11.fb" "10.0.fb" "10.1.fb" "10.2.fb" "10.3.fb")
 # To check for DB forward compatibility without loading options (in addition
 # to the "with loading options" set), as well as backward compatibility
 declare -a db_forward_no_options_refs=() # N/A at the moment
@@ -195,10 +197,14 @@ if [ "$SHORT_TEST" == "" ]; then
   done
 fi
 
+invoke_make()
+{
+    [ "$SANITY_CHECK" ] || make "$*"
+}
 generate_db()
 {
     set +e
-    bash "$script_copy_dir"/generate_random_db.sh "$1" "$2"
+    [ "$SANITY_CHECK" ] || bash "$script_copy_dir"/generate_random_db.sh "$1" "$2"
     if [ $? -ne 0 ]; then
         echo ==== Error loading data from $2 to $1 ====
         exit 1
@@ -209,7 +215,7 @@ generate_db()
 compare_db()
 {
     set +e
-    bash "$script_copy_dir"/verify_random_db.sh "$1" "$2" "$3" "$4" "$5"
+    [ "$SANITY_CHECK" ] || bash "$script_copy_dir"/verify_random_db.sh "$1" "$2" "$3" "$4" "$5"
     if [ $? -ne 0 ]; then
         echo ==== Read different content from $1 and $2 or error happened. ====
         exit 1
@@ -220,7 +226,7 @@ compare_db()
 write_external_sst()
 {
     set +e
-    bash "$script_copy_dir"/write_external_sst.sh "$1" "$2" "$3"
+    [ "$SANITY_CHECK" ] || bash "$script_copy_dir"/write_external_sst.sh "$1" "$2" "$3"
     if [ $? -ne 0 ]; then
         echo ==== Error writing external SST file using data from $1 to $3 ====
         exit 1
@@ -231,7 +237,7 @@ write_external_sst()
 ingest_external_sst()
 {
     set +e
-    bash "$script_copy_dir"/ingest_external_sst.sh "$1" "$2"
+    [ "$SANITY_CHECK" ] || bash "$script_copy_dir"/ingest_external_sst.sh "$1" "$2"
     if [ $? -ne 0 ]; then
         echo ==== Error ingesting external SST in $2 to DB at $1 ====
         exit 1
@@ -242,7 +248,7 @@ ingest_external_sst()
 backup_db()
 {
     set +e
-    bash "$script_copy_dir"/backup_db.sh "$1" "$2"
+    [ "$SANITY_CHECK" ] || bash "$script_copy_dir"/backup_db.sh "$1" "$2"
     if [ $? -ne 0 ]; then
         echo ==== Error backing up DB $1 to $2 ====
         exit 1
@@ -253,7 +259,7 @@ backup_db()
 restore_db()
 {
     set +e
-    bash "$script_copy_dir"/restore_db.sh "$1" "$2"
+    [ "$SANITY_CHECK" ] || bash "$script_copy_dir"/restore_db.sh "$1" "$2"
     if [ $? -ne 0 ]; then
         echo ==== Error restoring from $1 to $2 ====
         exit 1
@@ -297,8 +303,8 @@ current_checkout_name="$current_checkout_name ($current_checkout_hash)"
 echo "== Building $current_checkout_name debug"
 git checkout -B $tmp_branch $current_checkout_hash
 force_no_fbcode
-make clean
-DISABLE_WARNING_AS_ERROR=1 make ldb -j$J
+invoke_make clean
+DISABLE_WARNING_AS_ERROR=1 invoke_make ldb -j$J
 
 echo "== Using $current_checkout_name, generate DB with extern SST and ingest"
 current_ext_test_dir=$ext_test_dir"/current"
@@ -318,8 +324,8 @@ do
   echo "== Building $checkout_ref debug"
   git reset --hard $tmp_origin/$checkout_ref
   force_no_fbcode
-  make clean
-  DISABLE_WARNING_AS_ERROR=1 make ldb -j$J
+  invoke_make clean
+  DISABLE_WARNING_AS_ERROR=1 invoke_make ldb -j$J
 
   # We currently assume DB backward compatibility for every branch listed
   echo "== Use $checkout_ref to generate a DB ..."
@@ -376,8 +382,8 @@ done
 echo "== Building $current_checkout_name debug (again, final)"
 git reset --hard $current_checkout_hash
 force_no_fbcode
-make clean
-DISABLE_WARNING_AS_ERROR=1 make ldb -j$J
+invoke_make clean
+DISABLE_WARNING_AS_ERROR=1 invoke_make ldb -j$J
 
 for checkout_ref in "${checkout_refs[@]}"
 do
@@ -404,4 +410,8 @@ do
   fi
 done
 
-echo ==== Compatibility Test PASSED ====
+if [ "$SANITY_CHECK" ]; then
+  echo "==== check_format_compatible.sh sanity check PASSED ===="
+else
+  echo ==== Compatibility Test PASSED ====
+fi
diff --git a/unreleased_history/bug_fixes/deadlock.md b/unreleased_history/bug_fixes/deadlock.md
deleted file mode 100644
index 362b27c90867..000000000000
--- a/unreleased_history/bug_fixes/deadlock.md
+++ /dev/null
@@ -1 +0,0 @@
-* Fix a bug where transaction lock upgrade can incorrectly fail with a Deadlock status. This happens when a transaction has a non-zero timeout and tries to upgrade a shared lock that is also held by another transaction.
diff --git a/unreleased_history/bug_fixes/external_table_checksum.md b/unreleased_history/bug_fixes/external_table_checksum.md
deleted file mode 100644
index 8b6dc226fab8..000000000000
--- a/unreleased_history/bug_fixes/external_table_checksum.md
+++ /dev/null
@@ -1 +0,0 @@
-Pass wrapped WritableFileWriter pointer to ExternalTableBuilder so that the file checksum can be correctly calculated and returned by SstFileWriter for external table files.
diff --git a/unreleased_history/bug_fixes/lock-limit-timeout.md b/unreleased_history/bug_fixes/lock-limit-timeout.md
deleted file mode 100644
index 55eb4726feed..000000000000
--- a/unreleased_history/bug_fixes/lock-limit-timeout.md
+++ /dev/null
@@ -1 +0,0 @@
-* Fix an infinite-loop bug in transaction locking. This can happen if a transaction reaches lock limit and its time out expires before it attempts to wait for it.
diff --git a/unreleased_history/bug_fixes/parallel_compression_bug.md b/unreleased_history/bug_fixes/parallel_compression_bug.md
deleted file mode 100644
index 849f2d595a16..000000000000
--- a/unreleased_history/bug_fixes/parallel_compression_bug.md
+++ /dev/null
@@ -1 +0,0 @@
-* Fixed a potential data race with `CompressionOptions::parallel_threads > 1` and a `TablePropertiesCollector` overriding `BlockAdd()`.
diff --git a/unreleased_history/new_features/fifo-tiering-change-temperatur-trivial-copy.md b/unreleased_history/new_features/fifo-tiering-change-temperatur-trivial-copy.md
deleted file mode 100644
index 6888e67a10ae..000000000000
--- a/unreleased_history/new_features/fifo-tiering-change-temperatur-trivial-copy.md
+++ /dev/null
@@ -1 +0,0 @@
-* Add new experimental `CompactionOptionsFIFO::allow_trivial_copy_when_change_temperature` along with `CompactionOptionsFIFO::trivial_copy_buffer_size` to allow optimizing FIFO compactions with tiering when kChangeTemperature to move files from source tier FileSystem to another tier FileSystem via trivial and direct copying raw sst file instead of reading thru the content of the SST file then rebuilding the table files.
diff --git a/unreleased_history/new_features/log-precompression-size.md b/unreleased_history/new_features/log-precompression-size.md
deleted file mode 100644
index 6266e6fe3f0b..000000000000
--- a/unreleased_history/new_features/log-precompression-size.md
+++ /dev/null
@@ -1 +0,0 @@
-* Add a new field to Compaction Stats in LOG files for the pre-compression size written to each level.
diff --git a/unreleased_history/new_features/per-txn-threshold.md b/unreleased_history/new_features/per-txn-threshold.md
deleted file mode 100644
index 01c6aad53201..000000000000
--- a/unreleased_history/new_features/per-txn-threshold.md
+++ /dev/null
@@ -1 +0,0 @@
-* Add new experimental `TransactionOptions::large_txn_commit_optimize_threshold` to enable optimizations for large transaction commit with per transaction threshold. `TransactionDBOptions::txn_commit_bypass_memtable_threshold` is deprecated in favor of this transaction option.
diff --git a/unreleased_history/new_features/plumb_application_request_id_to_fs.md b/unreleased_history/new_features/plumb_application_request_id_to_fs.md
deleted file mode 100644
index 3144ae8fd225..000000000000
--- a/unreleased_history/new_features/plumb_application_request_id_to_fs.md
+++ /dev/null
@@ -1,2 +0,0 @@
-[internal team use only] 
-allow an application-defined request_id to be passed to RocksDB and propagated to the filesystem via IODebugContext

From 7c9e50e37d80bb581a3ebc0faf2ff0684ccc7aa2 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Mon, 19 May 2025 09:44:04 -0700
Subject: [PATCH 100/500] check_format_compatible.sh fix (#13625)

Summary:
After I broke it in https://github.com/facebook/rocksdb/issues/13622

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13625

Test Plan: manual run of check_format_compatible.sh

Reviewed By: jowlyzhang

Differential Revision: D75003768

Pulled By: pdillinger

fbshipit-source-id: 6734ae5a8c9034a1e08230a840a04a4a2d7d6a15
---
 tools/check_format_compatible.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/check_format_compatible.sh b/tools/check_format_compatible.sh
index a1a85f031e1d..fa84094f8452 100755
--- a/tools/check_format_compatible.sh
+++ b/tools/check_format_compatible.sh
@@ -199,7 +199,7 @@ fi
 
 invoke_make()
 {
-    [ "$SANITY_CHECK" ] || make "$*"
+    [ "$SANITY_CHECK" ] || make "$@"
 }
 generate_db()
 {

From f91f6bd78e75edf560b7797f43f980e666cc159b Mon Sep 17 00:00:00 2001
From: Jay Huh <jewoongh@meta.com>
Date: Mon, 19 May 2025 15:33:59 -0700
Subject: [PATCH 101/500] Include file_size in CompactionServiceOutputFile
 (#13620)

Summary:
Instead of using FileSystem::GetFileSize() for each CompactionOutputFile, use the file size that is being tracked internally as part of the output file's metadata. FileSize is now part of `CompactionServiceOutputFile` and serialized in the `CompactionServiceResult`

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13620

Test Plan:
Tested with logging Meta's internal offload Infra

```
./compaction_job_test
```

Reviewed By: jowlyzhang

Differential Revision: D75006961

Pulled By: jaykorean

fbshipit-source-id: 008f9dc22bd672746ac180380ada4188713a6b85
---
 db/compaction/compaction_job.h          | 19 +++++++------
 db/compaction/compaction_job_test.cc    |  1 +
 db/compaction/compaction_service_job.cc | 38 ++++++++++++++++---------
 3 files changed, 36 insertions(+), 22 deletions(-)

diff --git a/db/compaction/compaction_job.h b/db/compaction/compaction_job.h
index 46deb9cc9bfe..2d01508f8e9a 100644
--- a/db/compaction/compaction_job.h
+++ b/db/compaction/compaction_job.h
@@ -427,8 +427,9 @@ struct CompactionServiceInput {
 // CompactionServiceOutputFile is the metadata for the output SST file
 struct CompactionServiceOutputFile {
   std::string file_name;
-  SequenceNumber smallest_seqno;
-  SequenceNumber largest_seqno;
+  uint64_t file_size{};
+  SequenceNumber smallest_seqno{};
+  SequenceNumber largest_seqno{};
   std::string smallest_internal_key;
   std::string largest_internal_key;
   uint64_t oldest_ancester_time = kUnknownOldestAncesterTime;
@@ -436,24 +437,26 @@ struct CompactionServiceOutputFile {
   uint64_t epoch_number = kUnknownEpochNumber;
   std::string file_checksum = kUnknownFileChecksum;
   std::string file_checksum_func_name = kUnknownFileChecksumFuncName;
-  uint64_t paranoid_hash;
+  uint64_t paranoid_hash{};
   bool marked_for_compaction;
   UniqueId64x2 unique_id{};
   TableProperties table_properties;
   bool is_proximal_level_output;
-  Temperature file_temperature;
+  Temperature file_temperature = Temperature::kUnknown;
 
   CompactionServiceOutputFile() = default;
   CompactionServiceOutputFile(
-      const std::string& name, SequenceNumber smallest, SequenceNumber largest,
-      std::string _smallest_internal_key, std::string _largest_internal_key,
-      uint64_t _oldest_ancester_time, uint64_t _file_creation_time,
-      uint64_t _epoch_number, const std::string& _file_checksum,
+      const std::string& name, uint64_t size, SequenceNumber smallest,
+      SequenceNumber largest, std::string _smallest_internal_key,
+      std::string _largest_internal_key, uint64_t _oldest_ancester_time,
+      uint64_t _file_creation_time, uint64_t _epoch_number,
+      const std::string& _file_checksum,
       const std::string& _file_checksum_func_name, uint64_t _paranoid_hash,
       bool _marked_for_compaction, UniqueId64x2 _unique_id,
       const TableProperties& _table_properties, bool _is_proximal_level_output,
       Temperature _file_temperature)
       : file_name(name),
+        file_size(size),
         smallest_seqno(smallest),
         largest_seqno(largest),
         smallest_internal_key(std::move(_smallest_internal_key)),
diff --git a/db/compaction/compaction_job_test.cc b/db/compaction/compaction_job_test.cc
index b7afc07b996c..8c7baa6ef29c 100644
--- a/db/compaction/compaction_job_test.cc
+++ b/db/compaction/compaction_job_test.cc
@@ -1669,6 +1669,7 @@ TEST_F(CompactionJobTest, ResultSerialization) {
     UniqueId64x2 id{rnd64.Uniform(UINT64_MAX), rnd64.Uniform(UINT64_MAX)};
     result.output_files.emplace_back(
         rnd.RandomString(rnd.Uniform(kStrMaxLen)) /* file_name */,
+        rnd64.Uniform(UINT64_MAX) /* file_size */,
         rnd64.Uniform(UINT64_MAX) /* smallest_seqno */,
         rnd64.Uniform(UINT64_MAX) /* largest_seqno */,
         rnd.RandomBinaryString(
diff --git a/db/compaction/compaction_service_job.cc b/db/compaction/compaction_service_job.cc
index 69f51fc1982a..3c2ff8c09b18 100644
--- a/db/compaction/compaction_service_job.cc
+++ b/db/compaction/compaction_service_job.cc
@@ -221,18 +221,24 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService(
     }
 
     FileMetaData meta;
-    uint64_t file_size;
-    // FIXME: file_size should be part of CompactionServiceOutputFile so that
-    // we don't get DB corruption if the full file size has not been propagated
-    // back to the caller through the file system (which could have metadata
-    // lag or caching bugs).
-    s = fs_->GetFileSize(tgt_file, IOOptions(), &file_size, nullptr);
+    uint64_t file_size = file.file_size;
+
+    // TODO - Clean this up in the next release.
+    // For backward compatibility - in case the remote worker does not populate
+    // the file_size yet. If missing, continue to populate this from the file
+    // system.
+    if (file_size == 0) {
+      s = fs_->GetFileSize(tgt_file, IOOptions(), &file_size, nullptr);
+    }
+
     if (!s.ok()) {
       sub_compact->status = s;
       db_options_.compaction_service->OnInstallation(
           response.scheduled_job_id, CompactionServiceJobStatus::kFailure);
       return CompactionServiceJobStatus::kFailure;
     }
+    assert(file_size > 0);
+
     meta.fd = FileDescriptor(file_num, compaction->output_path_id(), file_size,
                              file.smallest_seqno, file.largest_seqno);
     meta.smallest.DecodeFrom(file.smallest_internal_key);
@@ -421,14 +427,14 @@ Status CompactionServiceCompactionJob::Run() {
     for (const auto& output_file : sub_compact->GetOutputs()) {
       auto& meta = output_file.meta;
       compaction_result_->output_files.emplace_back(
-          MakeTableFileName(meta.fd.GetNumber()), meta.fd.smallest_seqno,
-          meta.fd.largest_seqno, meta.smallest.Encode().ToString(),
-          meta.largest.Encode().ToString(), meta.oldest_ancester_time,
-          meta.file_creation_time, meta.epoch_number, meta.file_checksum,
-          meta.file_checksum_func_name, output_file.validator.GetHash(),
-          meta.marked_for_compaction, meta.unique_id,
-          *output_file.table_properties, output_file.is_proximal_level,
-          meta.temperature);
+          MakeTableFileName(meta.fd.GetNumber()), meta.fd.GetFileSize(),
+          meta.fd.smallest_seqno, meta.fd.largest_seqno,
+          meta.smallest.Encode().ToString(), meta.largest.Encode().ToString(),
+          meta.oldest_ancester_time, meta.file_creation_time, meta.epoch_number,
+          meta.file_checksum, meta.file_checksum_func_name,
+          output_file.validator.GetHash(), meta.marked_for_compaction,
+          meta.unique_id, *output_file.table_properties,
+          output_file.is_proximal_level, meta.temperature);
     }
   }
 
@@ -528,6 +534,10 @@ static std::unordered_map<std::string, OptionTypeInfo>
          {offsetof(struct CompactionServiceOutputFile, file_name),
           OptionType::kEncodedString, OptionVerificationType::kNormal,
           OptionTypeFlags::kNone}},
+        {"file_size",
+         {offsetof(struct CompactionServiceOutputFile, file_size),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
         {"smallest_seqno",
          {offsetof(struct CompactionServiceOutputFile, smallest_seqno),
           OptionType::kUInt64T, OptionVerificationType::kNormal,

From 5bc8abc0ec83c00e0544aa0b72338a7cc6b2c6e6 Mon Sep 17 00:00:00 2001
From: Changyu Bi <changyubi@meta.com>
Date: Tue, 20 May 2025 15:49:01 -0700
Subject: [PATCH 102/500] New CF option to trigger flush based on average cost
 of scanning memtable (#13593)

Summary:
This PR introduces a new CF option, `memtable_avg_op_scan_flush_trigger`, to support triggering a memtable flush when an iterator skips too many invisible keys from the active memtable. This is a follow up to https://github.com/facebook/rocksdb/pull/13523#discussion_r2038261975, which introduced the option `memtable_op_scan_flush_trigger` for a single expensive iterator step. This PR focus on an expensive stretch of iterator steps, between Seeks and until iterator destruction. To avoid triggering a memtable flush for a stretch that is too small, this option only takes effect when the total number of entries skipped from the active memtable in a stretch of iterator steps exceeds `memtable_op_scan_flush_trigger`.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13593

Test Plan:
* New unit tests covering the new option
* Add the option to the crash test.

Reviewed By: hx235

Differential Revision: D74434263

Pulled By: cbi42

fbshipit-source-id: 64f1101efb79c7498e2038eff630713ead8f6f41
---
 db/column_family.cc                           |  19 ++-
 db/db_iter.cc                                 |  46 +++---
 db/db_iter.h                                  |  42 ++++-
 db/db_iterator_test.cc                        | 155 ++++++++++++++++++
 db_stress_tool/db_stress_common.h             |   1 +
 db_stress_tool/db_stress_gflags.cc            |   4 +
 include/rocksdb/advanced_options.h            |  19 +++
 options/cf_options.cc                         |   6 +
 options/cf_options.h                          |   8 +-
 options/options.cc                            |  13 +-
 options/options_helper.cc                     |   2 +
 options/options_settable_test.cc              |   3 +-
 tools/db_crashtest.py                         |   1 +
 .../new_features/avg-flush-trigger.md         |   1 +
 14 files changed, 285 insertions(+), 35 deletions(-)
 create mode 100644 unreleased_history/new_features/avg-flush-trigger.md

diff --git a/db/column_family.cc b/db/column_family.cc
index 6b642fa4cd3d..b4fa2fbf611d 100644
--- a/db/column_family.cc
+++ b/db/column_family.cc
@@ -448,11 +448,20 @@ ColumnFamilyOptions SanitizeCfOptions(const ImmutableDBOptions& db_options,
     result.preclude_last_level_data_seconds = 0;
   }
 
-  if (read_only && result.memtable_op_scan_flush_trigger != 0) {
-    ROCKS_LOG_WARN(db_options.info_log.get(),
-                   "option memtable_op_scan_flush_trigger is sanitized to "
-                   "0(disabled) for read only DB.");
-    result.memtable_op_scan_flush_trigger = 0;
+  if (read_only) {
+    if (result.memtable_op_scan_flush_trigger) {
+      ROCKS_LOG_WARN(db_options.info_log.get(),
+                     "option memtable_op_scan_flush_trigger is sanitized to "
+                     "0(disabled) for read only DB.");
+      result.memtable_op_scan_flush_trigger = 0;
+    }
+    if (result.memtable_avg_op_scan_flush_trigger) {
+      ROCKS_LOG_WARN(
+          db_options.info_log.get(),
+          "option memtable_avg_op_scan_flush_trigger is sanitized to "
+          "0(disabled) for read only DB.");
+      result.memtable_avg_op_scan_flush_trigger = 0;
+    }
   }
 
   return result;
diff --git a/db/db_iter.cc b/db/db_iter.cc
index 4ceffd357242..25d53ae09a8b 100644
--- a/db/db_iter.cc
+++ b/db/db_iter.cc
@@ -66,11 +66,6 @@ DBIter::DBIter(Env* _env, const ReadOptions& read_options,
       timestamp_lb_(read_options.iter_start_ts),
       timestamp_size_(timestamp_ub_ ? timestamp_ub_->size() : 0),
       active_mem_(active_mem),
-      memtable_seqno_lb_((active_mem_ && !active_mem_->IsEmpty())
-                             ? active_mem_->GetFirstSequenceNumber()
-                             : kMaxSequenceNumber),
-      memtable_op_scan_flush_trigger_(
-          mutable_cf_options.memtable_op_scan_flush_trigger),
       direction_(kForward),
       valid_(false),
       current_entry_is_merged_(false),
@@ -98,6 +93,25 @@ DBIter::DBIter(Env* _env, const ReadOptions& read_options,
   // prefix_seek_opt_in_only should force total_order_seek whereever the caller
   // is duplicating the original ReadOptions
   assert(!ioptions.prefix_seek_opt_in_only || read_options.total_order_seek);
+  if (active_mem_) {
+    // FIXME: GetEarliestSequenceNumber() may return a seqno that is one smaller
+    // than the smallest seqno in the memtable. This violates its comment and
+    // entries with that seqno may not be in the active memtable. Before it's
+    // fixed, we use GetFirstSequenceNumber() for more accurate result.
+    memtable_seqno_lb_ = active_mem_->IsEmpty()
+                             ? active_mem_->GetEarliestSequenceNumber()
+                             : active_mem_->GetFirstSequenceNumber();
+    memtable_op_scan_flush_trigger_ =
+        mutable_cf_options.memtable_op_scan_flush_trigger;
+    if (memtable_op_scan_flush_trigger_) {
+      // avg_op_scan_flush_trigger_ requires memtable_op_scan_flush_trigger_ > 0
+      avg_op_scan_flush_trigger_ =
+          mutable_cf_options.memtable_avg_op_scan_flush_trigger;
+    }
+  } else {
+    // memtable_op_scan_flush_trigger_ and avg_op_scan_flush_trigger_ are
+    // initialized to 0(disabled) as default.
+  }
 }
 
 Status DBIter::GetProperty(std::string prop_name, std::string* prop) {
@@ -159,6 +173,7 @@ void DBIter::Next() {
   local_stats_.skip_count_ += num_internal_keys_skipped_;
   local_stats_.skip_count_--;
   num_internal_keys_skipped_ = 0;
+  iter_step_since_seek_++;
   bool ok = true;
   if (direction_ == kReverse) {
     is_key_seqnum_zero_ = false;
@@ -373,8 +388,7 @@ bool DBIter::FindNextUserEntryInternal(bool skipping_saved_key,
   // to one.
   bool reseek_done = false;
 
-  uint64_t mem_ops_scanned = 0;
-  bool marked_for_flush = false;
+  uint64_t mem_hidden_op_scanned = 0;
   do {
     // Will update is_key_seqnum_zero_ as soon as we parsed the current key
     // but we need to save the previous value to be used in the loop.
@@ -431,12 +445,7 @@ bool DBIter::FindNextUserEntryInternal(bool skipping_saved_key,
           CompareKeyForSkip(ikey_.user_key, saved_key_.GetUserKey()) <= 0) {
         num_skipped++;  // skip this entry
         PERF_COUNTER_ADD(internal_key_skipped_count, 1);
-        if (memtable_op_scan_flush_trigger_ && active_mem_ &&
-            ikey_.sequence >= memtable_seqno_lb_ && !marked_for_flush &&
-            ++mem_ops_scanned >= memtable_op_scan_flush_trigger_) {
-          active_mem_->MarkForFlush();
-          marked_for_flush = true;
-        }
+        MarkMemtableForFlushForPerOpTrigger(mem_hidden_op_scanned);
       } else {
         assert(!skipping_saved_key ||
                CompareKeyForSkip(ikey_.user_key, saved_key_.GetUserKey()) > 0);
@@ -458,12 +467,7 @@ bool DBIter::FindNextUserEntryInternal(bool skipping_saved_key,
                                       !iter_.iter()->IsKeyPinned() /* copy */);
               skipping_saved_key = true;
               PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
-              if (memtable_op_scan_flush_trigger_ && active_mem_ &&
-                  ikey_.sequence >= memtable_seqno_lb_ && !marked_for_flush &&
-                  ++mem_ops_scanned >= memtable_op_scan_flush_trigger_) {
-                active_mem_->MarkForFlush();
-                marked_for_flush = true;
-              }
+              MarkMemtableForFlushForPerOpTrigger(mem_hidden_op_scanned);
             }
             break;
           case kTypeValue:
@@ -1588,6 +1592,7 @@ void DBIter::Seek(const Slice& target) {
   ResetBlobData();
   ResetValueAndColumns();
   ResetInternalKeysSkippedCounter();
+  MarkMemtableForFlushForAvgTrigger();
 
   // Seek the inner iterator based on the target key.
   {
@@ -1664,6 +1669,7 @@ void DBIter::SeekForPrev(const Slice& target) {
   ResetBlobData();
   ResetValueAndColumns();
   ResetInternalKeysSkippedCounter();
+  MarkMemtableForFlushForAvgTrigger();
 
   // Seek the inner iterator based on the target key.
   {
@@ -1725,6 +1731,7 @@ void DBIter::SeekToFirst() {
   ResetBlobData();
   ResetValueAndColumns();
   ResetInternalKeysSkippedCounter();
+  MarkMemtableForFlushForAvgTrigger();
   ClearSavedValue();
   is_key_seqnum_zero_ = false;
 
@@ -1788,6 +1795,7 @@ void DBIter::SeekToLast() {
   ResetBlobData();
   ResetValueAndColumns();
   ResetInternalKeysSkippedCounter();
+  MarkMemtableForFlushForAvgTrigger();
   ClearSavedValue();
   is_key_seqnum_zero_ = false;
 
diff --git a/db/db_iter.h b/db/db_iter.h
index 494bb43f57b0..e4353875bb63 100644
--- a/db/db_iter.h
+++ b/db/db_iter.h
@@ -12,7 +12,6 @@
 #include <string>
 
 #include "db/db_impl/db_impl.h"
-#include "db/range_del_aggregator.h"
 #include "memory/arena.h"
 #include "options/cf_options.h"
 #include "rocksdb/db.h"
@@ -62,7 +61,8 @@ class DBIter final : public Iterator {
   //
   // @param active_mem Pointer to the active memtable that `internal_iter`
   // is reading from. If not null, the memtable can be marked for flush
-  // according to option mutable_cf_options.memtable_op_scan_flush_trigger.
+  // according to options mutable_cf_options.memtable_op_scan_flush_trigger
+  // and mutable_cf_options.memtable_avg_op_scan_flush_trigger.
   // @param arena_mode If true, the DBIter will be allocated from the arena.
   static DBIter* NewIter(Env* env, const ReadOptions& read_options,
                          const ImmutableOptions& ioptions,
@@ -145,6 +145,7 @@ class DBIter final : public Iterator {
   void operator=(const DBIter&) = delete;
 
   ~DBIter() override {
+    MarkMemtableForFlushForAvgTrigger();
     ThreadStatus::OperationType cur_op_type =
         ThreadStatusUtil::GetThreadOperation();
     ThreadStatusUtil::SetThreadOperation(
@@ -417,6 +418,36 @@ class DBIter final : public Iterator {
     return true;
   }
 
+  void MarkMemtableForFlushForAvgTrigger() {
+    if (avg_op_scan_flush_trigger_ &&
+        mem_hidden_op_scanned_since_seek_ >= memtable_op_scan_flush_trigger_ &&
+        mem_hidden_op_scanned_since_seek_ >=
+            static_cast<uint64_t>(iter_step_since_seek_) *
+                avg_op_scan_flush_trigger_) {
+      assert(memtable_op_scan_flush_trigger_ > 0);
+      active_mem_->MarkForFlush();
+      avg_op_scan_flush_trigger_ = 0;
+      memtable_op_scan_flush_trigger_ = 0;
+    }
+    iter_step_since_seek_ = 1;
+    mem_hidden_op_scanned_since_seek_ = 0;
+  }
+
+  void MarkMemtableForFlushForPerOpTrigger(uint64_t& mem_hidden_op_scanned) {
+    if (memtable_op_scan_flush_trigger_ &&
+        ikey_.sequence >= memtable_seqno_lb_) {
+      if (++mem_hidden_op_scanned >= memtable_op_scan_flush_trigger_) {
+        active_mem_->MarkForFlush();
+        // Turn off the flush trigger checks.
+        memtable_op_scan_flush_trigger_ = 0;
+        avg_op_scan_flush_trigger_ = 0;
+      }
+      if (avg_op_scan_flush_trigger_) {
+        ++mem_hidden_op_scanned_since_seek_;
+      }
+    }
+  }
+
   const SliceTransform* prefix_extractor_;
   Env* const env_;
   SystemClock* clock_;
@@ -476,8 +507,11 @@ class DBIter final : public Iterator {
   std::string saved_timestamp_;
   std::optional<std::vector<ScanOptions>> scan_opts_;
   ReadOnlyMemTable* const active_mem_;
-  const SequenceNumber memtable_seqno_lb_;
-  const uint32_t memtable_op_scan_flush_trigger_;
+  SequenceNumber memtable_seqno_lb_ = kMaxSequenceNumber;
+  uint32_t memtable_op_scan_flush_trigger_ = 0;
+  uint32_t avg_op_scan_flush_trigger_ = 0;
+  uint32_t iter_step_since_seek_ = 1;
+  uint32_t mem_hidden_op_scanned_since_seek_ = 0;
   Direction direction_;
   bool valid_;
   bool current_entry_is_merged_;
diff --git a/db/db_iterator_test.cc b/db/db_iterator_test.cc
index ccb4ff188ab4..a4477804e0c8 100644
--- a/db/db_iterator_test.cc
+++ b/db/db_iterator_test.cc
@@ -3890,6 +3890,7 @@ TEST_P(DBIteratorTest, MemtableOpsScanFlushTriggerWithSeek) {
             db_->GetIntProperty("rocksdb.num-deletes-active-mem-table", &val));
         ASSERT_EQ(0, val);
       } else {
+        ASSERT_EQ(0, NumTableFilesAtLevel(0));
         uint64_t val = 0;
         ASSERT_TRUE(
             db_->GetIntProperty("rocksdb.num-deletes-active-mem-table", &val));
@@ -3984,6 +3985,160 @@ TEST_P(DBIteratorTest, MemtableOpsScanFlushTriggerWithNext) {
     }
   }
 }
+
+TEST_P(DBIteratorTest, AverageMemtableOpsScanFlushTrigger) {
+  // Tests option memtable_avg_op_scan_flush_trigger with
+  // long tombstone sequences.
+  Random* r = Random::GetTLSInstance();
+
+  const int kAvgTrigger = 10;
+  const int kMaxTrigger = 500;
+  Options options;
+  options.create_if_missing = true;
+  options.memtable_op_scan_flush_trigger = kMaxTrigger;
+  options.memtable_avg_op_scan_flush_trigger = kAvgTrigger;
+  options.level_compaction_dynamic_level_bytes = true;
+  DestroyAndReopen(options);
+
+  const int kNumKeys = 1000;
+  // Base data that will be covered by a consecutive sequence of tombstones.
+  for (int i = 0; i < kNumKeys; ++i) {
+    ASSERT_OK(Put(Key(i), r->RandomString(50)));
+  }
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->CompactRange({}, nullptr, nullptr));
+  ASSERT_EQ(1, NumTableFilesAtLevel(6));
+
+  for (int i = 0; i < kNumKeys; ++i) {
+    // We issue slightly more deletions than kAvgTrigger between visible keys
+    // to ensure avg skipped entries exceed kAvgTrigger.
+    if (i % (kAvgTrigger + 2) != 0) {
+      ASSERT_OK(SingleDelete(Key(i)));
+    }
+  }
+
+  // Each operation, except the first Seek, is expected to see kAvgTrigger + 1
+  // tombstones (from the active memtable) before it finds the next visible key.
+  SetPerfLevel(PerfLevel::kEnableCount);
+  get_perf_context()->Reset();
+  std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+  iter->Seek(Key(1));
+  ASSERT_EQ(get_perf_context()->next_on_memtable_count, kAvgTrigger + 1);
+  iter.reset();
+  // Should not flush since total entries skipped is below
+  // memtable_op_scan_flush_trigger
+  ASSERT_OK(Put(Key(0), "dummy write"));
+  ASSERT_OK(Put(Key(0), "dummy write"));
+  ASSERT_OK(db_->WaitForCompact({}));
+  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+
+  get_perf_context()->Reset();
+  iter.reset(db_->NewIterator(ReadOptions()));
+  int num_ops = 1;
+  uint64_t num_skipped = 0;
+  iter->Seek(Key(0));
+  ASSERT_EQ(iter->key(), Key(0));
+  uint64_t last_memtable_next_count =
+      get_perf_context()->next_on_memtable_count;
+  iter->Next();
+  num_ops++;
+  while (iter->Valid()) {
+    ASSERT_OK(iter->status());
+    uint64_t num_skipped_in_op =
+        get_perf_context()->next_on_memtable_count - last_memtable_next_count;
+    ASSERT_GE(num_skipped_in_op, kAvgTrigger + 1);
+    last_memtable_next_count = get_perf_context()->next_on_memtable_count;
+    num_skipped += num_skipped_in_op;
+    iter->Next();
+    num_ops++;
+  }
+  // During iterator destruction we mark memtable for flush
+  iter.reset();
+
+  // avg trigger
+  ASSERT_GE(num_skipped, kAvgTrigger * num_ops);
+  // memtable_op_scan_flush_trigger
+  ASSERT_GE(num_skipped, kMaxTrigger);
+  // Average hidden entries scanned from memtable per operation is more than
+  // kAvgTrigger and the total skipped is more than
+  // memtable_op_scan_flush_trigger, the current memtable should be marked for
+  // flush. The following two writes will trigger the flush.
+  ASSERT_OK(Put(Key(0), "dummy write"));
+  // Before a write, we schedule memtables for flush if requested.
+  ASSERT_OK(Put(Key(0), "dummy write"));
+  ASSERT_OK(db_->WaitForCompact({}));
+  ASSERT_EQ(1, NumTableFilesAtLevel(0));
+}
+
+TEST_P(DBIteratorTest, AverageMemtableOpsScanFlushTriggerByOverwrites) {
+  // Tests option memtable_avg_op_scan_flush_trigger with overwrites to keys.
+  Random* r = Random::GetTLSInstance();
+
+  const int kAvgTrigger = 25;
+  Options options;
+  options.create_if_missing = true;
+  options.memtable_op_scan_flush_trigger = 250;
+  options.memtable_avg_op_scan_flush_trigger = kAvgTrigger;
+  options.level_compaction_dynamic_level_bytes = true;
+  DestroyAndReopen(options);
+
+  const int kNumKeys = 100;
+  // Base data that will be covered by a consecutive sequence of tombstones.
+  for (int i = 0; i < kNumKeys; ++i) {
+    ASSERT_OK(Put(Key(i), r->RandomString(50)));
+  }
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->CompactRange({}, nullptr, nullptr));
+  ASSERT_EQ(1, NumTableFilesAtLevel(6));
+
+  // One visible key every 10 keys.
+  // Each non-visible user key has 3 non-visible entries in the active memtable.
+  for (int i = 0; i < kNumKeys; ++i) {
+    if (i % 10 != 0) {
+      ASSERT_OK(Put(Key(i), r->RandomString(50)));
+      ASSERT_OK(Put(Key(i), r->RandomString(50)));
+      ASSERT_OK(Delete(Key(i)));
+    }
+  }
+
+  SetPerfLevel(PerfLevel::kEnableCount);
+  get_perf_context()->Reset();
+  ReadOptions ro;
+  std::unique_ptr<Iterator> iter(db_->NewIterator(ro));
+  iter->Seek(Key(1));
+  ASSERT_GT(get_perf_context()->next_on_memtable_count, kAvgTrigger);
+  // Re-seek to trigger check for flush trigger
+  iter->Seek(Key(1));
+  // Should not flush since total entries skipped is below
+  // memtable_op_scan_flush_trigger
+  ASSERT_FALSE(static_cast<ColumnFamilyHandleImpl*>(db_->DefaultColumnFamily())
+                   ->cfd()
+                   ->mem()
+                   ->IsMarkedForFlush());
+  ASSERT_OK(Put(Key(0), "dummy write"));
+  ASSERT_OK(Put(Key(0), "dummy write"));
+  ASSERT_OK(db_->WaitForCompact({}));
+  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+  get_perf_context()->Reset();
+
+  int num_ops = 1;
+  iter->Seek(Key(1));
+  while (iter->Valid()) {
+    num_ops++;
+    iter->Next();
+  }
+  ASSERT_GT(get_perf_context()->next_on_memtable_count, num_ops * kAvgTrigger);
+
+  // Re-seek should check conditions for marking memtable for flush
+  iter->Seek(Key(80));
+
+  // Average hidden entries scanned from memtable per operation is 2.
+  ASSERT_OK(Put(Key(0), "dummy write"));
+  // Before a write, we schedule memtables for flush if requested.
+  ASSERT_OK(Put(Key(0), "dummy write"));
+  ASSERT_OK(db_->WaitForCompact({}));
+  ASSERT_EQ(1, NumTableFilesAtLevel(0));
+}
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/db_stress_tool/db_stress_common.h b/db_stress_tool/db_stress_common.h
index 30dc1844f6ac..6b09e66a8fa0 100644
--- a/db_stress_tool/db_stress_common.h
+++ b/db_stress_tool/db_stress_common.h
@@ -424,6 +424,7 @@ DECLARE_bool(track_and_verify_wals);
 DECLARE_bool(enable_remote_compaction);
 DECLARE_bool(auto_refresh_iterator_with_snapshot);
 DECLARE_uint32(memtable_op_scan_flush_trigger);
+DECLARE_uint32(memtable_avg_op_scan_flush_trigger);
 DECLARE_uint32(ingest_wbwi_one_in);
 
 constexpr long KB = 1024;
diff --git a/db_stress_tool/db_stress_gflags.cc b/db_stress_tool/db_stress_gflags.cc
index ccafb423c11d..b95111932349 100644
--- a/db_stress_tool/db_stress_gflags.cc
+++ b/db_stress_tool/db_stress_gflags.cc
@@ -1482,4 +1482,8 @@ DEFINE_uint32(
     ROCKSDB_NAMESPACE::ColumnFamilyOptions().memtable_op_scan_flush_trigger,
     "Sets CF option memtable_op_scan_flush_trigger.");
 
+DEFINE_uint32(
+    memtable_avg_op_scan_flush_trigger,
+    ROCKSDB_NAMESPACE::ColumnFamilyOptions().memtable_avg_op_scan_flush_trigger,
+    "Sets CF option memtable_avg_op_scan_flush_trigger.");
 #endif  // GFLAGS
diff --git a/include/rocksdb/advanced_options.h b/include/rocksdb/advanced_options.h
index 067503ba01de..57cd12b08472 100644
--- a/include/rocksdb/advanced_options.h
+++ b/include/rocksdb/advanced_options.h
@@ -1127,6 +1127,25 @@ struct AdvancedColumnFamilyOptions {
   // Dynamically changeable through the SetOptions() API.
   uint32_t memtable_op_scan_flush_trigger = 0;
 
+  // Similar to `memtable_op_scan_flush_trigger`, but this option applies to
+  // Next() calls between Seeks or until iterator destruction. If the average
+  // of the number of invisible entries scanned from the active memtable, the
+  // memtable will be marked for flush.
+  // Note that to avoid the case where the window between Seeks is too small,
+  // the option only takes effect if the total number of hidden entries scanned
+  // within a window is at least `memtable_op_scan_flush_trigger`. So this
+  // option is only effective when `memtable_op_scan_flush_trigger` is set.
+  //
+  // This option should be set to a lower value than
+  // `memtable_op_scan_flush_trigger`. It covers the case where an iterator
+  // scans through an expensive key range with many invisible entries from the
+  // active memtable, but the number of invisible entries per operation does not
+  // exceed `memtable_op_scan_flush_trigger`.
+  //
+  // Default: 0 (disabled)
+  // Dynamically changeable through the SetOptions() API.
+  uint32_t memtable_avg_op_scan_flush_trigger = 0;
+
   // Create ColumnFamilyOptions with default values for all fields
   AdvancedColumnFamilyOptions();
   // Create ColumnFamilyOptions from Options
diff --git a/options/cf_options.cc b/options/cf_options.cc
index d5a61bcdbff8..c5a2ce54c0ca 100644
--- a/options/cf_options.cc
+++ b/options/cf_options.cc
@@ -707,6 +707,10 @@ static std::unordered_map<std::string, OptionTypeInfo>
          {offsetof(struct MutableCFOptions, memtable_op_scan_flush_trigger),
           OptionType::kUInt32T, OptionVerificationType::kNormal,
           OptionTypeFlags::kMutable}},
+        {"memtable_avg_op_scan_flush_trigger",
+         {offsetof(struct MutableCFOptions, memtable_avg_op_scan_flush_trigger),
+          OptionType::kUInt32T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
 };
 
 static std::unordered_map<std::string, OptionTypeInfo>
@@ -1190,6 +1194,8 @@ void MutableCFOptions::Dump(Logger* log) const {
                  uncache_aggressiveness);
   ROCKS_LOG_INFO(log, "             memtable_op_scan_flush_trigger: %" PRIu32,
                  memtable_op_scan_flush_trigger);
+  ROCKS_LOG_INFO(log, "         memtable_avg_op_scan_flush_trigger: %" PRIu32,
+                 memtable_avg_op_scan_flush_trigger);
 
   // Universal Compaction Options
   ROCKS_LOG_INFO(log, "compaction_options_universal.size_ratio : %d",
diff --git a/options/cf_options.h b/options/cf_options.h
index 47d8fa7fb208..71577ae91773 100644
--- a/options/cf_options.h
+++ b/options/cf_options.h
@@ -174,7 +174,9 @@ struct MutableCFOptions {
         bottommost_file_compaction_delay(
             options.bottommost_file_compaction_delay),
         uncache_aggressiveness(options.uncache_aggressiveness),
-        memtable_op_scan_flush_trigger(options.memtable_op_scan_flush_trigger) {
+        memtable_op_scan_flush_trigger(options.memtable_op_scan_flush_trigger),
+        memtable_avg_op_scan_flush_trigger(
+            options.memtable_avg_op_scan_flush_trigger) {
     RefreshDerivedOptions(options.num_levels, options.compaction_style);
   }
 
@@ -230,7 +232,8 @@ struct MutableCFOptions {
         memtable_max_range_deletions(0),
         bottommost_file_compaction_delay(0),
         uncache_aggressiveness(0),
-        memtable_op_scan_flush_trigger(0) {}
+        memtable_op_scan_flush_trigger(0),
+        memtable_avg_op_scan_flush_trigger(0) {}
 
   explicit MutableCFOptions(const Options& options);
 
@@ -339,6 +342,7 @@ struct MutableCFOptions {
   uint32_t bottommost_file_compaction_delay;
   uint32_t uncache_aggressiveness;
   uint32_t memtable_op_scan_flush_trigger;
+  uint32_t memtable_avg_op_scan_flush_trigger;
 
   // Derived options
   // Per-level target file size.
diff --git a/options/options.cc b/options/options.cc
index 85dbc51ea92f..d61fd8403182 100644
--- a/options/options.cc
+++ b/options/options.cc
@@ -112,7 +112,9 @@ AdvancedColumnFamilyOptions::AdvancedColumnFamilyOptions(const Options& options)
       blob_cache(options.blob_cache),
       prepopulate_blob_cache(options.prepopulate_blob_cache),
       persist_user_defined_timestamps(options.persist_user_defined_timestamps),
-      memtable_op_scan_flush_trigger(options.memtable_op_scan_flush_trigger) {
+      memtable_op_scan_flush_trigger(options.memtable_op_scan_flush_trigger),
+      memtable_avg_op_scan_flush_trigger(
+          options.memtable_avg_op_scan_flush_trigger) {
   assert(memtable_factory.get() != nullptr);
   if (max_bytes_for_level_multiplier_additional.size() <
       static_cast<unsigned int>(num_levels)) {
@@ -284,9 +286,12 @@ void ColumnFamilyOptions::Dump(Logger* log) const {
   ROCKS_LOG_HEADER(log,
                    "      Options.max_sequential_skip_in_iterations: %" PRIu64,
                    max_sequential_skip_in_iterations);
-  ROCKS_LOG_HEADER(
-      log, "           Options.memtable_op_scan_flush_trigger: %" PRIu32,
-      memtable_op_scan_flush_trigger);
+  ROCKS_LOG_HEADER(log,
+                   "         Options.memtable_op_scan_flush_trigger: %" PRIu32,
+                   memtable_op_scan_flush_trigger);
+  ROCKS_LOG_HEADER(log,
+                   "     Options.memtable_avg_op_scan_flush_trigger: %" PRIu32,
+                   memtable_avg_op_scan_flush_trigger);
   ROCKS_LOG_HEADER(log,
                    "                   Options.max_compaction_bytes: %" PRIu64,
                    max_compaction_bytes);
diff --git a/options/options_helper.cc b/options/options_helper.cc
index 89436141024d..088f5140fcb4 100644
--- a/options/options_helper.cc
+++ b/options/options_helper.cc
@@ -301,6 +301,8 @@ void UpdateColumnFamilyOptions(const MutableCFOptions& moptions,
   cf_opts->uncache_aggressiveness = moptions.uncache_aggressiveness;
   cf_opts->memtable_op_scan_flush_trigger =
       moptions.memtable_op_scan_flush_trigger;
+  cf_opts->memtable_avg_op_scan_flush_trigger =
+      moptions.memtable_avg_op_scan_flush_trigger;
 }
 
 void UpdateColumnFamilyOptions(const ImmutableCFOptions& ioptions,
diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc
index d0daf2fa504a..845f72aa6e7c 100644
--- a/options/options_settable_test.cc
+++ b/options/options_settable_test.cc
@@ -675,7 +675,8 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) {
       "bottommost_file_compaction_delay=7200;"
       "uncache_aggressiveness=1234;"
       "paranoid_memory_checks=1;"
-      "memtable_op_scan_flush_trigger=123;",
+      "memtable_op_scan_flush_trigger=123;"
+      "memtable_avg_op_scan_flush_trigger=12;",
       new_options));
 
   ASSERT_NE(new_options->blob_cache.get(), nullptr);
diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index 638bbb331c3c..9653e24a5d52 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -345,6 +345,7 @@
     "enable_remote_compaction": lambda: random.choice([0, 1]),
     "auto_refresh_iterator_with_snapshot": lambda: random.choice([0, 1]),
     "memtable_op_scan_flush_trigger": lambda: random.choice([0, 10, 100, 1000]),
+    "memtable_avg_op_scan_flush_trigger": lambda: random.choice([0, 2, 20, 200]),
     "ingest_wbwi_one_in": lambda: random.choice([0, 0, 100, 500]),
 }
 _TEST_DIR_ENV_VAR = "TEST_TMPDIR"
diff --git a/unreleased_history/new_features/avg-flush-trigger.md b/unreleased_history/new_features/avg-flush-trigger.md
new file mode 100644
index 000000000000..53fd31e89dae
--- /dev/null
+++ b/unreleased_history/new_features/avg-flush-trigger.md
@@ -0,0 +1 @@
+* Add a new CF option `memtable_avg_op_scan_flush_trigger` that supports triggering memtable flush when an iterator scans through an expensive range of keys, with the average number of skipped keys from the active memtable exceeding the threshold.

From 09cd25f76305f2110131f51068656ab392dc2bf5 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Tue, 20 May 2025 18:50:56 -0700
Subject: [PATCH 103/500] Fix another format compatibility failure (#13628)

Summary:
Some specific old versions around RocksDB 2.5 would compress the metaindex and properties blocks. This hasn't been done since, probably because it interferes with the properties block indicating how to set up for decompression (so the reader can read those blocks before doing any decompression).

To fix backward compatibility, we establish a decompressor early if format_version indicates the file could come from a sufficiently old version.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13628

Test Plan: local and CI runs of tools/check_format_compatible.sh. (I don't believe we need special code to set up a unit test for this case.)

Reviewed By: jowlyzhang

Differential Revision: D75107623

Pulled By: pdillinger

fbshipit-source-id: 97132b8c5e0602e8e27254a11386d866b23cb4f5
---
 table/block_based/block_based_table_reader.cc | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc
index baab81a33342..f11458f5cee0 100644
--- a/table/block_based/block_based_table_reader.cc
+++ b/table/block_based/block_based_table_reader.cc
@@ -709,6 +709,13 @@ Status BlockBasedTable::Open(
   rep->file = std::move(file);
   rep->footer = footer;
 
+  // Some ancient versions (~2.5 - 2.7, format_version=1) could compress the
+  // metaindex block, so we need to allow for that
+  if (footer.format_version() < 2) {
+    auto mgr = GetBuiltinCompressionManager(/*compression_format_version=*/1);
+    rep->decompressor = mgr->GetDecompressor();
+  }
+
   // For fully portable/stable cache keys, we need to read the properties
   // block before setting up cache keys. TODO: consider setting up a bootstrap
   // cache key for PersistentCache to use for metaindex and properties blocks.
@@ -743,6 +750,9 @@ Status BlockBasedTable::Open(
       rep->table_properties ? rep->table_properties->compression_name
                             : std::string{});
   if (saved_comp_type != kNoCompression) {
+    // Includes "unrecognized" or "unspecified" case, including some old files
+    // before the compression_name table property was introduced in
+    // version 4.9.0
     // TODO: custom CompressionManager
     auto mgr = GetBuiltinCompressionManager(
         GetCompressFormatForVersion(footer.format_version()));

From 8dc3d77b591443e405b2b171b3eb4f8461ffd2a3 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Wed, 21 May 2025 10:09:46 -0700
Subject: [PATCH 104/500] Experimental, preliminary support for custom
 CompressionManager (#13626)

Summary:
This exposes CompressionManager and related classes to the public API and adds `ColumnFamilyOptions::compression_manager` for tying a custom compression strategy to a column family. At the moment, this does not support custom/pluggable compression algorithms, just custom strategies around the built-in algorithms, e.g. which compression to use when and where.

A large part of the change is moving code from internal compression.h to a new public header advanced_compression.h, with some minor changes:
* `Decompressor::ExtractUncompressedSize()` is out-of-lined
* CompressionManager inherits Customizable and some related changes to members of CompressionManager are made. (Core functionality of CompressionManager is unchanged.)

This depends on a smart pointer I'm calling `ManagedPtr` which I'm adding to data_structure.h.

Additionally, advanced_compression.h gets CompressorWrapper and CompressionManagerWrapper as building blocks for overriding aspects of compression strategy while leveraging existing compression algorithms / schemas.

Some pieces needed to support the `compression_manager` option and rudimentary Customizable implementation are included. More work will be needed to make this general and well-behaved (see e.g. https://github.com/facebook/rocksdb/issues/8641; I still hit inscrutible problems every time I touch Customizable).

I'll add a release note for the experimental feature once pluggable compression algorithms and more of the Customizable things are working.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13626

Test Plan:
Added a unit test demonstrating how a custom compressor can "bypass" or "reject" compressions.

Expected next follow-up (probably someone else): use a custom CompressionManager/Compressor to replace the internal hack for testing mixed compressions.

Reviewed By: hx235

Differential Revision: D75028850

Pulled By: pdillinger

fbshipit-source-id: 8565bb8ba4b5fa923b1e29e76b4f7bb4faa42381
---
 db/db_test2.cc                                | 115 ++++
 include/rocksdb/advanced_compression.h        | 510 ++++++++++++++++++
 include/rocksdb/compression_type.h            |   3 +
 include/rocksdb/data_structure.h              |  47 ++
 include/rocksdb/options.h                     |   8 +
 options/cf_options.cc                         |   5 +
 options/cf_options.h                          |   2 +
 options/options_helper.cc                     |   1 +
 options/options_settable_test.cc              |   7 +
 .../block_based/block_based_table_builder.cc  |   7 +-
 table/block_based/block_based_table_reader.h  |   1 +
 table/block_fetcher.h                         |   1 +
 test_util/testutil.cc                         |   4 +-
 tools/ldb_cmd.cc                              |   1 +
 util/cast_util.h                              |  47 --
 util/compression.cc                           |  60 ++-
 util/compression.h                            | 423 +--------------
 17 files changed, 761 insertions(+), 481 deletions(-)
 create mode 100644 include/rocksdb/advanced_compression.h

diff --git a/db/db_test2.cc b/db/db_test2.cc
index 644adb624216..7a056de9cc49 100644
--- a/db/db_test2.cc
+++ b/db/db_test2.cc
@@ -1883,6 +1883,121 @@ TEST_F(DBTest2, CompressionOptions) {
   }
 }
 
+TEST_F(DBTest2, CompressionManagerWrapper) {
+  // Test that we can use a custom CompressionManager to wrap the built-in
+  // CompressionManager, thus adopting a custom *strategy* based on existing
+  // algorithms. This will "mark" some blocks (in their contents) as "do not
+  // compress", i.e. no attempt to compress, and some blocks as "reject
+  // compression", i.e. compression attempted but rejected because of ratio
+  // or otherwise. These cases are distinguishable for statistics that
+  // approximate "wasted effort".
+  static std::string kDoNotCompress = "do_not_compress";
+  static std::string kRejectCompression = "reject_compression";
+
+  struct MyCompressor : public CompressorWrapper {
+    using CompressorWrapper::CompressorWrapper;
+
+    Status CompressBlock(Slice uncompressed_data,
+                         std::string* compressed_output,
+                         CompressionType* out_compression_type,
+                         ManagedWorkingArea* working_area) override {
+      auto begin = uncompressed_data.data();
+      auto end = uncompressed_data.data() + uncompressed_data.size();
+      if (std::search(begin, end, kDoNotCompress.begin(),
+                      kDoNotCompress.end()) != end) {
+        // Do not attempt compression
+        EXPECT_EQ(*out_compression_type, kNoCompression);
+        return Status::OK();
+      } else if (std::search(begin, end, kRejectCompression.begin(),
+                             kRejectCompression.end()) != end) {
+        // Simulate attempted & rejected compression
+        *compressed_output = "blah";
+        EXPECT_EQ(*out_compression_type, kNoCompression);
+        return Status::OK();
+      } else {
+        return wrapped_->CompressBlock(uncompressed_data, compressed_output,
+                                       out_compression_type, working_area);
+      }
+    }
+  };
+  struct MyManager : public CompressionManagerWrapper {
+    using CompressionManagerWrapper::CompressionManagerWrapper;
+    const char* Name() const override { return wrapped_->Name(); }
+    std::unique_ptr<Compressor> GetCompressorForSST(
+        const FilterBuildingContext& context, const CompressionOptions& opts,
+        CompressionType preferred) override {
+      return std::make_unique<MyCompressor>(
+          wrapped_->GetCompressorForSST(context, opts, preferred));
+    }
+  };
+  auto mgr = std::make_shared<MyManager>(GetDefaultBuiltinCompressionManager());
+
+  for (CompressionType type : GetSupportedCompressions()) {
+    for (bool use_wrapper : {false, true}) {
+      if (type == kNoCompression) {
+        continue;
+      }
+      SCOPED_TRACE("Compression type: " + std::to_string(type) +
+                   (use_wrapper ? " with " : " no ") + "wrapper");
+
+      Options options = CurrentOptions();
+      options.compression = type;
+      options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+      options.statistics->set_stats_level(StatsLevel::kExceptTimeForMutex);
+      BlockBasedTableOptions bbto;
+      bbto.enable_index_compression = false;
+      options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+      options.compression_manager = use_wrapper ? mgr : nullptr;
+      DestroyAndReopen(options);
+
+      auto PopStat = [&](Tickers t) -> uint64_t {
+        return options.statistics->getAndResetTickerCount(t);
+      };
+
+      Random rnd(301);
+      constexpr int kCount = 13;
+
+      // Highly compressible blocks, except 1 non-compressible. Half of the
+      // compressible are morked for bypass and 1 marked for rejection. Values
+      // are large enough to ensure just 1 k-v per block.
+      for (int i = 0; i < kCount; ++i) {
+        std::string value;
+        if (i == 6) {
+          // One non-compressible block
+          value = rnd.RandomBinaryString(20000);
+        } else {
+          test::CompressibleString(&rnd, 0.1, 20000, &value);
+          if ((i % 2) == 0) {
+            // Half for bypass
+            value += kDoNotCompress;
+          } else if (i == 7) {
+            // One for rejection
+            value += kRejectCompression;
+          }
+        }
+        ASSERT_OK(Put(Key(i), value));
+      }
+      ASSERT_OK(Flush());
+
+      if (use_wrapper) {
+        EXPECT_EQ(kCount / 2 - 1, PopStat(NUMBER_BLOCK_COMPRESSED));
+        EXPECT_EQ(kCount / 2, PopStat(NUMBER_BLOCK_COMPRESSION_BYPASSED));
+        EXPECT_EQ(1 + 1, PopStat(NUMBER_BLOCK_COMPRESSION_REJECTED));
+      } else {
+        EXPECT_EQ(kCount - 1, PopStat(NUMBER_BLOCK_COMPRESSED));
+        EXPECT_EQ(0, PopStat(NUMBER_BLOCK_COMPRESSION_BYPASSED));
+        EXPECT_EQ(1, PopStat(NUMBER_BLOCK_COMPRESSION_REJECTED));
+      }
+
+      // Ensure well-formed for reads
+      for (int i = 0; i < kCount; ++i) {
+        ASSERT_NE(Get(Key(i)), "NOT_FOUND");
+      }
+      ASSERT_EQ(Get(Key(kCount)), "NOT_FOUND");
+    }
+  }
+}
+
 class CompactionStallTestListener : public EventListener {
  public:
   CompactionStallTestListener()
diff --git a/include/rocksdb/advanced_compression.h b/include/rocksdb/advanced_compression.h
new file mode 100644
index 000000000000..f73f5838fda6
--- /dev/null
+++ b/include/rocksdb/advanced_compression.h
@@ -0,0 +1,510 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// APIs for customizing compression in RocksDB.
+//
+// ***********************************************************************
+// EXPERIMENTAL - subject to change while under development
+// ***********************************************************************
+
+#pragma once
+
+#include "rocksdb/cache.h"
+#include "rocksdb/compression_type.h"
+#include "rocksdb/data_structure.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// TODO: alias/adapt for compression
+struct FilterBuildingContext;
+
+// A Compressor represents a very specific but potentially adapting strategy for
+// compressing blocks, including the relevant algorithm(s), options, dictionary,
+// etc. as applicable--every input except the sequence of bytes to compress.
+// Compressor is generally thread-safe so can be shared by multiple threads. (It
+// could make sense to convert unique_ptr<Compressor> to
+// shared_ptr<Compressor>.) A Compressor for data files is expected to be used
+// for just one file, so that compression strategy can be explicitly
+// reconsidered for each new file. However, a Compressor for in-memory use could
+// live indefinitely.
+//
+// If a single thread is doing many compressions under the same strategy, it
+// should request a WorkingArea that will in some cases make repeated
+// compression in a single thread more efficient. Unlike the rest of Compressor,
+// each WorkingArea can only be used by one thread at a time. WorkingAreas can
+// have pre-allocated space and/or data structures, and/or thread-local
+// statistics that are later incorporated into shared statistics objects.
+//
+// The Compressor marks each block with a CompressionType to guide
+// decompression. However, the compression dictionary (or whether there is one
+// associated) is determined at Compressor creation time, though the process of
+// getting a Compressor with a dictionary starts with a Compressor without
+// dictionary (which will often be relevant alongside); see relevant functions.
+// If the Compressor wants to decide block-by-block whether to apply the
+// configured dictionary, that would need to be encoded in CompressionType or
+// the compressed output. (NOTE: this was historically NOT encoded in
+// CompressionType and instead implied by BlockType and the presence of a
+// dictionary block in the file. Some of the resulting awkwardness includes
+// a number of built-in CompressionTypes that ignore any dictionary block in
+// the file; therefore they cannot accommodate dictionary compression in the
+// future without a schema change / extension.)
+class Compressor {
+ public:
+  Compressor() = default;
+  virtual ~Compressor() = default;
+
+  // Returns the max total bytes of for all sampled blocks for creating the data
+  // dictionary, or zero indicating dictionary compression should not be
+  // used/configured. This will typically be called after
+  // CompressionManager::GetCompressor() to see if samples should be accumulated
+  // and passed to MaybeCloneSpecialized().
+  virtual size_t GetMaxSampleSizeIfWantDict(CacheEntryRole block_type) const {
+    // Default implementation: no dictionary
+    (void)block_type;
+    return 0;
+  }
+
+  // Returns the serialized form of the data dictionary associated with this
+  // Compressor. NOTE: empty dict is equivalent to no dict.
+  virtual Slice GetSerializedDict() const { return Slice(); }
+
+  // If there's a dominant compression type returned by this compressor as
+  // configured, return it. Otherwise, return kDisableCompressionOption.
+  virtual CompressionType GetPreferredCompressionType() const {
+    return CompressionType::kDisableCompressionOption;
+  }
+
+  // Utility struct for providing sample data for the compression dictionary.
+  // Potentially extensible by callers of Compressor (but not recommended)
+  struct DictSampleArgs {
+    // All the sample input blocks stored contiguously
+    std::string sample_data;
+    // The lengths of each of the sample blocks in `sample_data`
+    std::vector<size_t> sample_lens;
+
+    bool empty() { return sample_data.empty(); }
+    bool Verify() {
+      size_t total_len = 0;
+      for (auto len : sample_lens) {
+        total_len += len;
+      }
+      return total_len == sample_data.size();
+    }
+  };
+
+  // Create potential variants of the same Compressor that might be
+  // (a) optimized for a particular block type (does not affect correct
+  //     decompression), and/or
+  // (b) configured to use a compression dictionary, based on the given
+  //     samples (decompression must provide the dictionary from
+  //     GetSerializedDict())
+  // Return of nullptr indicates no specialization exists or was attempted
+  // and the caller is best to use the current Compressor for the desired
+  // scenario. Using CacheEntryRole:kMisc for block_type generally means
+  // "unspecified", and both parameters are merely suggestions. The exact
+  // dictionary associated with a returned compressor must be read from
+  // GetSerializedDict().
+  virtual std::unique_ptr<Compressor> MaybeCloneSpecialized(
+      CacheEntryRole block_type, DictSampleArgs&& dict_samples) {
+    // Default implementation: no specialization
+    (void)block_type;
+    (void)dict_samples;
+    // Caller should have checked GetMaxSampleSizeIfWantDict before attempting
+    // to provide dictionary samples
+    assert(dict_samples.empty());
+    return nullptr;
+  }
+
+  // A WorkingArea is an optional structure (both for callers and
+  // implementations) that can enable optimizing repeated compressions by
+  // reusing working space or thread-local tracking of statistics or trends.
+  // This enables use of ZSTD context, for example.
+  //
+  // EXTENSIBLE or reinterpret_cast-able by custom Compressor implementations
+  struct WorkingArea {};
+
+ protected:
+  // To allow for flexible re-use / reclaimation, we have explicit Get and
+  // Release functions, and usually wrap in a special RAII smart pointer.
+  // For example, a WorkingArea could be saved/recycled in thread-local or
+  // core-local storage, or heap managed, etc., though an explicit WorkingArea
+  // is only advised for repeated compression (by a single thread).
+  virtual void ReleaseWorkingArea(WorkingArea*) {}
+
+ public:
+  using ManagedWorkingArea =
+      ManagedPtr<WorkingArea, Compressor, &Compressor::ReleaseWorkingArea>;
+
+  // See struct WorkingArea above
+  virtual ManagedWorkingArea ObtainWorkingArea() {
+    // Default implementation: no working area
+    return {};
+  }
+
+  // Compress `uncompressed_data` to `compressed_output`, which should be
+  // passed in empty. Note that the compressed output will be decompressed
+  // by the sequence Decompressor::ExtractUncompressedSize() followed by
+  // Decompressor::DecompressBlock(), which must also be provided the same
+  // CompressionType saved in `out_compression_type`. (In many configurations,
+  // `compressed_output` will have a prefix storing the uncompressed_data size
+  // before the compressed bytes returned by the underlying compression
+  // algorithm. And the compression type is usually stored adjacent to the
+  // compressed data, or in some cases assumed/asserted based on the particular
+  // Compressor.)
+  //
+  // If return status is not OK, then some fatal condition has arisen. On OK
+  // status, setting `*out_compression_type = kNoCompression` means compression
+  // is declined and the caller should use the original uncompressed_data and
+  // ignore any result in `compressed_output`. Otherwise, compression has
+  // happened with results in `compressed_output` and `out_compression_type`,
+  // which are allowed to vary from call to call.
+  //
+  // The working area is optional and used to optimize repeated compression by
+  // a single thread. ManagedWorkingArea is provided rather than just
+  // WorkingArea so that it can be used only if the `owner` matches expectation.
+  // This could be useful for a Compressor wrapping more than one alternative
+  // underlying Compressor.
+  //
+  // TODO: instead of string, consider a buffer only large enough for max
+  // tolerable compressed size. Does that work for all existing algorithms?
+  // * Looks like Snappy doesn't support that. :(
+  //   * Except perhaps using the Sink interface
+  // * But looks like everything else should. :)
+  // Could save CPU by eliminating extra zero-ing and giving up quicker when
+  // ratio is insufficient.
+  virtual Status CompressBlock(Slice uncompressed_data,
+                               std::string* compressed_output,
+                               CompressionType* out_compression_type,
+                               ManagedWorkingArea* working_area) = 0;
+
+  // TODO: something to populate table properties based on settings, after all
+  // or as WorkingAreas released. Maybe also update stats, or that could be in
+  // thread-specific WorkingArea.
+};
+
+// A Decompressor usually has a wide capability to decompress all kinds of
+// compressed data in the scope of a CompressionManager (see that class below),
+// except
+// (a) it might be optimized for or limited to a particular compression type(s)
+//     (see GetDecompressor* functions for in CompressionManager),
+// (b) distinct Decompressors are required to decompress with compression
+//     dictionaries. (Decompressors are generally associated with empty/no
+//     dictionary unless created with MaybeCloneForDict().)
+//
+// Similar to Compressor, Decompressor is generally thread safe except that each
+// WorkingArea can only be used by a single thread at a time.
+//
+// Decompressors known to be associated with no dictionary are typically
+// returned as shared_ptr, because they are broadly usable across threads.
+// Because compression dictionaries are externally managed (see
+// MaybeCloneForDict()), Decompressors associated with compression dictionaries
+// are typically returned as unique_ptr, so that they are more easily
+// guaranteed not to outlive their dictionaries (e.g. in block cache).
+// Decompressors associated with compression dictionaries might include a
+// processed or "digested" form of the raw dictionary for efficient repeated
+// compressions.
+//
+// NOTE: Splitting the interface between ExtractUncompressedSize and
+// DecompressBlock leaves to the caller details of (and flexibility in)
+// allocating buffers for decompressing into. For example, the data could be
+// decompressed into part of a single buffer allocated to hold a block's
+// uncompressed contents along with an in-memory object representation of the
+// block (to reduce fragmentation and other overheads of separate objects).
+class Decompressor {
+ public:
+  Decompressor() = default;
+  virtual ~Decompressor() = default;
+
+  // A name for logging / debugging purposes
+  virtual const char* Name() const = 0;
+
+  // A WorkingArea is an optional structure (both for callers and
+  // implementations) that can enable optimizing repeated decompressions by
+  // reusing working space or thread-local tracking of statistics. This enables
+  // use of ZSTD context, for example.
+  //
+  // EXTENSIBLE or reinterpret_cast-able by custom Compressor implementations
+  struct WorkingArea {};
+
+ protected:
+  // To allow for flexible re-use / reclaimation, we have explicit Obtain and
+  // Release functions, which are typically wrapped in a special RAII smart
+  // pointer. For example, a WorkingArea could be saved/recycled in thread-local
+  // or core-local storage, or heap managed, etc., though an explicit
+  // WorkingArea is only advised for repeated decompression (by a single
+  // thread).
+
+  virtual void ReleaseWorkingArea(WorkingArea* wa) {
+    // Default implementation: no working area
+    (void)wa;
+    assert(wa == nullptr);
+  }
+
+ public:
+  using ManagedWorkingArea =
+      ManagedPtr<WorkingArea, Decompressor, &Decompressor::ReleaseWorkingArea>;
+
+  virtual ManagedWorkingArea ObtainWorkingArea(CompressionType /*preferred*/) {
+    // Default implementation: no working area
+    return {};
+  }
+
+  // If this Decompressor is associated with a (de)compression dictionary
+  // (created with MaybeCloneForDict()), this returns a pointer to those raw (or
+  // "serialized") bytes, which are externally managed (see
+  // MaybeCloneForDict()).
+  // Default: empty slice => no dictionary
+  virtual const Slice& GetSerializedDict() const;
+
+  // Create a variant of this Decompressor in `out` using the specified raw
+  // ("serialized") dictionary. This step is required for decompressing data
+  // compressed with the same dictionary. The new Decompressor references the
+  // given Slice through its lifetime so the data it points to must be managed
+  // by the caller along with (or beyond) the new Decompressor. If the
+  // dictionary is processed into a form reusable by repeated compressions in
+  // many threads, that happens within this call.
+  //
+  // Must return OK if storing a result in `out`. Otherwise, could return values
+  // like NotSupported - dictionary compression is not (yet) supported for this
+  // kind of Decompressor.
+  // Corruption - dictionary is malformed (though many implementations will
+  // accept any data as a dictionary)
+  virtual Status MaybeCloneForDict(const Slice& /*serialized_dict*/,
+                                   std::unique_ptr<Decompressor>* /*out*/) {
+    return Status::NotSupported(
+        "Dictionary compression not (yet) supported by " + std::string(Name()));
+  }
+
+  // Memory size of this object and others it owns. Does not include the
+  // serialized dictionary (when used) which is externally managed.
+  virtual size_t ApproximateOwnedMemoryUsage() const {
+    // Default: negligible
+    return 0;
+  }
+
+  // Potentially extensible by callers of Decompressor (but not recommended)
+  struct Args {
+    CompressionType compression_type = kNoCompression;
+    Slice compressed_data;
+    uint64_t uncompressed_size = 0;
+    ManagedWorkingArea* working_area = nullptr;
+  };
+
+  // For efficiency on the read path, RocksDB strongly prefers the uncompressed
+  // data size to be encoded in the compressed data in an easily accessible way,
+  // so that allocation of a potentially long-lived buffer can be ideally sized.
+  // This function determines the uncompressed size and potentially modifies
+  // `args.compressed_data` to strip off the size metadata, for providing both
+  // to DecompressBlock along with an appropriate buffer based on that size.
+  // Some implementations will leave `compressed_data` unmodified and let
+  // DecompressBlock call a library function that processes a format that
+  // includes size metadata (e.g. Snappy).
+  //
+  // Even for legacy cases without size metadata (e.g. some very old RocksDB
+  // formats), an exact size is required and could require decompressing the
+  // data (here and in DecompressBlock()).
+  //
+  // Return non-OK in case of corrupt data or some other unworkable limitation
+  // or failure.
+  //
+  // The default implementation uses a standard format for prepending
+  // uncompressed size to the compressed payload. (RocksDB
+  // compress_format_version=2 except Snappy)
+  virtual Status ExtractUncompressedSize(Args& args);
+
+  // Called to decompress a block of data after running ExtractUncompressedSize
+  // on it. `args.compressed_data` is what ExtractUncompressedSize left there
+  // after potentially stripping off the uncompressed size metadata. Returns OK
+  // iff uncompressed data of size `uncompressed_size` is written to
+  // `uncompressed_output`.
+  virtual Status DecompressBlock(const Args& args,
+                                 char* uncompressed_output) = 0;
+};
+
+// A CompressionManager represents
+// * When/where/how to use different compressions
+// * A schema (or set of schemas) and implementation for mapping
+//     <CompressionType, dictionary, compressed data>
+//   to uncompressed data (or error), which can expand over time (error in fewer
+//   cases) for a given CompatibilityName() but can never change that mapping
+//   (because that would break backward compatibility, potential quiet
+//   corruption)
+// TODO: consider adding optional streaming compression support (low priority)
+class CompressionManager
+    : public std::enable_shared_from_this<CompressionManager>,
+      public Customizable {
+ public:
+  CompressionManager() = default;
+  virtual ~CompressionManager() = default;
+  static const char* Type() { return "CompressionManager"; }
+
+  // *************** Creating various Compression Managers *************** //
+  // A name for the schema family of this CompressionManager. In short, if
+  // two CompressionManagers have functionally the same Decompressor(s), they
+  // should have the same CompatibilityName(), so that a compatible
+  // CompressionManager/Decompressor might be used if the original is
+  // unavailable. (Name() can be useful in addition to CompatibilityName() for
+  // understanding what compression strategy was used.)
+  virtual const char* CompatibilityName() const = 0;
+
+  // Default implementation checks the current compatibility name and returns
+  // this CompressionManager (via `out`) if appropriate, and otherwise defers
+  // to CreateFromString().
+  virtual Status FindCompatibleCompressionManager(
+      Slice compatibility_name, std::shared_ptr<CompressionManager>* out);
+
+  // Create a CompressionManager from a string, including built-in
+  // CompressionManager types.
+  // TODO: ObjectLibrary stuff
+  static Status CreateFromString(const ConfigOptions& config_options,
+                                 const std::string& id,
+                                 std::shared_ptr<CompressionManager>* result);
+
+  // ************************* Compressor creation *********************** //
+  // Returning nullptr means compression is entirely disabled for the file,
+  // which is valid at the discretion of the CompressionManager. Returning
+  // nullptr should normally be the result if preferred == kNoCompression.
+  //
+  // These functions must be thread-safe.
+
+  // Get a compressor for an SST file.
+  // SUBJECT TO CHANGE
+  // TODO: is it practical to get ColumnFamilyOptions plumbed into here?
+  virtual std::unique_ptr<Compressor> GetCompressorForSST(
+      const FilterBuildingContext&, const CompressionOptions& opts,
+      CompressionType preferred) {
+    return GetCompressor(opts, preferred);
+  }
+
+  // Get a compressor for a generic/unspecified purpose (e.g. in-memory
+  // compression).
+  virtual std::unique_ptr<Compressor> GetCompressor(
+      const CompressionOptions& opts, CompressionType type) = 0;
+
+  // **************************** Decompressors ************************** //
+  // Get a decompressor that is compatible with any blocks compressed by
+  // compressors returned by this CompressionManager (at least this code
+  // revision and earlier). (NOTE: recommended to return a shared_ptr alias of
+  // this shared_ptr to a field that is a Decompressor.)
+  // Justification for not making CompressionManager inherit Decompressor: this
+  // tends to run into the diamond inheritance problem in implementations and
+  // potential overheads of virtual inheritance.
+  virtual std::shared_ptr<Decompressor> GetDecompressor() = 0;
+
+  // Compatible with same as above, but potentially optimized for a certain
+  // expected CompressionType
+  virtual std::shared_ptr<Decompressor> GetDecompressorOptimizeFor(
+      CompressionType /*optimize_for_type*/) {
+    // Safe default implementation
+    return GetDecompressor();
+  }
+
+  // Get a decompressor that is allowed to have support only for the
+  // CompressionTypes in the given start-to-end array (unique, sorted by
+  // unsigned char)
+  virtual std::shared_ptr<Decompressor> GetDecompressorForTypes(
+      const CompressionType* /*types_begin*/,
+      const CompressionType* /*types_end*/) {
+    // Safe default implementation
+    return GetDecompressor();
+  }
+};
+
+// ************************* Utility wrappers etc. *********************** //
+class CompressorWrapper : public Compressor {
+ public:
+  explicit CompressorWrapper(std::unique_ptr<Compressor> compressor)
+      : wrapped_(std::move(compressor)) {}
+  // No copies
+  CompressorWrapper(const CompressorWrapper&) = delete;
+  CompressorWrapper& operator=(const CompressorWrapper&) = delete;
+
+  size_t GetMaxSampleSizeIfWantDict(CacheEntryRole block_type) const override {
+    return wrapped_->GetMaxSampleSizeIfWantDict(block_type);
+  }
+
+  Slice GetSerializedDict() const override {
+    return wrapped_->GetSerializedDict();
+  }
+
+  CompressionType GetPreferredCompressionType() const override {
+    return wrapped_->GetPreferredCompressionType();
+  }
+
+  std::unique_ptr<Compressor> MaybeCloneSpecialized(
+      CacheEntryRole block_type, DictSampleArgs&& dict_samples) override {
+    return wrapped_->MaybeCloneSpecialized(block_type, std::move(dict_samples));
+  }
+
+  ManagedWorkingArea ObtainWorkingArea() override {
+    return wrapped_->ObtainWorkingArea();
+  }
+
+  Status CompressBlock(Slice uncompressed_data, std::string* compressed_output,
+                       CompressionType* out_compression_type,
+                       ManagedWorkingArea* working_area) override {
+    return wrapped_->CompressBlock(uncompressed_data, compressed_output,
+                                   out_compression_type, working_area);
+  }
+
+ protected:
+  std::unique_ptr<Compressor> wrapped_;
+};
+
+// TODO: CompressorBase, for custom compressions
+
+class CompressionManagerWrapper : public CompressionManager {
+ public:
+  explicit CompressionManagerWrapper(
+      std::shared_ptr<CompressionManager> wrapped)
+      : wrapped_(std::move(wrapped)) {}
+
+  const char* CompatibilityName() const override {
+    return wrapped_->CompatibilityName();
+  }
+
+  Status FindCompatibleCompressionManager(
+      Slice compatibility_name,
+      std::shared_ptr<CompressionManager>* out) override {
+    return wrapped_->FindCompatibleCompressionManager(compatibility_name, out);
+  }
+
+  std::unique_ptr<Compressor> GetCompressorForSST(
+      const FilterBuildingContext& context, const CompressionOptions& opts,
+      CompressionType preferred) override {
+    return wrapped_->GetCompressorForSST(context, opts, preferred);
+  }
+
+  std::unique_ptr<Compressor> GetCompressor(const CompressionOptions& opts,
+                                            CompressionType type) override {
+    return wrapped_->GetCompressor(opts, type);
+  }
+
+  std::shared_ptr<Decompressor> GetDecompressor() override {
+    return wrapped_->GetDecompressor();
+  }
+
+  std::shared_ptr<Decompressor> GetDecompressorOptimizeFor(
+      CompressionType optimize_for_type) override {
+    return wrapped_->GetDecompressorOptimizeFor(optimize_for_type);
+  }
+
+  std::shared_ptr<Decompressor> GetDecompressorForTypes(
+      const CompressionType* types_begin,
+      const CompressionType* types_end) override {
+    return wrapped_->GetDecompressorForTypes(types_begin, types_end);
+  }
+
+ protected:
+  std::shared_ptr<CompressionManager> wrapped_;
+};
+
+// Compression manager that implements built-in compression strategy. The
+// behavior of
+// compression_manager=nullptr with this
+const std::shared_ptr<CompressionManager>&
+GetDefaultBuiltinCompressionManager();
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/include/rocksdb/compression_type.h b/include/rocksdb/compression_type.h
index d7ef0b7aa1be..96377c2427ad 100644
--- a/include/rocksdb/compression_type.h
+++ b/include/rocksdb/compression_type.h
@@ -182,4 +182,7 @@ struct CompressionOptions {
 #endif
 };
 
+// See advanced_compression.h
+class CompressionManager;
+
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/include/rocksdb/data_structure.h b/include/rocksdb/data_structure.h
index ffab82c514a5..6d408a95968f 100644
--- a/include/rocksdb/data_structure.h
+++ b/include/rocksdb/data_structure.h
@@ -183,4 +183,51 @@ class SmallEnumSet {
   StateT state_;
 };
 
+// A smart pointer that tracks an object and an owner, using a statically
+// determined function on those to reclaim the object, if both object and owner
+// are non-null
+template <typename T, class Owner, auto Fn>
+class ManagedPtr {
+ public:
+  ManagedPtr() = default;
+  ManagedPtr(T* ptr, Owner* owner) : ptr_(ptr), owner_(owner) {}
+  ~ManagedPtr() {
+    if (ptr_ && owner_) {
+      if constexpr (std::is_member_function_pointer_v<decltype(Fn)>) {
+        (owner_->*Fn)(ptr_);
+      } else {
+        Fn(owner_, ptr_);
+      }
+    }
+  }
+  // No copies
+  ManagedPtr(const ManagedPtr&) = delete;
+  ManagedPtr& operator=(const ManagedPtr&) = delete;
+  // Moves
+  ManagedPtr(ManagedPtr&& other) noexcept {
+    ptr_ = other.ptr_;
+    owner_ = other.owner_;
+    other.ptr_ = nullptr;
+    other.owner_ = nullptr;
+  }
+  ManagedPtr& operator=(ManagedPtr&& other) noexcept {
+    ptr_ = other.ptr_;
+    owner_ = other.owner_;
+    other.ptr_ = nullptr;
+    other.owner_ = nullptr;
+    return *this;
+  }
+
+  T* get() const { return ptr_; }
+  T* operator->() const { return ptr_; }
+  T& operator*() const { return *ptr_; }
+  operator bool() const { return ptr_ != nullptr; }
+
+  Owner* owner() const { return owner_; }
+
+ private:
+  T* ptr_ = nullptr;
+  Owner* owner_ = nullptr;
+};
+
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index ba5b98147abe..6e802f75a923 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -233,6 +233,14 @@ struct ColumnFamilyOptions : public AdvancedColumnFamilyOptions {
   // different options for compression algorithms
   CompressionOptions compression_opts;
 
+  // EXPERIMENTAL
+  // Customized compression through a callback interface. When non-nullptr,
+  // supersedes the above compression options, except that the above options are
+  // still processed as they historically would be and passed to
+  // CompressionManager::GetCompressorForSST as hints or suggestions. See
+  // advanced_compression.h
+  std::shared_ptr<CompressionManager> compression_manager;
+
   // Number of files to trigger level-0 compaction. A value <0 means that
   // level-0 compaction will not be triggered by number of files at all.
   //
diff --git a/options/cf_options.cc b/options/cf_options.cc
index c5a2ce54c0ca..315e70273331 100644
--- a/options/cf_options.cc
+++ b/options/cf_options.cc
@@ -698,6 +698,11 @@ static std::unordered_map<std::string, OptionTypeInfo>
                      name, value, addr);
                }
              })},
+        {"compression_manager",
+         OptionTypeInfo::AsCustomSharedPtr<CompressionManager>(
+             offsetof(struct MutableCFOptions, compression_manager),
+             OptionVerificationType::kByNameAllowNull,
+             (OptionTypeFlags::kMutable | OptionTypeFlags::kAllowNull))},
         // End special case properties
         {"memtable_max_range_deletions",
          {offsetof(struct MutableCFOptions, memtable_max_range_deletions),
diff --git a/options/cf_options.h b/options/cf_options.h
index 71577ae91773..378dfc28e7d5 100644
--- a/options/cf_options.h
+++ b/options/cf_options.h
@@ -161,6 +161,7 @@ struct MutableCFOptions {
         bottommost_compression(options.bottommost_compression),
         compression_opts(options.compression_opts),
         bottommost_compression_opts(options.bottommost_compression_opts),
+        compression_manager(options.compression_manager),
         last_level_temperature(options.last_level_temperature),
         default_write_temperature(options.default_write_temperature),
         memtable_protection_bytes_per_key(
@@ -330,6 +331,7 @@ struct MutableCFOptions {
   CompressionType bottommost_compression;
   CompressionOptions compression_opts;
   CompressionOptions bottommost_compression_opts;
+  std::shared_ptr<CompressionManager> compression_manager;
   Temperature last_level_temperature;
   Temperature default_write_temperature;
   uint32_t memtable_protection_bytes_per_key;
diff --git a/options/options_helper.cc b/options/options_helper.cc
index 088f5140fcb4..9ce73cad094b 100644
--- a/options/options_helper.cc
+++ b/options/options_helper.cc
@@ -293,6 +293,7 @@ void UpdateColumnFamilyOptions(const MutableCFOptions& moptions,
   cf_opts->compression_opts = moptions.compression_opts;
   cf_opts->bottommost_compression = moptions.bottommost_compression;
   cf_opts->bottommost_compression_opts = moptions.bottommost_compression_opts;
+  cf_opts->compression_manager = moptions.compression_manager;
   cf_opts->sample_for_compression = moptions.sample_for_compression;
   cf_opts->compression_per_level = moptions.compression_per_level;
   cf_opts->last_level_temperature = moptions.last_level_temperature;
diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc
index 845f72aa6e7c..294f1b9e1f74 100644
--- a/options/options_settable_test.cc
+++ b/options/options_settable_test.cc
@@ -530,6 +530,8 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) {
        sizeof(const CompactionFilter*)},
       {offsetof(struct ColumnFamilyOptions, compaction_filter_factory),
        sizeof(std::shared_ptr<CompactionFilterFactory>)},
+      {offsetof(struct ColumnFamilyOptions, compression_manager),
+       sizeof(std::shared_ptr<CompressionManager>)},
       {offsetof(struct ColumnFamilyOptions, prefix_extractor),
        sizeof(std::shared_ptr<const SliceTransform>)},
       {offsetof(struct ColumnFamilyOptions, snap_refresh_nanos),
@@ -619,6 +621,7 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) {
       "strategy=7;max_dict_bytes=8;level=9;window_bits=10;max_compressed_bytes_"
       "per_kb=876;checksum=true};"
       "bottommost_compression=kDisableCompressionOption;"
+      "compression_manager=BuiltinV2;"
       "level0_stop_writes_trigger=33;"
       "num_levels=99;"
       "level0_slowdown_writes_trigger=22;"
@@ -700,6 +703,8 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) {
       new_options->compaction_options_fifo.file_temperature_age_thresholds[0]
           .age,
       12345);
+  ASSERT_EQ(new_options->compression_manager,
+            GetBuiltinCompressionManager(/*compression_format_version*/ 2));
 
   ColumnFamilyOptions rnd_filled_options = *new_options;
 
@@ -719,6 +724,8 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) {
        sizeof(std::vector<int>)},
       {offsetof(struct MutableCFOptions, compaction_options_fifo),
        sizeof(struct CompactionOptionsFIFO)},
+      {offsetof(struct MutableCFOptions, compression_manager),
+       sizeof(std::shared_ptr<CompressionManager>)},
       {offsetof(struct MutableCFOptions, compression_per_level),
        sizeof(std::vector<CompressionType>)},
       {offsetof(struct MutableCFOptions, max_file_size),
diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc
index 5453d0cb23ac..95f25f80784a 100644
--- a/table/block_based/block_based_table_builder.cc
+++ b/table/block_based/block_based_table_builder.cc
@@ -477,8 +477,11 @@ struct BlockBasedTableBuilder::Rep {
     }
 
     // TODO: get CompressionManager from options and sort out properties
-    auto mgr = GetBuiltinCompressionManager(
-        GetCompressFormatForVersion(table_opt.format_version));
+    auto mgr = tbo.moptions.compression_manager;
+    if (mgr == nullptr) {
+      mgr = GetBuiltinCompressionManager(
+          GetCompressFormatForVersion(table_opt.format_version));
+    }
     props.compression_name = CompressionTypeToString(tbo.compression_type);
     props.compression_options =
         CompressionOptionsToString(tbo.compression_opts);
diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h
index b31ff87c4c61..b01a67d007eb 100644
--- a/table/block_based/block_based_table_reader.h
+++ b/table/block_based/block_based_table_reader.h
@@ -34,6 +34,7 @@
 #include "table/two_level_iterator.h"
 #include "trace_replay/block_cache_tracer.h"
 #include "util/atomic.h"
+#include "util/cast_util.h"
 #include "util/coro_utils.h"
 #include "util/hash_containers.h"
 
diff --git a/table/block_fetcher.h b/table/block_fetcher.h
index 9360429fab25..76e59369f093 100644
--- a/table/block_fetcher.h
+++ b/table/block_fetcher.h
@@ -14,6 +14,7 @@
 #include "table/block_based/block_type.h"
 #include "table/format.h"
 #include "table/persistent_cache_options.h"
+#include "util/cast_util.h"
 
 namespace ROCKSDB_NAMESPACE {
 
diff --git a/test_util/testutil.cc b/test_util/testutil.cc
index d3af4260c27d..d65cefd60fb7 100644
--- a/test_util/testutil.cc
+++ b/test_util/testutil.cc
@@ -91,9 +91,9 @@ bool ShouldPersistUDT(const UserDefinedTimestampTestMode& test_mode) {
   return test_mode != UserDefinedTimestampTestMode::kStripUserDefinedTimestamp;
 }
 
-Slice CompressibleString(Random* rnd, double compressed_fraction, int len,
+Slice CompressibleString(Random* rnd, double compressed_to_fraction, int len,
                          std::string* dst) {
-  int raw = static_cast<int>(len * compressed_fraction);
+  int raw = static_cast<int>(len * compressed_to_fraction);
   if (raw < 1) {
     raw = 1;
   }
diff --git a/tools/ldb_cmd.cc b/tools/ldb_cmd.cc
index 076387ece281..a581990420a2 100644
--- a/tools/ldb_cmd.cc
+++ b/tools/ldb_cmd.cc
@@ -874,6 +874,7 @@ bool LDBCommand::ParseCompressionTypeOption(
       g_hack_mixed_compression.StoreRelaxed(1);
       // Need to list zstd in compression_name table property if it's
       // potentially in the mix, for proper handling of context and dictionary.
+      // (Older versions of RocksDB could crash if that's not satisfied.)
       value = ZSTD_Supported() ? kZSTD : GetSupportedCompressions()[0];
       return true;
 #endif  // !NDEBUG
diff --git a/util/cast_util.h b/util/cast_util.h
index 60d198a9c845..414feda9cbea 100644
--- a/util/cast_util.h
+++ b/util/cast_util.h
@@ -85,51 +85,4 @@ class UnownedPtr {
   T* ptr_ = nullptr;
 };
 
-// A smart pointer that tracks an object and an owner, using a statically
-// determined function on those to reclaim the object, if both object and owner
-// are non-null
-template <typename T, class Owner, auto Fn>
-class ManagedPtr {
- public:
-  ManagedPtr() = default;
-  ManagedPtr(T* ptr, Owner* owner) : ptr_(ptr), owner_(owner) {}
-  ~ManagedPtr() {
-    if (ptr_ && owner_) {
-      if constexpr (std::is_member_function_pointer_v<decltype(Fn)>) {
-        (owner_->*Fn)(ptr_);
-      } else {
-        Fn(owner_, ptr_);
-      }
-    }
-  }
-  // No copies
-  ManagedPtr(const ManagedPtr&) = delete;
-  ManagedPtr& operator=(const ManagedPtr&) = delete;
-  // Moves
-  ManagedPtr(ManagedPtr&& other) noexcept {
-    ptr_ = other.ptr_;
-    owner_ = other.owner_;
-    other.ptr_ = nullptr;
-    other.owner_ = nullptr;
-  }
-  ManagedPtr& operator=(ManagedPtr&& other) noexcept {
-    ptr_ = other.ptr_;
-    owner_ = other.owner_;
-    other.ptr_ = nullptr;
-    other.owner_ = nullptr;
-    return *this;
-  }
-
-  T* get() const { return ptr_; }
-  T* operator->() const { return ptr_; }
-  T& operator*() const { return *ptr_; }
-  operator bool() const { return ptr_ != nullptr; }
-
-  Owner* owner() const { return owner_; }
-
- private:
-  T* ptr_ = nullptr;
-  Owner* owner_ = nullptr;
-};
-
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/util/compression.cc b/util/compression.cc
index 0f0210918913..68ed29c446d2 100644
--- a/util/compression.cc
+++ b/util/compression.cc
@@ -6,6 +6,7 @@
 #include "util/compression.h"
 
 #include "options/options_helper.h"
+#include "rocksdb/convenience.h"
 
 namespace ROCKSDB_NAMESPACE {
 
@@ -124,6 +125,26 @@ void ZSTDStreamingUncompress::Reset() {
 // ***********************************************************************
 // BEGIN built-in implementation of customization interface
 // ***********************************************************************
+Status Decompressor::ExtractUncompressedSize(Args& args) {
+  // Default implementation:
+  //
+  // Standard format for prepending uncompressed size to the compressed
+  // payload. (RocksDB compress_format_version=2 except Snappy)
+  //
+  // This is historically a varint32, but it is preliminarily generalized
+  // to varint64. (TODO: support that on the write side, at least for some
+  // codecs, in BBT format_version=7)
+  if (LIKELY(GetVarint64(&args.compressed_data, &args.uncompressed_size))) {
+    if (LIKELY(args.uncompressed_size <= SIZE_MAX)) {
+      return Status::OK();
+    } else {
+      return Status::MemoryLimit("Uncompressed size too large for platform");
+    }
+  } else {
+    return Status::Corruption("Unable to extract uncompressed size");
+  }
+}
+
 const Slice& Decompressor::GetSerializedDict() const {
   // Default: empty slice => no dictionary
   static Slice kEmptySlice;
@@ -858,22 +879,36 @@ const std::shared_ptr<BuiltinCompressionManagerV2>
 
 }  // namespace
 
+Status CompressionManager::CreateFromString(
+    const ConfigOptions& /*config_options*/, const std::string& id,
+    std::shared_ptr<CompressionManager>* result) {
+  if (id == kNullptrString || id.empty()) {
+    result->reset();
+    return Status::OK();
+  } else if (id.compare(kBuiltinCompressionManagerV1->CompatibilityName()) ==
+                 0 ||
+             id.compare(kBuiltinCompressionManagerV1->Name()) == 0) {
+    *result = kBuiltinCompressionManagerV1;
+    return Status::OK();
+  } else if (id.compare(kBuiltinCompressionManagerV2->CompatibilityName()) ==
+                 0 ||
+             id.compare(kBuiltinCompressionManagerV2->Name()) == 0) {
+    *result = kBuiltinCompressionManagerV2;
+    return Status::OK();
+  } else {
+    return Status::NotFound("Compatible compression manager for \"" + id +
+                            "\"");
+  }
+}
+
 Status CompressionManager::FindCompatibleCompressionManager(
     Slice compatibility_name, std::shared_ptr<CompressionManager>* out) {
   if (compatibility_name.compare(CompatibilityName()) == 0) {
     *out = shared_from_this();
     return Status::OK();
-  } else if (compatibility_name.compare(
-                 kBuiltinCompressionManagerV1->CompatibilityName()) == 0) {
-    *out = kBuiltinCompressionManagerV1;
-    return Status::OK();
-  } else if (compatibility_name.compare(
-                 kBuiltinCompressionManagerV2->CompatibilityName()) == 0) {
-    *out = kBuiltinCompressionManagerV2;
-    return Status::OK();
   } else {
-    return Status::NotFound("Compatible compression manager for \"" +
-                            compatibility_name.ToString() + "\"");
+    return CreateFromString(ConfigOptions(), compatibility_name.ToString(),
+                            out);
   }
 }
 
@@ -895,6 +930,11 @@ const std::shared_ptr<CompressionManager>& GetBuiltinCompressionManager(
   }
 }
 
+const std::shared_ptr<CompressionManager>&
+GetDefaultBuiltinCompressionManager() {
+  return GetBuiltinCompressionManager(2);
+}
+
 // ***********************************************************************
 // END built-in implementation of customization interface
 // ***********************************************************************
diff --git a/util/compression.h b/util/compression.h
index 4f23cc320a63..87545f573404 100644
--- a/util/compression.h
+++ b/util/compression.h
@@ -12,9 +12,6 @@
 #include <algorithm>
 #include <limits>
 
-#include "port/likely.h"
-#include "util/atomic.h"
-#include "util/cast_util.h"
 #ifdef ROCKSDB_MALLOC_USABLE_SIZE
 #ifdef OS_FREEBSD
 #include <malloc_np.h>
@@ -25,10 +22,12 @@
 #include <string>
 
 #include "memory/memory_allocator_impl.h"
+#include "port/likely.h"
+#include "rocksdb/advanced_compression.h"
 #include "rocksdb/options.h"
-#include "rocksdb/table.h"
 #include "table/block_based/block_type.h"
 #include "test_util/sync_point.h"
+#include "util/atomic.h"
 #include "util/coding.h"
 #include "util/compression_context_cache.h"
 #include "util/string_util.h"
@@ -148,422 +147,6 @@ class ZSTDUncompressCachedData {
 
 namespace ROCKSDB_NAMESPACE {
 
-// ***********************************************************************
-// BEGIN future compression customization interface
-// ***********************************************************************
-
-// TODO: alias/adapt for compression
-struct FilterBuildingContext;
-
-// A Compressor represents a very specific but potentially adapting strategy for
-// compressing blocks, including the relevant algorithm(s), options, dictionary,
-// etc. as applicable--every input except the sequence of bytes to compress.
-// Compressor is generally thread-safe so can be shared by multiple threads. (It
-// could make sense to convert unique_ptr<Compressor> to
-// shared_ptr<Compressor>.) A Compressor for data files is expected to be used
-// for just one file, so that compression strategy can be explicitly
-// reconsidered for each new file. However, a Compressor for in-memory use could
-// live indefinitely.
-//
-// If a single thread is doing many compressions under the same strategy, it
-// should request a WorkingArea that will in some cases make repeated
-// compression in a single thread more efficient. Unlike the rest of Compressor,
-// each WorkingArea can only be used by one thread at a time. WorkingAreas can
-// have pre-allocated space and/or data structures, and/or thread-local
-// statistics that are later incorporated into shared statistics objects.
-//
-// The Compressor marks each block with a CompressionType to guide
-// decompression. However, the compression dictionary (or whether there is one
-// associated) is determined at Compressor creation time, though the process of
-// getting a Compressor with a dictionary starts with a Compressor without
-// dictionary (which will often be relevant alongside); see relevant functions.
-// If the Compressor wants to decide block-by-block whether to apply the
-// configured dictionary, that would need to be encoded in CompressionType or
-// the compressed output. (NOTE: this was historically NOT encoded in
-// CompressionType and instead implied by BlockType and the presence of a
-// dictionary block in the file. Some of the resulting awkwardness includes
-// a number of built-in CompressionTypes that ignore any dictionary block in
-// the file; therefore they cannot accommodate dictionary compression in the
-// future without a schema change / extension.)
-class Compressor {
- public:
-  Compressor() = default;
-  virtual ~Compressor() = default;
-
-  // Returns the max total bytes of for all sampled blocks for creating the data
-  // dictionary, or zero indicating dictionary compression should not be
-  // used/configured. This will typically be called after
-  // CompressionManager::GetCompressor() to see if samples should be accumulated
-  // and passed to MaybeCloneSpecialized().
-  virtual size_t GetMaxSampleSizeIfWantDict(CacheEntryRole block_type) const {
-    // Default implementation: no dictionary
-    (void)block_type;
-    return 0;
-  }
-
-  // Returns the serialized form of the data dictionary associated with this
-  // Compressor. NOTE: empty dict is equivalent to no dict.
-  virtual Slice GetSerializedDict() const { return Slice(); }
-
-  // If there's a dominant compression type returned by this compressor as
-  // configured, return it. Otherwise, return kDisableCompressionOption.
-  virtual CompressionType GetPreferredCompressionType() const {
-    return CompressionType::kDisableCompressionOption;
-  }
-
-  // Utility struct for providing sample data for the compression dictionary.
-  // Potentially extensible by callers of Compressor (but not recommended)
-  struct DictSampleArgs {
-    // All the sample input blocks stored contiguously
-    std::string sample_data;
-    // The lengths of each of the sample blocks in `sample_data`
-    std::vector<size_t> sample_lens;
-
-    bool empty() { return sample_data.empty(); }
-    bool Verify() {
-      size_t total_len = 0;
-      for (auto len : sample_lens) {
-        total_len += len;
-      }
-      return total_len == sample_data.size();
-    }
-  };
-
-  // Create potential variants of the same Compressor that might be
-  // (a) optimized for a particular block type (does not affect correct
-  //     decompression), and/or
-  // (b) configured to use a compression dictionary, based on the given
-  //     samples (decompression must provide the dictionary from
-  //     GetSerializedDict())
-  // Return of nullptr indicates no specialization exists or was attempted
-  // and the caller is best to use the current Compressor for the desired
-  // scenario. Using CacheEntryRole:kMisc for block_type generally means
-  // "unspecified", and both parameters are merely suggestions. The exact
-  // dictionary associated with a returned compressor must be read from
-  // GetSerializedDict().
-  virtual std::unique_ptr<Compressor> MaybeCloneSpecialized(
-      CacheEntryRole block_type, DictSampleArgs&& dict_samples) {
-    // Default implementation: no specialization
-    (void)block_type;
-    (void)dict_samples;
-    // Caller should have checked GetMaxSampleSizeIfWantDict before attempting
-    // to provide dictionary samples
-    assert(dict_samples.empty());
-    return nullptr;
-  }
-
-  // A WorkingArea is an optional structure (both for callers and
-  // implementations) that can enable optimizing repeated compressions by
-  // reusing working space or thread-local tracking of statistics or trends.
-  // This enables use of ZSTD context, for example.
-  //
-  // EXTENSIBLE or reinterpret_cast-able by custom Compressor implementations
-  struct WorkingArea {};
-
- protected:
-  // To allow for flexible re-use / reclaimation, we have explicit Get and
-  // Release functions, and usually wrap in a special RAII smart pointer.
-  // For example, a WorkingArea could be saved/recycled in thread-local or
-  // core-local storage, or heap managed, etc., though an explicit WorkingArea
-  // is only advised for repeated compression (by a single thread).
-  virtual void ReleaseWorkingArea(WorkingArea*) {}
-
- public:
-  using ManagedWorkingArea =
-      ManagedPtr<WorkingArea, Compressor, &Compressor::ReleaseWorkingArea>;
-
-  // See struct WorkingArea above
-  virtual ManagedWorkingArea ObtainWorkingArea() {
-    // Default implementation: no working area
-    return {};
-  }
-
-  // Compress `uncompressed_data` to `compressed_output`, which should be
-  // passed in empty. Note that the compressed output will be decompressed
-  // by the sequence Decompressor::ExtractUncompressedSize() followed by
-  // Decompressor::DecompressBlock(), which must also be provided the same
-  // CompressionType saved in `out_compression_type`. (In many configurations,
-  // `compressed_output` will have a prefix storing the uncompressed_data size
-  // before the compressed bytes returned by the underlying compression
-  // algorithm. And the compression type is usually stored adjacent to the
-  // compressed data, or in some cases assumed/asserted based on the particular
-  // Compressor.)
-  //
-  // If return status is not OK, then some fatal condition has arisen. On OK
-  // status, setting `*out_compression_type = kNoCompression` means compression
-  // is declined and the caller should use the original uncompressed_data and
-  // ignore any result in `compressed_output`. Otherwise, compression has
-  // happened with results in `compressed_output` and `out_compression_type`,
-  // which are allowed to vary from call to call.
-  //
-  // The working area is optional and used to optimize repeated compression by
-  // a single thread. ManagedWorkingArea is provided rather than just
-  // WorkingArea so that it can be used only if the `owner` matches expectation.
-  // This could be useful for a Compressor wrapping more than one alternative
-  // underlying Compressor.
-  //
-  // TODO: instead of string, consider a buffer only large enough for max
-  // tolerable compressed size. Does that work for all existing algorithms?
-  // * Looks like Snappy doesn't support that. :(
-  // * But looks like everything else should. :)
-  // Could save CPU by eliminating extra zero-ing and giving up quicker when
-  // ratio is insufficient.
-  virtual Status CompressBlock(Slice uncompressed_data,
-                               std::string* compressed_output,
-                               CompressionType* out_compression_type,
-                               ManagedWorkingArea* working_area) = 0;
-
-  // TODO: something to populate table properties based on settings, after all
-  // or as WorkingAreas released. Maybe also update stats, or that could be in
-  // thread-specific WorkingArea.
-};
-
-// TODO: CompressorBase and CompressorWrapper
-
-// A Decompressor usually has a wide capability to decompress all kinds of
-// compressed data in the scope of a CompressionManager (see that class below),
-// except
-// (a) it might be optimized for or limited to a particular compression type(s)
-//     (see GetDecompressor* functions for in CompressionManager),
-// (b) distinct Decompressors are required to decompress with compression
-//     dictionaries. (Decompressors are generally associated with empty/no
-//     dictionary unless created with MaybeCloneForDict().)
-//
-// Similar to Compressor, Decompressor is generally thread safe except that each
-// WorkingArea can only be used by a single thread at a time.
-//
-// Decompressors known to be associated with no dictionary are typically
-// returned as shared_ptr, because they are broadly usable across threads.
-// Because compression dictionaries are externally managed (see
-// MaybeCloneForDict()), Decompressors associated with compression dictionaries
-// are typically returned as unique_ptr, so that they are more easily
-// guaranteed not to outlive their dictionaries (e.g. in block cache).
-// Decompressors associated with compression dictionaries might include a
-// processed or "digested" form of the raw dictionary for efficient repeated
-// compressions.
-//
-// NOTE: Splitting the interface between ExtractUncompressedSize and
-// DecompressBlock leaves to the caller details of (and flexibility in)
-// allocating buffers for decompressing into. For example, the data could be
-// decompressed into part of a single buffer allocated to hold a block's
-// uncompressed contents along with an in-memory object representation of the
-// block (to reduce fragmentation and other overheads of separate objects).
-class Decompressor {
- public:
-  Decompressor() = default;
-  virtual ~Decompressor() = default;
-
-  // A name for logging / debugging purposes
-  virtual const char* Name() const = 0;
-
-  // A WorkingArea is an optional structure (both for callers and
-  // implementations) that can enable optimizing repeated decompressions by
-  // reusing working space or thread-local tracking of statistics. This enables
-  // use of ZSTD context, for example.
-  //
-  // EXTENSIBLE or reinterpret_cast-able by custom Compressor implementations
-  struct WorkingArea {};
-
- protected:
-  // To allow for flexible re-use / reclaimation, we have explicit Obtain and
-  // Release functions, which are typically wrapped in a special RAII smart
-  // pointer. For example, a WorkingArea could be saved/recycled in thread-local
-  // or core-local storage, or heap managed, etc., though an explicit
-  // WorkingArea is only advised for repeated decompression (by a single
-  // thread).
-
-  virtual void ReleaseWorkingArea(WorkingArea* wa) {
-    // Default implementation: no working area
-    (void)wa;
-    assert(wa == nullptr);
-  }
-
- public:
-  using ManagedWorkingArea =
-      ManagedPtr<WorkingArea, Decompressor, &Decompressor::ReleaseWorkingArea>;
-
-  virtual ManagedWorkingArea ObtainWorkingArea(CompressionType /*preferred*/) {
-    // Default implementation: no working area
-    return {};
-  }
-
-  // If this Decompressor is associated with a (de)compression dictionary
-  // (created with MaybeCloneForDict()), this returns a pointer to those raw (or
-  // "serialized") bytes, which are externally managed (see
-  // MaybeCloneForDict()).
-  // Default: empty slice => no dictionary
-  virtual const Slice& GetSerializedDict() const;
-
-  // Create a variant of this Decompressor in `out` using the specified raw
-  // ("serialized") dictionary. This step is required for decompressing data
-  // compressed with the same dictionary. The new Decompressor references the
-  // given Slice through its lifetime so the data it points to must be managed
-  // by the caller along with (or beyond) the new Decompressor. If the
-  // dictionary is processed into a form reusable by repeated compressions in
-  // many threads, that happens within this call.
-  //
-  // Must return OK if storing a result in `out`. Otherwise, could return values
-  // like NotSupported - dictionary compression is not (yet) supported for this
-  // kind of Decompressor.
-  // Corruption - dictionary is malformed (though many implementations will
-  // accept any data as a dictionary)
-  virtual Status MaybeCloneForDict(const Slice& /*serialized_dict*/,
-                                   std::unique_ptr<Decompressor>* /*out*/) {
-    return Status::NotSupported(
-        "Dictionary compression not (yet) supported by " + std::string(Name()));
-  }
-
-  // Memory size of this object and others it owns. Does not include the
-  // serialized dictionary (when used) which is externally managed.
-  virtual size_t ApproximateOwnedMemoryUsage() const {
-    // Default: negligible
-    return 0;
-  }
-
-  // Potentially extensible by callers of Decompressor (but not recommended)
-  struct Args {
-    CompressionType compression_type = kNoCompression;
-    Slice compressed_data;
-    uint64_t uncompressed_size = 0;
-    ManagedWorkingArea* working_area = nullptr;
-  };
-
-  // For efficiency on the read path, RocksDB strongly prefers the uncompressed
-  // data size to be encoded in the compressed data in an easily accessible way,
-  // so that allocation of a potentially long-lived buffer can be ideally sized.
-  // This function determines the uncompressed size and potentially modifies
-  // `args.compressed_data` to strip off the size metadata, for providing both
-  // to DecompressBlock along with an appropriate buffer based on that size.
-  // Some implementations will leave `compressed_data` unmodified and let
-  // DecompressBlock call a library function that processes a format that
-  // includes size metadata (e.g. Snappy).
-  //
-  // Even for legacy cases without size metadata (e.g. some very old RocksDB
-  // formats), an exact size is required and could require decompressing the
-  // data (here and in DecompressBlock()).
-  //
-  // Return non-OK in case of corrupt data or some other unworkable limitation
-  // or failure.
-  virtual Status ExtractUncompressedSize(Args& args) {
-    // Default implementation:
-    //
-    // Standard format for prepending uncompressed size to the compressed
-    // payload. (RocksDB compress_format_version=2 except Snappy)
-    //
-    // This is historically a varint32, but it is preliminarily generalized
-    // to varint64. (TODO: support that on the write side, at least for some
-    // codecs, in BBT format_version=7)
-    if (LIKELY(GetVarint64(&args.compressed_data, &args.uncompressed_size))) {
-      if (LIKELY(args.uncompressed_size <= SIZE_MAX)) {
-        return Status::OK();
-      } else {
-        return Status::MemoryLimit("Uncompressed size too large for platform");
-      }
-    } else {
-      return Status::Corruption("Unable to extract uncompressed size");
-    }
-  }
-
-  // Called to decompress a block of data after running ExtractUncompressedSize
-  // on it. `args.compressed_data` is what ExtractUncompressedSize left there
-  // after potentially stripping off the uncompressed size metadata. Returns OK
-  // iff uncompressed data of size `uncompressed_size` is written to
-  // `uncompressed_output`.
-  virtual Status DecompressBlock(const Args& args,
-                                 char* uncompressed_output) = 0;
-};
-
-// A CompressionManager represents
-// * When/where/how to use different compressions
-// * A schema (or set of schemas) and implementation for mapping
-//     <CompressionType, dictionary, compressed data>
-//   to uncompressed data (or error), which can expand over time (error in fewer
-//   cases) for a given CompatibilityName() but can never change that mapping
-//   (because that would break backward compatibility, potential quiet
-//   corruption)
-// TODO: consider adding optional streaming compression support (low priority)
-class CompressionManager
-    : public std::enable_shared_from_this<CompressionManager> {
- public:
-  CompressionManager() = default;
-  virtual ~CompressionManager() = default;
-
-  // TODO: Customizable (for compression side configuration and recording our
-  // compression strategy)
-  virtual const char* Name() const = 0;
-  virtual std::string GetId() const {
-    std::string id = Name();
-    return id;
-  }
-
-  // *************** Peer or variant Compression Managers **************** //
-  // A name for the schema family of this CompressionManager. In short, if
-  // two CompressionManagers have functionally the same Decompressor(s), they
-  // should have the same CompatibilityName(), so that a compatible
-  // CompressionManager/Decompressor might be used if the original is
-  // unavailable. (Name() can be useful in addition to CompatibilityName() for
-  // understanding what compression strategy was used.)
-  virtual const char* CompatibilityName() const = 0;
-
-  // Default implementation checks the current compatibility name and returns
-  // this CompressionManager (via `out`) if appropriate, and otherwise looks
-  // for a matching built-in CompressionManager.
-  virtual Status FindCompatibleCompressionManager(
-      Slice compatibility_name, std::shared_ptr<CompressionManager>* out);
-
-  // ************************* Compressor creation *********************** //
-  // Returning nullptr means compression is entirely disabled for the file,
-  // which is valid at the discretion of the CompressionManager. Returning
-  // nullptr should normally be the result if preferred == kNoCompression.
-  //
-  // These functions must be thread-safe.
-
-  // Get a compressor for an SST file.
-  // SUBJECT TO CHANGE
-  // TODO: is it practical to get ColumnFamilyOptions plumbed into here?
-  virtual std::unique_ptr<Compressor> GetCompressorForSST(
-      const FilterBuildingContext&, const CompressionOptions& opts,
-      CompressionType preferred) {
-    return GetCompressor(opts, preferred);
-  }
-
-  // Get a compressor for a generic/unspecified purpose (e.g. in-memory
-  // compression).
-  virtual std::unique_ptr<Compressor> GetCompressor(
-      const CompressionOptions& opts, CompressionType type) = 0;
-
-  // **************************** Decompressors ************************** //
-  // Get a decompressor that is compatible with any blocks compressed by
-  // compressors returned by this CompressionManager (at least this code
-  // revision and earlier). (NOTE: recommended to return a shared_ptr alias of
-  // this shared_ptr to a field that is a Decompressor.)
-  // Justification for not making CompressionManager inherit Decompressor: this
-  // tends to run into the diamond inheritance problem in implementations and
-  // potential overheads of virtual inheritance.
-  virtual std::shared_ptr<Decompressor> GetDecompressor() = 0;
-
-  // Compatible with same as above, but potentially optimized for a certain
-  // expected CompressionType
-  virtual std::shared_ptr<Decompressor> GetDecompressorOptimizeFor(
-      CompressionType /*optimize_for_type*/) {
-    // Safe default implementation
-    return GetDecompressor();
-  }
-
-  // Get a decompressor that is allowed to have support only for the
-  // CompressionTypes in the given start-to-end array (unique, sorted by
-  // unsigned char)
-  virtual std::shared_ptr<Decompressor> GetDecompressorForTypes(
-      const CompressionType* /*types_begin*/,
-      const CompressionType* /*types_end*/) {
-    // Safe default implementation
-    return GetDecompressor();
-  }
-};
-// ***********************************************************************
-// END future compression customization interface
-// ***********************************************************************
-
 class FailureDecompressor : public Decompressor {
  public:
   explicit FailureDecompressor(Status&& status) : status_(std::move(status)) {

From 1d94aeea448bcc7796c03f34761267a81b176e52 Mon Sep 17 00:00:00 2001
From: Changyu Bi <changyubi@meta.com>
Date: Thu, 22 May 2025 09:42:15 -0700
Subject: [PATCH 105/500] Refactor snapshot context into JobContext and fix
 deadlock on db mutex in WP/WUP (#13632)

Summary:
With WP/WUP, we can deadlock on db mutex here: https://github.com/facebook/rocksdb/blob/8dc3d77b591443e405b2b171b3eb4f8461ffd2a3/db/db_impl/db_impl_compaction_flush.cc#L4626. Here we release a snapshot (which will acquire db mutex) while already holding the mutex. This caused some transaction lock timeout error in crash test. This PR fixes this by refactoring snapshot related context into JobContext and only allow snapshot related context to be initialized once. This also reduces the number of parameters being passed around.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13632

Test Plan:
- existing tests
- this fails with timeout before this fix
```
./db_stress --WAL_size_limit_MB=0 --WAL_ttl_seconds=60 --acquire_snapshot_one_in=10000 --adaptive_readahead=1 --adm_policy=2 --advise_random_on_open=1 --allow_data_in_errors=True --allow_fallocate=0 --allow_unprepared_value=1 --async_io=0 --auto_readahead_size=0 --auto_refresh_iterator_with_snapshot=0 --avoid_flush_during_recovery=0 --avoid_flush_during_shutdown=1 --avoid_unnecessary_blocking_io=0 --backup_max_size=104857600 --backup_one_in=100000 --batch_protection_bytes_per_key=8 --bgerror_resume_retry_interval=1000000 --block_align=0 --block_protection_bytes_per_key=4 --block_size=16384 --bloom_before_level=6 --bloom_bits=2 --bottommost_compression_type=lz4 --bottommost_file_compaction_delay=0 --bytes_per_sync=0 --cache_index_and_filter_blocks=1 --cache_index_and_filter_blocks_with_high_priority=0 --cache_size=33554432 --cache_type=fixed_hyper_clock_cache --charge_compression_dictionary_building_buffer=1 --charge_file_metadata=0 --charge_filter_construction=1 --charge_table_reader=0 --check_multiget_consistency=1 --check_multiget_entity_consistency=0 --checkpoint_one_in=0 --checksum_type=kxxHash --clear_column_family_one_in=0 --commit_bypass_memtable_one_in=0 --compact_files_one_in=1000 --compact_range_one_in=1000000 --compaction_pri=1 --compaction_readahead_size=0 --compaction_style=1 --compaction_ttl=0 --compress_format_version=2 --compressed_secondary_cache_ratio=0.0 --compressed_secondary_cache_size=0 --compression_checksum=1 --compression_max_dict_buffer_bytes=0 --compression_max_dict_bytes=0 --compression_parallel_threads=1 --compression_type=zlib --compression_use_zstd_dict_trainer=0 --compression_zstd_max_train_bytes=0 --continuous_verification_interval=0 --create_timestamped_snapshot_one_in=0 --daily_offpeak_time_utc= --data_block_index_type=1 --db_write_buffer_size=0 --decouple_partitioned_filters=1 --default_temperature=kWarm --default_write_temperature=kHot --delete_obsolete_files_period_micros=30000000 --delpercent=5 --delrangepercent=0 --destroy_db_initially=1 --detect_filter_construct_corruption=1 --disable_file_deletions_one_in=1000000 --disable_manual_compaction_one_in=1000000 --disable_wal=0 --dump_malloc_stats=1 --enable_checksum_handoff=0 --enable_compaction_filter=0 --enable_custom_split_merge=0 --enable_do_not_compress_roles=0 --enable_index_compression=0 --enable_memtable_insert_with_hint_prefix_extractor=0 --enable_pipelined_write=0 --enable_remote_compaction=0 --enable_sst_partitioner_factory=0 --enable_thread_tracking=0 --enable_write_thread_adaptive_yield=1 --error_recovery_with_no_fault_injection=0 --exclude_wal_from_write_fault_injection=1 --fifo_allow_compaction=1 --file_checksum_impl=none --file_temperature_age_thresholds= --fill_cache=1 --flush_one_in=1000 --format_version=3 --get_all_column_family_metadata_one_in=10000 --get_current_wal_file_one_in=0 --get_live_files_apis_one_in=10000 --get_properties_of_all_tables_one_in=100000 --get_property_one_in=1000000 --get_sorted_wal_files_one_in=0 --hard_pending_compaction_bytes_limit=274877906944 --high_pri_pool_ratio=0 --index_block_restart_interval=2 --index_shortening=1 --index_type=0 --ingest_external_file_one_in=0 --ingest_wbwi_one_in=0 --initial_auto_readahead_size=16384 --inplace_update_support=0 --iterpercent=10 --key_len_percent_dist=1,30,69 --key_may_exist_one_in=100 --last_level_temperature=kWarm --level_compaction_dynamic_level_bytes=0 --lock_wal_one_in=1000000 --log2_keys_per_lock=10 --log_file_time_to_roll=60 --log_readahead_size=16777216 --long_running_snapshots=1 --low_pri_pool_ratio=0 --lowest_used_cache_tier=1 --manifest_preallocation_size=0 --manual_wal_flush_one_in=0 --mark_for_compaction_one_file_in=0 --max_auto_readahead_size=0 --max_background_compactions=20 --max_bytes_for_level_base=10485760 --max_key=100000 --max_key_len=3 --max_log_file_size=1048576 --max_manifest_file_size=1073741824 --max_sequential_skip_in_iterations=1 --max_total_wal_size=0 --max_write_batch_group_size_bytes=16777216 --max_write_buffer_number=3 --max_write_buffer_size_to_maintain=8388608 --memtable_insert_hint_per_batch=1 --memtable_max_range_deletions=1000 --memtable_op_scan_flush_trigger=100 --memtable_prefix_bloom_size_ratio=0 --memtable_protection_bytes_per_key=1 --memtable_whole_key_filtering=0 --memtablerep=skip_list --metadata_charge_policy=0 --metadata_read_fault_one_in=0 --metadata_write_fault_one_in=0 --min_write_buffer_number_to_merge=2 --mmap_read=1 --mock_direct_io=False --nooverwritepercent=1 --num_file_reads_for_auto_readahead=0 --open_files=500000 --open_metadata_read_fault_one_in=8 --open_metadata_write_fault_one_in=0 --open_read_fault_one_in=32 --open_write_fault_one_in=0 --ops_per_thread=200000 --optimize_filters_for_hits=1 --optimize_filters_for_memory=1 --optimize_multiget_for_io=1 --paranoid_file_checks=1 --paranoid_memory_checks=0 --partition_filters=0 --partition_pinning=0 --pause_background_one_in=10000 --periodic_compaction_seconds=0 --prefix_size=-1 --prefixpercent=0 --prepopulate_block_cache=0 --preserve_internal_time_seconds=60 --progress_reports=0 --promote_l0_one_in=0 --read_amp_bytes_per_bit=0 --read_fault_one_in=32 --readahead_size=524288 --readpercent=50 --recycle_log_file_num=0 --reopen=20 --report_bg_io_stats=1 --reset_stats_one_in=1000000 --sample_for_compression=5 --secondary_cache_fault_one_in=0 --snapshot_hold_ops=100000 --soft_pending_compaction_bytes_limit=68719476736 --sqfc_name=bar --sqfc_version=0 --sst_file_manager_bytes_per_sec=104857600 --sst_file_manager_bytes_per_truncate=1048576 --stats_dump_period_sec=10 --stats_history_buffer_size=0 --strict_bytes_per_sync=0 --subcompactions=4 --sync=0 --sync_fault_injection=0 --table_cache_numshardbits=6 --target_file_size_base=524288 --target_file_size_multiplier=2 --test_batches_snapshots=0 --test_ingest_standalone_range_deletion_one_in=0 --top_level_index_pinning=0 --two_write_queues=1 --txn_write_policy=2 --uncache_aggressiveness=2 --universal_max_read_amp=10 --unordered_write=0 --unpartitioned_pinning=1 --use_adaptive_mutex=1 --use_adaptive_mutex_lru=0 --use_attribute_group=0 --use_delta_encoding=0 --use_direct_io_for_flush_and_compaction=0 --use_direct_reads=0 --use_full_merge_v1=0 --use_get_entity=0 --use_merge=1 --use_multi_cf_iterator=0 --use_multi_get_entity=0 --use_multiget=1 --use_optimistic_txn=0 --use_put_entity_one_in=0 --use_sqfc_for_range_queries=1 --use_timed_put_one_in=0 --use_txn=1 --use_write_buffer_manager=0 --user_timestamp_size=0 --value_size_mult=32 --verification_only=0 --verify_checksum=1 --verify_checksum_one_in=1000 --verify_compression=1 --verify_db_one_in=10000 --verify_file_checksums_one_in=0 --verify_iterator_with_expected_state_one_in=5 --verify_sst_unique_id_in_manifest=1 --wal_bytes_per_sync=0 --wal_compression=zstd --write_buffer_size=4194304 --write_dbid_to_manifest=1 --write_fault_one_in=0 --write_identity_file=0 --writepercent=35 --db=/dev/shm/rocksdb_test/rocksdb_crashtest_whitebox
```

Reviewed By: hx235

Differential Revision: D75173149

Pulled By: cbi42

fbshipit-source-id: ec68cadc78469730dfe26824e20b8ca4ab993101
---
 db/compaction/compaction_job.cc         |  37 +++----
 db/compaction/compaction_job.h          |  61 +++++------
 db/compaction/compaction_job_test.cc    |  21 ++--
 db/compaction/compaction_service_job.cc |  13 ++-
 db/db_impl/db_impl.h                    |  20 ++--
 db/db_impl/db_impl_compaction_flush.cc  |  88 ++++++----------
 db/db_impl/db_impl_secondary.cc         |   7 +-
 db/db_iter.cc                           |   5 +
 db/db_iter.h                            |  10 +-
 db/flush_job.cc                         |  61 +++++------
 db/flush_job.h                          |  11 +-
 db/flush_job_test.cc                    | 130 ++++++++++++------------
 db/job_context.h                        |  60 +++++++++--
 13 files changed, 254 insertions(+), 270 deletions(-)

diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc
index fa7e76012d66..3da86d7483d9 100644
--- a/db/compaction/compaction_job.cc
+++ b/db/compaction/compaction_job.cc
@@ -133,10 +133,7 @@ CompactionJob::CompactionJob(
     LogBuffer* log_buffer, FSDirectory* db_directory,
     FSDirectory* output_directory, FSDirectory* blob_output_directory,
     Statistics* stats, InstrumentedMutex* db_mutex,
-    ErrorHandler* db_error_handler,
-    std::vector<SequenceNumber> existing_snapshots,
-    SequenceNumber earliest_write_conflict_snapshot,
-    const SnapshotChecker* snapshot_checker, JobContext* job_context,
+    ErrorHandler* db_error_handler, JobContext* job_context,
     std::shared_ptr<Cache> table_cache, EventLogger* event_logger,
     bool paranoid_file_checks, bool measure_io_stats, const std::string& dbname,
     CompactionJobStats* compaction_job_stats, Env::Priority thread_pri,
@@ -173,12 +170,7 @@ CompactionJob::CompactionJob(
       blob_output_directory_(blob_output_directory),
       db_mutex_(db_mutex),
       db_error_handler_(db_error_handler),
-      existing_snapshots_(std::move(existing_snapshots)),
-      earliest_snapshot_(existing_snapshots_.empty()
-                             ? kMaxSequenceNumber
-                             : existing_snapshots_.at(0)),
-      earliest_write_conflict_snapshot_(earliest_write_conflict_snapshot),
-      snapshot_checker_(snapshot_checker),
+      earliest_snapshot_(job_context->GetEarliestSnapshotSequence()),
       job_context_(job_context),
       table_cache_(std::move(table_cache)),
       event_logger_(event_logger),
@@ -193,6 +185,7 @@ CompactionJob::CompactionJob(
       bg_bottom_compaction_scheduled_(bg_bottom_compaction_scheduled) {
   assert(job_stats_ != nullptr);
   assert(log_buffer_ != nullptr);
+  assert(job_context->snapshot_context_initialized);
 
   const auto* cfd = compact_->compaction->column_family_data();
   ThreadStatusUtil::SetEnableTracking(db_options_.enable_thread_tracking);
@@ -1183,7 +1176,7 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
   // creation across both CompactionJob and CompactionServiceCompactionJob
   sub_compact->AssignRangeDelAggregator(
       std::make_unique<CompactionRangeDelAggregator>(
-          &cfd->internal_comparator(), existing_snapshots_,
+          &cfd->internal_comparator(), job_context_->snapshot_seqs,
           &full_history_ts_low_, &trim_ts_));
 
   // TODO: since we already use C++17, should use
@@ -1324,8 +1317,8 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
       env_, cfd->user_comparator(), cfd->ioptions().merge_operator.get(),
       compaction_filter, db_options_.info_log.get(),
       false /* internal key corruption is expected */,
-      existing_snapshots_.empty() ? 0 : existing_snapshots_.back(),
-      snapshot_checker_, compact_->compaction->level(), db_options_.stats);
+      job_context_->GetLatestSnapshotSequence(), job_context_->snapshot_checker,
+      compact_->compaction->level(), db_options_.stats);
 
   const auto& mutable_cf_options =
       sub_compact->compaction->mutable_cf_options();
@@ -1361,9 +1354,10 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
 
   auto c_iter = std::make_unique<CompactionIterator>(
       input, cfd->user_comparator(), &merge, versions_->LastSequence(),
-      &existing_snapshots_, earliest_snapshot_,
-      earliest_write_conflict_snapshot_, job_snapshot_seq, snapshot_checker_,
-      env_, ShouldReportDetailedTime(env_, stats_),
+      &(job_context_->snapshot_seqs), earliest_snapshot_,
+      job_context_->earliest_write_conflict_snapshot, job_snapshot_seq,
+      job_context_->snapshot_checker, env_,
+      ShouldReportDetailedTime(env_, stats_),
       /*expect_valid_internal_key=*/true, sub_compact->RangeDelAgg(),
       blob_file_builder.get(), db_options_.allow_data_in_errors,
       db_options_.enforce_single_del_contracts, manual_compaction_canceled_,
@@ -1652,10 +1646,6 @@ Status CompactionJob::FinishCompactionOutputFile(
   Status s = input_status;
 
   // Add range tombstones
-  auto earliest_snapshot = kMaxSequenceNumber;
-  if (existing_snapshots_.size() > 0) {
-    earliest_snapshot = existing_snapshots_[0];
-  }
   if (s.ok()) {
     // Inclusive lower bound, exclusive upper bound
     std::pair<SequenceNumber, SequenceNumber> keep_seqno_range{
@@ -1681,7 +1671,7 @@ Status CompactionJob::FinishCompactionOutputFile(
       s = outputs.AddRangeDels(*sub_compact->RangeDelAgg(), comp_start_user_key,
                                comp_end_user_key, range_del_out_stats,
                                bottommost_level_, cfd->internal_comparator(),
-                               earliest_snapshot, keep_seqno_range,
+                               earliest_snapshot_, keep_seqno_range,
                                next_table_min_key, full_history_ts_low_);
     }
     RecordDroppedKeys(range_del_out_stats, &sub_compact->compaction_job_stats);
@@ -2313,9 +2303,10 @@ void CompactionJob::LogCompaction() {
     }
     stream << "score" << compaction->score() << "input_data_size"
            << compaction->CalculateTotalInputSize() << "oldest_snapshot_seqno"
-           << (existing_snapshots_.empty()
+           << (job_context_->snapshot_seqs.empty()
                    ? int64_t{-1}  // Use -1 for "none"
-                   : static_cast<int64_t>(existing_snapshots_[0]));
+                   : static_cast<int64_t>(
+                         job_context_->GetEarliestSnapshotSequence()));
     if (compaction->SupportsPerKeyPlacement()) {
       stream << "proximal_after_seqno" << proximal_after_seqno_;
       stream << "preserve_seqno_after" << preserve_seqno_after_;
diff --git a/db/compaction/compaction_job.h b/db/compaction/compaction_job.h
index 2d01508f8e9a..e7e209c74412 100644
--- a/db/compaction/compaction_job.h
+++ b/db/compaction/compaction_job.h
@@ -142,27 +142,27 @@ class SubcompactionState;
 
 class CompactionJob {
  public:
-  CompactionJob(
-      int job_id, Compaction* compaction, const ImmutableDBOptions& db_options,
-      const MutableDBOptions& mutable_db_options,
-      const FileOptions& file_options, VersionSet* versions,
-      const std::atomic<bool>* shutting_down, LogBuffer* log_buffer,
-      FSDirectory* db_directory, FSDirectory* output_directory,
-      FSDirectory* blob_output_directory, Statistics* stats,
-      InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler,
-      std::vector<SequenceNumber> existing_snapshots,
-      SequenceNumber earliest_write_conflict_snapshot,
-      const SnapshotChecker* snapshot_checker, JobContext* job_context,
-      std::shared_ptr<Cache> table_cache, EventLogger* event_logger,
-      bool paranoid_file_checks, bool measure_io_stats,
-      const std::string& dbname, CompactionJobStats* compaction_job_stats,
-      Env::Priority thread_pri, const std::shared_ptr<IOTracer>& io_tracer,
-      const std::atomic<bool>& manual_compaction_canceled,
-      const std::string& db_id = "", const std::string& db_session_id = "",
-      std::string full_history_ts_low = "", std::string trim_ts = "",
-      BlobFileCompletionCallback* blob_callback = nullptr,
-      int* bg_compaction_scheduled = nullptr,
-      int* bg_bottom_compaction_scheduled = nullptr);
+  CompactionJob(int job_id, Compaction* compaction,
+                const ImmutableDBOptions& db_options,
+                const MutableDBOptions& mutable_db_options,
+                const FileOptions& file_options, VersionSet* versions,
+                const std::atomic<bool>* shutting_down, LogBuffer* log_buffer,
+                FSDirectory* db_directory, FSDirectory* output_directory,
+                FSDirectory* blob_output_directory, Statistics* stats,
+                InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler,
+                JobContext* job_context, std::shared_ptr<Cache> table_cache,
+                EventLogger* event_logger, bool paranoid_file_checks,
+                bool measure_io_stats, const std::string& dbname,
+                CompactionJobStats* compaction_job_stats,
+                Env::Priority thread_pri,
+                const std::shared_ptr<IOTracer>& io_tracer,
+                const std::atomic<bool>& manual_compaction_canceled,
+                const std::string& db_id = "",
+                const std::string& db_session_id = "",
+                std::string full_history_ts_low = "", std::string trim_ts = "",
+                BlobFileCompletionCallback* blob_callback = nullptr,
+                int* bg_compaction_scheduled = nullptr,
+                int* bg_bottom_compaction_scheduled = nullptr);
 
   virtual ~CompactionJob();
 
@@ -321,21 +321,8 @@ class CompactionJob {
   FSDirectory* blob_output_directory_;
   InstrumentedMutex* db_mutex_;
   ErrorHandler* db_error_handler_;
-  // If there were two snapshots with seq numbers s1 and
-  // s2 and s1 < s2, and if we find two instances of a key k1 then lies
-  // entirely within s1 and s2, then the earlier version of k1 can be safely
-  // deleted because that version is not visible in any snapshot.
-  std::vector<SequenceNumber> existing_snapshots_;
 
   SequenceNumber earliest_snapshot_;
-
-  // This is the earliest snapshot that could be used for write-conflict
-  // checking by a transaction.  For any user-key newer than this snapshot, we
-  // should make sure not to remove evidence that a write occurred.
-  SequenceNumber earliest_write_conflict_snapshot_;
-
-  const SnapshotChecker* const snapshot_checker_;
-
   JobContext* job_context_;
 
   std::shared_ptr<Cache> table_cache_;
@@ -524,9 +511,9 @@ class CompactionServiceCompactionJob : private CompactionJob {
       const std::atomic<bool>* shutting_down, LogBuffer* log_buffer,
       FSDirectory* output_directory, Statistics* stats,
       InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler,
-      std::vector<SequenceNumber> existing_snapshots,
-      std::shared_ptr<Cache> table_cache, EventLogger* event_logger,
-      const std::string& dbname, const std::shared_ptr<IOTracer>& io_tracer,
+      JobContext* job_context, std::shared_ptr<Cache> table_cache,
+      EventLogger* event_logger, const std::string& dbname,
+      const std::shared_ptr<IOTracer>& io_tracer,
       const std::atomic<bool>& manual_compaction_canceled,
       const std::string& db_id, const std::string& db_session_id,
       std::string output_path,
diff --git a/db/compaction/compaction_job_test.cc b/db/compaction/compaction_job_test.cc
index 8c7baa6ef29c..89d724e067c1 100644
--- a/db/compaction/compaction_job_test.cc
+++ b/db/compaction/compaction_job_test.cc
@@ -595,11 +595,11 @@ class CompactionJobTestBase : public testing::Test {
       const std::vector<std::vector<FileMetaData*>>& input_files,
       const std::vector<int> input_levels,
       std::function<void(Compaction& comp)>&& verify_func,
-      const std::vector<SequenceNumber>& snapshots = {}) {
+      std::vector<SequenceNumber>&& snapshots = {}) {
     const int kLastLevel = cf_options_.num_levels - 1;
     verify_per_key_placement_ = std::move(verify_func);
     mock::KVVector empty_map;
-    RunCompaction(input_files, input_levels, {empty_map}, snapshots,
+    RunCompaction(input_files, input_levels, {empty_map}, std::move(snapshots),
                   kMaxSequenceNumber, kLastLevel, false);
   }
 
@@ -608,7 +608,7 @@ class CompactionJobTestBase : public testing::Test {
       const std::vector<std::vector<FileMetaData*>>& input_files,
       const std::vector<int>& input_levels,
       const std::vector<mock::KVVector>& expected_results,
-      const std::vector<SequenceNumber>& snapshots = {},
+      std::vector<SequenceNumber>&& snapshots = {},
       SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber,
       int output_level = 1, bool verify = true,
       std::vector<uint64_t> expected_oldest_blob_file_numbers = {},
@@ -665,13 +665,15 @@ class CompactionJobTestBase : public testing::Test {
                 ucmp_->timestamp_size() == full_history_ts_low_.size());
     const std::atomic<bool> kManualCompactionCanceledFalse{false};
     JobContext job_context(1, false /* create_superversion */);
+    job_context.InitSnapshotContext(snapshot_checker, nullptr,
+                                    earliest_write_conflict_snapshot,
+                                    std::move(snapshots));
     CompactionJob compaction_job(
         0, &compaction, db_options_, mutable_db_options_, env_options_,
         versions_.get(), &shutting_down_, &log_buffer, nullptr, nullptr,
-        nullptr, nullptr, &mutex_, &error_handler_, snapshots,
-        earliest_write_conflict_snapshot, snapshot_checker, &job_context,
-        table_cache_, &event_logger, false, false, dbname_,
-        &compaction_job_stats_, Env::Priority::USER, nullptr /* IOTracer */,
+        nullptr, nullptr, &mutex_, &error_handler_, &job_context, table_cache_,
+        &event_logger, false, false, dbname_, &compaction_job_stats_,
+        Env::Priority::USER, nullptr /* IOTracer */,
         /*manual_compaction_canceled=*/kManualCompactionCanceledFalse,
         env_->GenerateUniqueId(), DBImpl::GenerateDbSessionId(nullptr),
         full_history_ts_low_);
@@ -2036,7 +2038,7 @@ TEST_F(CompactionJobTest, CutToAlignGrandparentBoundarySameKey) {
     snapshots.emplace_back(i);
   }
   RunCompaction({lvl0_files, lvl1_files}, input_levels,
-                {expected_file1, expected_file2}, snapshots);
+                {expected_file1, expected_file2}, std::move(snapshots));
 }
 
 TEST_F(CompactionJobTest, CutForMaxCompactionBytesSameKey) {
@@ -2095,7 +2097,8 @@ TEST_F(CompactionJobTest, CutForMaxCompactionBytesSameKey) {
     snapshots.emplace_back(i);
   }
   RunCompaction({lvl0_files, lvl1_files}, input_levels,
-                {expected_file1, expected_file2, expected_file3}, snapshots);
+                {expected_file1, expected_file2, expected_file3},
+                std::move(snapshots));
 }
 
 class CompactionJobTimestampTest : public CompactionJobTestBase {
diff --git a/db/compaction/compaction_service_job.cc b/db/compaction/compaction_service_job.cc
index 3c2ff8c09b18..11ba31daf3b7 100644
--- a/db/compaction/compaction_service_job.cc
+++ b/db/compaction/compaction_service_job.cc
@@ -41,7 +41,7 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService(
   }
 
   compaction_input.cf_name = compaction->column_family_data()->GetName();
-  compaction_input.snapshots = existing_snapshots_;
+  compaction_input.snapshots = job_context_->snapshot_seqs;
   compaction_input.has_begin = sub_compact->start.has_value();
   compaction_input.begin =
       compaction_input.has_begin ? sub_compact->start->ToString() : "";
@@ -304,9 +304,9 @@ CompactionServiceCompactionJob::CompactionServiceCompactionJob(
     VersionSet* versions, const std::atomic<bool>* shutting_down,
     LogBuffer* log_buffer, FSDirectory* output_directory, Statistics* stats,
     InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler,
-    std::vector<SequenceNumber> existing_snapshots,
-    std::shared_ptr<Cache> table_cache, EventLogger* event_logger,
-    const std::string& dbname, const std::shared_ptr<IOTracer>& io_tracer,
+    JobContext* job_context, std::shared_ptr<Cache> table_cache,
+    EventLogger* event_logger, const std::string& dbname,
+    const std::shared_ptr<IOTracer>& io_tracer,
     const std::atomic<bool>& manual_compaction_canceled,
     const std::string& db_id, const std::string& db_session_id,
     std::string output_path,
@@ -315,9 +315,8 @@ CompactionServiceCompactionJob::CompactionServiceCompactionJob(
     : CompactionJob(job_id, compaction, db_options, mutable_db_options,
                     file_options, versions, shutting_down, log_buffer, nullptr,
                     output_directory, nullptr, stats, db_mutex,
-                    db_error_handler, std::move(existing_snapshots),
-                    kMaxSequenceNumber, nullptr, nullptr,
-                    std::move(table_cache), event_logger,
+                    db_error_handler, job_context, std::move(table_cache),
+                    event_logger,
                     compaction->mutable_cf_options().paranoid_file_checks,
                     compaction->mutable_cf_options().report_bg_io_stats, dbname,
                     &(compaction_service_result->stats), Env::Priority::USER,
diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h
index 1062b212ef29..0034a7e97764 100644
--- a/db/db_impl/db_impl.h
+++ b/db/db_impl/db_impl.h
@@ -1090,10 +1090,7 @@ class DBImpl : public DB {
   void SetSnapshotChecker(SnapshotChecker* snapshot_checker);
 
   // Fill JobContext with snapshot information needed by flush and compaction.
-  void GetSnapshotContext(JobContext* job_context,
-                          std::vector<SequenceNumber>* snapshot_seqs,
-                          SequenceNumber* earliest_write_conflict_snapshot,
-                          SnapshotChecker** snapshot_checker);
+  void InitSnapshotContext(JobContext* job_context);
 
   // Not thread-safe.
   void SetRecoverableStatePreReleaseCallback(PreReleaseCallback* callback);
@@ -2051,14 +2048,13 @@ class DBImpl : public DB {
   // Flush the in-memory write buffer to storage.  Switches to a new
   // log-file/memtable and writes a new descriptor iff successful. Then
   // installs a new super version for the column family.
-  Status FlushMemTableToOutputFile(
-      ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
-      bool* madeProgress, JobContext* job_context, FlushReason flush_reason,
-      SuperVersionContext* superversion_context,
-      std::vector<SequenceNumber>& snapshot_seqs,
-      SequenceNumber earliest_write_conflict_snapshot,
-      SnapshotChecker* snapshot_checker, LogBuffer* log_buffer,
-      Env::Priority thread_pri);
+  Status FlushMemTableToOutputFile(ColumnFamilyData* cfd,
+                                   const MutableCFOptions& mutable_cf_options,
+                                   bool* madeProgress, JobContext* job_context,
+                                   FlushReason flush_reason,
+                                   SuperVersionContext* superversion_context,
+                                   LogBuffer* log_buffer,
+                                   Env::Priority thread_pri);
 
   // Flush the memtables of (multiple) column families to multiple files on
   // persistent storage.
diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc
index 75d41af343f3..a69c80a3cb03 100644
--- a/db/db_impl/db_impl_compaction_flush.cc
+++ b/db/db_impl/db_impl_compaction_flush.cc
@@ -145,10 +145,7 @@ IOStatus DBImpl::SyncClosedWals(const WriteOptions& write_options,
 Status DBImpl::FlushMemTableToOutputFile(
     ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
     bool* made_progress, JobContext* job_context, FlushReason flush_reason,
-    SuperVersionContext* superversion_context,
-    std::vector<SequenceNumber>& snapshot_seqs,
-    SequenceNumber earliest_write_conflict_snapshot,
-    SnapshotChecker* snapshot_checker, LogBuffer* log_buffer,
+    SuperVersionContext* superversion_context, LogBuffer* log_buffer,
     Env::Priority thread_pri) {
   mutex_.AssertHeld();
   assert(cfd);
@@ -212,7 +209,6 @@ Status DBImpl::FlushMemTableToOutputFile(
   FlushJob flush_job(
       dbname_, cfd, immutable_db_options_, mutable_cf_options, max_memtable_id,
       file_options_for_compaction_, versions_.get(), &mutex_, &shutting_down_,
-      snapshot_seqs, earliest_write_conflict_snapshot, snapshot_checker,
       job_context, flush_reason, log_buffer, directories_.GetDbDir(),
       GetDataDir(cfd, 0U),
       GetCompressionFlush(cfd->ioptions(), mutable_cf_options), stats_,
@@ -397,11 +393,8 @@ Status DBImpl::FlushMemTablesToOutputFiles(
         bg_flush_args, made_progress, job_context, log_buffer, thread_pri);
   }
   assert(bg_flush_args.size() == 1);
-  std::vector<SequenceNumber> snapshot_seqs;
-  SequenceNumber earliest_write_conflict_snapshot;
-  SnapshotChecker* snapshot_checker;
-  GetSnapshotContext(job_context, &snapshot_seqs,
-                     &earliest_write_conflict_snapshot, &snapshot_checker);
+  InitSnapshotContext(job_context);
+
   const auto& bg_flush_arg = bg_flush_args[0];
   ColumnFamilyData* cfd = bg_flush_arg.cfd_;
   // intentional infrequent copy for each flush
@@ -412,8 +405,7 @@ Status DBImpl::FlushMemTablesToOutputFiles(
   FlushReason flush_reason = bg_flush_arg.flush_reason_;
   Status s = FlushMemTableToOutputFile(
       cfd, mutable_cf_options_copy, made_progress, job_context, flush_reason,
-      superversion_context, snapshot_seqs, earliest_write_conflict_snapshot,
-      snapshot_checker, log_buffer, thread_pri);
+      superversion_context, log_buffer, thread_pri);
   return s;
 }
 
@@ -448,12 +440,7 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles(
   }
 #endif /* !NDEBUG */
 
-  std::vector<SequenceNumber> snapshot_seqs;
-  SequenceNumber earliest_write_conflict_snapshot;
-  SnapshotChecker* snapshot_checker;
-  GetSnapshotContext(job_context, &snapshot_seqs,
-                     &earliest_write_conflict_snapshot, &snapshot_checker);
-
+  InitSnapshotContext(job_context);
   autovector<FSDirectory*> distinct_output_dirs;
   autovector<std::string> distinct_output_dir_paths;
   std::vector<std::unique_ptr<FlushJob>> jobs;
@@ -487,8 +474,7 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles(
     jobs.emplace_back(new FlushJob(
         dbname_, cfd, immutable_db_options_, mutable_cf_options,
         max_memtable_id, file_options_for_compaction_, versions_.get(), &mutex_,
-        &shutting_down_, snapshot_seqs, earliest_write_conflict_snapshot,
-        snapshot_checker, job_context, flush_reason, log_buffer,
+        &shutting_down_, job_context, flush_reason, log_buffer,
         directories_.GetDbDir(), data_dir,
         GetCompressionFlush(cfd->ioptions(), mutable_cf_options), stats_,
         &event_logger_, mutable_cf_options.report_bg_io_stats,
@@ -1518,11 +1504,7 @@ Status DBImpl::CompactFilesImpl(
   // deletion compaction currently not allowed in CompactFiles.
   assert(!c->deletion_compaction());
 
-  std::vector<SequenceNumber> snapshot_seqs;
-  SequenceNumber earliest_write_conflict_snapshot;
-  SnapshotChecker* snapshot_checker;
-  GetSnapshotContext(job_context, &snapshot_seqs,
-                     &earliest_write_conflict_snapshot, &snapshot_checker);
+  InitSnapshotContext(job_context);
 
   std::unique_ptr<std::list<uint64_t>::iterator> pending_outputs_inserted_elem(
       new std::list<uint64_t>::iterator(
@@ -1536,7 +1518,6 @@ Status DBImpl::CompactFilesImpl(
       log_buffer, directories_.GetDbDir(),
       GetDataDir(c->column_family_data(), c->output_path_id()),
       GetDataDir(c->column_family_data(), 0), stats_, &mutex_, &error_handler_,
-      snapshot_seqs, earliest_write_conflict_snapshot, snapshot_checker,
       job_context, table_cache_, &event_logger_,
       c->mutable_cf_options().paranoid_file_checks,
       c->mutable_cf_options().report_bg_io_stats, dbname_,
@@ -3687,20 +3668,16 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
       // compaction is not necessary. Need to make sure mutex is held
       // until we make a copy in the following code
       TEST_SYNC_POINT("DBImpl::BackgroundCompaction():BeforePickCompaction");
-      SnapshotChecker* snapshot_checker = nullptr;
-      std::vector<SequenceNumber> snapshot_seqs;
       // This info is not useful for other scenarios, so save querying existing
       // snapshots for those cases.
       if (cfd->ioptions().compaction_style == kCompactionStyleUniversal &&
           cfd->user_comparator()->timestamp_size() == 0) {
-        SequenceNumber earliest_write_conflict_snapshot;
-        GetSnapshotContext(job_context, &snapshot_seqs,
-                           &earliest_write_conflict_snapshot,
-                           &snapshot_checker);
+        InitSnapshotContext(job_context);
         assert(is_snapshot_supported_ || snapshots_.empty());
       }
       c.reset(cfd->PickCompaction(mutable_cf_options, mutable_db_options_,
-                                  snapshot_seqs, snapshot_checker, log_buffer));
+                                  job_context->snapshot_seqs,
+                                  job_context->snapshot_checker, log_buffer));
       TEST_SYNC_POINT("DBImpl::BackgroundCompaction():AfterPickCompaction");
 
       if (c != nullptr) {
@@ -4154,11 +4131,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
     output_level = c->output_level();
     TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:NonTrivial",
                              &output_level);
-    std::vector<SequenceNumber> snapshot_seqs;
-    SequenceNumber earliest_write_conflict_snapshot;
-    SnapshotChecker* snapshot_checker;
-    GetSnapshotContext(job_context, &snapshot_seqs,
-                       &earliest_write_conflict_snapshot, &snapshot_checker);
+    InitSnapshotContext(job_context);
     assert(is_snapshot_supported_ || snapshots_.empty());
 
     CompactionJob compaction_job(
@@ -4167,8 +4140,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
         &shutting_down_, log_buffer, directories_.GetDbDir(),
         GetDataDir(c->column_family_data(), c->output_path_id()),
         GetDataDir(c->column_family_data(), 0), stats_, &mutex_,
-        &error_handler_, snapshot_seqs, earliest_write_conflict_snapshot,
-        snapshot_checker, job_context, table_cache_, &event_logger_,
+        &error_handler_, job_context, table_cache_, &event_logger_,
         c->mutable_cf_options().paranoid_file_checks,
         c->mutable_cf_options().report_bg_io_stats, dbname_,
         &compaction_job_stats, thread_pri, io_tracer_,
@@ -4601,31 +4573,33 @@ void DBImpl::SetSnapshotChecker(SnapshotChecker* snapshot_checker) {
   snapshot_checker_.reset(snapshot_checker);
 }
 
-void DBImpl::GetSnapshotContext(
-    JobContext* job_context, std::vector<SequenceNumber>* snapshot_seqs,
-    SequenceNumber* earliest_write_conflict_snapshot,
-    SnapshotChecker** snapshot_checker_ptr) {
+void DBImpl::InitSnapshotContext(JobContext* job_context) {
   mutex_.AssertHeld();
   assert(job_context != nullptr);
-  assert(snapshot_seqs != nullptr);
-  assert(earliest_write_conflict_snapshot != nullptr);
-  assert(snapshot_checker_ptr != nullptr);
-
-  *snapshot_checker_ptr = snapshot_checker_.get();
-  if (use_custom_gc_ && *snapshot_checker_ptr == nullptr) {
-    *snapshot_checker_ptr = DisableGCSnapshotChecker::Instance();
+  if (job_context->snapshot_context_initialized) {
+    return;
+  }
+  SnapshotChecker* snapshot_checker = snapshot_checker_.get();
+  if (use_custom_gc_ && !snapshot_checker) {
+    snapshot_checker = DisableGCSnapshotChecker::Instance();
   }
-  if (*snapshot_checker_ptr != nullptr) {
+  std::unique_ptr<ManagedSnapshot> managed_snapshot = nullptr;
+  if (snapshot_checker) {
     // If snapshot_checker is used, that means the flush/compaction may
     // contain values not visible to snapshot taken after
     // flush/compaction job starts. Take a snapshot and it will appear
     // in snapshot_seqs and force compaction iterator to consider such
     // snapshots.
-    const Snapshot* job_snapshot =
-        GetSnapshotImpl(false /*write_conflict_boundary*/, false /*lock*/);
-    job_context->job_snapshot.reset(new ManagedSnapshot(this, job_snapshot));
-  }
-  *snapshot_seqs = snapshots_.GetAll(earliest_write_conflict_snapshot);
+    const Snapshot* snapshot =
+        GetSnapshotImpl(/*is_write_conflict_boundary=*/false, /*lock=*/false);
+    managed_snapshot.reset(new ManagedSnapshot(this, snapshot));
+  }
+  SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber;
+  std::vector<SequenceNumber> snapshot_seqs =
+      snapshots_.GetAll(&earliest_write_conflict_snapshot);
+  job_context->InitSnapshotContext(
+      snapshot_checker, std::move(managed_snapshot),
+      earliest_write_conflict_snapshot, std::move(snapshot_seqs));
 }
 
 Status DBImpl::WaitForCompact(
diff --git a/db/db_impl/db_impl_secondary.cc b/db/db_impl/db_impl_secondary.cc
index d567238b854c..e5f33dc20b40 100644
--- a/db/db_impl/db_impl_secondary.cc
+++ b/db/db_impl/db_impl_secondary.cc
@@ -933,7 +933,10 @@ Status DBImplSecondary::CompactWithoutInstallation(
                        immutable_db_options_.info_log.get());
 
   const int job_id = next_job_id_.fetch_add(1);
-
+  JobContext job_context(0, true /*create_superversion*/);
+  std::vector<SequenceNumber> snapshots = input.snapshots;
+  job_context.InitSnapshotContext(nullptr, nullptr, kMaxSequenceNumber,
+                                  std::move(snapshots));
   // use primary host's db_id for running the compaction, but db_session_id is
   // using the local one, which is to make sure the unique id is unique from
   // the remote compactors. Because the id is generated from db_id,
@@ -944,7 +947,7 @@ Status DBImplSecondary::CompactWithoutInstallation(
       job_id, c.get(), immutable_db_options_, mutable_db_options_,
       file_options_for_compaction_, versions_.get(), &shutting_down_,
       &log_buffer, output_dir.get(), stats_, &mutex_, &error_handler_,
-      input.snapshots, table_cache_, &event_logger_, dbname_, io_tracer_,
+      &job_context, table_cache_, &event_logger_, dbname_, io_tracer_,
       options.canceled ? *options.canceled : kManualCompactionCanceledFalse_,
       input.db_id, db_session_id_, secondary_path_, input, result);
 
diff --git a/db/db_iter.cc b/db/db_iter.cc
index 25d53ae09a8b..42739e006204 100644
--- a/db/db_iter.cc
+++ b/db/db_iter.cc
@@ -66,6 +66,11 @@ DBIter::DBIter(Env* _env, const ReadOptions& read_options,
       timestamp_lb_(read_options.iter_start_ts),
       timestamp_size_(timestamp_ub_ ? timestamp_ub_->size() : 0),
       active_mem_(active_mem),
+      memtable_seqno_lb_(kMaxSequenceNumber),
+      memtable_op_scan_flush_trigger_(0),
+      avg_op_scan_flush_trigger_(0),
+      iter_step_since_seek_(1),
+      mem_hidden_op_scanned_since_seek_(0),
       direction_(kForward),
       valid_(false),
       current_entry_is_merged_(false),
diff --git a/db/db_iter.h b/db/db_iter.h
index e4353875bb63..6bb64b6e732e 100644
--- a/db/db_iter.h
+++ b/db/db_iter.h
@@ -507,11 +507,11 @@ class DBIter final : public Iterator {
   std::string saved_timestamp_;
   std::optional<std::vector<ScanOptions>> scan_opts_;
   ReadOnlyMemTable* const active_mem_;
-  SequenceNumber memtable_seqno_lb_ = kMaxSequenceNumber;
-  uint32_t memtable_op_scan_flush_trigger_ = 0;
-  uint32_t avg_op_scan_flush_trigger_ = 0;
-  uint32_t iter_step_since_seek_ = 1;
-  uint32_t mem_hidden_op_scanned_since_seek_ = 0;
+  SequenceNumber memtable_seqno_lb_;
+  uint32_t memtable_op_scan_flush_trigger_;
+  uint32_t avg_op_scan_flush_trigger_;
+  uint32_t iter_step_since_seek_;
+  uint32_t mem_hidden_op_scanned_since_seek_;
   Direction direction_;
   bool valid_;
   bool current_entry_is_merged_;
diff --git a/db/flush_job.cc b/db/flush_job.cc
index 452e1ed9e677..60feeb5c9191 100644
--- a/db/flush_job.cc
+++ b/db/flush_job.cc
@@ -92,12 +92,10 @@ FlushJob::FlushJob(
     const MutableCFOptions& mutable_cf_options, uint64_t max_memtable_id,
     const FileOptions& file_options, VersionSet* versions,
     InstrumentedMutex* db_mutex, std::atomic<bool>* shutting_down,
-    std::vector<SequenceNumber> existing_snapshots,
-    SequenceNumber earliest_write_conflict_snapshot,
-    SnapshotChecker* snapshot_checker, JobContext* job_context,
-    FlushReason flush_reason, LogBuffer* log_buffer, FSDirectory* db_directory,
-    FSDirectory* output_file_directory, CompressionType output_compression,
-    Statistics* stats, EventLogger* event_logger, bool measure_io_stats,
+    JobContext* job_context, FlushReason flush_reason, LogBuffer* log_buffer,
+    FSDirectory* db_directory, FSDirectory* output_file_directory,
+    CompressionType output_compression, Statistics* stats,
+    EventLogger* event_logger, bool measure_io_stats,
     const bool sync_output_directory, const bool write_manifest,
     Env::Priority thread_pri, const std::shared_ptr<IOTracer>& io_tracer,
     std::shared_ptr<const SeqnoToTimeMapping> seqno_to_time_mapping,
@@ -114,12 +112,7 @@ FlushJob::FlushJob(
       versions_(versions),
       db_mutex_(db_mutex),
       shutting_down_(shutting_down),
-      existing_snapshots_(std::move(existing_snapshots)),
-      earliest_snapshot_(existing_snapshots_.empty()
-                             ? kMaxSequenceNumber
-                             : existing_snapshots_.at(0)),
-      earliest_write_conflict_snapshot_(earliest_write_conflict_snapshot),
-      snapshot_checker_(snapshot_checker),
+      earliest_snapshot_(job_context->GetEarliestSnapshotSequence()),
       job_context_(job_context),
       flush_reason_(flush_reason),
       log_buffer_(log_buffer),
@@ -140,6 +133,7 @@ FlushJob::FlushJob(
       full_history_ts_low_(std::move(full_history_ts_low)),
       blob_callback_(blob_callback),
       seqno_to_time_mapping_(std::move(seqno_to_time_mapping)) {
+  assert(job_context->snapshot_context_initialized);
   // Update the thread status to indicate flush.
   ReportStartedFlush();
   TEST_SYNC_POINT("FlushJob::FlushJob()");
@@ -456,7 +450,7 @@ Status FlushJob::MemPurge() {
   const std::string* const full_history_ts_low = &(cfd_->GetFullHistoryTsLow());
   std::unique_ptr<CompactionRangeDelAggregator> range_del_agg(
       new CompactionRangeDelAggregator(&(cfd_->internal_comparator()),
-                                       existing_snapshots_,
+                                       job_context_->snapshot_seqs,
                                        full_history_ts_low));
   for (auto& rd_iter : range_del_iters) {
     range_del_agg->AddTombstones(std::move(rd_iter));
@@ -495,19 +489,19 @@ Status FlushJob::MemPurge() {
 
     Env* env = db_options_.env;
     assert(env);
-    MergeHelper merge(
-        env, (cfd_->internal_comparator()).user_comparator(),
-        (ioptions.merge_operator).get(), compaction_filter.get(),
-        ioptions.logger, true /* internal key corruption is not ok */,
-        existing_snapshots_.empty() ? 0 : existing_snapshots_.back(),
-        snapshot_checker_);
+    MergeHelper merge(env, (cfd_->internal_comparator()).user_comparator(),
+                      (ioptions.merge_operator).get(), compaction_filter.get(),
+                      ioptions.logger,
+                      true /* internal key corruption is not ok */,
+                      job_context_->GetLatestSnapshotSequence(),
+                      job_context_->snapshot_checker);
     assert(job_context_);
-    SequenceNumber job_snapshot_seq = job_context_->GetJobSnapshotSequence();
     const std::atomic<bool> kManualCompactionCanceledFalse{false};
     CompactionIterator c_iter(
         iter.get(), (cfd_->internal_comparator()).user_comparator(), &merge,
-        kMaxSequenceNumber, &existing_snapshots_, earliest_snapshot_,
-        earliest_write_conflict_snapshot_, job_snapshot_seq, snapshot_checker_,
+        kMaxSequenceNumber, &job_context_->snapshot_seqs, earliest_snapshot_,
+        job_context_->earliest_write_conflict_snapshot,
+        job_context_->GetJobSnapshotSequence(), job_context_->snapshot_checker,
         env, ShouldReportDetailedTime(env, ioptions.stats),
         true /* internal key corruption is not ok */, range_del_agg.get(),
         nullptr, ioptions.allow_data_in_errors,
@@ -761,7 +755,7 @@ bool FlushJob::MemPurgeDecider(double threshold) {
       // Pick the oldest existing snapshot that is more recent
       // than the sequence number of the sampled entry.
       min_seqno_snapshot = kMaxSequenceNumber;
-      for (SequenceNumber seq_num : existing_snapshots_) {
+      for (SequenceNumber seq_num : job_context_->snapshot_seqs) {
         if (seq_num > res.sequence && seq_num < min_seqno_snapshot) {
           min_seqno_snapshot = seq_num;
         }
@@ -1000,20 +994,19 @@ Status FlushJob::WriteLevel0Table() {
           preclude_last_level_min_seqno_ == kMaxSequenceNumber
               ? preclude_last_level_min_seqno_
               : std::min(earliest_snapshot_, preclude_last_level_min_seqno_));
-      const SequenceNumber job_snapshot_seq =
-          job_context_->GetJobSnapshotSequence();
-
       s = BuildTable(
           dbname_, versions_, db_options_, tboptions, file_options_,
           cfd_->table_cache(), iter.get(), std::move(range_del_iters), &meta_,
-          &blob_file_additions, existing_snapshots_, earliest_snapshot_,
-          earliest_write_conflict_snapshot_, job_snapshot_seq,
-          snapshot_checker_, mutable_cf_options_.paranoid_file_checks,
-          cfd_->internal_stats(), &io_s, io_tracer_,
-          BlobFileCreationReason::kFlush, seqno_to_time_mapping_.get(),
-          event_logger_, job_context_->job_id, &table_properties_, write_hint,
-          full_history_ts_low, blob_callback_, base_, &memtable_payload_bytes,
-          &memtable_garbage_bytes, &flush_stats);
+          &blob_file_additions, job_context_->snapshot_seqs, earliest_snapshot_,
+          job_context_->earliest_write_conflict_snapshot,
+          job_context_->GetJobSnapshotSequence(),
+          job_context_->snapshot_checker,
+          mutable_cf_options_.paranoid_file_checks, cfd_->internal_stats(),
+          &io_s, io_tracer_, BlobFileCreationReason::kFlush,
+          seqno_to_time_mapping_.get(), event_logger_, job_context_->job_id,
+          &table_properties_, write_hint, full_history_ts_low, blob_callback_,
+          base_, &memtable_payload_bytes, &memtable_garbage_bytes,
+          &flush_stats);
       TEST_SYNC_POINT_CALLBACK("FlushJob::WriteLevel0Table:s", &s);
       // TODO: Cleanup io_status in BuildTable and table builders
       assert(!s.ok() || io_s.ok());
diff --git a/db/flush_job.h b/db/flush_job.h
index f3f85abbcc70..aa95c7b41aef 100644
--- a/db/flush_job.h
+++ b/db/flush_job.h
@@ -63,11 +63,9 @@ class FlushJob {
            const MutableCFOptions& mutable_cf_options, uint64_t max_memtable_id,
            const FileOptions& file_options, VersionSet* versions,
            InstrumentedMutex* db_mutex, std::atomic<bool>* shutting_down,
-           std::vector<SequenceNumber> existing_snapshots,
-           SequenceNumber earliest_write_conflict_snapshot,
-           SnapshotChecker* snapshot_checker, JobContext* job_context,
-           FlushReason flush_reason, LogBuffer* log_buffer,
-           FSDirectory* db_directory, FSDirectory* output_file_directory,
+           JobContext* job_context, FlushReason flush_reason,
+           LogBuffer* log_buffer, FSDirectory* db_directory,
+           FSDirectory* output_file_directory,
            CompressionType output_compression, Statistics* stats,
            EventLogger* event_logger, bool measure_io_stats,
            const bool sync_output_directory, const bool write_manifest,
@@ -167,10 +165,7 @@ class FlushJob {
   VersionSet* versions_;
   InstrumentedMutex* db_mutex_;
   std::atomic<bool>* shutting_down_;
-  std::vector<SequenceNumber> existing_snapshots_;
   SequenceNumber earliest_snapshot_;
-  SequenceNumber earliest_write_conflict_snapshot_;
-  SnapshotChecker* snapshot_checker_;
   JobContext* job_context_;
   FlushReason flush_reason_;
   LogBuffer* log_buffer_;
diff --git a/db/flush_job_test.cc b/db/flush_job_test.cc
index f37eaf829be5..b84bb3d8bcb7 100644
--- a/db/flush_job_test.cc
+++ b/db/flush_job_test.cc
@@ -186,16 +186,16 @@ TEST_F(FlushJobTest, Empty) {
   JobContext job_context(0);
   auto cfd = versions_->GetColumnFamilySet()->GetDefault();
   EventLogger event_logger(db_options_.info_log.get());
-  SnapshotChecker* snapshot_checker = nullptr;  // not relavant
-  FlushJob flush_job(
-      dbname_, versions_->GetColumnFamilySet()->GetDefault(), db_options_,
-      cfd->GetLatestMutableCFOptions(),
-      std::numeric_limits<uint64_t>::max() /* memtable_id */, env_options_,
-      versions_.get(), &mutex_, &shutting_down_, {}, kMaxSequenceNumber,
-      snapshot_checker, &job_context, FlushReason::kTest, nullptr, nullptr,
-      nullptr, kNoCompression, nullptr, &event_logger, false,
-      true /* sync_output_directory */, true /* write_manifest */,
-      Env::Priority::USER, nullptr /*IOTracer*/, empty_seqno_to_time_mapping_);
+  job_context.InitSnapshotContext(nullptr, nullptr, kMaxSequenceNumber, {});
+  FlushJob flush_job(dbname_, versions_->GetColumnFamilySet()->GetDefault(),
+                     db_options_, cfd->GetLatestMutableCFOptions(),
+                     std::numeric_limits<uint64_t>::max() /* memtable_id */,
+                     env_options_, versions_.get(), &mutex_, &shutting_down_,
+                     &job_context, FlushReason::kTest, nullptr, nullptr,
+                     nullptr, kNoCompression, nullptr, &event_logger, false,
+                     true /* sync_output_directory */,
+                     true /* write_manifest */, Env::Priority::USER,
+                     nullptr /*IOTracer*/, empty_seqno_to_time_mapping_);
   {
     InstrumentedMutexLock l(&mutex_);
     flush_job.PickMemTable();
@@ -272,16 +272,16 @@ TEST_F(FlushJobTest, NonEmpty) {
   }
 
   EventLogger event_logger(db_options_.info_log.get());
-  SnapshotChecker* snapshot_checker = nullptr;  // not relavant
-  FlushJob flush_job(
-      dbname_, versions_->GetColumnFamilySet()->GetDefault(), db_options_,
-      cfd->GetLatestMutableCFOptions(),
-      std::numeric_limits<uint64_t>::max() /* memtable_id */, env_options_,
-      versions_.get(), &mutex_, &shutting_down_, {}, kMaxSequenceNumber,
-      snapshot_checker, &job_context, FlushReason::kTest, nullptr, nullptr,
-      nullptr, kNoCompression, db_options_.statistics.get(), &event_logger,
-      true, true /* sync_output_directory */, true /* write_manifest */,
-      Env::Priority::USER, nullptr /*IOTracer*/, empty_seqno_to_time_mapping_);
+  job_context.InitSnapshotContext(nullptr, nullptr, kMaxSequenceNumber, {});
+  FlushJob flush_job(dbname_, versions_->GetColumnFamilySet()->GetDefault(),
+                     db_options_, cfd->GetLatestMutableCFOptions(),
+                     std::numeric_limits<uint64_t>::max() /* memtable_id */,
+                     env_options_, versions_.get(), &mutex_, &shutting_down_,
+                     &job_context, FlushReason::kTest, nullptr, nullptr,
+                     nullptr, kNoCompression, db_options_.statistics.get(),
+                     &event_logger, true, true /* sync_output_directory */,
+                     true /* write_manifest */, Env::Priority::USER,
+                     nullptr /*IOTracer*/, empty_seqno_to_time_mapping_);
 
   HistogramData hist;
   FileMetaData file_meta;
@@ -332,18 +332,18 @@ TEST_F(FlushJobTest, FlushMemTablesSingleColumnFamily) {
   }
 
   EventLogger event_logger(db_options_.info_log.get());
-  SnapshotChecker* snapshot_checker = nullptr;  // not relavant
 
   assert(memtable_ids.size() == num_mems);
   uint64_t smallest_memtable_id = memtable_ids.front();
   uint64_t flush_memtable_id = smallest_memtable_id + num_mems_to_flush - 1;
+  job_context.InitSnapshotContext(nullptr, nullptr, kMaxSequenceNumber, {});
   FlushJob flush_job(
       dbname_, versions_->GetColumnFamilySet()->GetDefault(), db_options_,
       cfd->GetLatestMutableCFOptions(), flush_memtable_id, env_options_,
-      versions_.get(), &mutex_, &shutting_down_, {}, kMaxSequenceNumber,
-      snapshot_checker, &job_context, FlushReason::kTest, nullptr, nullptr,
-      nullptr, kNoCompression, db_options_.statistics.get(), &event_logger,
-      true, true /* sync_output_directory */, true /* write_manifest */,
+      versions_.get(), &mutex_, &shutting_down_, &job_context,
+      FlushReason::kTest, nullptr, nullptr, nullptr, kNoCompression,
+      db_options_.statistics.get(), &event_logger, true,
+      true /* sync_output_directory */, true /* write_manifest */,
       Env::Priority::USER, nullptr /*IOTracer*/, empty_seqno_to_time_mapping_);
   HistogramData hist;
   FileMetaData file_meta;
@@ -405,18 +405,17 @@ TEST_F(FlushJobTest, FlushMemtablesMultipleColumnFamilies) {
   }
 
   EventLogger event_logger(db_options_.info_log.get());
-  SnapshotChecker* snapshot_checker = nullptr;  // not relevant
   std::vector<std::unique_ptr<FlushJob>> flush_jobs;
   k = 0;
+  job_context.InitSnapshotContext(nullptr, nullptr, kMaxSequenceNumber, {});
   for (auto cfd : all_cfds) {
     std::vector<SequenceNumber> snapshot_seqs;
     flush_jobs.emplace_back(new FlushJob(
         dbname_, cfd, db_options_, cfd->GetLatestMutableCFOptions(),
         memtable_ids[k], env_options_, versions_.get(), &mutex_,
-        &shutting_down_, snapshot_seqs, kMaxSequenceNumber, snapshot_checker,
-        &job_context, FlushReason::kTest, nullptr, nullptr, nullptr,
-        kNoCompression, db_options_.statistics.get(), &event_logger, true,
-        false /* sync_output_directory */, false /* write_manifest */,
+        &shutting_down_, &job_context, FlushReason::kTest, nullptr, nullptr,
+        nullptr, kNoCompression, db_options_.statistics.get(), &event_logger,
+        true, false /* sync_output_directory */, false /* write_manifest */,
         Env::Priority::USER, nullptr /*IOTracer*/,
         empty_seqno_to_time_mapping_));
     k++;
@@ -532,16 +531,17 @@ TEST_F(FlushJobTest, Snapshots) {
   }
 
   EventLogger event_logger(db_options_.info_log.get());
-  SnapshotChecker* snapshot_checker = nullptr;  // not relavant
-  FlushJob flush_job(
-      dbname_, versions_->GetColumnFamilySet()->GetDefault(), db_options_,
-      cfd->GetLatestMutableCFOptions(),
-      std::numeric_limits<uint64_t>::max() /* memtable_id */, env_options_,
-      versions_.get(), &mutex_, &shutting_down_, snapshots, kMaxSequenceNumber,
-      snapshot_checker, &job_context, FlushReason::kTest, nullptr, nullptr,
-      nullptr, kNoCompression, db_options_.statistics.get(), &event_logger,
-      true, true /* sync_output_directory */, true /* write_manifest */,
-      Env::Priority::USER, nullptr /*IOTracer*/, empty_seqno_to_time_mapping_);
+  job_context.InitSnapshotContext(nullptr, nullptr, kMaxSequenceNumber,
+                                  std::move(snapshots));
+  FlushJob flush_job(dbname_, versions_->GetColumnFamilySet()->GetDefault(),
+                     db_options_, cfd->GetLatestMutableCFOptions(),
+                     std::numeric_limits<uint64_t>::max() /* memtable_id */,
+                     env_options_, versions_.get(), &mutex_, &shutting_down_,
+                     &job_context, FlushReason::kTest, nullptr, nullptr,
+                     nullptr, kNoCompression, db_options_.statistics.get(),
+                     &event_logger, true, true /* sync_output_directory */,
+                     true /* write_manifest */, Env::Priority::USER,
+                     nullptr /*IOTracer*/, empty_seqno_to_time_mapping_);
   mutex_.Lock();
   flush_job.PickMemTable();
   ASSERT_OK(flush_job.Run());
@@ -585,18 +585,18 @@ TEST_F(FlushJobTest, GetRateLimiterPriorityForWrite) {
   }
 
   EventLogger event_logger(db_options_.info_log.get());
-  SnapshotChecker* snapshot_checker = nullptr;  // not relavant
 
   assert(memtable_ids.size() == num_mems);
   uint64_t smallest_memtable_id = memtable_ids.front();
   uint64_t flush_memtable_id = smallest_memtable_id + num_mems_to_flush - 1;
+  job_context.InitSnapshotContext(nullptr, nullptr, kMaxSequenceNumber, {});
   FlushJob flush_job(
       dbname_, versions_->GetColumnFamilySet()->GetDefault(), db_options_,
       cfd->GetLatestMutableCFOptions(), flush_memtable_id, env_options_,
-      versions_.get(), &mutex_, &shutting_down_, {}, kMaxSequenceNumber,
-      snapshot_checker, &job_context, FlushReason::kTest, nullptr, nullptr,
-      nullptr, kNoCompression, db_options_.statistics.get(), &event_logger,
-      true, true /* sync_output_directory */, true /* write_manifest */,
+      versions_.get(), &mutex_, &shutting_down_, &job_context,
+      FlushReason::kTest, nullptr, nullptr, nullptr, kNoCompression,
+      db_options_.statistics.get(), &event_logger, true,
+      true /* sync_output_directory */, true /* write_manifest */,
       Env::Priority::USER, nullptr /*IOTracer*/, empty_seqno_to_time_mapping_);
 
   // When the state from WriteController is normal.
@@ -658,16 +658,16 @@ TEST_F(FlushJobTest, ReplaceTimedPutWriteTimeWithPreferredSeqno) {
   }
 
   EventLogger event_logger(db_options_.info_log.get());
-  SnapshotChecker* snapshot_checker = nullptr;  // not relevant
-  FlushJob flush_job(
-      dbname_, versions_->GetColumnFamilySet()->GetDefault(), db_options_,
-      cfd->GetLatestMutableCFOptions(),
-      std::numeric_limits<uint64_t>::max() /* memtable_id */, env_options_,
-      versions_.get(), &mutex_, &shutting_down_, {}, kMaxSequenceNumber,
-      snapshot_checker, &job_context, FlushReason::kTest, nullptr, nullptr,
-      nullptr, kNoCompression, db_options_.statistics.get(), &event_logger,
-      true, true /* sync_output_directory */, true /* write_manifest */,
-      Env::Priority::USER, nullptr /*IOTracer*/, seqno_to_time_mapping);
+  job_context.InitSnapshotContext(nullptr, nullptr, kMaxSequenceNumber, {});
+  FlushJob flush_job(dbname_, versions_->GetColumnFamilySet()->GetDefault(),
+                     db_options_, cfd->GetLatestMutableCFOptions(),
+                     std::numeric_limits<uint64_t>::max() /* memtable_id */,
+                     env_options_, versions_.get(), &mutex_, &shutting_down_,
+                     &job_context, FlushReason::kTest, nullptr, nullptr,
+                     nullptr, kNoCompression, db_options_.statistics.get(),
+                     &event_logger, true, true /* sync_output_directory */,
+                     true /* write_manifest */, Env::Priority::USER,
+                     nullptr /*IOTracer*/, seqno_to_time_mapping);
 
   FileMetaData file_meta;
   mutex_.Lock();
@@ -761,19 +761,19 @@ TEST_P(FlushJobTimestampTest, AllKeysExpired) {
   }
 
   std::vector<SequenceNumber> snapshots;
-  constexpr SnapshotChecker* const snapshot_checker = nullptr;
   JobContext job_context(0);
   EventLogger event_logger(db_options_.info_log.get());
   std::string full_history_ts_low;
   PutFixed64(&full_history_ts_low, std::numeric_limits<uint64_t>::max());
   cfd->SetFullHistoryTsLow(full_history_ts_low);
+  job_context.InitSnapshotContext(nullptr, nullptr, kMaxSequenceNumber, {});
   FlushJob flush_job(
       dbname_, cfd, db_options_, cfd->GetLatestMutableCFOptions(),
       std::numeric_limits<uint64_t>::max() /* memtable_id */, env_options_,
-      versions_.get(), &mutex_, &shutting_down_, snapshots, kMaxSequenceNumber,
-      snapshot_checker, &job_context, FlushReason::kTest, nullptr, nullptr,
-      nullptr, kNoCompression, db_options_.statistics.get(), &event_logger,
-      true, true /* sync_output_directory */, true /* write_manifest */,
+      versions_.get(), &mutex_, &shutting_down_, &job_context,
+      FlushReason::kTest, nullptr, nullptr, nullptr, kNoCompression,
+      db_options_.statistics.get(), &event_logger, true,
+      true /* sync_output_directory */, true /* write_manifest */,
       Env::Priority::USER, nullptr /*IOTracer*/, empty_seqno_to_time_mapping_,
       /*db_id=*/"",
       /*db_session_id=*/"", full_history_ts_low);
@@ -823,8 +823,8 @@ TEST_P(FlushJobTimestampTest, NoKeyExpired) {
   }
 
   std::vector<SequenceNumber> snapshots;
-  SnapshotChecker* const snapshot_checker = nullptr;
   JobContext job_context(0);
+  job_context.InitSnapshotContext(nullptr, nullptr, kMaxSequenceNumber, {});
   EventLogger event_logger(db_options_.info_log.get());
   std::string full_history_ts_low;
   PutFixed64(&full_history_ts_low, 0);
@@ -832,10 +832,10 @@ TEST_P(FlushJobTimestampTest, NoKeyExpired) {
   FlushJob flush_job(
       dbname_, cfd, db_options_, cfd->GetLatestMutableCFOptions(),
       std::numeric_limits<uint64_t>::max() /* memtable_id */, env_options_,
-      versions_.get(), &mutex_, &shutting_down_, snapshots, kMaxSequenceNumber,
-      snapshot_checker, &job_context, FlushReason::kTest, nullptr, nullptr,
-      nullptr, kNoCompression, db_options_.statistics.get(), &event_logger,
-      true, true /* sync_output_directory */, true /* write_manifest */,
+      versions_.get(), &mutex_, &shutting_down_, &job_context,
+      FlushReason::kTest, nullptr, nullptr, nullptr, kNoCompression,
+      db_options_.statistics.get(), &event_logger, true,
+      true /* sync_output_directory */, true /* write_manifest */,
       Env::Priority::USER, nullptr /*IOTracer*/, empty_seqno_to_time_mapping_,
       /*db_id=*/"",
       /*db_session_id=*/"", full_history_ts_low);
diff --git a/db/job_context.h b/db/job_context.h
index 3d2fe933a5c2..365a820d5f48 100644
--- a/db/job_context.h
+++ b/db/job_context.h
@@ -135,6 +135,37 @@ struct JobContext {
     return kMaxSequenceNumber;
   }
 
+  SequenceNumber GetLatestSnapshotSequence() const {
+    assert(snapshot_context_initialized);
+    if (snapshot_seqs.empty()) {
+      return 0;
+    }
+    return snapshot_seqs.back();
+  }
+
+  SequenceNumber GetEarliestSnapshotSequence() const {
+    assert(snapshot_context_initialized);
+    if (snapshot_seqs.empty()) {
+      return kMaxSequenceNumber;
+    }
+    return snapshot_seqs.front();
+  }
+
+  void InitSnapshotContext(SnapshotChecker* checker,
+                           std::unique_ptr<ManagedSnapshot> managed_snapshot,
+                           SequenceNumber earliest_write_conflict,
+                           std::vector<SequenceNumber>&& snapshots) {
+    if (snapshot_context_initialized) {
+      return;
+    }
+    snapshot_context_initialized = true;
+    snapshot_checker = checker;
+    assert(!job_snapshot);
+    job_snapshot = std::move(managed_snapshot);
+    earliest_write_conflict_snapshot = earliest_write_conflict;
+    snapshot_seqs = std::move(snapshots);
+  }
+
   // Structure to store information for candidate files to delete.
   struct CandidateFileInfo {
     std::string file_name;
@@ -146,9 +177,6 @@ struct JobContext {
     }
   };
 
-  // Unique job id
-  int job_id;
-
   // a list of all files that we'll consider deleting
   // (every once in a while this is filled up with all files
   // in the DB directory)
@@ -197,14 +225,14 @@ struct JobContext {
 
   // the current manifest_file_number, log_number and prev_log_number
   // that corresponds to the set of files in 'live'.
-  uint64_t manifest_file_number;
-  uint64_t pending_manifest_file_number;
+  uint64_t manifest_file_number = 0;
+  uint64_t pending_manifest_file_number = 0;
 
   // Used for remote compaction. To prevent OPTIONS files from getting
   // purged by PurgeObsoleteFiles() of the primary host
   uint64_t min_options_file_number;
-  uint64_t log_number;
-  uint64_t prev_log_number;
+  uint64_t log_number = 0;
+  uint64_t prev_log_number = 0;
 
   uint64_t min_pending_output = 0;
   uint64_t prev_wals_total_size = 0;
@@ -213,17 +241,27 @@ struct JobContext {
 
   // Snapshot taken before flush/compaction job.
   std::unique_ptr<ManagedSnapshot> job_snapshot;
+  SnapshotChecker* snapshot_checker = nullptr;
+  std::vector<SequenceNumber> snapshot_seqs;
+  // This is the earliest snapshot that could be used for write-conflict
+  // checking by a transaction.  For any user-key newer than this snapshot, we
+  // should make sure not to remove evidence that a write occurred.
+  SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber;
+
+  // Unique job id
+  int job_id;
+
+  bool snapshot_context_initialized = false;
 
   explicit JobContext(int _job_id, bool create_superversion = false) {
     job_id = _job_id;
-    manifest_file_number = 0;
-    pending_manifest_file_number = 0;
-    log_number = 0;
-    prev_log_number = 0;
     superversion_contexts.emplace_back(
         SuperVersionContext(create_superversion));
   }
 
+  // Delete the default constructor
+  JobContext() = delete;
+
   // For non-empty JobContext Clean() has to be called at least once before
   // before destruction (see asserts in ~JobContext()). Should be called with
   // unlocked DB mutex. Destructor doesn't call Clean() to avoid accidentally

From a00391c72996a5dbdd93a621dbc53719c13b05c4 Mon Sep 17 00:00:00 2001
From: Changyu Bi <changyubi@meta.com>
Date: Thu, 22 May 2025 17:29:23 -0700
Subject: [PATCH 106/500] Enable large txn optimization by transaction write
 batch size (#13634)

Summary:
Larger key/values can cause memtable write to take longer time. Add new option `TransactionOptions::large_txn_commit_optimize_byte_threshold` that enables the optimization by transaction write batch size.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13634

Test Plan:
- new unit test
- added option to stress test and ran stress test for some time: `python3 ./tools/db_crashtest.py --txn blackbox  --txn_write_policy=0 --commit_bypass_memtable_one_in=50 --test_batches_snapshots=0`

Reviewed By: jowlyzhang

Differential Revision: D75248126

Pulled By: cbi42

fbshipit-source-id: 9522db93457729ba60e4176f7d47f7c2c7778567
---
 db_stress_tool/db_stress_test_base.cc         |   8 +-
 include/rocksdb/utilities/transaction_db.h    |  23 ++++
 .../new_features/large-txn-byte-threshold.md  |   1 +
 .../transactions/pessimistic_transaction.cc   |  10 +-
 .../transactions/pessimistic_transaction.h    |   5 +-
 utilities/transactions/transaction_test.cc    | 112 ++++++++++++++++++
 6 files changed, 155 insertions(+), 4 deletions(-)
 create mode 100644 unreleased_history/new_features/large-txn-byte-threshold.md

diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc
index e464cf8ad4f3..fb9fb0f744c3 100644
--- a/db_stress_tool/db_stress_test_base.cc
+++ b/db_stress_tool/db_stress_test_base.cc
@@ -849,9 +849,15 @@ Status StressTest::NewTxn(WriteOptions& write_opts, ThreadState* thread,
       assert(FLAGS_user_timestamp_size == 0);
       if (thread->rand.OneIn(2)) {
         txn_options.commit_bypass_memtable = true;
-      } else {
+      }
+      if (thread->rand.OneIn(2)) {
         txn_options.large_txn_commit_optimize_threshold = 1;
       }
+      if (thread->rand.OneIn(2) ||
+          (!txn_options.commit_bypass_memtable &&
+           txn_options.large_txn_commit_optimize_threshold != 1)) {
+        txn_options.large_txn_commit_optimize_byte_threshold = 1;
+      }
       if (commit_bypass_memtable) {
         *commit_bypass_memtable = txn_options.commit_bypass_memtable;
       }
diff --git a/include/rocksdb/utilities/transaction_db.h b/include/rocksdb/utilities/transaction_db.h
index 32398d9ea83e..d5c343d42047 100644
--- a/include/rocksdb/utilities/transaction_db.h
+++ b/include/rocksdb/utilities/transaction_db.h
@@ -366,6 +366,22 @@ struct TransactionOptions {
   // DeleteRange, SingleDelete.
   bool write_batch_track_timestamp_size = false;
 
+  // The following three options enable optimizations for large transaction
+  // commit to bypass memtable write.
+  // - If any transaction's commit should bybass memtable write,
+  //  set commit_bypass_memtable to true.
+  // - If only bypass memtable write for transactions with >= n operations,
+  //  set commit_bypass_memtable to false,
+  //  large_txn_commit_optimize_threshold to n, and
+  //  large_txn_commit_optimize_byte_threshold to max.
+  //  Similarly for only optimize when a transaction's write batch size is >= n.
+  // - If bypass memtable write for transactions with >= n operations or >= x
+  // bytes,
+  //  set commit_bypass_memtable to false,
+  //  large_txn_commit_optimize_threshold to n, and
+  //  large_txn_commit_optimize_byte_threshold to x.
+  //
+  //
   // EXPERIMENTAL, SUBJECT TO CHANGE
   // Only supports write-committed policy. If set to true, the transaction will
   // skip memtable write and ingest into the DB directly during Commit(). This
@@ -396,6 +412,13 @@ struct TransactionOptions {
   // comment for `commit_bypass_memtable` for more optimization detail.
   uint32_t large_txn_commit_optimize_threshold =
       std::numeric_limits<uint32_t>::max();
+
+  // EXPERIMENTAL, SUBJECT TO CHANGE
+  // When the size of a transaction's write batch is at least this threshold,
+  // we will enable optimizations for commiting a large transaction. See
+  // comment for `commit_bypass_memtable` for more optimization detail.
+  uint64_t large_txn_commit_optimize_byte_threshold =
+      std::numeric_limits<uint64_t>::max();
 };
 
 // The per-write optimizations that do not involve transactions. TransactionDB
diff --git a/unreleased_history/new_features/large-txn-byte-threshold.md b/unreleased_history/new_features/large-txn-byte-threshold.md
new file mode 100644
index 000000000000..4d781c41efc7
--- /dev/null
+++ b/unreleased_history/new_features/large-txn-byte-threshold.md
@@ -0,0 +1 @@
+* Add new experimental `TransactionOptions::large_txn_commit_optimize_byte_threshold` to enable optimizations for large transaction commit by transaction batch data size.
diff --git a/utilities/transactions/pessimistic_transaction.cc b/utilities/transactions/pessimistic_transaction.cc
index 5243ec9a2570..b802179d454f 100644
--- a/utilities/transactions/pessimistic_transaction.cc
+++ b/utilities/transactions/pessimistic_transaction.cc
@@ -115,6 +115,9 @@ void PessimisticTransaction::Initialize(const TransactionOptions& txn_options) {
     commit_bypass_memtable_threshold_ =
         db_options.txn_commit_bypass_memtable_threshold;
   }
+
+  commit_bypass_memtable_byte_threshold_ =
+      txn_options.large_txn_commit_optimize_byte_threshold;
 }
 
 PessimisticTransaction::~PessimisticTransaction() {
@@ -857,6 +860,8 @@ Status WriteCommittedTxn::CommitInternal() {
   } else {
     assert(commit_bypass_memtable_threshold_ ==
            std::numeric_limits<uint32_t>::max());
+    assert(commit_bypass_memtable_byte_threshold_ ==
+           std::numeric_limits<uint64_t>::max());
     assert(commit_timestamp_ != kMaxTxnTimestamp);
     char commit_ts_buf[sizeof(kMaxTxnTimestamp)];
     EncodeFixed64(commit_ts_buf, commit_timestamp_);
@@ -895,7 +900,10 @@ Status WriteCommittedTxn::CommitInternal() {
   uint32_t wb_count = wb->Count();
   RecordInHistogram(db_impl_->immutable_db_options_.stats,
                     NUM_OP_PER_TRANSACTION, wb_count);
-  bool bypass_memtable = wb_count >= commit_bypass_memtable_threshold_;
+  bool bypass_memtable =
+      !needs_ts &&
+      (wb_count >= commit_bypass_memtable_threshold_ ||
+       wb->GetDataSize() >= commit_bypass_memtable_byte_threshold_);
   if (!bypass_memtable) {
     // insert prepared batch into Memtable only skipping WAL.
     // Memtable will ignore BeginPrepare/EndPrepare markers
diff --git a/utilities/transactions/pessimistic_transaction.h b/utilities/transactions/pessimistic_transaction.h
index a85071ad187a..dd166bd080ad 100644
--- a/utilities/transactions/pessimistic_transaction.h
+++ b/utilities/transactions/pessimistic_transaction.h
@@ -166,10 +166,11 @@ class PessimisticTransaction : public TransactionBaseImpl {
   // Refer to
   // TransactionOptions::skip_prepare
   bool skip_prepare_ = false;
-  // Refer to
-  // TransactionOptions::commit_bypass_memtable
+  // Refer to TransactionOptions::commit_bypass_memtable
   uint32_t commit_bypass_memtable_threshold_ =
       std::numeric_limits<uint32_t>::max();
+  uint64_t commit_bypass_memtable_byte_threshold_ =
+      std::numeric_limits<uint64_t>::max();
 
  private:
   friend class TransactionTest_ValidateSnapshotTest_Test;
diff --git a/utilities/transactions/transaction_test.cc b/utilities/transactions/transaction_test.cc
index 641299ba0540..84d00775807b 100644
--- a/utilities/transactions/transaction_test.cc
+++ b/utilities/transactions/transaction_test.cc
@@ -9889,6 +9889,118 @@ TEST_F(TransactionDBTest, SelfDeadlockBug) {
   delete txn1;
   delete txn2;
 }
+
+TEST_P(CommitBypassMemtableTest,
+       OptimizeLargeTxnCommitWriteBatchSizeThreshold) {
+  // Tests TransactionOptions::large_txn_commit_optimize_byte_threshold
+  const uint64_t threshold = 100;
+  SetUpTransactionDB();
+  bool commit_bypass_memtable = false;
+  SyncPoint::GetInstance()->SetCallBack(
+      "WriteCommittedTxn::CommitInternal:bypass_memtable",
+      [&](void* arg) { commit_bypass_memtable = *(static_cast<bool*>(arg)); });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  Random rnd(301);
+  // Test with transaction option only
+  WriteOptions wopts;
+  TransactionOptions txn_opts;
+  txn_opts.large_txn_commit_optimize_byte_threshold = threshold;
+
+  // Above threshold
+  auto txn = txn_db->BeginTransaction(wopts, txn_opts, nullptr);
+  ASSERT_OK(txn->SetName("xid1"));
+  ASSERT_OK(txn->Put("k1", rnd.RandomString(threshold)));
+  ASSERT_TRUE(txn->GetWriteBatch()->GetDataSize() >= threshold);
+  ASSERT_OK(txn->Prepare());
+  ASSERT_OK(txn->Commit());
+  ASSERT_TRUE(commit_bypass_memtable);
+
+  // Below threshold
+  txn = txn_db->BeginTransaction(wopts, txn_opts, txn);
+  ASSERT_OK(txn->SetName("xid2"));
+  ASSERT_OK(txn->Put("k2", "v2"));
+  ASSERT_TRUE(txn->GetWriteBatch()->GetDataSize() < threshold);
+  ASSERT_OK(txn->Prepare());
+  ASSERT_OK(txn->Commit());
+  ASSERT_FALSE(commit_bypass_memtable);
+  delete txn;
+
+  // With commit_bypass_memtbale
+  TransactionOptions txn_opts2;
+  txn_opts2.commit_bypass_memtable = true;
+  txn_opts2.large_txn_commit_optimize_byte_threshold = threshold;
+  txn = txn_db->BeginTransaction(wopts, txn_opts2, nullptr);
+  ASSERT_OK(txn->SetName("xid3"));
+  ASSERT_OK(txn->Put("k3", "v3"));
+  ASSERT_TRUE(txn->GetWriteBatch()->GetDataSize() < threshold);
+  ASSERT_OK(txn->Prepare());
+  ASSERT_OK(txn->Commit());
+  ASSERT_TRUE(commit_bypass_memtable);
+  delete txn;
+
+  // With count based threshold `large_txn_commit_optimize_threshold`
+  TransactionOptions txn_opts3;
+  txn_opts3.commit_bypass_memtable = false;
+  txn_opts3.large_txn_commit_optimize_byte_threshold = threshold;
+  txn_opts3.large_txn_commit_optimize_threshold = 3;
+  txn = txn_db->BeginTransaction(wopts, txn_opts3, nullptr);
+  ASSERT_OK(txn->SetName("xid4"));
+  ASSERT_OK(txn->Put("k3", "v3"));
+  ASSERT_OK(txn->Delete("k2"));
+  ASSERT_OK(txn->Delete("k1"));
+  ASSERT_TRUE(txn->GetWriteBatch()->GetDataSize() < threshold);
+  ASSERT_OK(txn->Prepare());
+  ASSERT_OK(txn->Commit());
+  ASSERT_TRUE(commit_bypass_memtable);
+
+  txn = txn_db->BeginTransaction(wopts, txn_opts3, txn);
+  ASSERT_OK(txn->SetName("xid4"));
+  ASSERT_OK(txn->Put("k3", "v3"));
+  ASSERT_OK(txn->Delete("k2"));
+  ASSERT_OK(txn->Delete("k1"));
+  ASSERT_TRUE(txn->GetWriteBatch()->GetDataSize() < threshold);
+  ASSERT_OK(txn->Prepare());
+  ASSERT_OK(txn->Commit());
+  ASSERT_TRUE(commit_bypass_memtable);
+
+  txn = txn_db->BeginTransaction(wopts, txn_opts3, txn);
+  ASSERT_OK(txn->SetName("xid5"));
+  ASSERT_OK(txn->Put("k5", "v5"));
+  ASSERT_TRUE(txn->GetWriteBatch()->GetDataSize() < threshold);
+  ASSERT_OK(txn->Prepare());
+  ASSERT_OK(txn->Commit());
+  ASSERT_FALSE(commit_bypass_memtable);
+  delete txn;
+
+  // Test with multiple column families
+  std::vector<std::string> cfs = {"pk", "sk"};
+  CreateColumnFamilies(cfs, options);
+  TransactionOptions txn_opts_cf;
+
+  txn_opts_cf.large_txn_commit_optimize_byte_threshold = threshold;
+
+  // Below threshold
+  auto txn_cf = txn_db->BeginTransaction(wopts, txn_opts_cf, nullptr);
+  ASSERT_OK(txn_cf->SetName("xid_cf_above"));
+  ASSERT_OK(txn_cf->Put(handles_[0], "k1", rnd.RandomString(threshold / 2)));
+  ASSERT_OK(txn_cf->Put(handles_[1], "k2", rnd.RandomString(threshold / 2)));
+  ASSERT_TRUE(txn_cf->GetWriteBatch()->GetDataSize() >= threshold);
+  ASSERT_OK(txn_cf->Prepare());
+  ASSERT_OK(txn_cf->Commit());
+  ASSERT_TRUE(commit_bypass_memtable);
+
+  txn_cf = txn_db->BeginTransaction(wopts, txn_opts_cf, txn_cf);
+  ASSERT_OK(txn_cf->SetName("xid_cf_below"));
+  ASSERT_OK(txn_cf->Put(handles_[0], "k1", rnd.RandomString(10)));
+  ASSERT_OK(txn_cf->Put(handles_[1], "k2", rnd.RandomString(10)));
+  ASSERT_TRUE(txn_cf->GetWriteBatch()->GetDataSize() < threshold);
+  ASSERT_OK(txn_cf->Prepare());
+  ASSERT_OK(txn_cf->Commit());
+  ASSERT_FALSE(commit_bypass_memtable);
+
+  delete txn_cf;
+}
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {

From 11631c0609024331232ffc4d90702793c6c1cb47 Mon Sep 17 00:00:00 2001
From: Changyu Bi <changyubi@meta.com>
Date: Thu, 22 May 2025 20:03:51 -0700
Subject: [PATCH 107/500] Update default value for large txn options (#13636)

Summary:
to make it easier to use 0 for disabled. And deprecate the use of txn db option `txn_commit_bypass_memtable_threshold`.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13636

Test Plan: updated unit tests.

Reviewed By: jowlyzhang

Differential Revision: D75262136

Pulled By: cbi42

fbshipit-source-id: 9040e5a9c918c1d0906a2db4600cc012d2436b22
---
 include/rocksdb/utilities/transaction_db.h    |  14 +-
 .../behavior_changes/large-txn-default-val.md |   1 +
 .../transactions/pessimistic_transaction.cc   |  26 ++--
 utilities/transactions/transaction_test.cc    | 128 ++++--------------
 4 files changed, 50 insertions(+), 119 deletions(-)
 create mode 100644 unreleased_history/behavior_changes/large-txn-default-val.md

diff --git a/include/rocksdb/utilities/transaction_db.h b/include/rocksdb/utilities/transaction_db.h
index d5c343d42047..4a69c141b06d 100644
--- a/include/rocksdb/utilities/transaction_db.h
+++ b/include/rocksdb/utilities/transaction_db.h
@@ -254,7 +254,7 @@ struct TransactionDBOptions {
   // for more details.
   std::vector<std::shared_ptr<SecondaryIndex>> secondary_indices;
 
-  // Deprecated, this option may be removed in the future.
+  // Deprecated, this option has no effect and may be removed in the future.
   // Use TransactionOptions::large_txn_commit_optimize_threshold instead.
   //
   // This option is only valid for write committed. If the number of updates in
@@ -373,7 +373,7 @@ struct TransactionOptions {
   // - If only bypass memtable write for transactions with >= n operations,
   //  set commit_bypass_memtable to false,
   //  large_txn_commit_optimize_threshold to n, and
-  //  large_txn_commit_optimize_byte_threshold to max.
+  //  large_txn_commit_optimize_byte_threshold to 0.
   //  Similarly for only optimize when a transaction's write batch size is >= n.
   // - If bypass memtable write for transactions with >= n operations or >= x
   // bytes,
@@ -410,15 +410,17 @@ struct TransactionOptions {
   // When the number of updates in a transaction is at least this threshold,
   // we will enable optimizations for commiting a large transaction. See
   // comment for `commit_bypass_memtable` for more optimization detail.
-  uint32_t large_txn_commit_optimize_threshold =
-      std::numeric_limits<uint32_t>::max();
+  //
+  // Default: 0 (disabled).
+  uint32_t large_txn_commit_optimize_threshold = 0;
 
   // EXPERIMENTAL, SUBJECT TO CHANGE
   // When the size of a transaction's write batch is at least this threshold,
   // we will enable optimizations for commiting a large transaction. See
   // comment for `commit_bypass_memtable` for more optimization detail.
-  uint64_t large_txn_commit_optimize_byte_threshold =
-      std::numeric_limits<uint64_t>::max();
+  //
+  // Default: 0 (disabled).
+  uint64_t large_txn_commit_optimize_byte_threshold = 0;
 };
 
 // The per-write optimizations that do not involve transactions. TransactionDB
diff --git a/unreleased_history/behavior_changes/large-txn-default-val.md b/unreleased_history/behavior_changes/large-txn-default-val.md
new file mode 100644
index 000000000000..7f0dde81c3cd
--- /dev/null
+++ b/unreleased_history/behavior_changes/large-txn-default-val.md
@@ -0,0 +1 @@
+* `TransactionOptions::large_txn_commit_optimize_threshold` now has default value 0 for disabled. `TransactionDBOptions::txn_commit_bypass_memtable_threshold` now has no effect on transactions.
diff --git a/utilities/transactions/pessimistic_transaction.cc b/utilities/transactions/pessimistic_transaction.cc
index b802179d454f..a5b22a579279 100644
--- a/utilities/transactions/pessimistic_transaction.cc
+++ b/utilities/transactions/pessimistic_transaction.cc
@@ -107,13 +107,9 @@ void PessimisticTransaction::Initialize(const TransactionOptions& txn_options) {
   if (txn_options.commit_bypass_memtable) {
     // No need to optimize for empty transction
     commit_bypass_memtable_threshold_ = 1;
-  } else if (txn_options.large_txn_commit_optimize_threshold !=
-             std::numeric_limits<uint32_t>::max()) {
-    commit_bypass_memtable_threshold_ =
-        txn_options.large_txn_commit_optimize_threshold;
   } else {
     commit_bypass_memtable_threshold_ =
-        db_options.txn_commit_bypass_memtable_threshold;
+        txn_options.large_txn_commit_optimize_threshold;
   }
 
   commit_bypass_memtable_byte_threshold_ =
@@ -858,10 +854,8 @@ Status WriteCommittedTxn::CommitInternal() {
   if (!needs_ts) {
     s = WriteBatchInternal::MarkCommit(working_batch, name_);
   } else {
-    assert(commit_bypass_memtable_threshold_ ==
-           std::numeric_limits<uint32_t>::max());
-    assert(commit_bypass_memtable_byte_threshold_ ==
-           std::numeric_limits<uint64_t>::max());
+    assert(!commit_bypass_memtable_threshold_);
+    assert(!commit_bypass_memtable_byte_threshold_);
     assert(commit_timestamp_ != kMaxTxnTimestamp);
     char commit_ts_buf[sizeof(kMaxTxnTimestamp)];
     EncodeFixed64(commit_ts_buf, commit_timestamp_);
@@ -900,10 +894,16 @@ Status WriteCommittedTxn::CommitInternal() {
   uint32_t wb_count = wb->Count();
   RecordInHistogram(db_impl_->immutable_db_options_.stats,
                     NUM_OP_PER_TRANSACTION, wb_count);
-  bool bypass_memtable =
-      !needs_ts &&
-      (wb_count >= commit_bypass_memtable_threshold_ ||
-       wb->GetDataSize() >= commit_bypass_memtable_byte_threshold_);
+  bool bypass_memtable = false;
+  if (!needs_ts) {
+    if (commit_bypass_memtable_threshold_ &&
+        wb_count >= commit_bypass_memtable_threshold_) {
+      bypass_memtable = true;
+    } else if (commit_bypass_memtable_byte_threshold_ &&
+               wb->GetDataSize() >= commit_bypass_memtable_byte_threshold_) {
+      bypass_memtable = true;
+    }
+  }
   if (!bypass_memtable) {
     // insert prepared batch into Memtable only skipping WAL.
     // Memtable will ignore BeginPrepare/EndPrepare markers
diff --git a/utilities/transactions/transaction_test.cc b/utilities/transactions/transaction_test.cc
index 84d00775807b..bf5bbc562925 100644
--- a/utilities/transactions/transaction_test.cc
+++ b/utilities/transactions/transaction_test.cc
@@ -8959,7 +8959,6 @@ class CommitBypassMemtableTest : public DBTestBase,
   TransactionDBOptions txn_db_opts;
 
   void SetUpTransactionDB(
-      uint32_t threshold = std::numeric_limits<uint32_t>::max(),
       bool atomic_flush = false) {
     options = CurrentOptions();
     options.create_if_missing = true;
@@ -8973,7 +8972,6 @@ class CommitBypassMemtableTest : public DBTestBase,
     Destroy(options, true);
 
     txn_db_opts.write_policy = TxnDBWritePolicy::WRITE_COMMITTED;
-    txn_db_opts.txn_commit_bypass_memtable_threshold = threshold;
     ASSERT_OK(TransactionDB::Open(options, txn_db_opts, dbname_, &txn_db));
     ASSERT_NE(txn_db, nullptr);
     db_ = txn_db;
@@ -9429,10 +9427,10 @@ TEST_P(CommitBypassMemtableTest, Recovery) {
   VerifyDBFromMap(expected);
 }
 
-TEST_P(CommitBypassMemtableTest, ThresholdTxnDBOption) {
-  // Tests TransactionDBOptions::txn_commit_bypass_memtable_threshold
+TEST_P(CommitBypassMemtableTest, OptimizeLargeTxnCommitThreshold) {
+  // Tests TransactionOptions::large_txn_commit_optimize_threshold
   const uint32_t threshold = 10;
-  SetUpTransactionDB(/*threshold=*/threshold);
+  SetUpTransactionDB();
   bool commit_bypass_memtable = false;
   // TODO: add and use stats for this
   SyncPoint::GetInstance()->SetCallBack(
@@ -9440,73 +9438,25 @@ TEST_P(CommitBypassMemtableTest, ThresholdTxnDBOption) {
       [&](void* arg) { commit_bypass_memtable = *(static_cast<bool*>(arg)); });
   SyncPoint::GetInstance()->EnableProcessing();
 
-  // TransactionOptions::commit_bypass_memtable takes precedence
   WriteOptions wopts;
+  // Test default (disabled)
   TransactionOptions txn_opts;
-  txn_opts.commit_bypass_memtable = true;
-  Transaction* txn1 = txn_db->BeginTransaction(wopts, txn_opts, nullptr);
-  ASSERT_OK(txn1->SetName("xid1"));
-  ASSERT_OK(txn1->Put("k2", "v2"));
-  ASSERT_OK(txn1->Put("k1", "v1"));
+  auto txn1 = txn_db->BeginTransaction(wopts, txn_opts, nullptr);
+  ASSERT_OK(txn1->SetName("xid0"));
+  for (int i = 0; i < 100; ++i) {
+    ASSERT_OK(
+        txn1->Put("key" + std::to_string(i), "value" + std::to_string(i)));
+  }
   ASSERT_OK(txn1->Prepare());
   ASSERT_OK(txn1->Commit());
-  ASSERT_TRUE(commit_bypass_memtable);
-
-  // Test threshold behavior
-  for (auto num_ops : {threshold - 1, threshold}) {
-    commit_bypass_memtable = false;
-    txn_opts.commit_bypass_memtable = false;
-    auto txn = txn_db->BeginTransaction(wopts, txn_opts, txn1);
-    txn1 = nullptr;
-    ASSERT_OK(txn->SetName("xid" + std::to_string(num_ops)));
-    for (uint32_t i = 0; i < num_ops; ++i) {
-      ASSERT_OK(
-          txn->Put("key" + std::to_string(i), "value" + std::to_string(i)));
-    }
-    ASSERT_OK(txn->Prepare());
-    ASSERT_OK(txn->Commit());
-    ASSERT_EQ(commit_bypass_memtable, num_ops >= threshold);
-    delete txn;
-  }
-
-  // Repeat the same test with updates to two CFs
-  std::vector<std::string> cfs = {"pk", "sk"};
-  CreateColumnFamilies(cfs, options);
-
-  // Test threshold behavior with CFs
-  for (auto num_ops : {threshold - 1, threshold}) {
-    commit_bypass_memtable = false;
-    txn_opts.commit_bypass_memtable = false;
-    auto txn_cf = txn_db->BeginTransaction(wopts, txn_opts, nullptr);
-    ASSERT_OK(txn_cf->SetName("xid_cf" + std::to_string(num_ops)));
-    for (uint32_t i = 0; i < num_ops; ++i) {
-      ASSERT_OK(txn_cf->Put(handles_[i % 2], "key" + std::to_string(i),
-                            "value" + std::to_string(i)));
-    }
-    ASSERT_OK(txn_cf->Prepare());
-    ASSERT_OK(txn_cf->Commit());
-    ASSERT_EQ(commit_bypass_memtable, num_ops >= threshold);
-    delete txn_cf;
-  }
-}
-
-TEST_P(CommitBypassMemtableTest, OptimizeLargeTxnCommitThreshold) {
-  // Tests TransactionOptions::large_txn_commit_optimize_threshold
-  const uint32_t threshold = 10;
-  SetUpTransactionDB();
-  bool commit_bypass_memtable = false;
-  SyncPoint::GetInstance()->SetCallBack(
-      "WriteCommittedTxn::CommitInternal:bypass_memtable",
-      [&](void* arg) { commit_bypass_memtable = *(static_cast<bool*>(arg)); });
-  SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_FALSE(commit_bypass_memtable);
+  delete txn1;
 
   // Test with transaction option only
-  WriteOptions wopts;
-  TransactionOptions txn_opts;
   txn_opts.large_txn_commit_optimize_threshold = threshold;
 
   // Test with transaction below threshold
-  auto txn1 = txn_db->BeginTransaction(wopts, txn_opts, nullptr);
+  txn1 = txn_db->BeginTransaction(wopts, txn_opts, nullptr);
   ASSERT_OK(txn1->SetName("xid1"));
   ASSERT_OK(txn1->Put("k1", "v1"));
   ASSERT_OK(txn1->Prepare());
@@ -9526,38 +9476,6 @@ TEST_P(CommitBypassMemtableTest, OptimizeLargeTxnCommitThreshold) {
   ASSERT_TRUE(commit_bypass_memtable);
   delete txn1;
 
-  // Test with both DB option and transaction option - transaction option should
-  // take precedence
-  SetUpTransactionDB(/*threshold=*/threshold * 2);
-
-  // Transaction option is lower than DB option, should use transaction option
-  txn_opts.large_txn_commit_optimize_threshold = threshold;
-  txn1 = txn_db->BeginTransaction(wopts, txn_opts, nullptr);
-  ASSERT_OK(txn1->SetName("xid3"));
-  for (uint32_t i = 0; i < threshold; ++i) {
-    ASSERT_OK(
-        txn1->Put("key" + std::to_string(i), "value" + std::to_string(i)));
-  }
-  ASSERT_OK(txn1->Prepare());
-  commit_bypass_memtable = false;
-  ASSERT_OK(txn1->Commit());
-  ASSERT_TRUE(commit_bypass_memtable);
-  delete txn1;
-
-  // Transaction option is higher than DB option, should use transaction option
-  txn_opts.large_txn_commit_optimize_threshold = threshold * 3;
-  txn1 = txn_db->BeginTransaction(wopts, txn_opts, nullptr);
-  ASSERT_OK(txn1->SetName("xid4"));
-  for (uint32_t i = 0; i < threshold * 3 - 1; ++i) {
-    ASSERT_OK(
-        txn1->Put("key" + std::to_string(i), "value" + std::to_string(i)));
-  }
-  ASSERT_OK(txn1->Prepare());
-  commit_bypass_memtable = false;
-  ASSERT_OK(txn1->Commit());
-  ASSERT_FALSE(commit_bypass_memtable);
-  delete txn1;
-
   SetUpTransactionDB();
   // Test with multiple column families
   std::vector<std::string> cfs = {"pk", "sk"};
@@ -9610,7 +9528,7 @@ TEST_P(CommitBypassMemtableTest, OptimizeLargeTxnCommitThreshold) {
 
 TEST_P(CommitBypassMemtableTest, AtomicFlushTest) {
   const uint32_t threshold = 10;
-  SetUpTransactionDB(/*threshold=*/threshold, /*atomic_flush=*/true);
+  SetUpTransactionDB(/*atomic_flush=*/true);
   SyncPoint::GetInstance()->EnableProcessing();
 
   std::vector<std::string> cfs = {"cf0", "cf1", "cf2"};
@@ -9621,7 +9539,9 @@ TEST_P(CommitBypassMemtableTest, AtomicFlushTest) {
   ASSERT_OK(db_->Put({}, handles_[2], "key2", "val2"));
 
   // Write to cf 0, should see cf1 and cf2 flushed too
-  auto txn = txn_db->BeginTransaction({}, {}, nullptr);
+  TransactionOptions txn_opts;
+  txn_opts.large_txn_commit_optimize_threshold = threshold;
+  auto txn = txn_db->BeginTransaction({}, txn_opts, nullptr);
   for (uint32_t i = 0; i <= threshold; ++i) {
     ASSERT_OK(txn->Put(handles_[0], "key" + std::to_string(i),
                        "cf0" + std::to_string(i)));
@@ -9902,13 +9822,21 @@ TEST_P(CommitBypassMemtableTest,
   SyncPoint::GetInstance()->EnableProcessing();
 
   Random rnd(301);
-  // Test with transaction option only
+
   WriteOptions wopts;
   TransactionOptions txn_opts;
-  txn_opts.large_txn_commit_optimize_byte_threshold = threshold;
+  // Test default
+  auto txn = txn_db->BeginTransaction(wopts, txn_opts, nullptr);
+  ASSERT_OK(txn->SetName("xid0"));
+  ASSERT_OK(txn->Put("k1", rnd.RandomString(1000)));
+  ASSERT_OK(txn->Prepare());
+  ASSERT_OK(txn->Commit());
+  ASSERT_FALSE(commit_bypass_memtable);
 
+  // Test with transaction option only
+  txn_opts.large_txn_commit_optimize_byte_threshold = threshold;
   // Above threshold
-  auto txn = txn_db->BeginTransaction(wopts, txn_opts, nullptr);
+  txn = txn_db->BeginTransaction(wopts, txn_opts, txn);
   ASSERT_OK(txn->SetName("xid1"));
   ASSERT_OK(txn->Put("k1", rnd.RandomString(threshold)));
   ASSERT_TRUE(txn->GetWriteBatch()->GetDataSize() >= threshold);

From 7208116105a34b251968418634623107bdad341a Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Tue, 27 May 2025 10:41:09 -0700
Subject: [PATCH 108/500] Update API comments for mutable tiering options
 (#13642)

Summary:
Mutable as described in 9.11 release notes

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13642

Test Plan: already tested in tiered_compaction_test; search for ApplyConfigChange

Reviewed By: jowlyzhang

Differential Revision: D75458238

Pulled By: pdillinger

fbshipit-source-id: a2aa7273dbdc7be95aceed76edf502f883130172
---
 include/rocksdb/advanced_options.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/rocksdb/advanced_options.h b/include/rocksdb/advanced_options.h
index 57cd12b08472..2f9b04699a4c 100644
--- a/include/rocksdb/advanced_options.h
+++ b/include/rocksdb/advanced_options.h
@@ -863,7 +863,7 @@ struct AdvancedColumnFamilyOptions {
   //
   // Default: 0 (disable the feature)
   //
-  // Not dynamically changeable, change it requires db restart.
+  // Dynamically changeable through the SetOptions() API
   uint64_t preclude_last_level_data_seconds = 0;
 
   // EXPERIMENTAL
@@ -886,7 +886,7 @@ struct AdvancedColumnFamilyOptions {
   //
   // Default: 0 (disable the feature)
   //
-  // Not dynamically changeable, change it requires db restart.
+  // Dynamically changeable through the SetOptions() API
   uint64_t preserve_internal_time_seconds = 0;
 
   // When set, large values (blobs) are written to separate blob files, and

From f2a8ee8ff26e424c490be44a41a0b47afabeecbe Mon Sep 17 00:00:00 2001
From: Mahmood Ali <mahmood@openai.com>
Date: Tue, 27 May 2025 16:34:04 -0700
Subject: [PATCH 109/500] get block_based_table_builder.cc to compile on c++23
 (#13638)

Summary:
Get table/block_based/block_based_table_builder.cc to compile on c++23 on clang, by re-ordering BlockBasedTableBuilder::Rep and BlockBasedTableBuilder::ParallelCompressionRep definitions.

Clang `--std=c++23` changed behavior of unique_ptr<> with incomplete types. Now, constructor/destructures involving types with unique_ptr fields, must have access to the complete type; and thus must be defined after all its dependencies: See [godbolt link for behavior](https://godbolt.org/#z:OYLghAFBqd5QCxAYwPYBMCmBRdBLAF1QCcAaPECAMzwBtMA7AQwFtMQByARg9KtQYEAysib0QXACx8BBAKoBnTAAUAHpwAMvAFYTStJg1DIApACYAQuYukl9ZATwDKjdAGFUtAK4sGe1wAyeAyYAHI%2BAEaYxBIAbKQADqgKhE4MHt6%2BekkpjgJBIeEsUTFc8XaYDmlCBEzEBBk%2Bfly2mPZ5DDV1BAVhkdFxtrX1jVktCsM9wX3FA2UAlLaoXsTI7BzmAMzByN5YANQmm25sLCQAnkfYJhoAgje3u0wKCvsAkgxoLAn0BJhHVjuDyeL32t0OAHZAbcAPQw/YI24QCboEAgLwMPAARy8mAA%2BgkCMQjm4Pl8fpg/ld9kx5gCHojkQRUejMTj8YTiccyahvr9/ptsDT5iAaXimSyzgA3TAQWnzSFWCEAEQZ%2BxRaIx2NxBKJJJ5fMpAqFTDx9KBKvN9zuINeBopf0VJktm2hDzh%2BwxXzYgn2RH25LomH2wQD1msZk2%2BzlDHQ%2B2ImGlwYICGDwVo0zjaAYE2IXgcJH2WBomI6dLuHtuaKRGtZ2o5eu5n15DuNwv2otNErRSbl8wVzqVqqBd2CBH2LCYwQgA6haoRCYIKwY%2Bw0Vudyo4i1onAArLw/BwtKRUJw3OHLOrlqtg1seKQCJot4sANYgMy7/ScSQHp8nzi8AoIAaA%2BT6LHAsBIJgqiVF4RBkBQcrEMACjKIYbRCAgqAAO6HvegYGB0aEhLQmE4Yex4EQMTxGFwAAcXAgVRxChKw6y8MxADycFkbhf7QZUtzIUBHC8AJyA1Pgh68PwggiGI7BSDIgiKCo6hHjoegGEYKAXjYGYREBkCLKghJpCJAC0FkokcyqmJYEabLwqAysQxB4FgRkzqQeaCHgbAACqoJ4XmLAo15rHoKLBMRGFYXx3C8NhxBMAknA8Nue6/hp/4cNgMHIHBhaqHRsQWbEkgBtpwD7PRAB0XB1Ro0bng5likPsuCEIWd7zLwj4af2pBvh%2BX4cD%2BpAUc5AG2MBoGDaQEGICA4lFQhlB1ChsWkfFeEcS2dBMER6E7eRf7MSg1X0YxpDMaxbAzVxPG7fxBVCShM3iZJwQzbJwiiOISl/apah/roZj6IYxh6foeCGfAJlmQIlnWcytn2VYlhmMeLnRO5nkIz5GKOIFwW0KFSwrJF4zMjFJ28XtpDJal6VbmN%2B6TX%2Bp55QVa37CVZUVVVUO1XRDVNS1MOdfg8GHJGXB9fNWhDSNkh1QAnJrWva9r8Q7uN2U4zNQEgQNyuLTAy2rfB5AbcJ20M6zt0HYRaQOy9OUXcAXC7i0d1sY9LvRNxwge8e4nvSJYkFd90nKf9CkSNIwNKKDOW6JskM6Rj1iw/DxknkjOacFZNmbHZemRs5rn45gFO%2BSTmBBSFhPhVTilDFJ7tnYlTMpWliWZRwHNTbl%2BWwbLAvlZVwDIMgtW7g1kttTY0vdcQcubAr/Vga%2B76fvrE2j9zgFzWbz5jWYhvTaJSsX65KTOJIQA%3D%3D%3D).

Interestingly, `gcc --std=c++23` accepts the code as-is.

Fixes https://github.com/facebook/rocksdb/issues/13574

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13638

Reviewed By: hx235

Differential Revision: D75472325

Pulled By: cbi42

fbshipit-source-id: 671df558cc0a54db94b7cc4af46591cd33c32ad6
---
 .../block_based/block_based_table_builder.cc  | 1376 ++++++++---------
 1 file changed, 688 insertions(+), 688 deletions(-)

diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc
index 95f25f80784a..64fe71351ad2 100644
--- a/table/block_based/block_based_table_builder.cc
+++ b/table/block_based/block_based_table_builder.cc
@@ -190,779 +190,779 @@ struct BlockBasedTableBuilder::WorkingAreaPair {
   Decompressor::ManagedWorkingArea verify;
 };
 
-struct BlockBasedTableBuilder::Rep {
-  const ImmutableOptions ioptions;
-  // BEGIN from MutableCFOptions
-  std::shared_ptr<const SliceTransform> prefix_extractor;
-  // END from MutableCFOptions
-  const WriteOptions write_options;
-  const BlockBasedTableOptions table_options;
-  const InternalKeyComparator& internal_comparator;
-  // Size in bytes for the user-defined timestamps.
-  size_t ts_sz;
-  // When `ts_sz` > 0 and this flag is false, the user-defined timestamp in the
-  // user key will be stripped when creating the block based table. This
-  // stripping happens for all user keys, including the keys in data block,
-  // index block for data block, index block for index block (if index type is
-  // `kTwoLevelIndexSearch`), index for filter blocks (if using partitioned
-  // filters), the `first_internal_key` in `IndexValue`, the `end_key` for range
-  // deletion entries.
-  // As long as the user keys are sorted when added via `Add` API, their logic
-  // ordering won't change after timestamps are stripped. However, for each user
-  // key to be logically equivalent before and after timestamp is stripped, the
-  // user key should contain the minimum timestamp.
-  bool persist_user_defined_timestamps;
-  WritableFileWriter* file;
-  std::atomic<uint64_t> offset;
-  size_t alignment;
-  BlockBuilder data_block;
-  // Buffers uncompressed data blocks to replay later. Needed when
-  // compression dictionary is enabled so we can finalize the dictionary before
-  // compressing any data blocks.
-  std::vector<std::string> data_block_buffers;
-  BlockBuilder range_del_block;
-
-  InternalKeySliceTransform internal_prefix_transform;
-  std::unique_ptr<IndexBuilder> index_builder;
-  std::string index_separator_scratch;
-  PartitionedIndexBuilder* p_index_builder_ = nullptr;
-
-  std::string last_ikey;  // Internal key or empty (unset)
-  const Slice* first_key_in_next_block = nullptr;
-  bool warm_cache = false;
-
-  uint64_t sample_for_compression;
-  std::atomic<uint64_t> compressible_input_data_bytes;
-  std::atomic<uint64_t> uncompressible_input_data_bytes;
-  std::atomic<uint64_t> sampled_input_data_bytes;
-  std::atomic<uint64_t> sampled_output_slow_data_bytes;
-  std::atomic<uint64_t> sampled_output_fast_data_bytes;
-  uint32_t compression_parallel_threads;
-  int max_compressed_bytes_per_kb;
-  size_t max_dict_sample_bytes = 0;
-
-  // *** Compressors & decompressors - Yes, it seems like a lot here but ***
-  // *** these are distinct fields to minimize extra conditionals and    ***
-  // *** field reads on hot code paths.                                  ***
+struct BlockBasedTableBuilder::ParallelCompressionRep {
+  // TODO: consider replacing with autovector or similar
+  // Keys is a wrapper of vector of strings avoiding
+  // releasing string memories during vector clear()
+  // in order to save memory allocation overhead
+  class Keys {
+   public:
+    Keys() : keys_(kKeysInitSize), size_(0) {}
+    void PushBack(const Slice& key) {
+      if (size_ == keys_.size()) {
+        keys_.emplace_back(key.data(), key.size());
+      } else {
+        keys_[size_].assign(key.data(), key.size());
+      }
+      size_++;
+    }
+    void SwapAssign(std::vector<std::string>& keys) {
+      size_ = keys.size();
+      std::swap(keys_, keys);
+    }
+    void Clear() { size_ = 0; }
+    size_t Size() { return size_; }
+    std::string& Back() { return keys_[size_ - 1]; }
+    std::string& operator[](size_t idx) {
+      assert(idx < size_);
+      return keys_[idx];
+    }
 
-  // A compressor for blocks in general, without dictionary compression
-  std::unique_ptr<Compressor> basic_compressor;
-  // A compressor using dictionary compression (when applicable)
-  std::unique_ptr<Compressor> compressor_with_dict;
-  // Once configured/determined, points to one of the above Compressors to
-  // use on data blocks.
-  Compressor* data_block_compressor = nullptr;
-  // A decompressor corresponding to basic_compressor (when non-nullptr).
-  // Used for verification and cache warming.
-  std::shared_ptr<Decompressor> basic_decompressor;
-  // When needed, a decompressor for verifying compression using a
-  // dictionary sampled/trained from this file.
-  std::unique_ptr<Decompressor> verify_decompressor_with_dict;
-  // When non-nullptr, compression should be verified with this corresponding
-  // decompressor, except for data blocks. (Points to same as basic_decompressor
-  // when verify_compression is set.)
-  UnownedPtr<Decompressor> verify_decompressor;
-  // Once configured/determined, points to one of the above Decompressors to use
-  // in verifying data blocks.
-  UnownedPtr<Decompressor> data_block_verify_decompressor;
+   private:
+    static constexpr size_t kKeysInitSize = 32;
+    std::vector<std::string> keys_;
+    size_t size_;
+  };
+  Keys curr_block_keys;
 
-  // Working area for basic_compressor when compression_parallel_threads==1
-  WorkingAreaPair basic_working_area;
-  // Working areas for data_block_compressor, for each of
-  // compression_parallel_threads
-  std::vector<WorkingAreaPair> data_block_working_areas;
+  struct BlockRep;
 
-  size_t data_begin_offset = 0;
+  // Use BlockRepSlot to keep block order in write thread.
+  // slot_ will pass references to BlockRep
+  class BlockRepSlot {
+   public:
+    BlockRepSlot() : slot_(1) {}
+    template <typename T>
+    void Fill(T&& rep) {
+      slot_.push(std::forward<T>(rep));
+    }
+    void Take(BlockRep*& rep) { slot_.pop(rep); }
 
-  TableProperties props;
+   private:
+    // slot_ will pass references to BlockRep in block_rep_buf,
+    // and those references are always valid before the destruction of
+    // block_rep_buf.
+    WorkQueue<BlockRep*> slot_;
+  };
 
-  // States of the builder.
-  //
-  // - `kBuffered`: This is the initial state where zero or more data blocks are
-  //   accumulated uncompressed in-memory. From this state, call
-  //   `EnterUnbuffered()` to finalize the compression dictionary if enabled,
-  //   compress/write out any buffered blocks, and proceed to the `kUnbuffered`
-  //   state.
-  //
-  // - `kUnbuffered`: This is the state when compression dictionary is finalized
-  //   either because it wasn't enabled in the first place or it's been created
-  //   from sampling previously buffered data. In this state, blocks are simply
-  //   compressed/written out as they fill up. From this state, call `Finish()`
-  //   to complete the file (write meta-blocks, etc.), or `Abandon()` to delete
-  //   the partially created file.
-  //
-  // - `kClosed`: This indicates either `Finish()` or `Abandon()` has been
-  //   called, so the table builder is no longer usable. We must be in this
-  //   state by the time the destructor runs.
-  enum class State {
-    kBuffered,
-    kUnbuffered,
-    kClosed,
+  // BlockRep instances are fetched from and recycled to
+  // block_rep_pool during parallel compression.
+  struct ALIGN_AS(CACHE_LINE_SIZE) BlockRep {
+    // Uncompressed block contents
+    std::string uncompressed;
+    std::string compressed;
+    CompressionType compression_type = kNoCompression;
+    // For efficiency, the std::string is repeatedly overwritten without
+    // checking for "has no value". Only at the end of its life will it be
+    // assigned "no value". Thus, it needs to start with a value.
+    std::optional<std::string> first_key_in_next_block = std::string{};
+    Keys keys;
+    BlockRepSlot slot;
+    Status status;
   };
-  State state = State::kUnbuffered;
-  // `kBuffered` state is allowed only as long as the buffering of uncompressed
-  // data blocks (see `data_block_buffers`) does not exceed `buffer_limit`.
-  uint64_t buffer_limit = 0;
-  std::shared_ptr<CacheReservationManager>
-      compression_dict_buffer_cache_res_mgr;
-  const bool use_delta_encoding_for_index_values;
-  std::unique_ptr<FilterBlockBuilder> filter_builder;
-  OffsetableCacheKey base_cache_key;
-  const TableFileCreationReason reason;
 
-  BlockHandle pending_handle;  // Handle to add to index block
+  // Use a vector of BlockRep as a buffer for a determined number
+  // of BlockRep structures. All data referenced by pointers in
+  // BlockRep will be freed when this vector is destructed.
+  using BlockRepBuffer = std::vector<BlockRep>;
+  BlockRepBuffer block_rep_buf;
+  // Use a thread-safe queue for concurrent access from block
+  // building thread and writer thread.
+  using BlockRepPool = WorkQueue<BlockRep*>;
+  BlockRepPool block_rep_pool;
 
-  std::string single_threaded_compressed_output;
-  std::unique_ptr<FlushBlockPolicy> flush_block_policy;
+  // Compression queue will pass references to BlockRep in block_rep_buf,
+  // and those references are always valid before the destruction of
+  // block_rep_buf.
+  using CompressQueue = WorkQueue<BlockRep*>;
+  CompressQueue compress_queue;
+  std::vector<port::Thread> compress_thread_pool;
 
-  std::vector<std::unique_ptr<InternalTblPropColl>> table_properties_collectors;
+  // Write queue will pass references to BlockRep::slot in block_rep_buf,
+  // and those references are always valid before the corresponding
+  // BlockRep::slot is destructed, which is before the destruction of
+  // block_rep_buf.
+  using WriteQueue = WorkQueue<BlockRepSlot*>;
+  WriteQueue write_queue;
+  std::unique_ptr<port::Thread> write_thread;
 
-  std::unique_ptr<ParallelCompressionRep> pc_rep;
-  BlockCreateContext create_context;
+  // Estimate output file size when parallel compression is enabled. This is
+  // necessary because compression & flush are no longer synchronized,
+  // and BlockBasedTableBuilder::FileSize() is no longer accurate.
+  // memory_order_relaxed suffices because accurate statistics is not required.
+  class FileSizeEstimator {
+   public:
+    explicit FileSizeEstimator()
+        : uncomp_bytes_compressed(0),
+          uncomp_bytes_curr_block(0),
+          uncomp_bytes_curr_block_set(false),
+          uncomp_bytes_inflight(0),
+          blocks_inflight(0),
+          curr_compression_ratio(0),
+          estimated_file_size(0) {}
 
-  // The size of the "tail" part of a SST file. "Tail" refers to
-  // all blocks after data blocks till the end of the SST file.
-  uint64_t tail_size;
+    // Estimate file size when a block is about to be emitted to
+    // compression thread
+    void EmitBlock(uint64_t uncomp_block_size, uint64_t curr_file_size) {
+      uint64_t new_uncomp_bytes_inflight =
+          uncomp_bytes_inflight.fetch_add(uncomp_block_size,
+                                          std::memory_order_relaxed) +
+          uncomp_block_size;
 
-  // The total size of all blocks in this file before they are compressed.
-  // This is used for logging compaction stats.
-  uint64_t pre_compression_size = 0;
+      uint64_t new_blocks_inflight =
+          blocks_inflight.fetch_add(1, std::memory_order_relaxed) + 1;
 
-  // See class Footer
-  uint32_t base_context_checksum;
+      estimated_file_size.store(
+          curr_file_size +
+              static_cast<uint64_t>(
+                  static_cast<double>(new_uncomp_bytes_inflight) *
+                  curr_compression_ratio.load(std::memory_order_relaxed)) +
+              new_blocks_inflight * kBlockTrailerSize,
+          std::memory_order_relaxed);
+    }
 
-  uint64_t get_offset() { return offset.load(std::memory_order_relaxed); }
-  void set_offset(uint64_t o) { offset.store(o, std::memory_order_relaxed); }
+    // Estimate file size when a block is already reaped from
+    // compression thread
+    void ReapBlock(uint64_t compressed_block_size, uint64_t curr_file_size) {
+      assert(uncomp_bytes_curr_block_set);
 
-  bool IsParallelCompressionEnabled() const {
-    return compression_parallel_threads > 1;
-  }
+      uint64_t new_uncomp_bytes_compressed =
+          uncomp_bytes_compressed + uncomp_bytes_curr_block;
+      assert(new_uncomp_bytes_compressed > 0);
 
-  Status GetStatus() {
-    // We need to make modifications of status visible when status_ok is set
-    // to false, and this is ensured by status_mutex, so no special memory
-    // order for status_ok is required.
-    if (status_ok.load(std::memory_order_relaxed)) {
-      return Status::OK();
-    } else {
-      return CopyStatus();
+      curr_compression_ratio.store(
+          (curr_compression_ratio.load(std::memory_order_relaxed) *
+               uncomp_bytes_compressed +
+           compressed_block_size) /
+              static_cast<double>(new_uncomp_bytes_compressed),
+          std::memory_order_relaxed);
+      uncomp_bytes_compressed = new_uncomp_bytes_compressed;
+
+      uint64_t new_uncomp_bytes_inflight =
+          uncomp_bytes_inflight.fetch_sub(uncomp_bytes_curr_block,
+                                          std::memory_order_relaxed) -
+          uncomp_bytes_curr_block;
+
+      uint64_t new_blocks_inflight =
+          blocks_inflight.fetch_sub(1, std::memory_order_relaxed) - 1;
+
+      estimated_file_size.store(
+          curr_file_size +
+              static_cast<uint64_t>(
+                  static_cast<double>(new_uncomp_bytes_inflight) *
+                  curr_compression_ratio.load(std::memory_order_relaxed)) +
+              new_blocks_inflight * kBlockTrailerSize,
+          std::memory_order_relaxed);
+
+      uncomp_bytes_curr_block_set = false;
     }
-  }
 
-  Status CopyStatus() {
-    std::lock_guard<std::mutex> lock(status_mutex);
-    return status;
-  }
+    void SetEstimatedFileSize(uint64_t size) {
+      estimated_file_size.store(size, std::memory_order_relaxed);
+    }
 
-  IOStatus GetIOStatus() {
-    // We need to make modifications of io_status visible when status_ok is set
-    // to false, and this is ensured by io_status_mutex, so no special memory
-    // order for io_status_ok is required.
-    if (io_status_ok.load(std::memory_order_relaxed)) {
-#ifdef ROCKSDB_ASSERT_STATUS_CHECKED  // Avoid unnecessary lock acquisition
-      auto ios = CopyIOStatus();
-      ios.PermitUncheckedError();
-      // Assume no races in unit tests
-      assert(ios.ok());
-#endif  // ROCKSDB_ASSERT_STATUS_CHECKED
-      return IOStatus::OK();
-    } else {
-      return CopyIOStatus();
+    uint64_t GetEstimatedFileSize() {
+      return estimated_file_size.load(std::memory_order_relaxed);
+    }
+
+    void SetCurrBlockUncompSize(uint64_t size) {
+      uncomp_bytes_curr_block = size;
+      uncomp_bytes_curr_block_set = true;
+    }
+
+   private:
+    // Input bytes compressed so far.
+    uint64_t uncomp_bytes_compressed;
+    // Size of current block being appended.
+    uint64_t uncomp_bytes_curr_block;
+    // Whether uncomp_bytes_curr_block has been set for next
+    // ReapBlock call.
+    bool uncomp_bytes_curr_block_set;
+    // Input bytes under compression and not appended yet.
+    std::atomic<uint64_t> uncomp_bytes_inflight;
+    // Number of blocks under compression and not appended yet.
+    std::atomic<uint64_t> blocks_inflight;
+    // Current compression ratio, maintained by BGWorkWriteMaybeCompressedBlock.
+    std::atomic<double> curr_compression_ratio;
+    // Estimated SST file size.
+    std::atomic<uint64_t> estimated_file_size;
+  };
+  FileSizeEstimator file_size_estimator;
+
+  // Facilities used for waiting first block completion. Need to Wait for
+  // the completion of first block compression and flush to get a non-zero
+  // compression ratio.
+  std::atomic<bool> first_block_processed;
+  std::condition_variable first_block_cond;
+  std::mutex first_block_mutex;
+
+  explicit ParallelCompressionRep(uint32_t parallel_threads)
+      : block_rep_buf(parallel_threads),
+        block_rep_pool(parallel_threads),
+        compress_queue(parallel_threads),
+        write_queue(parallel_threads),
+        first_block_processed(false) {
+    for (uint32_t i = 0; i < parallel_threads; i++) {
+      // Prime the queue of available BlockReps
+      block_rep_pool.push(&block_rep_buf[i]);
     }
   }
 
-  IOStatus CopyIOStatus() {
-    std::lock_guard<std::mutex> lock(io_status_mutex);
-    return io_status;
+  ~ParallelCompressionRep() { block_rep_pool.finish(); }
+
+  // Make a block prepared to be emitted to compression thread
+  // Used in non-buffered mode
+  BlockRep* PrepareBlock(const Slice* first_key_in_next_block,
+                         BlockBuilder* data_block) {
+    BlockRep* block_rep = PrepareBlockInternal(first_key_in_next_block);
+    assert(block_rep != nullptr);
+    data_block->SwapAndReset(block_rep->uncompressed);
+    std::swap(block_rep->keys, curr_block_keys);
+    curr_block_keys.Clear();
+    return block_rep;
   }
 
-  // Never erase an existing status that is not OK.
-  void SetStatus(Status s) {
-    if (!s.ok() && status_ok.load(std::memory_order_relaxed)) {
-      // Locking is an overkill for non compression_parallel_threads
-      // case but since it's unlikely that s is not OK, we take this cost
-      // to be simplicity.
-      std::lock_guard<std::mutex> lock(status_mutex);
-      status = s;
-      status_ok.store(false, std::memory_order_relaxed);
+  // Used in EnterUnbuffered
+  BlockRep* PrepareBlock(const Slice* first_key_in_next_block,
+                         std::string* data_block,
+                         std::vector<std::string>* keys) {
+    BlockRep* block_rep = PrepareBlockInternal(first_key_in_next_block);
+    assert(block_rep != nullptr);
+    std::swap(block_rep->uncompressed, *data_block);
+    block_rep->keys.SwapAssign(*keys);
+    return block_rep;
+  }
+
+  // Emit a block to compression thread
+  void EmitBlock(BlockRep* block_rep) {
+    assert(block_rep != nullptr);
+    assert(block_rep->status.ok());
+    if (!write_queue.push(&block_rep->slot)) {
+      return;
+    }
+    if (!compress_queue.push(block_rep)) {
+      return;
+    }
+
+    if (!first_block_processed.load(std::memory_order_relaxed)) {
+      std::unique_lock<std::mutex> lock(first_block_mutex);
+      first_block_cond.wait(lock, [this] {
+        return first_block_processed.load(std::memory_order_relaxed);
+      });
     }
   }
 
-  // Never erase an existing I/O status that is not OK.
-  // Calling this will also SetStatus(ios)
-  void SetIOStatus(IOStatus ios) {
-    if (!ios.ok() && io_status_ok.load(std::memory_order_relaxed)) {
-      // Locking is an overkill for non compression_parallel_threads
-      // case but since it's unlikely that s is not OK, we take this cost
-      // to be simplicity.
-      std::lock_guard<std::mutex> lock(io_status_mutex);
-      io_status = ios;
-      io_status_ok.store(false, std::memory_order_relaxed);
+  // Reap a block from compression thread
+  void ReapBlock(BlockRep* block_rep) {
+    assert(block_rep != nullptr);
+    block_rep->compressed.clear();
+    block_rep_pool.push(block_rep);
+
+    if (!first_block_processed.load(std::memory_order_relaxed)) {
+      std::lock_guard<std::mutex> lock(first_block_mutex);
+      first_block_processed.store(true, std::memory_order_relaxed);
+      first_block_cond.notify_one();
     }
-    SetStatus(ios);
   }
 
-  Rep(const BlockBasedTableOptions& table_opt, const TableBuilderOptions& tbo,
-      WritableFileWriter* f)
-      : ioptions(tbo.ioptions),
-        prefix_extractor(tbo.moptions.prefix_extractor),
-        write_options(tbo.write_options),
-        table_options(table_opt),
-        internal_comparator(tbo.internal_comparator),
-        ts_sz(tbo.internal_comparator.user_comparator()->timestamp_size()),
-        persist_user_defined_timestamps(
-            tbo.ioptions.persist_user_defined_timestamps),
-        file(f),
-        offset(0),
-        alignment(table_options.block_align
-                      ? std::min(static_cast<size_t>(table_options.block_size),
-                                 kDefaultPageSize)
-                      : 0),
-        data_block(table_options.block_restart_interval,
-                   table_options.use_delta_encoding,
-                   false /* use_value_delta_encoding */,
-                   tbo.internal_comparator.user_comparator()
-                           ->CanKeysWithDifferentByteContentsBeEqual()
-                       ? BlockBasedTableOptions::kDataBlockBinarySearch
-                       : table_options.data_block_index_type,
-                   table_options.data_block_hash_table_util_ratio, ts_sz,
-                   persist_user_defined_timestamps),
-        range_del_block(
-            1 /* block_restart_interval */, true /* use_delta_encoding */,
-            false /* use_value_delta_encoding */,
-            BlockBasedTableOptions::kDataBlockBinarySearch /* index_type */,
-            0.75 /* data_block_hash_table_util_ratio */, ts_sz,
-            persist_user_defined_timestamps),
-        internal_prefix_transform(prefix_extractor.get()),
-        sample_for_compression(tbo.moptions.sample_for_compression),
-        compressible_input_data_bytes(0),
-        uncompressible_input_data_bytes(0),
-        sampled_input_data_bytes(0),
-        sampled_output_slow_data_bytes(0),
-        sampled_output_fast_data_bytes(0),
-        compression_parallel_threads(tbo.compression_opts.parallel_threads),
-        max_compressed_bytes_per_kb(
-            tbo.compression_opts.max_compressed_bytes_per_kb),
-        data_block_working_areas(compression_parallel_threads),
-        use_delta_encoding_for_index_values(table_opt.format_version >= 4 &&
-                                            !table_opt.block_align),
-        reason(tbo.reason),
-        flush_block_policy(
-            table_options.flush_block_policy_factory->NewFlushBlockPolicy(
-                table_options, data_block)),
-        create_context(&table_options, &ioptions, ioptions.stats,
-                       /*decompressor=*/nullptr,
-                       tbo.moptions.block_protection_bytes_per_key,
-                       tbo.internal_comparator.user_comparator(),
-                       !use_delta_encoding_for_index_values,
-                       table_opt.index_type ==
-                           BlockBasedTableOptions::kBinarySearchWithFirstKey),
-        tail_size(0),
-        status_ok(true),
-        io_status_ok(true) {
-    FilterBuildingContext filter_context(table_options);
+ private:
+  BlockRep* PrepareBlockInternal(const Slice* first_key_in_next_block) {
+    BlockRep* block_rep = nullptr;
+    block_rep_pool.pop(block_rep);
+    assert(block_rep != nullptr);
 
-    filter_context.info_log = ioptions.logger;
-    filter_context.column_family_name = tbo.column_family_name;
-    filter_context.reason = reason;
+    block_rep->compression_type = kNoCompression;
 
-    // Only populate other fields if known to be in LSM rather than
-    // generating external SST file
-    if (reason != TableFileCreationReason::kMisc) {
-      filter_context.compaction_style = ioptions.compaction_style;
-      filter_context.num_levels = ioptions.num_levels;
-      filter_context.level_at_creation = tbo.level_at_creation;
-      filter_context.is_bottommost = tbo.is_bottommost;
-      assert(filter_context.level_at_creation < filter_context.num_levels);
+    if (first_key_in_next_block == nullptr) {
+      block_rep->first_key_in_next_block = {};
+    } else {
+      block_rep->first_key_in_next_block->assign(
+          first_key_in_next_block->data(), first_key_in_next_block->size());
     }
 
-    // TODO: get CompressionManager from options and sort out properties
-    auto mgr = tbo.moptions.compression_manager;
-    if (mgr == nullptr) {
-      mgr = GetBuiltinCompressionManager(
-          GetCompressFormatForVersion(table_opt.format_version));
-    }
-    props.compression_name = CompressionTypeToString(tbo.compression_type);
-    props.compression_options =
-        CompressionOptionsToString(tbo.compression_opts);
-
-    // Sanitize to only allowing compression when it saves space.
-    max_compressed_bytes_per_kb =
-        std::min(int{1023}, tbo.compression_opts.max_compressed_bytes_per_kb);
-
-    basic_compressor = mgr->GetCompressorForSST(
-        filter_context, tbo.compression_opts, tbo.compression_type);
-    if (basic_compressor) {
-      if (table_options.enable_index_compression) {
-        basic_working_area.compress = basic_compressor->ObtainWorkingArea();
-      }
-      max_dict_sample_bytes = basic_compressor->GetMaxSampleSizeIfWantDict(
-          CacheEntryRole::kDataBlock);
-      if (max_dict_sample_bytes > 0) {
-        state = State::kBuffered;
-        if (tbo.target_file_size == 0) {
-          buffer_limit = tbo.compression_opts.max_dict_buffer_bytes;
-        } else if (tbo.compression_opts.max_dict_buffer_bytes == 0) {
-          buffer_limit = tbo.target_file_size;
-        } else {
-          buffer_limit = std::min(tbo.target_file_size,
-                                  tbo.compression_opts.max_dict_buffer_bytes);
-        }
-      } else {
-        // No distinct data block compressor using dictionary
-        data_block_compressor = basic_compressor.get();
-        for (uint32_t i = 0; i < compression_parallel_threads; i++) {
-          data_block_working_areas[i].compress =
-              data_block_compressor->ObtainWorkingArea();
-        }
-      }
-      basic_decompressor =
-          mgr->GetDecompressorOptimizeFor(tbo.compression_type);
-      create_context.decompressor = basic_decompressor.get();
-
-      if (table_options.verify_compression) {
-        verify_decompressor = basic_decompressor.get();
-        if (table_options.enable_index_compression) {
-          basic_working_area.verify =
-              verify_decompressor->ObtainWorkingArea(tbo.compression_type);
-        }
-        if (state == State::kUnbuffered) {
-          for (uint32_t i = 0; i < compression_parallel_threads; i++) {
-            data_block_working_areas[i].verify =
-                verify_decompressor->ObtainWorkingArea(tbo.compression_type);
-          }
-          data_block_verify_decompressor = verify_decompressor.get();
-        }
-      }
-    }
+    return block_rep;
+  }
+};
 
-    switch (table_options.prepopulate_block_cache) {
-      case BlockBasedTableOptions::PrepopulateBlockCache::kFlushOnly:
-        warm_cache = (reason == TableFileCreationReason::kFlush);
-        break;
-      case BlockBasedTableOptions::PrepopulateBlockCache::kDisable:
-        warm_cache = false;
-        break;
-      default:
-        // missing case
-        assert(false);
-        warm_cache = false;
-    }
+struct BlockBasedTableBuilder::Rep {
+  const ImmutableOptions ioptions;
+  // BEGIN from MutableCFOptions
+  std::shared_ptr<const SliceTransform> prefix_extractor;
+  // END from MutableCFOptions
+  const WriteOptions write_options;
+  const BlockBasedTableOptions table_options;
+  const InternalKeyComparator& internal_comparator;
+  // Size in bytes for the user-defined timestamps.
+  size_t ts_sz;
+  // When `ts_sz` > 0 and this flag is false, the user-defined timestamp in the
+  // user key will be stripped when creating the block based table. This
+  // stripping happens for all user keys, including the keys in data block,
+  // index block for data block, index block for index block (if index type is
+  // `kTwoLevelIndexSearch`), index for filter blocks (if using partitioned
+  // filters), the `first_internal_key` in `IndexValue`, the `end_key` for range
+  // deletion entries.
+  // As long as the user keys are sorted when added via `Add` API, their logic
+  // ordering won't change after timestamps are stripped. However, for each user
+  // key to be logically equivalent before and after timestamp is stripped, the
+  // user key should contain the minimum timestamp.
+  bool persist_user_defined_timestamps;
+  WritableFileWriter* file;
+  std::atomic<uint64_t> offset;
+  size_t alignment;
+  BlockBuilder data_block;
+  // Buffers uncompressed data blocks to replay later. Needed when
+  // compression dictionary is enabled so we can finalize the dictionary before
+  // compressing any data blocks.
+  std::vector<std::string> data_block_buffers;
+  BlockBuilder range_del_block;
 
-    const auto compress_dict_build_buffer_charged =
-        table_options.cache_usage_options.options_overrides
-            .at(CacheEntryRole::kCompressionDictionaryBuildingBuffer)
-            .charged;
-    if (table_options.block_cache &&
-        (compress_dict_build_buffer_charged ==
-             CacheEntryRoleOptions::Decision::kEnabled ||
-         compress_dict_build_buffer_charged ==
-             CacheEntryRoleOptions::Decision::kFallback)) {
-      compression_dict_buffer_cache_res_mgr =
-          std::make_shared<CacheReservationManagerImpl<
-              CacheEntryRole::kCompressionDictionaryBuildingBuffer>>(
-              table_options.block_cache);
-    } else {
-      compression_dict_buffer_cache_res_mgr = nullptr;
-    }
+  InternalKeySliceTransform internal_prefix_transform;
+  std::unique_ptr<IndexBuilder> index_builder;
+  std::string index_separator_scratch;
+  PartitionedIndexBuilder* p_index_builder_ = nullptr;
 
-    if (table_options.index_type ==
-        BlockBasedTableOptions::kTwoLevelIndexSearch) {
-      p_index_builder_ = PartitionedIndexBuilder::CreateIndexBuilder(
-          &internal_comparator, use_delta_encoding_for_index_values,
-          table_options, ts_sz, persist_user_defined_timestamps);
-      index_builder.reset(p_index_builder_);
-    } else {
-      index_builder.reset(IndexBuilder::CreateIndexBuilder(
-          table_options.index_type, &internal_comparator,
-          &this->internal_prefix_transform, use_delta_encoding_for_index_values,
-          table_options, ts_sz, persist_user_defined_timestamps));
-    }
-    if (ioptions.optimize_filters_for_hits && tbo.is_bottommost) {
-      // Apply optimize_filters_for_hits setting here when applicable by
-      // skipping filter generation
-      filter_builder.reset();
-    } else if (tbo.skip_filters) {
-      // For SstFileWriter skip_filters
-      filter_builder.reset();
-    } else if (!table_options.filter_policy) {
-      // Null filter_policy -> no filter
-      filter_builder.reset();
-    } else {
-      filter_builder.reset(CreateFilterBlockBuilder(
-          ioptions, tbo.moptions, filter_context,
-          use_delta_encoding_for_index_values, p_index_builder_, ts_sz,
-          persist_user_defined_timestamps));
-    }
+  std::string last_ikey;  // Internal key or empty (unset)
+  const Slice* first_key_in_next_block = nullptr;
+  bool warm_cache = false;
 
-    assert(tbo.internal_tbl_prop_coll_factories);
-    for (auto& factory : *tbo.internal_tbl_prop_coll_factories) {
-      assert(factory);
+  uint64_t sample_for_compression;
+  std::atomic<uint64_t> compressible_input_data_bytes;
+  std::atomic<uint64_t> uncompressible_input_data_bytes;
+  std::atomic<uint64_t> sampled_input_data_bytes;
+  std::atomic<uint64_t> sampled_output_slow_data_bytes;
+  std::atomic<uint64_t> sampled_output_fast_data_bytes;
+  uint32_t compression_parallel_threads;
+  int max_compressed_bytes_per_kb;
+  size_t max_dict_sample_bytes = 0;
 
-      std::unique_ptr<InternalTblPropColl> collector{
-          factory->CreateInternalTblPropColl(
-              tbo.column_family_id, tbo.level_at_creation,
-              tbo.ioptions.num_levels,
-              tbo.last_level_inclusive_max_seqno_threshold)};
-      if (collector) {
-        table_properties_collectors.emplace_back(std::move(collector));
-      }
-    }
-    table_properties_collectors.emplace_back(
-        new BlockBasedTablePropertiesCollector(
-            table_options.index_type, table_options.whole_key_filtering,
-            prefix_extractor != nullptr,
-            table_options.decouple_partitioned_filters));
-    if (ts_sz > 0 && persist_user_defined_timestamps) {
-      table_properties_collectors.emplace_back(
-          new TimestampTablePropertiesCollector(
-              tbo.internal_comparator.user_comparator()));
-    }
+  // *** Compressors & decompressors - Yes, it seems like a lot here but ***
+  // *** these are distinct fields to minimize extra conditionals and    ***
+  // *** field reads on hot code paths.                                  ***
 
-    // These are only needed for populating table properties
-    props.column_family_id = tbo.column_family_id;
-    props.column_family_name = tbo.column_family_name;
-    props.oldest_key_time = tbo.oldest_key_time;
-    props.newest_key_time = tbo.newest_key_time;
-    props.file_creation_time = tbo.file_creation_time;
-    props.orig_file_number = tbo.cur_file_num;
-    props.db_id = tbo.db_id;
-    props.db_session_id = tbo.db_session_id;
-    props.db_host_id = ioptions.db_host_id;
-    props.format_version = table_options.format_version;
-    if (!ReifyDbHostIdProperty(ioptions.env, &props.db_host_id).ok()) {
-      ROCKS_LOG_INFO(ioptions.logger, "db_host_id property will not be set");
-    }
-    // Default is UINT64_MAX for unknown. Setting it to 0 here
-    // to allow updating it by taking max in BlockBasedTableBuilder::Add().
-    props.key_largest_seqno = 0;
+  // A compressor for blocks in general, without dictionary compression
+  std::unique_ptr<Compressor> basic_compressor;
+  // A compressor using dictionary compression (when applicable)
+  std::unique_ptr<Compressor> compressor_with_dict;
+  // Once configured/determined, points to one of the above Compressors to
+  // use on data blocks.
+  Compressor* data_block_compressor = nullptr;
+  // A decompressor corresponding to basic_compressor (when non-nullptr).
+  // Used for verification and cache warming.
+  std::shared_ptr<Decompressor> basic_decompressor;
+  // When needed, a decompressor for verifying compression using a
+  // dictionary sampled/trained from this file.
+  std::unique_ptr<Decompressor> verify_decompressor_with_dict;
+  // When non-nullptr, compression should be verified with this corresponding
+  // decompressor, except for data blocks. (Points to same as basic_decompressor
+  // when verify_compression is set.)
+  UnownedPtr<Decompressor> verify_decompressor;
+  // Once configured/determined, points to one of the above Decompressors to use
+  // in verifying data blocks.
+  UnownedPtr<Decompressor> data_block_verify_decompressor;
 
-    if (FormatVersionUsesContextChecksum(table_options.format_version)) {
-      // Must be non-zero and semi- or quasi-random
-      // TODO: ideally guaranteed different for related files (e.g. use file
-      // number and db_session, for benefit of SstFileWriter)
-      do {
-        base_context_checksum = Random::GetTLSInstance()->Next();
-      } while (UNLIKELY(base_context_checksum == 0));
-    } else {
-      base_context_checksum = 0;
-    }
+  // Working area for basic_compressor when compression_parallel_threads==1
+  WorkingAreaPair basic_working_area;
+  // Working areas for data_block_compressor, for each of
+  // compression_parallel_threads
+  std::vector<WorkingAreaPair> data_block_working_areas;
 
-    if (alignment > 0 && basic_compressor) {
-      // With better sanitization in `CompactionPicker::CompactFiles()`, we
-      // would not need to handle this case here and could change it to an
-      // assertion instead.
-      SetStatus(Status::InvalidArgument(
-          "Enable block_align, but compression enabled"));
-    }
-  }
+  size_t data_begin_offset = 0;
 
-  Rep(const Rep&) = delete;
-  Rep& operator=(const Rep&) = delete;
+  TableProperties props;
 
- private:
-  // Synchronize status & io_status accesses across threads from main thread,
-  // compression thread and write thread in parallel compression.
-  std::mutex status_mutex;
-  std::atomic<bool> status_ok;
-  Status status;
-  std::mutex io_status_mutex;
-  std::atomic<bool> io_status_ok;
-  IOStatus io_status;
-};
+  // States of the builder.
+  //
+  // - `kBuffered`: This is the initial state where zero or more data blocks are
+  //   accumulated uncompressed in-memory. From this state, call
+  //   `EnterUnbuffered()` to finalize the compression dictionary if enabled,
+  //   compress/write out any buffered blocks, and proceed to the `kUnbuffered`
+  //   state.
+  //
+  // - `kUnbuffered`: This is the state when compression dictionary is finalized
+  //   either because it wasn't enabled in the first place or it's been created
+  //   from sampling previously buffered data. In this state, blocks are simply
+  //   compressed/written out as they fill up. From this state, call `Finish()`
+  //   to complete the file (write meta-blocks, etc.), or `Abandon()` to delete
+  //   the partially created file.
+  //
+  // - `kClosed`: This indicates either `Finish()` or `Abandon()` has been
+  //   called, so the table builder is no longer usable. We must be in this
+  //   state by the time the destructor runs.
+  enum class State {
+    kBuffered,
+    kUnbuffered,
+    kClosed,
+  };
+  State state = State::kUnbuffered;
+  // `kBuffered` state is allowed only as long as the buffering of uncompressed
+  // data blocks (see `data_block_buffers`) does not exceed `buffer_limit`.
+  uint64_t buffer_limit = 0;
+  std::shared_ptr<CacheReservationManager>
+      compression_dict_buffer_cache_res_mgr;
+  const bool use_delta_encoding_for_index_values;
+  std::unique_ptr<FilterBlockBuilder> filter_builder;
+  OffsetableCacheKey base_cache_key;
+  const TableFileCreationReason reason;
 
-struct BlockBasedTableBuilder::ParallelCompressionRep {
-  // TODO: consider replacing with autovector or similar
-  // Keys is a wrapper of vector of strings avoiding
-  // releasing string memories during vector clear()
-  // in order to save memory allocation overhead
-  class Keys {
-   public:
-    Keys() : keys_(kKeysInitSize), size_(0) {}
-    void PushBack(const Slice& key) {
-      if (size_ == keys_.size()) {
-        keys_.emplace_back(key.data(), key.size());
-      } else {
-        keys_[size_].assign(key.data(), key.size());
-      }
-      size_++;
-    }
-    void SwapAssign(std::vector<std::string>& keys) {
-      size_ = keys.size();
-      std::swap(keys_, keys);
-    }
-    void Clear() { size_ = 0; }
-    size_t Size() { return size_; }
-    std::string& Back() { return keys_[size_ - 1]; }
-    std::string& operator[](size_t idx) {
-      assert(idx < size_);
-      return keys_[idx];
-    }
+  BlockHandle pending_handle;  // Handle to add to index block
 
-   private:
-    static constexpr size_t kKeysInitSize = 32;
-    std::vector<std::string> keys_;
-    size_t size_;
-  };
-  Keys curr_block_keys;
+  std::string single_threaded_compressed_output;
+  std::unique_ptr<FlushBlockPolicy> flush_block_policy;
 
-  struct BlockRep;
+  std::vector<std::unique_ptr<InternalTblPropColl>> table_properties_collectors;
 
-  // Use BlockRepSlot to keep block order in write thread.
-  // slot_ will pass references to BlockRep
-  class BlockRepSlot {
-   public:
-    BlockRepSlot() : slot_(1) {}
-    template <typename T>
-    void Fill(T&& rep) {
-      slot_.push(std::forward<T>(rep));
-    }
-    void Take(BlockRep*& rep) { slot_.pop(rep); }
+  std::unique_ptr<ParallelCompressionRep> pc_rep;
+  BlockCreateContext create_context;
 
-   private:
-    // slot_ will pass references to BlockRep in block_rep_buf,
-    // and those references are always valid before the destruction of
-    // block_rep_buf.
-    WorkQueue<BlockRep*> slot_;
-  };
+  // The size of the "tail" part of a SST file. "Tail" refers to
+  // all blocks after data blocks till the end of the SST file.
+  uint64_t tail_size;
 
-  // BlockRep instances are fetched from and recycled to
-  // block_rep_pool during parallel compression.
-  struct ALIGN_AS(CACHE_LINE_SIZE) BlockRep {
-    // Uncompressed block contents
-    std::string uncompressed;
-    std::string compressed;
-    CompressionType compression_type = kNoCompression;
-    // For efficiency, the std::string is repeatedly overwritten without
-    // checking for "has no value". Only at the end of its life will it be
-    // assigned "no value". Thus, it needs to start with a value.
-    std::optional<std::string> first_key_in_next_block = std::string{};
-    Keys keys;
-    BlockRepSlot slot;
-    Status status;
-  };
+  // The total size of all blocks in this file before they are compressed.
+  // This is used for logging compaction stats.
+  uint64_t pre_compression_size = 0;
 
-  // Use a vector of BlockRep as a buffer for a determined number
-  // of BlockRep structures. All data referenced by pointers in
-  // BlockRep will be freed when this vector is destructed.
-  using BlockRepBuffer = std::vector<BlockRep>;
-  BlockRepBuffer block_rep_buf;
-  // Use a thread-safe queue for concurrent access from block
-  // building thread and writer thread.
-  using BlockRepPool = WorkQueue<BlockRep*>;
-  BlockRepPool block_rep_pool;
+  // See class Footer
+  uint32_t base_context_checksum;
 
-  // Compression queue will pass references to BlockRep in block_rep_buf,
-  // and those references are always valid before the destruction of
-  // block_rep_buf.
-  using CompressQueue = WorkQueue<BlockRep*>;
-  CompressQueue compress_queue;
-  std::vector<port::Thread> compress_thread_pool;
+  uint64_t get_offset() { return offset.load(std::memory_order_relaxed); }
+  void set_offset(uint64_t o) { offset.store(o, std::memory_order_relaxed); }
 
-  // Write queue will pass references to BlockRep::slot in block_rep_buf,
-  // and those references are always valid before the corresponding
-  // BlockRep::slot is destructed, which is before the destruction of
-  // block_rep_buf.
-  using WriteQueue = WorkQueue<BlockRepSlot*>;
-  WriteQueue write_queue;
-  std::unique_ptr<port::Thread> write_thread;
+  bool IsParallelCompressionEnabled() const {
+    return compression_parallel_threads > 1;
+  }
 
-  // Estimate output file size when parallel compression is enabled. This is
-  // necessary because compression & flush are no longer synchronized,
-  // and BlockBasedTableBuilder::FileSize() is no longer accurate.
-  // memory_order_relaxed suffices because accurate statistics is not required.
-  class FileSizeEstimator {
-   public:
-    explicit FileSizeEstimator()
-        : uncomp_bytes_compressed(0),
-          uncomp_bytes_curr_block(0),
-          uncomp_bytes_curr_block_set(false),
-          uncomp_bytes_inflight(0),
-          blocks_inflight(0),
-          curr_compression_ratio(0),
-          estimated_file_size(0) {}
+  Status GetStatus() {
+    // We need to make modifications of status visible when status_ok is set
+    // to false, and this is ensured by status_mutex, so no special memory
+    // order for status_ok is required.
+    if (status_ok.load(std::memory_order_relaxed)) {
+      return Status::OK();
+    } else {
+      return CopyStatus();
+    }
+  }
 
-    // Estimate file size when a block is about to be emitted to
-    // compression thread
-    void EmitBlock(uint64_t uncomp_block_size, uint64_t curr_file_size) {
-      uint64_t new_uncomp_bytes_inflight =
-          uncomp_bytes_inflight.fetch_add(uncomp_block_size,
-                                          std::memory_order_relaxed) +
-          uncomp_block_size;
+  Status CopyStatus() {
+    std::lock_guard<std::mutex> lock(status_mutex);
+    return status;
+  }
 
-      uint64_t new_blocks_inflight =
-          blocks_inflight.fetch_add(1, std::memory_order_relaxed) + 1;
+  IOStatus GetIOStatus() {
+    // We need to make modifications of io_status visible when status_ok is set
+    // to false, and this is ensured by io_status_mutex, so no special memory
+    // order for io_status_ok is required.
+    if (io_status_ok.load(std::memory_order_relaxed)) {
+#ifdef ROCKSDB_ASSERT_STATUS_CHECKED  // Avoid unnecessary lock acquisition
+      auto ios = CopyIOStatus();
+      ios.PermitUncheckedError();
+      // Assume no races in unit tests
+      assert(ios.ok());
+#endif  // ROCKSDB_ASSERT_STATUS_CHECKED
+      return IOStatus::OK();
+    } else {
+      return CopyIOStatus();
+    }
+  }
 
-      estimated_file_size.store(
-          curr_file_size +
-              static_cast<uint64_t>(
-                  static_cast<double>(new_uncomp_bytes_inflight) *
-                  curr_compression_ratio.load(std::memory_order_relaxed)) +
-              new_blocks_inflight * kBlockTrailerSize,
-          std::memory_order_relaxed);
+  IOStatus CopyIOStatus() {
+    std::lock_guard<std::mutex> lock(io_status_mutex);
+    return io_status;
+  }
+
+  // Never erase an existing status that is not OK.
+  void SetStatus(Status s) {
+    if (!s.ok() && status_ok.load(std::memory_order_relaxed)) {
+      // Locking is an overkill for non compression_parallel_threads
+      // case but since it's unlikely that s is not OK, we take this cost
+      // to be simplicity.
+      std::lock_guard<std::mutex> lock(status_mutex);
+      status = s;
+      status_ok.store(false, std::memory_order_relaxed);
     }
+  }
 
-    // Estimate file size when a block is already reaped from
-    // compression thread
-    void ReapBlock(uint64_t compressed_block_size, uint64_t curr_file_size) {
-      assert(uncomp_bytes_curr_block_set);
+  // Never erase an existing I/O status that is not OK.
+  // Calling this will also SetStatus(ios)
+  void SetIOStatus(IOStatus ios) {
+    if (!ios.ok() && io_status_ok.load(std::memory_order_relaxed)) {
+      // Locking is an overkill for non compression_parallel_threads
+      // case but since it's unlikely that s is not OK, we take this cost
+      // to be simplicity.
+      std::lock_guard<std::mutex> lock(io_status_mutex);
+      io_status = ios;
+      io_status_ok.store(false, std::memory_order_relaxed);
+    }
+    SetStatus(ios);
+  }
 
-      uint64_t new_uncomp_bytes_compressed =
-          uncomp_bytes_compressed + uncomp_bytes_curr_block;
-      assert(new_uncomp_bytes_compressed > 0);
+  Rep(const BlockBasedTableOptions& table_opt, const TableBuilderOptions& tbo,
+      WritableFileWriter* f)
+      : ioptions(tbo.ioptions),
+        prefix_extractor(tbo.moptions.prefix_extractor),
+        write_options(tbo.write_options),
+        table_options(table_opt),
+        internal_comparator(tbo.internal_comparator),
+        ts_sz(tbo.internal_comparator.user_comparator()->timestamp_size()),
+        persist_user_defined_timestamps(
+            tbo.ioptions.persist_user_defined_timestamps),
+        file(f),
+        offset(0),
+        alignment(table_options.block_align
+                      ? std::min(static_cast<size_t>(table_options.block_size),
+                                 kDefaultPageSize)
+                      : 0),
+        data_block(table_options.block_restart_interval,
+                   table_options.use_delta_encoding,
+                   false /* use_value_delta_encoding */,
+                   tbo.internal_comparator.user_comparator()
+                           ->CanKeysWithDifferentByteContentsBeEqual()
+                       ? BlockBasedTableOptions::kDataBlockBinarySearch
+                       : table_options.data_block_index_type,
+                   table_options.data_block_hash_table_util_ratio, ts_sz,
+                   persist_user_defined_timestamps),
+        range_del_block(
+            1 /* block_restart_interval */, true /* use_delta_encoding */,
+            false /* use_value_delta_encoding */,
+            BlockBasedTableOptions::kDataBlockBinarySearch /* index_type */,
+            0.75 /* data_block_hash_table_util_ratio */, ts_sz,
+            persist_user_defined_timestamps),
+        internal_prefix_transform(prefix_extractor.get()),
+        sample_for_compression(tbo.moptions.sample_for_compression),
+        compressible_input_data_bytes(0),
+        uncompressible_input_data_bytes(0),
+        sampled_input_data_bytes(0),
+        sampled_output_slow_data_bytes(0),
+        sampled_output_fast_data_bytes(0),
+        compression_parallel_threads(tbo.compression_opts.parallel_threads),
+        max_compressed_bytes_per_kb(
+            tbo.compression_opts.max_compressed_bytes_per_kb),
+        data_block_working_areas(compression_parallel_threads),
+        use_delta_encoding_for_index_values(table_opt.format_version >= 4 &&
+                                            !table_opt.block_align),
+        reason(tbo.reason),
+        flush_block_policy(
+            table_options.flush_block_policy_factory->NewFlushBlockPolicy(
+                table_options, data_block)),
+        create_context(&table_options, &ioptions, ioptions.stats,
+                       /*decompressor=*/nullptr,
+                       tbo.moptions.block_protection_bytes_per_key,
+                       tbo.internal_comparator.user_comparator(),
+                       !use_delta_encoding_for_index_values,
+                       table_opt.index_type ==
+                           BlockBasedTableOptions::kBinarySearchWithFirstKey),
+        tail_size(0),
+        status_ok(true),
+        io_status_ok(true) {
+    FilterBuildingContext filter_context(table_options);
 
-      curr_compression_ratio.store(
-          (curr_compression_ratio.load(std::memory_order_relaxed) *
-               uncomp_bytes_compressed +
-           compressed_block_size) /
-              static_cast<double>(new_uncomp_bytes_compressed),
-          std::memory_order_relaxed);
-      uncomp_bytes_compressed = new_uncomp_bytes_compressed;
+    filter_context.info_log = ioptions.logger;
+    filter_context.column_family_name = tbo.column_family_name;
+    filter_context.reason = reason;
 
-      uint64_t new_uncomp_bytes_inflight =
-          uncomp_bytes_inflight.fetch_sub(uncomp_bytes_curr_block,
-                                          std::memory_order_relaxed) -
-          uncomp_bytes_curr_block;
+    // Only populate other fields if known to be in LSM rather than
+    // generating external SST file
+    if (reason != TableFileCreationReason::kMisc) {
+      filter_context.compaction_style = ioptions.compaction_style;
+      filter_context.num_levels = ioptions.num_levels;
+      filter_context.level_at_creation = tbo.level_at_creation;
+      filter_context.is_bottommost = tbo.is_bottommost;
+      assert(filter_context.level_at_creation < filter_context.num_levels);
+    }
 
-      uint64_t new_blocks_inflight =
-          blocks_inflight.fetch_sub(1, std::memory_order_relaxed) - 1;
+    // TODO: get CompressionManager from options and sort out properties
+    auto mgr = tbo.moptions.compression_manager;
+    if (mgr == nullptr) {
+      mgr = GetBuiltinCompressionManager(
+          GetCompressFormatForVersion(table_opt.format_version));
+    }
+    props.compression_name = CompressionTypeToString(tbo.compression_type);
+    props.compression_options =
+        CompressionOptionsToString(tbo.compression_opts);
 
-      estimated_file_size.store(
-          curr_file_size +
-              static_cast<uint64_t>(
-                  static_cast<double>(new_uncomp_bytes_inflight) *
-                  curr_compression_ratio.load(std::memory_order_relaxed)) +
-              new_blocks_inflight * kBlockTrailerSize,
-          std::memory_order_relaxed);
+    // Sanitize to only allowing compression when it saves space.
+    max_compressed_bytes_per_kb =
+        std::min(int{1023}, tbo.compression_opts.max_compressed_bytes_per_kb);
+
+    basic_compressor = mgr->GetCompressorForSST(
+        filter_context, tbo.compression_opts, tbo.compression_type);
+    if (basic_compressor) {
+      if (table_options.enable_index_compression) {
+        basic_working_area.compress = basic_compressor->ObtainWorkingArea();
+      }
+      max_dict_sample_bytes = basic_compressor->GetMaxSampleSizeIfWantDict(
+          CacheEntryRole::kDataBlock);
+      if (max_dict_sample_bytes > 0) {
+        state = State::kBuffered;
+        if (tbo.target_file_size == 0) {
+          buffer_limit = tbo.compression_opts.max_dict_buffer_bytes;
+        } else if (tbo.compression_opts.max_dict_buffer_bytes == 0) {
+          buffer_limit = tbo.target_file_size;
+        } else {
+          buffer_limit = std::min(tbo.target_file_size,
+                                  tbo.compression_opts.max_dict_buffer_bytes);
+        }
+      } else {
+        // No distinct data block compressor using dictionary
+        data_block_compressor = basic_compressor.get();
+        for (uint32_t i = 0; i < compression_parallel_threads; i++) {
+          data_block_working_areas[i].compress =
+              data_block_compressor->ObtainWorkingArea();
+        }
+      }
+      basic_decompressor =
+          mgr->GetDecompressorOptimizeFor(tbo.compression_type);
+      create_context.decompressor = basic_decompressor.get();
 
-      uncomp_bytes_curr_block_set = false;
+      if (table_options.verify_compression) {
+        verify_decompressor = basic_decompressor.get();
+        if (table_options.enable_index_compression) {
+          basic_working_area.verify =
+              verify_decompressor->ObtainWorkingArea(tbo.compression_type);
+        }
+        if (state == State::kUnbuffered) {
+          for (uint32_t i = 0; i < compression_parallel_threads; i++) {
+            data_block_working_areas[i].verify =
+                verify_decompressor->ObtainWorkingArea(tbo.compression_type);
+          }
+          data_block_verify_decompressor = verify_decompressor.get();
+        }
+      }
     }
 
-    void SetEstimatedFileSize(uint64_t size) {
-      estimated_file_size.store(size, std::memory_order_relaxed);
+    switch (table_options.prepopulate_block_cache) {
+      case BlockBasedTableOptions::PrepopulateBlockCache::kFlushOnly:
+        warm_cache = (reason == TableFileCreationReason::kFlush);
+        break;
+      case BlockBasedTableOptions::PrepopulateBlockCache::kDisable:
+        warm_cache = false;
+        break;
+      default:
+        // missing case
+        assert(false);
+        warm_cache = false;
     }
 
-    uint64_t GetEstimatedFileSize() {
-      return estimated_file_size.load(std::memory_order_relaxed);
+    const auto compress_dict_build_buffer_charged =
+        table_options.cache_usage_options.options_overrides
+            .at(CacheEntryRole::kCompressionDictionaryBuildingBuffer)
+            .charged;
+    if (table_options.block_cache &&
+        (compress_dict_build_buffer_charged ==
+             CacheEntryRoleOptions::Decision::kEnabled ||
+         compress_dict_build_buffer_charged ==
+             CacheEntryRoleOptions::Decision::kFallback)) {
+      compression_dict_buffer_cache_res_mgr =
+          std::make_shared<CacheReservationManagerImpl<
+              CacheEntryRole::kCompressionDictionaryBuildingBuffer>>(
+              table_options.block_cache);
+    } else {
+      compression_dict_buffer_cache_res_mgr = nullptr;
     }
 
-    void SetCurrBlockUncompSize(uint64_t size) {
-      uncomp_bytes_curr_block = size;
-      uncomp_bytes_curr_block_set = true;
+    if (table_options.index_type ==
+        BlockBasedTableOptions::kTwoLevelIndexSearch) {
+      p_index_builder_ = PartitionedIndexBuilder::CreateIndexBuilder(
+          &internal_comparator, use_delta_encoding_for_index_values,
+          table_options, ts_sz, persist_user_defined_timestamps);
+      index_builder.reset(p_index_builder_);
+    } else {
+      index_builder.reset(IndexBuilder::CreateIndexBuilder(
+          table_options.index_type, &internal_comparator,
+          &this->internal_prefix_transform, use_delta_encoding_for_index_values,
+          table_options, ts_sz, persist_user_defined_timestamps));
     }
-
-   private:
-    // Input bytes compressed so far.
-    uint64_t uncomp_bytes_compressed;
-    // Size of current block being appended.
-    uint64_t uncomp_bytes_curr_block;
-    // Whether uncomp_bytes_curr_block has been set for next
-    // ReapBlock call.
-    bool uncomp_bytes_curr_block_set;
-    // Input bytes under compression and not appended yet.
-    std::atomic<uint64_t> uncomp_bytes_inflight;
-    // Number of blocks under compression and not appended yet.
-    std::atomic<uint64_t> blocks_inflight;
-    // Current compression ratio, maintained by BGWorkWriteMaybeCompressedBlock.
-    std::atomic<double> curr_compression_ratio;
-    // Estimated SST file size.
-    std::atomic<uint64_t> estimated_file_size;
-  };
-  FileSizeEstimator file_size_estimator;
-
-  // Facilities used for waiting first block completion. Need to Wait for
-  // the completion of first block compression and flush to get a non-zero
-  // compression ratio.
-  std::atomic<bool> first_block_processed;
-  std::condition_variable first_block_cond;
-  std::mutex first_block_mutex;
-
-  explicit ParallelCompressionRep(uint32_t parallel_threads)
-      : block_rep_buf(parallel_threads),
-        block_rep_pool(parallel_threads),
-        compress_queue(parallel_threads),
-        write_queue(parallel_threads),
-        first_block_processed(false) {
-    for (uint32_t i = 0; i < parallel_threads; i++) {
-      // Prime the queue of available BlockReps
-      block_rep_pool.push(&block_rep_buf[i]);
+    if (ioptions.optimize_filters_for_hits && tbo.is_bottommost) {
+      // Apply optimize_filters_for_hits setting here when applicable by
+      // skipping filter generation
+      filter_builder.reset();
+    } else if (tbo.skip_filters) {
+      // For SstFileWriter skip_filters
+      filter_builder.reset();
+    } else if (!table_options.filter_policy) {
+      // Null filter_policy -> no filter
+      filter_builder.reset();
+    } else {
+      filter_builder.reset(CreateFilterBlockBuilder(
+          ioptions, tbo.moptions, filter_context,
+          use_delta_encoding_for_index_values, p_index_builder_, ts_sz,
+          persist_user_defined_timestamps));
     }
-  }
-
-  ~ParallelCompressionRep() { block_rep_pool.finish(); }
-
-  // Make a block prepared to be emitted to compression thread
-  // Used in non-buffered mode
-  BlockRep* PrepareBlock(const Slice* first_key_in_next_block,
-                         BlockBuilder* data_block) {
-    BlockRep* block_rep = PrepareBlockInternal(first_key_in_next_block);
-    assert(block_rep != nullptr);
-    data_block->SwapAndReset(block_rep->uncompressed);
-    std::swap(block_rep->keys, curr_block_keys);
-    curr_block_keys.Clear();
-    return block_rep;
-  }
 
-  // Used in EnterUnbuffered
-  BlockRep* PrepareBlock(const Slice* first_key_in_next_block,
-                         std::string* data_block,
-                         std::vector<std::string>* keys) {
-    BlockRep* block_rep = PrepareBlockInternal(first_key_in_next_block);
-    assert(block_rep != nullptr);
-    std::swap(block_rep->uncompressed, *data_block);
-    block_rep->keys.SwapAssign(*keys);
-    return block_rep;
-  }
+    assert(tbo.internal_tbl_prop_coll_factories);
+    for (auto& factory : *tbo.internal_tbl_prop_coll_factories) {
+      assert(factory);
 
-  // Emit a block to compression thread
-  void EmitBlock(BlockRep* block_rep) {
-    assert(block_rep != nullptr);
-    assert(block_rep->status.ok());
-    if (!write_queue.push(&block_rep->slot)) {
-      return;
+      std::unique_ptr<InternalTblPropColl> collector{
+          factory->CreateInternalTblPropColl(
+              tbo.column_family_id, tbo.level_at_creation,
+              tbo.ioptions.num_levels,
+              tbo.last_level_inclusive_max_seqno_threshold)};
+      if (collector) {
+        table_properties_collectors.emplace_back(std::move(collector));
+      }
     }
-    if (!compress_queue.push(block_rep)) {
-      return;
+    table_properties_collectors.emplace_back(
+        new BlockBasedTablePropertiesCollector(
+            table_options.index_type, table_options.whole_key_filtering,
+            prefix_extractor != nullptr,
+            table_options.decouple_partitioned_filters));
+    if (ts_sz > 0 && persist_user_defined_timestamps) {
+      table_properties_collectors.emplace_back(
+          new TimestampTablePropertiesCollector(
+              tbo.internal_comparator.user_comparator()));
     }
 
-    if (!first_block_processed.load(std::memory_order_relaxed)) {
-      std::unique_lock<std::mutex> lock(first_block_mutex);
-      first_block_cond.wait(lock, [this] {
-        return first_block_processed.load(std::memory_order_relaxed);
-      });
+    // These are only needed for populating table properties
+    props.column_family_id = tbo.column_family_id;
+    props.column_family_name = tbo.column_family_name;
+    props.oldest_key_time = tbo.oldest_key_time;
+    props.newest_key_time = tbo.newest_key_time;
+    props.file_creation_time = tbo.file_creation_time;
+    props.orig_file_number = tbo.cur_file_num;
+    props.db_id = tbo.db_id;
+    props.db_session_id = tbo.db_session_id;
+    props.db_host_id = ioptions.db_host_id;
+    props.format_version = table_options.format_version;
+    if (!ReifyDbHostIdProperty(ioptions.env, &props.db_host_id).ok()) {
+      ROCKS_LOG_INFO(ioptions.logger, "db_host_id property will not be set");
     }
-  }
+    // Default is UINT64_MAX for unknown. Setting it to 0 here
+    // to allow updating it by taking max in BlockBasedTableBuilder::Add().
+    props.key_largest_seqno = 0;
 
-  // Reap a block from compression thread
-  void ReapBlock(BlockRep* block_rep) {
-    assert(block_rep != nullptr);
-    block_rep->compressed.clear();
-    block_rep_pool.push(block_rep);
+    if (FormatVersionUsesContextChecksum(table_options.format_version)) {
+      // Must be non-zero and semi- or quasi-random
+      // TODO: ideally guaranteed different for related files (e.g. use file
+      // number and db_session, for benefit of SstFileWriter)
+      do {
+        base_context_checksum = Random::GetTLSInstance()->Next();
+      } while (UNLIKELY(base_context_checksum == 0));
+    } else {
+      base_context_checksum = 0;
+    }
 
-    if (!first_block_processed.load(std::memory_order_relaxed)) {
-      std::lock_guard<std::mutex> lock(first_block_mutex);
-      first_block_processed.store(true, std::memory_order_relaxed);
-      first_block_cond.notify_one();
+    if (alignment > 0 && basic_compressor) {
+      // With better sanitization in `CompactionPicker::CompactFiles()`, we
+      // would not need to handle this case here and could change it to an
+      // assertion instead.
+      SetStatus(Status::InvalidArgument(
+          "Enable block_align, but compression enabled"));
     }
   }
 
- private:
-  BlockRep* PrepareBlockInternal(const Slice* first_key_in_next_block) {
-    BlockRep* block_rep = nullptr;
-    block_rep_pool.pop(block_rep);
-    assert(block_rep != nullptr);
-
-    block_rep->compression_type = kNoCompression;
-
-    if (first_key_in_next_block == nullptr) {
-      block_rep->first_key_in_next_block = {};
-    } else {
-      block_rep->first_key_in_next_block->assign(
-          first_key_in_next_block->data(), first_key_in_next_block->size());
-    }
+  Rep(const Rep&) = delete;
+  Rep& operator=(const Rep&) = delete;
 
-    return block_rep;
-  }
+ private:
+  // Synchronize status & io_status accesses across threads from main thread,
+  // compression thread and write thread in parallel compression.
+  std::mutex status_mutex;
+  std::atomic<bool> status_ok;
+  Status status;
+  std::mutex io_status_mutex;
+  std::atomic<bool> io_status_ok;
+  IOStatus io_status;
 };
 
 BlockBasedTableBuilder::BlockBasedTableBuilder(

From 7b2b4b7c534160aa8c8672d9dbfbca78d7194871 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Wed, 28 May 2025 18:38:15 -0700
Subject: [PATCH 110/500] Save some missing CompressionOptions to table
 properties (#13646)

Summary:
Also revamping test
GeneralTableTest::ApproximateOffsetOfCompressed so that it's not sensitive to adding new metadata to SST files

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13646

Test Plan: manually inspect new table property, which is not parsed anywhere, just for information to human reader

Reviewed By: hx235

Differential Revision: D75561241

Pulled By: pdillinger

fbshipit-source-id: c076c01a8b540bc4cb771964d48fa919c4c48ae4
---
 table/table_test.cc | 17 +++++++++++------
 util/compression.h  |  8 ++++++++
 2 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/table/table_test.cc b/table/table_test.cc
index b381a88f3196..17b5bbfc36d6 100644
--- a/table/table_test.cc
+++ b/table/table_test.cc
@@ -4728,13 +4728,18 @@ static void DoCompressionTest(CompressionType comp) {
   const ImmutableOptions ioptions(options);
   const MutableCFOptions moptions(options);
   c.Finish(options, ioptions, moptions, table_options, ikc, &keys, &kvmap);
+  size_t file_size = c.TEST_GetSink()->contents().size();
+  EXPECT_EQ(c.ApproximateOffsetOf("abc"), 0);
+  EXPECT_EQ(c.ApproximateOffsetOf("k01"), 0);
+  EXPECT_EQ(c.ApproximateOffsetOf("k02"), 0);
+  EXPECT_NEAR2(c.ApproximateOffsetOf("k03"), file_size / 2, file_size / 10);
+  EXPECT_NEAR2(c.ApproximateOffsetOf("k04"), file_size / 2, file_size / 10);
+  EXPECT_NEAR2(c.ApproximateOffsetOf("xyz"), file_size, file_size / 10);
+
+  size_t data_blocks_size = c.GetTableReader()->GetTableProperties()->data_size;
+  // Near expected compressed size ~= (0.25 + 0.25) * 10000
+  EXPECT_NEAR2(data_blocks_size, 5000, 1500);
 
-  ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0));
-  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, 0));
-  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k02"), 0, 0));
-  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), 2000, 3555));
-  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), 2000, 3555));
-  ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 4000, 7110));
   c.ResetTableReader();
 }
 
diff --git a/util/compression.h b/util/compression.h
index 87545f573404..31bd3191d1bf 100644
--- a/util/compression.h
+++ b/util/compression.h
@@ -826,6 +826,8 @@ inline std::string CompressionOptionsToString(
   result.append("zstd_max_train_bytes=")
       .append(std::to_string(compression_options.zstd_max_train_bytes))
       .append("; ");
+  // NOTE: parallel_threads is skipped because it doesn't really affect the file
+  // contents written, arguably doesn't belong in CompressionOptions
   result.append("enabled=")
       .append(std::to_string(compression_options.enabled))
       .append("; ");
@@ -835,6 +837,12 @@ inline std::string CompressionOptionsToString(
   result.append("use_zstd_dict_trainer=")
       .append(std::to_string(compression_options.use_zstd_dict_trainer))
       .append("; ");
+  result.append("max_compressed_bytes_per_kb=")
+      .append(std::to_string(compression_options.max_compressed_bytes_per_kb))
+      .append("; ");
+  result.append("checksum=")
+      .append(std::to_string(compression_options.checksum))
+      .append("; ");
   return result;
 }
 

From 6efdbe85e3716277e2723b2638c4c1af1a4aa1c5 Mon Sep 17 00:00:00 2001
From: Hui Xiao <huixiao@fb.com>
Date: Thu, 29 May 2025 21:12:05 -0700
Subject: [PATCH 111/500] Detailed comment about setting ZSTD compression type
 for mixed compression (#13653)

Summary:
**Context/Summary:** .... to clarify things more explicitly

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13653

Test Plan: no code change

Reviewed By: pdillinger

Differential Revision: D75655419

Pulled By: hx235

fbshipit-source-id: d9ee2e669df15aacf7996a3122c382412b23229e
---
 tools/ldb_cmd.cc    | 12 +++++++++---
 util/compression.cc |  5 +++--
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/tools/ldb_cmd.cc b/tools/ldb_cmd.cc
index a581990420a2..68be8c8618fd 100644
--- a/tools/ldb_cmd.cc
+++ b/tools/ldb_cmd.cc
@@ -872,9 +872,15 @@ bool LDBCommand::ParseCompressionTypeOption(
       // read side with no code to generate them on the write side. We can test
       // that functionality, e.g. in check_format_compatible.sh, with this hack
       g_hack_mixed_compression.StoreRelaxed(1);
-      // Need to list zstd in compression_name table property if it's
-      // potentially in the mix, for proper handling of context and dictionary.
-      // (Older versions of RocksDB could crash if that's not satisfied.)
+      // Need to list zstd in the compression_name table property if it's
+      // potentially used by being in the mix (i.e., potentially at least one
+      // data block in the table is compressed by zstd). This ensures proper
+      // context and dictionary handling, and prevents crashes in older RocksDB
+      // versions.
+      //
+      // To achieve this, set `value` (the compression_type in Options which
+      // will be used to set compression_name table property) to kZSTD, even if
+      // multiple compression types are used within a single table.
       value = ZSTD_Supported() ? kZSTD : GetSupportedCompressions()[0];
       return true;
 #endif  // !NDEBUG
diff --git a/util/compression.cc b/util/compression.cc
index 68ed29c446d2..8434ec998574 100644
--- a/util/compression.cc
+++ b/util/compression.cc
@@ -266,8 +266,9 @@ class BuiltinCompressorV2 : public Compressor {
     CompressionType type = type_;
 #ifndef NDEBUG
     if (type != kNoCompression && g_hack_mixed_compression.LoadRelaxed() > 0U) {
-      // If zstd is in the mix, the compression_name table property needs to be
-      // set to it, for proper handling of context and dictionaries.
+      // To assert that if zstd is in the mix, the compression_name table
+      // property (which comes from `type_`) needs to be set to kZSTD, for
+      // proper handling of context and dictionaries.
       assert(!ZSTD_Supported() || type == kZSTD);
       const auto& compressions = GetSupportedCompressions();
       auto counter = g_hack_mixed_compression.FetchAddRelaxed(1);

From 0c533e61bc6d89fdf1295e8e0bcee4edb3aef401 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Thu, 29 May 2025 22:32:10 -0700
Subject: [PATCH 112/500] Fix XPRESS compression and enable in CI (#13649)

Summary:
Somehow this was previously not being tested in our Windows CI jobs so was accidentally broken in https://github.com/facebook/rocksdb/pull/13540  This fix will need to be backported to 10.3.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13649

Test Plan: CI

Reviewed By: hx235

Differential Revision: D75655418

Pulled By: pdillinger

fbshipit-source-id: a56bb213270904a1b7a13b905c2cc1919116df1c
---
 .../actions/windows-build-steps/action.yml    |  2 +-
 port/win/xpress_win.cc                        | 58 +++++++++++++++++++
 port/win/xpress_win.h                         |  2 +
 util/compression.cc                           | 16 ++++-
 util/compression.h                            |  3 +-
 5 files changed, 78 insertions(+), 3 deletions(-)

diff --git a/.github/actions/windows-build-steps/action.yml b/.github/actions/windows-build-steps/action.yml
index dc535a477415..0986099ce9a2 100644
--- a/.github/actions/windows-build-steps/action.yml
+++ b/.github/actions/windows-build-steps/action.yml
@@ -38,7 +38,7 @@ runs:
       $env:Path = $env:JAVA_HOME + ";" + $env:Path
       mkdir build
       cd build
-      & cmake -G "$Env:CMAKE_GENERATOR" -DCMAKE_BUILD_TYPE=Debug -DOPTDBG=1 -DPORTABLE="$Env:CMAKE_PORTABLE" -DSNAPPY=1 -DJNI=1 ..
+      & cmake -G "$Env:CMAKE_GENERATOR" -DCMAKE_BUILD_TYPE=Debug -DOPTDBG=1 -DPORTABLE="$Env:CMAKE_PORTABLE" -DSNAPPY=1 -DXPRESS=1 -DJNI=1 ..
       if(!$?) { Exit $LASTEXITCODE }
       cd ..
       echo "Building with VS version: $Env:CMAKE_GENERATOR"
diff --git a/port/win/xpress_win.cc b/port/win/xpress_win.cc
index 7e0454f3ec69..959ee382e284 100644
--- a/port/win/xpress_win.cc
+++ b/port/win/xpress_win.cc
@@ -202,11 +202,69 @@ char* Decompress(const char* input_data, size_t input_length,
   return outputBuffer.release();
 }
 
+int64_t GetDecompressedSize(const char* input_data, size_t input_length) {
+  assert(input_data != nullptr);
+
+  if (input_length == 0) {
+    return 0;
+  }
+
+  COMPRESS_ALLOCATION_ROUTINES* allocRoutinesPtr = nullptr;
+
+  DECOMPRESSOR_HANDLE decompressor = NULL;
+
+  BOOL success =
+      CreateDecompressor(COMPRESS_ALGORITHM_XPRESS,  //  Compression Algorithm
+                         allocRoutinesPtr,  //  Optional allocation routine
+                         &decompressor);    //  Handle
+
+  if (!success) {
+#ifdef _DEBUG
+    std::cerr << "XPRESS: Failed to create Decompressor LastError "
+              << GetLastError() << std::endl;
+#endif
+    return -1;
+  }
+
+  std::unique_ptr<void, decltype(CloseDecompressorFun)> decompressorGuard(
+      decompressor, CloseDecompressorFun);
+
+  SIZE_T decompressedBufferSize = 0;
+
+  success = ::Decompress(decompressor,                   //  Compressor Handle
+                         const_cast<char*>(input_data),  //  Compressed data
+                         input_length,              //  Compressed data size
+                         NULL,                      //  Buffer set to NULL
+                         0,                         //  Buffer size set to 0
+                         &decompressedBufferSize);  //  Decompressed Data size
+
+  assert(!success);
+  auto lastError = GetLastError();
+
+  if (lastError != ERROR_INSUFFICIENT_BUFFER) {
+#ifdef _DEBUG
+    std::cerr
+        << "XPRESS: Failed to estimate decompressed buffer size LastError "
+        << lastError << std::endl;
+#endif
+    return -1;
+  }
+
+  assert(decompressedBufferSize > 0);
+  return static_cast<int64_t>(decompressedBufferSize);
+}
+
 int64_t DecompressToBuffer(const char* input, size_t input_length, char* output,
                            size_t output_length) {
   assert(input != nullptr);
   assert(output != nullptr);
 
+  if (input_length == 0) {
+    return 0;
+  }
+
+  COMPRESS_ALLOCATION_ROUTINES* allocRoutinesPtr = nullptr;
+
   DECOMPRESSOR_HANDLE decompressor = NULL;
 
   BOOL success =
diff --git a/port/win/xpress_win.h b/port/win/xpress_win.h
index ab4be3a6f0df..3bab9c29894a 100644
--- a/port/win/xpress_win.h
+++ b/port/win/xpress_win.h
@@ -22,6 +22,8 @@ bool Compress(const char* input, size_t length, std::string* output);
 char* Decompress(const char* input_data, size_t input_length,
                  size_t* uncompressed_size);
 
+int64_t GetDecompressedSize(const char* input, size_t input_length);
+
 int64_t DecompressToBuffer(const char* input, size_t input_length, char* output,
                            size_t output_length);
 
diff --git a/util/compression.cc b/util/compression.cc
index 8434ec998574..a91dcc798e23 100644
--- a/util/compression.cc
+++ b/util/compression.cc
@@ -612,7 +612,7 @@ class BuiltinDecompressorV2 : public Decompressor {
   Status ExtractUncompressedSize(Args& args) override {
     assert(args.compression_type != kNoCompression);
     if (args.compression_type == kSnappyCompression) {
-      // Exception to encoding of uncompressed size
+      // 1st exception to encoding of uncompressed size
 #ifdef SNAPPY
       size_t uncompressed_length = 0;
       if (!snappy::GetUncompressedLength(args.compressed_data.data(),
@@ -625,6 +625,20 @@ class BuiltinDecompressorV2 : public Decompressor {
 #else
       return Status::NotSupported("Snappy not supported in this build");
 #endif
+    } else if (args.compression_type == kXpressCompression) {
+      // 2nd exception to encoding of uncompressed size
+#ifdef XPRESS
+      int64_t result = port::xpress::GetDecompressedSize(
+          args.compressed_data.data(), args.compressed_data.size());
+      if (result < 0) {
+        return Status::Corruption("Error reading XPRESS compressed length");
+      }
+      args.uncompressed_size = static_cast<size_t>(result);
+      return Status::OK();
+#else
+      return Status::NotSupported("XPRESS not supported in this build");
+#endif
+
     } else {
       // Extract encoded uncompressed size
       return Decompressor::ExtractUncompressedSize(args);
diff --git a/util/compression.h b/util/compression.h
index 31bd3191d1bf..863971b3945e 100644
--- a/util/compression.h
+++ b/util/compression.h
@@ -851,7 +851,8 @@ inline std::string CompressionOptionsToString(
 // block. Also, decompressed sizes for LZ4 are encoded in platform-dependent
 // way.
 // 2 -- Zlib, BZip2 and LZ4 encode decompressed size as Varint32 just before the
-// start of compressed block. Snappy format is the same as version 1.
+// start of compressed block. Snappy and XPRESS instead extract the decompressed
+// size from the compressed block itself, same as version 1.
 
 inline bool Snappy_Compress(const CompressionInfo& /*info*/, const char* input,
                             size_t length, ::std::string* output) {

From 20d065d940dcc39d4f5dbbdc90085fb50bbc9ebd Mon Sep 17 00:00:00 2001
From: Sujit Maharjan <sujitmaharjan@meta.com>
Date: Mon, 2 Jun 2025 15:36:32 -0700
Subject: [PATCH 113/500] Populate Missing Compaction Input Statistics (#13637)

Summary:
**Summary**
This pull request aims to populate num_input_files and total_input_bytes in the CompactionJobStats object, which is accessible through EventListener::OnCompactionBegin(DB*, const CompactionJobInfo&). This change will enable RocksDB users to access accurate compaction input information.

**Context/Goals**
Provide accurate compaction input statistics to RocksDB users
Populate num_input_files and total_input_bytes in CompactionJobStats
Ensure correct population of these fields before EventListener::OnCompactionBegin() is called

**Test Plan**
Added test code to capture num_input_file and total_num_bytes when EventHandler is triggered
Asserted that these values are populated correctly

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13637

Reviewed By: cbi42

Differential Revision: D75690774

Pulled By: shubhajeet

fbshipit-source-id: 8236546f8ce7743f46048b302b376b7ef6429887
---
 db/compaction/compaction_job.cc      | 15 ++++++++--
 db/compaction/compaction_job_test.cc |  2 --
 db/listener_test.cc                  | 41 ++++++++++++++++++++++++++++
 3 files changed, 53 insertions(+), 5 deletions(-)

diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc
index 3da86d7483d9..1f521f7d2dd0 100644
--- a/db/compaction/compaction_job.cc
+++ b/db/compaction/compaction_job.cc
@@ -217,10 +217,9 @@ void CompactionJob::ReportStartedCompaction(Compaction* compaction) {
       ThreadStatus::COMPACTION_PROP_FLAGS,
       compaction->is_manual_compaction() +
           (compaction->deletion_compaction() << 1));
-
+  auto total_input_bytes = compaction->CalculateTotalInputSize();
   ThreadStatusUtil::SetThreadOperationProperty(
-      ThreadStatus::COMPACTION_TOTAL_INPUT_BYTES,
-      compaction->CalculateTotalInputSize());
+      ThreadStatus::COMPACTION_TOTAL_INPUT_BYTES, total_input_bytes);
 
   IOSTATS_RESET(bytes_written);
   IOSTATS_RESET(bytes_read);
@@ -235,6 +234,16 @@ void CompactionJob::ReportStartedCompaction(Compaction* compaction) {
 
   job_stats_->is_manual_compaction = compaction->is_manual_compaction();
   job_stats_->is_full_compaction = compaction->is_full_compaction();
+  // populate compaction stats num_input_files and total_num_of_bytes
+  size_t num_input_files = 0;
+  for (int input_level = 0;
+       input_level < static_cast<int>(compaction->num_input_levels());
+       ++input_level) {
+    const LevelFilesBrief* flevel = compaction->input_levels(input_level);
+    num_input_files += flevel->num_files;
+  }
+  job_stats_->CompactionJobStats::num_input_files = num_input_files;
+  job_stats_->total_input_bytes = total_input_bytes;
 }
 
 void CompactionJob::Prepare(
diff --git a/db/compaction/compaction_job_test.cc b/db/compaction/compaction_job_test.cc
index 89d724e067c1..450d9c13820f 100644
--- a/db/compaction/compaction_job_test.cc
+++ b/db/compaction/compaction_job_test.cc
@@ -43,7 +43,6 @@ void VerifyInitializationOfCompactionJobStats(
   ASSERT_EQ(compaction_job_stats.elapsed_micros, 0U);
 
   ASSERT_EQ(compaction_job_stats.num_input_records, 0U);
-  ASSERT_EQ(compaction_job_stats.num_input_files, 0U);
   ASSERT_EQ(compaction_job_stats.num_input_files_at_output_level, 0U);
 
   ASSERT_EQ(compaction_job_stats.num_output_records, 0U);
@@ -52,7 +51,6 @@ void VerifyInitializationOfCompactionJobStats(
   ASSERT_TRUE(compaction_job_stats.is_manual_compaction);
   ASSERT_FALSE(compaction_job_stats.is_remote_compaction);
 
-  ASSERT_EQ(compaction_job_stats.total_input_bytes, 0U);
   ASSERT_EQ(compaction_job_stats.total_output_bytes, 0U);
 
   ASSERT_EQ(compaction_job_stats.total_input_raw_key_bytes, 0U);
diff --git a/db/listener_test.cc b/db/listener_test.cc
index bfd5953668ff..033e86d2023d 100644
--- a/db/listener_test.cc
+++ b/db/listener_test.cc
@@ -537,6 +537,47 @@ TEST_F(EventListenerTest, DisableBGCompaction) {
   ASSERT_GE(listener->slowdown_count, kSlowdownTrigger * 9);
 }
 
+class TestNumInputFilesTotalInputBytesPouplatedInListener
+    : public EventListener {
+ public:
+  void OnCompactionCompleted(DB* /*db*/, const CompactionJobInfo& ci) override {
+    std::lock_guard<std::mutex> lock(mutex_);
+    num_input_files = ci.stats.num_input_files;
+    total_num_of_bytes = ci.stats.total_input_bytes;
+  }
+  size_t num_input_files = 0;
+  size_t total_num_of_bytes = 0;
+  std::mutex mutex_;
+};
+
+TEST_F(EventListenerTest, NumInputFilesTotalBytesPopulated) {
+  Options options;
+  options.level_compaction_dynamic_level_bytes = false;
+  options.env = CurrentOptions().env;
+  options.create_if_missing = true;
+  options.memtable_factory.reset(test::NewSpecialSkipListFactory(
+      DBTestBase::kNumKeysByGenerateNewRandomFile));
+
+  TestNumInputFilesTotalInputBytesPouplatedInListener* listener =
+      new TestNumInputFilesTotalInputBytesPouplatedInListener();
+  options.listeners.emplace_back(listener);
+
+  options.level0_file_num_compaction_trigger = 4;
+  options.compaction_style = kCompactionStyleLevel;
+
+  DestroyAndReopen(options);
+  Random rnd(301);
+  ASSERT_EQ(listener->num_input_files, 0);
+  ASSERT_EQ(listener->total_num_of_bytes, 0);
+  // Write 4 files in L0
+  for (int i = 0; i < 4; i++) {
+    GenerateNewRandomFile(&rnd);
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ(listener->num_input_files, 4);
+  ASSERT_NE(listener->total_num_of_bytes, 0);
+}
+
 class TestCompactionReasonListener : public EventListener {
  public:
   void OnCompactionCompleted(DB* /*db*/, const CompactionJobInfo& ci) override {

From 09175119d2464d7ceecdf1cb7d6d5b517b730965 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Tue, 3 Jun 2025 19:03:38 -0700
Subject: [PATCH 114/500] Allow SmallEnumSet on larger enum types (#13657)

Summary:
... to support SmallEnumSet over CompressionType with allowed custom compression types using most of the available byte. This is accomplished using an std::array<uint64_t> in place of just uint64_t. Also adds an std::bitset-like count() operation.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13657

Test Plan: unit tests included

Reviewed By: hx235

Differential Revision: D75827601

Pulled By: pdillinger

fbshipit-source-id: 519ae97ac671fd9885d6485976abbd969d1392d3
---
 include/rocksdb/data_structure.h | 119 ++++++++++++++++++++++---------
 util/data_structure.cc           |   2 +
 util/slice_test.cc               |  71 ++++++++++++++++++
 3 files changed, 158 insertions(+), 34 deletions(-)

diff --git a/include/rocksdb/data_structure.h b/include/rocksdb/data_structure.h
index 6d408a95968f..a903a9649966 100644
--- a/include/rocksdb/data_structure.h
+++ b/include/rocksdb/data_structure.h
@@ -7,9 +7,9 @@
 
 #include <assert.h>
 
+#include <array>
 #include <cstddef>
 #include <cstdint>
-#include <vector>
 
 #include "rocksdb/rocksdb_namespace.h"
 
@@ -17,24 +17,33 @@ namespace ROCKSDB_NAMESPACE {
 
 namespace detail {
 int CountTrailingZeroBitsForSmallEnumSet(uint64_t);
+int BitsSetToOneForSmallEnumSet(uint64_t);
 }  // namespace detail
 
-// Represents a set of values of some enum type with a small number of
-// possible enumerators. For now, it supports enums where no enumerator
-// exceeds 63 when converted to int.
+// Represents a set of values of some enum type with a small number of possible
+// enumerators. Assumes that any combination of enumerators with values 0
+// through MAX_ENUMERATOR (inclusive) might be part of the set. NOTE: would like
+// to use std::bitset, but it doesn't support constexpr (in C++17) operations
+// and doesn't support efficient iteration over sparse "set to true" entries.
 template <typename ENUM_TYPE, ENUM_TYPE MAX_ENUMERATOR>
 class SmallEnumSet {
  private:
-  using StateT = uint64_t;
-  static constexpr int kStateBits = sizeof(StateT) * 8;
-  static constexpr int kMaxMax = kStateBits - 1;
   static constexpr int kMaxValue = static_cast<int>(MAX_ENUMERATOR);
   static_assert(kMaxValue >= 0);
-  static_assert(kMaxValue <= kMaxMax);
+  static_assert(kMaxValue < 1024, "MAX_ENUMERATOR is suspiciously large");
+  using PieceT = uint64_t;
+  static constexpr int kPieceBits = 64;
+  static constexpr int kPieceMask = 63;
+  static constexpr int kPieceShift = 6;
+  static constexpr int kPieceCount = kMaxValue / kPieceBits + 1;
+  using StateT = std::array<PieceT, kPieceCount>;
+  static constexpr int kStateBits = kPieceBits * kPieceCount;
+  static_assert(kStateBits == sizeof(StateT) * 8);
+  static_assert(kMaxValue <= kStateBits - 1);
 
  public:
-  // construct / create
-  SmallEnumSet() : state_(0) {}
+  // construct / create empty set
+  SmallEnumSet() : state_{} {}
 
   template <class... TRest>
   /*implicit*/ constexpr SmallEnumSet(const ENUM_TYPE e, TRest... rest) {
@@ -44,8 +53,16 @@ class SmallEnumSet {
   // Return the set that includes all valid values, assuming the enum
   // is "dense" (includes all values converting to 0 through kMaxValue)
   static constexpr SmallEnumSet All() {
-    StateT tmp = StateT{1} << kMaxValue;
-    return SmallEnumSet(RawStateMarker(), tmp | (tmp - 1));
+    StateT tmp;
+    for (int i = 0; i < kPieceCount - 1; ++i) {
+      tmp[i] = ~PieceT{0};
+    }
+    if constexpr (((kMaxValue + 1) & kPieceMask) != 0) {
+      tmp[kPieceCount - 1] = (PieceT{1} << ((kMaxValue + 1) & kPieceMask)) - 1;
+    } else {
+      tmp[kPieceCount - 1] = ~PieceT{0};
+    }
+    return SmallEnumSet(RawStateMarker(), tmp);
   }
 
   // equality
@@ -60,11 +77,17 @@ class SmallEnumSet {
   bool Contains(const ENUM_TYPE e) const {
     int value = static_cast<int>(e);
     assert(value >= 0 && value <= kMaxValue);
-    StateT tmp = 1;
-    return state_ & (tmp << value);
+    return GetPiece(value) & (PieceT{1} << (value & kPieceMask));
   }
 
-  bool empty() const { return state_ == 0; }
+  bool empty() const {
+    for (int i = 0; i < kPieceCount; ++i) {
+      if (state_[i] != 0) {
+        return false;
+      }
+    }
+    return true;
+  }
 
   // iterator
   class const_iterator {
@@ -92,7 +115,7 @@ class SmallEnumSet {
       if (pos_ < kMaxValue) {
         pos_ = set_->SkipUnset(pos_ + 1);
       } else {
-        pos_ = kStateBits;
+        pos_ = kMaxValue + 1;
       }
       return *this;
     }
@@ -118,7 +141,15 @@ class SmallEnumSet {
 
   const_iterator begin() const { return const_iterator(this, SkipUnset(0)); }
 
-  const_iterator end() const { return const_iterator(this, kStateBits); }
+  const_iterator end() const { return const_iterator(this, kMaxValue + 1); }
+
+  size_t count() const {
+    size_t rv = 0;
+    for (int i = 0; i < kPieceCount; ++i) {
+      rv += static_cast<size_t>(detail::BitsSetToOneForSmallEnumSet(state_[i]));
+    }
+    return rv;
+  }
 
   // mutable ops
 
@@ -127,9 +158,10 @@ class SmallEnumSet {
   bool Add(const ENUM_TYPE e) {
     int value = static_cast<int>(e);
     assert(value >= 0 && value <= kMaxValue);
-    StateT old_state = state_;
-    state_ |= (StateT{1} << value);
-    return old_state != state_;
+    PieceT& piece_ref = RefPiece(value);
+    PieceT old_piece = piece_ref;
+    piece_ref |= (PieceT{1} << (value & kPieceMask));
+    return old_piece != piece_ref;
   }
 
   // Modifies the set (if needed) not to include the given value. Returns true
@@ -137,18 +169,20 @@ class SmallEnumSet {
   bool Remove(const ENUM_TYPE e) {
     int value = static_cast<int>(e);
     assert(value >= 0 && value <= kMaxValue);
-    StateT old_state = state_;
-    state_ &= ~(StateT{1} << value);
-    return old_state != state_;
+    PieceT& piece_ref = RefPiece(value);
+    PieceT old_piece = piece_ref;
+    piece_ref &= ~(PieceT{1} << (value & kPieceMask));
+    return old_piece != piece_ref;
   }
 
   // applicative ops
 
   // Return a new set based on this one with the additional value(s) inserted
   constexpr SmallEnumSet With(const ENUM_TYPE e) const {
-    int value = static_cast<int>(e);
-    assert(value >= 0 && value <= kMaxValue);
-    return SmallEnumSet(RawStateMarker(), state_ | (StateT{1} << value));
+    assert(static_cast<int>(e) >= 0 && static_cast<int>(e) <= kMaxValue);
+    SmallEnumSet rv(*this);
+    rv.Add(e);
+    return rv;
   }
   template <class... TRest>
   constexpr SmallEnumSet With(const ENUM_TYPE e1, const ENUM_TYPE e2,
@@ -158,9 +192,10 @@ class SmallEnumSet {
 
   // Return a new set based on this one excluding the given value(s)
   constexpr SmallEnumSet Without(const ENUM_TYPE e) const {
-    int value = static_cast<int>(e);
-    assert(value >= 0 && value <= kMaxValue);
-    return SmallEnumSet(RawStateMarker(), state_ & ~(StateT{1} << value));
+    assert(static_cast<int>(e) >= 0 && static_cast<int>(e) <= kMaxValue);
+    SmallEnumSet rv(*this);
+    rv.Remove(e);
+    return rv;
   }
   template <class... TRest>
   constexpr SmallEnumSet Without(const ENUM_TYPE e1, const ENUM_TYPE e2,
@@ -170,15 +205,31 @@ class SmallEnumSet {
 
  private:
   int SkipUnset(int pos) const {
-    StateT tmp = state_ >> pos;
-    if (tmp == 0) {
-      return kStateBits;
-    } else {
-      return pos + detail::CountTrailingZeroBitsForSmallEnumSet(tmp);
+    while (pos <= kMaxValue) {
+      PieceT remainder = GetPiece(pos) >> (pos & kPieceMask);
+      if (remainder != 0) {
+        return pos + detail::CountTrailingZeroBitsForSmallEnumSet(remainder);
+      }
+      pos = (pos + kPieceBits) & ~kPieceMask;
     }
+    return kMaxValue + 1;
   }
   struct RawStateMarker {};
   explicit SmallEnumSet(RawStateMarker, StateT state) : state_(state) {}
+  PieceT GetPiece(int pos) const {
+    if constexpr (kPieceCount == 1) {
+      return state_[0];
+    } else {
+      return state_[pos >> kPieceShift];
+    }
+  }
+  PieceT& RefPiece(int pos) {
+    if constexpr (kPieceCount == 1) {
+      return state_[0];
+    } else {
+      return state_[pos >> kPieceShift];
+    }
+  }
 
   StateT state_;
 };
diff --git a/util/data_structure.cc b/util/data_structure.cc
index 04d0442a5fa9..6987168eebfa 100644
--- a/util/data_structure.cc
+++ b/util/data_structure.cc
@@ -13,4 +13,6 @@ int CountTrailingZeroBitsForSmallEnumSet(uint64_t v) {
   return CountTrailingZeroBits(v);
 }
 
+int BitsSetToOneForSmallEnumSet(uint64_t v) { return BitsSetToOne(v); }
+
 }  // namespace ROCKSDB_NAMESPACE::detail
diff --git a/util/slice_test.cc b/util/slice_test.cc
index 0028cce85965..6e7142dc9505 100644
--- a/util/slice_test.cc
+++ b/util/slice_test.cc
@@ -177,18 +177,23 @@ class SmallEnumSetTest : public testing::Test {
 TEST_F(SmallEnumSetTest, SmallEnumSetTest1) {
   FileTypeSet fs;  // based on a legacy enum type
   ASSERT_TRUE(fs.empty());
+  ASSERT_EQ(fs.count(), 0U);
   ASSERT_TRUE(fs.Add(FileType::kIdentityFile));
   ASSERT_FALSE(fs.empty());
+  ASSERT_EQ(fs.count(), 1U);
   ASSERT_FALSE(fs.Add(FileType::kIdentityFile));
   ASSERT_TRUE(fs.Add(FileType::kInfoLogFile));
   ASSERT_TRUE(fs.Contains(FileType::kIdentityFile));
   ASSERT_FALSE(fs.Contains(FileType::kDBLockFile));
   ASSERT_FALSE(fs.empty());
+  ASSERT_EQ(fs.count(), 2U);
   ASSERT_FALSE(fs.Remove(FileType::kDBLockFile));
   ASSERT_TRUE(fs.Remove(FileType::kIdentityFile));
   ASSERT_FALSE(fs.empty());
+  ASSERT_EQ(fs.count(), 1U);
   ASSERT_TRUE(fs.Remove(FileType::kInfoLogFile));
   ASSERT_TRUE(fs.empty());
+  ASSERT_EQ(fs.count(), 0U);
 }
 
 namespace {
@@ -224,12 +229,16 @@ TEST_F(SmallEnumSetTest, SmallEnumSetTest2) {
   ASSERT_NE(cs, MyEnumClassSet{MyEnumClass::B});
   ASSERT_NE(cs, MyEnumClassSet::All());
 
+  ASSERT_EQ(MyEnumClassSet{}.count(), 0U);
+  ASSERT_EQ(MyEnumClassSet::All().count(), 3U);
+
   int count = 0;
   for (MyEnumClass e : cs) {
     ASSERT_EQ(e, MyEnumClass::A);
     ++count;
   }
   ASSERT_EQ(count, 1);
+  ASSERT_EQ(cs.count(), 1U);
 
   count = 0;
   for (MyEnumClass e : MyEnumClassSet::All().Without(MyEnumClass::B)) {
@@ -244,6 +253,68 @@ TEST_F(SmallEnumSetTest, SmallEnumSetTest2) {
   }
 }
 
+template <typename ENUM_TYPE, ENUM_TYPE MAX_ENUMERATOR>
+void TestBiggerEnumSet() {
+  using MySet = SmallEnumSet<ENUM_TYPE, MAX_ENUMERATOR>;
+  constexpr int kMaxValue = static_cast<int>(MAX_ENUMERATOR);
+  SCOPED_TRACE("kMaxValue = " + std::to_string(kMaxValue));
+
+  ASSERT_EQ(sizeof(MySet), (kMaxValue + 1 + 63) / 64 * 8);
+
+  MySet s;
+  ASSERT_TRUE(s.empty());
+  ASSERT_EQ(s.count(), 0U);
+  ASSERT_TRUE(s.Add(ENUM_TYPE(0)));
+  ASSERT_FALSE(s.empty());
+  ASSERT_EQ(s.count(), 1U);
+  ASSERT_TRUE(s.Add(ENUM_TYPE(kMaxValue - 1)));
+  ASSERT_FALSE(s.empty());
+  ASSERT_EQ(s.count(), 2U);
+  ASSERT_TRUE(s.Add(ENUM_TYPE(kMaxValue)));
+  ASSERT_FALSE(s.empty());
+  ASSERT_EQ(s.count(), 3U);
+
+  int count = 0;
+  for (ENUM_TYPE e : s) {
+    ASSERT_TRUE(e == ENUM_TYPE(0) || e == ENUM_TYPE(kMaxValue - 1) ||
+                e == ENUM_TYPE(kMaxValue));
+    ++count;
+  }
+  ASSERT_EQ(count, 3);
+
+  ASSERT_TRUE(s.Remove(ENUM_TYPE(0)));
+  ASSERT_TRUE(s.Remove(ENUM_TYPE(kMaxValue)));
+  ASSERT_FALSE(s.empty());
+  ASSERT_EQ(s.count(), 1U);
+
+  count = 0;
+  for (ENUM_TYPE e : s) {
+    ASSERT_EQ(e, ENUM_TYPE(kMaxValue - 1));
+    ++count;
+  }
+  ASSERT_EQ(count, 1);
+}
+
+TEST_F(SmallEnumSetTest, BiggerEnumClasses) {
+  enum class BiggerEnumClass63 { A, B, C = 63 };
+  enum class BiggerEnumClass64 { A, B, C = 64 };
+  enum class BiggerEnumClass65 { A, B, C = 65 };
+  enum class BiggerEnumClass127 { A, B, C = 127 };
+  enum class BiggerEnumClass128 { A, B, C = 128 };
+  enum class BiggerEnumClass129 { A, B, C = 129 };
+  enum class BiggerEnumClass150 { A, B, C = 150 };
+  enum class BiggerEnumClass255 { A, B, C = 255 };
+
+  TestBiggerEnumSet<BiggerEnumClass63, BiggerEnumClass63::C>();
+  TestBiggerEnumSet<BiggerEnumClass64, BiggerEnumClass64::C>();
+  TestBiggerEnumSet<BiggerEnumClass65, BiggerEnumClass65::C>();
+  TestBiggerEnumSet<BiggerEnumClass127, BiggerEnumClass127::C>();
+  TestBiggerEnumSet<BiggerEnumClass128, BiggerEnumClass128::C>();
+  TestBiggerEnumSet<BiggerEnumClass129, BiggerEnumClass129::C>();
+  TestBiggerEnumSet<BiggerEnumClass150, BiggerEnumClass150::C>();
+  TestBiggerEnumSet<BiggerEnumClass255, BiggerEnumClass255::C>();
+}
+
 // ***************************************************************** //
 // Unit test for Status
 TEST(StatusTest, Update) {

From 972795643684504c357e3628d335620344cce9c9 Mon Sep 17 00:00:00 2001
From: Zaidoon Abd Al Hadi <zaidoon@cloudflare.com>
Date: Wed, 4 Jun 2025 10:03:23 -0700
Subject: [PATCH 115/500] Expose Options::memtable_avg_op_scan_flush_trigger
 via C API (#13631)

Summary: Pull Request resolved: https://github.com/facebook/rocksdb/pull/13631

Reviewed By: pdillinger

Differential Revision: D75928433

Pulled By: cbi42

fbshipit-source-id: d9f13a17058cfac68e380ea7d227aa8197b1d028
---
 db/c.cc             | 10 ++++++++++
 db/c_test.c         | 10 ++++++++++
 include/rocksdb/c.h |  6 ++++++
 3 files changed, 26 insertions(+)

diff --git a/db/c.cc b/db/c.cc
index d324ca3f10a0..c7221f2fd118 100644
--- a/db/c.cc
+++ b/db/c.cc
@@ -3748,6 +3748,16 @@ uint32_t rocksdb_options_get_memtable_op_scan_flush_trigger(
   return opt->rep.memtable_op_scan_flush_trigger;
 }
 
+void rocksdb_options_set_memtable_avg_op_scan_flush_trigger(
+    rocksdb_options_t* opt, uint32_t n) {
+  opt->rep.memtable_avg_op_scan_flush_trigger = n;
+}
+
+uint32_t rocksdb_options_get_memtable_avg_op_scan_flush_trigger(
+    rocksdb_options_t* opt) {
+  return opt->rep.memtable_avg_op_scan_flush_trigger;
+}
+
 void rocksdb_options_enable_statistics(rocksdb_options_t* opt) {
   opt->rep.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
 }
diff --git a/db/c_test.c b/db/c_test.c
index 2d2f34bad86c..73bdf564706e 100644
--- a/db/c_test.c
+++ b/db/c_test.c
@@ -2220,6 +2220,10 @@ int main(int argc, char** argv) {
     CheckCondition(100 ==
                    rocksdb_options_get_memtable_op_scan_flush_trigger(o));
 
+    rocksdb_options_set_memtable_avg_op_scan_flush_trigger(o, 150);
+    CheckCondition(150 ==
+                   rocksdb_options_get_memtable_avg_op_scan_flush_trigger(o));
+
     rocksdb_options_set_ttl(o, 5000);
     CheckCondition(5000 == rocksdb_options_get_ttl(o));
 
@@ -2663,6 +2667,12 @@ int main(int argc, char** argv) {
     CheckCondition(100 ==
                    rocksdb_options_get_memtable_op_scan_flush_trigger(o));
 
+    rocksdb_options_set_memtable_avg_op_scan_flush_trigger(copy, 900);
+    CheckCondition(
+        900 == rocksdb_options_get_memtable_avg_op_scan_flush_trigger(copy));
+    CheckCondition(150 ==
+                   rocksdb_options_get_memtable_avg_op_scan_flush_trigger(o));
+
     rocksdb_options_set_ttl(copy, 8000);
     CheckCondition(8000 == rocksdb_options_get_ttl(copy));
     CheckCondition(5000 == rocksdb_options_get_ttl(o));
diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h
index ee5cc5274642..92a47b25b212 100644
--- a/include/rocksdb/c.h
+++ b/include/rocksdb/c.h
@@ -1507,6 +1507,12 @@ rocksdb_options_set_memtable_op_scan_flush_trigger(rocksdb_options_t*,
 extern ROCKSDB_LIBRARY_API uint32_t
 rocksdb_options_get_memtable_op_scan_flush_trigger(rocksdb_options_t*);
 
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_memtable_avg_op_scan_flush_trigger(rocksdb_options_t*,
+                                                       uint32_t);
+extern ROCKSDB_LIBRARY_API uint32_t
+rocksdb_options_get_memtable_avg_op_scan_flush_trigger(rocksdb_options_t*);
+
 enum {
   rocksdb_statistics_level_disable_all = 0,
   rocksdb_statistics_level_except_tickers =

From 0119a8c78b5bf2b6e0c61d85b73a9c450605153c Mon Sep 17 00:00:00 2001
From: Changyu Bi <changyubi@meta.com>
Date: Wed, 4 Jun 2025 10:08:46 -0700
Subject: [PATCH 116/500] Fix `Checkpoint::ExportColumnFamily()` returning
 staled data (#13654)

Summary:
`Checkpoint::ExportColumnFamily()` calls DB::Flush() before getting all SST file metadata through `GetColumnFamilyMetaData()`. `GetColumnFamilyMetaData()` gets metadata through the SuperVersion but Flush() does not guarantee the flush result is reflected in SuperVersion upon return (explained below). This PR updates `GetColumnFamilyMetaData()` to get metadata from version instead. Since `GetColumnFamilyMetaData()` [acquires db mutex](https://github.com/facebook/rocksdb/blob/0c533e61bc6d89fdf1295e8e0bcee4edb3aef401/db/db_impl/db_impl.cc#L5023-L5031), it should not need to acquire SV anyway.

Fixes https://github.com/facebook/rocksdb/issues/13652.

Here we explain how Flush(wait=true) does not guarantee that the flush result is in SuperVersion when the call returns.
- RocksDB uses group commit to do MANIFEST update.
- When a flush tries to install its flush result, it may be done by another MANIFEST writer.
- MANIFEST write is done atomically together with updating Version and cfd->imm() (the list of immutable memtables), but it does not install new SuperVresion
- When the MANIFEST writer releases db mutex, the flush wait thread finds that cfd->imm() does not have the relevant memtable anymore: https://github.com/facebook/rocksdb/blob/09175119d2464d7ceecdf1cb7d6d5b517b730965/db/db_impl/db_impl_compaction_flush.cc#L2739-L2742

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13654

Test Plan: the repro in https://github.com/pcholakov/rocksdb/commit/a52d426e82ff5a3dd181dbd5d676dbb54080f5fa pass after this change.

Reviewed By: hx235

Differential Revision: D75795658

Pulled By: cbi42

fbshipit-source-id: 4f10baff67944bcd762cf0d237d653a8a35dbca3
---
 db/db_impl/db_impl.cc                              | 4 +---
 db/db_impl/db_impl_compaction_flush.cc             | 4 ++++
 unreleased_history/bug_fixes/create-with-import.md | 1 +
 utilities/checkpoint/checkpoint_impl.cc            | 1 +
 4 files changed, 7 insertions(+), 3 deletions(-)
 create mode 100644 unreleased_history/bug_fixes/create-with-import.md

diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc
index cfb0ea07ef09..deb3b9dee700 100644
--- a/db/db_impl/db_impl.cc
+++ b/db/db_impl/db_impl.cc
@@ -5018,7 +5018,6 @@ void DBImpl::GetColumnFamilyMetaData(ColumnFamilyHandle* column_family,
   assert(column_family);
   auto* cfd =
       static_cast_with_check<ColumnFamilyHandleImpl>(column_family)->cfd();
-  auto* sv = GetAndRefSuperVersion(cfd);
   {
     // Without mutex, Version::GetColumnFamilyMetaData will have data race
     // with Compaction::MarkFilesBeingCompacted. One solution is to use mutex,
@@ -5030,9 +5029,8 @@ void DBImpl::GetColumnFamilyMetaData(ColumnFamilyHandle* column_family,
     // DB::GetColumnFamilyMetaData is not called frequently, the regression
     // should not be big. We still need to keep an eye on it.
     InstrumentedMutexLock l(&mutex_);
-    sv->current->GetColumnFamilyMetaData(cf_meta);
+    cfd->current()->GetColumnFamilyMetaData(cf_meta);
   }
-  ReturnAndCleanupSuperVersion(cfd, sv);
 }
 
 void DBImpl::GetAllColumnFamilyMetaData(
diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc
index a69c80a3cb03..5b1ea2cd1f61 100644
--- a/db/db_impl/db_impl_compaction_flush.cc
+++ b/db/db_impl/db_impl_compaction_flush.cc
@@ -2699,6 +2699,10 @@ Status DBImpl::WaitUntilFlushWouldNotStallWrites(ColumnFamilyData* cfd,
 // Finish waiting when ALL column families finish flushing memtables.
 // resuming_from_bg_err indicates whether the caller is trying to resume from
 // background error or in normal processing.
+// Note that the wait finishes when the flush result is installed to column
+// families' Versions and persisted in MANIFEST. It doesn't wait until
+// SuperVersion to reflect the flush result, except for the case when
+// flush_reason is `kExternalFileIngestion`.
 Status DBImpl::WaitForFlushMemTables(
     const autovector<ColumnFamilyData*>& cfds,
     const autovector<const uint64_t*>& flush_memtable_ids,
diff --git a/unreleased_history/bug_fixes/create-with-import.md b/unreleased_history/bug_fixes/create-with-import.md
new file mode 100644
index 000000000000..12efa1d4321f
--- /dev/null
+++ b/unreleased_history/bug_fixes/create-with-import.md
@@ -0,0 +1 @@
+* Fix a bug where CreateColumnFamilyWithImport() could miss the SST file for the memtable flush it triggered. The exported CF then may not contain the updates in the memtable when CreateColumnFamilyWithImport() is called.
diff --git a/utilities/checkpoint/checkpoint_impl.cc b/utilities/checkpoint/checkpoint_impl.cc
index dcedfd2ddf65..c7ed298cf02f 100644
--- a/utilities/checkpoint/checkpoint_impl.cc
+++ b/utilities/checkpoint/checkpoint_impl.cc
@@ -340,6 +340,7 @@ Status CheckpointImpl::ExportColumnFamily(
   s = db_->GetEnv()->CreateDir(tmp_export_dir);
 
   if (s.ok()) {
+    // FIXME: should respect atomic_flush and flush all CFs if needed.
     s = db_->Flush(ROCKSDB_NAMESPACE::FlushOptions(), handle);
   }
 

From fccc881894c9791e2a2a3eefbb3b2c4a217669df Mon Sep 17 00:00:00 2001
From: Sujit Maharjan <sujitmaharjan@meta.com>
Date: Wed, 4 Jun 2025 10:18:44 -0700
Subject: [PATCH 117/500] Implement MixedCompressor that Round robins on
 compression algorithm (#13647)

Summary:
**Summary**
This pull request introduces a mixed compressor, RoundRobinManager and RoundRobinCompressor, which selects algorithms in a loop. This implementation replaces the current hacky approach to round-robin compression in BuiltInCompressorV2. Additionally, it configures RocksDB to optionally utilize this customized compressor in the db stress test.

**Testing**
Testing was performed by verifying the stdout output from both RoundRobinCompressor and BuiltInCompressorV2.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13647

Reviewed By: pdillinger

Differential Revision: D75921997

Pulled By: shubhajeet

fbshipit-source-id: 8f42ac46f08ba982b2cd70241bd7dc13ff5a1225
---
 db/db_test2.cc                        | 105 +++++++++++++++++++++
 db_stress_tool/db_stress_common.h     |   1 +
 db_stress_tool/db_stress_gflags.cc    |   4 +-
 db_stress_tool/db_stress_test_base.cc |  24 ++++-
 tools/db_crashtest.py                 |  15 ++-
 tools/ldb_cmd.cc                      |  12 ++-
 util/compression.cc                   |  16 ----
 util/compression.h                    |   6 --
 util/simple_mixed_compressor.h        | 130 ++++++++++++++++++++++++++
 9 files changed, 281 insertions(+), 32 deletions(-)
 create mode 100644 util/simple_mixed_compressor.h

diff --git a/db/db_test2.cc b/db/db_test2.cc
index 7a056de9cc49..1f325d7433e5 100644
--- a/db/db_test2.cc
+++ b/db/db_test2.cc
@@ -30,6 +30,7 @@
 #include "test_util/testutil.h"
 #include "util/defer.h"
 #include "util/random.h"
+#include "util/simple_mixed_compressor.h"
 #include "utilities/fault_injection_env.h"
 
 namespace ROCKSDB_NAMESPACE {
@@ -1883,6 +1884,110 @@ TEST_F(DBTest2, CompressionOptions) {
   }
 }
 
+TEST_F(DBTest2, RoundRobinManager) {
+  if (ZSTD_Supported()) {
+    auto mgr = std::make_shared<RoundRobinManager>(
+        GetDefaultBuiltinCompressionManager());
+
+    for (CompressionType type : {kZSTD}) {
+      std::vector<std::string> values;
+      for (bool use_wrapper : {true}) {
+        SCOPED_TRACE("Compression type: " + std::to_string(type) +
+                     (use_wrapper ? " with " : " no ") + "wrapper");
+
+        Options options = CurrentOptions();
+        options.compression = type;
+        options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+        options.statistics->set_stats_level(StatsLevel::kExceptTimeForMutex);
+        BlockBasedTableOptions bbto;
+        bbto.enable_index_compression = false;
+        options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+        options.compression_manager = use_wrapper ? mgr : nullptr;
+        DestroyAndReopen(options);
+
+        Random rnd(301);
+        constexpr int kCount = 13;
+
+        // Highly compressible blocks, except 1 non-compressible. Half of the
+        // compressible are morked for bypass and 1 marked for rejection. Values
+        // are large enough to ensure just 1 k-v per block.
+        for (int i = 0; i < kCount; ++i) {
+          std::string value;
+          if (i == 6) {
+            // One non-compressible block
+            value = rnd.RandomBinaryString(20000);
+          } else {
+            test::CompressibleString(&rnd, 0.1, 20000, &value);
+          }
+          values.push_back(value);
+          ASSERT_OK(Put(Key(i), value));
+          ASSERT_EQ(Get(Key(i)), value);
+        }
+        ASSERT_OK(Flush());
+
+        // Ensure well-formed for reads
+        for (int i = 0; i < kCount; ++i) {
+          ASSERT_NE(Get(Key(i)), "NOT_FOUND");
+          ASSERT_EQ(Get(Key(i)), values[i]);
+        }
+        ASSERT_EQ(Get(Key(kCount)), "NOT_FOUND");
+      }
+    }
+  }
+}
+
+TEST_F(DBTest2, SimpleMixedCompressionManager) {
+  if (ZSTD_Supported()) {
+    auto mgr = std::make_shared<SimpleMixedCompressionManager>(
+        GetDefaultBuiltinCompressionManager());
+    // Currently mixedmanager only supports with preffered compression manager
+    // zstd
+    for (CompressionType type : {kZSTD}) {
+      std::vector<std::string> values;
+      for (bool use_wrapper : {true}) {
+        SCOPED_TRACE("Compression type: " + std::to_string(type) +
+                     (use_wrapper ? " with " : " no ") + "wrapper");
+
+        Options options = CurrentOptions();
+        options.compression = type;
+        options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+        options.statistics->set_stats_level(StatsLevel::kExceptTimeForMutex);
+        BlockBasedTableOptions bbto;
+        bbto.enable_index_compression = false;
+        options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+        options.compression_manager = use_wrapper ? mgr : nullptr;
+        DestroyAndReopen(options);
+
+        Random rnd(301);
+        constexpr int kCount = 13;
+
+        // Highly compressible blocks, except 1 non-compressible. Half of the
+        // compressible are morked for bypass and 1 marked for rejection. Values
+        // are large enough to ensure just 1 k-v per block.
+        for (int i = 0; i < kCount; ++i) {
+          std::string value;
+          if (i == 6) {
+            // One non-compressible block
+            value = rnd.RandomBinaryString(20000);
+          } else {
+            test::CompressibleString(&rnd, 0.1, 20000, &value);
+          }
+          values.push_back(value);
+          ASSERT_OK(Put(Key(i), value));
+          ASSERT_EQ(Get(Key(i)), value);
+        }
+        ASSERT_OK(Flush());
+
+        // Ensure well-formed for reads
+        for (int i = 0; i < kCount; ++i) {
+          ASSERT_NE(Get(Key(i)), "NOT_FOUND");
+          ASSERT_EQ(Get(Key(i)), values[i]);
+        }
+        ASSERT_EQ(Get(Key(kCount)), "NOT_FOUND");
+      }
+    }
+  }
+}
 TEST_F(DBTest2, CompressionManagerWrapper) {
   // Test that we can use a custom CompressionManager to wrap the built-in
   // CompressionManager, thus adopting a custom *strategy* based on existing
diff --git a/db_stress_tool/db_stress_common.h b/db_stress_tool/db_stress_common.h
index 6b09e66a8fa0..f71d75f7ab05 100644
--- a/db_stress_tool/db_stress_common.h
+++ b/db_stress_tool/db_stress_common.h
@@ -398,6 +398,7 @@ DECLARE_bool(use_adaptive_mutex_lru);
 DECLARE_uint32(compress_format_version);
 DECLARE_uint64(manifest_preallocation_size);
 DECLARE_bool(enable_checksum_handoff);
+DECLARE_string(compression_manager);
 DECLARE_uint64(max_total_wal_size);
 DECLARE_double(high_pri_pool_ratio);
 DECLARE_double(low_pri_pool_ratio);
diff --git a/db_stress_tool/db_stress_gflags.cc b/db_stress_tool/db_stress_gflags.cc
index b95111932349..efad640bfa3f 100644
--- a/db_stress_tool/db_stress_gflags.cc
+++ b/db_stress_tool/db_stress_gflags.cc
@@ -467,7 +467,9 @@ DEFINE_uint64(blob_file_size,
 DEFINE_string(blob_compression_type, "none",
               "[Integrated BlobDB] The compression algorithm to use for large "
               "values stored in blob files.");
-
+DEFINE_string(compression_manager, "mixed",
+              "Ability to change compression manager specified in "
+              "simple_mixed_manager.h (mixed -> roundRobin)");
 DEFINE_bool(enable_blob_garbage_collection,
             ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions()
                 .enable_blob_garbage_collection,
diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc
index fb9fb0f744c3..9d99cbdb9ff1 100644
--- a/db_stress_tool/db_stress_test_base.cc
+++ b/db_stress_tool/db_stress_test_base.cc
@@ -34,6 +34,7 @@
 #include "rocksdb/utilities/write_batch_with_index.h"
 #include "test_util/testutil.h"
 #include "util/cast_util.h"
+#include "util/simple_mixed_compressor.h"
 #include "utilities/backup/backup_engine_impl.h"
 #include "utilities/fault_injection_fs.h"
 #include "utilities/fault_injection_secondary_cache.h"
@@ -3411,7 +3412,28 @@ void StressTest::Open(SharedState* shared, bool reopen) {
     InitializeOptionsFromFlags(cache_, filter_policy_, options_);
   }
   InitializeOptionsGeneral(cache_, filter_policy_, sqfc_factory_, options_);
-
+  if (!strcasecmp(FLAGS_compression_manager.c_str(), "mixed")) {
+    // Currently limited to ZSTD compression. Table property compression_name
+    // needs to set to zstd for now even when there can be more than one
+    // algorithm in the table under your compressor.
+    options_.compression = kZSTD;
+    options_.bottommost_compression = kZSTD;
+    if (!ZSTD_Supported()) {
+      fprintf(stderr,
+              "ZSTD compression not supported thus mixed compression cannot be "
+              "used\n");
+      exit(1);
+    }
+    auto mgr = std::make_shared<RoundRobinManager>(
+        GetDefaultBuiltinCompressionManager());
+    options_.compression_manager = mgr;
+  } else if (!strcasecmp(FLAGS_compression_manager.c_str(), "none")) {
+    // Nothing to do using default compression manager
+  } else {
+    fprintf(stderr, "Unknown compression manager: %s\n",
+            FLAGS_compression_manager.c_str());
+    exit(1);
+  }
   if (FLAGS_prefix_size == 0 && FLAGS_rep_factory == kHashSkipList) {
     fprintf(stderr,
             "prefeix_size cannot be zero if memtablerep == prefix_hash\n");
diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index 9653e24a5d52..f679b0e865e1 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -347,7 +347,9 @@
     "memtable_op_scan_flush_trigger": lambda: random.choice([0, 10, 100, 1000]),
     "memtable_avg_op_scan_flush_trigger": lambda: random.choice([0, 2, 20, 200]),
     "ingest_wbwi_one_in": lambda: random.choice([0, 0, 100, 500]),
+    "compression_manager": lambda: random.choice(["mixed", "none"]),
 }
+
 _TEST_DIR_ENV_VAR = "TEST_TMPDIR"
 # If TEST_TMPDIR_EXPECTED is not specified, default value will be TEST_TMPDIR
 _TEST_EXPECTED_DIR_ENV_VAR = "TEST_TMPDIR_EXPECTED"
@@ -995,10 +997,17 @@ def finalize_and_sanitize(src_params):
             # have to disable metadata write fault injection to other file
             dest_params["exclude_wal_from_write_fault_injection"] = 1
             dest_params["metadata_write_fault_one_in"] = 0
-    # Enabling block_align with compression is not supported
-    if dest_params.get("block_align") == 1:
-        dest_params["compression_type"] = "none"
+    # Disabling block align if mixed manager is neing used
+    if dest_params.get("compression_manager") == "mixed":
+        if dest_params.get("block_align") == 1:
+            dest_params["block_align"] = 0
+        dest_params["compression_type"] = "zstd"
         dest_params["bottommost_compression_type"] = "none"
+    else:
+        # Enabling block_align with compression is not supported
+        if dest_params.get("block_align") == 1:
+            dest_params["compression_type"] = "none"
+            dest_params["bottommost_compression_type"] = "none"
     # If periodic_compaction_seconds is not set, daily_offpeak_time_utc doesn't do anything
     if dest_params.get("periodic_compaction_seconds") == 0:
         dest_params["daily_offpeak_time_utc"] = ""
diff --git a/tools/ldb_cmd.cc b/tools/ldb_cmd.cc
index 68be8c8618fd..46b0f4b0b9e3 100644
--- a/tools/ldb_cmd.cc
+++ b/tools/ldb_cmd.cc
@@ -44,6 +44,7 @@
 #include "util/cast_util.h"
 #include "util/coding.h"
 #include "util/file_checksum_helper.h"
+#include "util/simple_mixed_compressor.h"
 #include "util/stderr_logger.h"
 #include "util/string_util.h"
 #include "util/write_batch_util.h"
@@ -867,11 +868,12 @@ bool LDBCommand::ParseCompressionTypeOption(
             "No compressions are supported in this build for \"mixed\".");
         return false;
       }
-      // A temporary hack to generate an SST file with a mix of compression
-      // types, as this has been *de facto* supported for a long time on the
-      // read side with no code to generate them on the write side. We can test
-      // that functionality, e.g. in check_format_compatible.sh, with this hack
-      g_hack_mixed_compression.StoreRelaxed(1);
+      options_.compression = kZSTD;
+      options_.bottommost_compression = kZSTD;
+      auto mgr = std::make_shared<RoundRobinManager>(
+          GetDefaultBuiltinCompressionManager());
+      options_.compression_manager = mgr;
+
       // Need to list zstd in the compression_name table property if it's
       // potentially used by being in the mix (i.e., potentially at least one
       // data block in the table is compressed by zstd). This ensures proper
diff --git a/util/compression.cc b/util/compression.cc
index a91dcc798e23..998c45fdaa4c 100644
--- a/util/compression.cc
+++ b/util/compression.cc
@@ -254,7 +254,6 @@ class BuiltinCompressorV2 : public Compressor {
   void ReleaseWorkingArea(WorkingArea* wa) override {
     delete static_cast<CompressionContext*>(wa);
   }
-
   Status CompressBlock(Slice uncompressed_data, std::string* compressed_output,
                        CompressionType* out_compression_type,
                        ManagedWorkingArea* wa) override {
@@ -264,17 +263,6 @@ class BuiltinCompressorV2 : public Compressor {
       ctx = static_cast<CompressionContext*>(wa->get());
     }
     CompressionType type = type_;
-#ifndef NDEBUG
-    if (type != kNoCompression && g_hack_mixed_compression.LoadRelaxed() > 0U) {
-      // To assert that if zstd is in the mix, the compression_name table
-      // property (which comes from `type_`) needs to be set to kZSTD, for
-      // proper handling of context and dictionaries.
-      assert(!ZSTD_Supported() || type == kZSTD);
-      const auto& compressions = GetSupportedCompressions();
-      auto counter = g_hack_mixed_compression.FetchAddRelaxed(1);
-      type = compressions[counter % compressions.size()];
-    }
-#endif  // !NDEBUG
     if (ctx == nullptr) {
       tmp_ctx.emplace(type, opts_);
       ctx = &*tmp_ctx;
@@ -821,7 +809,6 @@ Status BuiltinDecompressorV2OptimizeZstd::MaybeCloneForDict(
       serialized_dict);
   return Status::OK();
 }
-
 class BuiltinCompressionManagerV2 : public CompressionManager {
  public:
   BuiltinCompressionManagerV2() = default;
@@ -954,7 +941,4 @@ GetDefaultBuiltinCompressionManager() {
 // END built-in implementation of customization interface
 // ***********************************************************************
 
-#ifndef NDEBUG
-RelaxedAtomic<uint64_t> g_hack_mixed_compression{0};
-#endif  // !NDEBUG
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/util/compression.h b/util/compression.h
index 863971b3945e..8f975b2eef0f 100644
--- a/util/compression.h
+++ b/util/compression.h
@@ -1988,10 +1988,4 @@ class ZSTDStreamingUncompress final : public StreamingUncompress {
 #endif
 };
 
-#ifndef NDEBUG
-// 0 == disable the hack
-// > 0 => counter for rotating through compression types
-extern RelaxedAtomic<uint64_t> g_hack_mixed_compression;
-#endif
-
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/util/simple_mixed_compressor.h b/util/simple_mixed_compressor.h
new file mode 100644
index 000000000000..79debc000299
--- /dev/null
+++ b/util/simple_mixed_compressor.h
@@ -0,0 +1,130 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+#pragma once
+#include <random>
+
+#include "compression.h"
+#include "options/options_helper.h"
+#include "rocksdb/advanced_compression.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class MultiCompressorWrapper : public Compressor {
+ public:
+  explicit MultiCompressorWrapper(const CompressionOptions& opts,
+                                  CompressionType type,
+                                  CompressionDict&& dict = {}) {
+    assert(type != kNoCompression);
+    assert(type == kZSTD);
+    auto builtInManager = GetDefaultBuiltinCompressionManager();
+    const auto& compressions = GetSupportedCompressions();
+    for (auto type_ : compressions) {
+      if (type_ == kNoCompression) {  // Avoid no compression
+        continue;
+      }
+      compressors_.push_back(builtInManager->GetCompressor(opts, type_));
+    }
+    (void)dict;
+    (void)type;
+  }
+  size_t GetMaxSampleSizeIfWantDict(CacheEntryRole block_type) const override {
+    return compressors_.back()->GetMaxSampleSizeIfWantDict(block_type);
+  }
+
+  Slice GetSerializedDict() const override {
+    return compressors_.back()->GetSerializedDict();
+  }
+
+  CompressionType GetPreferredCompressionType() const override { return kZSTD; }
+
+  ManagedWorkingArea ObtainWorkingArea() override {
+    return compressors_.back()->ObtainWorkingArea();
+  }
+  virtual std::unique_ptr<Compressor> MaybeCloneSpecialized(
+      CacheEntryRole block_type, DictSampleArgs&& dict_samples) override {
+    return compressors_.back()->MaybeCloneSpecialized(block_type,
+                                                      std::move(dict_samples));
+  }
+
+ protected:
+  std::vector<std::unique_ptr<Compressor>> compressors_;
+
+ private:
+  mutable std::mutex mutex_;  // Protects access to current_index_
+};
+struct SimpleMixedCompressor : public MultiCompressorWrapper {
+  using MultiCompressorWrapper::MultiCompressorWrapper;
+  Status CompressBlock(Slice uncompressed_data, std::string* compressed_output,
+                       CompressionType* out_compression_type,
+                       ManagedWorkingArea* wa) override {
+    const auto& compressions = GetSupportedCompressions();
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::uniform_int_distribution<> dis(
+        1, (int)compressions.size() - 2);  // avoiding no compression and zstd
+    auto selected = dis(gen);
+    auto& compressor = compressors_[selected % compressors_.size()];
+    // fprintf(stdout, "[MultiCompressorWrapper] selected compressor
+    // typeint:%d\n",
+    //         selected);
+    Status status = compressor->CompressBlock(
+        uncompressed_data, compressed_output, out_compression_type, wa);
+    return status;
+  }
+};
+
+class SimpleMixedCompressionManager : public CompressionManagerWrapper {
+  using CompressionManagerWrapper::CompressionManagerWrapper;
+  const char* Name() const override { return wrapped_->Name(); }
+  std::unique_ptr<Compressor> GetCompressorForSST(
+      const FilterBuildingContext& context, const CompressionOptions& opts,
+      CompressionType preferred) override {
+    assert(preferred == kZSTD);
+    (void)context;
+    return std::make_unique<SimpleMixedCompressor>(opts, preferred);
+  }
+};
+
+struct RoundRobinCompressor : public MultiCompressorWrapper {
+  using MultiCompressorWrapper::MultiCompressorWrapper;
+  Status CompressBlock(Slice uncompressed_data, std::string* compressed_output,
+                       CompressionType* out_compression_type,
+                       ManagedWorkingArea* wa) override {
+    const auto& compressions = GetSupportedCompressions();
+    auto counter = block_counter.FetchAddRelaxed(1);
+    auto sel_idx = counter % (compressions.size() - 1);
+    auto& compressor = compressors_[sel_idx];
+    // auto type = compressions[sel_idx];
+    // fprintf(stdout,
+    //         "[CompressorWrapper] selected compression algo: %s typeint:%d\n",
+    //         std::to_string(type).c_str(), type);
+    return compressor->CompressBlock(uncompressed_data, compressed_output,
+                                     out_compression_type, wa);
+  }
+  static RelaxedAtomic<uint64_t> block_counter;
+};
+RelaxedAtomic<uint64_t> RoundRobinCompressor::block_counter{0};
+
+class RoundRobinManager : public CompressionManagerWrapper {
+  using CompressionManagerWrapper::CompressionManagerWrapper;
+  const char* Name() const override { return wrapped_->Name(); }
+  std::unique_ptr<Compressor> GetCompressorForSST(
+      const FilterBuildingContext& context, const CompressionOptions& opts,
+      CompressionType preferred) override {
+    assert(preferred == kZSTD);
+    (void)context;
+    // fprintf(stdout,
+    //         "[CompressorWrapper] selected compression algo: %s typeint:%d\n",
+    // void)context;
+    return std::make_unique<RoundRobinCompressor>(opts, preferred);
+  }
+};
+
+}  // namespace ROCKSDB_NAMESPACE

From eaa4f9d23b148bc5b1989fc04bdb0c1f43724798 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Wed, 4 Jun 2025 10:44:17 -0700
Subject: [PATCH 118/500] Fix tests broken by gtest upgrade (#13661)

Summary:
Some tests were failing due to apparent missing include of iomanip. I suspect this was from a gtest upgrade, because in open source, the include iomanip comes from gtest.h. To ensure we maintain compatibility with older gtest as well as the newer one, I pulled the include iomanip out of the in-repo gtest.h. Note that other places in gtest code only instantiate floating-point related templates with `float` and `double` types.

Also, to avoid `make format` being insanely slow on gtest.h, I've excluded third-party from the formatting check.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13661

Test Plan: make check, internal CI, manually ensure formatting check works outside of third-party/

Reviewed By: jaykorean

Differential Revision: D75963897

Pulled By: pdillinger

fbshipit-source-id: ed5737dd456e74068185f1ac5d57046d7509df7a
---
 build_tools/format-diff.sh                    | 11 ++++---
 db/db_io_failure_test.cc                      |  2 ++
 table/table_test.cc                           |  1 +
 .../gtest-1.8.1/fused-src/gtest/gtest-all.cc  | 32 +++++++++++++++++++
 .../gtest-1.8.1/fused-src/gtest/gtest.h       | 24 ++------------
 tools/ldb_cmd_test.cc                         |  1 +
 .../lock/range/range_locking_test.cc          |  1 +
 7 files changed, 46 insertions(+), 26 deletions(-)

diff --git a/build_tools/format-diff.sh b/build_tools/format-diff.sh
index 9dc85496c91b..aa6b634563da 100755
--- a/build_tools/format-diff.sh
+++ b/build_tools/format-diff.sh
@@ -118,6 +118,9 @@ fi
 # fi
 set -e
 
+# Exclude third-party from formatting
+EXCLUDE=':!third-party/'
+
 uncommitted_code=`git diff HEAD`
 
 # If there's no uncommitted changes, we assume user are doing post-commit
@@ -137,11 +140,11 @@ then
   # should be relevant for formatting fixes.
   FORMAT_UPSTREAM_MERGE_BASE="$(git merge-base "$FORMAT_UPSTREAM" HEAD)"
   # Get the differences
-  diffs=$(git diff -U0 "$FORMAT_UPSTREAM_MERGE_BASE" | $CLANG_FORMAT_DIFF -p 1) || true
+  diffs=$(git diff -U0 "$FORMAT_UPSTREAM_MERGE_BASE" -- $EXCLUDE | $CLANG_FORMAT_DIFF -p 1) || true
   echo "Checking format of changes not yet in $FORMAT_UPSTREAM..."
 else
   # Check the format of uncommitted lines,
-  diffs=$(git diff -U0 HEAD | $CLANG_FORMAT_DIFF -p 1) || true
+  diffs=$(git diff -U0 HEAD -- $EXCLUDE | $CLANG_FORMAT_DIFF -p 1) || true
   echo "Checking format of uncommitted changes..."
 fi
 
@@ -187,9 +190,9 @@ fi
 # Do in-place format adjustment.
 if [ -z "$uncommitted_code" ]
 then
-  git diff -U0 "$FORMAT_UPSTREAM_MERGE_BASE" | $CLANG_FORMAT_DIFF -i -p 1
+  git diff -U0 "$FORMAT_UPSTREAM_MERGE_BASE" -- $EXCLUDE | $CLANG_FORMAT_DIFF -i -p 1
 else
-  git diff -U0 HEAD | $CLANG_FORMAT_DIFF -i -p 1
+  git diff -U0 HEAD -- $EXCLUDE | $CLANG_FORMAT_DIFF -i -p 1
 fi
 echo "Files reformatted!"
 
diff --git a/db/db_io_failure_test.cc b/db/db_io_failure_test.cc
index ecef6e860aba..4021ea73d30a 100644
--- a/db/db_io_failure_test.cc
+++ b/db/db_io_failure_test.cc
@@ -7,6 +7,8 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
+#include <iomanip>
+
 #include "db/db_test_util.h"
 #include "port/stack_trace.h"
 #include "test_util/testutil.h"
diff --git a/table/table_test.cc b/table/table_test.cc
index 17b5bbfc36d6..302ff718588e 100644
--- a/table/table_test.cc
+++ b/table/table_test.cc
@@ -14,6 +14,7 @@
 #include <algorithm>
 #include <cstddef>
 #include <cstdio>
+#include <iomanip>
 #include <iostream>
 #include <map>
 #include <memory>
diff --git a/third-party/gtest-1.8.1/fused-src/gtest/gtest-all.cc b/third-party/gtest-1.8.1/fused-src/gtest/gtest-all.cc
index b19c9f2a8115..f3c10c469daf 100644
--- a/third-party/gtest-1.8.1/fused-src/gtest/gtest-all.cc
+++ b/third-party/gtest-1.8.1/fused-src/gtest/gtest-all.cc
@@ -477,6 +477,38 @@ GTEST_DECLARE_bool_(death_test_use_fork);
 
 namespace internal {
 
+template <typename RawType>
+AssertionResult CmpHelperFloatingPointEQ(const char* lhs_expression,
+                                         const char* rhs_expression,
+                                         RawType lhs_value, RawType rhs_value) {
+  const FloatingPoint<RawType> lhs(lhs_value), rhs(rhs_value);
+
+  if (lhs.AlmostEquals(rhs)) {
+    return AssertionSuccess();
+  }
+
+  ::std::stringstream lhs_ss;
+  lhs_ss << std::setprecision(std::numeric_limits<RawType>::digits10 + 2)
+         << lhs_value;
+
+  ::std::stringstream rhs_ss;
+  rhs_ss << std::setprecision(std::numeric_limits<RawType>::digits10 + 2)
+         << rhs_value;
+
+  return EqFailure(lhs_expression, rhs_expression,
+                   StringStreamToString(&lhs_ss), StringStreamToString(&rhs_ss),
+                   false);
+}
+
+template
+AssertionResult CmpHelperFloatingPointEQ(const char* lhs_expression,
+                                         const char* rhs_expression,
+                                         float lhs_value, float rhs_value);
+template
+AssertionResult CmpHelperFloatingPointEQ(const char* lhs_expression,
+                                         const char* rhs_expression,
+                                         double lhs_value, double rhs_value);
+
 // The value of GetTestTypeId() as seen from within the Google Test
 // library.  This is solely for testing GetTestTypeId().
 GTEST_API_ extern const TypeId kTestTypeIdInGoogleTest;
diff --git a/third-party/gtest-1.8.1/fused-src/gtest/gtest.h b/third-party/gtest-1.8.1/fused-src/gtest/gtest.h
index 2d82d8e4d0b1..f6e3fabed005 100644
--- a/third-party/gtest-1.8.1/fused-src/gtest/gtest.h
+++ b/third-party/gtest-1.8.1/fused-src/gtest/gtest.h
@@ -3973,7 +3973,7 @@ const char* StringFromGTestEnv(const char* flag, const char* default_val);
 #include <ctype.h>
 #include <float.h>
 #include <string.h>
-#include <iomanip>
+// #include <iomanip> // Not included in newer versions of gtest
 #include <limits>
 #include <map>
 #include <set>
@@ -21451,27 +21451,7 @@ template <typename RawType>
 AssertionResult CmpHelperFloatingPointEQ(const char* lhs_expression,
                                          const char* rhs_expression,
                                          RawType lhs_value,
-                                         RawType rhs_value) {
-  const FloatingPoint<RawType> lhs(lhs_value), rhs(rhs_value);
-
-  if (lhs.AlmostEquals(rhs)) {
-    return AssertionSuccess();
-  }
-
-  ::std::stringstream lhs_ss;
-  lhs_ss << std::setprecision(std::numeric_limits<RawType>::digits10 + 2)
-         << lhs_value;
-
-  ::std::stringstream rhs_ss;
-  rhs_ss << std::setprecision(std::numeric_limits<RawType>::digits10 + 2)
-         << rhs_value;
-
-  return EqFailure(lhs_expression,
-                   rhs_expression,
-                   StringStreamToString(&lhs_ss),
-                   StringStreamToString(&rhs_ss),
-                   false);
-}
+                                         RawType rhs_value);
 
 // Helper function for implementing ASSERT_NEAR.
 //
diff --git a/tools/ldb_cmd_test.cc b/tools/ldb_cmd_test.cc
index 711a313db678..5715f93db337 100644
--- a/tools/ldb_cmd_test.cc
+++ b/tools/ldb_cmd_test.cc
@@ -6,6 +6,7 @@
 #include "rocksdb/utilities/ldb_cmd.h"
 
 #include <cinttypes>
+#include <iomanip>
 
 #include "db/db_test_util.h"
 #include "db/version_edit.h"
diff --git a/utilities/transactions/lock/range/range_locking_test.cc b/utilities/transactions/lock/range/range_locking_test.cc
index 961a5a11ae9c..45531910d159 100644
--- a/utilities/transactions/lock/range/range_locking_test.cc
+++ b/utilities/transactions/lock/range/range_locking_test.cc
@@ -7,6 +7,7 @@
 
 #include <algorithm>
 #include <functional>
+#include <iomanip>
 #include <string>
 #include <thread>
 

From 7d80ea45442e84c25669db61cb7376ba0cd10ba5 Mon Sep 17 00:00:00 2001
From: anand76 <anand1976@users.noreply.github.com>
Date: Wed, 4 Jun 2025 17:46:56 -0700
Subject: [PATCH 119/500] Fix iterator errors for CFs with
 disallow_memtable_writes (#13663)

Summary:
Iterator seek returns "SeekAndValidate() not implemented" error if the disallow_memtable_writes CF option is set along with paranoid_memory_checks. The fix is to sanitize the paranoid_memory_checks option to false, which should be safe since the memtable is guaranteed to be empty.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13663

Test Plan: Update unit test in db_basic_test.cc

Reviewed By: pdillinger

Differential Revision: D75973515

Pulled By: anand1976

fbshipit-source-id: 3f381f19dcda72e3b78ee375f755fb4809c6b99c
---
 db/db_basic_test.cc                           |  6 ++++++
 memtable/vectorrep.cc                         | 20 +++++++++++++++++++
 ...isallow_memtable_writes_paranoid_checks.md |  1 +
 3 files changed, 27 insertions(+)
 create mode 100644 unreleased_history/bug_fixes/disallow_memtable_writes_paranoid_checks.md

diff --git a/db/db_basic_test.cc b/db/db_basic_test.cc
index 55323d29de8d..cb7313e090ca 100644
--- a/db/db_basic_test.cc
+++ b/db/db_basic_test.cc
@@ -5086,6 +5086,7 @@ TEST_F(DBBasicTest, DisallowMemtableWrite) {
   options_allow.create_if_missing = true;
   Options options_disallow = options_allow;
   options_disallow.disallow_memtable_writes = true;
+  options_disallow.paranoid_memory_checks = true;
 
   DestroyAndReopen(options_allow);
   // CFs allowing and disallowing memtable write
@@ -5125,6 +5126,11 @@ TEST_F(DBBasicTest, DisallowMemtableWrite) {
   EXPECT_EQ(Get(2, "b2"), "2");
   EXPECT_EQ(Get(3, "b3"), "NOT_FOUND");
 
+  std::unique_ptr<Iterator> iter(
+      dbfull()->NewIterator(ReadOptions(), handles_[3]));
+  iter->Seek("a3");
+  ASSERT_OK(iter->status());
+  iter.reset();
   // When the DB is re-opened with WAL entries for a CF that is newly setting
   // disallow_memtable_writes, we detect that and fail the open gracefully.
   ASSERT_EQ(TryReopenWithColumnFamilies(
diff --git a/memtable/vectorrep.cc b/memtable/vectorrep.cc
index 9b0192cb8e8e..9a50bdc9fba5 100644
--- a/memtable/vectorrep.cc
+++ b/memtable/vectorrep.cc
@@ -79,6 +79,10 @@ class VectorRep : public MemTableRep {
     // Advance to the first entry with a key >= target
     void Seek(const Slice& user_key, const char* memtable_key) override;
 
+    // Seek and do some memory validation
+    Status SeekAndValidate(const Slice& internal_key, const char* memtable_key,
+                           bool allow_data_in_errors) override;
+
     // Advance to the first entry with a key <= target
     void SeekForPrev(const Slice& user_key, const char* memtable_key) override;
 
@@ -221,6 +225,22 @@ void VectorRep::Iterator::Seek(const Slice& user_key,
              .first;
 }
 
+Status VectorRep::Iterator::SeekAndValidate(const Slice& /* internal_key */,
+                                            const char* /* memtable_key */,
+                                            bool /* allow_data_in_errors */) {
+  if (vrep_) {
+    WriteLock l(&vrep_->rwlock_);
+    if (bucket_->begin() == bucket_->end()) {
+      // Memtable is empty
+      return Status::OK();
+    } else {
+      return Status::NotSupported("SeekAndValidate() not implemented");
+    }
+  } else {
+    return Status::NotSupported("SeekAndValidate() not implemented");
+  }
+}
+
 // Advance to the first entry with a key <= target
 void VectorRep::Iterator::SeekForPrev(const Slice& /*user_key*/,
                                       const char* /*memtable_key*/) {
diff --git a/unreleased_history/bug_fixes/disallow_memtable_writes_paranoid_checks.md b/unreleased_history/bug_fixes/disallow_memtable_writes_paranoid_checks.md
new file mode 100644
index 000000000000..d4aea983272b
--- /dev/null
+++ b/unreleased_history/bug_fixes/disallow_memtable_writes_paranoid_checks.md
@@ -0,0 +1 @@
+Fix iterator operations returning NotImplemented status if disallow_memtable_writes and paranoid_memory_checks CF options are both set.

From 20d051a00e27facb276fd57ded209e14e14dc367 Mon Sep 17 00:00:00 2001
From: Sujit Maharjan <sujitmaharjan@meta.com>
Date: Fri, 6 Jun 2025 08:23:03 -0700
Subject: [PATCH 120/500] Support for mixed compression type (round robin) in
 benchmark (#13655)

Summary:
**Summary**
This pull request aims to enhance the functionality of DB bench by introducing ability to use custom compression manager **mixed** that RoundRobin betweens all the compression algorithm within SST blow. The pull request also introduces the **same_value_percentage** that increases the probability of the generate value to be same.

**Verification:**
Manually verified the injection of custom compression manager by setting breakpoint in the debugger.
Verified the effectiveness of the tunable parameter
```bash
#!/bin/bash

# Script to run db_bench with different parameter combinations
# Parameters varied:
# - compression_manager: mixed, none
# - same_value_percentage: 0, 50, 100
#
# To make this script executable, run: chmod +x run_db_bench.sh

# Exit on error
set -e

# Check if db_bench exists
if [ ! -f "./db_bench" ]; then
    echo "Error: db_bench executable not found in current directory"
    exit 1
fi

# Define parameter arrays
compression_managers=("mixed" "none")
same_value_percentages=(0 50 100)

# Create output directory if it doesn't exist
mkdir -p results

# Loop through all combinations
for cm in "${compression_managers[@]}"; do
    for svp in "${same_value_percentages[@]}"; do
        # Define output file
        output_file="results/bench_${cm}_${svp}.log"

        echo "Running with compression_manager=${cm}, same_value_percentage=${svp}"
        # Run db_bench with current parameters
        ./db_bench -db=/dev/shm/dbbench \
            --benchmarks=fillseq \
            -num=10000000 \
            -compaction_style=2 \
            -fifo_compaction_max_table_files_size_mb=1000 \
            -fifo_compaction_allow_compaction=0 \
            -disable_wal \
            -write_buffer_size=12000000 \
            -compression_type=zstd \
            -compression_parallel_threads=1 \
            -compression_manager="${cm}" \
            -same_value_percentage="${svp}" \
            --stats_level=5 \
            --statistics > "${output_file}" 2>&1

        echo "Completed. Results saved to ${output_file}"
    done
done

echo "All benchmarks completed successfully!"

```
**Result**
compression manager | same_value_percentage | compressed byte from | compressed bytes to | ratio | compression time nanos sum | count | avg (compression time)
-- | -- | -- | -- | -- | -- | -- | --
mixed | 0 | 1203147502 | 637471319 | 1.887375112 | 37314989743 | 299756 | 124484.5466
mixed | 50 | 1203412251 | 398088802 | 3.022974384 | 34026215298 | 299846 | 113478.9702
mixed | 100 | 1206024000 | 109625322 | 11.00132686 | 20307741897 | 300557 | 67567.02355
none | 0 | 1209573133 | 559497700 | 2.161891162 | 6379855390 | 301301 | 21174.3585
none | 50 | 1209478701 | 348595024 | 3.469581083 | 4289921941 | 301295 | 14238.2779
none | 100 | 1209380499 | 72681369 | 16.63948431 | 2147469616 | 301303 | 7127.275918

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13655

Reviewed By: hx235

Differential Revision: D76092113

Pulled By: shubhajeet

fbshipit-source-id: 4a4e998650d78bfe1651257cb2f1b97016dcec56
---
 tools/db_bench_tool.cc | 55 +++++++++++++++++++++++++++++++++---------
 1 file changed, 44 insertions(+), 11 deletions(-)

diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc
index bd5ccfef5f0b..d78ea5ac3cdb 100644
--- a/tools/db_bench_tool.cc
+++ b/tools/db_bench_tool.cc
@@ -83,6 +83,7 @@
 #include "util/gflags_compat.h"
 #include "util/mutexlock.h"
 #include "util/random.h"
+#include "util/simple_mixed_compressor.h"
 #include "util/stderr_logger.h"
 #include "util/string_util.h"
 #include "util/xxhash.h"
@@ -92,7 +93,6 @@
 #include "utilities/merge_operators/bytesxor.h"
 #include "utilities/merge_operators/sortlist.h"
 #include "utilities/persistent_cache/block_cache_tier.h"
-
 #ifdef MEMKIND
 #include "memory/memkind_kmem_allocator.h"
 #endif
@@ -596,6 +596,9 @@ static enum ROCKSDB_NAMESPACE::CompressionType
     FLAGS_compressed_secondary_cache_compression_type_e =
         ROCKSDB_NAMESPACE::kLZ4Compression;
 
+DEFINE_string(compression_manager, "none",
+              "Set the compression manager type to mixed(roundrobin) or other "
+              "type. None for BuilInCompressor");
 DEFINE_int32(compressed_secondary_cache_compression_level,
              ROCKSDB_NAMESPACE::CompressionOptions().level,
              "Compression level. The meaning of this value is library-"
@@ -1811,6 +1814,10 @@ DEFINE_bool(track_and_verify_wals_in_manifest, false,
 
 DEFINE_bool(track_and_verify_wals, false, "See Options.track_and_verify_wals");
 
+DEFINE_int32(same_value_percentage, 0,
+             "Percentage of time value will be same i.e good for compression "
+             "of the block");
+
 namespace ROCKSDB_NAMESPACE {
 namespace {
 static Status CreateMemTableRepFactory(
@@ -1931,9 +1938,10 @@ class RandomGenerator {
   std::string data_;
   unsigned int pos_;
   std::unique_ptr<BaseDistribution> dist_;
+  Random rnd;
 
  public:
-  RandomGenerator() {
+  RandomGenerator() : rnd(301) {
     auto max_value_size = FLAGS_value_size_max;
     switch (FLAGS_value_size_distribution_type_e) {
       case kUniform:
@@ -1952,7 +1960,6 @@ class RandomGenerator {
     // We use a limited amount of data over and over again and ensure
     // that it is larger than the compression window (32KB), and also
     // large enough to serve all typical value sizes we want to write.
-    Random rnd(301);
     std::string piece;
     while (data_.size() < (unsigned)std::max(1048576, max_value_size)) {
       // Add a short fragment that is as compressible as specified
@@ -1965,11 +1972,15 @@ class RandomGenerator {
 
   Slice Generate(unsigned int len) {
     assert(len <= data_.size());
-    if (pos_ + len > data_.size()) {
-      pos_ = 0;
+    if (rnd.PercentTrue(FLAGS_same_value_percentage)) {
+      return Slice(data_.data(), len);
+    } else {
+      if (pos_ + len > data_.size()) {
+        pos_ = 0;
+      }
+      pos_ += len;
+      return Slice(data_.data() + pos_ - len, len);
     }
-    pos_ += len;
-    return Slice(data_.data() + pos_ - len, len);
   }
 
   Slice Generate() {
@@ -2884,9 +2895,17 @@ class Benchmark {
       }
 #endif
     }
-
-    auto compression = CompressionTypeToString(FLAGS_compression_type_e);
-    fprintf(stdout, "Compression: %s\n", compression.c_str());
+    // mixed compression  manager expect compression type to be expliciltiy
+    // configured through Options to be zstd
+    auto compression = std::string("zstd");
+    if (!strcasecmp(FLAGS_compression_manager.c_str(), "mixed")) {
+      fprintf(stdout, "Compression manager: mixed\n");
+      fprintf(stdout, "Compression: zstd\n");
+    } else {
+      fprintf(stdout, "Compression manager: none\n");
+      compression = CompressionTypeToString(FLAGS_compression_type_e);
+      fprintf(stdout, "Compression: %s\n", compression.c_str());
+    }
     fprintf(stdout, "Compression sampling rate: %" PRId64 "\n",
             FLAGS_sample_for_compression);
     if (options.memtable_factory != nullptr) {
@@ -4610,7 +4629,21 @@ class Benchmark {
         FLAGS_level0_file_num_compaction_trigger;
     options.level0_slowdown_writes_trigger =
         FLAGS_level0_slowdown_writes_trigger;
-    options.compression = FLAGS_compression_type_e;
+    if (!strcasecmp(FLAGS_compression_manager.c_str(), "mixed")) {
+      // Need to list zstd in the compression_name table property if it's
+      // potentially used by being in the mix (i.e., potentially at least one
+      // data block in the table is compressed by zstd). This ensures proper
+      // context and dictionary handling, and prevents crashes in older RocksDB
+      // versions.
+      options.compression = kZSTD;
+      options.bottommost_compression = kZSTD;
+      auto mgr = std::make_shared<RoundRobinManager>(
+          GetDefaultBuiltinCompressionManager());
+      options.compression_manager = mgr;
+    } else {
+      options.compression = FLAGS_compression_type_e;
+    }
+
     if (FLAGS_simulate_hybrid_fs_file != "") {
       options.last_level_temperature = Temperature::kWarm;
     }

From adb750fdf4078d1088d157a94993fecf6a85bb19 Mon Sep 17 00:00:00 2001
From: Sujit Maharjan <sujitmaharjan@meta.com>
Date: Fri, 6 Jun 2025 09:32:12 -0700
Subject: [PATCH 121/500] Separating into cc and header file for
 simple_mixed_compressor.h (#13665)

Summary:
**Summary**
This pull request fixes the issue of having a single file simple_mixed_compressor.h containing both implementation and declaration. To improve code organization and follow best practices, I have separated the implementation into a new file simple_mixed_compressor.cc and updated the original file to only contain the necessary declarations.

**Testing**
Testing was performed by verifying the stdout output from both RoundRobinCompressor and BuiltInCompressorV2.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13665

Reviewed By: pdillinger

Differential Revision: D76060831

Pulled By: shubhajeet

fbshipit-source-id: c034868be51ea7b89c1a8dd12082b0159f49f588
---
 BUCK                            |   1 +
 CMakeLists.txt                  |   1 +
 src.mk                          |   1 +
 util/simple_mixed_compressor.cc | 110 ++++++++++++++++++++++++++++++++
 util/simple_mixed_compressor.h  | 105 ++++++------------------------
 5 files changed, 134 insertions(+), 84 deletions(-)
 create mode 100644 util/simple_mixed_compressor.cc

diff --git a/BUCK b/BUCK
index 811fcd5a3854..c14cd38883b1 100644
--- a/BUCK
+++ b/BUCK
@@ -267,6 +267,7 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[
         "util/random.cc",
         "util/rate_limiter.cc",
         "util/ribbon_config.cc",
+        "util/simple_mixed_compressor.cc",
         "util/slice.cc",
         "util/status.cc",
         "util/stderr_logger.cc",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 182b4cde1514..b4f5b8fe185b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -879,6 +879,7 @@ set(SOURCES
         util/compaction_job_stats_impl.cc
         util/comparator.cc
         util/compression.cc
+        util/simple_mixed_compressor.cc
         util/compression_context_cache.cc
         util/concurrent_task_limiter_impl.cc
         util/crc32c.cc
diff --git a/src.mk b/src.mk
index a25348b919e9..6fe4b8539cbe 100644
--- a/src.mk
+++ b/src.mk
@@ -256,6 +256,7 @@ LIB_SOURCES =                                                   \
   util/ribbon_config.cc                                         \
   util/slice.cc                                                 \
   util/file_checksum_helper.cc                                  \
+  util/simple_mixed_compressor.cc                               \
   util/status.cc                                                \
   util/stderr_logger.cc                                         \
   util/string_util.cc                                           \
diff --git a/util/simple_mixed_compressor.cc b/util/simple_mixed_compressor.cc
new file mode 100644
index 000000000000..48257a30a7c4
--- /dev/null
+++ b/util/simple_mixed_compressor.cc
@@ -0,0 +1,110 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Creates mixed compressor wrapper which uses multiple compression algorithm
+// within same SST file.
+
+#include "simple_mixed_compressor.h"
+
+#include <options/options_helper.h>
+
+#include "random.h"
+#include "rocksdb/advanced_compression.h"
+namespace ROCKSDB_NAMESPACE {
+
+// MultiCompressorWrapper implementation
+MultiCompressorWrapper::MultiCompressorWrapper(const CompressionOptions& opts,
+                                               CompressionType type,
+                                               CompressionDict&& dict) {
+  assert(type != kNoCompression);
+  assert(type == kZSTD);
+  auto builtInManager = GetDefaultBuiltinCompressionManager();
+  const auto& compressions = GetSupportedCompressions();
+  for (auto type_ : compressions) {
+    if (type_ == kNoCompression) {
+      continue;
+    }
+    compressors_.push_back(builtInManager->GetCompressor(opts, type_));
+  }
+  (void)dict;
+  (void)type;
+}
+
+size_t MultiCompressorWrapper::GetMaxSampleSizeIfWantDict(
+    CacheEntryRole block_type) const {
+  return compressors_.back()->GetMaxSampleSizeIfWantDict(block_type);
+}
+
+Slice MultiCompressorWrapper::GetSerializedDict() const {
+  return compressors_.back()->GetSerializedDict();
+}
+
+CompressionType MultiCompressorWrapper::GetPreferredCompressionType() const {
+  return kZSTD;
+}
+
+Compressor::ManagedWorkingArea MultiCompressorWrapper::ObtainWorkingArea() {
+  return compressors_.back()->ObtainWorkingArea();
+}
+
+std::unique_ptr<Compressor> MultiCompressorWrapper::MaybeCloneSpecialized(
+    CacheEntryRole block_type, DictSampleArgs&& dict_samples) {
+  return compressors_.back()->MaybeCloneSpecialized(block_type,
+                                                    std::move(dict_samples));
+}
+
+// SimpleMixedCompressor implementation
+Status SimpleMixedCompressor::CompressBlock(
+    Slice uncompressed_data, std::string* compressed_output,
+    CompressionType* out_compression_type, ManagedWorkingArea* wa) {
+  auto selected =
+      Random::GetTLSInstance()->Uniform(static_cast<int>(compressors_.size()));
+  auto& compressor = compressors_[selected];
+  return compressor->CompressBlock(uncompressed_data, compressed_output,
+                                   out_compression_type, wa);
+}
+
+// SimpleMixedCompressionManager implementation
+const char* SimpleMixedCompressionManager::Name() const {
+  return wrapped_->Name();
+  // return "SimpleMixedCompressionManager";
+}
+
+std::unique_ptr<Compressor> SimpleMixedCompressionManager::GetCompressorForSST(
+    const FilterBuildingContext& context, const CompressionOptions& opts,
+    CompressionType preferred) {
+  assert(preferred == kZSTD);
+  (void)context;
+  return std::make_unique<SimpleMixedCompressor>(opts, preferred);
+}
+
+// RoundRobinCompressor implementation
+Status RoundRobinCompressor::CompressBlock(
+    Slice uncompressed_data, std::string* compressed_output,
+    CompressionType* out_compression_type, ManagedWorkingArea* wa) {
+  auto counter = block_counter.FetchAddRelaxed(1);
+  auto sel_idx = counter % (compressors_.size());
+  auto& compressor = compressors_[sel_idx];
+  return compressor->CompressBlock(uncompressed_data, compressed_output,
+                                   out_compression_type, wa);
+}
+
+RelaxedAtomic<uint64_t> RoundRobinCompressor::block_counter{0};
+
+// RoundRobinManager implementation
+const char* RoundRobinManager::Name() const {
+  // return "RoundRobinManager";
+  return wrapped_->Name();
+}
+
+std::unique_ptr<Compressor> RoundRobinManager::GetCompressorForSST(
+    const FilterBuildingContext& context, const CompressionOptions& opts,
+    CompressionType preferred) {
+  assert(preferred == kZSTD);
+  (void)context;
+  return std::make_unique<RoundRobinCompressor>(opts, preferred);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/util/simple_mixed_compressor.h b/util/simple_mixed_compressor.h
index 79debc000299..84d67558a4bd 100644
--- a/util/simple_mixed_compressor.h
+++ b/util/simple_mixed_compressor.h
@@ -1,17 +1,17 @@
-// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
 //  This source code is licensed under both the GPLv2 (found in the
 //  COPYING file in the root directory) and Apache 2.0 License
 //  (found in the LICENSE.Apache file in the root directory).
 //
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-//
+// Creates mixed compressor wrapper which uses multiple compression algorithm
+// within same SST file.
+
 #pragma once
-#include <random>
+#include <memory>
+#include <mutex>
+#include <vector>
 
 #include "compression.h"
-#include "options/options_helper.h"
 #include "rocksdb/advanced_compression.h"
 
 namespace ROCKSDB_NAMESPACE {
@@ -20,111 +20,48 @@ class MultiCompressorWrapper : public Compressor {
  public:
   explicit MultiCompressorWrapper(const CompressionOptions& opts,
                                   CompressionType type,
-                                  CompressionDict&& dict = {}) {
-    assert(type != kNoCompression);
-    assert(type == kZSTD);
-    auto builtInManager = GetDefaultBuiltinCompressionManager();
-    const auto& compressions = GetSupportedCompressions();
-    for (auto type_ : compressions) {
-      if (type_ == kNoCompression) {  // Avoid no compression
-        continue;
-      }
-      compressors_.push_back(builtInManager->GetCompressor(opts, type_));
-    }
-    (void)dict;
-    (void)type;
-  }
-  size_t GetMaxSampleSizeIfWantDict(CacheEntryRole block_type) const override {
-    return compressors_.back()->GetMaxSampleSizeIfWantDict(block_type);
-  }
-
-  Slice GetSerializedDict() const override {
-    return compressors_.back()->GetSerializedDict();
-  }
+                                  CompressionDict&& dict = {});
 
-  CompressionType GetPreferredCompressionType() const override { return kZSTD; }
-
-  ManagedWorkingArea ObtainWorkingArea() override {
-    return compressors_.back()->ObtainWorkingArea();
-  }
-  virtual std::unique_ptr<Compressor> MaybeCloneSpecialized(
-      CacheEntryRole block_type, DictSampleArgs&& dict_samples) override {
-    return compressors_.back()->MaybeCloneSpecialized(block_type,
-                                                      std::move(dict_samples));
-  }
+  size_t GetMaxSampleSizeIfWantDict(CacheEntryRole block_type) const override;
+  Slice GetSerializedDict() const override;
+  CompressionType GetPreferredCompressionType() const override;
+  ManagedWorkingArea ObtainWorkingArea() override;
+  std::unique_ptr<Compressor> MaybeCloneSpecialized(
+      CacheEntryRole block_type, DictSampleArgs&& dict_samples) override;
 
  protected:
   std::vector<std::unique_ptr<Compressor>> compressors_;
-
- private:
-  mutable std::mutex mutex_;  // Protects access to current_index_
 };
+
 struct SimpleMixedCompressor : public MultiCompressorWrapper {
   using MultiCompressorWrapper::MultiCompressorWrapper;
   Status CompressBlock(Slice uncompressed_data, std::string* compressed_output,
                        CompressionType* out_compression_type,
-                       ManagedWorkingArea* wa) override {
-    const auto& compressions = GetSupportedCompressions();
-    std::random_device rd;
-    std::mt19937 gen(rd());
-    std::uniform_int_distribution<> dis(
-        1, (int)compressions.size() - 2);  // avoiding no compression and zstd
-    auto selected = dis(gen);
-    auto& compressor = compressors_[selected % compressors_.size()];
-    // fprintf(stdout, "[MultiCompressorWrapper] selected compressor
-    // typeint:%d\n",
-    //         selected);
-    Status status = compressor->CompressBlock(
-        uncompressed_data, compressed_output, out_compression_type, wa);
-    return status;
-  }
+                       ManagedWorkingArea* wa) override;
 };
 
 class SimpleMixedCompressionManager : public CompressionManagerWrapper {
   using CompressionManagerWrapper::CompressionManagerWrapper;
-  const char* Name() const override { return wrapped_->Name(); }
+  const char* Name() const override;
   std::unique_ptr<Compressor> GetCompressorForSST(
       const FilterBuildingContext& context, const CompressionOptions& opts,
-      CompressionType preferred) override {
-    assert(preferred == kZSTD);
-    (void)context;
-    return std::make_unique<SimpleMixedCompressor>(opts, preferred);
-  }
+      CompressionType preferred) override;
 };
 
 struct RoundRobinCompressor : public MultiCompressorWrapper {
   using MultiCompressorWrapper::MultiCompressorWrapper;
   Status CompressBlock(Slice uncompressed_data, std::string* compressed_output,
                        CompressionType* out_compression_type,
-                       ManagedWorkingArea* wa) override {
-    const auto& compressions = GetSupportedCompressions();
-    auto counter = block_counter.FetchAddRelaxed(1);
-    auto sel_idx = counter % (compressions.size() - 1);
-    auto& compressor = compressors_[sel_idx];
-    // auto type = compressions[sel_idx];
-    // fprintf(stdout,
-    //         "[CompressorWrapper] selected compression algo: %s typeint:%d\n",
-    //         std::to_string(type).c_str(), type);
-    return compressor->CompressBlock(uncompressed_data, compressed_output,
-                                     out_compression_type, wa);
-  }
+                       ManagedWorkingArea* wa) override;
   static RelaxedAtomic<uint64_t> block_counter;
 };
-RelaxedAtomic<uint64_t> RoundRobinCompressor::block_counter{0};
 
 class RoundRobinManager : public CompressionManagerWrapper {
   using CompressionManagerWrapper::CompressionManagerWrapper;
-  const char* Name() const override { return wrapped_->Name(); }
+  const char* Name() const override;
   std::unique_ptr<Compressor> GetCompressorForSST(
       const FilterBuildingContext& context, const CompressionOptions& opts,
-      CompressionType preferred) override {
-    assert(preferred == kZSTD);
-    (void)context;
-    // fprintf(stdout,
-    //         "[CompressorWrapper] selected compression algo: %s typeint:%d\n",
-    // void)context;
-    return std::make_unique<RoundRobinCompressor>(opts, preferred);
-  }
+      CompressionType preferred) override;
 };
 
 }  // namespace ROCKSDB_NAMESPACE

From de376be2ba3c9f20de5eecfdaae2bb028893a17a Mon Sep 17 00:00:00 2001
From: Hui Xiao <huixiao@fb.com>
Date: Fri, 6 Jun 2025 12:40:45 -0700
Subject: [PATCH 122/500] Simplify
 RoundRobinSubcompactionsAgainstResources.SubcompactionsUsingResources test
 (#13672)

Summary:
**Context/Summary:**
`RoundRobinSubcompactionsAgainstResources.SubcompactionsUsingResources` has been flaky and difficult to de-flake. One of the reasons is the complicated usage of sync points and unnecessarily strict verification.
- The sync points don't seem necessary to verify the number of extra reserved threads for sub-compactions so are removed.
- The full reservation after compaction to verify extra reserved threads were release is indirect and hard to get right. So it's replaced with simpler sync-point callback check.
    - Since we already have tests (see https://github.com/facebook/rocksdb/blob/7d80ea45442e84c25669db61cb7376ba0cd10ba5/env/env_test.cc#L841 and )for testing pure functionality of reserve/release does reserve/release the threads, verifying the relevant code paths are called should be enough to verify extra reserved threads were released after compaction

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13672

Test Plan: Monitor future flakiness.

Reviewed By: cbi42

Differential Revision: D76108242

Pulled By: hx235

fbshipit-source-id: 30113f16455688f113f296bda0098a66a7a198a3
---
 db/compaction/compaction_job.cc |  3 +--
 db/db_compaction_test.cc        | 48 ++++++++++++++-------------------
 2 files changed, 21 insertions(+), 30 deletions(-)

diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc
index 1f521f7d2dd0..f2a36907de42 100644
--- a/db/compaction/compaction_job.cc
+++ b/db/compaction/compaction_job.cc
@@ -839,8 +839,7 @@ Status CompactionJob::Run() {
   }
 
   ReleaseSubcompactionResources();
-  TEST_SYNC_POINT("CompactionJob::ReleaseSubcompactionResources:0");
-  TEST_SYNC_POINT("CompactionJob::ReleaseSubcompactionResources:1");
+  TEST_SYNC_POINT("CompactionJob::ReleaseSubcompactionResources");
 
   for (const auto& state : compact_->sub_compact_states) {
     for (const auto& output : state.GetOutputs()) {
diff --git a/db/db_compaction_test.cc b/db/db_compaction_test.cc
index 98536960c8a9..ba4edeffa2da 100644
--- a/db/db_compaction_test.cc
+++ b/db/db_compaction_test.cc
@@ -6634,7 +6634,7 @@ TEST_P(RoundRobinSubcompactionsAgainstResources, SubcompactionsUsingResources) {
   // compaction is enough to make post-compaction L1 size less than
   // the maximum size (this test assumes only one round-robin compaction
   // is triggered by kLevelMaxLevelSize)
-  options.max_compaction_bytes = 100000000;
+  options.max_compaction_bytes = std::numeric_limits<uint64_t>::max();
 
   DestroyAndReopen(options);
   env_->SetBackgroundThreads(total_low_pri_threads_, Env::LOW);
@@ -6667,41 +6667,33 @@ TEST_P(RoundRobinSubcompactionsAgainstResources, SubcompactionsUsingResources) {
         // More than 10 files are selected for round-robin under auto
         // compaction. The number of planned subcompaction is restricted by
         // the minimum number between available threads and compaction limits
-        ASSERT_EQ(num_planned_subcompactions - options.max_subcompactions,
-                  std::min(total_low_pri_threads_, max_compaction_limits_) - 1);
+        auto actual_reserved_threads =
+            num_planned_subcompactions - options.max_subcompactions;
+        auto expected_reserved_threads =
+            std::min(total_low_pri_threads_, max_compaction_limits_) - 1;
+        ASSERT_EQ(actual_reserved_threads, expected_reserved_threads);
         num_planned_subcompactions_verified = true;
       });
-  SyncPoint::GetInstance()->LoadDependency(
-      {{"RoundRobinSubcompactionsAgainstResources:0",
-        "BackgroundCallCompaction:0"},
-       {"CompactionJob::AcquireSubcompactionResources:0",
-        "RoundRobinSubcompactionsAgainstResources:1"},
-       {"RoundRobinSubcompactionsAgainstResources:2",
-        "CompactionJob::AcquireSubcompactionResources:1"},
-       {"CompactionJob::ReleaseSubcompactionResources:0",
-        "RoundRobinSubcompactionsAgainstResources:3"},
-       {"RoundRobinSubcompactionsAgainstResources:4",
-        "CompactionJob::ReleaseSubcompactionResources:1"}});
+
+  int acquire_count = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "CompactionJob::AcquireSubcompactionResources:0",
+      [&](void* /*arg*/) { acquire_count++; });
+  int release_count = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "CompactionJob::ReleaseSubcompactionResources",
+      [&](void* /*arg*/) { release_count++; });
+
   SyncPoint::GetInstance()->EnableProcessing();
 
-  ASSERT_OK(dbfull()->TEST_WaitForCompact());
-  ASSERT_OK(dbfull()->EnableAutoCompaction({dbfull()->DefaultColumnFamily()}));
-  TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstResources:0");
-  TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstResources:1");
   auto pressure_token =
       dbfull()->TEST_write_controler().GetCompactionPressureToken();
-
-  TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstResources:2");
-  TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstResources:3");
-  // We can reserve more threads now except one is being used
-  ASSERT_EQ(total_low_pri_threads_ - 1,
-            env_->ReserveThreads(total_low_pri_threads_, Env::Priority::LOW));
-  ASSERT_EQ(
-      total_low_pri_threads_ - 1,
-      env_->ReleaseThreads(total_low_pri_threads_ - 1, Env::Priority::LOW));
-  TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstResources:4");
+  ASSERT_OK(dbfull()->EnableAutoCompaction({dbfull()->DefaultColumnFamily()}));
   ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
   ASSERT_TRUE(num_planned_subcompactions_verified);
+  ASSERT_EQ(acquire_count, release_count);
+
   SyncPoint::GetInstance()->DisableProcessing();
   SyncPoint::GetInstance()->ClearAllCallBacks();
 }

From 6403642c02831d424b3b1a762b131373b312ede6 Mon Sep 17 00:00:00 2001
From: Ryan4253 <ryan.liao0305@gmail.com>
Date: Tue, 10 Jun 2025 19:03:29 -0700
Subject: [PATCH 123/500] Add missing fields in BuildSubcompactionJobInfo
 (#13667) (#13668)

Summary:
As title, BuildSubcompactionJobInfo doesn't update compaction_reason, compression, and blob_compression_type. event listeners depend on this information.

## Verification
Ran the code on [kvrocks](https://github.com/apache/kvrocks/tree/unstable) which implements event hooks when subcompaction happens.

Before:
```
[2025-06-03T10:31:33.660798-04:00][I][event_listener.cc:119] [event_listener/subcompaction_begin] column family: metadata, job_id: 7, compaction reason: Unknown, output compression type: no
```

After:
```
[2025-06-03T10:31:33.660798-04:00][I][event_listener.cc:119] [event_listener/subcompaction_begin] column family: metadata, job_id: 7, compaction reason: LevelL0FilesNum, output compression type: no
```

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13668

Reviewed By: virajthakur

Differential Revision: D76173031

Pulled By: hx235

fbshipit-source-id: 3ec8f5b0cbd73b75d4dc98ca788b07c31a590b4d
---
 db/compaction/subcompaction_state.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/db/compaction/subcompaction_state.h b/db/compaction/subcompaction_state.h
index fba41c974318..14e11bcf2452 100644
--- a/db/compaction/subcompaction_state.h
+++ b/db/compaction/subcompaction_state.h
@@ -106,7 +106,11 @@ class SubcompactionState {
     subcompaction_job_info.subcompaction_job_id = static_cast<int>(sub_job_id);
     subcompaction_job_info.base_input_level = c->start_level();
     subcompaction_job_info.output_level = c->output_level();
+    subcompaction_job_info.compaction_reason = c->compaction_reason();
+    subcompaction_job_info.compression = c->output_compression();
     subcompaction_job_info.stats = compaction_job_stats;
+    subcompaction_job_info.blob_compression_type =
+        c->mutable_cf_options().blob_compression_type;
   }
 
   SubcompactionState() = delete;

From ab1fb6cf8edaedf581b8c4d868a4d448762c284e Mon Sep 17 00:00:00 2001
From: jeffzfzheng <jeffzfzheng@tencent.com>
Date: Tue, 10 Jun 2025 19:03:53 -0700
Subject: [PATCH 124/500] Fix overflow of data_size in
 WritableFileWriter::WriteBufferedWithChecksum (#13641)

Summary:
In the function WritableFileWriter::WriteBufferedWithChecksum, since the alignment parameter passed to RequestToken defaults to 4096, when data_size is less than 4096, subtracting a larger value from data_size (which is of type unsigned long) will cause an underflow. This results in an infinite loop. Since WriteBuffered does not require alignment, it is sufficient to pass alignment == 0.

issue:https://github.com/facebook/rocksdb/issues/13640

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13641

Reviewed By: jaykorean

Differential Revision: D76341973

Pulled By: hx235

fbshipit-source-id: 8912f2b6598bb5a48b6b813c53146d9ecfd31d30
---
 file/writable_file_writer.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/file/writable_file_writer.cc b/file/writable_file_writer.cc
index 41e3b582afa4..dad64fb22424 100644
--- a/file/writable_file_writer.cc
+++ b/file/writable_file_writer.cc
@@ -687,9 +687,9 @@ IOStatus WritableFileWriter::WriteBufferedWithChecksum(const IOOptions& opts,
   if (rate_limiter_ != nullptr && rate_limiter_priority_used != Env::IO_TOTAL) {
     while (data_size > 0) {
       size_t tmp_size;
-      tmp_size = rate_limiter_->RequestToken(data_size, buf_.Alignment(),
-                                             rate_limiter_priority_used, stats_,
-                                             RateLimiter::OpType::kWrite);
+      tmp_size =
+          rate_limiter_->RequestToken(data_size, 0, rate_limiter_priority_used,
+                                      stats_, RateLimiter::OpType::kWrite);
       data_size -= tmp_size;
     }
   }

From 37a26591c7a06793ef5c458e0b004ee48e361964 Mon Sep 17 00:00:00 2001
From: Hui Xiao <huixiao@fb.com>
Date: Wed, 11 Jun 2025 17:14:14 -0700
Subject: [PATCH 125/500] Print note about the large hard-coded num_level for
 manifest dump (#13681)

Summary:
**Context/Summary:**
Since LDB manifest dump including printing the LSM shape does not open the db and manifest itself does not have info about Options.num_levels, LDB tool (the only caller of `DumpManifestHandler` has to set a "hopefully-large-enough" level number (i.e,64) to print info of every level for the LSM shape in the manifest. This can mislead whoever that's reading the manifest to believe there are actually 64 levels configured with the CF. This PR clarifies that.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13681

Test Plan:
Manual test
`./ldb  manifest_dump --hex --verbose --json --path=<some manifest file path>`
```
--------------- Column family "9"  (ID 9) --------------
log number: 115873
comparator: leveldb.BytewiseComparator
 --- level 0 --- version# 19 ---
 --- level 1 --- version# 19 --- compact_cursor: '000000000000000900000000000000DF78787878787878' seq:3418519, type:2 ---
 --- level 2 --- version# 19 --- compact_cursor: '000000000000000900000000000000D8' seq:3446619, type:2 ---
 --- level 3 --- version# 19 --- compact_cursor: '000000000000000900000000000000DF78787878787878' seq:3418519, type:2 ---
 --- level 4 --- version# 19 --- compact_cursor: '000000000000000900000000000000DF78787878787878' seq:3418519, type:2 ---
 --- level 5 --- version# 19 --- compact_cursor: '0000000000000009000000000000012B0000000000000065' seq:3447830, type:2 ---
 --- level 6 --- version# 19 ---
 115931:376281[0 .. 0]['0000000000000000' seq:0, type:1 .. '00000000000003E7000000000000012B00000000000002B1' seq:0, type:1]
 --- level 7 --- version# 19 ---
 --- level 8 --- version# 19 ---
 --- level 9 --- version# 19 ---
 --- level 10 --- version# 19 ---
 --- level 11 --- version# 19 ---
....
 --- level 61 --- version# 19 ---
 --- level 62 --- version# 19 ---
 --- level 63 --- version# 19 ---
By default, manifest file dump prints LSM trees as if 64 levels were configured, which is not necessarily true for the column family (CF) this manifest is associated with. Please consult other DB files, such as the OPTIONS file, to confirm.

```

Reviewed By: jaykorean

Differential Revision: D76391064

Pulled By: hx235

fbshipit-source-id: 3e1c58e0eeb39a5fa020040201b07b181f8977a6
---
 db/version_edit_handler.cc | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/db/version_edit_handler.cc b/db/version_edit_handler.cc
index e60644e2714b..c89fe0a42123 100644
--- a/db/version_edit_handler.cc
+++ b/db/version_edit_handler.cc
@@ -1135,6 +1135,15 @@ void DumpManifestHandler::CheckIterationResult(const log::Reader& reader,
     // Print out DebugStrings. Can include non-terminating null characters.
     fwrite(cfd->current()->DebugString(hex_).data(), sizeof(char),
            cfd->current()->DebugString(hex_).size(), stdout);
+
+    fprintf(stdout,
+            "By default, manifest file dump prints LSM trees as if %d levels "
+            "were configured, "
+            "which is not necessarily true for the column family (CF) this "
+            "manifest is associated with. "
+            "Please consult other DB files, such as the OPTIONS file, to "
+            "confirm.\n",
+            cfd->ioptions().num_levels);
   }
   fprintf(stdout,
           "next_file_number %" PRIu64 " last_sequence %" PRIu64

From 873f7fe5358a489f14368812651ca45aadf3b587 Mon Sep 17 00:00:00 2001
From: Jay Huh <jewoongh@meta.com>
Date: Wed, 11 Jun 2025 17:30:54 -0700
Subject: [PATCH 126/500] Add MergeOperator UnitTest for Remote Compaction
 (#13683)

Summary:
As title. Simple Unit Test to check MergeOperator in Remote Compaction flow.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13683

Test Plan:
```
./compaction_service_test --gtest_filter="*CompactionServiceTest.MergeOperator*"
```

Reviewed By: hx235

Differential Revision: D76459146

Pulled By: jaykorean

fbshipit-source-id: 50956824d50c503e7166304a2d52f624bbdda7ec
---
 db/compaction/compaction_service_test.cc | 27 ++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/db/compaction/compaction_service_test.cc b/db/compaction/compaction_service_test.cc
index e59185b64212..a5c0295540d5 100644
--- a/db/compaction/compaction_service_test.cc
+++ b/db/compaction/compaction_service_test.cc
@@ -7,6 +7,7 @@
 #include "port/stack_trace.h"
 #include "rocksdb/utilities/options_util.h"
 #include "table/unique_id_impl.h"
+#include "utilities/merge_operators/string_append/stringappend.h"
 
 namespace ROCKSDB_NAMESPACE {
 
@@ -1222,6 +1223,32 @@ TEST_F(CompactionServiceTest, CompactionFilter) {
   ASSERT_GE(my_cs->GetCompactionNum(), 1);
 }
 
+TEST_F(CompactionServiceTest, MergeOperator) {
+  Options options = CurrentOptions();
+  options.merge_operator.reset(new StringAppendOperator(','));
+  ReopenWithCompactionService(&options);
+  GenerateTestData();
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  for (int i = 0; i < 200; i++) {
+    ASSERT_OK(db_->Merge(WriteOptions(), Key(i),
+                         "merge_op_append_" + std::to_string(i)));
+  }
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  // verify result
+  for (int i = 0; i < 200; i++) {
+    auto result = Get(Key(i));
+    if (i % 2) {
+      ASSERT_EQ(result, "value" + std::to_string(i) + ",merge_op_append_" +
+                            std::to_string(i));
+    } else {
+      ASSERT_EQ(result, "value_new" + std::to_string(i) + ",merge_op_append_" +
+                            std::to_string(i));
+    }
+  }
+  auto my_cs = GetCompactionService();
+  ASSERT_GE(my_cs->GetCompactionNum(), 1);
+}
+
 TEST_F(CompactionServiceTest, Snapshot) {
   Options options = CurrentOptions();
   ReopenWithCompactionService(&options);

From 82586e293e99d80ec4b11fb7460a1161116e93a8 Mon Sep 17 00:00:00 2001
From: Jay Huh <jewoongh@meta.com>
Date: Thu, 12 Jun 2025 09:21:04 -0700
Subject: [PATCH 127/500] Upgrade Maven to 3.9.10 (#13684)

Summary:
https://dlcdn.apache.org/maven/maven-3/3.9.6/binaries/apache-maven-3.9.6-bin.tar.gz is no longer available. Because of that, CI for JAVA has been broken.

https://github.com/facebook/rocksdb/actions/runs/15596243797/job/43927189803?pr=13683

Instead of finding a new place to download from, taking this opportunity to upgrade to 3.9.10.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13684

Test Plan: CI

Reviewed By: pdillinger, archang19

Differential Revision: D76474615

Pulled By: jaykorean

fbshipit-source-id: 3c05efb9e0ef381c97fa43dc3c9960b627c6dd59
---
 .github/actions/install-maven/action.yml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/actions/install-maven/action.yml b/.github/actions/install-maven/action.yml
index 69a925272ac1..0c6a68d3c3a2 100644
--- a/.github/actions/install-maven/action.yml
+++ b/.github/actions/install-maven/action.yml
@@ -4,8 +4,8 @@ runs:
   steps:
   - name: Install Maven
     run: |
-      wget --no-check-certificate https://dlcdn.apache.org/maven/maven-3/3.9.6/binaries/apache-maven-3.9.6-bin.tar.gz
-      tar zxf apache-maven-3.9.6-bin.tar.gz
-      echo "export M2_HOME=$(pwd)/apache-maven-3.9.6" >> $GITHUB_ENV
-      echo "$(pwd)/apache-maven-3.9.6/bin" >> $GITHUB_PATH
+      wget --no-check-certificate https://dlcdn.apache.org/maven/maven-3/3.9.10/binaries/apache-maven-3.9.10-bin.tar.gz
+      tar zxf apache-maven-3.9.10-bin.tar.gz
+      echo "export M2_HOME=$(pwd)/apache-maven-3.9.10" >> $GITHUB_ENV
+      echo "$(pwd)/apache-maven-3.9.10/bin" >> $GITHUB_PATH
     shell: bash

From 96305d9bb4f65303fa8f43ca78d4525c5a151b53 Mon Sep 17 00:00:00 2001
From: Jay Huh <jewoongh@meta.com>
Date: Thu, 12 Jun 2025 09:43:45 -0700
Subject: [PATCH 128/500] Fix to enable --Wunreachable-code-break (#13686)

Summary:
As title

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13686

Test Plan: CI

Reviewed By: archang19

Differential Revision: D76518029

Pulled By: jaykorean

fbshipit-source-id: cb04d8a79edde8f122e02cf761a1d42c203347cd
---
 options/options_helper.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/options/options_helper.cc b/options/options_helper.cc
index 9ce73cad094b..de6b796822a7 100644
--- a/options/options_helper.cc
+++ b/options/options_helper.cc
@@ -568,7 +568,6 @@ bool SerializeSingleOptionHelper(const void* opt_address,
       return SerializeEnum<CompressionType>(
           compression_type_string_map,
           *(static_cast<const CompressionType*>(opt_address)), value);
-      break;
     case OptionType::kChecksumType:
       return SerializeEnum<ChecksumType>(
           checksum_type_string_map,

From 85910fb575bec0257c6835bda0ae1b5cbe80d01b Mon Sep 17 00:00:00 2001
From: Ryan4253 <ryan.liao0305@gmail.com>
Date: Thu, 12 Jun 2025 13:52:30 -0700
Subject: [PATCH 129/500] event_helpers logging symmetry improvements (#13669)
 (#13670)

Summary:
1. LogAndNotifyTableFileDeletion checks for null event logger like other functions
2. LogAndNotifyBlobFileCreationFinished and LogAndNotifyTablebFileCreationFinished log on success similar to deletions
3. LogAndNotify functions log status on success

## Verification
Ran the code on [kvrocks](https://github.com/apache/kvrocks/tree/unstable) which implements event hooks, and the logging is now observable / consistent.
```
2025/06/05-10:00:49.644611 92065 EVENT_LOG_v1 {"time_micros": 1749132049644595, "cf_name": "metadata", "job": 5, "event": "blob_file_creation", "file_number": 34, "total_blob_count": 68, "total_blob_bytes": 272018457, "file_checksum": "", "file_checksum_func_name": "Unknown", "status": "OK"}
```
```
2025/06/02-09:42:29.343893 122068 EVENT_LOG_v1 {"time_micros": 1748871749343853, "cf_name": "metadata", "job": 93, "event": "table_file_creation", "file_number": 853, "file_size": 0, "file_checksum": "", "file_checksum_func_name": "Unknown", "smallest_seqno": 23371, "largest_seqno": 24182, "table_properties": {"data_size": 0, "index_size": 0, "index_partitions": 0, "top_level_index_size": 0, "index_key_is_user_key": 0, "index_value_is_delta_encoded": 0, "filter_size": 0, "raw_key_size": 0, "raw_average_key_size": 0, "raw_value_size": 0, "raw_average_value_size": 0, "num_data_blocks": 0, "num_entries": 0, "num_filter_entries": 0, "num_deletions": 0, "num_merge_operands": 0, "num_range_deletions": 0, "format_version": 0, "fixed_key_len": 0, "filter_policy": "", "column_family_name": "", "column_family_id": 2147483647, "comparator": "", "user_defined_timestamps_persisted": 1, "key_largest_seqno": 18446744073709551615, "merge_operator": "", "prefix_extractor_name": "", "property_collectors": "", "compression": "", "compression_options": "", "creation_time": 0, "oldest_key_time": 0, "newest_key_time": 0, "file_creation_time": 0, "slow_compression_estimated_data_size": 0, "fast_compression_estimated_data_size": 0, "db_id": "", "db_session_id": "", "orig_file_number": 0, "seqno_to_time_mapping": "N/A"}, "oldest_blob_file_number": 821, "status": "Shutdown in progress: Database shutdown"}
```

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13670

Reviewed By: jaykorean

Differential Revision: D76173710

Pulled By: hx235

fbshipit-source-id: 1f81623c1edade0c122bd0e73391a1b76abc13d9
---
 db/event_helpers.cc | 48 +++++++++++++++++++++++++++++++--------------
 1 file changed, 33 insertions(+), 15 deletions(-)

diff --git a/db/event_helpers.cc b/db/event_helpers.cc
index 2b901f6adc06..638d0ed6e2c9 100644
--- a/db/event_helpers.cc
+++ b/db/event_helpers.cc
@@ -77,7 +77,12 @@ void EventHelpers::LogAndNotifyTableFileCreationFinished(
     TableFileCreationReason reason, const Status& s,
     const std::string& file_checksum,
     const std::string& file_checksum_func_name) {
-  if (s.ok() && event_logger) {
+  if (!event_logger && listeners.empty()) {
+    s.PermitUncheckedError();
+    return;
+  }
+
+  if (event_logger) {
     JSONWriter jwriter;
     AppendCurrentTime(&jwriter);
     jwriter << "cf_name" << cf_name << "job" << job_id << "event"
@@ -165,6 +170,8 @@ void EventHelpers::LogAndNotifyTableFileCreationFinished(
       jwriter << "oldest_blob_file_number" << oldest_blob_file_number;
     }
 
+    jwriter << "status" << s.ToString();
+
     jwriter.EndObject();
 
     event_logger->Log(jwriter);
@@ -195,18 +202,22 @@ void EventHelpers::LogAndNotifyTableFileDeletion(
     const std::string& file_path, const Status& status,
     const std::string& dbname,
     const std::vector<std::shared_ptr<EventListener>>& listeners) {
-  JSONWriter jwriter;
-  AppendCurrentTime(&jwriter);
-
-  jwriter << "job" << job_id << "event" << "table_file_deletion"
-          << "file_number" << file_number;
-  if (!status.ok()) {
-    jwriter << "status" << status.ToString();
+  if (!event_logger && listeners.empty()) {
+    status.PermitUncheckedError();
+    return;
   }
 
-  jwriter.EndObject();
+  if (event_logger) {
+    JSONWriter jwriter;
+    AppendCurrentTime(&jwriter);
+
+    jwriter << "job" << job_id << "event" << "table_file_deletion"
+            << "file_number" << file_number << "status" << status.ToString();
 
-  event_logger->Log(jwriter);
+    jwriter.EndObject();
+
+    event_logger->Log(jwriter);
+  }
 
   if (listeners.empty()) {
     return;
@@ -274,7 +285,12 @@ void EventHelpers::LogAndNotifyBlobFileCreationFinished(
     const std::string& file_checksum,
     const std::string& file_checksum_func_name, uint64_t total_blob_count,
     uint64_t total_blob_bytes) {
-  if (s.ok() && event_logger) {
+  if (!event_logger && listeners.empty()) {
+    s.PermitUncheckedError();
+    return;
+  }
+
+  if (event_logger) {
     JSONWriter jwriter;
     AppendCurrentTime(&jwriter);
     jwriter << "cf_name" << cf_name << "job" << job_id << "event"
@@ -305,15 +321,17 @@ void EventHelpers::LogAndNotifyBlobFileDeletion(
     const std::vector<std::shared_ptr<EventListener>>& listeners, int job_id,
     uint64_t file_number, const std::string& file_path, const Status& status,
     const std::string& dbname) {
+  if (!event_logger && listeners.empty()) {
+    status.PermitUncheckedError();
+    return;
+  }
+
   if (event_logger) {
     JSONWriter jwriter;
     AppendCurrentTime(&jwriter);
 
     jwriter << "job" << job_id << "event" << "blob_file_deletion"
-            << "file_number" << file_number;
-    if (!status.ok()) {
-      jwriter << "status" << status.ToString();
-    }
+            << "file_number" << file_number << "status" << status.ToString();
 
     jwriter.EndObject();
     event_logger->Log(jwriter);

From 02bce9b1af98dbd32fed981b99bb2252b42b1542 Mon Sep 17 00:00:00 2001
From: Hui Xiao <huixiao@fb.com>
Date: Thu, 12 Jun 2025 18:16:47 -0700
Subject: [PATCH 130/500] Reduce universal compaction input lock time by
 forwarding intended compaction and re-picking (#13633)

Summary:
**Context:**
RocksDB currently selects files for long-running compaction outputs to the bottommost level, preventing these selected files files from being selected, but does not execute the compaction immediately like other compactions. Instead, this compaction is forwarded to another Env::Priority::bottom thread pool, where it waits (potentially for a long time) until its thread is ready to execute. This extended L0 lock time in universal compaction caused our users write stall and read performance regression.

**Summary:**
This PR is to eliminate L0 lock time during bottom priority compaction waiting to execute by the following
- Create and forward an intended compaction only consists of last input file (or sorted run if non-L0) instead of all the input files. This eliminate the locking for non-bottommost level input files while waiting for bottom priority thread is up to run.
- Re-pick compaction that outputs to max output level when bottom priority thread is up to run
- Refactor universal compaction picking logic to make it cleaner and easier to force picking compaction with max output level when bottom priority thread is up to run
- Guard feature behind a temporary option as requested

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13633

Test Plan:
- New unit test to cover the case that's not covered by existing tests - bottom priority thread re-picks compaction ends up picking nothing due to LSM shape changes
- Adapted existing unit tests to verify various bottom priority compaction behavior with this new option
- Stress test `python3 tools/db_crashtest.py --simple blackbox --compaction_style=1 --target_file_size_base=1000 --write_buffer_size=1000 --compact_range_one_in=10000 --compact_files_one_in=10000 `

Reviewed By: cbi42

Differential Revision: D76005505

Pulled By: hx235

fbshipit-source-id: 9688f22d4a84f619452820f12f15b765c17301fd
---
 db/column_family.cc                           |   7 +-
 db/column_family.h                            |   3 +-
 db/compaction/compaction.h                    |   5 +
 db/compaction/compaction_picker.h             |   5 +-
 db/compaction/compaction_picker_fifo.cc       |   2 +-
 db/compaction/compaction_picker_fifo.h        |   3 +-
 db/compaction/compaction_picker_level.cc      |   2 +-
 db/compaction/compaction_picker_level.h       |   3 +-
 db/compaction/compaction_picker_universal.cc  | 405 +++++++++++-------
 db/compaction/compaction_picker_universal.h   |   5 +-
 db/db_compaction_test.cc                      | 354 ++++++++++-----
 db/db_impl/db_impl.h                          |  14 +
 db/db_impl/db_impl_compaction_flush.cc        | 227 ++++++++--
 db/db_universal_compaction_test.cc            | 108 +++--
 db_stress_tool/db_stress_common.h             |   1 +
 db_stress_tool/db_stress_gflags.cc            |   7 +
 db_stress_tool/db_stress_test_base.cc         |   2 +
 include/rocksdb/universal_compaction.h        |  23 +-
 options/cf_options.cc                         |   7 +-
 options/options.cc                            |   3 +
 tools/db_bench_tool.cc                        |   7 +
 tools/db_crashtest.py                         |   4 +-
 .../new_features/reduce_file_locking.md       |   1 +
 23 files changed, 834 insertions(+), 364 deletions(-)
 create mode 100644 unreleased_history/new_features/reduce_file_locking.md

diff --git a/db/column_family.cc b/db/column_family.cc
index b4fa2fbf611d..06f7d1bbc053 100644
--- a/db/column_family.cc
+++ b/db/column_family.cc
@@ -463,7 +463,6 @@ ColumnFamilyOptions SanitizeCfOptions(const ImmutableDBOptions& db_options,
       result.memtable_avg_op_scan_flush_trigger = 0;
     }
   }
-
   return result;
 }
 
@@ -1218,10 +1217,12 @@ Compaction* ColumnFamilyData::PickCompaction(
     const MutableCFOptions& mutable_options,
     const MutableDBOptions& mutable_db_options,
     const std::vector<SequenceNumber>& existing_snapshots,
-    const SnapshotChecker* snapshot_checker, LogBuffer* log_buffer) {
+    const SnapshotChecker* snapshot_checker, LogBuffer* log_buffer,
+    bool require_max_output_level) {
   auto* result = compaction_picker_->PickCompaction(
       GetName(), mutable_options, mutable_db_options, existing_snapshots,
-      snapshot_checker, current_->storage_info(), log_buffer);
+      snapshot_checker, current_->storage_info(), log_buffer,
+      require_max_output_level);
   if (result != nullptr) {
     result->FinalizeInputInfo(current_);
   }
diff --git a/db/column_family.h b/db/column_family.h
index 31b0575a1b27..1b048dd9b4d4 100644
--- a/db/column_family.h
+++ b/db/column_family.h
@@ -424,7 +424,8 @@ class ColumnFamilyData {
       const MutableCFOptions& mutable_options,
       const MutableDBOptions& mutable_db_options,
       const std::vector<SequenceNumber>& existing_snapshots,
-      const SnapshotChecker* snapshot_checker, LogBuffer* log_buffer);
+      const SnapshotChecker* snapshot_checker, LogBuffer* log_buffer,
+      bool require_max_output_level = false);
 
   // Check if the passed range overlap with any running compactions.
   // REQUIRES: DB mutex held
diff --git a/db/compaction/compaction.h b/db/compaction/compaction.h
index 082cf132b150..373a8b647492 100644
--- a/db/compaction/compaction.h
+++ b/db/compaction/compaction.h
@@ -463,6 +463,11 @@ class Compaction {
                                    const int start_level,
                                    const int output_level);
 
+  static bool OutputToNonZeroMaxOutputLevel(int output_level,
+                                            int max_output_level) {
+    return output_level > 0 && output_level == max_output_level;
+  }
+
   // If some data cannot be safely migrated "up" the LSM tree due to a change
   // in the preclude_last_level_data_seconds setting, this indicates a sequence
   // number for the newest data that must be kept in the last level.
diff --git a/db/compaction/compaction_picker.h b/db/compaction/compaction_picker.h
index 093344c65c43..bbcc8fbac662 100644
--- a/db/compaction/compaction_picker.h
+++ b/db/compaction/compaction_picker.h
@@ -65,7 +65,7 @@ class CompactionPicker {
       const MutableDBOptions& mutable_db_options,
       const std::vector<SequenceNumber>& existing_snapshots,
       const SnapshotChecker* snapshot_checker, VersionStorageInfo* vstorage,
-      LogBuffer* log_buffer) = 0;
+      LogBuffer* log_buffer, bool require_max_output_level) = 0;
 
   // The returned Compaction might not include the whole requested range.
   // In that case, compaction_end will be set to the next key that needs
@@ -272,7 +272,8 @@ class NullCompactionPicker : public CompactionPicker {
       const MutableDBOptions& /*mutable_db_options*/,
       const std::vector<SequenceNumber>& /*existing_snapshots*/,
       const SnapshotChecker* /*snapshot_checker*/,
-      VersionStorageInfo* /*vstorage*/, LogBuffer* /* log_buffer */) override {
+      VersionStorageInfo* /*vstorage*/, LogBuffer* /* log_buffer */,
+      bool /*require_max_output_level*/ = false) override {
     return nullptr;
   }
 
diff --git a/db/compaction/compaction_picker_fifo.cc b/db/compaction/compaction_picker_fifo.cc
index d5c735194004..4f18cdda35c5 100644
--- a/db/compaction/compaction_picker_fifo.cc
+++ b/db/compaction/compaction_picker_fifo.cc
@@ -428,7 +428,7 @@ Compaction* FIFOCompactionPicker::PickCompaction(
     const MutableDBOptions& mutable_db_options,
     const std::vector<SequenceNumber>& /* existing_snapshots */,
     const SnapshotChecker* /* snapshot_checker */, VersionStorageInfo* vstorage,
-    LogBuffer* log_buffer) {
+    LogBuffer* log_buffer, bool /* require_max_output_level*/) {
   Compaction* c = nullptr;
   if (mutable_cf_options.ttl > 0) {
     c = PickTTLCompaction(cf_name, mutable_cf_options, mutable_db_options,
diff --git a/db/compaction/compaction_picker_fifo.h b/db/compaction/compaction_picker_fifo.h
index 4dd1053e127b..6178be7be2de 100644
--- a/db/compaction/compaction_picker_fifo.h
+++ b/db/compaction/compaction_picker_fifo.h
@@ -23,7 +23,8 @@ class FIFOCompactionPicker : public CompactionPicker {
       const MutableDBOptions& mutable_db_options,
       const std::vector<SequenceNumber>& /* existing_snapshots */,
       const SnapshotChecker* /* snapshot_checker */,
-      VersionStorageInfo* version, LogBuffer* log_buffer) override;
+      VersionStorageInfo* version, LogBuffer* log_buffer,
+      bool /* require_max_output_level*/ = false) override;
 
   Compaction* CompactRange(const std::string& cf_name,
                            const MutableCFOptions& mutable_cf_options,
diff --git a/db/compaction/compaction_picker_level.cc b/db/compaction/compaction_picker_level.cc
index b4a122954bf4..108c80cf3a76 100644
--- a/db/compaction/compaction_picker_level.cc
+++ b/db/compaction/compaction_picker_level.cc
@@ -978,7 +978,7 @@ Compaction* LevelCompactionPicker::PickCompaction(
     const MutableDBOptions& mutable_db_options,
     const std::vector<SequenceNumber>& /*existing_snapshots */,
     const SnapshotChecker* /*snapshot_checker*/, VersionStorageInfo* vstorage,
-    LogBuffer* log_buffer) {
+    LogBuffer* log_buffer, bool /* require_max_output_level*/) {
   LevelCompactionBuilder builder(cf_name, vstorage, this, log_buffer,
                                  mutable_cf_options, ioptions_,
                                  mutable_db_options);
diff --git a/db/compaction/compaction_picker_level.h b/db/compaction/compaction_picker_level.h
index 9cb41dfb64f8..34419f279841 100644
--- a/db/compaction/compaction_picker_level.h
+++ b/db/compaction/compaction_picker_level.h
@@ -25,7 +25,8 @@ class LevelCompactionPicker : public CompactionPicker {
       const MutableDBOptions& mutable_db_options,
       const std::vector<SequenceNumber>& /* existing_snapshots */,
       const SnapshotChecker* /* snapshot_checker */,
-      VersionStorageInfo* vstorage, LogBuffer* log_buffer) override;
+      VersionStorageInfo* vstorage, LogBuffer* log_buffer,
+      bool /*require_max_output_level*/ = false) override;
 
   bool NeedsCompaction(const VersionStorageInfo* vstorage) const override;
 };
diff --git a/db/compaction/compaction_picker_universal.cc b/db/compaction/compaction_picker_universal.cc
index f2bc740028ee..f9ba649273f7 100644
--- a/db/compaction/compaction_picker_universal.cc
+++ b/db/compaction/compaction_picker_universal.cc
@@ -38,7 +38,8 @@ class UniversalCompactionBuilder {
       const MutableDBOptions& mutable_db_options,
       const std::vector<SequenceNumber>& existing_snapshots,
       const SnapshotChecker* snapshot_checker, VersionStorageInfo* vstorage,
-      UniversalCompactionPicker* picker, LogBuffer* log_buffer)
+      UniversalCompactionPicker* picker, LogBuffer* log_buffer,
+      bool require_max_output_level)
       : ioptions_(ioptions),
         icmp_(icmp),
         cf_name_(cf_name),
@@ -46,7 +47,8 @@ class UniversalCompactionBuilder {
         mutable_db_options_(mutable_db_options),
         vstorage_(vstorage),
         picker_(picker),
-        log_buffer_(log_buffer) {
+        log_buffer_(log_buffer),
+        require_max_output_level_(require_max_output_level) {
     assert(icmp_);
     const auto* ucmp = icmp_->user_comparator();
     assert(ucmp);
@@ -102,6 +104,174 @@ class UniversalCompactionBuilder {
     bool level_has_marked_standalone_rangedel;
   };
 
+  unsigned int GetMaxNumFilesToCompactBasedOnMaxReadAmp(
+      const int file_num_compaction_trigger, const unsigned int ratio,
+      int* num_sr_not_compacted_output, int* max_num_runs_output) const {
+    assert(num_sr_not_compacted_output);
+    assert(max_num_runs_output);
+    int max_num_runs =
+        mutable_cf_options_.compaction_options_universal.max_read_amp;
+    if (max_num_runs < 0) {
+      // any value < -1 is not valid
+      assert(max_num_runs == -1);
+      // By default, fall back to `level0_file_num_compaction_trigger`
+      max_num_runs = file_num_compaction_trigger;
+    } else if (max_num_runs == 0) {
+      if (mutable_cf_options_.compaction_options_universal.stop_style ==
+          kCompactionStopStyleTotalSize) {
+        // 0 means auto-tuning by RocksDB. We estimate max num run based on
+        // max_run_size, size_ratio and write buffer size:
+        // Assume the size of the lowest level size is equal to
+        // write_buffer_size. Each subsequent level is the max size without
+        // triggering size_ratio compaction. `max_num_runs` is the minimum
+        // number of levels required such that the target size of the
+        // largest level is at least `max_run_size_`.
+        max_num_runs = 1;
+        double cur_level_max_size =
+            static_cast<double>(mutable_cf_options_.write_buffer_size);
+        double total_run_size = 0;
+        while (cur_level_max_size < static_cast<double>(max_run_size_)) {
+          // This loop should not take too many iterations since
+          // cur_level_max_size at least doubles each iteration.
+          total_run_size += cur_level_max_size;
+          cur_level_max_size = (100.0 + ratio) / 100.0 * total_run_size;
+          ++max_num_runs;
+        }
+      } else {
+        // TODO: implement the auto-tune logic for this stop style
+        max_num_runs = file_num_compaction_trigger;
+      }
+    } else {
+      // max_num_runs > 0, it's the limit on the number of sorted run
+    }
+
+    // Get the total number of sorted runs that are not being compacted
+    int num_sr_not_compacted = 0;
+    for (size_t i = 0; i < sorted_runs_.size(); i++) {
+      if (sorted_runs_[i].being_compacted == false &&
+          !sorted_runs_[i].level_has_marked_standalone_rangedel) {
+        num_sr_not_compacted++;
+      }
+    }
+
+    *num_sr_not_compacted_output = num_sr_not_compacted;
+    *max_num_runs_output = max_num_runs;
+
+    if (num_sr_not_compacted > max_num_runs) {
+      return num_sr_not_compacted - max_num_runs + 1;
+    } else {
+      return 0;
+    }
+  }
+
+  Compaction* MaybePickPeriodicCompaction(Compaction* const prev_picked_c) {
+    if (prev_picked_c != nullptr ||
+        vstorage_->FilesMarkedForPeriodicCompaction().empty()) {
+      return prev_picked_c;
+    }
+    // Always need to do a full compaction for periodic compaction.
+    Compaction* c = PickPeriodicCompaction();
+    TEST_SYNC_POINT_CALLBACK("PostPickPeriodicCompaction", c);
+    if (c != nullptr) {
+      ROCKS_LOG_BUFFER(log_buffer_,
+                       "[%s] Universal: picked for periodic compaction\n",
+                       cf_name_.c_str());
+    }
+    return c;
+  }
+
+  Compaction* MaybePickSizeAmpCompaction(Compaction* const prev_picked_c,
+                                         int file_num_compaction_trigger) {
+    if (prev_picked_c != nullptr ||
+        sorted_runs_.size() <
+            static_cast<size_t>(file_num_compaction_trigger)) {
+      return prev_picked_c;
+    }
+    Compaction* c = PickCompactionToReduceSizeAmp();
+    if (c != nullptr) {
+      TEST_SYNC_POINT("PickCompactionToReduceSizeAmpReturnNonnullptr");
+      ROCKS_LOG_BUFFER(log_buffer_,
+                       "[%s] Universal: picked for size amp compaction \n",
+                       cf_name_.c_str());
+    }
+    return c;
+  }
+
+  Compaction* MaybePickCompactionToReduceSortedRunsBasedFileRatio(
+      Compaction* const prev_picked_c, int file_num_compaction_trigger,
+      unsigned int ratio) {
+    if (prev_picked_c != nullptr ||
+        sorted_runs_.size() <
+            static_cast<size_t>(file_num_compaction_trigger)) {
+      return prev_picked_c;
+    }
+    Compaction* c = PickCompactionToReduceSortedRuns(ratio, UINT_MAX);
+    if (c != nullptr) {
+      TEST_SYNC_POINT("PickCompactionToReduceSortedRunsReturnNonnullptr");
+      ROCKS_LOG_BUFFER(log_buffer_,
+                       "[%s] Universal: picked for size ratio compaction to "
+                       "reduce sorted run\n",
+                       cf_name_.c_str());
+    }
+    return c;
+  }
+
+  Compaction* MaybePickCompactionToReduceSortedRuns(
+      Compaction* const prev_picked_c, int file_num_compaction_trigger,
+      unsigned int ratio) {
+    if (prev_picked_c != nullptr ||
+        sorted_runs_.size() <
+            static_cast<size_t>(file_num_compaction_trigger)) {
+      return prev_picked_c;
+    }
+
+    int num_sr_not_compacted = 0;
+    int max_num_runs = 0;
+    const unsigned int max_num_files_to_compact =
+        GetMaxNumFilesToCompactBasedOnMaxReadAmp(file_num_compaction_trigger,
+                                                 ratio, &num_sr_not_compacted,
+                                                 &max_num_runs);
+    if (max_num_files_to_compact == 0) {
+      ROCKS_LOG_BUFFER(
+          log_buffer_,
+          "[%s] Universal: skipping compaction to reduce sorted run, num "
+          "sorted runs not "
+          "being compacted -- %u, max num runs allowed -- %d, max_run_size "
+          "-- %" PRIu64 "\n",
+          cf_name_.c_str(), num_sr_not_compacted, max_num_runs, max_run_size_);
+      return nullptr;
+    }
+
+    Compaction* c =
+        PickCompactionToReduceSortedRuns(UINT_MAX, max_num_files_to_compact);
+    if (c != nullptr) {
+      ROCKS_LOG_BUFFER(log_buffer_,
+                       "[%s] Universal: picked for sorted run num compaction "
+                       "to reduce sorted run, to "
+                       "compact file num -- %u, max num runs allowed"
+                       "-- %d, max_run_size -- %" PRIu64 "\n",
+                       cf_name_.c_str(), max_num_files_to_compact, max_num_runs,
+                       max_run_size_);
+    }
+    return c;
+  }
+
+  Compaction* MaybePickDeleteTriggeredCompaction(
+      Compaction* const prev_picked_c) {
+    if (prev_picked_c != nullptr) {
+      return prev_picked_c;
+    }
+    Compaction* c = PickDeleteTriggeredCompaction();
+    if (c != nullptr) {
+      TEST_SYNC_POINT("PickDeleteTriggeredCompactionReturnNonnullptr");
+      ROCKS_LOG_BUFFER(
+          log_buffer_,
+          "[%s] Universal: picked for delete triggered compaction\n",
+          cf_name_.c_str());
+    }
+    return c;
+  }
+
   // Pick Universal compaction to limit read amplification
   Compaction* PickCompactionToReduceSortedRuns(
       unsigned int ratio, unsigned int max_number_of_files_to_compact);
@@ -249,6 +419,13 @@ class UniversalCompactionBuilder {
     return num_l0_to_exclude;
   }
 
+  bool MeetsOutputLevelRequirements(int output_level) const {
+    return !require_max_output_level_ ||
+           Compaction::OutputToNonZeroMaxOutputLevel(
+               output_level,
+               vstorage_->MaxOutputLevel(ioptions_.allow_ingest_behind));
+  }
+
   const ImmutableOptions& ioptions_;
   const InternalKeyComparator* icmp_;
   double score_;
@@ -260,6 +437,7 @@ class UniversalCompactionBuilder {
   VersionStorageInfo* vstorage_;
   UniversalCompactionPicker* picker_;
   LogBuffer* log_buffer_;
+  bool require_max_output_level_;
   // Optional earliest snapshot at time of compaction picking. This is only
   // provided if the column family doesn't enable user-defined timestamps.
   // And this information is only passed to `Compaction` picked by deletion
@@ -420,10 +598,11 @@ Compaction* UniversalCompactionPicker::PickCompaction(
     const MutableDBOptions& mutable_db_options,
     const std::vector<SequenceNumber>& existing_snapshots,
     const SnapshotChecker* snapshot_checker, VersionStorageInfo* vstorage,
-    LogBuffer* log_buffer) {
+    LogBuffer* log_buffer, bool require_max_output_level) {
   UniversalCompactionBuilder builder(
       ioptions_, icmp_, cf_name, mutable_cf_options, mutable_db_options,
-      existing_snapshots, snapshot_checker, vstorage, this, log_buffer);
+      existing_snapshots, snapshot_checker, vstorage, this, log_buffer,
+      require_max_output_level);
   return builder.PickCompaction();
 }
 
@@ -554,13 +733,21 @@ bool UniversalCompactionBuilder::ShouldSkipMarkedFile(
 Compaction* UniversalCompactionBuilder::PickCompaction() {
   const int kLevel0 = 0;
   score_ = vstorage_->CompactionScore(kLevel0);
-  int max_output_level =
+  const int max_output_level =
       vstorage_->MaxOutputLevel(ioptions_.allow_ingest_behind);
+  const int file_num_compaction_trigger =
+      mutable_cf_options_.level0_file_num_compaction_trigger;
+  const unsigned int ratio =
+      mutable_cf_options_.compaction_options_universal.size_ratio;
+
+  if (max_output_level == 0 &&
+      !MeetsOutputLevelRequirements(0 /* output_level */)) {
+    return nullptr;
+  }
+
   max_run_size_ = 0;
   sorted_runs_ =
       CalculateSortedRuns(*vstorage_, max_output_level, &max_run_size_);
-  int file_num_compaction_trigger =
-      mutable_cf_options_.level0_file_num_compaction_trigger;
 
   if (sorted_runs_.size() == 0 ||
       (vstorage_->FilesMarkedForPeriodicCompaction().empty() &&
@@ -572,6 +759,7 @@ Compaction* UniversalCompactionBuilder::PickCompaction() {
         "UniversalCompactionBuilder::PickCompaction:Return", nullptr);
     return nullptr;
   }
+
   VersionStorageInfo::LevelSummaryStorage tmp;
   ROCKS_LOG_BUFFER_MAX_SZ(
       log_buffer_, 3072,
@@ -579,119 +767,14 @@ Compaction* UniversalCompactionBuilder::PickCompaction() {
       cf_name_.c_str(), sorted_runs_.size(), vstorage_->LevelSummary(&tmp));
 
   Compaction* c = nullptr;
-  // Periodic compaction has higher priority than other type of compaction
-  // because it's a hard requirement.
-  if (!vstorage_->FilesMarkedForPeriodicCompaction().empty()) {
-    // Always need to do a full compaction for periodic compaction.
-    c = PickPeriodicCompaction();
-    TEST_SYNC_POINT_CALLBACK("PostPickPeriodicCompaction", c);
-  }
 
-  if (c == nullptr &&
-      sorted_runs_.size() >= static_cast<size_t>(file_num_compaction_trigger)) {
-    // Check for size amplification.
-    if ((c = PickCompactionToReduceSizeAmp()) != nullptr) {
-      TEST_SYNC_POINT("PickCompactionToReduceSizeAmpReturnNonnullptr");
-      ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: compacting for size amp\n",
-                       cf_name_.c_str());
-    } else {
-      // Size amplification is within limits. Try reducing read
-      // amplification while maintaining file size ratios.
-      unsigned int ratio =
-          mutable_cf_options_.compaction_options_universal.size_ratio;
-
-      if ((c = PickCompactionToReduceSortedRuns(ratio, UINT_MAX)) != nullptr) {
-        TEST_SYNC_POINT("PickCompactionToReduceSortedRunsReturnNonnullptr");
-        ROCKS_LOG_BUFFER(log_buffer_,
-                         "[%s] Universal: compacting for size ratio\n",
-                         cf_name_.c_str());
-      } else {
-        // Size amplification and file size ratios are within configured limits.
-        // If max read amplification exceeds configured limits, then force
-        // compaction to reduce the number sorted runs without looking at file
-        // size ratios.
-
-        // This is guaranteed by NeedsCompaction()
-        assert(sorted_runs_.size() >=
-               static_cast<size_t>(file_num_compaction_trigger));
-        int max_num_runs =
-            mutable_cf_options_.compaction_options_universal.max_read_amp;
-        if (max_num_runs < 0) {
-          // any value < -1 is not valid
-          assert(max_num_runs == -1);
-          // By default, fall back to `level0_file_num_compaction_trigger`
-          max_num_runs = file_num_compaction_trigger;
-        } else if (max_num_runs == 0) {
-          if (mutable_cf_options_.compaction_options_universal.stop_style ==
-              kCompactionStopStyleTotalSize) {
-            // 0 means auto-tuning by RocksDB. We estimate max num run based on
-            // max_run_size, size_ratio and write buffer size:
-            // Assume the size of the lowest level size is equal to
-            // write_buffer_size. Each subsequent level is the max size without
-            // triggering size_ratio compaction. `max_num_runs` is the minimum
-            // number of levels required such that the target size of the
-            // largest level is at least `max_run_size_`.
-            max_num_runs = 1;
-            double cur_level_max_size =
-                static_cast<double>(mutable_cf_options_.write_buffer_size);
-            double total_run_size = 0;
-            while (cur_level_max_size < static_cast<double>(max_run_size_)) {
-              // This loop should not take too many iterations since
-              // cur_level_max_size at least doubles each iteration.
-              total_run_size += cur_level_max_size;
-              cur_level_max_size = (100.0 + ratio) / 100.0 * total_run_size;
-              ++max_num_runs;
-            }
-          } else {
-            // TODO: implement the auto-tune logic for this stop style
-            max_num_runs = file_num_compaction_trigger;
-          }
-        } else {
-          // max_num_runs > 0, it's the limit on the number of sorted run
-        }
-        // Get the total number of sorted runs that are not being compacted
-        int num_sr_not_compacted = 0;
-        for (size_t i = 0; i < sorted_runs_.size(); i++) {
-          if (sorted_runs_[i].being_compacted == false &&
-              !sorted_runs_[i].level_has_marked_standalone_rangedel) {
-            num_sr_not_compacted++;
-          }
-        }
-
-        // The number of sorted runs that are not being compacted is greater
-        // than the maximum allowed number of sorted runs
-        if (num_sr_not_compacted > max_num_runs) {
-          unsigned int num_files = num_sr_not_compacted - max_num_runs + 1;
-          if ((c = PickCompactionToReduceSortedRuns(UINT_MAX, num_files)) !=
-              nullptr) {
-            ROCKS_LOG_BUFFER(log_buffer_,
-                             "[%s] Universal: compacting for file num, to "
-                             "compact file num -- %u, max num runs allowed"
-                             "-- %d, max_run_size -- %" PRIu64 "\n",
-                             cf_name_.c_str(), num_files, max_num_runs,
-                             max_run_size_);
-          }
-        } else {
-          ROCKS_LOG_BUFFER(
-              log_buffer_,
-              "[%s] Universal: skipping compaction for file num, num runs not "
-              "being compacted -- %u, max num runs allowed -- %d, max_run_size "
-              "-- %" PRIu64 "\n",
-              cf_name_.c_str(), num_sr_not_compacted, max_num_runs,
-              max_run_size_);
-        }
-      }
-    }
-  }
-
-  if (c == nullptr) {
-    if ((c = PickDeleteTriggeredCompaction()) != nullptr) {
-      TEST_SYNC_POINT("PickDeleteTriggeredCompactionReturnNonnullptr");
-      ROCKS_LOG_BUFFER(log_buffer_,
-                       "[%s] Universal: delete triggered compaction\n",
-                       cf_name_.c_str());
-    }
-  }
+  c = MaybePickPeriodicCompaction(c);
+  c = MaybePickSizeAmpCompaction(c, file_num_compaction_trigger);
+  c = MaybePickCompactionToReduceSortedRunsBasedFileRatio(
+      c, file_num_compaction_trigger, ratio);
+  c = MaybePickCompactionToReduceSortedRuns(c, file_num_compaction_trigger,
+                                            ratio);
+  c = MaybePickDeleteTriggeredCompaction(c);
 
   if (c == nullptr) {
     TEST_SYNC_POINT_CALLBACK(
@@ -700,6 +783,7 @@ Compaction* UniversalCompactionBuilder::PickCompaction() {
   }
   assert(c->output_level() <=
          vstorage_->MaxOutputLevel(ioptions_.allow_ingest_behind));
+  assert(MeetsOutputLevelRequirements(c->output_level()));
 
   if (mutable_cf_options_.compaction_options_universal.allow_trivial_move ==
           true &&
@@ -825,14 +909,16 @@ Compaction* UniversalCompactionBuilder::PickCompactionToReduceSortedRuns(
       if (sr->being_compacted) {
         ROCKS_LOG_BUFFER(log_buffer_,
                          "[%s] Universal: %s"
-                         "[%d] being compacted, skipping",
+                         "[%d] being compacted, skipping for compaction to "
+                         "reduce sorted runs",
                          cf_name_.c_str(), file_num_buf, loop);
       } else if (sr->level_has_marked_standalone_rangedel) {
-        ROCKS_LOG_BUFFER(log_buffer_,
-                         "[%s] Universal: %s"
-                         "[%d] has standalone range tombstone files marked for "
-                         "compaction, skipping",
-                         cf_name_.c_str(), file_num_buf, loop);
+        ROCKS_LOG_BUFFER(
+            log_buffer_,
+            "[%s] Universal: %s"
+            "[%d] has standalone range tombstone files marked for "
+            "compaction, skipping for compaction to reduce sorted runs",
+            cf_name_.c_str(), file_num_buf, loop);
       }
 
       sr = nullptr;
@@ -845,7 +931,8 @@ Compaction* UniversalCompactionBuilder::PickCompactionToReduceSortedRuns(
       char file_num_buf[kFormatFileNumberBufSize];
       sr->Dump(file_num_buf, sizeof(file_num_buf), true);
       ROCKS_LOG_BUFFER(log_buffer_,
-                       "[%s] Universal: Possible candidate %s[%d].",
+                       "[%s] Universal: Possible candidate for compaction to "
+                       "reduce sorted runs %s[%d].",
                        cf_name_.c_str(), file_num_buf, loop);
     }
 
@@ -947,6 +1034,10 @@ Compaction* UniversalCompactionBuilder::PickCompactionToReduceSortedRuns(
     output_level = sorted_runs_[first_index_after].level - 1;
   }
 
+  if (!MeetsOutputLevelRequirements(output_level)) {
+    return nullptr;
+  }
+
   std::vector<CompactionInputFiles> inputs(max_output_level + 1);
   for (size_t i = 0; i < inputs.size(); ++i) {
     inputs[i].level = start_level + static_cast<int>(i);
@@ -1039,18 +1130,19 @@ Compaction* UniversalCompactionBuilder::PickCompactionToReduceSizeAmp() {
       char file_num_buf[kFormatFileNumberBufSize];
       sr->Dump(file_num_buf, sizeof(file_num_buf), true);
       if (sr->being_compacted) {
-        ROCKS_LOG_BUFFER(
-            log_buffer_,
-            "[%s] Universal: stopping at sorted run undergoing compaction: "
-            "%s[%" ROCKSDB_PRIszt "]",
-            cf_name_.c_str(), file_num_buf, start_index - 1);
+        ROCKS_LOG_BUFFER(log_buffer_,
+                         "[%s] Universal: stopping for size amp compaction at "
+                         "sorted run undergoing compaction: "
+                         "%s[%" ROCKSDB_PRIszt "]",
+                         cf_name_.c_str(), file_num_buf, start_index - 1);
       } else if (sr->level_has_marked_standalone_rangedel) {
-        ROCKS_LOG_BUFFER(
-            log_buffer_,
-            "[%s] Universal: stopping at sorted run that has standalone range "
-            "tombstone files marked for compaction: "
-            "%s[%" ROCKSDB_PRIszt "]",
-            cf_name_.c_str(), file_num_buf, start_index - 1);
+        ROCKS_LOG_BUFFER(log_buffer_,
+                         "[%s] Universal: stopping for size amp compaction at "
+                         "sorted run that has "
+                         "standalone range "
+                         "tombstone files marked for compaction: "
+                         "%s[%" ROCKSDB_PRIszt "]",
+                         cf_name_.c_str(), file_num_buf, start_index - 1);
       }
       break;
     }
@@ -1066,11 +1158,12 @@ Compaction* UniversalCompactionBuilder::PickCompactionToReduceSizeAmp() {
   {
     const size_t num_l0_to_exclude = MightExcludeNewL0sToReduceWriteStop(
         num_l0_files, end_index, start_index, candidate_size);
-    ROCKS_LOG_BUFFER(log_buffer_,
-                     "[%s] Universal: Excluding %" ROCKSDB_PRIszt
-                     " latest L0 files to reduce potential write stop "
-                     "triggered by `level0_stop_writes_trigger`",
-                     cf_name_.c_str(), num_l0_to_exclude);
+    ROCKS_LOG_BUFFER(
+        log_buffer_,
+        "[%s] Universal: Excluding for size amp compaction %" ROCKSDB_PRIszt
+        " latest L0 files to reduce potential write stop "
+        "triggered by `level0_stop_writes_trigger`",
+        cf_name_.c_str(), num_l0_to_exclude);
   }
 
   {
@@ -1088,18 +1181,18 @@ Compaction* UniversalCompactionBuilder::PickCompactionToReduceSizeAmp() {
 
   // size amplification = percentage of additional size
   if (candidate_size * 100 < ratio * base_sr_size) {
-    ROCKS_LOG_BUFFER(
-        log_buffer_,
-        "[%s] Universal: size amp not needed. newer-files-total-size %" PRIu64
-        " earliest-file-size %" PRIu64,
-        cf_name_.c_str(), candidate_size, base_sr_size);
+    ROCKS_LOG_BUFFER(log_buffer_,
+                     "[%s] Universal: size amp compction not needed. "
+                     "newer-files-total-size %" PRIu64
+                     " earliest-file-size %" PRIu64,
+                     cf_name_.c_str(), candidate_size, base_sr_size);
     return nullptr;
   } else {
-    ROCKS_LOG_BUFFER(
-        log_buffer_,
-        "[%s] Universal: size amp needed. newer-files-total-size %" PRIu64
-        " earliest-file-size %" PRIu64,
-        cf_name_.c_str(), candidate_size, base_sr_size);
+    ROCKS_LOG_BUFFER(log_buffer_,
+                     "[%s] Universal: size amp compaction needed. "
+                     "newer-files-total-size %" PRIu64
+                     " earliest-file-size %" PRIu64,
+                     cf_name_.c_str(), candidate_size, base_sr_size);
   }
   // Since incremental compaction can't include more than second last
   // level, it can introduce penalty, compared to full compaction. We
@@ -1450,6 +1543,10 @@ Compaction* UniversalCompactionBuilder::PickDeleteTriggeredCompaction() {
     }
     assert(output_level <= max_output_level);
 
+    if (!MeetsOutputLevelRequirements(output_level)) {
+      return nullptr;
+    }
+
     if (output_level != 0) {
       if (start_level == 0) {
         if (!picker_->GetOverlappingL0Files(vstorage_, &start_level_inputs,
@@ -1574,6 +1671,10 @@ Compaction* UniversalCompactionBuilder::PickCompactionWithSortedRunRange(
     output_level = sorted_runs_[end_index + 1].level - 1;
   }
 
+  if (!MeetsOutputLevelRequirements(output_level)) {
+    return nullptr;
+  }
+
   // intra L0 compactions outputs could have overlap
   if (output_level != 0 && picker_->FilesRangeOverlapWithCompaction(
                                inputs, output_level,
diff --git a/db/compaction/compaction_picker_universal.h b/db/compaction/compaction_picker_universal.h
index 18c0f27afbf4..d37fd65bb2a8 100644
--- a/db/compaction/compaction_picker_universal.h
+++ b/db/compaction/compaction_picker_universal.h
@@ -18,12 +18,15 @@ class UniversalCompactionPicker : public CompactionPicker {
   UniversalCompactionPicker(const ImmutableOptions& ioptions,
                             const InternalKeyComparator* icmp)
       : CompactionPicker(ioptions, icmp) {}
+
+  // If `require_max_output_level` is true, only pick compaction
+  // with max output level or return nullptr if no such compaction exists.
   Compaction* PickCompaction(
       const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
       const MutableDBOptions& mutable_db_options,
       const std::vector<SequenceNumber>& existing_snapshots,
       const SnapshotChecker* snapshot_checker, VersionStorageInfo* vstorage,
-      LogBuffer* log_buffer) override;
+      LogBuffer* log_buffer, bool require_max_output_level = false) override;
   int MaxOutputLevel() const override { return NumberLevels() - 1; }
 
   bool NeedsCompaction(const VersionStorageInfo* vstorage) const override;
diff --git a/db/db_compaction_test.cc b/db/db_compaction_test.cc
index ba4edeffa2da..26af75656ea0 100644
--- a/db/db_compaction_test.cc
+++ b/db/db_compaction_test.cc
@@ -455,6 +455,72 @@ TEST_P(DBCompactionTestWithParam, CompactionDeletionTrigger) {
   }
 }
 #endif  // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+TEST_F(DBCompactionTest, UniversalReduceFileLockingRepickNothing) {
+  const int kFileNumCompactionTrigger = 3;
+
+  Options options = CurrentOptions();
+  options.compaction_options_universal.reduce_file_locking = true;
+  // Set `max_background_jobs` to be 3 to allow low and bottom priority thread
+  // to run compaction together
+  options.max_background_jobs = 3;
+  Env::Default()->SetBackgroundThreads(1, Env::Priority::BOTTOM);
+  options.num_levels = 3;
+  options.compaction_style = kCompactionStyleUniversal;
+  options.level0_file_num_compaction_trigger = kFileNumCompactionTrigger;
+  options.compaction_options_universal.max_size_amplification_percent = 1;
+
+  DestroyAndReopen(options);
+
+  // Need to get a token to enable compaction parallelism up to
+  // `max_background_compactions` jobs.
+  auto pressure_token =
+      dbfull()->TEST_write_controler().GetCompactionPressureToken();
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {// Wait for the full (bottom-priority) compaction to be pre-picked as an
+       // intent (that is allowing files to be picked by other compactions and
+       // will pick later when the bottom-priority thread is available to
+       // execute the compaction) before triggering the low-priority compaction.
+       {"DBImpl::BackgroundCompaction:ForwardToBottomPriPool",
+        "LowPriCompaction"},
+       // Wait for low-priority compaction to start before
+       // repicking for the full compaction intent (bottom-priority), enabling
+       // them to run in parallel.
+       {"DBImpl::BackgroundCompaction:NonTrivial",
+        "DBImpl::BGWorkBottomCompaction"}});
+
+  bool bottom_pri_compaction_attempt_repick = false;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction():AfterPickCompactionBottomPri",
+      [&](void* arg) {
+        bottom_pri_compaction_attempt_repick = true;
+        Compaction* c = static_cast<Compaction*>(arg);
+        // Verify the intended full compaction for bottom priority thread does
+        // not get to run (i.e, output to bottommost level) since when it
+        // repicks its files, some of the the intended input files are already
+        // compacted by the low priority thread
+        assert(c == nullptr);
+      });
+
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  for (int i = 0; i < kFileNumCompactionTrigger; ++i) {
+    if (i == 0) {
+      ASSERT_OK(Put("file_locked_for_bottom_pri_compaction", "value"));
+    } else {
+      ASSERT_OK(
+          Put("file_not_locked_for_bottom_pri_compaction" + std::to_string(i),
+              "value"));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  TEST_SYNC_POINT("LowPriCompaction");
+  ASSERT_OK(Put("a_new_file_to_pick_for_low_pri_compaction", "value"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_TRUE(bottom_pri_compaction_attempt_repick);
+}
 
 TEST_F(DBCompactionTest, SkipStatsUpdateTest) {
   // This test verify UpdateAccumulatedStats is not on
@@ -2824,46 +2890,99 @@ TEST_F(DBCompactionTest, L0_CompactionBug_Issue44_b) {
 }
 
 TEST_F(DBCompactionTest, ManualAutoRace) {
-  CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
-  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
-      {{"DBImpl::BGWorkCompaction", "DBCompactionTest::ManualAutoRace:1"},
-       {"DBImpl::RunManualCompaction:WaitScheduled",
-        "BackgroundCallCompaction:0"}});
+  const int kNumL0FilesTrigger = 4;
+  // Verify that the auto compaction is retried after the conflicting exclusive
+  // manual compaction finishes for:
+  // 1. Non-bottom-priority compactions (tested with level compaction)
+  // 2. Bottom-priority compactions (tested with universal compaction)
+  for (auto compaction_style :
+       {kCompactionStyleLevel, kCompactionStyleUniversal}) {
+    Env::Default()->SetBackgroundThreads(
+        compaction_style == kCompactionStyleUniversal ? 2 : 0,
+        Env::Priority::BOTTOM);
+    for (auto universal_reduce_file_locking : {false, true}) {
+      if (compaction_style != kCompactionStyleUniversal &&
+          universal_reduce_file_locking) {
+        continue;
+      }
 
-  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+      Options options = CurrentOptions();
+      options.num_levels = 3;
+      options.level0_file_num_compaction_trigger = kNumL0FilesTrigger;
+      options.compaction_style = compaction_style;
+      options.compaction_options_universal.reduce_file_locking =
+          universal_reduce_file_locking;
 
-  ASSERT_OK(Put(1, "foo", ""));
-  ASSERT_OK(Put(1, "bar", ""));
-  ASSERT_OK(Flush(1));
-  ASSERT_OK(Put(1, "foo", ""));
-  ASSERT_OK(Put(1, "bar", ""));
-  // Generate four files in CF 0, which should trigger an auto compaction
-  ASSERT_OK(Put("foo", ""));
-  ASSERT_OK(Put("bar", ""));
-  ASSERT_OK(Flush());
-  ASSERT_OK(Put("foo", ""));
-  ASSERT_OK(Put("bar", ""));
-  ASSERT_OK(Flush());
-  ASSERT_OK(Put("foo", ""));
-  ASSERT_OK(Put("bar", ""));
-  ASSERT_OK(Flush());
-  ASSERT_OK(Put("foo", ""));
-  ASSERT_OK(Put("bar", ""));
-  ASSERT_OK(Flush());
+      DestroyAndReopen(options);
+      CreateAndReopenWithCF({"exclusive_manual_compaction_cf"}, options);
 
-  // The auto compaction is scheduled but waited until here
-  TEST_SYNC_POINT("DBCompactionTest::ManualAutoRace:1");
-  // The auto compaction will wait until the manual compaction is registerd
-  // before processing so that it will be cancelled.
-  CompactRangeOptions cro;
-  cro.exclusive_manual_compaction = true;
-  ASSERT_OK(dbfull()->CompactRange(cro, handles_[1], nullptr, nullptr));
-  ASSERT_EQ("0,1", FilesPerLevel(1));
+      // Set up sync points to ensure that the auto compaction
+      // encounters a conflict from exclusive manual compaction before the auto
+      // compaction gets to pick files, This will trigger a retry later.
+      //
+      // Specifically, the sync points are set up as following:
+      // 1. Wait until background low-pri scheduled (not picking files yet) or
+      // bottom-pri scheduled (not repicking files yet) for
+      // `universal_reduce_file_locking = true` before triggering
+      // CompactRange()
+      //
+      // 2. Wait until the triggered CompactRange()
+      // registers its compaction and creates conflict before the auto
+      // compaction picks or repicks files for the background compaction.
+      if (compaction_style == kCompactionStyleLevel ||
+          !universal_reduce_file_locking) {
+        ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+            {{"DBImpl::BGWorkCompaction", "DBCompactionTest::ManualAutoRace:1"},
+             {"DBImpl::RunManualCompaction:WaitScheduled",
+              "BackgroundCallCompaction:0"}});
+      } else {
+        ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+            {{"DBImpl::BackgroundCompaction:ForwardToBottomPriPool",
+              "DBCompactionTest::ManualAutoRace:1"},
+             {"DBImpl::RunManualCompaction:WaitScheduled",
+              "BackgroundCallCompaction:0:BottomPri"}});
+      }
 
-  // Eventually the cancelled compaction will be rescheduled and executed.
-  ASSERT_OK(dbfull()->TEST_WaitForCompact());
-  ASSERT_EQ("0,1", FilesPerLevel(0));
-  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+      bool encounter_conflict = false;
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+          "DBImpl::BackgroundCompaction()::Conflict",
+          [&](void* /*arg*/) { encounter_conflict = true; });
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+      // Generate files in CF 1 for exclusive CompactRange()
+      ASSERT_OK(Put(1, "foo", ""));
+      ASSERT_OK(Put(1, "bar", ""));
+      ASSERT_OK(Flush(1));
+      ASSERT_OK(Put(1, "foo", ""));
+      ASSERT_OK(Put(1, "bar", ""));
+      // Generate files in CF0 to trigger full compaction
+      for (int i = 0; i < kNumL0FilesTrigger; ++i) {
+        ASSERT_OK(Put("foo", ""));
+        ASSERT_OK(Put("bar", ""));
+        ASSERT_OK(Flush());
+      }
+
+      TEST_SYNC_POINT("DBCompactionTest::ManualAutoRace:1");
+      CompactRangeOptions cro;
+      cro.exclusive_manual_compaction = true;
+      ASSERT_OK(dbfull()->CompactRange(cro, handles_[1], nullptr, nullptr));
+      ASSERT_EQ(compaction_style == kCompactionStyleLevel ? "0,1" : "0,0,1",
+                FilesPerLevel(1));
+
+      ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+      ASSERT_TRUE(encounter_conflict);
+
+      // Verify that the auto compaction is eventually executed after the
+      // exclusive CompactRange() finishes.
+      ASSERT_EQ(compaction_style == kCompactionStyleLevel ? "0,1" : "0,0,1",
+                FilesPerLevel(0));
+
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+    }
+    Env::Default()->SetBackgroundThreads(0, Env::Priority::BOTTOM);
+  }
 }
 
 TEST_P(DBCompactionTestWithParam, ManualCompaction) {
@@ -3986,41 +4105,51 @@ TEST_P(DBCompactionTestWithParam, IntraL0CompactionDoesNotObsoleteDeletions) {
 TEST_P(DBCompactionTestWithParam, FullCompactionInBottomPriThreadPool) {
   const int kNumFilesTrigger = 3;
   Env::Default()->SetBackgroundThreads(1, Env::Priority::BOTTOM);
-  for (bool use_universal_compaction : {false, true}) {
-    Options options = CurrentOptions();
-    if (use_universal_compaction) {
-      options.compaction_style = kCompactionStyleUniversal;
-    } else {
-      options.compaction_style = kCompactionStyleLevel;
-      options.level_compaction_dynamic_level_bytes = true;
-    }
-    options.num_levels = 4;
-    options.write_buffer_size = 100 << 10;     // 100KB
-    options.target_file_size_base = 32 << 10;  // 32KB
-    options.level0_file_num_compaction_trigger = kNumFilesTrigger;
-    // Trigger compaction if size amplification exceeds 110%
-    options.compaction_options_universal.max_size_amplification_percent = 110;
-    DestroyAndReopen(options);
+  for (auto compaction_style :
+       {kCompactionStyleLevel, kCompactionStyleUniversal}) {
+    for (auto universal_reduce_file_locking : {false, true}) {
+      if (compaction_style != kCompactionStyleUniversal &&
+          universal_reduce_file_locking) {
+        continue;
+      }
+      Options options = CurrentOptions();
+      options.compaction_style = compaction_style;
+      if (compaction_style == kCompactionStyleLevel) {
+        options.level_compaction_dynamic_level_bytes = true;
+      } else {
+        options.compaction_options_universal.reduce_file_locking =
+            universal_reduce_file_locking;
+        // Trigger compaction if size amplification exceeds 110%
+        options.compaction_options_universal.max_size_amplification_percent =
+            110;
+      }
+      options.num_levels = 4;
+      options.write_buffer_size = 100 << 10;     // 100KB
+      options.target_file_size_base = 32 << 10;  // 32KB
+      options.level0_file_num_compaction_trigger = kNumFilesTrigger;
 
-    int num_bottom_pri_compactions = 0;
-    SyncPoint::GetInstance()->SetCallBack(
-        "DBImpl::BGWorkBottomCompaction",
-        [&](void* /*arg*/) { ++num_bottom_pri_compactions; });
-    SyncPoint::GetInstance()->EnableProcessing();
+      DestroyAndReopen(options);
 
-    Random rnd(301);
-    for (int num = 0; num < kNumFilesTrigger; num++) {
-      ASSERT_EQ(NumSortedRuns(), num);
-      int key_idx = 0;
-      GenerateNewFile(&rnd, &key_idx);
-    }
-    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+      int num_bottom_pri_compactions = 0;
+      SyncPoint::GetInstance()->SetCallBack(
+          "DBImpl::BGWorkBottomCompaction",
+          [&](void* /*arg*/) { ++num_bottom_pri_compactions; });
+      SyncPoint::GetInstance()->EnableProcessing();
 
-    ASSERT_EQ(1, num_bottom_pri_compactions);
+      Random rnd(301);
+      for (int num = 0; num < kNumFilesTrigger; num++) {
+        ASSERT_EQ(NumSortedRuns(), num);
+        int key_idx = 0;
+        GenerateNewFile(&rnd, &key_idx);
+      }
+      ASSERT_OK(dbfull()->TEST_WaitForCompact());
 
-    // Verify that size amplification did occur
-    ASSERT_EQ(NumSortedRuns(), 1);
-    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+      ASSERT_EQ(1, num_bottom_pri_compactions);
+
+      // Verify that size amplification did occur
+      ASSERT_EQ(NumSortedRuns(), 1);
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+    }
   }
   Env::Default()->SetBackgroundThreads(0, Env::Priority::BOTTOM);
 }
@@ -9998,55 +10127,60 @@ TEST_F(DBCompactionTest, BottomPriCompactionCountsTowardConcurrencyLimit) {
 
   env_->SetBackgroundThreads(1, Env::Priority::BOTTOM);
 
-  Options options = CurrentOptions();
-  options.level0_file_num_compaction_trigger = kNumL0Files;
-  options.num_levels = kNumLevels;
-  DestroyAndReopen(options);
+  for (bool universal_reduce_file_locking : {false, true}) {
+    Options options = CurrentOptions();
+    options.level0_file_num_compaction_trigger = kNumL0Files;
+    options.num_levels = kNumLevels;
+    options.compaction_style = kCompactionStyleUniversal;
+    options.compaction_options_universal.reduce_file_locking =
+        universal_reduce_file_locking;
+    DestroyAndReopen(options);
 
-  // Setup last level to be non-empty since it's a bit unclear whether
-  // compaction to an empty level would be considered "bottommost".
-  ASSERT_OK(Put(Key(0), "val"));
-  ASSERT_OK(Flush());
-  MoveFilesToLevel(kNumLevels - 1);
+    // Setup last level to be non-empty since it's a bit unclear whether
+    // compaction to an empty level would be considered "bottommost".
+    ASSERT_OK(Put(Key(0), "val"));
+    ASSERT_OK(Flush());
+    MoveFilesToLevel(kNumLevels - 1);
 
-  SyncPoint::GetInstance()->LoadDependency(
-      {{"DBImpl::BGWorkBottomCompaction",
-        "DBCompactionTest::BottomPriCompactionCountsTowardConcurrencyLimit:"
-        "PreTriggerCompaction"},
-       {"DBCompactionTest::BottomPriCompactionCountsTowardConcurrencyLimit:"
-        "PostTriggerCompaction",
-        "BackgroundCallCompaction:0"}});
-  SyncPoint::GetInstance()->EnableProcessing();
+    SyncPoint::GetInstance()->LoadDependency(
+        {{"DBImpl::BGWorkBottomCompaction",
+          "DBCompactionTest::BottomPriCompactionCountsTowardConcurrencyLimit:"
+          "PreTriggerCompaction"},
+         {"DBCompactionTest::BottomPriCompactionCountsTowardConcurrencyLimit:"
+          "PostTriggerCompaction",
+          "BackgroundCallCompaction:0"}});
+    SyncPoint::GetInstance()->EnableProcessing();
 
-  port::Thread compact_range_thread([&] {
-    CompactRangeOptions cro;
-    cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
-    cro.exclusive_manual_compaction = false;
-    ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
-  });
+    port::Thread compact_range_thread([&] {
+      CompactRangeOptions cro;
+      cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+      cro.exclusive_manual_compaction = false;
+      ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+    });
 
-  // Sleep in the low-pri thread so any newly scheduled compaction will be
-  // queued. Otherwise it might finish before we check its existence.
-  test::SleepingBackgroundTask sleeping_task_low;
-  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
-                 Env::Priority::LOW);
-  sleeping_task_low.WaitUntilSleeping();
+    // Sleep in the low-pri thread so any newly scheduled compaction will be
+    // queued. Otherwise it might finish before we check its existence.
+    test::SleepingBackgroundTask sleeping_task_low;
+    env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+                   &sleeping_task_low, Env::Priority::LOW);
+    sleeping_task_low.WaitUntilSleeping();
 
-  TEST_SYNC_POINT(
-      "DBCompactionTest::BottomPriCompactionCountsTowardConcurrencyLimit:"
-      "PreTriggerCompaction");
-  for (int i = 0; i < kNumL0Files; ++i) {
-    ASSERT_OK(Put(Key(0), "val"));
-    ASSERT_OK(Flush());
-  }
-  ASSERT_EQ(0u, env_->GetThreadPoolQueueLen(Env::Priority::LOW));
-  TEST_SYNC_POINT(
-      "DBCompactionTest::BottomPriCompactionCountsTowardConcurrencyLimit:"
-      "PostTriggerCompaction");
+    TEST_SYNC_POINT(
+        "DBCompactionTest::BottomPriCompactionCountsTowardConcurrencyLimit:"
+        "PreTriggerCompaction");
+    for (int i = 0; i < kNumL0Files; ++i) {
+      ASSERT_OK(Put(Key(0), "val"));
+      ASSERT_OK(Flush());
+    }
+    ASSERT_EQ(0u, env_->GetThreadPoolQueueLen(Env::Priority::LOW));
+    TEST_SYNC_POINT(
+        "DBCompactionTest::BottomPriCompactionCountsTowardConcurrencyLimit:"
+        "PostTriggerCompaction");
 
-  sleeping_task_low.WakeUp();
-  sleeping_task_low.WaitUntilDone();
-  compact_range_thread.join();
+    sleeping_task_low.WakeUp();
+    sleeping_task_low.WaitUntilDone();
+    compact_range_thread.join();
+  }
 }
 
 TEST_F(DBCompactionTest, BottommostFileCompactionAllowIngestBehind) {
diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h
index 0034a7e97764..66918e1d0077 100644
--- a/db/db_impl/db_impl.h
+++ b/db/db_impl/db_impl.h
@@ -1952,12 +1952,19 @@ class DBImpl : public DB {
   };
   struct PrepickedCompaction {
     // background compaction takes ownership of `compaction`.
+    // TODO(hx235): consider using std::shared_ptr for easier ownership
+    // management
     Compaction* compaction;
     // caller retains ownership of `manual_compaction_state` as it is reused
     // across background compactions.
     ManualCompactionState* manual_compaction_state;  // nullptr if non-manual
     // task limiter token is requested during compaction picking.
     std::unique_ptr<TaskLimiterToken> task_token;
+    // If true, `compaction` is picked temporarily to express compaction intent
+    // and will be released before re-picking a real compaction based on the
+    // updated LSM shape when thread associated with `compaction` is ready to
+    // run
+    bool need_repick;
   };
 
   struct CompactionArg {
@@ -2456,6 +2463,8 @@ class DBImpl : public DB {
                          bool* flush_rescheduled_to_retain_udt,
                          Env::Priority thread_pri);
 
+  Compaction* CreateIntendedCompactionForwardedToBottomPriorityPool(
+      Compaction* c);
   bool EnoughRoomForCompaction(ColumnFamilyData* cfd,
                                const std::vector<CompactionInputFiles>& inputs,
                                bool* sfm_bookkeeping, LogBuffer* log_buffer);
@@ -2723,6 +2732,11 @@ class DBImpl : public DB {
       const std::vector<ColumnFamilyHandle*>& column_families,
       ErrorIteratorFuncType error_iterator_func);
 
+  bool ShouldPickCompaction(bool is_prepicked,
+                            const PrepickedCompaction* prepicked_compaction);
+
+  void ResetBottomPriCompactionIntent(ColumnFamilyData* cfd,
+                                      std::unique_ptr<Compaction>& c);
   // Lock over the persistent DB state.  Non-nullptr iff successfully acquired.
   FileLock* db_lock_ = nullptr;
 
diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc
index 5b1ea2cd1f61..b5f6e0e70332 100644
--- a/db/db_impl/db_impl_compaction_flush.cc
+++ b/db/db_impl/db_impl_compaction_flush.cc
@@ -2170,6 +2170,7 @@ Status DBImpl::RunManualCompaction(
         // Don't throttle manual compaction, only count outstanding tasks.
         assert(false);
       }
+      ca->prepicked_compaction->need_repick = false;
       manual.incomplete = false;
       if (compaction->bottommost_level() &&
           env_->GetBackgroundThreads(Env::Priority::BOTTOM) > 0) {
@@ -3396,6 +3397,10 @@ void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction,
   bool made_progress = false;
   JobContext job_context(next_job_id_.fetch_add(1), true);
   TEST_SYNC_POINT("BackgroundCallCompaction:0");
+  if (bg_thread_pri == Env::Priority::BOTTOM) {
+    TEST_SYNC_POINT("BackgroundCallCompaction:0:BottomPri");
+  }
+
   LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL,
                        immutable_db_options_.info_log.get());
   {
@@ -3631,34 +3636,54 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
                  : m->manual_end->DebugString(true).c_str()));
       }
     }
-  } else if (!is_prepicked && !compaction_queue_.empty()) {
+  } else if (ShouldPickCompaction(is_prepicked, prepicked_compaction)) {
+    bool need_repick = is_prepicked && prepicked_compaction->need_repick;
     if (HasExclusiveManualCompaction()) {
-      // Can't compact right now, but try again later
       TEST_SYNC_POINT("DBImpl::BackgroundCompaction()::Conflict");
 
-      // Stay in the compaction queue.
-      unscheduled_compactions_++;
+      // TODO(hx235): Resolve conflict between intended
+      // bottom-priority compaction (requiring repick, i.e., need_repick = true)
+      // and exclusive manual compaction by releasing the intended
+      // bottom-priority compaction.
+      if (!need_repick) {
+        // Can't compact right now, but try again later
+        //
+        // Increase `unscheduled_compactions_` directly so we
+        // don't need to
+        // dequeue and enqueue the CFD again in the compaction queue and thus
+        // keep the CFD's position in the queue
+        unscheduled_compactions_++;
 
-      return Status::OK();
+        return Status::OK();
+      }
     }
 
-    auto cfd = PickCompactionFromQueue(&task_token, log_buffer);
-    if (cfd == nullptr) {
-      // Can't find any executable task from the compaction queue.
-      // All tasks have been throttled by compaction thread limiter.
-      ++unscheduled_compactions_;
-      return Status::Busy();
-    }
+    ColumnFamilyData* cfd = nullptr;
+
+    if (!need_repick) {
+      cfd = PickCompactionFromQueue(&task_token, log_buffer);
+      if (cfd == nullptr) {
+        // Can't find any executable task from the compaction queue.
+        // All tasks have been throttled by compaction thread limiter.
+        ++unscheduled_compactions_;
+        return Status::Busy();
+      }
 
-    // We unreference here because the following code will take a Ref() on
-    // this cfd if it is going to use it (Compaction class holds a
-    // reference).
-    // This will all happen under a mutex so we don't have to be afraid of
-    // somebody else deleting it.
-    if (cfd->UnrefAndTryDelete()) {
-      // This was the last reference of the column family, so no need to
-      // compact.
-      return Status::OK();
+      // We unreference here because the following code will take a Ref() on
+      // this cfd if it is going to use it (Compaction class holds a
+      // reference).
+      // This will all happen under a mutex so we don't have to be afraid of
+      // somebody else deleting it.
+      if (cfd->UnrefAndTryDelete()) {
+        // This was the last reference of the column family, so no need to
+        // compact.
+        return Status::OK();
+      }
+    } else {
+      cfd = c->column_family_data();
+      assert(cfd);
+      ResetBottomPriCompactionIntent(cfd, c);
+      assert(c == nullptr);
     }
 
     // Pick up latest mutable CF Options and use it throughout the
@@ -3679,10 +3704,17 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
         InitSnapshotContext(job_context);
         assert(is_snapshot_supported_ || snapshots_.empty());
       }
-      c.reset(cfd->PickCompaction(mutable_cf_options, mutable_db_options_,
-                                  job_context->snapshot_seqs,
-                                  job_context->snapshot_checker, log_buffer));
-      TEST_SYNC_POINT("DBImpl::BackgroundCompaction():AfterPickCompaction");
+      c.reset(cfd->PickCompaction(
+          mutable_cf_options, mutable_db_options_, job_context->snapshot_seqs,
+          job_context->snapshot_checker, log_buffer,
+          thread_pri == Env::Priority::BOTTOM /* require_max_output_level */));
+      if (thread_pri == Env::Priority::LOW) {
+        TEST_SYNC_POINT("DBImpl::BackgroundCompaction():AfterPickCompaction");
+      } else if (thread_pri == Env::Priority::BOTTOM) {
+        TEST_SYNC_POINT_CALLBACK(
+            "DBImpl::BackgroundCompaction():AfterPickCompactionBottomPri",
+            c.get());
+      }
 
       if (c != nullptr) {
         bool enough_room = EnoughRoomForCompaction(
@@ -3696,7 +3728,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
               ->storage_info()
               ->ComputeCompactionScore(c->immutable_options(),
                                        c->mutable_cf_options());
-          AddToCompactionQueue(cfd);
+          EnqueuePendingCompaction(cfd);
 
           c.reset();
           // Don't need to sleep here, because BackgroundCallCompaction
@@ -3718,16 +3750,21 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
           // options take effect.
           // 3) When we Pick a new compaction, we "remove" those files being
           // compacted from the calculation, which then influences compaction
-          // score. Here we check if we need the new compaction even without the
-          // files that are currently being compacted. If we need another
-          // compaction, we might be able to execute it in parallel, so we add
-          // it to the queue and schedule a new thread.
-          if (cfd->NeedsCompaction()) {
-            // Yes, we need more compactions!
-            AddToCompactionQueue(cfd);
-            MaybeScheduleFlushOrCompaction();
-          }
+          // score. Inside EnqueuePendingCompaction(),  we check if we need
+          // the new compaction even without the files that are currently being
+          // compacted. If we need another compaction, we might be able to
+          // execute it in parallel, so we add it to the queue and schedule a
+          // new thread.
+          EnqueuePendingCompaction(cfd);
+          MaybeScheduleFlushOrCompaction();
         }
+      } else if (is_prepicked) {
+        ROCKS_LOG_BUFFER(
+            log_buffer,
+            "[%s] Pre-picked compaction repicked files for compaction as "
+            "required, "
+            "but upon re-evaluation, no compaction was found necessary \n",
+            cfd->GetName().c_str());
       }
     }
   }
@@ -4104,14 +4141,16 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
     ThreadStatusUtil::ResetThreadStatus();
     TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:AfterCompaction",
                              c->column_family_data());
-  } else if (!is_prepicked && c->output_level() > 0 &&
-             c->output_level() ==
+  } else if (!is_prepicked &&
+             Compaction::OutputToNonZeroMaxOutputLevel(
+                 c->output_level(),
                  c->column_family_data()
                      ->current()
                      ->storage_info()
                      ->MaxOutputLevel(
-                         immutable_db_options_.allow_ingest_behind) &&
+                         immutable_db_options_.allow_ingest_behind)) &&
              env_->GetBackgroundThreads(Env::Priority::BOTTOM) > 0) {
+    assert(thread_pri == Env::Priority::LOW);
     // Forward compactions involving last level to the bottom pool if it exists,
     // such that compactions unlikely to contribute to write stalls can be
     // delayed or deprioritized.
@@ -4120,7 +4159,23 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
     ca->db = this;
     ca->compaction_pri_ = Env::Priority::BOTTOM;
     ca->prepicked_compaction = new PrepickedCompaction;
-    ca->prepicked_compaction->compaction = c.release();
+
+    // If `universal_reduce_file_locking` is true, we only lock a limited set of
+    // input files by creating an intended compaction to forward to bottom
+    // priority pool and repicking files when bottom priority thread
+    // gets to execute this intended compaction
+    const bool need_repick =
+        c->mutable_cf_options()
+            .compaction_options_universal.reduce_file_locking;
+    if (need_repick) {
+      ca->prepicked_compaction->compaction =
+          CreateIntendedCompactionForwardedToBottomPriorityPool(c.get());
+      c.reset();
+      ca->prepicked_compaction->need_repick = true;
+    } else {
+      ca->prepicked_compaction->compaction = c.release();
+      ca->prepicked_compaction->need_repick = false;
+    }
     ca->prepicked_compaction->manual_compaction_state = nullptr;
     // Transfer requested token, so it doesn't need to do it again.
     ca->prepicked_compaction->task_token = std::move(task_token);
@@ -4164,8 +4219,15 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
     NotifyOnCompactionBegin(c->column_family_data(), c.get(), status,
                             compaction_job_stats, job_context->job_id);
     mutex_.Unlock();
-    TEST_SYNC_POINT_CALLBACK(
-        "DBImpl::BackgroundCompaction:NonTrivial:BeforeRun", nullptr);
+    if (thread_pri == Env::Priority::LOW) {
+      TEST_SYNC_POINT_CALLBACK(
+          "DBImpl::BackgroundCompaction:NonTrivial:BeforeRun", nullptr);
+    } else {
+      assert(thread_pri == Env::Priority::BOTTOM);
+      TEST_SYNC_POINT(
+          "DBImpl::BackgroundCompaction:NonTrivial:BeforeRunBottomPri");
+    }
+
     // Should handle error?
     compaction_job.Run().PermitUncheckedError();
     TEST_SYNC_POINT("DBImpl::BackgroundCompaction:NonTrivial:AfterRun");
@@ -4259,9 +4321,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
           ->storage_info()
           ->ComputeCompactionScore(c->immutable_options(),
                                    c->mutable_cf_options());
-      if (!cfd->queued_for_compaction()) {
-        AddToCompactionQueue(cfd);
-      }
+      EnqueuePendingCompaction(cfd);
     }
   }
   // this will unref its input_version and column_family_data
@@ -4306,6 +4366,71 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
   return status;
 }
 
+// Create an intended compaction to forward based on the original picked
+// compaction. It serves two purposes while it is waiting
+// for a bottom-priority thread becomes available to run:
+// - Prevent the last input file (or sorted run if non-L0) from
+// being included in compaction score calculations unnecessarily since the
+// intended compaction is already scheduled to compact it
+// - Allow other input files to be picked by low-priority compactions that can
+// run right away
+//
+// Once a bottom-priority available to run this intended compaction, it will
+// repick files to consider the LSM updates that occurred during the waiting
+// period.
+Compaction* DBImpl::CreateIntendedCompactionForwardedToBottomPriorityPool(
+    Compaction* c) {
+  auto* cfd = c->column_family_data();
+  const auto& io = c->immutable_options();
+  const auto& mo = c->mutable_cf_options();
+  auto* vstorage = c->input_version()->storage_info();
+
+  std::vector<CompactionInputFiles> inputs(1);
+
+  const std::vector<FileMetaData*>* max_intput_level_files = nullptr;
+  int max_intput_level = 0;
+
+  for (size_t i = c->num_input_levels(); i >= 1; --i) {
+    size_t level = i - 1;
+    if (c->num_input_files(level) > 0) {
+      max_intput_level = static_cast<int>(level);
+      max_intput_level_files = c->inputs(level);
+      break;
+    }
+  }
+
+  assert(max_intput_level_files);
+  assert(!max_intput_level_files->empty());
+  inputs[0].level = max_intput_level;
+
+  if (max_intput_level == 0) {
+    // The last input file
+    inputs[0].files.push_back(
+        (*max_intput_level_files)[max_intput_level_files->size() - 1]);
+  } else {
+    // The last input sorted run
+    for (FileMetaData* f : (*max_intput_level_files)) {
+      inputs[0].files.push_back(f);
+    }
+  }
+
+  c->ReleaseCompactionFiles(Status::OK());
+
+  Compaction* intended_compaction = new Compaction(
+      vstorage, io, mo, mutable_db_options_, std::move(inputs),
+      c->output_level(), c->target_output_file_size(),
+      c->max_compaction_bytes(), c->output_path_id(), c->output_compression(),
+      c->output_compression_opts(), c->output_temperature(),
+      c->max_subcompactions(), c->grandparents(),
+      std::nullopt /* earliest_snapshot */, nullptr /* snapshot_checker */);
+
+  cfd->compaction_picker()->RegisterCompaction(intended_compaction);
+  vstorage->ComputeCompactionScore(io, mo);
+  intended_compaction->FinalizeInputInfo(cfd->current());
+
+  return intended_compaction;
+}
+
 bool DBImpl::HasPendingManualCompaction() {
   return (!manual_compaction_dequeue_.empty());
 }
@@ -4662,4 +4787,18 @@ Status DBImpl::WaitForCompact(
   }
 }
 
+bool DBImpl::ShouldPickCompaction(
+    bool is_prepicked, const PrepickedCompaction* prepicked_compaction) {
+  return (!is_prepicked && !compaction_queue_.empty()) ||
+         (is_prepicked && prepicked_compaction->need_repick);
+}
+
+void DBImpl::ResetBottomPriCompactionIntent(ColumnFamilyData* cfd,
+                                            std::unique_ptr<Compaction>& c) {
+  c->ReleaseCompactionFiles(Status::OK());
+  cfd->current()->storage_info()->ComputeCompactionScore(
+      c->immutable_options(), c->mutable_cf_options());
+  c.reset();
+}
+
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/db/db_universal_compaction_test.cc b/db/db_universal_compaction_test.cc
index 5a540e4d3321..e7fc69d6fbbe 100644
--- a/db/db_universal_compaction_test.cc
+++ b/db/db_universal_compaction_test.cc
@@ -1672,55 +1672,75 @@ TEST_P(DBTestUniversalCompaction, ConcurrentBottomPriLowPriCompactions) {
   }
   const int kNumFilesTrigger = 3;
   Env::Default()->SetBackgroundThreads(1, Env::Priority::BOTTOM);
-  Options options = CurrentOptions();
-  options.compaction_style = kCompactionStyleUniversal;
-  options.max_background_compactions = 2;
-  options.num_levels = num_levels_;
-  options.write_buffer_size = 100 << 10;     // 100KB
-  options.target_file_size_base = 32 << 10;  // 32KB
-  options.level0_file_num_compaction_trigger = kNumFilesTrigger;
-  // Trigger compaction if size amplification exceeds 110%
-  options.compaction_options_universal.max_size_amplification_percent = 110;
-  DestroyAndReopen(options);
-
-  // Need to get a token to enable compaction parallelism up to
-  // `max_background_compactions` jobs.
-  auto pressure_token =
-      dbfull()->TEST_write_controler().GetCompactionPressureToken();
-  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
-      {// wait for the full compaction to be picked before adding files intended
-       // for the second one.
-       {"DBImpl::BackgroundCompaction:ForwardToBottomPriPool",
-        "DBTestUniversalCompaction:ConcurrentBottomPriLowPriCompactions:0"},
-       // the full (bottom-pri) compaction waits until a partial (low-pri)
-       // compaction has started to verify they can run in parallel.
-       {"DBImpl::BackgroundCompaction:NonTrivial",
-        "DBImpl::BGWorkBottomCompaction"}});
-  SyncPoint::GetInstance()->EnableProcessing();
 
-  Random rnd(301);
-  for (int i = 0; i < 2; ++i) {
-    for (int num = 0; num < kNumFilesTrigger; num++) {
-      int key_idx = 0;
-      GenerateNewFile(&rnd, &key_idx, true /* no_wait */);
-      // use no_wait above because that one waits for flush and compaction. We
-      // don't want to wait for compaction because the full compaction is
-      // intentionally blocked while more files are flushed.
-      ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  for (bool universal_reduce_file_locking : {true, false}) {
+    Options options = CurrentOptions();
+    options.compaction_style = kCompactionStyleUniversal;
+    options.compaction_options_universal.reduce_file_locking =
+        universal_reduce_file_locking;
+    options.max_background_compactions = 2;
+    options.num_levels = num_levels_;
+    options.write_buffer_size = 100 << 10;     // 100KB
+    options.target_file_size_base = 32 << 10;  // 32KB
+    options.level0_file_num_compaction_trigger = kNumFilesTrigger;
+    // Trigger compaction if size amplification exceeds 110%
+    options.compaction_options_universal.max_size_amplification_percent = 110;
+    DestroyAndReopen(options);
+
+    // Need to get a token to enable compaction parallelism up to
+    // `max_background_compactions` jobs.
+    auto pressure_token =
+        dbfull()->TEST_write_controler().GetCompactionPressureToken();
+    if (universal_reduce_file_locking) {
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+          {// Wait for the full compaction to be repicked before adding files
+           // intended for the second compaction.
+           {"DBImpl::BackgroundCompaction():AfterPickCompactionBottomPri",
+            "DBTestUniversalCompaction:ConcurrentBottomPriLowPriCompactions:0"},
+           // Wait for the second compaction to run before running the full
+           // compaction to verify they can run in parallel
+           {"DBImpl::BackgroundCompaction:NonTrivial:BeforeRun",
+            "DBImpl::BackgroundCompaction:NonTrivial:BeforeRunBottomPri"}});
+    } else {
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+          {// Wait for the full compaction to be forwarded before adding files
+           // intended for the second compaction.
+           {"DBImpl::BackgroundCompaction:ForwardToBottomPriPool",
+            "DBTestUniversalCompaction:ConcurrentBottomPriLowPriCompactions:0"},
+           // Wait for the second compaction to run before running the full
+           // compaction to verify they can run in parallel
+           {"DBImpl::BackgroundCompaction:NonTrivial:BeforeRun",
+            "DBImpl::BackgroundCompaction:NonTrivial:BeforeRunBottomPri"}});
     }
-    if (i == 0) {
-      TEST_SYNC_POINT(
-          "DBTestUniversalCompaction:ConcurrentBottomPriLowPriCompactions:0");
+
+    SyncPoint::GetInstance()->EnableProcessing();
+
+    Random rnd(301);
+    for (int i = 0; i < 2; ++i) {
+      for (int num = 0; num < kNumFilesTrigger; num++) {
+        int key_idx = 0;
+        GenerateNewFile(&rnd, &key_idx, true /* no_wait */);
+        // use no_wait above because that one waits for flush and compaction. We
+        // don't want to wait for compaction because the full compaction is
+        // intentionally blocked while more files are flushed.
+        ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+      }
+      if (i == 0) {
+        TEST_SYNC_POINT(
+            "DBTestUniversalCompaction:ConcurrentBottomPriLowPriCompactions:0");
+      }
     }
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+    // First compaction should output to bottom level. Second should output to
+    // L0 since older L0 files pending compaction prevent it from being placed
+    // lower.
+    ASSERT_EQ(NumSortedRuns(), 2);
+    ASSERT_GT(NumTableFilesAtLevel(0), 0);
+    ASSERT_GT(NumTableFilesAtLevel(num_levels_ - 1), 0);
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
   }
-  ASSERT_OK(dbfull()->TEST_WaitForCompact());
 
-  // First compaction should output to bottom level. Second should output to L0
-  // since older L0 files pending compaction prevent it from being placed lower.
-  ASSERT_EQ(NumSortedRuns(), 2);
-  ASSERT_GT(NumTableFilesAtLevel(0), 0);
-  ASSERT_GT(NumTableFilesAtLevel(num_levels_ - 1), 0);
-  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
   Env::Default()->SetBackgroundThreads(0, Env::Priority::BOTTOM);
 }
 
diff --git a/db_stress_tool/db_stress_common.h b/db_stress_tool/db_stress_common.h
index f71d75f7ab05..f911f09ca230 100644
--- a/db_stress_tool/db_stress_common.h
+++ b/db_stress_tool/db_stress_common.h
@@ -427,6 +427,7 @@ DECLARE_bool(auto_refresh_iterator_with_snapshot);
 DECLARE_uint32(memtable_op_scan_flush_trigger);
 DECLARE_uint32(memtable_avg_op_scan_flush_trigger);
 DECLARE_uint32(ingest_wbwi_one_in);
+DECLARE_bool(universal_reduce_file_locking);
 
 constexpr long KB = 1024;
 constexpr int kRandomValueMaxFactor = 3;
diff --git a/db_stress_tool/db_stress_gflags.cc b/db_stress_tool/db_stress_gflags.cc
index efad640bfa3f..19636d38112c 100644
--- a/db_stress_tool/db_stress_gflags.cc
+++ b/db_stress_tool/db_stress_gflags.cc
@@ -1488,4 +1488,11 @@ DEFINE_uint32(
     memtable_avg_op_scan_flush_trigger,
     ROCKSDB_NAMESPACE::ColumnFamilyOptions().memtable_avg_op_scan_flush_trigger,
     "Sets CF option memtable_avg_op_scan_flush_trigger.");
+
+DEFINE_bool(
+    universal_reduce_file_locking,
+    ROCKSDB_NAMESPACE::ColumnFamilyOptions()
+        .compaction_options_universal.reduce_file_locking,
+    "Sets "
+    "ColumnFamilyOptions().compaciton_options_universal.reduce_file_locking.");
 #endif  // GFLAGS
diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc
index 9d99cbdb9ff1..1e674180b1a7 100644
--- a/db_stress_tool/db_stress_test_base.cc
+++ b/db_stress_tool/db_stress_test_base.cc
@@ -4359,6 +4359,8 @@ void InitializeOptionsFromFlags(
   }
 
   options.memtable_op_scan_flush_trigger = FLAGS_memtable_op_scan_flush_trigger;
+  options.compaction_options_universal.reduce_file_locking =
+      FLAGS_universal_reduce_file_locking;
 }
 
 void InitializeOptionsGeneral(
diff --git a/include/rocksdb/universal_compaction.h b/include/rocksdb/universal_compaction.h
index e40033cae44a..6d2579baae2f 100644
--- a/include/rocksdb/universal_compaction.h
+++ b/include/rocksdb/universal_compaction.h
@@ -111,6 +111,26 @@ class CompactionOptionsUniversal {
   // Default: false
   bool incremental;
 
+  // EXPERIMENTAL
+  //
+  // If true, auto universal compaction picking will adjust to minimize locking
+  // of input files when bottom priority compactions are waiting to run. This
+  // can increase the likelihood of existing L0s being selected for compaction,
+  // thereby improving write stall and reducing read regression. It may increase
+  // the overrall write amplification and compaction load on low priority
+  // threads.
+  //
+  // Default: false (disabled)
+  //
+  // This options does not apply to manual compactions.
+  //
+  // This option is temporary in case turning on this feature causes problems
+  // and users need to undo it quickly. This option is planned for removal in
+  // the near future with default value set to true.
+  //
+  // Dynamically changeable through the SetOptions() API.
+  bool reduce_file_locking;
+
   // Default set of parameters
   CompactionOptionsUniversal()
       : size_ratio(1),
@@ -121,7 +141,8 @@ class CompactionOptionsUniversal {
         max_read_amp(-1),
         stop_style(kCompactionStopStyleTotalSize),
         allow_trivial_move(false),
-        incremental(false) {}
+        incremental(false),
+        reduce_file_locking(false) {}
 
 #if __cplusplus >= 202002L
   bool operator==(const CompactionOptionsUniversal& rhs) const = default;
diff --git a/options/cf_options.cc b/options/cf_options.cc
index 315e70273331..6d062089a066 100644
--- a/options/cf_options.cc
+++ b/options/cf_options.cc
@@ -349,6 +349,10 @@ static std::unordered_map<std::string, OptionTypeInfo>
           OptionTypeFlags::kMutable}},
         {"allow_trivial_move",
          {offsetof(class CompactionOptionsUniversal, allow_trivial_move),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"reduce_file_locking",
+         {offsetof(class CompactionOptionsUniversal, reduce_file_locking),
           OptionType::kBoolean, OptionVerificationType::kNormal,
           OptionTypeFlags::kMutable}}};
 
@@ -1201,7 +1205,6 @@ void MutableCFOptions::Dump(Logger* log) const {
                  memtable_op_scan_flush_trigger);
   ROCKS_LOG_INFO(log, "         memtable_avg_op_scan_flush_trigger: %" PRIu32,
                  memtable_avg_op_scan_flush_trigger);
-
   // Universal Compaction Options
   ROCKS_LOG_INFO(log, "compaction_options_universal.size_ratio : %d",
                  compaction_options_universal.size_ratio);
@@ -1224,6 +1227,8 @@ void MutableCFOptions::Dump(Logger* log) const {
       static_cast<int>(compaction_options_universal.allow_trivial_move));
   ROCKS_LOG_INFO(log, "compaction_options_universal.incremental        : %d",
                  static_cast<int>(compaction_options_universal.incremental));
+  ROCKS_LOG_INFO(log, "compaction_options_universal.reduce_file_locking : %d",
+                 compaction_options_universal.reduce_file_locking);
 
   // FIFO Compaction Options
   ROCKS_LOG_INFO(log, "compaction_options_fifo.max_table_files_size : %" PRIu64,
diff --git a/options/options.cc b/options/options.cc
index d61fd8403182..bafcf61a600c 100644
--- a/options/options.cc
+++ b/options/options.cc
@@ -358,6 +358,9 @@ void ColumnFamilyOptions::Dump(Logger* log) const {
                    str_compaction_stop_style.c_str());
   ROCKS_LOG_HEADER(log, "Options.compaction_options_universal.max_read_amp: %d",
                    compaction_options_universal.max_read_amp);
+  ROCKS_LOG_HEADER(
+      log, "Options.compaction_options_universal.reduce_file_locking: %d",
+      compaction_options_universal.reduce_file_locking);
   ROCKS_LOG_HEADER(
       log, "Options.compaction_options_fifo.max_table_files_size: %" PRIu64,
       compaction_options_fifo.max_table_files_size);
diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc
index d78ea5ac3cdb..bfb0bc2dc91b 100644
--- a/tools/db_bench_tool.cc
+++ b/tools/db_bench_tool.cc
@@ -1818,6 +1818,11 @@ DEFINE_int32(same_value_percentage, 0,
              "Percentage of time value will be same i.e good for compression "
              "of the block");
 
+DEFINE_bool(universal_reduce_file_locking,
+            ROCKSDB_NAMESPACE::Options()
+                .compaction_options_universal.reduce_file_locking,
+            "See Options().compaction_options_universal.reduce_file_locking");
+
 namespace ROCKSDB_NAMESPACE {
 namespace {
 static Status CreateMemTableRepFactory(
@@ -4789,6 +4794,8 @@ class Benchmark {
     options.paranoid_memory_checks = FLAGS_paranoid_memory_checks;
     options.memtable_op_scan_flush_trigger =
         FLAGS_memtable_op_scan_flush_trigger;
+    options.compaction_options_universal.reduce_file_locking =
+        FLAGS_universal_reduce_file_locking;
   }
 
   void InitializeOptionsGeneral(Options* opts, ToolHooks& hooks) {
diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index f679b0e865e1..8fd152f99c22 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -107,7 +107,8 @@
     "iterpercent": 10,
     "lock_wal_one_in": lambda: random.choice([10000, 1000000]),
     "mark_for_compaction_one_file_in": lambda: 10 * random.randint(0, 1),
-    "max_background_compactions": 20,
+    "max_background_compactions": lambda: random.choice([2, 20]),
+    "num_bottom_pri_threads": lambda: random.choice([0, 1, 20]),
     "max_bytes_for_level_base": 10485760,
     # max_key has to be the same across invocations for verification to work, hence no lambda
     "max_key": random.choice([100000, 25000000]),
@@ -348,6 +349,7 @@
     "memtable_avg_op_scan_flush_trigger": lambda: random.choice([0, 2, 20, 200]),
     "ingest_wbwi_one_in": lambda: random.choice([0, 0, 100, 500]),
     "compression_manager": lambda: random.choice(["mixed", "none"]),
+    "universal_reduce_file_locking": lambda: random.randint(0, 1),
 }
 
 _TEST_DIR_ENV_VAR = "TEST_TMPDIR"
diff --git a/unreleased_history/new_features/reduce_file_locking.md b/unreleased_history/new_features/reduce_file_locking.md
new file mode 100644
index 000000000000..d2f04d60cc00
--- /dev/null
+++ b/unreleased_history/new_features/reduce_file_locking.md
@@ -0,0 +1 @@
+Add a new option `CompactionOptionsUniversal::reduce_file_locking` and if it's true, auto universal compaction picking will adjust to minimize locking of input files when bottom priority compactions are waiting to run. This can increase the likelihood of existing L0s being selected for compaction, thereby improving write stall and reducing read regression.

From 945fcbe8208d3cc1d73699aa05c0b5f7bda9fe4b Mon Sep 17 00:00:00 2001
From: Andrew Chang <andrewrchang@meta.com>
Date: Thu, 12 Jun 2025 18:39:28 -0700
Subject: [PATCH 131/500] Add cost info field to IODebugContext (#13666)

Summary:
This field will be used internally to feed Warm Storage cost information back through the Sally IO stack. This is needed for cost accounting / reporting.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13666

Test Plan: I made the additional changes needed to set/record the new cost info field, and confirmed that this information could be fed through.

Reviewed By: anand1976

Differential Revision: D76070434

Pulled By: archang19

fbshipit-source-id: 2fab975f14fd8f7c20b5d0d85c31686ccf682068
---
 include/rocksdb/file_system.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/include/rocksdb/file_system.h b/include/rocksdb/file_system.h
index cb6ecee9f28b..e82494711c2b 100644
--- a/include/rocksdb/file_system.h
+++ b/include/rocksdb/file_system.h
@@ -18,6 +18,7 @@
 
 #include <stdint.h>
 
+#include <any>
 #include <chrono>
 #include <cstdarg>
 #include <functional>
@@ -250,6 +251,9 @@ struct IODebugContext {
   };
   uint64_t trace_data = 0;
 
+  // Arbitrary structure containing cost information about the IO request
+  std::any cost_info;
+
   IODebugContext() {}
 
   void AddCounter(std::string& name, uint64_t value) {

From 58420b7c60c4541ea5d91bb895cc7f9d535ebff9 Mon Sep 17 00:00:00 2001
From: Jiffin Tony Thottan <thottanjiffin@gmail.com>
Date: Fri, 13 Jun 2025 09:47:52 -0700
Subject: [PATCH 132/500] include cstdint to trace_record.h (#13651)

Summary:
There are compilation errors on gcc 15 in fedora 42 while compiling ceph.

This is similar to PR https://github.com/facebook/rocksdb/issues/13573.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13651

Reviewed By: jaykorean

Differential Revision: D76062855

Pulled By: cbi42

fbshipit-source-id: d213debbda39fdfac01641daa567687fc104d260
---
 include/rocksdb/trace_record.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/rocksdb/trace_record.h b/include/rocksdb/trace_record.h
index 8f9c3ee2f0f5..d321f538745d 100644
--- a/include/rocksdb/trace_record.h
+++ b/include/rocksdb/trace_record.h
@@ -5,6 +5,7 @@
 
 #pragma once
 
+#include <cstdint>
 #include <memory>
 #include <string>
 #include <vector>

From e3a91ec1e33b90ea4e01ee16cd06276133893855 Mon Sep 17 00:00:00 2001
From: Andrew Chang <andrewrchang@meta.com>
Date: Fri, 13 Jun 2025 14:12:10 -0700
Subject: [PATCH 133/500] Add copy constructor and assignment operator to
 IODebugContext (#13690)

Summary:
Since `request_id` is a raw pointer to a string, copying `IODebugContext` becomes a little bit more complicated. We need to ensure that `request_id` gets its memory freed, but by we don't have ownership of the memory by default. The `request_id` inside `IODebugContext` is meant to point to a string allocated outside of the RocksDB read request. To get around this issue without refactoring `request_id`'s type entirely, we can store a private member variable and have `request_id` point to it, so the memory deallocation happens automatically for us.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13690

Test Plan:
I updated the `RequestIdPlumbingTest` unit test from https://github.com/facebook/rocksdb/issues/13616
```
./db_test --gtest_filter=DBTest.RequestIdPlumbingTest
```

Reviewed By: anand1976

Differential Revision: D76613051

Pulled By: archang19

fbshipit-source-id: 053a5b9c4cde20606ec7854ada29904bdf11d40c
---
 db/db_test.cc                 | 18 ++++++++++++++++++
 include/rocksdb/file_system.h | 35 +++++++++++++++++++++++++++++++++--
 2 files changed, 51 insertions(+), 2 deletions(-)

diff --git a/db/db_test.cc b/db/db_test.cc
index cda3517d7db5..b1f8f2dee048 100644
--- a/db/db_test.cc
+++ b/db/db_test.cc
@@ -151,6 +151,7 @@ TEST_F(DBTest, RequestIdPlumbingTest) {
   options.env = env_;
 
   // Create a mock environment to capture IODebugContext during reads
+  IODebugContext dbgCopy;
   const std::string* captured_request_id_dbg;
 
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
@@ -160,6 +161,8 @@ TEST_F(DBTest, RequestIdPlumbingTest) {
           captured_request_id_dbg = nullptr;
         } else {
           captured_request_id_dbg = dbg->request_id;
+          // Test IODebugContext assignment operator
+          dbgCopy = *dbg;
         }
       });
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
@@ -178,6 +181,10 @@ TEST_F(DBTest, RequestIdPlumbingTest) {
     // Verify the request_id was propagated to the file system
     ASSERT_NE(captured_request_id_dbg, nullptr);
     ASSERT_EQ(*captured_request_id_dbg, test_request_id);
+
+    ASSERT_NE(dbgCopy.request_id, nullptr);
+    ASSERT_NE(dbgCopy.request_id, captured_request_id_dbg);
+    ASSERT_EQ(*dbgCopy.request_id, test_request_id);
   }
 
   captured_request_id_dbg = nullptr;
@@ -197,6 +204,17 @@ TEST_F(DBTest, RequestIdPlumbingTest) {
     // Verify the request_id was propagated to the file system
     ASSERT_NE(captured_request_id_dbg, nullptr);
     ASSERT_EQ(*captured_request_id_dbg, request_id);
+
+    ASSERT_NE(dbgCopy.request_id, nullptr);
+    ASSERT_NE(dbgCopy.request_id, captured_request_id_dbg);
+    ASSERT_EQ(*dbgCopy.request_id, request_id);
+
+    // Test IODebugContext copy constructor
+    IODebugContext dbgCopy2(dbgCopy);
+    ASSERT_NE(dbgCopy2.request_id, nullptr);
+    ASSERT_NE(dbgCopy2.request_id, captured_request_id_dbg);
+    ASSERT_NE(dbgCopy2.request_id, dbgCopy.request_id);
+    ASSERT_EQ(*dbgCopy2.request_id, request_id);
   }
 
   // test request_id plumbing during multiget
diff --git a/include/rocksdb/file_system.h b/include/rocksdb/file_system.h
index e82494711c2b..f2c827ad60a6 100644
--- a/include/rocksdb/file_system.h
+++ b/include/rocksdb/file_system.h
@@ -256,14 +256,39 @@ struct IODebugContext {
 
   IODebugContext() {}
 
+  // Copy constructor
+  IODebugContext(const IODebugContext& other)
+      : file_path(other.file_path),
+        counters(other.counters),
+        msg(other.msg),
+        trace_data(other.trace_data),
+        cost_info(other.cost_info),
+        _request_id(other.request_id ? *other.request_id : "") {
+    request_id = other.request_id ? &_request_id : nullptr;
+  }
+
+  // Copy assignment operator
+  IODebugContext& operator=(const IODebugContext& other) {
+    if (this != &other) {
+      file_path = other.file_path;
+      counters = other.counters;
+      msg = other.msg;
+      trace_data = other.trace_data;
+      cost_info = other.cost_info;
+      _request_id = other.request_id ? *other.request_id : "";
+      request_id = other.request_id ? &_request_id : nullptr;
+    }
+    return *this;
+  }
+
   void AddCounter(std::string& name, uint64_t value) {
     counters.emplace(name, value);
   }
 
   // Called by underlying file system to set request_id and log request_id in
   // IOTracing.
-  void SetRequestId(const std::string* _request_id) {
-    request_id = _request_id;
+  void SetRequestId(const std::string* updated_request_id) {
+    request_id = updated_request_id;
     trace_data |= (1 << TraceData::kRequestID);
   }
 
@@ -276,6 +301,12 @@ struct IODebugContext {
     ss << msg;
     return ss.str();
   }
+
+ private:
+  // Private member that allows for safe copying of IODebugContext without any
+  // memory ownership issues. After copying, request_id can point directly to
+  // this field.
+  std::string _request_id;
 };
 
 // A function pointer type for custom destruction of void pointer passed to

From 504ff4ed55a820817ccb1169b271e2f37fc0dc67 Mon Sep 17 00:00:00 2001
From: Sujit Maharjan <sujitmaharjan@meta.com>
Date: Sat, 14 Jun 2025 05:45:56 -0700
Subject: [PATCH 134/500] Auto skip Compression (#13674)

Summary:
**Context:**
RocksDB's current compression approach rejects blocks if the compressed size exceeds a predefined threshold. To optimize performance, we aim to develop an algorithm that dynamically stops and resumes block compression attempts based on past rejection data.

**Summary:**
The goal of this milestone is to design, implement, and evaluate an algorithm that intelligently skips and resumes block compression attempts in RocksDB. The algorithm tracks whether randomly selected blocks was rejected, compressed or bypassed and using data of window size to determine the current rejection rate. The calculate rejection rate is used to decide whether to pause and resume compression attempts. We measure the effectiveness of skipping and resuming compression using DB bench and identify any concerning regressions in correctness and performance.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13674

Test Plan:
1. Test case to see if it can automatically start compression on compression friendly workload and see if it can automatically stop compression on non-compression friendly workload (auto_skip_compresor_test.cc)
3. Regression analysis to prove that no significant performance attempt

```bash
SUFFIX=`tty | sed 's|/|_|g'`; for ARGS in "-compression_parallel_threads=1 -compression_type=zstd -compression_manager=none"  "-compression_parallel_threads=4 -compression_type=zstd -compression_manager=none" "-compression_parallel_threads=1 -compression_type=zstd -compression_manager=autoskip"  "-compression_parallel_threads=4 -compression_type=zstd -compression_manager=autoskip" ; do echo $ARGS; (for I in `seq 1 20`; do ./db_bench -db=/dev/shm/dbbench$SUFFIX --benchmarks=fillseq -num=10000000 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=1000 -fifo_compaction_allow_compaction=0 -disable_wal -write_buffer_size=12000000 $ARGS 2>&1 | grep micros/op; done) | awk '{n++; sum += $5;} END { print int(sum / n); }'; done
```
Measurement experiment | throughput (% change from main branch) |
|---------------|--------------------------------|
compression manager = none (main branch) | 1106890.35 ops/s
compression manager = none (auto skip) | 1097574.55 ops/s (-0.84%)
compression manager = auto skip (auto skip branch) | 1133432.9 ops/s (+2.4%)

Reviewed By: hx235

Differential Revision: D76220795

Pulled By: shubhajeet

fbshipit-source-id: 0f46ab34da1b451f8907306afba221503e6e22a5
---
 BUCK                                   |   7 +
 CMakeLists.txt                         |   2 +
 Makefile                               |   3 +
 db_stress_tool/db_stress_test_base.cc  |  34 ++--
 include/rocksdb/advanced_compression.h |   7 +-
 src.mk                                 |   2 +
 tools/db_bench_tool.cc                 |  53 +++---
 tools/db_crashtest.py                  |  17 +-
 util/auto_skip_compressor.cc           | 111 +++++++++++++
 util/auto_skip_compressor.h            |  67 ++++++++
 util/compression_test.cc               | 216 +++++++++++++++++++++++++
 11 files changed, 484 insertions(+), 35 deletions(-)
 create mode 100644 util/auto_skip_compressor.cc
 create mode 100644 util/auto_skip_compressor.h
 create mode 100644 util/compression_test.cc

diff --git a/BUCK b/BUCK
index c14cd38883b1..52e256c342f9 100644
--- a/BUCK
+++ b/BUCK
@@ -249,6 +249,7 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[
         "trace_replay/trace_record_result.cc",
         "trace_replay/trace_replay.cc",
         "util/async_file_reader.cc",
+        "util/auto_skip_compressor.cc",
         "util/build_version.cc",
         "util/cleanable.cc",
         "util/coding.cc",
@@ -4710,6 +4711,12 @@ cpp_unittest_wrapper(name="compressed_secondary_cache_test",
             extra_compiler_flags=[])
 
 
+cpp_unittest_wrapper(name="compression_test",
+            srcs=["util/compression_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
 cpp_unittest_wrapper(name="configurable_test",
             srcs=["options/configurable_test.cc"],
             deps=[":rocksdb_test_lib"],
diff --git a/CMakeLists.txt b/CMakeLists.txt
index b4f5b8fe185b..b760890a3100 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -880,6 +880,7 @@ set(SOURCES
         util/comparator.cc
         util/compression.cc
         util/simple_mixed_compressor.cc
+        util/auto_skip_compressor.cc
         util/compression_context_cache.cc
         util/concurrent_task_limiter_impl.cc
         util/crc32c.cc
@@ -1446,6 +1447,7 @@ if(WITH_TESTS)
         table/table_test.cc
         table/block_fetcher_test.cc
         test_util/testutil_test.cc
+        util/compression_test.cc
         trace_replay/block_cache_tracer_test.cc
         trace_replay/io_tracer_test.cc
         tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc
diff --git a/Makefile b/Makefile
index 3b423ba41660..87778338075e 100644
--- a/Makefile
+++ b/Makefile
@@ -1491,6 +1491,9 @@ db_test: $(OBJ_DIR)/db/db_test.o $(TEST_LIBRARY) $(LIBRARY)
 db_test2: $(OBJ_DIR)/db/db_test2.o $(TEST_LIBRARY) $(LIBRARY)
 	$(AM_LINK)
 
+compression_test: $(OBJ_DIR)/util/compression_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
 db_logical_block_size_cache_test: $(OBJ_DIR)/db/db_logical_block_size_cache_test.o $(TEST_LIBRARY) $(LIBRARY)
 	$(AM_LINK)
 
diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc
index 1e674180b1a7..1235321d56b1 100644
--- a/db_stress_tool/db_stress_test_base.cc
+++ b/db_stress_tool/db_stress_test_base.cc
@@ -3412,21 +3412,27 @@ void StressTest::Open(SharedState* shared, bool reopen) {
     InitializeOptionsFromFlags(cache_, filter_policy_, options_);
   }
   InitializeOptionsGeneral(cache_, filter_policy_, sqfc_factory_, options_);
-  if (!strcasecmp(FLAGS_compression_manager.c_str(), "mixed")) {
-    // Currently limited to ZSTD compression. Table property compression_name
-    // needs to set to zstd for now even when there can be more than one
-    // algorithm in the table under your compressor.
-    options_.compression = kZSTD;
-    options_.bottommost_compression = kZSTD;
-    if (!ZSTD_Supported()) {
-      fprintf(stderr,
-              "ZSTD compression not supported thus mixed compression cannot be "
-              "used\n");
-      exit(1);
+  if (strcasecmp(FLAGS_compression_manager.c_str(), "none")) {
+    if (!strcasecmp(FLAGS_compression_manager.c_str(), "mixed")) {
+      // Currently limited to ZSTD compression. Table property compression_name
+      // needs to set to zstd for now even when there can be more than one
+      // algorithm in the table under your compressor.
+      if (!ZSTD_Supported()) {
+        fprintf(
+            stderr,
+            "ZSTD compression not supported thus mixed compression cannot be "
+            "used\n");
+        exit(1);
+      }
+      auto mgr = std::make_shared<RoundRobinManager>(
+          GetDefaultBuiltinCompressionManager());
+      options_.compression_manager = mgr;
+      options_.compression = kZSTD;
+      options_.bottommost_compression = kZSTD;
+    } else if (!strcasecmp(FLAGS_compression_manager.c_str(), "autoskip")) {
+      options_.compression_manager = CreateAutoSkipCompressionManager(
+          GetDefaultBuiltinCompressionManager());
     }
-    auto mgr = std::make_shared<RoundRobinManager>(
-        GetDefaultBuiltinCompressionManager());
-    options_.compression_manager = mgr;
   } else if (!strcasecmp(FLAGS_compression_manager.c_str(), "none")) {
     // Nothing to do using default compression manager
   } else {
diff --git a/include/rocksdb/advanced_compression.h b/include/rocksdb/advanced_compression.h
index f73f5838fda6..3672bc84e0c6 100644
--- a/include/rocksdb/advanced_compression.h
+++ b/include/rocksdb/advanced_compression.h
@@ -506,5 +506,10 @@ class CompressionManagerWrapper : public CompressionManager {
 // compression_manager=nullptr with this
 const std::shared_ptr<CompressionManager>&
 GetDefaultBuiltinCompressionManager();
-
+// Gets CompressionManager designed for the automated compression strategy.
+// This may include deciding to compress or not.
+// In future should be able to select compression algorithm based on the CPU
+// utilization and IO constraints.
+std::shared_ptr<CompressionManagerWrapper> CreateAutoSkipCompressionManager(
+    std::shared_ptr<CompressionManager> wrapped);
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/src.mk b/src.mk
index 6fe4b8539cbe..b1d5a59d8d0e 100644
--- a/src.mk
+++ b/src.mk
@@ -243,6 +243,7 @@ LIB_SOURCES =                                                   \
   util/compaction_job_stats_impl.cc                             \
   util/comparator.cc                                            \
   util/compression.cc                                           \
+  util/auto_skip_compressor.cc                                           \
   util/compression_context_cache.cc                             \
   util/concurrent_task_limiter_impl.cc                          \
   util/crc32c.cc                                                \
@@ -592,6 +593,7 @@ TEST_MAIN_SOURCES =                                                     \
   table/table_test.cc                                                   \
   table/block_fetcher_test.cc                                           \
   test_util/testutil_test.cc                                            \
+  util/compression_test.cc                                            \
   tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc         \
   tools/io_tracer_parser_test.cc                                        \
   tools/ldb_cmd_test.cc                                                 \
diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc
index bfb0bc2dc91b..aeda592e8934 100644
--- a/tools/db_bench_tool.cc
+++ b/tools/db_bench_tool.cc
@@ -2903,14 +2903,13 @@ class Benchmark {
     // mixed compression  manager expect compression type to be expliciltiy
     // configured through Options to be zstd
     auto compression = std::string("zstd");
-    if (!strcasecmp(FLAGS_compression_manager.c_str(), "mixed")) {
-      fprintf(stdout, "Compression manager: mixed\n");
-      fprintf(stdout, "Compression: zstd\n");
-    } else {
-      fprintf(stdout, "Compression manager: none\n");
+    if (!strcasecmp(FLAGS_compression_manager.c_str(), "none")) {
       compression = CompressionTypeToString(FLAGS_compression_type_e);
-      fprintf(stdout, "Compression: %s\n", compression.c_str());
+    } else {
+      fprintf(stdout, "Compression manager: %s\n",
+              FLAGS_compression_manager.c_str());
     }
+    fprintf(stdout, "Compression: %s\n", compression.c_str());
     fprintf(stdout, "Compression sampling rate: %" PRId64 "\n",
             FLAGS_sample_for_compression);
     if (options.memtable_factory != nullptr) {
@@ -4634,19 +4633,37 @@ class Benchmark {
         FLAGS_level0_file_num_compaction_trigger;
     options.level0_slowdown_writes_trigger =
         FLAGS_level0_slowdown_writes_trigger;
-    if (!strcasecmp(FLAGS_compression_manager.c_str(), "mixed")) {
-      // Need to list zstd in the compression_name table property if it's
-      // potentially used by being in the mix (i.e., potentially at least one
-      // data block in the table is compressed by zstd). This ensures proper
-      // context and dictionary handling, and prevents crashes in older RocksDB
-      // versions.
-      options.compression = kZSTD;
-      options.bottommost_compression = kZSTD;
-      auto mgr = std::make_shared<RoundRobinManager>(
-          GetDefaultBuiltinCompressionManager());
-      options.compression_manager = mgr;
-    } else {
+    if (!strcasecmp(FLAGS_compression_manager.c_str(), "none")) {
       options.compression = FLAGS_compression_type_e;
+    } else {
+      std::shared_ptr<CompressionManagerWrapper> mgr;
+      if (!strcasecmp(FLAGS_compression_manager.c_str(), "mixed")) {
+        // Need to list zstd in the compression_name table property if it's
+        // potentially used by being in the mix (i.e., potentially at least one
+        // data block in the table is compressed by zstd). This ensures proper
+        // context and dictionary handling, and prevents crashes in older
+        // RocksDB versions.
+        options.compression = kZSTD;
+        options.bottommost_compression = kZSTD;
+
+        mgr = std::make_shared<RoundRobinManager>(
+            GetDefaultBuiltinCompressionManager());
+      } else if (!strcasecmp(FLAGS_compression_manager.c_str(), "autoskip")) {
+        options.compression = FLAGS_compression_type_e;
+        if (FLAGS_compression_type_e == kNoCompression) {
+          fprintf(stderr,
+                  "Compression type must not be no Compression when using "
+                  "autoskip");
+          ErrorExit();
+        }
+        mgr = CreateAutoSkipCompressionManager(
+            GetDefaultBuiltinCompressionManager());
+      } else {
+        // not defined -> exit with error
+        fprintf(stderr, "Requested compression manager not supported");
+        ErrorExit();
+      }
+      options.compression_manager = mgr;
     }
 
     if (FLAGS_simulate_hybrid_fs_file != "") {
diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index 8fd152f99c22..f5b34e363c77 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -348,8 +348,8 @@
     "memtable_op_scan_flush_trigger": lambda: random.choice([0, 10, 100, 1000]),
     "memtable_avg_op_scan_flush_trigger": lambda: random.choice([0, 2, 20, 200]),
     "ingest_wbwi_one_in": lambda: random.choice([0, 0, 100, 500]),
-    "compression_manager": lambda: random.choice(["mixed", "none"]),
     "universal_reduce_file_locking": lambda: random.randint(0, 1),
+    "compression_manager": lambda: random.choice(["mixed", "none", "autoskip"]),
 }
 
 _TEST_DIR_ENV_VAR = "TEST_TMPDIR"
@@ -1004,7 +1004,20 @@ def finalize_and_sanitize(src_params):
         if dest_params.get("block_align") == 1:
             dest_params["block_align"] = 0
         dest_params["compression_type"] = "zstd"
-        dest_params["bottommost_compression_type"] = "none"
+        dest_params["bottommost_compression_type"] = "zstd"
+    elif dest_params.get("compression_manager") == "autoskip":
+        # disabling compression parallel threads if mixed manager is being used as the predictor is not thread safe
+        dest_params["compression_parallel_threads"] = 1
+        # esuring the compression is being used
+        if dest_params.get("compression_type") == "none":
+            dest_params["compression_type"] = random.choice(
+                ["snappy", "zlib", "lz4", "lz4hc", "xpress", "zstd"]
+            )
+        if dest_params.get("bottommost_compression_type") == "none":
+            dest_params["bottommost_compression_type"] = random.choice(
+                ["snappy", "zlib", "lz4", "lz4hc", "xpress", "zstd"]
+            )
+        dest_params["block_align"] = 0
     else:
         # Enabling block_align with compression is not supported
         if dest_params.get("block_align") == 1:
diff --git a/util/auto_skip_compressor.cc b/util/auto_skip_compressor.cc
new file mode 100644
index 000000000000..3337a2dd95e8
--- /dev/null
+++ b/util/auto_skip_compressor.cc
@@ -0,0 +1,111 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+
+#include "util/auto_skip_compressor.h"
+
+#include "options/options_helper.h"
+#include "rocksdb/advanced_compression.h"
+#include "util/random.h"
+namespace ROCKSDB_NAMESPACE {
+
+int CompressionRejectionProbabilityPredictor::Predict() const {
+  return pred_rejection_prob_percentage_;
+}
+
+size_t CompressionRejectionProbabilityPredictor::attempted_compression_count()
+    const {
+  return rejected_count_ + compressed_count_;
+}
+
+bool CompressionRejectionProbabilityPredictor::Record(
+    Slice uncompressed_block_data, std::string* compressed_output,
+    const CompressionOptions& opts) {
+  if (compressed_output->size() >
+      (static_cast<uint64_t>(opts.max_compressed_bytes_per_kb) *
+       uncompressed_block_data.size()) >>
+      10) {
+    rejected_count_++;
+  } else {
+    compressed_count_++;
+  }
+  if (attempted_compression_count() >= window_size_) {
+    pred_rejection_prob_percentage_ = static_cast<int>(
+        rejected_count_ * 100 / (compressed_count_ + rejected_count_));
+    compressed_count_ = 0;
+    rejected_count_ = 0;
+    assert(attempted_compression_count() == 0);
+  }
+  return true;
+}
+AutoSkipCompressorWrapper::AutoSkipCompressorWrapper(
+    std::unique_ptr<Compressor> compressor, const CompressionOptions& opts,
+    const CompressionType type)
+    : CompressorWrapper::CompressorWrapper(std::move(compressor)),
+      opts_(opts),
+      type_(type),
+      predictor_(
+          std::make_shared<CompressionRejectionProbabilityPredictor>(10)) {
+  (void)type_;
+  (void)opts_;
+}
+
+Status AutoSkipCompressorWrapper::CompressBlock(
+    Slice uncompressed_data, std::string* compressed_output,
+    CompressionType* out_compression_type, ManagedWorkingArea* wa) {
+  bool exploration =
+      Random::GetTLSInstance()->PercentTrue(kExplorationPercentage);
+  TEST_SYNC_POINT_CALLBACK(
+      "AutoSkipCompressorWrapper::CompressBlock::exploitOrExplore",
+      &exploration);
+  if (exploration) {
+    return CompressBlockAndRecord(uncompressed_data, compressed_output,
+                                  out_compression_type, wa);
+  } else {
+    auto prediction = predictor_->Predict();
+    if (prediction <= kProbabilityCutOff) {
+      // decide to compress
+      return CompressBlockAndRecord(uncompressed_data, compressed_output,
+                                    out_compression_type, wa);
+    } else {
+      // decide to bypass compression
+      *out_compression_type = kNoCompression;
+      return Status::OK();
+    }
+  }
+  return Status::OK();
+}
+
+Status AutoSkipCompressorWrapper::CompressBlockAndRecord(
+    Slice uncompressed_data, std::string* compressed_output,
+    CompressionType* out_compression_type, ManagedWorkingArea* wa) {
+  Status status = wrapped_->CompressBlock(uncompressed_data, compressed_output,
+                                          out_compression_type, wa);
+  // determine if it was rejected or compressed
+  predictor_->Record(uncompressed_data, compressed_output, opts_);
+  return status;
+}
+
+const char* AutoSkipCompressorManager::Name() const {
+  // should have returned "AutoSkipCompressorManager" but we currently have an
+  // error so for now returning name of the wrapped container
+  return wrapped_->Name();
+}
+
+std::unique_ptr<Compressor> AutoSkipCompressorManager::GetCompressorForSST(
+    const FilterBuildingContext& context, const CompressionOptions& opts,
+    CompressionType preferred) {
+  assert(GetSupportedCompressions().size() > 1);
+  assert(preferred != kNoCompression);
+  return std::make_unique<AutoSkipCompressorWrapper>(
+      wrapped_->GetCompressorForSST(context, opts, preferred), opts, preferred);
+}
+
+std::shared_ptr<CompressionManagerWrapper> CreateAutoSkipCompressionManager(
+    std::shared_ptr<CompressionManager> wrapped) {
+  return std::make_shared<AutoSkipCompressorManager>(
+      wrapped == nullptr ? GetDefaultBuiltinCompressionManager() : wrapped);
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/util/auto_skip_compressor.h b/util/auto_skip_compressor.h
new file mode 100644
index 000000000000..0a6bcec2059c
--- /dev/null
+++ b/util/auto_skip_compressor.h
@@ -0,0 +1,67 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Creates auto skip compressor wrapper which intelligently decides bypassing
+// compression based on past data
+
+#pragma once
+#include <memory>
+
+#include "rocksdb/advanced_compression.h"
+#include "util/compression.h"
+
+namespace ROCKSDB_NAMESPACE {
+// Predict rejection probability using a moving window approach
+// This class is not thread safe
+class CompressionRejectionProbabilityPredictor {
+ public:
+  CompressionRejectionProbabilityPredictor(int window_size)
+      : pred_rejection_prob_percentage_(0),
+        rejected_count_(0),
+        compressed_count_(0),
+        window_size_(window_size) {}
+  int Predict() const;
+  bool Record(Slice uncompressed_block_data, std::string* compressed_output,
+              const CompressionOptions& opts);
+  size_t attempted_compression_count() const;
+
+ protected:
+  int pred_rejection_prob_percentage_;
+  size_t rejected_count_;
+  size_t compressed_count_;
+  size_t window_size_;
+};
+
+class AutoSkipCompressorWrapper : public CompressorWrapper {
+ public:
+  explicit AutoSkipCompressorWrapper(std::unique_ptr<Compressor> compressor,
+                                     const CompressionOptions& opts,
+                                     const CompressionType type);
+
+  Status CompressBlock(Slice uncompressed_data, std::string* compressed_output,
+                       CompressionType* out_compression_type,
+                       ManagedWorkingArea* wa) override;
+
+ private:
+  Status CompressBlockAndRecord(Slice uncompressed_data,
+                                std::string* compressed_output,
+                                CompressionType* out_compression_type,
+                                ManagedWorkingArea* wa);
+  static constexpr int kExplorationPercentage = 10;
+  static constexpr int kProbabilityCutOff = 50;
+  const CompressionOptions& opts_;
+  const CompressionType type_;
+  std::shared_ptr<CompressionRejectionProbabilityPredictor> predictor_;
+};
+
+class AutoSkipCompressorManager : public CompressionManagerWrapper {
+  using CompressionManagerWrapper::CompressionManagerWrapper;
+  const char* Name() const override;
+  std::unique_ptr<Compressor> GetCompressorForSST(
+      const FilterBuildingContext& context, const CompressionOptions& opts,
+      CompressionType preferred) override;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/util/compression_test.cc b/util/compression_test.cc
new file mode 100644
index 000000000000..e00c6813fb40
--- /dev/null
+++ b/util/compression_test.cc
@@ -0,0 +1,216 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Testing the features of auto skip compression manager
+//
+// ***********************************************************************
+// EXPERIMENTAL - subject to change while under development
+// ***********************************************************************
+
+#include <cstdlib>
+#include <memory>
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "rocksdb/flush_block_policy.h"
+#include "table/block_based/block_builder.h"
+#include "test_util/testutil.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class AutoSkipTestFlushBlockPolicy : public FlushBlockPolicy {
+ public:
+  explicit AutoSkipTestFlushBlockPolicy(const int window,
+                                        const BlockBuilder& data_block_builder,
+                                        std::shared_ptr<Statistics> statistics)
+      : window_(window),
+        num_keys_(0),
+        data_block_builder_(data_block_builder),
+        statistics_(statistics) {}
+
+  bool Update(const Slice& /*key*/, const Slice& /*value*/) override {
+    auto multiple_of_10 = num_keys_ / window_;
+    if (data_block_builder_.empty()) {
+      // First key in this block
+      return false;
+    }
+    // Check every window
+    if (num_keys_ % window_ == 0) {
+      auto set_exploration = [&](void* arg) {
+        bool* exploration = static_cast<bool*>(arg);
+        *exploration = true;
+      };
+      auto unset_exploration = [&](void* arg) {
+        bool* exploration = static_cast<bool*>(arg);
+        *exploration = false;
+      };
+      SyncPoint::GetInstance()->DisableProcessing();
+      SyncPoint::GetInstance()->ClearAllCallBacks();
+      // We force exploration to set the predicted rejection ratio and then test
+      // that the prediction is exploited
+      if (multiple_of_10 % 2 == 0) {
+        SyncPoint::GetInstance()->SetCallBack(
+            "AutoSkipCompressorWrapper::CompressBlock::exploitOrExplore",
+            set_exploration);
+      } else {
+        SyncPoint::GetInstance()->SetCallBack(
+            "AutoSkipCompressorWrapper::CompressBlock::exploitOrExplore",
+            unset_exploration);
+      }
+      SyncPoint::GetInstance()->EnableProcessing();
+
+      auto compressed_count = PopStat(NUMBER_BLOCK_COMPRESSED);
+      auto bypassed_count = PopStat(NUMBER_BLOCK_COMPRESSION_BYPASSED);
+      auto rejected_count = PopStat(NUMBER_BLOCK_COMPRESSION_REJECTED);
+      auto total = compressed_count + rejected_count + bypassed_count;
+      int rejection_percentage, bypassed_percentage, compressed_percentage;
+      if (total != 0) {
+        rejection_percentage = static_cast<int>(rejected_count * 100 / total);
+        bypassed_percentage = static_cast<int>(bypassed_count * 100 / total);
+        compressed_percentage =
+            static_cast<int>(compressed_count * 100 / total);
+      }
+      // use mulitple of 10 to get correct assertion
+      switch (multiple_of_10) {
+        case 1:
+          // This is exploration stage in which we set the rejection ratio to
+          // 0.6
+          EXPECT_EQ(rejection_percentage, 60);
+          EXPECT_EQ(bypassed_percentage, 0);
+          EXPECT_EQ(compressed_percentage, 40);
+          break;
+        case 2:
+          // With the rejection ratio set to 0.6 all the blocks should be
+          // bypassed in next window
+          EXPECT_EQ(rejection_percentage, 0);
+          EXPECT_EQ(bypassed_percentage, 100);
+          EXPECT_EQ(compressed_percentage, 0);
+          break;
+        case 3:
+          // This is exploration stage in which we set the rejection ratio to
+          // 0.4
+          EXPECT_EQ(rejection_percentage, 40);
+          EXPECT_EQ(bypassed_percentage, 0);
+          EXPECT_EQ(compressed_percentage, 60);
+          break;
+        case 4:
+          // With the rejection ratio set to 0.4 all the blocks should be
+          // attempted to be compressed
+          EXPECT_EQ(rejection_percentage, 60);
+          EXPECT_EQ(bypassed_percentage, 0);
+          EXPECT_EQ(compressed_percentage, 40);
+      }
+    }
+    num_keys_++;
+    return true;
+  }
+  uint64_t PopStat(Tickers t) { return statistics_->getAndResetTickerCount(t); }
+
+ private:
+  int window_;
+  int num_keys_;
+  const BlockBuilder& data_block_builder_;
+  std::shared_ptr<Statistics> statistics_;
+};
+
+class AutoSkipTestFlushBlockPolicyFactory : public FlushBlockPolicyFactory {
+ public:
+  explicit AutoSkipTestFlushBlockPolicyFactory(
+      const int window, std::shared_ptr<Statistics> statistics)
+      : window_(window), statistics_(statistics) {}
+
+  virtual const char* Name() const override {
+    return "AutoSkipTestFlushBlockPolicyFactory";
+  }
+
+  virtual FlushBlockPolicy* NewFlushBlockPolicy(
+      const BlockBasedTableOptions& /*table_options*/,
+      const BlockBuilder& data_block_builder) const override {
+    (void)data_block_builder;
+    return new AutoSkipTestFlushBlockPolicy(window_, data_block_builder,
+                                            statistics_);
+  }
+
+ private:
+  int window_;
+  std::shared_ptr<Statistics> statistics_;
+};
+
+class DBAutoSkip : public DBTestBase {
+ public:
+  Options options;
+  Random rnd_;
+  int key_index_;
+  DBAutoSkip()
+      : DBTestBase("db_auto_skip", /*env_do_fsync=*/true),
+        options(CurrentOptions()),
+        rnd_(231),
+        key_index_(0) {
+    options.compression_manager =
+        CreateAutoSkipCompressionManager(GetDefaultBuiltinCompressionManager());
+    auto statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+    options.statistics = statistics;
+    options.statistics->set_stats_level(StatsLevel::kExceptTimeForMutex);
+    BlockBasedTableOptions bbto;
+    bbto.enable_index_compression = false;
+    bbto.flush_block_policy_factory.reset(
+        new AutoSkipTestFlushBlockPolicyFactory(10, statistics));
+    options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+    DestroyAndReopen(options);
+  }
+
+  bool CompressionFriendlyPut(const int no_of_kvs, const int size_of_value) {
+    auto value = std::string(size_of_value, 'A');
+    for (int i = 0; i < no_of_kvs; ++i) {
+      auto status = Put(Key(key_index_), value);
+      EXPECT_EQ(status.ok(), true);
+      key_index_++;
+    }
+    return true;
+  }
+  bool CompressionUnfriendlyPut(const int no_of_kvs, const int size_of_value) {
+    auto value = rnd_.RandomBinaryString(size_of_value);
+    for (int i = 0; i < no_of_kvs; ++i) {
+      auto status = Put(Key(key_index_), value);
+      EXPECT_EQ(status.ok(), true);
+      key_index_++;
+    }
+    return true;
+  }
+};
+
+TEST_F(DBAutoSkip, AutoSkipCompressionManager) {
+  if (GetSupportedCompressions().size() > 1) {
+    const int kValueSize = 20000;
+    // This will set the rejection ratio to 60%
+    CompressionUnfriendlyPut(6, kValueSize);
+    CompressionFriendlyPut(4, kValueSize);
+    // This will verify all the data block compressions are bypassed based on
+    // previous prediction
+    CompressionUnfriendlyPut(6, kValueSize);
+    CompressionFriendlyPut(4, kValueSize);
+    // This will set the rejection ratio to 40%
+    CompressionUnfriendlyPut(4, kValueSize);
+    CompressionFriendlyPut(6, kValueSize);
+    // This will verify all the data block compression are attempted based on
+    // previous prediction
+    // Compression will be rejected for 6 compression unfriendly blocks
+    // Compression will be accepted for 4 compression friendly blocks
+    CompressionUnfriendlyPut(6, kValueSize);
+    CompressionFriendlyPut(4, kValueSize);
+    // Extra block write to ensure that the all above cases are checked
+    CompressionFriendlyPut(6, kValueSize);
+    CompressionFriendlyPut(4, kValueSize);
+    ASSERT_OK(Flush());
+  }
+}
+}  // namespace ROCKSDB_NAMESPACE
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  RegisterCustomObjects(argc, argv);
+  return RUN_ALL_TESTS();
+}

From 2dcfc5475276a524be692ab08afdc831def81066 Mon Sep 17 00:00:00 2001
From: Sujit Maharjan <sujitmaharjan@meta.com>
Date: Sun, 15 Jun 2025 06:08:56 -0700
Subject: [PATCH 135/500] Mixed compressor adding RandomCompressorManager to
 db_stress_test (#13691)

Summary:
**Summary:**
This pull request configures RocksDB to optionally utilize this customized compressor (RandomCompressor) in the db stress test. It randomly selects the compression algorithm among the blocks.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13691

Test Plan: Testing was performed by verifying the stdout output from both RandomCompressor.

Reviewed By: hx235

Differential Revision: D76624220

Pulled By: shubhajeet

fbshipit-source-id: d9c458eeee930b25e8a87a77dc29f0647836310e
---
 db/db_test2.cc                        |  4 +--
 db_stress_tool/db_stress_test_base.cc | 35 +++++++++++++++------------
 tools/db_crashtest.py                 |  9 +++++--
 util/simple_mixed_compressor.cc       | 12 ++++-----
 util/simple_mixed_compressor.h        |  4 +--
 5 files changed, 36 insertions(+), 28 deletions(-)

diff --git a/db/db_test2.cc b/db/db_test2.cc
index 1f325d7433e5..b46f7fa4fc35 100644
--- a/db/db_test2.cc
+++ b/db/db_test2.cc
@@ -1936,9 +1936,9 @@ TEST_F(DBTest2, RoundRobinManager) {
   }
 }
 
-TEST_F(DBTest2, SimpleMixedCompressionManager) {
+TEST_F(DBTest2, RandomMixedCompressionManager) {
   if (ZSTD_Supported()) {
-    auto mgr = std::make_shared<SimpleMixedCompressionManager>(
+    auto mgr = std::make_shared<RandomMixedCompressionManager>(
         GetDefaultBuiltinCompressionManager());
     // Currently mixedmanager only supports with preffered compression manager
     // zstd
diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc
index 1235321d56b1..89bf0189bf95 100644
--- a/db_stress_tool/db_stress_test_base.cc
+++ b/db_stress_tool/db_stress_test_base.cc
@@ -3412,27 +3412,32 @@ void StressTest::Open(SharedState* shared, bool reopen) {
     InitializeOptionsFromFlags(cache_, filter_policy_, options_);
   }
   InitializeOptionsGeneral(cache_, filter_policy_, sqfc_factory_, options_);
-  if (strcasecmp(FLAGS_compression_manager.c_str(), "none")) {
+  if (!strcasecmp(FLAGS_compression_manager.c_str(), "mixed") ||
+      !strcasecmp(FLAGS_compression_manager.c_str(), "randommixed")) {
+    // Currently limited to ZSTD compression. Table property compression_name
+    // needs to set to zstd for now even when there can be more than one
+    // algorithm in the table under your compressor.
+    if (!ZSTD_Supported()) {
+      fprintf(stderr,
+              "ZSTD compression not supported thus mixed compression cannot be "
+              "used\n");
+      exit(1);
+    }
     if (!strcasecmp(FLAGS_compression_manager.c_str(), "mixed")) {
-      // Currently limited to ZSTD compression. Table property compression_name
-      // needs to set to zstd for now even when there can be more than one
-      // algorithm in the table under your compressor.
-      if (!ZSTD_Supported()) {
-        fprintf(
-            stderr,
-            "ZSTD compression not supported thus mixed compression cannot be "
-            "used\n");
-        exit(1);
-      }
       auto mgr = std::make_shared<RoundRobinManager>(
           GetDefaultBuiltinCompressionManager());
       options_.compression_manager = mgr;
-      options_.compression = kZSTD;
-      options_.bottommost_compression = kZSTD;
-    } else if (!strcasecmp(FLAGS_compression_manager.c_str(), "autoskip")) {
-      options_.compression_manager = CreateAutoSkipCompressionManager(
+    } else if (!strcasecmp(FLAGS_compression_manager.c_str(), "randommixed")) {
+      auto mgr = std::make_shared<RandomMixedCompressionManager>(
           GetDefaultBuiltinCompressionManager());
+      options_.compression_manager = mgr;
     }
+    options_.compression = kZSTD;
+    options_.bottommost_compression = kZSTD;
+
+  } else if (!strcasecmp(FLAGS_compression_manager.c_str(), "autoskip")) {
+    options_.compression_manager =
+        CreateAutoSkipCompressionManager(GetDefaultBuiltinCompressionManager());
   } else if (!strcasecmp(FLAGS_compression_manager.c_str(), "none")) {
     // Nothing to do using default compression manager
   } else {
diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index f5b34e363c77..75b8f7aa5c37 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -349,7 +349,9 @@
     "memtable_avg_op_scan_flush_trigger": lambda: random.choice([0, 2, 20, 200]),
     "ingest_wbwi_one_in": lambda: random.choice([0, 0, 100, 500]),
     "universal_reduce_file_locking": lambda: random.randint(0, 1),
-    "compression_manager": lambda: random.choice(["mixed", "none", "autoskip"]),
+    "compression_manager": lambda: random.choice(
+        ["mixed"] * 1 + ["none"] * 2 + ["autoskip"] * 2 + ["randommixed"] * 2
+    ),
 }
 
 _TEST_DIR_ENV_VAR = "TEST_TMPDIR"
@@ -1000,7 +1002,10 @@ def finalize_and_sanitize(src_params):
             dest_params["exclude_wal_from_write_fault_injection"] = 1
             dest_params["metadata_write_fault_one_in"] = 0
     # Disabling block align if mixed manager is neing used
-    if dest_params.get("compression_manager") == "mixed":
+    if (
+        dest_params.get("compression_manager") == "mixed"
+        or dest_params.get("compression_manager") == "randommixed"
+    ):
         if dest_params.get("block_align") == 1:
             dest_params["block_align"] = 0
         dest_params["compression_type"] = "zstd"
diff --git a/util/simple_mixed_compressor.cc b/util/simple_mixed_compressor.cc
index 48257a30a7c4..4270e1e37543 100644
--- a/util/simple_mixed_compressor.cc
+++ b/util/simple_mixed_compressor.cc
@@ -55,8 +55,7 @@ std::unique_ptr<Compressor> MultiCompressorWrapper::MaybeCloneSpecialized(
                                                     std::move(dict_samples));
 }
 
-// SimpleMixedCompressor implementation
-Status SimpleMixedCompressor::CompressBlock(
+Status RandomMixedCompressor::CompressBlock(
     Slice uncompressed_data, std::string* compressed_output,
     CompressionType* out_compression_type, ManagedWorkingArea* wa) {
   auto selected =
@@ -66,18 +65,17 @@ Status SimpleMixedCompressor::CompressBlock(
                                    out_compression_type, wa);
 }
 
-// SimpleMixedCompressionManager implementation
-const char* SimpleMixedCompressionManager::Name() const {
+const char* RandomMixedCompressionManager::Name() const {
   return wrapped_->Name();
-  // return "SimpleMixedCompressionManager";
+  // return "RandomMixedCompressionManager";
 }
 
-std::unique_ptr<Compressor> SimpleMixedCompressionManager::GetCompressorForSST(
+std::unique_ptr<Compressor> RandomMixedCompressionManager::GetCompressorForSST(
     const FilterBuildingContext& context, const CompressionOptions& opts,
     CompressionType preferred) {
   assert(preferred == kZSTD);
   (void)context;
-  return std::make_unique<SimpleMixedCompressor>(opts, preferred);
+  return std::make_unique<RandomMixedCompressor>(opts, preferred);
 }
 
 // RoundRobinCompressor implementation
diff --git a/util/simple_mixed_compressor.h b/util/simple_mixed_compressor.h
index 84d67558a4bd..09f71160d333 100644
--- a/util/simple_mixed_compressor.h
+++ b/util/simple_mixed_compressor.h
@@ -33,14 +33,14 @@ class MultiCompressorWrapper : public Compressor {
   std::vector<std::unique_ptr<Compressor>> compressors_;
 };
 
-struct SimpleMixedCompressor : public MultiCompressorWrapper {
+struct RandomMixedCompressor : public MultiCompressorWrapper {
   using MultiCompressorWrapper::MultiCompressorWrapper;
   Status CompressBlock(Slice uncompressed_data, std::string* compressed_output,
                        CompressionType* out_compression_type,
                        ManagedWorkingArea* wa) override;
 };
 
-class SimpleMixedCompressionManager : public CompressionManagerWrapper {
+class RandomMixedCompressionManager : public CompressionManagerWrapper {
   using CompressionManagerWrapper::CompressionManagerWrapper;
   const char* Name() const override;
   std::unique_ptr<Compressor> GetCompressorForSST(

From 4bdfb7e7daac546002af2760ca6172b2649f13c5 Mon Sep 17 00:00:00 2001
From: virajthakur <virajthakur@berkeley.edu>
Date: Mon, 16 Jun 2025 14:01:29 -0700
Subject: [PATCH 136/500] support canceling ongoing CompactFiles (#13687)

Summary:
Add an atomic bool to CompactionOptions to cancel an ongoing CompactFiles() operation, in the same fashion we do for CompactRange().

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13687

Test Plan: ./db_test2 --gtest_filter=DBTest2.TestCancelCompactFiles

Reviewed By: jaykorean

Differential Revision: D76538529

Pulled By: virajthakur

fbshipit-source-id: 77db5b4fb4cbd5280584834df28e51a72b084dab
---
 db/db_impl/db_impl_compaction_flush.cc |  10 ++-
 db/db_test2.cc                         | 101 ++++++++++++++++++++++++-
 include/rocksdb/options.h              |  14 +++-
 3 files changed, 120 insertions(+), 5 deletions(-)

diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc
index b5f6e0e70332..2e178053a9f5 100644
--- a/db/db_impl/db_impl_compaction_flush.cc
+++ b/db/db_impl/db_impl_compaction_flush.cc
@@ -1381,6 +1381,9 @@ Status DBImpl::CompactFiles(const CompactionOptions& compact_options,
   TEST_SYNC_POINT_CALLBACK("TestCompactFiles:PausingManualCompaction:3",
                            static_cast<void*>(const_cast<std::atomic<int>*>(
                                &manual_compaction_paused_)));
+  TEST_SYNC_POINT_CALLBACK("TestCancelCompactFiles:SuccessfulCompaction",
+                           static_cast<void*>(const_cast<std::atomic<int>*>(
+                               &manual_compaction_paused_)));
   {
     InstrumentedMutexLock l(&mutex_);
     auto* current = cfd->current();
@@ -1433,7 +1436,12 @@ Status DBImpl::CompactFilesImpl(
   if (shutting_down_.load(std::memory_order_acquire)) {
     return Status::ShutdownInProgress();
   }
-  if (manual_compaction_paused_.load(std::memory_order_acquire) > 0) {
+
+  // triggered by DisableManualCompactions or by user-set canceled flag in
+  // CompactionOptions
+  if (manual_compaction_paused_.load(std::memory_order_acquire) > 0 ||
+      (compact_options.canceled &&
+       compact_options.canceled->load(std::memory_order_acquire))) {
     return Status::Incomplete(Status::SubCode::kManualCompactionPaused);
   }
 
diff --git a/db/db_test2.cc b/db/db_test2.cc
index b46f7fa4fc35..d35333b73c77 100644
--- a/db/db_test2.cc
+++ b/db/db_test2.cc
@@ -10,7 +10,6 @@
 #include <atomic>
 #include <cstdlib>
 #include <functional>
-#include <iostream>
 #include <memory>
 
 #include "db/db_test_util.h"
@@ -3228,7 +3227,7 @@ TEST_F(DBTest2, PausingManualCompaction1) {
       "TestCompactFiles:PausingManualCompaction:3", [&](void* arg) {
         auto paused = static_cast<std::atomic<int>*>(arg);
         // CompactFiles() relies on manual_compactions_paused to
-        // determine if thie compaction should be paused or not
+        // determine if this compaction should be paused or not
         ASSERT_EQ(0, paused->load(std::memory_order_acquire));
         paused->fetch_add(1, std::memory_order_release);
       });
@@ -3340,6 +3339,7 @@ TEST_F(DBTest2, PausingManualCompaction3) {
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
 
   dbfull()->DisableManualCompaction();
+
   ASSERT_TRUE(dbfull()
                   ->CompactRange(compact_options, nullptr, nullptr)
                   .IsManualCompactionPaused());
@@ -5639,6 +5639,103 @@ TEST_F(DBTest2, TestCompactFiles) {
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
 }
 
+TEST_F(DBTest2, TestCancelCompactFiles) {
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  Options options;
+  options.env = env_;
+  options.num_levels = 2;
+  options.disable_auto_compactions = true;
+  Reopen(options);
+
+  auto* handle = db_->DefaultColumnFamily();
+  ASSERT_EQ(db_->NumberLevels(handle), 2);
+
+  ROCKSDB_NAMESPACE::SstFileWriter sst_file_writer{
+      ROCKSDB_NAMESPACE::EnvOptions(), options};
+
+  // ingest large SST files
+  std::vector<std::string> external_sst_file_names;
+  int key_counter = 0;
+  const int num_keys_per_file = 100000;
+  const int num_files = 10;
+  for (int i = 0; i < num_files; ++i) {
+    std::string file_name =
+        dbname_ + "/test_compact_files" + std::to_string(i) + ".sst_t";
+    external_sst_file_names.push_back(file_name);
+    ASSERT_OK(sst_file_writer.Open(file_name));
+    for (int j = 0; j < num_keys_per_file; ++j) {
+      ASSERT_OK(sst_file_writer.Put(Key(j + num_keys_per_file * key_counter),
+                                    std::to_string(j)));
+    }
+    key_counter += 1;
+    ASSERT_OK(sst_file_writer.Finish());
+  }
+
+  ASSERT_OK(db_->IngestExternalFile(handle, external_sst_file_names,
+                                    IngestExternalFileOptions()));
+  ASSERT_EQ(NumTableFilesAtLevel(1, 0), num_files);
+  std::vector<std::string> files;
+  GetSstFiles(env_, dbname_, &files);
+  ASSERT_EQ(files.size(), num_files);
+
+  // Test that 0 compactions happen - canceled is set to True initially
+  CompactionOptions compaction_options;
+  std::atomic<bool> canceled(true);
+  compaction_options.canceled = &canceled;
+
+  ASSERT_TRUE(db_->CompactFiles(compaction_options, handle, files, 1)
+                  .IsManualCompactionPaused());
+  ASSERT_EQ(NumTableFilesAtLevel(1, 0), num_files);
+
+  // Test cancellation before the check to cancel compaction happens -
+  // compaction should not occur
+  bool disable_compaction = false;
+  compaction_options.canceled->store(false, std::memory_order_release);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "TestCancelCompactFiles:SuccessfulCompaction", [&](void* arg) {
+        auto paused = static_cast<std::atomic<int>*>(arg);
+        if (disable_compaction) {
+          db_->DisableManualCompaction();
+          ASSERT_EQ(1, paused->load(std::memory_order_acquire));
+        } else {
+          compaction_options.canceled->store(true, std::memory_order_release);
+          ASSERT_EQ(0, paused->load(std::memory_order_acquire));
+        }
+      });
+
+  ASSERT_TRUE(db_->CompactFiles(compaction_options, handle, files, 1)
+                  .IsManualCompactionPaused());
+  ASSERT_EQ(NumTableFilesAtLevel(1, 0), num_files);
+
+  // DisableManualCompaction() should successfully cancel compaction
+  disable_compaction = true;
+  compaction_options.canceled->store(false, std::memory_order_release);
+  ASSERT_TRUE(db_->CompactFiles(compaction_options, handle, files, 1)
+                  .IsManualCompactionPaused());
+  ASSERT_EQ(NumTableFilesAtLevel(1, 0), num_files);
+  // unlike CompactRange, value of compaction_options.canceled will be
+  // unaffected by calling DisableManualCompactions()
+  ASSERT_FALSE(compaction_options.canceled->load());
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+  db_->EnableManualCompaction();
+
+  // Test cancelation after the check to cancel compaction - compaction should
+  // occur, leaving only 1 file
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "CompactFilesImpl:0", [&](void* /*arg*/) {
+        compaction_options.canceled->store(true, std::memory_order_release);
+      });
+
+  compaction_options.canceled->store(false, std::memory_order_release);
+  ASSERT_OK(db_->CompactFiles(compaction_options, handle, files, 1));
+  ASSERT_EQ(NumTableFilesAtLevel(1, 0), 1);
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
 TEST_F(DBTest2, MultiDBParallelOpenTest) {
   const int kNumDbs = 2;
   Options options = CurrentOptions();
diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index 6e802f75a923..734dad323074 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -2222,10 +2222,20 @@ struct CompactionOptions {
   // If > 0, it will replace the option in the DBOptions for this compaction.
   uint32_t max_subcompactions;
 
+  // Allows cancellation of an in-progress manual compaction.
+  //
+  // Cancellation can be delayed waiting on automatic compactions when used
+  // together with `exclusive_manual_compaction == true`.
+  std::atomic<bool>* canceled;
+  // NOTE: Calling DisableManualCompaction() will not override the
+  // canceled variable in CompactionOptions, as it does for CompactRangeOptions
+  // - this is because ManualCompactionState is not used
+
   CompactionOptions()
       : compression(kDisableCompressionOption),
         output_file_size_limit(std::numeric_limits<uint64_t>::max()),
-        max_subcompactions(0) {}
+        max_subcompactions(0),
+        canceled(nullptr) {}
 };
 
 // For level based compaction, we can configure if we want to skip/force
@@ -2292,7 +2302,7 @@ struct CompactRangeOptions {
   // Cancellation can be delayed waiting on automatic compactions when used
   // together with `exclusive_manual_compaction == true`.
   std::atomic<bool>* canceled = nullptr;
-  // NOTE: Calling DisableManualCompaction() overwrites the uer-provided
+  // NOTE: Calling DisableManualCompaction() overwrites the user-provided
   // canceled variable in CompactRangeOptions.
   // Typically, when CompactRange is being called in one thread (t1) with
   // canceled = false, and DisableManualCompaction is being called in the

From 9d490593d00fe39f309478169eacd862b2f05ba4 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Mon, 16 Jun 2025 14:19:03 -0700
Subject: [PATCH 137/500] Preliminary support for custom compression algorithms
 (#13659)

Summary:
This change builds on https://github.com/facebook/rocksdb/issues/13540 and https://github.com/facebook/rocksdb/issues/13626 in allowing a CompressionManager / Compressor / Decompressor to use a custom compression algorithm, with a distinct CompressionType. For background, review the API comments on CompressionManager and its CompatibilityName() function.

Highlights:
* Reserve and name 127 new CompressionTypes that can be used for custom compression algorithms / schemas. In many or most cases I expect the enumerators such as `kCustomCompression8F` to be used in user code rather than casting between integers and CompressionTypes, as I expect the supported custom compression algorithms to be identifiable / enumerable at compile time.
* When using these custom compression types, a CompressionManager must use a CompatibilityName() other than the built-in one AND new format_version=7 (see below).
* When building new SST files, track the full set of CompressionTypes actually used (usually just one aside from kNoCompression), using our efficient bitset SmallEnumSet, which supports fast iteration over the bits set to 1. Ideally, to support mixed or non-mixed compression algorithms in a file as efficiently as possible, we would know the set of CompressionTypes as SST file open time.
* New schema for `TableProperties::compression_name` in format_version=7 to represent the CompressionManager's CompatibilityName(), the set of CompressionTypes used, and potentially more in the future, while keeping the data relatively human-readable.
  * It would be possible to do this without a new format_version, but then the only way to ensure incompatible versions fail is with an unsupported CompressionType tag, not with a compression_name property. Therefore, (a) I prefer not to put something misleading in the `compression_name` property (a built-in compression name) when there is nuance because of a CompressionManager, and (b) I prefer better, more consistent error messages that refer to either format_version or the CompressionManager's CompatibilityName(), rather than an unrecognized custom CompressionType value (which could have come from various CompressionManagers).
* The current configured CompressionManager is passed in to TableReaders so that it (or one it knows about) can be used if it matches the CompatibilityName() used for compression in the SST file. Until the connection with ObjectRegistry is implemented, the only way to read files generated with a particular CompressionManager using custom compression algorithms is to configure it (or a known relative; see FindCompatibleCompressionManager()) in the ColumnFamilyOptions.
* Optimized snappy compression with BuiltinDecompressorV2SnappyOnly, to offset some small added overheads with the new tracking. This is essentially an early part of the planned refactoring that will get rid of the old internal compression APIs.
* Another small optimization in eliminating an unnecessary key copy in flush (builder.cc).
* Fix some handling of named CompressionManagers in CompressionManager::CreateFromString() (problem seen in https://github.com/facebook/rocksdb/issues/13647)

Smaller things:
* Adds Name() and GetId() functions to Compressor for debugging/logging purposes. (Compressor and Decompressor are not expected to be Customizable because they are only instantiated by a CompressionManager.)
* When using an explicit compression_manager, the GetId() of the CompressionManager and the Compressor used to build the file are stored as bonus entries in the compression_options table property. This table property is not parsed anywhere, so it is currently for human reading, but still could be parsed with the new underscore-prefixed bonus entries. IMHO, this is preferable to additional table properties, which would increase memory fragmentation in the TableProperties objects and likely take slightly more CPU on SST open and slightly more storage.
* ReleaseWorkingArea() function from protected to public to make wrappers work, because of a quirk in C++ (vs. Java) in which you cannot access protected members of another instance of the same class (sigh)
* Added `CompressionManager:: SupportsCompressionType()` for early options sanity checking.

Follow-up before release:
* Make format_version=7 official / supported
* Stress test coverage

Sooner than later:
* Update tests for RoundRobinManager and SimpleMixedCompressionManager to take advantage of e.g. set of compression types in compression_name property
* ObjectRegistry stuff
* Refactor away old internal compression APIs

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13659

Test Plan:
Basic unit test added.

## Performance

### SST write performance
```
SUFFIX=`tty | sed 's|/|_|g'`; for ARGS in "-compression_type=none" "-compression_type=snappy" "-compression_type=zstd" "-compression_type=snappy -verify_compression=1" "-compression_type=zstd -verify_compression=1" "-compression_type=zstd -compression_max_dict_bytes=8180"; do echo $ARGS; (for I in `seq 1 20`; do BIN=/dev/shm/dbbench${SUFFIX}.bin; rm -f $BIN; cp db_bench $BIN; $BIN -db=/dev/shm/dbbench$SUFFIX --benchmarks=fillseq -num=10000000 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=1000 -fifo_compaction_allow_compaction=0 -disable_wal -write_buffer_size=12000000 -format_version=7 $ARGS 2>&1 | grep micros/op; done) | awk '{n++; sum += $5;} END { print int(sum / n); }'; done
```

Ops/sec, Before -> After, both fv=6:
-compression_type=none
1894386 -> 1858403 (-2.0%)
-compression_type=snappy
1859131 -> 1807469 (-2.8%)
-compression_type=zstd
1191428 -> 1214374 (+1.9%)
-compression_type=snappy -verify_compression=1
1861819 -> 1858342 (+0.2%)
-compression_type=zstd -verify_compression=1
979435 -> 995870 (+1.6%)
-compression_type=zstd -compression_max_dict_bytes=8180
905349 -> 940563 (+3.9%)

Ops/sec, Before fv=6 -> After fv=7:
-compression_type=none
1879365 -> 1836159 (-2.3%)
-compression_type=snappy
1865460 -> 1830916 (-1.9%)
-compression_type=zstd
1191428 -> 1210260 (+1.6%)
-compression_type=snappy -verify_compression=1
1866756 -> 1818989 (-2.6%)
-compression_type=zstd -verify_compression=1
982640 -> 997129 (+1.5%)
-compression_type=zstd -compression_max_dict_bytes=8180
912608 -> 937248 (+2.7%)

### SST read performance
Create DBs
```
for COMP in none snappy zstd; do echo $ARGS; ./db_bench -db=/dev/shm/dbbench-7-$COMP --benchmarks=fillseq,flush -num=10000000 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=1000 -fifo_compaction_allow_compaction=0 -disable_wal -write_buffer_size=12000000 -compression_type=$COMP -format_version=7; done
```
And test
```
for COMP in none
snappy zstd none; do echo $COMP; (for I in `seq 1 8`; do ./db_bench -readonly -db=/dev/shm/dbbench
-7-$COMP --benchmarks=readrandom -num=10000000 -duration=20 -threads=8 2>&1 | grep micros/op; done
) | awk '{n++; sum += $5;} END { print int(sum / n); }'; done
```

Ops/sec, Before -> After (both fv=6)
none
1491732 -> 1500209 (+0.6%)
snappy
1157216 -> 1169202 (+1.0%)
zstd
695414 -> 703719 (+1.2%)
none (again)
1491787 -> 1528789 (+2.4%)

Ops/sec, Before fv=6 -> After fv=7:
none
1492278 -> 1508668 (+1.1%)
snappy
1140769 -> 1152613 (+1.0%)
zstd
696437 -> 696511 (+0.0%)
none (again)
1500585 -> 1512037 (+0.7%)

Overall, I think we can take the read CPU improvement in exchange for the hit (in some cases) on background write CPU

Reviewed By: hx235

Differential Revision: D76520739

Pulled By: pdillinger

fbshipit-source-id: e73bd72502ff85c8779cba313f26f7d1fd50be3a
---
 db/builder.cc                                 |   4 +-
 db/column_family.cc                           |  13 +-
 db/compaction/compaction_job_test.cc          |   5 +-
 db/convenience.cc                             |   7 +-
 db/db_test.cc                                 |   3 +-
 db/db_test2.cc                                | 360 +++++++++++++++++-
 db/external_sst_file_ingestion_job.cc         |   3 +-
 db/import_column_family_job.cc                |   3 +-
 db/table_cache.cc                             |   3 +-
 fuzz/sst_file_writer_fuzzer.cc                |   3 +-
 include/rocksdb/advanced_compression.h        | 106 +++++-
 include/rocksdb/compression_type.h            | 150 +++++++-
 include/rocksdb/table_properties.h            |   9 +-
 options/configurable.cc                       |   7 +-
 options/options_helper.cc                     | 129 ++++++-
 options/options_test.cc                       |   2 +-
 .../block_based/block_based_table_builder.cc  | 124 +++++-
 .../block_based/block_based_table_factory.cc  |  86 +++--
 table/block_based/block_based_table_factory.h |   4 -
 table/block_based/block_based_table_reader.cc | 119 +++++-
 table/block_based/block_based_table_reader.h  |   1 +
 .../block_based_table_reader_test.cc          |   9 +-
 .../block_based/data_block_hash_index_test.cc |   3 +-
 table/format.cc                               |  11 +-
 table/format.h                                |   8 +
 table/sst_file_dumper.cc                      |   3 +-
 table/sst_file_reader.cc                      |   3 +-
 table/table_builder.h                         |   6 +
 table/table_reader_bench.cc                   |   5 +-
 table/table_test.cc                           |  12 +-
 test_util/testutil.h                          |   2 +-
 tools/db_bench_tool.cc                        |   1 +
 util/auto_skip_compressor.cc                  |   5 +
 util/auto_skip_compressor.h                   |   2 +-
 util/compression.cc                           | 106 +++++-
 util/compression.h                            |  14 +-
 util/simple_mixed_compressor.cc               |  17 +-
 util/simple_mixed_compressor.h                |   3 +-
 38 files changed, 1211 insertions(+), 140 deletions(-)

diff --git a/db/builder.cc b/db/builder.cc
index 2caa75c04630..1bc59eb25bb4 100644
--- a/db/builder.cc
+++ b/db/builder.cc
@@ -218,8 +218,7 @@ Status BuildTable(
       const Slice& key = c_iter.key();
       const Slice& value = c_iter.value();
       ParsedInternalKey ikey = c_iter.ikey();
-      key_after_flush_buf.assign(key.data(), key.size());
-      Slice key_after_flush = key_after_flush_buf;
+      Slice key_after_flush = key;
       Slice value_after_flush = value;
 
       if (ikey.type == kTypeValuePreferredSeqno) {
@@ -237,6 +236,7 @@ Status BuildTable(
               std::min(smallest_preferred_seqno, preferred_seqno);
         } else {
           // Cannot get a useful preferred seqno, convert it to a kTypeValue.
+          key_after_flush_buf.assign(key.data(), key.size());
           UpdateInternalKey(&key_after_flush_buf, ikey.sequence, kTypeValue);
           ikey = ParsedInternalKey(ikey.user_key, ikey.sequence, kTypeValue);
           key_after_flush = key_after_flush_buf;
diff --git a/db/column_family.cc b/db/column_family.cc
index 06f7d1bbc053..9cda23eabe16 100644
--- a/db/column_family.cc
+++ b/db/column_family.cc
@@ -110,11 +110,19 @@ void GetInternalTblPropCollFactory(
   }
 }
 
+bool CompressionSupportedWithManager(CompressionType type,
+                                     UnownedPtr<CompressionManager> mgr) {
+  return mgr ? mgr->SupportsCompressionType(type)
+             : CompressionTypeSupported(type);
+}
+
 Status CheckCompressionSupported(const ColumnFamilyOptions& cf_options) {
   if (!cf_options.compression_per_level.empty()) {
     for (size_t level = 0; level < cf_options.compression_per_level.size();
          ++level) {
-      if (!CompressionTypeSupported(cf_options.compression_per_level[level])) {
+      if (!CompressionSupportedWithManager(
+              cf_options.compression_per_level[level],
+              cf_options.compression_manager.get())) {
         return Status::InvalidArgument(
             "Compression type " +
             CompressionTypeToString(cf_options.compression_per_level[level]) +
@@ -122,7 +130,8 @@ Status CheckCompressionSupported(const ColumnFamilyOptions& cf_options) {
       }
     }
   } else {
-    if (!CompressionTypeSupported(cf_options.compression)) {
+    if (!CompressionSupportedWithManager(
+            cf_options.compression, cf_options.compression_manager.get())) {
       return Status::InvalidArgument(
           "Compression type " +
           CompressionTypeToString(cf_options.compression) +
diff --git a/db/compaction/compaction_job_test.cc b/db/compaction/compaction_job_test.cc
index 450d9c13820f..d2517e6aa3ad 100644
--- a/db/compaction/compaction_job_test.cc
+++ b/db/compaction/compaction_job_test.cc
@@ -458,9 +458,10 @@ class CompactionJobTestBase : public testing::Test {
       ReadOptions read_opts;
       Status s = cf_options_.table_factory->NewTableReader(
           read_opts,
-          TableReaderOptions(cfd->ioptions(), nullptr, FileOptions(),
+          TableReaderOptions(cfd->ioptions(), /*prefix_extractor=*/nullptr,
+                             /*compression_manager=*/nullptr, FileOptions(),
                              cfd_->internal_comparator(),
-                             0 /* block_protection_bytes_per_key */),
+                             /*block_protection_bytes_per_key=*/0),
           std::move(freader), file_size, &table_reader, false);
       ASSERT_OK(s);
       assert(table_reader);
diff --git a/db/convenience.cc b/db/convenience.cc
index 384854a1e0f7..e8c1fcd01e00 100644
--- a/db/convenience.cc
+++ b/db/convenience.cc
@@ -93,9 +93,10 @@ Status VerifySstFileChecksumInternal(const Options& options,
           nullptr /* file_read_hist */, ioptions.rate_limiter.get()));
   const bool kImmortal = true;
   auto reader_options = TableReaderOptions(
-      ioptions, options.prefix_extractor, env_options, internal_comparator,
-      options.block_protection_bytes_per_key, false /* skip_filters */,
-      !kImmortal, false /* force_direct_prefetch */, -1 /* level */);
+      ioptions, options.prefix_extractor, options.compression_manager.get(),
+      env_options, internal_comparator, options.block_protection_bytes_per_key,
+      false /* skip_filters */, !kImmortal, false /* force_direct_prefetch */,
+      -1 /* level */);
   reader_options.largest_seqno = largest_seqno;
   s = options.table_factory->NewTableReader(
       read_options, reader_options, std::move(file_reader), file_size,
diff --git a/db/db_test.cc b/db/db_test.cc
index b1f8f2dee048..64958361b598 100644
--- a/db/db_test.cc
+++ b/db/db_test.cc
@@ -6209,8 +6209,7 @@ TEST_F(DBTest, L0L1L2AndUpHitCounter) {
 }
 
 TEST_F(DBTest, EncodeDecompressedBlockSizeTest) {
-  bool& allow_unsupported_fv =
-      BlockBasedTableFactory::AllowUnsupportedFormatVersion();
+  bool& allow_unsupported_fv = TEST_AllowUnsupportedFormatVersion();
   SaveAndRestore guard(&allow_unsupported_fv);
   ASSERT_FALSE(allow_unsupported_fv);
 
diff --git a/db/db_test2.cc b/db/db_test2.cc
index d35333b73c77..c552388ae758 100644
--- a/db/db_test2.cc
+++ b/db/db_test2.cc
@@ -2000,6 +2000,7 @@ TEST_F(DBTest2, CompressionManagerWrapper) {
 
   struct MyCompressor : public CompressorWrapper {
     using CompressorWrapper::CompressorWrapper;
+    const char* Name() const override { return "MyCompressor"; }
 
     Status CompressBlock(Slice uncompressed_data,
                          std::string* compressed_output,
@@ -2026,7 +2027,7 @@ TEST_F(DBTest2, CompressionManagerWrapper) {
   };
   struct MyManager : public CompressionManagerWrapper {
     using CompressionManagerWrapper::CompressionManagerWrapper;
-    const char* Name() const override { return wrapped_->Name(); }
+    const char* Name() const override { return "MyManager"; }
     std::unique_ptr<Compressor> GetCompressorForSST(
         const FilterBuildingContext& context, const CompressionOptions& opts,
         CompressionType preferred) override {
@@ -2102,6 +2103,363 @@ TEST_F(DBTest2, CompressionManagerWrapper) {
   }
 }
 
+namespace {
+template <CompressionType kCompression>
+struct CompressorCustomAlg : public CompressorWrapper {
+  explicit CompressorCustomAlg(const CompressionOptions& opts)
+      : CompressorWrapper(GetDefaultBuiltinCompressionManager()->GetCompressor(
+            opts, kSnappyCompression)) {}
+
+  explicit CompressorCustomAlg(std::unique_ptr<Compressor> compressor)
+      : CompressorWrapper(std::move(compressor)) {}
+
+  const char* Name() const override { return "CompressorCustomAlg"; }
+
+  Status CompressBlock(Slice uncompressed_data, std::string* compressed_output,
+                       CompressionType* out_compression_type,
+                       ManagedWorkingArea* working_area) override {
+    Status s = wrapped_->CompressBlock(uncompressed_data, compressed_output,
+                                       out_compression_type, working_area);
+    if (*out_compression_type != kNoCompression) {
+      assert(*out_compression_type == kSnappyCompression);
+      compressed_output->insert(/*pos=*/0, /*count=*/1,
+                                lossless_cast<char>(kCompression));
+      *out_compression_type = kCompression;
+    }
+    return s;
+  }
+
+  std::unique_ptr<Compressor> MaybeCloneSpecialized(
+      CacheEntryRole block_type, DictSampleArgs&& dict_samples) override {
+    std::unique_ptr<Compressor> rv =
+        wrapped_->MaybeCloneSpecialized(block_type, std::move(dict_samples));
+    if (rv) {
+      rv = std::make_unique<CompressorCustomAlg>(std::move(rv));
+    }
+    return rv;
+  }
+};
+
+struct DecompressorCustomAlg : public DecompressorWrapper {
+  DecompressorCustomAlg()
+      : DecompressorWrapper(
+            GetDefaultBuiltinCompressionManager()->GetDecompressor()) {}
+
+  explicit DecompressorCustomAlg(std::shared_ptr<Decompressor> decompressor)
+      : DecompressorWrapper(std::move(decompressor)) {}
+
+  const char* Name() const override { return "DecompressorCustomAlg"; }
+
+  Status MaybeCloneForDict(const Slice& serialized_dict,
+                           std::unique_ptr<Decompressor>* out) override {
+    Status s = wrapped_->MaybeCloneForDict(serialized_dict, out);
+    if (s.ok()) {
+      *out = std::make_unique<DecompressorCustomAlg>(std::move(*out));
+    }
+    return s;
+  }
+
+  Status ExtractUncompressedSize(Args& args) override {
+    if (args.compression_type > kLastBuiltinCompression) {
+      assert(args.compressed_data.size() > 0);
+      assert(args.compressed_data[0] ==
+             lossless_cast<char>(args.compression_type));
+      // It's ok to modify args if we restore to original
+      SaveAndRestore<Slice> save_compressed_slice(&args.compressed_data);
+      args.compressed_data.remove_prefix(1);
+      SaveAndRestore<CompressionType> save_compression_type(
+          &args.compression_type);
+      args.compression_type = kSnappyCompression;
+      return wrapped_->ExtractUncompressedSize(args);
+    } else {
+      // Also support built-in compressions
+      return wrapped_->ExtractUncompressedSize(args);
+    }
+  }
+
+  Status DecompressBlock(const Args& args, char* uncompressed_output) override {
+    if (args.compression_type > kLastBuiltinCompression) {
+      assert(args.compressed_data.size() > 0);
+      assert(args.compressed_data[0] ==
+             lossless_cast<char>(args.compression_type));
+      // Or we can copy args and modify
+      Args modified_args = args;
+      modified_args.compressed_data.remove_prefix(1);
+      modified_args.compression_type = kSnappyCompression;
+      return wrapped_->DecompressBlock(modified_args, uncompressed_output);
+    } else {
+      // Also support built-in compressions
+      return wrapped_->DecompressBlock(args, uncompressed_output);
+    }
+  }
+};
+}  // anonymous namespace
+
+TEST_F(DBTest2, CompressionManagerCustomCompression) {
+  if (!Snappy_Supported()) {
+    fprintf(stderr, "snappy compression not supported, skip this test\n");
+    return;
+  }
+
+  // Test that we can use a custom CompressionManager to implement custom
+  // compression algorithms, and that there are appropriate schema guard rails
+  // to ensure data is not processed by the wrong algorithm.
+  using Compressor8A = CompressorCustomAlg<kCustomCompression8A>;
+  using Compressor8B = CompressorCustomAlg<kCustomCompression8B>;
+  using Compressor8C = CompressorCustomAlg<kCustomCompression8C>;
+
+  class MyManager : public CompressionManager {
+   public:
+    explicit MyManager(const char* compat_name) : compat_name_(compat_name) {}
+    const char* Name() const override { return name_.c_str(); }
+    const char* CompatibilityName() const override { return compat_name_; }
+
+    bool SupportsCompressionType(CompressionType type) const override {
+      return type == kCustomCompression8A || type == kCustomCompression8B ||
+             type == kCustomCompression8C ||
+             GetDefaultBuiltinCompressionManager()->SupportsCompressionType(
+                 type);
+    }
+
+    int used_compressor8A_count_ = 0;
+    int used_compressor8B_count_ = 0;
+    int used_compressor8C_count_ = 0;
+
+    std::unique_ptr<Compressor> GetCompressor(const CompressionOptions& opts,
+                                              CompressionType type) override {
+      switch (static_cast<unsigned char>(type)) {
+        case kCustomCompression8A:
+          used_compressor8A_count_++;
+          return std::make_unique<Compressor8A>(opts);
+        case kCustomCompression8B:
+          used_compressor8B_count_++;
+          return std::make_unique<Compressor8B>(opts);
+        case kCustomCompression8C:
+          used_compressor8C_count_++;
+          return std::make_unique<Compressor8C>(opts);
+        // Also support built-in compression algorithms
+        default:
+          return GetDefaultBuiltinCompressionManager()->GetCompressor(opts,
+                                                                      type);
+      }
+    }
+
+    // TODO: test limited-scope decompressors
+    std::shared_ptr<Decompressor> GetDecompressor() override {
+      return std::make_shared<DecompressorCustomAlg>();
+    }
+
+    CompressionType last_specific_decompressor_type_ = kNoCompression;
+
+    std::shared_ptr<Decompressor> GetDecompressorForTypes(
+        const CompressionType* types_begin,
+        const CompressionType* types_end) override {
+      assert(types_end > types_begin);
+      last_specific_decompressor_type_ = *types_begin;
+      return std::make_shared<DecompressorCustomAlg>();
+    }
+
+    void AddFriend(const std::shared_ptr<CompressionManager>& mgr) {
+      friends_[mgr->CompatibilityName()] = mgr;
+    }
+    std::shared_ptr<CompressionManager> FindCompatibleCompressionManager(
+        Slice compatibility_name) override {
+      std::shared_ptr<CompressionManager> rv =
+          CompressionManager::FindCompatibleCompressionManager(
+              compatibility_name);
+      if (!rv) {
+        auto it = friends_.find(compatibility_name.ToString());
+        if (it != friends_.end()) {
+          return it->second.lock();
+        }
+      }
+      return rv;
+    }
+
+   private:
+    const char* compat_name_;
+    std::string name_;
+    // weak_ptr to avoid cycles
+    std::map<std::string, std::weak_ptr<CompressionManager>> friends_;
+  };
+
+  // Although these compression managers are actually compatible, we must
+  // respect their distinct compatibility names and treat them as incompatible
+  // (or else risk processing data incorrectly)
+  // NOTE: these are not registered in ObjectRegistry to test what happens
+  // when the original CompressionManager might not be available.
+  auto mgr_foo = std::make_shared<MyManager>("Foo");
+  auto mgr_bar = std::make_shared<MyManager>("Bar");
+
+  // And this one claims to be fully compatible with the built-in compression
+  // manager when it's not fully compatible (for custom CompressionTypes)
+  auto mgr_claim_compatible = std::make_shared<MyManager>("BuiltinV2");
+
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = 20;
+  BlockBasedTableOptions bbto;
+  bbto.enable_index_compression = false;
+  bbto.format_version = 6;  // Before custom compression alg support
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  // Claims not to use custom compression (and doesn't unless setting a custom
+  // CompressionType)
+  options.compression_manager = mgr_claim_compatible;
+  // Use a built-in compression type
+  options.compression = kSnappyCompression;
+  DestroyAndReopen(options);
+
+  constexpr uint16_t kValueSize = 10000;
+  Random rnd(404);
+  std::string value;
+  ASSERT_OK(Put("a", test::CompressibleString(&rnd, 0.1, kValueSize, &value)));
+  ASSERT_OK(Flush());
+
+  // That data should be readable without access to the original compression
+  // manager, because it used the built-in CompatibilityName and a built-in
+  // CompressionType
+  options.compression_manager = nullptr;
+  Reopen(options);
+  ASSERT_EQ(Get("a"), value);
+
+  // Verify it was compressed
+  Range r = {"a", "a0"};
+  TablePropertiesCollection tables_properties;
+  ASSERT_OK(db_->GetPropertiesOfTablesInRange(db_->DefaultColumnFamily(), &r, 1,
+                                              &tables_properties));
+  ASSERT_EQ(tables_properties.size(), 1U);
+  EXPECT_LT(tables_properties.begin()->second->data_size, kValueSize / 2);
+  EXPECT_EQ(tables_properties.begin()->second->compression_name, "Snappy");
+
+  // Disallow setting a custom CompressionType with a CompressionManager
+  // claiming to be built-in compatible.
+  options.compression_manager = mgr_claim_compatible;
+  options.compression = kCustomCompression8A;
+  ASSERT_EQ(TryReopen(options).code(), Status::Code::kInvalidArgument);
+
+  options.compression_manager = nullptr;
+  options.compression = kCustomCompressionFE;
+  ASSERT_EQ(TryReopen(options).code(), Status::Code::kInvalidArgument);
+  options.compression =
+      static_cast<CompressionType>(kLastBuiltinCompression + 1);
+  ASSERT_EQ(TryReopen(options).code(), Status::Code::kInvalidArgument);
+
+  // Custom compression schema (different CompatibilityName) not supported
+  // before format_version=7
+  options.compression_manager = mgr_foo;
+  options.compression = kSnappyCompression;
+  ASSERT_EQ(TryReopen(options).code(), Status::Code::kInvalidArgument);
+
+  // TODO: eliminate this hack when format_version=7 is published
+  SaveAndRestore guard(&TEST_AllowUnsupportedFormatVersion(), true);
+
+  // Set new format version
+  bbto.format_version = 7;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+  // Custom compression type not supported with built-in schema name, even with
+  // format_version=7
+  options.compression_manager = mgr_claim_compatible;
+  options.compression = kCustomCompression8B;
+  ASSERT_EQ(TryReopen(options).code(), Status::Code::kInvalidArgument);
+
+  // Using a built-in compression type with fv=7 but named custom schema
+  options.compression_manager = mgr_foo;
+  options.compression = kSnappyCompression;
+  Reopen(options);
+  ASSERT_OK(Put("b", test::CompressibleString(&rnd, 0.1, kValueSize, &value)));
+  ASSERT_OK(Flush());
+  ASSERT_EQ(NumTableFilesAtLevel(0), 2);
+  ASSERT_EQ(Get("b"), value);
+
+  // Verify it was compressed with snappy
+  r = {"b", "b0"};
+  tables_properties.clear();
+  ASSERT_OK(db_->GetPropertiesOfTablesInRange(db_->DefaultColumnFamily(), &r, 1,
+                                              &tables_properties));
+  ASSERT_EQ(tables_properties.size(), 1U);
+  EXPECT_LT(tables_properties.begin()->second->data_size, kValueSize / 2);
+  // Uses new format for "compression_name" property
+  EXPECT_EQ(tables_properties.begin()->second->compression_name, "Foo;01;");
+  EXPECT_EQ(mgr_foo->last_specific_decompressor_type_, kSnappyCompression);
+
+  // Custom compression type
+  options.compression = kCustomCompression8A;
+  Reopen(options);
+  ASSERT_OK(Put("c", test::CompressibleString(&rnd, 0.1, kValueSize, &value)));
+  EXPECT_EQ(mgr_foo->used_compressor8A_count_, 0);
+  ASSERT_OK(Flush());
+  ASSERT_EQ(NumTableFilesAtLevel(0), 3);
+  ASSERT_EQ(Get("c"), value);
+  EXPECT_EQ(mgr_foo->used_compressor8A_count_, 1);
+
+  // Verify it was compressed with custom format
+  r = {"c", "c0"};
+  tables_properties.clear();
+  ASSERT_OK(db_->GetPropertiesOfTablesInRange(db_->DefaultColumnFamily(), &r, 1,
+                                              &tables_properties));
+  ASSERT_EQ(tables_properties.size(), 1U);
+  EXPECT_LT(tables_properties.begin()->second->data_size, kValueSize / 2);
+  EXPECT_EQ(tables_properties.begin()->second->compression_name, "Foo;8A;");
+  EXPECT_EQ(mgr_foo->last_specific_decompressor_type_, kCustomCompression8A);
+
+  // Also dynamically changeable, because the compression manager will respect
+  // the current setting as reported under the legacy logic
+  ASSERT_OK(dbfull()->SetOptions({{"compression", "kSnappyCompression"}}));
+  ASSERT_OK(Put("d", test::CompressibleString(&rnd, 0.1, kValueSize, &value)));
+  ASSERT_OK(Flush());
+  ASSERT_EQ(NumTableFilesAtLevel(0), 4);
+  ASSERT_EQ(Get("d"), value);
+
+  // Verify it was compressed with snappy
+  r = {"d", "d0"};
+  tables_properties.clear();
+  ASSERT_OK(db_->GetPropertiesOfTablesInRange(db_->DefaultColumnFamily(), &r, 1,
+                                              &tables_properties));
+  ASSERT_EQ(tables_properties.size(), 1U);
+  EXPECT_LT(tables_properties.begin()->second->data_size, kValueSize / 2);
+  EXPECT_EQ(tables_properties.begin()->second->compression_name, "Foo;01;");
+  EXPECT_EQ(mgr_foo->last_specific_decompressor_type_, kSnappyCompression);
+
+  // Dynamically changeable to custom compressions also
+  ASSERT_OK(dbfull()->SetOptions({{"compression", "kCustomCompression8B"}}));
+  ASSERT_OK(Put("e", test::CompressibleString(&rnd, 0.1, kValueSize, &value)));
+  ASSERT_OK(Flush());
+  ASSERT_EQ(NumTableFilesAtLevel(0), 5);
+  ASSERT_EQ(Get("e"), value);
+
+  // Verify it was compressed with custom format
+  r = {"e", "e0"};
+  tables_properties.clear();
+  ASSERT_OK(db_->GetPropertiesOfTablesInRange(db_->DefaultColumnFamily(), &r, 1,
+                                              &tables_properties));
+  ASSERT_EQ(tables_properties.size(), 1U);
+  EXPECT_LT(tables_properties.begin()->second->data_size, kValueSize / 2);
+  EXPECT_EQ(tables_properties.begin()->second->compression_name, "Foo;8B;");
+  EXPECT_EQ(mgr_foo->last_specific_decompressor_type_, kCustomCompression8B);
+
+  // Fails to re-open with incompatible compression manager (can't find
+  // compression manager Foo because it's not registered nor known by Bar)
+  options.compression_manager = mgr_bar;
+  options.compression = kSnappyCompression;
+  ASSERT_EQ(TryReopen(options).code(), Status::Code::kNotFound);
+
+  // But should re-open if we make Bar aware of the Foo compression manager
+  mgr_bar->AddFriend(mgr_foo);
+  Reopen(options);
+
+  // Can still read everything
+  ASSERT_EQ(Get("a").size(), kValueSize);
+  ASSERT_EQ(Get("b").size(), kValueSize);
+  ASSERT_EQ(Get("c").size(), kValueSize);
+  ASSERT_EQ(Get("d").size(), kValueSize);
+  ASSERT_EQ(Get("e").size(), kValueSize);
+
+  // TODO: mix of compatibility names in same DB
+  // TODO: test old version of a compression manager unable to read a
+  // compression type
+  // TODO: test getting compression manager from object registry
+}
+
 class CompactionStallTestListener : public EventListener {
  public:
   CompactionStallTestListener()
diff --git a/db/external_sst_file_ingestion_job.cc b/db/external_sst_file_ingestion_job.cc
index 2a45516b4a2e..f6c257654f9c 100644
--- a/db/external_sst_file_ingestion_job.cc
+++ b/db/external_sst_file_ingestion_job.cc
@@ -834,7 +834,8 @@ Status ExternalSstFileIngestionJob::ResetTableReader(
       ro,
       TableReaderOptions(
           cfd_->ioptions(), sv->mutable_cf_options.prefix_extractor,
-          env_options_, cfd_->internal_comparator(),
+          sv->mutable_cf_options.compression_manager.get(), env_options_,
+          cfd_->internal_comparator(),
           sv->mutable_cf_options.block_protection_bytes_per_key,
           /*skip_filters*/ false, /*immortal*/ false,
           /*force_direct_prefetch*/ false, /*level*/ -1,
diff --git a/db/import_column_family_job.cc b/db/import_column_family_job.cc
index 2a725726b913..770dc5b69025 100644
--- a/db/import_column_family_job.cc
+++ b/db/import_column_family_job.cc
@@ -324,7 +324,8 @@ Status ImportColumnFamilyJob::GetIngestedFileInfo(
   status = sv->mutable_cf_options.table_factory->NewTableReader(
       TableReaderOptions(
           cfd_->ioptions(), sv->mutable_cf_options.prefix_extractor,
-          env_options_, cfd_->internal_comparator(),
+          sv->mutable_cf_options.compression_manager.get(), env_options_,
+          cfd_->internal_comparator(),
           sv->mutable_cf_options.block_protection_bytes_per_key,
           /*skip_filters*/ false, /*immortal*/ false,
           /*force_direct_prefetch*/ false, /*level*/ -1,
diff --git a/db/table_cache.cc b/db/table_cache.cc
index b689a7730ade..feb66f2eff4f 100644
--- a/db/table_cache.cc
+++ b/db/table_cache.cc
@@ -146,7 +146,8 @@ Status TableCache::GetTableReader(
     s = mutable_cf_options.table_factory->NewTableReader(
         ro,
         TableReaderOptions(
-            ioptions_, mutable_cf_options.prefix_extractor, file_options,
+            ioptions_, mutable_cf_options.prefix_extractor,
+            mutable_cf_options.compression_manager.get(), file_options,
             internal_comparator,
             mutable_cf_options.block_protection_bytes_per_key, skip_filters,
             immortal_tables_, false /* force_direct_prefetch */, level,
diff --git a/fuzz/sst_file_writer_fuzzer.cc b/fuzz/sst_file_writer_fuzzer.cc
index 676daf574fa4..ae17f64cd2fb 100644
--- a/fuzz/sst_file_writer_fuzzer.cc
+++ b/fuzz/sst_file_writer_fuzzer.cc
@@ -91,7 +91,8 @@ TableReader* NewTableReader(const std::string& sst_file_path,
   }
   if (s.ok()) {
     ImmutableOptions iopts(options, cf_ioptions);
-    TableReaderOptions t_opt(iopts, /*prefix_extractor=*/nullptr, env_options,
+    TableReaderOptions t_opt(iopts, /*prefix_extractor=*/nullptr,
+                             /*compression_manager=*/nullptr, env_options,
                              cf_ioptions.internal_comparator,
                              0 /* block_protection_bytes_per_key */);
     t_opt.largest_seqno = kMaxSequenceNumber;
diff --git a/include/rocksdb/advanced_compression.h b/include/rocksdb/advanced_compression.h
index 3672bc84e0c6..3f5bf231ab2a 100644
--- a/include/rocksdb/advanced_compression.h
+++ b/include/rocksdb/advanced_compression.h
@@ -55,6 +55,15 @@ class Compressor {
   Compressor() = default;
   virtual ~Compressor() = default;
 
+  // Class name for logging / debugging purposes
+  virtual const char* Name() const = 0;
+
+  // Potentially more elaborate identifier for logging / debugging purposes
+  virtual std::string GetId() const {
+    std::string id = Name();
+    return id;
+  }
+
   // Returns the max total bytes of for all sampled blocks for creating the data
   // dictionary, or zero indicating dictionary compression should not be
   // used/configured. This will typically be called after
@@ -228,21 +237,19 @@ class Decompressor {
   // EXTENSIBLE or reinterpret_cast-able by custom Compressor implementations
   struct WorkingArea {};
 
- protected:
   // To allow for flexible re-use / reclaimation, we have explicit Obtain and
   // Release functions, which are typically wrapped in a special RAII smart
   // pointer. For example, a WorkingArea could be saved/recycled in thread-local
   // or core-local storage, or heap managed, etc., though an explicit
   // WorkingArea is only advised for repeated decompression (by a single
-  // thread).
-
+  // thread). ReleaseWorkingArea() in not intended to be called directly, but
+  // used by ManagedWorkingArea.
   virtual void ReleaseWorkingArea(WorkingArea* wa) {
     // Default implementation: no working area
     (void)wa;
     assert(wa == nullptr);
   }
 
- public:
   using ManagedWorkingArea =
       ManagedPtr<WorkingArea, Decompressor, &Decompressor::ReleaseWorkingArea>;
 
@@ -346,22 +353,30 @@ class CompressionManager
   // should have the same CompatibilityName(), so that a compatible
   // CompressionManager/Decompressor might be used if the original is
   // unavailable. (Name() can be useful in addition to CompatibilityName() for
-  // understanding what compression strategy was used.)
+  // understanding what compression strategy was used.) This name should be
+  // limited to legal variable names in C++ (alphanumeric and underscores).
   virtual const char* CompatibilityName() const = 0;
 
   // Default implementation checks the current compatibility name and returns
   // this CompressionManager (via `out`) if appropriate, and otherwise defers
-  // to CreateFromString().
-  virtual Status FindCompatibleCompressionManager(
-      Slice compatibility_name, std::shared_ptr<CompressionManager>* out);
+  // to CreateFromString(). Failure should simply be a matter of "not found" in
+  // which case nullptr is returned.
+  virtual std::shared_ptr<CompressionManager> FindCompatibleCompressionManager(
+      Slice compatibility_name);
 
-  // Create a CompressionManager from a string, including built-in
+  // Create or find a CompressionManager from a string, including built-in
   // CompressionManager types.
   // TODO: ObjectLibrary stuff
   static Status CreateFromString(const ConfigOptions& config_options,
                                  const std::string& id,
                                  std::shared_ptr<CompressionManager>* result);
 
+  // Will this compression type be used if requested in calling
+  // GetCompressor/GetCompressorForSST?
+  virtual bool SupportsCompressionType(CompressionType type) const = 0;
+
+  // TODO: function to check compatibility with or sanitize CompressionOptions
+
   // ************************* Compressor creation *********************** //
   // Returning nullptr means compression is entirely disabled for the file,
   // which is valid at the discretion of the CompressionManager. Returning
@@ -410,6 +425,14 @@ class CompressionManager
     // Safe default implementation
     return GetDecompressor();
   }
+
+  // Get a decompressor that is allowed to have support only for the
+  // CompressionTypes used by the given Compressor.
+  virtual std::shared_ptr<Decompressor> GetDecompressorForCompressor(
+      const Compressor& compressor) {
+    // Reasonable default implementation
+    return GetDecompressorOptimizeFor(compressor.GetPreferredCompressionType());
+  }
 };
 
 // ************************* Utility wrappers etc. *********************** //
@@ -453,6 +476,51 @@ class CompressorWrapper : public Compressor {
   std::unique_ptr<Compressor> wrapped_;
 };
 
+class DecompressorWrapper : public Decompressor {
+ public:
+  explicit DecompressorWrapper(std::shared_ptr<Decompressor> decompressor)
+      : wrapped_(std::move(decompressor)) {}
+  // No copies
+  DecompressorWrapper(const DecompressorWrapper&) = delete;
+  DecompressorWrapper& operator=(const DecompressorWrapper&) = delete;
+
+  const char* Name() const override { return wrapped_->Name(); }
+
+  void ReleaseWorkingArea(WorkingArea* wa) override {
+    wrapped_->ReleaseWorkingArea(wa);
+  }
+
+  ManagedWorkingArea ObtainWorkingArea(CompressionType preferred) override {
+    return wrapped_->ObtainWorkingArea(preferred);
+  }
+
+  const Slice& GetSerializedDict() const override {
+    return wrapped_->GetSerializedDict();
+  }
+
+  Status MaybeCloneForDict(const Slice& serialized_dict,
+                           std::unique_ptr<Decompressor>* out) override {
+    // NOTE: derived class probably needs to override this to ensure a
+    // derived wrapper around the new Decompressor
+    return wrapped_->MaybeCloneForDict(serialized_dict, out);
+  }
+
+  size_t ApproximateOwnedMemoryUsage() const override {
+    return wrapped_->ApproximateOwnedMemoryUsage();
+  }
+
+  Status ExtractUncompressedSize(Args& args) override {
+    return wrapped_->ExtractUncompressedSize(args);
+  }
+
+  Status DecompressBlock(const Args& args, char* uncompressed_output) override {
+    return wrapped_->DecompressBlock(args, uncompressed_output);
+  }
+
+ protected:
+  std::shared_ptr<Decompressor> wrapped_;
+};
+
 // TODO: CompressorBase, for custom compressions
 
 class CompressionManagerWrapper : public CompressionManager {
@@ -465,10 +533,13 @@ class CompressionManagerWrapper : public CompressionManager {
     return wrapped_->CompatibilityName();
   }
 
-  Status FindCompatibleCompressionManager(
-      Slice compatibility_name,
-      std::shared_ptr<CompressionManager>* out) override {
-    return wrapped_->FindCompatibleCompressionManager(compatibility_name, out);
+  std::shared_ptr<CompressionManager> FindCompatibleCompressionManager(
+      Slice compatibility_name) override {
+    return wrapped_->FindCompatibleCompressionManager(compatibility_name);
+  }
+
+  bool SupportsCompressionType(CompressionType type) const override {
+    return wrapped_->SupportsCompressionType(type);
   }
 
   std::unique_ptr<Compressor> GetCompressorForSST(
@@ -497,13 +568,18 @@ class CompressionManagerWrapper : public CompressionManager {
     return wrapped_->GetDecompressorForTypes(types_begin, types_end);
   }
 
+  std::shared_ptr<Decompressor> GetDecompressorForCompressor(
+      const Compressor& compressor) override {
+    return wrapped_->GetDecompressorForCompressor(compressor);
+  }
+
  protected:
   std::shared_ptr<CompressionManager> wrapped_;
 };
 
 // Compression manager that implements built-in compression strategy. The
-// behavior of
-// compression_manager=nullptr with this
+// behavior of compression_manager=nullptr is essentially equivalent to
+// using this compression manager.
 const std::shared_ptr<CompressionManager>&
 GetDefaultBuiltinCompressionManager();
 // Gets CompressionManager designed for the automated compression strategy.
diff --git a/include/rocksdb/compression_type.h b/include/rocksdb/compression_type.h
index 96377c2427ad..6a5ace94c7d0 100644
--- a/include/rocksdb/compression_type.h
+++ b/include/rocksdb/compression_type.h
@@ -18,16 +18,146 @@ namespace ROCKSDB_NAMESPACE {
 enum CompressionType : unsigned char {
   // NOTE: do not change the values of existing entries, as these are
   // part of the persistent format on disk.
-  kNoCompression = 0x0,
-  kSnappyCompression = 0x1,
-  kZlibCompression = 0x2,
-  kBZip2Compression = 0x3,
-  kLZ4Compression = 0x4,
-  kLZ4HCCompression = 0x5,
-  kXpressCompression = 0x6,
-  kZSTD = 0x7,
-
-  // TODO: add enum values for user custom compression types
+  kNoCompression = 0x00,
+  kSnappyCompression = 0x01,
+  kZlibCompression = 0x02,
+  kBZip2Compression = 0x03,
+  kLZ4Compression = 0x04,
+  kLZ4HCCompression = 0x05,
+  kXpressCompression = 0x06,
+  kZSTD = 0x07,
+  kLastBuiltinCompression = kZSTD,
+
+  // Reserved for future use: up to 0x7F
+
+  // For use by user custom CompressionManagers
+  kCustomCompression80 = 0x80,
+  kCustomCompression81 = 0x81,
+  kCustomCompression82 = 0x82,
+  kCustomCompression83 = 0x83,
+  kCustomCompression84 = 0x84,
+  kCustomCompression85 = 0x85,
+  kCustomCompression86 = 0x86,
+  kCustomCompression87 = 0x87,
+  kCustomCompression88 = 0x88,
+  kCustomCompression89 = 0x89,
+  kCustomCompression8A = 0x8A,
+  kCustomCompression8B = 0x8B,
+  kCustomCompression8C = 0x8C,
+  kCustomCompression8D = 0x8D,
+  kCustomCompression8E = 0x8E,
+  kCustomCompression8F = 0x8F,
+  kCustomCompression90 = 0x90,
+  kCustomCompression91 = 0x91,
+  kCustomCompression92 = 0x92,
+  kCustomCompression93 = 0x93,
+  kCustomCompression94 = 0x94,
+  kCustomCompression95 = 0x95,
+  kCustomCompression96 = 0x96,
+  kCustomCompression97 = 0x97,
+  kCustomCompression98 = 0x98,
+  kCustomCompression99 = 0x99,
+  kCustomCompression9A = 0x9A,
+  kCustomCompression9B = 0x9B,
+  kCustomCompression9C = 0x9C,
+  kCustomCompression9D = 0x9D,
+  kCustomCompression9E = 0x9E,
+  kCustomCompression9F = 0x9F,
+  kCustomCompressionA0 = 0xA0,
+  kCustomCompressionA1 = 0xA1,
+  kCustomCompressionA2 = 0xA2,
+  kCustomCompressionA3 = 0xA3,
+  kCustomCompressionA4 = 0xA4,
+  kCustomCompressionA5 = 0xA5,
+  kCustomCompressionA6 = 0xA6,
+  kCustomCompressionA7 = 0xA7,
+  kCustomCompressionA8 = 0xA8,
+  kCustomCompressionA9 = 0xA9,
+  kCustomCompressionAA = 0xAA,
+  kCustomCompressionAB = 0xAB,
+  kCustomCompressionAC = 0xAC,
+  kCustomCompressionAD = 0xAD,
+  kCustomCompressionAE = 0xAE,
+  kCustomCompressionAF = 0xAF,
+  kCustomCompressionB0 = 0xB0,
+  kCustomCompressionB1 = 0xB1,
+  kCustomCompressionB2 = 0xB2,
+  kCustomCompressionB3 = 0xB3,
+  kCustomCompressionB4 = 0xB4,
+  kCustomCompressionB5 = 0xB5,
+  kCustomCompressionB6 = 0xB6,
+  kCustomCompressionB7 = 0xB7,
+  kCustomCompressionB8 = 0xB8,
+  kCustomCompressionB9 = 0xB9,
+  kCustomCompressionBA = 0xBA,
+  kCustomCompressionBB = 0xBB,
+  kCustomCompressionBC = 0xBC,
+  kCustomCompressionBD = 0xBD,
+  kCustomCompressionBE = 0xBE,
+  kCustomCompressionBF = 0xBF,
+  kCustomCompressionC0 = 0xC0,
+  kCustomCompressionC1 = 0xC1,
+  kCustomCompressionC2 = 0xC2,
+  kCustomCompressionC3 = 0xC3,
+  kCustomCompressionC4 = 0xC4,
+  kCustomCompressionC5 = 0xC5,
+  kCustomCompressionC6 = 0xC6,
+  kCustomCompressionC7 = 0xC7,
+  kCustomCompressionC8 = 0xC8,
+  kCustomCompressionC9 = 0xC9,
+  kCustomCompressionCA = 0xCA,
+  kCustomCompressionCB = 0xCB,
+  kCustomCompressionCC = 0xCC,
+  kCustomCompressionCD = 0xCD,
+  kCustomCompressionCE = 0xCE,
+  kCustomCompressionCF = 0xCF,
+  kCustomCompressionD0 = 0xD0,
+  kCustomCompressionD1 = 0xD1,
+  kCustomCompressionD2 = 0xD2,
+  kCustomCompressionD3 = 0xD3,
+  kCustomCompressionD4 = 0xD4,
+  kCustomCompressionD5 = 0xD5,
+  kCustomCompressionD6 = 0xD6,
+  kCustomCompressionD7 = 0xD7,
+  kCustomCompressionD8 = 0xD8,
+  kCustomCompressionD9 = 0xD9,
+  kCustomCompressionDA = 0xDA,
+  kCustomCompressionDB = 0xDB,
+  kCustomCompressionDC = 0xDC,
+  kCustomCompressionDD = 0xDD,
+  kCustomCompressionDE = 0xDE,
+  kCustomCompressionDF = 0xDF,
+  kCustomCompressionE0 = 0xE0,
+  kCustomCompressionE1 = 0xE1,
+  kCustomCompressionE2 = 0xE2,
+  kCustomCompressionE3 = 0xE3,
+  kCustomCompressionE4 = 0xE4,
+  kCustomCompressionE5 = 0xE5,
+  kCustomCompressionE6 = 0xE6,
+  kCustomCompressionE7 = 0xE7,
+  kCustomCompressionE8 = 0xE8,
+  kCustomCompressionE9 = 0xE9,
+  kCustomCompressionEA = 0xEA,
+  kCustomCompressionEB = 0xEB,
+  kCustomCompressionEC = 0xEC,
+  kCustomCompressionED = 0xED,
+  kCustomCompressionEE = 0xEE,
+  kCustomCompressionEF = 0xEF,
+  kCustomCompressionF0 = 0xF0,
+  kCustomCompressionF1 = 0xF1,
+  kCustomCompressionF2 = 0xF2,
+  kCustomCompressionF3 = 0xF3,
+  kCustomCompressionF4 = 0xF4,
+  kCustomCompressionF5 = 0xF5,
+  kCustomCompressionF6 = 0xF6,
+  kCustomCompressionF7 = 0xF7,
+  kCustomCompressionF8 = 0xF8,
+  kCustomCompressionF9 = 0xF9,
+  kCustomCompressionFA = 0xFA,
+  kCustomCompressionFB = 0xFB,
+  kCustomCompressionFC = 0xFC,
+  kCustomCompressionFD = 0xFD,
+  kCustomCompressionFE = 0xFE,
 
   // kDisableCompressionOption is used to disable some compression options.
   kDisableCompressionOption = 0xff,
diff --git a/include/rocksdb/table_properties.h b/include/rocksdb/table_properties.h
index 00e448ba7d7f..76f3529bf576 100644
--- a/include/rocksdb/table_properties.h
+++ b/include/rocksdb/table_properties.h
@@ -344,7 +344,14 @@ struct TableProperties {
   // {collector_name[1]},{collector_name[2]},{collector_name[3]} ..
   std::string property_collectors_names;
 
-  // The compression algo used to compress the SST files.
+  // Identifies the compression algorithm or schema used in the file.
+  // Specifically:
+  // * For format_version < 7, it is one of several names for built-in
+  // compression types. Because of how some previous versions of RocksDB
+  // behave, this must be set to "ZSTD" if any blocks are compressed
+  // with zstd and must NOT be set to "NoCompression" if any blocks are
+  // compressed.
+  // * For format_version >= 7, it is ...
   std::string compression_name;
 
   // Compression options used to compress the SST files.
diff --git a/options/configurable.cc b/options/configurable.cc
index 76ea54116a23..fe1f7efc9ab7 100644
--- a/options/configurable.cc
+++ b/options/configurable.cc
@@ -272,7 +272,8 @@ Status ConfigurableHelper::ConfigureOptions(
   if (config_options.ignore_unknown_options) {
     s = Status::OK();
   } else if (s.ok() && unused == nullptr && !remaining.empty()) {
-    s = Status::NotFound("Could not find option: ", remaining.begin()->first);
+    s = Status::NotFound("Extra option not recognized",
+                         remaining.begin()->first);
   }
   return s;
 }
@@ -369,7 +370,7 @@ Status ConfigurableHelper::ConfigureSingleOption(
   const auto opt_info =
       FindOption(configurable, opt_name, &elem_name, &opt_ptr);
   if (opt_info == nullptr) {
-    return Status::NotFound("Could not find option: ", name);
+    return Status::NotFound("Could not find option", name);
   } else {
     return ConfigureOption(config_options, configurable, *opt_info, opt_name,
                            elem_name, value, opt_ptr);
@@ -465,7 +466,7 @@ Status ConfigurableHelper::ConfigureOption(
     return configurable.ParseOption(config_options, opt_info, name, value,
                                     opt_ptr);
   } else {
-    return Status::NotFound("Could not find option: ", name);
+    return Status::NotFound("Unknown how to configure option", name);
   }
 }
 
diff --git a/options/options_helper.cc b/options/options_helper.cc
index de6b796822a7..999dd28cae94 100644
--- a/options/options_helper.cc
+++ b/options/options_helper.cc
@@ -386,6 +386,133 @@ std::unordered_map<std::string, CompressionType>
         {"kLZ4HCCompression", kLZ4HCCompression},
         {"kXpressCompression", kXpressCompression},
         {"kZSTD", kZSTD},
+        {"kCustomCompression80", kCustomCompression80},
+        {"kCustomCompression81", kCustomCompression81},
+        {"kCustomCompression82", kCustomCompression82},
+        {"kCustomCompression83", kCustomCompression83},
+        {"kCustomCompression84", kCustomCompression84},
+        {"kCustomCompression85", kCustomCompression85},
+        {"kCustomCompression86", kCustomCompression86},
+        {"kCustomCompression87", kCustomCompression87},
+        {"kCustomCompression88", kCustomCompression88},
+        {"kCustomCompression89", kCustomCompression89},
+        {"kCustomCompression8A", kCustomCompression8A},
+        {"kCustomCompression8B", kCustomCompression8B},
+        {"kCustomCompression8C", kCustomCompression8C},
+        {"kCustomCompression8D", kCustomCompression8D},
+        {"kCustomCompression8E", kCustomCompression8E},
+        {"kCustomCompression8F", kCustomCompression8F},
+        {"kCustomCompression90", kCustomCompression90},
+        {"kCustomCompression91", kCustomCompression91},
+        {"kCustomCompression92", kCustomCompression92},
+        {"kCustomCompression93", kCustomCompression93},
+        {"kCustomCompression94", kCustomCompression94},
+        {"kCustomCompression95", kCustomCompression95},
+        {"kCustomCompression96", kCustomCompression96},
+        {"kCustomCompression97", kCustomCompression97},
+        {"kCustomCompression98", kCustomCompression98},
+        {"kCustomCompression99", kCustomCompression99},
+        {"kCustomCompression9A", kCustomCompression9A},
+        {"kCustomCompression9B", kCustomCompression9B},
+        {"kCustomCompression9C", kCustomCompression9C},
+        {"kCustomCompression9D", kCustomCompression9D},
+        {"kCustomCompression9E", kCustomCompression9E},
+        {"kCustomCompression9F", kCustomCompression9F},
+        {"kCustomCompressionA0", kCustomCompressionA0},
+        {"kCustomCompressionA1", kCustomCompressionA1},
+        {"kCustomCompressionA2", kCustomCompressionA2},
+        {"kCustomCompressionA3", kCustomCompressionA3},
+        {"kCustomCompressionA4", kCustomCompressionA4},
+        {"kCustomCompressionA5", kCustomCompressionA5},
+        {"kCustomCompressionA6", kCustomCompressionA6},
+        {"kCustomCompressionA7", kCustomCompressionA7},
+        {"kCustomCompressionA8", kCustomCompressionA8},
+        {"kCustomCompressionA9", kCustomCompressionA9},
+        {"kCustomCompressionAA", kCustomCompressionAA},
+        {"kCustomCompressionAB", kCustomCompressionAB},
+        {"kCustomCompressionAC", kCustomCompressionAC},
+        {"kCustomCompressionAD", kCustomCompressionAD},
+        {"kCustomCompressionAE", kCustomCompressionAE},
+        {"kCustomCompressionAF", kCustomCompressionAF},
+        {"kCustomCompressionB0", kCustomCompressionB0},
+        {"kCustomCompressionB1", kCustomCompressionB1},
+        {"kCustomCompressionB2", kCustomCompressionB2},
+        {"kCustomCompressionB3", kCustomCompressionB3},
+        {"kCustomCompressionB4", kCustomCompressionB4},
+        {"kCustomCompressionB5", kCustomCompressionB5},
+        {"kCustomCompressionB6", kCustomCompressionB6},
+        {"kCustomCompressionB7", kCustomCompressionB7},
+        {"kCustomCompressionB8", kCustomCompressionB8},
+        {"kCustomCompressionB9", kCustomCompressionB9},
+        {"kCustomCompressionBA", kCustomCompressionBA},
+        {"kCustomCompressionBB", kCustomCompressionBB},
+        {"kCustomCompressionBC", kCustomCompressionBC},
+        {"kCustomCompressionBD", kCustomCompressionBD},
+        {"kCustomCompressionBE", kCustomCompressionBE},
+        {"kCustomCompressionBF", kCustomCompressionBF},
+        {"kCustomCompressionC0", kCustomCompressionC0},
+        {"kCustomCompressionC1", kCustomCompressionC1},
+        {"kCustomCompressionC2", kCustomCompressionC2},
+        {"kCustomCompressionC3", kCustomCompressionC3},
+        {"kCustomCompressionC4", kCustomCompressionC4},
+        {"kCustomCompressionC5", kCustomCompressionC5},
+        {"kCustomCompressionC6", kCustomCompressionC6},
+        {"kCustomCompressionC7", kCustomCompressionC7},
+        {"kCustomCompressionC8", kCustomCompressionC8},
+        {"kCustomCompressionC9", kCustomCompressionC9},
+        {"kCustomCompressionCA", kCustomCompressionCA},
+        {"kCustomCompressionCB", kCustomCompressionCB},
+        {"kCustomCompressionCC", kCustomCompressionCC},
+        {"kCustomCompressionCD", kCustomCompressionCD},
+        {"kCustomCompressionCE", kCustomCompressionCE},
+        {"kCustomCompressionCF", kCustomCompressionCF},
+        {"kCustomCompressionD0", kCustomCompressionD0},
+        {"kCustomCompressionD1", kCustomCompressionD1},
+        {"kCustomCompressionD2", kCustomCompressionD2},
+        {"kCustomCompressionD3", kCustomCompressionD3},
+        {"kCustomCompressionD4", kCustomCompressionD4},
+        {"kCustomCompressionD5", kCustomCompressionD5},
+        {"kCustomCompressionD6", kCustomCompressionD6},
+        {"kCustomCompressionD7", kCustomCompressionD7},
+        {"kCustomCompressionD8", kCustomCompressionD8},
+        {"kCustomCompressionD9", kCustomCompressionD9},
+        {"kCustomCompressionDA", kCustomCompressionDA},
+        {"kCustomCompressionDB", kCustomCompressionDB},
+        {"kCustomCompressionDC", kCustomCompressionDC},
+        {"kCustomCompressionDD", kCustomCompressionDD},
+        {"kCustomCompressionDE", kCustomCompressionDE},
+        {"kCustomCompressionDF", kCustomCompressionDF},
+        {"kCustomCompressionE0", kCustomCompressionE0},
+        {"kCustomCompressionE1", kCustomCompressionE1},
+        {"kCustomCompressionE2", kCustomCompressionE2},
+        {"kCustomCompressionE3", kCustomCompressionE3},
+        {"kCustomCompressionE4", kCustomCompressionE4},
+        {"kCustomCompressionE5", kCustomCompressionE5},
+        {"kCustomCompressionE6", kCustomCompressionE6},
+        {"kCustomCompressionE7", kCustomCompressionE7},
+        {"kCustomCompressionE8", kCustomCompressionE8},
+        {"kCustomCompressionE9", kCustomCompressionE9},
+        {"kCustomCompressionEA", kCustomCompressionEA},
+        {"kCustomCompressionEB", kCustomCompressionEB},
+        {"kCustomCompressionEC", kCustomCompressionEC},
+        {"kCustomCompressionED", kCustomCompressionED},
+        {"kCustomCompressionEE", kCustomCompressionEE},
+        {"kCustomCompressionEF", kCustomCompressionEF},
+        {"kCustomCompressionF0", kCustomCompressionF0},
+        {"kCustomCompressionF1", kCustomCompressionF1},
+        {"kCustomCompressionF2", kCustomCompressionF2},
+        {"kCustomCompressionF3", kCustomCompressionF3},
+        {"kCustomCompressionF4", kCustomCompressionF4},
+        {"kCustomCompressionF5", kCustomCompressionF5},
+        {"kCustomCompressionF6", kCustomCompressionF6},
+        {"kCustomCompressionF7", kCustomCompressionF7},
+        {"kCustomCompressionF8", kCustomCompressionF8},
+        {"kCustomCompressionF9", kCustomCompressionF9},
+        {"kCustomCompressionFA", kCustomCompressionFA},
+        {"kCustomCompressionFB", kCustomCompressionFB},
+        {"kCustomCompressionFC", kCustomCompressionFC},
+        {"kCustomCompressionFD", kCustomCompressionFD},
+        {"kCustomCompressionFE", kCustomCompressionFE},
         {"kDisableCompressionOption", kDisableCompressionOption}};
 
 const std::vector<CompressionType>& GetSupportedCompressions() {
@@ -910,7 +1037,7 @@ Status OptionTypeInfo::Parse(const ConfigOptions& config_options,
                                        : value;
 
     if (opt_ptr == nullptr) {
-      return Status::NotFound("Could not find option", opt_name);
+      return Status::NotFound("Nullptr option", opt_name);
     } else if (parse_func_ != nullptr) {
       ConfigOptions copy = config_options;
       copy.invoke_prepare_options = false;
diff --git a/options/options_test.cc b/options/options_test.cc
index bacee1d1edd7..c045266380d3 100644
--- a/options/options_test.cc
+++ b/options/options_test.cc
@@ -2031,7 +2031,7 @@ TEST_F(OptionsTest, GetStringFromCompressionType) {
   ASSERT_EQ(res, "kZlibCompression");
 
   ASSERT_NOK(
-      GetStringFromCompressionType(&res, static_cast<CompressionType>(-10)));
+      GetStringFromCompressionType(&res, static_cast<CompressionType>(0x7F)));
 }
 
 TEST_F(OptionsTest, OnlyMutableDBOptions) {
diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc
index 64fe71351ad2..7ea8b3ed658c 100644
--- a/table/block_based/block_based_table_builder.cc
+++ b/table/block_based/block_based_table_builder.cc
@@ -524,6 +524,7 @@ struct BlockBasedTableBuilder::Rep {
   std::string last_ikey;  // Internal key or empty (unset)
   const Slice* first_key_in_next_block = nullptr;
   bool warm_cache = false;
+  bool uses_explicit_compression_manager = false;
 
   uint64_t sample_for_compression;
   std::atomic<uint64_t> compressible_input_data_bytes;
@@ -560,6 +561,11 @@ struct BlockBasedTableBuilder::Rep {
   // in verifying data blocks.
   UnownedPtr<Decompressor> data_block_verify_decompressor;
 
+  // Set of compression types used for blocks in this file (mixing compression
+  // algorithms in a single file is allowed, using a CompressionManager)
+  SmallEnumSet<CompressionType, kDisableCompressionOption>
+      compression_types_used;
+
   // Working area for basic_compressor when compression_parallel_threads==1
   WorkingAreaPair basic_working_area;
   // Working areas for data_block_compressor, for each of
@@ -770,15 +776,25 @@ struct BlockBasedTableBuilder::Rep {
       assert(filter_context.level_at_creation < filter_context.num_levels);
     }
 
-    // TODO: get CompressionManager from options and sort out properties
-    auto mgr = tbo.moptions.compression_manager;
+    props.compression_options =
+        CompressionOptionsToString(tbo.compression_opts);
+
+    auto* mgr = tbo.moptions.compression_manager.get();
     if (mgr == nullptr) {
+      uses_explicit_compression_manager = false;
       mgr = GetBuiltinCompressionManager(
-          GetCompressFormatForVersion(table_opt.format_version));
+                GetCompressFormatForVersion(
+                    static_cast<uint32_t>(table_opt.format_version)))
+                .get();
+    } else {
+      uses_explicit_compression_manager = true;
+
+      // Stuff some extra debugging info as extra pseudo-options. Using
+      // underscore prefix to indicate they are special.
+      props.compression_options.append("_compression_manager=");
+      props.compression_options.append(mgr->GetId());
+      props.compression_options.append("; ");
     }
-    props.compression_name = CompressionTypeToString(tbo.compression_type);
-    props.compression_options =
-        CompressionOptionsToString(tbo.compression_opts);
 
     // Sanitize to only allowing compression when it saves space.
     max_compressed_bytes_per_kb =
@@ -810,8 +826,7 @@ struct BlockBasedTableBuilder::Rep {
               data_block_compressor->ObtainWorkingArea();
         }
       }
-      basic_decompressor =
-          mgr->GetDecompressorOptimizeFor(tbo.compression_type);
+      basic_decompressor = mgr->GetDecompressorForCompressor(*basic_compressor);
       create_context.decompressor = basic_decompressor.get();
 
       if (table_options.verify_compression) {
@@ -930,6 +945,7 @@ struct BlockBasedTableBuilder::Rep {
     // Default is UINT64_MAX for unknown. Setting it to 0 here
     // to allow updating it by taking max in BlockBasedTableBuilder::Add().
     props.key_largest_seqno = 0;
+    PrePopulateCompressionProperties(mgr);
 
     if (FormatVersionUsesContextChecksum(table_options.format_version)) {
       // Must be non-zero and semi- or quasi-random
@@ -954,6 +970,92 @@ struct BlockBasedTableBuilder::Rep {
   Rep(const Rep&) = delete;
   Rep& operator=(const Rep&) = delete;
 
+  void PrePopulateCompressionProperties(UnownedPtr<CompressionManager> mgr) {
+    if (FormatVersionUsesCompressionManagerName(table_options.format_version)) {
+      assert(mgr);
+      // Use newer compression_name property
+      props.compression_name.reserve(32);
+      props.compression_name.append(mgr->CompatibilityName());
+      props.compression_name.push_back(';');
+      // Rest of property to be filled out at the end of building the file
+    } else {
+      // Use legacy compression_name property, populated at the end of building
+      // the file. Not compatible with compression managers using custom
+      // algorithms / compression types.
+      assert(Slice(mgr->CompatibilityName())
+                 .compare(GetBuiltinCompressionManager(
+                              GetCompressFormatForVersion(
+                                  static_cast<uint32_t>(props.format_version)))
+                              ->CompatibilityName()) == 0);
+    }
+  }
+  void PostPopulateCompressionProperties() {
+    // Do not include "no compression" in the set. It's not really useful
+    // information whether there are any uncompressed blocks. Some kinds of
+    // blocks are never compressed anyway.
+    compression_types_used.Remove(kNoCompression);
+    size_t ctype_count = compression_types_used.count();
+
+    if (uses_explicit_compression_manager) {
+      // Stuff some extra debugging info as extra pseudo-options. Using
+      // underscore prefix to indicate they are special.
+      std::string& compression_options = props.compression_options;
+      compression_options.append("_compressor=");
+      compression_options.append(data_block_compressor
+                                     ? data_block_compressor->GetId()
+                                     : std::string{});
+      compression_options.append("; ");
+    } else {
+      // No explicit compression manager
+      assert(compression_types_used.count() <= 1);
+    }
+
+    std::string& compression_name = props.compression_name;
+    if (FormatVersionUsesCompressionManagerName(table_options.format_version)) {
+      // Fill in extended field of "compression name" property, which is the set
+      // of compression types used, sorted by unsigned byte and then hex
+      // encoded with two digits each (so that table properties are human
+      // readable).
+      assert(*compression_name.rbegin() == ';');
+      size_t pos = compression_name.size();
+      // Make space for the field contents
+      compression_name.append(ctype_count * 2, '\0');
+      char* ptr = compression_name.data() + pos;
+      // Populate the field contents
+      for (CompressionType t : compression_types_used) {
+        PutBaseChars<16>(&ptr, /*digits=*/2, static_cast<unsigned char>(t),
+                         /*uppercase=*/true);
+      }
+      assert(ptr == compression_name.data() + pos + ctype_count * 2);
+      // Allow additional fields in the future
+      compression_name.push_back(';');
+    } else {
+      // Use legacy compression naming. To adhere to requirements described in
+      // TableProperties::compression_name, we might have to replace the name
+      // based on the legacy configured compression type.
+      assert(compression_name.empty());
+      if (ctype_count == 0) {
+        // We could get a slight performance boost in the reader by marking the
+        // file as "no compression" if compression is configured but
+        // consistently rejected, but that would give misleading info for
+        // debugging purposes. So instead we record the configured compression
+        // type, matching the historical behavior.
+        if (data_block_compressor) {
+          compression_name = CompressionTypeToString(
+              data_block_compressor->GetPreferredCompressionType());
+        } else {
+          assert(basic_compressor == nullptr);
+          compression_name = CompressionTypeToString(kNoCompression);
+        }
+      } else if (compression_types_used.Contains(kZSTD)) {
+        compression_name = CompressionTypeToString(kZSTD);
+      } else {
+        compression_name =
+            CompressionTypeToString(*compression_types_used.begin());
+      }
+    }
+  }
+
  private:
   // Synchronize status & io_status accesses across threads from main thread,
   // compression thread and write thread in parallel compression.
@@ -1432,6 +1534,7 @@ void BlockBasedTableBuilder::WriteMaybeCompressedBlock(
     }
   }
 
+  r->compression_types_used.Add(comp_type);
   std::array<char, kBlockTrailerSize> trailer;
   trailer[0] = comp_type;
   uint32_t checksum = ComputeBuiltinChecksumWithLastByte(
@@ -1783,6 +1886,9 @@ void BlockBasedTableBuilder::WritePropertiesBlock(
     }
     property_collectors_names += "]";
     rep_->props.property_collectors_names = property_collectors_names;
+
+    rep_->PostPopulateCompressionProperties();
+
     if (rep_->table_options.index_type ==
         BlockBasedTableOptions::kTwoLevelIndexSearch) {
       assert(rep_->p_index_builder_ != nullptr);
@@ -1930,6 +2036,8 @@ void BlockBasedTableBuilder::EnterUnbuffered() {
   if (kNumBlocksBuffered == 0) {
     // The below code is neither safe nor necessary for handling zero data
     // blocks.
+    // For PostPopulateCompressionProperties()
+    r->data_block_compressor = r->basic_compressor.get();
     return;
   }
 
diff --git a/table/block_based/block_based_table_factory.cc b/table/block_based/block_based_table_factory.cc
index c93dea119f3b..1bf18f0b9f84 100644
--- a/table/block_based/block_based_table_factory.cc
+++ b/table/block_based/block_based_table_factory.cc
@@ -469,7 +469,7 @@ void BlockBasedTableFactory::InitializeOptions() {
   }
 
   if (table_options_.format_version < kMinSupportedFormatVersion) {
-    if (AllowUnsupportedFormatVersion()) {
+    if (TEST_AllowUnsupportedFormatVersion()) {
       // Allow old format version for testing.
       // And relevant old sanitization.
       if (table_options_.format_version == 0 &&
@@ -569,9 +569,11 @@ Status BlockBasedTableFactory::NewTableReader(
       file_size, table_reader_options.block_protection_bytes_per_key,
       table_reader, table_reader_options.tail_size,
       shared_state_->table_reader_cache_res_mgr,
-      table_reader_options.prefix_extractor, prefetch_index_and_filter_in_cache,
-      table_reader_options.skip_filters, table_reader_options.level,
-      table_reader_options.immortal, table_reader_options.largest_seqno,
+      table_reader_options.prefix_extractor,
+      table_reader_options.compression_manager,
+      prefetch_index_and_filter_in_cache, table_reader_options.skip_filters,
+      table_reader_options.level, table_reader_options.immortal,
+      table_reader_options.largest_seqno,
       table_reader_options.force_direct_prefetch,
       &shared_state_->tail_prefetch_stats,
       table_reader_options.block_cache_tracer,
@@ -608,28 +610,67 @@ Status BlockBasedTableFactory::ValidateOptions(
         "Enable pin_l0_filter_and_index_blocks_in_cache, "
         ", but block cache is disabled");
   }
-  if (!IsSupportedFormatVersion(table_options_.format_version)) {
+  if (!IsSupportedFormatVersion(table_options_.format_version) &&
+      !TEST_AllowUnsupportedFormatVersion()) {
     return Status::InvalidArgument(
         "Unsupported BlockBasedTable format_version. Please check "
         "include/rocksdb/table.h for more info");
   }
-  if (table_options_.block_align && (cf_opts.compression != kNoCompression)) {
-    return Status::InvalidArgument(
-        "Enable block_align, but compression "
-        "enabled");
-  }
-  if (table_options_.block_align &&
-      cf_opts.bottommost_compression != kDisableCompressionOption &&
-      cf_opts.bottommost_compression != kNoCompression) {
-    return Status::InvalidArgument(
-        "Enable block_align, but bottommost_compression enabled");
+  bool using_builtin_compatible_compression = true;
+  if (cf_opts.compression_manager &&
+      strcmp(cf_opts.compression_manager->CompatibilityName(),
+             GetBuiltinCompressionManager(
+                 GetCompressFormatForVersion(table_options_.format_version))
+                 ->CompatibilityName()) != 0) {
+    if (FormatVersionUsesCompressionManagerName(
+            table_options_.format_version)) {
+      using_builtin_compatible_compression = false;
+    } else {
+      return Status::InvalidArgument(
+          "Using a CompressionManager incompatible with built-in (custom "
+          "CompatibilityName()) is not supported for format_version < 7");
+    }
   }
-  if (table_options_.block_align) {
-    for (auto level_compression : cf_opts.compression_per_level) {
-      if (level_compression != kDisableCompressionOption &&
-          level_compression != kNoCompression) {
+  auto validate_compression_type_fn = [&](CompressionType ctype,
+                                          const char* context) {
+    if (ctype == kNoCompression) {
+      return Status::OK();
+    }
+    if (ctype == kDisableCompressionOption) {
+      if (strcmp(context, "compression") == 0) {
         return Status::InvalidArgument(
-            "Enable block_align, but compression_per_level enabled");
+            "kDisableCompressionOption not permitted for option: "
+            "compression");
+      } else {
+        return Status::OK();
+      }
+    }
+    if (table_options_.block_align) {
+      return Status::InvalidArgument("Enable block_align, but " +
+                                     std::string(context) + " enabled");
+    }
+    if (ctype > kLastBuiltinCompression &&
+        using_builtin_compatible_compression) {
+      return Status::InvalidArgument(
+          "Using a CompressionType other than built-in ...");  // TODO
+    }
+    // Otherwise
+    return Status::OK();
+  };
+  {
+    Status s = validate_compression_type_fn(cf_opts.compression, "compression");
+    if (!s.ok()) {
+      return s;
+    }
+    s = validate_compression_type_fn(cf_opts.bottommost_compression,
+                                     "bottommost_compression");
+    if (!s.ok()) {
+      return s;
+    }
+    for (auto ctype : cf_opts.compression_per_level) {
+      s = validate_compression_type_fn(ctype, "compression_per_level");
+      if (!s.ok()) {
+        return s;
       }
     }
   }
@@ -924,11 +965,6 @@ Status BlockBasedTableFactory::ParseOption(const ConfigOptions& config_options,
   return status;
 }
 
-bool& BlockBasedTableFactory::AllowUnsupportedFormatVersion() {
-  static bool allow = false;
-  return allow;
-}
-
 Status GetBlockBasedTableOptionsFromString(
     const ConfigOptions& config_options,
     const BlockBasedTableOptions& table_options, const std::string& opts_str,
diff --git a/table/block_based/block_based_table_factory.h b/table/block_based/block_based_table_factory.h
index fd1c577d7f5d..d1d13f4e2108 100644
--- a/table/block_based/block_based_table_factory.h
+++ b/table/block_based/block_based_table_factory.h
@@ -89,10 +89,6 @@ class BlockBasedTableFactory : public TableFactory {
 
   static constexpr int kMinSupportedFormatVersion = 2;
 
-  // Set to true to allow unit testing of writing unsupported block-based table
-  // format versions (to test read side)
-  static bool& AllowUnsupportedFormatVersion();
-
  protected:
   const void* GetOptionsPtr(const std::string& name) const override;
   Status ParseOption(const ConfigOptions& config_options,
diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc
index f11458f5cee0..27d1753cc936 100644
--- a/table/block_based/block_based_table_reader.cc
+++ b/table/block_based/block_based_table_reader.cc
@@ -562,6 +562,105 @@ Status GetGlobalSequenceNumber(const TableProperties& table_properties,
 
   return Status::OK();
 }
+
+Status GetDecompressor(const std::string& compression_name,
+                       UnownedPtr<CompressionManager> compression_manager,
+                       uint32_t table_format_version,
+                       std::shared_ptr<Decompressor>* out_decompressor) {
+  if (compression_name.empty()) {
+    // Very old file (before RocksDB 4.9.0) that might contain compressed
+    // blocks. Get a general decompressor for the format version.
+    auto mgr_to_use = GetBuiltinCompressionManager(
+        GetCompressFormatForVersion(table_format_version));
+    *out_decompressor = mgr_to_use->GetDecompressor();
+    return Status::OK();
+  }
+  if (FormatVersionUsesCompressionManagerName(table_format_version)) {
+    constexpr char kFieldSep = ';';
+    size_t separator_pos = compression_name.find_first_of(kFieldSep);
+    if (separator_pos == std::string::npos) {
+      return Status::Corruption(
+          "Missing separator in compression_name property");
+    }
+    // Built with explicit CompressionManager and schema support for
+    // identifying its compatibility name, which is the first field here.
+    Slice compatibility_name(compression_name.data(), separator_pos);
+    std::shared_ptr<CompressionManager> mgr_to_use;
+    if (compression_manager) {
+      // First attempt to go through the compression manager configured for
+      // writing new files, for efficiency (usually correct) and not forcing
+      // use of ObjectLibrary registration (dependency injection).
+      mgr_to_use = compression_manager->FindCompatibleCompressionManager(
+          compatibility_name);
+    }
+    if (mgr_to_use == nullptr) {
+      ConfigOptions strict;
+      strict.ignore_unknown_options = false;
+      strict.ignore_unsupported_options = false;
+      Status s = CompressionManager::CreateFromString(
+          strict, compatibility_name.ToString(), &mgr_to_use);
+      // Even though we might be able to recover from "not found" if only
+      // built-in compression types are used (would be checked below), it
+      // would provide misleading or unreliable success to allow that to
+      // succeed.
+      if (!s.ok()) {
+        return s;
+      }
+      assert(mgr_to_use);
+    }
+
+    // Second field is set of compression types actually used in the file
+    size_t start_pos = separator_pos + 1;
+    separator_pos = compression_name.find_first_of(kFieldSep, start_pos);
+    if (UNLIKELY(separator_pos == std::string::npos)) {
+      return Status::Corruption("Missing second field from compression_name");
+    }
+    if (UNLIKELY((separator_pos - start_pos) & 1)) {
+      return Status::Corruption(
+          "Second field of compression_name has odd size");
+    }
+    size_t count = (separator_pos - start_pos) / 2;
+    auto ctypes = std::make_unique<CompressionType[]>(count);
+    const char* ptr = compression_name.data() + start_pos;
+    for (size_t i = 0; i < count; ++i) {
+      uint64_t val = 0;
+      bool success = ParseBaseChars<16>(&ptr, 2, &val);
+      if (UNLIKELY(!success || val == kNoCompression ||
+                   val >= kDisableCompressionOption)) {
+        return Status::Corruption(
+            "Error parsing second field of compression_name");
+      }
+      ctypes[i] = static_cast<CompressionType>(val);
+    }
+    *out_decompressor =
+        mgr_to_use->GetDecompressorForTypes(ctypes.get(), ctypes.get() + count);
+    assert(*out_decompressor || count == 0);
+    // Can ignore possible additional future fields
+  } else {
+    // No explicit CompressionManager, e.g. legacy file support where
+    // decompressing with built-in CompressionManager works.
+    CompressionType saved_comp_type =
+        CompressionTypeFromString(compression_name);
+    if (saved_comp_type == kDisableCompressionOption) {
+      // Unrecognized. For RocksDB versions able to read format_version=7,
+      // this is considered an error so that we can continue to evolve the
+      // schema of the compression_name property and report good error
+      // messages.
+      return Status::Corruption("Unrecognized compression_name: " +
+                                compression_name);
+    } else if (saved_comp_type != kNoCompression) {
+      // Use built-in compression manager
+      auto mgr_to_use = GetBuiltinCompressionManager(
+          GetCompressFormatForVersion(table_format_version));
+      *out_decompressor =
+          mgr_to_use->GetDecompressorOptimizeFor(saved_comp_type);
+    } else {
+      // No compression -> decompressor not needed
+      *out_decompressor = nullptr;
+    }
+  }
+  return Status::OK();
+}
 }  // namespace
 
 void BlockBasedTable::SetupBaseCacheKey(const TableProperties* properties,
@@ -629,6 +728,7 @@ Status BlockBasedTable::Open(
     std::unique_ptr<TableReader>* table_reader, uint64_t tail_size,
     std::shared_ptr<CacheReservationManager> table_reader_cache_res_mgr,
     const std::shared_ptr<const SliceTransform>& prefix_extractor,
+    UnownedPtr<CompressionManager> compression_manager,
     const bool prefetch_index_and_filter_in_cache, const bool skip_filters,
     const int level, const bool immortal_table,
     const SequenceNumber largest_seqno, const bool force_direct_prefetch,
@@ -696,7 +796,8 @@ Status BlockBasedTable::Open(
     }
     return s;
   }
-  if (!IsSupportedFormatVersion(footer.format_version())) {
+  if (!IsSupportedFormatVersion(footer.format_version()) &&
+      !TEST_AllowUnsupportedFormatVersion()) {
     return Status::Corruption(
         "Unknown Footer version. Maybe this file was created with newer "
         "version of RocksDB?");
@@ -746,17 +847,13 @@ Status BlockBasedTable::Open(
     return s;
   }
 
-  CompressionType saved_comp_type = CompressionTypeFromString(
+  // Read compression metadata and configure decompressor
+  s = GetDecompressor(
       rep->table_properties ? rep->table_properties->compression_name
-                            : std::string{});
-  if (saved_comp_type != kNoCompression) {
-    // Includes "unrecognized" or "unspecified" case, including some old files
-    // before the compression_name table property was introduced in
-    // version 4.9.0
-    // TODO: custom CompressionManager
-    auto mgr = GetBuiltinCompressionManager(
-        GetCompressFormatForVersion(footer.format_version()));
-    rep->decompressor = mgr->GetDecompressorOptimizeFor(saved_comp_type);
+                            : std::string{},
+      compression_manager, footer.format_version(), &rep->decompressor);
+  if (!s.ok()) {
+    return s;
   }
 
   // Populate BlockCreateContext
diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h
index b01a67d007eb..8a8a3e493a02 100644
--- a/table/block_based/block_based_table_reader.h
+++ b/table/block_based/block_based_table_reader.h
@@ -106,6 +106,7 @@ class BlockBasedTable : public TableReader {
       std::shared_ptr<CacheReservationManager> table_reader_cache_res_mgr =
           nullptr,
       const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr,
+      UnownedPtr<CompressionManager> compression_manager = nullptr,
       bool prefetch_index_and_filter_in_cache = true, bool skip_filters = false,
       int level = -1, const bool immortal_table = false,
       const SequenceNumber largest_seqno = 0,
diff --git a/table/block_based/block_based_table_reader_test.cc b/table/block_based/block_based_table_reader_test.cc
index 19e652cc3ceb..a8c6d5d17fd3 100644
--- a/table/block_based/block_based_table_reader_test.cc
+++ b/table/block_based/block_based_table_reader_test.cc
@@ -163,10 +163,11 @@ class BlockBasedTableReaderBaseTest : public testing::Test {
                                 bool user_defined_timestamps_persisted = true) {
     const MutableCFOptions moptions(options_);
     TableReaderOptions table_reader_options = TableReaderOptions(
-        ioptions, moptions.prefix_extractor, foptions, comparator,
-        0 /* block_protection_bytes_per_key */, false /* _skip_filters */,
-        false /* _immortal */, false /* _force_direct_prefetch */,
-        -1 /* _level */, nullptr /* _block_cache_tracer */,
+        ioptions, moptions.prefix_extractor, moptions.compression_manager.get(),
+        foptions, comparator, 0 /* block_protection_bytes_per_key */,
+        false /* _skip_filters */, false /* _immortal */,
+        false /* _force_direct_prefetch */, -1 /* _level */,
+        nullptr /* _block_cache_tracer */,
         0 /* _max_file_size_for_l0_meta_pin */, "" /* _cur_db_session_id */,
         0 /* _cur_file_num */, {} /* _unique_id */, 0 /* _largest_seqno */,
         0 /* _tail_size */, user_defined_timestamps_persisted);
diff --git a/table/block_based/data_block_hash_index_test.cc b/table/block_based/data_block_hash_index_test.cc
index 7970ca1d9f9b..5bf0faa14ab0 100644
--- a/table/block_based/data_block_hash_index_test.cc
+++ b/table/block_based/data_block_hash_index_test.cc
@@ -582,7 +582,8 @@ void TestBoundary(InternalKey& ik1, std::string& v1, InternalKey& ik2,
   const bool kSkipFilters = true;
   const bool kImmortal = true;
   ASSERT_OK(moptions.table_factory->NewTableReader(
-      TableReaderOptions(ioptions, moptions.prefix_extractor, soptions,
+      TableReaderOptions(ioptions, moptions.prefix_extractor,
+                         nullptr /* compression_manager */, soptions,
                          internal_comparator,
                          0 /* block_protection_bytes_per_key */, !kSkipFilters,
                          !kImmortal, level_),
diff --git a/table/format.cc b/table/format.cc
index 7164044eed64..13cebde7682e 100644
--- a/table/format.cc
+++ b/table/format.cc
@@ -229,7 +229,8 @@ Status FooterBuilder::Build(uint64_t magic_number, uint32_t format_version,
                             const BlockHandle& index_handle,
                             uint32_t base_context_checksum) {
   assert(magic_number != Footer::kNullTableMagicNumber);
-  assert(IsSupportedFormatVersion(format_version));
+  assert(IsSupportedFormatVersion(format_version) ||
+         TEST_AllowUnsupportedFormatVersion());
 
   char* part2;
   char* part3;
@@ -362,7 +363,8 @@ Status Footer::DecodeFrom(Slice input, uint64_t input_offset,
   } else {
     part3_ptr = magic_ptr - 4;
     format_version_ = DecodeFixed32(part3_ptr);
-    if (UNLIKELY(!IsSupportedFormatVersion(format_version_))) {
+    if (UNLIKELY(!IsSupportedFormatVersion(format_version_) &&
+                 !TEST_AllowUnsupportedFormatVersion())) {
       return Status::Corruption("Corrupt or unsupported format_version: " +
                                 std::to_string(format_version_));
     }
@@ -475,6 +477,11 @@ std::string Footer::ToString() const {
   return result;
 }
 
+bool& TEST_AllowUnsupportedFormatVersion() {
+  static bool allow = false;
+  return allow;
+}
+
 static Status ReadFooterFromFileInternal(const IOOptions& opts,
                                          RandomAccessFileReader* file,
                                          FileSystem& fs,
diff --git a/table/format.h b/table/format.h
index 5737c2cd2684..ffc120e35eb0 100644
--- a/table/format.h
+++ b/table/format.h
@@ -175,6 +175,10 @@ inline bool FormatVersionUsesIndexHandleInFooter(uint32_t version) {
   return version < 6;
 }
 
+inline bool FormatVersionUsesCompressionManagerName(uint32_t version) {
+  return version >= 7;
+}
+
 // Footer encapsulates the fixed information stored at the tail end of every
 // SST file. In general, it should only include things that cannot go
 // elsewhere under the metaindex block. For example, checksum_type is
@@ -308,6 +312,10 @@ class FooterBuilder {
   std::array<char, Footer::kMaxEncodedLength> data_;
 };
 
+// Set to true to allow unit testing of writing unsupported block-based table
+// format versions (to test read side)
+bool& TEST_AllowUnsupportedFormatVersion();
+
 // Read the footer from file
 // If enforce_table_magic_number != 0, ReadFooterFromFile() will return
 // corruption if table_magic number is not equal to enforce_table_magic_number
diff --git a/table/sst_file_dumper.cc b/table/sst_file_dumper.cc
index cbad9aa120d8..b19fc0ab4aff 100644
--- a/table/sst_file_dumper.cc
+++ b/table/sst_file_dumper.cc
@@ -172,7 +172,8 @@ Status SstFileDumper::NewTableReader(
     const InternalKeyComparator& /*internal_comparator*/, uint64_t file_size,
     std::unique_ptr<TableReader>* /*table_reader*/) {
   auto t_opt = TableReaderOptions(
-      ioptions_, moptions_.prefix_extractor, soptions_, internal_comparator_,
+      ioptions_, moptions_.prefix_extractor,
+      moptions_.compression_manager.get(), soptions_, internal_comparator_,
       0 /* block_protection_bytes_per_key */, false /* skip_filters */,
       false /* immortal */, true /* force_direct_prefetch */, -1 /* level */,
       nullptr /* block_cache_tracer */, 0 /* max_file_size_for_l0_meta_pin */,
diff --git a/table/sst_file_reader.cc b/table/sst_file_reader.cc
index d0a4e8de4598..11013712e281 100644
--- a/table/sst_file_reader.cc
+++ b/table/sst_file_reader.cc
@@ -62,7 +62,8 @@ Status SstFileReader::Open(const std::string& file_path) {
   }
   if (s.ok()) {
     TableReaderOptions t_opt(
-        r->ioptions, r->moptions.prefix_extractor, r->soptions,
+        r->ioptions, r->moptions.prefix_extractor,
+        r->moptions.compression_manager.get(), r->soptions,
         r->ioptions.internal_comparator,
         r->moptions.block_protection_bytes_per_key,
         /*skip_filters*/ false, /*immortal*/ false,
diff --git a/table/table_builder.h b/table/table_builder.h
index 10b3476b6b68..8d0132966f8d 100644
--- a/table/table_builder.h
+++ b/table/table_builder.h
@@ -24,6 +24,7 @@
 #include "rocksdb/table_properties.h"
 #include "table/unique_id_impl.h"
 #include "trace_replay/block_cache_tracer.h"
+#include "util/cast_util.h"
 
 namespace ROCKSDB_NAMESPACE {
 
@@ -35,6 +36,7 @@ struct TableReaderOptions {
   TableReaderOptions(
       const ImmutableOptions& _ioptions,
       const std::shared_ptr<const SliceTransform>& _prefix_extractor,
+      UnownedPtr<CompressionManager> _compression_manager,
       const EnvOptions& _env_options,
       const InternalKeyComparator& _internal_comparator,
       uint8_t _block_protection_bytes_per_key, bool _skip_filters = false,
@@ -46,6 +48,7 @@ struct TableReaderOptions {
       uint64_t _tail_size = 0, bool _user_defined_timestamps_persisted = true)
       : ioptions(_ioptions),
         prefix_extractor(_prefix_extractor),
+        compression_manager(_compression_manager),
         env_options(_env_options),
         internal_comparator(_internal_comparator),
         skip_filters(_skip_filters),
@@ -64,6 +67,9 @@ struct TableReaderOptions {
 
   const ImmutableOptions& ioptions;
   const std::shared_ptr<const SliceTransform>& prefix_extractor;
+  // NOTE: the compression manager is not saved, just potentially a decompressor
+  // from it, so we don't need a shared_ptr copy
+  UnownedPtr<CompressionManager> compression_manager;
   const EnvOptions& env_options;
   const InternalKeyComparator& internal_comparator;
   // This is only used for BlockBasedTable (reader)
diff --git a/table/table_reader_bench.cc b/table/table_reader_bench.cc
index a588f6eea07c..8a164488c8c0 100644
--- a/table/table_reader_bench.cc
+++ b/table/table_reader_bench.cc
@@ -145,8 +145,9 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options,
     std::unique_ptr<RandomAccessFileReader> file_reader(
         new RandomAccessFileReader(std::move(raf), file_name));
     s = opts.table_factory->NewTableReader(
-        TableReaderOptions(ioptions, moptions.prefix_extractor, env_options,
-                           ikc, 0 /* block_protection_bytes_per_key */),
+        TableReaderOptions(ioptions, moptions.prefix_extractor,
+                           moptions.compression_manager.get(), env_options, ikc,
+                           0 /* block_protection_bytes_per_key */),
         std::move(file_reader), file_size, &table_reader);
     if (!s.ok()) {
       fprintf(stderr, "Open Table Error: %s\n", s.ToString().c_str());
diff --git a/table/table_test.cc b/table/table_test.cc
index 302ff718588e..94be08ddcec3 100644
--- a/table/table_test.cc
+++ b/table/table_test.cc
@@ -444,7 +444,8 @@ class TableConstructor : public Constructor {
 
     file_reader_.reset(new RandomAccessFileReader(std::move(source), "test"));
     return moptions.table_factory->NewTableReader(
-        TableReaderOptions(ioptions, moptions.prefix_extractor, soptions,
+        TableReaderOptions(ioptions, moptions.prefix_extractor,
+                           moptions.compression_manager.get(), soptions,
                            *last_internal_comparator_,
                            0 /* block_protection_bytes_per_key */,
                            /*skip_filters*/ false,
@@ -5332,7 +5333,8 @@ TEST_P(BlockBasedTableTest, DISABLED_TableWithGlobalSeqno) {
         new RandomAccessFileReader(std::move(source), ""));
 
     options.table_factory->NewTableReader(
-        TableReaderOptions(ioptions, moptions.prefix_extractor, EnvOptions(),
+        TableReaderOptions(ioptions, moptions.prefix_extractor,
+                           moptions.compression_manager.get(), EnvOptions(),
                            ikc, 0 /* block_protection_bytes_per_key */),
         std::move(file_reader), ss_rw.contents().size(), &table_reader);
 
@@ -5507,7 +5509,8 @@ TEST_P(BlockBasedTableTest, BlockAlignTest) {
   const MutableCFOptions moptions2(options2);
 
   ASSERT_OK(moptions.table_factory->NewTableReader(
-      TableReaderOptions(ioptions2, moptions2.prefix_extractor, EnvOptions(),
+      TableReaderOptions(ioptions2, moptions2.prefix_extractor,
+                         moptions2.compression_manager.get(), EnvOptions(),
                          GetPlainInternalComparator(options2.comparator),
                          0 /* block_protection_bytes_per_key */),
       std::move(file_reader), sink->contents().size(), &table_reader));
@@ -7386,8 +7389,7 @@ TEST_F(ExternalTableTest, IngestionTest) {
 
 int main(int argc, char** argv) {
   // Opt-in this whole test file
-  ROCKSDB_NAMESPACE::BlockBasedTableFactory::AllowUnsupportedFormatVersion() =
-      true;
+  ROCKSDB_NAMESPACE::TEST_AllowUnsupportedFormatVersion() = true;
 
   ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
   ::testing::InitGoogleTest(&argc, argv);
diff --git a/test_util/testutil.h b/test_util/testutil.h
index 2d693b5f201f..1713e2dbcc67 100644
--- a/test_util/testutil.h
+++ b/test_util/testutil.h
@@ -71,7 +71,7 @@ bool ShouldPersistUDT(const UserDefinedTimestampTestMode& test_mode);
 // Store in *dst a string of length "len" that will compress to
 // "N*compressed_fraction" bytes and return a Slice that references
 // the generated data.
-Slice CompressibleString(Random* rnd, double compressed_fraction, int len,
+Slice CompressibleString(Random* rnd, double compressed_to_fraction, int len,
                          std::string* dst);
 
 #ifndef NDEBUG
diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc
index aeda592e8934..3eaceaca9e82 100644
--- a/tools/db_bench_tool.cc
+++ b/tools/db_bench_tool.cc
@@ -4495,6 +4495,7 @@ class Benchmark {
       block_based_options.block_restart_interval = FLAGS_block_restart_interval;
       block_based_options.index_block_restart_interval =
           FLAGS_index_block_restart_interval;
+      TEST_AllowUnsupportedFormatVersion() = true;
       block_based_options.format_version =
           static_cast<uint32_t>(FLAGS_format_version);
       block_based_options.read_amp_bytes_per_bit = FLAGS_read_amp_bytes_per_bit;
diff --git a/util/auto_skip_compressor.cc b/util/auto_skip_compressor.cc
index 3337a2dd95e8..7652f874485c 100644
--- a/util/auto_skip_compressor.cc
+++ b/util/auto_skip_compressor.cc
@@ -8,6 +8,7 @@
 
 #include "options/options_helper.h"
 #include "rocksdb/advanced_compression.h"
+#include "test_util/sync_point.h"
 #include "util/random.h"
 namespace ROCKSDB_NAMESPACE {
 
@@ -52,6 +53,10 @@ AutoSkipCompressorWrapper::AutoSkipCompressorWrapper(
   (void)opts_;
 }
 
+const char* AutoSkipCompressorWrapper::Name() const {
+  return "AutoSkipCompressorWrapper";
+}
+
 Status AutoSkipCompressorWrapper::CompressBlock(
     Slice uncompressed_data, std::string* compressed_output,
     CompressionType* out_compression_type, ManagedWorkingArea* wa) {
diff --git a/util/auto_skip_compressor.h b/util/auto_skip_compressor.h
index 0a6bcec2059c..4100388c4853 100644
--- a/util/auto_skip_compressor.h
+++ b/util/auto_skip_compressor.h
@@ -10,7 +10,6 @@
 #include <memory>
 
 #include "rocksdb/advanced_compression.h"
-#include "util/compression.h"
 
 namespace ROCKSDB_NAMESPACE {
 // Predict rejection probability using a moving window approach
@@ -36,6 +35,7 @@ class CompressionRejectionProbabilityPredictor {
 
 class AutoSkipCompressorWrapper : public CompressorWrapper {
  public:
+  const char* Name() const override;
   explicit AutoSkipCompressorWrapper(std::unique_ptr<Compressor> compressor,
                                      const CompressionOptions& opts,
                                      const CompressionType type);
diff --git a/util/compression.cc b/util/compression.cc
index 998c45fdaa4c..e48d8fd19b39 100644
--- a/util/compression.cc
+++ b/util/compression.cc
@@ -132,8 +132,8 @@ Status Decompressor::ExtractUncompressedSize(Args& args) {
   // payload. (RocksDB compress_format_version=2 except Snappy)
   //
   // This is historically a varint32, but it is preliminarily generalized
-  // to varint64. (TODO: support that on the write side, at least for some
-  // codecs, in BBT format_version=7)
+  // to varint64, in case that is supported on the write side for some
+  // algorithms.
   if (LIKELY(GetVarint64(&args.compressed_data, &args.uncompressed_size))) {
     if (LIKELY(args.uncompressed_size <= SIZE_MAX)) {
       return Status::OK();
@@ -155,6 +155,8 @@ namespace {
 
 class BuiltinCompressorV1 : public Compressor {
  public:
+  const char* Name() const override { return "BuiltinCompressorV1"; }
+
   explicit BuiltinCompressorV1(const CompressionOptions& opts,
                                CompressionType type)
       : opts_(opts), type_(type) {
@@ -192,6 +194,8 @@ class BuiltinCompressorV1 : public Compressor {
 
 class BuiltinCompressorV2 : public Compressor {
  public:
+  const char* Name() const override { return "BuiltinCompressorV2"; }
+
   explicit BuiltinCompressorV2(const CompressionOptions& opts,
                                CompressionType type,
                                CompressionDict&& dict = {})
@@ -248,8 +252,7 @@ class BuiltinCompressorV2 : public Compressor {
 
   // TODO: use ZSTD_CCtx directly
   ManagedWorkingArea ObtainWorkingArea() override {
-    return ManagedWorkingArea(
-        static_cast<WorkingArea*>(new CompressionContext(type_, opts_)), this);
+    return ManagedWorkingArea(new CompressionContext(type_, opts_), this);
   }
   void ReleaseWorkingArea(WorkingArea* wa) override {
     delete static_cast<CompressionContext*>(wa);
@@ -348,6 +351,8 @@ class BuiltinCompressionManagerV1 : public CompressionManager {
 
   std::unique_ptr<Compressor> GetCompressor(const CompressionOptions& opts,
                                             CompressionType type) override {
+    // At the time of deprecating the writing of new format_version=1 files,
+    // ZSTD was the last supported built-in compression type.
     if (type > kZSTD) {
       // Unrecognized; fall back on default compression
       type = ColumnFamilyOptions{}.compression;
@@ -363,6 +368,10 @@ class BuiltinCompressionManagerV1 : public CompressionManager {
     return std::shared_ptr<Decompressor>(shared_from_this(), &decompressor_);
   }
 
+  bool SupportsCompressionType(CompressionType type) const override {
+    return CompressionTypeSupported(type);
+  }
+
  protected:
   BuiltinDecompressorV1 decompressor_;
 };
@@ -665,6 +674,41 @@ class BuiltinDecompressorV2 : public Decompressor {
   }
 };
 
+class BuiltinDecompressorV2SnappyOnly : public BuiltinDecompressorV2 {
+ public:
+  const char* Name() const override {
+    return "BuiltinDecompressorV2SnappyOnly";
+  }
+
+  Status ExtractUncompressedSize(Args& args) override {
+    assert(args.compression_type == kSnappyCompression);
+#ifdef SNAPPY
+    size_t uncompressed_length = 0;
+    if (!snappy::GetUncompressedLength(args.compressed_data.data(),
+                                       args.compressed_data.size(),
+                                       &uncompressed_length)) {
+      return Status::Corruption("Error reading snappy compressed length");
+    }
+    args.uncompressed_size = uncompressed_length;
+    return Status::OK();
+#else
+    return Status::NotSupported("Snappy not supported in this build");
+#endif
+  }
+
+  Status DecompressBlock(const Args& args, char* uncompressed_output) override {
+    assert(args.compression_type == kSnappyCompression);
+    return Snappy_DecompressBlock(args, uncompressed_output);
+  }
+
+  Status MaybeCloneForDict(const Slice&,
+                           std::unique_ptr<Decompressor>* out) override {
+    // NOTE: quietly ignores the dictionary (for compatibility)
+    *out = std::make_unique<BuiltinDecompressorV2SnappyOnly>();
+    return Status::OK();
+  }
+};
+
 class BuiltinDecompressorV2WithDict : public BuiltinDecompressorV2 {
  public:
   explicit BuiltinDecompressorV2WithDict(const Slice& dict) : dict_(dict) {}
@@ -824,7 +868,7 @@ class BuiltinCompressionManagerV2 : public CompressionManager {
       // No acceptable compression ratio => no compression
       return nullptr;
     }
-    if (type > kZSTD) {
+    if (type > kLastBuiltinCompression) {
       // Unrecognized; fall back on default compression
       type = ColumnFamilyOptions{}.compression;
     }
@@ -851,16 +895,40 @@ class BuiltinCompressionManagerV2 : public CompressionManager {
   std::shared_ptr<Decompressor> GetDecompressorForTypes(
       const CompressionType* types_begin,
       const CompressionType* types_end) override {
-    if (std::find(types_begin, types_end, kZSTD)) {
+    if (types_begin == types_end) {
+      return nullptr;
+    } else if (types_begin + 1 == types_end &&
+               *types_begin == kSnappyCompression) {
+      return GetSnappyDecompressor();
+    } else if (std::find(types_begin, types_end, kZSTD)) {
       return GetZstdDecompressor();
     } else {
       return GetGeneralDecompressor();
     }
   }
+  std::shared_ptr<Decompressor> GetDecompressorForCompressor(
+      const Compressor& compressor) override {
+#ifdef ROCKSDB_USE_RTTI
+    // To be extra safe, only optimize here if we are certain we are not
+    // looking at a wrapped compressor, so that we are sure it only uses that
+    // one compression type.
+    if (dynamic_cast<const BuiltinCompressorV2*>(&compressor)) {
+      CompressionType type = compressor.GetPreferredCompressionType();
+      return GetDecompressorForTypes(&type, &type + 1);
+    }
+#endif
+    // Fallback
+    return CompressionManager::GetDecompressorForCompressor(compressor);
+  }
+
+  bool SupportsCompressionType(CompressionType type) const override {
+    return CompressionTypeSupported(type);
+  }
 
  protected:
   BuiltinDecompressorV2 decompressor_;
   BuiltinDecompressorV2OptimizeZstd zstd_decompressor_;
+  BuiltinDecompressorV2SnappyOnly snappy_decompressor_;
 
   inline std::shared_ptr<Decompressor> GetGeneralDecompressor() {
     return std::shared_ptr<Decompressor>(shared_from_this(), &decompressor_);
@@ -870,6 +938,11 @@ class BuiltinCompressionManagerV2 : public CompressionManager {
     return std::shared_ptr<Decompressor>(shared_from_this(),
                                          &zstd_decompressor_);
   }
+
+  inline std::shared_ptr<Decompressor> GetSnappyDecompressor() {
+    return std::shared_ptr<Decompressor>(shared_from_this(),
+                                         &snappy_decompressor_);
+  }
 };
 
 const std::shared_ptr<BuiltinCompressionManagerV1>
@@ -882,7 +955,7 @@ const std::shared_ptr<BuiltinCompressionManagerV2>
 }  // namespace
 
 Status CompressionManager::CreateFromString(
-    const ConfigOptions& /*config_options*/, const std::string& id,
+    const ConfigOptions& config_options, const std::string& id,
     std::shared_ptr<CompressionManager>* result) {
   if (id == kNullptrString || id.empty()) {
     result->reset();
@@ -897,20 +970,27 @@ Status CompressionManager::CreateFromString(
              id.compare(kBuiltinCompressionManagerV2->Name()) == 0) {
     *result = kBuiltinCompressionManagerV2;
     return Status::OK();
+  } else if (config_options.ignore_unsupported_options) {
+    return Status::OK();
   } else {
     return Status::NotFound("Compatible compression manager for \"" + id +
                             "\"");
   }
 }
 
-Status CompressionManager::FindCompatibleCompressionManager(
-    Slice compatibility_name, std::shared_ptr<CompressionManager>* out) {
+std::shared_ptr<CompressionManager>
+CompressionManager::FindCompatibleCompressionManager(Slice compatibility_name) {
   if (compatibility_name.compare(CompatibilityName()) == 0) {
-    *out = shared_from_this();
-    return Status::OK();
+    return shared_from_this();
   } else {
-    return CreateFromString(ConfigOptions(), compatibility_name.ToString(),
-                            out);
+    std::shared_ptr<CompressionManager> out;
+    Status s =
+        CreateFromString(ConfigOptions(), compatibility_name.ToString(), &out);
+    if (s.ok()) {
+      return out;
+    } else {
+      return nullptr;
+    }
   }
 }
 
diff --git a/util/compression.h b/util/compression.h
index 8f975b2eef0f..ef09a33c19d3 100644
--- a/util/compression.h
+++ b/util/compression.h
@@ -28,6 +28,7 @@
 #include "table/block_based/block_type.h"
 #include "test_util/sync_point.h"
 #include "util/atomic.h"
+#include "util/cast_util.h"
 #include "util/coding.h"
 #include "util/compression_context_cache.h"
 #include "util/string_util.h"
@@ -697,8 +698,7 @@ inline bool CompressionTypeSupported(CompressionType compression_type) {
       return XPRESS_Supported();
     case kZSTD:
       return ZSTD_Supported();
-    default:
-      assert(false);
+    default:  // Including custom compression types
       return false;
   }
 }
@@ -726,8 +726,7 @@ inline bool DictCompressionTypeSupported(CompressionType compression_type) {
       // NB: dictionary supported since 0.5.0. See ZSTD_VERSION_NUMBER check
       // above.
       return ZSTD_Supported();
-    default:
-      assert(false);
+    default:  // Including custom compression types
       return false;
   }
 }
@@ -753,9 +752,10 @@ inline std::string CompressionTypeToString(CompressionType compression_type) {
       return "ZSTD";
     case kDisableCompressionOption:
       return "DisableOption";
-    default:
-      assert(false);
-      return "";
+    default: {
+      char c = lossless_cast<char>(compression_type);
+      return "Custom" + Slice(&c, 1).ToString(/*hex=*/true);
+    }
   }
 }
 
diff --git a/util/simple_mixed_compressor.cc b/util/simple_mixed_compressor.cc
index 4270e1e37543..054a49e19979 100644
--- a/util/simple_mixed_compressor.cc
+++ b/util/simple_mixed_compressor.cc
@@ -55,6 +55,11 @@ std::unique_ptr<Compressor> MultiCompressorWrapper::MaybeCloneSpecialized(
                                                     std::move(dict_samples));
 }
 
+// RandomMixedCompressor implementation
+const char* RandomMixedCompressor::Name() const {
+  return "RandomMixedCompressor";
+}
+
 Status RandomMixedCompressor::CompressBlock(
     Slice uncompressed_data, std::string* compressed_output,
     CompressionType* out_compression_type, ManagedWorkingArea* wa) {
@@ -66,8 +71,7 @@ Status RandomMixedCompressor::CompressBlock(
 }
 
 const char* RandomMixedCompressionManager::Name() const {
-  return wrapped_->Name();
-  // return "RandomMixedCompressionManager";
+  return "RandomMixedCompressionManager";
 }
 
 std::unique_ptr<Compressor> RandomMixedCompressionManager::GetCompressorForSST(
@@ -79,6 +83,10 @@ std::unique_ptr<Compressor> RandomMixedCompressionManager::GetCompressorForSST(
 }
 
 // RoundRobinCompressor implementation
+const char* RoundRobinCompressor::Name() const {
+  return "RoundRobinCompressor";
+}
+
 Status RoundRobinCompressor::CompressBlock(
     Slice uncompressed_data, std::string* compressed_output,
     CompressionType* out_compression_type, ManagedWorkingArea* wa) {
@@ -92,10 +100,7 @@ Status RoundRobinCompressor::CompressBlock(
 RelaxedAtomic<uint64_t> RoundRobinCompressor::block_counter{0};
 
 // RoundRobinManager implementation
-const char* RoundRobinManager::Name() const {
-  // return "RoundRobinManager";
-  return wrapped_->Name();
-}
+const char* RoundRobinManager::Name() const { return "RoundRobinManager"; }
 
 std::unique_ptr<Compressor> RoundRobinManager::GetCompressorForSST(
     const FilterBuildingContext& context, const CompressionOptions& opts,
diff --git a/util/simple_mixed_compressor.h b/util/simple_mixed_compressor.h
index 09f71160d333..fd9e1cf3a7a8 100644
--- a/util/simple_mixed_compressor.h
+++ b/util/simple_mixed_compressor.h
@@ -8,7 +8,6 @@
 
 #pragma once
 #include <memory>
-#include <mutex>
 #include <vector>
 
 #include "compression.h"
@@ -35,6 +34,7 @@ class MultiCompressorWrapper : public Compressor {
 
 struct RandomMixedCompressor : public MultiCompressorWrapper {
   using MultiCompressorWrapper::MultiCompressorWrapper;
+  const char* Name() const override;
   Status CompressBlock(Slice uncompressed_data, std::string* compressed_output,
                        CompressionType* out_compression_type,
                        ManagedWorkingArea* wa) override;
@@ -50,6 +50,7 @@ class RandomMixedCompressionManager : public CompressionManagerWrapper {
 
 struct RoundRobinCompressor : public MultiCompressorWrapper {
   using MultiCompressorWrapper::MultiCompressorWrapper;
+  const char* Name() const override;
   Status CompressBlock(Slice uncompressed_data, std::string* compressed_output,
                        CompressionType* out_compression_type,
                        ManagedWorkingArea* wa) override;

From d27b47f4394d9ebf4ab151f3132d3bbdfa850a99 Mon Sep 17 00:00:00 2001
From: anand76 <anand1976@users.noreply.github.com>
Date: Mon, 16 Jun 2025 17:02:40 -0700
Subject: [PATCH 138/500] Add NewExternalTableFactoryAsUniquePtr API (#13694)

Summary:
The Object registry requires object to be allocated as std::unique_ptr. Hence we provide a new API for external table plugins to allocate and return a unique_ptr ExternalTableFactory wrapper.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13694

Reviewed By: jaykorean

Differential Revision: D76767974

Pulled By: anand1976

fbshipit-source-id: ac59c523a11679ca7c9f0b280325c7873c6b4c07
---
 include/rocksdb/external_table.h                           | 4 ++++
 table/external_table.cc                                    | 7 +++++++
 table/table_test.cc                                        | 3 ++-
 .../public_api_changes/external_table_unique_ptr.md        | 1 +
 4 files changed, 14 insertions(+), 1 deletion(-)
 create mode 100644 unreleased_history/public_api_changes/external_table_unique_ptr.md

diff --git a/include/rocksdb/external_table.h b/include/rocksdb/external_table.h
index 4bfad214e253..bb7c3cb3d53d 100644
--- a/include/rocksdb/external_table.h
+++ b/include/rocksdb/external_table.h
@@ -272,4 +272,8 @@ class ExternalTableFactory : public Customizable {
 std::shared_ptr<TableFactory> NewExternalTableFactory(
     std::shared_ptr<ExternalTableFactory> inner_factory);
 
+// A unique_ptr version of the above
+std::unique_ptr<TableFactory> NewExternalTableFactoryAsUniquePtr(
+    std::shared_ptr<ExternalTableFactory> inner_factory);
+
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/table/external_table.cc b/table/external_table.cc
index 70abe82dba4c..6900bf108254 100644
--- a/table/external_table.cc
+++ b/table/external_table.cc
@@ -480,4 +480,11 @@ std::shared_ptr<TableFactory> NewExternalTableFactory(
   return res;
 }
 
+std::unique_ptr<TableFactory> NewExternalTableFactoryAsUniquePtr(
+    std::shared_ptr<ExternalTableFactory> inner_factory) {
+  std::unique_ptr<TableFactory> res;
+  res = std::make_unique<ExternalTableFactoryAdapter>(std::move(inner_factory));
+  return res;
+}
+
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/table/table_test.cc b/table/table_test.cc
index 94be08ddcec3..08163cfc979e 100644
--- a/table/table_test.cc
+++ b/table/table_test.cc
@@ -7007,7 +7007,8 @@ TEST_F(ExternalTableTest, SstReaderTest) {
   std::shared_ptr<ExternalTableFactory> factory =
       std::make_shared<DummyExternalTableFactory>(
           /*support_property_block=*/false);
-  options.table_factory = NewExternalTableFactory(factory);
+  options.table_factory.reset(
+      NewExternalTableFactoryAsUniquePtr(factory).release());
 
   std::unique_ptr<SstFileWriter> writer;
   writer.reset(new SstFileWriter(EnvOptions(), options));
diff --git a/unreleased_history/public_api_changes/external_table_unique_ptr.md b/unreleased_history/public_api_changes/external_table_unique_ptr.md
new file mode 100644
index 000000000000..4abefe625b62
--- /dev/null
+++ b/unreleased_history/public_api_changes/external_table_unique_ptr.md
@@ -0,0 +1 @@
+Add the NewExternalTableFactoryAsUniquePtr() API to return a std::unique_ptr

From c6cfbc29190db976130eb1bda1375da74584a317 Mon Sep 17 00:00:00 2001
From: Sujit Maharjan <sujitmaharjan@meta.com>
Date: Tue, 17 Jun 2025 09:47:47 -0700
Subject: [PATCH 139/500] clang tidy warning lsh == rhs. lhs or rhs not
 initialized. (#13703)

Summary:
**Summary**:

Clang tidy was throwing error that the gtest assertion EXPECT_EQ was being carried out variables that was not initialized.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13703

Test Plan:
Ran the clang-tidy operations to make sure the same error does not appear.
```bash
  CC=clang-10 CXX=clang++-10 ROCKSDB_DISABLE_ALIGNED_NEW=1 CLANG_ANALYZER="/usr/bin/clang++-10" CLANG_SCAN_BUILD=scan-build-10 USE_CLANG=1 make V=1 -j32 analyze
```

Reviewed By: hx235, pdillinger

Differential Revision: D76777988

Pulled By: shubhajeet

fbshipit-source-id: b9bfe26a2264d4c21224ab53a0b0307596d7f49d
---
 util/compression_test.cc | 76 ++++++++++++++++++++++------------------
 1 file changed, 42 insertions(+), 34 deletions(-)

diff --git a/util/compression_test.cc b/util/compression_test.cc
index e00c6813fb40..5df440c44a90 100644
--- a/util/compression_test.cc
+++ b/util/compression_test.cc
@@ -32,7 +32,7 @@ class AutoSkipTestFlushBlockPolicy : public FlushBlockPolicy {
         statistics_(statistics) {}
 
   bool Update(const Slice& /*key*/, const Slice& /*value*/) override {
-    auto multiple_of_10 = num_keys_ / window_;
+    auto nth_window = num_keys_ / window_;
     if (data_block_builder_.empty()) {
       // First key in this block
       return false;
@@ -49,9 +49,10 @@ class AutoSkipTestFlushBlockPolicy : public FlushBlockPolicy {
       };
       SyncPoint::GetInstance()->DisableProcessing();
       SyncPoint::GetInstance()->ClearAllCallBacks();
-      // We force exploration to set the predicted rejection ratio and then test
-      // that the prediction is exploited
-      if (multiple_of_10 % 2 == 0) {
+      // We force exploration to set the predicted rejection ratio for odd
+      // window and then test that the prediction is exploited in the even
+      // window
+      if (nth_window % 2 == 0) {
         SyncPoint::GetInstance()->SetCallBack(
             "AutoSkipCompressorWrapper::CompressBlock::exploitOrExplore",
             set_exploration);
@@ -72,36 +73,43 @@ class AutoSkipTestFlushBlockPolicy : public FlushBlockPolicy {
         bypassed_percentage = static_cast<int>(bypassed_count * 100 / total);
         compressed_percentage =
             static_cast<int>(compressed_count * 100 / total);
-      }
-      // use mulitple of 10 to get correct assertion
-      switch (multiple_of_10) {
-        case 1:
-          // This is exploration stage in which we set the rejection ratio to
-          // 0.6
-          EXPECT_EQ(rejection_percentage, 60);
-          EXPECT_EQ(bypassed_percentage, 0);
-          EXPECT_EQ(compressed_percentage, 40);
-          break;
-        case 2:
-          // With the rejection ratio set to 0.6 all the blocks should be
-          // bypassed in next window
-          EXPECT_EQ(rejection_percentage, 0);
-          EXPECT_EQ(bypassed_percentage, 100);
-          EXPECT_EQ(compressed_percentage, 0);
-          break;
-        case 3:
-          // This is exploration stage in which we set the rejection ratio to
-          // 0.4
-          EXPECT_EQ(rejection_percentage, 40);
-          EXPECT_EQ(bypassed_percentage, 0);
-          EXPECT_EQ(compressed_percentage, 60);
-          break;
-        case 4:
-          // With the rejection ratio set to 0.4 all the blocks should be
-          // attempted to be compressed
-          EXPECT_EQ(rejection_percentage, 60);
-          EXPECT_EQ(bypassed_percentage, 0);
-          EXPECT_EQ(compressed_percentage, 40);
+        // use nth window to detect test cases and set the expected
+        switch (nth_window) {
+          case 1:
+            // In first window we only explore and thus here we verify that the
+            // correct prediction has been made by the end of the window
+            // Since 6 of 10 blocks are compression unfriendly, the predicted
+            // rejection ratio should be 60%
+            EXPECT_EQ(rejection_percentage, 60);
+            EXPECT_EQ(bypassed_percentage, 0);
+            EXPECT_EQ(compressed_percentage, 40);
+            break;
+          case 2:
+            // With the rejection ratio set to 0.6 all the blocks should be
+            // bypassed in next window
+            EXPECT_EQ(rejection_percentage, 0);
+            EXPECT_EQ(bypassed_percentage, 100);
+            EXPECT_EQ(compressed_percentage, 0);
+            break;
+          case 3:
+            // In third window we only explore and verify that the correct
+            // prediction has been made by the end of the window
+            // since 4 of 10 blocks are compression ufriendly, the predicted
+            // rejection ratio should be 40%
+            EXPECT_EQ(rejection_percentage, 40);
+            EXPECT_EQ(bypassed_percentage, 0);
+            EXPECT_EQ(compressed_percentage, 60);
+            break;
+          case 4:
+            // With the rejection ratio set to 0.4 all the blocks should be
+            // attempted to be compressed
+            // 6 of 10 blocks are compression unfriendly and thus should be
+            // rejected 4 of 10 blocks are compression friendly and thus should
+            // be compressed
+            EXPECT_EQ(rejection_percentage, 60);
+            EXPECT_EQ(bypassed_percentage, 0);
+            EXPECT_EQ(compressed_percentage, 40);
+        }
       }
     }
     num_keys_++;

From 25837eeee5ccdb2e627422d5afa9ef20f52072e2 Mon Sep 17 00:00:00 2001
From: anand76 <anand1976@users.noreply.github.com>
Date: Tue, 17 Jun 2025 10:50:33 -0700
Subject: [PATCH 140/500] Change NewExternalTableFactory to return unique_ptr
 (#13705)

Summary:
Change NewExternalTableFactory API and remove the just added NewExternalTableFactoryAsUniquePtr.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13705

Reviewed By: jaykorean

Differential Revision: D76827580

Pulled By: anand1976

fbshipit-source-id: 251ad0e498b62059b8417ff967ca74146de43e2f
---
 include/rocksdb/external_table.h                         | 6 +-----
 table/external_table.cc                                  | 9 +--------
 table/table_test.cc                                      | 3 +--
 .../public_api_changes/external_table_unique_ptr.md      | 2 +-
 4 files changed, 4 insertions(+), 16 deletions(-)

diff --git a/include/rocksdb/external_table.h b/include/rocksdb/external_table.h
index bb7c3cb3d53d..844ba9d96b85 100644
--- a/include/rocksdb/external_table.h
+++ b/include/rocksdb/external_table.h
@@ -269,11 +269,7 @@ class ExternalTableFactory : public Customizable {
 
 // Allocate a TableFactory that wraps around an ExternalTableFactory. Use this
 // to allocate and set in ColumnFamilyOptions::table_factory.
-std::shared_ptr<TableFactory> NewExternalTableFactory(
-    std::shared_ptr<ExternalTableFactory> inner_factory);
-
-// A unique_ptr version of the above
-std::unique_ptr<TableFactory> NewExternalTableFactoryAsUniquePtr(
+std::unique_ptr<TableFactory> NewExternalTableFactory(
     std::shared_ptr<ExternalTableFactory> inner_factory);
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/table/external_table.cc b/table/external_table.cc
index 6900bf108254..a85073737bf8 100644
--- a/table/external_table.cc
+++ b/table/external_table.cc
@@ -473,14 +473,7 @@ class ExternalTableFactoryAdapter : public TableFactory {
 
 }  // namespace
 
-std::shared_ptr<TableFactory> NewExternalTableFactory(
-    std::shared_ptr<ExternalTableFactory> inner_factory) {
-  std::shared_ptr<TableFactory> res;
-  res.reset(new ExternalTableFactoryAdapter(std::move(inner_factory)));
-  return res;
-}
-
-std::unique_ptr<TableFactory> NewExternalTableFactoryAsUniquePtr(
+std::unique_ptr<TableFactory> NewExternalTableFactory(
     std::shared_ptr<ExternalTableFactory> inner_factory) {
   std::unique_ptr<TableFactory> res;
   res = std::make_unique<ExternalTableFactoryAdapter>(std::move(inner_factory));
diff --git a/table/table_test.cc b/table/table_test.cc
index 08163cfc979e..94be08ddcec3 100644
--- a/table/table_test.cc
+++ b/table/table_test.cc
@@ -7007,8 +7007,7 @@ TEST_F(ExternalTableTest, SstReaderTest) {
   std::shared_ptr<ExternalTableFactory> factory =
       std::make_shared<DummyExternalTableFactory>(
           /*support_property_block=*/false);
-  options.table_factory.reset(
-      NewExternalTableFactoryAsUniquePtr(factory).release());
+  options.table_factory = NewExternalTableFactory(factory);
 
   std::unique_ptr<SstFileWriter> writer;
   writer.reset(new SstFileWriter(EnvOptions(), options));
diff --git a/unreleased_history/public_api_changes/external_table_unique_ptr.md b/unreleased_history/public_api_changes/external_table_unique_ptr.md
index 4abefe625b62..29a4a98cba26 100644
--- a/unreleased_history/public_api_changes/external_table_unique_ptr.md
+++ b/unreleased_history/public_api_changes/external_table_unique_ptr.md
@@ -1 +1 @@
-Add the NewExternalTableFactoryAsUniquePtr() API to return a std::unique_ptr
+Change NewExternalTableFactory to return a unique_ptr instead of shared_ptr.

From 05996cd497cdd495edf05b35494da04c560ccafa Mon Sep 17 00:00:00 2001
From: Sujit Maharjan <sujitmaharjan@meta.com>
Date: Tue, 17 Jun 2025 11:28:33 -0700
Subject: [PATCH 141/500] crash test and other refactoring (#13704)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Summary:
**Summary:**
AddressSanitizer failed complaining stack-use-after-return while executing AutoSkipCompressorWrapper::CompressBlock. This was caused because the AutoSkipCompressorWrapper was storing const reference pointer to Compression Options. It seems like the life time of the Compression Options can be shorter than the AutoSkipCompressorWrapper thus we need to copy the Compression Options and store it in AutoSkipCompressorWrapper.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13704

Test Plan:
Run the crashtest again to verify that we don’t encounter the issue again.
```bash
make clean
COMPILE_WITH_ASAN=1 make -j80 dbg
mkdir -p /dev/shm/rocksdb_test/rocksdb_crashtest_blackbox
mkdir -p dev/shm/rocksdb_test/rocksdb_crashtest_expected
./db_stress --WAL_size_limit_MB=0 --WAL_ttl_seconds=0 --acquire_snapshot_one_in=100 --adaptive_readahead=0 --adm_policy=2 --advise_random_on_open=1 --allow_data_in_errors=True --allow_fallocate=1 --allow_setting_blob_options_dynamically=1 --allow_unprepared_value=1 --async_io=0 --auto_readahead_size=1 --auto_refresh_iterator_with_snapshot=0 --avoid_flush_during_recovery=0 --avoid_flush_during_shutdown=1 --avoid_unnecessary_blocking_io=1 --backup_max_size=104857600 --backup_one_in=100000 --batch_protection_bytes_per_key=0 --bgerror_resume_retry_interval=1000000 --blob_cache_size=2097152 --blob_compaction_readahead_size=0 --blob_compression_type=snappy --blob_file_size=16777216 --blob_file_starting_level=2 --blob_garbage_collection_age_cutoff=0.5 --blob_garbage_collection_force_threshold=0.5 --block_align=0 --block_protection_bytes_per_key=2 --block_size=16384 --bloom_before_level=2147483646 --bloom_bits=9.703060295811829 --bottommost_compression_type=disable --bottommost_file_compaction_delay=0 --bytes_per_sync=262144 --cache_index_and_filter_blocks=1 --cache_index_and_filter_blocks_with_high_priority=0 --cache_size=33554432 --cache_type=fixed_hyper_clock_cache --charge_compression_dictionary_building_buffer=0 --charge_file_metadata=1 --charge_filter_construction=1 --charge_table_reader=0 --check_multiget_consistency=0 --check_multiget_entity_consistency=0 --checkpoint_one_in=1000000 --checksum_type=kXXH3 --clear_column_family_one_in=0 --compact_files_one_in=1000 --compact_range_one_in=1000000 --compaction_pri=1 --compaction_readahead_size=1048576 --compaction_style=0 --compaction_ttl=0 --compress_format_version=1 --compressed_secondary_cache_size=16777216 --compression_checksum=0 --compression_manager=autoskip --compression_max_dict_buffer_bytes=0 --compression_max_dict_bytes=0 --compression_parallel_threads=1 --compression_type=zlib --compression_use_zstd_dict_trainer=1 --compression_zstd_max_train_bytes=0 --continuous_verification_interval=0 --daily_offpeak_time_utc= --data_block_index_type=1 --db=/dev/shm/rocksdb_test/rocksdb_crashtest_blackbox --db_write_buffer_size=1048576 --decouple_partitioned_filters=1 --default_temperature=kUnknown --default_write_temperature=kWarm --delete_obsolete_files_period_micros=30000000 --delpercent=4 --delrangepercent=1 --destroy_db_initially=0 --detect_filter_construct_corruption=0 --disable_file_deletions_one_in=10000 --disable_manual_compaction_one_in=1000000 --disable_wal=0 --dump_malloc_stats=0 --enable_blob_files=1 --enable_blob_garbage_collection=1 --enable_checksum_handoff=1 --enable_compaction_filter=0 --enable_custom_split_merge=0 --enable_do_not_compress_roles=0 --enable_index_compression=1 --enable_memtable_insert_with_hint_prefix_extractor=0 --enable_pipelined_write=0 --enable_remote_compaction=0 --enable_sst_partitioner_factory=0 --enable_thread_tracking=1 --enable_write_thread_adaptive_yield=0 --error_recovery_with_no_fault_injection=0 --exclude_wal_from_write_fault_injection=0 --expected_values_dir=/dev/shm/rocksdb_test/rocksdb_crashtest_expected --fifo_allow_compaction=1 --file_checksum_impl=xxh64 --file_temperature_age_thresholds= --fill_cache=1 --flush_one_in=1000 --format_version=2 --get_all_column_family_metadata_one_in=10000 --get_current_wal_file_one_in=0 --get_live_files_apis_one_in=1000000 --get_properties_of_all_tables_one_in=100000 --get_property_one_in=100000 --get_sorted_wal_files_one_in=0 --hard_pending_compaction_bytes_limit=274877906944 --high_pri_pool_ratio=0 --index_block_restart_interval=15 --index_shortening=2 --index_type=2 --ingest_external_file_one_in=1000 --ingest_wbwi_one_in=0 --initial_auto_readahead_size=0 --inplace_update_support=0 --iterpercent=10 --key_len_percent_dist=1,30,69 --key_may_exist_one_in=100000 --last_level_temperature=kHot --level_compaction_dynamic_level_bytes=0 --lock_wal_one_in=0 --log_file_time_to_roll=0 --log_readahead_size=16777216 --long_running_snapshots=1 --low_pri_pool_ratio=0 --lowest_used_cache_tier=2 --manifest_preallocation_size=0 --manual_wal_flush_one_in=0 --mark_for_compaction_one_file_in=10 --max_auto_readahead_size=524288 --max_background_compactions=20 --max_bytes_for_level_base=10485760 --max_key=100000 --max_key_len=3 --max_log_file_size=1048576 --max_manifest_file_size=1073741824 --max_sequential_skip_in_iterations=1 --max_total_wal_size=0 --max_write_batch_group_size_bytes=16 --max_write_buffer_number=10 --max_write_buffer_size_to_maintain=1048576 --memtable_avg_op_scan_flush_trigger=0 --memtable_insert_hint_per_batch=1 --memtable_max_range_deletions=0 --memtable_op_scan_flush_trigger=1000 --memtable_prefix_bloom_size_ratio=0.01 --memtable_protection_bytes_per_key=4 --memtable_whole_key_filtering=1 --memtablerep=skip_list --metadata_charge_policy=0 --metadata_read_fault_one_in=0 --metadata_write_fault_one_in=1000 --min_blob_size=16 --min_write_buffer_number_to_merge=2 --mmap_read=1 --mock_direct_io=False --nooverwritepercent=1 --num_bottom_pri_threads=1 --num_file_reads_for_auto_readahead=1 --open_files=100 --open_metadata_read_fault_one_in=0 --open_metadata_write_fault_one_in=0 --open_read_fault_one_in=0 --open_write_fault_one_in=16 --ops_per_thread=100000000 --optimize_filters_for_hits=0 --optimize_filters_for_memory=0 --optimize_multiget_for_io=1 --paranoid_file_checks=0 --paranoid_memory_checks=0 --partition_filters=1 --partition_pinning=3 --pause_background_one_in=1000000 --periodic_compaction_seconds=0 --prefix_size=7 --prefixpercent=5 --prepopulate_blob_cache=0 --prepopulate_block_cache=1 --preserve_internal_time_seconds=0 --progress_reports=0 --promote_l0_one_in=0 --read_amp_bytes_per_bit=0 --read_fault_one_in=0 --readahead_size=16384 --readpercent=45 --recycle_log_file_num=0 --reopen=0 --report_bg_io_stats=0 --reset_stats_one_in=10000 --sample_for_compression=0 --secondary_cache_fault_one_in=32 --secondary_cache_uri= --set_options_one_in=1000 --skip_stats_update_on_db_open=0 --snapshot_hold_ops=100000 --soft_pending_compaction_bytes_limit=1048576 --sqfc_name=bar --sqfc_version=2 --sst_file_manager_bytes_per_sec=0 --sst_file_manager_bytes_per_truncate=0 --stats_dump_period_sec=0 --stats_history_buffer_size=1048576 --strict_bytes_per_sync=0 --subcompactions=3 --sync=0 --sync_fault_injection=0 --table_cache_numshardbits=-1 --target_file_size_base=524288 --target_file_size_multiplier=2 --test_batches_snapshots=0 --test_ingest_standalone_range_deletion_one_in=10 --top_level_index_pinning=2 --uncache_aggressiveness=5100 --universal_max_read_amp=0 --universal_reduce_file_locking=1 --unpartitioned_pinning=2 --use_adaptive_mutex=1 --use_adaptive_mutex_lru=0 --use_attribute_group=1 --use_blob_cache=1 --use_delta_encoding=0 --use_direct_io_for_flush_and_compaction=0 --use_direct_reads=0 --use_full_merge_v1=0 --use_get_entity=0 --use_merge=0 --use_multi_cf_iterator=1 --use_multi_get_entity=0 --use_multiget=1 --use_put_entity_one_in=10 --use_shared_block_and_blob_cache=0 --use_sqfc_for_range_queries=1 --use_timed_put_one_in=0 --use_write_buffer_manager=1 --user_timestamp_size=0 --value_size_mult=32 --verification_only=0 --verify_checksum=1 --verify_checksum_one_in=1000 --verify_compression=0 --verify_db_one_in=100000 --verify_file_checksums_one_in=1000000 --verify_iterator_with_expected_state_one_in=5 --verify_sst_unique_id_in_manifest=1 --wal_bytes_per_sync=0 --wal_compression=zstd --write_buffer_size=1048576 --write_dbid_to_manifest=1 --write_fault_one_in=128 --write_identity_file=0 --writepercent=35
```

Reviewed By: hx235

Differential Revision: D76826904

Pulled By: shubhajeet

fbshipit-source-id: c4a1522d3fed37bdd3e711f4c99c16d7bd1d794f
---
 CMakeLists.txt               |  4 ++--
 tools/db_crashtest.py        |  2 +-
 util/auto_skip_compressor.cc | 22 +++++++++-------------
 util/auto_skip_compressor.h  |  6 ++----
 4 files changed, 14 insertions(+), 20 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b760890a3100..b32049758221 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -874,13 +874,13 @@ set(SOURCES
         trace_replay/trace_record.cc
         trace_replay/trace_replay.cc
         util/async_file_reader.cc
+        util/auto_skip_compressor.cc
         util/cleanable.cc
         util/coding.cc
         util/compaction_job_stats_impl.cc
         util/comparator.cc
         util/compression.cc
         util/simple_mixed_compressor.cc
-        util/auto_skip_compressor.cc
         util/compression_context_cache.cc
         util/concurrent_task_limiter_impl.cc
         util/crc32c.cc
@@ -1447,7 +1447,6 @@ if(WITH_TESTS)
         table/table_test.cc
         table/block_fetcher_test.cc
         test_util/testutil_test.cc
-        util/compression_test.cc
         trace_replay/block_cache_tracer_test.cc
         trace_replay/io_tracer_test.cc
         tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc
@@ -1459,6 +1458,7 @@ if(WITH_TESTS)
         util/autovector_test.cc
         util/bloom_test.cc
         util/coding_test.cc
+        util/compression_test.cc
         util/crc32c_test.cc
         util/defer_test.cc
         util/dynamic_bloom_test.cc
diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index 75b8f7aa5c37..fd9bf72ca6fe 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -1011,7 +1011,7 @@ def finalize_and_sanitize(src_params):
         dest_params["compression_type"] = "zstd"
         dest_params["bottommost_compression_type"] = "zstd"
     elif dest_params.get("compression_manager") == "autoskip":
-        # disabling compression parallel threads if mixed manager is being used as the predictor is not thread safe
+        # disabling compression parallel threads if auto skip manager is being used as the predictor is not thread safe
         dest_params["compression_parallel_threads"] = 1
         # esuring the compression is being used
         if dest_params.get("compression_type") == "none":
diff --git a/util/auto_skip_compressor.cc b/util/auto_skip_compressor.cc
index 7652f874485c..d7b79a3d17d6 100644
--- a/util/auto_skip_compressor.cc
+++ b/util/auto_skip_compressor.cc
@@ -32,9 +32,10 @@ bool CompressionRejectionProbabilityPredictor::Record(
   } else {
     compressed_count_++;
   }
-  if (attempted_compression_count() >= window_size_) {
-    pred_rejection_prob_percentage_ = static_cast<int>(
-        rejected_count_ * 100 / (compressed_count_ + rejected_count_));
+  auto attempted = attempted_compression_count();
+  if (attempted >= window_size_) {
+    pred_rejection_prob_percentage_ =
+        static_cast<int>(rejected_count_ * 100 / attempted);
     compressed_count_ = 0;
     rejected_count_ = 0;
     assert(attempted_compression_count() == 0);
@@ -42,16 +43,11 @@ bool CompressionRejectionProbabilityPredictor::Record(
   return true;
 }
 AutoSkipCompressorWrapper::AutoSkipCompressorWrapper(
-    std::unique_ptr<Compressor> compressor, const CompressionOptions& opts,
-    const CompressionType type)
+    std::unique_ptr<Compressor> compressor, const CompressionOptions& opts)
     : CompressorWrapper::CompressorWrapper(std::move(compressor)),
-      opts_(opts),
-      type_(type),
+      kOpts(opts),
       predictor_(
-          std::make_shared<CompressionRejectionProbabilityPredictor>(10)) {
-  (void)type_;
-  (void)opts_;
-}
+          std::make_shared<CompressionRejectionProbabilityPredictor>(10)) {}
 
 const char* AutoSkipCompressorWrapper::Name() const {
   return "AutoSkipCompressorWrapper";
@@ -89,7 +85,7 @@ Status AutoSkipCompressorWrapper::CompressBlockAndRecord(
   Status status = wrapped_->CompressBlock(uncompressed_data, compressed_output,
                                           out_compression_type, wa);
   // determine if it was rejected or compressed
-  predictor_->Record(uncompressed_data, compressed_output, opts_);
+  predictor_->Record(uncompressed_data, compressed_output, kOpts);
   return status;
 }
 
@@ -105,7 +101,7 @@ std::unique_ptr<Compressor> AutoSkipCompressorManager::GetCompressorForSST(
   assert(GetSupportedCompressions().size() > 1);
   assert(preferred != kNoCompression);
   return std::make_unique<AutoSkipCompressorWrapper>(
-      wrapped_->GetCompressorForSST(context, opts, preferred), opts, preferred);
+      wrapped_->GetCompressorForSST(context, opts, preferred), opts);
 }
 
 std::shared_ptr<CompressionManagerWrapper> CreateAutoSkipCompressionManager(
diff --git a/util/auto_skip_compressor.h b/util/auto_skip_compressor.h
index 4100388c4853..92247cb4d63e 100644
--- a/util/auto_skip_compressor.h
+++ b/util/auto_skip_compressor.h
@@ -37,8 +37,7 @@ class AutoSkipCompressorWrapper : public CompressorWrapper {
  public:
   const char* Name() const override;
   explicit AutoSkipCompressorWrapper(std::unique_ptr<Compressor> compressor,
-                                     const CompressionOptions& opts,
-                                     const CompressionType type);
+                                     const CompressionOptions& opts);
 
   Status CompressBlock(Slice uncompressed_data, std::string* compressed_output,
                        CompressionType* out_compression_type,
@@ -51,8 +50,7 @@ class AutoSkipCompressorWrapper : public CompressorWrapper {
                                 ManagedWorkingArea* wa);
   static constexpr int kExplorationPercentage = 10;
   static constexpr int kProbabilityCutOff = 50;
-  const CompressionOptions& opts_;
-  const CompressionType type_;
+  const CompressionOptions kOpts;
   std::shared_ptr<CompressionRejectionProbabilityPredictor> predictor_;
 };
 

From 34d8f03af4c8ed62f82709e95a262074f33b13e8 Mon Sep 17 00:00:00 2001
From: Sujit Maharjan <sujitmaharjan@meta.com>
Date: Tue, 17 Jun 2025 19:17:25 -0700
Subject: [PATCH 142/500] Moving predictor to WorkingArea to make it thread
 safe (#13706)

Summary:
**Summary:**

We need to move the Predictor to WorkingArea so that it is local to each thread and thus is thread safe.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13706

Test Plan: It should pass the test case written in ./compression_test.

Reviewed By: pdillinger

Differential Revision: D76836846

Pulled By: shubhajeet

fbshipit-source-id: 0d0170baf65f4bb95ba107fec77151e66b8a4449
---
 tools/db_crashtest.py        |  4 +---
 util/auto_skip_compressor.cc | 31 +++++++++++++++++++++++++------
 util/auto_skip_compressor.h  | 29 +++++++++++++++++++++++++++--
 3 files changed, 53 insertions(+), 11 deletions(-)

diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index fd9bf72ca6fe..debec49e126d 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -1011,9 +1011,7 @@ def finalize_and_sanitize(src_params):
         dest_params["compression_type"] = "zstd"
         dest_params["bottommost_compression_type"] = "zstd"
     elif dest_params.get("compression_manager") == "autoskip":
-        # disabling compression parallel threads if auto skip manager is being used as the predictor is not thread safe
-        dest_params["compression_parallel_threads"] = 1
-        # esuring the compression is being used
+        # ensuring the compression is being used
         if dest_params.get("compression_type") == "none":
             dest_params["compression_type"] = random.choice(
                 ["snappy", "zlib", "lz4", "lz4hc", "xpress", "zstd"]
diff --git a/util/auto_skip_compressor.cc b/util/auto_skip_compressor.cc
index d7b79a3d17d6..eadd50d00a56 100644
--- a/util/auto_skip_compressor.cc
+++ b/util/auto_skip_compressor.cc
@@ -56,20 +56,30 @@ const char* AutoSkipCompressorWrapper::Name() const {
 Status AutoSkipCompressorWrapper::CompressBlock(
     Slice uncompressed_data, std::string* compressed_output,
     CompressionType* out_compression_type, ManagedWorkingArea* wa) {
+  // Check if the managed working area is provided or owned by this object.
+  // If not, bypass auto-skip logic since the working area lacks a predictor to
+  // record or make necessary decisions to compress or bypass compression of the
+  // block
+  if (wa == nullptr || wa->owner() != this) {
+    return wrapped_->CompressBlock(uncompressed_data, compressed_output,
+                                   out_compression_type, wa);
+  }
   bool exploration =
       Random::GetTLSInstance()->PercentTrue(kExplorationPercentage);
   TEST_SYNC_POINT_CALLBACK(
       "AutoSkipCompressorWrapper::CompressBlock::exploitOrExplore",
       &exploration);
+  auto autoskip_wa = static_cast<AutoSkipWorkingArea*>(wa->get());
   if (exploration) {
     return CompressBlockAndRecord(uncompressed_data, compressed_output,
-                                  out_compression_type, wa);
+                                  out_compression_type, autoskip_wa);
   } else {
-    auto prediction = predictor_->Predict();
+    auto predictor_ptr = autoskip_wa->predictor;
+    auto prediction = predictor_ptr->Predict();
     if (prediction <= kProbabilityCutOff) {
       // decide to compress
       return CompressBlockAndRecord(uncompressed_data, compressed_output,
-                                    out_compression_type, wa);
+                                    out_compression_type, autoskip_wa);
     } else {
       // decide to bypass compression
       *out_compression_type = kNoCompression;
@@ -79,13 +89,22 @@ Status AutoSkipCompressorWrapper::CompressBlock(
   return Status::OK();
 }
 
+Compressor::ManagedWorkingArea AutoSkipCompressorWrapper::ObtainWorkingArea() {
+  auto wrap_wa = wrapped_->ObtainWorkingArea();
+  return ManagedWorkingArea(new AutoSkipWorkingArea(std::move(wrap_wa)), this);
+}
+void AutoSkipCompressorWrapper::ReleaseWorkingArea(WorkingArea* wa) {
+  delete static_cast<AutoSkipWorkingArea*>(wa);
+}
+
 Status AutoSkipCompressorWrapper::CompressBlockAndRecord(
     Slice uncompressed_data, std::string* compressed_output,
-    CompressionType* out_compression_type, ManagedWorkingArea* wa) {
+    CompressionType* out_compression_type, AutoSkipWorkingArea* wa) {
   Status status = wrapped_->CompressBlock(uncompressed_data, compressed_output,
-                                          out_compression_type, wa);
+                                          out_compression_type, &(wa->wrapped));
   // determine if it was rejected or compressed
-  predictor_->Record(uncompressed_data, compressed_output, kOpts);
+  auto predictor_ptr = wa->predictor;
+  predictor_ptr->Record(uncompressed_data, compressed_output, kOpts);
   return status;
 }
 
diff --git a/util/auto_skip_compressor.h b/util/auto_skip_compressor.h
index 92247cb4d63e..d93a4f4ade41 100644
--- a/util/auto_skip_compressor.h
+++ b/util/auto_skip_compressor.h
@@ -13,7 +13,6 @@
 
 namespace ROCKSDB_NAMESPACE {
 // Predict rejection probability using a moving window approach
-// This class is not thread safe
 class CompressionRejectionProbabilityPredictor {
  public:
   CompressionRejectionProbabilityPredictor(int window_size)
@@ -33,6 +32,30 @@ class CompressionRejectionProbabilityPredictor {
   size_t window_size_;
 };
 
+class AutoSkipWorkingArea : public Compressor::WorkingArea {
+ public:
+  explicit AutoSkipWorkingArea(Compressor::ManagedWorkingArea&& wa)
+      : wrapped(std::move(wa)),
+        predictor(
+            std::make_shared<CompressionRejectionProbabilityPredictor>(10)) {}
+  ~AutoSkipWorkingArea() {}
+  AutoSkipWorkingArea(const AutoSkipWorkingArea&) = delete;
+  AutoSkipWorkingArea& operator=(const AutoSkipWorkingArea&) = delete;
+  AutoSkipWorkingArea(AutoSkipWorkingArea&& other) noexcept
+      : wrapped(std::move(other.wrapped)),
+        predictor(std::move(other.predictor)) {}
+
+  AutoSkipWorkingArea& operator=(AutoSkipWorkingArea&& other) noexcept {
+    if (this != &other) {
+      wrapped = std::move(other.wrapped);
+      predictor = std::move(other.predictor);
+    }
+    return *this;
+  }
+  Compressor::ManagedWorkingArea wrapped;
+  std::shared_ptr<CompressionRejectionProbabilityPredictor> predictor;
+};
+
 class AutoSkipCompressorWrapper : public CompressorWrapper {
  public:
   const char* Name() const override;
@@ -42,12 +65,14 @@ class AutoSkipCompressorWrapper : public CompressorWrapper {
   Status CompressBlock(Slice uncompressed_data, std::string* compressed_output,
                        CompressionType* out_compression_type,
                        ManagedWorkingArea* wa) override;
+  ManagedWorkingArea ObtainWorkingArea() override;
+  void ReleaseWorkingArea(WorkingArea* wa) override;
 
  private:
   Status CompressBlockAndRecord(Slice uncompressed_data,
                                 std::string* compressed_output,
                                 CompressionType* out_compression_type,
-                                ManagedWorkingArea* wa);
+                                AutoSkipWorkingArea* wa);
   static constexpr int kExplorationPercentage = 10;
   static constexpr int kProbabilityCutOff = 50;
   const CompressionOptions kOpts;

From 1601da40496d456419bb23946bbea4ceb1bd9d78 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Tue, 17 Jun 2025 21:34:34 -0700
Subject: [PATCH 143/500] Improve file checksum handling for ingestion (#13708)

Summary:
* Improve debugability with better error messages (including the returned status, not just log messages)
* Tolerate user providing file checksums recognized by the factory but not the same function as currently, generally provided by the factory. This makes it practical to transition from one type of checksum to another without major hiccups in ingestion workflows.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13708

Test Plan: updated unit test, manually inspect LOG file from the unit test

Reviewed By: cbi42

Differential Revision: D76837804

Pulled By: pdillinger

fbshipit-source-id: 45b744829b3a125e9d0ee6874bd37ce534c2e13c
---
 db/external_sst_file_basic_test.cc            | 122 +++++++++++++-----
 db/external_sst_file_ingestion_job.cc         |  77 ++++++-----
 include/rocksdb/file_checksum.h               |   3 +-
 .../bug_fixes/ingestion_file_checksum.md      |   1 +
 4 files changed, 138 insertions(+), 65 deletions(-)
 create mode 100644 unreleased_history/bug_fixes/ingestion_file_checksum.md

diff --git a/db/external_sst_file_basic_test.cc b/db/external_sst_file_basic_test.cc
index a247e68128c5..6bc46938658d 100644
--- a/db/external_sst_file_basic_test.cc
+++ b/db/external_sst_file_basic_test.cc
@@ -16,6 +16,7 @@
 #include "test_util/testharness.h"
 #include "test_util/testutil.h"
 #include "util/defer.h"
+#include "util/file_checksum_helper.h"
 #include "util/random.h"
 #include "utilities/fault_injection_env.h"
 
@@ -346,7 +347,8 @@ class ChecksumVerifyHelper {
 
   Status GetSingleFileChecksumAndFuncName(
       const std::string& file_path, std::string* file_checksum,
-      std::string* file_checksum_func_name) {
+      std::string* file_checksum_func_name,
+      const std::string& requested_func_name = {}) {
     Status s;
     EnvOptions soptions;
     std::unique_ptr<SequentialFile> file_reader;
@@ -364,6 +366,8 @@ class ChecksumVerifyHelper {
       return Status::OK();
     } else {
       FileChecksumGenContext gen_context;
+      gen_context.file_name = file_path;
+      gen_context.requested_checksum_func_name = requested_func_name;
       std::unique_ptr<FileChecksumGenerator> file_checksum_gen =
           file_checksum_gen_factory->CreateFileChecksumGenerator(gen_context);
       *file_checksum_func_name = file_checksum_gen->Name();
@@ -439,10 +443,50 @@ TEST_F(ExternalSSTFileBasicTest, BasicWithFileChecksumCrc32c) {
   DestroyAndRecreateExternalSSTFilesDir();
 }
 
+namespace {
+class VariousFileChecksumGenerator : public FileChecksumGenCrc32c {
+ public:
+  explicit VariousFileChecksumGenerator(const std::string& name)
+      : FileChecksumGenCrc32c({}), name_(name) {}
+
+  const char* Name() const override { return name_.c_str(); }
+
+  std::string GetChecksum() const override {
+    return FileChecksumGenCrc32c::GetChecksum() + "_" + name_;
+  }
+
+ private:
+  const std::string name_;
+};
+
+class VariousFileChecksumGenFactory : public FileChecksumGenFactory {
+ public:
+  std::unique_ptr<FileChecksumGenerator> CreateFileChecksumGenerator(
+      const FileChecksumGenContext& context) override {
+    static RelaxedAtomic<int> counter{0};
+    if (Slice(context.requested_checksum_func_name).starts_with("Various")) {
+      return std::make_unique<VariousFileChecksumGenerator>(
+          context.requested_checksum_func_name);
+    } else if (context.requested_checksum_func_name.empty()) {
+      // Lacking a specific request, use a different function name for each
+      // result.
+      return std::make_unique<VariousFileChecksumGenerator>(
+          "Various" + std::to_string(counter.FetchAddRelaxed(1)));
+    } else {
+      return nullptr;
+    }
+  }
+
+  static const char* kClassName() { return "VariousFileChecksumGenFactory"; }
+  const char* Name() const override { return kClassName(); }
+};
+}  // namespace
+
 TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) {
   Options old_options = CurrentOptions();
   Options options = CurrentOptions();
-  options.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory();
+  options.file_checksum_gen_factory =
+      std::make_shared<VariousFileChecksumGenFactory>();
   const ImmutableCFOptions ioptions(options);
   ChecksumVerifyHelper checksum_helper(options);
 
@@ -463,7 +507,8 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) {
   ASSERT_EQ(file1_info.largest_key, Key(1099));
   std::string file_checksum1, file_checksum_func_name1;
   ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName(
-      file1, &file_checksum1, &file_checksum_func_name1));
+      file1, &file_checksum1, &file_checksum_func_name1,
+      file1_info.file_checksum_func_name));
   ASSERT_EQ(file1_info.file_checksum, file_checksum1);
   ASSERT_EQ(file1_info.file_checksum_func_name, file_checksum_func_name1);
 
@@ -482,7 +527,8 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) {
   ASSERT_EQ(file2_info.largest_key, Key(1299));
   std::string file_checksum2, file_checksum_func_name2;
   ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName(
-      file2, &file_checksum2, &file_checksum_func_name2));
+      file2, &file_checksum2, &file_checksum_func_name2,
+      file2_info.file_checksum_func_name));
   ASSERT_EQ(file2_info.file_checksum, file_checksum2);
   ASSERT_EQ(file2_info.file_checksum_func_name, file_checksum_func_name2);
 
@@ -501,7 +547,8 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) {
   ASSERT_EQ(file3_info.largest_key, Key(1499));
   std::string file_checksum3, file_checksum_func_name3;
   ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName(
-      file3, &file_checksum3, &file_checksum_func_name3));
+      file3, &file_checksum3, &file_checksum_func_name3,
+      file3_info.file_checksum_func_name));
   ASSERT_EQ(file3_info.file_checksum, file_checksum3);
   ASSERT_EQ(file3_info.file_checksum_func_name, file_checksum_func_name3);
 
@@ -520,7 +567,8 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) {
   ASSERT_EQ(file4_info.largest_key, Key(1799));
   std::string file_checksum4, file_checksum_func_name4;
   ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName(
-      file4, &file_checksum4, &file_checksum_func_name4));
+      file4, &file_checksum4, &file_checksum_func_name4,
+      file4_info.file_checksum_func_name));
   ASSERT_EQ(file4_info.file_checksum, file_checksum4);
   ASSERT_EQ(file4_info.file_checksum_func_name, file_checksum_func_name4);
 
@@ -539,7 +587,8 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) {
   ASSERT_EQ(file5_info.largest_key, Key(1999));
   std::string file_checksum5, file_checksum_func_name5;
   ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName(
-      file5, &file_checksum5, &file_checksum_func_name5));
+      file5, &file_checksum5, &file_checksum_func_name5,
+      file5_info.file_checksum_func_name));
   ASSERT_EQ(file5_info.file_checksum, file_checksum5);
   ASSERT_EQ(file5_info.file_checksum_func_name, file_checksum_func_name5);
 
@@ -558,7 +607,8 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) {
   ASSERT_EQ(file6_info.largest_key, Key(2199));
   std::string file_checksum6, file_checksum_func_name6;
   ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName(
-      file6, &file_checksum6, &file_checksum_func_name6));
+      file6, &file_checksum6, &file_checksum_func_name6,
+      file6_info.file_checksum_func_name));
   ASSERT_EQ(file6_info.file_checksum, file_checksum6);
   ASSERT_EQ(file6_info.file_checksum_func_name, file_checksum_func_name6);
 
@@ -628,18 +678,23 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) {
   }
   ASSERT_OK(env_->FileExists(file2));
 
-  // Enable verify_file_checksum option
-  // No checksum information is provided, generate it when ingesting
-  std::vector<std::string> checksum, checksum_func;
-  s = AddFileWithFileChecksum({file3}, checksum, checksum_func, true, false,
-                              false, false);
+  // Enable verify_file_checksum option. No checksum information is provided,
+  // so it is generated when ingesting. The configured checksum factory will
+  // use a different function than before.
+  s = AddFileWithFileChecksum({file3}, {}, {}, true, false, false, false);
   ASSERT_OK(s) << s.ToString();
   std::vector<LiveFileMetaData> live_files2;
   dbfull()->GetLiveFilesMetaData(&live_files2);
   for (const auto& f : live_files2) {
     if (set1.find(f.name) == set1.end()) {
-      ASSERT_EQ(f.file_checksum, file_checksum3);
-      ASSERT_EQ(f.file_checksum_func_name, file_checksum_func_name3);
+      // Recomputed checksum, different function
+      EXPECT_NE(f.file_checksum_func_name, file_checksum_func_name3);
+      std::string cur_checksum3, cur_checksum_func_name3;
+      ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName(
+          dbname_ + f.name, &cur_checksum3, &cur_checksum_func_name3,
+          f.file_checksum_func_name));
+      EXPECT_EQ(f.file_checksum, cur_checksum3);
+      EXPECT_EQ(f.file_checksum_func_name, cur_checksum_func_name3);
       set1.insert(f.name);
     }
   }
@@ -653,8 +708,9 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) {
   ASSERT_NOK(s) << s.ToString();
 
   // Does not enable verify_file_checksum options
-  // Checksum function name matches, store the checksum being ingested.
-  s = AddFileWithFileChecksum({file4}, {"asd"}, {file_checksum_func_name4},
+  // Checksum function name is recognized, so store the checksum being ingested.
+  std::string file_checksum_func_name4alt = "VariousABCD";
+  s = AddFileWithFileChecksum({file4}, {"asd"}, {file_checksum_func_name4alt},
                               false, false, false, false);
   ASSERT_OK(s) << s.ToString();
   std::vector<LiveFileMetaData> live_files3;
@@ -663,7 +719,7 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) {
     if (set1.find(f.name) == set1.end()) {
       ASSERT_FALSE(f.file_checksum == file_checksum4);
       ASSERT_EQ(f.file_checksum, "asd");
-      ASSERT_EQ(f.file_checksum_func_name, file_checksum_func_name4);
+      ASSERT_EQ(f.file_checksum_func_name, file_checksum_func_name4alt);
       set1.insert(f.name);
     }
   }
@@ -672,7 +728,8 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) {
 
   // enable verify_file_checksum options, DB enable checksum, and enable
   // write_global_seq. So the checksum stored is different from the one
-  // ingested due to the sequence number changes.
+  // ingested due to the sequence number changes. The checksum function name
+  // may also change since the checksum is recomputed.
   s = AddFileWithFileChecksum({file5}, {file_checksum5},
                               {file_checksum_func_name5}, true, false, false,
                               true);
@@ -681,11 +738,14 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) {
   dbfull()->GetLiveFilesMetaData(&live_files4);
   for (const auto& f : live_files4) {
     if (set1.find(f.name) == set1.end()) {
+      // Recomputed checksum, different function
+      EXPECT_NE(f.file_checksum_func_name, file_checksum_func_name5);
       std::string cur_checksum5, cur_checksum_func_name5;
       ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName(
-          dbname_ + f.name, &cur_checksum5, &cur_checksum_func_name5));
-      ASSERT_EQ(f.file_checksum, cur_checksum5);
-      ASSERT_EQ(f.file_checksum_func_name, file_checksum_func_name5);
+          dbname_ + f.name, &cur_checksum5, &cur_checksum_func_name5,
+          f.file_checksum_func_name));
+      EXPECT_EQ(f.file_checksum, cur_checksum5);
+      EXPECT_EQ(f.file_checksum_func_name, cur_checksum_func_name5);
       set1.insert(f.name);
     }
   }
@@ -693,18 +753,22 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) {
   ASSERT_OK(env_->FileExists(file5));
 
   // Does not enable verify_file_checksum options and also the ingested file
-  // checksum information is empty. DB will generate and store the checksum
-  // in Manifest.
-  std::vector<std::string> files_c6, files_name6;
-  s = AddFileWithFileChecksum({file6}, files_c6, files_name6, false, false,
-                              false, false);
+  // checksum information is empty. DB will generate and store file checksum
+  // in Manifest, which could be different from the previous invocation.
+  s = AddFileWithFileChecksum({file6}, {}, {}, false, false, false, false);
   ASSERT_OK(s) << s.ToString();
   std::vector<LiveFileMetaData> live_files6;
   dbfull()->GetLiveFilesMetaData(&live_files6);
   for (const auto& f : live_files6) {
     if (set1.find(f.name) == set1.end()) {
-      ASSERT_EQ(f.file_checksum, file_checksum6);
-      ASSERT_EQ(f.file_checksum_func_name, file_checksum_func_name6);
+      // Recomputed checksum, different function
+      EXPECT_NE(f.file_checksum_func_name, file_checksum_func_name6);
+      std::string cur_checksum6, cur_checksum_func_name6;
+      ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName(
+          dbname_ + f.name, &cur_checksum6, &cur_checksum_func_name6,
+          f.file_checksum_func_name));
+      EXPECT_EQ(f.file_checksum, cur_checksum6);
+      EXPECT_EQ(f.file_checksum_func_name, cur_checksum_func_name6);
       set1.insert(f.name);
     }
   }
diff --git a/db/external_sst_file_ingestion_job.cc b/db/external_sst_file_ingestion_job.cc
index f6c257654f9c..7e99dc9f918e 100644
--- a/db/external_sst_file_ingestion_job.cc
+++ b/db/external_sst_file_ingestion_job.cc
@@ -260,10 +260,6 @@ Status ExternalSstFileIngestionJob::Prepare(
     } else {
       need_generate_file_checksum_ = true;
     }
-    FileChecksumGenContext gen_context;
-    std::unique_ptr<FileChecksumGenerator> file_checksum_gen =
-        db_options_.file_checksum_gen_factory->CreateFileChecksumGenerator(
-            gen_context);
     std::vector<std::string> generated_checksums;
     std::vector<std::string> generated_checksum_func_names;
     // Step 1: generate the checksum for ingested sst file.
@@ -271,7 +267,9 @@ Status ExternalSstFileIngestionJob::Prepare(
       for (size_t i = 0; i < files_to_ingest_.size(); i++) {
         std::string generated_checksum;
         std::string generated_checksum_func_name;
-        std::string requested_checksum_func_name;
+        std::string requested_checksum_func_name =
+            i < files_checksum_func_names.size() ? files_checksum_func_names[i]
+                                                 : "";
         // TODO: rate limit file reads for checksum calculation during file
         // ingestion.
         // TODO: plumb Env::IOActivity
@@ -314,40 +312,50 @@ Status ExternalSstFileIngestionJob::Prepare(
             if (files_checksum_func_names[i] !=
                 generated_checksum_func_names[i]) {
               status = Status::InvalidArgument(
-                  "Checksum function name does not match with the checksum "
-                  "function name of this DB");
-              ROCKS_LOG_WARN(
-                  db_options_.info_log,
-                  "Sst file checksum verification of file: %s failed: %s",
-                  external_files_paths[i].c_str(), status.ToString().c_str());
+                  "DB file checksum gen factory " +
+                  std::string(db_options_.file_checksum_gen_factory->Name()) +
+                  " generated checksum function name " +
+                  generated_checksum_func_names[i] + " for file " +
+                  external_files_paths[i] +
+                  " which does not match requested/provided " +
+                  files_checksum_func_names[i]);
               break;
             }
             if (files_checksums[i] != generated_checksums[i]) {
               status = Status::Corruption(
-                  "Ingested checksum does not match with the generated "
-                  "checksum");
-              ROCKS_LOG_WARN(
-                  db_options_.info_log,
-                  "Sst file checksum verification of file: %s failed: %s",
-                  files_to_ingest_[i].internal_file_path.c_str(),
-                  status.ToString().c_str());
+                  "Checksum verification mismatch for ingestion file " +
+                  external_files_paths[i] + " using function " +
+                  generated_checksum_func_names[i] + ". Expected: " +
+                  Slice(files_checksums[i]).ToString(/*hex=*/true) +
+                  " Computed: " +
+                  Slice(generated_checksums[i]).ToString(/*hex=*/true));
               break;
             }
           }
         } else {
-          // If verify_file_checksum is not enabled, we only verify the
-          // checksum function name. If it does not match, fail the ingestion.
-          // If matches, we trust the ingested checksum information and store
-          // in the Manifest.
+          // If verify_file_checksum is not enabled, we only verify the factory
+          // recognizes the checksum function name. If it does not match, fail
+          // the ingestion. If matches, we trust the ingested checksum
+          // information and store in the Manifest.
           for (size_t i = 0; i < files_to_ingest_.size(); i++) {
-            if (files_checksum_func_names[i] != file_checksum_gen->Name()) {
+            FileChecksumGenContext gen_context;
+            gen_context.file_name = files_to_ingest_[i].internal_file_path;
+            gen_context.requested_checksum_func_name =
+                files_checksum_func_names[i];
+            auto file_checksum_gen =
+                db_options_.file_checksum_gen_factory
+                    ->CreateFileChecksumGenerator(gen_context);
+
+            if (file_checksum_gen == nullptr ||
+                files_checksum_func_names[i] != file_checksum_gen->Name()) {
               status = Status::InvalidArgument(
-                  "Checksum function name does not match with the checksum "
-                  "function name of this DB");
-              ROCKS_LOG_WARN(
-                  db_options_.info_log,
-                  "Sst file checksum verification of file: %s failed: %s",
-                  external_files_paths[i].c_str(), status.ToString().c_str());
+                  "Checksum function name " + files_checksum_func_names[i] +
+                  " for file " + external_files_paths[i] +
+                  " not recognized by DB checksum gen factory" +
+                  db_options_.file_checksum_gen_factory->Name() +
+                  (file_checksum_gen ? (" Returned function " +
+                                        std::string(file_checksum_gen->Name()))
+                                     : ""));
               break;
             }
             files_to_ingest_[i].file_checksum = files_checksums[i];
@@ -362,12 +370,11 @@ Status ExternalSstFileIngestionJob::Prepare(
         status = Status::InvalidArgument(
             "The checksum information of ingested sst files are nonempty and "
             "the size of checksums or the size of the checksum function "
-            "names "
-            "does not match with the number of ingested sst files");
-        ROCKS_LOG_WARN(
-            db_options_.info_log,
-            "The ingested sst files checksum information is incomplete: %s",
-            status.ToString().c_str());
+            "names does not match with the number of ingested sst files");
+      }
+      if (!status.ok()) {
+        ROCKS_LOG_WARN(db_options_.info_log, "Ingestion failed: %s",
+                       status.ToString().c_str());
       }
     }
   }
diff --git a/include/rocksdb/file_checksum.h b/include/rocksdb/file_checksum.h
index 66024d0a1b4e..bbb148c67d28 100644
--- a/include/rocksdb/file_checksum.h
+++ b/include/rocksdb/file_checksum.h
@@ -80,7 +80,8 @@ class FileChecksumGenFactory : public Customizable {
       const ConfigOptions& options, const std::string& value,
       std::shared_ptr<FileChecksumGenFactory>* result);
 
-  // Create a new FileChecksumGenerator.
+  // Create a new FileChecksumGenerator. Recommended to return nullptr if the
+  // requested function name is not recognized.
   virtual std::unique_ptr<FileChecksumGenerator> CreateFileChecksumGenerator(
       const FileChecksumGenContext& context) = 0;
 
diff --git a/unreleased_history/bug_fixes/ingestion_file_checksum.md b/unreleased_history/bug_fixes/ingestion_file_checksum.md
new file mode 100644
index 000000000000..28ee8c59ca5e
--- /dev/null
+++ b/unreleased_history/bug_fixes/ingestion_file_checksum.md
@@ -0,0 +1 @@
+* Fixed handling of file checksums in IngestExternalFile() to allow providing checksums using recognized but not necessarily the DB's preferred checksum function, to ease migration between checksum functions.

From c8aafdba337e7e454cc8dd2b7208d8326f568507 Mon Sep 17 00:00:00 2001
From: Changyu Bi <changyubi@meta.com>
Date: Wed, 18 Jun 2025 17:32:59 -0700
Subject: [PATCH 144/500] Support concurrent write for vector memtable (#13675)

Summary:
Some usage of vector memtable is bottlenecked in the memtable insertion path when using multiple writers. This PR adds support for concurrent writes for the vector memtable. The updates from each concurrent writer are buffered in a thread local vector. When a writer is done, MemTable::BatchPostProcess() is called to flush the thread local updates to the main vector. TSAN test and function comment suggest that ApproximateMemoryUsage() needs to be thread-safe, so its implementation is updated to provide thread-safe access.

Together with unordered_write, benchmark shows much improved insertion throughput.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13675

Test Plan:
- new unit test
- enabled some coverage of vector memtable in stress test
- Performance benchmark: benchmarked memtable insertion performance with by running fillrandom 20 times
  - Compare branch and main performance with one thread and write batch size 100:
    - main: 4896888.950 ops/sec
    - branch: 4923366.350 ops/sec
  - Benchmark this branch by configuring different threads, allow_concurrent_memtable_write, and unordered_write. Performance ratio is computed as current ops/sec divided by ops/sec at 1 thread with the same options.

allow_concurrent | unordered_write | Threads | ops/sec | Performance Ratio
-- | -- | -- | -- | --
0 | 0 | 1 | 4923367 | 1.0
0 | 0 | 2 | 5215640 | 1.1
0 | 0 | 4 | 5588510 | 1.1
0 | 0 | 8 | 6077525 | 1.2
1 | 0 | 1 | 4919060 | 1.0
1 | 0 | 2 | 5821922 | 1.2
1 | 0 | 4 | 7850395 | 1.6
1 | 0 | 8 | 10516600 | 2.1
1 | 1 | 1 | 5050004 | 1.0
1 | 1 | 2 | 8489834 | 1.7
1 | 1 | 4 | 14439513 | 2.9
1 | 1 | 8 | 21538098 | 4.3

```
mkdir -p /tmp/bench_$1
export TEST_TMPDIR=/tmp/bench_$1

memtablerep_value=${6:-vector}

(for I in $(seq 1 $2)
do
	/data/users/changyubi/vscode-root/rocksdb/$1 --benchmarks=fillrandom --seed=1722808058 --write_buffer_size=67108864 --min_write_buffer_number_to_merge=1000 --max_write_buffer_number=1000 --enable_pipelined_write=0 --memtablerep=$memtablerep_value --disable_auto_compactions=1 --disable_wal=1 --avoid_flush_during_shutdown=1 --allow_concurrent_memtable_write=${5:-0} --unordered_write=$4 --batch_size=1 --threads=$3 2>&1 | grep "fillrandom"
done;) | awk '{ t += $5; c++; print } END { printf ("%9.3f\n", 1.0 * t / c) }';
```

Reviewed By: pdillinger

Differential Revision: D76641755

Pulled By: cbi42

fbshipit-source-id: c107ba42749855ad4fd1f52491eb93900757542e
---
 db/db_impl/db_impl_write.cc   |  5 +-
 db/db_memtable_test.cc        | 90 +++++++++++++++++++++++++++++++++++
 db/memtable.cc                |  3 +-
 db/memtable.h                 |  5 ++
 db/write_batch.cc             |  8 +---
 db/write_batch_internal.h     |  9 ++--
 include/rocksdb/memtablerep.h | 13 +++++
 memtable/vectorrep.cc         | 57 ++++++++++++++++++----
 tools/db_bench_tool.cc        |  5 ++
 tools/db_crashtest.py         |  7 ++-
 10 files changed, 178 insertions(+), 24 deletions(-)

diff --git a/db/db_impl/db_impl_write.cc b/db/db_impl/db_impl_write.cc
index 667e4750c7d6..8a4c5ec9be6c 100644
--- a/db/db_impl/db_impl_write.cc
+++ b/db/db_impl/db_impl_write.cc
@@ -856,8 +856,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
             write_group, current_sequence, column_family_memtables_.get(),
             &flush_scheduler_, &trim_history_scheduler_,
             write_options.ignore_missing_column_families,
-            0 /*recovery_log_number*/, this, parallel, seq_per_batch_,
-            batch_per_txn_);
+            0 /*recovery_log_number*/, this, seq_per_batch_, batch_per_txn_);
       } else {
         write_group.last_sequence = last_sequence;
         write_thread_.LaunchParallelMemTableWriters(&write_group);
@@ -1115,7 +1114,7 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options,
           memtable_write_group, w.sequence, column_family_memtables_.get(),
           &flush_scheduler_, &trim_history_scheduler_,
           write_options.ignore_missing_column_families, 0 /*log_number*/, this,
-          false /*concurrent_memtable_writes*/, seq_per_batch_, batch_per_txn_);
+          seq_per_batch_, batch_per_txn_);
       if (memtable_write_group.status
               .ok()) {  // Don't publish a partial batch write
         versions_->SetLastSequence(memtable_write_group.last_sequence);
diff --git a/db/db_memtable_test.cc b/db/db_memtable_test.cc
index 3f7b029572e4..1768cb9c0866 100644
--- a/db/db_memtable_test.cc
+++ b/db/db_memtable_test.cc
@@ -424,6 +424,96 @@ TEST_F(DBMemTableTest, IntegrityChecks) {
     ASSERT_FALSE(iter->Valid());
   }
 }
+
+TEST_F(DBMemTableTest, VectorConcurrentInsert) {
+  Options options;
+  options.create_if_missing = true;
+  options.create_missing_column_families = true;
+  options.allow_concurrent_memtable_write = true;
+  options.memtable_factory.reset(new VectorRepFactory());
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"cf1"}, options);
+
+  // Multi-threaded writes
+  {
+    WriteOptions write_options;
+    std::vector<port::Thread> threads;
+    for (int i = 0; i < 10; ++i) {
+      threads.emplace_back([&, i]() {
+        int start = i * 100;
+        int end = start + 100;
+        WriteBatch batch;
+        for (int j = start; j < end; ++j) {
+          ASSERT_OK(
+              batch.Put(handles_[0], Key(j), "value" + std::to_string(j)));
+        }
+        ASSERT_OK(db_->Write(write_options, &batch));
+      });
+    }
+    for (auto& t : threads) {
+      t.join();
+    }
+
+    std::unique_ptr<Iterator> iter(
+        db_->NewIterator(ReadOptions(), handles_[0]));
+    iter->SeekToFirst();
+    for (int i = 0; i < 1000; ++i) {
+      ASSERT_TRUE(iter->Valid());
+      ASSERT_EQ(iter->key().ToString(), Key(i));
+      ASSERT_EQ(iter->value().ToString(), "value" + std::to_string(i));
+      iter->Next();
+    }
+    ASSERT_FALSE(iter->Valid());
+    ASSERT_OK(iter->status());
+  }
+
+  // Multi-threaded writes, multi CF
+  {
+    WriteOptions write_options;
+    std::vector<port::Thread> threads;
+    for (int i = 0; i < 10; ++i) {
+      threads.emplace_back([&, i]() {
+        int start = i * 100;
+        int end = start + 100;
+        WriteBatch batch;
+        for (int j = start; j < end; ++j) {
+          ASSERT_OK(batch.Put(handles_[0], Key(j), "CF0" + std::to_string(j)));
+          ASSERT_OK(batch.Put(handles_[1], Key(j), "CF1" + std::to_string(j)));
+        }
+        ASSERT_OK(db_->Write(write_options, &batch));
+      });
+    }
+
+    for (auto& t : threads) {
+      t.join();
+    }
+
+    std::unique_ptr<Iterator> iter0(
+        db_->NewIterator(ReadOptions(), handles_[0]));
+    std::unique_ptr<Iterator> iter1(
+        db_->NewIterator(ReadOptions(), handles_[1]));
+    iter0->SeekToFirst();
+    iter1->SeekToFirst();
+    for (int i = 0; i < 1000; ++i) {
+      ASSERT_TRUE(iter0->Valid());
+      ASSERT_EQ(iter0->key().ToString(), Key(i));
+      ASSERT_EQ(iter0->value().ToString(), "CF0" + std::to_string(i));
+      iter0->Next();
+
+      ASSERT_TRUE(iter1->Valid());
+      ASSERT_EQ(iter1->key().ToString(), Key(i));
+      ASSERT_EQ(iter1->value().ToString(), "CF1" + std::to_string(i));
+      iter1->Next();
+    }
+    ASSERT_FALSE(iter0->Valid());
+    ASSERT_OK(iter0->status());
+    ASSERT_FALSE(iter1->Valid());
+    ASSERT_OK(iter1->status());
+  }
+
+  ASSERT_OK(Flush(0));
+  ASSERT_OK(Flush(1));
+}
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/db/memtable.cc b/db/memtable.cc
index 5b4bfdd9c936..4f07704c7337 100644
--- a/db/memtable.cc
+++ b/db/memtable.cc
@@ -196,10 +196,11 @@ bool MemTable::ShouldFlushNow() {
   // allocate one more block.
   const double kAllowOverAllocationRatio = 0.6;
 
+  // range deletion use skip list which allocates all memeory through `arena_`
+  assert(range_del_table_->ApproximateMemoryUsage() == 0);
   // If arena still have room for new block allocation, we can safely say it
   // shouldn't flush.
   auto allocated_memory = table_->ApproximateMemoryUsage() +
-                          range_del_table_->ApproximateMemoryUsage() +
                           arena_.MemoryAllocatedBytes();
 
   approximate_memory_usage_.store(allocated_memory, std::memory_order_relaxed);
diff --git a/db/memtable.h b/db/memtable.h
index 3968cfb4b180..79e9bbdd77c3 100644
--- a/db/memtable.h
+++ b/db/memtable.h
@@ -688,6 +688,7 @@ class MemTable final : public ReadOnlyMemTable {
   // Update counters and flush status after inserting a whole write batch
   // Used in concurrent memtable inserts.
   void BatchPostProcess(const MemTablePostProcessInfo& update_counters) {
+    table_->BatchPostProcess();
     num_entries_.fetch_add(update_counters.num_entries,
                            std::memory_order_relaxed);
     data_size_.fetch_add(update_counters.data_size, std::memory_order_relaxed);
@@ -698,6 +699,10 @@ class MemTable final : public ReadOnlyMemTable {
     if (update_counters.num_range_deletes > 0) {
       num_range_deletes_.fetch_add(update_counters.num_range_deletes,
                                    std::memory_order_relaxed);
+      // noop for skip-list memtable
+      // Besides correctness test in stress test, memtable flush record count
+      // check will catch this if it were not noop.
+      // range_del_table_->BatchPostProcess();
     }
     UpdateFlushState();
   }
diff --git a/db/write_batch.cc b/db/write_batch.cc
index 84dbd06d0255..c2f7a7eddf51 100644
--- a/db/write_batch.cc
+++ b/db/write_batch.cc
@@ -551,9 +551,6 @@ Status WriteBatchInternal::Iterate(const WriteBatch* wb,
 
     if (LIKELY(!s.IsTryAgain())) {
       last_was_try_again = false;
-      tag = 0;
-      column_family = 0;  // default
-
       s = ReadRecordFromWriteBatch(&input, &tag, &column_family, &key, &value,
                                    &blob, &xid, &write_unix_time);
       if (!s.ok()) {
@@ -1897,7 +1894,6 @@ Status WriteBatch::VerifyChecksum() const {
     // ReadRecordFromWriteBatch
     key.clear();
     value.clear();
-    column_family = 0;
     s = ReadRecordFromWriteBatch(&input, &tag, &column_family, &key, &value,
                                  &blob, &xid, /*write_unix_time=*/nullptr);
     if (!s.ok()) {
@@ -3214,11 +3210,11 @@ Status WriteBatchInternal::InsertInto(
     ColumnFamilyMemTables* memtables, FlushScheduler* flush_scheduler,
     TrimHistoryScheduler* trim_history_scheduler,
     bool ignore_missing_column_families, uint64_t recovery_log_number, DB* db,
-    bool concurrent_memtable_writes, bool seq_per_batch, bool batch_per_txn) {
+    bool seq_per_batch, bool batch_per_txn) {
   MemTableInserter inserter(
       sequence, memtables, flush_scheduler, trim_history_scheduler,
       ignore_missing_column_families, recovery_log_number, db,
-      concurrent_memtable_writes, nullptr /* prot_info */,
+      /*concurrent_memtable_writes=*/false, nullptr /* prot_info */,
       nullptr /*has_valid_writes*/, seq_per_batch, batch_per_txn);
   for (auto w : write_group) {
     if (w->CallbackFailed()) {
diff --git a/db/write_batch_internal.h b/db/write_batch_internal.h
index 3cf3f4689a8c..f7b36a4133cf 100644
--- a/db/write_batch_internal.h
+++ b/db/write_batch_internal.h
@@ -185,18 +185,19 @@ class WriteBatchInternal {
   // If flush_scheduler is non-null, it will be invoked if the memtable
   // should be flushed.
   //
-  // Under concurrent use, the caller is responsible for making sure that
-  // the memtables object itself is thread-local.
+  // This overload is for non-concurrent insertion only.
   static Status InsertInto(
       WriteThread::WriteGroup& write_group, SequenceNumber sequence,
       ColumnFamilyMemTables* memtables, FlushScheduler* flush_scheduler,
       TrimHistoryScheduler* trim_history_scheduler,
       bool ignore_missing_column_families = false, uint64_t log_number = 0,
-      DB* db = nullptr, bool concurrent_memtable_writes = false,
-      bool seq_per_batch = false, bool batch_per_txn = true);
+      DB* db = nullptr, bool seq_per_batch = false, bool batch_per_txn = true);
 
   // Convenience form of InsertInto when you have only one batch
   // next_seq returns the seq after last sequence number used in MemTable insert
+  //
+  // Under concurrent use, the caller is responsible for making sure that
+  // the memtables object itself is thread-local.
   static Status InsertInto(
       const WriteBatch* batch, ColumnFamilyMemTables* memtables,
       FlushScheduler* flush_scheduler,
diff --git a/include/rocksdb/memtablerep.h b/include/rocksdb/memtablerep.h
index fd63f127f468..dff6e4248b2a 100644
--- a/include/rocksdb/memtablerep.h
+++ b/include/rocksdb/memtablerep.h
@@ -162,6 +162,12 @@ class MemTableRep {
     return true;
   }
 
+  // Only used after concurrent memtable inserts.
+  // This function will be called by each writer after all writes are done
+  // through InsertConcurrently().
+  // This is used by VectorRep to do batched writes for concurrent inserts.
+  virtual void BatchPostProcess() {}
+
   // Returns true iff an entry that compares equal to key is in the collection.
   virtual bool Contains(const char* key) const = 0;
 
@@ -397,6 +403,11 @@ class SkipListFactory : public MemTableRepFactory {
 // the vector is sorted. This is useful for workloads where iteration is very
 // rare and writes are generally not issued after reads begin.
 //
+// Concurrent inserts are supported by buffering writes in thread-local vectors
+// for each write batch. To optimize performance for concurrent inserts, it is
+// recommended to perform batched writes, and enable unordered_write (refer to
+// the option comment for its impact on read consistency).
+//
 // Parameters:
 //   count: Passed to the constructor of the underlying std::vector of each
 //     VectorRep. On initialization, the underlying array will be at least count
@@ -418,6 +429,8 @@ class VectorRepFactory : public MemTableRepFactory {
   MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&, Allocator*,
                                  const SliceTransform*,
                                  Logger* logger) override;
+
+  bool IsInsertConcurrentlySupported() const override { return true; }
 };
 
 // This class contains a fixed array of buckets, each
diff --git a/memtable/vectorrep.cc b/memtable/vectorrep.cc
index 9a50bdc9fba5..fa9449c68cc5 100644
--- a/memtable/vectorrep.cc
+++ b/memtable/vectorrep.cc
@@ -30,6 +30,8 @@ class VectorRep : public MemTableRep {
   // collection.
   void Insert(KeyHandle handle) override;
 
+  void InsertConcurrently(KeyHandle handle) override;
+
   // Returns true iff an entry that compares equal to key is in the collection.
   bool Contains(const char* key) const override;
 
@@ -40,6 +42,8 @@ class VectorRep : public MemTableRep {
   void Get(const LookupKey& k, void* callback_args,
            bool (*callback_func)(void* arg, const char* entry)) override;
 
+  void BatchPostProcess() override;
+
   ~VectorRep() override = default;
 
   class Iterator : public MemTableRep::Iterator {
@@ -100,19 +104,40 @@ class VectorRep : public MemTableRep {
 
  private:
   friend class Iterator;
+  ALIGN_AS(CACHE_LINE_SIZE) RelaxedAtomic<size_t> bucket_size_;
   using Bucket = std::vector<const char*>;
   std::shared_ptr<Bucket> bucket_;
   mutable port::RWMutex rwlock_;
   bool immutable_;
   bool sorted_;
   const KeyComparator& compare_;
+  // Thread-local vector to buffer concurrent writes.
+  using TlBucket = std::vector<const char*>;
+  ThreadLocalPtr tl_writes_;
+
+  static void DeleteTlBucket(void* ptr) {
+    auto* v = static_cast<TlBucket*>(ptr);
+    delete v;
+  }
 };
 
 void VectorRep::Insert(KeyHandle handle) {
   auto* key = static_cast<char*>(handle);
-  WriteLock l(&rwlock_);
-  assert(!immutable_);
-  bucket_->push_back(key);
+  {
+    WriteLock l(&rwlock_);
+    assert(!immutable_);
+    bucket_->push_back(key);
+  }
+  bucket_size_.FetchAddRelaxed(1);
+}
+
+void VectorRep::InsertConcurrently(KeyHandle handle) {
+  auto* v = static_cast<TlBucket*>(tl_writes_.Get());
+  if (!v) {
+    v = new TlBucket();
+    tl_writes_.Reset(v);
+  }
+  v->push_back(static_cast<char*>(handle));
 }
 
 // Returns true iff an entry that compares equal to key is in the collection.
@@ -127,19 +152,35 @@ void VectorRep::MarkReadOnly() {
 }
 
 size_t VectorRep::ApproximateMemoryUsage() {
-  return sizeof(bucket_) + sizeof(*bucket_) +
-         bucket_->size() *
-             sizeof(
-                 std::remove_reference<decltype(*bucket_)>::type::value_type);
+  return bucket_size_.LoadRelaxed() *
+         sizeof(std::remove_reference<decltype(*bucket_)>::type::value_type);
+}
+
+void VectorRep::BatchPostProcess() {
+  auto* v = static_cast<TlBucket*>(tl_writes_.Get());
+  if (v) {
+    {
+      WriteLock l(&rwlock_);
+      assert(!immutable_);
+      for (auto& key : *v) {
+        bucket_->push_back(key);
+      }
+    }
+    bucket_size_.FetchAddRelaxed(v->size());
+    delete v;
+    tl_writes_.Reset(nullptr);
+  }
 }
 
 VectorRep::VectorRep(const KeyComparator& compare, Allocator* allocator,
                      size_t count)
     : MemTableRep(allocator),
+      bucket_size_(0),
       bucket_(new Bucket()),
       immutable_(false),
       sorted_(false),
-      compare_(compare) {
+      compare_(compare),
+      tl_writes_(DeleteTlBucket) {
   bucket_.get()->reserve(count);
 }
 
diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc
index 3eaceaca9e82..3cba7e753754 100644
--- a/tools/db_bench_tool.cc
+++ b/tools/db_bench_tool.cc
@@ -1730,6 +1730,10 @@ DEFINE_uint64(stats_history_buffer_size,
 DEFINE_bool(avoid_flush_during_recovery,
             ROCKSDB_NAMESPACE::Options().avoid_flush_during_recovery,
             "If true, avoids flushing the recovered WAL data where possible.");
+
+DEFINE_bool(avoid_flush_during_shutdown,
+            ROCKSDB_NAMESPACE::Options().avoid_flush_during_shutdown,
+            "If true, avoids flushing the recovered WAL data where possible.");
 DEFINE_int64(multiread_stride, 0,
              "Stride length for the keys in a MultiGet batch");
 DEFINE_bool(multiread_batched, false, "Use the new MultiGet API");
@@ -4264,6 +4268,7 @@ class Benchmark {
     options.stats_history_buffer_size =
         static_cast<size_t>(FLAGS_stats_history_buffer_size);
     options.avoid_flush_during_recovery = FLAGS_avoid_flush_during_recovery;
+    options.avoid_flush_during_shutdown = FLAGS_avoid_flush_during_shutdown;
 
     options.compression_opts.level = FLAGS_compression_level;
     options.compression_opts.max_dict_bytes = FLAGS_compression_max_dict_bytes;
diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index debec49e126d..168d0284b7da 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -520,6 +520,7 @@ def is_direct_io_supported(dbname):
     "ingest_external_file_one_in": 0,
     # `CfConsistencyStressTest::TestIterateAgainstExpected()` is not implemented.
     "verify_iterator_with_expected_state_one_in": 0,
+    "memtablerep": random.choice(["skip_list"] * 9 + ["vector"]),
 }
 
 # For pessimistic transaction db
@@ -722,6 +723,10 @@ def finalize_and_sanitize(src_params):
         else:
             dest_params["mock_direct_io"] = True
 
+    if dest_params["memtablerep"] == "vector":
+        dest_params["inplace_update_support"] = 0
+        dest_params["paranoid_memory_checks"] = 0
+
     if dest_params["test_batches_snapshots"] == 1:
         dest_params["enable_compaction_filter"] = 0
         dest_params["inplace_update_support"] = 0
@@ -949,8 +954,6 @@ def finalize_and_sanitize(src_params):
         # disable atomic flush.
         if dest_params["test_best_efforts_recovery"] == 0:
             dest_params["disable_wal"] = 0
-    if dest_params.get("allow_concurrent_memtable_write", 1) == 1:
-        dest_params["memtablerep"] = "skip_list"
     if (
         dest_params.get("enable_compaction_filter", 0) == 1
         or dest_params.get("inplace_update_support", 0) == 1

From d55655a423a80b6118ac07001721596723418ece Mon Sep 17 00:00:00 2001
From: Changyu Bi <changyubi@meta.com>
Date: Thu, 19 Jun 2025 11:04:35 -0700
Subject: [PATCH 145/500] Add an optional min file size requirement for
 deletion triggered compaction (#13707)

Summary:
add the `min_file_size` parameter to CompactOnDeletionCollector. A file must be at least this size for it to qualify for DTC. This is useful when a user wants to specific a min file size requirement that is larger than the size constraint imposed by the sliding window's `deletion_trigger` requirement.

Added some comment explaining that the file_size provided to table property collector only includes data blocks and may not be up-to-date. This PR also updates DTC to consider SingleDelete and DeletionWithTimestamp as tombstones.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13707

Test Plan:
- new unit test for when min_file_size is specified.
- existing unit test for when min_file_size is not specified.

Reviewed By: hx235, pdillinger

Differential Revision: D76837231

Pulled By: cbi42

fbshipit-source-id: 0782144e75aef9961bf03da2a2c4b3c613ce5db3
---
 include/rocksdb/table_properties.h            |  6 +-
 .../utilities/table_properties_collectors.h   | 19 +++++-
 tools/db_crashtest.py                         |  2 +-
 .../new_features/concurrent_vector_insert.md  |  1 +
 .../public_api_changes/min-file-size-dtc.md   |  1 +
 .../compact_on_deletion_collector.cc          | 68 ++++++++++++++-----
 .../compact_on_deletion_collector.h           | 16 +++--
 .../compact_on_deletion_collector_test.cc     | 55 +++++++++++++++
 8 files changed, 141 insertions(+), 27 deletions(-)
 create mode 100644 unreleased_history/new_features/concurrent_vector_insert.md
 create mode 100644 unreleased_history/public_api_changes/min-file-size-dtc.md

diff --git a/include/rocksdb/table_properties.h b/include/rocksdb/table_properties.h
index 76f3529bf576..f8ae270fa86c 100644
--- a/include/rocksdb/table_properties.h
+++ b/include/rocksdb/table_properties.h
@@ -109,6 +109,10 @@ class TablePropertiesCollector {
   // table.
   // @params key    the user key that is inserted into the table.
   // @params value  the value that is inserted into the table.
+  // @params file_size the current file size. For BlockBasedTable, this
+  //         includes all the data blocks written so far, upto but not including
+  //         the current block being built. With parallel compression, data
+  //         blocks are written async so it depends on the compression progress.
   virtual Status AddUserKey(const Slice& key, const Slice& value,
                             EntryType /*type*/, SequenceNumber /*seq*/,
                             uint64_t /*file_size*/) {
@@ -143,7 +147,7 @@ class TablePropertiesCollector {
   // The name of the properties collector can be used for debugging purpose.
   virtual const char* Name() const = 0;
 
-  // EXPERIMENTAL Return whether the output file should be further compacted
+  // Return whether the output file should be further compacted
   virtual bool NeedCompact() const { return false; }
 
   // For internal use only.
diff --git a/include/rocksdb/utilities/table_properties_collectors.h b/include/rocksdb/utilities/table_properties_collectors.h
index 0f79f725e5d8..c8c8af1de6a8 100644
--- a/include/rocksdb/utilities/table_properties_collectors.h
+++ b/include/rocksdb/utilities/table_properties_collectors.h
@@ -23,15 +23,20 @@ class CompactOnDeletionCollectorFactory
   // A factory of a table property collector that marks a SST
   // file as need-compaction when it observe at least "D" deletion
   // entries in any "N" consecutive entries, or the ratio of tombstone
-  // entries >= deletion_ratio.
+  // entries >= deletion_ratio for the entire file.
   //
   // @param sliding_window_size "N"
   // @param deletion_trigger "D"
   // @param deletion_ratio, if <= 0 or > 1, disable triggering compaction
   //     based on deletion ratio.
+  // @param min_file_size, a file needs to be at least this size to be marked
+  //     for compaction. See comments above
+  //     TablePropertiesCollector::AddUserKey() for limitations/inaccuracies on
+  //     the file size.
   CompactOnDeletionCollectorFactory(size_t sliding_window_size,
                                     size_t deletion_trigger,
-                                    double deletion_ratio);
+                                    double deletion_ratio,
+                                    uint64_t min_file_size = 0);
 
   ~CompactOnDeletionCollectorFactory() override {}
 
@@ -59,6 +64,12 @@ class CompactOnDeletionCollectorFactory
   }
 
   double GetDeletionRatio() const { return deletion_ratio_.load(); }
+
+  uint64_t GetMinFileSize() const { return min_file_size_.load(); }
+  void SetMinFileSize(uint64_t min_file_size) {
+    min_file_size_.store(min_file_size);
+  }
+
   static const char* kClassName() { return "CompactOnDeletionCollector"; }
   const char* Name() const override { return kClassName(); }
 
@@ -68,6 +79,7 @@ class CompactOnDeletionCollectorFactory
   std::atomic<size_t> sliding_window_size_;
   std::atomic<size_t> deletion_trigger_;
   std::atomic<double> deletion_ratio_;
+  std::atomic<uint64_t> min_file_size_;
 };
 
 // Creates a factory of a table property collector that marks a SST
@@ -85,7 +97,8 @@ class CompactOnDeletionCollectorFactory
 std::shared_ptr<CompactOnDeletionCollectorFactory>
 NewCompactOnDeletionCollectorFactory(size_t sliding_window_size,
                                      size_t deletion_trigger,
-                                     double deletion_ratio = 0);
+                                     double deletion_ratio = 0,
+                                     uint64_t min_file_size = 0);
 
 // A factory of a table property collector that marks a SST file as
 // need-compaction when for the tiering use case, it observes, among all the
diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index 168d0284b7da..61057fa4c370 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -723,7 +723,7 @@ def finalize_and_sanitize(src_params):
         else:
             dest_params["mock_direct_io"] = True
 
-    if dest_params["memtablerep"] == "vector":
+    if dest_params.get("memtablerep") == "vector":
         dest_params["inplace_update_support"] = 0
         dest_params["paranoid_memory_checks"] = 0
 
diff --git a/unreleased_history/new_features/concurrent_vector_insert.md b/unreleased_history/new_features/concurrent_vector_insert.md
new file mode 100644
index 000000000000..b85f2c8d31a2
--- /dev/null
+++ b/unreleased_history/new_features/concurrent_vector_insert.md
@@ -0,0 +1 @@
+* Vector based memtable now supports concurrent writers (DBOptions::allow_concurrent_memtable_write) #13675.
diff --git a/unreleased_history/public_api_changes/min-file-size-dtc.md b/unreleased_history/public_api_changes/min-file-size-dtc.md
new file mode 100644
index 000000000000..ae6ad13a2134
--- /dev/null
+++ b/unreleased_history/public_api_changes/min-file-size-dtc.md
@@ -0,0 +1 @@
+* Add an optional min file size requirement for deletion triggered compaction. It can be specified when creating `CompactOnDeletionCollectorFactory`.
diff --git a/utilities/table_properties_collectors/compact_on_deletion_collector.cc b/utilities/table_properties_collectors/compact_on_deletion_collector.cc
index a175d0a016f2..348cd849a87d 100644
--- a/utilities/table_properties_collectors/compact_on_deletion_collector.cc
+++ b/utilities/table_properties_collectors/compact_on_deletion_collector.cc
@@ -17,16 +17,19 @@
 namespace ROCKSDB_NAMESPACE {
 
 CompactOnDeletionCollector::CompactOnDeletionCollector(
-    size_t sliding_window_size, size_t deletion_trigger, double deletion_ratio)
+    size_t sliding_window_size, size_t deletion_trigger, double deletion_ratio,
+    uint64_t min_file_size)
     : bucket_size_((sliding_window_size + kNumBuckets - 1) / kNumBuckets),
       current_bucket_(0),
       num_keys_in_current_bucket_(0),
       num_deletions_in_observation_window_(0),
       deletion_trigger_(deletion_trigger),
       deletion_ratio_(deletion_ratio),
+      min_file_size_(min_file_size),
+      cur_file_size_(0),
+      max_deletion_in_window_(0),
       deletion_ratio_enabled_(deletion_ratio > 0 && deletion_ratio <= 1),
-      need_compaction_(false),
-      finished_(false) {
+      need_compaction_(false) {
   memset(num_deletions_in_buckets_, 0, sizeof(size_t) * kNumBuckets);
 }
 
@@ -39,7 +42,7 @@ Status CompactOnDeletionCollector::AddUserKey(const Slice& /*key*/,
                                               const Slice& /*value*/,
                                               EntryType type,
                                               SequenceNumber /*seq*/,
-                                              uint64_t /*file_size*/) {
+                                              uint64_t file_size) {
   assert(!finished_);
   if (!bucket_size_ && !deletion_ratio_enabled_) {
     // This collector is effectively disabled
@@ -51,11 +54,14 @@ Status CompactOnDeletionCollector::AddUserKey(const Slice& /*key*/,
     return Status::OK();
   }
 
+  const bool is_delete = (type == kEntryDelete || type == kEntrySingleDelete ||
+                          type == kEntryDeleteWithTimestamp);
   if (deletion_ratio_enabled_) {
     total_entries_++;
-    if (type == kEntryDelete) {
+    if (is_delete) {
       deletion_entries_++;
     }
+    cur_file_size_ = file_size;
   }
 
   if (bucket_size_) {
@@ -76,13 +82,20 @@ Status CompactOnDeletionCollector::AddUserKey(const Slice& /*key*/,
     }
 
     num_keys_in_current_bucket_++;
-    if (type == kEntryDelete) {
+    if (is_delete) {
       num_deletions_in_observation_window_++;
       num_deletions_in_buckets_[current_bucket_]++;
-      if (num_deletions_in_observation_window_ >= deletion_trigger_) {
-        need_compaction_ = true;
+      if (num_deletions_in_observation_window_ >= max_deletion_in_window_) {
+        max_deletion_in_window_ = num_deletions_in_observation_window_;
       }
     }
+
+    // The file may qualify for compaction based on file size constraints,
+    // even if max_deletion_in_window_ is not updated.
+    if (max_deletion_in_window_ >= deletion_trigger_ &&
+        file_size >= min_file_size_) {
+      need_compaction_ = true;
+    }
   }
 
   return Status::OK();
@@ -90,7 +103,8 @@ Status CompactOnDeletionCollector::AddUserKey(const Slice& /*key*/,
 
 Status CompactOnDeletionCollector::Finish(
     UserCollectedProperties* /*properties*/) {
-  if (!need_compaction_ && deletion_ratio_enabled_ && total_entries_ > 0) {
+  if (!need_compaction_ && deletion_ratio_enabled_ && total_entries_ > 0 &&
+      cur_file_size_ >= min_file_size_) {
     double ratio = static_cast<double>(deletion_entries_) / total_entries_;
     need_compaction_ = ratio >= deletion_ratio_;
   }
@@ -153,23 +167,43 @@ static std::unordered_map<std::string, OptionTypeInfo>
             return Status::OK();
           },
           nullptr}},
+        {"min_file_size",
+         {0, OptionType::kUnknown, OptionVerificationType::kNormal,
+          OptionTypeFlags::kCompareNever | OptionTypeFlags::kMutable,
+          [](const ConfigOptions&, const std::string&, const std::string& value,
+             void* addr) {
+            auto* factory =
+                static_cast<CompactOnDeletionCollectorFactory*>(addr);
+            factory->SetMinFileSize(ParseUint64(value));
+            return Status::OK();
+          },
+          [](const ConfigOptions&, const std::string&, const void* addr,
+             std::string* value) {
+            const auto* factory =
+                static_cast<const CompactOnDeletionCollectorFactory*>(addr);
+            *value = std::to_string(factory->GetMinFileSize());
+            return Status::OK();
+          },
+          nullptr}},
 
 };
 
 CompactOnDeletionCollectorFactory::CompactOnDeletionCollectorFactory(
-    size_t sliding_window_size, size_t deletion_trigger, double deletion_ratio)
+    size_t sliding_window_size, size_t deletion_trigger, double deletion_ratio,
+    uint64_t min_file_size)
     : sliding_window_size_(sliding_window_size),
       deletion_trigger_(deletion_trigger),
-      deletion_ratio_(deletion_ratio) {
+      deletion_ratio_(deletion_ratio),
+      min_file_size_(min_file_size) {
   RegisterOptions("", this, &on_deletion_collector_type_info);
 }
 
 TablePropertiesCollector*
 CompactOnDeletionCollectorFactory::CreateTablePropertiesCollector(
     TablePropertiesCollectorFactory::Context /*context*/) {
-  return new CompactOnDeletionCollector(sliding_window_size_.load(),
-                                        deletion_trigger_.load(),
-                                        deletion_ratio_.load());
+  return new CompactOnDeletionCollector(
+      sliding_window_size_.load(), deletion_trigger_.load(),
+      deletion_ratio_.load(), min_file_size_.load());
 }
 
 std::string CompactOnDeletionCollectorFactory::ToString() const {
@@ -183,10 +217,12 @@ std::string CompactOnDeletionCollectorFactory::ToString() const {
 std::shared_ptr<CompactOnDeletionCollectorFactory>
 NewCompactOnDeletionCollectorFactory(size_t sliding_window_size,
                                      size_t deletion_trigger,
-                                     double deletion_ratio) {
+                                     double deletion_ratio,
+                                     uint64_t min_file_size) {
   return std::shared_ptr<CompactOnDeletionCollectorFactory>(
       new CompactOnDeletionCollectorFactory(sliding_window_size,
-                                            deletion_trigger, deletion_ratio));
+                                            deletion_trigger, deletion_ratio,
+                                            min_file_size));
 }
 
 namespace {
diff --git a/utilities/table_properties_collectors/compact_on_deletion_collector.h b/utilities/table_properties_collectors/compact_on_deletion_collector.h
index 1ccfa7becdf7..a800760dcb82 100644
--- a/utilities/table_properties_collectors/compact_on_deletion_collector.h
+++ b/utilities/table_properties_collectors/compact_on_deletion_collector.h
@@ -11,7 +11,8 @@ namespace ROCKSDB_NAMESPACE {
 class CompactOnDeletionCollector : public TablePropertiesCollector {
  public:
   CompactOnDeletionCollector(size_t sliding_window_size,
-                             size_t deletion_trigger, double deletion_raatio);
+                             size_t deletion_trigger, double deletion_ratio,
+                             uint64_t min_file_size);
 
   // AddUserKey() will be called when a new key/value pair is inserted into the
   // table.
@@ -36,7 +37,7 @@ class CompactOnDeletionCollector : public TablePropertiesCollector {
   // The name of the properties collector can be used for debugging purpose.
   const char* Name() const override { return "CompactOnDeletionCollector"; }
 
-  // EXPERIMENTAL Return whether the output file should be further compacted
+  // Return whether the output file should be further compacted
   bool NeedCompact() const override { return need_compaction_; }
 
   static const int kNumBuckets = 128;
@@ -48,18 +49,21 @@ class CompactOnDeletionCollector : public TablePropertiesCollector {
   // "bucket_size_" keys.
   size_t num_deletions_in_buckets_[kNumBuckets];
   // the number of keys in a bucket
-  size_t bucket_size_;
+  const size_t bucket_size_;
 
   size_t current_bucket_;
   size_t num_keys_in_current_bucket_;
   size_t num_deletions_in_observation_window_;
-  size_t deletion_trigger_;
+  const size_t deletion_trigger_;
   const double deletion_ratio_;
-  const bool deletion_ratio_enabled_;
   size_t total_entries_ = 0;
   size_t deletion_entries_ = 0;
+  const size_t min_file_size_;
+  size_t cur_file_size_;
+  size_t max_deletion_in_window_;
+  const bool deletion_ratio_enabled_;
   // true if the current SST file needs to be compacted.
   bool need_compaction_;
-  bool finished_;
+  bool finished_ = false;
 };
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/utilities/table_properties_collectors/compact_on_deletion_collector_test.cc b/utilities/table_properties_collectors/compact_on_deletion_collector_test.cc
index 9fec089fc13f..5fabb9856eba 100644
--- a/utilities/table_properties_collectors/compact_on_deletion_collector_test.cc
+++ b/utilities/table_properties_collectors/compact_on_deletion_collector_test.cc
@@ -232,6 +232,61 @@ TEST(CompactOnDeletionCollector, SlidingWindow) {
   }
 }
 
+TEST(CompactOnDeletionCollector, MinFileSize) {
+  TablePropertiesCollectorFactory::Context context;
+  context.column_family_id =
+      TablePropertiesCollectorFactory::Context::kUnknownColumnFamily;
+  context.last_level_inclusive_max_seqno_threshold = kMaxSequenceNumber;
+
+  const size_t kWindowSize = 1000;
+  const size_t kDeletionTrigger = 800;
+  const double kDeletionRatio = 0.9;
+  const uint64_t kMinFileSize = 1 << 20;
+
+  for (uint64_t file_size : {(uint64_t)0, kMinFileSize - 1, kMinFileSize}) {
+    {
+      auto factory = NewCompactOnDeletionCollectorFactory(
+          kWindowSize, kDeletionTrigger, 0, kMinFileSize);
+      std::unique_ptr<TablePropertiesCollector> collector(
+          factory->CreateTablePropertiesCollector(context));
+
+      // Add enough deletions to meet the sliding window triggers
+      for (size_t i = 0; i < kWindowSize; i++) {
+        if (i < kDeletionTrigger) {
+          ASSERT_OK(collector->AddUserKey("key", "value", kEntryDelete, 0,
+                                          file_size));
+        } else {
+          ASSERT_OK(
+              collector->AddUserKey("key", "value", kEntryPut, 0, file_size));
+        }
+      }
+      ASSERT_OK(collector->Finish(nullptr));
+      ASSERT_EQ(collector->NeedCompact(), file_size >= kMinFileSize);
+    }
+
+    {
+      auto factory = NewCompactOnDeletionCollectorFactory(
+          kWindowSize, kDeletionTrigger, kDeletionRatio, kMinFileSize);
+
+      std::unique_ptr<TablePropertiesCollector> collector(
+          factory->CreateTablePropertiesCollector(context));
+
+      const size_t kTotalEntries = 100;
+      // Add all deletions to maximize tombstone ratio
+      for (size_t i = 0; i < kTotalEntries - 1; i++) {
+        ASSERT_OK(
+            collector->AddUserKey("key", "value", kEntrySingleDelete, 0, 0));
+      }
+      // Give update file size
+      ASSERT_OK(collector->AddUserKey("key", "value", kEntrySingleDelete, 0,
+                                      file_size));
+
+      ASSERT_OK(collector->Finish(nullptr));
+      ASSERT_EQ(collector->NeedCompact(), file_size >= kMinFileSize);
+    }
+  }
+}
+
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {

From fdc2970d37e7043836966d95bddfd415d8c98a23 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Thu, 19 Jun 2025 12:54:15 -0700
Subject: [PATCH 146/500] Connect custom compression to crash test and
 ObjectLibrary (#13710)

Summary:
Some pieces of follow-up to https://github.com/facebook/rocksdb/issues/13659.
_Recommend hiding whitespace for review_
* Add support for instantiating CompressionManagers through CreateFromString/ObjectLibrary.
* Pull CompressorCustomAlg and DecompressorCustomAlg out of db_test2, refactor/improvement them a bit, and put them in testutil.h for sharing with db_stress. Switched it from being built on snappy to being built on lz4 so that it can properly test dictionary compression.
* Add a custom compression manager for db_stress that uses these, and add to crash test. This depends on the ObjectLibrary stuff because some invocations of db_stress will not be configured with the custom compression manager but will need to access it to read some existing SST files.
* Remove some pieces where the concern of setting compression=kZSTD for compatibility purposes had leaked into configuring some tests and compression managers. After https://github.com/facebook/rocksdb/issues/13659 this compatibility concern is contained in the SST building code.
* Fix BuiltinDecompressorV2SnappyOnly hiding the (ignored) compression dictionary. SST read logic expects the serialized dictionary to be returned by the decompressor even if it's effectively ignored. Updated DBBlockCacheTest.CacheCompressionDict to cover this case.

For follow-up:
* Combine custom compression and mixed compression types in a file (not clean/easy without duplicating or majorly refactoring the mixed/random compressor)

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13710

Test Plan: unit tests updated

Reviewed By: hx235

Differential Revision: D76928974

Pulled By: pdillinger

fbshipit-source-id: 772cf9cb048d737699b0e2887c624fb64a68aa8c
---
 db/db_block_cache_test.cc                     | 125 ++--
 db/db_test2.cc                                | 624 ++++++++----------
 .../db_stress_compression_manager.h           |  65 ++
 db_stress_tool/db_stress_test_base.cc         |  52 +-
 include/rocksdb/advanced_compression.h        |  10 +-
 options/options_settable_test.cc              |   8 +-
 .../block_based/block_based_table_builder.cc  |  12 +-
 test_util/testutil.h                          | 126 ++++
 tools/db_crashtest.py                         |  22 +-
 tools/sst_dump_tool.cc                        |   2 +
 util/compression.cc                           |  77 ++-
 util/compression.h                            |   1 +
 util/simple_mixed_compressor.cc               |  30 +-
 util/simple_mixed_compressor.h                |   1 -
 14 files changed, 658 insertions(+), 497 deletions(-)
 create mode 100644 db_stress_tool/db_stress_compression_manager.h

diff --git a/db/db_block_cache_test.cc b/db/db_block_cache_test.cc
index 1810ef8eb6fa..3d65bf9eb6d7 100644
--- a/db/db_block_cache_test.cc
+++ b/db/db_block_cache_test.cc
@@ -829,68 +829,79 @@ TEST_F(DBBlockCacheTest, CacheCompressionDict) {
   const int kNumEntriesPerFile = 128;
   const int kNumBytesPerEntry = 1024;
 
-  // Try all the available libraries that support dictionary compression
-  std::vector<CompressionType> compression_types;
-  if (Zlib_Supported()) {
-    compression_types.push_back(kZlibCompression);
-  }
-  if (LZ4_Supported()) {
-    compression_types.push_back(kLZ4Compression);
-    compression_types.push_back(kLZ4HCCompression);
-  }
-  if (ZSTD_Supported()) {
-    compression_types.push_back(kZSTD);
-  }
+  std::vector<CompressionType> dict_compressions =
+      GetSupportedDictCompressions();
   Random rnd(301);
-  for (auto compression_type : compression_types) {
-    Options options = CurrentOptions();
-    options.bottommost_compression = compression_type;
-    options.bottommost_compression_opts.max_dict_bytes = 4096;
-    options.bottommost_compression_opts.enabled = true;
-    options.create_if_missing = true;
-    options.num_levels = 2;
-    options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
-    options.target_file_size_base = kNumEntriesPerFile * kNumBytesPerEntry;
-    BlockBasedTableOptions table_options;
-    table_options.cache_index_and_filter_blocks = true;
-    table_options.block_cache.reset(new MockCache());
-    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
-    DestroyAndReopen(options);
+  // Format version before and after compression handling changes
+  TEST_AllowUnsupportedFormatVersion() = true;
+  for (int format_version : {6, 7}) {
+    // Test all supported compression types because (at least historically)
+    // dictionary compression could be enabled and a dictionary block saved
+    // but ignored by some compression types. Ensure we at least don't crash
+    // or return corruption for those.
+    for (auto compression_type : GetSupportedCompressions()) {
+      // Extra handling checks only for types actually supporting dictionary
+      // compression.
+      bool dict_supported =
+          std::count(dict_compressions.begin(), dict_compressions.end(),
+                     compression_type) > 0;
+
+      Options options = CurrentOptions();
+      options.bottommost_compression = compression_type;
+      options.bottommost_compression_opts.max_dict_bytes = 4096;
+      options.bottommost_compression_opts.enabled = true;
+      options.create_if_missing = true;
+      options.num_levels = 2;
+      options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+      options.target_file_size_base = kNumEntriesPerFile * kNumBytesPerEntry;
+      BlockBasedTableOptions table_options;
+      table_options.cache_index_and_filter_blocks = true;
+      table_options.block_cache.reset(new MockCache());
+      table_options.format_version = format_version;
+      options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+      DestroyAndReopen(options);
 
-    RecordCacheCountersForCompressionDict(options);
+      RecordCacheCountersForCompressionDict(options);
 
-    for (int i = 0; i < kNumFiles; ++i) {
-      ASSERT_EQ(i, NumTableFilesAtLevel(0, 0));
-      for (int j = 0; j < kNumEntriesPerFile; ++j) {
-        std::string value = rnd.RandomString(kNumBytesPerEntry);
-        ASSERT_OK(Put(Key(j * kNumFiles + i), value.c_str()));
+      for (int i = 0; i < kNumFiles; ++i) {
+        ASSERT_EQ(i, NumTableFilesAtLevel(0, 0));
+        for (int j = 0; j < kNumEntriesPerFile; ++j) {
+          std::string value = rnd.RandomString(kNumBytesPerEntry);
+          ASSERT_OK(Put(Key(j * kNumFiles + i), value.c_str()));
+        }
+        ASSERT_OK(Flush());
+      }
+      ASSERT_OK(dbfull()->TEST_WaitForCompact());
+      ASSERT_EQ(0, NumTableFilesAtLevel(0));
+      ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(1));
+
+      if (dict_supported) {
+        // Compression dictionary blocks are preloaded.
+        CheckCacheCountersForCompressionDict(
+            options, kNumFiles /* expected_compression_dict_misses */,
+            0 /* expected_compression_dict_hits */,
+            kNumFiles /* expected_compression_dict_inserts */);
+      }
+
+      // Seek to a key in a file. It should cause the SST's dictionary
+      // meta-block to be read.
+      RecordCacheCounters(options);
+      RecordCacheCountersForCompressionDict(options);
+      ReadOptions read_options;
+      ASSERT_NE("NOT_FOUND", Get(Key(kNumFiles * kNumEntriesPerFile - 1)));
+
+      if (dict_supported) {
+        // Two block hits: index and dictionary since they are prefetched
+        // One block missed/added: data block
+        CheckCacheCounters(options, 1 /* expected_misses */,
+                           2 /* expected_hits */, 1 /* expected_inserts */,
+                           0 /* expected_failures */);
+        CheckCacheCountersForCompressionDict(
+            options, 0 /* expected_compression_dict_misses */,
+            1 /* expected_compression_dict_hits */,
+            0 /* expected_compression_dict_inserts */);
       }
-      ASSERT_OK(Flush());
     }
-    ASSERT_OK(dbfull()->TEST_WaitForCompact());
-    ASSERT_EQ(0, NumTableFilesAtLevel(0));
-    ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(1));
-
-    // Compression dictionary blocks are preloaded.
-    CheckCacheCountersForCompressionDict(
-        options, kNumFiles /* expected_compression_dict_misses */,
-        0 /* expected_compression_dict_hits */,
-        kNumFiles /* expected_compression_dict_inserts */);
-
-    // Seek to a key in a file. It should cause the SST's dictionary meta-block
-    // to be read.
-    RecordCacheCounters(options);
-    RecordCacheCountersForCompressionDict(options);
-    ReadOptions read_options;
-    ASSERT_NE("NOT_FOUND", Get(Key(kNumFiles * kNumEntriesPerFile - 1)));
-    // Two block hits: index and dictionary since they are prefetched
-    // One block missed/added: data block
-    CheckCacheCounters(options, 1 /* expected_misses */, 2 /* expected_hits */,
-                       1 /* expected_inserts */, 0 /* expected_failures */);
-    CheckCacheCountersForCompressionDict(
-        options, 0 /* expected_compression_dict_misses */,
-        1 /* expected_compression_dict_hits */,
-        0 /* expected_compression_dict_inserts */);
   }
 }
 
diff --git a/db/db_test2.cc b/db/db_test2.cc
index c552388ae758..47393b49238a 100644
--- a/db/db_test2.cc
+++ b/db/db_test2.cc
@@ -1888,49 +1888,45 @@ TEST_F(DBTest2, RoundRobinManager) {
     auto mgr = std::make_shared<RoundRobinManager>(
         GetDefaultBuiltinCompressionManager());
 
-    for (CompressionType type : {kZSTD}) {
-      std::vector<std::string> values;
-      for (bool use_wrapper : {true}) {
-        SCOPED_TRACE("Compression type: " + std::to_string(type) +
-                     (use_wrapper ? " with " : " no ") + "wrapper");
-
-        Options options = CurrentOptions();
-        options.compression = type;
-        options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
-        options.statistics->set_stats_level(StatsLevel::kExceptTimeForMutex);
-        BlockBasedTableOptions bbto;
-        bbto.enable_index_compression = false;
-        options.table_factory.reset(NewBlockBasedTableFactory(bbto));
-        options.compression_manager = use_wrapper ? mgr : nullptr;
-        DestroyAndReopen(options);
-
-        Random rnd(301);
-        constexpr int kCount = 13;
-
-        // Highly compressible blocks, except 1 non-compressible. Half of the
-        // compressible are morked for bypass and 1 marked for rejection. Values
-        // are large enough to ensure just 1 k-v per block.
-        for (int i = 0; i < kCount; ++i) {
-          std::string value;
-          if (i == 6) {
-            // One non-compressible block
-            value = rnd.RandomBinaryString(20000);
-          } else {
-            test::CompressibleString(&rnd, 0.1, 20000, &value);
-          }
-          values.push_back(value);
-          ASSERT_OK(Put(Key(i), value));
-          ASSERT_EQ(Get(Key(i)), value);
-        }
-        ASSERT_OK(Flush());
+    std::vector<std::string> values;
+    for (bool use_wrapper : {true}) {
+      SCOPED_TRACE((use_wrapper ? "With " : "No ") + std::string("wrapper"));
+
+      Options options = CurrentOptions();
+      options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+      options.statistics->set_stats_level(StatsLevel::kExceptTimeForMutex);
+      BlockBasedTableOptions bbto;
+      bbto.enable_index_compression = false;
+      options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+      options.compression_manager = use_wrapper ? mgr : nullptr;
+      DestroyAndReopen(options);
 
-        // Ensure well-formed for reads
-        for (int i = 0; i < kCount; ++i) {
-          ASSERT_NE(Get(Key(i)), "NOT_FOUND");
-          ASSERT_EQ(Get(Key(i)), values[i]);
+      Random rnd(301);
+      constexpr int kCount = 13;
+
+      // Highly compressible blocks, except 1 non-compressible. Half of the
+      // compressible are morked for bypass and 1 marked for rejection. Values
+      // are large enough to ensure just 1 k-v per block.
+      for (int i = 0; i < kCount; ++i) {
+        std::string value;
+        if (i == 6) {
+          // One non-compressible block
+          value = rnd.RandomBinaryString(20000);
+        } else {
+          test::CompressibleString(&rnd, 0.1, 20000, &value);
         }
-        ASSERT_EQ(Get(Key(kCount)), "NOT_FOUND");
+        values.push_back(value);
+        ASSERT_OK(Put(Key(i), value));
+        ASSERT_EQ(Get(Key(i)), value);
       }
+      ASSERT_OK(Flush());
+
+      // Ensure well-formed for reads
+      for (int i = 0; i < kCount; ++i) {
+        ASSERT_NE(Get(Key(i)), "NOT_FOUND");
+        ASSERT_EQ(Get(Key(i)), values[i]);
+      }
+      ASSERT_EQ(Get(Key(kCount)), "NOT_FOUND");
     }
   }
 }
@@ -1939,54 +1935,49 @@ TEST_F(DBTest2, RandomMixedCompressionManager) {
   if (ZSTD_Supported()) {
     auto mgr = std::make_shared<RandomMixedCompressionManager>(
         GetDefaultBuiltinCompressionManager());
-    // Currently mixedmanager only supports with preffered compression manager
-    // zstd
-    for (CompressionType type : {kZSTD}) {
-      std::vector<std::string> values;
-      for (bool use_wrapper : {true}) {
-        SCOPED_TRACE("Compression type: " + std::to_string(type) +
-                     (use_wrapper ? " with " : " no ") + "wrapper");
-
-        Options options = CurrentOptions();
-        options.compression = type;
-        options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
-        options.statistics->set_stats_level(StatsLevel::kExceptTimeForMutex);
-        BlockBasedTableOptions bbto;
-        bbto.enable_index_compression = false;
-        options.table_factory.reset(NewBlockBasedTableFactory(bbto));
-        options.compression_manager = use_wrapper ? mgr : nullptr;
-        DestroyAndReopen(options);
-
-        Random rnd(301);
-        constexpr int kCount = 13;
-
-        // Highly compressible blocks, except 1 non-compressible. Half of the
-        // compressible are morked for bypass and 1 marked for rejection. Values
-        // are large enough to ensure just 1 k-v per block.
-        for (int i = 0; i < kCount; ++i) {
-          std::string value;
-          if (i == 6) {
-            // One non-compressible block
-            value = rnd.RandomBinaryString(20000);
-          } else {
-            test::CompressibleString(&rnd, 0.1, 20000, &value);
-          }
-          values.push_back(value);
-          ASSERT_OK(Put(Key(i), value));
-          ASSERT_EQ(Get(Key(i)), value);
-        }
-        ASSERT_OK(Flush());
+    std::vector<std::string> values;
+    for (bool use_wrapper : {true}) {
+      SCOPED_TRACE((use_wrapper ? "With " : "No ") + std::string("wrapper"));
+
+      Options options = CurrentOptions();
+      options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+      options.statistics->set_stats_level(StatsLevel::kExceptTimeForMutex);
+      BlockBasedTableOptions bbto;
+      bbto.enable_index_compression = false;
+      options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+      options.compression_manager = use_wrapper ? mgr : nullptr;
+      DestroyAndReopen(options);
 
-        // Ensure well-formed for reads
-        for (int i = 0; i < kCount; ++i) {
-          ASSERT_NE(Get(Key(i)), "NOT_FOUND");
-          ASSERT_EQ(Get(Key(i)), values[i]);
+      Random rnd(301);
+      constexpr int kCount = 13;
+
+      // Highly compressible blocks, except 1 non-compressible. Half of the
+      // compressible are morked for bypass and 1 marked for rejection. Values
+      // are large enough to ensure just 1 k-v per block.
+      for (int i = 0; i < kCount; ++i) {
+        std::string value;
+        if (i == 6) {
+          // One non-compressible block
+          value = rnd.RandomBinaryString(20000);
+        } else {
+          test::CompressibleString(&rnd, 0.1, 20000, &value);
         }
-        ASSERT_EQ(Get(Key(kCount)), "NOT_FOUND");
+        values.push_back(value);
+        ASSERT_OK(Put(Key(i), value));
+        ASSERT_EQ(Get(Key(i)), value);
       }
+      ASSERT_OK(Flush());
+
+      // Ensure well-formed for reads
+      for (int i = 0; i < kCount; ++i) {
+        ASSERT_NE(Get(Key(i)), "NOT_FOUND");
+        ASSERT_EQ(Get(Key(i)), values[i]);
+      }
+      ASSERT_EQ(Get(Key(kCount)), "NOT_FOUND");
     }
   }
 }
+
 TEST_F(DBTest2, CompressionManagerWrapper) {
   // Test that we can use a custom CompressionManager to wrap the built-in
   // CompressionManager, thus adopting a custom *strategy* based on existing
@@ -2103,110 +2094,19 @@ TEST_F(DBTest2, CompressionManagerWrapper) {
   }
 }
 
-namespace {
-template <CompressionType kCompression>
-struct CompressorCustomAlg : public CompressorWrapper {
-  explicit CompressorCustomAlg(const CompressionOptions& opts)
-      : CompressorWrapper(GetDefaultBuiltinCompressionManager()->GetCompressor(
-            opts, kSnappyCompression)) {}
-
-  explicit CompressorCustomAlg(std::unique_ptr<Compressor> compressor)
-      : CompressorWrapper(std::move(compressor)) {}
-
-  const char* Name() const override { return "CompressorCustomAlg"; }
-
-  Status CompressBlock(Slice uncompressed_data, std::string* compressed_output,
-                       CompressionType* out_compression_type,
-                       ManagedWorkingArea* working_area) override {
-    Status s = wrapped_->CompressBlock(uncompressed_data, compressed_output,
-                                       out_compression_type, working_area);
-    if (*out_compression_type != kNoCompression) {
-      assert(*out_compression_type == kSnappyCompression);
-      compressed_output->insert(/*pos=*/0, /*count=*/1,
-                                lossless_cast<char>(kCompression));
-      *out_compression_type = kCompression;
-    }
-    return s;
-  }
-
-  std::unique_ptr<Compressor> MaybeCloneSpecialized(
-      CacheEntryRole block_type, DictSampleArgs&& dict_samples) override {
-    std::unique_ptr<Compressor> rv =
-        wrapped_->MaybeCloneSpecialized(block_type, std::move(dict_samples));
-    if (rv) {
-      rv = std::make_unique<CompressorCustomAlg>(std::move(rv));
-    }
-    return rv;
-  }
-};
-
-struct DecompressorCustomAlg : public DecompressorWrapper {
-  DecompressorCustomAlg()
-      : DecompressorWrapper(
-            GetDefaultBuiltinCompressionManager()->GetDecompressor()) {}
-
-  explicit DecompressorCustomAlg(std::shared_ptr<Decompressor> decompressor)
-      : DecompressorWrapper(std::move(decompressor)) {}
-
-  const char* Name() const override { return "DecompressorCustomAlg"; }
-
-  Status MaybeCloneForDict(const Slice& serialized_dict,
-                           std::unique_ptr<Decompressor>* out) override {
-    Status s = wrapped_->MaybeCloneForDict(serialized_dict, out);
-    if (s.ok()) {
-      *out = std::make_unique<DecompressorCustomAlg>(std::move(*out));
-    }
-    return s;
-  }
-
-  Status ExtractUncompressedSize(Args& args) override {
-    if (args.compression_type > kLastBuiltinCompression) {
-      assert(args.compressed_data.size() > 0);
-      assert(args.compressed_data[0] ==
-             lossless_cast<char>(args.compression_type));
-      // It's ok to modify args if we restore to original
-      SaveAndRestore<Slice> save_compressed_slice(&args.compressed_data);
-      args.compressed_data.remove_prefix(1);
-      SaveAndRestore<CompressionType> save_compression_type(
-          &args.compression_type);
-      args.compression_type = kSnappyCompression;
-      return wrapped_->ExtractUncompressedSize(args);
-    } else {
-      // Also support built-in compressions
-      return wrapped_->ExtractUncompressedSize(args);
-    }
-  }
-
-  Status DecompressBlock(const Args& args, char* uncompressed_output) override {
-    if (args.compression_type > kLastBuiltinCompression) {
-      assert(args.compressed_data.size() > 0);
-      assert(args.compressed_data[0] ==
-             lossless_cast<char>(args.compression_type));
-      // Or we can copy args and modify
-      Args modified_args = args;
-      modified_args.compressed_data.remove_prefix(1);
-      modified_args.compression_type = kSnappyCompression;
-      return wrapped_->DecompressBlock(modified_args, uncompressed_output);
-    } else {
-      // Also support built-in compressions
-      return wrapped_->DecompressBlock(args, uncompressed_output);
-    }
-  }
-};
-}  // anonymous namespace
-
 TEST_F(DBTest2, CompressionManagerCustomCompression) {
-  if (!Snappy_Supported()) {
-    fprintf(stderr, "snappy compression not supported, skip this test\n");
-    return;
-  }
-
   // Test that we can use a custom CompressionManager to implement custom
   // compression algorithms, and that there are appropriate schema guard rails
   // to ensure data is not processed by the wrong algorithm.
-  using Compressor8A = CompressorCustomAlg<kCustomCompression8A>;
-  using Compressor8B = CompressorCustomAlg<kCustomCompression8B>;
-  using Compressor8C = CompressorCustomAlg<kCustomCompression8C>;
+  using Compressor8A = test::CompressorCustomAlg<kCustomCompression8A>;
+  using Compressor8B = test::CompressorCustomAlg<kCustomCompression8B>;
+  using Compressor8C = test::CompressorCustomAlg<kCustomCompression8C>;
+
+  if (!Compressor8A::Supported() || !LZ4_Supported()) {
+    fprintf(stderr,
+            "Prerequisite compression library not supported. Skipping\n");
+    return;
+  }
 
   class MyManager : public CompressionManager {
    public:
@@ -2230,13 +2130,13 @@ TEST_F(DBTest2, CompressionManagerCustomCompression) {
       switch (static_cast<unsigned char>(type)) {
         case kCustomCompression8A:
           used_compressor8A_count_++;
-          return std::make_unique<Compressor8A>(opts);
+          return std::make_unique<Compressor8A>();
         case kCustomCompression8B:
           used_compressor8B_count_++;
-          return std::make_unique<Compressor8B>(opts);
+          return std::make_unique<Compressor8B>();
         case kCustomCompression8C:
           used_compressor8C_count_++;
-          return std::make_unique<Compressor8C>(opts);
+          return std::make_unique<Compressor8C>();
         // Also support built-in compression algorithms
         default:
           return GetDefaultBuiltinCompressionManager()->GetCompressor(opts,
@@ -2244,9 +2144,8 @@ TEST_F(DBTest2, CompressionManagerCustomCompression) {
       }
     }
 
-    // TODO: test limited-scope decompressors
     std::shared_ptr<Decompressor> GetDecompressor() override {
-      return std::make_shared<DecompressorCustomAlg>();
+      return std::make_shared<test::DecompressorCustomAlg>();
     }
 
     CompressionType last_specific_decompressor_type_ = kNoCompression;
@@ -2256,7 +2155,9 @@ TEST_F(DBTest2, CompressionManagerCustomCompression) {
         const CompressionType* types_end) override {
       assert(types_end > types_begin);
       last_specific_decompressor_type_ = *types_begin;
-      return std::make_shared<DecompressorCustomAlg>();
+      auto decomp = std::make_shared<test::DecompressorCustomAlg>();
+      decomp->SetAllowedTypes(types_begin, types_end);
+      return decomp;
     }
 
     void AddFriend(const std::shared_ptr<CompressionManager>& mgr) {
@@ -2283,181 +2184,192 @@ TEST_F(DBTest2, CompressionManagerCustomCompression) {
     std::map<std::string, std::weak_ptr<CompressionManager>> friends_;
   };
 
-  // Although these compression managers are actually compatible, we must
-  // respect their distinct compatibility names and treat them as incompatible
-  // (or else risk processing data incorrectly)
-  // NOTE: these are not registered in ObjectRegistry to test what happens
-  // when the original CompressionManager might not be available.
-  auto mgr_foo = std::make_shared<MyManager>("Foo");
-  auto mgr_bar = std::make_shared<MyManager>("Bar");
+  for (bool use_dict : {false, true}) {
+    SCOPED_TRACE(use_dict ? "With dict" : "No dict");
 
-  // And this one claims to be fully compatible with the built-in compression
-  // manager when it's not fully compatible (for custom CompressionTypes)
-  auto mgr_claim_compatible = std::make_shared<MyManager>("BuiltinV2");
+    // Although these compression managers are actually compatible, we must
+    // respect their distinct compatibility names and treat them as incompatible
+    // (or else risk processing data incorrectly)
+    // NOTE: these are not registered in ObjectRegistry to test what happens
+    // when the original CompressionManager might not be available.
+    auto mgr_foo = std::make_shared<MyManager>("Foo");
+    auto mgr_bar = std::make_shared<MyManager>("Bar");
 
-  Options options = CurrentOptions();
-  options.level0_file_num_compaction_trigger = 20;
-  BlockBasedTableOptions bbto;
-  bbto.enable_index_compression = false;
-  bbto.format_version = 6;  // Before custom compression alg support
-  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
-  // Claims not to use custom compression (and doesn't unless setting a custom
-  // CompressionType)
-  options.compression_manager = mgr_claim_compatible;
-  // Use a built-in compression type
-  options.compression = kSnappyCompression;
-  DestroyAndReopen(options);
+    // And this one claims to be fully compatible with the built-in compression
+    // manager when it's not fully compatible (for custom CompressionTypes)
+    auto mgr_claim_compatible = std::make_shared<MyManager>("BuiltinV2");
 
-  constexpr uint16_t kValueSize = 10000;
-  Random rnd(404);
-  std::string value;
-  ASSERT_OK(Put("a", test::CompressibleString(&rnd, 0.1, kValueSize, &value)));
-  ASSERT_OK(Flush());
+    constexpr uint16_t kValueSize = 10000;
 
-  // That data should be readable without access to the original compression
-  // manager, because it used the built-in CompatibilityName and a built-in
-  // CompressionType
-  options.compression_manager = nullptr;
-  Reopen(options);
-  ASSERT_EQ(Get("a"), value);
-
-  // Verify it was compressed
-  Range r = {"a", "a0"};
-  TablePropertiesCollection tables_properties;
-  ASSERT_OK(db_->GetPropertiesOfTablesInRange(db_->DefaultColumnFamily(), &r, 1,
-                                              &tables_properties));
-  ASSERT_EQ(tables_properties.size(), 1U);
-  EXPECT_LT(tables_properties.begin()->second->data_size, kValueSize / 2);
-  EXPECT_EQ(tables_properties.begin()->second->compression_name, "Snappy");
-
-  // Disallow setting a custom CompressionType with a CompressionManager
-  // claiming to be built-in compatible.
-  options.compression_manager = mgr_claim_compatible;
-  options.compression = kCustomCompression8A;
-  ASSERT_EQ(TryReopen(options).code(), Status::Code::kInvalidArgument);
-
-  options.compression_manager = nullptr;
-  options.compression = kCustomCompressionFE;
-  ASSERT_EQ(TryReopen(options).code(), Status::Code::kInvalidArgument);
-  options.compression =
-      static_cast<CompressionType>(kLastBuiltinCompression + 1);
-  ASSERT_EQ(TryReopen(options).code(), Status::Code::kInvalidArgument);
-
-  // Custom compression schema (different CompatibilityName) not supported
-  // before format_version=7
-  options.compression_manager = mgr_foo;
-  options.compression = kSnappyCompression;
-  ASSERT_EQ(TryReopen(options).code(), Status::Code::kInvalidArgument);
-
-  // TODO: eliminate this hack when format_version=7 is published
-  SaveAndRestore guard(&TEST_AllowUnsupportedFormatVersion(), true);
-
-  // Set new format version
-  bbto.format_version = 7;
-  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+    Options options = CurrentOptions();
+    options.level0_file_num_compaction_trigger = 20;
+    BlockBasedTableOptions bbto;
+    bbto.enable_index_compression = false;
+    bbto.format_version = 6;  // Before custom compression alg support
+    options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+    // Claims not to use custom compression (and doesn't unless setting a custom
+    // CompressionType)
+    options.compression_manager = mgr_claim_compatible;
+    // Use a built-in compression type with dictionary support
+    options.compression = kLZ4Compression;
+    options.compression_opts.max_dict_bytes = kValueSize / 2;
+    DestroyAndReopen(options);
 
-  // Custom compression type not supported with built-in schema name, even with
-  // format_version=7
-  options.compression_manager = mgr_claim_compatible;
-  options.compression = kCustomCompression8B;
-  ASSERT_EQ(TryReopen(options).code(), Status::Code::kInvalidArgument);
+    Random rnd(404);
+    std::string value;
+    ASSERT_OK(
+        Put("a", test::CompressibleString(&rnd, 0.1, kValueSize, &value)));
+    ASSERT_OK(Flush());
 
-  // Using a built-in compression type with fv=7 but named custom schema
-  options.compression_manager = mgr_foo;
-  options.compression = kSnappyCompression;
-  Reopen(options);
-  ASSERT_OK(Put("b", test::CompressibleString(&rnd, 0.1, kValueSize, &value)));
-  ASSERT_OK(Flush());
-  ASSERT_EQ(NumTableFilesAtLevel(0), 2);
-  ASSERT_EQ(Get("b"), value);
-
-  // Verify it was compressed with snappy
-  r = {"b", "b0"};
-  tables_properties.clear();
-  ASSERT_OK(db_->GetPropertiesOfTablesInRange(db_->DefaultColumnFamily(), &r, 1,
-                                              &tables_properties));
-  ASSERT_EQ(tables_properties.size(), 1U);
-  EXPECT_LT(tables_properties.begin()->second->data_size, kValueSize / 2);
-  // Uses new format for "compression_name" property
-  EXPECT_EQ(tables_properties.begin()->second->compression_name, "Foo;01;");
-  EXPECT_EQ(mgr_foo->last_specific_decompressor_type_, kSnappyCompression);
-
-  // Custom compression type
-  options.compression = kCustomCompression8A;
-  Reopen(options);
-  ASSERT_OK(Put("c", test::CompressibleString(&rnd, 0.1, kValueSize, &value)));
-  EXPECT_EQ(mgr_foo->used_compressor8A_count_, 0);
-  ASSERT_OK(Flush());
-  ASSERT_EQ(NumTableFilesAtLevel(0), 3);
-  ASSERT_EQ(Get("c"), value);
-  EXPECT_EQ(mgr_foo->used_compressor8A_count_, 1);
-
-  // Verify it was compressed with custom format
-  r = {"c", "c0"};
-  tables_properties.clear();
-  ASSERT_OK(db_->GetPropertiesOfTablesInRange(db_->DefaultColumnFamily(), &r, 1,
-                                              &tables_properties));
-  ASSERT_EQ(tables_properties.size(), 1U);
-  EXPECT_LT(tables_properties.begin()->second->data_size, kValueSize / 2);
-  EXPECT_EQ(tables_properties.begin()->second->compression_name, "Foo;8A;");
-  EXPECT_EQ(mgr_foo->last_specific_decompressor_type_, kCustomCompression8A);
-
-  // Also dynamically changeable, because the compression manager will respect
-  // the current setting as reported under the legacy logic
-  ASSERT_OK(dbfull()->SetOptions({{"compression", "kSnappyCompression"}}));
-  ASSERT_OK(Put("d", test::CompressibleString(&rnd, 0.1, kValueSize, &value)));
-  ASSERT_OK(Flush());
-  ASSERT_EQ(NumTableFilesAtLevel(0), 4);
-  ASSERT_EQ(Get("d"), value);
-
-  // Verify it was compressed with snappy
-  r = {"d", "d0"};
-  tables_properties.clear();
-  ASSERT_OK(db_->GetPropertiesOfTablesInRange(db_->DefaultColumnFamily(), &r, 1,
-                                              &tables_properties));
-  ASSERT_EQ(tables_properties.size(), 1U);
-  EXPECT_LT(tables_properties.begin()->second->data_size, kValueSize / 2);
-  EXPECT_EQ(tables_properties.begin()->second->compression_name, "Foo;01;");
-  EXPECT_EQ(mgr_foo->last_specific_decompressor_type_, kSnappyCompression);
-
-  // Dynamically changeable to custom compressions also
-  ASSERT_OK(dbfull()->SetOptions({{"compression", "kCustomCompression8B"}}));
-  ASSERT_OK(Put("e", test::CompressibleString(&rnd, 0.1, kValueSize, &value)));
-  ASSERT_OK(Flush());
-  ASSERT_EQ(NumTableFilesAtLevel(0), 5);
-  ASSERT_EQ(Get("e"), value);
-
-  // Verify it was compressed with custom format
-  r = {"e", "e0"};
-  tables_properties.clear();
-  ASSERT_OK(db_->GetPropertiesOfTablesInRange(db_->DefaultColumnFamily(), &r, 1,
-                                              &tables_properties));
-  ASSERT_EQ(tables_properties.size(), 1U);
-  EXPECT_LT(tables_properties.begin()->second->data_size, kValueSize / 2);
-  EXPECT_EQ(tables_properties.begin()->second->compression_name, "Foo;8B;");
-  EXPECT_EQ(mgr_foo->last_specific_decompressor_type_, kCustomCompression8B);
-
-  // Fails to re-open with incompatible compression manager (can't find
-  // compression manager Foo because it's not registered nor known by Bar)
-  options.compression_manager = mgr_bar;
-  options.compression = kSnappyCompression;
-  ASSERT_EQ(TryReopen(options).code(), Status::Code::kNotFound);
-
-  // But should re-open if we make Bar aware of the Foo compression manager
-  mgr_bar->AddFriend(mgr_foo);
-  Reopen(options);
+    // That data should be readable without access to the original compression
+    // manager, because it used the built-in CompatibilityName and a built-in
+    // CompressionType
+    options.compression_manager = nullptr;
+    Reopen(options);
+    ASSERT_EQ(Get("a"), value);
+
+    // Verify it was compressed
+    Range r = {"a", "a0"};
+    TablePropertiesCollection tables_properties;
+    ASSERT_OK(db_->GetPropertiesOfTablesInRange(db_->DefaultColumnFamily(), &r,
+                                                1, &tables_properties));
+    ASSERT_EQ(tables_properties.size(), 1U);
+    EXPECT_LT(tables_properties.begin()->second->data_size, kValueSize / 2);
+    EXPECT_EQ(tables_properties.begin()->second->compression_name, "LZ4");
+
+    // Disallow setting a custom CompressionType with a CompressionManager
+    // claiming to be built-in compatible.
+    options.compression_manager = mgr_claim_compatible;
+    options.compression = kCustomCompression8A;
+    ASSERT_EQ(TryReopen(options).code(), Status::Code::kInvalidArgument);
+
+    options.compression_manager = nullptr;
+    options.compression = kCustomCompressionFE;
+    ASSERT_EQ(TryReopen(options).code(), Status::Code::kInvalidArgument);
+    options.compression =
+        static_cast<CompressionType>(kLastBuiltinCompression + 1);
+    ASSERT_EQ(TryReopen(options).code(), Status::Code::kInvalidArgument);
+
+    // Custom compression schema (different CompatibilityName) not supported
+    // before format_version=7
+    options.compression_manager = mgr_foo;
+    options.compression = kLZ4Compression;
+    ASSERT_EQ(TryReopen(options).code(), Status::Code::kInvalidArgument);
+
+    // TODO: eliminate this hack when format_version=7 is published
+    SaveAndRestore guard(&TEST_AllowUnsupportedFormatVersion(), true);
+
+    // Set new format version
+    bbto.format_version = 7;
+    options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+    // Custom compression type not supported with built-in schema name, even
+    // with format_version=7
+    options.compression_manager = mgr_claim_compatible;
+    options.compression = kCustomCompression8B;
+    ASSERT_EQ(TryReopen(options).code(), Status::Code::kInvalidArgument);
 
-  // Can still read everything
-  ASSERT_EQ(Get("a").size(), kValueSize);
-  ASSERT_EQ(Get("b").size(), kValueSize);
-  ASSERT_EQ(Get("c").size(), kValueSize);
-  ASSERT_EQ(Get("d").size(), kValueSize);
-  ASSERT_EQ(Get("e").size(), kValueSize);
-
-  // TODO: mix of compatibility names in same DB
-  // TODO: test old version of a compression manager unable to read a
-  // compression type
-  // TODO: test getting compression manager from object registry
+    // Using a built-in compression type with fv=7 but named custom schema
+    options.compression_manager = mgr_foo;
+    options.compression = kLZ4Compression;
+    Reopen(options);
+    ASSERT_OK(
+        Put("b", test::CompressibleString(&rnd, 0.1, kValueSize, &value)));
+    ASSERT_OK(Flush());
+    ASSERT_EQ(NumTableFilesAtLevel(0), 2);
+    ASSERT_EQ(Get("b"), value);
+
+    // Verify it was compressed with snappy
+    r = {"b", "b0"};
+    tables_properties.clear();
+    ASSERT_OK(db_->GetPropertiesOfTablesInRange(db_->DefaultColumnFamily(), &r,
+                                                1, &tables_properties));
+    ASSERT_EQ(tables_properties.size(), 1U);
+    EXPECT_LT(tables_properties.begin()->second->data_size, kValueSize / 2);
+    // Uses new format for "compression_name" property
+    EXPECT_EQ(tables_properties.begin()->second->compression_name, "Foo;04;");
+    EXPECT_EQ(mgr_foo->last_specific_decompressor_type_, kLZ4Compression);
+
+    // Custom compression type
+    options.compression = kCustomCompression8A;
+    Reopen(options);
+    ASSERT_OK(
+        Put("c", test::CompressibleString(&rnd, 0.1, kValueSize, &value)));
+    EXPECT_EQ(mgr_foo->used_compressor8A_count_, 0);
+    ASSERT_OK(Flush());
+    ASSERT_EQ(NumTableFilesAtLevel(0), 3);
+    ASSERT_EQ(Get("c"), value);
+    EXPECT_EQ(mgr_foo->used_compressor8A_count_, 1);
+
+    // Verify it was compressed with custom format
+    r = {"c", "c0"};
+    tables_properties.clear();
+    ASSERT_OK(db_->GetPropertiesOfTablesInRange(db_->DefaultColumnFamily(), &r,
+                                                1, &tables_properties));
+    ASSERT_EQ(tables_properties.size(), 1U);
+    EXPECT_LT(tables_properties.begin()->second->data_size, kValueSize / 2);
+    EXPECT_EQ(tables_properties.begin()->second->compression_name, "Foo;8A;");
+    EXPECT_EQ(mgr_foo->last_specific_decompressor_type_, kCustomCompression8A);
+
+    // Also dynamically changeable, because the compression manager will respect
+    // the current setting as reported under the legacy logic
+    ASSERT_OK(dbfull()->SetOptions({{"compression", "kLZ4Compression"}}));
+    ASSERT_OK(
+        Put("d", test::CompressibleString(&rnd, 0.1, kValueSize, &value)));
+    ASSERT_OK(Flush());
+    ASSERT_EQ(NumTableFilesAtLevel(0), 4);
+    ASSERT_EQ(Get("d"), value);
+
+    // Verify it was compressed with snappy
+    r = {"d", "d0"};
+    tables_properties.clear();
+    ASSERT_OK(db_->GetPropertiesOfTablesInRange(db_->DefaultColumnFamily(), &r,
+                                                1, &tables_properties));
+    ASSERT_EQ(tables_properties.size(), 1U);
+    EXPECT_LT(tables_properties.begin()->second->data_size, kValueSize / 2);
+    EXPECT_EQ(tables_properties.begin()->second->compression_name, "Foo;04;");
+    EXPECT_EQ(mgr_foo->last_specific_decompressor_type_, kLZ4Compression);
+
+    // Dynamically changeable to custom compressions also
+    ASSERT_OK(dbfull()->SetOptions({{"compression", "kCustomCompression8B"}}));
+    ASSERT_OK(
+        Put("e", test::CompressibleString(&rnd, 0.1, kValueSize, &value)));
+    ASSERT_OK(Flush());
+    ASSERT_EQ(NumTableFilesAtLevel(0), 5);
+    ASSERT_EQ(Get("e"), value);
+
+    // Verify it was compressed with custom format
+    r = {"e", "e0"};
+    tables_properties.clear();
+    ASSERT_OK(db_->GetPropertiesOfTablesInRange(db_->DefaultColumnFamily(), &r,
+                                                1, &tables_properties));
+    ASSERT_EQ(tables_properties.size(), 1U);
+    EXPECT_LT(tables_properties.begin()->second->data_size, kValueSize / 2);
+    EXPECT_EQ(tables_properties.begin()->second->compression_name, "Foo;8B;");
+    EXPECT_EQ(mgr_foo->last_specific_decompressor_type_, kCustomCompression8B);
+
+    // Fails to re-open with incompatible compression manager (can't find
+    // compression manager Foo because it's not registered nor known by Bar)
+    options.compression_manager = mgr_bar;
+    options.compression = kLZ4Compression;
+    ASSERT_EQ(TryReopen(options).code(), Status::Code::kNotSupported);
+
+    // But should re-open if we make Bar aware of the Foo compression manager
+    mgr_bar->AddFriend(mgr_foo);
+    Reopen(options);
+
+    // Can still read everything
+    ASSERT_EQ(Get("a").size(), kValueSize);
+    ASSERT_EQ(Get("b").size(), kValueSize);
+    ASSERT_EQ(Get("c").size(), kValueSize);
+    ASSERT_EQ(Get("d").size(), kValueSize);
+    ASSERT_EQ(Get("e").size(), kValueSize);
+
+    // TODO: mix of compatibility names in same DB
+    // TODO: test old version of a compression manager unable to read a
+    // compression type
+    // TODO: test getting compression manager from object registry
+  }
 }
 
 class CompactionStallTestListener : public EventListener {
diff --git a/db_stress_tool/db_stress_compression_manager.h b/db_stress_tool/db_stress_compression_manager.h
new file mode 100644
index 000000000000..0c41517b186e
--- /dev/null
+++ b/db_stress_tool/db_stress_compression_manager.h
@@ -0,0 +1,65 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "test_util/testutil.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DbStressCustomCompressionManager : public CompressionManager {
+ public:
+  const char* Name() const override {
+    return "DbStressCustomCompressionManager";
+  }
+  const char* CompatibilityName() const override { return "DbStressCustom1"; }
+
+  bool SupportsCompressionType(CompressionType type) const override {
+    return default_->SupportsCompressionType(type) ||
+           type == kCustomCompressionAA || type == kCustomCompressionAB ||
+           type == kCustomCompressionAC;
+  }
+
+  std::unique_ptr<Compressor> GetCompressor(const CompressionOptions& opts,
+                                            CompressionType type) override {
+    // db_stress never specifies a custom type, so we randomly use them anyway
+    // when this compression manager is used.
+    std::array<CompressionType, 4> choices = {
+        type, kCustomCompressionAA, kCustomCompressionAB, kCustomCompressionAC};
+    type = choices[Random::GetTLSInstance()->Uniform(4)];
+    switch (static_cast<unsigned char>(type)) {
+      case kCustomCompressionAA:
+        return std::make_unique<
+            test::CompressorCustomAlg<kCustomCompressionAA>>();
+      case kCustomCompressionAB:
+        return std::make_unique<
+            test::CompressorCustomAlg<kCustomCompressionAB>>();
+      case kCustomCompressionAC:
+        return std::make_unique<
+            test::CompressorCustomAlg<kCustomCompressionAC>>();
+      // Also support built-in compression algorithms
+      default:
+        return GetDefaultBuiltinCompressionManager()->GetCompressor(opts, type);
+    }
+  }
+
+  std::shared_ptr<Decompressor> GetDecompressor() override {
+    return std::make_shared<test::DecompressorCustomAlg>();
+  }
+
+  std::shared_ptr<Decompressor> GetDecompressorForTypes(
+      const CompressionType* types_begin,
+      const CompressionType* types_end) override {
+    auto decomp = std::make_shared<test::DecompressorCustomAlg>();
+    decomp->SetAllowedTypes(types_begin, types_end);
+    return decomp;
+  }
+
+ protected:
+  std::shared_ptr<CompressionManager> default_ =
+      GetDefaultBuiltinCompressionManager();
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc
index 89bf0189bf95..5b9f37089bd2 100644
--- a/db_stress_tool/db_stress_test_base.cc
+++ b/db_stress_tool/db_stress_test_base.cc
@@ -11,6 +11,7 @@
 #include <ios>
 #include <thread>
 
+#include "db_stress_tool/db_stress_compression_manager.h"
 #include "db_stress_tool/db_stress_listener.h"
 #include "rocksdb/io_status.h"
 #include "rocksdb/options.h"
@@ -3412,29 +3413,34 @@ void StressTest::Open(SharedState* shared, bool reopen) {
     InitializeOptionsFromFlags(cache_, filter_policy_, options_);
   }
   InitializeOptionsGeneral(cache_, filter_policy_, sqfc_factory_, options_);
-  if (!strcasecmp(FLAGS_compression_manager.c_str(), "mixed") ||
-      !strcasecmp(FLAGS_compression_manager.c_str(), "randommixed")) {
-    // Currently limited to ZSTD compression. Table property compression_name
-    // needs to set to zstd for now even when there can be more than one
-    // algorithm in the table under your compressor.
-    if (!ZSTD_Supported()) {
-      fprintf(stderr,
-              "ZSTD compression not supported thus mixed compression cannot be "
-              "used\n");
-      exit(1);
-    }
-    if (!strcasecmp(FLAGS_compression_manager.c_str(), "mixed")) {
-      auto mgr = std::make_shared<RoundRobinManager>(
-          GetDefaultBuiltinCompressionManager());
-      options_.compression_manager = mgr;
-    } else if (!strcasecmp(FLAGS_compression_manager.c_str(), "randommixed")) {
-      auto mgr = std::make_shared<RandomMixedCompressionManager>(
-          GetDefaultBuiltinCompressionManager());
-      options_.compression_manager = mgr;
-    }
-    options_.compression = kZSTD;
-    options_.bottommost_compression = kZSTD;
-
+  {
+    // We must register any compression managers with a custom
+    // CompatibilityName() so that if it was used in a past invocation but not
+    // the current invocation, we can still read the SST files requiring it.
+    static std::once_flag loaded;
+    std::call_once(loaded, [&]() {
+      TEST_AllowUnsupportedFormatVersion() = true;
+      auto& library = *ObjectLibrary::Default();
+      library.AddFactory<CompressionManager>(
+          DbStressCustomCompressionManager().CompatibilityName(),
+          [](const std::string& /*uri*/,
+             std::unique_ptr<CompressionManager>* guard,
+             std::string* /*errmsg*/) {
+            *guard = std::make_unique<DbStressCustomCompressionManager>();
+            return guard->get();
+          });
+    });
+  }
+  if (!strcasecmp(FLAGS_compression_manager.c_str(), "custom")) {
+    options_.compression_manager =
+        std::make_shared<DbStressCustomCompressionManager>();
+  } else if (!strcasecmp(FLAGS_compression_manager.c_str(), "mixed")) {
+    options_.compression_manager = std::make_shared<RoundRobinManager>(
+        GetDefaultBuiltinCompressionManager());
+  } else if (!strcasecmp(FLAGS_compression_manager.c_str(), "randommixed")) {
+    options_.compression_manager =
+        std::make_shared<RandomMixedCompressionManager>(
+            GetDefaultBuiltinCompressionManager());
   } else if (!strcasecmp(FLAGS_compression_manager.c_str(), "autoskip")) {
     options_.compression_manager =
         CreateAutoSkipCompressionManager(GetDefaultBuiltinCompressionManager());
diff --git a/include/rocksdb/advanced_compression.h b/include/rocksdb/advanced_compression.h
index 3f5bf231ab2a..2f989cd4e410 100644
--- a/include/rocksdb/advanced_compression.h
+++ b/include/rocksdb/advanced_compression.h
@@ -273,11 +273,11 @@ class Decompressor {
   // dictionary is processed into a form reusable by repeated compressions in
   // many threads, that happens within this call.
   //
-  // Must return OK if storing a result in `out`. Otherwise, could return values
-  // like NotSupported - dictionary compression is not (yet) supported for this
-  // kind of Decompressor.
-  // Corruption - dictionary is malformed (though many implementations will
-  // accept any data as a dictionary)
+  // Must return OK if and only if storing a result in `out`. Otherwise, could
+  // return values like NotSupported - dictionary compression is not (yet)
+  // supported for this kind of Decompressor. Corruption - dictionary is
+  // malformed (though many implementations will accept any data as a
+  // dictionary)
   virtual Status MaybeCloneForDict(const Slice& /*serialized_dict*/,
                                    std::unique_ptr<Decompressor>* /*out*/) {
     return Status::NotSupported(
diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc
index 294f1b9e1f74..160ce21e5919 100644
--- a/options/options_settable_test.cc
+++ b/options/options_settable_test.cc
@@ -703,8 +703,12 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) {
       new_options->compaction_options_fifo.file_temperature_age_thresholds[0]
           .age,
       12345);
-  ASSERT_EQ(new_options->compression_manager,
-            GetBuiltinCompressionManager(/*compression_format_version*/ 2));
+  // TODO: try to enhance ObjectLibrary to support singletons
+  // ASSERT_EQ(new_options->compression_manager,
+  //           GetBuiltinCompressionManager(/*compression_format_version*/ 2));
+  ASSERT_STREQ(
+      new_options->compression_manager->Name(),
+      GetBuiltinCompressionManager(/*compression_format_version*/ 2)->Name());
 
   ColumnFamilyOptions rnd_filled_options = *new_options;
 
diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc
index 7ea8b3ed658c..8292c369c83b 100644
--- a/table/block_based/block_based_table_builder.cc
+++ b/table/block_based/block_based_table_builder.cc
@@ -832,15 +832,17 @@ struct BlockBasedTableBuilder::Rep {
       if (table_options.verify_compression) {
         verify_decompressor = basic_decompressor.get();
         if (table_options.enable_index_compression) {
-          basic_working_area.verify =
-              verify_decompressor->ObtainWorkingArea(tbo.compression_type);
+          basic_working_area.verify = verify_decompressor->ObtainWorkingArea(
+              basic_compressor->GetPreferredCompressionType());
         }
         if (state == State::kUnbuffered) {
+          assert(data_block_compressor);
+          data_block_verify_decompressor = verify_decompressor.get();
           for (uint32_t i = 0; i < compression_parallel_threads; i++) {
             data_block_working_areas[i].verify =
-                verify_decompressor->ObtainWorkingArea(tbo.compression_type);
+                data_block_verify_decompressor->ObtainWorkingArea(
+                    data_block_compressor->GetPreferredCompressionType());
           }
-          data_block_verify_decompressor = verify_decompressor.get();
         }
       }
     }
@@ -2075,6 +2077,8 @@ void BlockBasedTableBuilder::EnterUnbuffered() {
     }
   }
 
+  assert(samples.sample_data.size() > 0);
+
   // final sample data block flushed, now we can generate dictionary
   r->compressor_with_dict = r->basic_compressor->MaybeCloneSpecialized(
       CacheEntryRole::kDataBlock, std::move(samples));
diff --git a/test_util/testutil.h b/test_util/testutil.h
index 1713e2dbcc67..99c76550ca74 100644
--- a/test_util/testutil.h
+++ b/test_util/testutil.h
@@ -23,6 +23,7 @@
 #include "rocksdb/slice.h"
 #include "rocksdb/table.h"
 #include "table/internal_iterator.h"
+#include "util/defer.h"
 #include "util/mutexlock.h"
 
 #ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS
@@ -731,6 +732,131 @@ class StringFS : public FileSystemWrapper {
   std::unordered_map<std::string, std::string> files_;
 };
 
+// A compressor that essentially implements a custom compression algorithm
+// by leveraging an existing compression algorithm and putting a custom header
+// on it to detect any attempts to decompress it with the wrong compression
+// type or dictionary.
+template <CompressionType kCompression>
+struct CompressorCustomAlg : public CompressorWrapper {
+  static bool Supported() { return LZ4_Supported(); }
+
+  explicit CompressorCustomAlg(std::unique_ptr<Compressor> wrapped =
+                                   GetDefaultBuiltinCompressionManager()
+                                       ->GetCompressor({}, kLZ4Compression))
+      : CompressorWrapper(std::move(wrapped)),
+        dictionary_hash_(GetSliceHash(wrapped_->GetSerializedDict())) {
+    static_assert(kCompression > kLastBuiltinCompression);
+  }
+
+  const char* Name() const override { return "CompressorCustomAlg"; }
+
+  Status CompressBlock(Slice uncompressed_data, std::string* compressed_output,
+                       CompressionType* out_compression_type,
+                       ManagedWorkingArea* working_area) override {
+    Status s = wrapped_->CompressBlock(uncompressed_data, compressed_output,
+                                       out_compression_type, working_area);
+    if (*out_compression_type != kNoCompression) {
+      assert(*out_compression_type == kLZ4Compression);
+      std::string header(/*size=*/5, 0);
+      header[0] = lossless_cast<char>(kCompression);
+      EncodeFixed32(&header[1], dictionary_hash_);
+      compressed_output->insert(0, header);
+      *out_compression_type = kCompression;
+    }
+    return s;
+  }
+
+  std::unique_ptr<Compressor> MaybeCloneSpecialized(
+      CacheEntryRole block_type, DictSampleArgs&& dict_samples) override {
+    auto clone =
+        wrapped_->MaybeCloneSpecialized(block_type, std::move(dict_samples));
+    return std::make_unique<CompressorCustomAlg>(std::move(clone));
+  }
+
+ protected:
+  uint32_t dictionary_hash_;
+};
+
+// A decompressor suitable for all the instantiable CompressorCustomAlg
+// implementations. Can be configured to check that it is only used to
+// decompress certain types using SetAllowedTypes().
+struct DecompressorCustomAlg : public DecompressorWrapper {
+  using TypeSet = SmallEnumSet<CompressionType, kDisableCompressionOption>;
+
+  DecompressorCustomAlg(
+      std::shared_ptr<Decompressor> wrapped =
+          GetDefaultBuiltinCompressionManager()->GetDecompressor())
+      : DecompressorWrapper(std::move(wrapped)),
+        dictionary_hash_(GetSliceHash(wrapped_->GetSerializedDict())),
+        allowed_types_(TypeSet::All()) {}
+
+  const char* Name() const override { return "DecompressorCustomAlg"; }
+
+  Status MaybeCloneForDict(const Slice& serialized_dict,
+                           std::unique_ptr<Decompressor>* out) override {
+    Status s = wrapped_->MaybeCloneForDict(serialized_dict, out);
+    if (s.ok()) {
+      assert(*out != nullptr);
+      auto clone = std::make_unique<DecompressorCustomAlg>(std::move(*out));
+      clone->SetAllowedTypes(allowed_types_);
+      *out = std::move(clone);
+      assert(out->get()->GetSerializedDict() == serialized_dict);
+    } else {
+      assert(*out == nullptr);
+    }
+    return s;
+  }
+
+  Status ExtractUncompressedSize(Args& args) override {
+    if (args.compression_type > kLastBuiltinCompression) {
+      assert(args.compressed_data.size() > 0);
+      assert(args.compressed_data[0] ==
+             lossless_cast<char>(args.compression_type));
+      assert(DecodeFixed32(args.compressed_data.data() + 1) ==
+             dictionary_hash_);
+      // Strip off our header because ExtractUncompressedSize() is also going
+      // to strip off the uncompressed size data.
+      args.compressed_data.remove_prefix(5);
+      // It's ok to modify other parts of args if we restore to original
+      SaveAndRestore<CompressionType> save_compression_type(
+          &args.compression_type, kLZ4Compression);
+      return wrapped_->ExtractUncompressedSize(args);
+    } else {
+      // Also support built-in compressions
+      return wrapped_->ExtractUncompressedSize(args);
+    }
+  }
+
+  Status DecompressBlock(const Args& args, char* uncompressed_output) override {
+    if (args.compression_type > kLastBuiltinCompression) {
+      // Also allowed to copy args and modify
+      Args modified_args = args;
+      modified_args.compression_type = kLZ4Compression;
+      return wrapped_->DecompressBlock(modified_args, uncompressed_output);
+    } else {
+      // Also support built-in compressions
+      return wrapped_->DecompressBlock(args, uncompressed_output);
+    }
+  }
+
+  void SetAllowedTypes(const CompressionType* types_begin,
+                       const CompressionType* types_end) {
+    TypeSet allowed_types;
+    for (auto type = types_begin; type != types_end; ++type) {
+      allowed_types.Add(*type);
+    }
+    allowed_types_ = std::move(allowed_types);
+  }
+
+  void SetAllowedTypes(TypeSet allowed_types) {
+    allowed_types_ = std::move(allowed_types);
+  }
+
+ protected:
+  uint32_t dictionary_hash_;
+  SmallEnumSet<CompressionType, kDisableCompressionOption> allowed_types_;
+};
+
 // Randomly initialize the given DBOptions
 void RandomInitDBOptions(DBOptions* db_opt, Random* rnd);
 
diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index 61057fa4c370..3d3582c0372c 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -178,7 +178,7 @@
     "verify_checksum": 1,
     "write_buffer_size": lambda: random.choice([1024 * 1024, 4 * 1024 * 1024]),
     "writepercent": 35,
-    "format_version": lambda: random.choice([2, 3, 4, 5, 6, 6]),
+    "format_version": lambda: random.choice([2, 3, 4, 5, 6, 7, 7]),
     "index_block_restart_interval": lambda: random.choice(range(1, 16)),
     "use_multiget": lambda: random.randint(0, 1),
     "use_get_entity": lambda: random.choice([0] * 7 + [1]),
@@ -350,7 +350,11 @@
     "ingest_wbwi_one_in": lambda: random.choice([0, 0, 100, 500]),
     "universal_reduce_file_locking": lambda: random.randint(0, 1),
     "compression_manager": lambda: random.choice(
-        ["mixed"] * 1 + ["none"] * 2 + ["autoskip"] * 2 + ["randommixed"] * 2
+        ["mixed"] * 1
+        + ["none"] * 2
+        + ["autoskip"] * 2
+        + ["randommixed"] * 2
+        + ["custom"] * 3
     ),
 }
 
@@ -1004,15 +1008,17 @@ def finalize_and_sanitize(src_params):
             # have to disable metadata write fault injection to other file
             dest_params["exclude_wal_from_write_fault_injection"] = 1
             dest_params["metadata_write_fault_one_in"] = 0
-    # Disabling block align if mixed manager is neing used
-    if (
+    # Disabling block align if mixed manager is being used
+    if dest_params.get("compression_manager") == "custom":
+        if dest_params.get("block_align") == 1:
+            dest_params["block_align"] = 0
+        if dest_params["format_version"] < 7:
+            dest_params["format_version"] = 7
+    elif (
         dest_params.get("compression_manager") == "mixed"
         or dest_params.get("compression_manager") == "randommixed"
     ):
-        if dest_params.get("block_align") == 1:
-            dest_params["block_align"] = 0
-        dest_params["compression_type"] = "zstd"
-        dest_params["bottommost_compression_type"] = "zstd"
+        dest_params["block_align"] = 0
     elif dest_params.get("compression_manager") == "autoskip":
         # ensuring the compression is being used
         if dest_params.get("compression_type") == "none":
diff --git a/tools/sst_dump_tool.cc b/tools/sst_dump_tool.cc
index d7d784c54689..94bf38245559 100644
--- a/tools/sst_dump_tool.cc
+++ b/tools/sst_dump_tool.cc
@@ -197,6 +197,8 @@ int SSTDumpTool::Run(int argc, char const* const* argv, Options options) {
 
   int64_t tmp_val;
 
+  TEST_AllowUnsupportedFormatVersion() = true;
+
   for (int i = 1; i < argc; i++) {
     if (strncmp(argv[i], "--env_uri=", 10) == 0) {
       env_uri = argv[i] + 10;
diff --git a/util/compression.cc b/util/compression.cc
index e48d8fd19b39..c7eb8267ea40 100644
--- a/util/compression.cc
+++ b/util/compression.cc
@@ -7,6 +7,7 @@
 
 #include "options/options_helper.h"
 #include "rocksdb/convenience.h"
+#include "rocksdb/utilities/object_registry.h"
 
 namespace ROCKSDB_NAMESPACE {
 
@@ -700,13 +701,6 @@ class BuiltinDecompressorV2SnappyOnly : public BuiltinDecompressorV2 {
     assert(args.compression_type == kSnappyCompression);
     return Snappy_DecompressBlock(args, uncompressed_output);
   }
-
-  Status MaybeCloneForDict(const Slice&,
-                           std::unique_ptr<Decompressor>* out) override {
-    // NOTE: quietly ignores the dictionary (for compatibility)
-    *out = std::make_unique<BuiltinDecompressorV2SnappyOnly>();
-    return Status::OK();
-  }
 };
 
 class BuiltinDecompressorV2WithDict : public BuiltinDecompressorV2 {
@@ -752,6 +746,17 @@ class BuiltinDecompressorV2WithDict : public BuiltinDecompressorV2 {
 
 Status BuiltinDecompressorV2::MaybeCloneForDict(
     const Slice& dict, std::unique_ptr<Decompressor>* out) {
+  // Because of unfortunate decisions in handling built-in compression types,
+  // all the compression types before ZSTD that do not actually support
+  // dictionary compression pretend to support it. Specifically, we have to be
+  // able to read files with a compression dictionary block using those
+  // compression types even though the compression dictionary is ignored by
+  // the compression algorithm. And the Decompressor has to return the
+  // configured dictionary from GetSerializedDict() even if it is ignored. This
+  // unfortunately means that a new schema version (BuiltinV3?) would be needed
+  // toactually support dictionary compression in the future for these
+  // algorithms (if the libraries add support).
+  // TODO: can we make this a better/cleaner experience?
   *out = std::make_unique<BuiltinDecompressorV2WithDict>(dict);
   return Status::OK();
 }
@@ -955,27 +960,53 @@ const std::shared_ptr<BuiltinCompressionManagerV2>
 }  // namespace
 
 Status CompressionManager::CreateFromString(
-    const ConfigOptions& config_options, const std::string& id,
+    const ConfigOptions& config_options, const std::string& value,
     std::shared_ptr<CompressionManager>* result) {
-  if (id == kNullptrString || id.empty()) {
+  if (value == kNullptrString || value.empty()) {
     result->reset();
     return Status::OK();
-  } else if (id.compare(kBuiltinCompressionManagerV1->CompatibilityName()) ==
-                 0 ||
-             id.compare(kBuiltinCompressionManagerV1->Name()) == 0) {
-    *result = kBuiltinCompressionManagerV1;
-    return Status::OK();
-  } else if (id.compare(kBuiltinCompressionManagerV2->CompatibilityName()) ==
-                 0 ||
-             id.compare(kBuiltinCompressionManagerV2->Name()) == 0) {
-    *result = kBuiltinCompressionManagerV2;
-    return Status::OK();
-  } else if (config_options.ignore_unsupported_options) {
-    return Status::OK();
+  }
+
+  static std::once_flag loaded;
+  std::call_once(loaded, [&]() {
+    auto& library = *ObjectLibrary::Default();
+    // TODO: try to enhance ObjectLibrary to support singletons
+    library.AddFactory<CompressionManager>(
+        kBuiltinCompressionManagerV1->CompatibilityName(),
+        [](const std::string& /*uri*/,
+           std::unique_ptr<CompressionManager>* guard,
+           std::string* /*errmsg*/) {
+          *guard = std::make_unique<BuiltinCompressionManagerV1>();
+          return guard->get();
+        });
+    library.AddFactory<CompressionManager>(
+        kBuiltinCompressionManagerV2->CompatibilityName(),
+        [](const std::string& /*uri*/,
+           std::unique_ptr<CompressionManager>* guard,
+           std::string* /*errmsg*/) {
+          *guard = std::make_unique<BuiltinCompressionManagerV2>();
+          return guard->get();
+        });
+  });
+
+  std::string id;
+  std::unordered_map<std::string, std::string> opt_map;
+  Status status = Customizable::GetOptionsMap(config_options, result->get(),
+                                              value, &id, &opt_map);
+  if (!status.ok()) {  // GetOptionsMap failed
+    return status;
+  } else if (id.empty()) {  // We have no Id but have options.  Not good
+    return Status::NotSupported("Cannot reset object ", id);
   } else {
-    return Status::NotFound("Compatible compression manager for \"" + id +
-                            "\"");
+    status = config_options.registry->NewSharedObject(id, result);
+  }
+  if (config_options.ignore_unsupported_options && status.IsNotSupported()) {
+    return Status::OK();
+  } else if (status.ok()) {
+    status = Customizable::ConfigureNewObject(config_options, result->get(),
+                                              opt_map);
   }
+  return status;
 }
 
 std::shared_ptr<CompressionManager>
diff --git a/util/compression.h b/util/compression.h
index ef09a33c19d3..896750d22bee 100644
--- a/util/compression.h
+++ b/util/compression.h
@@ -248,6 +248,7 @@ struct DecompressorDict {
       decompressor_ = std::make_unique<FailureDecompressor>(std::move(s));
     } else {
       assert(s.ok());
+      assert(decompressor_->GetSerializedDict() == dict);
     }
 
     memory_usage_ = sizeof(struct DecompressorDict);
diff --git a/util/simple_mixed_compressor.cc b/util/simple_mixed_compressor.cc
index 054a49e19979..00a7562fd530 100644
--- a/util/simple_mixed_compressor.cc
+++ b/util/simple_mixed_compressor.cc
@@ -16,20 +16,16 @@ namespace ROCKSDB_NAMESPACE {
 
 // MultiCompressorWrapper implementation
 MultiCompressorWrapper::MultiCompressorWrapper(const CompressionOptions& opts,
-                                               CompressionType type,
                                                CompressionDict&& dict) {
-  assert(type != kNoCompression);
-  assert(type == kZSTD);
   auto builtInManager = GetDefaultBuiltinCompressionManager();
   const auto& compressions = GetSupportedCompressions();
-  for (auto type_ : compressions) {
-    if (type_ == kNoCompression) {
+  for (auto type : compressions) {
+    if (type == kNoCompression) {
       continue;
     }
-    compressors_.push_back(builtInManager->GetCompressor(opts, type_));
+    compressors_.push_back(builtInManager->GetCompressor(opts, type));
   }
   (void)dict;
-  (void)type;
 }
 
 size_t MultiCompressorWrapper::GetMaxSampleSizeIfWantDict(
@@ -42,7 +38,7 @@ Slice MultiCompressorWrapper::GetSerializedDict() const {
 }
 
 CompressionType MultiCompressorWrapper::GetPreferredCompressionType() const {
-  return kZSTD;
+  return compressors_.back()->GetPreferredCompressionType();
 }
 
 Compressor::ManagedWorkingArea MultiCompressorWrapper::ObtainWorkingArea() {
@@ -51,6 +47,8 @@ Compressor::ManagedWorkingArea MultiCompressorWrapper::ObtainWorkingArea() {
 
 std::unique_ptr<Compressor> MultiCompressorWrapper::MaybeCloneSpecialized(
     CacheEntryRole block_type, DictSampleArgs&& dict_samples) {
+  // TODO: full dictionary compression support. Currently this just falls
+  // back on a non-multi compressor when asked to use a dictionary.
   return compressors_.back()->MaybeCloneSpecialized(block_type,
                                                     std::move(dict_samples));
 }
@@ -75,11 +73,9 @@ const char* RandomMixedCompressionManager::Name() const {
 }
 
 std::unique_ptr<Compressor> RandomMixedCompressionManager::GetCompressorForSST(
-    const FilterBuildingContext& context, const CompressionOptions& opts,
-    CompressionType preferred) {
-  assert(preferred == kZSTD);
-  (void)context;
-  return std::make_unique<RandomMixedCompressor>(opts, preferred);
+    const FilterBuildingContext& /*context*/, const CompressionOptions& opts,
+    CompressionType /*preferred*/) {
+  return std::make_unique<RandomMixedCompressor>(opts);
 }
 
 // RoundRobinCompressor implementation
@@ -103,11 +99,9 @@ RelaxedAtomic<uint64_t> RoundRobinCompressor::block_counter{0};
 const char* RoundRobinManager::Name() const { return "RoundRobinManager"; }
 
 std::unique_ptr<Compressor> RoundRobinManager::GetCompressorForSST(
-    const FilterBuildingContext& context, const CompressionOptions& opts,
-    CompressionType preferred) {
-  assert(preferred == kZSTD);
-  (void)context;
-  return std::make_unique<RoundRobinCompressor>(opts, preferred);
+    const FilterBuildingContext& /*context*/, const CompressionOptions& opts,
+    CompressionType /*preferred*/) {
+  return std::make_unique<RoundRobinCompressor>(opts);
 }
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/util/simple_mixed_compressor.h b/util/simple_mixed_compressor.h
index fd9e1cf3a7a8..69c4cc1490dd 100644
--- a/util/simple_mixed_compressor.h
+++ b/util/simple_mixed_compressor.h
@@ -18,7 +18,6 @@ namespace ROCKSDB_NAMESPACE {
 class MultiCompressorWrapper : public Compressor {
  public:
   explicit MultiCompressorWrapper(const CompressionOptions& opts,
-                                  CompressionType type,
                                   CompressionDict&& dict = {});
 
   size_t GetMaxSampleSizeIfWantDict(CacheEntryRole block_type) const override;

From 190bb0bd241b605382b7225ad7c66bd65dd89a13 Mon Sep 17 00:00:00 2001
From: Maciej Szeszko <mszeszko@meta.com>
Date: Fri, 20 Jun 2025 12:38:32 -0700
Subject: [PATCH 147/500] Disable AutoSkipCompressionManager test (#13715)

Summary:
Auto skip compression manager code is currently running only in context of test / db bench. Disable failing test to unblock monthly minor release.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13715

Test Plan: Disable test.

Reviewed By: hx235

Differential Revision: D77039218

Pulled By: mszeszko-meta

fbshipit-source-id: f9eeec8d5ca4efeaf1f490c5f091b3aff7861a4a
---
 util/compression_test.cc | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/util/compression_test.cc b/util/compression_test.cc
index 5df440c44a90..884ee2794a3d 100644
--- a/util/compression_test.cc
+++ b/util/compression_test.cc
@@ -190,7 +190,12 @@ class DBAutoSkip : public DBTestBase {
   }
 };
 
-TEST_F(DBAutoSkip, AutoSkipCompressionManager) {
+// FIXME: the test is failing the assertion in auto_skip_compressor.cc
+// when run on nightly build in build-linux-arm-test-full mode [1].
+//
+// [1]
+// auto_skip_compressor.cc:101: Assertion `preferred != kNoCompression' failed.
+TEST_F(DBAutoSkip, DISABLED_AutoSkipCompressionManager) {
   if (GetSupportedCompressions().size() > 1) {
     const int kValueSize = 20000;
     // This will set the rejection ratio to 60%

From 78c83ac1ec719c72b444f68c07551fb190a9f739 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Fri, 20 Jun 2025 17:39:47 -0700
Subject: [PATCH 148/500] Publish/support format_version=7, related
 enhancements (#13713)

Summary:
* Make new format_version=7 a supported setting.
* Fix a bug in compressed_secondary_cache.cc that is newly exercised by custom compression types and showing up in crash test with tiered secondary cache
* Small change to handling of disabled compression in fv=7: use empty compression manager compatibility name.
* Get rid of GetDefaultBuiltinCompressionManager() in public API because it could cause unexpected+unsafe schema change on a user's CompressionManager if built upon the default built-in manager and we add a new built-in schema. Now must be referenced by explicit compression schema version in the public API. (That notion was already exposed in compressed secondary cache API, for better or worse.)
* Improve some error messages for compression misconfiguration
* Improve testing with ObjectLibrary and CompressionManagers
* Improve testing of compression_name table property in BlockBasedTableTest.BlockBasedTableProperties2
* Improve some comments

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13713

Test Plan: existing and updated tests. Notably, the crash test has already been running with (unpublished) format_version=7

Reviewed By: mszeszko-meta, hx235

Differential Revision: D77035482

Pulled By: pdillinger

fbshipit-source-id: 95278de8734a79706a22361bff2184b1edb230ca
---
 cache/compressed_secondary_cache.cc           | 13 +--
 db/column_family.cc                           | 50 ++++++----
 db/db_block_cache_test.cc                     |  1 -
 db/db_test.cc                                 |  1 +
 db/db_test2.cc                                | 98 ++++++++++++++-----
 .../db_stress_compression_manager.h           |  4 +-
 db_stress_tool/db_stress_test_base.cc         |  8 +-
 include/rocksdb/advanced_compression.h        | 21 ++--
 include/rocksdb/compression_type.h            |  2 +
 include/rocksdb/table.h                       |  4 +
 include/rocksdb/table_properties.h            |  8 +-
 options/options_settable_test.cc              |  7 +-
 .../block_based/block_based_table_builder.cc  |  5 +-
 table/block_based/block_based_table_reader.cc | 15 ++-
 table/format.h                                |  7 +-
 table/table_test.cc                           | 19 +++-
 test_util/testutil.h                          | 23 +++--
 tools/db_bench_tool.cc                        |  6 +-
 tools/ldb_cmd.cc                              |  4 +-
 .../new_features/format_version_7.md          |  1 +
 util/auto_skip_compressor.cc                  |  2 +-
 util/compression.cc                           |  3 +-
 util/compression.h                            |  7 +-
 util/compression_test.cc                      |  2 +-
 util/simple_mixed_compressor.cc               |  3 +-
 util/slice_test.cc                            | 12 +++
 util/string_util.h                            | 10 ++
 27 files changed, 238 insertions(+), 98 deletions(-)
 create mode 100644 unreleased_history/new_features/format_version_7.md

diff --git a/cache/compressed_secondary_cache.cc b/cache/compressed_secondary_cache.cc
index 70c8ef936891..d4d505d873c4 100644
--- a/cache/compressed_secondary_cache.cc
+++ b/cache/compressed_secondary_cache.cc
@@ -80,15 +80,16 @@ std::unique_ptr<SecondaryCacheResultHandle> CompressedSecondaryCache::Lookup(
     ptr = reinterpret_cast<CacheAllocationPtr*>(handle_value);
     handle_value_charge = cache_->GetCharge(lru_handle);
     data_ptr = ptr->get();
-    data_ptr = GetVarint32Ptr(data_ptr, data_ptr + 1,
-                              static_cast<uint32_t*>(&type_32));
+    const char* limit = ptr->get() + handle_value_charge;
+    data_ptr =
+        GetVarint32Ptr(data_ptr, limit, static_cast<uint32_t*>(&type_32));
     type = static_cast<CompressionType>(type_32);
-    data_ptr = GetVarint32Ptr(data_ptr, data_ptr + 1,
-                              static_cast<uint32_t*>(&source_32));
+    data_ptr =
+        GetVarint32Ptr(data_ptr, limit, static_cast<uint32_t*>(&source_32));
     source = static_cast<CacheTier>(source_32);
     uint64_t data_size = 0;
-    data_ptr = GetVarint64Ptr(data_ptr, ptr->get() + handle_value_charge,
-                              static_cast<uint64_t*>(&data_size));
+    data_ptr =
+        GetVarint64Ptr(data_ptr, limit, static_cast<uint64_t*>(&data_size));
     assert(handle_value_charge > data_size);
     handle_value_charge = data_size;
   }
diff --git a/db/column_family.cc b/db/column_family.cc
index 9cda23eabe16..03d4f8a8c34a 100644
--- a/db/column_family.cc
+++ b/db/column_family.cc
@@ -110,32 +110,48 @@ void GetInternalTblPropCollFactory(
   }
 }
 
-bool CompressionSupportedWithManager(CompressionType type,
-                                     UnownedPtr<CompressionManager> mgr) {
-  return mgr ? mgr->SupportsCompressionType(type)
-             : CompressionTypeSupported(type);
+Status CheckCompressionSupportedWithManager(
+    CompressionType type, UnownedPtr<CompressionManager> mgr) {
+  if (mgr) {
+    if (!mgr->SupportsCompressionType(type)) {
+      return Status::NotSupported("Compression type " +
+                                  CompressionTypeToString(type) +
+                                  " is not recognized/supported by this "
+                                  "version of CompressionManager " +
+                                  mgr->GetId());
+    }
+  } else {
+    if (!CompressionTypeSupported(type)) {
+      if (type <= kLastBuiltinCompression) {
+        return Status::InvalidArgument("Compression type " +
+                                       CompressionTypeToString(type) +
+                                       " is not linked with the binary.");
+      } else {
+        return Status::NotSupported(
+            "Compression type " + CompressionTypeToString(type) +
+            " is not recognized/supported by built-in CompressionManager.");
+      }
+    }
+  }
+  return Status::OK();
 }
 
 Status CheckCompressionSupported(const ColumnFamilyOptions& cf_options) {
   if (!cf_options.compression_per_level.empty()) {
     for (size_t level = 0; level < cf_options.compression_per_level.size();
          ++level) {
-      if (!CompressionSupportedWithManager(
-              cf_options.compression_per_level[level],
-              cf_options.compression_manager.get())) {
-        return Status::InvalidArgument(
-            "Compression type " +
-            CompressionTypeToString(cf_options.compression_per_level[level]) +
-            " is not linked with the binary.");
+      Status s = CheckCompressionSupportedWithManager(
+          cf_options.compression_per_level[level],
+          cf_options.compression_manager.get());
+      if (!s.ok()) {
+        return s;
       }
     }
   } else {
-    if (!CompressionSupportedWithManager(
-            cf_options.compression, cf_options.compression_manager.get())) {
-      return Status::InvalidArgument(
-          "Compression type " +
-          CompressionTypeToString(cf_options.compression) +
-          " is not linked with the binary.");
+    Status s = CheckCompressionSupportedWithManager(
+        cf_options.compression, cf_options.compression_manager.get());
+    if (!s.ok()) {
+      return s;
     }
   }
   if (cf_options.compression_opts.zstd_max_train_bytes > 0) {
diff --git a/db/db_block_cache_test.cc b/db/db_block_cache_test.cc
index 3d65bf9eb6d7..d712e7253fae 100644
--- a/db/db_block_cache_test.cc
+++ b/db/db_block_cache_test.cc
@@ -833,7 +833,6 @@ TEST_F(DBBlockCacheTest, CacheCompressionDict) {
       GetSupportedDictCompressions();
   Random rnd(301);
   // Format version before and after compression handling changes
-  TEST_AllowUnsupportedFormatVersion() = true;
   for (int format_version : {6, 7}) {
     // Test all supported compression types because (at least historically)
     // dictionary compression could be enabled and a dictionary block saved
diff --git a/db/db_test.cc b/db/db_test.cc
index 64958361b598..276b330f6c0b 100644
--- a/db/db_test.cc
+++ b/db/db_test.cc
@@ -6209,6 +6209,7 @@ TEST_F(DBTest, L0L1L2AndUpHitCounter) {
 }
 
 TEST_F(DBTest, EncodeDecompressedBlockSizeTest) {
+  // Allow testing format_version=1
   bool& allow_unsupported_fv = TEST_AllowUnsupportedFormatVersion();
   SaveAndRestore guard(&allow_unsupported_fv);
   ASSERT_FALSE(allow_unsupported_fv);
diff --git a/db/db_test2.cc b/db/db_test2.cc
index 47393b49238a..4eabda2ba258 100644
--- a/db/db_test2.cc
+++ b/db/db_test2.cc
@@ -24,6 +24,7 @@
 #include "rocksdb/persistent_cache.h"
 #include "rocksdb/trace_record.h"
 #include "rocksdb/trace_record_result.h"
+#include "rocksdb/utilities/object_registry.h"
 #include "rocksdb/utilities/replayer.h"
 #include "rocksdb/wal_filter.h"
 #include "test_util/testutil.h"
@@ -1885,8 +1886,8 @@ TEST_F(DBTest2, CompressionOptions) {
 
 TEST_F(DBTest2, RoundRobinManager) {
   if (ZSTD_Supported()) {
-    auto mgr = std::make_shared<RoundRobinManager>(
-        GetDefaultBuiltinCompressionManager());
+    auto mgr =
+        std::make_shared<RoundRobinManager>(GetBuiltinV2CompressionManager());
 
     std::vector<std::string> values;
     for (bool use_wrapper : {true}) {
@@ -1934,7 +1935,7 @@ TEST_F(DBTest2, RoundRobinManager) {
 TEST_F(DBTest2, RandomMixedCompressionManager) {
   if (ZSTD_Supported()) {
     auto mgr = std::make_shared<RandomMixedCompressionManager>(
-        GetDefaultBuiltinCompressionManager());
+        GetBuiltinV2CompressionManager());
     std::vector<std::string> values;
     for (bool use_wrapper : {true}) {
       SCOPED_TRACE((use_wrapper ? "With " : "No ") + std::string("wrapper"));
@@ -2026,7 +2027,7 @@ TEST_F(DBTest2, CompressionManagerWrapper) {
           wrapped_->GetCompressorForSST(context, opts, preferred));
     }
   };
-  auto mgr = std::make_shared<MyManager>(GetDefaultBuiltinCompressionManager());
+  auto mgr = std::make_shared<MyManager>(GetBuiltinV2CompressionManager());
 
   for (CompressionType type : GetSupportedCompressions()) {
     for (bool use_wrapper : {false, true}) {
@@ -2117,8 +2118,7 @@ TEST_F(DBTest2, CompressionManagerCustomCompression) {
     bool SupportsCompressionType(CompressionType type) const override {
       return type == kCustomCompression8A || type == kCustomCompression8B ||
              type == kCustomCompression8C ||
-             GetDefaultBuiltinCompressionManager()->SupportsCompressionType(
-                 type);
+             GetBuiltinV2CompressionManager()->SupportsCompressionType(type);
     }
 
     int used_compressor8A_count_ = 0;
@@ -2139,8 +2139,7 @@ TEST_F(DBTest2, CompressionManagerCustomCompression) {
           return std::make_unique<Compressor8C>();
         // Also support built-in compression algorithms
         default:
-          return GetDefaultBuiltinCompressionManager()->GetCompressor(opts,
-                                                                      type);
+          return GetBuiltinV2CompressionManager()->GetCompressor(opts, type);
       }
     }
 
@@ -2148,13 +2147,14 @@ TEST_F(DBTest2, CompressionManagerCustomCompression) {
       return std::make_shared<test::DecompressorCustomAlg>();
     }
 
-    CompressionType last_specific_decompressor_type_ = kNoCompression;
+    RelaxedAtomic<CompressionType> last_specific_decompressor_type_{
+        kNoCompression};
 
     std::shared_ptr<Decompressor> GetDecompressorForTypes(
         const CompressionType* types_begin,
         const CompressionType* types_end) override {
       assert(types_end > types_begin);
-      last_specific_decompressor_type_ = *types_begin;
+      last_specific_decompressor_type_.StoreRelaxed(*types_begin);
       auto decomp = std::make_shared<test::DecompressorCustomAlg>();
       decomp->SetAllowedTypes(types_begin, types_end);
       return decomp;
@@ -2191,9 +2191,11 @@ TEST_F(DBTest2, CompressionManagerCustomCompression) {
     // respect their distinct compatibility names and treat them as incompatible
     // (or else risk processing data incorrectly)
     // NOTE: these are not registered in ObjectRegistry to test what happens
-    // when the original CompressionManager might not be available.
+    // when the original CompressionManager might not be available, but
+    // mgr_bar will be registered during the test, with different names to
+    // prevent interference between iterations.
     auto mgr_foo = std::make_shared<MyManager>("Foo");
-    auto mgr_bar = std::make_shared<MyManager>("Bar");
+    auto mgr_bar = std::make_shared<MyManager>(use_dict ? "Bar1" : "Bar2");
 
     // And this one claims to be fully compatible with the built-in compression
     // manager when it's not fully compatible (for custom CompressionTypes)
@@ -2256,10 +2258,7 @@ TEST_F(DBTest2, CompressionManagerCustomCompression) {
     options.compression = kLZ4Compression;
     ASSERT_EQ(TryReopen(options).code(), Status::Code::kInvalidArgument);
 
-    // TODO: eliminate this hack when format_version=7 is published
-    SaveAndRestore guard(&TEST_AllowUnsupportedFormatVersion(), true);
-
-    // Set new format version
+    // Set format version supporting custom compression
     bbto.format_version = 7;
     options.table_factory.reset(NewBlockBasedTableFactory(bbto));
 
@@ -2269,8 +2268,13 @@ TEST_F(DBTest2, CompressionManagerCustomCompression) {
     options.compression = kCustomCompression8B;
     ASSERT_EQ(TryReopen(options).code(), Status::Code::kInvalidArgument);
 
-    // Using a built-in compression type with fv=7 but named custom schema
+    // Custom compression schema, but specifying a custom compression type it
+    // doesn't support.
     options.compression_manager = mgr_foo;
+    options.compression = kCustomCompressionF0;
+    ASSERT_EQ(TryReopen(options).code(), Status::Code::kNotSupported);
+
+    // Using a built-in compression type with fv=7 but named custom schema
     options.compression = kLZ4Compression;
     Reopen(options);
     ASSERT_OK(
@@ -2279,7 +2283,7 @@ TEST_F(DBTest2, CompressionManagerCustomCompression) {
     ASSERT_EQ(NumTableFilesAtLevel(0), 2);
     ASSERT_EQ(Get("b"), value);
 
-    // Verify it was compressed with snappy
+    // Verify it was compressed with LZ4
     r = {"b", "b0"};
     tables_properties.clear();
     ASSERT_OK(db_->GetPropertiesOfTablesInRange(db_->DefaultColumnFamily(), &r,
@@ -2288,7 +2292,8 @@ TEST_F(DBTest2, CompressionManagerCustomCompression) {
     EXPECT_LT(tables_properties.begin()->second->data_size, kValueSize / 2);
     // Uses new format for "compression_name" property
     EXPECT_EQ(tables_properties.begin()->second->compression_name, "Foo;04;");
-    EXPECT_EQ(mgr_foo->last_specific_decompressor_type_, kLZ4Compression);
+    EXPECT_EQ(mgr_foo->last_specific_decompressor_type_.LoadRelaxed(),
+              kLZ4Compression);
 
     // Custom compression type
     options.compression = kCustomCompression8A;
@@ -2309,7 +2314,8 @@ TEST_F(DBTest2, CompressionManagerCustomCompression) {
     ASSERT_EQ(tables_properties.size(), 1U);
     EXPECT_LT(tables_properties.begin()->second->data_size, kValueSize / 2);
     EXPECT_EQ(tables_properties.begin()->second->compression_name, "Foo;8A;");
-    EXPECT_EQ(mgr_foo->last_specific_decompressor_type_, kCustomCompression8A);
+    EXPECT_EQ(mgr_foo->last_specific_decompressor_type_.LoadRelaxed(),
+              kCustomCompression8A);
 
     // Also dynamically changeable, because the compression manager will respect
     // the current setting as reported under the legacy logic
@@ -2320,7 +2326,7 @@ TEST_F(DBTest2, CompressionManagerCustomCompression) {
     ASSERT_EQ(NumTableFilesAtLevel(0), 4);
     ASSERT_EQ(Get("d"), value);
 
-    // Verify it was compressed with snappy
+    // Verify it was compressed with LZ4
     r = {"d", "d0"};
     tables_properties.clear();
     ASSERT_OK(db_->GetPropertiesOfTablesInRange(db_->DefaultColumnFamily(), &r,
@@ -2328,7 +2334,8 @@ TEST_F(DBTest2, CompressionManagerCustomCompression) {
     ASSERT_EQ(tables_properties.size(), 1U);
     EXPECT_LT(tables_properties.begin()->second->data_size, kValueSize / 2);
     EXPECT_EQ(tables_properties.begin()->second->compression_name, "Foo;04;");
-    EXPECT_EQ(mgr_foo->last_specific_decompressor_type_, kLZ4Compression);
+    EXPECT_EQ(mgr_foo->last_specific_decompressor_type_.LoadRelaxed(),
+              kLZ4Compression);
 
     // Dynamically changeable to custom compressions also
     ASSERT_OK(dbfull()->SetOptions({{"compression", "kCustomCompression8B"}}));
@@ -2346,7 +2353,8 @@ TEST_F(DBTest2, CompressionManagerCustomCompression) {
     ASSERT_EQ(tables_properties.size(), 1U);
     EXPECT_LT(tables_properties.begin()->second->data_size, kValueSize / 2);
     EXPECT_EQ(tables_properties.begin()->second->compression_name, "Foo;8B;");
-    EXPECT_EQ(mgr_foo->last_specific_decompressor_type_, kCustomCompression8B);
+    EXPECT_EQ(mgr_foo->last_specific_decompressor_type_.LoadRelaxed(),
+              kCustomCompression8B);
 
     // Fails to re-open with incompatible compression manager (can't find
     // compression manager Foo because it's not registered nor known by Bar)
@@ -2365,10 +2373,50 @@ TEST_F(DBTest2, CompressionManagerCustomCompression) {
     ASSERT_EQ(Get("d").size(), kValueSize);
     ASSERT_EQ(Get("e").size(), kValueSize);
 
-    // TODO: mix of compatibility names in same DB
+    // Add a file using mgr_bar
+    ASSERT_OK(
+        Put("f", test::CompressibleString(&rnd, 0.1, kValueSize, &value)));
+    ASSERT_OK(Flush());
+    ASSERT_EQ(NumTableFilesAtLevel(0), 6);
+    ASSERT_EQ(Get("f"), value);
+
+    // Verify it was compressed appropriately
+    r = {"f", "f0"};
+    tables_properties.clear();
+    ASSERT_OK(db_->GetPropertiesOfTablesInRange(db_->DefaultColumnFamily(), &r,
+                                                1, &tables_properties));
+    ASSERT_EQ(tables_properties.size(), 1U);
+    EXPECT_LT(tables_properties.begin()->second->data_size, kValueSize / 2);
+    EXPECT_EQ(mgr_bar->last_specific_decompressor_type_.LoadRelaxed(),
+              kLZ4Compression);
+
+    // Fails to re-open with incompatible compression manager (can't find
+    // compression manager Bar because it's not registered nor known by Foo)
+    options.compression_manager = mgr_foo;
+    ASSERT_EQ(TryReopen(options).code(), Status::Code::kNotSupported);
+
+    // Register and re-open
+    auto& library = *ObjectLibrary::Default();
+    library.AddFactory<CompressionManager>(
+        mgr_bar->CompatibilityName(),
+        [mgr_bar](const std::string& /*uri*/,
+                  std::unique_ptr<CompressionManager>* guard,
+                  std::string* /*errmsg*/) {
+          *guard = std::make_unique<MyManager>(mgr_bar->CompatibilityName());
+          return guard->get();
+        });
+    Reopen(options);
+
+    // Can still read everything
+    ASSERT_EQ(Get("a").size(), kValueSize);
+    ASSERT_EQ(Get("b").size(), kValueSize);
+    ASSERT_EQ(Get("c").size(), kValueSize);
+    ASSERT_EQ(Get("d").size(), kValueSize);
+    ASSERT_EQ(Get("e").size(), kValueSize);
+    ASSERT_EQ(Get("f").size(), kValueSize);
+
     // TODO: test old version of a compression manager unable to read a
     // compression type
-    // TODO: test getting compression manager from object registry
   }
 }
 
diff --git a/db_stress_tool/db_stress_compression_manager.h b/db_stress_tool/db_stress_compression_manager.h
index 0c41517b186e..f1ac5aa1275e 100644
--- a/db_stress_tool/db_stress_compression_manager.h
+++ b/db_stress_tool/db_stress_compression_manager.h
@@ -41,7 +41,7 @@ class DbStressCustomCompressionManager : public CompressionManager {
             test::CompressorCustomAlg<kCustomCompressionAC>>();
       // Also support built-in compression algorithms
       default:
-        return GetDefaultBuiltinCompressionManager()->GetCompressor(opts, type);
+        return GetBuiltinV2CompressionManager()->GetCompressor(opts, type);
     }
   }
 
@@ -59,7 +59,7 @@ class DbStressCustomCompressionManager : public CompressionManager {
 
  protected:
   std::shared_ptr<CompressionManager> default_ =
-      GetDefaultBuiltinCompressionManager();
+      GetBuiltinV2CompressionManager();
 };
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc
index 5b9f37089bd2..8ead78588c4e 100644
--- a/db_stress_tool/db_stress_test_base.cc
+++ b/db_stress_tool/db_stress_test_base.cc
@@ -3435,15 +3435,15 @@ void StressTest::Open(SharedState* shared, bool reopen) {
     options_.compression_manager =
         std::make_shared<DbStressCustomCompressionManager>();
   } else if (!strcasecmp(FLAGS_compression_manager.c_str(), "mixed")) {
-    options_.compression_manager = std::make_shared<RoundRobinManager>(
-        GetDefaultBuiltinCompressionManager());
+    options_.compression_manager =
+        std::make_shared<RoundRobinManager>(GetBuiltinV2CompressionManager());
   } else if (!strcasecmp(FLAGS_compression_manager.c_str(), "randommixed")) {
     options_.compression_manager =
         std::make_shared<RandomMixedCompressionManager>(
-            GetDefaultBuiltinCompressionManager());
+            GetBuiltinV2CompressionManager());
   } else if (!strcasecmp(FLAGS_compression_manager.c_str(), "autoskip")) {
     options_.compression_manager =
-        CreateAutoSkipCompressionManager(GetDefaultBuiltinCompressionManager());
+        CreateAutoSkipCompressionManager(GetBuiltinV2CompressionManager());
   } else if (!strcasecmp(FLAGS_compression_manager.c_str(), "none")) {
     // Nothing to do using default compression manager
   } else {
diff --git a/include/rocksdb/advanced_compression.h b/include/rocksdb/advanced_compression.h
index 2f989cd4e410..682a5d6bc56a 100644
--- a/include/rocksdb/advanced_compression.h
+++ b/include/rocksdb/advanced_compression.h
@@ -371,8 +371,9 @@ class CompressionManager
                                  const std::string& id,
                                  std::shared_ptr<CompressionManager>* result);
 
-  // Will this compression type be used if requested in calling
-  // GetCompressor/GetCompressorForSST?
+  // Returns false iff a configuration that would pass the given compression
+  // type to GetCompressor/GetCompressorForSST should be rejected (not
+  // supported)
   virtual bool SupportsCompressionType(CompressionType type) const = 0;
 
   // TODO: function to check compatibility with or sanitize CompressionOptions
@@ -577,11 +578,17 @@ class CompressionManagerWrapper : public CompressionManager {
   std::shared_ptr<CompressionManager> wrapped_;
 };
 
-// Compression manager that implements built-in compression strategy. The
-// behavior of compression_manager=nullptr is essentially equivalent to
-// using this compression manager.
-const std::shared_ptr<CompressionManager>&
-GetDefaultBuiltinCompressionManager();
+// Compression manager that implements the second schema for RocksDB built-in
+// compression support. (The first schema is intentionally not provided here.)
+// *** CURRENT STATE ***
+// This is currently the latest schema for built-in compression, and the
+// compression manager used when compression_manager=nullptr.
+const std::shared_ptr<CompressionManager>& GetBuiltinV2CompressionManager();
+
+// NOTE: No GetLatestBuiltinCompressionManager() is provided because that could
+// lead to unexpected schema changes for user CompressionManagers building on
+// the built-in schema, in the unlikely/rare case of a new built-in schema.
+
 // Gets CompressionManager designed for the automated compression strategy.
 // This may include deciding to compress or not.
 // In future should be able to select compression algorithm based on the CPU
diff --git a/include/rocksdb/compression_type.h b/include/rocksdb/compression_type.h
index 6a5ace94c7d0..63d78c163c49 100644
--- a/include/rocksdb/compression_type.h
+++ b/include/rocksdb/compression_type.h
@@ -32,6 +32,7 @@ enum CompressionType : unsigned char {
 
   // For use by user custom CompressionManagers
   kCustomCompression80 = 0x80,
+  kFirstCustomCompression = kCustomCompression80,
   kCustomCompression81 = 0x81,
   kCustomCompression82 = 0x82,
   kCustomCompression83 = 0x83,
@@ -158,6 +159,7 @@ enum CompressionType : unsigned char {
   kCustomCompressionFC = 0xFC,
   kCustomCompressionFD = 0xFD,
   kCustomCompressionFE = 0xFE,
+  kLastCustomCompression = kCustomCompressionFE,
 
   // kDisableCompressionOption is used to disable some compression options.
   kDisableCompressionOption = 0xff,
diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h
index 1ce073d4a44e..cc6368fe0c27 100644
--- a/include/rocksdb/table.h
+++ b/include/rocksdb/table.h
@@ -561,6 +561,10 @@ struct BlockBasedTableOptions {
   // misplaced within or between files is as likely to fail checksum
   // verification as random corruption. Also checksum-protects SST footer.
   // Can be read by RocksDB versions >= 8.6.0.
+  // 7 -- Support for custom compression algorithms with a CompressionManager
+  // using a non-built-in CompatibilityName(). See `compression_manager` in
+  // ColumnFamilyOptions. Also changes the format of TableProperties field
+  // `compression_name`. Can be read by RocksDB versions >= 10.4.0.
   //
   // Using the default setting of format_version is strongly recommended, so
   // that available enhancements are adopted eventually and automatically. The
diff --git a/include/rocksdb/table_properties.h b/include/rocksdb/table_properties.h
index f8ae270fa86c..1b20d9d3ab99 100644
--- a/include/rocksdb/table_properties.h
+++ b/include/rocksdb/table_properties.h
@@ -355,7 +355,13 @@ struct TableProperties {
   // behave, this must be set to "ZSTD" if any blocks are compressed
   // with zstd and must NOT be set to "NoCompression" if any blocks are
   // compressed.
-  // * For format_version >= 7, it is ...
+  // * For format_version >= 7, the format is
+  //   <compatibility_name>;<hex-coded compression types>;<future use>
+  // where <compatibility_name> is the CompatibilityName() of the
+  // CompressionManager used for the file, or empty if compression was
+  // disabled; <hex-coded compression types> represents a sorted set of
+  // CompressionType values used in the file other than kNoCompression, each
+  // as 2-digit hex, e.g. 04 for LZ$, 07 for ZSTD, etc.
   std::string compression_name;
 
   // Compression options used to compress the SST files.
diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc
index 160ce21e5919..05a86b6d690c 100644
--- a/options/options_settable_test.cc
+++ b/options/options_settable_test.cc
@@ -705,10 +705,9 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) {
       12345);
   // TODO: try to enhance ObjectLibrary to support singletons
   // ASSERT_EQ(new_options->compression_manager,
-  //           GetBuiltinCompressionManager(/*compression_format_version*/ 2));
-  ASSERT_STREQ(
-      new_options->compression_manager->Name(),
-      GetBuiltinCompressionManager(/*compression_format_version*/ 2)->Name());
+  //           GetBuiltinV2CompressionManager());
+  ASSERT_STREQ(new_options->compression_manager->Name(),
+               GetBuiltinV2CompressionManager()->Name());
 
   ColumnFamilyOptions rnd_filled_options = *new_options;
 
diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc
index 8292c369c83b..89888d5d71d4 100644
--- a/table/block_based/block_based_table_builder.cc
+++ b/table/block_based/block_based_table_builder.cc
@@ -977,7 +977,10 @@ struct BlockBasedTableBuilder::Rep {
       assert(mgr);
       // Use newer compression_name property
       props.compression_name.reserve(32);
-      props.compression_name.append(mgr->CompatibilityName());
+      // If compression is disabled, use empty manager name
+      if (basic_compressor) {
+        props.compression_name.append(mgr->CompatibilityName());
+      }
       props.compression_name.push_back(';');
       // Rest of property to be filled out at the end of building the file
     } else {
diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc
index 27d1753cc936..6c36e5d47232 100644
--- a/table/block_based/block_based_table_reader.cc
+++ b/table/block_based/block_based_table_reader.cc
@@ -606,7 +606,8 @@ Status GetDecompressor(const std::string& compression_name,
       if (!s.ok()) {
         return s;
       }
-      assert(mgr_to_use);
+      assert(mgr_to_use || compatibility_name == kNullptrString ||
+             compatibility_name.empty());
     }
 
     // Second field is set of compression types actually used in the file
@@ -632,9 +633,15 @@ Status GetDecompressor(const std::string& compression_name,
       }
       ctypes[i] = static_cast<CompressionType>(val);
     }
-    *out_decompressor =
-        mgr_to_use->GetDecompressorForTypes(ctypes.get(), ctypes.get() + count);
-    assert(*out_decompressor || count == 0);
+    if (mgr_to_use) {
+      *out_decompressor = mgr_to_use->GetDecompressorForTypes(
+          ctypes.get(), ctypes.get() + count);
+      assert(*out_decompressor || count == 0);
+    } else {
+      // Compression/decompression disabled
+      *out_decompressor = nullptr;
+      assert(count == 0);
+    }
     // Can ignore possible additional future fields
   } else {
     // No explicit CompressionManager, e.g. legacy file support where
diff --git a/table/format.h b/table/format.h
index ffc120e35eb0..0e914a4d9530 100644
--- a/table/format.h
+++ b/table/format.h
@@ -157,10 +157,15 @@ inline uint32_t GetCompressFormatForVersion(uint32_t format_version) {
   // As of format_version 2, we encode compressed block with
   // compress_format_version == 2. Before that, the version is 1.
   // DO NOT CHANGE THIS FUNCTION, it affects disk format
+  // As of format_version 7 and opening up to custom compression, the
+  // compression format version is essentially independent of the block-based
+  // table format version, and encoded in the compression_name table property.
+  // Thus, this function can go away once we remove support for reading
+  // format_version=1.
   return format_version >= 2 ? 2 : 1;
 }
 
-constexpr uint32_t kLatestFormatVersion = 6;
+constexpr uint32_t kLatestFormatVersion = 7;
 
 inline bool IsSupportedFormatVersion(uint32_t version) {
   return version <= kLatestFormatVersion;
diff --git a/table/table_test.cc b/table/table_test.cc
index 94be08ddcec3..23371787a6e2 100644
--- a/table/table_test.cc
+++ b/table/table_test.cc
@@ -1795,18 +1795,23 @@ TEST_P(BlockBasedTableTest, IndexUncompressed) {
 #endif  // SNAPPY
 
 TEST_P(BlockBasedTableTest, BlockBasedTableProperties2) {
-  TableConstructor c(&reverse_key_comparator);
+  TableConstructor c(&reverse_key_comparator,
+                     true /* convert_to_internal_key_ */);
   std::vector<std::string> keys;
   stl_wrappers::KVMap kvmap;
 
-  {
+  for (CompressionType ct : {kNoCompression, kSnappyCompression}) {
+    if (!Snappy_Supported() && ct == kSnappyCompression) {
+      continue;
+    }
     Options options;
-    options.compression = CompressionType::kNoCompression;
+    options.compression = ct;
     BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
     options.table_factory.reset(NewBlockBasedTableFactory(table_options));
 
     const ImmutableOptions ioptions(options);
     const MutableCFOptions moptions(options);
+    c.Add("blah", std::string(200, 'x'));  // something to compress
     c.Finish(options, ioptions, moptions, table_options,
              GetPlainInternalComparator(options.comparator), &keys, &kvmap);
 
@@ -1823,7 +1828,13 @@ TEST_P(BlockBasedTableTest, BlockBasedTableProperties2) {
     // No filter policy is used
     ASSERT_EQ("", props.filter_policy_name);
     // Compression type == that set:
-    ASSERT_EQ("NoCompression", props.compression_name);
+    if (FormatVersionUsesCompressionManagerName(table_options.format_version)) {
+      ASSERT_EQ(ct == kNoCompression ? ";;" : "BuiltinV2;01;",
+                props.compression_name);
+    } else {
+      ASSERT_EQ(ct == kNoCompression ? "NoCompression" : "Snappy",
+                props.compression_name);
+    }
     c.ResetTableReader();
   }
 
diff --git a/test_util/testutil.h b/test_util/testutil.h
index 99c76550ca74..2e641ac89c63 100644
--- a/test_util/testutil.h
+++ b/test_util/testutil.h
@@ -740,9 +740,9 @@ template <CompressionType kCompression>
 struct CompressorCustomAlg : public CompressorWrapper {
   static bool Supported() { return LZ4_Supported(); }
 
-  explicit CompressorCustomAlg(std::unique_ptr<Compressor> wrapped =
-                                   GetDefaultBuiltinCompressionManager()
-                                       ->GetCompressor({}, kLZ4Compression))
+  explicit CompressorCustomAlg(
+      std::unique_ptr<Compressor> wrapped =
+          GetBuiltinV2CompressionManager()->GetCompressor({}, kLZ4Compression))
       : CompressorWrapper(std::move(wrapped)),
         dictionary_hash_(GetSliceHash(wrapped_->GetSerializedDict())) {
     static_assert(kCompression > kLastBuiltinCompression);
@@ -750,12 +750,16 @@ struct CompressorCustomAlg : public CompressorWrapper {
 
   const char* Name() const override { return "CompressorCustomAlg"; }
 
+  CompressionType GetPreferredCompressionType() const override {
+    return kCompression;
+  }
+
   Status CompressBlock(Slice uncompressed_data, std::string* compressed_output,
                        CompressionType* out_compression_type,
                        ManagedWorkingArea* working_area) override {
     Status s = wrapped_->CompressBlock(uncompressed_data, compressed_output,
                                        out_compression_type, working_area);
-    if (*out_compression_type != kNoCompression) {
+    if (s.ok() && *out_compression_type != kNoCompression) {
       assert(*out_compression_type == kLZ4Compression);
       std::string header(/*size=*/5, 0);
       header[0] = lossless_cast<char>(kCompression);
@@ -783,9 +787,8 @@ struct CompressorCustomAlg : public CompressorWrapper {
 struct DecompressorCustomAlg : public DecompressorWrapper {
   using TypeSet = SmallEnumSet<CompressionType, kDisableCompressionOption>;
 
-  DecompressorCustomAlg(
-      std::shared_ptr<Decompressor> wrapped =
-          GetDefaultBuiltinCompressionManager()->GetDecompressor())
+  DecompressorCustomAlg(std::shared_ptr<Decompressor> wrapped =
+                            GetBuiltinV2CompressionManager()->GetDecompressor())
       : DecompressorWrapper(std::move(wrapped)),
         dictionary_hash_(GetSliceHash(wrapped_->GetSerializedDict())),
         allowed_types_(TypeSet::All()) {}
@@ -808,7 +811,8 @@ struct DecompressorCustomAlg : public DecompressorWrapper {
   }
 
   Status ExtractUncompressedSize(Args& args) override {
-    if (args.compression_type > kLastBuiltinCompression) {
+    if (args.compression_type >= kFirstCustomCompression &&
+        args.compression_type <= kLastCustomCompression) {
       assert(args.compressed_data.size() > 0);
       assert(args.compressed_data[0] ==
              lossless_cast<char>(args.compression_type));
@@ -828,7 +832,8 @@ struct DecompressorCustomAlg : public DecompressorWrapper {
   }
 
   Status DecompressBlock(const Args& args, char* uncompressed_output) override {
-    if (args.compression_type > kLastBuiltinCompression) {
+    if (args.compression_type >= kFirstCustomCompression &&
+        args.compression_type <= kLastCustomCompression) {
       // Also allowed to copy args and modify
       Args modified_args = args;
       modified_args.compression_type = kLZ4Compression;
diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc
index 3cba7e753754..4242dbbd9834 100644
--- a/tools/db_bench_tool.cc
+++ b/tools/db_bench_tool.cc
@@ -4653,7 +4653,7 @@ class Benchmark {
         options.bottommost_compression = kZSTD;
 
         mgr = std::make_shared<RoundRobinManager>(
-            GetDefaultBuiltinCompressionManager());
+            GetBuiltinV2CompressionManager());
       } else if (!strcasecmp(FLAGS_compression_manager.c_str(), "autoskip")) {
         options.compression = FLAGS_compression_type_e;
         if (FLAGS_compression_type_e == kNoCompression) {
@@ -4662,8 +4662,8 @@ class Benchmark {
                   "autoskip");
           ErrorExit();
         }
-        mgr = CreateAutoSkipCompressionManager(
-            GetDefaultBuiltinCompressionManager());
+        mgr =
+            CreateAutoSkipCompressionManager(GetBuiltinV2CompressionManager());
       } else {
         // not defined -> exit with error
         fprintf(stderr, "Requested compression manager not supported");
diff --git a/tools/ldb_cmd.cc b/tools/ldb_cmd.cc
index 46b0f4b0b9e3..565c24540901 100644
--- a/tools/ldb_cmd.cc
+++ b/tools/ldb_cmd.cc
@@ -870,8 +870,8 @@ bool LDBCommand::ParseCompressionTypeOption(
       }
       options_.compression = kZSTD;
       options_.bottommost_compression = kZSTD;
-      auto mgr = std::make_shared<RoundRobinManager>(
-          GetDefaultBuiltinCompressionManager());
+      auto mgr =
+          std::make_shared<RoundRobinManager>(GetBuiltinV2CompressionManager());
       options_.compression_manager = mgr;
 
       // Need to list zstd in the compression_name table property if it's
diff --git a/unreleased_history/new_features/format_version_7.md b/unreleased_history/new_features/format_version_7.md
new file mode 100644
index 000000000000..5cf24cb52945
--- /dev/null
+++ b/unreleased_history/new_features/format_version_7.md
@@ -0,0 +1 @@
+* Add new `format_version=7` to aid experimental support of custom compression algorithms with CompressionManager and block-based table. This format version includes changing the format of `TableProperties::compression_name`.
diff --git a/util/auto_skip_compressor.cc b/util/auto_skip_compressor.cc
index eadd50d00a56..0c8713ad2142 100644
--- a/util/auto_skip_compressor.cc
+++ b/util/auto_skip_compressor.cc
@@ -126,6 +126,6 @@ std::unique_ptr<Compressor> AutoSkipCompressorManager::GetCompressorForSST(
 std::shared_ptr<CompressionManagerWrapper> CreateAutoSkipCompressionManager(
     std::shared_ptr<CompressionManager> wrapped) {
   return std::make_shared<AutoSkipCompressorManager>(
-      wrapped == nullptr ? GetDefaultBuiltinCompressionManager() : wrapped);
+      wrapped == nullptr ? GetBuiltinV2CompressionManager() : wrapped);
 }
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/util/compression.cc b/util/compression.cc
index c7eb8267ea40..0aa473b179dc 100644
--- a/util/compression.cc
+++ b/util/compression.cc
@@ -1043,8 +1043,7 @@ const std::shared_ptr<CompressionManager>& GetBuiltinCompressionManager(
   }
 }
 
-const std::shared_ptr<CompressionManager>&
-GetDefaultBuiltinCompressionManager() {
+const std::shared_ptr<CompressionManager>& GetBuiltinV2CompressionManager() {
   return GetBuiltinCompressionManager(2);
 }
 
diff --git a/util/compression.h b/util/compression.h
index 896750d22bee..15f576b53623 100644
--- a/util/compression.h
+++ b/util/compression.h
@@ -754,8 +754,11 @@ inline std::string CompressionTypeToString(CompressionType compression_type) {
     case kDisableCompressionOption:
       return "DisableOption";
     default: {
-      char c = lossless_cast<char>(compression_type);
-      return "Custom" + Slice(&c, 1).ToString(/*hex=*/true);
+      bool is_custom = compression_type >= kFirstCustomCompression &&
+                       compression_type <= kLastCustomCompression;
+      unsigned char c = lossless_cast<unsigned char>(compression_type);
+      return (is_custom ? "Custom" : "Reserved") +
+             ToBaseCharsString<16>(2, c, /*uppercase=*/true);
     }
   }
 }
diff --git a/util/compression_test.cc b/util/compression_test.cc
index 884ee2794a3d..0b349f028780 100644
--- a/util/compression_test.cc
+++ b/util/compression_test.cc
@@ -158,7 +158,7 @@ class DBAutoSkip : public DBTestBase {
         rnd_(231),
         key_index_(0) {
     options.compression_manager =
-        CreateAutoSkipCompressionManager(GetDefaultBuiltinCompressionManager());
+        CreateAutoSkipCompressionManager(GetBuiltinV2CompressionManager());
     auto statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
     options.statistics = statistics;
     options.statistics->set_stats_level(StatsLevel::kExceptTimeForMutex);
diff --git a/util/simple_mixed_compressor.cc b/util/simple_mixed_compressor.cc
index 00a7562fd530..4c952750cdeb 100644
--- a/util/simple_mixed_compressor.cc
+++ b/util/simple_mixed_compressor.cc
@@ -17,7 +17,8 @@ namespace ROCKSDB_NAMESPACE {
 // MultiCompressorWrapper implementation
 MultiCompressorWrapper::MultiCompressorWrapper(const CompressionOptions& opts,
                                                CompressionDict&& dict) {
-  auto builtInManager = GetDefaultBuiltinCompressionManager();
+  // TODO: make the compression manager a field
+  auto builtInManager = GetBuiltinV2CompressionManager();
   const auto& compressions = GetSupportedCompressions();
   for (auto type : compressions) {
     if (type == kNoCompression) {
diff --git a/util/slice_test.cc b/util/slice_test.cc
index 6e7142dc9505..bc8925299a56 100644
--- a/util/slice_test.cc
+++ b/util/slice_test.cc
@@ -14,6 +14,7 @@
 #include "test_util/testharness.h"
 #include "test_util/testutil.h"
 #include "util/cast_util.h"
+#include "util/string_util.h"
 
 namespace ROCKSDB_NAMESPACE {
 
@@ -410,6 +411,17 @@ TEST(UnownedPtrTest, Tests) {
   }
 }
 
+TEST(ToBaseCharsStringTest, Tests) {
+  using ROCKSDB_NAMESPACE::ToBaseCharsString;
+  // Base 16
+  ASSERT_EQ(ToBaseCharsString<16>(5, 0, true), "00000");
+  ASSERT_EQ(ToBaseCharsString<16>(5, 42, true), "0002A");
+  ASSERT_EQ(ToBaseCharsString<16>(5, 42, false), "0002a");
+  ASSERT_EQ(ToBaseCharsString<16>(2, 255, false), "ff");
+  // Base 32
+  ASSERT_EQ(ToBaseCharsString<32>(2, 255, false), "7v");
+}
+
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/util/string_util.h b/util/string_util.h
index 1374642a6cd7..818349870883 100644
--- a/util/string_util.h
+++ b/util/string_util.h
@@ -40,6 +40,16 @@ inline void PutBaseChars(char** buf, size_t n, uint64_t v, bool uppercase) {
   *buf += n;
 }
 
+// Construct a string of n digits from v in base kBase
+template <size_t kBase>
+inline std::string ToBaseCharsString(size_t n, uint64_t v, bool uppercase) {
+  std::string result;
+  result.resize(n);
+  char* buf = &result[0];
+  PutBaseChars<kBase>(&buf, n, v, uppercase);
+  return result;
+}
+
 // Parse n digits from *buf in base kBase to *v and advance *buf to the
 // position after what was read. On success, true is returned. On failure,
 // false is returned, *buf is placed at the first bad character, and *v

From f340a2eccccdc0013f74c75a12e2ad638ebcddce Mon Sep 17 00:00:00 2001
From: anand76 <anand1976@users.noreply.github.com>
Date: Fri, 20 Jun 2025 17:56:24 -0700
Subject: [PATCH 149/500] Port codemod changes from fbcode/rocksdb (#13714)

Summary:
Port changes made directly in fbcode in order to facilitate the 10.4 release.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13714

Test Plan: Existing tests

Reviewed By: mszeszko-meta

Differential Revision: D77038668

Pulled By: anand1976

fbshipit-source-id: 6b9b16d62bccf75923b525c1c24597a59920a948
---
 db/db_iter.cc      | 3 ---
 env/env_posix.cc   | 6 ++++--
 env/fs_posix.cc    | 1 -
 port/port_posix.cc | 3 ++-
 4 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/db/db_iter.cc b/db/db_iter.cc
index 42739e006204..7258913e765d 100644
--- a/db/db_iter.cc
+++ b/db/db_iter.cc
@@ -511,7 +511,6 @@ bool DBIter::FindNextUserEntryInternal(bool skipping_saved_key,
 
             valid_ = true;
             return true;
-            break;
           case kTypeMerge:
             if (!PrepareValueInternal()) {
               return false;
@@ -523,7 +522,6 @@ bool DBIter::FindNextUserEntryInternal(bool skipping_saved_key,
             current_entry_is_merged_ = true;
             valid_ = true;
             return MergeValuesNewToOld();  // Go to a different state machine
-            break;
           default:
             valid_ = false;
             status_ = Status::Corruption(
@@ -1124,7 +1122,6 @@ bool DBIter::FindValueForCurrentKey() {
         }
         return true;
       }
-      break;
     case kTypeValue:
     case kTypeValuePreferredSeqno:
       SetValueAndColumnsFromPlain(pinned_value_);
diff --git a/env/env_posix.cc b/env/env_posix.cc
index 8b24a7a27888..86a7741f0f34 100644
--- a/env/env_posix.cc
+++ b/env/env_posix.cc
@@ -169,8 +169,9 @@ class PosixClock : public SystemClock {
     struct timespec ts;
     clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
     return (static_cast<uint64_t>(ts.tv_sec) * 1000000000 + ts.tv_nsec) / 1000;
-#endif
+#else
     return 0;
+#endif
   }
 
   uint64_t CPUNanos() override {
@@ -179,8 +180,9 @@ class PosixClock : public SystemClock {
     struct timespec ts;
     clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
     return static_cast<uint64_t>(ts.tv_sec) * 1000000000 + ts.tv_nsec;
-#endif
+#else
     return 0;
+#endif
   }
 
   void SleepForMicroseconds(int micros) override { usleep(micros); }
diff --git a/env/fs_posix.cc b/env/fs_posix.cc
index 61fd2c5e614c..82bda886db05 100644
--- a/env/fs_posix.cc
+++ b/env/fs_posix.cc
@@ -868,7 +868,6 @@ class PosixFileSystem : public FileSystem {
       IOOptions opts;
       return CreateDirIfMissing(*result, opts, nullptr);
     }
-    return IOStatus::OK();
   }
 
   IOStatus GetFreeSpace(const std::string& fname, const IOOptions& /*opts*/,
diff --git a/port/port_posix.cc b/port/port_posix.cc
index 7042a710dc84..1159d0bf8a63 100644
--- a/port/port_posix.cc
+++ b/port/port_posix.cc
@@ -220,8 +220,9 @@ int GetMaxOpenFiles() {
     return std::numeric_limits<int>::max();
   }
   return static_cast<int>(no_files_limit.rlim_cur);
-#endif
+#else
   return -1;
+#endif
 }
 
 void* cacheline_aligned_alloc(size_t size) {

From f2d03736a7ae6dbe226aeffdc3a359b40d69c27e Mon Sep 17 00:00:00 2001
From: Maciej Szeszko <mszeszko@meta.com>
Date: Fri, 20 Jun 2025 21:02:39 -0700
Subject: [PATCH 150/500] Start development 10.5 (#13719)

Summary:
* Release notes from 10.4 branch
* Update version.h
* Add [10.4.fb](https://github.com/facebook/rocksdb/tree/10.4.fb) (to check_format_compatible.sh
* Update folly commit hash.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13719

Test Plan: Release collateral.

Reviewed By: anand1976

Differential Revision: D77062142

Pulled By: mszeszko-meta

fbshipit-source-id: 66c61323580386eb062e8763bba5d3480aadbc80
---
 HISTORY.md                                    | 20 +++++++++++++++++++
 Makefile                                      |  2 +-
 include/rocksdb/version.h                     |  2 +-
 tools/check_format_compatible.sh              |  2 +-
 .../behavior_changes/large-txn-default-val.md |  1 -
 .../bug_fixes/create-with-import.md           |  1 -
 ...isallow_memtable_writes_paranoid_checks.md |  1 -
 .../bug_fixes/ingestion_file_checksum.md      |  1 -
 .../new_features/avg-flush-trigger.md         |  1 -
 .../new_features/concurrent_vector_insert.md  |  1 -
 .../new_features/format_version_7.md          |  1 -
 .../new_features/large-txn-byte-threshold.md  |  1 -
 .../new_features/reduce_file_locking.md       |  1 -
 .../external_table_unique_ptr.md              |  1 -
 .../public_api_changes/min-file-size-dtc.md   |  1 -
 15 files changed, 23 insertions(+), 14 deletions(-)
 delete mode 100644 unreleased_history/behavior_changes/large-txn-default-val.md
 delete mode 100644 unreleased_history/bug_fixes/create-with-import.md
 delete mode 100644 unreleased_history/bug_fixes/disallow_memtable_writes_paranoid_checks.md
 delete mode 100644 unreleased_history/bug_fixes/ingestion_file_checksum.md
 delete mode 100644 unreleased_history/new_features/avg-flush-trigger.md
 delete mode 100644 unreleased_history/new_features/concurrent_vector_insert.md
 delete mode 100644 unreleased_history/new_features/format_version_7.md
 delete mode 100644 unreleased_history/new_features/large-txn-byte-threshold.md
 delete mode 100644 unreleased_history/new_features/reduce_file_locking.md
 delete mode 100644 unreleased_history/public_api_changes/external_table_unique_ptr.md
 delete mode 100644 unreleased_history/public_api_changes/min-file-size-dtc.md

diff --git a/HISTORY.md b/HISTORY.md
index 9084ed860765..b9089d9e50a4 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -1,6 +1,26 @@
 # Rocksdb Change Log
 > NOTE: Entries for next release do not go here. Follow instructions in `unreleased_history/README.txt`
 
+## 10.4.0 (06/20/2025)
+### New Features
+* Add a new CF option `memtable_avg_op_scan_flush_trigger` that supports triggering memtable flush when an iterator scans through an expensive range of keys, with the average number of skipped keys from the active memtable exceeding the threshold.
+* Vector based memtable now supports concurrent writers (DBOptions::allow_concurrent_memtable_write) #13675.
+* Add new experimental `TransactionOptions::large_txn_commit_optimize_byte_threshold` to enable optimizations for large transaction commit by transaction batch data size.
+* Add a new option `CompactionOptionsUniversal::reduce_file_locking` and if it's true, auto universal compaction picking will adjust to minimize locking of input files when bottom priority compactions are waiting to run. This can increase the likelihood of existing L0s being selected for compaction, thereby improving write stall and reducing read regression.
+* Add new `format_version=7` to aid experimental support of custom compression algorithms with CompressionManager and block-based table. This format version includes changing the format of `TableProperties::compression_name`.
+
+### Public API Changes
+* Change NewExternalTableFactory to return a unique_ptr instead of shared_ptr.
+* Add an optional min file size requirement for deletion triggered compaction. It can be specified when creating `CompactOnDeletionCollectorFactory`.
+
+### Behavior Changes
+* `TransactionOptions::large_txn_commit_optimize_threshold` now has default value 0 for disabled. `TransactionDBOptions::txn_commit_bypass_memtable_threshold` now has no effect on transactions.
+
+### Bug Fixes
+* Fix a bug where CreateColumnFamilyWithImport() could miss the SST file for the memtable flush it triggered. The exported CF then may not contain the updates in the memtable when CreateColumnFamilyWithImport() is called.
+* Fix iterator operations returning NotImplemented status if disallow_memtable_writes and paranoid_memory_checks CF options are both set.
+* Fixed handling of file checksums in IngestExternalFile() to allow providing checksums using recognized but not necessarily the DB's preferred checksum function, to ease migration between checksum functions.
+
 ## 10.3.0 (05/17/2025)
 ### New Features
 * Add new experimental `CompactionOptionsFIFO::allow_trivial_copy_when_change_temperature` along with `CompactionOptionsFIFO::trivial_copy_buffer_size` to allow optimizing FIFO compactions with tiering when kChangeTemperature to move files from source tier FileSystem to another tier FileSystem via trivial and direct copying raw sst file instead of reading thru the content of the SST file then rebuilding the table files.
diff --git a/Makefile b/Makefile
index 87778338075e..5426c099f9ad 100644
--- a/Makefile
+++ b/Makefile
@@ -2492,7 +2492,7 @@ checkout_folly:
 	fi
 	@# Pin to a particular version for public CI, so that PR authors don't
 	@# need to worry about folly breaking our integration. Update periodically
-	cd third-party/folly && git reset --hard d17bf897cb5bbf8f07b122a614e8cffdc38edcde
+	cd third-party/folly && git reset --hard 5c626dd6a028a02e461edb5396694d48305e9284
 	@# Apparently missing include
 	perl -pi -e 's/(#include <atomic>)/$$1\n#include <cstring>/' third-party/folly/folly/lang/Exception.h
 	@# Warning-as-error on memcpy
diff --git a/include/rocksdb/version.h b/include/rocksdb/version.h
index 9890c3682fec..c81a31aa96c4 100644
--- a/include/rocksdb/version.h
+++ b/include/rocksdb/version.h
@@ -12,7 +12,7 @@
 // NOTE: in 'main' development branch, this should be the *next*
 // minor or major version number planned for release.
 #define ROCKSDB_MAJOR 10
-#define ROCKSDB_MINOR 4
+#define ROCKSDB_MINOR 5
 #define ROCKSDB_PATCH 0
 
 // Do not use these. We made the mistake of declaring macros starting with
diff --git a/tools/check_format_compatible.sh b/tools/check_format_compatible.sh
index fa84094f8452..0b8df70b0295 100755
--- a/tools/check_format_compatible.sh
+++ b/tools/check_format_compatible.sh
@@ -137,7 +137,7 @@ EOF
 
 # To check for DB forward compatibility with loading options (old version
 # reading data from new), as well as backward compatibility
-declare -a db_forward_with_options_refs=("8.6.fb" "8.7.fb" "8.8.fb" "8.9.fb" "8.10.fb" "8.11.fb" "9.0.fb" "9.1.fb" "9.2.fb" "9.3.fb" "9.4.fb" "9.5.fb" "9.6.fb" "9.7.fb" "9.8.fb" "9.9.fb" "9.10.fb" "9.11.fb" "10.0.fb" "10.1.fb" "10.2.fb" "10.3.fb")
+declare -a db_forward_with_options_refs=("8.6.fb" "8.7.fb" "8.8.fb" "8.9.fb" "8.10.fb" "8.11.fb" "9.0.fb" "9.1.fb" "9.2.fb" "9.3.fb" "9.4.fb" "9.5.fb" "9.6.fb" "9.7.fb" "9.8.fb" "9.9.fb" "9.10.fb" "9.11.fb" "10.0.fb" "10.1.fb" "10.2.fb" "10.3.fb" "10.4.fb")
 # To check for DB forward compatibility without loading options (in addition
 # to the "with loading options" set), as well as backward compatibility
 declare -a db_forward_no_options_refs=() # N/A at the moment
diff --git a/unreleased_history/behavior_changes/large-txn-default-val.md b/unreleased_history/behavior_changes/large-txn-default-val.md
deleted file mode 100644
index 7f0dde81c3cd..000000000000
--- a/unreleased_history/behavior_changes/large-txn-default-val.md
+++ /dev/null
@@ -1 +0,0 @@
-* `TransactionOptions::large_txn_commit_optimize_threshold` now has default value 0 for disabled. `TransactionDBOptions::txn_commit_bypass_memtable_threshold` now has no effect on transactions.
diff --git a/unreleased_history/bug_fixes/create-with-import.md b/unreleased_history/bug_fixes/create-with-import.md
deleted file mode 100644
index 12efa1d4321f..000000000000
--- a/unreleased_history/bug_fixes/create-with-import.md
+++ /dev/null
@@ -1 +0,0 @@
-* Fix a bug where CreateColumnFamilyWithImport() could miss the SST file for the memtable flush it triggered. The exported CF then may not contain the updates in the memtable when CreateColumnFamilyWithImport() is called.
diff --git a/unreleased_history/bug_fixes/disallow_memtable_writes_paranoid_checks.md b/unreleased_history/bug_fixes/disallow_memtable_writes_paranoid_checks.md
deleted file mode 100644
index d4aea983272b..000000000000
--- a/unreleased_history/bug_fixes/disallow_memtable_writes_paranoid_checks.md
+++ /dev/null
@@ -1 +0,0 @@
-Fix iterator operations returning NotImplemented status if disallow_memtable_writes and paranoid_memory_checks CF options are both set.
diff --git a/unreleased_history/bug_fixes/ingestion_file_checksum.md b/unreleased_history/bug_fixes/ingestion_file_checksum.md
deleted file mode 100644
index 28ee8c59ca5e..000000000000
--- a/unreleased_history/bug_fixes/ingestion_file_checksum.md
+++ /dev/null
@@ -1 +0,0 @@
-* Fixed handling of file checksums in IngestExternalFile() to allow providing checksums using recognized but not necessarily the DB's preferred checksum function, to ease migration between checksum functions.
diff --git a/unreleased_history/new_features/avg-flush-trigger.md b/unreleased_history/new_features/avg-flush-trigger.md
deleted file mode 100644
index 53fd31e89dae..000000000000
--- a/unreleased_history/new_features/avg-flush-trigger.md
+++ /dev/null
@@ -1 +0,0 @@
-* Add a new CF option `memtable_avg_op_scan_flush_trigger` that supports triggering memtable flush when an iterator scans through an expensive range of keys, with the average number of skipped keys from the active memtable exceeding the threshold.
diff --git a/unreleased_history/new_features/concurrent_vector_insert.md b/unreleased_history/new_features/concurrent_vector_insert.md
deleted file mode 100644
index b85f2c8d31a2..000000000000
--- a/unreleased_history/new_features/concurrent_vector_insert.md
+++ /dev/null
@@ -1 +0,0 @@
-* Vector based memtable now supports concurrent writers (DBOptions::allow_concurrent_memtable_write) #13675.
diff --git a/unreleased_history/new_features/format_version_7.md b/unreleased_history/new_features/format_version_7.md
deleted file mode 100644
index 5cf24cb52945..000000000000
--- a/unreleased_history/new_features/format_version_7.md
+++ /dev/null
@@ -1 +0,0 @@
-* Add new `format_version=7` to aid experimental support of custom compression algorithms with CompressionManager and block-based table. This format version includes changing the format of `TableProperties::compression_name`.
diff --git a/unreleased_history/new_features/large-txn-byte-threshold.md b/unreleased_history/new_features/large-txn-byte-threshold.md
deleted file mode 100644
index 4d781c41efc7..000000000000
--- a/unreleased_history/new_features/large-txn-byte-threshold.md
+++ /dev/null
@@ -1 +0,0 @@
-* Add new experimental `TransactionOptions::large_txn_commit_optimize_byte_threshold` to enable optimizations for large transaction commit by transaction batch data size.
diff --git a/unreleased_history/new_features/reduce_file_locking.md b/unreleased_history/new_features/reduce_file_locking.md
deleted file mode 100644
index d2f04d60cc00..000000000000
--- a/unreleased_history/new_features/reduce_file_locking.md
+++ /dev/null
@@ -1 +0,0 @@
-Add a new option `CompactionOptionsUniversal::reduce_file_locking` and if it's true, auto universal compaction picking will adjust to minimize locking of input files when bottom priority compactions are waiting to run. This can increase the likelihood of existing L0s being selected for compaction, thereby improving write stall and reducing read regression.
diff --git a/unreleased_history/public_api_changes/external_table_unique_ptr.md b/unreleased_history/public_api_changes/external_table_unique_ptr.md
deleted file mode 100644
index 29a4a98cba26..000000000000
--- a/unreleased_history/public_api_changes/external_table_unique_ptr.md
+++ /dev/null
@@ -1 +0,0 @@
-Change NewExternalTableFactory to return a unique_ptr instead of shared_ptr.
diff --git a/unreleased_history/public_api_changes/min-file-size-dtc.md b/unreleased_history/public_api_changes/min-file-size-dtc.md
deleted file mode 100644
index ae6ad13a2134..000000000000
--- a/unreleased_history/public_api_changes/min-file-size-dtc.md
+++ /dev/null
@@ -1 +0,0 @@
-* Add an optional min file size requirement for deletion triggered compaction. It can be specified when creating `CompactOnDeletionCollectorFactory`.

From 820a30f0d23a79325b15cddd96f79f47178d5391 Mon Sep 17 00:00:00 2001
From: Sujit Maharjan <sujitmaharjan@meta.com>
Date: Mon, 23 Jun 2025 11:10:13 -0700
Subject: [PATCH 151/500] Fix AutoSkipCompressionManager test should not be run
 with preferred compression kNoCompression (#13716)

Summary:
The nightly build was failing because we were using the AutoSkipCompressionManager with kNoCompression. The test cases should not be running with NoCompression.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13716

Test Plan:
Run the test code being run on the nightly build.
```bash
  make V=1 J=4 -j4 check
```

Reviewed By: hx235

Differential Revision: D77042874

Pulled By: shubhajeet

fbshipit-source-id: 821643b30ca53b1855fc24e3bc0a319e4fec2876
---
 util/compression_test.cc | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/util/compression_test.cc b/util/compression_test.cc
index 0b349f028780..5840d180a9a3 100644
--- a/util/compression_test.cc
+++ b/util/compression_test.cc
@@ -167,7 +167,6 @@ class DBAutoSkip : public DBTestBase {
     bbto.flush_block_policy_factory.reset(
         new AutoSkipTestFlushBlockPolicyFactory(10, statistics));
     options.table_factory.reset(NewBlockBasedTableFactory(bbto));
-    DestroyAndReopen(options);
   }
 
   bool CompressionFriendlyPut(const int no_of_kvs, const int size_of_value) {
@@ -190,13 +189,14 @@ class DBAutoSkip : public DBTestBase {
   }
 };
 
-// FIXME: the test is failing the assertion in auto_skip_compressor.cc
-// when run on nightly build in build-linux-arm-test-full mode [1].
-//
-// [1]
-// auto_skip_compressor.cc:101: Assertion `preferred != kNoCompression' failed.
-TEST_F(DBAutoSkip, DISABLED_AutoSkipCompressionManager) {
-  if (GetSupportedCompressions().size() > 1) {
+TEST_F(DBAutoSkip, AutoSkipCompressionManager) {
+  for (auto type : GetSupportedCompressions()) {
+    if (type == kNoCompression) {
+      continue;
+    }
+    options.compression = type;
+    options.bottommost_compression = type;
+    DestroyAndReopen(options);
     const int kValueSize = 20000;
     // This will set the rejection ratio to 60%
     CompressionUnfriendlyPut(6, kValueSize);

From 29ec7aaa519ff4dbd758e3f98ec43b4c9f97ca10 Mon Sep 17 00:00:00 2001
From: Changyu Bi <changyubi@meta.com>
Date: Wed, 25 Jun 2025 12:39:39 -0700
Subject: [PATCH 152/500] Update CI jobs for upgrade and cost (#13717)

Summary:
The Windows 2019 will be [deprecated](https://github.com/actions/runner-images/issues/12045) soon so I'm updating it to Windows 2022, and removed the same job from nightly runs.

To save some CI cost, I moved some jobs into nightly since they have low failure rates and examples/fuzzers are not updated often: https://github.com/facebook/rocksdb/actions/metrics/performance?dateRangeType=DATE_RANGE_TYPE_PREVIOUS_MONTH&sort=failureRate%2CORDER_BY_DIRECTION_ASC&tab=jobs&filters=workflow_file_name%3Apr-jobs.yml.

I don't think microbench is used/looked at so I'm deleting it from nightly too.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13717

Test Plan: CI

Reviewed By: jaykorean

Differential Revision: D77234715

Pulled By: cbi42

fbshipit-source-id: 75a5edf56391e4743efa1824b4070208ef10f280
---
 .github/workflows/nightly.yml | 77 +++++++++++++++++++++++++----------
 .github/workflows/pr-jobs.yml | 63 ++--------------------------
 2 files changed, 60 insertions(+), 80 deletions(-)

diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml
index 1370a5460402..6d3139e799fb 100644
--- a/.github/workflows/nightly.yml
+++ b/.github/workflows/nightly.yml
@@ -27,18 +27,6 @@ jobs:
         git config --global --add safe.directory /__w/rocksdb/rocksdb
         tools/check_format_compatible.sh
     - uses: "./.github/actions/post-steps"
-  build-linux-run-microbench:
-    if: ${{ github.repository_owner == 'facebook' }}
-    runs-on:
-      labels: 16-core-ubuntu
-    container:
-      image: zjay437/rocksdb:0.6
-      options: --shm-size=16gb
-    steps:
-    - uses: actions/checkout@v4.1.0
-    - uses: "./.github/actions/pre-steps"
-    - run: DEBUG_LEVEL=0 make -j32 run_microbench
-    - uses: "./.github/actions/post-steps"
   build-linux-non-shm:
     if: ${{ github.repository_owner == 'facebook' }}
     runs-on:
@@ -91,15 +79,6 @@ jobs:
     steps:
     - uses: actions/checkout@v4.1.0
     - uses: "./.github/actions/windows-build-steps"
-  build-windows-vs2022:
-    if: ${{ github.repository_owner == 'facebook' }}
-    runs-on: windows-2022
-    env:
-      CMAKE_GENERATOR: Visual Studio 17 2022
-      CMAKE_PORTABLE: 1
-    steps:
-    - uses: actions/checkout@v4.1.0
-    - uses: "./.github/actions/windows-build-steps"
   build-linux-arm-test-full:
     if: ${{ github.repository_owner == 'facebook' }}
     runs-on:
@@ -110,3 +89,59 @@ jobs:
       - run: sudo apt-get update && sudo apt-get install -y build-essential libgflags-dev
       - run: make V=1 J=4 -j4 check
       - uses: "./.github/actions/post-steps"
+  build-examples:
+    if: ${{ github.repository_owner == 'facebook' }}
+    runs-on:
+      labels: 4-core-ubuntu
+    container:
+      image: zjay437/rocksdb:0.6
+      options: --shm-size=16gb
+    steps:
+    - uses: actions/checkout@v4.1.0
+    - uses: "./.github/actions/pre-steps"
+    - name: Build examples
+      run: make V=1 -j4 static_lib && cd examples && make V=1 -j4
+    - uses: "./.github/actions/post-steps"
+  build-fuzzers:
+    if: ${{ github.repository_owner == 'facebook' }}
+    runs-on:
+      labels: 4-core-ubuntu
+    container:
+      image: zjay437/rocksdb:0.6
+      options: --shm-size=16gb
+    steps:
+    - uses: actions/checkout@v4.1.0
+    - uses: "./.github/actions/pre-steps"
+    - name: Build rocksdb lib
+      run: CC=clang-13 CXX=clang++-13 USE_CLANG=1 make -j4 static_lib
+    - name: Build fuzzers
+      run: cd fuzz && make sst_file_writer_fuzzer db_fuzzer db_map_fuzzer
+    - uses: "./.github/actions/post-steps"
+  build-linux-gcc-11-no_test_run:
+    if: ${{ github.repository_owner == 'facebook' }}
+    runs-on:
+      labels: 16-core-ubuntu
+    container:
+      image: zjay437/rocksdb:0.6
+      options: --shm-size=16gb
+    steps:
+    - uses: actions/checkout@v4.1.0
+    - uses: "./.github/actions/pre-steps"
+    - run: LIB_MODE=static CC=gcc-11 CXX=g++-11 V=1 make -j32 all microbench
+    - uses: "./.github/actions/post-steps"
+  build-linux-cmake-with-folly-lite-no-test:
+    if: ${{ github.repository_owner == 'facebook' }}
+    runs-on:
+      labels: 16-core-ubuntu
+    container:
+      image: zjay437/rocksdb:0.6
+      options: --shm-size=16gb
+    env:
+      CC: gcc-10
+      CXX: g++-10
+    steps:
+    - uses: actions/checkout@v4.1.0
+    - uses: "./.github/actions/pre-steps"
+    - uses: "./.github/actions/setup-folly"
+    - run: "(mkdir build && cd build && cmake -DUSE_FOLLY_LITE=1 -DWITH_GFLAGS=1 .. && make V=1 -j20)"
+    - uses: "./.github/actions/post-steps"
diff --git a/.github/workflows/pr-jobs.yml b/.github/workflows/pr-jobs.yml
index ec221bfb0695..83e229378ea7 100644
--- a/.github/workflows/pr-jobs.yml
+++ b/.github/workflows/pr-jobs.yml
@@ -109,22 +109,6 @@ jobs:
     - uses: "./.github/actions/build-folly"
     - run: "(mkdir build && cd build && cmake -DUSE_FOLLY=1 -DWITH_GFLAGS=1 -DROCKSDB_BUILD_SHARED=0 .. && make V=1 -j20 && ctest -j20)"
     - uses: "./.github/actions/post-steps"
-  build-linux-cmake-with-folly-lite-no-test:
-    if: ${{ github.repository_owner == 'facebook' }}
-    runs-on:
-      labels: 16-core-ubuntu
-    container:
-      image: zjay437/rocksdb:0.6
-      options: --shm-size=16gb
-    env:
-      CC: gcc-10
-      CXX: g++-10
-    steps:
-    - uses: actions/checkout@v4.1.0
-    - uses: "./.github/actions/pre-steps"
-    - uses: "./.github/actions/setup-folly"
-    - run: "(mkdir build && cd build && cmake -DUSE_FOLLY_LITE=1 -DWITH_GFLAGS=1 .. && make V=1 -j20)"
-    - uses: "./.github/actions/post-steps"
   build-linux-make-with-folly:
     if: ${{ github.repository_owner == 'facebook' }}
     runs-on:
@@ -242,34 +226,6 @@ jobs:
     - run: apt-get remove -y libgflags-dev
     - run: USE_RTTI=1 DEBUG_LEVEL=0 make V=1 -j16 static_lib tools db_bench
     - run: if ./db_stress --version; then false; else true; fi
-  build-examples:
-    if: ${{ github.repository_owner == 'facebook' }}
-    runs-on:
-      labels: 4-core-ubuntu
-    container:
-      image: zjay437/rocksdb:0.6
-      options: --shm-size=16gb
-    steps:
-    - uses: actions/checkout@v4.1.0
-    - uses: "./.github/actions/pre-steps"
-    - name: Build examples
-      run: make V=1 -j4 static_lib && cd examples && make V=1 -j4
-    - uses: "./.github/actions/post-steps"
-  build-fuzzers:
-    if: ${{ github.repository_owner == 'facebook' }}
-    runs-on:
-      labels: 4-core-ubuntu
-    container:
-      image: zjay437/rocksdb:0.6
-      options: --shm-size=16gb
-    steps:
-    - uses: actions/checkout@v4.1.0
-    - uses: "./.github/actions/pre-steps"
-    - name: Build rocksdb lib
-      run: CC=clang-13 CXX=clang++-13 USE_CLANG=1 make -j4 static_lib
-    - name: Build fuzzers
-      run: cd fuzz && make sst_file_writer_fuzzer db_fuzzer db_map_fuzzer
-    - uses: "./.github/actions/post-steps"
   build-linux-clang-no_test_run:
     if: ${{ github.repository_owner == 'facebook' }}
     runs-on:
@@ -317,18 +273,7 @@ jobs:
     - uses: "./.github/actions/pre-steps"
     - run: CC=gcc-10 CXX=g++-10 V=1 ROCKSDB_CXX_STANDARD=c++20 make -j32 all
     - uses: "./.github/actions/post-steps"
-  build-linux-gcc-11-no_test_run:
-    if: ${{ github.repository_owner == 'facebook' }}
-    runs-on:
-      labels: 16-core-ubuntu
-    container:
-      image: zjay437/rocksdb:0.6
-      options: --shm-size=16gb
-    steps:
-    - uses: actions/checkout@v4.1.0
-    - uses: "./.github/actions/pre-steps"
-    - run: LIB_MODE=static CC=gcc-11 CXX=g++-11 V=1 make -j32 all microbench
-    - uses: "./.github/actions/post-steps"
+
   # ======================== Linux Other Checks ======================= #
   build-linux-clang10-clang-analyze:
     if: ${{ github.repository_owner == 'facebook' }}
@@ -469,11 +414,11 @@ jobs:
     - uses: "./.github/actions/post-steps"
   # ======================== Windows with Tests ======================= #
   # NOTE: some windows jobs are in "nightly" to save resources
-  build-windows-vs2019:
+  build-windows-vs2022:
     if: ${{ github.repository_owner == 'facebook' }}
-    runs-on: windows-2019
+    runs-on: windows-2022
     env:
-      CMAKE_GENERATOR: Visual Studio 16 2019
+      CMAKE_GENERATOR: Visual Studio 17 2022
       CMAKE_PORTABLE: 1
     steps:
     - uses: actions/checkout@v4.1.0

From 08dc5cacd9a615e64cce1c718301cf200eb730df Mon Sep 17 00:00:00 2001
From: Changyu Bi <changyubi@meta.com>
Date: Wed, 25 Jun 2025 13:32:08 -0700
Subject: [PATCH 153/500] Check op count in WBWI vs WB when ingesting WBWI
 (#13722)

Summary:
Large txn commit optimization requires all updates are added to a transaction's WriteBatchWithIndex. However, some usage of transactions may add updates directly to the WBWI's underlying write batch. In these cases, we should not attempt to ingest the WBWI since it will drop these updates. This PR adds sanity checking for this.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13722

Test Plan:
- added checks in unit test and stress test
- manually check LOG files for the new unit test

Reviewed By: hx235

Differential Revision: D77247688

Pulled By: cbi42

fbshipit-source-id: 3d1c0c6e64d6d7dfd5578bc4d77abe44cac1e419
---
 db_stress_tool/db_stress_test_base.cc         |  4 ++
 include/rocksdb/utilities/transaction_db.h    |  4 +-
 .../utilities/write_batch_with_index.h        |  2 +
 .../transactions/pessimistic_transaction.cc   | 24 ++++++++-
 utilities/transactions/transaction_test.cc    | 51 +++++++++++++++++++
 .../write_batch_with_index.cc                 | 12 +++--
 .../write_batch_with_index_test.cc            |  9 ++++
 7 files changed, 99 insertions(+), 7 deletions(-)

diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc
index 8ead78588c4e..e72ce36795dc 100644
--- a/db_stress_tool/db_stress_test_base.cc
+++ b/db_stress_tool/db_stress_test_base.cc
@@ -876,6 +876,10 @@ Status StressTest::CommitTxn(Transaction& txn, ThreadState* thread) {
     return Status::InvalidArgument("CommitTxn when FLAGS_use_txn is not set");
   }
   Status s = Status::OK();
+  // We don't issue write to transaction's underlying WriteBatch in stress test
+  assert(txn.GetWriteBatch()->GetWriteBatch()->Count());
+  assert(txn.GetWriteBatch()->GetWBWIOpCount() ==
+         txn.GetWriteBatch()->GetWriteBatch()->Count());
   if (FLAGS_use_optimistic_txn) {
     assert(optimistic_txn_db_);
     s = txn.Commit();
diff --git a/include/rocksdb/utilities/transaction_db.h b/include/rocksdb/utilities/transaction_db.h
index 4a69c141b06d..c5c10be0c8b5 100644
--- a/include/rocksdb/utilities/transaction_db.h
+++ b/include/rocksdb/utilities/transaction_db.h
@@ -396,9 +396,9 @@ struct TransactionOptions {
   // due to too many memtables.
   // Note that the ingestion relies on the transaction's underlying index,
   // (WriteBatchWithIndex), so updates that are added to the transaction
-  // without indexing (e.g. added directly to the transaction underlying
+  // without indexing (i.e. added directly to the transaction underlying
   // write batch through Transaction::GetWriteBatch()->GetWriteBatch())
-  // are not supported. They will not be applied to the DB.
+  // are not supported, and the optimization will not apply in that case.
   //
   // NOTE: since WBWI keep track of the most recent update per key, a Put
   // followed by a SingleDelete will be written to DB as a SingleDelete. This
diff --git a/include/rocksdb/utilities/write_batch_with_index.h b/include/rocksdb/utilities/write_batch_with_index.h
index 9d3914c1b44f..edced15b9ec7 100644
--- a/include/rocksdb/utilities/write_batch_with_index.h
+++ b/include/rocksdb/utilities/write_batch_with_index.h
@@ -379,6 +379,8 @@ class WriteBatchWithIndex : public WriteBatchBase {
   };
   const std::unordered_map<uint32_t, CFStat>& GetCFStats() const;
 
+  // The total number of operations issued into this WBWI.
+  size_t GetWBWIOpCount() const;
   bool GetOverwriteKey() const;
 
  private:
diff --git a/utilities/transactions/pessimistic_transaction.cc b/utilities/transactions/pessimistic_transaction.cc
index a5b22a579279..9bdb587274f3 100644
--- a/utilities/transactions/pessimistic_transaction.cc
+++ b/utilities/transactions/pessimistic_transaction.cc
@@ -898,10 +898,30 @@ Status WriteCommittedTxn::CommitInternal() {
   if (!needs_ts) {
     if (commit_bypass_memtable_threshold_ &&
         wb_count >= commit_bypass_memtable_threshold_) {
-      bypass_memtable = true;
+      if (wbwi->GetWBWIOpCount() != wb_count) {
+        ROCKS_LOG_WARN(
+            db_impl_->immutable_db_options().info_log,
+            "Transaction %s qualifies for commit optimization due to update "
+            "count. However, it will commit normally due to wbwi and wb record "
+            "count mismatch. Some updates were added directly to the "
+            "transaction's underlying write batch.",
+            GetName().c_str());
+      } else {
+        bypass_memtable = true;
+      }
     } else if (commit_bypass_memtable_byte_threshold_ &&
                wb->GetDataSize() >= commit_bypass_memtable_byte_threshold_) {
-      bypass_memtable = true;
+      if (wbwi->GetWBWIOpCount() != wb_count) {
+        ROCKS_LOG_WARN(
+            db_impl_->immutable_db_options().info_log,
+            "Transaction %s qualifies for commit optimization due to write "
+            "batch size. However, it will commit normally due to wbwi and wb "
+            "record count mismatch. Some updates were added directly to the "
+            "transaction's underlying write batch.",
+            GetName().c_str());
+      } else {
+        bypass_memtable = true;
+      }
     }
   }
   if (!bypass_memtable) {
diff --git a/utilities/transactions/transaction_test.cc b/utilities/transactions/transaction_test.cc
index bf5bbc562925..226914733524 100644
--- a/utilities/transactions/transaction_test.cc
+++ b/utilities/transactions/transaction_test.cc
@@ -9929,6 +9929,57 @@ TEST_P(CommitBypassMemtableTest,
 
   delete txn_cf;
 }
+
+TEST_P(CommitBypassMemtableTest, WBWIOpCountMismatchWBCount) {
+  // Tests that large txn optimization checks op count in WBWI vs WB. When an
+  // update is written directly to a transaction's underlying write batch, the
+  // optimization should not apply.
+  SetUpTransactionDB();
+  bool commit_bypass_memtable = false;
+  SyncPoint::GetInstance()->SetCallBack(
+      "WriteCommittedTxn::CommitInternal:bypass_memtable",
+      [&](void* arg) { commit_bypass_memtable = *(static_cast<bool*>(arg)); });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  Random rnd(301);
+  {
+    WriteOptions wopts;
+    TransactionOptions txn_opts;
+    txn_opts.large_txn_commit_optimize_byte_threshold = 100;
+    auto txn = txn_db->BeginTransaction(wopts, txn_opts, nullptr);
+    ASSERT_OK(txn->SetName("xid0"));
+    ASSERT_OK(txn->Put("k1", rnd.RandomString(1000)));
+    // This update is written directly to the underlying write batch, so the
+    // optimization should not apply.
+    ASSERT_OK(txn->GetWriteBatch()->GetWriteBatch()->Put("meta", "1"));
+    ASSERT_OK(txn->Prepare());
+    ASSERT_OK(txn->Commit());
+    ASSERT_FALSE(commit_bypass_memtable);
+
+    ASSERT_EQ(Get("meta"), "1");
+    delete txn;
+  }
+
+  {
+    WriteOptions wopts;
+    TransactionOptions txn_opts;
+    txn_opts.large_txn_commit_optimize_threshold = 10;
+    auto txn = txn_db->BeginTransaction(wopts, txn_opts, nullptr);
+    ASSERT_OK(txn->SetName("xid0"));
+    for (int i = 0; i < 10; ++i) {
+      ASSERT_OK(txn->Put(Key(i), rnd.RandomString(10)));
+    }
+    // This update is written directly to the underlying write batch, so the
+    // optimization should not apply.
+    ASSERT_OK(txn->GetWriteBatch()->GetWriteBatch()->Put("meta", "2"));
+    ASSERT_OK(txn->Prepare());
+    ASSERT_OK(txn->Commit());
+    ASSERT_FALSE(commit_bypass_memtable);
+
+    ASSERT_EQ(Get("meta"), "2");
+    delete txn;
+  }
+}
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/utilities/write_batch_with_index/write_batch_with_index.cc b/utilities/write_batch_with_index/write_batch_with_index.cc
index 0bd6c42fd0d8..3171c0bf71f2 100644
--- a/utilities/write_batch_with_index/write_batch_with_index.cc
+++ b/utilities/write_batch_with_index/write_batch_with_index.cc
@@ -32,7 +32,8 @@ struct WriteBatchWithIndex::Rep {
         skip_list(comparator, &arena),
         last_sub_batch_offset(0),
         sub_batch_cnt(1),
-        overwrite_key(_overwrite_key) {}
+        overwrite_key(_overwrite_key),
+        op_count(0) {}
   ReadableWriteBatch write_batch;
   WriteBatchEntryComparator comparator;
   Arena arena;
@@ -48,7 +49,8 @@ struct WriteBatchWithIndex::Rep {
   // Tracks ids of CFs that have updates in this WBWI, number of updates and
   // number of overwritten single deletions per cf. Useful for WBWIMemTable
   // when this WBWI is ingested into a DB.
-  std::unordered_map<uint32_t, CFStat> cf_id_to_stat;
+  std::unordered_map<uint32_t, WriteBatchWithIndex::CFStat> cf_id_to_stat;
+  size_t op_count;
 
   // In overwrite mode, find the existing entry for the same key and update it
   // to point to the current entry if this is not a Merge operation.
@@ -154,6 +156,7 @@ bool WriteBatchWithIndex::Rep::UpdateExistingEntryWithCfId(
 void WriteBatchWithIndex::Rep::AddOrUpdateIndexWithCfId(
     uint32_t cf_id, const Slice& key, WriteType type, size_t last_entry_offset,
     const Comparator* cf_cmp) {
+  op_count++;
   uint32_t update_count = 0;
   if (!UpdateExistingEntryWithCfId(cf_id, key, type, last_entry_offset,
                                    &update_count)) {
@@ -201,7 +204,6 @@ void WriteBatchWithIndex::Rep::AddNewEntry(uint32_t column_family_id,
 
 void WriteBatchWithIndex::Rep::Clear() {
   write_batch.Clear();
-  cf_id_to_stat.clear();
   ClearIndex();
 }
 
@@ -212,6 +214,8 @@ void WriteBatchWithIndex::Rep::ClearIndex() {
   new (&skip_list) WriteBatchEntrySkipList(comparator, &arena);
   last_sub_batch_offset = 0;
   sub_batch_cnt = 1;
+  cf_id_to_stat.clear();
+  op_count = 0;
 }
 
 Status WriteBatchWithIndex::Rep::ReBuildIndex() {
@@ -1173,5 +1177,7 @@ WriteBatchWithIndex::GetCFStats() const {
   return rep->cf_id_to_stat;
 }
 
+size_t WriteBatchWithIndex::GetWBWIOpCount() const { return rep->op_count; }
+
 bool WriteBatchWithIndex::GetOverwriteKey() const { return rep->overwrite_key; }
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/utilities/write_batch_with_index/write_batch_with_index_test.cc b/utilities/write_batch_with_index/write_batch_with_index_test.cc
index 2c8b71203e17..a61de9129f23 100644
--- a/utilities/write_batch_with_index/write_batch_with_index_test.cc
+++ b/utilities/write_batch_with_index/write_batch_with_index_test.cc
@@ -342,6 +342,10 @@ void AssertIterEqual(WBWIIteratorImpl* wbwii,
   }
   ASSERT_FALSE(wbwii->Valid());
 }
+
+void AssertWBWICountEQWBCount(WriteBatchWithIndex& wbwi) {
+  ASSERT_EQ(wbwi.GetWBWIOpCount(), wbwi.GetWriteBatch()->Count());
+}
 }  // namespace
 
 class WBWIBaseTest : public testing::Test {
@@ -356,6 +360,8 @@ class WBWIBaseTest : public testing::Test {
   }
 
   virtual ~WBWIBaseTest() {
+    AssertWBWICountEQWBCount(*batch_);
+
     if (db_ != nullptr) {
       ReleaseSnapshot();
       delete db_;
@@ -715,6 +721,7 @@ TEST_P(WriteBatchWithIndexTest, TestValueAsSecondaryIndex) {
   batch_.reset(new WriteBatchWithIndex(nullptr, 20, GetParam()));
 
   TestValueAsSecondaryIndexHelper(entries_list, batch_.get(), GetParam());
+  AssertWBWICountEQWBCount(*batch_);
 
   // Clear batch and re-run test with new values
   batch_->Clear();
@@ -729,6 +736,7 @@ TEST_P(WriteBatchWithIndexTest, TestValueAsSecondaryIndex) {
   entries_list = std::vector<Entry>(new_entries, new_entries + 8);
 
   TestValueAsSecondaryIndexHelper(entries_list, batch_.get(), GetParam());
+  AssertWBWICountEQWBCount(*batch_);
 }
 
 TEST_P(WriteBatchWithIndexTest, WBWIIteratorImpl) {
@@ -3816,6 +3824,7 @@ TEST_F(WBWIMemTableTest, ReadFromWBWIMemtable) {
     // See comment for WBWIMemTable for sequence number assignment method.
     expected_seqno[idx]++;
   }
+  AssertWBWICountEQWBCount(*wbwi);
   // Get a non-existing key
   found_final_value = false;
   ASSERT_EQ("NOT_FOUND", Get("foo", wbwi_mem, visible_seq, &found_final_value));

From fd95bc8f5a5afe5b31d077e8a1f514b43a2f42de Mon Sep 17 00:00:00 2001
From: Sujit Maharjan <sujitmaharjan@meta.com>
Date: Thu, 26 Jun 2025 08:59:56 -0700
Subject: [PATCH 154/500] Custom Compressor for predicting the CPU and IO cost
 of the block level compression (#13711)

Summary:
This pull request implements the prediction aspect of auto-tuning compression in RocksDB, as part of Milestone 2. The goal is to optimize compression decisions to meet a given CPU and IO budget, based on the predicted CPU time and result compression ratio for compression decisions on a data block.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13711

Test Plan:
Ran benchmark tests to evaluate performance impact of new algorithm
Verified that optimization does not compromise overall system performance
```bash
SUFFIX=`tty | sed 's|/|_|g'`; for ARGS in "-compression_parallel_threads=1 -compression_type=zstd -compression_manager=none"  "-compression_parallel_threads=4 -compression_type=zstd -compression_manager=none" "-compression_parallel_threads=1 -compression_type=zstd -compression_manager=costpredictor"  "-compression_parallel_threads=4 -compression_type=zstd -compression_manager=costpredictor" ; do echo $ARGS; (for I in `seq 1 20`; do ./db_bench -db=/dev/shm/dbbench$SUFFIX --benchmarks=fillseq -num=10000000 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=1000 -fifo_compaction_allow_compaction=0 -disable_wal -write_buffer_size=12000000 $ARGS 2>&1 | grep micros/op; done) | awk '{n++; sum += $5;} END { print int(sum / n); }'; done

```
parallel threads | 1 | 4
-- | -- | --
master branch | 1076660.5 ops | 1668411.3 ops
new code compression manager="none" | 1057155.35 ops (-1.81%) | 1648664.2 ops (-1.18%)
new code compression manager="costpredictor" | 1080794.8 ops (0.38%)| 1652720.35 ops (-0.94%)

Used the mean absolute percentage error (MAPE) to show accuracy of the predictor.
```bash
./db_bench --db=/dev/shm/dbbench$SUFFIX --benchmarks=fillseq --compaction_style=2 --num=10000000 --fifo_compaction_max_table_files_size_mb=1000 --fifo_compaction_allow_compaction=0 --disable_wal --write_buffer_size=12000000 --statistics --stats_level=5 --value_size=2000 --compression_manager=costpredictor --compression_type=zstd --progress_reports=false 2>&1 | tee /tmp/predict.log
```

compression_name | compression_level | MAPE (cpu cost) | MAPE (io cost) | average measured_time (micro sec) | average predicted_time (micro sec) | average measured_io (bytes) | average predicted_io (bytes)
-- | -- | -- | -- | -- | -- | -- | --
Snappy | 0 | 16.979548 | 3.138885 | 3.639488 | 2.98755 | 2257.655152 | 2178.070375
LZ4 | 1 | 15.508632 | 3.103681 | 4.733639 | 4.010361 | 2257.803299 | 2179.82233
LZ4 | 4 | 15.471204 | 3.102158 | 4.731955 | 4.006011 | 2258.529203 | 2179.778441
LZ4 | 9 | 15.429305 | 3.09599 | 4.729104 | 4.007059 | 2257.822368 | 2179.927506
LZ4HC | 1 | 7.254545 | 3.112858 | 79.64412 | 76.603272 | 2258.636774 | 2177.464922
LZ4HC | 4 | 7.249132 | 3.085802 | 79.591264 | 76.576416 | 2255.098757 | 2176.126082
LZ4HC | 9 | 7.248921 | 3.09695 | 79.719061 | 76.614155 | 2253.772057 | 2175.882686
ZSTD | 1 | 8.728305 | 3.223971 | 18.93434 | 17.882706 | 1957.773706 | 1890.895071
ZSTD | 15 | 4.853552 | 3.238199 | 329.396574 | 318.277613 | 1918.021616 | 1853.833546
ZSTD | 22 | 4.275209 | 3.243137 | 625.471394 | 596.254939 | 1919.035477 | 1853.44902

```bash
./db_bench --db=/dev/shm/dbbench$SUFFIX --benchmarks=fillseq --compaction_style=2 --num=10000000 --fifo_compaction_max_table_files_size_mb=1000 --fifo_compaction_allow_compaction=0 --disable_wal --write_buffer_size=12000000 --statistics --stats_level=5 --value_size=2000 --compression_manager=costpredictor --compression_type=zstd --progress_reports=false --write_buffer_size=140737488355328 --block_size=16382
```
Increasing the block size i.e. doubling the measured time reduces the MAPE by half.
compression_name | compression_level | MAPE (cpu cost) | MAPE (io cost) | average measured_time (micro sec) | average predicted_time (micro sec) | average measured_io (bytes) | average predicted_io (bytes)
-- | -- | -- | -- | -- | -- | -- | --
Snappy | 0 | 7.933944 | 0.061173 | 7.187587 | 6.815071 | 4466.536629 | 4465.925648
LZ4 | 1 | 5.614279 | 0.050215 | 8.526641 | 8.14445 | 4473.768752 | 4473.159792
LZ4 | 4 | 5.617925 | 0.050317 | 8.525155 | 8.144209 | 4473.772343 | 4473.159782
LZ4 | 9 | 5.65519 | 0.050249 | 8.530569 | 8.14836 | 4473.762187 | 4473.150695
LZ4HC | 1 | 4.259648 | 0.028564 | 98.273778 | 97.820515 | 4471.691596 | 4471.05918
LZ4HC | 4 | 4.269529 | 0.027665 | 98.240579 | 97.788721 | 4465.537078 | 4464.901328
LZ4HC | 9 | 4.274553 | 0.027555 | 98.319357 | 97.8637 | 4465.539437 | 4464.903889
ZSTD | 1 | 4.909716 | 0.155441 | 29.503133 | 29.047057 | 3713.562704 | 3712.978633
ZSTD | 15 | 1.310407 | 0.162864 | 643.803097 | 635.960631 | 3797.544307 | 3705.772419
ZSTD | 22 | 1.011497 | 0.155876 | 1221.189822 | 1220.693678 | 3705.556448 | 3704.972332

Reviewed By: hx235

Differential Revision: D77065528

Pulled By: shubhajeet

fbshipit-source-id: f7f4ae018f786bfeae3eacf0135055c63e142610
---
 BUCK                                   |   2 +-
 CMakeLists.txt                         |   2 +-
 db/merge_helper.h                      |   2 +-
 include/rocksdb/advanced_compression.h |  12 +-
 src.mk                                 |   2 +-
 tools/db_bench_tool.cc                 |  52 ++---
 util/auto_skip_compressor.cc           | 131 -----------
 util/auto_skip_compressor.h            |  90 -------
 util/auto_tune_compressor.cc           | 309 +++++++++++++++++++++++++
 util/auto_tune_compressor.h            | 189 +++++++++++++++
 util/compression_test.cc               | 120 +++++++++-
 util/stop_watch.h                      |  24 +-
 12 files changed, 669 insertions(+), 266 deletions(-)
 delete mode 100644 util/auto_skip_compressor.cc
 delete mode 100644 util/auto_skip_compressor.h
 create mode 100644 util/auto_tune_compressor.cc
 create mode 100644 util/auto_tune_compressor.h

diff --git a/BUCK b/BUCK
index 52e256c342f9..1a106f9a5262 100644
--- a/BUCK
+++ b/BUCK
@@ -249,7 +249,7 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[
         "trace_replay/trace_record_result.cc",
         "trace_replay/trace_replay.cc",
         "util/async_file_reader.cc",
-        "util/auto_skip_compressor.cc",
+        "util/auto_tune_compressor.cc",
         "util/build_version.cc",
         "util/cleanable.cc",
         "util/coding.cc",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index b32049758221..b1e7a9215775 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -874,7 +874,7 @@ set(SOURCES
         trace_replay/trace_record.cc
         trace_replay/trace_replay.cc
         util/async_file_reader.cc
-        util/auto_skip_compressor.cc
+        util/auto_tune_compressor.cc
         util/cleanable.cc
         util/coding.cc
         util/compaction_job_stats_impl.cc
diff --git a/db/merge_helper.h b/db/merge_helper.h
index 39bd15f60876..3c016e6753e7 100644
--- a/db/merge_helper.h
+++ b/db/merge_helper.h
@@ -250,7 +250,7 @@ class MergeHelper {
   // Parallel with keys_; stores the operands
   mutable MergeContext merge_context_;
 
-  StopWatchNano filter_timer_;
+  StopWatchNano<> filter_timer_;
   uint64_t total_filter_time_;
   Statistics* stats_;
 
diff --git a/include/rocksdb/advanced_compression.h b/include/rocksdb/advanced_compression.h
index 682a5d6bc56a..42cd87ec391e 100644
--- a/include/rocksdb/advanced_compression.h
+++ b/include/rocksdb/advanced_compression.h
@@ -589,10 +589,14 @@ const std::shared_ptr<CompressionManager>& GetBuiltinV2CompressionManager();
 // lead to unexpected schema changes for user CompressionManagers building on
 // the built-in schema, in the unlikely/rare case of a new built-in schema.
 
-// Gets CompressionManager designed for the automated compression strategy.
+// Creates CompressionManager designed for the automated compression strategy.
 // This may include deciding to compress or not.
-// In future should be able to select compression algorithm based on the CPU
-// utilization and IO constraints.
+// EXPERIMENTAL
 std::shared_ptr<CompressionManagerWrapper> CreateAutoSkipCompressionManager(
-    std::shared_ptr<CompressionManager> wrapped);
+    std::shared_ptr<CompressionManager> wrapped = nullptr);
+// Creates CompressionManager designed for the CPU and IO cost aware compression
+// strategy
+// EXPERIMENTAL
+std::shared_ptr<CompressionManagerWrapper> CreateCostAwareCompressionManager(
+    std::shared_ptr<CompressionManager> wrapped = nullptr);
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/src.mk b/src.mk
index b1d5a59d8d0e..fff9c6e55086 100644
--- a/src.mk
+++ b/src.mk
@@ -237,13 +237,13 @@ LIB_SOURCES =                                                   \
   trace_replay/block_cache_tracer.cc                            \
   trace_replay/io_tracer.cc                                     \
   util/async_file_reader.cc					                            \
+  util/auto_tune_compressor.cc                                           \
   util/build_version.cc                                         \
   util/cleanable.cc                                             \
   util/coding.cc                                                \
   util/compaction_job_stats_impl.cc                             \
   util/comparator.cc                                            \
   util/compression.cc                                           \
-  util/auto_skip_compressor.cc                                           \
   util/compression_context_cache.cc                             \
   util/concurrent_task_limiter_impl.cc                          \
   util/crc32c.cc                                                \
diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc
index 4242dbbd9834..439df43ed93e 100644
--- a/tools/db_bench_tool.cc
+++ b/tools/db_bench_tool.cc
@@ -4639,36 +4639,32 @@ class Benchmark {
         FLAGS_level0_file_num_compaction_trigger;
     options.level0_slowdown_writes_trigger =
         FLAGS_level0_slowdown_writes_trigger;
-    if (!strcasecmp(FLAGS_compression_manager.c_str(), "none")) {
+    options.compression = FLAGS_compression_type_e;
+    std::shared_ptr<CompressionManagerWrapper> mgr = nullptr;
+    if (!strcasecmp(FLAGS_compression_manager.c_str(), "mixed")) {
+      mgr =
+          std::make_shared<RoundRobinManager>(GetBuiltinV2CompressionManager());
+    } else if (!strcasecmp(FLAGS_compression_manager.c_str(),
+                           "costpredictor")) {
+      mgr = CreateCostAwareCompressionManager();
+    } else if (!strcasecmp(FLAGS_compression_manager.c_str(), "autoskip")) {
+      mgr = CreateAutoSkipCompressionManager();
+    } else if (!strcasecmp(FLAGS_compression_manager.c_str(), "none")) {
       options.compression = FLAGS_compression_type_e;
     } else {
-      std::shared_ptr<CompressionManagerWrapper> mgr;
-      if (!strcasecmp(FLAGS_compression_manager.c_str(), "mixed")) {
-        // Need to list zstd in the compression_name table property if it's
-        // potentially used by being in the mix (i.e., potentially at least one
-        // data block in the table is compressed by zstd). This ensures proper
-        // context and dictionary handling, and prevents crashes in older
-        // RocksDB versions.
-        options.compression = kZSTD;
-        options.bottommost_compression = kZSTD;
-
-        mgr = std::make_shared<RoundRobinManager>(
-            GetBuiltinV2CompressionManager());
-      } else if (!strcasecmp(FLAGS_compression_manager.c_str(), "autoskip")) {
-        options.compression = FLAGS_compression_type_e;
-        if (FLAGS_compression_type_e == kNoCompression) {
-          fprintf(stderr,
-                  "Compression type must not be no Compression when using "
-                  "autoskip");
-          ErrorExit();
-        }
-        mgr =
-            CreateAutoSkipCompressionManager(GetBuiltinV2CompressionManager());
-      } else {
-        // not defined -> exit with error
-        fprintf(stderr, "Requested compression manager not supported");
-        ErrorExit();
-      }
+      // compression manager is not supported
+      // exit with error
+      fprintf(stderr, "Requested compression manager not supported");
+      ErrorExit();
+    }
+    if (FLAGS_compression_type_e == kNoCompression &&
+        strcasecmp(FLAGS_compression_manager.c_str(), "none")) {
+      fprintf(stderr,
+              "Compression type must not be no Compression when using "
+              "compression manager");
+      ErrorExit();
+    }
+    if (mgr != nullptr) {
       options.compression_manager = mgr;
     }
 
diff --git a/util/auto_skip_compressor.cc b/util/auto_skip_compressor.cc
deleted file mode 100644
index 0c8713ad2142..000000000000
--- a/util/auto_skip_compressor.cc
+++ /dev/null
@@ -1,131 +0,0 @@
-//  Copyright (c) Meta Platforms, Inc. and affiliates.
-//  This source code is licensed under both the GPLv2 (found in the
-//  COPYING file in the root directory) and Apache 2.0 License
-//  (found in the LICENSE.Apache file in the root directory).
-//
-
-#include "util/auto_skip_compressor.h"
-
-#include "options/options_helper.h"
-#include "rocksdb/advanced_compression.h"
-#include "test_util/sync_point.h"
-#include "util/random.h"
-namespace ROCKSDB_NAMESPACE {
-
-int CompressionRejectionProbabilityPredictor::Predict() const {
-  return pred_rejection_prob_percentage_;
-}
-
-size_t CompressionRejectionProbabilityPredictor::attempted_compression_count()
-    const {
-  return rejected_count_ + compressed_count_;
-}
-
-bool CompressionRejectionProbabilityPredictor::Record(
-    Slice uncompressed_block_data, std::string* compressed_output,
-    const CompressionOptions& opts) {
-  if (compressed_output->size() >
-      (static_cast<uint64_t>(opts.max_compressed_bytes_per_kb) *
-       uncompressed_block_data.size()) >>
-      10) {
-    rejected_count_++;
-  } else {
-    compressed_count_++;
-  }
-  auto attempted = attempted_compression_count();
-  if (attempted >= window_size_) {
-    pred_rejection_prob_percentage_ =
-        static_cast<int>(rejected_count_ * 100 / attempted);
-    compressed_count_ = 0;
-    rejected_count_ = 0;
-    assert(attempted_compression_count() == 0);
-  }
-  return true;
-}
-AutoSkipCompressorWrapper::AutoSkipCompressorWrapper(
-    std::unique_ptr<Compressor> compressor, const CompressionOptions& opts)
-    : CompressorWrapper::CompressorWrapper(std::move(compressor)),
-      kOpts(opts),
-      predictor_(
-          std::make_shared<CompressionRejectionProbabilityPredictor>(10)) {}
-
-const char* AutoSkipCompressorWrapper::Name() const {
-  return "AutoSkipCompressorWrapper";
-}
-
-Status AutoSkipCompressorWrapper::CompressBlock(
-    Slice uncompressed_data, std::string* compressed_output,
-    CompressionType* out_compression_type, ManagedWorkingArea* wa) {
-  // Check if the managed working area is provided or owned by this object.
-  // If not, bypass auto-skip logic since the working area lacks a predictor to
-  // record or make necessary decisions to compress or bypass compression of the
-  // block
-  if (wa == nullptr || wa->owner() != this) {
-    return wrapped_->CompressBlock(uncompressed_data, compressed_output,
-                                   out_compression_type, wa);
-  }
-  bool exploration =
-      Random::GetTLSInstance()->PercentTrue(kExplorationPercentage);
-  TEST_SYNC_POINT_CALLBACK(
-      "AutoSkipCompressorWrapper::CompressBlock::exploitOrExplore",
-      &exploration);
-  auto autoskip_wa = static_cast<AutoSkipWorkingArea*>(wa->get());
-  if (exploration) {
-    return CompressBlockAndRecord(uncompressed_data, compressed_output,
-                                  out_compression_type, autoskip_wa);
-  } else {
-    auto predictor_ptr = autoskip_wa->predictor;
-    auto prediction = predictor_ptr->Predict();
-    if (prediction <= kProbabilityCutOff) {
-      // decide to compress
-      return CompressBlockAndRecord(uncompressed_data, compressed_output,
-                                    out_compression_type, autoskip_wa);
-    } else {
-      // decide to bypass compression
-      *out_compression_type = kNoCompression;
-      return Status::OK();
-    }
-  }
-  return Status::OK();
-}
-
-Compressor::ManagedWorkingArea AutoSkipCompressorWrapper::ObtainWorkingArea() {
-  auto wrap_wa = wrapped_->ObtainWorkingArea();
-  return ManagedWorkingArea(new AutoSkipWorkingArea(std::move(wrap_wa)), this);
-}
-void AutoSkipCompressorWrapper::ReleaseWorkingArea(WorkingArea* wa) {
-  delete static_cast<AutoSkipWorkingArea*>(wa);
-}
-
-Status AutoSkipCompressorWrapper::CompressBlockAndRecord(
-    Slice uncompressed_data, std::string* compressed_output,
-    CompressionType* out_compression_type, AutoSkipWorkingArea* wa) {
-  Status status = wrapped_->CompressBlock(uncompressed_data, compressed_output,
-                                          out_compression_type, &(wa->wrapped));
-  // determine if it was rejected or compressed
-  auto predictor_ptr = wa->predictor;
-  predictor_ptr->Record(uncompressed_data, compressed_output, kOpts);
-  return status;
-}
-
-const char* AutoSkipCompressorManager::Name() const {
-  // should have returned "AutoSkipCompressorManager" but we currently have an
-  // error so for now returning name of the wrapped container
-  return wrapped_->Name();
-}
-
-std::unique_ptr<Compressor> AutoSkipCompressorManager::GetCompressorForSST(
-    const FilterBuildingContext& context, const CompressionOptions& opts,
-    CompressionType preferred) {
-  assert(GetSupportedCompressions().size() > 1);
-  assert(preferred != kNoCompression);
-  return std::make_unique<AutoSkipCompressorWrapper>(
-      wrapped_->GetCompressorForSST(context, opts, preferred), opts);
-}
-
-std::shared_ptr<CompressionManagerWrapper> CreateAutoSkipCompressionManager(
-    std::shared_ptr<CompressionManager> wrapped) {
-  return std::make_shared<AutoSkipCompressorManager>(
-      wrapped == nullptr ? GetBuiltinV2CompressionManager() : wrapped);
-}
-}  // namespace ROCKSDB_NAMESPACE
diff --git a/util/auto_skip_compressor.h b/util/auto_skip_compressor.h
deleted file mode 100644
index d93a4f4ade41..000000000000
--- a/util/auto_skip_compressor.h
+++ /dev/null
@@ -1,90 +0,0 @@
-//  Copyright (c) Meta Platforms, Inc. and affiliates.
-//  This source code is licensed under both the GPLv2 (found in the
-//  COPYING file in the root directory) and Apache 2.0 License
-//  (found in the LICENSE.Apache file in the root directory).
-//
-// Creates auto skip compressor wrapper which intelligently decides bypassing
-// compression based on past data
-
-#pragma once
-#include <memory>
-
-#include "rocksdb/advanced_compression.h"
-
-namespace ROCKSDB_NAMESPACE {
-// Predict rejection probability using a moving window approach
-class CompressionRejectionProbabilityPredictor {
- public:
-  CompressionRejectionProbabilityPredictor(int window_size)
-      : pred_rejection_prob_percentage_(0),
-        rejected_count_(0),
-        compressed_count_(0),
-        window_size_(window_size) {}
-  int Predict() const;
-  bool Record(Slice uncompressed_block_data, std::string* compressed_output,
-              const CompressionOptions& opts);
-  size_t attempted_compression_count() const;
-
- protected:
-  int pred_rejection_prob_percentage_;
-  size_t rejected_count_;
-  size_t compressed_count_;
-  size_t window_size_;
-};
-
-class AutoSkipWorkingArea : public Compressor::WorkingArea {
- public:
-  explicit AutoSkipWorkingArea(Compressor::ManagedWorkingArea&& wa)
-      : wrapped(std::move(wa)),
-        predictor(
-            std::make_shared<CompressionRejectionProbabilityPredictor>(10)) {}
-  ~AutoSkipWorkingArea() {}
-  AutoSkipWorkingArea(const AutoSkipWorkingArea&) = delete;
-  AutoSkipWorkingArea& operator=(const AutoSkipWorkingArea&) = delete;
-  AutoSkipWorkingArea(AutoSkipWorkingArea&& other) noexcept
-      : wrapped(std::move(other.wrapped)),
-        predictor(std::move(other.predictor)) {}
-
-  AutoSkipWorkingArea& operator=(AutoSkipWorkingArea&& other) noexcept {
-    if (this != &other) {
-      wrapped = std::move(other.wrapped);
-      predictor = std::move(other.predictor);
-    }
-    return *this;
-  }
-  Compressor::ManagedWorkingArea wrapped;
-  std::shared_ptr<CompressionRejectionProbabilityPredictor> predictor;
-};
-
-class AutoSkipCompressorWrapper : public CompressorWrapper {
- public:
-  const char* Name() const override;
-  explicit AutoSkipCompressorWrapper(std::unique_ptr<Compressor> compressor,
-                                     const CompressionOptions& opts);
-
-  Status CompressBlock(Slice uncompressed_data, std::string* compressed_output,
-                       CompressionType* out_compression_type,
-                       ManagedWorkingArea* wa) override;
-  ManagedWorkingArea ObtainWorkingArea() override;
-  void ReleaseWorkingArea(WorkingArea* wa) override;
-
- private:
-  Status CompressBlockAndRecord(Slice uncompressed_data,
-                                std::string* compressed_output,
-                                CompressionType* out_compression_type,
-                                AutoSkipWorkingArea* wa);
-  static constexpr int kExplorationPercentage = 10;
-  static constexpr int kProbabilityCutOff = 50;
-  const CompressionOptions kOpts;
-  std::shared_ptr<CompressionRejectionProbabilityPredictor> predictor_;
-};
-
-class AutoSkipCompressorManager : public CompressionManagerWrapper {
-  using CompressionManagerWrapper::CompressionManagerWrapper;
-  const char* Name() const override;
-  std::unique_ptr<Compressor> GetCompressorForSST(
-      const FilterBuildingContext& context, const CompressionOptions& opts,
-      CompressionType preferred) override;
-};
-
-}  // namespace ROCKSDB_NAMESPACE
diff --git a/util/auto_tune_compressor.cc b/util/auto_tune_compressor.cc
new file mode 100644
index 000000000000..b76b27db32e6
--- /dev/null
+++ b/util/auto_tune_compressor.cc
@@ -0,0 +1,309 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+
+#include "util/auto_tune_compressor.h"
+
+#include "options/options_helper.h"
+#include "rocksdb/advanced_compression.h"
+#include "test_util/sync_point.h"
+#include "util/random.h"
+#include "util/stop_watch.h"
+namespace ROCKSDB_NAMESPACE {
+const std::vector<std::vector<int>> CostAwareCompressor::kCompressionLevels{
+    {0},         // KSnappyCompression
+    {},          // kZlibCompression
+    {},          // kBZip2Compression
+    {1, 4, 9},   // kLZ4Compression
+    {1, 4, 9},   // klZ4HCCompression
+    {},          // kXpressCompression
+    {1, 15, 22}  // kZSTD
+};
+
+int CompressionRejectionProbabilityPredictor::Predict() const {
+  return pred_rejection_prob_percentage_;
+}
+
+size_t CompressionRejectionProbabilityPredictor::attempted_compression_count()
+    const {
+  return rejected_count_ + compressed_count_;
+}
+
+bool CompressionRejectionProbabilityPredictor::Record(
+    Slice uncompressed_block_data, std::string* compressed_output,
+    const CompressionOptions& opts) {
+  if (compressed_output->size() >
+      (static_cast<uint64_t>(opts.max_compressed_bytes_per_kb) *
+       uncompressed_block_data.size()) >>
+      10) {
+    rejected_count_++;
+  } else {
+    compressed_count_++;
+  }
+  auto attempted = attempted_compression_count();
+  if (attempted >= window_size_) {
+    pred_rejection_prob_percentage_ =
+        static_cast<int>(rejected_count_ * 100 / attempted);
+    compressed_count_ = 0;
+    rejected_count_ = 0;
+    assert(attempted_compression_count() == 0);
+  }
+  return true;
+}
+
+AutoSkipCompressorWrapper::AutoSkipCompressorWrapper(
+    std::unique_ptr<Compressor> compressor, const CompressionOptions& opts)
+    : CompressorWrapper::CompressorWrapper(std::move(compressor)),
+      opts_(opts) {}
+
+const char* AutoSkipCompressorWrapper::Name() const {
+  return "AutoSkipCompressorWrapper";
+}
+
+Status AutoSkipCompressorWrapper::CompressBlock(
+    Slice uncompressed_data, std::string* compressed_output,
+    CompressionType* out_compression_type, ManagedWorkingArea* wa) {
+  // Check if the managed working area is provided or owned by this object.
+  // If not, bypass auto-skip logic since the working area lacks a predictor to
+  // record or make necessary decisions to compress or bypass compression of the
+  // block
+  if (wa == nullptr || wa->owner() != this) {
+    return wrapped_->CompressBlock(uncompressed_data, compressed_output,
+                                   out_compression_type, wa);
+  }
+  bool exploration =
+      Random::GetTLSInstance()->PercentTrue(kExplorationPercentage);
+  TEST_SYNC_POINT_CALLBACK(
+      "AutoSkipCompressorWrapper::CompressBlock::exploitOrExplore",
+      &exploration);
+  auto autoskip_wa = static_cast<AutoSkipWorkingArea*>(wa->get());
+  if (exploration) {
+    return CompressBlockAndRecord(uncompressed_data, compressed_output,
+                                  out_compression_type, autoskip_wa);
+  } else {
+    auto predictor_ptr = autoskip_wa->predictor;
+    auto prediction = predictor_ptr->Predict();
+    if (prediction <= kProbabilityCutOff) {
+      // decide to compress
+      return CompressBlockAndRecord(uncompressed_data, compressed_output,
+                                    out_compression_type, autoskip_wa);
+    } else {
+      // decide to bypass compression
+      *out_compression_type = kNoCompression;
+      return Status::OK();
+    }
+  }
+  return Status::OK();
+}
+
+Compressor::ManagedWorkingArea AutoSkipCompressorWrapper::ObtainWorkingArea() {
+  auto wrap_wa = wrapped_->ObtainWorkingArea();
+  return ManagedWorkingArea(new AutoSkipWorkingArea(std::move(wrap_wa)), this);
+}
+void AutoSkipCompressorWrapper::ReleaseWorkingArea(WorkingArea* wa) {
+  delete static_cast<AutoSkipWorkingArea*>(wa);
+}
+
+Status AutoSkipCompressorWrapper::CompressBlockAndRecord(
+    Slice uncompressed_data, std::string* compressed_output,
+    CompressionType* out_compression_type, AutoSkipWorkingArea* wa) {
+  Status status = wrapped_->CompressBlock(uncompressed_data, compressed_output,
+                                          out_compression_type, &(wa->wrapped));
+  // determine if it was rejected or compressed
+  auto predictor_ptr = wa->predictor;
+  predictor_ptr->Record(uncompressed_data, compressed_output, opts_);
+  return status;
+}
+
+const char* AutoSkipCompressorManager::Name() const {
+  // should have returned "AutoSkipCompressorManager" but we currently have an
+  // error so for now returning name of the wrapped container
+  return wrapped_->Name();
+}
+
+std::unique_ptr<Compressor> AutoSkipCompressorManager::GetCompressorForSST(
+    const FilterBuildingContext& context, const CompressionOptions& opts,
+    CompressionType preferred) {
+  assert(GetSupportedCompressions().size() > 1);
+  assert(preferred != kNoCompression);
+  return std::make_unique<AutoSkipCompressorWrapper>(
+      wrapped_->GetCompressorForSST(context, opts, preferred), opts);
+}
+
+CostAwareCompressor::CostAwareCompressor(const CompressionOptions& opts)
+    : opts_(opts) {
+  // Creates compressor supporting all the compression types and levels as per
+  // the compression levels set in vector CompressionLevels
+  auto builtInManager = GetBuiltinV2CompressionManager();
+  const auto& compressions = GetSupportedCompressions();
+  for (size_t i = 0; i < kCompressionLevels.size(); i++) {
+    CompressionType type = static_cast<CompressionType>(i + 1);
+    if (type == kNoCompression) {
+      continue;
+    }
+    if (kCompressionLevels[type - 1].size() == 0) {
+      allcompressors_.emplace_back();
+      continue;
+    } else {
+      // if the compression type is not supported, then skip and remove
+      // compression levels from the supported compression level list
+      if (std::find(compressions.begin(), compressions.end(), type) ==
+          compressions.end()) {
+        allcompressors_.emplace_back();
+        continue;
+      }
+      std::vector<std::unique_ptr<Compressor>> compressors_diff_levels;
+      for (size_t j = 0; j < kCompressionLevels[type - 1].size(); j++) {
+        auto level = kCompressionLevels[type - 1][j];
+        CompressionOptions new_opts = opts;
+        new_opts.level = level;
+        compressors_diff_levels.push_back(
+            builtInManager->GetCompressor(new_opts, type));
+        allcompressors_index_.emplace_back(i, j);
+      }
+      allcompressors_.push_back(std::move(compressors_diff_levels));
+    }
+  }
+}
+
+const char* CostAwareCompressor::Name() const { return "CostAwareCompressor"; }
+size_t CostAwareCompressor::GetMaxSampleSizeIfWantDict(
+    CacheEntryRole block_type) const {
+  auto idx = allcompressors_index_.back();
+  return allcompressors_[idx.first][idx.second]->GetMaxSampleSizeIfWantDict(
+      block_type);
+}
+
+Slice CostAwareCompressor::GetSerializedDict() const {
+  auto idx = allcompressors_index_.back();
+  return allcompressors_[idx.first][idx.second]->GetSerializedDict();
+}
+
+CompressionType CostAwareCompressor::GetPreferredCompressionType() const {
+  return kZSTD;
+}
+std::unique_ptr<Compressor> CostAwareCompressor::MaybeCloneSpecialized(
+    CacheEntryRole block_type, DictSampleArgs&& dict_samples) {
+  // TODO: full dictionary compression support. Currently this just falls
+  // back on a non-multi compressor when asked to use a dictionary.
+  auto idx = allcompressors_index_.back();
+  return allcompressors_[idx.first][idx.second]->MaybeCloneSpecialized(
+      block_type, std::move(dict_samples));
+}
+Status CostAwareCompressor::CompressBlock(Slice uncompressed_data,
+                                          std::string* compressed_output,
+                                          CompressionType* out_compression_type,
+                                          ManagedWorkingArea* wa) {
+  // Check if the managed working area is provided or owned by this object.
+  // If not, bypass compressor logic since the working area lacks a predictor
+  if (allcompressors_.size() == 0) {
+    return Status::NotSupported("No compression type supported");
+  }
+  if (wa == nullptr || wa->owner() != this) {
+    // highest compression level of Zstd
+    size_t choosen_compression_type = 6;
+    size_t compression_level_ptr = 2;
+    return allcompressors_[choosen_compression_type][compression_level_ptr]
+        ->CompressBlock(uncompressed_data, compressed_output,
+                        out_compression_type, wa);
+  }
+  auto local_wa = static_cast<CostAwareWorkingArea*>(wa->get());
+  std::pair<size_t, size_t> choosen_index(6, 2);
+  size_t choosen_compression_type = choosen_index.first;
+  size_t compresion_level_ptr = choosen_index.second;
+  return CompressBlockAndRecord(choosen_compression_type, compresion_level_ptr,
+                                uncompressed_data, compressed_output,
+                                out_compression_type, local_wa);
+}
+
+Compressor::ManagedWorkingArea CostAwareCompressor::ObtainWorkingArea() {
+  auto wrap_wa = allcompressors_.back().back()->ObtainWorkingArea();
+  auto wa = new CostAwareWorkingArea(std::move(wrap_wa));
+  // Create cost predictors for each compression type and level
+  wa->cost_predictors_.reserve(allcompressors_.size());
+  for (size_t i = 0; i < allcompressors_.size(); i++) {
+    CompressionType type = static_cast<CompressionType>(i + 1);
+    if (allcompressors_[type - 1].size() == 0) {
+      wa->cost_predictors_.emplace_back();
+      continue;
+    } else {
+      std::vector<IOCPUCostPredictor*> predictors_diff_levels;
+      predictors_diff_levels.reserve(kCompressionLevels[type - 1].size());
+      for (size_t j = 0; j < kCompressionLevels[type - 1].size(); j++) {
+        predictors_diff_levels.emplace_back(new IOCPUCostPredictor(10));
+      }
+      wa->cost_predictors_.emplace_back(std::move(predictors_diff_levels));
+    }
+  }
+  return ManagedWorkingArea(wa, this);
+}
+void CostAwareCompressor::ReleaseWorkingArea(WorkingArea* wa) {
+  // remove all created cost predictors
+  for (auto& prdictors_diff_levels :
+       static_cast<CostAwareWorkingArea*>(wa)->cost_predictors_) {
+    for (auto& predictor : prdictors_diff_levels) {
+      delete predictor;
+    }
+  }
+  delete static_cast<CostAwareWorkingArea*>(wa);
+}
+
+Status CostAwareCompressor::CompressBlockAndRecord(
+    size_t choosen_compression_type, size_t compression_level_ptr,
+    Slice uncompressed_data, std::string* compressed_output,
+    CompressionType* out_compression_type, CostAwareWorkingArea* wa) {
+  assert(choosen_compression_type < allcompressors_.size());
+  assert(compression_level_ptr <
+         allcompressors_[choosen_compression_type].size());
+  assert(choosen_compression_type < wa->cost_predictors_.size());
+  assert(compression_level_ptr <
+         wa->cost_predictors_[choosen_compression_type].size());
+  StopWatchNano<> timer(Env::Default()->GetSystemClock().get(), true);
+  Status status =
+      allcompressors_[choosen_compression_type][compression_level_ptr]
+          ->CompressBlock(uncompressed_data, compressed_output,
+                          out_compression_type, &(wa->wrapped_));
+  std::pair<size_t, size_t> measured_data(timer.ElapsedMicros(),
+                                          compressed_output->size());
+  auto predictor =
+      wa->cost_predictors_[choosen_compression_type][compression_level_ptr];
+  auto output_length = measured_data.second;
+  auto cpu_time = measured_data.first;
+  predictor->CPUPredictor.Record(cpu_time);
+  predictor->IOPredictor.Record(output_length);
+  TEST_SYNC_POINT_CALLBACK(
+      "CostAwareCompressor::CompressBlockAndRecord::GetPredictor",
+      wa->cost_predictors_[choosen_compression_type][compression_level_ptr]);
+  return status;
+}
+
+std::shared_ptr<CompressionManagerWrapper> CreateAutoSkipCompressionManager(
+    std::shared_ptr<CompressionManager> wrapped) {
+  return std::make_shared<AutoSkipCompressorManager>(
+      wrapped == nullptr ? GetBuiltinV2CompressionManager() : wrapped);
+}
+const char* CostAwareCompressorManager::Name() const {
+  // should have returned "CostAwareCompressorManager" but we currently have an
+  // error so for now returning name of the wrapped container
+  return wrapped_->Name();
+}
+
+std::unique_ptr<Compressor> CostAwareCompressorManager::GetCompressorForSST(
+    const FilterBuildingContext& context, const CompressionOptions& opts,
+    CompressionType preferred) {
+  assert(GetSupportedCompressions().size() > 1);
+  assert(preferred != kNoCompression);
+  (void)context;
+  (void)preferred;
+  return std::make_unique<CostAwareCompressor>(opts);
+}
+
+std::shared_ptr<CompressionManagerWrapper> CreateCostAwareCompressionManager(
+    std::shared_ptr<CompressionManager> wrapped) {
+  return std::make_shared<CostAwareCompressorManager>(
+      wrapped == nullptr ? GetBuiltinV2CompressionManager() : wrapped);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/util/auto_tune_compressor.h b/util/auto_tune_compressor.h
new file mode 100644
index 000000000000..79bd7eed7db4
--- /dev/null
+++ b/util/auto_tune_compressor.h
@@ -0,0 +1,189 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Defines auto skip compressor wrapper which intelligently decides bypassing
+// compression based on past data
+// Defines CostAwareCompressor which currently tries to predict the cpu and io
+// cost of the compression
+
+#pragma once
+#include <memory>
+
+#include "rocksdb/advanced_compression.h"
+
+namespace ROCKSDB_NAMESPACE {
+// Auto Skip Compression Components
+// Predict rejection probability using a moving window approach
+class CompressionRejectionProbabilityPredictor {
+ public:
+  explicit CompressionRejectionProbabilityPredictor(int window_size)
+      : pred_rejection_prob_percentage_(0),
+        rejected_count_(0),
+        compressed_count_(0),
+        window_size_(window_size) {}
+  int Predict() const;
+  bool Record(Slice uncompressed_block_data, std::string* compressed_output,
+              const CompressionOptions& opts);
+  size_t attempted_compression_count() const;
+
+ protected:
+  int pred_rejection_prob_percentage_;
+  size_t rejected_count_;
+  size_t compressed_count_;
+  size_t window_size_;
+};
+
+class AutoSkipWorkingArea : public Compressor::WorkingArea {
+ public:
+  explicit AutoSkipWorkingArea(Compressor::ManagedWorkingArea&& wa)
+      : wrapped(std::move(wa)),
+        predictor(
+            std::make_shared<CompressionRejectionProbabilityPredictor>(10)) {}
+  ~AutoSkipWorkingArea() {}
+  AutoSkipWorkingArea(const AutoSkipWorkingArea&) = delete;
+  AutoSkipWorkingArea& operator=(const AutoSkipWorkingArea&) = delete;
+  AutoSkipWorkingArea(AutoSkipWorkingArea&& other) noexcept
+      : wrapped(std::move(other.wrapped)),
+        predictor(std::move(other.predictor)) {}
+
+  AutoSkipWorkingArea& operator=(AutoSkipWorkingArea&& other) noexcept {
+    if (this != &other) {
+      wrapped = std::move(other.wrapped);
+      predictor = std::move(other.predictor);
+    }
+    return *this;
+  }
+  Compressor::ManagedWorkingArea wrapped;
+  std::shared_ptr<CompressionRejectionProbabilityPredictor> predictor;
+};
+class AutoSkipCompressorWrapper : public CompressorWrapper {
+ public:
+  const char* Name() const override;
+  explicit AutoSkipCompressorWrapper(std::unique_ptr<Compressor> compressor,
+                                     const CompressionOptions& opts);
+
+  Status CompressBlock(Slice uncompressed_data, std::string* compressed_output,
+                       CompressionType* out_compression_type,
+                       ManagedWorkingArea* wa) override;
+  ManagedWorkingArea ObtainWorkingArea() override;
+  void ReleaseWorkingArea(WorkingArea* wa) override;
+
+ private:
+  Status CompressBlockAndRecord(Slice uncompressed_data,
+                                std::string* compressed_output,
+                                CompressionType* out_compression_type,
+                                AutoSkipWorkingArea* wa);
+  static constexpr int kExplorationPercentage = 10;
+  static constexpr int kProbabilityCutOff = 50;
+  const CompressionOptions opts_;
+};
+
+class AutoSkipCompressorManager : public CompressionManagerWrapper {
+  using CompressionManagerWrapper::CompressionManagerWrapper;
+  const char* Name() const override;
+  std::unique_ptr<Compressor> GetCompressorForSST(
+      const FilterBuildingContext& context, const CompressionOptions& opts,
+      CompressionType preferred) override;
+};
+// Cost Aware Components
+template <typename T>
+class WindowAveragePredictor {
+ public:
+  explicit WindowAveragePredictor(int window_size)
+      : sum_(0), prediction_(0), count_(0), kWindowSize(window_size) {}
+  T Predict() { return prediction_; }
+  bool Record(T data) {
+    sum_ += data;
+    count_++;
+    if (count_ >= kWindowSize) {
+      prediction_ = sum_ / count_;
+      sum_ = 0;
+      count_ = 0;
+    }
+    return true;
+  }
+  void SetPrediction(T prediction) { prediction_ = prediction; }
+
+ private:
+  T sum_;
+  T prediction_;
+  int count_;
+  const int kWindowSize;
+};
+
+using IOCostPredictor = WindowAveragePredictor<size_t>;
+using CPUUtilPredictor = WindowAveragePredictor<uint64_t>;
+
+struct IOCPUCostPredictor {
+  explicit IOCPUCostPredictor(int window_size)
+      : IOPredictor(window_size), CPUPredictor(window_size) {}
+  IOCostPredictor IOPredictor;
+  CPUUtilPredictor CPUPredictor;
+};
+class CostAwareWorkingArea : public Compressor::WorkingArea {
+ public:
+  explicit CostAwareWorkingArea(Compressor::ManagedWorkingArea&& wa)
+      : wrapped_(std::move(wa)) {}
+  ~CostAwareWorkingArea() {}
+  CostAwareWorkingArea(const CostAwareWorkingArea&) = delete;
+  CostAwareWorkingArea& operator=(const CostAwareWorkingArea&) = delete;
+  CostAwareWorkingArea(CostAwareWorkingArea&& other) noexcept
+      : wrapped_(std::move(other.wrapped_)) {}
+
+  CostAwareWorkingArea& operator=(CostAwareWorkingArea&& other) noexcept {
+    if (this != &other) {
+      wrapped_ = std::move(other.wrapped_);
+      cost_predictors_ = std::move(other.cost_predictors_);
+    }
+    return *this;
+  }
+  Compressor::ManagedWorkingArea wrapped_;
+  std::vector<std::vector<IOCPUCostPredictor*>> cost_predictors_;
+};
+
+class CostAwareCompressor : public Compressor {
+ public:
+  explicit CostAwareCompressor(const CompressionOptions& opts);
+  const char* Name() const override;
+  size_t GetMaxSampleSizeIfWantDict(CacheEntryRole block_type) const override;
+  Slice GetSerializedDict() const override;
+  CompressionType GetPreferredCompressionType() const override;
+  ManagedWorkingArea ObtainWorkingArea() override;
+  std::unique_ptr<Compressor> MaybeCloneSpecialized(
+      CacheEntryRole block_type, DictSampleArgs&& dict_samples) override;
+
+  Status CompressBlock(Slice uncompressed_data, std::string* compressed_output,
+                       CompressionType* out_compression_type,
+                       ManagedWorkingArea* wa) override;
+  void ReleaseWorkingArea(WorkingArea* wa) override;
+
+ private:
+  Status CompressBlockAndRecord(size_t choosen_compression_type,
+                                size_t compresion_level_ptr,
+                                Slice uncompressed_data,
+                                std::string* compressed_output,
+                                CompressionType* out_compression_type,
+                                CostAwareWorkingArea* wa);
+  static constexpr int kExplorationPercentage = 10;
+  static constexpr int kProbabilityCutOff = 50;
+  // This is the vector containing the list of compression levels that
+  // CostAwareCompressor will use create compressor and predicts the cost
+  // The vector contains list of compression level for compression algorithm in
+  // the order defined by enum CompressionType
+  static const std::vector<std::vector<int>> kCompressionLevels;
+  const CompressionOptions opts_;
+  std::vector<std::vector<std::unique_ptr<Compressor>>> allcompressors_;
+  std::vector<std::pair<size_t, size_t>> allcompressors_index_;
+};
+
+class CostAwareCompressorManager : public CompressionManagerWrapper {
+  using CompressionManagerWrapper::CompressionManagerWrapper;
+  const char* Name() const override;
+  std::unique_ptr<Compressor> GetCompressorForSST(
+      const FilterBuildingContext& context, const CompressionOptions& opts,
+      CompressionType preferred) override;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/util/compression_test.cc b/util/compression_test.cc
index 5840d180a9a3..afe4a508f09f 100644
--- a/util/compression_test.cc
+++ b/util/compression_test.cc
@@ -17,6 +17,7 @@
 #include "rocksdb/flush_block_policy.h"
 #include "table/block_based/block_builder.h"
 #include "test_util/testutil.h"
+#include "util/auto_tune_compressor.h"
 #include "util/random.h"
 
 namespace ROCKSDB_NAMESPACE {
@@ -157,8 +158,7 @@ class DBAutoSkip : public DBTestBase {
         options(CurrentOptions()),
         rnd_(231),
         key_index_(0) {
-    options.compression_manager =
-        CreateAutoSkipCompressionManager(GetBuiltinV2CompressionManager());
+    options.compression_manager = CreateAutoSkipCompressionManager();
     auto statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
     options.statistics = statistics;
     options.statistics->set_stats_level(StatsLevel::kExceptTimeForMutex);
@@ -220,6 +220,122 @@ TEST_F(DBAutoSkip, AutoSkipCompressionManager) {
     ASSERT_OK(Flush());
   }
 }
+class CostAwareTestFlushBlockPolicy : public FlushBlockPolicy {
+ public:
+  explicit CostAwareTestFlushBlockPolicy(const int window,
+                                         const BlockBuilder& data_block_builder)
+      : window_(window),
+        num_keys_(0),
+        data_block_builder_(data_block_builder) {}
+
+  bool Update(const Slice& /*key*/, const Slice& /*value*/) override {
+    auto nth_window = num_keys_ / window_;
+    if (data_block_builder_.empty()) {
+      // First key in this block
+      return false;
+    }
+    // Check every window
+    if (num_keys_ % window_ == 0) {
+      auto get_predictor = [&](void* arg) {
+        // gets the predictor and sets the mocked cpu and io cost
+        predictor_ = static_cast<IOCPUCostPredictor*>(arg);
+        predictor_->CPUPredictor.SetPrediction(1000);
+        predictor_->IOPredictor.SetPrediction(100);
+      };
+      SyncPoint::GetInstance()->DisableProcessing();
+      SyncPoint::GetInstance()->ClearAllCallBacks();
+
+      // Add syncpoint to get the cpu and io cost
+      SyncPoint::GetInstance()->SetCallBack(
+          "CostAwareCompressor::CompressBlockAndRecord::"
+          "GetPredictor",
+          get_predictor);
+      SyncPoint::GetInstance()->EnableProcessing();
+      // use nth window to detect test cases and set the expected
+      switch (nth_window) {
+        case 0:
+          break;
+        case 1:
+          // Verify that the Mocked cpu cost and io cost are predicted correctly
+          auto predicted_cpu_time = predictor_->CPUPredictor.Predict();
+          auto predicted_io_bytes = predictor_->IOPredictor.Predict();
+          EXPECT_EQ(predicted_io_bytes, 100);
+          EXPECT_EQ(predicted_cpu_time, 1000);
+          break;
+      }
+    }
+    num_keys_++;
+    return true;
+  }
+
+ private:
+  int window_;
+  int num_keys_;
+  const BlockBuilder& data_block_builder_;
+  IOCPUCostPredictor* predictor_;
+};
+class CostAwareTestFlushBlockPolicyFactory : public FlushBlockPolicyFactory {
+ public:
+  explicit CostAwareTestFlushBlockPolicyFactory(const int window)
+      : window_(window) {}
+
+  virtual const char* Name() const override {
+    return "CostAwareTestFlushBlockPolicyFactory";
+  }
+
+  virtual FlushBlockPolicy* NewFlushBlockPolicy(
+      const BlockBasedTableOptions& /*table_options*/,
+      const BlockBuilder& data_block_builder) const override {
+    (void)data_block_builder;
+    return new CostAwareTestFlushBlockPolicy(window_, data_block_builder);
+  }
+
+ private:
+  int window_;
+};
+class DBCompresssionCostPredictor : public DBTestBase {
+ public:
+  Options options;
+  DBCompresssionCostPredictor()
+      : DBTestBase("db_cpuio_skip", /*env_do_fsync=*/true),
+        options(CurrentOptions()) {
+    options.compression_manager = CreateCostAwareCompressionManager();
+    auto statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+    options.statistics = statistics;
+    options.statistics->set_stats_level(StatsLevel::kExceptTimeForMutex);
+    BlockBasedTableOptions bbto;
+    bbto.enable_index_compression = false;
+    bbto.flush_block_policy_factory.reset(
+        new CostAwareTestFlushBlockPolicyFactory(10));
+    options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+    DestroyAndReopen(options);
+  }
+};
+TEST_F(DBCompresssionCostPredictor, CostAwareCompressorManager) {
+  // making sure that the compression is supported
+  if (!ZSTD_Supported()) {
+    return;
+  }
+  const int kValueSize = 20000;
+  int next_key = 0;
+  Random rnd(231);
+  auto value = rnd.RandomBinaryString(kValueSize);
+  int window_size = 10;
+  auto WindowWrite = [&]() {
+    for (auto i = 0; i < window_size; ++i) {
+      auto status = Put(Key(next_key), value);
+      EXPECT_OK(status);
+      next_key++;
+    }
+  };
+  // This denotes the first window
+  // Mocked to have specific cpu utilization and io cost
+  WindowWrite();
+  // check the predictor is predicting the correct cpu and io cost
+  WindowWrite();
+  ASSERT_OK(Flush());
+}
+
 }  // namespace ROCKSDB_NAMESPACE
 int main(int argc, char** argv) {
   ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
diff --git a/util/stop_watch.h b/util/stop_watch.h
index 28781304577d..36ae9bea802b 100644
--- a/util/stop_watch.h
+++ b/util/stop_watch.h
@@ -102,6 +102,7 @@ class StopWatch {
 };
 
 // a nano second precision stopwatch
+template <bool use_cpu_time = false>
 class StopWatchNano {
  public:
   explicit StopWatchNano(SystemClock* clock, bool auto_start = false)
@@ -110,27 +111,36 @@ class StopWatchNano {
       Start();
     }
   }
-
-  void Start() { start_ = clock_->NowNanos(); }
-
+  void Start() {
+    if constexpr (use_cpu_time) {
+      start_ = clock_->CPUNanos();
+    } else {
+      start_ = clock_->NowNanos();
+    }
+  }
   uint64_t ElapsedNanos(bool reset = false) {
-    auto now = clock_->NowNanos();
+    uint64_t now = 0;
+    if constexpr (use_cpu_time) {
+      now = clock_->CPUNanos();
+    } else {
+      now = clock_->NowNanos();
+    }
     auto elapsed = now - start_;
     if (reset) {
       start_ = now;
     }
     return elapsed;
   }
-
   uint64_t ElapsedNanosSafe(bool reset = false) {
     return (clock_ != nullptr) ? ElapsedNanos(reset) : 0U;
   }
-
   bool IsStarted() { return start_ != 0; }
+  uint64_t ElapsedMicros(bool reset = false) {
+    return ElapsedNanos(reset) / 1000;
+  }
 
  private:
   SystemClock* clock_;
   uint64_t start_;
 };
-
 }  // namespace ROCKSDB_NAMESPACE

From 8b84390517254c4d1b5ef5a6a682b0519cd1e2b1 Mon Sep 17 00:00:00 2001
From: anand76 <anand1976@users.noreply.github.com>
Date: Thu, 26 Jun 2025 12:19:16 -0700
Subject: [PATCH 155/500] Add upper bound support for forward scans in
 MultiScan (#13723)

Summary:
Respect the scan upper bound/limit, if specified, in `MultiScan`. This applies to block based table and other native RocksDB SSTs. In order to properly support it, the `MultiScan` object caches the `ReadOptions` passed by the user and sets the `iterate_upper_bound` as appropriate. We optimize for the case of either all scans specifying the upper bound, or none of them. In case of mixed scans, we reallocate the DB iterator anytime `ReadOptions` has to be updated.

Tests:
New unit tests in `db_iterator_test`

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13723

Reviewed By: cbi42

Differential Revision: D77385049

Pulled By: anand1976

fbshipit-source-id: 9c02d125770cbedbe6e8c10767ba537e7f7540e1
---
 BUCK                                          |   1 +
 CMakeLists.txt                                |   1 +
 db/db_impl/db_impl.cc                         |   6 +-
 db/db_iterator_test.cc                        | 186 ++++++++++++++++++
 db/multi_scan.cc                              |  70 +++++++
 include/rocksdb/db.h                          |  26 ++-
 include/rocksdb/multi_scan.h                  |  57 +++---
 src.mk                                        |   1 +
 table/table_test.cc                           |   1 +
 .../bug_fixes/multi_scan_upper_bound.md       |   1 +
 10 files changed, 317 insertions(+), 33 deletions(-)
 create mode 100644 db/multi_scan.cc
 create mode 100644 unreleased_history/bug_fixes/multi_scan_upper_bound.md

diff --git a/BUCK b/BUCK
index 1a106f9a5262..ce8ca8a1b8fc 100644
--- a/BUCK
+++ b/BUCK
@@ -88,6 +88,7 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[
         "db/memtable_list.cc",
         "db/merge_helper.cc",
         "db/merge_operator.cc",
+        "db/multi_scan.cc",
         "db/output_validator.cc",
         "db/periodic_task_scheduler.cc",
         "db/range_del_aggregator.cc",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index b1e7a9215775..08abd4daf4ae 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -721,6 +721,7 @@ set(SOURCES
         db/memtable_list.cc
         db/merge_helper.cc
         db/merge_operator.cc
+        db/multi_scan.cc
         db/output_validator.cc
         db/periodic_task_scheduler.cc
         db/range_del_aggregator.cc
diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc
index deb3b9dee700..8b0164e3328e 100644
--- a/db/db_impl/db_impl.cc
+++ b/db/db_impl/db_impl.cc
@@ -3823,10 +3823,8 @@ bool DBImpl::KeyMayExist(const ReadOptions& read_options,
 std::unique_ptr<MultiScan> DBImpl::NewMultiScan(
     const ReadOptions& _read_options, ColumnFamilyHandle* column_family,
     const std::vector<ScanOptions>& scan_opts) {
-  std::unique_ptr<Iterator> iter(NewIterator(_read_options, column_family));
-  iter->Prepare(scan_opts);
-  std::unique_ptr<MultiScan> ms_iter =
-      std::make_unique<MultiScan>(scan_opts, std::move(iter));
+  std::unique_ptr<MultiScan> ms_iter = std::make_unique<MultiScan>(
+      _read_options, scan_opts, this, column_family);
   return ms_iter;
 }
 
diff --git a/db/db_iterator_test.cc b/db/db_iterator_test.cc
index a4477804e0c8..d2b83e4ed89d 100644
--- a/db/db_iterator_test.cc
+++ b/db/db_iterator_test.cc
@@ -8,6 +8,8 @@
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
 #include <functional>
+#include <iomanip>
+#include <iostream>
 
 #include "db/arena_wrapped_db_iter.h"
 #include "db/db_iter.h"
@@ -4139,6 +4141,190 @@ TEST_P(DBIteratorTest, AverageMemtableOpsScanFlushTriggerByOverwrites) {
   ASSERT_OK(db_->WaitForCompact({}));
   ASSERT_EQ(1, NumTableFilesAtLevel(0));
 }
+
+class DBMultiScanIteratorTest : public DBTestBase {
+ public:
+  DBMultiScanIteratorTest()
+      : DBTestBase("db_multi_scan_iterator_test", /*env_do_fsync=*/true) {}
+};
+
+TEST_F(DBMultiScanIteratorTest, BasicTest) {
+  // Create a file
+  for (int i = 0; i < 100; ++i) {
+    std::stringstream ss;
+    ss << std::setw(2) << std::setfill('0') << i;
+    ASSERT_OK(Put("k" + ss.str(), "val" + ss.str()));
+  }
+  ASSERT_OK(Flush());
+
+  std::vector<std::string> key_ranges({"k03", "k10", "k25", "k50"});
+  ReadOptions ro;
+  std::vector<ScanOptions> scan_options(
+      {ScanOptions(key_ranges[0], key_ranges[1]),
+       ScanOptions(key_ranges[2], key_ranges[3])});
+  ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily();
+  std::unique_ptr<MultiScan> iter =
+      dbfull()->NewMultiScan(ro, cfh, scan_options);
+  try {
+    int idx = 0;
+    int count = 0;
+    for (auto range : *iter) {
+      for (auto it : range) {
+        ASSERT_GE(it.first.ToString().compare(key_ranges[idx]), 0);
+        ASSERT_LT(it.first.ToString().compare(key_ranges[idx + 1]), 0);
+        count++;
+      }
+      idx += 2;
+    }
+    ASSERT_EQ(count, 32);
+  } catch (MultiScanException& ex) {
+    // Make sure exception contains the status
+    ASSERT_NOK(ex.status());
+    std::cerr << "Iterator returned status " << ex.what();
+    abort();
+  } catch (std::logic_error& ex) {
+    std::cerr << "Iterator returned logic error " << ex.what();
+    abort();
+  }
+  iter.reset();
+
+  // Test the overlapping scan case
+  key_ranges[1] = "k30";
+  scan_options[0] = ScanOptions(key_ranges[0], key_ranges[1]);
+  iter = dbfull()->NewMultiScan(ro, cfh, scan_options);
+  try {
+    int idx = 0;
+    int count = 0;
+    for (auto range : *iter) {
+      for (auto it : range) {
+        ASSERT_GE(it.first.ToString().compare(key_ranges[idx]), 0);
+        ASSERT_LT(it.first.ToString().compare(key_ranges[idx + 1]), 0);
+        count++;
+      }
+      idx += 2;
+    }
+    ASSERT_EQ(count, 52);
+  } catch (MultiScanException& ex) {
+    // Make sure exception contains the status
+    ASSERT_NOK(ex.status());
+    std::cerr << "Iterator returned status " << ex.what();
+    abort();
+  } catch (std::logic_error& ex) {
+    std::cerr << "Iterator returned logic error " << ex.what();
+    abort();
+  }
+  iter.reset();
+
+  // Test the no limit scan case
+  scan_options[0] = ScanOptions(key_ranges[0]);
+  scan_options[1] = ScanOptions(key_ranges[2]);
+  iter = dbfull()->NewMultiScan(ro, cfh, scan_options);
+  try {
+    int idx = 0;
+    int count = 0;
+    for (auto range : *iter) {
+      for (auto it : range) {
+        ASSERT_GE(it.first.ToString().compare(key_ranges[idx]), 0);
+        if (it.first.ToString().compare(key_ranges[idx + 1]) == 0) {
+          break;
+        }
+        count++;
+      }
+      idx += 2;
+    }
+    ASSERT_EQ(count, 52);
+  } catch (MultiScanException& ex) {
+    // Make sure exception contains the status
+    ASSERT_NOK(ex.status());
+    std::cerr << "Iterator returned status " << ex.what();
+    abort();
+  } catch (std::logic_error& ex) {
+    std::cerr << "Iterator returned logic error " << ex.what();
+    abort();
+  }
+  iter.reset();
+}
+
+TEST_F(DBMultiScanIteratorTest, MixedBoundsTest) {
+  // Create a file
+  for (int i = 0; i < 100; ++i) {
+    std::stringstream ss;
+    ss << std::setw(2) << std::setfill('0') << i;
+    ASSERT_OK(Put("k" + ss.str(), "val" + ss.str()));
+  }
+  ASSERT_OK(Flush());
+
+  std::vector<std::string> key_ranges(
+      {"k03", "k10", "k25", "k50", "k75", "k90"});
+  ReadOptions ro;
+  std::vector<ScanOptions> scan_options(
+      {ScanOptions(key_ranges[0], key_ranges[1]), ScanOptions(key_ranges[2]),
+       ScanOptions(key_ranges[4], key_ranges[5])});
+  ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily();
+  std::unique_ptr<MultiScan> iter =
+      dbfull()->NewMultiScan(ro, cfh, scan_options);
+  try {
+    int idx = 0;
+    int count = 0;
+    for (auto range : *iter) {
+      for (auto it : range) {
+        ASSERT_GE(it.first.ToString().compare(
+                      scan_options[idx].range.start->ToString()),
+                  0);
+        if (scan_options[idx].range.limit) {
+          ASSERT_LT(it.first.ToString().compare(
+                        scan_options[idx].range.limit->ToString()),
+                    0);
+        }
+        count++;
+      }
+      idx++;
+    }
+    ASSERT_EQ(count, 97);
+  } catch (MultiScanException& ex) {
+    // Make sure exception contains the status
+    ASSERT_NOK(ex.status());
+    std::cerr << "Iterator returned status " << ex.what();
+    abort();
+  } catch (std::logic_error& ex) {
+    std::cerr << "Iterator returned logic error " << ex.what();
+    abort();
+  }
+  iter.reset();
+
+  scan_options[0] = ScanOptions(key_ranges[0]);
+  scan_options[1] = ScanOptions(key_ranges[2], key_ranges[3]);
+  scan_options[2] = ScanOptions(key_ranges[4]);
+  iter = dbfull()->NewMultiScan(ro, cfh, scan_options);
+  try {
+    int idx = 0;
+    int count = 0;
+    for (auto range : *iter) {
+      for (auto it : range) {
+        ASSERT_GE(it.first.ToString().compare(
+                      scan_options[idx].range.start->ToString()),
+                  0);
+        if (scan_options[idx].range.limit) {
+          ASSERT_LT(it.first.ToString().compare(
+                        scan_options[idx].range.limit->ToString()),
+                    0);
+        }
+        count++;
+      }
+      idx++;
+    }
+    ASSERT_EQ(count, 147);
+  } catch (MultiScanException& ex) {
+    // Make sure exception contains the status
+    ASSERT_NOK(ex.status());
+    std::cerr << "Iterator returned status " << ex.what();
+    abort();
+  } catch (std::logic_error& ex) {
+    std::cerr << "Iterator returned logic error " << ex.what();
+    abort();
+  }
+  iter.reset();
+}
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/db/multi_scan.cc b/db/multi_scan.cc
new file mode 100644
index 000000000000..663793240139
--- /dev/null
+++ b/db/multi_scan.cc
@@ -0,0 +1,70 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/db.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+using MultiScanIterator = MultiScan::MultiScanIterator;
+
+MultiScan::MultiScan(const ReadOptions& read_options,
+                     const std::vector<ScanOptions>& scan_opts, DB* db,
+                     ColumnFamilyHandle* cfh)
+    : read_options_(read_options), scan_opts_(scan_opts), db_(db), cfh_(cfh) {
+  bool slow_path = false;
+  // Setup read_options with iterate_uuper_bound based on the first scan.
+  // Subsequent scans will update and allocate a new DB iterator as necessary
+  if (scan_opts[0].range.limit) {
+    upper_bound_ = *scan_opts[0].range.limit;
+    read_options_.iterate_upper_bound = &upper_bound_;
+  } else {
+    read_options_.iterate_upper_bound = nullptr;
+  }
+  for (auto opts : scan_opts) {
+    // Check that all the ScanOptions either specify an upper bound or not. If
+    // its mixed we take the slow path which avoids calling Prepare: we have to
+    // reallocate the Iterator with updated read_options everytime we switch
+    // between upper bound or no upper bound, which complicates Prepare.
+    if (opts.range.limit.has_value() != scan_opts[0].range.limit.has_value()) {
+      slow_path = true;
+      break;
+    }
+  }
+  db_iter_.reset(db->NewIterator(read_options_, cfh));
+  if (!slow_path) {
+    db_iter_->Prepare(scan_opts);
+  }
+}
+
+MultiScanIterator& MultiScanIterator::operator++() {
+  if (idx_ >= scan_opts_.size()) {
+    throw std::logic_error("Index out of range");
+  }
+  idx_++;
+  if (idx_ < scan_opts_.size()) {
+    // Check if we need to update read_options_
+    if (scan_opts_[idx_].range.limit.has_value() !=
+        (read_options_.iterate_upper_bound != nullptr)) {
+      if (scan_opts_[idx_].range.limit) {
+        *upper_bound_ = *scan_opts_[idx_].range.limit;
+        read_options_.iterate_upper_bound = upper_bound_;
+      } else {
+        read_options_.iterate_upper_bound = nullptr;
+      }
+      db_iter_.reset(db_->NewIterator(read_options_, cfh_));
+      scan_.Reset(db_iter_.get());
+    } else if (scan_opts_[idx_].range.limit) {
+      *upper_bound_ = *scan_opts_[idx_].range.limit;
+    }
+    db_iter_->Seek(*scan_opts_[idx_].range.start);
+    status_ = db_iter_->status();
+    if (!status_.ok()) {
+      throw MultiScanException(status_);
+    }
+  }
+  return *this;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h
index ae78b6a3ce1a..5313963e9693 100644
--- a/include/rocksdb/db.h
+++ b/include/rocksdb/db.h
@@ -57,6 +57,7 @@ struct WaitForCompactOptions;
 class Env;
 class EventListener;
 class FileSystem;
+class MultiScan;
 class Replayer;
 class StatsHistoryIterator;
 class TraceReader;
@@ -1092,7 +1093,30 @@ class DB {
 
   // Get an iterator that scans multiple key ranges. The scan ranges should
   // be in increasing order of start key. See multi_scan_iterator.h for more
-  // details.
+  // details. For optimal performance, ensure that either all entries in
+  // scan_opts specify the range limit, or none of them do.
+  //
+  // NOTE: iterate_upper_bound in ReadOptions will be ignored. Instead, the
+  // range.limit in ScanOptions is consulted to determine the upper bound key,
+  // if specified.
+  //
+  // Example usage -
+  //  std::vector<ScanOptions> scans{{.start = Slice("bar")},
+  //                              {.start = Slice("foo")}};
+  //  std::unique_ptr<MultiScan> iter.reset(
+  //                                      db->NewMultiScan());
+  //  try {
+  //    for (auto scan : *iter) {
+  //      for (auto it : scan) {
+  //        // Do something with key - it.first
+  //        // Do something with value - it.second
+  //      }
+  //    }
+  //  } catch (MultiScanException& ex) {
+  //    // Check ex.status()
+  //  } catch (std::logic_error& ex) {
+  //    // Check ex.what()
+  //  }
   virtual std::unique_ptr<MultiScan> NewMultiScan(
       const ReadOptions& /*options*/, ColumnFamilyHandle* /*column_family*/,
       const std::vector<ScanOptions>& /*scan_opts*/) {
diff --git a/include/rocksdb/multi_scan.h b/include/rocksdb/multi_scan.h
index dc173e48e6df..c76cb9c7c407 100644
--- a/include/rocksdb/multi_scan.h
+++ b/include/rocksdb/multi_scan.h
@@ -5,6 +5,7 @@
 
 #pragma once
 
+#include "rocksdb/db.h"
 #include "rocksdb/iterator.h"
 #include "rocksdb/options.h"
 
@@ -72,6 +73,8 @@ class Scan {
 
   explicit Scan(Iterator* db_iter) : db_iter_(db_iter) {}
 
+  void Reset(Iterator* db_iter) { db_iter_ = db_iter; }
+
   ScanIterator begin() { return ScanIterator(db_iter_); }
 
   std::nullptr_t end() { return nullptr; }
@@ -149,9 +152,9 @@ class Scan {
 // A Status exception is thrown if there is an error.
 class MultiScan {
  public:
-  MultiScan(const std::vector<ScanOptions>& scan_opts,
-            std::unique_ptr<Iterator>&& db_iter)
-      : scan_opts_(scan_opts), db_iter_(std::move(db_iter)) {}
+  MultiScan(const ReadOptions& read_options,
+            const std::vector<ScanOptions>& scan_opts, DB* db,
+            ColumnFamilyHandle* cfh);
 
   explicit MultiScan(std::unique_ptr<Iterator>&& db_iter)
       : db_iter_(std::move(db_iter)) {}
@@ -168,9 +171,17 @@ class MultiScan {
     using difference_type = int;
     using iterator_category = std::input_iterator_tag;
 
-    MultiScanIterator(const std::vector<ScanOptions>& scan_opts,
-                      Iterator* db_iter)
-        : scan_opts_(scan_opts), idx_(0), db_iter_(db_iter), scan_(db_iter_) {
+    MultiScanIterator(const std::vector<ScanOptions>& scan_opts, DB* db,
+                      ColumnFamilyHandle* cfh, ReadOptions& read_options,
+                      Slice* upper_bound, std::unique_ptr<Iterator>& db_iter)
+        : scan_opts_(scan_opts),
+          db_(db),
+          cfh_(cfh),
+          read_options_(read_options),
+          upper_bound_(upper_bound),
+          idx_(0),
+          db_iter_(db_iter),
+          scan_(db_iter_.get()) {
       if (scan_opts_.empty()) {
         throw std::logic_error("Zero scans in multi-scan");
       }
@@ -181,28 +192,9 @@ class MultiScan {
       }
     }
 
-    explicit MultiScanIterator(const std::vector<ScanOptions>& scan_opts)
-        : scan_opts_(scan_opts),
-          idx_(scan_opts_.size()),
-          db_iter_(nullptr),
-          scan_(nullptr) {}
-
     ~MultiScanIterator() { assert(status_.ok()); }
 
-    MultiScanIterator& operator++() {
-      if (idx_ >= scan_opts_.size()) {
-        throw std::logic_error("Index out of range");
-      }
-      idx_++;
-      if (idx_ < scan_opts_.size()) {
-        db_iter_->Seek(*scan_opts_[idx_].range.start);
-        status_ = db_iter_->status();
-        if (!status_.ok()) {
-          throw MultiScanException(status_);
-        }
-      }
-      return *this;
-    }
+    MultiScanIterator& operator++();
 
     bool operator==(std::nullptr_t /*other*/) const {
       return idx_ >= scan_opts_.size();
@@ -217,20 +209,29 @@ class MultiScan {
 
    private:
     const std::vector<ScanOptions>& scan_opts_;
+    DB* db_;
+    ColumnFamilyHandle* cfh_;
+    ReadOptions& read_options_;
+    Slice* upper_bound_;
     size_t idx_;
-    Iterator* db_iter_;
+    std::unique_ptr<Iterator>& db_iter_;
     Status status_;
     Scan scan_;
   };
 
   MultiScanIterator begin() {
-    return MultiScanIterator(scan_opts_, db_iter_.get());
+    return MultiScanIterator(scan_opts_, db_, cfh_, read_options_,
+                             &upper_bound_, db_iter_);
   }
 
   std::nullptr_t end() { return nullptr; }
 
  private:
+  ReadOptions read_options_;
   const std::vector<ScanOptions> scan_opts_;
+  DB* db_;
+  ColumnFamilyHandle* cfh_;
+  Slice upper_bound_;
   std::unique_ptr<Iterator> db_iter_;
 };
 
diff --git a/src.mk b/src.mk
index fff9c6e55086..8d341f03c58b 100644
--- a/src.mk
+++ b/src.mk
@@ -80,6 +80,7 @@ LIB_SOURCES =                                                   \
   db/memtable_list.cc                                           \
   db/merge_helper.cc                                            \
   db/merge_operator.cc                                          \
+  db/multi_scan.cc						\
   db/output_validator.cc                                        \
   db/periodic_task_scheduler.cc                                 \
   db/range_del_aggregator.cc                                    \
diff --git a/table/table_test.cc b/table/table_test.cc
index 23371787a6e2..faa339e824fa 100644
--- a/table/table_test.cc
+++ b/table/table_test.cc
@@ -7396,6 +7396,7 @@ TEST_F(ExternalTableTest, IngestionTest) {
   ASSERT_OK(db->DestroyColumnFamilyHandle(cfh));
   ASSERT_OK(db->Close());
 }
+
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/unreleased_history/bug_fixes/multi_scan_upper_bound.md b/unreleased_history/bug_fixes/multi_scan_upper_bound.md
new file mode 100644
index 000000000000..973bc84401ea
--- /dev/null
+++ b/unreleased_history/bug_fixes/multi_scan_upper_bound.md
@@ -0,0 +1 @@
+Fix DB::NewMultiScan iterator to respect the scan upper bound specified in ScanOptions

From 4e425887e7b7ac337faac1585f36f23b254dbe1c Mon Sep 17 00:00:00 2001
From: Sujit Maharjan <sujitmaharjan@meta.com>
Date: Thu, 26 Jun 2025 13:32:28 -0700
Subject: [PATCH 156/500] Removing typo sss in spelling of the compression
 (#13735)

Summary:
Corrected misspelling of "Compression". Changed "Compresssion" to "Compression".

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13735

Test Plan:
All the test case for compression is still working properly.
```bash
./compression_test
```

Reviewed By: hx235

Differential Revision: D77390273

Pulled By: shubhajeet

fbshipit-source-id: f5310e393e23f5d6c8310154cb929db4b6c60a77
---
 util/compression_test.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/util/compression_test.cc b/util/compression_test.cc
index afe4a508f09f..d2590fabfe66 100644
--- a/util/compression_test.cc
+++ b/util/compression_test.cc
@@ -293,10 +293,10 @@ class CostAwareTestFlushBlockPolicyFactory : public FlushBlockPolicyFactory {
  private:
   int window_;
 };
-class DBCompresssionCostPredictor : public DBTestBase {
+class DBCompressionCostPredictor : public DBTestBase {
  public:
   Options options;
-  DBCompresssionCostPredictor()
+  DBCompressionCostPredictor()
       : DBTestBase("db_cpuio_skip", /*env_do_fsync=*/true),
         options(CurrentOptions()) {
     options.compression_manager = CreateCostAwareCompressionManager();
@@ -311,7 +311,7 @@ class DBCompresssionCostPredictor : public DBTestBase {
     DestroyAndReopen(options);
   }
 };
-TEST_F(DBCompresssionCostPredictor, CostAwareCompressorManager) {
+TEST_F(DBCompressionCostPredictor, CostAwareCompressorManager) {
   // making sure that the compression is supported
   if (!ZSTD_Supported()) {
     return;

From ca2413545c74b75f993a4900c3acc5f1e881eb7f Mon Sep 17 00:00:00 2001
From: Xingbo Wang <xbw@fb.com>
Date: Thu, 26 Jun 2025 13:55:33 -0700
Subject: [PATCH 157/500] Remove the 2 duplicated fields in block.h Block class
 (#13733)

Summary:
`data_` and `size_` fields are duplicated in `Block` class, as `contents_` field already have a `data` member variable, which contains `data` and `size` already. This reduces memory consumption by 16 bytes per block.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13733

Test Plan: Unit test

Reviewed By: pdillinger

Differential Revision: D77389791

Pulled By: xingbowang

fbshipit-source-id: 50a56bc5fae494ed5bc39bdfde7303ca06ce87c6
---
 table/block_based/block.cc | 65 ++++++++++++++++++--------------------
 table/block_based/block.h  |  6 ++--
 2 files changed, 33 insertions(+), 38 deletions(-)

diff --git a/table/block_based/block.cc b/table/block_based/block.cc
index ea4d559a2a40..d2a5d8d70a17 100644
--- a/table/block_based/block.cc
+++ b/table/block_based/block.cc
@@ -1016,10 +1016,10 @@ bool IndexBlockIter::PrefixSeek(const Slice& target, uint32_t* index,
 }
 
 uint32_t Block::NumRestarts() const {
-  assert(size_ >= 2 * sizeof(uint32_t));
-  uint32_t block_footer = DecodeFixed32(data_ + size_ - sizeof(uint32_t));
+  assert(size() >= 2 * sizeof(uint32_t));
+  uint32_t block_footer = DecodeFixed32(data() + size() - sizeof(uint32_t));
   uint32_t num_restarts = block_footer;
-  if (size_ > kMaxBlockSizeSupportedByHashIndex) {
+  if (size() > kMaxBlockSizeSupportedByHashIndex) {
     // In BlockBuilder, we have ensured a block with HashIndex is less than
     // kMaxBlockSizeSupportedByHashIndex (64KiB).
     //
@@ -1038,12 +1038,12 @@ uint32_t Block::NumRestarts() const {
 }
 
 BlockBasedTableOptions::DataBlockIndexType Block::IndexType() const {
-  assert(size_ >= 2 * sizeof(uint32_t));
-  if (size_ > kMaxBlockSizeSupportedByHashIndex) {
+  assert(size() >= 2 * sizeof(uint32_t));
+  if (size() > kMaxBlockSizeSupportedByHashIndex) {
     // The check is for the same reason as that in NumRestarts()
     return BlockBasedTableOptions::kDataBlockBinarySearch;
   }
-  uint32_t block_footer = DecodeFixed32(data_ + size_ - sizeof(uint32_t));
+  uint32_t block_footer = DecodeFixed32(data() + size() - sizeof(uint32_t));
   uint32_t num_restarts = block_footer;
   BlockBasedTableOptions::DataBlockIndexType index_type;
   UnPackIndexTypeAndNumRestarts(block_footer, &index_type, &num_restarts);
@@ -1059,54 +1059,51 @@ Block::~Block() {
 
 Block::Block(BlockContents&& contents, size_t read_amp_bytes_per_bit,
              Statistics* statistics)
-    : contents_(std::move(contents)),
-      data_(contents_.data.data()),
-      size_(contents_.data.size()),
-      restart_offset_(0),
-      num_restarts_(0) {
+    : contents_(std::move(contents)), restart_offset_(0), num_restarts_(0) {
   TEST_SYNC_POINT("Block::Block:0");
-  if (size_ < sizeof(uint32_t)) {
-    size_ = 0;  // Error marker
+  auto& size = contents_.data.size_;
+  if (size < sizeof(uint32_t)) {
+    size = 0;  // Error marker
   } else {
     // Should only decode restart points for uncompressed blocks
     num_restarts_ = NumRestarts();
     switch (IndexType()) {
       case BlockBasedTableOptions::kDataBlockBinarySearch:
-        restart_offset_ = static_cast<uint32_t>(size_) -
+        restart_offset_ = static_cast<uint32_t>(size) -
                           (1 + num_restarts_) * sizeof(uint32_t);
-        if (restart_offset_ > size_ - sizeof(uint32_t)) {
+        if (restart_offset_ > size - sizeof(uint32_t)) {
           // The size is too small for NumRestarts() and therefore
           // restart_offset_ wrapped around.
-          size_ = 0;
+          size = 0;
         }
         break;
       case BlockBasedTableOptions::kDataBlockBinaryAndHash:
-        if (size_ < sizeof(uint32_t) /* block footer */ +
-                        sizeof(uint16_t) /* NUM_BUCK */) {
-          size_ = 0;
+        if (size < sizeof(uint32_t) /* block footer */ +
+                       sizeof(uint16_t) /* NUM_BUCK */) {
+          size = 0;
           break;
         }
 
         uint16_t map_offset;
         data_block_hash_index_.Initialize(
-            data_, static_cast<uint16_t>(size_ - sizeof(uint32_t)), /*chop off
-                                                                NUM_RESTARTS*/
-            &map_offset);
+            contents_.data.data(),
+            /* chop off NUM_RESTARTS */
+            static_cast<uint16_t>(size - sizeof(uint32_t)), &map_offset);
 
         restart_offset_ = map_offset - num_restarts_ * sizeof(uint32_t);
 
         if (restart_offset_ > map_offset) {
           // map_offset is too small for NumRestarts() and
           // therefore restart_offset_ wrapped around.
-          size_ = 0;
+          size = 0;
           break;
         }
         break;
       default:
-        size_ = 0;  // Error marker
+        size = 0;  // Error marker
     }
   }
-  if (read_amp_bytes_per_bit != 0 && statistics && size_ != 0) {
+  if (read_amp_bytes_per_bit != 0 && statistics && size != 0) {
     read_amp_bitmap_.reset(new BlockReadAmpBitmap(
         restart_offset_, read_amp_bytes_per_bit, statistics));
   }
@@ -1148,7 +1145,7 @@ void Block::InitializeDataBlockProtectionInfo(uint8_t protection_bytes_per_key,
       assert(!iter->status().ok() || i == num_keys * protection_bytes_per_key);
     }
     if (!iter->status().ok()) {
-      size_ = 0;  // Error marker
+      contents_.data.size_ = 0;  // Error marker
       return;
     }
     protection_bytes_per_key_ = protection_bytes_per_key;
@@ -1197,7 +1194,7 @@ void Block::InitializeIndexBlockProtectionInfo(uint8_t protection_bytes_per_key,
       assert(!iter->status().ok() || i == num_keys * protection_bytes_per_key);
     }
     if (!iter->status().ok()) {
-      size_ = 0;  // Error marker
+      contents_.data.size_ = 0;  // Error marker
       return;
     }
     protection_bytes_per_key_ = protection_bytes_per_key;
@@ -1231,7 +1228,7 @@ void Block::InitializeMetaIndexBlockProtectionInfo(
       assert(!iter->status().ok() || i == num_keys * protection_bytes_per_key);
     }
     if (!iter->status().ok()) {
-      size_ = 0;  // Error marker
+      contents_.data.size_ = 0;  // Error marker
       return;
     }
     protection_bytes_per_key_ = protection_bytes_per_key;
@@ -1240,14 +1237,14 @@ void Block::InitializeMetaIndexBlockProtectionInfo(
 
 MetaBlockIter* Block::NewMetaIterator(bool block_contents_pinned) {
   MetaBlockIter* iter = new MetaBlockIter();
-  if (size_ < 2 * sizeof(uint32_t)) {
+  if (size() < 2 * sizeof(uint32_t)) {
     iter->Invalidate(Status::Corruption("bad block contents"));
     return iter;
   } else if (num_restarts_ == 0) {
     // Empty block.
     iter->Invalidate(Status::OK());
   } else {
-    iter->Initialize(data_, restart_offset_, num_restarts_,
+    iter->Initialize(data(), restart_offset_, num_restarts_,
                      block_contents_pinned, protection_bytes_per_key_,
                      kv_checksum_, block_restart_interval_);
   }
@@ -1265,7 +1262,7 @@ DataBlockIter* Block::NewDataIterator(const Comparator* raw_ucmp,
   } else {
     ret_iter = new DataBlockIter;
   }
-  if (size_ < 2 * sizeof(uint32_t)) {
+  if (size() < 2 * sizeof(uint32_t)) {
     ret_iter->Invalidate(Status::Corruption("bad block contents"));
     return ret_iter;
   }
@@ -1275,7 +1272,7 @@ DataBlockIter* Block::NewDataIterator(const Comparator* raw_ucmp,
     return ret_iter;
   } else {
     ret_iter->Initialize(
-        raw_ucmp, data_, restart_offset_, num_restarts_, global_seqno,
+        raw_ucmp, data(), restart_offset_, num_restarts_, global_seqno,
         read_amp_bitmap_.get(), block_contents_pinned,
         user_defined_timestamps_persisted,
         data_block_hash_index_.Valid() ? &data_block_hash_index_ : nullptr,
@@ -1303,7 +1300,7 @@ IndexBlockIter* Block::NewIndexIterator(
   } else {
     ret_iter = new IndexBlockIter;
   }
-  if (size_ < 2 * sizeof(uint32_t)) {
+  if (size() < 2 * sizeof(uint32_t)) {
     ret_iter->Invalidate(Status::Corruption("bad block contents"));
     return ret_iter;
   }
@@ -1315,7 +1312,7 @@ IndexBlockIter* Block::NewIndexIterator(
     BlockPrefixIndex* prefix_index_ptr =
         total_order_seek ? nullptr : prefix_index;
     ret_iter->Initialize(
-        raw_ucmp, data_, restart_offset_, num_restarts_, global_seqno,
+        raw_ucmp, data(), restart_offset_, num_restarts_, global_seqno,
         prefix_index_ptr, have_first_key, key_includes_seq, value_is_full,
         block_contents_pinned, user_defined_timestamps_persisted,
         protection_bytes_per_key_, kv_checksum_, block_restart_interval_);
diff --git a/table/block_based/block.h b/table/block_based/block.h
index 2cd2918a82d7..7d7011d40571 100644
--- a/table/block_based/block.h
+++ b/table/block_based/block.h
@@ -163,8 +163,8 @@ class Block {
 
   ~Block();
 
-  size_t size() const { return size_; }
-  const char* data() const { return data_; }
+  size_t size() const { return contents_.data.size(); }
+  const char* data() const { return contents_.data.data(); }
   // The additional memory space taken by the block data.
   size_t usable_size() const { return contents_.usable_size(); }
   uint32_t NumRestarts() const;
@@ -277,8 +277,6 @@ class Block {
 
  private:
   BlockContents contents_;
-  const char* data_;         // contents_.data.data()
-  size_t size_;              // contents_.data.size()
   uint32_t restart_offset_;  // Offset in data_ of restart array
   uint32_t num_restarts_;
   std::unique_ptr<BlockReadAmpBitmap> read_amp_bitmap_;

From 7183422b175ace55068264ae6bbe7bb6fbd6ccdb Mon Sep 17 00:00:00 2001
From: Andrew Chang <andrewrchang@meta.com>
Date: Thu, 26 Jun 2025 15:19:26 -0700
Subject: [PATCH 158/500] Check that NewWritableFile succeeded when copying
 over backup files (#13734)

Summary:
I am seeing crashes during backups. The stack trace points back to `WritableFileWriter` creation inside `BackupEngineImpl::CopyOrCreateFile`. I believe the issue is that we are calling `writable_file_->GetRequiredBufferAlignment()` with a `null` `writable_file`.

https://github.com/facebook/rocksdb/blob/v10.2.1/utilities/backup/backup_engine.cc#L2396-L2397

https://github.com/facebook/rocksdb/blob/v10.2.1/file/writable_file_writer.h#L210

Here's how I think the flow is:

```cpp
  io_s = dst_env->GetFileSystem()->NewWritableFile(dst, dst_file_options,
                                                   &dst_file, nullptr);
// say there was some issue and dst_file is nullptr
// evaluates to false
if (io_s.ok() && !src.empty()) {
   // we don't go down this branch
    auto src_file_options = FileOptions(src_env_options);
    src_file_options.temperature = *src_temperature;
    io_s = src_env->GetFileSystem()->NewSequentialFile(src, src_file_options,
                                                       &src_file, nullptr);
  }
  // say this evaluates to true
  if (io_s.IsPathNotFound() && *src_temperature != Temperature::kUnknown) {
    // Retry without temperature hint in case the FileSystem is strict with
    // non-kUnknown temperature option
    io_s = src_env->GetFileSystem()->NewSequentialFile(
        src, FileOptions(src_env_options), &src_file, nullptr);
  }
// this is now from the NewSequentialFile call, not NewWritableFile
  if (!io_s.ok()) {
    return io_s;
  }
// dst_file is still nullptr
```

If the first `NewWritableFile` fails and `IsPathNotFound

Tests: existing unit tests

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13734

Reviewed By: pdillinger

Differential Revision: D77390694

Pulled By: archang19

fbshipit-source-id: 865a3a646079ae2349a3b6f25e53ae85df8e4985
---
 utilities/backup/backup_engine.cc | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/utilities/backup/backup_engine.cc b/utilities/backup/backup_engine.cc
index b9b5c27f2371..420dc8155e2c 100644
--- a/utilities/backup/backup_engine.cc
+++ b/utilities/backup/backup_engine.cc
@@ -2372,7 +2372,11 @@ IOStatus BackupEngineImpl::CopyOrCreateFile(
 
   io_s = dst_env->GetFileSystem()->NewWritableFile(dst, dst_file_options,
                                                    &dst_file, nullptr);
-  if (io_s.ok() && !src.empty()) {
+  if (!io_s.ok()) {
+    return io_s;
+  }
+
+  if (!src.empty()) {
     auto src_file_options = FileOptions(src_env_options);
     src_file_options.temperature = *src_temperature;
     io_s = src_env->GetFileSystem()->NewSequentialFile(src, src_file_options,

From 80c9eec6b6543a3b0607dd75cfa7dea0fe0333b6 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Fri, 27 Jun 2025 12:47:32 -0700
Subject: [PATCH 159/500] Improve debugging of CacheWithSecondaryAdapter
 failures (#13737)

Summary:
improve assertions, one apparently a previous typo in https://github.com/facebook/rocksdb/issues/13606 and one a suspected possible area of logic error

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13737

Test Plan: watch crash test

Reviewed By: anand1976

Differential Revision: D77453102

Pulled By: pdillinger

fbshipit-source-id: d4196910a9e8d59ef814130a52ff4ebf188a976d
---
 cache/secondary_cache_adapter.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cache/secondary_cache_adapter.cc b/cache/secondary_cache_adapter.cc
index e1b41fb54d4a..f0d450eeadd2 100644
--- a/cache/secondary_cache_adapter.cc
+++ b/cache/secondary_cache_adapter.cc
@@ -130,7 +130,7 @@ CacheWithSecondaryAdapter::~CacheWithSecondaryAdapter() {
               "Secondary cache reserved: %zu\n",
               pri_cache_res_->GetTotalMemoryUsed(), sec_capacity,
               sec_reserved_);
-      assert(pri_cache_res_mismatch);
+      assert(!pri_cache_res_mismatch);
     }
   }
 #endif  // NDEBUG
@@ -613,6 +613,7 @@ Status CacheWithSecondaryAdapter::UpdateCacheReservationRatio(
     //    cache utilization (increase in capacity - increase in share of cache
     //    reservation)
     // 3. Increase secondary cache capacity
+    assert(new_sec_reserved >= sec_reserved_);
     s = secondary_cache_->Deflate(new_sec_reserved - sec_reserved_);
     assert(s.ok());
     s = pri_cache_res_->UpdateCacheReservation(

From 3cc76aae83804732229a9438ca96b8b0fa6682e8 Mon Sep 17 00:00:00 2001
From: Sujit Maharjan <sujitmaharjan@meta.com>
Date: Sat, 28 Jun 2025 09:24:14 -0700
Subject: [PATCH 160/500] Fix nightly build failure because preferred
 compression type was kNoCompression. (#13739)

Summary:
CostAwareCompressor simply ignores the preferred compression type as compression manager setting takes precedence over the compression type setting. Thus, I am removing the assert statement as it itself is unnecessary for this case.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13739

Test Plan:
Run nightly build test
```bash
make V=1 J=4 -j4 check
```

Reviewed By: hx235

Differential Revision: D77470932

Pulled By: shubhajeet

fbshipit-source-id: ebb69367d2ffb9bd72432fd04b0cd12ce2d6240a
---
 util/auto_tune_compressor.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/util/auto_tune_compressor.cc b/util/auto_tune_compressor.cc
index b76b27db32e6..9716322b9f75 100644
--- a/util/auto_tune_compressor.cc
+++ b/util/auto_tune_compressor.cc
@@ -294,7 +294,6 @@ std::unique_ptr<Compressor> CostAwareCompressorManager::GetCompressorForSST(
     const FilterBuildingContext& context, const CompressionOptions& opts,
     CompressionType preferred) {
   assert(GetSupportedCompressions().size() > 1);
-  assert(preferred != kNoCompression);
   (void)context;
   (void)preferred;
   return std::make_unique<CostAwareCompressor>(opts);

From 4f7d3a0cb228edb23cfdf5c53decc08782435999 Mon Sep 17 00:00:00 2001
From: Changyu Bi <changyubi@meta.com>
Date: Tue, 1 Jul 2025 11:07:51 -0700
Subject: [PATCH 161/500] Add a new periodic task to trigger compactions
 (#13736)

Summary:
address an existing limitation on compaction triggering mechanism that relies on events like flush/compaction/SetOptions. This is important for periodic compactions where files can become eligible without any of these events. The periodic task now runs every 12 hours and check CFs that enables `periodic_compaction_second` (TBD if we want to expand to all CFs) for eligible compactions.

Some of the periodic tasks probably don't need to run immediately after Register(). I'm keeping the existing behavior for now for patch release and to makes tests happy.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13736

Test Plan:
- new unit test that fails before this change.
- ran crash test for hours with the periodic task running every 5 seconds: `python3 ./tools/db_crashtest.py blackbox --test_batches_snapshot=0 --periodic_compaction_seconds=10`

Reviewed By: pdillinger

Differential Revision: D77460715

Pulled By: cbi42

fbshipit-source-id: 00f61502753185e76830c9ed44c5ccc4f4f16bfa
---
 db/column_family.cc                           |  2 +
 db/db_compaction_test.cc                      | 53 ++++++++++++++++++
 db/db_impl/db_impl.cc                         | 54 ++++++++++++++++---
 db/db_impl/db_impl.h                          |  6 +++
 db/periodic_task_scheduler.cc                 | 22 +++++---
 db/periodic_task_scheduler.h                  |  8 ++-
 db/periodic_task_scheduler_test.cc            | 26 +++++++--
 .../periodic-compaction-trigger.md            |  1 +
 8 files changed, 153 insertions(+), 19 deletions(-)
 create mode 100644 unreleased_history/behavior_changes/periodic-compaction-trigger.md

diff --git a/db/column_family.cc b/db/column_family.cc
index 03d4f8a8c34a..5968fa726ae7 100644
--- a/db/column_family.cc
+++ b/db/column_family.cc
@@ -1633,6 +1633,8 @@ Status ColumnFamilyData::SetOptions(
   Status s = GetColumnFamilyOptionsFromMap(config_opts, cf_opts, options_map,
                                            &cf_opts);
   if (s.ok()) {
+    // FIXME: we should call SanitizeOptions() too or consolidate it with
+    // ValidateOptions().
     s = ValidateOptions(db_opts, cf_opts);
   }
   if (s.ok()) {
diff --git a/db/db_compaction_test.cc b/db/db_compaction_test.cc
index 26af75656ea0..6614edba92bc 100644
--- a/db/db_compaction_test.cc
+++ b/db/db_compaction_test.cc
@@ -11093,6 +11093,59 @@ TEST_F(DBCompactionTest, RecordNewestKeyTimeForTtlCompaction) {
   ASSERT_OK(dbfull()->TEST_WaitForCompact());
   ASSERT_EQ(NumTableFilesAtLevel(0), 0);
 }
+
+class PeriodicCompactionListener : public EventListener {
+ public:
+  explicit PeriodicCompactionListener() {}
+  void OnCompactionBegin(DB* /*db*/, const CompactionJobInfo& ci) override {
+    if (ci.compaction_reason == CompactionReason::kPeriodicCompaction) {
+      ++num_periodic_compactions;
+    }
+  }
+
+  std::atomic<int> num_periodic_compactions = 0;
+};
+
+TEST_F(DBCompactionTest, PeriodicTask) {
+  // Tests that when no trigger event is fired (flush/compaction/setoptions),
+  // periodic compaction is still triggered by a scheduled periodic function.
+  auto mock_clock = std::make_shared<MockSystemClock>(env_->GetSystemClock());
+  mock_clock->SetCurrentTime(100);
+  mock_clock->InstallTimedWaitFixCallback();
+  auto mock_env = std::make_unique<CompositeEnvWrapper>(env_, mock_clock);
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::StartPeriodicTaskScheduler:Init", [&](void* arg) {
+        auto periodic_task_scheduler_ptr =
+            static_cast<PeriodicTaskScheduler*>(arg);
+        periodic_task_scheduler_ptr->TEST_OverrideTimer(mock_clock.get());
+      });
+
+  Options options;
+  options.env = mock_env.get();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.statistics = CreateDBStatistics();
+  int kPeriodicCompactionSeconds = 7 * 24 * 60 * 60;  // 1 week
+  options.periodic_compaction_seconds = kPeriodicCompactionSeconds;
+  options.num_levels = 50;
+  auto listener = std::make_shared<PeriodicCompactionListener>();
+  options.listeners.push_back(listener);
+  ASSERT_OK(TryReopen(options));
+
+  Random* rnd = Random::GetTLSInstance();
+  for (int k = 0; k < 10; ++k) {
+    ASSERT_OK(Put(Key(k), rnd->RandomString(100)));
+  }
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->CompactRange({}, nullptr, nullptr));
+  ASSERT_EQ(1, NumTableFilesAtLevel(49));
+
+  dbfull()->TEST_WaitForPeriodicTaskRun(
+      [&] { mock_clock->MockSleepForSeconds(kPeriodicCompactionSeconds + 1); });
+  ASSERT_OK(db_->WaitForCompact({}));
+
+  ASSERT_EQ(listener->num_periodic_compactions, 1);
+  Close();
+}
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc
index 8b0164e3328e..e64471b07c24 100644
--- a/db/db_impl/db_impl.cc
+++ b/db/db_impl/db_impl.cc
@@ -253,6 +253,9 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname,
   periodic_task_functions_.emplace(
       PeriodicTaskType::kRecordSeqnoTime,
       [this]() { this->RecordSeqnoToTimeMapping(); });
+  periodic_task_functions_.emplace(
+      PeriodicTaskType::kTriggerCompaction,
+      [this]() { this->TriggerPeriodicCompaction(); });
 
   versions_.reset(new VersionSet(
       dbname_, &immutable_db_options_, file_options_, table_cache_.get(),
@@ -787,7 +790,8 @@ Status DBImpl::StartPeriodicTaskScheduler() {
     Status s = periodic_task_scheduler_.Register(
         PeriodicTaskType::kDumpStats,
         periodic_task_functions_.at(PeriodicTaskType::kDumpStats),
-        mutable_db_options_.stats_dump_period_sec);
+        mutable_db_options_.stats_dump_period_sec,
+        /*run_immediately=*/true);
     if (!s.ok()) {
       return s;
     }
@@ -796,7 +800,8 @@ Status DBImpl::StartPeriodicTaskScheduler() {
     Status s = periodic_task_scheduler_.Register(
         PeriodicTaskType::kPersistStats,
         periodic_task_functions_.at(PeriodicTaskType::kPersistStats),
-        mutable_db_options_.stats_persist_period_sec);
+        mutable_db_options_.stats_persist_period_sec,
+        /*run_immediately=*/true);
     if (!s.ok()) {
       return s;
     }
@@ -804,7 +809,15 @@ Status DBImpl::StartPeriodicTaskScheduler() {
 
   Status s = periodic_task_scheduler_.Register(
       PeriodicTaskType::kFlushInfoLog,
-      periodic_task_functions_.at(PeriodicTaskType::kFlushInfoLog));
+      periodic_task_functions_.at(PeriodicTaskType::kFlushInfoLog),
+      /*run_immediately=*/true);
+
+  if (s.ok()) {
+    s = periodic_task_scheduler_.Register(
+        PeriodicTaskType::kTriggerCompaction,
+        periodic_task_functions_.at(PeriodicTaskType::kTriggerCompaction),
+        /*run_immediately=*/false);
+  }
 
   return s;
 }
@@ -855,7 +868,7 @@ Status DBImpl::RegisterRecordSeqnoTimeWorker() {
     s = periodic_task_scheduler_.Register(
         PeriodicTaskType::kRecordSeqnoTime,
         periodic_task_functions_.at(PeriodicTaskType::kRecordSeqnoTime),
-        seqno_time_cadence);
+        seqno_time_cadence, /*run_immediately=*/true);
   }
 
   return s;
@@ -1365,7 +1378,7 @@ Status DBImpl::SetDBOptions(
         s = periodic_task_scheduler_.Register(
             PeriodicTaskType::kDumpStats,
             periodic_task_functions_.at(PeriodicTaskType::kDumpStats),
-            new_options.stats_dump_period_sec);
+            new_options.stats_dump_period_sec, /*run_immediately=*/true);
       }
       if (new_options.max_total_wal_size !=
           mutable_db_options_.max_total_wal_size) {
@@ -1380,7 +1393,7 @@ Status DBImpl::SetDBOptions(
           s = periodic_task_scheduler_.Register(
               PeriodicTaskType::kPersistStats,
               periodic_task_functions_.at(PeriodicTaskType::kPersistStats),
-              new_options.stats_persist_period_sec);
+              new_options.stats_persist_period_sec, /*run_immediately=*/true);
         }
       }
       mutex_.Lock();
@@ -6882,6 +6895,35 @@ void DBImpl::RecordSeqnoToTimeMapping() {
   sv_context.Clean();
 }
 
+void DBImpl::TriggerPeriodicCompaction() {
+  TEST_SYNC_POINT("DBImpl::TriggerPeriodicCompaction:StartRunning");
+  {
+    InstrumentedMutexLock l(&mutex_);
+    ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                   "Running the periodic task to trigger compactions.");
+
+    for (ColumnFamilyData* cfd : *versions_->GetColumnFamilySet()) {
+      if (cfd->IsDropped()) {
+        continue;
+      }
+      if (cfd->GetLatestCFOptions().periodic_compaction_seconds &&
+          !cfd->queued_for_compaction()) {
+        cfd->current()->storage_info()->ComputeCompactionScore(
+            cfd->ioptions(), cfd->GetLatestMutableCFOptions());
+        EnqueuePendingCompaction(cfd);
+        if (cfd->queued_for_compaction()) {
+          ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                         "Periodic task to trigger compaction queued Column "
+                         "family [%s] for compaction.",
+                         cfd->GetName().c_str());
+        }
+      }
+    }
+    MaybeScheduleFlushOrCompaction();
+    bg_cv_.SignalAll();
+  }
+}
+
 void DBImpl::TrackOrUntrackFiles(
     const std::vector<std::string>& existing_data_files, bool track) {
   auto sfm = static_cast_with_check<SstFileManagerImpl>(
diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h
index 66918e1d0077..be51ac567cb7 100644
--- a/db/db_impl/db_impl.h
+++ b/db/db_impl/db_impl.h
@@ -1286,6 +1286,12 @@ class DBImpl : public DB {
   // For the background timer job
   void RecordSeqnoToTimeMapping();
 
+  // Compactions rely on an event triggers like flush/compaction/SetOptions.
+  // We need to trigger periodic compactions even when there is no such trigger.
+  // This function checks and schedules available compactions and will run
+  // periodically.
+  void TriggerPeriodicCompaction();
+
   // REQUIRES: DB mutex held
   std::pair<SequenceNumber, uint64_t> GetSeqnoToTimeSample() const;
 
diff --git a/db/periodic_task_scheduler.cc b/db/periodic_task_scheduler.cc
index 2f266529c57c..ee3f07b91e73 100644
--- a/db/periodic_task_scheduler.cc
+++ b/db/periodic_task_scheduler.cc
@@ -26,6 +26,7 @@ static const std::map<PeriodicTaskType, uint64_t> kDefaultPeriodSeconds = {
     {PeriodicTaskType::kPersistStats, kInvalidPeriodSec},
     {PeriodicTaskType::kFlushInfoLog, 10},
     {PeriodicTaskType::kRecordSeqnoTime, kInvalidPeriodSec},
+    {PeriodicTaskType::kTriggerCompaction, 12 * 60 * 60}  // 12 hours
 };
 
 static const std::map<PeriodicTaskType, std::string> kPeriodicTaskTypeNames = {
@@ -33,16 +34,20 @@ static const std::map<PeriodicTaskType, std::string> kPeriodicTaskTypeNames = {
     {PeriodicTaskType::kPersistStats, "pst_st"},
     {PeriodicTaskType::kFlushInfoLog, "flush_info_log"},
     {PeriodicTaskType::kRecordSeqnoTime, "record_seq_time"},
+    {PeriodicTaskType::kTriggerCompaction, "trigger_compaction"},
 };
 
 Status PeriodicTaskScheduler::Register(PeriodicTaskType task_type,
-                                       const PeriodicTaskFunc& fn) {
-  return Register(task_type, fn, kDefaultPeriodSeconds.at(task_type));
+                                       const PeriodicTaskFunc& fn,
+                                       bool run_immediately) {
+  return Register(task_type, fn, kDefaultPeriodSeconds.at(task_type),
+                  run_immediately);
 }
 
 Status PeriodicTaskScheduler::Register(PeriodicTaskType task_type,
                                        const PeriodicTaskFunc& fn,
-                                       uint64_t repeat_period_seconds) {
+                                       uint64_t repeat_period_seconds,
+                                       bool run_immediately) {
   MutexLock l(&timer_mutex);
   static std::atomic<uint64_t> initial_delay(0);
 
@@ -65,10 +70,13 @@ Status PeriodicTaskScheduler::Register(PeriodicTaskType task_type,
   std::string unique_id =
       kPeriodicTaskTypeNames.at(task_type) + std::to_string(id_++);
 
-  bool succeeded = timer_->Add(
-      fn, unique_id,
-      (initial_delay.fetch_add(1) % repeat_period_seconds) * kMicrosInSecond,
-      repeat_period_seconds * kMicrosInSecond);
+  uint64_t initial_delay_micros =
+      (initial_delay.fetch_add(1) % repeat_period_seconds) * kMicrosInSecond;
+  if (!run_immediately) {
+    initial_delay_micros += repeat_period_seconds * kMicrosInSecond;
+  }
+  bool succeeded = timer_->Add(fn, unique_id, initial_delay_micros,
+                               repeat_period_seconds * kMicrosInSecond);
   if (!succeeded) {
     return Status::Aborted("Failed to register periodic task");
   }
diff --git a/db/periodic_task_scheduler.h b/db/periodic_task_scheduler.h
index 3ac8a3b9cee6..8511f5f2d8e7 100644
--- a/db/periodic_task_scheduler.h
+++ b/db/periodic_task_scheduler.h
@@ -21,6 +21,7 @@ enum class PeriodicTaskType : uint8_t {
   kPersistStats,
   kFlushInfoLog,
   kRecordSeqnoTime,
+  kTriggerCompaction,
   kMax,
 };
 
@@ -42,13 +43,16 @@ class PeriodicTaskScheduler {
   PeriodicTaskScheduler& operator=(PeriodicTaskScheduler&&) = delete;
 
   // Register a task with its default repeat period. Thread safe call.
-  Status Register(PeriodicTaskType task_type, const PeriodicTaskFunc& fn);
+  // @param run_immediately If true, the task will run soon after it's
+  // scheduled, instead of waiting for the repeat period.
+  Status Register(PeriodicTaskType task_type, const PeriodicTaskFunc& fn,
+                  bool run_immediately);
 
   // Register a task with specified repeat period. 0 is an invalid argument
   // (kInvalidPeriodSec). To stop the task, please use Unregister().
   // Thread safe call.
   Status Register(PeriodicTaskType task_type, const PeriodicTaskFunc& fn,
-                  uint64_t repeat_period_seconds);
+                  uint64_t repeat_period_seconds, bool run_immediately);
 
   // Unregister the task. Thread safe call.
   Status Unregister(PeriodicTaskType task_type);
diff --git a/db/periodic_task_scheduler_test.cc b/db/periodic_task_scheduler_test.cc
index baf74ed15e3a..e0904abe3dd0 100644
--- a/db/periodic_task_scheduler_test.cc
+++ b/db/periodic_task_scheduler_test.cc
@@ -56,6 +56,12 @@ TEST_F(PeriodicTaskSchedulerTest, Basic) {
   SyncPoint::GetInstance()->SetCallBack(
       "DBImpl::FlushInfoLog:StartRunning",
       [&](void*) { flush_info_log_counter++; });
+
+  int trigger_compaction_counter = 0;
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::TriggerPeriodicCompaction:StartRunning",
+      [&](void*) { trigger_compaction_counter++; });
+
   SyncPoint::GetInstance()->EnableProcessing();
 
   Reopen(options);
@@ -70,7 +76,7 @@ TEST_F(PeriodicTaskSchedulerTest, Basic) {
 
   const PeriodicTaskScheduler& scheduler =
       dbfull()->TEST_GetPeriodicTaskScheduler();
-  ASSERT_EQ(3, scheduler.TEST_GetValidTaskNum());
+  ASSERT_EQ((int)PeriodicTaskType::kMax - 1, scheduler.TEST_GetValidTaskNum());
 
   ASSERT_EQ(1, dump_st_counter);
   ASSERT_EQ(1, pst_st_counter);
@@ -103,14 +109,14 @@ TEST_F(PeriodicTaskSchedulerTest, Basic) {
   ASSERT_EQ(3, pst_st_counter);
   ASSERT_EQ(4, flush_info_log_counter);
 
-  ASSERT_EQ(1u, scheduler.TEST_GetValidTaskNum());
+  ASSERT_EQ(2u, scheduler.TEST_GetValidTaskNum());
 
   // Re-enable one task
   ASSERT_OK(dbfull()->SetDBOptions({{"stats_dump_period_sec", "5"}}));
   ASSERT_EQ(5u, dbfull()->GetDBOptions().stats_dump_period_sec);
   ASSERT_EQ(0u, dbfull()->GetDBOptions().stats_persist_period_sec);
 
-  ASSERT_EQ(2, scheduler.TEST_GetValidTaskNum());
+  ASSERT_EQ(3, scheduler.TEST_GetValidTaskNum());
 
   dbfull()->TEST_WaitForPeriodicTaskRun(
       [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(kPeriodSec)); });
@@ -118,6 +124,16 @@ TEST_F(PeriodicTaskSchedulerTest, Basic) {
   ASSERT_EQ(3, pst_st_counter);
   ASSERT_EQ(5, flush_info_log_counter);
 
+  ASSERT_EQ(0, trigger_compaction_counter);
+  dbfull()->TEST_WaitForPeriodicTaskRun([&] {
+    mock_clock_->MockSleepForSeconds(static_cast<int>(12 * 60 * 60));
+  });
+  ASSERT_EQ(1, trigger_compaction_counter);
+  dbfull()->TEST_WaitForPeriodicTaskRun([&] {
+    mock_clock_->MockSleepForSeconds(static_cast<int>(12 * 60 * 60));
+  });
+  ASSERT_EQ(2, trigger_compaction_counter);
+
   Close();
 }
 
@@ -150,7 +166,9 @@ TEST_F(PeriodicTaskSchedulerTest, MultiInstances) {
   auto dbi = static_cast_with_check<DBImpl>(dbs[kInstanceNum - 1]);
 
   const PeriodicTaskScheduler& scheduler = dbi->TEST_GetPeriodicTaskScheduler();
-  ASSERT_EQ(kInstanceNum * 3, scheduler.TEST_GetValidTaskNum());
+  // kRecordSeqnoTime is not registered since the feature is not enabled
+  ASSERT_EQ(kInstanceNum * ((int)PeriodicTaskType::kMax - 1),
+            scheduler.TEST_GetValidTaskNum());
 
   int expected_run = kInstanceNum;
   dbi->TEST_WaitForPeriodicTaskRun(
diff --git a/unreleased_history/behavior_changes/periodic-compaction-trigger.md b/unreleased_history/behavior_changes/periodic-compaction-trigger.md
new file mode 100644
index 000000000000..b9c8e7c1c227
--- /dev/null
+++ b/unreleased_history/behavior_changes/periodic-compaction-trigger.md
@@ -0,0 +1 @@
+* RocksDB now triggers eligible compactions every 12 hours when periodic compaction is configured. This solves a limitation of the compaction trigger mechanism, which would only trigger compaction after specific events like flush, compaction, or SetOptions.

From f081d145cf512ac662d8df62e733c246214ed18e Mon Sep 17 00:00:00 2001
From: Changyu Bi <changyubi@meta.com>
Date: Wed, 2 Jul 2025 17:31:16 -0700
Subject: [PATCH 162/500] Backport internal changes (#13752)

Summary:
Pull Request resolved: https://github.com/facebook/rocksdb/pull/13752

... to github repo. This include changes from D77323287,  D77473923 and the release note change in patch release: D77611483.

Reviewed By: archang19

Differential Revision: D77670619

fbshipit-source-id: 37d877f3317c71de190128fa4da6b18f6dfcf3c5
---
 db/version_set.cc                             | 14 +++++++------
 .../bug_fixes/backup-engine-crash.md          |  1 +
 util/timer_queue_test.cc                      | 20 ++++++++++++++++---
 3 files changed, 26 insertions(+), 9 deletions(-)
 create mode 100644 unreleased_history/bug_fixes/backup-engine-crash.md

diff --git a/db/version_set.cc b/db/version_set.cc
index 537287577e41..66b33527ffed 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -2741,9 +2741,10 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
           RecordTick(db_statistics_, MULTIGET_COROUTINE_COUNT,
                      mget_tasks.size());
           // Collect all results so far
-          std::vector<Status> statuses = folly::coro::blockingWait(
-              folly::coro::collectAllRange(std::move(mget_tasks))
-                  .scheduleOn(&range->context()->executor()));
+          std::vector<Status> statuses =
+              folly::coro::blockingWait(co_withExecutor(
+                  &range->context()->executor(),
+                  folly::coro::collectAllRange(std::move(mget_tasks))));
           if (s.ok()) {
             for (Status stat : statuses) {
               if (!stat.ok()) {
@@ -3028,9 +3029,10 @@ Status Version::MultiGetAsync(
         assert(waiting.size());
         RecordTick(db_statistics_, MULTIGET_COROUTINE_COUNT, mget_tasks.size());
         // Collect all results so far
-        std::vector<Status> statuses = folly::coro::blockingWait(
-            folly::coro::collectAllRange(std::move(mget_tasks))
-                .scheduleOn(&range->context()->executor()));
+        std::vector<Status> statuses =
+            folly::coro::blockingWait(co_withExecutor(
+                &range->context()->executor(),
+                folly::coro::collectAllRange(std::move(mget_tasks))));
         mget_tasks.clear();
         if (s.ok()) {
           for (Status stat : statuses) {
diff --git a/unreleased_history/bug_fixes/backup-engine-crash.md b/unreleased_history/bug_fixes/backup-engine-crash.md
new file mode 100644
index 000000000000..20ce0894f83e
--- /dev/null
+++ b/unreleased_history/bug_fixes/backup-engine-crash.md
@@ -0,0 +1 @@
+* Fix a bug in BackupEngine that can crash backup due to a null FSWritableFile passed to WritableFileWriter.
diff --git a/util/timer_queue_test.cc b/util/timer_queue_test.cc
index b3c3768ec797..3afae866290d 100644
--- a/util/timer_queue_test.cc
+++ b/util/timer_queue_test.cc
@@ -28,6 +28,10 @@
 
 #include <future>
 
+#include "test_util/testharness.h"
+
+namespace ROCKSDB_NAMESPACE {
+
 namespace Timing {
 
 using Clock = std::chrono::high_resolution_clock;
@@ -39,7 +43,9 @@ double now() {
 
 }  // namespace Timing
 
-int main() {
+class TimerQueueTest : public testing::Test {};
+
+TEST_F(TimerQueueTest, BasicFunctionality) {
   TimerQueue q;
 
   double tnow = Timing::now();
@@ -68,6 +74,14 @@ int main() {
   // assert(ret == 1);
   // q.cancelAll();
 
-  return 0;
+  // Test passes if we can create and add timers without crashing
+  ASSERT_TRUE(true);
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
 }
-//////////////////////////////////////////

From ab6ba62eb172ac53fc053de09d6036af0de4768a Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Wed, 2 Jul 2025 18:58:14 -0700
Subject: [PATCH 163/500] Possible fix for CacheWithSecondaryAdapter assertion
 failures (#13747)

Summary:
Was reading sec_cache_res_ratio_ outside of mutex and using the result for computation that needs to be synchronized

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13747

Test Plan: existing tests. Has been showing up in crash test, and there's no interesting concurrency here that would warrant a regression test based on sync points.

Reviewed By: cbi42

Differential Revision: D77607660

Pulled By: pdillinger

fbshipit-source-id: 12a71936b3558c7528d229a11c7d2e43982ad06b
---
 cache/secondary_cache_adapter.cc | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/cache/secondary_cache_adapter.cc b/cache/secondary_cache_adapter.cc
index f0d450eeadd2..4a9e3decc94a 100644
--- a/cache/secondary_cache_adapter.cc
+++ b/cache/secondary_cache_adapter.cc
@@ -489,12 +489,10 @@ const char* CacheWithSecondaryAdapter::Name() const {
 // as well. At the moment, we don't have a good way of handling the case
 // where the new capacity < total cache reservations.
 void CacheWithSecondaryAdapter::SetCapacity(size_t capacity) {
-  size_t sec_capacity = static_cast<size_t>(
-      capacity * (distribute_cache_res_ ? sec_cache_res_ratio_ : 0.0));
-  size_t old_sec_capacity = 0;
-
   if (distribute_cache_res_) {
     MutexLock m(&cache_res_mutex_);
+    size_t sec_capacity = static_cast<size_t>(capacity * sec_cache_res_ratio_);
+    size_t old_sec_capacity = 0;
 
     Status s = secondary_cache_->GetCapacity(old_sec_capacity);
     if (!s.ok()) {

From 805ac7c887bbebd7e6124bbc7e8d0803aeed8f07 Mon Sep 17 00:00:00 2001
From: Alan Paxton <alan.paxton@gmail.com>
Date: Mon, 7 Jul 2025 13:28:49 -0700
Subject: [PATCH 164/500] Update compression libraries to latest releases
 (#13609)

Summary:
See `Makefile` for actual changes:

* ZLIB remains the same
* BZIP2 remains the same
* SNAPPY is a minor update
* LZ4 is a significant update with multithreaded/multicore compression https://github.com/lz4/lz4/releases/tag/v1.10.0
* ZSTD is a significant update RocksDB is called out as benefiting in particular from the performance improvements herein https://github.com/facebook/zstd/releases/tag/v1.5.7

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13609

Reviewed By: archang19

Differential Revision: D77877295

Pulled By: mszeszko-meta

fbshipit-source-id: bf9a257e8f68dec3d02743b339aa2df65df4ab2c
---
 .github/workflows/pr-jobs.yml                 |  6 +--
 Makefile                                      | 36 +++++++++---------
 .../java/org/rocksdb/NativeLibraryLoader.java | 37 +++++++++++++++++--
 java/src/main/java/org/rocksdb/RocksDB.java   | 34 +++++++++++------
 4 files changed, 76 insertions(+), 37 deletions(-)

diff --git a/.github/workflows/pr-jobs.yml b/.github/workflows/pr-jobs.yml
index 83e229378ea7..eddaa85b1952 100644
--- a/.github/workflows/pr-jobs.yml
+++ b/.github/workflows/pr-jobs.yml
@@ -429,7 +429,7 @@ jobs:
     runs-on:
       labels: 4-core-ubuntu
     container:
-      image: evolvedbinary/rocksjava:centos6_x64-be
+      image: evolvedbinary/rocksjava:centos7_x64-be
       options: --shm-size=16gb
     steps:
     # The docker image is intentionally based on an OS that has an older GLIBC version.
@@ -457,7 +457,7 @@ jobs:
     runs-on:
       labels: 4-core-ubuntu
     container:
-      image: evolvedbinary/rocksjava:centos6_x64-be
+      image: evolvedbinary/rocksjava:centos7_x64-be
       options: --shm-size=16gb
     steps:
     # The docker image is intentionally based on an OS that has an older GLIBC version.
@@ -552,7 +552,7 @@ jobs:
     runs-on:
       labels: 4-core-ubuntu
     container:
-      image: evolvedbinary/rocksjava:rockylinux8_x64-be
+      image: evolvedbinary/rocksjava:alpine3_x64-be
       options: --shm-size=16gb
     steps:
     - uses: actions/checkout@v4.1.0
diff --git a/Makefile b/Makefile
index 5426c099f9ad..db84477e9e6a 100644
--- a/Makefile
+++ b/Makefile
@@ -2147,14 +2147,14 @@ ZLIB_DOWNLOAD_BASE ?= http://zlib.net
 BZIP2_VER ?= 1.0.8
 BZIP2_SHA256 ?= ab5a03176ee106d3f0fa90e381da478ddae405918153cca248e682cd0c4a2269
 BZIP2_DOWNLOAD_BASE ?= http://sourceware.org/pub/bzip2
-SNAPPY_VER ?= 1.2.1
-SNAPPY_SHA256 ?= 736aeb64d86566d2236ddffa2865ee5d7a82d26c9016b36218fcc27ea4f09f86
+SNAPPY_VER ?= 1.2.2
+SNAPPY_SHA256 ?= 90f74bc1fbf78a6c56b3c4a082a05103b3a56bb17bca1a27e052ea11723292dc
 SNAPPY_DOWNLOAD_BASE ?= https://github.com/google/snappy/archive
-LZ4_VER ?= 1.9.4
-LZ4_SHA256 ?= 0b0e3aa07c8c063ddf40b082bdf7e37a1562bda40a0ff5272957f3e987e0e54b
+LZ4_VER ?= 1.10.0
+LZ4_SHA256 ?= 537512904744b35e232912055ccf8ec66d768639ff3abe5788d90d792ec5f48b
 LZ4_DOWNLOAD_BASE ?= https://github.com/lz4/lz4/archive
-ZSTD_VER ?= 1.5.5
-ZSTD_SHA256 ?= 98e9c3d949d1b924e28e01eccb7deed865eefebf25c2f21c702e5cd5b63b85e1
+ZSTD_VER ?= 1.5.7
+ZSTD_SHA256 ?= 37d7284556b20954e56e1ca85b80226768902e2edabd3b649e9e72c0c9012ee3
 ZSTD_DOWNLOAD_BASE ?= https://github.com/facebook/zstd/archive
 CURL_SSL_OPTS ?= --tlsv1
 
@@ -2375,27 +2375,27 @@ rocksdbjavastaticreleasedocker: rocksdbjavastaticosx rocksdbjavastaticdockerx86
 
 rocksdbjavastaticdockerx86:
 	mkdir -p java/target
-	docker run --rm --name rocksdb_linux_x86-be --platform linux/386 --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) --env J=$(J) evolvedbinary/rocksjava:centos6_x86-be /rocksdb-host/java/crossbuild/docker-build-linux.sh
+	docker run --rm --name rocksdb_linux_x86-be --platform linux/386 --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) --env J=$(J) evolvedbinary/rocksjava:centos7_x86-be /rocksdb-host/java/crossbuild/docker-build-linux.sh
 
 rocksdbjavastaticdockerx86_64:
 	mkdir -p java/target
-	docker run --rm --name rocksdb_linux_x64-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) --env J=$(J) evolvedbinary/rocksjava:centos6_x64-be /rocksdb-host/java/crossbuild/docker-build-linux.sh
+	docker run --rm --name rocksdb_linux_x64-be --platform linux/amd64 --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) --env J=$(J) evolvedbinary/rocksjava:centos7_x64-be /rocksdb-host/java/crossbuild/docker-build-linux.sh
 
 rocksdbjavastaticdockerppc64le:
 	mkdir -p java/target
-	docker run --rm --name rocksdb_linux_ppc64le-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) --env J=$(J) evolvedbinary/rocksjava:centos7_ppc64le-be /rocksdb-host/java/crossbuild/docker-build-linux.sh
+	docker run --rm --name rocksdb_linux_ppc64le-be --platform linux/ppc64le --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) --env J=$(J) evolvedbinary/rocksjava:centos7_ppc64le-be /rocksdb-host/java/crossbuild/docker-build-linux.sh
 
 rocksdbjavastaticdockerarm64v8:
 	mkdir -p java/target
-	docker run --rm --name rocksdb_linux_arm64v8-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) --env J=$(J) evolvedbinary/rocksjava:centos7_arm64v8-be /rocksdb-host/java/crossbuild/docker-build-linux.sh
+	docker run --rm --name rocksdb_linux_arm64v8-be --platform linux/aarch64 --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) --env J=$(J) evolvedbinary/rocksjava:centos7_arm64v8-be /rocksdb-host/java/crossbuild/docker-build-linux.sh
 
 rocksdbjavastaticdockers390x:
 	mkdir -p java/target
-	docker run --rm --name rocksdb_linux_s390x-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) --env J=$(J) evolvedbinary/rocksjava:ubuntu18_s390x-be /rocksdb-host/java/crossbuild/docker-build-linux.sh
+	docker run --rm --name rocksdb_linux_s390x-be --platform linux/s390x --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) --env J=$(J) evolvedbinary/rocksjava:ubuntu18_s390x-be /rocksdb-host/java/crossbuild/docker-build-linux.sh
 
 rocksdbjavastaticdockerriscv64:
 	mkdir -p java/target
-	docker run --rm --name rocksdb_linux_riscv64-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) --env J=$(J) evolvedbinary/rocksjava:ubuntu20_riscv64-be /rocksdb-host/java/crossbuild/docker-build-linux.sh
+	docker run --rm --name rocksdb_linux_riscv64-be --platform linux/riscv64 --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) --env J=$(J) evolvedbinary/rocksjava:ubuntu20_riscv64-be /rocksdb-host/java/crossbuild/docker-build-linux.sh
 
 rocksdbjavastaticdockerx86musl:
 	mkdir -p java/target
@@ -2403,19 +2403,19 @@ rocksdbjavastaticdockerx86musl:
 
 rocksdbjavastaticdockerx86_64musl:
 	mkdir -p java/target
-	docker run --rm --name rocksdb_linux_x64-musl-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) --env J=$(J) evolvedbinary/rocksjava:alpine3_x64-be /rocksdb-host/java/crossbuild/docker-build-linux.sh
+	docker run --rm --name rocksdb_linux_x64-musl-be --platform linux/amd64 --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) --env J=$(J) evolvedbinary/rocksjava:alpine3_x64-be /rocksdb-host/java/crossbuild/docker-build-linux.sh
 
 rocksdbjavastaticdockerppc64lemusl:
 	mkdir -p java/target
-	docker run --rm --name rocksdb_linux_ppc64le-musl-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) --env J=$(J) evolvedbinary/rocksjava:alpine3_ppc64le-be /rocksdb-host/java/crossbuild/docker-build-linux.sh
+	docker run --rm --name rocksdb_linux_ppc64le-musl-be --platform linux/ppc64le --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) --env J=$(J) evolvedbinary/rocksjava:alpine3_ppc64le-be /rocksdb-host/java/crossbuild/docker-build-linux.sh
 
 rocksdbjavastaticdockerarm64v8musl:
 	mkdir -p java/target
-	docker run --rm --name rocksdb_linux_arm64v8-musl-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) --env J=$(J) evolvedbinary/rocksjava:alpine3_arm64v8-be /rocksdb-host/java/crossbuild/docker-build-linux.sh
+	docker run --rm --name rocksdb_linux_arm64v8-musl-be --platform linux/aarch64 --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) --env J=$(J) evolvedbinary/rocksjava:alpine3_arm64v8-be /rocksdb-host/java/crossbuild/docker-build-linux.sh
 
 rocksdbjavastaticdockers390xmusl:
 	mkdir -p java/target
-	docker run --rm --name rocksdb_linux_s390x-musl-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) --env J=$(J) evolvedbinary/rocksjava:alpine3_s390x-be /rocksdb-host/java/crossbuild/docker-build-linux.sh
+	docker run --rm --name rocksdb_linux_s390x-musl-be --platform linux/s390x --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) --env J=$(J) evolvedbinary/rocksjava:alpine3_s390x-be /rocksdb-host/java/crossbuild/docker-build-linux.sh
 
 rocksdbjavastaticpublish: rocksdbjavastaticrelease rocksdbjavastaticpublishcentral
 
@@ -2470,8 +2470,8 @@ jtest_run:
 jtest: rocksdbjava
 	cd java;$(MAKE) sample test
 
-jpmd: rocksdbjava rocksdbjavageneratepom
-	cd java;$(MAKE) pmd
+jpmd: rocksdbjavageneratepom
+	cd java;$(MAKE) java java_test pmd
 
 jdb_bench:
 	cd java;$(MAKE) db_bench;
diff --git a/java/src/main/java/org/rocksdb/NativeLibraryLoader.java b/java/src/main/java/org/rocksdb/NativeLibraryLoader.java
index 6fe97994d201..aa841c6f3688 100644
--- a/java/src/main/java/org/rocksdb/NativeLibraryLoader.java
+++ b/java/src/main/java/org/rocksdb/NativeLibraryLoader.java
@@ -30,6 +30,14 @@ public class NativeLibraryLoader {
   private static final String tempFilePrefix = "librocksdbjni";
   private static final String tempFileSuffix = Environment.getJniLibraryExtension();
 
+  /**
+   * If you set the System Property ROCKS_JAVA_DEBUG_NLL can be to true
+   * messages about attempts to load the native library will be printed
+   * to std out.
+   */
+  private static boolean DEBUG_LOADING =
+      "true".equals(System.getProperty("ROCKS_JAVA_DEBUG_NLL", "false"));
+
   /**
    * Get a reference to the NativeLibraryLoader
    *
@@ -55,7 +63,7 @@ public static NativeLibraryLoader getInstance() {
    *
    * @throws java.io.IOException if a filesystem operation fails.
    */
-  @SuppressWarnings("PMD.EmptyCatchBlock")
+  @SuppressWarnings({"PMD.EmptyCatchBlock", "PMD.SystemPrintln"})
   public synchronized void loadLibrary(final String tmpDir) throws IOException {
     try {
       // try dynamic library
@@ -63,6 +71,9 @@ public synchronized void loadLibrary(final String tmpDir) throws IOException {
       return;
     } catch (final UnsatisfiedLinkError ule) {
       // ignore - try from static library
+      if (DEBUG_LOADING) {
+        System.out.println("Unable to load shared dynamic library: " + sharedLibraryName);
+      }
     }
 
     try {
@@ -71,6 +82,9 @@ public synchronized void loadLibrary(final String tmpDir) throws IOException {
       return;
     } catch (final UnsatisfiedLinkError ule) {
       // ignore - then try static library fallback or from jar
+      if (DEBUG_LOADING) {
+        System.out.println("Unable to load shared static library: " + jniLibraryName);
+      }
     }
 
     if (fallbackJniLibraryName != null) {
@@ -80,6 +94,10 @@ public synchronized void loadLibrary(final String tmpDir) throws IOException {
         return;
       } catch (final UnsatisfiedLinkError ule) {
         // ignore - then try from jar
+        if (DEBUG_LOADING) {
+          System.out.println(
+              "Unable to load shared static fallback library: " + fallbackJniLibraryName);
+        }
       }
     }
 
@@ -137,18 +155,23 @@ private File createTemp(final String tmpDir, final String libraryFileName) throw
     }
   }
 
-  @SuppressWarnings({"PMD.UseProperClassLoader", "PMD.UseTryWithResources"})
+  @SuppressWarnings({"PMD.UseProperClassLoader", "PMD.UseTryWithResources", "PMD.SystemPrintln"})
   File loadLibraryFromJarToTemp(final String tmpDir) throws IOException {
     try (InputStream is = getClass().getClassLoader().getResourceAsStream(jniLibraryFileName)) {
       if (is != null) {
         final File temp = createTemp(tmpDir, jniLibraryFileName);
         Files.copy(is, temp.toPath(), StandardCopyOption.REPLACE_EXISTING);
         return temp;
+      } else {
+        if (DEBUG_LOADING) {
+          System.out.println("Unable to find: " + jniLibraryFileName + " on the classpath");
+        }
       }
     }
 
     if (fallbackJniLibraryFileName == null) {
-      throw new RuntimeException(fallbackJniLibraryFileName + " was not found inside JAR.");
+      throw new RuntimeException(
+          jniLibraryFileName + " was not found inside JAR, and there is no fallback.");
     }
 
     try (InputStream is =
@@ -157,10 +180,16 @@ File loadLibraryFromJarToTemp(final String tmpDir) throws IOException {
         final File temp = createTemp(tmpDir, fallbackJniLibraryFileName);
         Files.copy(is, temp.toPath(), StandardCopyOption.REPLACE_EXISTING);
         return temp;
+      } else {
+        if (DEBUG_LOADING) {
+          System.out.println(
+              "Unable to find fallback: " + fallbackJniLibraryFileName + " on the classpath");
+        }
       }
     }
 
-    throw new RuntimeException(jniLibraryFileName + " was not found inside JAR.");
+    throw new RuntimeException("Neither " + jniLibraryFileName + " or " + fallbackJniLibraryFileName
+        + " were found inside the JAR, and there is no fallback.");
   }
 
   /**
diff --git a/java/src/main/java/org/rocksdb/RocksDB.java b/java/src/main/java/org/rocksdb/RocksDB.java
index b423649c111d..fe2f38af64f9 100644
--- a/java/src/main/java/org/rocksdb/RocksDB.java
+++ b/java/src/main/java/org/rocksdb/RocksDB.java
@@ -84,13 +84,7 @@ public static void loadLibrary() {
       return;
     }
 
-    while (libraryLoaded.get() == LibraryState.LOADING) {
-      try {
-        Thread.sleep(10);
-      } catch(final InterruptedException e) {
-        //ignore
-      }
-    }
+    waitForLibraryToBeLoaded();
   }
 
   /**
@@ -146,12 +140,28 @@ public static void loadLibrary(final List<String> paths) {
       return;
     }
 
-    while (libraryLoaded.get() == LibraryState.LOADING) {
-      try {
-        Thread.sleep(10);
-      } catch(final InterruptedException e) {
-        //ignore
+    waitForLibraryToBeLoaded();
+  }
+
+  private static void waitForLibraryToBeLoaded() {
+    final long wait = 10; // Time to wait before re-checking if another thread loaded the library
+    final long timeout =
+        10 * 1000; // Maximum time to wait for another thread to load the library (10 seconds)
+    long waited = 0;
+    try {
+      while (libraryLoaded.get() == LibraryState.LOADING) {
+        Thread.sleep(wait);
+        waited += wait;
+
+        if (waited >= timeout) {
+          throw new RuntimeException(
+              "Exceeded timeout whilst trying to load the RocksDB shared library");
+        }
       }
+    } catch (final InterruptedException e) {
+      // restore interrupted status
+      Thread.currentThread().interrupt();
+      throw new RuntimeException("Interrupted whilst trying to load the RocksDB shared library", e);
     }
   }
 

From 3381e4d787b7f83d48f453780d9f0ec8fec99602 Mon Sep 17 00:00:00 2001
From: akabcenell <akabcenell@users.noreply.github.com>
Date: Tue, 8 Jul 2025 15:01:59 -0700
Subject: [PATCH 165/500] Change GetWaitingTxns() to return blocking lock on
 timeout (#13754)

Summary:
While a transaction is waiting on a lock, we can use GetWaitingTxns() to determine the transactionID of the blocking transaction and the contended key. However, this gets cleared when the lock times out, so if a client has widespread timeout errors, you need to catch a transaction 'in the act' before they actually hit the timeout in order to understand the contention pattern. This diff adds a new TransactionOptions variable enable_get_waiting_txn_after_timeout, which persists the lock contention information after timeout so it can be accessed by the client after they have received the timeout error.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13754

Test Plan:
- updated TransactionTest.WaitingTxn to test the changed behavior
- ran production shadow tests on traffic with frequent timeouts

Reviewed By: cbi42

Differential Revision: D77703598

Pulled By: akabcenell

fbshipit-source-id: b4448ca1b6a3694d51bfe1ce801b09eb376ff3e9
---
 .../get-waiting-txns-after-timeout.md         |  1 +
 .../lock/point/point_lock_manager.cc          |  5 +++++
 .../transactions/pessimistic_transaction.h    | 19 +++++++++++++++----
 utilities/transactions/transaction_test.cc    | 10 ++++++++++
 4 files changed, 31 insertions(+), 4 deletions(-)
 create mode 100644 unreleased_history/behavior_changes/get-waiting-txns-after-timeout.md

diff --git a/unreleased_history/behavior_changes/get-waiting-txns-after-timeout.md b/unreleased_history/behavior_changes/get-waiting-txns-after-timeout.md
new file mode 100644
index 000000000000..71ace60e9e3a
--- /dev/null
+++ b/unreleased_history/behavior_changes/get-waiting-txns-after-timeout.md
@@ -0,0 +1 @@
+* PessimisticTransaction::GetWaitingTxns now returns waiting transaction information even if the current transaction has timed out. This allows the information to be surfaced to users for debugging purposes once it is known that the timeout has occured.
diff --git a/utilities/transactions/lock/point/point_lock_manager.cc b/utilities/transactions/lock/point/point_lock_manager.cc
index 6f9d95aefa90..9e0426429cff 100644
--- a/utilities/transactions/lock/point/point_lock_manager.cc
+++ b/utilities/transactions/lock/point/point_lock_manager.cc
@@ -359,6 +359,11 @@ Status PointLockManager::AcquireWithTimeout(
 
   stripe->stripe_mutex->UnLock();
 
+  // On timeout, persist the lock information so we can debug the contention
+  if (result.IsTimedOut()) {
+    txn->SetWaitingTxn(wait_ids, column_family_id, &key, true);
+  }
+
   return result;
 }
 
diff --git a/utilities/transactions/pessimistic_transaction.h b/utilities/transactions/pessimistic_transaction.h
index dd166bd080ad..b55d69685dd1 100644
--- a/utilities/transactions/pessimistic_transaction.h
+++ b/utilities/transactions/pessimistic_transaction.h
@@ -71,18 +71,26 @@ class PessimisticTransaction : public TransactionBaseImpl {
                                             std::string* key) const override {
     std::lock_guard<std::mutex> lock(wait_mutex_);
     std::vector<TransactionID> ids(waiting_txn_ids_.size());
-    if (key) *key = waiting_key_ ? *waiting_key_ : "";
+    if (timed_out_key_.has_value()) {
+      if (key) *key = timed_out_key_.value();
+    } else {
+      if (key) *key = waiting_key_ ? *waiting_key_ : "";
+    }
     if (column_family_id) *column_family_id = waiting_cf_id_;
     std::copy(waiting_txn_ids_.begin(), waiting_txn_ids_.end(), ids.begin());
     return ids;
   }
 
   void SetWaitingTxn(autovector<TransactionID> ids, uint32_t column_family_id,
-                     const std::string* key) {
+                     const std::string* key, bool is_timed_out = false) {
     std::lock_guard<std::mutex> lock(wait_mutex_);
     waiting_txn_ids_ = ids;
     waiting_cf_id_ = column_family_id;
-    waiting_key_ = key;
+    if (is_timed_out) {
+      timed_out_key_ = key ? *key : "";
+    } else {
+      waiting_key_ = key;
+    }
   }
 
   void ClearWaitingTxn() {
@@ -182,7 +190,7 @@ class PessimisticTransaction : public TransactionBaseImpl {
 
   // IDs for the transactions that are blocking the current transaction.
   //
-  // empty if current transaction is not waiting.
+  // empty if current transaction is not waiting or has timed out
   autovector<TransactionID> waiting_txn_ids_;
 
   // The following two represents the (cf, key) that a transaction is waiting
@@ -196,6 +204,9 @@ class PessimisticTransaction : public TransactionBaseImpl {
   uint32_t waiting_cf_id_;
   const std::string* waiting_key_;
 
+  // Waiting key with lifetime of the txn so it can be accessed after timeouts
+  std::optional<std::string> timed_out_key_;
+
   // Mutex protecting waiting_txn_ids_, waiting_cf_id_ and waiting_key_.
   mutable std::mutex wait_mutex_;
 
diff --git a/utilities/transactions/transaction_test.cc b/utilities/transactions/transaction_test.cc
index 226914733524..83b115711167 100644
--- a/utilities/transactions/transaction_test.cc
+++ b/utilities/transactions/transaction_test.cc
@@ -561,6 +561,16 @@ TEST_P(TransactionTest, WaitingTxn) {
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
 
+  // We expect GetWaitingTxns still returns the waiting values as it would
+  // normally before timeout
+  std::string key;
+  uint32_t cf_id;
+  std::vector<TransactionID> wait = txn2->GetWaitingTxns(&cf_id, &key);
+  ASSERT_EQ(key, "foo");
+  ASSERT_EQ(wait.size(), 1);
+  ASSERT_EQ(wait[0], id1);
+  ASSERT_EQ(cf_id, 0U);
+
   delete cfa;
   delete txn1;
   delete txn2;

From 9758482360de92ec69bd58f6249ea404b7d0a98b Mon Sep 17 00:00:00 2001
From: Anand Ananthabhotla <anand76@meta.com>
Date: Tue, 8 Jul 2025 15:10:10 -0700
Subject: [PATCH 166/500] User defined index builder (#13726)

Summary:
Pull Request resolved: https://github.com/facebook/rocksdb/pull/13726

Add UserDefinedIndexFactory and UserDefinedIndexBuilder interfaces to allow users to plugin custom index implementation into block based table. The factory is specified in BlockBasedTableOptions. If non-null, BlockBasedTableBuilder allocates a wrapper index builder encapsulating the native index and the custom index. The custom index is exposed to BlockBasedTableBuilder as a meta_block of type kUserDefinedIndex. This block type is not compressed.

The IndexBuilder OnKeyAdded interface is enhanced to accept the value in addition to the key. Only full values are supported, and parallel compression is not supported since we cannot obtain the value when calling OnKeyAdded.

Reviewed By: pdillinger

Differential Revision: D76165614

fbshipit-source-id: dfad9cbd6d0359987b7f4abe64cae58c472836f9
---
 include/rocksdb/table.h                       |   9 +
 include/rocksdb/user_defined_index.h          |  88 +++++++
 options/options_settable_test.cc              |   2 +
 .../block_based/block_based_table_builder.cc  |  50 +++-
 .../block_based/block_based_table_factory.cc  |   6 +
 table/block_based/block_based_table_reader.cc |   5 +
 table/block_based/block_type.h                |   1 +
 table/block_based/index_builder.h             |  17 +-
 .../block_based/user_defined_index_wrapper.h  | 126 ++++++++++
 table/external_table.cc                       |   2 +-
 table/sst_file_writer.cc                      |   6 +-
 table/table_test.cc                           | 237 ++++++++++++++++++
 12 files changed, 533 insertions(+), 16 deletions(-)
 create mode 100644 include/rocksdb/user_defined_index.h
 create mode 100644 table/block_based/user_defined_index_wrapper.h

diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h
index cc6368fe0c27..fb2b1c16adb4 100644
--- a/include/rocksdb/table.h
+++ b/include/rocksdb/table.h
@@ -44,6 +44,7 @@ class TableReader;
 class WritableFileWriter;
 struct ConfigOptions;
 struct EnvOptions;
+class UserDefinedIndexFactory;
 
 // Types of checksums to use for checking integrity of logical blocks within
 // files. All checksums currently use 32 bits of checking power (1 in 4B
@@ -492,8 +493,16 @@ struct BlockBasedTableOptions {
   // Because filters only impact performance and are not data-critical, an
   // SST file can be opened and used without filters if (a) the filter
   // policy name or schema is unrecognized, or (b) filter_policy is nullptr.
+  // See filter_policy regarding filters.
   std::shared_ptr<const FilterPolicy> filter_policy = nullptr;
 
+  // EXPERIMENTAL
+  //
+  // If non-nullptr, use the specified factory to build user-defined index.
+  // This allows users to define their own index format and build the index
+  // during table building.
+  std::shared_ptr<UserDefinedIndexFactory> user_defined_index_factory = nullptr;
+
   // If true, place whole keys in the filter (not just prefixes).
   // This must generally be true for gets to be efficient.
   bool whole_key_filtering = true;
diff --git a/include/rocksdb/user_defined_index.h b/include/rocksdb/user_defined_index.h
new file mode 100644
index 000000000000..a4fd5c90ae09
--- /dev/null
+++ b/include/rocksdb/user_defined_index.h
@@ -0,0 +1,88 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+//  *****************************************************************
+//  EXPERIMENTAL - subject to change while under development
+//  *****************************************************************
+
+#pragma once
+
+#include <string>
+
+#include "rocksdb/customizable.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Prefix for user-defined index block names
+inline const std::string kUserDefinedIndexPrefix =
+    "rocksdb.user_defined_index.";
+
+// This is a public API for user-defined index builders.
+// It allows users to define their own index format and build custom
+// indexes during table building.
+
+// The interface for building user-defined index.
+class UserDefinedIndexBuilder {
+ public:
+  // Right now, we only support Puts. In the future, we may support merges,
+  // deletions etc.
+  enum ValueType {
+    kValue,
+    kTypeMax,
+  };
+
+  // File offset and size of the data block
+  struct BlockHandle {
+    uint64_t offset;
+    uint64_t size;
+  };
+
+  virtual ~UserDefinedIndexBuilder() = default;
+
+  // Add a new index entry to index block. The key for the new index entry
+  // should be >= last_key_in_current_block and < first_key_in_next_block.
+  // The previous index entry key and the new index entry key cover
+  // all the keys in the data block associated with the new index entry.
+  //
+  // Called before the OnKeyAdded() call for first_key_in_next_block.
+  // @last_key_in_current_block: The last key in the current data block
+  // @first_key_in_next_block: it will be nullptr if the entry being added is
+  //                           the last one in the table
+  // @separator_scratch: a scratch buffer to back a computed separator between
+  //                     those, as needed. May be modified on each call.
+  // @return: the key or separator stored in the index, which could be
+  //          last_key_in_current_block or a computed separator backed by
+  //          separator_scratch.
+  virtual Slice AddIndexEntry(const Slice& last_key_in_current_block,
+                              const Slice* first_key_in_next_block,
+                              const BlockHandle& block_handle,
+                              std::string* separator_scratch) = 0;
+
+  // This method will be called whenever a key is added. The subclasses may
+  // override OnKeyAdded() if they need to collect additional information.
+  // The type argument indicates whether the value is a full value or partial.
+  // At the moment, only full values are supported.
+  virtual void OnKeyAdded(const Slice& /*key*/, ValueType /*type*/,
+                          const Slice& /*value*/) {}
+
+  // Finish building the index.
+  // Returns a Status and the serialized index contents.
+  // The memory backing the contents should not be freed until this builder
+  // object is destructed.
+  virtual Status Finish(Slice* index_contents) = 0;
+};
+
+// Factory for creating user-defined index builders.
+class UserDefinedIndexFactory : public Customizable {
+ public:
+  virtual ~UserDefinedIndexFactory() = default;
+
+  // Create a new builder for user-defined index.
+  virtual UserDefinedIndexBuilder* NewBuilder() const = 0;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc
index 05a86b6d690c..5b099ab4d367 100644
--- a/options/options_settable_test.cc
+++ b/options/options_settable_test.cc
@@ -129,6 +129,8 @@ TEST_F(OptionsSettableTest, BlockBasedTableOptionsAllFieldsSettable) {
        sizeof(CacheUsageOptions)},
       {offsetof(struct BlockBasedTableOptions, filter_policy),
        sizeof(std::shared_ptr<const FilterPolicy>)},
+      {offsetof(struct BlockBasedTableOptions, user_defined_index_factory),
+       sizeof(std::shared_ptr<UserDefinedIndexFactory>)},
   };
 
   // In this test, we catch a new option of BlockBasedTableOptions that is not
diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc
index 89888d5d71d4..57e8ebd4e837 100644
--- a/table/block_based/block_based_table_builder.cc
+++ b/table/block_based/block_based_table_builder.cc
@@ -46,6 +46,7 @@
 #include "table/block_based/filter_policy_internal.h"
 #include "table/block_based/full_filter_block.h"
 #include "table/block_based/partitioned_filter_block.h"
+#include "table/block_based/user_defined_index_wrapper.h"
 #include "table/format.h"
 #include "table/meta_blocks.h"
 #include "table/table_builder.h"
@@ -408,7 +409,15 @@ struct BlockBasedTableBuilder::ParallelCompressionRep {
     }
   }
 
-  ~ParallelCompressionRep() { block_rep_pool.finish(); }
+  ~ParallelCompressionRep() {
+    block_rep_pool.finish();
+#ifndef NDEBUG
+    // Silence ASSERT_STATUS_CHECKED warnings
+    for (auto& block_rep : block_rep_buf) {
+      assert(block_rep.status.ok());
+    }
+#endif
+  }
 
   // Make a block prepared to be emitted to compression thread
   // Used in non-buffered mode
@@ -889,6 +898,27 @@ struct BlockBasedTableBuilder::Rep {
           &this->internal_prefix_transform, use_delta_encoding_for_index_values,
           table_options, ts_sz, persist_user_defined_timestamps));
     }
+
+    // If user_defined_index_factory is provided, wrap the index builder with
+    // UserDefinedIndexWrapper
+    if (table_options.user_defined_index_factory != nullptr) {
+      if (tbo.moptions.compression_opts.parallel_threads > 1 ||
+          tbo.moptions.bottommost_compression_opts.parallel_threads > 1) {
+        SetStatus(
+            Status::InvalidArgument("user_defined_index_factory not supported "
+                                    "with parallel compression"));
+      } else {
+        std::unique_ptr<UserDefinedIndexBuilder> user_defined_index_builder(
+            table_options.user_defined_index_factory->NewBuilder());
+        if (user_defined_index_builder != nullptr) {
+          index_builder.reset(new UserDefinedIndexBuilderWrapper(
+              std::string(table_options.user_defined_index_factory->Name()),
+              std::move(index_builder), std::move(user_defined_index_builder),
+              &internal_comparator, ts_sz, persist_user_defined_timestamps));
+        }
+      }
+    }
+
     if (ioptions.optimize_filters_for_hits && tbo.is_bottommost) {
       // Apply optimize_filters_for_hits setting here when applicable by
       // skipping filter generation
@@ -1192,7 +1222,7 @@ void BlockBasedTableBuilder::Add(const Slice& ikey, const Slice& value) {
       // `Finish()` once compression dictionary has been finalized.
     } else {
       if (!r->IsParallelCompressionEnabled()) {
-        r->index_builder->OnKeyAdded(ikey);
+        r->index_builder->OnKeyAdded(ikey, value);
       }
     }
     // TODO offset passed in is not accurate for parallel compression case
@@ -1634,7 +1664,7 @@ void BlockBasedTableBuilder::BGWorkWriteMaybeCompressedBlock() {
         r->filter_builder->AddWithPrevKey(key_no_ts, prev_key_no_ts);
         prev_key_no_ts = key_no_ts;
       }
-      r->index_builder->OnKeyAdded(key);
+      r->index_builder->OnKeyAdded(key, {});
     }
     if (r->filter_builder != nullptr) {
       prev_block_last_key_no_ts.assign(prev_key_no_ts.data(),
@@ -1808,7 +1838,13 @@ void BlockBasedTableBuilder::WriteIndexBlock(
   if (ok()) {
     for (const auto& item : index_blocks.meta_blocks) {
       BlockHandle block_handle;
-      WriteBlock(item.second, &block_handle, BlockType::kIndex);
+      if (item.second.first == BlockType::kIndex) {
+        WriteBlock(item.second.second, &block_handle, item.second.first);
+      } else {
+        assert(item.second.first == BlockType::kUserDefinedIndex);
+        WriteMaybeCompressedBlock(item.second.second, kNoCompression,
+                                  &block_handle, item.second.first);
+      }
       if (!ok()) {
         break;
       }
@@ -1854,8 +1890,8 @@ void BlockBasedTableBuilder::WriteIndexBlock(
     }
   }
   // If success and need to record in metaindex rather than footer...
-  if (!FormatVersionUsesIndexHandleInFooter(
-          rep_->table_options.format_version)) {
+  if (ok() && !FormatVersionUsesIndexHandleInFooter(
+                  rep_->table_options.format_version)) {
     meta_index_builder->Add(kIndexBlockName, *index_block_handle);
   }
 }
@@ -2184,7 +2220,7 @@ void BlockBasedTableBuilder::EnterUnbuffered() {
           r->filter_builder->Add(
               ExtractUserKeyAndStripTimestamp(key, r->ts_sz));
         }
-        r->index_builder->OnKeyAdded(key);
+        r->index_builder->OnKeyAdded(key, iter->value());
       }
       WriteBlock(Slice(data_block), &r->pending_handle, BlockType::kData);
       if (ok() && i + 1 < r->data_block_buffers.size()) {
diff --git a/table/block_based/block_based_table_factory.cc b/table/block_based/block_based_table_factory.cc
index 1bf18f0b9f84..ee4d941c7297 100644
--- a/table/block_based/block_based_table_factory.cc
+++ b/table/block_based/block_based_table_factory.cc
@@ -690,6 +690,12 @@ Status BlockBasedTableFactory::ValidateOptions(
         "data_block_hash_table_util_ratio should be greater than 0 when "
         "data_block_index_type is set to kDataBlockBinaryAndHash");
   }
+  if (table_options_.user_defined_index_factory &&
+      (cf_opts.compression_opts.parallel_threads > 1 ||
+       cf_opts.bottommost_compression_opts.parallel_threads > 1)) {
+    return Status::InvalidArgument(
+        "user_defined_index_factory not supported with parallel compression");
+  }
   if (db_opts.unordered_write && cf_opts.max_successive_merges > 0) {
     // TODO(myabandeh): support it
     return Status::InvalidArgument(
diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc
index 6c36e5d47232..13fee36bf56f 100644
--- a/table/block_based/block_based_table_reader.cc
+++ b/table/block_based/block_based_table_reader.cc
@@ -46,6 +46,7 @@
 #include "rocksdb/table.h"
 #include "rocksdb/table_properties.h"
 #include "rocksdb/trace_record.h"
+#include "rocksdb/user_defined_index.h"
 #include "table/block_based/binary_search_index_reader.h"
 #include "table/block_based/block.h"
 #include "table/block_based/block_based_table_factory.h"
@@ -2747,6 +2748,10 @@ BlockType BlockBasedTable::GetBlockTypeForMetaBlockByName(
     return BlockType::kIndex;
   }
 
+  if (meta_block_name.starts_with(kUserDefinedIndexPrefix)) {
+    return BlockType::kUserDefinedIndex;
+  }
+
   if (meta_block_name.starts_with(kObsoleteFilterBlockPrefix)) {
     // Obsolete but possible in old files
     return BlockType::kInvalid;
diff --git a/table/block_based/block_type.h b/table/block_based/block_type.h
index a9d6a1a773b4..0098491c5dfc 100644
--- a/table/block_based/block_type.h
+++ b/table/block_based/block_type.h
@@ -27,6 +27,7 @@ enum class BlockType : uint8_t {
   kHashIndexMetadata,
   kMetaIndex,
   kIndex,
+  kUserDefinedIndex,
   // Note: keep kInvalid the last value when adding new enum values.
   kInvalid
 };
diff --git a/table/block_based/index_builder.h b/table/block_based/index_builder.h
index 99b348b2ff1d..14388abf1827 100644
--- a/table/block_based/index_builder.h
+++ b/table/block_based/index_builder.h
@@ -46,7 +46,7 @@ class IndexBuilder {
   //     primary index.
   struct IndexBlocks {
     Slice index_block_contents;
-    std::unordered_map<std::string, Slice> meta_blocks;
+    std::unordered_map<std::string, std::pair<BlockType, Slice>> meta_blocks;
   };
   IndexBuilder(const InternalKeyComparator* comparator, size_t ts_sz,
                bool persist_user_defined_timestamps)
@@ -78,7 +78,8 @@ class IndexBuilder {
 
   // This method will be called whenever a key is added. The subclasses may
   // override OnKeyAdded() if they need to collect additional information.
-  virtual void OnKeyAdded(const Slice& /*key*/) {}
+  virtual void OnKeyAdded(const Slice& /*key*/,
+                          const std::optional<Slice>& /*value*/) {}
 
   // Inform the index builder that all entries has been written. Block builder
   // may therefore perform any operation required for block finalization.
@@ -180,7 +181,8 @@ class ShortenedIndexBuilder : public IndexBuilder {
     seperator_is_key_plus_seq_ = (format_version <= 2);
   }
 
-  void OnKeyAdded(const Slice& key) override {
+  void OnKeyAdded(const Slice& key,
+                  const std::optional<Slice>& /*value*/) override {
     if (include_first_key_ && current_block_first_internal_key_.empty()) {
       current_block_first_internal_key_.assign(key.data(), key.size());
     }
@@ -358,7 +360,8 @@ class HashIndexBuilder : public IndexBuilder {
         separator_scratch);
   }
 
-  void OnKeyAdded(const Slice& key) override {
+  void OnKeyAdded(const Slice& key,
+                  const std::optional<Slice>& /*value*/) override {
     auto key_prefix = hash_key_extractor_->Transform(key);
     bool is_first_entry = pending_block_num_ == 0;
 
@@ -393,9 +396,9 @@ class HashIndexBuilder : public IndexBuilder {
     Status s = primary_index_builder_.Finish(index_blocks,
                                              last_partition_block_handle);
     index_blocks->meta_blocks.insert(
-        {kHashIndexPrefixesBlock.c_str(), prefix_block_});
-    index_blocks->meta_blocks.insert(
-        {kHashIndexPrefixesMetadataBlock.c_str(), prefix_meta_block_});
+        {kHashIndexPrefixesBlock.c_str(), {BlockType::kIndex, prefix_block_}});
+    index_blocks->meta_blocks.insert({kHashIndexPrefixesMetadataBlock.c_str(),
+                                      {BlockType::kIndex, prefix_meta_block_}});
     return s;
   }
 
diff --git a/table/block_based/user_defined_index_wrapper.h b/table/block_based/user_defined_index_wrapper.h
new file mode 100644
index 000000000000..06bb75d03d82
--- /dev/null
+++ b/table/block_based/user_defined_index_wrapper.h
@@ -0,0 +1,126 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "rocksdb/user_defined_index.h"
+#include "table/block_based/block_type.h"
+#include "table/block_based/index_builder.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// UserDefinedIndexWrapper wraps around the existing index types in block based
+// table, and supports plugging in an additional user defined index. The wrapper
+// class forwards calls to both the wrapped internal index, and a user defined
+// index builder.
+class UserDefinedIndexBuilderWrapper : public IndexBuilder {
+ public:
+  UserDefinedIndexBuilderWrapper(
+      const std::string& name,
+      std::unique_ptr<IndexBuilder> internal_index_builder,
+      std::unique_ptr<UserDefinedIndexBuilder> user_defined_index_builder,
+      const InternalKeyComparator* comparator, size_t ts_sz,
+      bool persist_user_defined_timestamps)
+      : IndexBuilder(comparator, ts_sz, persist_user_defined_timestamps),
+        name_(name),
+        internal_index_builder_(std::move(internal_index_builder)),
+        user_defined_index_builder_(std::move(user_defined_index_builder)) {}
+
+  // Note: We don't provide a simplified constructor that tries to extract
+  // parameters from internal_index_builder because IndexBuilder's members are
+  // protected and there are no accessor methods to get them
+
+  ~UserDefinedIndexBuilderWrapper() override = default;
+
+  Slice AddIndexEntry(const Slice& last_key_in_current_block,
+                      const Slice* first_key_in_next_block,
+                      const BlockHandle& block_handle,
+                      std::string* separator_scratch) override {
+    UserDefinedIndexBuilder::BlockHandle handle;
+    handle.offset = block_handle.offset();
+    handle.size = block_handle.size();
+    // Forward the call to both index builders
+    user_defined_index_builder_->AddIndexEntry(last_key_in_current_block,
+                                               first_key_in_next_block, handle,
+                                               separator_scratch);
+    return internal_index_builder_->AddIndexEntry(
+        last_key_in_current_block, first_key_in_next_block, block_handle,
+        separator_scratch);
+  }
+
+  void OnKeyAdded(const Slice& key,
+                  const std::optional<Slice>& value) override {
+    if (status_.ok()) {
+      if (!value.has_value()) {
+        status_ = Status::InvalidArgument(
+            "user_defined_index_factory not supported with parallel "
+            "compression");
+      } else {
+        ParsedInternalKey pkey;
+        status_ = ParseInternalKey(key, &pkey, /*lof_err_key*/ false);
+        if (status_.ok() && pkey.type != ValueType::kTypeValue) {
+          status_ = Status::InvalidArgument(
+              "user_defined_index_factory only supported with Puts");
+        }
+      }
+    }
+    if (!status_.ok()) {
+      return;
+    }
+
+    // Forward the call to both index builders
+    internal_index_builder_->OnKeyAdded(key, value);
+    user_defined_index_builder_->OnKeyAdded(
+        key, UserDefinedIndexBuilder::ValueType::kValue, value.value());
+  }
+
+  Status Finish(IndexBlocks* index_blocks,
+                const BlockHandle& last_partition_block_handle) override {
+    if (!status_.ok()) {
+      return status_;
+    }
+
+    // Finish the internal index builder
+    status_ = internal_index_builder_->Finish(index_blocks,
+                                              last_partition_block_handle);
+    if (!status_.ok()) {
+      return status_;
+    }
+
+    // Finish the user defined index builder
+    Slice user_index_contents;
+    status_ = user_defined_index_builder_->Finish(&user_index_contents);
+    if (!status_.ok()) {
+      return status_;
+    }
+
+    // Add the user defined index to the meta blocks
+    std::string block_name = kUserDefinedIndexPrefix + name_;
+    index_blocks->meta_blocks.insert(
+        {block_name, {BlockType::kUserDefinedIndex, user_index_contents}});
+
+    index_size_ = internal_index_builder_->IndexSize();
+    return status_;
+  }
+
+  size_t IndexSize() const override { return index_size_; }
+
+  bool seperator_is_key_plus_seq() override {
+    return internal_index_builder_->seperator_is_key_plus_seq();
+  }
+
+ private:
+  const std::string name_;
+  std::unique_ptr<IndexBuilder> internal_index_builder_;
+  std::unique_ptr<UserDefinedIndexBuilder> user_defined_index_builder_;
+  Status status_;
+};
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/table/external_table.cc b/table/external_table.cc
index a85073737bf8..8835d7e013a3 100644
--- a/table/external_table.cc
+++ b/table/external_table.cc
@@ -303,7 +303,7 @@ class ExternalTableBuilderAdapter : public TableBuilder {
         properties_.num_entries++;
         properties_.raw_key_size += key.size();
         properties_.raw_value_size += value.size();
-        NotifyCollectTableCollectorsOnAdd(key, value, /*offset=*/0,
+        NotifyCollectTableCollectorsOnAdd(key, value, /*file_size=*/0,
                                           table_properties_collectors_,
                                           ioptions_.logger);
       }
diff --git a/table/sst_file_writer.cc b/table/sst_file_writer.cc
index 8d1b03380d40..9343b6feed91 100644
--- a/table/sst_file_writer.cc
+++ b/table/sst_file_writer.cc
@@ -472,6 +472,7 @@ Status SstFileWriter::Finish(ExternalSstFileInfo* file_info) {
   }
   if (r->file_info.num_entries == 0 &&
       r->file_info.num_range_del_entries == 0) {
+    r->builder->status().PermitUncheckedError();
     return Status::InvalidArgument("Cannot create sst file with no entries");
   }
 
@@ -495,7 +496,10 @@ Status SstFileWriter::Finish(ExternalSstFileInfo* file_info) {
         r->file_writer->GetFileChecksumFuncName();
   }
   if (!s.ok()) {
-    r->ioptions.env->DeleteFile(r->file_info.file_path);
+    Status status = r->ioptions.env->DeleteFile(r->file_info.file_path);
+    // Silence ASSERT_STATUS_CHECKED warning
+    assert(status.ok());
+    ;
   }
 
   if (file_info != nullptr) {
diff --git a/table/table_test.cc b/table/table_test.cc
index faa339e824fa..357ef20dcf8b 100644
--- a/table/table_test.cc
+++ b/table/table_test.cc
@@ -29,6 +29,7 @@
 #include "db/write_batch_internal.h"
 #include "memtable/stl_wrappers.h"
 #include "monitoring/statistics_impl.h"
+#include "options/cf_options.h"
 #include "options/options_helper.h"
 #include "port/port.h"
 #include "port/stack_trace.h"
@@ -51,6 +52,7 @@
 #include "rocksdb/table_properties.h"
 #include "rocksdb/trace_record.h"
 #include "rocksdb/unique_id.h"
+#include "rocksdb/user_defined_index.h"
 #include "rocksdb/write_buffer_manager.h"
 #include "table/block_based/block.h"
 #include "table/block_based/block_based_table_builder.h"
@@ -7397,6 +7399,241 @@ TEST_F(ExternalTableTest, IngestionTest) {
   ASSERT_OK(db->Close());
 }
 
+class UserDefinedIndexTest : public BlockBasedTableTestBase {
+ public:
+  class CustomFlushBlockPolicy : public FlushBlockPolicy {
+   public:
+    explicit CustomFlushBlockPolicy(int keys_per_block)
+        : keys_in_current_block_(0), keys_per_block_(keys_per_block) {}
+
+    bool Update(const Slice& /*key*/, const Slice& /*value*/) override {
+      keys_in_current_block_++;
+      if (keys_in_current_block_ >= keys_per_block_) {
+        keys_in_current_block_ = 0;
+        return true;
+      }
+      return false;
+    }
+
+   private:
+    int keys_in_current_block_;
+    int keys_per_block_;
+  };
+
+  class CustomFlushBlockPolicyFactory : public FlushBlockPolicyFactory {
+   public:
+    CustomFlushBlockPolicyFactory(int keys_per_block = 3)
+        : keys_per_block_(keys_per_block) {}
+    const char* Name() const override { return "CustomFlushBlockPolicy"; }
+    FlushBlockPolicy* NewFlushBlockPolicy(const BlockBasedTableOptions&,
+                                          const BlockBuilder&) const override {
+      return new CustomFlushBlockPolicy(keys_per_block_);
+    }
+    int keys_per_block_;
+  };
+
+ public:
+  class TestUserDefinedIndexBuilder : public UserDefinedIndexBuilder {
+   public:
+    TestUserDefinedIndexBuilder() : entries_added_(0), keys_added_(0) {}
+
+    Slice AddIndexEntry(const Slice& last_key_in_current_block,
+                        const Slice* first_key_in_next_block,
+                        const BlockHandle& block_handle,
+                        std::string* separator_scratch) override {
+      // Unused parameters
+      (void)first_key_in_next_block;
+      (void)separator_scratch;
+      entries_added_++;
+      // Store the block handle for each key
+      PutFixed64(&index_data_[last_key_in_current_block.ToString()],
+                 block_handle.offset);
+      PutFixed64(&index_data_[last_key_in_current_block.ToString()],
+                 block_handle.size);
+      PutFixed32(&index_data_[last_key_in_current_block.ToString()],
+                 keys_added_);
+      keys_added_ = 0;
+      return last_key_in_current_block;
+    }
+
+    void OnKeyAdded(const Slice& /*key*/, ValueType /*value*/,
+                    const Slice& /*value*/) override {
+      // Track keys added to the index
+      keys_added_++;
+    }
+
+    Status Finish(Slice* index_contents) override {
+      // Serialize the index data
+      std::string result;
+      for (const auto& entry : index_data_) {
+        PutLengthPrefixedSlice(&result, entry.first);
+        result.append(entry.second);
+      }
+      index_contents_data_ = result;
+      *index_contents = index_contents_data_;
+      return Status::OK();
+    }
+
+    int GetEntriesAdded() const { return entries_added_; }
+
+   private:
+    int entries_added_;
+    std::map<std::string, std::string> index_data_;
+    uint32_t keys_added_;
+    std::string index_contents_data_;
+  };
+
+  class TestUserDefinedIndexFactory : public UserDefinedIndexFactory {
+   public:
+    const char* Name() const override { return "test_index"; }
+    UserDefinedIndexBuilder* NewBuilder() const override {
+      return new TestUserDefinedIndexBuilder();
+    }
+  };
+};
+
+TEST_F(UserDefinedIndexTest, BasicTest) {
+  Options options;
+  BlockBasedTableOptions table_options;
+  std::string dbname = test::PerThreadDBPath("user_defined_index_test");
+  std::string ingest_file = dbname + "test.sst";
+
+  // Set up the user-defined index factory
+  auto user_defined_index_factory =
+      std::make_shared<TestUserDefinedIndexFactory>();
+  table_options.user_defined_index_factory = user_defined_index_factory;
+
+  // Set up custom flush block policy that flushes every 3 keys
+  table_options.flush_block_policy_factory =
+      std::make_shared<CustomFlushBlockPolicyFactory>();
+
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  std::unique_ptr<SstFileWriter> writer;
+  writer.reset(new SstFileWriter(EnvOptions(), options));
+  ASSERT_OK(writer->Open(ingest_file));
+
+  // Add 100 keys instead of just 5
+  for (int i = 0; i < 100; i++) {
+    std::stringstream ss;
+    ss << std::setw(2) << std::setfill('0') << i;
+    std::string key = "key" + ss.str();
+    std::string value = "value" + ss.str();
+    ASSERT_OK(writer->Put(key, value));
+  }
+  ASSERT_OK(writer->Finish());
+  writer.reset();
+
+  ImmutableOptions ioptions(options);
+  MutableCFOptions moptions((ColumnFamilyOptions(options)));
+  EnvOptions eoptions(options);
+  TableReaderOptions toptions(
+      ioptions, moptions.prefix_extractor, /*compression_manager*/ nullptr,
+      eoptions, ioptions.internal_comparator,
+      moptions.block_protection_bytes_per_key,
+      /*skip_filters*/ false, /*immortal*/ false,
+      /*force_direct_prefetch*/ false, /*level*/ -1,
+      /*block_cache_tracer*/ nullptr,
+      /*max_file_size_for_l0_meta_pin*/ 0, /*cur_db_session_id*/ "",
+      /*cur_file_num*/ 0,
+      /* unique_id */ {}, /* largest_seqno */ 0,
+      /* tail_size */ 0, ioptions.persist_user_defined_timestamps);
+  // Verify that the user-defined index was created
+  std::string meta_block_name = kUserDefinedIndexPrefix + "test_index";
+  BlockHandle block_handle;
+  uint64_t file_size = 0;
+  std::unique_ptr<FSRandomAccessFile> file;
+  std::unique_ptr<RandomAccessFileReader> file_reader;
+  const auto& fs = options.env->GetFileSystem();
+  ASSERT_OK(fs->GetFileSize(ingest_file, IOOptions(), &file_size, nullptr));
+  ASSERT_OK(fs->NewRandomAccessFile(ingest_file, eoptions, &file, nullptr));
+  file_reader.reset(new RandomAccessFileReader(std::move(file), ingest_file));
+  ASSERT_OK(FindMetaBlockInFile(file_reader.get(), file_size,
+                                kBlockBasedTableMagicNumber, ioptions,
+                                ReadOptions(), meta_block_name, &block_handle));
+  file_reader.reset();
+  // With our custom flush policy that flushes every 3 keys,
+  // we expect around 34 data blocks (100/3 rounded up)
+  // Verify the number of entries in the user-defined index
+  // Each data block should have an entry in the index
+  // With our flush policy of 3 keys per block, we expect around 34 entries
+  int expected_entries = (100 + 2) / 3;  // Ceiling of 100/3
+  ASSERT_GE(block_handle.size(),
+            expected_entries);  // At least this many entries
+
+  std::unique_ptr<SstFileReader> reader(new SstFileReader(options));
+  ASSERT_OK(reader->Open(ingest_file));
+
+  ReadOptions ro;
+  std::unique_ptr<Iterator> iter(reader->NewIterator(ro));
+  ASSERT_NE(iter, nullptr);
+
+  // Test that we can read all the keys
+  int key_count = 0;
+  for (iter->SeekToFirst(); iter->Valid() && iter->status().ok();
+       iter->Next()) {
+    key_count++;
+  }
+  ASSERT_EQ(key_count, 100);  // We added 100 keys
+  ASSERT_OK(iter->status());
+}
+
+TEST_F(UserDefinedIndexTest, InvalidArgumentTest1) {
+  Options options;
+  BlockBasedTableOptions table_options;
+  std::string dbname = test::PerThreadDBPath("user_defined_index_test");
+  std::string ingest_file = dbname + "test.sst";
+
+  // Set up the user-defined index factory
+  auto user_defined_index_factory =
+      std::make_shared<TestUserDefinedIndexFactory>();
+  table_options.user_defined_index_factory = user_defined_index_factory;
+
+  // Set up custom flush block policy that flushes every 3 keys
+  table_options.flush_block_policy_factory =
+      std::make_shared<CustomFlushBlockPolicyFactory>();
+
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.compression_opts.parallel_threads = 10;
+
+  std::unique_ptr<SstFileWriter> writer;
+  writer.reset(new SstFileWriter(EnvOptions(), options));
+  ASSERT_OK(writer->Open(ingest_file));
+
+  std::string key = "foo";
+  std::string value = "bar";
+  ASSERT_EQ(writer->Put(key, value), Status::InvalidArgument());
+  ASSERT_EQ(writer->Finish(), Status::InvalidArgument());
+  writer.reset();
+}
+
+TEST_F(UserDefinedIndexTest, InvalidArgumentTest2) {
+  Options options;
+  BlockBasedTableOptions table_options;
+  std::string dbname = test::PerThreadDBPath("user_defined_index_test");
+  std::string ingest_file = dbname + "test.sst";
+
+  // Set up the user-defined index factory
+  auto user_defined_index_factory =
+      std::make_shared<TestUserDefinedIndexFactory>();
+  table_options.user_defined_index_factory = user_defined_index_factory;
+
+  // Set up custom flush block policy that flushes every 3 keys
+  table_options.flush_block_policy_factory =
+      std::make_shared<CustomFlushBlockPolicyFactory>();
+
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  std::unique_ptr<SstFileWriter> writer;
+  writer.reset(new SstFileWriter(EnvOptions(), options));
+  ASSERT_OK(writer->Open(ingest_file));
+
+  std::string key = "foo";
+  std::string value = "bar";
+  ASSERT_OK(writer->Merge(key, value));
+  ASSERT_EQ(writer->Finish(), Status::InvalidArgument());
+  writer.reset();
+}
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {

From 29c65d8bffa1dce5f63cacead05e09747b38da06 Mon Sep 17 00:00:00 2001
From: Andrew Chang <andrewrchang@meta.com>
Date: Tue, 8 Jul 2025 15:54:42 -0700
Subject: [PATCH 167/500] Remove stats_ field from SstFileManagerImpl (#13757)

Summary:
`SstFileManager` is supposed to be thread-safe for all of its public methods, but `SetStatisticsPtr` leads to a race condition because the access to `stat_` is not synchronized. We don't use `stat_` internally so we can get rid of it.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13757

Test Plan: Existing unit tests.

Reviewed By: mszeszko-meta

Differential Revision: D77962592

Pulled By: archang19

fbshipit-source-id: e8e56194dda034935ddef44e479243770a73d065
---
 file/sst_file_manager_impl.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/file/sst_file_manager_impl.h b/file/sst_file_manager_impl.h
index 96ec271eee37..b98d8594e851 100644
--- a/file/sst_file_manager_impl.h
+++ b/file/sst_file_manager_impl.h
@@ -162,7 +162,6 @@ class SstFileManagerImpl : public SstFileManager {
   void Close();
 
   void SetStatisticsPtr(const std::shared_ptr<Statistics>& stats) override {
-    stats_ = stats;
     delete_scheduler_.SetStatisticsPtr(stats);
   }
 
@@ -216,7 +215,6 @@ class SstFileManagerImpl : public SstFileManager {
   std::list<ErrorHandler*> error_handler_list_;
   // Pointer to ErrorHandler instance that is currently processing recovery
   ErrorHandler* cur_instance_;
-  std::shared_ptr<Statistics> stats_;
 };
 
 }  // namespace ROCKSDB_NAMESPACE

From 11a259a5f0a67fb932f0589d1dc33d56ae6f66c0 Mon Sep 17 00:00:00 2001
From: Xingbo Wang <xbw@fb.com>
Date: Wed, 9 Jul 2025 10:40:28 -0700
Subject: [PATCH 168/500] Support GetFileSize API in FSRandomAccessFile
 (#13676)

Summary:
Add file size validation in ReadFooterFromFile function.
    Deprecate skip_checking_sst_file_sizes_on_db_open option.
    This change is used to address this issue
    https://github.com/facebook/rocksdb/issues/13619
    It supports file size validation in ReadFooterFromFile. In favor of this
    change, CheckConsistency function and
    skip_checking_sst_file_sizes_on_db_open flag are deprecated.

    The CheckConsistency function checks each file size matches what was
    recorded in manifest during DB open. Meantime, ReadFooterFromFile was
    called for each file in LoadTables function. Since ReadFooterFromFile
    always validates file size, the CheckConsistency is redundant.

    In addtion, CheckConsistency is executed in a single thread. This could
    slow down DB open when a network file system is used. Therefore, the
    flag skip_checking_sst_file_sizes_on_db_open was added to skip this
    check. After this change, ReadFooterFromFile was executed in parallel
    through multiple threads. Therefore, the concern of DB open slowness is
    eliminated, and the flag could be deprecated.

    When paranoid check flag is set to true, corrupted file will fail to open the DB.
    When paranoid check flag is set to false, DB will still be able to open, the
    healthy ones can be accessed, while the corrupted ones not.

    There is 2 slight concerns of this change.

    *If max_open_files is set with smaller value, engine will not open all
    the files during DB open. This means if there is a corruption on file
    size, it will not be detected during DB open, but rather at a later
    time. Since the default is -1, which means open all the files, and it is
    rarely overridden and a lot of new features rely on it to be -1, the
    risk is very low.

    *If FIFO compaction is used, engine could fail to open DB unnecessarily
    on the corrupted files that would never be used again. However, this is
    a very rare case as well. The error could still be ignored by setting
    paranoid_checks operationally. The risk is very low.

    To remain backward compatibility. The public facing flag was kept and
    marked as no-op internally. Another change is required to fully remove
    the flag.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13676

Test Plan:
make check
    A new unit test was added to validate file size check API works as
    expected.

Reviewed By: pdillinger

Differential Revision: D76168033

Pulled By: xingbowang

fbshipit-source-id: 8ceacf39bcfe02ff7aa289868c341366ee9f3a8e
---
 db/c_test.c                                   | 12 ---
 db/corruption_test.cc                         | 77 ++++++++++++++++-
 db/db_encryption_test.cc                      | 43 +++++++++-
 db/db_impl/db_impl.cc                         | 79 -----------------
 db/db_impl/db_impl.h                          |  4 -
 db/db_impl/db_impl_follower.cc                |  3 -
 db/db_impl/db_impl_open.cc                    |  9 --
 db/db_impl/db_impl_secondary.cc               | 46 ----------
 db/db_impl/db_impl_secondary.h                |  6 --
 db/db_secondary_test.cc                       | 43 +---------
 db/db_sst_test.cc                             | 54 ------------
 db/db_test_util.h                             |  8 ++
 db/external_sst_file_ingestion_job.cc         | 18 +++-
 env/composite_env.cc                          |  4 +
 env/env.cc                                    |  4 +
 env/env_test.cc                               | 37 +++++++-
 env/fs_posix.cc                               | 24 +++++-
 env/io_posix.cc                               | 16 ++++
 env/io_posix.h                                |  3 +
 env/mock_env.cc                               |  5 ++
 file/readahead_raf.cc                         |  4 +
 include/rocksdb/env.h                         | 10 +++
 include/rocksdb/env_encryption.h              |  9 ++
 include/rocksdb/file_system.h                 | 12 +++
 include/rocksdb/options.h                     | 20 +++--
 port/win/io_win.cc                            | 20 +++++
 port/win/io_win.h                             |  4 +
 table/format.cc                               | 84 +++++++++++--------
 test_util/testutil.cc                         |  1 -
 test_util/testutil.h                          |  5 ++
 ...ize_api_at_FSRandomAccessFile_interface.md |  1 +
 ...skip_checking_sst_file_sizes_on_db_open.md |  1 +
 utilities/env_mirror.cc                       | 10 +++
 utilities/fault_injection_env.cc              |  5 ++
 utilities/fault_injection_env.h               |  2 +
 utilities/fault_injection_fs.cc               |  8 ++
 utilities/fault_injection_fs.h                | 16 +++-
 37 files changed, 398 insertions(+), 309 deletions(-)
 create mode 100644 unreleased_history/behavior_changes/new_GetFileSize_api_at_FSRandomAccessFile_interface.md
 create mode 100644 unreleased_history/public_api_changes/deprecate_flag_skip_checking_sst_file_sizes_on_db_open.md

diff --git a/db/c_test.c b/db/c_test.c
index 73bdf564706e..4e74651f4690 100644
--- a/db/c_test.c
+++ b/db/c_test.c
@@ -2230,10 +2230,6 @@ int main(int argc, char** argv) {
     rocksdb_options_set_skip_stats_update_on_db_open(o, 1);
     CheckCondition(1 == rocksdb_options_get_skip_stats_update_on_db_open(o));
 
-    rocksdb_options_set_skip_checking_sst_file_sizes_on_db_open(o, 1);
-    CheckCondition(
-        1 == rocksdb_options_get_skip_checking_sst_file_sizes_on_db_open(o));
-
     rocksdb_options_set_max_write_buffer_number(o, 97);
     CheckCondition(97 == rocksdb_options_get_max_write_buffer_number(o));
 
@@ -2493,8 +2489,6 @@ int main(int argc, char** argv) {
     CheckCondition(2.0 ==
                    rocksdb_options_get_max_bytes_for_level_multiplier(copy));
     CheckCondition(1 == rocksdb_options_get_skip_stats_update_on_db_open(copy));
-    CheckCondition(
-        1 == rocksdb_options_get_skip_checking_sst_file_sizes_on_db_open(copy));
     CheckCondition(97 == rocksdb_options_get_max_write_buffer_number(copy));
     CheckCondition(23 ==
                    rocksdb_options_get_min_write_buffer_number_to_merge(copy));
@@ -2681,12 +2675,6 @@ int main(int argc, char** argv) {
     CheckCondition(0 == rocksdb_options_get_skip_stats_update_on_db_open(copy));
     CheckCondition(1 == rocksdb_options_get_skip_stats_update_on_db_open(o));
 
-    rocksdb_options_set_skip_checking_sst_file_sizes_on_db_open(copy, 0);
-    CheckCondition(
-        0 == rocksdb_options_get_skip_checking_sst_file_sizes_on_db_open(copy));
-    CheckCondition(
-        1 == rocksdb_options_get_skip_checking_sst_file_sizes_on_db_open(o));
-
     rocksdb_options_set_max_write_buffer_number(copy, 2000);
     CheckCondition(2000 == rocksdb_options_get_max_write_buffer_number(copy));
     CheckCondition(97 == rocksdb_options_get_max_write_buffer_number(o));
diff --git a/db/corruption_test.cc b/db/corruption_test.cc
index d7f87faefed4..9a7b789b2d25 100644
--- a/db/corruption_test.cc
+++ b/db/corruption_test.cc
@@ -556,6 +556,74 @@ TEST_F(CorruptionTest, TableFileFooterNotMagic) {
   ASSERT_TRUE(s.ToString().find(".sst") != std::string::npos);
 }
 
+TEST_F(CorruptionTest, DBOpenWithWrongFileSize) {
+  // Validate that when paranoid flag is true, DB::Open() fails if one of the
+  // file corrupted. Validate that when paranoid flag is false, DB::Open()
+  // succeed if one of the file corrupted, and the healthy file is readable.
+  CloseDb();
+
+  const std::string test_cf_name = "test_cf";
+  std::vector<ColumnFamilyDescriptor> cf_descs;
+  cf_descs.emplace_back(kDefaultColumnFamilyName, ColumnFamilyOptions());
+  cf_descs.emplace_back(test_cf_name, ColumnFamilyOptions());
+
+  {
+    options_.create_missing_column_families = true;
+    std::vector<ColumnFamilyHandle*> cfhs;
+    ASSERT_OK(DB::Open(options_, dbname_, cf_descs, &cfhs, &db_));
+    assert(db_ != nullptr);  // suppress false clang-analyze report
+
+    ASSERT_OK(db_->Put(WriteOptions(), cfhs[0], "k", "v"));
+    ASSERT_OK(db_->Put(WriteOptions(), cfhs[1], "k1", "v1"));
+    ASSERT_OK(db_->Put(WriteOptions(), cfhs[0], "k2", "v2"));
+    for (auto* cfh : cfhs) {
+      delete cfh;
+    }
+    DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+    ASSERT_OK(dbi->TEST_FlushMemTable());
+
+    // ********************************************
+    // Corrupt the file by making the file bigger
+    std::vector<LiveFileMetaData> metadata;
+    db_->GetLiveFilesMetaData(&metadata);
+    std::string filename = dbname_ + metadata[0].name;
+    const auto& fs = options_.env->GetFileSystem();
+    {
+      std::unique_ptr<FSWritableFile> f;
+      ASSERT_OK(fs->ReopenWritableFile(filename, FileOptions(), &f, nullptr));
+      ASSERT_OK(f->Append("blahblah", IOOptions(), nullptr));
+      ASSERT_OK(f->Close(IOOptions(), nullptr));
+    }
+    CloseDb();
+  }
+
+  // DB failed to open due to one of the file is corrupted, as paranoid flag is
+  // true
+  options_.paranoid_checks = true;
+  std::vector<ColumnFamilyHandle*> cfhs;
+  auto s = DB::Open(options_, dbname_, cf_descs, &cfhs, &db_);
+  ASSERT_TRUE(s.IsCorruption());
+  ASSERT_TRUE(s.ToString().find("file size mismatch") != std::string::npos);
+
+  // DB opened successfully, as paranoid flag is false, validate the one that is
+  // healthy is still accessible
+  options_.paranoid_checks = false;
+  ASSERT_OK(DB::Open(options_, dbname_, cf_descs, &cfhs, &db_));
+  assert(db_ != nullptr);  // suppress false clang-analyze report
+
+  std::string v;
+  ASSERT_OK(db_->Get(ReadOptions(), cfhs[1], "k1", &v));
+  ASSERT_EQ(v, "v1");
+
+  // Validate the default column family is corrupted
+  Check(0, 0);
+  s = db_->Get(ReadOptions(), cfhs[0], "k1", &v);
+  ASSERT_TRUE(s.IsCorruption());
+
+  delete cfhs[1];
+  delete cfhs[0];
+}
+
 TEST_F(CorruptionTest, TableFileWrongSize) {
   Build(100);
   DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
@@ -579,13 +647,16 @@ TEST_F(CorruptionTest, TableFileWrongSize) {
   // DB actually accepts this without paranoid checks, relying on size
   // recorded in manifest to locate the SST footer.
   options_.paranoid_checks = false;
-  options_.skip_checking_sst_file_sizes_on_db_open = false;
   Reopen();
-  Check(100, 100);
+  // As footer could not be extraced, file is completely unreadable
+  Check(0, 0);
+  std::string v;
+  auto s = db_->Get(ReadOptions(), "k1", &v);
+  ASSERT_TRUE(s.IsCorruption());
 
   // But reports the issue with paranoid checks
   options_.paranoid_checks = true;
-  Status s = TryReopen();
+  s = TryReopen();
   ASSERT_TRUE(s.IsCorruption());
   ASSERT_TRUE(s.ToString().find("file size mismatch") != std::string::npos);
 
diff --git a/db/db_encryption_test.cc b/db/db_encryption_test.cc
index 1d17e5d9bbd1..7967719888bb 100644
--- a/db/db_encryption_test.cc
+++ b/db/db_encryption_test.cc
@@ -17,9 +17,10 @@ class DBEncryptionTest : public DBTestBase {
  public:
   DBEncryptionTest()
       : DBTestBase("db_encryption_test", /*env_do_fsync=*/true) {}
-  Env* GetTargetEnv() {
+  Env* GetNonEncryptedEnv() {
     if (encrypted_env_ != nullptr) {
-      return (static_cast<EnvWrapper*>(encrypted_env_))->target();
+      return (static_cast_with_check<CompositeEnvWrapper>(encrypted_env_))
+          ->env_target();
     } else {
       return env_;
     }
@@ -38,7 +39,7 @@ TEST_F(DBEncryptionTest, CheckEncrypted) {
   auto status = env_->GetChildren(dbname_, &fileNames);
   ASSERT_OK(status);
 
-  Env* target = GetTargetEnv();
+  Env* target = GetNonEncryptedEnv();
   int hits = 0;
   for (auto it = fileNames.begin(); it != fileNames.end(); ++it) {
     if (*it == "LOCK") {
@@ -89,7 +90,7 @@ TEST_F(DBEncryptionTest, CheckEncrypted) {
 }
 
 TEST_F(DBEncryptionTest, ReadEmptyFile) {
-  auto defaultEnv = GetTargetEnv();
+  auto defaultEnv = GetNonEncryptedEnv();
 
   // create empty file for reading it back in later
   auto envOptions = EnvOptions(CurrentOptions());
@@ -116,6 +117,40 @@ TEST_F(DBEncryptionTest, ReadEmptyFile) {
   ASSERT_TRUE(data.empty());
 }
 
+TEST_F(DBEncryptionTest, NotSupportedGetFileSize) {
+  // Validate envrypted env does not support GetFileSize.
+  // The goal of the test is to validate the encrypted env/fs does not support
+  // GetFileSize API on FSRandomAccessFile interface.
+  // This test combined with the rest of the integration tests validate that
+  // the new API GetFileSize on FSRandomAccessFile interface is not required to
+  // be supported for database to work properly.
+  // The GetFileSize API is used in ReadFooterFromFile() API to get the file
+  // size. When GetFileSize API is not supported, the ReadFooterFromFile() API
+  // will use FileSystem GetFileSize API as fallback. Refer to the
+  // EncryptedRandomAccessFile class definition for more details.
+  if (!encrypted_env_) {
+    return;
+  }
+
+  auto fs = encrypted_env_->GetFileSystem();
+
+  // create empty file for reading it back in later
+  auto filePath = dbname_ + "/empty.empty";
+
+  // Create empty file
+  CreateFile(fs.get(), filePath, "", false);
+
+  // Open it for reading footer
+  std::unique_ptr<FSRandomAccessFile> randomAccessFile;
+  auto status = fs->NewRandomAccessFile(filePath, FileOptions(),
+                                        &randomAccessFile, nullptr);
+  ASSERT_OK(status);
+
+  uint64_t fileSize;
+  status = randomAccessFile->GetFileSize(&fileSize);
+  ASSERT_TRUE(status.IsNotSupported());
+}
+
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc
index e64471b07c24..d9cf5b848f88 100644
--- a/db/db_impl/db_impl.cc
+++ b/db/db_impl/db_impl.cc
@@ -5055,85 +5055,6 @@ void DBImpl::GetAllColumnFamilyMetaData(
   }
 }
 
-Status DBImpl::CheckConsistency() {
-  mutex_.AssertHeld();
-  std::vector<LiveFileMetaData> metadata;
-  versions_->GetLiveFilesMetaData(&metadata);
-  TEST_SYNC_POINT("DBImpl::CheckConsistency:AfterGetLiveFilesMetaData");
-
-  std::string corruption_messages;
-
-  if (immutable_db_options_.skip_checking_sst_file_sizes_on_db_open) {
-    // Instead of calling GetFileSize() for each expected file, call
-    // GetChildren() for the DB directory and check that all expected files
-    // are listed, without checking their sizes.
-    // Since sst files might be in different directories, do it for each
-    // directory separately.
-    std::map<std::string, std::vector<std::string>> files_by_directory;
-    for (const auto& md : metadata) {
-      // md.name has a leading "/". Remove it.
-      std::string fname = md.name;
-      if (!fname.empty() && fname[0] == '/') {
-        fname = fname.substr(1);
-      }
-      files_by_directory[md.db_path].push_back(fname);
-    }
-
-    IOOptions io_opts;
-    io_opts.do_not_recurse = true;
-    for (const auto& dir_files : files_by_directory) {
-      std::string directory = dir_files.first;
-      std::vector<std::string> existing_files;
-      Status s = fs_->GetChildren(directory, io_opts, &existing_files,
-                                  /*IODebugContext*=*/nullptr);
-      if (!s.ok()) {
-        corruption_messages +=
-            "Can't list files in " + directory + ": " + s.ToString() + "\n";
-        continue;
-      }
-      std::sort(existing_files.begin(), existing_files.end());
-
-      for (const std::string& fname : dir_files.second) {
-        if (!std::binary_search(existing_files.begin(), existing_files.end(),
-                                fname) &&
-            !std::binary_search(existing_files.begin(), existing_files.end(),
-                                Rocks2LevelTableFileName(fname))) {
-          corruption_messages +=
-              "Missing sst file " + fname + " in " + directory + "\n";
-        }
-      }
-    }
-  } else {
-    for (const auto& md : metadata) {
-      // md.name has a leading "/".
-      std::string file_path = md.db_path + md.name;
-
-      uint64_t fsize = 0;
-      TEST_SYNC_POINT("DBImpl::CheckConsistency:BeforeGetFileSize");
-      Status s = env_->GetFileSize(file_path, &fsize);
-      if (!s.ok() &&
-          env_->GetFileSize(Rocks2LevelTableFileName(file_path), &fsize).ok()) {
-        s = Status::OK();
-      }
-      if (!s.ok()) {
-        corruption_messages +=
-            "Can't access " + md.name + ": " + s.ToString() + "\n";
-      } else if (fsize != md.size) {
-        corruption_messages += "Sst file size mismatch: " + file_path +
-                               ". Size recorded in manifest " +
-                               std::to_string(md.size) + ", actual size " +
-                               std::to_string(fsize) + "\n";
-      }
-    }
-  }
-
-  if (corruption_messages.size() == 0) {
-    return Status::OK();
-  } else {
-    return Status::Corruption(corruption_messages);
-  }
-}
-
 Status DBImpl::GetDbIdentity(std::string& identity) const {
   identity.assign(db_id_);
   return Status::OK();
diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h
index be51ac567cb7..fce9421de19d 100644
--- a/db/db_impl/db_impl.h
+++ b/db/db_impl/db_impl.h
@@ -804,10 +804,6 @@ class DBImpl : public DB {
   // being detected.
   const Snapshot* GetSnapshotForWriteConflictBoundary();
 
-  // checks if all live files exist on file system and that their file sizes
-  // match to our in-memory records
-  virtual Status CheckConsistency();
-
   // max_file_num_to_ignore allows bottom level compaction to filter out newly
   // compacted SST files. Setting max_file_num_to_ignore to kMaxUint64 will
   // disable the filtering
diff --git a/db/db_impl/db_impl_follower.cc b/db/db_impl/db_impl_follower.cc
index 90c4326ceb15..1ff12cec0153 100644
--- a/db/db_impl/db_impl_follower.cc
+++ b/db/db_impl/db_impl_follower.cc
@@ -70,9 +70,6 @@ Status DBImplFollower::Recover(
     }
     return s;
   }
-  if (immutable_db_options_.paranoid_checks && s.ok()) {
-    s = CheckConsistency();
-  }
   if (s.ok()) {
     default_cf_handle_ = new ColumnFamilyHandleImpl(
         versions_->GetColumnFamilySet()->GetDefault(), this, &mutex_);
diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc
index f19ab4965835..eaa806283ce9 100644
--- a/db/db_impl/db_impl_open.cc
+++ b/db/db_impl/db_impl_open.cc
@@ -191,12 +191,6 @@ DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src,
                    "wal_compression is disabled since only zstd is supported");
   }
 
-  if (!result.paranoid_checks) {
-    result.skip_checking_sst_file_sizes_on_db_open = true;
-    ROCKS_LOG_INFO(result.info_log,
-                   "file size check will be skipped during open.");
-  }
-
   return result;
 }
 
@@ -694,9 +688,6 @@ Status DBImpl::Recover(
     s = MaybeUpdateNextFileNumber(recovery_ctx);
   }
 
-  if (immutable_db_options_.paranoid_checks && s.ok()) {
-    s = CheckConsistency();
-  }
   if (s.ok() && !read_only) {
     // TODO: share file descriptors (FSDirectory) with SetDirectories above
     std::map<std::string, std::shared_ptr<FSDirectory>> created_dirs;
diff --git a/db/db_impl/db_impl_secondary.cc b/db/db_impl/db_impl_secondary.cc
index e5f33dc20b40..04abfc3d6d22 100644
--- a/db/db_impl/db_impl_secondary.cc
+++ b/db/db_impl/db_impl_secondary.cc
@@ -49,9 +49,6 @@ Status DBImplSecondary::Recover(
     }
     return s;
   }
-  if (immutable_db_options_.paranoid_checks && s.ok()) {
-    s = CheckConsistency();
-  }
   // Initial max_total_in_memory_state_ before recovery logs.
   max_total_in_memory_state_ = 0;
   for (auto cfd : *versions_->GetColumnFamilySet()) {
@@ -653,49 +650,6 @@ Status DBImplSecondary::NewIterators(
   return Status::OK();
 }
 
-Status DBImplSecondary::CheckConsistency() {
-  mutex_.AssertHeld();
-  Status s = DBImpl::CheckConsistency();
-  // If DBImpl::CheckConsistency() which is stricter returns success, then we
-  // do not need to give a second chance.
-  if (s.ok()) {
-    return s;
-  }
-  // It's possible that DBImpl::CheckConssitency() can fail because the primary
-  // may have removed certain files, causing the GetFileSize(name) call to
-  // fail and returning a PathNotFound. In this case, we take a best-effort
-  // approach and just proceed.
-  TEST_SYNC_POINT_CALLBACK(
-      "DBImplSecondary::CheckConsistency:AfterFirstAttempt", &s);
-
-  if (immutable_db_options_.skip_checking_sst_file_sizes_on_db_open) {
-    return Status::OK();
-  }
-
-  std::vector<LiveFileMetaData> metadata;
-  versions_->GetLiveFilesMetaData(&metadata);
-
-  std::string corruption_messages;
-  for (const auto& md : metadata) {
-    // md.name has a leading "/".
-    std::string file_path = md.db_path + md.name;
-
-    uint64_t fsize = 0;
-    s = env_->GetFileSize(file_path, &fsize);
-    if (!s.ok() &&
-        (env_->GetFileSize(Rocks2LevelTableFileName(file_path), &fsize).ok() ||
-         s.IsPathNotFound())) {
-      s = Status::OK();
-    }
-    if (!s.ok()) {
-      corruption_messages +=
-          "Can't access " + md.name + ": " + s.ToString() + "\n";
-    }
-  }
-  return corruption_messages.empty() ? Status::OK()
-                                     : Status::Corruption(corruption_messages);
-}
-
 Status DBImplSecondary::TryCatchUpWithPrimary() {
   assert(versions_.get() != nullptr);
   Status s;
diff --git a/db/db_impl/db_impl_secondary.h b/db/db_impl/db_impl_secondary.h
index c0d72c67e9f4..b18822b171b3 100644
--- a/db/db_impl/db_impl_secondary.h
+++ b/db/db_impl/db_impl_secondary.h
@@ -248,12 +248,6 @@ class DBImplSecondary : public DBImpl {
   Status MaybeInitLogReader(uint64_t log_number,
                             log::FragmentBufferedReader** log_reader);
 
-  // Check if all live files exist on file system and that their file sizes
-  // matche to the in-memory records. It is possible that some live files may
-  // have been deleted by the primary. In this case, CheckConsistency() does
-  // not flag the missing file as inconsistency.
-  Status CheckConsistency() override;
-
 #ifndef NDEBUG
   Status TEST_CompactWithoutInstallation(const OpenAndCompactOptions& options,
                                          ColumnFamilyHandle* cfh,
diff --git a/db/db_secondary_test.cc b/db/db_secondary_test.cc
index e983a580b9a2..e34a95d55417 100644
--- a/db/db_secondary_test.cc
+++ b/db/db_secondary_test.cc
@@ -605,6 +605,9 @@ class TraceFileEnv : public EnvWrapper {
                   char* scratch) const override {
         return target_->Read(offset, n, result, scratch);
       }
+      Status GetFileSize(uint64_t* file_size) override {
+        return target_->GetFileSize(file_size);
+      }
 
      private:
       std::unique_ptr<RandomAccessFile> target_;
@@ -1291,46 +1294,6 @@ TEST_F(DBSecondaryTest, CatchUpAfterFlush) {
   ASSERT_OK(iter3->status());
 }
 
-TEST_F(DBSecondaryTest, CheckConsistencyWhenOpen) {
-  bool called = false;
-  Options options;
-  options.env = env_;
-  options.disable_auto_compactions = true;
-  Reopen(options);
-  SyncPoint::GetInstance()->DisableProcessing();
-  SyncPoint::GetInstance()->ClearAllCallBacks();
-  SyncPoint::GetInstance()->SetCallBack(
-      "DBImplSecondary::CheckConsistency:AfterFirstAttempt", [&](void* arg) {
-        ASSERT_NE(nullptr, arg);
-        called = true;
-        auto* s = static_cast<Status*>(arg);
-        ASSERT_NOK(*s);
-      });
-  SyncPoint::GetInstance()->LoadDependency(
-      {{"DBImpl::CheckConsistency:AfterGetLiveFilesMetaData",
-        "BackgroundCallCompaction:0"},
-       {"DBImpl::BackgroundCallCompaction:PurgedObsoleteFiles",
-        "DBImpl::CheckConsistency:BeforeGetFileSize"}});
-  SyncPoint::GetInstance()->EnableProcessing();
-
-  ASSERT_OK(Put("a", "value0"));
-  ASSERT_OK(Put("c", "value0"));
-  ASSERT_OK(Flush());
-  ASSERT_OK(Put("b", "value1"));
-  ASSERT_OK(Put("d", "value1"));
-  ASSERT_OK(Flush());
-  port::Thread thread([this]() {
-    Options opts;
-    opts.env = env_;
-    opts.max_open_files = -1;
-    OpenSecondary(opts);
-  });
-  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
-  ASSERT_OK(dbfull()->TEST_WaitForCompact());
-  thread.join();
-  ASSERT_TRUE(called);
-}
-
 TEST_F(DBSecondaryTest, StartFromInconsistent) {
   Options options = CurrentOptions();
   DestroyAndReopen(options);
diff --git a/db/db_sst_test.cc b/db/db_sst_test.cc
index 71511cee7420..d0579a2c3e4b 100644
--- a/db/db_sst_test.cc
+++ b/db/db_sst_test.cc
@@ -135,21 +135,6 @@ TEST_F(DBSSTTest, SSTsWithLdbSuffixHandling) {
   Destroy(options);
 }
 
-// Check that we don't crash when opening DB with
-// DBOptions::skip_checking_sst_file_sizes_on_db_open = true.
-TEST_F(DBSSTTest, SkipCheckingSSTFileSizesOnDBOpen) {
-  ASSERT_OK(Put("pika", "choo"));
-  ASSERT_OK(Flush());
-
-  // Just open the DB with the option set to true and check that we don't crash.
-  Options options;
-  options.env = env_;
-  options.skip_checking_sst_file_sizes_on_db_open = true;
-  Reopen(options);
-
-  ASSERT_EQ("choo", Get("pika"));
-}
-
 TEST_F(DBSSTTest, DontDeleteMovedFile) {
   // This test triggers move compaction and verifies that the file is not
   // deleted when it's part of move compaction
@@ -1748,45 +1733,6 @@ TEST_F(DBSSTTest, GetTotalSstFilesSize) {
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
 }
 
-TEST_F(DBSSTTest, OpenDBWithoutGetFileSizeInvocations) {
-  Options options = CurrentOptions();
-  std::unique_ptr<MockEnv> env{MockEnv::Create(Env::Default())};
-  options.env = env.get();
-  options.disable_auto_compactions = true;
-  options.compression = kNoCompression;
-  options.enable_blob_files = true;
-  options.blob_file_size = 32;  // create one blob per file
-  options.skip_checking_sst_file_sizes_on_db_open = true;
-
-  DestroyAndReopen(options);
-  // Generate 5 files in L0
-  for (int i = 0; i < 5; i++) {
-    for (int j = 0; j < 10; j++) {
-      std::string val = "val_file_" + std::to_string(i);
-      ASSERT_OK(Put(Key(j), val));
-    }
-    ASSERT_OK(Flush());
-  }
-  Close();
-
-  bool is_get_file_size_called = false;
-  SyncPoint::GetInstance()->SetCallBack(
-      "MockFileSystem::GetFileSize:CheckFileType", [&](void* arg) {
-        std::string* filename = static_cast<std::string*>(arg);
-        if (filename->find(".blob") != std::string::npos) {
-          is_get_file_size_called = true;
-        }
-      });
-
-  SyncPoint::GetInstance()->EnableProcessing();
-  Reopen(options);
-  ASSERT_FALSE(is_get_file_size_called);
-  SyncPoint::GetInstance()->DisableProcessing();
-  SyncPoint::GetInstance()->ClearAllCallBacks();
-
-  Destroy(options);
-}
-
 TEST_F(DBSSTTest, GetTotalSstFilesSizeVersionsFilesShared) {
   Options options = CurrentOptions();
   options.disable_auto_compactions = true;
diff --git a/db/db_test_util.h b/db/db_test_util.h
index 4a00ea4371b8..ea2ff609663a 100644
--- a/db/db_test_util.h
+++ b/db/db_test_util.h
@@ -452,6 +452,10 @@ class SpecialEnv : public EnvWrapper {
         return s;
       }
 
+      Status GetFileSize(uint64_t* s) override {
+        return target_->GetFileSize(s);
+      }
+
      private:
       std::unique_ptr<RandomAccessFile> target_;
       anon::AtomicCounter* counter_;
@@ -478,6 +482,10 @@ class SpecialEnv : public EnvWrapper {
         return target_->Prefetch(offset, n);
       }
 
+      Status GetFileSize(uint64_t* s) override {
+        return target_->GetFileSize(s);
+      }
+
      private:
       std::unique_ptr<RandomAccessFile> target_;
       std::atomic<uint64_t>* fail_cnt_;
diff --git a/db/external_sst_file_ingestion_job.cc b/db/external_sst_file_ingestion_job.cc
index 7e99dc9f918e..e99c04300fb2 100644
--- a/db/external_sst_file_ingestion_job.cc
+++ b/db/external_sst_file_ingestion_job.cc
@@ -156,9 +156,21 @@ Status ExternalSstFileIngestionJob::Prepare(
         // It is unsafe to assume application had sync the file and file
         // directory before ingest the file. For integrity of RocksDB we need
         // to sync the file.
-        std::unique_ptr<FSWritableFile> file_to_sync;
-        Status s = fs_->ReopenWritableFile(path_inside_db, env_options_,
-                                           &file_to_sync, nullptr);
+        // Use FSRandomRWFile instead of FSWritableFile, as in encrypted file
+        // system the FSWritableFile will append a new prefix to the end of the
+        // file when the file exists, which causes file corruption. On the
+        // contrary, FSRandomRWFile handles an existing file correctly.
+
+        // TODO(xingbo), We should in general be moving away from production
+        // uses of ReuseWritableFile (except explicitly for WAL recycling),
+        // ReopenWritableFile, and NewRandomRWFile. We should create a
+        // FileSystem::SyncFile/FsyncFile API that by default does the
+        // re-open+sync+close combo but can (a) be reused easily, and (b) be
+        // overridden to do that more cleanly, e.g. in EncryptedEnv.
+        // https://github.com/facebook/rocksdb/issues/13741
+        std::unique_ptr<FSRandomRWFile> file_to_sync;
+        Status s = fs_->NewRandomRWFile(path_inside_db, env_options_,
+                                        &file_to_sync, nullptr);
         TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Prepare:Reopen",
                                  &s);
         // Some file systems (especially remote/distributed) don't support
diff --git a/env/composite_env.cc b/env/composite_env.cc
index 59434785ced5..a0a4d9edf66d 100644
--- a/env/composite_env.cc
+++ b/env/composite_env.cc
@@ -100,6 +100,10 @@ class CompositeRandomAccessFileWrapper : public RandomAccessFile {
     return target_->InvalidateCache(offset, length);
   }
 
+  Status GetFileSize(uint64_t* size) override {
+    return target_->GetFileSize(size);
+  }
+
  private:
   std::unique_ptr<FSRandomAccessFile> target_;
 };
diff --git a/env/env.cc b/env/env.cc
index 8326c5619346..896c31a477d3 100644
--- a/env/env.cc
+++ b/env/env.cc
@@ -186,6 +186,10 @@ class LegacyRandomAccessFileWrapper : public FSRandomAccessFile {
   IOStatus InvalidateCache(size_t offset, size_t length) override {
     return status_to_io_status(target_->InvalidateCache(offset, length));
   }
+  IOStatus GetFileSize(uint64_t* result) override {
+    auto status = target_->GetFileSize(result);
+    return status_to_io_status(std::move(status));
+  }
 
  private:
   std::unique_ptr<RandomAccessFile> target_;
diff --git a/env/env_test.cc b/env/env_test.cc
index e89f48531dc1..421d13ec5ea5 100644
--- a/env/env_test.cc
+++ b/env/env_test.cc
@@ -3467,7 +3467,6 @@ class ReadAsyncRandomAccessFile : public FSRandomAccessFileOwnerWrapper {
 
  private:
   ReadAsyncFS& fs_;
-  std::unique_ptr<FSRandomAccessFile> file_;
   int counter = 0;
 };
 
@@ -3657,6 +3656,42 @@ TEST(EnvTestMisc, StaticDestruction) {
   static_destruction_tester.activated = true;
 }
 
+// Test GetFileSize API
+class TestGetFileSize : public testing::Test {
+ public:
+  TestGetFileSize() { env_ = Env::Default(); }
+  Env* env_;
+};
+
+// Validate GetFileSize API returns the right value.
+// Use the default implementation from env
+TEST_F(TestGetFileSize, GetFileSize) {
+  EnvOptions soptions;
+  auto fs = env_->GetFileSystem();
+
+  std::string fname = test::PerThreadDBPath(env_, "getFileSizeTestfile");
+
+  // randomize file size
+  auto rnd = Random::GetTLSInstance();
+  auto expectedFileSize = rnd->Uniform(256 * 1024) + 1;
+  auto content = rnd->RandomBinaryString(static_cast<int>(expectedFileSize));
+
+  ASSERT_OK(CreateFile(fs.get(), fname, content, false));
+
+  std::unique_ptr<FSRandomAccessFile> file;
+  ASSERT_OK(fs->NewRandomAccessFile(fname, FileOptions(), &file, nullptr));
+
+  uint64_t fileSizeFromFileSystemAPI;
+  ASSERT_OK(
+      fs->GetFileSize(fname, IOOptions(), &fileSizeFromFileSystemAPI, nullptr));
+  ASSERT_EQ(fileSizeFromFileSystemAPI, expectedFileSize);
+
+  uint64_t fileSizeFromFsRandomAccessFileAPI;
+  ASSERT_OK(file->GetFileSize(&fileSizeFromFsRandomAccessFileAPI));
+
+  ASSERT_EQ(fileSizeFromFsRandomAccessFileAPI, expectedFileSize);
+}
+
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/env/fs_posix.cc b/env/fs_posix.cc
index 82bda886db05..bc28b52de214 100644
--- a/env/fs_posix.cc
+++ b/env/fs_posix.cc
@@ -243,7 +243,7 @@ class PosixFileSystem : public FileSystem {
       // Use mmap when virtual address-space is plentiful.
       uint64_t size;
       IOOptions opts;
-      s = GetFileSize(fname, opts, &size, nullptr);
+      s = GetFileSizeOnOpenedFile(fd, fname, &size);
       if (s.ok()) {
         void* base = mmap(nullptr, size, PROT_READ, MAP_SHARED, fd, 0);
         if (base != MAP_FAILED) {
@@ -324,7 +324,7 @@ class PosixFileSystem : public FileSystem {
     }
     uint64_t initial_file_size = 0;
     if (reopen) {
-      s = GetFileSize(fname, IOOptions(), &initial_file_size, nullptr);
+      s = GetFileSizeOnOpenedFile(fd, fname, &initial_file_size);
       if (!s.ok()) {
         close(fd);
         return s;
@@ -509,7 +509,7 @@ class PosixFileSystem : public FileSystem {
     uint64_t size;
     if (status.ok()) {
       IOOptions opts;
-      status = GetFileSize(fname, opts, &size, nullptr);
+      status = GetFileSizeOnOpenedFile(fd, fname, &size);
     }
     void* base = nullptr;
     if (status.ok()) {
@@ -671,7 +671,7 @@ class PosixFileSystem : public FileSystem {
 
   IOStatus GetFileSize(const std::string& fname, const IOOptions& /*opts*/,
                        uint64_t* size, IODebugContext* /*dbg*/) override {
-    struct stat sbuf;
+    struct stat sbuf {};
     if (stat(fname.c_str(), &sbuf) != 0) {
       *size = 0;
       return IOError("while stat a file for size", fname, errno);
@@ -974,6 +974,22 @@ class PosixFileSystem : public FileSystem {
  private:
   bool forceMmapOff_ = false;  // do we override Env options?
 
+  // This is a faster API comparing to the public method that uses stat to get
+  // file size. However this API only works on opened file.
+  IOStatus GetFileSizeOnOpenedFile(const int fd, const std::string& name,
+                                   uint64_t* size) {
+    struct stat sb {};
+    *size = 0;
+    // Get file information using fstat
+    if (fstat(fd, &sb) == -1) {
+      return IOError(
+          "while fstat a file for size with fd " + std::to_string(fd), name,
+          errno);
+    }
+    *size = sb.st_size;
+    return IOStatus::OK();
+  }
+
 #ifdef OS_LINUX
   // Get the minimum "linux system limit" (i.e, the largest I/O size that the OS
   // can issue to block devices under a directory, also known as
diff --git a/env/io_posix.cc b/env/io_posix.cc
index db1a6da64666..0c7ddc73cd5a 100644
--- a/env/io_posix.cc
+++ b/env/io_posix.cc
@@ -607,6 +607,17 @@ PosixRandomAccessFile::PosixRandomAccessFile(
 
 PosixRandomAccessFile::~PosixRandomAccessFile() { close(fd_); }
 
+IOStatus PosixRandomAccessFile::GetFileSize(uint64_t* result) {
+  struct stat sbuf {};
+  if (fstat(fd_, &sbuf) != 0) {
+    *result = 0;
+    return IOError("While fstat with fd " + std::to_string(fd_), filename_,
+                   errno);
+  }
+  *result = sbuf.st_size;
+  return IOStatus::OK();
+}
+
 IOStatus PosixRandomAccessFile::Read(uint64_t offset, size_t n,
                                      const IOOptions& /*opts*/, Slice* result,
                                      char* scratch,
@@ -1056,6 +1067,11 @@ IOStatus PosixMmapReadableFile::InvalidateCache(size_t offset, size_t length) {
 #endif
 }
 
+IOStatus PosixMmapReadableFile::GetFileSize(uint64_t* result) {
+  *result = length_;
+  return IOStatus::OK();
+}
+
 /*
  * PosixMmapFile
  *
diff --git a/env/io_posix.h b/env/io_posix.h
index c85ff0122d26..39fd8c0f49d1 100644
--- a/env/io_posix.h
+++ b/env/io_posix.h
@@ -352,6 +352,8 @@ class PosixRandomAccessFile : public FSRandomAccessFile {
                              void* cb_arg, void** io_handle,
                              IOHandleDeleter* del_fn,
                              IODebugContext* dbg) override;
+
+  virtual IOStatus GetFileSize(uint64_t* result) override;
 };
 
 class PosixWritableFile : public FSWritableFile {
@@ -437,6 +439,7 @@ class PosixMmapReadableFile : public FSRandomAccessFile {
                 char* scratch, IODebugContext* dbg) const override;
   void Hint(AccessPattern pattern) override;
   IOStatus InvalidateCache(size_t offset, size_t length) override;
+  virtual IOStatus GetFileSize(uint64_t* result) override;
 };
 
 class PosixMmapFile : public FSWritableFile {
diff --git a/env/mock_env.cc b/env/mock_env.cc
index bf0e76adbbe4..0f9e5ab47f67 100644
--- a/env/mock_env.cc
+++ b/env/mock_env.cc
@@ -322,6 +322,11 @@ class MockRandomAccessFile : public FSRandomAccessFile {
     }
   }
 
+  IOStatus GetFileSize(uint64_t* size) override {
+    *size = file_->Size();
+    return IOStatus::OK();
+  }
+
  private:
   MemFile* file_;
   bool use_direct_io_;
diff --git a/file/readahead_raf.cc b/file/readahead_raf.cc
index dd09822e3e23..004f2ab746ba 100644
--- a/file/readahead_raf.cc
+++ b/file/readahead_raf.cc
@@ -108,6 +108,10 @@ class ReadaheadRandomAccessFile : public FSRandomAccessFile {
 
   bool use_direct_io() const override { return file_->use_direct_io(); }
 
+  IOStatus GetFileSize(uint64_t* result) override {
+    return file_->GetFileSize(result);
+  }
+
  private:
   // Tries to read from buffer_ n bytes starting at offset. If anything was read
   // from the cache, it sets cached_len to the number of bytes actually read,
diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h
index 648c7bdf36c6..bffa22028839 100644
--- a/include/rocksdb/env.h
+++ b/include/rocksdb/env.h
@@ -866,6 +866,13 @@ class RandomAccessFile {
         "RandomAccessFile::InvalidateCache not supported.");
   }
 
+  // The default implementation returns "not supported" so that user
+  // implementations of FSRandomAccessFile do not need to immediately implement
+  // this function.
+  virtual Status GetFileSize(uint64_t* /*result*/) {
+    return Status::NotSupported("RandomAccessFile::GetFileSize not supported.");
+  }
+
   // If you're adding methods here, remember to add them to
   // RandomAccessFileWrapper too.
 };
@@ -1750,6 +1757,9 @@ class RandomAccessFileWrapper : public RandomAccessFile {
   Status InvalidateCache(size_t offset, size_t length) override {
     return target_->InvalidateCache(offset, length);
   }
+  Status GetFileSize(uint64_t* file_size) override {
+    return target_->GetFileSize(file_size);
+  }
 
  private:
   RandomAccessFile* target_;
diff --git a/include/rocksdb/env_encryption.h b/include/rocksdb/env_encryption.h
index 6b4a13e039b6..118e8a052231 100644
--- a/include/rocksdb/env_encryption.h
+++ b/include/rocksdb/env_encryption.h
@@ -240,6 +240,15 @@ class EncryptedRandomAccessFile : public FSRandomAccessFile {
   size_t GetRequiredBufferAlignment() const override;
 
   IOStatus InvalidateCache(size_t offset, size_t length) override;
+
+  // Intentionally leave GetFileSize not overridden here, so that it inherits
+  // the default implementation from its parent class, which is Not Supported.
+  //
+  // As GetFileSize API is not required to be implemented yet, we use encrypted
+  // file system in unit test to validate the rest of the system could continue
+  // working with the Not Supported behavior.
+  //
+  // IOStatus GetFileSize(uint64_t* /*result*/) override;
 };
 
 class EncryptedWritableFile : public FSWritableFile {
diff --git a/include/rocksdb/file_system.h b/include/rocksdb/file_system.h
index f2c827ad60a6..a68dee516679 100644
--- a/include/rocksdb/file_system.h
+++ b/include/rocksdb/file_system.h
@@ -1051,6 +1051,14 @@ class FSRandomAccessFile {
   // open.
   virtual Temperature GetTemperature() const { return Temperature::kUnknown; }
 
+  // Get the file size on an open-for-reading file without re-seeking the file's
+  // path in the filesystem. The default implementation returns "not supported"
+  // so that user implementations of FSRandomAccessFile do not need to
+  // immediately implement this function.
+  virtual IOStatus GetFileSize(uint64_t* /*result*/) {
+    return IOStatus::NotSupported("GetFileSize Not Supported");
+  }
+
   // If you're adding methods here, remember to add them to
   // RandomAccessFileWrapper too.
 };
@@ -1772,6 +1780,10 @@ class FSRandomAccessFileWrapper : public FSRandomAccessFile {
     return target_->GetTemperature();
   }
 
+  virtual IOStatus GetFileSize(uint64_t* result) override {
+    return target_->GetFileSize(result);
+  }
+
  private:
   std::unique_ptr<FSRandomAccessFile> guard_;
   FSRandomAccessFile* target_;
diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index 734dad323074..ad4efe021c06 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -608,6 +608,13 @@ struct DBOptions {
   // checksums. True also enters a read-only mode when a DB write fails;
   // see DB::Resume().
   //
+  // When set to true, the DB will fail to open if any SST files fail to open
+  // e.g. due to incorrect file size or corrupted footer.
+  //
+  // When set to false, when there are files corrupted, the DB will still be
+  // opened, and the healthy ones could still be accessed, while corrupted one
+  // will not
+  //
   // As most workloads value data correctness over availability, this option
   // is on by default. Note that the name of this old option is potentially
   // misleading, and other options and operations go further in proactive
@@ -1297,12 +1304,13 @@ struct DBOptions {
   // Default: false
   bool skip_stats_update_on_db_open = false;
 
-  // If true, then DB::Open() will not fetch and check sizes of all sst files.
-  // This may significantly speed up startup if there are many sst files,
-  // especially when using non-default Env with expensive GetFileSize().
-  // We'll still check that all required sst files exist.
-  // If paranoid_checks is false, this option is ignored, and sst files are
-  // not checked at all.
+  // This option is deprecated and marked as no-op. Kept for backward
+  // compatibility until usage is fully removed.
+  // File size check will be performed through a thread
+  // pool during DB Open, when max_open_files is set to -1.
+  // Therefore, the concern of DB Open slowness is eliminated.
+  // Note that when max_open_files is not set to -1, only a subset of files will
+  // be opened and checked during DB Open.
   //
   // Default: false
   bool skip_checking_sst_file_sizes_on_db_open = false;
diff --git a/port/win/io_win.cc b/port/win/io_win.cc
index 2ba64b326554..63e5d6a7e16e 100644
--- a/port/win/io_win.cc
+++ b/port/win/io_win.cc
@@ -242,6 +242,16 @@ size_t WinMmapReadableFile::GetUniqueId(char* id, size_t max_size) const {
   return GetUniqueIdFromFile(hFile_, id, max_size);
 }
 
+IOStatus WinMmapReadableFile::GetFileSize(uint64_t* size) {
+  LARGE_INTEGER fileSize;
+  if (GetFileSizeEx(hFile_, &fileSize)) {
+    *size = fileSize.QuadPart;
+    return IOStatus::OK();
+  } else {
+    return IOStatus::IOError("Failed to get file size", filename_);
+  }
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 /// WinMmapFile
 
@@ -735,6 +745,16 @@ size_t WinRandomAccessFile::GetRequiredBufferAlignment() const {
   return GetAlignment();
 }
 
+IOStatus WinRandomAccessFile::GetFileSize(uint64_t* size) {
+  LARGE_INTEGER fileSize;
+  if (GetFileSizeEx(hFile_, &fileSize)) {
+    *size = fileSize.QuadPart;
+    return IOStatus::OK();
+  } else {
+    return IOStatus::IOError("Failed to get file size", filename_);
+  }
+}
+
 /////////////////////////////////////////////////////////////////////////////
 // WinWritableImpl
 //
diff --git a/port/win/io_win.h b/port/win/io_win.h
index e1a6197ce86b..29511d47ee68 100644
--- a/port/win/io_win.h
+++ b/port/win/io_win.h
@@ -152,6 +152,8 @@ class WinMmapReadableFile : private WinFileData, public FSRandomAccessFile {
   IOStatus InvalidateCache(size_t offset, size_t length) override;
 
   size_t GetUniqueId(char* id, size_t max_size) const override;
+
+  IOStatus GetFileSize(uint64_t* file_size) override;
 };
 
 // We preallocate and use memcpy to append new
@@ -292,6 +294,8 @@ class WinRandomAccessFile
   IOStatus InvalidateCache(size_t offset, size_t length) override;
 
   size_t GetRequiredBufferAlignment() const override;
+
+  IOStatus GetFileSize(uint64_t* file_size) override;
 };
 
 // This is a sequential write class. It has been mimicked (as others) after
diff --git a/table/format.cc b/table/format.cc
index 13cebde7682e..2898749be44b 100644
--- a/table/format.cc
+++ b/table/format.cc
@@ -482,15 +482,42 @@ bool& TEST_AllowUnsupportedFormatVersion() {
   return allow;
 }
 
-static Status ReadFooterFromFileInternal(const IOOptions& opts,
-                                         RandomAccessFileReader* file,
-                                         FileSystem& fs,
-                                         FilePrefetchBuffer* prefetch_buffer,
-                                         uint64_t file_size, Footer* footer,
-                                         uint64_t enforce_table_magic_number) {
-  if (file_size < Footer::kMinEncodedLength) {
+static Status ReadFooterFromFileInternal(
+    const IOOptions& opts, RandomAccessFileReader* file, FileSystem& fs,
+    FilePrefetchBuffer* prefetch_buffer, uint64_t expected_file_size,
+    Footer* footer, uint64_t enforce_table_magic_number) {
+  uint64_t file_size_from_file_system = 0;
+  Status s;
+  s = file->file()->GetFileSize(&file_size_from_file_system);
+  if (!s.ok()) {
+    auto corrupted_status =
+        Status::Corruption("Failed to get file size: " + s.ToString() +
+                           " for file " + file->file_name());
+    if (s.IsNotSupported()) {
+      // If file handle does not support GetFileSize, try File System API
+      s = fs.GetFileSize(file->file_name(), IOOptions(),
+                         &file_size_from_file_system, nullptr);
+      if (!s.ok()) {
+        return corrupted_status;
+      }
+    } else {
+      return corrupted_status;
+    }
+  }
+
+  if (expected_file_size != file_size_from_file_system) {
+    // When file is opened during DB Open, the expected file size is from
+    // manifest. Otherwise it is not guaranteed.
+    return Status::Corruption("Sst file size mismatch between expected " +
+                              std::to_string(expected_file_size) +
+                              " and file system " +
+                              std::to_string(file_size_from_file_system) +
+                              " sstable: " + file->file_name());
+  }
+
+  if (expected_file_size < Footer::kMinEncodedLength) {
     return Status::Corruption("file is too short (" +
-                              std::to_string(file_size) +
+                              std::to_string(expected_file_size) +
                               " bytes) to be an "
                               "sstable: " +
                               file->file_name());
@@ -499,10 +526,9 @@ static Status ReadFooterFromFileInternal(const IOOptions& opts,
   std::array<char, Footer::kMaxEncodedLength + 1> footer_buf;
   AlignedBuf internal_buf;
   Slice footer_input;
-  uint64_t read_offset = (file_size > Footer::kMaxEncodedLength)
-                             ? file_size - Footer::kMaxEncodedLength
+  uint64_t read_offset = (expected_file_size > Footer::kMaxEncodedLength)
+                             ? expected_file_size - Footer::kMaxEncodedLength
                              : 0;
-  Status s;
   // TODO: Need to pass appropriate deadline to TryReadFromCache(). Right now,
   // there is no readahead for point lookups, so TryReadFromCache will fail if
   // the required data is not in the prefetch buffer. Once deadline is enabled
@@ -527,23 +553,14 @@ static Status ReadFooterFromFileInternal(const IOOptions& opts,
 
   TEST_SYNC_POINT_CALLBACK("ReadFooterFromFileInternal:0", &footer_input);
 
-  // Check that we actually read the whole footer from the file. It may be
-  // that size isn't correct.
+  // Check that we actually read the whole footer from the file.
   if (footer_input.size() < Footer::kMinEncodedLength) {
-    uint64_t size_on_disk = 0;
-    if (fs.GetFileSize(file->file_name(), IOOptions(), &size_on_disk, nullptr)
-            .ok()) {
-      // Similar to CheckConsistency message, but not completely sure the
-      // expected size always came from manifest.
-      return Status::Corruption("Sst file size mismatch: " + file->file_name() +
-                                ". Expected " + std::to_string(file_size) +
-                                ", actual size " +
-                                std::to_string(size_on_disk) + "\n");
-    } else {
-      return Status::Corruption(
-          "Missing SST footer data in file " + file->file_name() +
-          " File too short? Expected size: " + std::to_string(file_size));
-    }
+    return Status::Corruption(
+        "The number of bytes read for Footer input " +
+        std::to_string(footer_input.size()) +
+        " is smaller than minimum footer encoded length: " +
+        std::to_string(Footer::kMinEncodedLength) + " for file " +
+        file->file_name() + "\n");
   }
 
   s = footer->DecodeFrom(footer_input, read_offset, enforce_table_magic_number);
@@ -556,20 +573,21 @@ static Status ReadFooterFromFileInternal(const IOOptions& opts,
 
 Status ReadFooterFromFile(const IOOptions& opts, RandomAccessFileReader* file,
                           FileSystem& fs, FilePrefetchBuffer* prefetch_buffer,
-                          uint64_t file_size, Footer* footer,
+                          uint64_t expected_file_size, Footer* footer,
                           uint64_t enforce_table_magic_number,
                           Statistics* stats) {
-  Status s =
-      ReadFooterFromFileInternal(opts, file, fs, prefetch_buffer, file_size,
-                                 footer, enforce_table_magic_number);
+  Status s = ReadFooterFromFileInternal(opts, file, fs, prefetch_buffer,
+                                        expected_file_size, footer,
+                                        enforce_table_magic_number);
   if (s.IsCorruption() &&
       CheckFSFeatureSupport(&fs, FSSupportedOps::kVerifyAndReconstructRead)) {
     IOOptions new_opts = opts;
     new_opts.verify_and_reconstruct_read = true;
     footer->Reset();
     s = ReadFooterFromFileInternal(new_opts, file, fs,
-                                   /*prefetch_buffer=*/nullptr, file_size,
-                                   footer, enforce_table_magic_number);
+                                   /*prefetch_buffer=*/nullptr,
+                                   expected_file_size, footer,
+                                   enforce_table_magic_number);
     RecordTick(stats, FILE_READ_CORRUPTION_RETRY_COUNT);
     if (s.ok()) {
       RecordTick(stats, FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT);
diff --git a/test_util/testutil.cc b/test_util/testutil.cc
index d65cefd60fb7..d5a786f0d735 100644
--- a/test_util/testutil.cc
+++ b/test_util/testutil.cc
@@ -311,7 +311,6 @@ void RandomInitDBOptions(DBOptions* db_opt, Random* rnd) {
   db_opt->track_and_verify_wals = rnd->Uniform(2);
   db_opt->verify_sst_unique_id_in_manifest = rnd->Uniform(2);
   db_opt->skip_stats_update_on_db_open = rnd->Uniform(2);
-  db_opt->skip_checking_sst_file_sizes_on_db_open = rnd->Uniform(2);
   db_opt->use_adaptive_mutex = rnd->Uniform(2);
   db_opt->use_fsync = rnd->Uniform(2);
   db_opt->recycle_log_file_num = rnd->Uniform(2);
diff --git a/test_util/testutil.h b/test_util/testutil.h
index 2e641ac89c63..5f36ec5154de 100644
--- a/test_util/testutil.h
+++ b/test_util/testutil.h
@@ -360,6 +360,11 @@ class StringSource : public FSRandomAccessFile {
 
   void set_total_reads(int tr) { total_reads_ = tr; }
 
+  IOStatus GetFileSize(uint64_t* file_size) override {
+    *file_size = contents_.size();
+    return IOStatus::OK();
+  }
+
  private:
   std::string contents_;
   uint64_t uniq_id_;
diff --git a/unreleased_history/behavior_changes/new_GetFileSize_api_at_FSRandomAccessFile_interface.md b/unreleased_history/behavior_changes/new_GetFileSize_api_at_FSRandomAccessFile_interface.md
new file mode 100644
index 000000000000..e006e114f53a
--- /dev/null
+++ b/unreleased_history/behavior_changes/new_GetFileSize_api_at_FSRandomAccessFile_interface.md
@@ -0,0 +1 @@
+A new API GetFileSize is added to FSRandomAccessFile interface class. It uses fstat vs stat on the posix implementation which is more efficient. Caller could use it to get file size faster. This function might be required in the future for FileSystem implementation outside of the RocksDB code base.
diff --git a/unreleased_history/public_api_changes/deprecate_flag_skip_checking_sst_file_sizes_on_db_open.md b/unreleased_history/public_api_changes/deprecate_flag_skip_checking_sst_file_sizes_on_db_open.md
new file mode 100644
index 000000000000..901537f5163d
--- /dev/null
+++ b/unreleased_history/public_api_changes/deprecate_flag_skip_checking_sst_file_sizes_on_db_open.md
@@ -0,0 +1 @@
+DB option skip_checking_sst_file_sizes_on_db_open is deprecated, in favor of validating file size in parallel in a thread pool, when db is opened. When DB is opened, with paranoid check enabled, a file with the wrong size would fail the DB open. With paranoid check disabled, the DB open would succeed, the column family with the corrupted file would not be read or write, while the other healthy column families could be read and write normally. When max_open_files option is not set to -1, only a subset of the files will be opened and checked. The rest of the files will be opened and checked when they are accessed.
diff --git a/utilities/env_mirror.cc b/utilities/env_mirror.cc
index 8e128c9d0617..1b559bceaf37 100644
--- a/utilities/env_mirror.cc
+++ b/utilities/env_mirror.cc
@@ -94,6 +94,16 @@ class RandomAccessFileMirror : public RandomAccessFile {
     // NOTE: not verified
     return a_->GetUniqueId(id, max_size);
   }
+
+  Status GetFileSize(uint64_t* file_size) override {
+    uint64_t asize = 0, bsize = 0;
+    Status as = a_->GetFileSize(&asize);
+    Status bs = b_->GetFileSize(&bsize);
+    assert(as == bs);
+    assert(asize == bsize);
+    *file_size = asize;
+    return as;
+  }
 };
 
 class WritableFileMirror : public WritableFile {
diff --git a/utilities/fault_injection_env.cc b/utilities/fault_injection_env.cc
index fb443cc87f30..6aedb87ab634 100644
--- a/utilities/fault_injection_env.cc
+++ b/utilities/fault_injection_env.cc
@@ -159,6 +159,11 @@ Status TestRandomAccessFile::MultiRead(ReadRequest* reqs, size_t num_reqs) {
   return target_->MultiRead(reqs, num_reqs);
 }
 
+Status TestRandomAccessFile::GetFileSize(uint64_t* file_size) {
+  assert(target_);
+  return target_->GetFileSize(file_size);
+}
+
 TestWritableFile::TestWritableFile(const std::string& fname,
                                    std::unique_ptr<WritableFile>&& f,
                                    FaultInjectionTestEnv* env)
diff --git a/utilities/fault_injection_env.h b/utilities/fault_injection_env.h
index 5612718c6c79..eaece031848d 100644
--- a/utilities/fault_injection_env.h
+++ b/utilities/fault_injection_env.h
@@ -59,6 +59,8 @@ class TestRandomAccessFile : public RandomAccessFile {
 
   Status MultiRead(ReadRequest* reqs, size_t num_reqs) override;
 
+  Status GetFileSize(uint64_t* file_size) override;
+
  private:
   std::unique_ptr<RandomAccessFile> target_;
   FaultInjectionTestEnv* env_;
diff --git a/utilities/fault_injection_fs.cc b/utilities/fault_injection_fs.cc
index 36dec96eba89..1c55cbcba6ff 100644
--- a/utilities/fault_injection_fs.cc
+++ b/utilities/fault_injection_fs.cc
@@ -562,6 +562,14 @@ size_t TestFSRandomAccessFile::GetUniqueId(char* id, size_t max_size) const {
   }
 }
 
+IOStatus TestFSRandomAccessFile::GetFileSize(uint64_t* file_size) {
+  if (fs_->ShouldFailGetFileSize()) {
+    return IOStatus::IOError("GetFileSize failed");
+  } else {
+    return target_->GetFileSize(file_size);
+  }
+}
+
 namespace {
 // Modifies `result` to start at the beginning of `scratch` if not already,
 // copying data there if needed.
diff --git a/utilities/fault_injection_fs.h b/utilities/fault_injection_fs.h
index 1f82c5144d10..0c8b789b8049 100644
--- a/utilities/fault_injection_fs.h
+++ b/utilities/fault_injection_fs.h
@@ -155,6 +155,8 @@ class TestFSRandomAccessFile : public FSRandomAccessFile {
 
   size_t GetUniqueId(char* id, size_t max_size) const override;
 
+  IOStatus GetFileSize(uint64_t* file_size) override;
+
  private:
   std::unique_ptr<FSRandomAccessFile> target_;
   FaultInjectionTestFS* fs_;
@@ -218,7 +220,8 @@ class FaultInjectionTestFS : public FileSystemWrapper {
             DeleteThreadLocalErrorContext),
         ingest_data_corruption_before_write_(false),
         checksum_handoff_func_type_(kCRC32c),
-        fail_get_file_unique_id_(false) {}
+        fail_get_file_unique_id_(false),
+        fail_get_file_size_(false) {}
   virtual ~FaultInjectionTestFS() override { fs_error_.PermitUncheckedError(); }
 
   static const char* kClassName() { return "FaultInjectionTestFS"; }
@@ -477,6 +480,16 @@ class FaultInjectionTestFS : public FileSystemWrapper {
     return fail_get_file_unique_id_;
   }
 
+  void SetFailGetFileSize(bool flag) {
+    MutexLock l(&mutex_);
+    fail_get_file_size_ = flag;
+  }
+
+  bool ShouldFailGetFileSize() {
+    MutexLock l(&mutex_);
+    return fail_get_file_size_;
+  }
+
   // Specify what the operation, so we can inject the right type of error
   enum ErrorOperation : char {
     kRead = 0,
@@ -636,6 +649,7 @@ class FaultInjectionTestFS : public FileSystemWrapper {
   bool ingest_data_corruption_before_write_;
   ChecksumType checksum_handoff_func_type_;
   bool fail_get_file_unique_id_;
+  bool fail_get_file_size_;
 
   // Inject an error. For a READ operation, a status of IOError(), a
   // corruption in the contents of scratch, or truncation of slice

From 9a64ebde0c3f9814bf9ee4ff96110347020bb872 Mon Sep 17 00:00:00 2001
From: Richard Barnes <rbarnes@meta.com>
Date: Wed, 9 Jul 2025 16:35:04 -0700
Subject: [PATCH 169/500] Fix unused-return in
 internal_repo_rocksdb/repo/db/log_reader.cc +1

Summary:
LLVM has a warning `-Wunreachable-code-return` which identifies return statements that cannot be reached.

In innocuous situations such statements are often present:
* to satisfy a compiler warning that existed before `[[noreturn]]` was introduced. Now that we have `[[noreturn]]`, this use is not necessary.
* to specify a return type. But there are clearer ways to do this.
* in place of the more legible `__builtin_unreachable()` (which will soon become `std::unreachable()`). In this case, we should use the more legible alternative.
* because the programmer was afraid of the function unexpectedly returning. But we check for this condition with `-Wreturn-type`.

In dangerous situations such statements can obscure the intended execution of the program or even hide an erroneous early return.

In this diff, we remove one or more unreachable returns.

 - If you approve of this diff, please use the "Accept & Ship" button :-)

Differential Revision: D77424529

fbshipit-source-id: fe41b5a640264d0a299d5ad330c645f94b147323
---
 db/log_reader.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/db/log_reader.cc b/db/log_reader.cc
index 0f0e25033ab5..fe5bad609724 100644
--- a/db/log_reader.cc
+++ b/db/log_reader.cc
@@ -353,7 +353,7 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch,
       }
     }
   }
-  return false;
+  __builtin_unreachable();
 }
 
 void Reader::MaybeVerifyPredecessorWALInfo(

From e929bde2bf4844f59d147da409ff68a727a80644 Mon Sep 17 00:00:00 2001
From: Richard Barnes <rbarnes@meta.com>
Date: Wed, 9 Jul 2025 16:36:31 -0700
Subject: [PATCH 170/500] Del redundant-static-def in
 rocksdb/src/table/block_based/filter_policy.cc +1

Summary:
LLVM has a warning `-Wdeprecated-redundant-constexpr-static-def` which raises the warning:

> warning: out-of-line definition of constexpr static data member is redundant in C++17 and is deprecated

Since we are now on C++20, we can remove the out-of-line definition of constexpr static data members. This diff does so.

 - If you approve of this diff, please use the "Accept & Ship" button :-)

Differential Revision: D77423205

fbshipit-source-id: 4ee4a390431a5d25e7733311f3fa40395dfd4bc0
---
 table/block_based/filter_policy.cc | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/table/block_based/filter_policy.cc b/table/block_based/filter_policy.cc
index 3df973aa4ca8..08314ccc9db0 100644
--- a/table/block_based/filter_policy.cc
+++ b/table/block_based/filter_policy.cc
@@ -1012,9 +1012,6 @@ class Standard128RibbonBitsBuilder : public XXPH3FilterBitsBuilder {
   FastLocalBloomBitsBuilder bloom_fallback_;
 };
 
-// for the linker, at least with DEBUG_LEVEL=2
-constexpr uint32_t Standard128RibbonBitsBuilder::kMaxRibbonEntries;
-
 class Standard128RibbonBitsReader : public BuiltinFilterBitsReader {
  public:
   Standard128RibbonBitsReader(const char* data, size_t len_bytes,

From 988357696dc7961789b32092a2e66effb7c2528e Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Wed, 9 Jul 2025 17:22:15 -0700
Subject: [PATCH 171/500] Improve internal lossless_cast to work on pointers
 (#13648)

Summary:
I was going to use this in some code I was working on but ended up not needing it. But it's useful nonetheless and I'm using it in a few places to replace reinterpret_cast.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13648

Test Plan: existing tests, manually see compilation fail when pointed-to types are not same size integral types

Reviewed By: cbi42

Differential Revision: D75576195

Pulled By: pdillinger

fbshipit-source-id: e10c7a4959340f6f2b536de8088072a90e871fcf
---
 util/cast_util.h | 33 ++++++++++++++++++++++-----------
 util/coding.h    |  7 ++++---
 2 files changed, 26 insertions(+), 14 deletions(-)

diff --git a/util/cast_util.h b/util/cast_util.h
index 414feda9cbea..24532cda7866 100644
--- a/util/cast_util.h
+++ b/util/cast_util.h
@@ -39,19 +39,30 @@ inline std::shared_ptr<DestClass> static_cast_with_check(
 }
 
 // A wrapper around static_cast for lossless conversion between integral
-// types, including enum types. For example, this can be used for converting
-// between signed/unsigned or enum type and underlying type without fear of
-// stripping away data, now or in the future.
+// types, including enum types, and pointers to such types. For example, this
+// can be used for converting between signed/unsigned or enum type and
+// underlying type without fear of stripping away data, now or in the future.
 template <typename To, typename From>
 inline To lossless_cast(From x) {
-  using FromValue = typename std::remove_reference<From>::type;
-  static_assert(
-      std::is_integral<FromValue>::value || std::is_enum<FromValue>::value,
-      "Only works on integral types");
-  static_assert(std::is_integral<To>::value || std::is_enum<To>::value,
-                "Only works on integral types");
-  static_assert(sizeof(To) >= sizeof(FromValue), "Must be lossless");
-  return static_cast<To>(x);
+  using FromValue = typename std::remove_reference_t<From>;
+  if constexpr (std::is_pointer_v<FromValue>) {
+    static_assert(std::is_pointer_v<To>);
+    using FromDeref = typename std::remove_pointer_t<FromValue>;
+    using ToDeref = typename std::remove_pointer_t<To>;
+    static_assert(std::is_integral_v<FromDeref> || std::is_enum_v<FromDeref>,
+                  "Only works on integral types");
+    static_assert(std::is_integral_v<ToDeref> || std::is_enum_v<To>,
+                  "Only works on integral types");
+    static_assert(sizeof(ToDeref) == sizeof(FromDeref), "Must be lossless");
+    return reinterpret_cast<To>(x);
+  } else {
+    static_assert(std::is_integral_v<FromValue> || std::is_enum_v<FromValue>,
+                  "Only works on integral types");
+    static_assert(std::is_integral_v<To> || std::is_enum_v<To>,
+                  "Only works on integral types");
+    static_assert(sizeof(To) >= sizeof(FromValue), "Must be lossless");
+    return static_cast<To>(x);
+  }
 }
 
 // For disambiguating a potentially heterogeneous aggregate as a homogeneous
diff --git a/util/coding.h b/util/coding.h
index 929c8e42c462..9e0d2f0fd099 100644
--- a/util/coding.h
+++ b/util/coding.h
@@ -21,6 +21,7 @@
 
 #include "port/port.h"
 #include "rocksdb/slice.h"
+#include "util/cast_util.h"
 #include "util/coding_lean.h"
 
 // Some processors does not allow unaligned access to memory
@@ -105,7 +106,7 @@ const char* GetVarint32PtrFallback(const char* p, const char* limit,
 inline const char* GetVarint32Ptr(const char* p, const char* limit,
                                   uint32_t* value) {
   if (p < limit) {
-    uint32_t result = *(reinterpret_cast<const unsigned char*>(p));
+    uint32_t result = *(lossless_cast<const unsigned char*>(p));
     if ((result & 128) == 0) {
       *value = result;
       return p + 1;
@@ -172,13 +173,13 @@ inline void PutVarint32Varint32Varint32(std::string* dst, uint32_t v1,
 
 inline char* EncodeVarint64(char* dst, uint64_t v) {
   static const unsigned int B = 128;
-  unsigned char* ptr = reinterpret_cast<unsigned char*>(dst);
+  unsigned char* ptr = lossless_cast<unsigned char*>(dst);
   while (v >= B) {
     *(ptr++) = (v & (B - 1)) | B;
     v >>= 7;
   }
   *(ptr++) = static_cast<unsigned char>(v);
-  return reinterpret_cast<char*>(ptr);
+  return lossless_cast<char*>(ptr);
 }
 
 inline void PutVarint64(std::string* dst, uint64_t v) {

From 83b99db98a6ef33c365863ed7b554c17539ca8ac Mon Sep 17 00:00:00 2001
From: generatedunixname89002005232357
 <generatedunixname89002005232357@meta.com>
Date: Thu, 10 Jul 2025 12:47:22 -0700
Subject: [PATCH 172/500] Revert D77424529

Summary:
This diff reverts D77424529
Unland reason: This diff broke our Windows 2022 build for Open Source CI (T230460952).

Depends on D77424529

Reviewed By: pdillinger

Differential Revision: D78107313

fbshipit-source-id: 6177448e1015c239abcebb0e68470dfd841b6fa0
---
 db/log_reader.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/db/log_reader.cc b/db/log_reader.cc
index fe5bad609724..0f0e25033ab5 100644
--- a/db/log_reader.cc
+++ b/db/log_reader.cc
@@ -353,7 +353,7 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch,
       }
     }
   }
-  __builtin_unreachable();
+  return false;
 }
 
 void Reader::MaybeVerifyPredecessorWALInfo(

From f9f7ad702c27c1058149e9eebd0665bfa9660186 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Thu, 10 Jul 2025 13:23:15 -0700
Subject: [PATCH 173/500] Move some tests from db_test(2) to compression_test
 (#13763)

Summary:
... to improve compilation times on db_test and db_test2 and to consolidate more compression-related tests into compression_test.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13763

Test Plan:
existing tests, and seems like I haven't thrown anything away:
```
$ git diff | grep -Ec '^[-]' # lines removed
1535
$ git diff | grep -Ec '^[+]' # lines added
1535
$
```

Reviewed By: hx235

Differential Revision: D78103064

Pulled By: pdillinger

fbshipit-source-id: 9cb4c1b2473d8928f890e72d3a9b5012617819a8
---
 db/db_test.cc            |  292 --------
 db/db_test2.cc           | 1235 ------------------------------
 util/compression_test.cc | 1537 +++++++++++++++++++++++++++++++++++++-
 3 files changed, 1532 insertions(+), 1532 deletions(-)

diff --git a/db/db_test.cc b/db/db_test.cc
index 276b330f6c0b..1919be904c23 100644
--- a/db/db_test.cc
+++ b/db/db_test.cc
@@ -1278,12 +1278,6 @@ class DelayFilterFactory : public CompactionFilterFactory {
 };
 }  // anonymous namespace
 
-static std::string CompressibleString(Random* rnd, int len) {
-  std::string r;
-  test::CompressibleString(rnd, 0.8, len, &r);
-  return r;
-}
-
 TEST_F(DBTest, FailMoreDbPaths) {
   Options options = CurrentOptions();
   options.db_paths.emplace_back(dbname_, 10000000);
@@ -5407,271 +5401,6 @@ TEST_F(DBTest, FlushOnDestroy) {
   CancelAllBackgroundWork(db_);
 }
 
-TEST_F(DBTest, DynamicLevelCompressionPerLevel) {
-  if (!Snappy_Supported()) {
-    return;
-  }
-  const int kNKeys = 120;
-  int keys[kNKeys];
-  for (int i = 0; i < kNKeys; i++) {
-    keys[i] = i;
-  }
-
-  Random rnd(301);
-  Options options;
-  options.env = env_;
-  options.create_if_missing = true;
-  options.db_write_buffer_size = 20480;
-  options.write_buffer_size = 20480;
-  options.max_write_buffer_number = 2;
-  options.level0_file_num_compaction_trigger = 2;
-  options.level0_slowdown_writes_trigger = 2;
-  options.level0_stop_writes_trigger = 2;
-  options.target_file_size_base = 20480;
-  options.level_compaction_dynamic_level_bytes = true;
-  options.max_bytes_for_level_base = 102400;
-  options.max_bytes_for_level_multiplier = 4;
-  options.max_background_compactions = 1;
-  options.num_levels = 5;
-  options.statistics = CreateDBStatistics();
-
-  options.compression_per_level.resize(3);
-  // No compression for L0
-  options.compression_per_level[0] = kNoCompression;
-  // No compression for the Ln whre L0 is compacted to
-  options.compression_per_level[1] = kNoCompression;
-  // Snappy compression for Ln+1
-  options.compression_per_level[2] = kSnappyCompression;
-
-  OnFileDeletionListener* listener = new OnFileDeletionListener();
-  options.listeners.emplace_back(listener);
-
-  DestroyAndReopen(options);
-
-  // Insert more than 80K. L4 should be base level. Neither L0 nor L4 should
-  // be compressed, so there shouldn't be any compression.
-  for (int i = 0; i < 20; i++) {
-    ASSERT_OK(Put(Key(keys[i]), CompressibleString(&rnd, 4000)));
-    ASSERT_OK(dbfull()->TEST_WaitForBackgroundWork());
-  }
-  ASSERT_OK(Flush());
-  ASSERT_OK(dbfull()->TEST_WaitForCompact());
-
-  ASSERT_EQ(NumTableFilesAtLevel(1), 0);
-  ASSERT_EQ(NumTableFilesAtLevel(2), 0);
-  ASSERT_EQ(NumTableFilesAtLevel(3), 0);
-  ASSERT_TRUE(NumTableFilesAtLevel(0) > 0 || NumTableFilesAtLevel(4) > 0);
-
-  // Verify there was no compression
-  auto num_block_compressed =
-      options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED);
-  ASSERT_EQ(num_block_compressed, 0);
-
-  // Insert 400KB and there will be some files end up in L3. According to the
-  // above compression settings for each level, there will be some compression.
-  ASSERT_OK(options.statistics->Reset());
-  ASSERT_EQ(num_block_compressed, 0);
-  for (int i = 20; i < 120; i++) {
-    ASSERT_OK(Put(Key(keys[i]), CompressibleString(&rnd, 4000)));
-    ASSERT_OK(dbfull()->TEST_WaitForBackgroundWork());
-  }
-  ASSERT_OK(Flush());
-  ASSERT_OK(dbfull()->TEST_WaitForCompact());
-  ASSERT_EQ(NumTableFilesAtLevel(1), 0);
-  ASSERT_EQ(NumTableFilesAtLevel(2), 0);
-  ASSERT_GE(NumTableFilesAtLevel(3), 1);
-  ASSERT_GE(NumTableFilesAtLevel(4), 1);
-
-  // Verify there was compression
-  num_block_compressed =
-      options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED);
-  ASSERT_GT(num_block_compressed, 0);
-
-  // Make sure data in files in L3 is not compacted by removing all files
-  // in L4 and calculate number of rows
-  ASSERT_OK(dbfull()->SetOptions({
-      {"disable_auto_compactions", "true"},
-  }));
-  ColumnFamilyMetaData cf_meta;
-  db_->GetColumnFamilyMetaData(&cf_meta);
-
-  // Ensure that L1+ files are non-overlapping and together with L0 encompass
-  // full key range between smallestkey and largestkey from CF file metadata.
-  int largestkey_in_prev_level = -1;
-  int keys_found = 0;
-  for (int level = (int)cf_meta.levels.size() - 1; level >= 0; level--) {
-    int files_in_level = (int)cf_meta.levels[level].files.size();
-    int largestkey_in_prev_file = -1;
-    for (int j = 0; j < files_in_level; j++) {
-      int smallestkey = IdFromKey(cf_meta.levels[level].files[j].smallestkey);
-      int largestkey = IdFromKey(cf_meta.levels[level].files[j].largestkey);
-      int num_entries = (int)cf_meta.levels[level].files[j].num_entries;
-      ASSERT_EQ(num_entries, largestkey - smallestkey + 1);
-      keys_found += num_entries;
-      if (level > 0) {
-        if (j == 0) {
-          ASSERT_GT(smallestkey, largestkey_in_prev_level);
-        }
-        if (j > 0) {
-          ASSERT_GT(smallestkey, largestkey_in_prev_file);
-        }
-        if (j == files_in_level - 1) {
-          largestkey_in_prev_level = largestkey;
-        }
-      }
-      largestkey_in_prev_file = largestkey;
-    }
-  }
-  ASSERT_EQ(keys_found, kNKeys);
-
-  for (const auto& file : cf_meta.levels[4].files) {
-    listener->SetExpectedFileName(dbname_ + file.name);
-    const RangeOpt ranges(file.smallestkey, file.largestkey);
-    // Given verification from above, we're guaranteed that by deleting all the
-    // files in [<smallestkey>, <largestkey>] range, we're effectively deleting
-    // that very single file and nothing more.
-    EXPECT_OK(dbfull()->DeleteFilesInRanges(dbfull()->DefaultColumnFamily(),
-                                            &ranges, true /* include_end */));
-  }
-  listener->VerifyMatchedCount(cf_meta.levels[4].files.size());
-
-  int num_keys = 0;
-  std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
-  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
-    num_keys++;
-  }
-  ASSERT_OK(iter->status());
-
-  ASSERT_EQ(NumTableFilesAtLevel(1), 0);
-  ASSERT_EQ(NumTableFilesAtLevel(2), 0);
-  ASSERT_GE(NumTableFilesAtLevel(3), 1);
-  ASSERT_EQ(NumTableFilesAtLevel(4), 0);
-
-  ASSERT_GT(SizeAtLevel(0) + SizeAtLevel(3), num_keys * 4000U + num_keys * 10U);
-}
-
-TEST_F(DBTest, DynamicLevelCompressionPerLevel2) {
-  if (!Snappy_Supported() || !LZ4_Supported() || !Zlib_Supported()) {
-    return;
-  }
-  const int kNKeys = 500;
-  int keys[kNKeys];
-  for (int i = 0; i < kNKeys; i++) {
-    keys[i] = i;
-  }
-  RandomShuffle(std::begin(keys), std::end(keys));
-
-  Random rnd(301);
-  Options options;
-  options.create_if_missing = true;
-  options.db_write_buffer_size = 6000000;
-  options.write_buffer_size = 600000;
-  options.max_write_buffer_number = 2;
-  options.level0_file_num_compaction_trigger = 2;
-  options.level0_slowdown_writes_trigger = 2;
-  options.level0_stop_writes_trigger = 2;
-  options.soft_pending_compaction_bytes_limit = 1024 * 1024;
-  options.target_file_size_base = 20;
-  options.env = env_;
-  options.level_compaction_dynamic_level_bytes = true;
-  options.max_bytes_for_level_base = 200;
-  options.max_bytes_for_level_multiplier = 8;
-  options.max_background_compactions = 1;
-  options.num_levels = 5;
-  std::shared_ptr<mock::MockTableFactory> mtf(new mock::MockTableFactory);
-  options.table_factory = mtf;
-
-  options.compression_per_level.resize(3);
-  options.compression_per_level[0] = kNoCompression;
-  options.compression_per_level[1] = kLZ4Compression;
-  options.compression_per_level[2] = kZlibCompression;
-
-  DestroyAndReopen(options);
-  // When base level is L4, L4 is LZ4.
-  std::atomic<int> num_zlib(0);
-  std::atomic<int> num_lz4(0);
-  std::atomic<int> num_no(0);
-  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
-      "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
-        Compaction* compaction = static_cast<Compaction*>(arg);
-        if (compaction->output_level() == 4) {
-          ASSERT_TRUE(compaction->output_compression() == kLZ4Compression);
-          num_lz4.fetch_add(1);
-        }
-      });
-  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
-      "FlushJob::WriteLevel0Table:output_compression", [&](void* arg) {
-        auto* compression = static_cast<CompressionType*>(arg);
-        ASSERT_TRUE(*compression == kNoCompression);
-        num_no.fetch_add(1);
-      });
-  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
-
-  for (int i = 0; i < 100; i++) {
-    std::string value = rnd.RandomString(200);
-    ASSERT_OK(Put(Key(keys[i]), value));
-    if (i % 25 == 24) {
-      ASSERT_OK(Flush());
-      ASSERT_OK(dbfull()->TEST_WaitForCompact());
-    }
-  }
-
-  ASSERT_OK(Flush());
-  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
-  ASSERT_OK(dbfull()->TEST_WaitForCompact());
-  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
-  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
-
-  ASSERT_EQ(NumTableFilesAtLevel(1), 0);
-  ASSERT_EQ(NumTableFilesAtLevel(2), 0);
-  ASSERT_EQ(NumTableFilesAtLevel(3), 0);
-  ASSERT_GT(NumTableFilesAtLevel(4), 0);
-  ASSERT_GT(num_no.load(), 2);
-  ASSERT_GT(num_lz4.load(), 0);
-  int prev_num_files_l4 = NumTableFilesAtLevel(4);
-
-  // After base level turn L4->L3, L3 becomes LZ4 and L4 becomes Zlib
-  num_lz4.store(0);
-  num_no.store(0);
-  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
-      "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
-        Compaction* compaction = static_cast<Compaction*>(arg);
-        if (compaction->output_level() == 4 && compaction->start_level() == 3) {
-          ASSERT_TRUE(compaction->output_compression() == kZlibCompression);
-          num_zlib.fetch_add(1);
-        } else {
-          ASSERT_TRUE(compaction->output_compression() == kLZ4Compression);
-          num_lz4.fetch_add(1);
-        }
-      });
-  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
-      "FlushJob::WriteLevel0Table:output_compression", [&](void* arg) {
-        auto* compression = static_cast<CompressionType*>(arg);
-        ASSERT_TRUE(*compression == kNoCompression);
-        num_no.fetch_add(1);
-      });
-  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
-
-  for (int i = 101; i < 500; i++) {
-    std::string value = rnd.RandomString(200);
-    ASSERT_OK(Put(Key(keys[i]), value));
-    if (i % 100 == 99) {
-      ASSERT_OK(Flush());
-      ASSERT_OK(dbfull()->TEST_WaitForCompact());
-    }
-  }
-
-  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
-  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
-  ASSERT_EQ(NumTableFilesAtLevel(1), 0);
-  ASSERT_EQ(NumTableFilesAtLevel(2), 0);
-  ASSERT_GT(NumTableFilesAtLevel(3), 0);
-  ASSERT_GT(NumTableFilesAtLevel(4), prev_num_files_l4);
-  ASSERT_GT(num_no.load(), 2);
-  ASSERT_GT(num_lz4.load(), 0);
-  ASSERT_GT(num_zlib.load(), 0);
-}
-
 TEST_F(DBTest, DynamicCompactionOptions) {
   // minimum write buffer size is enforced at 64KB
   const uint64_t k32KB = 1 << 15;
@@ -7349,27 +7078,6 @@ TEST_F(DBTest, LastWriteBufferDelay) {
 }
 #endif  // !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION)
 
-TEST_F(DBTest, FailWhenCompressionNotSupportedTest) {
-  CompressionType compressions[] = {kZlibCompression, kBZip2Compression,
-                                    kLZ4Compression, kLZ4HCCompression,
-                                    kXpressCompression};
-  for (auto comp : compressions) {
-    if (!CompressionTypeSupported(comp)) {
-      // not supported, we should fail the Open()
-      Options options = CurrentOptions();
-      options.compression = comp;
-      ASSERT_TRUE(!TryReopen(options).ok());
-      // Try if CreateColumnFamily also fails
-      options.compression = kNoCompression;
-      ASSERT_OK(TryReopen(options));
-      ColumnFamilyOptions cf_options(options);
-      cf_options.compression = comp;
-      ColumnFamilyHandle* handle;
-      ASSERT_TRUE(!db_->CreateColumnFamily(cf_options, "name", &handle).ok());
-    }
-  }
-}
-
 TEST_F(DBTest, CreateColumnFamilyShouldFailOnIncompatibleOptions) {
   Options options = CurrentOptions();
   options.max_open_files = 100;
diff --git a/db/db_test2.cc b/db/db_test2.cc
index 4eabda2ba258..0d9d306e6bf2 100644
--- a/db/db_test2.cc
+++ b/db/db_test2.cc
@@ -24,13 +24,11 @@
 #include "rocksdb/persistent_cache.h"
 #include "rocksdb/trace_record.h"
 #include "rocksdb/trace_record_result.h"
-#include "rocksdb/utilities/object_registry.h"
 #include "rocksdb/utilities/replayer.h"
 #include "rocksdb/wal_filter.h"
 #include "test_util/testutil.h"
 #include "util/defer.h"
 #include "util/random.h"
-#include "util/simple_mixed_compressor.h"
 #include "utilities/fault_injection_env.h"
 
 namespace ROCKSDB_NAMESPACE {
@@ -1187,1239 +1185,6 @@ TEST_F(DBTest2, WalFilterTestWithColumnFamilies) {
   ASSERT_EQ(index, keys_cf.size());
 }
 
-TEST_F(DBTest2, PresetCompressionDict) {
-  // Verifies that compression ratio improves when dictionary is enabled, and
-  // improves even further when the dictionary is trained by ZSTD.
-  const size_t kBlockSizeBytes = 4 << 10;
-  const size_t kL0FileBytes = 128 << 10;
-  const size_t kApproxPerBlockOverheadBytes = 50;
-  const int kNumL0Files = 5;
-
-  Options options;
-  // Make sure to use any custom env that the test is configured with.
-  options.env = CurrentOptions().env;
-  options.allow_concurrent_memtable_write = false;
-  options.arena_block_size = kBlockSizeBytes;
-  options.create_if_missing = true;
-  options.disable_auto_compactions = true;
-  options.level0_file_num_compaction_trigger = kNumL0Files;
-  options.memtable_factory.reset(
-      test::NewSpecialSkipListFactory(kL0FileBytes / kBlockSizeBytes));
-  options.num_levels = 2;
-  options.target_file_size_base = kL0FileBytes;
-  options.target_file_size_multiplier = 2;
-  options.write_buffer_size = kL0FileBytes;
-  BlockBasedTableOptions table_options;
-  table_options.block_size = kBlockSizeBytes;
-  std::vector<CompressionType> compression_types;
-  if (Zlib_Supported()) {
-    compression_types.push_back(kZlibCompression);
-  }
-#if LZ4_VERSION_NUMBER >= 10400  // r124+
-  compression_types.push_back(kLZ4Compression);
-  compression_types.push_back(kLZ4HCCompression);
-#endif  // LZ4_VERSION_NUMBER >= 10400
-  if (ZSTD_Supported()) {
-    compression_types.push_back(kZSTD);
-  }
-
-  enum DictionaryTypes : int {
-    kWithoutDict,
-    kWithDict,
-    kWithZSTDfinalizeDict,
-    kWithZSTDTrainedDict,
-    kDictEnd,
-  };
-
-  for (auto compression_type : compression_types) {
-    options.compression = compression_type;
-    size_t bytes_without_dict = 0;
-    size_t bytes_with_dict = 0;
-    size_t bytes_with_zstd_finalize_dict = 0;
-    size_t bytes_with_zstd_trained_dict = 0;
-    for (int i = kWithoutDict; i < kDictEnd; i++) {
-      // First iteration: compress without preset dictionary
-      // Second iteration: compress with preset dictionary
-      // Third iteration (zstd only): compress with zstd-trained dictionary
-      //
-      // To make sure the compression dictionary has the intended effect, we
-      // verify the compressed size is smaller in successive iterations. Also in
-      // the non-first iterations, verify the data we get out is the same data
-      // we put in.
-      switch (i) {
-        case kWithoutDict:
-          options.compression_opts.max_dict_bytes = 0;
-          options.compression_opts.zstd_max_train_bytes = 0;
-          break;
-        case kWithDict:
-          options.compression_opts.max_dict_bytes = kBlockSizeBytes;
-          options.compression_opts.zstd_max_train_bytes = 0;
-          break;
-        case kWithZSTDfinalizeDict:
-          if (compression_type != kZSTD ||
-              !ZSTD_FinalizeDictionarySupported()) {
-            continue;
-          }
-          options.compression_opts.max_dict_bytes = kBlockSizeBytes;
-          options.compression_opts.zstd_max_train_bytes = kL0FileBytes;
-          options.compression_opts.use_zstd_dict_trainer = false;
-          break;
-        case kWithZSTDTrainedDict:
-          if (compression_type != kZSTD || !ZSTD_TrainDictionarySupported()) {
-            continue;
-          }
-          options.compression_opts.max_dict_bytes = kBlockSizeBytes;
-          options.compression_opts.zstd_max_train_bytes = kL0FileBytes;
-          options.compression_opts.use_zstd_dict_trainer = true;
-          break;
-        default:
-          assert(false);
-      }
-
-      options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
-      options.table_factory.reset(NewBlockBasedTableFactory(table_options));
-      CreateAndReopenWithCF({"pikachu"}, options);
-      Random rnd(301);
-      std::string seq_datas[10];
-      for (int j = 0; j < 10; ++j) {
-        seq_datas[j] =
-            rnd.RandomString(kBlockSizeBytes - kApproxPerBlockOverheadBytes);
-      }
-
-      ASSERT_EQ(0, NumTableFilesAtLevel(0, 1));
-      for (int j = 0; j < kNumL0Files; ++j) {
-        for (size_t k = 0; k < kL0FileBytes / kBlockSizeBytes + 1; ++k) {
-          auto key_num = j * (kL0FileBytes / kBlockSizeBytes) + k;
-          ASSERT_OK(Put(1, Key(static_cast<int>(key_num)),
-                        seq_datas[(key_num / 10) % 10]));
-        }
-        ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
-        ASSERT_EQ(j + 1, NumTableFilesAtLevel(0, 1));
-      }
-      ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1],
-                                            true /* disallow_trivial_move */));
-      ASSERT_EQ(0, NumTableFilesAtLevel(0, 1));
-      ASSERT_GT(NumTableFilesAtLevel(1, 1), 0);
-
-      // Get the live sst files size
-      size_t total_sst_bytes = TotalSize(1);
-      if (i == kWithoutDict) {
-        bytes_without_dict = total_sst_bytes;
-      } else if (i == kWithDict) {
-        bytes_with_dict = total_sst_bytes;
-      } else if (i == kWithZSTDfinalizeDict) {
-        bytes_with_zstd_finalize_dict = total_sst_bytes;
-      } else if (i == kWithZSTDTrainedDict) {
-        bytes_with_zstd_trained_dict = total_sst_bytes;
-      }
-
-      for (size_t j = 0; j < kNumL0Files * (kL0FileBytes / kBlockSizeBytes);
-           j++) {
-        ASSERT_EQ(seq_datas[(j / 10) % 10], Get(1, Key(static_cast<int>(j))));
-      }
-      if (i == kWithDict) {
-        ASSERT_GT(bytes_without_dict, bytes_with_dict);
-      } else if (i == kWithZSTDTrainedDict) {
-        // In zstd compression, it is sometimes possible that using a finalized
-        // dictionary does not get as good a compression ratio as raw content
-        // dictionary. But using a dictionary should always get better
-        // compression ratio than not using one.
-        ASSERT_TRUE(bytes_with_dict > bytes_with_zstd_finalize_dict ||
-                    bytes_without_dict > bytes_with_zstd_finalize_dict);
-      } else if (i == kWithZSTDTrainedDict) {
-        // In zstd compression, it is sometimes possible that using a trained
-        // dictionary does not get as good a compression ratio as without
-        // training.
-        // But using a dictionary (with or without training) should always get
-        // better compression ratio than not using one.
-        ASSERT_TRUE(bytes_with_dict > bytes_with_zstd_trained_dict ||
-                    bytes_without_dict > bytes_with_zstd_trained_dict);
-      }
-
-      DestroyAndReopen(options);
-    }
-  }
-}
-
-TEST_F(DBTest2, PresetCompressionDictLocality) {
-  if (!ZSTD_Supported()) {
-    return;
-  }
-  // Verifies that compression dictionary is generated from local data. The
-  // verification simply checks all output SSTs have different compression
-  // dictionaries. We do not verify effectiveness as that'd likely be flaky in
-  // the future.
-  const int kNumEntriesPerFile = 1 << 10;  // 1KB
-  const int kNumBytesPerEntry = 1 << 10;   // 1KB
-  const int kNumFiles = 4;
-  Options options = CurrentOptions();
-  options.compression = kZSTD;
-  options.compression_opts.max_dict_bytes = 1 << 14;        // 16KB
-  options.compression_opts.zstd_max_train_bytes = 1 << 18;  // 256KB
-  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
-  options.target_file_size_base = kNumEntriesPerFile * kNumBytesPerEntry;
-  BlockBasedTableOptions table_options;
-  table_options.cache_index_and_filter_blocks = true;
-  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
-  Reopen(options);
-
-  Random rnd(301);
-  for (int i = 0; i < kNumFiles; ++i) {
-    for (int j = 0; j < kNumEntriesPerFile; ++j) {
-      ASSERT_OK(Put(Key(i * kNumEntriesPerFile + j),
-                    rnd.RandomString(kNumBytesPerEntry)));
-    }
-    ASSERT_OK(Flush());
-    MoveFilesToLevel(1);
-    ASSERT_EQ(NumTableFilesAtLevel(1), i + 1);
-  }
-
-  // Store all the dictionaries generated during a full compaction.
-  std::vector<std::string> compression_dicts;
-  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
-      "BlockBasedTableBuilder::WriteCompressionDictBlock:RawDict",
-      [&](void* arg) {
-        compression_dicts.emplace_back(static_cast<Slice*>(arg)->ToString());
-      });
-  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
-  CompactRangeOptions compact_range_opts;
-  compact_range_opts.bottommost_level_compaction =
-      BottommostLevelCompaction::kForceOptimized;
-  ASSERT_OK(db_->CompactRange(compact_range_opts, nullptr, nullptr));
-
-  // Dictionary compression should not be so good as to compress four totally
-  // random files into one. If it does then there's probably something wrong
-  // with the test.
-  ASSERT_GT(NumTableFilesAtLevel(1), 1);
-
-  // Furthermore, there should be one compression dictionary generated per file.
-  // And they should all be different from each other.
-  ASSERT_EQ(NumTableFilesAtLevel(1),
-            static_cast<int>(compression_dicts.size()));
-  for (size_t i = 1; i < compression_dicts.size(); ++i) {
-    std::string& a = compression_dicts[i - 1];
-    std::string& b = compression_dicts[i];
-    size_t alen = a.size();
-    size_t blen = b.size();
-    ASSERT_TRUE(alen != blen || memcmp(a.data(), b.data(), alen) != 0);
-  }
-}
-
-class PresetCompressionDictTest
-    : public DBTestBase,
-      public testing::WithParamInterface<std::tuple<CompressionType, bool>> {
- public:
-  PresetCompressionDictTest()
-      : DBTestBase("db_test2", false /* env_do_fsync */),
-        compression_type_(std::get<0>(GetParam())),
-        bottommost_(std::get<1>(GetParam())) {}
-
- protected:
-  const CompressionType compression_type_;
-  const bool bottommost_;
-};
-
-INSTANTIATE_TEST_CASE_P(
-    DBTest2, PresetCompressionDictTest,
-    ::testing::Combine(::testing::ValuesIn(GetSupportedDictCompressions()),
-                       ::testing::Bool()));
-
-TEST_P(PresetCompressionDictTest, Flush) {
-  // Verifies that dictionary is generated and written during flush only when
-  // `ColumnFamilyOptions::compression` enables dictionary. Also verifies the
-  // size of the dictionary is within expectations according to the limit on
-  // buffering set by `CompressionOptions::max_dict_buffer_bytes`.
-  const size_t kValueLen = 256;
-  const size_t kKeysPerFile = 1 << 10;
-  const size_t kDictLen = 16 << 10;
-  const size_t kBlockLen = 4 << 10;
-
-  Options options = CurrentOptions();
-  if (bottommost_) {
-    options.bottommost_compression = compression_type_;
-    options.bottommost_compression_opts.enabled = true;
-    options.bottommost_compression_opts.max_dict_bytes = kDictLen;
-    options.bottommost_compression_opts.max_dict_buffer_bytes = kBlockLen;
-  } else {
-    options.compression = compression_type_;
-    options.compression_opts.max_dict_bytes = kDictLen;
-    options.compression_opts.max_dict_buffer_bytes = kBlockLen;
-  }
-  options.memtable_factory.reset(test::NewSpecialSkipListFactory(kKeysPerFile));
-  options.statistics = CreateDBStatistics();
-  BlockBasedTableOptions bbto;
-  bbto.block_size = kBlockLen;
-  bbto.cache_index_and_filter_blocks = true;
-  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
-  Reopen(options);
-
-  Random rnd(301);
-  for (size_t i = 0; i <= kKeysPerFile; ++i) {
-    ASSERT_OK(Put(Key(static_cast<int>(i)), rnd.RandomString(kValueLen)));
-  }
-  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
-
-  // We can use `BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT` to detect whether a
-  // compression dictionary exists since dictionaries would be preloaded when
-  // the flush finishes.
-  if (bottommost_) {
-    // Flush is never considered bottommost. This should change in the future
-    // since flushed files may have nothing underneath them, like the one in
-    // this test case.
-    ASSERT_EQ(
-        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
-        0);
-  } else {
-    ASSERT_GT(
-        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
-        0);
-    // TODO(ajkr): fix the below assertion to work with ZSTD. The expectation on
-    // number of bytes needs to be adjusted in case the cached block is in
-    // ZSTD's digested dictionary format.
-    if (compression_type_ != kZSTD) {
-      // Although we limited buffering to `kBlockLen`, there may be up to two
-      // blocks of data included in the dictionary since we only check limit
-      // after each block is built.
-      ASSERT_LE(TestGetTickerCount(options,
-                                   BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
-                2 * kBlockLen);
-    }
-  }
-}
-
-TEST_P(PresetCompressionDictTest, CompactNonBottommost) {
-  // Verifies that dictionary is generated and written during compaction to
-  // non-bottommost level only when `ColumnFamilyOptions::compression` enables
-  // dictionary. Also verifies the size of the dictionary is within expectations
-  // according to the limit on buffering set by
-  // `CompressionOptions::max_dict_buffer_bytes`.
-  const size_t kValueLen = 256;
-  const size_t kKeysPerFile = 1 << 10;
-  const size_t kDictLen = 16 << 10;
-  const size_t kBlockLen = 4 << 10;
-
-  Options options = CurrentOptions();
-  if (bottommost_) {
-    options.bottommost_compression = compression_type_;
-    options.bottommost_compression_opts.enabled = true;
-    options.bottommost_compression_opts.max_dict_bytes = kDictLen;
-    options.bottommost_compression_opts.max_dict_buffer_bytes = kBlockLen;
-  } else {
-    options.compression = compression_type_;
-    options.compression_opts.max_dict_bytes = kDictLen;
-    options.compression_opts.max_dict_buffer_bytes = kBlockLen;
-  }
-  options.disable_auto_compactions = true;
-  options.statistics = CreateDBStatistics();
-  BlockBasedTableOptions bbto;
-  bbto.block_size = kBlockLen;
-  bbto.cache_index_and_filter_blocks = true;
-  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
-  Reopen(options);
-
-  Random rnd(301);
-  for (size_t j = 0; j <= kKeysPerFile; ++j) {
-    ASSERT_OK(Put(Key(static_cast<int>(j)), rnd.RandomString(kValueLen)));
-  }
-  ASSERT_OK(Flush());
-  MoveFilesToLevel(2);
-
-  for (int i = 0; i < 2; ++i) {
-    for (size_t j = 0; j <= kKeysPerFile; ++j) {
-      ASSERT_OK(Put(Key(static_cast<int>(j)), rnd.RandomString(kValueLen)));
-    }
-    ASSERT_OK(Flush());
-  }
-  ASSERT_EQ("2,0,1", FilesPerLevel(0));
-
-  uint64_t prev_compression_dict_bytes_inserted =
-      TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT);
-  // This L0->L1 compaction merges the two L0 files into L1. The produced L1
-  // file is not bottommost due to the existing L2 file covering the same key-
-  // range.
-  ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr));
-  ASSERT_EQ("0,1,1", FilesPerLevel(0));
-  // We can use `BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT` to detect whether a
-  // compression dictionary exists since dictionaries would be preloaded when
-  // the compaction finishes.
-  if (bottommost_) {
-    ASSERT_EQ(
-        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
-        prev_compression_dict_bytes_inserted);
-  } else {
-    ASSERT_GT(
-        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
-        prev_compression_dict_bytes_inserted);
-    // TODO(ajkr): fix the below assertion to work with ZSTD. The expectation on
-    // number of bytes needs to be adjusted in case the cached block is in
-    // ZSTD's digested dictionary format.
-    if (compression_type_ != kZSTD) {
-      // Although we limited buffering to `kBlockLen`, there may be up to two
-      // blocks of data included in the dictionary since we only check limit
-      // after each block is built.
-      ASSERT_LE(TestGetTickerCount(options,
-                                   BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
-                prev_compression_dict_bytes_inserted + 2 * kBlockLen);
-    }
-  }
-}
-
-TEST_P(PresetCompressionDictTest, CompactBottommost) {
-  // Verifies that dictionary is generated and written during compaction to
-  // non-bottommost level only when either `ColumnFamilyOptions::compression` or
-  // `ColumnFamilyOptions::bottommost_compression` enables dictionary. Also
-  // verifies the size of the dictionary is within expectations according to the
-  // limit on buffering set by `CompressionOptions::max_dict_buffer_bytes`.
-  const size_t kValueLen = 256;
-  const size_t kKeysPerFile = 1 << 10;
-  const size_t kDictLen = 16 << 10;
-  const size_t kBlockLen = 4 << 10;
-
-  Options options = CurrentOptions();
-  if (bottommost_) {
-    options.bottommost_compression = compression_type_;
-    options.bottommost_compression_opts.enabled = true;
-    options.bottommost_compression_opts.max_dict_bytes = kDictLen;
-    options.bottommost_compression_opts.max_dict_buffer_bytes = kBlockLen;
-  } else {
-    options.compression = compression_type_;
-    options.compression_opts.max_dict_bytes = kDictLen;
-    options.compression_opts.max_dict_buffer_bytes = kBlockLen;
-  }
-  options.disable_auto_compactions = true;
-  options.statistics = CreateDBStatistics();
-  BlockBasedTableOptions bbto;
-  bbto.block_size = kBlockLen;
-  bbto.cache_index_and_filter_blocks = true;
-  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
-  Reopen(options);
-
-  Random rnd(301);
-  for (int i = 0; i < 2; ++i) {
-    for (size_t j = 0; j <= kKeysPerFile; ++j) {
-      ASSERT_OK(Put(Key(static_cast<int>(j)), rnd.RandomString(kValueLen)));
-    }
-    ASSERT_OK(Flush());
-  }
-  ASSERT_EQ("2", FilesPerLevel(0));
-
-  uint64_t prev_compression_dict_bytes_inserted =
-      TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT);
-  CompactRangeOptions cro;
-  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
-  ASSERT_EQ("0,1", FilesPerLevel(0));
-  ASSERT_GT(
-      TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
-      prev_compression_dict_bytes_inserted);
-  // TODO(ajkr): fix the below assertion to work with ZSTD. The expectation on
-  // number of bytes needs to be adjusted in case the cached block is in ZSTD's
-  // digested dictionary format.
-  if (compression_type_ != kZSTD) {
-    // Although we limited buffering to `kBlockLen`, there may be up to two
-    // blocks of data included in the dictionary since we only check limit after
-    // each block is built.
-    ASSERT_LE(
-        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
-        prev_compression_dict_bytes_inserted + 2 * kBlockLen);
-  }
-}
-
-class CompactionCompressionListener : public EventListener {
- public:
-  explicit CompactionCompressionListener(Options* db_options)
-      : db_options_(db_options) {}
-
-  void OnCompactionCompleted(DB* db, const CompactionJobInfo& ci) override {
-    // Figure out last level with files
-    int bottommost_level = 0;
-    for (int level = 0; level < db->NumberLevels(); level++) {
-      std::string files_at_level;
-      ASSERT_TRUE(
-          db->GetProperty("rocksdb.num-files-at-level" + std::to_string(level),
-                          &files_at_level));
-      if (files_at_level != "0") {
-        bottommost_level = level;
-      }
-    }
-
-    if (db_options_->bottommost_compression != kDisableCompressionOption &&
-        ci.output_level == bottommost_level) {
-      ASSERT_EQ(ci.compression, db_options_->bottommost_compression);
-    } else if (db_options_->compression_per_level.size() != 0) {
-      ASSERT_EQ(ci.compression,
-                db_options_->compression_per_level[ci.output_level]);
-    } else {
-      ASSERT_EQ(ci.compression, db_options_->compression);
-    }
-    max_level_checked = std::max(max_level_checked, ci.output_level);
-  }
-
-  int max_level_checked = 0;
-  const Options* db_options_;
-};
-
-enum CompressionFailureType {
-  kTestCompressionFail,
-  kTestDecompressionFail,
-  kTestDecompressionCorruption
-};
-
-class CompressionFailuresTest
-    : public DBTest2,
-      public testing::WithParamInterface<std::tuple<
-          CompressionFailureType, CompressionType, uint32_t, uint32_t>> {
- public:
-  CompressionFailuresTest() {
-    std::tie(compression_failure_type_, compression_type_,
-             compression_max_dict_bytes_, compression_parallel_threads_) =
-        GetParam();
-  }
-
-  CompressionFailureType compression_failure_type_ = kTestCompressionFail;
-  CompressionType compression_type_ = kNoCompression;
-  uint32_t compression_max_dict_bytes_ = 0;
-  uint32_t compression_parallel_threads_ = 0;
-};
-
-INSTANTIATE_TEST_CASE_P(
-    DBTest2, CompressionFailuresTest,
-    ::testing::Combine(::testing::Values(kTestCompressionFail,
-                                         kTestDecompressionFail,
-                                         kTestDecompressionCorruption),
-                       ::testing::ValuesIn(GetSupportedCompressions()),
-                       ::testing::Values(0, 10), ::testing::Values(1, 4)));
-
-TEST_P(CompressionFailuresTest, CompressionFailures) {
-  if (compression_type_ == kNoCompression) {
-    return;
-  }
-
-  Options options = CurrentOptions();
-  options.level0_file_num_compaction_trigger = 2;
-  options.max_bytes_for_level_base = 1024;
-  options.max_bytes_for_level_multiplier = 2;
-  options.num_levels = 7;
-  options.max_background_compactions = 1;
-  options.target_file_size_base = 512;
-
-  BlockBasedTableOptions table_options;
-  table_options.block_size = 512;
-  table_options.verify_compression = true;
-  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
-
-  options.compression = compression_type_;
-  options.compression_opts.parallel_threads = compression_parallel_threads_;
-  options.compression_opts.max_dict_bytes = compression_max_dict_bytes_;
-  options.bottommost_compression_opts.parallel_threads =
-      compression_parallel_threads_;
-  options.bottommost_compression_opts.max_dict_bytes =
-      compression_max_dict_bytes_;
-
-  if (compression_failure_type_ == kTestCompressionFail) {
-    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
-        "CompressData:TamperWithReturnValue", [](void* arg) {
-          bool* ret = static_cast<bool*>(arg);
-          *ret = false;
-        });
-  } else if (compression_failure_type_ == kTestDecompressionFail) {
-    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
-        "DecompressBlockData:TamperWithReturnValue", [](void* arg) {
-          Status* ret = static_cast<Status*>(arg);
-          ASSERT_OK(*ret);
-          *ret = Status::Corruption("kTestDecompressionFail");
-        });
-  } else if (compression_failure_type_ == kTestDecompressionCorruption) {
-    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
-        "DecompressBlockData:TamperWithDecompressionOutput", [](void* arg) {
-          BlockContents* contents = static_cast<BlockContents*>(arg);
-          // Ensure uncompressed data != original data
-          const size_t len = contents->data.size() + 1;
-          std::unique_ptr<char[]> fake_data(new char[len]());
-          *contents = BlockContents(std::move(fake_data), len);
-        });
-  }
-
-  std::map<std::string, std::string> key_value_written;
-
-  const int kKeySize = 5;
-  const int kValUnitSize = 16;
-  const int kValSize = 256;
-  Random rnd(405);
-
-  Status s = Status::OK();
-
-  DestroyAndReopen(options);
-  // Write 10 random files
-  for (int i = 0; i < 10; i++) {
-    for (int j = 0; j < 5; j++) {
-      std::string key = rnd.RandomString(kKeySize);
-      // Ensure good compression ratio
-      std::string valueUnit = rnd.RandomString(kValUnitSize);
-      std::string value;
-      for (int k = 0; k < kValSize; k += kValUnitSize) {
-        value += valueUnit;
-      }
-      s = Put(key, value);
-      if (compression_failure_type_ == kTestCompressionFail) {
-        key_value_written[key] = value;
-        ASSERT_OK(s);
-      }
-    }
-    s = Flush();
-    if (compression_failure_type_ == kTestCompressionFail) {
-      ASSERT_OK(s);
-    }
-    s = dbfull()->TEST_WaitForCompact();
-    if (compression_failure_type_ == kTestCompressionFail) {
-      ASSERT_OK(s);
-    }
-    if (i == 4) {
-      // Make compression fail at the mid of table building
-      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
-    }
-  }
-  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
-
-  if (compression_failure_type_ == kTestCompressionFail) {
-    // Should be kNoCompression, check content consistency
-    std::unique_ptr<Iterator> db_iter(db_->NewIterator(ReadOptions()));
-    for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) {
-      std::string key = db_iter->key().ToString();
-      std::string value = db_iter->value().ToString();
-      ASSERT_NE(key_value_written.find(key), key_value_written.end());
-      ASSERT_EQ(key_value_written[key], value);
-      key_value_written.erase(key);
-    }
-    ASSERT_OK(db_iter->status());
-    ASSERT_EQ(0, key_value_written.size());
-  } else if (compression_failure_type_ == kTestDecompressionFail) {
-    ASSERT_EQ(std::string(s.getState()),
-              "Could not decompress: kTestDecompressionFail");
-  } else if (compression_failure_type_ == kTestDecompressionCorruption) {
-    ASSERT_EQ(std::string(s.getState()),
-              "Decompressed block did not match pre-compression block");
-  }
-}
-
-TEST_F(DBTest2, CompressionOptions) {
-  if (!Zlib_Supported() || !Snappy_Supported()) {
-    return;
-  }
-
-  Options options = CurrentOptions();
-  options.level0_file_num_compaction_trigger = 2;
-  options.max_bytes_for_level_base = 100;
-  options.max_bytes_for_level_multiplier = 2;
-  options.num_levels = 7;
-  options.max_background_compactions = 1;
-
-  CompactionCompressionListener* listener =
-      new CompactionCompressionListener(&options);
-  options.listeners.emplace_back(listener);
-
-  const int kKeySize = 5;
-  const int kValSize = 20;
-  Random rnd(301);
-
-  std::vector<uint32_t> compression_parallel_threads = {1, 4};
-
-  std::map<std::string, std::string> key_value_written;
-
-  for (int iter = 0; iter <= 2; iter++) {
-    listener->max_level_checked = 0;
-
-    if (iter == 0) {
-      // Use different compression algorithms for different levels but
-      // always use Zlib for bottommost level
-      options.compression_per_level = {kNoCompression,     kNoCompression,
-                                       kNoCompression,     kSnappyCompression,
-                                       kSnappyCompression, kSnappyCompression,
-                                       kZlibCompression};
-      options.compression = kNoCompression;
-      options.bottommost_compression = kZlibCompression;
-    } else if (iter == 1) {
-      // Use Snappy except for bottommost level use ZLib
-      options.compression_per_level = {};
-      options.compression = kSnappyCompression;
-      options.bottommost_compression = kZlibCompression;
-    } else if (iter == 2) {
-      // Use Snappy everywhere
-      options.compression_per_level = {};
-      options.compression = kSnappyCompression;
-      options.bottommost_compression = kDisableCompressionOption;
-    }
-
-    for (auto num_threads : compression_parallel_threads) {
-      options.compression_opts.parallel_threads = num_threads;
-      options.bottommost_compression_opts.parallel_threads = num_threads;
-
-      DestroyAndReopen(options);
-      // Write 10 random files
-      for (int i = 0; i < 10; i++) {
-        for (int j = 0; j < 5; j++) {
-          std::string key = rnd.RandomString(kKeySize);
-          std::string value = rnd.RandomString(kValSize);
-          key_value_written[key] = value;
-          ASSERT_OK(Put(key, value));
-        }
-        ASSERT_OK(Flush());
-        ASSERT_OK(dbfull()->TEST_WaitForCompact());
-      }
-
-      // Make sure that we wrote enough to check all 7 levels
-      ASSERT_EQ(listener->max_level_checked, 6);
-
-      // Make sure database content is the same as key_value_written
-      std::unique_ptr<Iterator> db_iter(db_->NewIterator(ReadOptions()));
-      for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) {
-        std::string key = db_iter->key().ToString();
-        std::string value = db_iter->value().ToString();
-        ASSERT_NE(key_value_written.find(key), key_value_written.end());
-        ASSERT_EQ(key_value_written[key], value);
-        key_value_written.erase(key);
-      }
-      ASSERT_OK(db_iter->status());
-      ASSERT_EQ(0, key_value_written.size());
-    }
-  }
-}
-
-TEST_F(DBTest2, RoundRobinManager) {
-  if (ZSTD_Supported()) {
-    auto mgr =
-        std::make_shared<RoundRobinManager>(GetBuiltinV2CompressionManager());
-
-    std::vector<std::string> values;
-    for (bool use_wrapper : {true}) {
-      SCOPED_TRACE((use_wrapper ? "With " : "No ") + std::string("wrapper"));
-
-      Options options = CurrentOptions();
-      options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
-      options.statistics->set_stats_level(StatsLevel::kExceptTimeForMutex);
-      BlockBasedTableOptions bbto;
-      bbto.enable_index_compression = false;
-      options.table_factory.reset(NewBlockBasedTableFactory(bbto));
-      options.compression_manager = use_wrapper ? mgr : nullptr;
-      DestroyAndReopen(options);
-
-      Random rnd(301);
-      constexpr int kCount = 13;
-
-      // Highly compressible blocks, except 1 non-compressible. Half of the
-      // compressible are morked for bypass and 1 marked for rejection. Values
-      // are large enough to ensure just 1 k-v per block.
-      for (int i = 0; i < kCount; ++i) {
-        std::string value;
-        if (i == 6) {
-          // One non-compressible block
-          value = rnd.RandomBinaryString(20000);
-        } else {
-          test::CompressibleString(&rnd, 0.1, 20000, &value);
-        }
-        values.push_back(value);
-        ASSERT_OK(Put(Key(i), value));
-        ASSERT_EQ(Get(Key(i)), value);
-      }
-      ASSERT_OK(Flush());
-
-      // Ensure well-formed for reads
-      for (int i = 0; i < kCount; ++i) {
-        ASSERT_NE(Get(Key(i)), "NOT_FOUND");
-        ASSERT_EQ(Get(Key(i)), values[i]);
-      }
-      ASSERT_EQ(Get(Key(kCount)), "NOT_FOUND");
-    }
-  }
-}
-
-TEST_F(DBTest2, RandomMixedCompressionManager) {
-  if (ZSTD_Supported()) {
-    auto mgr = std::make_shared<RandomMixedCompressionManager>(
-        GetBuiltinV2CompressionManager());
-    std::vector<std::string> values;
-    for (bool use_wrapper : {true}) {
-      SCOPED_TRACE((use_wrapper ? "With " : "No ") + std::string("wrapper"));
-
-      Options options = CurrentOptions();
-      options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
-      options.statistics->set_stats_level(StatsLevel::kExceptTimeForMutex);
-      BlockBasedTableOptions bbto;
-      bbto.enable_index_compression = false;
-      options.table_factory.reset(NewBlockBasedTableFactory(bbto));
-      options.compression_manager = use_wrapper ? mgr : nullptr;
-      DestroyAndReopen(options);
-
-      Random rnd(301);
-      constexpr int kCount = 13;
-
-      // Highly compressible blocks, except 1 non-compressible. Half of the
-      // compressible are morked for bypass and 1 marked for rejection. Values
-      // are large enough to ensure just 1 k-v per block.
-      for (int i = 0; i < kCount; ++i) {
-        std::string value;
-        if (i == 6) {
-          // One non-compressible block
-          value = rnd.RandomBinaryString(20000);
-        } else {
-          test::CompressibleString(&rnd, 0.1, 20000, &value);
-        }
-        values.push_back(value);
-        ASSERT_OK(Put(Key(i), value));
-        ASSERT_EQ(Get(Key(i)), value);
-      }
-      ASSERT_OK(Flush());
-
-      // Ensure well-formed for reads
-      for (int i = 0; i < kCount; ++i) {
-        ASSERT_NE(Get(Key(i)), "NOT_FOUND");
-        ASSERT_EQ(Get(Key(i)), values[i]);
-      }
-      ASSERT_EQ(Get(Key(kCount)), "NOT_FOUND");
-    }
-  }
-}
-
-TEST_F(DBTest2, CompressionManagerWrapper) {
-  // Test that we can use a custom CompressionManager to wrap the built-in
-  // CompressionManager, thus adopting a custom *strategy* based on existing
-  // algorithms. This will "mark" some blocks (in their contents) as "do not
-  // compress", i.e. no attempt to compress, and some blocks as "reject
-  // compression", i.e. compression attempted but rejected because of ratio
-  // or otherwise. These cases are distinguishable for statistics that
-  // approximate "wasted effort".
-  static std::string kDoNotCompress = "do_not_compress";
-  static std::string kRejectCompression = "reject_compression";
-
-  struct MyCompressor : public CompressorWrapper {
-    using CompressorWrapper::CompressorWrapper;
-    const char* Name() const override { return "MyCompressor"; }
-
-    Status CompressBlock(Slice uncompressed_data,
-                         std::string* compressed_output,
-                         CompressionType* out_compression_type,
-                         ManagedWorkingArea* working_area) override {
-      auto begin = uncompressed_data.data();
-      auto end = uncompressed_data.data() + uncompressed_data.size();
-      if (std::search(begin, end, kDoNotCompress.begin(),
-                      kDoNotCompress.end()) != end) {
-        // Do not attempt compression
-        EXPECT_EQ(*out_compression_type, kNoCompression);
-        return Status::OK();
-      } else if (std::search(begin, end, kRejectCompression.begin(),
-                             kRejectCompression.end()) != end) {
-        // Simulate attempted & rejected compression
-        *compressed_output = "blah";
-        EXPECT_EQ(*out_compression_type, kNoCompression);
-        return Status::OK();
-      } else {
-        return wrapped_->CompressBlock(uncompressed_data, compressed_output,
-                                       out_compression_type, working_area);
-      }
-    }
-  };
-  struct MyManager : public CompressionManagerWrapper {
-    using CompressionManagerWrapper::CompressionManagerWrapper;
-    const char* Name() const override { return "MyManager"; }
-    std::unique_ptr<Compressor> GetCompressorForSST(
-        const FilterBuildingContext& context, const CompressionOptions& opts,
-        CompressionType preferred) override {
-      return std::make_unique<MyCompressor>(
-          wrapped_->GetCompressorForSST(context, opts, preferred));
-    }
-  };
-  auto mgr = std::make_shared<MyManager>(GetBuiltinV2CompressionManager());
-
-  for (CompressionType type : GetSupportedCompressions()) {
-    for (bool use_wrapper : {false, true}) {
-      if (type == kNoCompression) {
-        continue;
-      }
-      SCOPED_TRACE("Compression type: " + std::to_string(type) +
-                   (use_wrapper ? " with " : " no ") + "wrapper");
-
-      Options options = CurrentOptions();
-      options.compression = type;
-      options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
-      options.statistics->set_stats_level(StatsLevel::kExceptTimeForMutex);
-      BlockBasedTableOptions bbto;
-      bbto.enable_index_compression = false;
-      options.table_factory.reset(NewBlockBasedTableFactory(bbto));
-      options.compression_manager = use_wrapper ? mgr : nullptr;
-      DestroyAndReopen(options);
-
-      auto PopStat = [&](Tickers t) -> uint64_t {
-        return options.statistics->getAndResetTickerCount(t);
-      };
-
-      Random rnd(301);
-      constexpr int kCount = 13;
-
-      // Highly compressible blocks, except 1 non-compressible. Half of the
-      // compressible are morked for bypass and 1 marked for rejection. Values
-      // are large enough to ensure just 1 k-v per block.
-      for (int i = 0; i < kCount; ++i) {
-        std::string value;
-        if (i == 6) {
-          // One non-compressible block
-          value = rnd.RandomBinaryString(20000);
-        } else {
-          test::CompressibleString(&rnd, 0.1, 20000, &value);
-          if ((i % 2) == 0) {
-            // Half for bypass
-            value += kDoNotCompress;
-          } else if (i == 7) {
-            // One for rejection
-            value += kRejectCompression;
-          }
-        }
-        ASSERT_OK(Put(Key(i), value));
-      }
-      ASSERT_OK(Flush());
-
-      if (use_wrapper) {
-        EXPECT_EQ(kCount / 2 - 1, PopStat(NUMBER_BLOCK_COMPRESSED));
-        EXPECT_EQ(kCount / 2, PopStat(NUMBER_BLOCK_COMPRESSION_BYPASSED));
-        EXPECT_EQ(1 + 1, PopStat(NUMBER_BLOCK_COMPRESSION_REJECTED));
-      } else {
-        EXPECT_EQ(kCount - 1, PopStat(NUMBER_BLOCK_COMPRESSED));
-        EXPECT_EQ(0, PopStat(NUMBER_BLOCK_COMPRESSION_BYPASSED));
-        EXPECT_EQ(1, PopStat(NUMBER_BLOCK_COMPRESSION_REJECTED));
-      }
-
-      // Ensure well-formed for reads
-      for (int i = 0; i < kCount; ++i) {
-        ASSERT_NE(Get(Key(i)), "NOT_FOUND");
-      }
-      ASSERT_EQ(Get(Key(kCount)), "NOT_FOUND");
-    }
-  }
-}
-
-TEST_F(DBTest2, CompressionManagerCustomCompression) {
-  // Test that we can use a custom CompressionManager to implement custom
-  // compression algorithms, and that there are appropriate schema guard rails
-  // to ensure data is not processed by the wrong algorithm.
-  using Compressor8A = test::CompressorCustomAlg<kCustomCompression8A>;
-  using Compressor8B = test::CompressorCustomAlg<kCustomCompression8B>;
-  using Compressor8C = test::CompressorCustomAlg<kCustomCompression8C>;
-
-  if (!Compressor8A::Supported() || !LZ4_Supported()) {
-    fprintf(stderr,
-            "Prerequisite compression library not supported. Skipping\n");
-    return;
-  }
-
-  class MyManager : public CompressionManager {
-   public:
-    explicit MyManager(const char* compat_name) : compat_name_(compat_name) {}
-    const char* Name() const override { return name_.c_str(); }
-    const char* CompatibilityName() const override { return compat_name_; }
-
-    bool SupportsCompressionType(CompressionType type) const override {
-      return type == kCustomCompression8A || type == kCustomCompression8B ||
-             type == kCustomCompression8C ||
-             GetBuiltinV2CompressionManager()->SupportsCompressionType(type);
-    }
-
-    int used_compressor8A_count_ = 0;
-    int used_compressor8B_count_ = 0;
-    int used_compressor8C_count_ = 0;
-
-    std::unique_ptr<Compressor> GetCompressor(const CompressionOptions& opts,
-                                              CompressionType type) override {
-      switch (static_cast<unsigned char>(type)) {
-        case kCustomCompression8A:
-          used_compressor8A_count_++;
-          return std::make_unique<Compressor8A>();
-        case kCustomCompression8B:
-          used_compressor8B_count_++;
-          return std::make_unique<Compressor8B>();
-        case kCustomCompression8C:
-          used_compressor8C_count_++;
-          return std::make_unique<Compressor8C>();
-        // Also support built-in compression algorithms
-        default:
-          return GetBuiltinV2CompressionManager()->GetCompressor(opts, type);
-      }
-    }
-
-    std::shared_ptr<Decompressor> GetDecompressor() override {
-      return std::make_shared<test::DecompressorCustomAlg>();
-    }
-
-    RelaxedAtomic<CompressionType> last_specific_decompressor_type_{
-        kNoCompression};
-
-    std::shared_ptr<Decompressor> GetDecompressorForTypes(
-        const CompressionType* types_begin,
-        const CompressionType* types_end) override {
-      assert(types_end > types_begin);
-      last_specific_decompressor_type_.StoreRelaxed(*types_begin);
-      auto decomp = std::make_shared<test::DecompressorCustomAlg>();
-      decomp->SetAllowedTypes(types_begin, types_end);
-      return decomp;
-    }
-
-    void AddFriend(const std::shared_ptr<CompressionManager>& mgr) {
-      friends_[mgr->CompatibilityName()] = mgr;
-    }
-    std::shared_ptr<CompressionManager> FindCompatibleCompressionManager(
-        Slice compatibility_name) override {
-      std::shared_ptr<CompressionManager> rv =
-          CompressionManager::FindCompatibleCompressionManager(
-              compatibility_name);
-      if (!rv) {
-        auto it = friends_.find(compatibility_name.ToString());
-        if (it != friends_.end()) {
-          return it->second.lock();
-        }
-      }
-      return rv;
-    }
-
-   private:
-    const char* compat_name_;
-    std::string name_;
-    // weak_ptr to avoid cycles
-    std::map<std::string, std::weak_ptr<CompressionManager>> friends_;
-  };
-
-  for (bool use_dict : {false, true}) {
-    SCOPED_TRACE(use_dict ? "With dict" : "No dict");
-
-    // Although these compression managers are actually compatible, we must
-    // respect their distinct compatibility names and treat them as incompatible
-    // (or else risk processing data incorrectly)
-    // NOTE: these are not registered in ObjectRegistry to test what happens
-    // when the original CompressionManager might not be available, but
-    // mgr_bar will be registered during the test, with different names to
-    // prevent interference between iterations.
-    auto mgr_foo = std::make_shared<MyManager>("Foo");
-    auto mgr_bar = std::make_shared<MyManager>(use_dict ? "Bar1" : "Bar2");
-
-    // And this one claims to be fully compatible with the built-in compression
-    // manager when it's not fully compatible (for custom CompressionTypes)
-    auto mgr_claim_compatible = std::make_shared<MyManager>("BuiltinV2");
-
-    constexpr uint16_t kValueSize = 10000;
-
-    Options options = CurrentOptions();
-    options.level0_file_num_compaction_trigger = 20;
-    BlockBasedTableOptions bbto;
-    bbto.enable_index_compression = false;
-    bbto.format_version = 6;  // Before custom compression alg support
-    options.table_factory.reset(NewBlockBasedTableFactory(bbto));
-    // Claims not to use custom compression (and doesn't unless setting a custom
-    // CompressionType)
-    options.compression_manager = mgr_claim_compatible;
-    // Use a built-in compression type with dictionary support
-    options.compression = kLZ4Compression;
-    options.compression_opts.max_dict_bytes = kValueSize / 2;
-    DestroyAndReopen(options);
-
-    Random rnd(404);
-    std::string value;
-    ASSERT_OK(
-        Put("a", test::CompressibleString(&rnd, 0.1, kValueSize, &value)));
-    ASSERT_OK(Flush());
-
-    // That data should be readable without access to the original compression
-    // manager, because it used the built-in CompatibilityName and a built-in
-    // CompressionType
-    options.compression_manager = nullptr;
-    Reopen(options);
-    ASSERT_EQ(Get("a"), value);
-
-    // Verify it was compressed
-    Range r = {"a", "a0"};
-    TablePropertiesCollection tables_properties;
-    ASSERT_OK(db_->GetPropertiesOfTablesInRange(db_->DefaultColumnFamily(), &r,
-                                                1, &tables_properties));
-    ASSERT_EQ(tables_properties.size(), 1U);
-    EXPECT_LT(tables_properties.begin()->second->data_size, kValueSize / 2);
-    EXPECT_EQ(tables_properties.begin()->second->compression_name, "LZ4");
-
-    // Disallow setting a custom CompressionType with a CompressionManager
-    // claiming to be built-in compatible.
-    options.compression_manager = mgr_claim_compatible;
-    options.compression = kCustomCompression8A;
-    ASSERT_EQ(TryReopen(options).code(), Status::Code::kInvalidArgument);
-
-    options.compression_manager = nullptr;
-    options.compression = kCustomCompressionFE;
-    ASSERT_EQ(TryReopen(options).code(), Status::Code::kInvalidArgument);
-    options.compression =
-        static_cast<CompressionType>(kLastBuiltinCompression + 1);
-    ASSERT_EQ(TryReopen(options).code(), Status::Code::kInvalidArgument);
-
-    // Custom compression schema (different CompatibilityName) not supported
-    // before format_version=7
-    options.compression_manager = mgr_foo;
-    options.compression = kLZ4Compression;
-    ASSERT_EQ(TryReopen(options).code(), Status::Code::kInvalidArgument);
-
-    // Set format version supporting custom compression
-    bbto.format_version = 7;
-    options.table_factory.reset(NewBlockBasedTableFactory(bbto));
-
-    // Custom compression type not supported with built-in schema name, even
-    // with format_version=7
-    options.compression_manager = mgr_claim_compatible;
-    options.compression = kCustomCompression8B;
-    ASSERT_EQ(TryReopen(options).code(), Status::Code::kInvalidArgument);
-
-    // Custom compression schema, but specifying a custom compression type it
-    // doesn't support.
-    options.compression_manager = mgr_foo;
-    options.compression = kCustomCompressionF0;
-    ASSERT_EQ(TryReopen(options).code(), Status::Code::kNotSupported);
-
-    // Using a built-in compression type with fv=7 but named custom schema
-    options.compression = kLZ4Compression;
-    Reopen(options);
-    ASSERT_OK(
-        Put("b", test::CompressibleString(&rnd, 0.1, kValueSize, &value)));
-    ASSERT_OK(Flush());
-    ASSERT_EQ(NumTableFilesAtLevel(0), 2);
-    ASSERT_EQ(Get("b"), value);
-
-    // Verify it was compressed with LZ4
-    r = {"b", "b0"};
-    tables_properties.clear();
-    ASSERT_OK(db_->GetPropertiesOfTablesInRange(db_->DefaultColumnFamily(), &r,
-                                                1, &tables_properties));
-    ASSERT_EQ(tables_properties.size(), 1U);
-    EXPECT_LT(tables_properties.begin()->second->data_size, kValueSize / 2);
-    // Uses new format for "compression_name" property
-    EXPECT_EQ(tables_properties.begin()->second->compression_name, "Foo;04;");
-    EXPECT_EQ(mgr_foo->last_specific_decompressor_type_.LoadRelaxed(),
-              kLZ4Compression);
-
-    // Custom compression type
-    options.compression = kCustomCompression8A;
-    Reopen(options);
-    ASSERT_OK(
-        Put("c", test::CompressibleString(&rnd, 0.1, kValueSize, &value)));
-    EXPECT_EQ(mgr_foo->used_compressor8A_count_, 0);
-    ASSERT_OK(Flush());
-    ASSERT_EQ(NumTableFilesAtLevel(0), 3);
-    ASSERT_EQ(Get("c"), value);
-    EXPECT_EQ(mgr_foo->used_compressor8A_count_, 1);
-
-    // Verify it was compressed with custom format
-    r = {"c", "c0"};
-    tables_properties.clear();
-    ASSERT_OK(db_->GetPropertiesOfTablesInRange(db_->DefaultColumnFamily(), &r,
-                                                1, &tables_properties));
-    ASSERT_EQ(tables_properties.size(), 1U);
-    EXPECT_LT(tables_properties.begin()->second->data_size, kValueSize / 2);
-    EXPECT_EQ(tables_properties.begin()->second->compression_name, "Foo;8A;");
-    EXPECT_EQ(mgr_foo->last_specific_decompressor_type_.LoadRelaxed(),
-              kCustomCompression8A);
-
-    // Also dynamically changeable, because the compression manager will respect
-    // the current setting as reported under the legacy logic
-    ASSERT_OK(dbfull()->SetOptions({{"compression", "kLZ4Compression"}}));
-    ASSERT_OK(
-        Put("d", test::CompressibleString(&rnd, 0.1, kValueSize, &value)));
-    ASSERT_OK(Flush());
-    ASSERT_EQ(NumTableFilesAtLevel(0), 4);
-    ASSERT_EQ(Get("d"), value);
-
-    // Verify it was compressed with LZ4
-    r = {"d", "d0"};
-    tables_properties.clear();
-    ASSERT_OK(db_->GetPropertiesOfTablesInRange(db_->DefaultColumnFamily(), &r,
-                                                1, &tables_properties));
-    ASSERT_EQ(tables_properties.size(), 1U);
-    EXPECT_LT(tables_properties.begin()->second->data_size, kValueSize / 2);
-    EXPECT_EQ(tables_properties.begin()->second->compression_name, "Foo;04;");
-    EXPECT_EQ(mgr_foo->last_specific_decompressor_type_.LoadRelaxed(),
-              kLZ4Compression);
-
-    // Dynamically changeable to custom compressions also
-    ASSERT_OK(dbfull()->SetOptions({{"compression", "kCustomCompression8B"}}));
-    ASSERT_OK(
-        Put("e", test::CompressibleString(&rnd, 0.1, kValueSize, &value)));
-    ASSERT_OK(Flush());
-    ASSERT_EQ(NumTableFilesAtLevel(0), 5);
-    ASSERT_EQ(Get("e"), value);
-
-    // Verify it was compressed with custom format
-    r = {"e", "e0"};
-    tables_properties.clear();
-    ASSERT_OK(db_->GetPropertiesOfTablesInRange(db_->DefaultColumnFamily(), &r,
-                                                1, &tables_properties));
-    ASSERT_EQ(tables_properties.size(), 1U);
-    EXPECT_LT(tables_properties.begin()->second->data_size, kValueSize / 2);
-    EXPECT_EQ(tables_properties.begin()->second->compression_name, "Foo;8B;");
-    EXPECT_EQ(mgr_foo->last_specific_decompressor_type_.LoadRelaxed(),
-              kCustomCompression8B);
-
-    // Fails to re-open with incompatible compression manager (can't find
-    // compression manager Foo because it's not registered nor known by Bar)
-    options.compression_manager = mgr_bar;
-    options.compression = kLZ4Compression;
-    ASSERT_EQ(TryReopen(options).code(), Status::Code::kNotSupported);
-
-    // But should re-open if we make Bar aware of the Foo compression manager
-    mgr_bar->AddFriend(mgr_foo);
-    Reopen(options);
-
-    // Can still read everything
-    ASSERT_EQ(Get("a").size(), kValueSize);
-    ASSERT_EQ(Get("b").size(), kValueSize);
-    ASSERT_EQ(Get("c").size(), kValueSize);
-    ASSERT_EQ(Get("d").size(), kValueSize);
-    ASSERT_EQ(Get("e").size(), kValueSize);
-
-    // Add a file using mgr_bar
-    ASSERT_OK(
-        Put("f", test::CompressibleString(&rnd, 0.1, kValueSize, &value)));
-    ASSERT_OK(Flush());
-    ASSERT_EQ(NumTableFilesAtLevel(0), 6);
-    ASSERT_EQ(Get("f"), value);
-
-    // Verify it was compressed appropriately
-    r = {"f", "f0"};
-    tables_properties.clear();
-    ASSERT_OK(db_->GetPropertiesOfTablesInRange(db_->DefaultColumnFamily(), &r,
-                                                1, &tables_properties));
-    ASSERT_EQ(tables_properties.size(), 1U);
-    EXPECT_LT(tables_properties.begin()->second->data_size, kValueSize / 2);
-    EXPECT_EQ(mgr_bar->last_specific_decompressor_type_.LoadRelaxed(),
-              kLZ4Compression);
-
-    // Fails to re-open with incompatible compression manager (can't find
-    // compression manager Bar because it's not registered nor known by Foo)
-    options.compression_manager = mgr_foo;
-    ASSERT_EQ(TryReopen(options).code(), Status::Code::kNotSupported);
-
-    // Register and re-open
-    auto& library = *ObjectLibrary::Default();
-    library.AddFactory<CompressionManager>(
-        mgr_bar->CompatibilityName(),
-        [mgr_bar](const std::string& /*uri*/,
-                  std::unique_ptr<CompressionManager>* guard,
-                  std::string* /*errmsg*/) {
-          *guard = std::make_unique<MyManager>(mgr_bar->CompatibilityName());
-          return guard->get();
-        });
-    Reopen(options);
-
-    // Can still read everything
-    ASSERT_EQ(Get("a").size(), kValueSize);
-    ASSERT_EQ(Get("b").size(), kValueSize);
-    ASSERT_EQ(Get("c").size(), kValueSize);
-    ASSERT_EQ(Get("d").size(), kValueSize);
-    ASSERT_EQ(Get("e").size(), kValueSize);
-    ASSERT_EQ(Get("f").size(), kValueSize);
-
-    // TODO: test old version of a compression manager unable to read a
-    // compression type
-  }
-}
-
 class CompactionStallTestListener : public EventListener {
  public:
   CompactionStallTestListener()
diff --git a/util/compression_test.cc b/util/compression_test.cc
index d2590fabfe66..17521982b2b4 100644
--- a/util/compression_test.cc
+++ b/util/compression_test.cc
@@ -3,11 +3,7 @@
 //  COPYING file in the root directory) and Apache 2.0 License
 //  (found in the LICENSE.Apache file in the root directory).
 //
-// Testing the features of auto skip compression manager
-//
-// ***********************************************************************
-// EXPERIMENTAL - subject to change while under development
-// ***********************************************************************
+// Testing various compression features
 
 #include <cstdlib>
 #include <memory>
@@ -15,12 +11,1543 @@
 #include "db/db_test_util.h"
 #include "port/stack_trace.h"
 #include "rocksdb/flush_block_policy.h"
+#include "rocksdb/utilities/object_registry.h"
 #include "table/block_based/block_builder.h"
 #include "test_util/testutil.h"
 #include "util/auto_tune_compressor.h"
 #include "util/random.h"
+#include "util/simple_mixed_compressor.h"
 
 namespace ROCKSDB_NAMESPACE {
+class DBCompressionTest : public DBTestBase {
+ public:
+  DBCompressionTest() : DBTestBase("compression_test", /*env_do_fsync=*/true) {}
+};
+
+TEST_F(DBCompressionTest, PresetCompressionDict) {
+  // Verifies that compression ratio improves when dictionary is enabled, and
+  // improves even further when the dictionary is trained by ZSTD.
+  const size_t kBlockSizeBytes = 4 << 10;
+  const size_t kL0FileBytes = 128 << 10;
+  const size_t kApproxPerBlockOverheadBytes = 50;
+  const int kNumL0Files = 5;
+
+  Options options;
+  // Make sure to use any custom env that the test is configured with.
+  options.env = CurrentOptions().env;
+  options.allow_concurrent_memtable_write = false;
+  options.arena_block_size = kBlockSizeBytes;
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  options.level0_file_num_compaction_trigger = kNumL0Files;
+  options.memtable_factory.reset(
+      test::NewSpecialSkipListFactory(kL0FileBytes / kBlockSizeBytes));
+  options.num_levels = 2;
+  options.target_file_size_base = kL0FileBytes;
+  options.target_file_size_multiplier = 2;
+  options.write_buffer_size = kL0FileBytes;
+  BlockBasedTableOptions table_options;
+  table_options.block_size = kBlockSizeBytes;
+  std::vector<CompressionType> compression_types;
+  if (Zlib_Supported()) {
+    compression_types.push_back(kZlibCompression);
+  }
+#if LZ4_VERSION_NUMBER >= 10400  // r124+
+  compression_types.push_back(kLZ4Compression);
+  compression_types.push_back(kLZ4HCCompression);
+#endif  // LZ4_VERSION_NUMBER >= 10400
+  if (ZSTD_Supported()) {
+    compression_types.push_back(kZSTD);
+  }
+
+  enum DictionaryTypes : int {
+    kWithoutDict,
+    kWithDict,
+    kWithZSTDfinalizeDict,
+    kWithZSTDTrainedDict,
+    kDictEnd,
+  };
+
+  for (auto compression_type : compression_types) {
+    options.compression = compression_type;
+    size_t bytes_without_dict = 0;
+    size_t bytes_with_dict = 0;
+    size_t bytes_with_zstd_finalize_dict = 0;
+    size_t bytes_with_zstd_trained_dict = 0;
+    for (int i = kWithoutDict; i < kDictEnd; i++) {
+      // First iteration: compress without preset dictionary
+      // Second iteration: compress with preset dictionary
+      // Third iteration (zstd only): compress with zstd-trained dictionary
+      //
+      // To make sure the compression dictionary has the intended effect, we
+      // verify the compressed size is smaller in successive iterations. Also in
+      // the non-first iterations, verify the data we get out is the same data
+      // we put in.
+      switch (i) {
+        case kWithoutDict:
+          options.compression_opts.max_dict_bytes = 0;
+          options.compression_opts.zstd_max_train_bytes = 0;
+          break;
+        case kWithDict:
+          options.compression_opts.max_dict_bytes = kBlockSizeBytes;
+          options.compression_opts.zstd_max_train_bytes = 0;
+          break;
+        case kWithZSTDfinalizeDict:
+          if (compression_type != kZSTD ||
+              !ZSTD_FinalizeDictionarySupported()) {
+            continue;
+          }
+          options.compression_opts.max_dict_bytes = kBlockSizeBytes;
+          options.compression_opts.zstd_max_train_bytes = kL0FileBytes;
+          options.compression_opts.use_zstd_dict_trainer = false;
+          break;
+        case kWithZSTDTrainedDict:
+          if (compression_type != kZSTD || !ZSTD_TrainDictionarySupported()) {
+            continue;
+          }
+          options.compression_opts.max_dict_bytes = kBlockSizeBytes;
+          options.compression_opts.zstd_max_train_bytes = kL0FileBytes;
+          options.compression_opts.use_zstd_dict_trainer = true;
+          break;
+        default:
+          assert(false);
+      }
+
+      options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+      options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+      CreateAndReopenWithCF({"pikachu"}, options);
+      Random rnd(301);
+      std::string seq_datas[10];
+      for (int j = 0; j < 10; ++j) {
+        seq_datas[j] =
+            rnd.RandomString(kBlockSizeBytes - kApproxPerBlockOverheadBytes);
+      }
+
+      ASSERT_EQ(0, NumTableFilesAtLevel(0, 1));
+      for (int j = 0; j < kNumL0Files; ++j) {
+        for (size_t k = 0; k < kL0FileBytes / kBlockSizeBytes + 1; ++k) {
+          auto key_num = j * (kL0FileBytes / kBlockSizeBytes) + k;
+          ASSERT_OK(Put(1, Key(static_cast<int>(key_num)),
+                        seq_datas[(key_num / 10) % 10]));
+        }
+        ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
+        ASSERT_EQ(j + 1, NumTableFilesAtLevel(0, 1));
+      }
+      ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1],
+                                            true /* disallow_trivial_move */));
+      ASSERT_EQ(0, NumTableFilesAtLevel(0, 1));
+      ASSERT_GT(NumTableFilesAtLevel(1, 1), 0);
+
+      // Get the live sst files size
+      size_t total_sst_bytes = TotalSize(1);
+      if (i == kWithoutDict) {
+        bytes_without_dict = total_sst_bytes;
+      } else if (i == kWithDict) {
+        bytes_with_dict = total_sst_bytes;
+      } else if (i == kWithZSTDfinalizeDict) {
+        bytes_with_zstd_finalize_dict = total_sst_bytes;
+      } else if (i == kWithZSTDTrainedDict) {
+        bytes_with_zstd_trained_dict = total_sst_bytes;
+      }
+
+      for (size_t j = 0; j < kNumL0Files * (kL0FileBytes / kBlockSizeBytes);
+           j++) {
+        ASSERT_EQ(seq_datas[(j / 10) % 10], Get(1, Key(static_cast<int>(j))));
+      }
+      if (i == kWithDict) {
+        ASSERT_GT(bytes_without_dict, bytes_with_dict);
+      } else if (i == kWithZSTDTrainedDict) {
+        // In zstd compression, it is sometimes possible that using a finalized
+        // dictionary does not get as good a compression ratio as raw content
+        // dictionary. But using a dictionary should always get better
+        // compression ratio than not using one.
+        ASSERT_TRUE(bytes_with_dict > bytes_with_zstd_finalize_dict ||
+                    bytes_without_dict > bytes_with_zstd_finalize_dict);
+      } else if (i == kWithZSTDTrainedDict) {
+        // In zstd compression, it is sometimes possible that using a trained
+        // dictionary does not get as good a compression ratio as without
+        // training.
+        // But using a dictionary (with or without training) should always get
+        // better compression ratio than not using one.
+        ASSERT_TRUE(bytes_with_dict > bytes_with_zstd_trained_dict ||
+                    bytes_without_dict > bytes_with_zstd_trained_dict);
+      }
+
+      DestroyAndReopen(options);
+    }
+  }
+}
+
+TEST_F(DBCompressionTest, PresetCompressionDictLocality) {
+  if (!ZSTD_Supported()) {
+    return;
+  }
+  // Verifies that compression dictionary is generated from local data. The
+  // verification simply checks all output SSTs have different compression
+  // dictionaries. We do not verify effectiveness as that'd likely be flaky in
+  // the future.
+  const int kNumEntriesPerFile = 1 << 10;  // 1KB
+  const int kNumBytesPerEntry = 1 << 10;   // 1KB
+  const int kNumFiles = 4;
+  Options options = CurrentOptions();
+  options.compression = kZSTD;
+  options.compression_opts.max_dict_bytes = 1 << 14;        // 16KB
+  options.compression_opts.zstd_max_train_bytes = 1 << 18;  // 256KB
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  options.target_file_size_base = kNumEntriesPerFile * kNumBytesPerEntry;
+  BlockBasedTableOptions table_options;
+  table_options.cache_index_and_filter_blocks = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  Reopen(options);
+
+  Random rnd(301);
+  for (int i = 0; i < kNumFiles; ++i) {
+    for (int j = 0; j < kNumEntriesPerFile; ++j) {
+      ASSERT_OK(Put(Key(i * kNumEntriesPerFile + j),
+                    rnd.RandomString(kNumBytesPerEntry)));
+    }
+    ASSERT_OK(Flush());
+    MoveFilesToLevel(1);
+    ASSERT_EQ(NumTableFilesAtLevel(1), i + 1);
+  }
+
+  // Store all the dictionaries generated during a full compaction.
+  std::vector<std::string> compression_dicts;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "BlockBasedTableBuilder::WriteCompressionDictBlock:RawDict",
+      [&](void* arg) {
+        compression_dicts.emplace_back(static_cast<Slice*>(arg)->ToString());
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  CompactRangeOptions compact_range_opts;
+  compact_range_opts.bottommost_level_compaction =
+      BottommostLevelCompaction::kForceOptimized;
+  ASSERT_OK(db_->CompactRange(compact_range_opts, nullptr, nullptr));
+
+  // Dictionary compression should not be so good as to compress four totally
+  // random files into one. If it does then there's probably something wrong
+  // with the test.
+  ASSERT_GT(NumTableFilesAtLevel(1), 1);
+
+  // Furthermore, there should be one compression dictionary generated per file.
+  // And they should all be different from each other.
+  ASSERT_EQ(NumTableFilesAtLevel(1),
+            static_cast<int>(compression_dicts.size()));
+  for (size_t i = 1; i < compression_dicts.size(); ++i) {
+    std::string& a = compression_dicts[i - 1];
+    std::string& b = compression_dicts[i];
+    size_t alen = a.size();
+    size_t blen = b.size();
+    ASSERT_TRUE(alen != blen || memcmp(a.data(), b.data(), alen) != 0);
+  }
+}
+
+static std::string CompressibleString(Random* rnd, int len) {
+  std::string r;
+  test::CompressibleString(rnd, 0.8, len, &r);
+  return r;
+}
+
+TEST_F(DBCompressionTest, DynamicLevelCompressionPerLevel) {
+  if (!Snappy_Supported()) {
+    return;
+  }
+  const int kNKeys = 120;
+  int keys[kNKeys];
+  for (int i = 0; i < kNKeys; i++) {
+    keys[i] = i;
+  }
+
+  Random rnd(301);
+  Options options;
+  options.env = env_;
+  options.create_if_missing = true;
+  options.db_write_buffer_size = 20480;
+  options.write_buffer_size = 20480;
+  options.max_write_buffer_number = 2;
+  options.level0_file_num_compaction_trigger = 2;
+  options.level0_slowdown_writes_trigger = 2;
+  options.level0_stop_writes_trigger = 2;
+  options.target_file_size_base = 20480;
+  options.level_compaction_dynamic_level_bytes = true;
+  options.max_bytes_for_level_base = 102400;
+  options.max_bytes_for_level_multiplier = 4;
+  options.max_background_compactions = 1;
+  options.num_levels = 5;
+  options.statistics = CreateDBStatistics();
+
+  options.compression_per_level.resize(3);
+  // No compression for L0
+  options.compression_per_level[0] = kNoCompression;
+  // No compression for the Ln whre L0 is compacted to
+  options.compression_per_level[1] = kNoCompression;
+  // Snappy compression for Ln+1
+  options.compression_per_level[2] = kSnappyCompression;
+
+  OnFileDeletionListener* listener = new OnFileDeletionListener();
+  options.listeners.emplace_back(listener);
+
+  DestroyAndReopen(options);
+
+  // Insert more than 80K. L4 should be base level. Neither L0 nor L4 should
+  // be compressed, so there shouldn't be any compression.
+  for (int i = 0; i < 20; i++) {
+    ASSERT_OK(Put(Key(keys[i]), CompressibleString(&rnd, 4000)));
+    ASSERT_OK(dbfull()->TEST_WaitForBackgroundWork());
+  }
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(2), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(3), 0);
+  ASSERT_TRUE(NumTableFilesAtLevel(0) > 0 || NumTableFilesAtLevel(4) > 0);
+
+  // Verify there was no compression
+  auto num_block_compressed =
+      options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED);
+  ASSERT_EQ(num_block_compressed, 0);
+
+  // Insert 400KB and there will be some files end up in L3. According to the
+  // above compression settings for each level, there will be some compression.
+  ASSERT_OK(options.statistics->Reset());
+  ASSERT_EQ(num_block_compressed, 0);
+  for (int i = 20; i < 120; i++) {
+    ASSERT_OK(Put(Key(keys[i]), CompressibleString(&rnd, 4000)));
+    ASSERT_OK(dbfull()->TEST_WaitForBackgroundWork());
+  }
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(2), 0);
+  ASSERT_GE(NumTableFilesAtLevel(3), 1);
+  ASSERT_GE(NumTableFilesAtLevel(4), 1);
+
+  // Verify there was compression
+  num_block_compressed =
+      options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED);
+  ASSERT_GT(num_block_compressed, 0);
+
+  // Make sure data in files in L3 is not compacted by removing all files
+  // in L4 and calculate number of rows
+  ASSERT_OK(dbfull()->SetOptions({
+      {"disable_auto_compactions", "true"},
+  }));
+  ColumnFamilyMetaData cf_meta;
+  db_->GetColumnFamilyMetaData(&cf_meta);
+
+  // Ensure that L1+ files are non-overlapping and together with L0 encompass
+  // full key range between smallestkey and largestkey from CF file metadata.
+  int largestkey_in_prev_level = -1;
+  int keys_found = 0;
+  for (int level = (int)cf_meta.levels.size() - 1; level >= 0; level--) {
+    int files_in_level = (int)cf_meta.levels[level].files.size();
+    int largestkey_in_prev_file = -1;
+    for (int j = 0; j < files_in_level; j++) {
+      int smallestkey = IdFromKey(cf_meta.levels[level].files[j].smallestkey);
+      int largestkey = IdFromKey(cf_meta.levels[level].files[j].largestkey);
+      int num_entries = (int)cf_meta.levels[level].files[j].num_entries;
+      ASSERT_EQ(num_entries, largestkey - smallestkey + 1);
+      keys_found += num_entries;
+      if (level > 0) {
+        if (j == 0) {
+          ASSERT_GT(smallestkey, largestkey_in_prev_level);
+        }
+        if (j > 0) {
+          ASSERT_GT(smallestkey, largestkey_in_prev_file);
+        }
+        if (j == files_in_level - 1) {
+          largestkey_in_prev_level = largestkey;
+        }
+      }
+      largestkey_in_prev_file = largestkey;
+    }
+  }
+  ASSERT_EQ(keys_found, kNKeys);
+
+  for (const auto& file : cf_meta.levels[4].files) {
+    listener->SetExpectedFileName(dbname_ + file.name);
+    const RangeOpt ranges(file.smallestkey, file.largestkey);
+    // Given verification from above, we're guaranteed that by deleting all the
+    // files in [<smallestkey>, <largestkey>] range, we're effectively deleting
+    // that very single file and nothing more.
+    EXPECT_OK(dbfull()->DeleteFilesInRanges(dbfull()->DefaultColumnFamily(),
+                                            &ranges, true /* include_end */));
+  }
+  listener->VerifyMatchedCount(cf_meta.levels[4].files.size());
+
+  int num_keys = 0;
+  std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    num_keys++;
+  }
+  ASSERT_OK(iter->status());
+
+  ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(2), 0);
+  ASSERT_GE(NumTableFilesAtLevel(3), 1);
+  ASSERT_EQ(NumTableFilesAtLevel(4), 0);
+
+  ASSERT_GT(SizeAtLevel(0) + SizeAtLevel(3), num_keys * 4000U + num_keys * 10U);
+}
+
+TEST_F(DBCompressionTest, DynamicLevelCompressionPerLevel2) {
+  if (!Snappy_Supported() || !LZ4_Supported() || !Zlib_Supported()) {
+    return;
+  }
+  const int kNKeys = 500;
+  int keys[kNKeys];
+  for (int i = 0; i < kNKeys; i++) {
+    keys[i] = i;
+  }
+  RandomShuffle(std::begin(keys), std::end(keys));
+
+  Random rnd(301);
+  Options options;
+  options.create_if_missing = true;
+  options.db_write_buffer_size = 6000000;
+  options.write_buffer_size = 600000;
+  options.max_write_buffer_number = 2;
+  options.level0_file_num_compaction_trigger = 2;
+  options.level0_slowdown_writes_trigger = 2;
+  options.level0_stop_writes_trigger = 2;
+  options.soft_pending_compaction_bytes_limit = 1024 * 1024;
+  options.target_file_size_base = 20;
+  options.env = env_;
+  options.level_compaction_dynamic_level_bytes = true;
+  options.max_bytes_for_level_base = 200;
+  options.max_bytes_for_level_multiplier = 8;
+  options.max_background_compactions = 1;
+  options.num_levels = 5;
+  std::shared_ptr<mock::MockTableFactory> mtf(new mock::MockTableFactory);
+  options.table_factory = mtf;
+
+  options.compression_per_level.resize(3);
+  options.compression_per_level[0] = kNoCompression;
+  options.compression_per_level[1] = kLZ4Compression;
+  options.compression_per_level[2] = kZlibCompression;
+
+  DestroyAndReopen(options);
+  // When base level is L4, L4 is LZ4.
+  std::atomic<int> num_zlib(0);
+  std::atomic<int> num_lz4(0);
+  std::atomic<int> num_no(0);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+        Compaction* compaction = static_cast<Compaction*>(arg);
+        if (compaction->output_level() == 4) {
+          ASSERT_TRUE(compaction->output_compression() == kLZ4Compression);
+          num_lz4.fetch_add(1);
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "FlushJob::WriteLevel0Table:output_compression", [&](void* arg) {
+        auto* compression = static_cast<CompressionType*>(arg);
+        ASSERT_TRUE(*compression == kNoCompression);
+        num_no.fetch_add(1);
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  for (int i = 0; i < 100; i++) {
+    std::string value = rnd.RandomString(200);
+    ASSERT_OK(Put(Key(keys[i]), value));
+    if (i % 25 == 24) {
+      ASSERT_OK(Flush());
+      ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    }
+  }
+
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(2), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(3), 0);
+  ASSERT_GT(NumTableFilesAtLevel(4), 0);
+  ASSERT_GT(num_no.load(), 2);
+  ASSERT_GT(num_lz4.load(), 0);
+  int prev_num_files_l4 = NumTableFilesAtLevel(4);
+
+  // After base level turn L4->L3, L3 becomes LZ4 and L4 becomes Zlib
+  num_lz4.store(0);
+  num_no.store(0);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+        Compaction* compaction = static_cast<Compaction*>(arg);
+        if (compaction->output_level() == 4 && compaction->start_level() == 3) {
+          ASSERT_TRUE(compaction->output_compression() == kZlibCompression);
+          num_zlib.fetch_add(1);
+        } else {
+          ASSERT_TRUE(compaction->output_compression() == kLZ4Compression);
+          num_lz4.fetch_add(1);
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "FlushJob::WriteLevel0Table:output_compression", [&](void* arg) {
+        auto* compression = static_cast<CompressionType*>(arg);
+        ASSERT_TRUE(*compression == kNoCompression);
+        num_no.fetch_add(1);
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  for (int i = 101; i < 500; i++) {
+    std::string value = rnd.RandomString(200);
+    ASSERT_OK(Put(Key(keys[i]), value));
+    if (i % 100 == 99) {
+      ASSERT_OK(Flush());
+      ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    }
+  }
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(2), 0);
+  ASSERT_GT(NumTableFilesAtLevel(3), 0);
+  ASSERT_GT(NumTableFilesAtLevel(4), prev_num_files_l4);
+  ASSERT_GT(num_no.load(), 2);
+  ASSERT_GT(num_lz4.load(), 0);
+  ASSERT_GT(num_zlib.load(), 0);
+}
+
+class PresetCompressionDictTest
+    : public DBTestBase,
+      public testing::WithParamInterface<std::tuple<CompressionType, bool>> {
+ public:
+  PresetCompressionDictTest()
+      : DBTestBase("db_test2", false /* env_do_fsync */),
+        compression_type_(std::get<0>(GetParam())),
+        bottommost_(std::get<1>(GetParam())) {}
+
+ protected:
+  const CompressionType compression_type_;
+  const bool bottommost_;
+};
+
+INSTANTIATE_TEST_CASE_P(
+    DBCompressionTest, PresetCompressionDictTest,
+    ::testing::Combine(::testing::ValuesIn(GetSupportedDictCompressions()),
+                       ::testing::Bool()));
+
+TEST_P(PresetCompressionDictTest, Flush) {
+  // Verifies that dictionary is generated and written during flush only when
+  // `ColumnFamilyOptions::compression` enables dictionary. Also verifies the
+  // size of the dictionary is within expectations according to the limit on
+  // buffering set by `CompressionOptions::max_dict_buffer_bytes`.
+  const size_t kValueLen = 256;
+  const size_t kKeysPerFile = 1 << 10;
+  const size_t kDictLen = 16 << 10;
+  const size_t kBlockLen = 4 << 10;
+
+  Options options = CurrentOptions();
+  if (bottommost_) {
+    options.bottommost_compression = compression_type_;
+    options.bottommost_compression_opts.enabled = true;
+    options.bottommost_compression_opts.max_dict_bytes = kDictLen;
+    options.bottommost_compression_opts.max_dict_buffer_bytes = kBlockLen;
+  } else {
+    options.compression = compression_type_;
+    options.compression_opts.max_dict_bytes = kDictLen;
+    options.compression_opts.max_dict_buffer_bytes = kBlockLen;
+  }
+  options.memtable_factory.reset(test::NewSpecialSkipListFactory(kKeysPerFile));
+  options.statistics = CreateDBStatistics();
+  BlockBasedTableOptions bbto;
+  bbto.block_size = kBlockLen;
+  bbto.cache_index_and_filter_blocks = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  Reopen(options);
+
+  Random rnd(301);
+  for (size_t i = 0; i <= kKeysPerFile; ++i) {
+    ASSERT_OK(Put(Key(static_cast<int>(i)), rnd.RandomString(kValueLen)));
+  }
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+
+  // We can use `BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT` to detect whether a
+  // compression dictionary exists since dictionaries would be preloaded when
+  // the flush finishes.
+  if (bottommost_) {
+    // Flush is never considered bottommost. This should change in the future
+    // since flushed files may have nothing underneath them, like the one in
+    // this test case.
+    ASSERT_EQ(
+        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
+        0);
+  } else {
+    ASSERT_GT(
+        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
+        0);
+    // TODO(ajkr): fix the below assertion to work with ZSTD. The expectation on
+    // number of bytes needs to be adjusted in case the cached block is in
+    // ZSTD's digested dictionary format.
+    if (compression_type_ != kZSTD) {
+      // Although we limited buffering to `kBlockLen`, there may be up to two
+      // blocks of data included in the dictionary since we only check limit
+      // after each block is built.
+      ASSERT_LE(TestGetTickerCount(options,
+                                   BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
+                2 * kBlockLen);
+    }
+  }
+}
+
+TEST_P(PresetCompressionDictTest, CompactNonBottommost) {
+  // Verifies that dictionary is generated and written during compaction to
+  // non-bottommost level only when `ColumnFamilyOptions::compression` enables
+  // dictionary. Also verifies the size of the dictionary is within expectations
+  // according to the limit on buffering set by
+  // `CompressionOptions::max_dict_buffer_bytes`.
+  const size_t kValueLen = 256;
+  const size_t kKeysPerFile = 1 << 10;
+  const size_t kDictLen = 16 << 10;
+  const size_t kBlockLen = 4 << 10;
+
+  Options options = CurrentOptions();
+  if (bottommost_) {
+    options.bottommost_compression = compression_type_;
+    options.bottommost_compression_opts.enabled = true;
+    options.bottommost_compression_opts.max_dict_bytes = kDictLen;
+    options.bottommost_compression_opts.max_dict_buffer_bytes = kBlockLen;
+  } else {
+    options.compression = compression_type_;
+    options.compression_opts.max_dict_bytes = kDictLen;
+    options.compression_opts.max_dict_buffer_bytes = kBlockLen;
+  }
+  options.disable_auto_compactions = true;
+  options.statistics = CreateDBStatistics();
+  BlockBasedTableOptions bbto;
+  bbto.block_size = kBlockLen;
+  bbto.cache_index_and_filter_blocks = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  Reopen(options);
+
+  Random rnd(301);
+  for (size_t j = 0; j <= kKeysPerFile; ++j) {
+    ASSERT_OK(Put(Key(static_cast<int>(j)), rnd.RandomString(kValueLen)));
+  }
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(2);
+
+  for (int i = 0; i < 2; ++i) {
+    for (size_t j = 0; j <= kKeysPerFile; ++j) {
+      ASSERT_OK(Put(Key(static_cast<int>(j)), rnd.RandomString(kValueLen)));
+    }
+    ASSERT_OK(Flush());
+  }
+  ASSERT_EQ("2,0,1", FilesPerLevel(0));
+
+  uint64_t prev_compression_dict_bytes_inserted =
+      TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT);
+  // This L0->L1 compaction merges the two L0 files into L1. The produced L1
+  // file is not bottommost due to the existing L2 file covering the same key-
+  // range.
+  ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr));
+  ASSERT_EQ("0,1,1", FilesPerLevel(0));
+  // We can use `BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT` to detect whether a
+  // compression dictionary exists since dictionaries would be preloaded when
+  // the compaction finishes.
+  if (bottommost_) {
+    ASSERT_EQ(
+        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
+        prev_compression_dict_bytes_inserted);
+  } else {
+    ASSERT_GT(
+        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
+        prev_compression_dict_bytes_inserted);
+    // TODO(ajkr): fix the below assertion to work with ZSTD. The expectation on
+    // number of bytes needs to be adjusted in case the cached block is in
+    // ZSTD's digested dictionary format.
+    if (compression_type_ != kZSTD) {
+      // Although we limited buffering to `kBlockLen`, there may be up to two
+      // blocks of data included in the dictionary since we only check limit
+      // after each block is built.
+      ASSERT_LE(TestGetTickerCount(options,
+                                   BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
+                prev_compression_dict_bytes_inserted + 2 * kBlockLen);
+    }
+  }
+}
+
+TEST_P(PresetCompressionDictTest, CompactBottommost) {
+  // Verifies that dictionary is generated and written during compaction to
+  // non-bottommost level only when either `ColumnFamilyOptions::compression` or
+  // `ColumnFamilyOptions::bottommost_compression` enables dictionary. Also
+  // verifies the size of the dictionary is within expectations according to the
+  // limit on buffering set by `CompressionOptions::max_dict_buffer_bytes`.
+  const size_t kValueLen = 256;
+  const size_t kKeysPerFile = 1 << 10;
+  const size_t kDictLen = 16 << 10;
+  const size_t kBlockLen = 4 << 10;
+
+  Options options = CurrentOptions();
+  if (bottommost_) {
+    options.bottommost_compression = compression_type_;
+    options.bottommost_compression_opts.enabled = true;
+    options.bottommost_compression_opts.max_dict_bytes = kDictLen;
+    options.bottommost_compression_opts.max_dict_buffer_bytes = kBlockLen;
+  } else {
+    options.compression = compression_type_;
+    options.compression_opts.max_dict_bytes = kDictLen;
+    options.compression_opts.max_dict_buffer_bytes = kBlockLen;
+  }
+  options.disable_auto_compactions = true;
+  options.statistics = CreateDBStatistics();
+  BlockBasedTableOptions bbto;
+  bbto.block_size = kBlockLen;
+  bbto.cache_index_and_filter_blocks = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  Reopen(options);
+
+  Random rnd(301);
+  for (int i = 0; i < 2; ++i) {
+    for (size_t j = 0; j <= kKeysPerFile; ++j) {
+      ASSERT_OK(Put(Key(static_cast<int>(j)), rnd.RandomString(kValueLen)));
+    }
+    ASSERT_OK(Flush());
+  }
+  ASSERT_EQ("2", FilesPerLevel(0));
+
+  uint64_t prev_compression_dict_bytes_inserted =
+      TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT);
+  CompactRangeOptions cro;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  ASSERT_EQ("0,1", FilesPerLevel(0));
+  ASSERT_GT(
+      TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
+      prev_compression_dict_bytes_inserted);
+  // TODO(ajkr): fix the below assertion to work with ZSTD. The expectation on
+  // number of bytes needs to be adjusted in case the cached block is in ZSTD's
+  // digested dictionary format.
+  if (compression_type_ != kZSTD) {
+    // Although we limited buffering to `kBlockLen`, there may be up to two
+    // blocks of data included in the dictionary since we only check limit after
+    // each block is built.
+    ASSERT_LE(
+        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
+        prev_compression_dict_bytes_inserted + 2 * kBlockLen);
+  }
+}
+
+class CompactionCompressionListener : public EventListener {
+ public:
+  explicit CompactionCompressionListener(Options* db_options)
+      : db_options_(db_options) {}
+
+  void OnCompactionCompleted(DB* db, const CompactionJobInfo& ci) override {
+    // Figure out last level with files
+    int bottommost_level = 0;
+    for (int level = 0; level < db->NumberLevels(); level++) {
+      std::string files_at_level;
+      ASSERT_TRUE(
+          db->GetProperty("rocksdb.num-files-at-level" + std::to_string(level),
+                          &files_at_level));
+      if (files_at_level != "0") {
+        bottommost_level = level;
+      }
+    }
+
+    if (db_options_->bottommost_compression != kDisableCompressionOption &&
+        ci.output_level == bottommost_level) {
+      ASSERT_EQ(ci.compression, db_options_->bottommost_compression);
+    } else if (db_options_->compression_per_level.size() != 0) {
+      ASSERT_EQ(ci.compression,
+                db_options_->compression_per_level[ci.output_level]);
+    } else {
+      ASSERT_EQ(ci.compression, db_options_->compression);
+    }
+    max_level_checked = std::max(max_level_checked, ci.output_level);
+  }
+
+  int max_level_checked = 0;
+  const Options* db_options_;
+};
+
+enum CompressionFailureType {
+  kTestCompressionFail,
+  kTestDecompressionFail,
+  kTestDecompressionCorruption
+};
+
+class CompressionFailuresTest
+    : public DBCompressionTest,
+      public testing::WithParamInterface<std::tuple<
+          CompressionFailureType, CompressionType, uint32_t, uint32_t>> {
+ public:
+  CompressionFailuresTest() {
+    std::tie(compression_failure_type_, compression_type_,
+             compression_max_dict_bytes_, compression_parallel_threads_) =
+        GetParam();
+  }
+
+  CompressionFailureType compression_failure_type_ = kTestCompressionFail;
+  CompressionType compression_type_ = kNoCompression;
+  uint32_t compression_max_dict_bytes_ = 0;
+  uint32_t compression_parallel_threads_ = 0;
+};
+
+INSTANTIATE_TEST_CASE_P(
+    DBCompressionTest, CompressionFailuresTest,
+    ::testing::Combine(::testing::Values(kTestCompressionFail,
+                                         kTestDecompressionFail,
+                                         kTestDecompressionCorruption),
+                       ::testing::ValuesIn(GetSupportedCompressions()),
+                       ::testing::Values(0, 10), ::testing::Values(1, 4)));
+
+TEST_P(CompressionFailuresTest, CompressionFailures) {
+  if (compression_type_ == kNoCompression) {
+    return;
+  }
+
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = 2;
+  options.max_bytes_for_level_base = 1024;
+  options.max_bytes_for_level_multiplier = 2;
+  options.num_levels = 7;
+  options.max_background_compactions = 1;
+  options.target_file_size_base = 512;
+
+  BlockBasedTableOptions table_options;
+  table_options.block_size = 512;
+  table_options.verify_compression = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  options.compression = compression_type_;
+  options.compression_opts.parallel_threads = compression_parallel_threads_;
+  options.compression_opts.max_dict_bytes = compression_max_dict_bytes_;
+  options.bottommost_compression_opts.parallel_threads =
+      compression_parallel_threads_;
+  options.bottommost_compression_opts.max_dict_bytes =
+      compression_max_dict_bytes_;
+
+  if (compression_failure_type_ == kTestCompressionFail) {
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+        "CompressData:TamperWithReturnValue", [](void* arg) {
+          bool* ret = static_cast<bool*>(arg);
+          *ret = false;
+        });
+  } else if (compression_failure_type_ == kTestDecompressionFail) {
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+        "DecompressBlockData:TamperWithReturnValue", [](void* arg) {
+          Status* ret = static_cast<Status*>(arg);
+          ASSERT_OK(*ret);
+          *ret = Status::Corruption("kTestDecompressionFail");
+        });
+  } else if (compression_failure_type_ == kTestDecompressionCorruption) {
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+        "DecompressBlockData:TamperWithDecompressionOutput", [](void* arg) {
+          BlockContents* contents = static_cast<BlockContents*>(arg);
+          // Ensure uncompressed data != original data
+          const size_t len = contents->data.size() + 1;
+          std::unique_ptr<char[]> fake_data(new char[len]());
+          *contents = BlockContents(std::move(fake_data), len);
+        });
+  }
+
+  std::map<std::string, std::string> key_value_written;
+
+  const int kKeySize = 5;
+  const int kValUnitSize = 16;
+  const int kValSize = 256;
+  Random rnd(405);
+
+  Status s = Status::OK();
+
+  DestroyAndReopen(options);
+  // Write 10 random files
+  for (int i = 0; i < 10; i++) {
+    for (int j = 0; j < 5; j++) {
+      std::string key = rnd.RandomString(kKeySize);
+      // Ensure good compression ratio
+      std::string valueUnit = rnd.RandomString(kValUnitSize);
+      std::string value;
+      for (int k = 0; k < kValSize; k += kValUnitSize) {
+        value += valueUnit;
+      }
+      s = Put(key, value);
+      if (compression_failure_type_ == kTestCompressionFail) {
+        key_value_written[key] = value;
+        ASSERT_OK(s);
+      }
+    }
+    s = Flush();
+    if (compression_failure_type_ == kTestCompressionFail) {
+      ASSERT_OK(s);
+    }
+    s = dbfull()->TEST_WaitForCompact();
+    if (compression_failure_type_ == kTestCompressionFail) {
+      ASSERT_OK(s);
+    }
+    if (i == 4) {
+      // Make compression fail at the mid of table building
+      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+    }
+  }
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+  if (compression_failure_type_ == kTestCompressionFail) {
+    // Should be kNoCompression, check content consistency
+    std::unique_ptr<Iterator> db_iter(db_->NewIterator(ReadOptions()));
+    for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) {
+      std::string key = db_iter->key().ToString();
+      std::string value = db_iter->value().ToString();
+      ASSERT_NE(key_value_written.find(key), key_value_written.end());
+      ASSERT_EQ(key_value_written[key], value);
+      key_value_written.erase(key);
+    }
+    ASSERT_OK(db_iter->status());
+    ASSERT_EQ(0, key_value_written.size());
+  } else if (compression_failure_type_ == kTestDecompressionFail) {
+    ASSERT_EQ(std::string(s.getState()),
+              "Could not decompress: kTestDecompressionFail");
+  } else if (compression_failure_type_ == kTestDecompressionCorruption) {
+    ASSERT_EQ(std::string(s.getState()),
+              "Decompressed block did not match pre-compression block");
+  }
+}
+
+TEST_F(DBCompressionTest, CompressionOptions) {
+  if (!Zlib_Supported() || !Snappy_Supported()) {
+    return;
+  }
+
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = 2;
+  options.max_bytes_for_level_base = 100;
+  options.max_bytes_for_level_multiplier = 2;
+  options.num_levels = 7;
+  options.max_background_compactions = 1;
+
+  CompactionCompressionListener* listener =
+      new CompactionCompressionListener(&options);
+  options.listeners.emplace_back(listener);
+
+  const int kKeySize = 5;
+  const int kValSize = 20;
+  Random rnd(301);
+
+  std::vector<uint32_t> compression_parallel_threads = {1, 4};
+
+  std::map<std::string, std::string> key_value_written;
+
+  for (int iter = 0; iter <= 2; iter++) {
+    listener->max_level_checked = 0;
+
+    if (iter == 0) {
+      // Use different compression algorithms for different levels but
+      // always use Zlib for bottommost level
+      options.compression_per_level = {kNoCompression,     kNoCompression,
+                                       kNoCompression,     kSnappyCompression,
+                                       kSnappyCompression, kSnappyCompression,
+                                       kZlibCompression};
+      options.compression = kNoCompression;
+      options.bottommost_compression = kZlibCompression;
+    } else if (iter == 1) {
+      // Use Snappy except for bottommost level use ZLib
+      options.compression_per_level = {};
+      options.compression = kSnappyCompression;
+      options.bottommost_compression = kZlibCompression;
+    } else if (iter == 2) {
+      // Use Snappy everywhere
+      options.compression_per_level = {};
+      options.compression = kSnappyCompression;
+      options.bottommost_compression = kDisableCompressionOption;
+    }
+
+    for (auto num_threads : compression_parallel_threads) {
+      options.compression_opts.parallel_threads = num_threads;
+      options.bottommost_compression_opts.parallel_threads = num_threads;
+
+      DestroyAndReopen(options);
+      // Write 10 random files
+      for (int i = 0; i < 10; i++) {
+        for (int j = 0; j < 5; j++) {
+          std::string key = rnd.RandomString(kKeySize);
+          std::string value = rnd.RandomString(kValSize);
+          key_value_written[key] = value;
+          ASSERT_OK(Put(key, value));
+        }
+        ASSERT_OK(Flush());
+        ASSERT_OK(dbfull()->TEST_WaitForCompact());
+      }
+
+      // Make sure that we wrote enough to check all 7 levels
+      ASSERT_EQ(listener->max_level_checked, 6);
+
+      // Make sure database content is the same as key_value_written
+      std::unique_ptr<Iterator> db_iter(db_->NewIterator(ReadOptions()));
+      for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) {
+        std::string key = db_iter->key().ToString();
+        std::string value = db_iter->value().ToString();
+        ASSERT_NE(key_value_written.find(key), key_value_written.end());
+        ASSERT_EQ(key_value_written[key], value);
+        key_value_written.erase(key);
+      }
+      ASSERT_OK(db_iter->status());
+      ASSERT_EQ(0, key_value_written.size());
+    }
+  }
+}
+
+TEST_F(DBCompressionTest, RoundRobinManager) {
+  if (ZSTD_Supported()) {
+    auto mgr =
+        std::make_shared<RoundRobinManager>(GetBuiltinV2CompressionManager());
+
+    std::vector<std::string> values;
+    for (bool use_wrapper : {true}) {
+      SCOPED_TRACE((use_wrapper ? "With " : "No ") + std::string("wrapper"));
+
+      Options options = CurrentOptions();
+      options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+      options.statistics->set_stats_level(StatsLevel::kExceptTimeForMutex);
+      BlockBasedTableOptions bbto;
+      bbto.enable_index_compression = false;
+      options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+      options.compression_manager = use_wrapper ? mgr : nullptr;
+      DestroyAndReopen(options);
+
+      Random rnd(301);
+      constexpr int kCount = 13;
+
+      // Highly compressible blocks, except 1 non-compressible. Half of the
+      // compressible are morked for bypass and 1 marked for rejection. Values
+      // are large enough to ensure just 1 k-v per block.
+      for (int i = 0; i < kCount; ++i) {
+        std::string value;
+        if (i == 6) {
+          // One non-compressible block
+          value = rnd.RandomBinaryString(20000);
+        } else {
+          test::CompressibleString(&rnd, 0.1, 20000, &value);
+        }
+        values.push_back(value);
+        ASSERT_OK(Put(Key(i), value));
+        ASSERT_EQ(Get(Key(i)), value);
+      }
+      ASSERT_OK(Flush());
+
+      // Ensure well-formed for reads
+      for (int i = 0; i < kCount; ++i) {
+        ASSERT_NE(Get(Key(i)), "NOT_FOUND");
+        ASSERT_EQ(Get(Key(i)), values[i]);
+      }
+      ASSERT_EQ(Get(Key(kCount)), "NOT_FOUND");
+    }
+  }
+}
+
+TEST_F(DBCompressionTest, RandomMixedCompressionManager) {
+  if (ZSTD_Supported()) {
+    auto mgr = std::make_shared<RandomMixedCompressionManager>(
+        GetBuiltinV2CompressionManager());
+    std::vector<std::string> values;
+    for (bool use_wrapper : {true}) {
+      SCOPED_TRACE((use_wrapper ? "With " : "No ") + std::string("wrapper"));
+
+      Options options = CurrentOptions();
+      options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+      options.statistics->set_stats_level(StatsLevel::kExceptTimeForMutex);
+      BlockBasedTableOptions bbto;
+      bbto.enable_index_compression = false;
+      options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+      options.compression_manager = use_wrapper ? mgr : nullptr;
+      DestroyAndReopen(options);
+
+      Random rnd(301);
+      constexpr int kCount = 13;
+
+      // Highly compressible blocks, except 1 non-compressible. Half of the
+      // compressible are morked for bypass and 1 marked for rejection. Values
+      // are large enough to ensure just 1 k-v per block.
+      for (int i = 0; i < kCount; ++i) {
+        std::string value;
+        if (i == 6) {
+          // One non-compressible block
+          value = rnd.RandomBinaryString(20000);
+        } else {
+          test::CompressibleString(&rnd, 0.1, 20000, &value);
+        }
+        values.push_back(value);
+        ASSERT_OK(Put(Key(i), value));
+        ASSERT_EQ(Get(Key(i)), value);
+      }
+      ASSERT_OK(Flush());
+
+      // Ensure well-formed for reads
+      for (int i = 0; i < kCount; ++i) {
+        ASSERT_NE(Get(Key(i)), "NOT_FOUND");
+        ASSERT_EQ(Get(Key(i)), values[i]);
+      }
+      ASSERT_EQ(Get(Key(kCount)), "NOT_FOUND");
+    }
+  }
+}
+
+TEST_F(DBCompressionTest, CompressionManagerWrapper) {
+  // Test that we can use a custom CompressionManager to wrap the built-in
+  // CompressionManager, thus adopting a custom *strategy* based on existing
+  // algorithms. This will "mark" some blocks (in their contents) as "do not
+  // compress", i.e. no attempt to compress, and some blocks as "reject
+  // compression", i.e. compression attempted but rejected because of ratio
+  // or otherwise. These cases are distinguishable for statistics that
+  // approximate "wasted effort".
+  static std::string kDoNotCompress = "do_not_compress";
+  static std::string kRejectCompression = "reject_compression";
+
+  struct MyCompressor : public CompressorWrapper {
+    using CompressorWrapper::CompressorWrapper;
+    const char* Name() const override { return "MyCompressor"; }
+
+    Status CompressBlock(Slice uncompressed_data,
+                         std::string* compressed_output,
+                         CompressionType* out_compression_type,
+                         ManagedWorkingArea* working_area) override {
+      auto begin = uncompressed_data.data();
+      auto end = uncompressed_data.data() + uncompressed_data.size();
+      if (std::search(begin, end, kDoNotCompress.begin(),
+                      kDoNotCompress.end()) != end) {
+        // Do not attempt compression
+        EXPECT_EQ(*out_compression_type, kNoCompression);
+        return Status::OK();
+      } else if (std::search(begin, end, kRejectCompression.begin(),
+                             kRejectCompression.end()) != end) {
+        // Simulate attempted & rejected compression
+        *compressed_output = "blah";
+        EXPECT_EQ(*out_compression_type, kNoCompression);
+        return Status::OK();
+      } else {
+        return wrapped_->CompressBlock(uncompressed_data, compressed_output,
+                                       out_compression_type, working_area);
+      }
+    }
+  };
+  struct MyManager : public CompressionManagerWrapper {
+    using CompressionManagerWrapper::CompressionManagerWrapper;
+    const char* Name() const override { return "MyManager"; }
+    std::unique_ptr<Compressor> GetCompressorForSST(
+        const FilterBuildingContext& context, const CompressionOptions& opts,
+        CompressionType preferred) override {
+      return std::make_unique<MyCompressor>(
+          wrapped_->GetCompressorForSST(context, opts, preferred));
+    }
+  };
+  auto mgr = std::make_shared<MyManager>(GetBuiltinV2CompressionManager());
+
+  for (CompressionType type : GetSupportedCompressions()) {
+    for (bool use_wrapper : {false, true}) {
+      if (type == kNoCompression) {
+        continue;
+      }
+      SCOPED_TRACE("Compression type: " + std::to_string(type) +
+                   (use_wrapper ? " with " : " no ") + "wrapper");
+
+      Options options = CurrentOptions();
+      options.compression = type;
+      options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+      options.statistics->set_stats_level(StatsLevel::kExceptTimeForMutex);
+      BlockBasedTableOptions bbto;
+      bbto.enable_index_compression = false;
+      options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+      options.compression_manager = use_wrapper ? mgr : nullptr;
+      DestroyAndReopen(options);
+
+      auto PopStat = [&](Tickers t) -> uint64_t {
+        return options.statistics->getAndResetTickerCount(t);
+      };
+
+      Random rnd(301);
+      constexpr int kCount = 13;
+
+      // Highly compressible blocks, except 1 non-compressible. Half of the
+      // compressible are morked for bypass and 1 marked for rejection. Values
+      // are large enough to ensure just 1 k-v per block.
+      for (int i = 0; i < kCount; ++i) {
+        std::string value;
+        if (i == 6) {
+          // One non-compressible block
+          value = rnd.RandomBinaryString(20000);
+        } else {
+          test::CompressibleString(&rnd, 0.1, 20000, &value);
+          if ((i % 2) == 0) {
+            // Half for bypass
+            value += kDoNotCompress;
+          } else if (i == 7) {
+            // One for rejection
+            value += kRejectCompression;
+          }
+        }
+        ASSERT_OK(Put(Key(i), value));
+      }
+      ASSERT_OK(Flush());
+
+      if (use_wrapper) {
+        EXPECT_EQ(kCount / 2 - 1, PopStat(NUMBER_BLOCK_COMPRESSED));
+        EXPECT_EQ(kCount / 2, PopStat(NUMBER_BLOCK_COMPRESSION_BYPASSED));
+        EXPECT_EQ(1 + 1, PopStat(NUMBER_BLOCK_COMPRESSION_REJECTED));
+      } else {
+        EXPECT_EQ(kCount - 1, PopStat(NUMBER_BLOCK_COMPRESSED));
+        EXPECT_EQ(0, PopStat(NUMBER_BLOCK_COMPRESSION_BYPASSED));
+        EXPECT_EQ(1, PopStat(NUMBER_BLOCK_COMPRESSION_REJECTED));
+      }
+
+      // Ensure well-formed for reads
+      for (int i = 0; i < kCount; ++i) {
+        ASSERT_NE(Get(Key(i)), "NOT_FOUND");
+      }
+      ASSERT_EQ(Get(Key(kCount)), "NOT_FOUND");
+    }
+  }
+}
+
+TEST_F(DBCompressionTest, CompressionManagerCustomCompression) {
+  // Test that we can use a custom CompressionManager to implement custom
+  // compression algorithms, and that there are appropriate schema guard rails
+  // to ensure data is not processed by the wrong algorithm.
+  using Compressor8A = test::CompressorCustomAlg<kCustomCompression8A>;
+  using Compressor8B = test::CompressorCustomAlg<kCustomCompression8B>;
+  using Compressor8C = test::CompressorCustomAlg<kCustomCompression8C>;
+
+  if (!Compressor8A::Supported() || !LZ4_Supported()) {
+    fprintf(stderr,
+            "Prerequisite compression library not supported. Skipping\n");
+    return;
+  }
+
+  class MyManager : public CompressionManager {
+   public:
+    explicit MyManager(const char* compat_name) : compat_name_(compat_name) {}
+    const char* Name() const override { return name_.c_str(); }
+    const char* CompatibilityName() const override { return compat_name_; }
+
+    bool SupportsCompressionType(CompressionType type) const override {
+      return type == kCustomCompression8A || type == kCustomCompression8B ||
+             type == kCustomCompression8C ||
+             GetBuiltinV2CompressionManager()->SupportsCompressionType(type);
+    }
+
+    int used_compressor8A_count_ = 0;
+    int used_compressor8B_count_ = 0;
+    int used_compressor8C_count_ = 0;
+
+    std::unique_ptr<Compressor> GetCompressor(const CompressionOptions& opts,
+                                              CompressionType type) override {
+      switch (static_cast<unsigned char>(type)) {
+        case kCustomCompression8A:
+          used_compressor8A_count_++;
+          return std::make_unique<Compressor8A>();
+        case kCustomCompression8B:
+          used_compressor8B_count_++;
+          return std::make_unique<Compressor8B>();
+        case kCustomCompression8C:
+          used_compressor8C_count_++;
+          return std::make_unique<Compressor8C>();
+        // Also support built-in compression algorithms
+        default:
+          return GetBuiltinV2CompressionManager()->GetCompressor(opts, type);
+      }
+    }
+
+    std::shared_ptr<Decompressor> GetDecompressor() override {
+      return std::make_shared<test::DecompressorCustomAlg>();
+    }
+
+    RelaxedAtomic<CompressionType> last_specific_decompressor_type_{
+        kNoCompression};
+
+    std::shared_ptr<Decompressor> GetDecompressorForTypes(
+        const CompressionType* types_begin,
+        const CompressionType* types_end) override {
+      assert(types_end > types_begin);
+      last_specific_decompressor_type_.StoreRelaxed(*types_begin);
+      auto decomp = std::make_shared<test::DecompressorCustomAlg>();
+      decomp->SetAllowedTypes(types_begin, types_end);
+      return decomp;
+    }
+
+    void AddFriend(const std::shared_ptr<CompressionManager>& mgr) {
+      friends_[mgr->CompatibilityName()] = mgr;
+    }
+    std::shared_ptr<CompressionManager> FindCompatibleCompressionManager(
+        Slice compatibility_name) override {
+      std::shared_ptr<CompressionManager> rv =
+          CompressionManager::FindCompatibleCompressionManager(
+              compatibility_name);
+      if (!rv) {
+        auto it = friends_.find(compatibility_name.ToString());
+        if (it != friends_.end()) {
+          return it->second.lock();
+        }
+      }
+      return rv;
+    }
+
+   private:
+    const char* compat_name_;
+    std::string name_;
+    // weak_ptr to avoid cycles
+    std::map<std::string, std::weak_ptr<CompressionManager>> friends_;
+  };
+
+  for (bool use_dict : {false, true}) {
+    SCOPED_TRACE(use_dict ? "With dict" : "No dict");
+
+    // Although these compression managers are actually compatible, we must
+    // respect their distinct compatibility names and treat them as incompatible
+    // (or else risk processing data incorrectly)
+    // NOTE: these are not registered in ObjectRegistry to test what happens
+    // when the original CompressionManager might not be available, but
+    // mgr_bar will be registered during the test, with different names to
+    // prevent interference between iterations.
+    auto mgr_foo = std::make_shared<MyManager>("Foo");
+    auto mgr_bar = std::make_shared<MyManager>(use_dict ? "Bar1" : "Bar2");
+
+    // And this one claims to be fully compatible with the built-in compression
+    // manager when it's not fully compatible (for custom CompressionTypes)
+    auto mgr_claim_compatible = std::make_shared<MyManager>("BuiltinV2");
+
+    constexpr uint16_t kValueSize = 10000;
+
+    Options options = CurrentOptions();
+    options.level0_file_num_compaction_trigger = 20;
+    BlockBasedTableOptions bbto;
+    bbto.enable_index_compression = false;
+    bbto.format_version = 6;  // Before custom compression alg support
+    options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+    // Claims not to use custom compression (and doesn't unless setting a custom
+    // CompressionType)
+    options.compression_manager = mgr_claim_compatible;
+    // Use a built-in compression type with dictionary support
+    options.compression = kLZ4Compression;
+    options.compression_opts.max_dict_bytes = kValueSize / 2;
+    DestroyAndReopen(options);
+
+    Random rnd(404);
+    std::string value;
+    ASSERT_OK(
+        Put("a", test::CompressibleString(&rnd, 0.1, kValueSize, &value)));
+    ASSERT_OK(Flush());
+
+    // That data should be readable without access to the original compression
+    // manager, because it used the built-in CompatibilityName and a built-in
+    // CompressionType
+    options.compression_manager = nullptr;
+    Reopen(options);
+    ASSERT_EQ(Get("a"), value);
+
+    // Verify it was compressed
+    Range r = {"a", "a0"};
+    TablePropertiesCollection tables_properties;
+    ASSERT_OK(db_->GetPropertiesOfTablesInRange(db_->DefaultColumnFamily(), &r,
+                                                1, &tables_properties));
+    ASSERT_EQ(tables_properties.size(), 1U);
+    EXPECT_LT(tables_properties.begin()->second->data_size, kValueSize / 2);
+    EXPECT_EQ(tables_properties.begin()->second->compression_name, "LZ4");
+
+    // Disallow setting a custom CompressionType with a CompressionManager
+    // claiming to be built-in compatible.
+    options.compression_manager = mgr_claim_compatible;
+    options.compression = kCustomCompression8A;
+    ASSERT_EQ(TryReopen(options).code(), Status::Code::kInvalidArgument);
+
+    options.compression_manager = nullptr;
+    options.compression = kCustomCompressionFE;
+    ASSERT_EQ(TryReopen(options).code(), Status::Code::kInvalidArgument);
+    options.compression =
+        static_cast<CompressionType>(kLastBuiltinCompression + 1);
+    ASSERT_EQ(TryReopen(options).code(), Status::Code::kInvalidArgument);
+
+    // Custom compression schema (different CompatibilityName) not supported
+    // before format_version=7
+    options.compression_manager = mgr_foo;
+    options.compression = kLZ4Compression;
+    ASSERT_EQ(TryReopen(options).code(), Status::Code::kInvalidArgument);
+
+    // Set format version supporting custom compression
+    bbto.format_version = 7;
+    options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+    // Custom compression type not supported with built-in schema name, even
+    // with format_version=7
+    options.compression_manager = mgr_claim_compatible;
+    options.compression = kCustomCompression8B;
+    ASSERT_EQ(TryReopen(options).code(), Status::Code::kInvalidArgument);
+
+    // Custom compression schema, but specifying a custom compression type it
+    // doesn't support.
+    options.compression_manager = mgr_foo;
+    options.compression = kCustomCompressionF0;
+    ASSERT_EQ(TryReopen(options).code(), Status::Code::kNotSupported);
+
+    // Using a built-in compression type with fv=7 but named custom schema
+    options.compression = kLZ4Compression;
+    Reopen(options);
+    ASSERT_OK(
+        Put("b", test::CompressibleString(&rnd, 0.1, kValueSize, &value)));
+    ASSERT_OK(Flush());
+    ASSERT_EQ(NumTableFilesAtLevel(0), 2);
+    ASSERT_EQ(Get("b"), value);
+
+    // Verify it was compressed with LZ4
+    r = {"b", "b0"};
+    tables_properties.clear();
+    ASSERT_OK(db_->GetPropertiesOfTablesInRange(db_->DefaultColumnFamily(), &r,
+                                                1, &tables_properties));
+    ASSERT_EQ(tables_properties.size(), 1U);
+    EXPECT_LT(tables_properties.begin()->second->data_size, kValueSize / 2);
+    // Uses new format for "compression_name" property
+    EXPECT_EQ(tables_properties.begin()->second->compression_name, "Foo;04;");
+    EXPECT_EQ(mgr_foo->last_specific_decompressor_type_.LoadRelaxed(),
+              kLZ4Compression);
+
+    // Custom compression type
+    options.compression = kCustomCompression8A;
+    Reopen(options);
+    ASSERT_OK(
+        Put("c", test::CompressibleString(&rnd, 0.1, kValueSize, &value)));
+    EXPECT_EQ(mgr_foo->used_compressor8A_count_, 0);
+    ASSERT_OK(Flush());
+    ASSERT_EQ(NumTableFilesAtLevel(0), 3);
+    ASSERT_EQ(Get("c"), value);
+    EXPECT_EQ(mgr_foo->used_compressor8A_count_, 1);
+
+    // Verify it was compressed with custom format
+    r = {"c", "c0"};
+    tables_properties.clear();
+    ASSERT_OK(db_->GetPropertiesOfTablesInRange(db_->DefaultColumnFamily(), &r,
+                                                1, &tables_properties));
+    ASSERT_EQ(tables_properties.size(), 1U);
+    EXPECT_LT(tables_properties.begin()->second->data_size, kValueSize / 2);
+    EXPECT_EQ(tables_properties.begin()->second->compression_name, "Foo;8A;");
+    EXPECT_EQ(mgr_foo->last_specific_decompressor_type_.LoadRelaxed(),
+              kCustomCompression8A);
+
+    // Also dynamically changeable, because the compression manager will respect
+    // the current setting as reported under the legacy logic
+    ASSERT_OK(dbfull()->SetOptions({{"compression", "kLZ4Compression"}}));
+    ASSERT_OK(
+        Put("d", test::CompressibleString(&rnd, 0.1, kValueSize, &value)));
+    ASSERT_OK(Flush());
+    ASSERT_EQ(NumTableFilesAtLevel(0), 4);
+    ASSERT_EQ(Get("d"), value);
+
+    // Verify it was compressed with LZ4
+    r = {"d", "d0"};
+    tables_properties.clear();
+    ASSERT_OK(db_->GetPropertiesOfTablesInRange(db_->DefaultColumnFamily(), &r,
+                                                1, &tables_properties));
+    ASSERT_EQ(tables_properties.size(), 1U);
+    EXPECT_LT(tables_properties.begin()->second->data_size, kValueSize / 2);
+    EXPECT_EQ(tables_properties.begin()->second->compression_name, "Foo;04;");
+    EXPECT_EQ(mgr_foo->last_specific_decompressor_type_.LoadRelaxed(),
+              kLZ4Compression);
+
+    // Dynamically changeable to custom compressions also
+    ASSERT_OK(dbfull()->SetOptions({{"compression", "kCustomCompression8B"}}));
+    ASSERT_OK(
+        Put("e", test::CompressibleString(&rnd, 0.1, kValueSize, &value)));
+    ASSERT_OK(Flush());
+    ASSERT_EQ(NumTableFilesAtLevel(0), 5);
+    ASSERT_EQ(Get("e"), value);
+
+    // Verify it was compressed with custom format
+    r = {"e", "e0"};
+    tables_properties.clear();
+    ASSERT_OK(db_->GetPropertiesOfTablesInRange(db_->DefaultColumnFamily(), &r,
+                                                1, &tables_properties));
+    ASSERT_EQ(tables_properties.size(), 1U);
+    EXPECT_LT(tables_properties.begin()->second->data_size, kValueSize / 2);
+    EXPECT_EQ(tables_properties.begin()->second->compression_name, "Foo;8B;");
+    EXPECT_EQ(mgr_foo->last_specific_decompressor_type_.LoadRelaxed(),
+              kCustomCompression8B);
+
+    // Fails to re-open with incompatible compression manager (can't find
+    // compression manager Foo because it's not registered nor known by Bar)
+    options.compression_manager = mgr_bar;
+    options.compression = kLZ4Compression;
+    ASSERT_EQ(TryReopen(options).code(), Status::Code::kNotSupported);
+
+    // But should re-open if we make Bar aware of the Foo compression manager
+    mgr_bar->AddFriend(mgr_foo);
+    Reopen(options);
+
+    // Can still read everything
+    ASSERT_EQ(Get("a").size(), kValueSize);
+    ASSERT_EQ(Get("b").size(), kValueSize);
+    ASSERT_EQ(Get("c").size(), kValueSize);
+    ASSERT_EQ(Get("d").size(), kValueSize);
+    ASSERT_EQ(Get("e").size(), kValueSize);
+
+    // Add a file using mgr_bar
+    ASSERT_OK(
+        Put("f", test::CompressibleString(&rnd, 0.1, kValueSize, &value)));
+    ASSERT_OK(Flush());
+    ASSERT_EQ(NumTableFilesAtLevel(0), 6);
+    ASSERT_EQ(Get("f"), value);
+
+    // Verify it was compressed appropriately
+    r = {"f", "f0"};
+    tables_properties.clear();
+    ASSERT_OK(db_->GetPropertiesOfTablesInRange(db_->DefaultColumnFamily(), &r,
+                                                1, &tables_properties));
+    ASSERT_EQ(tables_properties.size(), 1U);
+    EXPECT_LT(tables_properties.begin()->second->data_size, kValueSize / 2);
+    EXPECT_EQ(mgr_bar->last_specific_decompressor_type_.LoadRelaxed(),
+              kLZ4Compression);
+
+    // Fails to re-open with incompatible compression manager (can't find
+    // compression manager Bar because it's not registered nor known by Foo)
+    options.compression_manager = mgr_foo;
+    ASSERT_EQ(TryReopen(options).code(), Status::Code::kNotSupported);
+
+    // Register and re-open
+    auto& library = *ObjectLibrary::Default();
+    library.AddFactory<CompressionManager>(
+        mgr_bar->CompatibilityName(),
+        [mgr_bar](const std::string& /*uri*/,
+                  std::unique_ptr<CompressionManager>* guard,
+                  std::string* /*errmsg*/) {
+          *guard = std::make_unique<MyManager>(mgr_bar->CompatibilityName());
+          return guard->get();
+        });
+    Reopen(options);
+
+    // Can still read everything
+    ASSERT_EQ(Get("a").size(), kValueSize);
+    ASSERT_EQ(Get("b").size(), kValueSize);
+    ASSERT_EQ(Get("c").size(), kValueSize);
+    ASSERT_EQ(Get("d").size(), kValueSize);
+    ASSERT_EQ(Get("e").size(), kValueSize);
+    ASSERT_EQ(Get("f").size(), kValueSize);
+
+    // TODO: test old version of a compression manager unable to read a
+    // compression type
+  }
+}
+
+TEST_F(DBCompressionTest, FailWhenCompressionNotSupportedTest) {
+  CompressionType compressions[] = {kZlibCompression, kBZip2Compression,
+                                    kLZ4Compression, kLZ4HCCompression,
+                                    kXpressCompression};
+  for (auto comp : compressions) {
+    if (!CompressionTypeSupported(comp)) {
+      // not supported, we should fail the Open()
+      Options options = CurrentOptions();
+      options.compression = comp;
+      ASSERT_TRUE(!TryReopen(options).ok());
+      // Try if CreateColumnFamily also fails
+      options.compression = kNoCompression;
+      ASSERT_OK(TryReopen(options));
+      ColumnFamilyOptions cf_options(options);
+      cf_options.compression = comp;
+      ColumnFamilyHandle* handle;
+      ASSERT_TRUE(!db_->CreateColumnFamily(cf_options, "name", &handle).ok());
+    }
+  }
+}
 
 class AutoSkipTestFlushBlockPolicy : public FlushBlockPolicy {
  public:

From 6c267a32174f31bcb4e8cb85a501d81b76da82da Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Fri, 11 Jul 2025 09:23:14 -0700
Subject: [PATCH 174/500] Improve some unreachable-after-loop code (#13764)

Summary:
in log_reader.cc.
* `for (;;)` (with no matching break inside) should be more structurally recognizable to compilers as unreachable after compared to `while (true)` which compilers can treat as conditional for warning/error purposes because `true` might have come from a macro, etc.
* Comment the `break` statements to indicate they are for the `switch` (not the `for`)
* No code or annotation is apparently needed for the unreachable end of the non-void function, so just a comment

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13764

Test Plan: CI

Reviewed By: archang19

Differential Revision: D78135493

Pulled By: pdillinger

fbshipit-source-id: e313435a846a6e15346acf40404f755be98ab09a
---
 db/log_reader.cc | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/db/log_reader.cc b/db/log_reader.cc
index 0f0e25033ab5..71b84b428987 100644
--- a/db/log_reader.cc
+++ b/db/log_reader.cc
@@ -95,7 +95,7 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch,
   uint64_t prospective_record_offset = 0;
 
   Slice fragment;
-  while (true) {
+  for (;;) {
     uint64_t physical_record_offset = end_of_buffer_offset_ - buffer_.size();
     size_t drop_size = 0;
     const uint8_t record_type =
@@ -140,7 +140,7 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch,
         prospective_record_offset = physical_record_offset;
         scratch->assign(fragment.data(), fragment.size());
         in_fragmented_record = true;
-        break;
+        break;  // switch
 
       case kMiddleType:
       case kRecyclableMiddleType:
@@ -153,7 +153,7 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch,
           }
           scratch->append(fragment.data(), fragment.size());
         }
-        break;
+        break;  // switch
 
       case kLastType:
       case kRecyclableLastType:
@@ -171,7 +171,7 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch,
           first_record_read_ = true;
           return true;
         }
-        break;
+        break;  // switch
 
       case kSetCompressionType: {
         if (compression_type_record_read_) {
@@ -193,7 +193,7 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch,
         } else {
           InitCompression(compression_record);
         }
-        break;
+        break;  // switch
       }
       case kPredecessorWALInfoType:
       case kRecyclePredecessorWALInfoType: {
@@ -210,7 +210,7 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch,
           MaybeVerifyPredecessorWALInfo(wal_recovery_mode, fragment,
                                         recorded_predecessor_wal_info);
         }
-        break;
+        break;  // switch
       }
       case kUserDefinedTimestampSizeType:
       case kRecyclableUserDefinedTimestampSizeType: {
@@ -235,7 +235,7 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch,
             ReportCorruption(fragment.size(), s.getState());
           }
         }
-        break;
+        break;  // switch
       }
 
       case kBadHeader:
@@ -304,7 +304,7 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch,
           in_fragmented_record = false;
           scratch->clear();
         }
-        break;
+        break;  // switch
 
       case kBadRecordLen:
         if (eof_) {
@@ -337,7 +337,7 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch,
           in_fragmented_record = false;
           scratch->clear();
         }
-        break;
+        break;  // switch
 
       default: {
         if ((record_type & kRecordTypeSafeIgnoreMask) == 0) {
@@ -349,11 +349,11 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch,
         }
         in_fragmented_record = false;
         scratch->clear();
-        break;
+        break;  // switch
       }
     }
   }
-  return false;
+  // unreachable
 }
 
 void Reader::MaybeVerifyPredecessorWALInfo(

From b7cd1fd66279bc029ac1f26b8d1247b1a6ab3757 Mon Sep 17 00:00:00 2001
From: Xingbo Wang <xbw@fb.com>
Date: Mon, 14 Jul 2025 11:04:00 -0700
Subject: [PATCH 175/500] Track FSRandomRWFile open/close in Fault injection fs
 (#13771)

Summary:
The Stress test was broken due to a change in switching from ReopenWritableFile to FSRandomRWFile for sync linked file in external Sst ingestion job. The Stress test is using FaultInjectionFs, which tracks the opening of ReopenWritableFile properly, but does not track FSRandomRWFile properly. This change fixes the tracking of FSRandomRWFile in FaultInjectionFs.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13771

Test Plan: unit test, stress test

Reviewed By: mszeszko-meta

Differential Revision: D78282719

Pulled By: xingbowang

fbshipit-source-id: f8f2ed8a5b28a76836f75effbdfa2c3bb172dc51
---
 utilities/fault_injection_fs.cc | 12 ++++++++++--
 utilities/fault_injection_fs.h  |  9 +++++++--
 2 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/utilities/fault_injection_fs.cc b/utilities/fault_injection_fs.cc
index 1c55cbcba6ff..24916019dd8d 100644
--- a/utilities/fault_injection_fs.cc
+++ b/utilities/fault_injection_fs.cc
@@ -399,10 +399,10 @@ IOStatus TestFSWritableFile::RangeSync(uint64_t offset, uint64_t nbytes,
   return io_s;
 }
 
-TestFSRandomRWFile::TestFSRandomRWFile(const std::string& /*fname*/,
+TestFSRandomRWFile::TestFSRandomRWFile(const std::string& fname,
                                        std::unique_ptr<FSRandomRWFile>&& f,
                                        FaultInjectionTestFS* fs)
-    : target_(std::move(f)), file_opened_(true), fs_(fs) {
+    : fname_(fname), target_(std::move(f)), file_opened_(true), fs_(fs) {
   assert(target_ != nullptr);
 }
 
@@ -433,6 +433,7 @@ IOStatus TestFSRandomRWFile::Read(uint64_t offset, size_t n,
 
 IOStatus TestFSRandomRWFile::Close(const IOOptions& options,
                                    IODebugContext* dbg) {
+  fs_->RandomRWFileClosed(fname_);
   if (!fs_->IsFilesystemActive()) {
     return fs_->GetError();
   }
@@ -1273,6 +1274,13 @@ IOStatus FaultInjectionTestFS::AbortIO(std::vector<void*>& io_handles) {
   return target()->AbortIO(io_handles);
 }
 
+void FaultInjectionTestFS::RandomRWFileClosed(const std::string& fname) {
+  MutexLock l(&mutex_);
+  if (open_managed_files_.find(fname) != open_managed_files_.end()) {
+    open_managed_files_.erase(fname);
+  }
+}
+
 void FaultInjectionTestFS::WritableFileClosed(const FSFileState& state) {
   MutexLock l(&mutex_);
   if (open_managed_files_.find(state.filename_) != open_managed_files_.end()) {
diff --git a/utilities/fault_injection_fs.h b/utilities/fault_injection_fs.h
index 0c8b789b8049..e2399a191663 100644
--- a/utilities/fault_injection_fs.h
+++ b/utilities/fault_injection_fs.h
@@ -106,8 +106,8 @@ class TestFSWritableFile : public FSWritableFile {
   const bool unsync_data_loss_;
 };
 
-// A wrapper around WritableFileWriter* file
-// is written to or sync'ed.
+// A wrapper around FSRandomRWFile* file
+// is read from/write to or sync'ed.
 class TestFSRandomRWFile : public FSRandomRWFile {
  public:
   explicit TestFSRandomRWFile(const std::string& fname,
@@ -128,6 +128,9 @@ class TestFSRandomRWFile : public FSRandomRWFile {
   bool use_direct_io() const override { return target_->use_direct_io(); }
 
  private:
+  // keep a copy of file name, so we can untrack it in File system, when it is
+  // closed
+  std::string fname_;
   std::unique_ptr<FSRandomRWFile> target_;
   bool file_opened_;
   FaultInjectionTestFS* fs_;
@@ -341,6 +344,8 @@ class FaultInjectionTestFS : public FileSystemWrapper {
 
   void WritableFileAppended(const FSFileState& state);
 
+  void RandomRWFileClosed(const std::string& fname);
+
   IOStatus DropUnsyncedFileData();
 
   IOStatus DropRandomUnsyncedFileData(Random* rnd);

From 60a017209631cadcf49171b1c96d981bbe8c39c9 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Mon, 14 Jul 2025 17:26:22 -0700
Subject: [PATCH 176/500] Compression API clarifcations/minor fixes (#13775)

Summary:
* A number of comments clarifying contracts, etc.
* Make ReleaseWorkingArea public instead of protected because there are some limited cases where a wrapper implementation might want to call it directly
* Check non-empty dictionary precondition on MaybeCloneForDict
* Expand testing of wrapped WorkingAreas
* Random documentation improvement in block_builder.cc

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13775

Test Plan: existing and expanded tests and assertions

Reviewed By: hx235

Differential Revision: D78304550

Pulled By: pdillinger

fbshipit-source-id: e5f064e8405a5a49be123ee13145cb3626bbbfbf
---
 include/rocksdb/advanced_compression.h | 27 ++++++++++++++++++++++++--
 table/block_based/block_builder.cc     |  8 ++++++--
 util/compression.cc                    |  2 ++
 util/compression.h                     | 22 +++++++++++++++------
 util/compression_test.cc               | 21 ++++++++++++++++++++
 5 files changed, 70 insertions(+), 10 deletions(-)

diff --git a/include/rocksdb/advanced_compression.h b/include/rocksdb/advanced_compression.h
index 42cd87ec391e..7ff257f58b79 100644
--- a/include/rocksdb/advanced_compression.h
+++ b/include/rocksdb/advanced_compression.h
@@ -50,6 +50,10 @@ struct FilterBuildingContext;
 // a number of built-in CompressionTypes that ignore any dictionary block in
 // the file; therefore they cannot accommodate dictionary compression in the
 // future without a schema change / extension.)
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
 class Compressor {
  public:
   Compressor() = default;
@@ -134,15 +138,15 @@ class Compressor {
   // EXTENSIBLE or reinterpret_cast-able by custom Compressor implementations
   struct WorkingArea {};
 
- protected:
   // To allow for flexible re-use / reclaimation, we have explicit Get and
   // Release functions, and usually wrap in a special RAII smart pointer.
   // For example, a WorkingArea could be saved/recycled in thread-local or
   // core-local storage, or heap managed, etc., though an explicit WorkingArea
   // is only advised for repeated compression (by a single thread).
+  // ReleaseWorkingArea() in not intended to be called directly, but used by
+  // ManagedWorkingArea.
   virtual void ReleaseWorkingArea(WorkingArea*) {}
 
- public:
   using ManagedWorkingArea =
       ManagedPtr<WorkingArea, Compressor, &Compressor::ReleaseWorkingArea>;
 
@@ -221,6 +225,10 @@ class Compressor {
 // decompressed into part of a single buffer allocated to hold a block's
 // uncompressed contents along with an in-memory object representation of the
 // block (to reduce fragmentation and other overheads of separate objects).
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
 class Decompressor {
  public:
   Decompressor() = default;
@@ -278,6 +286,9 @@ class Decompressor {
   // supported for this kind of Decompressor. Corruption - dictionary is
   // malformed (though many implementations will accept any data as a
   // dictionary)
+  //
+  // RocksDB promises not to call this function with an empty dictionary slice
+  // (equivalent to no dictionary).
   virtual Status MaybeCloneForDict(const Slice& /*serialized_dict*/,
                                    std::unique_ptr<Decompressor>* /*out*/) {
     return Status::NotSupported(
@@ -339,6 +350,10 @@ class Decompressor {
 //   (because that would break backward compatibility, potential quiet
 //   corruption)
 // TODO: consider adding optional streaming compression support (low priority)
+//
+// Exceptions MUST NOT propagate out of overridden functions into RocksDB,
+// because RocksDB is not exception-safe. This could cause undefined behavior
+// including data loss, unreported corruption, deadlocks, and more.
 class CompressionManager
     : public std::enable_shared_from_this<CompressionManager>,
       public Customizable {
@@ -466,6 +481,10 @@ class CompressorWrapper : public Compressor {
     return wrapped_->ObtainWorkingArea();
   }
 
+  // NOTE: Don't need to override ReleaseWorkingArea() here because
+  // ManagedWorkingArea takes care of calling it on the Compressor that created
+  // the WorkingArea.
+
   Status CompressBlock(Slice uncompressed_data, std::string* compressed_output,
                        CompressionType* out_compression_type,
                        ManagedWorkingArea* working_area) override {
@@ -491,6 +510,10 @@ class DecompressorWrapper : public Decompressor {
     wrapped_->ReleaseWorkingArea(wa);
   }
 
+  // NOTE: Don't need to override ReleaseWorkingArea() here because
+  // ManagedWorkingArea takes care of calling it on the Decompressor that
+  // created the WorkingArea.
+
   ManagedWorkingArea ObtainWorkingArea(CompressionType preferred) override {
     return wrapped_->ObtainWorkingArea(preferred);
   }
diff --git a/table/block_based/block_builder.cc b/table/block_based/block_builder.cc
index e4950e4356bf..f3a2834ab1d0 100644
--- a/table/block_based/block_builder.cc
+++ b/table/block_based/block_builder.cc
@@ -21,15 +21,19 @@
 // An entry for a particular key-value pair has the form:
 //     shared_bytes: varint32
 //     unshared_bytes: varint32
-//     value_length: varint32
+//     value_length: varint32 (NOTE1)
 //     key_delta: char[unshared_bytes]
 //     value: char[value_length]
-// shared_bytes == 0 for restart points.
+// shared_bytes == 0 (explicitly stored) for restart points.
 //
 // The trailer of the block has the form:
 //     restarts: uint32[num_restarts]
 //     num_restarts: uint32
 // restarts[i] contains the offset within the block of the ith restart point.
+//
+// NOTE1: omitted for format_version >= 4 index blocks, because the value is
+// composed of one (shared_bytes > 0) or two (shared_bytes == 0) varints, whose
+// length is self-describing.
 
 #include "table/block_based/block_builder.h"
 
diff --git a/util/compression.cc b/util/compression.cc
index 0aa473b179dc..16177f09ce45 100644
--- a/util/compression.cc
+++ b/util/compression.cc
@@ -746,6 +746,8 @@ class BuiltinDecompressorV2WithDict : public BuiltinDecompressorV2 {
 
 Status BuiltinDecompressorV2::MaybeCloneForDict(
     const Slice& dict, std::unique_ptr<Decompressor>* out) {
+  // Check RocksDB-promised precondition
+  assert(dict.size() > 0);
   // Because of unfortunate decisions in handling built-in compression types,
   // all the compression types before ZSTD that do not actually support
   // dictionary compression pretend to support it. Specifically, we have to be
diff --git a/util/compression.h b/util/compression.h
index 15f576b53623..8c613b2f373d 100644
--- a/util/compression.h
+++ b/util/compression.h
@@ -240,15 +240,25 @@ struct DecompressorDict {
 
  private:
   void Populate(Decompressor& from_decompressor, Slice dict) {
-    Status s = from_decompressor.MaybeCloneForDict(dict, &decompressor_);
-    if (decompressor_ == nullptr) {
+    if (UNLIKELY(dict.empty())) {
       dict_str_ = {};
       dict_allocation_ = {};
-      assert(!s.ok());
-      decompressor_ = std::make_unique<FailureDecompressor>(std::move(s));
+      // Appropriately reject bad files with empty dictionary block.
+      // It is longstanding not to write an empty dictionary block:
+      // https://github.com/facebook/rocksdb/blame/10.2.fb/table/block_based/block_based_table_builder.cc#L1841
+      decompressor_ = std::make_unique<FailureDecompressor>(
+          Status::Corruption("Decompression dictionary is empty"));
     } else {
-      assert(s.ok());
-      assert(decompressor_->GetSerializedDict() == dict);
+      Status s = from_decompressor.MaybeCloneForDict(dict, &decompressor_);
+      if (decompressor_ == nullptr) {
+        dict_str_ = {};
+        dict_allocation_ = {};
+        assert(!s.ok());
+        decompressor_ = std::make_unique<FailureDecompressor>(std::move(s));
+      } else {
+        assert(s.ok());
+        assert(decompressor_->GetSerializedDict() == dict);
+      }
     }
 
     memory_usage_ = sizeof(struct DecompressorDict);
diff --git a/util/compression_test.cc b/util/compression_test.cc
index 17521982b2b4..176179ff704f 100644
--- a/util/compression_test.cc
+++ b/util/compression_test.cc
@@ -1124,6 +1124,27 @@ TEST_F(DBCompressionTest, CompressionManagerWrapper) {
                                        out_compression_type, working_area);
       }
     }
+
+    // Also check WorkingArea handling
+    struct MyWorkingArea : public WorkingArea {
+      explicit MyWorkingArea(ManagedWorkingArea&& wrapped)
+          : wrapped_(std::move(wrapped)) {}
+      ManagedWorkingArea wrapped_;
+    };
+    ManagedWorkingArea ObtainWorkingArea() override {
+      ManagedWorkingArea rv{
+          new MyWorkingArea{CompressorWrapper::ObtainWorkingArea()}, this};
+      if (GetPreferredCompressionType() == kZSTD) {
+        // ZSTD should always use WorkingArea, so this is our chance to ensure
+        // CompressorWrapper::ObtainWorkingArea() is properly connected
+        assert(rv.get() != nullptr);
+      }
+      return rv;
+    }
+
+    void ReleaseWorkingArea(WorkingArea* wa) override {
+      delete static_cast<MyWorkingArea*>(wa);
+    }
   };
   struct MyManager : public CompressionManagerWrapper {
     using CompressionManagerWrapper::CompressionManagerWrapper;

From f6841d1e68eb18a6b0770d6e7caa6a003e5e804e Mon Sep 17 00:00:00 2001
From: anand76 <anand1976@users.noreply.github.com>
Date: Mon, 14 Jul 2025 18:34:56 -0700
Subject: [PATCH 177/500] Fix DeleteFile error handling in
 SstFileWriter::Finish (#13776)

Summary:
In SstFileWriter::Finish, the call to DeleteFile to delete the output file in case of an error may fail. The current behavior is to ignore the error. In stress tests, there may be expected failures due to error injection. Not acting on the return status will cause the ASSERT_STATUS_CHECKED test to fail, so silence it.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13776

Reviewed By: mszeszko-meta

Differential Revision: D78307124

Pulled By: anand1976

fbshipit-source-id: d27d9397c15cac5cb33b27094c9123a3fde7fa24
---
 table/sst_file_writer.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/table/sst_file_writer.cc b/table/sst_file_writer.cc
index 9343b6feed91..fae60d82dd4d 100644
--- a/table/sst_file_writer.cc
+++ b/table/sst_file_writer.cc
@@ -497,9 +497,9 @@ Status SstFileWriter::Finish(ExternalSstFileInfo* file_info) {
   }
   if (!s.ok()) {
     Status status = r->ioptions.env->DeleteFile(r->file_info.file_path);
-    // Silence ASSERT_STATUS_CHECKED warning
-    assert(status.ok());
-    ;
+    // Silence ASSERT_STATUS_CHECKED warning, since DeleteFile may fail under
+    // some error injection, and we can just ignore the failure
+    status.PermitUncheckedError();
   }
 
   if (file_info != nullptr) {

From 768ef1fad4306231bbfdaf59bc2a608688e3d773 Mon Sep 17 00:00:00 2001
From: Anand Ananthabhotla <anand76@meta.com>
Date: Tue, 15 Jul 2025 10:37:20 -0700
Subject: [PATCH 178/500] User defined index reader and iterator (#13727)

Summary:
Pull Request resolved: https://github.com/facebook/rocksdb/pull/13727

Add UserDefinedIndexReader and UserDefinedIndexIterator. The BlockBasedTable reads the user defined index meta block during open, verifies the checksum, pins in cache or heap depending on configuration, and allocates a UserDefinedIndexReader object with the contents. Similar to the builder, an IndexReader wrapper is allocated. The wrapper forwards the calls to the native reader and/or user defined index reader as appropriate.

A new option, table_index_name, in ReadOptions specifies the index to use when creating a new Iterator.

Reviewed By: pdillinger

Differential Revision: D76165694

fbshipit-source-id: c30bde4c5ce91ea3dc9ad302e73fe4963c1ed457
---
 include/rocksdb/options.h                     |  12 ++
 include/rocksdb/user_defined_index.h          |  52 +++++-
 table/block_based/block_based_table_reader.cc |  38 +++-
 table/block_based/block_based_table_reader.h  |  22 ++-
 table/block_based/block_cache.cc              |   6 +
 table/block_based/block_cache.h               |  12 ++
 .../block_based/user_defined_index_wrapper.h  | 122 ++++++++++++-
 table/table_test.cc                           | 170 +++++++++++++++++-
 8 files changed, 416 insertions(+), 18 deletions(-)

diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index ad4efe021c06..8c1593b13c28 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -57,6 +57,7 @@ class Statistics;
 class InternalKeyComparator;
 class WalFilter;
 class FileSystem;
+class UserDefinedIndexFactory;
 
 struct Options;
 struct DbPath;
@@ -2069,6 +2070,17 @@ struct ReadOptions {
   // Default: false
   bool auto_refresh_iterator_with_snapshot = false;
 
+  // EXPERIMENTAL
+  //
+  // Specify an alternate index to use in the SST files instead of the native
+  // block based table index. The table_factory used for the column family
+  // must support building/reading this index.
+  //
+  // Currently, only forward scans are supported. For forward scans, only Seek()
+  // is supported. SeekToFirst() is not supported. If the caller wishes to scan
+  // from start to end, the native index must be used.
+  const UserDefinedIndexFactory* table_index_factory = nullptr;
+
   // *** END options only relevant to iterators or scans ***
 
   // *** BEGIN options for RocksDB internal use only ***
diff --git a/include/rocksdb/user_defined_index.h b/include/rocksdb/user_defined_index.h
index a4fd5c90ae09..d399d260908c 100644
--- a/include/rocksdb/user_defined_index.h
+++ b/include/rocksdb/user_defined_index.h
@@ -11,7 +11,9 @@
 
 #include <string>
 
+#include "rocksdb/advanced_iterator.h"
 #include "rocksdb/customizable.h"
+#include "rocksdb/options.h"
 #include "rocksdb/slice.h"
 #include "rocksdb/status.h"
 
@@ -23,7 +25,8 @@ inline const std::string kUserDefinedIndexPrefix =
 
 // This is a public API for user-defined index builders.
 // It allows users to define their own index format and build custom
-// indexes during table building.
+// indexes during table building. Currently, only a monolithic index
+// block is supported (no partitioned index).
 
 // The interface for building user-defined index.
 class UserDefinedIndexBuilder {
@@ -52,6 +55,9 @@ class UserDefinedIndexBuilder {
   // @last_key_in_current_block: The last key in the current data block
   // @first_key_in_next_block: it will be nullptr if the entry being added is
   //                           the last one in the table
+  // @block_handle: offset/size of the data block referenced by this index
+  //                entry. This should be stored along with the index entry
+  //                key
   // @separator_scratch: a scratch buffer to back a computed separator between
   //                     those, as needed. May be modified on each call.
   // @return: the key or separator stored in the index, which could be
@@ -76,6 +82,45 @@ class UserDefinedIndexBuilder {
   virtual Status Finish(Slice* index_contents) = 0;
 };
 
+// The interface for iterating the user defined index. This will be
+// instantiated and used by a scan to iterate through the index entries
+// covered by the scan.
+class UserDefinedIndexIterator {
+ public:
+  virtual ~UserDefinedIndexIterator() = default;
+
+  // Given the target key, position the index iterator at the index entry
+  // with the smallest key >= target. The result must be updated with the
+  // index key, and the bound_check_result. The bound_check_result should
+  // be set to kOutOfBound if no block satisfies the target key and
+  // termination criteria, kInbound if the data block is definitely fully
+  // within bounds, or kUnknown if the data block could be partially
+  // within bounds.
+  virtual Status SeekAndGetResult(const Slice& target,
+                                  IterateResult* result) = 0;
+
+  // Advance to the next index entry. The result must be populated similar
+  // to SeekAndGetResult.
+  virtual Status NextAndGetResult(IterateResult* result) = 0;
+
+  // Return the BlockHandle in the current index entry
+  virtual UserDefinedIndexBuilder::BlockHandle value() = 0;
+};
+
+// A reader interface for the user defined index
+class UserDefinedIndexReader {
+ public:
+  virtual ~UserDefinedIndexReader() = default;
+
+  // Allocate an iterator that will be used by RocksDB to perform scans
+  virtual std::unique_ptr<UserDefinedIndexIterator> NewIterator(
+      const ReadOptions& read_options) = 0;
+
+  // The memory usage of the index, including the size of the raw contents and
+  // any other heap data structures allocated by the reader
+  virtual size_t ApproximateMemoryUsage() const = 0;
+};
+
 // Factory for creating user-defined index builders.
 class UserDefinedIndexFactory : public Customizable {
  public:
@@ -83,6 +128,11 @@ class UserDefinedIndexFactory : public Customizable {
 
   // Create a new builder for user-defined index.
   virtual UserDefinedIndexBuilder* NewBuilder() const = 0;
+
+  // Create a new user defined index reader given the contents of the index
+  // block
+  virtual std::unique_ptr<UserDefinedIndexReader> NewReader(
+      Slice& index_block) const = 0;
 };
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc
index 13fee36bf56f..685d6eb99bcc 100644
--- a/table/block_based/block_based_table_reader.cc
+++ b/table/block_based/block_based_table_reader.cc
@@ -59,6 +59,7 @@
 #include "table/block_based/hash_index_reader.h"
 #include "table/block_based/partitioned_filter_block.h"
 #include "table/block_based/partitioned_index_reader.h"
+#include "table/block_based/user_defined_index_wrapper.h"
 #include "table/block_fetcher.h"
 #include "table/format.h"
 #include "table/get_context.h"
@@ -114,6 +115,7 @@ INSTANTIATE_BLOCKLIKE_TEMPLATES(Block_kIndex);
 INSTANTIATE_BLOCKLIKE_TEMPLATES(Block_kFilterPartitionIndex);
 INSTANTIATE_BLOCKLIKE_TEMPLATES(Block_kRangeDeletion);
 INSTANTIATE_BLOCKLIKE_TEMPLATES(Block_kMetaIndex);
+INSTANTIATE_BLOCKLIKE_TEMPLATES(Block_kUserDefinedIndex);
 
 }  // namespace ROCKSDB_NAMESPACE
 
@@ -1319,6 +1321,34 @@ Status BlockBasedTable::PrefetchIndexAndFilterBlocks(
   if (!s.ok()) {
     return s;
   }
+  if (table_options.user_defined_index_factory != nullptr) {
+    std::string udi_name(table_options.user_defined_index_factory->Name());
+    BlockHandle udi_block_handle;
+
+    // Should we use FindOptionalMetaBlock here?
+    s = FindMetaBlock(meta_iter, kUserDefinedIndexPrefix + udi_name,
+                      &udi_block_handle);
+    if (!s.ok()) {
+      return s;
+    }
+    // Read the block, and allocate on heap or pin in cache. The UDI block is
+    // not compressed. RetrieveBlock will verify the checksum.
+    s = RetrieveBlock(prefetch_buffer, ro, udi_block_handle,
+                      rep_->decompressor.get(), &rep_->udi_block,
+                      /*get_context=*/nullptr, lookup_context,
+                      /*for_compaction=*/false, use_cache, /*async_read=*/false,
+                      /*use_block_cache_for_lookup=*/false);
+    if (!s.ok()) {
+      return s;
+    }
+    assert(!rep_->udi_block.IsEmpty());
+
+    std::unique_ptr<UserDefinedIndexReader> udi_reader =
+        table_options.user_defined_index_factory->NewReader(
+            rep_->udi_block.GetValue()->data);
+    index_reader = std::make_unique<UserDefinedIndexReaderWrapper>(
+        udi_name, std::move(index_reader), std::move(udi_reader));
+  }
 
   rep_->index_reader = std::move(index_reader);
 
@@ -1766,8 +1796,7 @@ BlockBasedTable::MaybeReadBlockAndLoadToCache(
         ro.fill_cache) {
       Statistics* statistics = rep_->ioptions.stats;
       const bool maybe_compressed =
-          TBlocklike::kBlockType != BlockType::kFilter &&
-          TBlocklike::kBlockType != BlockType::kCompressionDictionary &&
+          BlockTypeMaybeCompressed(TBlocklike::kBlockType) &&
           rep_->decompressor;
       // This flag, if true, tells BlockFetcher to return the uncompressed
       // block when ReadBlockContents() is called.
@@ -1911,6 +1940,7 @@ BlockBasedTable::SaveLookupContextOrTraceRecord(
       trace_block_type = TraceType::kBlockTraceRangeDeletionBlock;
       break;
     case BlockType::kIndex:
+    case BlockType::kUserDefinedIndex:
       trace_block_type = TraceType::kBlockTraceIndexBlock;
       break;
     default:
@@ -2003,9 +2033,7 @@ WithBlocklikeCheck<Status, TBlocklike> BlockBasedTable::RetrieveBlock(
   }
 
   const bool maybe_compressed =
-      TBlocklike::kBlockType != BlockType::kFilter &&
-      TBlocklike::kBlockType != BlockType::kCompressionDictionary &&
-      rep_->decompressor;
+      BlockTypeMaybeCompressed(TBlocklike::kBlockType) && rep_->decompressor;
   std::unique_ptr<TBlocklike> block;
 
   {
diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h
index 8a8a3e493a02..14616e7cf069 100644
--- a/table/block_based/block_based_table_reader.h
+++ b/table/block_based/block_based_table_reader.h
@@ -228,11 +228,15 @@ class BlockBasedTable : public TableReader {
 
     // Create an iterator for index access. If iter is null, then a new object
     // is created on the heap, and the callee will have the ownership.
-    // If a non-null iter is passed in, it will be used, and the returned value
-    // is either the same as iter or a new on-heap object that
-    // wraps the passed iter. In the latter case the return value points
-    // to a different object then iter, and the callee has the ownership of the
-    // returned object.
+    // If a non-null iter is passed in, it may be used, and the returned value
+    // is either the same as iter or a new on-heap object.
+    // In the latter case the return value points to a different object then
+    // iter, and the callee has the ownership of the returned object.
+    //
+    // Under all circumstances, the caller MUST use the returned iterator
+    // for further operations. If the returned iterator != iter, then the
+    // caller MUST ensure that iter stays in scope until the returned
+    // iterator is destroyed.
     virtual InternalIteratorBase<IndexValue>* NewIterator(
         const ReadOptions& read_options, bool disable_prefix_seek,
         IndexBlockIter* iter, GetContext* get_context,
@@ -544,6 +548,12 @@ class BlockBasedTable : public TableReader {
 
   bool TimestampMayMatch(const ReadOptions& read_options) const;
 
+  bool BlockTypeMaybeCompressed(BlockType type) const {
+    return type != BlockType::kFilter &&
+           type != BlockType::kCompressionDictionary &&
+           type != BlockType::kUserDefinedIndex;
+  }
+
   // A cumulative data block file read in MultiGet lower than this size will
   // use a stack buffer
   static constexpr size_t kMultiGetReadStackBufSize = 8192;
@@ -689,6 +699,8 @@ struct BlockBasedTable::Rep {
   std::unique_ptr<CacheReservationManager::CacheReservationHandle>
       table_reader_cache_res_handle = nullptr;
 
+  CachableEntry<Block_kUserDefinedIndex> udi_block;
+
   SequenceNumber get_global_seqno(BlockType block_type) const {
     return (block_type == BlockType::kFilterPartitionIndex ||
             block_type == BlockType::kCompressionDictionary)
diff --git a/table/block_based/block_cache.cc b/table/block_based/block_cache.cc
index f71ea5b65041..28d181db5652 100644
--- a/table/block_based/block_cache.cc
+++ b/table/block_based/block_cache.cc
@@ -46,6 +46,12 @@ void BlockCreateContext::Create(std::unique_ptr<Block_kMetaIndex>* parsed_out,
       protection_bytes_per_key);
 }
 
+void BlockCreateContext::Create(
+    std::unique_ptr<Block_kUserDefinedIndex>* parsed_out,
+    BlockContents&& block) {
+  parsed_out->reset(new Block_kUserDefinedIndex(std::move(block)));
+}
+
 void BlockCreateContext::Create(
     std::unique_ptr<ParsedFullFilterBlock>* parsed_out, BlockContents&& block) {
   parsed_out->reset(new ParsedFullFilterBlock(
diff --git a/table/block_based/block_cache.h b/table/block_based/block_cache.h
index 2827e0a8ae87..564dcf0062db 100644
--- a/table/block_based/block_cache.h
+++ b/table/block_based/block_cache.h
@@ -67,6 +67,16 @@ class Block_kMetaIndex : public Block {
   static constexpr BlockType kBlockType = BlockType::kMetaIndex;
 };
 
+class Block_kUserDefinedIndex : public BlockContents {
+ public:
+  static constexpr CacheEntryRole kCacheEntryRole = CacheEntryRole::kIndexBlock;
+  static constexpr BlockType kBlockType = BlockType::kUserDefinedIndex;
+
+  explicit Block_kUserDefinedIndex(BlockContents&& other)
+      : BlockContents(std::move(other)) {}
+  const Slice& ContentSlice() const { return data; }
+};
+
 struct BlockCreateContext : public Cache::CreateContext {
   BlockCreateContext() {}
   BlockCreateContext(const BlockBasedTableOptions* _table_options,
@@ -126,6 +136,8 @@ struct BlockCreateContext : public Cache::CreateContext {
               BlockContents&& block);
   void Create(std::unique_ptr<Block_kMetaIndex>* parsed_out,
               BlockContents&& block);
+  void Create(std::unique_ptr<Block_kUserDefinedIndex>* parsed_out,
+              BlockContents&& block);
   void Create(std::unique_ptr<ParsedFullFilterBlock>* parsed_out,
               BlockContents&& block);
   void Create(std::unique_ptr<DecompressorDict>* parsed_out,
diff --git a/table/block_based/user_defined_index_wrapper.h b/table/block_based/user_defined_index_wrapper.h
index 06bb75d03d82..4b1b02e0a172 100644
--- a/table/block_based/user_defined_index_wrapper.h
+++ b/table/block_based/user_defined_index_wrapper.h
@@ -12,7 +12,9 @@
 #include "rocksdb/slice.h"
 #include "rocksdb/status.h"
 #include "rocksdb/user_defined_index.h"
+#include "table/block_based/block_based_table_reader.h"
 #include "table/block_based/block_type.h"
+#include "table/block_based/cachable_entry.h"
 #include "table/block_based/index_builder.h"
 
 namespace ROCKSDB_NAMESPACE {
@@ -34,10 +36,6 @@ class UserDefinedIndexBuilderWrapper : public IndexBuilder {
         internal_index_builder_(std::move(internal_index_builder)),
         user_defined_index_builder_(std::move(user_defined_index_builder)) {}
 
-  // Note: We don't provide a simplified constructor that tries to extract
-  // parameters from internal_index_builder because IndexBuilder's members are
-  // protected and there are no accessor methods to get them
-
   ~UserDefinedIndexBuilderWrapper() override = default;
 
   Slice AddIndexEntry(const Slice& last_key_in_current_block,
@@ -123,4 +121,120 @@ class UserDefinedIndexBuilderWrapper : public IndexBuilder {
   std::unique_ptr<UserDefinedIndexBuilder> user_defined_index_builder_;
   Status status_;
 };
+
+class UserDefinedIndexIteratorWrapper
+    : public InternalIteratorBase<IndexValue> {
+ public:
+  explicit UserDefinedIndexIteratorWrapper(
+      std::unique_ptr<UserDefinedIndexIterator>&& udi_iter)
+      : udi_iter_(std::move(udi_iter)), valid_(false) {}
+
+  bool Valid() const override { return valid_; }
+
+  void SeekToFirst() override {
+    status_ = Status::NotSupported("SeekToFirst not supported");
+  }
+
+  void SeekToLast() override {
+    status_ = Status::NotSupported("SeekToLast not supported");
+  }
+
+  void Seek(const Slice& target) override {
+    ParsedInternalKey pkey;
+    status_ = ParseInternalKey(target, &pkey, /*log_err_key=*/false);
+    if (status_.ok()) {
+      status_ = udi_iter_->SeekAndGetResult(pkey.user_key, &result_);
+      valid_ = status_.ok() &&
+               result_.bound_check_result == IterBoundCheck::kInbound;
+    }
+  }
+
+  void Next() override {
+    status_ = udi_iter_->NextAndGetResult(&result_);
+    valid_ =
+        status_.ok() && result_.bound_check_result == IterBoundCheck::kInbound;
+  }
+
+  bool NextAndGetResult(IterateResult* result) override {
+    status_ = udi_iter_->NextAndGetResult(&result_);
+    valid_ =
+        status_.ok() && result_.bound_check_result == IterBoundCheck::kInbound;
+    if (status_.ok()) {
+      *result = result_;
+    }
+    return valid_;
+  }
+
+  void SeekForPrev(const Slice& /*target*/) override {
+    status_ = Status::NotSupported("SeekForPrev not supported");
+  }
+
+  void Prev() override { status_ = Status::NotSupported("Prev not supported"); }
+
+  Slice key() const override { return result_.key; }
+
+  IndexValue value() const override {
+    auto handle = udi_iter_->value();
+    IndexValue val(BlockHandle(handle.offset, handle.size), Slice());
+    return val;
+  }
+
+  Status status() const override { return status_; }
+
+ private:
+  std::unique_ptr<UserDefinedIndexIterator> udi_iter_;
+  IterateResult result_;
+  Status status_;
+  bool valid_;
+};
+
+class UserDefinedIndexReaderWrapper : public BlockBasedTable::IndexReader {
+ public:
+  UserDefinedIndexReaderWrapper(
+      const std::string& name,
+      std::unique_ptr<BlockBasedTable::IndexReader>&& reader,
+      std::unique_ptr<UserDefinedIndexReader>&& udi_reader)
+      : name_(name),
+        reader_(std::move(reader)),
+        udi_reader_(std::move(udi_reader)) {}
+
+  virtual InternalIteratorBase<IndexValue>* NewIterator(
+      const ReadOptions& read_options, bool disable_prefix_seek,
+      IndexBlockIter* iter, GetContext* get_context,
+      BlockCacheLookupContext* lookup_context) override {
+    if (!read_options.table_index_factory) {
+      return reader_->NewIterator(read_options, disable_prefix_seek, iter,
+                                  get_context, lookup_context);
+    }
+    if (name_ != read_options.table_index_factory->Name()) {
+      return NewErrorInternalIterator<IndexValue>(Status::InvalidArgument(
+          "Bad index name" +
+          std::string(read_options.table_index_factory->Name()) +
+          ". Only supported UDI is " + name_));
+    }
+    std::unique_ptr<UserDefinedIndexIterator> udi_iter =
+        udi_reader_->NewIterator(read_options);
+    return new UserDefinedIndexIteratorWrapper(std::move(udi_iter));
+  }
+
+  virtual Status CacheDependencies(
+      const ReadOptions& ro, bool pin,
+      FilePrefetchBuffer* tail_prefetch_buffer) override {
+    return reader_->CacheDependencies(ro, pin, tail_prefetch_buffer);
+  }
+
+  size_t ApproximateMemoryUsage() const override {
+    return reader_->ApproximateMemoryUsage();
+  }
+
+  virtual void EraseFromCacheBeforeDestruction(
+      uint32_t uncache_aggressiveness) override {
+    reader_->EraseFromCacheBeforeDestruction(uncache_aggressiveness);
+  }
+
+ private:
+  std::string name_;
+  std::unique_ptr<BlockBasedTable::IndexReader> reader_;
+  std::unique_ptr<UserDefinedIndexReader> udi_reader_;
+};
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/table/table_test.cc b/table/table_test.cc
index 357ef20dcf8b..8ea8937629ea 100644
--- a/table/table_test.cc
+++ b/table/table_test.cc
@@ -73,7 +73,7 @@
 #include "test_util/sync_point.h"
 #include "test_util/testharness.h"
 #include "test_util/testutil.h"
-#include "util/coding_lean.h"
+#include "util/coding.h"
 #include "util/compression.h"
 #include "util/file_checksum_helper.h"
 #include "util/random.h"
@@ -7483,12 +7483,121 @@ class UserDefinedIndexTest : public BlockBasedTableTestBase {
     std::string index_contents_data_;
   };
 
+  class TestUserDefinedIndexReader : public UserDefinedIndexReader {
+   public:
+    explicit TestUserDefinedIndexReader(Slice& index_block) {
+      Slice block = index_block;
+      while (!block.empty()) {
+        Slice key;
+        uint64_t offset = 0;
+        uint64_t size = 0;
+        uint32_t num_keys = 0;
+        EXPECT_TRUE(GetLengthPrefixedSlice(&block, &key));
+        EXPECT_TRUE(GetFixed64(&block, &offset));
+        EXPECT_TRUE(GetFixed64(&block, &size));
+        EXPECT_TRUE(GetFixed32(&block, &num_keys));
+
+        UserDefinedIndexBuilder::BlockHandle handle{0, 0};
+        handle.offset = offset;
+        handle.size = size;
+        index_data_[key.ToString()] =
+            std::make_pair<UserDefinedIndexBuilder::BlockHandle, uint32_t>(
+                std::move(handle), std::move(num_keys));
+      }
+    }
+
+    std::unique_ptr<UserDefinedIndexIterator> NewIterator(
+        const ReadOptions& ro) override {
+      return std::make_unique<TestUserDefinedIndexIterator>(ro, index_data_);
+    }
+
+    size_t ApproximateMemoryUsage() const override { return 0; }
+
+   private:
+    class TestUserDefinedIndexIterator : public UserDefinedIndexIterator {
+     public:
+      TestUserDefinedIndexIterator(
+          const ReadOptions& ro,
+          std::map<std::string,
+                   std::pair<UserDefinedIndexBuilder::BlockHandle, uint32_t>>&
+              index)
+          : ro_(ro), index_(index), iter_(index_.end()) {}
+
+      Status SeekAndGetResult(const Slice& key,
+                              IterateResult* result) override {
+        Status s;
+        TEST_SYNC_POINT_CALLBACK("TestUserDefinedIndexIterator::Seek", &s);
+        if (!s.ok()) {
+          return s;
+        }
+        iter_ = index_.lower_bound(key.ToString());
+        if (iter_ != index_.end()) {
+          result->bound_check_result = IterBoundCheck::kInbound;
+          result->key = Slice(iter_->first);
+        } else {
+          result->bound_check_result = IterBoundCheck::kOutOfBound;
+          result->key = Slice();
+        }
+        return Status::OK();
+      }
+
+      Status NextAndGetResult(IterateResult* result) override {
+        Status s;
+        TEST_SYNC_POINT_CALLBACK("TestUserDefinedIndexIterator::Next", &s);
+        if (!s.ok()) {
+          return s;
+        }
+        if (ro_.iterate_upper_bound) {
+          if (iter_->first.compare(ro_.iterate_upper_bound->ToString()) >= 0) {
+            result->bound_check_result = IterBoundCheck::kOutOfBound;
+            result->key = Slice();
+            return Status::OK();
+          }
+        }
+        iter_++;
+        if (iter_ != index_.end()) {
+          result->bound_check_result = IterBoundCheck::kInbound;
+          result->key = Slice(iter_->first);
+        } else {
+          // EOF
+          result->bound_check_result = IterBoundCheck::kUnknown;
+          result->key = Slice();
+        }
+        return Status::OK();
+      }
+
+      UserDefinedIndexBuilder::BlockHandle value() override {
+        UserDefinedIndexBuilder::BlockHandle handle{0, 0};
+        handle.offset = iter_->second.first.offset;
+        handle.size = iter_->second.first.size;
+        return handle;
+      }
+
+     private:
+      const ReadOptions& ro_;
+      std::map<std::string,
+               std::pair<UserDefinedIndexBuilder::BlockHandle, uint32_t>>&
+          index_;
+      std::map<std::string, std::pair<UserDefinedIndexBuilder::BlockHandle,
+                                      uint32_t>>::iterator iter_;
+    };
+
+    std::map<std::string,
+             std::pair<UserDefinedIndexBuilder::BlockHandle, uint32_t>>
+        index_data_;
+  };
+
   class TestUserDefinedIndexFactory : public UserDefinedIndexFactory {
    public:
     const char* Name() const override { return "test_index"; }
     UserDefinedIndexBuilder* NewBuilder() const override {
       return new TestUserDefinedIndexBuilder();
     }
+
+    std::unique_ptr<UserDefinedIndexReader> NewReader(
+        Slice& index_block) const override {
+      return std::make_unique<TestUserDefinedIndexReader>(index_block);
+    }
   };
 };
 
@@ -7528,8 +7637,8 @@ TEST_F(UserDefinedIndexTest, BasicTest) {
   MutableCFOptions moptions((ColumnFamilyOptions(options)));
   EnvOptions eoptions(options);
   TableReaderOptions toptions(
-      ioptions, moptions.prefix_extractor, /*compression_manager*/ nullptr,
-      eoptions, ioptions.internal_comparator,
+      ioptions, moptions.prefix_extractor,
+      /*_compression_manager=*/nullptr, eoptions, ioptions.internal_comparator,
       moptions.block_protection_bytes_per_key,
       /*skip_filters*/ false, /*immortal*/ false,
       /*force_direct_prefetch*/ false, /*level*/ -1,
@@ -7576,6 +7685,61 @@ TEST_F(UserDefinedIndexTest, BasicTest) {
   }
   ASSERT_EQ(key_count, 100);  // We added 100 keys
   ASSERT_OK(iter->status());
+  iter.reset();
+
+  ro.table_index_factory = user_defined_index_factory.get();
+  iter.reset(reader->NewIterator(ro));
+  ASSERT_NE(iter, nullptr);
+
+  // Test that we can read all the keys
+  key_count = 0;
+  for (iter->Seek("key09"); iter->Valid(); iter->Next()) {
+    key_count++;
+  }
+  ASSERT_EQ(key_count, 91);
+  ASSERT_OK(iter->status());
+
+  Slice ub("key75");
+  ro.iterate_upper_bound = &ub;
+  iter.reset(reader->NewIterator(ro));
+  ASSERT_NE(iter, nullptr);
+
+  // Test that we can read all the keys
+  key_count = 0;
+  for (iter->Seek("key09"); iter->Valid(); iter->Next()) {
+    key_count++;
+  }
+  ASSERT_EQ(key_count, 66);
+  ASSERT_OK(iter->status());
+
+  SyncPoint::GetInstance()->SetCallBack("TestUserDefinedIndexIterator::Seek",
+                                        [](void* arg) {
+                                          Status* s = static_cast<Status*>(arg);
+                                          *s = Status::IOError();
+                                        });
+  SyncPoint::GetInstance()->EnableProcessing();
+  iter.reset(reader->NewIterator(ro));
+  ASSERT_NE(iter, nullptr);
+  iter->Seek("key09");
+  ASSERT_NOK(iter->status());
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->DisableProcessing();
+
+  SyncPoint::GetInstance()->SetCallBack("TestUserDefinedIndexIterator::Next",
+                                        [](void* arg) {
+                                          Status* s = static_cast<Status*>(arg);
+                                          *s = Status::IOError();
+                                        });
+  SyncPoint::GetInstance()->EnableProcessing();
+  iter.reset(reader->NewIterator(ro));
+  ASSERT_NE(iter, nullptr);
+  iter->Seek("key09");
+  ASSERT_OK(iter->status());
+  iter->Next();
+  iter->Next();
+  ASSERT_NOK(iter->status());
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->DisableProcessing();
 }
 
 TEST_F(UserDefinedIndexTest, InvalidArgumentTest1) {

From 0788cb8a80c441cfef533b5e90c01f45a9fa9b52 Mon Sep 17 00:00:00 2001
From: Anand Ananthabhotla <anand76@meta.com>
Date: Wed, 16 Jul 2025 00:16:03 -0700
Subject: [PATCH 179/500] Add Prepare interface to user defined index iterator
 (#13728)

Summary:
Pull Request resolved: https://github.com/facebook/rocksdb/pull/13728

The Prepare interface allows the user defined index iterator to prefetch index entries, as well as take custom scan termination criteria specified in the property_bag into account.

Reviewed By: pdillinger

Differential Revision: D76165546

fbshipit-source-id: 83d628598924aa7a60dff7ed62a16ae575b2c8ec
---
 include/rocksdb/iterator.h                    |  20 +-
 include/rocksdb/user_defined_index.h          |   4 +
 .../block_based/block_based_table_iterator.h  |   4 +
 .../block_based/user_defined_index_wrapper.h  |   4 +
 table/table_test.cc                           | 544 +++++++++++++-----
 5 files changed, 409 insertions(+), 167 deletions(-)

diff --git a/include/rocksdb/iterator.h b/include/rocksdb/iterator.h
index 0a36cb2f8559..af7934e34a1b 100644
--- a/include/rocksdb/iterator.h
+++ b/include/rocksdb/iterator.h
@@ -95,14 +95,18 @@ class Iterator : public IteratorBase {
     return Slice();
   }
 
-  // RocksDB Internal - DO NOT USE
-  // Prepare the iterator to scan the ranges specified in scan_opts. The
-  // upper bound and other table specific limits may be specified. This will
-  // typically be followed by Seeks to the start keys in the order they're
-  // specified in scan_opts. If the user does a Seek to some other target key,
-  // the iterator should disregard the scan_opts from that point onwards and
-  // behave like a normal iterator. Its the user's responsibility to again
-  // call Prepare().
+  // Prepare the iterator to scan the ranges specified in scan_opts. This
+  // includes prefetching relevant blocks from disk. The upper bound and
+  // other table specific limits should be specified for each
+  // scan for best results. If an upper bound is not specified, Prepare may
+  // skip prefetching as it cannot accurately determine how much to prefetch.
+  //
+  // Prepare should typically be followed by Seeks to the start keys in the
+  // order they're specified in scan_opts. If the user does a Seek to some
+  // other target key, the iterator should disregard the scan_opts from that
+  // point onwards and behave like a normal iterator. Its the user's
+  // responsibility to again call Prepare().
+  //
   // If Prepare() is called, it overrides the iterate_upper_bound in
   // ReadOptions
   virtual void Prepare(const std::vector<ScanOptions>& /*scan_opts*/) {}
diff --git a/include/rocksdb/user_defined_index.h b/include/rocksdb/user_defined_index.h
index d399d260908c..6aabed4d1dea 100644
--- a/include/rocksdb/user_defined_index.h
+++ b/include/rocksdb/user_defined_index.h
@@ -89,6 +89,10 @@ class UserDefinedIndexIterator {
  public:
   virtual ~UserDefinedIndexIterator() = default;
 
+  // Prepare the iterator for a series of scans. The iterator should use
+  // this as an opportunity to do any prefetching and buffering of results.
+  virtual void Prepare(const ScanOptions scan_opts[], size_t num_opts) = 0;
+
   // Given the target key, position the index iterator at the index entry
   // with the smallest key >= target. The result must be updated with the
   // index key, and the bound_check_result. The bound_check_result should
diff --git a/table/block_based/block_based_table_iterator.h b/table/block_based/block_based_table_iterator.h
index d49224de4ac2..2e46d96ac40b 100644
--- a/table/block_based/block_based_table_iterator.h
+++ b/table/block_based/block_based_table_iterator.h
@@ -222,6 +222,10 @@ class BlockBasedTableIterator : public InternalIteratorBase<Slice> {
     }
   }
 
+  void Prepare(const std::vector<ScanOptions>* scan_opts) override {
+    index_iter_->Prepare(scan_opts);
+  }
+
   FilePrefetchBuffer* prefetch_buffer() {
     return block_prefetcher_.prefetch_buffer();
   }
diff --git a/table/block_based/user_defined_index_wrapper.h b/table/block_based/user_defined_index_wrapper.h
index 4b1b02e0a172..8a760a09ea9e 100644
--- a/table/block_based/user_defined_index_wrapper.h
+++ b/table/block_based/user_defined_index_wrapper.h
@@ -181,6 +181,10 @@ class UserDefinedIndexIteratorWrapper
 
   Status status() const override { return status_; }
 
+  void Prepare(const std::vector<ScanOptions>* scan_opts) override {
+    udi_iter_->Prepare(scan_opts->data(), scan_opts->size());
+  }
+
  private:
   std::unique_ptr<UserDefinedIndexIterator> udi_iter_;
   IterateResult result_;
diff --git a/table/table_test.cc b/table/table_test.cc
index 8ea8937629ea..d40e4b6ec62c 100644
--- a/table/table_test.cc
+++ b/table/table_test.cc
@@ -7407,11 +7407,11 @@ class UserDefinedIndexTest : public BlockBasedTableTestBase {
         : keys_in_current_block_(0), keys_per_block_(keys_per_block) {}
 
     bool Update(const Slice& /*key*/, const Slice& /*value*/) override {
-      keys_in_current_block_++;
       if (keys_in_current_block_ >= keys_per_block_) {
-        keys_in_current_block_ = 0;
+        keys_in_current_block_ = 1;
         return true;
       }
+      keys_in_current_block_++;
       return false;
     }
 
@@ -7433,171 +7433,229 @@ class UserDefinedIndexTest : public BlockBasedTableTestBase {
   };
 
  public:
-  class TestUserDefinedIndexBuilder : public UserDefinedIndexBuilder {
+  class TestUserDefinedIndexFactory : public UserDefinedIndexFactory {
    public:
-    TestUserDefinedIndexBuilder() : entries_added_(0), keys_added_(0) {}
-
-    Slice AddIndexEntry(const Slice& last_key_in_current_block,
-                        const Slice* first_key_in_next_block,
-                        const BlockHandle& block_handle,
-                        std::string* separator_scratch) override {
-      // Unused parameters
-      (void)first_key_in_next_block;
-      (void)separator_scratch;
-      entries_added_++;
-      // Store the block handle for each key
-      PutFixed64(&index_data_[last_key_in_current_block.ToString()],
-                 block_handle.offset);
-      PutFixed64(&index_data_[last_key_in_current_block.ToString()],
-                 block_handle.size);
-      PutFixed32(&index_data_[last_key_in_current_block.ToString()],
-                 keys_added_);
-      keys_added_ = 0;
-      return last_key_in_current_block;
-    }
-
-    void OnKeyAdded(const Slice& /*key*/, ValueType /*value*/,
-                    const Slice& /*value*/) override {
-      // Track keys added to the index
-      keys_added_++;
-    }
-
-    Status Finish(Slice* index_contents) override {
-      // Serialize the index data
-      std::string result;
-      for (const auto& entry : index_data_) {
-        PutLengthPrefixedSlice(&result, entry.first);
-        result.append(entry.second);
-      }
-      index_contents_data_ = result;
-      *index_contents = index_contents_data_;
-      return Status::OK();
+    const char* Name() const override { return "test_index"; }
+    UserDefinedIndexBuilder* NewBuilder() const override {
+      return new TestUserDefinedIndexBuilder();
     }
 
-    int GetEntriesAdded() const { return entries_added_; }
+    std::unique_ptr<UserDefinedIndexReader> NewReader(
+        Slice& index_block) const override {
+      return std::make_unique<TestUserDefinedIndexReader>(index_block, this);
+    }
+
+    uint64_t seek_error_count_ = 0;
+    uint64_t next_error_count_ = 0;
 
    private:
-    int entries_added_;
-    std::map<std::string, std::string> index_data_;
-    uint32_t keys_added_;
-    std::string index_contents_data_;
-  };
+    class TestUserDefinedIndexBuilder : public UserDefinedIndexBuilder {
+     public:
+      TestUserDefinedIndexBuilder() : entries_added_(0), keys_added_(0) {}
+
+      Slice AddIndexEntry(const Slice& last_key_in_current_block,
+                          const Slice* first_key_in_next_block,
+                          const BlockHandle& block_handle,
+                          std::string* separator_scratch) override {
+        // Unused parameters
+        (void)first_key_in_next_block;
+        (void)separator_scratch;
+        entries_added_++;
+        // Store the block handle for each key
+        PutFixed64(&index_data_[last_key_in_current_block.ToString()],
+                   block_handle.offset);
+        PutFixed64(&index_data_[last_key_in_current_block.ToString()],
+                   block_handle.size);
+        PutFixed32(&index_data_[last_key_in_current_block.ToString()],
+                   keys_added_);
+        keys_added_ = 0;
+        return last_key_in_current_block;
+      }
 
-  class TestUserDefinedIndexReader : public UserDefinedIndexReader {
-   public:
-    explicit TestUserDefinedIndexReader(Slice& index_block) {
-      Slice block = index_block;
-      while (!block.empty()) {
-        Slice key;
-        uint64_t offset = 0;
-        uint64_t size = 0;
-        uint32_t num_keys = 0;
-        EXPECT_TRUE(GetLengthPrefixedSlice(&block, &key));
-        EXPECT_TRUE(GetFixed64(&block, &offset));
-        EXPECT_TRUE(GetFixed64(&block, &size));
-        EXPECT_TRUE(GetFixed32(&block, &num_keys));
-
-        UserDefinedIndexBuilder::BlockHandle handle{0, 0};
-        handle.offset = offset;
-        handle.size = size;
-        index_data_[key.ToString()] =
-            std::make_pair<UserDefinedIndexBuilder::BlockHandle, uint32_t>(
-                std::move(handle), std::move(num_keys));
+      void OnKeyAdded(const Slice& /*key*/, ValueType /*value*/,
+                      const Slice& /*value*/) override {
+        // Track keys added to the index
+        keys_added_++;
       }
-    }
 
-    std::unique_ptr<UserDefinedIndexIterator> NewIterator(
-        const ReadOptions& ro) override {
-      return std::make_unique<TestUserDefinedIndexIterator>(ro, index_data_);
-    }
+      Status Finish(Slice* index_contents) override {
+        // Serialize the index data
+        std::string result;
+        for (const auto& entry : index_data_) {
+          PutLengthPrefixedSlice(&result, entry.first);
+          result.append(entry.second);
+        }
+        index_contents_data_ = result;
+        *index_contents = index_contents_data_;
+        return Status::OK();
+      }
 
-    size_t ApproximateMemoryUsage() const override { return 0; }
+      int GetEntriesAdded() const { return entries_added_; }
 
-   private:
-    class TestUserDefinedIndexIterator : public UserDefinedIndexIterator {
+     private:
+      int entries_added_;
+      std::map<std::string, std::string> index_data_;
+      uint32_t keys_added_;
+      std::string index_contents_data_;
+    };
+
+    class TestUserDefinedIndexReader : public UserDefinedIndexReader {
      public:
-      TestUserDefinedIndexIterator(
-          const ReadOptions& ro,
-          std::map<std::string,
-                   std::pair<UserDefinedIndexBuilder::BlockHandle, uint32_t>>&
-              index)
-          : ro_(ro), index_(index), iter_(index_.end()) {}
-
-      Status SeekAndGetResult(const Slice& key,
-                              IterateResult* result) override {
-        Status s;
-        TEST_SYNC_POINT_CALLBACK("TestUserDefinedIndexIterator::Seek", &s);
-        if (!s.ok()) {
-          return s;
+      explicit TestUserDefinedIndexReader(
+          Slice& index_block, const TestUserDefinedIndexFactory* factory)
+          : factory_(factory) {
+        Slice block = index_block;
+        while (!block.empty()) {
+          Slice key;
+          uint64_t offset = 0;
+          uint64_t size = 0;
+          uint32_t num_keys = 0;
+          EXPECT_TRUE(GetLengthPrefixedSlice(&block, &key));
+          EXPECT_TRUE(GetFixed64(&block, &offset));
+          EXPECT_TRUE(GetFixed64(&block, &size));
+          EXPECT_TRUE(GetFixed32(&block, &num_keys));
+
+          UserDefinedIndexBuilder::BlockHandle handle{0, 0};
+          handle.offset = offset;
+          handle.size = size;
+          index_data_[key.ToString()] =
+              std::make_pair<UserDefinedIndexBuilder::BlockHandle, uint32_t>(
+                  std::move(handle), std::move(num_keys));
         }
-        iter_ = index_.lower_bound(key.ToString());
-        if (iter_ != index_.end()) {
-          result->bound_check_result = IterBoundCheck::kInbound;
-          result->key = Slice(iter_->first);
-        } else {
-          result->bound_check_result = IterBoundCheck::kOutOfBound;
-          result->key = Slice();
-        }
-        return Status::OK();
       }
 
-      Status NextAndGetResult(IterateResult* result) override {
-        Status s;
-        TEST_SYNC_POINT_CALLBACK("TestUserDefinedIndexIterator::Next", &s);
-        if (!s.ok()) {
-          return s;
-        }
-        if (ro_.iterate_upper_bound) {
-          if (iter_->first.compare(ro_.iterate_upper_bound->ToString()) >= 0) {
+      std::unique_ptr<UserDefinedIndexIterator> NewIterator(
+          const ReadOptions& ro) override {
+        return std::make_unique<TestUserDefinedIndexIterator>(ro, index_data_,
+                                                              factory_);
+      }
+
+      size_t ApproximateMemoryUsage() const override { return 0; }
+
+     private:
+      class TestUserDefinedIndexIterator : public UserDefinedIndexIterator {
+       public:
+        TestUserDefinedIndexIterator(
+            const ReadOptions& ro,
+            std::map<std::string,
+                     std::pair<UserDefinedIndexBuilder::BlockHandle, uint32_t>>&
+                index,
+            const TestUserDefinedIndexFactory* factory)
+            : ro_(ro),
+              index_(index),
+              iter_(index_.end()),
+              scan_opts_(nullptr),
+              num_opts_(0),
+              target_num_keys_(0),
+              seek_error_count_(factory->seek_error_count_),
+              next_error_count_(factory->next_error_count_) {}
+
+        Status SeekAndGetResult(const Slice& key,
+                                IterateResult* result) override {
+          Status s;
+          if (seek_error_count_) {
+            seek_error_count_--;
+            s = Status::IOError();
+          }
+          if (!s.ok()) {
+            return s;
+          }
+          if (scan_opts_) {
+            if (scan_opts_[scan_idx_].range.start.value().compare(key) == 0) {
+              EXPECT_TRUE(scan_opts_[scan_idx_].property_bag.has_value());
+              target_num_keys_ = std::stoi(scan_opts_[scan_idx_]
+                                               .property_bag.value()
+                                               .find("count")
+                                               ->second);
+              scan_idx_++;
+            } else {
+              scan_opts_ = nullptr;
+            }
+          }
+          iter_ = index_.lower_bound(key.ToString());
+          if (iter_ != index_.end()) {
+            result->bound_check_result = IterBoundCheck::kInbound;
+            result->key = Slice(iter_->first);
+            if (scan_opts_ && target_num_keys_ > 0 &&
+                iter_->first.compare(key.ToString()) == 0) {
+              target_num_keys_--;
+            }
+          } else {
             result->bound_check_result = IterBoundCheck::kOutOfBound;
             result->key = Slice();
+          }
+          return Status::OK();
+        }
+
+        Status NextAndGetResult(IterateResult* result) override {
+          Status s;
+          if (next_error_count_) {
+            next_error_count_--;
+            s = Status::IOError();
+          }
+          if (!s.ok()) {
+            return s;
+          }
+          if (ro_.iterate_upper_bound) {
+            if (iter_->first.compare(ro_.iterate_upper_bound->ToString()) >=
+                0) {
+              result->bound_check_result = IterBoundCheck::kOutOfBound;
+              result->key = Slice();
+              return Status::OK();
+            }
+          }
+          if (scan_opts_ && target_num_keys_ == 0) {
+            result->key = Slice();
+            result->bound_check_result = IterBoundCheck::kOutOfBound;
             return Status::OK();
           }
+          iter_++;
+          if (iter_ != index_.end()) {
+            result->bound_check_result = IterBoundCheck::kInbound;
+            result->key = Slice(iter_->first);
+            target_num_keys_ -=
+                std::min(target_num_keys_, iter_->second.second);
+          } else {
+            // EOF
+            result->bound_check_result = IterBoundCheck::kUnknown;
+            result->key = Slice();
+          }
+          return Status::OK();
         }
-        iter_++;
-        if (iter_ != index_.end()) {
-          result->bound_check_result = IterBoundCheck::kInbound;
-          result->key = Slice(iter_->first);
-        } else {
-          // EOF
-          result->bound_check_result = IterBoundCheck::kUnknown;
-          result->key = Slice();
+
+        UserDefinedIndexBuilder::BlockHandle value() override {
+          UserDefinedIndexBuilder::BlockHandle handle{0, 0};
+          handle.offset = iter_->second.first.offset;
+          handle.size = iter_->second.first.size;
+          return handle;
         }
-        return Status::OK();
-      }
 
-      UserDefinedIndexBuilder::BlockHandle value() override {
-        UserDefinedIndexBuilder::BlockHandle handle{0, 0};
-        handle.offset = iter_->second.first.offset;
-        handle.size = iter_->second.first.size;
-        return handle;
-      }
+        void Prepare(const ScanOptions scan_opts[], size_t num_opts) override {
+          scan_opts_ = scan_opts;
+          num_opts_ = num_opts;
+          scan_idx_ = 0;
+        }
 
-     private:
-      const ReadOptions& ro_;
+       private:
+        const ReadOptions& ro_;
+        std::map<std::string,
+                 std::pair<UserDefinedIndexBuilder::BlockHandle, uint32_t>>&
+            index_;
+        std::map<std::string, std::pair<UserDefinedIndexBuilder::BlockHandle,
+                                        uint32_t>>::iterator iter_;
+        const ScanOptions* scan_opts_;
+        size_t num_opts_{};
+        size_t scan_idx_{};
+        uint32_t target_num_keys_;
+        uint64_t seek_error_count_;
+        uint64_t next_error_count_;
+      };
+
+      const TestUserDefinedIndexFactory* factory_;
       std::map<std::string,
-               std::pair<UserDefinedIndexBuilder::BlockHandle, uint32_t>>&
-          index_;
-      std::map<std::string, std::pair<UserDefinedIndexBuilder::BlockHandle,
-                                      uint32_t>>::iterator iter_;
+               std::pair<UserDefinedIndexBuilder::BlockHandle, uint32_t>>
+          index_data_;
     };
-
-    std::map<std::string,
-             std::pair<UserDefinedIndexBuilder::BlockHandle, uint32_t>>
-        index_data_;
-  };
-
-  class TestUserDefinedIndexFactory : public UserDefinedIndexFactory {
-   public:
-    const char* Name() const override { return "test_index"; }
-    UserDefinedIndexBuilder* NewBuilder() const override {
-      return new TestUserDefinedIndexBuilder();
-    }
-
-    std::unique_ptr<UserDefinedIndexReader> NewReader(
-        Slice& index_block) const override {
-      return std::make_unique<TestUserDefinedIndexReader>(index_block);
-    }
   };
 };
 
@@ -7712,34 +7770,43 @@ TEST_F(UserDefinedIndexTest, BasicTest) {
   ASSERT_EQ(key_count, 66);
   ASSERT_OK(iter->status());
 
-  SyncPoint::GetInstance()->SetCallBack("TestUserDefinedIndexIterator::Seek",
-                                        [](void* arg) {
-                                          Status* s = static_cast<Status*>(arg);
-                                          *s = Status::IOError();
-                                        });
-  SyncPoint::GetInstance()->EnableProcessing();
+  user_defined_index_factory->seek_error_count_ = 1;
   iter.reset(reader->NewIterator(ro));
   ASSERT_NE(iter, nullptr);
   iter->Seek("key09");
   ASSERT_NOK(iter->status());
-  SyncPoint::GetInstance()->ClearAllCallBacks();
-  SyncPoint::GetInstance()->DisableProcessing();
 
-  SyncPoint::GetInstance()->SetCallBack("TestUserDefinedIndexIterator::Next",
-                                        [](void* arg) {
-                                          Status* s = static_cast<Status*>(arg);
-                                          *s = Status::IOError();
-                                        });
-  SyncPoint::GetInstance()->EnableProcessing();
+  user_defined_index_factory->seek_error_count_ = 0;
+  user_defined_index_factory->next_error_count_ = 1;
   iter.reset(reader->NewIterator(ro));
   ASSERT_NE(iter, nullptr);
   iter->Seek("key09");
   ASSERT_OK(iter->status());
   iter->Next();
+  ASSERT_OK(iter->status());
+  iter->Next();
+  ASSERT_OK(iter->status());
   iter->Next();
   ASSERT_NOK(iter->status());
-  SyncPoint::GetInstance()->ClearAllCallBacks();
-  SyncPoint::GetInstance()->DisableProcessing();
+  user_defined_index_factory->next_error_count_ = 0;
+
+  ro.iterate_upper_bound = nullptr;
+  iter.reset(reader->NewIterator(ro));
+  ASSERT_NE(iter, nullptr);
+  std::vector<ScanOptions> scan_opts({ScanOptions("key20")});
+  ;
+  scan_opts[0].property_bag.emplace().emplace("count", std::to_string(25));
+  iter->Prepare(scan_opts);
+  // Test that we can read all the keys
+  key_count = 0;
+  for (iter->Seek(scan_opts[0].range.start.value()); iter->Valid();
+       iter->Next()) {
+    key_count++;
+  }
+  ASSERT_GE(key_count, 25);
+  // The index may undercount by 2 blocks
+  ASSERT_LE(key_count, 30);
+  ASSERT_OK(iter->status());
 }
 
 TEST_F(UserDefinedIndexTest, InvalidArgumentTest1) {
@@ -7798,6 +7865,165 @@ TEST_F(UserDefinedIndexTest, InvalidArgumentTest2) {
   ASSERT_EQ(writer->Finish(), Status::InvalidArgument());
   writer.reset();
 }
+
+TEST_F(UserDefinedIndexTest, IngestTest) {
+  Options options;
+  BlockBasedTableOptions table_options;
+  std::string dbname = test::PerThreadDBPath("user_defined_index_test");
+  std::string ingest_file = dbname + "test.sst";
+
+  // Set up the user-defined index factory
+  auto user_defined_index_factory =
+      std::make_shared<TestUserDefinedIndexFactory>();
+  table_options.user_defined_index_factory = user_defined_index_factory;
+
+  // Set up custom flush block policy that flushes every 3 keys
+  table_options.flush_block_policy_factory =
+      std::make_shared<CustomFlushBlockPolicyFactory>();
+
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  std::unique_ptr<SstFileWriter> writer;
+  writer.reset(new SstFileWriter(EnvOptions(), options));
+  ASSERT_OK(writer->Open(ingest_file));
+
+  // Add 100 keys instead of just 5
+  for (int i = 0; i < 100; i++) {
+    std::stringstream ss;
+    ss << std::setw(2) << std::setfill('0') << i;
+    std::string key = "key" + ss.str();
+    std::string value = "value" + ss.str();
+    ASSERT_OK(writer->Put(key, value));
+  }
+  ASSERT_OK(writer->Finish());
+  writer.reset();
+
+  std::unique_ptr<DB> db;
+  options.create_if_missing = true;
+  Status s = DB::Open(options, dbname, &db);
+  ASSERT_OK(s);
+  ASSERT_TRUE(db != nullptr);
+  ColumnFamilyHandle* cfh = nullptr;
+  ASSERT_OK(db->CreateColumnFamily(options, "new_cf", &cfh));
+
+  IngestExternalFileOptions ifo;
+  s = db->IngestExternalFile(cfh, {ingest_file}, ifo);
+  ASSERT_OK(s);
+
+  ReadOptions ro;
+  std::unique_ptr<Iterator> iter(db->NewIterator(ro, cfh));
+  ASSERT_NE(iter, nullptr);
+  ASSERT_OK(iter->status());
+
+  // Test that we can read all the keys
+  int key_count = 0;
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    key_count++;
+  }
+  ASSERT_EQ(key_count, 100);  // We added 100 keys
+  ASSERT_OK(iter->status());
+  iter.reset();
+
+  ro.table_index_factory = user_defined_index_factory.get();
+  iter.reset(db->NewIterator(ro, cfh));
+  ASSERT_NE(iter, nullptr);
+
+  // Test that we can read all the keys
+  key_count = 0;
+  for (iter->Seek("key09"); iter->Valid(); iter->Next()) {
+    key_count++;
+  }
+  ASSERT_EQ(key_count, 91);
+  ASSERT_OK(iter->status());
+
+  Slice ub("key75");
+  ro.iterate_upper_bound = &ub;
+  iter.reset(db->NewIterator(ro, cfh));
+  ASSERT_NE(iter, nullptr);
+
+  // Test that we can read all the keys
+  key_count = 0;
+  for (iter->Seek("key09"); iter->Valid(); iter->Next()) {
+    key_count++;
+  }
+  ASSERT_EQ(key_count, 66);
+  ASSERT_OK(iter->status());
+
+  ro.iterate_upper_bound = nullptr;
+  iter.reset(db->NewIterator(ro, cfh));
+  ASSERT_NE(iter, nullptr);
+  std::vector<ScanOptions> scan_opts({ScanOptions("key20")});
+  ;
+  scan_opts[0].property_bag.emplace().emplace("count", std::to_string(25));
+  iter->Prepare(scan_opts);
+  // Test that we can read all the keys
+  key_count = 0;
+  for (iter->Seek(scan_opts[0].range.start.value()); iter->Valid();
+       iter->Next()) {
+    key_count++;
+  }
+  ASSERT_GE(key_count, 25);
+  // The index may undercount by 2 blocks
+  ASSERT_LE(key_count, 30);
+  ASSERT_OK(iter->status());
+  iter.reset();
+
+  ASSERT_OK(db->DestroyColumnFamilyHandle(cfh));
+  ASSERT_OK(db->Close());
+  ASSERT_OK(DestroyDB(dbname, options));
+}
+
+// Verify that external file ingestion fails if we try to ingest an SST file
+// without the UDI and a UDI factory is configured in BlockBasedTableOptions
+TEST_F(UserDefinedIndexTest, IngestFailTest) {
+  Options options;
+  BlockBasedTableOptions table_options;
+  std::string dbname = test::PerThreadDBPath("user_defined_index_test");
+  std::string ingest_file = dbname + "test.sst";
+
+  // Set up custom flush block policy that flushes every 3 keys
+  table_options.flush_block_policy_factory =
+      std::make_shared<CustomFlushBlockPolicyFactory>();
+
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  std::unique_ptr<SstFileWriter> writer;
+  writer.reset(new SstFileWriter(EnvOptions(), options));
+  ASSERT_OK(writer->Open(ingest_file));
+
+  // Add 100 keys instead of just 5
+  for (int i = 0; i < 100; i++) {
+    std::stringstream ss;
+    ss << std::setw(2) << std::setfill('0') << i;
+    std::string key = "key" + ss.str();
+    std::string value = "value" + ss.str();
+    ASSERT_OK(writer->Put(key, value));
+  }
+  ASSERT_OK(writer->Finish());
+  writer.reset();
+
+  // Set up the user-defined index factory
+  auto user_defined_index_factory =
+      std::make_shared<TestUserDefinedIndexFactory>();
+  table_options.user_defined_index_factory = user_defined_index_factory;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  std::unique_ptr<DB> db;
+  options.create_if_missing = true;
+  Status s = DB::Open(options, dbname, &db);
+  ASSERT_OK(s);
+  ASSERT_TRUE(db != nullptr);
+  ColumnFamilyHandle* cfh = nullptr;
+  ASSERT_OK(db->CreateColumnFamily(options, "new_cf", &cfh));
+
+  IngestExternalFileOptions ifo;
+  s = db->IngestExternalFile(cfh, {ingest_file}, ifo);
+  ASSERT_NOK(s);
+
+  ASSERT_OK(db->DestroyColumnFamilyHandle(cfh));
+  ASSERT_OK(db->Close());
+  ASSERT_OK(DestroyDB(dbname, options));
+}
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {

From b09e27b207c81c5cef5c04b17b7bf093b587932f Mon Sep 17 00:00:00 2001
From: Ryan Hancock <krhancock@meta.com>
Date: Wed, 16 Jul 2025 11:42:47 -0700
Subject: [PATCH 180/500] Add MultiScan DB Bench Benchmark (#13765)

Summary:
This diff add's a DB Bench Benchmark dedicated to sequential non-overlapping sets of scans using the MultiScan API.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13765

Test Plan:
```
make release

// Setup the DB
./db_bench --db=$DB \
    --benchmarks="fillseq,compact" \
    --disable_wal=1 --key_size=$KEYSIZE \
    --value_size=$VALUESIZE --num=$NUMKEYS --threads=32

// Run the benchmark
./db_bench --use_existing_db=1 \
    --benchmarks=multiscan \
    --disable_auto_compactions=1 --seek_nexts=1000 \
    --key_size=$KEYSIZE --value_size=$VALUESIZE \
    --num=$NUMKEYS --threads=32 --duration=30
```

Reviewed By: anand1976

Differential Revision: D78129962

Pulled By: krhancoc

fbshipit-source-id: 1c524d531b62a8576374ed1377e29d59a83cedec
---
 tools/db_bench_tool.cc | 84 +++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 82 insertions(+), 2 deletions(-)

diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc
index 439df43ed93e..10c082af1f2a 100644
--- a/tools/db_bench_tool.cc
+++ b/tools/db_bench_tool.cc
@@ -129,6 +129,7 @@ DEFINE_string(
     "compact1,"
     "waitforcompaction,"
     "multireadrandom,"
+    "multiscan,"
     "mixgraph,"
     "readseq,"
     "readtorowcache,"
@@ -333,6 +334,13 @@ DEFINE_bool(use_uint64_comparator, false, "use Uint64 user comparator");
 
 DEFINE_int64(batch_size, 1, "Batch size");
 
+DEFINE_int64(multiscan_size, 10,
+             "MultiScan size - number of multiscans of size `batch_size`");
+
+DEFINE_int64(
+    multiscan_stride, 100,
+    "The amount of keys between two successive Scan operations in multiscan");
+
 static bool ValidateKeySize(const char* /*flagname*/, int32_t /*value*/) {
   return true;
 }
@@ -2190,7 +2198,8 @@ enum OperationType : unsigned char {
   kUncompress,
   kCrc,
   kHash,
-  kOthers
+  kOthers,
+  kMultiScan
 };
 
 static std::unordered_map<OperationType, std::string, std::hash<unsigned char>>
@@ -2199,7 +2208,7 @@ static std::unordered_map<OperationType, std::string, std::hash<unsigned char>>
                            {kMerge, "merge"},       {kUpdate, "update"},
                            {kCompress, "compress"}, {kCompress, "uncompress"},
                            {kCrc, "crc"},           {kHash, "hash"},
-                           {kOthers, "op"}};
+                           {kOthers, "op"},         {kMultiScan, "multiscan"}};
 
 class CombinedStats;
 class Stats {
@@ -3641,6 +3650,12 @@ class Benchmark {
         fprintf(stderr, "entries_per_batch = %" PRIi64 "\n",
                 entries_per_batch_);
         method = &Benchmark::MultiReadRandom;
+      } else if (name == "multiscan") {
+        fprintf(stderr, "multiscan_stride = %" PRIi64 "\n",
+                FLAGS_multiscan_stride);
+        fprintf(stderr, "multiscan_size = %" PRIi64 "\n", FLAGS_multiscan_size);
+        fprintf(stderr, "seek_nexts = %" PRIi32 "\n", FLAGS_seek_nexts);
+        method = &Benchmark::MultiScan;
       } else if (name == "multireadwhilewriting") {
         fprintf(stderr, "entries_per_batch = %" PRIi64 "\n",
                 entries_per_batch_);
@@ -6369,6 +6384,71 @@ class Benchmark {
     thread->stats.AddMessage(msg);
   }
 
+  void MultiScan(ThreadState* thread) {
+    const int64_t scan_size = FLAGS_seek_nexts ? FLAGS_seek_nexts : 50;
+    const int64_t readahead =
+        FLAGS_readahead_size ? FLAGS_readahead_size : 1024 * 24;
+    const int64_t multiscan_size = FLAGS_multiscan_size;
+    auto count_hist = std::make_shared<HistogramImpl>();
+    ReadOptions options = read_options_;
+
+    int64_t multiscans_done = 0;
+
+    options.async_io = true;
+    options.readahead_size = readahead;
+
+    Duration duration(FLAGS_duration, reads_);
+    while (!duration.Done(1)) {
+      DB* db = SelectDB(thread);
+      std::vector<ScanOptions> opts;
+      std::vector<std::unique_ptr<const char[]>> guards;
+      opts.reserve(multiscan_size);
+      // We create 1 random start, and then multiscan will start from that
+      // random start point And create a set of scans of `scan_size` in size
+      // with `multiscan_stride` space between each scan.
+      uint64_t range = static_cast<uint64_t>(FLAGS_num) -
+                       ((scan_size + FLAGS_multiscan_stride) * multiscan_size);
+      uint64_t start_key = thread->rand.Uniform(range);
+      for (int64_t i = 0; i < multiscan_size; i++) {
+        std::unique_ptr<const char[]> skey_guard;
+        Slice skey = AllocateKey(&skey_guard);
+        guards.push_back(std::move(skey_guard));
+        std::unique_ptr<const char[]> ekey_guard;
+        Slice ekey = AllocateKey(&ekey_guard);
+        guards.push_back(std::move(ekey_guard));
+
+        GenerateKeyFromInt(start_key, FLAGS_num, &skey);
+        uint64_t end_key = start_key + scan_size;
+        GenerateKeyFromInt(end_key, FLAGS_num, &ekey);
+
+        opts.emplace_back(skey, ekey);
+        start_key += scan_size + FLAGS_multiscan_stride;
+      }
+
+      auto iter =
+          db->NewMultiScan(read_options_, db->DefaultColumnFamily(), opts);
+      for (auto rng : *iter) {
+        size_t keys = 0;
+        for (auto it __attribute__((__unused__)) : rng) {
+          keys++;
+        }
+        assert(keys > 0);
+      }
+
+      if (thread->shared->read_rate_limiter.get() != nullptr) {
+        thread->shared->read_rate_limiter->Request(
+            1, Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kRead);
+      }
+
+      thread->stats.FinishedOps(nullptr, db, 1, kMultiScan);
+      multiscans_done += 1;
+    }
+
+    char msg[100];
+    snprintf(msg, sizeof(msg), "(multscans:%" PRIu64 ")", multiscans_done);
+    thread->stats.AddMessage(msg);
+  }
+
   void ApproximateMemtableStats(ThreadState* thread) {
     const size_t batch_size = entries_per_batch_;
     std::unique_ptr<const char[]> skey_guard;

From e46972d7a4008375dd6346a3db4cd063a0ce2c58 Mon Sep 17 00:00:00 2001
From: Ryan Hancock <krhancock@meta.com>
Date: Wed, 16 Jul 2025 11:56:35 -0700
Subject: [PATCH 181/500] Add Exit Hooks to ToolHooks (#13772)

Summary:
This diff introduces the ability to override behavior of exits, allow for users to catch exits in a try catch for example as opposed to fully exiting the process.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13772

Reviewed By: hx235

Differential Revision: D78244499

Pulled By: krhancoc

fbshipit-source-id: b403327ed5b494a22b6beeaad4083945a1def0c7
---
 include/rocksdb/tool_hooks.h |   3 +
 tools/db_bench_tool.cc       | 141 +++++++++++++++++++----------------
 2 files changed, 80 insertions(+), 64 deletions(-)

diff --git a/include/rocksdb/tool_hooks.h b/include/rocksdb/tool_hooks.h
index b31780c032f8..507c32d5e457 100644
--- a/include/rocksdb/tool_hooks.h
+++ b/include/rocksdb/tool_hooks.h
@@ -69,6 +69,7 @@ class ToolHooks {
   virtual Status Open(const Options& options,
                       const blob_db::BlobDBOptions& bdb_options,
                       const std::string& dbname, blob_db::BlobDB** blob_db) = 0;
+  virtual void Exit(int status) = 0;
 };
 
 class DefaultHooks : public ToolHooks {
@@ -117,6 +118,8 @@ class DefaultHooks : public ToolHooks {
                       const blob_db::BlobDBOptions& bdb_options,
                       const std::string& dbname,
                       blob_db::BlobDB** blob_db) override;
+
+  virtual void Exit(int status) override { exit(status); }
 };
 
 extern DefaultHooks defaultHooks;
diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc
index 10c082af1f2a..3461dc265505 100644
--- a/tools/db_bench_tool.cc
+++ b/tools/db_bench_tool.cc
@@ -1298,6 +1298,18 @@ DEFINE_uint32(memtable_op_scan_flush_trigger,
 DEFINE_bool(verify_compression, false,
             "See BlockBasedTableOptions::verify_compression");
 
+ROCKSDB_NAMESPACE::ToolHooks* hooks_ = nullptr;
+[[noreturn]] void db_bench_exit(int status) {
+  if (hooks_ == nullptr) {
+    exit(status);
+  }
+
+  hooks_->Exit(status);
+
+  // We should exit here but in case they don't we exit anyway.
+  exit(-1);
+};
+
 static enum ROCKSDB_NAMESPACE::CompressionType StringToCompressionType(
     const char* ctype) {
   assert(ctype);
@@ -1320,7 +1332,7 @@ static enum ROCKSDB_NAMESPACE::CompressionType StringToCompressionType(
     return ROCKSDB_NAMESPACE::kZSTD;
   } else {
     fprintf(stderr, "Cannot parse compression type '%s'\n", ctype);
-    exit(1);
+    db_bench_exit(1);
   }
 }
 
@@ -1340,7 +1352,7 @@ static enum ROCKSDB_NAMESPACE::TieredAdmissionPolicy StringToAdmissionPolicy(
     return ROCKSDB_NAMESPACE::kAdmPolicyAllowAll;
   } else {
     fprintf(stderr, "Cannot parse admission policy %s\n", policy);
-    exit(1);
+    db_bench_exit(1);
   }
 }
 
@@ -1879,7 +1891,7 @@ static enum DistributionType StringToDistributionType(const char* ctype) {
   }
 
   fprintf(stdout, "Cannot parse distribution type '%s'\n", ctype);
-  exit(1);
+  db_bench_exit(1);
 }
 
 class BaseDistribution {
@@ -2905,11 +2917,11 @@ class Benchmark {
       fprintf(stderr, "Running in NUMA enabled mode.\n");
 #ifndef NUMA
       fprintf(stderr, "NUMA is not defined in the system.\n");
-      exit(1);
+      db_bench_exit(1);
 #else
       if (numa_available() == -1) {
         fprintf(stderr, "NUMA is not supported by the system.\n");
-        exit(1);
+        db_bench_exit(1);
       }
 #endif
     }
@@ -3111,14 +3123,14 @@ class Benchmark {
       JemallocAllocatorOptions jemalloc_options;
       if (!NewJemallocNodumpAllocator(jemalloc_options, &allocator).ok()) {
         fprintf(stderr, "JemallocNodumpAllocator not supported.\n");
-        exit(1);
+        db_bench_exit(1);
       }
     } else if (FLAGS_use_cache_memkind_kmem_allocator) {
 #ifdef MEMKIND
       allocator = std::make_shared<MemkindKmemAllocator>();
 #else
       fprintf(stderr, "Memkind library is not linked with the binary.\n");
-      exit(1);
+      db_bench_exit(1);
 #endif
     }
 
@@ -3162,7 +3174,7 @@ class Benchmark {
             stderr,
             "Cannot specify both --secondary_cache_uri and "
             "--use_compressed_secondary_cache when using a non-tiered cache\n");
-        exit(1);
+        db_bench_exit(1);
       }
       Status s = SecondaryCache::CreateFromString(
           ConfigOptions(), FLAGS_secondary_cache_uri, &secondary_cache);
@@ -3170,7 +3182,7 @@ class Benchmark {
         fprintf(stderr,
                 "No secondary cache registered matching string: %s status=%s\n",
                 FLAGS_secondary_cache_uri.c_str(), s.ToString().c_str());
-        exit(1);
+        db_bench_exit(1);
       }
     }
 
@@ -3181,11 +3193,11 @@ class Benchmark {
       if (block_cache == nullptr) {
         fprintf(stderr, "No  cache registered matching string: %s status=%s\n",
                 FLAGS_cache_uri.c_str(), s.ToString().c_str());
-        exit(1);
+        db_bench_exit(1);
       }
     } else if (FLAGS_cache_type == "clock_cache") {
       fprintf(stderr, "Old clock cache implementation has been removed.\n");
-      exit(1);
+      db_bench_exit(1);
     } else if (EndsWith(FLAGS_cache_type, "hyper_clock_cache")) {
       size_t estimated_entry_charge;
       if (FLAGS_cache_type == "fixed_hyper_clock_cache" ||
@@ -3195,7 +3207,7 @@ class Benchmark {
         estimated_entry_charge = 0;
       } else {
         fprintf(stderr, "Cache type not supported.");
-        exit(1);
+        db_bench_exit(1);
       }
       HyperClockCacheOptions opts(FLAGS_cache_size, estimated_entry_charge,
                                   FLAGS_cache_numshardbits);
@@ -3251,12 +3263,12 @@ class Benchmark {
       }
     } else {
       fprintf(stderr, "Cache type not supported.");
-      exit(1);
+      db_bench_exit(1);
     }
 
     if (!block_cache) {
       fprintf(stderr, "Unable to allocate block cache\n");
-      exit(1);
+      db_bench_exit(1);
     }
     return block_cache;
   }
@@ -3304,7 +3316,7 @@ class Benchmark {
 
     if (FLAGS_prefix_size > FLAGS_key_size) {
       fprintf(stderr, "prefix size is larger than key size");
-      exit(1);
+      db_bench_exit(1);
     }
 
     std::vector<std::string> files;
@@ -3454,7 +3466,7 @@ class Benchmark {
     auto s = DB::OpenForReadOnly(open_options_, truth_db_name, &truth_db.db);
     if (!s.ok()) {
       fprintf(stderr, "open error: %s\n", s.ToString().c_str());
-      exit(1);
+      db_bench_exit(1);
     }
     ReadOptions ro;
     ro.total_order_seek = true;
@@ -3484,7 +3496,7 @@ class Benchmark {
 
   void ErrorExit() {
     DeleteDBs();
-    exit(1);
+    db_bench_exit(1);
   }
 
   void Run(ToolHooks& hooks) {
@@ -3720,7 +3732,7 @@ class Benchmark {
         if (FLAGS_merge_operator.empty()) {
           fprintf(stdout, "%-12s : skipped (--merge_operator is unknown)\n",
                   name.c_str());
-          exit(1);
+          db_bench_exit(1);
         }
         method = &Benchmark::MergeRandom;
       } else if (name == "randomwithverify") {
@@ -4164,7 +4176,7 @@ class Benchmark {
       thread->stats.FinishedOps(nullptr, nullptr, 1, kOthers);
     }
     if (ptr == nullptr) {
-      exit(1);  // Disable unused variable warning.
+      db_bench_exit(1);  // Disable unused variable warning.
     }
   }
 
@@ -4259,7 +4271,7 @@ class Benchmark {
       }
       fprintf(stderr, "Unable to load options file %s --- %s\n",
               FLAGS_options_file.c_str(), s.ToString().c_str());
-      exit(1);
+      db_bench_exit(1);
     }
     return false;
   }
@@ -4331,7 +4343,7 @@ class Benchmark {
       options.comparator = test::Uint64Comparator();
       if (FLAGS_key_size != 8) {
         fprintf(stderr, "Using Uint64 comparator but key size is not 8.\n");
-        exit(1);
+        db_bench_exit(1);
       }
     }
     if (FLAGS_use_stderr_info_logger) {
@@ -4365,14 +4377,14 @@ class Benchmark {
     if (!s.ok()) {
       fprintf(stderr, "Could not create memtable factory: %s\n",
               s.ToString().c_str());
-      exit(1);
+      db_bench_exit(1);
     } else if ((FLAGS_prefix_size == 0) &&
                (options.memtable_factory->IsInstanceOf("prefix_hash") ||
                 options.memtable_factory->IsInstanceOf("hash_linkedlist"))) {
       fprintf(stderr,
               "prefix_size should be non-zero if PrefixHash or "
               "HashLinkedList memtablerep is used\n");
-      exit(1);
+      db_bench_exit(1);
     }
     if (FLAGS_use_plain_table) {
       if (!options.memtable_factory->IsInstanceOf("prefix_hash") &&
@@ -4395,12 +4407,12 @@ class Benchmark {
     } else if (FLAGS_use_cuckoo_table) {
       if (FLAGS_cuckoo_hash_ratio > 1 || FLAGS_cuckoo_hash_ratio < 0) {
         fprintf(stderr, "Invalid cuckoo_hash_ratio\n");
-        exit(1);
+        db_bench_exit(1);
       }
 
       if (!FLAGS_mmap_read) {
         fprintf(stderr, "cuckoo table format requires mmap read to operate\n");
-        exit(1);
+        db_bench_exit(1);
       }
 
       ROCKSDB_NAMESPACE::CuckooTableOptions table_options;
@@ -4416,7 +4428,7 @@ class Benchmark {
         if (FLAGS_prefix_size == 0) {
           fprintf(stderr,
                   "prefix_size not assigned when enable use_hash_search \n");
-          exit(1);
+          db_bench_exit(1);
         }
         block_based_options.index_type = BlockBasedTableOptions::kHashSearch;
       } else {
@@ -4584,7 +4596,7 @@ class Benchmark {
         if (!rc_status.ok()) {
           fprintf(stderr, "Error initializing read cache, %s\n",
                   rc_status.ToString().c_str());
-          exit(1);
+          db_bench_exit(1);
         }
       }
 
@@ -4604,7 +4616,7 @@ class Benchmark {
                 stderr,
                 "Unable to create a standalone blob cache if blob_cache_size "
                 "<= 0.\n");
-            exit(1);
+            db_bench_exit(1);
           }
         }
         switch (FLAGS_prepopulate_blob_cache) {
@@ -4616,7 +4628,7 @@ class Benchmark {
             break;
           default:
             fprintf(stderr, "Unknown prepopulate blob cache mode\n");
-            exit(1);
+            db_bench_exit(1);
         }
 
         fprintf(stdout,
@@ -4644,7 +4656,7 @@ class Benchmark {
         fprintf(stderr, "Insufficient number of fanouts specified %d\n",
                 static_cast<int>(
                     FLAGS_max_bytes_for_level_multiplier_additional_v.size()));
-        exit(1);
+        db_bench_exit(1);
       }
       options.max_bytes_for_level_multiplier_additional =
           FLAGS_max_bytes_for_level_multiplier_additional_v;
@@ -4743,7 +4755,7 @@ class Benchmark {
       if (!s.ok()) {
         fprintf(stderr, "invalid merge operator[%s]: %s\n",
                 FLAGS_merge_operator.c_str(), s.ToString().c_str());
-        exit(1);
+        db_bench_exit(1);
       }
     }
     options.max_successive_merges = FLAGS_max_successive_merges;
@@ -4786,7 +4798,7 @@ class Benchmark {
     if (FLAGS_user_timestamp_size > 0) {
       if (FLAGS_user_timestamp_size != 8) {
         fprintf(stderr, "Only 64 bits timestamps are supported.\n");
-        exit(1);
+        db_bench_exit(1);
       }
       options.comparator = test::BytewiseComparatorWithU64TsWrapper();
     }
@@ -4814,12 +4826,12 @@ class Benchmark {
 
     if (FLAGS_readonly && FLAGS_transaction_db) {
       fprintf(stderr, "Cannot use readonly flag with transaction_db\n");
-      exit(1);
+      db_bench_exit(1);
     }
     if (FLAGS_use_secondary_db &&
         (FLAGS_transaction_db || FLAGS_optimistic_transaction_db)) {
       fprintf(stderr, "Cannot use use_secondary_db flag with transaction_db\n");
-      exit(1);
+      db_bench_exit(1);
     }
     options.memtable_protection_bytes_per_key =
         FLAGS_memtable_protection_bytes_per_key;
@@ -4998,7 +5010,7 @@ class Benchmark {
         }
         if (sum != 100) {
           fprintf(stderr, "column_family_distribution items must sum to 100\n");
-          exit(1);
+          db_bench_exit(1);
         }
         if (cfh_idx_to_prob.size() != num_hot) {
           fprintf(stderr,
@@ -5006,7 +5018,7 @@ class Benchmark {
                   " column_family_distribution items; expected "
                   "%" ROCKSDB_PRIszt "\n",
                   cfh_idx_to_prob.size(), num_hot);
-          exit(1);
+          db_bench_exit(1);
         }
       }
       if (FLAGS_readonly) {
@@ -5120,7 +5132,7 @@ class Benchmark {
     }
     if (!s.ok()) {
       fprintf(stderr, "open error: %s\n", s.ToString().c_str());
-      exit(1);
+      db_bench_exit(1);
     }
   }
 
@@ -5719,7 +5731,7 @@ class Benchmark {
         if (sorted_runs[i].size() < num_levels - 1) {
           fprintf(stderr, "n is too small to fill %" ROCKSDB_PRIszt " levels\n",
                   num_levels);
-          exit(1);
+          db_bench_exit(1);
         }
       }
       for (size_t i = 0; i < num_db; i++) {
@@ -5774,7 +5786,7 @@ class Benchmark {
         if (sorted_runs[i].size() < num_levels) {
           fprintf(stderr, "n is too small to fill %" ROCKSDB_PRIszt " levels\n",
                   num_levels);
-          exit(1);
+          db_bench_exit(1);
         }
       }
       for (size_t i = 0; i < num_db; i++) {
@@ -7148,7 +7160,7 @@ class Benchmark {
       thread->stats.FinishedOps(nullptr, db, entries_per_batch_, kDelete);
       if (!s.ok()) {
         fprintf(stderr, "del error: %s\n", s.ToString().c_str());
-        exit(1);
+        db_bench_exit(1);
       }
       i += entries_per_batch_;
     }
@@ -7264,7 +7276,7 @@ class Benchmark {
 
       if (!s.ok()) {
         fprintf(stderr, "put or merge error: %s\n", s.ToString().c_str());
-        exit(1);
+        db_bench_exit(1);
       }
       bytes += key.size() + val.size() + user_timestamp_size_;
       thread->stats.FinishedOps(&db_, db_.db, 1, kWrite);
@@ -7291,7 +7303,7 @@ class Benchmark {
                                &expanded_keys[offset]);
             if (!db->Delete(write_options_, expanded_keys[offset]).ok()) {
               fprintf(stderr, "delete error: %s\n", s.ToString().c_str());
-              exit(1);
+              db_bench_exit(1);
             }
           }
         } else {
@@ -7302,7 +7314,7 @@ class Benchmark {
                                begin_key, end_key)
                    .ok()) {
             fprintf(stderr, "deleterange error: %s\n", s.ToString().c_str());
-            exit(1);
+            db_bench_exit(1);
           }
         }
         thread->stats.FinishedOps(&db_, db_.db, 1, kWrite);
@@ -7536,7 +7548,7 @@ class Benchmark {
         Status s = PutMany(db, write_options_, key, gen.Generate());
         if (!s.ok()) {
           fprintf(stderr, "putmany error: %s\n", s.ToString().c_str());
-          exit(1);
+          db_bench_exit(1);
         }
         put_weight--;
         puts_done++;
@@ -7545,7 +7557,7 @@ class Benchmark {
         Status s = DeleteMany(db, write_options_, key);
         if (!s.ok()) {
           fprintf(stderr, "deletemany error: %s\n", s.ToString().c_str());
-          exit(1);
+          db_bench_exit(1);
         }
         delete_weight--;
         deletes_done++;
@@ -7689,7 +7701,7 @@ class Benchmark {
       }
       if (!s.ok()) {
         fprintf(stderr, "put error: %s\n", s.ToString().c_str());
-        exit(1);
+        db_bench_exit(1);
       }
       bytes += key.size() + val.size() + user_timestamp_size_;
       thread->stats.FinishedOps(nullptr, db, 1, kUpdate);
@@ -7736,7 +7748,7 @@ class Benchmark {
       } else if (!status.IsNotFound()) {
         fprintf(stderr, "Get returned an error: %s\n",
                 status.ToString().c_str());
-        exit(1);
+        db_bench_exit(1);
       }
 
       Slice value =
@@ -7874,7 +7886,7 @@ class Benchmark {
 
       if (!s.ok()) {
         fprintf(stderr, "merge error: %s\n", s.ToString().c_str());
-        exit(1);
+        db_bench_exit(1);
       }
       bytes += key.size() + val.size();
       thread->stats.FinishedOps(nullptr, db_with_cfh->db, 1, kMerge);
@@ -7916,7 +7928,7 @@ class Benchmark {
         Status s = db->Merge(write_options_, key, gen.Generate());
         if (!s.ok()) {
           fprintf(stderr, "merge error: %s\n", s.ToString().c_str());
-          exit(1);
+          db_bench_exit(1);
         }
         num_merges++;
         thread->stats.FinishedOps(nullptr, db, 1, kMerge);
@@ -8108,7 +8120,7 @@ class Benchmark {
     Status s = db->VerifyChecksum(ro);
     if (!s.ok()) {
       fprintf(stderr, "VerifyChecksum() failed: %s\n", s.ToString().c_str());
-      exit(1);
+      db_bench_exit(1);
     }
   }
 
@@ -8125,7 +8137,7 @@ class Benchmark {
     if (!s.ok()) {
       fprintf(stderr, "VerifyFileChecksums() failed: %s\n",
               s.ToString().c_str());
-      exit(1);
+      db_bench_exit(1);
     }
   }
 
@@ -8249,7 +8261,7 @@ class Benchmark {
       }
       if (!s.ok()) {
         fprintf(stderr, "Operation failed: %s\n", s.ToString().c_str());
-        exit(1);
+        db_bench_exit(1);
       }
     }
 
@@ -8287,7 +8299,7 @@ class Benchmark {
 
       if (!s.ok()) {
         fprintf(stderr, "Operation failed: %s\n", s.ToString().c_str());
-        exit(1);
+        db_bench_exit(1);
       }
 
       thread->stats.FinishedOps(nullptr, db, 1, kOthers);
@@ -8594,7 +8606,7 @@ class Benchmark {
 
       if (!s.ok()) {
         fprintf(stderr, "Flush failed: %s\n", s.ToString().c_str());
-        exit(1);
+        db_bench_exit(1);
       }
     } else {
       for (const auto& db_with_cfh : multi_dbs_) {
@@ -8608,7 +8620,7 @@ class Benchmark {
 
         if (!s.ok()) {
           fprintf(stderr, "Flush failed: %s\n", s.ToString().c_str());
-          exit(1);
+          db_bench_exit(1);
         }
       }
     }
@@ -8724,7 +8736,7 @@ class Benchmark {
           "Encountered an error creating a TraceReader from the trace file. "
           "Error: %s\n",
           s.ToString().c_str());
-      exit(1);
+      db_bench_exit(1);
     }
     std::unique_ptr<Replayer> replayer;
     s = db_with_cfh->db->NewDefaultReplayer(db_with_cfh->cfh,
@@ -8734,7 +8746,7 @@ class Benchmark {
               "Encountered an error creating a default Replayer. "
               "Error: %s\n",
               s.ToString().c_str());
-      exit(1);
+      db_bench_exit(1);
     }
     s = replayer->Prepare();
     if (!s.ok()) {
@@ -8800,6 +8812,7 @@ int db_bench_tool(int argc, char** argv, ToolHooks& hooks) {
   ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
   ConfigOptions config_options;
   static bool initialized = false;
+  hooks_ = &hooks;
   if (!initialized) {
     SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) +
                     " [OPTIONS]...");
@@ -8812,7 +8825,7 @@ int db_bench_tool(int argc, char** argv, ToolHooks& hooks) {
   if (FLAGS_statistics && !FLAGS_statistics_string.empty()) {
     fprintf(stderr,
             "Cannot provide both --statistics and --statistics_string.\n");
-    exit(1);
+    db_bench_exit(1);
   }
   if (!FLAGS_statistics_string.empty()) {
     Status s = Statistics::CreateFromString(config_options,
@@ -8821,7 +8834,7 @@ int db_bench_tool(int argc, char** argv, ToolHooks& hooks) {
       fprintf(stderr,
               "No Statistics registered matching string: %s status=%s\n",
               FLAGS_statistics_string.c_str(), s.ToString().c_str());
-      exit(1);
+      db_bench_exit(1);
     }
   }
   if (FLAGS_statistics) {
@@ -8860,7 +8873,7 @@ int db_bench_tool(int argc, char** argv, ToolHooks& hooks) {
   int env_opts = !FLAGS_env_uri.empty() + !FLAGS_fs_uri.empty();
   if (env_opts > 1) {
     fprintf(stderr, "Error: --env_uri and --fs_uri are mutually exclusive\n");
-    exit(1);
+    db_bench_exit(1);
   }
 
   if (env_opts == 1) {
@@ -8868,7 +8881,7 @@ int db_bench_tool(int argc, char** argv, ToolHooks& hooks) {
                                   &FLAGS_env, &env_guard);
     if (!s.ok()) {
       fprintf(stderr, "Failed creating env: %s\n", s.ToString().c_str());
-      exit(1);
+      db_bench_exit(1);
     }
   } else if (FLAGS_simulate_hdd || FLAGS_simulate_hybrid_fs_file != "") {
     //**TODO: Make the simulate fs something that can be loaded
@@ -8889,7 +8902,7 @@ int db_bench_tool(int argc, char** argv, ToolHooks& hooks) {
     std::string build_info;
     std::cout << GetRocksBuildInfoAsString(build_info, true) << std::endl;
     // Similar to --version, nothing else will be done when this flag is set
-    exit(0);
+    db_bench_exit(0);
   }
 
   if (!FLAGS_seed) {
@@ -8905,7 +8918,7 @@ int db_bench_tool(int argc, char** argv, ToolHooks& hooks) {
     fprintf(stderr,
             "`-use_existing_db` must be true for `-use_existing_keys` to be "
             "settable\n");
-    exit(1);
+    db_bench_exit(1);
   }
 
   FLAGS_value_size_distribution_type_e =
@@ -8944,7 +8957,7 @@ int db_bench_tool(int argc, char** argv, ToolHooks& hooks) {
 
   if (FLAGS_seek_missing_prefix && FLAGS_prefix_size <= 8) {
     fprintf(stderr, "prefix_size > 8 required by --seek_missing_prefix\n");
-    exit(1);
+    db_bench_exit(1);
   }
 
   ROCKSDB_NAMESPACE::Benchmark benchmark;

From 0be850a000515007593afb1e2588e4f84e9e1f72 Mon Sep 17 00:00:00 2001
From: Xingbo Wang <xbw@fb.com>
Date: Wed, 16 Jul 2025 12:18:47 -0700
Subject: [PATCH 182/500] Avoid divide by 0 in ComputeCompactionScore for FIFO
 compaction (#13767)

Summary:
When max_table_files_size was accidentally configured with 0 value, engine could crash on divide by 0 operation. Although RocksDB do configuration validation during bootstrap, it typically does not do this for runtime dynamic parameter validation. Therefore, there is a chance where max_table_files_size could be set to 0. This PR only focuses on fixing a code path where max_table_files_size ack as divisor.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13767

Test Plan: Unit test.

Reviewed By: cbi42

Differential Revision: D78420516

Pulled By: xingbowang

fbshipit-source-id: 6fdcc85b28a2c6319066665262b981e513719703
---
 db/version_set.cc | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/db/version_set.cc b/db/version_set.cc
index 66b33527ffed..ce1fff3cdf90 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -3509,8 +3509,13 @@ void VersionStorageInfo::ComputeCompactionScore(
       }
 
       if (compaction_style_ == kCompactionStyleFIFO) {
-        score = static_cast<double>(total_size) /
-                mutable_cf_options.compaction_options_fifo.max_table_files_size;
+        auto max_table_files_size =
+            mutable_cf_options.compaction_options_fifo.max_table_files_size;
+        if (max_table_files_size == 0) {
+          // avoid divide 0
+          max_table_files_size = 1;
+        }
+        score = static_cast<double>(total_size) / max_table_files_size;
         if (score < 1 &&
             mutable_cf_options.compaction_options_fifo.allow_compaction) {
           score = std::max(

From 6e4113e92dfccf417e0481a8ce467e648be1bec4 Mon Sep 17 00:00:00 2001
From: Hui Xiao <huixiao@fb.com>
Date: Wed, 16 Jul 2025 14:06:56 -0700
Subject: [PATCH 183/500] Remove reductant Compaction parameters (#13777)

Summary:
**Context/Summary:** a small refactoring to make Compaction constructor simpler (though still complicated now).

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13777

Test Plan: Existing tests

Reviewed By: jaykorean

Differential Revision: D78385166

Pulled By: hx235

fbshipit-source-id: cd93d1ba3936d9f9077ffceb0dc4ef5506e51017
---
 db/compaction/compaction.cc                  | 16 +++++------
 db/compaction/compaction.h                   |  5 ++--
 db/compaction/compaction_job_test.cc         |  3 +-
 db/compaction/compaction_picker.cc           | 14 ++++------
 db/compaction/compaction_picker_fifo.cc      | 26 +++++++-----------
 db/compaction/compaction_picker_level.cc     |  6 ++--
 db/compaction/compaction_picker_universal.cc | 29 ++++++++------------
 db/db_impl/db_impl_compaction_flush.cc       | 22 +++++++--------
 db/external_sst_file_ingestion_job.cc        |  7 ++---
 9 files changed, 56 insertions(+), 72 deletions(-)

diff --git a/db/compaction/compaction.cc b/db/compaction/compaction.cc
index e5c817a0f218..8046444ff828 100644
--- a/db/compaction/compaction.cc
+++ b/db/compaction/compaction.cc
@@ -284,9 +284,9 @@ Compaction::Compaction(
     CompressionOptions _compression_opts, Temperature _output_temperature,
     uint32_t _max_subcompactions, std::vector<FileMetaData*> _grandparents,
     std::optional<SequenceNumber> _earliest_snapshot,
-    const SnapshotChecker* _snapshot_checker, bool _manual_compaction,
-    const std::string& _trim_ts, double _score, bool _deletion_compaction,
-    bool l0_files_might_overlap, CompactionReason _compaction_reason,
+    const SnapshotChecker* _snapshot_checker,
+    CompactionReason _compaction_reason, const std::string& _trim_ts,
+    double _score, bool l0_files_might_overlap,
     BlobGarbageCollectionPolicy _blob_garbage_collection_policy,
     double _blob_garbage_collection_age_cutoff)
     : input_vstorage_(vstorage),
@@ -304,7 +304,9 @@ Compaction::Compaction(
       output_compression_(_compression),
       output_compression_opts_(_compression_opts),
       output_temperature_(_output_temperature),
-      deletion_compaction_(_deletion_compaction),
+      deletion_compaction_(_compaction_reason == CompactionReason::kFIFOTtl ||
+                           _compaction_reason ==
+                               CompactionReason::kFIFOMaxSize),
       l0_files_might_overlap_(l0_files_might_overlap),
       inputs_(PopulateWithAtomicBoundaries(vstorage, std::move(_inputs))),
       grandparents_(std::move(_grandparents)),
@@ -321,7 +323,8 @@ Compaction::Compaction(
               ? false
               : IsBottommostLevel(output_level_, vstorage, inputs_)),
       is_full_compaction_(IsFullCompaction(vstorage, inputs_)),
-      is_manual_compaction_(_manual_compaction),
+      is_manual_compaction_(_compaction_reason ==
+                            CompactionReason::kManualCompaction),
       trim_ts_(_trim_ts),
       is_trivial_move_(false),
       compaction_reason_(_compaction_reason),
@@ -349,9 +352,6 @@ Compaction::Compaction(
                                       immutable_options_, start_level_,
                                       output_level_)) {
   MarkFilesBeingCompacted(true);
-  if (is_manual_compaction_) {
-    compaction_reason_ = CompactionReason::kManualCompaction;
-  }
   if (max_subcompactions_ == 0) {
     max_subcompactions_ = _mutable_db_options.max_subcompactions;
   }
diff --git a/db/compaction/compaction.h b/db/compaction/compaction.h
index 373a8b647492..b1498a877010 100644
--- a/db/compaction/compaction.h
+++ b/db/compaction/compaction.h
@@ -94,10 +94,9 @@ class Compaction {
              std::vector<FileMetaData*> grandparents,
              std::optional<SequenceNumber> earliest_snapshot,
              const SnapshotChecker* snapshot_checker,
-             bool manual_compaction = false, const std::string& trim_ts = "",
-             double score = -1, bool deletion_compaction = false,
+             CompactionReason compaction_reason,
+             const std::string& trim_ts = "", double score = -1,
              bool l0_files_might_overlap = true,
-             CompactionReason compaction_reason = CompactionReason::kUnknown,
              BlobGarbageCollectionPolicy blob_garbage_collection_policy =
                  BlobGarbageCollectionPolicy::kUseDefault,
              double blob_garbage_collection_age_cutoff = -1);
diff --git a/db/compaction/compaction_job_test.cc b/db/compaction/compaction_job_test.cc
index d2517e6aa3ad..c8178feb1b6a 100644
--- a/db/compaction/compaction_job_test.cc
+++ b/db/compaction/compaction_job_test.cc
@@ -651,7 +651,8 @@ class CompactionJobTestBase : public testing::Test {
         mutable_cf_options_.max_compaction_bytes, 0, kNoCompression,
         cfd->GetLatestMutableCFOptions().compression_opts,
         Temperature::kUnknown, max_subcompactions, grandparents,
-        /*earliest_snapshot*/ std::nullopt, /*snapshot_checker*/ nullptr, true);
+        /*earliest_snapshot*/ std::nullopt, /*snapshot_checker*/ nullptr,
+        CompactionReason::kManualCompaction);
     compaction.FinalizeInputInfo(cfd->current());
 
     assert(db_options_.info_log);
diff --git a/db/compaction/compaction_picker.cc b/db/compaction/compaction_picker.cc
index f65556d38de6..2bdd9a9bb327 100644
--- a/db/compaction/compaction_picker.cc
+++ b/db/compaction/compaction_picker.cc
@@ -381,7 +381,7 @@ Compaction* CompactionPicker::CompactFiles(
       mutable_cf_options.default_write_temperature,
       compact_options.max_subcompactions,
       /* grandparents */ {}, /* earliest_snapshot */ std::nullopt,
-      /* snapshot_checker */ nullptr, true);
+      /* snapshot_checker */ nullptr, CompactionReason::kManualCompaction);
   RegisterCompaction(c);
   return c;
 }
@@ -680,10 +680,9 @@ Compaction* CompactionPicker::CompactRange(
         mutable_cf_options.default_write_temperature,
         compact_range_options.max_subcompactions,
         /* grandparents */ {}, /* earliest_snapshot */ std::nullopt,
-        /* snapshot_checker */ nullptr,
-        /* is manual */ true, trim_ts, /* score */ -1,
-        /* deletion_compaction */ false, /* l0_files_might_overlap */ true,
-        CompactionReason::kUnknown,
+        /* snapshot_checker */ nullptr, CompactionReason::kManualCompaction,
+        trim_ts, /* score */ -1,
+        /* l0_files_might_overlap */ true,
         compact_range_options.blob_garbage_collection_policy,
         compact_range_options.blob_garbage_collection_age_cutoff);
 
@@ -873,9 +872,8 @@ Compaction* CompactionPicker::CompactRange(
       mutable_cf_options.default_write_temperature,
       compact_range_options.max_subcompactions, std::move(grandparents),
       /* earliest_snapshot */ std::nullopt, /* snapshot_checker */ nullptr,
-      /* is manual */ true, trim_ts, /* score */ -1,
-      /* deletion_compaction */ false, /* l0_files_might_overlap */ true,
-      CompactionReason::kUnknown,
+      CompactionReason::kManualCompaction, trim_ts, /* score */ -1,
+      /* l0_files_might_overlap */ true,
       compact_range_options.blob_garbage_collection_policy,
       compact_range_options.blob_garbage_collection_age_cutoff);
 
diff --git a/db/compaction/compaction_picker_fifo.cc b/db/compaction/compaction_picker_fifo.cc
index 4f18cdda35c5..e2a241b625cf 100644
--- a/db/compaction/compaction_picker_fifo.cc
+++ b/db/compaction/compaction_picker_fifo.cc
@@ -127,11 +127,9 @@ Compaction* FIFOCompactionPicker::PickTTLCompaction(
       mutable_cf_options.compression_opts,
       mutable_cf_options.default_write_temperature,
       /* max_subcompactions */ 0, {}, /* earliest_snapshot */ std::nullopt,
-      /* snapshot_checker */ nullptr,
-      /* is manual */ false,
+      /* snapshot_checker */ nullptr, CompactionReason::kFIFOTtl,
       /* trim_ts */ "", vstorage->CompactionScore(0),
-      /* is deletion compaction */ true, /* l0_files_might_overlap */ true,
-      CompactionReason::kFIFOTtl);
+      /* l0_files_might_overlap */ true);
   return c;
 }
 
@@ -200,11 +198,10 @@ Compaction* FIFOCompactionPicker::PickSizeCompaction(
             mutable_cf_options.default_write_temperature,
             0 /* max_subcompactions */, {},
             /* earliest_snapshot */ std::nullopt,
-            /* snapshot_checker */ nullptr, /* is manual */ false,
+            /* snapshot_checker */ nullptr,
+            CompactionReason::kFIFOReduceNumFiles,
             /* trim_ts */ "", vstorage->CompactionScore(0),
-            /* is deletion compaction */ false,
-            /* l0_files_might_overlap */ true,
-            CompactionReason::kFIFOReduceNumFiles);
+            /* l0_files_might_overlap */ true);
         return c;
       }
     }
@@ -297,11 +294,9 @@ Compaction* FIFOCompactionPicker::PickSizeCompaction(
       mutable_cf_options.compression_opts,
       mutable_cf_options.default_write_temperature,
       /* max_subcompactions */ 0, {}, /* earliest_snapshot */ std::nullopt,
-      /* snapshot_checker */ nullptr,
-      /* is manual */ false,
+      /* snapshot_checker */ nullptr, CompactionReason::kFIFOMaxSize,
       /* trim_ts */ "", vstorage->CompactionScore(0),
-      /* is deletion compaction */ true,
-      /* l0_files_might_overlap */ true, CompactionReason::kFIFOMaxSize);
+      /* l0_files_might_overlap */ true);
   return c;
 }
 
@@ -416,10 +411,9 @@ Compaction* FIFOCompactionPicker::PickTemperatureChangeCompaction(
       mutable_cf_options.compression, mutable_cf_options.compression_opts,
       compaction_target_temp,
       /* max_subcompactions */ 0, {}, /* earliest_snapshot */ std::nullopt,
-      /* snapshot_checker */ nullptr,
-      /* is manual */ false, /* trim_ts */ "", vstorage->CompactionScore(0),
-      /* is deletion compaction */ false, /* l0_files_might_overlap */ true,
-      CompactionReason::kChangeTemperature);
+      /* snapshot_checker */ nullptr, CompactionReason::kChangeTemperature,
+      /* trim_ts */ "", vstorage->CompactionScore(0),
+      /* l0_files_might_overlap */ true);
   return c;
 }
 
diff --git a/db/compaction/compaction_picker_level.cc b/db/compaction/compaction_picker_level.cc
index 108c80cf3a76..3c6407da1683 100644
--- a/db/compaction/compaction_picker_level.cc
+++ b/db/compaction/compaction_picker_level.cc
@@ -145,7 +145,6 @@ class LevelCompactionBuilder {
   int parent_index_ = -1;
   int base_index_ = -1;
   double start_level_score_ = 0;
-  bool is_manual_ = false;
   bool is_l0_trivial_move_ = false;
   CompactionInputFiles start_level_inputs_;
   std::vector<CompactionInputFiles> compaction_inputs_;
@@ -561,9 +560,8 @@ Compaction* LevelCompactionBuilder::GetCompaction() {
       mutable_cf_options_.default_write_temperature,
       /* max_subcompactions */ 0, std::move(grandparents_),
       /* earliest_snapshot */ std::nullopt, /* snapshot_checker */ nullptr,
-      is_manual_,
-      /* trim_ts */ "", start_level_score_, false /* deletion_compaction */,
-      l0_files_might_overlap, compaction_reason_);
+      compaction_reason_,
+      /* trim_ts */ "", start_level_score_, l0_files_might_overlap);
 
   // If it's level 0 compaction, make sure we don't execute any other level 0
   // compactions in parallel
diff --git a/db/compaction/compaction_picker_universal.cc b/db/compaction/compaction_picker_universal.cc
index f9ba649273f7..3d76d53a0dec 100644
--- a/db/compaction/compaction_picker_universal.cc
+++ b/db/compaction/compaction_picker_universal.cc
@@ -1097,10 +1097,9 @@ Compaction* UniversalCompactionBuilder::PickCompactionToReduceSortedRuns(
                         mutable_cf_options_.default_write_temperature,
                         /* max_subcompactions */ 0, grandparents,
                         /* earliest_snapshot */ std::nullopt,
-                        /* snapshot_checker */ nullptr,
-                        /* is manual */ false, /* trim_ts */ "", score_,
-                        false /* deletion_compaction */,
-                        /* l0_files_might_overlap */ true, compaction_reason);
+                        /* snapshot_checker */ nullptr, compaction_reason,
+                        /* trim_ts */ "", score_,
+                        /* l0_files_might_overlap */ true);
 }
 
 // Look at overall size amplification. If size amplification
@@ -1447,10 +1446,9 @@ Compaction* UniversalCompactionBuilder::PickIncrementalForReduceSizeAmp(
       /* max_subcompactions */ 0, /* grandparents */ {},
       /* earliest_snapshot */ std::nullopt,
       /* snapshot_checker */ nullptr,
-      /* is manual */ false,
-      /* trim_ts */ "", score_, false /* deletion_compaction */,
-      /* l0_files_might_overlap */ true,
-      CompactionReason::kUniversalSizeAmplification);
+      CompactionReason::kUniversalSizeAmplification,
+      /* trim_ts */ "", score_,
+      /* l0_files_might_overlap */ true);
 }
 
 // Pick files marked for compaction. Typically, files are marked by
@@ -1600,11 +1598,9 @@ Compaction* UniversalCompactionBuilder::PickDeleteTriggeredCompaction() {
       GetCompressionOptions(mutable_cf_options_, vstorage_, output_level),
       mutable_cf_options_.default_write_temperature,
       /* max_subcompactions */ 0, grandparents, earliest_snapshot_,
-      snapshot_checker_,
-      /* is manual */ false,
-      /* trim_ts */ "", score_, false /* deletion_compaction */,
-      /* l0_files_might_overlap */ true,
-      CompactionReason::kFilesMarkedForCompaction);
+      snapshot_checker_, CompactionReason::kFilesMarkedForCompaction,
+      /* trim_ts */ "", score_,
+      /* l0_files_might_overlap */ true);
 }
 
 Compaction* UniversalCompactionBuilder::PickCompactionToOldest(
@@ -1700,10 +1696,9 @@ Compaction* UniversalCompactionBuilder::PickCompactionWithSortedRunRange(
       mutable_cf_options_.default_write_temperature,
       /* max_subcompactions */ 0, /* grandparents */ {},
       /* earliest_snapshot */ std::nullopt,
-      /* snapshot_checker */ nullptr,
-      /* is manual */ false,
-      /* trim_ts */ "", score_, false /* deletion_compaction */,
-      /* l0_files_might_overlap */ true, compaction_reason);
+      /* snapshot_checker */ nullptr, compaction_reason,
+      /* trim_ts */ "", score_,
+      /* l0_files_might_overlap */ true);
 }
 
 Compaction* UniversalCompactionBuilder::PickPeriodicCompaction() {
diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc
index 2e178053a9f5..94b10f2c3573 100644
--- a/db/db_impl/db_impl_compaction_flush.cc
+++ b/db/db_impl/db_impl_compaction_flush.cc
@@ -1849,10 +1849,9 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) {
         0 /* max_subcompactions, not applicable */,
         {} /* grandparents, not applicable */,
         std::nullopt /* earliest_snapshot */, nullptr /* snapshot_checker */,
-        false /* is manual */, "" /* trim_ts */, -1 /* score, not applicable */,
-        false /* is deletion compaction, not applicable */,
-        false /* l0_files_might_overlap, not applicable */,
-        CompactionReason::kRefitLevel));
+        CompactionReason::kRefitLevel, "" /* trim_ts */,
+        -1 /* score, not applicable */,
+        false /* l0_files_might_overlap, not applicable */));
     cfd->compaction_picker()->RegisterCompaction(c.get());
     TEST_SYNC_POINT("DBImpl::ReFitLevel:PostRegisterCompaction");
     VersionEdit edit;
@@ -4424,13 +4423,14 @@ Compaction* DBImpl::CreateIntendedCompactionForwardedToBottomPriorityPool(
 
   c->ReleaseCompactionFiles(Status::OK());
 
-  Compaction* intended_compaction = new Compaction(
-      vstorage, io, mo, mutable_db_options_, std::move(inputs),
-      c->output_level(), c->target_output_file_size(),
-      c->max_compaction_bytes(), c->output_path_id(), c->output_compression(),
-      c->output_compression_opts(), c->output_temperature(),
-      c->max_subcompactions(), c->grandparents(),
-      std::nullopt /* earliest_snapshot */, nullptr /* snapshot_checker */);
+  Compaction* intended_compaction =
+      new Compaction(vstorage, io, mo, mutable_db_options_, std::move(inputs),
+                     c->output_level(), c->target_output_file_size(),
+                     c->max_compaction_bytes(), c->output_path_id(),
+                     c->output_compression(), c->output_compression_opts(),
+                     c->output_temperature(), c->max_subcompactions(),
+                     c->grandparents(), std::nullopt /* earliest_snapshot */,
+                     nullptr /* snapshot_checker */, c->compaction_reason());
 
   cfd->compaction_picker()->RegisterCompaction(intended_compaction);
   vstorage->ComputeCompactionScore(io, mo);
diff --git a/db/external_sst_file_ingestion_job.cc b/db/external_sst_file_ingestion_job.cc
index e99c04300fb2..9259fdd78db3 100644
--- a/db/external_sst_file_ingestion_job.cc
+++ b/db/external_sst_file_ingestion_job.cc
@@ -701,10 +701,9 @@ void ExternalSstFileIngestionJob::CreateEquivalentFileIngestingCompactions() {
         0 /* max_subcompaction, not applicable */,
         {} /* grandparents, not applicable */,
         std::nullopt /* earliest_snapshot */, nullptr /* snapshot_checker */,
-        false /* is manual */, "" /* trim_ts */, -1 /* score, not applicable */,
-        false /* is deletion compaction, not applicable */,
-        files_overlap_ /* l0_files_might_overlap, not applicable */,
-        CompactionReason::kExternalSstIngestion));
+        CompactionReason::kExternalSstIngestion, "" /* trim_ts */,
+        -1 /* score, not applicable */,
+        files_overlap_ /* l0_files_might_overlap, not applicable */));
   }
 }
 

From 3bb3142b7e31383efe594f8cf9d2afcace4c0be5 Mon Sep 17 00:00:00 2001
From: Jay Huh <jewoongh@meta.com>
Date: Thu, 17 Jul 2025 07:56:20 -0700
Subject: [PATCH 184/500] Upgrade Maven to 3.9.11 (#13779)

Summary:
Similar to https://github.com/facebook/rocksdb/pull/13684, the link for version 3.9.10 is broken again, and we are upgrading Maven as part of the fix.

This time, we are no longer using the link from https://dlcdn.apache.org/maven/maven-3/ because they occasionally remove versions, which can break our CI at any time.

Instead, changing the link to use Apache Archive which should be stable

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13779

Test Plan:
CI

`install-maven` step is now passing - https://github.com/facebook/rocksdb/actions/runs/16328986469/job/46126398150?pr=13779

Reviewed By: krhancoc

Differential Revision: D78428965

Pulled By: jaykorean

fbshipit-source-id: 9c218f6efbd1188be7847f43be338908efffe002
---
 .github/actions/install-maven/action.yml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/actions/install-maven/action.yml b/.github/actions/install-maven/action.yml
index 0c6a68d3c3a2..815ec751f2de 100644
--- a/.github/actions/install-maven/action.yml
+++ b/.github/actions/install-maven/action.yml
@@ -4,8 +4,8 @@ runs:
   steps:
   - name: Install Maven
     run: |
-      wget --no-check-certificate https://dlcdn.apache.org/maven/maven-3/3.9.10/binaries/apache-maven-3.9.10-bin.tar.gz
-      tar zxf apache-maven-3.9.10-bin.tar.gz
-      echo "export M2_HOME=$(pwd)/apache-maven-3.9.10" >> $GITHUB_ENV
-      echo "$(pwd)/apache-maven-3.9.10/bin" >> $GITHUB_PATH
+      wget --no-check-certificate https://archive.apache.org/dist/maven/maven-3/3.9.11/binaries/apache-maven-3.9.11-bin.tar.gz
+      tar zxf apache-maven-3.9.11-bin.tar.gz
+      echo "export M2_HOME=$(pwd)/apache-maven-3.9.11" >> $GITHUB_ENV
+      echo "$(pwd)/apache-maven-3.9.11/bin" >> $GITHUB_PATH
     shell: bash

From 2850ccb96be0da6e19377576cd5a8e88f4481656 Mon Sep 17 00:00:00 2001
From: Changyu Bi <changyubi@meta.com>
Date: Thu, 17 Jul 2025 10:00:21 -0700
Subject: [PATCH 185/500] Support Prepare() in BlockBasedTableIterator For
 MultiScan (#13778)

Summary:
initial support for Prepare() to optimize the performance of MultiScan when using block-based tables. In Prepare(), we do the following:
1. Load all data blocks that will be read in multiscan to block cache
2. Pin the data blocks during the scan
3. if I/O is needed, coalesce I/Os when they are adjacent.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13778

Test Plan:
Added a new unit test.

Benchmark:
1. Set up the DB, I use FIFO here so that files will be in L0 and iterator will use BlockBasedTableIterator directly instead of LevelIterator, where Prepare() call is not implemented yet.
```
./db_bench --benchmarks="fillseq,compact" --disable_wal=1 --threads=1 --num_levels=1 --compaction_style=2 --fifo_compaction_max_table_files_size_mb=1000 --write_buffer_size=268435456
```

2. Multi-scan: based on https://github.com/facebook/rocksdb/issues/13765
```
./db_bench --db="/tmp/rocksdbtest-543376/dbbench" --use_existing_db=1 --benchmarks=multiscan --disable_auto_compactions=1 --seek_nexts=100 --threads=32 --duration=10 --statistics=1

multiscan_stride = 100
multiscan_size = 10
seek_nexts = 100

Main:
multiscan    :     449.386 micros/op 70562 ops/sec 10.359 seconds 730968 operations; (multscans:22999)
multiscan    :     453.606 micros/op 69433 ops/sec 10.369 seconds 719968 operations; (multscans:22999)
rocksdb.non.last.level.read.bytes COUNT : 47763519421
rocksdb.non.last.level.read.count COUNT : 21573878
Branch:
multiscan    :     332.670 micros/op 94698 ops/sec 10.285 seconds 973968 operations; (multscans:29999)
rocksdb.non.last.level.read.bytes COUNT : 111791308336
rocksdb.non.last.level.read.count COUNT : 1062942

With direct-IO:
./db_bench --db="/tmp/rocksdbtest-543376/dbbench" --use_existing_db=1 --benchmarks=multiscan --disable_auto_compactions=1 --seek_nexts=100 --threads=32 --duration=10 --statistics=1 --use_direct_reads=1

Main:
multiscan    :     586.045 micros/op 53825 ops/sec 10.366 seconds 557968 operations; (multscans:14999)
rocksdb.non.last.level.read.bytes COUNT : 69107458693
rocksdb.non.last.level.read.count COUNT : 6724651
Branch:
multiscan    :     386.679 micros/op 81282 ops/sec 10.359 seconds 841968 operations; (multscans:25999)
rocksdb.non.last.level.read.bytes COUNT : 96605800558
rocksdb.non.last.level.read.count COUNT : 918973
```

Throughput is 36% higher with non-direct IO and 50% higher with direct IO. The improvement is likely from doing less number of I/Os due to I/O coalescing during Prepare(), as shown in `rocksdb.non.last.level.read.count`. The total bytes read is more with this PR for the same reason.

3. Regular iterator:
```
./db_bench --use_existing_db=1 --db="/tmp/rocksdbtest-543376/dbbench" --benchmarks=seekrandom --disable_auto_compactions=1 --seek_nexts=10 --threads=32 --duration=10

Main:
seekrandom   :      13.014 micros/op 2456735 ops/sec 10.014 seconds 24602968 operations; 2717.8 MB/s (773999 of 773999 found)
Branch:
seekrandom   :      13.048 micros/op 2450554 ops/sec 10.013 seconds 24537968 operations; 2710.9 MB/s (772999 of 772999 found)
```
The result fluctuates but without noticeable regression.

Reviewed By: anand1976

Differential Revision: D78440807

Pulled By: cbi42

fbshipit-source-id: 80ac6fd222696fa65ac0b4b5441748be5ee0b979
---
 .../block_based/block_based_table_iterator.cc | 369 +++++++++++++++++-
 .../block_based/block_based_table_iterator.h  |  94 +++--
 table/block_based/block_based_table_reader.cc |  17 +-
 table/block_based/block_based_table_reader.h  |  10 +
 .../block_based_table_reader_test.cc          | 170 +++++++-
 table/format.h                                |   2 +-
 .../bbiter-multiscan.md                       |   1 +
 7 files changed, 630 insertions(+), 33 deletions(-)
 create mode 100644 unreleased_history/performance_improvements/bbiter-multiscan.md

diff --git a/table/block_based/block_based_table_iterator.cc b/table/block_based/block_based_table_iterator.cc
index 3f55f82a77a5..dc23be9128d1 100644
--- a/table/block_based/block_based_table_iterator.cc
+++ b/table/block_based/block_based_table_iterator.cc
@@ -37,6 +37,14 @@ void BlockBasedTableIterator::SeekImpl(const Slice* target,
                                        bool async_prefetch) {
   // TODO(hx235): set `seek_key_prefix_for_readahead_trimming_`
   // even when `target == nullptr` that is when `SeekToFirst()` is called
+  if (multi_scan_) {
+    if (SeekMultiScan(target)) {
+      return;
+    }
+  }
+
+  assert(!multi_scan_);
+
   if (target != nullptr && prefix_extractor_ &&
       read_options_.prefix_same_as_start) {
     const Slice& seek_user_key = ExtractUserKey(*target);
@@ -56,7 +64,7 @@ void BlockBasedTableIterator::SeekImpl(const Slice* target,
   ResetBlockCacheLookupVar();
 
   bool autotune_readaheadsize =
-      is_first_pass && read_options_.auto_readahead_size &&
+      read_options_.auto_readahead_size &&
       (read_options_.iterate_upper_bound || read_options_.prefix_same_as_start);
 
   if (autotune_readaheadsize &&
@@ -181,6 +189,7 @@ void BlockBasedTableIterator::SeekImpl(const Slice* target,
 }
 
 void BlockBasedTableIterator::SeekForPrev(const Slice& target) {
+  multi_scan_.reset();
   direction_ = IterDirection::kBackward;
   ResetBlockCacheLookupVar();
   is_out_of_bound_ = false;
@@ -255,6 +264,7 @@ void BlockBasedTableIterator::SeekForPrev(const Slice& target) {
 }
 
 void BlockBasedTableIterator::SeekToLast() {
+  multi_scan_.reset();
   direction_ = IterDirection::kBackward;
   ResetBlockCacheLookupVar();
   is_out_of_bound_ = false;
@@ -278,7 +288,9 @@ void BlockBasedTableIterator::SeekToLast() {
 }
 
 void BlockBasedTableIterator::Next() {
+  assert(Valid());
   if (is_at_first_key_from_index_ && !MaterializeCurrentBlock()) {
+    assert(!multi_scan_);
     return;
   }
   assert(block_iter_points_to_real_block_);
@@ -299,7 +311,9 @@ bool BlockBasedTableIterator::NextAndGetResult(IterateResult* result) {
 }
 
 void BlockBasedTableIterator::Prev() {
-  if (readahead_cache_lookup_ && !IsIndexAtCurr()) {
+  assert(!multi_scan_);
+  if ((readahead_cache_lookup_ && !IsIndexAtCurr()) || multi_scan_) {
+    multi_scan_.reset();
     // In case of readahead_cache_lookup_, index_iter_ has moved forward. So we
     // need to reseek the index_iter_ to point to current block by using
     // block_iter_'s key.
@@ -566,6 +580,10 @@ void BlockBasedTableIterator::FindKeyForward() {
 }
 
 void BlockBasedTableIterator::FindBlockForward() {
+  if (multi_scan_) {
+    FindBlockForwardInMultiScan();
+    return;
+  }
   // TODO the while loop inherits from two-level-iterator. We don't know
   // whether a block can be empty so it can be replaced by an "if".
   do {
@@ -901,4 +919,351 @@ void BlockBasedTableIterator::BlockCacheLookupForReadAheadSize(
   ResetPreviousBlockOffset();
 }
 
+// Note:
+// - Iterator should not be reused for multiple multiscans or mixing
+// multiscan with regular iterator usage.
+// - scan ranges should be non-overlapping, and have increasing start keys.
+// If a scan range's limit is not set, then there should only be one scan range.
+// - After Prepare(), the iterator expects Seek to be called on the start key
+// of each ScanOption in order. If any other seek is done, the optimization here
+// is aborted and fall back to vanilla iterator.
+// FIXME: DBIter and MergingIterator may
+// internally do Seek() on child iterators, e.g. due to
+// ReadOptions::max_skippable_internal_keys or reseeking into range deletion
+// end key. So these Seeks can cause iterator to fall back to normal
+// (non-prepared) iterator and ignore the optimizations done in Prepare().
+void BlockBasedTableIterator::Prepare(
+    const std::vector<ScanOptions>* scan_opts) {
+  index_iter_->Prepare(scan_opts);
+
+  assert(!multi_scan_);
+  if (multi_scan_) {
+    multi_scan_.reset();
+    return;
+  }
+  if (scan_opts == nullptr || scan_opts->empty()) {
+    return;
+  }
+  const bool has_limit = scan_opts->front().range.limit.has_value();
+  if (!has_limit && scan_opts->size() > 1) {
+    // Abort: overlapping ranges
+    return;
+  }
+
+  // Validate scan ranges to be increasing and with limit.
+  for (size_t i = 0; i < scan_opts->size(); ++i) {
+    const auto& scan_range = (*scan_opts)[i].range;
+    if (!scan_range.start.has_value()) {
+      // Abort: no start key
+      return;
+    }
+
+    // Assume for each scan range start <= limit.
+    if (scan_range.limit.has_value()) {
+      assert(user_comparator_.Compare(scan_range.start.value(),
+                                      scan_range.limit.value()) <= 0);
+    }
+
+    if (i > 0) {
+      if (!scan_range.limit.has_value()) {
+        // multiple no limit scan ranges
+        return;
+      }
+
+      const auto& last_end_key = (*scan_opts)[i - 1].range.limit.value();
+      if (user_comparator_.Compare(scan_range.start.value(), last_end_key) <
+          0) {
+        // Abort: overlapping ranges
+        return;
+      }
+    }
+  }
+
+  // Gather all relevant data block handles
+  std::vector<BlockHandle> blocks_to_prepare;
+  Status s;
+  std::vector<std::tuple<size_t, size_t>> block_ranges_per_scan;
+  for (const auto& scan_opt : *scan_opts) {
+    size_t num_blocks = 0;
+    // Current scan overlap the last block of the previous scan.
+    bool check_overlap = !blocks_to_prepare.empty();
+
+    // Scan range is specified in user key, here we seek to the minimum internal
+    // key with this user key.
+    InternalKey start_key(scan_opt.range.start.value(), kMaxSequenceNumber,
+                          kValueTypeForSeek);
+    index_iter_->Seek(start_key.Encode());
+    while (index_iter_->Valid() &&
+           (!scan_opt.range.limit.has_value() ||
+            user_comparator_.CompareWithoutTimestamp(
+                index_iter_->user_key(),
+                /*a_has_ts*/ true, *scan_opt.range.limit,
+                /*b_has_ts=*/false) <= 0)) {
+      if (check_overlap &&
+          blocks_to_prepare.back() == index_iter_->value().handle) {
+        // Skip the current block since it's already in the list
+      } else {
+        blocks_to_prepare.push_back(index_iter_->value().handle);
+      }
+      ++num_blocks;
+      index_iter_->Next();
+      check_overlap = false;
+    }
+    // Stop until index->key > limit
+    // Include the current block since it can still contain keys <= limit
+    if (index_iter_->Valid()) {
+      if (check_overlap &&
+          blocks_to_prepare.back() == index_iter_->value().handle) {
+        // Skip adding the current block since it's already in the list
+      } else {
+        blocks_to_prepare.push_back(index_iter_->value().handle);
+      }
+      ++num_blocks;
+    }
+
+    if (!index_iter_->status().ok()) {
+      // Abort: index iterator error
+      return;
+    }
+
+    block_ranges_per_scan.emplace_back(blocks_to_prepare.size() - num_blocks,
+                                       blocks_to_prepare.size());
+  }
+
+  // blocks_to_prepare has all the blocks that need to be read.
+  // Look up entries in cache and pin if exist.
+  // Store indices of blocks to read.
+  std::vector<size_t> blocks_to_read;
+  std::vector<CachableEntry<Block>> pinned_data_blocks_guard;
+  pinned_data_blocks_guard.resize(blocks_to_prepare.size());
+  for (size_t i = 0; i < blocks_to_prepare.size(); ++i) {
+    const auto& data_block_handle = blocks_to_prepare[i];
+    s = table_->LookupAndPinBlocksInCache<Block_kData>(
+        read_options_, data_block_handle,
+        &pinned_data_blocks_guard[i].As<Block_kData>());
+
+    if (!s.ok()) {
+      // Abort: block cache look up failed.
+      return;
+    }
+    if (!pinned_data_blocks_guard[i].GetValue()) {
+      // Block not in cache, will read it below.
+      blocks_to_read.emplace_back(i);
+    }
+  }
+
+  // Coalesce IOs
+  // TODO: limit prefetching size to bound memory usage.
+  if (!blocks_to_read.empty()) {
+    // Each vector correspond to blocks to read in a single read request.
+    // Each member in the vector is an index into blocks_to_prepare.
+    std::vector<std::vector<size_t>> collapsed_blocks_to_read(1);
+
+    // TODO: make this threshold configurable
+    constexpr size_t kCoalesceThreshold = 16 << 10;  // 16KB
+
+    for (const auto& block_idx : blocks_to_read) {
+      if (!collapsed_blocks_to_read.back().empty()) {
+        // Check if we can coalesce.
+        const auto& last_block =
+            blocks_to_prepare[collapsed_blocks_to_read.back().back()];
+        uint64_t last_block_end =
+            last_block.offset() +
+            BlockBasedTable::BlockSizeWithTrailer(last_block);
+        uint64_t current_start = blocks_to_prepare[block_idx].offset();
+
+        if (current_start > last_block_end + kCoalesceThreshold) {
+          // new IO
+          collapsed_blocks_to_read.emplace_back();
+        }
+      }
+      collapsed_blocks_to_read.back().emplace_back(block_idx);
+    }
+
+    // do IO
+    IOOptions io_opts;
+    s = table_->get_rep()->file->PrepareIOOptions(read_options_, io_opts);
+    if (!s.ok()) {
+      // Abort: PrepareIOOptions failed
+      return;
+    }
+
+    // Init read requests for Multi-Read
+    std::vector<FSReadRequest> read_reqs;
+    read_reqs.reserve(collapsed_blocks_to_read.size());
+    size_t total_len = 0;
+    for (const auto& blocks : collapsed_blocks_to_read) {
+      assert(blocks.size());
+      const auto& first_block = blocks_to_prepare[blocks[0]];
+      const auto& last_block = blocks_to_prepare[blocks.back()];
+
+      const auto start_offset = first_block.offset();
+      const auto end_offset = last_block.offset() +
+                              BlockBasedTable::BlockSizeWithTrailer(last_block);
+      assert(end_offset > start_offset);
+      FSReadRequest read_req;
+      read_req.offset = start_offset;
+      read_req.len = end_offset - start_offset;
+      total_len += read_req.len;
+      read_reqs.emplace_back(std::move(read_req));
+    }
+
+    // Init buffer for read
+    std::unique_ptr<char[]> buf;
+    const bool direct_io = table_->get_rep()->file->use_direct_io();
+    if (direct_io) {
+      for (auto& read_req : read_reqs) {
+        read_req.scratch = nullptr;
+      }
+    } else {
+      // TODO: optimize if FSSupportedOps::kFSBuffer is supported.
+      buf.reset(new char[total_len]);
+      size_t offset = 0;
+      for (auto& read_req : read_reqs) {
+        read_req.scratch = buf.get() + offset;
+        offset += read_req.len;
+      }
+    }
+
+    AlignedBuf aligned_buf;
+    s = table_->get_rep()->file.get()->MultiRead(
+        io_opts, read_reqs.data(), read_reqs.size(),
+        direct_io ? &aligned_buf : nullptr);
+    if (!s.ok()) {
+      return;
+    }
+    for (auto& req : read_reqs) {
+      if (!req.status.ok()) {
+        return;
+      }
+    }
+
+    // Init blocks and pin them in block cache.
+    MemoryAllocator* memory_allocator =
+        table_->get_rep()->table_options.block_cache->memory_allocator();
+    for (size_t i = 0; i < collapsed_blocks_to_read.size(); i++) {
+      const auto& blocks = collapsed_blocks_to_read[i];
+      const auto& read_req = read_reqs[i];
+      for (const auto& block_idx : blocks) {
+        const auto& block = blocks_to_prepare[block_idx];
+        const auto block_size_with_trailer =
+            BlockBasedTable::BlockSizeWithTrailer(block);
+        const auto block_offset_in_buffer = block.offset() - read_req.offset;
+
+        CacheAllocationPtr data =
+            AllocateBlock(block_size_with_trailer, memory_allocator);
+        memcpy(data.get(), read_req.result.data() + block_offset_in_buffer,
+               block_size_with_trailer);
+        BlockContents tmp_contents(std::move(data), block.size());
+
+#ifndef NDEBUG
+        tmp_contents.has_trailer =
+            table_->get_rep()->footer.GetBlockTrailerSize() > 0;
+#endif
+        assert(pinned_data_blocks_guard[block_idx].IsEmpty());
+        s = table_->CreateAndPinBlockInCache<Block_kData>(
+            read_options_, block, &tmp_contents,
+            &(pinned_data_blocks_guard[block_idx].As<Block_kData>()));
+        if (!s.ok()) {
+          // Abort: failed to create and pin block in cache
+          return;
+        }
+      }
+    }
+  }
+
+  // Successful Prepare, init related states so the iterator reads from prepared
+  // blocks
+  multi_scan_.reset(new MultiScanState(scan_opts,
+                                       std::move(pinned_data_blocks_guard),
+                                       std::move(block_ranges_per_scan)));
+  is_index_at_curr_block_ = false;
+  block_iter_points_to_real_block_ = false;
+}
+
+bool BlockBasedTableIterator::SeekMultiScan(const Slice* target) {
+  assert(multi_scan_);
+  // This is a MultiScan and Preapre() has been called.
+  //
+  // Validate seek key with scan options
+  if (multi_scan_->next_scan_idx >= multi_scan_->scan_opts->size()) {
+    multi_scan_.reset();
+  } else if (!target) {
+    // start key must be set for multi-scan
+    multi_scan_.reset();
+  } else if (user_comparator_.CompareWithoutTimestamp(
+                 ExtractUserKey(*target), /*a_has_ts=*/true,
+                 (*multi_scan_->scan_opts)[multi_scan_->next_scan_idx]
+                     .range.start.value(),
+                 /*b_has_ts=*/false) != 0) {
+    // Unexpected seek key
+    multi_scan_.reset();
+  } else {
+    auto [cur_scan_start_idx, cur_scan_end_idx] =
+        multi_scan_->block_ranges_per_scan[multi_scan_->next_scan_idx];
+    // We should have the data block already loaded
+    ++multi_scan_->next_scan_idx;
+    if (cur_scan_start_idx >= cur_scan_end_idx) {
+      is_out_of_bound_ = true;
+      assert(!Valid());
+      return true;
+    } else {
+      is_out_of_bound_ = false;
+    }
+
+    if (!block_iter_points_to_real_block_ ||
+        multi_scan_->cur_data_block_idx != cur_scan_start_idx) {
+      if (block_iter_points_to_real_block_) {
+        // Should be scan in increasing key range.
+        // All blocks before cur_data_block_idx_ are not pinned anymore.
+        assert(multi_scan_->cur_data_block_idx < cur_scan_start_idx);
+      }
+
+      ResetDataIter();
+      // Note that the block_iter_ takes ownership of the pinned data block
+      // TODO: we can delegate the clean up like with pinned_iters_mgr_ if
+      // need to pin blocks longer.
+      table_->NewDataBlockIterator<DataBlockIter>(
+          read_options_, multi_scan_->pinned_data_blocks[cur_scan_start_idx],
+          &block_iter_, Status::OK());
+    }
+    multi_scan_->cur_data_block_idx = cur_scan_start_idx;
+    block_iter_points_to_real_block_ = true;
+    block_iter_.Seek(*target);
+    FindKeyForward();
+    return true;
+  }
+
+  return false;
+}
+
+void BlockBasedTableIterator::FindBlockForwardInMultiScan() {
+  assert(multi_scan_);
+  assert(multi_scan_->next_scan_idx >= 1);
+  const auto cur_scan_end_idx = std::get<1>(
+      multi_scan_->block_ranges_per_scan[multi_scan_->next_scan_idx - 1]);
+  do {
+    if (!block_iter_.status().ok()) {
+      return;
+    }
+
+    if (multi_scan_->cur_data_block_idx + 1 >= cur_scan_end_idx) {
+      // We don't ResetDataIter() here since next scan might be reading from
+      // the same block. ResetDataIter() will free the underlying block cache
+      // handle and we don't want the block to be unpinned.
+      is_out_of_bound_ = true;
+      assert(!Valid());
+      return;
+    }
+    // Move to the next pinned data block
+    ResetDataIter();
+    ++multi_scan_->cur_data_block_idx;
+    table_->NewDataBlockIterator<DataBlockIter>(
+        read_options_,
+        multi_scan_->pinned_data_blocks[multi_scan_->cur_data_block_idx],
+        &block_iter_, Status::OK());
+    block_iter_points_to_real_block_ = true;
+    block_iter_.SeekToFirst();
+  } while (!block_iter_.Valid());
+}
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/table/block_based/block_based_table_iterator.h b/table/block_based/block_based_table_iterator.h
index 2e46d96ac40b..ccf7d8044822 100644
--- a/table/block_based/block_based_table_iterator.h
+++ b/table/block_based/block_based_table_iterator.h
@@ -41,11 +41,11 @@ class BlockBasedTableIterator : public InternalIteratorBase<Slice> {
             compaction_readahead_size,
             table_->get_rep()->table_options.initial_auto_readahead_size),
         allow_unprepared_value_(allow_unprepared_value),
-        block_iter_points_to_real_block_(false),
         check_filter_(check_filter),
         need_upper_bound_check_(need_upper_bound_check),
         async_read_in_progress_(false),
-        is_last_level_(table->IsLastLevel()) {}
+        is_last_level_(table->IsLastLevel()),
+        block_iter_points_to_real_block_(false) {}
 
   ~BlockBasedTableIterator() override { ClearBlockHandles(); }
 
@@ -69,6 +69,7 @@ class BlockBasedTableIterator : public InternalIteratorBase<Slice> {
   Slice key() const override {
     assert(Valid());
     if (is_at_first_key_from_index_) {
+      assert(!multi_scan_);
       return index_iter_->value().first_internal_key;
     } else {
       return block_iter_.key();
@@ -141,10 +142,12 @@ class BlockBasedTableIterator : public InternalIteratorBase<Slice> {
     // Prefix index set status to NotFound when the prefix does not exist.
     if (IsIndexAtCurr() && !index_iter_->status().ok() &&
         !index_iter_->status().IsNotFound()) {
+      assert(!multi_scan_);
       return index_iter_->status();
     } else if (block_iter_points_to_real_block_) {
       return block_iter_.status();
     } else if (async_read_in_progress_) {
+      assert(!multi_scan_);
       return Status::TryAgain("Async read in progress");
     } else {
       return Status::OK();
@@ -222,9 +225,7 @@ class BlockBasedTableIterator : public InternalIteratorBase<Slice> {
     }
   }
 
-  void Prepare(const std::vector<ScanOptions>* scan_opts) override {
-    index_iter_->Prepare(scan_opts);
-  }
+  void Prepare(const std::vector<ScanOptions>* scan_opts) override;
 
   FilePrefetchBuffer* prefetch_buffer() {
     return block_prefetcher_.prefetch_buffer();
@@ -312,12 +313,20 @@ class BlockBasedTableIterator : public InternalIteratorBase<Slice> {
 
   BlockPrefetcher block_prefetcher_;
 
+  // It stores all the block handles that are lookuped in cache ahead when
+  // BlockCacheLookupForReadAheadSize is called. Since index_iter_ may point to
+  // different blocks when readahead_size is calculated in
+  // BlockCacheLookupForReadAheadSize, to avoid index_iter_ reseek,
+  // block_handles_ is used.
+  // `block_handles_` is lazily constructed to save CPU when it is unused
+  std::unique_ptr<std::deque<BlockHandleInfo>> block_handles_;
+
+  // The prefix of the key called with SeekImpl().
+  // This is for readahead trimming so no data blocks containing keys of a
+  // different prefix are prefetched
+  std::string seek_key_prefix_for_readahead_trimming_ = "";
+
   const bool allow_unprepared_value_;
-  // True if block_iter_ is initialized and points to the same block
-  // as index iterator.
-  bool block_iter_points_to_real_block_;
-  // See InternalIteratorBase::IsOutOfBound().
-  bool is_out_of_bound_ = false;
   // How current data block's boundary key with the next block is compared with
   // iterate upper bound.
   BlockUpperBound block_upper_bound_check_ = BlockUpperBound::kUnknown;
@@ -337,18 +346,6 @@ class BlockBasedTableIterator : public InternalIteratorBase<Slice> {
   // size based on cache hit and miss.
   bool readahead_cache_lookup_ = false;
 
-  // It stores all the block handles that are lookuped in cache ahead when
-  // BlockCacheLookupForReadAheadSize is called. Since index_iter_ may point to
-  // different blocks when readahead_size is calculated in
-  // BlockCacheLookupForReadAheadSize, to avoid index_iter_ reseek,
-  // block_handles_ is used.
-  // `block_handles_` is lazily constructed to save CPU when it is unused
-  std::unique_ptr<std::deque<BlockHandleInfo>> block_handles_;
-
-  // During cache lookup to find readahead size, index_iter_ is iterated and it
-  // can point to a different block. is_index_at_curr_block_ keeps track of
-  // that.
-  bool is_index_at_curr_block_ = true;
   bool is_index_out_of_bound_ = false;
 
   // Used in case of auto_readahead_size to disable the block_cache lookup if
@@ -357,10 +354,48 @@ class BlockBasedTableIterator : public InternalIteratorBase<Slice> {
   // is used to disable the lookup.
   IterDirection direction_ = IterDirection::kForward;
 
-  // The prefix of the key called with SeekImpl().
-  // This is for readahead trimming so no data blocks containing keys of a
-  // different prefix are prefetched
-  std::string seek_key_prefix_for_readahead_trimming_ = "";
+  //*** BEGIN States used by both regular scan and multiscan
+
+  // True if block_iter_ is initialized and points to the same block
+  // as index iterator.
+  bool block_iter_points_to_real_block_;
+  // See InternalIteratorBase::IsOutOfBound().
+  bool is_out_of_bound_ = false;
+  // During cache lookup to find readahead size, index_iter_ is iterated and it
+  // can point to a different block.
+  // If Prepare() is called, index_iter_ is used to prefetch data blocks for the
+  // multiscan, so is_index_at_curr_block_ will be false.
+  // Whether index is expected to match the current data_block_iter_.
+  bool is_index_at_curr_block_ = true;
+
+  // *** END States used by both regular scan and multiscan
+
+  // *** BEGIN MultiScan related states ***
+  struct MultiScanState {
+    // bool prepared_ = false;
+    const std::vector<ScanOptions>* scan_opts;
+    std::vector<CachableEntry<Block>> pinned_data_blocks;
+
+    // Indicies into multiscan_pinned_data_blocks_ for data blocks that are
+    // relevant for each scan range.
+    // inclusive start, exclusive end
+    std::vector<std::tuple<size_t, size_t>> block_ranges_per_scan;
+    size_t next_scan_idx;
+    size_t cur_data_block_idx;
+
+    MultiScanState(
+        const std::vector<ScanOptions>* _scan_opts,
+        std::vector<CachableEntry<Block>>&& _pinned_data_blocks,
+        std::vector<std::tuple<size_t, size_t>>&& _block_ranges_per_scan)
+        : scan_opts(_scan_opts),
+          pinned_data_blocks(std::move(_pinned_data_blocks)),
+          block_ranges_per_scan(std::move(_block_ranges_per_scan)),
+          next_scan_idx(0),
+          cur_data_block_idx(0) {}
+  };
+
+  std::unique_ptr<MultiScanState> multi_scan_;
+  // *** END MultiScan related APIs and states ***
 
   void SeekSecondPass(const Slice* target);
 
@@ -476,5 +511,12 @@ class BlockBasedTableIterator : public InternalIteratorBase<Slice> {
                                     uint64_t& end_updated_offset,
                                     size_t& prev_handles_size);
   // *** END APIs relevant to auto tuning of readahead_size ***
+
+  // *** BEGIN APIs relevant to multiscan ***
+  // Returns true iff seek is successful.
+  bool SeekMultiScan(const Slice* target);
+
+  void FindBlockForwardInMultiScan();
+  // *** END APIs relevant to multiscan ***
 };
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc
index 685d6eb99bcc..21b56d4724ac 100644
--- a/table/block_based/block_based_table_reader.cc
+++ b/table/block_based/block_based_table_reader.cc
@@ -106,7 +106,11 @@ CacheAllocationPtr CopyBufferToHeap(MemoryAllocator* allocator, Slice& buf) {
       bool use_block_cache_for_lookup) const;                                  \
   template Status BlockBasedTable::LookupAndPinBlocksInCache<T>(               \
       const ReadOptions& ro, const BlockHandle& handle,                        \
-      CachableEntry<T>* out_parsed_block) const;
+      CachableEntry<T>* out_parsed_block) const;                               \
+  template Status BlockBasedTable::CreateAndPinBlockInCache<T>(                \
+      const ReadOptions& ro, const BlockHandle& handle,                        \
+      BlockContents* block_contents, CachableEntry<T>* out_parsed_block)       \
+      const;
 
 INSTANTIATE_BLOCKLIKE_TEMPLATES(ParsedFullFilterBlock);
 INSTANTIATE_BLOCKLIKE_TEMPLATES(DecompressorDict);
@@ -1735,6 +1739,17 @@ Status BlockBasedTable::LookupAndPinBlocksInCache(
   return s;
 }
 
+template <typename TBlocklike>
+Status BlockBasedTable::CreateAndPinBlockInCache(
+    const ReadOptions& ro, const BlockHandle& handle, BlockContents* contents,
+    CachableEntry<TBlocklike>* out_parsed_block) const {
+  return MaybeReadBlockAndLoadToCache(
+      nullptr, ro, handle, rep_->decompressor.get(),
+      /*for_compaction=*/false, out_parsed_block, nullptr, nullptr, contents,
+      /*async_read=*/false,
+      /*use_block_cache_for_lookup=*/true);
+}
+
 // If contents is nullptr, this function looks up the block caches for the
 // data block referenced by handle, and read the block from disk if necessary.
 // If contents is non-null, it skips the cache lookup and disk read, since
diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h
index 14616e7cf069..107f2b6e66e7 100644
--- a/table/block_based/block_based_table_reader.h
+++ b/table/block_based/block_based_table_reader.h
@@ -299,11 +299,21 @@ class BlockBasedTable : public TableReader {
   Status GetKVPairsFromDataBlocks(const ReadOptions& read_options,
                                   std::vector<KVPairBlock>* kv_pair_blocks);
 
+  // Look up the block cache for the specified block.
+  // out_parsed_block is set to nullptr if the block is not found in the cache.
   template <typename TBlocklike>
   Status LookupAndPinBlocksInCache(
       const ReadOptions& ro, const BlockHandle& handle,
       CachableEntry<TBlocklike>* out_parsed_block) const;
 
+  // Create the block given in `block_contents` and insert it into block cache.
+  // `out_parsed_block` points to the inserted block if successful.
+  template <typename TBlocklike>
+  Status CreateAndPinBlockInCache(
+      const ReadOptions& ro, const BlockHandle& handle,
+      BlockContents* block_contents,
+      CachableEntry<TBlocklike>* out_parsed_block) const;
+
   struct Rep;
 
   Rep* get_rep() { return rep_; }
diff --git a/table/block_based/block_based_table_reader_test.cc b/table/block_based/block_based_table_reader_test.cc
index a8c6d5d17fd3..3eec5cfbd7f8 100644
--- a/table/block_based/block_based_table_reader_test.cc
+++ b/table/block_based/block_based_table_reader_test.cc
@@ -173,7 +173,7 @@ class BlockBasedTableReaderBaseTest : public testing::Test {
         0 /* _tail_size */, user_defined_timestamps_persisted);
 
     std::unique_ptr<RandomAccessFileReader> file;
-    NewFileReader(table_name, foptions, &file);
+    NewFileReader(table_name, foptions, &file, ioptions.statistics.get());
 
     uint64_t file_size = 0;
     ASSERT_OK(env_->GetFileSize(Path(table_name), &file_size));
@@ -222,12 +222,15 @@ class BlockBasedTableReaderBaseTest : public testing::Test {
   }
 
   void NewFileReader(const std::string& filename, const FileOptions& opt,
-                     std::unique_ptr<RandomAccessFileReader>* reader) {
+                     std::unique_ptr<RandomAccessFileReader>* reader,
+                     Statistics* stats = nullptr) {
     std::string path = Path(filename);
     std::unique_ptr<FSRandomAccessFile> f;
     ASSERT_OK(fs_->NewRandomAccessFile(path, opt, &f, nullptr));
     reader->reset(new RandomAccessFileReader(std::move(f), path,
-                                             env_->GetSystemClock().get()));
+                                             env_->GetSystemClock().get(),
+                                             /*io_tracer=*/nullptr,
+                                             /*stats=*/stats));
   }
 };
 
@@ -990,6 +993,167 @@ TEST_P(BlockBasedTableReaderTestVerifyChecksum, ChecksumMismatch) {
   ASSERT_EQ(s.code(), Status::kCorruption);
 }
 
+TEST_P(BlockBasedTableReaderTest, MultiScanPrepare) {
+  Options options;
+  options.statistics = CreateDBStatistics();
+  ReadOptions read_opts;
+  size_t ts_sz = options.comparator->timestamp_size();
+  std::vector<std::pair<std::string, std::string>> kv =
+      BlockBasedTableReaderBaseTest::GenerateKVMap(
+          100 /* num_block */,
+          true /* mixed_with_human_readable_string_value */, ts_sz);
+
+  std::string table_name = "BlockBasedTableReaderTest_NewIterator" +
+                           CompressionTypeToString(compression_type_);
+
+  ImmutableOptions ioptions(options);
+  CreateTable(table_name, ioptions, compression_type_, kv,
+              compression_parallel_threads_, compression_dict_bytes_);
+
+  std::unique_ptr<BlockBasedTable> table;
+  FileOptions foptions;
+  foptions.use_direct_reads = true;
+  InternalKeyComparator comparator(options.comparator);
+  NewBlockBasedTableReader(foptions, ioptions, comparator, table_name, &table,
+                           true /* bool prefetch_index_and_filter_in_cache */,
+                           nullptr /* status */, persist_udt_);
+
+  std::unique_ptr<InternalIterator> iter;
+  iter.reset(table->NewIterator(
+      read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
+      /*skip_filters=*/false, TableReaderCaller::kUncategorized));
+
+  // Should coalesce into a single I/O
+  std::vector<ScanOptions> scan_options(
+      {ScanOptions(ExtractUserKey(kv[0].first),
+                   ExtractUserKey(kv[kEntriesPerBlock].first)),
+       ScanOptions(ExtractUserKey(kv[2 * kEntriesPerBlock].first),
+                   ExtractUserKey(kv[3 * kEntriesPerBlock].first))});
+
+  auto read_count_before =
+      options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
+  iter->Prepare(&scan_options);
+  auto read_count_after =
+      options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
+  ASSERT_EQ(read_count_before + 1, read_count_after);
+  iter->Seek(kv[0].first);
+  for (size_t i = 0; i < kEntriesPerBlock + 1; ++i) {
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().ToString(), kv[i].first);
+    iter->Next();
+  }
+  // Iter may still be valid after scan range. Upper layer (DBIter) handles
+  // exact upper bound checking. So we don't check !iter->Valid() here.
+  ASSERT_OK(iter->status());
+  iter->Seek(kv[2 * kEntriesPerBlock].first);
+  for (size_t i = 2 * kEntriesPerBlock; i < 3 * kEntriesPerBlock; ++i) {
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().ToString(), kv[i].first);
+    iter->Next();
+  }
+  ASSERT_OK(iter->status());
+
+  iter.reset(table->NewIterator(
+      read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
+      /*skip_filters=*/false, TableReaderCaller::kUncategorized));
+  // No IO coalesce, should do MultiRead with 2 read requests.
+  scan_options = {ScanOptions(ExtractUserKey(kv[70 * kEntriesPerBlock].first),
+                              ExtractUserKey(kv[75 * kEntriesPerBlock].first)),
+                  ScanOptions(ExtractUserKey(kv[90 * kEntriesPerBlock].first),
+                              ExtractUserKey(kv[95 * kEntriesPerBlock].first))};
+  read_count_before =
+      options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
+  iter->Prepare(&scan_options);
+  read_count_after =
+      options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
+  ASSERT_EQ(read_count_before + 2, read_count_after);
+
+  iter->Seek(kv[70 * kEntriesPerBlock].first);
+  for (size_t i = 70 * kEntriesPerBlock; i < 75 * kEntriesPerBlock; ++i) {
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().ToString(), kv[i].first);
+    iter->Next();
+  }
+  ASSERT_OK(iter->status());
+  iter->Seek(kv[90 * kEntriesPerBlock].first);
+  for (size_t i = 90 * kEntriesPerBlock; i < 95 * kEntriesPerBlock; ++i) {
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().ToString(), kv[i].first);
+    iter->Next();
+  }
+  ASSERT_OK(iter->status());
+
+  iter.reset(table->NewIterator(
+      read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
+      /*skip_filters=*/false, TableReaderCaller::kUncategorized));
+  // Should do two I/Os since blocks 80-81 and 90-95 are already in block cache,
+  // reads from blocks 50-79 and 82-.. are co
+  scan_options = {ScanOptions(ExtractUserKey(kv[50 * kEntriesPerBlock].first))};
+  read_count_before =
+      options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
+  iter->Prepare(&scan_options);
+  read_count_after =
+      options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
+  ASSERT_EQ(read_count_before + 3, read_count_after);
+  iter->Seek(kv[50 * kEntriesPerBlock].first);
+  for (size_t i = 50 * kEntriesPerBlock; i < 100 * kEntriesPerBlock; ++i) {
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().ToString(), kv[i].first);
+    iter->Next();
+  }
+  ASSERT_FALSE(iter->Valid());
+  ASSERT_OK(iter->status());
+
+  // Check cases when Seek key does not match start key in ScanOptions
+  iter.reset(table->NewIterator(
+      read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
+      /*skip_filters=*/false, TableReaderCaller::kUncategorized));
+  scan_options = {ScanOptions(ExtractUserKey(kv[10 * kEntriesPerBlock].first),
+                              ExtractUserKey(kv[20 * kEntriesPerBlock].first)),
+                  ScanOptions(ExtractUserKey(kv[30 * kEntriesPerBlock].first),
+                              ExtractUserKey(kv[40 * kEntriesPerBlock].first))};
+  iter->Prepare(&scan_options);
+  // Match start key
+  iter->Seek(kv[10 * kEntriesPerBlock].first);
+  for (size_t i = 10 * kEntriesPerBlock; i < 20 * kEntriesPerBlock; ++i) {
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().ToString(), kv[i].first);
+    iter->Next();
+  }
+  ASSERT_OK(iter->status());
+  // Does not match start key of the second ScanOptions.
+  iter->Seek(kv[50 * kEntriesPerBlock + 1].first);
+  for (size_t i = 50 * kEntriesPerBlock + 1; i < 100 * kEntriesPerBlock; ++i) {
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().ToString(), kv[i].first);
+    iter->Next();
+  }
+  ASSERT_FALSE(iter->Valid());
+  ASSERT_OK(iter->status());
+
+  iter.reset(table->NewIterator(
+      read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
+      /*skip_filters=*/false, TableReaderCaller::kUncategorized));
+  scan_options = {ScanOptions(ExtractUserKey(kv[10 * kEntriesPerBlock].first)),
+                  ScanOptions(ExtractUserKey(kv[11 * kEntriesPerBlock].first))};
+  iter->Prepare(&scan_options);
+  // Does not match the first ScanOptions.
+  iter->SeekToFirst();
+  for (size_t i = 0; i < kEntriesPerBlock; ++i) {
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().ToString(), kv[i].first);
+    iter->Next();
+  }
+  ASSERT_OK(iter->status());
+  iter->Seek(kv[10 * kEntriesPerBlock].first);
+  for (size_t i = 10 * kEntriesPerBlock; i < 12 * kEntriesPerBlock; ++i) {
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().ToString(), kv[i].first);
+    iter->Next();
+  }
+  ASSERT_OK(iter->status());
+}
+
 // Param 1: compression type
 // Param 2: whether to use direct reads
 // Param 3: Block Based Table Index type, partitioned filters are also enabled
diff --git a/table/format.h b/table/format.h
index 0e914a4d9530..c8e1c86e4141 100644
--- a/table/format.h
+++ b/table/format.h
@@ -55,7 +55,7 @@ class BlockHandle {
   uint64_t offset() const { return offset_; }
   void set_offset(uint64_t _offset) { offset_ = _offset; }
 
-  // The size of the stored block
+  // The size of the stored block, this size does not include the block trailer.
   uint64_t size() const { return size_; }
   void set_size(uint64_t _size) { size_ = _size; }
 
diff --git a/unreleased_history/performance_improvements/bbiter-multiscan.md b/unreleased_history/performance_improvements/bbiter-multiscan.md
new file mode 100644
index 000000000000..7a01d9892868
--- /dev/null
+++ b/unreleased_history/performance_improvements/bbiter-multiscan.md
@@ -0,0 +1 @@
+* Optimized MultiScan using BlockBasedTable to coalesce I/Os and prefetch all data blocks.

From 9967c3255d8a6260fb02e532e259d5b6a6c9d152 Mon Sep 17 00:00:00 2001
From: Zaidoon Abd Al Hadi <43054535+zaidoon1@users.noreply.github.com>
Date: Fri, 18 Jul 2025 11:08:03 -0700
Subject: [PATCH 186/500] expose flush reason for flush job info as well as
 compaction reason for sub compaction job info via c api (#13770)

Summary:
follow up to https://github.com/facebook/rocksdb/pull/13601

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13770

Reviewed By: hx235

Differential Revision: D78426229

Pulled By: cbi42

fbshipit-source-id: d583288b87f9ab0d05421b3daeb57e297edf5ad6
---
 db/c.cc             | 13 +++++++++++++
 include/rocksdb/c.h |  5 +++++
 2 files changed, 18 insertions(+)

diff --git a/db/c.cc b/db/c.cc
index c7221f2fd118..85ce472fcd4a 100644
--- a/db/c.cc
+++ b/db/c.cc
@@ -925,6 +925,10 @@ void rocksdb_backup_engine_options_destroy(
   delete options;
 }
 
+void rocksdb_status_ptr_get_error(rocksdb_status_ptr_t* status, char** errptr) {
+  SaveError(errptr, *(status->rep));
+}
+
 rocksdb_checkpoint_t* rocksdb_checkpoint_object_create(rocksdb_t* db,
                                                        char** errptr) {
   Checkpoint* checkpoint;
@@ -3061,6 +3065,10 @@ uint64_t rocksdb_flushjobinfo_smallest_seqno(
   return info->rep.smallest_seqno;
 }
 
+uint32_t rocksdb_flushjobinfo_flush_reason(const rocksdb_flushjobinfo_t* info) {
+  return static_cast<uint32_t>(info->rep.flush_reason);
+}
+
 void rocksdb_reset_status(rocksdb_status_ptr_t* status_ptr) {
   auto ptr = status_ptr->rep;
   *ptr = Status::OK();
@@ -3192,6 +3200,11 @@ int rocksdb_subcompactionjobinfo_output_level(
   return info->rep.output_level;
 }
 
+uint32_t rocksdb_subcompactionjobinfo_compaction_reason(
+    const rocksdb_subcompactionjobinfo_t* info) {
+  return static_cast<uint32_t>(info->rep.compaction_reason);
+}
+
 /* ExternalFileIngestionInfo */
 
 const char* rocksdb_externalfileingestioninfo_cf_name(
diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h
index 92a47b25b212..048609bfd38f 100644
--- a/include/rocksdb/c.h
+++ b/include/rocksdb/c.h
@@ -1171,6 +1171,8 @@ extern ROCKSDB_LIBRARY_API uint64_t
 rocksdb_flushjobinfo_smallest_seqno(const rocksdb_flushjobinfo_t*);
 extern ROCKSDB_LIBRARY_API void rocksdb_reset_status(
     rocksdb_status_ptr_t* status_ptr);
+extern ROCKSDB_LIBRARY_API uint32_t
+rocksdb_flushjobinfo_flush_reason(const rocksdb_flushjobinfo_t* info);
 
 /* Compaction job info */
 extern ROCKSDB_LIBRARY_API void rocksdb_compactionjobinfo_status(
@@ -1221,6 +1223,9 @@ extern ROCKSDB_LIBRARY_API int rocksdb_subcompactionjobinfo_base_input_level(
     const rocksdb_subcompactionjobinfo_t*);
 extern ROCKSDB_LIBRARY_API int rocksdb_subcompactionjobinfo_output_level(
     const rocksdb_subcompactionjobinfo_t*);
+extern ROCKSDB_LIBRARY_API uint32_t
+rocksdb_subcompactionjobinfo_compaction_reason(
+    const rocksdb_subcompactionjobinfo_t* info);
 
 /* External file ingestion info */
 extern ROCKSDB_LIBRARY_API const char*

From 551ba21e9b9b6f9ef396859b9100b62dd76eb509 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Fri, 18 Jul 2025 14:22:29 -0700
Subject: [PATCH 187/500] Support recompress-with-CompressionManager in
 sst_dump (#13783)

Summary:
So that we can use --command=recompress with a custom CompressionManager. (It's not required for reading files using a custom CompressionManager because those can already use ObjectLibrary for dependency injection.)

Suggested follow-up:
* These tests should not be using C arrays, snprintf, manual delete, etc. except for thin compatibility with argc/argv.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13783

Test Plan: unit test added, some manual testing

Reviewed By: archang19

Differential Revision: D78574434

Pulled By: pdillinger

fbshipit-source-id: 609e6c6439090e6b7e9b63fbd4c2d3f04b104fcf
---
 .github/workflows/pr-jobs.yml |  2 +-
 options/options_helper.h      |  3 ++
 table/sst_file_dumper.cc      | 21 +++++++----
 table/sst_file_dumper.h       |  4 +-
 tools/sst_dump_test.cc        | 71 +++++++++++++++++++++++++++++++++++
 tools/sst_dump_tool.cc        | 59 ++++++++++++++++++-----------
 6 files changed, 126 insertions(+), 34 deletions(-)

diff --git a/.github/workflows/pr-jobs.yml b/.github/workflows/pr-jobs.yml
index eddaa85b1952..f71aa6cf8cdd 100644
--- a/.github/workflows/pr-jobs.yml
+++ b/.github/workflows/pr-jobs.yml
@@ -182,7 +182,7 @@ jobs:
     - uses: actions/checkout@v4.1.0
     - uses: "./.github/actions/pre-steps"
     - run: ENCRYPTED_ENV=1 ROCKSDB_DISABLE_SNAPPY=1 ROCKSDB_DISABLE_ZLIB=1 ROCKSDB_DISABLE_BZIP=1 ROCKSDB_DISABLE_LZ4=1 ROCKSDB_DISABLE_ZSTD=1 make V=1 J=32 -j32 check
-    - run: "./sst_dump --help | grep -E -q 'Supported compression types: kNoCompression$' # Verify no compiled in compression\n"
+    - run: "./sst_dump --help | grep -E -q 'Supported built-in compression types: kNoCompression$' # Verify no compiled in compression\n"
     - uses: "./.github/actions/post-steps"
   # ======================== Linux No Test Runs ======================= #
   build-linux-release:
diff --git a/options/options_helper.h b/options/options_helper.h
index f03179066eaf..74e953b9f507 100644
--- a/options/options_helper.h
+++ b/options/options_helper.h
@@ -72,6 +72,9 @@ std::unique_ptr<Configurable> CFOptionsAsConfigurable(
 Status StringToMap(const std::string& opts_str,
                    std::unordered_map<std::string, std::string>* opts_map);
 
+Status GetStringFromCompressionType(std::string* compression_str,
+                                    CompressionType compression_type);
+
 struct OptionsHelper {
   static const std::string kCFOptionsName /*= "ColumnFamilyOptions"*/;
   static const std::string kDBOptionsName /*= "DBOptions" */;
diff --git a/table/sst_file_dumper.cc b/table/sst_file_dumper.cc
index b19fc0ab4aff..e96cc6f0771c 100644
--- a/table/sst_file_dumper.cc
+++ b/table/sst_file_dumper.cc
@@ -260,16 +260,21 @@ Status SstFileDumper::CalculateCompressedTableSize(
 }
 
 Status SstFileDumper::ShowAllCompressionSizes(
-    size_t block_size,
-    const std::vector<std::pair<CompressionType, const char*>>&
-        compression_types,
+    size_t block_size, const std::vector<CompressionType>& compression_types,
     int32_t compress_level_from, int32_t compress_level_to,
     uint32_t max_dict_bytes, uint32_t zstd_max_train_bytes,
     uint64_t max_dict_buffer_bytes, bool use_zstd_dict_trainer) {
   fprintf(stdout, "Block Size: %" ROCKSDB_PRIszt "\n", block_size);
-  for (auto& i : compression_types) {
-    if (CompressionTypeSupported(i.first)) {
-      fprintf(stdout, "Compression: %-24s\n", i.second);
+  for (CompressionType ctype : compression_types) {
+    std::string cname;
+    if (!GetStringFromCompressionType(&cname, ctype).ok()) {
+      // Can produce names like "Reserved4F" for unrecognized values
+      cname = CompressionTypeToString(ctype);
+    }
+    if (options_.compression_manager
+            ? options_.compression_manager->SupportsCompressionType(ctype)
+            : CompressionTypeSupported(ctype)) {
+      fprintf(stdout, "Compression: %-24s\n", cname.c_str());
       CompressionOptions compress_opt;
       compress_opt.max_dict_bytes = max_dict_bytes;
       compress_opt.zstd_max_train_bytes = zstd_max_train_bytes;
@@ -278,13 +283,13 @@ Status SstFileDumper::ShowAllCompressionSizes(
       for (int32_t j = compress_level_from; j <= compress_level_to; j++) {
         fprintf(stdout, "Compression level: %d", j);
         compress_opt.level = j;
-        Status s = ShowCompressionSize(block_size, i.first, compress_opt);
+        Status s = ShowCompressionSize(block_size, ctype, compress_opt);
         if (!s.ok()) {
           return s;
         }
       }
     } else {
-      fprintf(stdout, "Unsupported compression type: %s.\n", i.second);
+      fprintf(stdout, "Unsupported compression type: %s.\n", cname.c_str());
     }
   }
   return Status::OK();
diff --git a/table/sst_file_dumper.h b/table/sst_file_dumper.h
index a1a857115a8b..7ce1b016d3b8 100644
--- a/table/sst_file_dumper.h
+++ b/table/sst_file_dumper.h
@@ -43,9 +43,7 @@ class SstFileDumper {
   Status getStatus() { return init_result_; }
 
   Status ShowAllCompressionSizes(
-      size_t block_size,
-      const std::vector<std::pair<CompressionType, const char*>>&
-          compression_types,
+      size_t block_size, const std::vector<CompressionType>& compression_types,
       int32_t compress_level_from, int32_t compress_level_to,
       uint32_t max_dict_bytes, uint32_t zstd_max_train_bytes,
       uint64_t max_dict_buffer_bytes, bool use_zstd_dict_trainer);
diff --git a/tools/sst_dump_test.cc b/tools/sst_dump_test.cc
index c513a23bc93d..2fcc33eb1d34 100644
--- a/tools/sst_dump_test.cc
+++ b/tools/sst_dump_test.cc
@@ -15,6 +15,7 @@
 #include "rocksdb/convenience.h"
 #include "rocksdb/filter_policy.h"
 #include "rocksdb/sst_dump_tool.h"
+#include "rocksdb/utilities/object_registry.h"
 #include "table/block_based/block_based_table_factory.h"
 #include "table/sst_file_dumper.h"
 #include "table/table_builder.h"
@@ -346,6 +347,76 @@ TEST_F(SSTDumpToolTest, CompressedSizes) {
   }
 }
 
+namespace {
+using Compressor8A = test::CompressorCustomAlg<kCustomCompression8A>;
+class MyManager : public CompressionManager {
+ public:
+  static constexpr const char* kCompatibilityName = "SSTDumpToolTest:MyManager";
+  const char* Name() const override { return kCompatibilityName; }
+  const char* CompatibilityName() const override { return kCompatibilityName; }
+
+  bool SupportsCompressionType(CompressionType type) const override {
+    return type == kCustomCompression8A;
+  }
+
+  std::unique_ptr<Compressor> GetCompressor(const CompressionOptions& /*opts*/,
+                                            CompressionType type) override {
+    switch (static_cast<unsigned char>(type)) {
+      case kCustomCompression8A:
+        return std::make_unique<Compressor8A>();
+      default:
+        return nullptr;
+    }
+  }
+
+  std::shared_ptr<Decompressor> GetDecompressor() override {
+    return std::make_shared<test::DecompressorCustomAlg>();
+  }
+};
+}  // namespace
+
+TEST_F(SSTDumpToolTest, CompressionManager) {
+  if (!Compressor8A::Supported()) {
+    fprintf(stderr,
+            "Prerequisite compression library not supported. Skipping\n");
+    return;
+  }
+
+  // Registery in ObjectLibrary to check that sst_dump can use named
+  // CompressionManagers with dependency injection
+  auto& library = *ObjectLibrary::Default();
+  library.AddFactory<CompressionManager>(
+      MyManager::kCompatibilityName,
+      [](const std::string& /*uri*/, std::unique_ptr<CompressionManager>* guard,
+         std::string* /*errmsg*/) {
+        *guard = std::make_unique<MyManager>();
+        return guard->get();
+      });
+
+  Options opts;
+  opts.env = env();
+  BlockBasedTableOptions table_opts;
+  table_opts.filter_policy.reset(
+      ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10, false));
+  opts.table_factory.reset(new BlockBasedTableFactory(table_opts));
+  std::string file_path = MakeFilePath("rocksdb_sst_test.sst");
+  createSST(opts, file_path, 10);
+
+  char* usage[5];
+  PopulateCommandArgs(file_path, "--command=recompress", usage);
+  snprintf(usage[3], kOptLength, "--compression_manager=%s",
+           MyManager::kCompatibilityName);
+  snprintf(usage[4], kOptLength, "--compression_types=kCustomCompression8A");
+
+  ROCKSDB_NAMESPACE::SSTDumpTool tool;
+  ASSERT_TRUE(!tool.Run(5, usage, opts));
+
+  cleanup(opts, file_path);
+  for (int i = 0; i < 5; i++) {
+    delete[] usage[i];
+  }
+}
+
 TEST_F(SSTDumpToolTest, MemEnv) {
   std::unique_ptr<Env> mem_env(NewMemEnv(env()));
   Options opts;
diff --git a/tools/sst_dump_tool.cc b/tools/sst_dump_tool.cc
index 94bf38245559..2710809bb46b 100644
--- a/tools/sst_dump_tool.cc
+++ b/tools/sst_dump_tool.cc
@@ -17,17 +17,6 @@
 
 namespace ROCKSDB_NAMESPACE {
 
-static const std::vector<std::pair<CompressionType, const char*>>
-    kCompressions = {
-        {CompressionType::kNoCompression, "kNoCompression"},
-        {CompressionType::kSnappyCompression, "kSnappyCompression"},
-        {CompressionType::kZlibCompression, "kZlibCompression"},
-        {CompressionType::kBZip2Compression, "kBZip2Compression"},
-        {CompressionType::kLZ4Compression, "kLZ4Compression"},
-        {CompressionType::kLZ4HCCompression, "kLZ4HCCompression"},
-        {CompressionType::kXpressCompression, "kXpressCompression"},
-        {CompressionType::kZSTD, "kZSTD"}};
-
 namespace {
 
 void print_help(bool to_stderr) {
@@ -98,10 +87,15 @@ void print_help(bool to_stderr) {
       be used when trying different compression algorithms
 
     --compression_types=<comma-separated list of CompressionType members, e.g.,
-      kSnappyCompression>
+      kSnappyCompression or kCustomCompressionC4>
       Can be combined with --command=recompress to run recompression for this
       list of compression types
-      Supported compression types: %s
+      Supported built-in compression types: %s
+
+    --compression_manager=<compression manager string>
+      Used with --command=recompress to specify a compression manager to use
+      instead of the built-in compression manager, which may support a
+      different set of compression types.
 
     --parse_internal_key=<0xKEY>
       Convenience option to parse an internal key on the command line. Dumps the
@@ -178,7 +172,8 @@ int SSTDumpTool::Run(int argc, char const* const* argv, Options options) {
   std::string compression_level_to_str;
   size_t block_size = 0;
   size_t readahead_size = 2 * 1024 * 1024;
-  std::vector<std::pair<CompressionType, const char*>> compression_types;
+  std::vector<CompressionType> compression_types;
+  std::shared_ptr<CompressionManager> compression_manager;
   uint64_t total_num_files = 0;
   uint64_t total_num_data_blocks = 0;
   uint64_t total_data_block_size = 0;
@@ -244,19 +239,36 @@ int SSTDumpTool::Run(int argc, char const* const* argv, Options options) {
       std::istringstream iss(compression_types_csv);
       std::string compression_type;
       has_specified_compression_types = true;
+
       while (std::getline(iss, compression_type, ',')) {
-        auto iter = std::find_if(
-            kCompressions.begin(), kCompressions.end(),
-            [&compression_type](std::pair<CompressionType, const char*> curr) {
-              return curr.second == compression_type;
-            });
-        if (iter == kCompressions.end()) {
+        auto iter =
+            OptionsHelper::compression_type_string_map.find(compression_type);
+        if (iter == OptionsHelper::compression_type_string_map.end()) {
           fprintf(stderr, "%s is not a valid CompressionType\n",
                   compression_type.c_str());
           exit(1);
         }
-        compression_types.emplace_back(*iter);
+        compression_types.emplace_back(iter->second);
+      }
+    } else if (strncmp(argv[i], "--compression_manager=", 22) == 0) {
+      std::string compression_manager_str = argv[i] + 22;
+      ConfigOptions config_options;
+      config_options.ignore_unsupported_options = false;
+      Status s = CompressionManager::CreateFromString(
+          config_options, compression_manager_str, &compression_manager);
+      if (!s.ok()) {
+        fprintf(stderr, "Failed to create compression manager: %s\n",
+                s.ToString().c_str());
+        exit(1);
+      }
+      if (compression_manager == nullptr) {
+        fprintf(stderr, "No compression manager created: %s\n",
+                compression_manager_str.c_str());
+        exit(1);
       }
+      options.compression_manager = compression_manager;
+      printf("Using compression manager: %s\n",
+             compression_manager->GetId().c_str());
     } else if (strncmp(argv[i], "--parse_internal_key=", 21) == 0) {
       std::string in_key(argv[i] + 21);
       try {
@@ -450,9 +462,12 @@ int SSTDumpTool::Run(int argc, char const* const* argv, Options options) {
     }
 
     if (command == "recompress") {
+      // TODO: consider getting supported compressions from the compression
+      // manager
       st = dumper.ShowAllCompressionSizes(
           set_block_size ? block_size : 16384,
-          compression_types.empty() ? kCompressions : compression_types,
+          compression_types.empty() ? GetSupportedCompressions()
+                                    : compression_types,
           compress_level_from, compress_level_to, compression_max_dict_bytes,
           compression_zstd_max_train_bytes, compression_max_dict_buffer_bytes,
           !compression_use_zstd_finalize_dict);

From 57ff2b24923d02f645719112181a2c04abb05426 Mon Sep 17 00:00:00 2001
From: Andrew Chang <andrewrchang@meta.com>
Date: Fri, 18 Jul 2025 15:02:49 -0700
Subject: [PATCH 188/500] Update for next release 10.6 (#13784)

Summary:
This includes:
1. Release notes from 10.5 branch
2. Version.h update
3. Format compatibility check
4. Folly commit hash update (I chose https://github.com/facebook/folly/releases/tag/v2025.06.30.00 because later commits were causing CI failures)

Previous release: https://github.com/facebook/rocksdb/pull/13719

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13784

Reviewed By: pdillinger

Differential Revision: D78587604

Pulled By: archang19

fbshipit-source-id: a8611ef4527c3c6ee5c830349b7ae41701c1efb6
---
 HISTORY.md                                       | 16 ++++++++++++++++
 Makefile                                         |  2 +-
 include/rocksdb/version.h                        |  2 +-
 tools/check_format_compatible.sh                 |  2 +-
 .../get-waiting-txns-after-timeout.md            |  1 -
 ...leSize_api_at_FSRandomAccessFile_interface.md |  1 -
 .../periodic-compaction-trigger.md               |  1 -
 .../bug_fixes/backup-engine-crash.md             |  1 -
 .../bug_fixes/multi_scan_upper_bound.md          |  1 -
 .../performance_improvements/bbiter-multiscan.md |  1 -
 ...ag_skip_checking_sst_file_sizes_on_db_open.md |  1 -
 11 files changed, 19 insertions(+), 10 deletions(-)
 delete mode 100644 unreleased_history/behavior_changes/get-waiting-txns-after-timeout.md
 delete mode 100644 unreleased_history/behavior_changes/new_GetFileSize_api_at_FSRandomAccessFile_interface.md
 delete mode 100644 unreleased_history/behavior_changes/periodic-compaction-trigger.md
 delete mode 100644 unreleased_history/bug_fixes/backup-engine-crash.md
 delete mode 100644 unreleased_history/bug_fixes/multi_scan_upper_bound.md
 delete mode 100644 unreleased_history/performance_improvements/bbiter-multiscan.md
 delete mode 100644 unreleased_history/public_api_changes/deprecate_flag_skip_checking_sst_file_sizes_on_db_open.md

diff --git a/HISTORY.md b/HISTORY.md
index b9089d9e50a4..03e08a7dc2db 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -1,6 +1,22 @@
 # Rocksdb Change Log
 > NOTE: Entries for next release do not go here. Follow instructions in `unreleased_history/README.txt`
 
+## 10.5.0 (07/18/2025)
+### Public API Changes
+* DB option skip_checking_sst_file_sizes_on_db_open is deprecated, in favor of validating file size in parallel in a thread pool, when db is opened. When DB is opened, with paranoid check enabled, a file with the wrong size would fail the DB open. With paranoid check disabled, the DB open would succeed, the column family with the corrupted file would not be read or write, while the other healthy column families could be read and write normally. When max_open_files option is not set to -1, only a subset of the files will be opened and checked. The rest of the files will be opened and checked when they are accessed.
+
+### Behavior Changes
+* PessimisticTransaction::GetWaitingTxns now returns waiting transaction information even if the current transaction has timed out. This allows the information to be surfaced to users for debugging purposes once it is known that the timeout has occured.
+* A new API GetFileSize is added to FSRandomAccessFile interface class. It uses fstat vs stat on the posix implementation which is more efficient. Caller could use it to get file size faster. This function might be required in the future for FileSystem implementation outside of the RocksDB code base.
+* RocksDB now triggers eligible compactions every 12 hours when periodic compaction is configured. This solves a limitation of the compaction trigger mechanism, which would only trigger compaction after specific events like flush, compaction, or SetOptions.
+
+### Bug Fixes
+* Fix a bug in BackupEngine that can crash backup due to a null FSWritableFile passed to WritableFileWriter.
+* Fix DB::NewMultiScan iterator to respect the scan upper bound specified in ScanOptions
+
+### Performance Improvements
+* Optimized MultiScan using BlockBasedTable to coalesce I/Os and prefetch all data blocks.
+
 ## 10.4.0 (06/20/2025)
 ### New Features
 * Add a new CF option `memtable_avg_op_scan_flush_trigger` that supports triggering memtable flush when an iterator scans through an expensive range of keys, with the average number of skipped keys from the active memtable exceeding the threshold.
diff --git a/Makefile b/Makefile
index db84477e9e6a..903ef3ce4b08 100644
--- a/Makefile
+++ b/Makefile
@@ -2492,7 +2492,7 @@ checkout_folly:
 	fi
 	@# Pin to a particular version for public CI, so that PR authors don't
 	@# need to worry about folly breaking our integration. Update periodically
-	cd third-party/folly && git reset --hard 5c626dd6a028a02e461edb5396694d48305e9284
+	cd third-party/folly && git reset --hard e95383b7c8b5b1e46cf47acf2f317d54f93c8268
 	@# Apparently missing include
 	perl -pi -e 's/(#include <atomic>)/$$1\n#include <cstring>/' third-party/folly/folly/lang/Exception.h
 	@# Warning-as-error on memcpy
diff --git a/include/rocksdb/version.h b/include/rocksdb/version.h
index c81a31aa96c4..6616b7e4e658 100644
--- a/include/rocksdb/version.h
+++ b/include/rocksdb/version.h
@@ -12,7 +12,7 @@
 // NOTE: in 'main' development branch, this should be the *next*
 // minor or major version number planned for release.
 #define ROCKSDB_MAJOR 10
-#define ROCKSDB_MINOR 5
+#define ROCKSDB_MINOR 6
 #define ROCKSDB_PATCH 0
 
 // Do not use these. We made the mistake of declaring macros starting with
diff --git a/tools/check_format_compatible.sh b/tools/check_format_compatible.sh
index 0b8df70b0295..416dfb0eaa6e 100755
--- a/tools/check_format_compatible.sh
+++ b/tools/check_format_compatible.sh
@@ -137,7 +137,7 @@ EOF
 
 # To check for DB forward compatibility with loading options (old version
 # reading data from new), as well as backward compatibility
-declare -a db_forward_with_options_refs=("8.6.fb" "8.7.fb" "8.8.fb" "8.9.fb" "8.10.fb" "8.11.fb" "9.0.fb" "9.1.fb" "9.2.fb" "9.3.fb" "9.4.fb" "9.5.fb" "9.6.fb" "9.7.fb" "9.8.fb" "9.9.fb" "9.10.fb" "9.11.fb" "10.0.fb" "10.1.fb" "10.2.fb" "10.3.fb" "10.4.fb")
+declare -a db_forward_with_options_refs=("8.6.fb" "8.7.fb" "8.8.fb" "8.9.fb" "8.10.fb" "8.11.fb" "9.0.fb" "9.1.fb" "9.2.fb" "9.3.fb" "9.4.fb" "9.5.fb" "9.6.fb" "9.7.fb" "9.8.fb" "9.9.fb" "9.10.fb" "9.11.fb" "10.0.fb" "10.1.fb" "10.2.fb" "10.3.fb" "10.4.fb" "10.5.fb")
 # To check for DB forward compatibility without loading options (in addition
 # to the "with loading options" set), as well as backward compatibility
 declare -a db_forward_no_options_refs=() # N/A at the moment
diff --git a/unreleased_history/behavior_changes/get-waiting-txns-after-timeout.md b/unreleased_history/behavior_changes/get-waiting-txns-after-timeout.md
deleted file mode 100644
index 71ace60e9e3a..000000000000
--- a/unreleased_history/behavior_changes/get-waiting-txns-after-timeout.md
+++ /dev/null
@@ -1 +0,0 @@
-* PessimisticTransaction::GetWaitingTxns now returns waiting transaction information even if the current transaction has timed out. This allows the information to be surfaced to users for debugging purposes once it is known that the timeout has occured.
diff --git a/unreleased_history/behavior_changes/new_GetFileSize_api_at_FSRandomAccessFile_interface.md b/unreleased_history/behavior_changes/new_GetFileSize_api_at_FSRandomAccessFile_interface.md
deleted file mode 100644
index e006e114f53a..000000000000
--- a/unreleased_history/behavior_changes/new_GetFileSize_api_at_FSRandomAccessFile_interface.md
+++ /dev/null
@@ -1 +0,0 @@
-A new API GetFileSize is added to FSRandomAccessFile interface class. It uses fstat vs stat on the posix implementation which is more efficient. Caller could use it to get file size faster. This function might be required in the future for FileSystem implementation outside of the RocksDB code base.
diff --git a/unreleased_history/behavior_changes/periodic-compaction-trigger.md b/unreleased_history/behavior_changes/periodic-compaction-trigger.md
deleted file mode 100644
index b9c8e7c1c227..000000000000
--- a/unreleased_history/behavior_changes/periodic-compaction-trigger.md
+++ /dev/null
@@ -1 +0,0 @@
-* RocksDB now triggers eligible compactions every 12 hours when periodic compaction is configured. This solves a limitation of the compaction trigger mechanism, which would only trigger compaction after specific events like flush, compaction, or SetOptions.
diff --git a/unreleased_history/bug_fixes/backup-engine-crash.md b/unreleased_history/bug_fixes/backup-engine-crash.md
deleted file mode 100644
index 20ce0894f83e..000000000000
--- a/unreleased_history/bug_fixes/backup-engine-crash.md
+++ /dev/null
@@ -1 +0,0 @@
-* Fix a bug in BackupEngine that can crash backup due to a null FSWritableFile passed to WritableFileWriter.
diff --git a/unreleased_history/bug_fixes/multi_scan_upper_bound.md b/unreleased_history/bug_fixes/multi_scan_upper_bound.md
deleted file mode 100644
index 973bc84401ea..000000000000
--- a/unreleased_history/bug_fixes/multi_scan_upper_bound.md
+++ /dev/null
@@ -1 +0,0 @@
-Fix DB::NewMultiScan iterator to respect the scan upper bound specified in ScanOptions
diff --git a/unreleased_history/performance_improvements/bbiter-multiscan.md b/unreleased_history/performance_improvements/bbiter-multiscan.md
deleted file mode 100644
index 7a01d9892868..000000000000
--- a/unreleased_history/performance_improvements/bbiter-multiscan.md
+++ /dev/null
@@ -1 +0,0 @@
-* Optimized MultiScan using BlockBasedTable to coalesce I/Os and prefetch all data blocks.
diff --git a/unreleased_history/public_api_changes/deprecate_flag_skip_checking_sst_file_sizes_on_db_open.md b/unreleased_history/public_api_changes/deprecate_flag_skip_checking_sst_file_sizes_on_db_open.md
deleted file mode 100644
index 901537f5163d..000000000000
--- a/unreleased_history/public_api_changes/deprecate_flag_skip_checking_sst_file_sizes_on_db_open.md
+++ /dev/null
@@ -1 +0,0 @@
-DB option skip_checking_sst_file_sizes_on_db_open is deprecated, in favor of validating file size in parallel in a thread pool, when db is opened. When DB is opened, with paranoid check enabled, a file with the wrong size would fail the DB open. With paranoid check disabled, the DB open would succeed, the column family with the corrupted file would not be read or write, while the other healthy column families could be read and write normally. When max_open_files option is not set to -1, only a subset of the files will be opened and checked. The rest of the files will be opened and checked when they are accessed.

From fe68fbcd7f80b51e961da1be3e037f4d59f89f29 Mon Sep 17 00:00:00 2001
From: Ryan Hancock <krhancock@meta.com>
Date: Mon, 21 Jul 2025 13:09:53 -0700
Subject: [PATCH 189/500] Prepare() Scan Option Pruning for LevelIterator
 (#13780)

Summary:
This diff introduces the ScanOption Pruning, previously the intent was to do prefetching for each sub-iterator of the level iterator, however since BlockBasedIterator does not prefetch asynchronously, this optimization does not make sense just yet.

For now we will prune the ScanOptions to the overlapping ranges and make sure they are properly piped to the underlying layers (during Prepare, and Seek).

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13780

Reviewed By: cbi42

Differential Revision: D78436869

Pulled By: krhancoc

fbshipit-source-id: 681fe7f7f88b04b5c2d60cb3a5de01e03f6f8431
---
 db/version_set.cc | 71 +++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 65 insertions(+), 6 deletions(-)

diff --git a/db/version_set.cc b/db/version_set.cc
index ce1fff3cdf90..1acbfebda1ca 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -95,6 +95,8 @@ namespace ROCKSDB_NAMESPACE {
 
 namespace {
 
+using ScanOptionsMap = std::unordered_map<size_t, std::vector<ScanOptions>>;
+
 // Find File in LevelFilesBrief data structure
 // Within an index range defined by left and right
 int FindFileInRange(const InternalKeyComparator& icmp,
@@ -1100,9 +1102,46 @@ class LevelIterator final : public InternalIterator {
   }
 
   void Prepare(const std::vector<ScanOptions>* scan_opts) override {
+    // We assume here that scan_opts is sorted such that
+    // scan_opts[0].range.start < scan_opts[1].range.start, and non overlapping
     scan_opts_ = scan_opts;
-    if (file_iter_.iter()) {
-      file_iter_.Prepare(scan_opts_);
+    if (scan_opts_ == nullptr) {
+      return;
+    }
+
+    file_to_scan_opts_ = std::make_unique<ScanOptionsMap>();
+    for (size_t k = 0; k < scan_opts_->size(); k++) {
+      const ScanOptions& opt = scan_opts_->at(k);
+      auto start = opt.range.start;
+      auto end = opt.range.limit;
+
+      if (!start.has_value()) {
+        continue;
+      }
+
+      // We can capture this case in the future, but for now lets skip this.
+      if (!end.has_value()) {
+        continue;
+      }
+
+      InternalKey istart(start.value(), kMaxSequenceNumber, kValueTypeForSeek);
+      InternalKey iend(end.value(), 0, kValueTypeForSeekForPrev);
+
+      // TODO: This needs to be optimized, right now we iterate twice, which
+      // we dont need to. We can do this in N rather than 2N.
+      size_t fstart = FindFile(icomparator_, *flevel_, istart.Encode());
+      size_t fend = FindFile(icomparator_, *flevel_, iend.Encode());
+
+      // We need to check the relevant cases
+      // Cases:
+      // 1. [  S        E  ]
+      // 2. [  S  ]  [  E  ]
+      // 3. [  S  ] ...... [  E  ]
+      for (auto i = fstart; i <= fend; i++) {
+        if (i < flevel_->num_files) {
+          (*file_to_scan_opts_)[i].emplace_back(start.value(), end.value());
+        }
+      }
     }
   }
 
@@ -1233,6 +1272,9 @@ class LevelIterator final : public InternalIterator {
   bool to_return_sentinel_ = false;
   const std::vector<ScanOptions>* scan_opts_;
 
+  // Our stored scan_opts for each prefix
+  std::unique_ptr<ScanOptionsMap> file_to_scan_opts_ = nullptr;
+
   // Sets flags for if we should return the sentinel key next.
   // The condition for returning sentinel is reaching the end of current
   // file_iter_: !Valid() && status.().ok().
@@ -1494,7 +1536,18 @@ bool LevelIterator::SkipEmptyFileForward() {
     // LevelIterator::Seek*, it should also call Seek* into the corresponding
     // range tombstone iterator.
     if (file_iter_.iter() != nullptr) {
-      file_iter_.SeekToFirst();
+      // If we are doing prepared scan opts then we should seek to the values
+      // specified by the scan opts
+      if (scan_opts_ && (*file_to_scan_opts_)[file_index_].size()) {
+        const ScanOptions& opts = file_to_scan_opts_->at(file_index_).front();
+        if (opts.range.start.has_value()) {
+          InternalKey target(*opts.range.start.AsPtr(), kMaxSequenceNumber,
+                             kValueTypeForSeek);
+          file_iter_.Seek(target.Encode());
+        }
+      } else {
+        file_iter_.SeekToFirst();
+      }
       if (range_tombstone_iter_) {
         if (*range_tombstone_iter_) {
           (*range_tombstone_iter_)->SeekToFirst();
@@ -1542,10 +1595,15 @@ void LevelIterator::SetFileIterator(InternalIterator* iter) {
   }
 
   InternalIterator* old_iter = file_iter_.Set(iter);
-  // Since this is a new table iterator, no need to call Prepare() if
-  // scan_opts_ is null
   if (iter && scan_opts_) {
-    file_iter_.Prepare(scan_opts_);
+    if (file_to_scan_opts_.get() &&
+        file_to_scan_opts_->find(file_index_) != file_to_scan_opts_->end()) {
+      const std::vector<ScanOptions>& opts =
+          file_to_scan_opts_->at(file_index_);
+      file_iter_.Prepare(&opts);
+    } else {
+      file_iter_.Prepare(scan_opts_);
+    }
   }
 
   // Update the read pattern for PrefetchBuffer.
@@ -1582,6 +1640,7 @@ void LevelIterator::InitFileIterator(size_t new_file_index) {
     }
   }
 }
+
 }  // anonymous namespace
 
 Status Version::GetTableProperties(const ReadOptions& read_options,

From c50a2b68bb269404473f4df786b9c77edba398fe Mon Sep 17 00:00:00 2001
From: Jay Huh <jewoongh@meta.com>
Date: Mon, 21 Jul 2025 14:43:40 -0700
Subject: [PATCH 190/500] Expose GetTtl() API in TTL DB (#13790)

Summary:
As title

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13790

Test Plan:
```
./ttl_test --gtest_filter="*TtlTest.ChangeTtlOnOpenDb*"
```

Reviewed By: cbi42

Differential Revision: D78670347

Pulled By: jaykorean

fbshipit-source-id: 1b2538d6cd0f2a0fbf397a5d2f677852f97272c4
---
 include/rocksdb/utilities/db_ttl.h             |  2 ++
 .../public_api_changes/get_ttl_in_ttl_db.md    |  1 +
 utilities/ttl/db_ttl_impl.cc                   | 18 ++++++++++++++++++
 utilities/ttl/db_ttl_impl.h                    |  3 +++
 utilities/ttl/ttl_test.cc                      |  3 +++
 5 files changed, 27 insertions(+)
 create mode 100644 unreleased_history/public_api_changes/get_ttl_in_ttl_db.md

diff --git a/include/rocksdb/utilities/db_ttl.h b/include/rocksdb/utilities/db_ttl.h
index 12f5cbac0f75..02313277cd8a 100644
--- a/include/rocksdb/utilities/db_ttl.h
+++ b/include/rocksdb/utilities/db_ttl.h
@@ -63,6 +63,8 @@ class DBWithTTL : public StackableDB {
 
   virtual void SetTtl(ColumnFamilyHandle* h, int32_t ttl) = 0;
 
+  virtual Status GetTtl(ColumnFamilyHandle* h, int32_t* ttl) = 0;
+
  protected:
   explicit DBWithTTL(DB* db) : StackableDB(db) {}
 };
diff --git a/unreleased_history/public_api_changes/get_ttl_in_ttl_db.md b/unreleased_history/public_api_changes/get_ttl_in_ttl_db.md
new file mode 100644
index 000000000000..6a118735a526
--- /dev/null
+++ b/unreleased_history/public_api_changes/get_ttl_in_ttl_db.md
@@ -0,0 +1 @@
+GetTtl() API is now available in TTL DB
diff --git a/utilities/ttl/db_ttl_impl.cc b/utilities/ttl/db_ttl_impl.cc
index 55354c6cbce3..08a2515197f0 100644
--- a/utilities/ttl/db_ttl_impl.cc
+++ b/utilities/ttl/db_ttl_impl.cc
@@ -635,4 +635,22 @@ void DBWithTTLImpl::SetTtl(ColumnFamilyHandle* h, int32_t ttl) {
   filter->SetTtl(ttl);
 }
 
+Status DBWithTTLImpl::GetTtl(ColumnFamilyHandle* h, int32_t* ttl) {
+  if (h == nullptr || ttl == nullptr) {
+    return Status::InvalidArgument(
+        "column family handle or ttl cannot be null");
+  }
+  std::shared_ptr<TtlCompactionFilterFactory> filter;
+  Options opts;
+  opts = GetOptions(h);
+  filter = std::static_pointer_cast<TtlCompactionFilterFactory>(
+      opts.compaction_filter_factory);
+  if (!filter) {
+    return Status::InvalidArgument(
+        "TTLCompactionFilterFactory is not set for TTLDB");
+  }
+  *ttl = filter->GetTtl();
+  return Status::OK();
+}
+
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/utilities/ttl/db_ttl_impl.h b/utilities/ttl/db_ttl_impl.h
index 731cd3955fe1..9b7710739aa5 100644
--- a/utilities/ttl/db_ttl_impl.h
+++ b/utilities/ttl/db_ttl_impl.h
@@ -100,6 +100,8 @@ class DBWithTTLImpl : public DBWithTTL {
 
   void SetTtl(ColumnFamilyHandle* h, int32_t ttl) override;
 
+  Status GetTtl(ColumnFamilyHandle* h, int32_t* ttl) override;
+
  private:
   // remember whether the Close completes or not
   bool closed_;
@@ -184,6 +186,7 @@ class TtlCompactionFilterFactory : public CompactionFilterFactory {
   std::unique_ptr<CompactionFilter> CreateCompactionFilter(
       const CompactionFilter::Context& context) override;
   void SetTtl(int32_t ttl) { ttl_ = ttl; }
+  int32_t GetTtl() { return ttl_; }
 
   const char* Name() const override { return kClassName(); }
   static const char* kClassName() { return "TtlCompactionFilterFactory"; }
diff --git a/utilities/ttl/ttl_test.cc b/utilities/ttl/ttl_test.cc
index 37bfa7d662a0..798d1d4425e2 100644
--- a/utilities/ttl/ttl_test.cc
+++ b/utilities/ttl/ttl_test.cc
@@ -720,6 +720,9 @@ TEST_F(TtlTest, ChangeTtlOnOpenDb) {
 
   OpenTtl(1);  // T=0:Open the db with ttl = 2
   SetTtl(3);
+  int32_t ttl = 0;
+  ASSERT_OK(db_ttl_->GetTtl(db_ttl_->DefaultColumnFamily(), &ttl));
+  ASSERT_EQ(ttl, 3);
   PutValues(0, kSampleSize_);  // T=0:Insert Set1. Delete at t=2
   SleepCompactCheck(2, 0, kSampleSize_, true);  // T=2:Set1 should be there
   CloseTtl();

From ca5d60fd692f681be1b9836ffb9855ffd554f02f Mon Sep 17 00:00:00 2001
From: Xingbo Wang <xbw@fb.com>
Date: Mon, 21 Jul 2025 16:21:19 -0700
Subject: [PATCH 191/500] Switch back to FSWritableFile in external sst file
 ingestion job (#13791)

Summary:
This patch reverted "NewRandomRWFile" back to "ReopenWritableFile" in external sst file ingestion job when file is linked instead of copied. The reason is that some of the file systems do not support "NewRandomRWFile". A long term fix is being worked in progress.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13791

Test Plan: Unit test

Reviewed By: pdillinger

Differential Revision: D78697825

Pulled By: xingbowang

fbshipit-source-id: d3651223ab1f2369aac34b772bba8049c6c2c628
---
 db/external_sst_file_ingestion_job.cc | 10 ++-----
 env/env_encryption.cc                 | 41 +++++++++++++++++++++++++--
 2 files changed, 41 insertions(+), 10 deletions(-)

diff --git a/db/external_sst_file_ingestion_job.cc b/db/external_sst_file_ingestion_job.cc
index 9259fdd78db3..811bb04ac1bf 100644
--- a/db/external_sst_file_ingestion_job.cc
+++ b/db/external_sst_file_ingestion_job.cc
@@ -156,10 +156,6 @@ Status ExternalSstFileIngestionJob::Prepare(
         // It is unsafe to assume application had sync the file and file
         // directory before ingest the file. For integrity of RocksDB we need
         // to sync the file.
-        // Use FSRandomRWFile instead of FSWritableFile, as in encrypted file
-        // system the FSWritableFile will append a new prefix to the end of the
-        // file when the file exists, which causes file corruption. On the
-        // contrary, FSRandomRWFile handles an existing file correctly.
 
         // TODO(xingbo), We should in general be moving away from production
         // uses of ReuseWritableFile (except explicitly for WAL recycling),
@@ -168,9 +164,9 @@ Status ExternalSstFileIngestionJob::Prepare(
         // re-open+sync+close combo but can (a) be reused easily, and (b) be
         // overridden to do that more cleanly, e.g. in EncryptedEnv.
         // https://github.com/facebook/rocksdb/issues/13741
-        std::unique_ptr<FSRandomRWFile> file_to_sync;
-        Status s = fs_->NewRandomRWFile(path_inside_db, env_options_,
-                                        &file_to_sync, nullptr);
+        std::unique_ptr<FSWritableFile> file_to_sync;
+        Status s = fs_->ReopenWritableFile(path_inside_db, env_options_,
+                                           &file_to_sync, nullptr);
         TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Prepare:Reopen",
                                  &s);
         // Some file systems (especially remote/distributed) don't support
diff --git a/env/env_encryption.cc b/env/env_encryption.cc
index 16a3c32819f0..9565b9d9bc90 100644
--- a/env/env_encryption.cc
+++ b/env/env_encryption.cc
@@ -665,17 +665,52 @@ class EncryptedFileSystemImpl : public EncryptedFileSystem {
                               std::unique_ptr<FSWritableFile>* result,
                               IODebugContext* dbg) override {
     result->reset();
-    if (options.use_mmap_writes) {
+    if (options.use_mmap_reads || options.use_mmap_writes) {
       return IOStatus::InvalidArgument();
     }
+
+    size_t prefix_length = 0;
+    std::unique_ptr<BlockAccessCipherStream> stream;
+
     // Open file using underlying Env implementation
     std::unique_ptr<FSWritableFile> underlying;
-    IOStatus status =
+    auto status =
         FileSystemWrapper::ReopenWritableFile(fname, options, &underlying, dbg);
     if (!status.ok()) {
       return status;
     }
-    return CreateWritableEncryptedFile(fname, underlying, options, result, dbg);
+
+    if (underlying->GetFileSize(options.io_options, dbg) != 0) {
+      // read the cipher stream from file for non-empty file
+      std::unique_ptr<FSRandomAccessFile> underlying_file_reader;
+      status = FileSystemWrapper::NewRandomAccessFile(
+          fname, options, &underlying_file_reader, dbg);
+      if (!status.ok()) {
+        return status;
+      }
+
+      status = CreateRandomReadCipherStream(
+          fname, underlying_file_reader, options, &prefix_length, &stream, dbg);
+
+      if (!status.ok()) {
+        return status;
+      }
+    } else {
+      // create cipher stream for new or empty file
+      status = CreateWritableCipherStream(fname, underlying, options,
+                                          &prefix_length, &stream, dbg);
+      if (!status.ok()) {
+        return status;
+      }
+    }
+
+    if (stream) {
+      result->reset(new EncryptedWritableFile(
+          std::move(underlying), std::move(stream), prefix_length));
+    } else {
+      result->reset(underlying.release());
+    }
+    return status;
   }
 
   IOStatus ReuseWritableFile(const std::string& fname,

From 463f9fd9f264919a018bf144fa35e2a7efaf55a4 Mon Sep 17 00:00:00 2001
From: Richard Barnes <rbarnes@meta.com>
Date: Tue, 22 Jul 2025 10:56:07 -0700
Subject: [PATCH 192/500] Del redundant-static-def in
 internal_repo_rocksdb/repo/tools/sst_dump_test.cc +1 (#13793)

Summary:
Pull Request resolved: https://github.com/facebook/rocksdb/pull/13793

LLVM has a warning `-Wdeprecated-redundant-constexpr-static-def` which raises the warning:

> warning: out-of-line definition of constexpr static data member is redundant in C++17 and is deprecated

Since we are now on C++20, we can remove the out-of-line definition of constexpr static data members. This diff does so.

 - If you approve of this diff, please use the "Accept & Ship" button :-)

Reviewed By: meyering

Differential Revision: D78635005

fbshipit-source-id: bd7cbfff0580b9579e78237ec4371615d3609536
---
 tools/sst_dump_test.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tools/sst_dump_test.cc b/tools/sst_dump_test.cc
index 2fcc33eb1d34..9b789b4bb2bf 100644
--- a/tools/sst_dump_test.cc
+++ b/tools/sst_dump_test.cc
@@ -177,7 +177,6 @@ class SSTDumpToolTest : public testing::Test {
   constexpr static int kNumKey = 1024;
 };
 
-constexpr int SSTDumpToolTest::kNumKey;
 
 TEST_F(SSTDumpToolTest, HelpAndVersion) {
   Options opts;

From 668067e0bf723b560b5c6e523e628c60c8f6a146 Mon Sep 17 00:00:00 2001
From: Richard Barnes <rbarnes@meta.com>
Date: Tue, 22 Jul 2025 11:49:12 -0700
Subject: [PATCH 193/500] Del redundant-static-def in
 internal_repo_rocksdb/repo/db/db_with_timestamp_basic_test.cc +1 (#13794)

Summary:
Pull Request resolved: https://github.com/facebook/rocksdb/pull/13794

LLVM has a warning `-Wdeprecated-redundant-constexpr-static-def` which raises the warning:

> warning: out-of-line definition of constexpr static data member is redundant in C++17 and is deprecated

Since we are now on C++20, we can remove the out-of-line definition of constexpr static data members. This diff does so.

 - If you approve of this diff, please use the "Accept & Ship" button :-)

Reviewed By: meyering

Differential Revision: D78635037

fbshipit-source-id: a90c68469947705c65f36588b2d575237689dbe8
---
 db/db_with_timestamp_basic_test.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/db/db_with_timestamp_basic_test.cc b/db/db_with_timestamp_basic_test.cc
index 1e20ae018477..17390681e0b4 100644
--- a/db/db_with_timestamp_basic_test.cc
+++ b/db/db_with_timestamp_basic_test.cc
@@ -2400,7 +2400,6 @@ class DataVisibilityTest : public DBBasicTestWithTimestampBase {
     }
   }
 };
-constexpr int DataVisibilityTest::kTestDataSize;
 
 // Application specifies timestamp but not snapshot.
 //           reader              writer

From 351d212777fcd532f9f3bfa48827ee0582b4f7d2 Mon Sep 17 00:00:00 2001
From: Ryan Hancock <krhancock@meta.com>
Date: Tue, 22 Jul 2025 17:46:45 -0700
Subject: [PATCH 194/500] Ensure Property Bags are Pushed Down to
 BlockBasedIterator (#13795)

Summary:
This diff fixes up a miss in which the property_bag was not pushed down to the BlockBasedIterator.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13795

Reviewed By: anand1976

Differential Revision: D78762294

Pulled By: krhancoc

fbshipit-source-id: 8970b0a87e35d07d5a0dd16f360ec96859f66550
---
 db/version_set.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/db/version_set.cc b/db/version_set.cc
index 1acbfebda1ca..742198d44cd8 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -1140,6 +1140,7 @@ class LevelIterator final : public InternalIterator {
       for (auto i = fstart; i <= fend; i++) {
         if (i < flevel_->num_files) {
           (*file_to_scan_opts_)[i].emplace_back(start.value(), end.value());
+          (*file_to_scan_opts_)[i].back().property_bag = opt.property_bag;
         }
       }
     }

From 124dd30879c3c3761c0fabd844cd0ee27a86d55f Mon Sep 17 00:00:00 2001
From: jainpr <8664107+jainpr@users.noreply.github.com>
Date: Wed, 23 Jul 2025 11:43:03 -0700
Subject: [PATCH 195/500] Remove yield in point lock manager (#13796)

Summary:
The yield is actually of not much use because waitFor should already be doing that.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13796

Reviewed By: pdillinger

Differential Revision: D78823656

Pulled By: jainpr

fbshipit-source-id: 040eaf596938ce8db535bc810ad77a9e50b2d551
---
 utilities/transactions/lock/point/point_lock_manager.cc | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/utilities/transactions/lock/point/point_lock_manager.cc b/utilities/transactions/lock/point/point_lock_manager.cc
index 9e0426429cff..82a12f17d506 100644
--- a/utilities/transactions/lock/point/point_lock_manager.cc
+++ b/utilities/transactions/lock/point/point_lock_manager.cc
@@ -322,9 +322,6 @@ Status PointLockManager::AcquireWithTimeout(
         // instead of exiting this while loop below.
         uint64_t now = env->NowMicros();
         if (static_cast<uint64_t>(cv_end_time) > now) {
-          // This may be invoked multiple times since we divide
-          // the time into smaller intervals.
-          (void)ROCKSDB_THREAD_YIELD_CHECK_ABORT();
           result = stripe->stripe_cv->WaitFor(stripe->stripe_mutex,
                                               cv_end_time - now);
           cv_wait_fail = !result.ok() && !result.IsTimedOut();

From 961880b4580d0b83225e8f718bb51bec329236e7 Mon Sep 17 00:00:00 2001
From: Xingbo Wang <xbw@fb.com>
Date: Wed, 23 Jul 2025 17:12:41 -0700
Subject: [PATCH 196/500] Create a new API FileSystem::SyncFile for file sync
 (#13762)

Summary:
Create a new API FileSystem::SyncFile for file sync, so that we could use file sync directly in places where we need to sync file content to file system without any modification. This is mostly used combined with link file. In some file system link file does not guarantee the file content is synced to file system.

https://github.com/facebook/rocksdb/issues/13741

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13762

Test Plan:
Unit test
T229418750

Reviewed By: pdillinger

Differential Revision: D78121137

Pulled By: xingbowang

fbshipit-source-id: 0ea8a5a3b486e0b61636700400613fed6bbd3faa
---
 db/external_sst_file_basic_test.cc            | 12 ++---
 db/external_sst_file_ingestion_job.cc         | 49 ++++++++-----------
 env/composite_env_wrapper.h                   |  6 +++
 env/env.cc                                    | 44 +++++++++++++++++
 env/env_encryption.cc                         | 11 +++++
 env/file_system.cc                            | 17 +++++++
 env/mock_env.cc                               |  8 +++
 env/mock_env.h                                |  4 ++
 include/rocksdb/env.h                         | 12 +++++
 include/rocksdb/file_system.h                 | 18 +++++++
 .../new_SyncFile_api_at_FileSystem_interface  |  1 +
 utilities/fault_injection_env.cc              | 11 +++++
 utilities/fault_injection_env.h               |  3 ++
 utilities/fault_injection_fs.cc               | 11 +++++
 utilities/fault_injection_fs.h                |  4 ++
 15 files changed, 176 insertions(+), 35 deletions(-)
 create mode 100644 unreleased_history/public_api_changes/new_SyncFile_api_at_FileSystem_interface

diff --git a/db/external_sst_file_basic_test.cc b/db/external_sst_file_basic_test.cc
index 6bc46938658d..6a1986cc5398 100644
--- a/db/external_sst_file_basic_test.cc
+++ b/db/external_sst_file_basic_test.cc
@@ -1311,7 +1311,7 @@ TEST_F(ExternalSSTFileBasicTest, SyncFailure) {
     });
     if (i == 0) {
       SyncPoint::GetInstance()->SetCallBack(
-          "ExternalSstFileIngestionJob::Prepare:Reopen", [&](void* s) {
+          "ExternalSstFileIngestionJob::CheckSyncReturnCode", [&](void* s) {
             Status* status = static_cast<Status*>(s);
             if (status->IsNotSupported()) {
               no_sync = true;
@@ -1372,11 +1372,11 @@ TEST_F(ExternalSSTFileBasicTest, ReopenNotSupported) {
   options.create_if_missing = true;
   options.env = env_;
 
-  SyncPoint::GetInstance()->SetCallBack(
-      "ExternalSstFileIngestionJob::Prepare:Reopen", [&](void* arg) {
-        Status* s = static_cast<Status*>(arg);
-        *s = Status::NotSupported();
-      });
+  SyncPoint::GetInstance()->SetCallBack("FileSystem::SyncFile:Open",
+                                        [&](void* arg) {
+                                          Status* s = static_cast<Status*>(arg);
+                                          *s = Status::NotSupported();
+                                        });
   SyncPoint::GetInstance()->EnableProcessing();
 
   DestroyAndReopen(options);
diff --git a/db/external_sst_file_ingestion_job.cc b/db/external_sst_file_ingestion_job.cc
index 811bb04ac1bf..0807f40a8f4f 100644
--- a/db/external_sst_file_ingestion_job.cc
+++ b/db/external_sst_file_ingestion_job.cc
@@ -156,35 +156,26 @@ Status ExternalSstFileIngestionJob::Prepare(
         // It is unsafe to assume application had sync the file and file
         // directory before ingest the file. For integrity of RocksDB we need
         // to sync the file.
-
-        // TODO(xingbo), We should in general be moving away from production
-        // uses of ReuseWritableFile (except explicitly for WAL recycling),
-        // ReopenWritableFile, and NewRandomRWFile. We should create a
-        // FileSystem::SyncFile/FsyncFile API that by default does the
-        // re-open+sync+close combo but can (a) be reused easily, and (b) be
-        // overridden to do that more cleanly, e.g. in EncryptedEnv.
-        // https://github.com/facebook/rocksdb/issues/13741
-        std::unique_ptr<FSWritableFile> file_to_sync;
-        Status s = fs_->ReopenWritableFile(path_inside_db, env_options_,
-                                           &file_to_sync, nullptr);
-        TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Prepare:Reopen",
-                                 &s);
-        // Some file systems (especially remote/distributed) don't support
-        // reopening a file for writing and don't require reopening and
-        // syncing the file. Ignore the NotSupported error in that case.
-        if (!s.IsNotSupported()) {
-          status = s;
-          if (status.ok()) {
-            TEST_SYNC_POINT(
-                "ExternalSstFileIngestionJob::BeforeSyncIngestedFile");
-            status = SyncIngestedFile(file_to_sync.get());
-            TEST_SYNC_POINT(
-                "ExternalSstFileIngestionJob::AfterSyncIngestedFile");
-            if (!status.ok()) {
-              ROCKS_LOG_WARN(db_options_.info_log,
-                             "Failed to sync ingested file %s: %s",
-                             path_inside_db.c_str(), status.ToString().c_str());
-            }
+        TEST_SYNC_POINT("ExternalSstFileIngestionJob::BeforeSyncIngestedFile");
+        auto s = fs_->SyncFile(path_inside_db, env_options_, IOOptions(),
+                               db_options_.use_fsync, nullptr);
+        TEST_SYNC_POINT("ExternalSstFileIngestionJob::AfterSyncIngestedFile");
+        TEST_SYNC_POINT_CALLBACK(
+            "ExternalSstFileIngestionJob::CheckSyncReturnCode", &s);
+        if (!s.ok()) {
+          if (s.IsNotSupported()) {
+            // Some file systems (especially remote/distributed) don't support
+            // SyncFile API. Ignore the NotSupported error in that case.
+            ROCKS_LOG_WARN(db_options_.info_log,
+                           "After link the file, SyncFile API is not supported "
+                           "for file %s: %s",
+                           path_inside_db.c_str(), status.ToString().c_str());
+          } else {
+            // for other errors, propagate the error
+            status = s;
+            ROCKS_LOG_WARN(db_options_.info_log,
+                           "Failed to sync ingested file %s: %s",
+                           path_inside_db.c_str(), status.ToString().c_str());
           }
         }
       } else if (status.IsNotSupported() &&
diff --git a/env/composite_env_wrapper.h b/env/composite_env_wrapper.h
index f9b9c6994e53..e2eab9957f85 100644
--- a/env/composite_env_wrapper.h
+++ b/env/composite_env_wrapper.h
@@ -142,6 +142,12 @@ class CompositeEnv : public Env {
     return file_system_->LinkFile(s, t, io_opts, &dbg);
   }
 
+  Status SyncFile(const std::string& fname, const EnvOptions& env_options,
+                  bool use_fsync) override {
+    return file_system_->SyncFile(fname, env_options, IOOptions(), use_fsync,
+                                  nullptr);
+  }
+
   Status NumFileLinks(const std::string& fname, uint64_t* count) override {
     IOOptions io_opts;
     IODebugContext dbg;
diff --git a/env/env.cc b/env/env.cc
index 896c31a477d3..7d97c42b0fa0 100644
--- a/env/env.cc
+++ b/env/env.cc
@@ -528,6 +528,13 @@ class LegacyFileSystemWrapper : public FileSystem {
     return status_to_io_status(target_->LinkFile(s, t));
   }
 
+  IOStatus SyncFile(const std::string& fname, const FileOptions& file_options,
+                    const IOOptions& /*io_options*/, bool use_fsync,
+                    IODebugContext* /*dbg*/) override {
+    return status_to_io_status(
+        target_->SyncFile(fname, file_options, use_fsync));
+  }
+
   IOStatus NumFileLinks(const std::string& fname, const IOOptions& /*options*/,
                         uint64_t* count, IODebugContext* /*dbg*/) override {
     return status_to_io_status(target_->NumFileLinks(fname, count));
@@ -859,6 +866,43 @@ std::string Env::GenerateUniqueId() {
   return result;
 }
 
+// This API Env::SyncFile is used for testing for 2 reasons:
+//
+// 1. The default implementation of SyncFile API is essentially a wrapper of
+// other FileSystem APIs. FaultInjectionTestEnv uses this default
+// implementation to call other FileSystem APIs defined at
+// FaultInjectionTestEnv class to inject failurses. See
+// FaultInjectionTestEnv::SyncFile for more details
+//
+// 2. Some of old tests are using LegacyFileSystemWrapper.
+// LegacyFileSystemWrapper forwards the API call to EnvWrapper, which forwards
+// to CompositeEnv, and then forwards to the actual FileSystem implemention.
+// Without this API in Env, LegacyFileSystemWrapper will not be able to
+// forward the API call to EnvWrapper, causing the default FileSystem API to
+// be called.
+//
+// Due to the above reason, adding a new API in FileSystem, would very likely
+// require the same API to be added to Env.
+//
+// TODO xingbo. Getting rid of FileSystem functions from Env.
+// We need to simplify the relationship between Env and FileSystem. At least
+// for internal test, we should stop using Env and switch to FileSystem, if
+// possible. Related github issue #9274
+Status Env::SyncFile(const std::string& fname, const EnvOptions& env_options,
+                     bool use_fsync) {
+  std::unique_ptr<WritableFile> file_to_sync;
+  auto status = ReopenWritableFile(fname, &file_to_sync, env_options);
+  TEST_SYNC_POINT_CALLBACK("FileSystem::SyncFile:Open", &status);
+  if (status.ok()) {
+    if (use_fsync) {
+      status = file_to_sync->Fsync();
+    } else {
+      status = file_to_sync->Sync();
+    }
+  }
+  return status;
+}
+
 SequentialFile::~SequentialFile() = default;
 
 RandomAccessFile::~RandomAccessFile() = default;
diff --git a/env/env_encryption.cc b/env/env_encryption.cc
index 9565b9d9bc90..98c1f38083a8 100644
--- a/env/env_encryption.cc
+++ b/env/env_encryption.cc
@@ -664,6 +664,8 @@ class EncryptedFileSystemImpl : public EncryptedFileSystem {
                               const FileOptions& options,
                               std::unique_ptr<FSWritableFile>* result,
                               IODebugContext* dbg) override {
+    // TODO xingbo Add unit test for the new implementation of
+    // EncryptedFileSysmteImpl::ReopenWritableFile.
     result->reset();
     if (options.use_mmap_reads || options.use_mmap_writes) {
       return IOStatus::InvalidArgument();
@@ -814,6 +816,15 @@ class EncryptedFileSystemImpl : public EncryptedFileSystem {
     return status;
   }
 
+  IOStatus SyncFile(const std::string& fname, const FileOptions& file_options,
+                    const IOOptions& io_options, bool use_fsync,
+                    IODebugContext* dbg) override {
+    // Use the underlying file system to sync the file, as we don't need to
+    // read/write the file.
+    return FileSystemWrapper::SyncFile(fname, file_options, io_options,
+                                       use_fsync, dbg);
+  }
+
  private:
   std::shared_ptr<EncryptionProvider> provider_;
 };
diff --git a/env/file_system.cc b/env/file_system.cc
index fad48cc1175f..5d160078965d 100644
--- a/env/file_system.cc
+++ b/env/file_system.cc
@@ -107,6 +107,23 @@ IOStatus FileSystem::ReuseWritableFile(const std::string& fname,
   return NewWritableFile(fname, opts, result, dbg);
 }
 
+IOStatus FileSystem::SyncFile(const std::string& fname,
+                              const FileOptions& file_options,
+                              const IOOptions& io_options, bool use_fsync,
+                              IODebugContext* dbg) {
+  std::unique_ptr<FSWritableFile> file_to_sync;
+  auto status = ReopenWritableFile(fname, file_options, &file_to_sync, dbg);
+  TEST_SYNC_POINT_CALLBACK("FileSystem::SyncFile:Open", &status);
+  if (status.ok()) {
+    if (use_fsync) {
+      status = file_to_sync->Fsync(io_options, dbg);
+    } else {
+      status = file_to_sync->Sync(io_options, dbg);
+    }
+  }
+  return status;
+}
+
 IOStatus FileSystem::NewLogger(const std::string& fname,
                                const IOOptions& io_opts,
                                std::shared_ptr<Logger>* result,
diff --git a/env/mock_env.cc b/env/mock_env.cc
index 0f9e5ab47f67..3088984445fe 100644
--- a/env/mock_env.cc
+++ b/env/mock_env.cc
@@ -957,6 +957,14 @@ IOStatus MockFileSystem::LinkFile(const std::string& src,
   return IOStatus::OK();
 }
 
+IOStatus MockFileSystem::SyncFile(const std::string& /*fname*/,
+                                  const FileOptions& /*file_options*/,
+                                  const IOOptions& /*io_options*/,
+                                  bool /*use_fsync*/, IODebugContext* /*dbg*/) {
+  // Noop
+  return IOStatus::OK();
+}
+
 IOStatus MockFileSystem::NewLogger(const std::string& fname,
                                    const IOOptions& io_opts,
                                    std::shared_ptr<Logger>* result,
diff --git a/env/mock_env.h b/env/mock_env.h
index 406a31f63570..040235e1ab8a 100644
--- a/env/mock_env.h
+++ b/env/mock_env.h
@@ -86,6 +86,10 @@ class MockFileSystem : public FileSystem {
   IOStatus LinkFile(const std::string& /*src*/, const std::string& /*target*/,
                     const IOOptions& /*options*/,
                     IODebugContext* /*dbg*/) override;
+  IOStatus SyncFile(const std::string& /*fname*/,
+                    const FileOptions& /*file_options*/,
+                    const IOOptions& /*io_options*/, bool /*use_fsync*/,
+                    IODebugContext* /*dbg*/) override;
   IOStatus LockFile(const std::string& fname, const IOOptions& options,
                     FileLock** lock, IODebugContext* dbg) override;
   IOStatus UnlockFile(FileLock* lock, const IOOptions& options,
diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h
index bffa22028839..58f182751d16 100644
--- a/include/rocksdb/env.h
+++ b/include/rocksdb/env.h
@@ -385,6 +385,13 @@ class Env : public Customizable {
     return Status::NotSupported("LinkFile is not supported for this Env");
   }
 
+  // Sync the file content to file system.
+  // This API is only used for testing.
+  // See FileSystem::SyncFile comment for details
+  virtual Status SyncFile(const std::string& /*fname*/,
+                          const EnvOptions& /*env_options*/,
+                          bool /*use_fsync*/);
+
   virtual Status NumFileLinks(const std::string& /*fname*/,
                               uint64_t* /*count*/) {
     return Status::NotSupported(
@@ -1543,6 +1550,11 @@ class EnvWrapper : public Env {
     return target_.env->LinkFile(s, t);
   }
 
+  Status SyncFile(const std::string& fname, const EnvOptions& env_options,
+                  bool use_fsync) override {
+    return target_.env->SyncFile(fname, env_options, use_fsync);
+  }
+
   Status NumFileLinks(const std::string& fname, uint64_t* count) override {
     return target_.env->NumFileLinks(fname, count);
   }
diff --git a/include/rocksdb/file_system.h b/include/rocksdb/file_system.h
index a68dee516679..c0a064d6639f 100644
--- a/include/rocksdb/file_system.h
+++ b/include/rocksdb/file_system.h
@@ -606,6 +606,18 @@ class FileSystem : public Customizable {
         "LinkFile is not supported for this FileSystem");
   }
 
+  // Sync the file content to file system.
+  // The default implementation would open, sync and close the file.
+  // This function could be overridden with no-op, if the file system
+  // automatically sync the data when file is closed.
+  // This is used when a user-provided file, probably unsynced, is pulled into a
+  // context where power-outage-proof persistence is required (e.g.
+  // IngestExternalFile without copy).
+  virtual IOStatus SyncFile(const std::string& fname,
+                            const FileOptions& file_options,
+                            const IOOptions& io_options, bool use_fsync,
+                            IODebugContext* dbg);
+
   virtual IOStatus NumFileLinks(const std::string& /*fname*/,
                                 const IOOptions& /*options*/,
                                 uint64_t* /*count*/, IODebugContext* /*dbg*/) {
@@ -1592,6 +1604,12 @@ class FileSystemWrapper : public FileSystem {
     return target_->LinkFile(s, t, options, dbg);
   }
 
+  IOStatus SyncFile(const std::string& fname, const FileOptions& file_options,
+                    const IOOptions& io_options, bool use_fsync,
+                    IODebugContext* dbg) override {
+    return target_->SyncFile(fname, file_options, io_options, use_fsync, dbg);
+  }
+
   IOStatus NumFileLinks(const std::string& fname, const IOOptions& options,
                         uint64_t* count, IODebugContext* dbg) override {
     return target_->NumFileLinks(fname, options, count, dbg);
diff --git a/unreleased_history/public_api_changes/new_SyncFile_api_at_FileSystem_interface b/unreleased_history/public_api_changes/new_SyncFile_api_at_FileSystem_interface
new file mode 100644
index 000000000000..6918f05f34f3
--- /dev/null
+++ b/unreleased_history/public_api_changes/new_SyncFile_api_at_FileSystem_interface
@@ -0,0 +1 @@
+A new Filesystem::SyncFile function is added for syncing a file that was already written, such as on file ingestion. The default implementation matches previous RocksDB behavior: re-open the file for read-write, sync it, and close it. We recommend overriding for FileSystems that do not require syncing for crash recovery or do not handle (well) re-opening for writes.
diff --git a/utilities/fault_injection_env.cc b/utilities/fault_injection_env.cc
index 6aedb87ab634..1bbe587f52cb 100644
--- a/utilities/fault_injection_env.cc
+++ b/utilities/fault_injection_env.cc
@@ -464,6 +464,17 @@ Status FaultInjectionTestEnv::LinkFile(const std::string& s,
   return ret;
 }
 
+Status FaultInjectionTestEnv::SyncFile(const std::string& fname,
+                                       const EnvOptions& env_options,
+                                       bool use_fsync) {
+  // Call the default implement of SyncFile API in Env, so that it would call
+  // other FileSystem API at FaultInjectionTestEnv layer for failure injection.
+  // Otherwise, the default behavior is WrapperEnv::SyncFile, which forward the
+  // call to the underlying FileSystem, instead of the ones in
+  // FaultInjectionTestEnv.
+  return Env::SyncFile(fname, env_options, use_fsync);
+}
+
 void FaultInjectionTestEnv::WritableFileClosed(const FileState& state) {
   MutexLock l(&mutex_);
   if (open_managed_files_.find(state.filename_) != open_managed_files_.end()) {
diff --git a/utilities/fault_injection_env.h b/utilities/fault_injection_env.h
index eaece031848d..fedcb2ae22ff 100644
--- a/utilities/fault_injection_env.h
+++ b/utilities/fault_injection_env.h
@@ -177,6 +177,9 @@ class FaultInjectionTestEnv : public EnvWrapper {
 
   Status LinkFile(const std::string& s, const std::string& t) override;
 
+  Status SyncFile(const std::string& fname, const EnvOptions& env_options,
+                  bool use_fsync) override;
+
 // Undef to eliminate clash on Windows
 #undef GetFreeSpace
   Status GetFreeSpace(const std::string& path, uint64_t* disk_free) override {
diff --git a/utilities/fault_injection_fs.cc b/utilities/fault_injection_fs.cc
index 24916019dd8d..143ed760c3f7 100644
--- a/utilities/fault_injection_fs.cc
+++ b/utilities/fault_injection_fs.cc
@@ -1197,6 +1197,17 @@ IOStatus FaultInjectionTestFS::LinkFile(const std::string& s,
   }
   return io_s;
 }
+IOStatus FaultInjectionTestFS::SyncFile(const std::string& fname,
+                                        const FileOptions& file_options,
+                                        const IOOptions& io_options,
+                                        bool use_fsync, IODebugContext* dbg) {
+  // Call the default implement of SyncFile API in FileSystem, so that it would
+  // call other FileSystem API at FaultInjectionTestFS layer for failure
+  // injection. Otherwise, the default behavior is calling target()->SyncFile,
+  // which forward the call to the underlying FileSystem, instead of the ones in
+  // FaultInjectionTestFS.
+  return FileSystem::SyncFile(fname, file_options, io_options, use_fsync, dbg);
+}
 
 IOStatus FaultInjectionTestFS::NumFileLinks(const std::string& fname,
                                             const IOOptions& options,
diff --git a/utilities/fault_injection_fs.h b/utilities/fault_injection_fs.h
index e2399a191663..129b3153e46a 100644
--- a/utilities/fault_injection_fs.h
+++ b/utilities/fault_injection_fs.h
@@ -302,6 +302,10 @@ class FaultInjectionTestFS : public FileSystemWrapper {
   IOStatus LinkFile(const std::string& src, const std::string& target,
                     const IOOptions& options, IODebugContext* dbg) override;
 
+  IOStatus SyncFile(const std::string& fname, const FileOptions& file_options,
+                    const IOOptions& io_options, bool use_fsync,
+                    IODebugContext* dbg) override;
+
   IOStatus NumFileLinks(const std::string& fname, const IOOptions& options,
                         uint64_t* count, IODebugContext* dbg) override;
 

From ee6b0def5573e59f710b123372a8ba5b8e2b302d Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Fri, 25 Jul 2025 13:39:25 -0700
Subject: [PATCH 197/500] Refactor, improve CompressedSecondaryCache (#13797)

Summary:
To be compatible with some upcoming compression change/refactoring where we supply a fixed size buffer to CompressBlock, we need to support CompressedSecondaryCache storing uncompressed values when the compression ratio is not suitable. It seems crazy that CompressedSecondaryCache currently stores compressed values that are *larger* than the uncompressed value, and even explicitly exercises that case (almost exclusively) in the existing unit tests. But it's true.

This change fixes that with some other nearby refactoring/improvement:
* Update the in-memory representation of these cache entries to support uncompressed entries even when compression is enabled. AFAIK this also allows us to safely get rid of "don't support custom split/merge for the tiered case".
* Use more efficient in-memory representation for non-split entries
  * For CompressionType and CacheTier, which are defined as single-byte data types, use a single byte instead of varint32. (I don't know if varint32 was an attempt at future-proofing for a memory-only schema or what.) Now using lossless_cast will raise a compiler error if either of these types is made too large for a single byte.
  * Don't wrap entries in a CacheAllocationPtr object; it's not necessary. We can rely on the same allocator being provided at delete time.
* Restructure serialization/deserialization logic, hopefully simpler or easier to read/understand.
* Use a RelaxedAtomic for disable_cache_ to avoid race.

Suggested follow-up on CompressedSecondaryCache:
* Refine the exact strategy for rejecting compressions
* Still have a lot of buffer copies; try to reduce
* Revisit the split-merge logic and try to make it more efficient overall, more unified with non-split case

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13797

Test Plan:
Unit tests updated to use actually compressible strings in many places and more testing around non-compressible string.

## Performance Test
There was some pre-existing issue causing decompression failures in compressed secondary cache with cache_bench that is somehow fixed in this change. This decompression failures were present before the new compression API, but since then cause assertion failures rather than being quietly ignored. For the "before" test here, they are back to quietly ignored. And the cache_bench changes here were  back-ported to the "before" configuration.

### No compressed secondary (setting expectations)
```
./cache_bench --cache_type=auto_hyper_clock_cache -cache_size=8000000000 -populate_cache
```
Max key             : 3906250

Before:
Complete in 12.784 s; Rough parallel ops/sec = 2503123
Thread ops/sec = 160329; Lookup hit ratio: 0.686771

After:
Complete in 12.745 s; Rough parallel ops/sec = 2510717 (in the noise)
Thread ops/sec = 159498; Lookup hit ratio: 0.68686

### Compressed secondary, no split/merge
Same max key and approximate total memory size
```
/usr/bin/time ./cache_bench --cache_type=auto_hyper_clock_cache -cache_size=4000000000 -populate_cache -resident_ratio=0.125 -compressible_to_ratio=0.4 --secondary_cache_uri=compressed_secondary_cache://capacity=4000000000
```
Before:
Complete in 18.690 s; Rough parallel ops/sec = 1712144
Thread ops/sec = 108683; Lookup hit ratio: 0.776683
Latency: P50: 4205.19 P75: 15281.76 P99: 43810.98 P99.9: 71487.41 P99.99: 165453.32
max RSS (according to /usr/bin/time): 9341856

After:
Complete in 17.878 s; Rough parallel ops/sec = 1789951 (+4.5%)
Thread ops/sec = 114957; Lookup hit ratio: 0.792998 (+0.016)
Latency: P50: 4012.70 P75: 14477.63 P99: 40039.70 P99.9: 62521.04 P99.99: 167049.18
max RSS (according to /usr/bin/time): 9235688

The improved hit ratio is probably from fixing the failed decompressions (somehow). And my modifications could have improved CPU efficiency, or it could be the small penalty the benchmark naturally imposes on most misses (generate another value and insert it).

### Compressed secondary, with split/merge
```
/usr/bin/time ./cache_bench --cache_type=auto_hyper_clock_cache -cache_size=4000000000 -populate_cache -resident_ratio=0.125 -compressible_to_ratio=0.4 --secondary_cache_uri='compressed_secondary_cache://capacity=4000000000;enable_custom_split_merge=true'
```
Before:
Complete in 20.062 s; Rough parallel ops/sec = 1595075
Thread ops/sec = 101759; Lookup hit ratio: 0.787129
Latency: P50: 5338.53 P75: 16073.46 P99: 46752.65 P99.9: 73459.11 P99.99: 201318.75
max RSS (according to /usr/bin/time): 9049852

After:
Complete in 18.564 s; Rough parallel ops/sec = 1723771 (+8.1%)
Thread ops/sec = 110724; Lookup hit ratio: 0.813414 (+0.026)
Latency: P50: 5234.75 P75: 14590.43 P99: 41401.03 P99.9: 65606.50 P99.99: 157248.04
max RSS (according to /usr/bin/time): 8917592

Looks like an improvement

Reviewed By: anand1976

Differential Revision: D78842120

Pulled By: pdillinger

fbshipit-source-id: 5f754b160c37ebee789279178ebb5e862071bdb2
---
 cache/cache_bench_tool.cc                |  21 +-
 cache/compressed_secondary_cache.cc      | 320 ++++++++++++-----------
 cache/compressed_secondary_cache.h       |  16 +-
 cache/compressed_secondary_cache_test.cc | 163 ++++++++----
 include/rocksdb/secondary_cache.h        |   4 +-
 memory/memory_allocator_impl.h           |   6 +-
 test_util/testutil.h                     |   7 +
 util/coding.h                            |   6 +-
 8 files changed, 323 insertions(+), 220 deletions(-)

diff --git a/cache/cache_bench_tool.cc b/cache/cache_bench_tool.cc
index a5e589f4689f..0e29dc67b189 100644
--- a/cache/cache_bench_tool.cc
+++ b/cache/cache_bench_tool.cc
@@ -60,6 +60,8 @@ DEFINE_uint32(value_bytes, 8 * KiB, "Size of each value added.");
 DEFINE_uint32(value_bytes_estimate, 0,
               "If > 0, overrides estimated_entry_charge or "
               "min_avg_entry_charge depending on cache_type.");
+DEFINE_double(compressible_to_ratio, 0.5,
+              "Approximate size ratio that values can be compressed to.");
 
 DEFINE_int32(
     degenerate_hash_bits, 0,
@@ -291,10 +293,19 @@ struct KeyGen {
 
 Cache::ObjectPtr createValue(Random64& rnd, MemoryAllocator* alloc) {
   char* rv = AllocateBlock(FLAGS_value_bytes, alloc).release();
-  // Fill with some filler data, and take some CPU time
-  for (uint32_t i = 0; i < FLAGS_value_bytes; i += 8) {
+  // Fill with some filler data, and take some CPU time, but add redundancy
+  // as requested for compressibility.
+  uint32_t random_fill_size = std::max(
+      uint32_t{1}, std::min(FLAGS_value_bytes,
+                            static_cast<uint32_t>(FLAGS_compressible_to_ratio *
+                                                  FLAGS_value_bytes)));
+  uint32_t i = 0;
+  for (; i < random_fill_size; i += 8) {
     EncodeFixed64(rv + i, rnd.Next());
   }
+  for (; i < FLAGS_value_bytes; i++) {
+    rv[i] = rv[i % random_fill_size];
+  }
   return rv;
 }
 
@@ -309,16 +320,16 @@ Status SaveToFn(Cache::ObjectPtr from_obj, size_t /*from_offset*/,
 
 Status CreateFn(const Slice& data, CompressionType /*type*/,
                 CacheTier /*source*/, Cache::CreateContext* /*context*/,
-                MemoryAllocator* /*allocator*/, Cache::ObjectPtr* out_obj,
+                MemoryAllocator* alloc, Cache::ObjectPtr* out_obj,
                 size_t* out_charge) {
-  *out_obj = new char[data.size()];
+  *out_obj = AllocateBlock(data.size(), alloc).release();
   memcpy(*out_obj, data.data(), data.size());
   *out_charge = data.size();
   return Status::OK();
 };
 
 void DeleteFn(Cache::ObjectPtr value, MemoryAllocator* alloc) {
-  CustomDeleter{alloc}(static_cast<char*>(value));
+  CacheAllocationDeleter{alloc}(static_cast<char*>(value));
 }
 
 Cache::CacheItemHelper helper1_wos(CacheEntryRole::kDataBlock, DeleteFn);
diff --git a/cache/compressed_secondary_cache.cc b/cache/compressed_secondary_cache.cc
index d4d505d873c4..e30e48494fbe 100644
--- a/cache/compressed_secondary_cache.cc
+++ b/cache/compressed_secondary_cache.cc
@@ -16,6 +16,31 @@
 #include "util/string_util.h"
 
 namespace ROCKSDB_NAMESPACE {
+namespace {
+// Format of values in CompressedSecondaryCache:
+// If enable_custom_split_merge:
+//  * A chain of CacheValueChunk representing the sequence of bytes for a tagged
+//    value. The overall length of the tagged value is determined by the chain
+//    of CacheValueChunks.
+// If !enable_custom_split_merge:
+//  * A LengthPrefixedSlice (starts with varint64 size) of a tagged value.
+//
+// A tagged value has a 2-byte header before the "saved" or compressed block
+// data:
+//  * 1 byte for "source" CacheTier indicating which tier is responsible for
+//    compression/decompression.
+//  * 1 byte for compression type which is generated/used by
+//    CompressedSecondaryCache iff source == CacheTier::kVolatileCompressedTier
+//    (original entry passed in was uncompressed). Otherwise, the compression
+//    type is preserved from the entry passed in.
+constexpr uint32_t kTagSize = 2;
+
+// Size of tag + varint size prefix when applicable
+uint32_t GetHeaderSize(size_t data_size, bool enable_split_merge) {
+  return (enable_split_merge ? 0 : VarintLength(kTagSize + data_size)) +
+         kTagSize;
+}
+}  // namespace
 
 CompressedSecondaryCache::CompressedSecondaryCache(
     const CompressedSecondaryCacheOptions& opts)
@@ -40,13 +65,9 @@ std::unique_ptr<SecondaryCacheResultHandle> CompressedSecondaryCache::Lookup(
     Cache::CreateContext* create_context, bool /*wait*/, bool advise_erase,
     Statistics* stats, bool& kept_in_sec_cache) {
   assert(helper);
-  // This is a minor optimization. Its ok to skip it in TSAN in order to
-  // avoid a false positive.
-#ifndef __SANITIZE_THREAD__
-  if (disable_cache_) {
+  if (disable_cache_.LoadRelaxed()) {
     return nullptr;
   }
-#endif
 
   std::unique_ptr<SecondaryCacheResultHandle> handle;
   kept_in_sec_cache = false;
@@ -62,75 +83,53 @@ std::unique_ptr<SecondaryCacheResultHandle> CompressedSecondaryCache::Lookup(
     return nullptr;
   }
 
-  CacheAllocationPtr* ptr{nullptr};
-  CacheAllocationPtr merged_value;
-  size_t handle_value_charge{0};
-  const char* data_ptr = nullptr;
-  CacheTier source = CacheTier::kVolatileCompressedTier;
-  CompressionType type = cache_options_.compression_type;
+  std::string merged_value;
+  Slice tagged_data;
   if (cache_options_.enable_custom_split_merge) {
     CacheValueChunk* value_chunk_ptr =
-        reinterpret_cast<CacheValueChunk*>(handle_value);
-    merged_value = MergeChunksIntoValue(value_chunk_ptr, handle_value_charge);
-    ptr = &merged_value;
-    data_ptr = ptr->get();
+        static_cast<CacheValueChunk*>(handle_value);
+    merged_value = MergeChunksIntoValue(value_chunk_ptr);
+    tagged_data = Slice(merged_value);
   } else {
-    uint32_t type_32 = static_cast<uint32_t>(type);
-    uint32_t source_32 = static_cast<uint32_t>(source);
-    ptr = reinterpret_cast<CacheAllocationPtr*>(handle_value);
-    handle_value_charge = cache_->GetCharge(lru_handle);
-    data_ptr = ptr->get();
-    const char* limit = ptr->get() + handle_value_charge;
-    data_ptr =
-        GetVarint32Ptr(data_ptr, limit, static_cast<uint32_t*>(&type_32));
-    type = static_cast<CompressionType>(type_32);
-    data_ptr =
-        GetVarint32Ptr(data_ptr, limit, static_cast<uint32_t*>(&source_32));
-    source = static_cast<CacheTier>(source_32);
-    uint64_t data_size = 0;
-    data_ptr =
-        GetVarint64Ptr(data_ptr, limit, static_cast<uint64_t*>(&data_size));
-    assert(handle_value_charge > data_size);
-    handle_value_charge = data_size;
+    tagged_data = GetLengthPrefixedSlice(static_cast<char*>(handle_value));
   }
-  MemoryAllocator* allocator = cache_options_.memory_allocator.get();
 
-  Status s;
-  Cache::ObjectPtr value{nullptr};
-  size_t charge{0};
+  auto source = lossless_cast<CacheTier>(tagged_data[0]);
+  auto type = lossless_cast<CompressionType>(tagged_data[1]);
+
+  std::unique_ptr<char[]> uncompressed;
+  Slice saved(tagged_data.data() + kTagSize, tagged_data.size() - kTagSize);
   if (source == CacheTier::kVolatileCompressedTier) {
-    if (cache_options_.compression_type == kNoCompression ||
-        cache_options_.do_not_compress_roles.Contains(helper->role)) {
-      s = helper->create_cb(Slice(data_ptr, handle_value_charge),
-                            kNoCompression, CacheTier::kVolatileTier,
-                            create_context, allocator, &value, &charge);
-    } else {
-      // TODO: can we work some magic with create_cb, which might be based on
-      // custom compression, to decompress without an extra copy in create_cb?
+    if (type != kNoCompression) {
+      // TODO: can we do something to avoid yet another allocation?
       Decompressor::Args args;
-      args.compressed_data = Slice(data_ptr, handle_value_charge);
-      args.compression_type = cache_options_.compression_type;
-      s = decompressor_->ExtractUncompressedSize(args);
-      assert(s.ok());
+      args.compressed_data = saved;
+      args.compression_type = type;
+      Status s = decompressor_->ExtractUncompressedSize(args);
+      assert(s.ok());  // in-memory data
       if (s.ok()) {
-        auto uncompressed = std::make_unique<char[]>(args.uncompressed_size);
+        uncompressed = std::make_unique<char[]>(args.uncompressed_size);
         s = decompressor_->DecompressBlock(args, uncompressed.get());
-        assert(s.ok());
-        if (s.ok()) {
-          s = helper->create_cb(
-              Slice(uncompressed.get(), args.uncompressed_size), kNoCompression,
-              CacheTier::kVolatileTier, create_context, allocator, &value,
-              &charge);
-        }
+        assert(s.ok());  // in-memory data
       }
+      if (!s.ok()) {
+        cache_->Release(lru_handle, /*erase_if_last_ref=*/true);
+        return nullptr;
+      }
+      saved = Slice(uncompressed.get(), args.uncompressed_size);
+      type = kNoCompression;
+      // Free temporary compressed data
+      merged_value = std::string();
     }
-  } else {
-    // The item was not compressed by us. Let the helper create_cb
-    // uncompress it
-    s = helper->create_cb(Slice(data_ptr, handle_value_charge), type, source,
-                          create_context, allocator, &value, &charge);
+    // Reduced as if it came from primary cache
+    source = CacheTier::kVolatileTier;
   }
 
+  Cache::ObjectPtr result_value = nullptr;
+  size_t result_charge = 0;
+  Status s = helper->create_cb(saved, type, source, create_context,
+                               cache_options_.memory_allocator.get(),
+                               &result_value, &result_charge);
   if (!s.ok()) {
     cache_->Release(lru_handle, /*erase_if_last_ref=*/true);
     return nullptr;
@@ -148,7 +147,8 @@ std::unique_ptr<SecondaryCacheResultHandle> CompressedSecondaryCache::Lookup(
     kept_in_sec_cache = true;
     cache_->Release(lru_handle, /*erase_if_last_ref=*/false);
   }
-  handle.reset(new CompressedSecondaryCacheResultHandle(value, charge));
+  handle.reset(
+      new CompressedSecondaryCacheResultHandle(result_value, result_charge));
   RecordTick(stats, COMPRESSED_SECONDARY_CACHE_HITS);
   return handle;
 }
@@ -171,85 +171,107 @@ bool CompressedSecondaryCache::MaybeInsertDummy(const Slice& key) {
 
 Status CompressedSecondaryCache::InsertInternal(
     const Slice& key, Cache::ObjectPtr value,
-    const Cache::CacheItemHelper* helper, CompressionType type,
+    const Cache::CacheItemHelper* helper, CompressionType from_type,
     CacheTier source) {
-  if (source != CacheTier::kVolatileCompressedTier &&
-      cache_options_.enable_custom_split_merge) {
-    // We don't support custom split/merge for the tiered case
-    return Status::OK();
-  }
-
-  auto internal_helper = GetHelper(cache_options_.enable_custom_split_merge);
-  char header[20];
-  char* payload = header;
-  payload = EncodeVarint32(payload, static_cast<uint32_t>(type));
-  payload = EncodeVarint32(payload, static_cast<uint32_t>(source));
-  size_t data_size = (*helper->size_cb)(value);
-  char* data_size_ptr = payload;
-  payload = EncodeVarint64(payload, data_size);
-
-  size_t header_size = payload - header;
-  size_t total_size = data_size + header_size;
-  CacheAllocationPtr ptr =
-      AllocateBlock(total_size, cache_options_.memory_allocator.get());
-  char* data_ptr = ptr.get() + header_size;
-
-  Status s = (*helper->saveto_cb)(value, 0, data_size, data_ptr);
+  bool enable_split_merge = cache_options_.enable_custom_split_merge;
+  const Cache::CacheItemHelper* internal_helper = GetHelper(enable_split_merge);
+
+  // TODO: variant of size_cb that also returns a pointer to the data if
+  // already available. Saves an allocation if we keep the compressed version.
+  const size_t data_size_original = (*helper->size_cb)(value);
+
+  // Allocate enough memory for header/tag + original data because (a) we might
+  // not be attempting compression at all, and (b) we might keep the original if
+  // compression is insufficient. But we don't need the length prefix with
+  // enable_split_merge. TODO: be smarter with CacheValueChunk to save an
+  // allocation in the enable_split_merge case.
+  size_t header_size = GetHeaderSize(data_size_original, enable_split_merge);
+  CacheAllocationPtr allocation = AllocateBlock(
+      header_size + data_size_original, cache_options_.memory_allocator.get());
+  char* data_ptr = allocation.get() + header_size;
+  Slice tagged_data(data_ptr - kTagSize, data_size_original + kTagSize);
+  assert(tagged_data.data() >= allocation.get());
+
+  Status s = (*helper->saveto_cb)(value, 0, data_size_original, data_ptr);
   if (!s.ok()) {
     return s;
   }
-  Slice val(data_ptr, data_size);
 
-  std::string compressed_val;
-  if (cache_options_.compression_type != kNoCompression &&
-      type == kNoCompression &&
+  std::string data_compressed;
+  CompressionType to_type = kNoCompression;
+  if (compressor_ && from_type == kNoCompression &&
       !cache_options_.do_not_compress_roles.Contains(helper->role)) {
-    PERF_COUNTER_ADD(compressed_sec_cache_uncompressed_bytes, data_size);
-
-    CompressionType to_type = kNoCompression;
-    s = compressor_->CompressBlock(val, &compressed_val, &to_type,
+    assert(source == CacheTier::kVolatileCompressedTier);
+    s = compressor_->CompressBlock(Slice(data_ptr, data_size_original),
+                                   &data_compressed, &to_type,
                                    nullptr /*working_area*/);
     if (!s.ok()) {
       return s;
     }
-    // TODO: allow values not compressed when there's no size savings?
-    assert(to_type == cache_options_.compression_type);
-    if (to_type != cache_options_.compression_type) {
-      return Status::Corruption("Failed to compress value.");
-    }
-
-    val = Slice(compressed_val);
-    data_size = compressed_val.size();
-    payload = EncodeVarint64(data_size_ptr, data_size);
-    header_size = payload - header;
-    total_size = header_size + data_size;
-    PERF_COUNTER_ADD(compressed_sec_cache_compressed_bytes, data_size);
-
-    if (!cache_options_.enable_custom_split_merge) {
-      ptr = AllocateBlock(total_size, cache_options_.memory_allocator.get());
-      data_ptr = ptr.get() + header_size;
-      memcpy(data_ptr, compressed_val.data(), data_size);
+    PERF_COUNTER_ADD(compressed_sec_cache_uncompressed_bytes,
+                     data_size_original);
+    // TOOD: improve compression sufficiency check
+    if (to_type == kNoCompression ||
+        data_compressed.size() >= data_size_original) {
+      // Compression rejected
+      to_type = kNoCompression;
+      data_compressed.clear();
+      // TODO: consider separate counters for rejected compressions
+      PERF_COUNTER_ADD(compressed_sec_cache_compressed_bytes,
+                       data_size_original);
+    } else {
+      size_t data_size_compressed = data_compressed.size();
+      PERF_COUNTER_ADD(compressed_sec_cache_compressed_bytes,
+                       data_size_compressed);
+      if (enable_split_merge) {
+        // Only need tagged_data for copying into CacheValueChunks. Insert
+        // space for tag.
+        // TODO: improve efficiency of this case (will be fixed with update to
+        // CompressBlock API)
+        data_compressed.insert(/*pos=*/0, /*n=*/kTagSize, char{});
+        tagged_data = data_compressed;
+        allocation.reset();
+      } else {
+        // Replace allocation with compressed version, copied from string
+        header_size = GetHeaderSize(data_size_compressed, enable_split_merge);
+        allocation = AllocateBlock(header_size + data_size_compressed,
+                                   cache_options_.memory_allocator.get());
+        data_ptr = allocation.get() + header_size;
+        std::memcpy(data_ptr, data_compressed.data(), data_size_compressed);
+        tagged_data =
+            Slice(data_ptr - kTagSize, data_size_compressed + kTagSize);
+        assert(tagged_data.data() >= allocation.get());
+      }
     }
   }
 
   PERF_COUNTER_ADD(compressed_sec_cache_insert_real_count, 1);
-  if (cache_options_.enable_custom_split_merge) {
+
+  // Save the tag fields
+  const_cast<char*>(tagged_data.data())[0] = lossless_cast<char>(source);
+  const_cast<char*>(tagged_data.data())[1] = lossless_cast<char>(
+      source == CacheTier::kVolatileCompressedTier ? to_type : from_type);
+
+  if (enable_split_merge) {
     size_t split_charge{0};
-    CacheValueChunk* value_chunks_head = SplitValueIntoChunks(
-        val, cache_options_.compression_type, split_charge);
-    return cache_->Insert(key, value_chunks_head, internal_helper,
-                          split_charge);
+    CacheValueChunk* value_chunks_head =
+        SplitValueIntoChunks(tagged_data, split_charge);
+    s = cache_->Insert(key, value_chunks_head, internal_helper, split_charge);
+    assert(s.ok());  // LRUCache::Insert() with handle==nullptr always OK
   } else {
+    // Save the size prefix
+    char* ptr = allocation.get();
+    ptr = EncodeVarint64(ptr, tagged_data.size());
+    assert(ptr == tagged_data.data());
 #ifdef ROCKSDB_MALLOC_USABLE_SIZE
-    size_t charge = malloc_usable_size(ptr.get());
+    size_t charge = malloc_usable_size(allocation.get());
 #else
-    size_t charge = total_size;
+    size_t charge = tagged_data.size();
 #endif
-    std::memcpy(ptr.get(), header, header_size);
-    CacheAllocationPtr* buf = new CacheAllocationPtr(std::move(ptr));
-    charge += sizeof(CacheAllocationPtr);
-    return cache_->Insert(key, buf, internal_helper, charge);
+    s = cache_->Insert(key, allocation.release(), internal_helper, charge);
+    assert(s.ok());  // LRUCache::Insert() with handle==nullptr always OK
   }
+  return Status::OK();
 }
 
 Status CompressedSecondaryCache::Insert(const Slice& key,
@@ -271,7 +293,12 @@ Status CompressedSecondaryCache::Insert(const Slice& key,
 Status CompressedSecondaryCache::InsertSaved(
     const Slice& key, const Slice& saved, CompressionType type = kNoCompression,
     CacheTier source = CacheTier::kVolatileTier) {
-  if (type == kNoCompression) {
+  if (type == kNoCompression || source == CacheTier::kVolatileCompressedTier) {
+    assert(source != CacheTier::kVolatileCompressedTier);
+    return Status::OK();
+  }
+  if (cache_options_.enable_custom_split_merge) {
+    // We don't support custom split/merge for the tiered case
     return Status::OK();
   }
 
@@ -291,7 +318,7 @@ Status CompressedSecondaryCache::SetCapacity(size_t capacity) {
   MutexLock l(&capacity_mutex_);
   cache_options_.capacity = capacity;
   cache_->SetCapacity(capacity);
-  disable_cache_ = capacity == 0;
+  disable_cache_.StoreRelaxed(capacity == 0);
   return Status::OK();
 }
 
@@ -321,9 +348,14 @@ std::string CompressedSecondaryCache::GetPrintableOptions() const {
   return ret;
 }
 
+// FIXME: this could use a lot of attention, including:
+// * Use allocator
+// * We shouldn't be worse than non-split; be more pro-actively aware of
+// internal fragmentation
+// * Consider a unified object/chunk structure that may or may not split
+// * Optimize size overhead of chunks
 CompressedSecondaryCache::CacheValueChunk*
 CompressedSecondaryCache::SplitValueIntoChunks(const Slice& value,
-                                               CompressionType compression_type,
                                                size_t& charge) {
   assert(!value.empty());
   const char* src_ptr = value.data();
@@ -344,15 +376,14 @@ CompressedSecondaryCache::SplitValueIntoChunks(const Slice& value,
     // size, or there is no compression.
     if (upper == malloc_bin_sizes_.begin() ||
         upper == malloc_bin_sizes_.end() ||
-        *upper - predicted_chunk_size < malloc_bin_sizes_.front() ||
-        compression_type == kNoCompression) {
+        *upper - predicted_chunk_size < malloc_bin_sizes_.front()) {
       tmp_size = predicted_chunk_size;
     } else {
       tmp_size = *(--upper);
     }
 
     CacheValueChunk* new_chunk =
-        reinterpret_cast<CacheValueChunk*>(new char[tmp_size]);
+        static_cast<CacheValueChunk*>(static_cast<void*>(new char[tmp_size]));
     current_chunk->next = new_chunk;
     current_chunk = current_chunk->next;
     actual_chunk_size = tmp_size - sizeof(CacheValueChunk) + 1;
@@ -367,28 +398,24 @@ CompressedSecondaryCache::SplitValueIntoChunks(const Slice& value,
   return dummy_head.next;
 }
 
-CacheAllocationPtr CompressedSecondaryCache::MergeChunksIntoValue(
-    const void* chunks_head, size_t& charge) {
-  const CacheValueChunk* head =
-      reinterpret_cast<const CacheValueChunk*>(chunks_head);
+std::string CompressedSecondaryCache::MergeChunksIntoValue(
+    const CacheValueChunk* head) {
   const CacheValueChunk* current_chunk = head;
-  charge = 0;
+  size_t total_size = 0;
   while (current_chunk != nullptr) {
-    charge += current_chunk->size;
+    total_size += current_chunk->size;
     current_chunk = current_chunk->next;
   }
 
-  CacheAllocationPtr ptr =
-      AllocateBlock(charge, cache_options_.memory_allocator.get());
+  std::string result;
+  result.reserve(total_size);
   current_chunk = head;
-  size_t pos{0};
   while (current_chunk != nullptr) {
-    memcpy(ptr.get() + pos, current_chunk->data, current_chunk->size);
-    pos += current_chunk->size;
+    result.append(current_chunk->data, current_chunk->size);
     current_chunk = current_chunk->next;
   }
-
-  return ptr;
+  assert(result.size() == total_size);
+  return result;
 }
 
 const Cache::CacheItemHelper* CompressedSecondaryCache::GetHelper(
@@ -402,16 +429,16 @@ const Cache::CacheItemHelper* CompressedSecondaryCache::GetHelper(
             CacheValueChunk* tmp_chunk = chunks_head;
             chunks_head = chunks_head->next;
             tmp_chunk->Free();
-            obj = nullptr;
           }
         }};
     return &kHelper;
   } else {
     static const Cache::CacheItemHelper kHelper{
         CacheEntryRole::kMisc,
-        [](Cache::ObjectPtr obj, MemoryAllocator* /*alloc*/) {
-          delete static_cast<CacheAllocationPtr*>(obj);
-          obj = nullptr;
+        [](Cache::ObjectPtr obj, MemoryAllocator* alloc) {
+          if (obj != nullptr) {
+            CacheAllocationDeleter{alloc}(static_cast<char*>(obj));
+          }
         }};
     return &kHelper;
   }
@@ -422,12 +449,7 @@ size_t CompressedSecondaryCache::TEST_GetCharge(const Slice& key) {
   if (lru_handle == nullptr) {
     return 0;
   }
-
   size_t charge = cache_->GetCharge(lru_handle);
-  if (cache_->Value(lru_handle) != nullptr &&
-      !cache_options_.enable_custom_split_merge) {
-    charge -= 10;
-  }
   cache_->Release(lru_handle, /*erase_if_last_ref=*/false);
   return charge;
 }
diff --git a/cache/compressed_secondary_cache.h b/cache/compressed_secondary_cache.h
index f66d9a0ffe78..52b3d84b6dda 100644
--- a/cache/compressed_secondary_cache.h
+++ b/cache/compressed_secondary_cache.h
@@ -10,13 +10,12 @@
 #include <memory>
 
 #include "cache/cache_reservation_manager.h"
-#include "cache/lru_cache.h"
 #include "memory/memory_allocator_impl.h"
+#include "rocksdb/advanced_compression.h"
 #include "rocksdb/secondary_cache.h"
 #include "rocksdb/slice.h"
 #include "rocksdb/status.h"
-#include "util/compression.h"
-#include "util/mutexlock.h"
+#include "util/atomic.h"
 
 namespace ROCKSDB_NAMESPACE {
 
@@ -124,14 +123,9 @@ class CompressedSecondaryCache : public SecondaryCache {
   // Split value into chunks to better fit into jemalloc bins. The chunks
   // are stored in CacheValueChunk and extra charge is needed for each chunk,
   // so the cache charge is recalculated here.
-  CacheValueChunk* SplitValueIntoChunks(const Slice& value,
-                                        CompressionType compression_type,
-                                        size_t& charge);
+  CacheValueChunk* SplitValueIntoChunks(const Slice& value, size_t& charge);
 
-  // After merging chunks, the extra charge for each chunk is removed, so
-  // the charge is recalculated.
-  CacheAllocationPtr MergeChunksIntoValue(const void* chunks_head,
-                                          size_t& charge);
+  std::string MergeChunksIntoValue(const CacheValueChunk* head);
 
   bool MaybeInsertDummy(const Slice& key);
 
@@ -149,7 +143,7 @@ class CompressedSecondaryCache : public SecondaryCache {
   std::shared_ptr<Decompressor> decompressor_;
   mutable port::Mutex capacity_mutex_;
   std::shared_ptr<ConcurrentCacheReservationManager> cache_res_mgr_;
-  bool disable_cache_;
+  RelaxedAtomic<bool> disable_cache_;
 };
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/cache/compressed_secondary_cache_test.cc b/cache/compressed_secondary_cache_test.cc
index df319390eedb..ebd7759bfd0b 100644
--- a/cache/compressed_secondary_cache_test.cc
+++ b/cache/compressed_secondary_cache_test.cc
@@ -24,6 +24,14 @@ namespace ROCKSDB_NAMESPACE {
 using secondary_cache_test_util::GetTestingCacheTypes;
 using secondary_cache_test_util::WithCacheType;
 
+// Read and reset a statistic
+template <typename T>
+T Pop(T& var) {
+  T ret = var;
+  var = T();
+  return ret;
+}
+
 // 16 bytes for HCC compatibility
 const std::string key0 = "____    ____key0";
 const std::string key1 = "____    ____key1";
@@ -51,7 +59,7 @@ class CompressedSecondaryCacheTestBase : public testing::Test,
 
     Random rnd(301);
     // Insert and Lookup the item k1 for the first time.
-    std::string str1(rnd.RandomString(1000));
+    std::string str1 = test::CompressibleString(&rnd, 0.5, 1000);
     TestItem item1(str1.data(), str1.length());
     // A dummy handle is inserted if the item is inserted for the first time.
     ASSERT_OK(sec_cache->Insert(key1, &item1, GetHelper(), false));
@@ -68,7 +76,14 @@ class CompressedSecondaryCacheTestBase : public testing::Test,
     ASSERT_OK(sec_cache->Insert(key1, &item1, GetHelper(), false));
     ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 1);
 
-    ASSERT_GT(comp_sec_cache->TEST_GetCharge(key1), 1000);
+    if (sec_cache_is_compressed) {
+      ASSERT_GT(comp_sec_cache->TEST_GetCharge(key1), str1.length() / 4);
+      ASSERT_LT(comp_sec_cache->TEST_GetCharge(key1), str1.length() * 3 / 4);
+    } else {
+      ASSERT_GE(comp_sec_cache->TEST_GetCharge(key1), str1.length());
+      // NOTE: split-merge is worse (1048 vs. 1024)
+      ASSERT_LE(comp_sec_cache->TEST_GetCharge(key1), 1048U);
+    }
 
     std::unique_ptr<SecondaryCacheResultHandle> handle1_2 =
         sec_cache->Lookup(key1, GetHelper(), this, true, /*advise_erase=*/true,
@@ -76,10 +91,13 @@ class CompressedSecondaryCacheTestBase : public testing::Test,
     ASSERT_NE(handle1_2, nullptr);
     ASSERT_FALSE(kept_in_sec_cache);
     if (sec_cache_is_compressed) {
-      ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes,
-                1000);
-      ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes,
-                1007);
+      ASSERT_EQ(
+          Pop(get_perf_context()->compressed_sec_cache_uncompressed_bytes),
+          str1.length());
+      ASSERT_LT(get_perf_context()->compressed_sec_cache_compressed_bytes,
+                str1.length() * 3 / 4);
+      ASSERT_GT(Pop(get_perf_context()->compressed_sec_cache_compressed_bytes),
+                str1.length() / 4);
     } else {
       ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes, 0);
       ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes, 0);
@@ -97,7 +115,7 @@ class CompressedSecondaryCacheTestBase : public testing::Test,
     ASSERT_EQ(handle1_3, nullptr);
 
     // Insert and Lookup the item k2.
-    std::string str2(rnd.RandomString(1000));
+    std::string str2 = test::CompressibleString(&rnd, 0.5, 1017);
     TestItem item2(str2.data(), str2.length());
     ASSERT_OK(sec_cache->Insert(key2, &item2, GetHelper(), false));
     ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 2);
@@ -109,10 +127,13 @@ class CompressedSecondaryCacheTestBase : public testing::Test,
     ASSERT_OK(sec_cache->Insert(key2, &item2, GetHelper(), false));
     ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 2);
     if (sec_cache_is_compressed) {
-      ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes,
-                2000);
-      ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes,
-                2014);
+      ASSERT_EQ(
+          Pop(get_perf_context()->compressed_sec_cache_uncompressed_bytes),
+          str2.length());
+      ASSERT_LT(get_perf_context()->compressed_sec_cache_compressed_bytes,
+                str2.length() * 3 / 4);
+      ASSERT_GT(Pop(get_perf_context()->compressed_sec_cache_compressed_bytes),
+                str2.length() / 4);
     } else {
       ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes, 0);
       ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes, 0);
@@ -126,9 +147,48 @@ class CompressedSecondaryCacheTestBase : public testing::Test,
     ASSERT_NE(val2, nullptr);
     ASSERT_EQ(memcmp(val2->Buf(), item2.Buf(), item2.Size()), 0);
 
+    // Release handles
     std::vector<SecondaryCacheResultHandle*> handles = {handle1_2.get(),
                                                         handle2_2.get()};
     sec_cache->WaitAll(handles);
+    handle1_2.reset();
+    handle2_2.reset();
+
+    // Insert and Lookup a non-compressible item k3.
+    std::string str3 = rnd.RandomBinaryString(480);
+    TestItem item3(str3.data(), str3.length());
+    ASSERT_OK(sec_cache->Insert(key3, &item3, GetHelper(), false));
+    ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 3);
+    std::unique_ptr<SecondaryCacheResultHandle> handle3_1 =
+        sec_cache->Lookup(key3, GetHelper(), this, true, /*advise_erase=*/false,
+                          /*stats=*/nullptr, kept_in_sec_cache);
+    ASSERT_EQ(handle3_1, nullptr);
+
+    ASSERT_OK(sec_cache->Insert(key3, &item3, GetHelper(), false));
+    ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 3);
+    if (sec_cache_is_compressed) {
+      // TODO: consider a compression rejected stat?
+      ASSERT_EQ(
+          Pop(get_perf_context()->compressed_sec_cache_uncompressed_bytes),
+          str3.length());
+      ASSERT_EQ(Pop(get_perf_context()->compressed_sec_cache_compressed_bytes),
+                str3.length());
+    } else {
+      ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes, 0);
+      ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes, 0);
+    }
+
+    std::unique_ptr<SecondaryCacheResultHandle> handle3_2 =
+        sec_cache->Lookup(key3, GetHelper(), this, true, /*advise_erase=*/false,
+                          /*stats=*/nullptr, kept_in_sec_cache);
+    ASSERT_NE(handle3_2, nullptr);
+    std::unique_ptr<TestItem> val3 =
+        std::unique_ptr<TestItem>(static_cast<TestItem*>(handle3_2->Value()));
+    ASSERT_NE(val3, nullptr);
+    ASSERT_EQ(memcmp(val3->Buf(), item3.Buf(), item3.Size()), 0);
+
+    EXPECT_GE(comp_sec_cache->TEST_GetCharge(key3), str3.length());
+    EXPECT_LE(comp_sec_cache->TEST_GetCharge(key3), 512);
 
     sec_cache.reset();
   }
@@ -178,8 +238,9 @@ class CompressedSecondaryCacheTestBase : public testing::Test,
       secondary_cache_opts.compression_type = CompressionType::kNoCompression;
     }
 
-    secondary_cache_opts.capacity = 1100;
+    secondary_cache_opts.capacity = 1400;
     secondary_cache_opts.num_shard_bits = 0;
+    secondary_cache_opts.strict_capacity_limit = true;
     std::shared_ptr<SecondaryCache> sec_cache =
         NewCompressedSecondaryCache(secondary_cache_opts);
 
@@ -193,7 +254,7 @@ class CompressedSecondaryCacheTestBase : public testing::Test,
     ASSERT_OK(sec_cache->Insert(key1, &item1, GetHelper(), false));
 
     // Insert and Lookup the second item.
-    std::string str2(rnd.RandomString(200));
+    std::string str2(rnd.RandomString(500));
     TestItem item2(str2.data(), str2.length());
     // Insert a dummy handle, k1 is not evicted.
     ASSERT_OK(sec_cache->Insert(key2, &item2, GetHelper(), false));
@@ -201,16 +262,23 @@ class CompressedSecondaryCacheTestBase : public testing::Test,
     std::unique_ptr<SecondaryCacheResultHandle> handle1 =
         sec_cache->Lookup(key1, GetHelper(), this, true, /*advise_erase=*/false,
                           /*stats=*/nullptr, kept_in_sec_cache);
-    ASSERT_EQ(handle1, nullptr);
+    ASSERT_NE(handle1, nullptr);
+    std::unique_ptr<TestItem> val1{static_cast<TestItem*>(handle1->Value())};
+    ASSERT_NE(val1, nullptr);
+    ASSERT_EQ(val1->ToString(), str1);
+    handle1.reset();
 
     // Insert k2 and k1 is evicted.
     ASSERT_OK(sec_cache->Insert(key2, &item2, GetHelper(), false));
+    handle1 =
+        sec_cache->Lookup(key1, GetHelper(), this, true, /*advise_erase=*/false,
+                          /*stats=*/nullptr, kept_in_sec_cache);
+    ASSERT_EQ(handle1, nullptr);
     std::unique_ptr<SecondaryCacheResultHandle> handle2 =
         sec_cache->Lookup(key2, GetHelper(), this, true, /*advise_erase=*/false,
                           /*stats=*/nullptr, kept_in_sec_cache);
     ASSERT_NE(handle2, nullptr);
-    std::unique_ptr<TestItem> val2 =
-        std::unique_ptr<TestItem>(static_cast<TestItem*>(handle2->Value()));
+    std::unique_ptr<TestItem> val2{static_cast<TestItem*>(handle2->Value())};
     ASSERT_NE(val2, nullptr);
     ASSERT_EQ(memcmp(val2->Buf(), item2.Buf(), item2.Size()), 0);
 
@@ -232,7 +300,7 @@ class CompressedSecondaryCacheTestBase : public testing::Test,
     // Save Fails.
     std::string str3 = rnd.RandomString(10);
     TestItem item3(str3.data(), str3.length());
-    // The Status is OK because a dummy handle is inserted.
+    // The first Status is OK because a dummy handle is inserted.
     ASSERT_OK(sec_cache->Insert(key3, &item3, GetHelperFail(), false));
     ASSERT_NOK(sec_cache->Insert(key3, &item3, GetHelperFail(), false));
 
@@ -265,11 +333,11 @@ class CompressedSecondaryCacheTestBase : public testing::Test,
 
     get_perf_context()->Reset();
     Random rnd(301);
-    std::string str1 = rnd.RandomString(1001);
+    std::string str1 = test::CompressibleString(&rnd, 0.5, 1001);
     auto item1_1 = new TestItem(str1.data(), str1.length());
     ASSERT_OK(cache->Insert(key1, item1_1, GetHelper(), str1.length()));
 
-    std::string str2 = rnd.RandomString(1012);
+    std::string str2 = test::CompressibleString(&rnd, 0.5, 1012);
     auto item2_1 = new TestItem(str2.data(), str2.length());
     // After this Insert, primary cache contains k2 and secondary cache contains
     // k1's dummy item.
@@ -278,7 +346,7 @@ class CompressedSecondaryCacheTestBase : public testing::Test,
     ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes, 0);
     ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes, 0);
 
-    std::string str3 = rnd.RandomString(1024);
+    std::string str3 = test::CompressibleString(&rnd, 0.5, 1024);
     auto item3_1 = new TestItem(str3.data(), str3.length());
     // After this Insert, primary cache contains k3 and secondary cache contains
     // k1's dummy item and k2's dummy item.
@@ -297,10 +365,13 @@ class CompressedSecondaryCacheTestBase : public testing::Test,
     ASSERT_OK(cache->Insert(key2, item2_2, GetHelper(), str2.length()));
     ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 1);
     if (sec_cache_is_compressed) {
-      ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes,
+      ASSERT_EQ(
+          Pop(get_perf_context()->compressed_sec_cache_uncompressed_bytes),
+          str1.length());
+      ASSERT_LT(get_perf_context()->compressed_sec_cache_compressed_bytes,
                 str1.length());
-      ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes,
-                1008);
+      ASSERT_GT(Pop(get_perf_context()->compressed_sec_cache_compressed_bytes),
+                str1.length() / 10);
     } else {
       ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes, 0);
       ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes, 0);
@@ -312,10 +383,13 @@ class CompressedSecondaryCacheTestBase : public testing::Test,
     ASSERT_OK(cache->Insert(key3, item3_2, GetHelper(), str3.length()));
     ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 2);
     if (sec_cache_is_compressed) {
-      ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes,
-                str1.length() + str2.length());
-      ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes,
-                2027);
+      ASSERT_EQ(
+          Pop(get_perf_context()->compressed_sec_cache_uncompressed_bytes),
+          str2.length());
+      ASSERT_LT(get_perf_context()->compressed_sec_cache_compressed_bytes,
+                str2.length());
+      ASSERT_GT(Pop(get_perf_context()->compressed_sec_cache_compressed_bytes),
+                str2.length() / 10);
     } else {
       ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes, 0);
       ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes, 0);
@@ -641,8 +715,7 @@ class CompressedSecondaryCacheTestBase : public testing::Test,
     size_t str_size{8500};
     std::string str = rnd.RandomString(static_cast<int>(str_size));
     size_t charge{0};
-    CacheValueChunk* chunks_head =
-        sec_cache->SplitValueIntoChunks(str, kLZ4Compression, charge);
+    CacheValueChunk* chunks_head = sec_cache->SplitValueIntoChunks(str, charge);
     ASSERT_EQ(charge, str_size + 3 * (sizeof(CacheValueChunk) - 1));
 
     CacheValueChunk* current_chunk = chunks_head;
@@ -688,12 +761,9 @@ class CompressedSecondaryCacheTestBase : public testing::Test,
     std::unique_ptr<CompressedSecondaryCache> sec_cache =
         std::make_unique<CompressedSecondaryCache>(
             CompressedSecondaryCacheOptions(1000, 0, true, 0.5, 0.0));
-    size_t charge{0};
-    CacheAllocationPtr value =
-        sec_cache->MergeChunksIntoValue(chunks_head, charge);
-    ASSERT_EQ(charge, size1 + size2 + size3);
-    std::string value_str{value.get(), charge};
-    ASSERT_EQ(strcmp(value_str.data(), str.data()), 0);
+    std::string value_str = sec_cache->MergeChunksIntoValue(chunks_head);
+    ASSERT_EQ(value_str.size(), size1 + size2 + size3);
+    ASSERT_EQ(value_str, str);
 
     while (chunks_head != nullptr) {
       CacheValueChunk* tmp_chunk = chunks_head;
@@ -725,15 +795,12 @@ class CompressedSecondaryCacheTestBase : public testing::Test,
     size_t str_size{8500};
     std::string str = rnd.RandomString(static_cast<int>(str_size));
     size_t charge{0};
-    CacheValueChunk* chunks_head =
-        sec_cache->SplitValueIntoChunks(str, kLZ4Compression, charge);
+    CacheValueChunk* chunks_head = sec_cache->SplitValueIntoChunks(str, charge);
     ASSERT_EQ(charge, str_size + 3 * (sizeof(CacheValueChunk) - 1));
 
-    CacheAllocationPtr value =
-        sec_cache->MergeChunksIntoValue(chunks_head, charge);
-    ASSERT_EQ(charge, str_size);
-    std::string value_str{value.get(), charge};
-    ASSERT_EQ(strcmp(value_str.data(), str.data()), 0);
+    std::string value_str = sec_cache->MergeChunksIntoValue(chunks_head);
+    ASSERT_EQ(value_str.size(), str_size);
+    ASSERT_EQ(value_str, str);
 
     sec_cache->GetHelper(true)->del_cb(chunks_head, /*alloc*/ nullptr);
   }
@@ -896,8 +963,8 @@ TEST_P(CompressedSecondaryCacheTestWithCompressionParam, EntryRoles) {
 
   std::shared_ptr<SecondaryCache> sec_cache = NewCompressedSecondaryCache(opts);
 
-  // Fixed seed to ensure consistent compressibility (doesn't compress)
-  std::string junk(Random(301).RandomString(1000));
+  Random rnd(301);
+  std::string junk = test::CompressibleString(&rnd, 0.5, 1000);
 
   for (uint32_t i = 0; i < kNumCacheEntryRoles; ++i) {
     CacheEntryRole role = static_cast<CacheEntryRole>(i);
@@ -930,9 +997,11 @@ TEST_P(CompressedSecondaryCacheTestWithCompressionParam, EntryRoles) {
         sec_cache_is_compressed_ && !do_not_compress.Contains(role);
     if (compressed) {
       ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes,
-                1000);
-      ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes,
-                1007);
+                junk.length());
+      ASSERT_LT(get_perf_context()->compressed_sec_cache_compressed_bytes,
+                junk.length() * 3 / 4);
+      ASSERT_GT(get_perf_context()->compressed_sec_cache_compressed_bytes,
+                junk.length() / 4);
     } else {
       ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes, 0);
       ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes, 0);
diff --git a/include/rocksdb/secondary_cache.h b/include/rocksdb/secondary_cache.h
index e8644c45469f..c7b7b6886efb 100644
--- a/include/rocksdb/secondary_cache.h
+++ b/include/rocksdb/secondary_cache.h
@@ -33,8 +33,8 @@ namespace ROCKSDB_NAMESPACE {
 // Wait() or SecondaryCache::WaitAll() may be skipped if IsReady() happens to
 // return true, but (depending on the implementation) IsReady() might never
 // return true without Wait() or SecondaryCache::WaitAll(). After the handle
-// is known ready, calling Value() is required to avoid a memory leak in case
-// of a cache hit.
+// is known ready, calling Value() and taking ownership is required to avoid
+// a memory leak in case of a cache hit.
 class SecondaryCacheResultHandle {
  public:
   virtual ~SecondaryCacheResultHandle() = default;
diff --git a/memory/memory_allocator_impl.h b/memory/memory_allocator_impl.h
index a71ce0accdfb..65ebfebb94c9 100644
--- a/memory/memory_allocator_impl.h
+++ b/memory/memory_allocator_impl.h
@@ -12,8 +12,8 @@
 
 namespace ROCKSDB_NAMESPACE {
 
-struct CustomDeleter {
-  CustomDeleter(MemoryAllocator* a = nullptr) : allocator(a) {}
+struct CacheAllocationDeleter {
+  CacheAllocationDeleter(MemoryAllocator* a = nullptr) : allocator(a) {}
 
   void operator()(char* ptr) const {
     if (allocator) {
@@ -26,7 +26,7 @@ struct CustomDeleter {
   MemoryAllocator* allocator;
 };
 
-using CacheAllocationPtr = std::unique_ptr<char[], CustomDeleter>;
+using CacheAllocationPtr = std::unique_ptr<char[], CacheAllocationDeleter>;
 
 inline CacheAllocationPtr AllocateBlock(size_t size,
                                         MemoryAllocator* allocator) {
diff --git a/test_util/testutil.h b/test_util/testutil.h
index 5f36ec5154de..dbff5c8fc263 100644
--- a/test_util/testutil.h
+++ b/test_util/testutil.h
@@ -75,6 +75,13 @@ bool ShouldPersistUDT(const UserDefinedTimestampTestMode& test_mode);
 Slice CompressibleString(Random* rnd, double compressed_to_fraction, int len,
                          std::string* dst);
 
+inline std::string CompressibleString(Random* rnd,
+                                      double compressed_to_fraction, int len) {
+  std::string dst;
+  CompressibleString(rnd, compressed_to_fraction, len, &dst);
+  return dst;
+}
+
 #ifndef NDEBUG
 // An internal comparator that just forward comparing results from the
 // user comparator in it. Can be used to test entities that have no dependency
diff --git a/util/coding.h b/util/coding.h
index 9e0d2f0fd099..8648d9a13ba2 100644
--- a/util/coding.h
+++ b/util/coding.h
@@ -92,7 +92,7 @@ inline const char* GetVarsignedint64Ptr(const char* p, const char* limit,
 }
 
 // Returns the length of the varint32 or varint64 encoding of "v"
-int VarintLength(uint64_t v);
+uint16_t VarintLength(uint64_t v);
 
 // Lower-level versions of Put... that write directly into a character buffer
 // and return a pointer just past the last byte written.
@@ -245,8 +245,8 @@ inline void PutLengthPrefixedSlicePartsWithPadding(
   dst->append(pad_sz, '\0');
 }
 
-inline int VarintLength(uint64_t v) {
-  int len = 1;
+inline uint16_t VarintLength(uint64_t v) {
+  uint16_t len = 1;
   while (v >= 128) {
     v >>= 7;
     len++;

From 217e075df8d3138182a8f0111ead91edb9761e42 Mon Sep 17 00:00:00 2001
From: Jay Huh <jewoongh@meta.com>
Date: Mon, 28 Jul 2025 07:29:03 -0700
Subject: [PATCH 198/500] Simulate e2e flow in Stress Test (#13800)

Summary:
Simulate Remote Compaction in Stress Test by running a separate set of threads that runs remote compaction.
Queue and ResultMap for the remote compactions are stored in memory as part of the `SharedState`. They are shared across main worker threads and remote compaction worker threads.

`enable_remote_compaction` is replaced by `remote_compaction_worker_threads`.
If `remote_compaction_worker_threads` is set to 0, remote compaction is not enabled in Stress Test.

**To Follow up**

This PR covers happy path only. Failure injection in the remote worker thread will be added as a follow up.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13800

Test Plan:
```
./db_stress --remote_compaction_worker_threads=4  --flush_one_in=1000 --writepercent=40 --readpercent=40 --iterpercent=10 --prefixpercent=0 --delpercent=10 --destroy_db_initially=0 --clear_column_family_one_in=0 --reopen=0
```
```
python3 -u tools/db_crashtest.py blackbox --remote_compaction_worker_threads=8
```

Reviewed By: hx235

Differential Revision: D78862084

Pulled By: jaykorean

fbshipit-source-id: b262058c92d7fecc5e014cef5df9cca4a209921b
---
 db_stress_tool/db_stress_common.cc            | 60 +++++++++++++++
 db_stress_tool/db_stress_common.h             |  5 +-
 db_stress_tool/db_stress_compaction_service.h | 76 ++++++++++++++++---
 db_stress_tool/db_stress_driver.cc            | 31 +++++++-
 db_stress_tool/db_stress_gflags.cc            | 10 ++-
 db_stress_tool/db_stress_shared_state.h       | 57 ++++++++++++++
 db_stress_tool/db_stress_test_base.cc         | 18 +++--
 db_stress_tool/db_stress_test_base.h          |  1 +
 include/rocksdb/options.h                     |  1 +
 tools/db_crashtest.py                         |  2 +-
 10 files changed, 241 insertions(+), 20 deletions(-)

diff --git a/db_stress_tool/db_stress_common.cc b/db_stress_tool/db_stress_common.cc
index 968a6c16c0f8..19f5eeec9e86 100644
--- a/db_stress_tool/db_stress_common.cc
+++ b/db_stress_tool/db_stress_common.cc
@@ -228,6 +228,66 @@ void CompressedCacheSetCapacityThread(void* v) {
   }
 }
 
+void RemoteCompactionWorkerThread(void* v) {
+  assert(FLAGS_remote_compaction_worker_threads > 0);
+  assert(FLAGS_remote_compaction_worker_interval > 0);
+  auto* thread = static_cast<ThreadState*>(v);
+  SharedState* shared = thread->shared;
+  StressTest* stress_test = shared->GetStressTest();
+  assert(stress_test != nullptr);
+  while (true) {
+    {
+      MutexLock l(shared->GetMutex());
+      if (shared->ShouldStopBgThread()) {
+        shared->IncBgThreadsFinished();
+        if (shared->BgThreadsFinished()) {
+          shared->GetCondVar()->SignalAll();
+        }
+        return;
+      }
+    }
+    std::string job_id;
+    CompactionServiceJobInfo job_info;
+    std::string serialized_input;
+    if (shared->DequeueRemoteCompaction(&job_id, &job_info,
+                                        &serialized_input)) {
+      auto options = stress_test->GetOptions(job_info.cf_id);
+      CompactionServiceOptionsOverride override_options{
+          .file_checksum_gen_factory = options.file_checksum_gen_factory,
+          .merge_operator = options.merge_operator,
+          .compaction_filter = options.compaction_filter,
+          .compaction_filter_factory = options.compaction_filter_factory,
+          .prefix_extractor = options.prefix_extractor,
+          .table_factory = options.table_factory,
+          .sst_partitioner_factory = options.sst_partitioner_factory,
+          .listeners = {},
+          .statistics = options.statistics,
+          .table_properties_collector_factories =
+              options.table_properties_collector_factories};
+      std::string tmp_output_dir = job_info.db_name + "/" + "tmp_output_" +
+                                   db_stress_env->GenerateUniqueId();
+      std::string serialized_output;
+      Status s = DB::OpenAndCompact(OpenAndCompactOptions{}, job_info.db_name,
+                                    tmp_output_dir, serialized_input,
+                                    &serialized_output, override_options);
+      if (!s.ok()) {
+        // Print in stdout instead of stderr to avoid stress test failure,
+        // because OpenAndCompact() failure doesn't necessarily mean
+        // primary db instance failure.
+        fprintf(stdout, "Failed to run OpenAndCompact(%s): %s\n",
+                job_info.db_name.c_str(), s.ToString().c_str());
+      }
+      // Add the output regardless of status, so that primary DB doesn't rely on
+      // the timeout to finish waiting. The actual failure from the
+      // deserialization can fail the compaction properly
+      shared->AddRemoteCompactionResult(job_id, serialized_output);
+    }
+    db_stress_env->SleepForMicroseconds(
+        thread->rand.Next() % FLAGS_remote_compaction_worker_interval * 1000 +
+        1);
+  }
+}
+
 void PrintKeyValue(int cf, uint64_t key, const char* value, size_t sz) {
   if (!FLAGS_verbose) {
     return;
diff --git a/db_stress_tool/db_stress_common.h b/db_stress_tool/db_stress_common.h
index f911f09ca230..4152e1e2c009 100644
--- a/db_stress_tool/db_stress_common.h
+++ b/db_stress_tool/db_stress_common.h
@@ -422,7 +422,8 @@ DECLARE_string(file_temperature_age_thresholds);
 DECLARE_bool(allow_trivial_copy_when_change_temperature);
 DECLARE_uint32(commit_bypass_memtable_one_in);
 DECLARE_bool(track_and_verify_wals);
-DECLARE_bool(enable_remote_compaction);
+DECLARE_int32(remote_compaction_worker_threads);
+DECLARE_int32(remote_compaction_worker_interval);
 DECLARE_bool(auto_refresh_iterator_with_snapshot);
 DECLARE_uint32(memtable_op_scan_flush_trigger);
 DECLARE_uint32(memtable_avg_op_scan_flush_trigger);
@@ -755,6 +756,8 @@ void PoolSizeChangeThread(void* v);
 
 void DbVerificationThread(void* v);
 
+void RemoteCompactionWorkerThread(void* v);
+
 void CompressedCacheSetCapacityThread(void* v);
 
 void TimestampedSnapshotsThread(void* v);
diff --git a/db_stress_tool/db_stress_compaction_service.h b/db_stress_tool/db_stress_compaction_service.h
index f1fc04ea4467..824d77b11d11 100644
--- a/db_stress_tool/db_stress_compaction_service.h
+++ b/db_stress_tool/db_stress_compaction_service.h
@@ -5,6 +5,8 @@
 
 #pragma once
 
+#include "db_stress_shared_state.h"
+#include "db_stress_tool/db_stress_common.h"
 #include "rocksdb/options.h"
 
 namespace ROCKSDB_NAMESPACE {
@@ -12,28 +14,82 @@ namespace ROCKSDB_NAMESPACE {
 // Service to simulate Remote Compaction in Stress Test
 class DbStressCompactionService : public CompactionService {
  public:
-  explicit DbStressCompactionService() {}
+  explicit DbStressCompactionService(SharedState* shared)
+      : shared_(shared), aborted_(false) {}
 
   static const char* kClassName() { return "DbStressCompactionService"; }
 
   const char* Name() const override { return kClassName(); }
 
+  static constexpr uint64_t kWaitIntervalInMicros = 10 * 1000;  // 10ms
+  static constexpr uint64_t kWaitTimeoutInMicros =
+      30 * 1000 * 1000;  // 30 seconds
+
   CompactionServiceScheduleResponse Schedule(
-      const CompactionServiceJobInfo& /*info*/,
-      const std::string& /*compaction_service_input*/) override {
+      const CompactionServiceJobInfo& info,
+      const std::string& compaction_service_input) override {
+    std::string job_id = info.db_id + "_" + info.db_session_id + "_" +
+                         std::to_string(info.job_id);
+    if (aborted_.load()) {
+      return CompactionServiceScheduleResponse(
+          job_id, CompactionServiceJobStatus::kUseLocal);
+    }
+    shared_->EnqueueRemoteCompaction(job_id, info, compaction_service_input);
     CompactionServiceScheduleResponse response(
-        "Implement Me", CompactionServiceJobStatus::kUseLocal);
+        job_id, CompactionServiceJobStatus::kSuccess);
     return response;
   }
 
-  CompactionServiceJobStatus Wait(const std::string& /*scheduled_job_id*/,
-                                  std::string* /*result*/) override {
-    // TODO - Implement
-    return CompactionServiceJobStatus::kUseLocal;
+  CompactionServiceJobStatus Wait(const std::string& scheduled_job_id,
+                                  std::string* result) override {
+    auto start = Env::Default()->NowMicros();
+    while (Env::Default()->NowMicros() - start < kWaitTimeoutInMicros) {
+      if (aborted_.load()) {
+        return CompactionServiceJobStatus::kUseLocal;
+      }
+      if (shared_->GetRemoteCompactionResult(scheduled_job_id, result).ok()) {
+        if (result && result->empty()) {
+          // Race: Remote worker aborted before client sets aborted_ = true
+          return CompactionServiceJobStatus::kUseLocal;
+        }
+        return CompactionServiceJobStatus::kSuccess;
+      }
+      Env::Default()->SleepForMicroseconds(kWaitIntervalInMicros);
+    }
+    return CompactionServiceJobStatus::kFailure;
+  }
+
+  void OnInstallation(const std::string& scheduled_job_id,
+                      CompactionServiceJobStatus /*status*/) override {
+    // Clean up tmp directory
+    std::string serialized;
+    CompactionServiceResult result;
+    if (shared_->GetRemoteCompactionResult(scheduled_job_id, &serialized)
+            .ok()) {
+      if (CompactionServiceResult::Read(serialized, &result).ok()) {
+        std::vector<std::string> filenames;
+        Status s = Env::Default()->GetChildren(result.output_path, &filenames);
+        for (size_t i = 0; s.ok() && i < filenames.size(); ++i) {
+          s = Env::Default()->DeleteFile(result.output_path + "/" +
+                                         filenames[i]);
+          if (!s.ok()) {
+            // TODO - Handle clean up failure?
+            break;
+          }
+        }
+        if (s.ok()) {
+          Env::Default()->DeleteDir(result.output_path).PermitUncheckedError();
+        }
+      }
+      shared_->RemoveRemoteCompactionResult(scheduled_job_id);
+    }
   }
 
-  // TODO - Implement
-  void CancelAwaitingJobs() override {}
+  void CancelAwaitingJobs() override { aborted_.store(true); }
+
+ private:
+  SharedState* shared_;
+  std::atomic_bool aborted_{false};
 };
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/db_stress_tool/db_stress_driver.cc b/db_stress_tool/db_stress_driver.cc
index d5fb3e643652..5361c7d3ca41 100644
--- a/db_stress_tool/db_stress_driver.cc
+++ b/db_stress_tool/db_stress_driver.cc
@@ -102,6 +102,14 @@ bool RunStressTestImpl(SharedState* shared) {
     shared->IncBgThreads();
   }
 
+  uint32_t remote_compaction_worker_thread_count =
+      FLAGS_remote_compaction_worker_threads;
+  if (remote_compaction_worker_thread_count > 0) {
+    for (uint32_t i = 0; i < remote_compaction_worker_thread_count; i++) {
+      shared->IncBgThreads();
+    }
+  }
+
   std::vector<ThreadState*> threads(n);
   for (uint32_t i = 0; i < n; i++) {
     threads[i] = new ThreadState(i, shared);
@@ -126,6 +134,17 @@ bool RunStressTestImpl(SharedState* shared) {
                                &compressed_cache_set_capacity_thread);
   }
 
+  std::vector<ThreadState*> remote_compaction_worker_threads;
+  if (remote_compaction_worker_thread_count > 0) {
+    remote_compaction_worker_threads.reserve(
+        remote_compaction_worker_thread_count);
+    for (uint32_t i = 0; i < remote_compaction_worker_thread_count; i++) {
+      remote_compaction_worker_threads[i] = new ThreadState(i, shared);
+      db_stress_env->StartThread(RemoteCompactionWorkerThread,
+                                 remote_compaction_worker_threads[i]);
+    }
+  }
+
   // Each thread goes through the following states:
   // initializing -> wait for others to init -> read/populate/depopulate
   // wait for others to operate -> verify -> done
@@ -218,6 +237,7 @@ bool RunStressTestImpl(SharedState* shared) {
     delete threads[i];
     threads[i] = nullptr;
   }
+
   now = clock->NowMicros();
   if (!FLAGS_skip_verifydb && !FLAGS_test_batches_snapshots &&
       !shared->HasVerificationFailedYet()) {
@@ -232,7 +252,8 @@ bool RunStressTestImpl(SharedState* shared) {
   if (FLAGS_compaction_thread_pool_adjust_interval > 0 ||
       FLAGS_continuous_verification_interval > 0 ||
       FLAGS_compressed_secondary_cache_size > 0 ||
-      FLAGS_compressed_secondary_cache_ratio > 0.0) {
+      FLAGS_compressed_secondary_cache_ratio > 0.0 ||
+      FLAGS_remote_compaction_worker_threads > 0) {
     MutexLock l(shared->GetMutex());
     shared->SetShouldStopBgThread();
     while (!shared->BgThreadsFinished()) {
@@ -240,6 +261,14 @@ bool RunStressTestImpl(SharedState* shared) {
     }
   }
 
+  // Kill remote compaction workers
+  assert(remote_compaction_worker_threads.capacity() ==
+         remote_compaction_worker_thread_count);
+  for (uint32_t i = 0; i < remote_compaction_worker_thread_count; i++) {
+    delete remote_compaction_worker_threads[i];
+    remote_compaction_worker_threads[i] = nullptr;
+  }
+
   if (shared->HasVerificationFailedYet()) {
     fprintf(stderr, "Verification failed :(\n");
     return false;
diff --git a/db_stress_tool/db_stress_gflags.cc b/db_stress_tool/db_stress_gflags.cc
index 19636d38112c..59eb7e4de326 100644
--- a/db_stress_tool/db_stress_gflags.cc
+++ b/db_stress_tool/db_stress_gflags.cc
@@ -844,8 +844,14 @@ DEFINE_bool(track_and_verify_wals,
             ROCKSDB_NAMESPACE::Options().track_and_verify_wals,
             "See Options::track_and_verify_wals");
 
-DEFINE_bool(enable_remote_compaction, false,
-            "Enable (simulated) Remote Compaction");
+DEFINE_int32(
+    remote_compaction_worker_threads, 2,
+    "Remote Compaction Worker Thread count. If 0, remote compaction is "
+    "disabled");
+
+DEFINE_int32(remote_compaction_worker_interval, 10,
+             "Remote Compaction Worker Thread dequeue tasks every N "
+             "milliseconds. (Default: 10ms)");
 
 DEFINE_uint32(ingest_wbwi_one_in, 0,
               "If set, will call"
diff --git a/db_stress_tool/db_stress_shared_state.h b/db_stress_tool/db_stress_shared_state.h
index 9a14986b396b..4da55a513d9e 100644
--- a/db_stress_tool/db_stress_shared_state.h
+++ b/db_stress_tool/db_stress_shared_state.h
@@ -276,6 +276,53 @@ class SharedState {
     return expected_state_manager_->GetPersistedSeqno();
   }
 
+  void EnqueueRemoteCompaction(const std::string& job_id,
+                               const CompactionServiceJobInfo& job_info,
+                               const std::string& serialized_input) {
+    MutexLock l(&remote_compaction_queue_mu_);
+    remote_compaction_queue_.emplace(job_id, job_info, serialized_input);
+  }
+
+  bool DequeueRemoteCompaction(std::string* job_id,
+                               CompactionServiceJobInfo* job_info,
+                               std::string* serialized_input) {
+    assert(job_id);
+    assert(job_info);
+    assert(serialized_input);
+    MutexLock l(&remote_compaction_queue_mu_);
+    if (!remote_compaction_queue_.empty()) {
+      const auto [id, info, input] = remote_compaction_queue_.front();
+      *job_id = id;
+      *job_info = info;
+      *serialized_input = input;
+      remote_compaction_queue_.pop();
+      return true;
+    }
+    return false;
+  }
+
+  void AddRemoteCompactionResult(const std::string& job_id,
+                                 const std::string& result) {
+    MutexLock l(&remote_compaction_result_map_mu_);
+    remote_compaction_result_map_.emplace(job_id, result);
+  }
+
+  Status GetRemoteCompactionResult(const std::string& job_id,
+                                   std::string* result) {
+    MutexLock l(&remote_compaction_result_map_mu_);
+    if (remote_compaction_result_map_.find(job_id) !=
+        remote_compaction_result_map_.end()) {
+      *result = remote_compaction_result_map_.at(job_id);
+      return Status::OK();
+    }
+    return Status::NotFound();
+  }
+
+  void RemoveRemoteCompactionResult(const std::string& job_id) {
+    MutexLock l(&remote_compaction_result_map_mu_);
+    remote_compaction_result_map_.erase(job_id);
+  }
+
   // Prepare a Put that will be started but not finish yet
   // This is useful for crash-recovery testing when the process may crash
   // before updating the corresponding expected value
@@ -430,6 +477,16 @@ class SharedState {
   std::atomic<bool> verification_failure_;
   std::atomic<bool> should_stop_test_;
 
+  // Queue for the remote compaction. Tuple of job id, job info and serialized
+  // compaction_service_input
+  port::Mutex remote_compaction_queue_mu_;
+  std::queue<std::tuple<std::string, CompactionServiceJobInfo, std::string>>
+      remote_compaction_queue_;
+  // Result Map for the remote compaciton. Key is the scheduled_job_id and value
+  // is serialized compaction_service_result
+  port::Mutex remote_compaction_result_map_mu_;
+  std::unordered_map<std::string, std::string> remote_compaction_result_map_;
+
   // Keys that should not be overwritten
   const std::unordered_set<int64_t> no_overwrite_ids_;
 
diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc
index e72ce36795dc..18d58e988a1a 100644
--- a/db_stress_tool/db_stress_test_base.cc
+++ b/db_stress_tool/db_stress_test_base.cc
@@ -793,6 +793,12 @@ Status StressTest::SetOptions(ThreadState* thread) {
   return db_->SetOptions(cfh, opts);
 }
 
+Options StressTest::GetOptions(int cf_id) {
+  auto cfh = column_families_[cf_id];
+  assert(cfh);
+  return db_->GetOptions(cfh);
+}
+
 void StressTest::ProcessRecoveredPreparedTxns(SharedState* shared) {
   assert(txn_db_);
   std::vector<Transaction*> recovered_prepared_trans;
@@ -3435,6 +3441,7 @@ void StressTest::Open(SharedState* shared, bool reopen) {
           });
     });
   }
+
   if (!strcasecmp(FLAGS_compression_manager.c_str(), "custom")) {
     options_.compression_manager =
         std::make_shared<DbStressCustomCompressionManager>();
@@ -3466,6 +3473,12 @@ void StressTest::Open(SharedState* shared, bool reopen) {
             "memtablerep != prefix_hash\n");
   }
 
+  // Remote Compaction
+  if (FLAGS_remote_compaction_worker_threads > 0) {
+    options_.compaction_service =
+        std::make_shared<DbStressCompactionService>(shared);
+  }
+
   if ((options_.enable_blob_files || options_.enable_blob_garbage_collection ||
        FLAGS_allow_setting_blob_options_dynamically) &&
       FLAGS_best_efforts_recovery) {
@@ -4374,11 +4387,6 @@ void InitializeOptionsFromFlags(
   options.inplace_update_support = FLAGS_inplace_update_support;
   options.uncache_aggressiveness = FLAGS_uncache_aggressiveness;
 
-  // Remote Compaction
-  if (FLAGS_enable_remote_compaction) {
-    options.compaction_service = std::make_shared<DbStressCompactionService>();
-  }
-
   options.memtable_op_scan_flush_trigger = FLAGS_memtable_op_scan_flush_trigger;
   options.compaction_options_universal.reduce_file_locking =
       FLAGS_universal_reduce_file_locking;
diff --git a/db_stress_tool/db_stress_test_base.h b/db_stress_tool/db_stress_test_base.h
index bba5c1665f13..19f7364d081b 100644
--- a/db_stress_tool/db_stress_test_base.h
+++ b/db_stress_tool/db_stress_test_base.h
@@ -53,6 +53,7 @@ class StressTest {
     Status s = db_->EnableAutoCompaction(column_families_);
     return s;
   }
+  Options GetOptions(int cf_id);
   void CleanUp();
 
  protected:
diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index 8c1593b13c28..c913295c12f3 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -498,6 +498,7 @@ struct CompactionServiceJobInfo {
   // the output level of the compaction.
   int output_level;
 
+  CompactionServiceJobInfo() {}
   CompactionServiceJobInfo(std::string db_name_, std::string db_id_,
                            std::string db_session_id_, uint32_t cf_id_,
                            std::string cf_name_, uint64_t job_id_,
diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index 3d3582c0372c..0f8849028c02 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -343,7 +343,7 @@
     "universal_max_read_amp": lambda: random.choice([-1] * 3 + [0, 4, 10]),
     "paranoid_memory_checks": lambda: random.choice([0] * 7 + [1]),
     "allow_unprepared_value": lambda: random.choice([0, 1]),
-    "enable_remote_compaction": lambda: random.choice([0, 1]),
+    "remote_compaction_worker_threads": lambda: random.choice([0, 4]),
     "auto_refresh_iterator_with_snapshot": lambda: random.choice([0, 1]),
     "memtable_op_scan_flush_trigger": lambda: random.choice([0, 10, 100, 1000]),
     "memtable_avg_op_scan_flush_trigger": lambda: random.choice([0, 2, 20, 200]),

From 6ae1cb8837022ef84f3993633791a4a54b18393d Mon Sep 17 00:00:00 2001
From: RROP <kimjiho1121@naver.com>
Date: Mon, 28 Jul 2025 13:14:14 -0700
Subject: [PATCH 199/500] Switch fragmented range tombstone cache to C++20
 atomic<shared_ptr> API (#13744)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Summary:
• Guard on __cpp_lib_atomic_shared_ptr to use std::atomic<std::shared_ptr<T>>::load()/store()
• Fallback to std::atomic_load_explicit()/store_explicit() under C++17

When attempting to build with CXX 20 using clang in a Linux environment, the build fails due to deprecation of atomic_load_explicit.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13744

Reviewed By: xingbowang

Differential Revision: D78997919

Pulled By: cbi42

fbshipit-source-id: f829c282cba878f072d4b0ad44192a87f73b8a90
---
 db/memtable.cc | 33 ++++++++++++++++++++++++++++++---
 db/memtable.h  |  6 ++++++
 2 files changed, 36 insertions(+), 3 deletions(-)

diff --git a/db/memtable.cc b/db/memtable.cc
index 4f07704c7337..6fbd44a3b76f 100644
--- a/db/memtable.cc
+++ b/db/memtable.cc
@@ -134,6 +134,16 @@ MemTable::MemTable(const InternalKeyComparator& cmp,
   auto new_cache = std::make_shared<FragmentedRangeTombstoneListCache>();
   size_t size = cached_range_tombstone_.Size();
   for (size_t i = 0; i < size; ++i) {
+#if defined(__cpp_lib_atomic_shared_ptr)
+    std::atomic<std::shared_ptr<FragmentedRangeTombstoneListCache>>*
+        local_cache_ref_ptr = cached_range_tombstone_.AccessAtCore(i);
+    auto new_local_cache_ref = std::make_shared<
+        const std::shared_ptr<FragmentedRangeTombstoneListCache>>(new_cache);
+    std::shared_ptr<FragmentedRangeTombstoneListCache> aliased_ptr(
+        new_local_cache_ref, new_cache.get());
+    local_cache_ref_ptr->store(std::move(aliased_ptr),
+                               std::memory_order_relaxed);
+#else
     std::shared_ptr<FragmentedRangeTombstoneListCache>* local_cache_ref_ptr =
         cached_range_tombstone_.AccessAtCore(i);
     auto new_local_cache_ref = std::make_shared<
@@ -143,6 +153,7 @@ MemTable::MemTable(const InternalKeyComparator& cmp,
         std::shared_ptr<FragmentedRangeTombstoneListCache>(new_local_cache_ref,
                                                            new_cache.get()),
         std::memory_order_relaxed);
+#endif
   }
   const Comparator* ucmp = cmp.user_comparator();
   assert(ucmp);
@@ -200,8 +211,8 @@ bool MemTable::ShouldFlushNow() {
   assert(range_del_table_->ApproximateMemoryUsage() == 0);
   // If arena still have room for new block allocation, we can safely say it
   // shouldn't flush.
-  auto allocated_memory = table_->ApproximateMemoryUsage() +
-                          arena_.MemoryAllocatedBytes();
+  auto allocated_memory =
+      table_->ApproximateMemoryUsage() + arena_.MemoryAllocatedBytes();
 
   approximate_memory_usage_.store(allocated_memory, std::memory_order_relaxed);
 
@@ -790,8 +801,13 @@ FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIteratorInternal(
 
   // takes current cache
   std::shared_ptr<FragmentedRangeTombstoneListCache> cache =
+#if defined(__cpp_lib_atomic_shared_ptr)
+      cached_range_tombstone_.Access()->load(std::memory_order_relaxed)
+#else
       std::atomic_load_explicit(cached_range_tombstone_.Access(),
-                                std::memory_order_relaxed);
+                                std::memory_order_relaxed)
+#endif
+      ;
   // construct fragmented tombstone list if necessary
   if (!cache->initialized.load(std::memory_order_acquire)) {
     cache->reader_mutex.lock();
@@ -1059,6 +1075,16 @@ Status MemTable::Add(SequenceNumber s, ValueType type,
       range_del_mutex_.lock();
     }
     for (size_t i = 0; i < size; ++i) {
+#if defined(__cpp_lib_atomic_shared_ptr)
+      std::atomic<std::shared_ptr<FragmentedRangeTombstoneListCache>>*
+          local_cache_ref_ptr = cached_range_tombstone_.AccessAtCore(i);
+      auto new_local_cache_ref = std::make_shared<
+          const std::shared_ptr<FragmentedRangeTombstoneListCache>>(new_cache);
+      std::shared_ptr<FragmentedRangeTombstoneListCache> aliased_ptr(
+          new_local_cache_ref, new_cache.get());
+      local_cache_ref_ptr->store(std::move(aliased_ptr),
+                                 std::memory_order_relaxed);
+#else
       std::shared_ptr<FragmentedRangeTombstoneListCache>* local_cache_ref_ptr =
           cached_range_tombstone_.AccessAtCore(i);
       auto new_local_cache_ref = std::make_shared<
@@ -1073,6 +1099,7 @@ Status MemTable::Add(SequenceNumber s, ValueType type,
           std::shared_ptr<FragmentedRangeTombstoneListCache>(
               new_local_cache_ref, new_cache.get()),
           std::memory_order_relaxed);
+#endif
     }
 
     if (allow_concurrent) {
diff --git a/db/memtable.h b/db/memtable.h
index 79e9bbdd77c3..da0067297e03 100644
--- a/db/memtable.h
+++ b/db/memtable.h
@@ -949,9 +949,15 @@ class MemTable final : public ReadOnlyMemTable {
 
   // makes sure there is a single range tombstone writer to invalidate cache
   std::mutex range_del_mutex_;
+#if defined(__cpp_lib_atomic_shared_ptr)
+  CoreLocalArray<
+      std::atomic<std::shared_ptr<FragmentedRangeTombstoneListCache>>>
+      cached_range_tombstone_;
+#else
   CoreLocalArray<std::shared_ptr<FragmentedRangeTombstoneListCache>>
       cached_range_tombstone_;
 
+#endif
   void UpdateEntryChecksum(const ProtectionInfoKVOS64* kv_prot_info,
                            const Slice& key, const Slice& value, ValueType type,
                            SequenceNumber s, char* checksum_ptr);

From f8535fb9556ee87d51becd943773c62613c1a427 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Mon, 28 Jul 2025 14:40:32 -0700
Subject: [PATCH 200/500] Build fix and GitHub CI enhancements (#13813)

Summary:
Building db_bench with clang and DEBUG_LEVEL=0 was failing with unused variable. This was not caught by CI so I have added this to the build-linux-clang-13-no_test_run job.

Also, while I was touching CI:
* Fold build-linux-release-rtti into build-linux-release by reducing the number of combinations tested between static/dynamic lib and rtti/not. I don't expect these to interact meaningfully with an extremely mature compiler.
* Combine build-linux-clang10-asan and build-linux-clang10-ubsan because clang is extremely reliable running both together

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13813

Test Plan: manual builds, CI

Reviewed By: krhancoc

Differential Revision: D79112643

Pulled By: pdillinger

fbshipit-source-id: 4ffc672718c05fa4597d637aacbc5a179ad8a0cf
---
 .github/workflows/pr-jobs.yml | 45 ++++++++---------------------------
 tools/db_bench_tool.cc        |  4 ++--
 2 files changed, 12 insertions(+), 37 deletions(-)

diff --git a/.github/workflows/pr-jobs.yml b/.github/workflows/pr-jobs.yml
index f71aa6cf8cdd..8d423c240ce5 100644
--- a/.github/workflows/pr-jobs.yml
+++ b/.github/workflows/pr-jobs.yml
@@ -196,36 +196,21 @@ jobs:
     - uses: actions/checkout@v4.1.0
     - run: make V=1 -j32 LIB_MODE=shared release
     - run: ls librocksdb.so
-    - run: "./db_stress --version"
+    - run: "./trace_analyzer --version" # A tool dependent on gflags that can run in release build
     - run: make clean
-    - run: make V=1 -j32 release
+    - run: USE_RTTI=1 make V=1 -j32 release
     - run: ls librocksdb.a
-    - run: "./db_stress --version"
+    - run: "./trace_analyzer --version"
     - run: make clean
     - run: apt-get remove -y libgflags-dev
     - run: make V=1 -j32 LIB_MODE=shared release
     - run: ls librocksdb.so
-    - run: if ./db_stress --version; then false; else true; fi
+    - run: if ./trace_analyzer --version; then false; else true; fi
     - run: make clean
-    - run: make V=1 -j32 release
+    - run: USE_RTTI=1 make V=1 -j32 release
     - run: ls librocksdb.a
-    - run: if ./db_stress --version; then false; else true; fi
+    - run: if ./trace_analyzer --version; then false; else true; fi
     - uses: "./.github/actions/post-steps"
-  build-linux-release-rtti:
-    if: ${{ github.repository_owner == 'facebook' }}
-    runs-on:
-      labels: 8-core-ubuntu
-    container:
-      image: zjay437/rocksdb:0.6
-      options: --shm-size=16gb
-    steps:
-    - uses: actions/checkout@v4.1.0
-    - run: USE_RTTI=1 DEBUG_LEVEL=0 make V=1 -j16 static_lib tools db_bench
-    - run: "./db_stress --version"
-    - run: make clean
-    - run: apt-get remove -y libgflags-dev
-    - run: USE_RTTI=1 DEBUG_LEVEL=0 make V=1 -j16 static_lib tools db_bench
-    - run: if ./db_stress --version; then false; else true; fi
   build-linux-clang-no_test_run:
     if: ${{ github.repository_owner == 'facebook' }}
     runs-on:
@@ -248,6 +233,8 @@ jobs:
     - uses: actions/checkout@v4.1.0
     - uses: "./.github/actions/pre-steps"
     - run: CC=clang-13 CXX=clang++-13 USE_CLANG=1 make -j32 all microbench
+    - run: make clean
+    - run: CC=clang-13 CXX=clang++-13 USE_CLANG=1 DEBUG_LEVEL=0 make -j32 release
     - uses: "./.github/actions/post-steps"
   build-linux-gcc-8-no_test_run:
     if: ${{ github.repository_owner == 'facebook' }}
@@ -321,7 +308,7 @@ jobs:
     - run: ulimit -S -n `ulimit -H -n` && make V=1 -j8 CRASH_TEST_EXT_ARGS='--duration=960 --max_key=2500000' blackbox_crash_test_with_atomic_flush
     - uses: "./.github/actions/post-steps"
   # ======================= Linux with Sanitizers ===================== #
-  build-linux-clang10-asan:
+  build-linux-clang10-asan-ubsan:
     if: ${{ github.repository_owner == 'facebook' }}
     runs-on:
       labels: 32-core-ubuntu
@@ -331,19 +318,7 @@ jobs:
     steps:
     - uses: actions/checkout@v4.1.0
     - uses: "./.github/actions/pre-steps"
-    - run: COMPILE_WITH_ASAN=1 CC=clang-10 CXX=clang++-10 ROCKSDB_DISABLE_ALIGNED_NEW=1 USE_CLANG=1 make V=1 -j32 check
-    - uses: "./.github/actions/post-steps"
-  build-linux-clang10-ubsan:
-    if: ${{ github.repository_owner == 'facebook' }}
-    runs-on:
-      labels: 16-core-ubuntu
-    container:
-      image: zjay437/rocksdb:0.6
-      options: --shm-size=16gb
-    steps:
-    - uses: actions/checkout@v4.1.0
-    - uses: "./.github/actions/pre-steps"
-    - run: COMPILE_WITH_UBSAN=1 CC=clang-10 CXX=clang++-10 ROCKSDB_DISABLE_ALIGNED_NEW=1 USE_CLANG=1 make V=1 -j32 ubsan_check
+    - run: COMPILE_WITH_ASAN=1 COMPILE_WITH_UBSAN=1 CC=clang-10 CXX=clang++-10 ROCKSDB_DISABLE_ALIGNED_NEW=1 USE_CLANG=1 make V=1 -j40 check
     - uses: "./.github/actions/post-steps"
   build-linux-clang13-mini-tsan:
     if: ${{ github.repository_owner == 'facebook' }}
diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc
index 3461dc265505..5995493a683f 100644
--- a/tools/db_bench_tool.cc
+++ b/tools/db_bench_tool.cc
@@ -6440,8 +6440,8 @@ class Benchmark {
       auto iter =
           db->NewMultiScan(read_options_, db->DefaultColumnFamily(), opts);
       for (auto rng : *iter) {
-        size_t keys = 0;
-        for (auto it __attribute__((__unused__)) : rng) {
+        [[maybe_unused]] size_t keys = 0;
+        for ([[maybe_unused]] auto it : rng) {
           keys++;
         }
         assert(keys > 0);

From 07f15202903268431a63c40ba50b5a7c50031b1a Mon Sep 17 00:00:00 2001
From: anand76 <anand1976@users.noreply.github.com>
Date: Mon, 28 Jul 2025 15:39:58 -0700
Subject: [PATCH 201/500] Add MultiScan to db_stress (#13803)

Summary:
Add the new MultiScan operation to db_stress (disabled by default)

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13803

Test Plan: python3 tools/db_crashtest.py whitebox --iterpercent=60 --prefix_size=-1 --prefixpercent=0 --readpercent=0 --test_batches_snapshots=0 --use_multiscan=1

Reviewed By: krhancoc

Differential Revision: D78938131

Pulled By: anand1976

fbshipit-source-id: 30fced56e46b79cebebc7ec4d4588c6c2fca232a
---
 db_stress_tool/db_stress_common.h     |   1 +
 db_stress_tool/db_stress_gflags.cc    |   4 +
 db_stress_tool/db_stress_test_base.cc | 171 +++++++++++++++++++++++++-
 db_stress_tool/db_stress_test_base.h  |   4 +
 tools/db_crashtest.py                 |   2 +
 5 files changed, 179 insertions(+), 3 deletions(-)

diff --git a/db_stress_tool/db_stress_common.h b/db_stress_tool/db_stress_common.h
index 4152e1e2c009..e675dbee38b6 100644
--- a/db_stress_tool/db_stress_common.h
+++ b/db_stress_tool/db_stress_common.h
@@ -429,6 +429,7 @@ DECLARE_uint32(memtable_op_scan_flush_trigger);
 DECLARE_uint32(memtable_avg_op_scan_flush_trigger);
 DECLARE_uint32(ingest_wbwi_one_in);
 DECLARE_bool(universal_reduce_file_locking);
+DECLARE_bool(use_multiscan);
 
 constexpr long KB = 1024;
 constexpr int kRandomValueMaxFactor = 3;
diff --git a/db_stress_tool/db_stress_gflags.cc b/db_stress_tool/db_stress_gflags.cc
index 59eb7e4de326..8e0d6a5c10c4 100644
--- a/db_stress_tool/db_stress_gflags.cc
+++ b/db_stress_tool/db_stress_gflags.cc
@@ -1501,4 +1501,8 @@ DEFINE_bool(
         .compaction_options_universal.reduce_file_locking,
     "Sets "
     "ColumnFamilyOptions().compaciton_options_universal.reduce_file_locking.");
+
+DEFINE_bool(use_multiscan, false,
+            "If set, use the batched MultiScan API for scans.");
+
 #endif  // GFLAGS
diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc
index 18d58e988a1a..f5c2bc224234 100644
--- a/db_stress_tool/db_stress_test_base.cc
+++ b/db_stress_tool/db_stress_test_base.cc
@@ -1464,9 +1464,23 @@ void StressTest::OperateDb(ThreadState* thread) {
       } else if (prob_op < iterate_bound) {
         assert(delrange_bound <= prob_op);
         // OPERATION iterate
-        if (!FLAGS_skip_verifydb &&
-            thread->rand.OneInOpt(
-                FLAGS_verify_iterator_with_expected_state_one_in)) {
+        if (FLAGS_use_multiscan) {
+          int num_seeks = static_cast<int>(
+              std::min(static_cast<uint64_t>(thread->rand.Uniform(64)),
+                       static_cast<uint64_t>(FLAGS_ops_per_thread - i - 1)));
+          // Generate 2x num_seeks random keys, as each scan has a start key
+          // and an upper bound
+          rand_keys = GenerateNKeys(thread, num_seeks * 2, i);
+          i += num_seeks - 1;
+          ThreadStatusUtil::SetEnableTracking(FLAGS_enable_thread_tracking);
+          ThreadStatusUtil::SetThreadOperation(
+              ThreadStatus::OperationType::OP_DBITERATOR);
+          Status s;
+          s = TestMultiScan(thread, read_opts, rand_column_families, rand_keys);
+          ThreadStatusUtil::ResetThreadStatus();
+        } else if (!FLAGS_skip_verifydb &&
+                   thread->rand.OneInOpt(
+                       FLAGS_verify_iterator_with_expected_state_one_in)) {
           ThreadStatusUtil::SetEnableTracking(FLAGS_enable_thread_tracking);
           ThreadStatusUtil::SetThreadOperation(
               ThreadStatus::OperationType::OP_DBITERATOR);
@@ -1644,6 +1658,157 @@ Status StressTest::TestIterateAttributeGroups(
       verify_func);
 }
 
+Status StressTest::TestMultiScan(ThreadState* thread,
+                                 const ReadOptions& read_opts,
+                                 const std::vector<int>& rand_column_families,
+                                 const std::vector<int64_t>& rand_keys) {
+  size_t num_scans = rand_keys.size() / 2;
+  assert(!rand_column_families.empty());
+  assert(!rand_keys.empty());
+
+  ManagedSnapshot snapshot_guard(db_);
+
+  ReadOptions ro = read_opts;
+  ro.snapshot = snapshot_guard.snapshot();
+
+  std::string read_ts_str;
+  Slice read_ts_slice;
+  MaybeUseOlderTimestampForRangeScan(thread, read_ts_str, read_ts_slice, ro);
+
+  std::vector<std::string> start_key_strs;
+  std::vector<std::string> end_key_strs;
+  std::vector<ScanOptions> scan_opts;
+  start_key_strs.reserve(num_scans);
+  end_key_strs.reserve(num_scans);
+
+  for (size_t i = 0; i < num_scans * 2; i += 2) {
+    assert(rand_keys[i] <= rand_keys[i + 1]);
+    start_key_strs.emplace_back(Key(rand_keys[i]));
+    end_key_strs.emplace_back(Key(rand_keys[i + 1]));
+    scan_opts.emplace_back(start_key_strs.back(), end_key_strs.back());
+  }
+
+  std::string op_logs;
+  ro.pin_data = thread->rand.OneIn(2);
+  ro.background_purge_on_iterator_cleanup = thread->rand.OneIn(2);
+
+  assert(options_.prefix_extractor.get() == nullptr);
+
+  std::unique_ptr<Iterator> iter;
+  iter.reset(db_->NewIterator(ro, column_families_[rand_column_families[0]]));
+  iter->Prepare(scan_opts);
+
+  constexpr size_t kOpLogsLimit = 10000;
+
+  auto verify_func = [](Iterator* iterator) {
+    if (!VerifyWideColumns(iterator->value(), iterator->columns())) {
+      fprintf(stderr,
+              "Value and columns inconsistent for iterator: value: %s, "
+              "columns: %s\n",
+              iterator->value().ToString(/* hex */ true).c_str(),
+              WideColumnsToHex(iterator->columns()).c_str());
+      return false;
+    }
+    return true;
+  };
+
+  for (const ScanOptions& scan_opt : scan_opts) {
+    if (op_logs.size() > kOpLogsLimit) {
+      // Shouldn't take too much memory for the history log. Clear it.
+      op_logs = "(cleared...)\n";
+    }
+
+    // Set up an iterator, perform the same operations without bounds and with
+    // total order seek, and compare the results. This is to identify bugs
+    // related to bounds, prefix extractor, or reseeking. Sometimes we are
+    // comparing iterators with the same set-up, and it doesn't hurt to check
+    // them to be equal.
+    //
+    // This `ReadOptions` is for validation purposes. Ignore
+    // `FLAGS_rate_limit_user_ops` to avoid slowing any validation.
+    ReadOptions cmp_ro;
+    cmp_ro.timestamp = ro.timestamp;
+    cmp_ro.iter_start_ts = ro.iter_start_ts;
+    cmp_ro.snapshot = snapshot_guard.snapshot();
+    cmp_ro.auto_refresh_iterator_with_snapshot =
+        ro.auto_refresh_iterator_with_snapshot;
+    cmp_ro.total_order_seek = true;
+
+    ColumnFamilyHandle* const cmp_cfh =
+        GetControlCfh(thread, rand_column_families[0]);
+    assert(cmp_cfh);
+
+    std::unique_ptr<Iterator> cmp_iter(db_->NewIterator(cmp_ro, cmp_cfh));
+
+    bool diverged = false;
+
+    assert(scan_opt.range.start);
+    assert(scan_opt.range.limit);
+    Slice key = scan_opt.range.start.value();
+    Slice ub = scan_opt.range.limit.value();
+    ro.iterate_upper_bound = &ub;
+
+    LastIterateOp last_op;
+    iter->Seek(key);
+    cmp_iter->Seek(key);
+    last_op = kLastOpSeek;
+    op_logs += "S " + key.ToString(true) + " ";
+
+    if (iter->Valid() && ro.allow_unprepared_value) {
+      op_logs += "*";
+
+      if (!iter->PrepareValue()) {
+        assert(!iter->Valid());
+        assert(!iter->status().ok());
+      }
+    }
+
+    if (!iter->status().ok() && IsErrorInjectedAndRetryable(iter->status())) {
+      return iter->status();
+    } else if (!cmp_iter->status().ok() &&
+               IsErrorInjectedAndRetryable(cmp_iter->status())) {
+      return cmp_iter->status();
+    }
+
+    VerifyIterator(thread, cmp_cfh, ro, iter.get(), cmp_iter.get(), last_op,
+                   key, op_logs, verify_func, &diverged);
+
+    while (iter->Valid()) {
+      iter->Next();
+      if (!diverged) {
+        assert(cmp_iter->Valid());
+        cmp_iter->Next();
+      }
+      op_logs += "N";
+
+      if (iter->Valid() && ro.allow_unprepared_value) {
+        op_logs += "*";
+
+        if (!iter->PrepareValue()) {
+          assert(!iter->Valid());
+          assert(!iter->status().ok());
+        }
+      }
+
+      if (!iter->status().ok() && IsErrorInjectedAndRetryable(iter->status())) {
+        return iter->status();
+      } else if (!cmp_iter->status().ok() &&
+                 IsErrorInjectedAndRetryable(cmp_iter->status())) {
+        return cmp_iter->status();
+      }
+
+      VerifyIterator(thread, cmp_cfh, ro, iter.get(), cmp_iter.get(), last_op,
+                     key, op_logs, verify_func, &diverged);
+    }
+
+    thread->stats.AddIterations(1);
+
+    op_logs += "; ";
+  }
+
+  return Status::OK();
+}
+
 template <typename IterType, typename NewIterFunc, typename VerifyFunc>
 Status StressTest::TestIterateImpl(ThreadState* thread,
                                    const ReadOptions& read_opts,
diff --git a/db_stress_tool/db_stress_test_base.h b/db_stress_tool/db_stress_test_base.h
index 19f7364d081b..dd72d5e2ea7e 100644
--- a/db_stress_tool/db_stress_test_base.h
+++ b/db_stress_tool/db_stress_test_base.h
@@ -275,6 +275,10 @@ class StressTest {
     return Status::NotSupported();
   }
 
+  Status TestMultiScan(ThreadState* thread, const ReadOptions& read_opts,
+                       const std::vector<int>& rand_column_families,
+                       const std::vector<int64_t>& rand_keys);
+
   // Enum used by VerifyIterator() to identify the mode to validate.
   enum LastIterateOp {
     kLastOpSeek,
diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index 0f8849028c02..c769918393a5 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -181,6 +181,7 @@
     "format_version": lambda: random.choice([2, 3, 4, 5, 6, 7, 7]),
     "index_block_restart_interval": lambda: random.choice(range(1, 16)),
     "use_multiget": lambda: random.randint(0, 1),
+    "use_multiscan": 0,
     "use_get_entity": lambda: random.choice([0] * 7 + [1]),
     "use_multi_get_entity": lambda: random.choice([0] * 7 + [1]),
     "periodic_compaction_seconds": lambda: random.choice([0, 0, 1, 2, 10, 100, 1000]),
@@ -739,6 +740,7 @@ def finalize_and_sanitize(src_params):
         dest_params["metadata_write_fault_one_in"] = 0
         dest_params["read_fault_one_in"] = 0
         dest_params["metadata_read_fault_one_in"] = 0
+        dest_params["use_multiscan"] = 0
         if dest_params["prefix_size"] < 0:
             dest_params["prefix_size"] = 1
 

From f66ac769388f5c13167878628bd19333cb3d1a28 Mon Sep 17 00:00:00 2001
From: huangmengbin <hmb.nju@gmail.com>
Date: Mon, 28 Jul 2025 19:17:45 -0700
Subject: [PATCH 202/500] prevent data loss when all entries are expired in
 Remote Compaction (#13743)

Summary:
**Issue**:
When running remote compaction, if all entries in the input files are expired, RocksDB incorrectly deletes an active file from the primary DB, leading to data loss and corruption.

**Root Cause**:
The current logic mistakenly mixed up the input and output file paths during the cleanup phase when no keys survive the compaction (all expired). This results in deleting the input files (which belong to the primary DB) instead of the output files (which belong to the SecondaryDB).

**Fix**:
Use `GetTableFileName` (virtual function) instead of `TableFileName`

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13743

Reviewed By: hx235

Differential Revision: D79108650

Pulled By: jaykorean

fbshipit-source-id: 1c9ba971a0e9a62c15ebc014436cb8fc961af95c
---
 db/compaction/compaction_job.cc | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc
index f2a36907de42..843feb763e6f 100644
--- a/db/compaction/compaction_job.cc
+++ b/db/compaction/compaction_job.cc
@@ -1741,9 +1741,7 @@ Status CompactionJob::FinishCompactionOutputFile(
     // If there is nothing to output, no necessary to generate a sst file.
     // This happens when the output level is bottom level, at the same time
     // the sub_compact output nothing.
-    std::string fname =
-        TableFileName(sub_compact->compaction->immutable_options().cf_paths,
-                      meta->fd.GetNumber(), meta->fd.GetPathId());
+    std::string fname = GetTableFileName(meta->fd.GetNumber());
 
     // TODO(AR) it is not clear if there are any larger implications if
     // DeleteFile fails here

From 5435032c4c213a039029960b00eaf8c54a7fd660 Mon Sep 17 00:00:00 2001
From: Jay Huh <jewoongh@meta.com>
Date: Tue, 29 Jul 2025 08:48:19 -0700
Subject: [PATCH 203/500] Temporarily Disable Remote Compaction in Stress Test
 (#13815)

Summary:
As title. We will re-enable it once fixed

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13815

Test Plan: N/A - Disabling the test.

Reviewed By: archang19

Differential Revision: D79172697

Pulled By: jaykorean

fbshipit-source-id: 936de3743816049cda811bde48b3b2207ed256ee
---
 db_stress_tool/db_stress_driver.cc | 12 +++++++-----
 tools/db_crashtest.py              |  3 ++-
 2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/db_stress_tool/db_stress_driver.cc b/db_stress_tool/db_stress_driver.cc
index 5361c7d3ca41..21b23b4283da 100644
--- a/db_stress_tool/db_stress_driver.cc
+++ b/db_stress_tool/db_stress_driver.cc
@@ -262,11 +262,13 @@ bool RunStressTestImpl(SharedState* shared) {
   }
 
   // Kill remote compaction workers
-  assert(remote_compaction_worker_threads.capacity() ==
-         remote_compaction_worker_thread_count);
-  for (uint32_t i = 0; i < remote_compaction_worker_thread_count; i++) {
-    delete remote_compaction_worker_threads[i];
-    remote_compaction_worker_threads[i] = nullptr;
+  if (remote_compaction_worker_thread_count > 0) {
+    assert(remote_compaction_worker_threads.capacity() ==
+           remote_compaction_worker_thread_count);
+    for (uint32_t i = 0; i < remote_compaction_worker_thread_count; i++) {
+      delete remote_compaction_worker_threads[i];
+      remote_compaction_worker_threads[i] = nullptr;
+    }
   }
 
   if (shared->HasVerificationFailedYet()) {
diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index c769918393a5..b314b1022e32 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -344,7 +344,8 @@
     "universal_max_read_amp": lambda: random.choice([-1] * 3 + [0, 4, 10]),
     "paranoid_memory_checks": lambda: random.choice([0] * 7 + [1]),
     "allow_unprepared_value": lambda: random.choice([0, 1]),
-    "remote_compaction_worker_threads": lambda: random.choice([0, 4]),
+    # TODO(jaykorean): re-enable remote compaction stress test once fixed
+    "remote_compaction_worker_threads": lambda: 0,
     "auto_refresh_iterator_with_snapshot": lambda: random.choice([0, 1]),
     "memtable_op_scan_flush_trigger": lambda: random.choice([0, 10, 100, 1000]),
     "memtable_avg_op_scan_flush_trigger": lambda: random.choice([0, 2, 20, 200]),

From e7a4505a2ef57aab082d4ebde0773516d9404087 Mon Sep 17 00:00:00 2001
From: Changyu Bi <changyubi@meta.com>
Date: Wed, 30 Jul 2025 12:00:54 -0700
Subject: [PATCH 204/500] Preserve tombstones for `allow_ingest_behind`
 (#13807)

Summary:
Preserve tombstone when allow_ingest_behind` is enabled so that they can be applied to ingested files. This can be useful when users use ingest_behind to buffer updates where Deletion needs to be preserved. This fixes https://github.com/facebook/rocksdb/issues/13571.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13807

Test Plan: updated a unit test to verify that tombstones are not dropped during compaction.

Reviewed By: hx235

Differential Revision: D79016109

Pulled By: cbi42

fbshipit-source-id: c4d31ef32c88468ababcc1ea5af5db6de42a3b0d
---
 db/compaction/compaction.cc                   |  2 +
 db/compaction/compaction_iterator.cc          | 45 ++++++++++++++-----
 db/compaction/compaction_iterator.h           | 11 +++--
 db/external_sst_file_test.cc                  | 32 ++++++++++++-
 db/merge_helper.cc                            |  1 +
 include/rocksdb/options.h                     |  5 ++-
 .../ingest_behind_tombstone.md                |  1 +
 7 files changed, 80 insertions(+), 17 deletions(-)
 create mode 100644 unreleased_history/behavior_changes/ingest_behind_tombstone.md

diff --git a/db/compaction/compaction.cc b/db/compaction/compaction.cc
index 8046444ff828..afbabbaa510d 100644
--- a/db/compaction/compaction.cc
+++ b/db/compaction/compaction.cc
@@ -647,6 +647,8 @@ bool Compaction::KeyNotExistsBeyondOutputLevel(
     return true;
   } else if (output_level_ != 0 &&
              cfd_->ioptions().compaction_style == kCompactionStyleLevel) {
+    // TODO: apply the optimization here to other compaction styles and
+    // compaction/flush to L0.
     // Maybe use binary search to find right entry instead of linear search?
     const Comparator* user_cmp = cfd_->user_comparator();
     for (int lvl = output_level_ + 1; lvl < number_levels_; lvl++) {
diff --git a/db/compaction/compaction_iterator.cc b/db/compaction/compaction_iterator.cc
index dc441817c6cc..d21672e8906c 100644
--- a/db/compaction/compaction_iterator.cc
+++ b/db/compaction/compaction_iterator.cc
@@ -83,9 +83,8 @@ CompactionIterator::CompactionIterator(
       compaction_filter_(compaction_filter),
       shutting_down_(shutting_down),
       manual_compaction_canceled_(manual_compaction_canceled),
-      bottommost_level_(!compaction_ ? false
-                                     : compaction_->bottommost_level() &&
-                                           !compaction_->allow_ingest_behind()),
+      bottommost_level_(compaction_ && compaction_->bottommost_level() &&
+                        !compaction_->allow_ingest_behind()),
       // snapshots_ cannot be nullptr, but we will assert later in the body of
       // the constructor.
       visible_at_tip_(snapshots_ ? snapshots_->empty() : false),
@@ -161,6 +160,7 @@ void CompactionIterator::Next() {
       // MergeUntil stops when it encounters a corrupt key and does not
       // include them in the result, so we expect the keys here to be valid.
       if (!s.ok()) {
+        // FIXME: should fail compaction after this fatal logging.
         ROCKS_LOG_FATAL(
             info_log_, "Invalid ikey %s in compaction. %s",
             allow_data_in_errors_ ? key_.ToString(true).c_str() : "hidden",
@@ -642,7 +642,8 @@ void CompactionIterator::NextFromInput() {
     } else if (ikey_.type == kTypeSingleDeletion) {
       // We can compact out a SingleDelete if:
       // 1) We encounter the corresponding PUT -OR- we know that this key
-      //    doesn't appear past this output level
+      //    doesn't appear past this output level and  we are not in
+      //    ingest_behind mode.
       // =AND=
       // 2) We've already returned a record in this snapshot -OR-
       //    there are no earlier earliest_write_conflict_snapshot.
@@ -731,6 +732,8 @@ void CompactionIterator::NextFromInput() {
             "CompactionIterator::NextFromInput:SingleDelete:1",
             const_cast<Compaction*>(c));
         if (last_key_seq_zeroed_) {
+          // Drop SD and the next key since they are both in the last
+          // snapshot (since last key has seqno zeroed).
           ++iter_stats_.num_record_drop_hidden;
           ++iter_stats_.num_record_drop_obsolete;
           assert(bottommost_level_);
@@ -841,7 +844,7 @@ void CompactionIterator::NextFromInput() {
         // iteration. If the next key is corrupt, we return before the
         // comparison, so the value of has_current_user_key does not matter.
         has_current_user_key_ = false;
-        if (compaction_ != nullptr &&
+        if (compaction_ != nullptr && !compaction_->allow_ingest_behind() &&
             DefinitelyInSnapshot(ikey_.sequence, earliest_snapshot_) &&
             compaction_->KeyNotExistsBeyondOutputLevel(ikey_.user_key,
                                                        &level_ptrs_) &&
@@ -854,6 +857,9 @@ void CompactionIterator::NextFromInput() {
             ++iter_stats_.num_optimized_del_drop_obsolete;
           }
         } else if (last_key_seq_zeroed_) {
+          // Sequence number zeroing requires bottommost_level_, which is
+          // false with ingest_behind.
+          assert(!compaction_->allow_ingest_behind());
           // Skip.
           ++iter_stats_.num_record_drop_hidden;
           ++iter_stats_.num_record_drop_obsolete;
@@ -870,6 +876,7 @@ void CompactionIterator::NextFromInput() {
     } else if (last_sequence != kMaxSequenceNumber &&
                (last_snapshot == current_user_key_snapshot_ ||
                 last_snapshot < current_user_key_snapshot_)) {
+      // rule (A):
       // If the earliest snapshot is which this key is visible in
       // is the same as the visibility of a previous instance of the
       // same key, then this kv is not visible in any snapshot.
@@ -878,6 +885,15 @@ void CompactionIterator::NextFromInput() {
       // Note: Dropping this key will not affect TransactionDB write-conflict
       // checking since there has already been a record returned for this key
       // in this snapshot.
+      // When ingest_behind is enabled, it's ok that we drop an overwritten
+      // Delete here. The overwritting key still covers whatever that will be
+      // ingested. Note that we will not drop SingleDelete here as SingleDelte
+      // is handled entirely in its own if clause. This is important, see
+      // example: from new to old: SingleDelete_1, PUT_1, SingleDelete_2, PUT_2,
+      // where all operations are on the same key and PUT_2 is ingested with
+      // ingest_behind=true. If SingleDelete_2 is dropped due to being compacted
+      // together with PUT_1, and then PUT_1 is compacted away together with
+      // SingleDelete_1, PUT_2 can incorrectly becomes visible.
       if (last_sequence < current_user_key_sequence_) {
         ROCKS_LOG_FATAL(info_log_,
                         "key %s, last_sequence (%" PRIu64
@@ -887,12 +903,13 @@ void CompactionIterator::NextFromInput() {
         assert(false);
       }
 
-      ++iter_stats_.num_record_drop_hidden;  // rule (A)
+      ++iter_stats_.num_record_drop_hidden;
       AdvanceInputIter();
     } else if (compaction_ != nullptr &&
                (ikey_.type == kTypeDeletion ||
                 (ikey_.type == kTypeDeletionWithTimestamp &&
                  cmp_with_history_ts_low_ < 0)) &&
+               !compaction_->allow_ingest_behind() &&
                DefinitelyInSnapshot(ikey_.sequence, earliest_snapshot_) &&
                compaction_->KeyNotExistsBeyondOutputLevel(ikey_.user_key,
                                                           &level_ptrs_)) {
@@ -928,11 +945,13 @@ void CompactionIterator::NextFromInput() {
                 (ikey_.type == kTypeDeletionWithTimestamp &&
                  cmp_with_history_ts_low_ < 0)) &&
                bottommost_level_) {
+      assert(compaction_);
+      assert(!compaction_->allow_ingest_behind());  // bottommost_level_ is true
       // Handle the case where we have a delete key at the bottom most level
       // We can skip outputting the key iff there are no subsequent puts for
       // this key
-      assert(!compaction_ || compaction_->KeyNotExistsBeyondOutputLevel(
-                                 ikey_.user_key, &level_ptrs_));
+      assert(compaction_->KeyNotExistsBeyondOutputLevel(ikey_.user_key,
+                                                        &level_ptrs_));
       ParsedInternalKey next_ikey;
       AdvanceInputIter();
 #ifndef NDEBUG
@@ -974,6 +993,12 @@ void CompactionIterator::NextFromInput() {
                 (compaction_ != nullptr &&
                  compaction_->KeyNotExistsBeyondOutputLevel(ikey_.user_key,
                                                             &level_ptrs_)))) {
+      // FIXME: it's possible that we are setting sequence number to 0 as
+      // preferred sequence number here. If cf_ingest_behind is enabled, this
+      // may fail ingestions since they expect all keys above the last level
+      // to have non-zero sequence number. We should probably not allow seqno
+      // zeroing here.
+      //
       // This section that attempts to swap preferred sequence number will not
       // be invoked if this is a CompactionIterator created for flush, since
       // `compaction_` will be nullptr and it's not bottommost either.
@@ -1274,11 +1299,11 @@ void CompactionIterator::PrepareOutput() {
     //
     // Can we do the same for levels above bottom level as long as
     // KeyNotExistsBeyondOutputLevel() return true?
-    if (Valid() && compaction_ != nullptr &&
-        !compaction_->allow_ingest_behind() && bottommost_level_ &&
+    if (Valid() && bottommost_level_ &&
         DefinitelyInSnapshot(ikey_.sequence, earliest_snapshot_) &&
         ikey_.type != kTypeMerge && current_key_committed_ &&
         ikey_.sequence <= preserve_seqno_after_ && !is_range_del_) {
+      assert(compaction_ != nullptr && !compaction_->allow_ingest_behind());
       if (ikey_.type == kTypeDeletion ||
           (ikey_.type == kTypeSingleDeletion && timestamp_size_ == 0)) {
         ROCKS_LOG_FATAL(
diff --git a/db/compaction/compaction_iterator.h b/db/compaction/compaction_iterator.h
index c3e4942ac342..5293d647b3d9 100644
--- a/db/compaction/compaction_iterator.h
+++ b/db/compaction/compaction_iterator.h
@@ -417,13 +417,15 @@ class CompactionIterator {
   // NextFromInput()).
   ParsedInternalKey ikey_;
 
-  // Stores whether ikey_.user_key is valid. If set to false, the user key is
-  // not compared against the current key in the underlying iterator.
+  // Stores whether current_user_key_ is valid. If so, current_user_key_
+  // stores the user key of the last key seen by the iterator.
+  // If false, treat the next key to read as a new user key.
   bool has_current_user_key_ = false;
   // If false, the iterator holds a copy of the current compaction iterator
   // output (or current key in the underlying iterator during NextFromInput()).
   bool at_next_ = false;
 
+  // A copy of the current internal key.
   IterKey current_key_;
   Slice current_user_key_;
   std::string curr_ts_;
@@ -433,8 +435,9 @@ class CompactionIterator {
   // True if the iterator has already returned a record for the current key.
   bool has_outputted_key_ = false;
 
-  // truncated the value of the next key and output it without applying any
-  // compaction rules.  This is used for outputting a put after a single delete.
+  // Truncate the value of the next key and output it without applying any
+  // compaction rules. This is an optimization for outputting a put after
+  // a single delete. See more in `NextFromInput()` under Optimization 3.
   bool clear_and_output_next_key_ = false;
 
   MergeOutputIterator merge_out_iter_;
diff --git a/db/external_sst_file_test.cc b/db/external_sst_file_test.cc
index 2e4cae427731..64965643c729 100644
--- a/db/external_sst_file_test.cc
+++ b/db/external_sst_file_test.cc
@@ -2430,7 +2430,7 @@ TEST_P(ExternalSSTFileTest, IngestBehind) {
     ASSERT_OK(Put(Key(i), "memtable"));
   }
 
-  // Insert 100 -> 200 using IngestExternalFile
+  // Insert 0 -> 20 using IngestExternalFile
   file_data.clear();
   for (int i = 0; i <= 20; i++) {
     file_data.emplace_back(Key(i), "ingest_behind");
@@ -2462,13 +2462,41 @@ TEST_P(ExternalSSTFileTest, IngestBehind) {
     ASSERT_OK(Put(Key(i), "memtable"));
     true_data[Key(i)] = "memtable";
   }
+
+  // Test that tombstones for Key(7) and Key(8) are not dropped during
+  // compaction. Will verify below that after ingesting Puts for Key(7) and
+  // Key(8), they are covered by these two tombstones.
+  ASSERT_OK(Delete(Key(7)));
+  ASSERT_OK(SingleDelete(Key(8)));
   ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
   // Universal picker should go at second from the bottom level
   ASSERT_EQ("0,1", FilesPerLevel());
+
+  // Test that SingleDelte overwritten by Put is not dropped.
+  // From old to new, we issue SD, PUT, CompactRange, SD, CompactRange. The
+  // first CompactRange() should not drop the overwritten SD. The second
+  // CompactRange() will drop the new SD with PUT. If the older SD was dropped,
+  // the ingested behind data will be incorrectly visible below.
+  ASSERT_OK(SingleDelete(Key(1)));
+  ASSERT_OK(Put(Key(1), "overwrite_sd"));
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_OK(SingleDelete(Key(1)));
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
   ASSERT_OK(GenerateAndAddExternalFile(
       options, file_data, -1, allow_global_seqno, write_global_seqno,
       verify_checksums_before_ingest, true /*ingest_behind*/,
       false /*sort_data*/, &true_data));
+  // adjust expected data for tombtones
+  true_data.erase(Key(7));
+  true_data.erase(Key(8));
+  true_data.erase(Key(1));
+  std::unordered_set<std::string> not_found_set;
+  // Tombstones will be verified in VerifyDBFromMap() below.
+  not_found_set.insert(Key(7));
+  not_found_set.insert(Key(8));
+  not_found_set.insert(Key(1));
+
   ASSERT_EQ("0,1,1", FilesPerLevel());
   // this time ingest should fail as the file doesn't fit to the bottom level
   ASSERT_NOK(GenerateAndAddExternalFile(
@@ -2485,7 +2513,7 @@ TEST_P(ExternalSSTFileTest, IngestBehind) {
   dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &level_to_files);
   ASSERT_EQ(ingested_file_number, level_to_files[2][0].fd.GetNumber());
   size_t kcnt = 0;
-  VerifyDBFromMap(true_data, &kcnt, false);
+  VerifyDBFromMap(true_data, &kcnt, false, nullptr, nullptr, &not_found_set);
 
   // Auto-compaction should not include the last level.
   // Trigger compaction if size amplification exceeds 110%.
diff --git a/db/merge_helper.cc b/db/merge_helper.cc
index 2576aae840d7..0261ba0e27db 100644
--- a/db/merge_helper.cc
+++ b/db/merge_helper.cc
@@ -497,6 +497,7 @@ Status MergeHelper::MergeUntil(InternalIterator* iter,
           ikey.sequence <= latest_snapshot_
               ? CompactionFilter::Decision::kKeep
               : FilterMerge(orig_ikey.user_key, value_slice);
+      // FIXME: should also check for kRemove here
       if (filter != CompactionFilter::Decision::kRemoveAndSkipUntil &&
           range_del_agg != nullptr &&
           range_del_agg->ShouldDelete(
diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index c913295c12f3..e3604fb5f62b 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -1368,9 +1368,12 @@ struct DBOptions {
   // 1) Disable some internal optimizations around SST file compression.
   // 2) Reserve the last level for ingested files only.
   // 3) Compaction will not include any file from the last level.
+  // 4) Compaction will preserve necessary tombstones that can apply on
+  // top of ingested files.
   // Note that only Universal Compaction supports allow_ingest_behind.
   // `num_levels` should be >= 3 if this option is turned on.
-  //
+  // Note that if TimedPut was issued to a CF, ingest behind into that
+  // CF may fail.
   //
   // DEFAULT: false
   // Immutable.
diff --git a/unreleased_history/behavior_changes/ingest_behind_tombstone.md b/unreleased_history/behavior_changes/ingest_behind_tombstone.md
new file mode 100644
index 000000000000..ce54cf221fd5
--- /dev/null
+++ b/unreleased_history/behavior_changes/ingest_behind_tombstone.md
@@ -0,0 +1 @@
+* When `allow_ingest_behind` is enabled, compaction will no longer drop tombstones based on the absence of underlying data. Tombstones will be preserved to apply to ingested files.

From 3757e5479d54957819a5d6b881c16e62af169d6b Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Wed, 30 Jul 2025 13:00:37 -0700
Subject: [PATCH 205/500] Improve detection and reporting for fbcode build
 (#13820)

Summary:
We were seeing some internal builds apparently failing the `-d /mnt/gvfs/third-party` check. Although third-party2 is likely a better check (see dependencies_platform010.sh), that would create a big headache with check_format_compatible.sh which has to work across codebase versions.
* Report a WARNING when we detect on a Meta machine but the `-d /mnt/gvfs/third-party` check fails
* Let USE_CLANG influence default compiler choice so that things might still work in that case (e.g. `USE_CLANG=1 make -j24 check`)

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13820

Test Plan: manual, CI

Reviewed By: jaykorean

Differential Revision: D79277197

Pulled By: pdillinger

fbshipit-source-id: 19b2d45ed794f64bbf838f4414568d77ae9ca6f1
---
 build_tools/build_detect_platform | 24 +++++++++++++++++++-----
 1 file changed, 19 insertions(+), 5 deletions(-)

diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform
index 629b670b43d6..f0d4bb004cb4 100755
--- a/build_tools/build_detect_platform
+++ b/build_tools/build_detect_platform
@@ -55,8 +55,11 @@ fi
 # we currently depend on POSIX platform
 COMMON_FLAGS="-DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX"
 
-# Default to fbcode gcc on internal fb machines
-if [ -z "$ROCKSDB_NO_FBCODE" -a -d /mnt/gvfs/third-party ]; then
+# Default to fbcode gcc on Meta internal machines
+IS_META_HOST="$(hostname | grep -E '(facebook|meta).com|fbinfra.net')"
+if [ -z "$ROCKSDB_NO_FBCODE" -a "$IS_META_HOST" ]; then
+  if [ -d /mnt/gvfs/third-party ]; then
+    echo "NOTE: Using fbcode build" >&2
     FBCODE_BUILD="true"
     # If we're compiling with TSAN or shared lib, we need pic build
     PIC_BUILD=$COMPILE_WITH_TSAN
@@ -64,6 +67,11 @@ if [ -z "$ROCKSDB_NO_FBCODE" -a -d /mnt/gvfs/third-party ]; then
       PIC_BUILD=1
     fi
     source "$PWD/build_tools/fbcode_config_platform010.sh"
+  else
+    echo "************************************************************************" >&2
+    echo "WARNING: -d /mnt/gvfs/third-party failed; no fbcode build" >&2
+    echo "************************************************************************" >&2
+  fi
 fi
 
 # Delete existing output, if it exists
@@ -71,7 +79,9 @@ rm -f "$OUTPUT"
 touch "$OUTPUT"
 
 if test -z "$CC"; then
-    if [ -x "$(command -v cc)" ]; then
+    if [ "$USE_CLANG" -a -x "$(command -v clang)" ]; then
+        CC=clang
+    elif [ -x "$(command -v cc)" ]; then
         CC=cc
     elif [ -x "$(command -v clang)" ]; then
         CC=clang
@@ -81,7 +91,9 @@ if test -z "$CC"; then
 fi
 
 if test -z "$CXX"; then
-    if [ -x "$(command -v g++)" ]; then
+    if [ "$USE_CLANG" -a -x "$(command -v clang++)" ]; then
+        CXX=clang++
+    elif [ -x "$(command -v g++)" ]; then
         CXX=g++
     elif [ -x "$(command -v clang++)" ]; then
         CXX=clang++
@@ -91,7 +103,9 @@ if test -z "$CXX"; then
 fi
 
 if test -z "$AR"; then
-    if [ -x "$(command -v gcc-ar)" ]; then
+    if [ "$USE_CLANG" -a -x "$(command -v llvm-ar)" ]; then
+        AR=llvm-ar
+    elif [ -x "$(command -v gcc-ar)" ]; then
         AR=gcc-ar
     elif [ -x "$(command -v llvm-ar)" ]; then
         AR=llvm-ar

From 7f14960816e6984dfd572efc28e503ef011acc43 Mon Sep 17 00:00:00 2001
From: Jay Huh <jewoongh@meta.com>
Date: Wed, 30 Jul 2025 14:13:31 -0700
Subject: [PATCH 206/500] UnitTest for Remote Compaction Empty Result (#13812)

Summary:
Unit Test for a repro for the fix that was reported by https://github.com/facebook/rocksdb/pull/13743

There's potential dataloss when Remote Compaction entries are all removed due to various reasons (CompactionFilter, DeleteRange covering all keys of the SST file, etc)

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13812

Test Plan:
```
./compaction_service_test --gtest_filter="*CompactionServiceTest.EmptyResult*"
```

Failed before merging https://github.com/facebook/rocksdb/pull/13743, now passing

Reviewed By: cbi42

Differential Revision: D79192829

Pulled By: jaykorean

fbshipit-source-id: e200300c4a7993de21c63cd92bda65b692921b89
---
 db/compaction/compaction_job.cc               |  5 +-
 db/compaction/compaction_service_test.cc      | 73 +++++++++++++++++++
 .../remote_compaction_empty_result.md         |  1 +
 3 files changed, 78 insertions(+), 1 deletion(-)
 create mode 100644 unreleased_history/bug_fixes/remote_compaction_empty_result.md

diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc
index 843feb763e6f..5d666429ad6a 100644
--- a/db/compaction/compaction_job.cc
+++ b/db/compaction/compaction_job.cc
@@ -1736,7 +1736,6 @@ Status CompactionJob::FinishCompactionOutputFile(
   if (s.ok()) {
     tp = outputs.GetTableProperties();
   }
-
   if (s.ok() && current_entries == 0 && tp.num_range_deletions == 0) {
     // If there is nothing to output, no necessary to generate a sst file.
     // This happens when the output level is bottom level, at the same time
@@ -1940,6 +1939,10 @@ Status CompactionJob::OpenCompactionOutputFile(SubcompactionState* sub_compact,
 
   // no need to lock because VersionSet::next_file_number_ is atomic
   uint64_t file_number = versions_->NewFileNumber();
+#ifndef NDEBUG
+  TEST_SYNC_POINT_CALLBACK(
+      "CompactionJob::OpenCompactionOutputFile::NewFileNumber", &file_number);
+#endif
   std::string fname = GetTableFileName(file_number);
   // Fire events.
   ColumnFamilyData* cfd = sub_compact->compaction->column_family_data();
diff --git a/db/compaction/compaction_service_test.cc b/db/compaction/compaction_service_test.cc
index a5c0295540d5..08a2a9cf0716 100644
--- a/db/compaction/compaction_service_test.cc
+++ b/db/compaction/compaction_service_test.cc
@@ -793,6 +793,79 @@ TEST_F(CompactionServiceTest, VerifyInputRecordCount) {
   SyncPoint::GetInstance()->ClearAllCallBacks();
 }
 
+TEST_F(CompactionServiceTest, EmptyResult) {
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  ReopenWithCompactionService(&options);
+  GenerateTestData();
+
+  auto my_cs = GetCompactionService();
+
+  uint64_t comp_num = my_cs->GetCompactionNum();
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1);
+
+  // Delete range to cover entire range
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), "key", "keyz"));
+  ASSERT_OK(Flush());
+
+  // In this unit test, both remote compaction and primary db instance are
+  // running in the same process, so NewFileNumber will never have a collision.
+  // In the real-world remote compactions, when the compaction is indeed running
+  // in another process, this is not going to be the case.
+  // To simulate the SST file with the same name created in the tmp directory,
+  // override the file number in remote compaction to re-use old SST file
+  // number.
+  bool need_to_override_file_number = false;
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImplSecondary::OpenAndCompact::BeforeLoadingOptions:0",
+      [&](void*) { need_to_override_file_number = true; });
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "CompactionJob::OpenCompactionOutputFile::NewFileNumber",
+      [&](void* file_number) {
+        if (need_to_override_file_number) {
+          auto n = static_cast<uint64_t*>(file_number);
+          ColumnFamilyMetaData cf_meta;
+          db_->GetColumnFamilyMetaData(&cf_meta);
+          for (const auto& level : cf_meta.levels) {
+            for (const auto& file : level.files) {
+              // Use one of the existing file name
+              *n = test::GetFileNumber(file.name);
+              need_to_override_file_number = false;
+              return;
+            }
+          }
+        }
+      });
+
+  // Inject failure, so that the remote compaction fails after
+  // ProcessKeyValueCompaction()
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImplSecondary::CompactWithoutInstallation::End", [&](void* status) {
+        // override job status
+        auto s = static_cast<Status*>(status);
+        *s = Status::Aborted("MyTestCompactionService failed to compact!");
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  // Compaction should fail and SST files in the primary db should exist
+  {
+    ASSERT_NOK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+    ColumnFamilyMetaData meta;
+    db_->GetColumnFamilyMetaData(&meta);
+    for (const auto& level : meta.levels) {
+      for (const auto& file : level.files) {
+        std::string fname = file.db_path + "/" + file.name;
+        ASSERT_OK(db_->GetEnv()->FileExists(fname));
+      }
+    }
+  }
+  Close();
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
 TEST_F(CompactionServiceTest, CorruptedOutput) {
   Options options = CurrentOptions();
   options.disable_auto_compactions = true;
diff --git a/unreleased_history/bug_fixes/remote_compaction_empty_result.md b/unreleased_history/bug_fixes/remote_compaction_empty_result.md
new file mode 100644
index 000000000000..dcb93d2cc9c2
--- /dev/null
+++ b/unreleased_history/bug_fixes/remote_compaction_empty_result.md
@@ -0,0 +1 @@
+Fixed a bug in remote compaction that may mistakenly delete live SST file(s) during the cleanup phase when no keys survive the compaction (all expired)

From 0a169cea0e0311a4d44f58df6a413054aeae09a1 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Thu, 31 Jul 2025 08:39:56 -0700
Subject: [PATCH 207/500] Compressor::CompressBlock API change and
 refactoring/improvement (#13805)

Summary:
The main motivation for this change is to more flexibly and efficiently support compressing data without extra copies when we do not want to support saving compressed data that is LARGER than the uncompressed. We believe pretty strongly that for the various workloads served by RocksDB, it is well worth a single byte compression marker so that we have the flexibility to save compressed or uncompressed data when compression is attempted. Why? Compression algorithms can add tens of bytes in fixed overheads and percents of bytes in relative overheads. It is also an advantage for the reader when they can bypass decompression, including at least a buffer copy in most cases, after reading just one byte.

The block-based table format in RocksDB follows this model with a single-byte compression marker, and at least after https://github.com/facebook/rocksdb/pull/13797 so does CompressedSecondaryCache. (Notably, the blob file format DOES NOT. This is left to follow-up work.)

In particular, Compressor::CompressBlock now takes in a fixed size buffer for output rather than a `std::string*`. CompressBlock itself rejects the compression if the output would not fit in the provided buffer. This also works well with `max_compressed_bytes_per_kb` option to reject compression even sooner if its ratio is insufficient (implemented in this change). In the future we might use this functionality to reduce a buffer copy (in many cases) into the WritableFileWriter buffer of the block based table builder.

This is a large change because we needed to (or were compelled to)
* Update all the existing callers of CompressBlock, sometimes with substantial changes. This includes introducing GrowableBuffer to reuse between calls rather than std::string, which (at least in C++17) requires zeroing out data when allocating/growing a buffer.
* Re-implement built-in Compressors (V2; V1 is obsolete) to efficiently implement the new version of the API, no longer wrapping the `OLD_CompressData()` function. The new compressors appropriately leverage the CompressBlock virtual call required for the customization interface and no rely on `switch` on compression type for each block. The implementations are largely adaptations of the old implementations, except
  * LZ4 and LZ4HC are notably upgraded to take advantage of WorkingArea (see performance tests). And for simplicity in the new implementation, we are dropping support for some super old versions of the library.
  * Getting snappy to work with limited-size output buffer required using the Sink/Source interfaces, which appear to be well supported for a long time and efficient (see performance tests).
* Replace awkward old CompressionManager::GetDecompressorForCompressor with Compressor::GetOptimizedDecompressor (which is optional to implement)
* Small behavior change where we treat lack of support for compression closer to not configuring compression, such as incompatibility with block_align. This is motivated by giving CompressionManager the freedom of determining when compression can be excluded for an entire file despite the configured "compression" type, and thus only surfacing actual incompatibilities not hypothetical ones that might be irrelevant to the CompressionManager (or build configuration). Unit tests in `table_test` and `compact_files_test` required update.
* Some lingering clean up of CompressedSecondaryCache and a re-optimization made possible by compressing into an existing buffer.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13805

Test Plan:
for correctness, existing tests

## Performance Test

As I generally only modified compression paths, I'm using a db_bench write benchmark, with before & after configurations running at the same time. vc=1 means verify_compression=1

```
USE_CLANG=1 DEBUG_LEVEL=0 LIB_MODE=static make -j100 db_bench
SUFFIX=`tty | sed 's|/|_|g'`; for CT in zlib bzip2 none snappy zstd lz4 lz4hc none snappy zstd lz4 bzip2; do for VC in 0 1; do echo "$CT vc=$VC"; (for I in `seq 1 20`; do BIN=/dev/shm/dbbench${SUFFIX}.bin; rm -f $BIN; cp db_bench $BIN; $BIN -db=/dev/shm/dbbench$SUFFIX --benchmarks=fillseq -num=10000000 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=1000 -fifo_compaction_allow_compaction=0 -disable_wal -write_buffer_size=12000000 -format_version=7 -compression_type=$CT -verify_compression=$VC 2>&1 | grep micros/op; done) | awk '{n++; sum += $5;} END { print int(sum / n); }'; done; done
```

zlib vc=0 524198 -> 524904 (+0.1%)
zlib vc=1 430521 -> 430699 (+0.0%)
bzip2 vc=0 61841 -> 60835 (-1.6%)
bzip2 vc=1 49232 -> 48734 (-1.0%)
none vc=0 1802375 -> 1906227 (+5.8%)
none vc=1 1837181 -> 1950308 (+6.2%)
snappy vc=0 1783266 -> 1901461 (+6.6%)
snappy vc=1 1799703 -> 1879660 (+4.4%)
zstd vc=0 1216779 -> 1230507 (+1.1%)
zstd vc=1 996370 -> 1015415 (+1.9%)
lz4 vc=0 1801473 -> 1943095 (+7.9%)
lz4 vc=1 1799155 -> 1935242 (+7.6%)
lz4hc vc=0 349719 -> 1126909 (+222.2%)
lz4hc vc=1 348099 -> 1108933 (+218.6%)
(Repeating the most important ones)
none vc=0 1816878 -> 1952221 (+7.4%)
none vc=1 1813736 -> 1904622 (+5.0%)
snappy vc=0 1794816 -> 1875062 (+4.5%)
snappy vc=1 1789363 -> 1873771 (+4.7%)
zstd vc=0 1202592 -> 1225164 (+1.9%)
zstd vc=1 994322 -> 1016688 (+2.2%)
lz4 vc=0 1786959 -> 1971518 (+10.3%)
lz4 vc=1 1829483 -> 1935871 (+5.8%)

I confirmed manually that the new WorkingArea for LZ4HC makes the huge difference on that one, but not as much difference for LZ4, presumably because LZ4HC uses much larger buffers/structures/whatever for better compression ratios.

Reviewed By: hx235

Differential Revision: D79111736

Pulled By: pdillinger

fbshipit-source-id: 1ce1b14af9f15365f1b6da49906b5073a8cecc14
---
 cache/compressed_secondary_cache.cc           |  50 +-
 db/compact_files_test.cc                      |   4 +
 include/rocksdb/advanced_compression.h        |  64 +-
 port/win/xpress_win.cc                        |  51 ++
 port/win/xpress_win.h                         |   4 +
 .../block_based/block_based_table_builder.cc  |  42 +-
 table/block_based/block_based_table_builder.h |   2 +-
 table/table_test.cc                           |  30 +-
 test_util/testutil.h                          |  19 +-
 .../compression_perf.md                       |   1 +
 .../public_api_changes/lz4_etc.md             |   2 +
 util/aligned_buffer.h                         |  69 ++
 util/auto_tune_compressor.cc                  |  49 +-
 util/auto_tune_compressor.h                   |  16 +-
 util/compression.cc                           | 793 ++++++++++++++++--
 util/compression.h                            |   5 +
 util/compression_test.cc                      |   8 +-
 util/simple_mixed_compressor.cc               |  16 +-
 util/simple_mixed_compressor.h                |   6 +-
 19 files changed, 1030 insertions(+), 201 deletions(-)
 create mode 100644 unreleased_history/performance_improvements/compression_perf.md
 create mode 100644 unreleased_history/public_api_changes/lz4_etc.md

diff --git a/cache/compressed_secondary_cache.cc b/cache/compressed_secondary_cache.cc
index e30e48494fbe..f570600339b8 100644
--- a/cache/compressed_secondary_cache.cc
+++ b/cache/compressed_secondary_cache.cc
@@ -118,7 +118,12 @@ std::unique_ptr<SecondaryCacheResultHandle> CompressedSecondaryCache::Lookup(
       }
       saved = Slice(uncompressed.get(), args.uncompressed_size);
       type = kNoCompression;
-      // Free temporary compressed data
+      // Free temporary compressed data as early as we can. This could matter
+      // for unusually large blocks because we also have
+      // * Another compressed copy above (from lru_cache).
+      // * The uncompressed copy in `uncompressed`.
+      // * Another uncompressed copy in `result_value` below.
+      // Let's try to max out at 3 copies instead of 4.
       merged_value = std::string();
     }
     // Reduced as if it came from primary cache
@@ -197,39 +202,40 @@ Status CompressedSecondaryCache::InsertInternal(
     return s;
   }
 
-  std::string data_compressed;
+  std::unique_ptr<char[]> tagged_compressed_data;
   CompressionType to_type = kNoCompression;
   if (compressor_ && from_type == kNoCompression &&
       !cache_options_.do_not_compress_roles.Contains(helper->role)) {
     assert(source == CacheTier::kVolatileCompressedTier);
+
+    // TODO: consider malloc sizes for max acceptable compressed size
+    // Or maybe max_compressed_bytes_per_kb
+    size_t data_size_compressed = data_size_original - 1;
+    tagged_compressed_data =
+        std::make_unique<char[]>(data_size_compressed + kTagSize);
     s = compressor_->CompressBlock(Slice(data_ptr, data_size_original),
-                                   &data_compressed, &to_type,
+                                   tagged_compressed_data.get() + kTagSize,
+                                   &data_size_compressed, &to_type,
                                    nullptr /*working_area*/);
     if (!s.ok()) {
       return s;
     }
     PERF_COUNTER_ADD(compressed_sec_cache_uncompressed_bytes,
                      data_size_original);
-    // TOOD: improve compression sufficiency check
-    if (to_type == kNoCompression ||
-        data_compressed.size() >= data_size_original) {
-      // Compression rejected
+    if (to_type == kNoCompression) {
+      // Compression rejected or otherwise aborted/failed
       to_type = kNoCompression;
-      data_compressed.clear();
+      tagged_compressed_data.reset();
       // TODO: consider separate counters for rejected compressions
       PERF_COUNTER_ADD(compressed_sec_cache_compressed_bytes,
                        data_size_original);
     } else {
-      size_t data_size_compressed = data_compressed.size();
       PERF_COUNTER_ADD(compressed_sec_cache_compressed_bytes,
                        data_size_compressed);
       if (enable_split_merge) {
-        // Only need tagged_data for copying into CacheValueChunks. Insert
-        // space for tag.
-        // TODO: improve efficiency of this case (will be fixed with update to
-        // CompressBlock API)
-        data_compressed.insert(/*pos=*/0, /*n=*/kTagSize, char{});
-        tagged_data = data_compressed;
+        // Only need tagged_data for copying into CacheValueChunks.
+        tagged_data = Slice(tagged_compressed_data.get(),
+                            data_size_compressed + kTagSize);
         allocation.reset();
       } else {
         // Replace allocation with compressed version, copied from string
@@ -237,7 +243,10 @@ Status CompressedSecondaryCache::InsertInternal(
         allocation = AllocateBlock(header_size + data_size_compressed,
                                    cache_options_.memory_allocator.get());
         data_ptr = allocation.get() + header_size;
-        std::memcpy(data_ptr, data_compressed.data(), data_size_compressed);
+        // Ignore unpopulated tag on tagged_compressed_data; will only be
+        // populated on the new allocation.
+        std::memcpy(data_ptr, tagged_compressed_data.get() + kTagSize,
+                    data_size_compressed);
         tagged_data =
             Slice(data_ptr - kTagSize, data_size_compressed + kTagSize);
         assert(tagged_data.data() >= allocation.get());
@@ -293,12 +302,17 @@ Status CompressedSecondaryCache::Insert(const Slice& key,
 Status CompressedSecondaryCache::InsertSaved(
     const Slice& key, const Slice& saved, CompressionType type = kNoCompression,
     CacheTier source = CacheTier::kVolatileTier) {
-  if (type == kNoCompression || source == CacheTier::kVolatileCompressedTier) {
+  if (source == CacheTier::kVolatileCompressedTier) {
+    // Unexpected, would violate InsertInternal preconditions
     assert(source != CacheTier::kVolatileCompressedTier);
     return Status::OK();
   }
+  if (type == kNoCompression) {
+    // Not currently supported (why?)
+    return Status::OK();
+  }
   if (cache_options_.enable_custom_split_merge) {
-    // We don't support custom split/merge for the tiered case
+    // We don't support custom split/merge for the tiered case (why?)
     return Status::OK();
   }
 
diff --git a/db/compact_files_test.cc b/db/compact_files_test.cc
index d037f53accb9..83bec82b94c7 100644
--- a/db/compact_files_test.cc
+++ b/db/compact_files_test.cc
@@ -441,6 +441,10 @@ TEST_F(CompactFilesTest, SentinelCompressionType) {
 }
 
 TEST_F(CompactFilesTest, CompressionWithBlockAlign) {
+  if (!Snappy_Supported()) {
+    ROCKSDB_GTEST_SKIP("Test requires Snappy support");
+    return;
+  }
   Options options;
   options.compression = CompressionType::kNoCompression;
   options.create_if_missing = true;
diff --git a/include/rocksdb/advanced_compression.h b/include/rocksdb/advanced_compression.h
index 7ff257f58b79..bd0294949827 100644
--- a/include/rocksdb/advanced_compression.h
+++ b/include/rocksdb/advanced_compression.h
@@ -19,6 +19,7 @@ namespace ROCKSDB_NAMESPACE {
 
 // TODO: alias/adapt for compression
 struct FilterBuildingContext;
+class Decompressor;
 
 // A Compressor represents a very specific but potentially adapting strategy for
 // compressing blocks, including the relevant algorithm(s), options, dictionary,
@@ -156,9 +157,11 @@ class Compressor {
     return {};
   }
 
-  // Compress `uncompressed_data` to `compressed_output`, which should be
-  // passed in empty. Note that the compressed output will be decompressed
-  // by the sequence Decompressor::ExtractUncompressedSize() followed by
+  // Compress `uncompressed_data` to buffer `compressed_output` of size
+  // `*compressed_output_size`, storing the final compressed size in
+  // `*compressed_output_size` and compression type in `*out_compression_type`.
+  // Note that the compressed output will be decompressed by the sequence
+  // Decompressor::ExtractUncompressedSize() followed by
   // Decompressor::DecompressBlock(), which must also be provided the same
   // CompressionType saved in `out_compression_type`. (In many configurations,
   // `compressed_output` will have a prefix storing the uncompressed_data size
@@ -170,28 +173,34 @@ class Compressor {
   // If return status is not OK, then some fatal condition has arisen. On OK
   // status, setting `*out_compression_type = kNoCompression` means compression
   // is declined and the caller should use the original uncompressed_data and
-  // ignore any result in `compressed_output`. Otherwise, compression has
-  // happened with results in `compressed_output` and `out_compression_type`,
-  // which are allowed to vary from call to call.
+  // ignore any result in `compressed_output`. In this case, setting
+  // *compressed_output_size to 0 suggests that compression was quickly
+  // "bypassed" and *compressed_output_size > 0 suggests that compression was
+  // attempted but rejected (e.g. insufficient compression ratio).
+  //
+  // On OK status and `*out_compression_type != kNoCompression`, compression has
+  // happened with results in `compressed_output`, `compressed_output_size`, and
+  // `out_compression_type`. The output compression type is allowed to vary from
+  // call to call but does not for compressors from BuiltinV2CompressionManager.
   //
   // The working area is optional and used to optimize repeated compression by
   // a single thread. ManagedWorkingArea is provided rather than just
   // WorkingArea so that it can be used only if the `owner` matches expectation.
   // This could be useful for a Compressor wrapping more than one alternative
   // underlying Compressor.
-  //
-  // TODO: instead of string, consider a buffer only large enough for max
-  // tolerable compressed size. Does that work for all existing algorithms?
-  // * Looks like Snappy doesn't support that. :(
-  //   * Except perhaps using the Sink interface
-  // * But looks like everything else should. :)
-  // Could save CPU by eliminating extra zero-ing and giving up quicker when
-  // ratio is insufficient.
-  virtual Status CompressBlock(Slice uncompressed_data,
-                               std::string* compressed_output,
+  virtual Status CompressBlock(Slice uncompressed_data, char* compressed_output,
+                               size_t* compressed_output_size,
                                CompressionType* out_compression_type,
                                ManagedWorkingArea* working_area) = 0;
 
+  // OPTIONAL: Return a decompressor that is optimized for output from this
+  // compressor.
+  virtual std::shared_ptr<Decompressor> GetOptimizedDecompressor() const {
+    // Default implementation: no optimization. Get a Decompressor from the
+    // CompressionManager.
+    return nullptr;
+  }
+
   // TODO: something to populate table properties based on settings, after all
   // or as WorkingAreas released. Maybe also update stats, or that could be in
   // thread-specific WorkingArea.
@@ -441,14 +450,6 @@ class CompressionManager
     // Safe default implementation
     return GetDecompressor();
   }
-
-  // Get a decompressor that is allowed to have support only for the
-  // CompressionTypes used by the given Compressor.
-  virtual std::shared_ptr<Decompressor> GetDecompressorForCompressor(
-      const Compressor& compressor) {
-    // Reasonable default implementation
-    return GetDecompressorOptimizeFor(compressor.GetPreferredCompressionType());
-  }
 };
 
 // ************************* Utility wrappers etc. *********************** //
@@ -485,11 +486,17 @@ class CompressorWrapper : public Compressor {
   // ManagedWorkingArea takes care of calling it on the Compressor that created
   // the WorkingArea.
 
-  Status CompressBlock(Slice uncompressed_data, std::string* compressed_output,
+  Status CompressBlock(Slice uncompressed_data, char* compressed_output,
+                       size_t* compressed_output_size,
                        CompressionType* out_compression_type,
                        ManagedWorkingArea* working_area) override {
     return wrapped_->CompressBlock(uncompressed_data, compressed_output,
-                                   out_compression_type, working_area);
+                                   compressed_output_size, out_compression_type,
+                                   working_area);
+  }
+
+  std::shared_ptr<Decompressor> GetOptimizedDecompressor() const override {
+    return wrapped_->GetOptimizedDecompressor();
   }
 
  protected:
@@ -592,11 +599,6 @@ class CompressionManagerWrapper : public CompressionManager {
     return wrapped_->GetDecompressorForTypes(types_begin, types_end);
   }
 
-  std::shared_ptr<Decompressor> GetDecompressorForCompressor(
-      const Compressor& compressor) override {
-    return wrapped_->GetDecompressorForCompressor(compressor);
-  }
-
  protected:
   std::shared_ptr<CompressionManager> wrapped_;
 };
diff --git a/port/win/xpress_win.cc b/port/win/xpress_win.cc
index 959ee382e284..a90179bc1283 100644
--- a/port/win/xpress_win.cc
+++ b/port/win/xpress_win.cc
@@ -125,6 +125,57 @@ bool Compress(const char* input, size_t length, std::string* output) {
   return true;
 }
 
+size_t CompressWithMaxSize(const char* input, size_t length, char* output,
+                           size_t max_output_size) {
+  assert(input != nullptr);
+  if (max_output_size == 0) {
+    return 0;
+  }
+  assert(output != nullptr);
+
+  COMPRESS_ALLOCATION_ROUTINES* allocRoutinesPtr = nullptr;
+
+  COMPRESSOR_HANDLE compressor = NULL;
+
+  BOOL success =
+      CreateCompressor(COMPRESS_ALGORITHM_XPRESS,  //  Compression Algorithm
+                       allocRoutinesPtr,  //  Optional allocation routine
+                       &compressor);      //  Handle
+
+  if (!success) {
+#ifdef _DEBUG
+    std::cerr << "XPRESS: Failed to create Compressor LastError: "
+              << GetLastError() << std::endl;
+#endif
+    return 0;
+  }
+
+  std::unique_ptr<void, decltype(CloseCompressorFun)> compressorGuard(
+      compressor, CloseCompressorFun);
+
+  SIZE_T compressed_size = 0;
+  //  Compress
+  success = ::Compress(compressor,                //  Compressor Handle
+                       const_cast<char*>(input),  //  Input buffer
+                       length,                    //  Uncompressed data size
+                       output,                    //  Compressed Buffer
+                       max_output_size,           //  Compressed Buffer size
+                       &compressed_size);         //  Compressed Data size
+
+  if (!success) {
+#ifdef _DEBUG
+    auto error = GetLastError();
+    if (error != ERROR_INSUFFICIENT_BUFFER) {
+      std::cerr << "XPRESS: Failed to compress LastError " << error
+                << std::endl;
+    }
+#endif
+    return 0;
+  } else {
+    return compressed_size;
+  }
+}
+
 char* Decompress(const char* input_data, size_t input_length,
                  size_t* uncompressed_size) {
   assert(input_data != nullptr);
diff --git a/port/win/xpress_win.h b/port/win/xpress_win.h
index 3bab9c29894a..00cc1b9fc3dc 100644
--- a/port/win/xpress_win.h
+++ b/port/win/xpress_win.h
@@ -19,6 +19,10 @@ namespace xpress {
 
 bool Compress(const char* input, size_t length, std::string* output);
 
+// Returns written size or 0 on failure including if buffer is too small.
+size_t CompressWithMaxSize(const char* input, size_t length, char* output,
+                           size_t max_output_size);
+
 char* Decompress(const char* input_data, size_t input_length,
                  size_t* uncompressed_size);
 
diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc
index 57e8ebd4e837..4ed77c532d9b 100644
--- a/table/block_based/block_based_table_builder.cc
+++ b/table/block_based/block_based_table_builder.cc
@@ -251,7 +251,7 @@ struct BlockBasedTableBuilder::ParallelCompressionRep {
   struct ALIGN_AS(CACHE_LINE_SIZE) BlockRep {
     // Uncompressed block contents
     std::string uncompressed;
-    std::string compressed;
+    GrowableBuffer compressed;
     CompressionType compression_type = kNoCompression;
     // For efficiency, the std::string is repeatedly overwritten without
     // checking for "has no value". Only at the end of its life will it be
@@ -464,7 +464,7 @@ struct BlockBasedTableBuilder::ParallelCompressionRep {
   // Reap a block from compression thread
   void ReapBlock(BlockRep* block_rep) {
     assert(block_rep != nullptr);
-    block_rep->compressed.clear();
+    block_rep->compressed.ResetForSize(0);
     block_rep_pool.push(block_rep);
 
     if (!first_block_processed.load(std::memory_order_relaxed)) {
@@ -621,7 +621,7 @@ struct BlockBasedTableBuilder::Rep {
 
   BlockHandle pending_handle;  // Handle to add to index block
 
-  std::string single_threaded_compressed_output;
+  GrowableBuffer single_threaded_compressed_output;
   std::unique_ptr<FlushBlockPolicy> flush_block_policy;
 
   std::vector<std::unique_ptr<InternalTblPropColl>> table_properties_collectors;
@@ -835,7 +835,11 @@ struct BlockBasedTableBuilder::Rep {
               data_block_compressor->ObtainWorkingArea();
         }
       }
-      basic_decompressor = mgr->GetDecompressorForCompressor(*basic_compressor);
+      basic_decompressor = basic_compressor->GetOptimizedDecompressor();
+      if (basic_decompressor == nullptr) {
+        // Optimized version not available
+        basic_decompressor = mgr->GetDecompressor();
+      }
       create_context.decompressor = basic_decompressor.get();
 
       if (table_options.verify_compression) {
@@ -1121,7 +1125,8 @@ BlockBasedTableBuilder::BlockBasedTableBuilder(
   if (rep_->IsParallelCompressionEnabled()) {
     StartParallelCompression();
   } else if (rep_->basic_compressor) {
-    rep_->single_threaded_compressed_output.reserve(table_options.block_size);
+    rep_->single_threaded_compressed_output.ResetForSize(
+        table_options.block_size);
   }
 }
 
@@ -1397,7 +1402,7 @@ void BlockBasedTableBuilder::WriteBlock(const Slice& uncompressed_block_data,
                                 ? uncompressed_block_data
                                 : Slice(r->single_threaded_compressed_output),
                             type, handle, block_type, &uncompressed_block_data);
-  r->single_threaded_compressed_output.clear();
+  r->single_threaded_compressed_output.Reset();
   if (is_data_block) {
     r->props.data_size = r->get_offset();
     ++r->props.num_data_blocks;
@@ -1420,7 +1425,7 @@ void BlockBasedTableBuilder::BGWorkCompression(WorkingAreaPair& working_area) {
 
 void BlockBasedTableBuilder::CompressAndVerifyBlock(
     const Slice& uncompressed_block_data, bool is_data_block,
-    WorkingAreaPair& working_area, std::string* compressed_output,
+    WorkingAreaPair& working_area, GrowableBuffer* compressed_output,
     CompressionType* result_compression_type, Status* out_status) {
   Rep* r = rep_;
 
@@ -1434,6 +1439,7 @@ void BlockBasedTableBuilder::CompressAndVerifyBlock(
     verify_decomp = r->verify_decompressor.get();
   }
 
+  compressed_output->Reset();
   CompressionType type = kNoCompression;
   if (LIKELY(uncompressed_block_data.size() < kCompressionSizeLimit)) {
     if (compressor) {
@@ -1441,26 +1447,20 @@ void BlockBasedTableBuilder::CompressAndVerifyBlock(
           r->ioptions.clock,
           ShouldReportDetailedTime(r->ioptions.env, r->ioptions.stats));
 
-      *out_status =
-          compressor->CompressBlock(uncompressed_block_data, compressed_output,
-                                    &type, &working_area.compress);
+      size_t max_compressed_size = static_cast<size_t>(
+          (static_cast<uint64_t>(r->max_compressed_bytes_per_kb) *
+           uncompressed_block_data.size()) >>
+          10);
+      compressed_output->ResetForSize(max_compressed_size);
+      *out_status = compressor->CompressBlock(
+          uncompressed_block_data, compressed_output->data(),
+          &compressed_output->MutableSize(), &type, &working_area.compress);
 
       // Post-condition of Compressor::CompressBlock
       assert(type == kNoCompression || out_status->ok());
       assert(type == kNoCompression ||
              r->table_options.verify_compression == (verify_decomp != nullptr));
 
-      // Check for acceptable compression ratio. (For efficiency, avoid floating
-      // point and division.)
-      // TODO: integrate into Compressor?
-      if (compressed_output->size() >
-          (static_cast<uint64_t>(r->max_compressed_bytes_per_kb) *
-           uncompressed_block_data.size()) >>
-          10) {
-        // Prefer to keep uncompressed
-        type = kNoCompression;
-      }
-
       // Some of the compression algorithms are known to be unreliable. If
       // the verify_compression flag is set then try to de-compress the
       // compressed data and compare to the input.
diff --git a/table/block_based/block_based_table_builder.h b/table/block_based/block_based_table_builder.h
index b1c4829c95f5..f86216d2e184 100644
--- a/table/block_based/block_based_table_builder.h
+++ b/table/block_based/block_based_table_builder.h
@@ -177,7 +177,7 @@ class BlockBasedTableBuilder : public TableBuilder {
   // compression type
   void CompressAndVerifyBlock(const Slice& uncompressed_block_data,
                               bool is_data_block, WorkingAreaPair& working_area,
-                              std::string* compressed_output,
+                              GrowableBuffer* compressed_output,
                               CompressionType* result_compression_type,
                               Status* out_status);
 
diff --git a/table/table_test.cc b/table/table_test.cc
index d40e4b6ec62c..bb356a90869e 100644
--- a/table/table_test.cc
+++ b/table/table_test.cc
@@ -6230,6 +6230,12 @@ TEST_P(BlockBasedTableTest, OutOfBoundOnNext) {
 class ChargeCompressionDictionaryBuildingBufferTest
     : public BlockBasedTableTestBase {};
 TEST_F(ChargeCompressionDictionaryBuildingBufferTest, Basic) {
+  if (GetSupportedDictCompressions().empty()) {
+    ROCKSDB_GTEST_SKIP("No supported dict compression");
+    return;
+  }
+  const auto kCompression = GetSupportedDictCompressions()[0];
+
   constexpr std::size_t kSizeDummyEntry = 256 * 1024;
   constexpr std::size_t kMetaDataChargeOverhead = 10000;
   constexpr std::size_t kCacheCapacity = 8 * 1024 * 1024;
@@ -6253,7 +6259,7 @@ TEST_F(ChargeCompressionDictionaryBuildingBufferTest, Basic) {
         {CacheEntryRole::kCompressionDictionaryBuildingBuffer,
          {/*.charged = */ charge_compression_dictionary_building_buffer}});
     Options options;
-    options.compression = kSnappyCompression;
+    options.compression = kCompression;
     options.compression_opts.max_dict_bytes = kMaxDictBytes;
     options.compression_opts.max_dict_buffer_bytes = kMaxDictBufferBytes;
     options.table_factory.reset(NewBlockBasedTableFactory(table_options));
@@ -6274,7 +6280,7 @@ TEST_F(ChargeCompressionDictionaryBuildingBufferTest, Basic) {
         options.table_factory->NewTableBuilder(
             TableBuilderOptions(ioptions, moptions, read_options, write_options,
                                 ikc, &internal_tbl_prop_coll_factories,
-                                kSnappyCompression, options.compression_opts,
+                                kCompression, options.compression_opts,
                                 kUnknownColumnFamily, "test_cf", -1 /* level */,
                                 kUnknownNewestKeyTime),
             file_writer.get()));
@@ -6313,6 +6319,12 @@ TEST_F(ChargeCompressionDictionaryBuildingBufferTest, Basic) {
 
 TEST_F(ChargeCompressionDictionaryBuildingBufferTest,
        BasicWithBufferLimitExceed) {
+  if (GetSupportedDictCompressions().empty()) {
+    ROCKSDB_GTEST_SKIP("No supported dict compression");
+    return;
+  }
+  const auto kCompression = GetSupportedDictCompressions()[0];
+
   constexpr std::size_t kSizeDummyEntry = 256 * 1024;
   constexpr std::size_t kMetaDataChargeOverhead = 10000;
   constexpr std::size_t kCacheCapacity = 8 * 1024 * 1024;
@@ -6332,7 +6344,7 @@ TEST_F(ChargeCompressionDictionaryBuildingBufferTest,
       std::make_shared<FlushBlockEveryKeyPolicyFactory>();
 
   Options options;
-  options.compression = kSnappyCompression;
+  options.compression = kCompression;
   options.compression_opts.max_dict_bytes = kMaxDictBytes;
   options.compression_opts.max_dict_buffer_bytes = kMaxDictBufferBytes;
   options.table_factory.reset(NewBlockBasedTableFactory(table_options));
@@ -6351,7 +6363,7 @@ TEST_F(ChargeCompressionDictionaryBuildingBufferTest,
   const WriteOptions write_options;
   std::unique_ptr<TableBuilder> builder(options.table_factory->NewTableBuilder(
       TableBuilderOptions(ioptions, moptions, read_options, write_options, ikc,
-                          &internal_tbl_prop_coll_factories, kSnappyCompression,
+                          &internal_tbl_prop_coll_factories, kCompression,
                           options.compression_opts, kUnknownColumnFamily,
                           "test_cf", -1 /* level */, kUnknownNewestKeyTime),
       file_writer.get()));
@@ -6394,6 +6406,12 @@ TEST_F(ChargeCompressionDictionaryBuildingBufferTest,
 }
 
 TEST_F(ChargeCompressionDictionaryBuildingBufferTest, BasicWithCacheFull) {
+  if (GetSupportedDictCompressions().empty()) {
+    ROCKSDB_GTEST_SKIP("No supported dict compression");
+    return;
+  }
+  const auto kCompression = GetSupportedDictCompressions()[0];
+
   constexpr std::size_t kSizeDummyEntry = 256 * 1024;
   constexpr std::size_t kMetaDataChargeOverhead = 10000;
   // A small kCacheCapacity is chosen so that increase cache charging for
@@ -6419,7 +6437,7 @@ TEST_F(ChargeCompressionDictionaryBuildingBufferTest, BasicWithCacheFull) {
       std::make_shared<FlushBlockEveryKeyPolicyFactory>();
 
   Options options;
-  options.compression = kSnappyCompression;
+  options.compression = kCompression;
   options.compression_opts.max_dict_bytes = kMaxDictBytes;
   options.compression_opts.max_dict_buffer_bytes = kMaxDictBufferBytes;
   options.table_factory.reset(NewBlockBasedTableFactory(table_options));
@@ -6438,7 +6456,7 @@ TEST_F(ChargeCompressionDictionaryBuildingBufferTest, BasicWithCacheFull) {
   const WriteOptions write_options;
   std::unique_ptr<TableBuilder> builder(options.table_factory->NewTableBuilder(
       TableBuilderOptions(ioptions, moptions, read_options, write_options, ikc,
-                          &internal_tbl_prop_coll_factories, kSnappyCompression,
+                          &internal_tbl_prop_coll_factories, kCompression,
                           options.compression_opts, kUnknownColumnFamily,
                           "test_cf", -1 /* level */, kUnknownNewestKeyTime),
       file_writer.get()));
diff --git a/test_util/testutil.h b/test_util/testutil.h
index dbff5c8fc263..fc172b8e4b39 100644
--- a/test_util/testutil.h
+++ b/test_util/testutil.h
@@ -766,17 +766,26 @@ struct CompressorCustomAlg : public CompressorWrapper {
     return kCompression;
   }
 
-  Status CompressBlock(Slice uncompressed_data, std::string* compressed_output,
+  Status CompressBlock(Slice uncompressed_data, char* compressed_output,
+                       size_t* compressed_output_size,
                        CompressionType* out_compression_type,
                        ManagedWorkingArea* working_area) override {
+    size_t allowed_output_size = *compressed_output_size;
     Status s = wrapped_->CompressBlock(uncompressed_data, compressed_output,
+                                       compressed_output_size,
                                        out_compression_type, working_area);
     if (s.ok() && *out_compression_type != kNoCompression) {
       assert(*out_compression_type == kLZ4Compression);
-      std::string header(/*size=*/5, 0);
-      header[0] = lossless_cast<char>(kCompression);
-      EncodeFixed32(&header[1], dictionary_hash_);
-      compressed_output->insert(0, header);
+      if (*compressed_output_size + 5 > allowed_output_size) {
+        *out_compression_type = kNoCompression;
+        return Status::OK();
+      }
+      // Generate & insert header
+      std::memmove(compressed_output + 5, compressed_output,
+                   *compressed_output_size);
+      compressed_output[0] = lossless_cast<char>(kCompression);
+      EncodeFixed32(&compressed_output[1], dictionary_hash_);
+      *compressed_output_size += 5;
       *out_compression_type = kCompression;
     }
     return s;
diff --git a/unreleased_history/performance_improvements/compression_perf.md b/unreleased_history/performance_improvements/compression_perf.md
new file mode 100644
index 000000000000..ed567e4e0fa8
--- /dev/null
+++ b/unreleased_history/performance_improvements/compression_perf.md
@@ -0,0 +1 @@
+* Small improvement to CPU efficiency of compression using built-in algorithms, and a dramatic efficiency improvement for LZ4HC, based on reusing data structures between invocations.
diff --git a/unreleased_history/public_api_changes/lz4_etc.md b/unreleased_history/public_api_changes/lz4_etc.md
new file mode 100644
index 000000000000..e961f656ec96
--- /dev/null
+++ b/unreleased_history/public_api_changes/lz4_etc.md
@@ -0,0 +1,2 @@
+* Minimum supported version of LZ4 library is now 1.7.0 (r129 from 2015)
+* Some changes to experimental Compressor and CompressionManager APIs
diff --git a/util/aligned_buffer.h b/util/aligned_buffer.h
index 4d1471c7aef7..d1137642bfcf 100644
--- a/util/aligned_buffer.h
+++ b/util/aligned_buffer.h
@@ -11,6 +11,7 @@
 #include <algorithm>
 #include <cassert>
 
+#include "port/malloc.h"
 #include "port/port.h"
 #include "rocksdb/file_system.h"
 namespace ROCKSDB_NAMESPACE {
@@ -251,4 +252,72 @@ class AlignedBuffer {
 
   void Size(size_t cursize) { cursize_ = cursize; }
 };
+
+// Related to std::string but more easily avoids zeroing out a buffer that's
+// going to be overwritten anyway.
+class GrowableBuffer {
+ public:
+  GrowableBuffer() : capacity_(0) {}
+  ~GrowableBuffer() { free(data_); }
+  // No copies
+  GrowableBuffer(const GrowableBuffer&) = delete;
+  GrowableBuffer& operator=(const GrowableBuffer&) = delete;
+  // Movable
+  GrowableBuffer(GrowableBuffer&& other) noexcept
+      : data_(other.data_), size_(other.size_), capacity_(other.capacity_) {
+    other.data_ = nullptr;
+    other.size_ = 0;
+    other.capacity_ = 0;
+  }
+  GrowableBuffer& operator=(GrowableBuffer&& other) noexcept {
+    if (this == &other) {
+      return *this;
+    }
+    free(data_);
+    data_ = other.data_;
+    size_ = other.size_;
+    capacity_ = other.capacity_;
+    other.data_ = nullptr;
+    other.size_ = 0;
+    other.capacity_ = 0;
+    return *this;
+  }
+
+  char* data() { return data_; }
+  const char* data() const { return data_; }
+
+  size_t size() const { return size_; }
+  size_t& MutableSize() { return size_; }
+
+  bool empty() const { return size_ == 0; }
+
+  void Reset() { size_ = 0; }
+  void ResetForSize(size_t new_size) {
+    if (new_size > capacity_) {
+      free(data_);
+      size_t new_capacity = std::max(capacity_ * 2, new_size);
+      new_capacity = std::max(size_t{64}, new_capacity);
+      data_ = static_cast<char*>(malloc(new_capacity));
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+      capacity_ = malloc_usable_size(data_);
+#else
+      capacity_ = new_capacity;
+#endif
+      // Warm the memory in CPU cache
+      for (size_t i = 0; i < new_capacity; i += CACHE_LINE_SIZE) {
+        data_[i] = 1;
+      }
+    }
+    size_ = new_size;
+  }
+
+  Slice AsSlice() const { return Slice(data_, size_); }
+  operator Slice() const { return AsSlice(); }
+
+ private:
+  char* data_ = nullptr;
+  size_t size_ = 0;
+  size_t capacity_;
+};
+
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/util/auto_tune_compressor.cc b/util/auto_tune_compressor.cc
index 9716322b9f75..ed3bff812791 100644
--- a/util/auto_tune_compressor.cc
+++ b/util/auto_tune_compressor.cc
@@ -32,12 +32,9 @@ size_t CompressionRejectionProbabilityPredictor::attempted_compression_count()
 }
 
 bool CompressionRejectionProbabilityPredictor::Record(
-    Slice uncompressed_block_data, std::string* compressed_output,
-    const CompressionOptions& opts) {
-  if (compressed_output->size() >
-      (static_cast<uint64_t>(opts.max_compressed_bytes_per_kb) *
-       uncompressed_block_data.size()) >>
-      10) {
+    Slice /*uncompressed_block_data*/, char* /*compressed_output*/,
+    size_t /*compressed_output_size*/, CompressionType compression_type) {
+  if (compression_type == kNoCompression) {
     rejected_count_++;
   } else {
     compressed_count_++;
@@ -63,15 +60,17 @@ const char* AutoSkipCompressorWrapper::Name() const {
 }
 
 Status AutoSkipCompressorWrapper::CompressBlock(
-    Slice uncompressed_data, std::string* compressed_output,
-    CompressionType* out_compression_type, ManagedWorkingArea* wa) {
+    Slice uncompressed_data, char* compressed_output,
+    size_t* compressed_output_size, CompressionType* out_compression_type,
+    ManagedWorkingArea* wa) {
   // Check if the managed working area is provided or owned by this object.
   // If not, bypass auto-skip logic since the working area lacks a predictor to
   // record or make necessary decisions to compress or bypass compression of the
   // block
   if (wa == nullptr || wa->owner() != this) {
     return wrapped_->CompressBlock(uncompressed_data, compressed_output,
-                                   out_compression_type, wa);
+                                   compressed_output_size, out_compression_type,
+                                   wa);
   }
   bool exploration =
       Random::GetTLSInstance()->PercentTrue(kExplorationPercentage);
@@ -81,17 +80,20 @@ Status AutoSkipCompressorWrapper::CompressBlock(
   auto autoskip_wa = static_cast<AutoSkipWorkingArea*>(wa->get());
   if (exploration) {
     return CompressBlockAndRecord(uncompressed_data, compressed_output,
-                                  out_compression_type, autoskip_wa);
+                                  compressed_output_size, out_compression_type,
+                                  autoskip_wa);
   } else {
     auto predictor_ptr = autoskip_wa->predictor;
     auto prediction = predictor_ptr->Predict();
     if (prediction <= kProbabilityCutOff) {
       // decide to compress
       return CompressBlockAndRecord(uncompressed_data, compressed_output,
+                                    compressed_output_size,
                                     out_compression_type, autoskip_wa);
     } else {
       // decide to bypass compression
       *out_compression_type = kNoCompression;
+      *compressed_output_size = 0;
       return Status::OK();
     }
   }
@@ -107,13 +109,16 @@ void AutoSkipCompressorWrapper::ReleaseWorkingArea(WorkingArea* wa) {
 }
 
 Status AutoSkipCompressorWrapper::CompressBlockAndRecord(
-    Slice uncompressed_data, std::string* compressed_output,
-    CompressionType* out_compression_type, AutoSkipWorkingArea* wa) {
+    Slice uncompressed_data, char* compressed_output,
+    size_t* compressed_output_size, CompressionType* out_compression_type,
+    AutoSkipWorkingArea* wa) {
   Status status = wrapped_->CompressBlock(uncompressed_data, compressed_output,
+                                          compressed_output_size,
                                           out_compression_type, &(wa->wrapped));
   // determine if it was rejected or compressed
   auto predictor_ptr = wa->predictor;
-  predictor_ptr->Record(uncompressed_data, compressed_output, opts_);
+  predictor_ptr->Record(uncompressed_data, compressed_output,
+                        *compressed_output_size, *out_compression_type);
   return status;
 }
 
@@ -193,7 +198,8 @@ std::unique_ptr<Compressor> CostAwareCompressor::MaybeCloneSpecialized(
       block_type, std::move(dict_samples));
 }
 Status CostAwareCompressor::CompressBlock(Slice uncompressed_data,
-                                          std::string* compressed_output,
+                                          char* compressed_output,
+                                          size_t* compressed_output_size,
                                           CompressionType* out_compression_type,
                                           ManagedWorkingArea* wa) {
   // Check if the managed working area is provided or owned by this object.
@@ -207,7 +213,7 @@ Status CostAwareCompressor::CompressBlock(Slice uncompressed_data,
     size_t compression_level_ptr = 2;
     return allcompressors_[choosen_compression_type][compression_level_ptr]
         ->CompressBlock(uncompressed_data, compressed_output,
-                        out_compression_type, wa);
+                        compressed_output_size, out_compression_type, wa);
   }
   auto local_wa = static_cast<CostAwareWorkingArea*>(wa->get());
   std::pair<size_t, size_t> choosen_index(6, 2);
@@ -215,7 +221,8 @@ Status CostAwareCompressor::CompressBlock(Slice uncompressed_data,
   size_t compresion_level_ptr = choosen_index.second;
   return CompressBlockAndRecord(choosen_compression_type, compresion_level_ptr,
                                 uncompressed_data, compressed_output,
-                                out_compression_type, local_wa);
+                                compressed_output_size, out_compression_type,
+                                local_wa);
 }
 
 Compressor::ManagedWorkingArea CostAwareCompressor::ObtainWorkingArea() {
@@ -252,8 +259,9 @@ void CostAwareCompressor::ReleaseWorkingArea(WorkingArea* wa) {
 
 Status CostAwareCompressor::CompressBlockAndRecord(
     size_t choosen_compression_type, size_t compression_level_ptr,
-    Slice uncompressed_data, std::string* compressed_output,
-    CompressionType* out_compression_type, CostAwareWorkingArea* wa) {
+    Slice uncompressed_data, char* compressed_output,
+    size_t* compressed_output_size, CompressionType* out_compression_type,
+    CostAwareWorkingArea* wa) {
   assert(choosen_compression_type < allcompressors_.size());
   assert(compression_level_ptr <
          allcompressors_[choosen_compression_type].size());
@@ -264,9 +272,10 @@ Status CostAwareCompressor::CompressBlockAndRecord(
   Status status =
       allcompressors_[choosen_compression_type][compression_level_ptr]
           ->CompressBlock(uncompressed_data, compressed_output,
-                          out_compression_type, &(wa->wrapped_));
+                          compressed_output_size, out_compression_type,
+                          &(wa->wrapped_));
   std::pair<size_t, size_t> measured_data(timer.ElapsedMicros(),
-                                          compressed_output->size());
+                                          *compressed_output_size);
   auto predictor =
       wa->cost_predictors_[choosen_compression_type][compression_level_ptr];
   auto output_length = measured_data.second;
diff --git a/util/auto_tune_compressor.h b/util/auto_tune_compressor.h
index 79bd7eed7db4..818d8c43e753 100644
--- a/util/auto_tune_compressor.h
+++ b/util/auto_tune_compressor.h
@@ -24,8 +24,8 @@ class CompressionRejectionProbabilityPredictor {
         compressed_count_(0),
         window_size_(window_size) {}
   int Predict() const;
-  bool Record(Slice uncompressed_block_data, std::string* compressed_output,
-              const CompressionOptions& opts);
+  bool Record(Slice uncompressed_block_data, char* compressed_output,
+              size_t compressed_output_size, CompressionType compression_type);
   size_t attempted_compression_count() const;
 
  protected:
@@ -64,7 +64,8 @@ class AutoSkipCompressorWrapper : public CompressorWrapper {
   explicit AutoSkipCompressorWrapper(std::unique_ptr<Compressor> compressor,
                                      const CompressionOptions& opts);
 
-  Status CompressBlock(Slice uncompressed_data, std::string* compressed_output,
+  Status CompressBlock(Slice uncompressed_data, char* compressed_output,
+                       size_t* compressed_output_size,
                        CompressionType* out_compression_type,
                        ManagedWorkingArea* wa) override;
   ManagedWorkingArea ObtainWorkingArea() override;
@@ -72,7 +73,8 @@ class AutoSkipCompressorWrapper : public CompressorWrapper {
 
  private:
   Status CompressBlockAndRecord(Slice uncompressed_data,
-                                std::string* compressed_output,
+                                char* compressed_output,
+                                size_t* compressed_output_size,
                                 CompressionType* out_compression_type,
                                 AutoSkipWorkingArea* wa);
   static constexpr int kExplorationPercentage = 10;
@@ -154,7 +156,8 @@ class CostAwareCompressor : public Compressor {
   std::unique_ptr<Compressor> MaybeCloneSpecialized(
       CacheEntryRole block_type, DictSampleArgs&& dict_samples) override;
 
-  Status CompressBlock(Slice uncompressed_data, std::string* compressed_output,
+  Status CompressBlock(Slice uncompressed_data, char* compressed_output,
+                       size_t* compressed_output_size,
                        CompressionType* out_compression_type,
                        ManagedWorkingArea* wa) override;
   void ReleaseWorkingArea(WorkingArea* wa) override;
@@ -163,7 +166,8 @@ class CostAwareCompressor : public Compressor {
   Status CompressBlockAndRecord(size_t choosen_compression_type,
                                 size_t compresion_level_ptr,
                                 Slice uncompressed_data,
-                                std::string* compressed_output,
+                                char* compressed_output,
+                                size_t* compressed_output_size,
                                 CompressionType* out_compression_type,
                                 CostAwareWorkingArea* wa);
   static constexpr int kExplorationPercentage = 10;
diff --git a/util/compression.cc b/util/compression.cc
index 16177f09ce45..f259bc947815 100644
--- a/util/compression.cc
+++ b/util/compression.cc
@@ -154,19 +154,28 @@ const Slice& Decompressor::GetSerializedDict() const {
 
 namespace {
 
-class BuiltinCompressorV1 : public Compressor {
+class CompressorBase : public Compressor {
+ public:
+  explicit CompressorBase(const CompressionOptions& opts) : opts_(opts) {}
+
+ protected:
+  CompressionOptions opts_;
+};
+
+class BuiltinCompressorV1 : public CompressorBase {
  public:
   const char* Name() const override { return "BuiltinCompressorV1"; }
 
   explicit BuiltinCompressorV1(const CompressionOptions& opts,
                                CompressionType type)
-      : opts_(opts), type_(type) {
+      : CompressorBase(opts), type_(type) {
     assert(type != kNoCompression);
   }
 
   CompressionType GetPreferredCompressionType() const override { return type_; }
 
-  Status CompressBlock(Slice uncompressed_data, std::string* compressed_output,
+  Status CompressBlock(Slice uncompressed_data, char* compressed_output,
+                       size_t* compressed_output_size,
                        CompressionType* out_compression_type,
                        ManagedWorkingArea* wa) override {
     std::optional<CompressionContext> tmp_ctx;
@@ -179,47 +188,696 @@ class BuiltinCompressorV1 : public Compressor {
       ctx = &*tmp_ctx;
     }
     CompressionInfo info(opts_, *ctx, CompressionDict::GetEmptyDict(), type_);
+    std::string str_output;
+    str_output.reserve(uncompressed_data.size());
     if (!OLD_CompressData(uncompressed_data, info,
-                          1 /*compress_format_version*/, compressed_output)) {
+                          1 /*compress_format_version*/, &str_output)) {
+      // Maybe rejected or bypassed
+      *compressed_output_size = str_output.size();
       *out_compression_type = kNoCompression;
       return Status::OK();
     }
+    if (str_output.size() > *compressed_output_size) {
+      // Compression rejected
+      *out_compression_type = kNoCompression;
+      return Status::OK();
+    }
+    std::memcpy(compressed_output, str_output.data(), str_output.size());
+    *compressed_output_size = str_output.size();
     *out_compression_type = type_;
     return Status::OK();
   }
 
  protected:
-  const CompressionOptions opts_;
   const CompressionType type_;
 };
 
-class BuiltinCompressorV2 : public Compressor {
+class CompressorWithSimpleDictBase : public CompressorBase {
  public:
-  const char* Name() const override { return "BuiltinCompressorV2"; }
+  explicit CompressorWithSimpleDictBase(const CompressionOptions& opts,
+                                        std::string&& dict_data = {})
+      : CompressorBase(opts), dict_data_(std::move(dict_data)) {}
 
-  explicit BuiltinCompressorV2(const CompressionOptions& opts,
-                               CompressionType type,
-                               CompressionDict&& dict = {})
-      : opts_(opts), type_(type), dict_(std::move(dict)) {
-    assert(type != kNoCompression);
+  size_t GetMaxSampleSizeIfWantDict(
+      CacheEntryRole /*block_type*/) const override {
+    return opts_.max_dict_bytes;
+  }
+
+  // NOTE: empty dict is equivalent to no dict
+  Slice GetSerializedDict() const override { return dict_data_; }
+
+  std::unique_ptr<Compressor> MaybeCloneSpecialized(
+      CacheEntryRole /*block_type*/,
+      DictSampleArgs&& dict_samples) final override {
+    assert(dict_samples.Verify());
+    if (dict_samples.empty()) {
+      // Nothing to specialize on
+      return nullptr;
+    } else {
+      return CloneForDict(std::move(dict_samples.sample_data));
+    }
+  }
+
+  virtual std::unique_ptr<Compressor> CloneForDict(std::string&& dict_data) = 0;
+
+ protected:
+  const std::string dict_data_;
+};
+
+// NOTE: the legacy behavior is to pretend to use dictionary compression when
+// enabled, including storing a dictionary block, but to ignore it. That is
+// matched here.
+class BuiltinSnappyCompressorV2 : public CompressorWithSimpleDictBase {
+ public:
+  using CompressorWithSimpleDictBase::CompressorWithSimpleDictBase;
+
+  const char* Name() const override { return "BuiltinSnappyCompressorV2"; }
+
+  CompressionType GetPreferredCompressionType() const override {
+    return kSnappyCompression;
+  }
+
+  std::unique_ptr<Compressor> CloneForDict(std::string&& dict_data) override {
+    return std::make_unique<BuiltinSnappyCompressorV2>(opts_,
+                                                       std::move(dict_data));
+  }
+
+  Status CompressBlock(Slice uncompressed_data, char* compressed_output,
+                       size_t* compressed_output_size,
+                       CompressionType* out_compression_type,
+                       ManagedWorkingArea*) override {
+#ifdef SNAPPY
+    struct MySink : public snappy::Sink {
+      MySink(char* output, size_t output_size)
+          : output_(output), output_size_(output_size) {}
+
+      char* output_;
+      size_t output_size_;
+      size_t pos_ = 0;
+
+      void Append(const char* data, size_t n) override {
+        if (pos_ + n <= output_size_) {
+          std::memcpy(output_ + pos_, data, n);
+          pos_ += n;
+        } else {
+          // Virtual abort
+          pos_ = output_size_ + 1;
+        }
+      }
+
+      char* GetAppendBuffer(size_t length, char* scratch) override {
+        if (pos_ + length <= output_size_) {
+          return output_ + pos_;
+        }
+        return scratch;
+      }
+    };
+    MySink sink{compressed_output, *compressed_output_size};
+    snappy::ByteArraySource source{uncompressed_data.data(),
+                                   uncompressed_data.size()};
+
+    size_t outlen = snappy::Compress(&source, &sink);
+    if (outlen > 0 && sink.pos_ <= sink.output_size_) {
+      // Compression kept/successful
+      assert(outlen == sink.pos_);
+      *compressed_output_size = outlen;
+      *out_compression_type = kSnappyCompression;
+      return Status::OK();
+    }
+    // Compression rejected
+    *compressed_output_size = 1;
+#else
+    (void)uncompressed_data;
+    (void)compressed_output;
+    // Compression bypassed (not supported)
+    *compressed_output_size = 0;
+#endif
+    *out_compression_type = kNoCompression;
+    return Status::OK();
+  }
+
+  std::shared_ptr<Decompressor> GetOptimizedDecompressor() const override;
+};
+
+[[maybe_unused]]
+std::pair<char*, size_t> StartCompressBlockV2(Slice uncompressed_data,
+                                              char* compressed_output,
+                                              size_t compressed_output_size) {
+  if (  // Can't compress more than 4GB
+      uncompressed_data.size() > std::numeric_limits<uint32_t>::max() ||
+      // Need enough output space for encoding uncompressed size
+      compressed_output_size <= 5) {
+    // Compression bypassed
+    return {nullptr, 0};
+  }
+  // Standard format for prepending uncompressed size to the compressed
+  // data in compress_format_version=2
+  char* alg_output = EncodeVarint32(
+      compressed_output, static_cast<uint32_t>(uncompressed_data.size()));
+  size_t alg_max_output_size =
+      compressed_output_size - (alg_output - compressed_output);
+  return {alg_output, alg_max_output_size};
+}
+
+class BuiltinZlibCompressorV2 : public CompressorWithSimpleDictBase {
+ public:
+  using CompressorWithSimpleDictBase::CompressorWithSimpleDictBase;
+
+  const char* Name() const override { return "BuiltinZlibCompressorV2"; }
+
+  CompressionType GetPreferredCompressionType() const override {
+    return kZlibCompression;
+  }
+
+  std::unique_ptr<Compressor> CloneForDict(std::string&& dict_data) override {
+    return std::make_unique<BuiltinZlibCompressorV2>(opts_,
+                                                     std::move(dict_data));
+  }
+
+  Status CompressBlock(Slice uncompressed_data, char* compressed_output,
+                       size_t* compressed_output_size,
+                       CompressionType* out_compression_type,
+                       ManagedWorkingArea*) override {
+#ifdef ZLIB
+    auto [alg_output, alg_max_output_size] = StartCompressBlockV2(
+        uncompressed_data, compressed_output, *compressed_output_size);
+    if (alg_max_output_size == 0) {
+      // Compression bypassed
+      *compressed_output_size = 0;
+      *out_compression_type = kNoCompression;
+      return Status::OK();
+    }
+
+    // The memLevel parameter specifies how much memory should be allocated for
+    // the internal compression state.
+    // memLevel=1 uses minimum memory but is slow and reduces compression ratio.
+    // memLevel=9 uses maximum memory for optimal speed.
+    // The default value is 8. See zconf.h for more details.
+    static const int memLevel = 8;
+    int level = opts_.level;
+    if (level == CompressionOptions::kDefaultCompressionLevel) {
+      level = Z_DEFAULT_COMPRESSION;
+    }
+
+    z_stream stream;
+    memset(&stream, 0, sizeof(z_stream));
+
+    // Initialize the zlib stream
+    int st = deflateInit2(&stream, level, Z_DEFLATED, opts_.window_bits,
+                          memLevel, opts_.strategy);
+    if (st != Z_OK) {
+      *compressed_output_size = 0;
+      *out_compression_type = kNoCompression;
+      return Status::OK();
+    }
+
+    // Set dictionary if available
+    if (!dict_data_.empty()) {
+      st = deflateSetDictionary(
+          &stream, reinterpret_cast<const Bytef*>(dict_data_.data()),
+          static_cast<unsigned int>(dict_data_.size()));
+      if (st != Z_OK) {
+        deflateEnd(&stream);
+        *compressed_output_size = 0;
+        *out_compression_type = kNoCompression;
+        return Status::OK();
+      }
+    }
+
+    // Set up input
+    stream.next_in = (Bytef*)uncompressed_data.data();
+    stream.avail_in = static_cast<unsigned int>(uncompressed_data.size());
+
+    // Set up output
+    stream.next_out = reinterpret_cast<Bytef*>(alg_output);
+    stream.avail_out = static_cast<unsigned int>(alg_max_output_size);
+
+    // Compress
+    st = deflate(&stream, Z_FINISH);
+    size_t outlen = alg_max_output_size - stream.avail_out;
+    deflateEnd(&stream);
+
+    if (st == Z_STREAM_END) {
+      // Compression kept/successful
+      *compressed_output_size =
+          outlen + /*header size*/ (alg_output - compressed_output);
+      *out_compression_type = kZlibCompression;
+      return Status::OK();
+    }
+    // Compression failed or rejected
+    *compressed_output_size = 1;
+#else
+    (void)uncompressed_data;
+    (void)compressed_output;
+    // Compression bypassed (not supported)
+    *compressed_output_size = 0;
+#endif
+    *out_compression_type = kNoCompression;
+    return Status::OK();
+  }
+};
+
+class BuiltinBZip2CompressorV2 : public CompressorWithSimpleDictBase {
+ public:
+  using CompressorWithSimpleDictBase::CompressorWithSimpleDictBase;
+
+  const char* Name() const override { return "BuiltinBZip2CompressorV2"; }
+
+  CompressionType GetPreferredCompressionType() const override {
+    return kBZip2Compression;
+  }
+
+  std::unique_ptr<Compressor> CloneForDict(std::string&& dict_data) override {
+    return std::make_unique<BuiltinBZip2CompressorV2>(opts_,
+                                                      std::move(dict_data));
+  }
+
+  Status CompressBlock(Slice uncompressed_data, char* compressed_output,
+                       size_t* compressed_output_size,
+                       CompressionType* out_compression_type,
+                       ManagedWorkingArea*) override {
+#ifdef BZIP2
+    auto [alg_output, alg_max_output_size] = StartCompressBlockV2(
+        uncompressed_data, compressed_output, *compressed_output_size);
+    if (alg_max_output_size == 0) {
+      // Compression bypassed
+      *compressed_output_size = 0;
+      *out_compression_type = kNoCompression;
+      return Status::OK();
+    }
+
+    // BZip2 doesn't actually use the dictionary, but we store it for
+    // compatibility similar to BuiltinSnappyCompressorV2
+
+    // Initialize the bzip2 stream
+    bz_stream stream;
+    memset(&stream, 0, sizeof(bz_stream));
+
+    // Block size 1 is 100K.
+    // 0 is for silent.
+    // 30 is the default workFactor
+    int st = BZ2_bzCompressInit(&stream, 1, 0, 30);
+    if (st != BZ_OK) {
+      *compressed_output_size = 0;
+      *out_compression_type = kNoCompression;
+      return Status::OK();
+    }
+
+    // Set up input
+    stream.next_in = const_cast<char*>(uncompressed_data.data());
+    stream.avail_in = static_cast<unsigned int>(uncompressed_data.size());
+
+    // Set up output
+    stream.next_out = alg_output;
+    stream.avail_out = static_cast<unsigned int>(alg_max_output_size);
+
+    // Compress
+    st = BZ2_bzCompress(&stream, BZ_FINISH);
+    size_t outlen = alg_max_output_size - stream.avail_out;
+    BZ2_bzCompressEnd(&stream);
+
+    // Check for success
+    if (st == BZ_STREAM_END) {
+      // Compression kept/successful
+      *compressed_output_size = outlen + (alg_output - compressed_output);
+      *out_compression_type = kBZip2Compression;
+      return Status::OK();
+    }
+    // Compression failed or rejected
+    *compressed_output_size = 1;
+#else
+    (void)uncompressed_data;
+    (void)compressed_output;
+    // Compression bypassed (not supported)
+    *compressed_output_size = 0;
+#endif
+    *out_compression_type = kNoCompression;
+    return Status::OK();
+  }
+};
+
+class BuiltinLZ4CompressorV2 : public CompressorWithSimpleDictBase {
+ public:
+  using CompressorWithSimpleDictBase::CompressorWithSimpleDictBase;
+
+  const char* Name() const override { return "BuiltinLZ4CompressorV2"; }
+
+  CompressionType GetPreferredCompressionType() const override {
+    return kLZ4Compression;
+  }
+
+  std::unique_ptr<Compressor> CloneForDict(std::string&& dict_data) override {
+    return std::make_unique<BuiltinLZ4CompressorV2>(opts_,
+                                                    std::move(dict_data));
+  }
+
+  ManagedWorkingArea ObtainWorkingArea() override {
+#ifdef LZ4
+    return {reinterpret_cast<WorkingArea*>(LZ4_createStream()), this};
+#else
+    return {};
+#endif
+  }
+  void ReleaseWorkingArea(WorkingArea* wa) override {
+    if (wa) {
+#ifdef LZ4
+      LZ4_freeStream(reinterpret_cast<LZ4_stream_t*>(wa));
+#endif
+    }
+  }
+
+  Status CompressBlock(Slice uncompressed_data, char* compressed_output,
+                       size_t* compressed_output_size,
+                       CompressionType* out_compression_type,
+                       ManagedWorkingArea* wa) override {
+#ifdef LZ4
+    auto [alg_output, alg_max_output_size] = StartCompressBlockV2(
+        uncompressed_data, compressed_output, *compressed_output_size);
+    if (alg_max_output_size == 0) {
+      // Compression bypassed
+      *compressed_output_size = 0;
+      *out_compression_type = kNoCompression;
+      return Status::OK();
+    }
+
+    ManagedWorkingArea tmp_wa;
+    LZ4_stream_t* stream;
+    if (wa != nullptr && wa->owner() == this) {
+      stream = reinterpret_cast<LZ4_stream_t*>(wa->get());
+#if LZ4_VERSION_NUMBER >= 10900  // >= version 1.9.0
+      LZ4_resetStream_fast(stream);
+#else
+      LZ4_resetStream(stream);
+#endif
+    } else {
+      tmp_wa = ObtainWorkingArea();
+      stream = reinterpret_cast<LZ4_stream_t*>(tmp_wa.get());
+    }
+    if (!dict_data_.empty()) {
+      // TODO: more optimization possible here?
+      LZ4_loadDict(stream, dict_data_.data(),
+                   static_cast<int>(dict_data_.size()));
+    }
+    int acceleration;
+    if (opts_.level < 0) {
+      acceleration = -opts_.level;
+    } else {
+      acceleration = 1;
+    }
+    auto outlen = LZ4_compress_fast_continue(
+        stream, uncompressed_data.data(), alg_output,
+        static_cast<int>(uncompressed_data.size()),
+        static_cast<int>(alg_max_output_size), acceleration);
+    if (outlen > 0) {
+      // Compression kept/successful
+      size_t output_size = static_cast<size_t>(
+          outlen + /*header size*/ (alg_output - compressed_output));
+      assert(output_size <= *compressed_output_size);
+      *compressed_output_size = output_size;
+      *out_compression_type = kLZ4Compression;
+      return Status::OK();
+    }
+    // Compression rejected
+    *compressed_output_size = 1;
+#else
+    (void)uncompressed_data;
+    (void)compressed_output;
+    (void)wa;
+    // Compression bypassed (not supported)
+    *compressed_output_size = 0;
+#endif
+    *out_compression_type = kNoCompression;
+    return Status::OK();
+  }
+};
+
+class BuiltinLZ4HCCompressorV2 : public CompressorWithSimpleDictBase {
+ public:
+  using CompressorWithSimpleDictBase::CompressorWithSimpleDictBase;
+
+  const char* Name() const override { return "BuiltinLZ4HCCompressorV2"; }
+
+  CompressionType GetPreferredCompressionType() const override {
+    return kLZ4HCCompression;
+  }
+
+  std::unique_ptr<Compressor> CloneForDict(std::string&& dict_data) override {
+    return std::make_unique<BuiltinLZ4HCCompressorV2>(opts_,
+                                                      std::move(dict_data));
+  }
+
+  ManagedWorkingArea ObtainWorkingArea() override {
+#ifdef LZ4
+    return {reinterpret_cast<WorkingArea*>(LZ4_createStreamHC()), this};
+#else
+    return {};
+#endif
+  }
+  void ReleaseWorkingArea(WorkingArea* wa) override {
+    if (wa) {
+#ifdef LZ4
+      LZ4_freeStreamHC(reinterpret_cast<LZ4_streamHC_t*>(wa));
+#endif
+    }
   }
 
+  Status CompressBlock(Slice uncompressed_data, char* compressed_output,
+                       size_t* compressed_output_size,
+                       CompressionType* out_compression_type,
+                       ManagedWorkingArea* wa) override {
+#ifdef LZ4
+    auto [alg_output, alg_max_output_size] = StartCompressBlockV2(
+        uncompressed_data, compressed_output, *compressed_output_size);
+    if (alg_max_output_size == 0) {
+      // Compression bypassed
+      *compressed_output_size = 0;
+      *out_compression_type = kNoCompression;
+      return Status::OK();
+    }
+
+    int level = opts_.level;
+    if (level == CompressionOptions::kDefaultCompressionLevel) {
+      level = 0;  // lz4hc.h says any value < 1 will be sanitized to default
+    }
+
+    ManagedWorkingArea tmp_wa;
+    LZ4_streamHC_t* stream;
+    if (wa != nullptr && wa->owner() == this) {
+      stream = reinterpret_cast<LZ4_streamHC_t*>(wa->get());
+    } else {
+      tmp_wa = ObtainWorkingArea();
+      stream = reinterpret_cast<LZ4_streamHC_t*>(tmp_wa.get());
+    }
+#if LZ4_VERSION_NUMBER >= 10900  // >= version 1.9.0
+    LZ4_resetStreamHC_fast(stream, level);
+#else
+    LZ4_resetStreamHC(stream, level);
+#endif
+    if (dict_data_.size() > 0) {
+      // TODO: more optimization possible here?
+      LZ4_loadDictHC(stream, dict_data_.data(),
+                     static_cast<int>(dict_data_.size()));
+    }
+
+    auto outlen =
+        LZ4_compress_HC_continue(stream, uncompressed_data.data(), alg_output,
+                                 static_cast<int>(uncompressed_data.size()),
+                                 static_cast<int>(alg_max_output_size));
+    if (outlen > 0) {
+      // Compression kept/successful
+      size_t output_size = static_cast<size_t>(
+          outlen + /*header size*/ (alg_output - compressed_output));
+      assert(output_size <= *compressed_output_size);
+      *compressed_output_size = output_size;
+      *out_compression_type = kLZ4HCCompression;
+      return Status::OK();
+    }
+    // Compression rejected
+    *compressed_output_size = 1;
+#else
+    (void)uncompressed_data;
+    (void)compressed_output;
+    (void)wa;
+    // Compression bypassed (not supported)
+    *compressed_output_size = 0;
+#endif
+    *out_compression_type = kNoCompression;
+    return Status::OK();
+  }
+};
+
+class BuiltinXpressCompressorV2 : public CompressorWithSimpleDictBase {
+ public:
+  using CompressorWithSimpleDictBase::CompressorWithSimpleDictBase;
+
+  const char* Name() const override { return "BuiltinXpressCompressorV2"; }
+
+  CompressionType GetPreferredCompressionType() const override {
+    return kXpressCompression;
+  }
+
+  std::unique_ptr<Compressor> CloneForDict(std::string&& dict_data) override {
+    return std::make_unique<BuiltinXpressCompressorV2>(opts_,
+                                                       std::move(dict_data));
+  }
+
+  Status CompressBlock(Slice uncompressed_data, char* compressed_output,
+                       size_t* compressed_output_size,
+                       CompressionType* out_compression_type,
+                       ManagedWorkingArea*) override {
+#ifdef XPRESS
+    // XPRESS doesn't actually use the dictionary, but we store it for
+    // compatibility similar to BuiltinSnappyCompressorV2
+
+    // Use the new CompressWithMaxSize function that writes directly to the
+    // output buffer
+    size_t compressed_size = port::xpress::CompressWithMaxSize(
+        uncompressed_data.data(), uncompressed_data.size(), compressed_output,
+        *compressed_output_size);
+
+    if (compressed_size > 0) {
+      // Compression kept/successful
+      *compressed_output_size = compressed_size;
+      *out_compression_type = kXpressCompression;
+      return Status::OK();
+    }
+
+    // Compression rejected or failed
+    *compressed_output_size = 1;
+#else
+    (void)uncompressed_data;
+    (void)compressed_output;
+    // Compression bypassed (not supported)
+    *compressed_output_size = 0;
+#endif
+    *out_compression_type = kNoCompression;
+    return Status::OK();
+  }
+};
+
+class BuiltinZSTDCompressorV2 : public CompressorBase {
+ public:
+  explicit BuiltinZSTDCompressorV2(const CompressionOptions& opts,
+                                   CompressionDict&& dict = {})
+      : CompressorBase(opts), dict_(std::move(dict)) {}
+
+  const char* Name() const override { return "BuiltinZSTDCompressorV2"; }
+
+  CompressionType GetPreferredCompressionType() const override { return kZSTD; }
+
   size_t GetMaxSampleSizeIfWantDict(
       CacheEntryRole /*block_type*/) const override {
     if (opts_.max_dict_bytes == 0) {
       // Dictionary compression disabled
       return 0;
     } else {
-      return type_ == kZSTD && opts_.zstd_max_train_bytes > 0
-                 ? opts_.zstd_max_train_bytes
-                 : opts_.max_dict_bytes;
+      return opts_.zstd_max_train_bytes > 0 ? opts_.zstd_max_train_bytes
+                                            : opts_.max_dict_bytes;
     }
   }
 
   // NOTE: empty dict is equivalent to no dict
   Slice GetSerializedDict() const override { return dict_.GetRawDict(); }
 
-  CompressionType GetPreferredCompressionType() const override { return type_; }
+  ManagedWorkingArea ObtainWorkingArea() override {
+#ifdef ZSTD
+    ZSTD_CCtx* ctx =
+#ifdef ROCKSDB_ZSTD_CUSTOM_MEM
+        ZSTD_createCCtx_advanced(port::GetJeZstdAllocationOverrides());
+#else   // ROCKSDB_ZSTD_CUSTOM_MEM
+        ZSTD_createCCtx();
+#endif  // ROCKSDB_ZSTD_CUSTOM_MEM
+    auto level = opts_.level;
+    if (level == CompressionOptions::kDefaultCompressionLevel) {
+      // NB: ZSTD_CLEVEL_DEFAULT is historically == 3
+      level = ZSTD_CLEVEL_DEFAULT;
+    }
+    size_t err = ZSTD_CCtx_setParameter(ctx, ZSTD_c_compressionLevel, level);
+    if (ZSTD_isError(err)) {
+      assert(false);
+      ZSTD_freeCCtx(ctx);
+      ctx = ZSTD_createCCtx();
+    }
+    if (opts_.checksum) {
+      err = ZSTD_CCtx_setParameter(ctx, ZSTD_c_checksumFlag, 1);
+      if (ZSTD_isError(err)) {
+        assert(false);
+        ZSTD_freeCCtx(ctx);
+        ctx = ZSTD_createCCtx();
+      }
+    }
+    return ManagedWorkingArea(reinterpret_cast<WorkingArea*>(ctx), this);
+#else
+    return {};
+#endif  // ZSTD
+  }
+
+  void ReleaseWorkingArea(WorkingArea* wa) override {
+    if (wa) {
+#ifdef ZSTD
+      ZSTD_freeCCtx(reinterpret_cast<ZSTD_CCtx*>(wa));
+#endif  // ZSTD
+    }
+  }
+
+  Status CompressBlock(Slice uncompressed_data, char* compressed_output,
+                       size_t* compressed_output_size,
+                       CompressionType* out_compression_type,
+                       ManagedWorkingArea* wa) override {
+#ifdef ZSTD
+    auto [alg_output, alg_max_output_size] = StartCompressBlockV2(
+        uncompressed_data, compressed_output, *compressed_output_size);
+    if (alg_max_output_size == 0) {
+      // Compression bypassed
+      *compressed_output_size = 0;
+      *out_compression_type = kNoCompression;
+      return Status::OK();
+    }
+
+    ManagedWorkingArea tmp_wa;
+    if (wa == nullptr || wa->owner() != this) {
+      tmp_wa = ObtainWorkingArea();
+      wa = &tmp_wa;
+    }
+    assert(wa->get() != nullptr);
+    ZSTD_CCtx* ctx = reinterpret_cast<ZSTD_CCtx*>(wa->get());
+
+    if (dict_.GetDigestedZstdCDict() != nullptr) {
+      ZSTD_CCtx_refCDict(ctx, dict_.GetDigestedZstdCDict());
+    } else {
+      ZSTD_CCtx_loadDictionary(ctx, dict_.GetRawDict().data(),
+                               dict_.GetRawDict().size());
+    }
+
+    // Compression level is set in `contex` during ObtainWorkingArea()
+    size_t outlen =
+        ZSTD_compress2(ctx, alg_output, alg_max_output_size,
+                       uncompressed_data.data(), uncompressed_data.size());
+    if (!ZSTD_isError(outlen)) {
+      // Compression kept/successful
+      size_t output_size = static_cast<size_t>(
+          outlen + /*header size*/ (alg_output - compressed_output));
+      assert(output_size <= *compressed_output_size);
+      *compressed_output_size = output_size;
+      *out_compression_type = kZSTD;
+      return Status::OK();
+    }
+    if (ZSTD_getErrorCode(outlen) != ZSTD_error_dstSize_tooSmall) {
+      return Status::Corruption(std::string("ZSTD_compress2 failed: ") +
+                                ZSTD_getErrorName(outlen));
+    }
+    // Compression rejected
+    *compressed_output_size = 1;
+#else
+    (void)uncompressed_data;
+    (void)compressed_output;
+    (void)wa;
+    // Compression bypassed (not supported)
+    *compressed_output_size = 0;
+#endif
+    *out_compression_type = kNoCompression;
+    return Status::OK();
+  }
 
   std::unique_ptr<Compressor> MaybeCloneSpecialized(
       CacheEntryRole /*block_type*/, DictSampleArgs&& dict_samples) override {
@@ -230,7 +888,7 @@ class BuiltinCompressorV2 : public Compressor {
     }
     std::string dict_data;
     // Migrated from BlockBasedTableBuilder::EnterUnbuffered()
-    if (type_ == kZSTD && opts_.zstd_max_train_bytes > 0) {
+    if (opts_.zstd_max_train_bytes > 0) {
       assert(dict_samples.sample_data.size() <= opts_.zstd_max_train_bytes);
       if (opts_.use_zstd_dict_trainer) {
         dict_data = ZSTD_TrainDictionary(dict_samples.sample_data,
@@ -247,43 +905,13 @@ class BuiltinCompressorV2 : public Compressor {
       // dictionary." Or similar for other compressions.
       dict_data = std::move(dict_samples.sample_data);
     }
-    CompressionDict dict{std::move(dict_data), type_, opts_.level};
-    return std::make_unique<BuiltinCompressorV2>(opts_, type_, std::move(dict));
+    CompressionDict dict{std::move(dict_data), kZSTD, opts_.level};
+    return std::make_unique<BuiltinZSTDCompressorV2>(opts_, std::move(dict));
   }
 
-  // TODO: use ZSTD_CCtx directly
-  ManagedWorkingArea ObtainWorkingArea() override {
-    return ManagedWorkingArea(new CompressionContext(type_, opts_), this);
-  }
-  void ReleaseWorkingArea(WorkingArea* wa) override {
-    delete static_cast<CompressionContext*>(wa);
-  }
-  Status CompressBlock(Slice uncompressed_data, std::string* compressed_output,
-                       CompressionType* out_compression_type,
-                       ManagedWorkingArea* wa) override {
-    std::optional<CompressionContext> tmp_ctx;
-    CompressionContext* ctx = nullptr;
-    if (wa != nullptr && wa->owner() == this) {
-      ctx = static_cast<CompressionContext*>(wa->get());
-    }
-    CompressionType type = type_;
-    if (ctx == nullptr) {
-      tmp_ctx.emplace(type, opts_);
-      ctx = &*tmp_ctx;
-    }
-    CompressionInfo info(opts_, *ctx, dict_, type);
-    if (!OLD_CompressData(uncompressed_data, info,
-                          2 /*compress_format_version*/, compressed_output)) {
-      *out_compression_type = kNoCompression;
-      return Status::OK();
-    }
-    *out_compression_type = type;
-    return Status::OK();
-  }
+  std::shared_ptr<Decompressor> GetOptimizedDecompressor() const override;
 
  protected:
-  const CompressionOptions opts_;
-  const CompressionType type_;
   const CompressionDict dict_;
 };
 
@@ -480,7 +1108,6 @@ Status LZ4_DecompressBlock(const Decompressor::Args& args, Slice dict,
                            char* uncompressed_output) {
 #ifdef LZ4
   int expected_uncompressed_size = static_cast<int>(args.uncompressed_size);
-#if LZ4_VERSION_NUMBER >= 10400  // r124+
   LZ4_streamDecode_t* stream = LZ4_createStreamDecode();
   if (!dict.empty()) {
     LZ4_setStreamDecode(stream, dict.data(), static_cast<int>(dict.size()));
@@ -490,16 +1117,6 @@ Status LZ4_DecompressBlock(const Decompressor::Args& args, Slice dict,
       static_cast<int>(args.compressed_data.size()),
       expected_uncompressed_size);
   LZ4_freeStreamDecode(stream);
-#else   // up to r123
-  if (!dict.empty()) {
-    return Status::NotSupported(
-        "This build doesn't support dictionary compression with LZ4");
-  }
-  int uncompressed_size =
-      LZ4_decompress_safe(args.compressed_data.data(), uncompressed_output,
-                          static_cast<int>(args.compressed_data.size()),
-                          expected_uncompressed_size);
-#endif  // LZ4_VERSION_NUMBER >= 10400
 
   if (uncompressed_size != expected_uncompressed_size) {
     if (uncompressed_size < 0) {
@@ -799,7 +1416,7 @@ class BuiltinDecompressorV2OptimizeZstd : public BuiltinDecompressorV2 {
 class BuiltinDecompressorV2OptimizeZstdWithDict
     : public BuiltinDecompressorV2OptimizeZstd {
  public:
-  BuiltinDecompressorV2OptimizeZstdWithDict(const Slice& dict)
+  explicit BuiltinDecompressorV2OptimizeZstdWithDict(const Slice& dict)
       :
 #ifdef ROCKSDB_ZSTD_DDICT
         dict_(dict),
@@ -875,14 +1492,29 @@ class BuiltinCompressionManagerV2 : public CompressionManager {
       // No acceptable compression ratio => no compression
       return nullptr;
     }
-    if (type > kLastBuiltinCompression) {
-      // Unrecognized; fall back on default compression
+    if (!SupportsCompressionType(type)) {
+      // Unrecognized or support not compiled in. Fall back on default
       type = ColumnFamilyOptions{}.compression;
     }
-    if (type == kNoCompression) {
-      return nullptr;
-    } else {
-      return std::make_unique<BuiltinCompressorV2>(opts, type);
+    switch (type) {
+      case kNoCompression:
+      default:
+        assert(type == kNoCompression);  // Others should be excluded above
+        return nullptr;
+      case kSnappyCompression:
+        return std::make_unique<BuiltinSnappyCompressorV2>(opts);
+      case kZlibCompression:
+        return std::make_unique<BuiltinZlibCompressorV2>(opts);
+      case kBZip2Compression:
+        return std::make_unique<BuiltinBZip2CompressorV2>(opts);
+      case kLZ4Compression:
+        return std::make_unique<BuiltinLZ4CompressorV2>(opts);
+      case kLZ4HCCompression:
+        return std::make_unique<BuiltinLZ4HCCompressorV2>(opts);
+      case kXpressCompression:
+        return std::make_unique<BuiltinXpressCompressorV2>(opts);
+      case kZSTD:
+        return std::make_unique<BuiltinZSTDCompressorV2>(opts);
     }
   }
 
@@ -913,20 +1545,6 @@ class BuiltinCompressionManagerV2 : public CompressionManager {
       return GetGeneralDecompressor();
     }
   }
-  std::shared_ptr<Decompressor> GetDecompressorForCompressor(
-      const Compressor& compressor) override {
-#ifdef ROCKSDB_USE_RTTI
-    // To be extra safe, only optimize here if we are certain we are not
-    // looking at a wrapped compressor, so that we are sure it only uses that
-    // one compression type.
-    if (dynamic_cast<const BuiltinCompressorV2*>(&compressor)) {
-      CompressionType type = compressor.GetPreferredCompressionType();
-      return GetDecompressorForTypes(&type, &type + 1);
-    }
-#endif
-    // Fallback
-    return CompressionManager::GetDecompressorForCompressor(compressor);
-  }
 
   bool SupportsCompressionType(CompressionType type) const override {
     return CompressionTypeSupported(type);
@@ -937,6 +1555,7 @@ class BuiltinCompressionManagerV2 : public CompressionManager {
   BuiltinDecompressorV2OptimizeZstd zstd_decompressor_;
   BuiltinDecompressorV2SnappyOnly snappy_decompressor_;
 
+ public:
   inline std::shared_ptr<Decompressor> GetGeneralDecompressor() {
     return std::shared_ptr<Decompressor>(shared_from_this(), &decompressor_);
   }
@@ -959,6 +1578,16 @@ const std::shared_ptr<BuiltinCompressionManagerV2>
     kBuiltinCompressionManagerV2 =
         std::make_shared<BuiltinCompressionManagerV2>();
 
+std::shared_ptr<Decompressor>
+BuiltinZSTDCompressorV2::GetOptimizedDecompressor() const {
+  return kBuiltinCompressionManagerV2->GetZstdDecompressor();
+}
+
+std::shared_ptr<Decompressor>
+BuiltinSnappyCompressorV2::GetOptimizedDecompressor() const {
+  return kBuiltinCompressionManagerV2->GetSnappyDecompressor();
+}
+
 }  // namespace
 
 Status CompressionManager::CreateFromString(
diff --git a/util/compression.h b/util/compression.h
index 8c613b2f373d..c99bbba4d0d9 100644
--- a/util/compression.h
+++ b/util/compression.h
@@ -34,6 +34,7 @@
 #include "util/string_util.h"
 
 #ifdef SNAPPY
+#include <snappy-sinksource.h>
 #include <snappy.h>
 #endif
 
@@ -48,10 +49,14 @@
 #if defined(LZ4)
 #include <lz4.h>
 #include <lz4hc.h>
+#if LZ4_VERSION_NUMBER < 10700  // < r129
+#error "LZ4 support requires version >= 1.7.0 (lz4-devel)"
+#endif
 #endif
 
 #ifdef ZSTD
 #include <zstd.h>
+#include <zstd_errors.h>
 // ZSTD_Compress2(), ZSTD_compressStream2() and frame parameters all belong to
 // advanced APIs and require v1.4.0+, which is from April 2019.
 // https://github.com/facebook/zstd/blob/eb9f881eb810f2242f1ef36b3f3e7014eecb8fa6/lib/zstd.h#L297C40-L297C45
diff --git a/util/compression_test.cc b/util/compression_test.cc
index 176179ff704f..b51c872f1452 100644
--- a/util/compression_test.cc
+++ b/util/compression_test.cc
@@ -1102,8 +1102,8 @@ TEST_F(DBCompressionTest, CompressionManagerWrapper) {
     using CompressorWrapper::CompressorWrapper;
     const char* Name() const override { return "MyCompressor"; }
 
-    Status CompressBlock(Slice uncompressed_data,
-                         std::string* compressed_output,
+    Status CompressBlock(Slice uncompressed_data, char* compressed_output,
+                         size_t* compressed_output_size,
                          CompressionType* out_compression_type,
                          ManagedWorkingArea* working_area) override {
       auto begin = uncompressed_data.data();
@@ -1111,16 +1111,18 @@ TEST_F(DBCompressionTest, CompressionManagerWrapper) {
       if (std::search(begin, end, kDoNotCompress.begin(),
                       kDoNotCompress.end()) != end) {
         // Do not attempt compression
+        *compressed_output_size = 0;
         EXPECT_EQ(*out_compression_type, kNoCompression);
         return Status::OK();
       } else if (std::search(begin, end, kRejectCompression.begin(),
                              kRejectCompression.end()) != end) {
         // Simulate attempted & rejected compression
-        *compressed_output = "blah";
+        *compressed_output_size = 1;
         EXPECT_EQ(*out_compression_type, kNoCompression);
         return Status::OK();
       } else {
         return wrapped_->CompressBlock(uncompressed_data, compressed_output,
+                                       compressed_output_size,
                                        out_compression_type, working_area);
       }
     }
diff --git a/util/simple_mixed_compressor.cc b/util/simple_mixed_compressor.cc
index 4c952750cdeb..a1ee40481d8c 100644
--- a/util/simple_mixed_compressor.cc
+++ b/util/simple_mixed_compressor.cc
@@ -60,13 +60,15 @@ const char* RandomMixedCompressor::Name() const {
 }
 
 Status RandomMixedCompressor::CompressBlock(
-    Slice uncompressed_data, std::string* compressed_output,
-    CompressionType* out_compression_type, ManagedWorkingArea* wa) {
+    Slice uncompressed_data, char* compressed_output,
+    size_t* compressed_output_size, CompressionType* out_compression_type,
+    ManagedWorkingArea* wa) {
   auto selected =
       Random::GetTLSInstance()->Uniform(static_cast<int>(compressors_.size()));
   auto& compressor = compressors_[selected];
   return compressor->CompressBlock(uncompressed_data, compressed_output,
-                                   out_compression_type, wa);
+                                   compressed_output_size, out_compression_type,
+                                   wa);
 }
 
 const char* RandomMixedCompressionManager::Name() const {
@@ -85,13 +87,15 @@ const char* RoundRobinCompressor::Name() const {
 }
 
 Status RoundRobinCompressor::CompressBlock(
-    Slice uncompressed_data, std::string* compressed_output,
-    CompressionType* out_compression_type, ManagedWorkingArea* wa) {
+    Slice uncompressed_data, char* compressed_output,
+    size_t* compressed_output_size, CompressionType* out_compression_type,
+    ManagedWorkingArea* wa) {
   auto counter = block_counter.FetchAddRelaxed(1);
   auto sel_idx = counter % (compressors_.size());
   auto& compressor = compressors_[sel_idx];
   return compressor->CompressBlock(uncompressed_data, compressed_output,
-                                   out_compression_type, wa);
+                                   compressed_output_size, out_compression_type,
+                                   wa);
 }
 
 RelaxedAtomic<uint64_t> RoundRobinCompressor::block_counter{0};
diff --git a/util/simple_mixed_compressor.h b/util/simple_mixed_compressor.h
index 69c4cc1490dd..0c12d88a0ac3 100644
--- a/util/simple_mixed_compressor.h
+++ b/util/simple_mixed_compressor.h
@@ -34,7 +34,8 @@ class MultiCompressorWrapper : public Compressor {
 struct RandomMixedCompressor : public MultiCompressorWrapper {
   using MultiCompressorWrapper::MultiCompressorWrapper;
   const char* Name() const override;
-  Status CompressBlock(Slice uncompressed_data, std::string* compressed_output,
+  Status CompressBlock(Slice uncompressed_data, char* compressed_output,
+                       size_t* compressed_output_size,
                        CompressionType* out_compression_type,
                        ManagedWorkingArea* wa) override;
 };
@@ -50,7 +51,8 @@ class RandomMixedCompressionManager : public CompressionManagerWrapper {
 struct RoundRobinCompressor : public MultiCompressorWrapper {
   using MultiCompressorWrapper::MultiCompressorWrapper;
   const char* Name() const override;
-  Status CompressBlock(Slice uncompressed_data, std::string* compressed_output,
+  Status CompressBlock(Slice uncompressed_data, char* compressed_output,
+                       size_t* compressed_output_size,
                        CompressionType* out_compression_type,
                        ManagedWorkingArea* wa) override;
   static RelaxedAtomic<uint64_t> block_counter;

From ccd850fa56177d21514eff4db40c538036af06f4 Mon Sep 17 00:00:00 2001
From: Changyu Bi <changyubi@meta.com>
Date: Thu, 31 Jul 2025 13:28:17 -0700
Subject: [PATCH 208/500] Bug fix in MultiScan and stress test (#13822)

Summary:
Fix a bug in MultiScan where BlockBasedTableIterator should not return out-of-bound when the all blocks of the last scan are exhausted. This prevented LevelIterator from entering the next file so iterator is returning less keys than expected.

Also fixed stress testing to specify iterate_upper_bound correctly.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13822

Test Plan:
- the following fails quickly before this PR and finishes after this PR
```python3 tools/db_crashtest.py whitebox --iterpercent=60 --prefix_size=-1 --prefixpercent=0 --readpercent=0 --test_batches_snapshots=0 --use_multiscan=1 --seed=1 --fill_cache=1 --read_fault_one_in=0 --column_families=1 --allow_unprepared_value=0 --kill_random_test=88888```
- new unit test that fails before this PR

Reviewed By: krhancoc

Differential Revision: D79308957

Pulled By: cbi42

fbshipit-source-id: c9eafd1c8750b959b0185d7c63199b503493cbd2
---
 db/db_iterator_test.cc                        | 34 +++++++++++++++++
 db_stress_tool/db_stress_test_base.cc         |  6 ++-
 .../block_based/block_based_table_iterator.cc | 37 ++++++++++++++++---
 .../block_based/block_based_table_iterator.h  |  2 +
 unreleased_history/bug_fixes/multi-scan.md    |  1 +
 5 files changed, 73 insertions(+), 7 deletions(-)
 create mode 100644 unreleased_history/bug_fixes/multi-scan.md

diff --git a/db/db_iterator_test.cc b/db/db_iterator_test.cc
index d2b83e4ed89d..d5f1e1f43434 100644
--- a/db/db_iterator_test.cc
+++ b/db/db_iterator_test.cc
@@ -4325,6 +4325,40 @@ TEST_F(DBMultiScanIteratorTest, MixedBoundsTest) {
   }
   iter.reset();
 }
+
+TEST_F(DBMultiScanIteratorTest, RangeAcrossFiles) {
+  auto options = CurrentOptions();
+  options.target_file_size_base = 100 << 10;  // 20KB
+  options.compaction_style = kCompactionStyleUniversal;
+  options.num_levels = 50;
+  options.compression = kNoCompression;
+  DestroyAndReopen(options);
+
+  auto rnd = Random::GetTLSInstance();
+  // Write ~200KB data
+  for (int i = 0; i < 100; ++i) {
+    ASSERT_OK(Put(Key(i), rnd->RandomString(2 << 10)));
+  }
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(db_->CompactRange({}, nullptr, nullptr));
+  ASSERT_EQ(2, NumTableFilesAtLevel(49));
+  std::vector<std::string> key_ranges({Key(10), Key(90)});
+  ReadOptions ro;
+  std::vector<ScanOptions> scan_options(
+      {ScanOptions(key_ranges[0], key_ranges[1])});
+  ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily();
+  std::unique_ptr<MultiScan> iter =
+      dbfull()->NewMultiScan(ro, cfh, scan_options);
+  int i = 10;
+  for (auto range : *iter) {
+    for (auto it : range) {
+      ASSERT_EQ(it.first.ToString(), Key(i));
+      ++i;
+    }
+  }
+  ASSERT_EQ(i, 90);
+}
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc
index f5c2bc224234..4f88c72e7f0a 100644
--- a/db_stress_tool/db_stress_test_base.cc
+++ b/db_stress_tool/db_stress_test_base.cc
@@ -1681,6 +1681,9 @@ Status StressTest::TestMultiScan(ThreadState* thread,
   start_key_strs.reserve(num_scans);
   end_key_strs.reserve(num_scans);
 
+  // Will be initialized before Seek() below.
+  Slice ub;
+  ro.iterate_upper_bound = &ub;
   for (size_t i = 0; i < num_scans * 2; i += 2) {
     assert(rand_keys[i] <= rand_keys[i + 1]);
     start_key_strs.emplace_back(Key(rand_keys[i]));
@@ -1745,8 +1748,7 @@ Status StressTest::TestMultiScan(ThreadState* thread,
     assert(scan_opt.range.start);
     assert(scan_opt.range.limit);
     Slice key = scan_opt.range.start.value();
-    Slice ub = scan_opt.range.limit.value();
-    ro.iterate_upper_bound = &ub;
+    ub = scan_opt.range.limit.value();
 
     LastIterateOp last_op;
     iter->Seek(key);
diff --git a/table/block_based/block_based_table_iterator.cc b/table/block_based/block_based_table_iterator.cc
index dc23be9128d1..7ca2dbc896a1 100644
--- a/table/block_based/block_based_table_iterator.cc
+++ b/table/block_based/block_based_table_iterator.cc
@@ -932,6 +932,7 @@ void BlockBasedTableIterator::BlockCacheLookupForReadAheadSize(
 // ReadOptions::max_skippable_internal_keys or reseeking into range deletion
 // end key. So these Seeks can cause iterator to fall back to normal
 // (non-prepared) iterator and ignore the optimizations done in Prepare().
+// TODO: support fill_cache = false and when block cache is disabled.
 void BlockBasedTableIterator::Prepare(
     const std::vector<ScanOptions>* scan_opts) {
   index_iter_->Prepare(scan_opts);
@@ -1009,6 +1010,12 @@ void BlockBasedTableIterator::Prepare(
       index_iter_->Next();
       check_overlap = false;
     }
+
+    if (!index_iter_->status().ok()) {
+      // Abort: index iterator error
+      return;
+    }
+
     // Stop until index->key > limit
     // Include the current block since it can still contain keys <= limit
     if (index_iter_->Valid()) {
@@ -1019,13 +1026,14 @@ void BlockBasedTableIterator::Prepare(
         blocks_to_prepare.push_back(index_iter_->value().handle);
       }
       ++num_blocks;
-    }
-
-    if (!index_iter_->status().ok()) {
-      // Abort: index iterator error
+    } else if (num_blocks == 0) {
+      // We should not have scan ranges that are completely after the file's
+      // range. This is important for FindBlockForwardInMultiScan() which only
+      // lets the upper layer (LevelIterator) advance to the next SST file when
+      // the last scan range is exhausted.
       return;
     }
-
+    assert(num_blocks);
     block_ranges_per_scan.emplace_back(blocks_to_prepare.size() - num_blocks,
                                        blocks_to_prepare.size());
   }
@@ -1168,6 +1176,7 @@ void BlockBasedTableIterator::Prepare(
           // Abort: failed to create and pin block in cache
           return;
         }
+        assert(pinned_data_blocks_guard[block_idx].GetValue());
       }
     }
   }
@@ -1234,6 +1243,10 @@ bool BlockBasedTableIterator::SeekMultiScan(const Slice* target) {
     return true;
   }
 
+  // We are aborting MultiScan.
+  ResetDataIter();
+  assert(!is_index_at_curr_block_);
+  assert(!block_iter_points_to_real_block_);
   return false;
 }
 
@@ -1247,7 +1260,21 @@ void BlockBasedTableIterator::FindBlockForwardInMultiScan() {
       return;
     }
 
+    // If is_out_of_bound_ is true, upper layer (LevelIterator) considers this
+    // level has reached iterate_upper_bound_ and will not continue to iterate
+    // into the next file. When we are doing the last scan within a MultiScan
+    // for this file, it may need to continue to scan into the next file, so
+    // we do not set is_out_of_bound_ in this case.
     if (multi_scan_->cur_data_block_idx + 1 >= cur_scan_end_idx) {
+      if (multi_scan_->next_scan_idx >=
+          multi_scan_->block_ranges_per_scan.size()) {
+        // We are done with this file, should let LevelIter advance to the next
+        // file instead of ending the scan
+        ResetDataIter();
+        assert(!is_out_of_bound_);
+        assert(!Valid());
+        return;
+      }
       // We don't ResetDataIter() here since next scan might be reading from
       // the same block. ResetDataIter() will free the underlying block cache
       // handle and we don't want the block to be unpinned.
diff --git a/table/block_based/block_based_table_iterator.h b/table/block_based/block_based_table_iterator.h
index ccf7d8044822..dff61ad9c35a 100644
--- a/table/block_based/block_based_table_iterator.h
+++ b/table/block_based/block_based_table_iterator.h
@@ -160,6 +160,8 @@ class BlockBasedTableIterator : public InternalIteratorBase<Slice> {
     } else if (block_upper_bound_check_ ==
                BlockUpperBound::kUpperBoundBeyondCurBlock) {
       assert(!is_out_of_bound_);
+      // MultiScan does not do block level upper bound check yet.
+      assert(!multi_scan_);
       return IterBoundCheck::kInbound;
     } else {
       return IterBoundCheck::kUnknown;
diff --git a/unreleased_history/bug_fixes/multi-scan.md b/unreleased_history/bug_fixes/multi-scan.md
new file mode 100644
index 000000000000..9ba67ac40fa5
--- /dev/null
+++ b/unreleased_history/bug_fixes/multi-scan.md
@@ -0,0 +1 @@
+* Fix a bug in MultiScan where incorrect results can be returned when a Scan's range is across multiple files.

From 3829750b7012e916c135ae586d7a3f550c53f979 Mon Sep 17 00:00:00 2001
From: Jay Huh <jewoongh@meta.com>
Date: Mon, 4 Aug 2025 13:20:49 -0700
Subject: [PATCH 209/500] Make CompactionPicker::CompactFiles() take
 earliest_snapshot and snapshot_checker (#13816)

Summary:
One of the parameters for constructing a Compaction object is `earliest_snapshot`, which is required for Standalone Range Deletion Optimization (introduced in [https://github.com/facebook/rocksdb/pull/13078](https://github.com/facebook/rocksdb/pull/13078)). Remote Compaction has been using the `CompactionPicker::CompactFiles()` API to create the Compaction object, but this API never sets the `earliest_snapshot` parameter. To address this, update `CompactionPicker::CompactFiles()` to optionally accept `earliest_snapshot` and pass it during the call in `DBImplSecondary::CompactWithoutInstallation()`.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13816

Test Plan:
```
./compaction_service_test --gtest_filter="*CompactionServiceTest.StandaloneDeleteRangeTombstoneOptimization*"
```

\+ Tested in Meta's internal offload infra.

Reviewed By: hx235

Differential Revision: D79284769

Pulled By: jaykorean

fbshipit-source-id: 164834ef6972d5e0ddfc2970bb9234ef166d6e52
---
 db/compaction/compaction_picker.cc       |  8 ++-
 db/compaction/compaction_picker.h        | 17 +++--
 db/compaction/compaction_service_test.cc | 91 ++++++++++++++++++++++++
 db/db_impl/db_impl_secondary.cc          | 24 ++++---
 4 files changed, 123 insertions(+), 17 deletions(-)

diff --git a/db/compaction/compaction_picker.cc b/db/compaction/compaction_picker.cc
index 2bdd9a9bb327..7f6cdffdaca9 100644
--- a/db/compaction/compaction_picker.cc
+++ b/db/compaction/compaction_picker.cc
@@ -337,7 +337,9 @@ Compaction* CompactionPicker::CompactFiles(
     const CompactionOptions& compact_options,
     const std::vector<CompactionInputFiles>& input_files, int output_level,
     VersionStorageInfo* vstorage, const MutableCFOptions& mutable_cf_options,
-    const MutableDBOptions& mutable_db_options, uint32_t output_path_id) {
+    const MutableDBOptions& mutable_db_options, uint32_t output_path_id,
+    std::optional<SequenceNumber> earliest_snapshot,
+    const SnapshotChecker* snapshot_checker) {
 #ifndef NDEBUG
   assert(input_files.size());
   // This compaction output should not overlap with a running compaction as
@@ -380,8 +382,8 @@ Compaction* CompactionPicker::CompactFiles(
       GetCompressionOptions(mutable_cf_options, vstorage, output_level),
       mutable_cf_options.default_write_temperature,
       compact_options.max_subcompactions,
-      /* grandparents */ {}, /* earliest_snapshot */ std::nullopt,
-      /* snapshot_checker */ nullptr, CompactionReason::kManualCompaction);
+      /* grandparents */ {}, earliest_snapshot, snapshot_checker,
+      CompactionReason::kManualCompaction);
   RegisterCompaction(c);
   return c;
 }
diff --git a/db/compaction/compaction_picker.h b/db/compaction/compaction_picker.h
index bbcc8fbac662..1212e648b6b6 100644
--- a/db/compaction/compaction_picker.h
+++ b/db/compaction/compaction_picker.h
@@ -117,12 +117,17 @@ class CompactionPicker {
   // Caller must provide a set of input files that has been passed through
   // `SanitizeAndConvertCompactionInputFiles` earlier. The lock should not be
   // released between that call and this one.
-  Compaction* CompactFiles(const CompactionOptions& compact_options,
-                           const std::vector<CompactionInputFiles>& input_files,
-                           int output_level, VersionStorageInfo* vstorage,
-                           const MutableCFOptions& mutable_cf_options,
-                           const MutableDBOptions& mutable_db_options,
-                           uint32_t output_path_id);
+  //
+  //  TODO - Remove default values for earliest_snapshot and snapshot_checker
+  //  and require all callers to pass them in so that DB::CompactFiles() can
+  //  also benefit from Standalone Range Tombstone Optimization
+  Compaction* CompactFiles(
+      const CompactionOptions& compact_options,
+      const std::vector<CompactionInputFiles>& input_files, int output_level,
+      VersionStorageInfo* vstorage, const MutableCFOptions& mutable_cf_options,
+      const MutableDBOptions& mutable_db_options, uint32_t output_path_id,
+      std::optional<SequenceNumber> earliest_snapshot = std::nullopt,
+      const SnapshotChecker* snapshot_checker = nullptr);
 
   // Converts a set of compaction input file numbers into
   // a list of CompactionInputFiles.
diff --git a/db/compaction/compaction_service_test.cc b/db/compaction/compaction_service_test.cc
index 08a2a9cf0716..1479a6c5a983 100644
--- a/db/compaction/compaction_service_test.cc
+++ b/db/compaction/compaction_service_test.cc
@@ -461,6 +461,97 @@ TEST_F(CompactionServiceTest, ManualCompaction) {
   ASSERT_EQ(handles_[1]->GetName(), info.cf_name);
 }
 
+TEST_F(CompactionServiceTest, StandaloneDeleteRangeTombstoneOptimization) {
+  Options options = CurrentOptions();
+  options.compaction_style = CompactionStyle::kCompactionStyleUniversal;
+  ReopenWithCompactionService(&options);
+
+  size_t num_files_after_filtered = 0;
+  SyncPoint::GetInstance()->SetCallBack(
+      "VersionSet::MakeInputIterator:NewCompactionMergingIterator",
+      [&](void* arg) {
+        num_files_after_filtered = *static_cast<size_t*>(arg);
+      });
+
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  std::vector<std::string> files;
+  {
+    // Writes first version of data in range partitioned files.
+    SstFileWriter sst_file_writer(EnvOptions(), options);
+    std::string file1 = dbname_ + "file1.sst";
+    ASSERT_OK(sst_file_writer.Open(file1));
+    ASSERT_OK(sst_file_writer.Put("a", "a1"));
+    ASSERT_OK(sst_file_writer.Put("b", "b1"));
+    ExternalSstFileInfo file1_info;
+    ASSERT_OK(sst_file_writer.Finish(&file1_info));
+    files.push_back(std::move(file1));
+
+    std::string file2 = dbname_ + "file2.sst";
+    ASSERT_OK(sst_file_writer.Open(file2));
+    ASSERT_OK(sst_file_writer.Put("x", "x1"));
+    ASSERT_OK(sst_file_writer.Put("y", "y1"));
+    ExternalSstFileInfo file2_info;
+    ASSERT_OK(sst_file_writer.Finish(&file2_info));
+    files.push_back(std::move(file2));
+  }
+
+  IngestExternalFileOptions ifo;
+  ASSERT_OK(db_->IngestExternalFile(files, ifo));
+  ASSERT_EQ(Get("a"), "a1");
+  ASSERT_EQ(Get("b"), "b1");
+  ASSERT_EQ(Get("x"), "x1");
+  ASSERT_EQ(Get("y"), "y1");
+  ASSERT_EQ(2, NumTableFilesAtLevel(6));
+
+  auto my_cs = GetCompactionService();
+  uint64_t comp_num = my_cs->GetCompactionNum();
+
+  {
+    // Atomically delete old version of data with one range delete file.
+    // And a new batch of range partitioned files with new version of data.
+    files.clear();
+    SstFileWriter sst_file_writer(EnvOptions(), options);
+    std::string file2 = dbname_ + "file2.sst";
+    ASSERT_OK(sst_file_writer.Open(file2));
+    ASSERT_OK(sst_file_writer.DeleteRange("a", "z"));
+    ExternalSstFileInfo file2_info;
+    ASSERT_OK(sst_file_writer.Finish(&file2_info));
+    files.push_back(std::move(file2));
+
+    std::string file3 = dbname_ + "file3.sst";
+    ASSERT_OK(sst_file_writer.Open(file3));
+    ASSERT_OK(sst_file_writer.Put("a", "a2"));
+    ASSERT_OK(sst_file_writer.Put("b", "b2"));
+    ExternalSstFileInfo file3_info;
+    ASSERT_OK(sst_file_writer.Finish(&file3_info));
+    files.push_back(std::move(file3));
+
+    std::string file4 = dbname_ + "file4.sst";
+    ASSERT_OK(sst_file_writer.Open(file4));
+    ASSERT_OK(sst_file_writer.Put("x", "x2"));
+    ASSERT_OK(sst_file_writer.Put("y", "y2"));
+    ExternalSstFileInfo file4_info;
+    ASSERT_OK(sst_file_writer.Finish(&file4_info));
+    files.push_back(std::move(file4));
+  }
+
+  ASSERT_OK(db_->IngestExternalFile(files, ifo));
+  ASSERT_OK(db_->WaitForCompact(WaitForCompactOptions()));
+  ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1);
+
+  CompactionServiceResult result;
+  my_cs->GetResult(&result);
+  ASSERT_OK(result.status);
+  ASSERT_TRUE(result.stats.is_manual_compaction);
+  ASSERT_TRUE(result.stats.is_remote_compaction);
+
+  ASSERT_EQ(num_files_after_filtered, 1);
+
+  Close();
+  SyncPoint::GetInstance()->DisableProcessing();
+}
+
 TEST_F(CompactionServiceTest, CompactionOutputFileIOError) {
   Options options = CurrentOptions();
   options.disable_auto_compactions = true;
diff --git a/db/db_impl/db_impl_secondary.cc b/db/db_impl/db_impl_secondary.cc
index 04abfc3d6d22..9f220a05e342 100644
--- a/db/db_impl/db_impl_secondary.cc
+++ b/db/db_impl/db_impl_secondary.cc
@@ -848,7 +848,6 @@ Status DBImplSecondary::CompactWithoutInstallation(
 
   VersionStorageInfo* vstorage = version->storage_info();
 
-  // Use comp_options to reuse some CompactFiles functions
   CompactionOptions comp_options;
   comp_options.compression = kDisableCompressionOption;
   comp_options.output_file_size_limit = MaxFileSizeForLevel(
@@ -867,13 +866,27 @@ Status DBImplSecondary::CompactWithoutInstallation(
     return s;
   }
 
+  const int job_id = next_job_id_.fetch_add(1);
+  JobContext job_context(job_id, true /*create_superversion*/);
+  std::vector<SequenceNumber> snapshots = input.snapshots;
+
+  // TODO - snapshot_checker support in Remote Compaction
+  job_context.InitSnapshotContext(/*checker=*/nullptr,
+                                  /*managed_snapshot=*/nullptr,
+                                  kMaxSequenceNumber, std::move(snapshots));
+
+  // TODO - consider serializing the entire Compaction object and using it as
+  // input instead of recreating it in the remote worker
   std::unique_ptr<Compaction> c;
   assert(cfd->compaction_picker());
   c.reset(cfd->compaction_picker()->CompactFiles(
       comp_options, input_files, input.output_level, vstorage,
-      cfd->GetLatestMutableCFOptions(), mutable_db_options_, 0));
+      cfd->GetLatestMutableCFOptions(), mutable_db_options_, 0,
+      /*earliest_snapshot=*/job_context.snapshot_seqs.empty()
+          ? kMaxSequenceNumber
+          : job_context.snapshot_seqs.front(),
+      job_context.snapshot_checker));
   assert(c != nullptr);
-
   c->FinalizeInputInfo(version);
 
   // Create output directory if it's not existed yet
@@ -886,11 +899,6 @@ Status DBImplSecondary::CompactWithoutInstallation(
   LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL,
                        immutable_db_options_.info_log.get());
 
-  const int job_id = next_job_id_.fetch_add(1);
-  JobContext job_context(0, true /*create_superversion*/);
-  std::vector<SequenceNumber> snapshots = input.snapshots;
-  job_context.InitSnapshotContext(nullptr, nullptr, kMaxSequenceNumber,
-                                  std::move(snapshots));
   // use primary host's db_id for running the compaction, but db_session_id is
   // using the local one, which is to make sure the unique id is unique from
   // the remote compactors. Because the id is generated from db_id,

From 7c5c37a1a4fc1ad84889a3ba42e246260f788bc1 Mon Sep 17 00:00:00 2001
From: Ryan Hancock <krhancock@meta.com>
Date: Mon, 4 Aug 2025 14:14:16 -0700
Subject: [PATCH 210/500] IntervalSet Data Structure (#13787)

Summary:
This diff introduces the IntervalSet data structure, which will be used to help create sets of non overlapping sets of intervals for MultiScan scan options. Specifically, we add specializations for Slices to assist in this.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13787

Test Plan: Added test to catch various cases within adding intervals.

Reviewed By: anand1976

Differential Revision: D78624970

Pulled By: krhancoc

fbshipit-source-id: 9a3e4a28738ab8428788467540fc05ab5c1a1b67
---
 BUCK                             |   6 +
 Makefile                         |   3 +
 include/rocksdb/data_structure.h | 487 +++++++++++++++++++++++++++++++
 src.mk                           |   1 +
 util/interval_test.cc            | 102 +++++++
 5 files changed, 599 insertions(+)
 create mode 100644 util/interval_test.cc

diff --git a/BUCK b/BUCK
index ce8ca8a1b8fc..efdc0083fde5 100644
--- a/BUCK
+++ b/BUCK
@@ -5194,6 +5194,12 @@ cpp_unittest_wrapper(name="inlineskiplist_test",
             extra_compiler_flags=[])
 
 
+cpp_unittest_wrapper(name="interval_test",
+            srcs=["util/interval_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
 cpp_unittest_wrapper(name="io_posix_test",
             srcs=["env/io_posix_test.cc"],
             deps=[":rocksdb_test_lib"],
diff --git a/Makefile b/Makefile
index 903ef3ce4b08..4f0a7b5ff70f 100644
--- a/Makefile
+++ b/Makefile
@@ -2037,6 +2037,9 @@ wide_column_serialization_test: $(OBJ_DIR)/db/wide/wide_column_serialization_tes
 wide_columns_helper_test: $(OBJ_DIR)/db/wide/wide_columns_helper_test.o $(TEST_LIBRARY) $(LIBRARY)
 	$(AM_LINK)
 
+interval_test: $(OBJ_DIR)/util/interval_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
 #-------------------------------------------------
 # make install related stuff
 PREFIX ?= /usr/local
diff --git a/include/rocksdb/data_structure.h b/include/rocksdb/data_structure.h
index a903a9649966..bf0144cd2904 100644
--- a/include/rocksdb/data_structure.h
+++ b/include/rocksdb/data_structure.h
@@ -10,8 +10,13 @@
 #include <array>
 #include <cstddef>
 #include <cstdint>
+#include <functional>
+#include <set>
+#include <variant>
 
+#include "rocksdb/comparator.h"
 #include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/slice.h"
 
 namespace ROCKSDB_NAMESPACE {
 
@@ -281,4 +286,486 @@ class ManagedPtr {
   Owner* owner_ = nullptr;
 };
 
+template <typename T, typename comp>
+class Interval;
+
+// The Interval Class is a generic class for holding a range, for example [2,
+// 4]. It can be used within the IntervalSet class, which is able to keep an
+// ordered, non-intersecting set of intervals within it.  Intervals can have
+// open-ended end points, (i.e., to infinity) for example [2,).
+template <typename T, typename comp = std::less<T>>
+class Interval {
+ public:
+  enum class End { INF };
+  struct CompareVariant {
+    comp comparator;
+    bool operator()(const std::variant<T, End>& a,
+                    const std::variant<T, End>& b) const {
+      if (std::holds_alternative<T>(a) && std::holds_alternative<T>(b)) {
+        return comparator(std::get<T>(a), std::get<T>(b));
+      }
+      if (std::holds_alternative<End>(a) && std::holds_alternative<End>(b)) {
+        return false;
+      }
+      if (std::holds_alternative<T>(a) && std::holds_alternative<End>(b)) {
+        return false;
+      }
+      return true;  // std::holds_alternative<End>(a) &&
+                    // std::holds_alternative<T>(b)
+    }
+  };
+
+  /* implicit */ Interval(const T& start, const T& end)
+      : start_(start), end_(end) {}
+  /* implicit */ Interval(const T& start) : start_(start), end_(End::INF) {}
+
+  // Add constructor that takes a pair
+  /* implicit */ Interval(const std::pair<T, T>& p)
+      : start_(p.first), end_(p.second) {}
+
+  T& start() { return start_; }
+
+  const T& start() const { return start_; }
+
+  bool has_end() const { return std::holds_alternative<T>(end_); }
+
+  T& end() { return std::get<T>(end_); }
+
+  const T& end() const { return std::get<T>(end_); }
+
+  // Support comparison with std::pair
+  bool operator==(const std::pair<T, T>& p) const {
+    return start_ == p.first && has_end() && end() == p.second;
+  }
+
+  // Support comparison with another Interval
+  bool operator==(const Interval& other) const {
+    if (start_ != other.start_) {
+      return false;
+    }
+
+    // Both have infinite end
+    if (!has_end() && !other.has_end()) {
+      return true;
+    }
+
+    // One has infinite end, the other doesn't
+    if (has_end() != other.has_end()) {
+      return false;
+    }
+
+    // Both have finite end
+    return end() == other.end();
+  }
+
+  // Support comparison with another Interval
+  bool operator<(const Interval& other) const {
+    return comparator(start_, other.start_);
+  }
+
+  bool Compare(const Interval& other) const {
+    return comparator(start_, other.start_);
+  }
+
+ private:
+  T start_;
+  std::variant<T, End> end_;
+  comp comparator;
+};
+
+// Specialized version of Interval for Slice
+template <>
+class Interval<Slice, Comparator> {
+ public:
+  enum class End { INF };
+
+  // Constructors that take a Comparator
+  /* implicit */ Interval(const Comparator* c, const Slice& start,
+                          const Slice& end)
+      : start_(start), end_(end), comparator_(c) {}
+
+  /* implicit */ Interval(const Comparator* c, const Slice& start)
+      : start_(start), end_(End::INF), comparator_(c) {}
+
+  // Constructor that takes a pair
+  /* implicit */ Interval(const Comparator* c, const std::pair<Slice, Slice>& p)
+      : start_(p.first), end_(p.second), comparator_(c) {}
+
+  Slice& start() { return start_; }
+
+  const Slice& start() const { return start_; }
+
+  bool has_end() const { return std::holds_alternative<Slice>(end_); }
+
+  Slice& end() { return std::get<Slice>(end_); }
+
+  const Slice& end() const { return std::get<Slice>(end_); }
+
+  // Support comparison with std::pair
+  bool operator==(const std::pair<Slice, Slice>& p) const {
+    return start_ == p.first && has_end() && end() == p.second;
+  }
+
+  // Support comparison with another Interval
+  bool operator==(const Interval& other) const {
+    if (comparator_->Compare(start_, other.start_) != 0) {
+      return false;
+    }
+
+    // Both have infinite end
+    if (!has_end() && !other.has_end()) {
+      return true;
+    }
+
+    // One has infinite end, the other doesn't
+    if (has_end() != other.has_end()) {
+      return false;
+    }
+
+    // Both have finite end
+    return comparator_->Compare(end(), other.end()) == 0;
+  }
+
+  // Support comparison with another Interval
+  bool operator<(const Interval& other) const {
+    return comparator_->Compare(start_, other.start_) < 0;
+  }
+
+  bool Compare(const Interval& other) const {
+    return comparator_->Compare(start_, other.start_) < 0;
+  }
+
+  const Comparator* GetComparator() const { return comparator_; }
+
+ private:
+  Slice start_;
+  std::variant<Slice, End> end_;
+  const Comparator* comparator_;
+
+  std::unordered_map<std::string, std::string> property_bag;
+};
+
+template <typename T, typename Compare = std::less<T>>
+struct CompareInterval {
+  bool operator()(const Interval<T, Compare>& a,
+                  const Interval<T, Compare>& b) const {
+    return a.Compare(b);
+  }
+};
+
+// IntervalSet will be used to represent a set of intervals (including unbounded
+// ones). The intervals are unique and disjoint. Intervals that are inserted
+// will merge with any range they intersect with.
+template <typename T, typename Compare = typename Interval<T>::CompareVariant>
+class IntervalSet {
+ public:
+  IntervalSet(Compare c = Compare()) : comp_(c) {}
+
+  void insert(Interval<T>&& i) { insertImpl(i); }
+
+  void insert(const T& start, const T& end) {
+    insertImpl(Interval<T>(start, end));
+  }
+
+  void insert(const T& start) { insertImpl(Interval<T>(start)); }
+
+  bool empty() const { return intervals_.empty(); }
+  void clear() { intervals_.clear(); }
+
+  auto begin() { return intervals_.begin(); }
+  auto end() { return intervals_.end(); }
+
+  auto cbegin() const { return intervals_.cbegin(); }
+  auto cend() const { return intervals_.cend(); }
+
+  size_t size() const { return intervals_.size(); }
+
+ private:
+  void insertImpl(const Interval<T>& i) {
+    // Skip empty intervals
+    if (i.has_end() && !comp_(i.start(), i.end()) &&
+        !comp_(i.end(), i.start())) {
+      return;
+    }
+
+    // First, check if there's any infinite interval that would contain this one
+    for (auto it = intervals_.begin(); it != intervals_.end(); ++it) {
+      if (!it->has_end() && !comp_(i.start(), it->start())) {
+        // This interval starts at or after an infinite interval
+        return;
+      }
+    }
+
+    // Find the position where the interval should be inserted
+    auto it = intervals_.begin();
+    while (it != intervals_.end() && comp_(it->start(), i.start())) {
+      ++it;
+    }
+
+    // Check if we need to consider the previous interval
+    if (it != intervals_.begin()) {
+      --it;
+      if (it->has_end() && comp_(it->end(), i.start())) {
+        ++it;
+      }
+    }
+
+    T new_start = i.start();
+    T new_end;
+    bool inf_end = false;
+    if (i.has_end()) {
+      new_end = i.end();
+    } else {
+      // For infinite end intervals, we need to merge all intervals that start
+      // after new_start
+      std::vector<decltype(it)> to_erase;
+      while (it != intervals_.end()) {
+        new_start = comp_(it->start(), new_start) ? it->start() : new_start;
+        to_erase.push_back(it++);
+      }
+
+      for (auto& eit : to_erase) {
+        intervals_.erase(eit);
+      }
+
+      // Insert the new interval with infinite end
+      intervals_.insert(Interval<T>(new_start));
+      return;
+    }
+
+    // For finite end intervals, proceed as before
+    std::vector<decltype(it)> to_erase;
+    while (it != intervals_.end() && !comp_(new_end, it->start())) {
+      if (it->has_end() && comp_(it->end(), new_start)) {
+        ++it;
+        continue;
+      }
+      new_start = comp_(it->start(), new_start) ? it->start() : new_start;
+      if (it->has_end()) {
+        new_end = comp_(new_end, it->end()) ? it->end() : new_end;
+      } else {
+        // If we encounter an interval with infinite end, our new interval also
+        // becomes infinite
+        inf_end = true;
+        break;
+      }
+      to_erase.push_back(it++);
+    }
+
+    // Check for any infinite intervals that start after this one
+    auto check_it = it;
+    while (check_it != intervals_.end()) {
+      if (!check_it->has_end()) {
+        inf_end = true;
+        to_erase.push_back(check_it);
+      }
+      ++check_it;
+    }
+
+    for (auto& eit : to_erase) {
+      intervals_.erase(eit);
+    }
+
+    if (inf_end) {
+      intervals_.insert(Interval<T>(new_start));
+    } else {
+      intervals_.insert(Interval<T>(new_start, new_end));
+    }
+  }
+
+  std::set<Interval<T>, CompareInterval<T>> intervals_;
+  Compare comp_;
+};
+
+// Specialization of IntervalSet for Slices.
+// Slice based intervals can have properties attached to them. This is used to
+// push down properties in the MultiScan API.  We accept two modes with
+// IntervalSet, fail_on_intersect, which imposes a restriction that inserted
+// ranges will be disjoint, this is needed when using properties. Insert will
+// fail if a range is found to not be disjoint. When fail_on_instersect is
+// false, the ranges will be merged.
+template <>
+class IntervalSet<Slice, Comparator> {
+ public:
+  explicit IntervalSet(const Comparator* c, bool fail_on_intersect = false)
+      : comp_(c), prop_(fail_on_intersect) {}
+
+  // Insert returns true if the interval was inserted. False indicates that the
+  // interval was not inserted, this could be do to an empty range OR that the
+  // IntervalSet is in with_properties mode and the interval overlaps with an
+  // existing interval.
+  bool insert(const Slice& start, const Slice& end) {
+    return insertImpl(Interval<Slice, Comparator>(comp_, start, end));
+  }
+
+  // Insert returns true if the interval was inserted. False indicates that the
+  // interval was not inserted, this could be do to an empty range OR that the
+  // IntervalSet is in with_properties mode and the interval overlaps with an
+  // existing interval.
+  bool insert(const Slice& start) {
+    // Create an interval with infinite end
+    Interval<Slice, Comparator> interval(comp_, start);
+    return insertImpl(interval);
+  }
+
+  bool insert(Interval<Slice, Comparator>&& i) { return insertImpl(i); }
+
+  bool empty() const { return intervals_.empty(); }
+  void clear() { intervals_.clear(); }
+
+  auto begin() { return intervals_.begin(); }
+  auto end() { return intervals_.end(); }
+
+  auto cbegin() const { return intervals_.cbegin(); }
+  auto cend() const { return intervals_.cend(); }
+
+  size_t size() const { return intervals_.size(); }
+
+ private:
+  // Custom comparator for finding intervals in the vector
+  struct IntervalComparator {
+    explicit IntervalComparator(const Comparator* comp) : comp_(comp) {}
+
+    bool operator()(const Interval<Slice, Comparator>& a,
+                    const Interval<Slice, Comparator>& b) const {
+      return comp_->Compare(a.start(), b.start()) < 0;
+    }
+
+    const Comparator* comp_;
+  };
+
+  typename std::vector<Interval<Slice, Comparator>>::iterator findPosition(
+      const Interval<Slice, Comparator>& interval) {
+    // Find the position where the new interval should be inserted
+    for (auto it = intervals_.begin(); it != intervals_.end(); ++it) {
+      if (comp_->Compare(it->start(), interval.start()) >= 0) {
+        return it;
+      }
+    }
+    return intervals_.end();
+  }
+
+  bool insertImpl(const Interval<Slice, Comparator>& i) {
+    // Skip empty intervals
+    if (i.has_end() && comp_->Compare(i.start(), i.end()) >= 0) {
+      return false;
+    }
+
+    // Find the position where this interval would be inserted
+    // This also checks if the interval is completely contained within an
+    // existing one
+    auto it = findPosition(i);
+
+    // Check if we need to merge with previous interval
+    if (it != intervals_.begin()) {
+      auto prev = it - 1;
+      if (prev->has_end() && comp_->Compare(prev->end(), i.start()) < 0) {
+        // No overlap with previous interval
+      } else {
+        // There is overlap, adjust iterator to include previous interval
+        if (prop_) {
+          return false;
+        }
+        it = prev;
+      }
+    }
+
+    Slice new_start = i.start();
+    Slice new_end;
+    bool inf_end = false;
+
+    if (i.has_end()) {
+      new_end = i.end();
+    } else {
+      // For infinite end intervals, we need to merge all intervals that start
+      // after new_start
+      auto erase_start = it;
+      while (it != intervals_.end()) {
+        if (comp_->Compare(it->start(), new_start) < 0) {
+          if (prop_) {
+            return false;
+          }
+          new_start = it->start();
+        }
+        ++it;
+      }
+
+      // Erase all intervals from erase_start to end
+      if (erase_start != intervals_.end()) {
+        if (prop_) {
+          return false;
+        }
+        intervals_.erase(erase_start, intervals_.end());
+      }
+
+      // Insert the new interval with infinite end
+      Interval<Slice, Comparator> new_interval(comp_, new_start);
+      auto pos = findPosition(new_interval);
+      intervals_.insert(pos, new_interval);
+      return true;
+    }
+
+    // For finite end intervals, find all overlapping intervals
+    auto erase_start = it;
+    auto erase_end = it;
+
+    while (it != intervals_.end() &&
+           comp_->Compare(new_end, it->start()) >= 0) {
+      if (it->has_end() && comp_->Compare(it->end(), new_start) < 0) {
+        // No overlap
+        ++it;
+        erase_end = it;
+        continue;
+      }
+
+      if (comp_->Compare(it->start(), new_start) < 0) {
+        new_start = it->start();
+      }
+
+      if (it->has_end()) {
+        if (comp_->Compare(new_end, it->end()) < 0) {
+          new_end = it->end();
+        }
+      } else {
+        // If we encounter an interval with infinite end, our new interval also
+        // becomes infinite
+        inf_end = true;
+        erase_end = intervals_.end();
+        break;
+      }
+
+      ++it;
+      erase_end = it;
+    }
+
+    // Check for any infinite intervals that start after this one
+    while (it != intervals_.end()) {
+      if (!it->has_end()) {
+        inf_end = true;
+        erase_end = intervals_.end();
+        break;
+      }
+      ++it;
+    }
+
+    // Erase all merged intervals
+    if (erase_start != erase_end) {
+      intervals_.erase(erase_start, erase_end);
+    }
+
+    // Insert the new merged interval
+    Interval<Slice, Comparator> new_interval =
+        inf_end ? Interval<Slice, Comparator>(comp_, new_start)
+                : Interval<Slice, Comparator>(comp_, new_start, new_end);
+
+    auto pos = findPosition(new_interval);
+    intervals_.insert(pos, new_interval);
+    return true;
+  }
+
+  const Comparator* comp_;
+  std::vector<Interval<Slice, Comparator>> intervals_;
+  bool prop_;
+};
+
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/src.mk b/src.mk
index 8d341f03c58b..182edc695cfc 100644
--- a/src.mk
+++ b/src.mk
@@ -613,6 +613,7 @@ TEST_MAIN_SOURCES =                                                     \
   util/file_reader_writer_test.cc                                       \
   util/hash_test.cc                                                     \
   util/heap_test.cc                                                     \
+  util/interval_test.cc                                                 \
   util/random_test.cc                                                   \
   util/rate_limiter_test.cc                                             \
   util/repeatable_thread_test.cc                                        \
diff --git a/util/interval_test.cc b/util/interval_test.cc
new file mode 100644
index 000000000000..caa102df577e
--- /dev/null
+++ b/util/interval_test.cc
@@ -0,0 +1,102 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "db/db_test_util.h"
+#include "rocksdb/data_structure.h"
+#include "rocksdb/slice.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class IntervalSetTest : public testing::Test {};
+
+TEST_F(IntervalSetTest, BasicTest) {
+  IntervalSet<int> set;
+  set.insert({2, 15});
+  EXPECT_EQ(set.size(), 1);
+  set.insert({5, 9});
+  EXPECT_EQ(set.size(), 1);
+  set.insert({0, 10});
+  EXPECT_EQ(set.size(), 1);
+  set.insert({25, 30});
+  EXPECT_EQ(set.size(), 2);
+  set.insert({16, 25});
+  EXPECT_EQ(set.size(), 2);
+  set.insert({45, 85});
+  ASSERT_EQ(set.size(), 3);
+  auto iter = set.begin();
+  ASSERT_EQ(*iter, Interval<int>(0, 15));
+  iter++;
+  ASSERT_EQ(*iter, Interval<int>(16, 30));
+  iter++;
+  ASSERT_EQ(*iter, Interval<int>(45, 85));
+  set.insert({31});
+  iter = set.begin();
+  ASSERT_EQ(*iter, Interval<int>(0, 15));
+  iter++;
+  ASSERT_EQ(*iter, Interval<int>(16, 30));
+  iter++;
+  ASSERT_EQ(*iter, Interval<int>(31));
+}
+
+TEST_F(IntervalSetTest, SliceTest) {
+  IntervalSet<Slice, Comparator> set(BytewiseComparator());
+  EXPECT_TRUE(set.insert("k00", "k10"));
+  // Should do nothing
+  EXPECT_TRUE(set.insert("k02", "k08"));
+  auto iter = set.begin();
+  ASSERT_EQ(iter->start().ToString(), "k00");
+  ASSERT_EQ(iter->end().ToString(), "k10");
+  ASSERT_EQ(set.size(), 1);
+  iter++;
+  ASSERT_EQ(iter, set.end());
+  EXPECT_TRUE(set.insert("k15", "k20"));
+  EXPECT_TRUE(set.insert("k16"));
+  ASSERT_EQ(set.size(), 2);
+  iter = set.begin();
+  ASSERT_EQ(iter->start().ToString(), "k00");
+  ASSERT_EQ(iter->end().ToString(), "k10");
+  iter++;
+  ASSERT_EQ(iter->start().ToString(), "k15");
+  ASSERT_EQ(iter->has_end(), false);
+  //
+}
+
+TEST_F(IntervalSetTest, PropModeTest) {
+  IntervalSet<Slice, Comparator> set(BytewiseComparator(), true);
+  EXPECT_TRUE(set.insert("k00", "k10"));
+  // Should do nothing
+  EXPECT_FALSE(set.insert("k02", "k08"));
+  EXPECT_EQ(set.size(), 1);
+  EXPECT_TRUE(set.insert("k15", "k20"));
+  EXPECT_EQ(set.size(), 2);
+  EXPECT_FALSE(set.insert("k16"));
+  ASSERT_EQ(set.size(), 2);
+  auto iter = set.begin();
+  ASSERT_EQ(iter->start().ToString(), "k00");
+  ASSERT_EQ(iter->end().ToString(), "k10");
+  iter++;
+  ASSERT_EQ(iter->start().ToString(), "k15");
+  ASSERT_EQ(iter->end().ToString(), "k20");
+  EXPECT_TRUE(set.insert("k12", "k14"));
+  iter = set.begin();
+  ASSERT_EQ(set.size(), 3);
+  ASSERT_EQ(iter->start().ToString(), "k00");
+  ASSERT_EQ(iter->end().ToString(), "k10");
+  iter++;
+  ASSERT_EQ(iter->start().ToString(), "k12");
+  ASSERT_EQ(iter->end().ToString(), "k14");
+  iter++;
+  ASSERT_EQ(iter->start().ToString(), "k15");
+  ASSERT_EQ(iter->end().ToString(), "k20");
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+
+  return RUN_ALL_TESTS();
+}

From 53c39c2b01529a6de34d108e22542f54ce286155 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Mon, 4 Aug 2025 14:15:38 -0700
Subject: [PATCH 211/500] Refactor/improve
 PartitionedIndexBuilder::AddIndexEntry (#13828)

Summary:
In anticipation of an enhancement related to parallel compression
* Rename confusing state variables `seperator_is_key_plus_seq_` -> `must_use_separator_with_seq_`
* Eliminate copy-paste code in `PartitionedIndexBuilder::AddIndexEntry`
* Optimize/simplify `PartitionedIndexBuilder::flush_policy_` by allowing a single policy to be re-targetted to different block builders. Added some additional internal APIs to make this work, and it only works because the FlushBlockBySizePolicy is otherwise stateless (after creation).
* Improve some comments, including another proposed optimization especially for the common case of no live snapshots affecting a large compaction

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13828

Test Plan:
existing tests are pretty exhaustive, especially with crash test

Planning to validate performance in combination with next change. (This change is saving some extra allocate/deallocate with partitioned index.)

Reviewed By: cbi42

Differential Revision: D79570576

Pulled By: pdillinger

fbshipit-source-id: f7a16f0e6e6ad2023a3d1a2ebaa3cc22aac717af
---
 .../block_based/block_based_table_builder.cc  |   2 +-
 table/block_based/flush_block_policy.cc       |  27 +++--
 table/block_based/flush_block_policy_impl.h   |  20 ++++
 table/block_based/index_builder.cc            | 106 ++++++++----------
 table/block_based/index_builder.h             |  54 +++++----
 table/block_based/partitioned_filter_block.cc |   4 +-
 .../partitioned_filter_block_test.cc          |   2 +-
 .../block_based/user_defined_index_wrapper.h  |   4 +-
 8 files changed, 120 insertions(+), 99 deletions(-)

diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc
index 4ed77c532d9b..bfe645b883aa 100644
--- a/table/block_based/block_based_table_builder.cc
+++ b/table/block_based/block_based_table_builder.cc
@@ -1938,7 +1938,7 @@ void BlockBasedTableBuilder::WritePropertiesBlock(
           rep_->p_index_builder_->TopLevelIndexSize(rep_->offset);
     }
     rep_->props.index_key_is_user_key =
-        !rep_->index_builder->seperator_is_key_plus_seq();
+        !rep_->index_builder->separator_is_key_plus_seq();
     rep_->props.index_value_is_delta_encoded =
         rep_->use_delta_encoding_for_index_values;
     if (rep_->sampled_input_data_bytes > 0) {
diff --git a/table/block_based/flush_block_policy.cc b/table/block_based/flush_block_policy.cc
index d5cc310013f2..f01315ceb970 100644
--- a/table/block_based/flush_block_policy.cc
+++ b/table/block_based/flush_block_policy.cc
@@ -19,7 +19,7 @@
 namespace ROCKSDB_NAMESPACE {
 
 // Flush block by size
-class FlushBlockBySizePolicy : public FlushBlockPolicy {
+class FlushBlockBySizePolicy : public RetargetableFlushBlockPolicy {
  public:
   // @params block_size:           Approximate size of user data packed per
   //                               block.
@@ -28,19 +28,19 @@ class FlushBlockBySizePolicy : public FlushBlockPolicy {
   FlushBlockBySizePolicy(const uint64_t block_size,
                          const uint64_t block_size_deviation, const bool align,
                          const BlockBuilder& data_block_builder)
-      : block_size_(block_size),
+      : RetargetableFlushBlockPolicy(data_block_builder),
+        block_size_(block_size),
         block_size_deviation_limit_(
             ((block_size * (100 - block_size_deviation)) + 99) / 100),
-        align_(align),
-        data_block_builder_(data_block_builder) {}
+        align_(align) {}
 
   bool Update(const Slice& key, const Slice& value) override {
     // it makes no sense to flush when the data block is empty
-    if (data_block_builder_.empty()) {
+    if (data_block_builder_->empty()) {
       return false;
     }
 
-    auto curr_size = data_block_builder_.CurrentSizeEstimate();
+    auto curr_size = data_block_builder_->CurrentSizeEstimate();
 
     // Do flush if one of the below two conditions is true:
     // 1) if the current estimated size already exceeds the block size,
@@ -56,9 +56,9 @@ class FlushBlockBySizePolicy : public FlushBlockPolicy {
       return false;
     }
 
-    const auto curr_size = data_block_builder_.CurrentSizeEstimate();
+    const auto curr_size = data_block_builder_->CurrentSizeEstimate();
     auto estimated_size_after =
-        data_block_builder_.EstimateSizeAfterKV(key, value);
+        data_block_builder_->EstimateSizeAfterKV(key, value);
 
     if (align_) {
       estimated_size_after += BlockBasedTable::kBlockTrailerSize;
@@ -72,7 +72,6 @@ class FlushBlockBySizePolicy : public FlushBlockPolicy {
   const uint64_t block_size_;
   const uint64_t block_size_deviation_limit_;
   const bool align_;
-  const BlockBuilder& data_block_builder_;
 };
 
 FlushBlockPolicy* FlushBlockBySizePolicyFactory::NewFlushBlockPolicy(
@@ -83,10 +82,18 @@ FlushBlockPolicy* FlushBlockBySizePolicyFactory::NewFlushBlockPolicy(
       table_options.block_align, data_block_builder);
 }
 
+std::unique_ptr<RetargetableFlushBlockPolicy> NewFlushBlockBySizePolicy(
+    const uint64_t size, const int deviation,
+    const BlockBuilder& data_block_builder) {
+  return std::make_unique<FlushBlockBySizePolicy>(size, deviation, false,
+                                                  data_block_builder);
+}
+
 FlushBlockPolicy* FlushBlockBySizePolicyFactory::NewFlushBlockPolicy(
     const uint64_t size, const int deviation,
     const BlockBuilder& data_block_builder) {
-  return new FlushBlockBySizePolicy(size, deviation, false, data_block_builder);
+  return NewFlushBlockBySizePolicy(size, deviation, data_block_builder)
+      .release();
 }
 
 static int RegisterFlushBlockPolicyFactories(ObjectLibrary& library,
diff --git a/table/block_based/flush_block_policy_impl.h b/table/block_based/flush_block_policy_impl.h
index 4f79682bc25f..96132304d6e0 100644
--- a/table/block_based/flush_block_policy_impl.h
+++ b/table/block_based/flush_block_policy_impl.h
@@ -3,6 +3,7 @@
 //  COPYING file in the root directory) and Apache 2.0 License
 //  (found in the LICENSE.Apache file in the root directory).
 
+#pragma once
 #include "rocksdb/flush_block_policy.h"
 
 namespace ROCKSDB_NAMESPACE {
@@ -37,4 +38,23 @@ class FlushBlockEveryKeyPolicyFactory : public FlushBlockPolicyFactory {
   }
 };
 
+// For internal use, policy that is stateless after creation, meaning it can
+// be safely re-targeted to another block builder.
+class RetargetableFlushBlockPolicy : public FlushBlockPolicy {
+ public:
+  explicit RetargetableFlushBlockPolicy(const BlockBuilder& data_block_builder)
+      : data_block_builder_(&data_block_builder) {}
+
+  void Retarget(const BlockBuilder& data_block_builder) {
+    data_block_builder_ = &data_block_builder;
+  }
+
+ protected:
+  const BlockBuilder* data_block_builder_;
+};
+
+std::unique_ptr<RetargetableFlushBlockPolicy> NewFlushBlockBySizePolicy(
+    const uint64_t size, const int deviation,
+    const BlockBuilder& data_block_builder);
+
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/table/block_based/index_builder.cc b/table/block_based/index_builder.cc
index a5a34d65b670..c3b360a07139 100644
--- a/table/block_based/index_builder.cc
+++ b/table/block_based/index_builder.cc
@@ -152,7 +152,7 @@ PartitionedIndexBuilder::PartitionedIndexBuilder(
       // sub_index_builder. Otherwise, it could be set to true even one of the
       // sub_index_builders could not safely exclude seq from the keys, then it
       // wil be enforced on all sub_index_builders on ::Finish.
-      seperator_is_key_plus_seq_(false),
+      must_use_separator_with_seq_(false),
       use_value_delta_encoding_(use_value_delta_encoding) {}
 
 void PartitionedIndexBuilder::MakeNewSubIndexBuilder() {
@@ -163,21 +163,28 @@ void PartitionedIndexBuilder::MakeNewSubIndexBuilder() {
       table_opt_.index_shortening, /* include_first_key */ false, ts_sz_,
       persist_user_defined_timestamps_);
 
-  // Set sub_index_builder_->seperator_is_key_plus_seq_ to true if
-  // seperator_is_key_plus_seq_ is true (internal-key mode) (set to false by
+  BlockBuilder* builder_to_monitor;
+  // Set sub_index_builder_->must_use_separator_with_seq_ to true if
+  // must_use_separator_with_seq_ is true (internal-key mode) (set to false by
   // default on Creation) so that flush policy can point to
   // sub_index_builder_->index_block_builder_
-  if (seperator_is_key_plus_seq_) {
-    sub_index_builder_->seperator_is_key_plus_seq_ = true;
+  if (must_use_separator_with_seq_) {
+    sub_index_builder_->must_use_separator_with_seq_ = true;
+    builder_to_monitor = &sub_index_builder_->index_block_builder_;
+  } else {
+    builder_to_monitor = &sub_index_builder_->index_block_builder_without_seq_;
   }
 
-  flush_policy_.reset(FlushBlockBySizePolicyFactory::NewFlushBlockPolicy(
-      table_opt_.metadata_block_size, table_opt_.block_size_deviation,
-      // Note: this is sub-optimal since sub_index_builder_ could later reset
-      // seperator_is_key_plus_seq_ but the probability of that is low.
-      sub_index_builder_->seperator_is_key_plus_seq_
-          ? sub_index_builder_->index_block_builder_
-          : sub_index_builder_->index_block_builder_without_seq_));
+  if (flush_policy_ == nullptr) {
+    // Note: some partitions could be sub-optimal since sub_index_builder_
+    // could later reset must_use_separator_with_seq_ but the probability and
+    // impact of that are low.
+    flush_policy_ = NewFlushBlockBySizePolicy(table_opt_.metadata_block_size,
+                                              table_opt_.block_size_deviation,
+                                              *builder_to_monitor);
+  } else {
+    flush_policy_->Retarget(*builder_to_monitor);
+  }
   partition_cut_requested_ = false;
 }
 
@@ -191,30 +198,7 @@ Slice PartitionedIndexBuilder::AddIndexEntry(
     std::string* separator_scratch) {
   // Note: to avoid two consecuitive flush in the same method call, we do not
   // check flush policy when adding the last key
-  if (UNLIKELY(first_key_in_next_block == nullptr)) {  // no more keys
-    if (sub_index_builder_ == nullptr) {
-      MakeNewSubIndexBuilder();
-      // Reserve next partition entry, where we will modify the key and
-      // eventually set the value
-      entries_.push_back({{}, {}});
-    }
-    auto sep = sub_index_builder_->AddIndexEntry(
-        last_key_in_current_block, first_key_in_next_block, block_handle,
-        separator_scratch);
-    if (!seperator_is_key_plus_seq_ &&
-        sub_index_builder_->seperator_is_key_plus_seq_) {
-      // We need to apply !seperator_is_key_plus_seq to all sub-index builders
-      seperator_is_key_plus_seq_ = true;
-      // Would associate flush_policy with the appropriate builder, but it won't
-      // be used again with no more keys
-      flush_policy_.reset();
-    }
-    entries_.back().key.assign(sep.data(), sep.size());
-    assert(entries_.back().value == nullptr);
-    std::swap(entries_.back().value, sub_index_builder_);
-    cut_filter_block = true;
-    return sep;
-  } else {
+  if (LIKELY(first_key_in_next_block != nullptr)) {
     // apply flush policy only to non-empty sub_index_builder_
     if (sub_index_builder_ != nullptr) {
       std::string handle_encoding;
@@ -228,27 +212,31 @@ Slice PartitionedIndexBuilder::AddIndexEntry(
         cut_filter_block = true;
       }
     }
-    if (sub_index_builder_ == nullptr) {
-      MakeNewSubIndexBuilder();
-      // Reserve next partition entry, where we will modify the key and
-      // eventually set the value
-      entries_.push_back({{}, {}});
-    }
-    auto sep = sub_index_builder_->AddIndexEntry(
-        last_key_in_current_block, first_key_in_next_block, block_handle,
-        separator_scratch);
-    entries_.back().key.assign(sep.data(), sep.size());
-    if (!seperator_is_key_plus_seq_ &&
-        sub_index_builder_->seperator_is_key_plus_seq_) {
-      // We need to apply !seperator_is_key_plus_seq to all sub-index builders
-      seperator_is_key_plus_seq_ = true;
-      // And use a flush_policy with the appropriate builder
-      flush_policy_.reset(FlushBlockBySizePolicyFactory::NewFlushBlockPolicy(
-          table_opt_.metadata_block_size, table_opt_.block_size_deviation,
-          sub_index_builder_->index_block_builder_));
-    }
-    return sep;
   }
+
+  if (sub_index_builder_ == nullptr) {
+    MakeNewSubIndexBuilder();
+    // Reserve next partition entry, where we will modify the key and
+    // eventually set the value
+    entries_.push_back({{}, {}});
+  }
+  auto sep = sub_index_builder_->AddIndexEntry(last_key_in_current_block,
+                                               first_key_in_next_block,
+                                               block_handle, separator_scratch);
+  entries_.back().key.assign(sep.data(), sep.size());
+  if (!must_use_separator_with_seq_ &&
+      sub_index_builder_->must_use_separator_with_seq_) {
+    // We need to apply !must_use_separator_with_seq to all sub-index builders
+    must_use_separator_with_seq_ = true;
+    flush_policy_->Retarget(sub_index_builder_->index_block_builder_);
+  }
+  if (UNLIKELY(first_key_in_next_block == nullptr)) {
+    // no more keys
+    assert(entries_.back().value == nullptr);
+    std::swap(entries_.back().value, sub_index_builder_);
+    cut_filter_block = true;
+  }
+  return sep;
 }
 
 Status PartitionedIndexBuilder::Finish(
@@ -270,7 +258,7 @@ Status PartitionedIndexBuilder::Finish(
     const Slice handle_delta_encoding_slice(handle_delta_encoding);
     index_block_builder_.Add(last_entry.key, handle_encoding,
                              &handle_delta_encoding_slice);
-    if (!seperator_is_key_plus_seq_) {
+    if (!must_use_separator_with_seq_) {
       index_block_builder_without_seq_.Add(ExtractUserKey(last_entry.key),
                                            handle_encoding,
                                            &handle_delta_encoding_slice);
@@ -279,7 +267,7 @@ Status PartitionedIndexBuilder::Finish(
   }
   // If there is no sub_index left, then return the 2nd level index.
   if (UNLIKELY(entries_.empty())) {
-    if (seperator_is_key_plus_seq_) {
+    if (must_use_separator_with_seq_) {
       index_blocks->index_block_contents = index_block_builder_.Finish();
     } else {
       index_blocks->index_block_contents =
@@ -293,7 +281,7 @@ Status PartitionedIndexBuilder::Finish(
     // expect more calls to Finish
     Entry& entry = entries_.front();
     // Apply the policy to all sub-indexes
-    entry.value->seperator_is_key_plus_seq_ = seperator_is_key_plus_seq_;
+    entry.value->must_use_separator_with_seq_ = must_use_separator_with_seq_;
     auto s = entry.value->Finish(index_blocks);
     index_size_ += index_blocks->index_block_contents.size();
     finishing_indexes = true;
diff --git a/table/block_based/index_builder.h b/table/block_based/index_builder.h
index 14388abf1827..a7ce797e0a29 100644
--- a/table/block_based/index_builder.h
+++ b/table/block_based/index_builder.h
@@ -18,6 +18,7 @@
 #include "rocksdb/comparator.h"
 #include "table/block_based/block_based_table_factory.h"
 #include "table/block_based/block_builder.h"
+#include "table/block_based/flush_block_policy_impl.h"
 #include "table/format.h"
 
 namespace ROCKSDB_NAMESPACE {
@@ -109,7 +110,7 @@ class IndexBuilder {
   // Get the size for index block. Must be called after ::Finish.
   virtual size_t IndexSize() const = 0;
 
-  virtual bool seperator_is_key_plus_seq() { return true; }
+  virtual bool separator_is_key_plus_seq() { return true; }
 
  protected:
   // Given the last key in current block and the first key in the next block,
@@ -178,7 +179,7 @@ class ShortenedIndexBuilder : public IndexBuilder {
         include_first_key_(include_first_key),
         shortening_mode_(shortening_mode) {
     // Making the default true will disable the feature for old versions
-    seperator_is_key_plus_seq_ = (format_version <= 2);
+    must_use_separator_with_seq_ = (format_version <= 2);
   }
 
   void OnKeyAdded(const Slice& key,
@@ -192,29 +193,29 @@ class ShortenedIndexBuilder : public IndexBuilder {
                       const Slice* first_key_in_next_block,
                       const BlockHandle& block_handle,
                       std::string* separator_scratch) override {
-    Slice separator;
+    Slice separator_with_seq;
     if (first_key_in_next_block != nullptr) {
       if (shortening_mode_ !=
           BlockBasedTableOptions::IndexShorteningMode::kNoShortening) {
-        separator = FindShortestInternalKeySeparator(
+        separator_with_seq = FindShortestInternalKeySeparator(
             *comparator_->user_comparator(), last_key_in_current_block,
             *first_key_in_next_block, separator_scratch);
       } else {
-        separator = last_key_in_current_block;
+        separator_with_seq = last_key_in_current_block;
       }
-      if (!seperator_is_key_plus_seq_ &&
+      if (!must_use_separator_with_seq_ &&
           ShouldUseKeyPlusSeqAsSeparator(last_key_in_current_block,
                                          *first_key_in_next_block)) {
-        seperator_is_key_plus_seq_ = true;
+        must_use_separator_with_seq_ = true;
       }
     } else {
       if (shortening_mode_ == BlockBasedTableOptions::IndexShorteningMode::
                                   kShortenSeparatorsAndSuccessor) {
-        separator = FindShortInternalKeySuccessor(
+        separator_with_seq = FindShortInternalKeySuccessor(
             *comparator_->user_comparator(), last_key_in_current_block,
             separator_scratch);
       } else {
-        separator = last_key_in_current_block;
+        separator_with_seq = last_key_in_current_block;
       }
     }
 
@@ -254,21 +255,22 @@ class ShortenedIndexBuilder : public IndexBuilder {
     // away the UDT from key in index block as data block does the same thing.
     // What are the implications if a "FindShortInternalKeySuccessor"
     // optimization is provided.
-    index_block_builder_.Add(separator, encoded_entry,
+    index_block_builder_.Add(separator_with_seq, encoded_entry,
                              &delta_encoded_entry_slice);
-    if (!seperator_is_key_plus_seq_) {
-      index_block_builder_without_seq_.Add(
-          ExtractUserKey(separator), encoded_entry, &delta_encoded_entry_slice);
+    if (!must_use_separator_with_seq_) {
+      index_block_builder_without_seq_.Add(ExtractUserKey(separator_with_seq),
+                                           encoded_entry,
+                                           &delta_encoded_entry_slice);
     }
 
     current_block_first_internal_key_.clear();
-    return separator;
+    return separator_with_seq;
   }
 
   using IndexBuilder::Finish;
   Status Finish(IndexBlocks* index_blocks,
                 const BlockHandle& /*last_partition_block_handle*/) override {
-    if (seperator_is_key_plus_seq_) {
+    if (must_use_separator_with_seq_) {
       index_blocks->index_block_contents = index_block_builder_.Finish();
     } else {
       index_blocks->index_block_contents =
@@ -280,8 +282,8 @@ class ShortenedIndexBuilder : public IndexBuilder {
 
   size_t IndexSize() const override { return index_size_; }
 
-  bool seperator_is_key_plus_seq() override {
-    return seperator_is_key_plus_seq_;
+  bool separator_is_key_plus_seq() override {
+    return must_use_separator_with_seq_;
   }
 
   // Changes *key to a short string >= *key.
@@ -299,9 +301,13 @@ class ShortenedIndexBuilder : public IndexBuilder {
 
  private:
   BlockBuilder index_block_builder_;
+  // TODO: consider optimizing to only one builder. When discovering that
+  // sequence numbers are needed, read existing entries without seq and rewrite
+  // them with seq (which should be trivial to populate since seq wasn't needed
+  // before).
   BlockBuilder index_block_builder_without_seq_;
   const bool use_value_delta_encoding_;
-  bool seperator_is_key_plus_seq_;
+  bool must_use_separator_with_seq_;
   const bool include_first_key_;
   BlockBasedTableOptions::IndexShorteningMode shortening_mode_;
   BlockHandle last_encoded_handle_ = BlockHandle::NullBlockHandle();
@@ -407,8 +413,8 @@ class HashIndexBuilder : public IndexBuilder {
            prefix_meta_block_.size();
   }
 
-  bool seperator_is_key_plus_seq() override {
-    return primary_index_builder_.seperator_is_key_plus_seq();
+  bool separator_is_key_plus_seq() override {
+    return primary_index_builder_.separator_is_key_plus_seq();
   }
 
  private:
@@ -491,8 +497,8 @@ class PartitionedIndexBuilder : public IndexBuilder {
   // cutting the next partition
   void RequestPartitionCut();
 
-  bool seperator_is_key_plus_seq() override {
-    return seperator_is_key_plus_seq_;
+  bool separator_is_key_plus_seq() override {
+    return must_use_separator_with_seq_;
   }
 
   bool get_use_value_delta_encoding() const {
@@ -521,11 +527,11 @@ class PartitionedIndexBuilder : public IndexBuilder {
   // the active partition index builder
   std::unique_ptr<ShortenedIndexBuilder> sub_index_builder_;
   // the last key in the active partition index builder
-  std::unique_ptr<FlushBlockPolicy> flush_policy_;
+  std::unique_ptr<RetargetableFlushBlockPolicy> flush_policy_;
   // true if Finish is called once but not complete yet.
   bool finishing_indexes = false;
   const BlockBasedTableOptions& table_opt_;
-  bool seperator_is_key_plus_seq_;
+  bool must_use_separator_with_seq_;
   bool use_value_delta_encoding_;
   // true if an external entity (such as filter partition builder) request
   // cutting the next partition
diff --git a/table/block_based/partitioned_filter_block.cc b/table/block_based/partitioned_filter_block.cc
index 42cfce462abe..c7024895453f 100644
--- a/table/block_based/partitioned_filter_block.cc
+++ b/table/block_based/partitioned_filter_block.cc
@@ -240,7 +240,7 @@ Status PartitionedFilterBlockBuilder::Finish(
 
     index_on_filter_block_builder_.Add(e.ikey, handle_encoding,
                                        &handle_delta_encoding_slice);
-    if (!p_index_builder_->seperator_is_key_plus_seq()) {
+    if (!p_index_builder_->separator_is_key_plus_seq()) {
       index_on_filter_block_builder_without_seq_.Add(
           ExtractUserKey(e.ikey), handle_encoding,
           &handle_delta_encoding_slice);
@@ -267,7 +267,7 @@ Status PartitionedFilterBlockBuilder::Finish(
     if (UNLIKELY(filters_.empty())) {
       if (!index_on_filter_block_builder_.empty()) {
         // Simplest to just add them all at the end
-        if (p_index_builder_->seperator_is_key_plus_seq()) {
+        if (p_index_builder_->separator_is_key_plus_seq()) {
           *filter = index_on_filter_block_builder_.Finish();
         } else {
           *filter = index_on_filter_block_builder_without_seq_.Finish();
diff --git a/table/block_based/partitioned_filter_block_test.cc b/table/block_based/partitioned_filter_block_test.cc
index 80cb131a990b..a5aa94a8e334 100644
--- a/table/block_based/partitioned_filter_block_test.cc
+++ b/table/block_based/partitioned_filter_block_test.cc
@@ -27,7 +27,7 @@ class MockedBlockBasedTable : public BlockBasedTable {
   MockedBlockBasedTable(Rep* rep, PartitionedIndexBuilder* pib)
       : BlockBasedTable(rep, /*block_cache_tracer=*/nullptr) {
     // Initialize what Open normally does as much as necessary for the test
-    rep->index_key_includes_seq = pib->seperator_is_key_plus_seq();
+    rep->index_key_includes_seq = pib->separator_is_key_plus_seq();
     rep->index_value_is_full = !pib->get_use_value_delta_encoding();
   }
 };
diff --git a/table/block_based/user_defined_index_wrapper.h b/table/block_based/user_defined_index_wrapper.h
index 8a760a09ea9e..2cb42a0765fb 100644
--- a/table/block_based/user_defined_index_wrapper.h
+++ b/table/block_based/user_defined_index_wrapper.h
@@ -111,8 +111,8 @@ class UserDefinedIndexBuilderWrapper : public IndexBuilder {
 
   size_t IndexSize() const override { return index_size_; }
 
-  bool seperator_is_key_plus_seq() override {
-    return internal_index_builder_->seperator_is_key_plus_seq();
+  bool separator_is_key_plus_seq() override {
+    return internal_index_builder_->separator_is_key_plus_seq();
   }
 
  private:

From a88d367096466638dfaacd4bbd0a4cd34f25acb9 Mon Sep 17 00:00:00 2001
From: Jay Huh <jewoongh@meta.com>
Date: Mon, 4 Aug 2025 17:16:25 -0700
Subject: [PATCH 212/500] Minor Refactor - VerifyOutputRecordCount (#13830)

Summary:
Introduce `CompactionJob::VerifyOutputRecordCount()` and make it align with `VerifyInputRecordCount()`.

Functionality-wise, it should be the same except when `db_options_.compaction_verify_record_count` is false. RocksDB will only print WARN message upon verification failure and not return `Status::Corruption()`.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13830

Test Plan:
Existing tests cover both
```
 ./compaction_service_test --gtest_filter="*CompactionServiceTest.VerifyInputRecordCount*"
```

```
 ./compaction_service_test --gtest_filter="*CompactionServiceTest.CorruptedOutput*"
```

Reviewed By: hx235

Differential Revision: D79584795

Pulled By: jaykorean

fbshipit-source-id: 5851328999005601b28504085b688b80880bca7c
---
 db/compaction/compaction_job.cc | 83 +++++++++++++++++----------------
 db/compaction/compaction_job.h  |  1 +
 2 files changed, 45 insertions(+), 39 deletions(-)

diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc
index 5d666429ad6a..adf86acd54e2 100644
--- a/db/compaction/compaction_job.cc
+++ b/db/compaction/compaction_job.cc
@@ -867,12 +867,6 @@ Status CompactionJob::Run() {
   if (status.ok() && ok) {
     if (job_stats_->has_num_input_records) {
       status = VerifyInputRecordCount(num_input_range_del);
-      if (!status.ok()) {
-        ROCKS_LOG_WARN(
-            db_options_.info_log, "[%s] [JOB %d] Compaction with status: %s",
-            compact_->compaction->column_family_data()->GetName().c_str(),
-            job_context_->job_id, status.ToString().c_str());
-      }
     }
     UpdateCompactionJobInputStats(internal_stats_, num_input_range_del);
   }
@@ -881,39 +875,11 @@ Status CompactionJob::Run() {
   // Verify number of output records
   // Only verify on table with format collects table properties
   const auto& mutable_cf_options = compact_->compaction->mutable_cf_options();
-  if (status.ok() &&
-      (mutable_cf_options.table_factory->IsInstanceOf(
-           TableFactory::kBlockBasedTableName()) ||
-       mutable_cf_options.table_factory->IsInstanceOf(
-           TableFactory::kPlainTableName())) &&
-      db_options_.compaction_verify_record_count) {
-    uint64_t total_output_num = 0;
-    for (const auto& state : compact_->sub_compact_states) {
-      for (const auto& output : state.GetOutputs()) {
-        total_output_num += output.table_properties->num_entries -
-                            output.table_properties->num_range_deletions;
-      }
-    }
-
-    uint64_t expected = internal_stats_.output_level_stats.num_output_records;
-    if (internal_stats_.has_proximal_level_output) {
-      expected += internal_stats_.proximal_level_stats.num_output_records;
-    }
-    if (expected != total_output_num) {
-      char scratch[2345];
-      compact_->compaction->Summary(scratch, sizeof(scratch));
-      std::string msg =
-          "Number of keys in compaction output SST files does not match "
-          "number of keys added. Expected " +
-          std::to_string(expected) + " but there are " +
-          std::to_string(total_output_num) +
-          " in output SST files. Compaction summary: " + scratch;
-      ROCKS_LOG_WARN(
-          db_options_.info_log, "[%s] [JOB %d] Compaction with status: %s",
-          compact_->compaction->column_family_data()->GetName().c_str(),
-          job_context_->job_id, msg.c_str());
-      status = Status::Corruption(msg);
-    }
+  if (status.ok() && (mutable_cf_options.table_factory->IsInstanceOf(
+                          TableFactory::kBlockBasedTableName()) ||
+                      mutable_cf_options.table_factory->IsInstanceOf(
+                          TableFactory::kPlainTableName()))) {
+    status = VerifyOutputRecordCount();
   }
 
   RecordCompactionIOStats();
@@ -2378,6 +2344,11 @@ Status CompactionJob::VerifyInputRecordCount(
           "number of keys processed. Expected " +
           std::to_string(expected) + " but processed " +
           std::to_string(actual) + ". Compaction summary: " + scratch;
+      ROCKS_LOG_WARN(
+          db_options_.info_log,
+          "[%s] [JOB %d] VerifyInputRecordCount() Status: %s",
+          compact_->compaction->column_family_data()->GetName().c_str(),
+          job_context_->job_id, msg.c_str());
       if (db_options_.compaction_verify_record_count) {
         return Status::Corruption(msg);
       }
@@ -2386,4 +2357,38 @@ Status CompactionJob::VerifyInputRecordCount(
   return Status::OK();
 }
 
+Status CompactionJob::VerifyOutputRecordCount() const {
+  uint64_t total_output_num = 0;
+  for (const auto& state : compact_->sub_compact_states) {
+    for (const auto& output : state.GetOutputs()) {
+      total_output_num += output.table_properties->num_entries -
+                          output.table_properties->num_range_deletions;
+    }
+  }
+
+  uint64_t expected = internal_stats_.output_level_stats.num_output_records;
+  if (internal_stats_.has_proximal_level_output) {
+    expected += internal_stats_.proximal_level_stats.num_output_records;
+  }
+  if (expected != total_output_num) {
+    char scratch[2345];
+    compact_->compaction->Summary(scratch, sizeof(scratch));
+    std::string msg =
+        "Number of keys in compaction output SST files does not match "
+        "number of keys added. Expected " +
+        std::to_string(expected) + " but there are " +
+        std::to_string(total_output_num) +
+        " in output SST files. Compaction summary: " + scratch;
+    ROCKS_LOG_WARN(
+        db_options_.info_log,
+        "[%s] [JOB %d] VerifyOutputRecordCount() status: %s",
+        compact_->compaction->column_family_data()->GetName().c_str(),
+        job_context_->job_id, msg.c_str());
+    if (db_options_.compaction_verify_record_count) {
+      return Status::Corruption(msg);
+    }
+  }
+  return Status::OK();
+}
+
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/db/compaction/compaction_job.h b/db/compaction/compaction_job.h
index e7e209c74412..bc116de9971f 100644
--- a/db/compaction/compaction_job.h
+++ b/db/compaction/compaction_job.h
@@ -250,6 +250,7 @@ class CompactionJob {
       uint64_t num_input_range_del) const;
 
   Status VerifyInputRecordCount(uint64_t num_input_range_del) const;
+  Status VerifyOutputRecordCount() const;
 
   // Generates a histogram representing potential divisions of key ranges from
   // the input. It adds the starting and/or ending keys of certain input files

From 799079cac5ff62ea3404a7fdacb4a09fa9d9d917 Mon Sep 17 00:00:00 2001
From: Maciej Szeszko <mszeszko@meta.com>
Date: Tue, 5 Aug 2025 10:48:49 -0700
Subject: [PATCH 213/500] Handle drop column family version edit in file
 checksum retriever (#13832)

Summary:
... by ensuring that files in dropped column family are not returned to the caller upon successful, offline MANIFEST iteration.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13832

Test Plan: `DBTest2, GetFileChecksumsFromCurrentManifest_CRC32`

Reviewed By: pdillinger

Differential Revision: D79607298

Pulled By: mszeszko-meta

fbshipit-source-id: e7948e086ba6e6fb953a3959fdcc81300613d73e
---
 db/db_test2.cc                                | 16 +++++++
 db/experimental.cc                            |  9 ++--
 db/version_edit_handler.cc                    | 48 +++++++++++++------
 db/version_edit_handler.h                     | 33 +++++++++++--
 ...edit_version_in_file_checksum_retriever.md |  1 +
 5 files changed, 84 insertions(+), 23 deletions(-)
 create mode 100644 unreleased_history/bug_fixes/handle_drop_column_family_edit_version_in_file_checksum_retriever.md

diff --git a/db/db_test2.cc b/db/db_test2.cc
index 0d9d306e6bf2..b84c4b35a1fb 100644
--- a/db/db_test2.cc
+++ b/db/db_test2.cc
@@ -7466,11 +7466,27 @@ TEST_F(DBTest2, GetFileChecksumsFromCurrentManifest_CRC32) {
   FlushOptions fopts;
   fopts.wait = true;
   Random rnd(test::RandomSeed());
+
+  // Write 4 files into the default column family.
   for (int i = 0; i < 4; i++) {
     ASSERT_OK(db->Put(wopts, Key(i), rnd.RandomString(100)));
     ASSERT_OK(db->Flush(fopts));
   }
 
+  // Create a new column family, write 1 file into it and drop it.
+  ColumnFamilyHandle* cf;
+  ASSERT_OK(
+      db->CreateColumnFamily(ColumnFamilyOptions(), "soon_to_be_deleted", &cf));
+  ASSERT_OK(db->Put(wopts, cf, "some_key", "some_value"));
+  ASSERT_OK(db->Flush(fopts, cf));
+
+  // Drop column family should generate corresponding version edit
+  // in manifest, which we expect to be correctly interpreted by
+  // GetFileChecksumsFromCurrentManifest API after db close.
+  ASSERT_OK(db->DropColumnFamily(cf));
+  delete cf;
+  cf = nullptr;
+
   // Obtain rich files metadata for source of truth.
   std::vector<LiveFileMetaData> live_files;
   db->GetLiveFilesMetaData(&live_files);
diff --git a/db/experimental.cc b/db/experimental.cc
index 2193342e056f..597767b37b70 100644
--- a/db/experimental.cc
+++ b/db/experimental.cc
@@ -88,11 +88,12 @@ Status GetFileChecksumsFromCurrentManifest(FileSystem* fs,
 
   // Read all records from the manifest file...
   uint64_t manifest_file_size = std::numeric_limits<uint64_t>::max();
-  FileChecksumRetriever retriever(read_options, manifest_file_size,
-                                  *checksum_list);
+  FileChecksumRetriever retriever(read_options, manifest_file_size);
   retriever.Iterate(reader, &s);
-
-  return retriever.status();
+  if (!retriever.status().ok()) {
+    return retriever.status();
+  }
+  return retriever.FetchFileChecksumList(*checksum_list);
 }
 
 Status UpdateManifestForFilesState(
diff --git a/db/version_edit_handler.cc b/db/version_edit_handler.cc
index c89fe0a42123..42d83b84d627 100644
--- a/db/version_edit_handler.cc
+++ b/db/version_edit_handler.cc
@@ -117,21 +117,43 @@ Status ListColumnFamiliesHandler::ApplyVersionEdit(
   return s;
 }
 
+Status FileChecksumRetriever::FetchFileChecksumList(
+    FileChecksumList& file_checksum_list) {
+  Status s = Status::OK();
+  for (const auto& [cf, file_checksums] : cf_file_checksums_) {
+    [[maybe_unused]] const auto& _ = cf;
+    for (const auto& [file_number, info] : file_checksums) {
+      if (!(s = file_checksum_list.InsertOneFileChecksum(
+                file_number, info.first, info.second))
+               .ok()) {
+        break;
+      }
+    }
+  }
+  return s;
+}
+
 Status FileChecksumRetriever::ApplyVersionEdit(VersionEdit& edit,
                                                ColumnFamilyData** /*unused*/) {
+  uint32_t column_family_id = edit.GetColumnFamily();
+  if (edit.IsColumnFamilyDrop()) {
+    cf_file_checksums_.erase(column_family_id);
+  }
   for (const auto& deleted_file : edit.GetDeletedFiles()) {
-    Status s = file_checksum_list_.RemoveOneFileChecksum(deleted_file.second);
-    if (!s.ok()) {
-      return s;
+    if (cf_file_checksums_.find(column_family_id) == cf_file_checksums_.end()) {
+      return Status::NotFound();
+    }
+    if (cf_file_checksums_[column_family_id].find(deleted_file.second) ==
+        cf_file_checksums_[column_family_id].end()) {
+      return Status::NotFound();
     }
+    cf_file_checksums_[column_family_id].erase(deleted_file.second);
   }
   for (const auto& new_file : edit.GetNewFiles()) {
-    Status s = file_checksum_list_.InsertOneFileChecksum(
-        new_file.second.fd.GetNumber(), new_file.second.file_checksum,
-        new_file.second.file_checksum_func_name);
-    if (!s.ok()) {
-      return s;
-    }
+    cf_file_checksums_[column_family_id].emplace(
+        new_file.second.fd.GetNumber(),
+        std::make_pair(new_file.second.file_checksum,
+                       new_file.second.file_checksum_func_name));
   }
   for (const auto& new_blob_file : edit.GetBlobFileAdditions()) {
     std::string checksum_value = new_blob_file.GetChecksumValue();
@@ -141,11 +163,9 @@ Status FileChecksumRetriever::ApplyVersionEdit(VersionEdit& edit,
       checksum_value = kUnknownFileChecksum;
       checksum_method = kUnknownFileChecksumFuncName;
     }
-    Status s = file_checksum_list_.InsertOneFileChecksum(
-        new_blob_file.GetBlobFileNumber(), checksum_value, checksum_method);
-    if (!s.ok()) {
-      return s;
-    }
+    cf_file_checksums_[column_family_id].emplace(
+        new_blob_file.GetBlobFileNumber(),
+        std::make_pair(checksum_value, checksum_method));
   }
   return Status::OK();
 }
diff --git a/db/version_edit_handler.h b/db/version_edit_handler.h
index 0cef558826b8..1d4b22e3c13e 100644
--- a/db/version_edit_handler.h
+++ b/db/version_edit_handler.h
@@ -80,19 +80,42 @@ class ListColumnFamiliesHandler : public VersionEditHandlerBase {
 
 class FileChecksumRetriever : public VersionEditHandlerBase {
  public:
-  FileChecksumRetriever(const ReadOptions& read_options, uint64_t max_read_size,
-                        FileChecksumList& file_checksum_list)
-      : VersionEditHandlerBase(read_options, max_read_size),
-        file_checksum_list_(file_checksum_list) {}
+  FileChecksumRetriever(const ReadOptions& read_options, uint64_t max_read_size)
+      : VersionEditHandlerBase(read_options, max_read_size) {}
 
   ~FileChecksumRetriever() override {}
 
+  Status FetchFileChecksumList(FileChecksumList& file_checksum_list);
+
  protected:
   Status ApplyVersionEdit(VersionEdit& edit,
                           ColumnFamilyData** /*unused*/) override;
 
  private:
-  FileChecksumList& file_checksum_list_;
+  // Map from CF to file # to string pair, where first portion of the value
+  // is checksum, and second portion of the value is checksum function name.
+  //
+  // [column family id A]
+  //      |
+  //      |-- [file #1] -> [checksum #1, checksum function name #1]
+  //      |-- [file #2] -> [checksum #2, checksum function name #2]
+  //      |
+  //     ...
+  //      |
+  //      |-- [file #N] -> [checksum #N, checksum function name #N]
+  // [column family id B]
+  //      |
+  //      |-- [file #1] -> [checksum #1, checksum function name #1]
+  //      |
+  //     ...
+  //      |
+  //      |-- [file #M] -> [checksum #M, checksum function name #M]
+  //      |
+  //     ...
+  std::unordered_map<
+      uint32_t,
+      std::unordered_map<uint64_t, std::pair<std::string, std::string>>>
+      cf_file_checksums_;
 };
 
 using VersionBuilderUPtr = std::unique_ptr<BaseReferencedVersionBuilder>;
diff --git a/unreleased_history/bug_fixes/handle_drop_column_family_edit_version_in_file_checksum_retriever.md b/unreleased_history/bug_fixes/handle_drop_column_family_edit_version_in_file_checksum_retriever.md
new file mode 100644
index 000000000000..2e7cbf3a3007
--- /dev/null
+++ b/unreleased_history/bug_fixes/handle_drop_column_family_edit_version_in_file_checksum_retriever.md
@@ -0,0 +1 @@
+* Files in dropped column family won't be returned to the caller upon successful, offline MANIFEST iteration in `GetFileChecksumsFromCurrentManifest`.

From b6e804b7de683450462b133a9d90b6c166ec7e90 Mon Sep 17 00:00:00 2001
From: Jay Huh <jewoongh@meta.com>
Date: Tue, 5 Aug 2025 13:11:01 -0700
Subject: [PATCH 214/500] Rename CompactFiles() and CompactRange() in
 CompactionPickers (#13831)

Summary:
#Summary

Quick follow-up from https://github.com/facebook/rocksdb/pull/13816: `CompactFiles()` and `CompactRange()` in CompactionPickers do not run compaction as their names might suggest. What they actually do is create the Compaction object that will be passed to `CompactionJob` to run the compaction.

Renaming these two functions to better represent their purposes.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13831

Test Plan: No functional change. Existing CI should be sufficient.

Reviewed By: hx235

Differential Revision: D79660196

Pulled By: jaykorean

fbshipit-source-id: ca831dbef5120e7115b52fd07b0059ca16c8f1e8
---
 db/column_family.cc                     |   2 +-
 db/compaction/compaction_picker.cc      |   4 +-
 db/compaction/compaction_picker.h       |  27 +++---
 db/compaction/compaction_picker_fifo.cc |   2 +-
 db/compaction/compaction_picker_fifo.h  |  18 ++--
 db/compaction/compaction_picker_test.cc | 108 +++++++++++++-----------
 db/db_impl/db_impl_compaction_flush.cc  |   2 +-
 db/db_impl/db_impl_secondary.cc         |   2 +-
 8 files changed, 88 insertions(+), 77 deletions(-)

diff --git a/db/column_family.cc b/db/column_family.cc
index 5968fa726ae7..100e13050bff 100644
--- a/db/column_family.cc
+++ b/db/column_family.cc
@@ -1331,7 +1331,7 @@ Compaction* ColumnFamilyData::CompactRange(
     const InternalKey* begin, const InternalKey* end,
     InternalKey** compaction_end, bool* conflict,
     uint64_t max_file_num_to_ignore, const std::string& trim_ts) {
-  auto* result = compaction_picker_->CompactRange(
+  auto* result = compaction_picker_->PickCompactionForCompactRange(
       GetName(), mutable_cf_options, mutable_db_options,
       current_->storage_info(), input_level, output_level,
       compact_range_options, begin, end, compaction_end, conflict,
diff --git a/db/compaction/compaction_picker.cc b/db/compaction/compaction_picker.cc
index 7f6cdffdaca9..9b40c05c1fde 100644
--- a/db/compaction/compaction_picker.cc
+++ b/db/compaction/compaction_picker.cc
@@ -333,7 +333,7 @@ bool CompactionPicker::AreFilesInCompaction(
   return false;
 }
 
-Compaction* CompactionPicker::CompactFiles(
+Compaction* CompactionPicker::PickCompactionForCompactFiles(
     const CompactionOptions& compact_options,
     const std::vector<CompactionInputFiles>& input_files, int output_level,
     VersionStorageInfo* vstorage, const MutableCFOptions& mutable_cf_options,
@@ -603,7 +603,7 @@ void CompactionPicker::GetGrandparents(
   }
 }
 
-Compaction* CompactionPicker::CompactRange(
+Compaction* CompactionPicker::PickCompactionForCompactRange(
     const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
     const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
     int input_level, int output_level,
diff --git a/db/compaction/compaction_picker.h b/db/compaction/compaction_picker.h
index 1212e648b6b6..9591d8f0d23b 100644
--- a/db/compaction/compaction_picker.h
+++ b/db/compaction/compaction_picker.h
@@ -75,7 +75,7 @@ class CompactionPicker {
   // *compaction_end should point to valid InternalKey!
   // REQUIRES: If not compacting all levels (input_level == kCompactAllLevels),
   // then levels between input_level and output_level should be empty.
-  virtual Compaction* CompactRange(
+  virtual Compaction* PickCompactionForCompactRange(
       const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
       const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
       int input_level, int output_level,
@@ -121,7 +121,7 @@ class CompactionPicker {
   //  TODO - Remove default values for earliest_snapshot and snapshot_checker
   //  and require all callers to pass them in so that DB::CompactFiles() can
   //  also benefit from Standalone Range Tombstone Optimization
-  Compaction* CompactFiles(
+  Compaction* PickCompactionForCompactFiles(
       const CompactionOptions& compact_options,
       const std::vector<CompactionInputFiles>& input_files, int output_level,
       VersionStorageInfo* vstorage, const MutableCFOptions& mutable_cf_options,
@@ -283,18 +283,17 @@ class NullCompactionPicker : public CompactionPicker {
   }
 
   // Always return "nullptr"
-  Compaction* CompactRange(const std::string& /*cf_name*/,
-                           const MutableCFOptions& /*mutable_cf_options*/,
-                           const MutableDBOptions& /*mutable_db_options*/,
-                           VersionStorageInfo* /*vstorage*/,
-                           int /*input_level*/, int /*output_level*/,
-                           const CompactRangeOptions& /*compact_range_options*/,
-                           const InternalKey* /*begin*/,
-                           const InternalKey* /*end*/,
-                           InternalKey** /*compaction_end*/,
-                           bool* /*manual_conflict*/,
-                           uint64_t /*max_file_num_to_ignore*/,
-                           const std::string& /*trim_ts*/) override {
+  Compaction* PickCompactionForCompactRange(
+      const std::string& /*cf_name*/,
+      const MutableCFOptions& /*mutable_cf_options*/,
+      const MutableDBOptions& /*mutable_db_options*/,
+      VersionStorageInfo* /*vstorage*/, int /*input_level*/,
+      int /*output_level*/,
+      const CompactRangeOptions& /*compact_range_options*/,
+      const InternalKey* /*begin*/, const InternalKey* /*end*/,
+      InternalKey** /*compaction_end*/, bool* /*manual_conflict*/,
+      uint64_t /*max_file_num_to_ignore*/,
+      const std::string& /*trim_ts*/) override {
     return nullptr;
   }
 
diff --git a/db/compaction/compaction_picker_fifo.cc b/db/compaction/compaction_picker_fifo.cc
index e2a241b625cf..cc2a9bfd0aa1 100644
--- a/db/compaction/compaction_picker_fifo.cc
+++ b/db/compaction/compaction_picker_fifo.cc
@@ -440,7 +440,7 @@ Compaction* FIFOCompactionPicker::PickCompaction(
   return c;
 }
 
-Compaction* FIFOCompactionPicker::CompactRange(
+Compaction* FIFOCompactionPicker::PickCompactionForCompactRange(
     const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
     const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
     int input_level, int output_level,
diff --git a/db/compaction/compaction_picker_fifo.h b/db/compaction/compaction_picker_fifo.h
index 6178be7be2de..f1538506163b 100644
--- a/db/compaction/compaction_picker_fifo.h
+++ b/db/compaction/compaction_picker_fifo.h
@@ -26,16 +26,14 @@ class FIFOCompactionPicker : public CompactionPicker {
       VersionStorageInfo* version, LogBuffer* log_buffer,
       bool /* require_max_output_level*/ = false) override;
 
-  Compaction* CompactRange(const std::string& cf_name,
-                           const MutableCFOptions& mutable_cf_options,
-                           const MutableDBOptions& mutable_db_options,
-                           VersionStorageInfo* vstorage, int input_level,
-                           int output_level,
-                           const CompactRangeOptions& compact_range_options,
-                           const InternalKey* begin, const InternalKey* end,
-                           InternalKey** compaction_end, bool* manual_conflict,
-                           uint64_t max_file_num_to_ignore,
-                           const std::string& trim_ts) override;
+  Compaction* PickCompactionForCompactRange(
+      const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+      const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+      int input_level, int output_level,
+      const CompactRangeOptions& compact_range_options,
+      const InternalKey* begin, const InternalKey* end,
+      InternalKey** compaction_end, bool* manual_conflict,
+      uint64_t max_file_num_to_ignore, const std::string& trim_ts) override;
 
   // The maximum allowed output level.  Always returns 0.
   int MaxOutputLevel() const override { return 0; }
diff --git a/db/compaction/compaction_picker_test.cc b/db/compaction/compaction_picker_test.cc
index 73aeae6d1ae3..1f805cc5d652 100644
--- a/db/compaction/compaction_picker_test.cc
+++ b/db/compaction/compaction_picker_test.cc
@@ -2689,13 +2689,14 @@ TEST_F(CompactionPickerTest, CompactRangeMaxCompactionBytes) {
   bool manual_conflict = false;
   InternalKey manual_end;
   InternalKey* manual_end_ptr = &manual_end;
-  std::unique_ptr<Compaction> compaction(level_compaction_picker.CompactRange(
-      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-      /*input_level=*/1, /*output_level=*/2,
-      /*compact_range_options*/ {}, /*begin=*/nullptr, /*end=*/nullptr,
-      &manual_end_ptr, &manual_conflict,
-      /*max_file_num_to_ignore=*/std::numeric_limits<uint64_t>::max(),
-      /*trim_ts=*/""));
+  std::unique_ptr<Compaction> compaction(
+      level_compaction_picker.PickCompactionForCompactRange(
+          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+          /*input_level=*/1, /*output_level=*/2,
+          /*compact_range_options*/ {}, /*begin=*/nullptr, /*end=*/nullptr,
+          &manual_end_ptr, &manual_conflict,
+          /*max_file_num_to_ignore=*/std::numeric_limits<uint64_t>::max(),
+          /*trim_ts=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(1U, compaction->num_input_levels());
   ASSERT_EQ(2, compaction->output_level());
@@ -3627,7 +3628,7 @@ TEST_F(CompactionPickerTest, UniversalMarkedManualCompaction) {
   bool manual_conflict = false;
   InternalKey* manual_end = nullptr;
   std::unique_ptr<Compaction> compaction(
-      universal_compaction_picker.CompactRange(
+      universal_compaction_picker.PickCompactionForCompactRange(
           cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
           ColumnFamilyData::kCompactAllLevels, 6, CompactRangeOptions(),
           nullptr, nullptr, &manual_end, &manual_conflict,
@@ -3831,9 +3832,10 @@ TEST_F(CompactionPickerU64TsTest, Overlap) {
   std::vector<CompactionInputFiles> input_files;
   ASSERT_OK(level_compaction_picker.GetCompactionInputsFromFileNumbers(
       &input_files, &input, vstorage_.get(), CompactionOptions()));
-  std::unique_ptr<Compaction> comp1(level_compaction_picker.CompactFiles(
-      CompactionOptions(), input_files, level, vstorage_.get(),
-      mutable_cf_options_, mutable_db_options_, /*output_path_id=*/0));
+  std::unique_ptr<Compaction> comp1(
+      level_compaction_picker.PickCompactionForCompactFiles(
+          CompactionOptions(), input_files, level, vstorage_.get(),
+          mutable_cf_options_, mutable_db_options_, /*output_path_id=*/0));
 
   {
     // [600, ts=50000] to [600, ts=50000] is the range to check.
@@ -3942,9 +3944,10 @@ TEST_P(PerKeyPlacementCompactionPickerTest, OverlapWithNormalCompaction) {
   ASSERT_OK(level_compaction_picker.GetCompactionInputsFromFileNumbers(
       &input_files, &input_set, vstorage_.get(), comp_options));
 
-  std::unique_ptr<Compaction> comp1(level_compaction_picker.CompactFiles(
-      comp_options, input_files, 5, vstorage_.get(), mutable_cf_options_,
-      mutable_db_options_, 0));
+  std::unique_ptr<Compaction> comp1(
+      level_compaction_picker.PickCompactionForCompactFiles(
+          comp_options, input_files, 5, vstorage_.get(), mutable_cf_options_,
+          mutable_db_options_, 0));
 
   input_set.clear();
   input_files.clear();
@@ -3988,9 +3991,10 @@ TEST_P(PerKeyPlacementCompactionPickerTest, NormalCompactionOverlap) {
   ASSERT_OK(level_compaction_picker.GetCompactionInputsFromFileNumbers(
       &input_files, &input_set, vstorage_.get(), comp_options));
 
-  std::unique_ptr<Compaction> comp1(level_compaction_picker.CompactFiles(
-      comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_,
-      mutable_db_options_, 0));
+  std::unique_ptr<Compaction> comp1(
+      level_compaction_picker.PickCompactionForCompactFiles(
+          comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_,
+          mutable_db_options_, 0));
 
   input_set.clear();
   input_files.clear();
@@ -4030,9 +4034,10 @@ TEST_P(PerKeyPlacementCompactionPickerTest,
   ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
       &input_files, &input_set, vstorage_.get(), comp_options));
 
-  std::unique_ptr<Compaction> comp1(universal_compaction_picker.CompactFiles(
-      comp_options, input_files, 5, vstorage_.get(), mutable_cf_options_,
-      mutable_db_options_, 0));
+  std::unique_ptr<Compaction> comp1(
+      universal_compaction_picker.PickCompactionForCompactFiles(
+          comp_options, input_files, 5, vstorage_.get(), mutable_cf_options_,
+          mutable_db_options_, 0));
 
   input_set.clear();
   input_files.clear();
@@ -4077,9 +4082,10 @@ TEST_P(PerKeyPlacementCompactionPickerTest, NormalCompactionOverlapUniversal) {
   ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
       &input_files, &input_set, vstorage_.get(), comp_options));
 
-  std::unique_ptr<Compaction> comp1(universal_compaction_picker.CompactFiles(
-      comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_,
-      mutable_db_options_, 0));
+  std::unique_ptr<Compaction> comp1(
+      universal_compaction_picker.PickCompactionForCompactFiles(
+          comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_,
+          mutable_db_options_, 0));
 
   input_set.clear();
   input_files.clear();
@@ -4125,9 +4131,10 @@ TEST_P(PerKeyPlacementCompactionPickerTest, ProximalOverlapUniversal) {
   ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
       &input_files, &input_set, vstorage_.get(), comp_options));
 
-  std::unique_ptr<Compaction> comp1(universal_compaction_picker.CompactFiles(
-      comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_,
-      mutable_db_options_, 0));
+  std::unique_ptr<Compaction> comp1(
+      universal_compaction_picker.PickCompactionForCompactFiles(
+          comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_,
+          mutable_db_options_, 0));
 
   input_set.clear();
   input_files.clear();
@@ -4176,9 +4183,10 @@ TEST_P(PerKeyPlacementCompactionPickerTest, LastLevelOnlyOverlapUniversal) {
   ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
       &input_files, &input_set, vstorage_.get(), comp_options));
 
-  std::unique_ptr<Compaction> comp1(universal_compaction_picker.CompactFiles(
-      comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_,
-      mutable_db_options_, 0));
+  std::unique_ptr<Compaction> comp1(
+      universal_compaction_picker.PickCompactionForCompactFiles(
+          comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_,
+          mutable_db_options_, 0));
 
   // cannot compact file 41 if the preclude_last_level feature is on, otherwise
   // compact file 41 is okay.
@@ -4234,9 +4242,10 @@ TEST_P(PerKeyPlacementCompactionPickerTest,
   ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
       &input_files, &input_set, vstorage_.get(), comp_options));
 
-  std::unique_ptr<Compaction> comp1(universal_compaction_picker.CompactFiles(
-      comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_,
-      mutable_db_options_, 0));
+  std::unique_ptr<Compaction> comp1(
+      universal_compaction_picker.PickCompactionForCompactFiles(
+          comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_,
+          mutable_db_options_, 0));
 
   ASSERT_TRUE(comp1);
   ASSERT_EQ(comp1->GetProximalLevel(), Compaction::kInvalidLevel);
@@ -4252,9 +4261,10 @@ TEST_P(PerKeyPlacementCompactionPickerTest,
   ASSERT_FALSE(universal_compaction_picker.FilesRangeOverlapWithCompaction(
       input_files, 5, Compaction::kInvalidLevel));
 
-  std::unique_ptr<Compaction> comp2(universal_compaction_picker.CompactFiles(
-      comp_options, input_files, 5, vstorage_.get(), mutable_cf_options_,
-      mutable_db_options_, 0));
+  std::unique_ptr<Compaction> comp2(
+      universal_compaction_picker.PickCompactionForCompactFiles(
+          comp_options, input_files, 5, vstorage_.get(), mutable_cf_options_,
+          mutable_db_options_, 0));
   ASSERT_TRUE(comp2);
   ASSERT_EQ(Compaction::kInvalidLevel, comp2->GetProximalLevel());
 }
@@ -4290,9 +4300,10 @@ TEST_P(PerKeyPlacementCompactionPickerTest,
   ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
       &input_files, &input_set, vstorage_.get(), comp_options));
 
-  std::unique_ptr<Compaction> comp1(universal_compaction_picker.CompactFiles(
-      comp_options, input_files, 5, vstorage_.get(), mutable_cf_options_,
-      mutable_db_options_, 0));
+  std::unique_ptr<Compaction> comp1(
+      universal_compaction_picker.PickCompactionForCompactFiles(
+          comp_options, input_files, 5, vstorage_.get(), mutable_cf_options_,
+          mutable_db_options_, 0));
 
   ASSERT_TRUE(comp1);
   ASSERT_EQ(comp1->GetProximalLevel(), Compaction::kInvalidLevel);
@@ -4310,9 +4321,10 @@ TEST_P(PerKeyPlacementCompactionPickerTest,
                     vstorage_.get(), mutable_cf_options_, ioptions_, 6, 6)));
 
   if (!enable_per_key_placement_) {
-    std::unique_ptr<Compaction> comp2(universal_compaction_picker.CompactFiles(
-        comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_,
-        mutable_db_options_, 0));
+    std::unique_ptr<Compaction> comp2(
+        universal_compaction_picker.PickCompactionForCompactFiles(
+            comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_,
+            mutable_db_options_, 0));
     ASSERT_TRUE(comp2);
     ASSERT_EQ(Compaction::kInvalidLevel, comp2->GetProximalLevel());
   }
@@ -4350,9 +4362,10 @@ TEST_P(PerKeyPlacementCompactionPickerTest,
   ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
       &input_files, &input_set, vstorage_.get(), comp_options));
 
-  std::unique_ptr<Compaction> comp1(universal_compaction_picker.CompactFiles(
-      comp_options, input_files, 5, vstorage_.get(), mutable_cf_options_,
-      mutable_db_options_, 0));
+  std::unique_ptr<Compaction> comp1(
+      universal_compaction_picker.PickCompactionForCompactFiles(
+          comp_options, input_files, 5, vstorage_.get(), mutable_cf_options_,
+          mutable_db_options_, 0));
 
   ASSERT_TRUE(comp1);
   ASSERT_EQ(comp1->GetProximalLevel(), Compaction::kInvalidLevel);
@@ -4370,9 +4383,10 @@ TEST_P(PerKeyPlacementCompactionPickerTest,
                                         ioptions_, 6, 6)));
 
   // 2 compactions can be run in parallel
-  std::unique_ptr<Compaction> comp2(universal_compaction_picker.CompactFiles(
-      comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_,
-      mutable_db_options_, 0));
+  std::unique_ptr<Compaction> comp2(
+      universal_compaction_picker.PickCompactionForCompactFiles(
+          comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_,
+          mutable_db_options_, 0));
   ASSERT_TRUE(comp2);
   if (enable_per_key_placement_) {
     ASSERT_NE(Compaction::kInvalidLevel, comp2->GetProximalLevel());
diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc
index 94b10f2c3573..a2aa4b440797 100644
--- a/db/db_impl/db_impl_compaction_flush.cc
+++ b/db/db_impl/db_impl_compaction_flush.cc
@@ -1500,7 +1500,7 @@ Status DBImpl::CompactFilesImpl(
 
   std::unique_ptr<Compaction> c;
   assert(cfd->compaction_picker());
-  c.reset(cfd->compaction_picker()->CompactFiles(
+  c.reset(cfd->compaction_picker()->PickCompactionForCompactFiles(
       compact_options, input_files, output_level, version->storage_info(),
       cfd->GetLatestMutableCFOptions(), mutable_db_options_, output_path_id));
   // we already sanitized the set of input files and checked for conflicts
diff --git a/db/db_impl/db_impl_secondary.cc b/db/db_impl/db_impl_secondary.cc
index 9f220a05e342..e775490157e7 100644
--- a/db/db_impl/db_impl_secondary.cc
+++ b/db/db_impl/db_impl_secondary.cc
@@ -879,7 +879,7 @@ Status DBImplSecondary::CompactWithoutInstallation(
   // input instead of recreating it in the remote worker
   std::unique_ptr<Compaction> c;
   assert(cfd->compaction_picker());
-  c.reset(cfd->compaction_picker()->CompactFiles(
+  c.reset(cfd->compaction_picker()->PickCompactionForCompactFiles(
       comp_options, input_files, input.output_level, vstorage,
       cfd->GetLatestMutableCFOptions(), mutable_db_options_, 0,
       /*earliest_snapshot=*/job_context.snapshot_seqs.empty()

From d0a412d962a59ba6ec78f6a0e06bbac28b81b4d1 Mon Sep 17 00:00:00 2001
From: Hui Xiao <huixiao@fb.com>
Date: Tue, 5 Aug 2025 17:51:54 -0700
Subject: [PATCH 215/500] Disable
 RoundRobinSubcompactionsAgainstResources.SubcompactionsUsingResources
 (#13839)

Summary:
**Context/Summary:**

The `RoundRobinSubcompactionsAgainstResources` test, specifically the `SubcompactionsUsingResources` case, is now disabled. This decision was made because the test's reliability depends on the absence of any concurrent compactions other than the round-robin compaction. Addressing this issue while maintaining the test's focus on resource reservation requires a deeper investigation, which is currently beyond my available bandwidth. Given the increased frequency of test failures, it has been temporarily disabled to prevent further disruptions.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13839

Test Plan: - Should be no test failure from RoundRobinSubcompactionsAgainstResources.SubcompactionsUsingResources anymore.

Reviewed By: cbi42

Differential Revision: D79686366

Pulled By: hx235

fbshipit-source-id: 3a226cfd2b67cabc6c585ea567e2b0c25aa5f345
---
 db/db_compaction_test.cc | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/db/db_compaction_test.cc b/db/db_compaction_test.cc
index 6614edba92bc..994b0bc56954 100644
--- a/db/db_compaction_test.cc
+++ b/db/db_compaction_test.cc
@@ -6748,7 +6748,11 @@ INSTANTIATE_TEST_CASE_P(RoundRobinSubcompactionsAgainstPressureToken,
                         RoundRobinSubcompactionsAgainstPressureToken,
                         testing::Bool());
 
-TEST_P(RoundRobinSubcompactionsAgainstResources, SubcompactionsUsingResources) {
+// FIXME: the test is flaky and failing the assertion
+// ASSERT_EQ(actual_reserved_threads, expected_reserved_threads);
+// It's likely a test set up issue, fix if we are to use RoubdRobin compaction.
+TEST_P(RoundRobinSubcompactionsAgainstResources,
+       DISABLED_SubcompactionsUsingResources) {
   const int kKeysPerBuffer = 200;
   Options options = CurrentOptions();
   options.num_levels = 4;

From 3bd7d968e1586799bbfcb3aeeba1bc0c0fa1d895 Mon Sep 17 00:00:00 2001
From: Changyu Bi <changyubi@meta.com>
Date: Tue, 5 Aug 2025 23:19:09 -0700
Subject: [PATCH 216/500] Introduce column family option
 `cf_allow_ingest_behind` (#13810)

Summary:
this option has the same functionality as DBOptions::allow_ingest_behind but allows the feature at per CF level. `DBOptions::allow_ingest_behind` is deprecated after this PR and users should use `cf_allow_ingest_behind` instead.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13810

Test Plan: updated some existing tests to use the new option.

Reviewed By: xingbowang

Differential Revision: D79191969

Pulled By: cbi42

fbshipit-source-id: 0da45f6be472ace6754ad15df93d45ac86313837
---
 db/column_family.cc                           |   3 +-
 db/column_family.h                            |   5 +
 db/compaction/compaction_iterator.h           |   3 +-
 db/compaction/compaction_picker.cc            |   4 +-
 db/compaction/compaction_picker_test.cc       |  65 +++--
 db/compaction/compaction_picker_universal.cc  |  25 +-
 db/db_impl/db_impl.cc                         |  17 +-
 db/db_impl/db_impl.h                          |   3 +
 db/db_impl/db_impl_compaction_flush.cc        |   8 +-
 db/db_impl/db_impl_open.cc                    |   2 +-
 db/db_universal_compaction_test.cc            | 105 ++++---
 db/external_sst_file_basic_test.cc            |   9 +-
 db/external_sst_file_ingestion_job.cc         |   6 +-
 db/external_sst_file_ingestion_job.h          |   2 +-
 db/external_sst_file_test.cc                  | 259 +++++++++---------
 db/version_edit.h                             |   2 +-
 db/version_set.cc                             |  11 +-
 include/rocksdb/advanced_options.h            |  29 +-
 include/rocksdb/db.h                          |   2 +-
 include/rocksdb/options.h                     |  22 +-
 options/cf_options.cc                         |   7 +-
 options/cf_options.h                          |   2 +
 options/options.cc                            |   2 +
 options/options_helper.cc                     |   1 +
 options/options_settable_test.cc              |   3 +-
 .../new_features/cf-ingest-behind.md          |   1 +
 26 files changed, 348 insertions(+), 250 deletions(-)
 create mode 100644 unreleased_history/new_features/cf-ingest-behind.md

diff --git a/db/column_family.cc b/db/column_family.cc
index 100e13050bff..406fd09767ac 100644
--- a/db/column_family.cc
+++ b/db/column_family.cc
@@ -280,7 +280,8 @@ ColumnFamilyOptions SanitizeCfOptions(const ImmutableDBOptions& db_options,
   }
 
   if (result.compaction_style == kCompactionStyleUniversal &&
-      db_options.allow_ingest_behind && result.num_levels < 3) {
+      (db_options.allow_ingest_behind || result.cf_allow_ingest_behind) &&
+      result.num_levels < 3) {
     result.num_levels = 3;
   }
 
diff --git a/db/column_family.h b/db/column_family.h
index 1b048dd9b4d4..60b3f15fa6c0 100644
--- a/db/column_family.h
+++ b/db/column_family.h
@@ -600,6 +600,11 @@ class ColumnFamilyData {
     return (mem_->IsEmpty() ? 0 : 1) + imm_.NumNotFlushed();
   }
 
+  // thread-safe, DB mutex not needed.
+  bool AllowIngestBehind() const {
+    return ioptions_.cf_allow_ingest_behind || ioptions_.allow_ingest_behind;
+  }
+
  private:
   friend class ColumnFamilySet;
   ColumnFamilyData(
diff --git a/db/compaction/compaction_iterator.h b/db/compaction/compaction_iterator.h
index 5293d647b3d9..92254a18bc56 100644
--- a/db/compaction/compaction_iterator.h
+++ b/db/compaction/compaction_iterator.h
@@ -145,7 +145,8 @@ class CompactionIterator {
     }
 
     bool allow_ingest_behind() const override {
-      return compaction_->immutable_options().allow_ingest_behind;
+      return compaction_->immutable_options().cf_allow_ingest_behind ||
+             compaction_->immutable_options().allow_ingest_behind;
     }
 
     bool allow_mmap_reads() const override {
diff --git a/db/compaction/compaction_picker.cc b/db/compaction/compaction_picker.cc
index 9b40c05c1fde..da95425eb813 100644
--- a/db/compaction/compaction_picker.cc
+++ b/db/compaction/compaction_picker.cc
@@ -619,8 +619,8 @@ Compaction* CompactionPicker::PickCompactionForCompactRange(
     // Universal compaction with more than one level always compacts all the
     // files together to the last level.
     assert(vstorage->num_levels() > 1);
-    int max_output_level =
-        vstorage->MaxOutputLevel(ioptions_.allow_ingest_behind);
+    int max_output_level = vstorage->MaxOutputLevel(
+        ioptions_.cf_allow_ingest_behind || ioptions_.allow_ingest_behind);
     // DBImpl::CompactRange() set output level to be the last level
     assert(output_level == max_output_level);
     // DBImpl::RunManualCompaction will make full range for universal compaction
diff --git a/db/compaction/compaction_picker_test.cc b/db/compaction/compaction_picker_test.cc
index 1f805cc5d652..631295fbe851 100644
--- a/db/compaction/compaction_picker_test.cc
+++ b/db/compaction/compaction_picker_test.cc
@@ -544,41 +544,48 @@ TEST_F(CompactionPickerTest, NeedsCompactionUniversal) {
 }
 
 TEST_F(CompactionPickerTest, CompactionUniversalIngestBehindReservedLevel) {
-  const uint64_t kFileSize = 100000;
-  NewVersionStorage(3 /* num_levels */, kCompactionStyleUniversal);
-  ioptions_.allow_ingest_behind = true;
-  ioptions_.num_levels = 3;
-  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
-  UpdateVersionStorageInfo();
-  // must return false when there's no files.
-  ASSERT_EQ(universal_compaction_picker.NeedsCompaction(vstorage_.get()),
-            false);
+  for (bool cf_option : {false, true}) {
+    SCOPED_TRACE("cf_option = " + std::to_string(cf_option));
+    const uint64_t kFileSize = 100000;
+    NewVersionStorage(3 /* num_levels */, kCompactionStyleUniversal);
+    if (cf_option) {
+      ioptions_.cf_allow_ingest_behind = true;
+    } else {
+      ioptions_.allow_ingest_behind = true;
+    }
+    ioptions_.num_levels = 3;
+    UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+    UpdateVersionStorageInfo();
+    // must return false when there's no files.
+    ASSERT_EQ(universal_compaction_picker.NeedsCompaction(vstorage_.get()),
+              false);
 
-  NewVersionStorage(3, kCompactionStyleUniversal);
+    NewVersionStorage(3, kCompactionStyleUniversal);
 
-  Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
-  Add(0, 2U, "201", "250", kFileSize, 0, 401, 450);
-  Add(0, 4U, "260", "300", kFileSize, 0, 260, 300);
-  Add(1, 5U, "100", "151", kFileSize, 0, 200, 251);
-  Add(1, 3U, "301", "350", kFileSize, 0, 101, 150);
-  Add(2, 6U, "120", "200", kFileSize, 0, 20, 100);
+    Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+    Add(0, 2U, "201", "250", kFileSize, 0, 401, 450);
+    Add(0, 4U, "260", "300", kFileSize, 0, 260, 300);
+    Add(1, 5U, "100", "151", kFileSize, 0, 200, 251);
+    Add(1, 3U, "301", "350", kFileSize, 0, 101, 150);
+    Add(2, 6U, "120", "200", kFileSize, 0, 20, 100);
 
-  UpdateVersionStorageInfo();
+    UpdateVersionStorageInfo();
 
-  std::unique_ptr<Compaction> compaction(
-      universal_compaction_picker.PickCompaction(
-          cf_name_, mutable_cf_options_, mutable_db_options_,
-          /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-          vstorage_.get(), &log_buffer_));
+    std::unique_ptr<Compaction> compaction(
+        universal_compaction_picker.PickCompaction(
+            cf_name_, mutable_cf_options_, mutable_db_options_,
+            /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+            vstorage_.get(), &log_buffer_));
 
-  // output level should be the one above the bottom-most
-  ASSERT_EQ(1, compaction->output_level());
+    // output level should be the one above the bottom-most
+    ASSERT_EQ(1, compaction->output_level());
 
-  // input should not include the reserved level
-  const std::vector<CompactionInputFiles>* inputs = compaction->inputs();
-  for (const auto& compaction_input : *inputs) {
-    if (!compaction_input.empty()) {
-      ASSERT_LT(compaction_input.level, 2);
+    // input should not include the reserved level
+    const std::vector<CompactionInputFiles>* inputs = compaction->inputs();
+    for (const auto& compaction_input : *inputs) {
+      if (!compaction_input.empty()) {
+        ASSERT_LT(compaction_input.level, 2);
+      }
     }
   }
 }
diff --git a/db/compaction/compaction_picker_universal.cc b/db/compaction/compaction_picker_universal.cc
index 3d76d53a0dec..d03c9cfd0876 100644
--- a/db/compaction/compaction_picker_universal.cc
+++ b/db/compaction/compaction_picker_universal.cc
@@ -48,7 +48,9 @@ class UniversalCompactionBuilder {
         vstorage_(vstorage),
         picker_(picker),
         log_buffer_(log_buffer),
-        require_max_output_level_(require_max_output_level) {
+        require_max_output_level_(require_max_output_level),
+        allow_ingest_behind_(ioptions.cf_allow_ingest_behind ||
+                             ioptions.allow_ingest_behind) {
     assert(icmp_);
     const auto* ucmp = icmp_->user_comparator();
     assert(ucmp);
@@ -422,8 +424,7 @@ class UniversalCompactionBuilder {
   bool MeetsOutputLevelRequirements(int output_level) const {
     return !require_max_output_level_ ||
            Compaction::OutputToNonZeroMaxOutputLevel(
-               output_level,
-               vstorage_->MaxOutputLevel(ioptions_.allow_ingest_behind));
+               output_level, vstorage_->MaxOutputLevel(allow_ingest_behind_));
   }
 
   const ImmutableOptions& ioptions_;
@@ -437,7 +438,6 @@ class UniversalCompactionBuilder {
   VersionStorageInfo* vstorage_;
   UniversalCompactionPicker* picker_;
   LogBuffer* log_buffer_;
-  bool require_max_output_level_;
   // Optional earliest snapshot at time of compaction picking. This is only
   // provided if the column family doesn't enable user-defined timestamps.
   // And this information is only passed to `Compaction` picked by deletion
@@ -448,6 +448,8 @@ class UniversalCompactionBuilder {
   // marked for compaction. This is only populated when snapshot info is
   // populated.
   std::map<uint64_t, size_t> file_marked_for_compaction_to_sorted_run_index_;
+  bool require_max_output_level_;
+  bool allow_ingest_behind_;
 
   std::vector<UniversalCompactionBuilder::SortedRun> CalculateSortedRuns(
       const VersionStorageInfo& vstorage, int last_level,
@@ -733,8 +735,7 @@ bool UniversalCompactionBuilder::ShouldSkipMarkedFile(
 Compaction* UniversalCompactionBuilder::PickCompaction() {
   const int kLevel0 = 0;
   score_ = vstorage_->CompactionScore(kLevel0);
-  const int max_output_level =
-      vstorage_->MaxOutputLevel(ioptions_.allow_ingest_behind);
+  const int max_output_level = vstorage_->MaxOutputLevel(allow_ingest_behind_);
   const int file_num_compaction_trigger =
       mutable_cf_options_.level0_file_num_compaction_trigger;
   const unsigned int ratio =
@@ -781,8 +782,7 @@ Compaction* UniversalCompactionBuilder::PickCompaction() {
         "UniversalCompactionBuilder::PickCompaction:Return", nullptr);
     return nullptr;
   }
-  assert(c->output_level() <=
-         vstorage_->MaxOutputLevel(ioptions_.allow_ingest_behind));
+  assert(c->output_level() <= vstorage_->MaxOutputLevel(allow_ingest_behind_));
   assert(MeetsOutputLevelRequirements(c->output_level()));
 
   if (mutable_cf_options_.compaction_options_universal.allow_trivial_move ==
@@ -1024,8 +1024,7 @@ Compaction* UniversalCompactionBuilder::PickCompactionToReduceSortedRuns(
   int start_level = sorted_runs_[start_index].level;
   int output_level;
   // last level is reserved for the files ingested behind
-  int max_output_level =
-      vstorage_->MaxOutputLevel(ioptions_.allow_ingest_behind);
+  int max_output_level = vstorage_->MaxOutputLevel(allow_ingest_behind_);
   if (first_index_after == sorted_runs_.size()) {
     output_level = max_output_level;
   } else if (sorted_runs_[first_index_after].level == 0) {
@@ -1517,8 +1516,7 @@ Compaction* UniversalCompactionBuilder::PickDeleteTriggeredCompaction() {
       return nullptr;
     }
 
-    int max_output_level =
-        vstorage_->MaxOutputLevel(ioptions_.allow_ingest_behind);
+    int max_output_level = vstorage_->MaxOutputLevel(allow_ingest_behind_);
     // Pick the first non-empty level after the start_level
     for (output_level = start_level + 1; output_level <= max_output_level;
          output_level++) {
@@ -1621,8 +1619,7 @@ Compaction* UniversalCompactionBuilder::PickCompactionWithSortedRunRange(
   uint32_t path_id =
       GetPathId(ioptions_, mutable_cf_options_, estimated_total_size);
   int start_level = sorted_runs_[start_index].level;
-  int max_output_level =
-      vstorage_->MaxOutputLevel(ioptions_.allow_ingest_behind);
+  int max_output_level = vstorage_->MaxOutputLevel(allow_ingest_behind_);
   std::vector<CompactionInputFiles> inputs(max_output_level + 1);
   for (size_t i = 0; i < inputs.size(); ++i) {
     inputs[i].level = start_level + static_cast<int>(i);
diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc
index d9cf5b848f88..c9471122d7ce 100644
--- a/db/db_impl/db_impl.cc
+++ b/db/db_impl/db_impl.cc
@@ -4345,7 +4345,7 @@ void DBImpl::ReleaseSnapshot(const Snapshot* s) {
     CfdList cf_scheduled;
     if (oldest_snapshot > bottommost_files_mark_threshold_) {
       for (auto* cfd : *versions_->GetColumnFamilySet()) {
-        if (!cfd->ioptions().allow_ingest_behind) {
+        if (!cfd->AllowIngestBehind()) {
           cfd->current()->storage_info()->UpdateOldestSnapshot(
               oldest_snapshot, /*allow_ingest_behind=*/false);
           if (!cfd->current()
@@ -4365,8 +4365,7 @@ void DBImpl::ReleaseSnapshot(const Snapshot* s) {
       // inaccurate.
       SequenceNumber new_bottommost_files_mark_threshold = kMaxSequenceNumber;
       for (auto* cfd : *versions_->GetColumnFamilySet()) {
-        if (CfdListContains(cf_scheduled, cfd) ||
-            cfd->ioptions().allow_ingest_behind) {
+        if (CfdListContains(cf_scheduled, cfd) || cfd->AllowIngestBehind()) {
           continue;
         }
         new_bottommost_files_mark_threshold = std::min(
@@ -5761,10 +5760,6 @@ Status DBImpl::IngestExternalFiles(
   for (const auto& arg : args) {
     const IngestExternalFileOptions& ingest_opts = arg.options;
     if (ingest_opts.ingest_behind) {
-      if (!immutable_db_options_.allow_ingest_behind) {
-        return Status::InvalidArgument(
-            "can't ingest_behind file in DB with allow_ingest_behind=false");
-      }
       auto ucmp = arg.column_family->GetComparator();
       assert(ucmp);
       if (ucmp->timestamp_size() > 0) {
@@ -5772,6 +5767,14 @@ Status DBImpl::IngestExternalFiles(
             "Column family with user-defined "
             "timestamps enabled doesn't support ingest behind.");
       }
+
+      if (!static_cast<ColumnFamilyHandleImpl*>(arg.column_family)
+               ->cfd()
+               ->AllowIngestBehind()) {
+        return Status::InvalidArgument(
+            "Can't ingest_behind file in ColumnFamily %s with "
+            "cf_allow_ingest_behind=false");
+      }
     }
     if (arg.atomic_replace_range.has_value()) {
       if (ingest_opts.ingest_behind) {
diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h
index fce9421de19d..60f95ea27ade 100644
--- a/db/db_impl/db_impl.h
+++ b/db/db_impl/db_impl.h
@@ -1388,6 +1388,9 @@ class DBImpl : public DB {
   // logs_, cur_wal_number_. Refer to the definition of each variable below for
   // more description.
   //
+  // Protects access to most ColumnFamilyData methods, see more in comment for
+  // each method.
+  //
   // `mutex_` can be a hot lock in some workloads, so it deserves dedicated
   // cachelines.
   mutable CacheAlignedInstrumentedMutex mutex_;
diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc
index a2aa4b440797..75629a8a00ea 100644
--- a/db/db_impl/db_impl_compaction_flush.cc
+++ b/db/db_impl/db_impl_compaction_flush.cc
@@ -1111,8 +1111,7 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options,
       cfd->NumberLevels() > 1) {
     // Always compact all files together.
     final_output_level = cfd->NumberLevels() - 1;
-    // if bottom most level is reserved
-    if (immutable_db_options_.allow_ingest_behind) {
+    if (cfd->AllowIngestBehind()) {
       final_output_level--;
     }
     s = RunManualCompaction(cfd, ColumnFamilyData::kCompactAllLevels,
@@ -1460,7 +1459,7 @@ Status DBImpl::CompactFilesImpl(
     }
   }
 
-  if (cfd->ioptions().allow_ingest_behind &&
+  if (cfd->AllowIngestBehind() &&
       output_level >= cfd->ioptions().num_levels - 1) {
     return Status::InvalidArgument(
         "Exceed the maximum output level defined by "
@@ -4155,6 +4154,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
                      ->current()
                      ->storage_info()
                      ->MaxOutputLevel(
+                         c->immutable_options().cf_allow_ingest_behind ||
                          immutable_db_options_.allow_ingest_behind)) &&
              env_->GetBackgroundThreads(Env::Priority::BOTTOM) > 0) {
     assert(thread_pri == Env::Priority::LOW);
@@ -4660,7 +4660,7 @@ void DBImpl::InstallSuperVersionAndScheduleWork(
   bottommost_files_mark_threshold_ = kMaxSequenceNumber;
   standalone_range_deletion_files_mark_threshold_ = kMaxSequenceNumber;
   for (auto* my_cfd : *versions_->GetColumnFamilySet()) {
-    if (!my_cfd->ioptions().allow_ingest_behind) {
+    if (!my_cfd->AllowIngestBehind()) {
       bottommost_files_mark_threshold_ = std::min(
           bottommost_files_mark_threshold_,
           my_cfd->current()->storage_info()->bottommost_files_mark_threshold());
diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc
index eaa806283ce9..2dbc2f73a818 100644
--- a/db/db_impl/db_impl_open.cc
+++ b/db/db_impl/db_impl_open.cc
@@ -599,7 +599,7 @@ Status DBImpl::Recover(
         // allow_ingest_behind does not support Level Compaction,
         // and per_key_placement can have infinite compaction loop for Level
         // Compaction. Adjust to_level here just to be safe.
-        if (cfd->ioptions().allow_ingest_behind ||
+        if (cfd->AllowIngestBehind() ||
             moptions.preclude_last_level_data_seconds > 0) {
           to_level -= 1;
         }
diff --git a/db/db_universal_compaction_test.cc b/db/db_universal_compaction_test.cc
index e7fc69d6fbbe..465f5d0c9632 100644
--- a/db/db_universal_compaction_test.cc
+++ b/db/db_universal_compaction_test.cc
@@ -2106,46 +2106,79 @@ TEST_F(DBTestUniversalCompaction2, OverlappingL0) {
 }
 
 TEST_F(DBTestUniversalCompaction2, IngestBehind) {
-  const int kNumKeys = 3000;
-  const int kWindowSize = 100;
-  const int kNumDelsTrigger = 90;
-
-  Options opts = CurrentOptions();
-  opts.table_properties_collector_factories.emplace_back(
-      NewCompactOnDeletionCollectorFactory(kWindowSize, kNumDelsTrigger));
-  opts.compaction_style = kCompactionStyleUniversal;
-  opts.level0_file_num_compaction_trigger = 2;
-  opts.compression = kNoCompression;
-  opts.allow_ingest_behind = true;
-  opts.compaction_options_universal.size_ratio = 10;
-  opts.compaction_options_universal.min_merge_width = 2;
-  opts.compaction_options_universal.max_size_amplification_percent = 200;
-  Reopen(opts);
-
-  // add an L1 file to prevent tombstones from dropping due to obsolescence
-  // during flush
-  int i;
-  for (i = 0; i < 2000; ++i) {
-    ASSERT_OK(Put(Key(i), "val"));
-  }
-  ASSERT_OK(Flush());
-  //  MoveFilesToLevel(6);
-  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
-
-  for (i = 1999; i < kNumKeys; ++i) {
-    if (i >= kNumKeys - kWindowSize &&
-        i < kNumKeys - kWindowSize + kNumDelsTrigger) {
-      ASSERT_OK(Delete(Key(i)));
+  for (bool cf_option : {false, true}) {
+    SCOPED_TRACE("cf_option = " + std::to_string(cf_option));
+    const int kNumKeys = 3000;
+    const int kWindowSize = 100;
+    const int kNumDelsTrigger = 90;
+
+    Options opts = CurrentOptions();
+    opts.table_properties_collector_factories.emplace_back(
+        NewCompactOnDeletionCollectorFactory(kWindowSize, kNumDelsTrigger));
+    opts.compaction_style = kCompactionStyleUniversal;
+    opts.level0_file_num_compaction_trigger = 2;
+    opts.compression = kNoCompression;
+    if (cf_option) {
+      opts.cf_allow_ingest_behind = true;
     } else {
+      opts.allow_ingest_behind = true;
+    }
+    opts.compaction_options_universal.size_ratio = 10;
+    opts.compaction_options_universal.min_merge_width = 2;
+    opts.compaction_options_universal.max_size_amplification_percent = 200;
+    Reopen(opts);
+
+    // add an L1 file to prevent tombstones from dropping due to obsolescence
+    // during flush
+    int i;
+    for (i = 0; i < 2000; ++i) {
       ASSERT_OK(Put(Key(i), "val"));
     }
-  }
-  ASSERT_OK(Flush());
+    ASSERT_OK(Flush());
+    //  MoveFilesToLevel(6);
+    ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+    for (i = 1999; i < kNumKeys; ++i) {
+      if (i >= kNumKeys - kWindowSize &&
+          i < kNumKeys - kWindowSize + kNumDelsTrigger) {
+        ASSERT_OK(Delete(Key(i)));
+      } else {
+        ASSERT_OK(Put(Key(i), "val"));
+      }
+    }
+    ASSERT_OK(Flush());
 
-  ASSERT_OK(dbfull()->TEST_WaitForCompact());
-  ASSERT_EQ(0, NumTableFilesAtLevel(0));
-  ASSERT_EQ(0, NumTableFilesAtLevel(6));
-  ASSERT_GT(NumTableFilesAtLevel(5), 0);
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    ASSERT_EQ(0, NumTableFilesAtLevel(0));
+    ASSERT_EQ(0, NumTableFilesAtLevel(6));
+    ASSERT_GT(NumTableFilesAtLevel(5), 0);
+
+    if (cf_option) {
+      // Test that another CF does not allow ingest behind
+      ColumnFamilyHandle* new_cfh;
+      Options new_cf_option;
+      new_cf_option.compaction_style = kCompactionStyleUniversal;
+      new_cf_option.num_levels = 7;
+      // CreateColumnFamilies({"new_cf"}, new_cf_option);
+      ASSERT_OK(db_->CreateColumnFamily(new_cf_option, "new_cf", &new_cfh));
+      // handles_.push_back(new_cfh);
+      for (i = 0; i < 10; ++i) {
+        // ASSERT_OK(Put(1, Key(i), "val"));
+        ASSERT_OK(db_->Put(WriteOptions(), new_cfh, Key(i), "val"));
+      }
+      ASSERT_OK(
+          db_->CompactRange(CompactRangeOptions(), new_cfh, nullptr, nullptr));
+      // This CF can use the last leve.
+      std::string property;
+      EXPECT_TRUE(db_->GetProperty(
+          new_cfh, "rocksdb.num-files-at-level" + std::to_string(6),
+          &property));
+      ASSERT_EQ(1, atoi(property.c_str()));
+
+      ASSERT_OK(db_->DropColumnFamily(new_cfh));
+      ASSERT_OK(db_->DestroyColumnFamilyHandle(new_cfh));
+    }
+  }
 }
 
 TEST_F(DBTestUniversalCompaction2, PeriodicCompactionDefault) {
diff --git a/db/external_sst_file_basic_test.cc b/db/external_sst_file_basic_test.cc
index 6a1986cc5398..3d91c62a62fd 100644
--- a/db/external_sst_file_basic_test.cc
+++ b/db/external_sst_file_basic_test.cc
@@ -2567,7 +2567,14 @@ TEST_F(ExternalSSTFileBasicTest, IngestWithTemperature) {
     options.default_write_temperature = Temperature::kHot;
     SstFileWriter sst_file_writer(EnvOptions(), options);
     options.level0_file_num_compaction_trigger = 2;
-    options.allow_ingest_behind = (mode == "ingest_behind");
+    bool cf_option = Random::GetTLSInstance()->OneIn(2);
+    SCOPED_TRACE(std::string("Use ") + (cf_option ? "CF" : "DB") +
+                 " option for ingest behind");
+    if (cf_option) {
+      options.cf_allow_ingest_behind = (mode == "ingest_behind");
+    } else {
+      options.allow_ingest_behind = (mode == "ingest_behind");
+    }
     Reopen(options);
     Defer destroyer([&]() { Destroy(options); });
 
diff --git a/db/external_sst_file_ingestion_job.cc b/db/external_sst_file_ingestion_job.cc
index 0807f40a8f4f..a1963b720937 100644
--- a/db/external_sst_file_ingestion_job.cc
+++ b/db/external_sst_file_ingestion_job.cc
@@ -1277,13 +1277,13 @@ Status ExternalSstFileIngestionJob::CheckLevelForIngestedBehindFile(
         "at the last level!");
   }
 
-  // Second, check if despite allow_ingest_behind=true we still have 0 seqnums
-  // at some upper level
+  // Second, check if despite cf_allow_ingest_behind=true we still have 0
+  // seqnums at some upper level
   for (int lvl = 0; lvl < cfd_->NumberLevels() - 1; lvl++) {
     for (auto file : vstorage->LevelFiles(lvl)) {
       if (file->fd.smallest_seqno == 0) {
         return Status::InvalidArgument(
-            "Can't ingest_behind file as despite allow_ingest_behind=true "
+            "Can't ingest_behind file as despite cf_allow_ingest_behind=true "
             "there are files with 0 seqno in database at upper levels!");
       }
     }
diff --git a/db/external_sst_file_ingestion_job.h b/db/external_sst_file_ingestion_job.h
index 628eb36848b8..d97fac31e6e0 100644
--- a/db/external_sst_file_ingestion_job.h
+++ b/db/external_sst_file_ingestion_job.h
@@ -349,7 +349,7 @@ class ExternalSstFileIngestionJob {
       std::optional<int> prev_batch_uppermost_level);
 
   // File that we want to ingest behind always goes to the lowest level;
-  // we just check that it fits in the level, that DB allows ingest_behind,
+  // we just check that it fits in the level, that the CF allows ingest_behind,
   // and that we don't have 0 seqnums at the upper levels.
   // REQUIRES: Mutex held
   Status CheckLevelForIngestedBehindFile(IngestedFileInfo* file_to_ingest);
diff --git a/db/external_sst_file_test.cc b/db/external_sst_file_test.cc
index 64965643c729..55befde4cab7 100644
--- a/db/external_sst_file_test.cc
+++ b/db/external_sst_file_test.cc
@@ -2417,130 +2417,130 @@ TEST_F(ExternalSSTFileTest, SnapshotInconsistencyBug) {
 }
 
 TEST_P(ExternalSSTFileTest, IngestBehind) {
-  Options options = CurrentOptions();
-  options.compaction_style = kCompactionStyleUniversal;
-  options.num_levels = 3;
-  options.disable_auto_compactions = false;
-  DestroyAndReopen(options);
-  std::vector<std::pair<std::string, std::string>> file_data;
-  std::map<std::string, std::string> true_data;
-
-  // Insert 100 -> 200 into the memtable
-  for (int i = 100; i <= 200; i++) {
-    ASSERT_OK(Put(Key(i), "memtable"));
-  }
-
-  // Insert 0 -> 20 using IngestExternalFile
-  file_data.clear();
-  for (int i = 0; i <= 20; i++) {
-    file_data.emplace_back(Key(i), "ingest_behind");
-    true_data[Key(i)] = "ingest_behind";
-  }
+  for (bool cf_option : {false, true}) {
+    SCOPED_TRACE("cf_option = " + std::to_string(cf_option));
+    Options options = CurrentOptions();
+    options.compaction_style = kCompactionStyleUniversal;
+    options.num_levels = 3;
+    options.disable_auto_compactions = false;
+    DestroyAndReopen(options);
+    std::vector<std::pair<std::string, std::string>> file_data;
+    std::map<std::string, std::string> true_data;
 
-  bool allow_global_seqno = true;
-  bool ingest_behind = true;
-  bool write_global_seqno = std::get<0>(GetParam());
-  bool verify_checksums_before_ingest = std::get<1>(GetParam());
+    // Insert 100 -> 200 into the memtable
+    for (int i = 100; i <= 200; i++) {
+      ASSERT_OK(Put(Key(i), "memtable"));
+    }
 
-  // Can't ingest behind since allow_ingest_behind isn't set to true
-  ASSERT_NOK(GenerateAndAddExternalFile(
-      options, file_data, -1, allow_global_seqno, write_global_seqno,
-      verify_checksums_before_ingest, ingest_behind, false /*sort_data*/,
-      &true_data));
+    // Insert 100 -> 200 using IngestExternalFile
+    file_data.clear();
+    for (int i = 0; i <= 20; i++) {
+      file_data.emplace_back(Key(i), "ingest_behind");
+      true_data[Key(i)] = "ingest_behind";
+    }
 
-  options.allow_ingest_behind = true;
-  // check that we still can open the DB, as num_levels should be
-  // sanitized to 3
-  options.num_levels = 2;
-  DestroyAndReopen(options);
+    bool allow_global_seqno = true;
+    bool ingest_behind = true;
+    bool write_global_seqno = std::get<0>(GetParam());
+    bool verify_checksums_before_ingest = std::get<1>(GetParam());
 
-  options.num_levels = 3;
-  DestroyAndReopen(options);
-  true_data.clear();
-  // Insert 100 -> 200 into the memtable
-  for (int i = 100; i <= 200; i++) {
-    ASSERT_OK(Put(Key(i), "memtable"));
-    true_data[Key(i)] = "memtable";
-  }
+    // Can't ingest behind since allow_ingest_behind isn't set to true
+    ASSERT_NOK(GenerateAndAddExternalFile(
+        options, file_data, -1, allow_global_seqno, write_global_seqno,
+        verify_checksums_before_ingest, ingest_behind, false /*sort_data*/,
+        &true_data));
 
-  // Test that tombstones for Key(7) and Key(8) are not dropped during
-  // compaction. Will verify below that after ingesting Puts for Key(7) and
-  // Key(8), they are covered by these two tombstones.
-  ASSERT_OK(Delete(Key(7)));
-  ASSERT_OK(SingleDelete(Key(8)));
-  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
-  // Universal picker should go at second from the bottom level
-  ASSERT_EQ("0,1", FilesPerLevel());
-
-  // Test that SingleDelte overwritten by Put is not dropped.
-  // From old to new, we issue SD, PUT, CompactRange, SD, CompactRange. The
-  // first CompactRange() should not drop the overwritten SD. The second
-  // CompactRange() will drop the new SD with PUT. If the older SD was dropped,
-  // the ingested behind data will be incorrectly visible below.
-  ASSERT_OK(SingleDelete(Key(1)));
-  ASSERT_OK(Put(Key(1), "overwrite_sd"));
-  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
-  ASSERT_OK(SingleDelete(Key(1)));
-  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+    if (cf_option) {
+      options.cf_allow_ingest_behind = true;
+    } else {
+      options.allow_ingest_behind = true;
+    }
+    // check that we still can open the DB, as num_levels should be
+    // sanitized to 3
+    options.num_levels = 2;
+    DestroyAndReopen(options);
 
-  ASSERT_OK(GenerateAndAddExternalFile(
-      options, file_data, -1, allow_global_seqno, write_global_seqno,
-      verify_checksums_before_ingest, true /*ingest_behind*/,
-      false /*sort_data*/, &true_data));
-  // adjust expected data for tombtones
-  true_data.erase(Key(7));
-  true_data.erase(Key(8));
-  true_data.erase(Key(1));
-  std::unordered_set<std::string> not_found_set;
-  // Tombstones will be verified in VerifyDBFromMap() below.
-  not_found_set.insert(Key(7));
-  not_found_set.insert(Key(8));
-  not_found_set.insert(Key(1));
-
-  ASSERT_EQ("0,1,1", FilesPerLevel());
-  // this time ingest should fail as the file doesn't fit to the bottom level
-  ASSERT_NOK(GenerateAndAddExternalFile(
-      options, file_data, -1, allow_global_seqno, write_global_seqno,
-      verify_checksums_before_ingest, true /*ingest_behind*/,
-      false /*sort_data*/, &true_data));
-  ASSERT_EQ("0,1,1", FilesPerLevel());
-  std::vector<std::vector<FileMetaData>> level_to_files;
-  dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &level_to_files);
-  uint64_t ingested_file_number = level_to_files[2][0].fd.GetNumber();
-  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
-  // Last level should not be compacted
-  ASSERT_EQ("0,1,1", FilesPerLevel());
-  dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &level_to_files);
-  ASSERT_EQ(ingested_file_number, level_to_files[2][0].fd.GetNumber());
-  size_t kcnt = 0;
-  VerifyDBFromMap(true_data, &kcnt, false, nullptr, nullptr, &not_found_set);
+    options.num_levels = 3;
+    DestroyAndReopen(options);
+    true_data.clear();
+    // Insert 100 -> 200 into the memtable
+    for (int i = 100; i <= 200; i++) {
+      ASSERT_OK(Put(Key(i), "memtable"));
+      true_data[Key(i)] = "memtable";
+    }
+    ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+    // Universal picker should go at second from the bottom level
+    ASSERT_EQ("0,1", FilesPerLevel());
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, file_data, -1, allow_global_seqno, write_global_seqno,
+        verify_checksums_before_ingest, true /*ingest_behind*/,
+        false /*sort_data*/, &true_data));
+    ASSERT_EQ("0,1,1", FilesPerLevel());
+    // this time ingest should fail as the file doesn't fit to the bottom level
+    ASSERT_NOK(GenerateAndAddExternalFile(
+        options, file_data, -1, allow_global_seqno, write_global_seqno,
+        verify_checksums_before_ingest, true /*ingest_behind*/,
+        false /*sort_data*/, &true_data));
+    ASSERT_EQ("0,1,1", FilesPerLevel());
+    std::vector<std::vector<FileMetaData>> level_to_files;
+    dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(),
+                                    &level_to_files);
+    uint64_t ingested_file_number = level_to_files[2][0].fd.GetNumber();
+    ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+    // Last level should not be compacted
+    ASSERT_EQ("0,1,1", FilesPerLevel());
+    dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(),
+                                    &level_to_files);
+    ASSERT_EQ(ingested_file_number, level_to_files[2][0].fd.GetNumber());
+    size_t kcnt = 0;
+    VerifyDBFromMap(true_data, &kcnt, false);
 
-  // Auto-compaction should not include the last level.
-  // Trigger compaction if size amplification exceeds 110%.
-  options.compaction_options_universal.max_size_amplification_percent = 110;
-  options.level0_file_num_compaction_trigger = 4;
-  ASSERT_OK(TryReopen(options));
-  Random rnd(301);
-  for (int i = 0; i < 4; ++i) {
-    for (int j = 0; j < 10; j++) {
-      true_data[Key(j)] = rnd.RandomString(1000);
-      ASSERT_OK(Put(Key(j), true_data[Key(j)]));
+    // Auto-compaction should not include the last level.
+    // Trigger compaction if size amplification exceeds 110%.
+    options.compaction_options_universal.max_size_amplification_percent = 110;
+    options.level0_file_num_compaction_trigger = 4;
+    ASSERT_OK(TryReopen(options));
+    Random rnd(301);
+    for (int i = 0; i < 4; ++i) {
+      for (int j = 0; j < 10; j++) {
+        true_data[Key(j)] = rnd.RandomString(1000);
+        ASSERT_OK(Put(Key(j), true_data[Key(j)]));
+      }
+      ASSERT_OK(Flush());
     }
-    ASSERT_OK(Flush());
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(),
+                                    &level_to_files);
+    ASSERT_EQ(1, level_to_files[2].size());
+    ASSERT_EQ(ingested_file_number, level_to_files[2][0].fd.GetNumber());
+
+    // Turning off the option allows DB to compact ingested files.
+    if (cf_option) {
+      // Test that another CF does not allow ingest behind
+      ColumnFamilyHandle* new_cfh;
+      Options new_cf_option;
+      ASSERT_OK(db_->CreateColumnFamily(new_cf_option, "new_cf", &new_cfh));
+      ASSERT_TRUE(GenerateAndAddExternalFile(
+                      new_cf_option, file_data, -1, allow_global_seqno,
+                      write_global_seqno, verify_checksums_before_ingest,
+                      true /*ingest_behind*/, false /*sort_data*/, nullptr,
+                      /*cfh=*/new_cfh)
+                      .IsInvalidArgument());
+      ASSERT_OK(db_->DropColumnFamily(new_cfh));
+      ASSERT_OK(db_->DestroyColumnFamilyHandle(new_cfh));
+
+      options.cf_allow_ingest_behind = false;
+    } else {
+      options.allow_ingest_behind = false;
+    }
+    ASSERT_OK(TryReopen(options));
+    ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+    dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(),
+                                    &level_to_files);
+    ASSERT_EQ(1, level_to_files[2].size());
+    ASSERT_NE(ingested_file_number, level_to_files[2][0].fd.GetNumber());
+    VerifyDBFromMap(true_data, &kcnt, false);
   }
-  ASSERT_OK(dbfull()->TEST_WaitForCompact());
-  dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &level_to_files);
-  ASSERT_EQ(1, level_to_files[2].size());
-  ASSERT_EQ(ingested_file_number, level_to_files[2][0].fd.GetNumber());
-
-  // Turning off the option allows DB to compact ingested files.
-  options.allow_ingest_behind = false;
-  ASSERT_OK(TryReopen(options));
-  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
-  dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &level_to_files);
-  ASSERT_EQ(1, level_to_files[2].size());
-  ASSERT_NE(ingested_file_number, level_to_files[2][0].fd.GetNumber());
-  VerifyDBFromMap(true_data, &kcnt, false);
 }
 
 TEST_F(ExternalSSTFileTest, SkipBloomFilter) {
@@ -3542,19 +3542,26 @@ TEST_F(ExternalSSTFileWithTimestampTest, SanityCheck) {
   // overlapping key ranges.
   ASSERT_TRUE(IngestExternalUDTFile({file1, file2}).IsNotSupported());
 
-  options.allow_ingest_behind = true;
-  DestroyAndReopen(options);
-  IngestExternalFileOptions opts;
+  for (bool cf_option : {false, true}) {
+    SCOPED_TRACE("cf_option = " + std::to_string(cf_option));
+    if (cf_option) {
+      options.cf_allow_ingest_behind = true;
+    } else {
+      options.allow_ingest_behind = true;
+    }
+    DestroyAndReopen(options);
+    IngestExternalFileOptions opts;
 
-  // TODO(yuzhangyu): support ingestion behind for user-defined timestamps?
-  // Ingesting external files with user-defined timestamps requires searching
-  // through the whole lsm tree to make sure there is no key range overlap with
-  // the db. Ingestion behind currently is doing a simply placing it at the
-  // bottom level step without a search, so we don't allow it either.
-  opts.ingest_behind = true;
-  ASSERT_TRUE(db_->IngestExternalFile({file1}, opts).IsNotSupported());
+    // TODO(yuzhangyu): support ingestion behind for user-defined timestamps?
+    // Ingesting external files with user-defined timestamps requires searching
+    // through the whole lsm tree to make sure there is no key range overlap
+    // with the db. Ingestion behind currently is doing a simply placing it at
+    // the bottom level step without a search, so we don't allow it either.
+    opts.ingest_behind = true;
+    ASSERT_TRUE(db_->IngestExternalFile({file1}, opts).IsNotSupported());
 
-  DestroyAndRecreateExternalSSTFilesDir();
+    DestroyAndRecreateExternalSSTFilesDir();
+  }
 }
 
 TEST_F(ExternalSSTFileWithTimestampTest, UDTSettingsCompatibilityCheck) {
diff --git a/db/version_edit.h b/db/version_edit.h
index 25b794fd3359..37175f1db3d4 100644
--- a/db/version_edit.h
+++ b/db/version_edit.h
@@ -110,7 +110,7 @@ constexpr uint64_t kUnknownOldestAncesterTime = 0;
 constexpr uint64_t kUnknownNewestKeyTime = 0;
 constexpr uint64_t kUnknownFileCreationTime = 0;
 constexpr uint64_t kUnknownEpochNumber = 0;
-// If `Options::allow_ingest_behind` is true, this epoch number
+// If `Options::cf_allow_ingest_behind` is true, this epoch number
 // will be dedicated to files ingested behind.
 constexpr uint64_t kReservedEpochNumberForFileIngestedBehind = 1;
 
diff --git a/db/version_set.cc b/db/version_set.cc
index 742198d44cd8..9bf3d35c0b41 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -3528,7 +3528,9 @@ void VersionStorageInfo::ComputeCompactionScore(
   // maintaining it to be over 1.0, we scale the original score by 10x
   // if it is larger than 1.0.
   const double kScoreScale = 10.0;
-  int max_output_level = MaxOutputLevel(immutable_options.allow_ingest_behind);
+  int max_output_level =
+      MaxOutputLevel(immutable_options.cf_allow_ingest_behind ||
+                     immutable_options.allow_ingest_behind);
   for (int level = 0; level <= MaxInputLevel(); level++) {
     double score;
     if (level == 0) {
@@ -3713,6 +3715,7 @@ void VersionStorageInfo::ComputeCompactionScore(
   }
   ComputeFilesMarkedForCompaction(max_output_level);
   ComputeBottommostFilesMarkedForCompaction(
+      immutable_options.cf_allow_ingest_behind ||
       immutable_options.allow_ingest_behind);
   ComputeExpiredTtlFiles(immutable_options, mutable_cf_options.ttl);
   ComputeFilesMarkedForPeriodicCompaction(
@@ -4710,8 +4713,7 @@ void VersionStorageInfo::RecoverEpochNumbers(ColumnFamilyData* cfd,
   if (restart_epoch) {
     cfd->ResetNextEpochNumber();
 
-    bool reserve_epoch_num_for_file_ingested_behind =
-        cfd->ioptions().allow_ingest_behind;
+    bool reserve_epoch_num_for_file_ingested_behind = cfd->AllowIngestBehind();
     if (reserve_epoch_num_for_file_ingested_behind) {
       uint64_t reserved_epoch_number = cfd->NewEpochNumber();
       assert(reserved_epoch_number ==
@@ -4719,7 +4721,8 @@ void VersionStorageInfo::RecoverEpochNumbers(ColumnFamilyData* cfd,
       ROCKS_LOG_INFO(cfd->ioptions().info_log.get(),
                      "[%s]CF has reserved epoch number %" PRIu64
                      " for files ingested "
-                     "behind since `Options::allow_ingest_behind` is true",
+                     "behind since `Options::allow_ingest_behind` or "
+                     "`Options::cf_allow_ingest_behind` is true",
                      cfd->GetName().c_str(), reserved_epoch_number);
     }
   }
diff --git a/include/rocksdb/advanced_options.h b/include/rocksdb/advanced_options.h
index 2f9b04699a4c..f78bb0c2c129 100644
--- a/include/rocksdb/advanced_options.h
+++ b/include/rocksdb/advanced_options.h
@@ -526,7 +526,7 @@ struct AdvancedColumnFamilyOptions {
   // By doing it, we give max_bytes_for_level_multiplier a priority against
   // max_bytes_for_level_base, for a more predictable LSM tree shape. It is
   // useful to limit worse case space amplification.
-  // If `allow_ingest_behind=true` or `preclude_last_level_data_seconds > 0`,
+  // If `cf_allow_ingest_behind=true` or `preclude_last_level_data_seconds > 0`,
   // then the last level is reserved, and we will start filling LSM from the
   // second last level.
   //
@@ -1146,6 +1146,33 @@ struct AdvancedColumnFamilyOptions {
   // Dynamically changeable through the SetOptions() API.
   uint32_t memtable_avg_op_scan_flush_trigger = 0;
 
+  // If either DBOptions::allow_ingest_behind or this option is set to true,
+  // this column family will prepare for ingesting files to the last level
+  // (IngestExternalFiles() with ingest_behind=true). Users should set only
+  // this option since DBOptions::allow_ingest_behind is deprecated.
+  //
+  // Specifically, preparing a column family for ingesting files to the last
+  // level has the following effects:
+  // 1) Disables some internal optimizations around SST file compression.
+  // 2) Reserves the last level for ingested files only.
+  // 3) Compaction will not include any file from the last level.
+  // 4) Compaction will preserve necessary tombstones that can apply on
+  // top of ingested files.
+  //
+  // Note that only Universal Compaction supports cf_allow_ingest_behind.
+  // `num_levels` should be >= 3 if this option is turned on.
+  //
+  // Note that this option needs to be set to true before any write to the CF.
+  // It's recommended to set the option to true since CF creation. Otherwise,
+  // ingestion with ingest_behind = true might fail. Once file ingestions are
+  // done, the option should be flipped to false. Flipping this option to false
+  // allows the CF to disable the behavior changes detailed above and resume
+  // more efficient operation.
+  //
+  // Default: false
+  // Immutable.
+  bool cf_allow_ingest_behind = false;
+
   // Create ColumnFamilyOptions with default values for all fields
   AdvancedColumnFamilyOptions();
   // Create ColumnFamilyOptions from Options
diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h
index 5313963e9693..4038ab5c3dc2 100644
--- a/include/rocksdb/db.h
+++ b/include/rocksdb/db.h
@@ -1984,7 +1984,7 @@ class DB {
   //     even if the file compression doesn't match the level compression
   // (3) If IngestExternalFileOptions->ingest_behind is set to true,
   //     we always ingest at the bottommost level, which should be reserved
-  //     for this purpose (see DBOPtions::allow_ingest_behind flag).
+  //     for this purpose (see ColumnFamilyOptions::cf_allow_ingest_behind).
   // (4) If IngestExternalFileOptions->fail_if_not_bottommost_level is set to
   //     true, then this method can return Status:TryAgain() indicating that
   //     the files cannot be ingested to the bottommost level, and it is the
diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index e3604fb5f62b..5a856f1f233c 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -1361,19 +1361,11 @@ struct DBOptions {
   // Dynamically changeable through SetDBOptions() API.
   bool avoid_flush_during_shutdown = false;
 
-  // Set this option to true during creation of database if you want
-  // to be able to ingest behind (call IngestExternalFile() skipping keys
-  // that already exist, rather than overwriting matching keys).
-  // Setting this option to true has the following effects:
-  // 1) Disable some internal optimizations around SST file compression.
-  // 2) Reserve the last level for ingested files only.
-  // 3) Compaction will not include any file from the last level.
-  // 4) Compaction will preserve necessary tombstones that can apply on
-  // top of ingested files.
-  // Note that only Universal Compaction supports allow_ingest_behind.
-  // `num_levels` should be >= 3 if this option is turned on.
-  // Note that if TimedPut was issued to a CF, ingest behind into that
-  // CF may fail.
+  // DEPRECATED: use ColumnFamilyOptions::cf_allow_ingest_behind instead.
+  // This option might be removed in a future release.
+  //
+  // See comment for `ColumnFamilyOptions::cf_allow_ingest_behind` for
+  // detail about the option's functionality and use cases.
   //
   // DEFAULT: false
   // Immutable.
@@ -2380,8 +2372,8 @@ struct IngestExternalFileOptions {
   // to be skipped rather than overwriting existing data under that key.
   // Use case: back-fill of some historical data in the database without
   // over-writing existing newer version of data.
-  // This option could only be used if the DB has been running
-  // with allow_ingest_behind=true since the dawn of time.
+  // This option could only be used if the CF has been running
+  // with cf_allow_ingest_behind=true since CF creation (or before any write).
   // All files will be ingested at the bottommost level with seqno=0.
   bool ingest_behind = false;
   // DEPRECATED - Set to true if you would like to write global_seqno to
diff --git a/options/cf_options.cc b/options/cf_options.cc
index 6d062089a066..14f14b7c7e10 100644
--- a/options/cf_options.cc
+++ b/options/cf_options.cc
@@ -893,6 +893,10 @@ static std::unordered_map<std::string, OptionTypeInfo>
          {offsetof(struct ImmutableCFOptions, persist_user_defined_timestamps),
           OptionType::kBoolean, OptionVerificationType::kNormal,
           OptionTypeFlags::kCompareLoose}},
+        {"cf_allow_ingest_behind",
+         {offsetof(struct ImmutableCFOptions, cf_allow_ingest_behind),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
 };
 
 const std::string OptionsHelper::kCFOptionsName = "ColumnFamilyOptions";
@@ -1032,7 +1036,8 @@ ImmutableCFOptions::ImmutableCFOptions(const ColumnFamilyOptions& cf_options)
       sst_partitioner_factory(cf_options.sst_partitioner_factory),
       blob_cache(cf_options.blob_cache),
       persist_user_defined_timestamps(
-          cf_options.persist_user_defined_timestamps) {}
+          cf_options.persist_user_defined_timestamps),
+      cf_allow_ingest_behind(cf_options.cf_allow_ingest_behind) {}
 
 ImmutableOptions::ImmutableOptions() : ImmutableOptions(Options()) {}
 
diff --git a/options/cf_options.h b/options/cf_options.h
index 378dfc28e7d5..c481c0587dcf 100644
--- a/options/cf_options.h
+++ b/options/cf_options.h
@@ -82,6 +82,8 @@ struct ImmutableCFOptions {
   std::shared_ptr<Cache> blob_cache;
 
   bool persist_user_defined_timestamps;
+
+  bool cf_allow_ingest_behind;
 };
 
 struct ImmutableOptions : public ImmutableDBOptions, public ImmutableCFOptions {
diff --git a/options/options.cc b/options/options.cc
index bafcf61a600c..0ce071573a4e 100644
--- a/options/options.cc
+++ b/options/options.cc
@@ -462,6 +462,8 @@ void ColumnFamilyOptions::Dump(Logger* log) const {
                    experimental_mempurge_threshold);
   ROCKS_LOG_HEADER(log, "           Options.memtable_max_range_deletions: %d",
                    memtable_max_range_deletions);
+  ROCKS_LOG_HEADER(log, "                 Options.cf_allow_ingest_behind: %s",
+                   cf_allow_ingest_behind ? "true" : "false");
 }  // ColumnFamilyOptions::Dump
 
 void Options::Dump(Logger* log) const {
diff --git a/options/options_helper.cc b/options/options_helper.cc
index 999dd28cae94..ef7292bf0c22 100644
--- a/options/options_helper.cc
+++ b/options/options_helper.cc
@@ -339,6 +339,7 @@ void UpdateColumnFamilyOptions(const ImmutableCFOptions& ioptions,
   cf_opts->persist_user_defined_timestamps =
       ioptions.persist_user_defined_timestamps;
   cf_opts->default_temperature = ioptions.default_temperature;
+  cf_opts->cf_allow_ingest_behind = ioptions.cf_allow_ingest_behind;
 
   // TODO(yhchiang): find some way to handle the following derived options
   // * max_file_size
diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc
index 5b099ab4d367..fe16b170446b 100644
--- a/options/options_settable_test.cc
+++ b/options/options_settable_test.cc
@@ -681,7 +681,8 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) {
       "uncache_aggressiveness=1234;"
       "paranoid_memory_checks=1;"
       "memtable_op_scan_flush_trigger=123;"
-      "memtable_avg_op_scan_flush_trigger=12;",
+      "memtable_avg_op_scan_flush_trigger=12;"
+      "cf_allow_ingest_behind=1;",
       new_options));
 
   ASSERT_NE(new_options->blob_cache.get(), nullptr);
diff --git a/unreleased_history/new_features/cf-ingest-behind.md b/unreleased_history/new_features/cf-ingest-behind.md
new file mode 100644
index 000000000000..1a716d17ef24
--- /dev/null
+++ b/unreleased_history/new_features/cf-ingest-behind.md
@@ -0,0 +1 @@
+* Introduce column family option `cf_allow_ingest_behind`. This option aims to replace `DBOptions::allow_ingest_behind` to enable ingest behind at the per-CF level. `DBOptions::allow_ingest_behind` is deprecated.

From 9c0a0c00581cdcf46a81815c0251f9496b692654 Mon Sep 17 00:00:00 2001
From: Jay Huh <jewoongh@meta.com>
Date: Wed, 6 Aug 2025 06:59:51 -0700
Subject: [PATCH 217/500] Fix remote compaction stress test (#13835)

Summary:
Remote Compaction in the stress test previously failed with the following error, so we temporarily disabled it in PR https://github.com/facebook/rocksdb/issues/13815 :

```
reference std::vector<rocksdb::ThreadState *>::operator[](size_type) [_Tp = rocksdb::ThreadState *, _Alloc = std::allocator<rocksdb::ThreadState *>]: Assertion '__n < this->size()' failed.
```

The error was from accessing `remote_compaction_worker_threads[i]` when `i < remote_compaction_worker_threads.size()` which leads to an undefined behavior. This PR fixes the issue by properly setting the worker thread pointers in `remote_compaction_worker_threads`.

Note: We are still encountering errors when both BlobDB and Remote Compaction are enabled. It appears to be a race condition. For now, BlobDB is temporarily disabled if remote compaction is enabled. We will fix the race condition and re-enable BlobDB as a follow-up.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13835

Test Plan:
```
python3 -u tools/db_crashtest.py blackbox --remote_compaction_worker_threads=16 --interval=2 --duration=180
```

Reviewed By: hx235

Differential Revision: D79684447

Pulled By: jaykorean

fbshipit-source-id: 65f5809f651865c3df76c2cf3b9e7b8d654bb90a
---
 db_stress_tool/db_stress_driver.cc | 19 +++++++++----------
 tools/db_crashtest.py              |  8 +++++---
 2 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/db_stress_tool/db_stress_driver.cc b/db_stress_tool/db_stress_driver.cc
index 21b23b4283da..aa93de97ec4a 100644
--- a/db_stress_tool/db_stress_driver.cc
+++ b/db_stress_tool/db_stress_driver.cc
@@ -139,9 +139,9 @@ bool RunStressTestImpl(SharedState* shared) {
     remote_compaction_worker_threads.reserve(
         remote_compaction_worker_thread_count);
     for (uint32_t i = 0; i < remote_compaction_worker_thread_count; i++) {
-      remote_compaction_worker_threads[i] = new ThreadState(i, shared);
-      db_stress_env->StartThread(RemoteCompactionWorkerThread,
-                                 remote_compaction_worker_threads[i]);
+      ThreadState* ts = new ThreadState(i, shared);
+      remote_compaction_worker_threads.push_back(ts);
+      db_stress_env->StartThread(RemoteCompactionWorkerThread, ts);
     }
   }
 
@@ -253,7 +253,7 @@ bool RunStressTestImpl(SharedState* shared) {
       FLAGS_continuous_verification_interval > 0 ||
       FLAGS_compressed_secondary_cache_size > 0 ||
       FLAGS_compressed_secondary_cache_ratio > 0.0 ||
-      FLAGS_remote_compaction_worker_threads > 0) {
+      remote_compaction_worker_thread_count > 0) {
     MutexLock l(shared->GetMutex());
     shared->SetShouldStopBgThread();
     while (!shared->BgThreadsFinished()) {
@@ -261,14 +261,13 @@ bool RunStressTestImpl(SharedState* shared) {
     }
   }
 
-  // Kill remote compaction workers
+  assert(remote_compaction_worker_threads.size() ==
+         remote_compaction_worker_thread_count);
   if (remote_compaction_worker_thread_count > 0) {
-    assert(remote_compaction_worker_threads.capacity() ==
-           remote_compaction_worker_thread_count);
-    for (uint32_t i = 0; i < remote_compaction_worker_thread_count; i++) {
-      delete remote_compaction_worker_threads[i];
-      remote_compaction_worker_threads[i] = nullptr;
+    for (ThreadState* thread_state : remote_compaction_worker_threads) {
+      delete thread_state;
     }
+    remote_compaction_worker_threads.clear();
   }
 
   if (shared->HasVerificationFailedYet()) {
diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index b314b1022e32..c37fa832b1a8 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -343,9 +343,8 @@
     "use_timed_put_one_in": lambda: random.choice([0] * 7 + [1, 5, 10]),
     "universal_max_read_amp": lambda: random.choice([-1] * 3 + [0, 4, 10]),
     "paranoid_memory_checks": lambda: random.choice([0] * 7 + [1]),
-    "allow_unprepared_value": lambda: random.choice([0, 1]),
-    # TODO(jaykorean): re-enable remote compaction stress test once fixed
-    "remote_compaction_worker_threads": lambda: 0,
+    "allow_unprepared_value": lambda: random.choice([0, 1]),    
+    "remote_compaction_worker_threads": lambda: random.choice([0, 8]),
     "auto_refresh_iterator_with_snapshot": lambda: random.choice([0, 1]),
     "memtable_op_scan_flush_trigger": lambda: random.choice([0, 10, 100, 1000]),
     "memtable_avg_op_scan_flush_trigger": lambda: random.choice([0, 2, 20, 200]),
@@ -1087,6 +1086,9 @@ def finalize_and_sanitize(src_params):
     # Continuous verification fails with secondaries inside NonBatchedOpsStressTest
     if dest_params.get("test_secondary") == 1:
         dest_params["continuous_verification_interval"] = 0
+    # TODO Fix races when both Remote Compaction + BlobDB enabled
+    if dest_params.get("remote_compaction_worker_threads") > 0:
+       dest_params["enable_blob_files"] = 0
     return dest_params
 
 

From dfb4efaae3fe7bd0fa55fe37489ed16f95cb93b3 Mon Sep 17 00:00:00 2001
From: ngina <221624547+nmk70@users.noreply.github.com>
Date: Wed, 6 Aug 2025 11:40:09 -0700
Subject: [PATCH 218/500] Add test for deletion-triggered compaction with min
 file size (#13825)

Summary:
**Summary:**
This test verifies that compaction respects the min_file_size parameter when triggered by deletions, preventing the compaction of files with deletions smaller than the threshold. The test logic includes two scenarios:
1. Verify that a large L0 file with deletions exceeding the minimum file size threshold triggers deletion-triggered compaction (DTC) and compacts to L1.
2. Verify that a small L0 file with deletions, but below the minimum file size threshold, does not trigger DTC and remains at L0.

Added the DeletionTriggeredCompactionWithMinFileSizeTestListener, which verifies that files selected for compaction based on deletion triggers meet the minimum file size threshold. The listener validates in OnCompactionBegin that all input files have sizes greater than or equal to the configured min_file_size parameter.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13825

Test Plan:
Tested this feature on our devserver using the following commands:
```
DEBUG_LEVEL=2 make -j64 db_compaction_test && KEEP_DB=1 ./db_compaction_test --gtest_filter="*DBCompactionTest.CompactionWith*"
```

Test output confirms the expected behavior:
```
2025/07/31-11:24:49.473181 1431671 [/compaction/compaction_job.cc:2291] [default] [JOB 6] Compacting 2@0 files to L1, score 0.04
2025/07/31-11:24:49.473240 1431671 [/compaction/compaction_job.cc:2297] [default]: Compaction start summary: Base version 6 Base level 0, inputs: [15(52KB) 9(103KB)]
2025/07/31-11:24:49.473304 1431671 EVENT_LOG_v1 {"time_micros": 1753986289473273, "job": 6, "event": "compaction_started", "cf_name": "default", "compaction_reason": "FilesMarkedForCompaction", "files_L0": [15, 9], "score": 0.04, "input_data_size": 159848, "oldest_snapshot_seqno": -1}

```

**Tasks:**
T228156639

Reviewed By: cbi42

Differential Revision: D79395851

Pulled By: nmk70

fbshipit-source-id: 4c2a80a95521b40543981dd81b347f3984cd2a8b
---
 db/db_compaction_test.cc | 120 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 120 insertions(+)

diff --git a/db/db_compaction_test.cc b/db/db_compaction_test.cc
index 994b0bc56954..940cdeaa5af9 100644
--- a/db/db_compaction_test.cc
+++ b/db/db_compaction_test.cc
@@ -74,6 +74,43 @@ class CompactionStatsCollector : public EventListener {
   std::vector<std::atomic<int>> compaction_completed_;
 };
 
+class DeletionTriggeredCompactionWithMinFileSizeTestListener
+    : public EventListener {
+ public:
+  explicit DeletionTriggeredCompactionWithMinFileSizeTestListener(
+      uint64_t min_file_size)
+      : min_file_size_(min_file_size) {}
+
+  void OnCompactionBegin(DB* db, const CompactionJobInfo& ci) override {
+    ASSERT_EQ(ci.compaction_reason,
+              CompactionReason::kFilesMarkedForCompaction);
+
+    auto env = db->GetEnv();
+    const std::vector<DbPath>& db_paths = db->GetOptions().db_paths;
+    for (const auto& file : ci.input_file_infos) {
+      uint64_t file_size = GetSstFileSize(env, db_paths, file.file_number);
+
+      // Assert that the file size respects the minimum threshold
+      ASSERT_GE(file_size, min_file_size_);
+    }
+  }
+
+ private:
+  static uint64_t GetSstFileSize(Env* env, const std::vector<DbPath>& db_paths,
+                                 uint64_t file_number) {
+    uint32_t path_id = 0;  // since only one path
+    std::string sst_file_name = TableFileName(db_paths, file_number, path_id);
+    uint64_t file_size = 0;
+    Status s = env->GetFileSize(sst_file_name, &file_size);
+    if (!s.ok()) {
+      return 0;
+    }
+    return file_size;
+  }
+
+  uint64_t min_file_size_;
+};
+
 class DBCompactionTest : public DBTestBase {
  public:
   DBCompactionTest()
@@ -1371,6 +1408,89 @@ TEST_F(DBCompactionTest, RecoverDuringMemtableCompaction) {
   } while (ChangeOptions());
 }
 
+TEST_F(DBCompactionTest, CompactionWithDeletionsAndMinFileSize) {
+  const uint64_t kMinFileSize = 32 * 1024;  // 32KB
+  const int kDeletionTriggerCount = 50;
+  const int kInitialKeyCount = 100;
+  const int kAdditionalKeyCount = 50;
+  const int kValueSize = 1024;
+  const int kSmallValueSize = 512;
+  const int kSeed = 301;
+
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleLevel;
+  options.write_buffer_size = 1024 * 1024;  // 1MB
+  options.level0_file_num_compaction_trigger = 100;
+
+  options.table_properties_collector_factories = {
+      NewCompactOnDeletionCollectorFactory(
+          kInitialKeyCount /* sliding window size */, kDeletionTriggerCount,
+          0.5 /* deletion ratio */, kMinFileSize)};
+  auto listener =
+      new DeletionTriggeredCompactionWithMinFileSizeTestListener(kMinFileSize);
+  options.listeners.emplace_back(listener);
+
+  DestroyAndReopen(options);
+  Random rnd(kSeed);
+
+  // Create a large file that will be subject to DTC later
+  for (int i = 0; i < kInitialKeyCount; i++) {
+    ASSERT_OK(Put(Key(i), rnd.RandomString(kValueSize)));
+  }
+  ASSERT_OK(Flush());
+
+  std::vector<LiveFileMetaData> initial_metadata;
+  db_->GetLiveFilesMetaData(&initial_metadata);
+  ASSERT_EQ(initial_metadata.size(), 1);
+
+  // Create small files that should not trigger compaction
+  ASSERT_OK(Put("small_file_key1", rnd.RandomString(kSmallValueSize)));
+  ASSERT_OK(Put("small_file_key2", rnd.RandomString(kSmallValueSize)));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Delete("small_file_key1"));
+  ASSERT_OK(Flush());
+
+  // Create a file with enough deletions and size to trigger DTC
+  // Delete keys from the large file to reach deletion threshold
+  for (int i = 0; i < kDeletionTriggerCount; i++) {
+    ASSERT_OK(Delete(Key(i)));
+  }
+
+  // Add new keys to ensure the deletion file meets the min_file_size threshold
+  for (int i = kInitialKeyCount; i < kInitialKeyCount + kAdditionalKeyCount;
+       i++) {
+    ASSERT_OK(Put(Key(i), rnd.RandomString(kValueSize)));
+  }
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // Verify file count after compaction
+  ASSERT_EQ(NumTableFilesAtLevel(0), 2);  // Small file and deletion file
+  ASSERT_EQ(NumTableFilesAtLevel(1), 1);  // Compacted large file
+
+  // Verify deleted keys are gone
+  for (int i = 0; i < kDeletionTriggerCount; i++) {
+    std::string value;
+    ASSERT_TRUE(db_->Get(ReadOptions(), Key(i), &value).IsNotFound());
+  }
+
+  // Verify non-deleted keys from large file are still accessible
+  for (int i = kDeletionTriggerCount; i < kInitialKeyCount; i++) {
+    std::string value;
+    ASSERT_OK(db_->Get(ReadOptions(), Key(i), &value));
+    ASSERT_EQ(value.size(), kValueSize);
+  }
+
+  // Verify new keys are accessible
+  for (int i = kInitialKeyCount; i < kInitialKeyCount + kAdditionalKeyCount;
+       i++) {
+    std::string value;
+    ASSERT_OK(db_->Get(ReadOptions(), Key(i), &value));
+    ASSERT_EQ(value.size(), kValueSize);
+  }
+}
+
 TEST_P(DBCompactionTestWithParam, TrivialMoveOneFile) {
   int32_t trivial_move = 0;
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(

From 3dd6c6f9cb86e784226786ae03849946646d54a6 Mon Sep 17 00:00:00 2001
From: Jay Huh <jewoongh@meta.com>
Date: Wed, 6 Aug 2025 11:54:23 -0700
Subject: [PATCH 219/500] Disable Incompatible Tests with Remote Compaction
 (#13843)

Summary:
To reduce the noise, disable the incompatible ones for now when `remote_compaction_worker_threads > 0`. We will investigate each, fix as needed and re-enable them as follow up.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13843

Test Plan:
```
python3 -u tools/db_crashtest.py blackbox --remote_compaction_worker_threads=8 --interval=5 --duration=6000 --continuous_verification_interval=10 --disable_wal=1 --use_txn=1 --enable_pipelined_write=0 --checkpoint_one_in=0 --use_timed_put_one_in=0
```

Reviewed By: cbi42

Differential Revision: D79735166

Pulled By: jaykorean

fbshipit-source-id: ae3be38a21073fd3282d6e8cd7d71f0363df3590
---
 tools/db_crashtest.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index c37fa832b1a8..229bf7c076b5 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -1086,9 +1086,15 @@ def finalize_and_sanitize(src_params):
     # Continuous verification fails with secondaries inside NonBatchedOpsStressTest
     if dest_params.get("test_secondary") == 1:
         dest_params["continuous_verification_interval"] = 0
-    # TODO Fix races when both Remote Compaction + BlobDB enabled
     if dest_params.get("remote_compaction_worker_threads") > 0:
+       # TODO Fix races when both Remote Compaction + BlobDB enabled
        dest_params["enable_blob_files"] = 0
+       # TODO Fix - Remote worker shouldn't recover from WAL
+       dest_params["disable_wal"] = 1
+       # Disable Incompatible Ones
+       dest_params["checkpoint_one_in"] = 0
+       dest_params["enable_pipelined_write"] = 0
+       dest_params["use_timed_put_one_in"] = 0
     return dest_params
 
 

From 1bba680ebb175e3302ba1b41460a4b5e1850eb24 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Wed, 6 Aug 2025 15:20:07 -0700
Subject: [PATCH 220/500] Improve handling of GetFileSize failure (#13842)

Summary:
https://github.com/facebook/rocksdb/issues/13676 unfortunately treated some IOErrors as corruption, which is not appropriate when remote storage is involved. To help enforce this, our crash test injects errors that are expected to be propagated back to the user rather than causing some other failure.

Saw crash test failures like this:
```
TestMultiGetEntity (AttributeGroup) error: Corruption: Failed to get file size: Not implemented: GetFileSize Not Supported for file ...
```

So fixing this handling by not injecting a false Corruption failure and allowing smooth fallback from FSRandomAccessFile::GetFileSize to FileSystem::GetFileSize

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13842

Test Plan: unit test added

Reviewed By: xingbowang

Differential Revision: D79728861

Pulled By: pdillinger

fbshipit-source-id: 33f7dfc85d86d88cb4ab24a8defd26618c95c954
---
 db/db_sst_test.cc               | 64 +++++++++++++++++++++++++++++++++
 table/format.cc                 | 18 ++++------
 utilities/fault_injection_fs.cc | 11 +++---
 utilities/fault_injection_fs.h  | 28 ++++++++++-----
 4 files changed, 96 insertions(+), 25 deletions(-)

diff --git a/db/db_sst_test.cc b/db/db_sst_test.cc
index d0579a2c3e4b..d186efd8c600 100644
--- a/db/db_sst_test.cc
+++ b/db/db_sst_test.cc
@@ -1937,6 +1937,70 @@ TEST_F(DBSSTTest, DBWithSFMForBlobFilesAtomicFlush) {
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
 }
 
+TEST_F(DBSSTTest, SstGetFileSizeFails) {
+  // Build an SST file
+  ASSERT_OK(Put("x", "zaphod"));
+  ASSERT_OK(Flush());
+  std::vector<LiveFileMetaData> metadata;
+  db_->GetLiveFilesMetaData(&metadata);
+  ASSERT_EQ(1U, metadata.size());
+  std::string filename = dbname_ + metadata[0].name;
+
+  // Prepare for fault injection
+  std::shared_ptr<FaultInjectionTestFS> fault_fs =
+      std::make_shared<FaultInjectionTestFS>(
+          CurrentOptions().env->GetFileSystem());
+  std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs));
+  Options options = CurrentOptions();
+  options.env = fault_fs_env.get();
+  options.paranoid_checks = false;  // don't check file sizes on open
+
+  for (int i = 0; i < 4; i++) {
+    SCOPED_TRACE("Iteration = " + std::to_string(i));
+    fault_fs->SetFailRandomAccessGetFileSizeSst(false);
+    fault_fs->SetFailFilesystemGetFileSizeSst(false);
+    Close();
+
+    if (i == 1) {
+      // Just FSRandomAccessFile::GetFileSize fails, which should be worked
+      // around
+      fault_fs->SetFailRandomAccessGetFileSizeSst(true);
+    } else if (i == 2) {
+      // FileSystem::GetFileSize fails, which should be worked around if
+      // FSRandomAccessFile::GetFileSize is supported
+      fault_fs->SetFailFilesystemGetFileSizeSst(true);
+    } else if (i == 3) {
+      // Both GetFileSize APIs fail with an IOError
+      fault_fs->SetFailRandomAccessGetFileSizeSst(true);
+      fault_fs->SetFailFilesystemGetFileSizeSst(true);
+    }
+
+    ASSERT_OK(TryReopen(options));
+    std::string value;
+    Status get_status = db_->Get({}, "x", &value);
+    if (i < 2) {
+      ASSERT_OK(get_status);
+    } else if (i == 2) {
+      if (encrypted_env_) {
+        // Can't recover because RandomAccessFile::GetFileSize is not supported
+        // on EncryptedEnv
+        // Fail with propagated IOError. (Not Corruption nor NotSupported!)
+        ASSERT_EQ(get_status.code(), Status::Code::kIOError);
+        ASSERT_STREQ(get_status.getState(), "FileSystem::GetFileSize failed");
+      } else {
+        // Never sees the FileSystem::GetFileSize failure
+        ASSERT_OK(get_status);
+      }
+    } else {
+      ASSERT_EQ(i, 3);
+      // Fail with propagated IOError. (Not Corruption nor NotSupported!)
+      ASSERT_EQ(get_status.code(), Status::Code::kIOError);
+      ASSERT_STREQ(get_status.getState(), "FileSystem::GetFileSize failed");
+    }
+  }
+  Close();
+}
+
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/table/format.cc b/table/format.cc
index 2898749be44b..06a2135f5731 100644
--- a/table/format.cc
+++ b/table/format.cc
@@ -488,20 +488,14 @@ static Status ReadFooterFromFileInternal(
     Footer* footer, uint64_t enforce_table_magic_number) {
   uint64_t file_size_from_file_system = 0;
   Status s;
+  // Prefer the more efficient FSRandomAccessFile::GetFileSize when available
   s = file->file()->GetFileSize(&file_size_from_file_system);
   if (!s.ok()) {
-    auto corrupted_status =
-        Status::Corruption("Failed to get file size: " + s.ToString() +
-                           " for file " + file->file_name());
-    if (s.IsNotSupported()) {
-      // If file handle does not support GetFileSize, try File System API
-      s = fs.GetFileSize(file->file_name(), IOOptions(),
-                         &file_size_from_file_system, nullptr);
-      if (!s.ok()) {
-        return corrupted_status;
-      }
-    } else {
-      return corrupted_status;
+    // Fall back on FileSystem::GetFileSize on failure
+    s = fs.GetFileSize(file->file_name(), IOOptions(),
+                       &file_size_from_file_system, nullptr);
+    if (!s.ok()) {
+      return s;
     }
   }
 
diff --git a/utilities/fault_injection_fs.cc b/utilities/fault_injection_fs.cc
index 143ed760c3f7..338c5ff66577 100644
--- a/utilities/fault_injection_fs.cc
+++ b/utilities/fault_injection_fs.cc
@@ -458,9 +458,9 @@ IOStatus TestFSRandomRWFile::Sync(const IOOptions& options,
 }
 
 TestFSRandomAccessFile::TestFSRandomAccessFile(
-    const std::string& /*fname*/, std::unique_ptr<FSRandomAccessFile>&& f,
+    const std::string& fname, std::unique_ptr<FSRandomAccessFile>&& f,
     FaultInjectionTestFS* fs)
-    : target_(std::move(f)), fs_(fs) {
+    : target_(std::move(f)), fs_(fs), is_sst_(EndsWith(fname, ".sst")) {
   assert(target_ != nullptr);
 }
 
@@ -564,8 +564,8 @@ size_t TestFSRandomAccessFile::GetUniqueId(char* id, size_t max_size) const {
 }
 
 IOStatus TestFSRandomAccessFile::GetFileSize(uint64_t* file_size) {
-  if (fs_->ShouldFailGetFileSize()) {
-    return IOStatus::IOError("GetFileSize failed");
+  if (is_sst_ && fs_->ShouldFailRandomAccessGetFileSizeSst()) {
+    return IOStatus::IOError("FSRandomAccessFile::GetFileSize failed");
   } else {
     return target_->GetFileSize(file_size);
   }
@@ -1065,6 +1065,9 @@ IOStatus FaultInjectionTestFS::GetFileSize(const std::string& f,
                                            const IOOptions& options,
                                            uint64_t* file_size,
                                            IODebugContext* dbg) {
+  if (EndsWith(f, ".sst") && ShouldFailFilesystemGetFileSizeSst()) {
+    return IOStatus::IOError("FileSystem::GetFileSize failed");
+  }
   if (!IsFilesystemActive()) {
     return GetError();
   }
diff --git a/utilities/fault_injection_fs.h b/utilities/fault_injection_fs.h
index 129b3153e46a..151ab1f09499 100644
--- a/utilities/fault_injection_fs.h
+++ b/utilities/fault_injection_fs.h
@@ -163,6 +163,7 @@ class TestFSRandomAccessFile : public FSRandomAccessFile {
  private:
   std::unique_ptr<FSRandomAccessFile> target_;
   FaultInjectionTestFS* fs_;
+  const bool is_sst_;
 };
 
 class TestFSSequentialFile : public FSSequentialFileOwnerWrapper {
@@ -222,9 +223,7 @@ class FaultInjectionTestFS : public FileSystemWrapper {
         injected_thread_local_metadata_write_error_(
             DeleteThreadLocalErrorContext),
         ingest_data_corruption_before_write_(false),
-        checksum_handoff_func_type_(kCRC32c),
-        fail_get_file_unique_id_(false),
-        fail_get_file_size_(false) {}
+        checksum_handoff_func_type_(kCRC32c) {}
   virtual ~FaultInjectionTestFS() override { fs_error_.PermitUncheckedError(); }
 
   static const char* kClassName() { return "FaultInjectionTestFS"; }
@@ -489,14 +488,24 @@ class FaultInjectionTestFS : public FileSystemWrapper {
     return fail_get_file_unique_id_;
   }
 
-  void SetFailGetFileSize(bool flag) {
+  void SetFailRandomAccessGetFileSizeSst(bool flag) {
     MutexLock l(&mutex_);
-    fail_get_file_size_ = flag;
+    fail_random_access_get_file_size_sst_ = flag;
   }
 
-  bool ShouldFailGetFileSize() {
+  bool ShouldFailRandomAccessGetFileSizeSst() {
     MutexLock l(&mutex_);
-    return fail_get_file_size_;
+    return fail_random_access_get_file_size_sst_;
+  }
+
+  void SetFailFilesystemGetFileSizeSst(bool flag) {
+    MutexLock l(&mutex_);
+    fail_fs_get_file_size_sst_ = flag;
+  }
+
+  bool ShouldFailFilesystemGetFileSizeSst() {
+    MutexLock l(&mutex_);
+    return fail_fs_get_file_size_sst_;
   }
 
   // Specify what the operation, so we can inject the right type of error
@@ -657,8 +666,9 @@ class FaultInjectionTestFS : public FileSystemWrapper {
   ThreadLocalPtr injected_thread_local_metadata_write_error_;
   bool ingest_data_corruption_before_write_;
   ChecksumType checksum_handoff_func_type_;
-  bool fail_get_file_unique_id_;
-  bool fail_get_file_size_;
+  bool fail_get_file_unique_id_ = false;
+  bool fail_random_access_get_file_size_sst_ = false;
+  bool fail_fs_get_file_size_sst_ = false;
 
   // Inject an error. For a READ operation, a status of IOError(), a
   // corruption in the contents of scratch, or truncation of slice

From f2b646713e220b79edfbc8a71504c66158c63550 Mon Sep 17 00:00:00 2001
From: zaidoon <zaidoon@cloudflare.com>
Date: Wed, 6 Aug 2025 16:08:21 -0700
Subject: [PATCH 221/500] allow setting sst file manager via c api (#13826)

Summary:
https://github.com/facebook/rocksdb/pull/13404 exposed pretty much everything via c api except allowing the user to set the sst file manager that was created

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13826

Reviewed By: hx235

Differential Revision: D79733147

Pulled By: cbi42

fbshipit-source-id: 6a18741581717a8b8b644b9f85bcd8fbeba94e6a
---
 db/c.cc             | 5 +++++
 include/rocksdb/c.h | 2 ++
 2 files changed, 7 insertions(+)

diff --git a/db/c.cc b/db/c.cc
index 85ce472fcd4a..79b0d7b4b55a 100644
--- a/db/c.cc
+++ b/db/c.cc
@@ -3639,6 +3639,11 @@ void rocksdb_options_set_write_buffer_manager(
   opt->rep.write_buffer_manager = wbm->rep;
 }
 
+void rocksdb_options_set_sst_file_manager(rocksdb_options_t* opt,
+                                          rocksdb_sst_file_manager_t* sfm) {
+  opt->rep.sst_file_manager = sfm->rep;
+}
+
 size_t rocksdb_options_get_write_buffer_size(rocksdb_options_t* opt) {
   return opt->rep.write_buffer_size;
 }
diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h
index 048609bfd38f..0b1aa699bb15 100644
--- a/include/rocksdb/c.h
+++ b/include/rocksdb/c.h
@@ -1154,6 +1154,8 @@ rocksdb_block_based_options_set_unpartitioned_pinning_tier(
     rocksdb_block_based_table_options_t*, int);
 extern ROCKSDB_LIBRARY_API void rocksdb_options_set_write_buffer_manager(
     rocksdb_options_t* opt, rocksdb_write_buffer_manager_t* wbm);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_sst_file_manager(
+    rocksdb_options_t* opt, rocksdb_sst_file_manager_t* sfm);
 
 /* Flush job info */
 

From d0051d931475d7505d81b75e1daf4c3f4c9cbed1 Mon Sep 17 00:00:00 2001
From: Jay Huh <jewoongh@meta.com>
Date: Thu, 7 Aug 2025 09:22:29 -0700
Subject: [PATCH 222/500] Disable other incompatible features when disabled WAL
 + Remote Compaction in Stress Test (#13845)

Summary:
We temporarily disabled WAL when Remote Compaction is enabled in Stress Test (https://github.com/facebook/rocksdb/pull/13843). There are few others to incompatible features when WAL is disabled. Due to the sanitization order, WAL was disabled at the end of the sanitization and these incompatible features weren't set properly. Stress Test failed with an error like the following.

e.g. `reopen` stress test is not compatible with `disable_wal` - `Error: Db cannot reopen safely with disable_wal set!`

This PR changes the order of sanitization so that `disable_wal` is set earlier when `remote_compaction_worker_threads > 0`

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13845

Test Plan:
```
python3 -u tools/db_crashtest.py blackbox --remote_compaction_worker_threads=8 --interval=5 --duration=6000 --continuous_verification_interval=10 --disable_wal=1 --use_txn=1 --txn_write_policy=2 --enable_pipelined_write=0 --checkpoint_one_in=0 --use_timed_put_one_in=0
```

Reviewed By: cbi42

Differential Revision: D79758670

Pulled By: jaykorean

fbshipit-source-id: aa6f4a74cc86c23f442928c301187b06e8137f53
---
 db_stress_tool/db_stress_test_base.cc | 15 +++++++++++++++
 tools/db_crashtest.py                 | 24 ++++++++++++++----------
 2 files changed, 29 insertions(+), 10 deletions(-)

diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc
index 4f88c72e7f0a..b801e1800f36 100644
--- a/db_stress_tool/db_stress_test_base.cc
+++ b/db_stress_tool/db_stress_test_base.cc
@@ -3642,6 +3642,21 @@ void StressTest::Open(SharedState* shared, bool reopen) {
 
   // Remote Compaction
   if (FLAGS_remote_compaction_worker_threads > 0) {
+    // TODO(jaykorean) Remove this after fix - remote worker shouldn't recover
+    // from WAL
+    if (!FLAGS_disable_wal) {
+      fprintf(stderr,
+              "WAL is not compatible with Remote Compaction in Stress Test\n");
+      exit(1);
+    }
+    if ((options_.enable_blob_files ||
+         options_.enable_blob_garbage_collection ||
+         FLAGS_allow_setting_blob_options_dynamically)) {
+      fprintf(stderr,
+              "Integrated BlobDB is currently incompatible with Remote "
+              "Compaction\n");
+      exit(1);
+    }
     options_.compaction_service =
         std::make_shared<DbStressCompactionService>(shared);
   }
diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index 229bf7c076b5..f212bac7732c 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -811,6 +811,14 @@ def finalize_and_sanitize(src_params):
             dest_params["allow_concurrent_memtable_write"] = 1
         else:
             dest_params["unordered_write"] = 0
+    if dest_params.get("remote_compaction_worker_threads", 0) > 0:
+       # TODO Fix races when both Remote Compaction + BlobDB enabled
+       dest_params["enable_blob_files"] = 0
+       # TODO Fix - Remote worker shouldn't recover from WAL
+       dest_params["disable_wal"] = 1
+       # Disable Incompatible Ones
+       dest_params["checkpoint_one_in"] = 0       
+       dest_params["use_timed_put_one_in"] = 0
     if dest_params.get("disable_wal", 0) == 1:
         dest_params["atomic_flush"] = 1
         dest_params["sync"] = 0
@@ -880,6 +888,8 @@ def finalize_and_sanitize(src_params):
         dest_params["use_multi_cf_iterator"] = 0
         # only works with write committed policy
         dest_params["commit_bypass_memtable_one_in"] = 0
+        # not compatible with Remote Compaction yet
+        dest_params["remote_compaction_worker_threads"] = 0
     # TODO(hx235): enable test_multi_ops_txns with fault injection after stabilizing the CI
     if dest_params.get("test_multi_ops_txns") == 1:
         dest_params["write_fault_one_in"] = 0
@@ -1010,6 +1020,9 @@ def finalize_and_sanitize(src_params):
             # have to disable metadata write fault injection to other file
             dest_params["exclude_wal_from_write_fault_injection"] = 1
             dest_params["metadata_write_fault_one_in"] = 0
+
+            # TODO Fix - Remote worker shouldn't recover from WAL 
+            dest_params["remote_compaction_worker_threads"] = 0
     # Disabling block align if mixed manager is being used
     if dest_params.get("compression_manager") == "custom":
         if dest_params.get("block_align") == 1:
@@ -1085,16 +1098,7 @@ def finalize_and_sanitize(src_params):
         dest_params["ingest_wbwi_one_in"] = 0
     # Continuous verification fails with secondaries inside NonBatchedOpsStressTest
     if dest_params.get("test_secondary") == 1:
-        dest_params["continuous_verification_interval"] = 0
-    if dest_params.get("remote_compaction_worker_threads") > 0:
-       # TODO Fix races when both Remote Compaction + BlobDB enabled
-       dest_params["enable_blob_files"] = 0
-       # TODO Fix - Remote worker shouldn't recover from WAL
-       dest_params["disable_wal"] = 1
-       # Disable Incompatible Ones
-       dest_params["checkpoint_one_in"] = 0
-       dest_params["enable_pipelined_write"] = 0
-       dest_params["use_timed_put_one_in"] = 0
+        dest_params["continuous_verification_interval"] = 0   
     return dest_params
 
 

From b43a84fc379ae14e02fd71cd1f623b61db1ec1c6 Mon Sep 17 00:00:00 2001
From: Jay Huh <jewoongh@meta.com>
Date: Thu, 7 Aug 2025 11:02:33 -0700
Subject: [PATCH 223/500] Temporarily Disable Remote Compaction In Stress Test
 (#13848)

Summary:
Previous attempts were not enough keep the stress test running with remote compaction enabled - https://github.com/facebook/rocksdb/pull/13845, https://github.com/facebook/rocksdb/pull/13843, https://github.com/facebook/rocksdb/pull/13835

We will disable the remote compaction in stress test and address this with a better strategy (using internal Meta infra)

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13848

Test Plan: CI

Reviewed By: cbi42

Differential Revision: D79816733

Pulled By: jaykorean

fbshipit-source-id: e93b037adf4f775202e06c3fd4aa8a3b4b85c274
---
 tools/db_crashtest.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index f212bac7732c..786ae2d346de 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -344,7 +344,8 @@
     "universal_max_read_amp": lambda: random.choice([-1] * 3 + [0, 4, 10]),
     "paranoid_memory_checks": lambda: random.choice([0] * 7 + [1]),
     "allow_unprepared_value": lambda: random.choice([0, 1]),    
-    "remote_compaction_worker_threads": lambda: random.choice([0, 8]),
+    # TODO(jaykorean): Re-enable remote compaction once all incompatible features are addressed in stress test
+    "remote_compaction_worker_threads": lambda: 0,
     "auto_refresh_iterator_with_snapshot": lambda: random.choice([0, 1]),
     "memtable_op_scan_flush_trigger": lambda: random.choice([0, 10, 100, 1000]),
     "memtable_avg_op_scan_flush_trigger": lambda: random.choice([0, 2, 20, 200]),

From d2ac955881e856fc69d5b15427d742fc635aaead Mon Sep 17 00:00:00 2001
From: Hui Xiao <huixiao@fb.com>
Date: Thu, 7 Aug 2025 17:22:01 -0700
Subject: [PATCH 224/500] Refactor CompactionJob::Run() into smaller focused
 methods (#13849)

Summary:
**Context/Summary:**
The `CompactionJob::Run()` method has grown too large and complex, making it difficult to implement moderate changes or reason about the code flow (e.g., determining where to save compaction progress for resuming). This PR refactors the method into smaller, more focused functions to improve readability and maintainability.

The refactoring consists mostly of cosmetic changes that extract logical sections into separate methods, with two notable functional improvements:

1.  **Relocated output processing logic**: Moved code under `RemoveEmptyOutputs()` and `HasNewBlobFiles()` to where it's actually needed, rather than piggy-backing on the subcompaction state loop. While this introduces 2 additional loops over subcompactions, the performance impact should be negligible given the improved code clarity.

2.  **Repositioned statistics updates**: Moved `UpdateCompactionJobInputStats()` and `UpdateCompactionJobOutputStats()` from the record verification section to the end `FinalizeCompactionRun()` methods. This change is safe since record verification is a read-only operation that doesn't modify any statistics.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13849

Test Plan: Existing unit tests

Reviewed By: jaykorean

Differential Revision: D79824429

Pulled By: hx235

fbshipit-source-id: 6b73136f32ecc6842a04a77502b7dbb0bbf507f7
---
 db/compaction/compaction_job.cc | 310 ++++++++++++++++++++------------
 db/compaction/compaction_job.h  |  16 ++
 2 files changed, 210 insertions(+), 116 deletions(-)

diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc
index adf86acd54e2..bac6915b6874 100644
--- a/db/compaction/compaction_job.cc
+++ b/db/compaction/compaction_job.cc
@@ -668,16 +668,17 @@ void CompactionJob::GenSubcompactionBoundaries() {
                extra_num_subcompaction_threads_reserved_));
 }
 
-Status CompactionJob::Run() {
+void CompactionJob::InitializeCompactionRun() {
   AutoThreadOperationStageUpdater stage_updater(
       ThreadStatus::STAGE_COMPACTION_RUN);
   TEST_SYNC_POINT("CompactionJob::Run():Start");
   log_buffer_->FlushBufferToLog();
   LogCompaction();
+}
 
+void CompactionJob::RunSubcompactions() {
   const size_t num_threads = compact_->sub_compact_states.size();
   assert(num_threads > 0);
-  const uint64_t start_micros = db_options_.clock->NowMicros();
   compact_->compaction->GetOrInitInputTableProperties();
 
   // Launch a thread for each of subcompactions 1...num_threads-1
@@ -696,25 +697,43 @@ Status CompactionJob::Run() {
   for (auto& thread : thread_pool) {
     thread.join();
   }
+  RemoveEmptyOutputs();
+
+  ReleaseSubcompactionResources();
+  TEST_SYNC_POINT("CompactionJob::ReleaseSubcompactionResources");
+}
 
+void CompactionJob::UpdateTimingStats(uint64_t start_micros) {
   internal_stats_.SetMicros(db_options_.clock->NowMicros() - start_micros);
 
   for (auto& state : compact_->sub_compact_states) {
     internal_stats_.AddCpuMicros(state.compaction_job_stats.cpu_micros);
-    state.RemoveLastEmptyOutput();
   }
 
   RecordTimeToHistogram(stats_, COMPACTION_TIME,
                         internal_stats_.output_level_stats.micros);
   RecordTimeToHistogram(stats_, COMPACTION_CPU_TIME,
                         internal_stats_.output_level_stats.cpu_micros);
+}
 
-  TEST_SYNC_POINT("CompactionJob::Run:BeforeVerify");
+void CompactionJob::RemoveEmptyOutputs() {
+  for (auto& state : compact_->sub_compact_states) {
+    state.RemoveLastEmptyOutput();
+  }
+}
 
-  // Check if any thread encountered an error during execution
+bool CompactionJob::HasNewBlobFiles() const {
+  for (const auto& state : compact_->sub_compact_states) {
+    if (state.Current().HasBlobFileAdditions()) {
+      return true;
+    }
+  }
+  return false;
+}
+
+Status CompactionJob::CollectSubcompactionErrors() {
   Status status;
   IOStatus io_s;
-  bool wrote_new_blob_files = false;
 
   for (const auto& state : compact_->sub_compact_states) {
     if (!state.status.ok()) {
@@ -722,125 +741,131 @@ Status CompactionJob::Run() {
       io_s = state.io_status;
       break;
     }
-
-    if (state.Current().HasBlobFileAdditions()) {
-      wrote_new_blob_files = true;
-    }
   }
 
   if (io_status_.ok()) {
     io_status_ = io_s;
   }
-  if (status.ok()) {
-    constexpr IODebugContext* dbg = nullptr;
 
-    if (output_directory_) {
-      io_s = output_directory_->FsyncWithDirOptions(
-          IOOptions(), dbg,
-          DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced));
-    }
+  return status;
+}
 
-    if (io_s.ok() && wrote_new_blob_files && blob_output_directory_ &&
-        blob_output_directory_ != output_directory_) {
-      io_s = blob_output_directory_->FsyncWithDirOptions(
-          IOOptions(), dbg,
-          DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced));
-    }
+Status CompactionJob::SyncOutputDirectories() {
+  Status status;
+  IOStatus io_s;
+  constexpr IODebugContext* dbg = nullptr;
+  const bool wrote_new_blob_files = HasNewBlobFiles();
+  if (output_directory_) {
+    io_s = output_directory_->FsyncWithDirOptions(
+        IOOptions(), dbg,
+        DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced));
+  }
+
+  if (io_s.ok() && wrote_new_blob_files && blob_output_directory_ &&
+      blob_output_directory_ != output_directory_) {
+    io_s = blob_output_directory_->FsyncWithDirOptions(
+        IOOptions(), dbg,
+        DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced));
   }
+
   if (io_status_.ok()) {
     io_status_ = io_s;
   }
   if (status.ok()) {
     status = io_s;
   }
-  if (status.ok()) {
-    thread_pool.clear();
-    std::vector<const CompactionOutputs::Output*> files_output;
-    for (const auto& state : compact_->sub_compact_states) {
-      for (const auto& output : state.GetOutputs()) {
-        files_output.emplace_back(&output);
-      }
+
+  return status;
+}
+
+Status CompactionJob::VerifyOutputFiles() {
+  Status status;
+  std::vector<port::Thread> thread_pool;
+  std::vector<const CompactionOutputs::Output*> files_output;
+  for (const auto& state : compact_->sub_compact_states) {
+    for (const auto& output : state.GetOutputs()) {
+      files_output.emplace_back(&output);
     }
-    ColumnFamilyData* cfd = compact_->compaction->column_family_data();
-    std::atomic<size_t> next_file_idx(0);
-    auto verify_table = [&](Status& output_status) {
-      while (true) {
-        size_t file_idx = next_file_idx.fetch_add(1);
-        if (file_idx >= files_output.size()) {
-          break;
-        }
-        // Verify that the table is usable
-        // We set for_compaction to false and don't
-        // OptimizeForCompactionTableRead here because this is a special case
-        // after we finish the table building No matter whether
-        // use_direct_io_for_flush_and_compaction is true, we will regard this
-        // verification as user reads since the goal is to cache it here for
-        // further user reads
-        ReadOptions verify_table_read_options(Env::IOActivity::kCompaction);
-        verify_table_read_options.rate_limiter_priority =
-            GetRateLimiterPriority();
-        InternalIterator* iter = cfd->table_cache()->NewIterator(
-            verify_table_read_options, file_options_,
-            cfd->internal_comparator(), files_output[file_idx]->meta,
-            /*range_del_agg=*/nullptr,
-            compact_->compaction->mutable_cf_options(),
-            /*table_reader_ptr=*/nullptr,
-            cfd->internal_stats()->GetFileReadHist(
-                compact_->compaction->output_level()),
-            TableReaderCaller::kCompactionRefill, /*arena=*/nullptr,
-            /*skip_filters=*/false, compact_->compaction->output_level(),
-            MaxFileSizeForL0MetaPin(compact_->compaction->mutable_cf_options()),
-            /*smallest_compaction_key=*/nullptr,
-            /*largest_compaction_key=*/nullptr,
-            /*allow_unprepared_value=*/false);
-        auto s = iter->status();
-
-        if (s.ok() && paranoid_file_checks_) {
-          OutputValidator validator(cfd->internal_comparator(),
-                                    /*_enable_hash=*/true);
-          for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
-            s = validator.Add(iter->key(), iter->value());
-            if (!s.ok()) {
-              break;
-            }
-          }
-          if (s.ok()) {
-            s = iter->status();
-          }
-          if (s.ok() &&
-              !validator.CompareValidator(files_output[file_idx]->validator)) {
-            s = Status::Corruption("Paranoid checksums do not match");
+  }
+  ColumnFamilyData* cfd = compact_->compaction->column_family_data();
+  std::atomic<size_t> next_file_idx(0);
+  auto verify_table = [&](Status& output_status) {
+    while (true) {
+      size_t file_idx = next_file_idx.fetch_add(1);
+      if (file_idx >= files_output.size()) {
+        break;
+      }
+      // Verify that the table is usable
+      // We set for_compaction to false and don't
+      // OptimizeForCompactionTableRead here because this is a special case
+      // after we finish the table building No matter whether
+      // use_direct_io_for_flush_and_compaction is true, we will regard this
+      // verification as user reads since the goal is to cache it here for
+      // further user reads
+      ReadOptions verify_table_read_options(Env::IOActivity::kCompaction);
+      verify_table_read_options.rate_limiter_priority =
+          GetRateLimiterPriority();
+      InternalIterator* iter = cfd->table_cache()->NewIterator(
+          verify_table_read_options, file_options_, cfd->internal_comparator(),
+          files_output[file_idx]->meta,
+          /*range_del_agg=*/nullptr, compact_->compaction->mutable_cf_options(),
+          /*table_reader_ptr=*/nullptr,
+          cfd->internal_stats()->GetFileReadHist(
+              compact_->compaction->output_level()),
+          TableReaderCaller::kCompactionRefill, /*arena=*/nullptr,
+          /*skip_filters=*/false, compact_->compaction->output_level(),
+          MaxFileSizeForL0MetaPin(compact_->compaction->mutable_cf_options()),
+          /*smallest_compaction_key=*/nullptr,
+          /*largest_compaction_key=*/nullptr,
+          /*allow_unprepared_value=*/false);
+      auto s = iter->status();
+
+      if (s.ok() && paranoid_file_checks_) {
+        OutputValidator validator(cfd->internal_comparator(),
+                                  /*_enable_hash=*/true);
+        for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+          s = validator.Add(iter->key(), iter->value());
+          if (!s.ok()) {
+            break;
           }
         }
-
-        delete iter;
-
-        if (!s.ok()) {
-          output_status = s;
-          break;
+        if (s.ok()) {
+          s = iter->status();
+        }
+        if (s.ok() &&
+            !validator.CompareValidator(files_output[file_idx]->validator)) {
+          s = Status::Corruption("Paranoid checksums do not match");
         }
       }
-    };
-    for (size_t i = 1; i < compact_->sub_compact_states.size(); i++) {
-      thread_pool.emplace_back(
-          verify_table, std::ref(compact_->sub_compact_states[i].status));
-    }
-    verify_table(compact_->sub_compact_states[0].status);
-    for (auto& thread : thread_pool) {
-      thread.join();
-    }
 
-    for (const auto& state : compact_->sub_compact_states) {
-      if (!state.status.ok()) {
-        status = state.status;
+      delete iter;
+
+      if (!s.ok()) {
+        output_status = s;
         break;
       }
     }
+  };
+  for (size_t i = 1; i < compact_->sub_compact_states.size(); i++) {
+    thread_pool.emplace_back(verify_table,
+                             std::ref(compact_->sub_compact_states[i].status));
+  }
+  verify_table(compact_->sub_compact_states[0].status);
+  for (auto& thread : thread_pool) {
+    thread.join();
   }
 
-  ReleaseSubcompactionResources();
-  TEST_SYNC_POINT("CompactionJob::ReleaseSubcompactionResources");
+  for (const auto& state : compact_->sub_compact_states) {
+    if (!state.status.ok()) {
+      status = state.status;
+      break;
+    }
+  }
 
+  return status;
+}
+
+void CompactionJob::SetOutputTableProperties() {
   for (const auto& state : compact_->sub_compact_states) {
     for (const auto& output : state.GetOutputs()) {
       auto fn =
@@ -850,7 +875,9 @@ Status CompactionJob::Run() {
                                                      output.table_properties);
     }
   }
+}
 
+void CompactionJob::AggregateSubcompactionStats() {
   // Before the compaction starts, is_remote_compaction was set to true if
   // compaction_service is set. We now know whether each sub_compaction was
   // done remotely or not. Reset is_remote_compaction back to false and allow
@@ -859,34 +886,85 @@ Status CompactionJob::Run() {
 
   // Finish up all bookkeeping to unify the subcompaction results.
   compact_->AggregateCompactionStats(internal_stats_, *job_stats_);
+}
 
-  uint64_t num_input_range_del = 0;
-  bool ok = BuildStatsFromInputTableProperties(&num_input_range_del);
-  // (Sub)compactions returned ok, do sanity check on the number of input
-  // keys.
-  if (status.ok() && ok) {
-    if (job_stats_->has_num_input_records) {
-      status = VerifyInputRecordCount(num_input_range_del);
+Status CompactionJob::VerifyCompactionRecordCounts(
+    bool stats_built_from_input_table_prop, uint64_t num_input_range_del) {
+  Status status;
+  if (stats_built_from_input_table_prop && job_stats_->has_num_input_records) {
+    status = VerifyInputRecordCount(num_input_range_del);
+    if (!status.ok()) {
+      return status;
     }
-    UpdateCompactionJobInputStats(internal_stats_, num_input_range_del);
   }
-  UpdateCompactionJobOutputStats(internal_stats_);
 
-  // Verify number of output records
-  // Only verify on table with format collects table properties
   const auto& mutable_cf_options = compact_->compaction->mutable_cf_options();
-  if (status.ok() && (mutable_cf_options.table_factory->IsInstanceOf(
-                          TableFactory::kBlockBasedTableName()) ||
-                      mutable_cf_options.table_factory->IsInstanceOf(
-                          TableFactory::kPlainTableName()))) {
+  if ((mutable_cf_options.table_factory->IsInstanceOf(
+           TableFactory::kBlockBasedTableName()) ||
+       mutable_cf_options.table_factory->IsInstanceOf(
+           TableFactory::kPlainTableName()))) {
     status = VerifyOutputRecordCount();
+    if (!status.ok()) {
+      return status;
+    }
   }
+  return status;
+}
+
+void CompactionJob::FinalizeCompactionRun(
+    const Status& input_status, bool stats_built_from_input_table_prop,
+    uint64_t num_input_range_del) {
+  if (stats_built_from_input_table_prop) {
+    UpdateCompactionJobInputStats(internal_stats_, num_input_range_del);
+  }
+
+  UpdateCompactionJobOutputStats(internal_stats_);
 
   RecordCompactionIOStats();
   LogFlush(db_options_.info_log);
   TEST_SYNC_POINT("CompactionJob::Run():End");
-  compact_->status = status;
-  TEST_SYNC_POINT_CALLBACK("CompactionJob::Run():EndStatusSet", &status);
+  compact_->status = input_status;
+  TEST_SYNC_POINT_CALLBACK("CompactionJob::Run():EndStatusSet",
+                           const_cast<Status*>(&input_status));
+}
+
+Status CompactionJob::Run() {
+  InitializeCompactionRun();
+
+  const uint64_t start_micros = db_options_.clock->NowMicros();
+
+  RunSubcompactions();
+
+  UpdateTimingStats(start_micros);
+
+  TEST_SYNC_POINT("CompactionJob::Run:BeforeVerify");
+
+  Status status = CollectSubcompactionErrors();
+
+  if (status.ok()) {
+    status = SyncOutputDirectories();
+  }
+
+  if (status.ok()) {
+    status = VerifyOutputFiles();
+  }
+
+  SetOutputTableProperties();
+
+  AggregateSubcompactionStats();
+
+  uint64_t num_input_range_del = 0;
+  bool stats_built_from_input_table_prop =
+      BuildStatsFromInputTableProperties(&num_input_range_del);
+
+  if (status.ok()) {
+    status = VerifyCompactionRecordCounts(stats_built_from_input_table_prop,
+                                          num_input_range_del);
+  }
+
+  FinalizeCompactionRun(status, stats_built_from_input_table_prop,
+                        num_input_range_del);
+
   return status;
 }
 
diff --git a/db/compaction/compaction_job.h b/db/compaction/compaction_job.h
index bc116de9971f..ca5f52fb1774 100644
--- a/db/compaction/compaction_job.h
+++ b/db/compaction/compaction_job.h
@@ -279,6 +279,22 @@ class CompactionJob {
   // Release all reserved threads and update the compaction limits.
   void ReleaseSubcompactionResources();
 
+  void InitializeCompactionRun();
+  void RunSubcompactions();
+  void UpdateTimingStats(uint64_t start_micros);
+  void RemoveEmptyOutputs();
+  bool HasNewBlobFiles() const;
+  Status CollectSubcompactionErrors();
+  Status SyncOutputDirectories();
+  Status VerifyOutputFiles();
+  void SetOutputTableProperties();
+  void AggregateSubcompactionStats();
+  Status VerifyCompactionRecordCounts(bool stats_built_from_input_table_prop,
+                                      uint64_t num_input_range_del);
+  void FinalizeCompactionRun(const Status& status,
+                             bool stats_built_from_input_table_prop,
+                             uint64_t num_input_range_del);
+
   CompactionServiceJobStatus ProcessKeyValueCompactionWithCompactionService(
       SubcompactionState* sub_compact);
 

From b8b42b7a68e7272b347c6d66a67310ad85cd26b6 Mon Sep 17 00:00:00 2001
From: Hui Xiao <huixiao@fb.com>
Date: Fri, 8 Aug 2025 10:09:55 -0700
Subject: [PATCH 225/500] Simple cleanup to CompactionJob::Run() (#13851)

Summary:
**Context/Summary:**
This update, which should have been part of a previous refactoring [PR](https://github.com/facebook/rocksdb/commit/d2ac955881e856fc69d5b15427d742fc635aaead), involves simple renaming for clarity and ensures output table properties are only set when compaction succeeds. Output properties are not meaningful if compaction fails, so this change prevents their population in such cases. Additionally, subsequent statistics updates already do not rely on output file table properties, maintaining correctness regardless of compaction success.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13851

Test Plan: Existing unit tests

Reviewed By: jaykorean

Differential Revision: D79862244

Pulled By: hx235

fbshipit-source-id: 1db16b8dc7b820fab3ec1d5c8a4b757466590e2c
---
 db/compaction/compaction_job.cc | 12 ++++++------
 db/compaction/compaction_job.h  |  5 ++---
 2 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc
index bac6915b6874..58fd2da01bf0 100644
--- a/db/compaction/compaction_job.cc
+++ b/db/compaction/compaction_job.cc
@@ -917,10 +917,9 @@ void CompactionJob::FinalizeCompactionRun(
   if (stats_built_from_input_table_prop) {
     UpdateCompactionJobInputStats(internal_stats_, num_input_range_del);
   }
-
   UpdateCompactionJobOutputStats(internal_stats_);
-
   RecordCompactionIOStats();
+
   LogFlush(db_options_.info_log);
   TEST_SYNC_POINT("CompactionJob::Run():End");
   compact_->status = input_status;
@@ -949,13 +948,15 @@ Status CompactionJob::Run() {
     status = VerifyOutputFiles();
   }
 
-  SetOutputTableProperties();
+  if (status.ok()) {
+    SetOutputTableProperties();
+  }
 
   AggregateSubcompactionStats();
 
   uint64_t num_input_range_del = 0;
   bool stats_built_from_input_table_prop =
-      BuildStatsFromInputTableProperties(&num_input_range_del);
+      BuildStatsFromInputFiles(&num_input_range_del);
 
   if (status.ok()) {
     status = VerifyCompactionRecordCounts(stats_built_from_input_table_prop,
@@ -2150,8 +2151,7 @@ void CopyPrefix(const Slice& src, size_t prefix_length, std::string* dst) {
 }
 }  // namespace
 
-bool CompactionJob::BuildStatsFromInputTableProperties(
-    uint64_t* num_input_range_del) {
+bool CompactionJob::BuildStatsFromInputFiles(uint64_t* num_input_range_del) {
   assert(compact_);
 
   Compaction* compaction = compact_->compaction;
diff --git a/db/compaction/compaction_job.h b/db/compaction/compaction_job.h
index ca5f52fb1774..436169c5691a 100644
--- a/db/compaction/compaction_job.h
+++ b/db/compaction/compaction_job.h
@@ -225,7 +225,7 @@ class CompactionJob {
  private:
   friend class CompactionJobTestBase;
 
-  // Collect the following stats from Input Table Properties
+  // Collect the following stats from input files and table properties
   // - num_input_files_in_non_output_levels
   // - num_input_files_in_output_level
   // - bytes_read_non_output_levels
@@ -242,8 +242,7 @@ class CompactionJob {
   // num_input_range_del are calculated successfully.
   //
   // This should be called only once for compactions (not per subcompaction)
-  bool BuildStatsFromInputTableProperties(
-      uint64_t* num_input_range_del = nullptr);
+  bool BuildStatsFromInputFiles(uint64_t* num_input_range_del = nullptr);
 
   void UpdateCompactionJobInputStats(
       const InternalStats::CompactionStatsFull& internal_stats,

From 0b44282a9dfea7afc46df8d6e1110fb92628ce2f Mon Sep 17 00:00:00 2001
From: Ryan Hancock <krhancock@meta.com>
Date: Fri, 8 Aug 2025 10:33:36 -0700
Subject: [PATCH 226/500] Introduction of MultiScanOptions (#13837)

Summary:
To better support future options, and changes, we need to convert the std::vector<ScanOptions> to something more malleable.

This diff introduces the MultiScanOptions structure and pipes it through the various points in the code in the Prepare path.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13837

Test Plan:
Ensure all associated tests pass
```
make check all
```

Reviewed By: cbi42

Differential Revision: D79655229

Pulled By: krhancoc

fbshipit-source-id: 3a90fb7420e9655021de85ed0158b866f8bfba05
---
 db/arena_wrapped_db_iter.h                    |  2 +-
 db/db_impl/db_impl.cc                         |  2 +-
 db/db_impl/db_impl.h                          |  2 +-
 db/db_iter.h                                  |  6 +-
 db/db_iterator_test.cc                        | 67 +++++++++--------
 db/multi_scan.cc                              | 11 +--
 db/version_set.cc                             | 24 +++---
 db_stress_tool/db_stress_test_base.cc         |  6 +-
 include/rocksdb/db.h                          |  2 +-
 include/rocksdb/iterator.h                    |  2 +-
 include/rocksdb/multi_scan.h                  | 11 ++-
 include/rocksdb/options.h                     | 73 +++++++++++++++++++
 include/rocksdb/utilities/stackable_db.h      |  2 +-
 .../block_based/block_based_table_iterator.cc | 15 ++--
 .../block_based/block_based_table_iterator.h  |  6 +-
 .../block_based_table_reader_test.cc          | 37 ++++++----
 .../block_based/user_defined_index_wrapper.h  |  7 +-
 table/external_table.cc                       |  8 +-
 table/internal_iterator.h                     |  2 +-
 table/iterator_wrapper.h                      |  2 +-
 table/merging_iterator.cc                     |  2 +-
 table/table_test.cc                           | 39 ++++++----
 tools/db_bench_tool.cc                        |  4 +-
 23 files changed, 216 insertions(+), 116 deletions(-)

diff --git a/db/arena_wrapped_db_iter.h b/db/arena_wrapped_db_iter.h
index 647ed62c908c..26062497a0b7 100644
--- a/db/arena_wrapped_db_iter.h
+++ b/db/arena_wrapped_db_iter.h
@@ -98,7 +98,7 @@ class ArenaWrappedDBIter : public Iterator {
 
   bool PrepareValue() override { return db_iter_->PrepareValue(); }
 
-  void Prepare(const std::vector<ScanOptions>& scan_opts) override {
+  void Prepare(const MultiScanArgs& scan_opts) override {
     db_iter_->Prepare(scan_opts);
   }
 
diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc
index c9471122d7ce..2c9a5f226cd5 100644
--- a/db/db_impl/db_impl.cc
+++ b/db/db_impl/db_impl.cc
@@ -3835,7 +3835,7 @@ bool DBImpl::KeyMayExist(const ReadOptions& read_options,
 
 std::unique_ptr<MultiScan> DBImpl::NewMultiScan(
     const ReadOptions& _read_options, ColumnFamilyHandle* column_family,
-    const std::vector<ScanOptions>& scan_opts) {
+    const MultiScanArgs& scan_opts) {
   std::unique_ptr<MultiScan> ms_iter = std::make_unique<MultiScan>(
       _read_options, scan_opts, this, column_family);
   return ms_iter;
diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h
index 60f95ea27ade..702a8b9e648a 100644
--- a/db/db_impl/db_impl.h
+++ b/db/db_impl/db_impl.h
@@ -386,7 +386,7 @@ class DBImpl : public DB {
   using DB::NewMultiScan;
   std::unique_ptr<MultiScan> NewMultiScan(
       const ReadOptions& _read_options, ColumnFamilyHandle* column_family,
-      const std::vector<ScanOptions>& scan_opts) override;
+      const MultiScanArgs& scan_opts) override;
 
   const Snapshot* GetSnapshot() override;
   void ReleaseSnapshot(const Snapshot* snapshot) override;
diff --git a/db/db_iter.h b/db/db_iter.h
index 6bb64b6e732e..28a5b22db7fa 100644
--- a/db/db_iter.h
+++ b/db/db_iter.h
@@ -240,8 +240,8 @@ class DBIter final : public Iterator {
 
   bool PrepareValue() override;
 
-  void Prepare(const std::vector<ScanOptions>& scan_opts) override {
-    std::optional<std::vector<ScanOptions>> new_scan_opts;
+  void Prepare(const MultiScanArgs& scan_opts) override {
+    std::optional<MultiScanArgs> new_scan_opts;
     new_scan_opts.emplace(scan_opts);
     scan_opts_.swap(new_scan_opts);
     if (!scan_opts.empty()) {
@@ -505,7 +505,7 @@ class DBIter final : public Iterator {
   const Slice* const timestamp_lb_;
   const size_t timestamp_size_;
   std::string saved_timestamp_;
-  std::optional<std::vector<ScanOptions>> scan_opts_;
+  std::optional<MultiScanArgs> scan_opts_;
   ReadOnlyMemTable* const active_mem_;
   SequenceNumber memtable_seqno_lb_;
   uint32_t memtable_op_scan_flush_trigger_;
diff --git a/db/db_iterator_test.cc b/db/db_iterator_test.cc
index d5f1e1f43434..e894e1e1bec8 100644
--- a/db/db_iterator_test.cc
+++ b/db/db_iterator_test.cc
@@ -4159,9 +4159,9 @@ TEST_F(DBMultiScanIteratorTest, BasicTest) {
 
   std::vector<std::string> key_ranges({"k03", "k10", "k25", "k50"});
   ReadOptions ro;
-  std::vector<ScanOptions> scan_options(
-      {ScanOptions(key_ranges[0], key_ranges[1]),
-       ScanOptions(key_ranges[2], key_ranges[3])});
+  MultiScanArgs scan_options(BytewiseComparator());
+  scan_options.insert(key_ranges[0], key_ranges[1]);
+  scan_options.insert(key_ranges[2], key_ranges[3]);
   ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily();
   std::unique_ptr<MultiScan> iter =
       dbfull()->NewMultiScan(ro, cfh, scan_options);
@@ -4190,7 +4190,10 @@ TEST_F(DBMultiScanIteratorTest, BasicTest) {
 
   // Test the overlapping scan case
   key_ranges[1] = "k30";
-  scan_options[0] = ScanOptions(key_ranges[0], key_ranges[1]);
+  scan_options = MultiScanArgs(BytewiseComparator());
+  scan_options.insert(key_ranges[0], key_ranges[1]);
+  scan_options.insert(key_ranges[2], key_ranges[3]);
+
   iter = dbfull()->NewMultiScan(ro, cfh, scan_options);
   try {
     int idx = 0;
@@ -4216,8 +4219,9 @@ TEST_F(DBMultiScanIteratorTest, BasicTest) {
   iter.reset();
 
   // Test the no limit scan case
-  scan_options[0] = ScanOptions(key_ranges[0]);
-  scan_options[1] = ScanOptions(key_ranges[2]);
+  scan_options = MultiScanArgs(BytewiseComparator());
+  scan_options.insert(key_ranges[0]);
+  scan_options.insert(key_ranges[2]);
   iter = dbfull()->NewMultiScan(ro, cfh, scan_options);
   try {
     int idx = 0;
@@ -4257,9 +4261,10 @@ TEST_F(DBMultiScanIteratorTest, MixedBoundsTest) {
   std::vector<std::string> key_ranges(
       {"k03", "k10", "k25", "k50", "k75", "k90"});
   ReadOptions ro;
-  std::vector<ScanOptions> scan_options(
-      {ScanOptions(key_ranges[0], key_ranges[1]), ScanOptions(key_ranges[2]),
-       ScanOptions(key_ranges[4], key_ranges[5])});
+  MultiScanArgs scan_options(BytewiseComparator());
+  scan_options.insert(key_ranges[0], key_ranges[1]);
+  scan_options.insert(key_ranges[2]);
+  scan_options.insert(key_ranges[4], key_ranges[5]);
   ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily();
   std::unique_ptr<MultiScan> iter =
       dbfull()->NewMultiScan(ro, cfh, scan_options);
@@ -4268,13 +4273,15 @@ TEST_F(DBMultiScanIteratorTest, MixedBoundsTest) {
     int count = 0;
     for (auto range : *iter) {
       for (auto it : range) {
-        ASSERT_GE(it.first.ToString().compare(
-                      scan_options[idx].range.start->ToString()),
-                  0);
-        if (scan_options[idx].range.limit) {
-          ASSERT_LT(it.first.ToString().compare(
-                        scan_options[idx].range.limit->ToString()),
-                    0);
+        ASSERT_GE(
+            it.first.ToString().compare(
+                scan_options.GetScanRanges()[idx].range.start->ToString()),
+            0);
+        if (scan_options.GetScanRanges()[idx].range.limit) {
+          ASSERT_LT(
+              it.first.ToString().compare(
+                  scan_options.GetScanRanges()[idx].range.limit->ToString()),
+              0);
         }
         count++;
       }
@@ -4291,23 +4298,25 @@ TEST_F(DBMultiScanIteratorTest, MixedBoundsTest) {
     abort();
   }
   iter.reset();
-
-  scan_options[0] = ScanOptions(key_ranges[0]);
-  scan_options[1] = ScanOptions(key_ranges[2], key_ranges[3]);
-  scan_options[2] = ScanOptions(key_ranges[4]);
+  scan_options = MultiScanArgs(BytewiseComparator());
+  scan_options.insert(key_ranges[0]);
+  scan_options.insert(key_ranges[2], key_ranges[3]);
+  scan_options.insert(key_ranges[4]);
   iter = dbfull()->NewMultiScan(ro, cfh, scan_options);
   try {
     int idx = 0;
     int count = 0;
     for (auto range : *iter) {
       for (auto it : range) {
-        ASSERT_GE(it.first.ToString().compare(
-                      scan_options[idx].range.start->ToString()),
-                  0);
-        if (scan_options[idx].range.limit) {
-          ASSERT_LT(it.first.ToString().compare(
-                        scan_options[idx].range.limit->ToString()),
-                    0);
+        ASSERT_GE(
+            it.first.ToString().compare(
+                scan_options.GetScanRanges()[idx].range.start->ToString()),
+            0);
+        if (scan_options.GetScanRanges()[idx].range.limit) {
+          ASSERT_LT(
+              it.first.ToString().compare(
+                  scan_options.GetScanRanges()[idx].range.limit->ToString()),
+              0);
         }
         count++;
       }
@@ -4345,8 +4354,8 @@ TEST_F(DBMultiScanIteratorTest, RangeAcrossFiles) {
   ASSERT_EQ(2, NumTableFilesAtLevel(49));
   std::vector<std::string> key_ranges({Key(10), Key(90)});
   ReadOptions ro;
-  std::vector<ScanOptions> scan_options(
-      {ScanOptions(key_ranges[0], key_ranges[1])});
+  MultiScanArgs scan_options(BytewiseComparator());
+  scan_options.insert(key_ranges[0], key_ranges[1]);
   ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily();
   std::unique_ptr<MultiScan> iter =
       dbfull()->NewMultiScan(ro, cfh, scan_options);
diff --git a/db/multi_scan.cc b/db/multi_scan.cc
index 663793240139..ae31c4882d62 100644
--- a/db/multi_scan.cc
+++ b/db/multi_scan.cc
@@ -10,24 +10,25 @@ namespace ROCKSDB_NAMESPACE {
 using MultiScanIterator = MultiScan::MultiScanIterator;
 
 MultiScan::MultiScan(const ReadOptions& read_options,
-                     const std::vector<ScanOptions>& scan_opts, DB* db,
+                     const MultiScanArgs& scan_opts, DB* db,
                      ColumnFamilyHandle* cfh)
     : read_options_(read_options), scan_opts_(scan_opts), db_(db), cfh_(cfh) {
   bool slow_path = false;
   // Setup read_options with iterate_uuper_bound based on the first scan.
   // Subsequent scans will update and allocate a new DB iterator as necessary
-  if (scan_opts[0].range.limit) {
-    upper_bound_ = *scan_opts[0].range.limit;
+  if (scan_opts.GetScanRanges()[0].range.limit) {
+    upper_bound_ = *scan_opts.GetScanRanges()[0].range.limit;
     read_options_.iterate_upper_bound = &upper_bound_;
   } else {
     read_options_.iterate_upper_bound = nullptr;
   }
-  for (auto opts : scan_opts) {
+  for (const auto& opts : scan_opts.GetScanRanges()) {
     // Check that all the ScanOptions either specify an upper bound or not. If
     // its mixed we take the slow path which avoids calling Prepare: we have to
     // reallocate the Iterator with updated read_options everytime we switch
     // between upper bound or no upper bound, which complicates Prepare.
-    if (opts.range.limit.has_value() != scan_opts[0].range.limit.has_value()) {
+    if (opts.range.limit.has_value() !=
+        scan_opts.GetScanRanges()[0].range.limit.has_value()) {
       slow_path = true;
       break;
     }
diff --git a/db/version_set.cc b/db/version_set.cc
index 9bf3d35c0b41..a9a51e4d9dc9 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -95,7 +95,7 @@ namespace ROCKSDB_NAMESPACE {
 
 namespace {
 
-using ScanOptionsMap = std::unordered_map<size_t, std::vector<ScanOptions>>;
+using ScanOptionsMap = std::unordered_map<size_t, MultiScanArgs>;
 
 // Find File in LevelFilesBrief data structure
 // Within an index range defined by left and right
@@ -1101,17 +1101,17 @@ class LevelIterator final : public InternalIterator {
     read_seq_ = read_seq;
   }
 
-  void Prepare(const std::vector<ScanOptions>* scan_opts) override {
+  void Prepare(const MultiScanArgs* so) override {
     // We assume here that scan_opts is sorted such that
     // scan_opts[0].range.start < scan_opts[1].range.start, and non overlapping
-    scan_opts_ = scan_opts;
-    if (scan_opts_ == nullptr) {
+    if (so == nullptr) {
       return;
     }
+    scan_opts_ = so;
 
     file_to_scan_opts_ = std::make_unique<ScanOptionsMap>();
     for (size_t k = 0; k < scan_opts_->size(); k++) {
-      const ScanOptions& opt = scan_opts_->at(k);
+      const ScanOptions& opt = scan_opts_->GetScanRanges().at(k);
       auto start = opt.range.start;
       auto end = opt.range.limit;
 
@@ -1139,8 +1139,8 @@ class LevelIterator final : public InternalIterator {
       // 3. [  S  ] ...... [  E  ]
       for (auto i = fstart; i <= fend; i++) {
         if (i < flevel_->num_files) {
-          (*file_to_scan_opts_)[i].emplace_back(start.value(), end.value());
-          (*file_to_scan_opts_)[i].back().property_bag = opt.property_bag;
+          (*file_to_scan_opts_)[i].insert(start.value(), end.value(),
+                                          opt.property_bag);
         }
       }
     }
@@ -1271,7 +1271,7 @@ class LevelIterator final : public InternalIterator {
   bool prefix_exhausted_ = false;
   // Whether next/prev key is a sentinel key.
   bool to_return_sentinel_ = false;
-  const std::vector<ScanOptions>* scan_opts_;
+  const MultiScanArgs* scan_opts_ = nullptr;
 
   // Our stored scan_opts for each prefix
   std::unique_ptr<ScanOptionsMap> file_to_scan_opts_ = nullptr;
@@ -1540,7 +1540,8 @@ bool LevelIterator::SkipEmptyFileForward() {
       // If we are doing prepared scan opts then we should seek to the values
       // specified by the scan opts
       if (scan_opts_ && (*file_to_scan_opts_)[file_index_].size()) {
-        const ScanOptions& opts = file_to_scan_opts_->at(file_index_).front();
+        const ScanOptions& opts =
+            file_to_scan_opts_->at(file_index_).GetScanRanges().front();
         if (opts.range.start.has_value()) {
           InternalKey target(*opts.range.start.AsPtr(), kMaxSequenceNumber,
                              kValueTypeForSeek);
@@ -1599,9 +1600,8 @@ void LevelIterator::SetFileIterator(InternalIterator* iter) {
   if (iter && scan_opts_) {
     if (file_to_scan_opts_.get() &&
         file_to_scan_opts_->find(file_index_) != file_to_scan_opts_->end()) {
-      const std::vector<ScanOptions>& opts =
-          file_to_scan_opts_->at(file_index_);
-      file_iter_.Prepare(&opts);
+      const MultiScanArgs& new_opts = file_to_scan_opts_->at(file_index_);
+      file_iter_.Prepare(&new_opts);
     } else {
       file_iter_.Prepare(scan_opts_);
     }
diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc
index b801e1800f36..ab3e296e4cbf 100644
--- a/db_stress_tool/db_stress_test_base.cc
+++ b/db_stress_tool/db_stress_test_base.cc
@@ -1677,7 +1677,7 @@ Status StressTest::TestMultiScan(ThreadState* thread,
 
   std::vector<std::string> start_key_strs;
   std::vector<std::string> end_key_strs;
-  std::vector<ScanOptions> scan_opts;
+  MultiScanArgs scan_opts;
   start_key_strs.reserve(num_scans);
   end_key_strs.reserve(num_scans);
 
@@ -1688,7 +1688,7 @@ Status StressTest::TestMultiScan(ThreadState* thread,
     assert(rand_keys[i] <= rand_keys[i + 1]);
     start_key_strs.emplace_back(Key(rand_keys[i]));
     end_key_strs.emplace_back(Key(rand_keys[i + 1]));
-    scan_opts.emplace_back(start_key_strs.back(), end_key_strs.back());
+    scan_opts.insert(Slice(start_key_strs.back()), Slice(end_key_strs.back()));
   }
 
   std::string op_logs;
@@ -1715,7 +1715,7 @@ Status StressTest::TestMultiScan(ThreadState* thread,
     return true;
   };
 
-  for (const ScanOptions& scan_opt : scan_opts) {
+  for (const ScanOptions& scan_opt : scan_opts.GetScanRanges()) {
     if (op_logs.size() > kOpLogsLimit) {
       // Shouldn't take too much memory for the history log. Clear it.
       op_logs = "(cleared...)\n";
diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h
index 4038ab5c3dc2..14db14aa5a9e 100644
--- a/include/rocksdb/db.h
+++ b/include/rocksdb/db.h
@@ -1119,7 +1119,7 @@ class DB {
   //  }
   virtual std::unique_ptr<MultiScan> NewMultiScan(
       const ReadOptions& /*options*/, ColumnFamilyHandle* /*column_family*/,
-      const std::vector<ScanOptions>& /*scan_opts*/) {
+      const MultiScanArgs& /*scan_opts*/) {
     std::unique_ptr<Iterator> iter(NewErrorIterator(Status::NotSupported()));
     std::unique_ptr<MultiScan> ms_iter =
         std::make_unique<MultiScan>(std::move(iter));
diff --git a/include/rocksdb/iterator.h b/include/rocksdb/iterator.h
index af7934e34a1b..b006138376aa 100644
--- a/include/rocksdb/iterator.h
+++ b/include/rocksdb/iterator.h
@@ -109,7 +109,7 @@ class Iterator : public IteratorBase {
   //
   // If Prepare() is called, it overrides the iterate_upper_bound in
   // ReadOptions
-  virtual void Prepare(const std::vector<ScanOptions>& /*scan_opts*/) {}
+  virtual void Prepare(const MultiScanArgs& /*scan_opts*/) {}
 };
 
 // Return an empty iterator (yields nothing).
diff --git a/include/rocksdb/multi_scan.h b/include/rocksdb/multi_scan.h
index c76cb9c7c407..c9af9022a0e1 100644
--- a/include/rocksdb/multi_scan.h
+++ b/include/rocksdb/multi_scan.h
@@ -152,9 +152,8 @@ class Scan {
 // A Status exception is thrown if there is an error.
 class MultiScan {
  public:
-  MultiScan(const ReadOptions& read_options,
-            const std::vector<ScanOptions>& scan_opts, DB* db,
-            ColumnFamilyHandle* cfh);
+  MultiScan(const ReadOptions& read_options, const MultiScanArgs& scan_opts,
+            DB* db, ColumnFamilyHandle* cfh);
 
   explicit MultiScan(std::unique_ptr<Iterator>&& db_iter)
       : db_iter_(std::move(db_iter)) {}
@@ -220,15 +219,15 @@ class MultiScan {
   };
 
   MultiScanIterator begin() {
-    return MultiScanIterator(scan_opts_, db_, cfh_, read_options_,
-                             &upper_bound_, db_iter_);
+    return MultiScanIterator(scan_opts_.GetScanRanges(), db_, cfh_,
+                             read_options_, &upper_bound_, db_iter_);
   }
 
   std::nullptr_t end() { return nullptr; }
 
  private:
   ReadOptions read_options_;
-  const std::vector<ScanOptions> scan_opts_;
+  const MultiScanArgs scan_opts_;
   DB* db_;
   ColumnFamilyHandle* cfh_;
   Slice upper_bound_;
diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index 5a856f1f233c..a436c43e389c 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -1776,6 +1776,79 @@ struct ScanOptions {
       : range(_start, _upper_bound) {}
 };
 
+// Container for multiple scan ranges that can be used with MultiScan.
+// This replaces std::vector<ScanOptions> with a more efficient implementation
+// that can merge overlapping ranges.
+class MultiScanArgs {
+ public:
+  // Constructor that takes a comparator
+  explicit MultiScanArgs(const Comparator* comparator = BytewiseComparator())
+      : comp_(comparator) {}
+
+  // Copy Constructor
+  MultiScanArgs(const MultiScanArgs& other) {
+    comp_ = other.comp_;
+    original_ranges_ = other.original_ranges_;
+  }
+  MultiScanArgs(MultiScanArgs&& other) noexcept
+      : comp_(other.comp_),
+        original_ranges_(std::move(other.original_ranges_)) {}
+
+  MultiScanArgs& operator=(const MultiScanArgs& other) {
+    comp_ = other.comp_;
+    original_ranges_ = other.original_ranges_;
+    return *this;
+  }
+
+  MultiScanArgs& operator=(MultiScanArgs&& other) noexcept {
+    if (this != &other) {
+      comp_ = other.comp_;
+      original_ranges_ = std::move(other.original_ranges_);
+    }
+    return *this;
+  }
+
+  void insert(const Slice& s, const Slice& b) {
+    original_ranges_.emplace_back(s, b);
+  }
+
+  void insert(const Slice& s, const Slice& b,
+              const std::optional<std::unordered_map<std::string, std::string>>&
+                  property_bag) {
+    original_ranges_.emplace_back(s, b);
+    original_ranges_.back().property_bag = property_bag;
+  }
+
+  void insert(const Slice& s) { original_ranges_.emplace_back(s); }
+
+  void insert(const Slice& s,
+              const std::optional<std::unordered_map<std::string, std::string>>&
+                  property_bag) {
+    original_ranges_.emplace_back(s);
+    original_ranges_.back().property_bag = property_bag;
+  }
+
+  size_t size() const { return original_ranges_.size(); }
+  bool empty() const { return original_ranges_.empty(); }
+
+  void reserve(size_t size) { original_ranges_.reserve(size); }
+
+  operator std::vector<ScanOptions>*() { return &original_ranges_; }
+
+  operator const std::vector<ScanOptions>*() const { return &original_ranges_; }
+  // Destructor
+  ~MultiScanArgs() {}
+
+  const std::vector<ScanOptions>& GetScanRanges() const {
+    return original_ranges_;
+  }
+
+ private:
+  // The comparator used for ordering ranges
+  const Comparator* comp_;
+  std::vector<ScanOptions> original_ranges_;
+};
+
 // Options that control read operations
 struct ReadOptions {
   // *** BEGIN options relevant to point lookups as well as scans ***
diff --git a/include/rocksdb/utilities/stackable_db.h b/include/rocksdb/utilities/stackable_db.h
index 89549941cb91..06c5d1f7d8e5 100644
--- a/include/rocksdb/utilities/stackable_db.h
+++ b/include/rocksdb/utilities/stackable_db.h
@@ -292,7 +292,7 @@ class StackableDB : public DB {
   using DB::NewMultiScan;
   std::unique_ptr<MultiScan> NewMultiScan(
       const ReadOptions& opts, ColumnFamilyHandle* column_family,
-      const std::vector<ScanOptions>& scan_opts) override {
+      const MultiScanArgs& scan_opts) override {
     return db_->NewMultiScan(opts, column_family, scan_opts);
   }
 
diff --git a/table/block_based/block_based_table_iterator.cc b/table/block_based/block_based_table_iterator.cc
index 7ca2dbc896a1..5d9536a87810 100644
--- a/table/block_based/block_based_table_iterator.cc
+++ b/table/block_based/block_based_table_iterator.cc
@@ -932,19 +932,19 @@ void BlockBasedTableIterator::BlockCacheLookupForReadAheadSize(
 // ReadOptions::max_skippable_internal_keys or reseeking into range deletion
 // end key. So these Seeks can cause iterator to fall back to normal
 // (non-prepared) iterator and ignore the optimizations done in Prepare().
-// TODO: support fill_cache = false and when block cache is disabled.
-void BlockBasedTableIterator::Prepare(
-    const std::vector<ScanOptions>* scan_opts) {
-  index_iter_->Prepare(scan_opts);
+void BlockBasedTableIterator::Prepare(const MultiScanArgs* multiscan_opts) {
+  index_iter_->Prepare(multiscan_opts);
 
   assert(!multi_scan_);
   if (multi_scan_) {
     multi_scan_.reset();
     return;
   }
-  if (scan_opts == nullptr || scan_opts->empty()) {
+  if (multiscan_opts == nullptr || multiscan_opts->empty()) {
     return;
   }
+
+  const std::vector<ScanOptions>* scan_opts = &multiscan_opts->GetScanRanges();
   const bool has_limit = scan_opts->front().range.limit.has_value();
   if (!has_limit && scan_opts->size() > 1) {
     // Abort: overlapping ranges
@@ -1183,7 +1183,7 @@ void BlockBasedTableIterator::Prepare(
 
   // Successful Prepare, init related states so the iterator reads from prepared
   // blocks
-  multi_scan_.reset(new MultiScanState(scan_opts,
+  multi_scan_.reset(new MultiScanState(multiscan_opts,
                                        std::move(pinned_data_blocks_guard),
                                        std::move(block_ranges_per_scan)));
   is_index_at_curr_block_ = false;
@@ -1202,7 +1202,8 @@ bool BlockBasedTableIterator::SeekMultiScan(const Slice* target) {
     multi_scan_.reset();
   } else if (user_comparator_.CompareWithoutTimestamp(
                  ExtractUserKey(*target), /*a_has_ts=*/true,
-                 (*multi_scan_->scan_opts)[multi_scan_->next_scan_idx]
+                 multi_scan_->scan_opts
+                     ->GetScanRanges()[multi_scan_->next_scan_idx]
                      .range.start.value(),
                  /*b_has_ts=*/false) != 0) {
     // Unexpected seek key
diff --git a/table/block_based/block_based_table_iterator.h b/table/block_based/block_based_table_iterator.h
index dff61ad9c35a..d31296fcf841 100644
--- a/table/block_based/block_based_table_iterator.h
+++ b/table/block_based/block_based_table_iterator.h
@@ -227,7 +227,7 @@ class BlockBasedTableIterator : public InternalIteratorBase<Slice> {
     }
   }
 
-  void Prepare(const std::vector<ScanOptions>* scan_opts) override;
+  void Prepare(const MultiScanArgs* scan_opts) override;
 
   FilePrefetchBuffer* prefetch_buffer() {
     return block_prefetcher_.prefetch_buffer();
@@ -375,7 +375,7 @@ class BlockBasedTableIterator : public InternalIteratorBase<Slice> {
   // *** BEGIN MultiScan related states ***
   struct MultiScanState {
     // bool prepared_ = false;
-    const std::vector<ScanOptions>* scan_opts;
+    const MultiScanArgs* scan_opts;
     std::vector<CachableEntry<Block>> pinned_data_blocks;
 
     // Indicies into multiscan_pinned_data_blocks_ for data blocks that are
@@ -386,7 +386,7 @@ class BlockBasedTableIterator : public InternalIteratorBase<Slice> {
     size_t cur_data_block_idx;
 
     MultiScanState(
-        const std::vector<ScanOptions>* _scan_opts,
+        const MultiScanArgs* _scan_opts,
         std::vector<CachableEntry<Block>>&& _pinned_data_blocks,
         std::vector<std::tuple<size_t, size_t>>&& _block_ranges_per_scan)
         : scan_opts(_scan_opts),
diff --git a/table/block_based/block_based_table_reader_test.cc b/table/block_based/block_based_table_reader_test.cc
index 3eec5cfbd7f8..89cb4a66f557 100644
--- a/table/block_based/block_based_table_reader_test.cc
+++ b/table/block_based/block_based_table_reader_test.cc
@@ -1024,11 +1024,11 @@ TEST_P(BlockBasedTableReaderTest, MultiScanPrepare) {
       /*skip_filters=*/false, TableReaderCaller::kUncategorized));
 
   // Should coalesce into a single I/O
-  std::vector<ScanOptions> scan_options(
-      {ScanOptions(ExtractUserKey(kv[0].first),
-                   ExtractUserKey(kv[kEntriesPerBlock].first)),
-       ScanOptions(ExtractUserKey(kv[2 * kEntriesPerBlock].first),
-                   ExtractUserKey(kv[3 * kEntriesPerBlock].first))});
+  MultiScanArgs scan_options(BytewiseComparator());
+  scan_options.insert(ExtractUserKey(kv[0].first),
+                      ExtractUserKey(kv[kEntriesPerBlock].first));
+  scan_options.insert(ExtractUserKey(kv[2 * kEntriesPerBlock].first),
+                      ExtractUserKey(kv[3 * kEntriesPerBlock].first));
 
   auto read_count_before =
       options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
@@ -1057,10 +1057,12 @@ TEST_P(BlockBasedTableReaderTest, MultiScanPrepare) {
       read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
       /*skip_filters=*/false, TableReaderCaller::kUncategorized));
   // No IO coalesce, should do MultiRead with 2 read requests.
-  scan_options = {ScanOptions(ExtractUserKey(kv[70 * kEntriesPerBlock].first),
-                              ExtractUserKey(kv[75 * kEntriesPerBlock].first)),
-                  ScanOptions(ExtractUserKey(kv[90 * kEntriesPerBlock].first),
-                              ExtractUserKey(kv[95 * kEntriesPerBlock].first))};
+  scan_options = MultiScanArgs(BytewiseComparator());
+  scan_options.insert(ExtractUserKey(kv[70 * kEntriesPerBlock].first),
+                      ExtractUserKey(kv[75 * kEntriesPerBlock].first));
+  scan_options.insert(ExtractUserKey(kv[90 * kEntriesPerBlock].first),
+                      ExtractUserKey(kv[95 * kEntriesPerBlock].first));
+
   read_count_before =
       options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
   iter->Prepare(&scan_options);
@@ -1088,7 +1090,8 @@ TEST_P(BlockBasedTableReaderTest, MultiScanPrepare) {
       /*skip_filters=*/false, TableReaderCaller::kUncategorized));
   // Should do two I/Os since blocks 80-81 and 90-95 are already in block cache,
   // reads from blocks 50-79 and 82-.. are co
-  scan_options = {ScanOptions(ExtractUserKey(kv[50 * kEntriesPerBlock].first))};
+  scan_options = MultiScanArgs(BytewiseComparator());
+  scan_options.insert(ExtractUserKey(kv[50 * kEntriesPerBlock].first));
   read_count_before =
       options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
   iter->Prepare(&scan_options);
@@ -1108,10 +1111,11 @@ TEST_P(BlockBasedTableReaderTest, MultiScanPrepare) {
   iter.reset(table->NewIterator(
       read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
       /*skip_filters=*/false, TableReaderCaller::kUncategorized));
-  scan_options = {ScanOptions(ExtractUserKey(kv[10 * kEntriesPerBlock].first),
-                              ExtractUserKey(kv[20 * kEntriesPerBlock].first)),
-                  ScanOptions(ExtractUserKey(kv[30 * kEntriesPerBlock].first),
-                              ExtractUserKey(kv[40 * kEntriesPerBlock].first))};
+  scan_options = MultiScanArgs(BytewiseComparator());
+  scan_options.insert(ExtractUserKey(kv[10 * kEntriesPerBlock].first),
+                      ExtractUserKey(kv[20 * kEntriesPerBlock].first));
+  scan_options.insert(ExtractUserKey(kv[30 * kEntriesPerBlock].first),
+                      ExtractUserKey(kv[40 * kEntriesPerBlock].first));
   iter->Prepare(&scan_options);
   // Match start key
   iter->Seek(kv[10 * kEntriesPerBlock].first);
@@ -1134,8 +1138,9 @@ TEST_P(BlockBasedTableReaderTest, MultiScanPrepare) {
   iter.reset(table->NewIterator(
       read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
       /*skip_filters=*/false, TableReaderCaller::kUncategorized));
-  scan_options = {ScanOptions(ExtractUserKey(kv[10 * kEntriesPerBlock].first)),
-                  ScanOptions(ExtractUserKey(kv[11 * kEntriesPerBlock].first))};
+  scan_options = MultiScanArgs(BytewiseComparator());
+  scan_options.insert(ExtractUserKey(kv[10 * kEntriesPerBlock].first));
+  scan_options.insert(ExtractUserKey(kv[11 * kEntriesPerBlock].first));
   iter->Prepare(&scan_options);
   // Does not match the first ScanOptions.
   iter->SeekToFirst();
diff --git a/table/block_based/user_defined_index_wrapper.h b/table/block_based/user_defined_index_wrapper.h
index 2cb42a0765fb..3d900379ca1f 100644
--- a/table/block_based/user_defined_index_wrapper.h
+++ b/table/block_based/user_defined_index_wrapper.h
@@ -181,8 +181,11 @@ class UserDefinedIndexIteratorWrapper
 
   Status status() const override { return status_; }
 
-  void Prepare(const std::vector<ScanOptions>* scan_opts) override {
-    udi_iter_->Prepare(scan_opts->data(), scan_opts->size());
+  void Prepare(const MultiScanArgs* scan_opts) override {
+    if (scan_opts) {
+      udi_iter_->Prepare(scan_opts->GetScanRanges().data(),
+                         scan_opts->GetScanRanges().size());
+    }
   }
 
  private:
diff --git a/table/external_table.cc b/table/external_table.cc
index 8835d7e013a3..ecc08135bf30 100644
--- a/table/external_table.cc
+++ b/table/external_table.cc
@@ -131,9 +131,11 @@ class ExternalTableIteratorAdapter : public InternalIterator {
 
   Status status() const override { return status_; }
 
-  void Prepare(const std::vector<ScanOptions>* scan_opts) override {
-    if (iterator_) {
-      iterator_->Prepare(scan_opts->data(), scan_opts->size());
+  void Prepare(const MultiScanArgs* scan_opts) override {
+    if (iterator_ && scan_opts) {
+      iterator_->Prepare(scan_opts->GetScanRanges().data(), scan_opts->size());
+    } else if (iterator_) {
+      iterator_->Prepare(nullptr, 0);
     }
   }
 
diff --git a/table/internal_iterator.h b/table/internal_iterator.h
index f6b6998b1d53..b385ef55a2c0 100644
--- a/table/internal_iterator.h
+++ b/table/internal_iterator.h
@@ -200,7 +200,7 @@ class InternalIteratorBase : public Cleanable {
   // used by MergingIterator and LevelIterator for now.
   virtual bool IsDeleteRangeSentinelKey() const { return false; }
 
-  virtual void Prepare(const std::vector<ScanOptions>* /*scan_opts*/) {}
+  virtual void Prepare(const MultiScanArgs* /*scan_opts*/) {}
 
  protected:
   void SeekForPrevImpl(const Slice& target, const CompareInterface* cmp) {
diff --git a/table/iterator_wrapper.h b/table/iterator_wrapper.h
index 398ec9e3d0fe..b585aaa4a7e0 100644
--- a/table/iterator_wrapper.h
+++ b/table/iterator_wrapper.h
@@ -197,7 +197,7 @@ class IteratorWrapperBase {
 
   // scan_opts lifetime is guaranteed until the iterator is destructed, or
   // Prepare() is called with a new scan_opts
-  void Prepare(const std::vector<ScanOptions>* scan_opts) {
+  void Prepare(const MultiScanArgs* scan_opts) {
     if (iter_) {
       iter_->Prepare(scan_opts);
     }
diff --git a/table/merging_iterator.cc b/table/merging_iterator.cc
index 0a47ec130f3f..e27f4c6fa270 100644
--- a/table/merging_iterator.cc
+++ b/table/merging_iterator.cc
@@ -482,7 +482,7 @@ class MergingIterator : public InternalIterator {
            current_->IsValuePinned();
   }
 
-  void Prepare(const std::vector<ScanOptions>* scan_opts) override {
+  void Prepare(const MultiScanArgs* scan_opts) override {
     for (auto& child : children_) {
       child.iter.Prepare(scan_opts);
     }
diff --git a/table/table_test.cc b/table/table_test.cc
index bb356a90869e..8e96d5036519 100644
--- a/table/table_test.cc
+++ b/table/table_test.cc
@@ -7198,9 +7198,9 @@ TEST_F(ExternalTableTest, DBMultiScanTest) {
 
   std::vector<std::string> key_ranges({"k03", "k10", "k25", "k50"});
   ReadOptions ro;
-  std::vector<ScanOptions> scan_options(
-      {ScanOptions(key_ranges[0], key_ranges[1]),
-       ScanOptions(key_ranges[2], key_ranges[3])});
+  MultiScanArgs scan_options(BytewiseComparator());
+  scan_options.insert(key_ranges[0], key_ranges[1]);
+  scan_options.insert(key_ranges[2], key_ranges[3]);
   std::unique_ptr<MultiScan> iter = db->NewMultiScan(ro, cfh, scan_options);
   try {
     int idx = 0;
@@ -7227,7 +7227,10 @@ TEST_F(ExternalTableTest, DBMultiScanTest) {
 
   // Test the overlapping scan case
   key_ranges[1] = "k30";
-  scan_options[0] = ScanOptions(key_ranges[0], key_ranges[1]);
+  scan_options = MultiScanArgs(BytewiseComparator());
+  scan_options.insert(key_ranges[0], key_ranges[1]);
+  scan_options.insert(key_ranges[2], key_ranges[3]);
+
   iter = db->NewMultiScan(ro, cfh, scan_options);
   try {
     int idx = 0;
@@ -7253,8 +7256,9 @@ TEST_F(ExternalTableTest, DBMultiScanTest) {
   iter.reset();
 
   // Test the no limit scan case
-  scan_options[0] = ScanOptions(key_ranges[0]);
-  scan_options[1] = ScanOptions(key_ranges[2]);
+  scan_options = MultiScanArgs(BytewiseComparator());
+  scan_options.insert(key_ranges[0]);
+  scan_options.insert(key_ranges[2]);
   iter = db->NewMultiScan(ro, cfh, scan_options);
   try {
     int idx = 0;
@@ -7811,14 +7815,16 @@ TEST_F(UserDefinedIndexTest, BasicTest) {
   ro.iterate_upper_bound = nullptr;
   iter.reset(reader->NewIterator(ro));
   ASSERT_NE(iter, nullptr);
-  std::vector<ScanOptions> scan_opts({ScanOptions("key20")});
-  ;
-  scan_opts[0].property_bag.emplace().emplace("count", std::to_string(25));
+  MultiScanArgs scan_opts(BytewiseComparator());
+
+  std::unordered_map<std::string, std::string> property_bag;
+  property_bag["count"] = std::to_string(25);
+  scan_opts.insert("key20", property_bag);
   iter->Prepare(scan_opts);
   // Test that we can read all the keys
   key_count = 0;
-  for (iter->Seek(scan_opts[0].range.start.value()); iter->Valid();
-       iter->Next()) {
+  for (iter->Seek(scan_opts.GetScanRanges()[0].range.start.value());
+       iter->Valid(); iter->Next()) {
     key_count++;
   }
   ASSERT_GE(key_count, 25);
@@ -7970,14 +7976,15 @@ TEST_F(UserDefinedIndexTest, IngestTest) {
   ro.iterate_upper_bound = nullptr;
   iter.reset(db->NewIterator(ro, cfh));
   ASSERT_NE(iter, nullptr);
-  std::vector<ScanOptions> scan_opts({ScanOptions("key20")});
-  ;
-  scan_opts[0].property_bag.emplace().emplace("count", std::to_string(25));
+  MultiScanArgs scan_opts;
+  std::unordered_map<std::string, std::string> property_bag;
+  property_bag["count"] = std::to_string(25);
+  scan_opts.insert(Slice("key20"), std::optional(property_bag));
   iter->Prepare(scan_opts);
   // Test that we can read all the keys
   key_count = 0;
-  for (iter->Seek(scan_opts[0].range.start.value()); iter->Valid();
-       iter->Next()) {
+  for (iter->Seek(scan_opts.GetScanRanges()[0].range.start.value());
+       iter->Valid(); iter->Next()) {
     key_count++;
   }
   ASSERT_GE(key_count, 25);
diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc
index 5995493a683f..7b59b7fc487a 100644
--- a/tools/db_bench_tool.cc
+++ b/tools/db_bench_tool.cc
@@ -6412,7 +6412,7 @@ class Benchmark {
     Duration duration(FLAGS_duration, reads_);
     while (!duration.Done(1)) {
       DB* db = SelectDB(thread);
-      std::vector<ScanOptions> opts;
+      MultiScanArgs opts;
       std::vector<std::unique_ptr<const char[]>> guards;
       opts.reserve(multiscan_size);
       // We create 1 random start, and then multiscan will start from that
@@ -6433,7 +6433,7 @@ class Benchmark {
         uint64_t end_key = start_key + scan_size;
         GenerateKeyFromInt(end_key, FLAGS_num, &ekey);
 
-        opts.emplace_back(skey, ekey);
+        opts.insert(skey, ekey);
         start_key += scan_size + FLAGS_multiscan_stride;
       }
 

From 13f054febb26100184eeefaac11877d735d45ac2 Mon Sep 17 00:00:00 2001
From: Changyu Bi <changyubi@meta.com>
Date: Fri, 8 Aug 2025 11:04:14 -0700
Subject: [PATCH 227/500] Support DbStressCustomCompressionManager in ldb and
 sst_dump (#13827)

Summary:
while debugging stress test failure, I noticed that sst_dump and ldb do not work if custom db_stress compression manager is used. This PR adds support for it.

```
 ./sst_dump --command=raw --show_properties --file=/tmp/rocksdb_crashtest_whitebox4ny5mass/000589.sst
options.env is 0x7f2b1f4b9000
Process /tmp/rocksdb_crashtest_whitebox4ny5mass/000589.sst
Sst file format: block-based
/tmp/rocksdb_crashtest_whitebox4ny5mass/000589.sst: Not implemented: Could not load CompressionManager: DbStressCustom1
/tmp/rocksdb_crashtest_whitebox4ny5mass/000589.sst is not a valid SST file

./ldb idump --db=/tmp/rocksdb_crashtest_whiteboxy_emah11 --ignore_unknown_options  --hex >> /tmp/i_dump
Failed: Not implemented: Could not load CompressionManager: DbStressCustom1
```

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13827

Test Plan: manually tested that ldb and sst_dump work with DbStressCustomCompressionManager after this PR

Reviewed By: pdillinger

Differential Revision: D79461175

Pulled By: cbi42

fbshipit-source-id: c8c092b10b4fde3a295b00751057749e8f0cf095
---
 BUCK                                          |  2 ++
 CMakeLists.txt                                |  1 +
 Makefile                                      |  3 ++
 db_stress_tool/CMakeLists.txt                 |  1 +
 .../db_stress_compression_manager.cc          | 28 +++++++++++++++++++
 .../db_stress_compression_manager.h           |  2 ++
 db_stress_tool/db_stress_test_base.cc         | 19 +------------
 src.mk                                        |  2 ++
 tools/ldb_cmd.cc                              |  2 ++
 tools/sst_dump_tool.cc                        |  2 ++
 10 files changed, 44 insertions(+), 18 deletions(-)
 create mode 100644 db_stress_tool/db_stress_compression_manager.cc

diff --git a/BUCK b/BUCK
index efdc0083fde5..7ba29bb54751 100644
--- a/BUCK
+++ b/BUCK
@@ -114,6 +114,7 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[
         "db/write_controller.cc",
         "db/write_stall_stats.cc",
         "db/write_thread.cc",
+        "db_stress_tool/db_stress_compression_manager.cc",
         "env/composite_env.cc",
         "env/env.cc",
         "env/env_chroot.cc",
@@ -422,6 +423,7 @@ rocks_cpp_library_wrapper(name="rocksdb_stress_lib", srcs=[
         "db_stress_tool/batched_ops_stress.cc",
         "db_stress_tool/cf_consistency_stress.cc",
         "db_stress_tool/db_stress_common.cc",
+        "db_stress_tool/db_stress_compression_manager.cc",
         "db_stress_tool/db_stress_driver.cc",
         "db_stress_tool/db_stress_filters.cc",
         "db_stress_tool/db_stress_gflags.cc",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 08abd4daf4ae..ef93aa20d6dd 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -747,6 +747,7 @@ set(SOURCES
         db/write_controller.cc
         db/write_stall_stats.cc
         db/write_thread.cc
+        db_stress_tool/db_stress_compression_manager.cc
         env/composite_env.cc
         env/env.cc
         env/env_chroot.cc
diff --git a/Makefile b/Makefile
index 4f0a7b5ff70f..a766426b05ef 100644
--- a/Makefile
+++ b/Makefile
@@ -1357,6 +1357,9 @@ filter_bench: $(OBJ_DIR)/util/filter_bench.o $(LIBRARY)
 db_stress: $(OBJ_DIR)/db_stress_tool/db_stress.o $(STRESS_LIBRARY) $(TOOLS_LIBRARY) $(LIBRARY)
 	$(AM_LINK)
 
+db_stress_compression_manager: $(OBJ_DIR)/db_stress_tool/db_stress_compression_manager.o $(LIBRARY)
+	$(AM_LINK)
+
 write_stress: $(OBJ_DIR)/tools/write_stress.o $(LIBRARY)
 	$(AM_LINK)
 
diff --git a/db_stress_tool/CMakeLists.txt b/db_stress_tool/CMakeLists.txt
index be34778ddd44..49e76ab51532 100644
--- a/db_stress_tool/CMakeLists.txt
+++ b/db_stress_tool/CMakeLists.txt
@@ -2,6 +2,7 @@ add_executable(db_stress${ARTIFACT_SUFFIX}
   batched_ops_stress.cc
   cf_consistency_stress.cc
   db_stress.cc
+  db_stress_compression_manager.cc
   db_stress_common.cc
   db_stress_driver.cc
   db_stress_filters.cc
diff --git a/db_stress_tool/db_stress_compression_manager.cc b/db_stress_tool/db_stress_compression_manager.cc
new file mode 100644
index 000000000000..9746c490333f
--- /dev/null
+++ b/db_stress_tool/db_stress_compression_manager.cc
@@ -0,0 +1,28 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+#include "db_stress_compression_manager.h"
+
+#include "rocksdb/utilities/object_registry.h"
+
+namespace ROCKSDB_NAMESPACE {
+void DbStressCustomCompressionManager::Register() {
+  // We must register any compression managers with a custom
+  // CompatibilityName() so that if it was used in a past invocation but not
+  // the current invocation, we can still read the SST files requiring it.
+  static std::once_flag loaded;
+  std::call_once(loaded, [&]() {
+    TEST_AllowUnsupportedFormatVersion() = true;
+    auto& library = *ObjectLibrary::Default();
+    library.AddFactory<CompressionManager>(
+        DbStressCustomCompressionManager().CompatibilityName(),
+        [](const std::string& /*uri*/,
+           std::unique_ptr<CompressionManager>* guard,
+           std::string* /*errmsg*/) {
+          *guard = std::make_unique<DbStressCustomCompressionManager>();
+          return guard->get();
+        });
+  });
+}
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/db_stress_tool/db_stress_compression_manager.h b/db_stress_tool/db_stress_compression_manager.h
index f1ac5aa1275e..8438a6583c7d 100644
--- a/db_stress_tool/db_stress_compression_manager.h
+++ b/db_stress_tool/db_stress_compression_manager.h
@@ -57,6 +57,8 @@ class DbStressCustomCompressionManager : public CompressionManager {
     return decomp;
   }
 
+  static void Register();
+
  protected:
   std::shared_ptr<CompressionManager> default_ =
       GetBuiltinV2CompressionManager();
diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc
index ab3e296e4cbf..879888ae6d08 100644
--- a/db_stress_tool/db_stress_test_base.cc
+++ b/db_stress_tool/db_stress_test_base.cc
@@ -3590,24 +3590,7 @@ void StressTest::Open(SharedState* shared, bool reopen) {
     InitializeOptionsFromFlags(cache_, filter_policy_, options_);
   }
   InitializeOptionsGeneral(cache_, filter_policy_, sqfc_factory_, options_);
-  {
-    // We must register any compression managers with a custom
-    // CompatibilityName() so that if it was used in a past invocation but not
-    // the current invocation, we can still read the SST files requiring it.
-    static std::once_flag loaded;
-    std::call_once(loaded, [&]() {
-      TEST_AllowUnsupportedFormatVersion() = true;
-      auto& library = *ObjectLibrary::Default();
-      library.AddFactory<CompressionManager>(
-          DbStressCustomCompressionManager().CompatibilityName(),
-          [](const std::string& /*uri*/,
-             std::unique_ptr<CompressionManager>* guard,
-             std::string* /*errmsg*/) {
-            *guard = std::make_unique<DbStressCustomCompressionManager>();
-            return guard->get();
-          });
-    });
-  }
+  DbStressCustomCompressionManager::Register();
 
   if (!strcasecmp(FLAGS_compression_manager.c_str(), "custom")) {
     options_.compression_manager =
diff --git a/src.mk b/src.mk
index 182edc695cfc..01f754416ed2 100644
--- a/src.mk
+++ b/src.mk
@@ -367,6 +367,7 @@ RANGE_TREE_SOURCES =\
   utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.cc
 
 TOOL_LIB_SOURCES =                                              \
+  db_stress_tool/db_stress_compression_manager.cc               \
   tools/io_tracer_parser_tool.cc                                \
   tools/ldb_cmd.cc                                              \
   tools/ldb_tool.cc                                             \
@@ -392,6 +393,7 @@ STRESS_LIB_SOURCES =                                           \
   db_stress_tool/batched_ops_stress.cc                         \
   db_stress_tool/cf_consistency_stress.cc                      \
   db_stress_tool/db_stress_common.cc                           \
+  db_stress_tool/db_stress_compression_manager.cc              \
   db_stress_tool/db_stress_driver.cc                           \
   db_stress_tool/db_stress_filters.cc                          \
   db_stress_tool/db_stress_gflags.cc                           \
diff --git a/tools/ldb_cmd.cc b/tools/ldb_cmd.cc
index 565c24540901..50051198d770 100644
--- a/tools/ldb_cmd.cc
+++ b/tools/ldb_cmd.cc
@@ -25,6 +25,7 @@
 #include "db/wide/wide_column_serialization.h"
 #include "db/wide/wide_columns_helper.h"
 #include "db/write_batch_internal.h"
+#include "db_stress_tool/db_stress_compression_manager.h"
 #include "file/filename.h"
 #include "rocksdb/cache.h"
 #include "rocksdb/comparator.h"
@@ -1149,6 +1150,7 @@ void LDBCommand::OverrideBaseCFOptions(ColumnFamilyOptions* cf_opts) {
 // Second, overrides the options according to the CLI arguments and the
 // specific subcommand being run.
 void LDBCommand::PrepareOptions() {
+  DbStressCustomCompressionManager::Register();
   std::vector<ColumnFamilyDescriptor> column_families_from_options;
 
   if (!create_if_missing_ && try_load_options_) {
diff --git a/tools/sst_dump_tool.cc b/tools/sst_dump_tool.cc
index 2710809bb46b..c650974af806 100644
--- a/tools/sst_dump_tool.cc
+++ b/tools/sst_dump_tool.cc
@@ -9,6 +9,7 @@
 #include <cinttypes>
 #include <iostream>
 
+#include "db_stress_tool/db_stress_compression_manager.h"
 #include "options/options_helper.h"
 #include "port/port.h"
 #include "rocksdb/convenience.h"
@@ -193,6 +194,7 @@ int SSTDumpTool::Run(int argc, char const* const* argv, Options options) {
   int64_t tmp_val;
 
   TEST_AllowUnsupportedFormatVersion() = true;
+  DbStressCustomCompressionManager::Register();
 
   for (int i = 1; i < argc; i++) {
     if (strncmp(argv[i], "--env_uri=", 10) == 0) {

From d8835f918c8c2e0eaee5b69e35bbb22ee2925ee1 Mon Sep 17 00:00:00 2001
From: Hui Xiao <huixiao@fb.com>
Date: Mon, 11 Aug 2025 13:13:21 -0700
Subject: [PATCH 228/500] Enable track_and_verify_wal in stress test (#13853)

Summary:
**Context/Summary:**
https://github.com/facebook/rocksdb/pull/13508 accidentally didn't enable track_and_verify_wal back and this PR will enable it.

**Test**
[ongoing] Rehearsal stress test

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13853

Reviewed By: pdillinger

Differential Revision: D79909991

Pulled By: hx235

fbshipit-source-id: aea91c98e43f26dec9a8988c837a6ed821979a3c
---
 tools/db_crashtest.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index 786ae2d346de..bf59cc471d20 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -343,7 +343,8 @@
     "use_timed_put_one_in": lambda: random.choice([0] * 7 + [1, 5, 10]),
     "universal_max_read_amp": lambda: random.choice([-1] * 3 + [0, 4, 10]),
     "paranoid_memory_checks": lambda: random.choice([0] * 7 + [1]),
-    "allow_unprepared_value": lambda: random.choice([0, 1]),    
+    "allow_unprepared_value": lambda: random.choice([0, 1]),
+    "track_and_verify_wals": lambda: random.choice([0, 1]),
     # TODO(jaykorean): Re-enable remote compaction once all incompatible features are addressed in stress test
     "remote_compaction_worker_threads": lambda: 0,
     "auto_refresh_iterator_with_snapshot": lambda: random.choice([0, 1]),

From 496eebaee891c5f132d1ec1d9293a9c7f59b1e46 Mon Sep 17 00:00:00 2001
From: Changyu Bi <changyubi@meta.com>
Date: Mon, 11 Aug 2025 15:15:26 -0700
Subject: [PATCH 229/500] Fix compilation error using CLANG (#13864)

Summary:
fix the following error showing up in continuous tests:
```
Makefile:186: Warning: Compiling in debug mode. Don't use the resulting binary in production
port/mmap.cc:46:15: error: first argument in call to 'memcpy' is a pointer to non-trivially copyable type 'rocksdb::MemMapping' [-Werror,-Wnontrivial-memcall]
   46 |   std::memcpy(this, &other, sizeof(*this));
      |               ^
port/mmap.cc:46:15: note: explicitly cast the pointer to silence this warning
   46 |   std::memcpy(this, &other, sizeof(*this));
      |               ^
      |               (void*)
1 error generated.
make: *** [Makefile:2580: port/mmap.o] Error 1
make: *** Waiting for unfinished jobs....
```

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13864

Test Plan: `make USE_CLANG=1 j=150 check` with https://github.com/facebook/rocksdb/blob/13f054febb26100184eeefaac11877d735d45ac2/build_tools/build_detect_platform#L61-L70 commented out.

Reviewed By: mszeszko-meta

Differential Revision: D80033441

Pulled By: cbi42

fbshipit-source-id: b2330eea71fe28243236b75128ec6f3f1e971873
---
 port/mmap.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/port/mmap.cc b/port/mmap.cc
index 36e8f32617fb..36977f17b9f4 100644
--- a/port/mmap.cc
+++ b/port/mmap.cc
@@ -43,7 +43,7 @@ MemMapping& MemMapping::operator=(MemMapping&& other) noexcept {
     return *this;
   }
   this->~MemMapping();
-  std::memcpy(this, &other, sizeof(*this));
+  std::memcpy(static_cast<void*>(this), &other, sizeof(*this));
   new (&other) MemMapping();
   return *this;
 }

From 99bbc2d7fa98c001202cfd2465196b35cb154cef Mon Sep 17 00:00:00 2001
From: Karthik Krishnamurthy <kark@meta.com>
Date: Tue, 12 Aug 2025 08:41:55 -0700
Subject: [PATCH 230/500] Fix bug in the generation of index and meta blocks
 when constructing UDI (#13846)

Summary:
Pull Request resolved: https://github.com/facebook/rocksdb/pull/13846

This diff addresses few issues that was identified during testing of the user defined index.

1. During the finishing of the index blocks, we run into an infinite loop because the user defined index wrapper returns
early on incomplete status. This happens because the wrapper blindly returns the status if it not OK. But, the status
could legitimately be `Incomplete()` for some indices like Partitioned Index (serving as the internal index for the UDI
wrapper). Fix is to exclude `Incomplete()` check from the status check early in the UDI wrapper's finish.

2. Once we fixed (1), we noticed that the meta blocks for the UDI-based index writer were not written out to the final
SST file. This is because the UDI's meta blocks are created after the internal index's meta blocks and the block-based
index builder didn't account for this. The fix is to finish the UDI wrapper first which will create the necessary meta blocks
and then finish the internal index. If the internal index is incomplete, the block-based index builder should still continue
to write out the meta blocks.

3. OnKeyAdded when delegating to the user-defined index should only pass the user key. The UDI builder doesn't
understand RocksDB's internal key format and while that poses interesting challenges when the UDI is used for non
last level SST files, our plan is to restrict the usage of the UDI to last level files only (for now).

Reviewed By: pdillinger

Differential Revision: D79781453

fbshipit-source-id: 2239c8fc016da55df5c24be6aacc8f6357cab029
---
 .../block_based/block_based_table_builder.cc  | 15 ++++----
 .../block_based/user_defined_index_wrapper.h  | 35 +++++++++++--------
 table/table_test.cc                           | 21 +++++++++--
 3 files changed, 46 insertions(+), 25 deletions(-)

diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc
index bfe645b883aa..b4898709b9f5 100644
--- a/table/block_based/block_based_table_builder.cc
+++ b/table/block_based/block_based_table_builder.cc
@@ -1062,7 +1062,7 @@ struct BlockBasedTableBuilder::Rep {
       char* ptr = compression_name.data() + pos;
       // Populate the field contents
       for (CompressionType t : compression_types_used) {
-        PutBaseChars<16>(&ptr, /*digits=*/2, static_cast<unsigned char>(t),
+        PutBaseChars<16>(&ptr, /*n=*/2, static_cast<unsigned char>(t),
                          /*uppercase=*/true);
       }
       assert(ptr == compression_name.data() + pos + ctype_count * 2);
@@ -1827,14 +1827,15 @@ void BlockBasedTableBuilder::WriteIndexBlock(
   }
   IndexBuilder::IndexBlocks index_blocks;
   auto index_builder_status = rep_->index_builder->Finish(&index_blocks);
-  if (index_builder_status.IsIncomplete()) {
-    // We we have more than one index partition then meta_blocks are not
-    // supported for the index. Currently meta_blocks are used only by
-    // HashIndexBuilder which is not multi-partition.
-    assert(index_blocks.meta_blocks.empty());
-  } else if (ok() && !index_builder_status.ok()) {
+  if (ok() && !index_builder_status.ok() &&
+      !index_builder_status.IsIncomplete()) {
+    // If the index builder failed for non-Incomplete errors, we should
+    // mark the entire builder as having failed wit that status. However,
+    // If the index builder failed with an incomplete error, we should
+    // continue writing out any meta blocks that may have been generated.
     rep_->SetStatus(index_builder_status);
   }
+
   if (ok()) {
     for (const auto& item : index_blocks.meta_blocks) {
       BlockHandle block_handle;
diff --git a/table/block_based/user_defined_index_wrapper.h b/table/block_based/user_defined_index_wrapper.h
index 3d900379ca1f..ba92f78aec3b 100644
--- a/table/block_based/user_defined_index_wrapper.h
+++ b/table/block_based/user_defined_index_wrapper.h
@@ -56,13 +56,13 @@ class UserDefinedIndexBuilderWrapper : public IndexBuilder {
 
   void OnKeyAdded(const Slice& key,
                   const std::optional<Slice>& value) override {
+    ParsedInternalKey pkey;
     if (status_.ok()) {
       if (!value.has_value()) {
         status_ = Status::InvalidArgument(
             "user_defined_index_factory not supported with parallel "
             "compression");
       } else {
-        ParsedInternalKey pkey;
         status_ = ParseInternalKey(key, &pkey, /*lof_err_key*/ false);
         if (status_.ok() && pkey.type != ValueType::kTypeValue) {
           status_ = Status::InvalidArgument(
@@ -77,15 +77,31 @@ class UserDefinedIndexBuilderWrapper : public IndexBuilder {
     // Forward the call to both index builders
     internal_index_builder_->OnKeyAdded(key, value);
     user_defined_index_builder_->OnKeyAdded(
-        key, UserDefinedIndexBuilder::ValueType::kValue, value.value());
+        pkey.user_key, UserDefinedIndexBuilder::ValueType::kValue,
+        value.value());
   }
 
   Status Finish(IndexBlocks* index_blocks,
                 const BlockHandle& last_partition_block_handle) override {
-    if (!status_.ok()) {
+    if (!status_.ok() && !status_.IsIncomplete()) {
       return status_;
     }
 
+    if (!udi_finished_) {
+      // Finish the user defined index builder
+      Slice user_index_contents;
+      status_ = user_defined_index_builder_->Finish(&user_index_contents);
+      if (!status_.ok()) {
+        return status_;
+      }
+
+      // Add the user defined index to the meta blocks
+      std::string block_name = kUserDefinedIndexPrefix + name_;
+      index_blocks->meta_blocks.insert(
+          {block_name, {BlockType::kUserDefinedIndex, user_index_contents}});
+      udi_finished_ = true;
+    }
+
     // Finish the internal index builder
     status_ = internal_index_builder_->Finish(index_blocks,
                                               last_partition_block_handle);
@@ -93,18 +109,6 @@ class UserDefinedIndexBuilderWrapper : public IndexBuilder {
       return status_;
     }
 
-    // Finish the user defined index builder
-    Slice user_index_contents;
-    status_ = user_defined_index_builder_->Finish(&user_index_contents);
-    if (!status_.ok()) {
-      return status_;
-    }
-
-    // Add the user defined index to the meta blocks
-    std::string block_name = kUserDefinedIndexPrefix + name_;
-    index_blocks->meta_blocks.insert(
-        {block_name, {BlockType::kUserDefinedIndex, user_index_contents}});
-
     index_size_ = internal_index_builder_->IndexSize();
     return status_;
   }
@@ -120,6 +124,7 @@ class UserDefinedIndexBuilderWrapper : public IndexBuilder {
   std::unique_ptr<IndexBuilder> internal_index_builder_;
   std::unique_ptr<UserDefinedIndexBuilder> user_defined_index_builder_;
   Status status_;
+  bool udi_finished_ = false;
 };
 
 class UserDefinedIndexIteratorWrapper
diff --git a/table/table_test.cc b/table/table_test.cc
index 8e96d5036519..9d2ca33f522d 100644
--- a/table/table_test.cc
+++ b/table/table_test.cc
@@ -2059,7 +2059,7 @@ TEST_P(BlockBasedTableTest, PrefetchTest) {
 
   // Simple
   PrefetchRange(&c, &opt, &table_options,
-                /*key_range=*/"k01", "k05",
+                /*key_begin=*/"k01", /*key_end=*/"k05",
                 /*keys_in_cache=*/{"k01", "k02", "k03", "k04", "k05"},
                 /*keys_not_in_cache=*/{"k06", "k07"});
   PrefetchRange(&c, &opt, &table_options, "k01", "k01", {"k01", "k02", "k03"},
@@ -7679,9 +7679,12 @@ class UserDefinedIndexTest : public BlockBasedTableTestBase {
           index_data_;
     };
   };
+
+ protected:
+  void BasicTest(bool use_partitioned_index);
 };
 
-TEST_F(UserDefinedIndexTest, BasicTest) {
+void UserDefinedIndexTest::BasicTest(bool use_partitioned_index) {
   Options options;
   BlockBasedTableOptions table_options;
   std::string dbname = test::PerThreadDBPath("user_defined_index_test");
@@ -7691,7 +7694,11 @@ TEST_F(UserDefinedIndexTest, BasicTest) {
   auto user_defined_index_factory =
       std::make_shared<TestUserDefinedIndexFactory>();
   table_options.user_defined_index_factory = user_defined_index_factory;
-
+  if (use_partitioned_index) {
+    table_options.partition_filters = true;
+    table_options.decouple_partitioned_filters = true;
+    table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch;
+  }
   // Set up custom flush block policy that flushes every 3 keys
   table_options.flush_block_policy_factory =
       std::make_shared<CustomFlushBlockPolicyFactory>();
@@ -7833,6 +7840,14 @@ TEST_F(UserDefinedIndexTest, BasicTest) {
   ASSERT_OK(iter->status());
 }
 
+TEST_F(UserDefinedIndexTest, BasicTestWithPartitionedIndex) {
+  BasicTest(/*use_partitioned_index=*/true);
+}
+
+TEST_F(UserDefinedIndexTest, BasicTestWithoutPartitionedIndex) {
+  BasicTest(/*use_partitioned_index=*/false);
+}
+
 TEST_F(UserDefinedIndexTest, InvalidArgumentTest1) {
   Options options;
   BlockBasedTableOptions table_options;

From e12734d51f819160644020c69422495b2e171bc7 Mon Sep 17 00:00:00 2001
From: Hui Xiao <huixiao@fb.com>
Date: Tue, 12 Aug 2025 11:57:29 -0700
Subject: [PATCH 231/500] Disable track_and_verify_wals temporarily (#13869)

Summary:
... as we see some issues that rehearsal stress test didn't surface.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13869

Reviewed By: cbi42

Differential Revision: D80103341

Pulled By: hx235

fbshipit-source-id: 8b2c1d76d4c3099727ba3a69de44de67afd64369
---
 tools/db_crashtest.py | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index bf59cc471d20..e0b7e4a16cbf 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -344,7 +344,8 @@
     "universal_max_read_amp": lambda: random.choice([-1] * 3 + [0, 4, 10]),
     "paranoid_memory_checks": lambda: random.choice([0] * 7 + [1]),
     "allow_unprepared_value": lambda: random.choice([0, 1]),
-    "track_and_verify_wals": lambda: random.choice([0, 1]),
+    # TODO(hx235): enable `track_and_verify_wals` after stabalizing the stress test
+    "track_and_verify_wals": lambda: random.choice([0]),
     # TODO(jaykorean): Re-enable remote compaction once all incompatible features are addressed in stress test
     "remote_compaction_worker_threads": lambda: 0,
     "auto_refresh_iterator_with_snapshot": lambda: random.choice([0, 1]),
@@ -814,13 +815,13 @@ def finalize_and_sanitize(src_params):
         else:
             dest_params["unordered_write"] = 0
     if dest_params.get("remote_compaction_worker_threads", 0) > 0:
-       # TODO Fix races when both Remote Compaction + BlobDB enabled
-       dest_params["enable_blob_files"] = 0
-       # TODO Fix - Remote worker shouldn't recover from WAL
-       dest_params["disable_wal"] = 1
-       # Disable Incompatible Ones
-       dest_params["checkpoint_one_in"] = 0       
-       dest_params["use_timed_put_one_in"] = 0
+        # TODO Fix races when both Remote Compaction + BlobDB enabled
+        dest_params["enable_blob_files"] = 0
+        # TODO Fix - Remote worker shouldn't recover from WAL
+        dest_params["disable_wal"] = 1
+        # Disable Incompatible Ones
+        dest_params["checkpoint_one_in"] = 0
+        dest_params["use_timed_put_one_in"] = 0
     if dest_params.get("disable_wal", 0) == 1:
         dest_params["atomic_flush"] = 1
         dest_params["sync"] = 0
@@ -1023,7 +1024,7 @@ def finalize_and_sanitize(src_params):
             dest_params["exclude_wal_from_write_fault_injection"] = 1
             dest_params["metadata_write_fault_one_in"] = 0
 
-            # TODO Fix - Remote worker shouldn't recover from WAL 
+            # TODO Fix - Remote worker shouldn't recover from WAL
             dest_params["remote_compaction_worker_threads"] = 0
     # Disabling block align if mixed manager is being used
     if dest_params.get("compression_manager") == "custom":
@@ -1100,7 +1101,7 @@ def finalize_and_sanitize(src_params):
         dest_params["ingest_wbwi_one_in"] = 0
     # Continuous verification fails with secondaries inside NonBatchedOpsStressTest
     if dest_params.get("test_secondary") == 1:
-        dest_params["continuous_verification_interval"] = 0   
+        dest_params["continuous_verification_interval"] = 0
     return dest_params
 
 

From 8f0ab1598effd4b05f6f88310c7bd9aaf5d418c6 Mon Sep 17 00:00:00 2001
From: anand76 <anand1976@users.noreply.github.com>
Date: Tue, 12 Aug 2025 14:00:40 -0700
Subject: [PATCH 232/500] Make UDI interface consistently use the user key
 (#13865)

Summary:
The original intention of the User Defined Index interface was to use the user key. However, the implementation mixed user and internal key usage. This PR makes it consistent. It also clarifies the UDI contract.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13865

Test Plan: Update tests in table_test.cc

Reviewed By: pdillinger

Differential Revision: D80050344

Pulled By: anand1976

fbshipit-source-id: ace47737d21684ec19709640a09e198cee2d98bd
---
 include/rocksdb/user_defined_index.h          | 20 ++++++
 .../block_based/block_based_table_builder.cc  |  6 ++
 .../block_based/user_defined_index_wrapper.h  | 62 +++++++++++++++----
 table/table_test.cc                           |  8 ++-
 .../bug_fixes/udi_index_key_format.md         |  1 +
 5 files changed, 84 insertions(+), 13 deletions(-)
 create mode 100644 unreleased_history/bug_fixes/udi_index_key_format.md

diff --git a/include/rocksdb/user_defined_index.h b/include/rocksdb/user_defined_index.h
index 6aabed4d1dea..49a63e2c17ba 100644
--- a/include/rocksdb/user_defined_index.h
+++ b/include/rocksdb/user_defined_index.h
@@ -27,6 +27,11 @@ inline const std::string kUserDefinedIndexPrefix =
 // It allows users to define their own index format and build custom
 // indexes during table building. Currently, only a monolithic index
 // block is supported (no partitioned index).
+//
+// This is currently supported only for a restricted set of use cases. The
+// CF must be ingest only, and only files containing Puts generated by
+// SstFileWriter are supported. The user_comparator used for the CF must
+// be BytewiseComparator.
 
 // The interface for building user-defined index.
 class UserDefinedIndexBuilder {
@@ -51,6 +56,10 @@ class UserDefinedIndexBuilder {
   // The previous index entry key and the new index entry key cover
   // all the keys in the data block associated with the new index entry.
   //
+  // The last_key_in_current_block and first_key_in_next_block will be user
+  // keys, i.e the user key string, and optionally the user timestamp if one
+  // is configured, without a sequence number suffix.
+  //
   // Called before the OnKeyAdded() call for first_key_in_next_block.
   // @last_key_in_current_block: The last key in the current data block
   // @first_key_in_next_block: it will be nullptr if the entry being added is
@@ -72,6 +81,9 @@ class UserDefinedIndexBuilder {
   // override OnKeyAdded() if they need to collect additional information.
   // The type argument indicates whether the value is a full value or partial.
   // At the moment, only full values are supported.
+  //
+  // The key will be a user key. RocksDB guarantees that there will only be
+  // one entry for each key in the file/index.
   virtual void OnKeyAdded(const Slice& /*key*/, ValueType /*type*/,
                           const Slice& /*value*/) {}
 
@@ -100,6 +112,14 @@ class UserDefinedIndexIterator {
   // termination criteria, kInbound if the data block is definitely fully
   // within bounds, or kUnknown if the data block could be partially
   // within bounds.
+  // The UDI implementation needs to be careful about returning kOutOfBound.
+  // If a limit key is specified in ScanOptions, an implementation that
+  // does not store the first key in the block for the corresponding index
+  // entry cannot reliably determine if the block is out of bounds. It must
+  // compare against the previous index key to determine if the current block
+  // is out of bounds w.r.t the limit. Other termination criteria (specified
+  // in property_bag) may cause the scan to terminate earlier, in which case
+  // kOutOfBound can be returned earlier.
   virtual Status SeekAndGetResult(const Slice& target,
                                   IterateResult* result) = 0;
 
diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc
index b4898709b9f5..67c1e167a0aa 100644
--- a/table/block_based/block_based_table_builder.cc
+++ b/table/block_based/block_based_table_builder.cc
@@ -911,6 +911,12 @@ struct BlockBasedTableBuilder::Rep {
         SetStatus(
             Status::InvalidArgument("user_defined_index_factory not supported "
                                     "with parallel compression"));
+      } else if (ioptions.user_comparator != BytewiseComparator()) {
+        // TODO: Pass the user_comparator to the UDI and let it validate. Do
+        // it in a major release.
+        SetStatus(
+            Status::InvalidArgument("user_defined_index_factory only supported "
+                                    "with bytewise comparator"));
       } else {
         std::unique_ptr<UserDefinedIndexBuilder> user_defined_index_builder(
             table_options.user_defined_index_factory->NewBuilder());
diff --git a/table/block_based/user_defined_index_wrapper.h b/table/block_based/user_defined_index_wrapper.h
index ba92f78aec3b..190d02170c96 100644
--- a/table/block_based/user_defined_index_wrapper.h
+++ b/table/block_based/user_defined_index_wrapper.h
@@ -46,9 +46,24 @@ class UserDefinedIndexBuilderWrapper : public IndexBuilder {
     handle.offset = block_handle.offset();
     handle.size = block_handle.size();
     // Forward the call to both index builders
-    user_defined_index_builder_->AddIndexEntry(last_key_in_current_block,
-                                               first_key_in_next_block, handle,
-                                               separator_scratch);
+    ParsedInternalKey pkey_last;
+    ParsedInternalKey pkey_first;
+    // There's no way to return an error here, so we remember the statsu and
+    // return it in Finish()
+    if (status_.ok()) {
+      status_ = ParseInternalKey(last_key_in_current_block, &pkey_last,
+                                 /*lof_err_key*/ false);
+    }
+    if (status_.ok() && first_key_in_next_block) {
+      status_ = ParseInternalKey(*first_key_in_next_block, &pkey_first,
+                                 /*lof_err_key*/ false);
+    }
+    if (status_.ok()) {
+      user_defined_index_builder_->AddIndexEntry(
+          pkey_last.user_key,
+          first_key_in_next_block ? &pkey_first.user_key : nullptr, handle,
+          separator_scratch);
+    }
     return internal_index_builder_->AddIndexEntry(
         last_key_in_current_block, first_key_in_next_block, block_handle,
         separator_scratch);
@@ -76,6 +91,12 @@ class UserDefinedIndexBuilderWrapper : public IndexBuilder {
 
     // Forward the call to both index builders
     internal_index_builder_->OnKeyAdded(key, value);
+
+    // Pass the user key to the UDI. We don't expect multiple entries with
+    // different sequence numbers for the same key in the file. RocksDB may
+    // enforce it in the future by allowing UDIs only for read only
+    // bulkloaded use cases, and only allow ingestion of files with
+    // sequence number 0.
     user_defined_index_builder_->OnKeyAdded(
         pkey.user_key, UserDefinedIndexBuilder::ValueType::kValue,
         value.value());
@@ -149,23 +170,41 @@ class UserDefinedIndexIteratorWrapper
     status_ = ParseInternalKey(target, &pkey, /*log_err_key=*/false);
     if (status_.ok()) {
       status_ = udi_iter_->SeekAndGetResult(pkey.user_key, &result_);
-      valid_ = status_.ok() &&
-               result_.bound_check_result == IterBoundCheck::kInbound;
+      if (status_.ok()) {
+        valid_ = result_.bound_check_result == IterBoundCheck::kInbound;
+        if (valid_) {
+          ikey_.Set(result_.key, 0, ValueType::kTypeValue);
+        }
+      }
+    } else {
+      valid_ = false;
     }
   }
 
   void Next() override {
     status_ = udi_iter_->NextAndGetResult(&result_);
-    valid_ =
-        status_.ok() && result_.bound_check_result == IterBoundCheck::kInbound;
+    if (status_.ok()) {
+      valid_ = result_.bound_check_result == IterBoundCheck::kInbound;
+      if (valid_) {
+        ikey_.Set(result_.key, 0, ValueType::kTypeValue);
+      }
+    } else {
+      valid_ = false;
+    }
   }
 
   bool NextAndGetResult(IterateResult* result) override {
     status_ = udi_iter_->NextAndGetResult(&result_);
-    valid_ =
-        status_.ok() && result_.bound_check_result == IterBoundCheck::kInbound;
     if (status_.ok()) {
-      *result = result_;
+      valid_ = result_.bound_check_result == IterBoundCheck::kInbound;
+      if (valid_) {
+        ikey_.Set(result_.key, 0, ValueType::kTypeValue);
+      }
+      if (status_.ok()) {
+        *result = result_;
+      }
+    } else {
+      valid_ = false;
     }
     return valid_;
   }
@@ -176,7 +215,7 @@ class UserDefinedIndexIteratorWrapper
 
   void Prev() override { status_ = Status::NotSupported("Prev not supported"); }
 
-  Slice key() const override { return result_.key; }
+  Slice key() const override { return Slice(*ikey_.const_rep()); }
 
   IndexValue value() const override {
     auto handle = udi_iter_->value();
@@ -196,6 +235,7 @@ class UserDefinedIndexIteratorWrapper
  private:
   std::unique_ptr<UserDefinedIndexIterator> udi_iter_;
   IterateResult result_;
+  InternalKey ikey_;
   Status status_;
   bool valid_;
 };
diff --git a/table/table_test.cc b/table/table_test.cc
index 9d2ca33f522d..d156966c6fc5 100644
--- a/table/table_test.cc
+++ b/table/table_test.cc
@@ -7479,8 +7479,11 @@ class UserDefinedIndexTest : public BlockBasedTableTestBase {
                           const Slice* first_key_in_next_block,
                           const BlockHandle& block_handle,
                           std::string* separator_scratch) override {
+        EXPECT_EQ(last_key_in_current_block.size(), 5);
+        if (first_key_in_next_block) {
+          EXPECT_EQ(first_key_in_next_block->size(), 5);
+        }
         // Unused parameters
-        (void)first_key_in_next_block;
         (void)separator_scratch;
         entries_added_++;
         // Store the block handle for each key
@@ -7494,8 +7497,9 @@ class UserDefinedIndexTest : public BlockBasedTableTestBase {
         return last_key_in_current_block;
       }
 
-      void OnKeyAdded(const Slice& /*key*/, ValueType /*value*/,
+      void OnKeyAdded(const Slice& key, ValueType /*value*/,
                       const Slice& /*value*/) override {
+        EXPECT_EQ(key.size(), 5);
         // Track keys added to the index
         keys_added_++;
       }
diff --git a/unreleased_history/bug_fixes/udi_index_key_format.md b/unreleased_history/bug_fixes/udi_index_key_format.md
new file mode 100644
index 000000000000..943e9413ed1e
--- /dev/null
+++ b/unreleased_history/bug_fixes/udi_index_key_format.md
@@ -0,0 +1 @@
+Make the User Defined Index interface consistently use the user key format, fixing the previous mixed usage of internal and user key.

From 7e9c96020b79c799b6bb2c48cae37cc0a05d3ea0 Mon Sep 17 00:00:00 2001
From: Hui Xiao <huixiao@fb.com>
Date: Wed, 13 Aug 2025 12:02:12 -0700
Subject: [PATCH 233/500] Improve two error messages on WAL recovery (#13876)

Summary:
**Context/Summary:** ... for better readability

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13876

Test Plan: Existing UT

Reviewed By: mszeszko-meta

Differential Revision: D80185817

Pulled By: hx235

fbshipit-source-id: 534d37dd747369da48fc5903acc66bb9c8f5206d
---
 db/db_impl/db_impl_open.cc | 8 ++++++--
 db/log_reader.cc           | 7 +++++--
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc
index 2dbc2f73a818..a9871d6bb2f5 100644
--- a/db/db_impl/db_impl_open.cc
+++ b/db/db_impl/db_impl_open.cc
@@ -1755,8 +1755,12 @@ Status DBImpl::MaybeHandleStopReplayForCorruptionForInconsistency(
         ROCKS_LOG_ERROR(immutable_db_options_.info_log,
                         "Column family inconsistency: SST file contains data"
                         " beyond the point of corruption.");
-        status = Status::Corruption("SST file is ahead of WALs in CF " +
-                                    cfd->GetName());
+        status = Status::Corruption(
+            "Column family inconsistency: SST file contains data"
+            " beyond the point of corruption in CF " +
+            cfd->GetName() +
+            ". WAL recovery stopped at corruption point, but SST files"
+            " contain newer data.");
         return status;
       }
     }
diff --git a/db/log_reader.cc b/db/log_reader.cc
index 71b84b428987..2650b4c97a9a 100644
--- a/db/log_reader.cc
+++ b/db/log_reader.cc
@@ -380,8 +380,11 @@ void Reader::MaybeVerifyPredecessorWALInfo(
   } else {
     if (observed_predecessor_wal_info_.GetLogNumber() !=
         recorded_predecessor_log_number) {
-      std::string reason = "Missing WAL of log number " +
-                           std::to_string(recorded_predecessor_log_number);
+      std::string reason =
+          "Mismatched predecessor log number of WAL file " +
+          file_->file_name() + " Recorded " +
+          std::to_string(recorded_predecessor_log_number) + ". Observed " +
+          std::to_string(observed_predecessor_wal_info_.GetLogNumber());
       ReportCorruption(fragment.size(), reason.c_str(),
                        recorded_predecessor_log_number);
     } else if (observed_predecessor_wal_info_.GetLastSeqnoRecorded() !=

From 1369c7b169abf92e9750df0bb5471038d5fb7a15 Mon Sep 17 00:00:00 2001
From: anand76 <anand1976@users.noreply.github.com>
Date: Thu, 14 Aug 2025 09:05:39 -0700
Subject: [PATCH 234/500] Allow a user defined index to be configured from a
 string (#13880)

Summary:
Allow a user defined index to be configured from a string

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13880

Test Plan: Add a unit test in table_test.cc

Reviewed By: bikash-c

Differential Revision: D80237701

Pulled By: anand1976

fbshipit-source-id: 8b3d0bcdfbb4bb76803916ea1b1f940a4d985dfd
---
 include/rocksdb/user_defined_index.h          |  6 ++
 .../block_based/block_based_table_factory.cc  | 14 +++
 table/table_test.cc                           | 87 +++++++++++++++++++
 unreleased_history/bug_fixes/udi_config.md    |  1 +
 4 files changed, 108 insertions(+)
 create mode 100644 unreleased_history/bug_fixes/udi_config.md

diff --git a/include/rocksdb/user_defined_index.h b/include/rocksdb/user_defined_index.h
index 49a63e2c17ba..f51345231cab 100644
--- a/include/rocksdb/user_defined_index.h
+++ b/include/rocksdb/user_defined_index.h
@@ -150,6 +150,12 @@ class UserDefinedIndexFactory : public Customizable {
  public:
   virtual ~UserDefinedIndexFactory() = default;
 
+  static const char* Type() { return "UserDefinedIndexFactory"; }
+
+  static Status CreateFromString(
+      const ConfigOptions& config_options, const std::string& value,
+      std::shared_ptr<UserDefinedIndexFactory>* factory);
+
   // Create a new builder for user-defined index.
   virtual UserDefinedIndexBuilder* NewBuilder() const = 0;
 
diff --git a/table/block_based/block_based_table_factory.cc b/table/block_based/block_based_table_factory.cc
index ee4d941c7297..ff6cdaaa2b74 100644
--- a/table/block_based/block_based_table_factory.cc
+++ b/table/block_based/block_based_table_factory.cc
@@ -25,6 +25,8 @@
 #include "rocksdb/flush_block_policy.h"
 #include "rocksdb/rocksdb_namespace.h"
 #include "rocksdb/table.h"
+#include "rocksdb/user_defined_index.h"
+#include "rocksdb/utilities/customizable_util.h"
 #include "rocksdb/utilities/options_type.h"
 #include "table/block_based/block_based_table_builder.h"
 #include "table/block_based/block_based_table_reader.h"
@@ -312,6 +314,11 @@ static struct BlockBasedTableTypeInfo {
          OptionTypeInfo::AsCustomSharedPtr<const FilterPolicy>(
              offsetof(struct BlockBasedTableOptions, filter_policy),
              OptionVerificationType::kByNameAllowFromNull)},
+        {"user_defined_index_factory",
+         OptionTypeInfo::AsCustomSharedPtr<UserDefinedIndexFactory>(
+             offsetof(struct BlockBasedTableOptions,
+                      user_defined_index_factory),
+             OptionVerificationType::kByNameAllowFromNull)},
         {"whole_key_filtering",
          {offsetof(struct BlockBasedTableOptions, whole_key_filtering),
           OptionType::kBoolean, OptionVerificationType::kNormal}},
@@ -1011,6 +1018,13 @@ TableFactory* NewBlockBasedTableFactory(
   return new BlockBasedTableFactory(_table_options);
 }
 
+Status UserDefinedIndexFactory::CreateFromString(
+    const ConfigOptions& config_options, const std::string& value,
+    std::shared_ptr<UserDefinedIndexFactory>* factory) {
+  return LoadSharedObject<UserDefinedIndexFactory>(config_options, value,
+                                                   factory);
+}
+
 const std::string BlockBasedTablePropertyNames::kIndexType =
     "rocksdb.block.based.table.index.type";
 const std::string BlockBasedTablePropertyNames::kWholeKeyFiltering =
diff --git a/table/table_test.cc b/table/table_test.cc
index d156966c6fc5..9185827c5959 100644
--- a/table/table_test.cc
+++ b/table/table_test.cc
@@ -53,6 +53,7 @@
 #include "rocksdb/trace_record.h"
 #include "rocksdb/unique_id.h"
 #include "rocksdb/user_defined_index.h"
+#include "rocksdb/utilities/object_registry.h"
 #include "rocksdb/write_buffer_manager.h"
 #include "table/block_based/block.h"
 #include "table/block_based/block_based_table_builder.h"
@@ -8068,6 +8069,92 @@ TEST_F(UserDefinedIndexTest, IngestFailTest) {
   ASSERT_OK(db->Close());
   ASSERT_OK(DestroyDB(dbname, options));
 }
+
+TEST_F(UserDefinedIndexTest, ConfigTest) {
+  Options options;
+  BlockBasedTableOptions table_options;
+  std::string dbname = test::PerThreadDBPath("user_defined_index_test");
+  std::string ingest_file = dbname + "test.sst";
+
+  // Set up the user-defined index factory
+  auto user_defined_index_factory =
+      std::make_shared<TestUserDefinedIndexFactory>();
+  table_options.user_defined_index_factory = user_defined_index_factory;
+
+  // Set up custom flush block policy that flushes every 3 keys
+  table_options.flush_block_policy_factory =
+      std::make_shared<CustomFlushBlockPolicyFactory>();
+
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  std::unique_ptr<SstFileWriter> writer;
+  writer.reset(new SstFileWriter(EnvOptions(), options));
+  ASSERT_OK(writer->Open(ingest_file));
+
+  // Add 100 keys instead of just 5
+  for (int i = 0; i < 100; i++) {
+    std::stringstream ss;
+    ss << std::setw(2) << std::setfill('0') << i;
+    std::string key = "key" + ss.str();
+    std::string value = "value" + ss.str();
+    ASSERT_OK(writer->Put(key, value));
+  }
+  ASSERT_OK(writer->Finish());
+  writer.reset();
+
+  table_options.user_defined_index_factory.reset();
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  // Set up the user-defined index factory
+  ObjectLibrary::Default().get()->AddFactory<UserDefinedIndexFactory>(
+      "test_index", [](const std::string& /* uri */,
+                       std::unique_ptr<UserDefinedIndexFactory>* guard,
+                       std::string* /* errmsg */) {
+        auto factory = new TestUserDefinedIndexFactory();
+        guard->reset(factory);
+        return guard->get();
+      });
+  ASSERT_OK(GetColumnFamilyOptionsFromString(
+      ConfigOptions(), options,
+      "block_based_table_factory={user_defined_index_factory=test_index;}",
+      &options));
+
+  std::unique_ptr<DB> db;
+  options.create_if_missing = true;
+  Status s = DB::Open(options, dbname, &db);
+  ASSERT_OK(s);
+  ASSERT_TRUE(db != nullptr);
+  ColumnFamilyHandle* cfh = nullptr;
+  ASSERT_OK(db->CreateColumnFamily(options, "new_cf", &cfh));
+
+  IngestExternalFileOptions ifo;
+  s = db->IngestExternalFile(cfh, {ingest_file}, ifo);
+  ASSERT_OK(s);
+
+  ReadOptions ro;
+  ro.table_index_factory = user_defined_index_factory.get();
+  std::unique_ptr<Iterator> iter(db->NewIterator(ro, cfh));
+  ASSERT_NE(iter, nullptr);
+  MultiScanArgs scan_opts;
+  std::unordered_map<std::string, std::string> property_bag;
+  property_bag["count"] = std::to_string(25);
+  scan_opts.insert(Slice("key20"), std::optional(property_bag));
+  iter->Prepare(scan_opts);
+  // Test that we can read all the keys
+  int key_count = 0;
+  for (iter->Seek(scan_opts.GetScanRanges()[0].range.start.value());
+       iter->Valid(); iter->Next()) {
+    key_count++;
+  }
+  ASSERT_GE(key_count, 25);
+  // The index may undercount by 2 blocks
+  ASSERT_LE(key_count, 30);
+  ASSERT_OK(iter->status());
+  iter.reset();
+
+  ASSERT_OK(db->DestroyColumnFamilyHandle(cfh));
+  ASSERT_OK(db->Close());
+  ASSERT_OK(DestroyDB(dbname, options));
+}
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/unreleased_history/bug_fixes/udi_config.md b/unreleased_history/bug_fixes/udi_config.md
new file mode 100644
index 000000000000..fce63ce44c3d
--- /dev/null
+++ b/unreleased_history/bug_fixes/udi_config.md
@@ -0,0 +1 @@
+Allow a user defined index to be configured from a string.

From 972fd9adf11bfaa77ae3f32b4e8562ca380c3e77 Mon Sep 17 00:00:00 2001
From: Changyu Bi <changyubi@meta.com>
Date: Thu, 14 Aug 2025 16:40:25 -0700
Subject: [PATCH 235/500] Remove `expect_valid_internal_key` parameter from
 CompactionIterator (#13882)

Summary:
Pull Request resolved: https://github.com/facebook/rocksdb/pull/13882

The `expect_valid_internal_key` parameter was always passed as true, with false only used in one unit test. This change removes the parameter and always fail compaction when encountering corrupted internal keys, which is the expected production behavior.

Reviewed By: mszeszko-meta

Differential Revision: D80287672

fbshipit-source-id: e30a282ac30d7fded677504cec11173de8d15167
---
 db/builder.cc                             |  3 +--
 db/compaction/compaction_iterator.cc      | 24 +++++++----------------
 db/compaction/compaction_iterator.h       |  5 ++---
 db/compaction/compaction_iterator_test.cc | 15 +++++---------
 db/compaction/compaction_job.cc           |  3 +--
 db/flush_job.cc                           |  3 +--
 6 files changed, 17 insertions(+), 36 deletions(-)

diff --git a/db/builder.cc b/db/builder.cc
index 1bc59eb25bb4..854958f2478e 100644
--- a/db/builder.cc
+++ b/db/builder.cc
@@ -201,8 +201,7 @@ Status BuildTable(
     CompactionIterator c_iter(
         iter, ucmp, &merge, kMaxSequenceNumber, &snapshots, earliest_snapshot,
         earliest_write_conflict_snapshot, job_snapshot, snapshot_checker, env,
-        ShouldReportDetailedTime(env, ioptions.stats),
-        true /* internal key corruption is not ok */, range_del_agg.get(),
+        ShouldReportDetailedTime(env, ioptions.stats), range_del_agg.get(),
         blob_file_builder.get(), ioptions.allow_data_in_errors,
         ioptions.enforce_single_del_contracts,
         /*manual_compaction_canceled=*/kManualCompactionCanceledFalse,
diff --git a/db/compaction/compaction_iterator.cc b/db/compaction/compaction_iterator.cc
index d21672e8906c..58f3afaea662 100644
--- a/db/compaction/compaction_iterator.cc
+++ b/db/compaction/compaction_iterator.cc
@@ -28,7 +28,7 @@ CompactionIterator::CompactionIterator(
     SequenceNumber earliest_snapshot,
     SequenceNumber earliest_write_conflict_snapshot,
     SequenceNumber job_snapshot, const SnapshotChecker* snapshot_checker,
-    Env* env, bool report_detailed_time, bool expect_valid_internal_key,
+    Env* env, bool report_detailed_time,
     CompactionRangeDelAggregator* range_del_agg,
     BlobFileBuilder* blob_file_builder, bool allow_data_in_errors,
     bool enforce_single_del_contracts,
@@ -42,8 +42,8 @@ CompactionIterator::CompactionIterator(
     : CompactionIterator(
           input, cmp, merge_helper, last_sequence, snapshots, earliest_snapshot,
           earliest_write_conflict_snapshot, job_snapshot, snapshot_checker, env,
-          report_detailed_time, expect_valid_internal_key, range_del_agg,
-          blob_file_builder, allow_data_in_errors, enforce_single_del_contracts,
+          report_detailed_time, range_del_agg, blob_file_builder,
+          allow_data_in_errors, enforce_single_del_contracts,
           manual_compaction_canceled,
           compaction ? std::make_unique<RealCompaction>(compaction) : nullptr,
           must_count_input_entries, compaction_filter, shutting_down, info_log,
@@ -55,7 +55,7 @@ CompactionIterator::CompactionIterator(
     SequenceNumber earliest_snapshot,
     SequenceNumber earliest_write_conflict_snapshot,
     SequenceNumber job_snapshot, const SnapshotChecker* snapshot_checker,
-    Env* env, bool report_detailed_time, bool expect_valid_internal_key,
+    Env* env, bool report_detailed_time,
     CompactionRangeDelAggregator* range_del_agg,
     BlobFileBuilder* blob_file_builder, bool allow_data_in_errors,
     bool enforce_single_del_contracts,
@@ -76,7 +76,6 @@ CompactionIterator::CompactionIterator(
       env_(env),
       clock_(env_->GetSystemClock().get()),
       report_detailed_time_(report_detailed_time),
-      expect_valid_internal_key_(expect_valid_internal_key),
       range_del_agg_(range_del_agg),
       blob_file_builder_(blob_file_builder),
       compaction_(std::move(compaction)),
@@ -464,18 +463,9 @@ void CompactionIterator::NextFromInput() {
     if (!pik_status.ok()) {
       iter_stats_.num_input_corrupt_records++;
 
-      // If `expect_valid_internal_key_` is false, return the corrupted key
-      // and let the caller decide what to do with it.
-      if (expect_valid_internal_key_) {
-        status_ = pik_status;
-        return;
-      }
-      key_ = current_key_.SetInternalKey(key_);
-      has_current_user_key_ = false;
-      current_user_key_sequence_ = kMaxSequenceNumber;
-      current_user_key_snapshot_ = 0;
-      validity_info_.SetValid(ValidContext::kParseKeyError);
-      break;
+      // Always fail compaction when encountering corrupted internal keys
+      status_ = pik_status;
+      return;
     }
     TEST_SYNC_POINT_CALLBACK("CompactionIterator:ProcessKV", &ikey_);
     if (is_range_del_) {
diff --git a/db/compaction/compaction_iterator.h b/db/compaction/compaction_iterator.h
index 92254a18bc56..bc0407e0ee6f 100644
--- a/db/compaction/compaction_iterator.h
+++ b/db/compaction/compaction_iterator.h
@@ -193,7 +193,7 @@ class CompactionIterator {
       SequenceNumber earliest_snapshot,
       SequenceNumber earliest_write_conflict_snapshot,
       SequenceNumber job_snapshot, const SnapshotChecker* snapshot_checker,
-      Env* env, bool report_detailed_time, bool expect_valid_internal_key,
+      Env* env, bool report_detailed_time,
       CompactionRangeDelAggregator* range_del_agg,
       BlobFileBuilder* blob_file_builder, bool allow_data_in_errors,
       bool enforce_single_del_contracts,
@@ -213,7 +213,7 @@ class CompactionIterator {
                      SequenceNumber earliest_write_conflict_snapshot,
                      SequenceNumber job_snapshot,
                      const SnapshotChecker* snapshot_checker, Env* env,
-                     bool report_detailed_time, bool expect_valid_internal_key,
+                     bool report_detailed_time,
                      CompactionRangeDelAggregator* range_del_agg,
                      BlobFileBuilder* blob_file_builder,
                      bool allow_data_in_errors,
@@ -348,7 +348,6 @@ class CompactionIterator {
   Env* env_;
   SystemClock* clock_;
   const bool report_detailed_time_;
-  const bool expect_valid_internal_key_;
   CompactionRangeDelAggregator* range_del_agg_;
   BlobFileBuilder* blob_file_builder_;
   std::unique_ptr<CompactionProxy> compaction_;
diff --git a/db/compaction/compaction_iterator_test.cc b/db/compaction/compaction_iterator_test.cc
index 974a4e1ff837..5ede0f4e1623 100644
--- a/db/compaction/compaction_iterator_test.cc
+++ b/db/compaction/compaction_iterator_test.cc
@@ -294,7 +294,7 @@ class CompactionIteratorTest : public testing::TestWithParam<bool> {
         snapshots_.empty() ? kMaxSequenceNumber : snapshots_.at(0),
         earliest_write_conflict_snapshot, kMaxSequenceNumber,
         snapshot_checker_.get(), Env::Default(),
-        false /* report_detailed_time */, false, range_del_agg_.get(),
+        false /* report_detailed_time */, range_del_agg_.get(),
         nullptr /* blob_file_builder */, true /*allow_data_in_errors*/,
         true /*enforce_single_del_contracts*/,
         /*manual_compaction_canceled=*/kManualCompactionCanceledFalse_,
@@ -374,8 +374,7 @@ TEST_P(CompactionIteratorTest, EmptyResult) {
   ASSERT_FALSE(c_iter_->Valid());
 }
 
-// If there is a corruption after a single deletion, the corrupted key should
-// be preserved.
+// If there is a corruption after a single deletion, the compaction should fail.
 TEST_P(CompactionIteratorTest, CorruptionAfterSingleDeletion) {
   InitIterators({test::KeyStr("a", 5, kTypeSingleDeletion),
                  test::KeyStr("a", 3, kTypeValue, true),
@@ -386,14 +385,10 @@ TEST_P(CompactionIteratorTest, CorruptionAfterSingleDeletion) {
   ASSERT_EQ(test::KeyStr("a", 5, kTypeSingleDeletion),
             c_iter_->key().ToString());
   c_iter_->Next();
-  ASSERT_TRUE(c_iter_->Valid());
-  ASSERT_EQ(test::KeyStr("a", 3, kTypeValue, true), c_iter_->key().ToString());
-  c_iter_->Next();
-  ASSERT_TRUE(c_iter_->Valid());
-  ASSERT_EQ(test::KeyStr("b", 10, kTypeValue), c_iter_->key().ToString());
-  c_iter_->Next();
-  ASSERT_OK(c_iter_->status());
+  // The iterator should now fail when encountering the corrupted key
   ASSERT_FALSE(c_iter_->Valid());
+  ASSERT_FALSE(c_iter_->status().ok());
+  ASSERT_TRUE(c_iter_->status().IsCorruption());
 }
 
 // Tests compatibility of TimedPut and SingleDelete. TimedPut should act as if
diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc
index 58fd2da01bf0..532c4cedcf8d 100644
--- a/db/compaction/compaction_job.cc
+++ b/db/compaction/compaction_job.cc
@@ -1410,8 +1410,7 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
       &(job_context_->snapshot_seqs), earliest_snapshot_,
       job_context_->earliest_write_conflict_snapshot, job_snapshot_seq,
       job_context_->snapshot_checker, env_,
-      ShouldReportDetailedTime(env_, stats_),
-      /*expect_valid_internal_key=*/true, sub_compact->RangeDelAgg(),
+      ShouldReportDetailedTime(env_, stats_), sub_compact->RangeDelAgg(),
       blob_file_builder.get(), db_options_.allow_data_in_errors,
       db_options_.enforce_single_del_contracts, manual_compaction_canceled_,
       sub_compact->compaction
diff --git a/db/flush_job.cc b/db/flush_job.cc
index 60feeb5c9191..6bed0afb2d96 100644
--- a/db/flush_job.cc
+++ b/db/flush_job.cc
@@ -502,8 +502,7 @@ Status FlushJob::MemPurge() {
         kMaxSequenceNumber, &job_context_->snapshot_seqs, earliest_snapshot_,
         job_context_->earliest_write_conflict_snapshot,
         job_context_->GetJobSnapshotSequence(), job_context_->snapshot_checker,
-        env, ShouldReportDetailedTime(env, ioptions.stats),
-        true /* internal key corruption is not ok */, range_del_agg.get(),
+        env, ShouldReportDetailedTime(env, ioptions.stats), range_del_agg.get(),
         nullptr, ioptions.allow_data_in_errors,
         ioptions.enforce_single_del_contracts,
         /*manual_compaction_canceled=*/kManualCompactionCanceledFalse,

From 5c7162da271c3e8c3649865693b1f106182e021e Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Thu, 14 Aug 2025 21:03:47 -0700
Subject: [PATCH 236/500] Set decouple_partitioned_filters=true by default
 (#13881)

Summary:
This is an important feature for avoiding (reducing) unfair block cache treatment for a lot of blocks. It should also unlock some parallel optimizations (https://github.com/facebook/rocksdb/issues/13850) and code simplification.

Consider for follow-up:
* Feature to avoid majorly under0sized data blocks and filter and index partition blocks

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13881

Test Plan: existing tests, been looking good in production

Reviewed By: hx235

Differential Revision: D80288192

Pulled By: pdillinger

fbshipit-source-id: 5e274ffffb044713278d2a286db6bceaab2dadec
---
 db/db_test_util.cc                                | 5 -----
 include/rocksdb/table.h                           | 7 +++----
 unreleased_history/public_api_changes/decouple.md | 1 +
 3 files changed, 4 insertions(+), 9 deletions(-)
 create mode 100644 unreleased_history/public_api_changes/decouple.md

diff --git a/db/db_test_util.cc b/db/db_test_util.cc
index bec9bbd475b8..0bfb32ebf0fe 100644
--- a/db/db_test_util.cc
+++ b/db/db_test_util.cc
@@ -366,11 +366,6 @@ Options DBTestBase::GetOptions(
     table_options.block_cache = NewLRUCache(/* too small */ 1);
   }
 
-  // Test anticipated new default as much as reasonably possible (and remove
-  // this code when obsolete)
-  assert(!table_options.decouple_partitioned_filters);
-  table_options.decouple_partitioned_filters = true;
-
   bool can_allow_mmap = IsMemoryMappedAccessSupported();
   switch (option_config) {
     case kHashSkipList:
diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h
index fb2b1c16adb4..6c71b02501dc 100644
--- a/include/rocksdb/table.h
+++ b/include/rocksdb/table.h
@@ -440,10 +440,9 @@ struct BlockBasedTableOptions {
   // versions of RocksDB able to read partitioned filters are able to read
   // decoupled partitioned filters.)
   //
-  // decouple_partitioned_filters = false is the original behavior, because of
-  // limitations in the initial implementation, and the new behavior
-  // decouple_partitioned_filters = true is expected to become the new default.
-  bool decouple_partitioned_filters = false;
+  // decouple_partitioned_filters = true is the new default. This option is now
+  // DEPRECATED and might be ignored and/or removed in a future release.
+  bool decouple_partitioned_filters = true;
 
   // Option to generate Bloom/Ribbon filters that minimize memory
   // internal fragmentation.
diff --git a/unreleased_history/public_api_changes/decouple.md b/unreleased_history/public_api_changes/decouple.md
new file mode 100644
index 000000000000..c4c6944ae21e
--- /dev/null
+++ b/unreleased_history/public_api_changes/decouple.md
@@ -0,0 +1 @@
+* `decouple_partitioned_filters = true` is now the default in BlockBasedTableOptions.

From b3fdb9b3cc67625869c1e2958caf8051233f22b0 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Thu, 14 Aug 2025 21:54:52 -0700
Subject: [PATCH 237/500] Use safer atomic APIs for some memtable code (#13844)

Summary:
Two instances of change that are not just cosmetic:

* InlineSkipList<>::Node::CASNext() was implicitly using memory_order_seq_cst to access `next_` while it's intended to be accessed with acquire/release. This is probably not a correctness issue for compare_exchange_strong but potentially a previously missed optimization.
* Similar for `max_height_` in Insert which is otherwise accessed with relaxed memory order.
* One non-relaxed access to `is_range_del_table_empty_` in a function only used in assertions. Access to this atomic is otherwise relaxed (and should be - comment added)

Didn't do all of memtable.h because some of them are more complicated changes and I should probably add FetchMin and FetchMax functions to simplify and take advantage of C++27 functions where available (intended follow-up).

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13844

Test Plan: existing tests

Reviewed By: xingbowang

Differential Revision: D79742552

Pulled By: pdillinger

fbshipit-source-id: d97ce72ba9af6c105694b7d40622db9e994720cd
---
 db/memtable.cc            | 35 ++++++++++------------
 db/memtable.h             | 61 +++++++++++++++++----------------------
 memtable/inlineskiplist.h | 35 ++++++++++------------
 memtable/skiplist.h       | 23 +++++++--------
 util/atomic.h             |  6 ++++
 util/dynamic_bloom.cc     |  4 +--
 util/dynamic_bloom.h      | 29 ++++++++++---------
 7 files changed, 92 insertions(+), 101 deletions(-)

diff --git a/db/memtable.cc b/db/memtable.cc
index 6fbd44a3b76f..3ef4db0ee277 100644
--- a/db/memtable.cc
+++ b/db/memtable.cc
@@ -179,7 +179,7 @@ size_t MemTable::ApproximateMemoryUsage() {
     }
     total_usage += usage;
   }
-  approximate_memory_usage_.store(total_usage, std::memory_order_relaxed);
+  approximate_memory_usage_.StoreRelaxed(total_usage);
   // otherwise, return the actual usage
   return total_usage;
 }
@@ -193,12 +193,12 @@ bool MemTable::ShouldFlushNow() {
   // This is set if memtable_max_range_deletions is > 0,
   // and that many range deletions are done
   if (memtable_max_range_deletions_ > 0 &&
-      num_range_deletes_.load(std::memory_order_relaxed) >=
+      num_range_deletes_.LoadRelaxed() >=
           static_cast<uint64_t>(memtable_max_range_deletions_)) {
     return true;
   }
 
-  size_t write_buffer_size = write_buffer_size_.load(std::memory_order_relaxed);
+  size_t write_buffer_size = write_buffer_size_.LoadRelaxed();
   // In a lot of times, we cannot allocate arena blocks that exactly matches the
   // buffer size. Thus we have to decide if we should over-allocate or
   // under-allocate.
@@ -214,7 +214,7 @@ bool MemTable::ShouldFlushNow() {
   auto allocated_memory =
       table_->ApproximateMemoryUsage() + arena_.MemoryAllocatedBytes();
 
-  approximate_memory_usage_.store(allocated_memory, std::memory_order_relaxed);
+  approximate_memory_usage_.StoreRelaxed(allocated_memory);
 
   // if we can still allocate one more block without exceeding the
   // over-allocation ratio, then we should not flush.
@@ -756,7 +756,7 @@ FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIterator(
     const ReadOptions& read_options, SequenceNumber read_seq,
     bool immutable_memtable) {
   if (read_options.ignore_range_deletions ||
-      is_range_del_table_empty_.load(std::memory_order_relaxed)) {
+      is_range_del_table_empty_.LoadRelaxed()) {
     return nullptr;
   }
   return NewRangeTombstoneIteratorInternal(read_options, read_seq,
@@ -767,7 +767,7 @@ FragmentedRangeTombstoneIterator*
 MemTable::NewTimestampStrippingRangeTombstoneIterator(
     const ReadOptions& read_options, SequenceNumber read_seq, size_t ts_sz) {
   if (read_options.ignore_range_deletions ||
-      is_range_del_table_empty_.load(std::memory_order_relaxed)) {
+      is_range_del_table_empty_.LoadRelaxed()) {
     return nullptr;
   }
   if (!timestamp_stripping_fragmented_range_tombstone_list_) {
@@ -831,7 +831,7 @@ void MemTable::ConstructFragmentedRangeTombstones() {
   // There should be no concurrent Construction.
   // We could also check fragmented_range_tombstone_list_ to avoid repeate
   // constructions. We just construct them here again to be safe.
-  if (!is_range_del_table_empty_.load(std::memory_order_relaxed)) {
+  if (!is_range_del_table_empty_.LoadRelaxed()) {
     // TODO: plumb Env::IOActivity, Env::IOPriority
     auto* unfragmented_iter = new MemTableIterator(
         MemTableIterator::kRangeDelEntries, *this, ReadOptions());
@@ -854,7 +854,7 @@ ReadOnlyMemTable::MemTableStats MemTable::ApproximateStats(
   if (entry_count == 0) {
     return {0, 0};
   }
-  uint64_t n = num_entries_.load(std::memory_order_relaxed);
+  uint64_t n = num_entries_.LoadRelaxed();
   if (n == 0) {
     return {0, 0};
   }
@@ -864,7 +864,7 @@ ReadOnlyMemTable::MemTableStats MemTable::ApproximateStats(
     // the inaccuracy.
     entry_count = n;
   }
-  uint64_t data_size = data_size_.load(std::memory_order_relaxed);
+  uint64_t data_size = data_size_.LoadRelaxed();
   return {entry_count * (data_size / n), entry_count};
 }
 
@@ -994,17 +994,14 @@ Status MemTable::Add(SequenceNumber s, ValueType type,
 
     // this is a bit ugly, but is the way to avoid locked instructions
     // when incrementing an atomic
-    num_entries_.store(num_entries_.load(std::memory_order_relaxed) + 1,
-                       std::memory_order_relaxed);
-    data_size_.store(data_size_.load(std::memory_order_relaxed) + encoded_len,
-                     std::memory_order_relaxed);
+    num_entries_.StoreRelaxed(num_entries_.LoadRelaxed() + 1);
+    data_size_.StoreRelaxed(data_size_.LoadRelaxed() + encoded_len);
     if (type == kTypeDeletion || type == kTypeSingleDeletion ||
         type == kTypeDeletionWithTimestamp) {
-      num_deletes_.store(num_deletes_.load(std::memory_order_relaxed) + 1,
-                         std::memory_order_relaxed);
+      num_deletes_.StoreRelaxed(num_deletes_.LoadRelaxed() + 1);
     } else if (type == kTypeRangeDeletion) {
-      uint64_t val = num_range_deletes_.load(std::memory_order_relaxed) + 1;
-      num_range_deletes_.store(val, std::memory_order_relaxed);
+      uint64_t val = num_range_deletes_.LoadRelaxed() + 1;
+      num_range_deletes_.StoreRelaxed(val);
     }
 
     if (bloom_filter_ && prefix_extractor_ &&
@@ -1105,7 +1102,7 @@ Status MemTable::Add(SequenceNumber s, ValueType type,
     if (allow_concurrent) {
       range_del_mutex_.unlock();
     }
-    is_range_del_table_empty_.store(false, std::memory_order_relaxed);
+    is_range_del_table_empty_.StoreRelaxed(false);
   }
   UpdateOldestKeyTime();
 
@@ -1524,7 +1521,7 @@ void MemTable::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
   // range tombstones. This is the simplest way to ensure range tombstones are
   // handled. TODO: allow Bloom checks where max_covering_tombstone_seq==0
   bool no_range_del = read_options.ignore_range_deletions ||
-                      is_range_del_table_empty_.load(std::memory_order_relaxed);
+                      is_range_del_table_empty_.LoadRelaxed();
   MultiGetRange temp_range(*range, range->begin(), range->end());
   if (bloom_filter_ && no_range_del) {
     bool whole_key =
diff --git a/db/memtable.h b/db/memtable.h
index da0067297e03..b3e6069531b8 100644
--- a/db/memtable.h
+++ b/db/memtable.h
@@ -8,7 +8,6 @@
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
 #pragma once
-#include <atomic>
 #include <deque>
 #include <functional>
 #include <memory>
@@ -568,7 +567,7 @@ class MemTable final : public ReadOnlyMemTable {
   // As a cheap version of `ApproximateMemoryUsage()`, this function doesn't
   // require external synchronization. The value may be less accurate though
   size_t ApproximateMemoryUsageFast() const {
-    return approximate_memory_usage_.load(std::memory_order_relaxed);
+    return approximate_memory_usage_.LoadRelaxed();
   }
 
   size_t MemoryAllocatedBytes() const override {
@@ -689,16 +688,13 @@ class MemTable final : public ReadOnlyMemTable {
   // Used in concurrent memtable inserts.
   void BatchPostProcess(const MemTablePostProcessInfo& update_counters) {
     table_->BatchPostProcess();
-    num_entries_.fetch_add(update_counters.num_entries,
-                           std::memory_order_relaxed);
-    data_size_.fetch_add(update_counters.data_size, std::memory_order_relaxed);
+    num_entries_.FetchAddRelaxed(update_counters.num_entries);
+    data_size_.FetchAddRelaxed(update_counters.data_size);
     if (update_counters.num_deletes != 0) {
-      num_deletes_.fetch_add(update_counters.num_deletes,
-                             std::memory_order_relaxed);
+      num_deletes_.FetchAddRelaxed(update_counters.num_deletes);
     }
     if (update_counters.num_range_deletes > 0) {
-      num_range_deletes_.fetch_add(update_counters.num_range_deletes,
-                                   std::memory_order_relaxed);
+      num_range_deletes_.FetchAddRelaxed(update_counters.num_range_deletes);
       // noop for skip-list memtable
       // Besides correctness test in stress test, memtable flush record count
       // check will catch this if it were not noop.
@@ -707,35 +703,26 @@ class MemTable final : public ReadOnlyMemTable {
     UpdateFlushState();
   }
 
-  uint64_t NumEntries() const override {
-    return num_entries_.load(std::memory_order_relaxed);
-  }
+  uint64_t NumEntries() const override { return num_entries_.LoadRelaxed(); }
 
-  uint64_t NumDeletion() const override {
-    return num_deletes_.load(std::memory_order_relaxed);
-  }
+  uint64_t NumDeletion() const override { return num_deletes_.LoadRelaxed(); }
 
   uint64_t NumRangeDeletion() const override {
-    return num_range_deletes_.load(std::memory_order_relaxed);
+    return num_range_deletes_.LoadRelaxed();
   }
 
-  uint64_t GetDataSize() const override {
-    return data_size_.load(std::memory_order_relaxed);
-  }
+  uint64_t GetDataSize() const override { return data_size_.LoadRelaxed(); }
 
-  size_t write_buffer_size() const {
-    return write_buffer_size_.load(std::memory_order_relaxed);
-  }
+  size_t write_buffer_size() const { return write_buffer_size_.LoadRelaxed(); }
 
   // Dynamically change the memtable's capacity. If set below the current usage,
   // the next key added will trigger a flush. Can only increase size when
   // memtable prefix bloom is disabled, since we can't easily allocate more
-  // space.
+  // space. Non-atomic update ok because this is only called with DB mutex held.
   void UpdateWriteBufferSize(size_t new_write_buffer_size) {
     if (bloom_filter_ == nullptr ||
-        new_write_buffer_size < write_buffer_size_) {
-      write_buffer_size_.store(new_write_buffer_size,
-                               std::memory_order_relaxed);
+        new_write_buffer_size < write_buffer_size_.LoadRelaxed()) {
+      write_buffer_size_.StoreRelaxed(new_write_buffer_size);
     }
   }
 
@@ -827,7 +814,7 @@ class MemTable final : public ReadOnlyMemTable {
 
   bool IsFragmentedRangeTombstonesConstructed() const override {
     return fragmented_range_tombstone_list_.get() != nullptr ||
-           is_range_del_table_empty_;
+           is_range_del_table_empty_.LoadRelaxed();
   }
 
   //  Gets the newest user defined timestamps in the memtable. This should only
@@ -853,16 +840,22 @@ class MemTable final : public ReadOnlyMemTable {
   ConcurrentArena arena_;
   std::unique_ptr<MemTableRep> table_;
   std::unique_ptr<MemTableRep> range_del_table_;
-  std::atomic_bool is_range_del_table_empty_;
+  // This is OK to be relaxed access because consistency between table_ and
+  // range_del_table_ is provided by explicit multi-versioning with sequence
+  // numbers. It's ok for stale memory to say the range_del_table_ is empty when
+  // it's actually not because if it was relevant to our read (based on sequence
+  // number), the relaxed memory read would get a sufficiently updated value
+  // because of the ordering provided by LastPublishedSequence().
+  RelaxedAtomic<bool> is_range_del_table_empty_;
 
   // Total data size of all data inserted
-  std::atomic<uint64_t> data_size_;
-  std::atomic<uint64_t> num_entries_;
-  std::atomic<uint64_t> num_deletes_;
-  std::atomic<uint64_t> num_range_deletes_;
+  RelaxedAtomic<uint64_t> data_size_;
+  RelaxedAtomic<uint64_t> num_entries_;
+  RelaxedAtomic<uint64_t> num_deletes_;
+  RelaxedAtomic<uint64_t> num_range_deletes_;
 
   // Dynamically changeable memtable option
-  std::atomic<size_t> write_buffer_size_;
+  RelaxedAtomic<size_t> write_buffer_size_;
 
   // The sequence number of the kv that was inserted first
   std::atomic<SequenceNumber> first_seqno_;
@@ -898,7 +891,7 @@ class MemTable final : public ReadOnlyMemTable {
 
   // keep track of memory usage in table_, arena_, and range_del_table_.
   // Gets refreshed inside `ApproximateMemoryUsage()` or `ShouldFlushNow`
-  std::atomic<uint64_t> approximate_memory_usage_;
+  RelaxedAtomic<uint64_t> approximate_memory_usage_;
 
   // max range deletions in a memtable,  before automatic flushing, 0 for
   // unlimited.
diff --git a/memtable/inlineskiplist.h b/memtable/inlineskiplist.h
index 9fdf618fa550..caa4c3aec4fa 100644
--- a/memtable/inlineskiplist.h
+++ b/memtable/inlineskiplist.h
@@ -44,8 +44,6 @@
 #include <assert.h>
 #include <stdlib.h>
 
-#include <algorithm>
-#include <atomic>
 #include <type_traits>
 
 #include "memory/allocator.h"
@@ -53,7 +51,7 @@
 #include "port/port.h"
 #include "rocksdb/slice.h"
 #include "test_util/sync_point.h"
-#include "util/coding.h"
+#include "util/atomic.h"
 #include "util/random.h"
 
 namespace ROCKSDB_NAMESPACE {
@@ -215,18 +213,17 @@ class InlineSkipList {
   Comparator const compare_;
   Node* const head_;
 
-  // Modified only by Insert().  Read racily by readers, but stale
-  // values are ok.
-  std::atomic<int> max_height_;  // Height of the entire list
+  // Maximum height of any node in the list (or in the process of being added).
+  //  Modified only by Insert().  Relaxed reads are always OK because starting
+  // from higher levels only helps efficiency, not correctness.
+  RelaxedAtomic<int> max_height_;
 
   // seq_splice_ is a Splice used for insertions in the non-concurrent
   // case.  It caches the prev and next found during the most recent
   // non-concurrent insertion.
   Splice* seq_splice_;
 
-  inline int GetMaxHeight() const {
-    return max_height_.load(std::memory_order_relaxed);
-  }
+  inline int GetMaxHeight() const { return max_height_.LoadRelaxed(); }
 
   int RandomHeight();
 
@@ -311,7 +308,7 @@ struct InlineSkipList<Comparator>::Node {
   // Stores the height of the node in the memory location normally used for
   // next_[0].  This is used for passing data from AllocateKey to Insert.
   void StashHeight(const int height) {
-    assert(sizeof(int) <= sizeof(next_[0]));
+    static_assert(sizeof(int) <= sizeof(next_[0]));
     memcpy(static_cast<void*>(&next_[0]), &height, sizeof(int));
   }
 
@@ -332,30 +329,30 @@ struct InlineSkipList<Comparator>::Node {
     assert(n >= 0);
     // Use an 'acquire load' so that we observe a fully initialized
     // version of the returned Node.
-    return ((&next_[0] - n)->load(std::memory_order_acquire));
+    return ((&next_[0] - n)->Load());
   }
 
   void SetNext(int n, Node* x) {
     assert(n >= 0);
     // Use a 'release store' so that anybody who reads through this
     // pointer observes a fully initialized version of the inserted node.
-    (&next_[0] - n)->store(x, std::memory_order_release);
+    (&next_[0] - n)->Store(x);
   }
 
   bool CASNext(int n, Node* expected, Node* x) {
     assert(n >= 0);
-    return (&next_[0] - n)->compare_exchange_strong(expected, x);
+    return (&next_[0] - n)->CasStrong(expected, x);
   }
 
   // No-barrier variants that can be safely used in a few locations.
   Node* NoBarrier_Next(int n) {
     assert(n >= 0);
-    return (&next_[0] - n)->load(std::memory_order_relaxed);
+    return (&next_[0] - n)->LoadRelaxed();
   }
 
   void NoBarrier_SetNext(int n, Node* x) {
     assert(n >= 0);
-    (&next_[0] - n)->store(x, std::memory_order_relaxed);
+    (&next_[0] - n)->StoreRelaxed(x);
   }
 
   // Insert node after prev on specific level.
@@ -369,7 +366,7 @@ struct InlineSkipList<Comparator>::Node {
  private:
   // next_[0] is the lowest level link (level 0).  Higher levels are
   // stored _earlier_, so level 1 is at next_[-1].
-  std::atomic<Node*> next_[1];
+  AcqRelAtomic<Node*> next_[1];
 };
 
 template <class Comparator>
@@ -789,7 +786,7 @@ char* InlineSkipList<Comparator>::AllocateKey(size_t key_size) {
 template <class Comparator>
 typename InlineSkipList<Comparator>::Node*
 InlineSkipList<Comparator>::AllocateNode(size_t key_size, int height) {
-  auto prefix = sizeof(std::atomic<Node*>) * (height - 1);
+  auto prefix = sizeof(AcqRelAtomic<Node*>) * (height - 1);
 
   // prefix is space for the height - 1 pointers that we store before
   // the Node instance (next_[-(height - 1) .. -1]).  Node starts at
@@ -923,9 +920,9 @@ bool InlineSkipList<Comparator>::Insert(const char* key, Splice* splice,
   int height = x->UnstashHeight();
   assert(height >= 1 && height <= kMaxHeight_);
 
-  int max_height = max_height_.load(std::memory_order_relaxed);
+  int max_height = max_height_.LoadRelaxed();
   while (height > max_height) {
-    if (max_height_.compare_exchange_weak(max_height, height)) {
+    if (max_height_.CasWeakRelaxed(max_height, height)) {
       // successfully updated it
       max_height = height;
       break;
diff --git a/memtable/skiplist.h b/memtable/skiplist.h
index aabbe75c8615..979cffd111c7 100644
--- a/memtable/skiplist.h
+++ b/memtable/skiplist.h
@@ -34,10 +34,9 @@
 #include <assert.h>
 #include <stdlib.h>
 
-#include <atomic>
-
 #include "memory/allocator.h"
 #include "port/port.h"
+#include "util/atomic.h"
 #include "util/random.h"
 
 namespace ROCKSDB_NAMESPACE {
@@ -128,7 +127,7 @@ class SkipList {
 
   // Modified only by Insert().  Read racily by readers, but stale
   // values are ok.
-  std::atomic<int> max_height_;  // Height of the entire list
+  RelaxedAtomic<int> max_height_;  // Height of the entire list
 
   // Used for optimizing sequential insert patterns.  Tricky.  prev_[i] for
   // i up to max_height_ is the predecessor of prev_[0] and prev_height_
@@ -137,9 +136,7 @@ class SkipList {
   int32_t prev_height_;
   Node** prev_;
 
-  inline int GetMaxHeight() const {
-    return max_height_.load(std::memory_order_relaxed);
-  }
+  inline int GetMaxHeight() const { return max_height_.LoadRelaxed(); }
 
   Node* NewNode(const Key& key, int height);
   int RandomHeight();
@@ -179,35 +176,35 @@ struct SkipList<Key, Comparator>::Node {
     assert(n >= 0);
     // Use an 'acquire load' so that we observe a fully initialized
     // version of the returned Node.
-    return (next_[n].load(std::memory_order_acquire));
+    return (next_[n].Load());
   }
   void SetNext(int n, Node* x) {
     assert(n >= 0);
     // Use a 'release store' so that anybody who reads through this
     // pointer observes a fully initialized version of the inserted node.
-    next_[n].store(x, std::memory_order_release);
+    next_[n].Store(x);
   }
 
   // No-barrier variants that can be safely used in a few locations.
   Node* NoBarrier_Next(int n) {
     assert(n >= 0);
-    return next_[n].load(std::memory_order_relaxed);
+    return next_[n].LoadRelaxed();
   }
   void NoBarrier_SetNext(int n, Node* x) {
     assert(n >= 0);
-    next_[n].store(x, std::memory_order_relaxed);
+    next_[n].StoreRelaxed(x);
   }
 
  private:
   // Array of length equal to the node height.  next_[0] is lowest level link.
-  std::atomic<Node*> next_[1];
+  AcqRelAtomic<Node*> next_[1];
 };
 
 template <typename Key, class Comparator>
 typename SkipList<Key, Comparator>::Node* SkipList<Key, Comparator>::NewNode(
     const Key& key, int height) {
   char* mem = allocator_->AllocateAligned(
-      sizeof(Node) + sizeof(std::atomic<Node*>) * (height - 1));
+      sizeof(Node) + sizeof(AcqRelAtomic<Node*>) * (height - 1));
   return new (mem) Node(key);
 }
 
@@ -494,7 +491,7 @@ void SkipList<Key, Comparator>::Insert(const Key& key) {
     // the loop below.  In the former case the reader will
     // immediately drop to the next level since nullptr sorts after all
     // keys.  In the latter case the reader will use the new node.
-    max_height_.store(height, std::memory_order_relaxed);
+    max_height_.StoreRelaxed(height);
   }
 
   Node* x = NewNode(key, height);
diff --git a/util/atomic.h b/util/atomic.h
index afb3dc540050..94575fc7ca8e 100644
--- a/util/atomic.h
+++ b/util/atomic.h
@@ -20,6 +20,12 @@ namespace ROCKSDB_NAMESPACE {
 // https://en.cppreference.com/w/cpp/atomic/memory_order
 // * It's easy to use nonsensical (UB) combinations like store with
 // std::memory_order_acquire.
+// * It is unlikely that anything in RocksDB will need std::memory_order_seq_cst
+// because sequential consistency for the user, potentially writing from
+// multiple threads, is provided by explicit versioning with sequence numbers.
+// If threads A & B update separate atomics, it's typically OK if threads C & D
+// see those updates in different orders.
+//
 // For such reasons, we provide wrappers below to make safe usage easier.
 
 // Wrapper around std::atomic to avoid certain bugs (see Background above).
diff --git a/util/dynamic_bloom.cc b/util/dynamic_bloom.cc
index 0ff3b4a758eb..96e1e0f4367c 100644
--- a/util/dynamic_bloom.cc
+++ b/util/dynamic_bloom.cc
@@ -62,9 +62,9 @@ DynamicBloom::DynamicBloom(Allocator* allocator, uint32_t total_bits,
     // Align on block_bytes boundary
     raw += block_bytes - block_offset;
   }
-  static_assert(sizeof(std::atomic<uint64_t>) == sizeof(uint64_t),
+  static_assert(sizeof(RelaxedAtomic<uint64_t>) == sizeof(uint64_t),
                 "Expecting zero-space-overhead atomic");
-  data_ = reinterpret_cast<std::atomic<uint64_t>*>(raw);
+  data_ = reinterpret_cast<RelaxedAtomic<uint64_t>*>(raw);
 }
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/util/dynamic_bloom.h b/util/dynamic_bloom.h
index 0ff1053ca6a8..e478a60d4102 100644
--- a/util/dynamic_bloom.h
+++ b/util/dynamic_bloom.h
@@ -7,12 +7,10 @@
 
 #include <array>
 #include <atomic>
-#include <memory>
-#include <string>
 
-#include "port/port.h"
 #include "rocksdb/slice.h"
 #include "table/multiget_context.h"
+#include "util/atomic.h"
 #include "util/hash.h"
 
 namespace ROCKSDB_NAMESPACE {
@@ -50,16 +48,20 @@ class DynamicBloom {
 
   ~DynamicBloom() {}
 
-  // Assuming single threaded access to this function.
+  // Assuming single thread adding to the DynamicBloom
   void Add(const Slice& key);
 
-  // Like Add, but may be called concurrent with other functions.
+  // Like Add, but may be called concurrently with other functions. Does not
+  // establish happens-before relationship with other functions so requires some
+  // external mechanism to ensure other threads can see the change.
   void AddConcurrently(const Slice& key);
 
   // Assuming single threaded access to this function.
   void AddHash(uint32_t hash);
 
-  // Like AddHash, but may be called concurrent with other functions.
+  // Like AddHash, but may be called concurrently with other functions. Does not
+  // establish happens-before relationship with other functions so requires some
+  // external mechanism to ensure other threads can see the change.
   void AddHashConcurrently(uint32_t hash);
 
   // Multithreaded access to this function is OK
@@ -80,7 +82,7 @@ class DynamicBloom {
   // this stores k/2, the number of words to double-probe.
   const uint32_t kNumDoubleProbes;
 
-  std::atomic<uint64_t>* data_;
+  RelaxedAtomic<uint64_t>* data_;
 
   // or_func(ptr, mask) should effect *ptr |= mask with the appropriate
   // concurrency safety, working with bytes.
@@ -97,21 +99,20 @@ inline void DynamicBloom::AddConcurrently(const Slice& key) {
 }
 
 inline void DynamicBloom::AddHash(uint32_t hash) {
-  AddHash(hash, [](std::atomic<uint64_t>* ptr, uint64_t mask) {
-    ptr->store(ptr->load(std::memory_order_relaxed) | mask,
-               std::memory_order_relaxed);
+  AddHash(hash, [](RelaxedAtomic<uint64_t>* ptr, uint64_t mask) {
+    ptr->StoreRelaxed(ptr->LoadRelaxed() | mask);
   });
 }
 
 inline void DynamicBloom::AddHashConcurrently(uint32_t hash) {
-  AddHash(hash, [](std::atomic<uint64_t>* ptr, uint64_t mask) {
+  AddHash(hash, [](RelaxedAtomic<uint64_t>* ptr, uint64_t mask) {
     // Happens-before between AddHash and MaybeContains is handled by
     // access to versions_->LastSequence(), so all we have to do here is
     // avoid races (so we don't give the compiler a license to mess up
     // our code) and not lose bits.  std::memory_order_relaxed is enough
     // for that.
-    if ((mask & ptr->load(std::memory_order_relaxed)) != mask) {
-      ptr->fetch_or(mask, std::memory_order_relaxed);
+    if ((mask & ptr->LoadRelaxed()) != mask) {
+      ptr->FetchOrRelaxed(mask);
     }
   });
 }
@@ -183,7 +184,7 @@ inline bool DynamicBloom::DoubleProbe(uint32_t h32, size_t byte_offset) const {
     // Two bit probes per uint64_t probe
     uint64_t mask =
         ((uint64_t)1 << (h & 63)) | ((uint64_t)1 << ((h >> 6) & 63));
-    uint64_t val = data_[byte_offset ^ i].load(std::memory_order_relaxed);
+    uint64_t val = data_[byte_offset ^ i].LoadRelaxed();
     if (i + 1 >= kNumDoubleProbes) {
       return (val & mask) == mask;
     } else if ((val & mask) != mask) {

From 772e342a9269ff66d702258e1901a9ac09fd7824 Mon Sep 17 00:00:00 2001
From: anand76 <anand1976@users.noreply.github.com>
Date: Fri, 15 Aug 2025 09:42:42 -0700
Subject: [PATCH 238/500] Add an option to sst_dump to list all metadata blocks
 (#13838)

Summary:
Add the --list_meta_blocks option to sst_dump. This PR also refactors some of the test code in sst_dump_test.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13838

Reviewed By: cbi42

Differential Revision: D80320812

Pulled By: anand1976

fbshipit-source-id: 921b6560fbd756f5f8b364893700d240d3b7ad00
---
 table/sst_file_dumper.cc |   8 ++
 table/sst_file_dumper.h  |   3 +
 tools/sst_dump_test.cc   | 174 +++++++++++----------------------------
 tools/sst_dump_tool.cc   |  35 ++++++++
 4 files changed, 93 insertions(+), 127 deletions(-)

diff --git a/table/sst_file_dumper.cc b/table/sst_file_dumper.cc
index e96cc6f0771c..80ac41367db2 100644
--- a/table/sst_file_dumper.cc
+++ b/table/sst_file_dumper.cc
@@ -158,6 +158,14 @@ Status SstFileDumper::GetTableReader(const std::string& file_path) {
       s = SetOldTableOptions();
     }
     options_.comparator = internal_comparator_.user_comparator();
+
+    {
+      Status status = ReadMetaIndexBlockInFile(
+          file_.get(), file_size, magic_number, ImmutableOptions(options_),
+          ReadOptions(), &meta_index_contents_);
+      // Ignore any errors since this is required for a specific CLI option
+      status.PermitUncheckedError();
+    }
   }
 
   if (s.ok()) {
diff --git a/table/sst_file_dumper.h b/table/sst_file_dumper.h
index 7ce1b016d3b8..2cceec407439 100644
--- a/table/sst_file_dumper.h
+++ b/table/sst_file_dumper.h
@@ -51,6 +51,8 @@ class SstFileDumper {
   Status ShowCompressionSize(size_t block_size, CompressionType compress_type,
                              const CompressionOptions& compress_opt);
 
+  BlockContents& GetMetaIndexContents() { return meta_index_contents_; }
+
  private:
   // Get the TableReader implementation for the sst file
   Status GetTableReader(const std::string& file_path);
@@ -96,6 +98,7 @@ class SstFileDumper {
   ReadOptions read_options_;
   InternalKeyComparator internal_comparator_;
   std::unique_ptr<TableProperties> table_properties_;
+  BlockContents meta_index_contents_;
 };
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/tools/sst_dump_test.cc b/tools/sst_dump_test.cc
index 9b789b4bb2bf..6df982b4f472 100644
--- a/tools/sst_dump_test.cc
+++ b/tools/sst_dump_test.cc
@@ -175,6 +175,30 @@ class SSTDumpToolTest : public testing::Test {
 
  protected:
   constexpr static int kNumKey = 1024;
+
+  void SSTDumpToolTestCase(Options& opts, bool filter, int wide_column_one_in,
+                           const char* cmd_arg) {
+    opts.env = env();
+    BlockBasedTableOptions table_opts;
+    if (filter) {
+      table_opts.filter_policy.reset(
+          ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10));
+    }
+    opts.table_factory.reset(new BlockBasedTableFactory(table_opts));
+    std::string file_path = MakeFilePath("rocksdb_sst_test.sst");
+    createSST(opts, file_path, wide_column_one_in);
+
+    char* usage[3];
+    PopulateCommandArgs(file_path, cmd_arg, usage);
+
+    ROCKSDB_NAMESPACE::SSTDumpTool tool;
+    ASSERT_TRUE(!tool.Run(3, usage, opts));
+
+    cleanup(opts, file_path);
+    for (int i = 0; i < 3; i++) {
+      delete[] usage[i];
+    }
+  }
 };
 
 
@@ -194,156 +218,52 @@ TEST_F(SSTDumpToolTest, HelpAndVersion) {
 
 TEST_F(SSTDumpToolTest, EmptyFilter) {
   Options opts;
-  opts.env = env();
-  std::string file_path = MakeFilePath("rocksdb_sst_test.sst");
-  createSST(opts, file_path, 10);
-
-  char* usage[3];
-  PopulateCommandArgs(file_path, "--command=raw", usage);
-
-  ROCKSDB_NAMESPACE::SSTDumpTool tool;
-  ASSERT_TRUE(!tool.Run(3, usage, opts));
-
-  cleanup(opts, file_path);
-  for (int i = 0; i < 3; i++) {
-    delete[] usage[i];
-  }
+  SSTDumpToolTestCase(opts, /*filter=*/false, /*wide_column_one_in=*/10,
+                      "--command=raw");
 }
 
 TEST_F(SSTDumpToolTest, SstDumpReverseBytewiseComparator) {
   Options opts;
-  opts.env = env();
   opts.comparator = ReverseBytewiseComparator();
-  BlockBasedTableOptions table_opts;
-  table_opts.filter_policy.reset(
-      ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10, false));
-  opts.table_factory.reset(new BlockBasedTableFactory(table_opts));
-  std::string file_path =
-      MakeFilePath("rocksdb_sst_reverse_bytewise_comparator.sst");
-  createSST(opts, file_path);
-
-  char* usage[3];
-  PopulateCommandArgs(file_path, "--command=raw", usage);
-
-  ROCKSDB_NAMESPACE::SSTDumpTool tool;
-  ASSERT_TRUE(!tool.Run(3, usage, opts));
-
-  cleanup(opts, file_path);
-  for (int i = 0; i < 3; i++) {
-    delete[] usage[i];
-  }
+  SSTDumpToolTestCase(opts, /*filter=*/true, /*wide_column_one_in=*/10,
+                      "--command=raw");
 }
 
 TEST_F(SSTDumpToolTest, SstDumpComparatorWithU64Ts) {
   Options opts;
-  opts.env = env();
   opts.comparator = test::BytewiseComparatorWithU64TsWrapper();
-  BlockBasedTableOptions table_opts;
-  table_opts.filter_policy.reset(
-      ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10, false));
-  opts.table_factory.reset(new BlockBasedTableFactory(table_opts));
-  std::string file_path =
-      MakeFilePath("rocksdb_sst_comparator_with_u64_ts.sst");
-  createSST(opts, file_path, 10);
-
-  char* usage[3];
-  PopulateCommandArgs(file_path, "--command=raw", usage);
-
-  ROCKSDB_NAMESPACE::SSTDumpTool tool;
-  ASSERT_TRUE(!tool.Run(3, usage, opts));
-
-  cleanup(opts, file_path);
-  for (int i = 0; i < 3; i++) {
-    delete[] usage[i];
-  }
+  SSTDumpToolTestCase(opts, /*filter=*/true, /*wide_column_one_in=*/10,
+                      "--command=raw");
 }
 
-TEST_F(SSTDumpToolTest, FilterBlock) {
+TEST_F(SSTDumpToolTest, FilterBlockWideColumn) {
   Options opts;
-  opts.env = env();
-  BlockBasedTableOptions table_opts;
-  table_opts.filter_policy.reset(
-      ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10, true));
-  opts.table_factory.reset(new BlockBasedTableFactory(table_opts));
-  std::string file_path = MakeFilePath("rocksdb_sst_test.sst");
-  createSST(opts, file_path, 10);
-
-  char* usage[3];
-  PopulateCommandArgs(file_path, "--command=raw", usage);
-
-  ROCKSDB_NAMESPACE::SSTDumpTool tool;
-  ASSERT_TRUE(!tool.Run(3, usage, opts));
-
-  cleanup(opts, file_path);
-  for (int i = 0; i < 3; i++) {
-    delete[] usage[i];
-  }
+  SSTDumpToolTestCase(opts, /*filter=*/true, /*wide_column_one_in=*/10,
+                      "--command=raw");
 }
 
-TEST_F(SSTDumpToolTest, FullFilterBlock) {
+TEST_F(SSTDumpToolTest, FilterBlock) {
   Options opts;
-  opts.env = env();
-  BlockBasedTableOptions table_opts;
-  table_opts.filter_policy.reset(
-      ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10, false));
-  opts.table_factory.reset(new BlockBasedTableFactory(table_opts));
-  std::string file_path = MakeFilePath("rocksdb_sst_test.sst");
-  createSST(opts, file_path);
-
-  char* usage[3];
-  PopulateCommandArgs(file_path, "--command=raw", usage);
-
-  ROCKSDB_NAMESPACE::SSTDumpTool tool;
-  ASSERT_TRUE(!tool.Run(3, usage, opts));
-
-  cleanup(opts, file_path);
-  for (int i = 0; i < 3; i++) {
-    delete[] usage[i];
-  }
+  SSTDumpToolTestCase(opts, /*filter=*/true, /*wide_column_one_in=*/0,
+                      "--command=raw");
 }
 
 TEST_F(SSTDumpToolTest, GetProperties) {
   Options opts;
-  opts.env = env();
-  BlockBasedTableOptions table_opts;
-  table_opts.filter_policy.reset(
-      ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10, false));
-  opts.table_factory.reset(new BlockBasedTableFactory(table_opts));
-  std::string file_path = MakeFilePath("rocksdb_sst_test.sst");
-  createSST(opts, file_path);
-
-  char* usage[3];
-  PopulateCommandArgs(file_path, "--show_properties", usage);
-
-  ROCKSDB_NAMESPACE::SSTDumpTool tool;
-  ASSERT_TRUE(!tool.Run(3, usage, opts));
-
-  cleanup(opts, file_path);
-  for (int i = 0; i < 3; i++) {
-    delete[] usage[i];
-  }
+  SSTDumpToolTestCase(opts, /*filter=*/true, /*wide_column_one_in=*/0,
+                      "--show_properties");
 }
 
 TEST_F(SSTDumpToolTest, CompressedSizes) {
   Options opts;
-  opts.env = env();
-  BlockBasedTableOptions table_opts;
-  table_opts.filter_policy.reset(
-      ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10, false));
-  opts.table_factory.reset(new BlockBasedTableFactory(table_opts));
-  std::string file_path = MakeFilePath("rocksdb_sst_test.sst");
-  createSST(opts, file_path, 10);
-
-  char* usage[3];
-  PopulateCommandArgs(file_path, "--command=recompress", usage);
-
-  ROCKSDB_NAMESPACE::SSTDumpTool tool;
-  ASSERT_TRUE(!tool.Run(3, usage, opts));
+  SSTDumpToolTestCase(opts, /*filter=*/true, /*wide_column_one_in=*/10,
+                      "--command=recompress");
+}
 
-  cleanup(opts, file_path);
-  for (int i = 0; i < 3; i++) {
-    delete[] usage[i];
-  }
+TEST_F(SSTDumpToolTest, ListMetaBlocks) {
+  Options opts;
+  SSTDumpToolTestCase(opts, /*filter=*/true, /*wide_column_one_in=*/0,
+                      "--list_meta_blocks");
 }
 
 namespace {
@@ -455,8 +375,8 @@ TEST_F(SSTDumpToolTest, ReadaheadSize) {
 
   // The file is approximately 10MB. Readahead is 4MB.
   // We usually need 3 reads + one metadata read.
-  // One extra read is needed before opening the file for metadata.
-  ASSERT_EQ(5, num_reads);
+  // Three extra read is needed before opening the file for metadata.
+  ASSERT_EQ(7, num_reads);
 
   SyncPoint::GetInstance()->ClearAllCallBacks();
   SyncPoint::GetInstance()->DisableProcessing();
diff --git a/tools/sst_dump_tool.cc b/tools/sst_dump_tool.cc
index c650974af806..f81ee8e56b4e 100644
--- a/tools/sst_dump_tool.cc
+++ b/tools/sst_dump_tool.cc
@@ -14,6 +14,7 @@
 #include "port/port.h"
 #include "rocksdb/convenience.h"
 #include "rocksdb/utilities/ldb_cmd.h"
+#include "table/block_based/block.h"
 #include "table/sst_file_dumper.h"
 
 namespace ROCKSDB_NAMESPACE {
@@ -121,6 +122,9 @@ void print_help(bool to_stderr) {
 
     --compression_use_zstd_finalize_dict
       Use zstd's finalizeDictionary() API instead of zstd's dictionary trainer to generate dictionary.
+
+    --list_meta_blocks
+      Print the list of all meta blocks in the file
 )",
       supported_compressions.c_str());
 }
@@ -162,6 +166,7 @@ int SSTDumpTool::Run(int argc, char const* const* argv, Options options) {
   bool use_from_as_prefix = false;
   bool show_properties = false;
   bool show_summary = false;
+  bool list_meta_blocks = false;
   bool set_block_size = false;
   bool has_compression_level_from = false;
   bool has_compression_level_to = false;
@@ -335,6 +340,8 @@ int SSTDumpTool::Run(int argc, char const* const* argv, Options options) {
       compression_max_dict_buffer_bytes = static_cast<uint64_t>(tmp_val);
     } else if (strcmp(argv[i], "--compression_use_zstd_finalize_dict") == 0) {
       compression_use_zstd_finalize_dict = true;
+    } else if (strcmp(argv[i], "--list_meta_blocks") == 0) {
+      list_meta_blocks = true;
     } else if (strcmp(argv[i], "--help") == 0) {
       print_help(/*to_stderr*/ false);
       return 0;
@@ -561,7 +568,35 @@ int SSTDumpTool::Run(int argc, char const* const* argv, Options options) {
         fprintf(stderr, "Reader unexpectedly returned null properties\n");
       }
     }
+
+    BlockContents& meta_index_contents = dumper.GetMetaIndexContents();
+    if (list_meta_blocks && meta_index_contents.data.size() > 0) {
+      Block meta_index_block(std::move(meta_index_contents));
+      std::unique_ptr<MetaBlockIter> meta_index_iter;
+      meta_index_iter.reset(meta_index_block.NewMetaIterator());
+      meta_index_iter->SeekToFirst();
+      fprintf(stdout,
+              "Meta Blocks:\n"
+              "------------------------------\n");
+      while (meta_index_iter->status().ok() && meta_index_iter->Valid()) {
+        Slice v = meta_index_iter->value();
+        BlockHandle handle;
+        st = handle.DecodeFrom(&v);
+        if (!st.ok()) {
+          fprintf(stderr, "%s: Could not decode block handle - %s\n",
+                  filename.c_str(), st.ToString().c_str());
+        } else {
+          fprintf(stdout, "  %s: %" PRIu64 " %" PRIu64 "\n",
+                  meta_index_iter->key().ToString().c_str(), handle.offset(),
+                  handle.size());
+        }
+        meta_index_iter->Next();
+      }
+    } else if (list_meta_blocks) {
+      fprintf(stderr, "Could not read the meta index block\n");
+    }
   }
+
   if (show_summary) {
     fprintf(stdout, "total number of files: %" PRIu64 "\n", total_num_files);
     fprintf(stdout, "total number of data blocks: %" PRIu64 "\n",

From 84f814454ad54595e31b698cf288c9ed63f956aa Mon Sep 17 00:00:00 2001
From: Maciej Szeszko <mszeszko@meta.com>
Date: Fri, 15 Aug 2025 15:41:01 -0700
Subject: [PATCH 239/500] Remove reservation mismatch assert in cache adapter
 destructor (#13885)

Summary:
The assert occasionally throws off the stress test runs. We already have sufficient logging in place to collect the signal about secondary cache capacity exceeding primary cache reservation for further investigation.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13885

Reviewed By: anand1976

Differential Revision: D80355513

Pulled By: mszeszko-meta

fbshipit-source-id: b36926f0493a3aca19818a1980ef79277db9fe7e
---
 cache/secondary_cache_adapter.cc | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/cache/secondary_cache_adapter.cc b/cache/secondary_cache_adapter.cc
index 4a9e3decc94a..11a330284c90 100644
--- a/cache/secondary_cache_adapter.cc
+++ b/cache/secondary_cache_adapter.cc
@@ -121,16 +121,13 @@ CacheWithSecondaryAdapter::~CacheWithSecondaryAdapter() {
     assert(s.ok());
     assert(placeholder_usage_ == 0);
     assert(reserved_usage_ == 0);
-    bool pri_cache_res_mismatch =
-        pri_cache_res_->GetTotalMemoryUsed() != sec_capacity;
-    if (pri_cache_res_mismatch) {
-      fprintf(stderr,
+    if (pri_cache_res_->GetTotalMemoryUsed() != sec_capacity) {
+      fprintf(stdout,
               "~CacheWithSecondaryAdapter: Primary cache reservation: "
               "%zu, Secondary cache capacity: %zu, "
               "Secondary cache reserved: %zu\n",
               pri_cache_res_->GetTotalMemoryUsed(), sec_capacity,
               sec_reserved_);
-      assert(!pri_cache_res_mismatch);
     }
   }
 #endif  // NDEBUG

From 618f660eab618cc63bd466eced380a84c1a38276 Mon Sep 17 00:00:00 2001
From: Changyu Bi <changyubi@meta.com>
Date: Mon, 18 Aug 2025 10:56:16 -0700
Subject: [PATCH 240/500] Configurable multiscan IO coalescing threshold
 (#13886)

Summary:
Add a new filed `io_coalesce_threshold` to MultiScanArgs to make IO coalescing threshold configurable.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13886

Test Plan:
db_bench showing less IO requests with higher io_coalesce_threshold
```
Single L0 file, iterator uses BlockBasedTableIterator directly, skipping LevelIterator

DB Set up: ./db_bench --benchmarks="fillseq,compact" --disable_wal=1 --threads=1 --num_levels=1 --compaction_style=2 --fifo_compaction_max_table_files_size_mb=1000 --write_buffer_size=268435456

./db_bench --db="/tmp/rocksdbtest-543376/dbbench" --use_existing_db=1 --benchmarks=multiscan --disable_auto_compactions=1 --seek_nexts=100 --threads=32 --duration=10 --statistics=1 --use_direct_reads=1 ..

--multiscan_coalesce_threshold=0
rocksdb.non.last.level.read.bytes COUNT : 54591304136
rocksdb.non.last.level.read.count COUNT : 7680204
multiscan    :     397.197 micros/op 79401 ops/sec 10.377 seconds 823968 operations; (multscans:24999)

--multiscan_coalesce_threshold=16384
rocksdb.non.last.level.read.bytes COUNT : 95960989272
rocksdb.non.last.level.read.count COUNT : 912008
multiscan    :     389.099 micros/op 81064 ops/sec 10.312 seconds 835968 operations; (multscans:25999)

--multiscan_coalesce_threshold=163840
rocksdb.non.last.level.read.bytes COUNT : 98805008718
rocksdb.non.last.level.read.count COUNT : 827893
multiscan    :     392.831 micros/op 80357 ops/sec 10.353 seconds 831968 operations; (multscans:25999)

DB with multiple files in a level, iterator will use LevelIterator
./db_bench --benchmarks="fillseq,compact" --disable_wal=1 --threads=1 --num_levels=6 --num=10000000

./db_bench --db="/tmp/rocksdbtest-543376/dbbench" --use_existing_db=1 --benchmarks=multiscan --disable_auto_compactions=1 --seek_nexts=100 --threads=32 --duration=10 --statistics=1 --use_direct_reads=1 --num=10000000

--multiscan_coalesce_threshold=0
multiscan    :    1161.734 micros/op 26995 ops/sec 10.667 seconds 287968 operations; (multscans:8999)
rocksdb.non.last.level.read.bytes COUNT : 23917753523
rocksdb.non.last.level.read.count COUNT : 2868907

--multiscan_coalesce_threshold=16384
rocksdb.non.last.level.read.bytes COUNT : 35022281853
rocksdb.non.last.level.read.count COUNT : 287375
multiscan    :    1195.336 micros/op 26265 ops/sec 10.850 seconds 284968 operations; (multscans:8999)

```

Reviewed By: anand1976

Differential Revision: D80381441

Pulled By: cbi42

fbshipit-source-id: 57cc67df4a808e27c3a48ddf3ef6907bec131ee9
---
 db/version_set.cc                                        | 6 +++++-
 include/rocksdb/options.h                                | 8 +++++++-
 table/block_based/block_based_table_iterator.cc          | 6 ++----
 tools/db_bench_tool.cc                                   | 5 +++++
 unreleased_history/new_features/multiscan-io-coalesce.md | 2 ++
 5 files changed, 21 insertions(+), 6 deletions(-)
 create mode 100644 unreleased_history/new_features/multiscan-io-coalesce.md

diff --git a/db/version_set.cc b/db/version_set.cc
index a9a51e4d9dc9..70649114a9ce 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -1144,6 +1144,10 @@ class LevelIterator final : public InternalIterator {
         }
       }
     }
+    // Propagate io colaescing threshold
+    for (auto& file_to_arg : *file_to_scan_opts_) {
+      file_to_arg.second.io_coalesce_threshold = so->io_coalesce_threshold;
+    }
   }
 
  private:
@@ -6491,7 +6495,7 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname,
                       nullptr /*BlockCacheTracer*/, nullptr /*IOTracer*/,
                       /*db_id*/ "",
                       /*db_session_id*/ "", options->daily_offpeak_time_utc,
-                      /*error_handler_*/ nullptr, /*read_only=*/false);
+                      /*error_handler_*/ nullptr, /*unchanging=*/false);
   Status status;
 
   std::vector<ColumnFamilyDescriptor> dummy;
diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index a436c43e389c..5463beb10e58 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -1789,14 +1789,17 @@ class MultiScanArgs {
   MultiScanArgs(const MultiScanArgs& other) {
     comp_ = other.comp_;
     original_ranges_ = other.original_ranges_;
+    io_coalesce_threshold = other.io_coalesce_threshold;
   }
   MultiScanArgs(MultiScanArgs&& other) noexcept
-      : comp_(other.comp_),
+      : io_coalesce_threshold(other.io_coalesce_threshold),
+        comp_(other.comp_),
         original_ranges_(std::move(other.original_ranges_)) {}
 
   MultiScanArgs& operator=(const MultiScanArgs& other) {
     comp_ = other.comp_;
     original_ranges_ = other.original_ranges_;
+    io_coalesce_threshold = other.io_coalesce_threshold;
     return *this;
   }
 
@@ -1804,6 +1807,7 @@ class MultiScanArgs {
     if (this != &other) {
       comp_ = other.comp_;
       original_ranges_ = std::move(other.original_ranges_);
+      io_coalesce_threshold = other.io_coalesce_threshold;
     }
     return *this;
   }
@@ -1843,6 +1847,8 @@ class MultiScanArgs {
     return original_ranges_;
   }
 
+  uint64_t io_coalesce_threshold = 16 << 10;  // 16KB by default
+
  private:
   // The comparator used for ordering ranges
   const Comparator* comp_;
diff --git a/table/block_based/block_based_table_iterator.cc b/table/block_based/block_based_table_iterator.cc
index 5d9536a87810..b966a19cf5a1 100644
--- a/table/block_based/block_based_table_iterator.cc
+++ b/table/block_based/block_based_table_iterator.cc
@@ -1067,9 +1067,6 @@ void BlockBasedTableIterator::Prepare(const MultiScanArgs* multiscan_opts) {
     // Each member in the vector is an index into blocks_to_prepare.
     std::vector<std::vector<size_t>> collapsed_blocks_to_read(1);
 
-    // TODO: make this threshold configurable
-    constexpr size_t kCoalesceThreshold = 16 << 10;  // 16KB
-
     for (const auto& block_idx : blocks_to_read) {
       if (!collapsed_blocks_to_read.back().empty()) {
         // Check if we can coalesce.
@@ -1080,7 +1077,8 @@ void BlockBasedTableIterator::Prepare(const MultiScanArgs* multiscan_opts) {
             BlockBasedTable::BlockSizeWithTrailer(last_block);
         uint64_t current_start = blocks_to_prepare[block_idx].offset();
 
-        if (current_start > last_block_end + kCoalesceThreshold) {
+        if (current_start >
+            last_block_end + multiscan_opts->io_coalesce_threshold) {
           // new IO
           collapsed_blocks_to_read.emplace_back();
         }
diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc
index 7b59b7fc487a..727275233e30 100644
--- a/tools/db_bench_tool.cc
+++ b/tools/db_bench_tool.cc
@@ -1847,6 +1847,10 @@ DEFINE_bool(universal_reduce_file_locking,
                 .compaction_options_universal.reduce_file_locking,
             "See Options().compaction_options_universal.reduce_file_locking");
 
+DEFINE_uint64(multiscan_coalesce_threshold,
+              ROCKSDB_NAMESPACE::MultiScanArgs().io_coalesce_threshold,
+              "Configures io coalescing threshold for multiscans");
+
 namespace ROCKSDB_NAMESPACE {
 namespace {
 static Status CreateMemTableRepFactory(
@@ -6413,6 +6417,7 @@ class Benchmark {
     while (!duration.Done(1)) {
       DB* db = SelectDB(thread);
       MultiScanArgs opts;
+      opts.io_coalesce_threshold = FLAGS_multiscan_coalesce_threshold;
       std::vector<std::unique_ptr<const char[]>> guards;
       opts.reserve(multiscan_size);
       // We create 1 random start, and then multiscan will start from that
diff --git a/unreleased_history/new_features/multiscan-io-coalesce.md b/unreleased_history/new_features/multiscan-io-coalesce.md
new file mode 100644
index 000000000000..2186bbdd745a
--- /dev/null
+++ b/unreleased_history/new_features/multiscan-io-coalesce.md
@@ -0,0 +1,2 @@
+* Introduce `MultiScanArgs::io_coalesce_threshold` to allow a configurable IO coalescing threshold.
+

From 0b426ff58dabe299c705749981654b552e314721 Mon Sep 17 00:00:00 2001
From: Changyu Bi <changyubi@meta.com>
Date: Wed, 20 Aug 2025 12:02:20 -0700
Subject: [PATCH 241/500] Enable multiscan in crash test (#13888)

Summary:
I ran multiple runs of crash test jobs internally, so far I've seen one iterator mismatch and one assertion failure. I've added relevant logging improvements to help debugging them. use_multiscan will be stable within a crash test run to make it easier to triage.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13888

Test Plan: `python3 tools/db_crashtest.py whitebox --prefix_size=-1 --test_batches_snapshots=0 --use_multiscan=1 --read_fault_one_in=0 --kill_random_test=88888`

Reviewed By: anand1976

Differential Revision: D80627399

Pulled By: cbi42

fbshipit-source-id: 2fa3f77e730f5bc7d1d200dc122cf84e3558c588
---
 db_stress_tool/db_stress_test_base.cc         | 15 ++++++++-
 .../block_based/block_based_table_iterator.cc | 31 +++++++++++++++++++
 tools/db_crashtest.py                         | 20 +++++++++---
 3 files changed, 60 insertions(+), 6 deletions(-)

diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc
index 879888ae6d08..8c22d30c314f 100644
--- a/db_stress_tool/db_stress_test_base.cc
+++ b/db_stress_tool/db_stress_test_base.cc
@@ -1701,7 +1701,7 @@ Status StressTest::TestMultiScan(ThreadState* thread,
   iter.reset(db_->NewIterator(ro, column_families_[rand_column_families[0]]));
   iter->Prepare(scan_opts);
 
-  constexpr size_t kOpLogsLimit = 10000;
+  constexpr size_t kOpLogsLimit = 50000;
 
   auto verify_func = [](Iterator* iterator) {
     if (!VerifyWideColumns(iterator->value(), iterator->columns())) {
@@ -1801,11 +1801,24 @@ Status StressTest::TestMultiScan(ThreadState* thread,
 
       VerifyIterator(thread, cmp_cfh, ro, iter.get(), cmp_iter.get(), last_op,
                      key, op_logs, verify_func, &diverged);
+
+      if (diverged) {
+        const std::vector<ScanOptions>& scanoptions = scan_opts.GetScanRanges();
+        for (const auto& t : scanoptions) {
+          fprintf(stdout, "Multiscan options: %s to %s \n",
+                  t.range.start.value().ToString(true).c_str(),
+                  t.range.limit.value().ToString(true).c_str());
+        }
+        break;
+      }
     }
 
     thread->stats.AddIterations(1);
 
     op_logs += "; ";
+    if (diverged) {
+      break;
+    }
   }
 
   return Status::OK();
diff --git a/table/block_based/block_based_table_iterator.cc b/table/block_based/block_based_table_iterator.cc
index b966a19cf5a1..3dfef48bfb03 100644
--- a/table/block_based/block_based_table_iterator.cc
+++ b/table/block_based/block_based_table_iterator.cc
@@ -1106,6 +1106,37 @@ void BlockBasedTableIterator::Prepare(const MultiScanArgs* multiscan_opts) {
       const auto start_offset = first_block.offset();
       const auto end_offset = last_block.offset() +
                               BlockBasedTable::BlockSizeWithTrailer(last_block);
+#ifndef NDEBUG
+      // Debug print for failing the assertion below.
+      if (start_offset >= end_offset) {
+        fprintf(stderr, "blocks_to_prepare: ");
+        for (const auto& block : blocks_to_prepare) {
+          fprintf(stderr, "offset: %" PRIu64 ", size: %" PRIu64 "; ",
+                  block.offset(), block.size());
+        }
+        fprintf(stderr,
+                "\nfirst block - offset: %" PRIu64 ", size: %" PRIu64 "\n",
+                first_block.offset(), first_block.size());
+        fprintf(stderr, "last block - offset: %" PRIu64 ", size: %" PRIu64 "\n",
+                last_block.offset(), last_block.size());
+
+        fprintf(stderr, "collapsed_blocks_to_read: ");
+        for (const auto& b : collapsed_blocks_to_read) {
+          fprintf(stderr, "[");
+          for (const auto& block_idx : b) {
+            fprintf(stderr, "%zu ", block_idx);
+          }
+          fprintf(stderr, "] ");
+        }
+        fprintf(stderr, "\ncurrent blocks: ");
+        for (const auto& block_idx : blocks) {
+          fprintf(stderr, "offset: %" PRIu64 ", size: %" PRIu64 "; ",
+                  blocks_to_prepare[block_idx].offset(),
+                  blocks_to_prepare[block_idx].size());
+        }
+        fprintf(stderr, "\n");
+      }
+#endif  // NDEBUG
       assert(end_offset > start_offset);
       FSReadRequest read_req;
       read_req.offset = start_offset;
diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index e0b7e4a16cbf..c924ae25a84b 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -181,7 +181,6 @@
     "format_version": lambda: random.choice([2, 3, 4, 5, 6, 7, 7]),
     "index_block_restart_interval": lambda: random.choice(range(1, 16)),
     "use_multiget": lambda: random.randint(0, 1),
-    "use_multiscan": 0,
     "use_get_entity": lambda: random.choice([0] * 7 + [1]),
     "use_multi_get_entity": lambda: random.choice([0] * 7 + [1]),
     "periodic_compaction_seconds": lambda: random.choice([0, 0, 1, 2, 10, 100, 1000]),
@@ -360,6 +359,9 @@
         + ["randommixed"] * 2
         + ["custom"] * 3
     ),
+    # fixed within a run for easier debugging
+    # actual frequency is lower after option sanitization
+    "use_multiscan": random.choice([1] + [0] * 3),
 }
 
 _TEST_DIR_ENV_VAR = "TEST_TMPDIR"
@@ -757,7 +759,7 @@ def finalize_and_sanitize(src_params):
     if (
         dest_params.get("test_batches_snapshots") == 1
         or dest_params.get("use_txn") == 1
-        or dest_params.get("user_timestamp_size") > 0
+        or dest_params.get("user_timestamp_size", 0) > 0
     ):
         dest_params["ingest_external_file_one_in"] = 0
     if (
@@ -785,7 +787,7 @@ def finalize_and_sanitize(src_params):
     if (
         dest_params.get("sync_fault_injection") == 1
         or dest_params.get("disable_wal") == 1
-        or dest_params.get("manual_wal_flush_one_in") > 0
+        or dest_params.get("manual_wal_flush_one_in", 0) > 0
     ):
         # File ingestion does not guarantee prefix-recoverability when unsynced
         # data can be lost. Ingesting a file syncs data immediately that is
@@ -992,7 +994,7 @@ def finalize_and_sanitize(src_params):
         dest_params["check_multiget_entity_consistency"] = 0
     if dest_params.get("disable_wal") == 0:
         if (
-            dest_params.get("reopen") > 0
+            dest_params.get("reopen", 0) > 0
             or (
                 dest_params.get("manual_wal_flush_one_in")
                 and dest_params.get("column_families") != 1
@@ -1061,7 +1063,7 @@ def finalize_and_sanitize(src_params):
     if dest_params.get("use_put_entity_one_in") == 1:
         dest_params["use_timed_put_one_in"] = 0
     elif (
-        dest_params.get("use_put_entity_one_in") > 1
+        dest_params.get("use_put_entity_one_in", 0) > 1
         and dest_params.get("use_timed_put_one_in") == 1
     ):
         dest_params["use_timed_put_one_in"] = 3
@@ -1102,6 +1104,14 @@ def finalize_and_sanitize(src_params):
     # Continuous verification fails with secondaries inside NonBatchedOpsStressTest
     if dest_params.get("test_secondary") == 1:
         dest_params["continuous_verification_interval"] = 0
+    if (
+        dest_params.get("prefix_size", 0) > 0
+        or dest_params.get("read_fault_one_in", 0) > 0
+    ):
+        dest_params["use_multiscan"] = 0
+    if dest_params.get("use_multiscan") == 1:
+        dest_params["fill_cache"] = 1
+        dest_params["async_io"] = 0
     return dest_params
 
 

From a5d4db64e2b61a4e675245ecc7e483c7feafe133 Mon Sep 17 00:00:00 2001
From: anand76 <anand1976@users.noreply.github.com>
Date: Thu, 21 Aug 2025 08:55:47 -0700
Subject: [PATCH 242/500] Fix multiscan crash when fill_cache=false (#13889)

Summary:
When fill_cache is ReadOptions is false, multi scan Prepare crashes with the following assertion failure. In this case, CreateAndPibBlockInCache needs to directly create a block with full ownership.

https://github.com/facebook/rocksdb/issues/9  0x00007f2fc003bc93 in __GI___assert_fail (assertion=0x7f2fc2147361 "pinned_data_blocks_guard[block_idx].GetValue()", file=0x7f2fc2146e08 "table/block_based/block_based_table_iterator.cc", line=1178, function=0x7f2fc2147262 "virtual void rocksdb::BlockBasedTableIterator::Prepare(const rocksdb::MultiScanArgs *)") at assert.c:101
101 in assert.c
https://github.com/facebook/rocksdb/issues/10 0x00007f2fc1d73088 in rocksdb::BlockBasedTableIterator::Prepare(rocksdb::MultiScanArgs const*) () from /data/users/anand76/rocksdb_anand76/librocksdb.so.10.6

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13889

Test Plan: Parameterize the DBMultiScanIteratorTest tests with fill_cache

Reviewed By: cbi42

Differential Revision: D80552069

Pulled By: anand1976

fbshipit-source-id: 1a0b64af1e14c63d826add1f994a832ebff12757
---
 db/db_iterator_test.cc                        | 15 +++-
 .../block_based/block_based_table_iterator.cc |  3 +-
 table/block_based/block_based_table_reader.cc | 58 ++++++++++++--
 table/block_based/block_based_table_reader.h  |  2 +-
 .../block_based_table_reader_sync_and_async.h | 77 +------------------
 .../bug_fixes/multiscan_fill_cache.md         |  1 +
 6 files changed, 67 insertions(+), 89 deletions(-)
 create mode 100644 unreleased_history/bug_fixes/multiscan_fill_cache.md

diff --git a/db/db_iterator_test.cc b/db/db_iterator_test.cc
index e894e1e1bec8..99ecb713b011 100644
--- a/db/db_iterator_test.cc
+++ b/db/db_iterator_test.cc
@@ -4142,13 +4142,17 @@ TEST_P(DBIteratorTest, AverageMemtableOpsScanFlushTriggerByOverwrites) {
   ASSERT_EQ(1, NumTableFilesAtLevel(0));
 }
 
-class DBMultiScanIteratorTest : public DBTestBase {
+class DBMultiScanIteratorTest : public DBTestBase,
+                                public ::testing::WithParamInterface<bool> {
  public:
   DBMultiScanIteratorTest()
       : DBTestBase("db_multi_scan_iterator_test", /*env_do_fsync=*/true) {}
 };
 
-TEST_F(DBMultiScanIteratorTest, BasicTest) {
+INSTANTIATE_TEST_CASE_P(DBMultiScanIteratorTest, DBMultiScanIteratorTest,
+                        ::testing::Bool());
+
+TEST_P(DBMultiScanIteratorTest, BasicTest) {
   // Create a file
   for (int i = 0; i < 100; ++i) {
     std::stringstream ss;
@@ -4159,6 +4163,7 @@ TEST_F(DBMultiScanIteratorTest, BasicTest) {
 
   std::vector<std::string> key_ranges({"k03", "k10", "k25", "k50"});
   ReadOptions ro;
+  ro.fill_cache = GetParam();
   MultiScanArgs scan_options(BytewiseComparator());
   scan_options.insert(key_ranges[0], key_ranges[1]);
   scan_options.insert(key_ranges[2], key_ranges[3]);
@@ -4249,7 +4254,7 @@ TEST_F(DBMultiScanIteratorTest, BasicTest) {
   iter.reset();
 }
 
-TEST_F(DBMultiScanIteratorTest, MixedBoundsTest) {
+TEST_P(DBMultiScanIteratorTest, MixedBoundsTest) {
   // Create a file
   for (int i = 0; i < 100; ++i) {
     std::stringstream ss;
@@ -4261,6 +4266,7 @@ TEST_F(DBMultiScanIteratorTest, MixedBoundsTest) {
   std::vector<std::string> key_ranges(
       {"k03", "k10", "k25", "k50", "k75", "k90"});
   ReadOptions ro;
+  ro.fill_cache = GetParam();
   MultiScanArgs scan_options(BytewiseComparator());
   scan_options.insert(key_ranges[0], key_ranges[1]);
   scan_options.insert(key_ranges[2]);
@@ -4335,7 +4341,7 @@ TEST_F(DBMultiScanIteratorTest, MixedBoundsTest) {
   iter.reset();
 }
 
-TEST_F(DBMultiScanIteratorTest, RangeAcrossFiles) {
+TEST_P(DBMultiScanIteratorTest, RangeAcrossFiles) {
   auto options = CurrentOptions();
   options.target_file_size_base = 100 << 10;  // 20KB
   options.compaction_style = kCompactionStyleUniversal;
@@ -4354,6 +4360,7 @@ TEST_F(DBMultiScanIteratorTest, RangeAcrossFiles) {
   ASSERT_EQ(2, NumTableFilesAtLevel(49));
   std::vector<std::string> key_ranges({Key(10), Key(90)});
   ReadOptions ro;
+  ro.fill_cache = GetParam();
   MultiScanArgs scan_options(BytewiseComparator());
   scan_options.insert(key_ranges[0], key_ranges[1]);
   ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily();
diff --git a/table/block_based/block_based_table_iterator.cc b/table/block_based/block_based_table_iterator.cc
index 3dfef48bfb03..9b7d4c7e97a8 100644
--- a/table/block_based/block_based_table_iterator.cc
+++ b/table/block_based/block_based_table_iterator.cc
@@ -1199,7 +1199,8 @@ void BlockBasedTableIterator::Prepare(const MultiScanArgs* multiscan_opts) {
 #endif
         assert(pinned_data_blocks_guard[block_idx].IsEmpty());
         s = table_->CreateAndPinBlockInCache<Block_kData>(
-            read_options_, block, &tmp_contents,
+            read_options_, block, table_->get_rep()->decompressor.get(),
+            &tmp_contents,
             &(pinned_data_blocks_guard[block_idx].As<Block_kData>()));
         if (!s.ok()) {
           // Abort: failed to create and pin block in cache
diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc
index 21b56d4724ac..d52d246a2ea6 100644
--- a/table/block_based/block_based_table_reader.cc
+++ b/table/block_based/block_based_table_reader.cc
@@ -109,8 +109,8 @@ CacheAllocationPtr CopyBufferToHeap(MemoryAllocator* allocator, Slice& buf) {
       CachableEntry<T>* out_parsed_block) const;                               \
   template Status BlockBasedTable::CreateAndPinBlockInCache<T>(                \
       const ReadOptions& ro, const BlockHandle& handle,                        \
-      BlockContents* block_contents, CachableEntry<T>* out_parsed_block)       \
-      const;
+      UnownedPtr<Decompressor> decomp, BlockContents* block_contents,          \
+      CachableEntry<T>* out_parsed_block) const;
 
 INSTANTIATE_BLOCKLIKE_TEMPLATES(ParsedFullFilterBlock);
 INSTANTIATE_BLOCKLIKE_TEMPLATES(DecompressorDict);
@@ -1741,13 +1741,55 @@ Status BlockBasedTable::LookupAndPinBlocksInCache(
 
 template <typename TBlocklike>
 Status BlockBasedTable::CreateAndPinBlockInCache(
-    const ReadOptions& ro, const BlockHandle& handle, BlockContents* contents,
+    const ReadOptions& ro, const BlockHandle& handle,
+    UnownedPtr<Decompressor> decomp, BlockContents* contents,
     CachableEntry<TBlocklike>* out_parsed_block) const {
-  return MaybeReadBlockAndLoadToCache(
-      nullptr, ro, handle, rep_->decompressor.get(),
-      /*for_compaction=*/false, out_parsed_block, nullptr, nullptr, contents,
-      /*async_read=*/false,
-      /*use_block_cache_for_lookup=*/true);
+  CompressionType compression_type = GetBlockCompressionType(*contents);
+  // If we don't own the contents and we don't need to decompress, copy
+  // the block to heap in order to have ownership. If decompression is
+  // needed, then the decompressor will allocate a buffer.
+  if (!contents->own_bytes() && compression_type == kNoCompression) {
+    Slice src = Slice(contents->data.data(), BlockSizeWithTrailer(handle));
+    *contents = BlockContents(
+        CopyBufferToHeap(GetMemoryAllocator(rep_->table_options), src),
+        handle.size());
+#ifndef NDEBUG
+    contents->has_trailer = true;
+#endif
+  }
+
+  Status s;
+  if (ro.fill_cache) {
+    s = MaybeReadBlockAndLoadToCache(nullptr, ro, handle, decomp,
+                                     /*for_compaction=*/false, out_parsed_block,
+                                     nullptr, nullptr, contents,
+                                     /*async_read=*/false,
+                                     /*use_block_cache_for_lookup=*/true);
+  }
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  // fill_cache could be false, or no block cache is configured. In that
+  // case, decompress if necessary and take ownership of the block
+  if (out_parsed_block->GetValue() == nullptr && contents != nullptr) {
+    BlockContents tmp_contents;
+    if (compression_type != kNoCompression) {
+      s = DecompressSerializedBlock(contents->data.data(), handle.size(),
+                                    compression_type, *decomp, &tmp_contents,
+                                    rep_->ioptions,
+                                    GetMemoryAllocator(rep_->table_options));
+    } else {
+      tmp_contents = std::move(*contents);
+    }
+    if (s.ok()) {
+      std::unique_ptr<TBlocklike> block_holder;
+      rep_->create_context.Create(&block_holder, std::move(tmp_contents));
+      out_parsed_block->SetOwnedValue(std::move(block_holder));
+    }
+  }
+  return s;
 }
 
 // If contents is nullptr, this function looks up the block caches for the
diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h
index 107f2b6e66e7..946d7263485c 100644
--- a/table/block_based/block_based_table_reader.h
+++ b/table/block_based/block_based_table_reader.h
@@ -311,7 +311,7 @@ class BlockBasedTable : public TableReader {
   template <typename TBlocklike>
   Status CreateAndPinBlockInCache(
       const ReadOptions& ro, const BlockHandle& handle,
-      BlockContents* block_contents,
+      UnownedPtr<Decompressor> decomp, BlockContents* block_contents,
       CachableEntry<TBlocklike>* out_parsed_block) const;
 
   struct Rep;
diff --git a/table/block_based/block_based_table_reader_sync_and_async.h b/table/block_based/block_based_table_reader_sync_and_async.h
index 7c331cbe826d..be0b05808067 100644
--- a/table/block_based/block_based_table_reader_sync_and_async.h
+++ b/table/block_based/block_based_table_reader_sync_and_async.h
@@ -37,8 +37,6 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::RetrieveMultipleBlocks)
   RandomAccessFileReader* file = rep_->file.get();
   const Footer& footer = rep_->footer;
   const ImmutableOptions& ioptions = rep_->ioptions;
-  size_t read_amp_bytes_per_bit = rep_->table_options.read_amp_bytes_per_bit;
-  MemoryAllocator* memory_allocator = GetMemoryAllocator(rep_->table_options);
 
   if (ioptions.allow_mmap_reads) {
     size_t idx_in_batch = 0;
@@ -266,79 +264,8 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::RetrieveMultipleBlocks)
     }
 
     if (s.ok()) {
-      // When the blocks share the same underlying buffer (scratch or direct io
-      // buffer), we may need to manually copy the block into heap if the
-      // serialized block has to be inserted into a cache. That falls into the
-      // following cases -
-      // 1. serialized block is not compressed, it needs to be inserted into
-      //    the uncompressed block cache if there is one
-      // 2. If the serialized block is compressed, it needs to be inserted
-      //    into the compressed block cache if there is one
-      //
-      // In all other cases, the serialized block is either uncompressed into a
-      // heap buffer or there is no cache at all.
-      CompressionType compression_type =
-          GetBlockCompressionType(serialized_block);
-      if ((use_fs_scratch || use_shared_buffer) &&
-          compression_type == kNoCompression) {
-        Slice serialized =
-            Slice(req.result.data() + req_offset, BlockSizeWithTrailer(handle));
-        serialized_block = BlockContents(
-            CopyBufferToHeap(GetMemoryAllocator(rep_->table_options),
-                             serialized),
-            handle.size());
-#ifndef NDEBUG
-        serialized_block.has_trailer = true;
-#endif
-      }
-    }
-
-    if (s.ok()) {
-      if (options.fill_cache) {
-        CachableEntry<Block_kData>* block_entry = &results[idx_in_batch];
-        // MaybeReadBlockAndLoadToCache will insert into the block caches if
-        // necessary. Since we're passing the serialized block contents, it
-        // will avoid looking up the block cache
-        s = MaybeReadBlockAndLoadToCache(
-            nullptr, options, handle, decomp,
-            /*for_compaction=*/false, block_entry, mget_iter->get_context,
-            /*lookup_context=*/nullptr, &serialized_block,
-            /*async_read=*/false, /*use_block_cache_for_lookup=*/true);
-
-        if (!s.ok()) {
-          statuses[idx_in_batch] = s;
-          continue;
-        }
-        // block_entry value could be null if no block cache is present, i.e
-        // BlockBasedTableOptions::no_block_cache is true and no compressed
-        // block cache is configured. In that case, fall
-        // through and set up the block explicitly
-        if (block_entry->GetValue() != nullptr) {
-          continue;
-        }
-      }
-
-      CompressionType compression_type =
-          GetBlockCompressionType(serialized_block);
-      BlockContents contents;
-      if (compression_type != kNoCompression) {
-        s = DecompressSerializedBlock(
-            req.result.data() + req_offset, handle.size(), compression_type,
-            *decomp, &contents, rep_->ioptions, memory_allocator);
-      } else {
-        // There are two cases here:
-        // 1) caller uses the shared buffer (scratch or direct io buffer);
-        // 2) we use the requst buffer.
-        // If scratch buffer or direct io buffer is used, we ensure that
-        // all serialized blocks are copyed to the heap as single blocks. If
-        // scratch buffer is not used, we also have no combined read, so the
-        // serialized block can be used directly.
-        contents = std::move(serialized_block);
-      }
-      if (s.ok()) {
-        results[idx_in_batch].SetOwnedValue(std::make_unique<Block_kData>(
-            std::move(contents), read_amp_bytes_per_bit, ioptions.stats));
-      }
+      s = CreateAndPinBlockInCache(options, handle, decomp, &serialized_block,
+                                   &results[idx_in_batch]);
     }
     statuses[idx_in_batch] = s;
   }
diff --git a/unreleased_history/bug_fixes/multiscan_fill_cache.md b/unreleased_history/bug_fixes/multiscan_fill_cache.md
new file mode 100644
index 000000000000..1216ed9db79e
--- /dev/null
+++ b/unreleased_history/bug_fixes/multiscan_fill_cache.md
@@ -0,0 +1 @@
+Fix a crash in iterator Prepare() when fill_cache=false

From 444f1ed07f27d73e31317de234468669d84a4149 Mon Sep 17 00:00:00 2001
From: zaidoon <zaidoon@cloudflare.com>
Date: Thu, 21 Aug 2025 11:51:28 -0700
Subject: [PATCH 243/500] expose compact on deletion factory with min file size
 via C api (#13887)

Summary: Pull Request resolved: https://github.com/facebook/rocksdb/pull/13887

Reviewed By: hx235

Differential Revision: D80717735

Pulled By: cbi42

fbshipit-source-id: efecf436188d473a18359e715df979ff24f2fd2e
---
 db/c.cc             | 9 +++++++++
 include/rocksdb/c.h | 4 ++++
 2 files changed, 13 insertions(+)

diff --git a/db/c.cc b/db/c.cc
index 79b0d7b4b55a..dcc19bf333d1 100644
--- a/db/c.cc
+++ b/db/c.cc
@@ -4751,6 +4751,15 @@ void rocksdb_options_add_compact_on_deletion_collector_factory_del_ratio(
   opt->rep.table_properties_collector_factories.emplace_back(compact_on_del);
 }
 
+void rocksdb_options_add_compact_on_deletion_collector_factory_min_file_size(
+    rocksdb_options_t* opt, size_t window_size, size_t num_dels_trigger,
+    double deletion_ratio, uint64_t min_file_size) {
+  std::shared_ptr<ROCKSDB_NAMESPACE::TablePropertiesCollectorFactory>
+      compact_on_del = NewCompactOnDeletionCollectorFactory(
+          window_size, num_dels_trigger, deletion_ratio, min_file_size);
+  opt->rep.table_properties_collector_factories.emplace_back(compact_on_del);
+}
+
 void rocksdb_set_perf_level(int v) {
   PerfLevel level = static_cast<PerfLevel>(v);
   SetPerfLevel(level);
diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h
index 0b1aa699bb15..13a3933ef7e0 100644
--- a/include/rocksdb/c.h
+++ b/include/rocksdb/c.h
@@ -1925,6 +1925,10 @@ extern ROCKSDB_LIBRARY_API void
 rocksdb_options_add_compact_on_deletion_collector_factory_del_ratio(
     rocksdb_options_t*, size_t window_size, size_t num_dels_trigger,
     double deletion_ratio);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_add_compact_on_deletion_collector_factory_min_file_size(
+    rocksdb_options_t*, size_t window_size, size_t num_dels_trigger,
+    double deletion_ratio, uint64_t min_file_size);
 extern ROCKSDB_LIBRARY_API void rocksdb_options_set_manual_wal_flush(
     rocksdb_options_t* opt, unsigned char);
 extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_manual_wal_flush(

From b9957c991cae44959f96888369caf1b145398132 Mon Sep 17 00:00:00 2001
From: zaidoon <zaidoon@cloudflare.com>
Date: Thu, 21 Aug 2025 14:50:22 -0700
Subject: [PATCH 244/500] actually expose rocksdb_status_ptr_get_error via c
 api (#13875)

Summary:
the function implementation is here: https://github.com/facebook/rocksdb/blob/8f0ab1598effd4b05f6f88310c7bd9aaf5d418c6/db/c.cc#L928-L930 but it wasn't fully exposed

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13875

Reviewed By: hx235

Differential Revision: D80717828

Pulled By: cbi42

fbshipit-source-id: d6aaa984f24e469aa8ddb81524dc156b85e891f2
---
 include/rocksdb/c.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h
index 13a3933ef7e0..3f6d28e73541 100644
--- a/include/rocksdb/c.h
+++ b/include/rocksdb/c.h
@@ -1175,6 +1175,8 @@ extern ROCKSDB_LIBRARY_API void rocksdb_reset_status(
     rocksdb_status_ptr_t* status_ptr);
 extern ROCKSDB_LIBRARY_API uint32_t
 rocksdb_flushjobinfo_flush_reason(const rocksdb_flushjobinfo_t* info);
+extern ROCKSDB_LIBRARY_API void rocksdb_status_ptr_get_error(
+    rocksdb_status_ptr_t* status, char** errptr);
 
 /* Compaction job info */
 extern ROCKSDB_LIBRARY_API void rocksdb_compactionjobinfo_status(

From 239b06cefb0af7c50f3ab571846fd81d4bed1d63 Mon Sep 17 00:00:00 2001
From: Andrew Chang <andrewrchang@meta.com>
Date: Fri, 22 Aug 2025 12:31:50 -0700
Subject: [PATCH 245/500] Retry on some io_uring_wait_cqe error codes (#13890)

Summary:
RocksDB currently aborts whenever `io_uring_wait_cqe` returns an error code. It also does not log what error code was returned.

While experimenting with `IO_URING`, my application crashed because of this.

I asked the Linux Kernel user group the best way to handle unsuccessful `io_uring_wait_cqe`.

It was recommended to retry on `EINTR`, `EAGAIN`, and `ETIME`. `ETIME` only happens when waiting with a timeout, so I am not handling it.

I also write to `stderr` so that we have some debugging information if we abort.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13890

Test Plan: Unfortunately this is hard to cover through unit/stress tests. We have to see what sort of errors get encountered in production.

Reviewed By: anand1976

Differential Revision: D80639955

Pulled By: archang19

fbshipit-source-id: e3a230bd37552ec0f36be34e6a4e53cfd2a254f1
---
 env/fs_posix.cc | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/env/fs_posix.cc b/env/fs_posix.cc
index bc28b52de214..06d7e1a9e939 100644
--- a/env/fs_posix.cc
+++ b/env/fs_posix.cc
@@ -1107,8 +1107,10 @@ class PosixFileSystem : public FileSystem {
         struct io_uring_cqe* cqe = nullptr;
         ssize_t ret = io_uring_wait_cqe(iu, &cqe);
         if (ret) {
-          // abort as it shouldn't be in indeterminate state and there is no
-          // good way currently to handle this error.
+          fprintf(stderr, "Poll: io_uring_wait_cqe failed: %ld", (long)ret);
+          if (ret == -EINTR || ret == -EAGAIN) {
+            continue;  // Retry
+          }
           abort();
         }
 
@@ -1210,8 +1212,10 @@ class PosixFileSystem : public FileSystem {
         struct io_uring_cqe* cqe = nullptr;
         ssize_t ret = io_uring_wait_cqe(iu, &cqe);
         if (ret) {
-          // abort as it shouldn't be in indeterminate state and there is no
-          // good way currently to handle this error.
+          fprintf(stderr, "AbortIO: io_uring_wait_cqe failed: %ld", (long)ret);
+          if (ret == -EINTR || ret == -EAGAIN) {
+            continue;  // Retry
+          }
           abort();
         }
         assert(cqe != nullptr);

From 439e1707fcd354412331d7f2b6c57139e35abf02 Mon Sep 17 00:00:00 2001
From: Changyu Bi <changyubi@meta.com>
Date: Fri, 22 Aug 2025 13:32:10 -0700
Subject: [PATCH 246/500] Fix MultiScan Prepare() to support dictionary
 compression (#13896)

Summary:
I saw failure when added some asserts near https://github.com/facebook/rocksdb/blob/b9957c991cae44959f96888369caf1b145398132/table/block_based/block_based_table_iterator.cc#L1201-L1205 in stress test. The decompression failed with error message like "Corruption: Failed zlib inflate: -3". This PR fixes the issue to use the right decompressor for dictionary compression.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13896

Test Plan: updated unit test that checks no I/O is done after Prepare(), this would fail before this change.

Reviewed By: anand1976

Differential Revision: D80821500

Pulled By: cbi42

fbshipit-source-id: a4322c0da99a2d10e9787d0ec168668567c0c19a
---
 .../block_based/block_based_table_iterator.cc | 34 +++++++++++++++++--
 .../block_based_table_reader_test.cc          | 19 ++++++++++-
 .../bug_fixes/multi-scan-dict-compression.md  |  1 +
 3 files changed, 51 insertions(+), 3 deletions(-)
 create mode 100644 unreleased_history/bug_fixes/multi-scan-dict-compression.md

diff --git a/table/block_based/block_based_table_iterator.cc b/table/block_based/block_based_table_iterator.cc
index 9b7d4c7e97a8..a8d821e2c326 100644
--- a/table/block_based/block_based_table_iterator.cc
+++ b/table/block_based/block_based_table_iterator.cc
@@ -1175,6 +1175,34 @@ void BlockBasedTableIterator::Prepare(const MultiScanArgs* multiscan_opts) {
       }
     }
 
+    // Get compression dictionary if available - needed for dictionary-aware
+    // decompression
+    UnownedPtr<Decompressor> decompressor =
+        table_->get_rep()->decompressor.get();
+    CachableEntry<DecompressorDict> cached_dict;
+    if (table_->get_rep()->uncompression_dict_reader) {
+      s = table_->get_rep()
+              ->uncompression_dict_reader->GetOrReadUncompressionDictionary(
+                  /* prefetch_buffer= */ nullptr, read_options_,
+                  /* get_context= */ nullptr, /* lookup_context= */ nullptr,
+                  &cached_dict);
+      if (!s.ok()) {
+#ifndef NDEBUG
+        fprintf(stdout, "Prepare dictionary loading failed with %s\n",
+                s.ToString().c_str());
+#endif
+        // Abort: dictionary lookup failed.
+        return;
+      }
+      if (!cached_dict.GetValue()) {
+#ifndef NDEBUG
+        fprintf(stdout, "Success but no dictionary read\n");
+#endif
+        return;
+      }
+      decompressor = cached_dict.GetValue()->decompressor_.get();
+    }
+
     // Init blocks and pin them in block cache.
     MemoryAllocator* memory_allocator =
         table_->get_rep()->table_options.block_cache->memory_allocator();
@@ -1199,10 +1227,12 @@ void BlockBasedTableIterator::Prepare(const MultiScanArgs* multiscan_opts) {
 #endif
         assert(pinned_data_blocks_guard[block_idx].IsEmpty());
         s = table_->CreateAndPinBlockInCache<Block_kData>(
-            read_options_, block, table_->get_rep()->decompressor.get(),
-            &tmp_contents,
+            read_options_, block, decompressor, &tmp_contents,
             &(pinned_data_blocks_guard[block_idx].As<Block_kData>()));
         if (!s.ok()) {
+#ifndef NDEBUG
+          fprintf(stdout, "Prepare failed with %s\n", s.ToString().c_str());
+#endif
           // Abort: failed to create and pin block in cache
           return;
         }
diff --git a/table/block_based/block_based_table_reader_test.cc b/table/block_based/block_based_table_reader_test.cc
index 89cb4a66f557..6f22965eb7df 100644
--- a/table/block_based/block_based_table_reader_test.cc
+++ b/table/block_based/block_based_table_reader_test.cc
@@ -994,6 +994,18 @@ TEST_P(BlockBasedTableReaderTestVerifyChecksum, ChecksumMismatch) {
 }
 
 TEST_P(BlockBasedTableReaderTest, MultiScanPrepare) {
+  std::ostringstream param_trace;
+  param_trace << "[MultiScanPrepare] Test params: " << "CompressionType="
+              << CompressionTypeToString(compression_type_)
+              << ", UseDirectReads=" << (use_direct_reads_ ? "true" : "false")
+              << ", UDTEnabled=" << (udt_enabled_ ? "true" : "false")
+              << ", PersistUDT=" << (persist_udt_ ? "true" : "false")
+              << ", CompressionParallelThreads="
+              << compression_parallel_threads_
+              << ", CompressionDictBytes=" << compression_dict_bytes_
+              << ", SameKeyDiffTs=" << (same_key_diff_ts_ ? "true" : "false");
+  std::cout << param_trace.str() << std::endl;
+
   Options options;
   options.statistics = CreateDBStatistics();
   ReadOptions read_opts;
@@ -1052,6 +1064,9 @@ TEST_P(BlockBasedTableReaderTest, MultiScanPrepare) {
     iter->Next();
   }
   ASSERT_OK(iter->status());
+  // No I/O expected during scanning since all blocks were loaded and pinned.
+  ASSERT_EQ(read_count_after,
+            options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT));
 
   iter.reset(table->NewIterator(
       read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
@@ -1089,7 +1104,7 @@ TEST_P(BlockBasedTableReaderTest, MultiScanPrepare) {
       read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
       /*skip_filters=*/false, TableReaderCaller::kUncategorized));
   // Should do two I/Os since blocks 80-81 and 90-95 are already in block cache,
-  // reads from blocks 50-79 and 82-.. are co
+  // reads from blocks 50-79 and 82-.. are coalesced.
   scan_options = MultiScanArgs(BytewiseComparator());
   scan_options.insert(ExtractUserKey(kv[50 * kEntriesPerBlock].first));
   read_count_before =
@@ -1106,6 +1121,8 @@ TEST_P(BlockBasedTableReaderTest, MultiScanPrepare) {
   }
   ASSERT_FALSE(iter->Valid());
   ASSERT_OK(iter->status());
+  ASSERT_EQ(read_count_after,
+            options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT));
 
   // Check cases when Seek key does not match start key in ScanOptions
   iter.reset(table->NewIterator(
diff --git a/unreleased_history/bug_fixes/multi-scan-dict-compression.md b/unreleased_history/bug_fixes/multi-scan-dict-compression.md
new file mode 100644
index 000000000000..f01e49bea11d
--- /dev/null
+++ b/unreleased_history/bug_fixes/multi-scan-dict-compression.md
@@ -0,0 +1 @@
+* Fix a bug in MultiScan that causes it to fall back to a normal scan when dictionary compression is enabled.

From 82b5a2d3fc42730fa992f1d8e154e0a78bc42a48 Mon Sep 17 00:00:00 2001
From: Changyu Bi <changyubi@meta.com>
Date: Fri, 22 Aug 2025 16:05:56 -0700
Subject: [PATCH 247/500] Allow ingestion of any DB generated SST file (#13878)

Summary:
`IngestExternalFileOptions::allow_db_generated_files` requires SST files to have zero sequence number. This PR opens it up for any DB generated SST files. Currently we don't do global sequence number assignment when `allow_db_generated_files` is true, so we require that files do not overlap with any key in the CF. One behavior difference is that now we allow ingesting overlapping files when `allow_db_generated_files` is true. Users need to ensure that files are ordered such that later files have more recent updates.

Intended follow ups:
- Record smallest seqno in table property, so that we don't need to scan the file for it.
- Cover allow_db_generated_files in crash test. We may create a new DB and ingest all files from a CF for verification.
- Add APIs that uses allow_db_generated_files. For example, an API for ingesting SST files from a source CF, so that we take care of ingestion file ordering for user. If we are already getting metadata from the source CF, we may be use it as a hint for level placement instead of dividing input files into batches again (`ExternalSstFileIngestionJob::DivideInputFilesIntoBatches`).

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13878

Test Plan: two new unit tests.

Reviewed By: hx235, xingbowang

Differential Revision: D80233727

Pulled By: cbi42

fbshipit-source-id: 74209386d8426c434bff2d9a734f06db537eb50c
---
 db/db_impl/db_impl.cc                         |  19 +-
 db/db_test_util.cc                            |  48 +-
 db/db_test_util.h                             |   8 +
 db/external_sst_file_ingestion_job.cc         | 266 ++++++---
 db/external_sst_file_ingestion_job.h          |  26 +-
 db/external_sst_file_test.cc                  | 522 +++++++++++++++---
 db/version_set.h                              |   2 +
 include/rocksdb/db.h                          |   4 +-
 include/rocksdb/metadata.h                    |   3 +
 include/rocksdb/options.h                     | 111 +++-
 include/rocksdb/table_properties.h            |   2 +
 .../db-gen-file-ingestion.md                  |   1 +
 12 files changed, 815 insertions(+), 197 deletions(-)
 create mode 100644 unreleased_history/public_api_changes/db-gen-file-ingestion.md

diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc
index 2c9a5f226cd5..bdbd86b9e188 100644
--- a/db/db_impl/db_impl.cc
+++ b/db/db_impl/db_impl.cc
@@ -6009,18 +6009,19 @@ Status DBImpl::IngestExternalFiles(
       // mutex when persisting MANIFEST file, and the snapshots taken during
       // that period will not be stable if VersionSet last seqno is updated
       // before LogAndApply.
-      int consumed_seqno_count =
-          ingestion_jobs[0].ConsumedSequenceNumbersCount();
+      SequenceNumber max_assigned_seqno =
+          ingestion_jobs[0].MaxAssignedSequenceNumber();
       for (size_t i = 1; i != num_cfs; ++i) {
-        consumed_seqno_count =
-            std::max(consumed_seqno_count,
-                     ingestion_jobs[i].ConsumedSequenceNumbersCount());
+        max_assigned_seqno = std::max(
+            max_assigned_seqno, ingestion_jobs[i].MaxAssignedSequenceNumber());
       }
-      if (consumed_seqno_count > 0) {
+      if (max_assigned_seqno > 0) {
         const SequenceNumber last_seqno = versions_->LastSequence();
-        versions_->SetLastAllocatedSequence(last_seqno + consumed_seqno_count);
-        versions_->SetLastPublishedSequence(last_seqno + consumed_seqno_count);
-        versions_->SetLastSequence(last_seqno + consumed_seqno_count);
+        if (max_assigned_seqno > last_seqno) {
+          versions_->SetLastAllocatedSequence(max_assigned_seqno);
+          versions_->SetLastPublishedSequence(max_assigned_seqno);
+          versions_->SetLastSequence(max_assigned_seqno);
+        }
       }
     }
 
diff --git a/db/db_test_util.cc b/db/db_test_util.cc
index 0bfb32ebf0fe..0f839b77fc9f 100644
--- a/db/db_test_util.cc
+++ b/db/db_test_util.cc
@@ -1154,16 +1154,18 @@ size_t DBTestBase::CountLiveFiles() {
 }
 
 int DBTestBase::NumTableFilesAtLevel(int level, int cf) {
-  std::string property;
-  if (cf == 0) {
-    // default cfd
-    EXPECT_TRUE(db_->GetProperty(
-        "rocksdb.num-files-at-level" + std::to_string(level), &property));
-  } else {
-    EXPECT_TRUE(db_->GetProperty(
-        handles_[cf], "rocksdb.num-files-at-level" + std::to_string(level),
-        &property));
+  return NumTableFilesAtLevel(level,
+                              cf ? handles_[cf] : db_->DefaultColumnFamily());
+}
+
+int DBTestBase::NumTableFilesAtLevel(int level, ColumnFamilyHandle* cfh,
+                                     DB* db) {
+  if (!db) {
+    db = db_;
   }
+  std::string property;
+  EXPECT_TRUE(db->GetProperty(
+      cfh, "rocksdb.num-files-at-level" + std::to_string(level), &property));
   return atoi(property.c_str());
 }
 
@@ -1196,12 +1198,22 @@ int DBTestBase::TotalTableFiles(int cf, int levels) {
 
 // Return spread of files per level
 std::string DBTestBase::FilesPerLevel(int cf) {
-  int num_levels =
-      (cf == 0) ? db_->NumberLevels() : db_->NumberLevels(handles_[cf]);
+  if (cf == 0) {
+    return FilesPerLevel(db_->DefaultColumnFamily());
+  } else {
+    return FilesPerLevel(handles_[cf]);
+  }
+}
+
+std::string DBTestBase::FilesPerLevel(ColumnFamilyHandle* cfh, DB* db) {
+  if (!db) {
+    db = db_;
+  }
+  int num_levels = db->NumberLevels(cfh);
   std::string result;
   size_t last_non_zero_offset = 0;
   for (int level = 0; level < num_levels; level++) {
-    int f = NumTableFilesAtLevel(level, cf);
+    int f = NumTableFilesAtLevel(level, cfh, db);
     char buf[100];
     snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
     result += buf;
@@ -1334,12 +1346,14 @@ void DBTestBase::FillLevels(const std::string& smallest,
 }
 
 void DBTestBase::MoveFilesToLevel(int level, int cf) {
+  MoveFilesToLevel(level, cf ? handles_[cf] : db_->DefaultColumnFamily());
+}
+
+void DBTestBase::MoveFilesToLevel(int level, ColumnFamilyHandle* column_family,
+                                  DB* db) {
+  DBImpl* db_impl = db ? static_cast<DBImpl*>(db) : dbfull();
   for (int l = 0; l < level; ++l) {
-    if (cf > 0) {
-      EXPECT_OK(dbfull()->TEST_CompactRange(l, nullptr, nullptr, handles_[cf]));
-    } else {
-      EXPECT_OK(dbfull()->TEST_CompactRange(l, nullptr, nullptr));
-    }
+    EXPECT_OK(db_impl->TEST_CompactRange(l, nullptr, nullptr, column_family));
   }
 }
 
diff --git a/db/db_test_util.h b/db/db_test_util.h
index ea2ff609663a..168a6ebf0a07 100644
--- a/db/db_test_util.h
+++ b/db/db_test_util.h
@@ -1280,6 +1280,9 @@ class DBTestBase : public testing::Test {
 
   int NumTableFilesAtLevel(int level, int cf = 0);
 
+  int NumTableFilesAtLevel(int level, ColumnFamilyHandle* column_family,
+                           DB* db = nullptr);
+
   double CompressionRatioAtLevel(int level, int cf = 0);
 
   int TotalTableFiles(int cf = 0, int levels = -1);
@@ -1289,6 +1292,8 @@ class DBTestBase : public testing::Test {
   // Return spread of files per level
   std::string FilesPerLevel(int cf = 0);
 
+  std::string FilesPerLevel(ColumnFamilyHandle* cfh, DB* db = nullptr);
+
   size_t CountFiles();
 
   Status CountFiles(size_t* count);
@@ -1320,6 +1325,9 @@ class DBTestBase : public testing::Test {
 
   void MoveFilesToLevel(int level, int cf = 0);
 
+  void MoveFilesToLevel(int level, ColumnFamilyHandle* column_family,
+                        DB* db = nullptr);
+
   void DumpFileCounts(const char* label);
 
   std::string DumpSSTableList();
diff --git a/db/external_sst_file_ingestion_job.cc b/db/external_sst_file_ingestion_job.cc
index a1963b720937..fc14b6613c73 100644
--- a/db/external_sst_file_ingestion_job.cc
+++ b/db/external_sst_file_ingestion_job.cc
@@ -122,24 +122,28 @@ Status ExternalSstFileIngestionJob::Prepare(
     }
   }
 
-  if (ingestion_options_.ingest_behind && files_overlap_) {
-    return Status::NotSupported(
-        "Files with overlapping ranges cannot be ingested with ingestion "
-        "behind mode.");
-  }
+  if (files_overlap_) {
+    if (ingestion_options_.ingest_behind) {
+      return Status::NotSupported(
+          "Files with overlapping ranges cannot be ingested with ingestion "
+          "behind mode.");
+    }
 
-  // Overlapping files need at least two different sequence numbers. If settings
-  // disables global seqno, ingestion will fail anyway, so fail fast in prepare.
-  if (!ingestion_options_.allow_global_seqno && files_overlap_) {
-    return Status::InvalidArgument(
-        "Global seqno is required, but disabled (because external files key "
-        "range overlaps).");
-  }
+    // Overlapping files need at least two different sequence numbers. If
+    // settings disables global seqno, ingestion will fail anyway, so fail
+    // fast in prepare.
+    if (!ingestion_options_.allow_global_seqno &&
+        !ingestion_options_.allow_db_generated_files) {
+      return Status::InvalidArgument(
+          "Global seqno is required, but disabled (because external files key "
+          "range overlaps).");
+    }
 
-  if (ucmp_->timestamp_size() > 0 && files_overlap_) {
-    return Status::NotSupported(
-        "Files with overlapping ranges cannot be ingested to column "
-        "family with user-defined timestamp enabled.");
+    if (ucmp_->timestamp_size() > 0) {
+      return Status::NotSupported(
+          "Files with overlapping ranges cannot be ingested to column "
+          "family with user-defined timestamp enabled.");
+    }
   }
 
   // Copy/Move external files into DB
@@ -531,6 +535,8 @@ Status ExternalSstFileIngestionJob::Run() {
 
   // Find levels to ingest into
   std::optional<int> prev_batch_uppermost_level;
+  // batches at the front of file_batches_to_ingest_ contains older updates and
+  // are placed in smaller levels.
   for (auto& batch : file_batches_to_ingest_) {
     int batch_uppermost_level = 0;
     status = AssignLevelsForOneBatch(batch, super_version, force_global_seqno,
@@ -581,6 +587,15 @@ Status ExternalSstFileIngestionJob::AssignLevelsForOneBatch(
     if (!status.ok()) {
       return status;
     }
+
+    // If any ingested file overlaps with the DB, it will fail here.
+    if (ingestion_options_.allow_db_generated_files && assigned_seqno != 0) {
+      return Status::InvalidArgument(
+          "An ingested file overlaps with existing data in the DB and has been "
+          "assigned a non-zero sequence number, which is not allowed when "
+          "'allow_db_generated_files' is enabled.");
+    }
+
     if (smallest_parsed.sequence == 0 && assigned_seqno != 0) {
       UpdateInternalKey(file->smallest_internal_key.rep(), assigned_seqno,
                         smallest_parsed.type);
@@ -599,8 +614,8 @@ Status ExternalSstFileIngestionJob::AssignLevelsForOneBatch(
     assert(assigned_seqno == 0 || assigned_seqno == *last_seqno + 1);
     if (assigned_seqno > *last_seqno) {
       *last_seqno = assigned_seqno;
-      ++consumed_seqno_count_;
     }
+    max_assigned_seqno_ = std::max(max_assigned_seqno_, assigned_seqno);
 
     status = GenerateChecksumForIngestedFile(file);
     if (!status.ok()) {
@@ -623,15 +638,24 @@ Status ExternalSstFileIngestionJob::AssignLevelsForOneBatch(
         file->table_properties.num_range_deletions == 1 &&
         (file->table_properties.num_entries ==
          file->table_properties.num_range_deletions);
+    SequenceNumber smallest_seqno = file->assigned_seqno;
+    SequenceNumber largest_seqno = file->assigned_seqno;
+    if (ingestion_options_.allow_db_generated_files) {
+      assert(file->assigned_seqno == 0);
+      assert(file->smallest_seqno != kMaxSequenceNumber);
+      assert(file->largest_seqno != kMaxSequenceNumber);
+      smallest_seqno = file->smallest_seqno;
+      largest_seqno = file->largest_seqno;
+      max_assigned_seqno_ = std::max(max_assigned_seqno_, file->largest_seqno);
+    }
     FileMetaData f_metadata(
         file->fd.GetNumber(), file->fd.GetPathId(), file->fd.GetFileSize(),
-        file->smallest_internal_key, file->largest_internal_key,
-        file->assigned_seqno, file->assigned_seqno, false,
-        file->file_temperature, kInvalidBlobFileNumber, oldest_ancester_time,
-        current_time,
+        file->smallest_internal_key, file->largest_internal_key, smallest_seqno,
+        largest_seqno, false, file->file_temperature, kInvalidBlobFileNumber,
+        oldest_ancester_time, current_time,
         ingestion_options_.ingest_behind
             ? kReservedEpochNumberForFileIngestedBehind
-            : cfd_->NewEpochNumber(),
+            : cfd_->NewEpochNumber(),  // orders files ingested to L0
         file->file_checksum, file->file_checksum_func_name, file->unique_id, 0,
         tail_size, file->user_defined_timestamps_persisted);
     f_metadata.temperature = file->file_temperature;
@@ -778,7 +802,6 @@ void ExternalSstFileIngestionJob::Cleanup(const Status& status) {
     // We failed to add the files to the database
     // remove all the files we copied
     DeleteInternalFiles();
-    consumed_seqno_count_ = 0;
     files_overlap_ = false;
   } else if (status.ok() && ingestion_options_.move_files) {
     // The files were moved and added successfully, remove original file links
@@ -992,6 +1015,32 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo(
     return status;
   }
 
+  const bool allow_data_in_errors = db_options_.allow_data_in_errors;
+  ParsedInternalKey key;
+  if (ingestion_options_.allow_db_generated_files) {
+    // We are ingesting a DB generated SST file for which we don't reassign
+    // sequence numbers. We need its smallest sequence number and largest
+    // sequence number for FileMetaData.
+    Status seqno_status = GetSeqnoBoundaryForFile(
+        table_reader.get(), sv, file_to_ingest, allow_data_in_errors);
+
+    if (!seqno_status.ok()) {
+      return seqno_status;
+    }
+    assert(file_to_ingest->smallest_seqno <= file_to_ingest->largest_seqno);
+    assert(file_to_ingest->largest_seqno < kMaxSequenceNumber);
+  } else {
+    SequenceNumber largest_seqno =
+        table_reader.get()->GetTableProperties()->key_largest_seqno;
+    // UINT64_MAX means unknown and the file is generated before table property
+    // `key_largest_seqno` is introduced.
+    if (largest_seqno != UINT64_MAX && largest_seqno > 0) {
+      return Status::Corruption(
+          "External file has non zero largest sequence number " +
+          std::to_string(largest_seqno));
+    }
+  }
+
   if (ingestion_options_.verify_checksums_before_ingest) {
     // If customized readahead size is needed, we can pass a user option
     // all the way to here. Right now we just rely on the default readahead
@@ -1007,7 +1056,6 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo(
     }
   }
 
-  ParsedInternalKey key;
   // TODO: plumb Env::IOActivity, Env::IOPriority
   ReadOptions ro;
   ro.fill_cache = ingestion_options_.fill_cache;
@@ -1016,7 +1064,6 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo(
       /*skip_filters=*/false, TableReaderCaller::kExternalSSTIngestion));
 
   // Get first (smallest) and last (largest) key from file.
-  bool allow_data_in_errors = db_options_.allow_data_in_errors;
   iter->SeekToFirst();
   if (iter->Valid()) {
     Status pik_status =
@@ -1025,7 +1072,7 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo(
       return Status::Corruption("Corrupted key in external file. ",
                                 pik_status.getState());
     }
-    if (key.sequence != 0) {
+    if (key.sequence != 0 && !ingestion_options_.allow_db_generated_files) {
       return Status::Corruption("External file has non zero sequence number");
     }
     file_to_ingest->smallest_internal_key.SetFrom(key);
@@ -1062,41 +1109,13 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo(
       return Status::Corruption("Corrupted key in external file. ",
                                 pik_status.getState());
     }
-    if (key.sequence != 0) {
+    if (key.sequence != 0 && !ingestion_options_.allow_db_generated_files) {
       return Status::Corruption("External file has non zero sequence number");
     }
     file_to_ingest->largest_internal_key.SetFrom(key);
   } else if (!iter->status().ok()) {
     return iter->status();
   }
-  SequenceNumber largest_seqno =
-      table_reader.get()->GetTableProperties()->key_largest_seqno;
-  // UINT64_MAX means unknown and the file is generated before table property
-  // `key_largest_seqno` is introduced.
-  if (largest_seqno != UINT64_MAX && largest_seqno > 0) {
-    return Status::Corruption(
-        "External file has non zero largest sequence number " +
-        std::to_string(largest_seqno));
-  }
-  if (ingestion_options_.allow_db_generated_files &&
-      largest_seqno == UINT64_MAX) {
-    // Need to verify that all keys have seqno zero.
-    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
-      Status pik_status =
-          ParseInternalKey(iter->key(), &key, allow_data_in_errors);
-      if (!pik_status.ok()) {
-        return Status::Corruption("Corrupted key in external file. ",
-                                  pik_status.getState());
-      }
-      if (key.sequence != 0) {
-        return Status::NotSupported(
-            "External file has a key with non zero sequence number.");
-      }
-    }
-    if (!iter->status().ok()) {
-      return iter->status();
-    }
-  }
 
   std::unique_ptr<InternalIterator> range_del_iter(
       table_reader->NewRangeTombstoneIterator(ro));
@@ -1111,7 +1130,7 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo(
         return Status::Corruption("Corrupted key in external file. ",
                                   pik_status.getState());
       }
-      if (key.sequence != 0) {
+      if (key.sequence != 0 && !ingestion_options_.allow_db_generated_files) {
         return Status::Corruption(
             "External file has a range deletion with non zero sequence "
             "number.");
@@ -1159,12 +1178,14 @@ Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile(
   const size_t ts_sz = ucmp_->timestamp_size();
   assert(!prev_batch_uppermost_level.has_value() ||
          prev_batch_uppermost_level.value() < cfd_->NumberLevels());
-  bool must_assign_to_l0 = prev_batch_uppermost_level.has_value() &&
-                           prev_batch_uppermost_level.value() == 0;
-  if (force_global_seqno || files_overlap_ ||
-      compaction_style == kCompactionStyleFIFO || must_assign_to_l0) {
+  bool must_assign_to_l0 = (prev_batch_uppermost_level.has_value() &&
+                            prev_batch_uppermost_level.value() == 0) ||
+                           compaction_style == kCompactionStyleFIFO;
+
+  if (force_global_seqno || (!ingestion_options_.allow_db_generated_files &&
+                             (files_overlap_ || must_assign_to_l0))) {
     *assigned_seqno = last_seqno + 1;
-    if (compaction_style == kCompactionStyleFIFO || must_assign_to_l0) {
+    if (must_assign_to_l0) {
       assert(ts_sz == 0);
       file_to_ingest->picked_level = 0;
       if (ingestion_options_.fail_if_not_bottommost_level &&
@@ -1185,16 +1206,26 @@ Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile(
   ro.total_order_seek = true;
   int target_level = 0;
   auto* vstorage = cfd_->current()->storage_info();
-  assert(!must_assign_to_l0);
-  int exclusive_end_level = prev_batch_uppermost_level.has_value()
-                                ? prev_batch_uppermost_level.value()
-                                : cfd_->NumberLevels();
+  assert(!must_assign_to_l0 || ingestion_options_.allow_db_generated_files);
+  int assigned_level_exclusive_end = cfd_->NumberLevels();
+  if (must_assign_to_l0) {
+    assigned_level_exclusive_end = 0;
+  } else if (prev_batch_uppermost_level.has_value()) {
+    assigned_level_exclusive_end = prev_batch_uppermost_level.value();
+  }
 
-  for (int lvl = 0; lvl < exclusive_end_level; lvl++) {
+  // When ingesting db generated files, we require that ingested files do not
+  // overlap with any file in the DB. So we need to check all levels.
+  int overlap_checking_exclusive_end =
+      ingestion_options_.allow_db_generated_files
+          ? cfd_->NumberLevels()
+          : assigned_level_exclusive_end;
+  for (int lvl = 0; lvl < overlap_checking_exclusive_end; lvl++) {
     if (lvl > 0 && lvl < vstorage->base_level()) {
       continue;
     }
-    if (atomic_replace_range_.has_value()) {
+    if (lvl < assigned_level_exclusive_end &&
+        atomic_replace_range_.has_value()) {
       target_level = lvl;
       continue;
     }
@@ -1225,7 +1256,8 @@ Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile(
 
     // We don't overlap with any keys in this level, but we still need to check
     // if our file can fit in it
-    if (IngestedFileFitInLevel(file_to_ingest, lvl)) {
+    if (lvl < assigned_level_exclusive_end &&
+        IngestedFileFitInLevel(file_to_ingest, lvl)) {
       target_level = lvl;
     }
   }
@@ -1234,8 +1266,9 @@ Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile(
       target_level < cfd_->NumberLevels() - 1) {
     status = Status::TryAgain(
         "Files cannot be ingested to Lmax. Please make sure key range of Lmax "
-        "and ongoing compaction's output to Lmax"
-        "does not overlap with files to ingest.");
+        "and ongoing compaction's output to Lmax does not overlap with files "
+        "to ingest. Input files overlapping with each other can cause some "
+        "file to be assigned to non Lmax level.");
     return status;
   }
 
@@ -1256,11 +1289,6 @@ Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile(
     }
   }
 
-  if (ingestion_options_.allow_db_generated_files && *assigned_seqno != 0) {
-    return Status::InvalidArgument(
-        "An ingested file is assigned to a non-zero sequence number, which is "
-        "incompatible with ingestion option allow_db_generated_files.");
-  }
   return status;
 }
 
@@ -1295,8 +1323,12 @@ Status ExternalSstFileIngestionJob::CheckLevelForIngestedBehindFile(
 
 Status ExternalSstFileIngestionJob::AssignGlobalSeqnoForIngestedFile(
     IngestedFileInfo* file_to_ingest, SequenceNumber seqno) {
+  if (ingestion_options_.allow_db_generated_files) {
+    assert(seqno == 0);
+    assert(file_to_ingest->original_seqno == 0);
+  }
   if (file_to_ingest->original_seqno == seqno) {
-    // This file already have the correct global seqno
+    // This file already has the correct global seqno.
     return Status::OK();
   } else if (!ingestion_options_.allow_global_seqno) {
     return Status::InvalidArgument("Global seqno is required, but disabled");
@@ -1410,4 +1442,86 @@ Status ExternalSstFileIngestionJob::SyncIngestedFile(TWritableFile* file) {
   }
 }
 
+Status ExternalSstFileIngestionJob::GetSeqnoBoundaryForFile(
+    TableReader* table_reader, SuperVersion* sv,
+    IngestedFileInfo* file_to_ingest, bool allow_data_in_errors) {
+  const bool has_largest_seqno =
+      table_reader->GetTableProperties()->HasKeyLargestSeqno();
+  SequenceNumber largest_seqno =
+      table_reader->GetTableProperties()->key_largest_seqno;
+  if (has_largest_seqno && largest_seqno == 0) {
+    file_to_ingest->largest_seqno = 0;
+    file_to_ingest->smallest_seqno = 0;
+    return Status::OK();
+  }
+  // The following file scan is only executed when ingesting files with
+  // non-zero seqno.
+  // TODO: record smallest_seqno in table properties to avoid the
+  // file scan here.
+  SequenceNumber smallest_seqno = kMaxSequenceNumber;
+
+  SequenceNumber largest_seqno_from_iter = 0;
+  ReadOptions ro;
+  ro.fill_cache = ingestion_options_.fill_cache;
+  std::unique_ptr<InternalIterator> iter(table_reader->NewIterator(
+      ro, sv->mutable_cf_options.prefix_extractor.get(), /*arena=*/nullptr,
+      /*skip_filters=*/false, TableReaderCaller::kExternalSSTIngestion));
+  ParsedInternalKey key;
+  iter->SeekToFirst();
+  while (iter->Valid()) {
+    Status pik_status =
+        ParseInternalKey(iter->key(), &key, allow_data_in_errors);
+    if (!pik_status.ok()) {
+      return Status::Corruption("Corrupted key in external file. ",
+                                pik_status.getState());
+    }
+    smallest_seqno = std::min(smallest_seqno, key.sequence);
+    largest_seqno_from_iter = std::max(largest_seqno_from_iter, key.sequence);
+    iter->Next();
+  }
+  if (!iter->status().ok()) {
+    return iter->status();
+  }
+
+  if (table_reader->GetTableProperties()->num_range_deletions > 0) {
+    std::unique_ptr<InternalIterator> range_del_iter(
+        table_reader->NewRangeTombstoneIterator(ro));
+    if (range_del_iter != nullptr) {
+      for (range_del_iter->SeekToFirst(); range_del_iter->Valid();
+           range_del_iter->Next()) {
+        Status pik_status =
+            ParseInternalKey(range_del_iter->key(), &key, allow_data_in_errors);
+        if (!pik_status.ok()) {
+          return Status::Corruption("Corrupted key in external file. ",
+                                    pik_status.getState());
+        }
+        smallest_seqno = std::min(smallest_seqno, key.sequence);
+        largest_seqno_from_iter =
+            std::max(largest_seqno_from_iter, key.sequence);
+      }
+      if (!range_del_iter->status().ok()) {
+        return range_del_iter->status();
+      }
+    }
+  }
+
+  file_to_ingest->smallest_seqno = smallest_seqno;
+  if (!has_largest_seqno) {
+    file_to_ingest->largest_seqno = largest_seqno_from_iter;
+  } else {
+    assert(largest_seqno == largest_seqno_from_iter);
+    file_to_ingest->largest_seqno = largest_seqno;
+  }
+
+  if (file_to_ingest->largest_seqno == kMaxSequenceNumber) {
+    return Status::InvalidArgument(
+        "Unknown smallest seqno for db generated file.");
+  }
+  if (file_to_ingest->smallest_seqno == kMaxSequenceNumber) {
+    return Status::InvalidArgument(
+        "Unknown largest seqno for db generated file.");
+  }
+  return Status::OK();
+}
+
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/db/external_sst_file_ingestion_job.h b/db/external_sst_file_ingestion_job.h
index d97fac31e6e0..d9ecf43da1b4 100644
--- a/db/external_sst_file_ingestion_job.h
+++ b/db/external_sst_file_ingestion_job.h
@@ -180,6 +180,9 @@ struct IngestedFileInfo : public KeyRangeInfo {
   // the user key's format in the external file matches the column family's
   // setting.
   bool user_defined_timestamps_persisted = true;
+
+  SequenceNumber largest_seqno = kMaxSequenceNumber;
+  SequenceNumber smallest_seqno = kMaxSequenceNumber;
 };
 
 // A batch of files.
@@ -230,7 +233,7 @@ class ExternalSstFileIngestionJob {
         directories_(directories),
         event_logger_(event_logger),
         job_start_time_(clock_->NowMicros()),
-        consumed_seqno_count_(0),
+        max_assigned_seqno_(0),
         io_tracer_(io_tracer) {
     assert(directories != nullptr);
     assert(cfd_);
@@ -287,8 +290,16 @@ class ExternalSstFileIngestionJob {
     return files_to_ingest_;
   }
 
-  // How many sequence numbers did we consume as part of the ingestion job?
-  int ConsumedSequenceNumbersCount() const { return consumed_seqno_count_; }
+  // Return the maximum assigned sequence number for all files in this job.
+  // When allow_db_generated_files = false, we may assign global sequence
+  // numbers to ingested files. The global sequence numbers are sequence numbers
+  // following versions_->LastSequence().
+  // When allow_db_generated_files = true, we ingest files that already have
+  // sequence numbers assigned. max_assigned_seqno_ will be the max sequence
+  // number among ingested files.
+  SequenceNumber MaxAssignedSequenceNumber() const {
+    return max_assigned_seqno_;
+  }
 
  private:
   Status ResetTableReader(const std::string& external_file,
@@ -369,6 +380,13 @@ class ExternalSstFileIngestionJob {
   template <typename TWritableFile>
   Status SyncIngestedFile(TWritableFile* file);
 
+  // Helper function to obtain the smallest and largest sequence number from a
+  // file. When OK is returned, file_to_ingest->smallest_seqno and
+  // file_to_ingest->largest_seqno will be updated.
+  Status GetSeqnoBoundaryForFile(TableReader* table_reader, SuperVersion* sv,
+                                 IngestedFileInfo* file_to_ingest,
+                                 bool allow_data_in_errors);
+
   // Create equivalent `Compaction` objects to this file ingestion job
   // , which will be used to check range conflict with other ongoing
   // compactions.
@@ -395,7 +413,7 @@ class ExternalSstFileIngestionJob {
   EventLogger* event_logger_;
   VersionEdit edit_;
   uint64_t job_start_time_;
-  int consumed_seqno_count_;
+  SequenceNumber max_assigned_seqno_;
   // Set in ExternalSstFileIngestionJob::Prepare(), if true all files are
   // ingested in L0
   bool files_overlap_{false};
diff --git a/db/external_sst_file_test.cc b/db/external_sst_file_test.cc
index 55befde4cab7..6d8c56ca6190 100644
--- a/db/external_sst_file_test.cc
+++ b/db/external_sst_file_test.cc
@@ -7,6 +7,7 @@
 
 #include <functional>
 #include <memory>
+#include <sstream>
 
 #include "db/db_test_util.h"
 #include "db/dbformat.h"
@@ -3853,99 +3854,32 @@ TEST_P(IngestDBGeneratedFileTest, FailureCase) {
       ASSERT_OK(Put(1, Key(k), "cf1_" + Key(k)));
     }
     ASSERT_OK(Flush(/*cf=*/1));
-    {
-      // Verify that largest key of the file has non-zero seqno.
-      std::vector<std::vector<FileMetaData>> metadata;
-      dbfull()->TEST_GetFilesMetaData(handles_[1], &metadata, nullptr);
-      const FileMetaData& file = metadata[0][0];
-      ValueType vtype;
-      SequenceNumber seq;
-      UnPackSequenceAndType(ExtractInternalKeyFooter(file.largest.Encode()),
-                            &seq, &vtype);
-      ASSERT_GE(seq, 0);
-    }
-    std::vector<LiveFileMetaData> live_meta;
-    db_->GetLiveFilesMetaData(&live_meta);
-    ASSERT_EQ(live_meta.size(), 1);
-    std::vector<std::string> to_ingest_files;
-    to_ingest_files.emplace_back(live_meta[0].directory + "/" +
-                                 live_meta[0].relative_filename);
-    // Ingesting a file whose boundary key has non-zero seqno.
-    Status s = db_->IngestExternalFile(to_ingest_files, ingest_opts);
-    // This error msg is from checking seqno of boundary keys.
-    ASSERT_TRUE(
-        s.ToString().find("External file has non zero sequence number") !=
-        std::string::npos);
-    ASSERT_NOK(s);
-
-    {
-      // Only non-boundary key with non-zero seqno.
-      const Snapshot* snapshot = db_->GetSnapshot();
-      ASSERT_OK(Put(1, Key(70), "cf1_" + Key(70)));
-      ASSERT_OK(Flush(1));
-      CompactRangeOptions cro;
-      cro.bottommost_level_compaction =
-          BottommostLevelCompaction::kForceOptimized;
-      ASSERT_OK(db_->CompactRange(cro, handles_[1], nullptr, nullptr));
-
-      // Verify that only the non-boundary key of the file has non-zero seqno.
-      std::vector<std::vector<FileMetaData>> metadata;
-      // File may be at different level for different options.
-      dbfull()->TEST_GetFilesMetaData(handles_[1], &metadata, nullptr);
-      bool found_file = false;
-      for (const auto& level : metadata) {
-        if (level.empty()) {
-          continue;
-        }
-        ASSERT_FALSE(found_file);
-        found_file = true;
-        ASSERT_EQ(1, level.size());
-        const FileMetaData& file = level[0];
-        ValueType vtype;
-        SequenceNumber seq;
-        UnPackSequenceAndType(ExtractInternalKeyFooter(file.largest.Encode()),
-                              &seq, &vtype);
-        ASSERT_EQ(seq, 0);
-        UnPackSequenceAndType(ExtractInternalKeyFooter(file.smallest.Encode()),
-                              &seq, &vtype);
-        ASSERT_EQ(seq, 0);
-        ASSERT_GT(file.fd.largest_seqno, 0);
-      }
-      ASSERT_TRUE(found_file);
-      live_meta.clear();
-      db_->GetLiveFilesMetaData(&live_meta);
-      ASSERT_EQ(live_meta.size(), 1);
-      to_ingest_files[0] =
-          live_meta[0].directory + "/" + live_meta[0].relative_filename;
-      s = db_->IngestExternalFile(to_ingest_files, ingest_opts);
-      ASSERT_NOK(s);
-      // This error msg is from checking largest seqno in table property.
-      ASSERT_TRUE(s.ToString().find("non zero largest sequence number") !=
-                  std::string::npos);
-      db_->ReleaseSnapshot(snapshot);
-    }
 
+    Status s;
     CompactRangeOptions cro;
     cro.bottommost_level_compaction =
         BottommostLevelCompaction::kForceOptimized;
     ASSERT_OK(db_->CompactRange(cro, handles_[1], nullptr, nullptr));
-    live_meta.clear();
+
+    std::vector<LiveFileMetaData> live_meta;
+    std::vector<std::string> to_ingest_files;
     db_->GetLiveFilesMetaData(&live_meta);
     ASSERT_EQ(live_meta.size(), 1);
+    ASSERT_EQ(live_meta[0].column_family_name, "toto");
     ASSERT_EQ(0, live_meta[0].largest_seqno);
-    to_ingest_files[0] =
-        live_meta[0].directory + "/" + live_meta[0].relative_filename;
+    to_ingest_files.emplace_back(live_meta[0].directory + "/" +
+                                 live_meta[0].relative_filename);
 
+    // Ingesting a DB generated file with allow_db_generated_files = false
     ingest_opts.allow_db_generated_files = false;
-    // Ingesting a DB genrate file with allow_db_generated_files = false;
     s = db_->IngestExternalFile(to_ingest_files, ingest_opts);
     ASSERT_TRUE(s.ToString().find("External file version not found") !=
                 std::string::npos);
     ASSERT_NOK(s);
 
     const std::string err =
-        "An ingested file is assigned to a non-zero sequence number, which is "
-        "incompatible with ingestion option allow_db_generated_files";
+        "An ingested file overlaps with existing data in the DB and has been "
+        "assigned a non-zero sequence number";
     ingest_opts.allow_db_generated_files = true;
     s = db_->IngestExternalFile(to_ingest_files, ingest_opts);
     ASSERT_TRUE(s.ToString().find(err) != std::string::npos);
@@ -4146,6 +4080,440 @@ TEST_P(IngestDBGeneratedFileTest2, NotOverlapWithDB) {
     }
   } while (ChangeOptions(kSkipPlainTable | kSkipFIFOCompaction));
 }
+
+TEST_P(IngestDBGeneratedFileTest2, NonZeroSeqno) {
+  // Test ingestion of DB-generated SST files that contain non-zero sequence
+  // numbers.
+  IngestExternalFileOptions ingest_opts;
+  ingest_opts.allow_db_generated_files = true;
+  // This only works since we are ingesting without snapshot
+  // Failure case will be tested below.
+  ingest_opts.snapshot_consistency = std::get<0>(GetParam());
+  ingest_opts.allow_global_seqno = std::get<1>(GetParam());
+  ingest_opts.allow_blocking_flush = std::get<2>(GetParam());
+  ingest_opts.fail_if_not_bottommost_level = std::get<3>(GetParam());
+  ingest_opts.link_files = std::get<4>(GetParam());
+  Random* rnd = Random::GetTLSInstance();
+
+  do {
+    SCOPED_TRACE("option_config_ = " + std::to_string(option_config_));
+
+    Options options = CurrentOptions();
+    options.allow_concurrent_memtable_write =
+        false;  // Required for VectorRepFactory
+    CreateAndReopenWithCF({"non_overlap", "overlap"}, options);
+
+    ColumnFamilyHandle* non_overlap_cf = handles_[1];
+    ColumnFamilyHandle* overlap_cf = handles_[2];
+
+    std::vector<std::string> expected_values;
+    expected_values.resize(100);
+    WriteOptions wo;
+    // Setup target CF with non-overlapping base data Key1 and Key99
+    // Will ingest keys [1, 98] below.
+    expected_values[0] = rnd->RandomString(100);
+    ASSERT_OK(db_->Put(wo, non_overlap_cf, Key(0), expected_values[0]));
+    ASSERT_OK(db_->Flush({}, non_overlap_cf));
+    expected_values[99] = rnd->RandomString(100);
+    ASSERT_OK(db_->Put(wo, non_overlap_cf, Key(99), expected_values[99]));
+
+    // Set up overlapping cf
+    ASSERT_OK(db_->Put(wo, overlap_cf, Key(50), rnd->RandomString(100)));
+
+    // Create temp CF/DB
+    Options temp_cf_opts;
+    ColumnFamilyHandle* temp_cfh = nullptr;
+    DB* from_db = nullptr;
+    std::string temp_db_name;
+    // Using a separate DB also validates that latest sequence number
+    // of target db is updated after ingestion (to the max sequence number
+    // in ingested files).
+    const bool use_temp_db = rnd->OneIn(2);
+    SCOPED_TRACE("use_temp_db: " + std::to_string(use_temp_db));
+
+    std::vector<std::string> sst_file_paths;
+    // optional L5: files in key range [70, 98]
+    // L6: files in key range [1, 79]
+    temp_cf_opts.target_file_size_base =
+        4 << 10;  // Small files to create multiple SSTs
+    temp_cf_opts.num_levels = 7;
+    temp_cf_opts.disable_auto_compactions = true;  // Manually set up LSM
+    temp_cf_opts.env = options.env;
+
+    if (use_temp_db) {
+      temp_cf_opts.create_if_missing = true;
+      temp_db_name = dbname_ + "/temp_db_" + std::to_string(rnd->Next());
+      ASSERT_OK(DB::Open(temp_cf_opts, temp_db_name, &from_db));
+      temp_cfh = from_db->DefaultColumnFamily();
+    } else {
+      from_db = db_;
+      ASSERT_OK(
+          from_db->CreateColumnFamily(temp_cf_opts, "temp_cf", &temp_cfh));
+    }
+
+    // Use snapshot to ensure non-zero sequence numbers after compaction
+    const Snapshot* snapshot = from_db->GetSnapshot();
+
+    for (int k = 1; k < 99; ++k) {
+      expected_values[k] = rnd->RandomString(500);
+      ASSERT_OK(from_db->Put(wo, temp_cfh, Key(k), expected_values[k]));
+    }
+    ASSERT_OK(from_db->Flush({}, temp_cfh));
+    CompactRangeOptions cro;
+    cro.bottommost_level_compaction =
+        BottommostLevelCompaction::kForceOptimized;
+    ASSERT_OK(from_db->CompactRange(cro, temp_cfh, nullptr, nullptr));
+
+    ASSERT_GT(NumTableFilesAtLevel(6, temp_cfh, from_db), 1);
+
+    const bool multi_level_ingestion = rnd->OneIn(2);
+    SCOPED_TRACE("Multi-level ingestion: " +
+                 std::to_string(multi_level_ingestion));
+    if (multi_level_ingestion) {
+      for (int k = 80; k < 99; ++k) {
+        expected_values[k] = rnd->RandomString(500);
+        ASSERT_OK(from_db->Put(wo, temp_cfh, Key(k), expected_values[k]));
+      }
+      ASSERT_OK(from_db->Flush({}, temp_cfh));
+
+      // Do some overwrites, and overlap with previous L0 to avoid trivial move
+      for (int k = 70; k < 82; ++k) {
+        expected_values[k] = rnd->RandomString(500);
+        ASSERT_OK(from_db->Put(wo, temp_cfh, Key(k), expected_values[k]));
+      }
+      ASSERT_OK(from_db->Flush({}, temp_cfh));
+
+      if (rnd->OneIn(2)) {
+        MoveFilesToLevel(5, temp_cfh, from_db);
+        ASSERT_GT(NumTableFilesAtLevel(5, temp_cfh, from_db), 0);
+      }
+      ASSERT_GT(NumTableFilesAtLevel(6, temp_cfh, from_db), 0);
+    }
+    SCOPED_TRACE("LSM of from_db " + FilesPerLevel(temp_cfh, from_db));
+
+    ColumnFamilyMetaData cf_meta;
+    from_db->GetColumnFamilyMetaData(temp_cfh, &cf_meta);
+
+    // Iterate in reverse since IngestExternalFiles expect files to be ordered
+    // from old to new
+    for (auto level_meta = cf_meta.levels.rbegin();
+         level_meta != cf_meta.levels.rend(); ++level_meta) {
+      // L0 files need to be added in reverse order.
+      for (auto file_meta = level_meta->files.rbegin();
+           file_meta != level_meta->files.rend(); ++file_meta) {
+        // Validate that files contain non-zero sequence numbers
+        ASSERT_GT(file_meta->smallest_seqno, 0);
+        ASSERT_GE(file_meta->largest_seqno, file_meta->smallest_seqno);
+        sst_file_paths.emplace_back(file_meta->directory + "/" +
+                                    file_meta->relative_filename);
+      }
+    }
+    from_db->ReleaseSnapshot(snapshot);
+
+    Status s;
+    // Perform ingestion and validate results
+    if (multi_level_ingestion && options.num_levels > 1) {
+      // fail_if_bottommost requres ingesting all files into the last level,
+      // so it fails if we are assiging files to multiple levels.
+      ingest_opts.fail_if_not_bottommost_level = true;
+      s = db_->IngestExternalFile(non_overlap_cf, sst_file_paths, ingest_opts);
+      ASSERT_NOK(s);
+      ASSERT_TRUE(s.ToString().find("Files cannot be ingested to Lmax") !=
+                  std::string::npos);
+      ingest_opts.fail_if_not_bottommost_level = false;
+    }
+    if (ingest_opts.snapshot_consistency) {
+      // snapshot_consisteny requires global sequence number assignment to
+      // ingested files if there is any live snapshot.
+      snapshot = db_->GetSnapshot();
+      s = db_->IngestExternalFile(non_overlap_cf, sst_file_paths, ingest_opts);
+      ASSERT_NOK(s);
+      ASSERT_TRUE(s.ToString().find(
+          "An ingested file overlaps with existing data in the DB and has been "
+          "assigned a non-zero sequence number"));
+      db_->ReleaseSnapshot(snapshot);
+    }
+    ASSERT_OK(
+        db_->IngestExternalFile(non_overlap_cf, sst_file_paths, ingest_opts));
+
+    // Validate ingested data.
+    ReadOptions ro;
+    std::string val;
+    for (int k = 0; k < 100; ++k) {
+      s = db_->Get(ro, handles_[1], Key(k), &val);
+      ASSERT_OK(s) << "Should find ingested key " << Key(k);
+      ASSERT_EQ(val, expected_values[k]) << "key: " << Key(k);
+    }
+
+    // Overlap with data in the CF
+    if (ingest_opts.allow_blocking_flush) {
+      s = db_->IngestExternalFile(overlap_cf, sst_file_paths, ingest_opts);
+
+      ASSERT_NOK(s);
+      ASSERT_TRUE(s.ToString().find("An ingested file overlaps with existing "
+                                    "data in the DB and has been "
+                                    "assigned a non-zero sequence number") !=
+                  std::string::npos)
+          << s.ToString();
+    }
+
+    // Cleanup
+    // FIXME: Without this, the test triggers some data race between dropping
+    // CF and background compaction.
+    ASSERT_OK(db_->WaitForCompact({}));
+    if (use_temp_db) {
+      ASSERT_OK(from_db->Close());
+      delete from_db;
+      ASSERT_OK(DestroyDB(temp_db_name, temp_cf_opts));
+    } else {
+      ASSERT_OK(db_->DropColumnFamily(temp_cfh));
+      ASSERT_OK(db_->DestroyColumnFamilyHandle(temp_cfh));
+    }
+  } while (ChangeOptions(kSkipPlainTable | kSkipFIFOCompaction));
+}
+
+std::string GenSecondaryKey(const std::string& pk, const std::string& val) {
+  return "index_" + val + "_" + pk;
+};
+
+TEST_P(IngestDBGeneratedFileTest2, ZeroAndNonZeroSeqno) {
+  // Test ingestion of SST files with zero and with non-zero sequence numbers.
+  // Generate data using a temp CF and a temp DB:
+  // 1. Temp CF with cf_allow_ingest_behind enabled to preserve non-zero seqno.
+  // 2. Temp DB with everything compacted to have zero seqno.
+  // Then ingest both types of files together into a target CF.
+  // This mimics a user case where temp DB contains data read from a
+  // snapshot while temp CF contains live writes after a snapshot is taken.
+  IngestExternalFileOptions ingest_opts;
+  ingest_opts.allow_db_generated_files = true;
+  ingest_opts.snapshot_consistency = std::get<0>(GetParam());
+  ingest_opts.allow_global_seqno = std::get<1>(GetParam());
+  ingest_opts.allow_blocking_flush = std::get<2>(GetParam());
+  ingest_opts.fail_if_not_bottommost_level = std::get<3>(GetParam());
+  ingest_opts.link_files = std::get<4>(GetParam());
+
+  Random* rnd = Random::GetTLSInstance();
+
+  do {
+    SCOPED_TRACE("option_config_ = " + std::to_string(option_config_));
+    Options options = CurrentOptions();
+    options.allow_concurrent_memtable_write = false;
+    // Force more flushes/compactions and more files to be generated
+    options.target_file_size_base = 1 << 10;     // 1KB
+    options.max_bytes_for_level_base = 2 << 10;  // 2KB
+    options.max_bytes_for_level_multiplier = 2;
+    options.level0_file_num_compaction_trigger = 2;
+    options.level_compaction_dynamic_level_bytes = true;
+    DestroyAndReopen(options);
+    CreateAndReopenWithCF({"target_cf"}, options);
+    auto* target_cfh = handles_[1];
+
+    Options live_write_cf_opts = options;
+    live_write_cf_opts.memtable_factory.reset(new VectorRepFactory());
+    live_write_cf_opts.compaction_style = kCompactionStyleUniversal;
+    live_write_cf_opts.cf_allow_ingest_behind = true;
+    live_write_cf_opts.num_levels = 50;
+    ColumnFamilyHandle* live_write_cfh;
+    ASSERT_OK(db_->CreateColumnFamily(live_write_cf_opts, "live_write_cf",
+                                      &live_write_cfh));
+
+    // Expected value and key
+    std::map<std::string, std::string> expected;
+    std::unordered_set<std::string> deleted;
+    std::stringstream debug_info;
+
+    // Setup base data in target CF, will ingest keys with different prefixes
+    // so they don't overlap with the base data.
+    WriteOptions wo;
+    for (int k = 0; k < 100; ++k) {
+      int random_val = rnd->Uniform(20);
+      expected[Key(k)] = std::to_string(random_val);
+      ASSERT_OK(db_->Put(wo, target_cfh, Key(k), expected[Key(k)]));
+
+      // Force flush every 20 keys to create multiple SST files
+      if (rnd->OneIn(20)) {
+        ASSERT_OK(db_->Flush({}, target_cfh));
+        debug_info << "Flush after " << k
+                   << ", LSM state: " << FilesPerLevel(target_cfh) << "\n";
+      }
+    }
+
+    // Temp DB for snapshot data
+    Options temp_db_opts;
+    temp_db_opts.create_if_missing = true;
+    temp_db_opts.target_file_size_base = 1 << 10;
+    temp_db_opts.write_buffer_size = 1 << 10;
+    temp_db_opts.memtable_factory.reset(new VectorRepFactory());
+    temp_db_opts.allow_concurrent_memtable_write = false;
+    temp_db_opts.compaction_style = kCompactionStyleUniversal;
+    temp_db_opts.env = env_;
+    temp_db_opts.num_levels = 7;
+
+    std::string temp_db_name =
+        dbname_ + "/temp_db_" + std::to_string(rnd->Next());
+    DB* temp_db = nullptr;
+    ASSERT_OK(DB::Open(temp_db_opts, temp_db_name, &temp_db));
+
+    const Snapshot* snapshot = db_->GetSnapshot();
+    ReadOptions ro;
+    ro.snapshot = snapshot;
+    ro.total_order_seek = true;
+    std::unique_ptr<Iterator> iter{db_->NewIterator(ro, target_cfh)};
+    // transform data read from snapshot and write to temp DB
+    // Varying the number of files in temp DB.
+    const int kValSize = rnd->Uniform(200);
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      std::string key = iter->key().ToString();
+      std::string value = iter->value().ToString();
+      std::string sk = GenSecondaryKey(key, value);
+      // Usually value is empty, here we use a larger value to generate
+      // multiple SST files in temp_db.
+      std::string sk_val = rnd->RandomString(kValSize);
+      ASSERT_OK(temp_db->Put(wo, sk, sk_val));
+      expected[sk] = sk_val;
+      debug_info << "Snapshot data: " << sk << " -> \n";
+    }
+    ASSERT_OK(iter->status());
+
+    // Do some live writes into target CF and live write CF.
+    for (int i = 0; i < 10; ++i) {
+      WriteBatch wb;
+      for (int j = 0; j < 5; ++j) {
+        std::string key = Key(rnd->Uniform(100));
+        std::string old_val = expected[key];
+        // Value range is 0-19, allow some PK to have the same value.
+        int random_val = rnd->Uniform(20);
+        std::string new_val = std::to_string(random_val);
+        std::string old_index_key = GenSecondaryKey(key, old_val);
+        std::string new_index_key = GenSecondaryKey(key, new_val);
+        ASSERT_OK(wb.SingleDelete(live_write_cfh, old_index_key));
+        std::string sk_val = rnd->RandomString(kValSize);
+        ASSERT_OK(wb.Put(live_write_cfh, new_index_key, sk_val));
+        ASSERT_OK(wb.Put(target_cfh, key, new_val));
+        expected[key] = new_val;
+        expected.erase(old_index_key);
+        expected[new_index_key] = sk_val;
+        deleted.insert(old_index_key);
+        deleted.erase(new_index_key);
+
+        debug_info << "Live write: SD " << old_index_key << "\n";
+        debug_info << "Live write: " << key << " -> " << new_val << "\n";
+        debug_info << "Live write: " << new_index_key << " -> \n";
+      }
+      ASSERT_OK(db_->Write(wo, &wb));
+      if (rnd->OneIn(3)) {
+        debug_info << "Flush after " << i << " live writes\n";
+        ASSERT_OK(db_->Flush({}, live_write_cfh));
+      }
+    }
+    iter.reset();
+    db_->ReleaseSnapshot(snapshot);
+
+    // Compact temp_db to ensure zero sequence numbers
+    CompactRangeOptions cro;
+    cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+    ASSERT_OK(temp_db->CompactRange(cro, nullptr, nullptr));
+    SCOPED_TRACE("Temp DB LSM: " +
+                 FilesPerLevel(temp_db->DefaultColumnFamily(), temp_db));
+
+    // Base data from snapshot
+    std::vector<std::string> sst_file_paths_zero_seqno;
+
+    // Collect SST file paths with zero sequence numbers
+    ASSERT_OK(temp_db->DisableFileDeletions());
+    ColumnFamilyMetaData cf_meta_temp_db;
+    temp_db->GetColumnFamilyMetaData(&cf_meta_temp_db);
+    for (const auto& level_meta : cf_meta_temp_db.levels) {
+      if (level_meta.level == 6) {
+        for (const auto& file_meta : level_meta.files) {
+          // Verify files have zero sequence numbers
+          ASSERT_EQ(0, file_meta.largest_seqno)
+              << "File " << file_meta.relative_filename
+              << " should have zero sequence number\n"
+              << debug_info.str();
+          sst_file_paths_zero_seqno.emplace_back(file_meta.directory + "/" +
+                                                 file_meta.relative_filename);
+        }
+      } else {
+        // All files should be in L6
+        ASSERT_EQ(0, level_meta.files.size()) << debug_info.str();
+      }
+    }
+
+    // Flush remaining catch up writes in memtable
+    ASSERT_OK(db_->Flush({}, live_write_cfh));
+    SCOPED_TRACE("LSM of live write cfh " + FilesPerLevel(live_write_cfh));
+    // Collect SST file paths with non-zero sequence numbers
+    ColumnFamilyMetaData live_write_cf_meta;
+    ASSERT_OK(db_->DisableFileDeletions());
+    db_->GetColumnFamilyMetaData(live_write_cfh, &live_write_cf_meta);
+
+    // Live writes after snapshot
+    std::vector<std::string> sst_file_paths_nonzero_seqno;
+    for (auto level_meta = live_write_cf_meta.levels.rbegin();
+         level_meta != live_write_cf_meta.levels.rend(); ++level_meta) {
+      // Reverse order is important for L0, where recent updates are ordered
+      // first
+      for (auto file_meta = level_meta->files.rbegin();
+           file_meta != level_meta->files.rend(); ++file_meta) {
+        sst_file_paths_nonzero_seqno.emplace_back(file_meta->directory + "/" +
+                                                  file_meta->relative_filename);
+        ASSERT_GT(file_meta->smallest_seqno, 0) << debug_info.str();
+      }
+      if (level_meta->level == 49) {
+        // Ingest behind does not compact to the last level
+        ASSERT_EQ(level_meta->files.size(), 0) << debug_info.str();
+      }
+    }
+
+    ASSERT_GT(sst_file_paths_zero_seqno.size(), 0) << debug_info.str();
+    ASSERT_GT(sst_file_paths_nonzero_seqno.size(), 0) << debug_info.str();
+
+    // Combine all SST file paths.
+    // File ingestion takes files from old to new.
+    std::vector<std::string> all_sst_files;
+    all_sst_files.insert(all_sst_files.end(), sst_file_paths_zero_seqno.begin(),
+                         sst_file_paths_zero_seqno.end());
+    all_sst_files.insert(all_sst_files.end(),
+                         sst_file_paths_nonzero_seqno.begin(),
+                         sst_file_paths_nonzero_seqno.end());
+    if (ingest_opts.fail_if_not_bottommost_level && options.num_levels > 1) {
+      // overlapping files will be ingested into different levels, including non
+      // Lmax
+      Status s =
+          db_->IngestExternalFile(target_cfh, all_sst_files, ingest_opts);
+      ASSERT_NOK(s);
+      ASSERT_TRUE(s.ToString().find("Files cannot be ingested to Lmax") !=
+                  std::string::npos);
+    } else {
+      ASSERT_OK(
+          db_->IngestExternalFile(target_cfh, all_sst_files, ingest_opts));
+
+      debug_info << "Zero seqno files: " << sst_file_paths_zero_seqno.size()
+                 << "\nNon-zero seqno files: "
+                 << sst_file_paths_nonzero_seqno.size() << "\n";
+
+      SCOPED_TRACE("Debug info:\n" + debug_info.str());
+      VerifyDBFromMap(expected, nullptr, false, nullptr, target_cfh, &deleted);
+    }
+
+    // clean up
+    ASSERT_OK(db_->EnableFileDeletions());
+    ASSERT_OK(temp_db->EnableFileDeletions());
+
+    // FIXME: Without this, the test triggers some data race between dropping
+    // CF and background compaction.
+    ASSERT_OK(db_->WaitForCompact({}));
+
+    ASSERT_OK(db_->DropColumnFamily(live_write_cfh));
+    ASSERT_OK(db_->DestroyColumnFamilyHandle(live_write_cfh));
+
+    ASSERT_OK(temp_db->Close());
+    delete temp_db;
+    ASSERT_OK(DestroyDB(temp_db_name, temp_db_opts));
+  } while (ChangeOptions(kSkipPlainTable | kSkipFIFOCompaction));
+}
+
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/db/version_set.h b/db/version_set.h
index 2d81dfce73b1..b20ab972f20f 100644
--- a/db/version_set.h
+++ b/db/version_set.h
@@ -669,6 +669,8 @@ class VersionStorageInfo {
 
   // List of files per level, files in each level are arranged
   // in increasing order of keys
+  // In L0, files are ordered in decreasing epoch number, meaning
+  // more recent updates are ordered first.
   std::vector<FileMetaData*>* files_;
 
   // Map of all table files in version. Maps file number to (level, position on
diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h
index 14db14aa5a9e..fdc3e7a8f26a 100644
--- a/include/rocksdb/db.h
+++ b/include/rocksdb/db.h
@@ -1966,7 +1966,9 @@ class DB {
   // In the first mode we will try to find the lowest possible level that
   // the file can fit in, and ingest the file into this level (2). A file that
   // have a key range that overlap with the memtable key range will require us
-  // to Flush the memtable first before ingesting the file.
+  // to Flush the memtable first before ingesting the file. If ingested files
+  // have any overlap with each other, level and sequence number assignment
+  // ensure later files overwrite earlier files.
   // In the second mode we will always ingest in the bottom most level (see
   // docs to IngestExternalFileOptions::ingest_behind).
   // For a column family that enables user-defined timestamps, ingesting
diff --git a/include/rocksdb/metadata.h b/include/rocksdb/metadata.h
index 4ab3842dda80..4c6c79f4c6fb 100644
--- a/include/rocksdb/metadata.h
+++ b/include/rocksdb/metadata.h
@@ -239,6 +239,9 @@ struct ColumnFamilyMetaData {
   // The name of the column family.
   std::string name;
   // The metadata of all levels in this column family.
+  // levels[i] contains files in level i.
+  // For level 0, files with recent updates are ordered first.
+  // For level 1+, files are ordered by increasing key range.
   std::vector<LevelMetaData> levels;
 
   // The total size of all blob files
diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index 5463beb10e58..206085b208a7 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -2419,7 +2419,47 @@ struct CompactRangeOptions {
   double blob_garbage_collection_age_cutoff = -1;
 };
 
-// IngestExternalFileOptions is used by IngestExternalFile()
+// IngestExternalFileOptions setting guide:
+//
+// The options in IngestExternalFileOptions interact in complex ways depending
+// on the source and overlap of SST files. Below is a summary of recommended
+// non-default settings for common use cases:
+//
+// 1. Ingesting only SST writer generated non-overlapping SSTs that are not
+// expected to overlap with existing data:
+//    - Optionally set fail_if_not_bottommost_level = true to enforce placement
+//    in the last level. This is better paird with SST partitioner to guarantee
+//    that there are no existing file with keys across the ingesting key range.
+//    - Set allow_blocking_flush to false: Not expecting to overlap with
+//    memtable and cause a flush.
+//    - If snapshot consistency is not expected, set snapshot_consistency to
+//    false and allow_global_seqno to false. allow_global_seqno = false will
+//    fail ingestion if any input file overlap with each other.
+//
+// 2. Ingesting SST writer generated overlapping SSTs:
+//    - order files with older updates first, newer overwrites later.
+//    - Set allow_global_seqno = true since newer files need to be assigned
+//    larger sequence numbers.
+//
+// 3. Ingesting DB generated SSTs: overlapping with target CF data is not
+// allowed. Input files are allowed to contain both DB generated files and SST
+// file writer generated files. They will all be treated as DB generated.
+//    - Set allow_db_generated_files = true.
+//    - Set snapshot_consistency = false: snapshot consistency requires
+// assigning higher sequence number to ingested files. DB generated files
+// don't support global seqno assignment yet.
+//    - Set allow_blocking_flush to false: Not expecting to overlap with
+//    memtable and cause a flush.
+//    - If the source live DB is running, set link_files = true instead of
+//    move_files.
+// 3a) SST files are non-overlapping and all keys have seqno 0: e.g., a
+// temporary RocksDB instance used to sort some data, and compacts all
+// data into the last level before ingestion.
+//    - Optionally set fail_if_not_bottommost_level = true to enforce placement
+//    in the last level.
+// 3b) SST files are overlapping, e.g. ingesting files from one CF to another.
+//    - Ensure older updates are ordered first and newer updates are ordered
+//    later. See more in option comment for allow_db_generated_files.
 struct IngestExternalFileOptions {
   // Can be set to true to move the files instead of copying them.
   // The input files will be unlinked after successful ingestion.
@@ -2436,10 +2476,20 @@ struct IngestExternalFileOptions {
   // If set to false, an ingested file keys could appear in existing snapshots
   // that where created before the file was ingested.
   bool snapshot_consistency = true;
-  // If set to false, IngestExternalFile() will fail if the file key range
+  // Enables assiging a global sequence number to each ingested file, i.e.,
+  // all keys in the ingested file will be treated as having this seqno.
+  // If set to false, we will use the sequence numbers in the ingested file
+  // as is, and IngestExternalFile() will fail if the ingested key range
   // overlaps with existing keys or tombstones or output of ongoing compaction
-  // during file ingestion in the DB (the conditions under which a global_seqno
-  // must be assigned to the ingested file).
+  // in the CF (the conditions under which a global seqno must be assigned to
+  // the ingested file).
+  // If the ingested files overlap with each other, we need to assign global
+  // sequence to the ingested files and this option needs to be enabled. One
+  // exception to this is when ingesting DB generated SST files (see option
+  // allow_db_generated_files below). DB generated files do not support
+  // global seqno assignment and can be ingested even if they overlap with
+  // each other. This option has no effect when allow_db_generated_files is
+  // enabled.
   bool allow_global_seqno = true;
   // Normally (true), IngestExternalFile() will trigger and block for flushing
   // memtable(s) if there is overlap between ingested files and memtable(s). If
@@ -2505,18 +2555,53 @@ struct IngestExternalFileOptions {
   //
   // XXX: "bottommost" is obsolete/confusing terminology to refer to last level
   bool fail_if_not_bottommost_level = false;
-  // EXPERIMENTAL
-  // Enables ingestion of files not generated by SstFileWriter. When true:
+  // EXPERIMENTAL, SUBJECT TO CHANGE
+  //
+  // Enables special mode of ingestion that allows files generated by a live DB,
+  // instead of SstFileWriter. When true:
   // - Allows files to be ingested when their cf_id doesn't match the CF they
   //   are being ingested into.
+  // - Allows files with any sequence numbers to be ingested.
+  // - Original sequence numbers are preserved (no reassignment).
+  //
   // REQUIREMENTS:
-  // - Ingested files must not overlap with existing keys.
-  // - `write_global_seqno` must be false.
-  // - All keys in ingested files should have sequence number 0. We fail
-  // ingestion if any sequence numbers is non-zero.
-  // WARNING: If a DB contains ingested files generated by another DB/CF,
-  // RepairDB() may not recover these files correctly, potentially leading to
-  // data loss.
+  // - Ingested files must NOT overlap with any existing data in the DB. Since
+  //   no sequence number reassignment is performed on db generated files.
+  //   Ingestion will fail if any overlap is detected. However, input files
+  //   are allowed to overlap with each other when this option is enabled. This
+  //   is useful when ingesting multiple levels of files from a CF, where
+  //   levels naturally overlap with each other.
+  // - CAUTION: If input files overlap with each other, then for any given user
+  //   key appearing in multiple files, earlier files MUST have smaller sequence
+  //   numbers than later files. Later files will be placed at a higher level
+  //   (smaller level number). This is to ensure the LSM invariant where for
+  //   the same key, recent updates are in higher levels. This means that
+  //   if you are ingesting files from multiple levels of a CF, you should
+  //   put files from lower levels first, and files from higher levels later.
+  //   Example for getting files from a CF for ingestion:
+  //
+  // ColumnFamilyMetaData cf_meta;
+  // from_db->GetColumnFamilyMetaData(from_cf, &cf_meta);
+  // // iterate in reverse to start from lowest level
+  // for (auto level_meta = cf_meta.levels.rbegin();
+  //      level_meta != cf_meta.levels.rend(); ++level_meta) {
+  //   // L0 files need to be added in reverse order so we iterate in reverse
+  //   // within a level too
+  //   for (auto file_meta = level_meta->files.rbegin();
+  //        file_meta != level_meta->files.rend(); ++file_meta) {
+  //     // Add file for ingestion
+  //   }
+  // }
+  //
+  //   WARNING: Violating the sequence number ordering requirement will cause
+  //   LSM invariant violations and may lead to incorrect reads or data
+  //   corruption.
+  // - If you would like to enforce that the ingested files do not overlap
+  //   with each other, you can set `fail_if_not_bottommost_level` to true.
+  //   If ingested files overlap with each other, some file will be placed
+  //   above Lmax, failing the ingestion if the option is set.
+  // - `write_global_seqno` must be false (sequence numbers cannot be
+  //    reassigned).
   bool allow_db_generated_files = false;
 
   // Controls whether data and metadata blocks (e.g. index, filter) read during
diff --git a/include/rocksdb/table_properties.h b/include/rocksdb/table_properties.h
index 1b20d9d3ab99..860fa6fd4f2f 100644
--- a/include/rocksdb/table_properties.h
+++ b/include/rocksdb/table_properties.h
@@ -307,6 +307,8 @@ struct TableProperties {
   // table is empty).
   uint64_t key_largest_seqno = UINT64_MAX;
 
+  bool HasKeyLargestSeqno() const { return key_largest_seqno != UINT64_MAX; }
+
   // DB identity
   // db_id is an identifier generated the first time the DB is created
   // If DB identity is unset or unassigned, `db_id` will be an empty string.
diff --git a/unreleased_history/public_api_changes/db-gen-file-ingestion.md b/unreleased_history/public_api_changes/db-gen-file-ingestion.md
new file mode 100644
index 000000000000..9f13e52c869c
--- /dev/null
+++ b/unreleased_history/public_api_changes/db-gen-file-ingestion.md
@@ -0,0 +1 @@
+* `IngestExternalFileOptions::allow_db_generated_files` now allows files ingestion of any DB generated SST file, instead of only the ones with all keys having sequence number 0.

From 1842a4029ff58eb3e610187762ceb35e339ee9c2 Mon Sep 17 00:00:00 2001
From: anand76 <anand1976@users.noreply.github.com>
Date: Mon, 25 Aug 2025 16:13:13 -0700
Subject: [PATCH 248/500] Update main for 10.7 (#13897)

Summary:
* Release notes from 10.6 branch
* Update version.h
* Add [10.6.fb](https://github.com/facebook/rocksdb/tree/10.4.fb) (to check_format_compatible.sh
* No update to folly commit hash due to build failures

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13897

Reviewed By: mszeszko-meta

Differential Revision: D80971628

Pulled By: anand1976

fbshipit-source-id: a24dbe90b5c54f781b2d017497ea3a22fcf6e148
---
 HISTORY.md                                    | 28 +++++++++++++++++++
 include/rocksdb/version.h                     |  2 +-
 tools/check_format_compatible.sh              |  2 +-
 .../ingest_behind_tombstone.md                |  1 -
 ...edit_version_in_file_checksum_retriever.md |  1 -
 .../bug_fixes/multi-scan-dict-compression.md  |  1 -
 unreleased_history/bug_fixes/multi-scan.md    |  1 -
 .../bug_fixes/multiscan_fill_cache.md         |  1 -
 .../remote_compaction_empty_result.md         |  1 -
 unreleased_history/bug_fixes/udi_config.md    |  1 -
 .../bug_fixes/udi_index_key_format.md         |  1 -
 .../new_features/cf-ingest-behind.md          |  1 -
 .../new_features/multiscan-io-coalesce.md     |  2 --
 .../compression_perf.md                       |  1 -
 .../db-gen-file-ingestion.md                  |  1 -
 .../public_api_changes/decouple.md            |  1 -
 .../public_api_changes/get_ttl_in_ttl_db.md   |  1 -
 .../public_api_changes/lz4_etc.md             |  2 --
 .../new_SyncFile_api_at_FileSystem_interface  |  1 -
 19 files changed, 30 insertions(+), 20 deletions(-)
 delete mode 100644 unreleased_history/behavior_changes/ingest_behind_tombstone.md
 delete mode 100644 unreleased_history/bug_fixes/handle_drop_column_family_edit_version_in_file_checksum_retriever.md
 delete mode 100644 unreleased_history/bug_fixes/multi-scan-dict-compression.md
 delete mode 100644 unreleased_history/bug_fixes/multi-scan.md
 delete mode 100644 unreleased_history/bug_fixes/multiscan_fill_cache.md
 delete mode 100644 unreleased_history/bug_fixes/remote_compaction_empty_result.md
 delete mode 100644 unreleased_history/bug_fixes/udi_config.md
 delete mode 100644 unreleased_history/bug_fixes/udi_index_key_format.md
 delete mode 100644 unreleased_history/new_features/cf-ingest-behind.md
 delete mode 100644 unreleased_history/new_features/multiscan-io-coalesce.md
 delete mode 100644 unreleased_history/performance_improvements/compression_perf.md
 delete mode 100644 unreleased_history/public_api_changes/db-gen-file-ingestion.md
 delete mode 100644 unreleased_history/public_api_changes/decouple.md
 delete mode 100644 unreleased_history/public_api_changes/get_ttl_in_ttl_db.md
 delete mode 100644 unreleased_history/public_api_changes/lz4_etc.md
 delete mode 100644 unreleased_history/public_api_changes/new_SyncFile_api_at_FileSystem_interface

diff --git a/HISTORY.md b/HISTORY.md
index 03e08a7dc2db..9f37452ccb5d 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -1,6 +1,34 @@
 # Rocksdb Change Log
 > NOTE: Entries for next release do not go here. Follow instructions in `unreleased_history/README.txt`
 
+## 10.6.0 (08/22/2025)
+### New Features
+* Introduce column family option `cf_allow_ingest_behind`. This option aims to replace `DBOptions::allow_ingest_behind` to enable ingest behind at the per-CF level. `DBOptions::allow_ingest_behind` is deprecated.
+* Introduce `MultiScanArgs::io_coalesce_threshold` to allow a configurable IO coalescing threshold.
+
+### Public API Changes
+* `IngestExternalFileOptions::allow_db_generated_files` now allows files ingestion of any DB generated SST file, instead of only the ones with all keys having sequence number 0.
+* `decouple_partitioned_filters = true` is now the default in BlockBasedTableOptions.
+* GetTtl() API is now available in TTL DB
+* Minimum supported version of LZ4 library is now 1.7.0 (r129 from 2015)
+* Some changes to experimental Compressor and CompressionManager APIs
+* A new Filesystem::SyncFile function is added for syncing a file that was already written, such as on file ingestion. The default implementation matches previous RocksDB behavior: re-open the file for read-write, sync it, and close it. We recommend overriding for FileSystems that do not require syncing for crash recovery or do not handle (well) re-opening for writes.
+
+### Behavior Changes
+* When `allow_ingest_behind` is enabled, compaction will no longer drop tombstones based on the absence of underlying data. Tombstones will be preserved to apply to ingested files.
+
+### Bug Fixes
+* Files in dropped column family won't be returned to the caller upon successful, offline MANIFEST iteration in `GetFileChecksumsFromCurrentManifest`.
+* Fix a bug in MultiScan that causes it to fall back to a normal scan when dictionary compression is enabled.
+* Fix a crash in iterator Prepare() when fill_cache=false
+* Fix a bug in MultiScan where incorrect results can be returned when a Scan's range is across multiple files.
+* Fixed a bug in remote compaction that may mistakenly delete live SST file(s) during the cleanup phase when no keys survive the compaction (all expired)
+* Allow a user defined index to be configured from a string.
+* Make the User Defined Index interface consistently use the user key format, fixing the previous mixed usage of internal and user key.
+
+### Performance Improvements
+* Small improvement to CPU efficiency of compression using built-in algorithms, and a dramatic efficiency improvement for LZ4HC, based on reusing data structures between invocations.
+
 ## 10.5.0 (07/18/2025)
 ### Public API Changes
 * DB option skip_checking_sst_file_sizes_on_db_open is deprecated, in favor of validating file size in parallel in a thread pool, when db is opened. When DB is opened, with paranoid check enabled, a file with the wrong size would fail the DB open. With paranoid check disabled, the DB open would succeed, the column family with the corrupted file would not be read or write, while the other healthy column families could be read and write normally. When max_open_files option is not set to -1, only a subset of the files will be opened and checked. The rest of the files will be opened and checked when they are accessed.
diff --git a/include/rocksdb/version.h b/include/rocksdb/version.h
index 6616b7e4e658..36e64444736a 100644
--- a/include/rocksdb/version.h
+++ b/include/rocksdb/version.h
@@ -12,7 +12,7 @@
 // NOTE: in 'main' development branch, this should be the *next*
 // minor or major version number planned for release.
 #define ROCKSDB_MAJOR 10
-#define ROCKSDB_MINOR 6
+#define ROCKSDB_MINOR 7
 #define ROCKSDB_PATCH 0
 
 // Do not use these. We made the mistake of declaring macros starting with
diff --git a/tools/check_format_compatible.sh b/tools/check_format_compatible.sh
index 416dfb0eaa6e..bfd3be3ae716 100755
--- a/tools/check_format_compatible.sh
+++ b/tools/check_format_compatible.sh
@@ -137,7 +137,7 @@ EOF
 
 # To check for DB forward compatibility with loading options (old version
 # reading data from new), as well as backward compatibility
-declare -a db_forward_with_options_refs=("8.6.fb" "8.7.fb" "8.8.fb" "8.9.fb" "8.10.fb" "8.11.fb" "9.0.fb" "9.1.fb" "9.2.fb" "9.3.fb" "9.4.fb" "9.5.fb" "9.6.fb" "9.7.fb" "9.8.fb" "9.9.fb" "9.10.fb" "9.11.fb" "10.0.fb" "10.1.fb" "10.2.fb" "10.3.fb" "10.4.fb" "10.5.fb")
+declare -a db_forward_with_options_refs=("8.6.fb" "8.7.fb" "8.8.fb" "8.9.fb" "8.10.fb" "8.11.fb" "9.0.fb" "9.1.fb" "9.2.fb" "9.3.fb" "9.4.fb" "9.5.fb" "9.6.fb" "9.7.fb" "9.8.fb" "9.9.fb" "9.10.fb" "9.11.fb" "10.0.fb" "10.1.fb" "10.2.fb" "10.3.fb" "10.4.fb" "10.5.fb" "10.6.fb")
 # To check for DB forward compatibility without loading options (in addition
 # to the "with loading options" set), as well as backward compatibility
 declare -a db_forward_no_options_refs=() # N/A at the moment
diff --git a/unreleased_history/behavior_changes/ingest_behind_tombstone.md b/unreleased_history/behavior_changes/ingest_behind_tombstone.md
deleted file mode 100644
index ce54cf221fd5..000000000000
--- a/unreleased_history/behavior_changes/ingest_behind_tombstone.md
+++ /dev/null
@@ -1 +0,0 @@
-* When `allow_ingest_behind` is enabled, compaction will no longer drop tombstones based on the absence of underlying data. Tombstones will be preserved to apply to ingested files.
diff --git a/unreleased_history/bug_fixes/handle_drop_column_family_edit_version_in_file_checksum_retriever.md b/unreleased_history/bug_fixes/handle_drop_column_family_edit_version_in_file_checksum_retriever.md
deleted file mode 100644
index 2e7cbf3a3007..000000000000
--- a/unreleased_history/bug_fixes/handle_drop_column_family_edit_version_in_file_checksum_retriever.md
+++ /dev/null
@@ -1 +0,0 @@
-* Files in dropped column family won't be returned to the caller upon successful, offline MANIFEST iteration in `GetFileChecksumsFromCurrentManifest`.
diff --git a/unreleased_history/bug_fixes/multi-scan-dict-compression.md b/unreleased_history/bug_fixes/multi-scan-dict-compression.md
deleted file mode 100644
index f01e49bea11d..000000000000
--- a/unreleased_history/bug_fixes/multi-scan-dict-compression.md
+++ /dev/null
@@ -1 +0,0 @@
-* Fix a bug in MultiScan that causes it to fall back to a normal scan when dictionary compression is enabled.
diff --git a/unreleased_history/bug_fixes/multi-scan.md b/unreleased_history/bug_fixes/multi-scan.md
deleted file mode 100644
index 9ba67ac40fa5..000000000000
--- a/unreleased_history/bug_fixes/multi-scan.md
+++ /dev/null
@@ -1 +0,0 @@
-* Fix a bug in MultiScan where incorrect results can be returned when a Scan's range is across multiple files.
diff --git a/unreleased_history/bug_fixes/multiscan_fill_cache.md b/unreleased_history/bug_fixes/multiscan_fill_cache.md
deleted file mode 100644
index 1216ed9db79e..000000000000
--- a/unreleased_history/bug_fixes/multiscan_fill_cache.md
+++ /dev/null
@@ -1 +0,0 @@
-Fix a crash in iterator Prepare() when fill_cache=false
diff --git a/unreleased_history/bug_fixes/remote_compaction_empty_result.md b/unreleased_history/bug_fixes/remote_compaction_empty_result.md
deleted file mode 100644
index dcb93d2cc9c2..000000000000
--- a/unreleased_history/bug_fixes/remote_compaction_empty_result.md
+++ /dev/null
@@ -1 +0,0 @@
-Fixed a bug in remote compaction that may mistakenly delete live SST file(s) during the cleanup phase when no keys survive the compaction (all expired)
diff --git a/unreleased_history/bug_fixes/udi_config.md b/unreleased_history/bug_fixes/udi_config.md
deleted file mode 100644
index fce63ce44c3d..000000000000
--- a/unreleased_history/bug_fixes/udi_config.md
+++ /dev/null
@@ -1 +0,0 @@
-Allow a user defined index to be configured from a string.
diff --git a/unreleased_history/bug_fixes/udi_index_key_format.md b/unreleased_history/bug_fixes/udi_index_key_format.md
deleted file mode 100644
index 943e9413ed1e..000000000000
--- a/unreleased_history/bug_fixes/udi_index_key_format.md
+++ /dev/null
@@ -1 +0,0 @@
-Make the User Defined Index interface consistently use the user key format, fixing the previous mixed usage of internal and user key.
diff --git a/unreleased_history/new_features/cf-ingest-behind.md b/unreleased_history/new_features/cf-ingest-behind.md
deleted file mode 100644
index 1a716d17ef24..000000000000
--- a/unreleased_history/new_features/cf-ingest-behind.md
+++ /dev/null
@@ -1 +0,0 @@
-* Introduce column family option `cf_allow_ingest_behind`. This option aims to replace `DBOptions::allow_ingest_behind` to enable ingest behind at the per-CF level. `DBOptions::allow_ingest_behind` is deprecated.
diff --git a/unreleased_history/new_features/multiscan-io-coalesce.md b/unreleased_history/new_features/multiscan-io-coalesce.md
deleted file mode 100644
index 2186bbdd745a..000000000000
--- a/unreleased_history/new_features/multiscan-io-coalesce.md
+++ /dev/null
@@ -1,2 +0,0 @@
-* Introduce `MultiScanArgs::io_coalesce_threshold` to allow a configurable IO coalescing threshold.
-
diff --git a/unreleased_history/performance_improvements/compression_perf.md b/unreleased_history/performance_improvements/compression_perf.md
deleted file mode 100644
index ed567e4e0fa8..000000000000
--- a/unreleased_history/performance_improvements/compression_perf.md
+++ /dev/null
@@ -1 +0,0 @@
-* Small improvement to CPU efficiency of compression using built-in algorithms, and a dramatic efficiency improvement for LZ4HC, based on reusing data structures between invocations.
diff --git a/unreleased_history/public_api_changes/db-gen-file-ingestion.md b/unreleased_history/public_api_changes/db-gen-file-ingestion.md
deleted file mode 100644
index 9f13e52c869c..000000000000
--- a/unreleased_history/public_api_changes/db-gen-file-ingestion.md
+++ /dev/null
@@ -1 +0,0 @@
-* `IngestExternalFileOptions::allow_db_generated_files` now allows files ingestion of any DB generated SST file, instead of only the ones with all keys having sequence number 0.
diff --git a/unreleased_history/public_api_changes/decouple.md b/unreleased_history/public_api_changes/decouple.md
deleted file mode 100644
index c4c6944ae21e..000000000000
--- a/unreleased_history/public_api_changes/decouple.md
+++ /dev/null
@@ -1 +0,0 @@
-* `decouple_partitioned_filters = true` is now the default in BlockBasedTableOptions.
diff --git a/unreleased_history/public_api_changes/get_ttl_in_ttl_db.md b/unreleased_history/public_api_changes/get_ttl_in_ttl_db.md
deleted file mode 100644
index 6a118735a526..000000000000
--- a/unreleased_history/public_api_changes/get_ttl_in_ttl_db.md
+++ /dev/null
@@ -1 +0,0 @@
-GetTtl() API is now available in TTL DB
diff --git a/unreleased_history/public_api_changes/lz4_etc.md b/unreleased_history/public_api_changes/lz4_etc.md
deleted file mode 100644
index e961f656ec96..000000000000
--- a/unreleased_history/public_api_changes/lz4_etc.md
+++ /dev/null
@@ -1,2 +0,0 @@
-* Minimum supported version of LZ4 library is now 1.7.0 (r129 from 2015)
-* Some changes to experimental Compressor and CompressionManager APIs
diff --git a/unreleased_history/public_api_changes/new_SyncFile_api_at_FileSystem_interface b/unreleased_history/public_api_changes/new_SyncFile_api_at_FileSystem_interface
deleted file mode 100644
index 6918f05f34f3..000000000000
--- a/unreleased_history/public_api_changes/new_SyncFile_api_at_FileSystem_interface
+++ /dev/null
@@ -1 +0,0 @@
-A new Filesystem::SyncFile function is added for syncing a file that was already written, such as on file ingestion. The default implementation matches previous RocksDB behavior: re-open the file for read-write, sync it, and close it. We recommend overriding for FileSystems that do not require syncing for crash recovery or do not handle (well) re-opening for writes.

From 8d2f420db2dbd4036aacdec961d0952fb6329ffd Mon Sep 17 00:00:00 2001
From: Hui Xiao <huixiao@fb.com>
Date: Tue, 26 Aug 2025 11:01:12 -0700
Subject: [PATCH 249/500] Shorten the lifetime of statistics object in db
 stress (#13899)

Summary:
**Context/Summary:**
Clear statistics reference from options_ to intentionally shorten the statistics object lifetime to be same as the db object (which is the common case in practice) and detect if RocksDB access the statistics beyond its lifetime.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13899

Test Plan: - [Ongoing] Stress test rehearsal

Reviewed By: pdillinger

Differential Revision: D80985435

Pulled By: hx235

fbshipit-source-id: ab238231cd81f47fa451aea12a0c85fa11d9ac81
---
 BUCK                                  |  1 -
 db_stress_tool/CMakeLists.txt         |  1 -
 db_stress_tool/db_stress_stat.cc      | 17 --------------
 db_stress_tool/db_stress_stat.h       |  4 ----
 db_stress_tool/db_stress_test_base.cc | 33 ++++++++++++++++++++-------
 db_stress_tool/db_stress_tool.cc      |  6 -----
 src.mk                                |  1 -
 7 files changed, 25 insertions(+), 38 deletions(-)
 delete mode 100644 db_stress_tool/db_stress_stat.cc

diff --git a/BUCK b/BUCK
index 7ba29bb54751..565e6b831ad8 100644
--- a/BUCK
+++ b/BUCK
@@ -429,7 +429,6 @@ rocks_cpp_library_wrapper(name="rocksdb_stress_lib", srcs=[
         "db_stress_tool/db_stress_gflags.cc",
         "db_stress_tool/db_stress_listener.cc",
         "db_stress_tool/db_stress_shared_state.cc",
-        "db_stress_tool/db_stress_stat.cc",
         "db_stress_tool/db_stress_test_base.cc",
         "db_stress_tool/db_stress_tool.cc",
         "db_stress_tool/db_stress_wide_merge_operator.cc",
diff --git a/db_stress_tool/CMakeLists.txt b/db_stress_tool/CMakeLists.txt
index 49e76ab51532..80b46330514f 100644
--- a/db_stress_tool/CMakeLists.txt
+++ b/db_stress_tool/CMakeLists.txt
@@ -9,7 +9,6 @@ add_executable(db_stress${ARTIFACT_SUFFIX}
   db_stress_gflags.cc
   db_stress_listener.cc
   db_stress_shared_state.cc
-  db_stress_stat.cc
   db_stress_test_base.cc
   db_stress_wide_merge_operator.cc
   db_stress_tool.cc
diff --git a/db_stress_tool/db_stress_stat.cc b/db_stress_tool/db_stress_stat.cc
deleted file mode 100644
index 6a7883a52ac7..000000000000
--- a/db_stress_tool/db_stress_stat.cc
+++ /dev/null
@@ -1,17 +0,0 @@
-//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
-//  This source code is licensed under both the GPLv2 (found in the
-//  COPYING file in the root directory) and Apache 2.0 License
-//  (found in the LICENSE.Apache file in the root directory).
-
-#ifdef GFLAGS
-
-#include "db_stress_tool/db_stress_stat.h"
-
-namespace ROCKSDB_NAMESPACE {
-
-std::shared_ptr<ROCKSDB_NAMESPACE::Statistics> dbstats;
-std::shared_ptr<ROCKSDB_NAMESPACE::Statistics> dbstats_secondaries;
-
-}  // namespace ROCKSDB_NAMESPACE
-
-#endif  // GFLAGS
diff --git a/db_stress_tool/db_stress_stat.h b/db_stress_tool/db_stress_stat.h
index 5b38c6e2bb5d..e4a8a8fb5999 100644
--- a/db_stress_tool/db_stress_stat.h
+++ b/db_stress_tool/db_stress_stat.h
@@ -22,10 +22,6 @@ DECLARE_bool(progress_reports);
 
 namespace ROCKSDB_NAMESPACE {
 
-// Database statistics
-extern std::shared_ptr<ROCKSDB_NAMESPACE::Statistics> dbstats;
-extern std::shared_ptr<ROCKSDB_NAMESPACE::Statistics> dbstats_secondaries;
-
 class Stats {
  private:
   uint64_t start_;
diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc
index 8c22d30c314f..220b3b9b628f 100644
--- a/db_stress_tool/db_stress_test_base.cc
+++ b/db_stress_tool/db_stress_test_base.cc
@@ -653,12 +653,20 @@ std::string StressTest::DebugString(const Slice& value,
 }
 
 void StressTest::PrintStatistics() {
-  if (dbstats) {
-    fprintf(stdout, "STATISTICS:\n%s\n", dbstats->ToString().c_str());
+  // Print statistics from the DB instance instead of global dbstats
+  if (db_) {
+    auto stats = db_->GetOptions().statistics;
+    if (stats) {
+      fprintf(stdout, "STATISTICS:\n%s\n", stats->ToString().c_str());
+    }
   }
-  if (dbstats_secondaries) {
-    fprintf(stdout, "Secondary instances STATISTICS:\n%s\n",
-            dbstats_secondaries->ToString().c_str());
+  // Print statistics from secondary DB instance if it exists
+  if (secondary_db_) {
+    auto stats = secondary_db_->GetOptions().statistics;
+    if (stats) {
+      fprintf(stdout, "Secondary instance STATISTICS:\n%s\n",
+              stats->ToString().c_str());
+    }
   }
 }
 
@@ -3954,6 +3962,13 @@ void StressTest::Open(SharedState* shared, bool reopen) {
     assert(s.ok());
     assert(column_families_.size() ==
            static_cast<size_t>(FLAGS_column_families));
+    // Clear statistics reference from options_ to intentionally shorten the
+    // statistics object lifetime to be same as the db object (which is the
+    // common case in practice) and detect if RocksDB access the statistics
+    // beyond its lifetime.
+    if (FLAGS_statistics) {
+      options_.statistics.reset();
+    }
 
     // Secondary instance does not support write-prepared/write-unprepared
     // transactions, thus just disable secondary instance if we use
@@ -4328,7 +4343,9 @@ void InitializeOptionsFromFlags(
     }
   }
   options.max_open_files = FLAGS_open_files;
-  options.statistics = dbstats;
+  if (FLAGS_statistics) {
+    options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  }
   options.env = db_stress_env;
   options.use_fsync = FLAGS_use_fsync;
   options.compaction_readahead_size = FLAGS_compaction_readahead_size;
@@ -4578,8 +4595,8 @@ void InitializeOptionsGeneral(
   options.create_missing_column_families = true;
   options.create_if_missing = true;
 
-  if (!options.statistics) {
-    options.statistics = dbstats;
+  if (FLAGS_statistics) {
+    options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
   }
 
   if (options.env == Options().env) {
diff --git a/db_stress_tool/db_stress_tool.cc b/db_stress_tool/db_stress_tool.cc
index ca43b699c8f9..f22827e53fee 100644
--- a/db_stress_tool/db_stress_tool.cc
+++ b/db_stress_tool/db_stress_tool.cc
@@ -53,12 +53,6 @@ int db_stress_tool(int argc, char** argv) {
     SetupSyncPointsToMockDirectIO();
   }
 #endif
-  if (FLAGS_statistics) {
-    dbstats = ROCKSDB_NAMESPACE::CreateDBStatistics();
-    if (FLAGS_test_secondary) {
-      dbstats_secondaries = ROCKSDB_NAMESPACE::CreateDBStatistics();
-    }
-  }
   compression_type_e = StringToCompressionType(FLAGS_compression_type.c_str());
   bottommost_compression_type_e =
       StringToCompressionType(FLAGS_bottommost_compression_type.c_str());
diff --git a/src.mk b/src.mk
index 01f754416ed2..9d771f45a8e2 100644
--- a/src.mk
+++ b/src.mk
@@ -399,7 +399,6 @@ STRESS_LIB_SOURCES =                                           \
   db_stress_tool/db_stress_gflags.cc                           \
   db_stress_tool/db_stress_listener.cc                         \
   db_stress_tool/db_stress_shared_state.cc                     \
-  db_stress_tool/db_stress_stat.cc                             \
   db_stress_tool/db_stress_test_base.cc                        \
   db_stress_tool/db_stress_tool.cc                             \
   db_stress_tool/db_stress_wide_merge_operator.cc              \

From d3991651094ed058b68c8a55314ba9bedf839a80 Mon Sep 17 00:00:00 2001
From: Hui Xiao <huixiao@fb.com>
Date: Tue, 26 Aug 2025 11:03:13 -0700
Subject: [PATCH 250/500] Ignore IOActivity check for ManagedSnapshot
 snapshot_guard(db_); for TestMultiScan (#13898)

Summary:
**Context/Summary:**

RocksDB stress test verifies IOActivity is set correctly through reusing the pass-in Read/Write options through assertion. This is too strict for API that does not take or do not need to take Read/WriteOptions yet hence assertion failure.
```
stderr:
 db_stress: ... db_stress_tool/db_stress_env_wrapper.h:24: void rocksdb::(anonymous namespace)::CheckIOActivity(const IOOptions &): Assertion `io_activity == Env::IOActivity::kUnknown || io_activity == options.io_activity' failed.
Received signal 6 (Aborted)
```
An example is ManagedSnapshot snapshot_guard(db_); in TestMultiScan().

This PR ignores such check.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13898

Test Plan: The same command repro-ed this assertion failure passes after this fix

Reviewed By: archang19

Differential Revision: D80983214

Pulled By: hx235

fbshipit-source-id: d8b660f8c8771198bc7fa0e805c3e86d2584f03e
---
 db_stress_tool/db_stress_test_base.cc | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc
index 220b3b9b628f..02697796b53c 100644
--- a/db_stress_tool/db_stress_test_base.cc
+++ b/db_stress_tool/db_stress_test_base.cc
@@ -1674,7 +1674,11 @@ Status StressTest::TestMultiScan(ThreadState* thread,
   assert(!rand_column_families.empty());
   assert(!rand_keys.empty());
 
+  ThreadStatus::OperationType cur_op_type =
+      ThreadStatusUtil::GetThreadOperation();
+  ThreadStatusUtil::SetThreadOperation(ThreadStatus::OperationType::OP_UNKNOWN);
   ManagedSnapshot snapshot_guard(db_);
+  ThreadStatusUtil::SetThreadOperation(cur_op_type);
 
   ReadOptions ro = read_opts;
   ro.snapshot = snapshot_guard.snapshot();

From b67149a55ee59024b7f00cd4d56372b473fc8d44 Mon Sep 17 00:00:00 2001
From: Hui Xiao <huixiao@fb.com>
Date: Tue, 26 Aug 2025 11:20:41 -0700
Subject: [PATCH 251/500] Skip DumpStats() on dropped CF (#13900)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Summary:
**Context/Summary:**

DumpStats() do not skip dropped CF and can run into a seg fault like below
```
2025-08-23T06:44:05.0469230Z �[0;32m[ RUN      ] �[mFormatLatest/ColumnFamilyTest.LiveIteratorWithDroppedColumnFamily/0
2025-08-23T06:44:05.0470050Z Received signal 11 (Segmentation fault: 11)
2025-08-23T06:44:05.0470510Z #0   0x7000069305e0
2025-08-23T06:44:05.0471070Z https://github.com/facebook/rocksdb/issues/1   rocksdb::DBImpl::DumpStats() (in librocksdb.10.6.0.dylib) (db_impl.cc:1076)
```

This PR skipped it.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13900

Test Plan:
- Deterministically repro-ed the seg fault before the fix and ensure it doesn't happen after the fix
```
 diff --git a/db/column_family_test.cc b/db/column_family_test.cc
index 3a2ca0617..f57d6f757 100644
 --- a/db/column_family_test.cc
+++ b/db/column_family_test.cc
@@ -2372,11 +2372,17 @@ TEST_P(ColumnFamilyTest, LiveIteratorWithDroppedColumnFamily) {
   int kKeysNum = 10000;
   PutRandomData(1, kKeysNum, 100);
   {
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+        {{"PostDrop", "BeforeAccessCFD"}, {"PostAccessCFD", "BeforeGo"}});
+
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
     std::unique_ptr<Iterator> iterator(
         db_->NewIterator(ReadOptions(), handles_[1]));
     iterator->SeekToFirst();

     DropColumnFamilies({1});
+    TEST_SYNC_POINT("PostDrop");
+    TEST_SYNC_POINT("BeforeGo");

     // Make sure iterator created can still be used.
     int count = 0;
@@ -2386,6 +2392,9 @@ TEST_P(ColumnFamilyTest, LiveIteratorWithDroppedColumnFamily) {
     }
     ASSERT_OK(iterator->status());
     ASSERT_EQ(count, kKeysNum);
+
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
   }

   Reopen();
 diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc
index a8e4f5f8f..a8a0499c0 100644
 --- a/db/db_impl/db_impl.cc
+++ b/db/db_impl/db_impl.cc
@@ -1073,8 +1073,10 @@ void DBImpl::DumpStats() {
         continue;
       }

-      auto* table_factory =
-          cfd->GetCurrentMutableCFOptions().table_factory.get();
+      TEST_SYNC_POINT("BeforeAccessCFD");
+      auto moptions = cfd->GetCurrentMutableCFOptions();
+      auto* table_factory = moptions.table_factory.get();
+      TEST_SYNC_POINT("PostAccessCFD");
       assert(table_factory != nullptr);
       // FIXME: need to a shared_ptr if/when block_cache is going to be mutable
       Cache* cache =
~
```

Reviewed By: archang19

Differential Revision: D81003739

Pulled By: hx235

fbshipit-source-id: bdf3c4cc45988f43e79ebc191a20af5b70ac289f
---
 db/db_impl/db_impl.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc
index bdbd86b9e188..3acde3a39c25 100644
--- a/db/db_impl/db_impl.cc
+++ b/db/db_impl/db_impl.cc
@@ -1069,7 +1069,7 @@ void DBImpl::DumpStats() {
   {
     InstrumentedMutexLock l(&mutex_);
     for (auto cfd : versions_->GetRefedColumnFamilySet()) {
-      if (!cfd->initialized()) {
+      if (!cfd->initialized() || cfd->IsDropped()) {
         continue;
       }
 

From 749e11f0adc0496b7392308068b9404aa1f645c1 Mon Sep 17 00:00:00 2001
From: ngina <221624547+nmk70@users.noreply.github.com>
Date: Wed, 27 Aug 2025 17:08:15 -0700
Subject: [PATCH 252/500] Add compaction on deletion-trigger test to db stress
 test (#13894)

Summary:
Enable stress testing of deletion-triggered compaction.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13894

Test Plan:
```
 python3 -u tools/db_crashtest.py --simple whitebox --enable_compaction_on_deletion_trigger=true
```

Reviewed By: jaykorean

Differential Revision: D81175559

Pulled By: nmk70

fbshipit-source-id: c5128b7c1e2d07833b0e9385e04b342bc42c65cf
---
 db_stress_tool/db_stress_common.h     |  7 +++++++
 db_stress_tool/db_stress_gflags.cc    | 22 ++++++++++++++++++++++
 db_stress_tool/db_stress_test_base.cc | 11 +++++++++++
 tools/db_crashtest.py                 |  1 +
 4 files changed, 41 insertions(+)

diff --git a/db_stress_tool/db_stress_common.h b/db_stress_tool/db_stress_common.h
index e675dbee38b6..4b73733933fc 100644
--- a/db_stress_tool/db_stress_common.h
+++ b/db_stress_tool/db_stress_common.h
@@ -431,6 +431,13 @@ DECLARE_uint32(ingest_wbwi_one_in);
 DECLARE_bool(universal_reduce_file_locking);
 DECLARE_bool(use_multiscan);
 
+// Compaction deletion trigger declarations for stress testing
+DECLARE_bool(enable_compaction_on_deletion_trigger);
+DECLARE_uint64(compaction_on_deletion_min_file_size);
+DECLARE_int32(compaction_on_deletion_trigger_count);
+DECLARE_int32(compaction_on_deletion_window_size);
+DECLARE_double(compaction_on_deletion_ratio);
+
 constexpr long KB = 1024;
 constexpr int kRandomValueMaxFactor = 3;
 constexpr int kValueMaxLen = 100;
diff --git a/db_stress_tool/db_stress_gflags.cc b/db_stress_tool/db_stress_gflags.cc
index 8e0d6a5c10c4..d221c374286a 100644
--- a/db_stress_tool/db_stress_gflags.cc
+++ b/db_stress_tool/db_stress_gflags.cc
@@ -1480,6 +1480,28 @@ DEFINE_uint32(commit_bypass_memtable_one_in, 0,
               "If greater than zero, transaction option will set "
               "commit_bypass_memtable to per every N transactions on average.");
 
+// Compaction on deletion trigger flags
+DEFINE_bool(enable_compaction_on_deletion_trigger, false,
+            "Enable CompactOnDeletionCollectorFactory for stress testing "
+            "deletion-triggered compaction scenarios.");
+
+DEFINE_uint64(compaction_on_deletion_min_file_size, 32 * 1024,
+              "Minimum file size (in bytes) for deletion-triggered compaction. "
+              "Files smaller than this will not trigger compaction even if "
+              "deletion ratio is exceeded. Default: 32KB");
+
+DEFINE_int32(compaction_on_deletion_trigger_count, 50,
+             "Number of deletions that triggers compaction when deletion "
+             "ratio is exceeded. Default: 50");
+
+DEFINE_int32(compaction_on_deletion_window_size, 100,
+             "Size of the sliding window for tracking deletions. "
+             "Default: 100");
+
+DEFINE_double(compaction_on_deletion_ratio, 0.5,
+              "Deletion ratio threshold for triggering compaction. "
+              "Default: 0.5 (50%)");
+
 DEFINE_bool(
     auto_refresh_iterator_with_snapshot,
     ROCKSDB_NAMESPACE::ReadOptions().auto_refresh_iterator_with_snapshot,
diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc
index 02697796b53c..c5846f79b56f 100644
--- a/db_stress_tool/db_stress_test_base.cc
+++ b/db_stress_tool/db_stress_test_base.cc
@@ -30,6 +30,7 @@
 #include "rocksdb/filter_policy.h"
 #include "rocksdb/secondary_cache.h"
 #include "rocksdb/sst_file_manager.h"
+#include "rocksdb/table_properties.h"
 #include "rocksdb/types.h"
 #include "rocksdb/utilities/object_registry.h"
 #include "rocksdb/utilities/write_batch_with_index.h"
@@ -4673,6 +4674,16 @@ void InitializeOptionsGeneral(
   if (sqfc_factory && !sqfc_factory->GetConfigs().IsEmptyNotFound()) {
     options.table_properties_collector_factories.emplace_back(sqfc_factory);
   }
+
+  // Add CompactOnDeletionCollectorFactory if enabled
+  if (FLAGS_enable_compaction_on_deletion_trigger) {
+    options.table_properties_collector_factories.emplace_back(
+        ROCKSDB_NAMESPACE::NewCompactOnDeletionCollectorFactory(
+            FLAGS_compaction_on_deletion_window_size,
+            FLAGS_compaction_on_deletion_trigger_count,
+            FLAGS_compaction_on_deletion_ratio,
+            FLAGS_compaction_on_deletion_min_file_size));
+  }
 }
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index c924ae25a84b..f307009b9399 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -80,6 +80,7 @@
     "destroy_db_initially": 0,
     "enable_pipelined_write": lambda: random.randint(0, 1),
     "enable_compaction_filter": lambda: random.choice([0, 0, 0, 1]),
+    "enable_compaction_on_deletion_trigger": lambda: random.choice([0, 0, 0, 1]),
     # `inplace_update_support` is incompatible with DB that has delete
     # range data in memtables.
     # Such data can result from any of the previous db stress runs

From e59bbd72414d0b5ae1c034e6c1bce9e2e5375bee Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Wed, 27 Aug 2025 18:57:44 -0700
Subject: [PATCH 253/500] First step to improve parallel compression efficiency
 (#13850)

Summary:
The implementation of parallel compression has historically scaled rather poorly, or perhaps modestly with heavy compression, topping out around 3x throughput vs. serial and incurring big overheads in CPU consumption relative to the throughput.

This change addresses one source of that extra CPU consumption: stashing all the keys of a block for later processing into building index and filter blocks. Historically with parallel compression, the index and filter block updates were handled in the last stage of processing along with writing each data block to the file writer. This was because the index blocks needed to know the BlockHandle of the new data block, which could only be known after every preceeding data block was compressed, to know the starting location for the BlockHandle. And because index and filter partitions were historically coupled (see decouple_partitioned_filters), filter updates had to happen at the same time.

Here we get rid of stashing the keys for later processing and the extra CPU associated with it, by
* Creating a two stage process of adding to index blocks ("prepare" and "finish" each entry; one entry per data block). The two stages must be executable in parallel for separate index entries. NOTE: not yet supported by UserDefinedIndex
* Requiring decouple_partitioned_filters=true for parallel compression, because we now add to filters in the first stage of processing when each key is readily available and we cannot couple that with finalizing index entries in the last stage of processing.

It might seem like adding to filters is something that is expensive (hashing etc.) and should be kept out of the bottle-neck first stage of processing (which includes walking the compaction iterator) but it's probably similar cost to simply stashing the keys away for later processing. (We might be able to reduce a bottle-neck by stashing hashes, but we're not to a point where that is worth the effort.)

And it makes sense to make two more simple public API updates in conjunction with this:
* Set decouple_partitioned_filters=true by default. No signs of problems in production.
* Mark parallel compression as production-ready. It's being thoroughly tested in the crash test, successfully, and in limited production uses.

Follow-up:
* Improve the threading/sychronization model of parallel compression for the next major efficiency improvement
* Consider supporting the parallel-compatible index building APIs with UserDefinedIndex, unless it's considered too dangerous to expect users to safely handle the multi-threading.
* (In a subsequent release) remove all the code associated with coupling filter and index partitions and mark the option as ignored.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13850

Test Plan:
for correctness, existing tests

## Performance Data

The "before" data here includes revert of https://github.com/facebook/rocksdb/issues/13828 for combined performance measurement of this change and that one.

```
SUFFIX=`tty | sed 's|/|_|g'`; for CT in lz4 zstd lz4; do for PT in 1 2 3 4 6 8; do echo "$CT pt=$PT"; (for I in `seq 1 1`; do BIN=/dev/shm/dbbench${SUFFIX}.bin; rm -f $BIN; cp db_bench $BIN; /usr/bin/time $BIN -db=/dev/shm/dbbench$SUFFIX --benchmarks=fillseq -num=30000000 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=1000 -fifo_compaction_allow_compaction=0 -disable_wal -write_buffer_size=12000000 -format_version=7 -compression_type=$CT -compression_parallel_threads=$PT 2>&1 | tail -n 3 | head -n 2; done); done; done
```

To get a sense of the overall performance relative to number of parallel threads, we vary that with popular fast compression and popular heavier weight compression (some noise in this data, don't interpret each data point too strongly)

lz4 pt=1
2107431 -> 2112941 ops/sec (+0.3% - improvement)
(26.51 + 0.75) = 27.26 CPU sec -> (26.63 + 0.79) = 27.42 CPU sec (+0.6% - regression)
lz4 pt=2
1606660 -> 1580333 ops/sec (-1.6% - regression)
(47.10 + 8.37) = 55.47 CPU sec -> (45.05 + 9.23) = 54.28 CPU sec (-2.2% - improvement)
lz4 pt=3
1701353 -> 1889283 ops/sec (+11.1% - improvement)
(47.23 + 8.29) = 55.52 CPU sec -> (43.89 + 8.33) = 52.22 CPU sec (-6.0% - improvement)
lz4 pt=4
1651504 -> 1817890 ops/sec (+10.1% - improvement)
(48.07 + 8.31) = 56.38 CPU sec -> (44.77 + 8.45) = 53.22 CPU sec (-5.6% - improvement)
lz4 pt=6
1716099 -> 1888523 ops/sec (+10.1% - improvement)
(47.50 + 8.45) = 55.95 CPU sec -> (44.25 + 8.73) = 52.98 CPU sec (-5.3% - improvement)
lz4 pt=8
1696840 -> 1797256 ops/sec (+5.9% - improvement)
(48.09 + 8.61) = 56.70 CPU sec -> (45.90 + 8.68) = 54.58 CPU sec (-3.8% - improvement)

Clearly parallel threads do not help with fast compression like LZ4, but it's not as bad as it was before.

zstd pt=1
1214258 -> 1202863 ops/sec (-0.9% - regression)
(38.26 + 0.66) = 38.92 CPU sec -> (39.37 + 0.69) = 40.06 CPU sec (+2.9% - regression)
zstd pt=2
1194673 -> 1152746 ops/sec (-3.5% - regression)
(61.01 + 9.85) = 70.86 CPU sec -> (58.28 + 9.99) = 68.27 CPU sec (-3.7% - improvement)
zstd pt=3
1653661 -> 1825618 ops/sec (+10.4% - improvement)
(60.07 + 8.45) = 68.52 CPU sec -> (56.03 + 8.43) = 64.46 CPU sec (-5.9% - improvement)
zstd pt=4
1691723 -> 1890976 ops/sec (+11.8% - improvement)
(59.72 + 8.46) = 68.18 CPU sec -> (55.96 + 8.27) = 64.23 CPU sec (-5.7% - improvement)
zstd pt=6
1684982 -> 1900002 ops/sec (+12.8% - improvement)
(58.89 + 8.26) = 67.15 CPU sec -> (55.98 + 8.48) = 64.46 CPU sec (-4.0% - improvement)
zstd pt=8
1648282 -> 1892531 ops/sec (+14.8% - improvement)
(59.43 + 8.63) = 68.06 CPU sec -> (56.49 + 8.32) = 64.81 CPU sec (-4.8% - improvement)

The throughput is now able to increase by *more than half* with lots of parallelism, rather than only *about a third*.

Scalability is a bit better with higher compression level, and we still see a benefit from this change. (We've also enabled partitioned indexes and filters here, which sees essentially the same benefits):

zstd pt=1 compression_level=7
595720 -> 597359 ops/sec (+0.3% - improvement)
(63.45 + 0.73) = 64.18 CPU sec -> (63.25 + 0.71) = 63.96 CPU sec (-0.3% - improvement)
zstd pt=4 compression_level=7
1527116 -> 1501779 ops/sec (-1.7% - regression)
(85.00 + 8.14) = 93.14 CPU sec -> (81.85 + 9.02) = 90.87 CPU sec (-2.5% - improvement)
zstd pt=6 compression_level=7
1678239 -> 1956070 ops/sec (+16.5% - improvement)
(83.77 + 8.11) = 91.88 CPU sec -> (79.87 + 7.78) = 87.65 CPU sec (-4.6% - improvement)
zstd pt=8 compression_level=7
1696132 -> 1953041 ops/sec (+15.1% - improvement)
(83.97 + 8.14) = 92.11 CPU sec -> (80.61 + 7.78) = 88.39 CPU sec (-4.1% - improvement)

With more tests, not really seeing any consistent differences with no parallelism (despite some micro-optimizations thrown in)

Reviewed By: hx235

Differential Revision: D79853111

Pulled By: pdillinger

fbshipit-source-id: 7a34fd7811217fb74fa6d3efaea7ffcce72beec7
---
 include/rocksdb/compression_type.h            |  12 +-
 include/rocksdb/table.h                       |   7 +
 .../block_based/block_based_table_builder.cc  | 310 +++++++-----------
 table/block_based/block_based_table_builder.h |  12 +-
 table/block_based/block_builder.h             |   2 +
 table/block_based/index_builder.cc            | 111 +++++--
 table/block_based/index_builder.h             | 175 +++++++++-
 .../block_based/user_defined_index_wrapper.h  |  19 ++
 table/format.h                                |  10 +
 .../parallel_compression.md                   |   1 +
 10 files changed, 405 insertions(+), 254 deletions(-)
 create mode 100644 unreleased_history/performance_improvements/parallel_compression.md

diff --git a/include/rocksdb/compression_type.h b/include/rocksdb/compression_type.h
index 63d78c163c49..a05aa3307874 100644
--- a/include/rocksdb/compression_type.h
+++ b/include/rocksdb/compression_type.h
@@ -226,11 +226,15 @@ struct CompressionOptions {
   // The training data will be used to generate a dictionary of max_dict_bytes.
   uint32_t zstd_max_train_bytes = 0;
 
-  // Number of threads for parallel compression.
-  // Parallel compression is enabled only if threads > 1.
-  // THE FEATURE IS STILL EXPERIMENTAL
+  // Number of threads for parallel compression for each running flush or
+  // compaction job. Parallel compression is enabled only if threads > 1. Not
+  // recommended for lightweight compression algorithms such as Snappy, LZ4, and
+  // obviously kNoCompression because there is unlikely to be a throughput gain.
   //
-  // This option is valid only when BlockBasedTable is used.
+  // This option is valid only when BlockBasedTable is used and is disabled
+  // (sanitized to 1) with any of these:
+  // * User-defined index (UserDefinedIndexFactory)
+  // * partition_filters == true && decouple_partitioned_filters == false
   //
   // When parallel compression is enabled, SST size file sizes might be
   // more inflated compared to the target size, because more data of unknown
diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h
index 6c71b02501dc..6e71ed99f279 100644
--- a/include/rocksdb/table.h
+++ b/include/rocksdb/table.h
@@ -442,6 +442,10 @@ struct BlockBasedTableOptions {
   //
   // decouple_partitioned_filters = true is the new default. This option is now
   // DEPRECATED and might be ignored and/or removed in a future release.
+  //
+  // NOTE: decouple_partitioned_filters = false with partition_filters = true
+  // disables parallel compression (CompressionOptions::parallel_threads
+  // sanitized to 1).
   bool decouple_partitioned_filters = true;
 
   // Option to generate Bloom/Ribbon filters that minimize memory
@@ -500,6 +504,9 @@ struct BlockBasedTableOptions {
   // If non-nullptr, use the specified factory to build user-defined index.
   // This allows users to define their own index format and build the index
   // during table building.
+  //
+  // NOTE: UserDefinedIndexFactory currently disables parallel compression
+  // (CompressionOptions::parallel_threads sanitized to 1).
   std::shared_ptr<UserDefinedIndexFactory> user_defined_index_factory = nullptr;
 
   // If true, place whole keys in the filter (not just prefixes).
diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc
index 67c1e167a0aa..121d520bbd75 100644
--- a/table/block_based/block_based_table_builder.cc
+++ b/table/block_based/block_based_table_builder.cc
@@ -224,7 +224,6 @@ struct BlockBasedTableBuilder::ParallelCompressionRep {
     std::vector<std::string> keys_;
     size_t size_;
   };
-  Keys curr_block_keys;
 
   struct BlockRep;
 
@@ -253,11 +252,7 @@ struct BlockBasedTableBuilder::ParallelCompressionRep {
     std::string uncompressed;
     GrowableBuffer compressed;
     CompressionType compression_type = kNoCompression;
-    // For efficiency, the std::string is repeatedly overwritten without
-    // checking for "has no value". Only at the end of its life will it be
-    // assigned "no value". Thus, it needs to start with a value.
-    std::optional<std::string> first_key_in_next_block = std::string{};
-    Keys keys;
+    std::unique_ptr<IndexBuilder::PreparedIndexEntry> prepared_index_entry;
     BlockRepSlot slot;
     Status status;
   };
@@ -419,26 +414,12 @@ struct BlockBasedTableBuilder::ParallelCompressionRep {
 #endif
   }
 
-  // Make a block prepared to be emitted to compression thread
-  // Used in non-buffered mode
-  BlockRep* PrepareBlock(const Slice* first_key_in_next_block,
-                         BlockBuilder* data_block) {
-    BlockRep* block_rep = PrepareBlockInternal(first_key_in_next_block);
+  BlockRep* PopRecycledBlockRep() {
+    BlockRep* block_rep = nullptr;
+    block_rep_pool.pop(block_rep);
     assert(block_rep != nullptr);
-    data_block->SwapAndReset(block_rep->uncompressed);
-    std::swap(block_rep->keys, curr_block_keys);
-    curr_block_keys.Clear();
-    return block_rep;
-  }
 
-  // Used in EnterUnbuffered
-  BlockRep* PrepareBlock(const Slice* first_key_in_next_block,
-                         std::string* data_block,
-                         std::vector<std::string>* keys) {
-    BlockRep* block_rep = PrepareBlockInternal(first_key_in_next_block);
-    assert(block_rep != nullptr);
-    std::swap(block_rep->uncompressed, *data_block);
-    block_rep->keys.SwapAssign(*keys);
+    block_rep->compression_type = kNoCompression;
     return block_rep;
   }
 
@@ -473,24 +454,6 @@ struct BlockBasedTableBuilder::ParallelCompressionRep {
       first_block_cond.notify_one();
     }
   }
-
- private:
-  BlockRep* PrepareBlockInternal(const Slice* first_key_in_next_block) {
-    BlockRep* block_rep = nullptr;
-    block_rep_pool.pop(block_rep);
-    assert(block_rep != nullptr);
-
-    block_rep->compression_type = kNoCompression;
-
-    if (first_key_in_next_block == nullptr) {
-      block_rep->first_key_in_next_block = {};
-    } else {
-      block_rep->first_key_in_next_block->assign(
-          first_key_in_next_block->data(), first_key_in_next_block->size());
-    }
-
-    return block_rep;
-  }
 };
 
 struct BlockBasedTableBuilder::Rep {
@@ -531,7 +494,6 @@ struct BlockBasedTableBuilder::Rep {
   PartitionedIndexBuilder* p_index_builder_ = nullptr;
 
   std::string last_ikey;  // Internal key or empty (unset)
-  const Slice* first_key_in_next_block = nullptr;
   bool warm_cache = false;
   bool uses_explicit_compression_manager = false;
 
@@ -749,7 +711,12 @@ struct BlockBasedTableBuilder::Rep {
         sampled_input_data_bytes(0),
         sampled_output_slow_data_bytes(0),
         sampled_output_fast_data_bytes(0),
-        compression_parallel_threads(tbo.compression_opts.parallel_threads),
+        compression_parallel_threads(
+            ((table_opt.partition_filters &&
+              !table_opt.decouple_partitioned_filters) ||
+             table_options.user_defined_index_factory)
+                ? uint32_t{1}
+                : tbo.compression_opts.parallel_threads),
         max_compressed_bytes_per_kb(
             tbo.compression_opts.max_compressed_bytes_per_kb),
         data_block_working_areas(compression_parallel_threads),
@@ -1167,61 +1134,20 @@ void BlockBasedTableBuilder::Add(const Slice& ikey, const Slice& value) {
     auto should_flush = r->flush_block_policy->Update(ikey, value);
     if (should_flush) {
       assert(!r->data_block.empty());
-      r->first_key_in_next_block = &ikey;
-      Flush();
-      if (r->state == Rep::State::kBuffered) {
-        bool exceeds_buffer_limit =
-            (r->buffer_limit != 0 && r->data_begin_offset > r->buffer_limit);
-        bool exceeds_global_block_cache_limit = false;
-
-        // Increase cache charging for the last buffered data block
-        // only if the block is not going to be unbuffered immediately
-        // and there exists a cache reservation manager
-        if (!exceeds_buffer_limit &&
-            r->compression_dict_buffer_cache_res_mgr != nullptr) {
-          Status s =
-              r->compression_dict_buffer_cache_res_mgr->UpdateCacheReservation(
-                  r->data_begin_offset);
-          exceeds_global_block_cache_limit = s.IsMemoryLimit();
-        }
-
-        if (exceeds_buffer_limit || exceeds_global_block_cache_limit) {
-          EnterUnbuffered();
-        }
-      }
-
-      // Add item to index block.
-      // We do not emit the index entry for a block until we have seen the
-      // first key for the next data block.  This allows us to use shorter
-      // keys in the index block.  For example, consider a block boundary
-      // between the keys "the quick brown fox" and "the who".  We can use
-      // "the r" as the key for the index block entry since it is >= all
-      // entries in the first block and < all entries in subsequent
-      // blocks.
-      if (ok() && r->state == Rep::State::kUnbuffered) {
-        if (r->IsParallelCompressionEnabled()) {
-          r->pc_rep->curr_block_keys.Clear();
-        } else {
-          r->index_builder->AddIndexEntry(r->last_ikey, &ikey,
-                                          r->pending_handle,
-                                          &r->index_separator_scratch);
-        }
-      }
+      Flush(/*first_key_in_next_block=*/&ikey);
     }
 
-    // Note: PartitionedFilterBlockBuilder requires key being added to filter
-    // builder after being added to index builder.
+    // Note: PartitionedFilterBlockBuilder with
+    // decouple_partitioned_filters=false requires key being added to filter
+    // builder after being added to and "finished" in the index builder, so
+    // forces no parallel compression (logic in Rep constructor).
     if (r->state == Rep::State::kUnbuffered) {
-      if (r->IsParallelCompressionEnabled()) {
-        r->pc_rep->curr_block_keys.PushBack(ikey);
-      } else {
-        if (r->filter_builder != nullptr) {
-          r->filter_builder->AddWithPrevKey(
-              ExtractUserKeyAndStripTimestamp(ikey, r->ts_sz),
-              r->last_ikey.empty()
-                  ? Slice{}
-                  : ExtractUserKeyAndStripTimestamp(r->last_ikey, r->ts_sz));
-        }
+      if (r->filter_builder != nullptr) {
+        r->filter_builder->AddWithPrevKey(
+            ExtractUserKeyAndStripTimestamp(ikey, r->ts_sz),
+            r->last_ikey.empty()
+                ? Slice{}
+                : ExtractUserKeyAndStripTimestamp(r->last_ikey, r->ts_sz));
       }
     }
 
@@ -1232,9 +1158,7 @@ void BlockBasedTableBuilder::Add(const Slice& ikey, const Slice& value) {
       // Buffered keys will be replayed from data_block_buffers during
       // `Finish()` once compression dictionary has been finalized.
     } else {
-      if (!r->IsParallelCompressionEnabled()) {
-        r->index_builder->OnKeyAdded(ikey, value);
-      }
+      r->index_builder->OnKeyAdded(ikey, value);
     }
     // TODO offset passed in is not accurate for parallel compression case
     NotifyCollectTableCollectorsOnAdd(ikey, value, r->get_offset(),
@@ -1281,7 +1205,7 @@ void BlockBasedTableBuilder::Add(const Slice& ikey, const Slice& value) {
   }
 }
 
-void BlockBasedTableBuilder::Flush() {
+void BlockBasedTableBuilder::Flush(const Slice* first_key_in_next_block) {
   Rep* r = rep_;
   assert(rep_->state != Rep::State::kClosed);
   if (!ok()) {
@@ -1290,7 +1214,6 @@ void BlockBasedTableBuilder::Flush() {
   if (r->data_block.empty()) {
     return;
   }
-
   Slice uncompressed_block_data = r->data_block.Finish();
 
   // NOTE: compression sampling is done here in the same thread as building
@@ -1369,18 +1292,46 @@ void BlockBasedTableBuilder::Flush() {
     assert(uncompressed_block_data.size() == uncompressed_block_holder.size());
     rep_->data_block_buffers.emplace_back(std::move(uncompressed_block_holder));
     rep_->data_begin_offset += uncompressed_block_data.size();
-  } else if (r->IsParallelCompressionEnabled()) {
-    assert(rep_->state == Rep::State::kUnbuffered);
+    MaybeEnterUnbuffered(first_key_in_next_block);
+  } else {
+    EmitBlock(r->data_block.MutableBuffer(), r->last_ikey,
+              first_key_in_next_block);
+    r->data_block.Reset();
+  }
+}
+
+void BlockBasedTableBuilder::EmitBlock(std::string& uncompressed,
+                                       const Slice& last_key_in_current_block,
+                                       const Slice* first_key_in_next_block) {
+  Rep* r = rep_;
+  assert(r->state == Rep::State::kUnbuffered);
+  assert(uncompressed.size() > 0);
+  if (r->IsParallelCompressionEnabled()) {
     ParallelCompressionRep::BlockRep* block_rep =
-        r->pc_rep->PrepareBlock(r->first_key_in_next_block, &(r->data_block));
+        r->pc_rep->PopRecycledBlockRep();
+    std::swap(uncompressed, block_rep->uncompressed);
+    r->index_builder->PrepareIndexEntry(last_key_in_current_block,
+                                        first_key_in_next_block,
+                                        block_rep->prepared_index_entry.get());
+
     assert(block_rep != nullptr);
     r->pc_rep->file_size_estimator.EmitBlock(block_rep->uncompressed.size(),
                                              r->get_offset());
     r->pc_rep->EmitBlock(block_rep);
   } else {
-    assert(rep_->state == Rep::State::kUnbuffered);
-    WriteBlock(uncompressed_block_data, &r->pending_handle, BlockType::kData);
-    r->data_block.Reset();
+    WriteBlock(uncompressed, &r->pending_handle, BlockType::kData);
+    if (ok()) {
+      // We do not emit the index entry for a block until we have seen the
+      // first key for the next data block.  This allows us to use shorter
+      // keys in the index block.  For example, consider a block boundary
+      // between the keys "the quick brown fox" and "the who".  We can use
+      // "the r" as the key for the index block entry since it is >= all
+      // entries in the first block and < all entries in subsequent
+      // blocks.
+      r->index_builder->AddIndexEntry(
+          last_key_in_current_block, first_key_in_next_block, r->pending_handle,
+          &r->index_separator_scratch);
+    }
   }
 }
 
@@ -1646,7 +1597,6 @@ void BlockBasedTableBuilder::BGWorkWriteMaybeCompressedBlock() {
   ParallelCompressionRep::BlockRepSlot* slot = nullptr;
   ParallelCompressionRep::BlockRep* block_rep = nullptr;
   // Starts empty; see FilterBlockBuilder::AddWithPrevKey
-  std::string prev_block_last_key_no_ts;
   while (r->pc_rep->write_queue.pop(slot)) {
     // FIXME: this is weird popping off write queue just to wait again on
     // compress queue
@@ -1662,21 +1612,6 @@ void BlockBasedTableBuilder::BGWorkWriteMaybeCompressedBlock() {
       continue;
     }
 
-    Slice prev_key_no_ts = prev_block_last_key_no_ts;
-    for (size_t i = 0; i < block_rep->keys.Size(); i++) {
-      auto& key = block_rep->keys[i];
-      if (r->filter_builder != nullptr) {
-        Slice key_no_ts = ExtractUserKeyAndStripTimestamp(key, r->ts_sz);
-        r->filter_builder->AddWithPrevKey(key_no_ts, prev_key_no_ts);
-        prev_key_no_ts = key_no_ts;
-      }
-      r->index_builder->OnKeyAdded(key, {});
-    }
-    if (r->filter_builder != nullptr) {
-      prev_block_last_key_no_ts.assign(prev_key_no_ts.data(),
-                                       prev_key_no_ts.size());
-    }
-
     r->pc_rep->file_size_estimator.SetCurrBlockUncompSize(
         block_rep->uncompressed.size());
     Slice compressed = block_rep->compressed;
@@ -1693,17 +1628,8 @@ void BlockBasedTableBuilder::BGWorkWriteMaybeCompressedBlock() {
     r->props.data_size = r->get_offset();
     ++r->props.num_data_blocks;
 
-    if (!block_rep->first_key_in_next_block.has_value()) {
-      r->index_builder->AddIndexEntry(block_rep->keys.Back(), nullptr,
-                                      r->pending_handle,
-                                      &r->index_separator_scratch);
-    } else {
-      Slice first_key_in_next_block =
-          Slice(*block_rep->first_key_in_next_block);
-      r->index_builder->AddIndexEntry(
-          block_rep->keys.Back(), &first_key_in_next_block, r->pending_handle,
-          &r->index_separator_scratch);
-    }
+    r->index_builder->FinishIndexEntry(r->pending_handle,
+                                       block_rep->prepared_index_entry.get());
 
     r->pc_rep->ReapBlock(block_rep);
   }
@@ -1715,6 +1641,8 @@ void BlockBasedTableBuilder::StartParallelCompression() {
   rep_->pc_rep->compress_thread_pool.reserve(
       rep_->compression_parallel_threads);
   for (uint32_t i = 0; i < rep_->compression_parallel_threads; i++) {
+    rep_->pc_rep->block_rep_buf[i].prepared_index_entry =
+        rep_->index_builder->CreatePreparedIndexEntry();
     rep_->pc_rep->compress_thread_pool.emplace_back(
         [this, i] { BGWorkCompression(rep_->data_block_working_areas[i]); });
   }
@@ -2076,9 +2004,32 @@ void BlockBasedTableBuilder::WriteFooter(BlockHandle& metaindex_block_handle,
   }
 }
 
-void BlockBasedTableBuilder::EnterUnbuffered() {
+void BlockBasedTableBuilder::MaybeEnterUnbuffered(
+    const Slice* first_key_in_next_block) {
   Rep* r = rep_;
   assert(r->state == Rep::State::kBuffered);
+  // Don't yet enter unbuffered (early return) if none of the conditions are met
+  if (first_key_in_next_block != nullptr) {
+    bool exceeds_buffer_limit =
+        (r->buffer_limit != 0 && r->data_begin_offset > r->buffer_limit);
+    if (!exceeds_buffer_limit) {
+      bool exceeds_global_block_cache_limit = false;
+      // Increase cache charging for the last buffered data block
+      // only if the block is not going to be unbuffered immediately
+      // and there exists a cache reservation manager
+      if (r->compression_dict_buffer_cache_res_mgr != nullptr) {
+        Status s =
+            r->compression_dict_buffer_cache_res_mgr->UpdateCacheReservation(
+                r->data_begin_offset);
+        exceeds_global_block_cache_limit = s.IsMemoryLimit();
+      }
+      if (!exceeds_global_block_cache_limit) {
+        return;
+      }
+    }
+  }
+
+  // Enter Unbuffered state
   r->state = Rep::State::kUnbuffered;
   const size_t kNumBlocksBuffered = r->data_block_buffers.size();
   if (kNumBlocksBuffered == 0) {
@@ -2189,60 +2140,32 @@ void BlockBasedTableBuilder::EnterUnbuffered() {
       assert(iter != nullptr);
     };
 
+    for (; iter->Valid(); iter->Next()) {
+      Slice key = iter->key();
+      if (r->filter_builder != nullptr) {
+        // NOTE: AddWithPrevKey here would only save key copying if prev is
+        // pinned (iter->IsKeyPinned()), which is probably rare with delta
+        // encoding. OK to go from Add() here to AddWithPrevKey() in
+        // unbuffered operation.
+        r->filter_builder->Add(ExtractUserKeyAndStripTimestamp(key, r->ts_sz));
+      }
+      r->index_builder->OnKeyAdded(key, iter->value());
+    }
+
+    Slice first_key_in_loop_next_block;
+    const Slice* first_key_in_loop_next_block_ptr;
     if (i + 1 < r->data_block_buffers.size()) {
       next_block_iter = get_iterator_for_block(i + 1);
+      first_key_in_loop_next_block = next_block_iter->key();
+      first_key_in_loop_next_block_ptr = &first_key_in_loop_next_block;
+    } else {
+      first_key_in_loop_next_block_ptr = first_key_in_next_block;
     }
 
     auto& data_block = r->data_block_buffers[i];
-    if (r->IsParallelCompressionEnabled()) {
-      Slice first_key_in_next_block;
-      const Slice* first_key_in_next_block_ptr = &first_key_in_next_block;
-      if (i + 1 < r->data_block_buffers.size()) {
-        assert(next_block_iter != nullptr);
-        first_key_in_next_block = next_block_iter->key();
-      } else {
-        first_key_in_next_block_ptr = r->first_key_in_next_block;
-      }
-
-      std::vector<std::string> keys;
-      for (; iter->Valid(); iter->Next()) {
-        keys.emplace_back(iter->key().ToString());
-      }
-
-      ParallelCompressionRep::BlockRep* block_rep = r->pc_rep->PrepareBlock(
-          first_key_in_next_block_ptr, &data_block, &keys);
-
-      assert(block_rep != nullptr);
-      r->pc_rep->file_size_estimator.EmitBlock(block_rep->uncompressed.size(),
-                                               r->get_offset());
-      r->pc_rep->EmitBlock(block_rep);
-    } else {
-      for (; iter->Valid(); iter->Next()) {
-        Slice key = iter->key();
-        if (r->filter_builder != nullptr) {
-          // NOTE: AddWithPrevKey here would only save key copying if prev is
-          // pinned (iter->IsKeyPinned()), which is probably rare with delta
-          // encoding. OK to go from Add() here to AddWithPrevKey() in
-          // unbuffered operation.
-          r->filter_builder->Add(
-              ExtractUserKeyAndStripTimestamp(key, r->ts_sz));
-        }
-        r->index_builder->OnKeyAdded(key, iter->value());
-      }
-      WriteBlock(Slice(data_block), &r->pending_handle, BlockType::kData);
-      if (ok() && i + 1 < r->data_block_buffers.size()) {
-        assert(next_block_iter != nullptr);
-        Slice first_key_in_next_block = next_block_iter->key();
-
-        Slice* first_key_in_next_block_ptr = &first_key_in_next_block;
-
-        iter->SeekToLast();
-        assert(iter->Valid());
-        r->index_builder->AddIndexEntry(
-            iter->key(), first_key_in_next_block_ptr, r->pending_handle,
-            &r->index_separator_scratch);
-      }
-    }
+    iter->SeekToLast();
+    assert(iter->Valid());
+    EmitBlock(data_block, iter->key(), first_key_in_loop_next_block_ptr);
     std::swap(iter, next_block_iter);
   }
   r->data_block_buffers.clear();
@@ -2258,12 +2181,13 @@ void BlockBasedTableBuilder::EnterUnbuffered() {
 Status BlockBasedTableBuilder::Finish() {
   Rep* r = rep_;
   assert(r->state != Rep::State::kClosed);
-  bool empty_data_block = r->data_block.empty();
-  r->first_key_in_next_block = nullptr;
-  Flush();
-  if (r->state == Rep::State::kBuffered) {
-    EnterUnbuffered();
+  // To make sure properties block is able to keep the accurate size of index
+  // block, we will finish writing all index entries first, in Flush().
+  Flush(/*first_key_in_next_block=*/nullptr);
+  if (rep_->state == Rep::State::kBuffered) {
+    MaybeEnterUnbuffered(nullptr);
   }
+  assert(r->state == Rep::State::kUnbuffered);
   if (r->IsParallelCompressionEnabled()) {
     StopParallelCompression();
 #ifndef NDEBUG
@@ -2271,14 +2195,6 @@ Status BlockBasedTableBuilder::Finish() {
       assert(br.status.ok());
     }
 #endif  // !NDEBUG
-  } else {
-    // To make sure properties block is able to keep the accurate size of index
-    // block, we will finish writing all index entries first.
-    if (ok() && !empty_data_block) {
-      r->index_builder->AddIndexEntry(
-          r->last_ikey, nullptr /* no next data block */, r->pending_handle,
-          &r->index_separator_scratch);
-    }
   }
 
   r->props.tail_start_offset = r->offset;
diff --git a/table/block_based/block_based_table_builder.h b/table/block_based/block_based_table_builder.h
index f86216d2e184..2ba0ef8c8d6c 100644
--- a/table/block_based/block_based_table_builder.h
+++ b/table/block_based/block_based_table_builder.h
@@ -115,10 +115,14 @@ class BlockBasedTableBuilder : public TableBuilder {
  private:
   bool ok() const { return status().ok(); }
 
-  // Transition state from buffered to unbuffered. See `Rep::State` API comment
-  // for details of the states.
+  // Transition state from buffered to unbuffered if the conditions are met. See
+  // `Rep::State` API comment for details of the states.
   // REQUIRES: `rep_->state == kBuffered`
-  void EnterUnbuffered();
+  void MaybeEnterUnbuffered(const Slice* first_key_in_next_block);
+
+  void EmitBlock(std::string& uncompressed,
+                 const Slice& last_key_in_current_block,
+                 const Slice* first_key_in_next_block);
 
   // Compress and write block content to the file.
   void WriteBlock(const Slice& block_contents, BlockHandle* handle,
@@ -162,7 +166,7 @@ class BlockBasedTableBuilder : public TableBuilder {
   // Can be used to ensure that two adjacent entries never live in
   // the same data block.  Most clients should not need to use this method.
   // REQUIRES: Finish(), Abandon() have not been called
-  void Flush();
+  void Flush(const Slice* first_key_in_next_block);
 
   // Some compression libraries fail when the uncompressed size is bigger than
   // int. If uncompressed size is bigger than kCompressionSizeLimit, don't
diff --git a/table/block_based/block_builder.h b/table/block_based/block_builder.h
index f167470bb5f5..37e2c8ee69d2 100644
--- a/table/block_based/block_builder.h
+++ b/table/block_based/block_builder.h
@@ -80,6 +80,8 @@ class BlockBuilder {
   // Return true iff no entries have been added since the last Reset()
   bool empty() const { return buffer_.empty(); }
 
+  std::string& MutableBuffer() { return buffer_; }
+
  private:
   inline void AddWithLastKeyImpl(const Slice& key, const Slice& value,
                                  const Slice& last_key,
diff --git a/table/block_based/index_builder.cc b/table/block_based/index_builder.cc
index c3b360a07139..1ab6b0da82ae 100644
--- a/table/block_based/index_builder.cc
+++ b/table/block_based/index_builder.cc
@@ -153,15 +153,19 @@ PartitionedIndexBuilder::PartitionedIndexBuilder(
       // sub_index_builders could not safely exclude seq from the keys, then it
       // wil be enforced on all sub_index_builders on ::Finish.
       must_use_separator_with_seq_(false),
-      use_value_delta_encoding_(use_value_delta_encoding) {}
+      use_value_delta_encoding_(use_value_delta_encoding) {
+  MakeNewSubIndexBuilder();
+}
 
 void PartitionedIndexBuilder::MakeNewSubIndexBuilder() {
-  assert(sub_index_builder_ == nullptr);
-  sub_index_builder_ = std::make_unique<ShortenedIndexBuilder>(
+  auto new_builder = std::make_unique<ShortenedIndexBuilder>(
       comparator_, table_opt_.index_block_restart_interval,
       table_opt_.format_version, use_value_delta_encoding_,
       table_opt_.index_shortening, /* include_first_key */ false, ts_sz_,
       persist_user_defined_timestamps_);
+  sub_index_builder_ = new_builder.get();
+  // Start next partition entry, where we will modify the key
+  entries_.push_back({{}, std::move(new_builder)});
 
   BlockBuilder* builder_to_monitor;
   // Set sub_index_builder_->must_use_separator_with_seq_ to true if
@@ -192,38 +196,70 @@ void PartitionedIndexBuilder::RequestPartitionCut() {
   partition_cut_requested_ = true;
 }
 
+std::unique_ptr<IndexBuilder::PreparedIndexEntry>
+PartitionedIndexBuilder::CreatePreparedIndexEntry() {
+  // Fortunately, for ShortenedIndexBuilder, we can prepare an entry from one
+  // similarly configured builder and finish it at another.
+  return entries_.front().value->CreatePreparedIndexEntry();
+}
+void PartitionedIndexBuilder::PrepareIndexEntry(
+    const Slice& last_key_in_current_block,
+    const Slice* first_key_in_next_block, PreparedIndexEntry* out) {
+  // Fortunately, for ShortenedIndexBuilder, we can prepare an entry from one
+  // similarly configured builder and finish it at another. We just have to
+  // keep in mind that this first sub builder keeps track of the original
+  // must_use_separator_with_seq_ in the pipeline that is then propagated.
+  return entries_.front().value->PrepareIndexEntry(
+      last_key_in_current_block, first_key_in_next_block, out);
+}
+
+void PartitionedIndexBuilder::MaybeFlush(const Slice& index_key,
+                                         const BlockHandle& index_value) {
+  bool do_flush = !sub_index_builder_->index_block_builder_.empty() &&
+                  (partition_cut_requested_ ||
+                   flush_policy_->Update(
+                       index_key, EncodedBlockHandle(index_value).AsSlice()));
+  if (do_flush) {
+    assert(entries_.back().value.get() == sub_index_builder_);
+    cut_filter_block = true;
+    MakeNewSubIndexBuilder();
+  }
+}
+
+void PartitionedIndexBuilder::FinishIndexEntry(const BlockHandle& block_handle,
+                                               PreparedIndexEntry* base_entry) {
+  using SPIE = ShortenedIndexBuilder::ShortenedPreparedIndexEntry;
+  SPIE* entry = static_cast<SPIE*>(base_entry);
+
+  MaybeFlush(entry->separator_with_seq, block_handle);
+
+  sub_index_builder_->FinishIndexEntry(block_handle, base_entry);
+  std::swap(entries_.back().key, entry->separator_with_seq);
+
+  if (!must_use_separator_with_seq_ && entry->must_use_separator_with_seq) {
+    // We need to apply !must_use_separator_with_seq to all sub-index builders
+    must_use_separator_with_seq_ = true;
+    flush_policy_->Retarget(sub_index_builder_->index_block_builder_);
+  }
+  // NOTE: not compatible with coupled partitioned filters so don't need to
+  // cut_filter_block
+}
+
 Slice PartitionedIndexBuilder::AddIndexEntry(
     const Slice& last_key_in_current_block,
     const Slice* first_key_in_next_block, const BlockHandle& block_handle,
     std::string* separator_scratch) {
-  // Note: to avoid two consecuitive flush in the same method call, we do not
-  // check flush policy when adding the last key
-  if (LIKELY(first_key_in_next_block != nullptr)) {
-    // apply flush policy only to non-empty sub_index_builder_
-    if (sub_index_builder_ != nullptr) {
-      std::string handle_encoding;
-      block_handle.EncodeTo(&handle_encoding);
-      bool do_flush =
-          partition_cut_requested_ ||
-          flush_policy_->Update(last_key_in_current_block, handle_encoding);
-      if (do_flush) {
-        assert(entries_.back().value == nullptr);
-        std::swap(entries_.back().value, sub_index_builder_);
-        cut_filter_block = true;
-      }
-    }
+  // At least when running without parallel compression, maintain behavior of
+  // avoiding a last index partition with just one entry
+  if (first_key_in_next_block) {
+    MaybeFlush(last_key_in_current_block, block_handle);
   }
 
-  if (sub_index_builder_ == nullptr) {
-    MakeNewSubIndexBuilder();
-    // Reserve next partition entry, where we will modify the key and
-    // eventually set the value
-    entries_.push_back({{}, {}});
-  }
   auto sep = sub_index_builder_->AddIndexEntry(last_key_in_current_block,
                                                first_key_in_next_block,
                                                block_handle, separator_scratch);
   entries_.back().key.assign(sep.data(), sep.size());
+
   if (!must_use_separator_with_seq_ &&
       sub_index_builder_->must_use_separator_with_seq_) {
     // We need to apply !must_use_separator_with_seq to all sub-index builders
@@ -232,8 +268,6 @@ Slice PartitionedIndexBuilder::AddIndexEntry(
   }
   if (UNLIKELY(first_key_in_next_block == nullptr)) {
     // no more keys
-    assert(entries_.back().value == nullptr);
-    std::swap(entries_.back().value, sub_index_builder_);
     cut_filter_block = true;
   }
   return sep;
@@ -242,25 +276,30 @@ Slice PartitionedIndexBuilder::AddIndexEntry(
 Status PartitionedIndexBuilder::Finish(
     IndexBlocks* index_blocks, const BlockHandle& last_partition_block_handle) {
   if (partition_cnt_ == 0) {
-    partition_cnt_ = entries_.size();
+    sub_index_builder_ = nullptr;
+    if (!entries_.empty()) {
+      // Remove the last entry if it is empty
+      if (entries_.back().value->index_block_builder_.empty()) {
+        assert(entries_.back().key.empty());
+        entries_.pop_back();
+      }
+      partition_cnt_ = entries_.size();
+    }
   }
-  // It must be set to null after last key is added
-  assert(sub_index_builder_ == nullptr);
-  if (finishing_indexes == true) {
+  if (finishing_indexes_ == true) {
     Entry& last_entry = entries_.front();
-    std::string handle_encoding;
-    last_partition_block_handle.EncodeTo(&handle_encoding);
+    EncodedBlockHandle handle_encoding(last_partition_block_handle);
     std::string handle_delta_encoding;
     PutVarsignedint64(
         &handle_delta_encoding,
         last_partition_block_handle.size() - last_encoded_handle_.size());
     last_encoded_handle_ = last_partition_block_handle;
     const Slice handle_delta_encoding_slice(handle_delta_encoding);
-    index_block_builder_.Add(last_entry.key, handle_encoding,
+    index_block_builder_.Add(last_entry.key, handle_encoding.AsSlice(),
                              &handle_delta_encoding_slice);
     if (!must_use_separator_with_seq_) {
       index_block_builder_without_seq_.Add(ExtractUserKey(last_entry.key),
-                                           handle_encoding,
+                                           handle_encoding.AsSlice(),
                                            &handle_delta_encoding_slice);
     }
     entries_.pop_front();
@@ -284,7 +323,7 @@ Status PartitionedIndexBuilder::Finish(
     entry.value->must_use_separator_with_seq_ = must_use_separator_with_seq_;
     auto s = entry.value->Finish(index_blocks);
     index_size_ += index_blocks->index_block_contents.size();
-    finishing_indexes = true;
+    finishing_indexes_ = true;
     return s.ok() ? Status::Incomplete() : s;
   }
 }
diff --git a/table/block_based/index_builder.h b/table/block_based/index_builder.h
index a7ce797e0a29..630555219648 100644
--- a/table/block_based/index_builder.h
+++ b/table/block_based/index_builder.h
@@ -77,6 +77,49 @@ class IndexBuilder {
                               const BlockHandle& block_handle,
                               std::string* separator_scratch) = 0;
 
+  // An abstract (extensible) holder for passing data from PrepareIndexEntry to
+  // FinishIndexEntry (see below).
+  struct PreparedIndexEntry {
+    virtual ~PreparedIndexEntry() = default;
+  };
+
+  // Parallel compression/construction alternative to AddIndexEntry, 1/3
+  //
+  // This function creates a holder for data that needs to be passed from
+  // PrepareIndexEntry to FinishIndexEntry, depending on the implementation
+  // of those. Few of these are created and reused, so construction/destruction
+  // performance is not critical.
+  virtual std::unique_ptr<PreparedIndexEntry> CreatePreparedIndexEntry() = 0;
+
+  // Parallel compression/construction alternative to AddIndexEntry, 2/3
+  //
+  // One thread calls this function for successive index entries to compute and
+  // record in `out` what is needed to build the index entry EXCEPT for the
+  // BlockHandle, which will only be known later. That thread is generally the
+  // same thread as calls every other function such as OnKeyAdded EXCEPT
+  // FinishIndexEntry (see below). This function should be considered "mostly
+  // stateless" but might modify state distinct from what is modified by
+  // FinishIndexEntry. Ideally synchronization within the IndexBuilder can be
+  // avoided.
+  //
+  // The passed-in PreparedIndexEntry object is likely reused so might be
+  // passed-in in any state.
+  virtual void PrepareIndexEntry(const Slice& last_key_in_current_block,
+                                 const Slice* first_key_in_next_block,
+                                 PreparedIndexEntry* out) = 0;
+
+  // Parallel compression/construction alternative to AddIndexEntry, 3/3
+  //
+  // This function is called by a different thread than PrepareIndexEntry, but
+  // is called on entries in the same order as PrepareIndexEntry, passed in the
+  // PreparedIndexEntry objects populated by PrepareIndexEntry. This function
+  // finishes the same effect of AddIndexEntry but split across a few functions.
+  //
+  // External synchronization ensures Finish is only called after all the
+  // FinishIndexEntry calls have completed.
+  virtual void FinishIndexEntry(const BlockHandle& block_handle,
+                                PreparedIndexEntry* entry) = 0;
+
   // This method will be called whenever a key is added. The subclasses may
   // override OnKeyAdded() if they need to collect additional information.
   virtual void OnKeyAdded(const Slice& /*key*/,
@@ -118,7 +161,7 @@ class IndexBuilder {
   // can be used as separator.
   inline bool ShouldUseKeyPlusSeqAsSeparator(
       const Slice& last_key_in_current_block,
-      const Slice& first_key_in_next_block) {
+      const Slice& first_key_in_next_block) const {
     Slice l_user_key = ExtractUserKey(last_key_in_current_block);
     Slice r_user_key = ExtractUserKey(first_key_in_next_block);
     // If user defined timestamps are not persisted. All the user keys will
@@ -189,10 +232,9 @@ class ShortenedIndexBuilder : public IndexBuilder {
     }
   }
 
-  Slice AddIndexEntry(const Slice& last_key_in_current_block,
-                      const Slice* first_key_in_next_block,
-                      const BlockHandle& block_handle,
-                      std::string* separator_scratch) override {
+  Slice GetSeparatorWithSeq(const Slice& last_key_in_current_block,
+                            const Slice* first_key_in_next_block,
+                            std::string* separator_scratch) {
     Slice separator_with_seq;
     if (first_key_in_next_block != nullptr) {
       if (shortening_mode_ !=
@@ -218,20 +260,33 @@ class ShortenedIndexBuilder : public IndexBuilder {
         separator_with_seq = last_key_in_current_block;
       }
     }
+    return separator_with_seq;
+  }
 
-    assert(!include_first_key_ || !current_block_first_internal_key_.empty());
+  Slice GetFirstInternalKey(std::string* first_internal_key_buf) const {
+    if (!include_first_key_) {
+      return Slice();
+    }
+    assert(!current_block_first_internal_key_.empty());
     // When UDT should not be persisted, the index block builders take care of
     // stripping UDT from the key, for the first internal key contained in the
     // IndexValue, we need to explicitly do the stripping here before passing
     // it to the block builders.
-    std::string first_internal_key_buf;
     Slice first_internal_key = current_block_first_internal_key_;
     if (!current_block_first_internal_key_.empty() && ts_sz_ > 0 &&
         !persist_user_defined_timestamps_) {
-      StripTimestampFromInternalKey(&first_internal_key_buf,
+      first_internal_key_buf->clear();
+      StripTimestampFromInternalKey(first_internal_key_buf,
                                     current_block_first_internal_key_, ts_sz_);
-      first_internal_key = first_internal_key_buf;
+      first_internal_key = *first_internal_key_buf;
     }
+    return first_internal_key;
+  }
+
+  void AddIndexEntryImpl(const Slice& separator_with_seq,
+                         const Slice& first_internal_key,
+                         const BlockHandle& block_handle,
+                         bool must_use_separator_with_seq) {
     IndexValue entry(block_handle, first_internal_key);
     std::string encoded_entry;
     std::string delta_encoded_entry;
@@ -257,16 +312,85 @@ class ShortenedIndexBuilder : public IndexBuilder {
     // optimization is provided.
     index_block_builder_.Add(separator_with_seq, encoded_entry,
                              &delta_encoded_entry_slice);
-    if (!must_use_separator_with_seq_) {
+    if (!must_use_separator_with_seq) {
       index_block_builder_without_seq_.Add(ExtractUserKey(separator_with_seq),
                                            encoded_entry,
                                            &delta_encoded_entry_slice);
     }
+  }
+
+  Slice AddIndexEntry(const Slice& last_key_in_current_block,
+                      const Slice* first_key_in_next_block,
+                      const BlockHandle& block_handle,
+                      std::string* separator_scratch) override {
+    Slice separator_with_seq = GetSeparatorWithSeq(
+        last_key_in_current_block, first_key_in_next_block, separator_scratch);
+
+    std::string first_internal_key_buf;
+    Slice first_internal_key = GetFirstInternalKey(&first_internal_key_buf);
 
+    AddIndexEntryImpl(separator_with_seq, first_internal_key, block_handle,
+                      must_use_separator_with_seq_);
     current_block_first_internal_key_.clear();
     return separator_with_seq;
   }
 
+  struct ShortenedPreparedIndexEntry : public PreparedIndexEntry {
+    std::string separator_with_seq;
+    std::string first_internal_key;
+    bool must_use_separator_with_seq = false;
+    void SaveFrom(const Slice& from_separator,
+                  const Slice& from_first_internal_key,
+                  bool from_must_use_separator_with_seq) {
+      assert(from_separator.size() >= kNumInternalBytes);
+      if (from_separator.data() == separator_with_seq.data()) {
+        // No need to copy
+        assert(from_separator.size() == separator_with_seq.size());
+      } else {
+        // Copy the separator
+        separator_with_seq.assign(from_separator.data(), from_separator.size());
+      }
+      // first_internal_key is optional, so it may be empty.
+      assert(from_first_internal_key.empty() ||
+             from_first_internal_key.size() >= kNumInternalBytes);
+      if (from_first_internal_key.data() == first_internal_key.data()) {
+        // No need to copy
+        assert(from_first_internal_key.size() == first_internal_key.size());
+      } else {
+        // Copy the first internal key
+        first_internal_key.assign(from_first_internal_key.data(),
+                                  from_first_internal_key.size());
+      }
+      must_use_separator_with_seq = from_must_use_separator_with_seq;
+    }
+  };
+
+  std::unique_ptr<PreparedIndexEntry> CreatePreparedIndexEntry() override {
+    return std::make_unique<ShortenedPreparedIndexEntry>();
+  }
+
+  void PrepareIndexEntry(const Slice& last_key_in_current_block,
+                         const Slice* first_key_in_next_block,
+                         PreparedIndexEntry* out) override {
+    ShortenedPreparedIndexEntry* entry =
+        static_cast<ShortenedPreparedIndexEntry*>(out);
+    Slice separator =
+        GetSeparatorWithSeq(last_key_in_current_block, first_key_in_next_block,
+                            &entry->separator_with_seq);
+    Slice first_internal_key = GetFirstInternalKey(&entry->first_internal_key);
+    entry->SaveFrom(separator, first_internal_key,
+                    must_use_separator_with_seq_);
+    current_block_first_internal_key_.clear();
+  }
+
+  void FinishIndexEntry(const BlockHandle& block_handle,
+                        PreparedIndexEntry* base_entry) override {
+    ShortenedPreparedIndexEntry* entry =
+        static_cast<ShortenedPreparedIndexEntry*>(base_entry);
+    AddIndexEntryImpl(entry->separator_with_seq, entry->first_internal_key,
+                      block_handle, entry->must_use_separator_with_seq);
+  }
+
   using IndexBuilder::Finish;
   Status Finish(IndexBlocks* index_blocks,
                 const BlockHandle& /*last_partition_block_handle*/) override {
@@ -366,6 +490,23 @@ class HashIndexBuilder : public IndexBuilder {
         separator_scratch);
   }
 
+  std::unique_ptr<PreparedIndexEntry> CreatePreparedIndexEntry() override {
+    return primary_index_builder_.CreatePreparedIndexEntry();
+  }
+
+  void PrepareIndexEntry(const Slice& last_key_in_current_block,
+                         const Slice* first_key_in_next_block,
+                         PreparedIndexEntry* out) override {
+    ++current_restart_index_;
+    primary_index_builder_.PrepareIndexEntry(last_key_in_current_block,
+                                             first_key_in_next_block, out);
+  }
+
+  void FinishIndexEntry(const BlockHandle& block_handle,
+                        PreparedIndexEntry* entry) override {
+    primary_index_builder_.FinishIndexEntry(block_handle, entry);
+  }
+
   void OnKeyAdded(const Slice& key,
                   const std::optional<Slice>& /*value*/) override {
     auto key_prefix = hash_key_extractor_->Transform(key);
@@ -472,6 +613,14 @@ class PartitionedIndexBuilder : public IndexBuilder {
                       const BlockHandle& block_handle,
                       std::string* separator_scratch) override;
 
+  std::unique_ptr<PreparedIndexEntry> CreatePreparedIndexEntry() override;
+  void PrepareIndexEntry(const Slice& last_key_in_current_block,
+                         const Slice* first_key_in_next_block,
+                         PreparedIndexEntry* out) override;
+  void FinishIndexEntry(const BlockHandle& block_handle,
+                        PreparedIndexEntry* entry) override;
+  void MaybeFlush(const Slice& index_key, const BlockHandle& index_value);
+
   Status Finish(IndexBlocks* index_blocks,
                 const BlockHandle& last_partition_block_handle) override;
 
@@ -524,12 +673,12 @@ class PartitionedIndexBuilder : public IndexBuilder {
   std::list<Entry> entries_;
   BlockBuilder index_block_builder_;              // top-level index builder
   BlockBuilder index_block_builder_without_seq_;  // same for user keys
-  // the active partition index builder
-  std::unique_ptr<ShortenedIndexBuilder> sub_index_builder_;
+  // the active partition index builder (owned by an Entry in entries_)
+  ShortenedIndexBuilder* sub_index_builder_;
   // the last key in the active partition index builder
   std::unique_ptr<RetargetableFlushBlockPolicy> flush_policy_;
   // true if Finish is called once but not complete yet.
-  bool finishing_indexes = false;
+  bool finishing_indexes_ = false;
   const BlockBasedTableOptions& table_opt_;
   bool must_use_separator_with_seq_;
   bool use_value_delta_encoding_;
diff --git a/table/block_based/user_defined_index_wrapper.h b/table/block_based/user_defined_index_wrapper.h
index 190d02170c96..73161f64d628 100644
--- a/table/block_based/user_defined_index_wrapper.h
+++ b/table/block_based/user_defined_index_wrapper.h
@@ -69,6 +69,25 @@ class UserDefinedIndexBuilderWrapper : public IndexBuilder {
         separator_scratch);
   }
 
+  // Not supported with parallel compression
+  std::unique_ptr<PreparedIndexEntry> CreatePreparedIndexEntry() override {
+    return nullptr;
+  }
+  void PrepareIndexEntry(const Slice& last_key_in_current_block,
+                         const Slice* first_key_in_next_block,
+                         PreparedIndexEntry* out) override {
+    (void)last_key_in_current_block;
+    (void)first_key_in_next_block;
+    (void)out;
+    assert(false);
+  }
+  void FinishIndexEntry(const BlockHandle& block_handle,
+                        PreparedIndexEntry* entry) override {
+    (void)block_handle;
+    (void)entry;
+    assert(false);
+  }
+
   void OnKeyAdded(const Slice& key,
                   const std::optional<Slice>& value) override {
     ParsedInternalKey pkey;
diff --git a/table/format.h b/table/format.h
index c8e1c86e4141..38a5977abfd6 100644
--- a/table/format.h
+++ b/table/format.h
@@ -90,6 +90,16 @@ class BlockHandle {
   static const BlockHandle kNullBlockHandle;
 };
 
+struct EncodedBlockHandle {
+  explicit EncodedBlockHandle(const BlockHandle& h) {
+    auto end = h.EncodeTo(buffer.data());
+    size = end - buffer.data();
+  }
+  Slice AsSlice() const { return Slice(buffer.data(), size); }
+  std::array<char, BlockHandle::kMaxEncodedLength> buffer;
+  size_t size;
+};
+
 // Value in block-based table file index.
 //
 // The index entry for block n is: y -> h, [x],
diff --git a/unreleased_history/performance_improvements/parallel_compression.md b/unreleased_history/performance_improvements/parallel_compression.md
new file mode 100644
index 000000000000..769b03941e13
--- /dev/null
+++ b/unreleased_history/performance_improvements/parallel_compression.md
@@ -0,0 +1 @@
+* Improved CPU efficiency and scalability of parallel compression (`CompressionOptions::parallel_threads` > 1), though this efficiency improvement makes parallel compression currently incompatible with UserDefinedIndex and with old setting of `decouple_partitioned_filters=false`. Parallel compression is now considered a production-ready feature.

From 68efd6fd8ec83c84fb0f8bdff1d89adb1ac1932f Mon Sep 17 00:00:00 2001
From: Hui Xiao <huixiao@fb.com>
Date: Thu, 28 Aug 2025 13:46:54 -0700
Subject: [PATCH 254/500] Refactor ProcessKeyValueCompaction into smaller
 functions (#13879)

Summary:
**Context/Summary:**
`ProcessKeyValueCompaction()` has grown too long to resonate or add any logic to resume from some key and save progress for resumable compaction. This PR breaks this function into smaller functions. Almost all of them are cosmetic changes, except for one thing pointed out in below PR conversation.

Specially, this PR did the following:
- Added `SubcompactionInternalIterators`, `SubcompactionKeyBoundaries` and `BlobFileResources` to manage the lifetime of the local variables of the original functions to be used across smaller functions
- Moved AutoThreadOperationStageUpdater, some IO stats measurement to a different place that makes more sense

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13879

Test Plan: Existing UT

Reviewed By: jaykorean

Differential Revision: D80216092

Pulled By: hx235

fbshipit-source-id: 515615906e5e5fd5ec191bcdd4126f17d282cac2
---
 db/compaction/compaction_job.cc         | 551 +++++++++++++++---------
 db/compaction/compaction_job.h          | 119 ++++-
 db/compaction/compaction_service_job.cc |   2 +-
 3 files changed, 452 insertions(+), 220 deletions(-)

diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc
index 532c4cedcf8d..07bad56d3cb2 100644
--- a/db/compaction/compaction_job.cc
+++ b/db/compaction/compaction_job.cc
@@ -170,7 +170,11 @@ CompactionJob::CompactionJob(
       blob_output_directory_(blob_output_directory),
       db_mutex_(db_mutex),
       db_error_handler_(db_error_handler),
-      earliest_snapshot_(job_context->GetEarliestSnapshotSequence()),
+      // job_context cannot be nullptr, but we will assert later in the body of
+      // the constructor.
+      earliest_snapshot_(job_context
+                             ? job_context->GetEarliestSnapshotSequence()
+                             : kMaxSequenceNumber),
       job_context_(job_context),
       table_cache_(std::move(table_cache)),
       event_logger_(event_logger),
@@ -185,6 +189,7 @@ CompactionJob::CompactionJob(
       bg_bottom_compaction_scheduled_(bg_bottom_compaction_scheduled) {
   assert(job_stats_ != nullptr);
   assert(log_buffer_ != nullptr);
+  assert(job_context);
   assert(job_context->snapshot_context_initialized);
 
   const auto* cfd = compact_->compaction->column_family_data();
@@ -877,7 +882,7 @@ void CompactionJob::SetOutputTableProperties() {
   }
 }
 
-void CompactionJob::AggregateSubcompactionStats() {
+void CompactionJob::AggregateSubcompactionOutputAndJobStats() {
   // Before the compaction starts, is_remote_compaction was set to true if
   // compaction_service is set. We now know whether each sub_compaction was
   // done remotely or not. Reset is_remote_compaction back to false and allow
@@ -915,9 +920,10 @@ void CompactionJob::FinalizeCompactionRun(
     const Status& input_status, bool stats_built_from_input_table_prop,
     uint64_t num_input_range_del) {
   if (stats_built_from_input_table_prop) {
-    UpdateCompactionJobInputStats(internal_stats_, num_input_range_del);
+    UpdateCompactionJobInputStatsFromInternalStats(internal_stats_,
+                                                   num_input_range_del);
   }
-  UpdateCompactionJobOutputStats(internal_stats_);
+  UpdateCompactionJobOutputStatsFromInternalStats(internal_stats_);
   RecordCompactionIOStats();
 
   LogFlush(db_options_.info_log);
@@ -952,11 +958,11 @@ Status CompactionJob::Run() {
     SetOutputTableProperties();
   }
 
-  AggregateSubcompactionStats();
+  AggregateSubcompactionOutputAndJobStats();
 
   uint64_t num_input_range_del = 0;
   bool stats_built_from_input_table_prop =
-      BuildStatsFromInputFiles(&num_input_range_del);
+      UpdateInternalStatsFromInputFiles(&num_input_range_del);
 
   if (status.ok()) {
     status = VerifyCompactionRecordCounts(stats_built_from_input_table_prop,
@@ -1189,58 +1195,62 @@ void CompactionJob::NotifyOnSubcompactionCompleted(
   }
 }
 
-void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
-  assert(sub_compact);
-  assert(sub_compact->compaction);
+bool CompactionJob::ShouldUseLocalCompaction(SubcompactionState* sub_compact) {
   if (db_options_.compaction_service) {
     CompactionServiceJobStatus comp_status =
         ProcessKeyValueCompactionWithCompactionService(sub_compact);
     if (comp_status != CompactionServiceJobStatus::kUseLocal) {
-      return;
+      return false;
     }
     // fallback to local compaction
     assert(comp_status == CompactionServiceJobStatus::kUseLocal);
     sub_compact->compaction_job_stats.is_remote_compaction = false;
   }
+  return true;
+}
 
-  uint64_t prev_cpu_micros = db_options_.clock->CPUMicros();
+CompactionJob::CompactionIOStatsSnapshot CompactionJob::InitializeIOStats() {
+  CompactionIOStatsSnapshot io_stats;
 
-  ColumnFamilyData* cfd = sub_compact->compaction->column_family_data();
+  if (measure_io_stats_) {
+    io_stats.prev_perf_level = GetPerfLevel();
+    SetPerfLevel(PerfLevel::kEnableTimeAndCPUTimeExceptForMutex);
+    io_stats.prev_write_nanos = IOSTATS(write_nanos);
+    io_stats.prev_fsync_nanos = IOSTATS(fsync_nanos);
+    io_stats.prev_range_sync_nanos = IOSTATS(range_sync_nanos);
+    io_stats.prev_prepare_write_nanos = IOSTATS(prepare_write_nanos);
+    io_stats.prev_cpu_write_nanos = IOSTATS(cpu_write_nanos);
+    io_stats.prev_cpu_read_nanos = IOSTATS(cpu_read_nanos);
+  }
+
+  return io_stats;
+}
+
+Status CompactionJob::SetupAndValidateCompactionFilter(
+    SubcompactionState* sub_compact,
+    const CompactionFilter* configured_compaction_filter,
+    const CompactionFilter*& compaction_filter,
+    std::unique_ptr<CompactionFilter>& compaction_filter_from_factory) {
+  compaction_filter = configured_compaction_filter;
 
-  // Create compaction filter and fail the compaction if
-  // IgnoreSnapshots() = false because it is not supported anymore
-  const CompactionFilter* compaction_filter = cfd->ioptions().compaction_filter;
-  std::unique_ptr<CompactionFilter> compaction_filter_from_factory = nullptr;
   if (compaction_filter == nullptr) {
     compaction_filter_from_factory =
         sub_compact->compaction->CreateCompactionFilter();
     compaction_filter = compaction_filter_from_factory.get();
   }
+
   if (compaction_filter != nullptr && !compaction_filter->IgnoreSnapshots()) {
-    sub_compact->status = Status::NotSupported(
+    return Status::NotSupported(
         "CompactionFilter::IgnoreSnapshots() = false is not supported "
         "anymore.");
-    return;
   }
 
-  NotifyOnSubcompactionBegin(sub_compact);
-
-  // This is assigned after creation of SubcompactionState to simplify that
-  // creation across both CompactionJob and CompactionServiceCompactionJob
-  sub_compact->AssignRangeDelAggregator(
-      std::make_unique<CompactionRangeDelAggregator>(
-          &cfd->internal_comparator(), job_context_->snapshot_seqs,
-          &full_history_ts_low_, &trim_ts_));
-
-  // TODO: since we already use C++17, should use
-  // std::optional<const Slice> instead.
-  const std::optional<Slice> start = sub_compact->start;
-  const std::optional<Slice> end = sub_compact->end;
-
-  std::optional<Slice> start_without_ts;
-  std::optional<Slice> end_without_ts;
+  return Status::OK();
+}
 
-  ReadOptions read_options;
+void CompactionJob::InitializeReadOptions(
+    ColumnFamilyData* cfd, ReadOptions& read_options,
+    SubcompactionKeyBoundaries& boundaries) {
   read_options.verify_checksums = true;
   read_options.fill_cache = false;
   read_options.rate_limiter_priority = GetRateLimiterPriority();
@@ -1251,222 +1261,207 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
   // (b) CompactionFilter::Decision::kRemoveAndSkipUntil.
   read_options.total_order_seek = true;
 
-  const WriteOptions write_options(Env::IOPriority::IO_LOW,
-                                   Env::IOActivity::kCompaction);
-
   // Remove the timestamps from boundaries because boundaries created in
   // GenSubcompactionBoundaries doesn't strip away the timestamp.
-  size_t ts_sz = cfd->user_comparator()->timestamp_size();
-  if (start.has_value()) {
-    read_options.iterate_lower_bound = &(*start);
+  const size_t ts_sz = cfd->user_comparator()->timestamp_size();
+
+  if (boundaries.start.has_value()) {
+    read_options.iterate_lower_bound = &(*boundaries.start);
     if (ts_sz > 0) {
-      start_without_ts = StripTimestampFromUserKey(*start, ts_sz);
-      read_options.iterate_lower_bound = &(*start_without_ts);
+      boundaries.start_without_ts =
+          StripTimestampFromUserKey(*boundaries.start, ts_sz);
+      read_options.iterate_lower_bound = &(*boundaries.start_without_ts);
     }
   }
-  if (end.has_value()) {
-    read_options.iterate_upper_bound = &(*end);
+  if (boundaries.end.has_value()) {
+    read_options.iterate_upper_bound = &(*boundaries.end);
     if (ts_sz > 0) {
-      end_without_ts = StripTimestampFromUserKey(*end, ts_sz);
-      read_options.iterate_upper_bound = &(*end_without_ts);
+      boundaries.end_without_ts =
+          StripTimestampFromUserKey(*boundaries.end, ts_sz);
+      read_options.iterate_upper_bound = &(*boundaries.end_without_ts);
     }
   }
+}
+
+InternalIterator* CompactionJob::CreateInputIterator(
+    SubcompactionState* sub_compact, ColumnFamilyData* cfd,
+    SubcompactionInternalIterators& iterators,
+    SubcompactionKeyBoundaries& boundaries, ReadOptions& read_options) {
+  // This is assigned after creation of SubcompactionState to simplify that
+  // creation across both CompactionJob and CompactionServiceCompactionJob
+  sub_compact->AssignRangeDelAggregator(
+      std::make_unique<CompactionRangeDelAggregator>(
+          &cfd->internal_comparator(), job_context_->snapshot_seqs,
+          &full_history_ts_low_, &trim_ts_));
+
+  InitializeReadOptions(cfd, read_options, boundaries);
 
   // Although the v2 aggregator is what the level iterator(s) know about,
   // the AddTombstones calls will be propagated down to the v1 aggregator.
-  std::unique_ptr<InternalIterator> raw_input(versions_->MakeInputIterator(
-      read_options, sub_compact->compaction, sub_compact->RangeDelAgg(),
-      file_options_for_read_, start, end));
-  InternalIterator* input = raw_input.get();
-
-  IterKey start_ikey;
-  IterKey end_ikey;
-  Slice start_slice;
-  Slice end_slice;
-  Slice start_user_key{};
-  Slice end_user_key{};
-
-  static constexpr char kMaxTs[] =
-      "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff";
-  Slice ts_slice;
-  std::string max_ts;
+  iterators.raw_input =
+      std::unique_ptr<InternalIterator>(versions_->MakeInputIterator(
+          read_options, sub_compact->compaction, sub_compact->RangeDelAgg(),
+          file_options_for_read_, boundaries.start, boundaries.end));
+  InternalIterator* input = iterators.raw_input.get();
+
+  const size_t ts_sz = cfd->user_comparator()->timestamp_size();
   if (ts_sz > 0) {
-    if (ts_sz <= strlen(kMaxTs)) {
-      ts_slice = Slice(kMaxTs, ts_sz);
+    if (ts_sz <= strlen(boundaries.kMaxTs)) {
+      boundaries.ts_slice = Slice(boundaries.kMaxTs, ts_sz);
     } else {
-      max_ts = std::string(ts_sz, '\xff');
-      ts_slice = Slice(max_ts);
+      boundaries.max_ts = std::string(ts_sz, '\xff');
+      boundaries.ts_slice = Slice(boundaries.max_ts);
     }
   }
 
-  if (start.has_value()) {
-    start_ikey.SetInternalKey(*start, kMaxSequenceNumber, kValueTypeForSeek);
+  if (boundaries.start.has_value()) {
+    boundaries.start_ikey.SetInternalKey(*boundaries.start, kMaxSequenceNumber,
+                                         kValueTypeForSeek);
     if (ts_sz > 0) {
-      start_ikey.UpdateInternalKey(kMaxSequenceNumber, kValueTypeForSeek,
-                                   &ts_slice);
+      boundaries.start_ikey.UpdateInternalKey(
+          kMaxSequenceNumber, kValueTypeForSeek, &boundaries.ts_slice);
     }
-    start_slice = start_ikey.GetInternalKey();
-    start_user_key = start_ikey.GetUserKey();
+    boundaries.start_internal_key = boundaries.start_ikey.GetInternalKey();
+    boundaries.start_user_key = boundaries.start_ikey.GetUserKey();
   }
-  if (end.has_value()) {
-    end_ikey.SetInternalKey(*end, kMaxSequenceNumber, kValueTypeForSeek);
+  if (boundaries.end.has_value()) {
+    boundaries.end_ikey.SetInternalKey(*boundaries.end, kMaxSequenceNumber,
+                                       kValueTypeForSeek);
     if (ts_sz > 0) {
-      end_ikey.UpdateInternalKey(kMaxSequenceNumber, kValueTypeForSeek,
-                                 &ts_slice);
+      boundaries.end_ikey.UpdateInternalKey(
+          kMaxSequenceNumber, kValueTypeForSeek, &boundaries.ts_slice);
     }
-    end_slice = end_ikey.GetInternalKey();
-    end_user_key = end_ikey.GetUserKey();
+    boundaries.end_internal_key = boundaries.end_ikey.GetInternalKey();
+    boundaries.end_user_key = boundaries.end_ikey.GetUserKey();
   }
 
-  std::unique_ptr<InternalIterator> clip;
-  if (start.has_value() || end.has_value()) {
-    clip = std::make_unique<ClippingIterator>(
-        raw_input.get(), start.has_value() ? &start_slice : nullptr,
-        end.has_value() ? &end_slice : nullptr, &cfd->internal_comparator());
-    input = clip.get();
+  if (boundaries.start.has_value() || boundaries.end.has_value()) {
+    iterators.clip = std::make_unique<ClippingIterator>(
+        iterators.raw_input.get(),
+        boundaries.start.has_value() ? &boundaries.start_internal_key : nullptr,
+        boundaries.end.has_value() ? &boundaries.end_internal_key : nullptr,
+        &cfd->internal_comparator());
+    input = iterators.clip.get();
   }
 
-  std::unique_ptr<InternalIterator> blob_counter;
-
   if (sub_compact->compaction->DoesInputReferenceBlobFiles()) {
     BlobGarbageMeter* meter = sub_compact->Current().CreateBlobGarbageMeter();
-    blob_counter = std::make_unique<BlobCountingIterator>(input, meter);
-    input = blob_counter.get();
+    iterators.blob_counter =
+        std::make_unique<BlobCountingIterator>(input, meter);
+    input = iterators.blob_counter.get();
   }
 
-  std::unique_ptr<InternalIterator> trim_history_iter;
   if (ts_sz > 0 && !trim_ts_.empty()) {
-    trim_history_iter = std::make_unique<HistoryTrimmingIterator>(
+    iterators.trim_history_iter = std::make_unique<HistoryTrimmingIterator>(
         input, cfd->user_comparator(), trim_ts_);
-    input = trim_history_iter.get();
+    input = iterators.trim_history_iter.get();
   }
 
-  input->SeekToFirst();
-
-  AutoThreadOperationStageUpdater stage_updater(
-      ThreadStatus::STAGE_COMPACTION_PROCESS_KV);
-
-  // I/O measurement variables
-  PerfLevel prev_perf_level = PerfLevel::kEnableTime;
-  const uint64_t kRecordStatsEvery = 1000;
-  uint64_t prev_write_nanos = 0;
-  uint64_t prev_fsync_nanos = 0;
-  uint64_t prev_range_sync_nanos = 0;
-  uint64_t prev_prepare_write_nanos = 0;
-  uint64_t prev_cpu_write_nanos = 0;
-  uint64_t prev_cpu_read_nanos = 0;
-  if (measure_io_stats_) {
-    prev_perf_level = GetPerfLevel();
-    SetPerfLevel(PerfLevel::kEnableTimeAndCPUTimeExceptForMutex);
-    prev_write_nanos = IOSTATS(write_nanos);
-    prev_fsync_nanos = IOSTATS(fsync_nanos);
-    prev_range_sync_nanos = IOSTATS(range_sync_nanos);
-    prev_prepare_write_nanos = IOSTATS(prepare_write_nanos);
-    prev_cpu_write_nanos = IOSTATS(cpu_write_nanos);
-    prev_cpu_read_nanos = IOSTATS(cpu_read_nanos);
-  }
-
-  MergeHelper merge(
-      env_, cfd->user_comparator(), cfd->ioptions().merge_operator.get(),
-      compaction_filter, db_options_.info_log.get(),
-      false /* internal key corruption is expected */,
-      job_context_->GetLatestSnapshotSequence(), job_context_->snapshot_checker,
-      compact_->compaction->level(), db_options_.stats);
+  return input;
+}
 
+void CompactionJob::CreateBlobFileBuilder(SubcompactionState* sub_compact,
+                                          ColumnFamilyData* cfd,
+                                          BlobFileResources& blob_resources,
+                                          const WriteOptions& write_options) {
   const auto& mutable_cf_options =
       sub_compact->compaction->mutable_cf_options();
 
-  std::vector<std::string> blob_file_paths;
-
   // TODO: BlobDB to support output_to_proximal_level compaction, which needs
   //  2 builders, so may need to move to `CompactionOutputs`
-  std::unique_ptr<BlobFileBuilder> blob_file_builder(
-      (mutable_cf_options.enable_blob_files &&
-       sub_compact->compaction->output_level() >=
-           mutable_cf_options.blob_file_starting_level)
-          ? new BlobFileBuilder(
-                versions_, fs_.get(),
-                &sub_compact->compaction->immutable_options(),
-                &mutable_cf_options, &file_options_, &write_options, db_id_,
-                db_session_id_, job_id_, cfd->GetID(), cfd->GetName(),
-                write_hint_, io_tracer_, blob_callback_,
-                BlobFileCreationReason::kCompaction, &blob_file_paths,
-                sub_compact->Current().GetBlobFileAdditionsPtr())
-          : nullptr);
+  if (mutable_cf_options.enable_blob_files &&
+      sub_compact->compaction->output_level() >=
+          mutable_cf_options.blob_file_starting_level) {
+    blob_resources.blob_file_builder = std::make_unique<BlobFileBuilder>(
+        versions_, fs_.get(), &sub_compact->compaction->immutable_options(),
+        &mutable_cf_options, &file_options_, &write_options, db_id_,
+        db_session_id_, job_id_, cfd->GetID(), cfd->GetName(), write_hint_,
+        io_tracer_, blob_callback_, BlobFileCreationReason::kCompaction,
+        &blob_resources.blob_file_paths,
+        sub_compact->Current().GetBlobFileAdditionsPtr());
+  } else {
+    blob_resources.blob_file_builder = nullptr;
+  }
+}
 
-  TEST_SYNC_POINT("CompactionJob::Run():Inprogress");
-  TEST_SYNC_POINT_CALLBACK("CompactionJob::Run():PausingManualCompaction:1",
-                           static_cast<void*>(const_cast<std::atomic<bool>*>(
-                               &manual_compaction_canceled_)));
+std::unique_ptr<CompactionIterator> CompactionJob::CreateCompactionIterator(
+    SubcompactionState* sub_compact, ColumnFamilyData* cfd,
+    InternalIterator* input, const CompactionFilter* compaction_filter,
+    MergeHelper& merge, BlobFileResources& blob_resources,
+    const WriteOptions& write_options) {
+  CreateBlobFileBuilder(sub_compact, cfd, blob_resources, write_options);
 
   const std::string* const full_history_ts_low =
       full_history_ts_low_.empty() ? nullptr : &full_history_ts_low_;
-  const SequenceNumber job_snapshot_seq =
-      job_context_ ? job_context_->GetJobSnapshotSequence()
-                   : kMaxSequenceNumber;
+  assert(job_context_);
 
-  auto c_iter = std::make_unique<CompactionIterator>(
+  return std::make_unique<CompactionIterator>(
       input, cfd->user_comparator(), &merge, versions_->LastSequence(),
       &(job_context_->snapshot_seqs), earliest_snapshot_,
-      job_context_->earliest_write_conflict_snapshot, job_snapshot_seq,
-      job_context_->snapshot_checker, env_,
-      ShouldReportDetailedTime(env_, stats_), sub_compact->RangeDelAgg(),
-      blob_file_builder.get(), db_options_.allow_data_in_errors,
+      job_context_->earliest_write_conflict_snapshot,
+      job_context_->GetJobSnapshotSequence(), job_context_->snapshot_checker,
+      env_, ShouldReportDetailedTime(env_, stats_), sub_compact->RangeDelAgg(),
+      blob_resources.blob_file_builder.get(), db_options_.allow_data_in_errors,
       db_options_.enforce_single_del_contracts, manual_compaction_canceled_,
-      sub_compact->compaction
-          ->DoesInputReferenceBlobFiles() /* must_count_input_entries */,
+      sub_compact->compaction->DoesInputReferenceBlobFiles(),
       sub_compact->compaction, compaction_filter, shutting_down_,
       db_options_.info_log, full_history_ts_low, preserve_seqno_after_);
-  c_iter->SeekToFirst();
-
-  const auto& c_iter_stats = c_iter->iter_stats();
+}
 
-  // define the open and close functions for the compaction files, which will be
-  // used open/close output files when needed.
+std::pair<CompactionFileOpenFunc, CompactionFileCloseFunc>
+CompactionJob::CreateFileHandlers(SubcompactionState* sub_compact,
+                                  SubcompactionKeyBoundaries& boundaries) {
   const CompactionFileOpenFunc open_file_func =
       [this, sub_compact](CompactionOutputs& outputs) {
         return this->OpenCompactionOutputFile(sub_compact, outputs);
       };
 
+  const Slice* start_user_key =
+      sub_compact->start.has_value() ? &boundaries.start_user_key : nullptr;
+  const Slice* end_user_key =
+      sub_compact->end.has_value() ? &boundaries.end_user_key : nullptr;
+
   const CompactionFileCloseFunc close_file_func =
       [this, sub_compact, start_user_key, end_user_key](
           CompactionOutputs& outputs, const Status& status,
           const Slice& next_table_min_key) {
-        return this->FinishCompactionOutputFile(
-            status, sub_compact, outputs, next_table_min_key,
-            sub_compact->start.has_value() ? &start_user_key : nullptr,
-            sub_compact->end.has_value() ? &end_user_key : nullptr);
+        return this->FinishCompactionOutputFile(status, sub_compact, outputs,
+                                                next_table_min_key,
+                                                start_user_key, end_user_key);
       };
 
+  return {open_file_func, close_file_func};
+}
+
+Status CompactionJob::ProcessKeyValue(
+    SubcompactionState* sub_compact, ColumnFamilyData* cfd,
+    CompactionIterator* c_iter, const CompactionFileOpenFunc& open_file_func,
+    const CompactionFileCloseFunc& close_file_func, uint64_t& prev_cpu_micros) {
   Status status;
+  const uint64_t kRecordStatsEvery = 1000;
+  [[maybe_unused]] const std::optional<const Slice> end = sub_compact->end;
+
   TEST_SYNC_POINT_CALLBACK(
       "CompactionJob::ProcessKeyValueCompaction()::Processing",
       static_cast<void*>(const_cast<Compaction*>(sub_compact->compaction)));
-  uint64_t last_cpu_micros = prev_cpu_micros;
-  while (status.ok() && !cfd->IsDropped() && c_iter->Valid()) {
-    // Invariant: c_iter.status() is guaranteed to be OK if c_iter->Valid()
-    // returns true.
+
+  while (status.ok() && !cfd->IsDropped() && c_iter->Valid() &&
+         c_iter->status().ok()) {
     assert(!end.has_value() ||
            cfd->user_comparator()->Compare(c_iter->user_key(), *end) < 0);
 
-    if (c_iter_stats.num_input_records % kRecordStatsEvery ==
+    if (c_iter->iter_stats().num_input_records % kRecordStatsEvery ==
         kRecordStatsEvery - 1) {
-      RecordDroppedKeys(c_iter_stats, &sub_compact->compaction_job_stats);
-      c_iter->ResetRecordCounts();
-      RecordCompactionIOStats();
-
-      uint64_t cur_cpu_micros = db_options_.clock->CPUMicros();
-      assert(cur_cpu_micros >= last_cpu_micros);
-      RecordTick(stats_, COMPACTION_CPU_TOTAL_TIME,
-                 cur_cpu_micros - last_cpu_micros);
-      last_cpu_micros = cur_cpu_micros;
+      UpdateSubcompactionJobStatsIncrementally(
+          c_iter, &sub_compact->compaction_job_stats,
+          db_options_.clock->CPUMicros(), prev_cpu_micros);
     }
 
     const auto& ikey = c_iter->ikey();
     bool use_proximal_output = ikey.sequence > proximal_after_seqno_;
+
 #ifndef NDEBUG
     if (sub_compact->compaction->SupportsPerKeyPlacement()) {
-      // Could be overridden by unittest
       PerKeyPlacementContext context(sub_compact->compaction->output_level(),
                                      ikey.user_key, c_iter->value(),
                                      ikey.sequence, use_proximal_output);
@@ -1505,9 +1500,6 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
                              static_cast<void*>(const_cast<std::atomic<bool>*>(
                                  &manual_compaction_canceled_)));
     c_iter->Next();
-    if (c_iter->status().IsManualCompactionPaused()) {
-      break;
-    }
 
 #ifndef NDEBUG
     bool stop = false;
@@ -1519,6 +1511,28 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
 #endif  // NDEBUG
   }
 
+  return status;
+}
+
+void CompactionJob::UpdateSubcompactionJobStatsIncrementally(
+    CompactionIterator* c_iter, CompactionJobStats* compaction_job_stats,
+    uint64_t cur_cpu_micros, uint64_t& prev_cpu_micros) {
+  RecordDroppedKeys(c_iter->iter_stats(), compaction_job_stats);
+  c_iter->ResetRecordCounts();
+  RecordCompactionIOStats();
+
+  assert(cur_cpu_micros >= prev_cpu_micros);
+  RecordTick(stats_, COMPACTION_CPU_TOTAL_TIME,
+             cur_cpu_micros - prev_cpu_micros);
+  prev_cpu_micros = cur_cpu_micros;
+}
+
+void CompactionJob::FinalizeSubcompactionJobStats(
+    SubcompactionState* sub_compact, CompactionIterator* c_iter,
+    uint64_t start_cpu_micros, uint64_t prev_cpu_micros,
+    const CompactionIOStatsSnapshot& io_stats) {
+  const CompactionIterationStats& c_iter_stats = c_iter->iter_stats();
+
   // This number may not be accurate when CompactionIterator was created
   // with `must_count_input_entries=false`.
   assert(!sub_compact->compaction->DoesInputReferenceBlobFiles() ||
@@ -1556,9 +1570,42 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
                c_iter_stats.total_blob_bytes_relocated);
   }
 
-  RecordDroppedKeys(c_iter_stats, &sub_compact->compaction_job_stats);
-  RecordCompactionIOStats();
+  uint64_t cur_cpu_micros = db_options_.clock->CPUMicros();
+
+  // Record final compaction statistics including dropped keys, I/O stats,
+  // and CPU time delta from the last periodic measurement
+  UpdateSubcompactionJobStatsIncrementally(c_iter,
+                                           &sub_compact->compaction_job_stats,
+                                           cur_cpu_micros, prev_cpu_micros);
+
+  // Finalize timing and I/O statistics
+
+  sub_compact->compaction_job_stats.cpu_micros =
+      cur_cpu_micros - start_cpu_micros;
+
+  if (measure_io_stats_) {
+    sub_compact->compaction_job_stats.file_write_nanos +=
+        IOSTATS(write_nanos) - io_stats.prev_write_nanos;
+    sub_compact->compaction_job_stats.file_fsync_nanos +=
+        IOSTATS(fsync_nanos) - io_stats.prev_fsync_nanos;
+    sub_compact->compaction_job_stats.file_range_sync_nanos +=
+        IOSTATS(range_sync_nanos) - io_stats.prev_range_sync_nanos;
+    sub_compact->compaction_job_stats.file_prepare_write_nanos +=
+        IOSTATS(prepare_write_nanos) - io_stats.prev_prepare_write_nanos;
+    sub_compact->compaction_job_stats.cpu_micros -=
+        (IOSTATS(cpu_write_nanos) - io_stats.prev_cpu_write_nanos +
+         IOSTATS(cpu_read_nanos) - io_stats.prev_cpu_read_nanos) /
+        1000;
+    if (io_stats.prev_perf_level !=
+        PerfLevel::kEnableTimeAndCPUTimeExceptForMutex) {
+      SetPerfLevel(io_stats.prev_perf_level);
+    }
+  }
+}
 
+Status CompactionJob::FinalizeProcessKeyValueStatus(
+    ColumnFamilyData* cfd, InternalIterator* input_iter,
+    CompactionIterator* c_iter, Status status) {
   if (status.ok() && cfd->IsDropped()) {
     status =
         Status::ColumnFamilyDropped("Column family dropped during compaction");
@@ -1572,68 +1619,141 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
     status = Status::Incomplete(Status::SubCode::kManualCompactionPaused);
   }
   if (status.ok()) {
-    status = input->status();
+    status = input_iter->status();
   }
   if (status.ok()) {
     status = c_iter->status();
   }
 
+  return status;
+}
+
+Status CompactionJob::CleanupCompactionFiles(
+    SubcompactionState* sub_compact, Status status,
+    const CompactionFileOpenFunc& open_file_func,
+    const CompactionFileCloseFunc& close_file_func) {
   // Call FinishCompactionOutputFile() even if status is not ok: it needs to
   // close the output files. Open file function is also passed, in case there's
   // only range-dels, no file was opened, to save the range-dels, it need to
   // create a new output file.
-  status = sub_compact->CloseCompactionFiles(status, open_file_func,
-                                             close_file_func);
+  return sub_compact->CloseCompactionFiles(status, open_file_func,
+                                           close_file_func);
+}
 
+Status CompactionJob::FinalizeBlobFiles(SubcompactionState* sub_compact,
+                                        BlobFileBuilder* blob_file_builder,
+                                        Status status) {
   if (blob_file_builder) {
     if (status.ok()) {
       status = blob_file_builder->Finish();
     } else {
       blob_file_builder->Abandon(status);
     }
-    blob_file_builder.reset();
     sub_compact->Current().UpdateBlobStats();
   }
 
-  uint64_t cur_cpu_micros = db_options_.clock->CPUMicros();
-  sub_compact->compaction_job_stats.cpu_micros =
-      cur_cpu_micros - prev_cpu_micros;
-  RecordTick(stats_, COMPACTION_CPU_TOTAL_TIME,
-             cur_cpu_micros - last_cpu_micros);
+  return status;
+}
 
-  if (measure_io_stats_) {
-    sub_compact->compaction_job_stats.file_write_nanos +=
-        IOSTATS(write_nanos) - prev_write_nanos;
-    sub_compact->compaction_job_stats.file_fsync_nanos +=
-        IOSTATS(fsync_nanos) - prev_fsync_nanos;
-    sub_compact->compaction_job_stats.file_range_sync_nanos +=
-        IOSTATS(range_sync_nanos) - prev_range_sync_nanos;
-    sub_compact->compaction_job_stats.file_prepare_write_nanos +=
-        IOSTATS(prepare_write_nanos) - prev_prepare_write_nanos;
-    sub_compact->compaction_job_stats.cpu_micros -=
-        (IOSTATS(cpu_write_nanos) - prev_cpu_write_nanos +
-         IOSTATS(cpu_read_nanos) - prev_cpu_read_nanos) /
-        1000;
-    if (prev_perf_level != PerfLevel::kEnableTimeAndCPUTimeExceptForMutex) {
-      SetPerfLevel(prev_perf_level);
-    }
+void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
+  assert(sub_compact);
+  assert(sub_compact->compaction);
+
+  if (!ShouldUseLocalCompaction(sub_compact)) {
+    return;
   }
+
+  AutoThreadOperationStageUpdater stage_updater(
+      ThreadStatus::STAGE_COMPACTION_PROCESS_KV);
+
+  const uint64_t start_cpu_micros = db_options_.clock->CPUMicros();
+  uint64_t prev_cpu_micros = start_cpu_micros;
+  const CompactionIOStatsSnapshot io_stats = InitializeIOStats();
+  ColumnFamilyData* cfd = sub_compact->compaction->column_family_data();
+  const CompactionFilter* compaction_filter;
+  std::unique_ptr<CompactionFilter> compaction_filter_from_factory = nullptr;
+  Status filter_status = SetupAndValidateCompactionFilter(
+      sub_compact, cfd->ioptions().compaction_filter, compaction_filter,
+      compaction_filter_from_factory);
+  if (!filter_status.ok()) {
+    sub_compact->status = filter_status;
+    return;
+  }
+
+  NotifyOnSubcompactionBegin(sub_compact);
+
+  SubcompactionKeyBoundaries boundaries(sub_compact->start, sub_compact->end);
+  SubcompactionInternalIterators iterators;
+  ReadOptions read_options;
+  const WriteOptions write_options(Env::IOPriority::IO_LOW,
+                                   Env::IOActivity::kCompaction);
+  MergeHelper merge(
+      env_, cfd->user_comparator(), cfd->ioptions().merge_operator.get(),
+      compaction_filter, db_options_.info_log.get(),
+      false /* internal key corruption is expected */,
+      job_context_->GetLatestSnapshotSequence(), job_context_->snapshot_checker,
+      compact_->compaction->level(), db_options_.stats);
+  BlobFileResources blob_resources;
+
+  InternalIterator* input_iter = CreateInputIterator(
+      sub_compact, cfd, iterators, boundaries, read_options);
+  assert(input_iter);
+  input_iter->SeekToFirst();
+
+  auto c_iter =
+      CreateCompactionIterator(sub_compact, cfd, input_iter, compaction_filter,
+                               merge, blob_resources, write_options);
+  assert(c_iter);
+  c_iter->SeekToFirst();
+
+  TEST_SYNC_POINT("CompactionJob::Run():Inprogress");
+  TEST_SYNC_POINT_CALLBACK("CompactionJob::Run():PausingManualCompaction:1",
+                           static_cast<void*>(const_cast<std::atomic<bool>*>(
+                               &manual_compaction_canceled_)));
+
+  auto [open_file_func, close_file_func] =
+      CreateFileHandlers(sub_compact, boundaries);
+
+  Status status =
+      ProcessKeyValue(sub_compact, cfd, c_iter.get(), open_file_func,
+                      close_file_func, prev_cpu_micros);
+
+  status = FinalizeProcessKeyValueStatus(cfd, input_iter, c_iter.get(), status);
+
+  FinalizeSubcompaction(sub_compact, status, open_file_func, close_file_func,
+                        blob_resources.blob_file_builder.get(), c_iter.get(),
+                        input_iter, start_cpu_micros, prev_cpu_micros,
+                        io_stats);
+
+  NotifyOnSubcompactionCompleted(sub_compact);
+}
+
+void CompactionJob::FinalizeSubcompaction(
+    SubcompactionState* sub_compact, Status status,
+    const CompactionFileOpenFunc& open_file_func,
+    const CompactionFileCloseFunc& close_file_func,
+    BlobFileBuilder* blob_file_builder, CompactionIterator* c_iter,
+    [[maybe_unused]] InternalIterator* input_iter, uint64_t start_cpu_micros,
+    uint64_t prev_cpu_micros, const CompactionIOStatsSnapshot& io_stats) {
+  status = CleanupCompactionFiles(sub_compact, status, open_file_func,
+                                  close_file_func);
+  status = FinalizeBlobFiles(sub_compact, blob_file_builder, status);
+
+  FinalizeSubcompactionJobStats(sub_compact, c_iter, start_cpu_micros,
+                                prev_cpu_micros, io_stats);
+
 #ifdef ROCKSDB_ASSERT_STATUS_CHECKED
   if (!status.ok()) {
     if (c_iter) {
       c_iter->status().PermitUncheckedError();
     }
-    if (input) {
-      input->status().PermitUncheckedError();
+    if (input_iter) {
+      input_iter->status().PermitUncheckedError();
     }
   }
 #endif  // ROCKSDB_ASSERT_STATUS_CHECKED
 
-  blob_counter.reset();
-  clip.reset();
-  raw_input.reset();
   sub_compact->status = status;
-  NotifyOnSubcompactionCompleted(sub_compact);
 }
 
 uint64_t CompactionJob::GetCompactionId(SubcompactionState* sub_compact) const {
@@ -2150,7 +2270,8 @@ void CopyPrefix(const Slice& src, size_t prefix_length, std::string* dst) {
 }
 }  // namespace
 
-bool CompactionJob::BuildStatsFromInputFiles(uint64_t* num_input_range_del) {
+bool CompactionJob::UpdateInternalStatsFromInputFiles(
+    uint64_t* num_input_range_del) {
   assert(compact_);
 
   Compaction* compaction = compact_->compaction;
@@ -2232,7 +2353,7 @@ bool CompactionJob::BuildStatsFromInputFiles(uint64_t* num_input_range_del) {
   return !has_error;
 }
 
-void CompactionJob::UpdateCompactionJobInputStats(
+void CompactionJob::UpdateCompactionJobInputStatsFromInternalStats(
     const InternalStats::CompactionStatsFull& internal_stats,
     uint64_t num_input_range_del) const {
   assert(job_stats_);
@@ -2285,7 +2406,7 @@ void CompactionJob::UpdateCompactionJobInputStats(
   }
 }
 
-void CompactionJob::UpdateCompactionJobOutputStats(
+void CompactionJob::UpdateCompactionJobOutputStatsFromInternalStats(
     const InternalStats::CompactionStatsFull& internal_stats) const {
   assert(job_stats_);
   job_stats_->elapsed_micros = internal_stats.output_level_stats.micros;
diff --git a/db/compaction/compaction_job.h b/db/compaction/compaction_job.h
index 436169c5691a..87a9ccd11619 100644
--- a/db/compaction/compaction_job.h
+++ b/db/compaction/compaction_job.h
@@ -196,7 +196,7 @@ class CompactionJob {
   IOStatus io_status() const { return io_status_; }
 
  protected:
-  void UpdateCompactionJobOutputStats(
+  void UpdateCompactionJobOutputStatsFromInternalStats(
       const InternalStats::CompactionStatsFull& internal_stats) const;
 
   void LogCompaction();
@@ -242,9 +242,10 @@ class CompactionJob {
   // num_input_range_del are calculated successfully.
   //
   // This should be called only once for compactions (not per subcompaction)
-  bool BuildStatsFromInputFiles(uint64_t* num_input_range_del = nullptr);
+  bool UpdateInternalStatsFromInputFiles(
+      uint64_t* num_input_range_del = nullptr);
 
-  void UpdateCompactionJobInputStats(
+  void UpdateCompactionJobInputStatsFromInternalStats(
       const InternalStats::CompactionStatsFull& internal_stats,
       uint64_t num_input_range_del) const;
 
@@ -287,7 +288,10 @@ class CompactionJob {
   Status SyncOutputDirectories();
   Status VerifyOutputFiles();
   void SetOutputTableProperties();
-  void AggregateSubcompactionStats();
+  // Aggregates subcompaction output stats to internal stat, and aggregates
+  // subcompaction's compaction job stats to the whole entire surrounding
+  // compaction job stats.
+  void AggregateSubcompactionOutputAndJobStats();
   Status VerifyCompactionRecordCounts(bool stats_built_from_input_table_prop,
                                       uint64_t num_input_range_del);
   void FinalizeCompactionRun(const Status& status,
@@ -297,6 +301,113 @@ class CompactionJob {
   CompactionServiceJobStatus ProcessKeyValueCompactionWithCompactionService(
       SubcompactionState* sub_compact);
 
+  struct CompactionIOStatsSnapshot {
+    PerfLevel prev_perf_level = PerfLevel::kEnableTime;
+    uint64_t prev_write_nanos = 0;
+    uint64_t prev_fsync_nanos = 0;
+    uint64_t prev_range_sync_nanos = 0;
+    uint64_t prev_prepare_write_nanos = 0;
+    uint64_t prev_cpu_write_nanos = 0;
+    uint64_t prev_cpu_read_nanos = 0;
+  };
+
+  struct SubcompactionKeyBoundaries {
+    const std::optional<const Slice> start;
+    const std::optional<const Slice> end;
+
+    // Boundaries without timestamps for read options
+    std::optional<Slice> start_without_ts;
+    std::optional<Slice> end_without_ts;
+
+    // Timestamp management
+    static constexpr char kMaxTs[] =
+        "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff";
+    std::string max_ts;
+    Slice ts_slice;
+
+    // Internal key boundaries
+    IterKey start_ikey;
+    IterKey end_ikey;
+    Slice start_internal_key;
+    Slice end_internal_key;
+
+    // User key boundaries
+    Slice start_user_key;
+    Slice end_user_key;
+
+    SubcompactionKeyBoundaries(std::optional<const Slice> start_boundary,
+                               std::optional<const Slice> end_boundary)
+        : start(start_boundary), end(end_boundary) {}
+  };
+
+  struct SubcompactionInternalIterators {
+    std::unique_ptr<InternalIterator> raw_input;
+    std::unique_ptr<InternalIterator> clip;
+    std::unique_ptr<InternalIterator> blob_counter;
+    std::unique_ptr<InternalIterator> trim_history_iter;
+  };
+
+  struct BlobFileResources {
+    std::vector<std::string> blob_file_paths;
+    std::unique_ptr<BlobFileBuilder> blob_file_builder;
+  };
+
+  bool ShouldUseLocalCompaction(SubcompactionState* sub_compact);
+  CompactionIOStatsSnapshot InitializeIOStats();
+  Status SetupAndValidateCompactionFilter(
+      SubcompactionState* sub_compact,
+      const CompactionFilter* configured_compaction_filter,
+      const CompactionFilter*& compaction_filter,
+      std::unique_ptr<CompactionFilter>& compaction_filter_from_factory);
+  void InitializeReadOptions(ColumnFamilyData* cfd, ReadOptions& read_options,
+                             SubcompactionKeyBoundaries& boundaries);
+  InternalIterator* CreateInputIterator(
+      SubcompactionState* sub_compact, ColumnFamilyData* cfd,
+      SubcompactionInternalIterators& iterators,
+      SubcompactionKeyBoundaries& boundaries, ReadOptions& read_options);
+  void CreateBlobFileBuilder(SubcompactionState* sub_compact,
+                             ColumnFamilyData* cfd,
+                             BlobFileResources& blob_resources,
+                             const WriteOptions& write_options);
+  std::unique_ptr<CompactionIterator> CreateCompactionIterator(
+      SubcompactionState* sub_compact, ColumnFamilyData* cfd,
+      InternalIterator* input_iter, const CompactionFilter* compaction_filter,
+      MergeHelper& merge, BlobFileResources& blob_resources,
+      const WriteOptions& write_options);
+  std::pair<CompactionFileOpenFunc, CompactionFileCloseFunc> CreateFileHandlers(
+      SubcompactionState* sub_compact, SubcompactionKeyBoundaries& boundaries);
+  Status ProcessKeyValue(SubcompactionState* sub_compact, ColumnFamilyData* cfd,
+                         CompactionIterator* c_iter,
+                         const CompactionFileOpenFunc& open_file_func,
+                         const CompactionFileCloseFunc& close_file_func,
+                         uint64_t& prev_cpu_micros);
+  void UpdateSubcompactionJobStatsIncrementally(
+      CompactionIterator* c_iter, CompactionJobStats* compaction_job_stats,
+      uint64_t cur_cpu_micros, uint64_t& prev_cpu_micros);
+  void FinalizeSubcompactionJobStats(SubcompactionState* sub_compact,
+                                     CompactionIterator* c_iter,
+                                     uint64_t start_cpu_micros,
+                                     uint64_t prev_cpu_micros,
+                                     const CompactionIOStatsSnapshot& io_stats);
+  Status FinalizeProcessKeyValueStatus(ColumnFamilyData* cfd,
+                                       InternalIterator* input_iter,
+                                       CompactionIterator* c_iter,
+                                       Status status);
+  Status CleanupCompactionFiles(SubcompactionState* sub_compact, Status status,
+                                const CompactionFileOpenFunc& open_file_func,
+                                const CompactionFileCloseFunc& close_file_func);
+  Status FinalizeBlobFiles(SubcompactionState* sub_compact,
+                           BlobFileBuilder* blob_file_builder, Status status);
+  void FinalizeSubcompaction(SubcompactionState* sub_compact, Status status,
+                             const CompactionFileOpenFunc& open_file_func,
+                             const CompactionFileCloseFunc& close_file_func,
+                             BlobFileBuilder* blob_file_builder,
+                             CompactionIterator* c_iter,
+                             InternalIterator* input_iter,
+                             uint64_t start_cpu_micros,
+                             uint64_t prev_cpu_micros,
+                             const CompactionIOStatsSnapshot& io_stats);
+
   // update the thread status for starting a compaction.
   void ReportStartedCompaction(Compaction* compaction);
 
diff --git a/db/compaction/compaction_service_job.cc b/db/compaction/compaction_service_job.cc
index 11ba31daf3b7..fc21cb127025 100644
--- a/db/compaction/compaction_service_job.cc
+++ b/db/compaction/compaction_service_job.cc
@@ -408,7 +408,7 @@ Status CompactionServiceCompactionJob::Run() {
   // 2. Update job-level output stats with the aggregated internal_stats_
   // Please note that input stats will be updated by primary host when all
   // subcompactions are finished
-  UpdateCompactionJobOutputStats(internal_stats_);
+  UpdateCompactionJobOutputStatsFromInternalStats(internal_stats_);
   // and set fields that are not propagated as part of the update
   compaction_result_->stats.is_manual_compaction = c->is_manual_compaction();
   compaction_result_->stats.is_full_compaction = c->is_full_compaction();

From 2950e992191f80c6898e5b7ca7716d1a96d8ad1e Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Thu, 28 Aug 2025 16:59:16 -0700
Subject: [PATCH 255/500] Require C++20 (#13904)

Summary:
I am wanting to use std::counting_semaphore for something and the timing seems good to require C++20 support. The internets suggest:

* GCC >= 10 is adequate, >= 11 preferred
* Clang >= 10 is needed
* Visual Studio >= 2019 is adquate

And popular linux distributions look like this:
* CentOS Stream 9 -> GCC 11.2  (CentOS 8 is EOL)
* Ubuntu 22.04 LTS -> GCC 11.x  (Ubuntu 20 just ended standard support)
* Debian 12 (oldstable) -> GCC 12.2
  * (Debian 11 has ended security updates, uses GCC 10.2)

This required generating a new docker image based on Ubuntu 22 for CI using gcc. The existing Ubuntu 20 image works for covering appropriate clang versions (though we should maybe add a much later version as well, in the next increment of our Ubuntu 22 image; however the minimum available clang build from apt.llvm.org for Ubuntu 22 is clang 13).

Update to SetDumpFilter is to quiet a mysterious gcc-13 warning-as-error.

Removed --compile-no-warning-as-error from a cmake command line because cmake in the new docker image is too old for this option.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13904

Test Plan: CI, one minor unit test added to verify std::counting_semaphor works

Reviewed By: xingbowang

Differential Revision: D81266435

Pulled By: pdillinger

fbshipit-source-id: 26040eeccca7004416e29a6ff4f6ea93f2052684
---
 .github/workflows/nightly.yml                 | 25 ++----
 .github/workflows/pr-jobs.yml                 | 70 ++++++----------
 CMakeLists.txt                                |  5 +-
 INSTALL.md                                    | 10 +--
 Makefile                                      |  6 +-
 build_tools/build_detect_platform             |  4 +-
 build_tools/ubuntu22_image/Dockerfile         | 80 +++++++++++++++++++
 db/db_impl/db_impl.cc                         |  2 -
 examples/Makefile                             | 18 ++---
 include/rocksdb/advanced_options.h            |  4 -
 include/rocksdb/compression_type.h            |  2 -
 include/rocksdb/slice.h                       |  2 +-
 include/rocksdb/universal_compaction.h        |  2 -
 include/rocksdb/utilities/cache_dump_load.h   |  2 +-
 options/cf_options.h                          |  2 -
 .../public_api_changes/cplusplus20.md         |  1 +
 util/slice_test.cc                            | 20 +++++
 util/string_util.cc                           | 18 +----
 utilities/cache_dump_load_impl.cc             |  2 +-
 utilities/cache_dump_load_impl.h              |  2 +-
 20 files changed, 158 insertions(+), 119 deletions(-)
 create mode 100644 build_tools/ubuntu22_image/Dockerfile
 create mode 100644 unreleased_history/public_api_changes/cplusplus20.md

diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml
index 6d3139e799fb..937e6683720d 100644
--- a/.github/workflows/nightly.yml
+++ b/.github/workflows/nightly.yml
@@ -10,7 +10,7 @@ jobs:
     runs-on:
       labels: 16-core-ubuntu
     container:
-      image: zjay437/rocksdb:0.6
+      image: ghcr.io/facebook/rocksdb_ubuntu:22.0
       options: --shm-size=16gb
     steps:
     - uses: actions/checkout@v4.1.0
@@ -32,7 +32,7 @@ jobs:
     runs-on:
       labels: 16-core-ubuntu
     container:
-      image: zjay437/rocksdb:0.6
+      image: ghcr.io/facebook/rocksdb_ubuntu:22.0
       options: --shm-size=16gb
     env:
       TEST_TMPDIR: "/tmp/rocksdb_test_tmp"
@@ -63,7 +63,7 @@ jobs:
     runs-on:
       labels: 16-core-ubuntu
     container:
-      image: zjay437/rocksdb:0.6
+      image: ghcr.io/facebook/rocksdb_ubuntu:22.0
       options: --shm-size=16gb
     steps:
     - uses: actions/checkout@v4.1.0
@@ -94,7 +94,7 @@ jobs:
     runs-on:
       labels: 4-core-ubuntu
     container:
-      image: zjay437/rocksdb:0.6
+      image: ghcr.io/facebook/rocksdb_ubuntu:22.0
       options: --shm-size=16gb
     steps:
     - uses: actions/checkout@v4.1.0
@@ -117,28 +117,13 @@ jobs:
     - name: Build fuzzers
       run: cd fuzz && make sst_file_writer_fuzzer db_fuzzer db_map_fuzzer
     - uses: "./.github/actions/post-steps"
-  build-linux-gcc-11-no_test_run:
-    if: ${{ github.repository_owner == 'facebook' }}
-    runs-on:
-      labels: 16-core-ubuntu
-    container:
-      image: zjay437/rocksdb:0.6
-      options: --shm-size=16gb
-    steps:
-    - uses: actions/checkout@v4.1.0
-    - uses: "./.github/actions/pre-steps"
-    - run: LIB_MODE=static CC=gcc-11 CXX=g++-11 V=1 make -j32 all microbench
-    - uses: "./.github/actions/post-steps"
   build-linux-cmake-with-folly-lite-no-test:
     if: ${{ github.repository_owner == 'facebook' }}
     runs-on:
       labels: 16-core-ubuntu
     container:
-      image: zjay437/rocksdb:0.6
+      image: ghcr.io/facebook/rocksdb_ubuntu:22.0
       options: --shm-size=16gb
-    env:
-      CC: gcc-10
-      CXX: g++-10
     steps:
     - uses: actions/checkout@v4.1.0
     - uses: "./.github/actions/pre-steps"
diff --git a/.github/workflows/pr-jobs.yml b/.github/workflows/pr-jobs.yml
index 8d423c240ce5..a828991ca9ce 100644
--- a/.github/workflows/pr-jobs.yml
+++ b/.github/workflows/pr-jobs.yml
@@ -66,7 +66,7 @@ jobs:
     runs-on:
       labels: 16-core-ubuntu
     container:
-      image: zjay437/rocksdb:0.6
+      image: ghcr.io/facebook/rocksdb_ubuntu:22.0
       options: --shm-size=16gb
     steps:
     - uses: actions/checkout@v4.1.0
@@ -78,7 +78,7 @@ jobs:
     runs-on:
       labels: 4-core-ubuntu
     container:
-      image: zjay437/rocksdb:0.6
+      image: ghcr.io/facebook/rocksdb_ubuntu:22.0
       options: --shm-size=16gb
     steps:
     - uses: actions/checkout@v4.1.0
@@ -97,11 +97,8 @@ jobs:
     runs-on:
       labels: 16-core-ubuntu
     container:
-      image: zjay437/rocksdb:0.6
+      image: ghcr.io/facebook/rocksdb_ubuntu:22.0
       options: --shm-size=16gb
-    env:
-      CC: gcc-10
-      CXX: g++-10
     steps:
     - uses: actions/checkout@v4.1.0
     - uses: "./.github/actions/pre-steps"
@@ -114,11 +111,8 @@ jobs:
     runs-on:
       labels: 16-core-ubuntu
     container:
-      image: zjay437/rocksdb:0.6
+      image: ghcr.io/facebook/rocksdb_ubuntu:22.0
       options: --shm-size=16gb
-    env:
-      CC: gcc-10
-      CXX: g++-10
     steps:
     - uses: actions/checkout@v4.1.0
     - uses: "./.github/actions/pre-steps"
@@ -131,11 +125,8 @@ jobs:
     runs-on:
       labels: 16-core-ubuntu
     container:
-      image: zjay437/rocksdb:0.6
+      image: ghcr.io/facebook/rocksdb_ubuntu:22.0
       options: --shm-size=16gb
-    env:
-      CC: gcc-10
-      CXX: g++-10
     steps:
     - uses: actions/checkout@v4.1.0
     - uses: "./.github/actions/pre-steps"
@@ -147,11 +138,8 @@ jobs:
     runs-on:
       labels: 16-core-ubuntu
     container:
-      image: zjay437/rocksdb:0.6
+      image: ghcr.io/facebook/rocksdb_ubuntu:22.0
       options: --shm-size=16gb
-    env:
-      CC: gcc-10
-      CXX: g++-10
     steps:
     - uses: actions/checkout@v4.1.0
     - uses: "./.github/actions/pre-steps"
@@ -164,7 +152,7 @@ jobs:
     runs-on:
       labels: 16-core-ubuntu
     container:
-      image: zjay437/rocksdb:0.6
+      image: ghcr.io/facebook/rocksdb_ubuntu:22.0
       options: --shm-size=16gb
     steps:
     - uses: actions/checkout@v4.1.0
@@ -176,7 +164,7 @@ jobs:
     runs-on:
       labels: 16-core-ubuntu
     container:
-      image: zjay437/rocksdb:0.6
+      image: ghcr.io/facebook/rocksdb_ubuntu:22.0
       options: --shm-size=16gb
     steps:
     - uses: actions/checkout@v4.1.0
@@ -190,7 +178,7 @@ jobs:
     runs-on:
       labels: 16-core-ubuntu
     container:
-      image: zjay437/rocksdb:0.6
+      image: ghcr.io/facebook/rocksdb_ubuntu:22.0
       options: --shm-size=16gb
     steps:
     - uses: actions/checkout@v4.1.0
@@ -236,29 +224,17 @@ jobs:
     - run: make clean
     - run: CC=clang-13 CXX=clang++-13 USE_CLANG=1 DEBUG_LEVEL=0 make -j32 release
     - uses: "./.github/actions/post-steps"
-  build-linux-gcc-8-no_test_run:
+  build-linux-gcc-13-no_test_run:
     if: ${{ github.repository_owner == 'facebook' }}
     runs-on:
       labels: 16-core-ubuntu
     container:
-      image: zjay437/rocksdb:0.6
+      image: ghcr.io/facebook/rocksdb_ubuntu:22.0
       options: --shm-size=16gb
     steps:
     - uses: actions/checkout@v4.1.0
     - uses: "./.github/actions/pre-steps"
-    - run: CC=gcc-8 CXX=g++-8 V=1 make -j32 all
-    - uses: "./.github/actions/post-steps"
-  build-linux-gcc-10-cxx20-no_test_run:
-    if: ${{ github.repository_owner == 'facebook' }}
-    runs-on:
-      labels: 16-core-ubuntu
-    container:
-      image: zjay437/rocksdb:0.6
-      options: --shm-size=16gb
-    steps:
-    - uses: actions/checkout@v4.1.0
-    - uses: "./.github/actions/pre-steps"
-    - run: CC=gcc-10 CXX=g++-10 V=1 ROCKSDB_CXX_STANDARD=c++20 make -j32 all
+    - run: CC=gcc-13 CXX=g++-13 V=1 make -j32 all microbench
     - uses: "./.github/actions/post-steps"
 
   # ======================== Linux Other Checks ======================= #
@@ -300,7 +276,7 @@ jobs:
     runs-on:
       labels: 4-core-ubuntu
     container:
-      image: zjay437/rocksdb:0.6
+      image: ghcr.io/facebook/rocksdb_ubuntu:22.0
       options: --shm-size=16gb
     steps:
     - uses: actions/checkout@v4.1.0
@@ -337,7 +313,7 @@ jobs:
     runs-on:
       labels: 16-core-ubuntu
     container:
-      image: zjay437/rocksdb:0.6
+      image: ghcr.io/facebook/rocksdb_ubuntu:22.0
       options: --shm-size=16gb
     steps:
     - uses: actions/checkout@v4.1.0
@@ -404,11 +380,13 @@ jobs:
     runs-on:
       labels: 4-core-ubuntu
     container:
-      image: evolvedbinary/rocksjava:centos7_x64-be
+      image: ghcr.io/facebook/rocksdb_ubuntu:22.0
       options: --shm-size=16gb
     steps:
     # The docker image is intentionally based on an OS that has an older GLIBC version.
     # That GLIBC is incompatibile with GitHub's actions/checkout. Thus we implement a manual checkout step.
+    # NOTE: replaced evolvedbinary/rocksjava:centos7_x64-be with ghcr.io/facebook/rocksdb_ubuntu:22.0
+    # until a more appropriate docker image with C++20 support is made.
     - name: Checkout
       env:
         GH_TOKEN: ${{ github.token }}
@@ -425,18 +403,21 @@ jobs:
         which java && java -version
         which javac && javac -version
     - name: Test RocksDBJava
-      run: scl enable devtoolset-7 'make V=1 J=8 -j8 jtest'
-    # NOTE: post-steps skipped because of compatibility issues with docker image
+    # NOTE: replaced scl enable devtoolset-7 'make V=1 J=8 -j8 jtest'
+      run: make V=1 J=8 -j8 jtest
+    # post-steps skipped because of compatibility issues with docker image
   build-linux-java-static:
     if: ${{ github.repository_owner == 'facebook' }}
     runs-on:
       labels: 4-core-ubuntu
     container:
-      image: evolvedbinary/rocksjava:centos7_x64-be
+      image: ghcr.io/facebook/rocksdb_ubuntu:22.0
       options: --shm-size=16gb
     steps:
     # The docker image is intentionally based on an OS that has an older GLIBC version.
     # That GLIBC is incompatibile with GitHub's actions/checkout. Thus we implement a manual checkout step.
+    # NOTE: replaced evolvedbinary/rocksjava:centos7_x64-be with ghcr.io/facebook/rocksdb_ubuntu:22.0
+    # until a more appropriate docker image with C++20 support is made.
     - name: Checkout
       env:
         GH_TOKEN: ${{ github.token }}
@@ -453,8 +434,9 @@ jobs:
         which java && java -version
         which javac && javac -version
     - name: Build RocksDBJava Static Library
-      run: scl enable devtoolset-7 'make V=1 J=8 -j8 rocksdbjavastatic'
-    # NOTE: post-steps skipped because of compatibility issues with docker image
+    # NOTE: replaced scl enable devtoolset-7 'make V=1 J=8 -j8 rocksdbjavastatic'
+      run: make V=1 J=8 -j8 rocksdbjavastatic
+    # post-steps skipped because of compatibility issues with docker image
   build-macos-java:
     if: ${{ github.repository_owner == 'facebook' }}
     runs-on: macos-13
diff --git a/CMakeLists.txt b/CMakeLists.txt
index ef93aa20d6dd..19a66a7b7791 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -27,7 +27,7 @@
 #
 # Linux:
 #
-# 1. Install a recent toolchain if you're on a older distro. C++17 required (GCC >= 7, Clang >= 5)
+# 1. Install a recent toolchain if you're on a older distro. C++20 required (GCC >= 11, Clang >= 10)
 # 2. mkdir build; cd build
 # 3. cmake ..
 # 4. make -j
@@ -100,7 +100,7 @@ endif()
 option(ROCKSDB_BUILD_SHARED "Build shared versions of the RocksDB libraries" ON)
 
 if( NOT DEFINED CMAKE_CXX_STANDARD )
-  set(CMAKE_CXX_STANDARD 17)
+  set(CMAKE_CXX_STANDARD 20)
 endif()
 
 include(CMakeDependentOption)
@@ -314,7 +314,6 @@ endif()
 
 # Check if -latomic is required or not
 if (NOT MSVC)
-  set(CMAKE_REQUIRED_FLAGS "--std=c++17")
   CHECK_CXX_SOURCE_COMPILES("
 #include <atomic>
 std::atomic<uint64_t> x(0);
diff --git a/INSTALL.md b/INSTALL.md
index 5bc5bd7b297e..1e739d485d02 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -6,7 +6,7 @@ than release mode.
 
 RocksDB's library should be able to compile without any dependency installed,
 although we recommend installing some compression libraries (see below).
-We do depend on newer gcc/clang with C++17 support (GCC >= 7, Clang >= 5).
+We do depend on newer gcc/clang with C++20 support (GCC >= 11, Clang >= 10).
 
 There are few options when compiling RocksDB:
 
@@ -60,7 +60,7 @@ most processors made since roughly 2013.
 ## Supported platforms
 
 * **Linux - Ubuntu**
-    * Upgrade your gcc to version at least 7 to get C++17 support.
+    * Upgrade your gcc to version at least 11 to get C++20 support.
     * Install gflags. First, try: `sudo apt-get install libgflags-dev`
       If this doesn't work and you're using Ubuntu, here's a nice tutorial:
       (http://askubuntu.com/questions/312173/installing-gflags-12-04)
@@ -72,7 +72,7 @@ most processors made since roughly 2013.
     * Install zstandard: `sudo apt-get install libzstd-dev`.
 
 * **Linux - CentOS / RHEL**
-    * Upgrade your gcc to version at least 7 to get C++17 support
+    * Upgrade your gcc to version at least 11 to get C++20 support
     * Install gflags:
 
               git clone https://github.com/gflags/gflags.git
@@ -122,7 +122,7 @@ most processors made since roughly 2013.
               make && sudo make install
 
 * **OS X**:
-    * Install latest C++ compiler that supports C++ 17:
+    * Install latest C++ compiler that supports C++20:
         * Update XCode:  run `xcode-select --install` (or install it from XCode App's settting).
         * Install via [homebrew](http://brew.sh/).
             * If you're first time developer in MacOS, you still need to run: `xcode-select --install` in your command line.
@@ -213,7 +213,7 @@ most processors made since roughly 2013.
              export PATH=/opt/freeware/bin:$PATH
 
 * **Solaris Sparc**
-    * Install GCC 7 and higher.
+    * Install GCC 11 and higher.
     * Use these environment variables:
 
              export CC=gcc
diff --git a/Makefile b/Makefile
index a766426b05ef..e0ec77115326 100644
--- a/Makefile
+++ b/Makefile
@@ -148,10 +148,8 @@ ifeq ($(USE_COROUTINES), 1)
 	USE_FOLLY = 1
 	# glog/logging.h requires HAVE_CXX11_ATOMIC
 	OPT += -DUSE_COROUTINES -DHAVE_CXX11_ATOMIC
-	ROCKSDB_CXX_STANDARD = c++2a
 	USE_RTTI = 1
 ifneq ($(USE_CLANG), 1)
-	ROCKSDB_CXX_STANDARD = c++20
 	PLATFORM_CXXFLAGS += -fcoroutines
 endif
 endif
@@ -683,7 +681,7 @@ am__v_CCH_1 =
 # user build settings
 %.h.pub: %.h # .h.pub not actually created, so re-checked on each invocation
 	$(AM_V_CCH) cd include/ && echo '#include "$(patsubst include/%,%,$<)"' | \
-	  $(CXX) -I. -DROCKSDB_NAMESPACE=42 -x c++ -c - -o /dev/null
+	  $(CXX) -std=$(or $(ROCKSDB_CXX_STANDARD),c++20) -I. -DROCKSDB_NAMESPACE=42 -x c++ -c - -o /dev/null
 
 check-headers: $(HEADER_OK_FILES)
 
@@ -2251,7 +2249,7 @@ libsnappy.a: snappy-$(SNAPPY_VER).tar.gz
 	-rm -rf snappy-$(SNAPPY_VER)
 	tar xvzf snappy-$(SNAPPY_VER).tar.gz
 	mkdir snappy-$(SNAPPY_VER)/build
-	cd snappy-$(SNAPPY_VER)/build && CFLAGS='$(ARCHFLAG) ${JAVA_STATIC_DEPS_CCFLAGS} ${EXTRA_CFLAGS}' CXXFLAGS='$(ARCHFLAG) ${JAVA_STATIC_DEPS_CXXFLAGS} ${EXTRA_CXXFLAGS}' LDFLAGS='${JAVA_STATIC_DEPS_LDFLAGS} ${EXTRA_LDFLAGS}' cmake -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DSNAPPY_BUILD_BENCHMARKS=OFF -DSNAPPY_BUILD_TESTS=OFF --compile-no-warning-as-error ${PLATFORM_CMAKE_FLAGS} .. && $(MAKE) ${SNAPPY_MAKE_TARGET}
+	cd snappy-$(SNAPPY_VER)/build && CFLAGS='$(ARCHFLAG) ${JAVA_STATIC_DEPS_CCFLAGS} ${EXTRA_CFLAGS}' CXXFLAGS='$(ARCHFLAG) ${JAVA_STATIC_DEPS_CXXFLAGS} ${EXTRA_CXXFLAGS}' LDFLAGS='${JAVA_STATIC_DEPS_LDFLAGS} ${EXTRA_LDFLAGS}' cmake -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DSNAPPY_BUILD_BENCHMARKS=OFF -DSNAPPY_BUILD_TESTS=OFF ${PLATFORM_CMAKE_FLAGS} .. && $(MAKE) ${SNAPPY_MAKE_TARGET}
 	cp snappy-$(SNAPPY_VER)/build/libsnappy.a .
 
 lz4-$(LZ4_VER).tar.gz:
diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform
index f0d4bb004cb4..93e0c0fa76b9 100755
--- a/build_tools/build_detect_platform
+++ b/build_tools/build_detect_platform
@@ -45,11 +45,11 @@ if test -z "$OUTPUT"; then
   exit 1
 fi
 
-# we depend on C++17, but should be compatible with newer standards
+# we depend on C++20, but should be compatible with newer standards
 if [ "$ROCKSDB_CXX_STANDARD" ]; then
   PLATFORM_CXXFLAGS="-std=$ROCKSDB_CXX_STANDARD"
 else
-  PLATFORM_CXXFLAGS="-std=c++17"
+  PLATFORM_CXXFLAGS="-std=c++20"
 fi
 
 # we currently depend on POSIX platform
diff --git a/build_tools/ubuntu22_image/Dockerfile b/build_tools/ubuntu22_image/Dockerfile
new file mode 100644
index 000000000000..353b0651fabd
--- /dev/null
+++ b/build_tools/ubuntu22_image/Dockerfile
@@ -0,0 +1,80 @@
+# INSTRUCTIONS:
+# I was not about to build docker images on an isolated devserver because of
+# issues with proxy internet access. Use a public cloud or other Linux system.
+# (I used a Debian system after installing docker features, adding my user to
+# the docker and docker-registry groups, and logging out and back in to pick
+# those up.)
+#
+# Follow https://docs.github.com/en/packages/working-with-a-github-packages-registry/working-with-the-container-registry#authenticating-with-a-personal-access-token-classic
+# to login with your GitHub credentials, as in
+#
+# $ docker login ghcr.io -u pdillinger
+#
+# and paste the limited-purpose GitHub token into the terminal.
+#
+# Then in the build_tools/ubuntu22_image directory, (bump minor version for
+# random docker file updates, major version tracks Ubuntu release)
+#
+# $ docker build -t ghcr.io/facebook/rocksdb_ubuntu:22.0
+# $ docker push ghcr.io/facebook/rocksdb_ubuntu:22.0
+#
+# Might need to change visibility to public through
+# https://github.com/orgs/facebook/packages/container/rocksdb_ubuntu/settings
+# or similar.
+
+# from official ubuntu 22.04
+FROM ubuntu:22.04
+# update system
+RUN apt-get update
+RUN apt-get upgrade -y
+# install basic tools
+RUN apt-get install -y vim wget curl
+# install tzdata noninteractive
+RUN DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt-get -y install tzdata
+# install git and default compilers
+RUN apt-get install -y git gcc g++ clang clang-tools
+# install basic package
+RUN apt-get install -y lsb-release software-properties-common gnupg
+# install gflags, tbb
+RUN apt-get install -y libgflags-dev libtbb-dev
+# install compression libs
+RUN apt-get install -y libsnappy-dev zlib1g-dev libbz2-dev liblz4-dev libzstd-dev
+# install cmake
+RUN apt-get install -y cmake
+RUN apt-get install -y libssl-dev
+# install clang-13
+WORKDIR /root
+RUN wget https://apt.llvm.org/llvm.sh
+RUN chmod +x llvm.sh
+RUN ./llvm.sh 13 all
+# install gcc-10 and more, default is 11
+RUN apt-get install -y gcc-10 g++-10
+RUN add-apt-repository -y ppa:ubuntu-toolchain-r/test
+RUN apt-get install -y gcc-13 g++-13
+# install apt-get install -y valgrind
+RUN apt-get install -y valgrind
+# install folly depencencies
+RUN apt-get install -y libunwind-dev libgoogle-glog-dev
+# install openjdk 8
+RUN apt-get install -y openjdk-8-jdk
+ENV JAVA_HOME /usr/lib/jvm/java-1.8.0-openjdk-amd64
+# install mingw
+RUN apt-get install -y mingw-w64
+
+# install gtest-parallel package
+RUN git clone --single-branch --branch master --depth 1 https://github.com/google/gtest-parallel.git ~/gtest-parallel
+ENV PATH $PATH:/root/gtest-parallel
+
+# install libprotobuf for fuzzers test
+RUN apt-get install -y ninja-build binutils liblzma-dev libz-dev pkg-config autoconf libtool
+RUN git clone --branch v1.0 https://github.com/google/libprotobuf-mutator.git ~/libprotobuf-mutator && cd ~/libprotobuf-mutator && git checkout ffd86a32874e5c08a143019aad1aaf0907294c9f && mkdir build && cd build && cmake .. -GNinja -DCMAKE_C_COMPILER=clang-13 -DCMAKE_CXX_COMPILER=clang++-13 -DCMAKE_BUILD_TYPE=Release -DLIB_PROTO_MUTATOR_DOWNLOAD_PROTOBUF=ON && ninja && ninja install
+ENV PKG_CONFIG_PATH /usr/local/OFF/:/root/libprotobuf-mutator/build/external.protobuf/lib/pkgconfig/
+ENV PROTOC_BIN /root/libprotobuf-mutator/build/external.protobuf/bin/protoc
+
+# install the latest google benchmark
+RUN git clone --depth 1 --branch v1.7.0 https://github.com/google/benchmark.git ~/benchmark
+RUN cd ~/benchmark && mkdir build && cd build && cmake .. -GNinja -DCMAKE_BUILD_TYPE=Release -DBENCHMARK_ENABLE_GTEST_TESTS=0 && ninja && ninja install
+
+# clean up
+RUN rm -rf /var/lib/apt/lists/*
+RUN rm -rf /root/benchmark
diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc
index 3acde3a39c25..3bf0def15359 100644
--- a/db/db_impl/db_impl.cc
+++ b/db/db_impl/db_impl.cc
@@ -1245,13 +1245,11 @@ Status DBImpl::SetOptions(
           WriteOptionsFile(write_options, true /*db_mutex_already_held*/);
       bg_cv_.SignalAll();
 
-#if __cplusplus >= 202002L
       assert(new_options_copy == cfd->GetLatestMutableCFOptions());
       assert(cfd->GetLatestMutableCFOptions() ==
              cfd->GetCurrentMutableCFOptions());
       assert(cfd->GetCurrentMutableCFOptions() ==
              cfd->current()->GetMutableCFOptions());
-#endif
     }
   }
   sv_context.Clean();
diff --git a/examples/Makefile b/examples/Makefile
index b056508a6c3f..0970cfd4002d 100644
--- a/examples/Makefile
+++ b/examples/Makefile
@@ -19,16 +19,16 @@ CFLAGS += -Wstrict-prototypes
 all: simple_example column_families_example compact_files_example c_simple_example optimistic_transaction_example transaction_example compaction_filter_example options_file_example rocksdb_backup_restore_example
 
 simple_example: librocksdb simple_example.cc
-	$(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++17 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)
+	$(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++20 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)
 
 column_families_example: librocksdb column_families_example.cc
-	$(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++17 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)
+	$(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++20 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)
 
 compaction_filter_example: librocksdb compaction_filter_example.cc
-	$(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++17 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)
+	$(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++20 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)
 
 compact_files_example: librocksdb compact_files_example.cc
-	$(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++17 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)
+	$(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++20 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)
 
 .c.o:
 	$(CC) $(CFLAGS) -c $< -o $@ -I../include
@@ -37,19 +37,19 @@ c_simple_example: librocksdb c_simple_example.o
 	$(CXX) $@.o -o$@ ../librocksdb.a $(PLATFORM_LDFLAGS) $(EXEC_LDFLAGS)
 
 optimistic_transaction_example: librocksdb optimistic_transaction_example.cc
-	$(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++17 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)
+	$(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++20 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)
 
 transaction_example: librocksdb transaction_example.cc
-	$(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++17 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)
+	$(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++20 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)
 
 options_file_example: librocksdb options_file_example.cc
-	$(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++17 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)
+	$(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++20 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)
 
 multi_processes_example: librocksdb multi_processes_example.cc
-	$(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++17 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)
+	$(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++20 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)
 
 rocksdb_backup_restore_example: librocksdb rocksdb_backup_restore_example.cc
-	$(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++17 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)
+	$(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++20 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)
 
 clean:
 	rm -rf ./simple_example ./column_families_example ./compact_files_example ./compaction_filter_example ./c_simple_example c_simple_example.o ./optimistic_transaction_example ./transaction_example ./options_file_example ./multi_processes_example ./rocksdb_backup_restore_example
diff --git a/include/rocksdb/advanced_options.h b/include/rocksdb/advanced_options.h
index f78bb0c2c129..90767a06ecd5 100644
--- a/include/rocksdb/advanced_options.h
+++ b/include/rocksdb/advanced_options.h
@@ -64,9 +64,7 @@ enum CompactionPri : char {
 struct FileTemperatureAge {
   Temperature temperature = Temperature::kUnknown;
   uint64_t age = 0;
-#if __cplusplus >= 202002L
   bool operator==(const FileTemperatureAge& rhs) const = default;
-#endif
 };
 
 struct CompactionOptionsFIFO {
@@ -135,9 +133,7 @@ struct CompactionOptionsFIFO {
       : max_table_files_size(_max_table_files_size),
         allow_compaction(_allow_compaction) {}
 
-#if __cplusplus >= 202002L
   bool operator==(const CompactionOptionsFIFO& rhs) const = default;
-#endif
 };
 
 // The control option of how the cache tiers will be used. Currently rocksdb
diff --git a/include/rocksdb/compression_type.h b/include/rocksdb/compression_type.h
index a05aa3307874..2261a44439b9 100644
--- a/include/rocksdb/compression_type.h
+++ b/include/rocksdb/compression_type.h
@@ -313,9 +313,7 @@ struct CompressionOptions {
     max_compressed_bytes_per_kb = static_cast<int>(1024.0 / min_ratio + 0.5);
   }
 
-#if __cplusplus >= 202002L
   bool operator==(const CompressionOptions& rhs) const = default;
-#endif
 };
 
 // See advanced_compression.h
diff --git a/include/rocksdb/slice.h b/include/rocksdb/slice.h
index c914b1637b50..dde34d709d65 100644
--- a/include/rocksdb/slice.h
+++ b/include/rocksdb/slice.h
@@ -24,7 +24,7 @@
 #include <cstdio>
 #include <cstring>
 #include <string>
-#include <string_view>  // RocksDB now requires C++17 support
+#include <string_view>
 
 #include "rocksdb/cleanable.h"
 
diff --git a/include/rocksdb/universal_compaction.h b/include/rocksdb/universal_compaction.h
index 6d2579baae2f..d94e9653aa61 100644
--- a/include/rocksdb/universal_compaction.h
+++ b/include/rocksdb/universal_compaction.h
@@ -144,9 +144,7 @@ class CompactionOptionsUniversal {
         incremental(false),
         reduce_file_locking(false) {}
 
-#if __cplusplus >= 202002L
   bool operator==(const CompactionOptionsUniversal& rhs) const = default;
-#endif
 };
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/include/rocksdb/utilities/cache_dump_load.h b/include/rocksdb/utilities/cache_dump_load.h
index 8f41839cd9de..ca2ce5ae11aa 100644
--- a/include/rocksdb/utilities/cache_dump_load.h
+++ b/include/rocksdb/utilities/cache_dump_load.h
@@ -90,7 +90,7 @@ class CacheDumper {
  public:
   virtual ~CacheDumper() = default;
   // Only dump the blocks in the block cache that belong to the DBs in this list
-  virtual Status SetDumpFilter(std::vector<DB*> db_list) {
+  virtual Status SetDumpFilter(const std::vector<DB*>& db_list) {
     (void)db_list;
     return Status::NotSupported("SetDumpFilter is not supported");
   }
diff --git a/options/cf_options.h b/options/cf_options.h
index c481c0587dcf..6ac660854f28 100644
--- a/options/cf_options.h
+++ b/options/cf_options.h
@@ -257,9 +257,7 @@ struct MutableCFOptions {
 
   void Dump(Logger* log) const;
 
-#if __cplusplus >= 202002L
   bool operator==(const MutableCFOptions& rhs) const = default;
-#endif
 
   // Memtable related options
   size_t write_buffer_size;
diff --git a/unreleased_history/public_api_changes/cplusplus20.md b/unreleased_history/public_api_changes/cplusplus20.md
new file mode 100644
index 000000000000..e2c7311fdfd9
--- /dev/null
+++ b/unreleased_history/public_api_changes/cplusplus20.md
@@ -0,0 +1 @@
+* RocksDB now requires a C++20 compatible compiler (GCC >= 11, Clang >= 10, Visual Studio >= 2019), including for any code using RocksDB headers.
diff --git a/util/slice_test.cc b/util/slice_test.cc
index bc8925299a56..c1a0c806b847 100644
--- a/util/slice_test.cc
+++ b/util/slice_test.cc
@@ -7,6 +7,8 @@
 
 #include <gtest/gtest.h>
 
+#include <semaphore>
+
 #include "port/port.h"
 #include "port/stack_trace.h"
 #include "rocksdb/data_structure.h"
@@ -422,6 +424,24 @@ TEST(ToBaseCharsStringTest, Tests) {
   ASSERT_EQ(ToBaseCharsString<32>(2, 255, false), "7v");
 }
 
+TEST(SemaphoreTest, BasicStdCountingSemaphore) {
+  // Verify the C++20 API is available and apparently working
+  std::counting_semaphore sem{0};
+  int kCount = 5;
+  std::vector<std::thread> threads;
+  for (int i = 0; i < kCount; ++i) {
+    threads.emplace_back([&sem] { sem.release(); });
+  }
+  for (int i = 0; i < kCount; ++i) {
+    threads.emplace_back([&sem] { sem.acquire(); });
+  }
+  for (auto& t : threads) {
+    t.join();
+  }
+  // Nothing left on the semaphore
+  ASSERT_FALSE(sem.try_acquire());
+}
+
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/util/string_util.cc b/util/string_util.cc
index 2a45c3a0ee8f..0dc3e7158e9f 100644
--- a/util/string_util.cc
+++ b/util/string_util.cc
@@ -20,20 +20,6 @@
 #include "port/sys_time.h"
 #include "rocksdb/slice.h"
 
-#ifndef __has_cpp_attribute
-#define ROCKSDB_HAS_CPP_ATTRIBUTE(x) 0
-#else
-#define ROCKSDB_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x)
-#endif
-
-#if ROCKSDB_HAS_CPP_ATTRIBUTE(maybe_unused) && __cplusplus >= 201703L
-#define ROCKSDB_MAYBE_UNUSED [[maybe_unused]]
-#elif ROCKSDB_HAS_CPP_ATTRIBUTE(gnu::unused) || __GNUC__
-#define ROCKSDB_MAYBE_UNUSED [[gnu::unused]]
-#else
-#define ROCKSDB_MAYBE_UNUSED
-#endif
-
 namespace ROCKSDB_NAMESPACE {
 
 const std::string kNullptrString = "nullptr";
@@ -501,7 +487,7 @@ bool TryParseTimeRangeString(const std::string& value, int& start_time,
 // selects proper function.
 
 #if !(defined(_WIN32) && (defined(__MINGW32__) || defined(_MSC_VER)))
-ROCKSDB_MAYBE_UNUSED
+[[maybe_unused]]
 static std::string invoke_strerror_r(int (*strerror_r)(int, char*, size_t),
                                      int err, char* buf, size_t buflen) {
   // Using XSI-compatible strerror_r
@@ -515,7 +501,7 @@ static std::string invoke_strerror_r(int (*strerror_r)(int, char*, size_t),
   return buf;
 }
 
-ROCKSDB_MAYBE_UNUSED
+[[maybe_unused]]
 static std::string invoke_strerror_r(char* (*strerror_r)(int, char*, size_t),
                                      int err, char* buf, size_t buflen) {
   // Using GNU strerror_r
diff --git a/utilities/cache_dump_load_impl.cc b/utilities/cache_dump_load_impl.cc
index 042ed534112c..40552ce12066 100644
--- a/utilities/cache_dump_load_impl.cc
+++ b/utilities/cache_dump_load_impl.cc
@@ -24,7 +24,7 @@ namespace ROCKSDB_NAMESPACE {
 // DBs and we may only want to dump out the blocks belonging to certain DB(s).
 // Therefore, a filter is need to decide if the key of the block satisfy the
 // requirement.
-Status CacheDumperImpl::SetDumpFilter(std::vector<DB*> db_list) {
+Status CacheDumperImpl::SetDumpFilter(const std::vector<DB*>& db_list) {
   Status s = Status::OK();
   dump_all_keys_ = false;
   for (size_t i = 0; i < db_list.size(); i++) {
diff --git a/utilities/cache_dump_load_impl.h b/utilities/cache_dump_load_impl.h
index ee892f47488e..b9b62df2a4b3 100644
--- a/utilities/cache_dump_load_impl.h
+++ b/utilities/cache_dump_load_impl.h
@@ -100,7 +100,7 @@ class CacheDumperImpl : public CacheDumper {
     dumped_size_bytes_ = 0;
   }
   ~CacheDumperImpl() { writer_.reset(); }
-  Status SetDumpFilter(std::vector<DB*> db_list) override;
+  Status SetDumpFilter(const std::vector<DB*>& db_list) override;
   IOStatus DumpCacheEntriesToWriter() override;
 
  private:

From ac4d563dd1e45435af9a5983f68af8d4da433baf Mon Sep 17 00:00:00 2001
From: Xingbo Wang <xbw@fb.com>
Date: Thu, 28 Aug 2025 23:04:13 -0700
Subject: [PATCH 256/500] Add random seed to db_crashtest.py to make reproduce
 test easier. (#13906)

Summary:
Add a new argument --random_seed to script db_crashtest.py to allow reusing the same random seed to produce exactly same test argument. When the argument is missing, a random seed is used, and printed. When developer wants to reproduce the exactly same setup, they could use the same seed with --random_seed for reproduction. The example below shows running the command without and with the argument. All of the arguments are same, except --db and --expected_values_dir, which does not use python random.

* Without --random_seed, a new seed is generated and printed.
```
[xbw@devvm16622.vll0 ~/workspace/ws1/rocksdb (crashtest)]$ /usr/local/bin/python3 -u tools/db_crashtest.py --stress_cmd=./db_stress --cleanup_cmd='' --cf_consistency blackbox --duration=960 --max_key=2500000
Start with random seed 17953760416546706382
Running blackbox-crash-test with
interval_between_crash=120
total-duration=960

Running db_stress with pid=2957716: ./db_stress --WAL_size_limit_MB=0 --WAL_ttl_seconds=60 --acquire_snapshot_one_in=10000 --adaptive_readahead=0 --adm_policy=0 --advise_random_on_open=1 --allow_data_in_errors=True --allow_fallocate=0 --allow_setting_blob_options_dynamically=1 --allow_unprepared_value=0 --async_io=1 --atomic_flush=1 --auto_readahead_size=0 --auto_refresh_iterator_with_snapshot=1 --avoid_flush_during_recovery=0 --avoid_flush_during_shutdown=1 --avoid_unnecessary_blocking_io=0 --backup_max_size=104857600 --backup_one_in=1000 --batch_protection_bytes_per_key=0 --bgerror_resume_retry_interval=100 --blob_cache_size=2097152 --blob_compaction_readahead_size=4194304 --blob_compression_type=zstd --blob_file_size=1073741824 --blob_file_starting_level=0 --blob_garbage_collection_age_cutoff=0.5 --blob_garbage_collection_force_threshold=0.75 --block_align=1 --block_protection_bytes_per_key=0 --block_size=16384 --bloom_before_level=1 --bloom_bits=12 --bottommost_compression_type=none --bottommost_file_compaction_delay=3600 --bytes_per_sync=262144 --cache_index_and_filter_blocks=1 --cache_index_and_filter_blocks_with_high_priority=0 --cache_size=8388608 --cache_type=auto_hyper_clock_cache --charge_compression_dictionary_building_buffer=0 --charge_file_metadata=0 --charge_filter_construction=0 --charge_table_reader=0 --check_multiget_consistency=0 --check_multiget_entity_consistency=1 --checkpoint_one_in=10000 --checksum_type=kxxHash64 --clear_column_family_one_in=0 --compact_files_one_in=1000000 --compact_range_one_in=1000000 --compaction_pri=0 --compaction_readahead_size=1048576 --compaction_style=0 --compaction_ttl=100 --compress_format_version=1 --compressed_secondary_cache_ratio=0.0 --compressed_secondary_cache_size=0 --compression_checksum=0 --compression_manager=none --compression_max_dict_buffer_bytes=0 --compression_max_dict_bytes=0 --compression_parallel_threads=1 --compression_type=none --compression_use_zstd_dict_trainer=1 --compression_zstd_max_train_bytes=0 --continuous_verification_interval=0 --daily_offpeak_time_utc=23:30-03:15 --data_block_index_type=0 --db=/tmp/rocksdb_crashtest_blackboxqishhgdc --db_write_buffer_size=0 --decouple_partitioned_filters=1 --default_temperature=kWarm --default_write_temperature=kCold --delete_obsolete_files_period_micros=30000000 --delpercent=4 --delrangepercent=1 --destroy_db_initially=0 --detect_filter_construct_corruption=0 --disable_file_deletions_one_in=10000 --disable_manual_compaction_one_in=10000 --disable_wal=1 --dump_malloc_stats=1 --enable_blob_files=1 --enable_blob_garbage_collection=1 --enable_checksum_handoff=0 --enable_compaction_filter=0 --enable_custom_split_merge=1 --enable_do_not_compress_roles=1 --enable_index_compression=0 --enable_memtable_insert_with_hint_prefix_extractor=0 --enable_pipelined_write=0 --enable_sst_partitioner_factory=1 --enable_thread_tracking=0 --enable_write_thread_adaptive_yield=0 --error_recovery_with_no_fault_injection=0 --exclude_wal_from_write_fault_injection=0 --expected_values_dir=/tmp/rocksdb_crashtest_expected_udz8mw68 --fifo_allow_compaction=0 --file_checksum_impl=crc32c --file_temperature_age_thresholds= --fill_cache=0 --flush_one_in=1000 --format_version=4 --get_all_column_family_metadata_one_in=1000000 --get_current_wal_file_one_in=0 --get_live_files_apis_one_in=10000 --get_properties_of_all_tables_one_in=1000000 --get_property_one_in=1000000 --get_sorted_wal_files_one_in=0 --hard_pending_compaction_bytes_limit=2097152 --high_pri_pool_ratio=0 --index_block_restart_interval=1 --index_shortening=2 --index_type=0 --ingest_external_file_one_in=0 --ingest_wbwi_one_in=500 --initial_auto_readahead_size=16384 --inplace_update_support=0 --iterpercent=10 --key_len_percent_dist=1,30,69 --key_may_exist_one_in=100000 --last_level_temperature=kUnknown --level_compaction_dynamic_level_bytes=0 --lock_wal_one_in=0 --log_file_time_to_roll=0 --log_readahead_size=16777216 --long_running_snapshots=1 --low_pri_pool_ratio=0 --lowest_used_cache_tier=2 --manifest_preallocation_size=0 --manual_wal_flush_one_in=0 --mark_for_compaction_one_file_in=10 --max_auto_readahead_size=16384 --max_background_compactions=20 --max_bytes_for_level_base=10485760 --max_key=2500000 --max_key_len=3 --max_log_file_size=0 --max_manifest_file_size=1073741824 --max_sequential_skip_in_iterations=8 --max_total_wal_size=0 --max_write_batch_group_size_bytes=16777216 --max_write_buffer_number=3 --max_write_buffer_size_to_maintain=1048576 --memtable_avg_op_scan_flush_trigger=2 --memtable_insert_hint_per_batch=0 --memtable_max_range_deletions=100 --memtable_op_scan_flush_trigger=10 --memtable_prefix_bloom_size_ratio=0.5 --memtable_protection_bytes_per_key=8 --memtable_whole_key_filtering=0 --memtablerep=skip_list --metadata_charge_policy=1 --metadata_read_fault_one_in=32 --metadata_write_fault_one_in=128 --min_blob_size=16 --min_write_buffer_number_to_merge=2 --mmap_read=0 --mock_direct_io=False --nooverwritepercent=1 --num_bottom_pri_threads=1 --num_file_reads_for_auto_readahead=2 --open_files=-1 --open_metadata_read_fault_one_in=0 --open_metadata_write_fault_one_in=0 --open_read_fault_one_in=0 --open_write_fault_one_in=0 --ops_per_thread=100000000 --optimize_filters_for_hits=1 --optimize_filters_for_memory=0 --optimize_multiget_for_io=0 --paranoid_file_checks=1 --paranoid_memory_checks=0 --partition_filters=0 --partition_pinning=1 --pause_background_one_in=10000 --periodic_compaction_seconds=1000 --prefix_size=5 --prefixpercent=5 --prepopulate_blob_cache=1 --prepopulate_block_cache=1 --preserve_internal_time_seconds=36000 --progress_reports=0 --promote_l0_one_in=0 --read_amp_bytes_per_bit=32 --read_fault_one_in=1000 --readahead_size=0 --readpercent=45 --recycle_log_file_num=0 --remote_compaction_worker_threads=0 --reopen=0 --report_bg_io_stats=1 --reset_stats_one_in=1000000 --sample_for_compression=5 --secondary_cache_fault_one_in=0 --secondary_cache_uri=compressed_secondary_cache://capacity=8388608;enable_custom_split_merge=true --set_options_one_in=1000 --skip_stats_update_on_db_open=0 --snapshot_hold_ops=100000 --soft_pending_compaction_bytes_limit=68719476736 --sqfc_name=foo --sqfc_version=2 --sst_file_manager_bytes_per_sec=0 --sst_file_manager_bytes_per_truncate=0 --stats_dump_period_sec=10 --stats_history_buffer_size=1048576 --strict_bytes_per_sync=0 --subcompactions=2 --sync=0 --sync_fault_injection=0 --table_cache_numshardbits=0 --target_file_size_base=524288 --target_file_size_multiplier=2 --test_batches_snapshots=0 --test_cf_consistency=1 --test_ingest_standalone_range_deletion_one_in=0 --top_level_index_pinning=0 --track_and_verify_wals=0 --uncache_aggressiveness=211 --universal_max_read_amp=4 --universal_reduce_file_locking=0 --unpartitioned_pinning=0 --use_adaptive_mutex=1 --use_adaptive_mutex_lru=1 --use_attribute_group=1 --use_blob_cache=0 --use_delta_encoding=0 --use_direct_io_for_flush_and_compaction=1 --use_direct_reads=0 --use_full_merge_v1=0 --use_get_entity=0 --use_merge=0 --use_multi_cf_iterator=0 --use_multi_get_entity=0 --use_multiget=1 --use_multiscan=0 --use_put_entity_one_in=0 --use_shared_block_and_blob_cache=0 --use_sqfc_for_range_queries=1 --use_timed_put_one_in=0 --use_write_buffer_manager=0 --user_timestamp_size=0 --value_size_mult=32 --verification_only=0 --verify_checksum=1 --verify_checksum_one_in=1000 --verify_compression=0 --verify_db_one_in=10000 --verify_file_checksums_one_in=1000000 --verify_iterator_with_expected_state_one_in=0 --verify_sst_unique_id_in_manifest=1 --wal_bytes_per_sync=0 --wal_compression=none --write_buffer_size=1048576 --write_dbid_to_manifest=0 --write_fault_one_in=0 --write_identity_file=1 --writepercent=35
```

* With --random_seed, the seed specified in the argument is used.
```
[xbw@devvm16622.vll0 ~/workspace/ws1/rocksdb (crashtest)]$ /usr/local/bin/python3 -u tools/db_crashtest.py --stress_cmd=./db_stress --cleanup_cmd='' --cf_consistency blackbox --duration=960 --max_key=2500000 --random_seed=17953760416546706382
Start with random seed 17953760416546706382
Running blackbox-crash-test with
interval_between_crash=120
total-duration=960

Running db_stress with pid=2959006: ./db_stress --WAL_size_limit_MB=0 --WAL_ttl_seconds=60 --acquire_snapshot_one_in=10000 --adaptive_readahead=0 --adm_policy=0 --advise_random_on_open=1 --allow_data_in_errors=True --allow_fallocate=0 --allow_setting_blob_options_dynamically=1 --allow_unprepared_value=0 --async_io=1 --atomic_flush=1 --auto_readahead_size=0 --auto_refresh_iterator_with_snapshot=1 --avoid_flush_during_recovery=0 --avoid_flush_during_shutdown=1 --avoid_unnecessary_blocking_io=0 --backup_max_size=104857600 --backup_one_in=1000 --batch_protection_bytes_per_key=0 --bgerror_resume_retry_interval=100 --blob_cache_size=2097152 --blob_compaction_readahead_size=4194304 --blob_compression_type=zstd --blob_file_size=1073741824 --blob_file_starting_level=0 --blob_garbage_collection_age_cutoff=0.5 --blob_garbage_collection_force_threshold=0.75 --block_align=1 --block_protection_bytes_per_key=0 --block_size=16384 --bloom_before_level=1 --bloom_bits=12 --bottommost_compression_type=none --bottommost_file_compaction_delay=3600 --bytes_per_sync=262144 --cache_index_and_filter_blocks=1 --cache_index_and_filter_blocks_with_high_priority=0 --cache_size=8388608 --cache_type=auto_hyper_clock_cache --charge_compression_dictionary_building_buffer=0 --charge_file_metadata=0 --charge_filter_construction=0 --charge_table_reader=0 --check_multiget_consistency=0 --check_multiget_entity_consistency=1 --checkpoint_one_in=10000 --checksum_type=kxxHash64 --clear_column_family_one_in=0 --compact_files_one_in=1000000 --compact_range_one_in=1000000 --compaction_pri=0 --compaction_readahead_size=1048576 --compaction_style=0 --compaction_ttl=100 --compress_format_version=1 --compressed_secondary_cache_ratio=0.0 --compressed_secondary_cache_size=0 --compression_checksum=0 --compression_manager=none --compression_max_dict_buffer_bytes=0 --compression_max_dict_bytes=0 --compression_parallel_threads=1 --compression_type=none --compression_use_zstd_dict_trainer=1 --compression_zstd_max_train_bytes=0 --continuous_verification_interval=0 --daily_offpeak_time_utc=23:30-03:15 --data_block_index_type=0 --db=/tmp/rocksdb_crashtest_blackbox0kxvhzbm --db_write_buffer_size=0 --decouple_partitioned_filters=1 --default_temperature=kWarm --default_write_temperature=kCold --delete_obsolete_files_period_micros=30000000 --delpercent=4 --delrangepercent=1 --destroy_db_initially=0 --detect_filter_construct_corruption=0 --disable_file_deletions_one_in=10000 --disable_manual_compaction_one_in=10000 --disable_wal=1 --dump_malloc_stats=1 --enable_blob_files=1 --enable_blob_garbage_collection=1 --enable_checksum_handoff=0 --enable_compaction_filter=0 --enable_custom_split_merge=1 --enable_do_not_compress_roles=1 --enable_index_compression=0 --enable_memtable_insert_with_hint_prefix_extractor=0 --enable_pipelined_write=0 --enable_sst_partitioner_factory=1 --enable_thread_tracking=0 --enable_write_thread_adaptive_yield=0 --error_recovery_with_no_fault_injection=0 --exclude_wal_from_write_fault_injection=0 --expected_values_dir=/tmp/rocksdb_crashtest_expected_hhk9kcgo --fifo_allow_compaction=0 --file_checksum_impl=crc32c --file_temperature_age_thresholds= --fill_cache=0 --flush_one_in=1000 --format_version=4 --get_all_column_family_metadata_one_in=1000000 --get_current_wal_file_one_in=0 --get_live_files_apis_one_in=10000 --get_properties_of_all_tables_one_in=1000000 --get_property_one_in=1000000 --get_sorted_wal_files_one_in=0 --hard_pending_compaction_bytes_limit=2097152 --high_pri_pool_ratio=0 --index_block_restart_interval=1 --index_shortening=2 --index_type=0 --ingest_external_file_one_in=0 --ingest_wbwi_one_in=500 --initial_auto_readahead_size=16384 --inplace_update_support=0 --iterpercent=10 --key_len_percent_dist=1,30,69 --key_may_exist_one_in=100000 --last_level_temperature=kUnknown --level_compaction_dynamic_level_bytes=0 --lock_wal_one_in=0 --log_file_time_to_roll=0 --log_readahead_size=16777216 --long_running_snapshots=1 --low_pri_pool_ratio=0 --lowest_used_cache_tier=2 --manifest_preallocation_size=0 --manual_wal_flush_one_in=0 --mark_for_compaction_one_file_in=10 --max_auto_readahead_size=16384 --max_background_compactions=20 --max_bytes_for_level_base=10485760 --max_key=2500000 --max_key_len=3 --max_log_file_size=0 --max_manifest_file_size=1073741824 --max_sequential_skip_in_iterations=8 --max_total_wal_size=0 --max_write_batch_group_size_bytes=16777216 --max_write_buffer_number=3 --max_write_buffer_size_to_maintain=1048576 --memtable_avg_op_scan_flush_trigger=2 --memtable_insert_hint_per_batch=0 --memtable_max_range_deletions=100 --memtable_op_scan_flush_trigger=10 --memtable_prefix_bloom_size_ratio=0.5 --memtable_protection_bytes_per_key=8 --memtable_whole_key_filtering=0 --memtablerep=skip_list --metadata_charge_policy=1 --metadata_read_fault_one_in=32 --metadata_write_fault_one_in=128 --min_blob_size=16 --min_write_buffer_number_to_merge=2 --mmap_read=0 --mock_direct_io=False --nooverwritepercent=1 --num_bottom_pri_threads=1 --num_file_reads_for_auto_readahead=2 --open_files=-1 --open_metadata_read_fault_one_in=0 --open_metadata_write_fault_one_in=0 --open_read_fault_one_in=0 --open_write_fault_one_in=0 --ops_per_thread=100000000 --optimize_filters_for_hits=1 --optimize_filters_for_memory=0 --optimize_multiget_for_io=0 --paranoid_file_checks=1 --paranoid_memory_checks=0 --partition_filters=0 --partition_pinning=1 --pause_background_one_in=10000 --periodic_compaction_seconds=1000 --prefix_size=5 --prefixpercent=5 --prepopulate_blob_cache=1 --prepopulate_block_cache=1 --preserve_internal_time_seconds=36000 --progress_reports=0 --promote_l0_one_in=0 --read_amp_bytes_per_bit=32 --read_fault_one_in=1000 --readahead_size=0 --readpercent=45 --recycle_log_file_num=0 --remote_compaction_worker_threads=0 --reopen=0 --report_bg_io_stats=1 --reset_stats_one_in=1000000 --sample_for_compression=5 --secondary_cache_fault_one_in=0 --secondary_cache_uri=compressed_secondary_cache://capacity=8388608;enable_custom_split_merge=true --set_options_one_in=1000 --skip_stats_update_on_db_open=0 --snapshot_hold_ops=100000 --soft_pending_compaction_bytes_limit=68719476736 --sqfc_name=foo --sqfc_version=2 --sst_file_manager_bytes_per_sec=0 --sst_file_manager_bytes_per_truncate=0 --stats_dump_period_sec=10 --stats_history_buffer_size=1048576 --strict_bytes_per_sync=0 --subcompactions=2 --sync=0 --sync_fault_injection=0 --table_cache_numshardbits=0 --target_file_size_base=524288 --target_file_size_multiplier=2 --test_batches_snapshots=0 --test_cf_consistency=1 --test_ingest_standalone_range_deletion_one_in=0 --top_level_index_pinning=0 --track_and_verify_wals=0 --uncache_aggressiveness=211 --universal_max_read_amp=4 --universal_reduce_file_locking=0 --unpartitioned_pinning=0 --use_adaptive_mutex=1 --use_adaptive_mutex_lru=1 --use_attribute_group=1 --use_blob_cache=0 --use_delta_encoding=0 --use_direct_io_for_flush_and_compaction=1 --use_direct_reads=0 --use_full_merge_v1=0 --use_get_entity=0 --use_merge=0 --use_multi_cf_iterator=0 --use_multi_get_entity=0 --use_multiget=1 --use_multiscan=0 --use_put_entity_one_in=0 --use_shared_block_and_blob_cache=0 --use_sqfc_for_range_queries=1 --use_timed_put_one_in=0 --use_write_buffer_manager=0 --user_timestamp_size=0 --value_size_mult=32 --verification_only=0 --verify_checksum=1 --verify_checksum_one_in=1000 --verify_compression=0 --verify_db_one_in=10000 --verify_file_checksums_one_in=1000000 --verify_iterator_with_expected_state_one_in=0 --verify_sst_unique_id_in_manifest=1 --wal_bytes_per_sync=0 --wal_compression=none --write_buffer_size=1048576 --write_dbid_to_manifest=0 --write_fault_one_in=0 --write_identity_file=1 --writepercent=35
```

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13906

Test Plan: stress test

Reviewed By: hx235

Differential Revision: D81201034

Pulled By: xingbowang

fbshipit-source-id: 0bb4e0cbcdcf2de9b730492342dcfa18f07e93d6
---
 tools/db_crashtest.py | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index f307009b9399..42e6f860b385 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -11,6 +11,27 @@
 import tempfile
 import time
 
+
+def setup_random_seed_before_main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--random_seed",
+        default=0,
+        type=int,
+        help="Random seed used for reproduce the same test parameter set",
+    )
+    args, _ = parser.parse_known_args()
+    random_seed = (
+        random.randint(1, 2**64) if args.random_seed == 0 else args.random_seed
+    )
+    print(f"Start with random seed {random_seed}")
+    random.seed(random_seed)
+
+
+# Random seed has to be setup before the rest of the script, so that the random
+# value selected in the global variable uses the random seed specified
+setup_random_seed_before_main()
+
 # params overwrite priority:
 #   for default:
 #       default_params < {blackbox,whitebox}_default_params < args
@@ -1181,6 +1202,7 @@ def gen_cmd(params, unknown_params):
             not in {
                 "test_type",
                 "simple",
+                "random_seed",
                 "duration",
                 "interval",
                 "random_kill_odd",

From fc8bc60f2dd5b81afb05f58ac04d59a1d79a3816 Mon Sep 17 00:00:00 2001
From: Hui Xiao <huixiao@fb.com>
Date: Tue, 2 Sep 2025 12:37:16 -0700
Subject: [PATCH 257/500] Avoid overwriting non-okay status due to shutdown or
 manual compaction pause (#13891)

Summary:
**Context/Summary:**
A small change as titled.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13891

Test Plan: - Existing UT and rehearsal stress test

Reviewed By: jaykorean

Differential Revision: D80588011

Pulled By: hx235

fbshipit-source-id: 6987e08a4855782305ad742eef6c0196da0d67ca
---
 db/compaction/compaction_iterator.cc | 20 +++++++++-----------
 db/compaction/compaction_job.cc      |  5 ++---
 2 files changed, 11 insertions(+), 14 deletions(-)

diff --git a/db/compaction/compaction_iterator.cc b/db/compaction/compaction_iterator.cc
index 58f3afaea662..81d6266bdf61 100644
--- a/db/compaction/compaction_iterator.cc
+++ b/db/compaction/compaction_iterator.cc
@@ -1120,17 +1120,15 @@ void CompactionIterator::NextFromInput() {
     }
   }
 
-  if (!Valid() && IsShuttingDown()) {
-    status_ = Status::ShutdownInProgress();
-  }
-
-  if (IsPausingManualCompaction()) {
-    status_ = Status::Incomplete(Status::SubCode::kManualCompactionPaused);
-  }
-
-  // Propagate corruption status from memtable itereator
-  if (!input_.Valid() && input_.status().IsCorruption()) {
-    status_ = input_.status();
+  if (status_.ok()) {
+    if (!Valid() && IsShuttingDown()) {
+      status_ = Status::ShutdownInProgress();
+    } else if (IsPausingManualCompaction()) {
+      status_ = Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+    } else if (!input_.Valid() && input_.status().IsCorruption()) {
+      // Propagate corruption status from memtable iterator
+      status_ = input_.status();
+    }
   }
 }
 
diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc
index 07bad56d3cb2..907e178df804 100644
--- a/db/compaction/compaction_job.cc
+++ b/db/compaction/compaction_job.cc
@@ -1610,11 +1610,10 @@ Status CompactionJob::FinalizeProcessKeyValueStatus(
     status =
         Status::ColumnFamilyDropped("Column family dropped during compaction");
   }
-  if ((status.ok() || status.IsColumnFamilyDropped()) &&
-      shutting_down_->load(std::memory_order_relaxed)) {
+  if (status.ok() && shutting_down_->load(std::memory_order_relaxed)) {
     status = Status::ShutdownInProgress("Database shutdown");
   }
-  if ((status.ok() || status.IsColumnFamilyDropped()) &&
+  if (status.ok() &&
       (manual_compaction_canceled_.load(std::memory_order_relaxed))) {
     status = Status::Incomplete(Status::SubCode::kManualCompactionPaused);
   }

From 8fa2aae7f4abf9dd7f160256b1396469fca7f771 Mon Sep 17 00:00:00 2001
From: Jay Huh <jewoongh@meta.com>
Date: Tue, 2 Sep 2025 15:32:12 -0700
Subject: [PATCH 258/500] Re-enable Remote Compaction Stress Test (#13913)

Summary:
Re-enabling Remote Compaction Stress Test with some changes to stress test feature combo sanitization changes

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13913

Test Plan:
Ran Meta Internal Tests for a few days

# Follow up
- Skip recovering from WAL in remote worker and re-enable WAL
- Investigate and fix races with Integrated BlobDB

Reviewed By: hx235

Differential Revision: D81509225

Pulled By: jaykorean

fbshipit-source-id: 949762c48ece0a25e3d0281e3510f1e7d3fe3667
---
 tools/db_crashtest.py | 27 +++++++++++++++++----------
 1 file changed, 17 insertions(+), 10 deletions(-)

diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index 42e6f860b385..09ac7250a08b 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -367,8 +367,7 @@ def setup_random_seed_before_main():
     "allow_unprepared_value": lambda: random.choice([0, 1]),
     # TODO(hx235): enable `track_and_verify_wals` after stabalizing the stress test
     "track_and_verify_wals": lambda: random.choice([0]),
-    # TODO(jaykorean): Re-enable remote compaction once all incompatible features are addressed in stress test
-    "remote_compaction_worker_threads": lambda: 0,
+    "remote_compaction_worker_threads": lambda: random.choice([0, 8]),
     "auto_refresh_iterator_with_snapshot": lambda: random.choice([0, 1]),
     "memtable_op_scan_flush_trigger": lambda: random.choice([0, 10, 100, 1000]),
     "memtable_avg_op_scan_flush_trigger": lambda: random.choice([0, 2, 20, 200]),
@@ -505,6 +504,8 @@ def is_direct_io_supported(dbname):
     #
     # Second, we need to make sure disabling WAL works with `-reopen > 0`.
     "disable_wal": 0,
+    # TODO: Re-enable this once we fix WAL + Remote Compaction in Stress Test
+    "remote_compaction_worker_threads": 0,
     "duration": 10000,
     "log2_keys_per_lock": 10,
     "ops_per_thread": 200000,
@@ -566,6 +567,8 @@ def is_direct_io_supported(dbname):
     # TODO: there is such a thing as transactions with WAL disabled. We should
     # cover that case.
     "disable_wal": 0,
+    # TODO: Re-enable this once we fix WAL + Remote Compaction in Stress Test
+    "remote_compaction_worker_threads": 0,
     # OpenReadOnly after checkpoint is not currnetly compatible with WritePrepared txns
     "checkpoint_one_in": 0,
     # pipeline write is not currnetly compatible with WritePrepared txns
@@ -776,6 +779,18 @@ def finalize_and_sanitize(src_params):
     if dest_params.get("best_efforts_recovery") == 1:
         dest_params["inplace_update_support"] = 0
 
+    # Remote Compaction Incompatible Tests and Features
+    if dest_params.get("remote_compaction_worker_threads", 0) > 0:
+        # TODO Fix races when both Remote Compaction + BlobDB enabled
+        dest_params["enable_blob_files"] = 0
+        dest_params["enable_blob_garbage_collection"] = 0
+        # TODO Fix - Remote worker shouldn't recover from WAL
+        dest_params["disable_wal"] = 1
+        # Disable Incompatible Ones
+        dest_params["inplace_update_support"] = 0
+        dest_params["checkpoint_one_in"] = 0
+        dest_params["use_timed_put_one_in"] = 0
+
     # Multi-key operations are not currently compatible with transactions or
     # timestamp.
     if (
@@ -838,14 +853,6 @@ def finalize_and_sanitize(src_params):
             dest_params["allow_concurrent_memtable_write"] = 1
         else:
             dest_params["unordered_write"] = 0
-    if dest_params.get("remote_compaction_worker_threads", 0) > 0:
-        # TODO Fix races when both Remote Compaction + BlobDB enabled
-        dest_params["enable_blob_files"] = 0
-        # TODO Fix - Remote worker shouldn't recover from WAL
-        dest_params["disable_wal"] = 1
-        # Disable Incompatible Ones
-        dest_params["checkpoint_one_in"] = 0
-        dest_params["use_timed_put_one_in"] = 0
     if dest_params.get("disable_wal", 0) == 1:
         dest_params["atomic_flush"] = 1
         dest_params["sync"] = 0

From a34683bf543cc3eb151d08eeac00791862acd4d6 Mon Sep 17 00:00:00 2001
From: Jay Huh <jewoongh@meta.com>
Date: Tue, 2 Sep 2025 21:23:11 -0700
Subject: [PATCH 259/500] Disable Remote Compaction when Integrated BlobDB is
 enabled in Stress Test (#13916)

Summary:
Fixing "Integrated BlobDB is currently incompatible with Remote Compaction" error

https://github.com/facebook/rocksdb/actions/runs/17417658959/job/49449586139

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13916

Test Plan: CI

Reviewed By: anand1976

Differential Revision: D81537676

Pulled By: jaykorean

fbshipit-source-id: f5e2c40cd498a17cb08486a1cb9404ccf1d812e0
---
 tools/db_crashtest.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index 09ac7250a08b..414c56e516d4 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -626,6 +626,9 @@ def is_direct_io_supported(dbname):
     "use_shared_block_and_blob_cache": lambda: random.randint(0, 1),
     "blob_cache_size": lambda: random.choice([1048576, 2097152, 4194304, 8388608]),
     "prepopulate_blob_cache": lambda: random.randint(0, 1),
+
+     # TODO Fix races when both Remote Compaction + BlobDB enabled
+     "remote_compaction_worker_threads": 0,
 }
 
 ts_params = {
@@ -677,6 +680,8 @@ def is_direct_io_supported(dbname):
     "two_write_queues": lambda: random.choice([0, 1]),
     # TODO: enable write-prepared
     "disable_wal": 0,
+    # TODO: Re-enable this once we fix WAL + Remote Compaction in Stress Test
+    "remote_compaction_worker_threads": 0,
     "use_only_the_last_commit_time_batch_for_recovery": lambda: random.choice([0, 1]),
     "clear_column_family_one_in": 0,
     "column_families": 1,
@@ -784,6 +789,7 @@ def finalize_and_sanitize(src_params):
         # TODO Fix races when both Remote Compaction + BlobDB enabled
         dest_params["enable_blob_files"] = 0
         dest_params["enable_blob_garbage_collection"] = 0
+        dest_params["allow_setting_blob_options_dynamically"] = 0
         # TODO Fix - Remote worker shouldn't recover from WAL
         dest_params["disable_wal"] = 1
         # Disable Incompatible Ones

From dfbcdaf70eda84afa18c7baf759b7e16b2714ca3 Mon Sep 17 00:00:00 2001
From: Jay Huh <jewoongh@meta.com>
Date: Wed, 3 Sep 2025 12:33:44 -0700
Subject: [PATCH 260/500] Disable Remote Compaction in UDT enabled Stress Tests
 (#13919)

Summary:
# Summary

Until we get WAL + Remote Compaction in Stress Test working, temporarily disable this

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13919

Test Plan: Meta Internal CI run

Reviewed By: anand1976

Differential Revision: D81605621

Pulled By: jaykorean

fbshipit-source-id: 6e1f9a0a7a0f27e7465512689b51364b63ef3e2b
---
 tools/db_crashtest.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index 414c56e516d4..311a8aee2cb2 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -645,6 +645,9 @@ def is_direct_io_supported(dbname):
     "use_put_entity_one_in": 0,
     # TimedPut is not compatible with user-defined timestamps yet.
     "use_timed_put_one_in": 0,
+    # when test_best_efforts_recovery == true, disable_wal becomes 0.
+    # TODO: Re-enable this once we fix WAL + Remote Compaction in Stress Test
+    "remote_compaction_worker_threads": 0,
 }
 
 tiered_params = {

From a805c9b9a8e72143856c6a3438d1f6778f52d767 Mon Sep 17 00:00:00 2001
From: Changyu Bi <changyubi@meta.com>
Date: Fri, 5 Sep 2025 12:40:32 -0700
Subject: [PATCH 261/500] Add option to limit max prefetching in MultiScan
 (#13920)

Summary:
Add a new option `MultiScanArgs::max_prefetch_size` that limits the memory usage of per file pinning of prefetched blocks. Note that this only accounts for compressed block size. This is intended to be a stopgap until we implement some kind of global prefetch manager that limits the global multiscan memory usage.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13920

Test Plan: new unit test `./block_based_table_reader_test --gtest_filter="*MultiScanPrefetchSizeLimit/*"`

Reviewed By: xingbowang

Differential Revision: D81630629

Pulled By: cbi42

fbshipit-source-id: 9f66678915242fe1220620531a4b9fd22747cdea
---
 include/rocksdb/options.h                     |  16 ++
 include/rocksdb/status.h                      |  12 ++
 .../block_based/block_based_table_iterator.cc |  70 ++++--
 .../block_based/block_based_table_iterator.h  |  11 +-
 .../block_based_table_reader_test.cc          | 199 ++++++++++++++++++
 .../new_features/multi-scan-max-prefetch.md   |   1 +
 util/status.cc                                |   1 +
 7 files changed, 294 insertions(+), 16 deletions(-)
 create mode 100644 unreleased_history/new_features/multi-scan-max-prefetch.md

diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index 206085b208a7..9ba148aa0e89 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -1790,9 +1790,11 @@ class MultiScanArgs {
     comp_ = other.comp_;
     original_ranges_ = other.original_ranges_;
     io_coalesce_threshold = other.io_coalesce_threshold;
+    max_prefetch_size = other.max_prefetch_size;
   }
   MultiScanArgs(MultiScanArgs&& other) noexcept
       : io_coalesce_threshold(other.io_coalesce_threshold),
+        max_prefetch_size(other.max_prefetch_size),
         comp_(other.comp_),
         original_ranges_(std::move(other.original_ranges_)) {}
 
@@ -1800,6 +1802,7 @@ class MultiScanArgs {
     comp_ = other.comp_;
     original_ranges_ = other.original_ranges_;
     io_coalesce_threshold = other.io_coalesce_threshold;
+    max_prefetch_size = other.max_prefetch_size;
     return *this;
   }
 
@@ -1808,6 +1811,7 @@ class MultiScanArgs {
       comp_ = other.comp_;
       original_ranges_ = std::move(other.original_ranges_);
       io_coalesce_threshold = other.io_coalesce_threshold;
+      max_prefetch_size = other.max_prefetch_size;
     }
     return *this;
   }
@@ -1849,6 +1853,18 @@ class MultiScanArgs {
 
   uint64_t io_coalesce_threshold = 16 << 10;  // 16KB by default
 
+  // Maximum size (in bytes) for the data blocks loaded by a MultiScan.
+  // This limits the amount of I/O and memory usage by pinned data blocks.
+  //
+  // When set to 0 (the default), there is no limit. When the limit is reached,
+  // the iterator will start returning Status::PrefetchLimitReached().
+  //
+  // Note that prefetching happens only once in Prepare(), which is different
+  // from ReadOptions::readahead_size, which applies any time the iterator does
+  // I/O.
+  // Note that this limit is per file and applies to compressed block size.
+  uint64_t max_prefetch_size = 0;
+
  private:
   // The comparator used for ordering ranges
   const Comparator* comp_;
diff --git a/include/rocksdb/status.h b/include/rocksdb/status.h
index fad18d673936..4dcd5eb3a026 100644
--- a/include/rocksdb/status.h
+++ b/include/rocksdb/status.h
@@ -115,6 +115,7 @@ class Status {
     kIOFenced = 14,
     kMergeOperatorFailed = 15,
     kMergeOperandThresholdExceeded = 16,
+    kPrefetchLimitReached = 17,
     kMaxSubCode
   };
 
@@ -318,6 +319,10 @@ class Status {
 
   static Status LockLimit() { return Status(kAborted, kLockLimit); }
 
+  static Status PrefetchLimitReached() {
+    return Status(kIncomplete, kPrefetchLimitReached);
+  }
+
   // Returns true iff the status indicates success.
   bool ok() const {
     MarkChecked();
@@ -486,6 +491,13 @@ class Status {
     return (code() == kIOError) && (subcode() == kIOFenced);
   }
 
+  // Returns true iff the status indicates prefetch limit reached during
+  // MultiScan.
+  bool IsPrefetchLimitReached() const {
+    MarkChecked();
+    return (code() == kIncomplete) && (subcode() == kPrefetchLimitReached);
+  }
+
   // Return a string representation of this status suitable for printing.
   // Returns the string "OK" for success.
   std::string ToString() const;
diff --git a/table/block_based/block_based_table_iterator.cc b/table/block_based/block_based_table_iterator.cc
index a8d821e2c326..2a7d9893360b 100644
--- a/table/block_based/block_based_table_iterator.cc
+++ b/table/block_based/block_based_table_iterator.cc
@@ -982,7 +982,6 @@ void BlockBasedTableIterator::Prepare(const MultiScanArgs* multiscan_opts) {
 
   // Gather all relevant data block handles
   std::vector<BlockHandle> blocks_to_prepare;
-  Status s;
   std::vector<std::tuple<size_t, size_t>> block_ranges_per_scan;
   for (const auto& scan_opt : *scan_opts) {
     size_t num_blocks = 0;
@@ -1042,11 +1041,26 @@ void BlockBasedTableIterator::Prepare(const MultiScanArgs* multiscan_opts) {
   // Look up entries in cache and pin if exist.
   // Store indices of blocks to read.
   std::vector<size_t> blocks_to_read;
-  std::vector<CachableEntry<Block>> pinned_data_blocks_guard;
-  pinned_data_blocks_guard.resize(blocks_to_prepare.size());
+  std::vector<CachableEntry<Block>> pinned_data_blocks_guard(
+      blocks_to_prepare.size());
+  uint64_t total_prefetch_size = 0;
+
   for (size_t i = 0; i < blocks_to_prepare.size(); ++i) {
     const auto& data_block_handle = blocks_to_prepare[i];
-    s = table_->LookupAndPinBlocksInCache<Block_kData>(
+
+    // Check if we would exceed the prefetch size limit with this block
+    total_prefetch_size +=
+        BlockBasedTable::BlockSizeWithTrailer(data_block_handle);
+    if (multiscan_opts->max_prefetch_size > 0 &&
+        total_prefetch_size > multiscan_opts->max_prefetch_size) {
+      // All remaining blocks are by default empty.
+      for (size_t j = i; j < blocks_to_prepare.size(); ++j) {
+        assert(pinned_data_blocks_guard[j].IsEmpty());
+      }
+      break;
+    }
+
+    Status s = table_->LookupAndPinBlocksInCache<Block_kData>(
         read_options_, data_block_handle,
         &pinned_data_blocks_guard[i].As<Block_kData>());
 
@@ -1088,10 +1102,13 @@ void BlockBasedTableIterator::Prepare(const MultiScanArgs* multiscan_opts) {
 
     // do IO
     IOOptions io_opts;
-    s = table_->get_rep()->file->PrepareIOOptions(read_options_, io_opts);
-    if (!s.ok()) {
-      // Abort: PrepareIOOptions failed
-      return;
+    {
+      Status s =
+          table_->get_rep()->file->PrepareIOOptions(read_options_, io_opts);
+      if (!s.ok()) {
+        // Abort: PrepareIOOptions failed
+        return;
+      }
     }
 
     // Init read requests for Multi-Read
@@ -1163,11 +1180,13 @@ void BlockBasedTableIterator::Prepare(const MultiScanArgs* multiscan_opts) {
     }
 
     AlignedBuf aligned_buf;
-    s = table_->get_rep()->file.get()->MultiRead(
-        io_opts, read_reqs.data(), read_reqs.size(),
-        direct_io ? &aligned_buf : nullptr);
-    if (!s.ok()) {
-      return;
+    {
+      Status s = table_->get_rep()->file.get()->MultiRead(
+          io_opts, read_reqs.data(), read_reqs.size(),
+          direct_io ? &aligned_buf : nullptr);
+      if (!s.ok()) {
+        return;
+      }
     }
     for (auto& req : read_reqs) {
       if (!req.status.ok()) {
@@ -1181,7 +1200,8 @@ void BlockBasedTableIterator::Prepare(const MultiScanArgs* multiscan_opts) {
         table_->get_rep()->decompressor.get();
     CachableEntry<DecompressorDict> cached_dict;
     if (table_->get_rep()->uncompression_dict_reader) {
-      s = table_->get_rep()
+      Status s =
+          table_->get_rep()
               ->uncompression_dict_reader->GetOrReadUncompressionDictionary(
                   /* prefetch_buffer= */ nullptr, read_options_,
                   /* get_context= */ nullptr, /* lookup_context= */ nullptr,
@@ -1226,7 +1246,7 @@ void BlockBasedTableIterator::Prepare(const MultiScanArgs* multiscan_opts) {
             table_->get_rep()->footer.GetBlockTrailerSize() > 0;
 #endif
         assert(pinned_data_blocks_guard[block_idx].IsEmpty());
-        s = table_->CreateAndPinBlockInCache<Block_kData>(
+        Status s = table_->CreateAndPinBlockInCache<Block_kData>(
             read_options_, block, decompressor, &tmp_contents,
             &(pinned_data_blocks_guard[block_idx].As<Block_kData>()));
         if (!s.ok()) {
@@ -1290,6 +1310,16 @@ bool BlockBasedTableIterator::SeekMultiScan(const Slice* target) {
       }
 
       ResetDataIter();
+
+      // Check if we've hit an empty entry indicating prefetch limit reached
+      if (multi_scan_->pinned_data_blocks[cur_scan_start_idx].IsEmpty()) {
+        multi_scan_->cur_data_block_idx = cur_scan_start_idx;
+        multi_scan_->prefetch_limit_reached = true;
+        assert(!Valid());
+        assert(status().IsPrefetchLimitReached());
+        return true;
+      }
+
       // Note that the block_iter_ takes ownership of the pinned data block
       // TODO: we can delegate the clean up like with pinned_iters_mgr_ if
       // need to pin blocks longer.
@@ -1346,6 +1376,16 @@ void BlockBasedTableIterator::FindBlockForwardInMultiScan() {
     // Move to the next pinned data block
     ResetDataIter();
     ++multi_scan_->cur_data_block_idx;
+
+    // Check if we've hit an empty entry indicating prefetch limit reached
+    if (multi_scan_->pinned_data_blocks[multi_scan_->cur_data_block_idx]
+            .IsEmpty()) {
+      multi_scan_->prefetch_limit_reached = true;
+      assert(!Valid());
+      assert(status().IsPrefetchLimitReached());
+      return;
+    }
+
     table_->NewDataBlockIterator<DataBlockIter>(
         read_options_,
         multi_scan_->pinned_data_blocks[multi_scan_->cur_data_block_idx],
diff --git a/table/block_based/block_based_table_iterator.h b/table/block_based/block_based_table_iterator.h
index d31296fcf841..dfc9cf479083 100644
--- a/table/block_based/block_based_table_iterator.h
+++ b/table/block_based/block_based_table_iterator.h
@@ -145,10 +145,14 @@ class BlockBasedTableIterator : public InternalIteratorBase<Slice> {
       assert(!multi_scan_);
       return index_iter_->status();
     } else if (block_iter_points_to_real_block_) {
+      // This is the common case.
       return block_iter_.status();
     } else if (async_read_in_progress_) {
       assert(!multi_scan_);
       return Status::TryAgain("Async read in progress");
+    } else if (multi_scan_ && multi_scan_->prefetch_limit_reached) {
+      assert(!Valid());
+      return Status::PrefetchLimitReached();
     } else {
       return Status::OK();
     }
@@ -385,6 +389,10 @@ class BlockBasedTableIterator : public InternalIteratorBase<Slice> {
     size_t next_scan_idx;
     size_t cur_data_block_idx;
 
+    // When true, the iterator will return
+    // Status::Incomplete(Status::kPrefetchLimitReached).
+    bool prefetch_limit_reached;
+
     MultiScanState(
         const MultiScanArgs* _scan_opts,
         std::vector<CachableEntry<Block>>&& _pinned_data_blocks,
@@ -393,7 +401,8 @@ class BlockBasedTableIterator : public InternalIteratorBase<Slice> {
           pinned_data_blocks(std::move(_pinned_data_blocks)),
           block_ranges_per_scan(std::move(_block_ranges_per_scan)),
           next_scan_idx(0),
-          cur_data_block_idx(0) {}
+          cur_data_block_idx(0),
+          prefetch_limit_reached(false) {}
   };
 
   std::unique_ptr<MultiScanState> multi_scan_;
diff --git a/table/block_based/block_based_table_reader_test.cc b/table/block_based/block_based_table_reader_test.cc
index 6f22965eb7df..41728894e76d 100644
--- a/table/block_based/block_based_table_reader_test.cc
+++ b/table/block_based/block_based_table_reader_test.cc
@@ -1176,6 +1176,205 @@ TEST_P(BlockBasedTableReaderTest, MultiScanPrepare) {
   ASSERT_OK(iter->status());
 }
 
+TEST_P(BlockBasedTableReaderTest, MultiScanPrefetchSizeLimit) {
+  if (compression_type_ != kNoCompression) {
+    // This test relies on block sizes to be close to what's set in option.
+    ROCKSDB_GTEST_BYPASS("This test assumes no compression.");
+    return;
+  }
+  Options options;
+  ReadOptions read_opts;
+  size_t ts_sz = options.comparator->timestamp_size();
+
+  // Generate data that spans multiple blocks
+  std::vector<std::pair<std::string, std::string>> kv =
+      BlockBasedTableReaderBaseTest::GenerateKVMap(
+          20 /* num_block */, true /* mixed_with_human_readable_string_value */,
+          ts_sz);
+
+  std::string table_name = "BlockBasedTableReaderTest_PrefetchSizeLimit" +
+                           CompressionTypeToString(compression_type_);
+
+  ImmutableOptions ioptions(options);
+  CreateTable(table_name, ioptions, compression_type_, kv,
+              compression_parallel_threads_, compression_dict_bytes_);
+
+  std::unique_ptr<BlockBasedTable> table;
+  FileOptions foptions;
+  foptions.use_direct_reads = use_direct_reads_;
+  InternalKeyComparator comparator(options.comparator);
+  NewBlockBasedTableReader(foptions, ioptions, comparator, table_name, &table,
+                           true /* bool prefetch_index_and_filter_in_cache */,
+                           nullptr /* status */, persist_udt_);
+
+  // Default block size is 4KB
+  //
+  // Tests when no block is loaded
+  {
+    std::unique_ptr<InternalIterator> iter;
+    iter.reset(table->NewIterator(
+        read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
+        /*skip_filters=*/false, TableReaderCaller::kUncategorized));
+
+    MultiScanArgs scan_options(BytewiseComparator());
+    scan_options.max_prefetch_size = 1024;  // less than block size
+    scan_options.insert(ExtractUserKey(kv[0].first),
+                        ExtractUserKey(kv[5].first));
+
+    iter->Prepare(&scan_options);
+
+    // Should be able to scan the first block, but not more
+    iter->Seek(kv[0].first);
+    ASSERT_FALSE(iter->Valid());
+    ASSERT_TRUE(iter->status().IsPrefetchLimitReached());
+  }
+
+  // Some blocks are loaded
+  {
+    std::unique_ptr<InternalIterator> iter;
+    iter.reset(table->NewIterator(
+        read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
+        /*skip_filters=*/false, TableReaderCaller::kUncategorized));
+
+    MultiScanArgs scan_options(BytewiseComparator());
+    scan_options.max_prefetch_size = 9 * 1024;  // 9KB - 2 blocks with buffer
+    scan_options.insert(ExtractUserKey(kv[1 * kEntriesPerBlock].first),
+                        ExtractUserKey(kv[8 * kEntriesPerBlock].first));
+
+    iter->Prepare(&scan_options);
+    iter->Seek(kv[1 * kEntriesPerBlock].first);
+    size_t scanned_keys = 0;
+
+    // Should be able to scan up to 2 blocks worth of data
+    while (iter->Valid()) {
+      ASSERT_EQ(iter->key().ToString(),
+                kv[scanned_keys + 1 * kEntriesPerBlock].first);
+      iter->Next();
+      scanned_keys++;
+    }
+
+    ASSERT_TRUE(iter->status().IsPrefetchLimitReached());
+    ASSERT_EQ(scanned_keys, 2 * kEntriesPerBlock);
+  }
+
+  // Tests with some block loaded in cache already:
+  // Blocks 1 and 2 are already in cache by the above test.
+  // Here we try blocks 0 - 5, with prefetch limit to 3 blocks, and expect to
+  // read 3 blocks.
+  {
+    std::unique_ptr<InternalIterator> iter;
+    iter.reset(table->NewIterator(
+        read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
+        /*skip_filters=*/false, TableReaderCaller::kUncategorized));
+
+    MultiScanArgs scan_options(BytewiseComparator());
+    scan_options.max_prefetch_size = 3 * 4 * 1024 + 1024;  // 3 blocks + 1KB
+    scan_options.insert(ExtractUserKey(kv[0].first),
+                        ExtractUserKey(kv[5 * kEntriesPerBlock].first));
+
+    iter->Prepare(&scan_options);
+    iter->Seek(kv[0].first);
+    size_t scanned_keys = 0;
+    // Should only read 3 blocks (blocks 0, 1, 2)
+    // already cached.
+    while (iter->Valid()) {
+      ASSERT_EQ(iter->key().ToString(), kv[scanned_keys].first);
+      iter->Next();
+      scanned_keys++;
+    }
+    ASSERT_TRUE(iter->status().IsPrefetchLimitReached());
+    ASSERT_EQ(scanned_keys, 3 * kEntriesPerBlock);
+  }
+
+  // Multiple scan ranges with prefetch limit
+  {
+    std::unique_ptr<InternalIterator> iter;
+    iter.reset(table->NewIterator(
+        read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
+        /*skip_filters=*/false, TableReaderCaller::kUncategorized));
+
+    MultiScanArgs scan_options(BytewiseComparator());
+    scan_options.max_prefetch_size = 5 * 4 * 1024 + 1024;  // 5 blocks + 1KB
+    // Will read 5 entries from first scan range, and 4 blocks from the second
+    // scan range
+    scan_options.insert(ExtractUserKey(kv[0].first),
+                        ExtractUserKey(kv[5].first));
+    scan_options.insert(ExtractUserKey(kv[12 * kEntriesPerBlock].first),
+                        ExtractUserKey(kv[17 * kEntriesPerBlock].first));
+    scan_options.insert(ExtractUserKey(kv[18 * kEntriesPerBlock].first),
+                        ExtractUserKey(kv[19 * kEntriesPerBlock].first));
+
+    iter->Prepare(&scan_options);
+
+    iter->Seek(kv[0].first);
+    size_t scanned_keys = 0;
+    size_t key_idx = 0;
+    while (iter->Valid()) {
+      ASSERT_EQ(iter->key().ToString(), kv[key_idx].first);
+      iter->Next();
+      scanned_keys++;
+      key_idx++;
+      if (key_idx == 5) {
+        iter->Seek(kv[12 * kEntriesPerBlock].first);
+        key_idx = 12 * kEntriesPerBlock;
+      }
+    }
+    ASSERT_EQ(scanned_keys, 5 + 4 * kEntriesPerBlock);
+    ASSERT_TRUE(iter->status().IsPrefetchLimitReached());
+  }
+
+  // Prefetch limit is big enough for all scan ranges.
+  {
+    std::unique_ptr<InternalIterator> iter;
+    iter.reset(table->NewIterator(
+        read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
+        /*skip_filters=*/false, TableReaderCaller::kUncategorized));
+
+    MultiScanArgs scan_options(BytewiseComparator());
+    scan_options.max_prefetch_size = 10 * 1024 * 1024;  // 10MB
+    scan_options.insert(ExtractUserKey(kv[0].first),
+                        ExtractUserKey(kv[5].first));
+    scan_options.insert(ExtractUserKey(kv[8 * kEntriesPerBlock].first),
+                        ExtractUserKey(kv[12 * kEntriesPerBlock].first));
+    scan_options.insert(ExtractUserKey(kv[18 * kEntriesPerBlock].first),
+                        ExtractUserKey(kv[19 * kEntriesPerBlock].first));
+
+    iter->Prepare(&scan_options);
+
+    iter->Seek(kv[0].first);
+    size_t scanned_keys = 0;
+    size_t key_idx = 0;
+    // Scan first range
+    while (iter->Valid() && key_idx < 5) {
+      ASSERT_EQ(iter->key().ToString(), kv[key_idx].first);
+      iter->Next();
+      scanned_keys++;
+      key_idx++;
+    }
+    // Move to second range
+    iter->Seek(kv[8 * kEntriesPerBlock].first);
+    key_idx = 8 * kEntriesPerBlock;
+    while (iter->Valid() && key_idx < 12 * kEntriesPerBlock) {
+      ASSERT_EQ(iter->key().ToString(), kv[key_idx].first);
+      iter->Next();
+      scanned_keys++;
+      key_idx++;
+    }
+    // Move to third range
+    iter->Seek(kv[18 * kEntriesPerBlock].first);
+    key_idx = 18 * kEntriesPerBlock;
+    while (iter->Valid() && key_idx < 19 * kEntriesPerBlock) {
+      ASSERT_EQ(iter->key().ToString(), kv[key_idx].first);
+      iter->Next();
+      scanned_keys++;
+      key_idx++;
+    }
+    // Should not hit prefetch limit
+    ASSERT_OK(iter->status());
+    ASSERT_EQ(scanned_keys, 5 + 4 * kEntriesPerBlock + 1 * kEntriesPerBlock);
+  }
+}
+
 // Param 1: compression type
 // Param 2: whether to use direct reads
 // Param 3: Block Based Table Index type, partitioned filters are also enabled
diff --git a/unreleased_history/new_features/multi-scan-max-prefetch.md b/unreleased_history/new_features/multi-scan-max-prefetch.md
new file mode 100644
index 000000000000..4725de1e52b0
--- /dev/null
+++ b/unreleased_history/new_features/multi-scan-max-prefetch.md
@@ -0,0 +1 @@
+* Add new option `MultiScanArgs::max_prefetch_size` that limits the memory usage of per file pinning of prefetched blocks.
diff --git a/util/status.cc b/util/status.cc
index 8f49077406bc..56d62b66190a 100644
--- a/util/status.cc
+++ b/util/status.cc
@@ -46,6 +46,7 @@ static const char* msgs[static_cast<int>(Status::kMaxSubCode)] = {
     "IO fenced off",          // kIOFenced
     "Merge operator failed",  // kMergeOperatorFailed
     "Number of operands merged exceeded threshold",  // kMergeOperandThresholdExceeded
+    "MultiScan reached file prefetch limit",         // kMultiScanPrefetchLimit
 };
 
 Status::Status(Code _code, SubCode _subcode, const Slice& msg,

From 0044a76d36e71ebf8d1c0f8c6a17912d94dcfdbd Mon Sep 17 00:00:00 2001
From: anand76 <anand1976@users.noreply.github.com>
Date: Fri, 5 Sep 2025 19:06:28 -0700
Subject: [PATCH 262/500] Make failure to load UDI when opening an SST a soft
 failure (#13921)

Summary:
If user_defined_index_factory in BlockBasedTableOptions is configured and we try to open an SST file without the corresponding UDI (either during DB open or file ingestion), ignore a failure to load the UDI by default. If fail_if_no_udi_on_open in BlockBasedTableOptions is true, then treat it as a fatal error.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13921

Test Plan: Update unit tests

Reviewed By: xingbowang

Differential Revision: D81826054

Pulled By: anand1976

fbshipit-source-id: f4fe0b13ccb02b9448622af487680131e349c52b
---
 include/rocksdb/statistics.h                  |  3 +
 include/rocksdb/table.h                       |  6 ++
 monitoring/statistics.cc                      |  2 +
 options/options_settable_test.cc              |  3 +-
 .../block_based/block_based_table_factory.cc  | 11 +++
 table/block_based/block_based_table_reader.cc | 65 +++++++++++-----
 table/table_test.cc                           | 78 +++++++++++++++++++
 .../new_features/fail_if_no_udi_on_open.md    |  1 +
 8 files changed, 150 insertions(+), 19 deletions(-)
 create mode 100644 unreleased_history/new_features/fail_if_no_udi_on_open.md

diff --git a/include/rocksdb/statistics.h b/include/rocksdb/statistics.h
index cec02261fb57..0d88d8937c5f 100644
--- a/include/rocksdb/statistics.h
+++ b/include/rocksdb/statistics.h
@@ -542,6 +542,9 @@ enum Tickers : uint32_t {
   // TransactionOptions::large_txn_commit_optimize_threshold.
   NUMBER_WBWI_INGEST,
 
+  // Failure to load the UDI during SST table open
+  SST_USER_DEFINED_INDEX_LOAD_FAIL_COUNT,
+
   TICKER_ENUM_MAX
 };
 
diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h
index 6e71ed99f279..51dffe7cc4f6 100644
--- a/include/rocksdb/table.h
+++ b/include/rocksdb/table.h
@@ -509,6 +509,12 @@ struct BlockBasedTableOptions {
   // (CompressionOptions::parallel_threads sanitized to 1).
   std::shared_ptr<UserDefinedIndexFactory> user_defined_index_factory = nullptr;
 
+  // EXPERIMENTAL
+  //
+  // Return an error Status if a user_defined_index_factory is configured,
+  // but there's no corresponding UDI block in the SST file being opened.
+  bool fail_if_no_udi_on_open = false;
+
   // If true, place whole keys in the filter (not just prefixes).
   // This must generally be true for gets to be efficient.
   bool whole_key_filtering = true;
diff --git a/monitoring/statistics.cc b/monitoring/statistics.cc
index 0dbc0ac2ba8f..b2f7cbe59d69 100644
--- a/monitoring/statistics.cc
+++ b/monitoring/statistics.cc
@@ -273,6 +273,8 @@ const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
     {FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT,
      "rocksdb.file.read.corruption.retry.success.count"},
     {NUMBER_WBWI_INGEST, "rocksdb.number.wbwi.ingest"},
+    {SST_USER_DEFINED_INDEX_LOAD_FAIL_COUNT,
+     "rocksdb.sst.user.defined.index.load.fail.count"},
 };
 
 const std::vector<std::pair<Histograms, std::string>> HistogramsNameMap = {
diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc
index fe16b170446b..3df151b492bb 100644
--- a/options/options_settable_test.cc
+++ b/options/options_settable_test.cc
@@ -203,7 +203,8 @@ TEST_F(OptionsSettableTest, BlockBasedTableOptionsAllFieldsSettable) {
       "max_auto_readahead_size=0;"
       "prepopulate_block_cache=kDisable;"
       "initial_auto_readahead_size=0;"
-      "num_file_reads_for_auto_readahead=0",
+      "num_file_reads_for_auto_readahead=0;"
+      "fail_if_no_udi_on_open=true",
       new_bbto));
 
   ASSERT_EQ(unset_bytes_base,
diff --git a/table/block_based/block_based_table_factory.cc b/table/block_based/block_based_table_factory.cc
index ff6cdaaa2b74..7c11875252c3 100644
--- a/table/block_based/block_based_table_factory.cc
+++ b/table/block_based/block_based_table_factory.cc
@@ -399,6 +399,9 @@ static struct BlockBasedTableTypeInfo {
          {offsetof(struct BlockBasedTableOptions,
                    num_file_reads_for_auto_readahead),
           OptionType::kUInt64T, OptionVerificationType::kNormal}},
+        {"fail_if_no_udi_on_open",
+         {offsetof(struct BlockBasedTableOptions, fail_if_no_udi_on_open),
+          OptionType::kBoolean, OptionVerificationType::kNormal}},
     };
   }
 } block_based_table_type_info;
@@ -874,6 +877,14 @@ std::string BlockBasedTableFactory::GetPrintableOptions() const {
                ? "nullptr"
                : table_options_.filter_policy->Name());
   ret.append(buffer);
+  snprintf(buffer, kBufferSize, "  user_defined_index_factory: %s\n",
+           table_options_.user_defined_index_factory == nullptr
+               ? "nullptr"
+               : table_options_.user_defined_index_factory->Name());
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize, "  fail_if_no_udi_on_open: %d\n",
+           table_options_.fail_if_no_udi_on_open);
+  ret.append(buffer);
   snprintf(buffer, kBufferSize, "  whole_key_filtering: %d\n",
            table_options_.whole_key_filtering);
   ret.append(buffer);
diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc
index d52d246a2ea6..5c22173223de 100644
--- a/table/block_based/block_based_table_reader.cc
+++ b/table/block_based/block_based_table_reader.cc
@@ -1333,25 +1333,54 @@ Status BlockBasedTable::PrefetchIndexAndFilterBlocks(
     s = FindMetaBlock(meta_iter, kUserDefinedIndexPrefix + udi_name,
                       &udi_block_handle);
     if (!s.ok()) {
-      return s;
-    }
-    // Read the block, and allocate on heap or pin in cache. The UDI block is
-    // not compressed. RetrieveBlock will verify the checksum.
-    s = RetrieveBlock(prefetch_buffer, ro, udi_block_handle,
-                      rep_->decompressor.get(), &rep_->udi_block,
-                      /*get_context=*/nullptr, lookup_context,
-                      /*for_compaction=*/false, use_cache, /*async_read=*/false,
-                      /*use_block_cache_for_lookup=*/false);
-    if (!s.ok()) {
-      return s;
+      RecordTick(rep_->ioptions.statistics.get(),
+                 SST_USER_DEFINED_INDEX_LOAD_FAIL_COUNT);
+      if (table_options.fail_if_no_udi_on_open) {
+        ROCKS_LOG_ERROR(rep_->ioptions.logger,
+                        "Failed to find the the UDI block %s in file %s; %s",
+                        udi_name.c_str(), rep_->file->file_name().c_str(),
+                        s.ToString().c_str());
+        // MAke the status more informative
+        s = Status::Corruption(s.ToString(), rep_->file->file_name());
+        return s;
+      } else {
+        // Emit a warning, but ignore the error status
+        ROCKS_LOG_WARN(rep_->ioptions.logger,
+                       "Failed to find the the UDI block %s in file %s; %s",
+                       udi_name.c_str(), rep_->file->file_name().c_str(),
+                       s.ToString().c_str());
+        s = Status::OK();
+      }
     }
-    assert(!rep_->udi_block.IsEmpty());
 
-    std::unique_ptr<UserDefinedIndexReader> udi_reader =
-        table_options.user_defined_index_factory->NewReader(
-            rep_->udi_block.GetValue()->data);
-    index_reader = std::make_unique<UserDefinedIndexReaderWrapper>(
-        udi_name, std::move(index_reader), std::move(udi_reader));
+    // If the UDI block size is 0, that means there's effectively no user
+    // defined index. In that case, skip setting up the reader.
+    if (udi_block_handle.size() > 0) {
+      // Read the block, and allocate on heap or pin in cache. The UDI block is
+      // not compressed. RetrieveBlock will verify the checksum.
+      if (s.ok()) {
+        s = RetrieveBlock(prefetch_buffer, ro, udi_block_handle,
+                          rep_->decompressor.get(), &rep_->udi_block,
+                          /*get_context=*/nullptr, lookup_context,
+                          /*for_compaction=*/false, use_cache,
+                          /*async_read=*/false,
+                          /*use_block_cache_for_lookup=*/false);
+      }
+      if (s.ok()) {
+        assert(!rep_->udi_block.IsEmpty());
+
+        std::unique_ptr<UserDefinedIndexReader> udi_reader =
+            table_options.user_defined_index_factory->NewReader(
+                rep_->udi_block.GetValue()->data);
+        if (udi_reader) {
+          index_reader = std::make_unique<UserDefinedIndexReaderWrapper>(
+              udi_name, std::move(index_reader), std::move(udi_reader));
+        } else {
+          s = Status::Corruption("Failed to create UDI reader for " + udi_name +
+                                 " in file " + rep_->file->file_name());
+        }
+      }
+    }
   }
 
   rep_->index_reader = std::move(index_reader);
@@ -1359,7 +1388,7 @@ Status BlockBasedTable::PrefetchIndexAndFilterBlocks(
   // The partitions of partitioned index are always stored in cache. They
   // are hence follow the configuration for pin and prefetch regardless of
   // the value of cache_index_and_filter_blocks
-  if (prefetch_all || pin_partition) {
+  if (s.ok() && (prefetch_all || pin_partition)) {
     s = rep_->index_reader->CacheDependencies(ro, pin_partition,
                                               prefetch_buffer);
   }
diff --git a/table/table_test.cc b/table/table_test.cc
index 9185827c5959..4ff2a54ebc12 100644
--- a/table/table_test.cc
+++ b/table/table_test.cc
@@ -7480,6 +7480,9 @@ class UserDefinedIndexTest : public BlockBasedTableTestBase {
                           const Slice* first_key_in_next_block,
                           const BlockHandle& block_handle,
                           std::string* separator_scratch) override {
+        if (keys_added_ == 0) {
+          return last_key_in_current_block;
+        }
         EXPECT_EQ(last_key_in_current_block.size(), 5);
         if (first_key_in_next_block) {
           EXPECT_EQ(first_key_in_next_block->size(), 5);
@@ -7500,12 +7503,19 @@ class UserDefinedIndexTest : public BlockBasedTableTestBase {
 
       void OnKeyAdded(const Slice& key, ValueType /*value*/,
                       const Slice& /*value*/) override {
+        if (key.starts_with("dummy")) {
+          return;
+        }
         EXPECT_EQ(key.size(), 5);
         // Track keys added to the index
         keys_added_++;
       }
 
       Status Finish(Slice* index_contents) override {
+        if (entries_added_ == 0) {
+          *index_contents = Slice();
+          return Status::OK();
+        }
         // Serialize the index data
         std::string result;
         for (const auto& entry : index_data_) {
@@ -8020,6 +8030,7 @@ TEST_F(UserDefinedIndexTest, IngestTest) {
 
 // Verify that external file ingestion fails if we try to ingest an SST file
 // without the UDI and a UDI factory is configured in BlockBasedTableOptions
+// and fail_if_no_udi_on_open is true in BlockBasedTableOptions.
 TEST_F(UserDefinedIndexTest, IngestFailTest) {
   Options options;
   BlockBasedTableOptions table_options;
@@ -8051,6 +8062,7 @@ TEST_F(UserDefinedIndexTest, IngestFailTest) {
   auto user_defined_index_factory =
       std::make_shared<TestUserDefinedIndexFactory>();
   table_options.user_defined_index_factory = user_defined_index_factory;
+  table_options.fail_if_no_udi_on_open = true;
   options.table_factory.reset(NewBlockBasedTableFactory(table_options));
 
   std::unique_ptr<DB> db;
@@ -8065,6 +8077,72 @@ TEST_F(UserDefinedIndexTest, IngestFailTest) {
   s = db->IngestExternalFile(cfh, {ingest_file}, ifo);
   ASSERT_NOK(s);
 
+  ASSERT_OK(db->SetOptions(
+      cfh, {{"block_based_table_factory", "{fail_if_no_udi_on_open=false;}"}}));
+  s = db->IngestExternalFile(cfh, {ingest_file}, ifo);
+  ASSERT_OK(s);
+
+  ASSERT_OK(db->DestroyColumnFamilyHandle(cfh));
+  ASSERT_OK(db->Close());
+  ASSERT_OK(DestroyDB(dbname, options));
+}
+
+TEST_F(UserDefinedIndexTest, IngestEmptyUDI) {
+  Options options;
+  BlockBasedTableOptions table_options;
+  std::string dbname = test::PerThreadDBPath("user_defined_index_test");
+  std::string ingest_file = dbname + "test.sst";
+  std::string ingest_file2 = dbname + "dummy.sst";
+
+  // Set up the user-defined index factory
+  auto user_defined_index_factory =
+      std::make_shared<TestUserDefinedIndexFactory>();
+  table_options.user_defined_index_factory = user_defined_index_factory;
+  // Set up custom flush block policy that flushes every 3 keys
+  table_options.flush_block_policy_factory =
+      std::make_shared<CustomFlushBlockPolicyFactory>();
+
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  std::unique_ptr<SstFileWriter> writer;
+  writer.reset(new SstFileWriter(EnvOptions(), options));
+  ASSERT_OK(writer->Open(ingest_file));
+
+  // Add 100 keys instead of just 5
+  for (int i = 0; i < 100; i++) {
+    std::stringstream ss;
+    ss << std::setw(2) << std::setfill('0') << i;
+    std::string key = "key" + ss.str();
+    std::string value = "value" + ss.str();
+    ASSERT_OK(writer->Put(key, value));
+  }
+  ASSERT_OK(writer->Finish());
+  writer.reset();
+  writer.reset(new SstFileWriter(EnvOptions(), options));
+  ASSERT_OK(writer->Open(ingest_file2));
+  ASSERT_OK(writer->Put("dummy", "val"));
+  ASSERT_OK(writer->Finish());
+  writer.reset();
+
+  table_options.fail_if_no_udi_on_open = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  std::unique_ptr<DB> db;
+  options.create_if_missing = true;
+  Status s = DB::Open(options, dbname, &db);
+  ASSERT_OK(s);
+  ASSERT_TRUE(db != nullptr);
+  ColumnFamilyHandle* cfh = nullptr;
+  ASSERT_OK(db->CreateColumnFamily(options, "new_cf", &cfh));
+
+  std::vector<IngestExternalFileArg> ifa;
+  ifa.emplace_back();
+  ifa[0].column_family = cfh;
+  ifa[0].external_files.emplace_back(ingest_file);
+  ifa[0].external_files.emplace_back(ingest_file2);
+  s = db->IngestExternalFiles(ifa);
+  ASSERT_OK(s);
+
   ASSERT_OK(db->DestroyColumnFamilyHandle(cfh));
   ASSERT_OK(db->Close());
   ASSERT_OK(DestroyDB(dbname, options));
diff --git a/unreleased_history/new_features/fail_if_no_udi_on_open.md b/unreleased_history/new_features/fail_if_no_udi_on_open.md
new file mode 100644
index 000000000000..d250fd77e147
--- /dev/null
+++ b/unreleased_history/new_features/fail_if_no_udi_on_open.md
@@ -0,0 +1 @@
+Add the fail_if_no_udi_on_open flag in BlockBasedTableOption to control whether a missing user defined index block in a SST is a hard error or not.

From 96f796f93a3cbe3ce328f5fbeb4476e76ca50d11 Mon Sep 17 00:00:00 2001
From: Andrew Chang <andrewrchang@meta.com>
Date: Mon, 8 Sep 2025 09:25:34 -0700
Subject: [PATCH 263/500] Add logging for errors in external file ingestion
 path (#13905)

Summary:
This diff adds logging in various places in the external file ingestion code where we check for non-OK status codes.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13905

Test Plan: Debugging external file ingestion should be easier with additional logging.

Differential Revision: D81814033

Pulled By: archang19

fbshipit-source-id: 77f8b342cbad892acedc4603c02865c38886f2f4
---
 db/external_sst_file_ingestion_job.cc | 77 +++++++++++++++++++++++++++
 1 file changed, 77 insertions(+)

diff --git a/db/external_sst_file_ingestion_job.cc b/db/external_sst_file_ingestion_job.cc
index fc14b6613c73..086208095884 100644
--- a/db/external_sst_file_ingestion_job.cc
+++ b/db/external_sst_file_ingestion_job.cc
@@ -42,6 +42,9 @@ Status ExternalSstFileIngestionJob::Prepare(
     status =
         GetIngestedFileInfo(file_path, next_file_number++, &file_to_ingest, sv);
     if (!status.ok()) {
+      ROCKS_LOG_WARN(db_options_.info_log,
+                     "Failed to get ingested file info: %s: %s",
+                     file_path.c_str(), status.ToString().c_str());
       return status;
     }
 
@@ -189,6 +192,10 @@ Status ExternalSstFileIngestionJob::Prepare(
         ROCKS_LOG_INFO(db_options_.info_log,
                        "Tried to link file %s but it's not supported : %s",
                        path_outside_db.c_str(), status.ToString().c_str());
+      } else {
+        ROCKS_LOG_WARN(db_options_.info_log, "Failed to link file %s to %s: %s",
+                       path_outside_db.c_str(), path_inside_db.c_str(),
+                       status.ToString().c_str());
       }
     } else {
       f.copy_file = true;
@@ -213,6 +220,12 @@ Status ExternalSstFileIngestionJob::Prepare(
                         io_tracer_);
       // The destination of the copy will be ingested
       f.file_temperature = dst_temp;
+
+      if (!status.ok()) {
+        ROCKS_LOG_WARN(db_options_.info_log, "Failed to copy file %s to %s: %s",
+                       path_outside_db.c_str(), path_inside_db.c_str(),
+                       status.ToString().c_str());
+      }
     } else {
       // Note: we currently assume that linking files does not cross
       // temperatures, so no need to change f.file_temperature
@@ -438,6 +451,11 @@ Status ExternalSstFileIngestionJob::NeedsFlush(bool* flush_needed,
     }
     status = cfd_->RangesOverlapWithMemtables(
         ranges, super_version, db_options_.allow_data_in_errors, flush_needed);
+    if (!status.ok()) {
+      ROCKS_LOG_WARN(db_options_.info_log,
+                     "Failed to check ranges overlap with memtables: %s",
+                     status.ToString().c_str());
+    }
   }
   if (status.ok() && *flush_needed) {
     if (!ingestion_options_.allow_blocking_flush) {
@@ -472,6 +490,9 @@ Status ExternalSstFileIngestionJob::Run() {
   bool need_flush = false;
   status = NeedsFlush(&need_flush, super_version);
   if (!status.ok()) {
+    ROCKS_LOG_WARN(db_options_.info_log,
+                   "Failed to check if flush is needed: %s",
+                   status.ToString().c_str());
     return status;
   }
   if (need_flush) {
@@ -543,6 +564,9 @@ Status ExternalSstFileIngestionJob::Run() {
                                      &last_seqno, &batch_uppermost_level,
                                      prev_batch_uppermost_level);
     if (!status.ok()) {
+      ROCKS_LOG_WARN(db_options_.info_log,
+                     "Failed to assign levels for one batch: %s",
+                     status.ToString().c_str());
       return status;
     }
 
@@ -585,6 +609,8 @@ Status ExternalSstFileIngestionJob::AssignLevelsForOneBatch(
                                 &largest_parsed, false /* log_err_key */);
     }
     if (!status.ok()) {
+      ROCKS_LOG_WARN(db_options_.info_log, "Failed to parse internal key: %s",
+                     status.ToString().c_str());
       return status;
     }
 
@@ -607,6 +633,10 @@ Status ExternalSstFileIngestionJob::AssignLevelsForOneBatch(
 
     status = AssignGlobalSeqnoForIngestedFile(file, assigned_seqno);
     if (!status.ok()) {
+      ROCKS_LOG_WARN(
+          db_options_.info_log,
+          "Failed to assign global sequence number for ingested file: %s",
+          status.ToString().c_str());
       return status;
     }
     TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Run",
@@ -619,6 +649,9 @@ Status ExternalSstFileIngestionJob::AssignLevelsForOneBatch(
 
     status = GenerateChecksumForIngestedFile(file);
     if (!status.ok()) {
+      ROCKS_LOG_WARN(db_options_.info_log,
+                     "Failed to generate checksum for ingested file: %s",
+                     status.ToString().c_str());
       return status;
     }
 
@@ -844,6 +877,10 @@ Status ExternalSstFileIngestionJob::ResetTableReader(
   Status status =
       fs_->NewRandomAccessFile(external_file, fo, &sst_file, nullptr);
   if (!status.ok()) {
+    ROCKS_LOG_WARN(
+        db_options_.info_log,
+        "Failed to create random access file for external file %s: %s",
+        external_file.c_str(), status.ToString().c_str());
     return status;
   }
   Temperature updated_temp = sst_file->GetTemperature();
@@ -966,6 +1003,10 @@ Status ExternalSstFileIngestionJob::SanityCheckTableProperties(
     // user_defined_timestamps_persisted flag for the file.
     file_to_ingest->user_defined_timestamps_persisted = false;
   } else if (!s.ok()) {
+    ROCKS_LOG_WARN(
+        db_options_.info_log,
+        "ValidateUserDefinedTimestampsOptions failed for external file %s: %s",
+        external_file.c_str(), s.ToString().c_str());
     return s;
   }
 
@@ -990,6 +1031,9 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo(
   Status status = fs_->GetFileSize(external_file, IOOptions(),
                                    &file_to_ingest->file_size, nullptr);
   if (!status.ok()) {
+    ROCKS_LOG_WARN(db_options_.info_log,
+                   "Failed to get file size for external file %s: %s",
+                   external_file.c_str(), status.ToString().c_str());
     return status;
   }
 
@@ -1006,12 +1050,19 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo(
                             /*user_defined_timestamps_persisted=*/true, sv,
                             file_to_ingest, &table_reader);
   if (!status.ok()) {
+    ROCKS_LOG_WARN(db_options_.info_log,
+                   "Failed to reset table reader for external file %s: %s",
+                   external_file.c_str(), status.ToString().c_str());
     return status;
   }
 
   status = SanityCheckTableProperties(external_file, new_file_number, sv,
                                       file_to_ingest, &table_reader);
   if (!status.ok()) {
+    ROCKS_LOG_WARN(
+        db_options_.info_log,
+        "Failed to sanity check table properties for external file %s: %s",
+        external_file.c_str(), status.ToString().c_str());
     return status;
   }
 
@@ -1025,6 +1076,10 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo(
         table_reader.get(), sv, file_to_ingest, allow_data_in_errors);
 
     if (!seqno_status.ok()) {
+      ROCKS_LOG_WARN(
+          db_options_.info_log,
+          "Failed to get sequence number boundary for external file %s: %s",
+          external_file.c_str(), seqno_status.ToString().c_str());
       return seqno_status;
     }
     assert(file_to_ingest->smallest_seqno <= file_to_ingest->largest_seqno);
@@ -1052,6 +1107,9 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo(
     status = table_reader->VerifyChecksum(
         ro, TableReaderCaller::kExternalSSTIngestion);
     if (!status.ok()) {
+      ROCKS_LOG_WARN(db_options_.info_log,
+                     "Failed to verify checksum for table reader: %s",
+                     status.ToString().c_str());
       return status;
     }
   }
@@ -1243,6 +1301,9 @@ Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile(
           ro, env_options_, file_to_ingest->start_ukey,
           file_to_ingest->limit_ukey, lvl, &overlap_with_level);
       if (!status.ok()) {
+        ROCKS_LOG_WARN(db_options_.info_log,
+                       "Failed to check overlap with level iterator: %s",
+                       status.ToString().c_str());
         return status;
       }
       if (overlap_with_level) {
@@ -1355,6 +1416,14 @@ Status ExternalSstFileIngestionJob::AssignGlobalSeqnoForIngestedFile(
       PutFixed64(&seqno_val, seqno);
       status = fsptr->Write(file_to_ingest->global_seqno_offset, seqno_val,
                             IOOptions(), nullptr);
+      if (!status.ok()) {
+        ROCKS_LOG_WARN(db_options_.info_log,
+                       "Failed to write global seqno to %s: %s",
+                       file_to_ingest->internal_file_path.c_str(),
+                       status.ToString().c_str());
+        return status;
+      }
+
       if (status.ok()) {
         TEST_SYNC_POINT("ExternalSstFileIngestionJob::BeforeSyncGlobalSeqno");
         status = SyncIngestedFile(fsptr.get());
@@ -1371,6 +1440,11 @@ Status ExternalSstFileIngestionJob::AssignGlobalSeqnoForIngestedFile(
         return status;
       }
     } else if (!status.IsNotSupported()) {
+      ROCKS_LOG_WARN(
+          db_options_.info_log,
+          "Failed to open ingested file %s for random read/write: %s",
+          file_to_ingest->internal_file_path.c_str(),
+          status.ToString().c_str());
       return status;
     }
   }
@@ -1403,6 +1477,9 @@ IOStatus ExternalSstFileIngestionJob::GenerateChecksumForIngestedFile(
       db_options_.allow_mmap_reads, io_tracer_, db_options_.rate_limiter.get(),
       ro, db_options_.stats, db_options_.clock);
   if (!io_s.ok()) {
+    ROCKS_LOG_WARN(
+        db_options_.info_log, "Failed to generate checksum for %s: %s",
+        file_to_ingest->internal_file_path.c_str(), io_s.ToString().c_str());
     return io_s;
   }
   file_to_ingest->file_checksum = std::move(file_checksum);

From 5a498bf688ddad1f04de657e92ee31ccde923c17 Mon Sep 17 00:00:00 2001
From: Jay Huh <jewoongh@meta.com>
Date: Mon, 8 Sep 2025 11:30:42 -0700
Subject: [PATCH 264/500] Disable Remote Compaction In Stress Test (#13925)

Summary:
After running stress test over a week, we've identified more failures to fix. While we work on the fix, disable the remote compaction temporarily to reduce noise and avoid these failures hiding other failures.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13925

Test Plan: CI

Reviewed By: anand1976

Differential Revision: D81934248

Pulled By: jaykorean

fbshipit-source-id: 9ac11926429eebe1aebf7b520a548dc5987b7d76
---
 tools/db_crashtest.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index 311a8aee2cb2..c1e904f8128b 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -367,7 +367,8 @@ def setup_random_seed_before_main():
     "allow_unprepared_value": lambda: random.choice([0, 1]),
     # TODO(hx235): enable `track_and_verify_wals` after stabalizing the stress test
     "track_and_verify_wals": lambda: random.choice([0]),
-    "remote_compaction_worker_threads": lambda: random.choice([0, 8]),
+    # TODO(jaykorean): re-enable remote compaction worker threads after addressing all issues
+    "remote_compaction_worker_threads": 0,
     "auto_refresh_iterator_with_snapshot": lambda: random.choice([0, 1]),
     "memtable_op_scan_flush_trigger": lambda: random.choice([0, 10, 100, 1000]),
     "memtable_avg_op_scan_flush_trigger": lambda: random.choice([0, 2, 20, 200]),

From 6b02f137a42ce1f2d83ab2a52403bed337823c0c Mon Sep 17 00:00:00 2001
From: Hui Xiao <huixiao@fb.com>
Date: Mon, 8 Sep 2025 13:03:42 -0700
Subject: [PATCH 265/500] Turn on stats collection in crash test (#13926)

Summary:
**Context/Summary:** it's for formal testing to cover statistics in our stress test

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13926

Reviewed By: anand1976, jaykorean

Differential Revision: D81943762

Pulled By: hx235

fbshipit-source-id: 4186be0b35839976b7299667492d0cc722128a06
---
 tools/db_crashtest.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index c1e904f8128b..c043243f434d 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -384,6 +384,8 @@ def setup_random_seed_before_main():
     # fixed within a run for easier debugging
     # actual frequency is lower after option sanitization
     "use_multiscan": random.choice([1] + [0] * 3),
+    # By default, `statistics` use kExceptDetailedTimers level
+    "statistics": random.choice([0, 1]),
 }
 
 _TEST_DIR_ENV_VAR = "TEST_TMPDIR"

From 86bb0c0d1b41a5fdf48b085884ddf238135daf0a Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Mon, 8 Sep 2025 13:11:28 -0700
Subject: [PATCH 266/500] Use C++20 in public API, fix CI (#13915)

Summary:
A follow-up to https://github.com/facebook/rocksdb/issues/13904 which was incomplete in updating CI jobs to support C++20 because the C++20 usage was only in tests. Here we add subtle C++20 usage in the public API ("using enum" feature in db.h) to force the issue.

A lot of the work for this PR was in updating the Ubuntu22 docker image, for earlier compiler/runtime versions supporting C++20, and generating a new Ubuntu24 docker image, for later compiler/runtime versions. The Ubuntu22 image needed to be updated because there are incompatibilities with clang-13 + c++20 + libstdc++ for gcc 11, seen on these examples

```
#include <chrono>

int main(int argc, char *argv[]) {
  std::chrono::microseconds d = {}; return 0;
}
```

and

```
#include <coroutine>

int main() { return 0; }
```

The second was causing recurring failures in build-linux-clang-13-asan-ubsan-with-folly, now fixed.

So we have to install clang's libc++ to compile with clang-13. I haven't been able to get this to work with some of the libraries like benchmark, glog, and/or gflags, but I'm able to compile core RocksDB with clang-13. On this docker image, an extra compiler parameter is needed to compile with gcc and glog because it's built from source perhaps not perfectly, because the ubuntu package transitively conflicts with libc++.

The Ubuntu24 image seems to be low-drama and generally work for testing out newer compiler versions. The mingw build uses Ubuntu24 because the mingw package on Ubuntu22 uses a gcc version that is too old.

And the mass of other code changes are trying to work around new warnings, mostly from clang-analyze, which I upgraded to clang-18 in CI.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13915

Test Plan: CI, including temporarily including the nightly jobs in the PR jobs in earlier revisions to test and stabilize

Reviewed By: archang19

Differential Revision: D81933067

Pulled By: pdillinger

fbshipit-source-id: 7e33823006a79d5f3cf5bc1d625f0a3c08a7d74c
---
 .github/workflows/nightly.yml                 | 24 +++---
 .github/workflows/pr-jobs.yml                 | 75 ++++++++++---------
 Makefile                                      |  6 +-
 build_tools/ubuntu22_image/Dockerfile         | 18 +++--
 build_tools/ubuntu24_image/Dockerfile         | 72 ++++++++++++++++++
 cache/lru_cache_test.cc                       |  2 +-
 cache/secondary_cache_adapter.cc              |  2 +-
 db/db_test_util.cc                            |  4 +-
 db/merge_operator.cc                          |  2 +-
 db_stress_tool/db_stress_test_base.cc         |  3 +-
 env/file_system_tracer.cc                     |  2 +
 include/rocksdb/db.h                          |  9 +--
 table/multiget_context.h                      |  4 +-
 tools/sst_dump_test.cc                        | 49 ++++++------
 utilities/persistent_cache/hash_table_test.cc |  2 +
 15 files changed, 183 insertions(+), 91 deletions(-)
 create mode 100644 build_tools/ubuntu24_image/Dockerfile

diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml
index 937e6683720d..4ea230899737 100644
--- a/.github/workflows/nightly.yml
+++ b/.github/workflows/nightly.yml
@@ -10,7 +10,7 @@ jobs:
     runs-on:
       labels: 16-core-ubuntu
     container:
-      image: ghcr.io/facebook/rocksdb_ubuntu:22.0
+      image: ghcr.io/facebook/rocksdb_ubuntu:22.1
       options: --shm-size=16gb
     steps:
     - uses: actions/checkout@v4.1.0
@@ -32,7 +32,7 @@ jobs:
     runs-on:
       labels: 16-core-ubuntu
     container:
-      image: ghcr.io/facebook/rocksdb_ubuntu:22.0
+      image: ghcr.io/facebook/rocksdb_ubuntu:22.1
       options: --shm-size=16gb
     env:
       TEST_TMPDIR: "/tmp/rocksdb_test_tmp"
@@ -41,16 +41,16 @@ jobs:
     - uses: "./.github/actions/pre-steps"
     - run: make V=1 -j32 check
     - uses: "./.github/actions/post-steps"
-  build-linux-clang-13-asan-ubsan-with-folly:
+  build-linux-clang-18-asan-ubsan-with-folly:
     if: ${{ github.repository_owner == 'facebook' }}
     runs-on:
       labels: 16-core-ubuntu
     container:
-      image: zjay437/rocksdb:0.6
+      image: ghcr.io/facebook/rocksdb_ubuntu:24.0
       options: --shm-size=16gb
     env:
-      CC: clang-13
-      CXX: clang++-13
+      CC: clang-18
+      CXX: clang++-18
     steps:
     - uses: actions/checkout@v4.1.0
     - uses: "./.github/actions/pre-steps"
@@ -63,7 +63,7 @@ jobs:
     runs-on:
       labels: 16-core-ubuntu
     container:
-      image: ghcr.io/facebook/rocksdb_ubuntu:22.0
+      image: ghcr.io/facebook/rocksdb_ubuntu:22.1
       options: --shm-size=16gb
     steps:
     - uses: actions/checkout@v4.1.0
@@ -94,7 +94,7 @@ jobs:
     runs-on:
       labels: 4-core-ubuntu
     container:
-      image: ghcr.io/facebook/rocksdb_ubuntu:22.0
+      image: ghcr.io/facebook/rocksdb_ubuntu:22.1
       options: --shm-size=16gb
     steps:
     - uses: actions/checkout@v4.1.0
@@ -107,13 +107,13 @@ jobs:
     runs-on:
       labels: 4-core-ubuntu
     container:
-      image: zjay437/rocksdb:0.6
+      image: ghcr.io/facebook/rocksdb_ubuntu:24.0
       options: --shm-size=16gb
     steps:
     - uses: actions/checkout@v4.1.0
     - uses: "./.github/actions/pre-steps"
     - name: Build rocksdb lib
-      run: CC=clang-13 CXX=clang++-13 USE_CLANG=1 make -j4 static_lib
+      run: CC=clang-18 CXX=clang++-18 USE_CLANG=1 make -j4 static_lib
     - name: Build fuzzers
       run: cd fuzz && make sst_file_writer_fuzzer db_fuzzer db_map_fuzzer
     - uses: "./.github/actions/post-steps"
@@ -122,11 +122,11 @@ jobs:
     runs-on:
       labels: 16-core-ubuntu
     container:
-      image: ghcr.io/facebook/rocksdb_ubuntu:22.0
+      image: ghcr.io/facebook/rocksdb_ubuntu:22.1
       options: --shm-size=16gb
     steps:
     - uses: actions/checkout@v4.1.0
     - uses: "./.github/actions/pre-steps"
     - uses: "./.github/actions/setup-folly"
-    - run: "(mkdir build && cd build && cmake -DUSE_FOLLY_LITE=1 -DWITH_GFLAGS=1 .. && make V=1 -j20)"
+    - run: "(mkdir build && cd build && cmake -DUSE_FOLLY_LITE=1 -DWITH_GFLAGS=1 -DCMAKE_CXX_FLAGS=-DGLOG_USE_GLOG_EXPORT .. && make V=1 -j20)"
     - uses: "./.github/actions/post-steps"
diff --git a/.github/workflows/pr-jobs.yml b/.github/workflows/pr-jobs.yml
index a828991ca9ce..8f8da7b9d724 100644
--- a/.github/workflows/pr-jobs.yml
+++ b/.github/workflows/pr-jobs.yml
@@ -66,7 +66,7 @@ jobs:
     runs-on:
       labels: 16-core-ubuntu
     container:
-      image: ghcr.io/facebook/rocksdb_ubuntu:22.0
+      image: ghcr.io/facebook/rocksdb_ubuntu:22.1
       options: --shm-size=16gb
     steps:
     - uses: actions/checkout@v4.1.0
@@ -78,7 +78,7 @@ jobs:
     runs-on:
       labels: 4-core-ubuntu
     container:
-      image: ghcr.io/facebook/rocksdb_ubuntu:22.0
+      image: ghcr.io/facebook/rocksdb_ubuntu:24.0
       options: --shm-size=16gb
     steps:
     - uses: actions/checkout@v4.1.0
@@ -97,7 +97,7 @@ jobs:
     runs-on:
       labels: 16-core-ubuntu
     container:
-      image: ghcr.io/facebook/rocksdb_ubuntu:22.0
+      image: ghcr.io/facebook/rocksdb_ubuntu:22.1
       options: --shm-size=16gb
     steps:
     - uses: actions/checkout@v4.1.0
@@ -111,7 +111,7 @@ jobs:
     runs-on:
       labels: 16-core-ubuntu
     container:
-      image: ghcr.io/facebook/rocksdb_ubuntu:22.0
+      image: ghcr.io/facebook/rocksdb_ubuntu:22.1
       options: --shm-size=16gb
     steps:
     - uses: actions/checkout@v4.1.0
@@ -125,20 +125,20 @@ jobs:
     runs-on:
       labels: 16-core-ubuntu
     container:
-      image: ghcr.io/facebook/rocksdb_ubuntu:22.0
+      image: ghcr.io/facebook/rocksdb_ubuntu:22.1
       options: --shm-size=16gb
     steps:
     - uses: actions/checkout@v4.1.0
     - uses: "./.github/actions/pre-steps"
     - uses: "./.github/actions/setup-folly"
-    - run: USE_FOLLY_LITE=1 V=1 make -j32 all
+    - run: USE_FOLLY_LITE=1 EXTRA_CXXFLAGS=-DGLOG_USE_GLOG_EXPORT V=1 make -j32 all
     - uses: "./.github/actions/post-steps"
   build-linux-cmake-with-folly-coroutines:
     if: ${{ github.repository_owner == 'facebook' }}
     runs-on:
       labels: 16-core-ubuntu
     container:
-      image: ghcr.io/facebook/rocksdb_ubuntu:22.0
+      image: ghcr.io/facebook/rocksdb_ubuntu:22.1
       options: --shm-size=16gb
     steps:
     - uses: actions/checkout@v4.1.0
@@ -152,7 +152,7 @@ jobs:
     runs-on:
       labels: 16-core-ubuntu
     container:
-      image: ghcr.io/facebook/rocksdb_ubuntu:22.0
+      image: ghcr.io/facebook/rocksdb_ubuntu:22.1
       options: --shm-size=16gb
     steps:
     - uses: actions/checkout@v4.1.0
@@ -164,7 +164,7 @@ jobs:
     runs-on:
       labels: 16-core-ubuntu
     container:
-      image: ghcr.io/facebook/rocksdb_ubuntu:22.0
+      image: ghcr.io/facebook/rocksdb_ubuntu:22.1
       options: --shm-size=16gb
     steps:
     - uses: actions/checkout@v4.1.0
@@ -178,7 +178,7 @@ jobs:
     runs-on:
       labels: 16-core-ubuntu
     container:
-      image: ghcr.io/facebook/rocksdb_ubuntu:22.0
+      image: ghcr.io/facebook/rocksdb_ubuntu:22.1
       options: --shm-size=16gb
     steps:
     - uses: actions/checkout@v4.1.0
@@ -199,56 +199,61 @@ jobs:
     - run: ls librocksdb.a
     - run: if ./trace_analyzer --version; then false; else true; fi
     - uses: "./.github/actions/post-steps"
-  build-linux-clang-no_test_run:
+  build-linux-clang-13-no_test_run:
     if: ${{ github.repository_owner == 'facebook' }}
     runs-on:
       labels: 8-core-ubuntu
     container:
-      image: zjay437/rocksdb:0.6
+      image: ghcr.io/facebook/rocksdb_ubuntu:22.1
       options: --shm-size=16gb
     steps:
     - uses: actions/checkout@v4.1.0
-    - run: CC=clang CXX=clang++ USE_CLANG=1 PORTABLE=1 make V=1 -j16 all
+    - uses: "./.github/actions/pre-steps"
+    # FIXME: get back to "all microbench" targets
+    - run: CC=clang-13 CXX=clang++-13 USE_CLANG=1 EXTRA_CXXFLAGS=-stdlib=libc++ EXTRA_LDFLAGS=-stdlib=libc++ make -j32 shared_lib
+    - run: make clean
+    # FIXME: get back to "release" target
+    - run: CC=clang-13 CXX=clang++-13 USE_CLANG=1 EXTRA_CXXFLAGS=-stdlib=libc++ EXTRA_LDFLAGS=-stdlib=libc++ DEBUG_LEVEL=0 make -j32 shared_lib
     - uses: "./.github/actions/post-steps"
-  build-linux-clang-13-no_test_run:
+  build-linux-clang-18-no_test_run:
     if: ${{ github.repository_owner == 'facebook' }}
     runs-on:
       labels: 16-core-ubuntu
     container:
-      image: zjay437/rocksdb:0.6
+      image: ghcr.io/facebook/rocksdb_ubuntu:24.0
       options: --shm-size=16gb
     steps:
     - uses: actions/checkout@v4.1.0
     - uses: "./.github/actions/pre-steps"
-    - run: CC=clang-13 CXX=clang++-13 USE_CLANG=1 make -j32 all microbench
+    - run: CC=clang-18 CXX=clang++-18 USE_CLANG=1 make -j32 all microbench
     - run: make clean
-    - run: CC=clang-13 CXX=clang++-13 USE_CLANG=1 DEBUG_LEVEL=0 make -j32 release
+    - run: CC=clang-18 CXX=clang++-18 USE_CLANG=1 DEBUG_LEVEL=0 make -j32 release
     - uses: "./.github/actions/post-steps"
-  build-linux-gcc-13-no_test_run:
+  build-linux-gcc-14-no_test_run:
     if: ${{ github.repository_owner == 'facebook' }}
     runs-on:
       labels: 16-core-ubuntu
     container:
-      image: ghcr.io/facebook/rocksdb_ubuntu:22.0
+      image: ghcr.io/facebook/rocksdb_ubuntu:24.0
       options: --shm-size=16gb
     steps:
     - uses: actions/checkout@v4.1.0
     - uses: "./.github/actions/pre-steps"
-    - run: CC=gcc-13 CXX=g++-13 V=1 make -j32 all microbench
+    - run: CC=gcc-14 CXX=g++-14 V=1 make -j32 all microbench
     - uses: "./.github/actions/post-steps"
 
   # ======================== Linux Other Checks ======================= #
-  build-linux-clang10-clang-analyze:
+  build-linux-clang18-clang-analyze:
     if: ${{ github.repository_owner == 'facebook' }}
     runs-on:
       labels: 16-core-ubuntu
     container:
-      image: zjay437/rocksdb:0.6
+      image: ghcr.io/facebook/rocksdb_ubuntu:24.0
       options: --shm-size=16gb
     steps:
     - uses: actions/checkout@v4.1.0
     - uses: "./.github/actions/pre-steps"
-    - run: CC=clang-10 CXX=clang++-10 ROCKSDB_DISABLE_ALIGNED_NEW=1 CLANG_ANALYZER="/usr/bin/clang++-10" CLANG_SCAN_BUILD=scan-build-10 USE_CLANG=1 make V=1 -j32 analyze
+    - run: CC=clang-18 CXX=clang++-18 ROCKSDB_DISABLE_ALIGNED_NEW=1 CLANG_ANALYZER="/usr/bin/clang++-18" CLANG_SCAN_BUILD=scan-build-18 USE_CLANG=1 make V=1 -j32 analyze
     - uses: "./.github/actions/post-steps"
     - name: compress test report
       run: tar -cvzf scan_build_report.tar.gz scan_build_report
@@ -276,7 +281,7 @@ jobs:
     runs-on:
       labels: 4-core-ubuntu
     container:
-      image: ghcr.io/facebook/rocksdb_ubuntu:22.0
+      image: ghcr.io/facebook/rocksdb_ubuntu:22.1
       options: --shm-size=16gb
     steps:
     - uses: actions/checkout@v4.1.0
@@ -284,36 +289,36 @@ jobs:
     - run: ulimit -S -n `ulimit -H -n` && make V=1 -j8 CRASH_TEST_EXT_ARGS='--duration=960 --max_key=2500000' blackbox_crash_test_with_atomic_flush
     - uses: "./.github/actions/post-steps"
   # ======================= Linux with Sanitizers ===================== #
-  build-linux-clang10-asan-ubsan:
+  build-linux-clang18-asan-ubsan:
     if: ${{ github.repository_owner == 'facebook' }}
     runs-on:
       labels: 32-core-ubuntu
     container:
-      image: zjay437/rocksdb:0.6
+      image: ghcr.io/facebook/rocksdb_ubuntu:24.0
       options: --shm-size=16gb
     steps:
     - uses: actions/checkout@v4.1.0
     - uses: "./.github/actions/pre-steps"
-    - run: COMPILE_WITH_ASAN=1 COMPILE_WITH_UBSAN=1 CC=clang-10 CXX=clang++-10 ROCKSDB_DISABLE_ALIGNED_NEW=1 USE_CLANG=1 make V=1 -j40 check
+    - run: COMPILE_WITH_ASAN=1 COMPILE_WITH_UBSAN=1 CC=clang-18 CXX=clang++-18 ROCKSDB_DISABLE_ALIGNED_NEW=1 USE_CLANG=1 make V=1 -j40 check
     - uses: "./.github/actions/post-steps"
-  build-linux-clang13-mini-tsan:
+  build-linux-clang18-mini-tsan:
     if: ${{ github.repository_owner == 'facebook' }}
     runs-on:
       labels: 32-core-ubuntu
     container:
-      image: zjay437/rocksdb:0.6
+      image: ghcr.io/facebook/rocksdb_ubuntu:24.0
       options: --shm-size=16gb
     steps:
     - uses: actions/checkout@v4.1.0
     - uses: "./.github/actions/pre-steps"
-    - run: COMPILE_WITH_TSAN=1 CC=clang-13 CXX=clang++-13 ROCKSDB_DISABLE_ALIGNED_NEW=1 USE_CLANG=1 make V=1 -j32 check
+    - run: COMPILE_WITH_TSAN=1 CC=clang-18 CXX=clang++-18 ROCKSDB_DISABLE_ALIGNED_NEW=1 USE_CLANG=1 make V=1 -j32 check
     - uses: "./.github/actions/post-steps"
   build-linux-static_lib-alt_namespace-status_checked:
     if: ${{ github.repository_owner == 'facebook' }}
     runs-on:
       labels: 16-core-ubuntu
     container:
-      image: ghcr.io/facebook/rocksdb_ubuntu:22.0
+      image: ghcr.io/facebook/rocksdb_ubuntu:22.1
       options: --shm-size=16gb
     steps:
     - uses: actions/checkout@v4.1.0
@@ -380,12 +385,12 @@ jobs:
     runs-on:
       labels: 4-core-ubuntu
     container:
-      image: ghcr.io/facebook/rocksdb_ubuntu:22.0
+      image: ghcr.io/facebook/rocksdb_ubuntu:22.1
       options: --shm-size=16gb
     steps:
     # The docker image is intentionally based on an OS that has an older GLIBC version.
     # That GLIBC is incompatibile with GitHub's actions/checkout. Thus we implement a manual checkout step.
-    # NOTE: replaced evolvedbinary/rocksjava:centos7_x64-be with ghcr.io/facebook/rocksdb_ubuntu:22.0
+    # NOTE: replaced evolvedbinary/rocksjava:centos7_x64-be with ghcr.io/facebook/rocksdb_ubuntu:22.1
     # until a more appropriate docker image with C++20 support is made.
     - name: Checkout
       env:
@@ -411,12 +416,12 @@ jobs:
     runs-on:
       labels: 4-core-ubuntu
     container:
-      image: ghcr.io/facebook/rocksdb_ubuntu:22.0
+      image: ghcr.io/facebook/rocksdb_ubuntu:22.1
       options: --shm-size=16gb
     steps:
     # The docker image is intentionally based on an OS that has an older GLIBC version.
     # That GLIBC is incompatibile with GitHub's actions/checkout. Thus we implement a manual checkout step.
-    # NOTE: replaced evolvedbinary/rocksjava:centos7_x64-be with ghcr.io/facebook/rocksdb_ubuntu:22.0
+    # NOTE: replaced evolvedbinary/rocksjava:centos7_x64-be with ghcr.io/facebook/rocksdb_ubuntu:22.1
     # until a more appropriate docker image with C++20 support is made.
     - name: Checkout
       env:
diff --git a/Makefile b/Makefile
index e0ec77115326..8df7a2621333 100644
--- a/Makefile
+++ b/Makefile
@@ -1423,13 +1423,13 @@ agg_merge_test: $(OBJ_DIR)/utilities/agg_merge/agg_merge_test.o $(TEST_LIBRARY)
 stringappend_test: $(OBJ_DIR)/utilities/merge_operators/string_append/stringappend_test.o $(TEST_LIBRARY) $(LIBRARY)
 	$(AM_LINK)
 
-cassandra_format_test: $(OBJ_DIR)/utilities/cassandra/cassandra_format_test.o $(OBJ_DIR)/utilities/cassandra/test_utils.o $(TEST_LIBRARY) $(LIBRARY)
+cassandra_format_test: $(OBJ_DIR)/utilities/cassandra/cassandra_format_test.o $(TEST_LIBRARY) $(LIBRARY)
 	$(AM_LINK)
 
-cassandra_functional_test: $(OBJ_DIR)/utilities/cassandra/cassandra_functional_test.o $(OBJ_DIR)/utilities/cassandra/test_utils.o $(TEST_LIBRARY) $(LIBRARY)
+cassandra_functional_test: $(OBJ_DIR)/utilities/cassandra/cassandra_functional_test.o $(TEST_LIBRARY) $(LIBRARY)
 	$(AM_LINK)
 
-cassandra_row_merge_test: $(OBJ_DIR)/utilities/cassandra/cassandra_row_merge_test.o $(OBJ_DIR)/utilities/cassandra/test_utils.o $(TEST_LIBRARY) $(LIBRARY)
+cassandra_row_merge_test: $(OBJ_DIR)/utilities/cassandra/cassandra_row_merge_test.o $(TEST_LIBRARY) $(LIBRARY)
 	$(AM_LINK)
 
 cassandra_serialize_test: $(OBJ_DIR)/utilities/cassandra/cassandra_serialize_test.o $(TEST_LIBRARY) $(LIBRARY)
diff --git a/build_tools/ubuntu22_image/Dockerfile b/build_tools/ubuntu22_image/Dockerfile
index 353b0651fabd..cb627f33daa7 100644
--- a/build_tools/ubuntu22_image/Dockerfile
+++ b/build_tools/ubuntu22_image/Dockerfile
@@ -1,5 +1,5 @@
 # INSTRUCTIONS:
-# I was not about to build docker images on an isolated devserver because of
+# I was not able to build docker images on an isolated devserver because of
 # issues with proxy internet access. Use a public cloud or other Linux system.
 # (I used a Debian system after installing docker features, adding my user to
 # the docker and docker-registry groups, and logging out and back in to pick
@@ -47,6 +47,13 @@ WORKDIR /root
 RUN wget https://apt.llvm.org/llvm.sh
 RUN chmod +x llvm.sh
 RUN ./llvm.sh 13 all
+# There are incompatibilities between clang with -std=c++20 and libstdc++
+# provided by gcc, so we have to compile with clang-13 using -stdlib=libc++
+# and only one version of libc++ can be installed on the system at one time.
+# So to avoid confusion we remove unusable clang-14 also.
+RUN apt-get install libc++-13-dev libc++abi-13-dev
+RUN apt-get purge -y clang-14 && apt-get autoremove -y
+
 # install gcc-10 and more, default is 11
 RUN apt-get install -y gcc-10 g++-10
 RUN add-apt-repository -y ppa:ubuntu-toolchain-r/test
@@ -54,7 +61,10 @@ RUN apt-get install -y gcc-13 g++-13
 # install apt-get install -y valgrind
 RUN apt-get install -y valgrind
 # install folly depencencies
-RUN apt-get install -y libunwind-dev libgoogle-glog-dev
+# Missing compatible libunwind: RUN apt-get install -y libgoogle-glog-dev
+# So instead install from source. This currently requires compiling with
+# -DGLOG_USE_GLOG_EXPORT
+RUN wget https://github.com/google/glog/archive/refs/tags/v0.7.1.tar.gz && tar xzf v0.7.1.tar.gz && cd glog-0.7.1/ && cmake -S . -B build -G "Unix Makefiles" && cmake --build build && cmake --build build --target install && cd .. && rm -rf v0.7.1.tar.gz glog-0.7.1
 # install openjdk 8
 RUN apt-get install -y openjdk-8-jdk
 ENV JAVA_HOME /usr/lib/jvm/java-1.8.0-openjdk-amd64
@@ -72,9 +82,7 @@ ENV PKG_CONFIG_PATH /usr/local/OFF/:/root/libprotobuf-mutator/build/external.pro
 ENV PROTOC_BIN /root/libprotobuf-mutator/build/external.protobuf/bin/protoc
 
 # install the latest google benchmark
-RUN git clone --depth 1 --branch v1.7.0 https://github.com/google/benchmark.git ~/benchmark
-RUN cd ~/benchmark && mkdir build && cd build && cmake .. -GNinja -DCMAKE_BUILD_TYPE=Release -DBENCHMARK_ENABLE_GTEST_TESTS=0 && ninja && ninja install
+RUN git clone --depth 1 --branch v1.7.0 https://github.com/google/benchmark.git ~/benchmark && cd ~/benchmark && mkdir build && cd build && cmake .. -GNinja -DCMAKE_BUILD_TYPE=Release -DBENCHMARK_ENABLE_GTEST_TESTS=0 && ninja && ninja install && cd ~ && rm -rf /root/benchmark
 
 # clean up
 RUN rm -rf /var/lib/apt/lists/*
-RUN rm -rf /root/benchmark
diff --git a/build_tools/ubuntu24_image/Dockerfile b/build_tools/ubuntu24_image/Dockerfile
new file mode 100644
index 000000000000..0f7e98ca6e9f
--- /dev/null
+++ b/build_tools/ubuntu24_image/Dockerfile
@@ -0,0 +1,72 @@
+# INSTRUCTIONS:
+# I was not able to build docker images on an isolated devserver because of
+# issues with proxy internet access. Use a public cloud or other Linux system.
+# (I used a Debian system after installing docker features, adding my user to
+# the docker and docker-registry groups, and logging out and back in to pick
+# those up.)
+#
+# Follow https://docs.github.com/en/packages/working-with-a-github-packages-registry/working-with-the-container-registry#authenticating-with-a-personal-access-token-classic
+# to login with your GitHub credentials, as in
+#
+# $ docker login ghcr.io -u pdillinger
+#
+# and paste the limited-purpose GitHub token into the terminal.
+#
+# Then in the build_tools/ubuntu24_image directory, (bump minor version for
+# random docker file updates, major version tracks Ubuntu release)
+#
+# $ docker build -t ghcr.io/facebook/rocksdb_ubuntu:24.0
+# $ docker push ghcr.io/facebook/rocksdb_ubuntu:24.0
+#
+# Might need to change visibility to public through
+# https://github.com/orgs/facebook/packages/container/rocksdb_ubuntu/settings
+# or similar.
+
+# from official ubuntu 24.04
+FROM ubuntu:24.04
+# update system
+RUN apt-get update
+RUN apt-get upgrade -y
+# install basic tools
+RUN apt-get install -y vim wget curl
+# install tzdata noninteractive
+RUN DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt-get -y install tzdata
+# install git and default compilers
+RUN apt-get install -y git gcc g++ clang clang-tools
+# install basic package
+RUN apt-get install -y lsb-release software-properties-common gnupg
+# install gflags, tbb
+RUN apt-get install -y libgflags-dev libtbb-dev
+# install compression libs
+RUN apt-get install -y libsnappy-dev zlib1g-dev libbz2-dev liblz4-dev libzstd-dev
+# install cmake
+RUN apt-get install -y cmake
+RUN apt-get install -y libssl-dev
+
+# install gcc-12 and more, default is 13
+RUN apt-get install -y gcc-12 g++-12 gcc-14 g++-14
+# install apt-get install -y valgrind
+RUN apt-get install -y valgrind
+# install folly depencencies
+RUN apt-get install -y libgoogle-glog-dev
+# install openjdk 8
+RUN apt-get install -y openjdk-8-jdk
+ENV JAVA_HOME /usr/lib/jvm/java-1.8.0-openjdk-amd64
+# install mingw
+RUN apt-get install -y mingw-w64
+
+# install gtest-parallel package
+RUN git clone --single-branch --branch master --depth 1 https://github.com/google/gtest-parallel.git ~/gtest-parallel
+ENV PATH $PATH:/root/gtest-parallel
+
+# install libprotobuf for fuzzers test
+RUN apt-get install -y ninja-build binutils liblzma-dev libz-dev pkg-config autoconf libtool
+RUN git clone --branch v1.0 https://github.com/google/libprotobuf-mutator.git ~/libprotobuf-mutator && cd ~/libprotobuf-mutator && git checkout ffd86a32874e5c08a143019aad1aaf0907294c9f && mkdir build && cd build && cmake .. -GNinja -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release -DLIB_PROTO_MUTATOR_DOWNLOAD_PROTOBUF=ON && ninja && ninja install
+ENV PKG_CONFIG_PATH /usr/local/OFF/:/root/libprotobuf-mutator/build/external.protobuf/lib/pkgconfig/
+ENV PROTOC_BIN /root/libprotobuf-mutator/build/external.protobuf/bin/protoc
+
+# install the latest google benchmark
+RUN git clone --depth 1 --branch v1.7.0 https://github.com/google/benchmark.git ~/benchmark && cd ~/benchmark && mkdir build && cd build && cmake .. -GNinja -DCMAKE_BUILD_TYPE=Release -DBENCHMARK_ENABLE_GTEST_TESTS=0 && ninja && ninja install && cd ~ && rm -rf /root/benchmark
+
+# clean up
+RUN rm -rf /var/lib/apt/lists/*
diff --git a/cache/lru_cache_test.cc b/cache/lru_cache_test.cc
index 7a1f18ed6f53..486e595e12b4 100644
--- a/cache/lru_cache_test.cc
+++ b/cache/lru_cache_test.cc
@@ -1405,9 +1405,9 @@ TEST_P(BasicSecondaryCacheTest, SaveFailTest) {
   TestItem* item1 = new TestItem(str1.data(), str1.length());
   ASSERT_OK(cache->Insert(k1.AsSlice(), item1, GetHelperFail(), str1.length()));
   std::string str2 = rnd.RandomString(1020);
+  ASSERT_EQ(secondary_cache->num_inserts(), 0u);
   TestItem* item2 = new TestItem(str2.data(), str2.length());
   // k1 should be demoted to NVM
-  ASSERT_EQ(secondary_cache->num_inserts(), 0u);
   ASSERT_OK(cache->Insert(k2.AsSlice(), item2, GetHelperFail(), str2.length()));
   ASSERT_EQ(secondary_cache->num_inserts(), 1u);
 
diff --git a/cache/secondary_cache_adapter.cc b/cache/secondary_cache_adapter.cc
index 11a330284c90..2db601d2ecf8 100644
--- a/cache/secondary_cache_adapter.cc
+++ b/cache/secondary_cache_adapter.cc
@@ -584,7 +584,7 @@ Status CacheWithSecondaryAdapter::UpdateCacheReservationRatio(
   size_t pri_capacity = target_->GetCapacity();
   size_t sec_capacity =
       static_cast<size_t>(pri_capacity * compressed_secondary_ratio);
-  size_t old_sec_capacity;
+  size_t old_sec_capacity = 0;
   Status s = secondary_cache_->GetCapacity(old_sec_capacity);
   if (!s.ok()) {
     return s;
diff --git a/db/db_test_util.cc b/db/db_test_util.cc
index 0f839b77fc9f..018df7978cef 100644
--- a/db/db_test_util.cc
+++ b/db/db_test_util.cc
@@ -71,9 +71,9 @@ DBTestBase::DBTestBase(const std::string path, bool env_do_fsync)
   if (getenv("MEM_ENV")) {
     mem_env_ = MockEnv::Create(base_env, base_env->GetSystemClock());
   }
-  if (getenv("ENCRYPTED_ENV")) {
+  if (auto ee = getenv("ENCRYPTED_ENV")) {
     std::shared_ptr<EncryptionProvider> provider;
-    std::string provider_id = getenv("ENCRYPTED_ENV");
+    std::string provider_id = ee;
     if (provider_id.find('=') == std::string::npos &&
         !EndsWith(provider_id, "://test")) {
       provider_id = provider_id + "://test";
diff --git a/db/merge_operator.cc b/db/merge_operator.cc
index bb5dbbc36533..ef12f726d393 100644
--- a/db/merge_operator.cc
+++ b/db/merge_operator.cc
@@ -32,6 +32,7 @@ bool MergeOperator::FullMergeV3(const MergeOperationInputV3& merge_in,
                                 MergeOperationOutputV3* merge_out) const {
   assert(merge_out);
 
+  Slice value_of_default;  // avoid warning about in_v2 pointing at this
   MergeOperationInput in_v2(merge_in.key, nullptr, merge_in.operand_list,
                             merge_in.logger);
 
@@ -66,7 +67,6 @@ bool MergeOperator::FullMergeV3(const MergeOperationInputV3& merge_in,
             const bool has_default_column =
                 WideColumnsHelper::HasDefaultColumn(existing_columns);
 
-            Slice value_of_default;
             if (has_default_column) {
               value_of_default = existing_columns.front().value();
             }
diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc
index c5846f79b56f..c8ec2c430445 100644
--- a/db_stress_tool/db_stress_test_base.cc
+++ b/db_stress_tool/db_stress_test_base.cc
@@ -3516,8 +3516,9 @@ void StressTest::PrintEnv() const {
   fprintf(stdout, "Verification only         : %s\n",
           FLAGS_verification_only ? "true" : "false");
 
-  const char* memtablerep = "";
+  const char* memtablerep;
   switch (FLAGS_rep_factory) {
+    default:
     case kSkipList:
       memtablerep = "skip_list";
       break;
diff --git a/env/file_system_tracer.cc b/env/file_system_tracer.cc
index dc44107b58c9..46fe4ce7491b 100644
--- a/env/file_system_tracer.cc
+++ b/env/file_system_tracer.cc
@@ -355,9 +355,11 @@ IOStatus FSRandomAccessFileTracingWrapper::ReadAsync(
   IOStatus s = target()->ReadAsync(req, opts, read_async_callback,
                                    read_async_cb_info, io_handle, del_fn, dbg);
 
+#ifndef __clang_analyzer__
   if (!s.ok()) {
     delete read_async_cb_info;
   }
+#endif  // __clang_analyzer__
   return s;
 }
 
diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h
index fdc3e7a8f26a..4c7ff0f0585c 100644
--- a/include/rocksdb/db.h
+++ b/include/rocksdb/db.h
@@ -2247,12 +2247,9 @@ inline Status DB::GetApproximateSizes(ColumnFamilyHandle* column_family,
                                       uint64_t* sizes,
                                       SizeApproximationFlags include_flags) {
   SizeApproximationOptions options;
-  options.include_memtables =
-      ((include_flags & SizeApproximationFlags::INCLUDE_MEMTABLES) !=
-       SizeApproximationFlags::NONE);
-  options.include_files =
-      ((include_flags & SizeApproximationFlags::INCLUDE_FILES) !=
-       SizeApproximationFlags::NONE);
+  using enum SizeApproximationFlags;  // Require C++20 support
+  options.include_memtables = ((include_flags & INCLUDE_MEMTABLES) != NONE);
+  options.include_files = ((include_flags & INCLUDE_FILES) != NONE);
   return GetApproximateSizes(options, column_family, ranges, n, sizes);
 }
 
diff --git a/table/multiget_context.h b/table/multiget_context.h
index 52dcf1b174c4..c42b3b2c1869 100644
--- a/table/multiget_context.h
+++ b/table/multiget_context.h
@@ -129,7 +129,9 @@ class MultiGetContext {
       lookup_key_ptr_ = reinterpret_cast<LookupKey*>(lookup_key_heap_buf.get());
     }
 
-    for (size_t iter = 0; iter != num_keys_; ++iter) {
+    for (size_t iter = 0;
+         iter < num_keys_ && /* suppress a warning */ iter < MAX_BATCH_SIZE;
+         ++iter) {
       // autovector may not be contiguous storage, so make a copy
       sorted_keys_[iter] = (*sorted_keys)[begin + iter];
       sorted_keys_[iter]->lkey = new (&lookup_key_ptr_[iter])
diff --git a/tools/sst_dump_test.cc b/tools/sst_dump_test.cc
index 6df982b4f472..ef7005e1085e 100644
--- a/tools/sst_dump_test.cc
+++ b/tools/sst_dump_test.cc
@@ -98,6 +98,23 @@ class SSTDumpToolTest : public testing::Test {
     return path;
   }
 
+  // RAII class to ensure cleanup of usage array
+  template <std::size_t N>
+  struct CleanupUsage {
+    char* (&usage)[N];
+    explicit CleanupUsage(char* (&_usage)[N]) : usage(_usage) {}
+    // No copies/moves
+    CleanupUsage(const CleanupUsage&) = delete;
+    CleanupUsage& operator=(const CleanupUsage&) = delete;
+    CleanupUsage(CleanupUsage&&) = delete;
+    CleanupUsage& operator=(CleanupUsage&&) = delete;
+    ~CleanupUsage() {
+      for (std::size_t i = 0; i < N; ++i) {
+        delete[] usage[i];
+      }
+    }
+  };
+
   template <std::size_t N>
   void PopulateCommandArgs(const std::string& file_path, const char* command,
                            char* (&usage)[N]) const {
@@ -189,19 +206,16 @@ class SSTDumpToolTest : public testing::Test {
     createSST(opts, file_path, wide_column_one_in);
 
     char* usage[3];
+    auto cleanup_usage = CleanupUsage{usage};
     PopulateCommandArgs(file_path, cmd_arg, usage);
 
     ROCKSDB_NAMESPACE::SSTDumpTool tool;
     ASSERT_TRUE(!tool.Run(3, usage, opts));
 
     cleanup(opts, file_path);
-    for (int i = 0; i < 3; i++) {
-      delete[] usage[i];
-    }
   }
 };
 
-
 TEST_F(SSTDumpToolTest, HelpAndVersion) {
   Options opts;
   opts.env = env();
@@ -322,6 +336,7 @@ TEST_F(SSTDumpToolTest, CompressionManager) {
   createSST(opts, file_path, 10);
 
   char* usage[5];
+  auto cleanup_usage = CleanupUsage{usage};
   PopulateCommandArgs(file_path, "--command=recompress", usage);
   snprintf(usage[3], kOptLength, "--compression_manager=%s",
            MyManager::kCompatibilityName);
@@ -331,9 +346,6 @@ TEST_F(SSTDumpToolTest, CompressionManager) {
   ASSERT_TRUE(!tool.Run(5, usage, opts));
 
   cleanup(opts, file_path);
-  for (int i = 0; i < 5; i++) {
-    delete[] usage[i];
-  }
 }
 
 TEST_F(SSTDumpToolTest, MemEnv) {
@@ -344,15 +356,13 @@ TEST_F(SSTDumpToolTest, MemEnv) {
   createSST(opts, file_path);
 
   char* usage[3];
+  auto cleanup_usage = CleanupUsage{usage};
   PopulateCommandArgs(file_path, "--command=verify_checksum", usage);
 
   ROCKSDB_NAMESPACE::SSTDumpTool tool;
   ASSERT_TRUE(!tool.Run(3, usage, opts));
 
   cleanup(opts, file_path);
-  for (int i = 0; i < 3; i++) {
-    delete[] usage[i];
-  }
 }
 
 TEST_F(SSTDumpToolTest, ReadaheadSize) {
@@ -362,6 +372,7 @@ TEST_F(SSTDumpToolTest, ReadaheadSize) {
   createSST(opts, file_path);
 
   char* usage[4];
+  auto cleanup_usage = CleanupUsage{usage};
   PopulateCommandArgs(file_path, "--command=verify", usage);
   snprintf(usage[3], kOptLength, "--readahead_size=4000000");
 
@@ -382,16 +393,15 @@ TEST_F(SSTDumpToolTest, ReadaheadSize) {
   SyncPoint::GetInstance()->DisableProcessing();
 
   cleanup(opts, file_path);
-  for (int i = 0; i < 4; i++) {
-    delete[] usage[i];
-  }
 }
 
+#ifndef __clang_analyzer__  // False positive memory leaks reported
 TEST_F(SSTDumpToolTest, NoSstFile) {
   Options opts;
   opts.env = env();
   std::string file_path = MakeFilePath("no_such_file.sst");
   char* usage[3];
+  auto cleanup_usage = CleanupUsage{usage};
   PopulateCommandArgs(file_path, "", usage);
   ROCKSDB_NAMESPACE::SSTDumpTool tool;
   for (const auto& command :
@@ -401,15 +411,13 @@ TEST_F(SSTDumpToolTest, NoSstFile) {
     snprintf(usage[1], kOptLength, "%s", command);
     ASSERT_TRUE(tool.Run(3, usage, opts));
   }
-  for (int i = 0; i < 3; i++) {
-    delete[] usage[i];
-  }
 }
 
 TEST_F(SSTDumpToolTest, ValidSSTPath) {
   Options opts;
   opts.env = env();
   char* usage[3];
+  auto cleanup_usage = CleanupUsage{usage};
   PopulateCommandArgs("", "", usage);
   SSTDumpTool tool;
   std::string file_not_exists = MakeFilePath("file_not_exists.sst");
@@ -438,11 +446,8 @@ TEST_F(SSTDumpToolTest, ValidSSTPath) {
   ASSERT_OK(opts.env->DeleteFile(sst_file));
   ASSERT_OK(opts.env->DeleteFile(text_file));
   ASSERT_OK(opts.env->DeleteFile(fake_sst));
-
-  for (int i = 0; i < 3; i++) {
-    delete[] usage[i];
-  }
 }
+#endif  // __clang_analyzer__
 
 TEST_F(SSTDumpToolTest, RawOutput) {
   Options opts;
@@ -451,6 +456,7 @@ TEST_F(SSTDumpToolTest, RawOutput) {
   createSST(opts, file_path, 10);
 
   char* usage[3];
+  auto cleanup_usage = CleanupUsage{usage};
   PopulateCommandArgs(file_path, "--command=raw", usage);
 
   ROCKSDB_NAMESPACE::SSTDumpTool tool;
@@ -477,9 +483,6 @@ TEST_F(SSTDumpToolTest, RawOutput) {
   raw_file.close();
 
   cleanup(opts, file_path);
-  for (int i = 0; i < 3; i++) {
-    delete[] usage[i];
-  }
 }
 
 TEST_F(SSTDumpToolTest, SstFileDumperMmapReads) {
diff --git a/utilities/persistent_cache/hash_table_test.cc b/utilities/persistent_cache/hash_table_test.cc
index 7ae6a4a643dc..76c70813fb23 100644
--- a/utilities/persistent_cache/hash_table_test.cc
+++ b/utilities/persistent_cache/hash_table_test.cc
@@ -132,6 +132,7 @@ TEST_F(HashTableTest, TestErase) {
 }
 
 TEST_F(EvictableHashTableTest, TestEvict) {
+#ifndef __clang_analyzer__
   const uint64_t max_keys = 1024 * 1024;
 
   // insert
@@ -148,6 +149,7 @@ TEST_F(EvictableHashTableTest, TestEvict) {
     assert(val->val_ == std::string(1000, val->key_ % 255));
     delete val;
   }
+#endif  // __clang_analyzer__
 }
 
 }  // namespace ROCKSDB_NAMESPACE

From 1aca60c089a48857930b4191b0c84b6dd98a221c Mon Sep 17 00:00:00 2001
From: Xingbo Wang <xbw@fb.com>
Date: Mon, 8 Sep 2025 15:52:54 -0700
Subject: [PATCH 267/500] Improve efficiency in PointLockManager by using
 separate Condvar (#13731)

Summary:
PointLockManager manages point lock per key. The old implementation partition the per key lock into 16 stripes. Each stripe handles the point lock for a subset of keys. Each stripe have only one conditional variable. This conditional variable is used by all the transactions that are waiting for its turn to acquire a lock of a key that belongs to this stripe.

In production, we notified that when there are multiple transactions trying to write to the same key, all of them will wait on the same conditional variables. When the previous lock holder released the key, all of the transactions are woken up, but only one of them could proceed, and the rest goes back to sleep. This wasted a lot of CPU cycles. In addition, when there are other keys being locked/unlocked on the same lock stripe, the problem becomes even worse.

In order to solve this issue, we implemented a new PerKeyPointLockManager that keeps a transaction waiter queue at per key level. When a transaction could not acquire a lock immediately, it joins the waiter queue of the key and waits on a dedicated conditional variable. When previous lock holder released the lock, it wakes up the next set of transactions that are eligible to acquire the lock from the waiting queue. The queue respect FIFO order, except it prioritizes lock upgrade/downgrade operation.

However, this waiter queue change increases the deadlock detection cost, because the transaction waiting in the queue also needs to be considered during deadlock detection. To resolve this issue, a new deadlock_timeout_us (microseconds) configuration is introduced in transaction option. Essentially, when a transaction is waiting on a lock, it will join the wait queue and wait for the duration configured by deadlock_timeout_us without perform deadlock detection. If the transaction didn't get the lock after the deadlock_timeout_us timeout is reached, it will then perform deadlock detection and wait until lock_timeout is reached. This optimization takes the heuristic where majority of the transaction would be able to get the lock without perform deadlock detection.

The deadlock_timeout_us configuration needs to be tuned for different workload, if the likelihood of deadlock is very low, the deadlock_timeout_us could be configured close to a big higher than the average transaction execution time, so that majority of the transaction would be able to acquire the lock without performing deadlock detection. If the likelihood of deadlock is high, deadlock_timeout_us could be configured with lower value, so that deadlock would get detected faster.

The new PerKeyPointLockManager is disabled by default. It can be enabled by TransactionDBOptions.use_per_key_point_lock_mgr. The deadlock_timeout_us is only effective when PerKeyPointLockManager is used. When deadlock_timeout_us is set to 0, transaction will perform deadlock detection immediately before wait.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13731

Test Plan:
Unit test.
Stress unit test that validates deadlock detection and exclusive, shared lock guarantee.
A new point_lock_bench binary is created to help perform performance test.

Reviewed By: pdillinger

Differential Revision: D77353607

Pulled By: xingbowang

fbshipit-source-id: 21cf93354f9a367a78c8666596ed14013ac7240b
---
 BUCK                                          |   10 +
 CMakeLists.txt                                |    7 +
 Makefile                                      |    9 +-
 buckifier/buckify_rocksdb.py                  |   12 +
 db/c.cc                                       |    5 +
 db_stress_tool/db_stress_common.h             |    1 +
 db_stress_tool/db_stress_gflags.cc            |    4 +
 db_stress_tool/db_stress_test_base.cc         |    2 +
 include/rocksdb/point_lock_bench_tool.h       |   14 +
 include/rocksdb/status.h                      |    4 +
 include/rocksdb/utilities/transaction.h       |    7 +-
 include/rocksdb/utilities/transaction_db.h    |   21 +
 src.mk                                        |    6 +-
 tools/db_crashtest.py                         |    1 +
 .../improve_point_lock_manager_performance.md |    2 +
 utilities/transactions/lock/lock_manager.cc   |    7 +-
 .../lock/point/any_lock_manager_test.h        |  239 +++
 .../lock/point/point_lock_bench.cc            |   18 +
 .../lock/point/point_lock_bench_tool.cc       |  159 ++
 .../lock/point/point_lock_manager.cc          | 1402 +++++++++++++++--
 .../lock/point/point_lock_manager.h           |   78 +-
 .../point/point_lock_manager_stress_test.cc   |  103 ++
 .../lock/point/point_lock_manager_test.cc     | 1273 ++++++++++++++-
 .../lock/point/point_lock_manager_test.h      |  302 +---
 .../point/point_lock_manager_test_common.h    |   78 +
 .../point/point_lock_validation_test_runner.h |  466 ++++++
 .../lock/range/range_locking_test.cc          |    8 +-
 .../transactions/pessimistic_transaction.cc   |    4 +
 .../transactions/pessimistic_transaction.h    |   10 +-
 .../transactions/timestamped_snapshot_test.cc |   27 +-
 utilities/transactions/transaction_base.h     |    2 +
 .../transactions/transaction_db_mutex_impl.cc |    3 +-
 utilities/transactions/transaction_test.cc    |  116 +-
 utilities/transactions/transaction_test.h     |   97 +-
 .../write_committed_transaction_ts_test.cc    |   26 +-
 .../write_prepared_transaction_test.cc        |  255 +--
 .../write_unprepared_transaction_test.cc      |   39 +-
 37 files changed, 4148 insertions(+), 669 deletions(-)
 create mode 100644 include/rocksdb/point_lock_bench_tool.h
 create mode 100644 unreleased_history/performance_improvements/improve_point_lock_manager_performance.md
 create mode 100644 utilities/transactions/lock/point/any_lock_manager_test.h
 create mode 100644 utilities/transactions/lock/point/point_lock_bench.cc
 create mode 100644 utilities/transactions/lock/point/point_lock_bench_tool.cc
 create mode 100644 utilities/transactions/lock/point/point_lock_manager_stress_test.cc
 create mode 100644 utilities/transactions/lock/point/point_lock_manager_test_common.h
 create mode 100644 utilities/transactions/lock/point/point_lock_validation_test_runner.h

diff --git a/BUCK b/BUCK
index 565e6b831ad8..6e57e5cd1a7a 100644
--- a/BUCK
+++ b/BUCK
@@ -419,6 +419,8 @@ cpp_library_wrapper(name="rocksdb_tools_lib", srcs=[
 
 cpp_library_wrapper(name="rocksdb_cache_bench_tools_lib", srcs=["cache/cache_bench_tool.cc"], deps=[":rocksdb_lib"], headers=[], link_whole=False, extra_test_libs=False)
 
+cpp_library_wrapper(name="rocksdb_point_lock_bench_tools_lib", srcs=["utilities/transactions/lock/point/point_lock_bench_tool.cc"], deps=[":rocksdb_lib"], headers=[], link_whole=False, extra_test_libs=False)
+
 rocks_cpp_library_wrapper(name="rocksdb_stress_lib", srcs=[
         "db_stress_tool/batched_ops_stress.cc",
         "db_stress_tool/cf_consistency_stress.cc",
@@ -450,6 +452,8 @@ cpp_binary_wrapper(name="db_bench", srcs=["tools/db_bench.cc"], deps=[":rocksdb_
 
 cpp_binary_wrapper(name="cache_bench", srcs=["cache/cache_bench.cc"], deps=[":rocksdb_cache_bench_tools_lib"], extra_preprocessor_flags=[], extra_bench_libs=False)
 
+cpp_binary_wrapper(name="point_lock_bench", srcs=["utilities/transactions/lock/point/point_lock_bench.cc"], deps=[":rocksdb_point_lock_bench_tools_lib"], extra_preprocessor_flags=[], extra_bench_libs=False)
+
 cpp_binary_wrapper(name="ribbon_bench", srcs=["microbench/ribbon_bench.cc"], deps=[], extra_preprocessor_flags=[], extra_bench_libs=True)
 
 cpp_binary_wrapper(name="db_basic_bench", srcs=["microbench/db_basic_bench.cc"], deps=[], extra_preprocessor_flags=[], extra_bench_libs=True)
@@ -5381,6 +5385,12 @@ cpp_unittest_wrapper(name="plain_table_db_test",
             extra_compiler_flags=[])
 
 
+cpp_unittest_wrapper(name="point_lock_manager_stress_test",
+            srcs=["utilities/transactions/lock/point/point_lock_manager_stress_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
 cpp_unittest_wrapper(name="point_lock_manager_test",
             srcs=["utilities/transactions/lock/point/point_lock_manager_test.cc"],
             deps=[":rocksdb_test_lib"],
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 19a66a7b7791..dd602fdacff4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1503,6 +1503,7 @@ if(WITH_TESTS)
         utilities/transactions/optimistic_transaction_test.cc
         utilities/transactions/transaction_test.cc
         utilities/transactions/lock/point/point_lock_manager_test.cc
+        utilities/transactions/lock/point/point_lock_manager_stress_test.cc
         utilities/transactions/write_committed_transaction_ts_test.cc
         utilities/transactions/write_prepared_transaction_test.cc
         utilities/transactions/write_unprepared_transaction_test.cc
@@ -1613,6 +1614,12 @@ if(WITH_BENCHMARK_TOOLS)
     utilities/persistent_cache/hash_table_bench.cc)
   target_link_libraries(hash_table_bench${ARTIFACT_SUFFIX}
     ${ROCKSDB_LIB} ${GFLAGS_LIB} ${FOLLY_LIBS})
+
+  add_executable(point_lock_bench${ARTIFACT_SUFFIX}
+    utilities/transactions/lock/point/point_lock_bench.cc
+    utilities/transactions/lock/point/point_lock_bench_tool.cc)
+  target_link_libraries(point_lock_bench${ARTIFACT_SUFFIX}
+    ${ROCKSDB_LIB} ${GFLAGS_LIB} ${FOLLY_LIBS})
 endif()
 
 option(WITH_TRACE_TOOLS "build with trace tools" ON)
diff --git a/Makefile b/Makefile
index 8df7a2621333..7b019a389144 100644
--- a/Makefile
+++ b/Makefile
@@ -636,13 +636,14 @@ endif
 TEST_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(TEST_LIB_SOURCES) $(MOCK_LIB_SOURCES)) $(GTEST)
 BENCH_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(BENCH_LIB_SOURCES))
 CACHE_BENCH_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(CACHE_BENCH_LIB_SOURCES))
+POINT_LOCK_BENCH_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(POINT_LOCK_BENCH_LIB_SOURCES))
 TOOL_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(TOOL_LIB_SOURCES))
 ANALYZE_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(ANALYZER_LIB_SOURCES))
 STRESS_OBJECTS =  $(patsubst %.cc, $(OBJ_DIR)/%.o, $(STRESS_LIB_SOURCES))
 
 # Exclude build_version.cc -- a generated source file -- from all sources.  Not needed for dependencies
 ALL_SOURCES  = $(filter-out util/build_version.cc, $(LIB_SOURCES)) $(TEST_LIB_SOURCES) $(MOCK_LIB_SOURCES) $(GTEST_DIR)/gtest/gtest-all.cc
-ALL_SOURCES += $(TOOL_LIB_SOURCES) $(BENCH_LIB_SOURCES) $(CACHE_BENCH_LIB_SOURCES) $(ANALYZER_LIB_SOURCES) $(STRESS_LIB_SOURCES)
+ALL_SOURCES += $(TOOL_LIB_SOURCES) $(BENCH_LIB_SOURCES) $(CACHE_BENCH_LIB_SOURCES) $(POINT_LOCK_BENCH_LIB_SOURCES) $(ANALYZER_LIB_SOURCES) $(STRESS_LIB_SOURCES)
 ALL_SOURCES += $(TEST_MAIN_SOURCES) $(TOOL_MAIN_SOURCES) $(BENCH_MAIN_SOURCES)
 ALL_SOURCES += $(ROCKSDB_PLUGIN_SOURCES) $(ROCKSDB_PLUGIN_TESTS)
 
@@ -1343,6 +1344,9 @@ block_cache_trace_analyzer: $(OBJ_DIR)/tools/block_cache_analyzer/block_cache_tr
 cache_bench: $(OBJ_DIR)/cache/cache_bench.o $(CACHE_BENCH_OBJECTS) $(LIBRARY)
 	$(AM_LINK)
 
+point_lock_bench: $(OBJ_DIR)/utilities/transactions/lock/point/point_lock_bench.o $(POINT_LOCK_BENCH_OBJECTS) $(LIBRARY)
+	$(AM_LINK)
+
 persistent_cache_bench: $(OBJ_DIR)/utilities/persistent_cache/persistent_cache_bench.o $(LIBRARY)
 	$(AM_LINK)
 
@@ -1879,6 +1883,9 @@ heap_test: $(OBJ_DIR)/util/heap_test.o $(TEST_LIBRARY) $(LIBRARY)
 point_lock_manager_test: utilities/transactions/lock/point/point_lock_manager_test.o $(TEST_LIBRARY) $(LIBRARY)
 	$(AM_LINK)
 
+point_lock_manager_stress_test: utilities/transactions/lock/point/point_lock_manager_stress_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
 transaction_test: $(OBJ_DIR)/utilities/transactions/transaction_test.o $(TEST_LIBRARY) $(LIBRARY)
 	$(AM_LINK)
 
diff --git a/buckifier/buckify_rocksdb.py b/buckifier/buckify_rocksdb.py
index 035254b5ad1f..113d58e11655 100755
--- a/buckifier/buckify_rocksdb.py
+++ b/buckifier/buckify_rocksdb.py
@@ -206,6 +206,12 @@ def generate_buck(repo_path, deps_map):
         src_mk.get("CACHE_BENCH_LIB_SOURCES", []),
         [":rocksdb_lib"],
     )
+    # rocksdb_point_lock_bench_tools_lib
+    BUCK.add_library(
+        "rocksdb_point_lock_bench_tools_lib",
+        src_mk.get("POINT_LOCK_BENCH_LIB_SOURCES", []),
+        [":rocksdb_lib"],
+    )
     # rocksdb_stress_lib
     BUCK.add_rocksdb_library(
         "rocksdb_stress_lib",
@@ -229,6 +235,12 @@ def generate_buck(repo_path, deps_map):
     BUCK.add_binary(
         "cache_bench", ["cache/cache_bench.cc"], [":rocksdb_cache_bench_tools_lib"]
     )
+    # point_lock_bench binary
+    BUCK.add_binary(
+        "point_lock_bench",
+        ["utilities/transactions/lock/point/point_lock_bench.cc"],
+        [":rocksdb_point_lock_bench_tools_lib"]
+    )
     # bench binaries
     for src in src_mk.get("MICROBENCH_SOURCES", []):
         name = src.rsplit("/", 1)[1].split(".")[0] if "/" in src else src.split(".")[0]
diff --git a/db/c.cc b/db/c.cc
index dcc19bf333d1..1da15274efac 100644
--- a/db/c.cc
+++ b/db/c.cc
@@ -6433,6 +6433,11 @@ void rocksdb_transactiondb_options_set_default_lock_timeout(
   opt->rep.default_lock_timeout = default_lock_timeout;
 }
 
+void rocksdb_transactiondb_options_set_use_per_key_point_lock_mgr(
+    rocksdb_transactiondb_options_t* opt, int use_per_key_point_lock_mgr) {
+  opt->rep.use_per_key_point_lock_mgr = use_per_key_point_lock_mgr;
+}
+
 rocksdb_transaction_options_t* rocksdb_transaction_options_create() {
   return new rocksdb_transaction_options_t;
 }
diff --git a/db_stress_tool/db_stress_common.h b/db_stress_tool/db_stress_common.h
index 4b73733933fc..4fe0a3ffcfcf 100644
--- a/db_stress_tool/db_stress_common.h
+++ b/db_stress_tool/db_stress_common.h
@@ -284,6 +284,7 @@ DECLARE_bool(use_txn);
 // Options for TransactionDB (a.k.a. Pessimistic Transaction DB)
 DECLARE_uint64(txn_write_policy);
 DECLARE_bool(unordered_write);
+DECLARE_bool(use_per_key_point_lock_mgr);
 
 // Options for OptimisticTransactionDB
 DECLARE_bool(use_optimistic_txn);
diff --git a/db_stress_tool/db_stress_gflags.cc b/db_stress_tool/db_stress_gflags.cc
index d221c374286a..e3cb957a19e2 100644
--- a/db_stress_tool/db_stress_gflags.cc
+++ b/db_stress_tool/db_stress_gflags.cc
@@ -724,6 +724,10 @@ DEFINE_uint64(txn_write_policy, 0,
               "TxnDBWritePolicy::WRITE_COMMITTED. Note that this should not be "
               "changed across crashes.");
 
+DEFINE_bool(use_per_key_point_lock_mgr, true,
+            "Use PointLockManager(false) or PerKeyPointLockManager(true) in "
+            "TransactionDB.");
+
 DEFINE_bool(use_optimistic_txn, false, "Use OptimisticTransactionDB.");
 DEFINE_uint64(occ_validation_policy, 1,
               "Optimistic Concurrency Control Validation Policy for "
diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc
index c8ec2c430445..b24a95c72f37 100644
--- a/db_stress_tool/db_stress_test_base.cc
+++ b/db_stress_tool/db_stress_test_base.cc
@@ -3944,6 +3944,8 @@ void StressTest::Open(SharedState* shared, bool reopen) {
             static_cast<size_t>(FLAGS_wp_snapshot_cache_bits);
         txn_db_options.wp_commit_cache_bits =
             static_cast<size_t>(FLAGS_wp_commit_cache_bits);
+        txn_db_options.use_per_key_point_lock_mgr =
+            FLAGS_use_per_key_point_lock_mgr;
         PrepareTxnDbOptions(shared, txn_db_options);
         s = TransactionDB::Open(options_, txn_db_options, FLAGS_db,
                                 cf_descriptors, &column_families_, &txn_db_);
diff --git a/include/rocksdb/point_lock_bench_tool.h b/include/rocksdb/point_lock_bench_tool.h
new file mode 100644
index 000000000000..ed6066c43128
--- /dev/null
+++ b/include/rocksdb/point_lock_bench_tool.h
@@ -0,0 +1,14 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+int point_lock_bench_tool(int argc, char** argv);
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/include/rocksdb/status.h b/include/rocksdb/status.h
index 4dcd5eb3a026..afb9651faf27 100644
--- a/include/rocksdb/status.h
+++ b/include/rocksdb/status.h
@@ -116,6 +116,7 @@ class Status {
     kMergeOperatorFailed = 15,
     kMergeOperandThresholdExceeded = 16,
     kPrefetchLimitReached = 17,
+    kNotExpectedCodePath = 18,
     kMaxSubCode
   };
 
@@ -329,6 +330,9 @@ class Status {
     return code() == kOk;
   }
 
+  // Assert the status is OK in debug mode
+  void AssertOK() const { assert(ok()); }
+
   // Returns true iff the status indicates success *with* something
   // overwritten
   bool IsOkOverwritten() const {
diff --git a/include/rocksdb/utilities/transaction.h b/include/rocksdb/utilities/transaction.h
index 6c444ac26df5..51b4eb026211 100644
--- a/include/rocksdb/utilities/transaction.h
+++ b/include/rocksdb/utilities/transaction.h
@@ -653,7 +653,12 @@ class Transaction {
   // Change the value of TransactionOptions.lock_timeout (in milliseconds) for
   // this transaction.
   // Has no effect on OptimisticTransactions.
-  virtual void SetLockTimeout(int64_t timeout) = 0;
+  virtual void SetLockTimeout(int64_t timeout_ms) = 0;
+
+  // Change the value of deadlock_timeout (in milliseconds) for this
+  // transaction.
+  // Has no effect on OptimisticTransactions.
+  virtual void SetDeadlockTimeout(int64_t timeout_ms) = 0;
 
   // Return the WriteOptions that will be used during Commit()
   virtual WriteOptions* GetWriteOptions() = 0;
diff --git a/include/rocksdb/utilities/transaction_db.h b/include/rocksdb/utilities/transaction_db.h
index c5c10be0c8b5..e0af0caa0bd1 100644
--- a/include/rocksdb/utilities/transaction_db.h
+++ b/include/rocksdb/utilities/transaction_db.h
@@ -217,6 +217,11 @@ struct TransactionDBOptions {
   // Other value means the user provides a custom lock manager.
   std::shared_ptr<LockManagerHandle> lock_mgr_handle;
 
+  // EXPERIMENTAL
+  //
+  // Flag to enable/disable the per key point lock manager.
+  bool use_per_key_point_lock_mgr = false;
+
   // If true, the TransactionDB implementation might skip concurrency control
   // unless it is overridden by TransactionOptions or
   // TransactionDBWriteOptimizations. This can be used in conjunction with
@@ -319,6 +324,22 @@ struct TransactionOptions {
   // If negative, TransactionDBOptions::transaction_lock_timeout will be used.
   int64_t lock_timeout = -1;
 
+  // Timeout in microseconds before perform dead lock detection.
+  // If 0, deadlock detection will be performed immediately.
+  //
+  // To optimize performance, this parameter could be tuned.
+  //
+  // When deadlock happens very frequently, deadlock timeout should be set to 0,
+  // so deadlock will be detected immediately.
+  //
+  // When deadlock happen very rarely, this timeout could be turned to be
+  // slightly longer than the typical transaction execution time, so that
+  // transaction will be waked up to take the lock before this timeout, which
+  // will allow the transaction to save the CPU time on deadlock detection.
+  //
+  // Deadlock timeout is always smaller than lock_timeout.
+  int64_t deadlock_timeout_us = 500;
+
   // Expiration duration in milliseconds.  If non-negative, transactions that
   // last longer than this many milliseconds will fail to commit.  If not set,
   // a forgotten transaction that is never committed, rolled back, or deleted
diff --git a/src.mk b/src.mk
index 9d771f45a8e2..3954622ba350 100644
--- a/src.mk
+++ b/src.mk
@@ -386,9 +386,12 @@ BENCH_LIB_SOURCES =                                             \
   tools/tool_hooks.cc                                           \
   tools/simulated_hybrid_file_system.cc                         \
 
-CACHE_BENCH_LIB_SOURCES =					                              \
+CACHE_BENCH_LIB_SOURCES =                                       \
   cache/cache_bench_tool.cc                                     \
 
+POINT_LOCK_BENCH_LIB_SOURCES =                                  \
+  utilities/transactions/lock/point/point_lock_bench_tool.cc    \
+
 STRESS_LIB_SOURCES =                                           \
   db_stress_tool/batched_ops_stress.cc                         \
   db_stress_tool/cf_consistency_stress.cc                      \
@@ -651,6 +654,7 @@ TEST_MAIN_SOURCES =                                                     \
   utilities/transactions/lock/range/range_locking_test.cc               \
   utilities/transactions/transaction_test.cc                            \
   utilities/transactions/lock/point/point_lock_manager_test.cc          \
+  utilities/transactions/lock/point/point_lock_manager_stress_test.cc   \
   utilities/transactions/write_prepared_transaction_test.cc             \
   utilities/transactions/write_unprepared_transaction_test.cc           \
   utilities/transactions/write_committed_transaction_ts_test.cc         \
diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index c043243f434d..1c00ce60026e 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -567,6 +567,7 @@ def is_direct_io_supported(dbname):
     # NOTE: often passed in from command line overriding this
     "txn_write_policy": random.randint(0, 2),
     "unordered_write": random.randint(0, 1),
+    "use_per_key_point_lock_mgr": lambda: random.choice([0, 1]),
     # TODO: there is such a thing as transactions with WAL disabled. We should
     # cover that case.
     "disable_wal": 0,
diff --git a/unreleased_history/performance_improvements/improve_point_lock_manager_performance.md b/unreleased_history/performance_improvements/improve_point_lock_manager_performance.md
new file mode 100644
index 000000000000..7713818f38cd
--- /dev/null
+++ b/unreleased_history/performance_improvements/improve_point_lock_manager_performance.md
@@ -0,0 +1,2 @@
+Add a new experimental PerKeyPointLockManager to improve efficiency under high lock contention. PointLockManager was not efficient when there is high write contention on same key, as it uses a single conditional variable per lock stripe. PerKeyPointLockManager uses per thread conditional variable supporting fifo order. Although this is an experimental feature. By default, it is disabled. A new boolean flag TransactionDBOptions::use_per_key_point_lock_mgr is added to optionally enable it. Search the flag in code for more info.
+Together, a new configuration TransactionOptions::deadlock_timeout_us is added, which allows the transaction to wait for a short period before perform deadlock detection. When the workload has low lock contention, the deadlock_timeout_us can be configured to be slightly higher than average transaction execution time, so that transaction would likely be able to take the lock before deadlock detection is performed when it is waiting for a lock. This allows transaction to reduce CPU cost on performing deadlock detection, which could be expensive in CPU time. When the workload has high lock contention, the deadlock_timeout_us can be configured to 0, so that transaction would perform deadlock detection immediately. By default the value is 0 to keep the behavior same as before.
diff --git a/utilities/transactions/lock/lock_manager.cc b/utilities/transactions/lock/lock_manager.cc
index 7bcbf6f9d804..f4828ec59069 100644
--- a/utilities/transactions/lock/lock_manager.cc
+++ b/utilities/transactions/lock/lock_manager.cc
@@ -17,8 +17,11 @@ std::shared_ptr<LockManager> NewLockManager(PessimisticTransactionDB* db,
     auto mgr = opt.lock_mgr_handle->getLockManager();
     return std::shared_ptr<LockManager>(opt.lock_mgr_handle, mgr);
   } else {
-    // Use a point lock manager by default
-    return std::shared_ptr<LockManager>(new PointLockManager(db, opt));
+    if (opt.use_per_key_point_lock_mgr) {
+      return std::make_shared<PerKeyPointLockManager>(db, opt);
+    } else {
+      return std::make_shared<PointLockManager>(db, opt);
+    }
   }
 }
 
diff --git a/utilities/transactions/lock/point/any_lock_manager_test.h b/utilities/transactions/lock/point/any_lock_manager_test.h
new file mode 100644
index 000000000000..4562f215a9a6
--- /dev/null
+++ b/utilities/transactions/lock/point/any_lock_manager_test.h
@@ -0,0 +1,239 @@
+#pragma once
+
+#include "utilities/transactions/lock/point/point_lock_manager_test.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+using init_func_t = void (*)(PointLockManagerTest*);
+
+class AnyLockManagerTest : public PointLockManagerTest,
+                           public testing::WithParamInterface<init_func_t> {
+ public:
+  void SetUp() override {
+    // If a custom setup function was provided, use it. Otherwise, use what we
+    // have inherited.
+    auto init_func = GetParam();
+    if (init_func) {
+      (*init_func)(this);
+    } else {
+      PointLockManagerTest::SetUp();
+    }
+  }
+};
+
+TEST_P(AnyLockManagerTest, ReentrantExclusiveLock) {
+  // Tests that a txn can acquire exclusive lock on the same key repeatedly.
+  MockColumnFamilyHandle cf(1);
+  locker_->AddColumnFamily(&cf);
+  auto txn = NewTxn();
+  ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, true));
+  ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, true));
+
+  // Cleanup
+  locker_->UnLock(txn, 1, "k", env_);
+
+  delete txn;
+}
+
+TEST_P(AnyLockManagerTest, ReentrantSharedLock) {
+  // Tests that a txn can acquire shared lock on the same key repeatedly.
+  MockColumnFamilyHandle cf(1);
+  locker_->AddColumnFamily(&cf);
+  auto txn = NewTxn();
+  ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, false));
+  ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, false));
+
+  // Cleanup
+  if (dynamic_cast<PointLockManager*>(locker_.get()) != nullptr &&
+      dynamic_cast<PerKeyPointLockManager*>(locker_.get()) == nullptr) {
+    // PointLockManager would create 2 entries in the lock manager, so it needs
+    // to unlock it twice.
+    locker_->UnLock(txn, 1, "k", env_);
+  }
+  locker_->UnLock(txn, 1, "k", env_);
+
+  delete txn;
+}
+
+TEST_P(AnyLockManagerTest, LockUpgrade) {
+  // Tests that a txn can upgrade from a shared lock to an exclusive lock.
+  MockColumnFamilyHandle cf(1);
+  locker_->AddColumnFamily(&cf);
+  auto txn = NewTxn();
+  ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, false));
+  ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, true));
+
+  // Cleanup
+  locker_->UnLock(txn, 1, "k", env_);
+  delete txn;
+}
+
+TEST_P(AnyLockManagerTest, LockDowngrade) {
+  // Tests that a txn can acquire a shared lock after acquiring an exclusive
+  // lock on the same key.
+  MockColumnFamilyHandle cf(1);
+  locker_->AddColumnFamily(&cf);
+  auto txn = NewTxn();
+  ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, true));
+  ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, false));
+
+  // Cleanup
+  locker_->UnLock(txn, 1, "k", env_);
+  delete txn;
+}
+
+TEST_P(AnyLockManagerTest, LockConflict) {
+  // Tests that lock conflicts lead to lock timeout.
+  MockColumnFamilyHandle cf(1);
+  locker_->AddColumnFamily(&cf);
+  auto txn1 = NewTxn();
+  auto txn2 = NewTxn();
+
+  {
+    // exclusive-exclusive conflict.
+    ASSERT_OK(locker_->TryLock(txn1, 1, "k1", env_, true));
+    auto s = locker_->TryLock(txn2, 1, "k1", env_, true);
+    ASSERT_TRUE(s.IsTimedOut());
+  }
+
+  {
+    // exclusive-shared conflict.
+    ASSERT_OK(locker_->TryLock(txn1, 1, "k2", env_, true));
+    auto s = locker_->TryLock(txn2, 1, "k2", env_, false);
+    ASSERT_TRUE(s.IsTimedOut());
+  }
+
+  {
+    // shared-exclusive conflict.
+    ASSERT_OK(locker_->TryLock(txn1, 1, "k2", env_, false));
+    auto s = locker_->TryLock(txn2, 1, "k2", env_, true);
+    ASSERT_TRUE(s.IsTimedOut());
+  }
+
+  // Cleanup
+  locker_->UnLock(txn1, 1, "k1", env_);
+  locker_->UnLock(txn1, 1, "k2", env_);
+
+  delete txn1;
+  delete txn2;
+}
+
+TEST_P(AnyLockManagerTest, SharedLocks) {
+  // Tests that shared locks can be concurrently held by multiple transactions.
+  MockColumnFamilyHandle cf(1);
+  locker_->AddColumnFamily(&cf);
+  auto txn1 = NewTxn();
+  auto txn2 = NewTxn();
+  ASSERT_OK(locker_->TryLock(txn1, 1, "k", env_, false));
+  ASSERT_OK(locker_->TryLock(txn2, 1, "k", env_, false));
+
+  // Cleanup
+  locker_->UnLock(txn1, 1, "k", env_);
+  locker_->UnLock(txn2, 1, "k", env_);
+
+  delete txn1;
+  delete txn2;
+}
+
+TEST_P(AnyLockManagerTest, Deadlock) {
+  // Tests that deadlock can be detected.
+  // Deadlock scenario:
+  // txn1 exclusively locks k1, and wants to lock k2;
+  // txn2 exclusively locks k2, and wants to lock k1.
+  MockColumnFamilyHandle cf(1);
+  locker_->AddColumnFamily(&cf);
+  TransactionOptions txn_opt;
+  // disable dead lock timeout, so that the dead lock detection behavior is
+  // consistent. This prevents the test to be flaky
+  txn_opt.deadlock_timeout_us = 0;
+  txn_opt.deadlock_detect = true;
+  txn_opt.lock_timeout = 1000000;
+  auto txn1 = NewTxn(txn_opt);
+  auto txn2 = NewTxn(txn_opt);
+
+  ASSERT_OK(locker_->TryLock(txn1, 1, "k1", env_, true));
+  ASSERT_OK(locker_->TryLock(txn2, 1, "k2", env_, true));
+
+  // txn1 tries to lock k2, will be blocked.
+  port::Thread t;
+  BlockUntilWaitingTxn(wait_sync_point_name_, t, [&]() {
+    // block because txn2 is holding a lock on k2.
+    ASSERT_OK(locker_->TryLock(txn1, 1, "k2", env_, true));
+  });
+
+  auto s = locker_->TryLock(txn2, 1, "k1", env_, true);
+  ASSERT_TRUE(s.IsBusy());
+  ASSERT_EQ(s.subcode(), Status::SubCode::kDeadlock);
+
+  std::vector<DeadlockPath> deadlock_paths = locker_->GetDeadlockInfoBuffer();
+  ASSERT_EQ(deadlock_paths.size(), 1u);
+  ASSERT_FALSE(deadlock_paths[0].limit_exceeded);
+
+  std::vector<DeadlockInfo> deadlocks = deadlock_paths[0].path;
+  ASSERT_EQ(deadlocks.size(), 2u);
+
+  ASSERT_EQ(deadlocks[0].m_txn_id, txn1->GetID());
+  ASSERT_EQ(deadlocks[0].m_cf_id, 1u);
+  ASSERT_TRUE(deadlocks[0].m_exclusive);
+  ASSERT_EQ(deadlocks[0].m_waiting_key, "k2");
+
+  ASSERT_EQ(deadlocks[1].m_txn_id, txn2->GetID());
+  ASSERT_EQ(deadlocks[1].m_cf_id, 1u);
+  ASSERT_TRUE(deadlocks[1].m_exclusive);
+  ASSERT_EQ(deadlocks[1].m_waiting_key, "k1");
+
+  locker_->UnLock(txn2, 1, "k2", env_);
+  t.join();
+
+  // Cleanup
+  locker_->UnLock(txn1, 1, "k1", env_);
+  locker_->UnLock(txn1, 1, "k2", env_);
+  delete txn2;
+  delete txn1;
+}
+
+TEST_P(AnyLockManagerTest, GetWaitingTxns_MultipleTxns) {
+  MockColumnFamilyHandle cf(1);
+  locker_->AddColumnFamily(&cf);
+
+  auto txn1 = NewTxn();
+  ASSERT_OK(locker_->TryLock(txn1, 1, "k", env_, false));
+
+  auto txn2 = NewTxn();
+  ASSERT_OK(locker_->TryLock(txn2, 1, "k", env_, false));
+
+  auto txn3 = NewTxn();
+  txn3->SetLockTimeout(10000);
+  port::Thread t1;
+  BlockUntilWaitingTxn(wait_sync_point_name_, t1, [&]() {
+    ASSERT_OK(locker_->TryLock(txn3, 1, "k", env_, true));
+    locker_->UnLock(txn3, 1, "k", env_);
+  });
+
+  // Ok, now txn3 is waiting for lock on "k", which is owned by two
+  // transactions. Check that GetWaitingTxns reports this correctly
+  uint32_t wait_cf_id;
+  std::string wait_key;
+  auto waiters = txn3->GetWaitingTxns(&wait_cf_id, &wait_key);
+
+  ASSERT_EQ(wait_cf_id, 1u);
+  ASSERT_EQ(wait_key, "k");
+  ASSERT_EQ(waiters.size(), 2);
+  bool waits_correct =
+      (waiters[0] == txn1->GetID() && waiters[1] == txn2->GetID()) ||
+      (waiters[1] == txn1->GetID() && waiters[0] == txn2->GetID());
+  ASSERT_EQ(waits_correct, true);
+
+  // Release locks so txn3 can proceed with execution
+  locker_->UnLock(txn1, 1, "k", env_);
+  locker_->UnLock(txn2, 1, "k", env_);
+
+  // Wait until txn3 finishes
+  t1.join();
+
+  delete txn1;
+  delete txn2;
+  delete txn3;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/utilities/transactions/lock/point/point_lock_bench.cc b/utilities/transactions/lock/point/point_lock_bench.cc
new file mode 100644
index 000000000000..2867738fdf1e
--- /dev/null
+++ b/utilities/transactions/lock/point/point_lock_bench.cc
@@ -0,0 +1,18 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifndef GFLAGS
+#include <cstdio>
+int main() {
+  fprintf(stderr, "Please install gflags to run rocksdb tools\n");
+  return 1;
+}
+#else
+#include "rocksdb/point_lock_bench_tool.h"
+int main(int argc, char** argv) {
+  return ROCKSDB_NAMESPACE::point_lock_bench_tool(argc, argv);
+}
+#endif  // GFLAGS
diff --git a/utilities/transactions/lock/point/point_lock_bench_tool.cc b/utilities/transactions/lock/point/point_lock_bench_tool.cc
new file mode 100644
index 000000000000..b9d55c34deaa
--- /dev/null
+++ b/utilities/transactions/lock/point/point_lock_bench_tool.cc
@@ -0,0 +1,159 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifdef GFLAGS
+
+#include <cstdio>
+#include <iostream>
+#include <memory>
+
+#include "port/stack_trace.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/env.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "util/gflags_compat.h"
+#include "utilities/transactions/lock/point/point_lock_manager.h"
+#include "utilities/transactions/lock/point/point_lock_validation_test_runner.h"
+#include "utilities/transactions/pessimistic_transaction_db.h"
+
+using GFLAGS_NAMESPACE::ParseCommandLineFlags;
+
+namespace ROCKSDB_NAMESPACE {
+
+DEFINE_string(db_dir, "/tmp/point_lock_manager_test",
+              "DB path for running the benchmark");
+DEFINE_uint32(stripe_count, 16, "Number of stripes in point lock manager");
+DEFINE_bool(is_per_key_point_lock_manager, false,
+            "Use PerKeyPointLockManager or PointLockManager");
+DEFINE_uint32(thread_count, 64,
+              "Number of threads to acquire release locks concurrently");
+DEFINE_uint32(key_count, 16, "Number of keys to acquire release locks upon");
+DEFINE_uint32(max_num_keys_to_lock_per_txn, 8,
+              "Max Number of keys to lock in a transaction");
+DEFINE_uint32(execution_time_sec, 10,
+              "Number of seconds to execute the benchmark");
+DEFINE_uint32(lock_type, 2,
+              "Lock type to test, 0: exclusive lock only; 1: shared lock only; "
+              "2: both shared and exclusive locks");
+DEFINE_int64(lock_timeout_ms, 1000,
+             "Lock acquisition request timeout in milliseconds.");
+DEFINE_int64(deadlock_timeout_us, 500,
+             "DeadLock detection timeout in microseconds.");
+DEFINE_int64(lock_expiration_ms, 100,
+             "Acquired Lock expiration time in milliseconds.");
+DEFINE_bool(allow_non_deadlock_error, true,
+            "Allow returned error code other than deadlock, such as timeout.");
+DEFINE_uint32(
+    max_sleep_after_lock_acquisition_ms, 5,
+    "Max number of milliseconds to sleep after acquiring all the locks in the "
+    "transaction. The actuall sleep time will be randomized from 0 to max. It "
+    "is used to simulate some useful work performed.");
+DEFINE_bool(check_thread_stuck, false,
+            "Check thread periodically to see whether they are stuck or not. "
+            "This is useful for detecting stuck transaction quickly. But it "
+            "could have false-positive when running with ASAN or running with "
+            "high thread count on a small number of CPUs");
+
+namespace {  // anonymous namespace
+
+class PointLockManagerBenchmark {
+ public:
+  PointLockManagerBenchmark() {
+    env_ = Env::Default();
+    env_->CreateDir(FLAGS_db_dir);
+
+    Options opt;
+    opt.create_if_missing = true;
+    txndb_opt_.num_stripes = FLAGS_stripe_count;
+
+    db_ = nullptr;
+
+    auto s = TransactionDB::Open(opt, txndb_opt_, FLAGS_db_dir, &db_);
+    ASSERT_OK(s);
+
+    if (FLAGS_is_per_key_point_lock_manager) {
+      locker_ = std::make_shared<PerKeyPointLockManager>(
+          static_cast<PessimisticTransactionDB*>(db_), txndb_opt_);
+    } else {
+      locker_ = std::make_shared<PointLockManager>(
+          static_cast<PessimisticTransactionDB*>(db_), txndb_opt_);
+    }
+
+    txn_opt_.deadlock_detect = true;
+    txn_opt_.lock_timeout = FLAGS_lock_timeout_ms;
+    txn_opt_.deadlock_timeout_us = FLAGS_deadlock_timeout_us;
+    txn_opt_.expiration = FLAGS_lock_expiration_ms;
+  }
+
+  // Disable copy and assignment
+  PointLockManagerBenchmark(const PointLockManagerBenchmark&) = delete;
+  PointLockManagerBenchmark& operator=(const PointLockManagerBenchmark&) =
+      delete;
+  PointLockManagerBenchmark(PointLockManagerBenchmark&&) = delete;
+  PointLockManagerBenchmark& operator=(PointLockManagerBenchmark&&) = delete;
+
+  ~PointLockManagerBenchmark() {
+    delete db_;
+    auto s = DestroyDir(env_, FLAGS_db_dir);
+    ASSERT_OK(s);
+  }
+
+  void run() {
+    PointLockValidationTestRunner test_runner(
+        env_, txndb_opt_, locker_, db_, txn_opt_, FLAGS_thread_count,
+        FLAGS_key_count, FLAGS_max_num_keys_to_lock_per_txn,
+        FLAGS_execution_time_sec, static_cast<LockTypeToTest>(FLAGS_lock_type),
+        FLAGS_allow_non_deadlock_error,
+        FLAGS_max_sleep_after_lock_acquisition_ms, FLAGS_check_thread_stuck);
+    test_runner.run();
+  }
+
+ private:
+  Env* env_;
+  TransactionDBOptions txndb_opt_;
+  std::shared_ptr<LockManager> locker_;
+  TransactionDB* db_;
+  TransactionOptions txn_opt_;
+};
+
+}  // anonymous namespace
+
+int point_lock_bench_tool(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ParseCommandLineFlags(&argc, &argv, true);
+
+  // Print test configuration
+  std::vector<gflags::CommandLineFlagInfo> all_flags;
+  gflags::GetAllFlags(&all_flags);
+
+  for (const auto& flag : all_flags) {
+    // only show the flags defined in this file
+    if (flag.filename.find("point_lock_bench_tool.cc") != std::string::npos) {
+      std::cout << "-" << flag.name << "=";
+      if (flag.type == "bool") {
+        std::cout << (gflags::GetCommandLineFlagInfoOrDie(flag.name.c_str())
+                                  .current_value == "true"
+                          ? "true"
+                          : "false");
+      } else {
+        std::cout << gflags::GetCommandLineFlagInfoOrDie(flag.name.c_str())
+                         .current_value;
+      }
+      std::cout << " ";
+    }
+  }
+  std::cout << std::endl;
+
+  // Run the benchmark
+  PointLockManagerBenchmark benchmark;
+  benchmark.run();
+
+  return 0;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // GFLAGS
diff --git a/utilities/transactions/lock/point/point_lock_manager.cc b/utilities/transactions/lock/point/point_lock_manager.cc
index 82a12f17d506..78f6073082b6 100644
--- a/utilities/transactions/lock/point/point_lock_manager.cc
+++ b/utilities/transactions/lock/point/point_lock_manager.cc
@@ -13,7 +13,6 @@
 #include "rocksdb/slice.h"
 #include "rocksdb/utilities/transaction_db_mutex.h"
 #include "test_util/sync_point.h"
-#include "util/cast_util.h"
 #include "util/hash.h"
 #include "util/thread_local.h"
 #include "utilities/transactions/pessimistic_transaction_db.h"
@@ -21,36 +20,275 @@
 
 namespace ROCKSDB_NAMESPACE {
 
+constexpr bool kDebugLog = false;
+
+// KeyLockWaiter represents a waiter for a key lock. It contains a conditional
+// variable to allow waiter to wait for the key lock. It also contains other
+// metadata about the waiter such as transaction id, lock type etc.
+struct KeyLockWaiter {
+  KeyLockWaiter(std::shared_ptr<TransactionDBCondVar> c, TransactionID i,
+                bool ex)
+      : id(i), exclusive(ex), ready(false), cv(std::move(c)) {}
+
+  // disable copy constructor and assignment operator, move and move
+  // assignment
+  KeyLockWaiter(const KeyLockWaiter&) = delete;
+  KeyLockWaiter& operator=(const KeyLockWaiter&) = delete;
+  KeyLockWaiter(KeyLockWaiter&&) = delete;
+  KeyLockWaiter& operator=(KeyLockWaiter&&) = delete;
+
+  ~KeyLockWaiter() = default;
+
+  // Reset the waiter to be used again
+  void Reset(TransactionID i, bool e) {
+    id = i;
+    exclusive = e;
+    ready = false;
+  }
+
+  // Check whether the waiter has been notified that it is its turn to take the
+  // lock
+  bool IsReady() const { return ready; }
+
+  // Wait until its turn to take the lock forever
+  Status Wait(std::shared_ptr<TransactionDBMutex>& mutex) {
+    // Mutex is already locked by caller
+    // Check ready flag before wait
+    if (ready) {
+      return Status::OK();
+    }
+    return AfterWait(cv->Wait(mutex));
+  }
+
+  // Wait until its turn to take the lock within timeout_us
+  Status WaitFor(std::shared_ptr<TransactionDBMutex>& mutex,
+                 int64_t timeout_us) {
+    // Mutex is already locked by caller
+    // Check ready flag before wait
+    if (ready) {
+      return Status::OK();
+    }
+    return AfterWait(cv->WaitFor(mutex, timeout_us));
+  }
+
+  // Notify the waiter to take the lock
+  void Notify() {
+    // Mutex is already locked by caller
+    ready = true;
+    cv->Notify();
+  }
+
+  TransactionID id;
+  bool exclusive;
+
+ private:
+  Status AfterWait(Status wait_result) {
+    if (wait_result.ok() || wait_result.IsTimedOut()) {
+      // check ready again after wake up.
+      if (ready) {
+        return Status::OK();
+      } else {
+        return Status::TimedOut(Status::SubCode::kMutexTimeout);
+      }
+    } else {
+      return wait_result;
+    }
+  }
+
+  // Track whether the waiter has been woken up explicitly.
+  bool ready;
+  // TODO(Xingbo), Switch to std::binary_semaphore, once we have c++20
+  // semaphore is likely more performant than mutex + cv.
+  // Although we will also need to implement TransactionDBSemaphore, which would
+  // be required if external system wants to do instrumented lock wait tracking
+  std::shared_ptr<TransactionDBCondVar> cv;
+};
+
 struct LockInfo {
+  LockInfo(TransactionID id, uint64_t time, bool ex)
+      : exclusive(ex), expiration_time(time) {
+    txn_ids.push_back(id);
+  }
+
+  DECLARE_DEFAULT_MOVES(LockInfo);
+
   bool exclusive;
   autovector<TransactionID> txn_ids;
 
   // Transaction locks are not valid after this time in us
   uint64_t expiration_time;
 
-  LockInfo(TransactionID id, uint64_t time, bool ex)
-      : exclusive(ex), expiration_time(time) {
-    txn_ids.push_back(id);
+  // waiter queue for this key
+  // TODO xingbo, use intrusive list to avoid extra memory allocation
+  std::unique_ptr<std::list<KeyLockWaiter*>> waiter_queue;
+};
+
+// Print debug info for lock waiter wake up action.
+void DebugWakeUpWaiter(TransactionID txn_id, TransactionID waiter_id,
+                       const std::string& key, const std::string& msg) {
+  if (kDebugLog) {
+    // print which waiter got woken up
+    fprintf(stderr,
+            "Txn %" PRIu64 ": wake up next waiter on %s Txn %" PRIu64
+            " on key %s\n",
+            txn_id, msg.c_str(), waiter_id, key.c_str());
+    fflush(stderr);
+  }
+}
+
+// Key lock waiter context, used for free the lock automatically
+struct KeyLockWaiterContext {
+  // When a lock waiter is aborted due to dead lock or time out, this function
+  // is used to wake up the waiters after it, if they could proceed.
+  void TryWakeUpNextWaiters(const LockInfo& lock_info, const std::string& key) {
+    if (waiter_queue != nullptr && lock_waiter != waiter_queue->end()) {
+      bool wake_up_next_shared_waiters = false;
+
+      if (lock_waiter == waiter_queue->begin()) {
+        // if lock waiter is at the head of the queue, check the current lock
+        // status. If it is exclusive lock, no waiter should be woken up. other
+        // wise, try to wake up shared lock waiters on the right side of itself.
+        wake_up_next_shared_waiters = !lock_info.exclusive;
+      } else {
+        // if lock waiter is not at the head of the queue, check the previous
+        // lock status. If it is active and shared, it should try to wake up the
+        // shared lock waiter on the right side of itself.
+        auto lock_waiter_prev = lock_waiter;
+        lock_waiter_prev--;
+        wake_up_next_shared_waiters =
+            (*lock_waiter_prev)->IsReady() && !(*lock_waiter_prev)->exclusive;
+      }
+
+      if (wake_up_next_shared_waiters) {
+        // Go through all the waiters on the right side of the lock waiter and
+        // wake up the shared lock waiter until the end of the queue or
+        // encountered an exclusive lock waiter.
+        auto lock_waiter_next = lock_waiter;
+        lock_waiter_next++;
+        while (lock_waiter_next != waiter_queue->end() &&
+               !(*lock_waiter_next)->exclusive) {
+          (*lock_waiter_next)->Notify();
+          DebugWakeUpWaiter((*lock_waiter)->id, (*lock_waiter_next)->id, key,
+                            "TryWakeUpNextWaiters");
+          lock_waiter_next++;
+        }
+      }
+    }
   }
-  LockInfo(const LockInfo& lock_info)
 
-      = default;
-  void operator=(const LockInfo& lock_info) {
-    exclusive = lock_info.exclusive;
-    txn_ids = lock_info.txn_ids;
-    expiration_time = lock_info.expiration_time;
+  ~KeyLockWaiterContext() {
+    if (waiter_queue != nullptr && lock_waiter != waiter_queue->end()) {
+      waiter_queue->erase(lock_waiter);
+      lock_waiter = waiter_queue->end();
+    }
+    waiter_queue = nullptr;
   }
-  DECLARE_DEFAULT_MOVES(LockInfo);
+
+  // The waiter queue the lock waiter joined. Used for remove the waiter from
+  // the waiter queue.
+  std::list<KeyLockWaiter*>* waiter_queue = nullptr;
+  // The stable iterator that tracks the position of the waiter in the waiter
+  // queue. Used for remove the waiter from the waiter queue.
+  std::list<KeyLockWaiter*>::iterator lock_waiter;
 };
 
 struct LockMapStripe {
-  explicit LockMapStripe(std::shared_ptr<TransactionDBMutexFactory> factory) {
-    stripe_mutex = factory->AllocateMutex();
-    stripe_cv = factory->AllocateCondVar();
+  explicit LockMapStripe(std::shared_ptr<TransactionDBMutexFactory> factory,
+                         ThreadLocalPtr& key_lock_waiter)
+      : mutex_factory_(std::move(factory)), key_lock_waiter_(key_lock_waiter) {
+    stripe_mutex = mutex_factory_->AllocateMutex();
+    stripe_cv = mutex_factory_->AllocateCondVar();
+
     assert(stripe_mutex);
     assert(stripe_cv);
   }
 
+  LockInfo* GetLockInfo(const std::string& key) {
+    auto lock_info_iter = keys.find(key);
+    if (lock_info_iter != keys.end()) {
+      return &lock_info_iter->second;
+    } else {
+      return nullptr;
+    }
+  }
+
+  // Wait until its turn to take the lock of this key within timeout_us.
+  // By default timeout_us == 0, which means wait forever
+  void JoinWaitQueue(LockInfo& lock_info, TransactionID id, bool exclusive,
+                     bool isUpgrade, KeyLockWaiterContext& waiter_context) {
+    if (lock_info.waiter_queue == nullptr) {
+      // no waiter queue yet, create a new one
+      lock_info.waiter_queue = std::make_unique<std::list<KeyLockWaiter*>>();
+    }
+
+    auto waiter_queue = lock_info.waiter_queue.get();
+
+    // by default insert the new lock waiter at the end of the queue.
+    auto insert_point = waiter_queue->end();
+
+    if (isUpgrade) {
+      // If transaction is upgrading a shared lock to exclusive lock, prioritize
+      // it by moving its lock waiter before the first exclusive lock in the
+      // queue if there is one, or end of the queue if not exist. It will be
+      // able to acquire the lock after the other shared locks waiters at the
+      // front of queue acquired and released locks. This reduces the chance of
+      // deadlock, which makes transaction run more efficiently.
+
+      if (waiter_context.waiter_queue != nullptr) {
+        // If waiter_context is already initialized, it means current
+        // transaction already joined the lock queue. Don't move the lock
+        // position if it is already at the head of the queue or the lock
+        // waiters before it are ready to take the lock.
+        if (waiter_context.lock_waiter == waiter_queue->begin()) {
+          return;
+        }
+
+        auto prev_lock_waiter = waiter_context.lock_waiter;
+        prev_lock_waiter--;
+        if ((*prev_lock_waiter)->IsReady()) {
+          return;
+        }
+
+        // Remove existing lock waiter
+        waiter_queue->erase(waiter_context.lock_waiter);
+      }
+
+      // For upgrade, insert waiter either at the end of the queue or before the
+      // first exlusive lock waiter.
+      insert_point = waiter_queue->begin();
+      while ((insert_point != waiter_queue->end()) &&
+             (!(*insert_point)->exclusive)) {
+        insert_point++;
+      }
+    }
+
+    // Insert the new lock waiter
+    waiter_context.lock_waiter =
+        waiter_queue->insert(insert_point, GetKeyLockWaiter(id, exclusive));
+
+    waiter_context.waiter_queue = waiter_queue;
+  }
+
+  // Wait on an existing KeyLockWaiter until its turn to take the lock or
+  // timeout
+  Status WaitOnLock(std::list<KeyLockWaiter*>::iterator& lock_waiter,
+                    int64_t timeout_us = 0) {
+    Status ret;
+    if (timeout_us == 0) {
+      ret = (*lock_waiter)->Wait(stripe_mutex);
+    } else {
+      ret = (*lock_waiter)->WaitFor(stripe_mutex, timeout_us);
+    }
+    return ret;
+  }
+
+  void ReleaseLastLockHolder(
+      LockInfo& lock_info,
+      UnorderedMap<std::string, LockInfo>::iterator stripe_iter,
+      LockMap* lock_map, TransactionID txn_id, const std::string& key,
+      const int64_t max_num_locks, autovector<TransactionID>& txns,
+      autovector<TransactionID>::iterator& txn_it);
+
   // Mutex must be held before modifying keys map
   std::shared_ptr<TransactionDBMutex> stripe_mutex;
 
@@ -60,16 +298,39 @@ struct LockMapStripe {
   // Locked keys mapped to the info about the transactions that locked them.
   // TODO(agiardullo): Explore performance of other data structures.
   UnorderedMap<std::string, LockInfo> keys;
+
+ private:
+  std::shared_ptr<TransactionDBMutexFactory> mutex_factory_;
+
+  // key lock waiter, wrapped in thread local for reusing it across
+  // transactions.
+  ThreadLocalPtr& key_lock_waiter_;
+
+  // Return key lock waiter stored in thread local var, create on first use
+  KeyLockWaiter* GetKeyLockWaiter(TransactionID id, bool exclusive) {
+    KeyLockWaiter* waiter = nullptr;
+    if (key_lock_waiter_.Get() == nullptr) {
+      // create key lock waiter
+      key_lock_waiter_.Reset(
+          new KeyLockWaiter(mutex_factory_->AllocateCondVar(), id, exclusive));
+      waiter = static_cast<KeyLockWaiter*>(key_lock_waiter_.Get());
+    } else {
+      waiter = static_cast<KeyLockWaiter*>(key_lock_waiter_.Get());
+      waiter->Reset(id, exclusive);
+    }
+    return waiter;
+  }
 };
 
 // Map of #num_stripes LockMapStripes
 struct LockMap {
   explicit LockMap(size_t num_stripes,
-                   std::shared_ptr<TransactionDBMutexFactory> factory)
-      : num_stripes_(num_stripes) {
+                   std::shared_ptr<TransactionDBMutexFactory> factory,
+                   ThreadLocalPtr& key_lock_waiter)
+      : num_stripes_(num_stripes), key_lock_waiter_(key_lock_waiter) {
     lock_map_stripes_.reserve(num_stripes);
     for (size_t i = 0; i < num_stripes; i++) {
-      LockMapStripe* stripe = new LockMapStripe(factory);
+      LockMapStripe* stripe = new LockMapStripe(factory, key_lock_waiter_);
       lock_map_stripes_.push_back(stripe);
     }
   }
@@ -78,20 +339,80 @@ struct LockMap {
     for (auto stripe : lock_map_stripes_) {
       delete stripe;
     }
+    // Validate total locked key count is 0, when lock map is destructed.
+    assert(locked_key_cnt.LoadRelaxed() == 0);
   }
 
   // Number of sepearate LockMapStripes to create, each with their own Mutex
   const size_t num_stripes_;
+  ThreadLocalPtr& key_lock_waiter_;
 
   // Count of keys that are currently locked in this column family.
+  // Note that multiple shared locks on the same key is counted as 1 lock.
   // (Only maintained if PointLockManager::max_num_locks_ is positive.)
-  std::atomic<int64_t> lock_cnt{0};
+  RelaxedAtomic<int64_t> locked_key_cnt{0};
 
   std::vector<LockMapStripe*> lock_map_stripes_;
 
   size_t GetStripe(const std::string& key) const;
 };
 
+inline void RemoveTransaction(autovector<TransactionID>& txns,
+                              autovector<TransactionID>::iterator& txn_it) {
+  if (txns.size() > 1) {
+    auto last_it = txns.end() - 1;
+    if (txn_it != last_it) {
+      *txn_it = *last_it;
+    }
+  }
+  txns.pop_back();
+}
+
+void LockMapStripe::ReleaseLastLockHolder(
+    LockInfo& lock_info,
+    UnorderedMap<std::string, LockInfo>::iterator stripe_iter,
+    LockMap* lock_map, TransactionID txn_id, const std::string& key,
+    const int64_t max_num_locks, autovector<TransactionID>& txns,
+    autovector<TransactionID>::iterator& txn_it) {
+  // check whether there is other waiting transactions
+  if (lock_info.waiter_queue == nullptr || lock_info.waiter_queue->empty()) {
+    keys.erase(stripe_iter);
+    if (max_num_locks > 0) {
+      // Maintain lock count if there is a limit on the number of
+      // locks.
+      assert(lock_map->locked_key_cnt.LoadRelaxed() > 0);
+      lock_map->locked_key_cnt.FetchSubRelaxed(1);
+    }
+  } else {
+    // there are waiters in the queue, so we need to wake the next
+    // one up
+    RemoveTransaction(txns, txn_it);
+    // loop through the waiter queue and wake up all the shared lock
+    // waiters until the first exclusive lock waiter, or wake up the
+    // first waiter, if it is waiting for an exclusive lock.
+    bool first_waiter = true;
+    for (auto& waiter : *lock_info.waiter_queue) {
+      if (waiter->exclusive) {
+        if (first_waiter) {
+          // the first waiter is an exclusive lock waiter, wake it
+          // up Note that they are only notified, but not removed
+          // from the waiter queue. This allows new transaction to
+          // be aware that there are waiters ahead of them.
+          waiter->Notify();
+          DebugWakeUpWaiter(txn_id, waiter->id, key, "UnlockKey X waiter");
+        }
+        // found the first exclusive lock waiter, stop
+        break;
+      } else {
+        // wake up the shared lock waiter
+        waiter->Notify();
+        DebugWakeUpWaiter(txn_id, waiter->id, key, "UnlockKey S waiter");
+      }
+      first_waiter = false;
+    }
+  }
+}
+
 namespace {
 void UnrefLockMapsCache(void* ptr) {
   // Called when a thread exits or a ThreadLocalPtr gets destroyed.
@@ -99,6 +420,10 @@ void UnrefLockMapsCache(void* ptr) {
       static_cast<UnorderedMap<uint32_t, std::shared_ptr<LockMap>>*>(ptr);
   delete lock_maps_cache;
 }
+void UnrefKeyLockWaiter(void* ptr) {
+  auto key_lock_waiter = static_cast<KeyLockWaiter*>(ptr);
+  delete key_lock_waiter;
+}
 }  // anonymous namespace
 
 PointLockManager::PointLockManager(PessimisticTransactionDB* txn_db,
@@ -107,6 +432,7 @@ PointLockManager::PointLockManager(PessimisticTransactionDB* txn_db,
       default_num_stripes_(opt.num_stripes),
       max_num_locks_(opt.max_num_locks),
       lock_maps_cache_(new ThreadLocalPtr(&UnrefLockMapsCache)),
+      key_lock_waiter_(&UnrefKeyLockWaiter),
       dlock_buffer_(opt.max_num_deadlocks),
       mutex_factory_(opt.custom_mutex_factory
                          ? opt.custom_mutex_factory
@@ -122,7 +448,8 @@ void PointLockManager::AddColumnFamily(const ColumnFamilyHandle* cf) {
 
   if (lock_maps_.find(cf->GetID()) == lock_maps_.end()) {
     lock_maps_.emplace(cf->GetID(), std::make_shared<LockMap>(
-                                        default_num_stripes_, mutex_factory_));
+                                        default_num_stripes_, mutex_factory_,
+                                        key_lock_waiter_));
   } else {
     // column_family already exists in lock map
     assert(false);
@@ -242,16 +569,18 @@ Status PointLockManager::TryLock(PessimisticTransaction* txn,
 
   LockInfo lock_info(txn->GetID(), txn->GetExpirationTime(), exclusive);
   int64_t timeout = txn->GetLockTimeout();
+  int64_t deadlock_timeout_us = txn->GetDeadlockTimeout();
 
   return AcquireWithTimeout(txn, lock_map, stripe, column_family_id, key, env,
-                            timeout, lock_info);
+                            timeout, deadlock_timeout_us, lock_info);
 }
 
 // Helper function for TryLock().
 Status PointLockManager::AcquireWithTimeout(
     PessimisticTransaction* txn, LockMap* lock_map, LockMapStripe* stripe,
     ColumnFamilyId column_family_id, const std::string& key, Env* env,
-    int64_t timeout, const LockInfo& lock_info) {
+    int64_t timeout, int64_t /*deadlock_timeout_us*/,
+    const LockInfo& lock_info) {
   Status result;
   uint64_t end_time = 0;
 
@@ -364,6 +693,130 @@ Status PointLockManager::AcquireWithTimeout(
   return result;
 }
 
+// Try to lock this key after we have acquired the mutex.
+// Sets *expire_time to the expiration time in microseconds
+//  or 0 if no expiration.
+//
+// Returns Status::TimeOut if the lock cannot be acquired due to it being
+// held by other transactions, `txn_ids` will be populated with the id of
+// transactions that hold the lock, excluding lock_info.txn_ids[0].
+// Returns Status::Aborted(kLockLimit) if the lock cannot be acquired due to
+// reaching per CF limit on the number of locks.
+//
+// REQUIRED:  Stripe mutex must be held. txn_ids must be empty.
+Status PointLockManager::AcquireLocked(LockMap* lock_map, LockMapStripe* stripe,
+                                       const std::string& key, Env* env,
+                                       const LockInfo& txn_lock_info,
+                                       uint64_t* expire_time,
+                                       autovector<TransactionID>* txn_ids) {
+  assert(txn_lock_info.txn_ids.size() == 1);
+  assert(txn_ids && txn_ids->empty());
+
+  Status result;
+  // Check if this key is already locked
+  auto stripe_iter = stripe->keys.find(key);
+  if (stripe_iter != stripe->keys.end()) {
+    // Lock already held
+    auto& lock_info = stripe_iter->second;
+    assert(lock_info.txn_ids.size() == 1 || !lock_info.exclusive);
+
+    if (lock_info.exclusive || txn_lock_info.exclusive) {
+      if (lock_info.txn_ids.size() == 1 &&
+          lock_info.txn_ids[0] == txn_lock_info.txn_ids[0]) {
+        // The list contains one txn and we're it, so just take it.
+        lock_info.exclusive = txn_lock_info.exclusive;
+        lock_info.expiration_time = txn_lock_info.expiration_time;
+      } else {
+        // Check if it's expired. Skips over txn_lock_info.txn_ids[0] in case
+        // it's there for a shared lock with multiple holders which was not
+        // caught in the first case.
+        if (IsLockExpired(txn_lock_info.txn_ids[0], lock_info, env,
+                          expire_time)) {
+          // lock is expired, can steal it
+          lock_info.txn_ids = txn_lock_info.txn_ids;
+          lock_info.exclusive = txn_lock_info.exclusive;
+          lock_info.expiration_time = txn_lock_info.expiration_time;
+          // lock_cnt does not change
+        } else {
+          result = Status::TimedOut(Status::SubCode::kLockTimeout);
+          for (auto id : lock_info.txn_ids) {
+            // A transaction is not blocked by itself
+            if (id != txn_lock_info.txn_ids[0]) {
+              txn_ids->push_back(id);
+            }
+          }
+        }
+      }
+    } else {
+      // We are requesting shared access to a shared lock, so just grant it.
+      lock_info.txn_ids.push_back(txn_lock_info.txn_ids[0]);
+      // Using std::max means that expiration time never goes down even when
+      // a transaction is removed from the list. The correct solution would be
+      // to track expiry for every transaction, but this would also work for
+      // now.
+      lock_info.expiration_time =
+          std::max(lock_info.expiration_time, txn_lock_info.expiration_time);
+    }
+  } else {
+    // Lock not held.
+    // Check lock limit
+    if (max_num_locks_ > 0 &&
+        lock_map->locked_key_cnt.LoadRelaxed() >= max_num_locks_) {
+      result = Status::LockLimit();
+    } else {
+      // acquire lock
+      stripe->keys.try_emplace(key, txn_lock_info.txn_ids[0],
+                               txn_lock_info.expiration_time,
+                               txn_lock_info.exclusive);
+
+      // Maintain lock count if there is a limit on the number of locks
+      if (max_num_locks_ > 0) {
+        lock_map->locked_key_cnt.FetchAddRelaxed(1);
+      }
+    }
+  }
+
+  return result;
+}
+
+void PointLockManager::UnLockKey(PessimisticTransaction* txn,
+                                 const std::string& key, LockMapStripe* stripe,
+                                 LockMap* lock_map, Env* env) {
+#ifdef NDEBUG
+  (void)env;
+#endif
+  TransactionID txn_id = txn->GetID();
+
+  auto stripe_iter = stripe->keys.find(key);
+  if (stripe_iter != stripe->keys.end()) {
+    auto& txns = stripe_iter->second.txn_ids;
+    auto txn_it = std::find(txns.begin(), txns.end(), txn_id);
+    // Found the key we locked.  unlock it.
+    if (txn_it != txns.end()) {
+      if (txns.size() == 1) {
+        stripe->keys.erase(stripe_iter);
+      } else {
+        auto last_it = txns.end() - 1;
+        if (txn_it != last_it) {
+          *txn_it = *last_it;
+        }
+        txns.pop_back();
+      }
+
+      if (max_num_locks_ > 0) {
+        // Maintain lock count if there is a limit on the number of locks.
+        assert(lock_map->locked_key_cnt.LoadRelaxed() > 0);
+        lock_map->locked_key_cnt.FetchSubRelaxed(1);
+      }
+    }
+  } else {
+    // This key is either not locked or locked by someone else.  This should
+    // only happen if the unlocking transaction has expired.
+    assert(txn->GetExpirationTime() > 0 &&
+           txn->GetExpirationTime() < env->NowMicros());
+  }
+}
+
 void PointLockManager::DecrementWaiters(
     const PessimisticTransaction* txn,
     const autovector<TransactionID>& wait_ids) {
@@ -481,143 +934,22 @@ bool PointLockManager::IncrementWaiters(
   return true;
 }
 
-// Try to lock this key after we have acquired the mutex.
-// Sets *expire_time to the expiration time in microseconds
-//  or 0 if no expiration.
-//
-// Returns Status::TimeOut if the lock cannot be acquired due to it being
-// held by other transactions, `txn_ids` will be populated with the id of
-// transactions that hold the lock, excluding lock_info.txn_ids[0].
-// Returns Status::Aborted(kLockLimit) if the lock cannot be acquired due to
-// reaching per CF limit on the number of locks.
-//
-// REQUIRED:  Stripe mutex must be held. txn_ids must be empty.
-Status PointLockManager::AcquireLocked(LockMap* lock_map, LockMapStripe* stripe,
-                                       const std::string& key, Env* env,
-                                       const LockInfo& txn_lock_info,
-                                       uint64_t* expire_time,
-                                       autovector<TransactionID>* txn_ids) {
-  assert(txn_lock_info.txn_ids.size() == 1);
-  assert(txn_ids && txn_ids->empty());
+void PointLockManager::UnLock(PessimisticTransaction* txn,
+                              ColumnFamilyId column_family_id,
+                              const std::string& key, Env* env) {
+  std::shared_ptr<LockMap> lock_map_ptr = GetLockMap(column_family_id);
+  LockMap* lock_map = lock_map_ptr.get();
+  if (lock_map == nullptr) {
+    // Column Family must have been dropped.
+    return;
+  }
 
-  Status result;
-  // Check if this key is already locked
-  auto stripe_iter = stripe->keys.find(key);
-  if (stripe_iter != stripe->keys.end()) {
-    // Lock already held
-    LockInfo& lock_info = stripe_iter->second;
-    assert(lock_info.txn_ids.size() == 1 || !lock_info.exclusive);
+  // Lock the mutex for the stripe that this key hashes to
+  size_t stripe_num = lock_map->GetStripe(key);
+  assert(lock_map->lock_map_stripes_.size() > stripe_num);
+  LockMapStripe* stripe = lock_map->lock_map_stripes_.at(stripe_num);
 
-    if (lock_info.exclusive || txn_lock_info.exclusive) {
-      if (lock_info.txn_ids.size() == 1 &&
-          lock_info.txn_ids[0] == txn_lock_info.txn_ids[0]) {
-        // The list contains one txn and we're it, so just take it.
-        lock_info.exclusive = txn_lock_info.exclusive;
-        lock_info.expiration_time = txn_lock_info.expiration_time;
-      } else {
-        // Check if it's expired. Skips over txn_lock_info.txn_ids[0] in case
-        // it's there for a shared lock with multiple holders which was not
-        // caught in the first case.
-        if (IsLockExpired(txn_lock_info.txn_ids[0], lock_info, env,
-                          expire_time)) {
-          // lock is expired, can steal it
-          lock_info.txn_ids = txn_lock_info.txn_ids;
-          lock_info.exclusive = txn_lock_info.exclusive;
-          lock_info.expiration_time = txn_lock_info.expiration_time;
-          // lock_cnt does not change
-        } else {
-          result = Status::TimedOut(Status::SubCode::kLockTimeout);
-          for (auto id : lock_info.txn_ids) {
-            // A transaction is not blocked by itself
-            if (id != txn_lock_info.txn_ids[0]) {
-              txn_ids->push_back(id);
-            }
-          }
-        }
-      }
-    } else {
-      // We are requesting shared access to a shared lock, so just grant it.
-      lock_info.txn_ids.push_back(txn_lock_info.txn_ids[0]);
-      // Using std::max means that expiration time never goes down even when
-      // a transaction is removed from the list. The correct solution would be
-      // to track expiry for every transaction, but this would also work for
-      // now.
-      lock_info.expiration_time =
-          std::max(lock_info.expiration_time, txn_lock_info.expiration_time);
-    }
-  } else {  // Lock not held.
-    // Check lock limit
-    if (max_num_locks_ > 0 &&
-        lock_map->lock_cnt.load(std::memory_order_acquire) >= max_num_locks_) {
-      result = Status::LockLimit();
-    } else {
-      // acquire lock
-      stripe->keys.emplace(key, txn_lock_info);
-
-      // Maintain lock count if there is a limit on the number of locks
-      if (max_num_locks_) {
-        lock_map->lock_cnt++;
-      }
-    }
-  }
-
-  return result;
-}
-
-void PointLockManager::UnLockKey(PessimisticTransaction* txn,
-                                 const std::string& key, LockMapStripe* stripe,
-                                 LockMap* lock_map, Env* env) {
-#ifdef NDEBUG
-  (void)env;
-#endif
-  TransactionID txn_id = txn->GetID();
-
-  auto stripe_iter = stripe->keys.find(key);
-  if (stripe_iter != stripe->keys.end()) {
-    auto& txns = stripe_iter->second.txn_ids;
-    auto txn_it = std::find(txns.begin(), txns.end(), txn_id);
-    // Found the key we locked.  unlock it.
-    if (txn_it != txns.end()) {
-      if (txns.size() == 1) {
-        stripe->keys.erase(stripe_iter);
-      } else {
-        auto last_it = txns.end() - 1;
-        if (txn_it != last_it) {
-          *txn_it = *last_it;
-        }
-        txns.pop_back();
-      }
-
-      if (max_num_locks_ > 0) {
-        // Maintain lock count if there is a limit on the number of locks.
-        assert(lock_map->lock_cnt.load(std::memory_order_relaxed) > 0);
-        lock_map->lock_cnt--;
-      }
-    }
-  } else {
-    // This key is either not locked or locked by someone else.  This should
-    // only happen if the unlocking transaction has expired.
-    assert(txn->GetExpirationTime() > 0 &&
-           txn->GetExpirationTime() < env->NowMicros());
-  }
-}
-
-void PointLockManager::UnLock(PessimisticTransaction* txn,
-                              ColumnFamilyId column_family_id,
-                              const std::string& key, Env* env) {
-  std::shared_ptr<LockMap> lock_map_ptr = GetLockMap(column_family_id);
-  LockMap* lock_map = lock_map_ptr.get();
-  if (lock_map == nullptr) {
-    // Column Family must have been dropped.
-    return;
-  }
-
-  // Lock the mutex for the stripe that this key hashes to
-  size_t stripe_num = lock_map->GetStripe(key);
-  assert(lock_map->lock_map_stripes_.size() > stripe_num);
-  LockMapStripe* stripe = lock_map->lock_map_stripes_.at(stripe_num);
-
-  stripe->stripe_mutex->Lock().PermitUncheckedError();
+  stripe->stripe_mutex->Lock().AssertOK();
   UnLockKey(txn, key, stripe, lock_map, env);
   stripe->stripe_mutex->UnLock();
 
@@ -659,7 +991,7 @@ void PointLockManager::UnLock(PessimisticTransaction* txn,
       assert(lock_map->lock_map_stripes_.size() > stripe_num);
       LockMapStripe* stripe = lock_map->lock_map_stripes_.at(stripe_num);
 
-      stripe->stripe_mutex->Lock().PermitUncheckedError();
+      stripe->stripe_mutex->Lock().AssertOK();
 
       for (const std::string* key : stripe_keys) {
         UnLockKey(txn, *key, stripe, lock_map, env);
@@ -690,7 +1022,7 @@ PointLockManager::PointLockStatus PointLockManager::GetPointLockStatus() {
     const auto& stripes = lock_maps_[i]->lock_map_stripes_;
     // Iterate and lock all stripes in ascending order.
     for (const auto& j : stripes) {
-      j->stripe_mutex->Lock().PermitUncheckedError();
+      j->stripe_mutex->Lock().AssertOK();
       for (const auto& it : j->keys) {
         struct KeyLockInfo info;
         info.exclusive = it.second.exclusive;
@@ -742,4 +1074,758 @@ void PointLockManager::UnLock(PessimisticTransaction* /* txn */,
   // no-op
 }
 
+// PerKeyPointLockManager implementation
+PerKeyPointLockManager::PerKeyPointLockManager(PessimisticTransactionDB* db,
+                                               const TransactionDBOptions& opt)
+    : PointLockManager(db, opt) {}
+
+void DebugLockStatus(TransactionID my_txn_id, const LockInfo& lock_info,
+                     const std::string& key,
+                     const KeyLockWaiterContext& key_lock_waiter_ctx) {
+  if (kDebugLog) {
+    char msg[512];
+    size_t offset = 0;
+
+    // print lock holders
+    offset += snprintf(msg + offset, sizeof(msg),
+                       "Txn %" PRIu64 ": LockStatus key %s: holder [",
+                       my_txn_id, key.c_str());
+    for (const auto& txn_id : lock_info.txn_ids) {
+      offset += snprintf(msg + offset, sizeof(msg), "%s%" PRIu64 ",",
+                         lock_info.exclusive ? "X" : "S", txn_id);
+    }
+
+    // print waiter queue
+    offset += snprintf(msg + offset, sizeof(msg), "], waiter_queue [");
+    for (auto it = key_lock_waiter_ctx.waiter_queue->begin();
+         it != key_lock_waiter_ctx.waiter_queue->end(); it++) {
+      offset += snprintf(msg + offset, sizeof(msg), "%s%" PRIu64 ",",
+                         (*it)->exclusive ? "X" : "S", (*it)->id);
+    }
+
+    offset += snprintf(msg + offset, sizeof(msg), "]\n");
+    fprintf(stderr, "%s", msg);
+    fflush(stderr);
+  }
+}
+
+int64_t PerKeyPointLockManager::CalculateWaitEndTime(int64_t expire_time_hint,
+                                                     int64_t end_time) {
+  int64_t cv_end_time = -1;
+  if (expire_time_hint > 0 && end_time > 0) {
+    cv_end_time = std::min(expire_time_hint, end_time);
+  } else if (expire_time_hint > 0) {
+    cv_end_time = expire_time_hint;
+  } else if (end_time > 0) {
+    cv_end_time = end_time;
+  }
+  return cv_end_time;
+}
+
+// Acquire lock within timeout.
+// This function is similar to PointLockManger::AcquireWithTimeout with
+// following differences.
+//
+// If deadlock_timeout_us is not 0, it first performs a wait without doing dead
+// lock detection. This wait duration is specified by deadlock_timeout_us.
+// If this wait times out and it is still not able to acquire the lock, perform
+// the deadlock detection before wait again.
+//
+// It uses a per key lock waiter queue to handle lock waiting and wake up
+// efficiently. When a transaction is waiting for acquiring a lock on a key, it
+// joins a wait queue that is dedicated for this key. It will either timeout, or
+// get woken up when it is its turn to take the lock. This is more efficient
+// than the PointLockManger implementation where all lock waiters wait on the
+// same lock stripe cond var.
+Status PerKeyPointLockManager::AcquireWithTimeout(
+    PessimisticTransaction* txn, LockMap* lock_map, LockMapStripe* stripe,
+    ColumnFamilyId column_family_id, const std::string& key, Env* env,
+    int64_t timeout, int64_t deadlock_timeout_us,
+    const LockInfo& txn_lock_info) {
+  Status result;
+  uint64_t end_time = 0;
+  auto my_txn_id = txn_lock_info.txn_ids[0];
+
+  if (timeout > 0) {
+    uint64_t start_time = env->NowMicros();
+    end_time = start_time + timeout;
+  }
+
+  if (timeout < 0) {
+    // If timeout is negative, we wait indefinitely to acquire the lock
+    result = stripe->stripe_mutex->Lock();
+  } else {
+    result = stripe->stripe_mutex->TryLockFor(timeout);
+  }
+
+  if (!result.ok()) {
+    // failed to acquire mutex
+    return result;
+  }
+
+  // Acquire lock if we are able to
+  uint64_t expire_time_hint = 0;
+  autovector<TransactionID> wait_ids;
+  bool isUpgrade = false;
+
+  auto lock_info = stripe->GetLockInfo(key);
+
+  auto wait_before_deadlock_detection =
+      txn->IsDeadlockDetect() && (deadlock_timeout_us > 0);
+  result = AcquireLocked(
+      lock_map, stripe, key, env, txn_lock_info, &expire_time_hint,
+      // If wait before deadlock detection, it executes a fast path to save CPU
+      // cycles, wait ids are not collected.
+      wait_before_deadlock_detection ? nullptr : &wait_ids, &lock_info,
+      &isUpgrade, true);
+  if (!result.ok() && timeout != 0 &&
+      /* No need to retry after reach lock limit or aborted */
+      !result.IsLockLimit() && !result.IsAborted()) {
+    assert(lock_info);
+
+    PERF_TIMER_GUARD(key_lock_wait_time);
+    PERF_COUNTER_ADD(key_lock_wait_count, 1);
+    // If we weren't able to acquire the lock, we will keep retrying as long
+    // as the timeout allows.
+    bool timed_out = false;
+    bool cv_wait_fail = false;
+
+    KeyLockWaiterContext key_lock_waiter_ctx;
+
+    // Decide how long to wait
+    auto cv_end_time = CalculateWaitEndTime(expire_time_hint, end_time);
+
+    // We will try to wait a little bit before checking deadlock, as
+    // deadlock check is expensive.
+    if (wait_before_deadlock_detection) {
+      int64_t now = env->NowMicros();
+      if (cv_end_time < 0 || cv_end_time > now) {
+        if (kDebugLog) {
+          // print lock status before deadlock detection
+          fprintf(stderr,
+                  "Txn %" PRIu64
+                  " wait before deadlock detection %s, exclusive lock "
+                  "%d\n",
+                  my_txn_id, key.c_str(), txn_lock_info.exclusive);
+          fflush(stderr);
+        }
+        stripe->JoinWaitQueue(*lock_info, my_txn_id, txn_lock_info.exclusive,
+                              false, key_lock_waiter_ctx);
+        DebugLockStatus(my_txn_id, *lock_info, key, key_lock_waiter_ctx);
+
+        TEST_SYNC_POINT(
+            "PerKeyPointLockManager::AcquireWithTimeout:"
+            "WaitingTxnBeforeDeadLockDetection");
+        result = stripe->WaitOnLock(
+            key_lock_waiter_ctx.lock_waiter,
+            std::min(cv_end_time - now, (int64_t)deadlock_timeout_us));
+        assert(result.ok() || result.IsTimedOut());
+        // Refresh lock info pointer, as this pointer is not guaranteed to be
+        // stable in folly
+        lock_info = stripe->GetLockInfo(key);
+        // try to take a lock again to get wait ids after deadlock timeout
+        result = AcquireLocked(lock_map, stripe, key, env, txn_lock_info,
+                               &expire_time_hint, &wait_ids, &lock_info,
+                               &isUpgrade, !result.ok());
+      } else {
+        // Already timed out
+        timed_out = true;
+        result = Status::TimedOut(Status::SubCode::kLockTimeout);
+      }
+    }
+
+    while (!result.ok() && !timed_out && !result.IsAborted()) {
+      // Refresh wait end time
+      cv_end_time = CalculateWaitEndTime(expire_time_hint, end_time);
+
+      // We are dependent on a transaction to finish, so perform deadlock
+      // detection.
+      if (!wait_ids.empty()) {
+        if (txn->IsDeadlockDetect()) {
+          if (IncrementWaiters(txn, wait_ids, key, column_family_id,
+                               txn_lock_info.exclusive, env)) {
+            result = Status::Busy(Status::SubCode::kDeadlock);
+            break;
+          }
+        }
+        txn->SetWaitingTxn(wait_ids, column_family_id, &key);
+      }
+
+      TEST_SYNC_POINT("PointLockManager::AcquireWithTimeout:WaitingTxn");
+
+      if (kDebugLog) {
+        // print transaction lock status and wait ids
+        char msg[512];
+        size_t offset = 0;
+        offset += snprintf(msg + offset, sizeof(msg),
+                           "Txn %" PRIu64
+                           " wait after deadlock detection %s, exclusive lock "
+                           "%d, upgrade %d, wait_ids [",
+                           my_txn_id, key.c_str(), txn_lock_info.exclusive,
+                           isUpgrade);
+
+        for (auto it = wait_ids.begin(); it != wait_ids.end(); it++) {
+          offset += snprintf(msg + offset, sizeof(msg), "%" PRIu64 ",", *it);
+        }
+
+        offset += snprintf(msg + offset, sizeof(msg), "]\n");
+
+        fprintf(stderr, "%s", msg);
+        fflush(stderr);
+      }
+
+      // If it has not joined wait queue, join it now.
+      // If it is a lock upgrade, rejoin it.
+      if (isUpgrade || (key_lock_waiter_ctx.waiter_queue == nullptr)) {
+        stripe->JoinWaitQueue(*lock_info, my_txn_id, txn_lock_info.exclusive,
+                              isUpgrade, key_lock_waiter_ctx);
+
+        DebugLockStatus(my_txn_id, *lock_info, key, key_lock_waiter_ctx);
+      }
+
+      int64_t now = 0;
+      if (cv_end_time < 0) {
+        // Wait indefinitely
+        result = stripe->WaitOnLock(key_lock_waiter_ctx.lock_waiter);
+        cv_wait_fail = !result.ok();
+      } else {
+        now = env->NowMicros();
+        if (cv_end_time > now) {
+          result = stripe->WaitOnLock(key_lock_waiter_ctx.lock_waiter,
+                                      cv_end_time - now);
+
+          cv_wait_fail = !result.ok() && !result.IsTimedOut();
+        } else {
+          // now >= cv_end_time, we already timed out
+          result = Status::TimedOut(Status::SubCode::kLockTimeout);
+        }
+      }
+
+#ifndef NDEBUG
+      stripe->stripe_mutex->UnLock();
+      TEST_SYNC_POINT_CALLBACK(
+          "PerKeyPointLockManager::AcquireWithTimeout:AfterWokenUp",
+          &my_txn_id);
+      TEST_SYNC_POINT(
+          "PerKeyPointLockManager::AcquireWithTimeout:BeforeTakeLock");
+      auto lock_status = stripe->stripe_mutex->Lock();
+      assert(lock_status.ok());
+#endif
+
+      if (!wait_ids.empty()) {
+        txn->ClearWaitingTxn();
+        if (txn->IsDeadlockDetect()) {
+          DecrementWaiters(txn, wait_ids);
+        }
+      }
+
+      if (cv_wait_fail) {
+        break;
+      }
+
+      if (result.IsTimedOut()) {
+        timed_out = true;
+        // Even though we timed out, we will still make one more attempt to
+        // acquire lock below (it is possible the lock expired and we
+        // were never signaled).
+      }
+      assert(result.ok() || result.IsTimedOut());
+
+      // Refresh lock info pointer, as this pointer is not guaranteed to be
+      // stable in folly
+      lock_info = stripe->GetLockInfo(key);
+
+      // Try to get the lock again.
+      result = AcquireLocked(
+          lock_map, stripe, key, env, txn_lock_info, &expire_time_hint,
+          &wait_ids, &lock_info, &isUpgrade,
+          /* If wait is timed out, it means it is not its turn to take the lock.
+           * Therefore, it should still follow FIFO order. */
+          timed_out);
+      auto fail_to_take_lock_on_its_turn = !timed_out && !result.ok();
+      if (fail_to_take_lock_on_its_turn) {
+        // If it is its turn, but it failed to take lock, something is broken.
+        // Assert this should not happen in debug build during testing.
+        // In prod, it simply gives up the attempt.
+        assert(!fail_to_take_lock_on_its_turn);
+        break;
+      }
+
+      if (!result.ok() && cv_end_time >= 0) {
+        if (static_cast<int64_t>(end_time) <= now) {
+          // lock timeout timed out
+          result = Status::TimedOut(Status::SubCode::kLockTimeout);
+          timed_out = true;
+        }
+      }
+    }
+
+    // For any reason that the transaction failed to acquire the lock, it should
+    // try to wake up next waiters, if they are ready to proceed.
+    if (!result.ok()) {
+      key_lock_waiter_ctx.TryWakeUpNextWaiters(*lock_info, key);
+    }
+  }
+
+  stripe->stripe_mutex->UnLock();
+
+  // On timeout, persist the lock information so we can debug the contention
+  if (result.IsTimedOut()) {
+    txn->SetWaitingTxn(wait_ids, column_family_id, &key, true);
+  }
+
+  return result;
+}
+
+Status PerKeyPointLockManager::FillWaitIds(LockInfo& lock_info,
+                                           const LockInfo& txn_lock_info,
+                                           autovector<TransactionID>* wait_ids,
+                                           bool& isUpgrade,
+                                           TransactionID& my_txn_id,
+                                           const std::string& key) {
+  if (wait_ids != nullptr) {
+    for (auto id : lock_info.txn_ids) {
+      // A transaction is not blocked by itself
+      if (id != my_txn_id) {
+        wait_ids->push_back(id);
+      } else {
+        // Itself is already holding a lock, so it is either an upgrade or
+        // downgrade. Downgrade has already been handled above. Assert it
+        // is an upgrade here.
+        auto is_upgrade = !lock_info.exclusive && txn_lock_info.exclusive;
+        if (!is_upgrade) {
+          if (kDebugLog) {
+            fprintf(stderr,
+                    "txn id %" PRIu64 " assert failed on lock upgrade key %s\n",
+                    my_txn_id, key.c_str());
+            fflush(stderr);
+          }
+          assert(is_upgrade);
+          return Status::Aborted(Status::SubCode::kNotExpectedCodePath);
+        }
+        isUpgrade = true;
+      }
+    }
+  }
+  return Status::OK();
+}
+
+// This function is similar to PointLockManager::AcquireLocked with following
+// differences.
+//
+// It introduces a per key lock waiter queue. When it tries to take the lock, it
+// will first check whether there are other transactions already in the waiter
+// queue, if so it will return TimeOut. Caller will join the waiter queue, if
+// lock timeout is not reached yet. When it is its to take the lock, it will be
+// woken up and take the lock.
+//
+// It introduces a fast path check that will quickly check whether the lock
+// could be obtained without gathering waiter id information. This allows
+// transaction to sleep a short time before perform deadlock detection.
+//
+// @param lock_info_ptr: pointer to the LockInfo associated with the key. If the
+//    key is already locked, LockInfo will be not null. If not, LockInfo is
+//    null, and a new LockInfo is created and assigned to lock_info_ptr.
+//
+// @param wait_ids: When wait_ids is nullptr, it perform a fast path check to
+//    see whether it could take the lock, it does not fill waiter_ids. If
+//    wait_ids is not nullptr, it will fill the wait_ids with the lock holder.
+//
+// @param isUpgrade: isUpgrade is set to true, if the transaction tries to
+//    uprade a lock to exclusive, but it needs to wait for other lock holders to
+//    release the shared locks. Note that isUpgrade is not set on fast path
+//    check.
+//
+// @param fifo: fifo flag indicates whether it should follow fifo order to check
+//    whether there is already a waiter waiting for the lock or not. If fifo is
+//    true and there is already a lock waiter waiting in the queue and it is not
+//    itself, return TimedOut. If fifo is false, it means it is its turn to take
+//    the lock.
+Status PerKeyPointLockManager::AcquireLocked(
+    LockMap* lock_map, LockMapStripe* stripe, const std::string& key, Env* env,
+    const LockInfo& txn_lock_info, uint64_t* expire_time,
+    autovector<TransactionID>* wait_ids, LockInfo** lock_info_ptr,
+    bool* isUpgrade, bool fifo) {
+  assert(txn_lock_info.txn_ids.size() == 1);
+
+  if (wait_ids != nullptr) {
+    wait_ids->clear();
+  }
+
+  *isUpgrade = false;
+  auto my_txn_id = txn_lock_info.txn_ids[0];
+
+  if (!*lock_info_ptr) {
+    // No lock nor waiter on this key, so it can try to acquire the lock
+    // directly
+    if (max_num_locks_ > 0 &&
+        lock_map->locked_key_cnt.LoadRelaxed() >= max_num_locks_) {
+      return Status::LockLimit();
+    } else {
+      // acquire lock
+      auto ret = stripe->keys.try_emplace(key, my_txn_id,
+                                          txn_lock_info.expiration_time,
+                                          txn_lock_info.exclusive);
+      assert(ret.second);
+      *lock_info_ptr = &(ret.first->second);
+
+      // Maintain lock count if there is a limit on the number of locks
+      if (max_num_locks_ > 0) {
+        lock_map->locked_key_cnt.FetchAddRelaxed(1);
+      }
+
+      return Status::OK();
+    }
+  }
+
+  auto& lock_info = **lock_info_ptr;
+  auto locked = !lock_info.txn_ids.empty();
+  auto solo_lock_owner =
+      (lock_info.txn_ids.size() == 1) && (lock_info.txn_ids[0] == my_txn_id);
+
+  // Handle lock downgrade and reentrant first, it should always succeed
+  if (locked) {
+    if (solo_lock_owner) {
+      // Lock is already owned by itself.
+      if (lock_info.exclusive && !txn_lock_info.exclusive) {
+        // For downgrade, wake up all the shared lock waiters at the front of
+        // the waiter queue
+        if (lock_info.waiter_queue != nullptr) {
+          for (auto& waiter : *lock_info.waiter_queue) {
+            if (waiter->exclusive) {
+              break;
+            }
+            waiter->Notify();
+            DebugWakeUpWaiter(my_txn_id, waiter->id, key, "Lock Downgrade");
+          }
+        }
+      }
+
+      if (lock_info.exclusive || !txn_lock_info.exclusive) {
+        // If it is lock downgrade or re-entrant, grant it immediately
+        lock_info.exclusive = txn_lock_info.exclusive;
+        lock_info.expiration_time = txn_lock_info.expiration_time;
+        return Status::OK();
+      }
+    } else {
+      // handle read reentrant lock for non solo lock owner case
+      // Check whether the transaction already hold a shared lock and it is
+      // trying to acquire it again.
+      if (!txn_lock_info.exclusive && !lock_info.exclusive) {
+        auto lock_it = std::find(lock_info.txn_ids.begin(),
+                                 lock_info.txn_ids.end(), my_txn_id);
+        if (lock_it != lock_info.txn_ids.end()) {
+          lock_info.expiration_time = std::max(lock_info.expiration_time,
+                                               txn_lock_info.expiration_time);
+          return Status::OK();
+        }
+      }
+    }
+  }
+
+  auto has_waiter =
+      (lock_info.waiter_queue != nullptr) && !lock_info.waiter_queue->empty();
+
+  // Update solo lock owner for the rest of the cases
+  if (solo_lock_owner) {
+    // If there is a shared lock waiter that is ready to take the lock, the
+    // current transaction would not be the solo lock owner.
+    auto has_ready_shared_lock_waiter =
+        has_waiter && lock_info.waiter_queue->front()->IsReady() &&
+        (!lock_info.waiter_queue->front()->exclusive);
+    solo_lock_owner = !has_ready_shared_lock_waiter;
+  }
+
+  // If myself is the first waiter in the queue, skip checking waiter queue
+  auto is_first_waiter =
+      has_waiter && (lock_info.waiter_queue->front()->id == my_txn_id);
+
+  if (fifo && has_waiter && !is_first_waiter) {
+    // There are other waiters ahead of myself
+    {
+      // handle shared lock request on a shared lock with only shared lock
+      // waiters
+      if (!txn_lock_info.exclusive &&
+          (!locked || (locked && !lock_info.exclusive))) {
+        bool has_exclusive_waiter = false;
+        // check whether there is exclusive lock waiter
+        for (auto& waiter : *lock_info.waiter_queue) {
+          if (waiter->exclusive) {
+            has_exclusive_waiter = true;
+            break;
+          }
+        }
+        if (!has_exclusive_waiter) {
+          // no X waiter in the queue, so it can acquire the lock without
+          // waiting
+          lock_info.txn_ids.push_back(my_txn_id);
+          lock_info.exclusive = false;
+          lock_info.expiration_time = std::max(lock_info.expiration_time,
+                                               txn_lock_info.expiration_time);
+          return Status::OK();
+        }
+      }
+    }
+
+    // fast path check for lock upgrade
+    if (solo_lock_owner && !lock_info.exclusive && txn_lock_info.exclusive) {
+      // During lock upgrade, if it is the only transaction owns the lock and no
+      // other shared lock requesting transaction is ready to take the lock,
+      // prioritize the lock grade and grant it now.
+      lock_info.exclusive = txn_lock_info.exclusive;
+      lock_info.expiration_time = txn_lock_info.expiration_time;
+      return Status::OK();
+    }
+
+    if (wait_ids == nullptr) {
+      // If wait_ids is nullptr, it is a fast path check to see whether it is
+      // able to take the lock or not, skip filling the waiting txn ids for
+      // deadlock detection.
+      return Status::TimedOut(Status::SubCode::kLockTimeout);
+    }
+
+    // For other cases with fifo and lock waiter, try to wait in the queue
+    // and fill the waiting txn list
+    auto s = FillWaitIds(lock_info, txn_lock_info, wait_ids, *isUpgrade,
+                         my_txn_id, key);
+    if (!s.ok()) {
+      // propagate error up
+      return s;
+    }
+
+    // Add the waiter txn ids to the blocking txn id list
+    if (txn_lock_info.exclusive) {
+      // For exclusive lock, it traverse the queue from front to back to
+      // handle upgrade
+      for (auto& waiter : *lock_info.waiter_queue) {
+        // For upgrade locks, it will be placed at the beginning of
+        // the queue. However, for shared lock waiters that are at
+        // the beginning of the queue that got woken up but haven't
+        // taken the lock yet, they should still be added to the
+        // blocking txn id list.
+        if (*isUpgrade && waiter->exclusive) {
+          break;
+        }
+        if (waiter->id != my_txn_id) {
+          wait_ids->push_back(waiter->id);
+        }
+      }
+    } else {
+      // For shared lock, skip the S lock waiters at the end of the queue, as
+      // they will be waked up together. Therefore, it traverses the queue from
+      // from back to front.
+      bool skip_shared_lock_waiter = true;
+      for (auto it = lock_info.waiter_queue->rbegin();
+           it != lock_info.waiter_queue->rend(); ++it) {
+        if ((*it)->exclusive) {
+          skip_shared_lock_waiter = false;
+        } else {
+          if (skip_shared_lock_waiter) {
+            continue;
+          }
+        }
+        if ((*it)->id != my_txn_id) {
+          wait_ids->push_back((*it)->id);
+        }
+      }
+    }
+
+    return Status::TimedOut(Status::SubCode::kLockTimeout);
+  } else {
+    // there is no waiter or it is its turn to take the lock
+    if (!locked) {
+      // no lock on this key, acquire it directly
+      lock_info.txn_ids = txn_lock_info.txn_ids;
+      lock_info.exclusive = txn_lock_info.exclusive;
+      lock_info.expiration_time = txn_lock_info.expiration_time;
+      return Status::OK();
+    }
+
+    if (IsLockExpired(my_txn_id, lock_info, env, expire_time)) {
+      // current lock is expired, steal it.
+      lock_info.txn_ids = txn_lock_info.txn_ids;
+      lock_info.exclusive = txn_lock_info.exclusive;
+      lock_info.expiration_time = txn_lock_info.expiration_time;
+      return Status::OK();
+    }
+
+    // Check lock compatibility
+    if (txn_lock_info.exclusive) {
+      // handle lock upgrade
+      if (solo_lock_owner) {
+        // Lock re-entrant or downgrade has already been handled above.
+        // Assert it is an upgrade here. Acquire the lock directly.
+        assert(!lock_info.exclusive);
+        lock_info.exclusive = txn_lock_info.exclusive;
+        lock_info.expiration_time = txn_lock_info.expiration_time;
+        return Status::OK();
+      } else {
+        // lock is already owned by other transactions
+        auto s = FillWaitIds(lock_info, txn_lock_info, wait_ids, *isUpgrade,
+                             my_txn_id, key);
+        if (!s.ok()) {
+          // propagate error up
+          return s;
+        }
+        return Status::TimedOut(Status::SubCode::kLockTimeout);
+      }
+    } else {
+      // handle shared lock request
+      if (lock_info.exclusive) {
+        // lock is already owned by other exclusive lock
+        auto s = FillWaitIds(lock_info, txn_lock_info, wait_ids, *isUpgrade,
+                             my_txn_id, key);
+        if (!s.ok()) {
+          // propagate error up
+          return s;
+        }
+        return Status::TimedOut(Status::SubCode::kLockTimeout);
+      } else {
+        // lock is on shared lock state, acquire it
+        lock_info.txn_ids.push_back(my_txn_id);
+        // update the expiration time
+        lock_info.expiration_time =
+            std::max(lock_info.expiration_time, txn_lock_info.expiration_time);
+        return Status::OK();
+      }
+    }
+  }
+}
+
+void PerKeyPointLockManager::UnLockKey(PessimisticTransaction* txn,
+                                       const std::string& key,
+                                       LockMapStripe* stripe, LockMap* lock_map,
+                                       Env* env) {
+#ifdef NDEBUG
+  (void)env;
+#endif
+  TransactionID txn_id = txn->GetID();
+
+  auto stripe_iter = stripe->keys.find(key);
+  if (stripe_iter != stripe->keys.end()) {
+    auto& lock_info = stripe_iter->second;
+    auto& txns = lock_info.txn_ids;
+    auto txn_it = std::find(txns.begin(), txns.end(), txn_id);
+
+    if (txn_it != txns.end()) {
+      // If the lock was held in exclusive mode, only one transaction should
+      // holding it.
+      if (lock_info.exclusive) {
+        assert(txns.size() == 1);
+        stripe->ReleaseLastLockHolder(lock_info, stripe_iter, lock_map, txn_id,
+                                      key, max_num_locks_, txns, txn_it);
+      } else {
+        // In shared mode, it is possible that another transaction is holding
+        // a shared lock and is waiting to upgrade the lock to exclusive.
+        assert(txns.size() >= 1);
+        if (txns.size() > 2) {
+          // Including the current transaction, if there are more than 2
+          // transactions holding the lock in shared mode, don't wake up any
+          // waiter, as the next waiter will not be able to acquire the lock
+          // anyway.
+          RemoveTransaction(txns, txn_it);
+        } else if (txns.size() == 2) {
+          // remove the current transaction first.
+          RemoveTransaction(txns, txn_it);
+          // Check whether the one remained is trying to upgrade the lock by
+          // checking whether its id matches.
+          auto& waiter_queue = lock_info.waiter_queue;
+          if (waiter_queue != nullptr && !waiter_queue->empty() &&
+              waiter_queue->front()->id == txns[0]) {
+            // There are waiters in the queue and the next one is same as the
+            // only one that is still holding the shared lock, wake the waiter
+            // up
+            waiter_queue->front()->Notify();
+            DebugWakeUpWaiter(txn_id, waiter_queue->front()->id, key,
+                              "Lock Upgrade");
+          }
+        } else {
+          // Current transaction is the only one holding the shared lock
+          stripe->ReleaseLastLockHolder(lock_info, stripe_iter, lock_map,
+                                        txn_id, key, max_num_locks_, txns,
+                                        txn_it);
+        }
+      }
+    }
+  } else {
+    // This key is either not locked or locked by someone else.  This should
+    // only happen if the unlocking transaction has expired.
+    assert(txn->GetExpirationTime() > 0 &&
+           txn->GetExpirationTime() < env->NowMicros());
+  }
+}
+
+void PerKeyPointLockManager::UnLock(PessimisticTransaction* txn,
+                                    ColumnFamilyId column_family_id,
+                                    const std::string& key, Env* env) {
+  std::shared_ptr<LockMap> lock_map_ptr = GetLockMap(column_family_id);
+  LockMap* lock_map = lock_map_ptr.get();
+  if (lock_map == nullptr) {
+    // Column Family must have been dropped.
+    return;
+  }
+
+  // Lock the mutex for the stripe that this key hashes to
+  size_t stripe_num = lock_map->GetStripe(key);
+  assert(lock_map->lock_map_stripes_.size() > stripe_num);
+  LockMapStripe* stripe = lock_map->lock_map_stripes_.at(stripe_num);
+
+  stripe->stripe_mutex->Lock().AssertOK();
+  UnLockKey(txn, key, stripe, lock_map, env);
+  stripe->stripe_mutex->UnLock();
+}
+
+void PerKeyPointLockManager::UnLock(PessimisticTransaction* txn,
+                                    const LockTracker& tracker, Env* env) {
+  std::unique_ptr<LockTracker::ColumnFamilyIterator> cf_it(
+      tracker.GetColumnFamilyIterator());
+  assert(cf_it != nullptr);
+  while (cf_it->HasNext()) {
+    ColumnFamilyId cf = cf_it->Next();
+    std::shared_ptr<LockMap> lock_map_ptr = GetLockMap(cf);
+    LockMap* lock_map = lock_map_ptr.get();
+    if (!lock_map) {
+      // Column Family must have been dropped.
+      return;
+    }
+
+    // Bucket keys by lock_map_ stripe
+    UnorderedMap<size_t, std::vector<const std::string*>> keys_by_stripe(
+        lock_map->num_stripes_);
+    std::unique_ptr<LockTracker::KeyIterator> key_it(
+        tracker.GetKeyIterator(cf));
+    assert(key_it != nullptr);
+    while (key_it->HasNext()) {
+      const std::string& key = key_it->Next();
+      size_t stripe_num = lock_map->GetStripe(key);
+      keys_by_stripe[stripe_num].push_back(&key);
+    }
+
+    // For each stripe, grab the stripe mutex and unlock all keys in this
+    // stripe
+    for (auto& stripe_iter : keys_by_stripe) {
+      size_t stripe_num = stripe_iter.first;
+      auto& stripe_keys = stripe_iter.second;
+
+      assert(lock_map->lock_map_stripes_.size() > stripe_num);
+      LockMapStripe* stripe = lock_map->lock_map_stripes_.at(stripe_num);
+
+      stripe->stripe_mutex->Lock().AssertOK();
+
+      for (const std::string* key : stripe_keys) {
+        UnLockKey(txn, *key, stripe, lock_map, env);
+      }
+
+      stripe->stripe_mutex->UnLock();
+    }
+  }
+}
+
+void PerKeyPointLockManager::UnLock(PessimisticTransaction* /* txn */,
+                                    ColumnFamilyId /* cf_id */,
+                                    const Endpoint& /* start */,
+                                    const Endpoint& /* end */, Env* /* env */) {
+  // no-op
+}
+
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/utilities/transactions/lock/point/point_lock_manager.h b/utilities/transactions/lock/point/point_lock_manager.h
index c93006df7354..1fa8e7a78a3f 100644
--- a/utilities/transactions/lock/point/point_lock_manager.h
+++ b/utilities/transactions/lock/point/point_lock_manager.h
@@ -132,8 +132,12 @@ class PointLockManager : public LockManager {
   // this column family is no longer in use.
   void RemoveColumnFamily(const ColumnFamilyHandle* cf) override;
 
+  // Caller makes sure that a lock on the key is not requested again, unless it
+  // is an upgrade or downgrade.
   Status TryLock(PessimisticTransaction* txn, ColumnFamilyId column_family_id,
                  const std::string& key, Env* env, bool exclusive) override;
+  // Caller makes sure that a lock on the key is not requested again, unless it
+  // is an upgrade or downgrade.
   Status TryLock(PessimisticTransaction* txn, ColumnFamilyId column_family_id,
                  const Endpoint& start, const Endpoint& end, Env* env,
                  bool exclusive) override;
@@ -153,7 +157,7 @@ class PointLockManager : public LockManager {
 
   void Resize(uint32_t new_size) override;
 
- private:
+ protected:
   PessimisticTransactionDB* txn_db_impl_;
 
   // Default number of lock map stripes per column family
@@ -179,6 +183,11 @@ class PointLockManager : public LockManager {
   // to avoid acquiring a mutex in order to look up a LockMap
   std::unique_ptr<ThreadLocalPtr> lock_maps_cache_;
 
+  // Thread local variable for KeyLockWaiter. As one thread could only need one
+  // KeyLockWaiter.
+  // Lazy init on first time usage
+  ThreadLocalPtr key_lock_waiter_;
+
   // Must be held when modifying wait_txn_map_ and rev_wait_txn_map_.
   std::mutex wait_txn_map_mutex_;
 
@@ -196,18 +205,13 @@ class PointLockManager : public LockManager {
 
   std::shared_ptr<LockMap> GetLockMap(uint32_t column_family_id);
 
-  Status AcquireWithTimeout(PessimisticTransaction* txn, LockMap* lock_map,
-                            LockMapStripe* stripe, uint32_t column_family_id,
-                            const std::string& key, Env* env, int64_t timeout,
-                            const LockInfo& lock_info);
+  virtual Status AcquireWithTimeout(
+      PessimisticTransaction* txn, LockMap* lock_map, LockMapStripe* stripe,
+      uint32_t column_family_id, const std::string& key, Env* env,
+      int64_t timeout, int64_t deadlock_timeout_us, const LockInfo& lock_info);
 
-  Status AcquireLocked(LockMap* lock_map, LockMapStripe* stripe,
-                       const std::string& key, Env* env,
-                       const LockInfo& lock_info, uint64_t* wait_time,
-                       autovector<TransactionID>* txn_ids);
-
-  void UnLockKey(PessimisticTransaction* txn, const std::string& key,
-                 LockMapStripe* stripe, LockMap* lock_map, Env* env);
+  virtual void UnLockKey(PessimisticTransaction* txn, const std::string& key,
+                         LockMapStripe* stripe, LockMap* lock_map, Env* env);
 
   // Returns true if a deadlock is detected.
   // Will DecrementWaiters() if a deadlock is detected.
@@ -219,6 +223,56 @@ class PointLockManager : public LockManager {
                         const autovector<TransactionID>& wait_ids);
   void DecrementWaitersImpl(const PessimisticTransaction* txn,
                             const autovector<TransactionID>& wait_ids);
+
+ private:
+  Status AcquireLocked(LockMap* lock_map, LockMapStripe* stripe,
+                       const std::string& key, Env* env,
+                       const LockInfo& lock_info, uint64_t* wait_time,
+                       autovector<TransactionID>* txn_ids);
+};
+
+class PerKeyPointLockManager : public PointLockManager {
+ public:
+  PerKeyPointLockManager(PessimisticTransactionDB* db,
+                         const TransactionDBOptions& opt);
+  // No copying allowed
+  PerKeyPointLockManager(const PerKeyPointLockManager&) = delete;
+  PerKeyPointLockManager& operator=(const PerKeyPointLockManager&) = delete;
+  // No move allowed
+  PerKeyPointLockManager(PerKeyPointLockManager&&) = delete;
+  PerKeyPointLockManager& operator=(PerKeyPointLockManager&&) = delete;
+
+  ~PerKeyPointLockManager() override {}
+
+  void UnLock(PessimisticTransaction* txn, const LockTracker& tracker,
+              Env* env) override;
+  void UnLock(PessimisticTransaction* txn, ColumnFamilyId column_family_id,
+              const std::string& key, Env* env) override;
+  void UnLock(PessimisticTransaction* txn, ColumnFamilyId column_family_id,
+              const Endpoint& start, const Endpoint& end, Env* env) override;
+
+  void UnLockKey(PessimisticTransaction* txn, const std::string& key,
+                 LockMapStripe* stripe, LockMap* lock_map, Env* env) override;
+
+ protected:
+  Status AcquireWithTimeout(PessimisticTransaction* txn, LockMap* lock_map,
+                            LockMapStripe* stripe, uint32_t column_family_id,
+                            const std::string& key, Env* env, int64_t timeout,
+                            int64_t deadlock_timeout_us,
+                            const LockInfo& lock_info) override;
+
+ private:
+  Status AcquireLocked(LockMap* lock_map, LockMapStripe* stripe,
+                       const std::string& key, Env* env,
+                       const LockInfo& txn_lock_info, uint64_t* wait_time,
+                       autovector<TransactionID>* txn_ids,
+                       LockInfo** lock_info_ptr, bool* isUpgrade, bool fifo);
+
+  int64_t CalculateWaitEndTime(int64_t expire_time_hint, int64_t end_time);
+
+  Status FillWaitIds(LockInfo& lock_info, const LockInfo& txn_lock_info,
+                     autovector<TransactionID>* wait_ids, bool& isUpgrade,
+                     TransactionID& my_txn_id, const std::string& key);
 };
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/utilities/transactions/lock/point/point_lock_manager_stress_test.cc b/utilities/transactions/lock/point/point_lock_manager_stress_test.cc
new file mode 100644
index 000000000000..c15a3c04c732
--- /dev/null
+++ b/utilities/transactions/lock/point/point_lock_manager_stress_test.cc
@@ -0,0 +1,103 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "utilities/transactions/lock/point/point_lock_manager_test.h"
+#include "utilities/transactions/lock/point/point_lock_validation_test_runner.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct PointLockCorrectnessCheckTestParam {
+  bool is_per_key_point_lock_manager;
+  uint32_t thread_count;
+  uint32_t key_count;
+  uint32_t max_num_keys_to_lock_per_txn;
+  uint32_t execution_time_sec;
+  LockTypeToTest lock_type;
+  int64_t lock_timeout_us;
+  int64_t lock_expiration_us;
+  bool allow_non_deadlock_error;
+  // to simulate some useful work
+  uint32_t max_sleep_after_lock_acquisition_ms;
+};
+
+class PointLockCorrectnessCheckTest
+    : public PointLockManagerTest,
+      public testing::WithParamInterface<PointLockCorrectnessCheckTestParam> {
+ public:
+  void SetUp() override {
+    init();
+    auto const& param = GetParam();
+    auto per_key_lock_manager = param.is_per_key_point_lock_manager;
+    if (per_key_lock_manager) {
+      locker_ = std::make_shared<PerKeyPointLockManager>(
+          static_cast<PessimisticTransactionDB*>(db_), txndb_opt_);
+    } else {
+      locker_ = std::make_shared<PointLockManager>(
+          static_cast<PessimisticTransactionDB*>(db_), txndb_opt_);
+    }
+
+    txn_opt_.deadlock_detect = true;
+    txn_opt_.lock_timeout = param.lock_timeout_us;
+    txn_opt_.expiration = param.lock_expiration_us;
+  }
+
+ protected:
+  TransactionOptions txn_opt_;
+};
+
+TEST_P(PointLockCorrectnessCheckTest, LockCorrectnessValidation) {
+  auto const& param = GetParam();
+  PointLockValidationTestRunner test_runner(
+      env_, txndb_opt_, locker_, db_, txn_opt_, param.thread_count,
+      param.key_count, param.max_num_keys_to_lock_per_txn,
+      param.execution_time_sec, static_cast<LockTypeToTest>(param.lock_type),
+      param.allow_non_deadlock_error,
+      param.max_sleep_after_lock_acquisition_ms);
+  test_runner.run();
+}
+
+constexpr auto X_S_LOCK = LockTypeToTest::EXCLUSIVE_AND_SHARED;
+constexpr auto X_LOCK = LockTypeToTest::EXCLUSIVE_ONLY;
+constexpr auto S_LOCK = LockTypeToTest::SHARED_ONLY;
+
+INSTANTIATE_TEST_CASE_P(
+    PointLockCorrectnessCheckTestSuite, PointLockCorrectnessCheckTest,
+    ::testing::ValuesIn(std::vector<PointLockCorrectnessCheckTestParam>{
+        // 2 second timeout and no expiration simulates myrocks default
+        // configuration
+        {true, 16, 16, 8, 10, X_S_LOCK, 2000, -1, true, 0},
+        {false, 16, 16, 8, 10, X_S_LOCK, 2000, -1, true, 0},
+        {true, 16, 16, 8, 10, X_LOCK, 2000, -1, true, 0},
+        {false, 16, 16, 8, 10, X_LOCK, 2000, -1, true, 0},
+        {true, 16, 16, 8, 10, S_LOCK, 2000, -1, true, 0},
+        {false, 16, 16, 8, 10, S_LOCK, 2000, -1, true, 0},
+        // short timeout and expiration to test lock stealing
+        {true, 16, 16, 8, 10, X_S_LOCK, 10, 10, true, 10},
+        {false, 16, 16, 8, 10, X_S_LOCK, 10, 10, true, 10},
+        {true, 16, 16, 8, 10, X_LOCK, 10, 10, true, 10},
+        {false, 16, 16, 8, 10, X_LOCK, 10, 10, true, 10},
+        {true, 16, 16, 8, 10, S_LOCK, 10, 10, true, 10},
+        {false, 16, 16, 8, 10, S_LOCK, 10, 10, true, 10},
+        // long timeout and expiration to test deadlock detection without
+        // timeout
+        {true, 16, 16, 8, 10, X_S_LOCK, 100000, 100000, false, 0},
+        {false, 16, 16, 8, 10, X_S_LOCK, 100000, 100000, false, 0},
+        {true, 16, 16, 8, 10, X_LOCK, 100000, 100000, false, 0},
+        {false, 16, 16, 8, 10, X_LOCK, 100000, 100000, false, 0},
+        {true, 16, 16, 8, 10, S_LOCK, 100000, 100000, false, 0},
+        {false, 16, 16, 8, 10, S_LOCK, 100000, 100000, false, 0},
+        // Low lock contention
+        {true, 4, 1024 * 1024, 2, 10, S_LOCK, 100000, 100000, false, 0},
+        {false, 4, 1024 * 1024, 2, 10, S_LOCK, 100000, 100000, false, 0},
+    }));
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/utilities/transactions/lock/point/point_lock_manager_test.cc b/utilities/transactions/lock/point/point_lock_manager_test.cc
index 0ed8cd67fe2e..b9b4dc724482 100644
--- a/utilities/transactions/lock/point/point_lock_manager_test.cc
+++ b/utilities/transactions/lock/point/point_lock_manager_test.cc
@@ -5,11 +5,39 @@
 
 #include "utilities/transactions/lock/point/point_lock_manager_test.h"
 
+#include "utilities/transactions/lock/point/any_lock_manager_test.h"
+
 namespace ROCKSDB_NAMESPACE {
 
+struct SpotLockManagerTestParam {
+  bool use_per_key_point_lock_manager;
+  int deadlock_timeout_us;
+};
+
+// including test for both PointLockManager and PerKeyPointLockManager
+class SpotLockManagerTest
+    : public PointLockManagerTest,
+      public testing::WithParamInterface<SpotLockManagerTestParam> {
+ public:
+  void SetUp() override {
+    init();
+    // If a custom setup function was provided, use it. Otherwise, use what we
+    // have inherited.
+    auto param = GetParam();
+    if (param.use_per_key_point_lock_manager) {
+      locker_.reset(new PerKeyPointLockManager(
+          static_cast<PessimisticTransactionDB*>(db_), txndb_opt_));
+    } else {
+      locker_.reset(new PointLockManager(
+          static_cast<PessimisticTransactionDB*>(db_), txndb_opt_));
+    }
+    deadlock_timeout_us = param.deadlock_timeout_us;
+  }
+};
+
 // This test is not applicable for Range Lock manager as Range Lock Manager
 // operates on Column Families, not their ids.
-TEST_F(PointLockManagerTest, LockNonExistingColumnFamily) {
+TEST_P(SpotLockManagerTest, LockNonExistingColumnFamily) {
   MockColumnFamilyHandle cf(1024);
   locker_->RemoveColumnFamily(&cf);
   auto txn = NewTxn();
@@ -19,7 +47,7 @@ TEST_F(PointLockManagerTest, LockNonExistingColumnFamily) {
   delete txn;
 }
 
-TEST_F(PointLockManagerTest, LockStatus) {
+TEST_P(SpotLockManagerTest, LockStatus) {
   MockColumnFamilyHandle cf1(1024), cf2(2048);
   locker_->AddColumnFamily(&cf1);
   locker_->AddColumnFamily(&cf2);
@@ -61,7 +89,7 @@ TEST_F(PointLockManagerTest, LockStatus) {
   delete txn2;
 }
 
-TEST_F(PointLockManagerTest, UnlockExclusive) {
+TEST_P(SpotLockManagerTest, UnlockExclusive) {
   MockColumnFamilyHandle cf(1);
   locker_->AddColumnFamily(&cf);
 
@@ -79,7 +107,7 @@ TEST_F(PointLockManagerTest, UnlockExclusive) {
   delete txn2;
 }
 
-TEST_F(PointLockManagerTest, UnlockShared) {
+TEST_P(SpotLockManagerTest, UnlockShared) {
   MockColumnFamilyHandle cf(1);
   locker_->AddColumnFamily(&cf);
 
@@ -100,7 +128,7 @@ TEST_F(PointLockManagerTest, UnlockShared) {
 // This test doesn't work with Range Lock Manager, because Range Lock Manager
 // doesn't support deadlock_detect_depth.
 
-TEST_F(PointLockManagerTest, DeadlockDepthExceeded) {
+TEST_P(SpotLockManagerTest, DeadlockDepthExceeded) {
   // Tests that when detecting deadlock, if the detection depth is exceeded,
   // it's also viewed as deadlock.
   MockColumnFamilyHandle cf(1);
@@ -108,7 +136,7 @@ TEST_F(PointLockManagerTest, DeadlockDepthExceeded) {
   TransactionOptions txn_opt;
   txn_opt.deadlock_detect = true;
   txn_opt.deadlock_detect_depth = 1;
-  txn_opt.lock_timeout = 1000000;
+  txn_opt.lock_timeout = kLongTxnTimeoutMs;
   auto txn1 = NewTxn(txn_opt);
   auto txn2 = NewTxn(txn_opt);
   auto txn3 = NewTxn(txn_opt);
@@ -124,7 +152,8 @@ TEST_F(PointLockManagerTest, DeadlockDepthExceeded) {
   // it must have another txn waiting on it, which is txn4 in this case.
   ASSERT_OK(locker_->TryLock(txn1, 1, "k1", env_, true));
 
-  port::Thread t1 = BlockUntilWaitingTxn(wait_sync_point_name_, [&]() {
+  port::Thread t1;
+  BlockUntilWaitingTxn(wait_sync_point_name_, t1, [&]() {
     ASSERT_OK(locker_->TryLock(txn2, 1, "k2", env_, true));
     // block because txn1 is holding a lock on k1.
     ASSERT_OK(locker_->TryLock(txn2, 1, "k1", env_, true));
@@ -132,7 +161,8 @@ TEST_F(PointLockManagerTest, DeadlockDepthExceeded) {
 
   ASSERT_OK(locker_->TryLock(txn3, 1, "k3", env_, true));
 
-  port::Thread t2 = BlockUntilWaitingTxn(wait_sync_point_name_, [&]() {
+  port::Thread t2;
+  BlockUntilWaitingTxn(wait_sync_point_name_, t2, [&]() {
     // block because txn3 is holding a lock on k1.
     ASSERT_OK(locker_->TryLock(txn4, 1, "k3", env_, true));
   });
@@ -150,15 +180,1242 @@ TEST_F(PointLockManagerTest, DeadlockDepthExceeded) {
   t1.join();
   t2.join();
 
+  locker_->UnLock(txn2, 1, "k2", env_);
+  locker_->UnLock(txn2, 1, "k1", env_);
+  locker_->UnLock(txn4, 1, "k3", env_);
+
   delete txn4;
   delete txn3;
   delete txn2;
   delete txn1;
 }
 
+TEST_P(SpotLockManagerTest, PrioritizedLockUpgradeWithExclusiveLock) {
+  // Tests that a lock upgrade request is prioritized over other lock requests.
+
+  // txn1 acquires shared lock on k1.
+  // txn2 acquires exclusive lock on k1.
+  // txn1 acquires exclusive locks k1 successfully
+
+  MockColumnFamilyHandle cf(1);
+  locker_->AddColumnFamily(&cf);
+  TransactionOptions txn_opt;
+  txn_opt.deadlock_detect = true;
+  txn_opt.lock_timeout = kLongTxnTimeoutMs;
+  auto txn1 = NewTxn(txn_opt);
+  auto txn2 = NewTxn(txn_opt);
+
+  ASSERT_OK(locker_->TryLock(txn1, 1, "k1", env_, false));
+
+  // txn2 tries to lock k1 exclusively, will be blocked.
+  port::Thread t;
+  BlockUntilWaitingTxn(wait_sync_point_name_, t, [this, &txn2]() {
+    // block because txn1 is holding a shared lock on k1.
+    ASSERT_OK(locker_->TryLock(txn2, 1, "k1", env_, true));
+  });
+
+  // verify lock upgrade successfully
+  ASSERT_OK(locker_->TryLock(txn1, 1, "k1", env_, true));
+
+  // unlock txn1, so txn2 could proceed
+  locker_->UnLock(txn1, 1, "k1", env_);
+
+  // Cleanup
+  t.join();
+
+  // Cleanup
+  locker_->UnLock(txn2, 1, "k1", env_);
+  delete txn2;
+  delete txn1;
+}
+
+TEST_P(SpotLockManagerTest,
+       PrioritizedLockUpgradeWithExclusiveLockAndSharedLock) {
+  // Tests that lock upgrade is prioritized when mixed with shared and exclusive
+  // locks requests
+
+  // txn1 acquires shared lock on k1.
+  // txn2 acquires shared lock on k1.
+  // txn3 acquires exclusive lock on k1.
+  // txn1 acquires exclusive locks k1 <- request granted after txn2 release the
+  // lock
+
+  MockColumnFamilyHandle cf(1);
+  locker_->AddColumnFamily(&cf);
+  TransactionOptions txn_opt;
+  txn_opt.deadlock_detect = true;
+  txn_opt.lock_timeout = kLongTxnTimeoutMs;
+  auto txn1 = NewTxn(txn_opt);
+  auto txn2 = NewTxn(txn_opt);
+  auto txn3 = NewTxn(txn_opt);
+
+  ASSERT_OK(locker_->TryLock(txn1, 1, "k1", env_, false));
+  ASSERT_OK(locker_->TryLock(txn2, 1, "k1", env_, false));
+
+  // txn3 tries to lock k1 exclusively, will be blocked.
+  port::Thread txn3_thread;
+  BlockUntilWaitingTxn(wait_sync_point_name_, txn3_thread, [this, &txn3]() {
+    // block because txn1 and txn2 are holding a shared lock on k1.
+    ASSERT_OK(locker_->TryLock(txn3, 1, "k1", env_, true));
+  });
+  // Verify txn3 is blocked
+  ASSERT_TRUE(txn3_thread.joinable());
+
+  // txn1 tries to lock k1 exclusively, will be blocked.
+  port::Thread txn1_thread;
+  BlockUntilWaitingTxn(wait_sync_point_name_, txn1_thread, [this, &txn1]() {
+    // block because txn1 and txn2 are holding a shared lock on k1.
+    ASSERT_OK(locker_->TryLock(txn1, 1, "k1", env_, true));
+  });
+  // Verify txn1 is blocked
+  ASSERT_TRUE(txn1_thread.joinable());
+
+  // Unlock txn2, so txn1 could proceed
+  locker_->UnLock(txn2, 1, "k1", env_);
+  txn1_thread.join();
+
+  // Unlock txn1, so txn3 could proceed
+  locker_->UnLock(txn1, 1, "k1", env_);
+  txn3_thread.join();
+
+  // Cleanup
+  locker_->UnLock(txn3, 1, "k1", env_);
+  delete txn3;
+  delete txn2;
+  delete txn1;
+}
+
+TEST_P(SpotLockManagerTest, Deadlock_MultipleUpgrade) {
+  // Tests that deadlock can be detected for shared locks and exclusive locks
+  // mixed Deadlock scenario:
+
+  // txn1 acquires shared lock on k1.
+  // txn2 acquires shared lock on k1.
+  // txn1 acquires exclusive locks k1
+  // txn2 acquires exclusive locks k1 <- dead lock detected
+
+  MockColumnFamilyHandle cf(1);
+  locker_->AddColumnFamily(&cf);
+  TransactionOptions txn_opt;
+  txn_opt.deadlock_detect = true;
+  txn_opt.lock_timeout = kLongTxnTimeoutMs;
+  auto txn1 = NewTxn(txn_opt);
+  auto txn2 = NewTxn(txn_opt);
+
+  ASSERT_OK(locker_->TryLock(txn1, 1, "k1", env_, false));
+  ASSERT_OK(locker_->TryLock(txn2, 1, "k1", env_, false));
+
+  // txn1 tries to lock k1 exclusively, will be blocked.
+  port::Thread t;
+  BlockUntilWaitingTxn(wait_sync_point_name_, t, [this, &txn1]() {
+    // block because txn2 is holding a shared lock on k1.
+    ASSERT_OK(locker_->TryLock(txn1, 1, "k1", env_, true));
+  });
+
+  auto s = locker_->TryLock(txn2, 1, "k1", env_, true);
+  ASSERT_TRUE(s.IsBusy());
+  ASSERT_EQ(s.subcode(), Status::SubCode::kDeadlock);
+
+  std::vector<DeadlockPath> deadlock_paths = locker_->GetDeadlockInfoBuffer();
+  ASSERT_EQ(deadlock_paths.size(), 1u);
+  ASSERT_FALSE(deadlock_paths[0].limit_exceeded);
+
+  std::vector<DeadlockInfo> deadlocks = deadlock_paths[0].path;
+  ASSERT_EQ(deadlocks.size(), 2u);
+
+  ASSERT_EQ(deadlocks[0].m_txn_id, txn1->GetID());
+  ASSERT_EQ(deadlocks[0].m_cf_id, 1u);
+  ASSERT_TRUE(deadlocks[0].m_exclusive);
+  ASSERT_EQ(deadlocks[0].m_waiting_key, "k1");
+
+  ASSERT_EQ(deadlocks[1].m_txn_id, txn2->GetID());
+  ASSERT_EQ(deadlocks[1].m_cf_id, 1u);
+  ASSERT_TRUE(deadlocks[1].m_exclusive);
+  ASSERT_EQ(deadlocks[1].m_waiting_key, "k1");
+
+  locker_->UnLock(txn2, 1, "k1", env_);
+  t.join();
+
+  // Cleanup
+  locker_->UnLock(txn1, 1, "k1", env_);
+  delete txn2;
+  delete txn1;
+}
+
+TEST_P(SpotLockManagerTest, Deadlock_MultipleUpgradeInterleaveExclusive) {
+  // Tests that deadlock can be detected for shared locks and exclusive locks
+  // mixed Deadlock scenario:
+
+  // txn1 acquires shared lock on k1.
+  // txn2 acquires shared lock on k1.
+  // txn3 acquires exclusive lock on k1.
+  // txn1 acquires exclusive locks k1 <- request granted after txn2 release the
+  // lock.
+  // txn2 acquires exclusive locks k1 <- dead lock detected
+
+  MockColumnFamilyHandle cf(1);
+  locker_->AddColumnFamily(&cf);
+  TransactionOptions txn_opt;
+  txn_opt.deadlock_detect = true;
+  txn_opt.lock_timeout = kLongTxnTimeoutMs;
+  auto txn1 = NewTxn(txn_opt);
+  auto txn2 = NewTxn(txn_opt);
+  auto txn3 = NewTxn(txn_opt);
+
+  ASSERT_OK(locker_->TryLock(txn1, 1, "k1", env_, false));
+  ASSERT_OK(locker_->TryLock(txn2, 1, "k1", env_, false));
+
+  // txn3 tries to lock k1 exclusively, will be blocked.
+  port::Thread txn3_thread;
+  BlockUntilWaitingTxn(wait_sync_point_name_, txn3_thread, [this, &txn3]() {
+    // block because txn1 and txn2 are holding a shared lock on k1.
+    ASSERT_OK(locker_->TryLock(txn3, 1, "k1", env_, true));
+  });
+  // Verify txn3 is blocked
+  ASSERT_TRUE(txn3_thread.joinable());
+
+  // txn1 tries to lock k1 exclusively, will be blocked.
+  port::Thread txn1_thread;
+  BlockUntilWaitingTxn(wait_sync_point_name_, txn1_thread, [this, &txn1]() {
+    // block because txn1 and txn2 are holding a shared lock on k1.
+    ASSERT_OK(locker_->TryLock(txn1, 1, "k1", env_, true));
+  });
+  // Verify txn1 is blocked
+  ASSERT_TRUE(txn1_thread.joinable());
+
+  auto s = locker_->TryLock(txn2, 1, "k1", env_, true);
+  ASSERT_TRUE(s.IsBusy());
+  ASSERT_EQ(s.subcode(), Status::SubCode::kDeadlock);
+
+  std::vector<DeadlockPath> deadlock_paths = locker_->GetDeadlockInfoBuffer();
+  ASSERT_EQ(deadlock_paths.size(), 1u);
+  ASSERT_FALSE(deadlock_paths[0].limit_exceeded);
+
+  std::vector<DeadlockInfo> deadlocks = deadlock_paths[0].path;
+  ASSERT_EQ(deadlocks.size(), 2u);
+
+  ASSERT_EQ(deadlocks[0].m_txn_id, txn1->GetID());
+  ASSERT_EQ(deadlocks[0].m_cf_id, 1u);
+  ASSERT_TRUE(deadlocks[0].m_exclusive);
+  ASSERT_EQ(deadlocks[0].m_waiting_key, "k1");
+
+  ASSERT_EQ(deadlocks[1].m_txn_id, txn2->GetID());
+  ASSERT_EQ(deadlocks[1].m_cf_id, 1u);
+  ASSERT_TRUE(deadlocks[1].m_exclusive);
+  ASSERT_EQ(deadlocks[1].m_waiting_key, "k1");
+
+  // Unlock txn2, so txn1 could proceed
+  locker_->UnLock(txn2, 1, "k1", env_);
+  txn1_thread.join();
+
+  // Unlock txn1, so txn3 could proceed
+  locker_->UnLock(txn1, 1, "k1", env_);
+  txn3_thread.join();
+
+  // Cleanup
+  locker_->UnLock(txn3, 1, "k1", env_);
+  delete txn3;
+  delete txn2;
+  delete txn1;
+}
+
+class PerKeyPointLockManagerTest : public PointLockManagerTest {
+ public:
+  void SetUp() override {
+    init();
+    cf_ = std::make_unique<MockColumnFamilyHandle>(1);
+    txn_opt_.deadlock_detect = true;
+    // by default use long timeout and disable expiration
+    txn_opt_.lock_timeout = kLongTxnTimeoutMs;
+    txn_opt_.expiration = -1;
+
+    // CAUTION: This test creates a separate lock manager object (right, NOT
+    // the one that the TransactionDB is using!), and runs tests on it.
+    locker_.reset(new PerKeyPointLockManager(
+        static_cast<PessimisticTransactionDB*>(db_), txndb_opt_));
+    locker_->AddColumnFamily(cf_.get());
+  }
+
+  TransactionOptions txn_opt_;
+  std::unique_ptr<MockColumnFamilyHandle> cf_;
+};
+
+TEST_F(PerKeyPointLockManagerTest, LockEfficiency) {
+  // Create multiple transactions, each acquire exclusive lock on the same key
+  std::vector<PessimisticTransaction*> txns;
+  std::vector<port::Thread> blockingThreads;
+
+  // Count the total number of wait sync point calls
+  std::atomic_int wait_sync_point_times = 0;
+  SyncPoint::GetInstance()->SetCallBack(
+      wait_sync_point_name_,
+      [&wait_sync_point_times](void* /*arg*/) { wait_sync_point_times++; });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  constexpr auto num_of_txn = 10;
+  // create 10 transactions, each of them try to acquire exclusive lock on the
+  // same key
+  for (int i = 0; i < num_of_txn; i++) {
+    auto txn = NewTxn(txn_opt_);
+    txns.push_back(txn);
+
+    if (i == 0) {
+      // txn0 acquires the lock, so the rest of the transactions could block
+      ASSERT_OK(locker_->TryLock(txn, 1, "k1", env_, true));
+    } else {
+      blockingThreads.emplace_back([this, txn]() {
+        // block because first txn is holding an exclusive lock on k1.
+        ASSERT_OK(locker_->TryLock(txn, 1, "k1", env_, true));
+      });
+    }
+
+    // wait for transaction i to be blocked
+    while (wait_sync_point_times.load() < i) {
+      std::this_thread::sleep_for(std::chrono::milliseconds(1));
+    }
+  }
+
+  // unlock the key, so next transaction could take the lock.
+  locker_->UnLock(txns[0], 1, "k1", env_);
+
+  auto num_of_blocking_thread = num_of_txn - 1;
+
+  for (int i = 0; i < num_of_blocking_thread; i++) {
+    // validate the thread is finished
+    blockingThreads[i].join();
+    auto num_of_threads_completed = i + 1;
+    for (int j = 0; j < num_of_blocking_thread; j++) {
+      if (j < num_of_threads_completed) {
+        // validate the thread is no longer joinable
+        ASSERT_FALSE(blockingThreads[j].joinable());
+      } else {
+        // validate the rest of the threads are still joinable
+        ASSERT_TRUE(blockingThreads[j].joinable());
+      }
+    }
+    // unlock the key, so next transaction could take the lock.
+    locker_->UnLock(txns[i + 1], 1, "k1", env_);
+  }
+
+  ASSERT_EQ(wait_sync_point_times.load(), num_of_blocking_thread);
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  for (int i = 0; i < num_of_txn; i++) {
+    delete txns[num_of_txn - i - 1];
+  }
+}
+
+TEST_F(PerKeyPointLockManagerTest, LockFairness) {
+  // Create multiple transactions requesting locks on the same key, validate
+  // that they are executed in FIFO order
+
+  // txn0 acquires exclusive lock on k1.
+  // txn1 acquires shared lock on k1.
+  // txn2 acquires shared lock on k1.
+  // txn3 acquires exclusive lock on k1.
+  // txn4 acquires shared lock on k1.
+  // txn5 acquires exclusive lock on k1.
+  // txn6 acquires exclusive lock on k1.
+  // txn7 acquires shared lock on k1.
+  // txn8 acquires shared lock on k1.
+  // txn9 acquires exclusive lock on k1.
+
+  std::vector<PessimisticTransaction*> txns;
+  std::vector<port::Thread> blockingThreads;
+
+  // Count the total number of wait sync point calls
+  std::atomic_int wait_sync_point_times = 0;
+  SyncPoint::GetInstance()->SetCallBack(
+      wait_sync_point_name_,
+      [&wait_sync_point_times](void* /*arg*/) { wait_sync_point_times++; });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  constexpr auto num_of_txn = 10;
+  std::vector<bool> txn_lock_types = {true, false, false, true,  false,
+                                      true, true,  false, false, true};
+  // create 10 transactions, each of them try to acquire exclusive lock on the
+  // same key
+  for (int i = 0; i < num_of_txn; i++) {
+    auto txn = NewTxn(txn_opt_);
+    txns.push_back(txn);
+
+    if (i == 0) {
+      // txn0 acquires the lock, so the rest of the transactions would block
+      ASSERT_OK(locker_->TryLock(txn, 1, "k1", env_, txn_lock_types[0]));
+    } else {
+      blockingThreads.emplace_back([this, txn, type = txn_lock_types[i]]() {
+        ASSERT_OK(locker_->TryLock(txn, 1, "k1", env_, type));
+      });
+    }
+
+    // wait for transaction i to be blocked
+    while (wait_sync_point_times.load() < i) {
+      std::this_thread::sleep_for(std::chrono::milliseconds(1));
+    }
+  }
+
+  auto num_of_blocking_thread = num_of_txn - 1;
+
+  auto thread_idx = 0;
+  auto txn_idx = 0;
+
+  auto unlockTxn = [&]() {
+    // unlock the key in transaction.
+    locker_->UnLock(txns[txn_idx++], 1, "k1", env_);
+  };
+
+  auto validateLockTakenByNextTxn = [&]() {
+    // validate the thread is finished
+    blockingThreads[thread_idx++].join();
+  };
+
+  auto stillWaitingForLock = [&]() {
+    // validate the thread is no longer joinable
+    ASSERT_TRUE(blockingThreads[thread_idx].joinable());
+  };
+
+  // unlock the key, so next group of transactions could take the lock.
+  unlockTxn();
+
+  // txn1 acquires shared lock on k1.
+  // txn2 acquires shared lock on k1.
+  validateLockTakenByNextTxn();
+  validateLockTakenByNextTxn();
+
+  // txn3 acquires exclusive lock on k1.
+  stillWaitingForLock();
+  unlockTxn();
+  unlockTxn();
+  validateLockTakenByNextTxn();
+
+  // txn4 acquires shared lock on k1.
+  stillWaitingForLock();
+  unlockTxn();
+  validateLockTakenByNextTxn();
+
+  // txn5 acquires exclusive lock on k1.
+  stillWaitingForLock();
+  unlockTxn();
+  validateLockTakenByNextTxn();
+
+  // txn6 acquires exclusive lock on k1.
+  stillWaitingForLock();
+  unlockTxn();
+  validateLockTakenByNextTxn();
+
+  // txn7 acquires shared lock on k1.
+  // txn8 acquires shared lock on k1.
+  stillWaitingForLock();
+  unlockTxn();
+  validateLockTakenByNextTxn();
+  validateLockTakenByNextTxn();
+
+  // txn9 acquires exclusive lock on k1.
+  stillWaitingForLock();
+  unlockTxn();
+  unlockTxn();
+  validateLockTakenByNextTxn();
+
+  // clean up
+  unlockTxn();
+
+  ASSERT_EQ(wait_sync_point_times.load(), num_of_blocking_thread);
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  for (int i = 0; i < num_of_txn; i++) {
+    delete txns[num_of_txn - i - 1];
+  }
+}
+
+TEST_F(PerKeyPointLockManagerTest, FIFO) {
+  // validate S, X, S lock order would be executed in FIFO order
+  // txn1 acquires shared lock on k1.
+  // txn2 acquires exclusive lock on k1.
+  // txn3 acquires shared lock on k1.
+
+  std::vector<PessimisticTransaction*> txns;
+  std::vector<port::Thread> blockingThreads;
+
+  // Count the total number of wait sync point calls
+  std::atomic_int wait_sync_point_times = 0;
+  SyncPoint::GetInstance()->SetCallBack(
+      wait_sync_point_name_,
+      [&wait_sync_point_times](void* /*arg*/) { wait_sync_point_times++; });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  constexpr auto num_of_txn = 3;
+  std::vector<bool> txn_lock_types = {false, true, false};
+  // create 3 transactions, each of them try to acquire exclusive lock on the
+  // same key
+  for (int i = 0; i < num_of_txn; i++) {
+    auto txn = NewTxn(txn_opt_);
+    txns.push_back(txn);
+
+    if (i == 0) {
+      // txn0 acquires the lock, so the rest of the transactions would block
+      ASSERT_OK(locker_->TryLock(txn, 1, "k1", env_, txn_lock_types[0]));
+    } else {
+      blockingThreads.emplace_back([this, txn, type = txn_lock_types[i]]() {
+        ASSERT_OK(locker_->TryLock(txn, 1, "k1", env_, type));
+      });
+    }
+
+    // wait for transaction i to be blocked
+    while (wait_sync_point_times.load() < i) {
+      std::this_thread::sleep_for(std::chrono::milliseconds(1));
+    }
+  }
+
+  auto num_of_blocking_thread = num_of_txn - 1;
+
+  auto thread_idx = 0;
+  auto txn_idx = 0;
+
+  auto unlockTxn = [&]() {
+    // unlock the key in transaction.
+    locker_->UnLock(txns[txn_idx++], 1, "k1", env_);
+  };
+
+  auto validateLockTakenByNextTxn = [&]() {
+    // validate the thread is finished
+    blockingThreads[thread_idx++].join();
+  };
+
+  auto stillWaitingForLock = [&]() {
+    // validate the thread is no longer joinable
+    ASSERT_TRUE(blockingThreads[thread_idx].joinable());
+  };
+
+  // unlock the key, so next group of transactions could take the lock.
+  stillWaitingForLock();
+  unlockTxn();
+
+  // txn1 acquires exclusive lock on k1.
+  validateLockTakenByNextTxn();
+
+  // txn2 acquires shared lock on k1.
+  stillWaitingForLock();
+  unlockTxn();
+  validateLockTakenByNextTxn();
+
+  // clean up
+  unlockTxn();
+
+  ASSERT_EQ(wait_sync_point_times.load(), num_of_blocking_thread);
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  for (int i = 0; i < num_of_txn; i++) {
+    delete txns[num_of_txn - i - 1];
+  }
+}
+
+TEST_P(SpotLockManagerTest, LockDownGradeWithOtherLockRequests) {
+  // Test lock down grade always succeeds, even if there are other lock requests
+  // waiting for the same lock.
+  MockColumnFamilyHandle cf(1);
+  locker_->AddColumnFamily(&cf);
+  TransactionOptions txn_opt;
+  txn_opt.deadlock_detect = true;
+  txn_opt.lock_timeout = kLongTxnTimeoutMs;
+  auto txn1 = NewTxn(txn_opt);
+  auto txn2 = NewTxn(txn_opt);
+
+  for (bool exclusive : {true, false}) {
+    ASSERT_OK(locker_->TryLock(txn1, 1, "k1", env_, true));
+
+    port::Thread t;
+    BlockUntilWaitingTxn(wait_sync_point_name_, t, [this, &txn2, exclusive]() {
+      // block because txn1 is holding a exclusive lock on k1.
+      ASSERT_OK(locker_->TryLock(txn2, 1, "k1", env_, exclusive));
+    });
+
+    // txn1 downgrades the lock to shared lock, so txn2 could proceed
+    ASSERT_OK(locker_->TryLock(txn1, 1, "k1", env_, false));
+
+    locker_->UnLock(txn1, 1, "k1", env_);
+    t.join();
+    locker_->UnLock(txn2, 1, "k1", env_);
+  }
+
+  // clean up
+  delete txn2;
+  delete txn1;
+}
+
+TEST_P(SpotLockManagerTest, LockTimeout) {
+  // Test lock timeout
+  // txn1 acquires an exclusive lock on k1 successfully.
+  // txn2 try to acquire a lock on k1, but timedout.
+
+  MockColumnFamilyHandle cf(1);
+  locker_->AddColumnFamily(&cf);
+  TransactionOptions txn_opt;
+  txn_opt.deadlock_detect = true;
+  txn_opt.lock_timeout = kShortTxnTimeoutMs;
+  auto txn1 = NewTxn(txn_opt);
+  auto txn2 = NewTxn(txn_opt);
+
+  ASSERT_OK(locker_->TryLock(txn1, 1, "k1", env_, true));
+
+  for (bool exclusive : {true, false}) {
+    auto ret = locker_->TryLock(txn2, 1, "k1", env_, exclusive);
+    ASSERT_TRUE(ret.IsTimedOut());
+  }
+
+  // clean up
+  locker_->UnLock(txn1, 1, "k1", env_);
+  delete txn2;
+  delete txn1;
+}
+
+TEST_P(SpotLockManagerTest, ExpiredLockStolenAfterTimeout) {
+  // validate an expired lock can be stolen by another transaction that timed
+  // out on the lock.
+  // txn1 acquires an exclusive lock on k1 successfully with a short expiration
+  // time.
+  // txn2 try to acquire a shared lock on k1 with timeout that is slightly
+  // longer than the txn1 expiration.
+  // Validate txn2 will take the lock.
+
+  MockColumnFamilyHandle cf(1);
+  locker_->AddColumnFamily(&cf);
+  TransactionOptions txn_opt;
+  txn_opt.deadlock_detect = true;
+  txn_opt.expiration = 1000;
+  txn_opt.lock_timeout = 1000 * 2;
+  auto txn1 = NewTxn(txn_opt);
+  auto txn2 = NewTxn(txn_opt);
+
+  ASSERT_OK(locker_->TryLock(txn1, 1, "k1", env_, true));
+
+  port::Thread t1;
+  BlockUntilWaitingTxn(wait_sync_point_name_, t1, [this, &txn2]() {
+    // block because txn1 is holding an exclusive lock on k1.
+    ASSERT_OK(locker_->TryLock(txn2, 1, "k1", env_, false));
+  });
+
+  t1.join();
+
+  // clean up
+  locker_->UnLock(txn2, 1, "k1", env_);
+  locker_->UnLock(txn1, 1, "k1", env_);
+
+  delete txn2;
+  delete txn1;
+}
+
+// Try to block until transaction enters waiting state.
+// However due to timing, it could fail, so return true if succeeded, false
+// otherwise.
+bool TryBlockUntilWaitingTxn(const char* sync_point_name, port::Thread& t,
+                             std::function<void()> function) {
+  std::atomic<bool> reached(false);
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      sync_point_name, [&](void* /*arg*/) { reached.store(true); });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // As the lifetime of the complete variable could go beyond the scope of this
+  // function, so we wrap it in a shared_ptr, and copy it into the lambda
+  std::shared_ptr<std::atomic<bool>> complete =
+      std::make_shared<std::atomic<bool>>(false);
+  t = port::Thread([complete, &function]() {
+    function();
+    complete->store(true);
+  });
+
+  auto ret = false;
+
+  while (true) {
+    if (complete->load()) {
+      // function completed, before sync point was reached, return false
+      t.join();
+      ret = false;
+      break;
+    }
+    if (reached.load()) {
+      // sync point was reached before function completed, return true
+      ret = true;
+      break;
+    }
+  }
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+  return ret;
+}
+
+TEST_F(PerKeyPointLockManagerTest, LockStealAfterExpirationExclusive) {
+  // There are multiple transactions waiting for the same lock.
+  // txn1 acquires an exclusive lock on k1 successfully with a short expiration
+  // time.
+  // txn2 try to acquire an exclusive lock on k1, before expiration time,
+  // so it is blocked and waits for txn1 lock expired.
+  // txn3 try to acquire an exclusive lock on k1 after txn1 lock expires, FIFO
+  // order is respected.
+  // txn2 is woken up and takes the lock. unlock txn2, txn3 should proceed.
+
+  txn_opt_.expiration = 1000;
+  auto txn1 = NewTxn(txn_opt_);
+  txn_opt_.expiration = -1;
+  auto txn2 = NewTxn(txn_opt_);
+  auto txn3 = NewTxn(txn_opt_);
+
+  port::Thread t1;
+  auto retry_times = 10;
+
+  // Use a loop to reduce test flakiness.
+  // that the test is flaky because the txn2 thread start could be delayed until
+  // txn1 lock expired. In that case, txn2 will not enter into wait state, which
+  // will defeat the test purpose. Use a loop to retry a few times, until it is
+  // able to enter into wait state.
+  while (retry_times--) {
+    ASSERT_OK(locker_->TryLock(txn1, 1, "k1", env_, true));
+    if (TryBlockUntilWaitingTxn(wait_sync_point_name_, t1, [this, &txn2]() {
+          // block because txn1 is holding a shared lock on k1.
+          ASSERT_OK(locker_->TryLock(txn2, 1, "k1", env_, true));
+        })) {
+      break;
+    }
+    // failed, retry again
+    locker_->UnLock(txn1, 1, "k1", env_);
+    locker_->UnLock(txn2, 1, "k1", env_);
+  }
+  // make sure txn2 is able to reach the wait state before proceed
+  ASSERT_GT(retry_times, 0);
+
+  // txn3 try to acquire an exclusive lock on k1, FIFO order is respected.
+  port::Thread t2;
+  BlockUntilWaitingTxn(wait_sync_point_name_, t2, [this, &txn3]() {
+    // block because txn1 is holding an exclusive lock on k1.
+    ASSERT_OK(locker_->TryLock(txn3, 1, "k1", env_, true));
+  });
+
+  // validate txn2 is woken up and takes the lock
+  t1.join();
+
+  // unlock txn2, txn3 should proceed
+  locker_->UnLock(txn2, 1, "k1", env_);
+  t2.join();
+
+  // clean up
+  locker_->UnLock(txn3, 1, "k1", env_);
+
+  delete txn3;
+  delete txn2;
+  delete txn1;
+}
+
+TEST_F(PerKeyPointLockManagerTest, LockStealAfterExpirationShared) {
+  // There are multiple transactions waiting for the same lock.
+  // txn1 acquires a shared lock on k1 successfully with a short expiration
+  // time.
+  // txn2 try to acquire an exclusive lock on k1, before expiration time,
+  // so it is blocked and waits for txn1 lock expired.
+  // txn3 try to acquire a shared lock on k1 after txn1 lock expires, FIFO
+  // order is respected.
+  // txn2 is woken up and takes the lock. unlock txn2, txn3 should proceed.
+
+  txn_opt_.expiration = 1000;
+  auto txn1 = NewTxn(txn_opt_);
+  txn_opt_.expiration = -1;
+  auto txn2 = NewTxn(txn_opt_);
+  auto txn3 = NewTxn(txn_opt_);
+
+  port::Thread t1;
+  auto retry_times = 10;
+
+  // Use a loop to reduce test flakiness.
+  // that the test is flaky because the txn2 thread start could be delayed until
+  // txn1 lock expired. In that case, txn2 will not enter into wait state, which
+  // will defeat the test purpose. Use a loop to retry a few times, until it is
+  // able to enter into wait state.
+  while (retry_times--) {
+    ASSERT_OK(locker_->TryLock(txn1, 1, "k1", env_, false));
+    if (TryBlockUntilWaitingTxn(wait_sync_point_name_, t1, [this, &txn2]() {
+          // block because txn1 is holding an exclusive lock on k1.
+          ASSERT_OK(locker_->TryLock(txn2, 1, "k1", env_, true));
+        })) {
+      break;
+    }
+    // failed, retry again
+    locker_->UnLock(txn1, 1, "k1", env_);
+    locker_->UnLock(txn2, 1, "k1", env_);
+  }
+  // make sure txn2 is able to reach the wait state before proceed
+  ASSERT_GT(retry_times, 0);
+
+  // txn3 try to acquire an exclusive lock on k1, FIFO order is respected.
+  port::Thread t2;
+  BlockUntilWaitingTxn(wait_sync_point_name_, t2, [this, &txn3]() {
+    // block because txn1 is holding an exclusive lock on k1.
+    ASSERT_OK(locker_->TryLock(txn3, 1, "k1", env_, false));
+  });
+
+  // validate txn2 is woken up and takes the lock
+  t1.join();
+
+  // unlock txn2, txn3 should proceed
+  locker_->UnLock(txn2, 1, "k1", env_);
+  t2.join();
+
+  // clean up
+  locker_->UnLock(txn3, 1, "k1", env_);
+
+  delete txn3;
+  delete txn2;
+  delete txn1;
+}
+
+TEST_F(PerKeyPointLockManagerTest, DeadLockOnWaiter) {
+  // Txn1 acquires exclusive lock on k1
+  // Txn3 acquires shared lock on k2
+  // Txn2 tries to acquire exclusive lock on k1, waiting in the waiter queue.
+  // Txn3 tries to acquire exclusive lock on k1, waiting in the waiter queue.
+  // Txn3 depends on both Txn1 and Txn2. Txn1 unlocks k1.
+  // Txn2 takes the lock k1, and tries to acquire lock k2.
+  // Now Txn2 depends on Txn3.
+  // Deadlock is detected, and Txn2 is aborted.
+
+  auto txn1 = NewTxn(txn_opt_);
+  auto txn2 = NewTxn(txn_opt_);
+  auto txn3 = NewTxn(txn_opt_);
+
+  ASSERT_OK(locker_->TryLock(txn1, 1, "k1", env_, true));
+  ASSERT_OK(locker_->TryLock(txn3, 1, "k2", env_, false));
+
+  port::Thread t1;
+  BlockUntilWaitingTxn(wait_sync_point_name_, t1, [this, &txn2]() {
+    ASSERT_OK(locker_->TryLock(txn2, 1, "k1", env_, true));
+    auto s = locker_->TryLock(txn2, 1, "k2", env_, true);
+    ASSERT_TRUE(s.IsDeadlock());
+  });
+
+  port::Thread t2;
+  BlockUntilWaitingTxn(wait_sync_point_name_, t2, [this, &txn3]() {
+    ASSERT_OK(locker_->TryLock(txn3, 1, "k1", env_, true));
+  });
+
+  locker_->UnLock(txn1, 1, "k1", env_);
+
+  t1.join();
+
+  locker_->UnLock(txn2, 1, "k1", env_);
+  t2.join();
+
+  // clean up
+  locker_->UnLock(txn3, 1, "k1", env_);
+  locker_->UnLock(txn3, 1, "k2", env_);
+
+  delete txn3;
+  delete txn2;
+  delete txn1;
+}
+
+TEST_F(PerKeyPointLockManagerTest, SharedLockRaceCondition) {
+  // Verify a shared lock race condition is handled properly.
+  // When there are waiters in the queue, and all of them are shared waiters,
+  // and no one has taken the lock and all of them just got woken up and not
+  // yet taken the lock yet. A new shared lock request should be granted
+  // directly, without wait in the queue. If it did, It would not be woken up
+  // until the last shared lock is released.
+
+  // Disable deadlock detection timeout to prevent test flakyness.
+  deadlock_timeout_us = 0;
+  auto txn1 = NewTxn(txn_opt_);
+  auto txn2 = NewTxn(txn_opt_);
+  auto txn3 = NewTxn(txn_opt_);
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"PerKeyPointLockManager::AcquireWithTimeout:AfterWokenUp",
+        "PerKeyPointLockManagerTest::SharedLockRaceCondition:"
+        "BeforeNewSharedLockRequest"},
+       {"PerKeyPointLockManagerTest::SharedLockRaceCondition:"
+        "AfterNewSharedLockRequest",
+        "PerKeyPointLockManager::AcquireWithTimeout:BeforeTakeLock"}});
+
+  std::atomic<bool> reached(false);
+  SyncPoint::GetInstance()->SetCallBack(
+      wait_sync_point_name_,
+      [&reached](void* /*arg*/) { reached.store(true); });
+
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  // txn1 acquires an exclusive lock on k1, so that the following shared lock
+  // request would be blocked
+  ASSERT_OK(locker_->TryLock(txn1, 1, "k1", env_, true));
+
+  // txn2 try to acquire a shared lock on k1, and get blocked
+  auto t1 = port::Thread([this, &txn2]() {
+    ASSERT_OK(locker_->TryLock(txn2, 1, "k1", env_, false));
+  });
+
+  while (!reached.load()) {
+    std::this_thread::sleep_for(std::chrono::milliseconds(100));
+  }
+
+  // unlock txn1, txn2 should be woken up, but txn2 stops on the sync point
+  locker_->UnLock(txn1, 1, "k1", env_);
+
+  // Use sync point to simulate the race condition.
+  // txn3 tries to take the lock right after txn2 is woken up, but before it
+  // takes the lock
+  TEST_SYNC_POINT(
+      "PerKeyPointLockManagerTest::SharedLockRaceCondition:"
+      "BeforeNewSharedLockRequest");
+
+  // txn3 try to acquire a shared lock on k1, and get granted immediately
+  ASSERT_OK(locker_->TryLock(txn3, 1, "k1", env_, false));
+
+  TEST_SYNC_POINT(
+      "PerKeyPointLockManagerTest::SharedLockRaceCondition:"
+      "AfterNewSharedLockRequest");
+
+  // validate txn2 is woken up and takes the lock
+  t1.join();
+
+  // cleanup
+  locker_->UnLock(txn2, 1, "k1", env_);
+  locker_->UnLock(txn3, 1, "k1", env_);
+
+  delete txn3;
+  delete txn2;
+  delete txn1;
+}
+
+TEST_F(PerKeyPointLockManagerTest, UpgradeLockRaceCondition) {
+  // Verify an upgrade lock race condition is handled properly.
+  // When a key is locked in exlusive mode, shared lock waiters will be enqueued
+  // as waiters.
+  // When the exclusive lock holder release the lock. The shared lock waiters
+  // are woken up to take the lock. At this point, when a new shared lock
+  // requester comes in, it will take the lock directly without waiting or
+  // queueing. This requester then immediately upgrade the lock to exclusive
+  // lock. This request will be prioritized to the head of the queue.
+  // Meantime, it should also depend on the shared lock waiters which are still
+  // in the queue that are ready to take the lock. Later, when one of the reader
+  // lock want to also upgrade its lock, it will detect a dead lock and abort.
+
+  auto txn1 = NewTxn(txn_opt_);
+  auto txn2 = NewTxn(txn_opt_);
+  auto txn3 = NewTxn(txn_opt_);
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"PerKeyPointLockManager::AcquireWithTimeout:AfterWokenUp",
+        "PerKeyPointLockManagerTest::UpgradeLockRaceCondition:"
+        "BeforeNewSharedLockRequest"},
+       {"PerKeyPointLockManagerTest::UpgradeLockRaceCondition:"
+        "AfterNewSharedLockRequest",
+        "PerKeyPointLockManager::AcquireWithTimeout:BeforeTakeLock"}});
+
+  std::atomic<bool> reached(false);
+  SyncPoint::GetInstance()->SetCallBack(
+      wait_sync_point_name_,
+      [&reached](void* /*arg*/) { reached.store(true); });
+
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  // txn1 acquires an exclusive lock on k1, so that the following shared lock
+  // request would be blocked
+  ASSERT_OK(locker_->TryLock(txn1, 1, "k1", env_, true));
+
+  auto t1 = port::Thread([this, &txn2]() {
+    // txn2 try to acquire a shared lock on k1, and get blocked
+    ASSERT_OK(locker_->TryLock(txn2, 1, "k1", env_, false));
+  });
+
+  while (!reached.load()) {
+    std::this_thread::sleep_for(std::chrono::milliseconds(100));
+  }
+
+  // unlock txn1, txn2 should be woken up, but txn2 stops on the sync point
+  locker_->UnLock(txn1, 1, "k1", env_);
+
+  // Use sync point to simulate the race condition.
+  // txn3 tries to take the lock right after txn2 is woken up, but before it
+  // takes the lock
+  TEST_SYNC_POINT(
+      "PerKeyPointLockManagerTest::UpgradeLockRaceCondition:"
+      "BeforeNewSharedLockRequest");
+
+  // txn3 try to acquire a shared lock on k1, and get granted immediately
+  ASSERT_OK(locker_->TryLock(txn3, 1, "k1", env_, false));
+
+  // txn3 try to upgrade its lock to exclusive lock and get blocked.
+  reached = false;
+  auto t2 = port::Thread([this, &txn3]() {
+    ASSERT_OK(locker_->TryLock(txn3, 1, "k1", env_, true));
+  });
+
+  while (!reached.load()) {
+    std::this_thread::sleep_for(std::chrono::milliseconds(100));
+  }
+
+  TEST_SYNC_POINT(
+      "PerKeyPointLockManagerTest::UpgradeLockRaceCondition:"
+      "AfterNewSharedLockRequest");
+
+  // validate txn2 is woken up and takes the shared lock
+  t1.join();
+
+  // validate txn2 would get deadlock when it try to upgrade its lock to
+  // exclusive
+  auto s = locker_->TryLock(txn2, 1, "k1", env_, true);
+  ASSERT_TRUE(s.IsDeadlock());
+
+  // cleanup
+  locker_->UnLock(txn2, 1, "k1", env_);
+  t2.join();
+  locker_->UnLock(txn3, 1, "k1", env_);
+
+  delete txn3;
+  delete txn2;
+  delete txn1;
+}
+
+TEST_P(SpotLockManagerTest, Catch22) {
+  // Benchmark the overhead of one transaction depends on another in a circle
+  // repeatedly
+
+  MockColumnFamilyHandle cf(1);
+  locker_->AddColumnFamily(&cf);
+  TransactionOptions txn_opt;
+  txn_opt.deadlock_detect = true;
+  txn_opt.lock_timeout = kLongTxnTimeoutMs;
+  txn_opt.expiration = kLongTxnTimeoutMs;
+
+  auto txn1 = NewTxn(txn_opt);
+  auto txn2 = NewTxn(txn_opt);
+
+  // use a wait count to count the number of times the lock is waited inside
+  // transaction lock
+  std::atomic_int wait_count(0);
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  if (GetParam().use_per_key_point_lock_manager &&
+      GetParam().deadlock_timeout_us != 0) {
+    // Use special sync point when deadlock timeout is enabled, so the test run
+    // faster
+    SyncPoint::GetInstance()->SetCallBack(
+        "PerKeyPointLockManager::AcquireWithTimeout:"
+        "WaitingTxnBeforeDeadLockDetection",
+        [&wait_count](void* /*arg*/) { wait_count++; });
+  } else {
+    // PointLockManager
+    SyncPoint::GetInstance()->SetCallBack(
+        wait_sync_point_name_, [&wait_count](void* /*arg*/) { wait_count++; });
+  }
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  // txn1 X lock
+  ASSERT_OK(locker_->TryLock(txn1, 1, "k1", env_, true));
+
+  std::mutex coordinator_mutex;
+  int iteration_count = 10000;
+
+  // txn1 try to lock X lock in a loop
+  auto t1 = port::Thread(
+      [this, &txn1, &wait_count, &coordinator_mutex, &iteration_count]() {
+        while (wait_count.load() < iteration_count) {
+          // spin wait until the other thread enters the lock waiter queue.
+          while (wait_count.load() % 2 == 0);
+          // unlock the lock, so that the other thread can acquire the lock
+          locker_->UnLock(txn1, 1, "k1", env_);
+          {
+            // Use the coordinator mutex to make sure the other thread has been
+            // waked up and acquired the lock, before this thread try to acquire
+            // the lock again.
+            std::scoped_lock<std::mutex> lock(coordinator_mutex);
+            ASSERT_OK(locker_->TryLock(txn1, 1, "k1", env_, true));
+          }
+        }
+        locker_->UnLock(txn1, 1, "k1", env_);
+      });
+
+  // txn2 try to lock X lock in a loop
+  auto t2 = port::Thread(
+      [this, &txn2, &wait_count, &coordinator_mutex, &iteration_count]() {
+        while (wait_count.load() < iteration_count) {
+          {
+            // Use the coordinator mutex to make sure the other thread has been
+            // waked up and acquired the lock, before this thread try to acquire
+            // the lock again.
+            std::scoped_lock<std::mutex> lock(coordinator_mutex);
+            ASSERT_OK(locker_->TryLock(txn2, 1, "k1", env_, true));
+          }
+          // spin wait until the other thread enters the lock waiter queue.
+          while (wait_count.load() % 2 == 1);
+          // unlock the lock, so that the other thread can acquire the lock
+          locker_->UnLock(txn2, 1, "k1", env_);
+        }
+      });
+
+  // clean up
+  t1.join();
+  t2.join();
+
+  delete txn2;
+  delete txn1;
+}
+
+TEST_F(PerKeyPointLockManagerTest, LockUpgradeOrdering) {
+  // When lock is upgraded, verify that it will only upgrade its lock after all
+  // the shared lock that are before the first exclusive lock in the lock wait
+  // queue.
+
+  auto txn1 = NewTxn(txn_opt_);
+  auto txn2 = NewTxn(txn_opt_);
+  auto txn3 = NewTxn(txn_opt_);
+  auto txn4 = NewTxn(txn_opt_);
+
+  std::mutex txn4_mutex;
+  std::unique_lock<std::mutex> txn4_lock(txn4_mutex);
+  std::atomic_bool txn4_waked_up(false);
+  std::atomic_int wait_count(0);
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->SetCallBack(
+      wait_sync_point_name_, [&wait_count](void* /*arg*/) { wait_count++; });
+  SyncPoint::GetInstance()->SetCallBack(
+      "PerKeyPointLockManager::AcquireWithTimeout:AfterWokenUp",
+      [&txn4, &txn4_mutex, &txn4_waked_up](void* arg) {
+        auto transaction_id = *(static_cast<TransactionID*>(arg));
+        if (transaction_id == txn4->GetID()) {
+          txn4_waked_up.store(true);
+          {
+            // wait for txn4 mutex to be released, so that this thread will be
+            // blocked.
+            std::scoped_lock<std::mutex> lock(txn4_mutex);
+          }
+        }
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  // Txn1 X lock
+  ASSERT_OK(locker_->TryLock(txn1, 1, "k1", env_, true));
+
+  // Txn2,3,4 try S lock
+  port::Thread t1([this, &txn2]() {
+    ASSERT_OK(locker_->TryLock(txn2, 1, "k1", env_, false));
+  });
+  port::Thread t2([this, &txn3]() {
+    ASSERT_OK(locker_->TryLock(txn3, 1, "k1", env_, false));
+  });
+  port::Thread t3([this, &txn4]() {
+    ASSERT_OK(locker_->TryLock(txn4, 1, "k1", env_, false));
+  });
+
+  // wait for all 3 transactions to enter wait state
+  while (wait_count.load() < 3) {
+    std::this_thread::sleep_for(std::chrono::milliseconds(1));
+  }
+
+  // Txn1 unlock
+  locker_->UnLock(txn1, 1, "k1", env_);
+
+  // Txn2,3 take S lock
+  t1.join();
+  t2.join();
+
+  // wait for txn4 to be woken up, otherwise txn2 will get deadlock
+  while (!txn4_waked_up.load()) {
+    std::this_thread::sleep_for(std::chrono::milliseconds(1));
+  }
+
+  // Txn2 try X lock
+  std::atomic_bool txn2_exclusive_lock_acquired(false);
+  port::Thread t4([this, &txn2, &txn2_exclusive_lock_acquired]() {
+    ASSERT_OK(locker_->TryLock(txn2, 1, "k1", env_, true));
+    txn2_exclusive_lock_acquired.store(true);
+  });
+
+  // wait for txn2 to enter wait state
+  while (wait_count.load() < 4) {
+    std::this_thread::sleep_for(std::chrono::milliseconds(1));
+  }
+
+  // Txn3 release S lock
+  locker_->UnLock(txn3, 1, "k1", env_);
+
+  // Validate Txn2 has not acquired the lock yet
+  ASSERT_FALSE(txn2_exclusive_lock_acquired.load());
+
+  // Txn4 take S lock
+  txn4_lock.unlock();
+  t3.join();
+
+  // Txn4 release S lock Txn2 upgraded to X lock Txn2
+  locker_->UnLock(txn4, 1, "k1", env_);
+  t4.join();
+  ASSERT_TRUE(txn2_exclusive_lock_acquired.load());
+
+  // release lock clean up
+  locker_->UnLock(txn2, 1, "k1", env_);
+
+  delete txn4;
+  delete txn3;
+  delete txn2;
+  delete txn1;
+}
+
+TEST_F(PerKeyPointLockManagerTest, LockDownGradeRaceCondition) {
+  // When a lock is downgraded, it should notify all the shared waiters in the
+  // queue to take the lock.
+
+  auto txn1 = NewTxn(txn_opt_);
+  auto txn2 = NewTxn(txn_opt_);
+
+  // Txn1 X lock
+  ASSERT_OK(locker_->TryLock(txn1, 1, "k1", env_, true));
+
+  // Txn2 try S lock
+  port::Thread t1;
+  BlockUntilWaitingTxn(wait_sync_point_name_, t1, [this, &txn2]() {
+    ASSERT_OK(locker_->TryLock(txn2, 1, "k1", env_, false));
+  });
+
+  // Txn1 downgrade to S lock
+  ASSERT_OK(locker_->TryLock(txn1, 1, "k1", env_, false));
+
+  // Txn2 take S lock
+  t1.join();
+
+  // clean up
+  locker_->UnLock(txn1, 1, "k1", env_);
+  locker_->UnLock(txn2, 1, "k1", env_);
+
+  delete txn2;
+  delete txn1;
+}
+
+// Run AnyLockManagerTest with PointLockManager
 INSTANTIATE_TEST_CASE_P(PointLockManager, AnyLockManagerTest,
                         ::testing::Values(nullptr));
 
+// Run AnyLockManagerTest with PerKeyPointLockManager
+template <int64_t N>
+void PerKeyPointLockManagerTestSetup(PointLockManagerTest* self) {
+  self->init();
+  self->deadlock_timeout_us = N;
+  self->UsePerKeyPointLockManager();
+}
+
+INSTANTIATE_TEST_CASE_P(
+    PerLockPointLockManager, AnyLockManagerTest,
+    ::testing::Values(PerKeyPointLockManagerTestSetup<0>,
+                      PerKeyPointLockManagerTestSetup<100>,
+                      PerKeyPointLockManagerTestSetup<1000>));
+
+// Run PointLockManagerTest with PerLockPointLockManager and PointLockManager
+INSTANTIATE_TEST_CASE_P(
+    PointLockCorrectnessCheckTestSuite, SpotLockManagerTest,
+    ::testing::ValuesIn(std::vector<SpotLockManagerTestParam>{
+        {true, 0}, {true, 100}, {true, 1000}, {false, 0}}));
+
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/utilities/transactions/lock/point/point_lock_manager_test.h b/utilities/transactions/lock/point/point_lock_manager_test.h
index 4f0054459c99..0261a7b3b07c 100644
--- a/utilities/transactions/lock/point/point_lock_manager_test.h
+++ b/utilities/transactions/lock/point/point_lock_manager_test.h
@@ -4,321 +4,99 @@
 //  COPYING file in the root directory) and Apache 2.0 License
 //  (found in the LICENSE.Apache file in the root directory).
 
+#pragma once
+
 #include "file/file_util.h"
-#include "port/port.h"
-#include "port/stack_trace.h"
 #include "rocksdb/utilities/transaction_db.h"
 #include "test_util/testharness.h"
-#include "test_util/testutil.h"
 #include "utilities/transactions/lock/point/point_lock_manager.h"
+#include "utilities/transactions/lock/point/point_lock_manager_test_common.h"
 #include "utilities/transactions/pessimistic_transaction_db.h"
-#include "utilities/transactions/transaction_db_mutex_impl.h"
 
 namespace ROCKSDB_NAMESPACE {
 
-class MockColumnFamilyHandle : public ColumnFamilyHandle {
- public:
-  explicit MockColumnFamilyHandle(ColumnFamilyId cf_id) : cf_id_(cf_id) {}
-
-  ~MockColumnFamilyHandle() override {}
-
-  const std::string& GetName() const override { return name_; }
-
-  ColumnFamilyId GetID() const override { return cf_id_; }
-
-  Status GetDescriptor(ColumnFamilyDescriptor*) override {
-    return Status::OK();
-  }
-
-  const Comparator* GetComparator() const override {
-    return BytewiseComparator();
-  }
-
- private:
-  ColumnFamilyId cf_id_;
-  std::string name_ = "MockCF";
-};
-
 class PointLockManagerTest : public testing::Test {
  public:
-  void SetUp() override {
+  void init() {
     env_ = Env::Default();
     db_dir_ = test::PerThreadDBPath("point_lock_manager_test");
     ASSERT_OK(env_->CreateDir(db_dir_));
 
     Options opt;
     opt.create_if_missing = true;
-    TransactionDBOptions txn_opt;
-    txn_opt.transaction_lock_timeout = 0;
+    // Reduce the number of stripes to 4 to increase contention in test
+    txndb_opt_.num_stripes = 4;
+    txndb_opt_.transaction_lock_timeout = 0;
 
-    ASSERT_OK(TransactionDB::Open(opt, txn_opt, db_dir_, &db_));
+    ASSERT_OK(TransactionDB::Open(opt, txndb_opt_, db_dir_, &db_));
+
+    wait_sync_point_name_ = "PointLockManager::AcquireWithTimeout:WaitingTxn";
+  }
 
+  void SetUp() override {
+    init();
     // CAUTION: This test creates a separate lock manager object (right, NOT
     // the one that the TransactionDB is using!), and runs tests on it.
     locker_.reset(new PointLockManager(
-        static_cast<PessimisticTransactionDB*>(db_), txn_opt));
-
-    wait_sync_point_name_ = "PointLockManager::AcquireWithTimeout:WaitingTxn";
+        static_cast<PessimisticTransactionDB*>(db_), txndb_opt_));
   }
 
   void TearDown() override {
+    std::string errmsg;
+    auto no_lock_held = verifyNoLocksHeld(locker_, errmsg);
+    ASSERT_TRUE(no_lock_held) << errmsg;
     delete db_;
     EXPECT_OK(DestroyDir(env_, db_dir_));
   }
 
   PessimisticTransaction* NewTxn(
       TransactionOptions txn_opt = TransactionOptions()) {
+    // override deadlock_timeout_us;
+    txn_opt.deadlock_timeout_us = deadlock_timeout_us;
     Transaction* txn = db_->BeginTransaction(WriteOptions(), txn_opt);
     return static_cast<PessimisticTransaction*>(txn);
   }
 
+  int64_t deadlock_timeout_us = 0;
+
+  void UsePerKeyPointLockManager() {
+    locker_.reset(new PerKeyPointLockManager(
+        static_cast<PessimisticTransactionDB*>(db_), txndb_opt_));
+  }
+
  protected:
   Env* env_;
+  TransactionDBOptions txndb_opt_;
   std::shared_ptr<LockManager> locker_;
   const char* wait_sync_point_name_;
   friend void PointLockManagerTestExternalSetup(PointLockManagerTest*);
 
- private:
   std::string db_dir_;
   TransactionDB* db_;
 };
 
-using init_func_t = void (*)(PointLockManagerTest*);
-
-class AnyLockManagerTest : public PointLockManagerTest,
-                           public testing::WithParamInterface<init_func_t> {
- public:
-  void SetUp() override {
-    // If a custom setup function was provided, use it. Otherwise, use what we
-    // have inherited.
-    auto init_func = GetParam();
-    if (init_func)
-      (*init_func)(this);
-    else
-      PointLockManagerTest::SetUp();
-  }
-};
-
-TEST_P(AnyLockManagerTest, ReentrantExclusiveLock) {
-  // Tests that a txn can acquire exclusive lock on the same key repeatedly.
-  MockColumnFamilyHandle cf(1);
-  locker_->AddColumnFamily(&cf);
-  auto txn = NewTxn();
-  ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, true));
-  ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, true));
-
-  // Cleanup
-  locker_->UnLock(txn, 1, "k", env_);
-
-  delete txn;
-}
-
-TEST_P(AnyLockManagerTest, ReentrantSharedLock) {
-  // Tests that a txn can acquire shared lock on the same key repeatedly.
-  MockColumnFamilyHandle cf(1);
-  locker_->AddColumnFamily(&cf);
-  auto txn = NewTxn();
-  ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, false));
-  ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, false));
-
-  // Cleanup
-  locker_->UnLock(txn, 1, "k", env_);
-
-  delete txn;
-}
-
-TEST_P(AnyLockManagerTest, LockUpgrade) {
-  // Tests that a txn can upgrade from a shared lock to an exclusive lock.
-  MockColumnFamilyHandle cf(1);
-  locker_->AddColumnFamily(&cf);
-  auto txn = NewTxn();
-  ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, false));
-  ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, true));
-
-  // Cleanup
-  locker_->UnLock(txn, 1, "k", env_);
-  delete txn;
-}
-
-TEST_P(AnyLockManagerTest, LockDowngrade) {
-  // Tests that a txn can acquire a shared lock after acquiring an exclusive
-  // lock on the same key.
-  MockColumnFamilyHandle cf(1);
-  locker_->AddColumnFamily(&cf);
-  auto txn = NewTxn();
-  ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, true));
-  ASSERT_OK(locker_->TryLock(txn, 1, "k", env_, false));
-
-  // Cleanup
-  locker_->UnLock(txn, 1, "k", env_);
-  delete txn;
-}
-
-TEST_P(AnyLockManagerTest, LockConflict) {
-  // Tests that lock conflicts lead to lock timeout.
-  MockColumnFamilyHandle cf(1);
-  locker_->AddColumnFamily(&cf);
-  auto txn1 = NewTxn();
-  auto txn2 = NewTxn();
-
-  {
-    // exclusive-exclusive conflict.
-    ASSERT_OK(locker_->TryLock(txn1, 1, "k1", env_, true));
-    auto s = locker_->TryLock(txn2, 1, "k1", env_, true);
-    ASSERT_TRUE(s.IsTimedOut());
-  }
-
-  {
-    // exclusive-shared conflict.
-    ASSERT_OK(locker_->TryLock(txn1, 1, "k2", env_, true));
-    auto s = locker_->TryLock(txn2, 1, "k2", env_, false);
-    ASSERT_TRUE(s.IsTimedOut());
-  }
-
-  {
-    // shared-exclusive conflict.
-    ASSERT_OK(locker_->TryLock(txn1, 1, "k2", env_, false));
-    auto s = locker_->TryLock(txn2, 1, "k2", env_, true);
-    ASSERT_TRUE(s.IsTimedOut());
-  }
-
-  // Cleanup
-  locker_->UnLock(txn1, 1, "k1", env_);
-  locker_->UnLock(txn1, 1, "k2", env_);
-
-  delete txn1;
-  delete txn2;
-}
-
-port::Thread BlockUntilWaitingTxn(const char* sync_point_name,
-                                  std::function<void()> f) {
+void BlockUntilWaitingTxn(const char* sync_point_name, port::Thread& t,
+                          std::function<void()> f) {
   std::atomic<bool> reached(false);
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
       sync_point_name, [&](void* /*arg*/) { reached.store(true); });
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
 
-  port::Thread t(f);
+  t = port::Thread(f);
 
-  while (!reached.load()) {
-    std::this_thread::sleep_for(std::chrono::milliseconds(100));
+  // timeout after 30 seconds, so test does not hang forever
+  // 30 seconds should be enough for the test to reach the expected state
+  // without causing too much flakiness
+  for (int i = 0; i < 3000; i++) {
+    if (reached.load()) {
+      break;
+    }
+    std::this_thread::sleep_for(std::chrono::milliseconds(10));
   }
+
+  ASSERT_TRUE(reached.load());
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
-
-  return t;
-}
-
-TEST_P(AnyLockManagerTest, SharedLocks) {
-  // Tests that shared locks can be concurrently held by multiple transactions.
-  MockColumnFamilyHandle cf(1);
-  locker_->AddColumnFamily(&cf);
-  auto txn1 = NewTxn();
-  auto txn2 = NewTxn();
-  ASSERT_OK(locker_->TryLock(txn1, 1, "k", env_, false));
-  ASSERT_OK(locker_->TryLock(txn2, 1, "k", env_, false));
-
-  // Cleanup
-  locker_->UnLock(txn1, 1, "k", env_);
-  locker_->UnLock(txn2, 1, "k", env_);
-
-  delete txn1;
-  delete txn2;
-}
-
-TEST_P(AnyLockManagerTest, Deadlock) {
-  // Tests that deadlock can be detected.
-  // Deadlock scenario:
-  // txn1 exclusively locks k1, and wants to lock k2;
-  // txn2 exclusively locks k2, and wants to lock k1.
-  MockColumnFamilyHandle cf(1);
-  locker_->AddColumnFamily(&cf);
-  TransactionOptions txn_opt;
-  txn_opt.deadlock_detect = true;
-  txn_opt.lock_timeout = 1000000;
-  auto txn1 = NewTxn(txn_opt);
-  auto txn2 = NewTxn(txn_opt);
-
-  ASSERT_OK(locker_->TryLock(txn1, 1, "k1", env_, true));
-  ASSERT_OK(locker_->TryLock(txn2, 1, "k2", env_, true));
-
-  // txn1 tries to lock k2, will block forever.
-  port::Thread t = BlockUntilWaitingTxn(wait_sync_point_name_, [&]() {
-    // block because txn2 is holding a lock on k2.
-    ASSERT_OK(locker_->TryLock(txn1, 1, "k2", env_, true));
-  });
-
-  auto s = locker_->TryLock(txn2, 1, "k1", env_, true);
-  ASSERT_TRUE(s.IsBusy());
-  ASSERT_EQ(s.subcode(), Status::SubCode::kDeadlock);
-
-  std::vector<DeadlockPath> deadlock_paths = locker_->GetDeadlockInfoBuffer();
-  ASSERT_EQ(deadlock_paths.size(), 1u);
-  ASSERT_FALSE(deadlock_paths[0].limit_exceeded);
-
-  std::vector<DeadlockInfo> deadlocks = deadlock_paths[0].path;
-  ASSERT_EQ(deadlocks.size(), 2u);
-
-  ASSERT_EQ(deadlocks[0].m_txn_id, txn1->GetID());
-  ASSERT_EQ(deadlocks[0].m_cf_id, 1u);
-  ASSERT_TRUE(deadlocks[0].m_exclusive);
-  ASSERT_EQ(deadlocks[0].m_waiting_key, "k2");
-
-  ASSERT_EQ(deadlocks[1].m_txn_id, txn2->GetID());
-  ASSERT_EQ(deadlocks[1].m_cf_id, 1u);
-  ASSERT_TRUE(deadlocks[1].m_exclusive);
-  ASSERT_EQ(deadlocks[1].m_waiting_key, "k1");
-
-  locker_->UnLock(txn2, 1, "k2", env_);
-  t.join();
-
-  // Cleanup
-  locker_->UnLock(txn1, 1, "k1", env_);
-  locker_->UnLock(txn1, 1, "k2", env_);
-  delete txn2;
-  delete txn1;
-}
-
-TEST_P(AnyLockManagerTest, GetWaitingTxns_MultipleTxns) {
-  MockColumnFamilyHandle cf(1);
-  locker_->AddColumnFamily(&cf);
-
-  auto txn1 = NewTxn();
-  ASSERT_OK(locker_->TryLock(txn1, 1, "k", env_, false));
-
-  auto txn2 = NewTxn();
-  ASSERT_OK(locker_->TryLock(txn2, 1, "k", env_, false));
-
-  auto txn3 = NewTxn();
-  txn3->SetLockTimeout(10000);
-  port::Thread t1 = BlockUntilWaitingTxn(wait_sync_point_name_, [&]() {
-    ASSERT_OK(locker_->TryLock(txn3, 1, "k", env_, true));
-    locker_->UnLock(txn3, 1, "k", env_);
-  });
-
-  // Ok, now txn3 is waiting for lock on "k", which is owned by two
-  // transactions. Check that GetWaitingTxns reports this correctly
-  uint32_t wait_cf_id;
-  std::string wait_key;
-  auto waiters = txn3->GetWaitingTxns(&wait_cf_id, &wait_key);
-
-  ASSERT_EQ(wait_cf_id, 1u);
-  ASSERT_EQ(wait_key, "k");
-  ASSERT_EQ(waiters.size(), 2);
-  bool waits_correct =
-      (waiters[0] == txn1->GetID() && waiters[1] == txn2->GetID()) ||
-      (waiters[1] == txn1->GetID() && waiters[0] == txn2->GetID());
-  ASSERT_EQ(waits_correct, true);
-
-  // Release locks so txn3 can proceed with execution
-  locker_->UnLock(txn1, 1, "k", env_);
-  locker_->UnLock(txn2, 1, "k", env_);
-
-  // Wait until txn3 finishes
-  t1.join();
-
-  delete txn1;
-  delete txn2;
-  delete txn3;
 }
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/utilities/transactions/lock/point/point_lock_manager_test_common.h b/utilities/transactions/lock/point/point_lock_manager_test_common.h
new file mode 100644
index 000000000000..a4cc7dafc135
--- /dev/null
+++ b/utilities/transactions/lock/point/point_lock_manager_test_common.h
@@ -0,0 +1,78 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <sstream>
+
+#include "rocksdb/db.h"
+#include "utilities/transactions/lock/lock_manager.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+constexpr auto kLongTxnTimeoutMs = 100000;
+constexpr auto kShortTxnTimeoutMs = 100;
+
+class MockColumnFamilyHandle : public ColumnFamilyHandle {
+ public:
+  explicit MockColumnFamilyHandle(ColumnFamilyId cf_id) : cf_id_(cf_id) {}
+
+  // disable copy and assignment
+  MockColumnFamilyHandle(const MockColumnFamilyHandle&) = delete;
+  MockColumnFamilyHandle& operator=(const MockColumnFamilyHandle&) = delete;
+  // disable move
+  MockColumnFamilyHandle(MockColumnFamilyHandle&&) = delete;
+  MockColumnFamilyHandle& operator=(MockColumnFamilyHandle&&) = delete;
+
+  ~MockColumnFamilyHandle() override {}
+
+  const std::string& GetName() const override { return name_; }
+
+  ColumnFamilyId GetID() const override { return cf_id_; }
+
+  Status GetDescriptor(ColumnFamilyDescriptor*) override {
+    return Status::OK();
+  }
+
+  const Comparator* GetComparator() const override {
+    return BytewiseComparator();
+  }
+
+ private:
+  ColumnFamilyId cf_id_;
+  std::string name_ = "MockCF";
+};
+
+// Verify no lock was held. Return true, if success. False, if there is. Set
+// error message on False.
+bool verifyNoLocksHeld(std::shared_ptr<LockManager>& locker,
+                       std::string& errmsg) {
+  // Validate no lock was held at the end of the test
+  auto lock_status = locker->GetPointLockStatus();
+  // print the lock status for debugging
+  std::stringstream ss;
+  for (auto& s : lock_status) {
+    ss << "id " << s.first;
+    ss << " key " << s.second.key;
+    ss << " type " << (s.second.exclusive ? "exclusive" : "shared");
+    ss << " txn ids [";
+    for (auto& t : s.second.ids) {
+      ss << t << ",";
+    }
+    ss << "]";
+    ss << std::endl;
+  }
+
+  if (!lock_status.empty()) {
+    errmsg = std::to_string(lock_status.size()) +
+             " locks were held at the end. " + ss.str();
+    return false;
+  }
+
+  return true;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/utilities/transactions/lock/point/point_lock_validation_test_runner.h b/utilities/transactions/lock/point/point_lock_validation_test_runner.h
new file mode 100644
index 000000000000..92af254522e6
--- /dev/null
+++ b/utilities/transactions/lock/point/point_lock_validation_test_runner.h
@@ -0,0 +1,466 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cinttypes>
+#include <cstddef>
+#include <cstdio>
+#include <iostream>
+#include <memory>
+
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "utilities/transactions/lock/lock_manager.h"
+#include "utilities/transactions/lock/point/point_lock_manager_test_common.h"
+#include "utilities/transactions/pessimistic_transaction.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+constexpr bool kDebugLog = false;
+
+// Since this code is executed both with and without gtest, it supports assert
+// with different ways.
+#ifdef ASSERT_TRUE
+#define ASSERT_TRUE_WITH_MSG(expr, errmsg) ASSERT_TRUE(expr) << (errmsg)
+#else
+#define ASSERT_TRUE_WITH_MSG(expr, errmsg)                             \
+  if (!(expr)) {                                                       \
+    std::cerr << "Assert true failed with error message: " << (errmsg) \
+              << std::endl;                                            \
+    abort();                                                           \
+  }
+#endif
+
+#ifndef ASSERT_OK
+#define ASSERT_OK(s) \
+  ASSERT_TRUE_WITH_MSG(s.ok(), "Failed with " + s.ToString());
+#endif
+
+#define ASSERT_TRUE_WITH_INFO(X) \
+  ASSERT_TRUE_WITH_MSG(          \
+      (X), " Txn " + std::to_string(txn_id) + " key " + std::to_string(key))
+
+#define ASSERT_EQ_WITH_INFO(X, Y) ASSERT_TRUE_WITH_INFO((X) == (Y))
+
+#define DEBUG_LOG(...)            \
+  if (kDebugLog) {                \
+    fprintf(stderr, __VA_ARGS__); \
+    fflush(stderr);               \
+  }
+
+#define DEBUG_LOG_WITH_PREFIX(format, ...) \
+  DEBUG_LOG("Txn %" PRIu64 " " format, txn_id, ##__VA_ARGS__);
+
+enum class LockTypeToTest : int8_t {
+  EXCLUSIVE_ONLY = 0,
+  SHARED_ONLY = 1,
+  EXCLUSIVE_AND_SHARED = 2,
+};
+
+struct KeyStatus {
+  KeyStatus(uint32_t k, bool ex, int v) : key(k), exclusive(ex), value(v) {}
+  uint32_t key;
+  bool exclusive;
+  int value;
+};
+
+class PointLockValidationTestRunner {
+ public:
+  PointLockValidationTestRunner(
+      Env* env, TransactionDBOptions txndb_opt,
+      std::shared_ptr<LockManager> locker, TransactionDB* db,
+      TransactionOptions txn_opt, uint32_t thd_cnt, uint32_t key_cnt,
+      uint32_t max_num_keys_to_lock_per_txn, uint32_t execution_time_sec,
+      LockTypeToTest lock_type, bool allow_non_deadlock_error,
+      uint32_t max_sleep_after_lock_acquisition_ms,
+      bool enable_per_thread_lock_count_assertion = false)
+      : env_(env),
+        txndb_opt_(std::move(txndb_opt)),
+        locker_(std::move(locker)),
+        db_(db),
+        txn_opt_(std::move(txn_opt)),
+        thread_count_(thd_cnt),
+        key_count_(key_cnt),
+        max_num_keys_to_lock_per_txn_(max_num_keys_to_lock_per_txn),
+        execution_time_sec_(execution_time_sec),
+        lock_type_(lock_type),
+        allow_non_deadlock_error_(allow_non_deadlock_error),
+        max_sleep_after_lock_acquisition_ms_(
+            max_sleep_after_lock_acquisition_ms),
+        enable_per_thread_lock_count_assertion_(
+            enable_per_thread_lock_count_assertion),
+        shutdown_(false) {
+    // Only enable lock status validation when lock expiration/stealing isk
+    // disabled.
+    enable_lock_status_validation_ = txn_opt_.expiration == -1;
+    values_.resize(key_count_, 0);
+    exclusive_lock_status_.resize(key_count_, 0);
+
+    // init counters and values
+    for (size_t i = 0; i < key_count_; i++) {
+      counters_.emplace_back(std::make_unique<std::atomic_int>(0));
+      shared_lock_count_.emplace_back(std::make_unique<std::atomic_int>(0));
+    }
+
+    for (size_t i = 0; i < thread_count_; i++) {
+      num_of_locks_acquired_per_thread_.emplace_back(
+          std::make_unique<std::atomic_int64_t>(0));
+    }
+  }
+
+  // Decide which lock type to acquire
+  // If the key is already locked and only one type of locks to be tested,
+  // return false, so caller could try to lock a different key.
+  // Otherwise, return true.
+  bool DecideLockType(
+      bool& acquire_exclusive_lock, uint32_t key,
+      std::unordered_map<uint32_t, KeyStatus>& locked_key_status,
+      bool& isUpgrade, bool& isDowngrade) {
+    // Decide lock type
+    acquire_exclusive_lock = Random::GetTLSInstance()->OneIn(2);
+
+    // check whether a lock on the same key is already held
+    auto it = locked_key_status.find(key);
+    if (it != locked_key_status.end()) {
+      // a lock on the same key is already held.
+      if (lock_type_ == LockTypeToTest::EXCLUSIVE_AND_SHARED) {
+        // if test both shared and exclusive locks, switch their type
+        if (it->second.exclusive == false) {
+          // If it is a shared lock, upgrade to an exclusive lock
+          acquire_exclusive_lock = true;
+          isUpgrade = true;
+        } else {
+          // If it is an exclusive lock, downgrade to a shared lock
+          acquire_exclusive_lock = false;
+          isDowngrade = true;
+        }
+      } else {
+        // Only one type of lock to test, and the key is already locked,
+        return false;
+      }
+    }
+
+    // This is a new key to lock or the lock type is switched.
+    if (lock_type_ != LockTypeToTest::EXCLUSIVE_AND_SHARED) {
+      // if only one type of locks to be acquired, update its type
+      acquire_exclusive_lock = (lock_type_ == LockTypeToTest::EXCLUSIVE_ONLY);
+    }
+    return true;
+  }
+
+  void run() {
+    // Verify lock guarantee. Exclusive lock provide unique access guarantee.
+    // Shared lock provide shared access guarantee.
+    // Create multiple threads. Each try to grab a lock with random type on
+    // random key.
+
+    // To validate lock exclusive guarantee, each key has a value and a counter
+    // used for tracking the number of exclusive locks have been acquired on it
+    // in each test run across all threads.
+
+    // Every time an exclusive lock is acquired, both the counter and the value
+    // are bumped by 1. The difference between the counter and the value is that
+    // counter is atomic, so it is guaranteed that it would not lose update,
+    // while value is not atomic. Its correctness is only guaranteed by the
+    // exclusiveness provided by the lock manager which is being tested. If the
+    // lock manager does not guarantee exclusiveness, the value would lose
+    // update, and the counter would mismatch with the value, which fails the
+    // test.
+
+    // To validate lock shared guarantee, after a shared lock is acquired, the
+    // counter and value are read and stored in a local variable inside the
+    // thread. Before the lock is released, the local copy is compared against
+    // the counter and value. If they mismatch, it means the shared lock
+    // guaranteed is violated.
+
+    MockColumnFamilyHandle cf(1);
+    locker_->AddColumnFamily(&cf);
+
+    for (uint32_t thd_idx = 0; thd_idx < thread_count_; thd_idx++) {
+      threads_.emplace_back([this, thd_idx]() {
+        auto txn = static_cast<PessimisticTransaction*>(
+            db_->BeginTransaction(WriteOptions(), txn_opt_));
+        auto txn_id = txn->GetID();
+        DEBUG_LOG_WITH_PREFIX("Thd %" PRIu32 " new txn\n", thd_idx);
+        while (!shutdown_) {
+          std::unordered_map<uint32_t, KeyStatus> locked_key_status;
+          auto num_key_to_lock = max_num_keys_to_lock_per_txn_;
+          Status s;
+
+          for (uint32_t j = 0; j < num_key_to_lock; j++) {
+            uint32_t key = 0;
+            key = Random::GetTLSInstance()->Uniform(key_count_);
+            auto key_str = std::to_string(key);
+            bool isUpgrade = false;
+            bool isDowngrade = false;
+            bool exclusive_lock_type;
+
+            if (!DecideLockType(exclusive_lock_type, key, locked_key_status,
+                                isUpgrade, isDowngrade)) {
+              // try a different key
+              j--;
+              continue;
+            }
+
+            if (enable_lock_status_validation_) {
+              if (isDowngrade) {
+                // Before downgrade, validate the lock is in exlusive status
+                // This could not be done after downgrade, as another thread
+                // could take a shared lock and update lock status
+                ASSERT_TRUE_WITH_INFO(exclusive_lock_status_[key]);
+                ASSERT_EQ_WITH_INFO(*shared_lock_count_[key], 0);
+                // for downgrade, update the lock status before acquiring the
+                // lock, as afterwards, it will not have exclusive access to it
+                exclusive_lock_status_[key] = 0;
+              }
+            }
+
+            // try to acquire the lock
+            DEBUG_LOG_WITH_PREFIX("try to acquire lock %" PRIu32 " type %s\n",
+                                  key,
+                                  exclusive_lock_type ? "exclusive" : "shared");
+            s = locker_->TryLock(txn, 1, key_str, env_, exclusive_lock_type);
+
+            if (s.ok()) {
+              DEBUG_LOG_WITH_PREFIX(
+                  "acquired lock %" PRIu32 " type %s\n", key,
+                  exclusive_lock_type ? "exclusive" : "shared");
+
+              auto it = locked_key_status.find(key);
+              if (isUpgrade || isDowngrade) {
+                // If it is either upgrade or downgrade, the key should exist
+                // already.
+                ASSERT_TRUE_WITH_INFO(it != locked_key_status.end());
+              } else {
+                locked_key_status.emplace(
+                    std::piecewise_construct, std::forward_as_tuple(key),
+                    std::forward_as_tuple(key, exclusive_lock_type,
+                                          values_[key]));
+              }
+              // update local lock status
+              if (exclusive_lock_type) {
+                if (isUpgrade) {
+                  it->second.exclusive = true;
+                }
+                num_of_exclusive_locks_acquired_++;
+              } else {
+                if (isDowngrade) {
+                  it->second.exclusive = false;
+                }
+                num_of_shared_locks_acquired_++;
+              }
+              num_of_locks_acquired_++;
+              (*num_of_locks_acquired_per_thread_[thd_idx])++;
+
+              if (enable_lock_status_validation_) {
+                if (exclusive_lock_type) {
+                  // validate the lock is not in exclusive status
+                  ASSERT_TRUE_WITH_INFO(!exclusive_lock_status_[key]);
+                  if (isUpgrade) {
+                    // validate the lock is in shared status and only had one
+                    // shared lock
+                    ASSERT_EQ_WITH_INFO(*shared_lock_count_[key], 1);
+                    shared_lock_count_[key]->fetch_sub(1);
+                  } else {
+                    ASSERT_EQ_WITH_INFO(*shared_lock_count_[key], 0);
+                  }
+                  // update the lock status
+                  exclusive_lock_status_[key] = 1;
+                } else {
+                  shared_lock_count_[key]->fetch_add(1);
+                  ASSERT_TRUE_WITH_INFO(!exclusive_lock_status_[key]);
+                }
+              }
+            } else {
+              if (!allow_non_deadlock_error_) {
+                ASSERT_TRUE_WITH_INFO(s.IsDeadlock());
+              }
+              if (s.IsDeadlock()) {
+                DEBUG_LOG_WITH_PREFIX(
+                    "detected deadlock on key %" PRIu32 ", abort\n", key);
+                num_of_deadlock_detected_++;
+                // for deadlock, release all locks acquired
+                break;
+              } else {
+                // for other errors, try again
+                DEBUG_LOG_WITH_PREFIX("failed to acquire lock on key %" PRIu32
+                                      ", due to "
+                                      "%s, "
+                                      "abort\n",
+                                      key, s.ToString().c_str());
+              }
+            }
+          }
+
+          // After all of the locks are acquired, try to sleep a bit to simulate
+          // some useful work to be done
+          if (max_sleep_after_lock_acquisition_ms_ != 0 && s.ok()) {
+            auto sleep_time_us = Random::GetTLSInstance()->Uniform(
+                static_cast<uint32_t>(max_sleep_after_lock_acquisition_ms_));
+            std::this_thread::sleep_for(
+                std::chrono::milliseconds(sleep_time_us));
+          }
+
+          // release all locks
+          for (const auto& pair : locked_key_status) {
+            auto key_status = pair.second;
+            auto key = key_status.key;
+            ASSERT_TRUE_WITH_INFO(key < key_count_);
+            if (enable_lock_status_validation_) {
+              ASSERT_EQ_WITH_INFO(counters_[key]->load(), values_[key]);
+              auto exclusive = key_status.exclusive;
+              if (exclusive) {
+                // for exclusive lock, bump the value by 1
+                (*counters_[key])++;
+                values_[key]++;
+                DEBUG_LOG_WITH_PREFIX("bump key %" PRIu32 " by 1 to %d\n", key,
+                                      values_[key]);
+                ASSERT_EQ_WITH_INFO(counters_[key]->load(), values_[key]);
+              } else {
+                // shared lock, validate the value has not changed since it was
+                // read
+                ASSERT_EQ_WITH_INFO(counters_[key]->load(), key_status.value);
+                ASSERT_EQ_WITH_INFO(values_[key], key_status.value);
+              }
+              if (exclusive) {
+                ASSERT_TRUE_WITH_INFO(exclusive_lock_status_[key]);
+                ASSERT_EQ_WITH_INFO(*shared_lock_count_[key], 0);
+                exclusive_lock_status_[key] = 0;
+              } else {
+                ASSERT_TRUE_WITH_INFO(!exclusive_lock_status_[key]);
+                ASSERT_TRUE_WITH_INFO(shared_lock_count_[key]->fetch_sub(1) >=
+                                      1);
+              }
+            }
+            DEBUG_LOG_WITH_PREFIX("release lock %" PRIu32 "\n", key);
+            locker_->UnLock(txn, 1, std::to_string(key), env_);
+          }
+        }
+        delete txn;
+      });
+    }
+
+    // run test for a few seconds
+    // print progress
+    auto prev_num_of_locks_acquired = num_of_locks_acquired_.load();
+    std::vector<int64_t> prev_num_of_locks_acquired_per_thread(thread_count_,
+                                                               0);
+    int64_t measured_locks_acquired = 0;
+    for (uint32_t i = 0; i < execution_time_sec_; i++) {
+      std::this_thread::sleep_for(std::chrono::seconds(1));
+      auto num_of_locks_acquired = num_of_locks_acquired_.load();
+      DEBUG_LOG("num_of_locks_acquired: %" PRId64 "\n", num_of_locks_acquired);
+      DEBUG_LOG("num_of_exclusive_locks_acquired: %" PRId64 "\n",
+                num_of_exclusive_locks_acquired_.load());
+      DEBUG_LOG("num_of_shared_locks_acquired: %" PRId64 "\n",
+                num_of_shared_locks_acquired_.load());
+      DEBUG_LOG("num_of_deadlock_detected: %" PRId64 "\n",
+                num_of_deadlock_detected_.load());
+      ASSERT_TRUE_WITH_MSG(num_of_locks_acquired > prev_num_of_locks_acquired,
+                           "No locks were acquired in the last 1 second");
+      for (uint32_t thd_idx = 0; thd_idx < thread_count_; thd_idx++) {
+        auto num_of_locks_acquired_per_thread =
+            num_of_locks_acquired_per_thread_[thd_idx]->load();
+        DEBUG_LOG("thread: %" PRIu32 " acquired %" PRId64 " locks\n", thd_idx,
+                  num_of_locks_acquired_per_thread);
+        if (enable_per_thread_lock_count_assertion_) {
+          ASSERT_TRUE_WITH_MSG(
+              num_of_locks_acquired_per_thread >
+                  prev_num_of_locks_acquired_per_thread[thd_idx],
+              "No locks were acquired in the last 1 second on thread " +
+                  std::to_string(thd_idx));
+        }
+        prev_num_of_locks_acquired_per_thread[thd_idx] =
+            num_of_locks_acquired_per_thread;
+      }
+      prev_num_of_locks_acquired = num_of_locks_acquired;
+      if (i == 0) {
+        measured_locks_acquired = num_of_locks_acquired;
+      }
+      if (i == execution_time_sec_ - 1) {
+        measured_locks_acquired =
+            num_of_locks_acquired - measured_locks_acquired;
+        // Skip the first second, as threads are warming up
+        printf("measured_num_of_locks_acquired: %" PRId64 "\n",
+               measured_locks_acquired / (execution_time_sec_ - 1));
+      }
+    }
+
+    shutdown_ = true;
+    for (auto& t : threads_) {
+      t.join();
+    }
+
+    // validate values against counters
+    for (uint32_t i = 0; i < key_count_; i++) {
+      ASSERT_TRUE_WITH_MSG(counters_[i]->load() == values_[i],
+                           "Exclusive lock guarantee is violated.");
+    }
+
+    ASSERT_TRUE_WITH_MSG(num_of_locks_acquired_.load() >= 0,
+                         "No lock were acquired at all");
+    printf("num_of_locks_acquired: %" PRId64 "\n",
+           num_of_locks_acquired_.load());
+
+    std::string errmsg;
+    auto no_lock_held = verifyNoLocksHeld(locker_, errmsg);
+    ASSERT_TRUE_WITH_MSG(no_lock_held, errmsg);
+  }
+
+ private:
+  // test configuration
+  Env* env_;
+  TransactionDBOptions txndb_opt_;
+  std::shared_ptr<LockManager> locker_;
+
+  TransactionDB* db_;
+  TransactionOptions txn_opt_;
+
+  uint32_t thread_count_;
+  uint32_t key_count_;
+  uint32_t max_num_keys_to_lock_per_txn_;
+  uint32_t execution_time_sec_;
+  LockTypeToTest lock_type_;
+  bool allow_non_deadlock_error_;
+  uint32_t max_sleep_after_lock_acquisition_ms_;
+
+  // In some of the test run, due to debug or ASAN build and short lock timeout,
+  // a thread may not be able to acquire any lock within a second. So skip this
+  // assertion by default. However, this could be useful for quickly detecting
+  // stuck thread, when running locally with longer timeout.
+  bool enable_per_thread_lock_count_assertion_;
+
+  // Internal test variables
+
+  bool enable_lock_status_validation_;
+  std::vector<std::thread> threads_;
+  std::vector<std::unique_ptr<std::atomic_int>> counters_;
+  std::vector<int> values_;
+
+  // track whether the lock is in exclusive status or
+  // not. vector<bool> does something special underneath, causing consistency
+  // issue. Therefore int64_t is used.
+  std::vector<int64_t> exclusive_lock_status_;
+
+  // A counter to track number of shared locks for tracking shared lock status
+  std::vector<std::unique_ptr<std::atomic_int>> shared_lock_count_;
+
+  // shutdown flag to signal threads to exit
+  std::atomic_bool shutdown_ = false;
+
+  // test statistics
+  std::atomic_int64_t num_of_locks_acquired_ = 0;
+  std::atomic_int64_t num_of_shared_locks_acquired_ = 0;
+  std::atomic_int64_t num_of_exclusive_locks_acquired_ = 0;
+  std::atomic_int64_t num_of_deadlock_detected_ = 0;
+  std::vector<std::unique_ptr<std::atomic_int64_t>>
+      num_of_locks_acquired_per_thread_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/utilities/transactions/lock/range/range_locking_test.cc b/utilities/transactions/lock/range/range_locking_test.cc
index 45531910d159..0e37badbbcad 100644
--- a/utilities/transactions/lock/range/range_locking_test.cc
+++ b/utilities/transactions/lock/range/range_locking_test.cc
@@ -5,22 +5,18 @@
 
 #ifndef OS_WIN
 
-#include <algorithm>
 #include <functional>
 #include <iomanip>
 #include <string>
 #include <thread>
 
 #include "db/db_impl/db_impl.h"
-#include "port/port.h"
 #include "rocksdb/db.h"
 #include "rocksdb/options.h"
-#include "rocksdb/perf_context.h"
 #include "rocksdb/utilities/transaction.h"
 #include "rocksdb/utilities/transaction_db.h"
-#include "utilities/transactions/lock/point/point_lock_manager_test.h"
-#include "utilities/transactions/pessimistic_transaction_db.h"
-#include "utilities/transactions/transaction_test.h"
+#include "utilities/transactions/lock/point/any_lock_manager_test.h"
+#include "utilities/transactions/transaction_db_mutex_impl.h"
 
 using std::string;
 
diff --git a/utilities/transactions/pessimistic_transaction.cc b/utilities/transactions/pessimistic_transaction.cc
index 9bdb587274f3..79b26f201d8e 100644
--- a/utilities/transactions/pessimistic_transaction.cc
+++ b/utilities/transactions/pessimistic_transaction.cc
@@ -84,6 +84,10 @@ void PessimisticTransaction::Initialize(const TransactionOptions& txn_options) {
         txn_db_impl_->GetTxnDBOptions().transaction_lock_timeout * 1000;
   }
 
+  // deadlock timeout should be lower than lock timeout
+  deadlock_timeout_us_ =
+      std::min(txn_options.deadlock_timeout_us, lock_timeout_);
+
   if (txn_options.expiration >= 0) {
     expiration_time_ = start_time_ + txn_options.expiration * 1000;
   } else {
diff --git a/utilities/transactions/pessimistic_transaction.h b/utilities/transactions/pessimistic_transaction.h
index b55d69685dd1..71ec74f0efa4 100644
--- a/utilities/transactions/pessimistic_transaction.h
+++ b/utilities/transactions/pessimistic_transaction.h
@@ -81,7 +81,7 @@ class PessimisticTransaction : public TransactionBaseImpl {
     return ids;
   }
 
-  void SetWaitingTxn(autovector<TransactionID> ids, uint32_t column_family_id,
+  void SetWaitingTxn(autovector<TransactionID>& ids, uint32_t column_family_id,
                      const std::string* key, bool is_timed_out = false) {
     std::lock_guard<std::mutex> lock(wait_mutex_);
     waiting_txn_ids_ = ids;
@@ -114,6 +114,10 @@ class PessimisticTransaction : public TransactionBaseImpl {
   void SetLockTimeout(int64_t timeout) override {
     lock_timeout_ = timeout * 1000;
   }
+  int64_t GetDeadlockTimeout() const { return deadlock_timeout_us_; }
+  void SetDeadlockTimeout(int64_t timeout_ms) override {
+    deadlock_timeout_us_ = timeout_ms * 1000;
+  }
 
   // Returns true if locks were stolen successfully, false otherwise.
   bool TryStealingLocks();
@@ -213,6 +217,10 @@ class PessimisticTransaction : public TransactionBaseImpl {
   // Timeout in microseconds when locking a key or -1 if there is no timeout.
   int64_t lock_timeout_;
 
+  // Timeout in microseconds before perform dead lock detection.
+  // If 0, deadlock detection will be performed immediately.
+  int64_t deadlock_timeout_us_;
+
   // Whether to perform deadlock detection or not.
   bool deadlock_detect_;
 
diff --git a/utilities/transactions/timestamped_snapshot_test.cc b/utilities/transactions/timestamped_snapshot_test.cc
index 1ca265aa153a..8bd72eea01b1 100644
--- a/utilities/transactions/timestamped_snapshot_test.cc
+++ b/utilities/transactions/timestamped_snapshot_test.cc
@@ -9,17 +9,26 @@
 #include "utilities/transactions/transaction_test.h"
 
 namespace ROCKSDB_NAMESPACE {
+
+constexpr std::array TimestampedSnapshotWithTsSanityCheck_Params = {
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite),
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite),
+    std::make_tuple(false, false, WRITE_UNPREPARED, kOrderedWrite)};
+
 INSTANTIATE_TEST_CASE_P(
     Unsupported, TimestampedSnapshotWithTsSanityCheck,
-    ::testing::Values(
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite),
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite),
-        std::make_tuple(false, false, WRITE_UNPREPARED, kOrderedWrite)));
-
-INSTANTIATE_TEST_CASE_P(WriteCommitted, TransactionTest,
-                        ::testing::Combine(::testing::Bool(), ::testing::Bool(),
-                                           ::testing::Values(WRITE_COMMITTED),
-                                           ::testing::Values(kOrderedWrite)));
+    ::testing::ValuesIn(WRAP_PARAM_WITH_PER_KEY_POINT_LOCK_MANAGER_PARAMS(
+        WRAP_PARAM(bool, bool, TxnDBWritePolicy, WriteOrdering),
+        TimestampedSnapshotWithTsSanityCheck_Params)));
+
+INSTANTIATE_TEST_CASE_P(
+    WriteCommitted, TransactionTest,
+    ::testing::Combine(/*use_stackable_db=*/::testing::Bool(),
+                       /*two_write_queue=*/::testing::Bool(),
+                       ::testing::Values(WRITE_COMMITTED),
+                       ::testing::Values(kOrderedWrite),
+                       /*use_per_key_point_lock_mgr=*/::testing::Bool(),
+                       /*deadlock_timeout_us=*/::testing::Values(0, 1000)));
 
 namespace {
 // Not thread-safe. Caller needs to provide external synchronization.
diff --git a/utilities/transactions/transaction_base.h b/utilities/transactions/transaction_base.h
index 859518ceceea..49366e59d56b 100644
--- a/utilities/transactions/transaction_base.h
+++ b/utilities/transactions/transaction_base.h
@@ -250,6 +250,8 @@ class TransactionBaseImpl : public Transaction {
 
   void SetLockTimeout(int64_t /*timeout*/) override { /* Do nothing */ }
 
+  void SetDeadlockTimeout(int64_t /*timeout*/) override { /* Do nothing */ }
+
   const Snapshot* GetSnapshot() const override {
     // will return nullptr when there is no snapshot
     return snapshot_.get();
diff --git a/utilities/transactions/transaction_db_mutex_impl.cc b/utilities/transactions/transaction_db_mutex_impl.cc
index 7e10feccbd0f..9f549eae952d 100644
--- a/utilities/transactions/transaction_db_mutex_impl.cc
+++ b/utilities/transactions/transaction_db_mutex_impl.cc
@@ -7,8 +7,9 @@
 
 #include <chrono>
 #include <condition_variable>
-#include <functional>
 #include <mutex>
+#include <sstream>
+#include <thread>
 
 #include "rocksdb/utilities/transaction_db_mutex.h"
 
diff --git a/utilities/transactions/transaction_test.cc b/utilities/transactions/transaction_test.cc
index 83b115711167..e3b22804c2a7 100644
--- a/utilities/transactions/transaction_test.cc
+++ b/utilities/transactions/transaction_test.cc
@@ -35,51 +35,71 @@
 
 namespace ROCKSDB_NAMESPACE {
 
+constexpr std::array DBAsBaseDB_TransactionTest_Params = {
+    std::make_tuple(false, false, WRITE_COMMITTED, kOrderedWrite),
+    std::make_tuple(false, true, WRITE_COMMITTED, kOrderedWrite),
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite),
+    std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite),
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite),
+    std::make_tuple(false, false, WRITE_UNPREPARED, kOrderedWrite),
+    std::make_tuple(false, true, WRITE_UNPREPARED, kOrderedWrite)};
+
 INSTANTIATE_TEST_CASE_P(
     DBAsBaseDB, TransactionTest,
-    ::testing::Values(
-        std::make_tuple(false, false, WRITE_COMMITTED, kOrderedWrite),
-        std::make_tuple(false, true, WRITE_COMMITTED, kOrderedWrite),
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite),
-        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite),
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite),
-        std::make_tuple(false, false, WRITE_UNPREPARED, kOrderedWrite),
-        std::make_tuple(false, true, WRITE_UNPREPARED, kOrderedWrite)));
+    ::testing::ValuesIn(WRAP_PARAM_WITH_PER_KEY_POINT_LOCK_MANAGER_PARAMS(
+        WRAP_PARAM(bool, bool, TxnDBWritePolicy, WriteOrdering),
+        DBAsBaseDB_TransactionTest_Params)));
+
+constexpr std::array DBAsBaseDB_TransactionStressTest_Params = {
+    std::make_tuple(false, false, WRITE_COMMITTED, kOrderedWrite),
+    std::make_tuple(false, true, WRITE_COMMITTED, kOrderedWrite),
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite),
+    std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite),
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite),
+    std::make_tuple(false, false, WRITE_UNPREPARED, kOrderedWrite),
+    std::make_tuple(false, true, WRITE_UNPREPARED, kOrderedWrite)};
+
 INSTANTIATE_TEST_CASE_P(
     DBAsBaseDB, TransactionStressTest,
-    ::testing::Values(
-        std::make_tuple(false, false, WRITE_COMMITTED, kOrderedWrite),
-        std::make_tuple(false, true, WRITE_COMMITTED, kOrderedWrite),
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite),
-        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite),
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite),
-        std::make_tuple(false, false, WRITE_UNPREPARED, kOrderedWrite),
-        std::make_tuple(false, true, WRITE_UNPREPARED, kOrderedWrite)));
+    ::testing::ValuesIn(WRAP_PARAM_WITH_PER_KEY_POINT_LOCK_MANAGER_PARAMS(
+        WRAP_PARAM(bool, bool, TxnDBWritePolicy, WriteOrdering),
+        DBAsBaseDB_TransactionStressTest_Params)));
+
+constexpr std::array StackableDBAsBaseDB_TransactionTest_Params = {
+    std::make_tuple(true, true, WRITE_COMMITTED, kOrderedWrite),
+    std::make_tuple(true, true, WRITE_PREPARED, kOrderedWrite),
+    std::make_tuple(true, true, WRITE_UNPREPARED, kOrderedWrite)};
+
 INSTANTIATE_TEST_CASE_P(
     StackableDBAsBaseDB, TransactionTest,
-    ::testing::Values(
-        std::make_tuple(true, true, WRITE_COMMITTED, kOrderedWrite),
-        std::make_tuple(true, true, WRITE_PREPARED, kOrderedWrite),
-        std::make_tuple(true, true, WRITE_UNPREPARED, kOrderedWrite)));
+    ::testing::ValuesIn(WRAP_PARAM_WITH_PER_KEY_POINT_LOCK_MANAGER_PARAMS(
+        WRAP_PARAM(bool, bool, TxnDBWritePolicy, WriteOrdering),
+        StackableDBAsBaseDB_TransactionTest_Params)));
 
 // MySQLStyleTransactionTest takes far too long for valgrind to run. Only do it
 // in full mode (`ROCKSDB_FULL_VALGRIND_RUN` compiler flag is set).
 #if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+constexpr std::array MySQLStyleTransactionTest_Params = {
+    std::make_tuple(false, false, WRITE_COMMITTED, kOrderedWrite, false),
+    std::make_tuple(false, true, WRITE_COMMITTED, kOrderedWrite, false),
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, false),
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, true),
+    std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, false),
+    std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, true),
+    std::make_tuple(false, false, WRITE_UNPREPARED, kOrderedWrite, false),
+    std::make_tuple(false, false, WRITE_UNPREPARED, kOrderedWrite, true),
+    std::make_tuple(false, true, WRITE_UNPREPARED, kOrderedWrite, false),
+    std::make_tuple(false, true, WRITE_UNPREPARED, kOrderedWrite, true),
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, false),
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, true)};
+
 INSTANTIATE_TEST_CASE_P(
     MySQLStyleTransactionTest, MySQLStyleTransactionTest,
-    ::testing::Values(
-        std::make_tuple(false, false, WRITE_COMMITTED, kOrderedWrite, false),
-        std::make_tuple(false, true, WRITE_COMMITTED, kOrderedWrite, false),
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, false),
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, true),
-        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, false),
-        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, true),
-        std::make_tuple(false, false, WRITE_UNPREPARED, kOrderedWrite, false),
-        std::make_tuple(false, false, WRITE_UNPREPARED, kOrderedWrite, true),
-        std::make_tuple(false, true, WRITE_UNPREPARED, kOrderedWrite, false),
-        std::make_tuple(false, true, WRITE_UNPREPARED, kOrderedWrite, true),
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, false),
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, true)));
+    ::testing::ValuesIn(WRAP_PARAM_WITH_PER_KEY_POINT_LOCK_MANAGER_PARAMS(
+        WRAP_PARAM(bool, bool, TxnDBWritePolicy, WriteOrdering, bool),
+        MySQLStyleTransactionTest_Params)));
+
 #endif  // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
 
 TEST_P(TransactionTest, TestUpperBoundUponDeletion) {
@@ -5777,8 +5797,8 @@ Status TransactionStressTestInserter(
   TransactionOptions txn_options;
   txn_options.use_only_the_last_commit_time_batch_for_recovery = true;
 
-  // Inside the inserter we might also retake the snapshot. We do both since two
-  // separte functions are engaged for each.
+  // Inside the inserter we might also retake the snapshot. We do both since
+  // two separte functions are engaged for each.
   txn_options.set_snapshot = rand->OneIn(2);
 
   RandomTransactionInserter inserter(
@@ -8862,7 +8882,7 @@ TEST_P(TransactionTest, SecondaryIndexOnKey) {
   }
 }
 
-TEST_F(TransactionDBTest, CollapseKey) {
+TEST_P(TransactionDBTest, CollapseKey) {
   ASSERT_OK(ReOpen());
   ASSERT_OK(db->Put({}, "hello", "world"));
   ASSERT_OK(db->Flush({}));
@@ -8911,7 +8931,7 @@ TEST_F(TransactionDBTest, CollapseKey) {
   }
 }
 
-TEST_F(TransactionDBTest, FlushedLogWithPendingPrepareIsSynced) {
+TEST_P(TransactionDBTest, FlushedLogWithPendingPrepareIsSynced) {
   // Repro for a bug where we missed a necessary sync of the old WAL during
   // memtable flush. It happened due to applying an optimization to skip syncing
   // the old WAL in too many scenarios (all memtable flushes on single CF
@@ -8956,8 +8976,9 @@ TEST_F(TransactionDBTest, FlushedLogWithPendingPrepareIsSynced) {
   }
 }
 
-class CommitBypassMemtableTest : public DBTestBase,
-                                 public ::testing::WithParamInterface<bool> {
+class CommitBypassMemtableTest
+    : public DBTestBase,
+      public ::testing::WithParamInterface<std::tuple<bool, bool>> {
  public:
   CommitBypassMemtableTest() : DBTestBase("commit_bypass_memtable_test", true) {
     SetUpTransactionDB();
@@ -8968,12 +8989,11 @@ class CommitBypassMemtableTest : public DBTestBase,
   Options options;
   TransactionDBOptions txn_db_opts;
 
-  void SetUpTransactionDB(
-      bool atomic_flush = false) {
+  void SetUpTransactionDB(bool atomic_flush = false) {
     options = CurrentOptions();
     options.create_if_missing = true;
     options.allow_2pc = true;
-    options.two_write_queues = GetParam();
+    options.two_write_queues = std::get<0>(GetParam());
     // Avoid write stall
     options.max_write_buffer_number = 8;
     options.atomic_flush = atomic_flush;
@@ -8982,13 +9002,16 @@ class CommitBypassMemtableTest : public DBTestBase,
     Destroy(options, true);
 
     txn_db_opts.write_policy = TxnDBWritePolicy::WRITE_COMMITTED;
+    txn_db_opts.use_per_key_point_lock_mgr = std::get<1>(GetParam());
     ASSERT_OK(TransactionDB::Open(options, txn_db_opts, dbname_, &txn_db));
     ASSERT_NE(txn_db, nullptr);
     db_ = txn_db;
   }
 };
 
-INSTANTIATE_TEST_CASE_P(, CommitBypassMemtableTest, testing::Bool());
+INSTANTIATE_TEST_CASE_P(, CommitBypassMemtableTest,
+                        ::testing::Combine(::testing::Bool(),
+                                           ::testing::Bool()));
 
 // TODO: parameterize other tests in the file with commit_bypass_memtable
 TEST_P(CommitBypassMemtableTest, SingleCFUpdate) {
@@ -9776,7 +9799,7 @@ TEST_P(CommitBypassMemtableTest, MergeMiniStress) {
   }
 }
 
-TEST_F(TransactionDBTest, SelfDeadlockBug) {
+TEST_P(TransactionDBTest, SelfDeadlockBug) {
   ASSERT_OK(ReOpen());
 
   // Create two transactions
@@ -9820,6 +9843,11 @@ TEST_F(TransactionDBTest, SelfDeadlockBug) {
   delete txn2;
 }
 
+INSTANTIATE_TEST_CASE_P(
+    TransactionDBBasicTest, TransactionDBTest,
+    ::testing::Combine(/*user_per_key_point_lock_manager=*/::testing::Bool(),
+                       /*deadlock_timeout_us=*/::testing::Values(0, 1000)));
+
 TEST_P(CommitBypassMemtableTest,
        OptimizeLargeTxnCommitWriteBatchSizeThreshold) {
   // Tests TransactionOptions::large_txn_commit_optimize_byte_threshold
diff --git a/utilities/transactions/transaction_test.h b/utilities/transactions/transaction_test.h
index 72f7e7036bf4..dc5a1b414f1f 100644
--- a/utilities/transactions/transaction_test.h
+++ b/utilities/transactions/transaction_test.h
@@ -49,14 +49,18 @@ class TransactionTestBase : public ::testing::Test {
 
   TransactionDBOptions txn_db_options;
   bool use_stackable_db_;
+  int64_t deadlock_timeout_us_;
 
   TransactionTestBase(bool use_stackable_db, bool two_write_queue,
                       TxnDBWritePolicy write_policy,
-                      WriteOrdering write_ordering)
+                      WriteOrdering write_ordering,
+                      bool use_per_key_point_lock_mgr,
+                      int64_t deadlock_timeout_us)
       : db(nullptr),
         special_env(Env::Default()),
         env(nullptr),
-        use_stackable_db_(use_stackable_db) {
+        use_stackable_db_(use_stackable_db),
+        deadlock_timeout_us_(deadlock_timeout_us) {
     options.create_if_missing = true;
     options.max_write_buffer_number = 2;
     options.write_buffer_size = 4 * 1024;
@@ -77,6 +81,7 @@ class TransactionTestBase : public ::testing::Test {
     txn_db_options.default_lock_timeout = 0;
     txn_db_options.write_policy = write_policy;
     txn_db_options.rollback_merge_operands = true;
+    txn_db_options.use_per_key_point_lock_mgr = use_per_key_point_lock_mgr;
     // This will stress write unprepared, by forcing write batch flush on every
     // write.
     txn_db_options.default_write_batch_flush_threshold = 1;
@@ -481,30 +486,35 @@ class TransactionTestBase : public ::testing::Test {
 
 class TransactionTest
     : public TransactionTestBase,
-      virtual public ::testing::WithParamInterface<
-          std::tuple<bool, bool, TxnDBWritePolicy, WriteOrdering>> {
+      virtual public ::testing::WithParamInterface<std::tuple<
+          bool, bool, TxnDBWritePolicy, WriteOrdering, bool, int64_t>> {
  public:
   TransactionTest()
       : TransactionTestBase(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                            std::get<2>(GetParam()), std::get<3>(GetParam())){};
+                            std::get<2>(GetParam()), std::get<3>(GetParam()),
+                            std::get<4>(GetParam()), std::get<5>(GetParam())) {}
 };
 
-class TransactionDBTest : public TransactionTestBase {
+class TransactionDBTest
+    : public TransactionTestBase,
+      virtual public ::testing::WithParamInterface<std::tuple<bool, int64_t>> {
  public:
   TransactionDBTest()
-      : TransactionTestBase(false, false, WRITE_COMMITTED, kOrderedWrite) {}
+      : TransactionTestBase(false, false, WRITE_COMMITTED, kOrderedWrite,
+                            std::get<0>(GetParam()), std::get<1>(GetParam())) {}
 };
 
 class TransactionStressTest : public TransactionTest {};
 
 class MySQLStyleTransactionTest
     : public TransactionTestBase,
-      virtual public ::testing::WithParamInterface<
-          std::tuple<bool, bool, TxnDBWritePolicy, WriteOrdering, bool>> {
+      virtual public ::testing::WithParamInterface<std::tuple<
+          bool, bool, TxnDBWritePolicy, WriteOrdering, bool, bool, int64_t>> {
  public:
   MySQLStyleTransactionTest()
       : TransactionTestBase(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                            std::get<2>(GetParam()), std::get<3>(GetParam())),
+                            std::get<2>(GetParam()), std::get<3>(GetParam()),
+                            std::get<5>(GetParam()), std::get<6>(GetParam())),
         with_slow_threads_(std::get<4>(GetParam())) {
     if (with_slow_threads_ &&
         (txn_db_options.write_policy == WRITE_PREPARED ||
@@ -527,11 +537,13 @@ class MySQLStyleTransactionTest
 
 class WriteCommittedTxnWithTsTest
     : public TransactionTestBase,
-      public ::testing::WithParamInterface<std::tuple<bool, bool, bool>> {
+      public ::testing::WithParamInterface<
+          std::tuple<bool, bool, bool, bool, int64_t>> {
  public:
   WriteCommittedTxnWithTsTest()
       : TransactionTestBase(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                            WRITE_COMMITTED, kOrderedWrite) {}
+                            WRITE_COMMITTED, kOrderedWrite,
+                            std::get<3>(GetParam()), std::get<4>(GetParam())) {}
   ~WriteCommittedTxnWithTsTest() override {
     for (auto* h : handles_) {
       delete h;
@@ -567,12 +579,13 @@ class WriteCommittedTxnWithTsTest
 
 class TimestampedSnapshotWithTsSanityCheck
     : public TransactionTestBase,
-      public ::testing::WithParamInterface<
-          std::tuple<bool, bool, TxnDBWritePolicy, WriteOrdering>> {
+      public ::testing::WithParamInterface<std::tuple<
+          bool, bool, TxnDBWritePolicy, WriteOrdering, bool, int64_t>> {
  public:
   explicit TimestampedSnapshotWithTsSanityCheck()
       : TransactionTestBase(std::get<0>(GetParam()), std::get<1>(GetParam()),
-                            std::get<2>(GetParam()), std::get<3>(GetParam())) {}
+                            std::get<2>(GetParam()), std::get<3>(GetParam()),
+                            std::get<4>(GetParam()), std::get<5>(GetParam())) {}
   ~TimestampedSnapshotWithTsSanityCheck() override {
     for (auto* h : handles_) {
       delete h;
@@ -583,4 +596,58 @@ class TimestampedSnapshotWithTsSanityCheck
   std::vector<ColumnFamilyHandle*> handles_{};
 };
 
+// Wrap existing params with per-key point lock manager parameters
+template <typename TargetParamType, typename SourceParamType, std::size_t... Is>
+std::vector<TargetParamType> WrapParamWithPerKeyPointLockManagerParamsImpl(
+    SourceParamType&& source_param, std::index_sequence<Is...>) {
+  std::vector<TargetParamType> wrapped_params;
+  // Use original PointLockManager
+  wrapped_params.push_back(TargetParamType(
+      std::get<Is>(std::forward<SourceParamType>(source_param))..., false,
+      INT64_C(0)));
+  // Use PerKeyPointLockManager with deadlock timeout 0
+  wrapped_params.push_back(TargetParamType(
+      std::get<Is>(std::forward<SourceParamType>(source_param))..., true,
+      INT64_C(0)));
+  // Use PerKeyPointLockManager with deadlock timeout 1000
+  wrapped_params.push_back(TargetParamType(
+      std::get<Is>(std::forward<SourceParamType>(source_param))..., true,
+      INT64_C(1000)));
+
+  return wrapped_params;
+}
+
+template <typename TargetParamType, typename SourceParamType>
+std::vector<TargetParamType> WrapParamWithPerKeyPointLockManagerParams(
+    SourceParamType&& source_param) {
+  // Get the size of the source param
+  constexpr std::size_t N = std::tuple_size_v<std::decay_t<SourceParamType>>;
+  // Create an index sequence from 0 to N-1
+  return WrapParamWithPerKeyPointLockManagerParamsImpl<TargetParamType>(
+      std::forward<SourceParamType>(source_param),
+      std::make_index_sequence<N>{});
+}
+
+template <typename TargetParamType, typename SourceParamType, size_t M>
+std::vector<TargetParamType> WrapParamsWithPerKeyPointLockManagerParams(
+    std::array<SourceParamType, M> source_param) {
+  std::vector<TargetParamType> wrapped_params;
+  for (auto& param : source_param) {
+    // Create an index sequence from 0 to N-1
+    auto new_params =
+        WrapParamWithPerKeyPointLockManagerParams<TargetParamType>(
+            std::forward<SourceParamType>(param));
+    wrapped_params.insert(wrapped_params.end(), new_params.begin(),
+                          new_params.end());
+  }
+  return wrapped_params;
+}
+
+#define WRAP_PARAM(...) __VA_ARGS__
+
+#define WRAP_PARAM_WITH_PER_KEY_POINT_LOCK_MANAGER_PARAMS(SOURCE_PARAM_TYPES, \
+                                                          PARAMS)             \
+  WrapParamsWithPerKeyPointLockManagerParams<                                 \
+      std::tuple<SOURCE_PARAM_TYPES, bool, int64_t>>(PARAMS)
+
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/utilities/transactions/write_committed_transaction_ts_test.cc b/utilities/transactions/write_committed_transaction_ts_test.cc
index d73371f80f40..36c958c33d08 100644
--- a/utilities/transactions/write_committed_transaction_ts_test.cc
+++ b/utilities/transactions/write_committed_transaction_ts_test.cc
@@ -14,26 +14,12 @@
 namespace ROCKSDB_NAMESPACE {
 
 INSTANTIATE_TEST_CASE_P(
-    DBAsBaseDB, WriteCommittedTxnWithTsTest,
-    ::testing::Values(std::make_tuple(false, /*two_write_queue=*/false,
-                                      /*enable_indexing=*/false),
-                      std::make_tuple(false, /*two_write_queue=*/true,
-                                      /*enable_indexing=*/false),
-                      std::make_tuple(false, /*two_write_queue=*/false,
-                                      /*enable_indexing=*/true),
-                      std::make_tuple(false, /*two_write_queue=*/true,
-                                      /*enable_indexing=*/true)));
-
-INSTANTIATE_TEST_CASE_P(
-    DBAsStackableDB, WriteCommittedTxnWithTsTest,
-    ::testing::Values(std::make_tuple(true, /*two_write_queue=*/false,
-                                      /*enable_indexing=*/false),
-                      std::make_tuple(true, /*two_write_queue=*/true,
-                                      /*enable_indexing=*/false),
-                      std::make_tuple(true, /*two_write_queue=*/false,
-                                      /*enable_indexing=*/true),
-                      std::make_tuple(true, /*two_write_queue=*/true,
-                                      /*enable_indexing=*/true)));
+    DBAsBaseDBAndStackableDB, WriteCommittedTxnWithTsTest,
+    ::testing::Combine(/*use_stackable_db=*/::testing::Bool(),
+                       /*two_write_queue=*/::testing::Bool(),
+                       /*enable_indexing=*/::testing::Bool(),
+                       /*use_per_key_point_lock_mgr=*/::testing::Bool(),
+                       /*deadlock_timeout_us=*/::testing::Values(0, 1000)));
 
 TEST_P(WriteCommittedTxnWithTsTest, SanityChecks) {
   ASSERT_OK(ReOpenNoDelete());
diff --git a/utilities/transactions/write_prepared_transaction_test.cc b/utilities/transactions/write_prepared_transaction_test.cc
index 9781694e61d5..2b0056adc4d9 100644
--- a/utilities/transactions/write_prepared_transaction_test.cc
+++ b/utilities/transactions/write_prepared_transaction_test.cc
@@ -354,9 +354,12 @@ class WritePreparedTransactionTestBase : public TransactionTestBase {
  public:
   WritePreparedTransactionTestBase(bool use_stackable_db, bool two_write_queue,
                                    TxnDBWritePolicy write_policy,
-                                   WriteOrdering write_ordering)
+                                   WriteOrdering write_ordering,
+                                   bool user_per_key_point_lock_mgr,
+                                   int64_t deadlock_timeout_us)
       : TransactionTestBase(use_stackable_db, two_write_queue, write_policy,
-                            write_ordering){};
+                            write_ordering, user_per_key_point_lock_mgr,
+                            deadlock_timeout_us) {}
 
  protected:
   void UpdateTransactionDBOptions(size_t snapshot_cache_bits,
@@ -528,27 +531,30 @@ class WritePreparedTransactionTestBase : public TransactionTestBase {
 
 class WritePreparedTransactionTest
     : public WritePreparedTransactionTestBase,
-      virtual public ::testing::WithParamInterface<
-          std::tuple<bool, bool, TxnDBWritePolicy, WriteOrdering>> {
+      virtual public ::testing::WithParamInterface<std::tuple<
+          bool, bool, TxnDBWritePolicy, WriteOrdering, bool, int64_t>> {
  public:
   WritePreparedTransactionTest()
       : WritePreparedTransactionTestBase(
             std::get<0>(GetParam()), std::get<1>(GetParam()),
-            std::get<2>(GetParam()), std::get<3>(GetParam())){};
+            std::get<2>(GetParam()), std::get<3>(GetParam()),
+            std::get<4>(GetParam()), std::get<5>(GetParam())) {}
 };
 
 #if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
 class SnapshotConcurrentAccessTest
     : public WritePreparedTransactionTestBase,
-      virtual public ::testing::WithParamInterface<std::tuple<
-          bool, bool, TxnDBWritePolicy, WriteOrdering, size_t, size_t>> {
+      virtual public ::testing::WithParamInterface<
+          std::tuple<bool, bool, TxnDBWritePolicy, WriteOrdering, size_t,
+                     size_t, bool, int64_t>> {
  public:
   SnapshotConcurrentAccessTest()
       : WritePreparedTransactionTestBase(
             std::get<0>(GetParam()), std::get<1>(GetParam()),
-            std::get<2>(GetParam()), std::get<3>(GetParam())),
+            std::get<2>(GetParam()), std::get<3>(GetParam()),
+            std::get<6>(GetParam()), std::get<7>(GetParam())),
         split_id_(std::get<4>(GetParam())),
-        split_cnt_(std::get<5>(GetParam())){};
+        split_cnt_(std::get<5>(GetParam())) {}
 
  protected:
   // A test is split into split_cnt_ tests, each identified with split_id_ where
@@ -560,13 +566,15 @@ class SnapshotConcurrentAccessTest
 
 class SeqAdvanceConcurrentTest
     : public WritePreparedTransactionTestBase,
-      virtual public ::testing::WithParamInterface<std::tuple<
-          bool, bool, TxnDBWritePolicy, WriteOrdering, size_t, size_t>> {
+      virtual public ::testing::WithParamInterface<
+          std::tuple<bool, bool, TxnDBWritePolicy, WriteOrdering, size_t,
+                     size_t, bool, int64_t>> {
  public:
   SeqAdvanceConcurrentTest()
       : WritePreparedTransactionTestBase(
             std::get<0>(GetParam()), std::get<1>(GetParam()),
-            std::get<2>(GetParam()), std::get<3>(GetParam())),
+            std::get<2>(GetParam()), std::get<3>(GetParam()),
+            std::get<6>(GetParam()), std::get<7>(GetParam())),
         split_id_(std::get<4>(GetParam())),
         split_cnt_(std::get<5>(GetParam())) {
     special_env.skip_fsync_ = true;
@@ -579,120 +587,143 @@ class SeqAdvanceConcurrentTest
   size_t split_cnt_;
 };
 
+constexpr std::array WritePreparedTransactionTest_Params = {
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite),
+    std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite),
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite)};
+
 INSTANTIATE_TEST_CASE_P(
     WritePreparedTransaction, WritePreparedTransactionTest,
-    ::testing::Values(
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite),
-        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite),
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite)));
+    ::testing::ValuesIn(WRAP_PARAM_WITH_PER_KEY_POINT_LOCK_MANAGER_PARAMS(
+        WRAP_PARAM(bool, bool, TxnDBWritePolicy, WriteOrdering),
+        WritePreparedTransactionTest_Params)));
 
 #if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+constexpr std::array TwoWriteQueue_SnapshotConcurrentAccessTest_Params = {
+    std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 0, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 1, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 2, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 3, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 4, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 5, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 6, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 7, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 8, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 9, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 10, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 11, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 12, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 13, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 14, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 15, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 16, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 17, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 18, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 19, 20),
+
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 0, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 1, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 2, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 3, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 4, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 5, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 6, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 7, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 8, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 9, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 10, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 11, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 12, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 13, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 14, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 15, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 16, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 17, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 18, 20),
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 19, 20)};
+
 INSTANTIATE_TEST_CASE_P(
-    TwoWriteQueues, SnapshotConcurrentAccessTest,
-    ::testing::Values(
-        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 0, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 1, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 2, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 3, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 4, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 5, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 6, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 7, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 8, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 9, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 10, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 11, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 12, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 13, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 14, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 15, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 16, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 17, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 18, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 19, 20),
-
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 0, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 1, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 2, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 3, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 4, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 5, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 6, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 7, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 8, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 9, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 10, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 11, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 12, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 13, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 14, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 15, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 16, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 17, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 18, 20),
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 19, 20)));
+    TwoWriteQueuesPointLockManager, SnapshotConcurrentAccessTest,
+    ::testing::ValuesIn(WRAP_PARAM_WITH_PER_KEY_POINT_LOCK_MANAGER_PARAMS(
+        WRAP_PARAM(bool, bool, TxnDBWritePolicy, WriteOrdering, size_t, size_t),
+        TwoWriteQueue_SnapshotConcurrentAccessTest_Params)));
+
+constexpr std::array OneWriteQueue_SnapshotConcurrentAccessTest_Params = {
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 0, 20),
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 1, 20),
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 2, 20),
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 3, 20),
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 4, 20),
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 5, 20),
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 6, 20),
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 7, 20),
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 8, 20),
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 9, 20),
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 10, 20),
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 11, 20),
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 12, 20),
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 13, 20),
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 14, 20),
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 15, 20),
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 16, 20),
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 17, 20),
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 18, 20),
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 19, 20),
+};
 
 INSTANTIATE_TEST_CASE_P(
     OneWriteQueue, SnapshotConcurrentAccessTest,
-    ::testing::Values(
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 0, 20),
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 1, 20),
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 2, 20),
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 3, 20),
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 4, 20),
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 5, 20),
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 6, 20),
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 7, 20),
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 8, 20),
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 9, 20),
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 10, 20),
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 11, 20),
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 12, 20),
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 13, 20),
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 14, 20),
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 15, 20),
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 16, 20),
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 17, 20),
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 18, 20),
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 19, 20)));
+    ::testing::ValuesIn(WRAP_PARAM_WITH_PER_KEY_POINT_LOCK_MANAGER_PARAMS(
+        WRAP_PARAM(bool, bool, TxnDBWritePolicy, WriteOrdering, size_t, size_t),
+        OneWriteQueue_SnapshotConcurrentAccessTest_Params)));
+
+constexpr std::array TwoWriteQueues_SeqAdvanceConcurrentTest_Params = {
+    std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 0, 10),
+    std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 1, 10),
+    std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 2, 10),
+    std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 3, 10),
+    std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 4, 10),
+    std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 5, 10),
+    std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 6, 10),
+    std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 7, 10),
+    std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 8, 10),
+    std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 9, 10),
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 0, 10),
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 1, 10),
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 2, 10),
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 3, 10),
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 4, 10),
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 5, 10),
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 6, 10),
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 7, 10),
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 8, 10),
+    std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 9, 10)};
 
 INSTANTIATE_TEST_CASE_P(
     TwoWriteQueues, SeqAdvanceConcurrentTest,
-    ::testing::Values(
-        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 0, 10),
-        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 1, 10),
-        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 2, 10),
-        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 3, 10),
-        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 4, 10),
-        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 5, 10),
-        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 6, 10),
-        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 7, 10),
-        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 8, 10),
-        std::make_tuple(false, true, WRITE_PREPARED, kOrderedWrite, 9, 10),
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 0, 10),
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 1, 10),
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 2, 10),
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 3, 10),
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 4, 10),
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 5, 10),
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 6, 10),
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 7, 10),
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 8, 10),
-        std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, 9, 10)));
+    ::testing::ValuesIn(WRAP_PARAM_WITH_PER_KEY_POINT_LOCK_MANAGER_PARAMS(
+        WRAP_PARAM(bool, bool, TxnDBWritePolicy, WriteOrdering, size_t, size_t),
+        TwoWriteQueues_SeqAdvanceConcurrentTest_Params)));
+
+constexpr std::array OneWriteQueue_SeqAdvanceConcurrentTest_Params = {
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 0, 10),
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 1, 10),
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 2, 10),
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 3, 10),
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 4, 10),
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 5, 10),
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 6, 10),
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 7, 10),
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 8, 10),
+    std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 9, 10)};
 
 INSTANTIATE_TEST_CASE_P(
     OneWriteQueue, SeqAdvanceConcurrentTest,
-    ::testing::Values(
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 0, 10),
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 1, 10),
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 2, 10),
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 3, 10),
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 4, 10),
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 5, 10),
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 6, 10),
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 7, 10),
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 8, 10),
-        std::make_tuple(false, false, WRITE_PREPARED, kOrderedWrite, 9, 10)));
+    ::testing::ValuesIn(WRAP_PARAM_WITH_PER_KEY_POINT_LOCK_MANAGER_PARAMS(
+        WRAP_PARAM(bool, bool, TxnDBWritePolicy, WriteOrdering, size_t, size_t),
+        OneWriteQueue_SeqAdvanceConcurrentTest_Params)));
+
 #endif  // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
 
 TEST_P(WritePreparedTransactionTest, CommitMap) {
diff --git a/utilities/transactions/write_unprepared_transaction_test.cc b/utilities/transactions/write_unprepared_transaction_test.cc
index e655eb42a04e..587f12ea2d0b 100644
--- a/utilities/transactions/write_unprepared_transaction_test.cc
+++ b/utilities/transactions/write_unprepared_transaction_test.cc
@@ -13,37 +13,43 @@ class WriteUnpreparedTransactionTestBase : public TransactionTestBase {
  public:
   WriteUnpreparedTransactionTestBase(bool use_stackable_db,
                                      bool two_write_queue,
-                                     TxnDBWritePolicy write_policy)
+                                     TxnDBWritePolicy write_policy,
+                                     bool use_per_key_point_lock_mgr,
+                                     int64_t deadlock_timeout_us)
       : TransactionTestBase(use_stackable_db, two_write_queue, write_policy,
-                            kOrderedWrite) {}
+                            kOrderedWrite, use_per_key_point_lock_mgr,
+                            deadlock_timeout_us) {}
 };
 
 class WriteUnpreparedTransactionTest
     : public WriteUnpreparedTransactionTestBase,
       virtual public ::testing::WithParamInterface<
-          std::tuple<bool, bool, TxnDBWritePolicy>> {
+          std::tuple<bool, bool, TxnDBWritePolicy, bool, int64_t>> {
  public:
   WriteUnpreparedTransactionTest()
-      : WriteUnpreparedTransactionTestBase(std::get<0>(GetParam()),
-                                           std::get<1>(GetParam()),
-                                           std::get<2>(GetParam())) {}
+      : WriteUnpreparedTransactionTestBase(
+            std::get<0>(GetParam()), std::get<1>(GetParam()),
+            std::get<2>(GetParam()), std::get<3>(GetParam()),
+            std::get<4>(GetParam())) {}
 };
 
 INSTANTIATE_TEST_CASE_P(
     WriteUnpreparedTransactionTest, WriteUnpreparedTransactionTest,
-    ::testing::Values(std::make_tuple(false, false, WRITE_UNPREPARED),
-                      std::make_tuple(false, true, WRITE_UNPREPARED)));
+    ::testing::Combine(::testing::Values(false), ::testing::Bool(),
+                       ::testing::Values(WRITE_UNPREPARED), ::testing::Bool(),
+                       ::testing::Values(0, 1000)));
 
 enum SnapshotAction { NO_SNAPSHOT, RO_SNAPSHOT, REFRESH_SNAPSHOT };
 enum VerificationOperation { VERIFY_GET, VERIFY_NEXT, VERIFY_PREV };
 class WriteUnpreparedSnapshotTest
     : public WriteUnpreparedTransactionTestBase,
-      virtual public ::testing::WithParamInterface<
-          std::tuple<bool, SnapshotAction, VerificationOperation>> {
+      virtual public ::testing::WithParamInterface<std::tuple<
+          bool, SnapshotAction, VerificationOperation, bool, int64_t>> {
  public:
   WriteUnpreparedSnapshotTest()
-      : WriteUnpreparedTransactionTestBase(false, std::get<0>(GetParam()),
-                                           WRITE_UNPREPARED),
+      : WriteUnpreparedTransactionTestBase(
+            false, std::get<0>(GetParam()), WRITE_UNPREPARED,
+            std::get<3>(GetParam()), std::get<4>(GetParam())),
         action_(std::get<1>(GetParam())),
         verify_op_(std::get<2>(GetParam())) {}
   SnapshotAction action_;
@@ -56,10 +62,11 @@ class WriteUnpreparedSnapshotTest
 // verification operation
 INSTANTIATE_TEST_CASE_P(
     WriteUnpreparedSnapshotTest, WriteUnpreparedSnapshotTest,
-    ::testing::Combine(
-        ::testing::Bool(),
-        ::testing::Values(NO_SNAPSHOT, RO_SNAPSHOT, REFRESH_SNAPSHOT),
-        ::testing::Values(VERIFY_GET, VERIFY_NEXT, VERIFY_PREV)));
+    ::testing::Combine(::testing::Bool(),
+                       ::testing::Values(NO_SNAPSHOT, RO_SNAPSHOT,
+                                         REFRESH_SNAPSHOT),
+                       ::testing::Values(VERIFY_GET, VERIFY_NEXT, VERIFY_PREV),
+                       ::testing::Bool(), ::testing::Values(0, 1000)));
 
 TEST_P(WriteUnpreparedTransactionTest, ReadYourOwnWrite) {
   // The following tests checks whether reading your own write for

From 85f1ba572e6ee589f53a8199757bd066275519b8 Mon Sep 17 00:00:00 2001
From: Andrew Chang <andrewrchang@meta.com>
Date: Tue, 9 Sep 2025 14:47:29 -0700
Subject: [PATCH 268/500] Add support for custom IOActivity types (#13924)

Summary:
There are some internal use cases that do not map cleanly onto the existing `IOActivity` enums. This PR creates new custom IOActivity types that internal users can use as they see fit.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13924

Test Plan: Wrote a simple unit test

Reviewed By: pdillinger

Differential Revision: D82029992

Pulled By: archang19

fbshipit-source-id: a3e23c360baa96cd2e9adf570e71c6e43947bfc8
---
 env/env.cc            |  12 ++++
 env/env_test.cc       |  18 ++++++
 include/rocksdb/env.h | 136 +++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 165 insertions(+), 1 deletion(-)

diff --git a/env/env.cc b/env/env.cc
index 7d97c42b0fa0..4047f2797038 100644
--- a/env/env.cc
+++ b/env/env.cc
@@ -9,6 +9,7 @@
 
 #include "rocksdb/env.h"
 
+#include <sstream>
 #include <thread>
 
 #include "env/composite_env_wrapper.h"
@@ -769,6 +770,17 @@ std::string Env::IOActivityToString(IOActivity activity) {
       return "GetFileChecksumsFromCurrentManifest";
     case Env::IOActivity::kUnknown:
       return "Unknown";
+    default:
+      int activityIndex = static_cast<int>(activity);
+      if (activityIndex >=
+              static_cast<int>(Env::IOActivity::kFirstCustomIOActivity) &&
+          activityIndex <=
+              static_cast<int>(Env::IOActivity::kLastCustomIOActivity)) {
+        std::stringstream ss;
+        ss << std::hex << std::uppercase << activityIndex;
+        return "CustomIOActivity" + ss.str();
+      }
+      return "Invalid";
   };
   assert(false);
   return "Invalid";
diff --git a/env/env_test.cc b/env/env_test.cc
index 421d13ec5ea5..30cfdde51055 100644
--- a/env/env_test.cc
+++ b/env/env_test.cc
@@ -3692,6 +3692,24 @@ TEST_F(TestGetFileSize, GetFileSize) {
   ASSERT_EQ(fileSizeFromFsRandomAccessFileAPI, expectedFileSize);
 }
 
+class TestIOActivity : public testing::Test {
+ public:
+  TestIOActivity() {}
+};
+
+TEST_F(TestIOActivity, IOActivityToString) {
+  ASSERT_EQ(Env::IOActivityToString(Env::IOActivity::kMultiGet), "MultiGet");
+
+  ASSERT_EQ(Env::IOActivityToString(Env::IOActivity::kCustomIOActivity80),
+            "CustomIOActivity80");
+  ASSERT_EQ(Env::IOActivityToString(Env::IOActivity::kCustomIOActivityA9),
+            "CustomIOActivityA9");
+  ASSERT_EQ(Env::IOActivityToString(Env::IOActivity::kCustomIOActivityFE),
+            "CustomIOActivityFE");
+
+  ASSERT_EQ(Env::IOActivityToString(Env::IOActivity::kUnknown), "Unknown");
+}
+
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h
index 58f182751d16..c0f667ff8c48 100644
--- a/include/rocksdb/env.h
+++ b/include/rocksdb/env.h
@@ -463,7 +463,141 @@ class Env : public Customizable {
     kGetEntity = 8,
     kMultiGetEntity = 9,
     kGetFileChecksumsFromCurrentManifest = 10,
-    kUnknown,  // Keep last for easy array of non-unknowns
+    // Enums after this, up to 0x7F, are reserved for future use for the public
+    // RocksDB API (i.e. they should be "non-custom" IO activities). Make sure
+    // to also update IOActivityToString when adding new values.
+
+    kCustomIOActivity80 = 0x80,
+    kFirstCustomIOActivity = kCustomIOActivity80,
+    kCustomIOActivity81 = 0x81,
+    kCustomIOActivity82 = 0x82,
+    kCustomIOActivity83 = 0x83,
+    kCustomIOActivity84 = 0x84,
+    kCustomIOActivity85 = 0x85,
+    kCustomIOActivity86 = 0x86,
+    kCustomIOActivity87 = 0x87,
+    kCustomIOActivity88 = 0x88,
+    kCustomIOActivity89 = 0x89,
+    kCustomIOActivity8A = 0x8A,
+    kCustomIOActivity8B = 0x8B,
+    kCustomIOActivity8C = 0x8C,
+    kCustomIOActivity8D = 0x8D,
+    kCustomIOActivity8E = 0x8E,
+    kCustomIOActivity8F = 0x8F,
+    kCustomIOActivity90 = 0x90,
+    kCustomIOActivity91 = 0x91,
+    kCustomIOActivity92 = 0x92,
+    kCustomIOActivity93 = 0x93,
+    kCustomIOActivity94 = 0x94,
+    kCustomIOActivity95 = 0x95,
+    kCustomIOActivity96 = 0x96,
+    kCustomIOActivity97 = 0x97,
+    kCustomIOActivity98 = 0x98,
+    kCustomIOActivity99 = 0x99,
+    kCustomIOActivity9A = 0x9A,
+    kCustomIOActivity9B = 0x9B,
+    kCustomIOActivity9C = 0x9C,
+    kCustomIOActivity9D = 0x9D,
+    kCustomIOActivity9E = 0x9E,
+    kCustomIOActivity9F = 0x9F,
+    kCustomIOActivityA0 = 0xA0,
+    kCustomIOActivityA1 = 0xA1,
+    kCustomIOActivityA2 = 0xA2,
+    kCustomIOActivityA3 = 0xA3,
+    kCustomIOActivityA4 = 0xA4,
+    kCustomIOActivityA5 = 0xA5,
+    kCustomIOActivityA6 = 0xA6,
+    kCustomIOActivityA7 = 0xA7,
+    kCustomIOActivityA8 = 0xA8,
+    kCustomIOActivityA9 = 0xA9,
+    kCustomIOActivityAA = 0xAA,
+    kCustomIOActivityAB = 0xAB,
+    kCustomIOActivityAC = 0xAC,
+    kCustomIOActivityAD = 0xAD,
+    kCustomIOActivityAE = 0xAE,
+    kCustomIOActivityAF = 0xAF,
+    kCustomIOActivityB0 = 0xB0,
+    kCustomIOActivityB1 = 0xB1,
+    kCustomIOActivityB2 = 0xB2,
+    kCustomIOActivityB3 = 0xB3,
+    kCustomIOActivityB4 = 0xB4,
+    kCustomIOActivityB5 = 0xB5,
+    kCustomIOActivityB6 = 0xB6,
+    kCustomIOActivityB7 = 0xB7,
+    kCustomIOActivityB8 = 0xB8,
+    kCustomIOActivityB9 = 0xB9,
+    kCustomIOActivityBA = 0xBA,
+    kCustomIOActivityBB = 0xBB,
+    kCustomIOActivityBC = 0xBC,
+    kCustomIOActivityBD = 0xBD,
+    kCustomIOActivityBE = 0xBE,
+    kCustomIOActivityBF = 0xBF,
+    kCustomIOActivityC0 = 0xC0,
+    kCustomIOActivityC1 = 0xC1,
+    kCustomIOActivityC2 = 0xC2,
+    kCustomIOActivityC3 = 0xC3,
+    kCustomIOActivityC4 = 0xC4,
+    kCustomIOActivityC5 = 0xC5,
+    kCustomIOActivityC6 = 0xC6,
+    kCustomIOActivityC7 = 0xC7,
+    kCustomIOActivityC8 = 0xC8,
+    kCustomIOActivityC9 = 0xC9,
+    kCustomIOActivityCA = 0xCA,
+    kCustomIOActivityCB = 0xCB,
+    kCustomIOActivityCC = 0xCC,
+    kCustomIOActivityCD = 0xCD,
+    kCustomIOActivityCE = 0xCE,
+    kCustomIOActivityCF = 0xCF,
+    kCustomIOActivityD0 = 0xD0,
+    kCustomIOActivityD1 = 0xD1,
+    kCustomIOActivityD2 = 0xD2,
+    kCustomIOActivityD3 = 0xD3,
+    kCustomIOActivityD4 = 0xD4,
+    kCustomIOActivityD5 = 0xD5,
+    kCustomIOActivityD6 = 0xD6,
+    kCustomIOActivityD7 = 0xD7,
+    kCustomIOActivityD8 = 0xD8,
+    kCustomIOActivityD9 = 0xD9,
+    kCustomIOActivityDA = 0xDA,
+    kCustomIOActivityDB = 0xDB,
+    kCustomIOActivityDC = 0xDC,
+    kCustomIOActivityDD = 0xDD,
+    kCustomIOActivityDE = 0xDE,
+    kCustomIOActivityDF = 0xDF,
+    kCustomIOActivityE0 = 0xE0,
+    kCustomIOActivityE1 = 0xE1,
+    kCustomIOActivityE2 = 0xE2,
+    kCustomIOActivityE3 = 0xE3,
+    kCustomIOActivityE4 = 0xE4,
+    kCustomIOActivityE5 = 0xE5,
+    kCustomIOActivityE6 = 0xE6,
+    kCustomIOActivityE7 = 0xE7,
+    kCustomIOActivityE8 = 0xE8,
+    kCustomIOActivityE9 = 0xE9,
+    kCustomIOActivityEA = 0xEA,
+    kCustomIOActivityEB = 0xEB,
+    kCustomIOActivityEC = 0xEC,
+    kCustomIOActivityED = 0xED,
+    kCustomIOActivityEE = 0xEE,
+    kCustomIOActivityEF = 0xEF,
+    kCustomIOActivityF0 = 0xF0,
+    kCustomIOActivityF1 = 0xF1,
+    kCustomIOActivityF2 = 0xF2,
+    kCustomIOActivityF3 = 0xF3,
+    kCustomIOActivityF4 = 0xF4,
+    kCustomIOActivityF5 = 0xF5,
+    kCustomIOActivityF6 = 0xF6,
+    kCustomIOActivityF7 = 0xF7,
+    kCustomIOActivityF8 = 0xF8,
+    kCustomIOActivityF9 = 0xF9,
+    kCustomIOActivityFA = 0xFA,
+    kCustomIOActivityFB = 0xFB,
+    kCustomIOActivityFC = 0xFC,
+    kCustomIOActivityFD = 0xFD,
+    kCustomIOActivityFE = 0xFE,
+    kLastCustomIOActivity = kCustomIOActivityFE,
+
+    kUnknown = 0xFF,  // Keep last as unknown
   };
 
   static std::string IOActivityToString(IOActivity activity);

From 0e59c3864f4acde9b5d9004ab73bb686d33df17c Mon Sep 17 00:00:00 2001
From: anand76 <anand1976@users.noreply.github.com>
Date: Tue, 9 Sep 2025 15:57:13 -0700
Subject: [PATCH 269/500] Add copyright to header file (#13930)

Summary:
Add copyright notice to any_lock_manager_test.h

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13930

Reviewed By: xingbowang

Differential Revision: D82035581

Pulled By: anand1976

fbshipit-source-id: 2275f7c8b41fbd4384bdae011d244bfa117225f7
---
 utilities/transactions/lock/point/any_lock_manager_test.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/utilities/transactions/lock/point/any_lock_manager_test.h b/utilities/transactions/lock/point/any_lock_manager_test.h
index 4562f215a9a6..9ea9114b9264 100644
--- a/utilities/transactions/lock/point/any_lock_manager_test.h
+++ b/utilities/transactions/lock/point/any_lock_manager_test.h
@@ -1,3 +1,8 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
 #pragma once
 
 #include "utilities/transactions/lock/point/point_lock_manager_test.h"

From 8b8a3de2c6f88e0703d465d29c902944231b036b Mon Sep 17 00:00:00 2001
From: Xingbo Wang <xbw@fb.com>
Date: Tue, 9 Sep 2025 21:45:50 -0700
Subject: [PATCH 270/500] Fix PointLockManager in C++20 (#13933)

Summary:
Fix broken build in PointLockManager change with C++20

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13933

Test Plan: CI

Reviewed By: pdillinger

Differential Revision: D82073490

Pulled By: xingbowang

fbshipit-source-id: 0bd4936fe0a27a28db61ca5f23d3bea90bce73ef
---
 .../lock/point/point_lock_validation_test_runner.h     |  7 +++++--
 utilities/transactions/transaction_test.h              | 10 ++++++++++
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/utilities/transactions/lock/point/point_lock_validation_test_runner.h b/utilities/transactions/lock/point/point_lock_validation_test_runner.h
index 92af254522e6..00ae526e9c0d 100644
--- a/utilities/transactions/lock/point/point_lock_validation_test_runner.h
+++ b/utilities/transactions/lock/point/point_lock_validation_test_runner.h
@@ -387,8 +387,11 @@ class PointLockValidationTestRunner {
         measured_locks_acquired =
             num_of_locks_acquired - measured_locks_acquired;
         // Skip the first second, as threads are warming up
-        printf("measured_num_of_locks_acquired: %" PRId64 "\n",
-               measured_locks_acquired / (execution_time_sec_ - 1));
+        auto measured_execution_time_sec = execution_time_sec_ - 1;
+        if (measured_execution_time_sec > 0) {
+          printf("measured_num_of_locks_acquired: %" PRId64 "\n",
+                 measured_locks_acquired / (measured_execution_time_sec));
+        }
       }
     }
 
diff --git a/utilities/transactions/transaction_test.h b/utilities/transactions/transaction_test.h
index dc5a1b414f1f..464c9e6883f1 100644
--- a/utilities/transactions/transaction_test.h
+++ b/utilities/transactions/transaction_test.h
@@ -596,6 +596,12 @@ class TimestampedSnapshotWithTsSanityCheck
   std::vector<ColumnFamilyHandle*> handles_{};
 };
 
+// The following templates causes a bug in GCC 14, ignore the error for now
+#if defined(__GNUC__) && __GNUC__ == 14
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wstringop-overflow"
+#endif
+
 // Wrap existing params with per-key point lock manager parameters
 template <typename TargetParamType, typename SourceParamType, std::size_t... Is>
 std::vector<TargetParamType> WrapParamWithPerKeyPointLockManagerParamsImpl(
@@ -643,6 +649,10 @@ std::vector<TargetParamType> WrapParamsWithPerKeyPointLockManagerParams(
   return wrapped_params;
 }
 
+#if defined(__GNUC__) && __GNUC__ == 14
+#pragma GCC diagnostic pop
+#endif
+
 #define WRAP_PARAM(...) __VA_ARGS__
 
 #define WRAP_PARAM_WITH_PER_KEY_POINT_LOCK_MANAGER_PARAMS(SOURCE_PARAM_TYPES, \

From 67af5bdc388cdc2e26215fcf7645e5c0f9097c52 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Wed, 10 Sep 2025 10:29:49 -0700
Subject: [PATCH 271/500] Add Temperature::kIce (#13927)

Summary:
... and associated statistics, etc. Someone needs it, so here it is.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13927

Test Plan: Updated / extended / added some unit tests

Reviewed By: cbi42

Differential Revision: D81981469

Pulled By: pdillinger

fbshipit-source-id: 52558c08741890b781310906acbc18d9eb479363
---
 db/compaction/compaction_picker_fifo.cc       |  14 +-
 db/compaction/tiered_compaction_test.cc       |  10 +-
 db/db_compaction_test.cc                      | 222 ++++++++++++++++++
 db/db_test2.cc                                |   3 +-
 db/version_edit.cc                            |   2 +-
 db_stress_tool/db_stress_test_base.cc         |   2 +
 file/random_access_file_reader.cc             |   6 +
 include/rocksdb/iostats_context.h             |   6 +
 include/rocksdb/statistics.h                  |   2 +
 include/rocksdb/types.h                       |   3 +
 java/rocksjni/portal.h                        |   8 +
 .../src/main/java/org/rocksdb/TickerType.java |   2 +
 monitoring/statistics.cc                      |   2 +
 options/options_helper.cc                     |   6 +-
 tools/db_crashtest.py                         |  14 +-
 15 files changed, 283 insertions(+), 19 deletions(-)

diff --git a/db/compaction/compaction_picker_fifo.cc b/db/compaction/compaction_picker_fifo.cc
index cc2a9bfd0aa1..98c03d01131c 100644
--- a/db/compaction/compaction_picker_fifo.cc
+++ b/db/compaction/compaction_picker_fifo.cc
@@ -387,12 +387,14 @@ Compaction* FIFOCompactionPicker::PickTemperatureChangeCompaction(
       assert(compaction_target_temp == Temperature::kLastTemperature);
       compaction_target_temp = cur_target_temp;
       inputs[0].files.push_back(cur_file);
-      ROCKS_LOG_BUFFER(
-          log_buffer,
-          "[%s] FIFO compaction: picking file %" PRIu64
-          " with estimated newest key time %" PRIu64 " for temperature %s.",
-          cf_name.c_str(), cur_file->fd.GetNumber(), est_newest_key_time,
-          temperature_to_string[cur_target_temp].c_str());
+      ROCKS_LOG_BUFFER(log_buffer,
+                       "[%s] FIFO compaction: picking file %" PRIu64
+                       " with estimated newest key time %" PRIu64
+                       " and temperature %s for temperature %s.",
+                       cf_name.c_str(), cur_file->fd.GetNumber(),
+                       est_newest_key_time,
+                       temperature_to_string[cur_file->temperature].c_str(),
+                       temperature_to_string[cur_target_temp].c_str());
       break;
     }
   }
diff --git a/db/compaction/tiered_compaction_test.cc b/db/compaction/tiered_compaction_test.cc
index 879dc0712aa0..0d623678c4b2 100644
--- a/db/compaction/tiered_compaction_test.cc
+++ b/db/compaction/tiered_compaction_test.cc
@@ -1764,7 +1764,9 @@ TEST_P(PrecludeLastLevelTest, SmallPrecludeTime) {
   options.env = mock_env_.get();
   options.level0_file_num_compaction_trigger = kNumTrigger;
   options.num_levels = kNumLevels;
-  options.last_level_temperature = Temperature::kCold;
+  // This existing test selected to also check the kIce case, which should not
+  // be interesting enough to exercise across all the test cases
+  options.last_level_temperature = Temperature::kIce;
   DestroyAndReopen(options);
 
   Random rnd(301);
@@ -1791,6 +1793,9 @@ TEST_P(PrecludeLastLevelTest, SmallPrecludeTime) {
   ASSERT_FALSE(tp_mapping.Empty());
   auto seqs = tp_mapping.TEST_GetInternalMapping();
   ASSERT_FALSE(seqs.empty());
+  ASSERT_GE(GetSstSizeHelper(Temperature::kUnknown), 1);
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kIce), 0);
 
   // Wait more than preclude_last_level time, then make sure all the data is
   // compacted to the last level even there's no write (no seqno -> time
@@ -1800,7 +1805,8 @@ TEST_P(PrecludeLastLevelTest, SmallPrecludeTime) {
   ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
   ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
   ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
-  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+  ASSERT_GE(GetSstSizeHelper(Temperature::kIce), 1);
 
   Close();
 }
diff --git a/db/db_compaction_test.cc b/db/db_compaction_test.cc
index 940cdeaa5af9..69621278c177 100644
--- a/db/db_compaction_test.cc
+++ b/db/db_compaction_test.cc
@@ -19,6 +19,7 @@
 #include "rocksdb/advanced_options.h"
 #include "rocksdb/concurrent_task_limiter.h"
 #include "rocksdb/experimental.h"
+#include "rocksdb/iostats_context.h"
 #include "rocksdb/sst_file_writer.h"
 #include "test_util/mock_time_env.h"
 #include "test_util/sync_point.h"
@@ -9727,6 +9728,7 @@ TEST_F(DBCompactionTest, FIFOChangeTemperature) {
       int total_cold = 0;
       int total_warm = 0;
       int total_hot = 0;
+      int total_ice = 0;
       int total_unknown = 0;
       ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
           "NewWritableFile::FileOptions.temperature", [&](void* arg) {
@@ -9737,6 +9739,8 @@ TEST_F(DBCompactionTest, FIFOChangeTemperature) {
               total_warm++;
             } else if (temperature == Temperature::kHot) {
               total_hot++;
+            } else if (temperature == Temperature::kIce) {
+              total_ice++;
             } else {
               assert(temperature == Temperature::kUnknown);
               total_unknown++;
@@ -9810,6 +9814,224 @@ TEST_F(DBCompactionTest, FIFOChangeTemperature) {
   }
 }
 
+using TemperatureSet = SmallEnumSet<Temperature, Temperature::kLastTemperature>;
+static void VerifyTemperatureFileReadStats(const Statistics& st,
+                                           TemperatureSet temps) {
+  SCOPED_TRACE("Temp set size = " + std::to_string(temps.count()));
+  constexpr uint64_t min_bytes = 100;
+  constexpr uint64_t min_count = 1;
+
+  IOStatsContext* iostats = get_iostats_context();
+  if (temps.Contains(Temperature::kHot)) {
+    EXPECT_GE(st.getTickerCount(HOT_FILE_READ_BYTES), min_bytes);
+    EXPECT_GE(st.getTickerCount(HOT_FILE_READ_COUNT), min_count);
+    EXPECT_GE(iostats->file_io_stats_by_temperature.hot_file_bytes_read,
+              min_bytes);
+    EXPECT_GE(iostats->file_io_stats_by_temperature.hot_file_read_count,
+              min_count);
+
+  } else {
+    EXPECT_EQ(st.getTickerCount(HOT_FILE_READ_BYTES), 0);
+    EXPECT_EQ(st.getTickerCount(HOT_FILE_READ_COUNT), 0);
+    EXPECT_EQ(iostats->file_io_stats_by_temperature.hot_file_bytes_read, 0);
+    EXPECT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0);
+  }
+
+  if (temps.Contains(Temperature::kWarm)) {
+    EXPECT_GE(st.getTickerCount(WARM_FILE_READ_BYTES), min_bytes);
+    EXPECT_GE(st.getTickerCount(WARM_FILE_READ_COUNT), min_count);
+    EXPECT_GE(iostats->file_io_stats_by_temperature.warm_file_bytes_read,
+              min_bytes);
+    EXPECT_GE(iostats->file_io_stats_by_temperature.warm_file_read_count,
+              min_count);
+  } else {
+    EXPECT_EQ(st.getTickerCount(WARM_FILE_READ_BYTES), 0);
+    EXPECT_EQ(st.getTickerCount(WARM_FILE_READ_COUNT), 0);
+    EXPECT_EQ(iostats->file_io_stats_by_temperature.warm_file_bytes_read, 0);
+    EXPECT_EQ(iostats->file_io_stats_by_temperature.warm_file_read_count, 0);
+  }
+
+  if (temps.Contains(Temperature::kCold)) {
+    EXPECT_GE(st.getTickerCount(COLD_FILE_READ_BYTES), min_bytes);
+    EXPECT_GE(st.getTickerCount(COLD_FILE_READ_COUNT), min_count);
+    EXPECT_GE(iostats->file_io_stats_by_temperature.cold_file_bytes_read,
+              min_bytes);
+    EXPECT_GE(iostats->file_io_stats_by_temperature.cold_file_read_count,
+              min_count);
+  } else {
+    EXPECT_EQ(st.getTickerCount(COLD_FILE_READ_BYTES), 0);
+    EXPECT_EQ(st.getTickerCount(COLD_FILE_READ_COUNT), 0);
+    EXPECT_EQ(iostats->file_io_stats_by_temperature.cold_file_bytes_read, 0);
+    EXPECT_EQ(iostats->file_io_stats_by_temperature.cold_file_read_count, 0);
+  }
+
+  if (temps.Contains(Temperature::kIce)) {
+    EXPECT_GE(st.getTickerCount(ICE_FILE_READ_BYTES), min_bytes);
+    EXPECT_GE(st.getTickerCount(ICE_FILE_READ_COUNT), min_count);
+    EXPECT_GE(iostats->file_io_stats_by_temperature.ice_file_bytes_read,
+              min_bytes);
+    EXPECT_GE(iostats->file_io_stats_by_temperature.ice_file_read_count,
+              min_count);
+  } else {
+    EXPECT_EQ(st.getTickerCount(ICE_FILE_READ_BYTES), 0);
+    EXPECT_EQ(st.getTickerCount(ICE_FILE_READ_COUNT), 0);
+    EXPECT_EQ(iostats->file_io_stats_by_temperature.ice_file_bytes_read, 0);
+    EXPECT_EQ(iostats->file_io_stats_by_temperature.ice_file_read_count, 0);
+  }
+}
+
+TEST_F(DBCompactionTest, FIFOMultiTierTemperatureAging) {
+  // Test multi-tier aging: Hot -> Warm -> Cold -> Ice
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleFIFO;
+  options.num_levels = 1;
+  options.max_open_files = -1;
+  options.level0_file_num_compaction_trigger = 2;
+  options.create_if_missing = true;
+  options.statistics = CreateDBStatistics();
+  BlockBasedTableOptions bbto;
+  bbto.no_block_cache = true;  // Simplify statistics
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+  CompactionOptionsFIFO fifo_options;
+  // Multi-tier aging: files age through multiple temperatures
+  fifo_options.file_temperature_age_thresholds = {
+      {Temperature::kWarm, 500},   // Hot -> Warm after 500s
+      {Temperature::kCold, 1000},  // Warm -> Cold after 1000s
+      {Temperature::kIce, 1500}    // Cold -> Ice after 1500s
+  };
+  fifo_options.max_table_files_size = 100000000;
+  fifo_options.allow_trivial_copy_when_change_temperature = true;
+  options.compaction_options_fifo = fifo_options;
+  options.default_write_temperature = Temperature::kHot;
+
+  Reopen(options);
+  env_->SetMockSleep();
+
+  // Track all temperature file creations
+  int total_hot = 0, total_warm = 0, total_cold = 0, total_ice = 0,
+      total_unknown = 0;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "NewWritableFile::FileOptions.temperature", [&](void* arg) {
+        Temperature temperature = *(static_cast<Temperature*>(arg));
+        switch (temperature) {
+          case Temperature::kHot:
+            total_hot++;
+            break;
+          case Temperature::kWarm:
+            total_warm++;
+            break;
+          case Temperature::kCold:
+            total_cold++;
+            break;
+          case Temperature::kIce:
+            total_ice++;
+            break;
+          case Temperature::kUnknown:
+            total_unknown++;
+            break;
+          default:
+            break;
+        }
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Create initial three files (will start as Hot), enough to ensure key
+  // range filtering will be applied in FilePicker::GetNextFile() with one
+  // more file
+  for (int i = 0; i < 3; ++i) {
+    ASSERT_OK(Put(Key(0), Random::GetTLSInstance()->RandomBinaryString(100)));
+    ASSERT_OK(Flush());
+  }
+
+  // Test reading from Hot temperature file
+  ASSERT_OK(options.statistics->Reset());
+  get_iostats_context()->Reset();
+
+  ASSERT_EQ(100U, Get(Key(0)).size());
+
+  VerifyTemperatureFileReadStats(*options.statistics, Temperature::kHot);
+
+  // Age initial files to warm
+  env_->MockSleepForSeconds(600);
+  ASSERT_OK(Put(Key(1), Random::GetTLSInstance()->RandomBinaryString(101)));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // Test reading from Warm temperature file (the aged file)
+  ASSERT_OK(options.statistics->Reset());
+  get_iostats_context()->Reset();
+
+  ASSERT_EQ(100U, Get(Key(0)).size());
+
+  // Verify Warm file statistics
+  VerifyTemperatureFileReadStats(*options.statistics, Temperature::kWarm);
+
+  // Age initial files to cold
+  env_->MockSleepForSeconds(600);
+  ASSERT_OK(Put(Key(2), Random::GetTLSInstance()->RandomBinaryString(102)));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // Test reading from Cold temperature file (the aged file)
+  ASSERT_OK(options.statistics->Reset());
+  get_iostats_context()->Reset();
+
+  ASSERT_EQ(100U, Get(Key(0)).size());
+
+  VerifyTemperatureFileReadStats(*options.statistics, Temperature::kCold);
+
+  // Age initial files to ice
+  env_->MockSleepForSeconds(600);
+  ASSERT_OK(Put(Key(3), Random::GetTLSInstance()->RandomBinaryString(103)));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // Test reading from Ice temperature file (the aged file)
+  ASSERT_OK(options.statistics->Reset());
+  get_iostats_context()->Reset();
+
+  ASSERT_EQ(100U, Get(Key(0)).size());
+
+  VerifyTemperatureFileReadStats(*options.statistics, Temperature::kIce);
+
+  // Verify temperature progression in metadata
+  ColumnFamilyMetaData metadata;
+  db_->GetColumnFamilyMetaData(&metadata);
+
+  // Should have files at different temperatures
+  std::map<Temperature, int> temp_counts;
+  for (const auto& file : metadata.levels[0].files) {
+    temp_counts[file.temperature]++;
+  }
+
+  // Verify current files temperatures
+  EXPECT_EQ(temp_counts[Temperature::kHot], 1);
+  EXPECT_EQ(temp_counts[Temperature::kWarm], 1);
+  EXPECT_EQ(temp_counts[Temperature::kCold], 1);
+  EXPECT_EQ(temp_counts[Temperature::kIce], 3);
+
+  // Verify historical (and current) file temperatures
+  EXPECT_EQ(total_hot, 6);
+  EXPECT_EQ(total_warm, 5);
+  EXPECT_EQ(total_cold, 4);
+  EXPECT_EQ(total_ice, 3);
+
+  // Final comprehensive test: read from all temperature files
+  Reopen(options);
+  ASSERT_OK(options.statistics->Reset());
+  get_iostats_context()->Reset();
+
+  // Read from all files to verify cumulative statistics
+  for (int i = 0; i < 4; i++) {
+    ASSERT_EQ(static_cast<unsigned>(100 + i), Get(Key(i)).size());
+  }
+
+  VerifyTemperatureFileReadStats(*options.statistics, TemperatureSet::All());
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
 TEST_F(DBCompactionTest, DisableMultiManualCompaction) {
   const int kNumL0Files = 10;
 
diff --git a/db/db_test2.cc b/db/db_test2.cc
index b84c4b35a1fb..4f1738880c3e 100644
--- a/db/db_test2.cc
+++ b/db/db_test2.cc
@@ -6068,7 +6068,8 @@ TEST_F(DBTest2, VariousFileTemperatures) {
   // non-unknown temperatures.
   auto RandomTemp = [] {
     static std::vector<Temperature> temps = {
-        Temperature::kHot, Temperature::kWarm, Temperature::kCold};
+        Temperature::kHot, Temperature::kWarm, Temperature::kCold,
+        Temperature::kIce};
     return temps[Random::GetTLSInstance()->Uniform(
         static_cast<int>(temps.size()))];
   };
diff --git a/db/version_edit.cc b/db/version_edit.cc
index f666308bc071..84aeba823faa 100644
--- a/db/version_edit.cc
+++ b/db/version_edit.cc
@@ -396,7 +396,7 @@ const char* VersionEdit::DecodeNewFile4From(Slice* input) {
             return "temperature field wrong size";
           } else {
             Temperature casted_field = static_cast<Temperature>(field[0]);
-            if (casted_field <= Temperature::kCold) {
+            if (casted_field < Temperature::kLastTemperature) {
               f.temperature = casted_field;
             }
           }
diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc
index b24a95c72f37..9f420b9645ad 100644
--- a/db_stress_tool/db_stress_test_base.cc
+++ b/db_stress_tool/db_stress_test_base.cc
@@ -428,6 +428,8 @@ bool StressTest::BuildOptionsTable() {
     options_tbl.emplace(
         "file_temperature_age_thresholds",
         std::vector<std::string>{
+            "{{temperature=kWarm;age=10}:{temperature=kCold;age=50}:{"
+            "temperature=kIce;age=250}}",
             "{{temperature=kWarm;age=30}:{temperature=kCold;age=300}}",
             "{{temperature=kCold;age=100}}", "{}"});
     options_tbl.emplace(
diff --git a/file/random_access_file_reader.cc b/file/random_access_file_reader.cc
index c8edc86360ec..b14a9c8bfecd 100644
--- a/file/random_access_file_reader.cc
+++ b/file/random_access_file_reader.cc
@@ -86,6 +86,12 @@ inline void RecordIOStats(Statistics* stats, Temperature file_temperature,
         RecordTick(stats, COLD_FILE_READ_BYTES, size);
         RecordTick(stats, COLD_FILE_READ_COUNT, 1);
         break;
+      case Temperature::kIce:
+        IOSTATS_ADD(file_io_stats_by_temperature.ice_file_bytes_read, size);
+        IOSTATS_ADD(file_io_stats_by_temperature.ice_file_read_count, 1);
+        RecordTick(stats, ICE_FILE_READ_BYTES, size);
+        RecordTick(stats, ICE_FILE_READ_COUNT, 1);
+        break;
       default:
         break;
     }
diff --git a/include/rocksdb/iostats_context.h b/include/rocksdb/iostats_context.h
index 592bc0c46709..64cf8cb49365 100644
--- a/include/rocksdb/iostats_context.h
+++ b/include/rocksdb/iostats_context.h
@@ -34,20 +34,26 @@ struct FileIOByTemperature {
   uint64_t warm_file_bytes_read;
   // the number of bytes read to Temperature::kCold file
   uint64_t cold_file_bytes_read;
+  // the number of bytes read to Temperature::kIce file
+  uint64_t ice_file_bytes_read;
   // total number of reads to Temperature::kHot file
   uint64_t hot_file_read_count;
   // total number of reads to Temperature::kWarm file
   uint64_t warm_file_read_count;
   // total number of reads to Temperature::kCold file
   uint64_t cold_file_read_count;
+  // total number of reads to Temperature::kIce file
+  uint64_t ice_file_read_count;
   // reset all the statistics to 0.
   void Reset() {
     hot_file_bytes_read = 0;
     warm_file_bytes_read = 0;
     cold_file_bytes_read = 0;
+    ice_file_bytes_read = 0;
     hot_file_read_count = 0;
     warm_file_read_count = 0;
     cold_file_read_count = 0;
+    ice_file_read_count = 0;
   }
 };
 
diff --git a/include/rocksdb/statistics.h b/include/rocksdb/statistics.h
index 0d88d8937c5f..af97cffeb8d5 100644
--- a/include/rocksdb/statistics.h
+++ b/include/rocksdb/statistics.h
@@ -444,9 +444,11 @@ enum Tickers : uint32_t {
   HOT_FILE_READ_BYTES,
   WARM_FILE_READ_BYTES,
   COLD_FILE_READ_BYTES,
+  ICE_FILE_READ_BYTES,
   HOT_FILE_READ_COUNT,
   WARM_FILE_READ_COUNT,
   COLD_FILE_READ_COUNT,
+  ICE_FILE_READ_COUNT,
 
   // Last level and non-last level read statistics
   LAST_LEVEL_READ_BYTES,
diff --git a/include/rocksdb/types.h b/include/rocksdb/types.h
index 368736cbd097..33bd9c869c90 100644
--- a/include/rocksdb/types.h
+++ b/include/rocksdb/types.h
@@ -119,6 +119,9 @@ enum class Temperature : uint8_t {
   kHot = 0x04,
   kWarm = 0x08,
   kCold = 0x0C,
+  kIce = 0x10,
+  // XXX: this is mis-named. It is instead an invalid temperature beyond the
+  // rest
   kLastTemperature,
 };
 
diff --git a/java/rocksjni/portal.h b/java/rocksjni/portal.h
index 9f94bcee0273..5371c97a17c6 100644
--- a/java/rocksjni/portal.h
+++ b/java/rocksjni/portal.h
@@ -5275,6 +5275,10 @@ class TickerTypeJni {
         return -0x57;
       case ROCKSDB_NAMESPACE::Tickers::FIFO_CHANGE_TEMPERATURE_COMPACTIONS:
         return -0x58;
+      case ROCKSDB_NAMESPACE::Tickers::ICE_FILE_READ_BYTES:
+        return -0x59;
+      case ROCKSDB_NAMESPACE::Tickers::ICE_FILE_READ_COUNT:
+        return -0x5A;
       case ROCKSDB_NAMESPACE::Tickers::TICKER_ENUM_MAX:
         // -0x54 is the max value at this time. Since these values are exposed
         // directly to Java clients, we'll keep the value the same till the next
@@ -5739,6 +5743,10 @@ class TickerTypeJni {
             FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT;
       case -0x58:
         return ROCKSDB_NAMESPACE::Tickers::FIFO_CHANGE_TEMPERATURE_COMPACTIONS;
+      case -0x59:
+        return ROCKSDB_NAMESPACE::Tickers::ICE_FILE_READ_BYTES;
+      case -0x5A:
+        return ROCKSDB_NAMESPACE::Tickers::ICE_FILE_READ_COUNT;
       case -0x54:
         // -0x54 is the max value at this time. Since these values are exposed
         // directly to Java clients, we'll keep the value the same till the next
diff --git a/java/src/main/java/org/rocksdb/TickerType.java b/java/src/main/java/org/rocksdb/TickerType.java
index 3b488660e851..12cea6d2385b 100644
--- a/java/src/main/java/org/rocksdb/TickerType.java
+++ b/java/src/main/java/org/rocksdb/TickerType.java
@@ -765,9 +765,11 @@ public enum TickerType {
     HOT_FILE_READ_BYTES((byte) -0x31),
     WARM_FILE_READ_BYTES((byte) -0x32),
     COLD_FILE_READ_BYTES((byte) -0x33),
+    ICE_FILE_READ_BYTES((byte) -0x58),
     HOT_FILE_READ_COUNT((byte) -0x34),
     WARM_FILE_READ_COUNT((byte) -0x35),
     COLD_FILE_READ_COUNT((byte) -0x36),
+    ICE_FILE_READ_COUNT((byte) -0x59),
 
     /**
      * (non-)last level read statistics
diff --git a/monitoring/statistics.cc b/monitoring/statistics.cc
index b2f7cbe59d69..4aaf3c6e7c72 100644
--- a/monitoring/statistics.cc
+++ b/monitoring/statistics.cc
@@ -227,9 +227,11 @@ const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
     {HOT_FILE_READ_BYTES, "rocksdb.hot.file.read.bytes"},
     {WARM_FILE_READ_BYTES, "rocksdb.warm.file.read.bytes"},
     {COLD_FILE_READ_BYTES, "rocksdb.cold.file.read.bytes"},
+    {ICE_FILE_READ_BYTES, "rocksdb.ice.file.read.bytes"},
     {HOT_FILE_READ_COUNT, "rocksdb.hot.file.read.count"},
     {WARM_FILE_READ_COUNT, "rocksdb.warm.file.read.count"},
     {COLD_FILE_READ_COUNT, "rocksdb.cold.file.read.count"},
+    {ICE_FILE_READ_COUNT, "rocksdb.ice.file.read.count"},
     {LAST_LEVEL_READ_BYTES, "rocksdb.last.level.read.bytes"},
     {LAST_LEVEL_READ_COUNT, "rocksdb.last.level.read.count"},
     {NON_LAST_LEVEL_READ_BYTES, "rocksdb.non.last.level.read.bytes"},
diff --git a/options/options_helper.cc b/options/options_helper.cc
index ef7292bf0c22..2f7a303929f3 100644
--- a/options/options_helper.cc
+++ b/options/options_helper.cc
@@ -368,7 +368,8 @@ std::map<Temperature, std::string> OptionsHelper::temperature_to_string = {
     {Temperature::kUnknown, "kUnknown"},
     {Temperature::kHot, "kHot"},
     {Temperature::kWarm, "kWarm"},
-    {Temperature::kCold, "kCold"}};
+    {Temperature::kCold, "kCold"},
+    {Temperature::kIce, "kIce"}};
 
 std::unordered_map<std::string, ChecksumType>
     OptionsHelper::checksum_type_string_map = {{"kNoChecksum", kNoChecksum},
@@ -966,7 +967,8 @@ std::unordered_map<std::string, Temperature>
         {"kUnknown", Temperature::kUnknown},
         {"kHot", Temperature::kHot},
         {"kWarm", Temperature::kWarm},
-        {"kCold", Temperature::kCold}};
+        {"kCold", Temperature::kCold},
+        {"kIce", Temperature::kIce}};
 
 std::unordered_map<std::string, PrepopulateBlobCache>
     OptionsHelper::prepopulate_blob_cache_string_map = {
diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index 1c00ce60026e..5b5234056a8e 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -348,13 +348,13 @@ def setup_random_seed_before_main():
     "enable_custom_split_merge": lambda: random.choice([0, 1]),
     "adm_policy": lambda: random.choice([0, 1, 2, 3]),
     "last_level_temperature": lambda: random.choice(
-        ["kUnknown", "kHot", "kWarm", "kCold"]
+        ["kUnknown", "kHot", "kWarm", "kCold", "kIce"]
     ),
     "default_write_temperature": lambda: random.choice(
-        ["kUnknown", "kHot", "kWarm", "kCold"]
+        ["kUnknown", "kHot", "kWarm", "kCold", "kIce"]
     ),
     "default_temperature": lambda: random.choice(
-        ["kUnknown", "kHot", "kWarm", "kCold"]
+        ["kUnknown", "kHot", "kWarm", "kCold", "kIce"]
     ),
     # TODO(hx235): enable `enable_memtable_insert_with_hint_prefix_extractor`
     # after fixing the surfaced issue with delete range
@@ -630,9 +630,8 @@ def is_direct_io_supported(dbname):
     "use_shared_block_and_blob_cache": lambda: random.randint(0, 1),
     "blob_cache_size": lambda: random.choice([1048576, 2097152, 4194304, 8388608]),
     "prepopulate_blob_cache": lambda: random.randint(0, 1),
-
-     # TODO Fix races when both Remote Compaction + BlobDB enabled
-     "remote_compaction_worker_threads": 0,
+    # TODO Fix races when both Remote Compaction + BlobDB enabled
+    "remote_compaction_worker_threads": 0,
 }
 
 ts_params = {
@@ -661,10 +660,11 @@ def is_direct_io_supported(dbname):
     "preclude_last_level_data_seconds": lambda: random.choice(
         [-1, -1, 10, 60, 1200, 86400]
     ),
-    "last_level_temperature": "kCold",
+    "last_level_temperature": lambda: random.choice(["kCold", "kIce"]),
     # For FIFO compaction (ignored otherwise)
     "file_temperature_age_thresholds": lambda: random.choice(
         [
+            "{{temperature=kWarm;age=10}:{temperature=kCold;age=50}:{temperature=kIce;age=250}}",
             "{{temperature=kWarm;age=30}:{temperature=kCold;age=300}}",
             "{{temperature=kCold;age=100}}",
         ]

From f46242cef631351a5c8f4a7b0fb0935ec7fa61c8 Mon Sep 17 00:00:00 2001
From: Xingbo Wang <xbw@fb.com>
Date: Wed, 10 Sep 2025 10:42:07 -0700
Subject: [PATCH 272/500] Fix uninitialized value complaint in valgrind
 (#13934)

Summary:
Fix uninitialized value complaint in valgrind due to gtest print padded struct.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13934

Test Plan: CI. Verified that valgrind no longer complains about it.

Reviewed By: pdillinger

Differential Revision: D82124983

Pulled By: xingbowang

fbshipit-source-id: 99eb7bab99726c45affe0a231777e5951844d73b
---
 .../transactions/lock/point/point_lock_manager_test.cc | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/utilities/transactions/lock/point/point_lock_manager_test.cc b/utilities/transactions/lock/point/point_lock_manager_test.cc
index b9b4dc724482..5f7c789282fa 100644
--- a/utilities/transactions/lock/point/point_lock_manager_test.cc
+++ b/utilities/transactions/lock/point/point_lock_manager_test.cc
@@ -14,6 +14,16 @@ struct SpotLockManagerTestParam {
   int deadlock_timeout_us;
 };
 
+// Define operator<< for SpotLockManagerTestParam to stop valgrind from
+// complaining uinitialized value when printing SpotLockManagerTestParam.
+std::ostream& operator<<(std::ostream& os,
+                         const SpotLockManagerTestParam& param) {
+  os << "use_per_key_point_lock_manager: "
+     << param.use_per_key_point_lock_manager
+     << ", deadlock_timeout_us: " << param.deadlock_timeout_us;
+  return os;
+}
+
 // including test for both PointLockManager and PerKeyPointLockManager
 class SpotLockManagerTest
     : public PointLockManagerTest,

From d87e598f70c960e3de2ea1984111f4fa35cbfee6 Mon Sep 17 00:00:00 2001
From: Andrew Chang <andrewrchang@meta.com>
Date: Wed, 10 Sep 2025 17:54:26 -0700
Subject: [PATCH 273/500] Update error logging and status reporting for
 unsupported iouring (#13936)

Summary:
We should add error logging to be able to pinpoint why RocksDB is returning status `NotSupported` for `ReadAsync`.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13936

Test Plan: Look at logs (and client logs of error status)

Reviewed By: anand1976

Differential Revision: D82141529

Pulled By: archang19

fbshipit-source-id: c71b70967457be35ef5168321d449f96b2b9441d
---
 env/io_posix.cc | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/env/io_posix.cc b/env/io_posix.cc
index 0c7ddc73cd5a..6f3edf47a507 100644
--- a/env/io_posix.cc
+++ b/env/io_posix.cc
@@ -935,7 +935,8 @@ IOStatus PosixRandomAccessFile::ReadAsync(
 
   // Init failed, platform doesn't support io_uring.
   if (iu == nullptr) {
-    return IOStatus::NotSupported("ReadAsync");
+    fprintf(stderr, "failed to init io_uring\n");
+    return IOStatus::NotSupported("ReadAsync: failed to init io_uring");
   }
 
   // Allocate io_handle.
@@ -978,7 +979,8 @@ IOStatus PosixRandomAccessFile::ReadAsync(
   (void)cb_arg;
   (void)io_handle;
   (void)del_fn;
-  return IOStatus::NotSupported("ReadAsync");
+  return IOStatus::NotSupported(
+      "ReadAsync: ROCKSDB_IOURING_PRESENT is not set");
 #endif
 }
 

From 799f83a934041b305441b746f1de8b14c0f11810 Mon Sep 17 00:00:00 2001
From: Hui Xiao <huixiao@fb.com>
Date: Thu, 11 Sep 2025 12:19:11 -0700
Subject: [PATCH 274/500] Rename and clarify
 CompactionJobStats::has_num_input_records for clarity and set true by default
 (#13929)

Summary:
**Context/Summary:**
Internally `CompactionJobStats ::num_input_records` is only used for input record count [verification](https://github.com/facebook/rocksdb/blob/1aca60c089a48857930b4191b0c84b6dd98a221c/db/compaction/compaction_job.cc#L2535) and such verification always checks for `CompactionJobStats::has_num_input_records` (now renamed) before using this field. This is needed because the `CompactionJobStats::num_input_records` gets its number from `CompactionIterator::NumInputEntryScanned()` in a subcompaction and this number can be inaccurate purposefully to increase performance, see [CompactionIterator::must_count_input_entries](https://github.com/facebook/rocksdb/pull/13929/files#diff-e6c876f655a21865c0f3dff94b9763f1bd40cf88a8a86f04868201b2e845a890R186-R199) for more.
- This PR renames the `CompactionJobStats::has_num_input_records` to more explicit naming and adds more comments. Not a behavior change.

Also, aggregation of  `CompactionJobStats::has_num_input_records` among all subcompactions is done by [AND](https://github.com/facebook/rocksdb/blob/1aca60c089a48857930b4191b0c84b6dd98a221c/util/compaction_job_stats_impl.cc#L62) operation so it's false if any of the subcompaction has this field being false. The default value of this field should be "true" in order to not mistakenly "false" by default. We are currently fine because `CompactionJobStats::Reset()` that [sets the value to be true](https://github.com/facebook/rocksdb/blob/1aca60c089a48857930b4191b0c84b6dd98a221c/util/compaction_job_stats_impl.cc#L14) is always called before such aggregation.
 - This PR changes the default value to be true.
 - Resumable compaction development plans to set `CompactionJobStats::has_num_input_records` to be false if the previous compaction carries inaccurate records. In order for this not be overwritten by the subsequent progress in [here](https://github.com/facebook/rocksdb/blob/1aca60c089a48857930b4191b0c84b6dd98a221c/db/compaction/compaction_job.cc#L1540-L1543), this PR also changes this = to AND operation and +=. With the default value `CompactionJobStats::has_num_input_records` now to be true (or Reset() already called) and `CompactionJobStats::num_input_records=0` already, this will not a behavior change.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13929

Test Plan: - Existing UT to test "...changes the default value to be true" is safe.

Reviewed By: jaykorean

Differential Revision: D82014912

Pulled By: hx235

fbshipit-source-id: 6f211c3b2c9eb7d39abf37271d21a4d3f407b934
---
 db/compaction/compaction_iterator.h    | 22 ++++++++++++++++++----
 db/compaction/compaction_job.cc        |  9 ++++-----
 include/rocksdb/compaction_job_stats.h |  7 ++++---
 util/compaction_job_stats_impl.cc      |  4 ++--
 4 files changed, 28 insertions(+), 14 deletions(-)

diff --git a/db/compaction/compaction_iterator.h b/db/compaction/compaction_iterator.h
index bc0407e0ee6f..6117d23f9e18 100644
--- a/db/compaction/compaction_iterator.h
+++ b/db/compaction/compaction_iterator.h
@@ -183,10 +183,20 @@ class CompactionIterator {
     const Compaction* compaction_;
   };
 
-  // @param must_count_input_entries  if true, `NumInputEntryScanned()` will
-  // return the number of input keys scanned. If false, `NumInputEntryScanned()`
-  // will return this number if no Seek was called on `input`. User should call
-  // `HasNumInputEntryScanned()` first in this case.
+  // @param must_count_input_entries Controls input entry counting accuracy vs
+  // performance:
+  //   - If true: `NumInputEntryScanned()` always returns the exact count of
+  //   input keys
+  //     scanned. The iterator will use sequential `Next()` calls instead of
+  //     `Seek()` to maintain count accuracy as `Seek()` will not count the
+  //     skipped input entries, which is slower but guarantees correctness.
+  //   - If false: `NumInputEntryScanned()` returns the count only if no
+  //   `Seek()` operations
+  //     were performed on the input iterator. When compaction filters request
+  //     skipping ranges of keys or other optimizations trigger seek operations,
+  //     the count becomes unreliable. Always call `HasNumInputEntryScanned()`
+  //     first to verify if the count is accurate before using
+  //     `NumInputEntryScanned()`.
   CompactionIterator(
       InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper,
       SequenceNumber last_sequence, std::vector<SequenceNumber>* snapshots,
@@ -255,6 +265,10 @@ class CompactionIterator {
   }
   const CompactionIterationStats& iter_stats() const { return iter_stats_; }
   bool HasNumInputEntryScanned() const { return input_.HasNumItered(); }
+
+  // This method should only be used when `HasNumInputEntryScanned()` returns
+  // true, unless `must_count_input_entries=true` was specified during iterator
+  // creation (which ensures the count is always accurate).
   uint64_t NumInputEntryScanned() const { return input_.NumItered(); }
   Status InputStatus() const { return input_.status(); }
 
diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc
index 907e178df804..f63115319761 100644
--- a/db/compaction/compaction_job.cc
+++ b/db/compaction/compaction_job.cc
@@ -896,7 +896,8 @@ void CompactionJob::AggregateSubcompactionOutputAndJobStats() {
 Status CompactionJob::VerifyCompactionRecordCounts(
     bool stats_built_from_input_table_prop, uint64_t num_input_range_del) {
   Status status;
-  if (stats_built_from_input_table_prop && job_stats_->has_num_input_records) {
+  if (stats_built_from_input_table_prop &&
+      job_stats_->has_accurate_num_input_records) {
     status = VerifyInputRecordCount(num_input_range_del);
     if (!status.ok()) {
       return status;
@@ -1533,13 +1534,11 @@ void CompactionJob::FinalizeSubcompactionJobStats(
     const CompactionIOStatsSnapshot& io_stats) {
   const CompactionIterationStats& c_iter_stats = c_iter->iter_stats();
 
-  // This number may not be accurate when CompactionIterator was created
-  // with `must_count_input_entries=false`.
   assert(!sub_compact->compaction->DoesInputReferenceBlobFiles() ||
          c_iter->HasNumInputEntryScanned());
-  sub_compact->compaction_job_stats.has_num_input_records =
+  sub_compact->compaction_job_stats.has_accurate_num_input_records &=
       c_iter->HasNumInputEntryScanned();
-  sub_compact->compaction_job_stats.num_input_records =
+  sub_compact->compaction_job_stats.num_input_records +=
       c_iter->NumInputEntryScanned();
   sub_compact->compaction_job_stats.num_blobs_read =
       c_iter_stats.num_blobs_read;
diff --git a/include/rocksdb/compaction_job_stats.h b/include/rocksdb/compaction_job_stats.h
index 0af8c3eb689b..c9476d70a78d 100644
--- a/include/rocksdb/compaction_job_stats.h
+++ b/include/rocksdb/compaction_job_stats.h
@@ -24,9 +24,10 @@ struct CompactionJobStats {
   // the elapsed CPU time of this compaction in microseconds.
   uint64_t cpu_micros = 0;
 
-  // Used internally indicating whether a subcompaction's
-  // `num_input_records` is accurate.
-  bool has_num_input_records = false;
+  // True if `num_input_records` is accurate across all subcompactions.
+  // See CompactionIterator::must_count_input_entries for some implementation
+  // details why `num_input_records` may not be accurate.
+  bool has_accurate_num_input_records = true;
   // the number of compaction input records.
   uint64_t num_input_records = 0;
   // the number of blobs read from blob files
diff --git a/util/compaction_job_stats_impl.cc b/util/compaction_job_stats_impl.cc
index 895db35c1e87..1d8eaa3693d8 100644
--- a/util/compaction_job_stats_impl.cc
+++ b/util/compaction_job_stats_impl.cc
@@ -11,7 +11,7 @@ void CompactionJobStats::Reset() {
   elapsed_micros = 0;
   cpu_micros = 0;
 
-  has_num_input_records = true;
+  has_accurate_num_input_records = true;
   num_input_records = 0;
   num_blobs_read = 0;
   num_input_files = 0;
@@ -59,7 +59,7 @@ void CompactionJobStats::Add(const CompactionJobStats& stats) {
   elapsed_micros += stats.elapsed_micros;
   cpu_micros += stats.cpu_micros;
 
-  has_num_input_records &= stats.has_num_input_records;
+  has_accurate_num_input_records &= stats.has_accurate_num_input_records;
   num_input_records += stats.num_input_records;
   num_blobs_read += stats.num_blobs_read;
   num_input_files += stats.num_input_files;

From 4f12c55e3e06e9572c3bc38cc8fdbfb8e561725b Mon Sep 17 00:00:00 2001
From: Jay Huh <jewoongh@meta.com>
Date: Fri, 12 Sep 2025 11:42:48 -0700
Subject: [PATCH 275/500] Make Remote Compaction Failures fall back to local in
 Stress Test (#13945)

Summary:
This PR enables Stress Test to fall back to local compaction when a remote compaction fails, allowing the compaction to be retried on the main thread.

If the local compaction succeeds, the stress test will continue without failing. The main thread will log that the remote compaction failed and was retried locally, while detailed failure logs from the remote compaction attempt will still be printed by the worker thread for further investigation.

This approach allows us to keep collecting useful logs for diagnosing remote compaction failures in Stress Test, while ensuring the test continues to run with remote compaction enabled.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13945

Test Plan:
```
python3 -u tools/db_crashtest.py --cleanup_cmd='' --simple blackbox --remote_compaction_worker_threads=8 --interval=10
```

# Internal Only

https://www.internalfb.com/sandcastle/workflow/1315051091202224133

https://www.internalfb.com/sandcastle/workflow/3382203320165521367

https://www.internalfb.com/sandcastle/workflow/2616591383512372892

https://www.internalfb.com/sandcastle/workflow/4607182418810099066

Reviewed By: hx235

Differential Revision: D82279337

Pulled By: jaykorean

fbshipit-source-id: 6f663ec2eeb642fd4ad885a90efb344432a32f89
---
 db_stress_tool/db_stress_common.h             |  1 +
 db_stress_tool/db_stress_compaction_service.h | 13 +++++++++++--
 db_stress_tool/db_stress_gflags.cc            |  4 ++++
 db_stress_tool/db_stress_test_base.cc         |  4 ++--
 tools/db_crashtest.py                         |  8 +++++---
 5 files changed, 23 insertions(+), 7 deletions(-)

diff --git a/db_stress_tool/db_stress_common.h b/db_stress_tool/db_stress_common.h
index 4fe0a3ffcfcf..bf5b47ab2a52 100644
--- a/db_stress_tool/db_stress_common.h
+++ b/db_stress_tool/db_stress_common.h
@@ -425,6 +425,7 @@ DECLARE_uint32(commit_bypass_memtable_one_in);
 DECLARE_bool(track_and_verify_wals);
 DECLARE_int32(remote_compaction_worker_threads);
 DECLARE_int32(remote_compaction_worker_interval);
+DECLARE_bool(remote_compaction_failure_fall_back_to_local);
 DECLARE_bool(auto_refresh_iterator_with_snapshot);
 DECLARE_uint32(memtable_op_scan_flush_trigger);
 DECLARE_uint32(memtable_avg_op_scan_flush_trigger);
diff --git a/db_stress_tool/db_stress_compaction_service.h b/db_stress_tool/db_stress_compaction_service.h
index 824d77b11d11..a47963e261f9 100644
--- a/db_stress_tool/db_stress_compaction_service.h
+++ b/db_stress_tool/db_stress_compaction_service.h
@@ -14,8 +14,11 @@ namespace ROCKSDB_NAMESPACE {
 // Service to simulate Remote Compaction in Stress Test
 class DbStressCompactionService : public CompactionService {
  public:
-  explicit DbStressCompactionService(SharedState* shared)
-      : shared_(shared), aborted_(false) {}
+  explicit DbStressCompactionService(SharedState* shared,
+                                     bool failure_should_fall_back_to_local)
+      : shared_(shared),
+        aborted_(false),
+        failure_should_fall_back_to_local_(failure_should_fall_back_to_local) {}
 
   static const char* kClassName() { return "DbStressCompactionService"; }
 
@@ -56,6 +59,11 @@ class DbStressCompactionService : public CompactionService {
       }
       Env::Default()->SleepForMicroseconds(kWaitIntervalInMicros);
     }
+    if (failure_should_fall_back_to_local_) {
+      fprintf(stdout,
+              "Remote Compaction failed - fall back to local compaction!\n");
+      return CompactionServiceJobStatus::kUseLocal;
+    }
     return CompactionServiceJobStatus::kFailure;
   }
 
@@ -90,6 +98,7 @@ class DbStressCompactionService : public CompactionService {
  private:
   SharedState* shared_;
   std::atomic_bool aborted_{false};
+  bool failure_should_fall_back_to_local_;
 };
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/db_stress_tool/db_stress_gflags.cc b/db_stress_tool/db_stress_gflags.cc
index e3cb957a19e2..94cc3ea1e446 100644
--- a/db_stress_tool/db_stress_gflags.cc
+++ b/db_stress_tool/db_stress_gflags.cc
@@ -857,6 +857,10 @@ DEFINE_int32(remote_compaction_worker_interval, 10,
              "Remote Compaction Worker Thread dequeue tasks every N "
              "milliseconds. (Default: 10ms)");
 
+DEFINE_bool(remote_compaction_failure_fall_back_to_local, true,
+            "If true, remote compaction failures will be ignored and "
+            "compactions will fall back to local and retried");
+
 DEFINE_uint32(ingest_wbwi_one_in, 0,
               "If set, will call"
               "IngestWriteBatchWithIndex() instead of regular write operations "
diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc
index 9f420b9645ad..41b5d73ee668 100644
--- a/db_stress_tool/db_stress_test_base.cc
+++ b/db_stress_tool/db_stress_test_base.cc
@@ -3669,8 +3669,8 @@ void StressTest::Open(SharedState* shared, bool reopen) {
               "Compaction\n");
       exit(1);
     }
-    options_.compaction_service =
-        std::make_shared<DbStressCompactionService>(shared);
+    options_.compaction_service = std::make_shared<DbStressCompactionService>(
+        shared, FLAGS_remote_compaction_failure_fall_back_to_local);
   }
 
   if ((options_.enable_blob_files || options_.enable_blob_garbage_collection ||
diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index 5b5234056a8e..763be6c99403 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -366,9 +366,10 @@ def setup_random_seed_before_main():
     "paranoid_memory_checks": lambda: random.choice([0] * 7 + [1]),
     "allow_unprepared_value": lambda: random.choice([0, 1]),
     # TODO(hx235): enable `track_and_verify_wals` after stabalizing the stress test
-    "track_and_verify_wals": lambda: random.choice([0]),
-    # TODO(jaykorean): re-enable remote compaction worker threads after addressing all issues
-    "remote_compaction_worker_threads": 0,
+    "track_and_verify_wals": lambda: random.choice([0]),    
+    "remote_compaction_worker_threads": lambda: random.choice([0, 8]),
+    # TODO(jaykorean): Change to lambda: random.choice([0, 1]) after addressing all remote compaction failures
+    "remote_compaction_failure_fall_back_to_local": 1,
     "auto_refresh_iterator_with_snapshot": lambda: random.choice([0, 1]),
     "memtable_op_scan_flush_trigger": lambda: random.choice([0, 10, 100, 1000]),
     "memtable_avg_op_scan_flush_trigger": lambda: random.choice([0, 2, 20, 200]),
@@ -803,6 +804,7 @@ def finalize_and_sanitize(src_params):
         dest_params["inplace_update_support"] = 0
         dest_params["checkpoint_one_in"] = 0
         dest_params["use_timed_put_one_in"] = 0
+        dest_params["test_secondary"] = 0
 
     # Multi-key operations are not currently compatible with transactions or
     # timestamp.

From 54941a8d42de156f4b1fc8031ce439877e097b97 Mon Sep 17 00:00:00 2001
From: Hui Xiao <huixiao@fb.com>
Date: Fri, 12 Sep 2025 13:52:10 -0700
Subject: [PATCH 276/500] Fix a race condition in FIFO size-based compaction
 where concurrent threads could select the same non-L0 file (#13946)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Summary:
**Context/Summary:**
Fix a race condition (illustrated below) in FIFO size-based compaction where concurrent threads could select the same non-L0 file, causing assertion failures in debug builds or "Cannot delete table file from LSM tree" errors in release builds.
```
Thread 1                           Thread 2
--------                           --------
FIFO size-based compaction
   ↓
Pick L2 file
   ↓
Mark: file.being_compacted = true (file.being_compacted was false)
   ↓
WriteManifestStart (unlock mutex) ─→ FIFO size-based compaction starts
   ↓                                   ↓
Continue manifest write...          Pick SAME L2 file
   ↓                                Mark: file.being_compacted = true  (file.being_compacted was true) ❌
   ↓                                   ↓
   ↓                                Unlock mutex, wait for manifest
   ↓                                   ↓
Lock mutex ←─────────────────────────────────┘
   ↓
Delete L2 file ✅
   ↓
Complete ─────────────────────────────→ Try delete same file ❌
                                        ↓
                                     ERROR: "file not in LSM tree"

🐛 BUG: Both threads pick the same file!
    Thread 2 doesn't properly check file.being_compacted flag
```

**Test**
New test that fails before the fix and passes after

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13946

Reviewed By: xingbowang

Differential Revision: D82279731

Pulled By: hx235

fbshipit-source-id: b426517f2d1b23dd7d4951157822a2d322fe1435
---
 db/compaction/compaction_picker_fifo.cc       |  3 +
 db/db_compaction_test.cc                      | 64 +++++++++++++++++++
 ...fix_multi_level_fifo_double_picking_bug.md |  1 +
 3 files changed, 68 insertions(+)
 create mode 100644 unreleased_history/bug_fixes/fix_multi_level_fifo_double_picking_bug.md

diff --git a/db/compaction/compaction_picker_fifo.cc b/db/compaction/compaction_picker_fifo.cc
index 98c03d01131c..51dd4ea5344e 100644
--- a/db/compaction/compaction_picker_fifo.cc
+++ b/db/compaction/compaction_picker_fifo.cc
@@ -258,6 +258,9 @@ Compaction* FIFOCompactionPicker::PickSizeCompaction(
     // better serves a major type of FIFO use cases where smaller keys are
     // associated with older data.
     for (const auto& f : last_level_files) {
+      if (f->being_compacted) {
+        continue;
+      }
       total_size -= f->fd.file_size;
       inputs[0].files.push_back(f);
       char tmp_fsize[16];
diff --git a/db/db_compaction_test.cc b/db/db_compaction_test.cc
index 69621278c177..99b2c7208dba 100644
--- a/db/db_compaction_test.cc
+++ b/db/db_compaction_test.cc
@@ -7129,6 +7129,70 @@ TEST_F(DBCompactionTest, PartialManualCompaction) {
   ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
 }
 
+TEST_F(DBCompactionTest, ConcurrentFIFOPickingSameFileBug) {
+  Options opts = CurrentOptions();
+  opts.compaction_style = CompactionStyle::kCompactionStyleLevel;
+  opts.num_levels = 3;
+  opts.disable_auto_compactions = true;
+  opts.max_background_jobs = 3;
+
+  DestroyAndReopen(opts);
+
+  ASSERT_OK(Put("k1", "v1"));
+  ASSERT_OK(Flush());
+
+  // Create a non-L0 SST file for multi-level FIFO size-based compaction later
+  MoveFilesToLevel(2);
+
+  Options opts_new(opts);
+  opts_new.compaction_style = CompactionStyle::kCompactionStyleFIFO;
+  opts_new.max_open_files = -1;
+  // Set a low threshold to trigger multi-level size-based compaction
+  opts_new.compaction_options_fifo.max_table_files_size = 1;
+
+  Reopen(opts_new);
+
+  const CompactRangeOptions cro;
+  const Slice begin_key("k1");
+  const Slice end_key("k2");
+
+  std::unique_ptr<port::Thread> concurrent_compaction;
+
+  bool within_first_compaction = true;
+  SyncPoint::GetInstance()->SetCallBack(
+      "VersionSet::LogAndApply:WriteManifestStart", [&](void* /*arg*/) {
+        if (!within_first_compaction) {
+          return;
+        }
+        within_first_compaction = false;
+
+        // To allow the second/concurrent compaction to still see the non-L0
+        // SST file and coerce the bug of picking that file
+        SyncPoint::GetInstance()->LoadDependency({
+            {"DBImpl::BackgroundCompaction:BeforeCompaction",
+             "VersionSet::LogAndApply:WriteManifest"},
+        });
+
+        concurrent_compaction.reset(new port::Thread([&]() {
+          // Before the fix, the second CompactRange() will either fail the
+          // assertion of double file picking `being_compacted !=
+          // inputs_[i][j]->being_compacted` in debug mode or cause LSM shape
+          // corruption "Cannot delete table file XXX from level 2 since it is
+          // not in the LSM tree" in release mode
+          Status s = db_->CompactRange(cro, &begin_key, &end_key);
+          ASSERT_OK(s);
+        }));
+      });
+
+  SyncPoint::GetInstance()->EnableProcessing();
+  Status s = db_->CompactRange(cro, &begin_key, &end_key);
+  SyncPoint::GetInstance()->DisableProcessing();
+
+  ASSERT_OK(s);
+
+  concurrent_compaction->join();
+}
+
 TEST_F(DBCompactionTest, ManualCompactionFailsInReadOnlyMode) {
   // Regression test for bug where manual compaction hangs forever when the DB
   // is in read-only mode. Verify it now at least returns, despite failing.
diff --git a/unreleased_history/bug_fixes/fix_multi_level_fifo_double_picking_bug.md b/unreleased_history/bug_fixes/fix_multi_level_fifo_double_picking_bug.md
new file mode 100644
index 000000000000..e6d88a67fc35
--- /dev/null
+++ b/unreleased_history/bug_fixes/fix_multi_level_fifo_double_picking_bug.md
@@ -0,0 +1 @@
+Fix a race condition in FIFO size-based compaction where concurrent threads could select the same non-L0 file, causing assertion failures in debug builds or "Cannot delete table file from LSM tree" errors in release builds.

From acf9d4e44508c9f8f3b9b2c2dae9967363408bc8 Mon Sep 17 00:00:00 2001
From: Changyu Bi <changyubi@meta.com>
Date: Fri, 12 Sep 2025 15:56:49 -0700
Subject: [PATCH 277/500] Fix UDT handling in MultiScan (#13938)

Summary:
we saw some crash test failure at https://github.com/facebook/rocksdb/blob/f46242cef631351a5c8f4a7b0fb0935ec7fa61c8/table/block_based/block_based_table_iterator.cc#L964-L965. This is likely due to timestamp not being considered properly in some places in MultiScan code paths. This PR fixes the issue.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13938

Test Plan: crash test with timestamp and multiscan: `python3 -u ./tools/db_crashtest.py whitebox --enable_ts --iterpercent=60 --prefix_size=-1 --prefixpercent=0 --readpercent=0 --test_batches_snapshots=0 --use_multiscan=1 --read_fault_one_in=0 --kill_random_test=88888 --interval=60`

Reviewed By: anand1976

Differential Revision: D82175263

Pulled By: cbi42

fbshipit-source-id: 5d40ede1aec15f8faeaa7fd041b939e68611ff73
---
 db/version_set.cc                             | 22 +++++++++++++--
 db_stress_tool/db_stress_test_base.cc         | 13 +++++----
 .../block_based/block_based_table_iterator.cc | 28 ++++++++++++++-----
 3 files changed, 49 insertions(+), 14 deletions(-)

diff --git a/db/version_set.cc b/db/version_set.cc
index 70649114a9ce..98f8955a2119 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -1124,8 +1124,26 @@ class LevelIterator final : public InternalIterator {
         continue;
       }
 
-      InternalKey istart(start.value(), kMaxSequenceNumber, kValueTypeForSeek);
-      InternalKey iend(end.value(), 0, kValueTypeForSeekForPrev);
+      const size_t timestamp_size =
+          user_comparator_.user_comparator()->timestamp_size();
+      InternalKey istart, iend;
+      if (timestamp_size == 0) {
+        istart =
+            InternalKey(start.value(), kMaxSequenceNumber, kValueTypeForSeek);
+        // end key is exclusive for multiscan
+        iend = InternalKey(end.value(), kMaxSequenceNumber, kValueTypeForSeek);
+      } else {
+        std::string start_key_with_ts, end_key_with_ts;
+        AppendKeyWithMaxTimestamp(&start_key_with_ts, start.value(),
+                                  timestamp_size);
+        AppendKeyWithMaxTimestamp(&end_key_with_ts, end.value(),
+                                  timestamp_size);
+        istart = InternalKey(start_key_with_ts, kMaxSequenceNumber,
+                             kValueTypeForSeek);
+        // end key is exclusive for multiscan
+        iend =
+            InternalKey(end_key_with_ts, kMaxSequenceNumber, kValueTypeForSeek);
+      }
 
       // TODO: This needs to be optimized, right now we iterate twice, which
       // we dont need to. We can do this in N rather than 2N.
diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc
index 41b5d73ee668..a13ce4db87ed 100644
--- a/db_stress_tool/db_stress_test_base.cc
+++ b/db_stress_tool/db_stress_test_base.cc
@@ -1818,11 +1818,14 @@ Status StressTest::TestMultiScan(ThreadState* thread,
                      key, op_logs, verify_func, &diverged);
 
       if (diverged) {
-        const std::vector<ScanOptions>& scanoptions = scan_opts.GetScanRanges();
-        for (const auto& t : scanoptions) {
-          fprintf(stdout, "Multiscan options: %s to %s \n",
-                  t.range.start.value().ToString(true).c_str(),
-                  t.range.limit.value().ToString(true).c_str());
+        if (thread->shared->HasVerificationFailedYet()) {
+          const std::vector<ScanOptions>& scanoptions =
+              scan_opts.GetScanRanges();
+          for (const auto& t : scanoptions) {
+            fprintf(stdout, "Multiscan options: %s to %s \n",
+                    t.range.start.value().ToString(true).c_str(),
+                    t.range.limit.value().ToString(true).c_str());
+          }
         }
         break;
       }
diff --git a/table/block_based/block_based_table_iterator.cc b/table/block_based/block_based_table_iterator.cc
index 2a7d9893360b..4a480c05e47f 100644
--- a/table/block_based/block_based_table_iterator.cc
+++ b/table/block_based/block_based_table_iterator.cc
@@ -961,8 +961,9 @@ void BlockBasedTableIterator::Prepare(const MultiScanArgs* multiscan_opts) {
 
     // Assume for each scan range start <= limit.
     if (scan_range.limit.has_value()) {
-      assert(user_comparator_.Compare(scan_range.start.value(),
-                                      scan_range.limit.value()) <= 0);
+      assert(user_comparator_.CompareWithoutTimestamp(
+                 scan_range.start.value(), /*a_has_ts=*/false,
+                 scan_range.limit.value(), /*b_has_ts=*/false) <= 0);
     }
 
     if (i > 0) {
@@ -972,8 +973,9 @@ void BlockBasedTableIterator::Prepare(const MultiScanArgs* multiscan_opts) {
       }
 
       const auto& last_end_key = (*scan_opts)[i - 1].range.limit.value();
-      if (user_comparator_.Compare(scan_range.start.value(), last_end_key) <
-          0) {
+      if (user_comparator_.CompareWithoutTimestamp(
+              scan_range.start.value(), /*a_has_ts=*/false, last_end_key,
+              /*b_has_ts=*/false) < 0) {
         // Abort: overlapping ranges
         return;
       }
@@ -983,16 +985,28 @@ void BlockBasedTableIterator::Prepare(const MultiScanArgs* multiscan_opts) {
   // Gather all relevant data block handles
   std::vector<BlockHandle> blocks_to_prepare;
   std::vector<std::tuple<size_t, size_t>> block_ranges_per_scan;
+
+  const size_t timestamp_size =
+      user_comparator_.user_comparator()->timestamp_size();
   for (const auto& scan_opt : *scan_opts) {
     size_t num_blocks = 0;
     // Current scan overlap the last block of the previous scan.
     bool check_overlap = !blocks_to_prepare.empty();
+    InternalKey start_key;
+    if (timestamp_size == 0) {
+      start_key = InternalKey(scan_opt.range.start.value(), kMaxSequenceNumber,
+                              kValueTypeForSeek);
+    } else {
+      std::string seek_key;
+      AppendKeyWithMaxTimestamp(&seek_key, scan_opt.range.start.value(),
+                                timestamp_size);
+      start_key = InternalKey(seek_key, kMaxSequenceNumber, kValueTypeForSeek);
+    }
+
+    index_iter_->Seek(start_key.Encode());
 
     // Scan range is specified in user key, here we seek to the minimum internal
     // key with this user key.
-    InternalKey start_key(scan_opt.range.start.value(), kMaxSequenceNumber,
-                          kValueTypeForSeek);
-    index_iter_->Seek(start_key.Encode());
     while (index_iter_->Valid() &&
            (!scan_opt.range.limit.has_value() ||
             user_comparator_.CompareWithoutTimestamp(

From 29d9798ae86ed358cb2ad64781ab88257dbe8c30 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Sun, 14 Sep 2025 07:38:00 -0700
Subject: [PATCH 278/500] Revamp of parallel compression (#13910)

Summary:
Complete redo of parallel compression in block_based_table_builder.cc to greatly reduce cross-thread hand-off and blocking. A ring buffer of blocks-in-progress is used to essentially bound working memory while enabling high throughput. Unlike before, all threads can participate in compression work, for a kind of work-stealing algorithm that reduces the need for threads to block. This builds on improvements in https://github.com/facebook/rocksdb/pull/13850

Previously, there was either
* parallel_threads==1, the *emit thread* (caller from flush/compaction) doing all the work
* parallel_threads > 1, the emit thread generates uncompressed blocks, `parallel_threads` worker threads compress blocks, and a writer thread writes to the SST file. Total of `parallel_threads + 2` threads participating. (Other bookkeeping in emit and write steps omitted from description for simplicity.)

Now we have either
* parallel_threads==1 (same), the emit thread doing all the work
* parallel_threads > 1, the emit thread generates uncompressed blocks and can take up compression work when the ring buffer is full; `parallel_threads` worker threads have as their top priority to write compressed blocks to the SST file but also take up compression work in priority order of next-to-write. Total of `parallel_threads + 1` threads participating. In some cases, this could result in less throughput than before, but arguably the previous implementation was using more threads than explicitly allowed.

## Future/alternate considerations
Although we could likely have used some framework for micro-work sharing across threads, that could be difficult with the asymmetry of work loads and thread affinity. Specifically, (a) it would be quite challenging to allow emit work in other threads, because it happens in the caller of BlockBasedTableBuilder, (b) async programming is unlikely to pay off until we have an async interface for writing SST files, and (c) this implementation will nevertheless serve as a benchmark for what we lose or gain in such a framework vs. a hand-tuned system.

This implementation still creates and destroys threads for each SST file created. We hope in the future to have more governance and/or pooling of worker threads across various flushes and compactions, but that is not available currently and would require significant design and implementation work.

## More details
* This implementation makes use of semaphores for idling and re-waking threads. `std::counting_semaphore` and `binary_semaphore` offer the best performance (see benchmark results below) but some implementations are known to have correctness bugs. Also, my attempt at upgrading CI for C++20 support (required for these) in https://github.com/facebook/rocksdb/pull/13904 is actually incomplete. Therefore, using these structures is opt-in with `-DROCKSDB_USE_STD_SEMAPHORES` at compile time, and a naive semaphore implementation based on mutex and condvar is used by default. A folly alternative (folly::fibers::Semaphore) was dropped in during development and found to be less efficient than the naive implementation. One CI job is upgraded to test with the new opt-in.
* One of the biggest concerns about correctness/reliability for this implementation is the possibility of hitting a deadlock, in part because that is not well checked in the DB crash test (a challenging problem!). Note also that with the parallel compression improvements in this release, I am calling the feature production-ready, so there is an extra level of confidence needed in the reliability of the feature. Thus, for DEBUG builds including crash test, I have added a watchdog thread to each parallel SST construction that heuristically checks for the most likely kinds of deadlock that could happen, including for the case of buggy semaphore implementations. It periodically verifies that some thread is outside of its "idle" state, and if the watchdog wakes up repeatedly to see all live threads stuck in their idle state (even if wake-up was attempted) then it declares a deadlock. This feature was manually verified for several seeded deadlock bugs. (More details in code comments.)
* For CPU efficiency, this implementation greatly simplifies the logic to estimate the outstanding or "inflight" size not yet written to the SST file. I expect this size to generally be insignificant relative to the full SST file size so is not worth careful engineering. And based on Meta's current needs, landing under-size for an SST file is better than over-size. See comments on `estimated_inflight_size` for details.
* Some other existing atomics in block_based_table_builder.cc modified to use safe atomic wrappers.
* Status handling in BlockBasedTableBuilder was streamlined to get rid of essentially redundant `status`+`io_status` fields and associated code. Made small optimizations to reduce unnecessary IOStatus copies (with StatusOk()) and mark status conditional branches as LIKELY or UNLIKELY.
* Prefer inline field initialization to initialization in constructor.
* Minimize references to the `parallel_threads` configuration parameter for better separation of concerns / sanitization / etc.  For example, use non-nullity of `pc_rep` to indicate that parallel compression is enabled (and active).
* Some other refactoring to aid the new implementation.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13910

Test Plan:
## Correctness
Already integrated into unit tests and crash test. CI updated for opt-in semaphore implementation. Basic semaphore unit tests added/updated.

As for the tremendous simplification of logic relating to hitting target SST file size, as expected, the new behavior could under-shoot the single-threaded behavior by a small number of blocks, which will typically affect the file size by ~1/1000th or less. I think that's a good trade-off for cutting out unnecessarily complex code with non-trivial CPU cost (FileSizeEstimator).
```
./db_bench -db=/dev/shm/dbbench_filesize_after8 -benchmarks=fillseq,compact -num=10000000 -compression_type=zstd -compression_level=8 -compression_parallel_threads=8
```

Before, PT=8 & PT=1, and After PT=1 the same or very similar
```
-rw-r--r-- 1 peterd users 67474097 Sep 12 15:32 000052.sst
-rw-r--r-- 1 peterd users 67474214 Sep 12 15:32 000053.sst
-rw-r--r-- 1 peterd users 67473834 Sep 12 15:32 000054.sst
-rw-r--r-- 1 peterd users 67473437 Sep 12 15:32 000055.sst
-rw-r--r-- 1 peterd users 67473835 Sep 12 15:32 000056.sst
-rw-r--r-- 1 peterd users 67473204 Sep 12 15:33 000057.sst
-rw-r--r-- 1 peterd users 67473294 Sep 12 15:33 000058.sst
-rw-r--r-- 1 peterd users 67473839 Sep 12 15:33 000059.sst
```

After, PT=8 (worst case here ~0.05% smaller)
```
-rw-r--r-- 1 peterd users 67463189 Sep 12 14:55 000052.sst
-rw-r--r-- 1 peterd users 67465233 Sep 12 14:55 000053.sst
-rw-r--r-- 1 peterd users 67466822 Sep 12 14:55 000054.sst
-rw-r--r-- 1 peterd users 67466221 Sep 12 14:55 000055.sst
-rw-r--r-- 1 peterd users 67441675 Sep 12 14:55 000056.sst
-rw-r--r-- 1 peterd users 67467855 Sep 12 14:55 000057.sst
-rw-r--r-- 1 peterd users 67455132 Sep 12 14:55 000058.sst
-rw-r--r-- 1 peterd users 67458334 Sep 12 14:55 000059.sst
```

## Performance, modest load
We are primarily interested in balancing throughput in building SST files and CPU usage in doing so. (For example, we could maximize throughput by having worker threads only spin waiting for work, but that would likely be extra CPU usage we want to avoid to allow other productive CPU work to be scheduled.) No read path code has been touched.

A benchmark script running "before" and "after" configurations at the same time to minimize random machine load effects:
```
$ SUFFIX=`tty | sed 's|/|_|g'`; for CT in none lz4 zstd; do for PT in 1 2 3 4 6 8; do echo -n "$CT pt=$PT -> "; (for I in `seq 1 10`; do BIN=/tmp/dbbench${SUFFIX}.bin; rm -f $BIN; cp db_bench $BIN; /usr/bin/time $BIN -db=/dev/shm/dbbench$SUFFIX --benchmarks=fillseq -num=10000000 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=1000 -fifo_compaction_allow_compaction=0 -disable_wal -write_buffer_size=12000000 -format_version=7 -compression_type=$CT -compression_parallel_threads=$PT 2>&1; done) | awk '/micros.op/ {n++; sum += $5;} /system / { cpu += $1 + $2; } END { print "ops/s: " int(sum/n) " cpu*s: " cpu; }'; done; done
```

Before this change:
```
none pt=1 -> ops/s: 1999603 cpu*s: 72.08
none pt=2 -> ops/s: 1871094 cpu*s: 148.3
none pt=3 -> ops/s: 1882907 cpu*s: 147.7
lz4  pt=1 -> ops/s: 1987858 cpu*s: 94.74
lz4  pt=2 -> ops/s: 1590192 cpu*s: 182.65
lz4  pt=3 -> ops/s: 1896294 cpu*s: 174.7
lz4  pt=4 -> ops/s: 1949174 cpu*s: 172.26
lz4  pt=6 -> ops/s: 1912517 cpu*s: 175.91
lz4  pt=8 -> ops/s: 1930585 cpu*s: 176.71
zstd pt=1 -> ops/s: 1239379 cpu*s: 129.85
zstd pt=2 -> ops/s: 1171742 cpu*s: 226.12
zstd pt=3 -> ops/s: 1832574 cpu*s: 214.21
zstd pt=4 -> ops/s: 1887124 cpu*s: 212.51
zstd pt=6 -> ops/s: 1920936 cpu*s: 211.7
zstd pt=8 -> ops/s: 1885544 cpu*s: 214.87
```

After this change:
```
none pt=1 -> ops/s: 1964361 cpu*s: 72.66
none pt=2 -> ops/s: 1914033 cpu*s: 104.95
none pt=3 -> ops/s: 1978567 cpu*s: 100.24
lz4  pt=1 -> ops/s: 2041703 cpu*s: 92.88
lz4  pt=2 -> ops/s: 1903210 cpu*s: 121.64
lz4  pt=3 -> ops/s: 1973906 cpu*s: 122.22
lz4  pt=4 -> ops/s: 1952605 cpu*s: 123.05
lz4  pt=6 -> ops/s: 1957524 cpu*s: 124.31
lz4  pt=8 -> ops/s: 1986274 cpu*s: 129.06
zstd pt=1 -> ops/s: 1233748 cpu*s: 130.43
zstd pt=2 -> ops/s: 1675226 cpu*s: 158.41
zstd pt=3 -> ops/s: 1929878 cpu*s: 159.77
zstd pt=4 -> ops/s: 1916403 cpu*s: 160.99
zstd pt=6 -> ops/s: 1942526 cpu*s: 166.21
zstd pt=8 -> ops/s: 1966704 cpu*s: 171.56
```

For parallel_threads=1, results are very similar, as expected.

For parallel_threads>1, throughput is usually improved a bit, but cpu consumption is dramatically reduced. For zstd, maximum throughput is essentially achieved with pt=3 rather than the previous roughly pt=4 to 6. And the old used about 30% more CPU.

We can also compare with more expensive compression by raising the compression level.
```
SUFFIX=`tty | sed 's|/|_|g'`; CT=zstd; for CL in 4 6 8; do for PT in 1 4 8; do echo -n "$CT@$CL pt=$PT -> "; (for I in `seq 1 10`; do BIN=/tmp/dbbench${SUFFIX}.bin; rm -f $BIN; cp db_bench $BIN; /usr/bin/time $BIN -db=/dev/shm/dbbench$SUFFIX --benchmarks=fillseq -num=10000000 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=1000 -fifo_compaction_allow_compaction=0 -disable_wal -write_buffer_size=12000000 -format_version=7 -compression_type=$CT -compression_parallel_threads=$PT -compression_level=$CL 2>&1; done) | awk '/micros.op/ {n++; sum += $5;} /system / { cpu += $1 + $2; } END { print "ops/s: " int(sum/n) " cpu*s: " cpu; }'; done; done
```

Before:
```
zstd@4 pt=1 -> ops/s:  883630 cpu*s: 161.12
zstd@4 pt=4 -> ops/s: 1878206 cpu*s: 243.25
zstd@4 pt=8 -> ops/s: 1885002 cpu*s: 245.89
zstd@6 pt=1 -> ops/s:  710767 cpu*s: 189.44
zstd@6 pt=4 -> ops/s: 1706377 cpu*s: 277.29
zstd@6 pt=8 -> ops/s: 1866736 cpu*s: 275.07
zstd@8 pt=1 -> ops/s:  529047 cpu*s: 237.87
zstd@8 pt=4 -> ops/s: 1401379 cpu*s: 330.61
zstd@8 pt=8 -> ops/s: 1895601 cpu*s: 321.59
```

After:
```
zstd@4 pt=1 -> ops/s:  889905 cpu*s: 161.03
zstd@4 pt=4 -> ops/s: 1942240 cpu*s: 193.18
zstd@4 pt=8 -> ops/s: 1922367 cpu*s: 205.21
zstd@6 pt=1 -> ops/s:  713870 cpu*s: 188.91
zstd@6 pt=4 -> ops/s: 1832314 cpu*s: 219.66
zstd@6 pt=8 -> ops/s: 1949631 cpu*s: 229.34
zstd@8 pt=1 -> ops/s:  530324 cpu*s: 238.02
zstd@8 pt=4 -> ops/s: 1479767 cpu*s: 271.65
zstd@8 pt=8 -> ops/s: 1949631 cpu*s: 275.6
```

And we can also look at the cumulative effect of this change and  https://github.com/facebook/rocksdb/pull/13850 that will combine for the parallel compression improvements in the upcoming 10.7 release:

Before both:
```
lz4 pt=1 -> ops/s: 1954445 cpu*s: 95.14
lz4 pt=3 -> ops/s: 1687043 cpu*s: 186.62
lz4 pt=5 -> ops/s: 1708196 cpu*s: 188.33
zstd pt=1 -> ops/s: 1220649 cpu*s: 131.2
zstd pt=3 -> ops/s: 1658100 cpu*s: 227.08
zstd pt=5 -> ops/s: 1685074 cpu*s: 226.08
```

After:
```
lz4 pt=1 -> ops/s: 2048214 cpu*s: 93.24
lz4 pt=3 -> ops/s: 1922049 cpu*s: 122.9
lz4 pt=5 -> ops/s: 1980165 cpu*s: 122.49
zstd pt=1 -> ops/s: 1245165 cpu*s: 128.84
zstd pt=3 -> ops/s: 1956961 cpu*s: 158.73
zstd pt=5 -> ops/s: 1970458 cpu*s: 161.02
```

In summary, before with zstd default level, you could see only
* about 38% increase in throughput for about 73% increase in CPU usage

Now you can get
* about 58% increase in throughput for about 25% increase in CPU usage

## Performance, high load
To validate this for usage on remote compaction workers, we also need to test whether it falls over at high load or anything concerning like that. For this I did a lot of testing with concurrent db_bench and zstd compression_level=8 and parallel_thread (PT) in {1,8} trying to observe "bad" behaviors such as stalls due to preempted threads and such. On a 166 core machine where a "job" is a db_bench process running a fillseq benchmark similar to above in parallel with others, I could summarize the results like this:

10 jobs PT=8 vs. PT=1 -> 12% more CPU usage, 75% reduction in wall time, 1.9 jobs/sec (vs. 0.5)
50 jobs PT=8 vs. PT=1 -> 89% more CPU usage, 27% reduction in wall time, 3.1 jobs/sec (vs. 2.3)
100 jobs PT=8 vs. PT=1 -> 24% more CPU usage, 5% reduction in wall time, 3.25 jobs/sec (vs. 3.1)
150 jobs PT=8 vs. PT=1 -> 4% more CPU usage, 2% increase in wall time, 3.3 jobs/sec (vs. 3.4)
500 jobs PT=8 vs. PT=1 -> 1% more CPU usage, insignificant difference in wall time, 3.3 jobs/sec

Even when there are 4000 threads potentially competing for 166 cores, the throughput (3.3 jobs / sec) is still very close to maximum (3.4). Enabling parallel compression didn't result in notably less throughput (based on wall clock time for all jobs to complete) in any case tested above, and much higher throughput for many cases. If parallel compression causes us to tip from comfortably under-saturating to over-saturating the cores (as in the 50 jobs case), the overall CPU usage can be much higher, presumably due to lower CPU cache hit rates and maybe clock throttling, but parallel compression still has the throughput advantage in those cases.

In other words, what would we stand to gain from being able to intelligently share worker threads between compaction jobs? It doesn't seem that much.

Reviewed By: xingbowang

Differential Revision: D81365623

Pulled By: pdillinger

fbshipit-source-id: 5db5151a959b5d25b84dbe185bc208bd188f2d1c
---
 .github/workflows/pr-jobs.yml                 |    2 +-
 .../block_based/block_based_table_builder.cc  | 1536 +++++++++++------
 table/block_based/block_based_table_builder.h |   43 +-
 tools/db_crashtest.py                         |    2 +-
 .../parallel_compression.md                   |    2 +-
 util/bit_fields.h                             |  331 ++++
 util/semaphore.h                              |  164 ++
 util/slice_test.cc                            |  130 +-
 8 files changed, 1611 insertions(+), 599 deletions(-)
 create mode 100644 util/bit_fields.h
 create mode 100644 util/semaphore.h

diff --git a/.github/workflows/pr-jobs.yml b/.github/workflows/pr-jobs.yml
index 8f8da7b9d724..1ae487a4e1bd 100644
--- a/.github/workflows/pr-jobs.yml
+++ b/.github/workflows/pr-jobs.yml
@@ -323,7 +323,7 @@ jobs:
     steps:
     - uses: actions/checkout@v4.1.0
     - uses: "./.github/actions/pre-steps"
-    - run: ASSERT_STATUS_CHECKED=1 TEST_UINT128_COMPAT=1 ROCKSDB_MODIFY_NPHASH=1 LIB_MODE=static OPT="-DROCKSDB_NAMESPACE=alternative_rocksdb_ns" make V=1 -j24 check
+    - run: ASSERT_STATUS_CHECKED=1 TEST_UINT128_COMPAT=1 ROCKSDB_MODIFY_NPHASH=1 LIB_MODE=static OPT="-DROCKSDB_USE_STD_SEMAPHORES -DROCKSDB_NAMESPACE=alternative_rocksdb_ns" make V=1 -j24 check
     - uses: "./.github/actions/post-steps"
   # ========================= MacOS build only ======================== #
   build-macos:
diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc
index 121d520bbd75..3339be673eb2 100644
--- a/table/block_based/block_based_table_builder.cc
+++ b/table/block_based/block_based_table_builder.cc
@@ -50,11 +50,13 @@
 #include "table/format.h"
 #include "table/meta_blocks.h"
 #include "table/table_builder.h"
+#include "util/bit_fields.h"
 #include "util/coding.h"
 #include "util/compression.h"
+#include "util/defer.h"
+#include "util/semaphore.h"
 #include "util/stop_watch.h"
 #include "util/string_util.h"
-#include "util/work_queue.h"
 
 namespace ROCKSDB_NAMESPACE {
 
@@ -191,269 +193,571 @@ struct BlockBasedTableBuilder::WorkingAreaPair {
   Decompressor::ManagedWorkingArea verify;
 };
 
+// ParallelCompressionRep essentially defines a framework for parallelizing
+// block generation ("emit"), block compression, and block writing to storage.
+// The synchronization is lock-free/wait-free, so thread waiting only happens
+// when work-order dependencies are unsatisfied, though sleeping/idle threads
+// might be kept idle when it seems unlikely they would improve throughput by
+// waking them up (essentially auto-tuned parallelism). But because all threads
+// are capable of 2 out of 3 kinds of work, in a quasi-work-stealing system,
+// running threads can usually expect that compatible work is available.
+//
+// This is currently activated with CompressionOptions::parallel_threads > 1
+// but that is a somewhat crude API that would ideally be adapted along with
+// the implementation in the future to allow threads to serve multiple
+// flush/compaction jobs, though the available improvement might be small.
+// Even within the scope of a single file it might be nice to use a general
+// framework for distributing work across threads, but (a) different threads
+// are limited to which work they can do because of technical challenges, (b)
+// being largely CPU bound on small work units means such a framework would
+// likely have big overheads compared to this hand-optimized solution.
 struct BlockBasedTableBuilder::ParallelCompressionRep {
-  // TODO: consider replacing with autovector or similar
-  // Keys is a wrapper of vector of strings avoiding
-  // releasing string memories during vector clear()
-  // in order to save memory allocation overhead
-  class Keys {
-   public:
-    Keys() : keys_(kKeysInitSize), size_(0) {}
-    void PushBack(const Slice& key) {
-      if (size_ == keys_.size()) {
-        keys_.emplace_back(key.data(), key.size());
-      } else {
-        keys_[size_].assign(key.data(), key.size());
-      }
-      size_++;
-    }
-    void SwapAssign(std::vector<std::string>& keys) {
-      size_ = keys.size();
-      std::swap(keys_, keys);
-    }
-    void Clear() { size_ = 0; }
-    size_t Size() { return size_; }
-    std::string& Back() { return keys_[size_ - 1]; }
-    std::string& operator[](size_t idx) {
-      assert(idx < size_);
-      return keys_[idx];
-    }
-
-   private:
-    static constexpr size_t kKeysInitSize = 32;
-    std::vector<std::string> keys_;
-    size_t size_;
+  // The framework has two kinds of threads: the calling thread from
+  // flush/compaction/SstFileWriter is called the "emit thread" (kEmitter).
+  // Other threads cannot generally take over "emit" work because that is
+  // largely happening up the call stack from BlockBasedTableBuilder.
+  // The emit thread can also take on compression work in a quasi-work-stealing
+  // manner when the buffer for emitting new blocks is full.
+  //
+  // When parallelism is enabled, there are also "worker" threads that
+  // can handle compressing blocks and (one worker thread at a time) write them
+  // to the SST file (and handle other single-threaded wrap-up of each block).
+  //
+  // NOTE: when parallelism is enabled, the emit thread is not permitted to
+  // write to the SST file because that is the potential "output" bottleneck,
+  // and it's generally bad for parallelism to allow the only thread that can
+  // serve the "input" bottleneck (emit work) to also spend exclusive time on
+  // the output bottleneck.
+  enum class ThreadKind {
+    kEmitter,
+    kWorker,
   };
 
-  struct BlockRep;
-
-  // Use BlockRepSlot to keep block order in write thread.
-  // slot_ will pass references to BlockRep
-  class BlockRepSlot {
-   public:
-    BlockRepSlot() : slot_(1) {}
-    template <typename T>
-    void Fill(T&& rep) {
-      slot_.push(std::forward<T>(rep));
-    }
-    void Take(BlockRep*& rep) { slot_.pop(rep); }
-
-   private:
-    // slot_ will pass references to BlockRep in block_rep_buf,
-    // and those references are always valid before the destruction of
-    // block_rep_buf.
-    WorkQueue<BlockRep*> slot_;
+  // ThreadState allows each thread to track its work assignment. In addition to
+  // the cases already mentioned, kEmitting, kCompressing, and kWriting to the
+  // SST file writer,
+  // * Threads can enter the kIdle state so that they can sleep when no work is
+  // available for them, to be woken up when appropriate.
+  // * The kEnd state means the thread is not doing any more work items, which
+  // for worker threads means they will end soon.
+  // * The kCompressingAndWriting state means a worker can compress and write a
+  // block without additional state updates because the same block to be
+  // compressed is the next to be written.
+  enum class ThreadState {
+    /* BEGIN Emitter only states */
+    kEmitting,
+    /* END Emitter only states */
+    /* BEGIN states for emitter and worker */
+    kIdle,
+    kCompressing,
+    kEnd,
+    /* END states for emitter and worker */
+    /* BEGIN Worker only states */
+    kCompressingAndWriting,
+    kWriting,
+    /* END Worker only states */
   };
 
-  // BlockRep instances are fetched from and recycled to
-  // block_rep_pool during parallel compression.
+  // BlockRep instances are used and reused in a ring buffer (below), so that
+  // many blocks can be in an intermediate state between serialized into
+  // uncompressed bytes and written to the SST file. Notably, each block is
+  // "emitted" in uncompressed form into a BlockRep, compressed (at least
+  // attempted, when configured) for updated BlockRep, and then written from the
+  // BlockRep to the writer for the SST file bytes.
   struct ALIGN_AS(CACHE_LINE_SIZE) BlockRep {
     // Uncompressed block contents
     std::string uncompressed;
     GrowableBuffer compressed;
     CompressionType compression_type = kNoCompression;
     std::unique_ptr<IndexBuilder::PreparedIndexEntry> prepared_index_entry;
-    BlockRepSlot slot;
-    Status status;
   };
 
-  // Use a vector of BlockRep as a buffer for a determined number
-  // of BlockRep structures. All data referenced by pointers in
-  // BlockRep will be freed when this vector is destructed.
-  using BlockRepBuffer = std::vector<BlockRep>;
-  BlockRepBuffer block_rep_buf;
-  // Use a thread-safe queue for concurrent access from block
-  // building thread and writer thread.
-  using BlockRepPool = WorkQueue<BlockRep*>;
-  BlockRepPool block_rep_pool;
-
-  // Compression queue will pass references to BlockRep in block_rep_buf,
-  // and those references are always valid before the destruction of
-  // block_rep_buf.
-  using CompressQueue = WorkQueue<BlockRep*>;
-  CompressQueue compress_queue;
-  std::vector<port::Thread> compress_thread_pool;
-
-  // Write queue will pass references to BlockRep::slot in block_rep_buf,
-  // and those references are always valid before the corresponding
-  // BlockRep::slot is destructed, which is before the destruction of
-  // block_rep_buf.
-  using WriteQueue = WorkQueue<BlockRepSlot*>;
-  WriteQueue write_queue;
-  std::unique_ptr<port::Thread> write_thread;
-
-  // Estimate output file size when parallel compression is enabled. This is
-  // necessary because compression & flush are no longer synchronized,
-  // and BlockBasedTableBuilder::FileSize() is no longer accurate.
-  // memory_order_relaxed suffices because accurate statistics is not required.
-  class FileSizeEstimator {
-   public:
-    explicit FileSizeEstimator()
-        : uncomp_bytes_compressed(0),
-          uncomp_bytes_curr_block(0),
-          uncomp_bytes_curr_block_set(false),
-          uncomp_bytes_inflight(0),
-          blocks_inflight(0),
-          curr_compression_ratio(0),
-          estimated_file_size(0) {}
-
-    // Estimate file size when a block is about to be emitted to
-    // compression thread
-    void EmitBlock(uint64_t uncomp_block_size, uint64_t curr_file_size) {
-      uint64_t new_uncomp_bytes_inflight =
-          uncomp_bytes_inflight.fetch_add(uncomp_block_size,
-                                          std::memory_order_relaxed) +
-          uncomp_block_size;
-
-      uint64_t new_blocks_inflight =
-          blocks_inflight.fetch_add(1, std::memory_order_relaxed) + 1;
-
-      estimated_file_size.store(
-          curr_file_size +
-              static_cast<uint64_t>(
-                  static_cast<double>(new_uncomp_bytes_inflight) *
-                  curr_compression_ratio.load(std::memory_order_relaxed)) +
-              new_blocks_inflight * kBlockTrailerSize,
-          std::memory_order_relaxed);
-    }
-
-    // Estimate file size when a block is already reaped from
-    // compression thread
-    void ReapBlock(uint64_t compressed_block_size, uint64_t curr_file_size) {
-      assert(uncomp_bytes_curr_block_set);
-
-      uint64_t new_uncomp_bytes_compressed =
-          uncomp_bytes_compressed + uncomp_bytes_curr_block;
-      assert(new_uncomp_bytes_compressed > 0);
-
-      curr_compression_ratio.store(
-          (curr_compression_ratio.load(std::memory_order_relaxed) *
-               uncomp_bytes_compressed +
-           compressed_block_size) /
-              static_cast<double>(new_uncomp_bytes_compressed),
-          std::memory_order_relaxed);
-      uncomp_bytes_compressed = new_uncomp_bytes_compressed;
-
-      uint64_t new_uncomp_bytes_inflight =
-          uncomp_bytes_inflight.fetch_sub(uncomp_bytes_curr_block,
-                                          std::memory_order_relaxed) -
-          uncomp_bytes_curr_block;
-
-      uint64_t new_blocks_inflight =
-          blocks_inflight.fetch_sub(1, std::memory_order_relaxed) - 1;
-
-      estimated_file_size.store(
-          curr_file_size +
-              static_cast<uint64_t>(
-                  static_cast<double>(new_uncomp_bytes_inflight) *
-                  curr_compression_ratio.load(std::memory_order_relaxed)) +
-              new_blocks_inflight * kBlockTrailerSize,
-          std::memory_order_relaxed);
-
-      uncomp_bytes_curr_block_set = false;
-    }
-
-    void SetEstimatedFileSize(uint64_t size) {
-      estimated_file_size.store(size, std::memory_order_relaxed);
-    }
-
-    uint64_t GetEstimatedFileSize() {
-      return estimated_file_size.load(std::memory_order_relaxed);
-    }
-
-    void SetCurrBlockUncompSize(uint64_t size) {
-      uncomp_bytes_curr_block = size;
-      uncomp_bytes_curr_block_set = true;
-    }
-
-   private:
-    // Input bytes compressed so far.
-    uint64_t uncomp_bytes_compressed;
-    // Size of current block being appended.
-    uint64_t uncomp_bytes_curr_block;
-    // Whether uncomp_bytes_curr_block has been set for next
-    // ReapBlock call.
-    bool uncomp_bytes_curr_block_set;
-    // Input bytes under compression and not appended yet.
-    std::atomic<uint64_t> uncomp_bytes_inflight;
-    // Number of blocks under compression and not appended yet.
-    std::atomic<uint64_t> blocks_inflight;
-    // Current compression ratio, maintained by BGWorkWriteMaybeCompressedBlock.
-    std::atomic<double> curr_compression_ratio;
-    // Estimated SST file size.
-    std::atomic<uint64_t> estimated_file_size;
-  };
-  FileSizeEstimator file_size_estimator;
+  // Ring buffer of emitted blocks that may or may not yet be compressed.
+  std::unique_ptr<BlockRep[]> ring_buffer;
+  // log_2(ring buffer size), where ring buffer size must be a power of two
+  const int ring_buffer_nbits;
+  // ring buffer size - 1, to function as a bit mask for ring buffer positions
+  // (e.g. given the ordinal number of a block)
+  const uint32_t ring_buffer_mask;
+  // Number of threads in worker_threads. (Emit thread doesn't count)
+  const uint32_t num_worker_threads;
+
+  // Rough upper bound on the sst file size contribution from blocks emitted
+  // into the parallel compression ring buffer but not yet written. Tracks
+  // uncompressed size, with trailer, until a block is compressed, then
+  // compressed size until the block is written. (TODO: does not currently
+  // account for block_align)
+  RelaxedAtomic<uint64_t> estimated_inflight_size{0};
+  // Thread objects for worker threads
+  std::vector<port::Thread> worker_threads;
+  // Working areas for data_block_compressor for each worker thread
+  std::vector<WorkingAreaPair> working_areas;
+
+  // Semaphores for threads to sleep when there's no available work for them
+  // and to wake back up when someone determines there is available work (most
+  // likely). Split between worker threads and emit thread because they can do
+  // different kinds of work.
+  CountingSemaphore idle_worker_sem{0};
+  BinarySemaphore idle_emit_sem{0};
+
+  // Primary atomic state of parallel compression, which includes a number of
+  // state fields that are best updated atomically to avoid locking and/or to
+  // simplify the interesting interleavings that have to be considered and
+  // accommodated.
+  struct StateID {};
+  struct State : public BitFields<uint64_t, StateID> {};
+  ALIGN_AS(CACHE_LINE_SIZE) AcqRelBitFieldsAtomic<State> atomic_state;
+
+  // The first field is a bit for each ring buffer slot (max 32) for whether
+  // that slot is ready to be claimed for writing by a worker thread. Because
+  // compressions might finish out-of-order, we need to track individually
+  // whether they are finished, though this field doesn't differentiate
+  // "compression completed" from "compression not started" because that can be
+  // inferred from NextToCompress. A block might not enter this state, because
+  // the same thread that compresses it can also immediately write the block if
+  // it notices that the block is next to write.
+  using NeedsWriter = UnsignedBitField<State, 32, NoPrevBitField>;
+  // Track how many worker threads are in an idle state because there was no
+  // available work and haven't been selected to wake back up.
+  using IdleWorkerCount = UnsignedBitField<State, 5, NeedsWriter>;
+  // Track whether the emit thread is an idle state because there was no
+  // available work and hasn't been triggered to wake back up. The nature of
+  // available work and atomic CAS assignment of work ensures at least one
+  // thread is kept out of the idle state.
+  using IdleEmitFlag = BoolBitField<State, IdleWorkerCount>;
+  // Track whether threads should end when they finish available work because no
+  // more blocks will be emitted.
+  using NoMoreToEmitFlag = BoolBitField<State, IdleEmitFlag>;
+  // Track whether threads should abort ASAP because of an error.
+  using AbortFlag = BoolBitField<State, NoMoreToEmitFlag>;
+  // Track three "NextTo" counters for the positions of the next block to write,
+  // to start compression, and to emit into the ring buffer. If these counters
+  // never overflowed / wrapped around, we would have next_to_write <=
+  // next_to_compress <= next_to_emit because a block must be emitted before
+  // compressed, and compressed (at least attempted) before writing. We need to
+  // track more than ring_buffer_nbits of these counters to be able to
+  // distinguish an empty ring buffer (next_to_write == next_to_emit) from a
+  // full ring buffer (next_to_write != next_to_emit but equal under
+  // ring_buffer_mask).
+  using NextToWrite = UnsignedBitField<State, 8, AbortFlag>;
+  using NextToCompress = UnsignedBitField<State, 8, NextToWrite>;
+  using NextToEmit = UnsignedBitField<State, 8, NextToCompress>;
+  static_assert(NextToEmit::kEndBit == 64);
+
+  // BEGIN fields for use by the emit thread only. These can't live on the stack
+  // because the emit thread frequently returns out of BlockBasedTableBuilder.
+  ALIGN_AS(CACHE_LINE_SIZE)
+  ThreadState emit_thread_state = ThreadState::kEmitting;
+  // Ring buffer index that emit thread is operating on (for emitting and
+  // compressing states)
+  uint32_t emit_slot = 0;
+  // Including some data to inform when to wake up idle worker threads (see
+  // implementation for details)
+  int32_t emit_counter_toward_wake_up = 0;
+  int32_t emit_counter_for_wake_up = 0;
+  static constexpr int32_t kMaxWakeupInterval = 8;
+  // END fields for use by the emit thread only
 
-  // Facilities used for waiting first block completion. Need to Wait for
-  // the completion of first block compression and flush to get a non-zero
-  // compression ratio.
-  std::atomic<bool> first_block_processed;
-  std::condition_variable first_block_cond;
-  std::mutex first_block_mutex;
+#ifndef NDEBUG
+  // These are for an extra "watchdog" thread in DEBUG builds that heuristically
+  // checks for the most likely deadlock conditions. False positives and false
+  // negatives are technically possible.
+  std::thread watchdog_thread;
+  std::mutex watchdog_mutex;
+  std::condition_variable watchdog_cv;
+  bool shutdown_watchdog = false;
+  RelaxedAtomic<uint32_t> live_workers{0};
+  RelaxedAtomic<uint32_t> idling_workers{0};
+  RelaxedAtomic<bool> live_emit{0};
+  RelaxedAtomic<bool> idling_emit{0};
+#endif  // !NDEBUG
 
-  explicit ParallelCompressionRep(uint32_t parallel_threads)
-      : block_rep_buf(parallel_threads),
-        block_rep_pool(parallel_threads),
-        compress_queue(parallel_threads),
-        write_queue(parallel_threads),
-        first_block_processed(false) {
-    for (uint32_t i = 0; i < parallel_threads; i++) {
-      // Prime the queue of available BlockReps
-      block_rep_pool.push(&block_rep_buf[i]);
+  int ComputeRingBufferNbits(uint32_t parallel_threads) {
+    // Ring buffer size is a power of two not to exceed 32 but otherwise
+    // at least twice the number of threads.
+    if (parallel_threads >= 9) {
+      return 5;
+    } else if (parallel_threads >= 5) {
+      return 4;
+    } else if (parallel_threads >= 3) {
+      return 3;
+    } else {
+      assert(parallel_threads > 1);
+      return 2;
     }
   }
 
+  explicit ParallelCompressionRep(uint32_t parallel_threads)
+      : ring_buffer_nbits(ComputeRingBufferNbits(parallel_threads)),
+        ring_buffer_mask((uint32_t{1} << ring_buffer_nbits) - 1),
+        num_worker_threads(std::min(parallel_threads, ring_buffer_mask)) {
+    assert(num_worker_threads <= IdleWorkerCount::kMask);
+
+    ring_buffer = std::make_unique<BlockRep[]>(ring_buffer_mask + 1);
+
+    // Start by aggressively waking up idle workers
+    emit_counter_for_wake_up = -static_cast<int32_t>(num_worker_threads);
+  }
+
   ~ParallelCompressionRep() {
-    block_rep_pool.finish();
 #ifndef NDEBUG
-    // Silence ASSERT_STATUS_CHECKED warnings
-    for (auto& block_rep : block_rep_buf) {
-      assert(block_rep.status.ok());
+    auto state = atomic_state.Load();
+    if (state.Get<AbortFlag>() == false) {
+      // Should be clear / cancelled out with normal shutdown
+      assert(state.Get<NeedsWriter>() == 0);
+
+      // Ring buffer reached empty state
+      assert(state.Get<NextToWrite>() == state.Get<NextToCompress>());
+      assert(state.Get<NextToCompress>() == state.Get<NextToEmit>());
+
+      // Everything cancels out in inflight size
+      assert(estimated_inflight_size.LoadRelaxed() == 0);
     }
+    // All idling metadata cleaned up, properly tracked
+    assert(state.Get<IdleWorkerCount>() == 0);
+    assert(state.Get<IdleEmitFlag>() == false);
+
+    // No excess in semaphores
+    assert(!idle_emit_sem.TryAcquire());
+    assert(!idle_worker_sem.TryAcquire());
+#endif  // !NDEBUG
+  }
+
+  // The primary function for a thread transitioning from one state or work
+  // assignment to the next. `slot` refers to a position in the ring buffer
+  // for assigned emit, compression, or write work.
+  //
+  // Because both the emit thread and worker threads can work on compression,
+  // this is a quasi-work-stealing parallel algorithm. (Enabling other threads
+  // to do emit work would be quite challenging, and allowing the emit thread
+  // to handle writes could create a bottle-neck.)
+  //
+  // This function is basically a CAS loop trying to pick the next piece of work
+  // for this thread and retrying if CAS fails. This function also handles
+  // thread idling when that's the appropriate assignment, continuing the loop
+  // looking for productive work when woken from an idle state.
+  //
+  // Precondition: thread_state is appropriate for thread_kind and not kEnd. It
+  // must match the previously returned state for that thread, and is only kIdle
+  // for the thread on startup (though the kIdle state is used internal to the
+  // function).
+  //
+  // Postcondition: thread_state is appropriate for thread_kind and not kIdle.
+  // Except for kEnd state, the calling thread has exclusive access to
+  // ring_buffer[slot] until next StateTransition().
+  template <ThreadKind thread_kind>
+  void StateTransition(
+      /*in/out*/ ThreadState& thread_state,
+      /*in/out*/ uint32_t& slot) {
+    assert(slot <= ring_buffer_mask);
+    // Last known value for atomic_state
+    State seen_state = atomic_state.Load();
+
+    for (;;) {
+      if (seen_state.Get<AbortFlag>()) {
+        thread_state = ThreadState::kEnd;
+        return;
+      }
+
+      assert(static_cast<uint8_t>(seen_state.Get<NextToEmit>() -
+                                  seen_state.Get<NextToCompress>()) <=
+             ring_buffer_mask + 1);
+      assert(static_cast<uint8_t>(seen_state.Get<NextToCompress>() -
+                                  seen_state.Get<NextToWrite>()) <=
+             ring_buffer_mask + 1);
+      assert(static_cast<uint8_t>(seen_state.Get<NextToEmit>() -
+                                  seen_state.Get<NextToWrite>()) <=
+             ring_buffer_mask + 1);
+
+      // Draft of the next proposed atomic_state. Start by marking completion of
+      // the current thread's last work.
+      State next_state = seen_state;
+      bool wake_idle = false;
+      switch (thread_state) {
+        case ThreadState::kEmitting: {
+          assert(thread_kind == ThreadKind::kEmitter);
+          assert(slot == (next_state.Get<NextToEmit>() & ring_buffer_mask));
+          next_state.Ref<NextToEmit>() += 1;
+          // Check whether to wake up idle worker thread
+          if (next_state.Get<IdleWorkerCount>() > 0 &&
+              // The number of blocks for which compression hasn't started
+              // is well over the number of active threads.
+              static_cast<uint8_t>(next_state.Get<NextToEmit>() -
+                                   next_state.Get<NextToCompress>()) >=
+                  (ring_buffer_mask + 1) / 4 +
+                      (num_worker_threads -
+                       next_state.Get<IdleWorkerCount>())) {
+            // At first, emit_counter_for_wake_up is negative to aggressively
+            // wake up idle worker threads. Then it backs off the interval at
+            // which we wake up, up to some maximum that attempts to balance
+            // maximum throughput and minimum CPU overhead.
+            if (emit_counter_toward_wake_up >= emit_counter_for_wake_up) {
+              // We reached a threshold to justify a wake-up.
+              wake_idle = true;
+              // Adjust idle count assuming we are going to own waking it up,
+              // so no one else can duplicate that. (The idle count is really
+              // the number idling for which no one yet owns waking them up.)
+              next_state.Ref<IdleWorkerCount>() -= 1;
+              // Reset the counter toward the threshold for wake-up
+              emit_counter_toward_wake_up = 0;
+              // Raise the threshold (up to some limit) to stabilize the number
+              // of active threads after some ramp-up period.
+              emit_counter_for_wake_up =
+                  std::min(emit_counter_for_wake_up + 1,
+                           static_cast<int32_t>(num_worker_threads +
+                                                kMaxWakeupInterval));
+            } else {
+              // Advance closer to the threshold for justifying a wake-up
+              emit_counter_toward_wake_up++;
+            }
+          }
+          break;
+        }
+        case ThreadState::kIdle:
+          // NOTE: thread that signalled to wake up already updated idle count
+          // or marker. This is required to avoid overflow on the semaphore,
+          // especially the binary semaphore for idle_emit_sem, and likely
+          // desirable to avoid spurious/extra Release().
+          break;
+        case ThreadState::kCompressing:
+          next_state.Ref<NeedsWriter>() |= uint32_t{1} << slot;
+          if constexpr (thread_kind == ThreadKind::kEmitter) {
+            if (next_state.Get<IdleWorkerCount>() == num_worker_threads) {
+              // Work is available for a worker thread and none are running
+              wake_idle = true;
+              // Adjust idle count assuming we are going to own waking it up
+              next_state.Ref<IdleWorkerCount>() -= 1;
+            }
+          }
+          break;
+        case ThreadState::kEnd:
+          // Should have already recognized the end state
+          assert(thread_state != ThreadState::kEnd);
+          return;
+        case ThreadState::kCompressingAndWriting:
+        case ThreadState::kWriting:
+          assert(thread_kind == ThreadKind::kWorker);
+          assert((next_state.Get<NextToWrite>() & ring_buffer_mask) == slot);
+          assert(next_state.Get<NextToCompress>() !=
+                 next_state.Get<NextToWrite>());
+          assert(next_state.Get<NextToEmit>() != next_state.Get<NextToWrite>());
+          assert((next_state.Get<NeedsWriter>() & (uint32_t{1} << slot)) == 0);
+          next_state.Ref<NextToWrite>() += 1;
+          if (next_state.Get<IdleEmitFlag>()) {
+            wake_idle = true;
+            // Clear idle emit flag assuming we are going to own waking it up
+            next_state.Set<IdleEmitFlag>(false);
+          }
+          break;
+      }
+
+      // Find the next state, depending on the kind of thread
+      ThreadState next_thread_state = ThreadState::kEnd;
+      uint32_t next_slot = 0;
+      if constexpr (thread_kind == ThreadKind::kEmitter) {
+        // First priority is emitting more uncompressed blocks, if there's
+        // room in the ring buffer.
+        if (static_cast<uint8_t>(next_state.Get<NextToEmit>() -
+                                 next_state.Get<NextToWrite>()) <=
+            ring_buffer_mask) {
+          // There is room
+          next_thread_state = ThreadState::kEmitting;
+          next_slot = next_state.Get<NextToEmit>() & ring_buffer_mask;
+        }
+      }
+      if constexpr (thread_kind == ThreadKind::kWorker) {
+        // First priority is writing next block to write, if it needs a writer
+        // assigned to it
+        uint32_t next_to_write_slot =
+            next_state.Get<NextToWrite>() & ring_buffer_mask;
+        uint32_t needs_writer_bit = uint32_t{1} << next_to_write_slot;
+        if (next_state.Get<NeedsWriter>() & needs_writer_bit) {
+          // Clear the "needs writer" marker on the slot
+          next_state.Ref<NeedsWriter>() &= ~needs_writer_bit;
+          // Take ownership of writing it
+          next_thread_state = ThreadState::kWriting;
+          next_slot = next_to_write_slot;
+        }
+      }
+
+      // If didn't find higher priority work
+      if (next_thread_state == ThreadState::kEnd) {
+        if (next_state.Get<NextToCompress>() != seen_state.Get<NextToEmit>()) {
+          // Compression work is available, select that
+          if (thread_kind == ThreadKind::kWorker &&
+              next_state.Get<NextToCompress>() ==
+                  next_state.Get<NextToWrite>()) {
+            next_thread_state = ThreadState::kCompressingAndWriting;
+          } else {
+            next_thread_state = ThreadState::kCompressing;
+          }
+          next_slot = next_state.Get<NextToCompress>() & ring_buffer_mask;
+          next_state.Ref<NextToCompress>() += 1;
+        } else if constexpr (thread_kind == ThreadKind::kEmitter) {
+          // Emitter thread goes idle
+          next_thread_state = ThreadState::kIdle;
+          assert(next_state.Get<IdleEmitFlag>() == false);
+          assert(next_state.Get<NoMoreToEmitFlag>() == false);
+          next_state.Set<IdleEmitFlag>(true);
+        } else if (next_state.Get<NoMoreToEmitFlag>()) {
+          // Worker thread shall not idle if we are done emitting. At least
+          // one worker will remain unblocked to finish writing
+          next_thread_state = ThreadState::kEnd;
+        } else {
+          // Worker thread goes idle
+          next_thread_state = ThreadState::kIdle;
+          assert(next_state.Get<IdleWorkerCount>() < IdleWorkerCount::kMask);
+          next_state.Ref<IdleWorkerCount>() += 1;
+        }
+      }
+      assert(thread_state != ThreadState::kEnd);
+
+      // Attempt to atomically apply the desired/computed state transition
+      if (atomic_state.CasWeak(seen_state, next_state)) {
+        // Success
+        thread_state = next_thread_state;
+        slot = next_slot;
+        seen_state = next_state;
+        if (wake_idle) {
+          if constexpr (thread_kind == ThreadKind::kEmitter) {
+            idle_worker_sem.Release();
+          } else {
+            idle_emit_sem.Release();
+          }
+        }
+        if (thread_state != ThreadState::kIdle) {
+          // Successfully transitioned to another useful state
+          return;
+        }
+        // Handle idle state
+        if constexpr (thread_kind == ThreadKind::kEmitter) {
+#ifndef NDEBUG
+          // Tracking for watchdog
+          idling_emit.StoreRelaxed(true);
+          Defer decr{[this]() { idling_emit.StoreRelaxed(false); }};
+#endif
+          idle_emit_sem.Acquire();  // Likely block
+        } else {
+#ifndef NDEBUG
+          // Tracking for watchdog
+          idling_workers.FetchAddRelaxed(1);
+          Defer decr{[this]() { idling_workers.FetchSubRelaxed(1); }};
 #endif
+          idle_worker_sem.Acquire();  // Likely block
+        }
+        // Update state after sleep
+        seen_state = atomic_state.Load();
+      }
+      // else loop and try again
+    }
   }
 
-  BlockRep* PopRecycledBlockRep() {
-    BlockRep* block_rep = nullptr;
-    block_rep_pool.pop(block_rep);
-    assert(block_rep != nullptr);
+  void EmitterStateTransition(
+      /*in/out*/ ThreadState& thread_state,
+      /*in/out*/ uint32_t& slot) {
+    StateTransition<ThreadKind::kEmitter>(thread_state, slot);
+  }
 
-    block_rep->compression_type = kNoCompression;
-    return block_rep;
+  void WorkerStateTransition(
+      /*in/out*/ ThreadState& thread_state,
+      /*in/out*/ uint32_t& slot) {
+    StateTransition<ThreadKind::kWorker>(thread_state, slot);
   }
 
-  // Emit a block to compression thread
-  void EmitBlock(BlockRep* block_rep) {
-    assert(block_rep != nullptr);
-    assert(block_rep->status.ok());
-    if (!write_queue.push(&block_rep->slot)) {
-      return;
-    }
-    if (!compress_queue.push(block_rep)) {
-      return;
+  // Exactly wake all idling threads (for an end state)
+  void WakeAllIdle() {
+    State old_state, new_state;
+    auto transform =
+        IdleEmitFlag::ClearTransform() + IdleWorkerCount::ClearTransform();
+    atomic_state.Apply(transform, &old_state, &new_state);
+    assert(new_state.Get<IdleEmitFlag>() == false);
+    assert(new_state.Get<IdleWorkerCount>() == 0);
+    if (old_state.Get<IdleEmitFlag>()) {
+      idle_emit_sem.Release();
     }
+    idle_worker_sem.Release(old_state.Get<IdleWorkerCount>());
+  }
 
-    if (!first_block_processed.load(std::memory_order_relaxed)) {
-      std::unique_lock<std::mutex> lock(first_block_mutex);
-      first_block_cond.wait(lock, [this] {
-        return first_block_processed.load(std::memory_order_relaxed);
-      });
+  // Called by emit thread if it is decided no more blocks will be emitted into
+  // this SST file.
+  void SetNoMoreToEmit(/*in/out*/ ThreadState& thread_state,
+                       /*in/out*/ uint32_t& slot) {
+    (void)slot;
+    State old_state;
+    atomic_state.Apply(NoMoreToEmitFlag::SetTransform(), &old_state);
+    assert(old_state.Get<NoMoreToEmitFlag>() == false);
+    assert(slot == BitwiseAnd(old_state.Get<NextToEmit>(), ring_buffer_mask));
+    assert(thread_state == ThreadState::kEmitting);
+    thread_state = ThreadState::kEnd;
+    WakeAllIdle();
+  }
+
+  // Called by any thread to abort parallel compression, etc. because of an
+  // error.
+  void SetAbort(/*in/out*/ ThreadState& thread_state) {
+    State old_state;
+    atomic_state.Apply(AbortFlag::SetTransform(), &old_state);
+    if (old_state.Get<AbortFlag>() == false) {
+      // First to set abort. Wake all workers and emitter
+      WakeAllIdle();
     }
+    thread_state = ThreadState::kEnd;
   }
 
-  // Reap a block from compression thread
-  void ReapBlock(BlockRep* block_rep) {
-    assert(block_rep != nullptr);
-    block_rep->compressed.ResetForSize(0);
-    block_rep_pool.push(block_rep);
+#ifndef NDEBUG
+  // Logic for the extra "watchdog" thread in DEBUG builds that heuristically
+  // checks for the most likely deadlock conditions.
+  //
+  // Some ways to manually validate the watchdog:
+  // * Insert
+  //      if (Random::GetTLSInstance()->OneIn(100)) {
+  //        sleep(100);
+  //      }
+  //   after either of the calls to semaphore Acquire above.
+  // * Miss some Release()s in WakeAllIdle()
+  //
+  // and run table_test unit tests.
+  void BGWatchdog() {
+    int count_toward_deadlock_judgment = 0;
+    for (;;) {
+      // Check for termination condition: All workers and emit thread have
+      // completed.
+      if (live_workers.LoadRelaxed() == 0 && live_emit.LoadRelaxed() == false) {
+        return;
+      }
+
+      // Check for potential deadlock condition
+      if (idling_workers.LoadRelaxed() < live_workers.LoadRelaxed() ||
+          (live_emit.LoadRelaxed() && !idling_emit.LoadRelaxed())) {
+        // Someone is working, all good
+        count_toward_deadlock_judgment = 0;
+      } else {
+        // Could be a deadlock state, but could also be a transient
+        // state where someone has woken up but not cleared their idling flag.
+        // Give it plenty of time and watchdog thread wake-ups before
+        // declaring deadlock.
+        count_toward_deadlock_judgment++;
+        if (count_toward_deadlock_judgment >= 70) {
+          fprintf(stderr,
+                  "Error: apparent deadlock in parallel compression. "
+                  "Aborting. %u / %u, %d / %d, %llx\n",
+                  (unsigned)idling_workers.LoadRelaxed(),
+                  (unsigned)live_workers.LoadRelaxed(),
+                  (int)idling_emit.LoadRelaxed(), (int)live_emit.LoadRelaxed(),
+                  (long long)atomic_state.Load().underlying);
+          std::terminate();
+        }
+      }
 
-    if (!first_block_processed.load(std::memory_order_relaxed)) {
-      std::lock_guard<std::mutex> lock(first_block_mutex);
-      first_block_processed.store(true, std::memory_order_relaxed);
-      first_block_cond.notify_one();
+      // Sleep for 1s at a time unless we are woken up because other threads
+      // ended.
+      std::unique_lock<std::mutex> lock(watchdog_mutex);
+      if (!shutdown_watchdog) {
+        watchdog_cv.wait_for(lock, std::chrono::seconds{1});
+      }
     }
   }
+#endif  // !NDEBUG
 };
 
 struct BlockBasedTableBuilder::Rep {
@@ -479,7 +783,9 @@ struct BlockBasedTableBuilder::Rep {
   // user key should contain the minimum timestamp.
   bool persist_user_defined_timestamps;
   WritableFileWriter* file;
-  std::atomic<uint64_t> offset;
+  // The current offset is only written by the current designated writer thread
+  // but can be read by other threads to estimate current file size
+  RelaxedAtomic<uint64_t> offset{0};
   size_t alignment;
   BlockBuilder data_block;
   // Buffers uncompressed data blocks to replay later. Needed when
@@ -498,11 +804,11 @@ struct BlockBasedTableBuilder::Rep {
   bool uses_explicit_compression_manager = false;
 
   uint64_t sample_for_compression;
-  std::atomic<uint64_t> compressible_input_data_bytes;
-  std::atomic<uint64_t> uncompressible_input_data_bytes;
-  std::atomic<uint64_t> sampled_input_data_bytes;
-  std::atomic<uint64_t> sampled_output_slow_data_bytes;
-  std::atomic<uint64_t> sampled_output_fast_data_bytes;
+  RelaxedAtomic<uint64_t> compressible_input_data_bytes{0};
+  RelaxedAtomic<uint64_t> uncompressible_input_data_bytes{0};
+  RelaxedAtomic<uint64_t> sampled_input_data_bytes{0};
+  RelaxedAtomic<uint64_t> sampled_output_slow_data_bytes{0};
+  RelaxedAtomic<uint64_t> sampled_output_fast_data_bytes{0};
   uint32_t compression_parallel_threads;
   int max_compressed_bytes_per_kb;
   size_t max_dict_sample_bytes = 0;
@@ -539,9 +845,8 @@ struct BlockBasedTableBuilder::Rep {
 
   // Working area for basic_compressor when compression_parallel_threads==1
   WorkingAreaPair basic_working_area;
-  // Working areas for data_block_compressor, for each of
-  // compression_parallel_threads
-  std::vector<WorkingAreaPair> data_block_working_areas;
+  // Working area for data_block_compressor, for emit/compaction thread
+  WorkingAreaPair data_block_working_area;
 
   size_t data_begin_offset = 0;
 
@@ -602,75 +907,85 @@ struct BlockBasedTableBuilder::Rep {
   // See class Footer
   uint32_t base_context_checksum;
 
-  uint64_t get_offset() { return offset.load(std::memory_order_relaxed); }
-  void set_offset(uint64_t o) { offset.store(o, std::memory_order_relaxed); }
+  uint64_t get_offset() { return offset.LoadRelaxed(); }
+  void set_offset(uint64_t o) { offset.StoreRelaxed(o); }
 
-  bool IsParallelCompressionEnabled() const {
-    return compression_parallel_threads > 1;
-  }
+  bool IsParallelCompressionActive() const { return pc_rep != nullptr; }
 
-  Status GetStatus() {
-    // We need to make modifications of status visible when status_ok is set
-    // to false, and this is ensured by status_mutex, so no special memory
-    // order for status_ok is required.
-    if (status_ok.load(std::memory_order_relaxed)) {
-      return Status::OK();
-    } else {
-      return CopyStatus();
-    }
-  }
+  Status GetStatus() { return GetIOStatus(); }
 
-  Status CopyStatus() {
-    std::lock_guard<std::mutex> lock(status_mutex);
-    return status;
+  bool StatusOk() {
+    // The OK case is optimized with an atomic. Relaxed is sufficient because
+    // if a thread other than the emit/compaction thread sets to non-OK it
+    // will synchronize that in aborting parallel compression.
+    bool ok = io_status_ok.LoadRelaxed();
+#ifdef ROCKSDB_ASSERT_STATUS_CHECKED
+    if (ok) {
+      std::lock_guard<std::mutex> lock(io_status_mutex);
+      // Double-check
+      if (io_status_ok.LoadRelaxed()) {
+        io_status.PermitUncheckedError();
+        assert(io_status.ok());
+      } else {
+        ok = false;
+      }
+    }
+#endif  // ROCKSDB_ASSERT_STATUS_CHECKED
+    return ok;
   }
 
   IOStatus GetIOStatus() {
-    // We need to make modifications of io_status visible when status_ok is set
-    // to false, and this is ensured by io_status_mutex, so no special memory
-    // order for io_status_ok is required.
-    if (io_status_ok.load(std::memory_order_relaxed)) {
-#ifdef ROCKSDB_ASSERT_STATUS_CHECKED  // Avoid unnecessary lock acquisition
-      auto ios = CopyIOStatus();
-      ios.PermitUncheckedError();
-      // Assume no races in unit tests
-      assert(ios.ok());
+    // See StatusOk, which is optimized to avoid Status object copies
+    if (LIKELY(io_status_ok.LoadRelaxed())) {
+#ifdef ROCKSDB_ASSERT_STATUS_CHECKED
+      std::lock_guard<std::mutex> lock(io_status_mutex);
+      // Double-check
+      if (io_status_ok.LoadRelaxed()) {
+        io_status.PermitUncheckedError();
+        assert(io_status.ok());
+      } else {
+        return io_status;
+      }
 #endif  // ROCKSDB_ASSERT_STATUS_CHECKED
       return IOStatus::OK();
     } else {
-      return CopyIOStatus();
+      std::lock_guard<std::mutex> lock(io_status_mutex);
+      return io_status;
     }
   }
 
-  IOStatus CopyIOStatus() {
-    std::lock_guard<std::mutex> lock(io_status_mutex);
-    return io_status;
+  // Avoid copying Status and IOStatus objects as much as possible.
+  // Never erase an existing I/O status that is not OK.
+  void SetStatus(Status&& s) {
+    if (UNLIKELY(!s.ok()) && io_status_ok.LoadRelaxed()) {
+      SetFailedIOStatus(status_to_io_status(std::move(s)));
+    }
   }
-
-  // Never erase an existing status that is not OK.
-  void SetStatus(Status s) {
-    if (!s.ok() && status_ok.load(std::memory_order_relaxed)) {
-      // Locking is an overkill for non compression_parallel_threads
-      // case but since it's unlikely that s is not OK, we take this cost
-      // to be simplicity.
-      std::lock_guard<std::mutex> lock(status_mutex);
-      status = s;
-      status_ok.store(false, std::memory_order_relaxed);
+  void SetStatus(const Status& s) {
+    if (UNLIKELY(!s.ok()) && io_status_ok.LoadRelaxed()) {
+      SetFailedIOStatus(status_to_io_status(Status(s)));
+    }
+  }
+  void SetIOStatus(IOStatus&& ios) {
+    if (UNLIKELY(!ios.ok()) && io_status_ok.LoadRelaxed()) {
+      SetFailedIOStatus(std::move(ios));
+    }
+  }
+  void SetIOStatus(const IOStatus& ios) {
+    if (UNLIKELY(!ios.ok()) && io_status_ok.LoadRelaxed()) {
+      SetFailedIOStatus(IOStatus(ios));
     }
   }
 
-  // Never erase an existing I/O status that is not OK.
-  // Calling this will also SetStatus(ios)
-  void SetIOStatus(IOStatus ios) {
-    if (!ios.ok() && io_status_ok.load(std::memory_order_relaxed)) {
-      // Locking is an overkill for non compression_parallel_threads
-      // case but since it's unlikely that s is not OK, we take this cost
-      // to be simplicity.
-      std::lock_guard<std::mutex> lock(io_status_mutex);
-      io_status = ios;
-      io_status_ok.store(false, std::memory_order_relaxed);
+  void SetFailedIOStatus(IOStatus&& ios) {
+    assert(!ios.ok());
+    // Because !s.ok() is rare, locking is acceptable even in non-parallel case.
+    std::lock_guard<std::mutex> lock(io_status_mutex);
+    // Double-check
+    if (io_status.ok()) {
+      io_status = std::move(ios);
+      io_status_ok.StoreRelaxed(false);
     }
-    SetStatus(ios);
   }
 
   Rep(const BlockBasedTableOptions& table_opt, const TableBuilderOptions& tbo,
@@ -684,7 +999,6 @@ struct BlockBasedTableBuilder::Rep {
         persist_user_defined_timestamps(
             tbo.ioptions.persist_user_defined_timestamps),
         file(f),
-        offset(0),
         alignment(table_options.block_align
                       ? std::min(static_cast<size_t>(table_options.block_size),
                                  kDefaultPageSize)
@@ -706,11 +1020,6 @@ struct BlockBasedTableBuilder::Rep {
             persist_user_defined_timestamps),
         internal_prefix_transform(prefix_extractor.get()),
         sample_for_compression(tbo.moptions.sample_for_compression),
-        compressible_input_data_bytes(0),
-        uncompressible_input_data_bytes(0),
-        sampled_input_data_bytes(0),
-        sampled_output_slow_data_bytes(0),
-        sampled_output_fast_data_bytes(0),
         compression_parallel_threads(
             ((table_opt.partition_filters &&
               !table_opt.decouple_partitioned_filters) ||
@@ -719,7 +1028,6 @@ struct BlockBasedTableBuilder::Rep {
                 : tbo.compression_opts.parallel_threads),
         max_compressed_bytes_per_kb(
             tbo.compression_opts.max_compressed_bytes_per_kb),
-        data_block_working_areas(compression_parallel_threads),
         use_delta_encoding_for_index_values(table_opt.format_version >= 4 &&
                                             !table_opt.block_align),
         reason(tbo.reason),
@@ -733,9 +1041,7 @@ struct BlockBasedTableBuilder::Rep {
                        !use_delta_encoding_for_index_values,
                        table_opt.index_type ==
                            BlockBasedTableOptions::kBinarySearchWithFirstKey),
-        tail_size(0),
-        status_ok(true),
-        io_status_ok(true) {
+        tail_size(0) {
     FilterBuildingContext filter_context(table_options);
 
     filter_context.info_log = ioptions.logger;
@@ -797,10 +1103,8 @@ struct BlockBasedTableBuilder::Rep {
       } else {
         // No distinct data block compressor using dictionary
         data_block_compressor = basic_compressor.get();
-        for (uint32_t i = 0; i < compression_parallel_threads; i++) {
-          data_block_working_areas[i].compress =
-              data_block_compressor->ObtainWorkingArea();
-        }
+        data_block_working_area.compress =
+            data_block_compressor->ObtainWorkingArea();
       }
       basic_decompressor = basic_compressor->GetOptimizedDecompressor();
       if (basic_decompressor == nullptr) {
@@ -818,11 +1122,9 @@ struct BlockBasedTableBuilder::Rep {
         if (state == State::kUnbuffered) {
           assert(data_block_compressor);
           data_block_verify_decompressor = verify_decompressor.get();
-          for (uint32_t i = 0; i < compression_parallel_threads; i++) {
-            data_block_working_areas[i].verify =
-                data_block_verify_decompressor->ObtainWorkingArea(
-                    data_block_compressor->GetPreferredCompressionType());
-          }
+          data_block_working_area.verify =
+              data_block_verify_decompressor->ObtainWorkingArea(
+                  data_block_compressor->GetPreferredCompressionType());
         }
       }
     }
@@ -888,10 +1190,10 @@ struct BlockBasedTableBuilder::Rep {
         std::unique_ptr<UserDefinedIndexBuilder> user_defined_index_builder(
             table_options.user_defined_index_factory->NewBuilder());
         if (user_defined_index_builder != nullptr) {
-          index_builder.reset(new UserDefinedIndexBuilderWrapper(
+          index_builder = std::make_unique<UserDefinedIndexBuilderWrapper>(
               std::string(table_options.user_defined_index_factory->Name()),
               std::move(index_builder), std::move(user_defined_index_builder),
-              &internal_comparator, ts_sz, persist_user_defined_timestamps));
+              &internal_comparator, ts_sz, persist_user_defined_timestamps);
         }
       }
     }
@@ -927,13 +1229,13 @@ struct BlockBasedTableBuilder::Rep {
       }
     }
     table_properties_collectors.emplace_back(
-        new BlockBasedTablePropertiesCollector(
+        std::make_unique<BlockBasedTablePropertiesCollector>(
             table_options.index_type, table_options.whole_key_filtering,
             prefix_extractor != nullptr,
             table_options.decouple_partitioned_filters));
     if (ts_sz > 0 && persist_user_defined_timestamps) {
       table_properties_collectors.emplace_back(
-          new TimestampTablePropertiesCollector(
+          std::make_unique<TimestampTablePropertiesCollector>(
               tbo.internal_comparator.user_comparator()));
     }
 
@@ -991,9 +1293,9 @@ struct BlockBasedTableBuilder::Rep {
       props.compression_name.push_back(';');
       // Rest of property to be filled out at the end of building the file
     } else {
-      // Use legacy compression_name property, populated at the end of building
-      // the file. Not compatible with compression managers using custom
-      // algorithms / compression types.
+      // Use legacy compression_name property, populated at the end of
+      // building the file. Not compatible with compression managers using
+      // custom algorithms / compression types.
       assert(Slice(mgr->CompatibilityName())
                  .compare(GetBuiltinCompressionManager(
                               GetCompressFormatForVersion(
@@ -1024,8 +1326,8 @@ struct BlockBasedTableBuilder::Rep {
 
     std::string& compression_name = props.compression_name;
     if (FormatVersionUsesCompressionManagerName(table_options.format_version)) {
-      // Fill in extended field of "compression name" property, which is the set
-      // of compression types used, sorted by unsigned byte and then hex
+      // Fill in extended field of "compression name" property, which is the
+      // set of compression types used, sorted by unsigned byte and then hex
       // encoded with two digits each (so that table properties are human
       // readable).
       assert(*compression_name.rbegin() == ';');
@@ -1047,8 +1349,8 @@ struct BlockBasedTableBuilder::Rep {
       // based on the legacy configured compression type.
       assert(compression_name.empty());
       if (ctype_count == 0) {
-        // We could get a slight performance boost in the reader by marking the
-        // file as "no compression" if compression is configured but
+        // We could get a slight performance boost in the reader by marking
+        // the file as "no compression" if compression is configured but
         // consistently rejected, but that would give misleading info for
         // debugging purposes. So instead we record the configured compression
         // type, matching the historical behavior.
@@ -1069,13 +1371,10 @@ struct BlockBasedTableBuilder::Rep {
   }
 
  private:
-  // Synchronize status & io_status accesses across threads from main thread,
-  // compression thread and write thread in parallel compression.
-  std::mutex status_mutex;
-  std::atomic<bool> status_ok;
-  Status status;
+  // Synchronize io_status to be readable/writable across threads, but
+  // optimize for the OK case
   std::mutex io_status_mutex;
-  std::atomic<bool> io_status_ok;
+  RelaxedAtomic<bool> io_status_ok{true};
   IOStatus io_status;
 };
 
@@ -1086,7 +1385,7 @@ BlockBasedTableBuilder::BlockBasedTableBuilder(
   auto ucmp = tbo.internal_comparator.user_comparator();
   assert(ucmp);
   (void)ucmp;  // avoids unused variable error.
-  rep_ = new Rep(sanitized_table_options, tbo, file);
+  rep_ = std::make_unique<Rep>(sanitized_table_options, tbo, file);
 
   TEST_SYNC_POINT_CALLBACK(
       "BlockBasedTableBuilder::BlockBasedTableBuilder:PreSetupBaseCacheKey",
@@ -1095,9 +1394,8 @@ BlockBasedTableBuilder::BlockBasedTableBuilder(
   BlockBasedTable::SetupBaseCacheKey(&rep_->props, tbo.db_session_id,
                                      tbo.cur_file_num, &rep_->base_cache_key);
 
-  if (rep_->IsParallelCompressionEnabled()) {
-    StartParallelCompression();
-  } else if (rep_->basic_compressor) {
+  MaybeStartParallelCompression();
+  if (!rep_->IsParallelCompressionActive() && rep_->basic_compressor) {
     rep_->single_threaded_compressed_output.ResetForSize(
         table_options.block_size);
   }
@@ -1106,13 +1404,12 @@ BlockBasedTableBuilder::BlockBasedTableBuilder(
 BlockBasedTableBuilder::~BlockBasedTableBuilder() {
   // Catch errors where caller forgot to call Finish()
   assert(rep_->state == Rep::State::kClosed);
-  delete rep_;
 }
 
 void BlockBasedTableBuilder::Add(const Slice& ikey, const Slice& value) {
-  Rep* r = rep_;
+  Rep* r = rep_.get();
   assert(rep_->state != Rep::State::kClosed);
-  if (!ok()) {
+  if (UNLIKELY(!ok())) {
     return;
   }
   ValueType value_type;
@@ -1206,9 +1503,9 @@ void BlockBasedTableBuilder::Add(const Slice& ikey, const Slice& value) {
 }
 
 void BlockBasedTableBuilder::Flush(const Slice* first_key_in_next_block) {
-  Rep* r = rep_;
+  Rep* r = rep_.get();
   assert(rep_->state != Rep::State::kClosed);
-  if (!ok()) {
+  if (UNLIKELY(!ok())) {
     return;
   }
   if (r->data_block.empty()) {
@@ -1221,7 +1518,8 @@ void BlockBasedTableBuilder::Flush(const Slice* first_key_in_next_block) {
   // property collectors:
   // * BlockAdd function expects block_compressed_bytes_{fast,slow} for
   //   historical reasons. Probably a hassle to remove.
-  // * Collector is not thread safe so calls need to be serialized/synchronized.
+  // * Collector is not thread safe so calls need to be
+  // serialized/synchronized.
   // * Ideally, AddUserKey and BlockAdd calls need to line up such that a
   //   reported block corresponds to all the keys reported since the previous
   //   block.
@@ -1268,12 +1566,12 @@ void BlockBasedTableBuilder::Flush(const Slice* first_key_in_next_block) {
 
     if (sampled_output_slow.size() > 0 || sampled_output_fast.size() > 0) {
       // Currently compression sampling is only enabled for data block.
-      r->sampled_input_data_bytes.fetch_add(uncompressed_block_data.size(),
-                                            std::memory_order_relaxed);
-      r->sampled_output_slow_data_bytes.fetch_add(sampled_output_slow.size(),
-                                                  std::memory_order_relaxed);
-      r->sampled_output_fast_data_bytes.fetch_add(sampled_output_fast.size(),
-                                                  std::memory_order_relaxed);
+      r->sampled_input_data_bytes.FetchAddRelaxed(
+          uncompressed_block_data.size());
+      r->sampled_output_slow_data_bytes.FetchAddRelaxed(
+          sampled_output_slow.size());
+      r->sampled_output_fast_data_bytes.FetchAddRelaxed(
+          sampled_output_fast.size());
     }
 
     NotifyCollectTableCollectorsOnBlockAdd(
@@ -1294,61 +1592,111 @@ void BlockBasedTableBuilder::Flush(const Slice* first_key_in_next_block) {
     rep_->data_begin_offset += uncompressed_block_data.size();
     MaybeEnterUnbuffered(first_key_in_next_block);
   } else {
-    EmitBlock(r->data_block.MutableBuffer(), r->last_ikey,
-              first_key_in_next_block);
+    if (r->IsParallelCompressionActive()) {
+      EmitBlockForParallel(r->data_block.MutableBuffer(), r->last_ikey,
+                           first_key_in_next_block);
+    } else {
+      EmitBlock(r->data_block.MutableBuffer(), r->last_ikey,
+                first_key_in_next_block);
+    }
     r->data_block.Reset();
   }
 }
 
+void BlockBasedTableBuilder::EmitBlockForParallel(
+    std::string& uncompressed, const Slice& last_key_in_current_block,
+    const Slice* first_key_in_next_block) {
+  Rep* r = rep_.get();
+  assert(r->state == Rep::State::kUnbuffered);
+  assert(uncompressed.size() > 0);
+  auto& pc_rep = *r->pc_rep;
+  // Can emit the uncompressed block into the ring buffer
+  assert(pc_rep.emit_thread_state ==
+         ParallelCompressionRep::ThreadState::kEmitting);
+  auto* block_rep = &pc_rep.ring_buffer[pc_rep.emit_slot];
+  pc_rep.estimated_inflight_size.FetchAddRelaxed(uncompressed.size() +
+                                                 kBlockTrailerSize);
+  std::swap(uncompressed, block_rep->uncompressed);
+  r->index_builder->PrepareIndexEntry(last_key_in_current_block,
+                                      first_key_in_next_block,
+                                      block_rep->prepared_index_entry.get());
+  block_rep->compressed.Reset();
+  block_rep->compression_type = kNoCompression;
+
+  // Might need to take up some compression work before we are able to
+  // resume emitting the next uncompressed block.
+  for (;;) {
+    pc_rep.EmitterStateTransition(pc_rep.emit_thread_state, pc_rep.emit_slot);
+
+    if (pc_rep.emit_thread_state ==
+        ParallelCompressionRep::ThreadState::kCompressing) {
+      // Took up some compression work to help unblock ourself
+      block_rep = &pc_rep.ring_buffer[pc_rep.emit_slot];
+      Status s = CompressAndVerifyBlock(
+          block_rep->uncompressed, /*is_data_block=*/true,
+          r->data_block_working_area, &block_rep->compressed,
+          &block_rep->compression_type);
+      if (UNLIKELY(!s.ok())) {
+        r->SetStatus(s);
+        pc_rep.SetAbort(pc_rep.emit_thread_state);
+        break;
+      }
+    } else {
+      assert(pc_rep.emit_thread_state !=
+             ParallelCompressionRep::ThreadState::kCompressingAndWriting);
+      assert(pc_rep.emit_thread_state !=
+             ParallelCompressionRep::ThreadState::kWriting);
+      assert(pc_rep.emit_thread_state !=
+             ParallelCompressionRep::ThreadState::kIdle);
+      // Either emitting or end state.
+      // Detect nothing more to emit and set if so.
+      if (first_key_in_next_block == nullptr &&
+          pc_rep.emit_thread_state ==
+              ParallelCompressionRep::ThreadState::kEmitting) {
+        pc_rep.SetNoMoreToEmit(pc_rep.emit_thread_state, pc_rep.emit_slot);
+      }
+      break;
+    }
+  }
+}
 void BlockBasedTableBuilder::EmitBlock(std::string& uncompressed,
                                        const Slice& last_key_in_current_block,
                                        const Slice* first_key_in_next_block) {
-  Rep* r = rep_;
+  Rep* r = rep_.get();
   assert(r->state == Rep::State::kUnbuffered);
+  // Single-threaded context only
+  assert(!r->IsParallelCompressionActive());
   assert(uncompressed.size() > 0);
-  if (r->IsParallelCompressionEnabled()) {
-    ParallelCompressionRep::BlockRep* block_rep =
-        r->pc_rep->PopRecycledBlockRep();
-    std::swap(uncompressed, block_rep->uncompressed);
-    r->index_builder->PrepareIndexEntry(last_key_in_current_block,
-                                        first_key_in_next_block,
-                                        block_rep->prepared_index_entry.get());
-
-    assert(block_rep != nullptr);
-    r->pc_rep->file_size_estimator.EmitBlock(block_rep->uncompressed.size(),
-                                             r->get_offset());
-    r->pc_rep->EmitBlock(block_rep);
-  } else {
-    WriteBlock(uncompressed, &r->pending_handle, BlockType::kData);
-    if (ok()) {
-      // We do not emit the index entry for a block until we have seen the
-      // first key for the next data block.  This allows us to use shorter
-      // keys in the index block.  For example, consider a block boundary
-      // between the keys "the quick brown fox" and "the who".  We can use
-      // "the r" as the key for the index block entry since it is >= all
-      // entries in the first block and < all entries in subsequent
-      // blocks.
-      r->index_builder->AddIndexEntry(
-          last_key_in_current_block, first_key_in_next_block, r->pending_handle,
-          &r->index_separator_scratch);
-    }
+  WriteBlock(uncompressed, &r->pending_handle, BlockType::kData);
+  if (LIKELY(ok())) {
+    // We do not emit the index entry for a block until we have seen the
+    // first key for the next data block.  This allows us to use shorter
+    // keys in the index block.  For example, consider a block boundary
+    // between the keys "the quick brown fox" and "the who".  We can use
+    // "the r" as the key for the index block entry since it is >= all
+    // entries in the first block and < all entries in subsequent
+    // blocks.
+    r->index_builder->AddIndexEntry(last_key_in_current_block,
+                                    first_key_in_next_block, r->pending_handle,
+                                    &r->index_separator_scratch);
   }
 }
 
 void BlockBasedTableBuilder::WriteBlock(const Slice& uncompressed_block_data,
                                         BlockHandle* handle,
                                         BlockType block_type) {
-  Rep* r = rep_;
+  Rep* r = rep_.get();
   assert(r->state == Rep::State::kUnbuffered);
+  // Single-threaded context only
+  assert(!r->IsParallelCompressionActive());
   CompressionType type;
-  Status compress_status;
   bool is_data_block = block_type == BlockType::kData;
-  CompressAndVerifyBlock(
+  Status compress_status = CompressAndVerifyBlock(
       uncompressed_block_data, is_data_block,
-      is_data_block ? r->data_block_working_areas[0] : r->basic_working_area,
-      &r->single_threaded_compressed_output, &type, &compress_status);
+      is_data_block ? r->data_block_working_area : r->basic_working_area,
+      &r->single_threaded_compressed_output, &type);
   r->SetStatus(compress_status);
-  if (!ok()) {
+  if (UNLIKELY(!ok())) {
     return;
   }
 
@@ -1366,25 +1714,85 @@ void BlockBasedTableBuilder::WriteBlock(const Slice& uncompressed_block_data,
   }
 }
 
-void BlockBasedTableBuilder::BGWorkCompression(WorkingAreaPair& working_area) {
-  ParallelCompressionRep::BlockRep* block_rep = nullptr;
-  while (rep_->pc_rep->compress_queue.pop(block_rep)) {
-    assert(block_rep != nullptr);
-    // Skip compression if we are aborting anyway
-    if (ok()) {
-      CompressAndVerifyBlock(block_rep->uncompressed, true, /* is_data_block*/
-                             working_area, &block_rep->compressed,
-                             &block_rep->compression_type, &block_rep->status);
+void BlockBasedTableBuilder::BGWorker(WorkingAreaPair& working_area) {
+  auto& pc_rep = *rep_->pc_rep;
+#ifndef NDEBUG
+  // Tracking for watchdog
+  pc_rep.live_workers.FetchAddRelaxed(1);
+  Defer decr{[&pc_rep]() { pc_rep.live_workers.FetchSubRelaxed(1); }};
+#endif  // !NDEBUG
+  ParallelCompressionRep::ThreadState thread_state =
+      ParallelCompressionRep::ThreadState::kIdle;
+  uint32_t slot = 0;
+  // Workers should avoid checking the shared status (e.g. ok()) to minimize
+  // potential data dependencies across threads. If another thread hits an
+  // error, we will pick up the kEnd state from the abort.
+  IOStatus ios;
+  do {
+    pc_rep.WorkerStateTransition(thread_state, slot);
+    ParallelCompressionRep::BlockRep* block_rep = &pc_rep.ring_buffer[slot];
+    auto compress_fn = [this, block_rep, &ios, &working_area]() {
+      ios = status_to_io_status(CompressAndVerifyBlock(
+          block_rep->uncompressed, /*is_data_block=*/true, working_area,
+          &block_rep->compressed, &block_rep->compression_type));
+    };
+    auto write_fn = [this, block_rep, &ios]() {
+      Slice compressed = block_rep->compressed;
+      Slice uncompressed = block_rep->uncompressed;
+      ios = WriteMaybeCompressedBlockImpl(
+          block_rep->compression_type == kNoCompression ? uncompressed
+                                                        : compressed,
+          block_rep->compression_type, &rep_->pending_handle, BlockType::kData,
+          &uncompressed);
+      if (LIKELY(ios.ok())) {
+        rep_->props.data_size = rep_->get_offset();
+        ++rep_->props.num_data_blocks;
+
+        rep_->index_builder->FinishIndexEntry(
+            rep_->pending_handle, block_rep->prepared_index_entry.get());
+      }
+    };
+    switch (thread_state) {
+      case ParallelCompressionRep::ThreadState::kEnd:
+        // All done
+        assert(ios.ok());
+        return;
+      case ParallelCompressionRep::ThreadState::kCompressing:
+        compress_fn();
+        break;
+      case ParallelCompressionRep::ThreadState::kCompressingAndWriting:
+        compress_fn();
+        if (LIKELY(ios.ok())) {
+          write_fn();
+        }
+        break;
+      case ParallelCompressionRep::ThreadState::kWriting:
+        write_fn();
+        break;
+      case ParallelCompressionRep::ThreadState::kEmitting:
+        // Shouldn't happen
+        assert(thread_state != ParallelCompressionRep::ThreadState::kEmitting);
+        break;
+      case ParallelCompressionRep::ThreadState::kIdle:
+        // Shouldn't happen
+        assert(thread_state != ParallelCompressionRep::ThreadState::kIdle);
+        break;
+      default:
+        assert(false);
+        break;
     }
-    block_rep->slot.Fill(block_rep);
-  }
+  } while (LIKELY(ios.ok()));
+  // Hit an error, so abort
+  rep_->SetIOStatus(ios);
+  pc_rep.SetAbort(thread_state);
 }
 
-void BlockBasedTableBuilder::CompressAndVerifyBlock(
+Status BlockBasedTableBuilder::CompressAndVerifyBlock(
     const Slice& uncompressed_block_data, bool is_data_block,
     WorkingAreaPair& working_area, GrowableBuffer* compressed_output,
-    CompressionType* result_compression_type, Status* out_status) {
-  Rep* r = rep_;
+    CompressionType* result_compression_type) {
+  Rep* r = rep_.get();
+  Status status;
 
   Compressor* compressor = nullptr;
   Decompressor* verify_decomp = nullptr;
@@ -1409,12 +1817,12 @@ void BlockBasedTableBuilder::CompressAndVerifyBlock(
            uncompressed_block_data.size()) >>
           10);
       compressed_output->ResetForSize(max_compressed_size);
-      *out_status = compressor->CompressBlock(
+      status = compressor->CompressBlock(
           uncompressed_block_data, compressed_output->data(),
           &compressed_output->MutableSize(), &type, &working_area.compress);
 
       // Post-condition of Compressor::CompressBlock
-      assert(type == kNoCompression || out_status->ok());
+      assert(type == kNoCompression || status.ok());
       assert(type == kNoCompression ||
              r->table_options.verify_compression == (verify_decomp != nullptr));
 
@@ -1428,21 +1836,20 @@ void BlockBasedTableBuilder::CompressAndVerifyBlock(
             *verify_decomp, &contents, r->ioptions,
             /*allocator=*/nullptr, &working_area.verify);
 
-        if (uncompress_status.ok()) {
+        if (LIKELY(uncompress_status.ok())) {
           bool data_match = contents.data.compare(uncompressed_block_data) == 0;
           if (!data_match) {
             // The result of the compression was invalid. abort.
             const char* const msg =
                 "Decompressed block did not match pre-compression block";
             ROCKS_LOG_ERROR(r->ioptions.logger, "%s", msg);
-            *out_status = Status::Corruption(msg);
+            status = Status::Corruption(msg);
             type = kNoCompression;
           }
         } else {
           // Decompression reported an error. abort.
-          *out_status =
-              Status::Corruption(std::string("Could not decompress: ") +
-                                 uncompress_status.getState());
+          status = Status::Corruption(std::string("Could not decompress: ") +
+                                      uncompress_status.getState());
           type = kNoCompression;
         }
       }
@@ -1452,17 +1859,15 @@ void BlockBasedTableBuilder::CompressAndVerifyBlock(
       }
     }
     if (is_data_block) {
-      r->compressible_input_data_bytes.fetch_add(uncompressed_block_data.size(),
-                                                 std::memory_order_relaxed);
-      r->uncompressible_input_data_bytes.fetch_add(kBlockTrailerSize,
-                                                   std::memory_order_relaxed);
+      r->compressible_input_data_bytes.FetchAddRelaxed(
+          uncompressed_block_data.size());
+      r->uncompressible_input_data_bytes.FetchAddRelaxed(kBlockTrailerSize);
     }
   } else {
     // Status is not OK, or block is too big to be compressed.
     if (is_data_block) {
-      r->uncompressible_input_data_bytes.fetch_add(
-          uncompressed_block_data.size() + kBlockTrailerSize,
-          std::memory_order_relaxed);
+      r->uncompressible_input_data_bytes.FetchAddRelaxed(
+          uncompressed_block_data.size() + kBlockTrailerSize);
     }
   }
 
@@ -1483,25 +1888,37 @@ void BlockBasedTableBuilder::CompressAndVerifyBlock(
                uncompressed_block_data.size());
     RecordTick(r->ioptions.stats, BYTES_COMPRESSED_TO,
                compressed_output->size());
+    if (r->IsParallelCompressionActive() && is_data_block) {
+      r->pc_rep->estimated_inflight_size.FetchSubRelaxed(
+          uncompressed_block_data.size() - compressed_output->size());
+    }
   }
   *result_compression_type = type;
+  return status;
 }
 
 void BlockBasedTableBuilder::WriteMaybeCompressedBlock(
     const Slice& block_contents, CompressionType comp_type, BlockHandle* handle,
     BlockType block_type, const Slice* uncompressed_block_data) {
+  rep_->SetIOStatus(WriteMaybeCompressedBlockImpl(
+      block_contents, comp_type, handle, block_type, uncompressed_block_data));
+}
+
+IOStatus BlockBasedTableBuilder::WriteMaybeCompressedBlockImpl(
+    const Slice& block_contents, CompressionType comp_type, BlockHandle* handle,
+    BlockType block_type, const Slice* uncompressed_block_data) {
   // File format contains a sequence of blocks where each block has:
   //    block_data: uint8[n]
   //    compression_type: uint8
   //    checksum: uint32
-  Rep* r = rep_;
+  Rep* r = rep_.get();
   bool is_data_block = block_type == BlockType::kData;
   IOOptions io_options;
+  // Always return io_s for NRVO
   IOStatus io_s =
       WritableFileWriter::PrepareIOOptions(r->write_options, io_options);
-  if (!io_s.ok()) {
-    r->SetIOStatus(io_s);
-    return;
+  if (UNLIKELY(!io_s.ok())) {
+    return io_s;
   }
   // Old, misleading name of this function: WriteRawBlock
   StopWatch sw(r->ioptions.clock, r->ioptions.stats, WRITE_RAW_BLOCK_MICROS);
@@ -1520,9 +1937,8 @@ void BlockBasedTableBuilder::WriteMaybeCompressedBlock(
   // r->file->Append call
   {
     io_s = r->file->Append(io_options, block_contents);
-    if (!io_s.ok()) {
-      r->SetIOStatus(io_s);
-      return;
+    if (UNLIKELY(!io_s.ok())) {
+      return io_s;
     }
   }
 
@@ -1535,10 +1951,10 @@ void BlockBasedTableBuilder::WriteMaybeCompressedBlock(
   checksum += ChecksumModifierForContext(r->base_context_checksum, offset);
 
   if (block_type == BlockType::kFilter) {
-    Status s = r->filter_builder->MaybePostVerifyFilter(block_contents);
-    if (!s.ok()) {
-      r->SetStatus(s);
-      return;
+    io_s = status_to_io_status(
+        r->filter_builder->MaybePostVerifyFilter(block_contents));
+    if (UNLIKELY(!io_s.ok())) {
+      return io_s;
     }
   }
 
@@ -1548,18 +1964,16 @@ void BlockBasedTableBuilder::WriteMaybeCompressedBlock(
       trailer.data());
   {
     io_s = r->file->Append(io_options, Slice(trailer.data(), trailer.size()));
-    if (!io_s.ok()) {
-      r->SetIOStatus(io_s);
-      return;
+    if UNLIKELY (!io_s.ok()) {
+      return io_s;
     }
   }
 
   if (r->warm_cache) {
-    Status s =
-        InsertBlockInCacheHelper(*uncompressed_block_data, handle, block_type);
-    if (!s.ok()) {
-      r->SetStatus(s);
-      return;
+    io_s = status_to_io_status(
+        InsertBlockInCacheHelper(*uncompressed_block_data, handle, block_type));
+    if (UNLIKELY(!io_s.ok())) {
+      return io_s;
     }
   }
 
@@ -1573,90 +1987,80 @@ void BlockBasedTableBuilder::WriteMaybeCompressedBlock(
         (r->alignment - 1);
 
     io_s = r->file->Pad(io_options, pad_bytes);
-    if (io_s.ok()) {
+    if (LIKELY(io_s.ok())) {
       r->pre_compression_size += pad_bytes;
       r->set_offset(r->get_offset() + pad_bytes);
     } else {
-      r->SetIOStatus(io_s);
-      return;
+      return io_s;
     }
   }
 
-  if (r->IsParallelCompressionEnabled()) {
-    if (is_data_block) {
-      r->pc_rep->file_size_estimator.ReapBlock(block_contents.size(),
-                                               r->get_offset());
-    } else {
-      r->pc_rep->file_size_estimator.SetEstimatedFileSize(r->get_offset());
-    }
+  if (r->IsParallelCompressionActive() && is_data_block) {
+    r->pc_rep->estimated_inflight_size.FetchSubRelaxed(block_contents.size() +
+                                                       kBlockTrailerSize);
   }
+  return io_s;
 }
 
-void BlockBasedTableBuilder::BGWorkWriteMaybeCompressedBlock() {
-  Rep* r = rep_;
-  ParallelCompressionRep::BlockRepSlot* slot = nullptr;
-  ParallelCompressionRep::BlockRep* block_rep = nullptr;
-  // Starts empty; see FilterBlockBuilder::AddWithPrevKey
-  while (r->pc_rep->write_queue.pop(slot)) {
-    // FIXME: this is weird popping off write queue just to wait again on
-    // compress queue
-    assert(slot != nullptr);
-    slot->Take(block_rep);
-    assert(block_rep != nullptr);
-    if (!block_rep->status.ok()) {
-      r->SetStatus(block_rep->status);
-      // Reap block so that blocked Flush() can finish
-      // if there is one, and Flush() will notice !ok() next time.
-      block_rep->status = Status::OK();
-      r->pc_rep->ReapBlock(block_rep);
-      continue;
-    }
-
-    r->pc_rep->file_size_estimator.SetCurrBlockUncompSize(
-        block_rep->uncompressed.size());
-    Slice compressed = block_rep->compressed;
-    Slice uncompressed = block_rep->uncompressed;
-    WriteMaybeCompressedBlock(block_rep->compression_type == kNoCompression
-                                  ? uncompressed
-                                  : compressed,
-                              block_rep->compression_type, &r->pending_handle,
-                              BlockType::kData, &uncompressed);
-    if (!ok()) {
-      break;
-    }
-
-    r->props.data_size = r->get_offset();
-    ++r->props.num_data_blocks;
-
-    r->index_builder->FinishIndexEntry(r->pending_handle,
-                                       block_rep->prepared_index_entry.get());
-
-    r->pc_rep->ReapBlock(block_rep);
+void BlockBasedTableBuilder::MaybeStartParallelCompression() {
+  if (rep_->compression_parallel_threads <= 1) {
+    return;
   }
-}
-
-void BlockBasedTableBuilder::StartParallelCompression() {
-  rep_->pc_rep.reset(
-      new ParallelCompressionRep(rep_->compression_parallel_threads));
-  rep_->pc_rep->compress_thread_pool.reserve(
+  rep_->pc_rep = std::make_unique<ParallelCompressionRep>(
       rep_->compression_parallel_threads);
-  for (uint32_t i = 0; i < rep_->compression_parallel_threads; i++) {
-    rep_->pc_rep->block_rep_buf[i].prepared_index_entry =
+  auto& pc_rep = *rep_->pc_rep;
+  for (uint32_t i = 0; i <= pc_rep.ring_buffer_mask; i++) {
+    pc_rep.ring_buffer[i].prepared_index_entry =
         rep_->index_builder->CreatePreparedIndexEntry();
-    rep_->pc_rep->compress_thread_pool.emplace_back(
-        [this, i] { BGWorkCompression(rep_->data_block_working_areas[i]); });
   }
-  rep_->pc_rep->write_thread.reset(
-      new port::Thread([this] { BGWorkWriteMaybeCompressedBlock(); }));
+  pc_rep.worker_threads.reserve(pc_rep.num_worker_threads);
+  pc_rep.working_areas.resize(pc_rep.num_worker_threads);
+  for (uint32_t i = 0; i < pc_rep.num_worker_threads; i++) {
+    auto& wa = pc_rep.working_areas[i];
+    if (rep_->data_block_compressor) {
+      wa.compress = rep_->data_block_compressor->ObtainWorkingArea();
+    }
+    if (rep_->data_block_verify_decompressor) {
+      wa.verify = rep_->data_block_verify_decompressor->ObtainWorkingArea(
+          rep_->data_block_compressor->GetPreferredCompressionType());
+    }
+    pc_rep.worker_threads.emplace_back([this, &wa] { BGWorker(wa); });
+  }
+#ifndef NDEBUG
+  // Start watchdog thread in DEBUG builds
+  pc_rep.watchdog_thread = std::thread([&pc_rep] { pc_rep.BGWatchdog(); });
+  pc_rep.live_emit.StoreRelaxed(true);
+#endif  // !NDEBUG
 }
 
-void BlockBasedTableBuilder::StopParallelCompression() {
-  rep_->pc_rep->compress_queue.finish();
-  for (auto& thread : rep_->pc_rep->compress_thread_pool) {
+void BlockBasedTableBuilder::StopParallelCompression(bool abort) {
+  auto& pc_rep = *rep_->pc_rep;
+  if (abort) {
+    pc_rep.SetAbort(pc_rep.emit_thread_state);
+  } else if (pc_rep.emit_thread_state !=
+             ParallelCompressionRep::ThreadState::kEnd) {
+    // In case we didn't do a final flush with no next key
+    assert(rep_->props.num_data_blocks == 0);
+    pc_rep.SetNoMoreToEmit(pc_rep.emit_thread_state, pc_rep.emit_slot);
+  }
+#ifndef NDEBUG
+  // Tracking for watchdog
+  pc_rep.live_emit.StoreRelaxed(false);
+#endif  // !NDEBUG
+  assert(pc_rep.emit_thread_state == ParallelCompressionRep::ThreadState::kEnd);
+  for (auto& thread : pc_rep.worker_threads) {
     thread.join();
   }
-  rep_->pc_rep->write_queue.finish();
-  rep_->pc_rep->write_thread->join();
+#ifndef NDEBUG
+  // Wake & shutdown watchdog thread
+  {
+    std::unique_lock<std::mutex> lock(pc_rep.watchdog_mutex);
+    pc_rep.shutdown_watchdog = true;
+    pc_rep.watchdog_cv.notify_all();
+  }
+  pc_rep.watchdog_thread.join();
+#endif  // !NDEBUG
+  rep_->pc_rep.reset();
 }
 
 Status BlockBasedTableBuilder::status() const { return rep_->GetStatus(); }
@@ -1665,6 +2069,8 @@ IOStatus BlockBasedTableBuilder::io_status() const {
   return rep_->GetIOStatus();
 }
 
+bool BlockBasedTableBuilder::ok() const { return rep_->StatusOk(); }
+
 Status BlockBasedTableBuilder::InsertBlockInCacheHelper(
     const Slice& block_contents, const BlockHandle* handle,
     BlockType block_type) {
@@ -1683,7 +2089,7 @@ Status BlockBasedTableBuilder::InsertBlockInCacheHelper(
     s = WarmInCache(block_cache, key.AsSlice(), block_contents,
                     &rep_->create_context, helper, Cache::Priority::LOW,
                     &charge);
-    if (s.ok()) {
+    if (LIKELY(s.ok())) {
       BlockBasedTable::UpdateCacheInsertionMetrics(
           block_type, nullptr /*get_context*/, charge, s.IsOkOverwritten(),
           rep_->ioptions.stats);
@@ -1709,11 +2115,11 @@ void BlockBasedTableBuilder::WriteFilterBlock(
   }
   BlockHandle filter_block_handle;
   bool is_partitioned_filter = rep_->table_options.partition_filters;
-  if (ok()) {
+  if (LIKELY(ok())) {
     rep_->props.num_filter_entries +=
         rep_->filter_builder->EstimateEntriesAdded();
     Status s = Status::Incomplete();
-    while (ok() && s.IsIncomplete()) {
+    while (LIKELY(ok()) && s.IsIncomplete()) {
       // filter_data is used to store the transferred filter data payload from
       // FilterBlockBuilder and deallocate the payload by going out of scope.
       // Otherwise, the payload will unnecessarily remain until
@@ -1743,7 +2149,7 @@ void BlockBasedTableBuilder::WriteFilterBlock(
     }
     rep_->filter_builder->ResetFilterBitsBuilder();
   }
-  if (ok()) {
+  if (LIKELY(ok())) {
     // Add mapping from "<filter_block_prefix>.Name" to location
     // of filter data.
     std::string key;
@@ -1756,12 +2162,12 @@ void BlockBasedTableBuilder::WriteFilterBlock(
 
 void BlockBasedTableBuilder::WriteIndexBlock(
     MetaIndexBuilder* meta_index_builder, BlockHandle* index_block_handle) {
-  if (!ok()) {
+  if (UNLIKELY(!ok())) {
     return;
   }
   IndexBuilder::IndexBlocks index_blocks;
   auto index_builder_status = rep_->index_builder->Finish(&index_blocks);
-  if (ok() && !index_builder_status.ok() &&
+  if (LIKELY(ok()) && !index_builder_status.ok() &&
       !index_builder_status.IsIncomplete()) {
     // If the index builder failed for non-Incomplete errors, we should
     // mark the entire builder as having failed wit that status. However,
@@ -1770,7 +2176,7 @@ void BlockBasedTableBuilder::WriteIndexBlock(
     rep_->SetStatus(index_builder_status);
   }
 
-  if (ok()) {
+  if (LIKELY(ok())) {
     for (const auto& item : index_blocks.meta_blocks) {
       BlockHandle block_handle;
       if (item.second.first == BlockType::kIndex) {
@@ -1780,13 +2186,13 @@ void BlockBasedTableBuilder::WriteIndexBlock(
         WriteMaybeCompressedBlock(item.second.second, kNoCompression,
                                   &block_handle, item.second.first);
       }
-      if (!ok()) {
+      if (UNLIKELY(!ok())) {
         break;
       }
       meta_index_builder->Add(item.first, block_handle);
     }
   }
-  if (ok()) {
+  if (LIKELY(ok())) {
     if (rep_->table_options.enable_index_compression) {
       WriteBlock(index_blocks.index_block_contents, index_block_handle,
                  BlockType::kIndex);
@@ -1799,7 +2205,7 @@ void BlockBasedTableBuilder::WriteIndexBlock(
   // If there are more index partitions, finish them and write them out
   if (index_builder_status.IsIncomplete()) {
     bool index_building_finished = false;
-    while (ok() && !index_building_finished) {
+    while (LIKELY(ok()) && !index_building_finished) {
       Status s =
           rep_->index_builder->Finish(&index_blocks, *index_block_handle);
       if (s.ok()) {
@@ -1825,8 +2231,8 @@ void BlockBasedTableBuilder::WriteIndexBlock(
     }
   }
   // If success and need to record in metaindex rather than footer...
-  if (ok() && !FormatVersionUsesIndexHandleInFooter(
-                  rep_->table_options.format_version)) {
+  if (LIKELY(ok()) && !FormatVersionUsesIndexHandleInFooter(
+                          rep_->table_options.format_version)) {
     meta_index_builder->Add(kIndexBlockName, *index_block_handle);
   }
 }
@@ -1834,7 +2240,7 @@ void BlockBasedTableBuilder::WriteIndexBlock(
 void BlockBasedTableBuilder::WritePropertiesBlock(
     MetaIndexBuilder* meta_index_builder) {
   BlockHandle properties_block_handle;
-  if (ok()) {
+  if (LIKELY(ok())) {
     PropertyBlockBuilder property_block_builder;
     rep_->props.filter_policy_name =
         rep_->table_options.filter_policy != nullptr
@@ -1870,32 +2276,34 @@ void BlockBasedTableBuilder::WritePropertiesBlock(
       assert(rep_->p_index_builder_ != nullptr);
       rep_->props.index_partitions = rep_->p_index_builder_->NumPartitions();
       rep_->props.top_level_index_size =
-          rep_->p_index_builder_->TopLevelIndexSize(rep_->offset);
+          rep_->p_index_builder_->TopLevelIndexSize(rep_->offset.LoadRelaxed());
     }
     rep_->props.index_key_is_user_key =
         !rep_->index_builder->separator_is_key_plus_seq();
     rep_->props.index_value_is_delta_encoded =
         rep_->use_delta_encoding_for_index_values;
-    if (rep_->sampled_input_data_bytes > 0) {
+    if (rep_->sampled_input_data_bytes.LoadRelaxed() > 0) {
       rep_->props.slow_compression_estimated_data_size = static_cast<uint64_t>(
-          static_cast<double>(rep_->sampled_output_slow_data_bytes) /
-              rep_->sampled_input_data_bytes *
-              rep_->compressible_input_data_bytes +
-          rep_->uncompressible_input_data_bytes + 0.5);
+          static_cast<double>(
+              rep_->sampled_output_slow_data_bytes.LoadRelaxed()) /
+              rep_->sampled_input_data_bytes.LoadRelaxed() *
+              rep_->compressible_input_data_bytes.LoadRelaxed() +
+          rep_->uncompressible_input_data_bytes.LoadRelaxed() + 0.5);
       rep_->props.fast_compression_estimated_data_size = static_cast<uint64_t>(
-          static_cast<double>(rep_->sampled_output_fast_data_bytes) /
-              rep_->sampled_input_data_bytes *
-              rep_->compressible_input_data_bytes +
-          rep_->uncompressible_input_data_bytes + 0.5);
+          static_cast<double>(
+              rep_->sampled_output_fast_data_bytes.LoadRelaxed()) /
+              rep_->sampled_input_data_bytes.LoadRelaxed() *
+              rep_->compressible_input_data_bytes.LoadRelaxed() +
+          rep_->uncompressible_input_data_bytes.LoadRelaxed() + 0.5);
     } else if (rep_->sample_for_compression > 0) {
-      // We tried to sample but none were found. Assume worst-case (compression
-      // ratio 1.0) so data is complete and aggregatable.
+      // We tried to sample but none were found. Assume worst-case
+      // (compression ratio 1.0) so data is complete and aggregatable.
       rep_->props.slow_compression_estimated_data_size =
-          rep_->compressible_input_data_bytes +
-          rep_->uncompressible_input_data_bytes;
+          rep_->compressible_input_data_bytes.LoadRelaxed() +
+          rep_->uncompressible_input_data_bytes.LoadRelaxed();
       rep_->props.fast_compression_estimated_data_size =
-          rep_->compressible_input_data_bytes +
-          rep_->uncompressible_input_data_bytes;
+          rep_->compressible_input_data_bytes.LoadRelaxed() +
+          rep_->uncompressible_input_data_bytes.LoadRelaxed();
     }
     rep_->props.user_defined_timestamps_persisted =
         rep_->persist_user_defined_timestamps;
@@ -1916,7 +2324,7 @@ void BlockBasedTableBuilder::WritePropertiesBlock(
     WriteMaybeCompressedBlock(block_data, kNoCompression,
                               &properties_block_handle, BlockType::kProperties);
   }
-  if (ok()) {
+  if (LIKELY(ok())) {
 #ifndef NDEBUG
     {
       uint64_t props_block_offset = properties_block_handle.offset();
@@ -1946,7 +2354,7 @@ void BlockBasedTableBuilder::WriteCompressionDictBlock(
   }
   if (!compression_dict.empty()) {
     BlockHandle compression_dict_block_handle;
-    if (ok()) {
+    if (LIKELY(ok())) {
       WriteMaybeCompressedBlock(compression_dict, kNoCompression,
                                 &compression_dict_block_handle,
                                 BlockType::kCompressionDictionary);
@@ -1954,7 +2362,7 @@ void BlockBasedTableBuilder::WriteCompressionDictBlock(
           "BlockBasedTableBuilder::WriteCompressionDictBlock:RawDict",
           &compression_dict);
     }
-    if (ok()) {
+    if (LIKELY(ok())) {
       meta_index_builder->Add(kCompressionDictBlockName,
                               compression_dict_block_handle);
     }
@@ -1963,7 +2371,7 @@ void BlockBasedTableBuilder::WriteCompressionDictBlock(
 
 void BlockBasedTableBuilder::WriteRangeDelBlock(
     MetaIndexBuilder* meta_index_builder) {
-  if (ok() && !rep_->range_del_block.empty()) {
+  if (LIKELY(ok()) && !rep_->range_del_block.empty()) {
     BlockHandle range_del_block_handle;
     WriteMaybeCompressedBlock(rep_->range_del_block.Finish(), kNoCompression,
                               &range_del_block_handle,
@@ -1974,8 +2382,8 @@ void BlockBasedTableBuilder::WriteRangeDelBlock(
 
 void BlockBasedTableBuilder::WriteFooter(BlockHandle& metaindex_block_handle,
                                          BlockHandle& index_block_handle) {
-  assert(ok());
-  Rep* r = rep_;
+  assert(LIKELY(ok()));
+  Rep* r = rep_.get();
   // this is guaranteed by BlockBasedTableBuilder's constructor
   assert(r->table_options.checksum == kCRC32c ||
          r->table_options.format_version != 0);
@@ -2006,9 +2414,10 @@ void BlockBasedTableBuilder::WriteFooter(BlockHandle& metaindex_block_handle,
 
 void BlockBasedTableBuilder::MaybeEnterUnbuffered(
     const Slice* first_key_in_next_block) {
-  Rep* r = rep_;
+  Rep* r = rep_.get();
   assert(r->state == Rep::State::kBuffered);
-  // Don't yet enter unbuffered (early return) if none of the conditions are met
+  // Don't yet enter unbuffered (early return) if none of the conditions are
+  // met
   if (first_key_in_next_block != nullptr) {
     bool exceeds_buffer_limit =
         (r->buffer_limit != 0 && r->data_begin_offset > r->buffer_limit);
@@ -2043,7 +2452,8 @@ void BlockBasedTableBuilder::MaybeEnterUnbuffered(
   // Abstract algebra teaches us that a finite cyclic group (such as the
   // additive group of integers modulo N) can be generated by a number that is
   // coprime with N. Since N is variable (number of buffered data blocks), we
-  // must then pick a prime number in order to guarantee coprimeness with any N.
+  // must then pick a prime number in order to guarantee coprimeness with any
+  // N.
   //
   // One downside of this approach is the spread will be poor when
   // `kPrimeGeneratorRemainder` is close to zero or close to
@@ -2085,10 +2495,6 @@ void BlockBasedTableBuilder::MaybeEnterUnbuffered(
   r->data_block_compressor = r->compressor_with_dict
                                  ? r->compressor_with_dict.get()
                                  : r->basic_compressor.get();
-  for (uint32_t i = 0; i < r->compression_parallel_threads; i++) {
-    r->data_block_working_areas[i].compress =
-        r->data_block_compressor->ObtainWorkingArea();
-  }
   Slice serialized_dict = r->data_block_compressor->GetSerializedDict();
   if (r->verify_decompressor) {
     if (serialized_dict.empty()) {
@@ -2098,17 +2504,12 @@ void BlockBasedTableBuilder::MaybeEnterUnbuffered(
       // Get an updated dictionary-aware decompressor for verification.
       Status s = r->verify_decompressor->MaybeCloneForDict(
           serialized_dict, &r->verify_decompressor_with_dict);
-      // Dictionary support must be present on the decompressor side if it's on
-      // the compressor side.
+      // Dictionary support must be present on the decompressor side if it's
+      // on the compressor side.
       assert(r->verify_decompressor_with_dict);
       if (r->verify_decompressor_with_dict) {
         r->data_block_verify_decompressor =
             r->verify_decompressor_with_dict.get();
-        for (uint32_t i = 0; i < r->compression_parallel_threads; i++) {
-          r->data_block_working_areas[i].verify =
-              r->data_block_verify_decompressor->ObtainWorkingArea(
-                  r->data_block_compressor->GetPreferredCompressionType());
-        }
         assert(s.ok());
       } else {
         assert(!s.ok());
@@ -2165,7 +2566,13 @@ void BlockBasedTableBuilder::MaybeEnterUnbuffered(
     auto& data_block = r->data_block_buffers[i];
     iter->SeekToLast();
     assert(iter->Valid());
-    EmitBlock(data_block, iter->key(), first_key_in_loop_next_block_ptr);
+    if (r->IsParallelCompressionActive()) {
+      EmitBlockForParallel(data_block, iter->key(),
+                           first_key_in_loop_next_block_ptr);
+
+    } else {
+      EmitBlock(data_block, iter->key(), first_key_in_loop_next_block_ptr);
+    }
     std::swap(iter, next_block_iter);
   }
   r->data_block_buffers.clear();
@@ -2179,7 +2586,7 @@ void BlockBasedTableBuilder::MaybeEnterUnbuffered(
 }
 
 Status BlockBasedTableBuilder::Finish() {
-  Rep* r = rep_;
+  Rep* r = rep_.get();
   assert(r->state != Rep::State::kClosed);
   // To make sure properties block is able to keep the accurate size of index
   // block, we will finish writing all index entries first, in Flush().
@@ -2188,16 +2595,11 @@ Status BlockBasedTableBuilder::Finish() {
     MaybeEnterUnbuffered(nullptr);
   }
   assert(r->state == Rep::State::kUnbuffered);
-  if (r->IsParallelCompressionEnabled()) {
-    StopParallelCompression();
-#ifndef NDEBUG
-    for (const auto& br : r->pc_rep->block_rep_buf) {
-      assert(br.status.ok());
-    }
-#endif  // !NDEBUG
+  if (r->IsParallelCompressionActive()) {
+    StopParallelCompression(/*abort=*/false);
   }
 
-  r->props.tail_start_offset = r->offset;
+  r->props.tail_start_offset = r->offset.LoadRelaxed();
 
   // Write meta blocks, metaindex block and footer in the following order.
   //    1. [meta block: filter]
@@ -2214,36 +2616,27 @@ Status BlockBasedTableBuilder::Finish() {
   WriteCompressionDictBlock(&meta_index_builder);
   WriteRangeDelBlock(&meta_index_builder);
   WritePropertiesBlock(&meta_index_builder);
-  if (ok()) {
+  if (LIKELY(ok())) {
     // flush the meta index block
     WriteMaybeCompressedBlock(meta_index_builder.Finish(), kNoCompression,
                               &metaindex_block_handle, BlockType::kMetaIndex);
   }
-  if (ok()) {
+  if (LIKELY(ok())) {
     WriteFooter(metaindex_block_handle, index_block_handle);
   }
   r->state = Rep::State::kClosed;
-  r->tail_size = r->offset - r->props.tail_start_offset;
+  r->tail_size = r->offset.LoadRelaxed() - r->props.tail_start_offset;
 
-  Status ret_status = r->CopyStatus();
-  IOStatus ios = r->GetIOStatus();
-  if (!ios.ok() && ret_status.ok()) {
-    // Let io_status supersede ok status (otherwise status takes precedennce)
-    ret_status = ios;
-  }
-  return ret_status;
+  return r->GetStatus();
 }
 
 void BlockBasedTableBuilder::Abandon() {
   assert(rep_->state != Rep::State::kClosed);
-  if (rep_->IsParallelCompressionEnabled()) {
-    StopParallelCompression();
+  if (rep_->IsParallelCompressionActive()) {
+    StopParallelCompression(/*abort=*/true);
   }
   rep_->state = Rep::State::kClosed;
-#ifdef ROCKSDB_ASSERT_STATUS_CHECKED  // Avoid unnecessary lock acquisition
-  rep_->CopyStatus().PermitUncheckedError();
-  rep_->CopyIOStatus().PermitUncheckedError();
-#endif  // ROCKSDB_ASSERT_STATUS_CHECKED
+  rep_->GetIOStatus().PermitUncheckedError();
 }
 
 uint64_t BlockBasedTableBuilder::NumEntries() const {
@@ -2258,13 +2651,14 @@ uint64_t BlockBasedTableBuilder::PreCompressionSize() const {
   return rep_->pre_compression_size;
 }
 
-uint64_t BlockBasedTableBuilder::FileSize() const { return rep_->offset; }
+uint64_t BlockBasedTableBuilder::FileSize() const {
+  return rep_->offset.LoadRelaxed();
+}
 
 uint64_t BlockBasedTableBuilder::EstimatedFileSize() const {
-  if (rep_->IsParallelCompressionEnabled()) {
-    // Use compression ratio so far and inflight uncompressed bytes to estimate
-    // final SST size.
-    return rep_->pc_rep->file_size_estimator.GetEstimatedFileSize();
+  if (rep_->IsParallelCompressionActive()) {
+    // Use upper bound on "inflight" data size to estimate
+    return FileSize() + rep_->pc_rep->estimated_inflight_size.LoadRelaxed();
   } else {
     return FileSize();
   }
diff --git a/table/block_based/block_based_table_builder.h b/table/block_based/block_based_table_builder.h
index 2ba0ef8c8d6c..1e6c3217c1ce 100644
--- a/table/block_based/block_based_table_builder.h
+++ b/table/block_based/block_based_table_builder.h
@@ -113,24 +113,33 @@ class BlockBasedTableBuilder : public TableBuilder {
                                    uint64_t oldest_ancestor_time) override;
 
  private:
-  bool ok() const { return status().ok(); }
+  bool ok() const;
 
   // Transition state from buffered to unbuffered if the conditions are met. See
   // `Rep::State` API comment for details of the states.
   // REQUIRES: `rep_->state == kBuffered`
   void MaybeEnterUnbuffered(const Slice* first_key_in_next_block);
 
+  // Try to keep some parallel-specific code separate to improve hot code
+  // locality for non-parallel case
   void EmitBlock(std::string& uncompressed,
                  const Slice& last_key_in_current_block,
                  const Slice* first_key_in_next_block);
+  void EmitBlockForParallel(std::string& uncompressed,
+                            const Slice& last_key_in_current_block,
+                            const Slice* first_key_in_next_block);
 
-  // Compress and write block content to the file.
+  // Compress and write block content to the file, from a single-threaded
+  // context
   void WriteBlock(const Slice& block_contents, BlockHandle* handle,
                   BlockType block_type);
   // Directly write data to the file.
   void WriteMaybeCompressedBlock(
       const Slice& block_contents, CompressionType, BlockHandle* handle,
       BlockType block_type, const Slice* uncompressed_block_data = nullptr);
+  IOStatus WriteMaybeCompressedBlockImpl(
+      const Slice& block_contents, CompressionType, BlockHandle* handle,
+      BlockType block_type, const Slice* uncompressed_block_data = nullptr);
 
   void SetupCacheKeyPrefix(const TableBuilderOptions& tbo);
 
@@ -158,7 +167,7 @@ class BlockBasedTableBuilder : public TableBuilder {
   struct Rep;
   class BlockBasedTablePropertiesCollectorFactory;
   class BlockBasedTablePropertiesCollector;
-  Rep* rep_;
+  std::unique_ptr<Rep> rep_;
   struct WorkingAreaPair;
   struct ParallelCompressionRep;
 
@@ -173,27 +182,23 @@ class BlockBasedTableBuilder : public TableBuilder {
   // compress it
   const uint64_t kCompressionSizeLimit = std::numeric_limits<int>::max();
 
-  // Get blocks from mem-table walking thread, compress them and
-  // pass them to the write thread. Used in parallel compression mode only
-  void BGWorkCompression(WorkingAreaPair& working_area);
+  // Code for a "parallel compression" worker thread, which can really do SST
+  // writes and block compressions alternately.
+  void BGWorker(WorkingAreaPair& working_area);
 
   // Given uncompressed block content, try to compress it and return result and
   // compression type
-  void CompressAndVerifyBlock(const Slice& uncompressed_block_data,
-                              bool is_data_block, WorkingAreaPair& working_area,
-                              GrowableBuffer* compressed_output,
-                              CompressionType* result_compression_type,
-                              Status* out_status);
-
-  // Get compressed blocks from BGWorkCompression and write them into SST
-  void BGWorkWriteMaybeCompressedBlock();
+  Status CompressAndVerifyBlock(const Slice& uncompressed_block_data,
+                                bool is_data_block,
+                                WorkingAreaPair& working_area,
+                                GrowableBuffer* compressed_output,
+                                CompressionType* result_compression_type);
 
-  // Initialize parallel compression context and
-  // start BGWorkCompression and BGWorkWriteMaybeCompressedBlock threads
-  void StartParallelCompression();
+  // If configured, start worker threads for parallel compression
+  void MaybeStartParallelCompression();
 
-  // Stop BGWorkCompression and BGWorkWriteMaybeCompressedBlock threads
-  void StopParallelCompression();
+  // Stop worker threads for parallel compression
+  void StopParallelCompression(bool abort);
 };
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index 763be6c99403..9edb85ba4f0d 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -82,7 +82,7 @@ def setup_random_seed_before_main():
     ),
     "compression_max_dict_bytes": lambda: 16384 * random.randint(0, 1),
     "compression_zstd_max_train_bytes": lambda: 65536 * random.randint(0, 1),
-    "compression_parallel_threads": lambda: random.choice([1] * 3 + [4, 8, 16]),
+    "compression_parallel_threads": lambda: random.choice([1, 1, 2, 3, 4, 5, 8, 9, 16]),
     "compression_max_dict_buffer_bytes": lambda: (1 << random.randint(0, 40)) - 1,
     "compression_use_zstd_dict_trainer": lambda: random.randint(0, 1),
     "compression_checksum": lambda: random.randint(0, 1),
diff --git a/unreleased_history/performance_improvements/parallel_compression.md b/unreleased_history/performance_improvements/parallel_compression.md
index 769b03941e13..4a3b9a4361e4 100644
--- a/unreleased_history/performance_improvements/parallel_compression.md
+++ b/unreleased_history/performance_improvements/parallel_compression.md
@@ -1 +1 @@
-* Improved CPU efficiency and scalability of parallel compression (`CompressionOptions::parallel_threads` > 1), though this efficiency improvement makes parallel compression currently incompatible with UserDefinedIndex and with old setting of `decouple_partitioned_filters=false`. Parallel compression is now considered a production-ready feature.
+* Majorly improved CPU efficiency and scalability of parallel compression (`CompressionOptions::parallel_threads` > 1), though this efficiency improvement makes parallel compression currently incompatible with UserDefinedIndex and with old setting of `decouple_partitioned_filters=false`. Parallel compression is now considered a production-ready feature. Maximum performance is available with `-DROCKSDB_USE_STD_SEMAPHORES` at compile time, but this is not currently recommended because of reported bugs in implementations of `std::counting_semaphore`/`binary_semaphore`.
diff --git a/util/bit_fields.h b/util/bit_fields.h
new file mode 100644
index 000000000000..e0cadd02bca6
--- /dev/null
+++ b/util/bit_fields.h
@@ -0,0 +1,331 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <atomic>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Declares a wrapper type around UnderlyingT that allows it to be divided up
+// into and accessed as bit fields. This is mostly intended to aid in packing
+// fields into atomic variables to reduce the need for locking in concurrent
+// code and/or to simplify reasoning on and accommodation of different
+// interesting, bug-prone interleavings. Convenient atomic wrappers
+// (RelaxedAtomic, AcqRelAtomic) are provided below to aid usage with atomics,
+// especially for CAS updates, but it is even possible to combine operations on
+// multiple bit fields into a single non-CAS atomic operation using Transforms
+// below.
+//
+// Unlike C/C++ bit fields, this implementation guarantees tight bit packing
+// so that all available lock-free atomic bits can be utilized.
+//
+// The specific bit fields are declared outside the declaration using
+// BoolBitField and UnsignedBitField below. Example usage:
+//
+// // A unique compile-time identifier to ensure we don't mix up different
+// // bit fields.
+// struct MyStateID {};
+//
+// using MyState = BitFields<uint32_t, MyStateID>;
+//  - or -
+// struct MyState : public BitFields<uint32_t, MyStateID> {
+//   // Extra helper declarations and/or field type declarations
+// };
+//
+// // Starts with a 16-bit field returned as uint16_t
+// using Field1 = UnsignedBitField<MyState, 16, NoPrevBitField>;
+// using Field2 = BoolBitField<MyState, Field1>;
+// using Field3 = BoolBitField<MyState, Field2>;
+// using Field4 = UnsignedBitField<MyState, 5, Field3>;  // 5 bits in a uint8_t
+//
+// MyState state;  // zero-initialized
+// state.Set<Field1>(42U);
+// state.Set<Field2>(true);
+// state.Set<Field4>(3U);
+// state.Ref<Field1>() += state.Get<Field4>();
+//
+// Note that there's nothing preventing you from declaring overlapping fields
+// in the same 'MyState' family. This could be useful for variant types where
+// an earlier field determines which layout later fields are using. For example,
+// an alternate field after Field2:
+//
+// using Field3a = UnsignedBitField<State, 6, Field2>;  // 6 bits in a uint8_t
+//
+template <typename UnderlyingT, typename IdentifyingT>
+struct BitFields {
+  using U = UnderlyingT;
+  U underlying = 0;
+  static constexpr int kBitCount = sizeof(U) * 8;
+
+  using ID = IdentifyingT;
+
+  template <typename BitFieldT>
+  void Set(typename BitFieldT::V value) {
+    BitFieldT::SetIn(static_cast<typename BitFieldT::Parent&>(*this), value);
+  }
+
+  template <typename BitFieldT>
+  typename BitFieldT::V Get() const {
+    return BitFieldT::GetFrom(
+        static_cast<const typename BitFieldT::Parent&>(*this));
+  }
+
+  // Reference and Ref() are not intended to behave as full references but to
+  // provide a convenient way to do operations like +=, |=, etc. Get and Set
+  // are preferred for simple operations.
+  template <typename BitFieldT>
+  struct Reference {
+    explicit Reference(BitFields& bf) : bf_(bf) {}
+    Reference(const Reference&) = default;
+    Reference& operator=(const Reference&) = default;
+    // no moves
+    Reference(Reference&&) = default;
+    Reference& operator=(Reference&&) = default;
+
+    void operator=(typename BitFieldT::V value) { bf_.Set<BitFieldT>(value); }
+    void operator+=(typename BitFieldT::V value) {
+      bf_.Set<BitFieldT>(bf_.Get<BitFieldT>() + value);
+    }
+    void operator-=(typename BitFieldT::V value) {
+      bf_.Set<BitFieldT>(bf_.Get<BitFieldT>() - value);
+    }
+    void operator|=(typename BitFieldT::V value) {
+      bf_.Set<BitFieldT>(bf_.Get<BitFieldT>() | value);
+    }
+    void operator&=(typename BitFieldT::V value) {
+      bf_.Set<BitFieldT>(bf_.Get<BitFieldT>() & value);
+    }
+
+   private:
+    BitFields& bf_;
+  };
+
+  template <typename BitFieldT>
+  Reference<BitFieldT> Ref() {
+    return Reference<BitFieldT>(*this);
+  }
+
+  bool operator==(const BitFields& other) const = default;
+  bool operator!=(const BitFields& other) const = default;
+};
+
+// For building atomic updates affecting one or more fields, assuming all the
+// updates are bitwise-or.
+template <typename BitFieldsT>
+struct OrTransform {
+  using U = typename BitFieldsT::U;
+  U to_or = 0;
+  // + for general combine
+  OrTransform<BitFieldsT> operator+(OrTransform<BitFieldsT> other) const {
+    return OrTransform<BitFieldsT>{to_or | other.to_or};
+  }
+};
+
+// For building atomic updates affecting one or more fields, assuming all the
+// updates are bitwise-and.
+template <typename BitFieldsT>
+struct AndTransform {
+  using U = typename BitFieldsT::U;
+  U to_and = 0;
+  // + for general combine
+  AndTransform<BitFieldsT> operator+(AndTransform<BitFieldsT> other) const {
+    return AndTransform<BitFieldsT>{to_and & other.to_and};
+  }
+};
+
+// TODO: AddTransfrom, which is more complicated due to possible overflow into
+// other fields etc.
+
+// Placeholder for PrevField for the first field
+struct NoPrevBitField {
+  // no instances
+  NoPrevBitField() = delete;
+  static constexpr int kEndBit = 0;
+};
+
+// For declaring a single-bit field accessed as a boolean. See example above on
+// BitFields
+template <typename BitFieldsT, typename PrevField>
+struct BoolBitField {
+  using Parent = BitFieldsT;
+  using ParentBase = BitFields<typename BitFieldsT::U, typename BitFieldsT::ID>;
+  using U = typename BitFieldsT::U;
+  using V = bool;
+  static constexpr int kBitOffset = PrevField::kEndBit;
+  static constexpr int kEndBit = kBitOffset + 1;
+  static_assert(kBitOffset >= 0 && kEndBit <= BitFieldsT::kBitCount);
+
+  // no instances
+  BoolBitField() = delete;
+
+  // NOTE: allow BitFieldsT to be derived from BitFields<> which can be
+  // passed in here
+  static bool GetFrom(const ParentBase& bf) {
+    return (bf.underlying & (U{1} << kBitOffset)) != 0;
+  }
+  static void SetIn(ParentBase& bf, bool value) {
+    bf.underlying =
+        (bf.underlying & ~(U{1} << kBitOffset)) | (U{value} << kBitOffset);
+  }
+  static OrTransform<BitFieldsT> SetTransform() {
+    return OrTransform<BitFieldsT>{U{1} << kBitOffset};
+  }
+  static AndTransform<BitFieldsT> ClearTransform() {
+    return AndTransform<BitFieldsT>{~(U{1} << kBitOffset)};
+  }
+};
+
+// For declaring a multi-bit field accessed as an unsigned int. See example
+// above on BitFields
+template <typename BitFieldsT, int kBitCount, typename PrevField>
+struct UnsignedBitField {
+  using Parent = BitFieldsT;
+  using U = typename BitFieldsT::U;
+  // Smallest uint type that can fit kBitCount bits
+  using V = std::conditional_t<
+      kBitCount <= 8, uint8_t,
+      std::conditional_t<
+          kBitCount <= 16, uint16_t,
+          std::conditional_t<kBitCount <= 32, uint32_t, uint64_t>>>;
+  static constexpr int kBitOffset = PrevField::kEndBit;
+  static constexpr int kEndBit = kBitOffset + kBitCount;
+  static_assert(kBitCount >= 1);
+  static_assert(kBitCount <= 64);
+  static_assert(kBitOffset >= 0 && kEndBit <= BitFieldsT::kBitCount);
+
+  static constexpr V kMask = (V{1} << (kBitCount - 1) << 1) - 1;
+
+  // no instances
+  UnsignedBitField() = delete;
+
+  static V GetFrom(const BitFieldsT& bf) {
+    return BitwiseAnd(bf.underlying >> kBitOffset, kMask);
+  }
+
+  static void SetIn(BitFieldsT& bf, V value) {
+    bf.underlying &= ~(static_cast<U>(kMask) << kBitOffset);
+    bf.underlying |= static_cast<U>(value & kMask) << kBitOffset;
+  }
+
+  static AndTransform<BitFieldsT> ClearTransform() {
+    return AndTransform<BitFieldsT>{~(static_cast<U>(kMask) << kBitOffset)};
+  }
+};
+
+// A handy wrapper for a relaxed atomic on some BitFields type (unlike
+// RelaxedAtomic for arithmetic types). For encapsulation, usual arithmetic
+// atomic operations are only available by calling Apply[Relaxed]() on
+// Transforms returned from field classes. Extending an example from BitFields:
+//
+// auto transform = Field2::ClearTransform() + Field4::ClearTransform();
+// MyState old_state;
+// my_atomic.ApplyRelaxed(transform, &old_state);
+// auto field2_before_clearing = old_state.Get<Field2>();
+//
+template <typename BitFieldsT>
+class RelaxedBitFieldsAtomic {
+ public:
+  using U = typename BitFieldsT::U;
+  explicit RelaxedBitFieldsAtomic(BitFieldsT initial = {})
+      : v_(initial.underlying) {}
+  void StoreRelaxed(BitFieldsT desired) {
+    v_.store(desired.underlying, std::memory_order_relaxed);
+  }
+  BitFieldsT LoadRelaxed() const {
+    return BitFieldsT{v_.load(std::memory_order_relaxed)};
+  }
+  bool CasWeakRelaxed(BitFieldsT& expected, BitFieldsT desired) {
+    return v_.compare_exchange_weak(expected.underlying, desired.underlying,
+                                    std::memory_order_relaxed);
+  }
+  bool CasStrongRelaxed(BitFieldsT& expected, BitFieldsT desired) {
+    return v_.compare_exchange_strong(expected.underlying, desired.underlying,
+                                      std::memory_order_relaxed);
+  }
+  BitFieldsT ExchangeRelaxed(BitFieldsT desired) {
+    return BitFieldsT{
+        v_.exchange(desired.underlying, std::memory_order_relaxed)};
+  }
+  void ApplyRelaxed(OrTransform<BitFieldsT> transform,
+                    BitFieldsT* before = nullptr, BitFieldsT* after = nullptr) {
+    U before_val = v_.fetch_or(transform.to_or, std::memory_order_relaxed);
+    if (before) {
+      before->underlying = before_val;
+    }
+    if (after) {
+      after->underlying = before_val | transform.to_or;
+    }
+  }
+  void ApplyRelaxed(AndTransform<BitFieldsT> transform,
+                    BitFieldsT* before = nullptr, BitFieldsT* after = nullptr) {
+    U before_val = v_.fetch_and(transform.to_and, std::memory_order_relaxed);
+    if (before) {
+      before->underlying = before_val;
+    }
+    if (after) {
+      after->underlying = before_val & transform.to_and;
+    }
+  }
+
+ protected:
+  std::atomic<U> v_;
+};
+
+// A handy wrapper for an aquire-release atomic (also relaxed semantics
+// available) on some BitFields type. See RelaxedBitFieldsAtomic for more info.
+template <typename BitFieldsT>
+class AcqRelBitFieldsAtomic : public RelaxedBitFieldsAtomic<BitFieldsT> {
+ public:
+  using Base = RelaxedBitFieldsAtomic<BitFieldsT>;
+  using U = typename BitFieldsT::U;
+
+  explicit AcqRelBitFieldsAtomic(BitFieldsT initial = {}) : Base(initial) {}
+
+  void Store(BitFieldsT desired) {
+    Base::v_.store(desired.underlying, std::memory_order_release);
+  }
+  BitFieldsT Load() const {
+    return BitFieldsT{Base::v_.load(std::memory_order_acquire)};
+  }
+  bool CasWeak(BitFieldsT& expected, BitFieldsT desired) {
+    return Base::v_.compare_exchange_weak(
+        expected.underlying, desired.underlying, std::memory_order_acq_rel);
+  }
+  bool CasStrong(BitFieldsT& expected, BitFieldsT desired) {
+    return Base::v_.compare_exchange_strong(
+        expected.underlying, desired.underlying, std::memory_order_acq_rel);
+  }
+  BitFieldsT Exchange(BitFieldsT desired) {
+    return BitFieldsT{
+        Base::v_.exchange(desired.underlying, std::memory_order_acq_rel)};
+  }
+  void Apply(OrTransform<BitFieldsT> transform, BitFieldsT* before = nullptr,
+             BitFieldsT* after = nullptr) {
+    U before_val =
+        Base::v_.fetch_or(transform.to_or, std::memory_order_acq_rel);
+    if (before) {
+      before->underlying = before_val;
+    }
+    if (after) {
+      after->underlying = before_val | transform.to_or;
+    }
+  }
+  void Apply(AndTransform<BitFieldsT> transform, BitFieldsT* before = nullptr,
+             BitFieldsT* after = nullptr) {
+    U before_val =
+        Base::v_.fetch_and(transform.to_and, std::memory_order_acq_rel);
+    if (before) {
+      before->underlying = before_val;
+    }
+    if (after) {
+      after->underlying = before_val & transform.to_and;
+    }
+  }
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/util/semaphore.h b/util/semaphore.h
new file mode 100644
index 000000000000..59e767d6246d
--- /dev/null
+++ b/util/semaphore.h
@@ -0,0 +1,164 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cassert>
+#include <condition_variable>
+#include <mutex>
+#ifdef ROCKSDB_USE_STD_SEMAPHORES
+#include <semaphore>
+#endif
+
+#include "port/port.h"
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Wrapper providing a chosen counting semaphore implementation. The default
+// implementation based on a mutex and condvar unfortunately can result in
+// Release() temporarily waiting on another thread to make progress (if that
+// other thread is preempted while holding the mutex), but that should be rare.
+// However, alternative implementations may have correctness issues or even
+// worse performance. See std::counting_semaphore for general contract.
+//
+// NOTE1: std::counting_semaphore is known to be buggy on many std library
+// implementations, so be cautious about enabling it. Reportedly, an acquire()
+// can falsely block indefinitely. And we can't easily work around that with
+// try_acquire_for because another common bug has that function consistently
+// sleeping for the entire timeout duration even if a release() happens earlier.
+// Therefore, using std::counting_semaphore/binary_semaphore is strictly opt-in
+// for now.
+//
+// NOTE2: Also tried wrapping folly::fibers::Semaphore here but it was not as
+// efficient (for parallel compression) as even the mutex+condvar version.
+class ALIGN_AS(CACHE_LINE_SIZE) CountingSemaphore {
+ public:
+  explicit CountingSemaphore(std::ptrdiff_t starting_count)
+#ifdef ROCKSDB_USE_STD_SEMAPHORES
+      : sem_(starting_count)
+#else
+      : count_(static_cast<int32_t>(starting_count))
+#endif  // ROCKSDB_USE_STD_SEMAPHORES
+  {
+    assert(starting_count >= 0);
+    assert(starting_count <= INT32_MAX);
+  }
+  void Acquire() {
+#ifdef ROCKSDB_USE_STD_SEMAPHORES
+    sem_.acquire();
+#else
+    std::unique_lock<std::mutex> lock(mutex_);
+    assert(count_ >= 0);
+    cv_.wait(lock, [this] { return count_ > 0; });
+    --count_;
+#endif  // ROCKSDB_USE_STD_SEMAPHORES
+  }
+  bool TryAcquire() {
+#ifdef ROCKSDB_USE_STD_SEMAPHORES
+    return sem_.try_acquire();
+#else
+    std::unique_lock<std::mutex> lock(mutex_);
+    assert(count_ >= 0);
+    if (count_ == 0) {
+      return false;
+    } else {
+      --count_;
+      return true;
+    }
+#endif  // ROCKSDB_USE_STD_SEMAPHORES
+  }
+  void Release(std::ptrdiff_t n = 1) {
+#ifdef ROCKSDB_USE_STD_SEMAPHORES
+    sem_.release(n);
+#else
+    assert(n >= 0);
+    assert(n <= INT32_MAX);
+    if (n > 0) {
+      std::unique_lock<std::mutex> lock(mutex_);
+      assert(count_ >= 0);
+      count_ += static_cast<int32_t>(n);
+      assert(count_ >= 0);  // no overflow
+      if (n == 1) {
+        cv_.notify_one();
+      } else {
+        cv_.notify_all();
+      }
+    }
+#endif  // ROCKSDB_USE_STD_SEMAPHORES
+  }
+
+ private:
+#ifdef ROCKSDB_USE_STD_SEMAPHORES
+  std::counting_semaphore<INT32_MAX> sem_;
+#else
+  int32_t count_;
+  std::mutex mutex_;
+  std::condition_variable cv_;
+#endif  // ROCKSDB_USE_STD_SEMAPHORES
+};  // namespace ROCKSDB_NAMESPACE
+
+// Wrapper providing a chosen binary semaphore implementation. See notes on
+// CountingSemaphore above, and on Release() below.
+class BinarySemaphore {
+ public:
+  explicit BinarySemaphore(std::ptrdiff_t starting_count)
+#ifdef ROCKSDB_USE_STD_SEMAPHORES
+      : sem_(starting_count)
+#else
+      : state_(starting_count > 0)
+#endif  // ROCKSDB_USE_STD_SEMAPHORES
+  {
+    assert(starting_count >= 0);
+  }
+  void Acquire() {
+#ifdef ROCKSDB_USE_STD_SEMAPHORES
+    sem_.acquire();
+#else
+    std::unique_lock<std::mutex> lock(mutex_);
+    cv_.wait(lock, [this] { return state_; });
+    state_ = false;
+#endif  // ROCKSDB_USE_STD_SEMAPHORES
+  }
+  bool TryAcquire() {
+#ifdef ROCKSDB_USE_STD_SEMAPHORES
+    return sem_.try_acquire();
+#else
+    std::unique_lock<std::mutex> lock(mutex_);
+    if (state_) {
+      state_ = false;
+      return true;
+    } else {
+      return false;
+    }
+#endif  // ROCKSDB_USE_STD_SEMAPHORES
+  }
+  void Release() {
+    // NOTE: implementations of std::binary_semaphore::release() tend to behave
+    // like counting semaphores in the case of multiple Release() calls without
+    // Acquire() in between, though it is undefined behavior. It is also OK to
+    // cap the count at 1.
+#ifdef ROCKSDB_USE_STD_SEMAPHORES
+    sem_.release();
+#else
+    std::unique_lock<std::mutex> lock(mutex_);
+    // check precondition to avoid UB in std implementation
+    assert(state_ == false);
+    state_ = true;
+    cv_.notify_one();
+#endif  // ROCKSDB_USE_STD_SEMAPHORES
+  }
+
+ private:
+#ifdef ROCKSDB_USE_STD_SEMAPHORES
+  std::binary_semaphore sem_;
+#else
+  bool state_;
+  std::mutex mutex_;
+  std::condition_variable cv_;
+#endif  // ROCKSDB_USE_STD_SEMAPHORES
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/util/slice_test.cc b/util/slice_test.cc
index c1a0c806b847..380c6f50bea4 100644
--- a/util/slice_test.cc
+++ b/util/slice_test.cc
@@ -3,6 +3,10 @@
 //  COPYING file in the root directory) and Apache 2.0 License
 //  (found in the LICENSE.Apache file in the root directory).
 
+// Because there are a small set of tests for Slice and there's a cost in having
+// extra test binaries for each component, this test file has evolved into a
+// "grab bag" of small tests for various reusable components, mostly in  util/.
+
 #include "rocksdb/slice.h"
 
 #include <gtest/gtest.h>
@@ -15,7 +19,9 @@
 #include "rocksdb/types.h"
 #include "test_util/testharness.h"
 #include "test_util/testutil.h"
+#include "util/bit_fields.h"
 #include "util/cast_util.h"
+#include "util/semaphore.h"
 #include "util/string_util.h"
 
 namespace ROCKSDB_NAMESPACE {
@@ -424,22 +430,134 @@ TEST(ToBaseCharsStringTest, Tests) {
   ASSERT_EQ(ToBaseCharsString<32>(2, 255, false), "7v");
 }
 
-TEST(SemaphoreTest, BasicStdCountingSemaphore) {
-  // Verify the C++20 API is available and apparently working
-  std::counting_semaphore sem{0};
+TEST(SemaphoreTest, CountingSemaphore) {
+  CountingSemaphore sem{0};
   int kCount = 5;
   std::vector<std::thread> threads;
   for (int i = 0; i < kCount; ++i) {
-    threads.emplace_back([&sem] { sem.release(); });
+    threads.emplace_back([&sem] { sem.Release(); });
   }
   for (int i = 0; i < kCount; ++i) {
-    threads.emplace_back([&sem] { sem.acquire(); });
+    threads.emplace_back([&sem] { sem.Acquire(); });
   }
   for (auto& t : threads) {
     t.join();
   }
   // Nothing left on the semaphore
-  ASSERT_FALSE(sem.try_acquire());
+  ASSERT_FALSE(sem.TryAcquire());
+  // Keep testing
+  sem.Release(2);
+  ASSERT_TRUE(sem.TryAcquire());
+  sem.Acquire();
+  ASSERT_FALSE(sem.TryAcquire());
+}
+
+TEST(SemaphoreTest, BinarySemaphore) {
+  BinarySemaphore sem{0};
+  int kCount = 5;
+  std::vector<std::thread> threads;
+  for (int i = 0; i < kCount; ++i) {
+    threads.emplace_back([&sem] {
+      sem.Acquire();
+      sem.Release();
+    });
+  }
+  threads.emplace_back([&sem] { sem.Release(); });
+  for (auto& t : threads) {
+    t.join();
+  }
+  // Only able to acquire one excess release
+  ASSERT_TRUE(sem.TryAcquire());
+  ASSERT_FALSE(sem.TryAcquire());
+}
+
+TEST(BitFieldsTest, BitFields) {
+  // Start by verifying example from BitFields comment
+  struct MyStateID {};
+  struct MyState : public BitFields<uint32_t, MyStateID> {
+    // Extra helper declarations and/or field type declarations
+  };
+
+  using Field1 = UnsignedBitField<MyState, 16, NoPrevBitField>;
+  using Field2 = BoolBitField<MyState, Field1>;
+  using Field3 = BoolBitField<MyState, Field2>;
+  using Field4 = UnsignedBitField<MyState, 5, Field3>;
+
+  MyState state;  // zero-initialized
+  state.Set<Field1>(42U);
+  state.Set<Field2>(true);
+  state.Set<Field4>(3U);
+  state.Ref<Field1>() += state.Get<Field4>();
+
+  ASSERT_EQ(state.Get<Field1>(), 45U);
+  ASSERT_EQ(state.Get<Field2>(), true);
+  ASSERT_EQ(state.Get<Field3>(), false);
+  ASSERT_EQ(state.Get<Field4>(), 3U);
+
+  // Misc operators
+  auto ref = state.Ref<Field3>();
+  auto ref2 = std::move(ref);
+  ref2 = true;
+  ASSERT_EQ(state.Get<Field3>(), true);
+
+  MyState state2;
+  // Basic non-concurrent tests for atomic wrappers
+  {
+    RelaxedBitFieldsAtomic<MyState> relaxed{state};
+    ASSERT_EQ(state, relaxed.LoadRelaxed());
+    relaxed.StoreRelaxed(state2);
+    ASSERT_EQ(state2, relaxed.LoadRelaxed());
+    MyState state3 = relaxed.ExchangeRelaxed(state);
+    ASSERT_EQ(state2, state3);
+    ASSERT_TRUE(relaxed.CasStrongRelaxed(state, state2));
+    while (!relaxed.CasWeakRelaxed(state2, state)) {
+    }
+    ASSERT_EQ(state2, state3);
+    ASSERT_EQ(state, relaxed.LoadRelaxed());
+
+    auto transform1 = Field2::ClearTransform() + Field3::ClearTransform();
+    MyState before, after;
+    relaxed.ApplyRelaxed(transform1, &before, &after);
+    ASSERT_EQ(before, state);
+    ASSERT_NE(after, state);
+    ASSERT_EQ(after.Get<Field2>(), false);
+    ASSERT_EQ(after.Get<Field3>(), false);
+
+    auto transform2 = Field2::SetTransform() + Field3::SetTransform();
+    relaxed.ApplyRelaxed(transform2, &before, &after);
+    ASSERT_NE(before, state);
+    ASSERT_EQ(before.Get<Field2>(), false);
+    ASSERT_EQ(before.Get<Field3>(), false);
+    ASSERT_EQ(after, state);
+  }
+  {
+    AcqRelBitFieldsAtomic<MyState> acqrel{state};
+    ASSERT_EQ(state, acqrel.Load());
+    acqrel.Store(state2);
+    ASSERT_EQ(state2, acqrel.Load());
+    MyState state3 = acqrel.Exchange(state);
+    ASSERT_EQ(state2, state3);
+    ASSERT_TRUE(acqrel.CasStrong(state, state2));
+    while (!acqrel.CasWeak(state2, state)) {
+    }
+    ASSERT_EQ(state2, state3);
+    ASSERT_EQ(state, acqrel.Load());
+
+    auto transform1 = Field2::ClearTransform() + Field3::ClearTransform();
+    MyState before, after;
+    acqrel.Apply(transform1, &before, &after);
+    ASSERT_EQ(before, state);
+    ASSERT_NE(after, state);
+    ASSERT_EQ(after.Get<Field2>(), false);
+    ASSERT_EQ(after.Get<Field3>(), false);
+
+    auto transform2 = Field2::SetTransform() + Field3::SetTransform();
+    acqrel.Apply(transform2, &before, &after);
+    ASSERT_NE(before, state);
+    ASSERT_EQ(before.Get<Field2>(), false);
+    ASSERT_EQ(before.Get<Field3>(), false);
+    ASSERT_EQ(after, state);
+  }
 }
 
 }  // namespace ROCKSDB_NAMESPACE

From 2620c85638bfa6e8b40ac675b494b722771c62b6 Mon Sep 17 00:00:00 2001
From: Changyu Bi <changyubi@meta.com>
Date: Mon, 15 Sep 2025 11:39:45 -0700
Subject: [PATCH 279/500] Support async IO for MultiScan (#13932)

Summary:
add option MultiScanArgs::use_async_io option and implementation for using ReadAsync() for multiscan. Read requests are submitted during Prepare() and polled during actual scanning.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13932

Test Plan:
- updated existing unit test to use async_io.
- crash test: `python3 -u ./tools/db_crashtest.py whitebox --iterpercent=60 --prefix_size=-1 --prefixpercent=0 --readpercent=0 --test_batches_snapshots=0 --use_multiscan=1 --read_fault_one_in=0 --kill_random_test=88888 --interval=60 --multiscan_use_async_io=1 --mmap_read=0`

Benchmark:
- Default multiscan benchmark:
```
Set up: /db_bench --benchmarks="fillseq,compact" --disable_wal=1 --threads=1 --num_levels=1 --compaction_style=2 --fifo_compaction_max_table_files_size_mb=1000 --write_buffer_size=268435456

Without async IO:
./db_bench --db="/tmp/rocksdbtest-543376/dbbench" --use_existing_db=1 --benchmarks=multiscan --disable_auto_compactions=1 --seek_nexts=100 --threads=32 --duration=10 --statistics=1 --use_direct_reads=1 --multiscan_use_async_io=0

multiscan    :     415.569 micros/op 75805 ops/sec 10.355 seconds 784968 operations; (multscans:24999)
rocksdb.read.async.micros COUNT : 0

With asycn IO:
./db_bench --db="/tmp/rocksdbtest-543376/dbbench" --use_existing_db=1 --benchmarks=multiscan --disable_auto_compactions=1 --seek_nexts=100 --threads=32 --duration=10 --statistics=1 --use_direct_reads=1 --multiscan_use_async_io=1

multiscan    :     413.236 micros/op 76044 ops/sec 10.375 seconds 788968 operations; (multscans:24999)
rocksdb.read.async.micros COUNT : 3916499

Similar performance.
```

- Larger scan, more scans per multiscan, do not coalesce IO so that async IO can progress while scanning, and use one thread:
```
multiscan_stride = 1000
multiscan_size = 100
seek_nexts = 1000

./db_bench --db="/tmp/rocksdbtest-543376/dbbench" --use_existing_db=1 --benchmarks=multiscan --disable_auto_compactions=1 --threads=1 --duration=10 --statistics=0 --use_direct_reads=1  --cache_size=2097152 --multiscan_size=100 --multiscan_stride=1000 --seek_nexts=1000 --seed=1 --multiscan_coalesce_threshold=0  --multiscan_use_async_io=0

Without async IO:
multiscan    :   20495.205 micros/op 48 ops/sec 10.002 seconds 488 operations; (multscans:488)

With async IO:
multiscan    :   18337.883 micros/op 54 ops/sec 10.013 seconds 546 operations; (multscans:546)

~10% improvement in throughput
```

Reviewed By: xingbowang

Differential Revision: D82077818

Pulled By: cbi42

fbshipit-source-id: 66e32cf4039183c4841827409286dfbaa6dfbcd8
---
 db/version_set.cc                             |   1 +
 db_stress_tool/db_stress_common.h             |   1 +
 db_stress_tool/db_stress_gflags.cc            |   3 +
 db_stress_tool/db_stress_test_base.cc         |   1 +
 env/fs_posix.cc                               |   2 +-
 file/random_access_file_reader.cc             |   7 +
 include/rocksdb/options.h                     |   9 +
 .../block_based/block_based_table_iterator.cc | 847 +++++++++++-------
 .../block_based/block_based_table_iterator.h  | 186 +++-
 .../block_based_table_reader_test.cc          | 381 ++++----
 tools/db_bench_tool.cc                        |  11 +-
 tools/db_crashtest.py                         |   5 +-
 .../new_features/multi-scan-async-io.md       |   1 +
 13 files changed, 931 insertions(+), 524 deletions(-)
 create mode 100644 unreleased_history/new_features/multi-scan-async-io.md

diff --git a/db/version_set.cc b/db/version_set.cc
index 98f8955a2119..0c98a01f0eb0 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -1165,6 +1165,7 @@ class LevelIterator final : public InternalIterator {
     // Propagate io colaescing threshold
     for (auto& file_to_arg : *file_to_scan_opts_) {
       file_to_arg.second.io_coalesce_threshold = so->io_coalesce_threshold;
+      file_to_arg.second.use_async_io = so->use_async_io;
     }
   }
 
diff --git a/db_stress_tool/db_stress_common.h b/db_stress_tool/db_stress_common.h
index bf5b47ab2a52..8bfcb7b29746 100644
--- a/db_stress_tool/db_stress_common.h
+++ b/db_stress_tool/db_stress_common.h
@@ -432,6 +432,7 @@ DECLARE_uint32(memtable_avg_op_scan_flush_trigger);
 DECLARE_uint32(ingest_wbwi_one_in);
 DECLARE_bool(universal_reduce_file_locking);
 DECLARE_bool(use_multiscan);
+DECLARE_bool(multiscan_use_async_io);
 
 // Compaction deletion trigger declarations for stress testing
 DECLARE_bool(enable_compaction_on_deletion_trigger);
diff --git a/db_stress_tool/db_stress_gflags.cc b/db_stress_tool/db_stress_gflags.cc
index 94cc3ea1e446..8e92dd25d960 100644
--- a/db_stress_tool/db_stress_gflags.cc
+++ b/db_stress_tool/db_stress_gflags.cc
@@ -1535,4 +1535,7 @@ DEFINE_bool(
 DEFINE_bool(use_multiscan, false,
             "If set, use the batched MultiScan API for scans.");
 
+DEFINE_bool(multiscan_use_async_io, false,
+            "If set, enable async_io for MultiScan operations.");
+
 #endif  // GFLAGS
diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc
index a13ce4db87ed..b62441403bee 100644
--- a/db_stress_tool/db_stress_test_base.cc
+++ b/db_stress_tool/db_stress_test_base.cc
@@ -1693,6 +1693,7 @@ Status StressTest::TestMultiScan(ThreadState* thread,
   std::vector<std::string> start_key_strs;
   std::vector<std::string> end_key_strs;
   MultiScanArgs scan_opts;
+  scan_opts.use_async_io = FLAGS_multiscan_use_async_io;
   start_key_strs.reserve(num_scans);
   end_key_strs.reserve(num_scans);
 
diff --git a/env/fs_posix.cc b/env/fs_posix.cc
index 06d7e1a9e939..dcadafde1a0e 100644
--- a/env/fs_posix.cc
+++ b/env/fs_posix.cc
@@ -1153,7 +1153,7 @@ class PosixFileSystem : public FileSystem {
     return IOStatus::OK();
 #else
     (void)io_handles;
-    return IOStatus::NotSupported("Poll");
+    return IOStatus::NotSupported("Poll not implemented");
 #endif
   }
 
diff --git a/file/random_access_file_reader.cc b/file/random_access_file_reader.cc
index b14a9c8bfecd..f7bf9699822c 100644
--- a/file/random_access_file_reader.cc
+++ b/file/random_access_file_reader.cc
@@ -497,6 +497,13 @@ IOStatus RandomAccessFileReader::PrepareIOOptions(const ReadOptions& ro,
   }
 }
 
+// Notes for when direct_io is enabled:
+// Unless req.offset, req.len, req.scratch are all already aligned,
+// RandomAccessFileReader will creats aligned requests and aligned buffer for
+// the request. User should only provide either req.scratch or aligned_buf. If
+// only req.scratch is provided, result will be copied from allocated aligned
+// buffer to req.scratch. If only alignd_buf is provided, it will be set to
+// the ailgned buf allocated by RandomAccessFileReader and saves a copy.
 IOStatus RandomAccessFileReader::ReadAsync(
     FSReadRequest& req, const IOOptions& opts,
     std::function<void(FSReadRequest&, void*)> cb, void* cb_arg,
diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index 9ba148aa0e89..a43fa6fda941 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -1791,10 +1791,12 @@ class MultiScanArgs {
     original_ranges_ = other.original_ranges_;
     io_coalesce_threshold = other.io_coalesce_threshold;
     max_prefetch_size = other.max_prefetch_size;
+    use_async_io = other.use_async_io;
   }
   MultiScanArgs(MultiScanArgs&& other) noexcept
       : io_coalesce_threshold(other.io_coalesce_threshold),
         max_prefetch_size(other.max_prefetch_size),
+        use_async_io(other.use_async_io),
         comp_(other.comp_),
         original_ranges_(std::move(other.original_ranges_)) {}
 
@@ -1803,6 +1805,7 @@ class MultiScanArgs {
     original_ranges_ = other.original_ranges_;
     io_coalesce_threshold = other.io_coalesce_threshold;
     max_prefetch_size = other.max_prefetch_size;
+    use_async_io = other.use_async_io;
     return *this;
   }
 
@@ -1812,6 +1815,7 @@ class MultiScanArgs {
       original_ranges_ = std::move(other.original_ranges_);
       io_coalesce_threshold = other.io_coalesce_threshold;
       max_prefetch_size = other.max_prefetch_size;
+      use_async_io = other.use_async_io;
     }
     return *this;
   }
@@ -1865,6 +1869,11 @@ class MultiScanArgs {
   // Note that this limit is per file and applies to compressed block size.
   uint64_t max_prefetch_size = 0;
 
+  // Enable async I/O for multi-scan operations
+  // When true, BlockBasedTableIterator will use ReadAsync() for reading blocks
+  // When false, it will use synchronous MultiRead().
+  bool use_async_io = false;
+
  private:
   // The comparator used for ordering ranges
   const Comparator* comp_;
diff --git a/table/block_based/block_based_table_iterator.cc b/table/block_based/block_based_table_iterator.cc
index 4a480c05e47f..07e8cb9c857d 100644
--- a/table/block_based/block_based_table_iterator.cc
+++ b/table/block_based/block_based_table_iterator.cc
@@ -919,6 +919,42 @@ void BlockBasedTableIterator::BlockCacheLookupForReadAheadSize(
   ResetPreviousBlockOffset();
 }
 
+BlockBasedTableIterator::MultiScanState::~MultiScanState() {
+  // Abort any pending async IO operations to prevent callback being called
+  // after async read states are destructed.
+  if (!async_states.empty()) {
+    std::vector<void*> io_handles_to_abort;
+    std::vector<AsyncReadState*> states_to_cleanup;
+
+    // Collect all pending IO handles
+    for (size_t i = 0; i < async_states.size(); ++i) {
+      auto& async_read = async_states[i];
+
+      if (async_read.io_handle != nullptr) {
+        assert(!async_read.finished);
+        io_handles_to_abort.push_back(async_read.io_handle);
+        states_to_cleanup.push_back(&async_read);
+      }
+    }
+
+    if (!io_handles_to_abort.empty()) {
+      IOStatus abort_status = fs->AbortIO(io_handles_to_abort);
+      if (!abort_status.ok()) {
+#ifndef NDEBUG
+        fprintf(stderr, "Error aborting async IO operations: %s\n",
+                abort_status.ToString().c_str());
+#endif
+        assert(false);
+      }
+      (void)abort_status;  // Suppress unused variable warning
+    }
+
+    for (auto async_read : states_to_cleanup) {
+      async_read->CleanUpIOHandle();
+    }
+  }
+}
+
 // Note:
 // - Iterator should not be reused for multiple multiscans or mixing
 // multiscan with regular iterator usage.
@@ -940,26 +976,292 @@ void BlockBasedTableIterator::Prepare(const MultiScanArgs* multiscan_opts) {
     multi_scan_.reset();
     return;
   }
-  if (multiscan_opts == nullptr || multiscan_opts->empty()) {
+  if (!ValidateScanOptions(multiscan_opts)) {
     return;
   }
 
-  const std::vector<ScanOptions>* scan_opts = &multiscan_opts->GetScanRanges();
-  const bool has_limit = scan_opts->front().range.limit.has_value();
-  if (!has_limit && scan_opts->size() > 1) {
-    // Abort: overlapping ranges
+  std::vector<BlockHandle> scan_block_handles;
+  std::vector<std::tuple<size_t, size_t>> block_index_ranges_per_scan;
+  const std::vector<ScanOptions>& scan_opts = multiscan_opts->GetScanRanges();
+  if (!CollectBlockHandles(scan_opts, &scan_block_handles,
+                           &block_index_ranges_per_scan)) {
+    return;
+  }
+
+  // Pin already cached blocks, collect remaining blocks to read
+  std::vector<size_t> block_indices_to_read;
+  std::vector<CachableEntry<Block>> pinned_data_blocks_guard(
+      scan_block_handles.size());
+  size_t prefetched_max_idx;
+  if (!FilterAndPinCachedBlocks(
+          scan_block_handles, multiscan_opts, &block_indices_to_read,
+          &pinned_data_blocks_guard, &prefetched_max_idx)) {
     return;
   }
 
-  // Validate scan ranges to be increasing and with limit.
-  for (size_t i = 0; i < scan_opts->size(); ++i) {
-    const auto& scan_range = (*scan_opts)[i].range;
+  std::vector<AsyncReadState> async_states;
+  // Maps from block index into async read request (index into async_states[])
+  UnorderedMap<size_t, size_t> block_idx_to_readreq_idx;
+  if (!block_indices_to_read.empty()) {
+    std::vector<FSReadRequest> read_reqs;
+    std::vector<std::vector<size_t>> coalesced_block_indices;
+    PrepareIORequests(block_indices_to_read, scan_block_handles, multiscan_opts,
+                      &read_reqs, &block_idx_to_readreq_idx,
+                      &coalesced_block_indices);
+
+    if (!ExecuteIO(scan_block_handles, multiscan_opts, coalesced_block_indices,
+                   &read_reqs, &async_states, &pinned_data_blocks_guard)) {
+      return;
+    }
+  }
+
+  // Successful Prepare, init related states so the iterator reads from prepared
+  // blocks.
+  multi_scan_ = std::make_unique<MultiScanState>(
+      table_->get_rep()->ioptions.env->GetFileSystem(), multiscan_opts,
+      std::move(pinned_data_blocks_guard),
+      std::move(block_index_ranges_per_scan),
+      std::move(block_idx_to_readreq_idx), std::move(async_states),
+      prefetched_max_idx);
+
+  is_index_at_curr_block_ = false;
+  block_iter_points_to_real_block_ = false;
+}
+
+bool BlockBasedTableIterator::SeekMultiScan(const Slice* target) {
+  assert(multi_scan_);
+  // This is a MultiScan and Preapre() has been called.
+  //
+  // Validate seek key with scan options
+  if (multi_scan_->next_scan_idx >= multi_scan_->scan_opts->size()) {
+    multi_scan_.reset();
+  } else if (!target) {
+    // start key must be set for multi-scan
+    multi_scan_.reset();
+  } else if (user_comparator_.CompareWithoutTimestamp(
+                 ExtractUserKey(*target), /*a_has_ts=*/true,
+                 multi_scan_->scan_opts
+                     ->GetScanRanges()[multi_scan_->next_scan_idx]
+                     .range.start.value(),
+                 /*b_has_ts=*/false) != 0) {
+    // Unexpected seek key
+    multi_scan_.reset();
+  } else {
+    auto [cur_scan_start_idx, cur_scan_end_idx] =
+        multi_scan_->block_index_ranges_per_scan[multi_scan_->next_scan_idx];
+    // We should have the data block already loaded
+    ++multi_scan_->next_scan_idx;
+    if (cur_scan_start_idx >= cur_scan_end_idx) {
+      is_out_of_bound_ = true;
+      assert(!Valid());
+      return true;
+    } else {
+      is_out_of_bound_ = false;
+    }
+
+    if (!block_iter_points_to_real_block_ ||
+        multi_scan_->cur_data_block_idx != cur_scan_start_idx) {
+      if (block_iter_points_to_real_block_) {
+        // Should be scan in increasing key range.
+        // All blocks before cur_data_block_idx_ are not pinned anymore.
+        assert(multi_scan_->cur_data_block_idx < cur_scan_start_idx);
+      }
+
+      ResetDataIter();
+
+      multi_scan_->cur_data_block_idx = cur_scan_start_idx;
+      multi_scan_->status = MultiScanLoadDataBlock(cur_scan_start_idx);
+      if (!multi_scan_->status.ok()) {
+        assert(!Valid());
+        assert(status() == multi_scan_->status);
+        return true;
+      }
+    }
+    multi_scan_->cur_data_block_idx = cur_scan_start_idx;
+    block_iter_points_to_real_block_ = true;
+    block_iter_.Seek(*target);
+    FindKeyForward();
+    return true;
+  }
+
+  // We are aborting MultiScan.
+  ResetDataIter();
+  assert(!is_index_at_curr_block_);
+  assert(!block_iter_points_to_real_block_);
+  return false;
+}
+
+void BlockBasedTableIterator::FindBlockForwardInMultiScan() {
+  assert(multi_scan_);
+  assert(multi_scan_->next_scan_idx >= 1);
+  const auto cur_scan_end_idx = std::get<1>(
+      multi_scan_->block_index_ranges_per_scan[multi_scan_->next_scan_idx - 1]);
+  do {
+    if (!block_iter_.status().ok()) {
+      return;
+    }
+
+    // If is_out_of_bound_ is true, upper layer (LevelIterator) considers this
+    // level has reached iterate_upper_bound_ and will not continue to iterate
+    // into the next file. When we are doing the last scan within a MultiScan
+    // for this file, it may need to continue to scan into the next file, so
+    // we do not set is_out_of_bound_ in this case.
+    if (multi_scan_->cur_data_block_idx + 1 >= cur_scan_end_idx) {
+      if (multi_scan_->next_scan_idx >=
+          multi_scan_->block_index_ranges_per_scan.size()) {
+        // We are done with this file, should let LevelIter advance to the next
+        // file instead of ending the scan
+        ResetDataIter();
+        assert(!is_out_of_bound_);
+        assert(!Valid());
+        return;
+      }
+      // We don't ResetDataIter() here since next scan might be reading from
+      // the same block. ResetDataIter() will free the underlying block cache
+      // handle and we don't want the block to be unpinned.
+      is_out_of_bound_ = true;
+      assert(!Valid());
+      return;
+    }
+    // Move to the next pinned data block
+    ResetDataIter();
+    ++multi_scan_->cur_data_block_idx;
+
+    multi_scan_->status =
+        MultiScanLoadDataBlock(multi_scan_->cur_data_block_idx);
+    if (!multi_scan_->status.ok()) {
+      assert(!Valid());
+      assert(status() == multi_scan_->status);
+      return;
+    }
+
+    block_iter_points_to_real_block_ = true;
+    block_iter_.SeekToFirst();
+  } while (!block_iter_.Valid());
+}
+
+Status BlockBasedTableIterator::PollForBlock(size_t idx) {
+  assert(multi_scan_);
+  const auto async_idx = multi_scan_->block_idx_to_readreq_idx.find(idx);
+  if (async_idx == multi_scan_->block_idx_to_readreq_idx.end()) {
+    // Did not require async read, should already be pinned.
+    assert(multi_scan_->pinned_data_blocks[idx].GetValue());
+    return Status::OK();
+  }
+
+  AsyncReadState& async_read = multi_scan_->async_states[async_idx->second];
+  if (async_read.finished) {
+    assert(async_read.io_handle == nullptr);
+    assert(async_read.status.ok());
+    return async_read.status;
+  }
+
+  {
+    std::vector<void*> handles = {async_read.io_handle};
+    Status poll_s =
+        table_->get_rep()->ioptions.env->GetFileSystem()->Poll(handles, 1);
+    if (!poll_s.ok()) {
+      return poll_s;
+    }
+  }
+  assert(async_read.status.ok());
+  if (!async_read.status.ok()) {
+    return async_read.status;
+  }
+  async_read.CleanUpIOHandle();
+
+  // Initialize and pin blocks from async read result.
+  for (size_t i = 0; i < async_read.blocks.size(); ++i) {
+    const auto& block = async_read.blocks[i];
+
+    Status s = CreateAndPinBlockFromBuffer(
+        block, async_read.offset, async_read.result,
+        multi_scan_->pinned_data_blocks[async_read.block_indices[i]]);
+
+    if (!s.ok()) {
+      return s;
+    }
+    assert(multi_scan_->pinned_data_blocks[async_read.block_indices[i]]
+               .GetValue());
+  }
+  assert(multi_scan_->pinned_data_blocks[idx].GetValue());
+  return Status::OK();
+}
+
+Status BlockBasedTableIterator::CreateAndPinBlockFromBuffer(
+    const BlockHandle& block, uint64_t buffer_start_offset,
+    const Slice& buffer_data, CachableEntry<Block>& pinned_block_entry) {
+  // Get decompressor and handle dictionary loading
+  UnownedPtr<Decompressor> decompressor = table_->get_rep()->decompressor.get();
+  CachableEntry<DecompressorDict> cached_dict;
+
+  if (table_->get_rep()->uncompression_dict_reader) {
+    {
+      Status s =
+          table_->get_rep()
+              ->uncompression_dict_reader->GetOrReadUncompressionDictionary(
+                  /* prefetch_buffer= */ nullptr, read_options_,
+                  /* get_context= */ nullptr, /* lookup_context= */ nullptr,
+                  &cached_dict);
+      if (!s.ok()) {
+#ifndef NDEBUG
+        fprintf(stdout, "Prepare dictionary loading failed with %s\n",
+                s.ToString().c_str());
+#endif
+        return s;
+      }
+    }
+    if (!cached_dict.GetValue()) {
+#ifndef NDEBUG
+      fprintf(stdout, "Success but no dictionary read\n");
+#endif
+      return Status::InvalidArgument("No dictionary found");
+    }
+    decompressor = cached_dict.GetValue()->decompressor_.get();
+  }
+
+  // Create block from buffer data
+  const auto block_size_with_trailer =
+      BlockBasedTable::BlockSizeWithTrailer(block);
+  const auto block_offset_in_buffer = block.offset() - buffer_start_offset;
+
+  CacheAllocationPtr data =
+      AllocateBlock(block_size_with_trailer,
+                    GetMemoryAllocator(table_->get_rep()->table_options));
+  memcpy(data.get(), buffer_data.data() + block_offset_in_buffer,
+         block_size_with_trailer);
+  BlockContents tmp_contents(std::move(data), block.size());
+
+#ifndef NDEBUG
+  tmp_contents.has_trailer =
+      table_->get_rep()->footer.GetBlockTrailerSize() > 0;
+#endif
+
+  return table_->CreateAndPinBlockInCache<Block_kData>(
+      read_options_, block, decompressor, &tmp_contents,
+      &pinned_block_entry.As<Block_kData>());
+}
+
+bool BlockBasedTableIterator::ValidateScanOptions(
+    const MultiScanArgs* multiscan_opts) {
+  if (multiscan_opts == nullptr || multiscan_opts->empty()) {
+    return false;
+  }
+
+  const std::vector<ScanOptions>& scan_opts = multiscan_opts->GetScanRanges();
+  const bool has_limit = scan_opts.front().range.limit.has_value();
+  if (!has_limit && scan_opts.size() > 1) {
+    // Abort: overlapping ranges
+    return false;
+  }
+
+  for (size_t i = 0; i < scan_opts.size(); ++i) {
+    const auto& scan_range = scan_opts[i].range;
     if (!scan_range.start.has_value()) {
       // Abort: no start key
-      return;
+      return false;
     }
 
-    // Assume for each scan range start <= limit.
     if (scan_range.limit.has_value()) {
       assert(user_comparator_.CompareWithoutTimestamp(
                  scan_range.start.value(), /*a_has_ts=*/false,
@@ -969,30 +1271,32 @@ void BlockBasedTableIterator::Prepare(const MultiScanArgs* multiscan_opts) {
     if (i > 0) {
       if (!scan_range.limit.has_value()) {
         // multiple no limit scan ranges
-        return;
+        return false;
       }
 
-      const auto& last_end_key = (*scan_opts)[i - 1].range.limit.value();
+      const auto& last_end_key = scan_opts[i - 1].range.limit.value();
       if (user_comparator_.CompareWithoutTimestamp(
               scan_range.start.value(), /*a_has_ts=*/false, last_end_key,
               /*b_has_ts=*/false) < 0) {
         // Abort: overlapping ranges
-        return;
+        return false;
       }
     }
   }
+  return true;
+}
 
-  // Gather all relevant data block handles
-  std::vector<BlockHandle> blocks_to_prepare;
-  std::vector<std::tuple<size_t, size_t>> block_ranges_per_scan;
-
-  const size_t timestamp_size =
-      user_comparator_.user_comparator()->timestamp_size();
-  for (const auto& scan_opt : *scan_opts) {
+bool BlockBasedTableIterator::CollectBlockHandles(
+    const std::vector<ScanOptions>& scan_opts,
+    std::vector<BlockHandle>* scan_block_handles,
+    std::vector<std::tuple<size_t, size_t>>* block_index_ranges_per_scan) {
+  for (const auto& scan_opt : scan_opts) {
     size_t num_blocks = 0;
-    // Current scan overlap the last block of the previous scan.
-    bool check_overlap = !blocks_to_prepare.empty();
+    bool check_overlap = !scan_block_handles->empty();
+
     InternalKey start_key;
+    const size_t timestamp_size =
+        user_comparator_.user_comparator()->timestamp_size();
     if (timestamp_size == 0) {
       start_key = InternalKey(scan_opt.range.start.value(), kMaxSequenceNumber,
                               kValueTypeForSeek);
@@ -1002,11 +1306,7 @@ void BlockBasedTableIterator::Prepare(const MultiScanArgs* multiscan_opts) {
                                 timestamp_size);
       start_key = InternalKey(seek_key, kMaxSequenceNumber, kValueTypeForSeek);
     }
-
     index_iter_->Seek(start_key.Encode());
-
-    // Scan range is specified in user key, here we seek to the minimum internal
-    // key with this user key.
     while (index_iter_->Valid() &&
            (!scan_opt.range.limit.has_value() ||
             user_comparator_.CompareWithoutTimestamp(
@@ -1014,10 +1314,10 @@ void BlockBasedTableIterator::Prepare(const MultiScanArgs* multiscan_opts) {
                 /*a_has_ts*/ true, *scan_opt.range.limit,
                 /*b_has_ts=*/false) <= 0)) {
       if (check_overlap &&
-          blocks_to_prepare.back() == index_iter_->value().handle) {
+          scan_block_handles->back() == index_iter_->value().handle) {
         // Skip the current block since it's already in the list
       } else {
-        blocks_to_prepare.push_back(index_iter_->value().handle);
+        scan_block_handles->push_back(index_iter_->value().handle);
       }
       ++num_blocks;
       index_iter_->Next();
@@ -1026,17 +1326,15 @@ void BlockBasedTableIterator::Prepare(const MultiScanArgs* multiscan_opts) {
 
     if (!index_iter_->status().ok()) {
       // Abort: index iterator error
-      return;
+      return false;
     }
 
-    // Stop until index->key > limit
-    // Include the current block since it can still contain keys <= limit
     if (index_iter_->Valid()) {
       if (check_overlap &&
-          blocks_to_prepare.back() == index_iter_->value().handle) {
+          scan_block_handles->back() == index_iter_->value().handle) {
         // Skip adding the current block since it's already in the list
       } else {
-        blocks_to_prepare.push_back(index_iter_->value().handle);
+        scan_block_handles->push_back(index_iter_->value().handle);
       }
       ++num_blocks;
     } else if (num_blocks == 0) {
@@ -1044,368 +1342,249 @@ void BlockBasedTableIterator::Prepare(const MultiScanArgs* multiscan_opts) {
       // range. This is important for FindBlockForwardInMultiScan() which only
       // lets the upper layer (LevelIterator) advance to the next SST file when
       // the last scan range is exhausted.
-      return;
+      return false;
     }
     assert(num_blocks);
-    block_ranges_per_scan.emplace_back(blocks_to_prepare.size() - num_blocks,
-                                       blocks_to_prepare.size());
+    block_index_ranges_per_scan->emplace_back(
+        scan_block_handles->size() - num_blocks, scan_block_handles->size());
   }
+  return true;
+}
 
-  // blocks_to_prepare has all the blocks that need to be read.
-  // Look up entries in cache and pin if exist.
-  // Store indices of blocks to read.
-  std::vector<size_t> blocks_to_read;
-  std::vector<CachableEntry<Block>> pinned_data_blocks_guard(
-      blocks_to_prepare.size());
+bool BlockBasedTableIterator::FilterAndPinCachedBlocks(
+    const std::vector<BlockHandle>& scan_block_handles,
+    const MultiScanArgs* multiscan_opts,
+    std::vector<size_t>* block_indices_to_read,
+    std::vector<CachableEntry<Block>>* pinned_data_blocks_guard,
+    size_t* prefetched_max_idx) {
   uint64_t total_prefetch_size = 0;
+  *prefetched_max_idx = scan_block_handles.size();
 
-  for (size_t i = 0; i < blocks_to_prepare.size(); ++i) {
-    const auto& data_block_handle = blocks_to_prepare[i];
+  for (size_t i = 0; i < scan_block_handles.size(); ++i) {
+    const auto& data_block_handle = scan_block_handles[i];
 
-    // Check if we would exceed the prefetch size limit with this block
     total_prefetch_size +=
         BlockBasedTable::BlockSizeWithTrailer(data_block_handle);
     if (multiscan_opts->max_prefetch_size > 0 &&
         total_prefetch_size > multiscan_opts->max_prefetch_size) {
-      // All remaining blocks are by default empty.
-      for (size_t j = i; j < blocks_to_prepare.size(); ++j) {
-        assert(pinned_data_blocks_guard[j].IsEmpty());
+      for (size_t j = i; j < scan_block_handles.size(); ++j) {
+        assert((*pinned_data_blocks_guard)[j].IsEmpty());
       }
+      *prefetched_max_idx = i;
       break;
     }
 
     Status s = table_->LookupAndPinBlocksInCache<Block_kData>(
         read_options_, data_block_handle,
-        &pinned_data_blocks_guard[i].As<Block_kData>());
+        &(*pinned_data_blocks_guard)[i].As<Block_kData>());
 
     if (!s.ok()) {
       // Abort: block cache look up failed.
-      return;
+      return false;
+    }
+    if (!(*pinned_data_blocks_guard)[i].GetValue()) {
+      // Block not in cache
+      block_indices_to_read->emplace_back(i);
     }
-    if (!pinned_data_blocks_guard[i].GetValue()) {
-      // Block not in cache, will read it below.
-      blocks_to_read.emplace_back(i);
+  }
+  return true;
+}
+
+void BlockBasedTableIterator::PrepareIORequests(
+    const std::vector<size_t>& block_indices_to_read,
+    const std::vector<BlockHandle>& scan_block_handles,
+    const MultiScanArgs* multiscan_opts, std::vector<FSReadRequest>* read_reqs,
+    UnorderedMap<size_t, size_t>* block_idx_to_readreq_idx,
+    std::vector<std::vector<size_t>>* coalesced_block_indices) {
+  assert(coalesced_block_indices->empty());
+  coalesced_block_indices->resize(1);
+
+  for (const auto& block_idx : block_indices_to_read) {
+    if (!coalesced_block_indices->back().empty()) {
+      // Check if we can coalesce.
+      const auto& last_block_handle =
+          scan_block_handles[coalesced_block_indices->back().back()];
+      uint64_t last_block_end =
+          last_block_handle.offset() +
+          BlockBasedTable::BlockSizeWithTrailer(last_block_handle);
+      uint64_t current_start = scan_block_handles[block_idx].offset();
+
+      if (current_start >
+          last_block_end + multiscan_opts->io_coalesce_threshold) {
+        // new IO
+        coalesced_block_indices->emplace_back();
+      }
     }
+    coalesced_block_indices->back().emplace_back(block_idx);
   }
 
-  // Coalesce IOs
-  // TODO: limit prefetching size to bound memory usage.
-  if (!blocks_to_read.empty()) {
-    // Each vector correspond to blocks to read in a single read request.
-    // Each member in the vector is an index into blocks_to_prepare.
-    std::vector<std::vector<size_t>> collapsed_blocks_to_read(1);
-
-    for (const auto& block_idx : blocks_to_read) {
-      if (!collapsed_blocks_to_read.back().empty()) {
-        // Check if we can coalesce.
-        const auto& last_block =
-            blocks_to_prepare[collapsed_blocks_to_read.back().back()];
-        uint64_t last_block_end =
-            last_block.offset() +
-            BlockBasedTable::BlockSizeWithTrailer(last_block);
-        uint64_t current_start = blocks_to_prepare[block_idx].offset();
-
-        if (current_start >
-            last_block_end + multiscan_opts->io_coalesce_threshold) {
-          // new IO
-          collapsed_blocks_to_read.emplace_back();
+  assert(read_reqs->empty());
+  read_reqs->reserve(coalesced_block_indices->size());
+  for (const auto& block_indices : *coalesced_block_indices) {
+    assert(block_indices.size());
+    const auto& first_block_handle = scan_block_handles[block_indices[0]];
+    const auto& last_block_handle = scan_block_handles[block_indices.back()];
+
+    const auto start_offset = first_block_handle.offset();
+    const auto end_offset =
+        last_block_handle.offset() +
+        BlockBasedTable::BlockSizeWithTrailer(last_block_handle);
+#ifndef NDEBUG
+    // Debug print for failing the assertion below.
+    if (start_offset >= end_offset) {
+      fprintf(stderr, "scan_block_handles: ");
+      for (const auto& block : scan_block_handles) {
+        fprintf(stderr, "offset: %" PRIu64 ", size: %" PRIu64 "; ",
+                block.offset(), block.size());
+      }
+      fprintf(stderr,
+              "\nfirst block - offset: %" PRIu64 ", size: %" PRIu64 "\n",
+              first_block_handle.offset(), first_block_handle.size());
+      fprintf(stderr, "last block - offset: %" PRIu64 ", size: %" PRIu64 "\n",
+              last_block_handle.offset(), last_block_handle.size());
+
+      fprintf(stderr, "coalesced_block_indices: ");
+      for (const auto& b : *coalesced_block_indices) {
+        fprintf(stderr, "[");
+        for (const auto& block_idx : b) {
+          fprintf(stderr, "%zu ", block_idx);
         }
+        fprintf(stderr, "] ");
+      }
+      fprintf(stderr, "\ncurrent blocks: ");
+      for (const auto& block_idx : block_indices) {
+        fprintf(stderr, "offset: %" PRIu64 ", size: %" PRIu64 "; ",
+                scan_block_handles[block_idx].offset(),
+                scan_block_handles[block_idx].size());
       }
-      collapsed_blocks_to_read.back().emplace_back(block_idx);
+      fprintf(stderr, "\n");
     }
+#endif  // NDEBUG
+    assert(end_offset > start_offset);
 
-    // do IO
-    IOOptions io_opts;
-    {
-      Status s =
-          table_->get_rep()->file->PrepareIOOptions(read_options_, io_opts);
-      if (!s.ok()) {
-        // Abort: PrepareIOOptions failed
-        return;
+    read_reqs->emplace_back();
+    read_reqs->back().offset = start_offset;
+    read_reqs->back().len = end_offset - start_offset;
+
+    if (multiscan_opts->use_async_io) {
+      for (const auto& block_idx : block_indices) {
+        (*block_idx_to_readreq_idx)[block_idx] = read_reqs->size() - 1;
       }
     }
+  }
+}
 
-    // Init read requests for Multi-Read
-    std::vector<FSReadRequest> read_reqs;
-    read_reqs.reserve(collapsed_blocks_to_read.size());
-    size_t total_len = 0;
-    for (const auto& blocks : collapsed_blocks_to_read) {
-      assert(blocks.size());
-      const auto& first_block = blocks_to_prepare[blocks[0]];
-      const auto& last_block = blocks_to_prepare[blocks.back()];
-
-      const auto start_offset = first_block.offset();
-      const auto end_offset = last_block.offset() +
-                              BlockBasedTable::BlockSizeWithTrailer(last_block);
+bool BlockBasedTableIterator::ExecuteIO(
+    const std::vector<BlockHandle>& scan_block_handles,
+    const MultiScanArgs* multiscan_opts,
+    const std::vector<std::vector<size_t>>& coalesced_block_indices,
+    std::vector<FSReadRequest>* read_reqs,
+    std::vector<AsyncReadState>* async_states,
+    std::vector<CachableEntry<Block>>* pinned_data_blocks_guard) {
+  IOOptions io_opts;
+  if (!table_->get_rep()->file->PrepareIOOptions(read_options_, io_opts).ok()) {
+    // Abort: PrepareIOOptions failed
+    return false;
+  }
+  const bool direct_io = table_->get_rep()->file->use_direct_io();
+
+  if (multiscan_opts->use_async_io) {
+    async_states->resize(read_reqs->size());
+    for (size_t i = 0; i < read_reqs->size(); ++i) {
+      auto& read_req = (*read_reqs)[i];
+      auto& async_read = (*async_states)[i];
+
+      async_read.finished = false;
+      async_read.offset = read_req.offset;
+      async_read.block_indices = coalesced_block_indices[i];
+      for (const auto idx : coalesced_block_indices[i]) {
+        async_read.blocks.emplace_back(scan_block_handles[idx]);
+      }
+
+      if (direct_io) {
+        read_req.scratch = nullptr;
+      } else {
+        async_read.buf.reset(new char[read_req.len]);
+        read_req.scratch = async_read.buf.get();
+      }
+
+      auto cb = std::bind(&BlockBasedTableIterator::PrepareReadAsyncCallBack,
+                          this, std::placeholders::_1, std::placeholders::_2);
+      // TODO: for mmap, io_handle will not be set but callback will already
+      // be called.
+      Status s = table_->get_rep()->file.get()->ReadAsync(
+          read_req, io_opts, cb, &async_read, &async_read.io_handle,
+          &async_read.del_fn, direct_io ? &async_read.aligned_buf : nullptr);
+      if (!s.ok()) {
 #ifndef NDEBUG
-      // Debug print for failing the assertion below.
-      if (start_offset >= end_offset) {
-        fprintf(stderr, "blocks_to_prepare: ");
-        for (const auto& block : blocks_to_prepare) {
-          fprintf(stderr, "offset: %" PRIu64 ", size: %" PRIu64 "; ",
-                  block.offset(), block.size());
-        }
-        fprintf(stderr,
-                "\nfirst block - offset: %" PRIu64 ", size: %" PRIu64 "\n",
-                first_block.offset(), first_block.size());
-        fprintf(stderr, "last block - offset: %" PRIu64 ", size: %" PRIu64 "\n",
-                last_block.offset(), last_block.size());
-
-        fprintf(stderr, "collapsed_blocks_to_read: ");
-        for (const auto& b : collapsed_blocks_to_read) {
-          fprintf(stderr, "[");
-          for (const auto& block_idx : b) {
-            fprintf(stderr, "%zu ", block_idx);
-          }
-          fprintf(stderr, "] ");
-        }
-        fprintf(stderr, "\ncurrent blocks: ");
-        for (const auto& block_idx : blocks) {
-          fprintf(stderr, "offset: %" PRIu64 ", size: %" PRIu64 "; ",
-                  blocks_to_prepare[block_idx].offset(),
-                  blocks_to_prepare[block_idx].size());
+        fprintf(stderr, "ReadAsync failed with %s\n", s.ToString().c_str());
+#endif
+        assert(false);
+        return false;
+      }
+      assert(async_read.io_handle);
+      for (auto& req : *read_reqs) {
+        if (!req.status.ok()) {
+          assert(false);
+          return false;
         }
-        fprintf(stderr, "\n");
       }
-#endif  // NDEBUG
-      assert(end_offset > start_offset);
-      FSReadRequest read_req;
-      read_req.offset = start_offset;
-      read_req.len = end_offset - start_offset;
-      total_len += read_req.len;
-      read_reqs.emplace_back(std::move(read_req));
     }
-
-    // Init buffer for read
+  } else {
+    // Synchronous IO using MultiRead
     std::unique_ptr<char[]> buf;
-    const bool direct_io = table_->get_rep()->file->use_direct_io();
+
     if (direct_io) {
-      for (auto& read_req : read_reqs) {
+      for (auto& read_req : *read_reqs) {
         read_req.scratch = nullptr;
       }
     } else {
       // TODO: optimize if FSSupportedOps::kFSBuffer is supported.
+      size_t total_len = 0;
+      for (const auto& req : *read_reqs) {
+        total_len += req.len;
+      }
       buf.reset(new char[total_len]);
       size_t offset = 0;
-      for (auto& read_req : read_reqs) {
+      for (auto& read_req : *read_reqs) {
         read_req.scratch = buf.get() + offset;
         offset += read_req.len;
       }
     }
 
     AlignedBuf aligned_buf;
-    {
-      Status s = table_->get_rep()->file.get()->MultiRead(
-          io_opts, read_reqs.data(), read_reqs.size(),
-          direct_io ? &aligned_buf : nullptr);
-      if (!s.ok()) {
-        return;
-      }
+    Status s = table_->get_rep()->file->MultiRead(
+        io_opts, read_reqs->data(), read_reqs->size(),
+        direct_io ? &aligned_buf : nullptr);
+    if (!s.ok()) {
+      return false;
     }
-    for (auto& req : read_reqs) {
+    for (auto& req : *read_reqs) {
       if (!req.status.ok()) {
-        return;
-      }
-    }
-
-    // Get compression dictionary if available - needed for dictionary-aware
-    // decompression
-    UnownedPtr<Decompressor> decompressor =
-        table_->get_rep()->decompressor.get();
-    CachableEntry<DecompressorDict> cached_dict;
-    if (table_->get_rep()->uncompression_dict_reader) {
-      Status s =
-          table_->get_rep()
-              ->uncompression_dict_reader->GetOrReadUncompressionDictionary(
-                  /* prefetch_buffer= */ nullptr, read_options_,
-                  /* get_context= */ nullptr, /* lookup_context= */ nullptr,
-                  &cached_dict);
-      if (!s.ok()) {
-#ifndef NDEBUG
-        fprintf(stdout, "Prepare dictionary loading failed with %s\n",
-                s.ToString().c_str());
-#endif
-        // Abort: dictionary lookup failed.
-        return;
-      }
-      if (!cached_dict.GetValue()) {
-#ifndef NDEBUG
-        fprintf(stdout, "Success but no dictionary read\n");
-#endif
-        return;
+        return false;
       }
-      decompressor = cached_dict.GetValue()->decompressor_.get();
     }
 
     // Init blocks and pin them in block cache.
-    MemoryAllocator* memory_allocator =
-        table_->get_rep()->table_options.block_cache->memory_allocator();
-    for (size_t i = 0; i < collapsed_blocks_to_read.size(); i++) {
-      const auto& blocks = collapsed_blocks_to_read[i];
-      const auto& read_req = read_reqs[i];
-      for (const auto& block_idx : blocks) {
-        const auto& block = blocks_to_prepare[block_idx];
-        const auto block_size_with_trailer =
-            BlockBasedTable::BlockSizeWithTrailer(block);
-        const auto block_offset_in_buffer = block.offset() - read_req.offset;
-
-        CacheAllocationPtr data =
-            AllocateBlock(block_size_with_trailer, memory_allocator);
-        memcpy(data.get(), read_req.result.data() + block_offset_in_buffer,
-               block_size_with_trailer);
-        BlockContents tmp_contents(std::move(data), block.size());
-
-#ifndef NDEBUG
-        tmp_contents.has_trailer =
-            table_->get_rep()->footer.GetBlockTrailerSize() > 0;
-#endif
-        assert(pinned_data_blocks_guard[block_idx].IsEmpty());
-        Status s = table_->CreateAndPinBlockInCache<Block_kData>(
-            read_options_, block, decompressor, &tmp_contents,
-            &(pinned_data_blocks_guard[block_idx].As<Block_kData>()));
+    assert(read_reqs->size() == coalesced_block_indices.size());
+    for (size_t i = 0; i < coalesced_block_indices.size(); i++) {
+      const auto& read_req = (*read_reqs)[i];
+      for (const auto& block_idx : coalesced_block_indices[i]) {
+        const auto& block = scan_block_handles[block_idx];
+
+        assert((*pinned_data_blocks_guard)[block_idx].IsEmpty());
+        s = CreateAndPinBlockFromBuffer(block, read_req.offset, read_req.result,
+                                        (*pinned_data_blocks_guard)[block_idx]);
         if (!s.ok()) {
-#ifndef NDEBUG
-          fprintf(stdout, "Prepare failed with %s\n", s.ToString().c_str());
-#endif
+          assert(false);
           // Abort: failed to create and pin block in cache
-          return;
+          return false;
         }
-        assert(pinned_data_blocks_guard[block_idx].GetValue());
+        assert((*pinned_data_blocks_guard)[block_idx].GetValue());
       }
     }
   }
-
-  // Successful Prepare, init related states so the iterator reads from prepared
-  // blocks
-  multi_scan_.reset(new MultiScanState(multiscan_opts,
-                                       std::move(pinned_data_blocks_guard),
-                                       std::move(block_ranges_per_scan)));
-  is_index_at_curr_block_ = false;
-  block_iter_points_to_real_block_ = false;
-}
-
-bool BlockBasedTableIterator::SeekMultiScan(const Slice* target) {
-  assert(multi_scan_);
-  // This is a MultiScan and Preapre() has been called.
-  //
-  // Validate seek key with scan options
-  if (multi_scan_->next_scan_idx >= multi_scan_->scan_opts->size()) {
-    multi_scan_.reset();
-  } else if (!target) {
-    // start key must be set for multi-scan
-    multi_scan_.reset();
-  } else if (user_comparator_.CompareWithoutTimestamp(
-                 ExtractUserKey(*target), /*a_has_ts=*/true,
-                 multi_scan_->scan_opts
-                     ->GetScanRanges()[multi_scan_->next_scan_idx]
-                     .range.start.value(),
-                 /*b_has_ts=*/false) != 0) {
-    // Unexpected seek key
-    multi_scan_.reset();
-  } else {
-    auto [cur_scan_start_idx, cur_scan_end_idx] =
-        multi_scan_->block_ranges_per_scan[multi_scan_->next_scan_idx];
-    // We should have the data block already loaded
-    ++multi_scan_->next_scan_idx;
-    if (cur_scan_start_idx >= cur_scan_end_idx) {
-      is_out_of_bound_ = true;
-      assert(!Valid());
-      return true;
-    } else {
-      is_out_of_bound_ = false;
-    }
-
-    if (!block_iter_points_to_real_block_ ||
-        multi_scan_->cur_data_block_idx != cur_scan_start_idx) {
-      if (block_iter_points_to_real_block_) {
-        // Should be scan in increasing key range.
-        // All blocks before cur_data_block_idx_ are not pinned anymore.
-        assert(multi_scan_->cur_data_block_idx < cur_scan_start_idx);
-      }
-
-      ResetDataIter();
-
-      // Check if we've hit an empty entry indicating prefetch limit reached
-      if (multi_scan_->pinned_data_blocks[cur_scan_start_idx].IsEmpty()) {
-        multi_scan_->cur_data_block_idx = cur_scan_start_idx;
-        multi_scan_->prefetch_limit_reached = true;
-        assert(!Valid());
-        assert(status().IsPrefetchLimitReached());
-        return true;
-      }
-
-      // Note that the block_iter_ takes ownership of the pinned data block
-      // TODO: we can delegate the clean up like with pinned_iters_mgr_ if
-      // need to pin blocks longer.
-      table_->NewDataBlockIterator<DataBlockIter>(
-          read_options_, multi_scan_->pinned_data_blocks[cur_scan_start_idx],
-          &block_iter_, Status::OK());
-    }
-    multi_scan_->cur_data_block_idx = cur_scan_start_idx;
-    block_iter_points_to_real_block_ = true;
-    block_iter_.Seek(*target);
-    FindKeyForward();
-    return true;
-  }
-
-  // We are aborting MultiScan.
-  ResetDataIter();
-  assert(!is_index_at_curr_block_);
-  assert(!block_iter_points_to_real_block_);
-  return false;
+  return true;
 }
 
-void BlockBasedTableIterator::FindBlockForwardInMultiScan() {
-  assert(multi_scan_);
-  assert(multi_scan_->next_scan_idx >= 1);
-  const auto cur_scan_end_idx = std::get<1>(
-      multi_scan_->block_ranges_per_scan[multi_scan_->next_scan_idx - 1]);
-  do {
-    if (!block_iter_.status().ok()) {
-      return;
-    }
-
-    // If is_out_of_bound_ is true, upper layer (LevelIterator) considers this
-    // level has reached iterate_upper_bound_ and will not continue to iterate
-    // into the next file. When we are doing the last scan within a MultiScan
-    // for this file, it may need to continue to scan into the next file, so
-    // we do not set is_out_of_bound_ in this case.
-    if (multi_scan_->cur_data_block_idx + 1 >= cur_scan_end_idx) {
-      if (multi_scan_->next_scan_idx >=
-          multi_scan_->block_ranges_per_scan.size()) {
-        // We are done with this file, should let LevelIter advance to the next
-        // file instead of ending the scan
-        ResetDataIter();
-        assert(!is_out_of_bound_);
-        assert(!Valid());
-        return;
-      }
-      // We don't ResetDataIter() here since next scan might be reading from
-      // the same block. ResetDataIter() will free the underlying block cache
-      // handle and we don't want the block to be unpinned.
-      is_out_of_bound_ = true;
-      assert(!Valid());
-      return;
-    }
-    // Move to the next pinned data block
-    ResetDataIter();
-    ++multi_scan_->cur_data_block_idx;
-
-    // Check if we've hit an empty entry indicating prefetch limit reached
-    if (multi_scan_->pinned_data_blocks[multi_scan_->cur_data_block_idx]
-            .IsEmpty()) {
-      multi_scan_->prefetch_limit_reached = true;
-      assert(!Valid());
-      assert(status().IsPrefetchLimitReached());
-      return;
-    }
-
-    table_->NewDataBlockIterator<DataBlockIter>(
-        read_options_,
-        multi_scan_->pinned_data_blocks[multi_scan_->cur_data_block_idx],
-        &block_iter_, Status::OK());
-    block_iter_points_to_real_block_ = true;
-    block_iter_.SeekToFirst();
-  } while (!block_iter_.Valid());
-}
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/table/block_based/block_based_table_iterator.h b/table/block_based/block_based_table_iterator.h
index dfc9cf479083..0b1ad3348f2a 100644
--- a/table/block_based/block_based_table_iterator.h
+++ b/table/block_based/block_based_table_iterator.h
@@ -150,9 +150,8 @@ class BlockBasedTableIterator : public InternalIteratorBase<Slice> {
     } else if (async_read_in_progress_) {
       assert(!multi_scan_);
       return Status::TryAgain("Async read in progress");
-    } else if (multi_scan_ && multi_scan_->prefetch_limit_reached) {
-      assert(!Valid());
-      return Status::PrefetchLimitReached();
+    } else if (multi_scan_) {
+      return multi_scan_->status;
     } else {
       return Status::OK();
     }
@@ -377,32 +376,97 @@ class BlockBasedTableIterator : public InternalIteratorBase<Slice> {
   // *** END States used by both regular scan and multiscan
 
   // *** BEGIN MultiScan related states ***
+  struct AsyncReadState {
+    std::unique_ptr<char[]> buf{nullptr};
+    // Indices into pinned_data_blocks that this request reads.
+    std::vector<size_t> block_indices;
+    // BlockHandle for each block in block_indices.
+    std::vector<BlockHandle> blocks;
+    void* io_handle{nullptr};
+    IOHandleDeleter del_fn{nullptr};
+    // offset for this async read request.
+    uint64_t offset{0};
+
+    // These two states are populated from the FSReadRequest
+    // by ReadAsync callback
+    Status status;
+    Slice result;
+
+    // For direct I/O support
+    AlignedBuf aligned_buf{nullptr};
+
+    bool finished{false};
+
+    AsyncReadState() = default;
+    DECLARE_DEFAULT_MOVES(AsyncReadState);
+    // Delete copy operations
+    AsyncReadState(const AsyncReadState&) = delete;
+    AsyncReadState& operator=(const AsyncReadState&) = delete;
+
+    void CleanUpIOHandle() {
+      if (io_handle != nullptr) {
+        assert(del_fn);
+        del_fn(io_handle);
+        io_handle = nullptr;
+      }
+      finished = true;
+    }
+
+    ~AsyncReadState() {
+      // Should be cleaned up before destruction.
+      assert(io_handle == nullptr);
+    }
+  };
+
   struct MultiScanState {
-    // bool prepared_ = false;
+    // For Aborting async I/Os in destructor.
+    const std::shared_ptr<FileSystem> fs;
     const MultiScanArgs* scan_opts;
     std::vector<CachableEntry<Block>> pinned_data_blocks;
 
-    // Indicies into multiscan_pinned_data_blocks_ for data blocks that are
-    // relevant for each scan range.
+    // Indicies into pinned_data_blocks for data blocks for each scan range.
     // inclusive start, exclusive end
-    std::vector<std::tuple<size_t, size_t>> block_ranges_per_scan;
+    std::vector<std::tuple<size_t, size_t>> block_index_ranges_per_scan;
     size_t next_scan_idx;
     size_t cur_data_block_idx;
 
-    // When true, the iterator will return
-    // Status::Incomplete(Status::kPrefetchLimitReached).
-    bool prefetch_limit_reached;
+    // States for async reads.
+    //
+    // Each async state correspond to an async read request.
+    // Each async read request may read content for multiple blocks
+    // (potentially coalesced). In PollForBlock(idx), we will poll for the
+    // completion of the async read request responsible for
+    // pinned_data_blocks[idx], and populate `pinned_data_blocks` with all the
+    // blocks read. To find out the async read request responsible for
+    // pinned_data_blocks[idx], we store the mapping in
+    // block_idx_to_readreq_idx. Index i is in block_idx_to_readreq_idx and
+    // block_idx_to_readreq_idx[i] = j iff pinned_data_blocks[i] is read by
+    // async_states[j].
+    std::vector<AsyncReadState> async_states;
+    UnorderedMap<size_t, size_t> block_idx_to_readreq_idx;
+    Status status;
+    size_t prefetch_max_idx;
 
     MultiScanState(
-        const MultiScanArgs* _scan_opts,
+        const std::shared_ptr<FileSystem>& _fs, const MultiScanArgs* _scan_opts,
         std::vector<CachableEntry<Block>>&& _pinned_data_blocks,
-        std::vector<std::tuple<size_t, size_t>>&& _block_ranges_per_scan)
-        : scan_opts(_scan_opts),
+        std::vector<std::tuple<size_t, size_t>>&& _block_index_ranges_per_scan,
+        UnorderedMap<size_t, size_t>&& _block_idx_to_readreq_idx,
+        std::vector<AsyncReadState>&& _async_states, size_t _prefetch_max_idx)
+        : fs(_fs),
+          scan_opts(_scan_opts),
           pinned_data_blocks(std::move(_pinned_data_blocks)),
-          block_ranges_per_scan(std::move(_block_ranges_per_scan)),
+          block_index_ranges_per_scan(std::move(_block_index_ranges_per_scan)),
           next_scan_idx(0),
           cur_data_block_idx(0),
-          prefetch_limit_reached(false) {}
+          async_states(std::move(_async_states)),
+          block_idx_to_readreq_idx(std::move(_block_idx_to_readreq_idx)),
+          status(Status::OK()),
+          prefetch_max_idx(_prefetch_max_idx) {
+      status.PermitUncheckedError();
+    }
+
+    ~MultiScanState();
   };
 
   std::unique_ptr<MultiScanState> multi_scan_;
@@ -524,10 +588,100 @@ class BlockBasedTableIterator : public InternalIteratorBase<Slice> {
   // *** END APIs relevant to auto tuning of readahead_size ***
 
   // *** BEGIN APIs relevant to multiscan ***
-  // Returns true iff seek is successful.
+
+  // Returns true iff we should fallback to regular scan.
   bool SeekMultiScan(const Slice* target);
 
   void FindBlockForwardInMultiScan();
+
+  void PrepareReadAsyncCallBack(FSReadRequest& req, void* cb_arg) {
+    // Record status, result and sanity check offset from `req`.
+    AsyncReadState* async_state = static_cast<AsyncReadState*>(cb_arg);
+
+    async_state->status = req.status;
+    async_state->result = req.result;
+
+    if (async_state->status.ok()) {
+      assert(async_state->offset == req.offset);
+      if (async_state->offset != req.offset) {
+        async_state->status = Status::InvalidArgument(
+            "offset mismatch between async read request " +
+            std::to_string(async_state->offset) + " and async callback " +
+            std::to_string(req.offset));
+      }
+    } else {
+      assert(async_state->status.IsAborted());
+    }
+  }
+
+  Status MultiScanLoadDataBlock(size_t idx) {
+    if (idx >= multi_scan_->prefetch_max_idx) {
+      return Status::PrefetchLimitReached();
+    }
+
+    if (!multi_scan_->async_states.empty()) {
+      Status s = PollForBlock(idx);
+      if (!s.ok()) {
+        return s;
+      }
+    }
+    // This block should have been initialized
+    assert(multi_scan_->pinned_data_blocks[idx].GetValue());
+    // Note that the block_iter_ takes ownership of the pinned data block
+    // TODO: we can delegate the clean up like with pinned_iters_mgr_ if
+    // need to pin blocks longer.
+    table_->NewDataBlockIterator<DataBlockIter>(
+        read_options_, multi_scan_->pinned_data_blocks[idx], &block_iter_,
+        Status::OK());
+    return Status::OK();
+  }
+
+  // After PollForBlock(idx), the async request that contains
+  // pinned_data_blocks[idx] should be done, and all blocks contained in this
+  // read request will be initialzed in pinned_data_blocks and pinned in block
+  // cache.
+  Status PollForBlock(size_t idx);
+
+  // Helper function to create and pin a block in cache from buffer data
+  // Handles decompressor setup with dictionary loading and block
+  // creation/pinning. The buffer_start_offset is the file offset where
+  // buffer_data starts.
+  Status CreateAndPinBlockFromBuffer(const BlockHandle& block,
+                                     uint64_t buffer_start_offset,
+                                     const Slice& buffer_data,
+                                     CachableEntry<Block>& pinned_block_entry);
+
+  // Helper functions for Prepare():
+  bool ValidateScanOptions(const MultiScanArgs* multiscan_opts);
+
+  bool CollectBlockHandles(
+      const std::vector<ScanOptions>& scan_opts,
+      std::vector<BlockHandle>* scan_block_handles,
+      std::vector<std::tuple<size_t, size_t>>* block_index_ranges_per_scan);
+
+  bool FilterAndPinCachedBlocks(
+      const std::vector<BlockHandle>& scan_block_handles,
+      const MultiScanArgs* multiscan_opts,
+      std::vector<size_t>* block_indices_to_read,
+      std::vector<CachableEntry<Block>>* pinned_data_blocks_guard,
+      size_t* prefetched_max_idx);
+
+  void PrepareIORequests(
+      const std::vector<size_t>& block_indices_to_read,
+      const std::vector<BlockHandle>& scan_block_handles,
+      const MultiScanArgs* multiscan_opts,
+      std::vector<FSReadRequest>* read_reqs,
+      UnorderedMap<size_t, size_t>* block_idx_to_readreq_idx,
+      std::vector<std::vector<size_t>>* coalesced_block_indices);
+
+  bool ExecuteIO(
+      const std::vector<BlockHandle>& scan_block_handles,
+      const MultiScanArgs* multiscan_opts,
+      const std::vector<std::vector<size_t>>& coalesced_block_indices,
+      std::vector<FSReadRequest>* read_reqs,
+      std::vector<AsyncReadState>* async_states,
+      std::vector<CachableEntry<Block>>* pinned_data_blocks_guard);
+
   // *** END APIs relevant to multiscan ***
 };
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/table/block_based/block_based_table_reader_test.cc b/table/block_based/block_based_table_reader_test.cc
index 41728894e76d..2010d36bc055 100644
--- a/table/block_based/block_based_table_reader_test.cc
+++ b/table/block_based/block_based_table_reader_test.cc
@@ -28,6 +28,9 @@
 #include "test_util/testutil.h"
 #include "util/random.h"
 
+// Enable io_uring support for this test
+extern "C" bool RocksDbIOUringEnable() { return true; }
+
 namespace ROCKSDB_NAMESPACE {
 
 class BlockBasedTableReaderBaseTest : public testing::Test {
@@ -169,8 +172,9 @@ class BlockBasedTableReaderBaseTest : public testing::Test {
         false /* _force_direct_prefetch */, -1 /* _level */,
         nullptr /* _block_cache_tracer */,
         0 /* _max_file_size_for_l0_meta_pin */, "" /* _cur_db_session_id */,
-        0 /* _cur_file_num */, {} /* _unique_id */, 0 /* _largest_seqno */,
-        0 /* _tail_size */, user_defined_timestamps_persisted);
+        table_num_++ /* _cur_file_num */, {} /* _unique_id */,
+        0 /* _largest_seqno */, 0 /* _tail_size */,
+        user_defined_timestamps_persisted);
 
     std::unique_ptr<RandomAccessFileReader> file;
     NewFileReader(table_name, foptions, &file, ioptions.statistics.get());
@@ -202,6 +206,7 @@ class BlockBasedTableReaderBaseTest : public testing::Test {
   Env* env_;
   std::shared_ptr<FileSystem> fs_;
   Options options_;
+  uint64_t table_num_{0};
 
  private:
   void WriteToFile(const std::string& content, const std::string& filename) {
@@ -993,6 +998,7 @@ TEST_P(BlockBasedTableReaderTestVerifyChecksum, ChecksumMismatch) {
   ASSERT_EQ(s.code(), Status::kCorruption);
 }
 
+// TODO: test no block cache case
 TEST_P(BlockBasedTableReaderTest, MultiScanPrepare) {
   std::ostringstream param_trace;
   param_trace << "[MultiScanPrepare] Test params: " << "CompressionType="
@@ -1004,176 +1010,213 @@ TEST_P(BlockBasedTableReaderTest, MultiScanPrepare) {
               << compression_parallel_threads_
               << ", CompressionDictBytes=" << compression_dict_bytes_
               << ", SameKeyDiffTs=" << (same_key_diff_ts_ ? "true" : "false");
-  std::cout << param_trace.str() << std::endl;
-
-  Options options;
-  options.statistics = CreateDBStatistics();
-  ReadOptions read_opts;
-  size_t ts_sz = options.comparator->timestamp_size();
-  std::vector<std::pair<std::string, std::string>> kv =
-      BlockBasedTableReaderBaseTest::GenerateKVMap(
-          100 /* num_block */,
-          true /* mixed_with_human_readable_string_value */, ts_sz);
-
-  std::string table_name = "BlockBasedTableReaderTest_NewIterator" +
-                           CompressionTypeToString(compression_type_);
-
-  ImmutableOptions ioptions(options);
-  CreateTable(table_name, ioptions, compression_type_, kv,
-              compression_parallel_threads_, compression_dict_bytes_);
-
-  std::unique_ptr<BlockBasedTable> table;
-  FileOptions foptions;
-  foptions.use_direct_reads = true;
-  InternalKeyComparator comparator(options.comparator);
-  NewBlockBasedTableReader(foptions, ioptions, comparator, table_name, &table,
-                           true /* bool prefetch_index_and_filter_in_cache */,
-                           nullptr /* status */, persist_udt_);
-
-  std::unique_ptr<InternalIterator> iter;
-  iter.reset(table->NewIterator(
-      read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
-      /*skip_filters=*/false, TableReaderCaller::kUncategorized));
-
-  // Should coalesce into a single I/O
-  MultiScanArgs scan_options(BytewiseComparator());
-  scan_options.insert(ExtractUserKey(kv[0].first),
-                      ExtractUserKey(kv[kEntriesPerBlock].first));
-  scan_options.insert(ExtractUserKey(kv[2 * kEntriesPerBlock].first),
-                      ExtractUserKey(kv[3 * kEntriesPerBlock].first));
-
-  auto read_count_before =
-      options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
-  iter->Prepare(&scan_options);
-  auto read_count_after =
-      options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
-  ASSERT_EQ(read_count_before + 1, read_count_after);
-  iter->Seek(kv[0].first);
-  for (size_t i = 0; i < kEntriesPerBlock + 1; ++i) {
-    ASSERT_TRUE(iter->Valid());
-    ASSERT_EQ(iter->key().ToString(), kv[i].first);
-    iter->Next();
-  }
-  // Iter may still be valid after scan range. Upper layer (DBIter) handles
-  // exact upper bound checking. So we don't check !iter->Valid() here.
-  ASSERT_OK(iter->status());
-  iter->Seek(kv[2 * kEntriesPerBlock].first);
-  for (size_t i = 2 * kEntriesPerBlock; i < 3 * kEntriesPerBlock; ++i) {
-    ASSERT_TRUE(iter->Valid());
-    ASSERT_EQ(iter->key().ToString(), kv[i].first);
-    iter->Next();
-  }
-  ASSERT_OK(iter->status());
-  // No I/O expected during scanning since all blocks were loaded and pinned.
-  ASSERT_EQ(read_count_after,
-            options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT));
-
-  iter.reset(table->NewIterator(
-      read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
-      /*skip_filters=*/false, TableReaderCaller::kUncategorized));
-  // No IO coalesce, should do MultiRead with 2 read requests.
-  scan_options = MultiScanArgs(BytewiseComparator());
-  scan_options.insert(ExtractUserKey(kv[70 * kEntriesPerBlock].first),
-                      ExtractUserKey(kv[75 * kEntriesPerBlock].first));
-  scan_options.insert(ExtractUserKey(kv[90 * kEntriesPerBlock].first),
-                      ExtractUserKey(kv[95 * kEntriesPerBlock].first));
-
-  read_count_before =
-      options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
-  iter->Prepare(&scan_options);
-  read_count_after =
-      options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
-  ASSERT_EQ(read_count_before + 2, read_count_after);
-
-  iter->Seek(kv[70 * kEntriesPerBlock].first);
-  for (size_t i = 70 * kEntriesPerBlock; i < 75 * kEntriesPerBlock; ++i) {
-    ASSERT_TRUE(iter->Valid());
-    ASSERT_EQ(iter->key().ToString(), kv[i].first);
-    iter->Next();
-  }
-  ASSERT_OK(iter->status());
-  iter->Seek(kv[90 * kEntriesPerBlock].first);
-  for (size_t i = 90 * kEntriesPerBlock; i < 95 * kEntriesPerBlock; ++i) {
-    ASSERT_TRUE(iter->Valid());
-    ASSERT_EQ(iter->key().ToString(), kv[i].first);
-    iter->Next();
-  }
-  ASSERT_OK(iter->status());
-
-  iter.reset(table->NewIterator(
-      read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
-      /*skip_filters=*/false, TableReaderCaller::kUncategorized));
-  // Should do two I/Os since blocks 80-81 and 90-95 are already in block cache,
-  // reads from blocks 50-79 and 82-.. are coalesced.
-  scan_options = MultiScanArgs(BytewiseComparator());
-  scan_options.insert(ExtractUserKey(kv[50 * kEntriesPerBlock].first));
-  read_count_before =
-      options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
-  iter->Prepare(&scan_options);
-  read_count_after =
-      options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
-  ASSERT_EQ(read_count_before + 3, read_count_after);
-  iter->Seek(kv[50 * kEntriesPerBlock].first);
-  for (size_t i = 50 * kEntriesPerBlock; i < 100 * kEntriesPerBlock; ++i) {
-    ASSERT_TRUE(iter->Valid());
-    ASSERT_EQ(iter->key().ToString(), kv[i].first);
-    iter->Next();
-  }
-  ASSERT_FALSE(iter->Valid());
-  ASSERT_OK(iter->status());
-  ASSERT_EQ(read_count_after,
-            options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT));
+  SCOPED_TRACE(param_trace.str());
+
+  for (bool fill_cache : {false, true}) {
+    SCOPED_TRACE(std::string("fill_cache=") + std::to_string(fill_cache));
+    for (bool use_async_io : {false,
+#ifdef ROCKSDB_IOURING_PRESENT
+                              true
+#endif
+         }) {
+      SCOPED_TRACE(std::string("use_async_io=") + std::to_string(use_async_io));
+      Options options;
+      options.statistics = CreateDBStatistics();
+      std::shared_ptr<FileSystem> fs = options.env->GetFileSystem();
+      ReadOptions read_opts;
+      read_opts.fill_cache = fill_cache;
+      size_t ts_sz = options.comparator->timestamp_size();
+      std::vector<std::pair<std::string, std::string>> kv =
+          BlockBasedTableReaderBaseTest::GenerateKVMap(
+              100 /* num_block */,
+              true /* mixed_with_human_readable_string_value */, ts_sz);
+      std::string table_name = "BlockBasedTableReaderTest_NewIterator" +
+                               CompressionTypeToString(compression_type_) +
+                               "_async" + std::to_string(use_async_io);
+      ImmutableOptions ioptions(options);
+      CreateTable(table_name, ioptions, compression_type_, kv,
+                  compression_parallel_threads_, compression_dict_bytes_);
+
+      std::unique_ptr<BlockBasedTable> table;
+      FileOptions foptions;
+      foptions.use_direct_reads = use_direct_reads_;
+      InternalKeyComparator comparator(options.comparator);
+      NewBlockBasedTableReader(
+          foptions, ioptions, comparator, table_name, &table,
+          true /* bool prefetch_index_and_filter_in_cache */,
+          nullptr /* status */, persist_udt_);
+
+      // 1. Should coalesce into a single I/O
+      std::unique_ptr<InternalIterator> iter;
+      iter.reset(table->NewIterator(
+          read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
+          /*skip_filters=*/false, TableReaderCaller::kUncategorized));
+
+      MultiScanArgs scan_options(BytewiseComparator());
+      scan_options.use_async_io = use_async_io;
+      scan_options.insert(ExtractUserKey(kv[0].first),
+                          ExtractUserKey(kv[kEntriesPerBlock].first));
+      scan_options.insert(ExtractUserKey(kv[2 * kEntriesPerBlock].first),
+                          ExtractUserKey(kv[3 * kEntriesPerBlock].first));
+      auto read_count_before =
+          options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
+
+      iter->Prepare(&scan_options);
+      iter->Seek(kv[0].first);
+      for (size_t i = 0; i < kEntriesPerBlock + 1; ++i) {
+        ASSERT_TRUE(iter->status().ok()) << iter->status().ToString();
+        ASSERT_TRUE(iter->Valid()) << i;
+        ASSERT_EQ(iter->key().ToString(), kv[i].first);
+        iter->Next();
+      }
+      // Iter may still be valid after scan range. Upper layer (DBIter) handles
+      // exact upper bound checking. So we don't check !iter->Valid() here.
+      ASSERT_OK(iter->status());
+      iter->Seek(kv[2 * kEntriesPerBlock].first);
+      for (size_t i = 2 * kEntriesPerBlock; i < 3 * kEntriesPerBlock; ++i) {
+        ASSERT_TRUE(iter->Valid());
+        ASSERT_EQ(iter->key().ToString(), kv[i].first);
+        iter->Next();
+      }
+      ASSERT_OK(iter->status());
+      auto read_count_after =
+          options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
+      ASSERT_EQ(read_count_before + 1, read_count_after);
+
+      // 2. No IO coalesce, should do MultiRead/ReadAsync with 2 read requests.
+      iter.reset(table->NewIterator(
+          read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
+          /*skip_filters=*/false, TableReaderCaller::kUncategorized));
+      scan_options = MultiScanArgs(BytewiseComparator());
+      scan_options.insert(ExtractUserKey(kv[70 * kEntriesPerBlock].first),
+                          ExtractUserKey(kv[75 * kEntriesPerBlock].first));
+      scan_options.insert(ExtractUserKey(kv[90 * kEntriesPerBlock].first),
+                          ExtractUserKey(kv[95 * kEntriesPerBlock].first));
+
+      read_count_before =
+          options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
+      iter->Prepare(&scan_options);
+
+      iter->Seek(kv[70 * kEntriesPerBlock].first);
+      for (size_t i = 70 * kEntriesPerBlock; i < 75 * kEntriesPerBlock; ++i) {
+        ASSERT_TRUE(iter->Valid());
+        ASSERT_EQ(iter->key().ToString(), kv[i].first);
+        iter->Next();
+      }
+      ASSERT_OK(iter->status());
+      iter->Seek(kv[90 * kEntriesPerBlock].first);
+      for (size_t i = 90 * kEntriesPerBlock; i < 95 * kEntriesPerBlock; ++i) {
+        ASSERT_TRUE(iter->Valid());
+        ASSERT_EQ(iter->key().ToString(), kv[i].first);
+        iter->Next();
+      }
+      ASSERT_OK(iter->status());
+
+      read_count_after =
+          options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
+      ASSERT_EQ(read_count_before + 2, read_count_after);
+
+      iter.reset(table->NewIterator(
+          read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
+          /*skip_filters=*/false, TableReaderCaller::kUncategorized));
+
+      // 3. Tests I/O excludes blocks already in cache.
+      // Reading blocks from 50-99
+      // From reads above, blocks 70-75 and 90-95 already in cache
+      // So we should read 50-70 76-89 96-99 in three I/Os.
+      // If fill_cache is false, then we'll do one giant I/O.
+      scan_options = MultiScanArgs(BytewiseComparator());
+      scan_options.use_async_io = use_async_io;
+      scan_options.insert(ExtractUserKey(kv[50 * kEntriesPerBlock].first));
+      read_count_before =
+          options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
+      iter->Prepare(&scan_options);
+      read_count_after =
+          options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
+      if (!use_async_io) {
+        if (!fill_cache) {
+          ASSERT_EQ(read_count_before + 1, read_count_after);
+        } else {
+          ASSERT_EQ(read_count_before + 3, read_count_after);
+        }
+      } else {
+        // stat is recorded in async callback which happens in Poll(), and
+        // Poll() happens during scanning.
+        ASSERT_EQ(read_count_before, read_count_after);
+      }
 
-  // Check cases when Seek key does not match start key in ScanOptions
-  iter.reset(table->NewIterator(
-      read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
-      /*skip_filters=*/false, TableReaderCaller::kUncategorized));
-  scan_options = MultiScanArgs(BytewiseComparator());
-  scan_options.insert(ExtractUserKey(kv[10 * kEntriesPerBlock].first),
-                      ExtractUserKey(kv[20 * kEntriesPerBlock].first));
-  scan_options.insert(ExtractUserKey(kv[30 * kEntriesPerBlock].first),
-                      ExtractUserKey(kv[40 * kEntriesPerBlock].first));
-  iter->Prepare(&scan_options);
-  // Match start key
-  iter->Seek(kv[10 * kEntriesPerBlock].first);
-  for (size_t i = 10 * kEntriesPerBlock; i < 20 * kEntriesPerBlock; ++i) {
-    ASSERT_TRUE(iter->Valid());
-    ASSERT_EQ(iter->key().ToString(), kv[i].first);
-    iter->Next();
-  }
-  ASSERT_OK(iter->status());
-  // Does not match start key of the second ScanOptions.
-  iter->Seek(kv[50 * kEntriesPerBlock + 1].first);
-  for (size_t i = 50 * kEntriesPerBlock + 1; i < 100 * kEntriesPerBlock; ++i) {
-    ASSERT_TRUE(iter->Valid());
-    ASSERT_EQ(iter->key().ToString(), kv[i].first);
-    iter->Next();
-  }
-  ASSERT_FALSE(iter->Valid());
-  ASSERT_OK(iter->status());
+      iter->Seek(kv[50 * kEntriesPerBlock].first);
+      for (size_t i = 50 * kEntriesPerBlock; i < 100 * kEntriesPerBlock; ++i) {
+        ASSERT_TRUE(iter->Valid());
+        ASSERT_EQ(iter->key().ToString(), kv[i].first);
+        iter->Next();
+      }
+      ASSERT_FALSE(iter->Valid());
+      ASSERT_OK(iter->status());
+      read_count_after =
+          options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
+      if (!fill_cache) {
+        ASSERT_EQ(read_count_before + 1, read_count_after);
+      } else {
+        ASSERT_EQ(read_count_before + 3, read_count_after);
+      }
 
-  iter.reset(table->NewIterator(
-      read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
-      /*skip_filters=*/false, TableReaderCaller::kUncategorized));
-  scan_options = MultiScanArgs(BytewiseComparator());
-  scan_options.insert(ExtractUserKey(kv[10 * kEntriesPerBlock].first));
-  scan_options.insert(ExtractUserKey(kv[11 * kEntriesPerBlock].first));
-  iter->Prepare(&scan_options);
-  // Does not match the first ScanOptions.
-  iter->SeekToFirst();
-  for (size_t i = 0; i < kEntriesPerBlock; ++i) {
-    ASSERT_TRUE(iter->Valid());
-    ASSERT_EQ(iter->key().ToString(), kv[i].first);
-    iter->Next();
-  }
-  ASSERT_OK(iter->status());
-  iter->Seek(kv[10 * kEntriesPerBlock].first);
-  for (size_t i = 10 * kEntriesPerBlock; i < 12 * kEntriesPerBlock; ++i) {
-    ASSERT_TRUE(iter->Valid());
-    ASSERT_EQ(iter->key().ToString(), kv[i].first);
-    iter->Next();
+      // 4. Check cases when Seek key does not match start key in ScanOptions
+      iter.reset(table->NewIterator(
+          read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
+          /*skip_filters=*/false, TableReaderCaller::kUncategorized));
+      scan_options = MultiScanArgs(BytewiseComparator());
+      scan_options.use_async_io = use_async_io;
+      scan_options.insert(ExtractUserKey(kv[10 * kEntriesPerBlock].first),
+                          ExtractUserKey(kv[20 * kEntriesPerBlock].first));
+      scan_options.insert(ExtractUserKey(kv[30 * kEntriesPerBlock].first),
+                          ExtractUserKey(kv[40 * kEntriesPerBlock].first));
+      iter->Prepare(&scan_options);
+      // Match start key
+      iter->Seek(kv[10 * kEntriesPerBlock].first);
+      for (size_t i = 10 * kEntriesPerBlock; i < 20 * kEntriesPerBlock; ++i) {
+        ASSERT_TRUE(iter->Valid());
+        ASSERT_EQ(iter->key().ToString(), kv[i].first);
+        iter->Next();
+      }
+      ASSERT_OK(iter->status());
+
+      // Does not match start key of the second ScanOptions.
+      iter->Seek(kv[50 * kEntriesPerBlock + 1].first);
+      for (size_t i = 50 * kEntriesPerBlock + 1; i < 100 * kEntriesPerBlock;
+           ++i) {
+        ASSERT_TRUE(iter->Valid());
+        ASSERT_EQ(iter->key().ToString(), kv[i].first);
+        iter->Next();
+      }
+      ASSERT_FALSE(iter->Valid());
+      ASSERT_OK(iter->status());
+
+      iter.reset(table->NewIterator(
+          read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
+          /*skip_filters=*/false, TableReaderCaller::kUncategorized));
+      scan_options = MultiScanArgs(BytewiseComparator());
+      scan_options.use_async_io = use_async_io;
+      scan_options.insert(ExtractUserKey(kv[10 * kEntriesPerBlock].first));
+      scan_options.insert(ExtractUserKey(kv[11 * kEntriesPerBlock].first));
+      iter->Prepare(&scan_options);
+      // Does not match the first ScanOptions.
+      iter->SeekToFirst();
+      for (size_t i = 0; i < kEntriesPerBlock; ++i) {
+        ASSERT_TRUE(iter->Valid());
+        ASSERT_EQ(iter->key().ToString(), kv[i].first);
+        iter->Next();
+      }
+      ASSERT_OK(iter->status());
+      iter->Seek(kv[10 * kEntriesPerBlock].first);
+      for (size_t i = 10 * kEntriesPerBlock; i < 12 * kEntriesPerBlock; ++i) {
+        ASSERT_TRUE(iter->Valid());
+        ASSERT_EQ(iter->key().ToString(), kv[i].first);
+        iter->Next();
+      }
+      ASSERT_OK(iter->status());
+    }
   }
-  ASSERT_OK(iter->status());
 }
 
 TEST_P(BlockBasedTableReaderTest, MultiScanPrefetchSizeLimit) {
diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc
index 727275233e30..003576da5a5a 100644
--- a/tools/db_bench_tool.cc
+++ b/tools/db_bench_tool.cc
@@ -1851,6 +1851,10 @@ DEFINE_uint64(multiscan_coalesce_threshold,
               ROCKSDB_NAMESPACE::MultiScanArgs().io_coalesce_threshold,
               "Configures io coalescing threshold for multiscans");
 
+DEFINE_bool(multiscan_use_async_io,
+            ROCKSDB_NAMESPACE::MultiScanArgs().use_async_io,
+            "Sets MultiScanArgs::use_async_io");
+
 namespace ROCKSDB_NAMESPACE {
 namespace {
 static Status CreateMemTableRepFactory(
@@ -6414,10 +6418,12 @@ class Benchmark {
     options.readahead_size = readahead;
 
     Duration duration(FLAGS_duration, reads_);
-    while (!duration.Done(1)) {
+    int64_t num_keys = 1;
+    while (!duration.Done(num_keys)) {
       DB* db = SelectDB(thread);
       MultiScanArgs opts;
       opts.io_coalesce_threshold = FLAGS_multiscan_coalesce_threshold;
+      opts.use_async_io = FLAGS_multiscan_use_async_io;
       std::vector<std::unique_ptr<const char[]>> guards;
       opts.reserve(multiscan_size);
       // We create 1 random start, and then multiscan will start from that
@@ -6444,13 +6450,14 @@ class Benchmark {
 
       auto iter =
           db->NewMultiScan(read_options_, db->DefaultColumnFamily(), opts);
+      int64_t keys = 0;
       for (auto rng : *iter) {
-        [[maybe_unused]] size_t keys = 0;
         for ([[maybe_unused]] auto it : rng) {
           keys++;
         }
         assert(keys > 0);
       }
+      num_keys = std::max<int64_t>(1, keys);
 
       if (thread->shared->read_rate_limiter.get() != nullptr) {
         thread->shared->read_rate_limiter->Request(
diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index 9edb85ba4f0d..cdba233e5156 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -136,7 +136,7 @@ def setup_random_seed_before_main():
     "max_key": random.choice([100000, 25000000]),
     "max_sequential_skip_in_iterations": lambda: random.choice([1, 2, 8, 16]),
     "max_write_buffer_number": 3,
-    "mmap_read": lambda: random.randint(0, 1),
+    "mmap_read": lambda: random.choice([0, 0, 1]),
     # Setting `nooverwritepercent > 0` is only possible because we do not vary
     # the random seed, so the same keys are chosen by every run for disallowing
     # overwrites.
@@ -387,6 +387,7 @@ def setup_random_seed_before_main():
     "use_multiscan": random.choice([1] + [0] * 3),
     # By default, `statistics` use kExceptDetailedTimers level
     "statistics": random.choice([0, 1]),
+    "multiscan_use_async_io": random.randint(0, 1),
 }
 
 _TEST_DIR_ENV_VAR = "TEST_TMPDIR"
@@ -757,6 +758,7 @@ def finalize_and_sanitize(src_params):
     if dest_params["mmap_read"] == 1:
         dest_params["use_direct_io_for_flush_and_compaction"] = 0
         dest_params["use_direct_reads"] = 0
+        dest_params["multiscan_use_async_io"] = 0
     if (
         dest_params["use_direct_io_for_flush_and_compaction"] == 1
         or dest_params["use_direct_reads"] == 1
@@ -1154,7 +1156,6 @@ def finalize_and_sanitize(src_params):
     ):
         dest_params["use_multiscan"] = 0
     if dest_params.get("use_multiscan") == 1:
-        dest_params["fill_cache"] = 1
         dest_params["async_io"] = 0
     return dest_params
 
diff --git a/unreleased_history/new_features/multi-scan-async-io.md b/unreleased_history/new_features/multi-scan-async-io.md
new file mode 100644
index 000000000000..b8be3ce39bfc
--- /dev/null
+++ b/unreleased_history/new_features/multi-scan-async-io.md
@@ -0,0 +1 @@
+* Introduce option MultiScanArgs::use_async_io to enable asynchronous I/O during MultiScan, instead of waiting for I/O to be done in Prepare().

From 7c3472b4d96a201e5fa87affabbbdd6bba480ca6 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Tue, 16 Sep 2025 16:51:33 -0700
Subject: [PATCH 280/500] Work around GCC TSAN bug (#13958)

Summary:
... reporting false positive double-lock on some of the new parallel compression code. Switching from std::condition_variable to condition_variable_any simply changes the FP from double-lock to lock inversion. In addition, leaking ParallelCompressionRep instances to avoid memory location reuse fails to fix the FP reports. Thus, I've decided to disable the watchdog with GCC+TSAN.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13958

Test Plan: local crash test runs could reproduce, now don't reproduce. CLANG TSAN doesn't seem to be reporting the same supposed issues

Reviewed By: xingbowang

Differential Revision: D82555968

Pulled By: pdillinger

fbshipit-source-id: 537fbc3a787f917915a6faf0bdedd1449a7f378a
---
 .../block_based/block_based_table_builder.cc  | 52 +++++++++++--------
 1 file changed, 30 insertions(+), 22 deletions(-)

diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc
index 3339be673eb2..17b0a06ff2b4 100644
--- a/table/block_based/block_based_table_builder.cc
+++ b/table/block_based/block_based_table_builder.cc
@@ -358,7 +358,14 @@ struct BlockBasedTableBuilder::ParallelCompressionRep {
   static constexpr int32_t kMaxWakeupInterval = 8;
   // END fields for use by the emit thread only
 
-#ifndef NDEBUG
+  // TSAN on GCC has bugs that report false positives on this watchdog code.
+  // Other efforts to work around the bug have failed, so to avoid those false
+  // positive reports, we simply disable the watchdog when running under GCC
+  // TSAN.
+#if !defined(NDEBUG) && !(defined(__GNUC__) && defined(__SANITIZE_THREAD__))
+#define BBTB_PC_WATCHDOG 1
+#endif
+#ifdef BBTB_PC_WATCHDOG
   // These are for an extra "watchdog" thread in DEBUG builds that heuristically
   // checks for the most likely deadlock conditions. False positives and false
   // negatives are technically possible.
@@ -370,7 +377,7 @@ struct BlockBasedTableBuilder::ParallelCompressionRep {
   RelaxedAtomic<uint32_t> idling_workers{0};
   RelaxedAtomic<bool> live_emit{0};
   RelaxedAtomic<bool> idling_emit{0};
-#endif  // !NDEBUG
+#endif  // BBTB_PC_WATCHDOG
 
   int ComputeRingBufferNbits(uint32_t parallel_threads) {
     // Ring buffer size is a power of two not to exceed 32 but otherwise
@@ -631,19 +638,22 @@ struct BlockBasedTableBuilder::ParallelCompressionRep {
         }
         // Handle idle state
         if constexpr (thread_kind == ThreadKind::kEmitter) {
-#ifndef NDEBUG
-          // Tracking for watchdog
+#ifdef BBTB_PC_WATCHDOG
           idling_emit.StoreRelaxed(true);
           Defer decr{[this]() { idling_emit.StoreRelaxed(false); }};
-#endif
-          idle_emit_sem.Acquire();  // Likely block
+#endif  // BBTB_PC_WATCHDOG
+
+          // Likely go to sleep
+          idle_emit_sem.Acquire();
         } else {
-#ifndef NDEBUG
+#ifdef BBTB_PC_WATCHDOG
           // Tracking for watchdog
           idling_workers.FetchAddRelaxed(1);
           Defer decr{[this]() { idling_workers.FetchSubRelaxed(1); }};
-#endif
-          idle_worker_sem.Acquire();  // Likely block
+#endif  // BBTB_PC_WATCHDOG
+
+          // Likely go to sleep
+          idle_worker_sem.Acquire();
         }
         // Update state after sleep
         seen_state = atomic_state.Load();
@@ -704,7 +714,7 @@ struct BlockBasedTableBuilder::ParallelCompressionRep {
     thread_state = ThreadState::kEnd;
   }
 
-#ifndef NDEBUG
+#ifdef BBTB_PC_WATCHDOG
   // Logic for the extra "watchdog" thread in DEBUG builds that heuristically
   // checks for the most likely deadlock conditions.
   //
@@ -757,7 +767,7 @@ struct BlockBasedTableBuilder::ParallelCompressionRep {
       }
     }
   }
-#endif  // !NDEBUG
+#endif  // BBTB_PC_WATCHDOG
 };
 
 struct BlockBasedTableBuilder::Rep {
@@ -1716,11 +1726,10 @@ void BlockBasedTableBuilder::WriteBlock(const Slice& uncompressed_block_data,
 
 void BlockBasedTableBuilder::BGWorker(WorkingAreaPair& working_area) {
   auto& pc_rep = *rep_->pc_rep;
-#ifndef NDEBUG
-  // Tracking for watchdog
+#ifdef BBTB_PC_WATCHDOG
   pc_rep.live_workers.FetchAddRelaxed(1);
   Defer decr{[&pc_rep]() { pc_rep.live_workers.FetchSubRelaxed(1); }};
-#endif  // !NDEBUG
+#endif  // BBTB_PC_WATCHDOG
   ParallelCompressionRep::ThreadState thread_state =
       ParallelCompressionRep::ThreadState::kIdle;
   uint32_t slot = 0;
@@ -2026,11 +2035,11 @@ void BlockBasedTableBuilder::MaybeStartParallelCompression() {
     }
     pc_rep.worker_threads.emplace_back([this, &wa] { BGWorker(wa); });
   }
-#ifndef NDEBUG
-  // Start watchdog thread in DEBUG builds
+#ifdef BBTB_PC_WATCHDOG
+  // Start watchdog thread
   pc_rep.watchdog_thread = std::thread([&pc_rep] { pc_rep.BGWatchdog(); });
   pc_rep.live_emit.StoreRelaxed(true);
-#endif  // !NDEBUG
+#endif  // BBTB_PC_WATCHDOG
 }
 
 void BlockBasedTableBuilder::StopParallelCompression(bool abort) {
@@ -2043,15 +2052,14 @@ void BlockBasedTableBuilder::StopParallelCompression(bool abort) {
     assert(rep_->props.num_data_blocks == 0);
     pc_rep.SetNoMoreToEmit(pc_rep.emit_thread_state, pc_rep.emit_slot);
   }
-#ifndef NDEBUG
-  // Tracking for watchdog
+#ifdef BBTB_PC_WATCHDOG
   pc_rep.live_emit.StoreRelaxed(false);
-#endif  // !NDEBUG
+#endif  // BBTB_PC_WATCHDOG
   assert(pc_rep.emit_thread_state == ParallelCompressionRep::ThreadState::kEnd);
   for (auto& thread : pc_rep.worker_threads) {
     thread.join();
   }
-#ifndef NDEBUG
+#ifdef BBTB_PC_WATCHDOG
   // Wake & shutdown watchdog thread
   {
     std::unique_lock<std::mutex> lock(pc_rep.watchdog_mutex);
@@ -2059,7 +2067,7 @@ void BlockBasedTableBuilder::StopParallelCompression(bool abort) {
     pc_rep.watchdog_cv.notify_all();
   }
   pc_rep.watchdog_thread.join();
-#endif  // !NDEBUG
+#endif  // BBTB_PC_WATCHDOG
   rep_->pc_rep.reset();
 }
 

From 95813a84cd1d5d4cf20e216eb313e5ef80b95934 Mon Sep 17 00:00:00 2001
From: Xingbo Wang <xbw@fb.com>
Date: Tue, 16 Sep 2025 17:43:02 -0700
Subject: [PATCH 281/500] Fix error from transactiondb layer in stress test
 (#13950)

Summary:
The stress test runs concurrent transactions through many threads at the same time on a shared key space. It is possible that a dead lock or a timeout is detected from the transactiondb layer. When this happens, simply return from the function and continue the test, instead of fail the test.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13950

Test Plan: Stress test pass locally with the same random seed from stress test 14723229280871643749.

Reviewed By: hx235

Differential Revision: D82373959

Pulled By: xingbowang

fbshipit-source-id: 5d72e89998171c5844fb22f13d8f061f81014c7d
---
 db_stress_tool/no_batched_ops_stress.cc | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/db_stress_tool/no_batched_ops_stress.cc b/db_stress_tool/no_batched_ops_stress.cc
index 93294423da2f..471c24a64ce8 100644
--- a/db_stress_tool/no_batched_ops_stress.cc
+++ b/db_stress_tool/no_batched_ops_stress.cc
@@ -3156,13 +3156,15 @@ class NonBatchedOpsStressTest : public StressTest {
 
       Status s;
 
+      ExpectedValue new_expected_value;
+
       switch (op) {
         case Op::PutOrPutEntity:
         case Op::Merge: {
           ExpectedValue put_value;
           put_value.SyncPut(static_cast<uint32_t>(thread->rand.Uniform(
               static_cast<int>(ExpectedValue::GetValueBaseMask()))));
-          ryw_expected_values[k] = put_value;
+          new_expected_value = put_value;
 
           const uint32_t value_base = put_value.GetValueBase();
 
@@ -3186,7 +3188,7 @@ class NonBatchedOpsStressTest : public StressTest {
         case Op::Delete: {
           ExpectedValue delete_value;
           delete_value.SyncDelete();
-          ryw_expected_values[k] = delete_value;
+          new_expected_value = delete_value;
 
           s = txn->Delete(cfh, k);
           break;
@@ -3195,6 +3197,20 @@ class NonBatchedOpsStressTest : public StressTest {
           assert(false);
       }
 
+      // It is possible that multiple thread concurrently try to write to the
+      // same key, which could cause lock timeout or deadlock in the
+      // transactiondb layer, before transaction is rolled back.
+      // E.g.
+      // Timestamp 1: Transaction A: lock key M for write
+      // Timestamp 2: Transaction B: lock key N for write
+      // Timestamp 3: Transaction B: try to lock key M for write -> wait
+      // Timestamp 4: Transaction A: try to lock key N for write -> deadlock
+      if (s.IsTimedOut() || s.IsDeadlock()) {
+        return;
+      }
+
+      ryw_expected_values[k] = new_expected_value;
+
       if (!s.ok()) {
         fprintf(stderr,
                 "Transaction write error in read-your-own-write test: %s\n",

From 3c85aa8a69059cc82e45b95702be691eefeab0e4 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Wed, 17 Sep 2025 08:43:19 -0700
Subject: [PATCH 282/500] Some follow-up to parallel compression revamp
 (#13959)

Summary:
* Fix compaction/flush CPU usage stats to include CPU usage by parallel compression workers. (Validated with manual db_bench testing.)
* Disable the parallel compression framework when compression is disabled. See new code comment for details, because in theory it could be useful to hide SST write latency, but manual testing with db_bench and -rate_limiter_bytes_per_sec or -simulate_hdd options shows no useful increase in throughput, just more CPU usage.
* Fix some minor clean-up items in the implementation

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13959

Test Plan: Also ran some tests like in https://github.com/facebook/rocksdb/issues/13910 to ensure the new CPU usage tracking did not regress performance, all good.

Reviewed By: xingbowang

Differential Revision: D82556686

Pulled By: pdillinger

fbshipit-source-id: 77c522159a7e6ab0ab6f7fb1d662070a46661557
---
 db/builder.cc                                 |  3 ++
 db/compaction/compaction_job.cc               |  3 +-
 db/compaction/compaction_outputs.cc           |  1 +
 db/compaction/compaction_outputs.h            |  7 ++++
 db/compaction/subcompaction_state.h           |  8 +++++
 db/flush_job.cc                               |  4 +--
 .../block_based/block_based_table_builder.cc  | 32 ++++++++++++++++++-
 table/block_based/block_based_table_builder.h |  2 ++
 table/table_builder.h                         |  5 +++
 .../bug_fixes/compaction_cpu.md               |  1 +
 util/bit_fields.h                             |  1 -
 11 files changed, 61 insertions(+), 6 deletions(-)
 create mode 100644 unreleased_history/bug_fixes/compaction_cpu.md

diff --git a/db/builder.cc b/db/builder.cc
index 854958f2478e..14e943f3212e 100644
--- a/db/builder.cc
+++ b/db/builder.cc
@@ -342,6 +342,9 @@ Status BuildTable(
     if (s.ok() && !empty) {
       if (flush_stats) {
         flush_stats->bytes_written_pre_comp = builder->PreCompressionSize();
+        // Add worker CPU micros here. Caller needs to add CPU micros from
+        // calling thread.
+        flush_stats->cpu_micros += builder->GetWorkerCPUMicros();
       }
       uint64_t file_size = builder->FileSize();
       meta->fd.file_size = file_size;
diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc
index f63115319761..33380362a3ac 100644
--- a/db/compaction/compaction_job.cc
+++ b/db/compaction/compaction_job.cc
@@ -1578,9 +1578,8 @@ void CompactionJob::FinalizeSubcompactionJobStats(
                                            cur_cpu_micros, prev_cpu_micros);
 
   // Finalize timing and I/O statistics
-
   sub_compact->compaction_job_stats.cpu_micros =
-      cur_cpu_micros - start_cpu_micros;
+      cur_cpu_micros - start_cpu_micros + sub_compact->GetWorkerCPUMicros();
 
   if (measure_io_stats_) {
     sub_compact->compaction_job_stats.file_write_nanos +=
diff --git a/db/compaction/compaction_outputs.cc b/db/compaction/compaction_outputs.cc
index 287dd98c106c..5351e7d33edf 100644
--- a/db/compaction/compaction_outputs.cc
+++ b/db/compaction/compaction_outputs.cc
@@ -56,6 +56,7 @@ Status CompactionOutputs::Finish(
   stats_.bytes_written += current_bytes;
   stats_.bytes_written_pre_comp += builder_->PreCompressionSize();
   stats_.num_output_files = static_cast<int>(outputs_.size());
+  worker_cpu_micros_ += builder_->GetWorkerCPUMicros();
 
   return s;
 }
diff --git a/db/compaction/compaction_outputs.h b/db/compaction/compaction_outputs.h
index de9a1741492e..ed7b8a3cdea4 100644
--- a/db/compaction/compaction_outputs.h
+++ b/db/compaction/compaction_outputs.h
@@ -168,6 +168,10 @@ class CompactionOutputs {
 
   uint64_t NumEntries() const { return builder_->NumEntries(); }
 
+  uint64_t GetWorkerCPUMicros() const {
+    return worker_cpu_micros_ + (builder_ ? builder_->GetWorkerCPUMicros() : 0);
+  }
+
   void ResetBuilder() {
     builder_.reset();
     current_output_file_size_ = 0;
@@ -296,6 +300,9 @@ class CompactionOutputs {
   uint64_t current_output_file_size_ = 0;
   SequenceNumber smallest_preferred_seqno_ = kMaxSequenceNumber;
 
+  // Sum of all the GetWorkerCPUMicros() for all the closed builders so far.
+  uint64_t worker_cpu_micros_ = 0;
+
   // all the compaction outputs so far
   std::vector<Output> outputs_;
 
diff --git a/db/compaction/subcompaction_state.h b/db/compaction/subcompaction_state.h
index 14e11bcf2452..3f417b97eaa9 100644
--- a/db/compaction/subcompaction_state.h
+++ b/db/compaction/subcompaction_state.h
@@ -191,6 +191,14 @@ class SubcompactionState {
     return &compaction_outputs_.stats_;
   }
 
+  uint64_t GetWorkerCPUMicros() const {
+    uint64_t rv = compaction_outputs_.GetWorkerCPUMicros();
+    if (compaction->SupportsPerKeyPlacement()) {
+      rv += proximal_level_outputs_.GetWorkerCPUMicros();
+    }
+    return rv;
+  }
+
   CompactionRangeDelAggregator* RangeDelAgg() const {
     return range_del_agg_.get();
   }
diff --git a/db/flush_job.cc b/db/flush_job.cc
index 6bed0afb2d96..12f94d7e7e7a 100644
--- a/db/flush_job.cc
+++ b/db/flush_job.cc
@@ -1104,13 +1104,13 @@ Status FlushJob::WriteLevel0Table() {
   const uint64_t micros = clock_->NowMicros() - start_micros;
   const uint64_t cpu_micros = clock_->CPUMicros() - start_cpu_micros;
   flush_stats.micros = micros;
-  flush_stats.cpu_micros = cpu_micros;
+  flush_stats.cpu_micros += cpu_micros;
 
   ROCKS_LOG_INFO(db_options_.info_log,
                  "[%s] [JOB %d] Flush lasted %" PRIu64
                  " microseconds, and %" PRIu64 " cpu microseconds.\n",
                  cfd_->GetName().c_str(), job_context_->job_id, micros,
-                 cpu_micros);
+                 flush_stats.cpu_micros);
 
   if (has_output) {
     flush_stats.bytes_written = meta_.fd.GetFileSize();
diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc
index 17b0a06ff2b4..3f8895e3283b 100644
--- a/table/block_based/block_based_table_builder.cc
+++ b/table/block_based/block_based_table_builder.cc
@@ -589,7 +589,7 @@ struct BlockBasedTableBuilder::ParallelCompressionRep {
 
       // If didn't find higher priority work
       if (next_thread_state == ThreadState::kEnd) {
-        if (next_state.Get<NextToCompress>() != seen_state.Get<NextToEmit>()) {
+        if (next_state.Get<NextToCompress>() != next_state.Get<NextToEmit>()) {
           // Compression work is available, select that
           if (thread_kind == ThreadKind::kWorker &&
               next_state.Get<NextToCompress>() ==
@@ -904,6 +904,7 @@ struct BlockBasedTableBuilder::Rep {
   std::vector<std::unique_ptr<InternalTblPropColl>> table_properties_collectors;
 
   std::unique_ptr<ParallelCompressionRep> pc_rep;
+  RelaxedAtomic<uint64_t> worker_cpu_micros{0};
   BlockCreateContext create_context;
 
   // The size of the "tail" part of a SST file. "Tail" refers to
@@ -1288,6 +1289,11 @@ struct BlockBasedTableBuilder::Rep {
     }
   }
 
+  ~Rep() {
+    // Must have been cleaned up by StopParallelCompression
+    assert(pc_rep == nullptr);
+  }
+
   Rep(const Rep&) = delete;
   Rep& operator=(const Rep&) = delete;
 
@@ -1724,7 +1730,19 @@ void BlockBasedTableBuilder::WriteBlock(const Slice& uncompressed_block_data,
   }
 }
 
+uint64_t BlockBasedTableBuilder::GetWorkerCPUMicros() const {
+  return rep_->worker_cpu_micros.LoadRelaxed();
+}
+
 void BlockBasedTableBuilder::BGWorker(WorkingAreaPair& working_area) {
+  // Record CPU usage of this thread
+  const uint64_t start_cpu_micros =
+      rep_->ioptions.env->GetSystemClock()->CPUMicros();
+  Defer log_cpu{[this, start_cpu_micros]() {
+    rep_->worker_cpu_micros.FetchAddRelaxed(
+        rep_->ioptions.env->GetSystemClock()->CPUMicros() - start_cpu_micros);
+  }};
+
   auto& pc_rep = *rep_->pc_rep;
 #ifdef BBTB_PC_WATCHDOG
   pc_rep.live_workers.FetchAddRelaxed(1);
@@ -2015,6 +2033,18 @@ void BlockBasedTableBuilder::MaybeStartParallelCompression() {
   if (rep_->compression_parallel_threads <= 1) {
     return;
   }
+  // Although in theory having a separate thread for writing to the SST file
+  // could help to hide the latency associated with writing, it is more often
+  // the case that the latency comes in large units for rare calls to write that
+  // flush downstream buffers, including in WritableFileWriter. The buffering
+  // provided by the compression ring buffer is almost negligible for hiding
+  // that latency. So even with some optimizations, turning on the parallel
+  // framework when compression is disabled just eats more CPU with little-to-no
+  // improvement in throughput.
+  if (rep_->data_block_compressor == nullptr) {
+    // Force the generally best configuration for no compression: no parallelism
+    return;
+  }
   rep_->pc_rep = std::make_unique<ParallelCompressionRep>(
       rep_->compression_parallel_threads);
   auto& pc_rep = *rep_->pc_rep;
diff --git a/table/block_based/block_based_table_builder.h b/table/block_based/block_based_table_builder.h
index 1e6c3217c1ce..29a35c5135b3 100644
--- a/table/block_based/block_based_table_builder.h
+++ b/table/block_based/block_based_table_builder.h
@@ -112,6 +112,8 @@ class BlockBasedTableBuilder : public TableBuilder {
   void SetSeqnoTimeTableProperties(const SeqnoToTimeMapping& relevant_mapping,
                                    uint64_t oldest_ancestor_time) override;
 
+  uint64_t GetWorkerCPUMicros() const override;
+
  private:
   bool ok() const;
 
diff --git a/table/table_builder.h b/table/table_builder.h
index 8d0132966f8d..64a1ab02791d 100644
--- a/table/table_builder.h
+++ b/table/table_builder.h
@@ -245,6 +245,11 @@ class TableBuilder {
   virtual void SetSeqnoTimeTableProperties(
       const SeqnoToTimeMapping& /*relevant_mapping*/,
       uint64_t /*oldest_ancestor_time*/) {}
+
+  // If this builder used CPU work from threads other than the caller, return
+  // the CPU microseconds used. 0 = no work outside calling thread, or not
+  // supported.
+  virtual uint64_t GetWorkerCPUMicros() const { return 0; }
 };
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/unreleased_history/bug_fixes/compaction_cpu.md b/unreleased_history/bug_fixes/compaction_cpu.md
new file mode 100644
index 000000000000..3d25b488feb8
--- /dev/null
+++ b/unreleased_history/bug_fixes/compaction_cpu.md
@@ -0,0 +1 @@
+* Reported numbers for compaction and flush CPU usage now include time spent by parallel compression worker threads. This now means compaction/flush CPU usage could exceed the wall clock time.
diff --git a/util/bit_fields.h b/util/bit_fields.h
index e0cadd02bca6..d1380cc28d9e 100644
--- a/util/bit_fields.h
+++ b/util/bit_fields.h
@@ -83,7 +83,6 @@ struct BitFields {
     explicit Reference(BitFields& bf) : bf_(bf) {}
     Reference(const Reference&) = default;
     Reference& operator=(const Reference&) = default;
-    // no moves
     Reference(Reference&&) = default;
     Reference& operator=(Reference&&) = default;
 

From 631fb8670b077aa80c3953ceb3ed5c82649db515 Mon Sep 17 00:00:00 2001
From: anand76 <anand1976@users.noreply.github.com>
Date: Wed, 17 Sep 2025 09:59:18 -0700
Subject: [PATCH 283/500] Correctly handle upper bound iteration result from a
 UDI (#13960)

Summary:
This PR fixes a bug in BlockBasedTableIterator::Prepare in conjunction with a user defined index (UDI). If the UDI determines a scan range to be empty and thus returns the kOutOfBound iteration result during Seek, the iteration result is not propagated up and Prepare() assumes end of file and aborts the remaining scans. This results in incorrect behavior and unpredictable multi scan results.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13960

Test Plan: Add unit test to table_test.cc

Reviewed By: xingbowang

Differential Revision: D82590892

Pulled By: anand1976

fbshipit-source-id: 8cfaaae2bb1a9509ddf8ec967cb8a8801748413d
---
 .../block_based/block_based_table_iterator.cc |   6 +-
 .../block_based/user_defined_index_wrapper.h  |  14 +-
 table/table_test.cc                           | 233 ++++++++++++++++--
 .../bug_fixes/udi_empty_scan_range_fix.md     |   1 +
 4 files changed, 226 insertions(+), 28 deletions(-)
 create mode 100644 unreleased_history/bug_fixes/udi_empty_scan_range_fix.md

diff --git a/table/block_based/block_based_table_iterator.cc b/table/block_based/block_based_table_iterator.cc
index 07e8cb9c857d..e5bc833ee91c 100644
--- a/table/block_based/block_based_table_iterator.cc
+++ b/table/block_based/block_based_table_iterator.cc
@@ -1307,7 +1307,7 @@ bool BlockBasedTableIterator::CollectBlockHandles(
       start_key = InternalKey(seek_key, kMaxSequenceNumber, kValueTypeForSeek);
     }
     index_iter_->Seek(start_key.Encode());
-    while (index_iter_->Valid() &&
+    while (index_iter_->status().ok() && index_iter_->Valid() &&
            (!scan_opt.range.limit.has_value() ||
             user_comparator_.CompareWithoutTimestamp(
                 index_iter_->user_key(),
@@ -1337,14 +1337,14 @@ bool BlockBasedTableIterator::CollectBlockHandles(
         scan_block_handles->push_back(index_iter_->value().handle);
       }
       ++num_blocks;
-    } else if (num_blocks == 0) {
+    } else if (num_blocks == 0 && index_iter_->UpperBoundCheckResult() !=
+                                      IterBoundCheck::kOutOfBound) {
       // We should not have scan ranges that are completely after the file's
       // range. This is important for FindBlockForwardInMultiScan() which only
       // lets the upper layer (LevelIterator) advance to the next SST file when
       // the last scan range is exhausted.
       return false;
     }
-    assert(num_blocks);
     block_index_ranges_per_scan->emplace_back(
         scan_block_handles->size() - num_blocks, scan_block_handles->size());
   }
diff --git a/table/block_based/user_defined_index_wrapper.h b/table/block_based/user_defined_index_wrapper.h
index 73161f64d628..416ed513ee72 100644
--- a/table/block_based/user_defined_index_wrapper.h
+++ b/table/block_based/user_defined_index_wrapper.h
@@ -189,11 +189,11 @@ class UserDefinedIndexIteratorWrapper
     status_ = ParseInternalKey(target, &pkey, /*log_err_key=*/false);
     if (status_.ok()) {
       status_ = udi_iter_->SeekAndGetResult(pkey.user_key, &result_);
-      if (status_.ok()) {
-        valid_ = result_.bound_check_result == IterBoundCheck::kInbound;
-        if (valid_) {
-          ikey_.Set(result_.key, 0, ValueType::kTypeValue);
-        }
+    }
+    if (status_.ok()) {
+      valid_ = result_.bound_check_result == IterBoundCheck::kInbound;
+      if (valid_) {
+        ikey_.Set(result_.key, 0, ValueType::kTypeValue);
       }
     } else {
       valid_ = false;
@@ -251,6 +251,10 @@ class UserDefinedIndexIteratorWrapper
     }
   }
 
+  IterBoundCheck UpperBoundCheckResult() override {
+    return result_.bound_check_result;
+  }
+
  private:
   std::unique_ptr<UserDefinedIndexIterator> udi_iter_;
   IterateResult result_;
diff --git a/table/table_test.cc b/table/table_test.cc
index 4ff2a54ebc12..1f08ee700c07 100644
--- a/table/table_test.cc
+++ b/table/table_test.cc
@@ -7490,6 +7490,7 @@ class UserDefinedIndexTest : public BlockBasedTableTestBase {
         // Unused parameters
         (void)separator_scratch;
         entries_added_++;
+        index_data_[last_key_in_current_block.ToString()].clear();
         // Store the block handle for each key
         PutFixed64(&index_data_[last_key_in_current_block.ToString()],
                    block_handle.offset);
@@ -7509,6 +7510,10 @@ class UserDefinedIndexTest : public BlockBasedTableTestBase {
         EXPECT_EQ(key.size(), 5);
         // Track keys added to the index
         keys_added_++;
+        // Add dummy entry
+        PutFixed64(&index_data_[key.ToString()], 0);
+        PutFixed64(&index_data_[key.ToString()], 0);
+        PutFixed32(&index_data_[key.ToString()], 0);
       }
 
       Status Finish(Slice* index_contents) override {
@@ -7562,8 +7567,8 @@ class UserDefinedIndexTest : public BlockBasedTableTestBase {
       }
 
       std::unique_ptr<UserDefinedIndexIterator> NewIterator(
-          const ReadOptions& ro) override {
-        return std::make_unique<TestUserDefinedIndexIterator>(ro, index_data_,
+          const ReadOptions& /*ro*/) override {
+        return std::make_unique<TestUserDefinedIndexIterator>(index_data_,
                                                               factory_);
       }
 
@@ -7573,13 +7578,11 @@ class UserDefinedIndexTest : public BlockBasedTableTestBase {
       class TestUserDefinedIndexIterator : public UserDefinedIndexIterator {
        public:
         TestUserDefinedIndexIterator(
-            const ReadOptions& ro,
             std::map<std::string,
                      std::pair<UserDefinedIndexBuilder::BlockHandle, uint32_t>>&
                 index,
             const TestUserDefinedIndexFactory* factory)
-            : ro_(ro),
-              index_(index),
+            : index_(index),
               iter_(index_.end()),
               scan_opts_(nullptr),
               num_opts_(0),
@@ -7598,19 +7601,19 @@ class UserDefinedIndexTest : public BlockBasedTableTestBase {
             return s;
           }
           if (scan_opts_) {
-            if (scan_opts_[scan_idx_].range.start.value().compare(key) == 0) {
-              EXPECT_TRUE(scan_opts_[scan_idx_].property_bag.has_value());
-              target_num_keys_ = std::stoi(scan_opts_[scan_idx_]
-                                               .property_bag.value()
-                                               .find("count")
-                                               ->second);
-              scan_idx_++;
-            } else {
-              scan_opts_ = nullptr;
-            }
+            // Seeks should be in order specified in scan_opts_
+            EXPECT_EQ(scan_opts_[scan_idx_].range.start.value().compare(key),
+                      0);
+            EXPECT_TRUE(scan_opts_[scan_idx_].property_bag.has_value());
+            target_num_keys_ = std::stoi(scan_opts_[scan_idx_]
+                                             .property_bag.value()
+                                             .find("count")
+                                             ->second);
+            scan_idx_++;
           }
           iter_ = index_.lower_bound(key.ToString());
-          if (iter_ != index_.end()) {
+          if ((iter_ != index_.end()) && IsInbound()) {
+            AdvanceToNextIndexEntry();
             result->bound_check_result = IterBoundCheck::kInbound;
             result->key = Slice(iter_->first);
             if (scan_opts_ && target_num_keys_ > 0 &&
@@ -7633,8 +7636,9 @@ class UserDefinedIndexTest : public BlockBasedTableTestBase {
           if (!s.ok()) {
             return s;
           }
-          if (ro_.iterate_upper_bound) {
-            if (iter_->first.compare(ro_.iterate_upper_bound->ToString()) >=
+          if (scan_opts_ && scan_opts_[scan_idx_ - 1].range.limit.has_value()) {
+            if (iter_->first.compare(
+                    scan_opts_[scan_idx_ - 1].range.limit.value().ToString()) >=
                 0) {
               result->bound_check_result = IterBoundCheck::kOutOfBound;
               result->key = Slice();
@@ -7647,7 +7651,8 @@ class UserDefinedIndexTest : public BlockBasedTableTestBase {
             return Status::OK();
           }
           iter_++;
-          if (iter_ != index_.end()) {
+          if ((iter_ != index_.end()) && IsInbound()) {
+            AdvanceToNextIndexEntry();
             result->bound_check_result = IterBoundCheck::kInbound;
             result->key = Slice(iter_->first);
             target_num_keys_ -=
@@ -7660,6 +7665,24 @@ class UserDefinedIndexTest : public BlockBasedTableTestBase {
           return Status::OK();
         }
 
+        void AdvanceToNextIndexEntry() {
+          while (iter_->second.second == 0) {
+            iter_++;
+          }
+        }
+
+        bool IsInbound() {
+          if (!scan_opts_) {
+            return true;
+          }
+          if (scan_opts_[scan_idx_ - 1].range.limit.has_value() &&
+              scan_opts_[scan_idx_ - 1].range.limit.value().compare(
+                  iter_->first) <= 0) {
+            return false;
+          }
+          return true;
+        }
+
         UserDefinedIndexBuilder::BlockHandle value() override {
           UserDefinedIndexBuilder::BlockHandle handle{0, 0};
           handle.offset = iter_->second.first.offset;
@@ -7668,13 +7691,14 @@ class UserDefinedIndexTest : public BlockBasedTableTestBase {
         }
 
         void Prepare(const ScanOptions scan_opts[], size_t num_opts) override {
+          // Prepare should only be called once
+          EXPECT_EQ(scan_opts_, nullptr);
           scan_opts_ = scan_opts;
           num_opts_ = num_opts;
           scan_idx_ = 0;
         }
 
        private:
-        const ReadOptions& ro_;
         std::map<std::string,
                  std::pair<UserDefinedIndexBuilder::BlockHandle, uint32_t>>&
             index_;
@@ -7697,6 +7721,32 @@ class UserDefinedIndexTest : public BlockBasedTableTestBase {
 
  protected:
   void BasicTest(bool use_partitioned_index);
+
+  void ValidateMultiScan(const ReadOptions& ro, MultiScanArgs& scan_opts,
+                         std::vector<int>& key_counts, std::unique_ptr<DB>& db,
+                         ColumnFamilyHandle* cfh) {
+    Slice ub;
+    ReadOptions read_opts = ro;
+    int key_count = 0;
+    int index = 0;
+    auto opts = scan_opts.GetScanRanges();
+    read_opts.iterate_upper_bound = &ub;
+    std::unique_ptr<Iterator> iter(db->NewIterator(read_opts, cfh));
+    iter->Prepare(scan_opts);
+    for (auto opt : opts) {
+      ub = opt.range.limit.value();
+      iter->Seek(opt.range.start.value());
+      EXPECT_OK(iter->status());
+      while (iter->Valid()) {
+        key_count++;
+        iter->Next();
+      }
+      EXPECT_EQ(key_count, key_counts[index]);
+      key_count = 0;
+      index++;
+    }
+    EXPECT_OK(iter->status());
+  }
 };
 
 void UserDefinedIndexTest::BasicTest(bool use_partitioned_index) {
@@ -8028,6 +8078,149 @@ TEST_F(UserDefinedIndexTest, IngestTest) {
   ASSERT_OK(DestroyDB(dbname, options));
 }
 
+TEST_F(UserDefinedIndexTest, EmptyRangeTest) {
+  Options options;
+  BlockBasedTableOptions table_options;
+  std::string dbname = test::PerThreadDBPath("user_defined_index_test");
+  std::string ingest_file = dbname + "test.sst";
+
+  // Set up the user-defined index factory
+  auto user_defined_index_factory =
+      std::make_shared<TestUserDefinedIndexFactory>();
+  table_options.user_defined_index_factory = user_defined_index_factory;
+
+  // Set up custom flush block policy that flushes every 3 keys
+  table_options.flush_block_policy_factory =
+      std::make_shared<CustomFlushBlockPolicyFactory>();
+
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  std::unique_ptr<SstFileWriter> writer;
+  writer.reset(new SstFileWriter(EnvOptions(), options));
+  ASSERT_OK(writer->Open(ingest_file));
+
+  bool skip = false;
+  // Create a sparse file with some missing key ranges so we can do
+  // MultiScans with empty scans interspersed with non-zero scans.
+  for (int i = 0; i < 100; i++) {
+    if (i > 0 && i % 20 == 0) {
+      skip = !skip;
+    }
+    if (skip) {
+      continue;
+    }
+    std::stringstream ss;
+    ss << std::setw(2) << std::setfill('0') << i;
+    std::string key = "key" + ss.str();
+    std::string value = "value" + ss.str();
+    ASSERT_OK(writer->Put(key, value));
+  }
+  ASSERT_OK(writer->Finish());
+  writer.reset();
+
+  std::unique_ptr<DB> db;
+  options.create_if_missing = true;
+  Status s = DB::Open(options, dbname, &db);
+  ASSERT_OK(s);
+  ASSERT_TRUE(db != nullptr);
+  ColumnFamilyHandle* cfh = nullptr;
+  ASSERT_OK(db->CreateColumnFamily(options, "new_cf", &cfh));
+
+  IngestExternalFileOptions ifo;
+  s = db->IngestExternalFile(cfh, {ingest_file}, ifo);
+  ASSERT_OK(s);
+
+  ReadOptions ro;
+  std::unique_ptr<Iterator> iter(db->NewIterator(ro, cfh));
+  ASSERT_NE(iter, nullptr);
+  ASSERT_OK(iter->status());
+
+  // Test that we can read all the keys
+  int key_count = 0;
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    key_count++;
+  }
+  ASSERT_EQ(key_count, 60);
+  ASSERT_OK(iter->status());
+  iter.reset();
+
+  ro.table_index_factory = user_defined_index_factory.get();
+  std::vector<int> key_counts;
+  MultiScanArgs scan_opts;
+  std::unordered_map<std::string, std::string> property_bag;
+  property_bag["count"] = std::to_string(5);
+  // Empty scans
+  scan_opts.insert(Slice("key25"), Slice("key30"), std::optional(property_bag));
+  key_counts.push_back(0);
+  scan_opts.insert(Slice("key33"), Slice("key37"), std::optional(property_bag));
+  key_counts.push_back(0);
+  // Non-empty scan with range greater than count
+  scan_opts.insert(Slice("key42"), Slice("key56"), std::optional(property_bag));
+  // In the key42:key56 range, we might read an additional block worth of
+  // keys due to the boundaries (5 + 3)
+  key_counts.push_back(8);
+  // Empty scan succeeding a non-empty one
+  scan_opts.insert(Slice("key65"), Slice("key70"), std::optional(property_bag));
+  key_counts.push_back(0);
+  // A non-empty scan with range smaller than count
+  scan_opts.insert(Slice("key85"), Slice("key87"), std::optional(property_bag));
+  key_counts.push_back(2);
+  // Scan range completely outside the DB
+  scan_opts.insert(Slice("key991"), Slice("key999"),
+                   std::optional(property_bag));
+  key_counts.push_back(0);
+  ValidateMultiScan(ro, scan_opts, key_counts, db, cfh);
+
+  key_counts.clear();
+  (*scan_opts).clear();
+  // Scans that overlap with part of key range, with overlap less than count
+  scan_opts.insert(Slice("key18"), Slice("key25"), std::optional(property_bag));
+  key_counts.push_back(2);
+  scan_opts.insert(Slice("key38"), Slice("key43"), std::optional(property_bag));
+  key_counts.push_back(3);
+  ValidateMultiScan(ro, scan_opts, key_counts, db, cfh);
+
+  // Scans that overlap with part of key range, with overlap same as count
+  key_counts.clear();
+  (*scan_opts).clear();
+  scan_opts.insert(Slice("key15"), Slice("key26"), std::optional(property_bag));
+  key_counts.push_back(5);
+  scan_opts.insert(Slice("key38"), Slice("key46"), std::optional(property_bag));
+  key_counts.push_back(6);
+  ValidateMultiScan(ro, scan_opts, key_counts, db, cfh);
+
+  // Scans that overlap with part of key range, with overlap greater than count
+  key_counts.clear();
+  (*scan_opts).clear();
+  scan_opts.insert(Slice("key10"), Slice("key26"), std::optional(property_bag));
+  key_counts.push_back(8);
+  scan_opts.insert(Slice("key38"), Slice("key49"), std::optional(property_bag));
+  key_counts.push_back(7);
+  ValidateMultiScan(ro, scan_opts, key_counts, db, cfh);
+
+  // Scan bigger than one contiguous range of keys, with overlap greater than
+  // count
+  key_counts.clear();
+  (*scan_opts).clear();
+  scan_opts.insert(Slice("key75"), Slice("key991"),
+                   std::optional(property_bag));
+  key_counts.push_back(8);
+  ValidateMultiScan(ro, scan_opts, key_counts, db, cfh);
+
+  // Scan bigger than one contiguous range of keys, with overlap less than count
+  key_counts.clear();
+  (*scan_opts).clear();
+  property_bag["count"] = std::to_string(25);
+  scan_opts.insert(Slice("key75"), Slice("key991"),
+                   std::optional(property_bag));
+  key_counts.push_back(20);
+  ValidateMultiScan(ro, scan_opts, key_counts, db, cfh);
+
+  ASSERT_OK(db->DestroyColumnFamilyHandle(cfh));
+  ASSERT_OK(db->Close());
+  ASSERT_OK(DestroyDB(dbname, options));
+}
+
 // Verify that external file ingestion fails if we try to ingest an SST file
 // without the UDI and a UDI factory is configured in BlockBasedTableOptions
 // and fail_if_no_udi_on_open is true in BlockBasedTableOptions.
diff --git a/unreleased_history/bug_fixes/udi_empty_scan_range_fix.md b/unreleased_history/bug_fixes/udi_empty_scan_range_fix.md
new file mode 100644
index 000000000000..939612a035e6
--- /dev/null
+++ b/unreleased_history/bug_fixes/udi_empty_scan_range_fix.md
@@ -0,0 +1 @@
+Fix a bug in RocksDB MultiScan with UDI when one of the scan ranges is determined to be empty by the UDI, which causes incorrect results.

From 20bcd017584198ee4bca79939174c10f0b4be755 Mon Sep 17 00:00:00 2001
From: Changyu Bi <changyubi@meta.com>
Date: Wed, 17 Sep 2025 20:20:33 -0700
Subject: [PATCH 284/500] Record smallest seqno in table properties for faster
 file ingestion (#13942)

Summary:
when ingesting DB generated file with non-zero sequence number, we need smallest seqno of each file for file meta data. To avoid full table scan, we record this information in table property and use it during file ingestion.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13942

Test Plan: new unit test and updated existing unit test.

Reviewed By: hx235

Differential Revision: D82331802

Pulled By: cbi42

fbshipit-source-id: 3009a6801ca7092cd0fde33692db1a13567068a9
---
 db/db_table_properties_test.cc                | 40 +++++++++++++++++++
 db/event_helpers.cc                           |  1 +
 db/external_sst_file_ingestion_job.cc         | 31 ++++++++------
 db/external_sst_file_test.cc                  | 17 +++++++-
 include/rocksdb/table_properties.h            |  9 +++++
 options/options_settable_test.cc              |  3 +-
 .../block_based/block_based_table_builder.cc  |  3 ++
 table/external_table.cc                       |  2 +
 table/meta_blocks.cc                          |  5 +++
 table/table_properties.cc                     |  8 ++++
 table/table_test.cc                           |  4 +-
 ...ternal_sst_ingestion_seqno_optimization.md |  2 +
 12 files changed, 107 insertions(+), 18 deletions(-)
 create mode 100644 unreleased_history/performance_improvements/external_sst_ingestion_seqno_optimization.md

diff --git a/db/db_table_properties_test.cc b/db/db_table_properties_test.cc
index ddebfccbec83..095f090fd773 100644
--- a/db/db_table_properties_test.cc
+++ b/db/db_table_properties_test.cc
@@ -787,6 +787,46 @@ TEST_P(DBTablePropertiesTest, RatioBasedDeletionTriggeredCompactionMarking) {
   }
 }
 
+TEST_F(DBTablePropertiesTest, KeyLargestSmallestSeqno) {
+  ASSERT_OK(db_->Put(WriteOptions(), "key1", "value1"));
+  ASSERT_OK(db_->Put(WriteOptions(), "key2", "value2"));
+  ASSERT_OK(db_->Put(WriteOptions(), "key3", "value3"));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+
+  {
+    TablePropertiesCollection props;
+    ASSERT_OK(db_->GetPropertiesOfAllTables(&props));
+    ASSERT_EQ(1U, props.size());
+
+    auto table_props = props.begin()->second;
+
+    ASSERT_TRUE(table_props->HasKeyLargestSeqno());
+    ASSERT_TRUE(table_props->HasKeySmallestSeqno());
+
+    ASSERT_EQ(table_props->key_largest_seqno,
+              table_props->key_smallest_seqno + 2);
+    ASSERT_GT(table_props->key_largest_seqno, 0U);
+    ASSERT_GT(table_props->key_smallest_seqno, 0U);
+  }
+
+  // Becomes zero after compaction
+  {
+    CompactRangeOptions cro;
+    cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+    ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+    TablePropertiesCollection props;
+    ASSERT_OK(db_->GetPropertiesOfAllTables(&props));
+    ASSERT_EQ(1U, props.size());
+
+    auto table_props = props.begin()->second;
+    ASSERT_TRUE(table_props->HasKeyLargestSeqno());
+    ASSERT_TRUE(table_props->HasKeySmallestSeqno());
+
+    ASSERT_EQ(table_props->key_largest_seqno, table_props->key_smallest_seqno);
+    ASSERT_EQ(table_props->key_largest_seqno, 0U);
+  }
+}
+
 INSTANTIATE_TEST_CASE_P(DBTablePropertiesTest, DBTablePropertiesTest,
                         ::testing::Values("kCompactionStyleLevel",
                                           "kCompactionStyleUniversal"));
diff --git a/db/event_helpers.cc b/db/event_helpers.cc
index 638d0ed6e2c9..5c69f3fb81c6 100644
--- a/db/event_helpers.cc
+++ b/db/event_helpers.cc
@@ -129,6 +129,7 @@ void EventHelpers::LogAndNotifyTableFileCreationFinished(
               << "user_defined_timestamps_persisted"
               << table_properties.user_defined_timestamps_persisted
               << "key_largest_seqno" << table_properties.key_largest_seqno
+              << "key_smallest_seqno" << table_properties.key_smallest_seqno
               << "merge_operator" << table_properties.merge_operator_name
               << "prefix_extractor_name"
               << table_properties.prefix_extractor_name << "property_collectors"
diff --git a/db/external_sst_file_ingestion_job.cc b/db/external_sst_file_ingestion_job.cc
index 086208095884..d992e754d417 100644
--- a/db/external_sst_file_ingestion_job.cc
+++ b/db/external_sst_file_ingestion_job.cc
@@ -1522,21 +1522,26 @@ Status ExternalSstFileIngestionJob::SyncIngestedFile(TWritableFile* file) {
 Status ExternalSstFileIngestionJob::GetSeqnoBoundaryForFile(
     TableReader* table_reader, SuperVersion* sv,
     IngestedFileInfo* file_to_ingest, bool allow_data_in_errors) {
-  const bool has_largest_seqno =
-      table_reader->GetTableProperties()->HasKeyLargestSeqno();
-  SequenceNumber largest_seqno =
-      table_reader->GetTableProperties()->key_largest_seqno;
-  if (has_largest_seqno && largest_seqno == 0) {
-    file_to_ingest->largest_seqno = 0;
-    file_to_ingest->smallest_seqno = 0;
-    return Status::OK();
+  const auto tp = table_reader->GetTableProperties();
+  const bool has_largest_seqno = tp->HasKeyLargestSeqno();
+  SequenceNumber largest_seqno = tp->key_largest_seqno;
+  if (has_largest_seqno) {
+    file_to_ingest->largest_seqno = largest_seqno;
+    if (largest_seqno == 0) {
+      file_to_ingest->smallest_seqno = 0;
+      return Status::OK();
+    }
+    if (tp->HasKeySmallestSeqno()) {
+      file_to_ingest->smallest_seqno = tp->key_smallest_seqno;
+      return Status::OK();
+    }
   }
-  // The following file scan is only executed when ingesting files with
-  // non-zero seqno.
-  // TODO: record smallest_seqno in table properties to avoid the
-  // file scan here.
-  SequenceNumber smallest_seqno = kMaxSequenceNumber;
 
+  // For older SST files they may not be recorded in table properties, so
+  // we scan the file to find out.
+  TEST_SYNC_POINT(
+      "ExternalSstFileIngestionJob::GetSeqnoBoundaryForFile:FileScan");
+  SequenceNumber smallest_seqno = kMaxSequenceNumber;
   SequenceNumber largest_seqno_from_iter = 0;
   ReadOptions ro;
   ro.fill_cache = ingestion_options_.fill_cache;
diff --git a/db/external_sst_file_test.cc b/db/external_sst_file_test.cc
index 6d8c56ca6190..1a8a5f717651 100644
--- a/db/external_sst_file_test.cc
+++ b/db/external_sst_file_test.cc
@@ -4099,6 +4099,7 @@ TEST_P(IngestDBGeneratedFileTest2, NonZeroSeqno) {
     SCOPED_TRACE("option_config_ = " + std::to_string(option_config_));
 
     Options options = CurrentOptions();
+    options.statistics = CreateDBStatistics();
     options.allow_concurrent_memtable_write =
         false;  // Required for VectorRepFactory
     CreateAndReopenWithCF({"non_overlap", "overlap"}, options);
@@ -4135,7 +4136,7 @@ TEST_P(IngestDBGeneratedFileTest2, NonZeroSeqno) {
     // optional L5: files in key range [70, 98]
     // L6: files in key range [1, 79]
     temp_cf_opts.target_file_size_base =
-        4 << 10;  // Small files to create multiple SSTs
+        20 << 10;  // Small files to create multiple SSTs
     temp_cf_opts.num_levels = 7;
     temp_cf_opts.disable_auto_compactions = true;  // Manually set up LSM
     temp_cf_opts.env = options.env;
@@ -4155,7 +4156,7 @@ TEST_P(IngestDBGeneratedFileTest2, NonZeroSeqno) {
     const Snapshot* snapshot = from_db->GetSnapshot();
 
     for (int k = 1; k < 99; ++k) {
-      expected_values[k] = rnd->RandomString(500);
+      expected_values[k] = rnd->RandomString(2000);
       ASSERT_OK(from_db->Put(wo, temp_cfh, Key(k), expected_values[k]));
     }
     ASSERT_OK(from_db->Flush({}, temp_cfh));
@@ -4233,9 +4234,21 @@ TEST_P(IngestDBGeneratedFileTest2, NonZeroSeqno) {
           "assigned a non-zero sequence number"));
       db_->ReleaseSnapshot(snapshot);
     }
+
+    std::atomic<int> file_scan_count{0};
+    SyncPoint::GetInstance()->SetCallBack(
+        "ExternalSstFileIngestionJob::GetSeqnoBoundaryForFile:FileScan",
+        [&](void* /*arg*/) { file_scan_count++; });
+    SyncPoint::GetInstance()->EnableProcessing();
+
     ASSERT_OK(
         db_->IngestExternalFile(non_overlap_cf, sst_file_paths, ingest_opts));
 
+    SyncPoint::GetInstance()->DisableProcessing();
+    SyncPoint::GetInstance()->ClearAllCallBacks();
+
+    EXPECT_EQ(file_scan_count, 0);
+
     // Validate ingested data.
     ReadOptions ro;
     std::string val;
diff --git a/include/rocksdb/table_properties.h b/include/rocksdb/table_properties.h
index 860fa6fd4f2f..c47746a17d24 100644
--- a/include/rocksdb/table_properties.h
+++ b/include/rocksdb/table_properties.h
@@ -76,6 +76,7 @@ struct TablePropertiesNames {
   static const std::string kTailStartOffset;
   static const std::string kUserDefinedTimestampsPersisted;
   static const std::string kKeyLargestSeqno;
+  static const std::string kKeySmallestSeqno;
 };
 
 // `TablePropertiesCollector` provides the mechanism for users to collect
@@ -309,6 +310,14 @@ struct TableProperties {
 
   bool HasKeyLargestSeqno() const { return key_largest_seqno != UINT64_MAX; }
 
+  // The smallest sequence number of keys in this file.
+  // UINT64_MAX means unknown.
+  // Only written to properties block if known (should be known unless the
+  // table is empty).
+  uint64_t key_smallest_seqno = UINT64_MAX;
+
+  bool HasKeySmallestSeqno() const { return key_smallest_seqno != UINT64_MAX; }
+
   // DB identity
   // db_id is an identifier generated the first time the DB is created
   // If DB identity is unset or unassigned, `db_id` will be an empty string.
diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc
index 3df151b492bb..e4eba3fb6c50 100644
--- a/options/options_settable_test.cc
+++ b/options/options_settable_test.cc
@@ -289,7 +289,8 @@ TEST_F(OptionsSettableTest, TablePropertiesAllFieldsSettable) {
       "0;column_family_"
       "name=64656661756C74;user_defined_timestamps_persisted=1;num_entries=100;"
       "external_sst_file_global_seqno_offset=0;num_merge_operands=0;index_key_"
-      "is_user_key=0;key_largest_seqno=18446744073709551615;",
+      "is_user_key=0;key_largest_seqno=18446744073709551615;key_smallest_seqno="
+      "18;",
       new_tp));
 
   // All bytes are set from the parse
diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc
index 3f8895e3283b..68fd4aab3648 100644
--- a/table/block_based/block_based_table_builder.cc
+++ b/table/block_based/block_based_table_builder.cc
@@ -1267,6 +1267,8 @@ struct BlockBasedTableBuilder::Rep {
     // Default is UINT64_MAX for unknown. Setting it to 0 here
     // to allow updating it by taking max in BlockBasedTableBuilder::Add().
     props.key_largest_seqno = 0;
+    // Default is UINT64_MAX for unknown.
+    props.key_smallest_seqno = UINT64_MAX;
     PrePopulateCompressionProperties(mgr);
 
     if (FormatVersionUsesContextChecksum(table_options.format_version)) {
@@ -1432,6 +1434,7 @@ void BlockBasedTableBuilder::Add(const Slice& ikey, const Slice& value) {
   SequenceNumber seq;
   UnPackSequenceAndType(ExtractInternalKeyFooter(ikey), &seq, &value_type);
   r->props.key_largest_seqno = std::max(r->props.key_largest_seqno, seq);
+  r->props.key_smallest_seqno = std::min(r->props.key_smallest_seqno, seq);
   if (IsValueType(value_type)) {
 #ifndef NDEBUG
     if (r->props.num_entries > r->props.num_range_deletions) {
diff --git a/table/external_table.cc b/table/external_table.cc
index ecc08135bf30..514cf14b1e62 100644
--- a/table/external_table.cc
+++ b/table/external_table.cc
@@ -226,6 +226,7 @@ class ExternalTableReaderAdapter : public TableReader {
       // external table reader
       props = std::make_shared<TableProperties>(*reader_->GetTableProperties());
       props->key_largest_seqno = 0;
+      props->key_smallest_seqno = 0;
     }
     return props;
   }
@@ -262,6 +263,7 @@ class ExternalTableBuilderAdapter : public TableBuilder {
     properties_.filter_size = 0;
     properties_.format_version = 0;
     properties_.key_largest_seqno = 0;
+    properties_.key_smallest_seqno = 0;
     properties_.column_family_id = topts.column_family_id;
     properties_.column_family_name = topts.column_family_name;
     properties_.db_id = topts.db_id;
diff --git a/table/meta_blocks.cc b/table/meta_blocks.cc
index 73764ae4bb5a..84f3a5343b46 100644
--- a/table/meta_blocks.cc
+++ b/table/meta_blocks.cc
@@ -167,6 +167,9 @@ void PropertyBlockBuilder::AddTableProperty(const TableProperties& props) {
   if (props.key_largest_seqno != UINT64_MAX) {
     Add(TablePropertiesNames::kKeyLargestSeqno, props.key_largest_seqno);
   }
+  if (props.key_smallest_seqno != UINT64_MAX) {
+    Add(TablePropertiesNames::kKeySmallestSeqno, props.key_smallest_seqno);
+  }
 }
 
 Slice PropertyBlockBuilder::Finish() {
@@ -311,6 +314,8 @@ Status ParsePropertiesBlock(
        &new_table_properties->user_defined_timestamps_persisted},
       {TablePropertiesNames::kKeyLargestSeqno,
        &new_table_properties->key_largest_seqno},
+      {TablePropertiesNames::kKeySmallestSeqno,
+       &new_table_properties->key_smallest_seqno},
   };
 
   Status s;
diff --git a/table/table_properties.cc b/table/table_properties.cc
index 7fee67d1e928..d5a654676d7b 100644
--- a/table/table_properties.cc
+++ b/table/table_properties.cc
@@ -116,6 +116,8 @@ std::string TableProperties::ToString(const std::string& prop_delim,
                  prop_delim, kv_delim);
   AppendProperty(result, "largest sequence number in file", key_largest_seqno,
                  prop_delim, kv_delim);
+  AppendProperty(result, "smallest sequence number in file", key_smallest_seqno,
+                 prop_delim, kv_delim);
 
   AppendProperty(
       result, "merge operator name",
@@ -320,6 +322,8 @@ const std::string TablePropertiesNames::kUserDefinedTimestampsPersisted =
     "rocksdb.user.defined.timestamps.persisted";
 const std::string TablePropertiesNames::kKeyLargestSeqno =
     "rocksdb.key.largest.seqno";
+const std::string TablePropertiesNames::kKeySmallestSeqno =
+    "rocksdb.key.smallest.seqno";
 
 static std::unordered_map<std::string, OptionTypeInfo>
     table_properties_type_info = {
@@ -434,6 +438,10 @@ static std::unordered_map<std::string, OptionTypeInfo>
          {offsetof(struct TableProperties, key_largest_seqno),
           OptionType::kUInt64T, OptionVerificationType::kNormal,
           OptionTypeFlags::kNone}},
+        {"key_smallest_seqno",
+         {offsetof(struct TableProperties, key_smallest_seqno),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
         {"db_id",
          {offsetof(struct TableProperties, db_id), OptionType::kEncodedString}},
         {"db_session_id",
diff --git a/table/table_test.cc b/table/table_test.cc
index 1f08ee700c07..bdbfe7750ded 100644
--- a/table/table_test.cc
+++ b/table/table_test.cc
@@ -4717,8 +4717,8 @@ TEST_F(GeneralTableTest, ApproximateOffsetOfPlain) {
   // an arbitrary slice between k04 and k05, either before or after k04a
   ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04a"), 10000, 211000));
   ASSERT_TRUE(Between(c.ApproximateOffsetOf("k05"), 210000, 211000));
-  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k06"), 510000, 511000));
-  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k07"), 510000, 511000));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k06"), 510000, 512000));
+  ASSERT_TRUE(Between(c.ApproximateOffsetOf("k07"), 510000, 512000));
   ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 610000, 612000));
   c.ResetTableReader();
 }
diff --git a/unreleased_history/performance_improvements/external_sst_ingestion_seqno_optimization.md b/unreleased_history/performance_improvements/external_sst_ingestion_seqno_optimization.md
new file mode 100644
index 000000000000..53b073a35ee3
--- /dev/null
+++ b/unreleased_history/performance_improvements/external_sst_ingestion_seqno_optimization.md
@@ -0,0 +1,2 @@
+* Add a new table property "rocksdb.key.smallest.seqno" which records the smallest sequence number of all keys in file. It makes ingesting DB generated files faster by
+avoiding scanning the whole file to find the smallest sequence number.

From 6127a42f98d88128e8a4cba24f1f7dab2be638fe Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Thu, 18 Sep 2025 13:27:51 -0700
Subject: [PATCH 285/500] Use/endorse (Auto)HyperClockCache by default over
 LRUCache (#13964)

Summary:
After seeing more people hit issues with thrashing small LRUCache shards and AutoHCC running fully in production for a while on a very large service, here I make these updates:

* In the public API, mark the case of `estimated_entry_charge = 0` (which is how you select AutoHCC) as production-ready and generally preferred. That means devoting a lot less space to how to tune FixedHCC (`estimated_entry_charge > 0`) because it is not generally recommended anymore even though in theory it is the fastest (conditional on a fragile configuration).
* In the public API, add more detail about potential problems with LRUCache and explicitly endorse HCC.
* When a default block cache is created, use AutoHCC instead of LRUCache. It's still a 32MB cache but that's just one cache shard for AutoHCC so the risk of issues with small cache shards is dramatically reduced. And a single AutoHCC shard is still essentially wait-free.
* Improve the handling of the hypothetical scenario of a failed anonymous mmap. This is hardly a concern for 64-bit Linux and likely most other OSes. It would in theory be possible to fall back on LRUCache in that case but the code structure makes that annoying/challenging. Instead we crash with an appropriate message.
* Cleaned up some includes
* Fixed some previously unreported leaks (better assertions on HCC perhaps, some subtle behavior changes)
* Added a new mode to cache_bench (detailed below)
* Avoid a particularly costly sanity check in `~AutoHyperClockTable()` even in debug builds so that unit testing, etc., isn't bogged down, except keep it in ASAN build.

Planned follow-up:
* Update HCC implementation to use my new "bit field atomics" API introduced in https://github.com/facebook/rocksdb/issues/13910 to make it easier to read and maintain

Possible follow-up:
* Re-engineer table cache to use AutoHCC also, instead of LRUCache and a single mutex to ensure no duplication across threads. (a) Pad table cache key to 128 bits for AutoHCC. (b) Stripe/shard the no-duplication mutex. (HCC's consistency model is too weak for concurrent threads to use its API to agree on a winner, even if entries could be inserted in an "open in progress" state.)

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13964

Test Plan:
existing tests. ClockCacheTest.ClockEvictionEffortCapTest caught a regression during my development, and the crash test has a history of finding subtle HCC bugs.

## Performance

Although we've validated AutoHCC performance under high load, etc., before we haven't really considered whether there will be unacceptable overheads for small DBs and CFs, e.g. in unit tests. For this, I have added a new mode to cache_bench: with the -stress_cache_instances=n parameter, it will create and destroy n empty cache instances several times. In the debug build, this found that a particular check in `~AutoHyperClockTable()` was extremely costly for short-lived caches (fixed). Beyond that, we can answer the question of whether it is feasible for a single process to host 1000 DBs each with 1000 CFs with default block cache instances, after moving LRUCache -> AutoHCC, for example:

```
/usr/bin/time ./cache_bench -stress_
cache_instances=1000000 -cache_type=auto_hyper_clock_cache -cache_size=33554432
```

Release build:
Average 9.8 us per 32MB LRUCache creation, 2.9 us per destruction, 24.6GB max RSS (~25KB each)
->
Average 4.3 us per  32MB AutoHCC creation, 4.9 us per destruction, 4.8GB max RSS (~5KB each)

Debug build:
Average 10.9 us per 32MB LRUCache creation, 3.5 us per destruction, 28.7GB max RSS (~29KB each)
->
Average 4.5 us per 32MB AutoHCC creation, 4.9 us per destruction, 4.7GB max RSS (~5KB each)

Despite the anonymous mmaps, it's apparently more efficient for default/small/empty structures. This is likely due to the dramatically low number of cache shards at this size. If we switch to `-stress_cache_instances=10000  -cache_size=1073741824`:

Release build:
Average 10.6 us per 1GB LRUCache, 2.8 us per destruction, 2.3 GB max RSS (~230KB each)
->
Average 130 us per 1GB AutoHCC creation, 153 us per destruction, 1.5 GB max RSS (~150KB each)

Debug build:
Average 11.2 us per 1GB LRUCache, 3.6 us per destruction, 2.4 GB max RSS (~240KB each)
->
Average 130 us per 1GB AutoHCC creation, 150 us per destruction, 1.6 GB max RSS (~160KB each)

Here it's clear that we are paying a price in time for setting up all those mmaps for the good number of cache shards and potential table growth, even though the RSS is well under control. However, I am not concerned about this at all, as it's unlikely to slow down anything notably such as unit tests. Before and after full testsuite runs confirm:

3327.73user 5188.71system 3:38.88elapsed -> 3312.07user 5704.77system 3:41.61elapsed

There is increased kernel time but acceptable. With ASAN+UBSAN:

11618.70user 15671.30system 5:54.68elapsed -> 12595.81user 16159.67system 6:32.77elapsed

Acceptable given that our ASAN+UBSAN builds are not the slowest in CI

Reviewed By: hx235

Differential Revision: D82661067

Pulled By: pdillinger

fbshipit-source-id: ab25c766ca70f2b8664849c2a838b9e1b4e72d3b
---
 cache/cache_bench_tool.cc                     | 82 +++++++++++++++--
 cache/clock_cache.cc                          | 27 ++++--
 cache/clock_cache.h                           |  7 --
 include/rocksdb/cache.h                       | 92 +++++++++----------
 include/rocksdb/ldb_tool.h                    |  8 ++
 .../java/org/rocksdb/SstFileReaderTest.java   |  2 +
 .../block_based/block_based_table_factory.cc  |  8 +-
 tools/ldb.cc                                  |  3 +-
 tools/ldb_tool.cc                             | 12 ++-
 .../behavior_changes/autohcc.md               |  1 +
 .../public_api_changes/autohcc.md             |  1 +
 11 files changed, 161 insertions(+), 82 deletions(-)
 create mode 100644 unreleased_history/behavior_changes/autohcc.md
 create mode 100644 unreleased_history/public_api_changes/autohcc.md

diff --git a/cache/cache_bench_tool.cc b/cache/cache_bench_tool.cc
index 0e29dc67b189..6de9c00818b6 100644
--- a/cache/cache_bench_tool.cc
+++ b/cache/cache_bench_tool.cc
@@ -184,6 +184,11 @@ DEFINE_bool(sck_randomize, false,
 DEFINE_bool(sck_footer_unique_id, false,
             "(-stress_cache_key) Simulate using proposed footer unique id");
 // ## END stress_cache_key sub-tool options ##
+// ## BEGIN stress_cache_instances sub-tool options ##
+DEFINE_uint32(stress_cache_instances, 0,
+              "If > 0, run cache instance stress test instead");
+// Uses cache_size and cache_type, maybe more
+// ## END stress_cache_instance sub-tool options ##
 
 namespace ROCKSDB_NAMESPACE {
 
@@ -387,7 +392,12 @@ class CacheBench {
       fprintf(stderr, "Percentages must add to 100.\n");
       exit(1);
     }
+    cache_ = MakeCache();
+  }
+
+  ~CacheBench() = default;
 
+  static std::shared_ptr<Cache> MakeCache() {
     std::shared_ptr<MemoryAllocator> allocator;
     if (FLAGS_use_jemalloc_no_dump_allocator) {
       JemallocAllocatorOptions opts;
@@ -406,12 +416,12 @@ class CacheBench {
       opts.hash_seed = BitwiseAnd(FLAGS_seed, INT32_MAX);
       opts.memory_allocator = allocator;
       opts.eviction_effort_cap = FLAGS_eviction_effort_cap;
-      if (FLAGS_cache_type == "fixed_hyper_clock_cache" ||
-          FLAGS_cache_type == "hyper_clock_cache") {
+      if (FLAGS_cache_type == "fixed_hyper_clock_cache") {
         opts.estimated_entry_charge = FLAGS_value_bytes_estimate > 0
                                           ? FLAGS_value_bytes_estimate
                                           : FLAGS_value_bytes;
-      } else if (FLAGS_cache_type == "auto_hyper_clock_cache") {
+      } else if (FLAGS_cache_type == "auto_hyper_clock_cache" ||
+                 FLAGS_cache_type == "hyper_clock_cache") {
         if (FLAGS_value_bytes_estimate > 0) {
           opts.min_avg_entry_charge = FLAGS_value_bytes_estimate;
         }
@@ -420,7 +430,7 @@ class CacheBench {
         exit(1);
       }
       ConfigureSecondaryCache(opts);
-      cache_ = opts.MakeSharedCache();
+      return opts.MakeSharedCache();
     } else if (FLAGS_cache_type == "lru_cache") {
       LRUCacheOptions opts(FLAGS_cache_size, FLAGS_num_shard_bits,
                            false /* strict_capacity_limit */,
@@ -428,15 +438,13 @@ class CacheBench {
       opts.hash_seed = BitwiseAnd(FLAGS_seed, INT32_MAX);
       opts.memory_allocator = allocator;
       ConfigureSecondaryCache(opts);
-      cache_ = NewLRUCache(opts);
+      return NewLRUCache(opts);
     } else {
       fprintf(stderr, "Cache type not supported.\n");
       exit(1);
     }
   }
 
-  ~CacheBench() = default;
-
   void PopulateCache() {
     Random64 rnd(FLAGS_seed);
     KeyGen keygen;
@@ -490,7 +498,7 @@ class CacheBench {
 
     PrintEnv();
     SharedState shared(this);
-    std::vector<std::unique_ptr<ThreadState> > threads(FLAGS_threads);
+    std::vector<std::unique_ptr<ThreadState>> threads(FLAGS_threads);
     for (uint32_t i = 0; i < FLAGS_threads; i++) {
       threads[i].reset(new ThreadState(i, &shared));
       std::thread(ThreadBody, threads[i].get()).detach();
@@ -1152,6 +1160,59 @@ class StressCacheKey {
   double multiplier_ = 0.0;
 };
 
+// cache_bench -stress_cache_instances is a partially independent embedded tool
+// for evaluating the time and space required to create and destroy many cache
+// instances, as this is considered important for a default cache implementation
+// which could see many throw-away instances in handling of Options, or created
+// in large numbers for many very small DBs with many CFs. Prefix command line
+// with /usr/bin/time to see max RSS memory.
+class StressCacheInstances {
+ public:
+  void Run() {
+    const int kNumIterations = 10;
+    const auto clock = SystemClock::Default().get();
+    caches_.reserve(FLAGS_stress_cache_instances);
+
+    uint64_t total_create_time_us = 0;
+    uint64_t total_destroy_time_us = 0;
+
+    for (int iter = 0; iter < kNumIterations; ++iter) {
+      // Create many cache instances
+      uint64_t start_create = clock->NowMicros();
+      for (uint32_t i = 0; i < FLAGS_stress_cache_instances; ++i) {
+        caches_.emplace_back(CacheBench::MakeCache());
+      }
+      uint64_t end_create = clock->NowMicros();
+      uint64_t create_time = end_create - start_create;
+      total_create_time_us += create_time;
+
+      // Destroy them
+      uint64_t start_destroy = clock->NowMicros();
+      caches_.clear();
+      uint64_t end_destroy = clock->NowMicros();
+      uint64_t destroy_time = end_destroy - start_destroy;
+      total_destroy_time_us += destroy_time;
+
+      printf(
+          "Iteration %d: Created %u caches in %.3f ms, destroyed in %.3f ms\n",
+          iter + 1, FLAGS_stress_cache_instances, create_time / 1000.0,
+          destroy_time / 1000.0);
+    }
+
+    printf("Average creation time: %.3f ms (%.1f us per cache)\n",
+           static_cast<double>(total_create_time_us) / kNumIterations / 1000.0,
+           static_cast<double>(total_create_time_us) / kNumIterations /
+               FLAGS_stress_cache_instances);
+    printf("Average destruction time: %.3f ms (%.1f us per cache)\n",
+           static_cast<double>(total_destroy_time_us) / kNumIterations / 1000.0,
+           static_cast<double>(total_destroy_time_us) / kNumIterations /
+               FLAGS_stress_cache_instances);
+  }
+
+ private:
+  std::vector<std::shared_ptr<Cache>> caches_;
+};
+
 int cache_bench_tool(int argc, char** argv) {
   ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
   ParseCommandLineFlags(&argc, &argv, true);
@@ -1162,6 +1223,11 @@ int cache_bench_tool(int argc, char** argv) {
     return 0;
   }
 
+  if (FLAGS_stress_cache_instances > 0) {
+    StressCacheInstances().Run();
+    return 0;
+  }
+
   if (FLAGS_threads <= 0) {
     fprintf(stderr, "threads number <= 0\n");
     exit(1);
diff --git a/cache/clock_cache.cc b/cache/clock_cache.cc
index 090213cb0d02..d65fd56495b3 100644
--- a/cache/clock_cache.cc
+++ b/cache/clock_cache.cc
@@ -10,12 +10,12 @@
 #include "cache/clock_cache.h"
 
 #include <algorithm>
-#include <atomic>
 #include <bitset>
 #include <cassert>
 #include <cinttypes>
 #include <cstddef>
 #include <cstdint>
+#include <cstdio>
 #include <exception>
 #include <functional>
 #include <numeric>
@@ -26,10 +26,9 @@
 #include "cache/cache_key.h"
 #include "cache/secondary_cache_adapter.h"
 #include "logging/logging.h"
-#include "monitoring/perf_context_imp.h"
-#include "monitoring/statistics_impl.h"
-#include "port/lang.h"
+#include "port/likely.h"
 #include "rocksdb/env.h"
+#include "util/autovector.h"
 #include "util/hash.h"
 #include "util/math.h"
 #include "util/random.h"
@@ -1985,6 +1984,11 @@ AutoHyperClockTable::AutoHyperClockTable(
       grow_frontier_(GetTableSize()),
       clock_pointer_mask_(
           BottomNBits(UINT64_MAX, LengthInfoToMinShift(length_info_.Load()))) {
+  if (array_.Get() == nullptr) {
+    fprintf(stderr,
+            "Anonymous mmap for RocksDB HyperClockCache failed. Aborting.\n");
+    std::terminate();
+  }
   if (metadata_charge_policy ==
       CacheMetadataChargePolicy::kFullChargeCacheMetadata) {
     // NOTE: ignoring page boundaries for simplicity
@@ -2052,15 +2056,20 @@ AutoHyperClockTable::~AutoHyperClockTable() {
              HandleImpl::kUnusedMarker) {
     used_end++;
   }
-#ifndef NDEBUG
+  // This check can be extra expensive for a cache that is just created,
+  // maybe used for a small number of entries, as in a unit test, and then
+  // destroyed. Only do this in rare modes.
+#ifdef MUST_FREE_HEAP_ALLOCATIONS
   for (size_t i = used_end; i < array_.Count(); i++) {
     assert(array_[i].head_next_with_shift.LoadRelaxed() == 0);
     assert(array_[i].chain_next_with_shift.LoadRelaxed() == 0);
     assert(array_[i].meta.LoadRelaxed() == 0);
   }
+#endif          // MUST_FREE_HEAP_ALLOCATIONS
+#ifndef NDEBUG  // Extra invariant checking
   std::vector<bool> was_populated(used_end);
   std::vector<bool> was_pointed_to(used_end);
-#endif
+#endif  // !NDEBUG
   for (size_t i = 0; i < used_end; i++) {
     HandleImpl& h = array_[i];
     switch (h.meta.LoadRelaxed() >> ClockHandle::kStateShift) {
@@ -2083,7 +2092,7 @@ AutoHyperClockTable::~AutoHyperClockTable() {
           assert(!was_pointed_to[next]);
           was_pointed_to[next] = true;
         }
-#endif
+#endif  // !NDEBUG
         break;
       // otherwise
       default:
@@ -2097,7 +2106,7 @@ AutoHyperClockTable::~AutoHyperClockTable() {
       assert(!was_pointed_to[next]);
       was_pointed_to[next] = true;
     }
-#endif
+#endif  // !NDEBUG
   }
 #ifndef NDEBUG  // Extra invariant checking
   // This check is not perfect, but should detect most reasonable cases
@@ -2110,7 +2119,7 @@ AutoHyperClockTable::~AutoHyperClockTable() {
       assert(!was_pointed_to[i]);
     }
   }
-#endif
+#endif  // !NDEBUG
 
   // Metadata charging only follows the published table size
   assert(usage_.LoadRelaxed() == 0 ||
diff --git a/cache/clock_cache.h b/cache/clock_cache.h
index 2d5d0d9eef3c..895936900dd8 100644
--- a/cache/clock_cache.h
+++ b/cache/clock_cache.h
@@ -9,8 +9,6 @@
 
 #pragma once
 
-#include <array>
-#include <atomic>
 #include <climits>
 #include <cstddef>
 #include <cstdint>
@@ -19,14 +17,9 @@
 
 #include "cache/cache_key.h"
 #include "cache/sharded_cache.h"
-#include "port/lang.h"
-#include "port/malloc.h"
 #include "port/mmap.h"
-#include "port/port.h"
 #include "rocksdb/cache.h"
-#include "rocksdb/secondary_cache.h"
 #include "util/atomic.h"
-#include "util/autovector.h"
 #include "util/math.h"
 
 namespace ROCKSDB_NAMESPACE {
diff --git a/include/rocksdb/cache.h b/include/rocksdb/cache.h
index 8ca5f272f132..0d3603a8e262 100644
--- a/include/rocksdb/cache.h
+++ b/include/rocksdb/cache.h
@@ -210,7 +210,15 @@ struct ShardedCacheOptions {
 // shard has its own LRU list for evictions. Each shard also has a mutex for
 // exclusive access during operations; even read operations need exclusive
 // access in order to update the LRU list. Mutex contention is usually low
-// with enough shards.
+// with enough shards. However,
+// * For a single hot block, there will be mutex contention even for reads
+// regardless of the number of shards.
+// * LRUCaches in the size of MBs instead of GBs can have shards small enough
+// that there is a random probability of some modest number of large blocks
+// (especially non-partitioned filters) thrashing a single cache shard.
+//
+// HYPERCLOCKCACHE IS NOW GENERALLY RECOMMENDED OVER LRUCACHE. See
+// HyperClockCacheOptions below.
 struct LRUCacheOptions : public ShardedCacheOptions {
   // Ratio of cache reserved for high-priority and low-priority entries,
   // respectively. (See Cache::Priority below more information on the levels.)
@@ -371,64 +379,50 @@ inline std::shared_ptr<SecondaryCache> NewCompressedSecondaryCache(
   return opts.MakeSharedSecondaryCache();
 }
 
-// HyperClockCache - A lock-free Cache alternative for RocksDB block cache
-// that offers much improved CPU efficiency vs. LRUCache under high parallel
-// load or high contention, with some caveats:
+// HyperClockCache (also known as HCC) - A lock-free Cache alternative for
+// RocksDB block cache that offers much improved CPU efficiency vs. LRUCache
+// under high parallel load or high contention. Additionally, HCC only uses
+// sharding for a modest performance boost, so can use much larger cache shards
+// than LRUCache, dramatically reducing the risk of thrashing in configurations
+// or work loads with some large blocks.
+//
+// HYPERCLOCKCACHE IS NOW GENERALLY RECOMMENDED OVER LRUCACHE
+//
+// Some caveats:
 // * Not a general Cache implementation: can only be used for
 // BlockBasedTableOptions::block_cache, which RocksDB uses in a way that is
 // compatible with HyperClockCache.
-// * Requires an extra tuning parameter: see estimated_entry_charge below.
-// Similarly, substantially changing the capacity with SetCapacity could
-// harm efficiency. -> EXPERIMENTAL: the tuning parameter can be set to 0
-// to find the appropriate balance automatically.
 // * Cache priorities are less aggressively enforced, which could cause
 // cache dilution from long range scans (unless they use fill_cache=false).
+// * In some configurations, depends on anonymous mmap support, available in
+// Linux, Windows and more.
+// * May have slightly lower (or slightly higher) cache hit rate vs. LRUCache,
+// because of the bounded counting-CLOCK eviction algorithm.
 //
 // See internal cache/clock_cache.h for full description.
 struct HyperClockCacheOptions : public ShardedCacheOptions {
-  // The estimated average `charge` associated with cache entries.
-  //
-  // EXPERIMENTAL: the field can be set to 0 to size the table dynamically
-  // and automatically. See also min_avg_entry_charge. This feature requires
-  // platform support for lazy anonymous memory mappings (incl Linux, Windows).
-  // Performance is very similar to choosing the best configuration parameter.
-  //
-  // PRODUCTION-TESTED: This is a critical configuration parameter for good
-  // performance, because having a table size that is fixed at creation time
-  // greatly reduces the required synchronization between threads.
-  // * If the estimate is substantially too low (e.g. less than half the true
-  // average) then metadata space overhead with be substantially higher (e.g.
-  // 200 bytes per entry rather than 100). With kFullChargeCacheMetadata, this
-  // can slightly reduce cache hit rates, and slightly reduce access times due
-  // to the larger working memory size.
-  // * If the estimate is substantially too high (e.g. 25% higher than the true
-  // average) then there might not be sufficient slots in the hash table for
-  // both efficient operation and capacity utilization (hit rate). The hyper
-  // cache will evict entries to prevent load factors that could dramatically
-  // affect lookup times, instead letting the hit rate suffer by not utilizing
-  // the full capacity.
+  // OPTIONAL: The estimated average `charge` associated with cache entries.
   //
-  // A reasonable choice is the larger of block_size and metadata_block_size.
-  // When WriteBufferManager (and similar) charge memory usage to the block
-  // cache, this can lead to the same effect as estimate being too low, which
-  // is better than the opposite. Therefore, the general recommendation is to
-  // assume that other memory charged to block cache could be negligible, and
-  // ignore it in making the estimate.
+  // When not provided (== 0, recommended and default), an HCC variant with a
+  // dynamically-growing table and generally good performance is used. This
+  // variant depends on anonymous mmaps so might not be available on all
+  // platforms.
   //
-  // The best parameter choice based on a cache in use is given by
-  // GetUsage() / GetOccupancyCount(), ignoring metadata overheads such as
-  // with kDontChargeCacheMetadata. More precisely with
-  // kFullChargeCacheMetadata is (GetUsage() - 64 * GetTableAddressCount()) /
-  // GetOccupancyCount(). However, when the average value size might vary
-  // (e.g. balance between metadata and data blocks in cache), it is better
-  // to estimate toward the lower side than the higher side.
+  // If the average "charge" (uncompressed block size) of block cache entries
+  // is reasonably predicted and provided here, the most efficient variant of
+  // HCC is used. Performance is degraded if the prediction is inaccurate.
+  // Prediction could be difficult or impossible with cache-charging features
+  // such as WriteBufferManager. The best parameter choice based on a cache
+  // in use is roughly given by GetUsage() / GetOccupancyCount(), though it is
+  // better to estimate toward the lower side than the higher side when the
+  // ratio might vary.
   size_t estimated_entry_charge;
 
-  // EXPERIMENTAL: When estimated_entry_charge == 0, this parameter establishes
-  // a promised lower bound on the average charge of all entries in the table,
-  // which is roughly the average uncompressed SST block size of block cache
-  // entries, typically > 4KB. The default should generally suffice with almost
-  // no cost. (This option is ignored for estimated_entry_charge > 0.)
+  // When estimated_entry_charge == 0, this parameter establishes a promised
+  // lower bound on the average charge of all entries in the table, which is
+  // roughly the average uncompressed SST block size of block cache entries,
+  // typically > 4KB. The default should generally suffice with almost no cost.
+  // (This option is ignored for estimated_entry_charge > 0.)
   //
   // More detail: The table for indexing cache entries will grow automatically
   // as needed, but a hard upper bound on that size is needed at creation time.
@@ -478,8 +472,8 @@ struct HyperClockCacheOptions : public ShardedCacheOptions {
   // keep operations very fast.
   int eviction_effort_cap = 30;
 
-  HyperClockCacheOptions(
-      size_t _capacity, size_t _estimated_entry_charge,
+  explicit HyperClockCacheOptions(
+      size_t _capacity, size_t _estimated_entry_charge = 0,
       int _num_shard_bits = -1, bool _strict_capacity_limit = false,
       std::shared_ptr<MemoryAllocator> _memory_allocator = nullptr,
       CacheMetadataChargePolicy _metadata_charge_policy =
diff --git a/include/rocksdb/ldb_tool.h b/include/rocksdb/ldb_tool.h
index 7a4c6ca11fbd..623fb1f0b918 100644
--- a/include/rocksdb/ldb_tool.h
+++ b/include/rocksdb/ldb_tool.h
@@ -32,10 +32,18 @@ struct LDBOptions {
 
 class LDBTool {
  public:
+  // DEPRECATED because this function does not return, which can result in
+  // memory leaks being reported because of the default Options() etc. not being
+  // destroyed.
   void Run(
       int argc, char** argv, Options db_options = Options(),
       const LDBOptions& ldb_options = LDBOptions(),
       const std::vector<ColumnFamilyDescriptor>* column_families = nullptr);
+
+  int RunAndReturn(
+      int argc, char** argv, const Options& db_options = Options(),
+      const LDBOptions& ldb_options = LDBOptions(),
+      const std::vector<ColumnFamilyDescriptor>* column_families = nullptr);
 };
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/java/src/test/java/org/rocksdb/SstFileReaderTest.java b/java/src/test/java/org/rocksdb/SstFileReaderTest.java
index ef74b08a72ab..27934e0f80b6 100644
--- a/java/src/test/java/org/rocksdb/SstFileReaderTest.java
+++ b/java/src/test/java/org/rocksdb/SstFileReaderTest.java
@@ -217,6 +217,8 @@ public void readSstFile() throws RocksDBException, IOException {
       assertThat(iterator.isValid()).isTrue();
       assertThat(iterator.key()).isEqualTo("key1".getBytes());
       assertThat(iterator.value()).isEqualTo("value1".getBytes());
+
+      iterator.close();
     }
   }
 }
diff --git a/table/block_based/block_based_table_factory.cc b/table/block_based/block_based_table_factory.cc
index 7c11875252c3..1a1ace7d1ef8 100644
--- a/table/block_based/block_based_table_factory.cc
+++ b/table/block_based/block_based_table_factory.cc
@@ -437,10 +437,10 @@ void BlockBasedTableFactory::InitializeOptions() {
   if (table_options_.no_block_cache) {
     table_options_.block_cache.reset();
   } else if (table_options_.block_cache == nullptr) {
-    LRUCacheOptions co;
-    // 32MB, the recommended minimum size for 64 shards, to reduce contention
-    co.capacity = 32 << 20;
-    table_options_.block_cache = NewLRUCache(co);
+    // Now using AutoHCC by default, with existing default size of 32MB
+    // which is just one cache shard in HCC
+    HyperClockCacheOptions hcc_opts{size_t{32} << 20};
+    table_options_.block_cache = hcc_opts.MakeSharedCache();
   }
   if (table_options_.block_size_deviation < 0 ||
       table_options_.block_size_deviation > 100) {
diff --git a/tools/ldb.cc b/tools/ldb.cc
index 52533e6b0f6e..5ef91df1b209 100644
--- a/tools/ldb.cc
+++ b/tools/ldb.cc
@@ -8,6 +8,5 @@
 
 int main(int argc, char** argv) {
   ROCKSDB_NAMESPACE::LDBTool tool;
-  tool.Run(argc, argv);
-  return 0;
+  return tool.RunAndReturn(argc, argv);
 }
diff --git a/tools/ldb_tool.cc b/tools/ldb_tool.cc
index 3dd1905e83ba..ebf40e25d8ab 100644
--- a/tools/ldb_tool.cc
+++ b/tools/ldb_tool.cc
@@ -185,8 +185,14 @@ int LDBCommandRunner::RunCommand(
 void LDBTool::Run(int argc, char** argv, Options options,
                   const LDBOptions& ldb_options,
                   const std::vector<ColumnFamilyDescriptor>* column_families) {
-  int error_code = LDBCommandRunner::RunCommand(argc, argv, options,
-                                                ldb_options, column_families);
-  exit(error_code);
+  exit(RunAndReturn(argc, argv, options, ldb_options, column_families));
+}
+
+int LDBTool::RunAndReturn(
+    int argc, char** argv, const Options& options,
+    const LDBOptions& ldb_options,
+    const std::vector<ColumnFamilyDescriptor>* column_families) {
+  return LDBCommandRunner::RunCommand(argc, argv, options, ldb_options,
+                                      column_families);
 }
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/unreleased_history/behavior_changes/autohcc.md b/unreleased_history/behavior_changes/autohcc.md
new file mode 100644
index 000000000000..d43f31117f9d
--- /dev/null
+++ b/unreleased_history/behavior_changes/autohcc.md
@@ -0,0 +1 @@
+* The default provided block cache implementation is now HyperClockCache instead of LRUCache, when `block_cache` is nullptr (default) and `no_block_cache==false` (default). We recommend explicitly creating a HyperClockCache block cache based on memory budget and sharing it across all column families and even DB instances. This change could expose previously hidden memory or resource leaks.
diff --git a/unreleased_history/public_api_changes/autohcc.md b/unreleased_history/public_api_changes/autohcc.md
new file mode 100644
index 000000000000..4bbe714fc5c2
--- /dev/null
+++ b/unreleased_history/public_api_changes/autohcc.md
@@ -0,0 +1 @@
+* HyperClockCache with no `estimated_entry_charge` is now production-ready and is the preferred block cache implementation vs. LRUCache. Please consider updating your code to minimize the risk of hitting performance bottlenecks or anomalies from LRUCache. See cache.h for more detail.

From 6a202c5570d9aca11a23c5b1a78019f8be245463 Mon Sep 17 00:00:00 2001
From: Hui Xiao <huixiao@fb.com>
Date: Thu, 18 Sep 2025 15:10:04 -0700
Subject: [PATCH 286/500] Fix nullptr access in IsInjectedError() for stress
 test (#13968)

Summary:
**Context/Summary:**
`Status::state` can be nullptr when created with no specific error message. std::strstr on nullptr caused some segfault in our stress test.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13968

Test Plan: Monitor stress test

Reviewed By: jaykorean

Differential Revision: D82695541

Pulled By: hx235

fbshipit-source-id: cf08f70163a9ee6c911cdc3a3d79acd3429f0d15
---
 utilities/fault_injection_fs.h | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/utilities/fault_injection_fs.h b/utilities/fault_injection_fs.h
index 151ab1f09499..54d657d17d97 100644
--- a/utilities/fault_injection_fs.h
+++ b/utilities/fault_injection_fs.h
@@ -229,14 +229,25 @@ class FaultInjectionTestFS : public FileSystemWrapper {
   static const char* kClassName() { return "FaultInjectionTestFS"; }
   const char* Name() const override { return kClassName(); }
 
-  static bool IsInjectedError(const Status& s) {
-    assert(!s.ok());
-    return std::strstr(s.getState(), kInjected.c_str());
+  static bool IsInjectedError(const Status& s,
+                              const std::string& specific_error_marker = "") {
+    if (s.ok()) {
+      return false;
+    }
+    const char* state = s.getState();
+    if (state == nullptr) {
+      return false;
+    }
+    bool is_injected_error = std::strstr(state, kInjected.c_str()) != nullptr;
+    bool is_specific_error =
+        specific_error_marker.empty() ||
+        std::strstr(state, specific_error_marker.c_str()) != nullptr;
+
+    return is_injected_error && is_specific_error;
   }
 
   static bool IsFailedToWriteToWALError(const Status& s) {
-    assert(!s.ok());
-    return std::strstr(s.getState(), kFailedToWriteToWAL.c_str());
+    return IsInjectedError(s, kFailedToWriteToWAL);
   }
 
   IOStatus NewDirectory(const std::string& name, const IOOptions& options,

From 5a1ff2cb146919dcf1a3ceec75c6c3c57f7fc67b Mon Sep 17 00:00:00 2001
From: Xingbo Wang <xbw@fb.com>
Date: Thu, 18 Sep 2025 15:18:18 -0700
Subject: [PATCH 287/500] Force caller to pass comparator in MultiScanArgs
 (#13970)

Summary:
Force caller of MultiScanArgs to pass comparator. Pass comparator from CF handle to MultiScanArgs in NewMultiScan.
Expand MultiScanArgs unit test with different comparator.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13970

Test Plan: unit test

Reviewed By: cbi42

Differential Revision: D82739270

Pulled By: xingbowang

fbshipit-source-id: e709f4a333ad547c0ba6d24d8fb2b22e50e8a12f
---
 db/version_set.cc                             | 38 +++++++++---
 db_stress_tool/db_stress_test_base.cc         |  3 +-
 include/rocksdb/db.h                          |  6 +-
 include/rocksdb/multi_scan.h                  |  5 +-
 include/rocksdb/options.h                     |  5 +-
 .../block_based_table_reader_test.cc          | 58 ++++++++++++-------
 table/table_test.cc                           |  6 +-
 tools/db_bench_tool.cc                        | 18 +++---
 ...ltiScanArgs_contructor_parameter_change.md |  1 +
 9 files changed, 94 insertions(+), 46 deletions(-)
 create mode 100644 unreleased_history/public_api_changes/MultiScanArgs_contructor_parameter_change.md

diff --git a/db/version_set.cc b/db/version_set.cc
index 0c98a01f0eb0..23df68244fd4 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -1101,6 +1101,27 @@ class LevelIterator final : public InternalIterator {
     read_seq_ = read_seq;
   }
 
+  inline bool FileHasMultiScanArg(size_t file_index) {
+    if (file_to_scan_opts_.get()) {
+      auto it = file_to_scan_opts_->find(file_index);
+      if (it != file_to_scan_opts_->end()) {
+        return !it->second.empty();
+      }
+    }
+    return false;
+  }
+
+  MultiScanArgs& GetMultiScanArgForFile(size_t file_index) {
+    auto multi_scan_args_it = file_to_scan_opts_->find(file_index);
+    if (multi_scan_args_it == file_to_scan_opts_->end()) {
+      auto ret = file_to_scan_opts_->emplace(
+          file_index, MultiScanArgs(user_comparator_.user_comparator()));
+      multi_scan_args_it = ret.first;
+      assert(ret.second);
+    }
+    return multi_scan_args_it->second;
+  }
+
   void Prepare(const MultiScanArgs* so) override {
     // We assume here that scan_opts is sorted such that
     // scan_opts[0].range.start < scan_opts[1].range.start, and non overlapping
@@ -1109,6 +1130,9 @@ class LevelIterator final : public InternalIterator {
     }
     scan_opts_ = so;
 
+    // Verify comparator is consistent
+    assert(so->GetComparator() == user_comparator_.user_comparator());
+
     file_to_scan_opts_ = std::make_unique<ScanOptionsMap>();
     for (size_t k = 0; k < scan_opts_->size(); k++) {
       const ScanOptions& opt = scan_opts_->GetScanRanges().at(k);
@@ -1157,8 +1181,8 @@ class LevelIterator final : public InternalIterator {
       // 3. [  S  ] ...... [  E  ]
       for (auto i = fstart; i <= fend; i++) {
         if (i < flevel_->num_files) {
-          (*file_to_scan_opts_)[i].insert(start.value(), end.value(),
-                                          opt.property_bag);
+          auto args = GetMultiScanArgForFile(i);
+          args.insert(start.value(), end.value(), opt.property_bag);
         }
       }
     }
@@ -1562,9 +1586,10 @@ bool LevelIterator::SkipEmptyFileForward() {
     if (file_iter_.iter() != nullptr) {
       // If we are doing prepared scan opts then we should seek to the values
       // specified by the scan opts
-      if (scan_opts_ && (*file_to_scan_opts_)[file_index_].size()) {
+
+      if (scan_opts_ && FileHasMultiScanArg(file_index_)) {
         const ScanOptions& opts =
-            file_to_scan_opts_->at(file_index_).GetScanRanges().front();
+            GetMultiScanArgForFile(file_index_).GetScanRanges().front();
         if (opts.range.start.has_value()) {
           InternalKey target(*opts.range.start.AsPtr(), kMaxSequenceNumber,
                              kValueTypeForSeek);
@@ -1621,9 +1646,8 @@ void LevelIterator::SetFileIterator(InternalIterator* iter) {
 
   InternalIterator* old_iter = file_iter_.Set(iter);
   if (iter && scan_opts_) {
-    if (file_to_scan_opts_.get() &&
-        file_to_scan_opts_->find(file_index_) != file_to_scan_opts_->end()) {
-      const MultiScanArgs& new_opts = file_to_scan_opts_->at(file_index_);
+    if (FileHasMultiScanArg(file_index_)) {
+      const MultiScanArgs& new_opts = GetMultiScanArgForFile(file_index_);
       file_iter_.Prepare(&new_opts);
     } else {
       file_iter_.Prepare(scan_opts_);
diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc
index b62441403bee..c3c7d4bdc7f1 100644
--- a/db_stress_tool/db_stress_test_base.cc
+++ b/db_stress_tool/db_stress_test_base.cc
@@ -1692,7 +1692,8 @@ Status StressTest::TestMultiScan(ThreadState* thread,
 
   std::vector<std::string> start_key_strs;
   std::vector<std::string> end_key_strs;
-  MultiScanArgs scan_opts;
+  // TODO support reverse BytewiseComparator in the stress test
+  MultiScanArgs scan_opts(BytewiseComparator());
   scan_opts.use_async_io = FLAGS_multiscan_use_async_io;
   start_key_strs.reserve(num_scans);
   end_key_strs.reserve(num_scans);
diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h
index 4c7ff0f0585c..7fdad866784d 100644
--- a/include/rocksdb/db.h
+++ b/include/rocksdb/db.h
@@ -1118,11 +1118,11 @@ class DB {
   //    // Check ex.what()
   //  }
   virtual std::unique_ptr<MultiScan> NewMultiScan(
-      const ReadOptions& /*options*/, ColumnFamilyHandle* /*column_family*/,
+      const ReadOptions& /*options*/, ColumnFamilyHandle* column_family,
       const MultiScanArgs& /*scan_opts*/) {
     std::unique_ptr<Iterator> iter(NewErrorIterator(Status::NotSupported()));
-    std::unique_ptr<MultiScan> ms_iter =
-        std::make_unique<MultiScan>(std::move(iter));
+    std::unique_ptr<MultiScan> ms_iter = std::make_unique<MultiScan>(
+        column_family->GetComparator(), std::move(iter));
     return ms_iter;
   }
 
diff --git a/include/rocksdb/multi_scan.h b/include/rocksdb/multi_scan.h
index c9af9022a0e1..eb120c07a1b3 100644
--- a/include/rocksdb/multi_scan.h
+++ b/include/rocksdb/multi_scan.h
@@ -155,8 +155,9 @@ class MultiScan {
   MultiScan(const ReadOptions& read_options, const MultiScanArgs& scan_opts,
             DB* db, ColumnFamilyHandle* cfh);
 
-  explicit MultiScan(std::unique_ptr<Iterator>&& db_iter)
-      : db_iter_(std::move(db_iter)) {}
+  explicit MultiScan(const Comparator* comp,
+                     std::unique_ptr<Iterator>&& db_iter)
+      : scan_opts_(comp), db_iter_(std::move(db_iter)) {}
 
   class MultiScanIterator {
    public:
diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index a43fa6fda941..78ea33f564f2 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -1782,8 +1782,7 @@ struct ScanOptions {
 class MultiScanArgs {
  public:
   // Constructor that takes a comparator
-  explicit MultiScanArgs(const Comparator* comparator = BytewiseComparator())
-      : comp_(comparator) {}
+  explicit MultiScanArgs(const Comparator* comparator) : comp_(comparator) {}
 
   // Copy Constructor
   MultiScanArgs(const MultiScanArgs& other) {
@@ -1855,6 +1854,8 @@ class MultiScanArgs {
     return original_ranges_;
   }
 
+  const Comparator* GetComparator() const { return comp_; }
+
   uint64_t io_coalesce_threshold = 16 << 10;  // 16KB by default
 
   // Maximum size (in bytes) for the data blocks loaded by a MultiScan.
diff --git a/table/block_based/block_based_table_reader_test.cc b/table/block_based/block_based_table_reader_test.cc
index 2010d36bc055..7a9dd81a4caa 100644
--- a/table/block_based/block_based_table_reader_test.cc
+++ b/table/block_based/block_based_table_reader_test.cc
@@ -52,7 +52,8 @@ class BlockBasedTableReaderBaseTest : public testing::Test {
   // user defined timestamps and different sequence number to differentiate them
   static std::vector<std::pair<std::string, std::string>> GenerateKVMap(
       int num_block = 2, bool mixed_with_human_readable_string_value = false,
-      size_t ts_sz = 0, bool same_key_diff_ts = false) {
+      size_t ts_sz = 0, bool same_key_diff_ts = false,
+      const Comparator* comparator = BytewiseComparator()) {
     std::vector<std::pair<std::string, std::string>> kv;
 
     SequenceNumber seq_no = 0;
@@ -100,6 +101,10 @@ class BlockBasedTableReaderBaseTest : public testing::Test {
         }
       }
     }
+    auto comparator_name = std::string(comparator->Name());
+    if (comparator_name.find("Reverse") != std::string::npos) {
+      std::reverse(kv.begin(), kv.end());
+    }
     return kv;
   }
 
@@ -128,6 +133,7 @@ class BlockBasedTableReaderBaseTest : public testing::Test {
 
     InternalKeyComparator comparator(ioptions.user_comparator);
     ColumnFamilyOptions cf_options;
+    cf_options.comparator = ioptions.user_comparator;
     cf_options.prefix_extractor = options_.prefix_extractor;
     MutableCFOptions moptions(cf_options);
     CompressionOptions compression_opts;
@@ -255,11 +261,13 @@ class BlockBasedTableReaderBaseTest : public testing::Test {
 //          generate keys with different user provided key, same user-defined
 //          timestamps (if udt enabled), same sequence number. This test mode is
 //          used for testing `Get`, `MultiGet`, and `NewIterator`.
+// Param 9: test both the default comparator and a reverse comparator.
 class BlockBasedTableReaderTest
     : public BlockBasedTableReaderBaseTest,
-      public testing::WithParamInterface<std::tuple<
-          CompressionType, bool, BlockBasedTableOptions::IndexType, bool,
-          test::UserDefinedTimestampTestMode, uint32_t, uint32_t, bool>> {
+      public testing::WithParamInterface<
+          std::tuple<CompressionType, bool, BlockBasedTableOptions::IndexType,
+                     bool, test::UserDefinedTimestampTestMode, uint32_t,
+                     uint32_t, bool, const Comparator*>> {
  protected:
   void SetUp() override {
     compression_type_ = std::get<0>(GetParam());
@@ -270,6 +278,7 @@ class BlockBasedTableReaderTest
     compression_parallel_threads_ = std::get<5>(GetParam());
     compression_dict_bytes_ = std::get<6>(GetParam());
     same_key_diff_ts_ = std::get<7>(GetParam());
+    comparator_ = std::get<8>(GetParam());
     BlockBasedTableReaderBaseTest::SetUp();
   }
 
@@ -295,6 +304,7 @@ class BlockBasedTableReaderTest
   uint32_t compression_parallel_threads_;
   uint32_t compression_dict_bytes_;
   bool same_key_diff_ts_;
+  const Comparator* comparator_{};
 };
 
 class BlockBasedTableReaderGetTest : public BlockBasedTableReaderTest {};
@@ -1022,6 +1032,7 @@ TEST_P(BlockBasedTableReaderTest, MultiScanPrepare) {
       SCOPED_TRACE(std::string("use_async_io=") + std::to_string(use_async_io));
       Options options;
       options.statistics = CreateDBStatistics();
+      options.comparator = comparator_;
       std::shared_ptr<FileSystem> fs = options.env->GetFileSystem();
       ReadOptions read_opts;
       read_opts.fill_cache = fill_cache;
@@ -1029,7 +1040,8 @@ TEST_P(BlockBasedTableReaderTest, MultiScanPrepare) {
       std::vector<std::pair<std::string, std::string>> kv =
           BlockBasedTableReaderBaseTest::GenerateKVMap(
               100 /* num_block */,
-              true /* mixed_with_human_readable_string_value */, ts_sz);
+              true /* mixed_with_human_readable_string_value */, ts_sz,
+              same_key_diff_ts_, comparator_);
       std::string table_name = "BlockBasedTableReaderTest_NewIterator" +
                                CompressionTypeToString(compression_type_) +
                                "_async" + std::to_string(use_async_io);
@@ -1052,7 +1064,7 @@ TEST_P(BlockBasedTableReaderTest, MultiScanPrepare) {
           read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
           /*skip_filters=*/false, TableReaderCaller::kUncategorized));
 
-      MultiScanArgs scan_options(BytewiseComparator());
+      MultiScanArgs scan_options(comparator_);
       scan_options.use_async_io = use_async_io;
       scan_options.insert(ExtractUserKey(kv[0].first),
                           ExtractUserKey(kv[kEntriesPerBlock].first));
@@ -1087,7 +1099,7 @@ TEST_P(BlockBasedTableReaderTest, MultiScanPrepare) {
       iter.reset(table->NewIterator(
           read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
           /*skip_filters=*/false, TableReaderCaller::kUncategorized));
-      scan_options = MultiScanArgs(BytewiseComparator());
+      scan_options = MultiScanArgs(comparator_);
       scan_options.insert(ExtractUserKey(kv[70 * kEntriesPerBlock].first),
                           ExtractUserKey(kv[75 * kEntriesPerBlock].first));
       scan_options.insert(ExtractUserKey(kv[90 * kEntriesPerBlock].first),
@@ -1125,7 +1137,7 @@ TEST_P(BlockBasedTableReaderTest, MultiScanPrepare) {
       // From reads above, blocks 70-75 and 90-95 already in cache
       // So we should read 50-70 76-89 96-99 in three I/Os.
       // If fill_cache is false, then we'll do one giant I/O.
-      scan_options = MultiScanArgs(BytewiseComparator());
+      scan_options = MultiScanArgs(comparator_);
       scan_options.use_async_io = use_async_io;
       scan_options.insert(ExtractUserKey(kv[50 * kEntriesPerBlock].first));
       read_count_before =
@@ -1165,7 +1177,7 @@ TEST_P(BlockBasedTableReaderTest, MultiScanPrepare) {
       iter.reset(table->NewIterator(
           read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
           /*skip_filters=*/false, TableReaderCaller::kUncategorized));
-      scan_options = MultiScanArgs(BytewiseComparator());
+      scan_options = MultiScanArgs(comparator_);
       scan_options.use_async_io = use_async_io;
       scan_options.insert(ExtractUserKey(kv[10 * kEntriesPerBlock].first),
                           ExtractUserKey(kv[20 * kEntriesPerBlock].first));
@@ -1195,7 +1207,7 @@ TEST_P(BlockBasedTableReaderTest, MultiScanPrepare) {
       iter.reset(table->NewIterator(
           read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
           /*skip_filters=*/false, TableReaderCaller::kUncategorized));
-      scan_options = MultiScanArgs(BytewiseComparator());
+      scan_options = MultiScanArgs(comparator_);
       scan_options.use_async_io = use_async_io;
       scan_options.insert(ExtractUserKey(kv[10 * kEntriesPerBlock].first));
       scan_options.insert(ExtractUserKey(kv[11 * kEntriesPerBlock].first));
@@ -1226,6 +1238,7 @@ TEST_P(BlockBasedTableReaderTest, MultiScanPrefetchSizeLimit) {
     return;
   }
   Options options;
+  options.comparator = comparator_;
   ReadOptions read_opts;
   size_t ts_sz = options.comparator->timestamp_size();
 
@@ -1233,7 +1246,7 @@ TEST_P(BlockBasedTableReaderTest, MultiScanPrefetchSizeLimit) {
   std::vector<std::pair<std::string, std::string>> kv =
       BlockBasedTableReaderBaseTest::GenerateKVMap(
           20 /* num_block */, true /* mixed_with_human_readable_string_value */,
-          ts_sz);
+          ts_sz, same_key_diff_ts_, comparator_);
 
   std::string table_name = "BlockBasedTableReaderTest_PrefetchSizeLimit" +
                            CompressionTypeToString(compression_type_);
@@ -1259,7 +1272,7 @@ TEST_P(BlockBasedTableReaderTest, MultiScanPrefetchSizeLimit) {
         read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
         /*skip_filters=*/false, TableReaderCaller::kUncategorized));
 
-    MultiScanArgs scan_options(BytewiseComparator());
+    MultiScanArgs scan_options(comparator_);
     scan_options.max_prefetch_size = 1024;  // less than block size
     scan_options.insert(ExtractUserKey(kv[0].first),
                         ExtractUserKey(kv[5].first));
@@ -1279,7 +1292,7 @@ TEST_P(BlockBasedTableReaderTest, MultiScanPrefetchSizeLimit) {
         read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
         /*skip_filters=*/false, TableReaderCaller::kUncategorized));
 
-    MultiScanArgs scan_options(BytewiseComparator());
+    MultiScanArgs scan_options(comparator_);
     scan_options.max_prefetch_size = 9 * 1024;  // 9KB - 2 blocks with buffer
     scan_options.insert(ExtractUserKey(kv[1 * kEntriesPerBlock].first),
                         ExtractUserKey(kv[8 * kEntriesPerBlock].first));
@@ -1310,7 +1323,7 @@ TEST_P(BlockBasedTableReaderTest, MultiScanPrefetchSizeLimit) {
         read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
         /*skip_filters=*/false, TableReaderCaller::kUncategorized));
 
-    MultiScanArgs scan_options(BytewiseComparator());
+    MultiScanArgs scan_options(comparator_);
     scan_options.max_prefetch_size = 3 * 4 * 1024 + 1024;  // 3 blocks + 1KB
     scan_options.insert(ExtractUserKey(kv[0].first),
                         ExtractUserKey(kv[5 * kEntriesPerBlock].first));
@@ -1336,7 +1349,7 @@ TEST_P(BlockBasedTableReaderTest, MultiScanPrefetchSizeLimit) {
         read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
         /*skip_filters=*/false, TableReaderCaller::kUncategorized));
 
-    MultiScanArgs scan_options(BytewiseComparator());
+    MultiScanArgs scan_options(comparator_);
     scan_options.max_prefetch_size = 5 * 4 * 1024 + 1024;  // 5 blocks + 1KB
     // Will read 5 entries from first scan range, and 4 blocks from the second
     // scan range
@@ -1373,7 +1386,7 @@ TEST_P(BlockBasedTableReaderTest, MultiScanPrefetchSizeLimit) {
         read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
         /*skip_filters=*/false, TableReaderCaller::kUncategorized));
 
-    MultiScanArgs scan_options(BytewiseComparator());
+    MultiScanArgs scan_options(comparator_);
     scan_options.max_prefetch_size = 10 * 1024 * 1024;  // 10MB
     scan_options.insert(ExtractUserKey(kv[0].first),
                         ExtractUserKey(kv[5].first));
@@ -1440,7 +1453,8 @@ INSTANTIATE_TEST_CASE_P(
             BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey),
         ::testing::Values(false), ::testing::ValuesIn(test::GetUDTTestModes()),
         ::testing::Values(1, 2), ::testing::Values(0, 4096),
-        ::testing::Values(false)));
+        ::testing::Values(false),
+        ::testing::Values(BytewiseComparator(), ReverseBytewiseComparator())));
 INSTANTIATE_TEST_CASE_P(
     BlockBasedTableReaderGetTest, BlockBasedTableReaderGetTest,
     ::testing::Combine(
@@ -1452,7 +1466,8 @@ INSTANTIATE_TEST_CASE_P(
             BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey),
         ::testing::Values(false), ::testing::ValuesIn(test::GetUDTTestModes()),
         ::testing::Values(1, 2), ::testing::Values(0, 4096),
-        ::testing::Values(false, true)));
+        ::testing::Values(false, true),
+        ::testing::Values(BytewiseComparator(), ReverseBytewiseComparator())));
 INSTANTIATE_TEST_CASE_P(
     StrictCapacityLimitReaderTest, StrictCapacityLimitReaderTest,
     ::testing::Combine(
@@ -1461,7 +1476,8 @@ INSTANTIATE_TEST_CASE_P(
             BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch),
         ::testing::Values(false), ::testing::ValuesIn(test::GetUDTTestModes()),
         ::testing::Values(1, 2), ::testing::Values(0),
-        ::testing::Values(false, true)));
+        ::testing::Values(false, true),
+        ::testing::Values(BytewiseComparator(), ReverseBytewiseComparator())));
 INSTANTIATE_TEST_CASE_P(
     VerifyChecksum, BlockBasedTableReaderTestVerifyChecksum,
     ::testing::Combine(
@@ -1470,8 +1486,8 @@ INSTANTIATE_TEST_CASE_P(
         ::testing::Values(
             BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch),
         ::testing::Values(true), ::testing::ValuesIn(test::GetUDTTestModes()),
-        ::testing::Values(1, 2), ::testing::Values(0),
-        ::testing::Values(false)));
+        ::testing::Values(1, 2), ::testing::Values(0), ::testing::Values(false),
+        ::testing::Values(BytewiseComparator(), ReverseBytewiseComparator())));
 
 }  // namespace ROCKSDB_NAMESPACE
 
diff --git a/table/table_test.cc b/table/table_test.cc
index bdbfe7750ded..e1d01db61264 100644
--- a/table/table_test.cc
+++ b/table/table_test.cc
@@ -8056,7 +8056,7 @@ TEST_F(UserDefinedIndexTest, IngestTest) {
   ro.iterate_upper_bound = nullptr;
   iter.reset(db->NewIterator(ro, cfh));
   ASSERT_NE(iter, nullptr);
-  MultiScanArgs scan_opts;
+  MultiScanArgs scan_opts(options.comparator);
   std::unordered_map<std::string, std::string> property_bag;
   property_bag["count"] = std::to_string(25);
   scan_opts.insert(Slice("key20"), std::optional(property_bag));
@@ -8146,7 +8146,7 @@ TEST_F(UserDefinedIndexTest, EmptyRangeTest) {
 
   ro.table_index_factory = user_defined_index_factory.get();
   std::vector<int> key_counts;
-  MultiScanArgs scan_opts;
+  MultiScanArgs scan_opts(options.comparator);
   std::unordered_map<std::string, std::string> property_bag;
   property_bag["count"] = std::to_string(5);
   // Empty scans
@@ -8405,7 +8405,7 @@ TEST_F(UserDefinedIndexTest, ConfigTest) {
   ro.table_index_factory = user_defined_index_factory.get();
   std::unique_ptr<Iterator> iter(db->NewIterator(ro, cfh));
   ASSERT_NE(iter, nullptr);
-  MultiScanArgs scan_opts;
+  MultiScanArgs scan_opts(options.comparator);
   std::unordered_map<std::string, std::string> property_bag;
   property_bag["count"] = std::to_string(25);
   scan_opts.insert(Slice("key20"), std::optional(property_bag));
diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc
index 003576da5a5a..26490510e8ff 100644
--- a/tools/db_bench_tool.cc
+++ b/tools/db_bench_tool.cc
@@ -1847,13 +1847,17 @@ DEFINE_bool(universal_reduce_file_locking,
                 .compaction_options_universal.reduce_file_locking,
             "See Options().compaction_options_universal.reduce_file_locking");
 
-DEFINE_uint64(multiscan_coalesce_threshold,
-              ROCKSDB_NAMESPACE::MultiScanArgs().io_coalesce_threshold,
-              "Configures io coalescing threshold for multiscans");
+DEFINE_uint64(
+    multiscan_coalesce_threshold,
+    ROCKSDB_NAMESPACE::MultiScanArgs(ROCKSDB_NAMESPACE::BytewiseComparator())
+        .io_coalesce_threshold,
+    "Configures io coalescing threshold for multiscans");
 
-DEFINE_bool(multiscan_use_async_io,
-            ROCKSDB_NAMESPACE::MultiScanArgs().use_async_io,
-            "Sets MultiScanArgs::use_async_io");
+DEFINE_bool(
+    multiscan_use_async_io,
+    ROCKSDB_NAMESPACE::MultiScanArgs(ROCKSDB_NAMESPACE::BytewiseComparator())
+        .use_async_io,
+    "Sets MultiScanArgs::use_async_io");
 
 namespace ROCKSDB_NAMESPACE {
 namespace {
@@ -6421,7 +6425,7 @@ class Benchmark {
     int64_t num_keys = 1;
     while (!duration.Done(num_keys)) {
       DB* db = SelectDB(thread);
-      MultiScanArgs opts;
+      MultiScanArgs opts(open_options_.comparator);
       opts.io_coalesce_threshold = FLAGS_multiscan_coalesce_threshold;
       opts.use_async_io = FLAGS_multiscan_use_async_io;
       std::vector<std::unique_ptr<const char[]>> guards;
diff --git a/unreleased_history/public_api_changes/MultiScanArgs_contructor_parameter_change.md b/unreleased_history/public_api_changes/MultiScanArgs_contructor_parameter_change.md
new file mode 100644
index 000000000000..5912b4b3631a
--- /dev/null
+++ b/unreleased_history/public_api_changes/MultiScanArgs_contructor_parameter_change.md
@@ -0,0 +1 @@
+MultiScanArgs used to have a default constructor with default parameter of BytewiseComparator. Now it always requires Comparator in its constructor.

From 94e65a2e0b4f817aa4bfa4c96cdf867e7980d7bc Mon Sep 17 00:00:00 2001
From: Xingbo Wang <xbw@fb.com>
Date: Thu, 18 Sep 2025 16:15:50 -0700
Subject: [PATCH 288/500] Add option to validate key during seek in SkipList
 Memtable (#13902)

Summary:
Add a new CF immutable option `paranoid_memory_check_key_checksum_on_seek` that allows additional data integrity validations during seek on SkipList Memtable. When this option is enabled and memtable_protection_bytes_per_key is non zero, skiplist-based memtable will validate the checksum of each key visited during seek operation. The option is opt-in due to performance overhead. This is an enhancement on top of paranoid_memory_checks option.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13902

Test Plan:
* new unit test added for paranoid_memory_check_key_checksum_on_seek=true.
    * existing unit test for paranoid_memory_check_key_checksum_on_seek=false.
    * enable in stress test.

Performance Benchmark: we check for performance regression in read path where data is in memtable only. For each benchmark, the script was run at the same time for main and this PR:

### Memtable-only randomread ops/sec:

* Value size = 100 Bytes
```
for B in 0 1 2 4 8; do (for I in $(seq 1 50);do  ./db_bench --benchmarks=fillseq,readrandom --write_buffer_size=268435456 --writes=250000 --value_size=100 --num=250000 --reads=500000  --seed=1723056275 --paranoid_memory_check_key_checksum_on_seek=true --memtable_protection_bytes_per_key=$B 2>&1 | grep "readrandom"; done;) | awk '{ t += $5; c++; print } END { print 1.0 * t / c }'; done;
```

1. Main: 928999
2. PR with paranoid_memory_check_key_checksum_on_seek=false: 930993 (+0.2%)
3. PR with paranoid_memory_check_key_checksum_on_seek=true:
3.1 memtable_protection_bytes_per_key=1: 464577 (-50%)
3.2 memtable_protection_bytes_per_key=2: 470319 (-49%)
3.3 memtable_protection_bytes_per_key=4: 468457 (-50%)
3.4 memtable_protection_bytes_per_key=8: 465061 (-50%)

* Value size = 1000 Bytes
```
for B in 0 1 2 4 8; do (for I in $(seq 1 50);do  ./db_bench --benchmarks=fillseq,readrandom --write_buffer_size=268435456 --writes=250000 --value_size=1000 --num=250000 --reads=500000  --seed=1723056275 --paranoid_memory_check_key_checksum_on_seek=true --memtable_protection_bytes_per_key=$B 2>&1 | grep "readrandom"; done;) | awk '{ t += $5; c++; print } END { print 1.0 * t / c }'; done;
```

1. Main: 601321
2. PR with paranoid_memory_check_key_checksum_on_seek=false: 607885 (+1.1%)
3. PR with paranoid_memory_check_key_checksum_on_seek=true:
3.1 memtable_protection_bytes_per_key=1: 185742 (-69%)
3.2 memtable_protection_bytes_per_key=2: 177167 (-71%)
3.3 memtable_protection_bytes_per_key=4: 185908 (-69%)
3.4 memtable_protection_bytes_per_key=8: 183639 (-69%)

Reviewed By: pdillinger

Differential Revision: D81199245

Pulled By: xingbowang

fbshipit-source-id: e3c29552ab92f2c5f360361366a293fa26934913
---
 db/db_basic_test.cc                           |   1 +
 db/db_memtable_test.cc                        | 129 ++++++++++++++++++
 db/memtable.cc                                |  51 +++++--
 db/memtable.h                                 |   6 +
 db_stress_tool/db_stress_common.h             |   1 +
 db_stress_tool/db_stress_gflags.cc            |   5 +
 db_stress_tool/db_stress_test_base.cc         |   2 +
 include/rocksdb/advanced_options.h            |  18 ++-
 include/rocksdb/memtablerep.h                 |  20 +--
 include/rocksdb/options.h                     |   1 +
 memtable/inlineskiplist.h                     |  93 ++++++++-----
 memtable/skiplistrep.cc                       |  23 +++-
 memtable/vectorrep.cc                         |  13 +-
 options/cf_options.cc                         |   7 +
 options/cf_options.h                          |   4 +
 options/options_helper.cc                     |   2 +
 options/options_settable_test.cc              |   1 +
 tools/db_bench_tool.cc                        |   5 +
 tools/db_crashtest.py                         |   5 +
 .../improve_data_integrity_check_on_seek.md   |   1 +
 20 files changed, 320 insertions(+), 68 deletions(-)
 create mode 100644 unreleased_history/new_features/improve_data_integrity_check_on_seek.md

diff --git a/db/db_basic_test.cc b/db/db_basic_test.cc
index cb7313e090ca..9a4a5b983621 100644
--- a/db/db_basic_test.cc
+++ b/db/db_basic_test.cc
@@ -5087,6 +5087,7 @@ TEST_F(DBBasicTest, DisallowMemtableWrite) {
   Options options_disallow = options_allow;
   options_disallow.disallow_memtable_writes = true;
   options_disallow.paranoid_memory_checks = true;
+  options_disallow.memtable_veirfy_per_key_checksum_on_seek = true;
 
   DestroyAndReopen(options_allow);
   // CFs allowing and disallowing memtable write
diff --git a/db/db_memtable_test.cc b/db/db_memtable_test.cc
index 1768cb9c0866..0e3beb5edfbf 100644
--- a/db/db_memtable_test.cc
+++ b/db/db_memtable_test.cc
@@ -339,6 +339,135 @@ TEST_F(DBMemTableTest, ColumnFamilyId) {
   }
 }
 
+class DBMemTableTestForSeek : public DBMemTableTest,
+                              virtual public ::testing::WithParamInterface<
+                                  std::tuple<bool, bool, bool>> {};
+
+TEST_P(DBMemTableTestForSeek, IntegrityChecks) {
+  // Validate key corruption could be detected during seek.
+  // We insert many keys into skiplist. Then we corrupt the each key one at a
+  // time. With memtable_veirfy_per_key_checksum_on_seek enabled, when the
+  // corrupted key is searched, the checksum of every key visited during the
+  // seek is validated. It will report data corruption. Otherwise seek returns
+  // not found.
+  auto allow_data_in_error = std::get<0>(GetParam());
+  Options options = CurrentOptions();
+  options.allow_data_in_errors = allow_data_in_error;
+  options.paranoid_memory_checks = std::get<1>(GetParam());
+  options.memtable_veirfy_per_key_checksum_on_seek = std::get<2>(GetParam());
+  options.memtable_protection_bytes_per_key = 8;
+  DestroyAndReopen(options);
+
+  // capture the data pointer of all of the keys
+  std::vector<char*> raw_data_pointer;
+
+  // Insert enough keys, so memtable would create multiple levels.
+  auto key_count = 100;
+  for (int i = 0; i < key_count; i++) {
+    // The last digit of the key will be corrupted from value 0 to value 5
+    ASSERT_OK(Put(Key(i * 10), "val0"));
+  }
+
+  ReadOptions rops;
+
+  // Iterate all the keys to get key pointers
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->SetCallBack("InlineSkipList::Iterator::Next::key",
+                                        [&raw_data_pointer](void* key) {
+                                          auto p = static_cast<char*>(key);
+                                          raw_data_pointer.push_back(p);
+                                        });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  {
+    std::unique_ptr<Iterator> iter{db_->NewIterator(rops)};
+    iter->Seek(Key(0));
+    while (iter->Valid()) {
+      ASSERT_OK(iter->status());
+      iter->Next();
+    }
+    // check status after valid returned false.
+    auto status = iter->status();
+    ASSERT_TRUE(status.ok());
+  }
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  ASSERT_EQ(raw_data_pointer.size(), key_count);
+
+  bool enable_key_validation_on_seek =
+      options.memtable_veirfy_per_key_checksum_on_seek;
+
+  // For each key, corrupt it, validate corruption is detected correctly, then
+  // revert it.
+  for (int i = 0; i < key_count; i++) {
+    std::string key_to_corrupt = Key(i * 10);
+    raw_data_pointer[i][key_to_corrupt.size()] = '5';
+
+    auto corrupted_key = key_to_corrupt;
+    corrupted_key.data()[key_to_corrupt.size() - 1] = '5';
+    auto corrupted_key_slice =
+        Slice(corrupted_key.data(), corrupted_key.length());
+    auto corrupted_key_hex = corrupted_key_slice.ToString(/*hex=*/true);
+
+    {
+      // Test Get API
+      std::string val;
+      auto status = db_->Get(rops, key_to_corrupt, &val);
+      if (enable_key_validation_on_seek) {
+        ASSERT_TRUE(status.IsCorruption()) << key_to_corrupt;
+        ASSERT_EQ(
+            status.ToString().find(corrupted_key_hex) != std::string::npos,
+            allow_data_in_error)
+            << status.ToString() << "\n"
+            << corrupted_key_hex;
+      } else {
+        ASSERT_TRUE(status.IsNotFound());
+      }
+    }
+
+    {
+      // Test MultiGet API
+      std::vector<std::string> vals;
+      std::vector<Status> statuses = db_->MultiGet(
+          rops, {db_->DefaultColumnFamily()}, {key_to_corrupt}, &vals, nullptr);
+      if (enable_key_validation_on_seek) {
+        ASSERT_TRUE(statuses[0].IsCorruption());
+        ASSERT_EQ(
+            statuses[0].ToString().find(corrupted_key_hex) != std::string::npos,
+            allow_data_in_error);
+      } else {
+        ASSERT_TRUE(statuses[0].IsNotFound());
+      }
+    }
+
+    {
+      // Test Iterator Seek API
+      std::unique_ptr<Iterator> iter{db_->NewIterator(rops)};
+      ASSERT_OK(iter->status());
+      iter->Seek(key_to_corrupt);
+      auto status = iter->status();
+      if (enable_key_validation_on_seek) {
+        ASSERT_TRUE(status.IsCorruption());
+        ASSERT_EQ(
+            status.ToString().find(corrupted_key_hex) != std::string::npos,
+            allow_data_in_error);
+      } else {
+        ASSERT_FALSE(iter->Valid());
+        ASSERT_FALSE(status.ok());
+      }
+    }
+
+    // revert the key corruption.
+    raw_data_pointer[i][key_to_corrupt.size()] = '0';
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(DBMemTableTestForSeek, DBMemTableTestForSeek,
+                        ::testing::Combine(::testing::Bool(), ::testing::Bool(),
+                                           ::testing::Bool()));
+
 TEST_F(DBMemTableTest, IntegrityChecks) {
   // We insert keys key000000, key000001 and key000002 into skiplist at fixed
   // height 1 (smallest height). Then we corrupt the second key to aey000001 to
diff --git a/db/memtable.cc b/db/memtable.cc
index 3ef4db0ee277..7a2b0fe6880a 100644
--- a/db/memtable.cc
+++ b/db/memtable.cc
@@ -70,7 +70,9 @@ ImmutableMemTableOptions::ImmutableMemTableOptions(
       protection_bytes_per_key(
           mutable_cf_options.memtable_protection_bytes_per_key),
       allow_data_in_errors(ioptions.allow_data_in_errors),
-      paranoid_memory_checks(mutable_cf_options.paranoid_memory_checks) {}
+      paranoid_memory_checks(mutable_cf_options.paranoid_memory_checks),
+      memtable_veirfy_per_key_checksum_on_seek(
+          mutable_cf_options.memtable_veirfy_per_key_checksum_on_seek) {}
 
 MemTable::MemTable(const InternalKeyComparator& cmp,
                    const ImmutableOptions& ioptions,
@@ -115,7 +117,13 @@ MemTable::MemTable(const InternalKeyComparator& cmp,
       oldest_key_time_(std::numeric_limits<uint64_t>::max()),
       approximate_memory_usage_(0),
       memtable_max_range_deletions_(
-          mutable_cf_options.memtable_max_range_deletions) {
+          mutable_cf_options.memtable_max_range_deletions),
+      key_validation_callback_(
+          (moptions_.protection_bytes_per_key != 0 &&
+           moptions_.memtable_veirfy_per_key_checksum_on_seek)
+              ? std::bind(&MemTable::ValidateKey, this, std::placeholders::_1,
+                          std::placeholders::_2)
+              : std::function<Status(const char*, bool)>(nullptr)) {
   UpdateFlushState();
   // something went wrong if we need to flush before inserting anything
   assert(!ShouldScheduleFlush());
@@ -394,7 +402,11 @@ class MemTableIterator : public InternalIterator {
             !mem.GetImmutableMemTableOptions()->inplace_update_support),
         arena_mode_(arena != nullptr),
         paranoid_memory_checks_(mem.moptions_.paranoid_memory_checks),
-        allow_data_in_error(mem.moptions_.allow_data_in_errors) {
+        validate_on_seek_(
+            mem.moptions_.paranoid_memory_checks ||
+            mem.moptions_.memtable_veirfy_per_key_checksum_on_seek),
+        allow_data_in_error_(mem.moptions_.allow_data_in_errors),
+        key_validation_callback_(mem.key_validation_callback_) {
     if (kind == kRangeDelEntries) {
       iter_ = mem.range_del_table_->GetIterator(arena);
     } else if (prefix_extractor_ != nullptr &&
@@ -463,8 +475,10 @@ class MemTableIterator : public InternalIterator {
         }
       }
     }
-    if (paranoid_memory_checks_) {
-      status_ = iter_->SeekAndValidate(k, nullptr, allow_data_in_error);
+    if (validate_on_seek_) {
+      status_ = iter_->SeekAndValidate(k, nullptr, allow_data_in_error_,
+                                       paranoid_memory_checks_,
+                                       key_validation_callback_);
     } else {
       iter_->Seek(k, nullptr);
     }
@@ -488,8 +502,10 @@ class MemTableIterator : public InternalIterator {
         }
       }
     }
-    if (paranoid_memory_checks_) {
-      status_ = iter_->SeekAndValidate(k, nullptr, allow_data_in_error);
+    if (validate_on_seek_) {
+      status_ = iter_->SeekAndValidate(k, nullptr, allow_data_in_error_,
+                                       paranoid_memory_checks_,
+                                       key_validation_callback_);
     } else {
       iter_->Seek(k, nullptr);
     }
@@ -518,7 +534,7 @@ class MemTableIterator : public InternalIterator {
     PERF_COUNTER_ADD(next_on_memtable_count, 1);
     assert(Valid());
     if (paranoid_memory_checks_) {
-      status_ = iter_->NextAndValidate(allow_data_in_error);
+      status_ = iter_->NextAndValidate(allow_data_in_error_);
     } else {
       iter_->Next();
       TEST_SYNC_POINT_CALLBACK("MemTableIterator::Next:0", iter_);
@@ -540,7 +556,7 @@ class MemTableIterator : public InternalIterator {
     PERF_COUNTER_ADD(prev_on_memtable_count, 1);
     assert(Valid());
     if (paranoid_memory_checks_) {
-      status_ = iter_->PrevAndValidate(allow_data_in_error);
+      status_ = iter_->PrevAndValidate(allow_data_in_error_);
     } else {
       iter_->Prev();
     }
@@ -599,7 +615,9 @@ class MemTableIterator : public InternalIterator {
   bool value_pinned_;
   bool arena_mode_;
   const bool paranoid_memory_checks_;
-  const bool allow_data_in_error;
+  const bool validate_on_seek_;
+  const bool allow_data_in_error_;
+  const std::function<Status(const char*, bool)> key_validation_callback_;
 
   void VerifyEntryChecksum() {
     if (protection_bytes_per_key_ > 0 && Valid()) {
@@ -1493,11 +1511,13 @@ void MemTable::GetFromTable(const LookupKey& key,
   saver.allow_data_in_errors = moptions_.allow_data_in_errors;
   saver.protection_bytes_per_key = moptions_.protection_bytes_per_key;
 
-  if (!moptions_.paranoid_memory_checks) {
+  if (!moptions_.paranoid_memory_checks &&
+      !moptions_.memtable_veirfy_per_key_checksum_on_seek) {
     table_->Get(key, &saver, SaveValue);
   } else {
-    Status check_s = table_->GetAndValidate(key, &saver, SaveValue,
-                                            moptions_.allow_data_in_errors);
+    Status check_s = table_->GetAndValidate(
+        key, &saver, SaveValue, moptions_.allow_data_in_errors,
+        moptions_.paranoid_memory_checks, key_validation_callback_);
     if (check_s.IsCorruption()) {
       *(saver.status) = check_s;
       // Should stop searching the LSM.
@@ -1508,6 +1528,11 @@ void MemTable::GetFromTable(const LookupKey& key,
   *seq = saver.seq;
 }
 
+Status MemTable::ValidateKey(const char* key, bool allow_data_in_errors) {
+  return VerifyEntryChecksum(key, moptions_.protection_bytes_per_key,
+                             allow_data_in_errors);
+}
+
 void MemTable::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
                         ReadCallback* callback, bool immutable_memtable) {
   // The sequence number is updated synchronously in version_set.h
diff --git a/db/memtable.h b/db/memtable.h
index b3e6069531b8..fb3d2323156b 100644
--- a/db/memtable.h
+++ b/db/memtable.h
@@ -64,6 +64,7 @@ struct ImmutableMemTableOptions {
   uint32_t protection_bytes_per_key;
   bool allow_data_in_errors;
   bool paranoid_memory_checks;
+  bool memtable_veirfy_per_key_checksum_on_seek;
 };
 
 // Batched counters to updated when inserting keys in one write batch.
@@ -826,6 +827,9 @@ class MemTable final : public ReadOnlyMemTable {
                                     uint32_t protection_bytes_per_key,
                                     bool allow_data_in_errors = false);
 
+  // Validate the checksum of the key/value pair.
+  Status ValidateKey(const char* key, bool allow_data_in_errors);
+
  private:
   enum FlushStateEnum { FLUSH_NOT_REQUESTED, FLUSH_REQUESTED, FLUSH_SCHEDULED };
 
@@ -956,6 +960,8 @@ class MemTable final : public ReadOnlyMemTable {
                            SequenceNumber s, char* checksum_ptr);
 
   void MaybeUpdateNewestUDT(const Slice& user_key);
+
+  const std::function<Status(const char*, bool)> key_validation_callback_;
 };
 
 const char* EncodeKey(std::string* scratch, const Slice& target);
diff --git a/db_stress_tool/db_stress_common.h b/db_stress_tool/db_stress_common.h
index 8bfcb7b29746..58d7cf08e3b8 100644
--- a/db_stress_tool/db_stress_common.h
+++ b/db_stress_tool/db_stress_common.h
@@ -275,6 +275,7 @@ DECLARE_string(last_level_temperature);
 DECLARE_string(default_write_temperature);
 DECLARE_string(default_temperature);
 DECLARE_bool(paranoid_memory_checks);
+DECLARE_bool(memtable_veirfy_per_key_checksum_on_seek);
 
 // Options for transaction dbs.
 // Use TransactionDB (a.k.a. Pessimistic Transaction DB)
diff --git a/db_stress_tool/db_stress_gflags.cc b/db_stress_tool/db_stress_gflags.cc
index 8e92dd25d960..1cc8d8e1e610 100644
--- a/db_stress_tool/db_stress_gflags.cc
+++ b/db_stress_tool/db_stress_gflags.cc
@@ -1484,6 +1484,11 @@ DEFINE_bool(paranoid_memory_checks,
             ROCKSDB_NAMESPACE::Options().paranoid_memory_checks,
             "Sets CF option paranoid_memory_checks.");
 
+DEFINE_bool(
+    memtable_veirfy_per_key_checksum_on_seek,
+    ROCKSDB_NAMESPACE::Options().memtable_veirfy_per_key_checksum_on_seek,
+    "Sets CF option memtable_veirfy_per_key_checksum_on_seek.");
+
 DEFINE_uint32(commit_bypass_memtable_one_in, 0,
               "If greater than zero, transaction option will set "
               "commit_bypass_memtable to per every N transactions on average.");
diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc
index c3c7d4bdc7f1..06024c25cafc 100644
--- a/db_stress_tool/db_stress_test_base.cc
+++ b/db_stress_tool/db_stress_test_base.cc
@@ -4442,6 +4442,8 @@ void InitializeOptionsFromFlags(
       FLAGS_memtable_protection_bytes_per_key;
   options.block_protection_bytes_per_key = FLAGS_block_protection_bytes_per_key;
   options.paranoid_memory_checks = FLAGS_paranoid_memory_checks;
+  options.memtable_veirfy_per_key_checksum_on_seek =
+      FLAGS_memtable_veirfy_per_key_checksum_on_seek;
 
   // Integrated BlobDB
   options.enable_blob_files = FLAGS_enable_blob_files;
diff --git a/include/rocksdb/advanced_options.h b/include/rocksdb/advanced_options.h
index 90767a06ecd5..b4e7a30e9523 100644
--- a/include/rocksdb/advanced_options.h
+++ b/include/rocksdb/advanced_options.h
@@ -1101,12 +1101,22 @@ struct AdvancedColumnFamilyOptions {
   uint32_t bottommost_file_compaction_delay = 0;
 
   // Enables additional integrity checks during reads/scans.
-  // Specifically, for skiplist-based memtables, we verify that keys visited
-  // are in order. This is helpful to detect corrupted memtable keys during
-  // reads. Enabling this feature incurs a performance overhead due to an
-  // additional key comparison during memtable lookup.
+  // Specifically, for skiplist-based memtables, key ordering validation could
+  // be enabled optionally. This is helpful to detect corrupted memtable keys
+  // during reads. Enabling this feature incurs a performance overhead due to
+  // additional comparison during memtable lookup.
   bool paranoid_memory_checks = false;
 
+  // Enables additional integrity checks during seek.
+  // Specifically, for skiplist-based memtables, key checksum validation could
+  // be enabled during seek optionally. This is helpful to detect corrupted
+  // memtable keys during reads. Enabling this feature incurs a performance
+  // overhead due to additional key checksum validation during memtable seek
+  // operation.
+  // This option depends on memtable_protection_bytes_per_key to be non zero.
+  // If memtable_protection_bytes_per_key is zero, no validation is performed.
+  bool memtable_veirfy_per_key_checksum_on_seek = false;
+
   // When an iterator scans this number of invisible entries (tombstones or
   // hidden puts) from the active memtable during a single iterator operation,
   // we will attempt to flush the memtable. Currently only forward scans are
diff --git a/include/rocksdb/memtablerep.h b/include/rocksdb/memtablerep.h
index dff6e4248b2a..00d08562762b 100644
--- a/include/rocksdb/memtablerep.h
+++ b/include/rocksdb/memtablerep.h
@@ -38,6 +38,7 @@
 #include <stdint.h>
 #include <stdlib.h>
 
+#include <functional>
 #include <memory>
 #include <stdexcept>
 #include <unordered_set>
@@ -201,11 +202,12 @@ class MemTableRep {
                    bool (*callback_func)(void* arg, const char* entry));
 
   // Same as Get() but performs data integrity validation.
-  virtual Status GetAndValidate(const LookupKey& /* k */,
-                                void* /* callback_args */,
-                                bool (* /* callback_func */)(void* arg,
-                                                             const char* entry),
-                                bool /*allow_data_in_error*/) {
+  virtual Status GetAndValidate(
+      const LookupKey& /* k */, void* /* callback_args */,
+      bool (* /* callback_func */)(void* arg, const char* entry),
+      bool /* allow_data_in_error */, bool /* detect_key_out_of_order */,
+      const std::function<Status(const char*, bool)>&
+      /* key_validation_callback */) {
     return Status::NotSupported("GetAndValidate() not implemented.");
   }
 
@@ -276,9 +278,11 @@ class MemTableRep {
     // Seek and perform integrity validations on the skip list.
     // Iterator becomes invalid and Corruption is returned if a
     // corruption is found.
-    virtual Status SeekAndValidate(const Slice& /* internal_key */,
-                                   const char* /* memtable_key */,
-                                   bool /* allow_data_in_errors */) {
+    virtual Status SeekAndValidate(
+        const Slice& /* internal_key */, const char* /* memtable_key */,
+        bool /* allow_data_in_errors */, bool /* detect_key_out_of_order */,
+        const std::function<Status(const char*, bool)>&
+        /* key_validation_callback */) {
       return Status::NotSupported("SeekAndValidate() not implemented.");
     }
 
diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index 78ea33f564f2..1f4e237d5fbb 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -623,6 +623,7 @@ struct DBOptions {
   // checking for corruption, including
   // * paranoid_file_checks
   // * paranoid_memory_checks
+  // * memtable_veirfy_per_key_checksum_on_seek
   // * DB::VerifyChecksum()
   //
   // Default: true
diff --git a/memtable/inlineskiplist.h b/memtable/inlineskiplist.h
index caa4c3aec4fa..a25436af495b 100644
--- a/memtable/inlineskiplist.h
+++ b/memtable/inlineskiplist.h
@@ -180,8 +180,11 @@ class InlineSkipList {
     // Advance to the first entry with a key >= target
     void Seek(const char* target);
 
-    [[nodiscard]] Status SeekAndValidate(const char* target,
-                                         bool allow_data_in_errors);
+    [[nodiscard]] Status SeekAndValidate(
+        const char* target, bool allow_data_in_errors,
+        bool detect_key_out_of_order,
+        const std::function<Status(const char*, bool)>&
+            key_validation_callback);
 
     // Retreat to the last entry with a key <= target
     void SeekForPrev(const char* target);
@@ -243,20 +246,23 @@ class InlineSkipList {
   bool KeyIsAfterNode(const DecodedKey& key, Node* n) const;
 
   // Returns the earliest node with a key >= key.
-  // Returns nullptr if there is no such node.
-  // @param out_of_order_node If not null, will validate the order of visited
-  // nodes. If a pair of out-of-order nodes n1 and n2 are found, n1 will be
-  // returned and *out_of_order_node will be set to n2.
-  Node* FindGreaterOrEqual(const char* key, Node** out_of_order_node) const;
+  // Returns OK, if no corruption is found.
+  // node is set to the found node, or to nullptr if no node is found.
+  // Returns Corruption if a corruption is found.
+  Status FindGreaterOrEqual(const char* key, Node** node,
+                            bool detect_key_out_of_order,
+                            bool allow_data_in_errors,
+                            const std::function<Status(const char*, bool)>&
+                                key_validation_callback) const;
 
   // Returns the latest node with a key < key.
   // Returns head_ if there is no such node.
   // Fills prev[level] with pointer to previous node at "level" for every
   // level in [0..max_height_-1], if prev is non-null.
-  // @param out_of_order_node If not null, will validate the order of visited
+  // @param corrupted_node If not null, will validate the order of visited
   // nodes. If a pair of out-of-order nodes n1 and n2 are found, n1 will be
-  // returned and *out_of_order_node will be set to n2.
-  Node* FindLessThan(const char* key, Node** out_of_order_node) const;
+  // returned and *corrupted_node will be set to n2.
+  Node* FindLessThan(const char* key, Node** corrupted_node) const;
 
   // Return the last node in the list.
   // Return head_ if list is empty.
@@ -396,6 +402,12 @@ inline const char* InlineSkipList<Comparator>::Iterator::key() const {
 template <class Comparator>
 inline void InlineSkipList<Comparator>::Iterator::Next() {
   assert(Valid());
+
+  // Capture the key before move on to next node
+  TEST_SYNC_POINT_CALLBACK(
+      "InlineSkipList::Iterator::Next::key",
+      static_cast<void*>(const_cast<char*>((node_->Key()))));
+
   node_ = node_->Next(0);
 }
 
@@ -403,6 +415,12 @@ template <class Comparator>
 inline Status InlineSkipList<Comparator>::Iterator::NextAndValidate(
     bool allow_data_in_errors) {
   assert(Valid());
+
+  // Capture the key before move on to next node
+  TEST_SYNC_POINT_CALLBACK(
+      "InlineSkipList::Iterator::Next::key",
+      static_cast<void*>(const_cast<char*>((node_->Key()))));
+
   Node* prev_node = node_;
   node_ = node_->Next(0);
   // Verify that keys are increasing.
@@ -432,12 +450,12 @@ inline Status InlineSkipList<Comparator>::Iterator::PrevAndValidate(
     const bool allow_data_in_errors) {
   assert(Valid());
   // Skip list validation is done in FindLessThan().
-  Node* out_of_order_node = nullptr;
-  node_ = list_->FindLessThan(node_->Key(), &out_of_order_node);
-  if (out_of_order_node) {
+  Node* corrupted_node = nullptr;
+  node_ = list_->FindLessThan(node_->Key(), &corrupted_node);
+  if (corrupted_node) {
     Node* node = node_;
     node_ = nullptr;
-    return Corruption(node, out_of_order_node, allow_data_in_errors);
+    return Corruption(node, corrupted_node, allow_data_in_errors);
   }
   if (node_ == list_->head_) {
     node_ = nullptr;
@@ -447,20 +465,19 @@ inline Status InlineSkipList<Comparator>::Iterator::PrevAndValidate(
 
 template <class Comparator>
 inline void InlineSkipList<Comparator>::Iterator::Seek(const char* target) {
-  node_ = list_->FindGreaterOrEqual(target, nullptr);
+  auto status =
+      list_->FindGreaterOrEqual(target, &node_, false, false, nullptr);
+  assert(status.ok());
 }
 
 template <class Comparator>
 inline Status InlineSkipList<Comparator>::Iterator::SeekAndValidate(
-    const char* target, const bool allow_data_in_errors) {
-  Node* out_of_order_node = nullptr;
-  node_ = list_->FindGreaterOrEqual(target, &out_of_order_node);
-  if (out_of_order_node) {
-    Node* node = node_;
-    node_ = nullptr;
-    return Corruption(node, out_of_order_node, allow_data_in_errors);
-  }
-  return Status::OK();
+    const char* target, const bool allow_data_in_errors,
+    bool check_key_out_of_order,
+    const std::function<Status(const char*, bool)>& key_validation_callback) {
+  return list_->FindGreaterOrEqual(target, &node_, allow_data_in_errors,
+                                   check_key_out_of_order,
+                                   key_validation_callback);
 }
 
 template <class Comparator>
@@ -527,15 +544,18 @@ bool InlineSkipList<Comparator>::KeyIsAfterNode(const DecodedKey& key,
 }
 
 template <class Comparator>
-typename InlineSkipList<Comparator>::Node*
-InlineSkipList<Comparator>::FindGreaterOrEqual(
-    const char* key, Node** const out_of_order_node) const {
+Status InlineSkipList<Comparator>::FindGreaterOrEqual(
+    const char* key, Node** node, bool allow_data_in_errors,
+    bool detect_key_out_of_order,
+    const std::function<Status(const char*, bool)>& key_validation_callback)
+    const {
   // Note: It looks like we could reduce duplication by implementing
   // this function as FindLessThan(key)->Next(0), but we wouldn't be able
   // to exit early on equality and the result wouldn't even be correct.
   // A concurrent insert might occur after FindLessThan(key) but before
   // we get a chance to call Next(0).
   Node* x = head_;
+  *node = nullptr;
   int level = GetMaxHeight() - 1;
   Node* last_bigger = nullptr;
   const DecodedKey key_decoded = compare_.decode_key(key);
@@ -543,10 +563,16 @@ InlineSkipList<Comparator>::FindGreaterOrEqual(
     Node* next = x->Next(level);
     if (next != nullptr) {
       PREFETCH(next->Next(level), 0, 1);
-      if (out_of_order_node && x != head_ &&
+      if (detect_key_out_of_order && x != head_ &&
           compare_(x->Key(), next->Key()) >= 0) {
-        *out_of_order_node = next;
-        return x;
+        return Corruption(x, next, allow_data_in_errors);
+      }
+      if (key_validation_callback != nullptr) {
+        auto status =
+            key_validation_callback(next->Key(), allow_data_in_errors);
+        if (!status.ok()) {
+          return status;
+        }
       }
     }
     // Make sure the lists are sorted
@@ -557,7 +583,8 @@ InlineSkipList<Comparator>::FindGreaterOrEqual(
                   ? 1
                   : compare_(next->Key(), key_decoded);
     if (cmp == 0 || (cmp > 0 && level == 0)) {
-      return next;
+      *node = next;
+      return Status::OK();
     } else if (cmp < 0) {
       // Keep searching in this list
       x = next;
@@ -1113,7 +1140,9 @@ bool InlineSkipList<Comparator>::Insert(const char* key, Splice* splice,
 
 template <class Comparator>
 bool InlineSkipList<Comparator>::Contains(const char* key) const {
-  Node* x = FindGreaterOrEqual(key, nullptr);
+  Node* x = nullptr;
+  auto status = FindGreaterOrEqual(key, &x, false, false, nullptr);
+  assert(status.ok());
   if (x != nullptr && Equal(key, x->Key())) {
     return true;
   } else {
diff --git a/memtable/skiplistrep.cc b/memtable/skiplistrep.cc
index 93d32e9fec6e..c83baeeefcb2 100644
--- a/memtable/skiplistrep.cc
+++ b/memtable/skiplistrep.cc
@@ -94,11 +94,14 @@ class SkipListRep : public MemTableRep {
 
   Status GetAndValidate(const LookupKey& k, void* callback_args,
                         bool (*callback_func)(void* arg, const char* entry),
-                        bool allow_data_in_errors) override {
+                        bool allow_data_in_errors, bool detect_key_out_of_order,
+                        const std::function<Status(const char*, bool)>&
+                            key_validation_callback) override {
     SkipListRep::Iterator iter(&skip_list_);
     Slice dummy_slice;
-    Status status = iter.SeekAndValidate(dummy_slice, k.memtable_key().data(),
-                                         allow_data_in_errors);
+    Status status = iter.SeekAndValidate(
+        dummy_slice, k.memtable_key().data(), allow_data_in_errors,
+        detect_key_out_of_order, key_validation_callback);
     for (; iter.Valid() && status.ok() &&
            callback_func(callback_args, iter.key());
          status = iter.NextAndValidate(allow_data_in_errors)) {
@@ -244,12 +247,18 @@ class SkipListRep : public MemTableRep {
     }
 
     Status SeekAndValidate(const Slice& user_key, const char* memtable_key,
-                           bool allow_data_in_errors) override {
+                           bool allow_data_in_errors,
+                           bool detect_key_out_of_order,
+                           const std::function<Status(const char*, bool)>&
+                               key_validation_callback) override {
       if (memtable_key != nullptr) {
-        return iter_.SeekAndValidate(memtable_key, allow_data_in_errors);
+        return iter_.SeekAndValidate(memtable_key, allow_data_in_errors,
+                                     detect_key_out_of_order,
+                                     key_validation_callback);
       } else {
-        return iter_.SeekAndValidate(EncodeKey(&tmp_, user_key),
-                                     allow_data_in_errors);
+        return iter_.SeekAndValidate(
+            EncodeKey(&tmp_, user_key), allow_data_in_errors,
+            detect_key_out_of_order, key_validation_callback);
       }
     }
 
diff --git a/memtable/vectorrep.cc b/memtable/vectorrep.cc
index fa9449c68cc5..738f89f79e9e 100644
--- a/memtable/vectorrep.cc
+++ b/memtable/vectorrep.cc
@@ -85,7 +85,10 @@ class VectorRep : public MemTableRep {
 
     // Seek and do some memory validation
     Status SeekAndValidate(const Slice& internal_key, const char* memtable_key,
-                           bool allow_data_in_errors) override;
+                           bool allow_data_in_errors,
+                           bool detect_key_out_of_order,
+                           const std::function<Status(const char*, bool)>&
+                               key_validation_callback) override;
 
     // Advance to the first entry with a key <= target
     void SeekForPrev(const Slice& user_key, const char* memtable_key) override;
@@ -266,9 +269,11 @@ void VectorRep::Iterator::Seek(const Slice& user_key,
              .first;
 }
 
-Status VectorRep::Iterator::SeekAndValidate(const Slice& /* internal_key */,
-                                            const char* /* memtable_key */,
-                                            bool /* allow_data_in_errors */) {
+Status VectorRep::Iterator::SeekAndValidate(
+    const Slice& /* internal_key */, const char* /* memtable_key */,
+    bool /* allow_data_in_errors */, bool /* detect_key_out_of_order */,
+    const std::function<Status(const char*, bool)>&
+    /* key_validation_callback */) {
   if (vrep_) {
     WriteLock l(&vrep_->rwlock_);
     if (bucket_->begin() == bucket_->end()) {
diff --git a/options/cf_options.cc b/options/cf_options.cc
index 14f14b7c7e10..475e0d7a4386 100644
--- a/options/cf_options.cc
+++ b/options/cf_options.cc
@@ -662,6 +662,11 @@ static std::unordered_map<std::string, OptionTypeInfo>
          {offsetof(struct MutableCFOptions, paranoid_memory_checks),
           OptionType::kBoolean, OptionVerificationType::kNormal,
           OptionTypeFlags::kMutable}},
+        {"memtable_veirfy_per_key_checksum_on_seek",
+         {offsetof(struct MutableCFOptions,
+                   memtable_veirfy_per_key_checksum_on_seek),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
         {kOptNameCompOpts,
          OptionTypeInfo::Struct(
              kOptNameCompOpts, &compression_options_type_info,
@@ -1178,6 +1183,8 @@ void MutableCFOptions::Dump(Logger* log) const {
                  preserve_internal_time_seconds);
   ROCKS_LOG_INFO(log, "                   paranoid_memory_checks: %d",
                  paranoid_memory_checks);
+  ROCKS_LOG_INFO(log, "memtable_veirfy_per_key_checksum_on_seek: %d",
+                 memtable_veirfy_per_key_checksum_on_seek);
   std::string result;
   char buf[10];
   for (const auto m : max_bytes_for_level_multiplier_additional) {
diff --git a/options/cf_options.h b/options/cf_options.h
index 6ac660854f28..815c60f54c52 100644
--- a/options/cf_options.h
+++ b/options/cf_options.h
@@ -170,6 +170,8 @@ struct MutableCFOptions {
             options.memtable_protection_bytes_per_key),
         block_protection_bytes_per_key(options.block_protection_bytes_per_key),
         paranoid_memory_checks(options.paranoid_memory_checks),
+        memtable_veirfy_per_key_checksum_on_seek(
+            options.memtable_veirfy_per_key_checksum_on_seek),
         sample_for_compression(
             options.sample_for_compression),  // TODO: is 0 fine here?
         compression_per_level(options.compression_per_level),
@@ -231,6 +233,7 @@ struct MutableCFOptions {
         memtable_protection_bytes_per_key(0),
         block_protection_bytes_per_key(0),
         paranoid_memory_checks(false),
+        memtable_veirfy_per_key_checksum_on_seek(false),
         sample_for_compression(0),
         memtable_max_range_deletions(0),
         bottommost_file_compaction_delay(0),
@@ -337,6 +340,7 @@ struct MutableCFOptions {
   uint32_t memtable_protection_bytes_per_key;
   uint8_t block_protection_bytes_per_key;
   bool paranoid_memory_checks;
+  bool memtable_veirfy_per_key_checksum_on_seek;
 
   uint64_t sample_for_compression;
   std::vector<CompressionType> compression_per_level;
diff --git a/options/options_helper.cc b/options/options_helper.cc
index 2f7a303929f3..09788a31e2cc 100644
--- a/options/options_helper.cc
+++ b/options/options_helper.cc
@@ -232,6 +232,8 @@ void UpdateColumnFamilyOptions(const MutableCFOptions& moptions,
   cf_opts->block_protection_bytes_per_key =
       moptions.block_protection_bytes_per_key;
   cf_opts->paranoid_memory_checks = moptions.paranoid_memory_checks;
+  cf_opts->memtable_veirfy_per_key_checksum_on_seek =
+      moptions.memtable_veirfy_per_key_checksum_on_seek;
   cf_opts->bottommost_file_compaction_delay =
       moptions.bottommost_file_compaction_delay;
 
diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc
index e4eba3fb6c50..9e8768b28026 100644
--- a/options/options_settable_test.cc
+++ b/options/options_settable_test.cc
@@ -682,6 +682,7 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) {
       "bottommost_file_compaction_delay=7200;"
       "uncache_aggressiveness=1234;"
       "paranoid_memory_checks=1;"
+      "memtable_veirfy_per_key_checksum_on_seek=1;"
       "memtable_op_scan_flush_trigger=123;"
       "memtable_avg_op_scan_flush_trigger=12;"
       "cf_allow_ingest_behind=1;",
diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc
index 26490510e8ff..fd07e1d1b63e 100644
--- a/tools/db_bench_tool.cc
+++ b/tools/db_bench_tool.cc
@@ -1281,6 +1281,9 @@ DEFINE_bool(
 DEFINE_bool(paranoid_memory_checks, false,
             "Sets CF option paranoid_memory_checks");
 
+DEFINE_bool(memtable_veirfy_per_key_checksum_on_seek, false,
+            "Sets CF option memtable_veirfy_per_key_checksum_on_seek");
+
 DEFINE_bool(
     auto_refresh_iterator_with_snapshot, false,
     "When set to true, RocksDB iterator will automatically refresh itself "
@@ -4850,6 +4853,8 @@ class Benchmark {
     options.block_protection_bytes_per_key =
         FLAGS_block_protection_bytes_per_key;
     options.paranoid_memory_checks = FLAGS_paranoid_memory_checks;
+    options.memtable_veirfy_per_key_checksum_on_seek =
+        FLAGS_memtable_veirfy_per_key_checksum_on_seek;
     options.memtable_op_scan_flush_trigger =
         FLAGS_memtable_op_scan_flush_trigger;
     options.compaction_options_universal.reduce_file_locking =
diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index cdba233e5156..cf71c9dcdf94 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -364,6 +364,7 @@ def setup_random_seed_before_main():
     "use_timed_put_one_in": lambda: random.choice([0] * 7 + [1, 5, 10]),
     "universal_max_read_amp": lambda: random.choice([-1] * 3 + [0, 4, 10]),
     "paranoid_memory_checks": lambda: random.choice([0] * 7 + [1]),
+    "memtable_veirfy_per_key_checksum_on_seek": lambda: random.choice([0] * 7 + [1]),
     "allow_unprepared_value": lambda: random.choice([0, 1]),
     # TODO(hx235): enable `track_and_verify_wals` after stabalizing the stress test
     "track_and_verify_wals": lambda: random.choice([0]),    
@@ -775,7 +776,11 @@ def finalize_and_sanitize(src_params):
 
     if dest_params.get("memtablerep") == "vector":
         dest_params["inplace_update_support"] = 0
+
+    # only skip list memtable representation supports paranoid memory checks
+    if dest_params.get("memtablerep") != "skip_list":
         dest_params["paranoid_memory_checks"] = 0
+        dest_params["memtable_veirfy_per_key_checksum_on_seek"] = 0
 
     if dest_params["test_batches_snapshots"] == 1:
         dest_params["enable_compaction_filter"] = 0
diff --git a/unreleased_history/new_features/improve_data_integrity_check_on_seek.md b/unreleased_history/new_features/improve_data_integrity_check_on_seek.md
new file mode 100644
index 000000000000..7b17c5dad1ad
--- /dev/null
+++ b/unreleased_history/new_features/improve_data_integrity_check_on_seek.md
@@ -0,0 +1 @@
+A new flag memtable_veirfy_per_key_checksum_on_seek is added to AdvancedColumnFamilyOptions. When it is enabled, it will validate key checksum along the binary search path on skiplist based memtable during seek operation.

From ef6fbe7ff97a8a6ae381c96cc53bb8ca85a86594 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Fri, 19 Sep 2025 01:55:06 -0700
Subject: [PATCH 289/500] Attempt fix initialization order dep on kPageSize
 (#13973)

Summary:
If there's a static initialization of Options() this could now instantiate an AutoHyperClockTable before kPageSize is initialized. Break the dependency because it's a very minor optimization.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13973

Test Plan: internal CI (not able to reproduce locally)

Reviewed By: hx235

Differential Revision: D82789849

Pulled By: pdillinger

fbshipit-source-id: 3f32b5779a4f56d2071be5aadacda2bf0f4b895d
---
 cache/clock_cache.cc | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/cache/clock_cache.cc b/cache/clock_cache.cc
index d65fd56495b3..91cd2d4c2148 100644
--- a/cache/clock_cache.cc
+++ b/cache/clock_cache.cc
@@ -1727,9 +1727,11 @@ inline uint64_t UsedLengthToLengthInfo(size_t used_length) {
 }
 
 inline size_t GetStartingLength(size_t capacity) {
-  if (capacity > port::kPageSize) {
+  // Avoid potential initialization order race with port::kPageSize
+  constexpr size_t kPresumedPageSize = 4096;
+  if (capacity > kPresumedPageSize) {
     // Start with one memory page
-    return port::kPageSize / sizeof(AutoHyperClockTable::HandleImpl);
+    return kPresumedPageSize / sizeof(AutoHyperClockTable::HandleImpl);
   } else {
     // Mostly to make unit tests happy
     return 4;

From e9fc03eed73db75c32a2b4bdcaf3965d4416cfb2 Mon Sep 17 00:00:00 2001
From: Pavel Tcholakov <pavel@tcholakov.net>
Date: Fri, 19 Sep 2025 09:52:15 -0700
Subject: [PATCH 290/500] Expose C bindings for Column Family export/import
 (#13874)

Summary:
This change adds FFI support for exporting column family checkpoints, basic access to the export/import files metadata, and creating column families by import.

I've been able to successfully use this to [add checkpoint export and import support to `rust-rocksdb`](https://github.com/pcholakov/rust-rocksdb/pull/2), a forked version of which has been successfully used in production for some time.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13874

Reviewed By: hx235

Differential Revision: D82343565

Pulled By: jaykorean

fbshipit-source-id: fb4182bdfd5cce10743c021a1ac636fd6ac48df3
---
 db/c.cc             | 188 ++++++++++++++++++++++++++++++++++++++++++++
 db/c_test.c         |  72 +++++++++++++++++
 include/rocksdb/c.h |  85 ++++++++++++++++++++
 3 files changed, 345 insertions(+)

diff --git a/db/c.cc b/db/c.cc
index 1da15274efac..177343b889ba 100644
--- a/db/c.cc
+++ b/db/c.cc
@@ -82,6 +82,7 @@ using ROCKSDB_NAMESPACE::DbPath;
 using ROCKSDB_NAMESPACE::Env;
 using ROCKSDB_NAMESPACE::EnvOptions;
 using ROCKSDB_NAMESPACE::EventListener;
+using ROCKSDB_NAMESPACE::ExportImportFilesMetaData;
 using ROCKSDB_NAMESPACE::ExternalFileIngestionInfo;
 using ROCKSDB_NAMESPACE::FileLock;
 using ROCKSDB_NAMESPACE::FilterPolicy;
@@ -89,6 +90,7 @@ using ROCKSDB_NAMESPACE::FlushJobInfo;
 using ROCKSDB_NAMESPACE::FlushOptions;
 using ROCKSDB_NAMESPACE::HistogramData;
 using ROCKSDB_NAMESPACE::HyperClockCacheOptions;
+using ROCKSDB_NAMESPACE::ImportColumnFamilyOptions;
 using ROCKSDB_NAMESPACE::InfoLogLevel;
 using ROCKSDB_NAMESPACE::IngestExternalFileOptions;
 using ROCKSDB_NAMESPACE::Iterator;
@@ -245,6 +247,9 @@ struct rocksdb_write_buffer_manager_t {
 struct rocksdb_sst_file_manager_t {
   std::shared_ptr<SstFileManager> rep;
 };
+struct rocksdb_livefile_t {
+  LiveFileMetaData rep;
+};
 struct rocksdb_livefiles_t {
   std::vector<LiveFileMetaData> rep;
 };
@@ -255,6 +260,12 @@ struct rocksdb_column_family_handle_t {
 struct rocksdb_column_family_metadata_t {
   ColumnFamilyMetaData rep;
 };
+struct rocksdb_export_import_files_metadata_t {
+  ExportImportFilesMetaData* rep;
+};
+struct rocksdb_import_column_family_options_t {
+  ImportColumnFamilyOptions rep;
+};
 struct rocksdb_level_metadata_t {
   const LevelMetaData* rep;
 };
@@ -947,6 +958,22 @@ void rocksdb_checkpoint_create(rocksdb_checkpoint_t* checkpoint,
                         std::string(checkpoint_dir), log_size_for_flush));
 }
 
+rocksdb_export_import_files_metadata_t* rocksdb_checkpoint_export_column_family(
+    rocksdb_checkpoint_t* checkpoint,
+    rocksdb_column_family_handle_t* column_family, const char* export_dir,
+    char** errptr) {
+  ExportImportFilesMetaData* metadata = nullptr;
+  if (SaveError(errptr,
+                checkpoint->rep->ExportColumnFamily(
+                    column_family->rep, std::string(export_dir), &metadata))) {
+    return nullptr;
+  }
+  rocksdb_export_import_files_metadata_t* result =
+      new rocksdb_export_import_files_metadata_t;
+  result->rep = metadata;
+  return result;
+}
+
 void rocksdb_checkpoint_object_destroy(rocksdb_checkpoint_t* checkpoint) {
   delete checkpoint->rep;
   delete checkpoint;
@@ -1190,6 +1217,26 @@ rocksdb_column_family_handle_t** rocksdb_create_column_families(
   return c_handles;
 }
 
+rocksdb_column_family_handle_t* rocksdb_create_column_family_with_import(
+    rocksdb_t* db, rocksdb_options_t* column_family_options,
+    const char* column_family_name,
+    rocksdb_import_column_family_options_t* import_options,
+    rocksdb_export_import_files_metadata_t* export_import_files_metadata,
+    char** errptr) {
+  rocksdb_column_family_handle_t* handle = new rocksdb_column_family_handle_t;
+  handle->rep = nullptr;
+  if (SaveError(errptr,
+                db->rep->CreateColumnFamilyWithImport(
+                    ColumnFamilyOptions(column_family_options->rep),
+                    std::string(column_family_name), import_options->rep,
+                    *(export_import_files_metadata->rep), &(handle->rep)))) {
+    delete handle;
+    return nullptr;
+  }
+  handle->immortal = false;
+  return handle;
+}
+
 void rocksdb_create_column_families_destroy(
     rocksdb_column_family_handle_t** list) {
   free(list);
@@ -6209,6 +6256,10 @@ void rocksdb_options_set_min_level_to_compress(rocksdb_options_t* opt,
   }
 }
 
+rocksdb_livefiles_t* rocksdb_livefiles_create() {
+  return new rocksdb_livefiles_t;
+}
+
 int rocksdb_livefiles_count(const rocksdb_livefiles_t* lf) {
   return static_cast<int>(lf->rep.size());
 }
@@ -6222,6 +6273,16 @@ const char* rocksdb_livefiles_name(const rocksdb_livefiles_t* lf, int index) {
   return lf->rep[index].name.c_str();
 }
 
+const char* rocksdb_livefiles_directory(const rocksdb_livefiles_t* lf,
+                                        int index) {
+  if (lf->rep[index].directory.empty()) {
+    // db_path is deprecated but still returned by some code paths
+    return lf->rep[index].db_path.c_str();
+  } else {
+    return lf->rep[index].directory.c_str();
+  }
+}
+
 int rocksdb_livefiles_level(const rocksdb_livefiles_t* lf, int index) {
   return lf->rep[index].level;
 }
@@ -6242,6 +6303,16 @@ const char* rocksdb_livefiles_largestkey(const rocksdb_livefiles_t* lf,
   return lf->rep[index].largestkey.data();
 }
 
+uint64_t rocksdb_livefiles_smallest_seqno(const rocksdb_livefiles_t* lf,
+                                          int index) {
+  return lf->rep[index].smallest_seqno;
+}
+
+uint64_t rocksdb_livefiles_largest_seqno(const rocksdb_livefiles_t* lf,
+                                         int index) {
+  return lf->rep[index].largest_seqno;
+}
+
 uint64_t rocksdb_livefiles_entries(const rocksdb_livefiles_t* lf, int index) {
   return lf->rep[index].num_entries;
 }
@@ -6252,6 +6323,71 @@ uint64_t rocksdb_livefiles_deletions(const rocksdb_livefiles_t* lf, int index) {
 
 void rocksdb_livefiles_destroy(const rocksdb_livefiles_t* lf) { delete lf; }
 
+rocksdb_livefile_t* rocksdb_livefile_create() { return new rocksdb_livefile_t; }
+
+void rocksdb_livefile_set_column_family_name(rocksdb_livefile_t* lf,
+                                             const char* column_family_name) {
+  lf->rep.column_family_name = std::string(column_family_name);
+}
+
+void rocksdb_livefile_set_level(rocksdb_livefile_t* lf, int level) {
+  lf->rep.level = level;
+}
+
+void rocksdb_livefile_set_name(rocksdb_livefile_t* lf, const char* name) {
+  lf->rep.name = std::string(name);
+}
+
+void rocksdb_livefile_set_directory(rocksdb_livefile_t* lf,
+                                    const char* directory) {
+  lf->rep.directory = std::string(directory);
+  lf->rep.db_path = std::string(directory);  // deprecated but still needed
+}
+
+void rocksdb_livefile_set_size(rocksdb_livefile_t* lf, size_t size) {
+  lf->rep.size = size;
+}
+
+void rocksdb_livefile_set_smallest_key(rocksdb_livefile_t* lf,
+                                       const char* smallest_key,
+                                       size_t smallest_key_len) {
+  lf->rep.smallestkey = std::string(smallest_key, smallest_key_len);
+}
+
+void rocksdb_livefile_set_largest_key(rocksdb_livefile_t* lf,
+                                      const char* largest_key,
+                                      size_t largest_key_len) {
+  lf->rep.largestkey = std::string(largest_key, largest_key_len);
+}
+
+void rocksdb_livefile_set_smallest_seqno(rocksdb_livefile_t* lf,
+                                         uint64_t smallest_seqno) {
+  lf->rep.smallest_seqno = smallest_seqno;
+}
+
+void rocksdb_livefile_set_largest_seqno(rocksdb_livefile_t* lf,
+                                        uint64_t largest_seqno) {
+  lf->rep.largest_seqno = largest_seqno;
+}
+
+void rocksdb_livefile_set_num_entries(rocksdb_livefile_t* lf,
+                                      uint64_t num_entries) {
+  lf->rep.num_entries = num_entries;
+}
+
+void rocksdb_livefile_set_num_deletions(rocksdb_livefile_t* lf,
+                                        uint64_t num_deletions) {
+  lf->rep.num_deletions = num_deletions;
+}
+
+void rocksdb_livefile_destroy(rocksdb_livefile_t* lf) { delete lf; }
+
+void rocksdb_livefiles_add(rocksdb_livefiles_t* lf,
+                           rocksdb_livefile_t* livefile) {
+  lf->rep.push_back(std::move(livefile->rep));
+  delete livefile;
+}
+
 void rocksdb_get_options_from_string(const rocksdb_options_t* base_options,
                                      const char* opts_str,
                                      rocksdb_options_t* new_options,
@@ -6402,6 +6538,58 @@ char* rocksdb_sst_file_metadata_get_largestkey(
   return CopyString(file_meta->rep->largestkey);
 }
 
+rocksdb_import_column_family_options_t*
+rocksdb_import_column_family_options_create() {
+  return new rocksdb_import_column_family_options_t;
+}
+
+void rocksdb_import_column_family_options_set_move_files(
+    rocksdb_import_column_family_options_t* opt, unsigned char v) {
+  opt->rep.move_files = v;
+}
+
+void rocksdb_import_column_family_options_destroy(
+    rocksdb_import_column_family_options_t* metadata) {
+  delete metadata;
+}
+
+rocksdb_export_import_files_metadata_t*
+rocksdb_export_import_files_metadata_create() {
+  auto metadata = new rocksdb_export_import_files_metadata_t;
+  metadata->rep = new ExportImportFilesMetaData;
+  return metadata;
+}
+
+char* rocksdb_export_import_files_metadata_get_db_comparator_name(
+    rocksdb_export_import_files_metadata_t* metadata) {
+  return strdup(metadata->rep->db_comparator_name.c_str());
+}
+
+void rocksdb_export_import_files_metadata_set_db_comparator_name(
+    rocksdb_export_import_files_metadata_t* metadata, const char* name) {
+  metadata->rep->db_comparator_name = std::string(name);
+}
+
+rocksdb_livefiles_t* rocksdb_export_import_files_metadata_get_files(
+    rocksdb_export_import_files_metadata_t* export_import_metadata) {
+  auto files = new rocksdb_livefiles_t;
+  files->rep = std::vector(export_import_metadata->rep->files);
+  return files;
+}
+
+void rocksdb_export_import_files_metadata_set_files(
+    rocksdb_export_import_files_metadata_t* metadata,
+    rocksdb_livefiles_t* files) {
+  metadata->rep->files = std::move(files->rep);
+  delete files;
+}
+
+void rocksdb_export_import_files_metadata_destroy(
+    rocksdb_export_import_files_metadata_t* metadata) {
+  delete metadata->rep;
+  delete metadata;
+}
+
 /* Transactions */
 
 rocksdb_transactiondb_options_t* rocksdb_transactiondb_options_create() {
diff --git a/db/c_test.c b/db/c_test.c
index 4e74651f4690..a06c8a74d2e7 100644
--- a/db/c_test.c
+++ b/db/c_test.c
@@ -1036,6 +1036,78 @@ int main(int argc, char** argv) {
     rocksdb_options_set_error_if_exists(options, 1);
   }
 
+  StartPhase("checkpoint_export_column_family");
+  {
+    static char cf_export_path[200];
+    static char db_import_path[200];
+    snprintf(cf_export_path, sizeof(cf_export_path),
+             "%s/rocksdb_c_test-%d-cf_export", GetTempDir(), ((int)geteuid()));
+    snprintf(db_import_path, sizeof(db_import_path),
+             "%s/rocksdb_c_test-%d-db_import", GetTempDir(), ((int)geteuid()));
+
+    rocksdb_options_t* db_options = rocksdb_options_create();
+    rocksdb_column_family_handle_t* cf_export =
+        rocksdb_create_column_family(db, db_options, "cf_export", &err);
+    CheckNoError(err);
+
+    rocksdb_put_cf(db, woptions, cf_export, "k1", 2, "v1", 2, &err);
+    CheckNoError(err);
+    rocksdb_put_cf(db, woptions, cf_export, "k2", 2, "v2", 2, &err);
+    CheckNoError(err);
+
+    rocksdb_checkpoint_t* checkpoint =
+        rocksdb_checkpoint_object_create(db, &err);
+    CheckNoError(err);
+
+    rocksdb_export_import_files_metadata_t* export_metadata =
+        rocksdb_checkpoint_export_column_family(checkpoint, cf_export,
+                                                cf_export_path, &err);
+    CheckNoError(err);
+    const char* comparator_name =
+        rocksdb_export_import_files_metadata_get_db_comparator_name(
+            export_metadata);
+    CheckEqual("leveldb.BytewiseComparator", comparator_name, 26);
+    rocksdb_free((void*)comparator_name);
+    rocksdb_checkpoint_object_destroy(checkpoint);
+    checkpoint = NULL;
+    rocksdb_drop_column_family(db, cf_export, &err);
+    CheckNoError(err);
+    rocksdb_column_family_handle_destroy(cf_export);
+    rocksdb_options_set_create_if_missing(db_options, 1);
+    rocksdb_options_set_error_if_exists(db_options, 1);
+    rocksdb_t* db_import = rocksdb_open(db_options, db_import_path, &err);
+    CheckNoError(err);
+    rocksdb_import_column_family_options_t* import_options =
+        rocksdb_import_column_family_options_create();
+    rocksdb_column_family_handle_t* cf_import =
+        rocksdb_create_column_family_with_import(db_import, db_options,
+                                                 "cf_import", import_options,
+                                                 export_metadata, &err);
+    CheckNoError(err);
+    rocksdb_import_column_family_options_destroy(import_options);
+    rocksdb_export_import_files_metadata_destroy(export_metadata);
+    size_t val_len;
+    char* val =
+        rocksdb_get_cf(db_import, roptions, cf_import, "k1", 2, &val_len, &err);
+    CheckNoError(err);
+    CheckEqual("v1", val, val_len);
+    free(val);
+
+    val =
+        rocksdb_get_cf(db_import, roptions, cf_import, "k2", 2, &val_len, &err);
+    CheckNoError(err);
+    CheckEqual("v2", val, val_len);
+    free(val);
+
+    rocksdb_column_family_handle_destroy(cf_import);
+    cf_import = NULL;
+    rocksdb_close(db_import);
+    rocksdb_destroy_db(db_options, db_import_path, &err);
+    CheckNoError(err);
+    rocksdb_options_destroy(db_options);
+    db_options = NULL;
+  }
+
   StartPhase("compactall");
   rocksdb_compact_range(db, NULL, 0, NULL, 0);
   CheckGet(db, roptions, "foo", "hello");
diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h
index 3f6d28e73541..e615c8da521c 100644
--- a/include/rocksdb/c.h
+++ b/include/rocksdb/c.h
@@ -113,10 +113,15 @@ typedef struct rocksdb_writebatch_wi_t rocksdb_writebatch_wi_t;
 typedef struct rocksdb_writeoptions_t rocksdb_writeoptions_t;
 typedef struct rocksdb_universal_compaction_options_t
     rocksdb_universal_compaction_options_t;
+typedef struct rocksdb_livefile_t rocksdb_livefile_t;
 typedef struct rocksdb_livefiles_t rocksdb_livefiles_t;
 typedef struct rocksdb_column_family_handle_t rocksdb_column_family_handle_t;
 typedef struct rocksdb_column_family_metadata_t
     rocksdb_column_family_metadata_t;
+typedef struct rocksdb_import_column_family_options_t
+    rocksdb_import_column_family_options_t;
+typedef struct rocksdb_export_import_files_metadata_t
+    rocksdb_export_import_files_metadata_t;
 typedef struct rocksdb_level_metadata_t rocksdb_level_metadata_t;
 typedef struct rocksdb_sst_file_metadata_t rocksdb_sst_file_metadata_t;
 typedef struct rocksdb_envoptions_t rocksdb_envoptions_t;
@@ -377,6 +382,12 @@ extern ROCKSDB_LIBRARY_API void rocksdb_checkpoint_create(
     rocksdb_checkpoint_t* checkpoint, const char* checkpoint_dir,
     uint64_t log_size_for_flush, char** errptr);
 
+extern ROCKSDB_LIBRARY_API rocksdb_export_import_files_metadata_t*
+rocksdb_checkpoint_export_column_family(
+    rocksdb_checkpoint_t* checkpoint,
+    rocksdb_column_family_handle_t* column_family, const char* export_dir,
+    char** errptr);
+
 extern ROCKSDB_LIBRARY_API void rocksdb_checkpoint_object_destroy(
     rocksdb_checkpoint_t* checkpoint);
 
@@ -437,6 +448,13 @@ rocksdb_create_column_families(rocksdb_t* db,
 extern ROCKSDB_LIBRARY_API void rocksdb_create_column_families_destroy(
     rocksdb_column_family_handle_t** list);
 
+extern ROCKSDB_LIBRARY_API rocksdb_column_family_handle_t*
+rocksdb_create_column_family_with_import(
+    rocksdb_t* db, rocksdb_options_t* column_family_options,
+    const char* column_family_name,
+    rocksdb_import_column_family_options_t* import_options,
+    rocksdb_export_import_files_metadata_t* metadata, char** errptr);
+
 extern ROCKSDB_LIBRARY_API rocksdb_column_family_handle_t*
 rocksdb_create_column_family_with_ttl(
     rocksdb_t* db, const rocksdb_options_t* column_family_options,
@@ -2681,12 +2699,16 @@ rocksdb_fifo_compaction_options_get_max_table_files_size(
 extern ROCKSDB_LIBRARY_API void rocksdb_fifo_compaction_options_destroy(
     rocksdb_fifo_compaction_options_t* fifo_opts);
 
+extern ROCKSDB_LIBRARY_API rocksdb_livefiles_t* rocksdb_livefiles_create(void);
+
 extern ROCKSDB_LIBRARY_API int rocksdb_livefiles_count(
     const rocksdb_livefiles_t*);
 extern ROCKSDB_LIBRARY_API const char* rocksdb_livefiles_column_family_name(
     const rocksdb_livefiles_t*, int index);
 extern ROCKSDB_LIBRARY_API const char* rocksdb_livefiles_name(
     const rocksdb_livefiles_t*, int index);
+extern ROCKSDB_LIBRARY_API const char* rocksdb_livefiles_directory(
+    const rocksdb_livefiles_t*, int index);
 extern ROCKSDB_LIBRARY_API int rocksdb_livefiles_level(
     const rocksdb_livefiles_t*, int index);
 extern ROCKSDB_LIBRARY_API size_t
@@ -2696,12 +2718,44 @@ extern ROCKSDB_LIBRARY_API const char* rocksdb_livefiles_smallestkey(
 extern ROCKSDB_LIBRARY_API const char* rocksdb_livefiles_largestkey(
     const rocksdb_livefiles_t*, int index, size_t* size);
 extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_livefiles_smallest_seqno(const rocksdb_livefiles_t*, int index);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_livefiles_largest_seqno(const rocksdb_livefiles_t*, int index);
+extern ROCKSDB_LIBRARY_API uint64_t
 rocksdb_livefiles_entries(const rocksdb_livefiles_t*, int index);
 extern ROCKSDB_LIBRARY_API uint64_t
 rocksdb_livefiles_deletions(const rocksdb_livefiles_t*, int index);
 extern ROCKSDB_LIBRARY_API void rocksdb_livefiles_destroy(
     const rocksdb_livefiles_t*);
 
+extern ROCKSDB_LIBRARY_API rocksdb_livefile_t* rocksdb_livefile_create(void);
+extern ROCKSDB_LIBRARY_API void rocksdb_livefile_set_column_family_name(
+    rocksdb_livefile_t*, const char*);
+extern ROCKSDB_LIBRARY_API void rocksdb_livefile_set_level(rocksdb_livefile_t*,
+                                                           int);
+extern ROCKSDB_LIBRARY_API void rocksdb_livefile_set_name(rocksdb_livefile_t*,
+                                                          const char*);
+extern ROCKSDB_LIBRARY_API void rocksdb_livefile_set_directory(
+    rocksdb_livefile_t*, const char*);
+extern ROCKSDB_LIBRARY_API void rocksdb_livefile_set_size(rocksdb_livefile_t*,
+                                                          size_t);
+extern ROCKSDB_LIBRARY_API void rocksdb_livefile_set_smallest_key(
+    rocksdb_livefile_t*, const char*, size_t);
+extern ROCKSDB_LIBRARY_API void rocksdb_livefile_set_largest_key(
+    rocksdb_livefile_t*, const char*, size_t);
+extern ROCKSDB_LIBRARY_API void rocksdb_livefile_set_smallest_seqno(
+    rocksdb_livefile_t*, uint64_t);
+extern ROCKSDB_LIBRARY_API void rocksdb_livefile_set_largest_seqno(
+    rocksdb_livefile_t*, uint64_t);
+extern ROCKSDB_LIBRARY_API void rocksdb_livefile_set_num_entries(
+    rocksdb_livefile_t*, uint64_t);
+extern ROCKSDB_LIBRARY_API void rocksdb_livefile_set_num_deletions(
+    rocksdb_livefile_t*, uint64_t);
+extern ROCKSDB_LIBRARY_API void rocksdb_livefile_destroy(rocksdb_livefile_t*);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_livefiles_add(rocksdb_livefiles_t*,
+                                                      rocksdb_livefile_t*);
+
 /* Utility Helpers */
 
 extern ROCKSDB_LIBRARY_API void rocksdb_get_options_from_string(
@@ -2722,6 +2776,37 @@ extern ROCKSDB_LIBRARY_API void rocksdb_delete_file_in_range_cf(
 extern ROCKSDB_LIBRARY_API rocksdb_column_family_metadata_t*
 rocksdb_get_column_family_metadata(rocksdb_t* db);
 
+extern ROCKSDB_LIBRARY_API rocksdb_import_column_family_options_t*
+rocksdb_import_column_family_options_create(void);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_import_column_family_options_set_move_files(
+    rocksdb_import_column_family_options_t*, unsigned char);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_import_column_family_options_destroy(
+    rocksdb_import_column_family_options_t*);
+
+extern ROCKSDB_LIBRARY_API rocksdb_export_import_files_metadata_t*
+rocksdb_export_import_files_metadata_create(void);
+
+extern ROCKSDB_LIBRARY_API char*
+rocksdb_export_import_files_metadata_get_db_comparator_name(
+    rocksdb_export_import_files_metadata_t*);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_export_import_files_metadata_set_db_comparator_name(
+    rocksdb_export_import_files_metadata_t*, const char*);
+
+extern ROCKSDB_LIBRARY_API rocksdb_livefiles_t*
+rocksdb_export_import_files_metadata_get_files(
+    rocksdb_export_import_files_metadata_t*);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_export_import_files_metadata_set_files(
+    rocksdb_export_import_files_metadata_t*, rocksdb_livefiles_t*);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_export_import_files_metadata_destroy(
+    rocksdb_export_import_files_metadata_t*);
+
 /**
  * Returns the rocksdb_column_family_metadata_t of the specified
  * column family.

From 798373975ccf7fa6bf5248c9b534726ab00cc9b1 Mon Sep 17 00:00:00 2001
From: Changyu Bi <changyubi@meta.com>
Date: Fri, 19 Sep 2025 10:21:38 -0700
Subject: [PATCH 291/500] Unpin skipped data blocks in MultiScan (#13972)

Summary:
Currently in MultiScan we only unpins a block after we scan through it. This PR adds unpinning during Seek to release all blocks pinned by the previous scan range. This is useful when users do not scan through the entire scan range. I plan to follow up with support for aborting async IOs from the previous scan.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13972

Test Plan: new test MultiScanUnpinPreviousBlocks validates unpinning behavior

Reviewed By: xingbowang

Differential Revision: D82779504

Pulled By: cbi42

fbshipit-source-id: 17ba7d1e5a6d8ff09ceea57b79c18febfba75584
---
 .../block_based/block_based_table_iterator.cc | 26 +++++++
 .../block_based/block_based_table_iterator.h  | 13 ++++
 .../block_based_table_reader_test.cc          | 78 +++++++++++++++++++
 3 files changed, 117 insertions(+)

diff --git a/table/block_based/block_based_table_iterator.cc b/table/block_based/block_based_table_iterator.cc
index e5bc833ee91c..f5f19b09ebd5 100644
--- a/table/block_based/block_based_table_iterator.cc
+++ b/table/block_based/block_based_table_iterator.cc
@@ -1047,6 +1047,10 @@ bool BlockBasedTableIterator::SeekMultiScan(const Slice* target) {
     // Unexpected seek key
     multi_scan_.reset();
   } else {
+    if (multi_scan_->next_scan_idx > 0) {
+      UnpinPreviousScanBlocks(multi_scan_->next_scan_idx);
+    }
+
     auto [cur_scan_start_idx, cur_scan_end_idx] =
         multi_scan_->block_index_ranges_per_scan[multi_scan_->next_scan_idx];
     // We should have the data block already loaded
@@ -1091,6 +1095,28 @@ bool BlockBasedTableIterator::SeekMultiScan(const Slice* target) {
   return false;
 }
 
+void BlockBasedTableIterator::UnpinPreviousScanBlocks(size_t current_scan_idx) {
+  // TODO: support aborting and clearn up async IO requests, currently
+  // only unpins already initialized blocks
+  assert(multi_scan_);
+  assert(current_scan_idx < multi_scan_->block_index_ranges_per_scan.size());
+  if (current_scan_idx == 0) return;
+
+  auto [prev_start_block_idx, prev_end_block_idx] =
+      multi_scan_->block_index_ranges_per_scan[current_scan_idx - 1];
+  // Since a block can be shared between consecutive scans, we need
+  // curr_start_block_idx here instead of just release blocks
+  // up to prev_end_block_idx.
+  auto [curr_start_block_idx, curr_end_block_idx] =
+      multi_scan_->block_index_ranges_per_scan[current_scan_idx];
+  for (size_t block_idx = prev_start_block_idx;
+       block_idx < curr_start_block_idx; ++block_idx) {
+    if (!multi_scan_->pinned_data_blocks[block_idx].IsEmpty()) {
+      multi_scan_->pinned_data_blocks[block_idx].Reset();
+    }
+  }
+}
+
 void BlockBasedTableIterator::FindBlockForwardInMultiScan() {
   assert(multi_scan_);
   assert(multi_scan_->next_scan_idx >= 1);
diff --git a/table/block_based/block_based_table_iterator.h b/table/block_based/block_based_table_iterator.h
index 0b1ad3348f2a..39fc2a1bef04 100644
--- a/table/block_based/block_based_table_iterator.h
+++ b/table/block_based/block_based_table_iterator.h
@@ -238,6 +238,16 @@ class BlockBasedTableIterator : public InternalIteratorBase<Slice> {
 
   std::unique_ptr<InternalIteratorBase<IndexValue>> index_iter_;
 
+  bool TEST_IsBlockPinnedByMultiScan(size_t block_idx) {
+    if (!multi_scan_) {
+      return false;
+    }
+    if (block_idx >= multi_scan_->pinned_data_blocks.size()) {
+      return false;
+    }
+    return !multi_scan_->pinned_data_blocks[block_idx].IsEmpty();
+  }
+
  private:
   enum class IterDirection {
     kForward,
@@ -594,6 +604,9 @@ class BlockBasedTableIterator : public InternalIteratorBase<Slice> {
 
   void FindBlockForwardInMultiScan();
 
+  // Unpins blocks from the immediately previous scan range.
+  void UnpinPreviousScanBlocks(size_t current_scan_idx);
+
   void PrepareReadAsyncCallBack(FSReadRequest& req, void* cb_arg) {
     // Record status, result and sanity check offset from `req`.
     AsyncReadState* async_state = static_cast<AsyncReadState*>(cb_arg);
diff --git a/table/block_based/block_based_table_reader_test.cc b/table/block_based/block_based_table_reader_test.cc
index 7a9dd81a4caa..1922ef8fbd4c 100644
--- a/table/block_based/block_based_table_reader_test.cc
+++ b/table/block_based/block_based_table_reader_test.cc
@@ -22,6 +22,7 @@
 #include "rocksdb/options.h"
 #include "table/block_based/block_based_table_builder.h"
 #include "table/block_based/block_based_table_factory.h"
+#include "table/block_based/block_based_table_iterator.h"
 #include "table/block_based/partitioned_index_iterator.h"
 #include "table/format.h"
 #include "test_util/testharness.h"
@@ -1431,6 +1432,83 @@ TEST_P(BlockBasedTableReaderTest, MultiScanPrefetchSizeLimit) {
   }
 }
 
+TEST_P(BlockBasedTableReaderTest, MultiScanUnpinPreviousBlocks) {
+  std::vector<std::pair<std::string, std::string>> kv =
+      BlockBasedTableReaderBaseTest::GenerateKVMap(
+          30 /* num_block */,
+          true /* mixed_with_human_readable_string_value */);
+  std::string table_name = "BlockBasedTableReaderTest_UnpinPreviousBlocks" +
+                           CompressionTypeToString(compression_type_);
+  ImmutableOptions ioptions(options_);
+  CreateTable(table_name, ioptions, compression_type_, kv,
+              compression_parallel_threads_, compression_dict_bytes_);
+
+  std::unique_ptr<BlockBasedTable> table;
+  FileOptions foptions;
+  foptions.use_direct_reads = use_direct_reads_;
+  InternalKeyComparator comparator(options_.comparator);
+  NewBlockBasedTableReader(foptions, ioptions, comparator, table_name, &table,
+                           true /* bool prefetch_index_and_filter_in_cache */,
+                           nullptr /* status */, persist_udt_);
+
+  ReadOptions read_opts;
+  std::unique_ptr<InternalIterator> iter;
+  iter.reset(table->NewIterator(
+      read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
+      /*skip_filters=*/false, TableReaderCaller::kUncategorized));
+
+  MultiScanArgs scan_options(BytewiseComparator());
+  // Range 1: block 0-4, Range 2: block 4-4, Range 3: block 5-15
+  scan_options.insert(ExtractUserKey(kv[0 * kEntriesPerBlock].first),
+                      ExtractUserKey(kv[5 * kEntriesPerBlock - 5].first));
+  scan_options.insert(ExtractUserKey(kv[5 * kEntriesPerBlock - 4].first),
+                      ExtractUserKey(kv[5 * kEntriesPerBlock - 3].first));
+  scan_options.insert(ExtractUserKey(kv[5 * kEntriesPerBlock - 2].first),
+                      ExtractUserKey(kv[15 * kEntriesPerBlock - 1].first));
+
+  iter->Prepare(&scan_options);
+  auto* bbiter = dynamic_cast<BlockBasedTableIterator*>(iter.get());
+  ASSERT_TRUE(bbiter);
+  for (int block = 0; block < 15; ++block) {
+    ASSERT_TRUE(bbiter->TEST_IsBlockPinnedByMultiScan(block)) << block;
+  }
+
+  // MultiScan require seeks to be called in scan_option order
+  iter->Seek(kv[0 * kEntriesPerBlock].first);
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_OK(iter->status());
+
+  // Seek to second range - should unpin blocks from first range
+  iter->Seek(kv[5 * kEntriesPerBlock - 4].first);
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_OK(iter->status());
+  ASSERT_EQ(iter->key(), kv[5 * kEntriesPerBlock - 4].first);
+  ASSERT_EQ(iter->value(), kv[5 * kEntriesPerBlock - 4].second);
+
+  // The last block (block 4) is shared with the second range, so
+  // it's not unpinned yet.
+  for (int block = 0; block < 4; ++block) {
+    ASSERT_FALSE(bbiter->TEST_IsBlockPinnedByMultiScan(block)) << block;
+  }
+  // Blocks from second range still in cache.
+  // We skip block 4 here since it's ownership is moved to the actual data
+  // block iter.
+  for (int block = 5; block < 15; ++block) {
+    ASSERT_TRUE(bbiter->TEST_IsBlockPinnedByMultiScan(block)) << block;
+  }
+
+  iter->Seek(kv[5 * kEntriesPerBlock - 2].first);
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_OK(iter->status());
+  ASSERT_EQ(iter->key(), kv[5 * kEntriesPerBlock - 2].first);
+  ASSERT_EQ(iter->value(), kv[5 * kEntriesPerBlock - 2].second);
+
+  // Still pinned
+  for (int block = 5; block < 15; ++block) {
+    ASSERT_TRUE(bbiter->TEST_IsBlockPinnedByMultiScan(block)) << block;
+  }
+}
+
 // Param 1: compression type
 // Param 2: whether to use direct reads
 // Param 3: Block Based Table Index type, partitioned filters are also enabled

From 19c8d1b7ed5f1a735d605d6161b2f55f9c9386af Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Fri, 19 Sep 2025 10:56:50 -0700
Subject: [PATCH 292/500] (Re-)fix initialization order dep on kPageSize
 (#13976)

Summary:
Pull Request resolved: https://github.com/facebook/rocksdb/pull/13976

Missed an occurrence of kPageSize in the last PR
https://github.com/facebook/rocksdb/pull/13973

Reviewed By: mszeszko-meta

Differential Revision: D82826713

fbshipit-source-id: b112cd7c94b7d6604623ee80274b2b25911245eb
---
 cache/clock_cache.cc | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/cache/clock_cache.cc b/cache/clock_cache.cc
index 91cd2d4c2148..a14a15ece855 100644
--- a/cache/clock_cache.cc
+++ b/cache/clock_cache.cc
@@ -1726,9 +1726,10 @@ inline uint64_t UsedLengthToLengthInfo(size_t used_length) {
   return length_info;
 }
 
+// Avoid potential initialization order race with port::kPageSize
+constexpr size_t kPresumedPageSize = 4096;
+
 inline size_t GetStartingLength(size_t capacity) {
-  // Avoid potential initialization order race with port::kPageSize
-  constexpr size_t kPresumedPageSize = 4096;
   if (capacity > kPresumedPageSize) {
     // Start with one memory page
     return kPresumedPageSize / sizeof(AutoHyperClockTable::HandleImpl);
@@ -3590,7 +3591,7 @@ size_t AutoHyperClockTable::CalcMaxUsableLength(
   size_t num_slots =
       static_cast<size_t>(capacity / min_avg_slot_charge + 0.999999);
 
-  const size_t slots_per_page = port::kPageSize / sizeof(HandleImpl);
+  const size_t slots_per_page = kPresumedPageSize / sizeof(HandleImpl);
 
   // Round up to page size
   return ((num_slots + slots_per_page - 1) / slots_per_page) * slots_per_page;

From fa3e61cce23afde91ee3c2fc5e42f223c44876d1 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Fri, 19 Sep 2025 13:52:05 -0700
Subject: [PATCH 293/500] Improve sst_dump --command=recompress (#13977)

Summary:
* There was a bug where the compression manager would actually not be used for recompress because the options passed to SstFileDumper were not respected. That is now fixed by respecting the Options.
* Refactored SstFileDumper not to take explicit options that could naturally be embedded in Options.
* Report compressed and uncompressed data block sizes (and ratio) instead of total file size (without a useful ratio). Needed to add a new table property to support that.
* Allow --block_size instead of --set_block_size to be consistent with other tools
* Allow --compression_level as shorthand for both _from and _to options, for simplicity and consistency with other tools
* Support --compression_parallel_threads option

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13977

Test Plan:
* sst_dump manual testing
* TableProperties unit tests updated
* Made it much easier to detect when a functional change requires an update to ParseTablePropertiesString() (rather than causing cryptic downstream failures)

Reviewed By: cbi42

Differential Revision: D82841412

Pulled By: pdillinger

fbshipit-source-id: 8d3421be4d2a3e25b7590cd59d204a3779c2a928
---
 db/db_test_util.h                             | 31 ++++----
 include/rocksdb/table_properties.h            |  2 +
 options/options_settable_test.cc              |  4 +-
 .../block_based/block_based_table_builder.cc  |  2 +
 table/sst_file_dumper.cc                      | 79 ++++++++++---------
 table/sst_file_dumper.h                       | 12 +--
 table/table_properties.cc                     |  8 ++
 tools/sst_dump_tool.cc                        | 72 ++++++++++++++---
 .../new_features/sst_dump_recompress.md       |  1 +
 9 files changed, 138 insertions(+), 73 deletions(-)
 create mode 100644 unreleased_history/new_features/sst_dump_recompress.md

diff --git a/db/db_test_util.h b/db/db_test_util.h
index 168a6ebf0a07..d93d68532317 100644
--- a/db/db_test_util.h
+++ b/db/db_test_util.h
@@ -1438,20 +1438,23 @@ class DBTestBase : public testing::Test {
     std::replace(tp_string.begin(), tp_string.end(), ';', ' ');
     std::replace(tp_string.begin(), tp_string.end(), '=', ' ');
     ResetTableProperties(tp);
-    sscanf(tp_string.c_str(),
-           "# data blocks %" SCNu64 " # entries %" SCNu64
-           " # deletions %" SCNu64 " # merge operands %" SCNu64
-           " # range deletions %" SCNu64 " raw key size %" SCNu64
-           " raw average key size %lf "
-           " raw value size %" SCNu64
-           " raw average value size %lf "
-           " data block size %" SCNu64 " index block size (user-key? %" SCNu64
-           ", delta-value? %" SCNu64 ") %" SCNu64 " filter block size %" SCNu64,
-           &tp->num_data_blocks, &tp->num_entries, &tp->num_deletions,
-           &tp->num_merge_operands, &tp->num_range_deletions, &tp->raw_key_size,
-           &dummy_double, &tp->raw_value_size, &dummy_double, &tp->data_size,
-           &tp->index_key_is_user_key, &tp->index_value_is_delta_encoded,
-           &tp->index_size, &tp->filter_size);
+    int count = sscanf(
+        tp_string.c_str(),
+        "# data blocks %" SCNu64 " # entries %" SCNu64 " # deletions %" SCNu64
+        " # merge operands %" SCNu64 " # range deletions %" SCNu64
+        " raw key size %" SCNu64
+        " raw average key size %lf "
+        " raw value size %" SCNu64
+        " raw average value size %lf "
+        " data block size %" SCNu64 " data uncompressed size %" SCNu64
+        " index block size (user-key? %" SCNu64 ", delta-value? %" SCNu64
+        ") %" SCNu64 " filter block size %" SCNu64,
+        &tp->num_data_blocks, &tp->num_entries, &tp->num_deletions,
+        &tp->num_merge_operands, &tp->num_range_deletions, &tp->raw_key_size,
+        &dummy_double, &tp->raw_value_size, &dummy_double, &tp->data_size,
+        &tp->uncompressed_data_size, &tp->index_key_is_user_key,
+        &tp->index_value_is_delta_encoded, &tp->index_size, &tp->filter_size);
+    ASSERT_EQ(count, 15);
   }
 
  private:  // Prone to error on direct use
diff --git a/include/rocksdb/table_properties.h b/include/rocksdb/table_properties.h
index c47746a17d24..6bac922761f9 100644
--- a/include/rocksdb/table_properties.h
+++ b/include/rocksdb/table_properties.h
@@ -221,6 +221,8 @@ struct TableProperties {
   uint64_t orig_file_number = 0;
   // the total size of all data blocks.
   uint64_t data_size = 0;
+  // the total uncompressed size of all data blocks (since RocksDB 10.7)
+  uint64_t uncompressed_data_size = 0;
   // the size of index block.
   uint64_t index_size = 0;
   // Total number of index partitions if kTwoLevelIndexSearch is used
diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc
index 9e8768b28026..cacb1d2be316 100644
--- a/options/options_settable_test.cc
+++ b/options/options_settable_test.cc
@@ -275,8 +275,8 @@ TEST_F(OptionsSettableTest, TablePropertiesAllFieldsSettable) {
       "property_collectors_names=;prefix_extractor_name=;db_host_id="
       "64625F686F73745F6964;db_session_id=64625F73657373696F6E5F6964;creation_"
       "time=0;num_data_blocks=123;index_value_is_delta_encoded=0;top_level_"
-      "index_"
-      "size=0;data_size=100;merge_operator_name=;index_partitions=0;file_"
+      "index_size=0;data_size=100;uncompressed_data_size=1234;"
+      "merge_operator_name=;index_partitions=0;file_"
       "creation_time=0;raw_value_size=0;index_size=200;user_collected_"
       "properties={757365725F6B6579=757365725F76616C7565;};tail_start_offset=0;"
       "seqno_to_time_mapping=;raw_key_size=0;slow_compression_estimated_data_"
diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc
index 68fd4aab3648..620cf5fe8d7c 100644
--- a/table/block_based/block_based_table_builder.cc
+++ b/table/block_based/block_based_table_builder.cc
@@ -1729,6 +1729,7 @@ void BlockBasedTableBuilder::WriteBlock(const Slice& uncompressed_block_data,
   r->single_threaded_compressed_output.Reset();
   if (is_data_block) {
     r->props.data_size = r->get_offset();
+    r->props.uncompressed_data_size += uncompressed_block_data.size();
     ++r->props.num_data_blocks;
   }
 }
@@ -1776,6 +1777,7 @@ void BlockBasedTableBuilder::BGWorker(WorkingAreaPair& working_area) {
           &uncompressed);
       if (LIKELY(ios.ok())) {
         rep_->props.data_size = rep_->get_offset();
+        rep_->props.uncompressed_data_size += block_rep->uncompressed.size();
         ++rep_->props.num_data_blocks;
 
         rep_->index_builder->FinishIndexEntry(
diff --git a/table/sst_file_dumper.cc b/table/sst_file_dumper.cc
index 80ac41367db2..b095073b8f37 100644
--- a/table/sst_file_dumper.cc
+++ b/table/sst_file_dumper.cc
@@ -231,8 +231,7 @@ Status SstFileDumper::DumpTable(const std::string& out_filename) {
 }
 
 Status SstFileDumper::CalculateCompressedTableSize(
-    const TableBuilderOptions& tb_options, size_t block_size,
-    uint64_t* num_data_blocks, uint64_t* compressed_table_size) {
+    const TableBuilderOptions& tb_options, TableProperties* props) {
   std::unique_ptr<Env> env(NewMemEnv(options_.env));
   std::unique_ptr<WritableFileWriter> dest_writer;
   Status s =
@@ -241,12 +240,9 @@ Status SstFileDumper::CalculateCompressedTableSize(
   if (!s.ok()) {
     return s;
   }
-  BlockBasedTableOptions table_options;
-  table_options.block_size = block_size;
-  BlockBasedTableFactory block_based_tf(table_options);
-  std::unique_ptr<TableBuilder> table_builder;
-  table_builder.reset(
-      block_based_tf.NewTableBuilder(tb_options, dest_writer.get()));
+  std::unique_ptr<TableBuilder> table_builder{
+      tb_options.moptions.table_factory->NewTableBuilder(tb_options,
+                                                         dest_writer.get())};
   std::unique_ptr<InternalIterator> iter(table_reader_->NewIterator(
       read_options_, moptions_.prefix_extractor.get(), /*arena=*/nullptr,
       /*skip_filters=*/false, TableReaderCaller::kSSTDumpTool));
@@ -261,18 +257,13 @@ Status SstFileDumper::CalculateCompressedTableSize(
   if (!s.ok()) {
     return s;
   }
-  *compressed_table_size = table_builder->FileSize();
-  assert(num_data_blocks != nullptr);
-  *num_data_blocks = table_builder->GetTableProperties().num_data_blocks;
+  *props = table_builder->GetTableProperties();
   return env->DeleteFile(testFileName);
 }
 
 Status SstFileDumper::ShowAllCompressionSizes(
-    size_t block_size, const std::vector<CompressionType>& compression_types,
-    int32_t compress_level_from, int32_t compress_level_to,
-    uint32_t max_dict_bytes, uint32_t zstd_max_train_bytes,
-    uint64_t max_dict_buffer_bytes, bool use_zstd_dict_trainer) {
-  fprintf(stdout, "Block Size: %" ROCKSDB_PRIszt "\n", block_size);
+    const std::vector<CompressionType>& compression_types,
+    int32_t compress_level_from, int32_t compress_level_to) {
   for (CompressionType ctype : compression_types) {
     std::string cname;
     if (!GetStringFromCompressionType(&cname, ctype).ok()) {
@@ -283,15 +274,11 @@ Status SstFileDumper::ShowAllCompressionSizes(
             ? options_.compression_manager->SupportsCompressionType(ctype)
             : CompressionTypeSupported(ctype)) {
       fprintf(stdout, "Compression: %-24s\n", cname.c_str());
-      CompressionOptions compress_opt;
-      compress_opt.max_dict_bytes = max_dict_bytes;
-      compress_opt.zstd_max_train_bytes = zstd_max_train_bytes;
-      compress_opt.max_dict_buffer_bytes = max_dict_buffer_bytes;
-      compress_opt.use_zstd_dict_trainer = use_zstd_dict_trainer;
+      CompressionOptions compress_opt = options_.compression_opts;
       for (int32_t j = compress_level_from; j <= compress_level_to; j++) {
         fprintf(stdout, "Compression level: %d", j);
         compress_opt.level = j;
-        Status s = ShowCompressionSize(block_size, ctype, compress_opt);
+        Status s = ShowCompressionSize(ctype, compress_opt);
         if (!s.ok()) {
           return s;
         }
@@ -304,14 +291,20 @@ Status SstFileDumper::ShowAllCompressionSizes(
 }
 
 Status SstFileDumper::ShowCompressionSize(
-    size_t block_size, CompressionType compress_type,
-    const CompressionOptions& compress_opt) {
-  Options opts;
+    CompressionType compress_type, const CompressionOptions& compress_opt) {
+  Options opts = options_;  // Use compression_manager etc.
   opts.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
   opts.statistics->set_stats_level(StatsLevel::kAll);
+  if (!opts.table_factory->IsInstanceOf(TableFactory::kBlockBasedTableName())) {
+    // Currently need block-based table for compression
+    opts.table_factory = std::make_shared<BlockBasedTableFactory>();
+  }
+
+  // Create internal Options types
   const ImmutableOptions imoptions(opts);
   const ColumnFamilyOptions cfo(opts);
   const MutableCFOptions moptions(cfo);
+
   // TODO: plumb Env::IOActivity, Env::IOPriority
   const ReadOptions read_options;
   const WriteOptions write_options;
@@ -326,20 +319,24 @@ Status SstFileDumper::ShowCompressionSize(
       &block_based_table_factories, compress_type, compress_opt,
       TablePropertiesCollectorFactory::Context::kUnknownColumnFamily,
       column_family_name, unknown_level, kUnknownNewestKeyTime);
-  uint64_t num_data_blocks = 0;
+  TableProperties props;
   std::chrono::steady_clock::time_point start =
       std::chrono::steady_clock::now();
-  uint64_t file_size;
-  Status s = CalculateCompressedTableSize(tb_opts, block_size, &num_data_blocks,
-                                          &file_size);
+  Status s = CalculateCompressedTableSize(tb_opts, &props);
   if (!s.ok()) {
     return s;
   }
 
+  uint64_t num_data_blocks = props.num_data_blocks;
+
   std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now();
-  fprintf(stdout, " Size: %10" PRIu64, file_size);
-  fprintf(stdout, " Blocks: %6" PRIu64, num_data_blocks);
-  fprintf(stdout, " Time Taken: %10s microsecs",
+  fprintf(stdout, " Comp size: %10" PRIu64, props.data_size);
+  fprintf(stdout, " Uncompressed: %10" PRIu64, props.uncompressed_data_size);
+  fprintf(stdout, " Ratio: %10s",
+          std::to_string(static_cast<double>(props.uncompressed_data_size) /
+                         static_cast<double>(props.data_size))
+              .c_str());
+  fprintf(stdout, " Microsecs: %10s ",
           std::to_string(
               std::chrono::duration_cast<std::chrono::microseconds>(end - start)
                   .count())
@@ -373,7 +370,7 @@ Status SstFileDumper::ShowCompressionSize(
                              : ((static_cast<double>(not_compressed_blocks) /
                                  static_cast<double>(num_data_blocks)) *
                                 100.0);
-  fprintf(stdout, " Compressed: %6" PRIu64 " (%5.1f%%)", compressed_blocks,
+  fprintf(stdout, " Comp count: %6" PRIu64 " (%5.1f%%)", compressed_blocks,
           compressed_pcnt);
   fprintf(stdout, " Not compressed (ratio): %6" PRIu64 " (%5.1f%%)",
           ratio_not_compressed_blocks, ratio_not_compressed_pcnt);
@@ -405,14 +402,21 @@ Status SstFileDumper::SetTableOptionsByMagicNumber(
   assert(table_properties_);
   if (table_magic_number == kBlockBasedTableMagicNumber ||
       table_magic_number == kLegacyBlockBasedTableMagicNumber) {
-    BlockBasedTableFactory* bbtf = new BlockBasedTableFactory();
+    // Preserve BlockBasedTableOptions on options_ when possible
+    if (!options_.table_factory->IsInstanceOf(
+            TableFactory::kBlockBasedTableName())) {
+      options_.table_factory = std::make_shared<BlockBasedTableFactory>();
+    }
+
+    BlockBasedTableFactory* bbtf =
+        static_cast_with_check<BlockBasedTableFactory>(
+            options_.table_factory.get());
     // To force tail prefetching, we fake reporting two useful reads of 512KB
     // from the tail.
     // It needs at least two data points to warm up the stats.
     bbtf->tail_prefetch_stats()->RecordEffectiveSize(512 * 1024);
     bbtf->tail_prefetch_stats()->RecordEffectiveSize(512 * 1024);
 
-    options_.table_factory.reset(bbtf);
     if (!silent_) {
       fprintf(stdout, "Sst file format: block-based\n");
     }
@@ -464,7 +468,10 @@ Status SstFileDumper::SetTableOptionsByMagicNumber(
 
 Status SstFileDumper::SetOldTableOptions() {
   assert(table_properties_ == nullptr);
-  options_.table_factory = std::make_shared<BlockBasedTableFactory>();
+  if (!options_.table_factory->IsInstanceOf(
+          TableFactory::kBlockBasedTableName())) {
+    options_.table_factory = std::make_shared<BlockBasedTableFactory>();
+  }
   if (!silent_) {
     fprintf(stdout, "Sst file format: block-based(old version)\n");
   }
diff --git a/table/sst_file_dumper.h b/table/sst_file_dumper.h
index 2cceec407439..22b1e860b4ee 100644
--- a/table/sst_file_dumper.h
+++ b/table/sst_file_dumper.h
@@ -43,12 +43,10 @@ class SstFileDumper {
   Status getStatus() { return init_result_; }
 
   Status ShowAllCompressionSizes(
-      size_t block_size, const std::vector<CompressionType>& compression_types,
-      int32_t compress_level_from, int32_t compress_level_to,
-      uint32_t max_dict_bytes, uint32_t zstd_max_train_bytes,
-      uint64_t max_dict_buffer_bytes, bool use_zstd_dict_trainer);
+      const std::vector<CompressionType>& compression_types,
+      int32_t compress_level_from, int32_t compress_level_to);
 
-  Status ShowCompressionSize(size_t block_size, CompressionType compress_type,
+  Status ShowCompressionSize(CompressionType compress_type,
                              const CompressionOptions& compress_opt);
 
   BlockContents& GetMetaIndexContents() { return meta_index_contents_; }
@@ -61,9 +59,7 @@ class SstFileDumper {
                              FilePrefetchBuffer* prefetch_buffer);
 
   Status CalculateCompressedTableSize(const TableBuilderOptions& tb_options,
-                                      size_t block_size,
-                                      uint64_t* num_data_blocks,
-                                      uint64_t* compressed_table_size);
+                                      TableProperties* props);
 
   Status SetTableOptionsByMagicNumber(uint64_t table_magic_number);
   Status SetOldTableOptions();
diff --git a/table/table_properties.cc b/table/table_properties.cc
index d5a654676d7b..48886c873fb7 100644
--- a/table/table_properties.cc
+++ b/table/table_properties.cc
@@ -65,6 +65,8 @@ std::string TableProperties::ToString(const std::string& prop_delim,
                  prop_delim, kv_delim);
 
   AppendProperty(result, "data block size", data_size, prop_delim, kv_delim);
+  AppendProperty(result, "data uncompressed size", uncompressed_data_size,
+                 prop_delim, kv_delim);
   char index_block_size_str[80];
   snprintf(index_block_size_str, sizeof(index_block_size_str),
            "index block size (user-key? %d, delta-value? %d)",
@@ -180,6 +182,7 @@ std::string TableProperties::ToString(const std::string& prop_delim,
 
 void TableProperties::Add(const TableProperties& tp) {
   data_size += tp.data_size;
+  uncompressed_data_size += tp.uncompressed_data_size;
   index_size += tp.index_size;
   index_partitions += tp.index_partitions;
   top_level_index_size += tp.top_level_index_size;
@@ -204,6 +207,7 @@ std::map<std::string, uint64_t>
 TableProperties::GetAggregatablePropertiesAsMap() const {
   std::map<std::string, uint64_t> rv;
   rv["data_size"] = data_size;
+  rv["uncompressed_data_size"] = uncompressed_data_size;
   rv["index_size"] = index_size;
   rv["index_partitions"] = index_partitions;
   rv["top_level_index_size"] = top_level_index_size;
@@ -334,6 +338,10 @@ static std::unordered_map<std::string, OptionTypeInfo>
         {"data_size",
          {offsetof(struct TableProperties, data_size), OptionType::kUInt64T,
           OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+        {"uncompressed_data_size",
+         {offsetof(struct TableProperties, uncompressed_data_size),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kNone}},
         {"index_size",
          {offsetof(struct TableProperties, index_size), OptionType::kUInt64T,
           OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
diff --git a/tools/sst_dump_tool.cc b/tools/sst_dump_tool.cc
index f81ee8e56b4e..f8f4d28ae431 100644
--- a/tools/sst_dump_tool.cc
+++ b/tools/sst_dump_tool.cc
@@ -15,6 +15,7 @@
 #include "rocksdb/convenience.h"
 #include "rocksdb/utilities/ldb_cmd.h"
 #include "table/block_based/block.h"
+#include "table/block_based/block_based_table_factory.h"
 #include "table/sst_file_dumper.h"
 
 namespace ROCKSDB_NAMESPACE {
@@ -84,7 +85,7 @@ void print_help(bool to_stderr) {
       Print table properties after iterating over the file when executing
       check|scan|raw|identify
 
-    --set_block_size=<block_size>
+    --block_size=<block_size>
       Can be combined with --command=recompress to set the block size that will
       be used when trying different compression algorithms
 
@@ -103,6 +104,9 @@ void print_help(bool to_stderr) {
       Convenience option to parse an internal key on the command line. Dumps the
       internal key in hex format {'key' @ SN: type}
 
+    --compression_level=<compression_level>
+      Sets both --compression_level_from= and --compression_level_to=
+
     --compression_level_from=<compression_level>
       Compression level to start compressing when executing recompress. One compression type
       and compression_level_to must also be specified
@@ -111,18 +115,21 @@ void print_help(bool to_stderr) {
       Compression level to stop compressing when executing recompress. One compression type
       and compression_level_from must also be specified
 
+    --compression_max_dict_buffer_bytes=<int64_t>
+      Limit on buffer size from which we collect samples for dictionary generation.
+
     --compression_max_dict_bytes=<uint32_t>
       Maximum size of dictionary used to prime the compression library
 
-    --compression_zstd_max_train_bytes=<uint32_t>
-      Maximum size of training data passed to zstd's dictionary trainer
-
-    --compression_max_dict_buffer_bytes=<int64_t>
-      Limit on buffer size from which we collect samples for dictionary generation.
+    --compression_parallel_threads=<uint32_t>
+      Number of parallel threads to use with --command=recompress
 
     --compression_use_zstd_finalize_dict
       Use zstd's finalizeDictionary() API instead of zstd's dictionary trainer to generate dictionary.
 
+    --compression_zstd_max_train_bytes=<uint32_t>
+      Maximum size of training data passed to zstd's dictionary trainer
+
     --list_meta_blocks
       Print the list of all meta blocks in the file
 )",
@@ -167,7 +174,6 @@ int SSTDumpTool::Run(int argc, char const* const* argv, Options options) {
   bool show_properties = false;
   bool show_summary = false;
   bool list_meta_blocks = false;
-  bool set_block_size = false;
   bool has_compression_level_from = false;
   bool has_compression_level_to = false;
   bool has_specified_compression_types = false;
@@ -176,7 +182,7 @@ int SSTDumpTool::Run(int argc, char const* const* argv, Options options) {
   std::string block_size_str;
   std::string compression_level_from_str;
   std::string compression_level_to_str;
-  size_t block_size = 0;
+  size_t block_size = 16384;  // A popular choice for default
   size_t readahead_size = 2 * 1024 * 1024;
   std::vector<CompressionType> compression_types;
   std::shared_ptr<CompressionManager> compression_manager;
@@ -195,6 +201,7 @@ int SSTDumpTool::Run(int argc, char const* const* argv, Options options) {
       ROCKSDB_NAMESPACE::CompressionOptions().max_dict_buffer_bytes;
   bool compression_use_zstd_finalize_dict =
       !ROCKSDB_NAMESPACE::CompressionOptions().use_zstd_dict_trainer;
+  uint32_t compression_parallel_threads = 1;
 
   int64_t tmp_val;
 
@@ -235,8 +242,9 @@ int SSTDumpTool::Run(int argc, char const* const* argv, Options options) {
     } else if (strcmp(argv[i], "--show_summary") == 0) {
       show_summary = true;
     } else if (ParseIntArg(argv[i], "--set_block_size=",
+                           "block size must be numeric", &tmp_val) ||
+               ParseIntArg(argv[i], "--block_size=",
                            "block size must be numeric", &tmp_val)) {
-      set_block_size = true;
       block_size = static_cast<size_t>(tmp_val);
     } else if (ParseIntArg(argv[i], "--readahead_size=",
                            "readahead_size must be numeric", &tmp_val)) {
@@ -297,6 +305,12 @@ int SSTDumpTool::Run(int argc, char const* const* argv, Options options) {
       }
       fprintf(stdout, "key=%s\n", ikey.DebugString(true, true).c_str());
       return retc;
+    } else if (ParseIntArg(argv[i], "--compression_level=",
+                           "compression_level must be numeric", &tmp_val)) {
+      has_compression_level_from = true;
+      has_compression_level_to = true;
+      compress_level_from = static_cast<int>(tmp_val);
+      compress_level_to = static_cast<int>(tmp_val);
     } else if (ParseIntArg(argv[i], "--compression_level_from=",
                            "compression_level_from must be numeric",
                            &tmp_val)) {
@@ -316,6 +330,16 @@ int SSTDumpTool::Run(int argc, char const* const* argv, Options options) {
         return 1;
       }
       compression_max_dict_bytes = static_cast<uint32_t>(tmp_val);
+    } else if (ParseIntArg(argv[i], "--compression_parallel_threads=",
+                           "compression_parallel_threads must be numeric",
+                           &tmp_val)) {
+      if (tmp_val < 0 || tmp_val > 100) {
+        fprintf(stderr, "compression_parallel_threads out of range: '%s'\n",
+                argv[i]);
+        print_help(/*to_stderr*/ true);
+        return 1;
+      }
+      compression_parallel_threads = static_cast<uint32_t>(tmp_val);
     } else if (ParseIntArg(argv[i], "--compression_zstd_max_train_bytes=",
                            "compression_zstd_max_train_bytes must be numeric",
                            &tmp_val)) {
@@ -448,9 +472,32 @@ int SSTDumpTool::Run(int argc, char const* const* argv, Options options) {
       verify_checksum = true;
     }
 
+    // Update options for when simulating writing a table file
+    {
+      BlockBasedTableOptions bbto;
+      if (options.table_factory->IsInstanceOf(
+              TableFactory::kBlockBasedTableName()) &&
+          options.table_factory->GetOptions<BlockBasedTableOptions>()) {
+        bbto = *options.table_factory->GetOptions<BlockBasedTableOptions>();
+      }
+      bbto.block_size = block_size;
+      // Maximize compression features available
+      bbto.format_version = kLatestFormatVersion;
+      options.table_factory = std::make_shared<BlockBasedTableFactory>(bbto);
+    }
+    options.compression_opts.max_dict_bytes = compression_max_dict_bytes;
+    options.compression_opts.zstd_max_train_bytes =
+        compression_zstd_max_train_bytes;
+    options.compression_opts.max_dict_buffer_bytes =
+        compression_max_dict_buffer_bytes;
+    options.compression_opts.use_zstd_dict_trainer =
+        !compression_use_zstd_finalize_dict;
+    options.compression_opts.parallel_threads = compression_parallel_threads;
+
     ROCKSDB_NAMESPACE::SstFileDumper dumper(
         options, filename, Temperature::kUnknown, readahead_size,
         verify_checksum, output_hex, decode_blob_index);
+
     // Not a valid SST
     if (!dumper.getStatus().ok()) {
       fprintf(stderr, "%s: %s\n", filename.c_str(),
@@ -471,15 +518,14 @@ int SSTDumpTool::Run(int argc, char const* const* argv, Options options) {
     }
 
     if (command == "recompress") {
+      fprintf(stdout, "Block Size: %zu  Threads: %u\n", block_size,
+              (unsigned)compression_parallel_threads);
       // TODO: consider getting supported compressions from the compression
       // manager
       st = dumper.ShowAllCompressionSizes(
-          set_block_size ? block_size : 16384,
           compression_types.empty() ? GetSupportedCompressions()
                                     : compression_types,
-          compress_level_from, compress_level_to, compression_max_dict_bytes,
-          compression_zstd_max_train_bytes, compression_max_dict_buffer_bytes,
-          !compression_use_zstd_finalize_dict);
+          compress_level_from, compress_level_to);
       if (!st.ok()) {
         fprintf(stderr, "Failed to recompress: %s\n", st.ToString().c_str());
         exit(1);
diff --git a/unreleased_history/new_features/sst_dump_recompress.md b/unreleased_history/new_features/sst_dump_recompress.md
new file mode 100644
index 000000000000..76075299fa1c
--- /dev/null
+++ b/unreleased_history/new_features/sst_dump_recompress.md
@@ -0,0 +1 @@
+* Added new options and better output for `sst_dump --command=recompress`. See `sst_dump --help`

From a843991930aad82e5780e37a1a3126894dab0a7f Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Fri, 19 Sep 2025 16:01:43 -0700
Subject: [PATCH 294/500] Allow standalone file and directory arguments to
 sst_dump (#13978)

Summary:
longtime wanted e.g. for easy tab-completion, now implemented

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13978

Test Plan: pretty good unit test updates, manual testing

Reviewed By: cbi42

Differential Revision: D82857671

Pulled By: pdillinger

fbshipit-source-id: d2b63b7d15e61ebf22c58a6ecd3003311e2d03cb
---
 tools/sst_dump_test.cc                        |  82 +++++++++++---
 tools/sst_dump_tool.cc                        | 105 +++++++++++-------
 .../new_features/sst_dump_recompress.md       |   2 +-
 3 files changed, 131 insertions(+), 58 deletions(-)

diff --git a/tools/sst_dump_test.cc b/tools/sst_dump_test.cc
index ef7005e1085e..9480ffb30f2a 100644
--- a/tools/sst_dump_test.cc
+++ b/tools/sst_dump_test.cc
@@ -115,6 +115,9 @@ class SSTDumpToolTest : public testing::Test {
     }
   };
 
+#define ASSERT_TOOL_PASS(tool_expr) ASSERT_EQ(0, (tool_expr));
+#define ASSERT_TOOL_FAIL(tool_expr) ASSERT_NE(0, (tool_expr));
+
   template <std::size_t N>
   void PopulateCommandArgs(const std::string& file_path, const char* command,
                            char* (&usage)[N]) const {
@@ -210,7 +213,7 @@ class SSTDumpToolTest : public testing::Test {
     PopulateCommandArgs(file_path, cmd_arg, usage);
 
     ROCKSDB_NAMESPACE::SSTDumpTool tool;
-    ASSERT_TRUE(!tool.Run(3, usage, opts));
+    ASSERT_TOOL_PASS(tool.Run(3, usage, opts));
 
     cleanup(opts, file_path);
   }
@@ -223,11 +226,13 @@ TEST_F(SSTDumpToolTest, HelpAndVersion) {
   ROCKSDB_NAMESPACE::SSTDumpTool tool;
 
   static const char* help[] = {"./sst_dump", "--help"};
-  ASSERT_TRUE(!tool.Run(2, help, opts));
+  ASSERT_TOOL_PASS(tool.Run(2, help, opts));
+  static const char* bad_help[] = {"./sst_dump", "--", "--help"};
+  ASSERT_TOOL_FAIL(tool.Run(3, bad_help, opts));
   static const char* version[] = {"./sst_dump", "--version"};
-  ASSERT_TRUE(!tool.Run(2, version, opts));
+  ASSERT_TOOL_PASS(tool.Run(2, version, opts));
   static const char* bad[] = {"./sst_dump", "--not_an_option"};
-  ASSERT_TRUE(tool.Run(2, bad, opts));
+  ASSERT_TOOL_FAIL(tool.Run(2, bad, opts));
 }
 
 TEST_F(SSTDumpToolTest, EmptyFilter) {
@@ -343,7 +348,7 @@ TEST_F(SSTDumpToolTest, CompressionManager) {
   snprintf(usage[4], kOptLength, "--compression_types=kCustomCompression8A");
 
   ROCKSDB_NAMESPACE::SSTDumpTool tool;
-  ASSERT_TRUE(!tool.Run(5, usage, opts));
+  ASSERT_TOOL_PASS(tool.Run(5, usage, opts));
 
   cleanup(opts, file_path);
 }
@@ -360,7 +365,7 @@ TEST_F(SSTDumpToolTest, MemEnv) {
   PopulateCommandArgs(file_path, "--command=verify_checksum", usage);
 
   ROCKSDB_NAMESPACE::SSTDumpTool tool;
-  ASSERT_TRUE(!tool.Run(3, usage, opts));
+  ASSERT_TOOL_PASS(tool.Run(3, usage, opts));
 
   cleanup(opts, file_path);
 }
@@ -382,7 +387,7 @@ TEST_F(SSTDumpToolTest, ReadaheadSize) {
   SyncPoint::GetInstance()->EnableProcessing();
 
   SSTDumpTool tool;
-  ASSERT_TRUE(!tool.Run(4, usage, opts));
+  ASSERT_TOOL_PASS(tool.Run(4, usage, opts));
 
   // The file is approximately 10MB. Readahead is 4MB.
   // We usually need 3 reads + one metadata read.
@@ -409,14 +414,14 @@ TEST_F(SSTDumpToolTest, NoSstFile) {
         "--command=verify", "--command=recompress", "--command=verify_checksum",
         "--show_properties"}) {
     snprintf(usage[1], kOptLength, "%s", command);
-    ASSERT_TRUE(tool.Run(3, usage, opts));
+    ASSERT_TOOL_FAIL(tool.Run(3, usage, opts));
   }
 }
 
 TEST_F(SSTDumpToolTest, ValidSSTPath) {
   Options opts;
   opts.env = env();
-  char* usage[3];
+  char* usage[5];
   auto cleanup_usage = CleanupUsage{usage};
   PopulateCommandArgs("", "", usage);
   SSTDumpTool tool;
@@ -427,21 +432,62 @@ TEST_F(SSTDumpToolTest, ValidSSTPath) {
   ASSERT_OK(WriteStringToFile(opts.env, "Hello World!", text_file, false));
   std::string fake_sst = MakeFilePath("fake_sst.sst");
   ASSERT_OK(WriteStringToFile(opts.env, "Not an SST file!", fake_sst, false));
+  std::string good_dir = MakeFilePath("");
 
   for (const auto& command_arg : {"--command=verify", "--command=identify"}) {
     snprintf(usage[1], kOptLength, "%s", command_arg);
 
-    snprintf(usage[2], kOptLength, "--file=%s", file_not_exists.c_str());
-    ASSERT_TRUE(tool.Run(3, usage, opts));
+    // Test both classic --file and standalone argument
+    for (const auto& file_fmt : {"--file=%s", "%s"}) {
+      snprintf(usage[2], kOptLength, file_fmt, file_not_exists.c_str());
+      ASSERT_TOOL_FAIL(tool.Run(3, usage, opts));
+
+      snprintf(usage[2], kOptLength, file_fmt, sst_file.c_str());
+      ASSERT_TOOL_PASS(tool.Run(3, usage, opts));
+
+      snprintf(usage[2], kOptLength, file_fmt, good_dir.c_str());
+      ASSERT_TOOL_PASS(tool.Run(3, usage, opts));
+
+      snprintf(usage[2], kOptLength, file_fmt, text_file.c_str());
+      ASSERT_TOOL_FAIL(tool.Run(3, usage, opts));
+
+      snprintf(usage[2], kOptLength, file_fmt, fake_sst.c_str());
+      ASSERT_TOOL_FAIL(tool.Run(3, usage, opts));
+    }
 
-    snprintf(usage[2], kOptLength, "--file=%s", sst_file.c_str());
-    ASSERT_TRUE(!tool.Run(3, usage, opts));
+    // If one file is valid, that's enough to succeed as long as the others
+    // exist
+    for (const auto& good : {sst_file, good_dir}) {
+      // Additional file-or-dir argument
+      snprintf(usage[3], kOptLength, "%s", good.c_str());
 
-    snprintf(usage[2], kOptLength, "--file=%s", text_file.c_str());
-    ASSERT_TRUE(tool.Run(3, usage, opts));
+      snprintf(usage[2], kOptLength, "%s", file_not_exists.c_str());
+      ASSERT_TOOL_FAIL(tool.Run(4, usage, opts));
 
-    snprintf(usage[2], kOptLength, "--file=%s", fake_sst.c_str());
-    ASSERT_TRUE(tool.Run(3, usage, opts));
+      snprintf(usage[2], kOptLength, "%s", sst_file.c_str());
+      ASSERT_TOOL_PASS(tool.Run(4, usage, opts));
+
+      snprintf(usage[2], kOptLength, "%s", good_dir.c_str());
+      ASSERT_TOOL_PASS(tool.Run(4, usage, opts));
+
+      snprintf(usage[2], kOptLength, "%s", text_file.c_str());
+      // DIFFERENT
+      ASSERT_TOOL_PASS(tool.Run(4, usage, opts));
+
+      snprintf(usage[2], kOptLength, "%s", fake_sst.c_str());
+      // DIFFERENT
+      ASSERT_TOOL_PASS(tool.Run(4, usage, opts));
+
+      // Some extra cases to test "--" handling
+      snprintf(usage[2], kOptLength, "%s", "--");
+      ASSERT_TOOL_PASS(tool.Run(4, usage, opts));
+
+      snprintf(usage[4], kOptLength, "%s", file_not_exists.c_str());
+      ASSERT_TOOL_FAIL(tool.Run(5, usage, opts));
+
+      snprintf(usage[4], kOptLength, "%s", fake_sst.c_str());
+      ASSERT_TOOL_PASS(tool.Run(5, usage, opts));
+    }
   }
   ASSERT_OK(opts.env->DeleteFile(sst_file));
   ASSERT_OK(opts.env->DeleteFile(text_file));
@@ -460,7 +506,7 @@ TEST_F(SSTDumpToolTest, RawOutput) {
   PopulateCommandArgs(file_path, "--command=raw", usage);
 
   ROCKSDB_NAMESPACE::SSTDumpTool tool;
-  ASSERT_TRUE(!tool.Run(3, usage, opts));
+  ASSERT_TOOL_PASS(tool.Run(3, usage, opts));
 
   const std::string raw_path = MakeFilePath("rocksdb_sst_test_dump.txt");
   std::ifstream raw_file(raw_path);
diff --git a/tools/sst_dump_tool.cc b/tools/sst_dump_tool.cc
index f8f4d28ae431..3e4a05b0473e 100644
--- a/tools/sst_dump_tool.cc
+++ b/tools/sst_dump_tool.cc
@@ -35,9 +35,9 @@ void print_help(bool to_stderr) {
   }
   fprintf(
       to_stderr ? stderr : stdout,
-      R"(sst_dump --file=<data_dir_OR_sst_file> [--command=check|scan|raw|recompress|identify]
-    --file=<data_dir_OR_sst_file>
-      Path to SST file or directory containing SST files
+      R"(sst_dump <db_dirs_OR_sst_files...> [--command=check|scan|raw|recompress|identify]
+    --file=<db_dir_OR_sst_file>
+      Path to SST file or directory containing SST files (old option syntax)
 
     --env_uri=<uri of underlying Env>
       URI of underlying Env, mutually exclusive with fs_uri
@@ -158,7 +158,12 @@ bool ParseIntArg(const char* arg, const std::string arg_name,
 
 int SSTDumpTool::Run(int argc, char const* const* argv, Options options) {
   std::string env_uri, fs_uri;
-  const char* dir_or_file = nullptr;
+  enum DirVsFile {
+    kUnknownDirVsFile,
+    kDir,
+    kFile,
+  };
+  std::vector<std::pair<const char*, DirVsFile>> dirs_or_files;
   uint64_t read_num = std::numeric_limits<uint64_t>::max();
   std::string command;
 
@@ -214,7 +219,7 @@ int SSTDumpTool::Run(int argc, char const* const* argv, Options options) {
     } else if (strncmp(argv[i], "--fs_uri=", 9) == 0) {
       fs_uri = argv[i] + 9;
     } else if (strncmp(argv[i], "--file=", 7) == 0) {
-      dir_or_file = argv[i] + 7;
+      dirs_or_files.emplace_back(argv[i] + 7, kUnknownDirVsFile);
     } else if (strcmp(argv[i], "--output_hex") == 0) {
       output_hex = true;
     } else if (strcmp(argv[i], "--decode_blob_index") == 0) {
@@ -372,10 +377,18 @@ int SSTDumpTool::Run(int argc, char const* const* argv, Options options) {
     } else if (strcmp(argv[i], "--version") == 0) {
       printf("%s\n", GetRocksBuildInfoAsString("sst_dump").c_str());
       return 0;
-    } else {
+    } else if (strcmp(argv[i], "--") == 0) {
+      // Remaining args are dir-or-file
+      for (++i; i < argc; ++i) {
+        dirs_or_files.emplace_back(argv[i], kUnknownDirVsFile);
+      }
+    } else if (argv[i][0] == '-') {
       fprintf(stderr, "Unrecognized argument '%s'\n\n", argv[i]);
       print_help(/*to_stderr*/ true);
       return 1;
+    } else {
+      // Dir-or-file arg
+      dirs_or_files.emplace_back(argv[i], kUnknownDirVsFile);
     }
   }
 
@@ -405,7 +418,7 @@ int SSTDumpTool::Run(int argc, char const* const* argv, Options options) {
     }
   }
 
-  if (dir_or_file == nullptr) {
+  if (dirs_or_files.empty()) {
     fprintf(stderr, "file or directory must be specified.\n\n");
     print_help(/*to_stderr*/ true);
     exit(1);
@@ -431,26 +444,35 @@ int SSTDumpTool::Run(int argc, char const* const* argv, Options options) {
 
   std::vector<std::string> filenames;
   ROCKSDB_NAMESPACE::Env* env = options.env;
-  ROCKSDB_NAMESPACE::Status st = env->GetChildren(dir_or_file, &filenames);
-  bool dir = true;
-  if (!st.ok() || filenames.empty()) {
-    // dir_or_file does not exist or does not contain children
-    // Check its existence first
-    Status s = env->FileExists(dir_or_file);
-    // dir_or_file does not exist
-    if (!s.ok()) {
-      fprintf(stderr, "%s%s: No such file or directory\n", s.ToString().c_str(),
-              dir_or_file);
-      return 1;
+  ROCKSDB_NAMESPACE::Status st;
+
+  for (size_t i = 0; i < dirs_or_files.size(); ++i) {
+    auto dir_or_file = dirs_or_files[i].first;
+    std::vector<std::string> children;
+    st = env->GetChildren(dirs_or_files[i].first, &children);
+    if (!st.ok() || children.empty()) {
+      // dir_or_file does not exist or does not contain children
+      // Check its existence first
+      Status s = env->FileExists(dir_or_file);
+      // dir_or_file does not exist
+      if (!s.ok()) {
+        fprintf(stderr, "%s%s: No such file or directory\n",
+                s.ToString().c_str(), dir_or_file);
+        return 1;
+      }
+      // dir_or_file exists and is treated as a "file"
+      // since it has no children
+      // This is ok since later it will be checked
+      // that whether it is a valid sst or not
+      // (A directory "file" is not a valid sst)
+      filenames.emplace_back(dir_or_file);
+      dirs_or_files[i].second = kFile;
+    } else {
+      for (auto& child : children) {
+        filenames.push_back(std::string{dir_or_file} + "/" + child);
+      }
+      dirs_or_files[i].second = kDir;
     }
-    // dir_or_file exists and is treated as a "file"
-    // since it has no children
-    // This is ok since later it will be checked
-    // that whether it is a valid sst or not
-    // (A directory "file" is not a valid sst)
-    filenames.clear();
-    filenames.emplace_back(dir_or_file);
-    dir = false;
   }
 
   uint64_t total_read = 0;
@@ -464,10 +486,6 @@ int SSTDumpTool::Run(int argc, char const* const* argv, Options options) {
       continue;
     }
 
-    if (dir) {
-      filename = std::string(dir_or_file) + "/" + filename;
-    }
-
     if (command == "verify") {
       verify_checksum = true;
     }
@@ -530,7 +548,7 @@ int SSTDumpTool::Run(int argc, char const* const* argv, Options options) {
         fprintf(stderr, "Failed to recompress: %s\n", st.ToString().c_str());
         exit(1);
       }
-      return 0;
+      continue;
     }
 
     if (command == "raw") {
@@ -658,25 +676,34 @@ int SSTDumpTool::Run(int argc, char const* const* argv, Options options) {
   if (valid_sst_files.empty()) {
     // No valid SST files are found
     // Exit with an error state
-    if (dir) {
-      fprintf(stdout, "------------------------------\n");
-      fprintf(stderr, "No valid SST files found in %s\n", dir_or_file);
-    } else {
-      fprintf(stderr, "%s is not a valid SST file\n", dir_or_file);
+    for (auto& e : dirs_or_files) {
+      if (e.second == kDir) {
+        fprintf(stdout, "------------------------------\n");
+        fprintf(stderr, "No valid SST files found in %s\n", e.first);
+      } else {
+        assert(e.second == kFile);
+        fprintf(stderr, "%s is not a valid SST file\n", e.first);
+      }
     }
     return 1;
   } else {
+    assert(!dirs_or_files.empty());
     if (command == "identify") {
-      if (dir) {
+      if (dirs_or_files.size() > 1 || dirs_or_files[0].second == kDir) {
         fprintf(stdout, "------------------------------\n");
-        fprintf(stdout, "List of valid SST files found in %s:\n", dir_or_file);
+        std::string single_dir_msg;
+        if (dirs_or_files.size() == 1) {
+          single_dir_msg += " found in ";
+          single_dir_msg += dirs_or_files[0].first;
+        }
+        fprintf(stdout, "List of valid SST files%s:\n", single_dir_msg.c_str());
         for (const auto& f : valid_sst_files) {
           fprintf(stdout, "%s\n", f.c_str());
         }
         fprintf(stdout, "Number of valid SST files: %zu\n",
                 valid_sst_files.size());
       } else {
-        fprintf(stdout, "%s is a valid SST file\n", dir_or_file);
+        fprintf(stdout, "%s is a valid SST file\n", dirs_or_files[0].first);
       }
     }
     // At least one valid SST
diff --git a/unreleased_history/new_features/sst_dump_recompress.md b/unreleased_history/new_features/sst_dump_recompress.md
index 76075299fa1c..de8a177e12bf 100644
--- a/unreleased_history/new_features/sst_dump_recompress.md
+++ b/unreleased_history/new_features/sst_dump_recompress.md
@@ -1 +1 @@
-* Added new options and better output for `sst_dump --command=recompress`. See `sst_dump --help`
+* Improved `sst_dump` by allowing standalone file and directory arguments without `--file=`. Also added new options and better output for `sst_dump --command=recompress`. See `sst_dump --help`

From f9f408f53626498385b19555de7f1ccc9332b6be Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Fri, 19 Sep 2025 17:34:48 -0700
Subject: [PATCH 295/500] Start migration of HCC implementation to BitFields
 (#13965)

Summary:
Start the process of migrating the HCC implementation over to my new system of "bit field atomics" to clean up the code. Here I took on the simplest of the three "bit field atomic" formats in HCC, but ended up moving some things around to end up with less plumbing of definitions and values overall.

In the process, updated BitFields to use the CRTP pattern to simplify some things (see updated example, etc.)
https://en.wikipedia.org/wiki/Curiously_recurring_template_pattern

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13965

Test Plan: existing tests. ClockCacheTest.ClockEvictionEffortCapTest caught a regression during my development, and the crash test has a history of finding subtle HCC bugs.

Reviewed By: xingbowang

Differential Revision: D82669582

Pulled By: pdillinger

fbshipit-source-id: b73dd47361cbe9fbd334413dd4ce01b3c667159e
---
 cache/clock_cache.cc                          | 124 +++++++++---------
 cache/clock_cache.h                           |  73 +++++++----
 .../block_based/block_based_table_builder.cc  |   3 +-
 util/bit_fields.h                             |  40 +++---
 util/slice_test.cc                            |   8 +-
 5 files changed, 134 insertions(+), 114 deletions(-)

diff --git a/cache/clock_cache.cc b/cache/clock_cache.cc
index a14a15ece855..e4e327e3b637 100644
--- a/cache/clock_cache.cc
+++ b/cache/clock_cache.cc
@@ -360,16 +360,9 @@ void ConstApplyToEntriesRange(const Func& func, const HandleImpl* begin,
   }
 }
 
-constexpr uint32_t kStrictCapacityLimitBit = 1u << 31;
-
-uint32_t SanitizeEncodeEecAndScl(int eviction_effort_cap,
-                                 bool strict_capacit_limit) {
+uint32_t SanitizeEvictionEffortCap(int eviction_effort_cap) {
   eviction_effort_cap = std::max(int{1}, eviction_effort_cap);
-  eviction_effort_cap =
-      std::min(static_cast<int>(~kStrictCapacityLimitBit), eviction_effort_cap);
-  uint32_t eec_and_scl = static_cast<uint32_t>(eviction_effort_cap);
-  eec_and_scl |= strict_capacit_limit ? kStrictCapacityLimitBit : 0;
-  return eec_and_scl;
+  return static_cast<uint32_t>(eviction_effort_cap);
 }
 
 }  // namespace
@@ -380,6 +373,22 @@ void ClockHandleBasicData::FreeData(MemoryAllocator* allocator) const {
   }
 }
 
+BaseClockTable::BaseClockTable(size_t capacity, bool strict_capacity_limit,
+                               int eviction_effort_cap,
+                               CacheMetadataChargePolicy metadata_charge_policy,
+                               MemoryAllocator* allocator,
+                               const Cache::EvictionCallback* eviction_callback,
+                               const uint32_t* hash_seed)
+    : capacity_(capacity),
+      eec_and_scl_(EecAndScl{}
+                       .With<EvictionEffortCap>(
+                           SanitizeEvictionEffortCap(eviction_effort_cap))
+                       .With<StrictCapacityLimit>(strict_capacity_limit)),
+      metadata_charge_policy_(metadata_charge_policy),
+      allocator_(allocator),
+      eviction_callback_(*eviction_callback),
+      hash_seed_(*hash_seed) {}
+
 template <class HandleImpl>
 HandleImpl* BaseClockTable::StandaloneInsert(
     const ClockHandleBasicData& proto) {
@@ -401,8 +410,7 @@ HandleImpl* BaseClockTable::StandaloneInsert(
 
 template <class Table>
 typename Table::HandleImpl* BaseClockTable::CreateStandalone(
-    ClockHandleBasicData& proto, size_t capacity, uint32_t eec_and_scl,
-    bool allow_uncharged) {
+    ClockHandleBasicData& proto, bool allow_uncharged) {
   Table& derived = static_cast<Table&>(*this);
   typename Table::InsertState state;
   derived.StartInsert(state);
@@ -411,10 +419,10 @@ typename Table::HandleImpl* BaseClockTable::CreateStandalone(
   // NOTE: we can use eec_and_scl as eviction_effort_cap below because
   // strict_capacity_limit=true is supposed to disable the limit on eviction
   // effort, and a large value effectively does that.
-  if (eec_and_scl & kStrictCapacityLimitBit) {
+  if (eec_and_scl_.LoadRelaxed().Get<StrictCapacityLimit>()) {
     Status s = ChargeUsageMaybeEvictStrict<Table>(
-        total_charge, capacity,
-        /*need_evict_for_occupancy=*/false, eec_and_scl, state);
+        total_charge,
+        /*need_evict_for_occupancy=*/false, state);
     if (!s.ok()) {
       if (allow_uncharged) {
         proto.total_charge = 0;
@@ -425,8 +433,8 @@ typename Table::HandleImpl* BaseClockTable::CreateStandalone(
   } else {
     // Case strict_capacity_limit == false
     bool success = ChargeUsageMaybeEvictNonStrict<Table>(
-        total_charge, capacity,
-        /*need_evict_for_occupancy=*/false, eec_and_scl, state);
+        total_charge,
+        /*need_evict_for_occupancy=*/false, state);
     if (!success) {
       // Force the issue
       usage_.FetchAddRelaxed(total_charge);
@@ -438,8 +446,9 @@ typename Table::HandleImpl* BaseClockTable::CreateStandalone(
 
 template <class Table>
 Status BaseClockTable::ChargeUsageMaybeEvictStrict(
-    size_t total_charge, size_t capacity, bool need_evict_for_occupancy,
-    uint32_t eviction_effort_cap, typename Table::InsertState& state) {
+    size_t total_charge, bool need_evict_for_occupancy,
+    typename Table::InsertState& state) {
+  const size_t capacity = capacity_.LoadRelaxed();
   if (total_charge > capacity) {
     return Status::MemoryLimit(
         "Cache entry too large for a single cache shard: " +
@@ -464,8 +473,7 @@ Status BaseClockTable::ChargeUsageMaybeEvictStrict(
   }
   if (request_evict_charge > 0) {
     EvictionData data;
-    static_cast<Table*>(this)->Evict(request_evict_charge, state, &data,
-                                     eviction_effort_cap);
+    static_cast<Table*>(this)->Evict(request_evict_charge, state, &data);
     occupancy_.FetchSub(data.freed_count);
     if (LIKELY(data.freed_charge > need_evict_charge)) {
       assert(data.freed_count > 0);
@@ -494,8 +502,8 @@ Status BaseClockTable::ChargeUsageMaybeEvictStrict(
 
 template <class Table>
 inline bool BaseClockTable::ChargeUsageMaybeEvictNonStrict(
-    size_t total_charge, size_t capacity, bool need_evict_for_occupancy,
-    uint32_t eviction_effort_cap, typename Table::InsertState& state) {
+    size_t total_charge, bool need_evict_for_occupancy,
+    typename Table::InsertState& state) {
   // For simplicity, we consider that either the cache can accept the insert
   // with no evictions, or we must evict enough to make (at least) enough
   // space. It could lead to unnecessary failures or excessive evictions in
@@ -505,7 +513,8 @@ inline bool BaseClockTable::ChargeUsageMaybeEvictNonStrict(
   // charge. Thus, we should evict some extra if it's not a signifcant
   // portion of the shard capacity. This can have the side benefit of
   // involving fewer threads in eviction.
-  size_t old_usage = usage_.LoadRelaxed();
+  const size_t old_usage = usage_.LoadRelaxed();
+  const size_t capacity = capacity_.LoadRelaxed();
   size_t need_evict_charge;
   // NOTE: if total_charge > old_usage, there isn't yet enough to evict
   // `total_charge` amount. Even if we only try to evict `old_usage` amount,
@@ -531,8 +540,7 @@ inline bool BaseClockTable::ChargeUsageMaybeEvictNonStrict(
   }
   EvictionData data;
   if (need_evict_charge > 0) {
-    static_cast<Table*>(this)->Evict(need_evict_charge, state, &data,
-                                     eviction_effort_cap);
+    static_cast<Table*>(this)->Evict(need_evict_charge, state, &data);
     // Deal with potential occupancy deficit
     if (UNLIKELY(need_evict_for_occupancy) && data.freed_count == 0) {
       assert(data.freed_charge == 0);
@@ -568,8 +576,10 @@ void BaseClockTable::TrackAndReleaseEvictedEntry(ClockHandle* h) {
   MarkEmpty(*h);
 }
 
-bool IsEvictionEffortExceeded(const BaseClockTable::EvictionData& data,
-                              uint32_t eviction_effort_cap) {
+bool BaseClockTable::IsEvictionEffortExceeded(
+    const BaseClockTable::EvictionData& data) const {
+  auto eviction_effort_cap =
+      eec_and_scl_.LoadRelaxed().GetEffectiveEvictionEffortCap();
   // Basically checks whether the ratio of useful effort to wasted effort is
   // too low, with a start-up allowance for wasted effort before any useful
   // effort.
@@ -580,8 +590,7 @@ bool IsEvictionEffortExceeded(const BaseClockTable::EvictionData& data,
 template <class Table>
 Status BaseClockTable::Insert(const ClockHandleBasicData& proto,
                               typename Table::HandleImpl** handle,
-                              Cache::Priority priority, size_t capacity,
-                              uint32_t eec_and_scl) {
+                              Cache::Priority priority) {
   using HandleImpl = typename Table::HandleImpl;
   Table& derived = static_cast<Table&>(*this);
 
@@ -602,9 +611,9 @@ Status BaseClockTable::Insert(const ClockHandleBasicData& proto,
   // NOTE: we can use eec_and_scl as eviction_effort_cap below because
   // strict_capacity_limit=true is supposed to disable the limit on eviction
   // effort, and a large value effectively does that.
-  if (eec_and_scl & kStrictCapacityLimitBit) {
+  if (eec_and_scl_.LoadRelaxed().Get<StrictCapacityLimit>()) {
     Status s = ChargeUsageMaybeEvictStrict<Table>(
-        total_charge, capacity, need_evict_for_occupancy, eec_and_scl, state);
+        total_charge, need_evict_for_occupancy, state);
     if (!s.ok()) {
       // Revert occupancy
       occupancy_.FetchSubRelaxed(1);
@@ -613,7 +622,7 @@ Status BaseClockTable::Insert(const ClockHandleBasicData& proto,
   } else {
     // Case strict_capacity_limit == false
     bool success = ChargeUsageMaybeEvictNonStrict<Table>(
-        total_charge, capacity, need_evict_for_occupancy, eec_and_scl, state);
+        total_charge, need_evict_for_occupancy, state);
     if (!success) {
       // Revert occupancy
       occupancy_.FetchSubRelaxed(1);
@@ -717,11 +726,13 @@ void BaseClockTable::TEST_ReleaseNMinus1(ClockHandle* h, size_t n) {
 #endif
 
 FixedHyperClockTable::FixedHyperClockTable(
-    size_t capacity, CacheMetadataChargePolicy metadata_charge_policy,
+    size_t capacity, bool strict_capacity_limit,
+    CacheMetadataChargePolicy metadata_charge_policy,
     MemoryAllocator* allocator,
     const Cache::EvictionCallback* eviction_callback, const uint32_t* hash_seed,
     const Opts& opts)
-    : BaseClockTable(metadata_charge_policy, allocator, eviction_callback,
+    : BaseClockTable(capacity, strict_capacity_limit, opts.eviction_effort_cap,
+                     metadata_charge_policy, allocator, eviction_callback,
                      hash_seed),
       length_bits_(CalcHashBits(capacity, opts.estimated_value_size,
                                 metadata_charge_policy)),
@@ -1112,8 +1123,7 @@ inline void FixedHyperClockTable::ReclaimEntryUsage(size_t total_charge) {
 }
 
 inline void FixedHyperClockTable::Evict(size_t requested_charge, InsertState&,
-                                        EvictionData* data,
-                                        uint32_t eviction_effort_cap) {
+                                        EvictionData* data) {
   // precondition
   assert(requested_charge > 0);
 
@@ -1148,7 +1158,7 @@ inline void FixedHyperClockTable::Evict(size_t requested_charge, InsertState&,
     if (old_clock_pointer >= max_clock_pointer) {
       return;
     }
-    if (IsEvictionEffortExceeded(*data, eviction_effort_cap)) {
+    if (IsEvictionEffortExceeded(*data)) {
       eviction_effort_exceeded_count_.FetchAddRelaxed(1);
       return;
     }
@@ -1166,14 +1176,11 @@ ClockCacheShard<Table>::ClockCacheShard(
     const Cache::EvictionCallback* eviction_callback, const uint32_t* hash_seed,
     const typename Table::Opts& opts)
     : CacheShardBase(metadata_charge_policy),
-      table_(capacity, metadata_charge_policy, allocator, eviction_callback,
-             hash_seed, opts),
-      capacity_(capacity),
-      eec_and_scl_(SanitizeEncodeEecAndScl(opts.eviction_effort_cap,
-                                           strict_capacity_limit)) {
+      table_(capacity, strict_capacity_limit, metadata_charge_policy, allocator,
+             eviction_callback, hash_seed, opts) {
   // Initial charge metadata should not exceed capacity
-  assert(table_.GetUsage() <= capacity_.LoadRelaxed() ||
-         capacity_.LoadRelaxed() < sizeof(HandleImpl));
+  assert(table_.GetUsage() <= table_.GetCapacity() ||
+         table_.GetCapacity() < sizeof(HandleImpl));
 }
 
 template <class Table>
@@ -1239,18 +1246,14 @@ int FixedHyperClockTable::CalcHashBits(
 
 template <class Table>
 void ClockCacheShard<Table>::SetCapacity(size_t capacity) {
-  capacity_.StoreRelaxed(capacity);
+  table_.SetCapacity(capacity);
   // next Insert will take care of any necessary evictions
 }
 
 template <class Table>
 void ClockCacheShard<Table>::SetStrictCapacityLimit(
     bool strict_capacity_limit) {
-  if (strict_capacity_limit) {
-    eec_and_scl_.FetchOrRelaxed(kStrictCapacityLimitBit);
-  } else {
-    eec_and_scl_.FetchAndRelaxed(~kStrictCapacityLimitBit);
-  }
+  table_.SetStrictCapacityLimit(strict_capacity_limit);
   // next Insert will take care of any necessary evictions
 }
 
@@ -1270,9 +1273,7 @@ Status ClockCacheShard<Table>::Insert(const Slice& key,
   proto.value = value;
   proto.helper = helper;
   proto.total_charge = charge;
-  return table_.template Insert<Table>(proto, handle, priority,
-                                       capacity_.LoadRelaxed(),
-                                       eec_and_scl_.LoadRelaxed());
+  return table_.template Insert<Table>(proto, handle, priority);
 }
 
 template <class Table>
@@ -1287,9 +1288,7 @@ typename Table::HandleImpl* ClockCacheShard<Table>::CreateStandalone(
   proto.value = obj;
   proto.helper = helper;
   proto.total_charge = charge;
-  return table_.template CreateStandalone<Table>(proto, capacity_.LoadRelaxed(),
-                                                 eec_and_scl_.LoadRelaxed(),
-                                                 allow_uncharged);
+  return table_.template CreateStandalone<Table>(proto, allow_uncharged);
 }
 
 template <class Table>
@@ -1358,7 +1357,7 @@ size_t ClockCacheShard<Table>::GetStandaloneUsage() const {
 
 template <class Table>
 size_t ClockCacheShard<Table>::GetCapacity() const {
-  return capacity_.LoadRelaxed();
+  return table_.GetCapacity();
 }
 
 template <class Table>
@@ -1971,11 +1970,13 @@ class AutoHyperClockTable::ChainRewriteLock {
 };
 
 AutoHyperClockTable::AutoHyperClockTable(
-    size_t capacity, CacheMetadataChargePolicy metadata_charge_policy,
+    size_t capacity, bool strict_capacity_limit,
+    CacheMetadataChargePolicy metadata_charge_policy,
     MemoryAllocator* allocator,
     const Cache::EvictionCallback* eviction_callback, const uint32_t* hash_seed,
     const Opts& opts)
-    : BaseClockTable(metadata_charge_policy, allocator, eviction_callback,
+    : BaseClockTable(capacity, strict_capacity_limit, opts.eviction_effort_cap,
+                     metadata_charge_policy, allocator, eviction_callback,
                      hash_seed),
       array_(MemMapping::AllocateLazyZeroed(
           sizeof(HandleImpl) * CalcMaxUsableLength(capacity,
@@ -3480,8 +3481,7 @@ void AutoHyperClockTable::EraseUnRefEntries() {
 }
 
 void AutoHyperClockTable::Evict(size_t requested_charge, InsertState& state,
-                                EvictionData* data,
-                                uint32_t eviction_effort_cap) {
+                                EvictionData* data) {
   // precondition
   assert(requested_charge > 0);
 
@@ -3573,7 +3573,7 @@ void AutoHyperClockTable::Evict(size_t requested_charge, InsertState& state,
       return;
     }
 
-    if (IsEvictionEffortExceeded(*data, eviction_effort_cap)) {
+    if (IsEvictionEffortExceeded(*data)) {
       eviction_effort_exceeded_count_.FetchAddRelaxed(1);
       return;
     }
diff --git a/cache/clock_cache.h b/cache/clock_cache.h
index 895936900dd8..5ac8467bd3a3 100644
--- a/cache/clock_cache.h
+++ b/cache/clock_cache.h
@@ -20,6 +20,7 @@
 #include "port/mmap.h"
 #include "rocksdb/cache.h"
 #include "util/atomic.h"
+#include "util/bit_fields.h"
 #include "util/math.h"
 
 namespace ROCKSDB_NAMESPACE {
@@ -376,25 +377,20 @@ class BaseClockTable {
     int eviction_effort_cap;
   };
 
-  BaseClockTable(CacheMetadataChargePolicy metadata_charge_policy,
+  BaseClockTable(size_t capacity, bool strict_capacity_limit,
+                 int eviction_effort_cap,
+                 CacheMetadataChargePolicy metadata_charge_policy,
                  MemoryAllocator* allocator,
                  const Cache::EvictionCallback* eviction_callback,
-                 const uint32_t* hash_seed)
-      : metadata_charge_policy_(metadata_charge_policy),
-        allocator_(allocator),
-        eviction_callback_(*eviction_callback),
-        hash_seed_(*hash_seed) {}
+                 const uint32_t* hash_seed);
 
   template <class Table>
   typename Table::HandleImpl* CreateStandalone(ClockHandleBasicData& proto,
-                                               size_t capacity,
-                                               uint32_t eec_and_scl,
                                                bool allow_uncharged);
 
   template <class Table>
   Status Insert(const ClockHandleBasicData& proto,
-                typename Table::HandleImpl** handle, Cache::Priority priority,
-                size_t capacity, uint32_t eec_and_scl);
+                typename Table::HandleImpl** handle, Cache::Priority priority);
 
   void Ref(ClockHandle& handle);
 
@@ -404,6 +400,18 @@ class BaseClockTable {
 
   size_t GetStandaloneUsage() const { return standalone_usage_.LoadRelaxed(); }
 
+  size_t GetCapacity() const { return capacity_.LoadRelaxed(); }
+
+  void SetCapacity(size_t capacity) { capacity_.StoreRelaxed(capacity); }
+
+  void SetStrictCapacityLimit(bool strict_capacity_limit) {
+    if (strict_capacity_limit) {
+      eec_and_scl_.ApplyRelaxed(StrictCapacityLimit::SetTransform());
+    } else {
+      eec_and_scl_.ApplyRelaxed(StrictCapacityLimit::ClearTransform());
+    }
+  }
+
   uint32_t GetHashSeed() const { return hash_seed_; }
 
   uint64_t GetYieldCount() const { return yield_count_.LoadRelaxed(); }
@@ -420,6 +428,7 @@ class BaseClockTable {
 
   void TrackAndReleaseEvictedEntry(ClockHandle* h);
 
+  bool IsEvictionEffortExceeded(const BaseClockTable::EvictionData& data) const;
 #ifndef NDEBUG
   // Acquire N references
   void TEST_RefN(ClockHandle& handle, size_t n);
@@ -441,9 +450,8 @@ class BaseClockTable {
   // required, and the operation should fail if not possible.
   // NOTE: Otherwise, occupancy_ is not managed in this function
   template <class Table>
-  Status ChargeUsageMaybeEvictStrict(size_t total_charge, size_t capacity,
+  Status ChargeUsageMaybeEvictStrict(size_t total_charge,
                                      bool need_evict_for_occupancy,
-                                     uint32_t eviction_effort_cap,
                                      typename Table::InsertState& state);
 
   // Helper for updating `usage_` for new entry with given `total_charge`
@@ -455,9 +463,8 @@ class BaseClockTable {
   // true, indicating success.
   // NOTE: occupancy_ is not managed in this function
   template <class Table>
-  bool ChargeUsageMaybeEvictNonStrict(size_t total_charge, size_t capacity,
+  bool ChargeUsageMaybeEvictNonStrict(size_t total_charge,
                                       bool need_evict_for_occupancy,
-                                      uint32_t eviction_effort_cap,
                                       typename Table::InsertState& state);
 
  protected:  // data
@@ -490,6 +497,25 @@ class BaseClockTable {
   // Part of usage by standalone entries (not in table)
   AcqRelAtomic<size_t> standalone_usage_{};
 
+  // Maximum total charge of all elements stored in the table.
+  // (Relaxed: eventual consistency/update is OK)
+  RelaxedAtomic<size_t> capacity_;
+
+  // Encodes eviction_effort_cap (bottom 31 bits) and strict_capacity_limit
+  // (top bit). See HyperClockCacheOptions::eviction_effort_cap etc.
+  struct EecAndScl : public BitFields<uint32_t, EecAndScl> {
+    uint32_t GetEffectiveEvictionEffortCap() const {
+      // Because setting strict_capacity_limit is supposed to imply infinite
+      // cap on eviction effort, we can let the bit for strict_capacity_limit
+      // in the upper-most bit position to used as part of the effective cap.
+      return underlying;
+    }
+  };
+  using EvictionEffortCap = UnsignedBitField<EecAndScl, 31, NoPrevBitField>;
+  using StrictCapacityLimit = BoolBitField<EecAndScl, EvictionEffortCap>;
+  // (Relaxed: eventual consistency/update is OK)
+  RelaxedBitFieldsAtomic<EecAndScl> eec_and_scl_;
+
   ALIGN_AS(CACHE_LINE_SIZE)
   const CacheMetadataChargePolicy metadata_charge_policy_;
 
@@ -544,7 +570,7 @@ class FixedHyperClockTable : public BaseClockTable {
     size_t estimated_value_size;
   };
 
-  FixedHyperClockTable(size_t capacity,
+  FixedHyperClockTable(size_t capacity, bool strict_capacity_limit,
                        CacheMetadataChargePolicy metadata_charge_policy,
                        MemoryAllocator* allocator,
                        const Cache::EvictionCallback* eviction_callback,
@@ -566,8 +592,7 @@ class FixedHyperClockTable : public BaseClockTable {
   // Runs the clock eviction algorithm trying to reclaim at least
   // requested_charge. Returns how much is evicted, which could be less
   // if it appears impossible to evict the requested amount without blocking.
-  void Evict(size_t requested_charge, InsertState& state, EvictionData* data,
-             uint32_t eviction_effort_cap);
+  void Evict(size_t requested_charge, InsertState& state, EvictionData* data);
 
   HandleImpl* Lookup(const UniqueId64x2& hashed_key);
 
@@ -834,7 +859,7 @@ class AutoHyperClockTable : public BaseClockTable {
     size_t min_avg_value_size;
   };
 
-  AutoHyperClockTable(size_t capacity,
+  AutoHyperClockTable(size_t capacity, bool strict_capacity_limit,
                       CacheMetadataChargePolicy metadata_charge_policy,
                       MemoryAllocator* allocator,
                       const Cache::EvictionCallback* eviction_callback,
@@ -861,8 +886,7 @@ class AutoHyperClockTable : public BaseClockTable {
   // Runs the clock eviction algorithm trying to reclaim at least
   // requested_charge. Returns how much is evicted, which could be less
   // if it appears impossible to evict the requested amount without blocking.
-  void Evict(size_t requested_charge, InsertState& state, EvictionData* data,
-             uint32_t eviction_effort_cap);
+  void Evict(size_t requested_charge, InsertState& state, EvictionData* data);
 
   HandleImpl* Lookup(const UniqueId64x2& hashed_key);
 
@@ -1095,15 +1119,6 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShardBase {
 
  private:  // data
   Table table_;
-
-  // Maximum total charge of all elements stored in the table.
-  // (Relaxed: eventual consistency/update is OK)
-  RelaxedAtomic<size_t> capacity_;
-
-  // Encodes eviction_effort_cap (bottom 31 bits) and strict_capacity_limit
-  // (top bit). See HyperClockCacheOptions::eviction_effort_cap etc.
-  // (Relaxed: eventual consistency/update is OK)
-  RelaxedAtomic<uint32_t> eec_and_scl_;
 };  // class ClockCacheShard
 
 template <class Table>
diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc
index 620cf5fe8d7c..a660ff0b9854 100644
--- a/table/block_based/block_based_table_builder.cc
+++ b/table/block_based/block_based_table_builder.cc
@@ -304,8 +304,7 @@ struct BlockBasedTableBuilder::ParallelCompressionRep {
   // state fields that are best updated atomically to avoid locking and/or to
   // simplify the interesting interleavings that have to be considered and
   // accommodated.
-  struct StateID {};
-  struct State : public BitFields<uint64_t, StateID> {};
+  struct State : public BitFields<uint64_t, State> {};
   ALIGN_AS(CACHE_LINE_SIZE) AcqRelBitFieldsAtomic<State> atomic_state;
 
   // The first field is a bit for each ring buffer slot (max 32) for whether
diff --git a/util/bit_fields.h b/util/bit_fields.h
index d1380cc28d9e..c2aeaf86ff8a 100644
--- a/util/bit_fields.h
+++ b/util/bit_fields.h
@@ -8,6 +8,7 @@
 #include <atomic>
 
 #include "rocksdb/rocksdb_namespace.h"
+#include "util/math.h"
 
 namespace ROCKSDB_NAMESPACE {
 
@@ -27,13 +28,7 @@ namespace ROCKSDB_NAMESPACE {
 // The specific bit fields are declared outside the declaration using
 // BoolBitField and UnsignedBitField below. Example usage:
 //
-// // A unique compile-time identifier to ensure we don't mix up different
-// // bit fields.
-// struct MyStateID {};
-//
-// using MyState = BitFields<uint32_t, MyStateID>;
-//  - or -
-// struct MyState : public BitFields<uint32_t, MyStateID> {
+// struct MyState : public BitFields<uint32_t, MyState> {
 //   // Extra helper declarations and/or field type declarations
 // };
 //
@@ -43,9 +38,8 @@ namespace ROCKSDB_NAMESPACE {
 // using Field3 = BoolBitField<MyState, Field2>;
 // using Field4 = UnsignedBitField<MyState, 5, Field3>;  // 5 bits in a uint8_t
 //
-// MyState state;  // zero-initialized
-// state.Set<Field1>(42U);
-// state.Set<Field2>(true);
+// // MyState{} is zero-initialized
+// auto state = MyState{}.With<Field1>(42U).With<Field2>(true);
 // state.Set<Field4>(3U);
 // state.Ref<Field1>() += state.Get<Field4>();
 //
@@ -56,23 +50,36 @@ namespace ROCKSDB_NAMESPACE {
 //
 // using Field3a = UnsignedBitField<State, 6, Field2>;  // 6 bits in a uint8_t
 //
-template <typename UnderlyingT, typename IdentifyingT>
+template <typename UnderlyingT, typename DerivedT>
 struct BitFields {
   using U = UnderlyingT;
   U underlying = 0;
   static constexpr int kBitCount = sizeof(U) * 8;
 
-  using ID = IdentifyingT;
+  using Derived = DerivedT;
 
+  // Modify a given field in place
   template <typename BitFieldT>
   void Set(typename BitFieldT::V value) {
-    BitFieldT::SetIn(static_cast<typename BitFieldT::Parent&>(*this), value);
+    static_assert(std::is_same_v<typename BitFieldT::Parent, Derived>);
+    Derived& derived = static_cast<Derived&>(*this);
+    BitFieldT::SetIn(derived, value);
+  }
+
+  // Return a copy with the given field modified
+  template <typename BitFieldT>
+  Derived With(typename BitFieldT::V value) const {
+    static_assert(std::is_same_v<typename BitFieldT::Parent, Derived>);
+    Derived rv = static_cast<const Derived&>(*this);
+    BitFieldT::SetIn(rv, value);
+    return rv;
   }
 
+  // Get the value of a field
   template <typename BitFieldT>
   typename BitFieldT::V Get() const {
-    return BitFieldT::GetFrom(
-        static_cast<const typename BitFieldT::Parent&>(*this));
+    static_assert(std::is_same_v<typename BitFieldT::Parent, Derived>);
+    return BitFieldT::GetFrom(static_cast<const Derived&>(*this));
   }
 
   // Reference and Ref() are not intended to behave as full references but to
@@ -152,7 +159,8 @@ struct NoPrevBitField {
 template <typename BitFieldsT, typename PrevField>
 struct BoolBitField {
   using Parent = BitFieldsT;
-  using ParentBase = BitFields<typename BitFieldsT::U, typename BitFieldsT::ID>;
+  using ParentBase =
+      BitFields<typename BitFieldsT::U, typename BitFieldsT::Derived>;
   using U = typename BitFieldsT::U;
   using V = bool;
   static constexpr int kBitOffset = PrevField::kEndBit;
diff --git a/util/slice_test.cc b/util/slice_test.cc
index 380c6f50bea4..72b9f19376d9 100644
--- a/util/slice_test.cc
+++ b/util/slice_test.cc
@@ -473,8 +473,7 @@ TEST(SemaphoreTest, BinarySemaphore) {
 
 TEST(BitFieldsTest, BitFields) {
   // Start by verifying example from BitFields comment
-  struct MyStateID {};
-  struct MyState : public BitFields<uint32_t, MyStateID> {
+  struct MyState : public BitFields<uint32_t, MyState> {
     // Extra helper declarations and/or field type declarations
   };
 
@@ -483,9 +482,8 @@ TEST(BitFieldsTest, BitFields) {
   using Field3 = BoolBitField<MyState, Field2>;
   using Field4 = UnsignedBitField<MyState, 5, Field3>;
 
-  MyState state;  // zero-initialized
-  state.Set<Field1>(42U);
-  state.Set<Field2>(true);
+  // MyState{} is zero-initialized
+  auto state = MyState{}.With<Field1>(42U).With<Field2>(true);
   state.Set<Field4>(3U);
   state.Ref<Field1>() += state.Get<Field4>();
 

From 73432a3f369d2f6331b68c907a0ffac4e9a3d653 Mon Sep 17 00:00:00 2001
From: Xingbo Wang <xbw@fb.com>
Date: Fri, 19 Sep 2025 19:52:55 -0700
Subject: [PATCH 296/500] Improve random seed override support in stress test
 (#13952)

Summary:
Support random seed for white box test
Support per iteration random seed override, so that we could skip previous iterations, as sometimes failure happens after a few iterations.
The reason we still need initial random seed is that some of the parameter is initialized before each iteration, and not all of the parameters are randomized again in each iteration. The reason is that we want some of the parameters to be stable across the run.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13952

Test Plan:
Example for using per iteration random seed override to jump the to second iteration.

Simulate a normal run. 4205502355970671733 is the seed used for the second iteration.
```
[xbw@devvm16622.vll0 ~/workspace/ws2/rocksdb (plm_stress_fix)]$ /usr/local/bin/python3 -u tools/db_crashtest.py --stress_cmd=./db_stress --cleanup_cmd='' --cf_consistency blackbox --duration=96000 --max_key=2500000 --interval=10 --initial_random_seed_override=10
Start with random seed 10
Running blackbox-crash-test with
interval_between_crash=10
total-duration=96000

Use random seed for iteration 13278846177722289202
Running db_stress with pid=2102945: ./db_stress --WAL_size_limit_MB=1 --WAL_ttl_seconds=0 --acquire_snapshot_one_in=10000 --adaptive_readahead=0 --adm_policy=2 --advise_random_on_open=0 --allow_data_in_errors=True --allow_fallocate=0 --allow_setting_blob_options_dynamically=1 --allow_unprepared_value=0 --async_io=1 --atomic_flush=1 --auto_readahead_size=1 --auto_refresh_iterator_with_snapshot=0 --avoid_flush_during_recovery=0 --avoid_flush_during_shutdown=0 --avoid_unnecessary_blocking_io=0 --backup_max_size=104857600 --backup_one_in=1000 --batch_protection_bytes_per_key=0 --bgerror_resume_retry_interval=1000000 --blob_cache_size=2097152 --blob_compaction_readahead_size=4194304 --blob_compression_type=snappy --blob_file_size=1073741824 --blob_file_starting_level=3 --blob_garbage_collection_age_cutoff=0.0 --blob_garbage_collection_force_threshold=0.5 --block_align=0 --block_protection_bytes_per_key=1 --block_size=16384 --bloom_before_level=1 --bloom_bits=27.321469575655275 --bottommost_compression_type=zstd --bottommost_file_compaction_delay=86400 --bytes_per_sync=0 --cache_index_and_filter_blocks=0 --cache_index_and_filter_blocks_with_high_priority=1 --cache_size=8388608 --cache_type=lru_cache --charge_compression_dictionary_building_buffer=1 --charge_file_metadata=1 --charge_filter_construction=0 --charge_table_reader=0 --check_multiget_consistency=0 --check_multiget_entity_consistency=1 --checkpoint_one_in=0 --checksum_type=kXXH3 --clear_column_family_one_in=0 --compact_files_one_in=1000000 --compact_range_one_in=1000 --compaction_pri=4 --compaction_readahead_size=1048576 --compaction_style=0 --compaction_ttl=0 --compress_format_version=1 --compressed_secondary_cache_ratio=0.0 --compressed_secondary_cache_size=0 --compression_checksum=1 --compression_manager=autoskip --compression_max_dict_buffer_bytes=8589934591 --compression_max_dict_bytes=16384 --compression_parallel_threads=1 --compression_type=lz4 --compression_use_zstd_dict_trainer=1 --compression_zstd_max_train_bytes=0 --continuous_verification_interval=0 --daily_offpeak_time_utc= --data_block_index_type=1 --db=/tmp/rocksdb_crashtest_blackboxs39kubu3 --db_write_buffer_size=0 --decouple_partitioned_filters=1 --default_temperature=kHot --default_write_temperature=kUnknown --delete_obsolete_files_period_micros=30000000 --delpercent=5 --delrangepercent=0 --destroy_db_initially=0 --detect_filter_construct_corruption=0 --disable_file_deletions_one_in=10000 --disable_manual_compaction_one_in=10000 --disable_wal=1 --dump_malloc_stats=0 --enable_blob_files=1 --enable_blob_garbage_collection=0 --enable_checksum_handoff=0 --enable_compaction_filter=0 --enable_compaction_on_deletion_trigger=1 --enable_custom_split_merge=1 --enable_do_not_compress_roles=0 --enable_index_compression=1 --enable_memtable_insert_with_hint_prefix_extractor=0 --enable_pipelined_write=0 --enable_sst_partitioner_factory=1 --enable_thread_tracking=0 --enable_write_thread_adaptive_yield=1 --error_recovery_with_no_fault_injection=1 --exclude_wal_from_write_fault_injection=0 --expected_values_dir=/tmp/rocksdb_crashtest_expected_rvq7p3ow --fifo_allow_compaction=0 --file_checksum_impl=xxh64 --file_temperature_age_thresholds= --fill_cache=1 --flush_one_in=1000000 --format_version=6 --get_all_column_family_metadata_one_in=1000000 --get_current_wal_file_one_in=0 --get_live_files_apis_one_in=1000000 --get_properties_of_all_tables_one_in=1000000 --get_property_one_in=1000000 --get_sorted_wal_files_one_in=0 --hard_pending_compaction_bytes_limit=274877906944 --high_pri_pool_ratio=0 --index_block_restart_interval=15 --index_shortening=2 --index_type=2 --ingest_external_file_one_in=0 --ingest_wbwi_one_in=500 --initial_auto_readahead_size=16384 --inplace_update_support=0 --iterpercent=10 --key_len_percent_dist=1,30,69 --key_may_exist_one_in=100 --last_level_temperature=kWarm --level_compaction_dynamic_level_bytes=0 --lock_wal_one_in=10000 --log_file_time_to_roll=0 --log_readahead_size=0 --long_running_snapshots=1 --low_pri_pool_ratio=0 --lowest_used_cache_tier=2 --manifest_preallocation_size=5120 --manual_wal_flush_one_in=0 --mark_for_compaction_one_file_in=10 --max_auto_readahead_size=0 --max_background_compactions=20 --max_bytes_for_level_base=10485760 --max_key=2500000 --max_key_len=3 --max_log_file_size=1048576 --max_manifest_file_size=32768 --max_sequential_skip_in_iterations=1 --max_total_wal_size=0 --max_write_batch_group_size_bytes=16 --max_write_buffer_number=3 --max_write_buffer_size_to_maintain=1048576 --memtable_avg_op_scan_flush_trigger=20 --memtable_insert_hint_per_batch=1 --memtable_max_range_deletions=0 --memtable_op_scan_flush_trigger=1000 --memtable_prefix_bloom_size_ratio=0.001 --memtable_protection_bytes_per_key=0 --memtable_whole_key_filtering=0 --memtablerep=skip_list --metadata_charge_policy=1 --metadata_read_fault_one_in=0 --metadata_write_fault_one_in=0 --min_blob_size=8 --min_write_buffer_number_to_merge=1 --mmap_read=0 --mock_direct_io=False --nooverwritepercent=1 --num_bottom_pri_threads=1 --num_file_reads_for_auto_readahead=0 --open_files=100 --open_metadata_read_fault_one_in=8 --open_metadata_write_fault_one_in=0 --open_read_fault_one_in=0 --open_write_fault_one_in=0 --ops_per_thread=100000000 --optimize_filters_for_hits=1 --optimize_filters_for_memory=1 --optimize_multiget_for_io=1 --paranoid_file_checks=0 --paranoid_memory_checks=0 --partition_filters=1 --partition_pinning=2 --pause_background_one_in=10000 --periodic_compaction_seconds=0 --prefix_size=1 --prefixpercent=5 --prepopulate_blob_cache=1 --prepopulate_block_cache=1 --preserve_internal_time_seconds=36000 --progress_reports=0 --promote_l0_one_in=0 --read_amp_bytes_per_bit=32 --read_fault_one_in=0 --readahead_size=524288 --readpercent=45 --recycle_log_file_num=0 --remote_compaction_worker_threads=0 --reopen=0 --report_bg_io_stats=1 --reset_stats_one_in=10000 --sample_for_compression=0 --secondary_cache_fault_one_in=32 --secondary_cache_uri=compressed_secondary_cache://capacity=8388608;enable_custom_split_merge=true --set_options_one_in=1000 --skip_stats_update_on_db_open=0 --snapshot_hold_ops=100000 --soft_pending_compaction_bytes_limit=1048576 --sqfc_name=foo --sqfc_version=0 --sst_file_manager_bytes_per_sec=104857600 --sst_file_manager_bytes_per_truncate=0 --statistics=1 --stats_dump_period_sec=0 --stats_history_buffer_size=0 --strict_bytes_per_sync=0 --subcompactions=3 --sync=0 --sync_fault_injection=1 --table_cache_numshardbits=-1 --target_file_size_base=524288 --target_file_size_multiplier=2 --test_batches_snapshots=1 --test_cf_consistency=1 --test_ingest_standalone_range_deletion_one_in=0 --top_level_index_pinning=0 --track_and_verify_wals=0 --uncache_aggressiveness=3225 --universal_max_read_amp=-1 --universal_reduce_file_locking=0 --unpartitioned_pinning=1 --use_adaptive_mutex=1 --use_adaptive_mutex_lru=1 --use_attribute_group=1 --use_blob_cache=0 --use_delta_encoding=1 --use_direct_io_for_flush_and_compaction=0 --use_direct_reads=1 --use_full_merge_v1=0 --use_get_entity=0 --use_merge=0 --use_multi_cf_iterator=0 --use_multi_get_entity=0 --use_multiget=1 --use_multiscan=0 --use_put_entity_one_in=10 --use_shared_block_and_blob_cache=0 --use_sqfc_for_range_queries=1 --use_timed_put_one_in=0 --use_write_buffer_manager=0 --user_timestamp_size=0 --value_size_mult=32 --verification_only=0 --verify_checksum=1 --verify_checksum_one_in=1000 --verify_compression=0 --verify_db_one_in=10000 --verify_file_checksums_one_in=1000000 --verify_iterator_with_expected_state_one_in=0 --verify_sst_unique_id_in_manifest=1 --wal_bytes_per_sync=0 --wal_compression=none --write_buffer_size=1048576 --write_dbid_to_manifest=0 --write_fault_one_in=0 --write_identity_file=1 --writepercent=35

KILLED 2102945

stdout:

Use random seed for iteration 4205502355970671733
Running db_stress with pid=2107447: ./db_stress --WAL_size_limit_MB=0 --WAL_ttl_seconds=0 --acquire_snapshot_one_in=10000 --adaptive_readahead=0 --adm_policy=3 --advise_random_on_open=0 --allow_data_in_errors=True --allow_fallocate=0 --allow_setting_blob_options_dynamically=1 --allow_unprepared_value=1 --async_io=1 --auto_readahead_size=0 --auto_refresh_iterator_with_snapshot=0 --avoid_flush_during_recovery=0 --avoid_flush_during_shutdown=0 --avoid_unnecessary_blocking_io=0 --backup_max_size=104857600 --backup_one_in=100000 --batch_protection_bytes_per_key=0 --bgerror_resume_retry_interval=100 --blob_cache_size=4194304 --blob_compaction_readahead_size=1048576 --blob_compression_type=snappy --blob_file_size=1048576 --blob_file_starting_level=2 --blob_garbage_collection_age_cutoff=1.0 --blob_garbage_collection_force_threshold=0.75 --block_align=0 --block_protection_bytes_per_key=8 --block_size=16384 --bloom_before_level=2147483647 --bloom_bits=0 --bottommost_compression_type=disable --bottommost_file_compaction_delay=600 --bytes_per_sync=262144 --cache_index_and_filter_blocks=1 --cache_index_and_filter_blocks_with_high_priority=0 --cache_size=33554432 --cache_type=auto_hyper_clock_cache --charge_compression_dictionary_building_buffer=1 --charge_file_metadata=0 --charge_filter_construction=1 --charge_table_reader=0 --check_multiget_consistency=1 --check_multiget_entity_consistency=0 --checkpoint_one_in=1000000 --checksum_type=kxxHash --clear_column_family_one_in=0 --compact_files_one_in=1000000 --compact_range_one_in=1000 --compaction_pri=4 --compaction_readahead_size=1048576 --compaction_style=0 --compaction_ttl=2 --compress_format_version=1 --compressed_secondary_cache_ratio=0.0 --compressed_secondary_cache_size=0 --compression_checksum=1 --compression_manager=randommixed --compression_max_dict_buffer_bytes=34359738367 --compression_max_dict_bytes=16384 --compression_parallel_threads=4 --compression_type=none --compression_use_zstd_dict_trainer=1 --compression_zstd_max_train_bytes=0 --continuous_verification_interval=0 --daily_offpeak_time_utc= --data_block_index_type=0 --db=/tmp/rocksdb_crashtest_blackboxs39kubu3 --db_write_buffer_size=0 --decouple_partitioned_filters=1 --default_temperature=kHot --default_write_temperature=kUnknown --delete_obsolete_files_period_micros=30000000 --delpercent=5 --delrangepercent=0 --destroy_db_initially=0 --detect_filter_construct_corruption=1 --disable_file_deletions_one_in=10000 --disable_manual_compaction_one_in=1000000 --disable_wal=0 --dump_malloc_stats=1 --enable_blob_files=1 --enable_blob_garbage_collection=1 --enable_checksum_handoff=1 --enable_compaction_filter=0 --enable_compaction_on_deletion_trigger=0 --enable_custom_split_merge=0 --enable_do_not_compress_roles=0 --enable_index_compression=1 --enable_memtable_insert_with_hint_prefix_extractor=0 --enable_pipelined_write=0 --enable_sst_partitioner_factory=1 --enable_thread_tracking=0 --enable_write_thread_adaptive_yield=1 --error_recovery_with_no_fault_injection=0 --exclude_wal_from_write_fault_injection=1 --expected_values_dir=/tmp/rocksdb_crashtest_expected_rvq7p3ow --fifo_allow_compaction=1 --file_checksum_impl=none --file_temperature_age_thresholds= --fill_cache=1 --flush_one_in=1000000 --format_version=5 --get_all_column_family_metadata_one_in=10000 --get_current_wal_file_one_in=0 --get_live_files_apis_one_in=10000 --get_properties_of_all_tables_one_in=100000 --get_property_one_in=100000 --get_sorted_wal_files_one_in=0 --hard_pending_compaction_bytes_limit=274877906944 --high_pri_pool_ratio=0.5 --index_block_restart_interval=11 --index_shortening=2 --index_type=2 --ingest_external_file_one_in=0 --ingest_wbwi_one_in=0 --initial_auto_readahead_size=0 --inplace_update_support=0 --iterpercent=10 --key_len_percent_dist=1,30,69 --key_may_exist_one_in=100000 --last_level_temperature=kCold --level_compaction_dynamic_level_bytes=1 --lock_wal_one_in=0 --log_file_time_to_roll=0 --log_readahead_size=16777216 --long_running_snapshots=0 --low_pri_pool_ratio=0.5 --lowest_used_cache_tier=1 --manifest_preallocation_size=5120 --manual_wal_flush_one_in=1000 --mark_for_compaction_one_file_in=0 --max_auto_readahead_size=524288 --max_background_compactions=2 --max_bytes_for_level_base=10485760 --max_key=2500000 --max_key_len=3 --max_log_file_size=1048576 --max_manifest_file_size=1073741824 --max_sequential_skip_in_iterations=2 --max_total_wal_size=0 --max_write_batch_group_size_bytes=16 --max_write_buffer_number=3 --max_write_buffer_size_to_maintain=2097152 --memtable_avg_op_scan_flush_trigger=20 --memtable_insert_hint_per_batch=1 --memtable_max_range_deletions=0 --memtable_op_scan_flush_trigger=10 --memtable_prefix_bloom_size_ratio=0.01 --memtable_protection_bytes_per_key=0 --memtable_whole_key_filtering=1 --memtablerep=skip_list --metadata_charge_policy=1 --metadata_read_fault_one_in=0 --metadata_write_fault_one_in=0 --min_blob_size=16 --min_write_buffer_number_to_merge=1 --mmap_read=1 --mock_direct_io=False --nooverwritepercent=1 --num_bottom_pri_threads=20 --num_file_reads_for_auto_readahead=1 --open_files=-1 --open_metadata_read_fault_one_in=8 --open_metadata_write_fault_one_in=0 --open_read_fault_one_in=0 --open_write_fault_one_in=0 --ops_per_thread=100000000 --optimize_filters_for_hits=0 --optimize_filters_for_memory=0 --optimize_multiget_for_io=0 --paranoid_file_checks=1 --paranoid_memory_checks=0 --partition_filters=1 --partition_pinning=2 --pause_background_one_in=10000 --periodic_compaction_seconds=1000 --prefix_size=7 --prefixpercent=5 --prepopulate_blob_cache=1 --prepopulate_block_cache=0 --preserve_internal_time_seconds=36000 --progress_reports=0 --promote_l0_one_in=0 --read_amp_bytes_per_bit=0 --read_fault_one_in=0 --readahead_size=16384 --readpercent=45 --recycle_log_file_num=0 --remote_compaction_worker_threads=0 --reopen=0 --report_bg_io_stats=0 --reset_stats_one_in=1000000 --sample_for_compression=0 --secondary_cache_fault_one_in=0 --secondary_cache_uri=compressed_secondary_cache://capacity=8388608;enable_custom_split_merge=true --set_options_one_in=1000 --skip_stats_update_on_db_open=1 --snapshot_hold_ops=100000 --soft_pending_compaction_bytes_limit=68719476736 --sqfc_name=foo --sqfc_version=0 --sst_file_manager_bytes_per_sec=104857600 --sst_file_manager_bytes_per_truncate=1048576 --statistics=1 --stats_dump_period_sec=600 --stats_history_buffer_size=1048576 --strict_bytes_per_sync=0 --subcompactions=1 --sync=0 --sync_fault_injection=1 --table_cache_numshardbits=6 --target_file_size_base=524288 --target_file_size_multiplier=2 --test_batches_snapshots=1 --test_cf_consistency=1 --test_ingest_standalone_range_deletion_one_in=0 --top_level_index_pinning=1 --track_and_verify_wals=0 --uncache_aggressiveness=136 --universal_max_read_amp=-1 --universal_reduce_file_locking=1 --unpartitioned_pinning=2 --use_adaptive_mutex=0 --use_adaptive_mutex_lru=1 --use_attribute_group=0 --use_blob_cache=0 --use_delta_encoding=1 --use_direct_io_for_flush_and_compaction=0 --use_direct_reads=0 --use_full_merge_v1=0 --use_get_entity=0 --use_merge=0 --use_multi_cf_iterator=1 --use_multi_get_entity=0 --use_multiget=0 --use_multiscan=0 --use_put_entity_one_in=10 --use_shared_block_and_blob_cache=1 --use_sqfc_for_range_queries=1 --use_timed_put_one_in=5 --use_write_buffer_manager=0 --user_timestamp_size=0 --value_size_mult=32 --verification_only=0 --verify_checksum=1 --verify_checksum_one_in=1000000 --verify_compression=0 --verify_db_one_in=100000 --verify_file_checksums_one_in=0 --verify_iterator_with_expected_state_one_in=0 --verify_sst_unique_id_in_manifest=1 --wal_bytes_per_sync=0 --wal_compression=zstd --write_buffer_size=1048576 --write_dbid_to_manifest=0 --write_fault_one_in=0 --write_identity_file=1 --writepercent=35
```

Override the per iteration random seed directly 4205502355970671733, to jump to the second iteration parameter set. Only the file path name is different. The rest of the parameters are all same
```
[xbw@devvm16622.vll0 ~/workspace/ws2/rocksdb (plm_stress_fix)]$ /usr/local/bin/python3 -u tools/db_crashtest.py --stress_cmd=./db_stress --cleanup_cmd='' --cf_consistency blackbox --duration=96000 --max_key=2500000 --interval=10 --initial_random_seed_override=10 --per_iteration_random_seed_override=4205502355970671733
Start with random seed 10
Running blackbox-crash-test with
interval_between_crash=10
total-duration=96000

Use random seed for iteration 4205502355970671733
Running db_stress with pid=2110794: ./db_stress --WAL_size_limit_MB=0 --WAL_ttl_seconds=0 --acquire_snapshot_one_in=10000 --adaptive_readahead=0 --adm_policy=3 --advise_random_on_open=0 --allow_data_in_errors=True --allow_fallocate=0 --allow_setting_blob_options_dynamically=1 --allow_unprepared_value=1 --async_io=1 --auto_readahead_size=0 --auto_refresh_iterator_with_snapshot=0 --avoid_flush_during_recovery=0 --avoid_flush_during_shutdown=0 --avoid_unnecessary_blocking_io=0 --backup_max_size=104857600 --backup_one_in=100000 --batch_protection_bytes_per_key=0 --bgerror_resume_retry_interval=100 --blob_cache_size=4194304 --blob_compaction_readahead_size=1048576 --blob_compression_type=snappy --blob_file_size=1048576 --blob_file_starting_level=2 --blob_garbage_collection_age_cutoff=1.0 --blob_garbage_collection_force_threshold=0.75 --block_align=0 --block_protection_bytes_per_key=8 --block_size=16384 --bloom_before_level=2147483647 --bloom_bits=0 --bottommost_compression_type=disable --bottommost_file_compaction_delay=600 --bytes_per_sync=262144 --cache_index_and_filter_blocks=1 --cache_index_and_filter_blocks_with_high_priority=0 --cache_size=33554432 --cache_type=auto_hyper_clock_cache --charge_compression_dictionary_building_buffer=1 --charge_file_metadata=0 --charge_filter_construction=1 --charge_table_reader=0 --check_multiget_consistency=1 --check_multiget_entity_consistency=0 --checkpoint_one_in=1000000 --checksum_type=kxxHash --clear_column_family_one_in=0 --compact_files_one_in=1000000 --compact_range_one_in=1000 --compaction_pri=4 --compaction_readahead_size=1048576 --compaction_style=0 --compaction_ttl=2 --compress_format_version=1 --compressed_secondary_cache_ratio=0.0 --compressed_secondary_cache_size=0 --compression_checksum=1 --compression_manager=randommixed --compression_max_dict_buffer_bytes=34359738367 --compression_max_dict_bytes=16384 --compression_parallel_threads=4 --compression_type=none --compression_use_zstd_dict_trainer=1 --compression_zstd_max_train_bytes=0 --continuous_verification_interval=0 --daily_offpeak_time_utc= --data_block_index_type=0 --db=/tmp/rocksdb_crashtest_blackboxo1xvo_2n --db_write_buffer_size=0 --decouple_partitioned_filters=1 --default_temperature=kHot --default_write_temperature=kUnknown --delete_obsolete_files_period_micros=30000000 --delpercent=5 --delrangepercent=0 --destroy_db_initially=0 --detect_filter_construct_corruption=1 --disable_file_deletions_one_in=10000 --disable_manual_compaction_one_in=1000000 --disable_wal=0 --dump_malloc_stats=1 --enable_blob_files=1 --enable_blob_garbage_collection=1 --enable_checksum_handoff=1 --enable_compaction_filter=0 --enable_compaction_on_deletion_trigger=0 --enable_custom_split_merge=0 --enable_do_not_compress_roles=0 --enable_index_compression=1 --enable_memtable_insert_with_hint_prefix_extractor=0 --enable_pipelined_write=0 --enable_sst_partitioner_factory=1 --enable_thread_tracking=0 --enable_write_thread_adaptive_yield=1 --error_recovery_with_no_fault_injection=0 --exclude_wal_from_write_fault_injection=1 --expected_values_dir=/tmp/rocksdb_crashtest_expected_s0kmvlrj --fifo_allow_compaction=1 --file_checksum_impl=none --file_temperature_age_thresholds= --fill_cache=1 --flush_one_in=1000000 --format_version=5 --get_all_column_family_metadata_one_in=10000 --get_current_wal_file_one_in=0 --get_live_files_apis_one_in=10000 --get_properties_of_all_tables_one_in=100000 --get_property_one_in=100000 --get_sorted_wal_files_one_in=0 --hard_pending_compaction_bytes_limit=274877906944 --high_pri_pool_ratio=0.5 --index_block_restart_interval=11 --index_shortening=2 --index_type=2 --ingest_external_file_one_in=0 --ingest_wbwi_one_in=0 --initial_auto_readahead_size=0 --inplace_update_support=0 --iterpercent=10 --key_len_percent_dist=1,30,69 --key_may_exist_one_in=100000 --last_level_temperature=kCold --level_compaction_dynamic_level_bytes=1 --lock_wal_one_in=0 --log_file_time_to_roll=0 --log_readahead_size=16777216 --long_running_snapshots=0 --low_pri_pool_ratio=0.5 --lowest_used_cache_tier=1 --manifest_preallocation_size=5120 --manual_wal_flush_one_in=1000 --mark_for_compaction_one_file_in=0 --max_auto_readahead_size=524288 --max_background_compactions=2 --max_bytes_for_level_base=10485760 --max_key=2500000 --max_key_len=3 --max_log_file_size=1048576 --max_manifest_file_size=1073741824 --max_sequential_skip_in_iterations=2 --max_total_wal_size=0 --max_write_batch_group_size_bytes=16 --max_write_buffer_number=3 --max_write_buffer_size_to_maintain=2097152 --memtable_avg_op_scan_flush_trigger=20 --memtable_insert_hint_per_batch=1 --memtable_max_range_deletions=0 --memtable_op_scan_flush_trigger=10 --memtable_prefix_bloom_size_ratio=0.01 --memtable_protection_bytes_per_key=0 --memtable_whole_key_filtering=1 --memtablerep=skip_list --metadata_charge_policy=1 --metadata_read_fault_one_in=0 --metadata_write_fault_one_in=0 --min_blob_size=16 --min_write_buffer_number_to_merge=1 --mmap_read=1 --mock_direct_io=False --nooverwritepercent=1 --num_bottom_pri_threads=20 --num_file_reads_for_auto_readahead=1 --open_files=-1 --open_metadata_read_fault_one_in=8 --open_metadata_write_fault_one_in=0 --open_read_fault_one_in=0 --open_write_fault_one_in=0 --ops_per_thread=100000000 --optimize_filters_for_hits=0 --optimize_filters_for_memory=0 --optimize_multiget_for_io=0 --paranoid_file_checks=1 --paranoid_memory_checks=0 --partition_filters=1 --partition_pinning=2 --pause_background_one_in=10000 --periodic_compaction_seconds=1000 --prefix_size=7 --prefixpercent=5 --prepopulate_blob_cache=1 --prepopulate_block_cache=0 --preserve_internal_time_seconds=36000 --progress_reports=0 --promote_l0_one_in=0 --read_amp_bytes_per_bit=0 --read_fault_one_in=0 --readahead_size=16384 --readpercent=45 --recycle_log_file_num=0 --remote_compaction_worker_threads=0 --reopen=0 --report_bg_io_stats=0 --reset_stats_one_in=1000000 --sample_for_compression=0 --secondary_cache_fault_one_in=0 --secondary_cache_uri=compressed_secondary_cache://capacity=8388608;enable_custom_split_merge=true --set_options_one_in=1000 --skip_stats_update_on_db_open=1 --snapshot_hold_ops=100000 --soft_pending_compaction_bytes_limit=68719476736 --sqfc_name=foo --sqfc_version=0 --sst_file_manager_bytes_per_sec=104857600 --sst_file_manager_bytes_per_truncate=1048576 --statistics=1 --stats_dump_period_sec=600 --stats_history_buffer_size=1048576 --strict_bytes_per_sync=0 --subcompactions=1 --sync=0 --sync_fault_injection=1 --table_cache_numshardbits=6 --target_file_size_base=524288 --target_file_size_multiplier=2 --test_batches_snapshots=1 --test_cf_consistency=1 --test_ingest_standalone_range_deletion_one_in=0 --top_level_index_pinning=1 --track_and_verify_wals=0 --uncache_aggressiveness=136 --universal_max_read_amp=-1 --universal_reduce_file_locking=1 --unpartitioned_pinning=2 --use_adaptive_mutex=0 --use_adaptive_mutex_lru=1 --use_attribute_group=0 --use_blob_cache=0 --use_delta_encoding=1 --use_direct_io_for_flush_and_compaction=0 --use_direct_reads=0 --use_full_merge_v1=0 --use_get_entity=0 --use_merge=0 --use_multi_cf_iterator=1 --use_multi_get_entity=0 --use_multiget=0 --use_multiscan=0 --use_put_entity_one_in=10 --use_shared_block_and_blob_cache=1 --use_sqfc_for_range_queries=1 --use_timed_put_one_in=5 --use_write_buffer_manager=0 --user_timestamp_size=0 --value_size_mult=32 --verification_only=0 --verify_checksum=1 --verify_checksum_one_in=1000000 --verify_compression=0 --verify_db_one_in=100000 --verify_file_checksums_one_in=0 --verify_iterator_with_expected_state_one_in=0 --verify_sst_unique_id_in_manifest=1 --wal_bytes_per_sync=0 --wal_compression=zstd --write_buffer_size=1048576 --write_dbid_to_manifest=0 --write_fault_one_in=0 --write_identity_file=1 --writepercent=35
```

Reviewed By: jaykorean

Differential Revision: D82399857

Pulled By: xingbowang

fbshipit-source-id: 38f3bfefdd0adc7f527fd68982e2edc22b2304f4
---
 tools/db_crashtest.py | 45 ++++++++++++++++++++++++++++++++++---------
 1 file changed, 36 insertions(+), 9 deletions(-)

diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index cf71c9dcdf94..3f1bfc1c1d16 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -11,22 +11,48 @@
 import tempfile
 import time
 
+per_iteration_random_seed_override = 0
+
+def get_random_seed(override):
+    if override == 0:
+        return random.randint(1, 2**64)
+    else:
+        return override
 
 def setup_random_seed_before_main():
     parser = argparse.ArgumentParser()
     parser.add_argument(
-        "--random_seed",
+        "--initial_random_seed_override",
         default=0,
         type=int,
-        help="Random seed used for reproduce the same test parameter set",
+        help="Random seed used for initialize the test parameters at the beginning of stress test run",
     )
-    args, _ = parser.parse_known_args()
-    random_seed = (
-        random.randint(1, 2**64) if args.random_seed == 0 else args.random_seed
+    # sometimes the failure appeared after a few iteration, to reproduce the error, we have to wait for the test to run
+    # multiple iterations to reach the iteration that fails the test. By overriding the seed used within each iteration,
+    # we could skip all the previous iterations.
+    parser.add_argument(
+        "--per_iteration_random_seed_override",
+        default=0,
+        type=int,
+        help="Random seed used for initialize the test parameters in each iteration of the stress test run",
     )
-    print(f"Start with random seed {random_seed}")
-    random.seed(random_seed)
 
+    args, remain_args = parser.parse_known_args()
+    init_random_seed = get_random_seed(args.initial_random_seed_override)
+    global per_iteration_random_seed_override
+    per_iteration_random_seed_override = args.per_iteration_random_seed_override
+
+    print(f"Start with random seed {init_random_seed}")
+    random.seed(init_random_seed)
+
+    # reset the sys.argv with the remaining args, so that the rest of the argument parser would not see these 2 args
+    sys.argv = remain_args
+
+def apply_random_seed_per_iteration():
+    global per_iteration_random_seed_override
+    per_iteration_random_seed = get_random_seed(per_iteration_random_seed_override)
+    print(f"Use random seed for iteration {per_iteration_random_seed}")
+    random.seed(per_iteration_random_seed)
 
 # Random seed has to be setup before the rest of the script, so that the random
 # value selected in the global variable uses the random seed specified
@@ -367,7 +393,7 @@ def setup_random_seed_before_main():
     "memtable_veirfy_per_key_checksum_on_seek": lambda: random.choice([0] * 7 + [1]),
     "allow_unprepared_value": lambda: random.choice([0, 1]),
     # TODO(hx235): enable `track_and_verify_wals` after stabalizing the stress test
-    "track_and_verify_wals": lambda: random.choice([0]),    
+    "track_and_verify_wals": lambda: random.choice([0]),
     "remote_compaction_worker_threads": lambda: random.choice([0, 8]),
     # TODO(jaykorean): Change to lambda: random.choice([0, 1]) after addressing all remote compaction failures
     "remote_compaction_failure_fall_back_to_local": 1,
@@ -1230,7 +1256,6 @@ def gen_cmd(params, unknown_params):
             not in {
                 "test_type",
                 "simple",
-                "random_seed",
                 "duration",
                 "interval",
                 "random_kill_odd",
@@ -1314,6 +1339,7 @@ def blackbox_crash_main(args, unknown_args):
     )
 
     while time.time() < exit_time:
+        apply_random_seed_per_iteration()
         cmd = gen_cmd(
             dict(list(cmd_params.items()) + list({"db": dbname}.items())), unknown_args
         )
@@ -1376,6 +1402,7 @@ def whitebox_crash_main(args, unknown_args):
     succeeded = True
     hit_timeout = False
     while time.time() < exit_time:
+        apply_random_seed_per_iteration()
         if check_mode == 0:
             additional_opts = {
                 # use large ops per thread since we will kill it anyway

From 841e3642380e07938e0ed7d4fb5b1e17b7cc9ab2 Mon Sep 17 00:00:00 2001
From: Changyu Bi <changyubi@meta.com>
Date: Sat, 20 Sep 2025 00:08:12 -0700
Subject: [PATCH 297/500] Fix flaky unit test
 `IngestDBGeneratedFileTest2.NonZeroSeqno` (#13979)

Summary:
the test did not consider the ingestion_option settings that can result in different error message. This PR fixes the relevant check and ensure we have enough randomness in this test.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13979

Test Plan: `gtest-parallel --repeat=20 --workers=20 ./external_sst_file_test --gtest_filter="*VaryingOptions/IngestDBGeneratedFileTest2.NonZeroSeqno/*"`

Reviewed By: hx235

Differential Revision: D82873439

Pulled By: cbi42

fbshipit-source-id: b0d74bf26a502ca3db59b4a0ea9717bf7d027400
---
 db/external_sst_file_test.cc | 27 ++++++++++++++++++++++-----
 1 file changed, 22 insertions(+), 5 deletions(-)

diff --git a/db/external_sst_file_test.cc b/db/external_sst_file_test.cc
index 1a8a5f717651..ff0d15faa73a 100644
--- a/db/external_sst_file_test.cc
+++ b/db/external_sst_file_test.cc
@@ -4094,6 +4094,17 @@ TEST_P(IngestDBGeneratedFileTest2, NonZeroSeqno) {
   ingest_opts.fail_if_not_bottommost_level = std::get<3>(GetParam());
   ingest_opts.link_files = std::get<4>(GetParam());
   Random* rnd = Random::GetTLSInstance();
+  rnd->Reset(std::random_device{}());
+  std::ostringstream ingest_opts_trace;
+  ingest_opts_trace << "ingest_opts params: " << "snapshot_consistency="
+                    << ingest_opts.snapshot_consistency << ", "
+                    << "allow_global_seqno=" << ingest_opts.allow_global_seqno
+                    << ", " << "allow_blocking_flush="
+                    << ingest_opts.allow_blocking_flush << ", "
+                    << "fail_if_not_bottommost_level="
+                    << ingest_opts.fail_if_not_bottommost_level << ", "
+                    << "link_files=" << ingest_opts.link_files;
+  SCOPED_TRACE(ingest_opts_trace.str());
 
   do {
     SCOPED_TRACE("option_config_ = " + std::to_string(option_config_));
@@ -4263,11 +4274,17 @@ TEST_P(IngestDBGeneratedFileTest2, NonZeroSeqno) {
       s = db_->IngestExternalFile(overlap_cf, sst_file_paths, ingest_opts);
 
       ASSERT_NOK(s);
-      ASSERT_TRUE(s.ToString().find("An ingested file overlaps with existing "
-                                    "data in the DB and has been "
-                                    "assigned a non-zero sequence number") !=
-                  std::string::npos)
-          << s.ToString();
+      if (ingest_opts.fail_if_not_bottommost_level) {
+        ASSERT_TRUE(s.ToString().find("Files cannot be ingested to Lmax") !=
+                    std::string::npos)
+            << s.ToString();
+      } else {
+        ASSERT_TRUE(s.ToString().find("An ingested file overlaps with existing "
+                                      "data in the DB and has been "
+                                      "assigned a non-zero sequence number") !=
+                    std::string::npos)
+            << s.ToString();
+      }
     }
 
     // Cleanup

From 3cdd3281baa824882aa4b24ac5ed7149d37cfb0c Mon Sep 17 00:00:00 2001
From: Changyu Bi <changyubi@meta.com>
Date: Mon, 22 Sep 2025 08:51:17 -0700
Subject: [PATCH 298/500] Update main for 10.8 (#13980)

Summary:
- updated release note
- updated version to 10.8 in version.h
- added 10.7 to check_format_compatible.sh
- did not updated folly commit hash due to some build failure.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13980

Reviewed By: xingbowang

Differential Revision: D82882035

Pulled By: cbi42

fbshipit-source-id: b5e0e78570fdd492d592ee77bd3901e4b39c25fb
---
 HISTORY.md                                    | 28 +++++++++++++++++++
 include/rocksdb/version.h                     |  2 +-
 tools/check_format_compatible.sh              |  2 +-
 .../behavior_changes/autohcc.md               |  1 -
 .../bug_fixes/compaction_cpu.md               |  1 -
 ...fix_multi_level_fifo_double_picking_bug.md |  1 -
 .../bug_fixes/udi_empty_scan_range_fix.md     |  1 -
 .../new_features/fail_if_no_udi_on_open.md    |  1 -
 .../improve_data_integrity_check_on_seek.md   |  1 -
 .../new_features/multi-scan-async-io.md       |  1 -
 .../new_features/multi-scan-max-prefetch.md   |  1 -
 .../new_features/sst_dump_recompress.md       |  1 -
 ...ternal_sst_ingestion_seqno_optimization.md |  2 --
 .../improve_point_lock_manager_performance.md |  2 --
 .../parallel_compression.md                   |  1 -
 ...ltiScanArgs_contructor_parameter_change.md |  1 -
 .../public_api_changes/autohcc.md             |  1 -
 .../public_api_changes/cplusplus20.md         |  1 -
 18 files changed, 30 insertions(+), 19 deletions(-)
 delete mode 100644 unreleased_history/behavior_changes/autohcc.md
 delete mode 100644 unreleased_history/bug_fixes/compaction_cpu.md
 delete mode 100644 unreleased_history/bug_fixes/fix_multi_level_fifo_double_picking_bug.md
 delete mode 100644 unreleased_history/bug_fixes/udi_empty_scan_range_fix.md
 delete mode 100644 unreleased_history/new_features/fail_if_no_udi_on_open.md
 delete mode 100644 unreleased_history/new_features/improve_data_integrity_check_on_seek.md
 delete mode 100644 unreleased_history/new_features/multi-scan-async-io.md
 delete mode 100644 unreleased_history/new_features/multi-scan-max-prefetch.md
 delete mode 100644 unreleased_history/new_features/sst_dump_recompress.md
 delete mode 100644 unreleased_history/performance_improvements/external_sst_ingestion_seqno_optimization.md
 delete mode 100644 unreleased_history/performance_improvements/improve_point_lock_manager_performance.md
 delete mode 100644 unreleased_history/performance_improvements/parallel_compression.md
 delete mode 100644 unreleased_history/public_api_changes/MultiScanArgs_contructor_parameter_change.md
 delete mode 100644 unreleased_history/public_api_changes/autohcc.md
 delete mode 100644 unreleased_history/public_api_changes/cplusplus20.md

diff --git a/HISTORY.md b/HISTORY.md
index 9f37452ccb5d..b9ba1074c3d9 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -1,6 +1,34 @@
 # Rocksdb Change Log
 > NOTE: Entries for next release do not go here. Follow instructions in `unreleased_history/README.txt`
 
+## 10.7.0 (09/19/2025)
+### New Features
+* Add the fail_if_no_udi_on_open flag in BlockBasedTableOption to control whether a missing user defined index block in a SST is a hard error or not.
+* A new flag memtable_veirfy_per_key_checksum_on_seek is added to AdvancedColumnFamilyOptions. When it is enabled, it will validate key checksum along the binary search path on skiplist based memtable during seek operation.
+* Introduce option MultiScanArgs::use_async_io to enable asynchronous I/O during MultiScan, instead of waiting for I/O to be done in Prepare().
+* Add new option `MultiScanArgs::max_prefetch_size` that limits the memory usage of per file pinning of prefetched blocks.
+* Improved `sst_dump` by allowing standalone file and directory arguments without `--file=`. Also added new options and better output for `sst_dump --command=recompress`. See `sst_dump --help`
+
+### Public API Changes
+* HyperClockCache with no `estimated_entry_charge` is now production-ready and is the preferred block cache implementation vs. LRUCache. Please consider updating your code to minimize the risk of hitting performance bottlenecks or anomalies from LRUCache. See cache.h for more detail.
+* RocksDB now requires a C++20 compatible compiler (GCC >= 11, Clang >= 10, Visual Studio >= 2019), including for any code using RocksDB headers.
+* MultiScanArgs used to have a default constructor with default parameter of BytewiseComparator. Now it always requires Comparator in its constructor.
+
+### Behavior Changes
+* The default provided block cache implementation is now HyperClockCache instead of LRUCache, when `block_cache` is nullptr (default) and `no_block_cache==false` (default). We recommend explicitly creating a HyperClockCache block cache based on memory budget and sharing it across all column families and even DB instances. This change could expose previously hidden memory or resource leaks.
+
+### Bug Fixes
+* Reported numbers for compaction and flush CPU usage now include time spent by parallel compression worker threads. This now means compaction/flush CPU usage could exceed the wall clock time.
+* Fix a race condition in FIFO size-based compaction where concurrent threads could select the same non-L0 file, causing assertion failures in debug builds or "Cannot delete table file from LSM tree" errors in release builds.
+* Fix a bug in RocksDB MultiScan with UDI when one of the scan ranges is determined to be empty by the UDI, which causes incorrect results.
+
+### Performance Improvements
+* Add a new table property "rocksdb.key.smallest.seqno" which records the smallest sequence number of all keys in file. It makes ingesting DB generated files faster by
+avoiding scanning the whole file to find the smallest sequence number.
+* Add a new experimental PerKeyPointLockManager to improve efficiency under high lock contention. PointLockManager was not efficient when there is high write contention on same key, as it uses a single conditional variable per lock stripe. PerKeyPointLockManager uses per thread conditional variable supporting fifo order. Although this is an experimental feature. By default, it is disabled. A new boolean flag TransactionDBOptions::use_per_key_point_lock_mgr is added to optionally enable it. Search the flag in code for more info.
+Together, a new configuration TransactionOptions::deadlock_timeout_us is added, which allows the transaction to wait for a short period before perform deadlock detection. When the workload has low lock contention, the deadlock_timeout_us can be configured to be slightly higher than average transaction execution time, so that transaction would likely be able to take the lock before deadlock detection is performed when it is waiting for a lock. This allows transaction to reduce CPU cost on performing deadlock detection, which could be expensive in CPU time. When the workload has high lock contention, the deadlock_timeout_us can be configured to 0, so that transaction would perform deadlock detection immediately. By default the value is 0 to keep the behavior same as before.
+* Majorly improved CPU efficiency and scalability of parallel compression (`CompressionOptions::parallel_threads` > 1), though this efficiency improvement makes parallel compression currently incompatible with UserDefinedIndex and with old setting of `decouple_partitioned_filters=false`. Parallel compression is now considered a production-ready feature. Maximum performance is available with `-DROCKSDB_USE_STD_SEMAPHORES` at compile time, but this is not currently recommended because of reported bugs in implementations of `std::counting_semaphore`/`binary_semaphore`.
+
 ## 10.6.0 (08/22/2025)
 ### New Features
 * Introduce column family option `cf_allow_ingest_behind`. This option aims to replace `DBOptions::allow_ingest_behind` to enable ingest behind at the per-CF level. `DBOptions::allow_ingest_behind` is deprecated.
diff --git a/include/rocksdb/version.h b/include/rocksdb/version.h
index 36e64444736a..1761eff70e73 100644
--- a/include/rocksdb/version.h
+++ b/include/rocksdb/version.h
@@ -12,7 +12,7 @@
 // NOTE: in 'main' development branch, this should be the *next*
 // minor or major version number planned for release.
 #define ROCKSDB_MAJOR 10
-#define ROCKSDB_MINOR 7
+#define ROCKSDB_MINOR 8
 #define ROCKSDB_PATCH 0
 
 // Do not use these. We made the mistake of declaring macros starting with
diff --git a/tools/check_format_compatible.sh b/tools/check_format_compatible.sh
index bfd3be3ae716..41c768fff442 100755
--- a/tools/check_format_compatible.sh
+++ b/tools/check_format_compatible.sh
@@ -137,7 +137,7 @@ EOF
 
 # To check for DB forward compatibility with loading options (old version
 # reading data from new), as well as backward compatibility
-declare -a db_forward_with_options_refs=("8.6.fb" "8.7.fb" "8.8.fb" "8.9.fb" "8.10.fb" "8.11.fb" "9.0.fb" "9.1.fb" "9.2.fb" "9.3.fb" "9.4.fb" "9.5.fb" "9.6.fb" "9.7.fb" "9.8.fb" "9.9.fb" "9.10.fb" "9.11.fb" "10.0.fb" "10.1.fb" "10.2.fb" "10.3.fb" "10.4.fb" "10.5.fb" "10.6.fb")
+declare -a db_forward_with_options_refs=("8.6.fb" "8.7.fb" "8.8.fb" "8.9.fb" "8.10.fb" "8.11.fb" "9.0.fb" "9.1.fb" "9.2.fb" "9.3.fb" "9.4.fb" "9.5.fb" "9.6.fb" "9.7.fb" "9.8.fb" "9.9.fb" "9.10.fb" "9.11.fb" "10.0.fb" "10.1.fb" "10.2.fb" "10.3.fb" "10.4.fb" "10.5.fb" "10.6.fb" "10.7.fb")
 # To check for DB forward compatibility without loading options (in addition
 # to the "with loading options" set), as well as backward compatibility
 declare -a db_forward_no_options_refs=() # N/A at the moment
diff --git a/unreleased_history/behavior_changes/autohcc.md b/unreleased_history/behavior_changes/autohcc.md
deleted file mode 100644
index d43f31117f9d..000000000000
--- a/unreleased_history/behavior_changes/autohcc.md
+++ /dev/null
@@ -1 +0,0 @@
-* The default provided block cache implementation is now HyperClockCache instead of LRUCache, when `block_cache` is nullptr (default) and `no_block_cache==false` (default). We recommend explicitly creating a HyperClockCache block cache based on memory budget and sharing it across all column families and even DB instances. This change could expose previously hidden memory or resource leaks.
diff --git a/unreleased_history/bug_fixes/compaction_cpu.md b/unreleased_history/bug_fixes/compaction_cpu.md
deleted file mode 100644
index 3d25b488feb8..000000000000
--- a/unreleased_history/bug_fixes/compaction_cpu.md
+++ /dev/null
@@ -1 +0,0 @@
-* Reported numbers for compaction and flush CPU usage now include time spent by parallel compression worker threads. This now means compaction/flush CPU usage could exceed the wall clock time.
diff --git a/unreleased_history/bug_fixes/fix_multi_level_fifo_double_picking_bug.md b/unreleased_history/bug_fixes/fix_multi_level_fifo_double_picking_bug.md
deleted file mode 100644
index e6d88a67fc35..000000000000
--- a/unreleased_history/bug_fixes/fix_multi_level_fifo_double_picking_bug.md
+++ /dev/null
@@ -1 +0,0 @@
-Fix a race condition in FIFO size-based compaction where concurrent threads could select the same non-L0 file, causing assertion failures in debug builds or "Cannot delete table file from LSM tree" errors in release builds.
diff --git a/unreleased_history/bug_fixes/udi_empty_scan_range_fix.md b/unreleased_history/bug_fixes/udi_empty_scan_range_fix.md
deleted file mode 100644
index 939612a035e6..000000000000
--- a/unreleased_history/bug_fixes/udi_empty_scan_range_fix.md
+++ /dev/null
@@ -1 +0,0 @@
-Fix a bug in RocksDB MultiScan with UDI when one of the scan ranges is determined to be empty by the UDI, which causes incorrect results.
diff --git a/unreleased_history/new_features/fail_if_no_udi_on_open.md b/unreleased_history/new_features/fail_if_no_udi_on_open.md
deleted file mode 100644
index d250fd77e147..000000000000
--- a/unreleased_history/new_features/fail_if_no_udi_on_open.md
+++ /dev/null
@@ -1 +0,0 @@
-Add the fail_if_no_udi_on_open flag in BlockBasedTableOption to control whether a missing user defined index block in a SST is a hard error or not.
diff --git a/unreleased_history/new_features/improve_data_integrity_check_on_seek.md b/unreleased_history/new_features/improve_data_integrity_check_on_seek.md
deleted file mode 100644
index 7b17c5dad1ad..000000000000
--- a/unreleased_history/new_features/improve_data_integrity_check_on_seek.md
+++ /dev/null
@@ -1 +0,0 @@
-A new flag memtable_veirfy_per_key_checksum_on_seek is added to AdvancedColumnFamilyOptions. When it is enabled, it will validate key checksum along the binary search path on skiplist based memtable during seek operation.
diff --git a/unreleased_history/new_features/multi-scan-async-io.md b/unreleased_history/new_features/multi-scan-async-io.md
deleted file mode 100644
index b8be3ce39bfc..000000000000
--- a/unreleased_history/new_features/multi-scan-async-io.md
+++ /dev/null
@@ -1 +0,0 @@
-* Introduce option MultiScanArgs::use_async_io to enable asynchronous I/O during MultiScan, instead of waiting for I/O to be done in Prepare().
diff --git a/unreleased_history/new_features/multi-scan-max-prefetch.md b/unreleased_history/new_features/multi-scan-max-prefetch.md
deleted file mode 100644
index 4725de1e52b0..000000000000
--- a/unreleased_history/new_features/multi-scan-max-prefetch.md
+++ /dev/null
@@ -1 +0,0 @@
-* Add new option `MultiScanArgs::max_prefetch_size` that limits the memory usage of per file pinning of prefetched blocks.
diff --git a/unreleased_history/new_features/sst_dump_recompress.md b/unreleased_history/new_features/sst_dump_recompress.md
deleted file mode 100644
index de8a177e12bf..000000000000
--- a/unreleased_history/new_features/sst_dump_recompress.md
+++ /dev/null
@@ -1 +0,0 @@
-* Improved `sst_dump` by allowing standalone file and directory arguments without `--file=`. Also added new options and better output for `sst_dump --command=recompress`. See `sst_dump --help`
diff --git a/unreleased_history/performance_improvements/external_sst_ingestion_seqno_optimization.md b/unreleased_history/performance_improvements/external_sst_ingestion_seqno_optimization.md
deleted file mode 100644
index 53b073a35ee3..000000000000
--- a/unreleased_history/performance_improvements/external_sst_ingestion_seqno_optimization.md
+++ /dev/null
@@ -1,2 +0,0 @@
-* Add a new table property "rocksdb.key.smallest.seqno" which records the smallest sequence number of all keys in file. It makes ingesting DB generated files faster by
-avoiding scanning the whole file to find the smallest sequence number.
diff --git a/unreleased_history/performance_improvements/improve_point_lock_manager_performance.md b/unreleased_history/performance_improvements/improve_point_lock_manager_performance.md
deleted file mode 100644
index 7713818f38cd..000000000000
--- a/unreleased_history/performance_improvements/improve_point_lock_manager_performance.md
+++ /dev/null
@@ -1,2 +0,0 @@
-Add a new experimental PerKeyPointLockManager to improve efficiency under high lock contention. PointLockManager was not efficient when there is high write contention on same key, as it uses a single conditional variable per lock stripe. PerKeyPointLockManager uses per thread conditional variable supporting fifo order. Although this is an experimental feature. By default, it is disabled. A new boolean flag TransactionDBOptions::use_per_key_point_lock_mgr is added to optionally enable it. Search the flag in code for more info.
-Together, a new configuration TransactionOptions::deadlock_timeout_us is added, which allows the transaction to wait for a short period before perform deadlock detection. When the workload has low lock contention, the deadlock_timeout_us can be configured to be slightly higher than average transaction execution time, so that transaction would likely be able to take the lock before deadlock detection is performed when it is waiting for a lock. This allows transaction to reduce CPU cost on performing deadlock detection, which could be expensive in CPU time. When the workload has high lock contention, the deadlock_timeout_us can be configured to 0, so that transaction would perform deadlock detection immediately. By default the value is 0 to keep the behavior same as before.
diff --git a/unreleased_history/performance_improvements/parallel_compression.md b/unreleased_history/performance_improvements/parallel_compression.md
deleted file mode 100644
index 4a3b9a4361e4..000000000000
--- a/unreleased_history/performance_improvements/parallel_compression.md
+++ /dev/null
@@ -1 +0,0 @@
-* Majorly improved CPU efficiency and scalability of parallel compression (`CompressionOptions::parallel_threads` > 1), though this efficiency improvement makes parallel compression currently incompatible with UserDefinedIndex and with old setting of `decouple_partitioned_filters=false`. Parallel compression is now considered a production-ready feature. Maximum performance is available with `-DROCKSDB_USE_STD_SEMAPHORES` at compile time, but this is not currently recommended because of reported bugs in implementations of `std::counting_semaphore`/`binary_semaphore`.
diff --git a/unreleased_history/public_api_changes/MultiScanArgs_contructor_parameter_change.md b/unreleased_history/public_api_changes/MultiScanArgs_contructor_parameter_change.md
deleted file mode 100644
index 5912b4b3631a..000000000000
--- a/unreleased_history/public_api_changes/MultiScanArgs_contructor_parameter_change.md
+++ /dev/null
@@ -1 +0,0 @@
-MultiScanArgs used to have a default constructor with default parameter of BytewiseComparator. Now it always requires Comparator in its constructor.
diff --git a/unreleased_history/public_api_changes/autohcc.md b/unreleased_history/public_api_changes/autohcc.md
deleted file mode 100644
index 4bbe714fc5c2..000000000000
--- a/unreleased_history/public_api_changes/autohcc.md
+++ /dev/null
@@ -1 +0,0 @@
-* HyperClockCache with no `estimated_entry_charge` is now production-ready and is the preferred block cache implementation vs. LRUCache. Please consider updating your code to minimize the risk of hitting performance bottlenecks or anomalies from LRUCache. See cache.h for more detail.
diff --git a/unreleased_history/public_api_changes/cplusplus20.md b/unreleased_history/public_api_changes/cplusplus20.md
deleted file mode 100644
index e2c7311fdfd9..000000000000
--- a/unreleased_history/public_api_changes/cplusplus20.md
+++ /dev/null
@@ -1 +0,0 @@
-* RocksDB now requires a C++20 compatible compiler (GCC >= 11, Clang >= 10, Visual Studio >= 2019), including for any code using RocksDB headers.

From 7ae602e80a72831ca479cf3d68d9ec8a55e4fc1b Mon Sep 17 00:00:00 2001
From: Josh Kang <jkangs@fb.com>
Date: Mon, 22 Sep 2025 13:36:26 -0700
Subject: [PATCH 299/500] Support output temperature in CompactFiles (#13955)

Summary:
It is useful to be able to specify output temperatures in the CompactFiles API. For example it may be useful to store small L0 files produced by flushes locally, while larger intra-L0 compactions can store the compacted L0 file remotely.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13955

Test Plan: New unit tests

Reviewed By: jaykorean

Differential Revision: D82492503

Pulled By: joshkang97

fbshipit-source-id: e1225fe572a15d7c5c30a265762b048a4a9e7f0b
---
 db/compaction/compaction.cc                   | 20 ++++++++--
 db/compaction/compaction.h                    | 11 ++++--
 db/compaction/compaction_job.cc               | 11 +-----
 db/compaction/compaction_picker.cc            | 10 ++---
 db/compaction/compaction_picker_fifo.cc       |  9 ++---
 db/compaction/compaction_picker_level.cc      |  2 +-
 db/compaction/compaction_picker_test.cc       | 33 ++++++++++++++---
 db/compaction/compaction_picker_universal.cc  |  8 ++--
 db/db_impl/db_impl_compaction_flush.cc        | 37 +++++++++----------
 db/external_sst_file_ingestion_job.cc         |  3 +-
 include/rocksdb/options.h                     |  5 +++
 .../fifo_compaction_temperature               |  1 +
 .../manual_compaction_output_temperature      |  1 +
 13 files changed, 94 insertions(+), 57 deletions(-)
 create mode 100644 unreleased_history/behavior_changes/fifo_compaction_temperature
 create mode 100644 unreleased_history/public_api_changes/manual_compaction_output_temperature

diff --git a/db/compaction/compaction.cc b/db/compaction/compaction.cc
index afbabbaa510d..9609f17c80f0 100644
--- a/db/compaction/compaction.cc
+++ b/db/compaction/compaction.cc
@@ -281,8 +281,9 @@ Compaction::Compaction(
     std::vector<CompactionInputFiles> _inputs, int _output_level,
     uint64_t _target_file_size, uint64_t _max_compaction_bytes,
     uint32_t _output_path_id, CompressionType _compression,
-    CompressionOptions _compression_opts, Temperature _output_temperature,
-    uint32_t _max_subcompactions, std::vector<FileMetaData*> _grandparents,
+    CompressionOptions _compression_opts,
+    Temperature _output_temperature_override, uint32_t _max_subcompactions,
+    std::vector<FileMetaData*> _grandparents,
     std::optional<SequenceNumber> _earliest_snapshot,
     const SnapshotChecker* _snapshot_checker,
     CompactionReason _compaction_reason, const std::string& _trim_ts,
@@ -303,7 +304,7 @@ Compaction::Compaction(
       output_path_id_(_output_path_id),
       output_compression_(_compression),
       output_compression_opts_(_compression_opts),
-      output_temperature_(_output_temperature),
+      output_temperature_override_(_output_temperature_override),
       deletion_compaction_(_compaction_reason == CompactionReason::kFIFOTtl ||
                            _compaction_reason ==
                                CompactionReason::kFIFOMaxSize),
@@ -1128,4 +1129,17 @@ void Compaction::FilterInputsForCompactionIterator() {
   }
 }
 
+Temperature Compaction::GetOutputTemperature(bool is_proximal_level) const {
+  if (output_temperature_override_ != Temperature::kUnknown) {
+    return output_temperature_override_;
+  }
+
+  if (is_last_level() && !is_proximal_level &&
+      mutable_cf_options_.last_level_temperature != Temperature::kUnknown) {
+    return mutable_cf_options_.last_level_temperature;
+  }
+
+  return mutable_cf_options_.default_write_temperature;
+}
+
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/db/compaction/compaction.h b/db/compaction/compaction.h
index b1498a877010..46870fbb7835 100644
--- a/db/compaction/compaction.h
+++ b/db/compaction/compaction.h
@@ -90,7 +90,8 @@ class Compaction {
              uint64_t target_file_size, uint64_t max_compaction_bytes,
              uint32_t output_path_id, CompressionType compression,
              CompressionOptions compression_opts,
-             Temperature output_temperature, uint32_t max_subcompactions,
+             Temperature output_temperature_override,
+             uint32_t max_subcompactions,
              std::vector<FileMetaData*> grandparents,
              std::optional<SequenceNumber> earliest_snapshot,
              const SnapshotChecker* snapshot_checker,
@@ -409,7 +410,11 @@ class Compaction {
 
   uint64_t max_compaction_bytes() const { return max_compaction_bytes_; }
 
-  Temperature output_temperature() const { return output_temperature_; }
+  // Order of precedence for temperature:
+  // 1. Override temp if not kUnknown
+  // 2. Temperature of the last level files if applicable
+  // 3. Default write temperature
+  Temperature GetOutputTemperature(bool is_proximal_level = false) const;
 
   uint32_t max_subcompactions() const { return max_subcompactions_; }
 
@@ -541,7 +546,7 @@ class Compaction {
   const uint32_t output_path_id_;
   CompressionType output_compression_;
   CompressionOptions output_compression_opts_;
-  Temperature output_temperature_;
+  Temperature output_temperature_override_;
   // If true, then the compaction can be done by simply deleting input files.
   const bool deletion_compaction_;
   // should it split the output file using the compact cursor?
diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc
index 33380362a3ac..b2588eaead90 100644
--- a/db/compaction/compaction_job.cc
+++ b/db/compaction/compaction_job.cc
@@ -2120,15 +2120,8 @@ Status CompactionJob::OpenCompactionOutputFile(SubcompactionState* sub_compact,
 
   // Pass temperature of the last level files to FileSystem.
   FileOptions fo_copy = file_options_;
-  Temperature temperature = sub_compact->compaction->output_temperature();
-  Temperature last_level_temp =
-      sub_compact->compaction->mutable_cf_options().last_level_temperature;
-  // Here last_level_temperature supersedes default_write_temperature, when
-  // enabled and applicable
-  if (last_level_temp != Temperature::kUnknown &&
-      sub_compact->compaction->is_last_level() && !outputs.IsProximalLevel()) {
-    temperature = last_level_temp;
-  }
+  auto temperature =
+      sub_compact->compaction->GetOutputTemperature(outputs.IsProximalLevel());
   fo_copy.temperature = temperature;
   fo_copy.write_hint = write_hint_;
 
diff --git a/db/compaction/compaction_picker.cc b/db/compaction/compaction_picker.cc
index da95425eb813..a59a28e819b4 100644
--- a/db/compaction/compaction_picker.cc
+++ b/db/compaction/compaction_picker.cc
@@ -375,12 +375,13 @@ Compaction* CompactionPicker::PickCompactionForCompactFiles(
     // without configurable `CompressionOptions`, which is inconsistent.
     compression_type = compact_options.compression;
   }
+
   auto c = new Compaction(
       vstorage, ioptions_, mutable_cf_options, mutable_db_options, input_files,
       output_level, compact_options.output_file_size_limit,
       mutable_cf_options.max_compaction_bytes, output_path_id, compression_type,
       GetCompressionOptions(mutable_cf_options, vstorage, output_level),
-      mutable_cf_options.default_write_temperature,
+      compact_options.output_temperature_override,
       compact_options.max_subcompactions,
       /* grandparents */ {}, earliest_snapshot, snapshot_checker,
       CompactionReason::kManualCompaction);
@@ -679,8 +680,7 @@ Compaction* CompactionPicker::PickCompactionForCompactRange(
         compact_range_options.target_path_id,
         GetCompressionType(vstorage, mutable_cf_options, output_level, 1),
         GetCompressionOptions(mutable_cf_options, vstorage, output_level),
-        mutable_cf_options.default_write_temperature,
-        compact_range_options.max_subcompactions,
+        Temperature::kUnknown, compact_range_options.max_subcompactions,
         /* grandparents */ {}, /* earliest_snapshot */ std::nullopt,
         /* snapshot_checker */ nullptr, CompactionReason::kManualCompaction,
         trim_ts, /* score */ -1,
@@ -871,8 +871,8 @@ Compaction* CompactionPicker::PickCompactionForCompactRange(
       GetCompressionType(vstorage, mutable_cf_options, output_level,
                          vstorage->base_level()),
       GetCompressionOptions(mutable_cf_options, vstorage, output_level),
-      mutable_cf_options.default_write_temperature,
-      compact_range_options.max_subcompactions, std::move(grandparents),
+      Temperature::kUnknown, compact_range_options.max_subcompactions,
+      std::move(grandparents),
       /* earliest_snapshot */ std::nullopt, /* snapshot_checker */ nullptr,
       CompactionReason::kManualCompaction, trim_ts, /* score */ -1,
       /* l0_files_might_overlap */ true,
diff --git a/db/compaction/compaction_picker_fifo.cc b/db/compaction/compaction_picker_fifo.cc
index 51dd4ea5344e..a569fc12a360 100644
--- a/db/compaction/compaction_picker_fifo.cc
+++ b/db/compaction/compaction_picker_fifo.cc
@@ -124,8 +124,7 @@ Compaction* FIFOCompactionPicker::PickTTLCompaction(
   Compaction* c = new Compaction(
       vstorage, ioptions_, mutable_cf_options, mutable_db_options,
       std::move(inputs), 0, 0, 0, 0, kNoCompression,
-      mutable_cf_options.compression_opts,
-      mutable_cf_options.default_write_temperature,
+      mutable_cf_options.compression_opts, Temperature::kUnknown,
       /* max_subcompactions */ 0, {}, /* earliest_snapshot */ std::nullopt,
       /* snapshot_checker */ nullptr, CompactionReason::kFIFOTtl,
       /* trim_ts */ "", vstorage->CompactionScore(0),
@@ -194,8 +193,7 @@ Compaction* FIFOCompactionPicker::PickSizeCompaction(
             {comp_inputs}, 0, 16 * 1024 * 1024 /* output file size limit */,
             0 /* max compaction bytes, not applicable */,
             0 /* output path ID */, mutable_cf_options.compression,
-            mutable_cf_options.compression_opts,
-            mutable_cf_options.default_write_temperature,
+            mutable_cf_options.compression_opts, Temperature::kUnknown,
             0 /* max_subcompactions */, {},
             /* earliest_snapshot */ std::nullopt,
             /* snapshot_checker */ nullptr,
@@ -294,8 +292,7 @@ Compaction* FIFOCompactionPicker::PickSizeCompaction(
       /* target_file_size */ 0,
       /* max_compaction_bytes */ 0,
       /* output_path_id */ 0, kNoCompression,
-      mutable_cf_options.compression_opts,
-      mutable_cf_options.default_write_temperature,
+      mutable_cf_options.compression_opts, Temperature::kUnknown,
       /* max_subcompactions */ 0, {}, /* earliest_snapshot */ std::nullopt,
       /* snapshot_checker */ nullptr, CompactionReason::kFIFOMaxSize,
       /* trim_ts */ "", vstorage->CompactionScore(0),
diff --git a/db/compaction/compaction_picker_level.cc b/db/compaction/compaction_picker_level.cc
index 3c6407da1683..132c5a72a191 100644
--- a/db/compaction/compaction_picker_level.cc
+++ b/db/compaction/compaction_picker_level.cc
@@ -557,7 +557,7 @@ Compaction* LevelCompactionBuilder::GetCompaction() {
       GetCompressionType(vstorage_, mutable_cf_options_, output_level_,
                          vstorage_->base_level()),
       GetCompressionOptions(mutable_cf_options_, vstorage_, output_level_),
-      mutable_cf_options_.default_write_temperature,
+      Temperature::kUnknown,
       /* max_subcompactions */ 0, std::move(grandparents_),
       /* earliest_snapshot */ std::nullopt, /* snapshot_checker */ nullptr,
       compaction_reason_,
diff --git a/db/compaction/compaction_picker_test.cc b/db/compaction/compaction_picker_test.cc
index 631295fbe851..605678295cb9 100644
--- a/db/compaction/compaction_picker_test.cc
+++ b/db/compaction/compaction_picker_test.cc
@@ -1178,7 +1178,7 @@ TEST_F(CompactionPickerTest, FIFOToCold1) {
     ASSERT_TRUE(compaction.get() != nullptr);
     ASSERT_EQ(compaction->compaction_reason(),
               CompactionReason::kChangeTemperature);
-    ASSERT_EQ(compaction->output_temperature(), Temperature::kCold);
+    ASSERT_EQ(compaction->GetOutputTemperature(), Temperature::kCold);
     ASSERT_EQ(1U, compaction->num_input_files(0));
     ASSERT_EQ(3U, compaction->input(0, 0)->fd.GetNumber());
   }
@@ -1248,7 +1248,7 @@ TEST_F(CompactionPickerTest, FIFOToColdMaxCompactionSize) {
     ASSERT_EQ(compaction->compaction_reason(),
               CompactionReason::kChangeTemperature);
     // Compaction picker picks older files first and picks one file at a time.
-    ASSERT_EQ(compaction->output_temperature(), Temperature::kCold);
+    ASSERT_EQ(compaction->GetOutputTemperature(), Temperature::kCold);
     ASSERT_EQ(1U, compaction->num_input_files(0));
     ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
   }
@@ -1316,7 +1316,7 @@ TEST_F(CompactionPickerTest, FIFOToColdWithExistingCold) {
     ASSERT_EQ(compaction->compaction_reason(),
               CompactionReason::kChangeTemperature);
     // Compaction picker picks older files first and picks one file at a time.
-    ASSERT_EQ(compaction->output_temperature(), Temperature::kCold);
+    ASSERT_EQ(compaction->GetOutputTemperature(), Temperature::kCold);
     ASSERT_EQ(1U, compaction->num_input_files(0));
     ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber());
   }
@@ -1383,7 +1383,7 @@ TEST_F(CompactionPickerTest, FIFOToColdWithHotBetweenCold) {
     ASSERT_TRUE(compaction.get() != nullptr);
     ASSERT_EQ(compaction->compaction_reason(),
               CompactionReason::kChangeTemperature);
-    ASSERT_EQ(compaction->output_temperature(), Temperature::kCold);
+    ASSERT_EQ(compaction->GetOutputTemperature(), Temperature::kCold);
     ASSERT_EQ(1U, compaction->num_input_files(0));
     ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber());
   }
@@ -1464,12 +1464,35 @@ TEST_F(CompactionPickerTest, FIFOToHotAndWarm) {
     ASSERT_EQ(compaction->compaction_reason(),
               CompactionReason::kChangeTemperature);
     // Compaction picker picks older files first and picks one file at a time.
-    ASSERT_EQ(compaction->output_temperature(), Temperature::kWarm);
+    ASSERT_EQ(compaction->GetOutputTemperature(), Temperature::kWarm);
     ASSERT_EQ(1U, compaction->num_input_files(0));
     ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
   }
 }
 
+TEST_F(CompactionPickerTest, CompactFilesOutputTemperature) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  auto file_number = 66U;
+  Add(0, file_number, "150", "200", 1000000000U);
+  UpdateVersionStorageInfo();
+
+  std::unordered_set<uint64_t> input{file_number};
+  std::vector<CompactionInputFiles> input_files;
+  ASSERT_OK(level_compaction_picker.GetCompactionInputsFromFileNumbers(
+      &input_files, &input, vstorage_.get(), CompactionOptions()));
+
+  auto compaction_options = CompactionOptions();
+  compaction_options.output_temperature_override = Temperature::kCold;
+
+  std::unique_ptr<Compaction> compaction(
+      level_compaction_picker.PickCompactionForCompactFiles(
+          compaction_options, input_files, 1, vstorage_.get(),
+          mutable_cf_options_, mutable_db_options_, /*output_path_id=*/0));
+
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(compaction->GetOutputTemperature(), Temperature::kCold);
+}
+
 TEST_F(CompactionPickerTest, CompactionPriMinOverlapping1) {
   NewVersionStorage(6, kCompactionStyleLevel);
   ioptions_.compaction_pri = kMinOverlappingRatio;
diff --git a/db/compaction/compaction_picker_universal.cc b/db/compaction/compaction_picker_universal.cc
index d03c9cfd0876..c7223fd9ed8f 100644
--- a/db/compaction/compaction_picker_universal.cc
+++ b/db/compaction/compaction_picker_universal.cc
@@ -1093,7 +1093,7 @@ Compaction* UniversalCompactionBuilder::PickCompactionToReduceSortedRuns(
                                            output_level, 1, enable_compression),
                         GetCompressionOptions(mutable_cf_options_, vstorage_,
                                               output_level, enable_compression),
-                        mutable_cf_options_.default_write_temperature,
+                        Temperature::kUnknown,
                         /* max_subcompactions */ 0, grandparents,
                         /* earliest_snapshot */ std::nullopt,
                         /* snapshot_checker */ nullptr, compaction_reason,
@@ -1441,7 +1441,7 @@ Compaction* UniversalCompactionBuilder::PickIncrementalForReduceSizeAmp(
                          true /* enable_compression */),
       GetCompressionOptions(mutable_cf_options_, vstorage_, output_level,
                             true /* enable_compression */),
-      mutable_cf_options_.default_write_temperature,
+      Temperature::kUnknown,
       /* max_subcompactions */ 0, /* grandparents */ {},
       /* earliest_snapshot */ std::nullopt,
       /* snapshot_checker */ nullptr,
@@ -1594,7 +1594,7 @@ Compaction* UniversalCompactionBuilder::PickDeleteTriggeredCompaction() {
       /* max_grandparent_overlap_bytes */ GetMaxOverlappingBytes(), path_id,
       GetCompressionType(vstorage_, mutable_cf_options_, output_level, 1),
       GetCompressionOptions(mutable_cf_options_, vstorage_, output_level),
-      mutable_cf_options_.default_write_temperature,
+      Temperature::kUnknown,
       /* max_subcompactions */ 0, grandparents, earliest_snapshot_,
       snapshot_checker_, CompactionReason::kFilesMarkedForCompaction,
       /* trim_ts */ "", score_,
@@ -1690,7 +1690,7 @@ Compaction* UniversalCompactionBuilder::PickCompactionWithSortedRunRange(
                          true /* enable_compression */),
       GetCompressionOptions(mutable_cf_options_, vstorage_, output_level,
                             true /* enable_compression */),
-      mutable_cf_options_.default_write_temperature,
+      Temperature::kUnknown,
       /* max_subcompactions */ 0, /* grandparents */ {},
       /* earliest_snapshot */ std::nullopt,
       /* snapshot_checker */ nullptr, compaction_reason,
diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc
index 75629a8a00ea..5e883874715e 100644
--- a/db/db_impl/db_impl_compaction_flush.cc
+++ b/db/db_impl/db_impl_compaction_flush.cc
@@ -1843,8 +1843,7 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) {
         ,
         LLONG_MAX /* max compaction bytes, not applicable */,
         0 /* output path ID, not applicable */, mutable_cf_options.compression,
-        mutable_cf_options.compression_opts,
-        mutable_cf_options.default_write_temperature,
+        mutable_cf_options.compression_opts, Temperature::kUnknown,
         0 /* max_subcompactions, not applicable */,
         {} /* grandparents, not applicable */,
         std::nullopt /* earliest_snapshot */, nullptr /* snapshot_checker */,
@@ -3861,7 +3860,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
       uint64_t out_file_creation_time = static_cast<uint64_t>(tmp_current_time);
 
       FileOptions copied_file_options = file_options_;
-      copied_file_options.temperature = c->output_temperature();
+      copied_file_options.temperature = c->GetOutputTemperature();
       std::unique_ptr<WritableFileWriter> dest_writer;
       {
         std::unique_ptr<FSWritableFile> dest_file;
@@ -3879,7 +3878,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
               "NewWritableFile %s\n"
               " out_fname=%s, temperature=%s, io_status=%s",
               c->column_family_data()->GetName().c_str(), out_fname.c_str(),
-              temperature_to_string[c->output_temperature()].c_str(),
+              temperature_to_string[c->GetOutputTemperature()].c_str(),
               io_s.ToString().c_str());
           break;
         }
@@ -3901,7 +3900,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
           c->column_family_data()->GetName().c_str(), in_fname.c_str(),
           temperature_to_string[in_file->temperature].c_str(),
           out_fname.c_str(),
-          temperature_to_string[c->output_temperature()].c_str(),
+          temperature_to_string[c->GetOutputTemperature()].c_str(),
           c->mutable_cf_options()
               .compaction_options_fifo.trivial_copy_buffer_size);
       // Add IO_LOW HINT for compaction
@@ -3941,7 +3940,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
               c->column_family_data()->GetName().c_str(), in_fname.c_str(),
               temperature_to_string[in_file->temperature].c_str(),
               out_fname.c_str(),
-              temperature_to_string[c->output_temperature()].c_str(),
+              temperature_to_string[c->GetOutputTemperature()].c_str(),
               io_s.ToString().c_str());
           break;
         }
@@ -3950,15 +3949,15 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
       io_s = copy_file_io_status;
 
       if (!io_s.ok()) {
-        ROCKS_LOG_BUFFER(log_buffer,
-                         "[%s] Failed to copy from: %s\n"
-                         " temperature=%s, to=%s, temperature=%s, io_status=%s",
-                         c->column_family_data()->GetName().c_str(),
-                         in_fname.c_str(),
-                         temperature_to_string[in_file->temperature].c_str(),
-                         out_fname.c_str(),
-                         temperature_to_string[c->output_temperature()].c_str(),
-                         io_s.ToString().c_str());
+        ROCKS_LOG_BUFFER(
+            log_buffer,
+            "[%s] Failed to copy from: %s\n"
+            " temperature=%s, to=%s, temperature=%s, io_status=%s",
+            c->column_family_data()->GetName().c_str(), in_fname.c_str(),
+            temperature_to_string[in_file->temperature].c_str(),
+            out_fname.c_str(),
+            temperature_to_string[c->GetOutputTemperature()].c_str(),
+            io_s.ToString().c_str());
         break;
       }
       ROCKS_LOG_BUFFER(log_buffer,
@@ -3968,7 +3967,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
                        in_fname.c_str(),
                        temperature_to_string[in_file->temperature].c_str(),
                        out_fname.c_str(),
-                       temperature_to_string[c->output_temperature()].c_str(),
+                       temperature_to_string[c->GetOutputTemperature()].c_str(),
                        io_s.ToString().c_str());
 
       FileMetaData out_file_metadata{
@@ -3980,7 +3979,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
           in_file->fd.smallest_seqno,
           in_file->fd.largest_seqno,
           false /* marked_for_compact */,
-          c->output_temperature() /* temperature */,
+          c->GetOutputTemperature() /* temperature */,
           in_file->oldest_blob_file_number,
           in_file->oldest_ancester_time,
           out_file_creation_time,
@@ -4049,7 +4048,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
             " temperature=%s, to temperature=%s, status=%s, io_status=%s",
             c->column_family_data()->GetName().c_str(), in_fname.c_str(),
             temperature_to_string[in_file->temperature].c_str(),
-            temperature_to_string[c->output_temperature()].c_str(),
+            temperature_to_string[c->GetOutputTemperature()].c_str(),
             status.ToString().c_str(), io_s.ToString().c_str());
       }
     }
@@ -4428,7 +4427,7 @@ Compaction* DBImpl::CreateIntendedCompactionForwardedToBottomPriorityPool(
                      c->output_level(), c->target_output_file_size(),
                      c->max_compaction_bytes(), c->output_path_id(),
                      c->output_compression(), c->output_compression_opts(),
-                     c->output_temperature(), c->max_subcompactions(),
+                     c->GetOutputTemperature(), c->max_subcompactions(),
                      c->grandparents(), std::nullopt /* earliest_snapshot */,
                      nullptr /* snapshot_checker */, c->compaction_reason());
 
diff --git a/db/external_sst_file_ingestion_job.cc b/db/external_sst_file_ingestion_job.cc
index d992e754d417..f7a34ab78cc7 100644
--- a/db/external_sst_file_ingestion_job.cc
+++ b/db/external_sst_file_ingestion_job.cc
@@ -740,8 +740,7 @@ void ExternalSstFileIngestionJob::CreateEquivalentFileIngestingCompactions() {
                             cfd_->ioptions().compaction_style),
         LLONG_MAX /* max compaction bytes, not applicable */,
         0 /* output path ID, not applicable */, mutable_cf_options.compression,
-        mutable_cf_options.compression_opts,
-        mutable_cf_options.default_write_temperature,
+        mutable_cf_options.compression_opts, Temperature::kUnknown,
         0 /* max_subcompaction, not applicable */,
         {} /* grandparents, not applicable */,
         std::nullopt /* earliest_snapshot */, nullptr /* snapshot_checker */,
diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index 1f4e237d5fbb..bdba2a05519c 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -2353,6 +2353,11 @@ struct CompactionOptions {
   // canceled variable in CompactionOptions, as it does for CompactRangeOptions
   // - this is because ManualCompactionState is not used
 
+  // Create output compaction file using this file temperature. If unset, will
+  // default to "last_level_temperature" if output level is last level otherwise
+  // "default_write_temperature"
+  Temperature output_temperature_override = Temperature::kUnknown;
+
   CompactionOptions()
       : compression(kDisableCompressionOption),
         output_file_size_limit(std::numeric_limits<uint64_t>::max()),
diff --git a/unreleased_history/behavior_changes/fifo_compaction_temperature b/unreleased_history/behavior_changes/fifo_compaction_temperature
new file mode 100644
index 000000000000..ff0ab32e0555
--- /dev/null
+++ b/unreleased_history/behavior_changes/fifo_compaction_temperature
@@ -0,0 +1 @@
+* `kChangeTemperature` FIFO compaction will now honor `compaction_target_temp` to all levels regardless of `cf_options::last_level_temperature`
diff --git a/unreleased_history/public_api_changes/manual_compaction_output_temperature b/unreleased_history/public_api_changes/manual_compaction_output_temperature
new file mode 100644
index 000000000000..a9ac7ac2a017
--- /dev/null
+++ b/unreleased_history/public_api_changes/manual_compaction_output_temperature
@@ -0,0 +1 @@
+* Allow specifying output temperature in CompactionOptions

From eb1d924308b994cf13a2f105020c768025d3b8a9 Mon Sep 17 00:00:00 2001
From: Changyu Bi <changyubi@meta.com>
Date: Mon, 22 Sep 2025 14:28:38 -0700
Subject: [PATCH 300/500] Fix an assertion failure in stress test (#13988)

Summary:
for MultiScan and UDI we start to use bound check from index iterator, so removing this assert here.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13988

Test Plan: existing test

Reviewed By: hx235

Differential Revision: D82993180

Pulled By: cbi42

fbshipit-source-id: 442b2e83cb3aef96fc1a825bf733af9ce59c21c1
---
 table/block_based/partitioned_index_iterator.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/table/block_based/partitioned_index_iterator.h b/table/block_based/partitioned_index_iterator.h
index 6412fe2399b5..31ccded9a025 100644
--- a/table/block_based/partitioned_index_iterator.h
+++ b/table/block_based/partitioned_index_iterator.h
@@ -81,8 +81,6 @@ class PartitionedIndexIterator : public InternalIteratorBase<IndexValue> {
     }
   }
   inline IterBoundCheck UpperBoundCheckResult() override {
-    // Shouldn't be called.
-    assert(false);
     return IterBoundCheck::kUnknown;
   }
   void SetPinnedItersMgr(PinnedIteratorsManager*) override {

From ab10ea0aac9b30fbfd88edcd69c38ba3affc644d Mon Sep 17 00:00:00 2001
From: Hui Xiao <huixiao@fb.com>
Date: Mon, 22 Sep 2025 15:03:46 -0700
Subject: [PATCH 301/500] Add in-memory data structures and (de)serialization
 support for subcompaction progress (#13928)

Summary:
**Context**
Resuming compaction is designed to periodically record the progress of an ongoing compaction and can resume from that saved progress after interruptions such as cancellation, database shutdown, or crashes.

This PR introduces the data structures needed to store  subcompaction progress in memory, along with serialization and deserialization support to persist and parse this progress to/from "a manifest-like compaction progress file" (the actual creation of such file is in upcoming PRs).

Flow of resuming: DB::OpenAndCompact() -> Compaction progress file  -> SubcompactionProgress -> CompactionJob
Flow of persistence: CompactionJob -> SubcompactionProgress -> Compaction progress file  -> DB that is called with OpenAndCompact()

**Summary**
Progress represented by `SubcompactionProgress` will be tracked at the scope of a subcompaction, which is the smallest independent unit of compaction work.

The frequency of recording this progress is once every N compaction output files (to be detailed in future PRs).

When recording, all fields, except for the output files metadata in `SubcompactionProgress`, will directly overwrite the corresponding fields from the last saved progress (See `SubcompactionProgress` and `SubcompactionProgressBuilder` for more).

As a bonus, this PR refactors the file metadata encoding and decoding utilities into two static helper functions, EncodeToNewFile4() and DecodeNewFile4From(), to support subcompaction progress usage.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13928

Test Plan:
- Added various `SubcompactionProgressTest` unit tests in version_edit_test.cc to verify basic serialization/deserialization and forward compatibility handling
- Existing UTs and stress/crash test

**Follow up:**
- Move output entry number and file verification to after each file creation so we can remove kNumProcessedOutputRecords persistence support and make resuming compaction work with `paranoid_file_checks=true` (by default false). Output verification will be done before persistence of progress. As long as this follow-up is done before the landing of the integration PR to create the progress file, we can change the manifest-like compaction progress file format freely.

Reviewed By: jaykorean

Differential Revision: D81986583

Pulled By: hx235

fbshipit-source-id: b42766da7d9c2e2f596c892d050c753238d1039f
---
 db/version_edit.cc      | 657 +++++++++++++++++++++++++++++++---------
 db/version_edit.h       | 243 ++++++++++++++-
 db/version_edit_test.cc | 367 ++++++++++++++++++++++
 3 files changed, 1120 insertions(+), 147 deletions(-)

diff --git a/db/version_edit.cc b/db/version_edit.cc
index 84aeba823faa..f76706fd7f52 100644
--- a/db/version_edit.cc
+++ b/db/version_edit.cc
@@ -112,124 +112,9 @@ bool VersionEdit::EncodeTo(std::string* dst,
         f.epoch_number == kUnknownEpochNumber) {
       return false;
     }
-    PutVarint32(dst, kNewFile4);
-    PutVarint32Varint64(dst, new_files_[i].first /* level */, f.fd.GetNumber());
-    PutVarint64(dst, f.fd.GetFileSize());
-    EncodeFileBoundaries(dst, f, ts_sz.value());
-    PutVarint64Varint64(dst, f.fd.smallest_seqno, f.fd.largest_seqno);
-    // Customized fields' format:
-    // +-----------------------------+
-    // | 1st field's tag (varint32)  |
-    // +-----------------------------+
-    // | 1st field's size (varint32) |
-    // +-----------------------------+
-    // |    bytes for 1st field      |
-    // |  (based on size decoded)    |
-    // +-----------------------------+
-    // |                             |
-    // |          ......             |
-    // |                             |
-    // +-----------------------------+
-    // | last field's size (varint32)|
-    // +-----------------------------+
-    // |    bytes for last field     |
-    // |  (based on size decoded)    |
-    // +-----------------------------+
-    // | terminating tag (varint32)  |
-    // +-----------------------------+
-    //
-    // Customized encoding for fields:
-    //   tag kPathId: 1 byte as path_id
-    //   tag kNeedCompaction:
-    //        now only can take one char value 1 indicating need-compaction
-    //
-    PutVarint32(dst, NewFileCustomTag::kOldestAncesterTime);
-    std::string varint_oldest_ancester_time;
-    PutVarint64(&varint_oldest_ancester_time, f.oldest_ancester_time);
-    TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:VarintOldestAncesterTime",
-                             &varint_oldest_ancester_time);
-    PutLengthPrefixedSlice(dst, Slice(varint_oldest_ancester_time));
-
-    PutVarint32(dst, NewFileCustomTag::kFileCreationTime);
-    std::string varint_file_creation_time;
-    PutVarint64(&varint_file_creation_time, f.file_creation_time);
-    TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:VarintFileCreationTime",
-                             &varint_file_creation_time);
-    PutLengthPrefixedSlice(dst, Slice(varint_file_creation_time));
-
-    PutVarint32(dst, NewFileCustomTag::kEpochNumber);
-    std::string varint_epoch_number;
-    PutVarint64(&varint_epoch_number, f.epoch_number);
-    PutLengthPrefixedSlice(dst, Slice(varint_epoch_number));
-
-    if (f.file_checksum_func_name != kUnknownFileChecksumFuncName) {
-      PutVarint32(dst, NewFileCustomTag::kFileChecksum);
-      PutLengthPrefixedSlice(dst, Slice(f.file_checksum));
-
-      PutVarint32(dst, NewFileCustomTag::kFileChecksumFuncName);
-      PutLengthPrefixedSlice(dst, Slice(f.file_checksum_func_name));
-    }
-
-    if (f.fd.GetPathId() != 0) {
-      PutVarint32(dst, NewFileCustomTag::kPathId);
-      char p = static_cast<char>(f.fd.GetPathId());
-      PutLengthPrefixedSlice(dst, Slice(&p, 1));
-    }
-    if (f.temperature != Temperature::kUnknown) {
-      PutVarint32(dst, NewFileCustomTag::kTemperature);
-      char p = static_cast<char>(f.temperature);
-      PutLengthPrefixedSlice(dst, Slice(&p, 1));
-    }
-    if (f.marked_for_compaction) {
-      PutVarint32(dst, NewFileCustomTag::kNeedCompaction);
-      char p = static_cast<char>(1);
-      PutLengthPrefixedSlice(dst, Slice(&p, 1));
-    }
-    if (has_min_log_number_to_keep_ && !min_log_num_written) {
-      PutVarint32(dst, NewFileCustomTag::kMinLogNumberToKeepHack);
-      std::string varint_log_number;
-      PutFixed64(&varint_log_number, min_log_number_to_keep_);
-      PutLengthPrefixedSlice(dst, Slice(varint_log_number));
-      min_log_num_written = true;
-    }
-    if (f.oldest_blob_file_number != kInvalidBlobFileNumber) {
-      PutVarint32(dst, NewFileCustomTag::kOldestBlobFileNumber);
-      std::string oldest_blob_file_number;
-      PutVarint64(&oldest_blob_file_number, f.oldest_blob_file_number);
-      PutLengthPrefixedSlice(dst, Slice(oldest_blob_file_number));
-    }
-    UniqueId64x2 unique_id = f.unique_id;
-    TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:UniqueId", &unique_id);
-    if (unique_id != kNullUniqueId64x2) {
-      PutVarint32(dst, NewFileCustomTag::kUniqueId);
-      std::string unique_id_str = EncodeUniqueIdBytes(&unique_id);
-      PutLengthPrefixedSlice(dst, Slice(unique_id_str));
-    }
-    if (f.compensated_range_deletion_size) {
-      PutVarint32(dst, kCompensatedRangeDeletionSize);
-      std::string compensated_range_deletion_size;
-      PutVarint64(&compensated_range_deletion_size,
-                  f.compensated_range_deletion_size);
-      PutLengthPrefixedSlice(dst, Slice(compensated_range_deletion_size));
-    }
-    if (f.tail_size) {
-      PutVarint32(dst, NewFileCustomTag::kTailSize);
-      std::string varint_tail_size;
-      PutVarint64(&varint_tail_size, f.tail_size);
-      PutLengthPrefixedSlice(dst, Slice(varint_tail_size));
-    }
-    if (!f.user_defined_timestamps_persisted) {
-      // The default value for the flag is true, it's only explicitly persisted
-      // when it's false. We are putting 0 as the value here to signal false
-      // (i.e. UDTS not persisted).
-      PutVarint32(dst, NewFileCustomTag::kUserDefinedTimestampsPersisted);
-      char p = static_cast<char>(0);
-      PutLengthPrefixedSlice(dst, Slice(&p, 1));
-    }
-    TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:NewFile4:CustomizeFields",
-                             dst);
-
-    PutVarint32(dst, NewFileCustomTag::kTerminate);
+    EncodeToNewFile4(f, new_files_[i].first, ts_sz.value(),
+                     has_min_log_number_to_keep_, min_log_number_to_keep_,
+                     min_log_num_written, dst);
   }
 
   for (const auto& blob_file_addition : blob_file_additions_) {
@@ -288,9 +173,142 @@ bool VersionEdit::EncodeTo(std::string* dst,
     char p = static_cast<char>(persist_user_defined_timestamps_);
     PutLengthPrefixedSlice(dst, Slice(&p, 1));
   }
+
+  if (HasSubcompactionProgress()) {
+    PutVarint32(dst, kSubcompactionProgress);
+    std::string progress_data;
+    subcompaction_progress_.EncodeTo(&progress_data);
+    PutLengthPrefixedSlice(dst, progress_data);
+  }
+
   return true;
 }
 
+void VersionEdit::EncodeToNewFile4(const FileMetaData& f, int level,
+                                   size_t ts_sz,
+                                   bool has_min_log_number_to_keep,
+                                   uint64_t min_log_number_to_keep,
+                                   bool& min_log_num_written,
+                                   std::string* dst) {
+  PutVarint32(dst, kNewFile4);
+  PutVarint32Varint64(dst, level, f.fd.GetNumber());
+  PutVarint64(dst, f.fd.GetFileSize());
+  EncodeFileBoundaries(dst, f, ts_sz);
+  PutVarint64Varint64(dst, f.fd.smallest_seqno, f.fd.largest_seqno);
+  // Customized fields' format:
+  // +-----------------------------+
+  // | 1st field's tag (varint32)  |
+  // +-----------------------------+
+  // | 1st field's size (varint32) |
+  // +-----------------------------+
+  // |    bytes for 1st field      |
+  // |  (based on size decoded)    |
+  // +-----------------------------+
+  // |                             |
+  // |          ......             |
+  // |                             |
+  // +-----------------------------+
+  // | last field's size (varint32)|
+  // +-----------------------------+
+  // |    bytes for last field     |
+  // |  (based on size decoded)    |
+  // +-----------------------------+
+  // | terminating tag (varint32)  |
+  // +-----------------------------+
+  //
+  // Customized encoding for fields:
+  //   tag kPathId: 1 byte as path_id
+  //   tag kNeedCompaction:
+  //        now only can take one char value 1 indicating need-compaction
+  //
+  PutVarint32(dst, NewFileCustomTag::kOldestAncesterTime);
+  std::string varint_oldest_ancester_time;
+  PutVarint64(&varint_oldest_ancester_time, f.oldest_ancester_time);
+  TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:VarintOldestAncesterTime",
+                           &varint_oldest_ancester_time);
+  PutLengthPrefixedSlice(dst, Slice(varint_oldest_ancester_time));
+
+  PutVarint32(dst, NewFileCustomTag::kFileCreationTime);
+  std::string varint_file_creation_time;
+  PutVarint64(&varint_file_creation_time, f.file_creation_time);
+  TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:VarintFileCreationTime",
+                           &varint_file_creation_time);
+  PutLengthPrefixedSlice(dst, Slice(varint_file_creation_time));
+
+  PutVarint32(dst, NewFileCustomTag::kEpochNumber);
+  std::string varint_epoch_number;
+  PutVarint64(&varint_epoch_number, f.epoch_number);
+  PutLengthPrefixedSlice(dst, Slice(varint_epoch_number));
+
+  if (f.file_checksum_func_name != kUnknownFileChecksumFuncName) {
+    PutVarint32(dst, NewFileCustomTag::kFileChecksum);
+    PutLengthPrefixedSlice(dst, Slice(f.file_checksum));
+
+    PutVarint32(dst, NewFileCustomTag::kFileChecksumFuncName);
+    PutLengthPrefixedSlice(dst, Slice(f.file_checksum_func_name));
+  }
+
+  if (f.fd.GetPathId() != 0) {
+    PutVarint32(dst, NewFileCustomTag::kPathId);
+    char p = static_cast<char>(f.fd.GetPathId());
+    PutLengthPrefixedSlice(dst, Slice(&p, 1));
+  }
+  if (f.temperature != Temperature::kUnknown) {
+    PutVarint32(dst, NewFileCustomTag::kTemperature);
+    char p = static_cast<char>(f.temperature);
+    PutLengthPrefixedSlice(dst, Slice(&p, 1));
+  }
+  if (f.marked_for_compaction) {
+    PutVarint32(dst, NewFileCustomTag::kNeedCompaction);
+    char p = static_cast<char>(1);
+    PutLengthPrefixedSlice(dst, Slice(&p, 1));
+  }
+  if (has_min_log_number_to_keep && !min_log_num_written) {
+    PutVarint32(dst, NewFileCustomTag::kMinLogNumberToKeepHack);
+    std::string varint_log_number;
+    PutFixed64(&varint_log_number, min_log_number_to_keep);
+    PutLengthPrefixedSlice(dst, Slice(varint_log_number));
+    min_log_num_written = true;
+  }
+  if (f.oldest_blob_file_number != kInvalidBlobFileNumber) {
+    PutVarint32(dst, NewFileCustomTag::kOldestBlobFileNumber);
+    std::string oldest_blob_file_number;
+    PutVarint64(&oldest_blob_file_number, f.oldest_blob_file_number);
+    PutLengthPrefixedSlice(dst, Slice(oldest_blob_file_number));
+  }
+  UniqueId64x2 unique_id = f.unique_id;
+  TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:UniqueId", &unique_id);
+  if (unique_id != kNullUniqueId64x2) {
+    PutVarint32(dst, NewFileCustomTag::kUniqueId);
+    std::string unique_id_str = EncodeUniqueIdBytes(&unique_id);
+    PutLengthPrefixedSlice(dst, Slice(unique_id_str));
+  }
+  if (f.compensated_range_deletion_size) {
+    PutVarint32(dst, NewFileCustomTag::kCompensatedRangeDeletionSize);
+    std::string compensated_range_deletion_size;
+    PutVarint64(&compensated_range_deletion_size,
+                f.compensated_range_deletion_size);
+    PutLengthPrefixedSlice(dst, Slice(compensated_range_deletion_size));
+  }
+  if (f.tail_size) {
+    PutVarint32(dst, NewFileCustomTag::kTailSize);
+    std::string varint_tail_size;
+    PutVarint64(&varint_tail_size, f.tail_size);
+    PutLengthPrefixedSlice(dst, Slice(varint_tail_size));
+  }
+  if (!f.user_defined_timestamps_persisted) {
+    // The default value for the flag is true, it's only explicitly persisted
+    // when it's false. We are putting 0 as the value here to signal false
+    // (i.e. UDTS not persisted).
+    PutVarint32(dst, NewFileCustomTag::kUserDefinedTimestampsPersisted);
+    char p = static_cast<char>(0);
+    PutLengthPrefixedSlice(dst, Slice(&p, 1));
+  }
+  TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:NewFile4:CustomizeFields",
+                           dst);
+
+  PutVarint32(dst, NewFileCustomTag::kTerminate);
+}
 static bool GetInternalKey(Slice* input, InternalKey* dst) {
   Slice str;
   if (GetLengthPrefixedSlice(input, &str)) {
@@ -301,12 +319,12 @@ static bool GetInternalKey(Slice* input, InternalKey* dst) {
   }
 }
 
-bool VersionEdit::GetLevel(Slice* input, int* level, const char** /*msg*/) {
+bool VersionEdit::GetLevel(Slice* input, int* level, int& max_level) {
   uint32_t v = 0;
   if (GetVarint32(input, &v)) {
     *level = v;
-    if (max_level_ < *level) {
-      max_level_ = *level;
+    if (max_level < *level) {
+      max_level = *level;
     }
     return true;
   } else {
@@ -314,16 +332,18 @@ bool VersionEdit::GetLevel(Slice* input, int* level, const char** /*msg*/) {
   }
 }
 
-const char* VersionEdit::DecodeNewFile4From(Slice* input) {
-  const char* msg = nullptr;
+const char* VersionEdit::DecodeNewFile4From(Slice* input, int& max_level,
+                                            uint64_t& min_log_number_to_keep,
+                                            bool& has_min_log_number_to_keep,
+                                            NewFiles& new_files,
+                                            FileMetaData& f) {
   int level = 0;
-  FileMetaData f;
   uint64_t number = 0;
   uint32_t path_id = 0;
   uint64_t file_size = 0;
   SequenceNumber smallest_seqno = 0;
   SequenceNumber largest_seqno = kMaxSequenceNumber;
-  if (GetLevel(input, &level, &msg) && GetVarint64(input, &number) &&
+  if (GetLevel(input, &level, max_level) && GetVarint64(input, &number) &&
       GetVarint64(input, &file_size) && GetInternalKey(input, &f.smallest) &&
       GetInternalKey(input, &f.largest) &&
       GetVarint64(input, &smallest_seqno) &&
@@ -381,10 +401,10 @@ const char* VersionEdit::DecodeNewFile4From(Slice* input) {
         case kMinLogNumberToKeepHack:
           // This is a hack to encode kMinLogNumberToKeep in a
           // forward-compatible fashion.
-          if (!GetFixed64(&field, &min_log_number_to_keep_)) {
+          if (!GetFixed64(&field, &min_log_number_to_keep)) {
             return "deleted log number malformatted";
           }
-          has_min_log_number_to_keep_ = true;
+          has_min_log_number_to_keep = true;
           break;
         case kOldestBlobFileNumber:
           if (!GetVarint64(&field, &f.oldest_blob_file_number)) {
@@ -436,13 +456,12 @@ const char* VersionEdit::DecodeNewFile4From(Slice* input) {
   }
   f.fd =
       FileDescriptor(number, path_id, file_size, smallest_seqno, largest_seqno);
-  new_files_.push_back(std::make_pair(level, f));
+  new_files.emplace_back(level, f);
   return nullptr;
 }
 
 void VersionEdit::EncodeFileBoundaries(std::string* dst,
-                                       const FileMetaData& meta,
-                                       size_t ts_sz) const {
+                                       const FileMetaData& meta, size_t ts_sz) {
   if (ts_sz == 0 || meta.user_defined_timestamps_persisted) {
     PutLengthPrefixedSlice(dst, meta.smallest.Encode());
     PutLengthPrefixedSlice(dst, meta.largest.Encode());
@@ -545,7 +564,8 @@ Status VersionEdit::DecodeFrom(const Slice& src) {
         break;
 
       case kCompactCursor:
-        if (GetLevel(&input, &level, &msg) && GetInternalKey(&input, &key)) {
+        if (GetLevel(&input, &level, max_level_) &&
+            GetInternalKey(&input, &key)) {
           // Here we re-use the output format of compact pointer in LevelDB
           // to persist compact_cursors_
           compact_cursors_.push_back(std::make_pair(level, key));
@@ -558,7 +578,8 @@ Status VersionEdit::DecodeFrom(const Slice& src) {
 
       case kDeletedFile: {
         uint64_t number = 0;
-        if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number)) {
+        if (GetLevel(&input, &level, max_level_) &&
+            GetVarint64(&input, &number)) {
           deleted_files_.insert(std::make_pair(level, number));
         } else {
           if (!msg) {
@@ -571,8 +592,8 @@ Status VersionEdit::DecodeFrom(const Slice& src) {
       case kNewFile: {
         uint64_t number = 0;
         uint64_t file_size = 0;
-        if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number) &&
-            GetVarint64(&input, &file_size) &&
+        if (GetLevel(&input, &level, max_level_) &&
+            GetVarint64(&input, &number) && GetVarint64(&input, &file_size) &&
             GetInternalKey(&input, &f.smallest) &&
             GetInternalKey(&input, &f.largest)) {
           f.fd = FileDescriptor(number, 0, file_size);
@@ -589,8 +610,8 @@ Status VersionEdit::DecodeFrom(const Slice& src) {
         uint64_t file_size = 0;
         SequenceNumber smallest_seqno = 0;
         SequenceNumber largest_seqno = kMaxSequenceNumber;
-        if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number) &&
-            GetVarint64(&input, &file_size) &&
+        if (GetLevel(&input, &level, max_level_) &&
+            GetVarint64(&input, &number) && GetVarint64(&input, &file_size) &&
             GetInternalKey(&input, &f.smallest) &&
             GetInternalKey(&input, &f.largest) &&
             GetVarint64(&input, &smallest_seqno) &&
@@ -612,8 +633,9 @@ Status VersionEdit::DecodeFrom(const Slice& src) {
         uint64_t file_size = 0;
         SequenceNumber smallest_seqno = 0;
         SequenceNumber largest_seqno = kMaxSequenceNumber;
-        if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number) &&
-            GetVarint32(&input, &path_id) && GetVarint64(&input, &file_size) &&
+        if (GetLevel(&input, &level, max_level_) &&
+            GetVarint64(&input, &number) && GetVarint32(&input, &path_id) &&
+            GetVarint64(&input, &file_size) &&
             GetInternalKey(&input, &f.smallest) &&
             GetInternalKey(&input, &f.largest) &&
             GetVarint64(&input, &smallest_seqno) &&
@@ -630,7 +652,10 @@ Status VersionEdit::DecodeFrom(const Slice& src) {
       }
 
       case kNewFile4: {
-        msg = DecodeNewFile4From(&input);
+        FileMetaData ignored_file;
+        msg = DecodeNewFile4From(&input, max_level_, min_log_number_to_keep_,
+                                 has_min_log_number_to_keep_, new_files_,
+                                 ignored_file);
         break;
       }
 
@@ -767,6 +792,23 @@ Status VersionEdit::DecodeFrom(const Slice& src) {
         }
         break;
 
+      case kSubcompactionProgress: {
+        Slice encoded;
+        if (!GetLengthPrefixedSlice(&input, &encoded)) {
+          msg = "SubcompactionProgress not prefixed by length";
+          break;
+        }
+
+        SubcompactionProgress progress;
+        Status s = progress.DecodeFrom(&encoded);
+        if (!s.ok()) {
+          return s;
+        }
+
+        SetSubcompactionProgress(progress);
+        break;
+      }
+
       default:
         if (tag & kTagSafeIgnoreMask) {
           // Tag from future which can be safely ignored.
@@ -1087,4 +1129,341 @@ std::string VersionEdit::DebugJSON(int edit_num, bool hex_key) const {
   return jw.Get();
 }
 
+void SubcompactionProgressPerLevel::EncodeTo(std::string* dst) const {
+  if (num_processed_output_records_ > 0) {
+    PutVarint32(
+        dst,
+        SubcompactionProgressPerLevelCustomTag::kNumProcessedOutputRecords);
+    std::string varint_records;
+    PutVarint64(&varint_records, num_processed_output_records_);
+    PutLengthPrefixedSlice(dst, varint_records);
+  }
+
+  if (!output_files_.empty()) {
+    PutVarint32(dst, SubcompactionProgressPerLevelCustomTag::kOutputFilesDelta);
+    std::string files_data;
+    EncodeOutputFiles(&files_data);
+    PutLengthPrefixedSlice(dst, files_data);
+  } else if (!temp_output_files_allocation_.empty()) {
+    PutVarint32(dst, SubcompactionProgressPerLevelCustomTag::kOutputFilesDelta);
+    std::string files_data;
+    EncodeTemporaryOutputFilesAllocation(&files_data);
+    PutLengthPrefixedSlice(dst, files_data);
+  }
+
+  PutVarint32(dst, SubcompactionProgressPerLevelCustomTag::
+                       kSubcompactionProgressPerLevelTerminate);
+}
+
+Status SubcompactionProgressPerLevel::DecodeFrom(Slice* input) {
+  Clear();
+
+  while (true) {
+    uint32_t tag = 0;
+    if (!GetVarint32(input, &tag)) {
+      return Status::Corruption("SubcompactionProgressPerLevel", "tag error");
+    }
+
+    if (tag == SubcompactionProgressPerLevelCustomTag::
+                   kSubcompactionProgressPerLevelTerminate) {
+      break;
+    }
+
+    Slice field;
+    if (!GetLengthPrefixedSlice(input, &field)) {
+      return Status::Corruption("SubcompactionProgressPerLevel",
+                                "field length prefixed slice error");
+    }
+
+    switch (tag) {
+      case SubcompactionProgressPerLevelCustomTag::kNumProcessedOutputRecords: {
+        if (!GetVarint64(&field, &num_processed_output_records_)) {
+          return Status::Corruption("SubcompactionProgressPerLevel",
+                                    "invalid num_processed_output_records_");
+        }
+        break;
+      }
+
+      case SubcompactionProgressPerLevelCustomTag::kOutputFilesDelta: {
+        Status s = DecodeOutputFiles(&field, temp_output_files_allocation_);
+        if (!s.ok()) {
+          return s;
+        }
+        break;
+      }
+
+      default:
+        // Forward compatibility: Handle unknown tags
+        if ((tag & SubcompactionProgressPerLevelCustomTag::
+                       kSubcompactionProgressPerLevelCustomTagSafeIgnoreMask) !=
+            0) {
+          break;
+        } else {
+          return Status::NotSupported("SubcompactionProgress",
+                                      "unsupported critical custom field");
+        }
+    }
+  }
+
+  return Status::OK();
+}
+
+void SubcompactionProgressPerLevel::EncodeOutputFiles(std::string* dst) const {
+  size_t new_files_count =
+      output_files_.size() > last_persisted_output_files_count_
+          ? output_files_.size() - last_persisted_output_files_count_
+          : 0;
+
+  assert(new_files_count > 0);
+
+  PutVarint32(dst, static_cast<uint32_t>(new_files_count));
+
+  for (size_t i = last_persisted_output_files_count_; i < output_files_.size();
+       ++i) {
+    const FileMetaData* file_ptr = output_files_[i];
+    assert(file_ptr != nullptr);
+
+    std::string file_dst;
+    bool ignored_min_log_written = false;
+
+    VersionEdit::EncodeToNewFile4(*file_ptr, -1 /* level */, 0 /* ts_sz */,
+                                  false /* has_min_log_number_to_keep */,
+                                  0 /* min_log_number_to_keep */,
+                                  ignored_min_log_written, &file_dst);
+
+    PutLengthPrefixedSlice(dst, file_dst);
+  }
+}
+
+void SubcompactionProgressPerLevel::EncodeTemporaryOutputFilesAllocation(
+    std::string* dst) const {
+  size_t new_files_count =
+      temp_output_files_allocation_.size() > last_persisted_output_files_count_
+          ? temp_output_files_allocation_.size() -
+                last_persisted_output_files_count_
+          : 0;
+
+  assert(new_files_count > 0);
+
+  PutVarint32(dst, static_cast<uint32_t>(new_files_count));
+
+  for (size_t i = last_persisted_output_files_count_;
+       i < temp_output_files_allocation_.size(); ++i) {
+    const FileMetaData& file = temp_output_files_allocation_[i];
+
+    std::string file_dst;
+    bool ignored_min_log_written = false;
+
+    VersionEdit::EncodeToNewFile4(file, -1 /* level */, 0 /* ts_sz */,
+                                  false /* has_min_log_number_to_keep */,
+                                  0 /* min_log_number_to_keep */,
+                                  ignored_min_log_written, &file_dst);
+
+    PutLengthPrefixedSlice(dst, file_dst);
+  }
+}
+
+Status SubcompactionProgressPerLevel::DecodeOutputFiles(
+    Slice* input, autovector<FileMetaData>& temporary_output_files_allocation) {
+  uint32_t new_file_count = 0;
+  if (!GetVarint32(input, &new_file_count)) {
+    return Status::Corruption("SubcompactionProgressPerLevel",
+                              "new output file count");
+  }
+
+  assert(temporary_output_files_allocation.size() == 0);
+
+  temporary_output_files_allocation.reserve(new_file_count);
+
+  for (uint32_t i = 0; i < new_file_count; ++i) {
+    Slice file_input;
+    if (!GetLengthPrefixedSlice(input, &file_input)) {
+      return Status::Corruption("SubcompactionProgressPerLevel",
+                                "output file metadata");
+    }
+
+    uint32_t tag = 0;
+    if (!GetVarint32(&file_input, &tag) || tag != kNewFile4) {
+      return Status::Corruption("SubcompactionProgressPerLevel",
+                                "expected kNewFile4 tag");
+    }
+
+    int ignored_max_level = -1;
+    uint64_t ignored_min_log_number_to_keep = 0;
+    bool ignored_has_min_log_number_to_keep = false;
+    VersionEdit::NewFiles ignored_new_files;
+    FileMetaData file;
+
+    const char* err = VersionEdit::DecodeNewFile4From(
+        &file_input, ignored_max_level, ignored_min_log_number_to_keep,
+        ignored_has_min_log_number_to_keep, ignored_new_files, file);
+
+    if (err != nullptr) {
+      return Status::Corruption("SubcompactionProgressPerLevel", err);
+    }
+
+    temporary_output_files_allocation.push_back(std::move(file));
+  }
+
+  return Status::OK();
+}
+
+void SubcompactionProgress::EncodeTo(std::string* dst) const {
+  if (!next_internal_key_to_compact.empty()) {
+    PutVarint32(dst, SubcompactionProgressCustomTag::kNextInternalKeyToCompact);
+    PutLengthPrefixedSlice(dst, next_internal_key_to_compact);
+  }
+
+  if (num_processed_input_records > 0) {
+    PutVarint32(dst, SubcompactionProgressCustomTag::kNumProcessedInputRecords);
+    std::string varint_records;
+    PutVarint64(&varint_records, num_processed_input_records);
+    PutLengthPrefixedSlice(dst, varint_records);
+  }
+
+  if (output_level_progress.GetOutputFiles().size() >
+      output_level_progress.GetLastPersistedOutputFilesCount()) {
+    PutVarint32(dst, SubcompactionProgressCustomTag::kOutputLevelProgress);
+    std::string level_progress_data;
+    output_level_progress.EncodeTo(&level_progress_data);
+    PutLengthPrefixedSlice(dst, level_progress_data);
+  }
+
+  if (proximal_output_level_progress.GetOutputFiles().size() >
+      proximal_output_level_progress.GetLastPersistedOutputFilesCount()) {
+    PutVarint32(dst,
+                SubcompactionProgressCustomTag::kProximalOutputLevelProgress);
+    std::string level_progress_data;
+    proximal_output_level_progress.EncodeTo(&level_progress_data);
+    PutLengthPrefixedSlice(dst, level_progress_data);
+  }
+  PutVarint32(dst,
+              SubcompactionProgressCustomTag::kSubcompactionProgressTerminate);
+}
+
+Status SubcompactionProgress::DecodeFrom(Slice* input) {
+  Clear();
+
+  while (true) {
+    uint32_t custom_tag = 0;
+    if (!GetVarint32(input, &custom_tag)) {
+      return Status::Corruption("SubcompactionProgress",
+                                "custom field tag error");
+    }
+
+    if (custom_tag ==
+        SubcompactionProgressCustomTag::kSubcompactionProgressTerminate) {
+      break;
+    }
+
+    Slice field;
+    if (!GetLengthPrefixedSlice(input, &field)) {
+      return Status::Corruption("SubcompactionProgress",
+                                "custom field length prefixed slice error");
+    }
+
+    switch (custom_tag) {
+      case SubcompactionProgressCustomTag::kNextInternalKeyToCompact:
+        next_internal_key_to_compact = field.ToString();
+        break;
+
+      case SubcompactionProgressCustomTag::kNumProcessedInputRecords:
+        if (!GetVarint64(&field, &num_processed_input_records)) {
+          return Status::Corruption("SubcompactionProgress",
+                                    "invalid num_processed_input_records");
+        }
+        break;
+
+      case SubcompactionProgressCustomTag::kOutputLevelProgress: {
+        Status s = output_level_progress.DecodeFrom(&field);
+        if (!s.ok()) {
+          return s;
+        }
+        break;
+      }
+
+      case SubcompactionProgressCustomTag::kProximalOutputLevelProgress: {
+        Status s = proximal_output_level_progress.DecodeFrom(&field);
+        if (!s.ok()) {
+          return s;
+        }
+        break;
+      }
+
+      default:
+        if ((custom_tag & SubcompactionProgressCustomTag::
+                              kSubcompactionProgressCustomTagSafeIgnoreMask) !=
+            0) {
+          break;
+        } else {
+          return Status::NotSupported("SubcompactionProgress",
+                                      "unsupported critical custom field");
+        }
+    }
+  }
+
+  return Status::OK();
+}
+
+bool SubcompactionProgressBuilder::ProcessVersionEdit(const VersionEdit& edit) {
+  if (!edit.HasSubcompactionProgress()) {
+    return false;
+  }
+
+  const SubcompactionProgress& progress = edit.GetSubcompactionProgress();
+
+  MergeDeltaProgress(progress);
+
+  has_subcompaction_progress_ = true;
+
+  return true;
+}
+
+void SubcompactionProgressBuilder::MergeDeltaProgress(
+    const SubcompactionProgress& delta_progress) {
+  accumulated_subcompaction_progress_.next_internal_key_to_compact =
+      delta_progress.next_internal_key_to_compact;
+
+  accumulated_subcompaction_progress_.num_processed_input_records =
+      delta_progress.num_processed_input_records;
+
+  MaybeMergeDeltaProgressPerLevel(
+      accumulated_subcompaction_progress_.output_level_progress,
+      delta_progress.output_level_progress);
+
+  MaybeMergeDeltaProgressPerLevel(
+      accumulated_subcompaction_progress_.proximal_output_level_progress,
+      delta_progress.proximal_output_level_progress);
+}
+
+void SubcompactionProgressBuilder::MaybeMergeDeltaProgressPerLevel(
+    SubcompactionProgressPerLevel& accumulated_level_progress,
+    const SubcompactionProgressPerLevel& delta_level_progress) {
+  assert(delta_level_progress.GetOutputFiles().empty());
+
+  if (delta_level_progress.GetTempOutputFilesAllocation().empty()) {
+    return;
+  }
+
+  accumulated_level_progress.SetNumProcessedOutputRecords(
+      delta_level_progress.GetNumProcessedOutputRecords());
+
+  auto& accumulated_temp_files =
+      accumulated_level_progress.TempOutputFilesAllocation();
+
+  const auto& delta_temp_files =
+      delta_level_progress.GetTempOutputFilesAllocation();
+
+  accumulated_temp_files.reserve(accumulated_temp_files.size() +
+                                 delta_temp_files.size());
+
+  for (const auto& file_allocation : delta_temp_files) {
+    accumulated_temp_files.push_back(file_allocation);
+  }
+}
+
+void SubcompactionProgressBuilder::Clear() {
+  accumulated_subcompaction_progress_.Clear();
+  has_subcompaction_progress_ = false;
+}
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/db/version_edit.h b/db/version_edit.h
index 37175f1db3d4..83b04e577510 100644
--- a/db/version_edit.h
+++ b/db/version_edit.h
@@ -72,6 +72,23 @@ enum Tag : uint32_t {
   kWalAddition2,
   kWalDeletion2,
   kPersistUserDefinedTimestamps,
+  kSubcompactionProgress,
+};
+
+enum SubcompactionProgressPerLevelCustomTag : uint32_t {
+  kSubcompactionProgressPerLevelTerminate = 1,  // End of fields marker
+  kOutputFilesDelta = 2,
+  kNumProcessedOutputRecords = 3,
+  kSubcompactionProgressPerLevelCustomTagSafeIgnoreMask = 1 << 16,
+};
+
+enum SubcompactionProgressCustomTag : uint32_t {
+  kSubcompactionProgressTerminate = 1,  // End of fields marker
+  kNextInternalKeyToCompact = 2,
+  kNumProcessedInputRecords = 3,
+  kOutputLevelProgress = 4,
+  kProximalOutputLevelProgress = 5,
+  kSubcompactionProgressCustomTagSafeIgnoreMask = 1 << 16,
 };
 
 enum NewFileCustomTag : uint32_t {
@@ -440,12 +457,198 @@ struct LevelFilesBrief {
   }
 };
 
+struct SubcompactionProgressPerLevel {
+  uint64_t GetNumProcessedOutputRecords() const {
+    return num_processed_output_records_;
+  }
+
+  void SetNumProcessedOutputRecords(uint64_t num) {
+    num_processed_output_records_ = num;
+  }
+
+  const autovector<const FileMetaData*>& GetOutputFiles() const {
+    return output_files_;
+  }
+
+  void AddToOutputFiles(const FileMetaData* file) {
+    output_files_.push_back(file);
+  }
+
+  const autovector<FileMetaData>& GetTempOutputFilesAllocation() const {
+    return temp_output_files_allocation_;
+  }
+
+  autovector<FileMetaData>& TempOutputFilesAllocation() {
+    return temp_output_files_allocation_;
+  }
+
+  size_t GetLastPersistedOutputFilesCount() const {
+    return last_persisted_output_files_count_;
+  }
+
+  void UpdateLastPersistedOutputFilesCount() {
+    last_persisted_output_files_count_ =
+        std::max(output_files_.size(), temp_output_files_allocation_.size());
+  }
+
+  void EncodeTo(std::string* dst) const;
+
+  Status DecodeFrom(Slice* input);
+
+  void Clear() {
+    num_processed_output_records_ = 0;
+    output_files_.clear();
+    temp_output_files_allocation_.clear();
+    last_persisted_output_files_count_ = 0;
+  }
+
+  std::string ToString() const {
+    std::ostringstream oss;
+    oss << "SubcompactionProgressPerLevel{";
+    oss << " num_processed_output_records=" << num_processed_output_records_;
+    oss << ", output_files_count=" << output_files_.size();
+    oss << ", temp_output_files_allocation_count="
+        << temp_output_files_allocation_.size();
+    oss << ", last_persisted_output_files_count="
+        << last_persisted_output_files_count_;
+    oss << " }";
+    return oss.str();
+  }
+
+  void TEST_ClearOutputFiles() { output_files_.clear(); }
+
+ private:
+  uint64_t num_processed_output_records_ = 0;
+
+  // These pointers ONLY point to FileMetaData objects owned by compaction
+  // outputs. They are NEVER set to point to objects in
+  // `temp_output_files_allocation` This ensures stable pointers that don't get
+  // invalidated by copy/move operations on `SubcompactionProgress`
+  autovector<const FileMetaData*> output_files_ = {};
+
+  // These are ONLY used during deserialization from VersionEdit.
+  // They provide temporary storage before being moved to compaction outputs.
+  autovector<FileMetaData> temp_output_files_allocation_ = {};
+
+  // Number of files already persisted to help calculate the new output files to
+  // persist in the future. This is to prevent having to persist all the output
+  // files metadata so far every time of a "snapshot" of a progress is persisted
+  // which can lead to O(1+2+...+n) = O(n^2) file metadata being persisted. The
+  // current approach of persisting only the delta should always persist
+  // exactly the number (n) of output files in total.
+  size_t last_persisted_output_files_count_ = 0;
+
+  void EncodeOutputFiles(std::string* dst) const;
+
+  void EncodeTemporaryOutputFilesAllocation(std::string* dst) const;
+
+  Status DecodeOutputFiles(Slice* input,
+                           autovector<FileMetaData>& temp_storage);
+};
+
+struct SubcompactionProgress {
+  static constexpr uint64_t kInaccurateNumProcessedInputRecords =
+      std::numeric_limits<uint64_t>::max();
+
+  std::string next_internal_key_to_compact;
+
+  uint64_t num_processed_input_records = 0;
+
+  SubcompactionProgressPerLevel output_level_progress;
+
+  SubcompactionProgressPerLevel proximal_output_level_progress;
+
+  SubcompactionProgress() = default;
+
+  void Clear() {
+    next_internal_key_to_compact.clear();
+    num_processed_input_records = 0;
+    output_level_progress.Clear();
+    proximal_output_level_progress.Clear();
+  }
+
+  void EncodeTo(std::string* dst) const;
+
+  Status DecodeFrom(Slice* input);
+
+  std::string ToString() const {
+    std::ostringstream oss;
+    oss << "SubcompactionProgress{";
+    oss << " next_internal_key_to_compact="
+        << (next_internal_key_to_compact.empty()
+                ? "NONE"
+                : next_internal_key_to_compact);
+    oss << ", num_processed_input_records=" << num_processed_input_records;
+    oss << ", output_level_progress" << output_level_progress.ToString();
+    oss << ", proximal_output_level_progress"
+        << proximal_output_level_progress.ToString();
+    oss << " }";
+    return oss.str();
+  }
+};
+
+class VersionEdit;
+
+// Builder class to reconstruct complete subcompaction progress object
+// from multiple decoded VersionEdits containing delta output files information
+// of the same subcompaction. See
+// `SubcompactionProgressPerLevel::last_persisted_output_files_count_`'s comment
+//
+// WARNING: This class currently assumes all input VersionEdits contain progress
+// information for the SAME subcompaction. It does not validate
+// progress data from different subcompactions so mixing progress from
+// multiple subcompactions can result in corrupted state silently. The caller is
+// responsible for ensuring all VersionEdits processed by a single instance
+// of this builder correspond to the same subcompaction.
+class SubcompactionProgressBuilder {
+ public:
+  SubcompactionProgressBuilder() = default;
+
+  bool ProcessVersionEdit(const VersionEdit& edit);
+
+  const SubcompactionProgress& GetAccumulatedSubcompactionProgress() const {
+    return accumulated_subcompaction_progress_;
+  }
+
+  bool HasAccumulatedSubcompactionProgress() const {
+    return has_subcompaction_progress_;
+  }
+
+  void Clear();
+
+ private:
+  void MergeDeltaProgress(const SubcompactionProgress& delta_progress);
+
+  void MaybeMergeDeltaProgressPerLevel(
+      SubcompactionProgressPerLevel& accumulated_level_progress,
+      const SubcompactionProgressPerLevel& delta_level_progress);
+
+  SubcompactionProgress accumulated_subcompaction_progress_;
+  bool has_subcompaction_progress_ = false;
+};
+
+// Type alias for backward compatibility - vector of subcompaction progress
+using CompactionProgress = std::vector<SubcompactionProgress>;
+
 // The state of a DB at any given time is referred to as a Version.
 // Any modification to the Version is considered a Version Edit. A Version is
 // constructed by joining a sequence of Version Edits. Version Edits are written
 // to the MANIFEST file.
 class VersionEdit {
  public:
+  // Retrieve the table files added as well as their associated levels.
+  using NewFiles = std::vector<std::pair<int, FileMetaData>>;
+
+  static void EncodeToNewFile4(const FileMetaData& f, int level, size_t ts_sz,
+                               bool has_min_log_number_to_keep,
+                               uint64_t min_log_number_to_keep,
+                               bool& min_log_num_written, std::string* dst);
+
+  static const char* DecodeNewFile4From(Slice* input, int& max_level,
+                                        uint64_t& min_log_number_to_keep,
+                                        bool& has_min_log_number_to_keep,
+                                        NewFiles& new_files, FileMetaData& f);
+
   void Clear();
 
   void SetDBId(const std::string& db_id) {
@@ -564,8 +767,6 @@ class VersionEdit {
     }
   }
 
-  // Retrieve the table files added as well as their associated levels.
-  using NewFiles = std::vector<std::pair<int, FileMetaData>>;
   const NewFiles& GetNewFiles() const { return new_files_; }
 
   NewFiles& GetMutableNewFiles() { return new_files_; }
@@ -735,6 +936,22 @@ class VersionEdit {
     full_history_ts_low_ = std::move(full_history_ts_low);
   }
 
+  void SetSubcompactionProgress(const SubcompactionProgress& progress) {
+    has_subcompaction_progress_ = true;
+    subcompaction_progress_ = progress;
+  }
+
+  bool HasSubcompactionProgress() const { return has_subcompaction_progress_; }
+
+  const SubcompactionProgress& GetSubcompactionProgress() const {
+    return subcompaction_progress_;
+  }
+
+  void ClearSubcompactionProgress() {
+    has_subcompaction_progress_ = false;
+    subcompaction_progress_.Clear();
+  }
+
   // return true on success.
   // `ts_sz` is the size in bytes for the user-defined timestamp contained in
   // a user key. This argument is optional because it's only required for
@@ -757,15 +974,22 @@ class VersionEdit {
   std::string DebugJSON(int edit_num, bool hex_key = false) const;
 
  private:
-  bool GetLevel(Slice* input, int* level, const char** msg);
-
-  const char* DecodeNewFile4From(Slice* input);
-
+  // Decode level information from serialized VersionEdit data and and track the
+  // maximum level seen.
+  //
+  // Parameters:
+  //   input: Pointer to serialized data slice
+  //   level: Output parameter for the decoded level value
+  //   max_level: get updated if the decoded level is higher than passed in
+  //   value
+  //
+  // Returns: true on successful decode, false on parse error
+  static bool GetLevel(Slice* input, int* level, int& max_level);
   // Encode file boundaries `FileMetaData.smallest` and `FileMetaData.largest`.
   // User-defined timestamps in the user key will be stripped if they shouldn't
   // be persisted.
-  void EncodeFileBoundaries(std::string* dst, const FileMetaData& meta,
-                            size_t ts_sz) const;
+  static void EncodeFileBoundaries(std::string* dst, const FileMetaData& meta,
+                                   size_t ts_sz);
 
   int max_level_ = 0;
   std::string db_id_;
@@ -816,6 +1040,9 @@ class VersionEdit {
   std::string full_history_ts_low_;
   bool persist_user_defined_timestamps_ = true;
 
+  bool has_subcompaction_progress_ = false;
+  SubcompactionProgress subcompaction_progress_;
+
   // Newly created table files and blob files are eligible for deletion if they
   // are not registered as live files after the background jobs creating them
   // have finished. In case committing the VersionEdit containing such changes
diff --git a/db/version_edit_test.cc b/db/version_edit_test.cc
index 25235206994a..6b1df759a266 100644
--- a/db/version_edit_test.cc
+++ b/db/version_edit_test.cc
@@ -794,6 +794,373 @@ TEST(FileMetaDataTest, UpdateBoundariesBlobIndex) {
   }
 }
 
+class SubcompactionProgressTest : public VersionEditTest {
+ protected:
+  static constexpr uint64_t kTestFileSize = 1024;
+  static constexpr SequenceNumber kTestSmallestSeq = 50;
+  static constexpr SequenceNumber kTestLargestSeq = 150;
+  static constexpr uint64_t kTestOldestAncesterTime = 12345;
+  static constexpr uint64_t kTestFileCreationTime = 67890;
+  static constexpr uint64_t kTestEpochNumber = 10;
+  static const std::string kTestChecksumFuncName;
+
+  FileMetaData CreateTestFile(uint64_t file_number, const std::string& prefix) {
+    FileMetaData file;
+    file.fd = FileDescriptor(file_number, 0, kTestFileSize, kTestSmallestSeq,
+                             kTestLargestSeq);
+    file.smallest = InternalKey(prefix + "a", kTestSmallestSeq, kTypeValue);
+    file.largest = InternalKey(prefix + "z", kTestLargestSeq, kTypeValue);
+    file.oldest_ancester_time = kTestOldestAncesterTime;
+    file.file_creation_time = kTestFileCreationTime;
+    file.epoch_number = kTestEpochNumber;
+    file.file_checksum = "checksum_" + std::to_string(file_number);
+    file.file_checksum_func_name = kTestChecksumFuncName;
+    file.marked_for_compaction = false;
+    file.temperature = Temperature::kUnknown;
+    return file;
+  }
+
+  // Store external file metadata objects for testing
+  // These simulate files owned by CompactionOutputs
+  std::vector<FileMetaData> compaction_output_files_;
+  std::vector<FileMetaData> proximal_level_compaction_output_files_;
+
+  void SetupOutputFilePointers(
+      SubcompactionProgress& progress,
+      const std::vector<FileMetaData>& compaction_output_files,
+      const std::vector<FileMetaData>& proximal_level_compaction_output_files) {
+    if (!compaction_output_files.empty()) {
+      progress.output_level_progress.TEST_ClearOutputFiles();
+    }
+
+    for (const auto& file : compaction_output_files) {
+      progress.output_level_progress.AddToOutputFiles(&file);
+    }
+
+    if (!proximal_level_compaction_output_files.empty()) {
+      progress.proximal_output_level_progress.TEST_ClearOutputFiles();
+    }
+
+    for (const auto& file : proximal_level_compaction_output_files) {
+      progress.proximal_output_level_progress.AddToOutputFiles(&file);
+    }
+  }
+
+  SubcompactionProgress CreateSubcompactionProgress(
+      const std::string& next_key, uint64_t num_processed_input_records,
+      uint64_t num_processed_output_records,
+      uint64_t num_processed_proximal_level_output_records,
+      const std::vector<uint64_t>& output_file_numbers = {},
+      const std::vector<uint64_t>& proximal_file_numbers = {},
+      const std::string& file_prefix = "file_") {
+    SubcompactionProgress progress;
+    progress.next_internal_key_to_compact = next_key;
+    progress.num_processed_input_records = num_processed_input_records;
+    progress.output_level_progress.SetNumProcessedOutputRecords(
+        num_processed_output_records);
+    progress.proximal_output_level_progress.SetNumProcessedOutputRecords(
+        num_processed_proximal_level_output_records);
+
+    for (uint64_t file_num : output_file_numbers) {
+      compaction_output_files_.push_back(
+          CreateTestFile(file_num, file_prefix + "output_"));
+    }
+    for (uint64_t file_num : proximal_file_numbers) {
+      proximal_level_compaction_output_files_.push_back(
+          CreateTestFile(file_num, file_prefix + "proximal_"));
+    }
+
+    SetupOutputFilePointers(progress, compaction_output_files_,
+                            proximal_level_compaction_output_files_);
+
+    return progress;
+  }
+
+  std::pair<const VersionEdit, const SubcompactionProgress>
+  EncodeDecodeProgress(const SubcompactionProgress& progress) {
+    VersionEdit edit;
+    edit.SetSubcompactionProgress(progress);
+
+    std::string encoded;
+    EXPECT_TRUE(edit.EncodeTo(&encoded, 0 /* ts_sz */));
+
+    VersionEdit decoded_edit;
+    EXPECT_OK(decoded_edit.DecodeFrom(encoded));
+    EXPECT_TRUE(decoded_edit.HasSubcompactionProgress());
+
+    SubcompactionProgress decoded_progress =
+        decoded_edit.GetSubcompactionProgress();
+
+    return {std::move(decoded_edit), std::move(decoded_progress)};
+  }
+
+  void VerifyFileMetaDataEquality(const FileMetaData& expected,
+                                  const FileMetaData& actual) {
+    // Verify the major fields only
+    ASSERT_EQ(actual.fd.GetNumber(), expected.fd.GetNumber());
+    ASSERT_EQ(actual.fd.GetFileSize(), expected.fd.GetFileSize());
+    ASSERT_EQ(actual.smallest.Encode(), expected.smallest.Encode());
+    ASSERT_EQ(actual.largest.Encode(), expected.largest.Encode());
+    ASSERT_EQ(actual.oldest_ancester_time, expected.oldest_ancester_time);
+    ASSERT_EQ(actual.file_creation_time, expected.file_creation_time);
+    ASSERT_EQ(actual.epoch_number, expected.epoch_number);
+    ASSERT_EQ(actual.file_checksum, expected.file_checksum);
+    ASSERT_EQ(actual.file_checksum_func_name, expected.file_checksum_func_name);
+    ASSERT_EQ(actual.marked_for_compaction, expected.marked_for_compaction);
+    ASSERT_EQ(actual.temperature, expected.temperature);
+  }
+
+  void VerifyProgressEquality(const SubcompactionProgress& expected,
+                              const SubcompactionProgress& actual) {
+    ASSERT_EQ(actual.next_internal_key_to_compact,
+              expected.next_internal_key_to_compact);
+
+    ASSERT_EQ(actual.num_processed_input_records,
+              expected.num_processed_input_records);
+
+    for (const bool is_proximal_level : {false, true}) {
+      const SubcompactionProgressPerLevel&
+          actual_subcompaction_progress_by_level =
+              is_proximal_level ? actual.proximal_output_level_progress
+                                : actual.output_level_progress;
+
+      const SubcompactionProgressPerLevel&
+          expected_subcompaction_progress_by_level =
+              is_proximal_level ? expected.proximal_output_level_progress
+                                : expected.output_level_progress;
+
+      ASSERT_EQ(
+          actual_subcompaction_progress_by_level.GetNumProcessedOutputRecords(),
+          expected_subcompaction_progress_by_level
+              .GetNumProcessedOutputRecords());
+
+      ASSERT_EQ(
+          actual_subcompaction_progress_by_level.GetTempOutputFilesAllocation()
+              .size(),
+          expected_subcompaction_progress_by_level.GetOutputFiles().size());
+
+      for (size_t i = 0;
+           i < expected_subcompaction_progress_by_level.GetOutputFiles().size();
+           ++i) {
+        VerifyFileMetaDataEquality(
+            *expected_subcompaction_progress_by_level.GetOutputFiles()[i],
+            actual_subcompaction_progress_by_level
+                .GetTempOutputFilesAllocation()[i]);
+      }
+    }
+  }
+};
+
+const std::string SubcompactionProgressTest::kTestChecksumFuncName = "crc32c";
+
+TEST_F(SubcompactionProgressTest, BasicEncodeDecode) {
+  // Create progress with files for both levels
+  SubcompactionProgress progress = CreateSubcompactionProgress(
+      "key_100",  // next_internal_key_to_compact
+      500,        // num_processed_input_records
+      400,        // num_processed_output_records
+      100,        // num_processed_proximal_level_output_records
+      {1},        // output_file_numbers
+      {2},        // proximal_file_numbers
+      "test_"     // file_prefix
+  );
+
+  auto [ignored, decoded_progress] = EncodeDecodeProgress(progress);
+
+  VerifyProgressEquality(progress, decoded_progress);
+}
+
+TEST_F(SubcompactionProgressTest, OutputFilesDeltaEncodeDecode) {
+  // Test Delta Encoding/Decoding
+  SubcompactionProgress initial_progress = CreateSubcompactionProgress(
+      "key_100",  // next_internal_key_to_compact
+      100,        // num_processed_input_records
+      40,         // num_processed_output_records
+      60,         // num_processed_proximal_level_output_records
+      {1},        // output_file_numbers
+      {2},        // proximal_file_numbers
+      "initial_"  // file_prefix
+  );
+
+  auto [initial_decoded_edit, ignored_1] =
+      EncodeDecodeProgress(initial_progress);
+  initial_progress.output_level_progress.UpdateLastPersistedOutputFilesCount();
+  initial_progress.proximal_output_level_progress
+      .UpdateLastPersistedOutputFilesCount();
+
+  // Add one new output file to output and proximal level
+  SubcompactionProgress updated_progress = initial_progress;
+  updated_progress.next_internal_key_to_compact = "key_300";
+  updated_progress.num_processed_input_records = 1000;
+
+  updated_progress.output_level_progress.SetNumProcessedOutputRecords(400);
+  FileMetaData new_file = CreateTestFile(3, "new_");
+  compaction_output_files_.push_back(new_file);
+
+  updated_progress.proximal_output_level_progress.SetNumProcessedOutputRecords(
+      600);
+  FileMetaData new_file_proximal = CreateTestFile(4, "new_");
+  proximal_level_compaction_output_files_.push_back(new_file_proximal);
+
+  SetupOutputFilePointers(updated_progress, compaction_output_files_,
+                          proximal_level_compaction_output_files_);
+
+  auto [delta_decoded_edit, delta_decoded_progress] =
+      EncodeDecodeProgress(updated_progress);
+
+  ASSERT_EQ(delta_decoded_progress.next_internal_key_to_compact,
+            updated_progress.next_internal_key_to_compact);
+
+  ASSERT_EQ(delta_decoded_progress.num_processed_input_records,
+            updated_progress.num_processed_input_records);
+
+  for (const bool& is_proximal_level : {false, true}) {
+    const SubcompactionProgressPerLevel& delta_progress_per_level =
+        is_proximal_level
+            ? delta_decoded_progress.proximal_output_level_progress
+            : delta_decoded_progress.output_level_progress;
+
+    const SubcompactionProgressPerLevel& updated_progress_per_level =
+        is_proximal_level ? updated_progress.proximal_output_level_progress
+                          : updated_progress.output_level_progress;
+
+    ASSERT_EQ(delta_progress_per_level.GetNumProcessedOutputRecords(),
+              updated_progress_per_level.GetNumProcessedOutputRecords());
+
+    // Delta encoding: only the one newly added file is present, not the
+    // previously persisted file
+    ASSERT_EQ(delta_progress_per_level.GetTempOutputFilesAllocation().size(),
+              1);
+
+    ASSERT_EQ(delta_progress_per_level.GetTempOutputFilesAllocation()[0]
+                  .fd.GetNumber(),
+              is_proximal_level ? new_file_proximal.fd.GetNumber()
+                                : new_file.fd.GetNumber());
+  }
+
+  // Test SubcompactionProgressBuilder
+  SubcompactionProgressBuilder builder;
+  ASSERT_FALSE(builder.HasAccumulatedSubcompactionProgress());
+
+  ASSERT_TRUE(builder.ProcessVersionEdit(initial_decoded_edit));
+  ASSERT_TRUE(builder.HasAccumulatedSubcompactionProgress());
+  ASSERT_TRUE(builder.ProcessVersionEdit(delta_decoded_edit));
+
+  const auto& accumulated_progress =
+      builder.GetAccumulatedSubcompactionProgress();
+
+  ASSERT_EQ(accumulated_progress.next_internal_key_to_compact,
+            updated_progress.next_internal_key_to_compact);
+
+  ASSERT_EQ(accumulated_progress.num_processed_input_records,
+            updated_progress.num_processed_input_records);
+
+  for (const bool& is_proximal_level : {false, true}) {
+    const SubcompactionProgressPerLevel& accumulated_progress_per_level =
+        is_proximal_level ? accumulated_progress.output_level_progress
+                          : accumulated_progress.proximal_output_level_progress;
+
+    const SubcompactionProgressPerLevel& updated_progress_per_level =
+        is_proximal_level ? updated_progress.output_level_progress
+                          : updated_progress.proximal_output_level_progress;
+
+    ASSERT_EQ(accumulated_progress_per_level.GetNumProcessedOutputRecords(),
+              updated_progress_per_level.GetNumProcessedOutputRecords());
+
+    ASSERT_EQ(
+        accumulated_progress_per_level.GetTempOutputFilesAllocation().size(),
+        updated_progress_per_level.GetOutputFiles().size());
+
+    std::set<uint64_t> accumulated_file_numbers;
+
+    for (const auto& file :
+         accumulated_progress_per_level.GetTempOutputFilesAllocation()) {
+      accumulated_file_numbers.insert(file.fd.GetNumber());
+    }
+
+    std::set<uint64_t> expected_file_numbers;
+
+    for (const auto& file : updated_progress_per_level.GetOutputFiles()) {
+      expected_file_numbers.insert(file->fd.GetNumber());
+    }
+
+    ASSERT_EQ(accumulated_file_numbers, expected_file_numbers);
+  }
+
+  // ===== PART 3: Test Builder Reset =====
+  builder.Clear();
+  ASSERT_FALSE(builder.HasAccumulatedSubcompactionProgress());
+}
+
+TEST_F(SubcompactionProgressTest, UnknownTags) {
+  SubcompactionProgress progress;
+  std::string encoded;
+
+  // 1. Test unknown ignorable tag
+  progress.next_internal_key_to_compact = "test_key";
+  progress.num_processed_input_records = 100;
+
+  PutVarint32(&encoded,
+              SubcompactionProgressCustomTag::kNextInternalKeyToCompact);
+  PutLengthPrefixedSlice(&encoded, progress.next_internal_key_to_compact);
+
+  PutVarint32(&encoded,
+              SubcompactionProgressCustomTag::kNumProcessedInputRecords);
+  std::string varint_records;
+  PutVarint64(&varint_records, progress.num_processed_input_records);
+  PutLengthPrefixedSlice(&encoded, varint_records);
+
+  // Manually encode with unknown ignorable tag (has
+  // SubcompactionProgressCustomTag::kSubcompactionProgressCustomTagSafeIgnoreMask
+  // bit set)
+  uint32_t unknown_ignorable_tag =
+      SubcompactionProgressCustomTag::
+          kSubcompactionProgressCustomTagSafeIgnoreMask +
+      1;
+  PutVarint32(&encoded, unknown_ignorable_tag);
+  PutLengthPrefixedSlice(&encoded, "future_data");
+
+  PutVarint32(&encoded,
+              SubcompactionProgressCustomTag::kSubcompactionProgressTerminate);
+
+  // Test decoding - should succeed and ignore unknown tag
+  Slice input(encoded);
+  SubcompactionProgress decoded_progress;
+  Status s = decoded_progress.DecodeFrom(&input);
+  ASSERT_OK(s);
+
+  // Verify known fields are preserved
+  ASSERT_EQ(decoded_progress.next_internal_key_to_compact,
+            progress.next_internal_key_to_compact);
+  ASSERT_EQ(decoded_progress.num_processed_input_records,
+            progress.num_processed_input_records);
+
+  // 2. Test unknown non-ignorable tag
+  encoded.clear();
+  PutVarint32(&encoded,
+              SubcompactionProgressCustomTag::kNextInternalKeyToCompact);
+  PutLengthPrefixedSlice(&encoded, "test_key");
+
+  // Manually encode with unknown non-ignorable tag (do not have
+  // SubcompactionProgressCustomTag::kSubcompactionProgressCustomTagSafeIgnoreMask
+  // bit set)
+  uint32_t unknown_critical_tag =
+      SubcompactionProgressCustomTag::
+          kSubcompactionProgressCustomTagSafeIgnoreMask -
+      1;
+  PutVarint32(&encoded, unknown_critical_tag);
+  PutLengthPrefixedSlice(&encoded, "critical_future_data");
+  PutVarint32(&encoded,
+              SubcompactionProgressCustomTag::kSubcompactionProgressTerminate);
+
+  // Test decoding - should fail on critical unknown tag
+  Slice critical_input(encoded);
+  SubcompactionProgress critical_progress;
+  Status critical_status = critical_progress.DecodeFrom(&critical_input);
+  ASSERT_NOK(critical_status);
+  ASSERT_TRUE(critical_status.IsNotSupported());
+}
+
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {

From 54373ba0e8c149c36674cbe10c7c5d367ff69993 Mon Sep 17 00:00:00 2001
From: Changyu Bi <changyubi@meta.com>
Date: Mon, 22 Sep 2025 15:30:24 -0700
Subject: [PATCH 302/500] Revert "Create a new API FileSystem::SyncFile for
 file sync (#13762)" (#13987)

Summary:
This is causing some internal failure, we decide to revert this for now until we have a proper fix.

This reverts commit 961880b4580d0b83225e8f718bb51bec329236e7.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13987

Reviewed By: anand1976

Differential Revision: D82990294

Pulled By: cbi42

fbshipit-source-id: 5f5b4d18d0afe47599738d27e11e3eb2d08d88a0
---
 db/external_sst_file_basic_test.cc    | 12 +++----
 db/external_sst_file_ingestion_job.cc | 49 ++++++++++++++++-----------
 env/composite_env_wrapper.h           |  6 ----
 env/env.cc                            | 44 ------------------------
 env/env_encryption.cc                 | 11 ------
 env/file_system.cc                    | 17 ----------
 env/mock_env.cc                       |  8 -----
 env/mock_env.h                        |  4 ---
 include/rocksdb/env.h                 | 12 -------
 include/rocksdb/file_system.h         | 18 ----------
 utilities/fault_injection_env.cc      | 11 ------
 utilities/fault_injection_env.h       |  3 --
 utilities/fault_injection_fs.cc       | 11 ------
 utilities/fault_injection_fs.h        |  4 ---
 14 files changed, 35 insertions(+), 175 deletions(-)

diff --git a/db/external_sst_file_basic_test.cc b/db/external_sst_file_basic_test.cc
index 3d91c62a62fd..c93806aaea09 100644
--- a/db/external_sst_file_basic_test.cc
+++ b/db/external_sst_file_basic_test.cc
@@ -1311,7 +1311,7 @@ TEST_F(ExternalSSTFileBasicTest, SyncFailure) {
     });
     if (i == 0) {
       SyncPoint::GetInstance()->SetCallBack(
-          "ExternalSstFileIngestionJob::CheckSyncReturnCode", [&](void* s) {
+          "ExternalSstFileIngestionJob::Prepare:Reopen", [&](void* s) {
             Status* status = static_cast<Status*>(s);
             if (status->IsNotSupported()) {
               no_sync = true;
@@ -1372,11 +1372,11 @@ TEST_F(ExternalSSTFileBasicTest, ReopenNotSupported) {
   options.create_if_missing = true;
   options.env = env_;
 
-  SyncPoint::GetInstance()->SetCallBack("FileSystem::SyncFile:Open",
-                                        [&](void* arg) {
-                                          Status* s = static_cast<Status*>(arg);
-                                          *s = Status::NotSupported();
-                                        });
+  SyncPoint::GetInstance()->SetCallBack(
+      "ExternalSstFileIngestionJob::Prepare:Reopen", [&](void* arg) {
+        Status* s = static_cast<Status*>(arg);
+        *s = Status::NotSupported();
+      });
   SyncPoint::GetInstance()->EnableProcessing();
 
   DestroyAndReopen(options);
diff --git a/db/external_sst_file_ingestion_job.cc b/db/external_sst_file_ingestion_job.cc
index f7a34ab78cc7..fd7e195dd055 100644
--- a/db/external_sst_file_ingestion_job.cc
+++ b/db/external_sst_file_ingestion_job.cc
@@ -163,26 +163,35 @@ Status ExternalSstFileIngestionJob::Prepare(
         // It is unsafe to assume application had sync the file and file
         // directory before ingest the file. For integrity of RocksDB we need
         // to sync the file.
-        TEST_SYNC_POINT("ExternalSstFileIngestionJob::BeforeSyncIngestedFile");
-        auto s = fs_->SyncFile(path_inside_db, env_options_, IOOptions(),
-                               db_options_.use_fsync, nullptr);
-        TEST_SYNC_POINT("ExternalSstFileIngestionJob::AfterSyncIngestedFile");
-        TEST_SYNC_POINT_CALLBACK(
-            "ExternalSstFileIngestionJob::CheckSyncReturnCode", &s);
-        if (!s.ok()) {
-          if (s.IsNotSupported()) {
-            // Some file systems (especially remote/distributed) don't support
-            // SyncFile API. Ignore the NotSupported error in that case.
-            ROCKS_LOG_WARN(db_options_.info_log,
-                           "After link the file, SyncFile API is not supported "
-                           "for file %s: %s",
-                           path_inside_db.c_str(), status.ToString().c_str());
-          } else {
-            // for other errors, propagate the error
-            status = s;
-            ROCKS_LOG_WARN(db_options_.info_log,
-                           "Failed to sync ingested file %s: %s",
-                           path_inside_db.c_str(), status.ToString().c_str());
+
+        // TODO(xingbo), We should in general be moving away from production
+        // uses of ReuseWritableFile (except explicitly for WAL recycling),
+        // ReopenWritableFile, and NewRandomRWFile. We should create a
+        // FileSystem::SyncFile/FsyncFile API that by default does the
+        // re-open+sync+close combo but can (a) be reused easily, and (b) be
+        // overridden to do that more cleanly, e.g. in EncryptedEnv.
+        // https://github.com/facebook/rocksdb/issues/13741
+        std::unique_ptr<FSWritableFile> file_to_sync;
+        Status s = fs_->ReopenWritableFile(path_inside_db, env_options_,
+                                           &file_to_sync, nullptr);
+        TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Prepare:Reopen",
+                                 &s);
+        // Some file systems (especially remote/distributed) don't support
+        // reopening a file for writing and don't require reopening and
+        // syncing the file. Ignore the NotSupported error in that case.
+        if (!s.IsNotSupported()) {
+          status = s;
+          if (status.ok()) {
+            TEST_SYNC_POINT(
+                "ExternalSstFileIngestionJob::BeforeSyncIngestedFile");
+            status = SyncIngestedFile(file_to_sync.get());
+            TEST_SYNC_POINT(
+                "ExternalSstFileIngestionJob::AfterSyncIngestedFile");
+            if (!status.ok()) {
+              ROCKS_LOG_WARN(db_options_.info_log,
+                             "Failed to sync ingested file %s: %s",
+                             path_inside_db.c_str(), status.ToString().c_str());
+            }
           }
         }
       } else if (status.IsNotSupported() &&
diff --git a/env/composite_env_wrapper.h b/env/composite_env_wrapper.h
index e2eab9957f85..f9b9c6994e53 100644
--- a/env/composite_env_wrapper.h
+++ b/env/composite_env_wrapper.h
@@ -142,12 +142,6 @@ class CompositeEnv : public Env {
     return file_system_->LinkFile(s, t, io_opts, &dbg);
   }
 
-  Status SyncFile(const std::string& fname, const EnvOptions& env_options,
-                  bool use_fsync) override {
-    return file_system_->SyncFile(fname, env_options, IOOptions(), use_fsync,
-                                  nullptr);
-  }
-
   Status NumFileLinks(const std::string& fname, uint64_t* count) override {
     IOOptions io_opts;
     IODebugContext dbg;
diff --git a/env/env.cc b/env/env.cc
index 4047f2797038..bfc226a20928 100644
--- a/env/env.cc
+++ b/env/env.cc
@@ -529,13 +529,6 @@ class LegacyFileSystemWrapper : public FileSystem {
     return status_to_io_status(target_->LinkFile(s, t));
   }
 
-  IOStatus SyncFile(const std::string& fname, const FileOptions& file_options,
-                    const IOOptions& /*io_options*/, bool use_fsync,
-                    IODebugContext* /*dbg*/) override {
-    return status_to_io_status(
-        target_->SyncFile(fname, file_options, use_fsync));
-  }
-
   IOStatus NumFileLinks(const std::string& fname, const IOOptions& /*options*/,
                         uint64_t* count, IODebugContext* /*dbg*/) override {
     return status_to_io_status(target_->NumFileLinks(fname, count));
@@ -878,43 +871,6 @@ std::string Env::GenerateUniqueId() {
   return result;
 }
 
-// This API Env::SyncFile is used for testing for 2 reasons:
-//
-// 1. The default implementation of SyncFile API is essentially a wrapper of
-// other FileSystem APIs. FaultInjectionTestEnv uses this default
-// implementation to call other FileSystem APIs defined at
-// FaultInjectionTestEnv class to inject failurses. See
-// FaultInjectionTestEnv::SyncFile for more details
-//
-// 2. Some of old tests are using LegacyFileSystemWrapper.
-// LegacyFileSystemWrapper forwards the API call to EnvWrapper, which forwards
-// to CompositeEnv, and then forwards to the actual FileSystem implemention.
-// Without this API in Env, LegacyFileSystemWrapper will not be able to
-// forward the API call to EnvWrapper, causing the default FileSystem API to
-// be called.
-//
-// Due to the above reason, adding a new API in FileSystem, would very likely
-// require the same API to be added to Env.
-//
-// TODO xingbo. Getting rid of FileSystem functions from Env.
-// We need to simplify the relationship between Env and FileSystem. At least
-// for internal test, we should stop using Env and switch to FileSystem, if
-// possible. Related github issue #9274
-Status Env::SyncFile(const std::string& fname, const EnvOptions& env_options,
-                     bool use_fsync) {
-  std::unique_ptr<WritableFile> file_to_sync;
-  auto status = ReopenWritableFile(fname, &file_to_sync, env_options);
-  TEST_SYNC_POINT_CALLBACK("FileSystem::SyncFile:Open", &status);
-  if (status.ok()) {
-    if (use_fsync) {
-      status = file_to_sync->Fsync();
-    } else {
-      status = file_to_sync->Sync();
-    }
-  }
-  return status;
-}
-
 SequentialFile::~SequentialFile() = default;
 
 RandomAccessFile::~RandomAccessFile() = default;
diff --git a/env/env_encryption.cc b/env/env_encryption.cc
index 98c1f38083a8..9565b9d9bc90 100644
--- a/env/env_encryption.cc
+++ b/env/env_encryption.cc
@@ -664,8 +664,6 @@ class EncryptedFileSystemImpl : public EncryptedFileSystem {
                               const FileOptions& options,
                               std::unique_ptr<FSWritableFile>* result,
                               IODebugContext* dbg) override {
-    // TODO xingbo Add unit test for the new implementation of
-    // EncryptedFileSysmteImpl::ReopenWritableFile.
     result->reset();
     if (options.use_mmap_reads || options.use_mmap_writes) {
       return IOStatus::InvalidArgument();
@@ -816,15 +814,6 @@ class EncryptedFileSystemImpl : public EncryptedFileSystem {
     return status;
   }
 
-  IOStatus SyncFile(const std::string& fname, const FileOptions& file_options,
-                    const IOOptions& io_options, bool use_fsync,
-                    IODebugContext* dbg) override {
-    // Use the underlying file system to sync the file, as we don't need to
-    // read/write the file.
-    return FileSystemWrapper::SyncFile(fname, file_options, io_options,
-                                       use_fsync, dbg);
-  }
-
  private:
   std::shared_ptr<EncryptionProvider> provider_;
 };
diff --git a/env/file_system.cc b/env/file_system.cc
index 5d160078965d..fad48cc1175f 100644
--- a/env/file_system.cc
+++ b/env/file_system.cc
@@ -107,23 +107,6 @@ IOStatus FileSystem::ReuseWritableFile(const std::string& fname,
   return NewWritableFile(fname, opts, result, dbg);
 }
 
-IOStatus FileSystem::SyncFile(const std::string& fname,
-                              const FileOptions& file_options,
-                              const IOOptions& io_options, bool use_fsync,
-                              IODebugContext* dbg) {
-  std::unique_ptr<FSWritableFile> file_to_sync;
-  auto status = ReopenWritableFile(fname, file_options, &file_to_sync, dbg);
-  TEST_SYNC_POINT_CALLBACK("FileSystem::SyncFile:Open", &status);
-  if (status.ok()) {
-    if (use_fsync) {
-      status = file_to_sync->Fsync(io_options, dbg);
-    } else {
-      status = file_to_sync->Sync(io_options, dbg);
-    }
-  }
-  return status;
-}
-
 IOStatus FileSystem::NewLogger(const std::string& fname,
                                const IOOptions& io_opts,
                                std::shared_ptr<Logger>* result,
diff --git a/env/mock_env.cc b/env/mock_env.cc
index 3088984445fe..0f9e5ab47f67 100644
--- a/env/mock_env.cc
+++ b/env/mock_env.cc
@@ -957,14 +957,6 @@ IOStatus MockFileSystem::LinkFile(const std::string& src,
   return IOStatus::OK();
 }
 
-IOStatus MockFileSystem::SyncFile(const std::string& /*fname*/,
-                                  const FileOptions& /*file_options*/,
-                                  const IOOptions& /*io_options*/,
-                                  bool /*use_fsync*/, IODebugContext* /*dbg*/) {
-  // Noop
-  return IOStatus::OK();
-}
-
 IOStatus MockFileSystem::NewLogger(const std::string& fname,
                                    const IOOptions& io_opts,
                                    std::shared_ptr<Logger>* result,
diff --git a/env/mock_env.h b/env/mock_env.h
index 040235e1ab8a..406a31f63570 100644
--- a/env/mock_env.h
+++ b/env/mock_env.h
@@ -86,10 +86,6 @@ class MockFileSystem : public FileSystem {
   IOStatus LinkFile(const std::string& /*src*/, const std::string& /*target*/,
                     const IOOptions& /*options*/,
                     IODebugContext* /*dbg*/) override;
-  IOStatus SyncFile(const std::string& /*fname*/,
-                    const FileOptions& /*file_options*/,
-                    const IOOptions& /*io_options*/, bool /*use_fsync*/,
-                    IODebugContext* /*dbg*/) override;
   IOStatus LockFile(const std::string& fname, const IOOptions& options,
                     FileLock** lock, IODebugContext* dbg) override;
   IOStatus UnlockFile(FileLock* lock, const IOOptions& options,
diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h
index c0f667ff8c48..03a64b968982 100644
--- a/include/rocksdb/env.h
+++ b/include/rocksdb/env.h
@@ -385,13 +385,6 @@ class Env : public Customizable {
     return Status::NotSupported("LinkFile is not supported for this Env");
   }
 
-  // Sync the file content to file system.
-  // This API is only used for testing.
-  // See FileSystem::SyncFile comment for details
-  virtual Status SyncFile(const std::string& /*fname*/,
-                          const EnvOptions& /*env_options*/,
-                          bool /*use_fsync*/);
-
   virtual Status NumFileLinks(const std::string& /*fname*/,
                               uint64_t* /*count*/) {
     return Status::NotSupported(
@@ -1684,11 +1677,6 @@ class EnvWrapper : public Env {
     return target_.env->LinkFile(s, t);
   }
 
-  Status SyncFile(const std::string& fname, const EnvOptions& env_options,
-                  bool use_fsync) override {
-    return target_.env->SyncFile(fname, env_options, use_fsync);
-  }
-
   Status NumFileLinks(const std::string& fname, uint64_t* count) override {
     return target_.env->NumFileLinks(fname, count);
   }
diff --git a/include/rocksdb/file_system.h b/include/rocksdb/file_system.h
index c0a064d6639f..a68dee516679 100644
--- a/include/rocksdb/file_system.h
+++ b/include/rocksdb/file_system.h
@@ -606,18 +606,6 @@ class FileSystem : public Customizable {
         "LinkFile is not supported for this FileSystem");
   }
 
-  // Sync the file content to file system.
-  // The default implementation would open, sync and close the file.
-  // This function could be overridden with no-op, if the file system
-  // automatically sync the data when file is closed.
-  // This is used when a user-provided file, probably unsynced, is pulled into a
-  // context where power-outage-proof persistence is required (e.g.
-  // IngestExternalFile without copy).
-  virtual IOStatus SyncFile(const std::string& fname,
-                            const FileOptions& file_options,
-                            const IOOptions& io_options, bool use_fsync,
-                            IODebugContext* dbg);
-
   virtual IOStatus NumFileLinks(const std::string& /*fname*/,
                                 const IOOptions& /*options*/,
                                 uint64_t* /*count*/, IODebugContext* /*dbg*/) {
@@ -1604,12 +1592,6 @@ class FileSystemWrapper : public FileSystem {
     return target_->LinkFile(s, t, options, dbg);
   }
 
-  IOStatus SyncFile(const std::string& fname, const FileOptions& file_options,
-                    const IOOptions& io_options, bool use_fsync,
-                    IODebugContext* dbg) override {
-    return target_->SyncFile(fname, file_options, io_options, use_fsync, dbg);
-  }
-
   IOStatus NumFileLinks(const std::string& fname, const IOOptions& options,
                         uint64_t* count, IODebugContext* dbg) override {
     return target_->NumFileLinks(fname, options, count, dbg);
diff --git a/utilities/fault_injection_env.cc b/utilities/fault_injection_env.cc
index 1bbe587f52cb..6aedb87ab634 100644
--- a/utilities/fault_injection_env.cc
+++ b/utilities/fault_injection_env.cc
@@ -464,17 +464,6 @@ Status FaultInjectionTestEnv::LinkFile(const std::string& s,
   return ret;
 }
 
-Status FaultInjectionTestEnv::SyncFile(const std::string& fname,
-                                       const EnvOptions& env_options,
-                                       bool use_fsync) {
-  // Call the default implement of SyncFile API in Env, so that it would call
-  // other FileSystem API at FaultInjectionTestEnv layer for failure injection.
-  // Otherwise, the default behavior is WrapperEnv::SyncFile, which forward the
-  // call to the underlying FileSystem, instead of the ones in
-  // FaultInjectionTestEnv.
-  return Env::SyncFile(fname, env_options, use_fsync);
-}
-
 void FaultInjectionTestEnv::WritableFileClosed(const FileState& state) {
   MutexLock l(&mutex_);
   if (open_managed_files_.find(state.filename_) != open_managed_files_.end()) {
diff --git a/utilities/fault_injection_env.h b/utilities/fault_injection_env.h
index fedcb2ae22ff..eaece031848d 100644
--- a/utilities/fault_injection_env.h
+++ b/utilities/fault_injection_env.h
@@ -177,9 +177,6 @@ class FaultInjectionTestEnv : public EnvWrapper {
 
   Status LinkFile(const std::string& s, const std::string& t) override;
 
-  Status SyncFile(const std::string& fname, const EnvOptions& env_options,
-                  bool use_fsync) override;
-
 // Undef to eliminate clash on Windows
 #undef GetFreeSpace
   Status GetFreeSpace(const std::string& path, uint64_t* disk_free) override {
diff --git a/utilities/fault_injection_fs.cc b/utilities/fault_injection_fs.cc
index 338c5ff66577..e658f114f860 100644
--- a/utilities/fault_injection_fs.cc
+++ b/utilities/fault_injection_fs.cc
@@ -1200,17 +1200,6 @@ IOStatus FaultInjectionTestFS::LinkFile(const std::string& s,
   }
   return io_s;
 }
-IOStatus FaultInjectionTestFS::SyncFile(const std::string& fname,
-                                        const FileOptions& file_options,
-                                        const IOOptions& io_options,
-                                        bool use_fsync, IODebugContext* dbg) {
-  // Call the default implement of SyncFile API in FileSystem, so that it would
-  // call other FileSystem API at FaultInjectionTestFS layer for failure
-  // injection. Otherwise, the default behavior is calling target()->SyncFile,
-  // which forward the call to the underlying FileSystem, instead of the ones in
-  // FaultInjectionTestFS.
-  return FileSystem::SyncFile(fname, file_options, io_options, use_fsync, dbg);
-}
 
 IOStatus FaultInjectionTestFS::NumFileLinks(const std::string& fname,
                                             const IOOptions& options,
diff --git a/utilities/fault_injection_fs.h b/utilities/fault_injection_fs.h
index 54d657d17d97..b4cb122273d6 100644
--- a/utilities/fault_injection_fs.h
+++ b/utilities/fault_injection_fs.h
@@ -312,10 +312,6 @@ class FaultInjectionTestFS : public FileSystemWrapper {
   IOStatus LinkFile(const std::string& src, const std::string& target,
                     const IOOptions& options, IODebugContext* dbg) override;
 
-  IOStatus SyncFile(const std::string& fname, const FileOptions& file_options,
-                    const IOOptions& io_options, bool use_fsync,
-                    IODebugContext* dbg) override;
-
   IOStatus NumFileLinks(const std::string& fname, const IOOptions& options,
                         uint64_t* count, IODebugContext* dbg) override;
 

From eaeafa78190013aad60d0f2888b04f8beea43169 Mon Sep 17 00:00:00 2001
From: Hui Xiao <huixiao@fb.com>
Date: Mon, 22 Sep 2025 17:44:16 -0700
Subject: [PATCH 303/500] Revert "Improve random seed override support in
 stress test (#13952)" (#13989)

Summary:
**Context/Summary**
This reverts commit 73432a3f369d2f6331b68c907a0ffac4e9a3d653. This is due to it mysteriously fails our internal CI running with this change to db_crashtest.py. The root-cause is unknown but the error only reproed with this commit frequently but not the one before it. The error message appears to be the command parsing leading to the db_stress binary can't be found

```
Traceback (most recent call last):
  File "/data/sandcastle/boxes/trunk-hg-full-fbsource/fbcode/internal_repo_rocksdb/repo/tools/db_crashtest.py", line 1638, in <module>
    main()
  File "/data/sandcastle/boxes/trunk-hg-full-fbsource/fbcode/internal_repo_rocksdb/repo/tools/db_crashtest.py", line 1627, in main
    blackbox_crash_main(args, unknown_args)
  File "/data/sandcastle/boxes/trunk-hg-full-fbsource/fbcode/internal_repo_rocksdb/repo/tools/db_crashtest.py", line 1347, in blackbox_crash_main
    hit_timeout, retcode, outs, errs = execute_cmd(cmd, cmd_params["interval"])
                                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/data/sandcastle/boxes/trunk-hg-full-fbsource/fbcode/internal_repo_rocksdb/repo/tools/db_crashtest.py", line 1283, in execute_cmd
    child = subprocess.Popen(cmd, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/fbcode/platform010/lib/python3.12/subprocess.py", line 1028, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "/usr/local/fbcode/platform010/lib/python3.12/subprocess.py", line 1957, in _execute_child
    raise child_exception_type(errno_num, err_msg, err_filename)
FileNotFoundError: [Errno 2] No such file or directory: './db_stress'
```

**Test plan**
- Rehearsal crash test

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13989

Reviewed By: xingbowang

Differential Revision: D83010751

Pulled By: hx235

fbshipit-source-id: d8cfc70564074065b6bb8a3986d6c1011064dd5e
---
 tools/db_crashtest.py | 45 +++++++++----------------------------------
 1 file changed, 9 insertions(+), 36 deletions(-)

diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index 3f1bfc1c1d16..cf71c9dcdf94 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -11,48 +11,22 @@
 import tempfile
 import time
 
-per_iteration_random_seed_override = 0
-
-def get_random_seed(override):
-    if override == 0:
-        return random.randint(1, 2**64)
-    else:
-        return override
 
 def setup_random_seed_before_main():
     parser = argparse.ArgumentParser()
     parser.add_argument(
-        "--initial_random_seed_override",
+        "--random_seed",
         default=0,
         type=int,
-        help="Random seed used for initialize the test parameters at the beginning of stress test run",
+        help="Random seed used for reproduce the same test parameter set",
     )
-    # sometimes the failure appeared after a few iteration, to reproduce the error, we have to wait for the test to run
-    # multiple iterations to reach the iteration that fails the test. By overriding the seed used within each iteration,
-    # we could skip all the previous iterations.
-    parser.add_argument(
-        "--per_iteration_random_seed_override",
-        default=0,
-        type=int,
-        help="Random seed used for initialize the test parameters in each iteration of the stress test run",
+    args, _ = parser.parse_known_args()
+    random_seed = (
+        random.randint(1, 2**64) if args.random_seed == 0 else args.random_seed
     )
+    print(f"Start with random seed {random_seed}")
+    random.seed(random_seed)
 
-    args, remain_args = parser.parse_known_args()
-    init_random_seed = get_random_seed(args.initial_random_seed_override)
-    global per_iteration_random_seed_override
-    per_iteration_random_seed_override = args.per_iteration_random_seed_override
-
-    print(f"Start with random seed {init_random_seed}")
-    random.seed(init_random_seed)
-
-    # reset the sys.argv with the remaining args, so that the rest of the argument parser would not see these 2 args
-    sys.argv = remain_args
-
-def apply_random_seed_per_iteration():
-    global per_iteration_random_seed_override
-    per_iteration_random_seed = get_random_seed(per_iteration_random_seed_override)
-    print(f"Use random seed for iteration {per_iteration_random_seed}")
-    random.seed(per_iteration_random_seed)
 
 # Random seed has to be setup before the rest of the script, so that the random
 # value selected in the global variable uses the random seed specified
@@ -393,7 +367,7 @@ def apply_random_seed_per_iteration():
     "memtable_veirfy_per_key_checksum_on_seek": lambda: random.choice([0] * 7 + [1]),
     "allow_unprepared_value": lambda: random.choice([0, 1]),
     # TODO(hx235): enable `track_and_verify_wals` after stabalizing the stress test
-    "track_and_verify_wals": lambda: random.choice([0]),
+    "track_and_verify_wals": lambda: random.choice([0]),    
     "remote_compaction_worker_threads": lambda: random.choice([0, 8]),
     # TODO(jaykorean): Change to lambda: random.choice([0, 1]) after addressing all remote compaction failures
     "remote_compaction_failure_fall_back_to_local": 1,
@@ -1256,6 +1230,7 @@ def gen_cmd(params, unknown_params):
             not in {
                 "test_type",
                 "simple",
+                "random_seed",
                 "duration",
                 "interval",
                 "random_kill_odd",
@@ -1339,7 +1314,6 @@ def blackbox_crash_main(args, unknown_args):
     )
 
     while time.time() < exit_time:
-        apply_random_seed_per_iteration()
         cmd = gen_cmd(
             dict(list(cmd_params.items()) + list({"db": dbname}.items())), unknown_args
         )
@@ -1402,7 +1376,6 @@ def whitebox_crash_main(args, unknown_args):
     succeeded = True
     hit_timeout = False
     while time.time() < exit_time:
-        apply_random_seed_per_iteration()
         if check_mode == 0:
             additional_opts = {
                 # use large ops per thread since we will kill it anyway

From afbbc90b062bcdb9fbc4ad19a8b9c6129fa4dda5 Mon Sep 17 00:00:00 2001
From: anand76 <anand1976@users.noreply.github.com>
Date: Mon, 22 Sep 2025 18:13:10 -0700
Subject: [PATCH 304/500] Fail multi scan upon Prepare failure or bad scan
 options (#13974)

Summary:
Return a failure status for multi scan if Prepare fails, or if the scan options are unsupported, instead of falling back on a regular scan. This PR also fixes a bug in LevelIterator that caused max_prefetch_size to be ignored.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13974

Test Plan: Add new test in db_iterator_test and table_test

Reviewed By: xingbowang

Differential Revision: D82843944

Pulled By: anand1976

fbshipit-source-id: f12756c40ebd38d8d4e4425e97438b6e766a4663
---
 db/db_iterator_test.cc                        | 152 ++++++++++--------
 db/version_set.cc                             |   5 +-
 include/rocksdb/multi_scan.h                  |   8 +-
 .../block_based/block_based_table_iterator.cc | 138 ++++++++--------
 .../block_based/block_based_table_iterator.h  |  27 ++--
 .../block_based_table_reader_test.cc          |  23 +--
 .../block_based/user_defined_index_wrapper.h  |   8 +-
 table/table_test.cc                           | 105 ++++++++++++
 8 files changed, 300 insertions(+), 166 deletions(-)

diff --git a/db/db_iterator_test.cc b/db/db_iterator_test.cc
index 99ecb713b011..ee5ac84f29cb 100644
--- a/db/db_iterator_test.cc
+++ b/db/db_iterator_test.cc
@@ -4192,66 +4192,6 @@ TEST_P(DBMultiScanIteratorTest, BasicTest) {
     abort();
   }
   iter.reset();
-
-  // Test the overlapping scan case
-  key_ranges[1] = "k30";
-  scan_options = MultiScanArgs(BytewiseComparator());
-  scan_options.insert(key_ranges[0], key_ranges[1]);
-  scan_options.insert(key_ranges[2], key_ranges[3]);
-
-  iter = dbfull()->NewMultiScan(ro, cfh, scan_options);
-  try {
-    int idx = 0;
-    int count = 0;
-    for (auto range : *iter) {
-      for (auto it : range) {
-        ASSERT_GE(it.first.ToString().compare(key_ranges[idx]), 0);
-        ASSERT_LT(it.first.ToString().compare(key_ranges[idx + 1]), 0);
-        count++;
-      }
-      idx += 2;
-    }
-    ASSERT_EQ(count, 52);
-  } catch (MultiScanException& ex) {
-    // Make sure exception contains the status
-    ASSERT_NOK(ex.status());
-    std::cerr << "Iterator returned status " << ex.what();
-    abort();
-  } catch (std::logic_error& ex) {
-    std::cerr << "Iterator returned logic error " << ex.what();
-    abort();
-  }
-  iter.reset();
-
-  // Test the no limit scan case
-  scan_options = MultiScanArgs(BytewiseComparator());
-  scan_options.insert(key_ranges[0]);
-  scan_options.insert(key_ranges[2]);
-  iter = dbfull()->NewMultiScan(ro, cfh, scan_options);
-  try {
-    int idx = 0;
-    int count = 0;
-    for (auto range : *iter) {
-      for (auto it : range) {
-        ASSERT_GE(it.first.ToString().compare(key_ranges[idx]), 0);
-        if (it.first.ToString().compare(key_ranges[idx + 1]) == 0) {
-          break;
-        }
-        count++;
-      }
-      idx += 2;
-    }
-    ASSERT_EQ(count, 52);
-  } catch (MultiScanException& ex) {
-    // Make sure exception contains the status
-    ASSERT_NOK(ex.status());
-    std::cerr << "Iterator returned status " << ex.what();
-    abort();
-  } catch (std::logic_error& ex) {
-    std::cerr << "Iterator returned logic error " << ex.what();
-    abort();
-  }
-  iter.reset();
 }
 
 TEST_P(DBMultiScanIteratorTest, MixedBoundsTest) {
@@ -4366,15 +4306,95 @@ TEST_P(DBMultiScanIteratorTest, RangeAcrossFiles) {
   ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily();
   std::unique_ptr<MultiScan> iter =
       dbfull()->NewMultiScan(ro, cfh, scan_options);
-  int i = 10;
-  for (auto range : *iter) {
-    for (auto it : range) {
-      ASSERT_EQ(it.first.ToString(), Key(i));
-      ++i;
+  try {
+    int i = 10;
+    for (auto range : *iter) {
+      for (auto it : range) {
+        ASSERT_EQ(it.first.ToString(), Key(i));
+        ++i;
+      }
     }
+    ASSERT_EQ(i, 90);
+  } catch (MultiScanException& ex) {
+    // Make sure exception contains the status
+    ASSERT_NOK(ex.status());
+    std::cerr << "Iterator returned status " << ex.what();
+    abort();
+  } catch (std::logic_error& ex) {
+    std::cerr << "Iterator returned logic error " << ex.what();
+    abort();
   }
-  ASSERT_EQ(i, 90);
+  iter.reset();
 }
+
+TEST_P(DBMultiScanIteratorTest, FailureTest) {
+  auto options = CurrentOptions();
+  options.compression = kNoCompression;
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+  // Create a file
+  for (int i = 0; i < 100; ++i) {
+    std::stringstream ss;
+    ss << std::setw(2) << std::setfill('0') << i;
+    ASSERT_OK(Put("k" + ss.str(), rnd.RandomString(1024)));
+  }
+  ASSERT_OK(Flush());
+
+  std::vector<std::string> key_ranges({"k04", "k06", "k12", "k14"});
+  ReadOptions ro;
+  Slice ub;
+  ro.iterate_upper_bound = &ub;
+  ro.fill_cache = GetParam();
+  MultiScanArgs scan_options(BytewiseComparator());
+  scan_options.insert(key_ranges[0], key_ranges[1]);
+  scan_options.insert(key_ranges[2], key_ranges[3]);
+  scan_options.max_prefetch_size = 4500;
+  ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily();
+  std::unique_ptr<Iterator> iter(dbfull()->NewIterator(ro, cfh));
+  ASSERT_NE(iter, nullptr);
+  iter->Prepare(scan_options);
+  int count = 0;
+  ub = key_ranges[1];
+  iter->Seek(key_ranges[0]);
+  while (iter->status().ok() && iter->Valid()) {
+    ASSERT_GE(iter->key().compare(key_ranges[0]), 0);
+    ASSERT_LT(iter->key().compare(key_ranges[1]), 0);
+    count++;
+    iter->Next();
+  }
+  ASSERT_OK(iter->status()) << iter->status().ToString();
+  ASSERT_EQ(count, 2);
+
+  // Second seek should hit the max_prefetch_size limit
+  ub = key_ranges[3];
+  iter->Seek(key_ranges[2]);
+  ASSERT_NOK(iter->status());
+  iter.reset();
+
+  // Test the case of unexpected Seek key
+  iter.reset(dbfull()->NewIterator(ro, cfh));
+  ASSERT_NE(iter, nullptr);
+  scan_options.max_prefetch_size = 0;
+  iter->Prepare(scan_options);
+  ub = key_ranges[3];
+  iter->Seek(key_ranges[2]);
+  ASSERT_NOK(iter->status());
+  iter.reset();
+
+  // Test the case of overlapping ranges
+  iter.reset(dbfull()->NewIterator(ro, cfh));
+  ASSERT_NE(iter, nullptr);
+  (*scan_options).clear();
+  scan_options.insert(key_ranges[0]);
+  scan_options.insert(key_ranges[2], key_ranges[3]);
+  iter->Prepare(scan_options);
+  ub = key_ranges[3];
+  iter->Seek(key_ranges[2]);
+  ASSERT_NOK(iter->status());
+  iter.reset();
+}
+
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/db/version_set.cc b/db/version_set.cc
index 23df68244fd4..9a4189cf8e36 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -1181,14 +1181,17 @@ class LevelIterator final : public InternalIterator {
       // 3. [  S  ] ...... [  E  ]
       for (auto i = fstart; i <= fend; i++) {
         if (i < flevel_->num_files) {
-          auto args = GetMultiScanArgForFile(i);
+          auto& args = GetMultiScanArgForFile(i);
           args.insert(start.value(), end.value(), opt.property_bag);
         }
       }
     }
     // Propagate io colaescing threshold
+    // TODO: This is error prone as we may forget to copy some fields. Think
+    // of a better way to do this.
     for (auto& file_to_arg : *file_to_scan_opts_) {
       file_to_arg.second.io_coalesce_threshold = so->io_coalesce_threshold;
+      file_to_arg.second.max_prefetch_size = so->max_prefetch_size;
       file_to_arg.second.use_async_io = so->use_async_io;
     }
   }
diff --git a/include/rocksdb/multi_scan.h b/include/rocksdb/multi_scan.h
index eb120c07a1b3..a1c87d57fff5 100644
--- a/include/rocksdb/multi_scan.h
+++ b/include/rocksdb/multi_scan.h
@@ -97,7 +97,13 @@ class Scan {
 
     ScanIterator() : db_iter_(nullptr), valid_(false) {}
 
-    ~ScanIterator() { assert(status_.ok()); }
+    ~ScanIterator() {
+      if (!status_.ok()) {
+        fprintf(stderr, "ScanIterator status: %s\n",
+                status_.ToString().c_str());
+        assert(false);
+      }
+    }
 
     ScanIterator& operator++() {
       if (!valid_) {
diff --git a/table/block_based/block_based_table_iterator.cc b/table/block_based/block_based_table_iterator.cc
index f5f19b09ebd5..9c2880406361 100644
--- a/table/block_based/block_based_table_iterator.cc
+++ b/table/block_based/block_based_table_iterator.cc
@@ -37,14 +37,14 @@ void BlockBasedTableIterator::SeekImpl(const Slice* target,
                                        bool async_prefetch) {
   // TODO(hx235): set `seek_key_prefix_for_readahead_trimming_`
   // even when `target == nullptr` that is when `SeekToFirst()` is called
+  if (!multi_scan_status_.ok()) {
+    return;
+  }
   if (multi_scan_) {
-    if (SeekMultiScan(target)) {
-      return;
-    }
+    SeekMultiScan(target);
+    return;
   }
 
-  assert(!multi_scan_);
-
   if (target != nullptr && prefix_extractor_ &&
       read_options_.prefix_same_as_start) {
     const Slice& seek_user_key = ExtractUserKey(*target);
@@ -961,30 +961,37 @@ BlockBasedTableIterator::MultiScanState::~MultiScanState() {
 // - scan ranges should be non-overlapping, and have increasing start keys.
 // If a scan range's limit is not set, then there should only be one scan range.
 // - After Prepare(), the iterator expects Seek to be called on the start key
-// of each ScanOption in order. If any other seek is done, the optimization here
-// is aborted and fall back to vanilla iterator.
+// of each ScanOption in order. If any other Seek is done, an error status is
+// returned
 // FIXME: DBIter and MergingIterator may
 // internally do Seek() on child iterators, e.g. due to
 // ReadOptions::max_skippable_internal_keys or reseeking into range deletion
 // end key. So these Seeks can cause iterator to fall back to normal
 // (non-prepared) iterator and ignore the optimizations done in Prepare().
 void BlockBasedTableIterator::Prepare(const MultiScanArgs* multiscan_opts) {
-  index_iter_->Prepare(multiscan_opts);
-
   assert(!multi_scan_);
+  if (!index_iter_->status().ok()) {
+    multi_scan_status_ = index_iter_->status();
+    return;
+  }
   if (multi_scan_) {
     multi_scan_.reset();
+    multi_scan_status_ = Status::InvalidArgument("Prepare already called");
     return;
   }
-  if (!ValidateScanOptions(multiscan_opts)) {
+  multi_scan_status_ = ValidateScanOptions(multiscan_opts);
+  if (!multi_scan_status_.ok()) {
     return;
   }
 
+  index_iter_->Prepare(multiscan_opts);
+
   std::vector<BlockHandle> scan_block_handles;
   std::vector<std::tuple<size_t, size_t>> block_index_ranges_per_scan;
   const std::vector<ScanOptions>& scan_opts = multiscan_opts->GetScanRanges();
-  if (!CollectBlockHandles(scan_opts, &scan_block_handles,
-                           &block_index_ranges_per_scan)) {
+  multi_scan_status_ = CollectBlockHandles(scan_opts, &scan_block_handles,
+                                           &block_index_ranges_per_scan);
+  if (!multi_scan_status_.ok()) {
     return;
   }
 
@@ -993,9 +1000,10 @@ void BlockBasedTableIterator::Prepare(const MultiScanArgs* multiscan_opts) {
   std::vector<CachableEntry<Block>> pinned_data_blocks_guard(
       scan_block_handles.size());
   size_t prefetched_max_idx;
-  if (!FilterAndPinCachedBlocks(
-          scan_block_handles, multiscan_opts, &block_indices_to_read,
-          &pinned_data_blocks_guard, &prefetched_max_idx)) {
+  multi_scan_status_ = FilterAndPinCachedBlocks(
+      scan_block_handles, multiscan_opts, &block_indices_to_read,
+      &pinned_data_blocks_guard, &prefetched_max_idx);
+  if (!multi_scan_status_.ok()) {
     return;
   }
 
@@ -1009,8 +1017,10 @@ void BlockBasedTableIterator::Prepare(const MultiScanArgs* multiscan_opts) {
                       &read_reqs, &block_idx_to_readreq_idx,
                       &coalesced_block_indices);
 
-    if (!ExecuteIO(scan_block_handles, multiscan_opts, coalesced_block_indices,
-                   &read_reqs, &async_states, &pinned_data_blocks_guard)) {
+    multi_scan_status_ =
+        ExecuteIO(scan_block_handles, multiscan_opts, coalesced_block_indices,
+                  &read_reqs, &async_states, &pinned_data_blocks_guard);
+    if (!multi_scan_status_.ok()) {
       return;
     }
   }
@@ -1028,16 +1038,16 @@ void BlockBasedTableIterator::Prepare(const MultiScanArgs* multiscan_opts) {
   block_iter_points_to_real_block_ = false;
 }
 
-bool BlockBasedTableIterator::SeekMultiScan(const Slice* target) {
-  assert(multi_scan_);
+void BlockBasedTableIterator::SeekMultiScan(const Slice* target) {
+  assert(multi_scan_ && multi_scan_status_.ok());
   // This is a MultiScan and Preapre() has been called.
   //
   // Validate seek key with scan options
   if (multi_scan_->next_scan_idx >= multi_scan_->scan_opts->size()) {
-    multi_scan_.reset();
+    multi_scan_status_ = Status::InvalidArgument("Outside MultiScan range");
   } else if (!target) {
     // start key must be set for multi-scan
-    multi_scan_.reset();
+    multi_scan_status_ = Status::InvalidArgument("No seek key for MultiScan");
   } else if (user_comparator_.CompareWithoutTimestamp(
                  ExtractUserKey(*target), /*a_has_ts=*/true,
                  multi_scan_->scan_opts
@@ -1045,7 +1055,7 @@ bool BlockBasedTableIterator::SeekMultiScan(const Slice* target) {
                      .range.start.value(),
                  /*b_has_ts=*/false) != 0) {
     // Unexpected seek key
-    multi_scan_.reset();
+    multi_scan_status_ = Status::InvalidArgument("Unexpected seek key");
   } else {
     if (multi_scan_->next_scan_idx > 0) {
       UnpinPreviousScanBlocks(multi_scan_->next_scan_idx);
@@ -1058,7 +1068,7 @@ bool BlockBasedTableIterator::SeekMultiScan(const Slice* target) {
     if (cur_scan_start_idx >= cur_scan_end_idx) {
       is_out_of_bound_ = true;
       assert(!Valid());
-      return true;
+      return;
     } else {
       is_out_of_bound_ = false;
     }
@@ -1074,25 +1084,18 @@ bool BlockBasedTableIterator::SeekMultiScan(const Slice* target) {
       ResetDataIter();
 
       multi_scan_->cur_data_block_idx = cur_scan_start_idx;
-      multi_scan_->status = MultiScanLoadDataBlock(cur_scan_start_idx);
-      if (!multi_scan_->status.ok()) {
+      multi_scan_status_ = MultiScanLoadDataBlock(cur_scan_start_idx);
+      if (!multi_scan_status_.ok()) {
         assert(!Valid());
-        assert(status() == multi_scan_->status);
-        return true;
+        assert(status() == multi_scan_status_);
+        return;
       }
     }
     multi_scan_->cur_data_block_idx = cur_scan_start_idx;
     block_iter_points_to_real_block_ = true;
     block_iter_.Seek(*target);
     FindKeyForward();
-    return true;
   }
-
-  // We are aborting MultiScan.
-  ResetDataIter();
-  assert(!is_index_at_curr_block_);
-  assert(!block_iter_points_to_real_block_);
-  return false;
 }
 
 void BlockBasedTableIterator::UnpinPreviousScanBlocks(size_t current_scan_idx) {
@@ -1153,11 +1156,11 @@ void BlockBasedTableIterator::FindBlockForwardInMultiScan() {
     ResetDataIter();
     ++multi_scan_->cur_data_block_idx;
 
-    multi_scan_->status =
+    multi_scan_status_ =
         MultiScanLoadDataBlock(multi_scan_->cur_data_block_idx);
-    if (!multi_scan_->status.ok()) {
+    if (!multi_scan_status_.ok()) {
       assert(!Valid());
-      assert(status() == multi_scan_->status);
+      assert(status() == multi_scan_status_);
       return;
     }
 
@@ -1268,24 +1271,24 @@ Status BlockBasedTableIterator::CreateAndPinBlockFromBuffer(
       &pinned_block_entry.As<Block_kData>());
 }
 
-bool BlockBasedTableIterator::ValidateScanOptions(
+Status BlockBasedTableIterator::ValidateScanOptions(
     const MultiScanArgs* multiscan_opts) {
   if (multiscan_opts == nullptr || multiscan_opts->empty()) {
-    return false;
+    return Status::InvalidArgument("Empty MultiScanArgs");
   }
 
   const std::vector<ScanOptions>& scan_opts = multiscan_opts->GetScanRanges();
   const bool has_limit = scan_opts.front().range.limit.has_value();
   if (!has_limit && scan_opts.size() > 1) {
     // Abort: overlapping ranges
-    return false;
+    return Status::InvalidArgument("Scan has no upper bound");
   }
 
   for (size_t i = 0; i < scan_opts.size(); ++i) {
     const auto& scan_range = scan_opts[i].range;
     if (!scan_range.start.has_value()) {
       // Abort: no start key
-      return false;
+      return Status::InvalidArgument("Scan has no start key");
     }
 
     if (scan_range.limit.has_value()) {
@@ -1297,7 +1300,7 @@ bool BlockBasedTableIterator::ValidateScanOptions(
     if (i > 0) {
       if (!scan_range.limit.has_value()) {
         // multiple no limit scan ranges
-        return false;
+        return Status::InvalidArgument("Scan has no upper bound");
       }
 
       const auto& last_end_key = scan_opts[i - 1].range.limit.value();
@@ -1305,14 +1308,14 @@ bool BlockBasedTableIterator::ValidateScanOptions(
               scan_range.start.value(), /*a_has_ts=*/false, last_end_key,
               /*b_has_ts=*/false) < 0) {
         // Abort: overlapping ranges
-        return false;
+        return Status::InvalidArgument("Overlapping ranges");
       }
     }
   }
-  return true;
+  return Status::OK();
 }
 
-bool BlockBasedTableIterator::CollectBlockHandles(
+Status BlockBasedTableIterator::CollectBlockHandles(
     const std::vector<ScanOptions>& scan_opts,
     std::vector<BlockHandle>* scan_block_handles,
     std::vector<std::tuple<size_t, size_t>>* block_index_ranges_per_scan) {
@@ -1352,7 +1355,7 @@ bool BlockBasedTableIterator::CollectBlockHandles(
 
     if (!index_iter_->status().ok()) {
       // Abort: index iterator error
-      return false;
+      return index_iter_->status();
     }
 
     if (index_iter_->Valid()) {
@@ -1369,15 +1372,16 @@ bool BlockBasedTableIterator::CollectBlockHandles(
       // range. This is important for FindBlockForwardInMultiScan() which only
       // lets the upper layer (LevelIterator) advance to the next SST file when
       // the last scan range is exhausted.
-      return false;
+      return Status::InvalidArgument("Scan does not intersect with file");
+      ;
     }
     block_index_ranges_per_scan->emplace_back(
         scan_block_handles->size() - num_blocks, scan_block_handles->size());
   }
-  return true;
+  return Status::OK();
 }
 
-bool BlockBasedTableIterator::FilterAndPinCachedBlocks(
+Status BlockBasedTableIterator::FilterAndPinCachedBlocks(
     const std::vector<BlockHandle>& scan_block_handles,
     const MultiScanArgs* multiscan_opts,
     std::vector<size_t>* block_indices_to_read,
@@ -1406,14 +1410,14 @@ bool BlockBasedTableIterator::FilterAndPinCachedBlocks(
 
     if (!s.ok()) {
       // Abort: block cache look up failed.
-      return false;
+      return s;
     }
     if (!(*pinned_data_blocks_guard)[i].GetValue()) {
       // Block not in cache
       block_indices_to_read->emplace_back(i);
     }
   }
-  return true;
+  return Status::OK();
 }
 
 void BlockBasedTableIterator::PrepareIORequests(
@@ -1500,7 +1504,7 @@ void BlockBasedTableIterator::PrepareIORequests(
   }
 }
 
-bool BlockBasedTableIterator::ExecuteIO(
+Status BlockBasedTableIterator::ExecuteIO(
     const std::vector<BlockHandle>& scan_block_handles,
     const MultiScanArgs* multiscan_opts,
     const std::vector<std::vector<size_t>>& coalesced_block_indices,
@@ -1508,9 +1512,11 @@ bool BlockBasedTableIterator::ExecuteIO(
     std::vector<AsyncReadState>* async_states,
     std::vector<CachableEntry<Block>>* pinned_data_blocks_guard) {
   IOOptions io_opts;
-  if (!table_->get_rep()->file->PrepareIOOptions(read_options_, io_opts).ok()) {
+  Status s;
+  s = table_->get_rep()->file->PrepareIOOptions(read_options_, io_opts);
+  if (!s.ok()) {
     // Abort: PrepareIOOptions failed
-    return false;
+    return s;
   }
   const bool direct_io = table_->get_rep()->file->use_direct_io();
 
@@ -1538,7 +1544,7 @@ bool BlockBasedTableIterator::ExecuteIO(
                           this, std::placeholders::_1, std::placeholders::_2);
       // TODO: for mmap, io_handle will not be set but callback will already
       // be called.
-      Status s = table_->get_rep()->file.get()->ReadAsync(
+      s = table_->get_rep()->file.get()->ReadAsync(
           read_req, io_opts, cb, &async_read, &async_read.io_handle,
           &async_read.del_fn, direct_io ? &async_read.aligned_buf : nullptr);
       if (!s.ok()) {
@@ -1546,13 +1552,15 @@ bool BlockBasedTableIterator::ExecuteIO(
         fprintf(stderr, "ReadAsync failed with %s\n", s.ToString().c_str());
 #endif
         assert(false);
-        return false;
+        return s;
       }
       assert(async_read.io_handle);
       for (auto& req : *read_reqs) {
         if (!req.status.ok()) {
           assert(false);
-          return false;
+          // Silence compiler warning about NRVO
+          s = req.status;
+          return s;
         }
       }
     }
@@ -1579,15 +1587,17 @@ bool BlockBasedTableIterator::ExecuteIO(
     }
 
     AlignedBuf aligned_buf;
-    Status s = table_->get_rep()->file->MultiRead(
-        io_opts, read_reqs->data(), read_reqs->size(),
-        direct_io ? &aligned_buf : nullptr);
+    s = table_->get_rep()->file->MultiRead(io_opts, read_reqs->data(),
+                                           read_reqs->size(),
+                                           direct_io ? &aligned_buf : nullptr);
     if (!s.ok()) {
-      return false;
+      return s;
     }
     for (auto& req : *read_reqs) {
       if (!req.status.ok()) {
-        return false;
+        // Silence compiler warning about NRVO
+        s = req.status;
+        return s;
       }
     }
 
@@ -1604,13 +1614,13 @@ bool BlockBasedTableIterator::ExecuteIO(
         if (!s.ok()) {
           assert(false);
           // Abort: failed to create and pin block in cache
-          return false;
+          return s;
         }
         assert((*pinned_data_blocks_guard)[block_idx].GetValue());
       }
     }
   }
-  return true;
+  return s;
 }
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/table/block_based/block_based_table_iterator.h b/table/block_based/block_based_table_iterator.h
index 39fc2a1bef04..095529341c95 100644
--- a/table/block_based/block_based_table_iterator.h
+++ b/table/block_based/block_based_table_iterator.h
@@ -45,7 +45,9 @@ class BlockBasedTableIterator : public InternalIteratorBase<Slice> {
         need_upper_bound_check_(need_upper_bound_check),
         async_read_in_progress_(false),
         is_last_level_(table->IsLastLevel()),
-        block_iter_points_to_real_block_(false) {}
+        block_iter_points_to_real_block_(false) {
+    multi_scan_status_.PermitUncheckedError();
+  }
 
   ~BlockBasedTableIterator() override { ClearBlockHandles(); }
 
@@ -136,6 +138,9 @@ class BlockBasedTableIterator : public InternalIteratorBase<Slice> {
     return block_iter_.value();
   }
   Status status() const override {
+    if (!multi_scan_status_.ok()) {
+      return multi_scan_status_;
+    }
     // In case of block cache readahead lookup, it won't add the block to
     // block_handles if it's index is invalid. So index_iter_->status check can
     // be skipped.
@@ -151,7 +156,7 @@ class BlockBasedTableIterator : public InternalIteratorBase<Slice> {
       assert(!multi_scan_);
       return Status::TryAgain("Async read in progress");
     } else if (multi_scan_) {
-      return multi_scan_->status;
+      return multi_scan_status_;
     } else {
       return Status::OK();
     }
@@ -454,7 +459,6 @@ class BlockBasedTableIterator : public InternalIteratorBase<Slice> {
     // async_states[j].
     std::vector<AsyncReadState> async_states;
     UnorderedMap<size_t, size_t> block_idx_to_readreq_idx;
-    Status status;
     size_t prefetch_max_idx;
 
     MultiScanState(
@@ -471,14 +475,12 @@ class BlockBasedTableIterator : public InternalIteratorBase<Slice> {
           cur_data_block_idx(0),
           async_states(std::move(_async_states)),
           block_idx_to_readreq_idx(std::move(_block_idx_to_readreq_idx)),
-          status(Status::OK()),
-          prefetch_max_idx(_prefetch_max_idx) {
-      status.PermitUncheckedError();
-    }
+          prefetch_max_idx(_prefetch_max_idx) {}
 
     ~MultiScanState();
   };
 
+  Status multi_scan_status_;
   std::unique_ptr<MultiScanState> multi_scan_;
   // *** END MultiScan related APIs and states ***
 
@@ -599,8 +601,7 @@ class BlockBasedTableIterator : public InternalIteratorBase<Slice> {
 
   // *** BEGIN APIs relevant to multiscan ***
 
-  // Returns true iff we should fallback to regular scan.
-  bool SeekMultiScan(const Slice* target);
+  void SeekMultiScan(const Slice* target);
 
   void FindBlockForwardInMultiScan();
 
@@ -665,14 +666,14 @@ class BlockBasedTableIterator : public InternalIteratorBase<Slice> {
                                      CachableEntry<Block>& pinned_block_entry);
 
   // Helper functions for Prepare():
-  bool ValidateScanOptions(const MultiScanArgs* multiscan_opts);
+  Status ValidateScanOptions(const MultiScanArgs* multiscan_opts);
 
-  bool CollectBlockHandles(
+  Status CollectBlockHandles(
       const std::vector<ScanOptions>& scan_opts,
       std::vector<BlockHandle>* scan_block_handles,
       std::vector<std::tuple<size_t, size_t>>* block_index_ranges_per_scan);
 
-  bool FilterAndPinCachedBlocks(
+  Status FilterAndPinCachedBlocks(
       const std::vector<BlockHandle>& scan_block_handles,
       const MultiScanArgs* multiscan_opts,
       std::vector<size_t>* block_indices_to_read,
@@ -687,7 +688,7 @@ class BlockBasedTableIterator : public InternalIteratorBase<Slice> {
       UnorderedMap<size_t, size_t>* block_idx_to_readreq_idx,
       std::vector<std::vector<size_t>>* coalesced_block_indices);
 
-  bool ExecuteIO(
+  Status ExecuteIO(
       const std::vector<BlockHandle>& scan_block_handles,
       const MultiScanArgs* multiscan_opts,
       const std::vector<std::vector<size_t>>& coalesced_block_indices,
diff --git a/table/block_based/block_based_table_reader_test.cc b/table/block_based/block_based_table_reader_test.cc
index 1922ef8fbd4c..92b79143f609 100644
--- a/table/block_based/block_based_table_reader_test.cc
+++ b/table/block_based/block_based_table_reader_test.cc
@@ -1196,14 +1196,7 @@ TEST_P(BlockBasedTableReaderTest, MultiScanPrepare) {
 
       // Does not match start key of the second ScanOptions.
       iter->Seek(kv[50 * kEntriesPerBlock + 1].first);
-      for (size_t i = 50 * kEntriesPerBlock + 1; i < 100 * kEntriesPerBlock;
-           ++i) {
-        ASSERT_TRUE(iter->Valid());
-        ASSERT_EQ(iter->key().ToString(), kv[i].first);
-        iter->Next();
-      }
-      ASSERT_FALSE(iter->Valid());
-      ASSERT_OK(iter->status());
+      ASSERT_NOK(iter->status());
 
       iter.reset(table->NewIterator(
           read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
@@ -1215,19 +1208,9 @@ TEST_P(BlockBasedTableReaderTest, MultiScanPrepare) {
       iter->Prepare(&scan_options);
       // Does not match the first ScanOptions.
       iter->SeekToFirst();
-      for (size_t i = 0; i < kEntriesPerBlock; ++i) {
-        ASSERT_TRUE(iter->Valid());
-        ASSERT_EQ(iter->key().ToString(), kv[i].first);
-        iter->Next();
-      }
-      ASSERT_OK(iter->status());
+      ASSERT_NOK(iter->status());
       iter->Seek(kv[10 * kEntriesPerBlock].first);
-      for (size_t i = 10 * kEntriesPerBlock; i < 12 * kEntriesPerBlock; ++i) {
-        ASSERT_TRUE(iter->Valid());
-        ASSERT_EQ(iter->key().ToString(), kv[i].first);
-        iter->Next();
-      }
-      ASSERT_OK(iter->status());
+      ASSERT_NOK(iter->status());
     }
   }
 }
diff --git a/table/block_based/user_defined_index_wrapper.h b/table/block_based/user_defined_index_wrapper.h
index 416ed513ee72..acc5f40a1c97 100644
--- a/table/block_based/user_defined_index_wrapper.h
+++ b/table/block_based/user_defined_index_wrapper.h
@@ -289,7 +289,13 @@ class UserDefinedIndexReaderWrapper : public BlockBasedTable::IndexReader {
     }
     std::unique_ptr<UserDefinedIndexIterator> udi_iter =
         udi_reader_->NewIterator(read_options);
-    return new UserDefinedIndexIteratorWrapper(std::move(udi_iter));
+    if (udi_iter) {
+      InternalIteratorBase<IndexValue>* wrap_iter =
+          new UserDefinedIndexIteratorWrapper(std::move(udi_iter));
+      return wrap_iter;
+    }
+    return NewErrorInternalIterator<IndexValue>(
+        Status::NotFound("COuld not create UDI iterator"));
   }
 
   virtual Status CacheDependencies(
diff --git a/table/table_test.cc b/table/table_test.cc
index e1d01db61264..e9aa1003c115 100644
--- a/table/table_test.cc
+++ b/table/table_test.cc
@@ -8341,6 +8341,111 @@ TEST_F(UserDefinedIndexTest, IngestEmptyUDI) {
   ASSERT_OK(DestroyDB(dbname, options));
 }
 
+TEST_F(UserDefinedIndexTest, MultiScanFailureTest) {
+  Options options;
+  BlockBasedTableOptions table_options;
+  std::string dbname = test::PerThreadDBPath("user_defined_index_test");
+  std::string ingest_file = dbname + "test.sst";
+
+  // Set up the user-defined index factory
+  auto user_defined_index_factory =
+      std::make_shared<TestUserDefinedIndexFactory>();
+  table_options.user_defined_index_factory = user_defined_index_factory;
+
+  // Set up custom flush block policy that flushes every 3 keys
+  table_options.flush_block_policy_factory =
+      std::make_shared<CustomFlushBlockPolicyFactory>();
+
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  std::unique_ptr<SstFileWriter> writer;
+  writer.reset(new SstFileWriter(EnvOptions(), options));
+  ASSERT_OK(writer->Open(ingest_file));
+
+  Random rnd(301);
+  // Add 100 keys instead of just 5
+  for (int i = 0; i < 100; i++) {
+    std::stringstream ss;
+    ss << std::setw(2) << std::setfill('0') << i;
+    std::string key = "key" + ss.str();
+    std::string value = rnd.RandomString(1024);
+    ASSERT_OK(writer->Put(key, value));
+  }
+  ASSERT_OK(writer->Finish());
+  writer.reset();
+
+  std::unique_ptr<DB> db;
+  options.create_if_missing = true;
+  Status s = DB::Open(options, dbname, &db);
+  ASSERT_OK(s);
+  ASSERT_TRUE(db != nullptr);
+  ColumnFamilyHandle* cfh = nullptr;
+  ASSERT_OK(db->CreateColumnFamily(options, "new_cf", &cfh));
+
+  IngestExternalFileOptions ifo;
+  s = db->IngestExternalFile(cfh, {ingest_file}, ifo);
+  ASSERT_OK(s);
+
+  std::vector<std::string> key_ranges({"key03", "key05", "key12", "key14"});
+  ReadOptions ro;
+  ro.table_index_factory = user_defined_index_factory.get();
+  Slice ub;
+  ro.iterate_upper_bound = &ub;
+  std::unordered_map<std::string, std::string> property_bag;
+  property_bag["count"] = std::to_string(5);
+  MultiScanArgs scan_options(BytewiseComparator());
+  scan_options.insert(key_ranges[0], key_ranges[1], property_bag);
+  scan_options.insert(key_ranges[2], key_ranges[3], property_bag);
+  scan_options.max_prefetch_size = 3500;
+  std::unique_ptr<Iterator> iter(db->NewIterator(ro, cfh));
+  ASSERT_NE(iter, nullptr);
+  iter->Prepare(scan_options);
+  int count = 0;
+  ub = key_ranges[1];
+  iter->Seek(key_ranges[0]);
+  while (iter->status().ok() && iter->Valid()) {
+    ASSERT_GE(iter->key().compare(key_ranges[0]), 0);
+    ASSERT_LT(iter->key().compare(key_ranges[1]), 0);
+    count++;
+    iter->Next();
+  }
+  ASSERT_OK(iter->status()) << iter->status().ToString();
+  ASSERT_EQ(count, 2);
+
+  ub = key_ranges[3];
+  iter->Seek(key_ranges[2]);
+  // This should fail due to reaching max_prefetch_size limit
+  ASSERT_EQ(iter->status(), Status::Incomplete());
+  iter.reset();
+
+  iter.reset(db->NewIterator(ro, cfh));
+  ASSERT_NE(iter, nullptr);
+  scan_options.max_prefetch_size = 0;
+  iter->Prepare(scan_options);
+  ub = key_ranges[3];
+  iter->Seek(key_ranges[2]);
+  // Seek should fail as its not in the order specified in scan_options
+  ASSERT_EQ(iter->status(), Status::InvalidArgument());
+  iter.reset();
+
+  iter.reset(db->NewIterator(ro, cfh));
+  ASSERT_NE(iter, nullptr);
+  (*scan_options).clear();
+  key_ranges[1] = "key20";
+  scan_options.insert(key_ranges[0], key_ranges[1], property_bag);
+  scan_options.insert(key_ranges[2], key_ranges[3], property_bag);
+  iter->Prepare(scan_options);
+  ub = key_ranges[3];
+  iter->Seek(key_ranges[2]);
+  // Should fail due to overlapping ranges
+  ASSERT_EQ(iter->status(), Status::InvalidArgument());
+  iter.reset();
+
+  ASSERT_OK(db->DestroyColumnFamilyHandle(cfh));
+  ASSERT_OK(db->Close());
+  ASSERT_OK(DestroyDB(dbname, options));
+}
+
 TEST_F(UserDefinedIndexTest, ConfigTest) {
   Options options;
   BlockBasedTableOptions table_options;

From bbd8f0d4bfc8b2535cd4f3da045a5a0178bfd74b Mon Sep 17 00:00:00 2001
From: Xingbo Wang <xbw@fb.com>
Date: Tue, 23 Sep 2025 12:35:35 -0700
Subject: [PATCH 305/500] Bug fix in random seed override support in stress
 test (#13991)

Summary:
Fix the bug in Improve random seed override support in stress test.

The Bug:
`parser.parse_known_args()` is used to parse command line argument. When it is called without any argument, it uses sys.argv as input parameter. In sys.argv, the first argument is the command itself, so parser.parse_known_args skip the first argument. Meantime, the return value `remain_argv` of `parser.parse_known_args()` does not contain the command itself. When `remain_arg` replaces `sys.argv`, the first argument is treated as the command itself, which is skipped by `parser.parse_known_args()`. In the internal stress test tool, the first argument is `--stress_cmd`, therefore, it is skipped. Instead, the default value `./stress_db` is used. This is why `./stress_db` showed up in the error message. This is also why it works in local, as stress_db is located in the local folder.

The Fix:
When `parser.parse_known_args()` is called first time, the remain_argv is saved as a global variable. It is used in the second call of the `parser.parse_known_args(remain_argv)`. When argument is passed to `parser.parse_known_args` directly, the first argument will not be skipped.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13991

Test Plan:
The the value of first argument `--stress_cmd` is parsed correctly, and shown up in the error message.

```
/usr/local/bin/python3 -u tools/db_crashtest.py --stress_cmd=/data/sandcastle/boxes/trunk-hg-full-fbsource/buck-out/v2/gen/fbcode/d7db8b24dd42e2db/internal_repo_rocksdb/repo/__db_stress__/db_stress --cleanup_cmd='' --simple blackbox  --print_stderr_separately
Start with random seed 11107847853133580500
Running blackbox-crash-test with
interval_between_crash=120
total-duration=6000

Use random seed for iteration 8577470137673434540
Traceback (most recent call last):
  File "/home/xbw/workspace/ws1/rocksdb/tools/db_crashtest.py", line 1650, in <module>
    main()
  File "/home/xbw/workspace/ws1/rocksdb/tools/db_crashtest.py", line 1639, in main
    blackbox_crash_main(args, unknown_args)
  File "/home/xbw/workspace/ws1/rocksdb/tools/db_crashtest.py", line 1358, in blackbox_crash_main
    hit_timeout, retcode, outs, errs = execute_cmd(cmd, cmd_params["interval"])
                                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/xbw/workspace/ws1/rocksdb/tools/db_crashtest.py", line 1294, in execute_cmd
    child = subprocess.Popen(cmd, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/fbcode/platform010/lib/python3.12/subprocess.py", line 1028, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "/usr/local/fbcode/platform010/lib/python3.12/subprocess.py", line 1957, in _execute_child
    raise child_exception_type(errno_num, err_msg, err_filename)
FileNotFoundError: [Errno 2] No such file or directory: '/data/sandcastle/boxes/trunk-hg-full-fbsource/buck-out/v2/gen/fbcode/d7db8b24dd42e2db/internal_repo_rocksdb/repo/__db_stress__/db_stress'
```

Reviewed By: hx235

Differential Revision: D83068960

Pulled By: xingbowang

fbshipit-source-id: 28334d38a444c6f8525444e15f460ec6b257ef38
---
 tools/db_crashtest.py | 51 ++++++++++++++++++++++++++++++++++---------
 1 file changed, 41 insertions(+), 10 deletions(-)

diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index cf71c9dcdf94..2cf045116f54 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -11,21 +11,50 @@
 import tempfile
 import time
 
+per_iteration_random_seed_override = 0
+remain_argv = None
+
+
+def get_random_seed(override):
+    if override == 0:
+        return random.randint(1, 2**64)
+    else:
+        return override
+
 
 def setup_random_seed_before_main():
     parser = argparse.ArgumentParser()
     parser.add_argument(
-        "--random_seed",
+        "--initial_random_seed_override",
         default=0,
         type=int,
-        help="Random seed used for reproduce the same test parameter set",
+        help="Random seed used for initialize the test parameters at the beginning of stress test run",
     )
-    args, _ = parser.parse_known_args()
-    random_seed = (
-        random.randint(1, 2**64) if args.random_seed == 0 else args.random_seed
+    # sometimes the failure appeared after a few iteration, to reproduce the error, we have to wait for the test to run
+    # multiple iterations to reach the iteration that fails the test. By overriding the seed used within each iteration,
+    # we could skip all the previous iterations.
+    parser.add_argument(
+        "--per_iteration_random_seed_override",
+        default=0,
+        type=int,
+        help="Random seed used for initialize the test parameters in each iteration of the stress test run",
     )
-    print(f"Start with random seed {random_seed}")
-    random.seed(random_seed)
+
+    global remain_args
+    args, remain_args = parser.parse_known_args()
+    init_random_seed = get_random_seed(args.initial_random_seed_override)
+    global per_iteration_random_seed_override
+    per_iteration_random_seed_override = args.per_iteration_random_seed_override
+
+    print(f"Start with random seed {init_random_seed}")
+    random.seed(init_random_seed)
+
+
+def apply_random_seed_per_iteration():
+    global per_iteration_random_seed_override
+    per_iteration_random_seed = get_random_seed(per_iteration_random_seed_override)
+    print(f"Use random seed for iteration {per_iteration_random_seed}")
+    random.seed(per_iteration_random_seed)
 
 
 # Random seed has to be setup before the rest of the script, so that the random
@@ -367,7 +396,7 @@ def setup_random_seed_before_main():
     "memtable_veirfy_per_key_checksum_on_seek": lambda: random.choice([0] * 7 + [1]),
     "allow_unprepared_value": lambda: random.choice([0, 1]),
     # TODO(hx235): enable `track_and_verify_wals` after stabalizing the stress test
-    "track_and_verify_wals": lambda: random.choice([0]),    
+    "track_and_verify_wals": lambda: random.choice([0]),
     "remote_compaction_worker_threads": lambda: random.choice([0, 8]),
     # TODO(jaykorean): Change to lambda: random.choice([0, 1]) after addressing all remote compaction failures
     "remote_compaction_failure_fall_back_to_local": 1,
@@ -1230,7 +1259,6 @@ def gen_cmd(params, unknown_params):
             not in {
                 "test_type",
                 "simple",
-                "random_seed",
                 "duration",
                 "interval",
                 "random_kill_odd",
@@ -1314,6 +1342,7 @@ def blackbox_crash_main(args, unknown_args):
     )
 
     while time.time() < exit_time:
+        apply_random_seed_per_iteration()
         cmd = gen_cmd(
             dict(list(cmd_params.items()) + list({"db": dbname}.items())), unknown_args
         )
@@ -1376,6 +1405,7 @@ def whitebox_crash_main(args, unknown_args):
     succeeded = True
     hit_timeout = False
     while time.time() < exit_time:
+        apply_random_seed_per_iteration()
         if check_mode == 0:
             additional_opts = {
                 # use large ops per thread since we will kill it anyway
@@ -1576,8 +1606,9 @@ def main():
     for k, v in all_params.items():
         parser.add_argument("--" + k, type=type(v() if callable(v) else v))
     # unknown_args are passed directly to db_stress
-    args, unknown_args = parser.parse_known_args()
 
+    global remain_args
+    args, unknown_args = parser.parse_known_args(remain_args)
     test_tmpdir = os.environ.get(_TEST_DIR_ENV_VAR)
     if test_tmpdir is not None and not args.skip_tmpdir_check:
         isdir = False

From 6051d843d5c0421cfb87b59cce7f588bf0075f71 Mon Sep 17 00:00:00 2001
From: anand76 <anand1976@users.noreply.github.com>
Date: Tue, 23 Sep 2025 20:09:47 -0700
Subject: [PATCH 306/500] Prohibit unsupported multiscan + delrange combo in
 crash tests (#13992)

Summary:
This combination causes MultiScan iteration to fail due to internal reseek by the iterator.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13992

Reviewed By: cbi42

Differential Revision: D83094631

Pulled By: anand1976

fbshipit-source-id: 96410747d88de391e6d65857d39063d4fb113d65
---
 tools/db_crashtest.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index 2cf045116f54..2b2639e56826 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -1191,6 +1191,8 @@ def finalize_and_sanitize(src_params):
         dest_params["use_multiscan"] = 0
     if dest_params.get("use_multiscan") == 1:
         dest_params["async_io"] = 0
+        dest_params["delpercent"] += dest_params["delrangepercent"]
+        dest_params["delrangepercent"] = 0
     return dest_params
 
 

From 134cfb6b22cace2d804f210de3fd147c165c4095 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Wed, 24 Sep 2025 14:06:56 -0700
Subject: [PATCH 307/500] Speed up AutoHCC check in dtor (#13998)

Summary:
In https://github.com/facebook/rocksdb/issues/13964 I changed an expensive DEBUG check in ~AutoHyperClockTable to only run in ASAN builds. It's still expensive so I'm modifying it to scan only about one page beyond what we expect to have written to the anonymous mmap, rather than scanning the whole thing.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13998

Test Plan: manually checked that lru_cache_test running time went from 5.0s to 4.0s after the change. Verified that existing unit test ClockCacheTest.Limits uses the full anonymous mmap to be sure it is sized as expected, by temporarily breaking AutoHyperClockTable::Grow() to allow slightly exceeding the anonymous mmap size.

Reviewed By: cbi42

Differential Revision: D83178493

Pulled By: pdillinger

fbshipit-source-id: a2bf093e98bf68b540c073800be7e193021f2692
---
 cache/clock_cache.cc | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/cache/clock_cache.cc b/cache/clock_cache.cc
index e4e327e3b637..e65a3cf12f44 100644
--- a/cache/clock_cache.cc
+++ b/cache/clock_cache.cc
@@ -2062,9 +2062,10 @@ AutoHyperClockTable::~AutoHyperClockTable() {
   }
   // This check can be extra expensive for a cache that is just created,
   // maybe used for a small number of entries, as in a unit test, and then
-  // destroyed. Only do this in rare modes.
+  // destroyed. Only do this in rare modes. REVISED: Don't scan the whole mmap,
+  // just a reasonable frontier past what we expect to have written.
 #ifdef MUST_FREE_HEAP_ALLOCATIONS
-  for (size_t i = used_end; i < array_.Count(); i++) {
+  for (size_t i = used_end; i < array_.Count() && i < used_end + 64U; i++) {
     assert(array_[i].head_next_with_shift.LoadRelaxed() == 0);
     assert(array_[i].chain_next_with_shift.LoadRelaxed() == 0);
     assert(array_[i].meta.LoadRelaxed() == 0);

From 169f90cdeadf227ae4007083005cea0485053d1b Mon Sep 17 00:00:00 2001
From: anand76 <anand1976@users.noreply.github.com>
Date: Wed, 24 Sep 2025 14:59:20 -0700
Subject: [PATCH 308/500] Allow UDIs with non BytewiseComparator (#13999)

Summary:
Remove the restriction of only using BytewiseComparator(). In a follow on PR, the UDI interface will be updated to take the Comparator as a parameter.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13999

Test Plan: Add a unit test in table_test.cc

Reviewed By: cbi42

Differential Revision: D83179747

Pulled By: anand1976

fbshipit-source-id: 60222533c71022aa0701ac61c39268d36ca86338
---
 .../block_based/block_based_table_builder.cc  |   6 -
 table/table_test.cc                           | 149 +++++++++++++++---
 .../udi_non_bytewise_comparator.md            |   1 +
 3 files changed, 127 insertions(+), 29 deletions(-)
 create mode 100644 unreleased_history/behavior_changes/udi_non_bytewise_comparator.md

diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc
index a660ff0b9854..1210b7769212 100644
--- a/table/block_based/block_based_table_builder.cc
+++ b/table/block_based/block_based_table_builder.cc
@@ -1190,12 +1190,6 @@ struct BlockBasedTableBuilder::Rep {
         SetStatus(
             Status::InvalidArgument("user_defined_index_factory not supported "
                                     "with parallel compression"));
-      } else if (ioptions.user_comparator != BytewiseComparator()) {
-        // TODO: Pass the user_comparator to the UDI and let it validate. Do
-        // it in a major release.
-        SetStatus(
-            Status::InvalidArgument("user_defined_index_factory only supported "
-                                    "with bytewise comparator"));
       } else {
         std::unique_ptr<UserDefinedIndexBuilder> user_defined_index_builder(
             table_options.user_defined_index_factory->NewBuilder());
diff --git a/table/table_test.cc b/table/table_test.cc
index e9aa1003c115..73b59e332fde 100644
--- a/table/table_test.cc
+++ b/table/table_test.cc
@@ -7422,6 +7422,8 @@ TEST_F(ExternalTableTest, IngestionTest) {
   ASSERT_OK(db->Close());
 }
 
+// Test with a bool parameter for BytewiseComparator() (false) or
+// ReverseBytewiseComparator() (true)
 class UserDefinedIndexTest : public BlockBasedTableTestBase {
  public:
   class CustomFlushBlockPolicy : public FlushBlockPolicy {
@@ -7458,23 +7460,43 @@ class UserDefinedIndexTest : public BlockBasedTableTestBase {
  public:
   class TestUserDefinedIndexFactory : public UserDefinedIndexFactory {
    public:
+    TestUserDefinedIndexFactory(bool reverse = false) { reverse_ = reverse; }
     const char* Name() const override { return "test_index"; }
     UserDefinedIndexBuilder* NewBuilder() const override {
-      return new TestUserDefinedIndexBuilder();
+      return new TestUserDefinedIndexBuilder(reverse_);
     }
 
     std::unique_ptr<UserDefinedIndexReader> NewReader(
         Slice& index_block) const override {
-      return std::make_unique<TestUserDefinedIndexReader>(index_block, this);
+      return std::make_unique<TestUserDefinedIndexReader>(reverse_, index_block,
+                                                          this);
     }
 
     uint64_t seek_error_count_ = 0;
     uint64_t next_error_count_ = 0;
 
    private:
+    struct TestUserDefinedIndexCompare {
+      bool operator()(const std::string& lhs, const std::string& rhs) const {
+        if (!reverse) {
+          return lhs < rhs;
+        } else {
+          return rhs < lhs;
+        }
+      }
+
+      bool reverse;
+      explicit TestUserDefinedIndexCompare(bool _reverse) {
+        reverse = _reverse;
+      }
+    };
+
     class TestUserDefinedIndexBuilder : public UserDefinedIndexBuilder {
      public:
-      TestUserDefinedIndexBuilder() : entries_added_(0), keys_added_(0) {}
+      TestUserDefinedIndexBuilder(bool reverse)
+          : entries_added_(0),
+            index_data_(TestUserDefinedIndexCompare(reverse)),
+            keys_added_(0) {}
 
       Slice AddIndexEntry(const Slice& last_key_in_current_block,
                           const Slice* first_key_in_next_block,
@@ -7536,7 +7558,8 @@ class UserDefinedIndexTest : public BlockBasedTableTestBase {
 
      private:
       int entries_added_;
-      std::map<std::string, std::string> index_data_;
+      std::map<std::string, std::string, TestUserDefinedIndexCompare>
+          index_data_;
       uint32_t keys_added_;
       std::string index_contents_data_;
     };
@@ -7544,8 +7567,11 @@ class UserDefinedIndexTest : public BlockBasedTableTestBase {
     class TestUserDefinedIndexReader : public UserDefinedIndexReader {
      public:
       explicit TestUserDefinedIndexReader(
-          Slice& index_block, const TestUserDefinedIndexFactory* factory)
-          : factory_(factory) {
+          bool reverse, Slice& index_block,
+          const TestUserDefinedIndexFactory* factory)
+          : reverse_(reverse),
+            factory_(factory),
+            index_data_(TestUserDefinedIndexCompare(reverse)) {
         Slice block = index_block;
         while (!block.empty()) {
           Slice key;
@@ -7568,8 +7594,8 @@ class UserDefinedIndexTest : public BlockBasedTableTestBase {
 
       std::unique_ptr<UserDefinedIndexIterator> NewIterator(
           const ReadOptions& /*ro*/) override {
-        return std::make_unique<TestUserDefinedIndexIterator>(index_data_,
-                                                              factory_);
+        return std::make_unique<TestUserDefinedIndexIterator>(
+            reverse_, index_data_, factory_);
       }
 
       size_t ApproximateMemoryUsage() const override { return 0; }
@@ -7578,9 +7604,10 @@ class UserDefinedIndexTest : public BlockBasedTableTestBase {
       class TestUserDefinedIndexIterator : public UserDefinedIndexIterator {
        public:
         TestUserDefinedIndexIterator(
+            bool reverse,
             std::map<std::string,
-                     std::pair<UserDefinedIndexBuilder::BlockHandle, uint32_t>>&
-                index,
+                     std::pair<UserDefinedIndexBuilder::BlockHandle, uint32_t>,
+                     TestUserDefinedIndexCompare>& index,
             const TestUserDefinedIndexFactory* factory)
             : index_(index),
               iter_(index_.end()),
@@ -7588,7 +7615,9 @@ class UserDefinedIndexTest : public BlockBasedTableTestBase {
               num_opts_(0),
               target_num_keys_(0),
               seek_error_count_(factory->seek_error_count_),
-              next_error_count_(factory->next_error_count_) {}
+              next_error_count_(factory->next_error_count_),
+              comp_(reverse ? ReverseBytewiseComparator()
+                            : BytewiseComparator()) {}
 
         Status SeekAndGetResult(const Slice& key,
                                 IterateResult* result) override {
@@ -7602,8 +7631,9 @@ class UserDefinedIndexTest : public BlockBasedTableTestBase {
           }
           if (scan_opts_) {
             // Seeks should be in order specified in scan_opts_
-            EXPECT_EQ(scan_opts_[scan_idx_].range.start.value().compare(key),
-                      0);
+            EXPECT_EQ(
+                comp_->Compare(scan_opts_[scan_idx_].range.start.value(), key),
+                0);
             EXPECT_TRUE(scan_opts_[scan_idx_].property_bag.has_value());
             target_num_keys_ = std::stoi(scan_opts_[scan_idx_]
                                              .property_bag.value()
@@ -7617,7 +7647,7 @@ class UserDefinedIndexTest : public BlockBasedTableTestBase {
             result->bound_check_result = IterBoundCheck::kInbound;
             result->key = Slice(iter_->first);
             if (scan_opts_ && target_num_keys_ > 0 &&
-                iter_->first.compare(key.ToString()) == 0) {
+                comp_->Compare(iter_->first, key) == 0) {
               target_num_keys_--;
             }
           } else {
@@ -7637,8 +7667,8 @@ class UserDefinedIndexTest : public BlockBasedTableTestBase {
             return s;
           }
           if (scan_opts_ && scan_opts_[scan_idx_ - 1].range.limit.has_value()) {
-            if (iter_->first.compare(
-                    scan_opts_[scan_idx_ - 1].range.limit.value().ToString()) >=
+            if (comp_->Compare(iter_->first,
+                               scan_opts_[scan_idx_ - 1].range.limit.value()) >=
                 0) {
               result->bound_check_result = IterBoundCheck::kOutOfBound;
               result->key = Slice();
@@ -7676,8 +7706,8 @@ class UserDefinedIndexTest : public BlockBasedTableTestBase {
             return true;
           }
           if (scan_opts_[scan_idx_ - 1].range.limit.has_value() &&
-              scan_opts_[scan_idx_ - 1].range.limit.value().compare(
-                  iter_->first) <= 0) {
+              comp_->Compare(scan_opts_[scan_idx_ - 1].range.limit.value(),
+                             iter_->first) <= 0) {
             return false;
           }
           return true;
@@ -7700,8 +7730,8 @@ class UserDefinedIndexTest : public BlockBasedTableTestBase {
 
        private:
         std::map<std::string,
-                 std::pair<UserDefinedIndexBuilder::BlockHandle, uint32_t>>&
-            index_;
+                 std::pair<UserDefinedIndexBuilder::BlockHandle, uint32_t>,
+                 TestUserDefinedIndexCompare>& index_;
         std::map<std::string, std::pair<UserDefinedIndexBuilder::BlockHandle,
                                         uint32_t>>::iterator iter_;
         const ScanOptions* scan_opts_;
@@ -7710,13 +7740,18 @@ class UserDefinedIndexTest : public BlockBasedTableTestBase {
         uint32_t target_num_keys_;
         uint64_t seek_error_count_;
         uint64_t next_error_count_;
+        const Comparator* comp_;
       };
 
+      bool reverse_;
       const TestUserDefinedIndexFactory* factory_;
       std::map<std::string,
-               std::pair<UserDefinedIndexBuilder::BlockHandle, uint32_t>>
+               std::pair<UserDefinedIndexBuilder::BlockHandle, uint32_t>,
+               TestUserDefinedIndexCompare>
           index_data_;
     };
+
+    bool reverse_;
   };
 
  protected:
@@ -7727,6 +7762,7 @@ class UserDefinedIndexTest : public BlockBasedTableTestBase {
                          ColumnFamilyHandle* cfh) {
     Slice ub;
     ReadOptions read_opts = ro;
+    const Comparator* comp = cfh->GetComparator();
     int key_count = 0;
     int index = 0;
     auto opts = scan_opts.GetScanRanges();
@@ -7739,6 +7775,8 @@ class UserDefinedIndexTest : public BlockBasedTableTestBase {
       EXPECT_OK(iter->status());
       while (iter->Valid()) {
         key_count++;
+        ASSERT_GE(comp->Compare(iter->key(), opt.range.start.value()), 0);
+        ASSERT_LT(comp->Compare(iter->key(), opt.range.limit.value()), 0);
         iter->Next();
       }
       EXPECT_EQ(key_count, key_counts[index]);
@@ -7887,7 +7925,7 @@ void UserDefinedIndexTest::BasicTest(bool use_partitioned_index) {
   ro.iterate_upper_bound = nullptr;
   iter.reset(reader->NewIterator(ro));
   ASSERT_NE(iter, nullptr);
-  MultiScanArgs scan_opts(BytewiseComparator());
+  MultiScanArgs scan_opts(options.comparator);
 
   std::unordered_map<std::string, std::string> property_bag;
   property_bag["count"] = std::to_string(25);
@@ -8393,7 +8431,7 @@ TEST_F(UserDefinedIndexTest, MultiScanFailureTest) {
   ro.iterate_upper_bound = &ub;
   std::unordered_map<std::string, std::string> property_bag;
   property_bag["count"] = std::to_string(5);
-  MultiScanArgs scan_options(BytewiseComparator());
+  MultiScanArgs scan_options(options.comparator);
   scan_options.insert(key_ranges[0], key_ranges[1], property_bag);
   scan_options.insert(key_ranges[2], key_ranges[3], property_bag);
   scan_options.max_prefetch_size = 3500;
@@ -8531,6 +8569,71 @@ TEST_F(UserDefinedIndexTest, ConfigTest) {
   ASSERT_OK(db->Close());
   ASSERT_OK(DestroyDB(dbname, options));
 }
+
+TEST_F(UserDefinedIndexTest, ReverseMultiScanTest) {
+  Options options;
+  BlockBasedTableOptions table_options;
+  std::string dbname = test::PerThreadDBPath("user_defined_index_test");
+  std::string ingest_file = dbname + "test.sst";
+
+  // Set up the user-defined index factory with ReverseBytewiseComparator
+  auto user_defined_index_factory =
+      std::make_shared<TestUserDefinedIndexFactory>(/*reverse=*/true);
+  table_options.user_defined_index_factory = user_defined_index_factory;
+
+  // Set up custom flush block policy that flushes every 3 keys
+  table_options.flush_block_policy_factory =
+      std::make_shared<CustomFlushBlockPolicyFactory>();
+
+  options.comparator = ReverseBytewiseComparator();
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  std::unique_ptr<SstFileWriter> writer;
+  writer.reset(new SstFileWriter(EnvOptions(), options));
+  ASSERT_OK(writer->Open(ingest_file));
+
+  Random rnd(301);
+  // Add 100 keys in reverse bytewise order
+  for (int i = 99; i >= 0; i--) {
+    std::stringstream ss;
+    ss << std::setw(2) << std::setfill('0') << i;
+    std::string key = "key" + ss.str();
+    std::string value = rnd.RandomString(1024);
+    ASSERT_OK(writer->Put(key, value));
+  }
+  ASSERT_OK(writer->Finish());
+  writer.reset();
+
+  std::unique_ptr<DB> db;
+  options.create_if_missing = true;
+  Status s = DB::Open(options, dbname, &db);
+  ASSERT_OK(s);
+  ASSERT_TRUE(db != nullptr);
+  ColumnFamilyHandle* cfh = nullptr;
+  ASSERT_OK(db->CreateColumnFamily(options, "new_cf", &cfh));
+
+  IngestExternalFileOptions ifo;
+  s = db->IngestExternalFile(cfh, {ingest_file}, ifo);
+  ASSERT_OK(s);
+
+  std::vector<std::string> key_ranges({"key90", "key75", "key30", "key02"});
+  std::vector<int> key_counts;
+  ReadOptions ro;
+  ro.table_index_factory = user_defined_index_factory.get();
+  std::unordered_map<std::string, std::string> property_bag;
+  property_bag["count"] = std::to_string(20);
+  MultiScanArgs scan_opts(options.comparator);
+  scan_opts.insert(key_ranges[0], key_ranges[1], property_bag);
+  key_counts.emplace_back(15);
+  scan_opts.insert(key_ranges[2], key_ranges[3], property_bag);
+  key_counts.emplace_back(24);
+  ValidateMultiScan(ro, scan_opts, key_counts, db, cfh);
+
+  ASSERT_OK(db->DestroyColumnFamilyHandle(cfh));
+  ASSERT_OK(db->Close());
+  ASSERT_OK(DestroyDB(dbname, options));
+}
+
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/unreleased_history/behavior_changes/udi_non_bytewise_comparator.md b/unreleased_history/behavior_changes/udi_non_bytewise_comparator.md
new file mode 100644
index 000000000000..f1494fc1eb36
--- /dev/null
+++ b/unreleased_history/behavior_changes/udi_non_bytewise_comparator.md
@@ -0,0 +1 @@
+Allow UDIs with a non BytewiseComparator

From 90241e18c8616677a08cee78be3a1cb91752f9fb Mon Sep 17 00:00:00 2001
From: Andrew Chang <andrewrchang@meta.com>
Date: Wed, 24 Sep 2025 16:31:13 -0700
Subject: [PATCH 309/500] Add shared mutex field to IODebugContext (#13993)

Summary:
There can be concurrent reads/writes to fields in `IODebugContext`. One example we have seen is for the `cost_info` field which is of type `std::any`. In fact, in RocksDB's async MultiRead implementation, the same `IODebugContext` is re-used across separate async read requests.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13993

Test Plan: Update code which reads/writes to `cost_data` to first acquire shared/exclusive lock on the `mutex` field. There should not be any race conditions when async MultiRead is used.

Reviewed By: pdillinger

Differential Revision: D83091423

Pulled By: archang19

fbshipit-source-id: 4db86d33cf162ed39114b1cd115fcd8964c8ff9b
---
 include/rocksdb/file_system.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/include/rocksdb/file_system.h b/include/rocksdb/file_system.h
index a68dee516679..b19c4786d482 100644
--- a/include/rocksdb/file_system.h
+++ b/include/rocksdb/file_system.h
@@ -24,6 +24,7 @@
 #include <functional>
 #include <limits>
 #include <memory>
+#include <shared_mutex>
 #include <sstream>
 #include <string>
 #include <unordered_map>
@@ -254,6 +255,10 @@ struct IODebugContext {
   // Arbitrary structure containing cost information about the IO request
   std::any cost_info;
 
+  // FileSystem implementations can use this mutex to synchronize concurrent
+  // reads/writes as needed (e.g. to update the counters or cost_info field)
+  std::shared_mutex mutex;
+
   IODebugContext() {}
 
   // Copy constructor

From 1c8a012727f40239c8cd9be46f7fbc19c952644a Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Thu, 25 Sep 2025 11:27:00 -0700
Subject: [PATCH 310/500] Add kCool Temperature (#14000)

Summary:
also requested by internal user, like kIce in https://github.com/facebook/rocksdb/issues/13927

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14000

Test Plan: unit tests updated

Reviewed By: archang19

Differential Revision: D83200479

Pulled By: pdillinger

fbshipit-source-id: 31f2842d87bcad40227aeee9687ff5772393689c
---
 db/compaction/tiered_compaction_test.cc       | 23 ++++---
 db/db_compaction_test.cc                      | 63 +++++++++++++++----
 db/db_test2.cc                                | 25 +++-----
 db/db_test_util.cc                            |  9 +++
 db/db_test_util.h                             |  4 ++
 db_stress_tool/db_stress_test_base.cc         |  5 +-
 file/random_access_file_reader.cc             |  6 ++
 include/rocksdb/iostats_context.h             |  6 ++
 include/rocksdb/statistics.h                  |  2 +
 include/rocksdb/types.h                       |  1 +
 java/rocksjni/portal.h                        |  8 +++
 .../src/main/java/org/rocksdb/TickerType.java |  2 +
 monitoring/iostats_context.cc                 |  2 +
 monitoring/statistics.cc                      |  2 +
 options/options_helper.cc                     | 16 ++---
 tools/db_crashtest.py                         |  8 +--
 16 files changed, 129 insertions(+), 53 deletions(-)

diff --git a/db/compaction/tiered_compaction_test.cc b/db/compaction/tiered_compaction_test.cc
index 0d623678c4b2..f8e9da373394 100644
--- a/db/compaction/tiered_compaction_test.cc
+++ b/db/compaction/tiered_compaction_test.cc
@@ -1764,9 +1764,10 @@ TEST_P(PrecludeLastLevelTest, SmallPrecludeTime) {
   options.env = mock_env_.get();
   options.level0_file_num_compaction_trigger = kNumTrigger;
   options.num_levels = kNumLevels;
-  // This existing test selected to also check the kIce case, which should not
-  // be interesting enough to exercise across all the test cases
-  options.last_level_temperature = Temperature::kIce;
+  // This existing test selected to also check the case of various temperatures
+  // for last_level_temperature, which should not be interesting enough to
+  // exercise across many/all test cases
+  options.last_level_temperature = RandomKnownTemperature();
   DestroyAndReopen(options);
 
   Random rnd(301);
@@ -1794,8 +1795,9 @@ TEST_P(PrecludeLastLevelTest, SmallPrecludeTime) {
   auto seqs = tp_mapping.TEST_GetInternalMapping();
   ASSERT_FALSE(seqs.empty());
   ASSERT_GE(GetSstSizeHelper(Temperature::kUnknown), 1);
-  ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
-  ASSERT_EQ(GetSstSizeHelper(Temperature::kIce), 0);
+  for (auto t : kKnownTemperatures) {
+    ASSERT_EQ(GetSstSizeHelper(t), 0);
+  }
 
   // Wait more than preclude_last_level time, then make sure all the data is
   // compacted to the last level even there's no write (no seqno -> time
@@ -1804,9 +1806,14 @@ TEST_P(PrecludeLastLevelTest, SmallPrecludeTime) {
 
   ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
   ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
-  ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
-  ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
-  ASSERT_GE(GetSstSizeHelper(Temperature::kIce), 1);
+
+  for (auto t : kKnownTemperatures) {
+    if (t == options.last_level_temperature) {
+      ASSERT_GT(GetSstSizeHelper(t), 0);
+    } else {
+      ASSERT_EQ(GetSstSizeHelper(t), 0);
+    }
+  }
 
   Close();
 }
diff --git a/db/db_compaction_test.cc b/db/db_compaction_test.cc
index 99b2c7208dba..7e3f61662220 100644
--- a/db/db_compaction_test.cc
+++ b/db/db_compaction_test.cc
@@ -9915,6 +9915,20 @@ static void VerifyTemperatureFileReadStats(const Statistics& st,
     EXPECT_EQ(iostats->file_io_stats_by_temperature.warm_file_read_count, 0);
   }
 
+  if (temps.Contains(Temperature::kCool)) {
+    EXPECT_GE(st.getTickerCount(COOL_FILE_READ_BYTES), min_bytes);
+    EXPECT_GE(st.getTickerCount(COOL_FILE_READ_COUNT), min_count);
+    EXPECT_GE(iostats->file_io_stats_by_temperature.cool_file_bytes_read,
+              min_bytes);
+    EXPECT_GE(iostats->file_io_stats_by_temperature.cool_file_read_count,
+              min_count);
+  } else {
+    EXPECT_EQ(st.getTickerCount(COOL_FILE_READ_BYTES), 0);
+    EXPECT_EQ(st.getTickerCount(COOL_FILE_READ_COUNT), 0);
+    EXPECT_EQ(iostats->file_io_stats_by_temperature.cool_file_bytes_read, 0);
+    EXPECT_EQ(iostats->file_io_stats_by_temperature.cool_file_read_count, 0);
+  }
+
   if (temps.Contains(Temperature::kCold)) {
     EXPECT_GE(st.getTickerCount(COLD_FILE_READ_BYTES), min_bytes);
     EXPECT_GE(st.getTickerCount(COLD_FILE_READ_COUNT), min_count);
@@ -9945,7 +9959,7 @@ static void VerifyTemperatureFileReadStats(const Statistics& st,
 }
 
 TEST_F(DBCompactionTest, FIFOMultiTierTemperatureAging) {
-  // Test multi-tier aging: Hot -> Warm -> Cold -> Ice
+  // Test multi-tier aging: Hot -> Warm -> Cool -> Cold -> Ice
   Options options = CurrentOptions();
   options.compaction_style = kCompactionStyleFIFO;
   options.num_levels = 1;
@@ -9961,8 +9975,9 @@ TEST_F(DBCompactionTest, FIFOMultiTierTemperatureAging) {
   // Multi-tier aging: files age through multiple temperatures
   fifo_options.file_temperature_age_thresholds = {
       {Temperature::kWarm, 500},   // Hot -> Warm after 500s
-      {Temperature::kCold, 1000},  // Warm -> Cold after 1000s
-      {Temperature::kIce, 1500}    // Cold -> Ice after 1500s
+      {Temperature::kCool, 1000},  // Warm -> Cool
+      {Temperature::kCold, 1500},  // Cool -> Cold
+      {Temperature::kIce, 2000}    // Cold -> Ice
   };
   fifo_options.max_table_files_size = 100000000;
   fifo_options.allow_trivial_copy_when_change_temperature = true;
@@ -9973,8 +9988,8 @@ TEST_F(DBCompactionTest, FIFOMultiTierTemperatureAging) {
   env_->SetMockSleep();
 
   // Track all temperature file creations
-  int total_hot = 0, total_warm = 0, total_cold = 0, total_ice = 0,
-      total_unknown = 0;
+  int total_hot = 0, total_warm = 0, total_cool = 0, total_cold = 0,
+      total_ice = 0, total_unknown = 0;
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
       "NewWritableFile::FileOptions.temperature", [&](void* arg) {
         Temperature temperature = *(static_cast<Temperature*>(arg));
@@ -9985,6 +10000,9 @@ TEST_F(DBCompactionTest, FIFOMultiTierTemperatureAging) {
           case Temperature::kWarm:
             total_warm++;
             break;
+          case Temperature::kCool:
+            total_cool++;
+            break;
           case Temperature::kCold:
             total_cold++;
             break;
@@ -10016,8 +10034,11 @@ TEST_F(DBCompactionTest, FIFOMultiTierTemperatureAging) {
 
   VerifyTemperatureFileReadStats(*options.statistics, Temperature::kHot);
 
+  // Land well into each time interval
+  env_->MockSleepForSeconds(100);
+
   // Age initial files to warm
-  env_->MockSleepForSeconds(600);
+  env_->MockSleepForSeconds(500);
   ASSERT_OK(Put(Key(1), Random::GetTLSInstance()->RandomBinaryString(101)));
   ASSERT_OK(Flush());
   ASSERT_OK(dbfull()->TEST_WaitForCompact());
@@ -10031,12 +10052,26 @@ TEST_F(DBCompactionTest, FIFOMultiTierTemperatureAging) {
   // Verify Warm file statistics
   VerifyTemperatureFileReadStats(*options.statistics, Temperature::kWarm);
 
-  // Age initial files to cold
-  env_->MockSleepForSeconds(600);
+  // Age initial files to cool
+  env_->MockSleepForSeconds(500);
   ASSERT_OK(Put(Key(2), Random::GetTLSInstance()->RandomBinaryString(102)));
   ASSERT_OK(Flush());
   ASSERT_OK(dbfull()->TEST_WaitForCompact());
 
+  // Test reading from Cool temperature file (the aged file)
+  ASSERT_OK(options.statistics->Reset());
+  get_iostats_context()->Reset();
+
+  ASSERT_EQ(100U, Get(Key(0)).size());
+
+  VerifyTemperatureFileReadStats(*options.statistics, Temperature::kCool);
+
+  // Age initial files to cold
+  env_->MockSleepForSeconds(500);
+  ASSERT_OK(Put(Key(3), Random::GetTLSInstance()->RandomBinaryString(103)));
+  ASSERT_OK(Flush());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
   // Test reading from Cold temperature file (the aged file)
   ASSERT_OK(options.statistics->Reset());
   get_iostats_context()->Reset();
@@ -10046,8 +10081,8 @@ TEST_F(DBCompactionTest, FIFOMultiTierTemperatureAging) {
   VerifyTemperatureFileReadStats(*options.statistics, Temperature::kCold);
 
   // Age initial files to ice
-  env_->MockSleepForSeconds(600);
-  ASSERT_OK(Put(Key(3), Random::GetTLSInstance()->RandomBinaryString(103)));
+  env_->MockSleepForSeconds(500);
+  ASSERT_OK(Put(Key(4), Random::GetTLSInstance()->RandomBinaryString(104)));
   ASSERT_OK(Flush());
   ASSERT_OK(dbfull()->TEST_WaitForCompact());
 
@@ -10072,12 +10107,14 @@ TEST_F(DBCompactionTest, FIFOMultiTierTemperatureAging) {
   // Verify current files temperatures
   EXPECT_EQ(temp_counts[Temperature::kHot], 1);
   EXPECT_EQ(temp_counts[Temperature::kWarm], 1);
+  EXPECT_EQ(temp_counts[Temperature::kCool], 1);
   EXPECT_EQ(temp_counts[Temperature::kCold], 1);
   EXPECT_EQ(temp_counts[Temperature::kIce], 3);
 
   // Verify historical (and current) file temperatures
-  EXPECT_EQ(total_hot, 6);
-  EXPECT_EQ(total_warm, 5);
+  EXPECT_EQ(total_hot, 7);
+  EXPECT_EQ(total_warm, 6);
+  EXPECT_EQ(total_cool, 5);
   EXPECT_EQ(total_cold, 4);
   EXPECT_EQ(total_ice, 3);
 
@@ -10087,7 +10124,7 @@ TEST_F(DBCompactionTest, FIFOMultiTierTemperatureAging) {
   get_iostats_context()->Reset();
 
   // Read from all files to verify cumulative statistics
-  for (int i = 0; i < 4; i++) {
+  for (int i = 0; i < 5; i++) {
     ASSERT_EQ(static_cast<unsigned>(100 + i), Get(Key(i)).size());
   }
 
diff --git a/db/db_test2.cc b/db/db_test2.cc
index 4f1738880c3e..1a565c8e1630 100644
--- a/db/db_test2.cc
+++ b/db/db_test2.cc
@@ -6063,16 +6063,9 @@ TEST_F(DBTest2, VariousFileTemperatures) {
   };
 
   // We don't have enough non-unknown temps to confidently distinguish that
-  // a specific setting caused a specific outcome, in a single run. This is a
-  // reasonable work-around without blowing up test time. Only returns
-  // non-unknown temperatures.
-  auto RandomTemp = [] {
-    static std::vector<Temperature> temps = {
-        Temperature::kHot, Temperature::kWarm, Temperature::kCold,
-        Temperature::kIce};
-    return temps[Random::GetTLSInstance()->Uniform(
-        static_cast<int>(temps.size()))];
-  };
+  // a specific setting caused a specific outcome, in a single run. Using
+  // RandomKnownTemperature() is a reasonable work-around without blowing up
+  // test time.
 
   auto test_fs = std::make_shared<MyTestFS>(env_->GetFileSystem());
   std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, test_fs));
@@ -6088,22 +6081,22 @@ TEST_F(DBTest2, VariousFileTemperatures) {
       options.env = env.get();
       test_fs->Reset();
       if (use_optimize) {
-        test_fs->optimize_manifest_temperature = RandomTemp();
+        test_fs->optimize_manifest_temperature = RandomKnownTemperature();
         test_fs->expected_manifest_temperature =
             test_fs->optimize_manifest_temperature;
-        test_fs->optimize_wal_temperature = RandomTemp();
+        test_fs->optimize_wal_temperature = RandomKnownTemperature();
         test_fs->expected_wal_temperature = test_fs->optimize_wal_temperature;
       }
       if (use_temp_options) {
-        options.metadata_write_temperature = RandomTemp();
+        options.metadata_write_temperature = RandomKnownTemperature();
         test_fs->expected_manifest_temperature =
             options.metadata_write_temperature;
         test_fs->expected_other_metadata_temperature =
             options.metadata_write_temperature;
-        options.wal_write_temperature = RandomTemp();
+        options.wal_write_temperature = RandomKnownTemperature();
         test_fs->expected_wal_temperature = options.wal_write_temperature;
-        options.last_level_temperature = RandomTemp();
-        options.default_write_temperature = RandomTemp();
+        options.last_level_temperature = RandomKnownTemperature();
+        options.default_write_temperature = RandomKnownTemperature();
       }
 
       DestroyAndReopen(options);
diff --git a/db/db_test_util.cc b/db/db_test_util.cc
index 018df7978cef..0cefcfd41d73 100644
--- a/db/db_test_util.cc
+++ b/db/db_test_util.cc
@@ -1872,4 +1872,13 @@ template class TargetCacheChargeTrackingCache<
     CacheEntryRole::kBlockBasedTableReader>;
 template class TargetCacheChargeTrackingCache<CacheEntryRole::kFileMetadata>;
 
+const std::vector<Temperature> kKnownTemperatures = {
+    Temperature::kHot, Temperature::kWarm, Temperature::kCool,
+    Temperature::kCold, Temperature::kIce};
+
+Temperature RandomKnownTemperature() {
+  return kKnownTemperatures[Random::GetTLSInstance()->Uniform(
+      static_cast<int>(kKnownTemperatures.size()))];
+}
+
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/db/db_test_util.h b/db/db_test_util.h
index d93d68532317..ad25a85b0336 100644
--- a/db/db_test_util.h
+++ b/db/db_test_util.h
@@ -1467,4 +1467,8 @@ class DBTestBase : public testing::Test {
 // unique ids.
 void VerifySstUniqueIds(const TablePropertiesCollection& props);
 
+// Excludes kUnknown
+extern const std::vector<Temperature> kKnownTemperatures;
+Temperature RandomKnownTemperature();
+
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc
index 06024c25cafc..080ada88f207 100644
--- a/db_stress_tool/db_stress_test_base.cc
+++ b/db_stress_tool/db_stress_test_base.cc
@@ -428,8 +428,9 @@ bool StressTest::BuildOptionsTable() {
     options_tbl.emplace(
         "file_temperature_age_thresholds",
         std::vector<std::string>{
-            "{{temperature=kWarm;age=10}:{temperature=kCold;age=50}:{"
-            "temperature=kIce;age=250}}",
+            "{{temperature=kWarm;age=10}:{temperature=kCool;age=30}:{"
+            "temperature=kCold;age=100}:{"
+            "temperature=kIce;age=300}}",
             "{{temperature=kWarm;age=30}:{temperature=kCold;age=300}}",
             "{{temperature=kCold;age=100}}", "{}"});
     options_tbl.emplace(
diff --git a/file/random_access_file_reader.cc b/file/random_access_file_reader.cc
index f7bf9699822c..f96609a01df3 100644
--- a/file/random_access_file_reader.cc
+++ b/file/random_access_file_reader.cc
@@ -80,6 +80,12 @@ inline void RecordIOStats(Statistics* stats, Temperature file_temperature,
         RecordTick(stats, WARM_FILE_READ_BYTES, size);
         RecordTick(stats, WARM_FILE_READ_COUNT, 1);
         break;
+      case Temperature::kCool:
+        IOSTATS_ADD(file_io_stats_by_temperature.cool_file_bytes_read, size);
+        IOSTATS_ADD(file_io_stats_by_temperature.cool_file_read_count, 1);
+        RecordTick(stats, COOL_FILE_READ_BYTES, size);
+        RecordTick(stats, COOL_FILE_READ_COUNT, 1);
+        break;
       case Temperature::kCold:
         IOSTATS_ADD(file_io_stats_by_temperature.cold_file_bytes_read, size);
         IOSTATS_ADD(file_io_stats_by_temperature.cold_file_read_count, 1);
diff --git a/include/rocksdb/iostats_context.h b/include/rocksdb/iostats_context.h
index 64cf8cb49365..c9ebad1b7043 100644
--- a/include/rocksdb/iostats_context.h
+++ b/include/rocksdb/iostats_context.h
@@ -32,6 +32,8 @@ struct FileIOByTemperature {
   uint64_t hot_file_bytes_read;
   // the number of bytes read to Temperature::kWarm file
   uint64_t warm_file_bytes_read;
+  // the number of bytes read to Temperature::kCool file
+  uint64_t cool_file_bytes_read;
   // the number of bytes read to Temperature::kCold file
   uint64_t cold_file_bytes_read;
   // the number of bytes read to Temperature::kIce file
@@ -40,6 +42,8 @@ struct FileIOByTemperature {
   uint64_t hot_file_read_count;
   // total number of reads to Temperature::kWarm file
   uint64_t warm_file_read_count;
+  // total number of reads to Temperature::kCool file
+  uint64_t cool_file_read_count;
   // total number of reads to Temperature::kCold file
   uint64_t cold_file_read_count;
   // total number of reads to Temperature::kIce file
@@ -48,10 +52,12 @@ struct FileIOByTemperature {
   void Reset() {
     hot_file_bytes_read = 0;
     warm_file_bytes_read = 0;
+    cool_file_bytes_read = 0;
     cold_file_bytes_read = 0;
     ice_file_bytes_read = 0;
     hot_file_read_count = 0;
     warm_file_read_count = 0;
+    cool_file_read_count = 0;
     cold_file_read_count = 0;
     ice_file_read_count = 0;
   }
diff --git a/include/rocksdb/statistics.h b/include/rocksdb/statistics.h
index af97cffeb8d5..db2ef6f79ade 100644
--- a/include/rocksdb/statistics.h
+++ b/include/rocksdb/statistics.h
@@ -443,10 +443,12 @@ enum Tickers : uint32_t {
   // Tiered storage related statistics
   HOT_FILE_READ_BYTES,
   WARM_FILE_READ_BYTES,
+  COOL_FILE_READ_BYTES,
   COLD_FILE_READ_BYTES,
   ICE_FILE_READ_BYTES,
   HOT_FILE_READ_COUNT,
   WARM_FILE_READ_COUNT,
+  COOL_FILE_READ_COUNT,
   COLD_FILE_READ_COUNT,
   ICE_FILE_READ_COUNT,
 
diff --git a/include/rocksdb/types.h b/include/rocksdb/types.h
index 33bd9c869c90..d9b902ff0835 100644
--- a/include/rocksdb/types.h
+++ b/include/rocksdb/types.h
@@ -118,6 +118,7 @@ enum class Temperature : uint8_t {
   kUnknown = 0,
   kHot = 0x04,
   kWarm = 0x08,
+  kCool = 0x0A,
   kCold = 0x0C,
   kIce = 0x10,
   // XXX: this is mis-named. It is instead an invalid temperature beyond the
diff --git a/java/rocksjni/portal.h b/java/rocksjni/portal.h
index 5371c97a17c6..c51f83e1c29f 100644
--- a/java/rocksjni/portal.h
+++ b/java/rocksjni/portal.h
@@ -5199,12 +5199,16 @@ class TickerTypeJni {
         return -0x31;
       case ROCKSDB_NAMESPACE::Tickers::WARM_FILE_READ_BYTES:
         return -0x32;
+      case ROCKSDB_NAMESPACE::Tickers::COOL_FILE_READ_BYTES:
+        return -0x5B;
       case ROCKSDB_NAMESPACE::Tickers::COLD_FILE_READ_BYTES:
         return -0x33;
       case ROCKSDB_NAMESPACE::Tickers::HOT_FILE_READ_COUNT:
         return -0x34;
       case ROCKSDB_NAMESPACE::Tickers::WARM_FILE_READ_COUNT:
         return -0x35;
+      case ROCKSDB_NAMESPACE::Tickers::COOL_FILE_READ_COUNT:
+        return -0x5C;
       case ROCKSDB_NAMESPACE::Tickers::COLD_FILE_READ_COUNT:
         return -0x36;
       case ROCKSDB_NAMESPACE::Tickers::LAST_LEVEL_READ_BYTES:
@@ -5664,12 +5668,16 @@ class TickerTypeJni {
         return ROCKSDB_NAMESPACE::Tickers::HOT_FILE_READ_BYTES;
       case -0x32:
         return ROCKSDB_NAMESPACE::Tickers::WARM_FILE_READ_BYTES;
+      case -0x5B:
+        return ROCKSDB_NAMESPACE::Tickers::COOL_FILE_READ_BYTES;
       case -0x33:
         return ROCKSDB_NAMESPACE::Tickers::COLD_FILE_READ_BYTES;
       case -0x34:
         return ROCKSDB_NAMESPACE::Tickers::HOT_FILE_READ_COUNT;
       case -0x35:
         return ROCKSDB_NAMESPACE::Tickers::WARM_FILE_READ_COUNT;
+      case -0x5C:
+        return ROCKSDB_NAMESPACE::Tickers::COOL_FILE_READ_COUNT;
       case -0x36:
         return ROCKSDB_NAMESPACE::Tickers::COLD_FILE_READ_COUNT;
       case -0x37:
diff --git a/java/src/main/java/org/rocksdb/TickerType.java b/java/src/main/java/org/rocksdb/TickerType.java
index 12cea6d2385b..6a4cc30d7e2b 100644
--- a/java/src/main/java/org/rocksdb/TickerType.java
+++ b/java/src/main/java/org/rocksdb/TickerType.java
@@ -764,10 +764,12 @@ public enum TickerType {
      */
     HOT_FILE_READ_BYTES((byte) -0x31),
     WARM_FILE_READ_BYTES((byte) -0x32),
+    COOL_FILE_READ_BYTES((byte) -0x5B),
     COLD_FILE_READ_BYTES((byte) -0x33),
     ICE_FILE_READ_BYTES((byte) -0x58),
     HOT_FILE_READ_COUNT((byte) -0x34),
     WARM_FILE_READ_COUNT((byte) -0x35),
+    COOL_FILE_READ_COUNT((byte) -0x5C),
     COLD_FILE_READ_COUNT((byte) -0x36),
     ICE_FILE_READ_COUNT((byte) -0x59),
 
diff --git a/monitoring/iostats_context.cc b/monitoring/iostats_context.cc
index 04e98914da9c..9f96655a6b48 100644
--- a/monitoring/iostats_context.cc
+++ b/monitoring/iostats_context.cc
@@ -65,9 +65,11 @@ std::string IOStatsContext::ToString(bool exclude_zero_counters) const {
   IOSTATS_CONTEXT_OUTPUT(cpu_read_nanos);
   IOSTATS_CONTEXT_OUTPUT(file_io_stats_by_temperature.hot_file_bytes_read);
   IOSTATS_CONTEXT_OUTPUT(file_io_stats_by_temperature.warm_file_bytes_read);
+  IOSTATS_CONTEXT_OUTPUT(file_io_stats_by_temperature.cool_file_bytes_read);
   IOSTATS_CONTEXT_OUTPUT(file_io_stats_by_temperature.cold_file_bytes_read);
   IOSTATS_CONTEXT_OUTPUT(file_io_stats_by_temperature.hot_file_read_count);
   IOSTATS_CONTEXT_OUTPUT(file_io_stats_by_temperature.warm_file_read_count);
+  IOSTATS_CONTEXT_OUTPUT(file_io_stats_by_temperature.cool_file_read_count);
   IOSTATS_CONTEXT_OUTPUT(file_io_stats_by_temperature.cold_file_read_count);
   std::string str = ss.str();
   str.erase(str.find_last_not_of(", ") + 1);
diff --git a/monitoring/statistics.cc b/monitoring/statistics.cc
index 4aaf3c6e7c72..d2d316bedae2 100644
--- a/monitoring/statistics.cc
+++ b/monitoring/statistics.cc
@@ -226,10 +226,12 @@ const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
     {REMOTE_COMPACT_WRITE_BYTES, "rocksdb.remote.compact.write.bytes"},
     {HOT_FILE_READ_BYTES, "rocksdb.hot.file.read.bytes"},
     {WARM_FILE_READ_BYTES, "rocksdb.warm.file.read.bytes"},
+    {COOL_FILE_READ_BYTES, "rocksdb.cool.file.read.bytes"},
     {COLD_FILE_READ_BYTES, "rocksdb.cold.file.read.bytes"},
     {ICE_FILE_READ_BYTES, "rocksdb.ice.file.read.bytes"},
     {HOT_FILE_READ_COUNT, "rocksdb.hot.file.read.count"},
     {WARM_FILE_READ_COUNT, "rocksdb.warm.file.read.count"},
+    {COOL_FILE_READ_COUNT, "rocksdb.cool.file.read.count"},
     {COLD_FILE_READ_COUNT, "rocksdb.cold.file.read.count"},
     {ICE_FILE_READ_COUNT, "rocksdb.ice.file.read.count"},
     {LAST_LEVEL_READ_BYTES, "rocksdb.last.level.read.bytes"},
diff --git a/options/options_helper.cc b/options/options_helper.cc
index 09788a31e2cc..f2081ef8259f 100644
--- a/options/options_helper.cc
+++ b/options/options_helper.cc
@@ -367,11 +367,9 @@ std::map<CompactionStopStyle, std::string>
         {kCompactionStopStyleTotalSize, "kCompactionStopStyleTotalSize"}};
 
 std::map<Temperature, std::string> OptionsHelper::temperature_to_string = {
-    {Temperature::kUnknown, "kUnknown"},
-    {Temperature::kHot, "kHot"},
-    {Temperature::kWarm, "kWarm"},
-    {Temperature::kCold, "kCold"},
-    {Temperature::kIce, "kIce"}};
+    {Temperature::kUnknown, "kUnknown"}, {Temperature::kHot, "kHot"},
+    {Temperature::kWarm, "kWarm"},       {Temperature::kCool, "kCool"},
+    {Temperature::kCold, "kCold"},       {Temperature::kIce, "kIce"}};
 
 std::unordered_map<std::string, ChecksumType>
     OptionsHelper::checksum_type_string_map = {{"kNoChecksum", kNoChecksum},
@@ -966,11 +964,9 @@ std::unordered_map<std::string, CompactionStopStyle>
 
 std::unordered_map<std::string, Temperature>
     OptionsHelper::temperature_string_map = {
-        {"kUnknown", Temperature::kUnknown},
-        {"kHot", Temperature::kHot},
-        {"kWarm", Temperature::kWarm},
-        {"kCold", Temperature::kCold},
-        {"kIce", Temperature::kIce}};
+        {"kUnknown", Temperature::kUnknown}, {"kHot", Temperature::kHot},
+        {"kWarm", Temperature::kWarm},       {"kCool", Temperature::kCool},
+        {"kCold", Temperature::kCold},       {"kIce", Temperature::kIce}};
 
 std::unordered_map<std::string, PrepopulateBlobCache>
     OptionsHelper::prepopulate_blob_cache_string_map = {
diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index 2b2639e56826..8f96dd7e0c2a 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -377,13 +377,13 @@ def apply_random_seed_per_iteration():
     "enable_custom_split_merge": lambda: random.choice([0, 1]),
     "adm_policy": lambda: random.choice([0, 1, 2, 3]),
     "last_level_temperature": lambda: random.choice(
-        ["kUnknown", "kHot", "kWarm", "kCold", "kIce"]
+        ["kUnknown", "kHot", "kWarm", "kCool", "kCold", "kIce"]
     ),
     "default_write_temperature": lambda: random.choice(
-        ["kUnknown", "kHot", "kWarm", "kCold", "kIce"]
+        ["kUnknown", "kHot", "kWarm", "kCool", "kCold", "kIce"]
     ),
     "default_temperature": lambda: random.choice(
-        ["kUnknown", "kHot", "kWarm", "kCold", "kIce"]
+        ["kUnknown", "kHot", "kWarm", "kCool", "kCold", "kIce"]
     ),
     # TODO(hx235): enable `enable_memtable_insert_with_hint_prefix_extractor`
     # after fixing the surfaced issue with delete range
@@ -696,7 +696,7 @@ def is_direct_io_supported(dbname):
     # For FIFO compaction (ignored otherwise)
     "file_temperature_age_thresholds": lambda: random.choice(
         [
-            "{{temperature=kWarm;age=10}:{temperature=kCold;age=50}:{temperature=kIce;age=250}}",
+            "{{temperature=kWarm;age=10}:{temperature=kCool;age=30}:{temperature=kCold;age=100}:{temperature=kIce;age=300}}",
             "{{temperature=kWarm;age=30}:{temperature=kCold;age=300}}",
             "{{temperature=kCold;age=100}}",
         ]

From 862438a7a14e7483c05ad0a94fe354810c0bc595 Mon Sep 17 00:00:00 2001
From: Changyu Bi <changyubi@meta.com>
Date: Thu, 25 Sep 2025 17:33:57 -0700
Subject: [PATCH 311/500] Fix handling of out-of-range scan option (#13995)

Summary:
currently BlockBasedTableIterator::Prepare() fails the iterator with non-ok status if an out-of-range scan option is detected. This is due to the interaction between LevelIterator and BlockBasedTableIterator, see added comment above BlockBasedTableIterator::Prepare(). This can fail stress test for L0 files since it doesn't use LevelIterator and scan options are not pruned. This PR fixes this by adding an internal option to MultiScanArgs that enables this check.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13995

Test Plan:
- new unit test
- stress test that fails before this pr: `python3 -u ./tools/db_crashtest.py whitebox --iterpercent=60 --prefix_size=-1 --prefixpercent=0 --readpercent=0 --test_batches_snapshots=0 --use_multiscan=1 --read_fault_one_in=0 --kill_random_test=88888 --interval=60 --multiscan_use_async_io=0 --mmap_read=0 --level0_file_num_compaction_trigger=20`

Reviewed By: anand1976

Differential Revision: D83166088

Pulled By: cbi42

fbshipit-source-id: 241a7d43c8c00d9a98eea0cabb03d2174d51aae5
---
 db/db_iterator_test.cc                        | 68 +++++++++++++++++++
 db/version_set.cc                             |  9 +--
 include/rocksdb/options.h                     | 21 +++++-
 .../block_based/block_based_table_iterator.cc | 30 +++++---
 .../block_based/block_based_table_iterator.h  |  2 +-
 .../block_based_table_reader_test.cc          | 46 +++++++++++++
 6 files changed, 159 insertions(+), 17 deletions(-)

diff --git a/db/db_iterator_test.cc b/db/db_iterator_test.cc
index ee5ac84f29cb..cc44ff069c05 100644
--- a/db/db_iterator_test.cc
+++ b/db/db_iterator_test.cc
@@ -4395,6 +4395,74 @@ TEST_P(DBMultiScanIteratorTest, FailureTest) {
   iter.reset();
 }
 
+TEST_P(DBMultiScanIteratorTest, OutOfL0FileRange) {
+  // Test that prepare does not fail scan when a scan range
+  // is outside of a L0 file's key range.
+  auto options = CurrentOptions();
+  options.compression = kNoCompression;
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+  // Create a Lmax file
+  // key01 ~ key99
+  for (int i = 0; i < 100; ++i) {
+    std::stringstream ss;
+    ss << std::setw(2) << std::setfill('0') << i;
+    ASSERT_OK(Put("k" + ss.str(), rnd.RandomString(1024)));
+  }
+  ASSERT_OK(Flush());
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+  // Create a L0 file
+  // key00 ~ key09
+  for (int i = 0; i < 10; ++i) {
+    std::stringstream ss;
+    ss << std::setw(2) << std::setfill('0') << i;
+    ASSERT_OK(Put("k" + ss.str(), rnd.RandomString(1024)));
+  }
+  ASSERT_OK(Flush());
+  ASSERT_EQ(NumTableFilesAtLevel(0), 1);
+
+  // The second range is outside of L0 file's key range
+  std::vector<std::string> key_ranges({"k04", "k06", "k12", "k14"});
+  ReadOptions ro;
+  Slice ub;
+  ro.iterate_upper_bound = &ub;
+  ro.fill_cache = GetParam();
+  MultiScanArgs scan_options(BytewiseComparator());
+  scan_options.insert(key_ranges[0], key_ranges[1]);
+  scan_options.insert(key_ranges[2], key_ranges[3]);
+  ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily();
+  std::unique_ptr<Iterator> iter(dbfull()->NewIterator(ro, cfh));
+  ASSERT_NE(iter, nullptr);
+  iter->Prepare(scan_options);
+  int count = 0;
+  ub = key_ranges[1];
+  iter->Seek(key_ranges[0]);
+  while (iter->status().ok() && iter->Valid()) {
+    ASSERT_GE(iter->key().compare(key_ranges[0]), 0);
+    ASSERT_LT(iter->key().compare(key_ranges[1]), 0);
+    count++;
+    iter->Next();
+  }
+  ASSERT_OK(iter->status()) << iter->status().ToString();
+  ASSERT_EQ(count, 2);
+
+  ub = key_ranges[3];
+  count = 0;
+  iter->Seek(key_ranges[2]);
+  while (iter->status().ok() && iter->Valid()) {
+    ASSERT_GE(iter->key().compare(key_ranges[2]), 0);
+    ASSERT_LT(iter->key().compare(key_ranges[3]), 0);
+    count++;
+    iter->Next();
+  }
+  ASSERT_OK(iter->status()) << iter->status().ToString();
+  ASSERT_EQ(count, 2);
+}
+
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/db/version_set.cc b/db/version_set.cc
index 9a4189cf8e36..f2eef583d0ef 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -1186,13 +1186,10 @@ class LevelIterator final : public InternalIterator {
         }
       }
     }
-    // Propagate io colaescing threshold
-    // TODO: This is error prone as we may forget to copy some fields. Think
-    // of a better way to do this.
+    // Propagate multiscan configs
     for (auto& file_to_arg : *file_to_scan_opts_) {
-      file_to_arg.second.io_coalesce_threshold = so->io_coalesce_threshold;
-      file_to_arg.second.max_prefetch_size = so->max_prefetch_size;
-      file_to_arg.second.use_async_io = so->use_async_io;
+      file_to_arg.second.CopyConfigFrom(*so);
+      file_to_arg.second.SetRequireFileOverlap(true);
     }
   }
 
diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index bdba2a05519c..ba7521cf591b 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -1848,7 +1848,7 @@ class MultiScanArgs {
   operator std::vector<ScanOptions>*() { return &original_ranges_; }
 
   operator const std::vector<ScanOptions>*() const { return &original_ranges_; }
-  // Destructor
+
   ~MultiScanArgs() {}
 
   const std::vector<ScanOptions>& GetScanRanges() const {
@@ -1857,6 +1857,20 @@ class MultiScanArgs {
 
   const Comparator* GetComparator() const { return comp_; }
 
+  void SetRequireFileOverlap(bool require_overlap) {
+    require_file_overlap_ = require_overlap;
+  }
+
+  bool RequireFileOverlap() const { return require_file_overlap_; }
+
+  // Copies the configurations (excluding actual scan ranges) from another
+  // MultiScanArgs.
+  void CopyConfigFrom(const MultiScanArgs& other) {
+    io_coalesce_threshold = other.io_coalesce_threshold;
+    max_prefetch_size = other.max_prefetch_size;
+    use_async_io = other.use_async_io;
+  }
+
   uint64_t io_coalesce_threshold = 16 << 10;  // 16KB by default
 
   // Maximum size (in bytes) for the data blocks loaded by a MultiScan.
@@ -1880,6 +1894,11 @@ class MultiScanArgs {
   // The comparator used for ordering ranges
   const Comparator* comp_;
   std::vector<ScanOptions> original_ranges_;
+
+  // Internal use only.
+  // Fail the Prepare() on a file if a scan range does not overlap
+  // with the file.
+  bool require_file_overlap_{false};
 };
 
 // Options that control read operations
diff --git a/table/block_based/block_based_table_iterator.cc b/table/block_based/block_based_table_iterator.cc
index 9c2880406361..ea288022a4e2 100644
--- a/table/block_based/block_based_table_iterator.cc
+++ b/table/block_based/block_based_table_iterator.cc
@@ -963,6 +963,15 @@ BlockBasedTableIterator::MultiScanState::~MultiScanState() {
 // - After Prepare(), the iterator expects Seek to be called on the start key
 // of each ScanOption in order. If any other Seek is done, an error status is
 // returned
+// - Whenever all blocks of a scan opt are exhausted, the iterator will become
+// invalid and UpperBoundCheckResult() will return kOutOfBound. So that the
+// upper layer (LevelIterator) will stop scanning instead thinking EOF is
+// reached and continue into the next file. The only exception is for the last
+// scan opt. If we reach the end of the last scan opt, UpperBoundCheckResult()
+// will return kUnknown instead of kOutOfBound. This mechanism requires that
+// scan opts are properly pruned such that there is no scan opt that is after
+// this file's key range. This check can be enforeced by setting
+// MultiScanArgs::require_file_overlap to true.
 // FIXME: DBIter and MergingIterator may
 // internally do Seek() on child iterators, e.g. due to
 // ReadOptions::max_skippable_internal_keys or reseeking into range deletion
@@ -989,8 +998,9 @@ void BlockBasedTableIterator::Prepare(const MultiScanArgs* multiscan_opts) {
   std::vector<BlockHandle> scan_block_handles;
   std::vector<std::tuple<size_t, size_t>> block_index_ranges_per_scan;
   const std::vector<ScanOptions>& scan_opts = multiscan_opts->GetScanRanges();
-  multi_scan_status_ = CollectBlockHandles(scan_opts, &scan_block_handles,
-                                           &block_index_ranges_per_scan);
+  multi_scan_status_ =
+      CollectBlockHandles(scan_opts, multiscan_opts->RequireFileOverlap(),
+                          &scan_block_handles, &block_index_ranges_per_scan);
   if (!multi_scan_status_.ok()) {
     return;
   }
@@ -1316,7 +1326,7 @@ Status BlockBasedTableIterator::ValidateScanOptions(
 }
 
 Status BlockBasedTableIterator::CollectBlockHandles(
-    const std::vector<ScanOptions>& scan_opts,
+    const std::vector<ScanOptions>& scan_opts, bool require_file_overlap,
     std::vector<BlockHandle>* scan_block_handles,
     std::vector<std::tuple<size_t, size_t>>* block_index_ranges_per_scan) {
   for (const auto& scan_opt : scan_opts) {
@@ -1368,12 +1378,14 @@ Status BlockBasedTableIterator::CollectBlockHandles(
       ++num_blocks;
     } else if (num_blocks == 0 && index_iter_->UpperBoundCheckResult() !=
                                       IterBoundCheck::kOutOfBound) {
-      // We should not have scan ranges that are completely after the file's
-      // range. This is important for FindBlockForwardInMultiScan() which only
-      // lets the upper layer (LevelIterator) advance to the next SST file when
-      // the last scan range is exhausted.
-      return Status::InvalidArgument("Scan does not intersect with file");
-      ;
+      // If require_file_overlap is set, then the scan ranges for this file
+      // must intersect with the file. Otherwise, allow empty intersection.
+      if (require_file_overlap) {
+        // This is important for FindBlockForwardInMultiScan() which only
+        // lets the upper layer (LevelIterator) advance to the next SST file
+        // when the last scan range is exhausted.
+        return Status::InvalidArgument("Scan does not intersect with file");
+      }
     }
     block_index_ranges_per_scan->emplace_back(
         scan_block_handles->size() - num_blocks, scan_block_handles->size());
diff --git a/table/block_based/block_based_table_iterator.h b/table/block_based/block_based_table_iterator.h
index 095529341c95..8d75770897fe 100644
--- a/table/block_based/block_based_table_iterator.h
+++ b/table/block_based/block_based_table_iterator.h
@@ -669,7 +669,7 @@ class BlockBasedTableIterator : public InternalIteratorBase<Slice> {
   Status ValidateScanOptions(const MultiScanArgs* multiscan_opts);
 
   Status CollectBlockHandles(
-      const std::vector<ScanOptions>& scan_opts,
+      const std::vector<ScanOptions>& scan_opts, bool require_file_overlap,
       std::vector<BlockHandle>* scan_block_handles,
       std::vector<std::tuple<size_t, size_t>>* block_index_ranges_per_scan);
 
diff --git a/table/block_based/block_based_table_reader_test.cc b/table/block_based/block_based_table_reader_test.cc
index 92b79143f609..00749636c579 100644
--- a/table/block_based/block_based_table_reader_test.cc
+++ b/table/block_based/block_based_table_reader_test.cc
@@ -1492,6 +1492,52 @@ TEST_P(BlockBasedTableReaderTest, MultiScanUnpinPreviousBlocks) {
   }
 }
 
+TEST_P(BlockBasedTableReaderTest, MultiScanOptFileOverlapChecking) {
+  std::vector<std::pair<std::string, std::string>> kv =
+      BlockBasedTableReaderBaseTest::GenerateKVMap(
+          20 /* num_block */,
+          true /* mixed_with_human_readable_string_value */);
+  std::vector<std::pair<std::string, std::string>> actual_kv(
+      kv.begin(), kv.begin() + 15 * kEntriesPerBlock);
+
+  std::string table_name = "BlockBasedTableReaderTest_UnpinPreviousBlocks" +
+                           CompressionTypeToString(compression_type_);
+  ImmutableOptions ioptions(options_);
+  CreateTable(table_name, ioptions, compression_type_, actual_kv,
+              compression_parallel_threads_, compression_dict_bytes_);
+
+  std::unique_ptr<BlockBasedTable> table;
+  FileOptions foptions;
+  foptions.use_direct_reads = use_direct_reads_;
+  InternalKeyComparator comparator(options_.comparator);
+  NewBlockBasedTableReader(foptions, ioptions, comparator, table_name, &table,
+                           true /* bool prefetch_index_and_filter_in_cache */,
+                           nullptr /* status */, persist_udt_);
+
+  ReadOptions read_opts;
+  std::unique_ptr<InternalIterator> iter;
+  iter.reset(table->NewIterator(
+      read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
+      /*skip_filters=*/false, TableReaderCaller::kUncategorized));
+
+  MultiScanArgs scan_options(BytewiseComparator());
+  scan_options.SetRequireFileOverlap(false);
+  scan_options.insert(ExtractUserKey(kv[5 * kEntriesPerBlock].first),
+                      ExtractUserKey(kv[6 * kEntriesPerBlock].first));
+  scan_options.insert(ExtractUserKey(kv[16 * kEntriesPerBlock].first),
+                      ExtractUserKey(kv[17 * kEntriesPerBlock].first));
+
+  iter->Prepare(&scan_options);
+  ASSERT_OK(iter->status());
+
+  iter.reset(table->NewIterator(
+      read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
+      /*skip_filters=*/false, TableReaderCaller::kUncategorized));
+  scan_options.SetRequireFileOverlap(true);
+  iter->Prepare(&scan_options);
+  ASSERT_TRUE(iter->status().IsInvalidArgument());
+}
+
 // Param 1: compression type
 // Param 2: whether to use direct reads
 // Param 3: Block Based Table Index type, partitioned filters are also enabled

From e859c3b7af8892064b1538a58565f7cc3ec354d5 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Thu, 25 Sep 2025 17:35:23 -0700
Subject: [PATCH 312/500] Improve version macros (#14004)

Summary:
* Delete obsolete double-underscore version macros, `__ROCKSDB_MAJOR__` etc.
* Add convenient ROCKSDB_VERSION_GE(x, y, z) macro for conditional compilation

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14004

Test Plan: Unit test added

Reviewed By: jaykorean

Differential Revision: D83264938

Pulled By: pdillinger

fbshipit-source-id: 23dcfb2760751fb87e232b8e0bbda610fd4ac73c
---
 db/dbformat_test.cc       | 44 +++++++++++++++++++++++++++++++++++++++
 include/rocksdb/db.h      |  4 ++--
 include/rocksdb/version.h | 17 +++++++++------
 3 files changed, 57 insertions(+), 8 deletions(-)

diff --git a/db/dbformat_test.cc b/db/dbformat_test.cc
index ab31e5a6f087..674e01307f19 100644
--- a/db/dbformat_test.cc
+++ b/db/dbformat_test.cc
@@ -333,6 +333,50 @@ TEST_F(FormatTest, ReplaceInternalKeyWithMinTimestamp) {
   ASSERT_EQ(kTypeValue, new_key.type);
 }
 
+TEST(RocksdbVersionTest, Version) {
+  // Test preprocessor macros for versioning
+  ASSERT_GT(ROCKSDB_MAJOR, 0);
+  ASSERT_GE(ROCKSDB_MINOR, 0);
+  ASSERT_GE(ROCKSDB_PATCH, 0);
+  ASSERT_LT(ROCKSDB_MAJOR, 1000);
+  ASSERT_LT(ROCKSDB_MINOR, 1000);
+  ASSERT_LT(ROCKSDB_PATCH, 1000);
+  ASSERT_EQ(ROCKSDB_MAKE_VERSION_INT(123, 456, 789), 123456789);
+  ASSERT_GT(ROCKSDB_VERSION_INT, 9999999);
+  ASSERT_LT(ROCKSDB_VERSION_INT, 99999999);
+  static_assert(ROCKSDB_VERSION_GE(9, 8, 7));
+  static_assert(
+      ROCKSDB_VERSION_GE(ROCKSDB_MAJOR, ROCKSDB_MINOR, ROCKSDB_PATCH));
+  static_assert(
+      ROCKSDB_VERSION_GE(ROCKSDB_MAJOR, ROCKSDB_MINOR, ROCKSDB_PATCH - 1));
+  static_assert(
+      ROCKSDB_VERSION_GE(ROCKSDB_MAJOR, ROCKSDB_MINOR, ROCKSDB_PATCH - 100));
+  static_assert(
+      ROCKSDB_VERSION_GE(ROCKSDB_MAJOR, ROCKSDB_MINOR - 1, ROCKSDB_PATCH + 1));
+  static_assert(ROCKSDB_VERSION_GE(ROCKSDB_MAJOR - 1, ROCKSDB_MINOR + 1,
+                                   ROCKSDB_PATCH + 1));
+  static_assert(
+      !ROCKSDB_VERSION_GE(ROCKSDB_MAJOR, ROCKSDB_MINOR, ROCKSDB_PATCH + 1));
+  static_assert(
+      !ROCKSDB_VERSION_GE(ROCKSDB_MAJOR, ROCKSDB_MINOR, ROCKSDB_PATCH + 100));
+  static_assert(
+      !ROCKSDB_VERSION_GE(ROCKSDB_MAJOR, ROCKSDB_MINOR + 1, ROCKSDB_PATCH - 1));
+  static_assert(!ROCKSDB_VERSION_GE(ROCKSDB_MAJOR + 1, ROCKSDB_MINOR - 1,
+                                    ROCKSDB_PATCH - 1));
+  // More typical usage (but with literal numbers based on relevant API
+  // features)
+#if ROCKSDB_VERSION_GE(ROCKSDB_MAJOR, ROCKSDB_MINOR, ROCKSDB_PATCH)
+  static_assert(true);
+#else
+  static_assert(false);
+#endif
+#if !ROCKSDB_VERSION_GE(ROCKSDB_MAJOR, ROCKSDB_MINOR, ROCKSDB_PATCH + 1)
+  static_assert(true);
+#else
+  static_assert(false);
+#endif
+}
+
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h
index 7fdad866784d..cad566fd5d72 100644
--- a/include/rocksdb/db.h
+++ b/include/rocksdb/db.h
@@ -96,8 +96,8 @@ class ColumnFamilyHandle {
   virtual const Comparator* GetComparator() const = 0;
 };
 
-static const int kMajorVersion = __ROCKSDB_MAJOR__;
-static const int kMinorVersion = __ROCKSDB_MINOR__;
+static const int kMajorVersion = ROCKSDB_MAJOR;
+static const int kMinorVersion = ROCKSDB_MINOR;
 
 struct GetMergeOperandsOptions {
   using ContinueCallback = std::function<bool(Slice)>;
diff --git a/include/rocksdb/version.h b/include/rocksdb/version.h
index 1761eff70e73..31f293484c4c 100644
--- a/include/rocksdb/version.h
+++ b/include/rocksdb/version.h
@@ -15,12 +15,17 @@
 #define ROCKSDB_MINOR 8
 #define ROCKSDB_PATCH 0
 
-// Do not use these. We made the mistake of declaring macros starting with
-// double underscore. Now we have to live with our choice. We'll deprecate these
-// at some point
-#define __ROCKSDB_MAJOR__ ROCKSDB_MAJOR
-#define __ROCKSDB_MINOR__ ROCKSDB_MINOR
-#define __ROCKSDB_PATCH__ ROCKSDB_PATCH
+// Make it easy to do conditional compilation based on version checks, i.e.
+// #if ROCKSDB_VERSION_GE(4, 5, 6)
+// int thisCoderequiresVersion_4_5_6_OrGreater;
+// #else
+// int thisCodeIsForOlderVersions;
+// #endif
+#define ROCKSDB_MAKE_VERSION_INT(a, b, c) ((a) * 1000000 + (b) * 1000 + (c))
+#define ROCKSDB_VERSION_INT \
+  ROCKSDB_MAKE_VERSION_INT(ROCKSDB_MAJOR, ROCKSDB_MINOR, ROCKSDB_PATCH)
+#define ROCKSDB_VERSION_GE(a, b, c) \
+  (ROCKSDB_VERSION_INT >= ROCKSDB_MAKE_VERSION_INT(a, b, c))
 
 namespace ROCKSDB_NAMESPACE {
 // Returns a set of properties indicating how/when/where this version of RocksDB

From 3d53af974694a6134bd712cc86fb8c6b26b253b0 Mon Sep 17 00:00:00 2001
From: Xingbo Wang <xbw@fb.com>
Date: Fri, 26 Sep 2025 15:32:50 -0700
Subject: [PATCH 313/500] Allow passing comparator in UDI (#14001)

Summary:
Pass the comparator to UDI interface for both reader and builder.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14001

Test Plan: Unit test

Reviewed By: anand1976

Differential Revision: D83339943

Pulled By: xingbowang

fbshipit-source-id: 7f6541776b0995260e28224329f0cca37f13b3d4
---
 include/rocksdb/user_defined_index.h          |  23 +-
 .../block_based/block_based_table_builder.cc  |  21 +-
 table/block_based/block_based_table_reader.cc |  23 +-
 table/table_test.cc                           | 620 +++++++++---------
 4 files changed, 342 insertions(+), 345 deletions(-)

diff --git a/include/rocksdb/user_defined_index.h b/include/rocksdb/user_defined_index.h
index f51345231cab..395f9fbf3530 100644
--- a/include/rocksdb/user_defined_index.h
+++ b/include/rocksdb/user_defined_index.h
@@ -30,8 +30,7 @@ inline const std::string kUserDefinedIndexPrefix =
 //
 // This is currently supported only for a restricted set of use cases. The
 // CF must be ingest only, and only files containing Puts generated by
-// SstFileWriter are supported. The user_comparator used for the CF must
-// be BytewiseComparator.
+// SstFileWriter are supported.
 
 // The interface for building user-defined index.
 class UserDefinedIndexBuilder {
@@ -145,6 +144,11 @@ class UserDefinedIndexReader {
   virtual size_t ApproximateMemoryUsage() const = 0;
 };
 
+// Options for user defined index
+struct UserDefinedIndexOption {
+  const Comparator* comparator = BytewiseComparator();
+};
+
 // Factory for creating user-defined index builders.
 class UserDefinedIndexFactory : public Customizable {
  public:
@@ -163,6 +167,21 @@ class UserDefinedIndexFactory : public Customizable {
   // block
   virtual std::unique_ptr<UserDefinedIndexReader> NewReader(
       Slice& index_block) const = 0;
+
+  // New API for allowing customized comparator
+  virtual Status NewBuilder(
+      const UserDefinedIndexOption& /*option*/,
+      std::unique_ptr<UserDefinedIndexBuilder>& builder) const {
+    builder.reset(NewBuilder());
+    return Status::OK();
+  };
+
+  virtual Status NewReader(
+      const UserDefinedIndexOption& /*option*/, Slice& index_block,
+      std::unique_ptr<UserDefinedIndexReader>& reader) const {
+    reader = NewReader(index_block);
+    return Status::OK();
+  };
 };
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc
index 1210b7769212..0fa6879316ea 100644
--- a/table/block_based/block_based_table_builder.cc
+++ b/table/block_based/block_based_table_builder.cc
@@ -1191,13 +1191,20 @@ struct BlockBasedTableBuilder::Rep {
             Status::InvalidArgument("user_defined_index_factory not supported "
                                     "with parallel compression"));
       } else {
-        std::unique_ptr<UserDefinedIndexBuilder> user_defined_index_builder(
-            table_options.user_defined_index_factory->NewBuilder());
-        if (user_defined_index_builder != nullptr) {
-          index_builder = std::make_unique<UserDefinedIndexBuilderWrapper>(
-              std::string(table_options.user_defined_index_factory->Name()),
-              std::move(index_builder), std::move(user_defined_index_builder),
-              &internal_comparator, ts_sz, persist_user_defined_timestamps);
+        std::unique_ptr<UserDefinedIndexBuilder> user_defined_index_builder;
+        UserDefinedIndexOption udi_options;
+        udi_options.comparator = internal_comparator.user_comparator();
+        auto s = table_options.user_defined_index_factory->NewBuilder(
+            udi_options, user_defined_index_builder);
+        if (!s.ok()) {
+          SetStatus(s);
+        } else {
+          if (user_defined_index_builder != nullptr) {
+            index_builder = std::make_unique<UserDefinedIndexBuilderWrapper>(
+                std::string(table_options.user_defined_index_factory->Name()),
+                std::move(index_builder), std::move(user_defined_index_builder),
+                &internal_comparator, ts_sz, persist_user_defined_timestamps);
+          }
         }
       }
     }
diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc
index 5c22173223de..a0eba4f009b5 100644
--- a/table/block_based/block_based_table_reader.cc
+++ b/table/block_based/block_based_table_reader.cc
@@ -1369,15 +1369,20 @@ Status BlockBasedTable::PrefetchIndexAndFilterBlocks(
       if (s.ok()) {
         assert(!rep_->udi_block.IsEmpty());
 
-        std::unique_ptr<UserDefinedIndexReader> udi_reader =
-            table_options.user_defined_index_factory->NewReader(
-                rep_->udi_block.GetValue()->data);
-        if (udi_reader) {
-          index_reader = std::make_unique<UserDefinedIndexReaderWrapper>(
-              udi_name, std::move(index_reader), std::move(udi_reader));
-        } else {
-          s = Status::Corruption("Failed to create UDI reader for " + udi_name +
-                                 " in file " + rep_->file->file_name());
+        std::unique_ptr<UserDefinedIndexReader> udi_reader;
+        UserDefinedIndexOption udi_option;
+        udi_option.comparator = rep_->internal_comparator.user_comparator();
+        s = table_options.user_defined_index_factory->NewReader(
+            udi_option, rep_->udi_block.GetValue()->data, udi_reader);
+        if (s.ok()) {
+          if (udi_reader) {
+            index_reader = std::make_unique<UserDefinedIndexReaderWrapper>(
+                udi_name, std::move(index_reader), std::move(udi_reader));
+          } else {
+            s = Status::Corruption("Failed to create UDI reader for " +
+                                   udi_name + " in file " +
+                                   rep_->file->file_name());
+          }
         }
       }
     }
diff --git a/table/table_test.cc b/table/table_test.cc
index 73b59e332fde..b699fce1ad4e 100644
--- a/table/table_test.cc
+++ b/table/table_test.cc
@@ -7422,9 +7422,9 @@ TEST_F(ExternalTableTest, IngestionTest) {
   ASSERT_OK(db->Close());
 }
 
-// Test with a bool parameter for BytewiseComparator() (false) or
-// ReverseBytewiseComparator() (true)
-class UserDefinedIndexTest : public BlockBasedTableTestBase {
+class UserDefinedIndexTest
+    : public BlockBasedTableTestBase,
+      public testing::WithParamInterface<const Comparator*> {
  public:
   class CustomFlushBlockPolicy : public FlushBlockPolicy {
    public:
@@ -7460,43 +7460,46 @@ class UserDefinedIndexTest : public BlockBasedTableTestBase {
  public:
   class TestUserDefinedIndexFactory : public UserDefinedIndexFactory {
    public:
-    TestUserDefinedIndexFactory(bool reverse = false) { reverse_ = reverse; }
     const char* Name() const override { return "test_index"; }
-    UserDefinedIndexBuilder* NewBuilder() const override {
-      return new TestUserDefinedIndexBuilder(reverse_);
+    Status NewBuilder(
+        const UserDefinedIndexOption& /*option*/,
+        std::unique_ptr<UserDefinedIndexBuilder>& builder) const override {
+      builder = std::make_unique<TestUserDefinedIndexBuilder>();
+      return Status::OK();
     }
 
+    struct CustomizedMapComparator {
+      CustomizedMapComparator(const Comparator* _comparator)
+          : comparator(_comparator) {}
+      const Comparator* comparator;
+      bool operator()(const std::string& lhs, const std::string& rhs) const {
+        return comparator->Compare(lhs, rhs) < 0;
+      }
+    };
+
+    // Deprecated API
+    UserDefinedIndexBuilder* NewBuilder() const override { return nullptr; }
+
     std::unique_ptr<UserDefinedIndexReader> NewReader(
-        Slice& index_block) const override {
-      return std::make_unique<TestUserDefinedIndexReader>(reverse_, index_block,
-                                                          this);
+        Slice& /*index_block*/) const override {
+      return nullptr;
+    }
+
+    Status NewReader(
+        const UserDefinedIndexOption& option, Slice& index_block,
+        std::unique_ptr<UserDefinedIndexReader>& reader) const override {
+      reader = std::make_unique<TestUserDefinedIndexReader>(
+          index_block, option.comparator, this);
+      return Status::OK();
     }
 
     uint64_t seek_error_count_ = 0;
     uint64_t next_error_count_ = 0;
 
    private:
-    struct TestUserDefinedIndexCompare {
-      bool operator()(const std::string& lhs, const std::string& rhs) const {
-        if (!reverse) {
-          return lhs < rhs;
-        } else {
-          return rhs < lhs;
-        }
-      }
-
-      bool reverse;
-      explicit TestUserDefinedIndexCompare(bool _reverse) {
-        reverse = _reverse;
-      }
-    };
-
     class TestUserDefinedIndexBuilder : public UserDefinedIndexBuilder {
      public:
-      TestUserDefinedIndexBuilder(bool reverse)
-          : entries_added_(0),
-            index_data_(TestUserDefinedIndexCompare(reverse)),
-            keys_added_(0) {}
+      TestUserDefinedIndexBuilder() : entries_added_(0), keys_added_(0) {}
 
       Slice AddIndexEntry(const Slice& last_key_in_current_block,
                           const Slice* first_key_in_next_block,
@@ -7558,8 +7561,7 @@ class UserDefinedIndexTest : public BlockBasedTableTestBase {
 
      private:
       int entries_added_;
-      std::map<std::string, std::string, TestUserDefinedIndexCompare>
-          index_data_;
+      std::map<std::string, std::string> index_data_;
       uint32_t keys_added_;
       std::string index_contents_data_;
     };
@@ -7567,11 +7569,11 @@ class UserDefinedIndexTest : public BlockBasedTableTestBase {
     class TestUserDefinedIndexReader : public UserDefinedIndexReader {
      public:
       explicit TestUserDefinedIndexReader(
-          bool reverse, Slice& index_block,
+          Slice& index_block, const Comparator* comparator,
           const TestUserDefinedIndexFactory* factory)
-          : reverse_(reverse),
-            factory_(factory),
-            index_data_(TestUserDefinedIndexCompare(reverse)) {
+          : factory_(factory),
+            comparator_(comparator),
+            index_data_(CustomizedMapComparator(comparator)) {
         Slice block = index_block;
         while (!block.empty()) {
           Slice key;
@@ -7595,7 +7597,7 @@ class UserDefinedIndexTest : public BlockBasedTableTestBase {
       std::unique_ptr<UserDefinedIndexIterator> NewIterator(
           const ReadOptions& /*ro*/) override {
         return std::make_unique<TestUserDefinedIndexIterator>(
-            reverse_, index_data_, factory_);
+            index_data_, factory_, comparator_);
       }
 
       size_t ApproximateMemoryUsage() const override { return 0; }
@@ -7604,11 +7606,11 @@ class UserDefinedIndexTest : public BlockBasedTableTestBase {
       class TestUserDefinedIndexIterator : public UserDefinedIndexIterator {
        public:
         TestUserDefinedIndexIterator(
-            bool reverse,
             std::map<std::string,
                      std::pair<UserDefinedIndexBuilder::BlockHandle, uint32_t>,
-                     TestUserDefinedIndexCompare>& index,
-            const TestUserDefinedIndexFactory* factory)
+                     CustomizedMapComparator>& index,
+            const TestUserDefinedIndexFactory* factory,
+            const Comparator* comparator)
             : index_(index),
               iter_(index_.end()),
               scan_opts_(nullptr),
@@ -7616,8 +7618,7 @@ class UserDefinedIndexTest : public BlockBasedTableTestBase {
               target_num_keys_(0),
               seek_error_count_(factory->seek_error_count_),
               next_error_count_(factory->next_error_count_),
-              comp_(reverse ? ReverseBytewiseComparator()
-                            : BytewiseComparator()) {}
+              comparator_(comparator) {}
 
         Status SeekAndGetResult(const Slice& key,
                                 IterateResult* result) override {
@@ -7631,9 +7632,9 @@ class UserDefinedIndexTest : public BlockBasedTableTestBase {
           }
           if (scan_opts_) {
             // Seeks should be in order specified in scan_opts_
-            EXPECT_EQ(
-                comp_->Compare(scan_opts_[scan_idx_].range.start.value(), key),
-                0);
+            EXPECT_EQ(comparator_->Compare(
+                          scan_opts_[scan_idx_].range.start.value(), key),
+                      0);
             EXPECT_TRUE(scan_opts_[scan_idx_].property_bag.has_value());
             target_num_keys_ = std::stoi(scan_opts_[scan_idx_]
                                              .property_bag.value()
@@ -7647,7 +7648,7 @@ class UserDefinedIndexTest : public BlockBasedTableTestBase {
             result->bound_check_result = IterBoundCheck::kInbound;
             result->key = Slice(iter_->first);
             if (scan_opts_ && target_num_keys_ > 0 &&
-                comp_->Compare(iter_->first, key) == 0) {
+                comparator_->Compare(key, iter_->first) == 0) {
               target_num_keys_--;
             }
           } else {
@@ -7667,9 +7668,9 @@ class UserDefinedIndexTest : public BlockBasedTableTestBase {
             return s;
           }
           if (scan_opts_ && scan_opts_[scan_idx_ - 1].range.limit.has_value()) {
-            if (comp_->Compare(iter_->first,
-                               scan_opts_[scan_idx_ - 1].range.limit.value()) >=
-                0) {
+            if (comparator_->Compare(
+                    iter_->first,
+                    scan_opts_[scan_idx_ - 1].range.limit.value()) >= 0) {
               result->bound_check_result = IterBoundCheck::kOutOfBound;
               result->key = Slice();
               return Status::OK();
@@ -7706,8 +7707,9 @@ class UserDefinedIndexTest : public BlockBasedTableTestBase {
             return true;
           }
           if (scan_opts_[scan_idx_ - 1].range.limit.has_value() &&
-              comp_->Compare(scan_opts_[scan_idx_ - 1].range.limit.value(),
-                             iter_->first) <= 0) {
+              comparator_->Compare(
+                  scan_opts_[scan_idx_ - 1].range.limit.value(),
+                  iter_->first) <= 0) {
             return false;
           }
           return true;
@@ -7731,7 +7733,7 @@ class UserDefinedIndexTest : public BlockBasedTableTestBase {
        private:
         std::map<std::string,
                  std::pair<UserDefinedIndexBuilder::BlockHandle, uint32_t>,
-                 TestUserDefinedIndexCompare>& index_;
+                 CustomizedMapComparator>& index_;
         std::map<std::string, std::pair<UserDefinedIndexBuilder::BlockHandle,
                                         uint32_t>>::iterator iter_;
         const ScanOptions* scan_opts_;
@@ -7740,43 +7742,103 @@ class UserDefinedIndexTest : public BlockBasedTableTestBase {
         uint32_t target_num_keys_;
         uint64_t seek_error_count_;
         uint64_t next_error_count_;
-        const Comparator* comp_;
+        const Comparator* comparator_;
       };
 
-      bool reverse_;
       const TestUserDefinedIndexFactory* factory_;
+      const Comparator* comparator_;
       std::map<std::string,
                std::pair<UserDefinedIndexBuilder::BlockHandle, uint32_t>,
-               TestUserDefinedIndexCompare>
+               CustomizedMapComparator>
           index_data_;
     };
-
-    bool reverse_;
   };
 
+  void SetUp() override {
+    comparator_ = GetParam();
+    options_.comparator = comparator_;
+    is_reverse_comparator_ = comparator_ == ReverseBytewiseComparator();
+  }
+
  protected:
+  std::vector<std::pair<std::string, std::string>> generateKVs(
+      int key_count, int value_size = 0) {
+    std::vector<std::pair<std::string, std::string>> kvs(key_count);
+    for (int i = 0; i < key_count; i++) {
+      std::stringstream ss;
+      ss << std::setw(2) << std::setfill('0') << i;
+      std::string key = "key" + ss.str();
+      std::string value;
+      if (value_size != 0) {
+        value = rnd.RandomString(1024);
+      } else {
+        value = "value" + ss.str();
+      }
+      kvs[i] = std::make_pair(key, value);
+    }
+    if (is_reverse_comparator_) {
+      std::reverse(kvs.begin(), kvs.end());
+    }
+    return kvs;
+  }
+
   void BasicTest(bool use_partitioned_index);
 
-  void ValidateMultiScan(const ReadOptions& ro, MultiScanArgs& scan_opts,
-                         std::vector<int>& key_counts, std::unique_ptr<DB>& db,
-                         ColumnFamilyHandle* cfh) {
+  void ValidateMultiScan(
+      std::vector<std::tuple<std::vector<std::string>, int, int>>
+          scan_opt_validation_arg,
+      std::unordered_map<std::string, std::string> property_bag,
+      const ReadOptions& ro, MultiScanArgs& scan_opts,
+      std::vector<int>& key_counts, std::unique_ptr<DB>& db,
+      ColumnFamilyHandle* cfh) {
+    key_counts.clear();
+    (*scan_opts).clear();
+
+    if (is_reverse_comparator_) {
+      for (auto& scan_opt_validation_range : scan_opt_validation_arg) {
+        // reverse each range
+        std::reverse(std::get<0>(scan_opt_validation_range).begin(),
+                     std::get<0>(scan_opt_validation_range).end());
+      }
+      // reverse all the ranges
+      std::reverse(scan_opt_validation_arg.begin(),
+                   scan_opt_validation_arg.end());
+    }
+
+    for (auto& scan_opt_validation_range : scan_opt_validation_arg) {
+      scan_opts.insert(std::get<0>(scan_opt_validation_range)[0],
+                       std::get<0>(scan_opt_validation_range)[1],
+                       std::optional(property_bag));
+      if (is_reverse_comparator_) {
+        key_counts.push_back(std::get<2>(scan_opt_validation_range));
+      } else {
+        key_counts.push_back(std::get<1>(scan_opt_validation_range));
+      }
+    }
+
     Slice ub;
     ReadOptions read_opts = ro;
-    const Comparator* comp = cfh->GetComparator();
     int key_count = 0;
     int index = 0;
     auto opts = scan_opts.GetScanRanges();
     read_opts.iterate_upper_bound = &ub;
     std::unique_ptr<Iterator> iter(db->NewIterator(read_opts, cfh));
     iter->Prepare(scan_opts);
+    static const bool kVerbose = false;
     for (auto opt : opts) {
       ub = opt.range.limit.value();
       iter->Seek(opt.range.start.value());
+      if (kVerbose) {
+        printf("range start key %s, end key %s\n",
+               opt.range.start.value().ToString().c_str(),
+               opt.range.limit.value().ToString().c_str());
+      }
       EXPECT_OK(iter->status());
       while (iter->Valid()) {
+        if (kVerbose) {
+          printf("found key %s\n", iter->key().ToString().c_str());
+        }
         key_count++;
-        ASSERT_GE(comp->Compare(iter->key(), opt.range.start.value()), 0);
-        ASSERT_LT(comp->Compare(iter->key(), opt.range.limit.value()), 0);
         iter->Next();
       }
       EXPECT_EQ(key_count, key_counts[index]);
@@ -7785,10 +7847,13 @@ class UserDefinedIndexTest : public BlockBasedTableTestBase {
     }
     EXPECT_OK(iter->status());
   }
+  Options options_;
+  const Comparator* comparator_;
+  bool is_reverse_comparator_;
+  Random rnd{301};
 };
 
 void UserDefinedIndexTest::BasicTest(bool use_partitioned_index) {
-  Options options;
   BlockBasedTableOptions table_options;
   std::string dbname = test::PerThreadDBPath("user_defined_index_test");
   std::string ingest_file = dbname + "test.sst";
@@ -7806,26 +7871,22 @@ void UserDefinedIndexTest::BasicTest(bool use_partitioned_index) {
   table_options.flush_block_policy_factory =
       std::make_shared<CustomFlushBlockPolicyFactory>();
 
-  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
 
   std::unique_ptr<SstFileWriter> writer;
-  writer.reset(new SstFileWriter(EnvOptions(), options));
+  writer.reset(new SstFileWriter(EnvOptions(), options_));
   ASSERT_OK(writer->Open(ingest_file));
 
-  // Add 100 keys instead of just 5
-  for (int i = 0; i < 100; i++) {
-    std::stringstream ss;
-    ss << std::setw(2) << std::setfill('0') << i;
-    std::string key = "key" + ss.str();
-    std::string value = "value" + ss.str();
-    ASSERT_OK(writer->Put(key, value));
+  auto kvs = generateKVs(/*key_count*/ 100);
+  for (const auto& kv : kvs) {
+    ASSERT_OK(writer->Put(kv.first, kv.second));
   }
   ASSERT_OK(writer->Finish());
   writer.reset();
 
-  ImmutableOptions ioptions(options);
-  MutableCFOptions moptions((ColumnFamilyOptions(options)));
-  EnvOptions eoptions(options);
+  ImmutableOptions ioptions(options_);
+  MutableCFOptions moptions((ColumnFamilyOptions(options_)));
+  EnvOptions eoptions(options_);
   TableReaderOptions toptions(
       ioptions, moptions.prefix_extractor,
       /*_compression_manager=*/nullptr, eoptions, ioptions.internal_comparator,
@@ -7843,7 +7904,7 @@ void UserDefinedIndexTest::BasicTest(bool use_partitioned_index) {
   uint64_t file_size = 0;
   std::unique_ptr<FSRandomAccessFile> file;
   std::unique_ptr<RandomAccessFileReader> file_reader;
-  const auto& fs = options.env->GetFileSystem();
+  const auto& fs = options_.env->GetFileSystem();
   ASSERT_OK(fs->GetFileSize(ingest_file, IOOptions(), &file_size, nullptr));
   ASSERT_OK(fs->NewRandomAccessFile(ingest_file, eoptions, &file, nullptr));
   file_reader.reset(new RandomAccessFileReader(std::move(file), ingest_file));
@@ -7860,7 +7921,7 @@ void UserDefinedIndexTest::BasicTest(bool use_partitioned_index) {
   ASSERT_GE(block_handle.size(),
             expected_entries);  // At least this many entries
 
-  std::unique_ptr<SstFileReader> reader(new SstFileReader(options));
+  std::unique_ptr<SstFileReader> reader(new SstFileReader(options_));
   ASSERT_OK(reader->Open(ingest_file));
 
   ReadOptions ro;
@@ -7881,55 +7942,58 @@ void UserDefinedIndexTest::BasicTest(bool use_partitioned_index) {
   iter.reset(reader->NewIterator(ro));
   ASSERT_NE(iter, nullptr);
 
-  // Test that we can read all the keys
+  // Test seek specific key
   key_count = 0;
-  for (iter->Seek("key09"); iter->Valid(); iter->Next()) {
+  for (iter->Seek("key40"); iter->Valid(); iter->Next()) {
     key_count++;
   }
-  ASSERT_EQ(key_count, 91);
+  ASSERT_EQ(key_count, is_reverse_comparator_ ? 41 : 60);
   ASSERT_OK(iter->status());
 
-  Slice ub("key75");
+  // Test upper bound
+  Slice ub(is_reverse_comparator_ ? "key25" : "key75");
   ro.iterate_upper_bound = &ub;
   iter.reset(reader->NewIterator(ro));
   ASSERT_NE(iter, nullptr);
 
-  // Test that we can read all the keys
+  // Test seek specific key with upper bound
   key_count = 0;
-  for (iter->Seek("key09"); iter->Valid(); iter->Next()) {
+  for (iter->Seek("key40"); iter->Valid(); iter->Next()) {
     key_count++;
   }
-  ASSERT_EQ(key_count, 66);
+  ASSERT_EQ(key_count, is_reverse_comparator_ ? 15 : 35);
   ASSERT_OK(iter->status());
 
   user_defined_index_factory->seek_error_count_ = 1;
   iter.reset(reader->NewIterator(ro));
   ASSERT_NE(iter, nullptr);
-  iter->Seek("key09");
+  iter->Seek("key40");
   ASSERT_NOK(iter->status());
 
   user_defined_index_factory->seek_error_count_ = 0;
   user_defined_index_factory->next_error_count_ = 1;
   iter.reset(reader->NewIterator(ro));
   ASSERT_NE(iter, nullptr);
-  iter->Seek("key09");
-  ASSERT_OK(iter->status());
-  iter->Next();
+  iter->Seek(is_reverse_comparator_ ? "key92" : "key09");
   ASSERT_OK(iter->status());
   iter->Next();
   ASSERT_OK(iter->status());
   iter->Next();
+  if (!is_reverse_comparator_) {
+    ASSERT_OK(iter->status());
+    iter->Next();
+  }
   ASSERT_NOK(iter->status());
   user_defined_index_factory->next_error_count_ = 0;
 
   ro.iterate_upper_bound = nullptr;
   iter.reset(reader->NewIterator(ro));
   ASSERT_NE(iter, nullptr);
-  MultiScanArgs scan_opts(options.comparator);
+  MultiScanArgs scan_opts(comparator_);
 
   std::unordered_map<std::string, std::string> property_bag;
   property_bag["count"] = std::to_string(25);
-  scan_opts.insert("key20", property_bag);
+  scan_opts.insert("key40", property_bag);
   iter->Prepare(scan_opts);
   // Test that we can read all the keys
   key_count = 0;
@@ -7943,16 +8007,15 @@ void UserDefinedIndexTest::BasicTest(bool use_partitioned_index) {
   ASSERT_OK(iter->status());
 }
 
-TEST_F(UserDefinedIndexTest, BasicTestWithPartitionedIndex) {
+TEST_P(UserDefinedIndexTest, BasicTestWithPartitionedIndex) {
   BasicTest(/*use_partitioned_index=*/true);
 }
 
-TEST_F(UserDefinedIndexTest, BasicTestWithoutPartitionedIndex) {
+TEST_P(UserDefinedIndexTest, BasicTestWithoutPartitionedIndex) {
   BasicTest(/*use_partitioned_index=*/false);
 }
 
-TEST_F(UserDefinedIndexTest, InvalidArgumentTest1) {
-  Options options;
+TEST_P(UserDefinedIndexTest, InvalidArgumentTest1) {
   BlockBasedTableOptions table_options;
   std::string dbname = test::PerThreadDBPath("user_defined_index_test");
   std::string ingest_file = dbname + "test.sst";
@@ -7966,11 +8029,11 @@ TEST_F(UserDefinedIndexTest, InvalidArgumentTest1) {
   table_options.flush_block_policy_factory =
       std::make_shared<CustomFlushBlockPolicyFactory>();
 
-  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
-  options.compression_opts.parallel_threads = 10;
+  options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options_.compression_opts.parallel_threads = 10;
 
   std::unique_ptr<SstFileWriter> writer;
-  writer.reset(new SstFileWriter(EnvOptions(), options));
+  writer.reset(new SstFileWriter(EnvOptions(), options_));
   ASSERT_OK(writer->Open(ingest_file));
 
   std::string key = "foo";
@@ -7980,8 +8043,7 @@ TEST_F(UserDefinedIndexTest, InvalidArgumentTest1) {
   writer.reset();
 }
 
-TEST_F(UserDefinedIndexTest, InvalidArgumentTest2) {
-  Options options;
+TEST_P(UserDefinedIndexTest, InvalidArgumentTest2) {
   BlockBasedTableOptions table_options;
   std::string dbname = test::PerThreadDBPath("user_defined_index_test");
   std::string ingest_file = dbname + "test.sst";
@@ -7995,10 +8057,10 @@ TEST_F(UserDefinedIndexTest, InvalidArgumentTest2) {
   table_options.flush_block_policy_factory =
       std::make_shared<CustomFlushBlockPolicyFactory>();
 
-  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
 
   std::unique_ptr<SstFileWriter> writer;
-  writer.reset(new SstFileWriter(EnvOptions(), options));
+  writer.reset(new SstFileWriter(EnvOptions(), options_));
   ASSERT_OK(writer->Open(ingest_file));
 
   std::string key = "foo";
@@ -8008,8 +8070,7 @@ TEST_F(UserDefinedIndexTest, InvalidArgumentTest2) {
   writer.reset();
 }
 
-TEST_F(UserDefinedIndexTest, IngestTest) {
-  Options options;
+TEST_P(UserDefinedIndexTest, IngestTest) {
   BlockBasedTableOptions table_options;
   std::string dbname = test::PerThreadDBPath("user_defined_index_test");
   std::string ingest_file = dbname + "test.sst";
@@ -8023,30 +8084,27 @@ TEST_F(UserDefinedIndexTest, IngestTest) {
   table_options.flush_block_policy_factory =
       std::make_shared<CustomFlushBlockPolicyFactory>();
 
-  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
 
   std::unique_ptr<SstFileWriter> writer;
-  writer.reset(new SstFileWriter(EnvOptions(), options));
+  writer.reset(new SstFileWriter(EnvOptions(), options_));
   ASSERT_OK(writer->Open(ingest_file));
 
-  // Add 100 keys instead of just 5
-  for (int i = 0; i < 100; i++) {
-    std::stringstream ss;
-    ss << std::setw(2) << std::setfill('0') << i;
-    std::string key = "key" + ss.str();
-    std::string value = "value" + ss.str();
-    ASSERT_OK(writer->Put(key, value));
+  auto kvs = generateKVs(/*key_count*/ 100);
+  for (const auto& kv : kvs) {
+    ASSERT_OK(writer->Put(kv.first, kv.second));
   }
+
   ASSERT_OK(writer->Finish());
   writer.reset();
 
   std::unique_ptr<DB> db;
-  options.create_if_missing = true;
-  Status s = DB::Open(options, dbname, &db);
+  options_.create_if_missing = true;
+  Status s = DB::Open(options_, dbname, &db);
   ASSERT_OK(s);
   ASSERT_TRUE(db != nullptr);
   ColumnFamilyHandle* cfh = nullptr;
-  ASSERT_OK(db->CreateColumnFamily(options, "new_cf", &cfh));
+  ASSERT_OK(db->CreateColumnFamily(options_, "new_cf", &cfh));
 
   IngestExternalFileOptions ifo;
   s = db->IngestExternalFile(cfh, {ingest_file}, ifo);
@@ -8070,34 +8128,35 @@ TEST_F(UserDefinedIndexTest, IngestTest) {
   iter.reset(db->NewIterator(ro, cfh));
   ASSERT_NE(iter, nullptr);
 
-  // Test that we can read all the keys
+  // Test seek specific key
   key_count = 0;
-  for (iter->Seek("key09"); iter->Valid(); iter->Next()) {
+  for (iter->Seek("key40"); iter->Valid(); iter->Next()) {
     key_count++;
   }
-  ASSERT_EQ(key_count, 91);
+  ASSERT_EQ(key_count, is_reverse_comparator_ ? 41 : 60);
   ASSERT_OK(iter->status());
 
-  Slice ub("key75");
+  // Test upper bound
+  Slice ub(is_reverse_comparator_ ? "key25" : "key75");
   ro.iterate_upper_bound = &ub;
   iter.reset(db->NewIterator(ro, cfh));
   ASSERT_NE(iter, nullptr);
 
-  // Test that we can read all the keys
+  // Test seek specific key with upper bound
   key_count = 0;
-  for (iter->Seek("key09"); iter->Valid(); iter->Next()) {
+  for (iter->Seek("key40"); iter->Valid(); iter->Next()) {
     key_count++;
   }
-  ASSERT_EQ(key_count, 66);
+  ASSERT_EQ(key_count, is_reverse_comparator_ ? 15 : 35);
   ASSERT_OK(iter->status());
 
   ro.iterate_upper_bound = nullptr;
   iter.reset(db->NewIterator(ro, cfh));
   ASSERT_NE(iter, nullptr);
-  MultiScanArgs scan_opts(options.comparator);
+  MultiScanArgs scan_opts(options_.comparator);
   std::unordered_map<std::string, std::string> property_bag;
   property_bag["count"] = std::to_string(25);
-  scan_opts.insert(Slice("key20"), std::optional(property_bag));
+  scan_opts.insert(Slice("key40"), std::optional(property_bag));
   iter->Prepare(scan_opts);
   // Test that we can read all the keys
   key_count = 0;
@@ -8113,11 +8172,10 @@ TEST_F(UserDefinedIndexTest, IngestTest) {
 
   ASSERT_OK(db->DestroyColumnFamilyHandle(cfh));
   ASSERT_OK(db->Close());
-  ASSERT_OK(DestroyDB(dbname, options));
+  ASSERT_OK(DestroyDB(dbname, options_));
 }
 
-TEST_F(UserDefinedIndexTest, EmptyRangeTest) {
-  Options options;
+TEST_P(UserDefinedIndexTest, EmptyRangeTest) {
   BlockBasedTableOptions table_options;
   std::string dbname = test::PerThreadDBPath("user_defined_index_test");
   std::string ingest_file = dbname + "test.sst";
@@ -8131,15 +8189,15 @@ TEST_F(UserDefinedIndexTest, EmptyRangeTest) {
   table_options.flush_block_policy_factory =
       std::make_shared<CustomFlushBlockPolicyFactory>();
 
-  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
 
   std::unique_ptr<SstFileWriter> writer;
-  writer.reset(new SstFileWriter(EnvOptions(), options));
+  writer.reset(new SstFileWriter(EnvOptions(), options_));
   ASSERT_OK(writer->Open(ingest_file));
 
+  // Generate key range key0 ~ key19, key40 ~ key59, key80 ~ key99
+  std::vector<std::pair<std::string, std::string>> kvs;
   bool skip = false;
-  // Create a sparse file with some missing key ranges so we can do
-  // MultiScans with empty scans interspersed with non-zero scans.
   for (int i = 0; i < 100; i++) {
     if (i > 0 && i % 20 == 0) {
       skip = !skip;
@@ -8151,18 +8209,26 @@ TEST_F(UserDefinedIndexTest, EmptyRangeTest) {
     ss << std::setw(2) << std::setfill('0') << i;
     std::string key = "key" + ss.str();
     std::string value = "value" + ss.str();
-    ASSERT_OK(writer->Put(key, value));
+    kvs.emplace_back(key, value);
+  }
+
+  if (is_reverse_comparator_) {
+    std::reverse(kvs.begin(), kvs.end());
+  }
+
+  for (const auto& kv : kvs) {
+    ASSERT_OK(writer->Put(kv.first, kv.second));
   }
   ASSERT_OK(writer->Finish());
   writer.reset();
 
   std::unique_ptr<DB> db;
-  options.create_if_missing = true;
-  Status s = DB::Open(options, dbname, &db);
+  options_.create_if_missing = true;
+  Status s = DB::Open(options_, dbname, &db);
   ASSERT_OK(s);
   ASSERT_TRUE(db != nullptr);
   ColumnFamilyHandle* cfh = nullptr;
-  ASSERT_OK(db->CreateColumnFamily(options, "new_cf", &cfh));
+  ASSERT_OK(db->CreateColumnFamily(options_, "new_cf", &cfh));
 
   IngestExternalFileOptions ifo;
   s = db->IngestExternalFile(cfh, {ingest_file}, ifo);
@@ -8184,86 +8250,57 @@ TEST_F(UserDefinedIndexTest, EmptyRangeTest) {
 
   ro.table_index_factory = user_defined_index_factory.get();
   std::vector<int> key_counts;
-  MultiScanArgs scan_opts(options.comparator);
+  MultiScanArgs scan_opts(options_.comparator);
   std::unordered_map<std::string, std::string> property_bag;
   property_bag["count"] = std::to_string(5);
-  // Empty scans
-  scan_opts.insert(Slice("key25"), Slice("key30"), std::optional(property_bag));
-  key_counts.push_back(0);
-  scan_opts.insert(Slice("key33"), Slice("key37"), std::optional(property_bag));
-  key_counts.push_back(0);
-  // Non-empty scan with range greater than count
-  scan_opts.insert(Slice("key42"), Slice("key56"), std::optional(property_bag));
-  // In the key42:key56 range, we might read an additional block worth of
-  // keys due to the boundaries (5 + 3)
-  key_counts.push_back(8);
-  // Empty scan succeeding a non-empty one
-  scan_opts.insert(Slice("key65"), Slice("key70"), std::optional(property_bag));
-  key_counts.push_back(0);
-  // A non-empty scan with range smaller than count
-  scan_opts.insert(Slice("key85"), Slice("key87"), std::optional(property_bag));
-  key_counts.push_back(2);
-  // Scan range completely outside the DB
-  scan_opts.insert(Slice("key991"), Slice("key999"),
-                   std::optional(property_bag));
-  key_counts.push_back(0);
-  ValidateMultiScan(ro, scan_opts, key_counts, db, cfh);
-
-  key_counts.clear();
-  (*scan_opts).clear();
+
+  ValidateMultiScan({{{"key25", "key30"}, 0, 0},
+                     {{"key33", "key37"}, 0, 0},
+                     // Non-empty scan with range greater than count
+                     // In the key42:key56 range, we might read an additional
+                     // block worth of keys due to the boundaries (5 + 3)
+                     {{"key42", "key56"}, 8, 7},
+                     // Empty scan succeeding a non-empty one
+                     {{"key65", "key70"}, 0, 0},
+                     // A non-empty scan with range smaller than count
+                     {{"key85", "key87"}, 2, 2},
+                     // Scan range completely outside the DB
+                     {{"key991", "key999"}, 0, 0}},
+                    property_bag, ro, scan_opts, key_counts, db, cfh);
+
   // Scans that overlap with part of key range, with overlap less than count
-  scan_opts.insert(Slice("key18"), Slice("key25"), std::optional(property_bag));
-  key_counts.push_back(2);
-  scan_opts.insert(Slice("key38"), Slice("key43"), std::optional(property_bag));
-  key_counts.push_back(3);
-  ValidateMultiScan(ro, scan_opts, key_counts, db, cfh);
+  ValidateMultiScan({{{"key18", "key25"}, 2, 1}, {{"key38", "key43"}, 3, 4}},
+                    property_bag, ro, scan_opts, key_counts, db, cfh);
 
   // Scans that overlap with part of key range, with overlap same as count
-  key_counts.clear();
-  (*scan_opts).clear();
-  scan_opts.insert(Slice("key15"), Slice("key26"), std::optional(property_bag));
-  key_counts.push_back(5);
-  scan_opts.insert(Slice("key38"), Slice("key46"), std::optional(property_bag));
-  key_counts.push_back(6);
-  ValidateMultiScan(ro, scan_opts, key_counts, db, cfh);
+  ValidateMultiScan({{{"key15", "key26"}, 5, 4}, {{"key38", "key46"}, 6, 7}},
+                    property_bag, ro, scan_opts, key_counts, db, cfh);
 
   // Scans that overlap with part of key range, with overlap greater than count
-  key_counts.clear();
-  (*scan_opts).clear();
-  scan_opts.insert(Slice("key10"), Slice("key26"), std::optional(property_bag));
-  key_counts.push_back(8);
-  scan_opts.insert(Slice("key38"), Slice("key49"), std::optional(property_bag));
-  key_counts.push_back(7);
-  ValidateMultiScan(ro, scan_opts, key_counts, db, cfh);
+  ValidateMultiScan({{{"key10", "key26"}, 8, 8},
+                     // Cross block boundary
+                     {{"key38", "key49"}, 7, 9}},
+                    property_bag, ro, scan_opts, key_counts, db, cfh);
 
   // Scan bigger than one contiguous range of keys, with overlap greater than
   // count
-  key_counts.clear();
-  (*scan_opts).clear();
-  scan_opts.insert(Slice("key75"), Slice("key991"),
-                   std::optional(property_bag));
-  key_counts.push_back(8);
-  ValidateMultiScan(ro, scan_opts, key_counts, db, cfh);
+  ValidateMultiScan({{{"key75", "key991"}, 8, 9}}, property_bag, ro, scan_opts,
+                    key_counts, db, cfh);
 
   // Scan bigger than one contiguous range of keys, with overlap less than count
-  key_counts.clear();
-  (*scan_opts).clear();
   property_bag["count"] = std::to_string(25);
-  scan_opts.insert(Slice("key75"), Slice("key991"),
-                   std::optional(property_bag));
-  key_counts.push_back(20);
-  ValidateMultiScan(ro, scan_opts, key_counts, db, cfh);
+  ValidateMultiScan({{{"key75", "key991"}, 20, 20}}, property_bag, ro,
+                    scan_opts, key_counts, db, cfh);
 
   ASSERT_OK(db->DestroyColumnFamilyHandle(cfh));
   ASSERT_OK(db->Close());
-  ASSERT_OK(DestroyDB(dbname, options));
+  ASSERT_OK(DestroyDB(dbname, options_));
 }
 
 // Verify that external file ingestion fails if we try to ingest an SST file
 // without the UDI and a UDI factory is configured in BlockBasedTableOptions
 // and fail_if_no_udi_on_open is true in BlockBasedTableOptions.
-TEST_F(UserDefinedIndexTest, IngestFailTest) {
-  Options options;
+TEST_P(UserDefinedIndexTest, IngestFailTest) {
   BlockBasedTableOptions table_options;
   std::string dbname = test::PerThreadDBPath("user_defined_index_test");
   std::string ingest_file = dbname + "test.sst";
@@ -8272,19 +8309,15 @@ TEST_F(UserDefinedIndexTest, IngestFailTest) {
   table_options.flush_block_policy_factory =
       std::make_shared<CustomFlushBlockPolicyFactory>();
 
-  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
 
   std::unique_ptr<SstFileWriter> writer;
-  writer.reset(new SstFileWriter(EnvOptions(), options));
+  writer.reset(new SstFileWriter(EnvOptions(), options_));
   ASSERT_OK(writer->Open(ingest_file));
 
-  // Add 100 keys instead of just 5
-  for (int i = 0; i < 100; i++) {
-    std::stringstream ss;
-    ss << std::setw(2) << std::setfill('0') << i;
-    std::string key = "key" + ss.str();
-    std::string value = "value" + ss.str();
-    ASSERT_OK(writer->Put(key, value));
+  auto kvs = generateKVs(/*key_count*/ 100);
+  for (const auto& kv : kvs) {
+    ASSERT_OK(writer->Put(kv.first, kv.second));
   }
   ASSERT_OK(writer->Finish());
   writer.reset();
@@ -8294,15 +8327,15 @@ TEST_F(UserDefinedIndexTest, IngestFailTest) {
       std::make_shared<TestUserDefinedIndexFactory>();
   table_options.user_defined_index_factory = user_defined_index_factory;
   table_options.fail_if_no_udi_on_open = true;
-  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
 
   std::unique_ptr<DB> db;
-  options.create_if_missing = true;
-  Status s = DB::Open(options, dbname, &db);
+  options_.create_if_missing = true;
+  Status s = DB::Open(options_, dbname, &db);
   ASSERT_OK(s);
   ASSERT_TRUE(db != nullptr);
   ColumnFamilyHandle* cfh = nullptr;
-  ASSERT_OK(db->CreateColumnFamily(options, "new_cf", &cfh));
+  ASSERT_OK(db->CreateColumnFamily(options_, "new_cf", &cfh));
 
   IngestExternalFileOptions ifo;
   s = db->IngestExternalFile(cfh, {ingest_file}, ifo);
@@ -8315,11 +8348,10 @@ TEST_F(UserDefinedIndexTest, IngestFailTest) {
 
   ASSERT_OK(db->DestroyColumnFamilyHandle(cfh));
   ASSERT_OK(db->Close());
-  ASSERT_OK(DestroyDB(dbname, options));
+  ASSERT_OK(DestroyDB(dbname, options_));
 }
 
-TEST_F(UserDefinedIndexTest, IngestEmptyUDI) {
-  Options options;
+TEST_P(UserDefinedIndexTest, IngestEmptyUDI) {
   BlockBasedTableOptions table_options;
   std::string dbname = test::PerThreadDBPath("user_defined_index_test");
   std::string ingest_file = dbname + "test.sst";
@@ -8333,38 +8365,34 @@ TEST_F(UserDefinedIndexTest, IngestEmptyUDI) {
   table_options.flush_block_policy_factory =
       std::make_shared<CustomFlushBlockPolicyFactory>();
 
-  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
 
   std::unique_ptr<SstFileWriter> writer;
-  writer.reset(new SstFileWriter(EnvOptions(), options));
+  writer.reset(new SstFileWriter(EnvOptions(), options_));
   ASSERT_OK(writer->Open(ingest_file));
 
-  // Add 100 keys instead of just 5
-  for (int i = 0; i < 100; i++) {
-    std::stringstream ss;
-    ss << std::setw(2) << std::setfill('0') << i;
-    std::string key = "key" + ss.str();
-    std::string value = "value" + ss.str();
-    ASSERT_OK(writer->Put(key, value));
+  auto kvs = generateKVs(/*key_count*/ 100);
+  for (const auto& kv : kvs) {
+    ASSERT_OK(writer->Put(kv.first, kv.second));
   }
   ASSERT_OK(writer->Finish());
   writer.reset();
-  writer.reset(new SstFileWriter(EnvOptions(), options));
+  writer.reset(new SstFileWriter(EnvOptions(), options_));
   ASSERT_OK(writer->Open(ingest_file2));
   ASSERT_OK(writer->Put("dummy", "val"));
   ASSERT_OK(writer->Finish());
   writer.reset();
 
   table_options.fail_if_no_udi_on_open = true;
-  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
 
   std::unique_ptr<DB> db;
-  options.create_if_missing = true;
-  Status s = DB::Open(options, dbname, &db);
+  options_.create_if_missing = true;
+  Status s = DB::Open(options_, dbname, &db);
   ASSERT_OK(s);
   ASSERT_TRUE(db != nullptr);
   ColumnFamilyHandle* cfh = nullptr;
-  ASSERT_OK(db->CreateColumnFamily(options, "new_cf", &cfh));
+  ASSERT_OK(db->CreateColumnFamily(options_, "new_cf", &cfh));
 
   std::vector<IngestExternalFileArg> ifa;
   ifa.emplace_back();
@@ -8376,11 +8404,10 @@ TEST_F(UserDefinedIndexTest, IngestEmptyUDI) {
 
   ASSERT_OK(db->DestroyColumnFamilyHandle(cfh));
   ASSERT_OK(db->Close());
-  ASSERT_OK(DestroyDB(dbname, options));
+  ASSERT_OK(DestroyDB(dbname, options_));
 }
 
-TEST_F(UserDefinedIndexTest, MultiScanFailureTest) {
-  Options options;
+TEST_P(UserDefinedIndexTest, MultiScanFailureTest) {
   BlockBasedTableOptions table_options;
   std::string dbname = test::PerThreadDBPath("user_defined_index_test");
   std::string ingest_file = dbname + "test.sst";
@@ -8394,31 +8421,27 @@ TEST_F(UserDefinedIndexTest, MultiScanFailureTest) {
   table_options.flush_block_policy_factory =
       std::make_shared<CustomFlushBlockPolicyFactory>();
 
-  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
 
   std::unique_ptr<SstFileWriter> writer;
-  writer.reset(new SstFileWriter(EnvOptions(), options));
+  writer.reset(new SstFileWriter(EnvOptions(), options_));
   ASSERT_OK(writer->Open(ingest_file));
 
-  Random rnd(301);
-  // Add 100 keys instead of just 5
-  for (int i = 0; i < 100; i++) {
-    std::stringstream ss;
-    ss << std::setw(2) << std::setfill('0') << i;
-    std::string key = "key" + ss.str();
-    std::string value = rnd.RandomString(1024);
-    ASSERT_OK(writer->Put(key, value));
+  // Use bigger value, so that prefetch size limit will be effective
+  auto kvs = generateKVs(/*key_count*/ 100, /* value_size */ 1024);
+  for (const auto& kv : kvs) {
+    ASSERT_OK(writer->Put(kv.first, kv.second));
   }
   ASSERT_OK(writer->Finish());
   writer.reset();
 
   std::unique_ptr<DB> db;
-  options.create_if_missing = true;
-  Status s = DB::Open(options, dbname, &db);
+  options_.create_if_missing = true;
+  Status s = DB::Open(options_, dbname, &db);
   ASSERT_OK(s);
   ASSERT_TRUE(db != nullptr);
   ColumnFamilyHandle* cfh = nullptr;
-  ASSERT_OK(db->CreateColumnFamily(options, "new_cf", &cfh));
+  ASSERT_OK(db->CreateColumnFamily(options_, "new_cf", &cfh));
 
   IngestExternalFileOptions ifo;
   s = db->IngestExternalFile(cfh, {ingest_file}, ifo);
@@ -8431,7 +8454,10 @@ TEST_F(UserDefinedIndexTest, MultiScanFailureTest) {
   ro.iterate_upper_bound = &ub;
   std::unordered_map<std::string, std::string> property_bag;
   property_bag["count"] = std::to_string(5);
-  MultiScanArgs scan_options(options.comparator);
+  MultiScanArgs scan_options(comparator_);
+  if (is_reverse_comparator_) {
+    std::reverse(key_ranges.begin(), key_ranges.end());
+  }
   scan_options.insert(key_ranges[0], key_ranges[1], property_bag);
   scan_options.insert(key_ranges[2], key_ranges[3], property_bag);
   scan_options.max_prefetch_size = 3500;
@@ -8442,8 +8468,8 @@ TEST_F(UserDefinedIndexTest, MultiScanFailureTest) {
   ub = key_ranges[1];
   iter->Seek(key_ranges[0]);
   while (iter->status().ok() && iter->Valid()) {
-    ASSERT_GE(iter->key().compare(key_ranges[0]), 0);
-    ASSERT_LT(iter->key().compare(key_ranges[1]), 0);
+    ASSERT_GE(comparator_->Compare(iter->key(), key_ranges[0]), 0);
+    ASSERT_LT(comparator_->Compare(iter->key(), key_ranges[1]), 0);
     count++;
     iter->Next();
   }
@@ -8469,7 +8495,12 @@ TEST_F(UserDefinedIndexTest, MultiScanFailureTest) {
   iter.reset(db->NewIterator(ro, cfh));
   ASSERT_NE(iter, nullptr);
   (*scan_options).clear();
-  key_ranges[1] = "key20";
+  if (is_reverse_comparator_) {
+    key_ranges[2] = "key20";
+  } else {
+    key_ranges[1] = "key20";
+  }
+
   scan_options.insert(key_ranges[0], key_ranges[1], property_bag);
   scan_options.insert(key_ranges[2], key_ranges[3], property_bag);
   iter->Prepare(scan_options);
@@ -8481,11 +8512,10 @@ TEST_F(UserDefinedIndexTest, MultiScanFailureTest) {
 
   ASSERT_OK(db->DestroyColumnFamilyHandle(cfh));
   ASSERT_OK(db->Close());
-  ASSERT_OK(DestroyDB(dbname, options));
+  ASSERT_OK(DestroyDB(dbname, options_));
 }
 
-TEST_F(UserDefinedIndexTest, ConfigTest) {
-  Options options;
+TEST_P(UserDefinedIndexTest, ConfigTest) {
   BlockBasedTableOptions table_options;
   std::string dbname = test::PerThreadDBPath("user_defined_index_test");
   std::string ingest_file = dbname + "test.sst";
@@ -8499,25 +8529,21 @@ TEST_F(UserDefinedIndexTest, ConfigTest) {
   table_options.flush_block_policy_factory =
       std::make_shared<CustomFlushBlockPolicyFactory>();
 
-  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
 
   std::unique_ptr<SstFileWriter> writer;
-  writer.reset(new SstFileWriter(EnvOptions(), options));
+  writer.reset(new SstFileWriter(EnvOptions(), options_));
   ASSERT_OK(writer->Open(ingest_file));
 
-  // Add 100 keys instead of just 5
-  for (int i = 0; i < 100; i++) {
-    std::stringstream ss;
-    ss << std::setw(2) << std::setfill('0') << i;
-    std::string key = "key" + ss.str();
-    std::string value = "value" + ss.str();
-    ASSERT_OK(writer->Put(key, value));
+  auto kvs = generateKVs(/*key_count*/ 100);
+  for (const auto& kv : kvs) {
+    ASSERT_OK(writer->Put(kv.first, kv.second));
   }
   ASSERT_OK(writer->Finish());
   writer.reset();
 
   table_options.user_defined_index_factory.reset();
-  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
   // Set up the user-defined index factory
   ObjectLibrary::Default().get()->AddFactory<UserDefinedIndexFactory>(
       "test_index", [](const std::string& /* uri */,
@@ -8528,17 +8554,17 @@ TEST_F(UserDefinedIndexTest, ConfigTest) {
         return guard->get();
       });
   ASSERT_OK(GetColumnFamilyOptionsFromString(
-      ConfigOptions(), options,
+      ConfigOptions(), options_,
       "block_based_table_factory={user_defined_index_factory=test_index;}",
-      &options));
+      &options_));
 
   std::unique_ptr<DB> db;
-  options.create_if_missing = true;
-  Status s = DB::Open(options, dbname, &db);
+  options_.create_if_missing = true;
+  Status s = DB::Open(options_, dbname, &db);
   ASSERT_OK(s);
   ASSERT_TRUE(db != nullptr);
   ColumnFamilyHandle* cfh = nullptr;
-  ASSERT_OK(db->CreateColumnFamily(options, "new_cf", &cfh));
+  ASSERT_OK(db->CreateColumnFamily(options_, "new_cf", &cfh));
 
   IngestExternalFileOptions ifo;
   s = db->IngestExternalFile(cfh, {ingest_file}, ifo);
@@ -8548,10 +8574,10 @@ TEST_F(UserDefinedIndexTest, ConfigTest) {
   ro.table_index_factory = user_defined_index_factory.get();
   std::unique_ptr<Iterator> iter(db->NewIterator(ro, cfh));
   ASSERT_NE(iter, nullptr);
-  MultiScanArgs scan_opts(options.comparator);
+  MultiScanArgs scan_opts(options_.comparator);
   std::unordered_map<std::string, std::string> property_bag;
   property_bag["count"] = std::to_string(25);
-  scan_opts.insert(Slice("key20"), std::optional(property_bag));
+  scan_opts.insert(Slice("key40"), std::optional(property_bag));
   iter->Prepare(scan_opts);
   // Test that we can read all the keys
   int key_count = 0;
@@ -8567,72 +8593,12 @@ TEST_F(UserDefinedIndexTest, ConfigTest) {
 
   ASSERT_OK(db->DestroyColumnFamilyHandle(cfh));
   ASSERT_OK(db->Close());
-  ASSERT_OK(DestroyDB(dbname, options));
+  ASSERT_OK(DestroyDB(dbname, options_));
 }
 
-TEST_F(UserDefinedIndexTest, ReverseMultiScanTest) {
-  Options options;
-  BlockBasedTableOptions table_options;
-  std::string dbname = test::PerThreadDBPath("user_defined_index_test");
-  std::string ingest_file = dbname + "test.sst";
-
-  // Set up the user-defined index factory with ReverseBytewiseComparator
-  auto user_defined_index_factory =
-      std::make_shared<TestUserDefinedIndexFactory>(/*reverse=*/true);
-  table_options.user_defined_index_factory = user_defined_index_factory;
-
-  // Set up custom flush block policy that flushes every 3 keys
-  table_options.flush_block_policy_factory =
-      std::make_shared<CustomFlushBlockPolicyFactory>();
-
-  options.comparator = ReverseBytewiseComparator();
-  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
-
-  std::unique_ptr<SstFileWriter> writer;
-  writer.reset(new SstFileWriter(EnvOptions(), options));
-  ASSERT_OK(writer->Open(ingest_file));
-
-  Random rnd(301);
-  // Add 100 keys in reverse bytewise order
-  for (int i = 99; i >= 0; i--) {
-    std::stringstream ss;
-    ss << std::setw(2) << std::setfill('0') << i;
-    std::string key = "key" + ss.str();
-    std::string value = rnd.RandomString(1024);
-    ASSERT_OK(writer->Put(key, value));
-  }
-  ASSERT_OK(writer->Finish());
-  writer.reset();
-
-  std::unique_ptr<DB> db;
-  options.create_if_missing = true;
-  Status s = DB::Open(options, dbname, &db);
-  ASSERT_OK(s);
-  ASSERT_TRUE(db != nullptr);
-  ColumnFamilyHandle* cfh = nullptr;
-  ASSERT_OK(db->CreateColumnFamily(options, "new_cf", &cfh));
-
-  IngestExternalFileOptions ifo;
-  s = db->IngestExternalFile(cfh, {ingest_file}, ifo);
-  ASSERT_OK(s);
-
-  std::vector<std::string> key_ranges({"key90", "key75", "key30", "key02"});
-  std::vector<int> key_counts;
-  ReadOptions ro;
-  ro.table_index_factory = user_defined_index_factory.get();
-  std::unordered_map<std::string, std::string> property_bag;
-  property_bag["count"] = std::to_string(20);
-  MultiScanArgs scan_opts(options.comparator);
-  scan_opts.insert(key_ranges[0], key_ranges[1], property_bag);
-  key_counts.emplace_back(15);
-  scan_opts.insert(key_ranges[2], key_ranges[3], property_bag);
-  key_counts.emplace_back(24);
-  ValidateMultiScan(ro, scan_opts, key_counts, db, cfh);
-
-  ASSERT_OK(db->DestroyColumnFamilyHandle(cfh));
-  ASSERT_OK(db->Close());
-  ASSERT_OK(DestroyDB(dbname, options));
-}
+INSTANTIATE_TEST_CASE_P(UserDefinedIndexTest, UserDefinedIndexTest,
+                        ::testing::Values(BytewiseComparator(),
+                                          ReverseBytewiseComparator()));
 
 }  // namespace ROCKSDB_NAMESPACE
 

From c0e484c36e1fe07f36bdf5de6ca01979fcec5b6d Mon Sep 17 00:00:00 2001
From: Hui Xiao <huixiao@fb.com>
Date: Fri, 26 Sep 2025 15:57:06 -0700
Subject: [PATCH 314/500] Blog about IO tagging (#14005)

Summary: Pull Request resolved: https://github.com/facebook/rocksdb/pull/14005

Test Plan: verify according to https://github.com/facebook/rocksdb/tree/main/docs

Reviewed By: archang19

Differential Revision: D83365540

Pulled By: hx235

fbshipit-source-id: b674aca6a9977721b64cafcdfaf8690d1c5940b7
---
 docs/_posts/2025-09-25-io-tagging.markdown | 74 ++++++++++++++++++++++
 1 file changed, 74 insertions(+)
 create mode 100644 docs/_posts/2025-09-25-io-tagging.markdown

diff --git a/docs/_posts/2025-09-25-io-tagging.markdown b/docs/_posts/2025-09-25-io-tagging.markdown
new file mode 100644
index 000000000000..14651d03f0e9
--- /dev/null
+++ b/docs/_posts/2025-09-25-io-tagging.markdown
@@ -0,0 +1,74 @@
+---
+title: IO Activity Tagging
+layout: post
+author: hx235
+category: blog
+---
+
+## Context
+
+RocksDB performs a variety of IO operations—user reads, background compactions, flushes, database opens, and verification tasks. Treating all these operations the same makes it difficult for file system implementers to optimize performance, prioritize latency-sensitive IOs, and diagnose bottlenecks. To solve that, RocksDB internally tags every IO operation with its activity type using the `IOActivity` enum. This automatic tagging provides precise context for each IO, enabling file systems to make smarter, context-aware decisions for scheduling, caching, and resource management.
+
+## How Internal IO Tagging Works
+RocksDB automatically assigns an `IOActivity` tag to each IO operation. This tag is propagated through the storage stack and included in the IO options passed to the file system.
+
+```cpp
+enum class IOActivity : uint8_t {
+    kFlush = 0,                        // IO for flush operations (background write)
+    kCompaction = 1,                   // IO for compaction (background read/write)
+    kDBOpen = 2,                       // IO during database open (read/write)
+    kGet = 3,                          // User Get() read
+    kMultiGet = 4,                     // User MultiGet() read
+    kDBIterator = 5,                   // User iterator read
+    kVerifyDBChecksum = 6,             // Verification: DB checksum
+    kVerifyFileChecksums = 7,          // Verification: file checksums
+    kGetEntity = 8,                    // Entity Get (e.g., wide-column)
+    kMultiGetEntity = 9,               // Entity MultiGet
+    kGetFileChecksumsFromCurrentManifest = 10, // Manifest checksum reads
+    // 0x80–0xFE: Reserved for custom/internal use
+    kUnknown = 0xFF                    // Unknown/unspecified activity
+};
+```
+
+## Access IO Tag in File System
+Custom file systems can access the IOActivity tag via the IO options structure provided by RocksDB. This allows them to optimize behavior based on the specific IO activity.
+
+```cpp
+Status CustomFileSystem::Append(uint64_t offset, const Slice& data, const IOOptions& io_opts, ...) {
+    switch (io_opts.io_activity) {
+        case Env::IOActivity::kGet:
+            // Prioritize or cache user reads
+            break;
+        case Env::IOActivity::kCompaction:
+            // Throttle or deprioritize background compaction IO
+            break;
+        case Env::IOActivity::kDBOpen:
+            // Track or optimize DB open IO
+            break;
+        // ... handle other activities ...
+        default:
+            // Default handling
+            break;
+    }
+}
+```
+## IO Activity Statistics in RocksDB
+RocksDB provides detailed histograms for IO activities, allowing you to analyze both the aggregate time spent (in microseconds) and the count of IOs for each activity type.
+```cpp
+// Read Histograms
+FILE_READ_FLUSH_MICROS
+FILE_READ_COMPACTION_MICROS
+FILE_READ_DB_OPEN_MICROS
+FILE_READ_GET_MICROS
+FILE_READ_MULTIGET_MICROS
+FILE_READ_DB_ITERATOR_MICROS
+FILE_READ_VERIFY_DB_CHECKSUM_MICROS
+FILE_READ_VERIFY_FILE_CHECKSUMS_MICROS
+
+// Write Histograms
+FILE_WRITE_FLUSH_MICROS
+FILE_WRITE_COMPACTION_MICROS
+FILE_WRITE_DB_OPEN_MICROS
+```
+
+Thanks to Maciej Szeszko and Andrew Chang from the RocksDB team for their contributions in expanding and maintaining the IOActivity enum.

From feb1486e37a563fcac2014f4a1bc1c416c3d9a02 Mon Sep 17 00:00:00 2001
From: Jay Huh <jewoongh@meta.com>
Date: Mon, 29 Sep 2025 09:26:59 -0700
Subject: [PATCH 315/500] No StandaloneRangeDeletionFile Optimization for
 Leveled Compaction (#14007)

Summary:
In https://github.com/facebook/rocksdb/pull/13816, we added `earliest_snapshot` in the Compaction object picked by remote compaction which is required for Standalone Range Deletion Optimization (introduced in https://github.com/facebook/rocksdb/pull/13078)

The Standalone Range Deletion Optimization was supposed to be supported by Universal Compaction only. This PR properly skips the feature when the compaction style is not kUniversal

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14007

Test Plan:
Unit Test updated to include Leveled Compaction
```
./compaction_service_test --gtest_filter="*CompactionServiceTest.StandaloneDeleteRangeTombstoneOptimization*"
```

In Stress Test, we were able to repro before, but not anymore
```
./db_stress --WAL_size_limit_MB=0 --WAL_ttl_seconds=60 --acquire_snapshot_one_in=10000 --adaptive_readahead=1 --adm_policy=2 --advise_random_on_open=0 --allow_data_in_errors=True --allow_fallocate=1 --allow_setting_blob_options_dynamically=0 --allow_unprepared_value=1 --async_io=1 --atomic_flush=1 --auto_readahead_size=1 --auto_refresh_iterator_with_snapshot=1 --avoid_flush_during_recovery=0 --avoid_flush_during_shutdown=0 --avoid_unnecessary_blocking_io=0 --backup_max_size=104857600 --backup_one_in=1000 --batch_protection_bytes_per_key=0 --bgerror_resume_retry_interval=100 --block_align=0 --block_protection_bytes_per_key=0 --block_size=16384 --bloom_before_level=2147483647 --bloom_bits=3.4547746144863423 --bottommost_compression_type=lz4hc --bottommost_file_compaction_delay=3600 --bytes_per_sync=262144 --cache_index_and_filter_blocks=0 --cache_index_and_filter_blocks_with_high_priority=0 --cache_size=8388608 --cache_type=tiered_fixed_hyper_clock_cache --charge_compression_dictionary_building_buffer=1 --charge_file_metadata=0 --charge_filter_construction=1 --charge_table_reader=0 --check_multiget_consistency=0 --check_multiget_entity_consistency=0 --checkpoint_one_in=0 --checksum_type=kxxHash64 --clear_column_family_one_in=0 --compact_files_one_in=1000000 --compact_range_one_in=1000 --compaction_pri=3 --compaction_readahead_size=1048576 --compaction_style=0 --compaction_ttl=2 --compress_format_version=1 --compressed_secondary_cache_ratio=0.5 --compressed_secondary_cache_size=0 --compression_checksum=1 --compression_manager=mixed --compression_max_dict_buffer_bytes=15 --compression_max_dict_bytes=16384 --compression_parallel_threads=1 --compression_type=lz4hc --compression_use_zstd_dict_trainer=1 --compression_zstd_max_train_bytes=0 --continuous_verification_interval=0 --daily_offpeak_time_utc= --data_block_index_type=1 --db=/tmp/jewoongh/rocksdb_crashtest_blackbox_remote_compaction --db_write_buffer_size=1048576 --decouple_partitioned_filters=1 --default_temperature=kWarm --default_write_temperature=kWarm --delete_obsolete_files_period_micros=21600000000 --delpercent=4 --delrangepercent=1 --destroy_db_initially=0 --detect_filter_construct_corruption=0 --disable_file_deletions_one_in=1000000 --disable_manual_compaction_one_in=10000 --disable_wal=1 --dump_malloc_stats=0 --enable_blob_files=0 --enable_blob_garbage_collection=0 --enable_checksum_handoff=1 --enable_compaction_filter=0 --enable_compaction_on_deletion_trigger=1 --enable_custom_split_merge=0 --enable_do_not_compress_roles=0 --enable_index_compression=0 --enable_memtable_insert_with_hint_prefix_extractor=0 --enable_pipelined_write=0 --enable_sst_partitioner_factory=1 --enable_thread_tracking=1 --enable_write_thread_adaptive_yield=0 --error_recovery_with_no_fault_injection=0 --exclude_wal_from_write_fault_injection=0 --expected_values_dir=/tmp/jewoongh/rocksdb_crashtest_expected_remote_compaction --fifo_allow_compaction=1 --file_checksum_impl=none --file_temperature_age_thresholds= --fill_cache=1 --flush_one_in=1000 --format_version=6 --get_all_column_family_metadata_one_in=1000000 --get_current_wal_file_one_in=0 --get_live_files_apis_one_in=1000000 --get_properties_of_all_tables_one_in=100000 --get_property_one_in=1000000 --get_sorted_wal_files_one_in=0 --hard_pending_compaction_bytes_limit=274877906944 --high_pri_pool_ratio=0 --index_block_restart_interval=7 --index_shortening=2 --index_type=2 --ingest_external_file_one_in=0 --ingest_wbwi_one_in=500 --initial_auto_readahead_size=0 --inplace_update_support=0 --iterpercent=10 --key_len_percent_dist=1,30,69 --key_may_exist_one_in=100 --last_level_temperature=kUnknown --level_compaction_dynamic_level_bytes=0 --lock_wal_one_in=1000000 --log_file_time_to_roll=60 --log_readahead_size=16777216 --long_running_snapshots=1 --low_pri_pool_ratio=0 --lowest_used_cache_tier=2 --manifest_preallocation_size=5120 --manual_wal_flush_one_in=0 --mark_for_compaction_one_file_in=10 --max_auto_readahead_size=16384 --max_background_compactions=2 --max_bytes_for_level_base=10485760 --max_key=100000 --max_key_len=3 --max_log_file_size=1048576 --max_manifest_file_size=1073741824 --max_sequential_skip_in_iterations=8 --max_total_wal_size=0 --max_write_batch_group_size_bytes=16 --max_write_buffer_number=3 --max_write_buffer_size_to_maintain=4194304 --memtable_avg_op_scan_flush_trigger=2 --memtable_insert_hint_per_batch=0 --memtable_max_range_deletions=0 --memtable_op_scan_flush_trigger=100 --memtable_prefix_bloom_size_ratio=0.1 --memtable_protection_bytes_per_key=4 --memtable_veirfy_per_key_checksum_on_seek=0 --memtable_whole_key_filtering=1 --metadata_charge_policy=1 --metadata_read_fault_one_in=1000 --metadata_write_fault_one_in=0 --min_write_buffer_number_to_merge=1 --mmap_read=1 --mock_direct_io=False --multiscan_use_async_io=0 --nooverwritepercent=1 --num_bottom_pri_threads=20 --num_file_reads_for_auto_readahead=1 --open_files=-1 --open_metadata_read_fault_one_in=0 --open_metadata_write_fault_one_in=0 --open_read_fault_one_in=0 --open_write_fault_one_in=0 --ops_per_thread=100000000 --optimize_filters_for_hits=1 --optimize_filters_for_memory=1 --optimize_multiget_for_io=1 --paranoid_file_checks=1 --paranoid_memory_checks=0 --partition_filters=1 --partition_pinning=1 --pause_background_one_in=10000 --periodic_compaction_seconds=0 --prefix_size=-1 --prefixpercent=0 --prepopulate_block_cache=0 --preserve_internal_time_seconds=36000 --progress_reports=0 --promote_l0_one_in=0 --read_amp_bytes_per_bit=0 --read_fault_one_in=32 --readahead_size=16384 --readpercent=50 --recycle_log_file_num=0 --remote_compaction_failure_fall_back_to_local=1 --remote_compaction_worker_threads=8 --reopen=0 --report_bg_io_stats=1 --reset_stats_one_in=1000000 --sample_for_compression=5 --secondary_cache_fault_one_in=32 --secondary_cache_uri= --set_options_one_in=1000 --skip_stats_update_on_db_open=0 --snapshot_hold_ops=100000 --soft_pending_compaction_bytes_limit=68719476736 --sqfc_name=foo --sqfc_version=2 --sst_file_manager_bytes_per_sec=104857600 --sst_file_manager_bytes_per_truncate=0 --statistics=1 --stats_dump_period_sec=600 --stats_history_buffer_size=1048576 --strict_bytes_per_sync=0 --subcompactions=4 --sync=0 --sync_fault_injection=0 --table_cache_numshardbits=6 --target_file_size_base=524288 --target_file_size_multiplier=2 --test_batches_snapshots=0 --test_ingest_standalone_range_deletion_one_in=0 --test_secondary=0 --top_level_index_pinning=2 --track_and_verify_wals=0 --uncache_aggressiveness=72 --universal_max_read_amp=0 --universal_reduce_file_locking=1 --unpartitioned_pinning=1 --use_adaptive_mutex=0 --use_adaptive_mutex_lru=1 --use_attribute_group=0 --use_delta_encoding=0 --use_direct_io_for_flush_and_compaction=0 --use_direct_reads=0 --use_full_merge_v1=1 --use_get_entity=1 --use_merge=1 --use_multi_cf_iterator=1 --use_multi_get_entity=0 --use_multiget=0 --use_multiscan=0 --use_put_entity_one_in=0 --use_sqfc_for_range_queries=1 --use_timed_put_one_in=0 --use_write_buffer_manager=1 --user_timestamp_size=0 --value_size_mult=32 --verification_only=0 --verify_checksum=1 --verify_checksum_one_in=1000000 --verify_compression=0 --verify_db_one_in=100000 --verify_file_checksums_one_in=0 --verify_iterator_with_expected_state_one_in=5 --verify_sst_unique_id_in_manifest=1 --wal_bytes_per_sync=0 --wal_compression=zstd --write_buffer_size=1048576 --write_dbid_to_manifest=1 --write_fault_one_in=0 --write_identity_file=0 --writepercent=35
```

Reviewed By: hx235

Differential Revision: D83375779

Pulled By: jaykorean

fbshipit-source-id: 6dad06e3a825c4e9a7101ab8603d1c966be6a4f4
---
 db/compaction/compaction_service_test.cc | 152 ++++++++++++-----------
 db/db_impl/db_impl_secondary.cc          |  14 ++-
 2 files changed, 92 insertions(+), 74 deletions(-)

diff --git a/db/compaction/compaction_service_test.cc b/db/compaction/compaction_service_test.cc
index 1479a6c5a983..88de6d0e48d0 100644
--- a/db/compaction/compaction_service_test.cc
+++ b/db/compaction/compaction_service_test.cc
@@ -463,8 +463,6 @@ TEST_F(CompactionServiceTest, ManualCompaction) {
 
 TEST_F(CompactionServiceTest, StandaloneDeleteRangeTombstoneOptimization) {
   Options options = CurrentOptions();
-  options.compaction_style = CompactionStyle::kCompactionStyleUniversal;
-  ReopenWithCompactionService(&options);
 
   size_t num_files_after_filtered = 0;
   SyncPoint::GetInstance()->SetCallBack(
@@ -472,83 +470,97 @@ TEST_F(CompactionServiceTest, StandaloneDeleteRangeTombstoneOptimization) {
       [&](void* arg) {
         num_files_after_filtered = *static_cast<size_t*>(arg);
       });
-
   SyncPoint::GetInstance()->EnableProcessing();
 
-  std::vector<std::string> files;
-  {
-    // Writes first version of data in range partitioned files.
-    SstFileWriter sst_file_writer(EnvOptions(), options);
-    std::string file1 = dbname_ + "file1.sst";
-    ASSERT_OK(sst_file_writer.Open(file1));
-    ASSERT_OK(sst_file_writer.Put("a", "a1"));
-    ASSERT_OK(sst_file_writer.Put("b", "b1"));
-    ExternalSstFileInfo file1_info;
-    ASSERT_OK(sst_file_writer.Finish(&file1_info));
-    files.push_back(std::move(file1));
-
-    std::string file2 = dbname_ + "file2.sst";
-    ASSERT_OK(sst_file_writer.Open(file2));
-    ASSERT_OK(sst_file_writer.Put("x", "x1"));
-    ASSERT_OK(sst_file_writer.Put("y", "y1"));
-    ExternalSstFileInfo file2_info;
-    ASSERT_OK(sst_file_writer.Finish(&file2_info));
-    files.push_back(std::move(file2));
-  }
+  for (auto compaction_style : {CompactionStyle::kCompactionStyleLevel,
+                                CompactionStyle::kCompactionStyleUniversal}) {
+    SCOPED_TRACE("Style: " + std::to_string(compaction_style));
+    options.compaction_style = compaction_style;
+    ReopenWithCompactionService(&options);
 
-  IngestExternalFileOptions ifo;
-  ASSERT_OK(db_->IngestExternalFile(files, ifo));
-  ASSERT_EQ(Get("a"), "a1");
-  ASSERT_EQ(Get("b"), "b1");
-  ASSERT_EQ(Get("x"), "x1");
-  ASSERT_EQ(Get("y"), "y1");
-  ASSERT_EQ(2, NumTableFilesAtLevel(6));
+    num_files_after_filtered = 0;
 
-  auto my_cs = GetCompactionService();
-  uint64_t comp_num = my_cs->GetCompactionNum();
+    std::vector<std::string> files;
+    {
+      // Writes first version of data in range partitioned files.
+      SstFileWriter sst_file_writer(EnvOptions(), options);
+      std::string file1 = dbname_ + "file1.sst";
+      ASSERT_OK(sst_file_writer.Open(file1));
+      ASSERT_OK(sst_file_writer.Put("a", "a1"));
+      ASSERT_OK(sst_file_writer.Put("b", "b1"));
+      ExternalSstFileInfo file1_info;
+      ASSERT_OK(sst_file_writer.Finish(&file1_info));
+      files.push_back(std::move(file1));
+
+      std::string file2 = dbname_ + "file2.sst";
+      ASSERT_OK(sst_file_writer.Open(file2));
+      ASSERT_OK(sst_file_writer.Put("x", "x1"));
+      ASSERT_OK(sst_file_writer.Put("y", "y1"));
+      ExternalSstFileInfo file2_info;
+      ASSERT_OK(sst_file_writer.Finish(&file2_info));
+      files.push_back(std::move(file2));
+    }
 
-  {
-    // Atomically delete old version of data with one range delete file.
-    // And a new batch of range partitioned files with new version of data.
-    files.clear();
-    SstFileWriter sst_file_writer(EnvOptions(), options);
-    std::string file2 = dbname_ + "file2.sst";
-    ASSERT_OK(sst_file_writer.Open(file2));
-    ASSERT_OK(sst_file_writer.DeleteRange("a", "z"));
-    ExternalSstFileInfo file2_info;
-    ASSERT_OK(sst_file_writer.Finish(&file2_info));
-    files.push_back(std::move(file2));
-
-    std::string file3 = dbname_ + "file3.sst";
-    ASSERT_OK(sst_file_writer.Open(file3));
-    ASSERT_OK(sst_file_writer.Put("a", "a2"));
-    ASSERT_OK(sst_file_writer.Put("b", "b2"));
-    ExternalSstFileInfo file3_info;
-    ASSERT_OK(sst_file_writer.Finish(&file3_info));
-    files.push_back(std::move(file3));
-
-    std::string file4 = dbname_ + "file4.sst";
-    ASSERT_OK(sst_file_writer.Open(file4));
-    ASSERT_OK(sst_file_writer.Put("x", "x2"));
-    ASSERT_OK(sst_file_writer.Put("y", "y2"));
-    ExternalSstFileInfo file4_info;
-    ASSERT_OK(sst_file_writer.Finish(&file4_info));
-    files.push_back(std::move(file4));
-  }
+    IngestExternalFileOptions ifo;
+    ASSERT_OK(db_->IngestExternalFile(files, ifo));
+    ASSERT_EQ(Get("a"), "a1");
+    ASSERT_EQ(Get("b"), "b1");
+    ASSERT_EQ(Get("x"), "x1");
+    ASSERT_EQ(Get("y"), "y1");
+    ASSERT_EQ(2, NumTableFilesAtLevel(6));
 
-  ASSERT_OK(db_->IngestExternalFile(files, ifo));
-  ASSERT_OK(db_->WaitForCompact(WaitForCompactOptions()));
-  ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1);
+    auto my_cs = GetCompactionService();
+    uint64_t comp_num = my_cs->GetCompactionNum();
 
-  CompactionServiceResult result;
-  my_cs->GetResult(&result);
-  ASSERT_OK(result.status);
-  ASSERT_TRUE(result.stats.is_manual_compaction);
-  ASSERT_TRUE(result.stats.is_remote_compaction);
+    {
+      // Atomically delete old version of data with one range delete file.
+      // And a new batch of range partitioned files with new version of data.
+      files.clear();
+      SstFileWriter sst_file_writer(EnvOptions(), options);
+      std::string file2 = dbname_ + "file2.sst";
+      ASSERT_OK(sst_file_writer.Open(file2));
+      ASSERT_OK(sst_file_writer.DeleteRange("a", "z"));
+      ExternalSstFileInfo file2_info;
+      ASSERT_OK(sst_file_writer.Finish(&file2_info));
+      files.push_back(std::move(file2));
+
+      std::string file3 = dbname_ + "file3.sst";
+      ASSERT_OK(sst_file_writer.Open(file3));
+      ASSERT_OK(sst_file_writer.Put("a", "a2"));
+      ASSERT_OK(sst_file_writer.Put("b", "b2"));
+      ExternalSstFileInfo file3_info;
+      ASSERT_OK(sst_file_writer.Finish(&file3_info));
+      files.push_back(std::move(file3));
+
+      std::string file4 = dbname_ + "file4.sst";
+      ASSERT_OK(sst_file_writer.Open(file4));
+      ASSERT_OK(sst_file_writer.Put("x", "x2"));
+      ASSERT_OK(sst_file_writer.Put("y", "y2"));
+      ExternalSstFileInfo file4_info;
+      ASSERT_OK(sst_file_writer.Finish(&file4_info));
+      files.push_back(std::move(file4));
+    }
 
-  ASSERT_EQ(num_files_after_filtered, 1);
+    ASSERT_OK(db_->IngestExternalFile(files, ifo));
+    ASSERT_OK(db_->WaitForCompact(WaitForCompactOptions()));
+    ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1);
+
+    CompactionServiceResult result;
+    my_cs->GetResult(&result);
+    ASSERT_OK(result.status);
+    ASSERT_TRUE(result.stats.is_manual_compaction);
+    ASSERT_TRUE(result.stats.is_remote_compaction);
+
+    if (compaction_style == kCompactionStyleUniversal) {
+      ASSERT_EQ(num_files_after_filtered, 1);
+    } else {
+      // Not filtered
+      ASSERT_EQ(num_files_after_filtered, 3);
+    }
+
+    Close();
+  }
 
-  Close();
   SyncPoint::GetInstance()->DisableProcessing();
 }
 
diff --git a/db/db_impl/db_impl_secondary.cc b/db/db_impl/db_impl_secondary.cc
index e775490157e7..69c40eefed57 100644
--- a/db/db_impl/db_impl_secondary.cc
+++ b/db/db_impl/db_impl_secondary.cc
@@ -879,13 +879,19 @@ Status DBImplSecondary::CompactWithoutInstallation(
   // input instead of recreating it in the remote worker
   std::unique_ptr<Compaction> c;
   assert(cfd->compaction_picker());
+  std::optional<SequenceNumber> earliest_snapshot = std::nullopt;
+  // Standalone Range Deletion Optimization is only supported in Universal
+  // Compactions - https://github.com/facebook/rocksdb/pull/13078
+  if (cfd->GetLatestCFOptions().compaction_style ==
+      CompactionStyle::kCompactionStyleUniversal) {
+    earliest_snapshot = !job_context.snapshot_seqs.empty()
+                            ? job_context.snapshot_seqs.front()
+                            : kMaxSequenceNumber;
+  }
   c.reset(cfd->compaction_picker()->PickCompactionForCompactFiles(
       comp_options, input_files, input.output_level, vstorage,
       cfd->GetLatestMutableCFOptions(), mutable_db_options_, 0,
-      /*earliest_snapshot=*/job_context.snapshot_seqs.empty()
-          ? kMaxSequenceNumber
-          : job_context.snapshot_seqs.front(),
-      job_context.snapshot_checker));
+      earliest_snapshot, job_context.snapshot_checker));
   assert(c != nullptr);
   c->FinalizeInputInfo(version);
 

From d8c058c5fe6a9dfb5219673a1d52e6008e4cecff Mon Sep 17 00:00:00 2001
From: Hui Xiao <huixiao@fb.com>
Date: Mon, 29 Sep 2025 10:55:16 -0700
Subject: [PATCH 316/500] Blog about unified memory limit (#14002)

Summary: Pull Request resolved: https://github.com/facebook/rocksdb/pull/14002

Test Plan: verify according to https://github.com/facebook/rocksdb/tree/main/docs

Reviewed By: jaykorean

Differential Revision: D83209262

Pulled By: hx235

fbshipit-source-id: 688c855387e08c9b22644d4de3bc539e51a0ba0a
---
 ...025-09-24-unified-memory-tracking.markdown | 59 +++++++++++++++++++
 1 file changed, 59 insertions(+)
 create mode 100644 docs/_posts/2025-09-24-unified-memory-tracking.markdown

diff --git a/docs/_posts/2025-09-24-unified-memory-tracking.markdown b/docs/_posts/2025-09-24-unified-memory-tracking.markdown
new file mode 100644
index 000000000000..dba0ca488eb8
--- /dev/null
+++ b/docs/_posts/2025-09-24-unified-memory-tracking.markdown
@@ -0,0 +1,59 @@
+---
+title: Unified Memory Tracking
+layout: post
+author: hx235
+category: blog
+---
+
+## Context / Problem
+Modern RocksDB deployments often run in environments with strict memory constraints—cloud VMs, containers, or hosts with hundreds of DB instances. Unpredictable memory usage can lead to out-of-memory (OOM) errors, degraded performance, or even service outages.
+Historically, while the block cache was the main source of memory usage, other components—such as memtables, table readers, file metadata, and temporary buffers—could consume significant memory outside the block cache’s control. This made it difficult for users to set a single memory limit and guarantee resource usage stays within expectations.
+
+## Goal
+The goal of recent memory tracking work in RocksDB is to enable users to cap the total memory usage of RocksDB instances under a single, configurable limit—the block cache capacity. This is achieved by:
+- **Tracking and charging** all major memory consumers (memtables, table readers, file metadata, compression buffers, filter construction) to the block cache.
+- **Evicting** data blocks or other memory when the total tracked usage exceeds the configured limit.
+- **Providing a fixed memory footprint** for RocksDB, making it easier to run in resource-constrained environments and avoid OOMs.
+
+## Memtable Memory Charging
+A major source of memory usage in RocksDB is the memtable. To ensure memtable memory is tracked and capped under a single limit, RocksDB provides the WriteBufferManager (WBM). When WBM is configured with a block cache, memtable memory usage is charged to the block cache. This helps prevent OOM errors and simplifies resource management.
+
+```cpp
+std::shared_ptr<Cache> cache = HyperClockCacheOptions(capacity).MakeSharedCache();;
+DBOptions db_options;
+db_options.write_buffer_manager = std::make_shared<WriteBufferManager>(.., cache);
+```
+
+## Other Memory Charging
+Beyond memtables, RocksDB allows users to control memory charging for other internal roles using the cache_usage_options API. This provides fine-grained control over how memory is tracked for components like table readers, file metadata, compression dictionary buffers (`CompressionOptions::max_dict_buffer_bytes:`) and filter construction.
+
+```cpp
+struct CacheEntryRoleOptions {
+  enum class Decision {
+    kEnabled,
+    kDisabled,
+    kFallback,
+  };
+  Decision charged = Decision::kFallback;
+};
+struct CacheUsageOptions {
+  CacheEntryRoleOptions options;
+  std::map<CacheEntryRole, CacheEntryRoleOptions> options_overrides;
+};
+
+...
+BlockBasedTableOptions table_options;
+table_options.cache_usage_options.options.charged = CacheEntryRoleOptions::Decision::kFallback;
+table_options.cache_usage_options.options_overrides[CacheEntryRole::kTableBuilder] = {
+  .charged = CacheEntryRoleOptions::Decision::kEnabled,
+};
+```
+
+Default (`Decision::kFallback`) behavior for each memory type:
+- `CacheEntryRole::kCompressionDictionaryBuildingBuffer`: `kEnabled`
+- `CacheEntryRole::kFilterConstruction`: `kDisabled`
+- `CacheEntryRole::kBlockBasedTableReader`: `kDisabled`
+- `CacheEntryRole::kFileMetadata`: `kDisabled`
+
+## Monitoring and Observability
+RocksDB provides built-in statistics to help users monitor memory usage and cache behavior. The `DB::Properties::kBlockCacheEntryStats` exposes detailed statistics about block cache entries, including breakdowns by each `CacheEntryRole`. These statistics are essential for understanding memory consumption and tuning cache configuration.

From f5fb597bac1bb3611310ba8052e1ee556b4fc784 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Mon, 29 Sep 2025 14:21:00 -0700
Subject: [PATCH 317/500] Resolve missing/inconsistent tickers in Java (#14012)

Summary:
Pretty self-explanatory from the changes, including re-arranging the "COOL" entries for easier tracking of which values are used.

I'm not touching the TICKER_ENUM_MAX issue because IIRC we've gotten in trouble in the past for changing any Java ticker values.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14012

Test Plan: CI, sufficient prompts to get AI to discover the known issues relayed by hx235, to help ensure we found any other outstanding issues.

Reviewed By: hx235

Differential Revision: D83497503

Pulled By: pdillinger

fbshipit-source-id: ec0bd7e28188e0430fb03fc5bd79c2ed7b28f3ad
---
 java/rocksjni/portal.h                        | 25 +++++++++++++------
 .../src/main/java/org/rocksdb/TickerType.java | 19 ++++++++++++--
 2 files changed, 34 insertions(+), 10 deletions(-)

diff --git a/java/rocksjni/portal.h b/java/rocksjni/portal.h
index c51f83e1c29f..7ed6d6b1ff89 100644
--- a/java/rocksjni/portal.h
+++ b/java/rocksjni/portal.h
@@ -5199,16 +5199,12 @@ class TickerTypeJni {
         return -0x31;
       case ROCKSDB_NAMESPACE::Tickers::WARM_FILE_READ_BYTES:
         return -0x32;
-      case ROCKSDB_NAMESPACE::Tickers::COOL_FILE_READ_BYTES:
-        return -0x5B;
       case ROCKSDB_NAMESPACE::Tickers::COLD_FILE_READ_BYTES:
         return -0x33;
       case ROCKSDB_NAMESPACE::Tickers::HOT_FILE_READ_COUNT:
         return -0x34;
       case ROCKSDB_NAMESPACE::Tickers::WARM_FILE_READ_COUNT:
         return -0x35;
-      case ROCKSDB_NAMESPACE::Tickers::COOL_FILE_READ_COUNT:
-        return -0x5C;
       case ROCKSDB_NAMESPACE::Tickers::COLD_FILE_READ_COUNT:
         return -0x36;
       case ROCKSDB_NAMESPACE::Tickers::LAST_LEVEL_READ_BYTES:
@@ -5283,6 +5279,14 @@ class TickerTypeJni {
         return -0x59;
       case ROCKSDB_NAMESPACE::Tickers::ICE_FILE_READ_COUNT:
         return -0x5A;
+      case ROCKSDB_NAMESPACE::Tickers::COOL_FILE_READ_BYTES:
+        return -0x5B;
+      case ROCKSDB_NAMESPACE::Tickers::COOL_FILE_READ_COUNT:
+        return -0x5C;
+      case ROCKSDB_NAMESPACE::Tickers::NUMBER_WBWI_INGEST:
+        return -0x5D;
+      case ROCKSDB_NAMESPACE::Tickers::SST_USER_DEFINED_INDEX_LOAD_FAIL_COUNT:
+        return -0x5E;
       case ROCKSDB_NAMESPACE::Tickers::TICKER_ENUM_MAX:
         // -0x54 is the max value at this time. Since these values are exposed
         // directly to Java clients, we'll keep the value the same till the next
@@ -5668,16 +5672,12 @@ class TickerTypeJni {
         return ROCKSDB_NAMESPACE::Tickers::HOT_FILE_READ_BYTES;
       case -0x32:
         return ROCKSDB_NAMESPACE::Tickers::WARM_FILE_READ_BYTES;
-      case -0x5B:
-        return ROCKSDB_NAMESPACE::Tickers::COOL_FILE_READ_BYTES;
       case -0x33:
         return ROCKSDB_NAMESPACE::Tickers::COLD_FILE_READ_BYTES;
       case -0x34:
         return ROCKSDB_NAMESPACE::Tickers::HOT_FILE_READ_COUNT;
       case -0x35:
         return ROCKSDB_NAMESPACE::Tickers::WARM_FILE_READ_COUNT;
-      case -0x5C:
-        return ROCKSDB_NAMESPACE::Tickers::COOL_FILE_READ_COUNT;
       case -0x36:
         return ROCKSDB_NAMESPACE::Tickers::COLD_FILE_READ_COUNT;
       case -0x37:
@@ -5755,6 +5755,15 @@ class TickerTypeJni {
         return ROCKSDB_NAMESPACE::Tickers::ICE_FILE_READ_BYTES;
       case -0x5A:
         return ROCKSDB_NAMESPACE::Tickers::ICE_FILE_READ_COUNT;
+      case -0x5B:
+        return ROCKSDB_NAMESPACE::Tickers::COOL_FILE_READ_BYTES;
+      case -0x5C:
+        return ROCKSDB_NAMESPACE::Tickers::COOL_FILE_READ_COUNT;
+      case -0x5D:
+        return ROCKSDB_NAMESPACE::Tickers::NUMBER_WBWI_INGEST;
+      case -0x5E:
+        return ROCKSDB_NAMESPACE::Tickers::
+            SST_USER_DEFINED_INDEX_LOAD_FAIL_COUNT;
       case -0x54:
         // -0x54 is the max value at this time. Since these values are exposed
         // directly to Java clients, we'll keep the value the same till the next
diff --git a/java/src/main/java/org/rocksdb/TickerType.java b/java/src/main/java/org/rocksdb/TickerType.java
index 6a4cc30d7e2b..32c4cea2f974 100644
--- a/java/src/main/java/org/rocksdb/TickerType.java
+++ b/java/src/main/java/org/rocksdb/TickerType.java
@@ -766,12 +766,12 @@ public enum TickerType {
     WARM_FILE_READ_BYTES((byte) -0x32),
     COOL_FILE_READ_BYTES((byte) -0x5B),
     COLD_FILE_READ_BYTES((byte) -0x33),
-    ICE_FILE_READ_BYTES((byte) -0x58),
+    ICE_FILE_READ_BYTES((byte) -0x59),
     HOT_FILE_READ_COUNT((byte) -0x34),
     WARM_FILE_READ_COUNT((byte) -0x35),
     COOL_FILE_READ_COUNT((byte) -0x5C),
     COLD_FILE_READ_COUNT((byte) -0x36),
-    ICE_FILE_READ_COUNT((byte) -0x59),
+    ICE_FILE_READ_COUNT((byte) -0x5A),
 
     /**
      * (non-)last level read statistics
@@ -874,6 +874,8 @@ public enum TickerType {
 
     FIFO_TTL_COMPACTIONS((byte) -0x50),
 
+    FIFO_CHANGE_TEMPERATURE_COMPACTIONS((byte) -0x58),
+
     PREFETCH_BYTES((byte) -0x51),
 
     PREFETCH_BYTES_USEFUL((byte) -0x52),
@@ -886,6 +888,19 @@ public enum TickerType {
 
     FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT((byte) -0x57),
 
+    /**
+     * Counter for the number of times a WBWI is ingested into the DB. This
+     * happens when IngestWriteBatchWithIndex() is used and when large
+     * transaction optimization is enabled through
+     * TransactionOptions::large_txn_commit_optimize_threshold.
+     */
+    NUMBER_WBWI_INGEST((byte) -0x5D),
+
+    /**
+     * Failure to load the UDI during SST table open
+     */
+    SST_USER_DEFINED_INDEX_LOAD_FAIL_COUNT((byte) -0x5E),
+
     TICKER_ENUM_MAX((byte) -0x54);
 
     private final byte value;

From 035242415f8db6b4cf61e5101fe18b53a77dc88a Mon Sep 17 00:00:00 2001
From: anand76 <anand1976@users.noreply.github.com>
Date: Tue, 30 Sep 2025 11:45:49 -0700
Subject: [PATCH 318/500] Fix incorrect MultiScan handling of range limit
 between files (#14011)

Summary:
This PR fixes a bug in how MultiScan handled a scan range limit falling in the key range between files. The bug was in LevelIterator, where Prepare() relied on FindFile to determine the lower bound file for the range limit. FindFile returns the smallest file index with `range.limit < file.largest_key`. However, that doesn't guarantee that the range overlaps the file, as the `range.limit` could be smaller than `file.smallest_key`.

This also fixes a bug in BlockBasedTableIterator of Valid() returning true even if status() returned error. This was exposed by the previous bug.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14011

Test Plan: Add unit tests in db_iterator_test and table_test

Reviewed By: cbi42

Differential Revision: D83496439

Pulled By: anand1976

fbshipit-source-id: a9d2d138d69d0c816d9f4160a984b273d00d683f
---
 db/db_iterator_test.cc                        | 81 +++++++++++++++++++
 db/version_set.cc                             | 14 ++++
 .../block_based/block_based_table_iterator.h  |  2 +-
 table/table_test.cc                           | 16 ++++
 tools/db_crashtest.py                         | 10 +--
 .../mscan_range_limit_between_files.md        |  1 +
 6 files changed, 118 insertions(+), 6 deletions(-)
 create mode 100644 unreleased_history/bug_fixes/mscan_range_limit_between_files.md

diff --git a/db/db_iterator_test.cc b/db/db_iterator_test.cc
index cc44ff069c05..1bf83d8e230d 100644
--- a/db/db_iterator_test.cc
+++ b/db/db_iterator_test.cc
@@ -4149,6 +4149,7 @@ class DBMultiScanIteratorTest : public DBTestBase,
       : DBTestBase("db_multi_scan_iterator_test", /*env_do_fsync=*/true) {}
 };
 
+// Param 0: ReadOptions::fill_cache
 INSTANTIATE_TEST_CASE_P(DBMultiScanIteratorTest, DBMultiScanIteratorTest,
                         ::testing::Bool());
 
@@ -4463,6 +4464,86 @@ TEST_P(DBMultiScanIteratorTest, OutOfL0FileRange) {
   ASSERT_EQ(count, 2);
 }
 
+TEST_P(DBMultiScanIteratorTest, RangeBetweenFiles) {
+  auto options = CurrentOptions();
+  options.target_file_size_base = 100 << 10;  // 20KB
+  options.compaction_style = kCompactionStyleUniversal;
+  options.num_levels = 50;
+  options.compression = kNoCompression;
+  DestroyAndReopen(options);
+
+  auto rnd = Random::GetTLSInstance();
+  // Write ~200KB data
+  for (int i = 0; i < 100; ++i) {
+    ASSERT_OK(Put(Key(i), rnd->RandomString(2 << 10)));
+  }
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(db_->CompactRange({}, nullptr, nullptr));
+  ASSERT_EQ(2, NumTableFilesAtLevel(49));
+
+  // Test with a scan range that overlaps an entire file, with upper bound
+  // between 2 files
+  std::vector<LiveFileMetaData> file_meta;
+  dbfull()->GetLiveFilesMetaData(&file_meta);
+  ASSERT_EQ(file_meta.size(), 2);
+  std::vector<std::string> key_ranges(4);
+  key_ranges[0] = file_meta[0].smallestkey;
+  key_ranges[1] = file_meta[0].largestkey + "0";
+  key_ranges[2] = file_meta[1].smallestkey + "0";
+  key_ranges[3] = file_meta[1].largestkey;
+  ReadOptions ro;
+  ro.fill_cache = GetParam();
+  MultiScanArgs scan_options(BytewiseComparator());
+  scan_options.insert(key_ranges[0], key_ranges[1]);
+  scan_options.insert(key_ranges[2], key_ranges[3]);
+  ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily();
+  std::unique_ptr<MultiScan> iter =
+      dbfull()->NewMultiScan(ro, cfh, scan_options);
+  try {
+    for (auto range : *iter) {
+      for (auto it : range) {
+        ASSERT_GE(it.first.ToString(), key_ranges[0]);
+      }
+    }
+  } catch (MultiScanException& ex) {
+    // Make sure exception contains the status
+    ASSERT_NOK(ex.status());
+    std::cerr << "Iterator returned status " << ex.what();
+    abort();
+  } catch (std::logic_error& ex) {
+    std::cerr << "Iterator returned logic error " << ex.what();
+    abort();
+  }
+  iter.reset();
+
+  // Test multiscan with a range entirely between adjacent files
+  key_ranges[0] = file_meta[0].largestkey + "0";
+  key_ranges[1] = file_meta[0].largestkey + "1";
+  key_ranges[2] = file_meta[1].smallestkey + "0";
+  key_ranges[3] = file_meta[1].largestkey;
+  (*scan_options).clear();
+  scan_options.insert(key_ranges[0], key_ranges[1]);
+  scan_options.insert(key_ranges[2], key_ranges[3]);
+  iter = dbfull()->NewMultiScan(ro, cfh, scan_options);
+  try {
+    for (auto range : *iter) {
+      for (auto it : range) {
+        ASSERT_GE(it.first.ToString(), key_ranges[0]);
+      }
+    }
+  } catch (MultiScanException& ex) {
+    // Make sure exception contains the status
+    ASSERT_NOK(ex.status());
+    std::cerr << "Iterator returned status " << ex.what();
+    abort();
+  } catch (std::logic_error& ex) {
+    std::cerr << "Iterator returned logic error " << ex.what();
+    abort();
+  }
+  iter.reset();
+}
+
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/db/version_set.cc b/db/version_set.cc
index f2eef583d0ef..a16e5232336b 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -1181,6 +1181,12 @@ class LevelIterator final : public InternalIterator {
       // 3. [  S  ] ...... [  E  ]
       for (auto i = fstart; i <= fend; i++) {
         if (i < flevel_->num_files) {
+          // FindFile only compares against the largest_key, so we need this
+          // additional check to ensure the scan range overlaps the file
+          if (icomparator_.InternalKeyComparator::Compare(
+                  iend.Encode(), flevel_->files[i].smallest_key) < 0) {
+            continue;
+          }
           auto& args = GetMultiScanArgForFile(i);
           args.insert(start.value(), end.value(), opt.property_bag);
         }
@@ -1362,6 +1368,14 @@ void LevelIterator::Seek(const Slice& target) {
   }
 
   if (file_iter_.iter() != nullptr) {
+    if (scan_opts_) {
+      // At this point, we only know that the seek target is < largest_key
+      // in the file. We need to check whether there is actual overlap.
+      const FdWithKeyRange& cur_file = flevel_->files[file_index_];
+      if (KeyReachedUpperBound(cur_file.smallest_key)) {
+        return;
+      }
+    }
     file_iter_.Seek(target);
     // Status::TryAgain indicates asynchronous request for retrieval of data
     // blocks has been submitted. So it should return at this point and Seek
diff --git a/table/block_based/block_based_table_iterator.h b/table/block_based/block_based_table_iterator.h
index 8d75770897fe..9cb2e407c5da 100644
--- a/table/block_based/block_based_table_iterator.h
+++ b/table/block_based/block_based_table_iterator.h
@@ -59,7 +59,7 @@ class BlockBasedTableIterator : public InternalIteratorBase<Slice> {
   bool NextAndGetResult(IterateResult* result) override;
   void Prev() override;
   bool Valid() const override {
-    return !is_out_of_bound_ &&
+    return !is_out_of_bound_ && multi_scan_status_.ok() &&
            (is_at_first_key_from_index_ ||
             (block_iter_points_to_real_block_ && block_iter_.Valid()));
   }
diff --git a/table/table_test.cc b/table/table_test.cc
index b699fce1ad4e..efb805f0e404 100644
--- a/table/table_test.cc
+++ b/table/table_test.cc
@@ -8490,6 +8490,22 @@ TEST_P(UserDefinedIndexTest, MultiScanFailureTest) {
   iter->Seek(key_ranges[2]);
   // Seek should fail as its not in the order specified in scan_options
   ASSERT_EQ(iter->status(), Status::InvalidArgument());
+  ASSERT_FALSE(iter->Valid());
+  iter.reset();
+
+  iter.reset(db->NewIterator(ro, cfh));
+  ASSERT_NE(iter, nullptr);
+  scan_options.max_prefetch_size = 0;
+  iter->Prepare(scan_options);
+  ub = key_ranges[1];
+  iter->Seek(key_ranges[0]);
+  ASSERT_OK(iter->status()) << iter->status().ToString();
+  ASSERT_TRUE(iter->Valid());
+  ub = key_ranges[3];
+  iter->Seek("key13");
+  // Seek should fail as its not in the order specified in scan_options
+  ASSERT_EQ(iter->status(), Status::InvalidArgument());
+  ASSERT_FALSE(iter->Valid());
   iter.reset();
 
   iter.reset(db->NewIterator(ro, cfh));
diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index 8f96dd7e0c2a..d75eb57fff34 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -1184,15 +1184,15 @@ def finalize_and_sanitize(src_params):
     # Continuous verification fails with secondaries inside NonBatchedOpsStressTest
     if dest_params.get("test_secondary") == 1:
         dest_params["continuous_verification_interval"] = 0
-    if (
-        dest_params.get("prefix_size", 0) > 0
-        or dest_params.get("read_fault_one_in", 0) > 0
-    ):
-        dest_params["use_multiscan"] = 0
     if dest_params.get("use_multiscan") == 1:
         dest_params["async_io"] = 0
         dest_params["delpercent"] += dest_params["delrangepercent"]
         dest_params["delrangepercent"] = 0
+        dest_params["prefix_size"] = -1
+        dest_params["iterpercent"] += dest_params["prefixpercent"]
+        dest_params["prefixpercent"] = 0
+        dest_params["read_fault_one_in"] = 0
+        dest_params["memtable_prefix_bloom_size_ratio"] = 0
     return dest_params
 
 
diff --git a/unreleased_history/bug_fixes/mscan_range_limit_between_files.md b/unreleased_history/bug_fixes/mscan_range_limit_between_files.md
new file mode 100644
index 000000000000..be94aa90ed06
--- /dev/null
+++ b/unreleased_history/bug_fixes/mscan_range_limit_between_files.md
@@ -0,0 +1 @@
+Fix incorrect MultiScan seek error status due to bugs in handling range limit falling between adjacent SST files key range.

From 13172e2be3c80ff77115becd5f7510b57c64102d Mon Sep 17 00:00:00 2001
From: ngina <221624547+nmk70@users.noreply.github.com>
Date: Wed, 1 Oct 2025 07:38:08 -0700
Subject: [PATCH 319/500] Add method to estimate index size (#14010)

Summary:
This method will be used to improve the compaction logic by accounting for the tail size, in addition to the data size,  when determining when to cut a file.

Problem: Currently the file cutting logic only considers data size when determining where to cut a file, failing to reserve space for index and filter blocks that are added when the file is finalized.

Key changes:
- Add EstimateCurrentIndexSize() to IndexBuilder interface
- Implement in ShortenedIndexBuilder with buffer that accounts for the next index entry. The buffer addresses under-estimation where the current index size doesn't account for the next index entry associated with the data block currently being built. The 2x multiplier bounds the estimate in the right direction and handles outlier cases with large keys.
- Add num_index_entries_ member to track added index entries (== data blocks emitted). This is thread-safe since it's updated/read in the serialized emit step.

Next steps:
- Partitioned index size estimation implementation
- Update compaction file cutting logic to consider index size estimation

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14010

Test Plan: Added a new test class with unit tests for new builder size estimation across all IndexBuilder implementations.

Reviewed By: pdillinger

Differential Revision: D83501741

Pulled By: nmk70

fbshipit-source-id: d58fc2a9e92e12a162f6244d4abd707a9c9e1885
---
 BUCK                                          |   6 +
 CMakeLists.txt                                |   1 +
 Makefile                                      |   3 +
 src.mk                                        |   1 +
 table/block_based/index_builder.cc            |  16 ++
 table/block_based/index_builder.h             |  18 ++
 table/block_based/index_builder_test.cc       | 183 ++++++++++++++++++
 .../block_based/user_defined_index_wrapper.h  |   2 +
 8 files changed, 230 insertions(+)
 create mode 100644 table/block_based/index_builder_test.cc

diff --git a/BUCK b/BUCK
index 6e57e5cd1a7a..8a85587abf2b 100644
--- a/BUCK
+++ b/BUCK
@@ -5193,6 +5193,12 @@ cpp_unittest_wrapper(name="import_column_family_test",
             extra_compiler_flags=[])
 
 
+cpp_unittest_wrapper(name="index_builder_test",
+            srcs=["table/block_based/index_builder_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
 cpp_unittest_wrapper(name="inlineskiplist_test",
             srcs=["memtable/inlineskiplist_test.cc"],
             deps=[":rocksdb_test_lib"],
diff --git a/CMakeLists.txt b/CMakeLists.txt
index dd602fdacff4..502e6929aac8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1438,6 +1438,7 @@ if(WITH_TESTS)
         table/block_based/block_based_table_reader_test.cc
         table/block_based/block_test.cc
         table/block_based/data_block_hash_index_test.cc
+        table/block_based/index_builder_test.cc
         table/block_based/full_filter_block_test.cc
         table/block_based/partitioned_filter_block_test.cc
         table/cleanable_test.cc
diff --git a/Makefile b/Makefile
index 7b019a389144..f014aca2ec00 100644
--- a/Makefile
+++ b/Makefile
@@ -1739,6 +1739,9 @@ block_test: $(OBJ_DIR)/table/block_based/block_test.o $(TEST_LIBRARY) $(LIBRARY)
 data_block_hash_index_test: $(OBJ_DIR)/table/block_based/data_block_hash_index_test.o $(TEST_LIBRARY) $(LIBRARY)
 	$(AM_LINK)
 
+index_builder_test: $(OBJ_DIR)/table/block_based/index_builder_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
 inlineskiplist_test: $(OBJ_DIR)/memtable/inlineskiplist_test.o $(TEST_LIBRARY) $(LIBRARY)
 	$(AM_LINK)
 
diff --git a/src.mk b/src.mk
index 3954622ba350..3f465c4562a3 100644
--- a/src.mk
+++ b/src.mk
@@ -588,6 +588,7 @@ TEST_MAIN_SOURCES =                                                     \
   table/block_based/block_based_table_reader_test.cc                    \
   table/block_based/block_test.cc                                       \
   table/block_based/data_block_hash_index_test.cc                       \
+  table/block_based/index_builder_test.cc                               \
   table/block_based/full_filter_block_test.cc                           \
   table/block_based/partitioned_filter_block_test.cc                    \
   table/cleanable_test.cc                                               \
diff --git a/table/block_based/index_builder.cc b/table/block_based/index_builder.cc
index 1ab6b0da82ae..2124579f82c4 100644
--- a/table/block_based/index_builder.cc
+++ b/table/block_based/index_builder.cc
@@ -117,6 +117,22 @@ Slice ShortenedIndexBuilder::FindShortInternalKeySuccessor(
   }
 }
 
+uint64_t ShortenedIndexBuilder::EstimateCurrentIndexSize() const {
+  uint64_t current_size =
+      must_use_separator_with_seq_
+          ? index_block_builder_.CurrentSizeEstimate()
+          : index_block_builder_without_seq_.CurrentSizeEstimate();
+
+  if (num_index_entries_ == 0) {
+    return current_size;
+  }
+
+  uint64_t avg_entry_size = current_size / num_index_entries_;
+
+  // Add buffer to generously account (in most cases) for the next index entry
+  return current_size + (2 * avg_entry_size);
+}
+
 PartitionedIndexBuilder* PartitionedIndexBuilder::CreateIndexBuilder(
     const InternalKeyComparator* comparator,
     const bool use_value_delta_encoding,
diff --git a/table/block_based/index_builder.h b/table/block_based/index_builder.h
index 630555219648..9cf498ea25d3 100644
--- a/table/block_based/index_builder.h
+++ b/table/block_based/index_builder.h
@@ -153,6 +153,13 @@ class IndexBuilder {
   // Get the size for index block. Must be called after ::Finish.
   virtual size_t IndexSize() const = 0;
 
+  // Get an estimate for current total index size based on current builder
+  // state.
+  //
+  // Called during compaction to estimate final index size for file cutting
+  // decisions.
+  virtual uint64_t EstimateCurrentIndexSize() const = 0;
+
   virtual bool separator_is_key_plus_seq() { return true; }
 
  protected:
@@ -317,6 +324,8 @@ class ShortenedIndexBuilder : public IndexBuilder {
                                            encoded_entry,
                                            &delta_encoded_entry_slice);
     }
+
+    ++num_index_entries_;
   }
 
   Slice AddIndexEntry(const Slice& last_key_in_current_block,
@@ -406,6 +415,8 @@ class ShortenedIndexBuilder : public IndexBuilder {
 
   size_t IndexSize() const override { return index_size_; }
 
+  uint64_t EstimateCurrentIndexSize() const override;
+
   bool separator_is_key_plus_seq() override {
     return must_use_separator_with_seq_;
   }
@@ -436,6 +447,7 @@ class ShortenedIndexBuilder : public IndexBuilder {
   BlockBasedTableOptions::IndexShorteningMode shortening_mode_;
   BlockHandle last_encoded_handle_ = BlockHandle::NullBlockHandle();
   std::string current_block_first_internal_key_;
+  uint64_t num_index_entries_ = 0;
 };
 
 // HashIndexBuilder contains a binary-searchable primary index and the
@@ -554,6 +566,9 @@ class HashIndexBuilder : public IndexBuilder {
            prefix_meta_block_.size();
   }
 
+  // TODO: implement
+  uint64_t EstimateCurrentIndexSize() const override { return 0; }
+
   bool separator_is_key_plus_seq() override {
     return primary_index_builder_.separator_is_key_plus_seq();
   }
@@ -628,6 +643,9 @@ class PartitionedIndexBuilder : public IndexBuilder {
   size_t TopLevelIndexSize(uint64_t) const { return top_level_index_size_; }
   size_t NumPartitions() const;
 
+  // TODO: implement
+  uint64_t EstimateCurrentIndexSize() const override { return 0; }
+
   inline bool ShouldCutFilterBlock() {
     // Current policy is to align the partitions of index and filters
     if (cut_filter_block) {
diff --git a/table/block_based/index_builder_test.cc b/table/block_based/index_builder_test.cc
new file mode 100644
index 000000000000..28b138b53f5e
--- /dev/null
+++ b/table/block_based/index_builder_test.cc
@@ -0,0 +1,183 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "table/block_based/index_builder.h"
+
+#include <gtest/gtest.h>
+
+#include <memory>
+#include <string>
+
+#include "db/dbformat.h"
+#include "rocksdb/comparator.h"
+#include "table/format.h"
+#include "test_util/testharness.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class IndexBuilderTest
+    : public testing::Test,
+      public testing::WithParamInterface<BlockBasedTableOptions::IndexType> {
+ public:
+  IndexBuilderTest() : icomp_(BytewiseComparator()) {}
+
+  std::unique_ptr<IndexBuilder> CreateIndexBuilder() {
+    BlockBasedTableOptions table_options;
+    BlockBasedTableOptions::IndexType index_type = GetParam();
+    return std::unique_ptr<IndexBuilder>(IndexBuilder::CreateIndexBuilder(
+        index_type, &icomp_, nullptr, false /* use_value_delta_encoding */,
+        table_options, 0 /* ts_sz */,
+        true /* persist_user_defined_timestamps */));
+  }
+
+  std::string MakeKey(int i) {
+    return InternalKey(std::string("key") + std::to_string(i), 100 - i,
+                       kTypeValue)
+        .Encode()
+        .ToString();
+  }
+
+  BlockHandle MakeBlockHandle(uint64_t offset, uint64_t size) {
+    BlockHandle handle;
+    handle.set_offset(offset);
+    handle.set_size(size);
+    return handle;
+  }
+
+  void AddEntriesToBuilder(IndexBuilder* builder, int num_entries,
+                           std::vector<uint64_t>* estimates = nullptr) {
+    for (int i = 1; i <= num_entries; ++i) {
+      std::string key_current = MakeKey(i);
+      BlockHandle handle = MakeBlockHandle(i * kBlockOffset, kBlockSize);
+      std::string separator_scratch;
+
+      if (i == num_entries) {
+        // Last entry - no next key
+        builder->AddIndexEntry(key_current, nullptr, handle,
+                               &separator_scratch);
+      } else {
+        std::string key_next = MakeKey(i + 1);
+        Slice key_next_slice(key_next);
+        builder->AddIndexEntry(key_current, &key_next_slice, handle,
+                               &separator_scratch);
+      }
+
+      if (estimates) {
+        uint64_t current_estimate = builder->EstimateCurrentIndexSize();
+        estimates->push_back(current_estimate);
+      }
+    }
+  }
+
+ protected:
+  InternalKeyComparator icomp_;
+  static const uint64_t kBlockOffset = 1000;
+  static const uint64_t kBlockSize = 4096;
+  // BlockBuilder initial overhead
+  // See BlockBuilder constructor and Reset()
+  static const uint64_t kBlockBuilderInitialOverhead = 2 * sizeof(uint32_t);
+};
+
+const uint64_t IndexBuilderTest::kBlockOffset;
+const uint64_t IndexBuilderTest::kBlockSize;
+const uint64_t IndexBuilderTest::kBlockBuilderInitialOverhead;
+
+TEST_P(IndexBuilderTest, EstimateCurrentIndexSize) {
+  auto builder = CreateIndexBuilder();
+  BlockBasedTableOptions::IndexType index_type = GetParam();
+
+  // Empty builder
+  uint64_t empty_size = builder->EstimateCurrentIndexSize();
+  if (index_type == BlockBasedTableOptions::kBinarySearch) {
+    EXPECT_EQ(empty_size, kBlockBuilderInitialOverhead)
+        << "Empty ShortenedIndexBuilder should return BlockBuilder initial "
+           "overhead ("
+        << kBlockBuilderInitialOverhead;
+  } else {
+    EXPECT_EQ(empty_size, 0) << "Other builders should return 0 when empty";
+  }
+
+  // Add one entry
+  AddEntriesToBuilder(builder.get(), 1);
+  uint64_t size_after_one = builder->EstimateCurrentIndexSize();
+
+  if (index_type == BlockBasedTableOptions::kBinarySearch) {
+    EXPECT_GT(size_after_one, kBlockBuilderInitialOverhead)
+        << "Estimate should be greater than initial overhead";
+  } else {
+    // Other builders currently return 0 (which is expected)
+    EXPECT_EQ(size_after_one, 0) << "Other index builders currently return 0";
+  }
+
+  // Add multiple entries and capture all estimates
+  std::vector<uint64_t> estimates;
+  auto new_builder = CreateIndexBuilder();
+  AddEntriesToBuilder(new_builder.get(), 5, &estimates);
+
+  // Validate reported estimates
+  for (size_t i = 0; i < estimates.size(); ++i) {
+    uint64_t estimate = estimates[i];
+
+    if (index_type == BlockBasedTableOptions::kBinarySearch) {
+      EXPECT_GT(estimate, 0)
+          << "Estimate should be positive for " << i << " entry";
+      if (i > 0) {
+        EXPECT_GT(estimate, estimates[i - 1])
+            << "Estimate should not decrease with more entries (entry " << i - 1
+            << ": " << estimates[i - 1] << ", entry " << i << ": " << estimate
+            << ")";
+      }
+    } else {
+      EXPECT_EQ(estimate, 0) << "Other index builders currently return 0";
+    }
+  }
+
+  // Multiple calls should return the same value if the builder state is not
+  // modified
+  uint64_t estimate1 = builder->EstimateCurrentIndexSize();
+  uint64_t estimate2 = builder->EstimateCurrentIndexSize();
+  uint64_t estimate3 = builder->EstimateCurrentIndexSize();
+
+  EXPECT_EQ(estimate1, estimate2);
+  EXPECT_EQ(estimate2, estimate3);
+
+  // Test behavior after Finish() - only for builders that can be finished
+  // successfully
+  if (index_type == BlockBasedTableOptions::kBinarySearch) {
+    uint64_t estimate_before_finish = builder->EstimateCurrentIndexSize();
+
+    IndexBuilder::IndexBlocks index_blocks;
+    Status s = builder->Finish(&index_blocks);
+    EXPECT_TRUE(s.ok()) << "ShortenedIndexBuilder should finish successfully: "
+                        << s.ToString();
+
+    uint64_t estimate_after_finish = builder->EstimateCurrentIndexSize();
+    EXPECT_GT(estimate_after_finish, 0);
+    EXPECT_LE(estimate_before_finish, estimate_after_finish)
+        << "Estimate should not decrease after finish";
+
+    // Ensure that the actual index size is not greater than the estimated size
+    // after finish is called to prevent underestimation.
+    uint64_t actual_index_size = builder->IndexSize();
+    EXPECT_LE(actual_index_size, estimate_after_finish)
+        << "Actual index size should not be greater than estimated size: "
+           "actual size:  "
+        << actual_index_size << ", estimated size: " << estimate_after_finish;
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(
+    IndexBuilderTypes, IndexBuilderTest,
+    ::testing::Values(BlockBasedTableOptions::kBinarySearch,
+                      BlockBasedTableOptions::kHashSearch,
+                      BlockBasedTableOptions::kTwoLevelIndexSearch));
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/table/block_based/user_defined_index_wrapper.h b/table/block_based/user_defined_index_wrapper.h
index acc5f40a1c97..4c37289c6c92 100644
--- a/table/block_based/user_defined_index_wrapper.h
+++ b/table/block_based/user_defined_index_wrapper.h
@@ -155,6 +155,8 @@ class UserDefinedIndexBuilderWrapper : public IndexBuilder {
 
   size_t IndexSize() const override { return index_size_; }
 
+  uint64_t EstimateCurrentIndexSize() const override { return 0; }
+
   bool separator_is_key_plus_seq() override {
     return internal_index_builder_->separator_is_key_plus_seq();
   }

From 1e5fa69c99ac8765783f5ce8a3a065b08f5b08a7 Mon Sep 17 00:00:00 2001
From: Hui Xiao <huixiao@fb.com>
Date: Wed, 1 Oct 2025 14:21:55 -0700
Subject: [PATCH 320/500] Resuming and persisting subcompaction progress in
 CompactionJob (#13983)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Summary:
### Context/Summary:
Flow of resuming: DB::OpenAndCompact() -> Compaction progress file  -> SubcompactionProgress -> CompactionJob
Flow of persistence: CompactionJob -> SubcompactionProgress -> Compaction progress file  -> DB that is called with OpenAndCompact()

This PR focuses on SubcompactionProgress -> CompactionJob and  CompactionJob -> SubcompactionProgress -> Compaction progress file. For now only single subcompaction is supported as OpenAndCompact() does not partition compaction anyway.

The actual triggering of progress persistence and resuming (i.e, integration) is through DB::OpenAndCompact() in the upcoming PR.

**Resume Flow**
1. input_iter->Seek(next_internal_key_to_compact)  // Position iterator
2. ReadTableProperties()                           // Validate existing outputs
3. RestoreCompactionOutputs() in CompactionOutputs                     // Rebuild output file metadata
4. Restore critical statistics about processed input and output records count for verification later
5. AdvanceFileNumbers()                            // Prevent file number conflicts
6. Continue normal compaction from positioned iterator or fallback to not resuming compaction in limited case or fail the compaction entirely

**Persistence Strategy**
1. When: At each SST file completion (FinishCompactionOutputFile()). This is the simplest but most expensive frequency. See below for benchmarking and potential follow-up items
2. What: Serialize, write and sync the in-memory SubcompactionProgress to a dedicated manifest-like file
3. For simplicity: Only persist at "clean" boundaries (no overlapping user keys, no range deletions, no timestamp for now)

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13983

Test Plan:
- New unit test in CompactionJob level to cover basic compaction progress resumption
- Existing UTs and stress/crash test to test no correctness regression to existing compaction code
- Run benchmark to ensure no performance regression to existing compaction code
```
./db_bench --benchmarks=fillseq[-X10] --db=$db --disable_auto_compactions=true --num=100000 --value_size=25000 --compression_type=none --target_file_size_base=268435456 --write_buffer_size=268435456
```
Pre-PR:
fillseq [AVG    10 runs] : 45127 (± 799) ops/sec; 1076.6 (± 19.1) MB/sec
fillseq [MEDIAN 10 runs] : 45375 ops/sec; 1082.5 MB/sec
Post-PR (regressed 0.057%, ignorable)
fillseq [AVG    10 runs] : 45101 (± 920) ops/sec; 1076.0 (± 22.0) MB/sec
fillseq [MEDIAN 10 runs] : 45385 ops/sec; 1082.8 MB/sec

Reviewed By: jaykorean

Differential Revision: D82889188

Pulled By: hx235

fbshipit-source-id: 8553fd478f134969d331af2c5a125b94bd747268
---
 db/compaction/compaction_job.cc         | 536 +++++++++++++++++++++---
 db/compaction/compaction_job.h          |  78 +++-
 db/compaction/compaction_job_test.cc    | 373 +++++++++++++++++
 db/compaction/compaction_outputs.cc     |   6 +-
 db/compaction/compaction_outputs.h      |  17 +-
 db/compaction/compaction_service_job.cc |   7 +-
 db/compaction/subcompaction_state.cc    |   6 +-
 db/compaction/subcompaction_state.h     |  14 +-
 db/db_impl/db_impl_files.cc             |   5 +
 db/version_edit.cc                      |  85 +---
 db/version_edit.h                       |  33 +-
 db/version_edit_test.cc                 |  84 ++--
 file/filename.cc                        |  40 ++
 file/filename.h                         |  15 +-
 include/rocksdb/types.h                 |   3 +-
 15 files changed, 1077 insertions(+), 225 deletions(-)

diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc
index b2588eaead90..80fc92b98c4b 100644
--- a/db/compaction/compaction_job.cc
+++ b/db/compaction/compaction_job.cc
@@ -51,7 +51,9 @@
 #include "rocksdb/status.h"
 #include "rocksdb/table.h"
 #include "rocksdb/utilities/options_type.h"
+#include "table/format.h"
 #include "table/merging_iterator.h"
+#include "table/meta_blocks.h"
 #include "table/table_builder.h"
 #include "table/unique_id_impl.h"
 #include "test_util/sync_point.h"
@@ -253,7 +255,9 @@ void CompactionJob::ReportStartedCompaction(Compaction* compaction) {
 
 void CompactionJob::Prepare(
     std::optional<std::pair<std::optional<Slice>, std::optional<Slice>>>
-        known_single_subcompact) {
+        known_single_subcompact,
+    const CompactionProgress& compaction_progress,
+    log::Writer* compaction_progress_writer) {
   db_mutex_->AssertHeld();
   AutoThreadOperationStageUpdater stage_updater(
       ThreadStatus::STAGE_COMPACTION_PREPARE);
@@ -303,6 +307,9 @@ void CompactionJob::Prepare(
                                               /*sub_job_id*/ 0);
   }
 
+  MaybeAssignCompactionProgressAndWriter(compaction_progress,
+                                         compaction_progress_writer);
+
   // collect all seqno->time information from the input files which will be used
   // to encode seqno->time to the output files.
   SequenceNumber preserve_time_min_seqno = kMaxSequenceNumber;
@@ -401,6 +408,25 @@ void CompactionJob::Prepare(
   options_file_number_ = versions_->options_file_number();
 }
 
+void CompactionJob::MaybeAssignCompactionProgressAndWriter(
+    const CompactionProgress& compaction_progress,
+    log::Writer* compaction_progress_writer) {
+  // LIMITATION: Only supports resuming single subcompaction for now
+  if (compact_->sub_compact_states.size() != 1) {
+    return;
+  }
+
+  if (!compaction_progress.empty()) {
+    assert(compaction_progress.size() == 1);
+    SubcompactionState* sub_compact = &compact_->sub_compact_states[0];
+    const SubcompactionProgress& subcompaction_progress =
+        compaction_progress[0];
+    sub_compact->SetSubcompactionProgress(subcompaction_progress);
+  }
+
+  compaction_progress_writer_ = compaction_progress_writer;
+}
+
 uint64_t CompactionJob::GetSubcompactionsLimit() {
   return extra_num_subcompaction_threads_reserved_ +
          std::max(
@@ -1249,8 +1275,8 @@ Status CompactionJob::SetupAndValidateCompactionFilter(
   return Status::OK();
 }
 
-void CompactionJob::InitializeReadOptions(
-    ColumnFamilyData* cfd, ReadOptions& read_options,
+void CompactionJob::InitializeReadOptionsAndBoundaries(
+    const size_t ts_sz, ReadOptions& read_options,
     SubcompactionKeyBoundaries& boundaries) {
   read_options.verify_checksums = true;
   read_options.fill_cache = false;
@@ -1264,8 +1290,6 @@ void CompactionJob::InitializeReadOptions(
 
   // Remove the timestamps from boundaries because boundaries created in
   // GenSubcompactionBoundaries doesn't strip away the timestamp.
-  const size_t ts_sz = cfd->user_comparator()->timestamp_size();
-
   if (boundaries.start.has_value()) {
     read_options.iterate_lower_bound = &(*boundaries.start);
     if (ts_sz > 0) {
@@ -1282,30 +1306,7 @@ void CompactionJob::InitializeReadOptions(
       read_options.iterate_upper_bound = &(*boundaries.end_without_ts);
     }
   }
-}
-
-InternalIterator* CompactionJob::CreateInputIterator(
-    SubcompactionState* sub_compact, ColumnFamilyData* cfd,
-    SubcompactionInternalIterators& iterators,
-    SubcompactionKeyBoundaries& boundaries, ReadOptions& read_options) {
-  // This is assigned after creation of SubcompactionState to simplify that
-  // creation across both CompactionJob and CompactionServiceCompactionJob
-  sub_compact->AssignRangeDelAggregator(
-      std::make_unique<CompactionRangeDelAggregator>(
-          &cfd->internal_comparator(), job_context_->snapshot_seqs,
-          &full_history_ts_low_, &trim_ts_));
-
-  InitializeReadOptions(cfd, read_options, boundaries);
-
-  // Although the v2 aggregator is what the level iterator(s) know about,
-  // the AddTombstones calls will be propagated down to the v1 aggregator.
-  iterators.raw_input =
-      std::unique_ptr<InternalIterator>(versions_->MakeInputIterator(
-          read_options, sub_compact->compaction, sub_compact->RangeDelAgg(),
-          file_options_for_read_, boundaries.start, boundaries.end));
-  InternalIterator* input = iterators.raw_input.get();
 
-  const size_t ts_sz = cfd->user_comparator()->timestamp_size();
   if (ts_sz > 0) {
     if (ts_sz <= strlen(boundaries.kMaxTs)) {
       boundaries.ts_slice = Slice(boundaries.kMaxTs, ts_sz);
@@ -1314,7 +1315,6 @@ InternalIterator* CompactionJob::CreateInputIterator(
       boundaries.ts_slice = Slice(boundaries.max_ts);
     }
   }
-
   if (boundaries.start.has_value()) {
     boundaries.start_ikey.SetInternalKey(*boundaries.start, kMaxSequenceNumber,
                                          kValueTypeForSeek);
@@ -1335,6 +1335,29 @@ InternalIterator* CompactionJob::CreateInputIterator(
     boundaries.end_internal_key = boundaries.end_ikey.GetInternalKey();
     boundaries.end_user_key = boundaries.end_ikey.GetUserKey();
   }
+}
+
+InternalIterator* CompactionJob::CreateInputIterator(
+    SubcompactionState* sub_compact, ColumnFamilyData* cfd,
+    SubcompactionInternalIterators& iterators,
+    SubcompactionKeyBoundaries& boundaries, ReadOptions& read_options) {
+  const size_t ts_sz = cfd->user_comparator()->timestamp_size();
+  InitializeReadOptionsAndBoundaries(ts_sz, read_options, boundaries);
+
+  // This is assigned after creation of SubcompactionState to simplify that
+  // creation across both CompactionJob and CompactionServiceCompactionJob
+  sub_compact->AssignRangeDelAggregator(
+      std::make_unique<CompactionRangeDelAggregator>(
+          &cfd->internal_comparator(), job_context_->snapshot_seqs,
+          &full_history_ts_low_, &trim_ts_));
+
+  // Although the v2 aggregator is what the level iterator(s) know about,
+  // the AddTombstones calls will be propagated down to the v1 aggregator.
+  iterators.raw_input =
+      std::unique_ptr<InternalIterator>(versions_->MakeInputIterator(
+          read_options, sub_compact->compaction, sub_compact->RangeDelAgg(),
+          file_options_for_read_, boundaries.start, boundaries.end));
+  InternalIterator* input = iterators.raw_input.get();
 
   if (boundaries.start.has_value() || boundaries.end.has_value()) {
     iterators.clip = std::make_unique<ClippingIterator>(
@@ -1424,11 +1447,13 @@ CompactionJob::CreateFileHandlers(SubcompactionState* sub_compact,
 
   const CompactionFileCloseFunc close_file_func =
       [this, sub_compact, start_user_key, end_user_key](
-          CompactionOutputs& outputs, const Status& status,
-          const Slice& next_table_min_key) {
-        return this->FinishCompactionOutputFile(status, sub_compact, outputs,
-                                                next_table_min_key,
-                                                start_user_key, end_user_key);
+          const Status& status,
+          const ParsedInternalKey& prev_table_last_internal_key,
+          const Slice& next_table_min_key, const CompactionIterator* c_iter,
+          CompactionOutputs& outputs) {
+        return this->FinishCompactionOutputFile(
+            status, prev_table_last_internal_key, next_table_min_key,
+            start_user_key, end_user_key, c_iter, sub_compact, outputs);
       };
 
   return {open_file_func, close_file_func};
@@ -1442,6 +1467,9 @@ Status CompactionJob::ProcessKeyValue(
   const uint64_t kRecordStatsEvery = 1000;
   [[maybe_unused]] const std::optional<const Slice> end = sub_compact->end;
 
+  IterKey last_output_key;
+  ParsedInternalKey last_output_ikey;
+
   TEST_SYNC_POINT_CALLBACK(
       "CompactionJob::ProcessKeyValueCompaction()::Processing",
       static_cast<void*>(const_cast<Compaction*>(sub_compact->compaction)));
@@ -1491,8 +1519,9 @@ Status CompactionJob::ProcessKeyValue(
     // and `close_file_func`.
     // TODO: it would be better to have the compaction file open/close moved
     // into `CompactionOutputs` which has the output file information.
-    status = sub_compact->AddToOutput(*c_iter, use_proximal_output,
-                                      open_file_func, close_file_func);
+    status =
+        sub_compact->AddToOutput(*c_iter, use_proximal_output, open_file_func,
+                                 close_file_func, last_output_ikey);
     if (!status.ok()) {
       break;
     }
@@ -1500,6 +1529,10 @@ Status CompactionJob::ProcessKeyValue(
     TEST_SYNC_POINT_CALLBACK("CompactionJob::Run():PausingManualCompaction:2",
                              static_cast<void*>(const_cast<std::atomic<bool>*>(
                                  &manual_compaction_canceled_)));
+
+    last_output_key.SetInternalKey(c_iter->key(), &last_output_ikey);
+    last_output_ikey.sequence = ikey.sequence;
+    last_output_ikey.type = ikey.type;
     c_iter->Next();
 
 #ifndef NDEBUG
@@ -1684,6 +1717,22 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
   ReadOptions read_options;
   const WriteOptions write_options(Env::IOPriority::IO_LOW,
                                    Env::IOActivity::kCompaction);
+
+  InternalIterator* input_iter = CreateInputIterator(
+      sub_compact, cfd, iterators, boundaries, read_options);
+
+  assert(input_iter);
+
+  Status status =
+      MaybeResumeSubcompactionProgressOnInputIterator(sub_compact, input_iter);
+
+  if (status.IsNotFound()) {
+    input_iter->SeekToFirst();
+  } else if (!status.ok()) {
+    sub_compact->status = status;
+    return;
+  }
+
   MergeHelper merge(
       env_, cfd->user_comparator(), cfd->ioptions().merge_operator.get(),
       compaction_filter, db_options_.info_log.get(),
@@ -1692,11 +1741,6 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
       compact_->compaction->level(), db_options_.stats);
   BlobFileResources blob_resources;
 
-  InternalIterator* input_iter = CreateInputIterator(
-      sub_compact, cfd, iterators, boundaries, read_options);
-  assert(input_iter);
-  input_iter->SeekToFirst();
-
   auto c_iter =
       CreateCompactionIterator(sub_compact, cfd, input_iter, compaction_filter,
                                merge, blob_resources, write_options);
@@ -1711,9 +1755,8 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
   auto [open_file_func, close_file_func] =
       CreateFileHandlers(sub_compact, boundaries);
 
-  Status status =
-      ProcessKeyValue(sub_compact, cfd, c_iter.get(), open_file_func,
-                      close_file_func, prev_cpu_micros);
+  status = ProcessKeyValue(sub_compact, cfd, c_iter.get(), open_file_func,
+                           close_file_func, prev_cpu_micros);
 
   status = FinalizeProcessKeyValueStatus(cfd, input_iter, c_iter.get(), status);
 
@@ -1795,9 +1838,11 @@ void CompactionJob::RecordDroppedKeys(
 }
 
 Status CompactionJob::FinishCompactionOutputFile(
-    const Status& input_status, SubcompactionState* sub_compact,
-    CompactionOutputs& outputs, const Slice& next_table_min_key,
-    const Slice* comp_start_user_key, const Slice* comp_end_user_key) {
+    const Status& input_status,
+    const ParsedInternalKey& prev_table_last_internal_key,
+    const Slice& next_table_min_key, const Slice* comp_start_user_key,
+    const Slice* comp_end_user_key, const CompactionIterator* c_iter,
+    SubcompactionState* sub_compact, CompactionOutputs& outputs) {
   AutoThreadOperationStageUpdater stage_updater(
       ThreadStatus::STAGE_COMPACTION_SYNC_FILE);
   assert(sub_compact != nullptr);
@@ -1971,10 +2016,79 @@ Status CompactionJob::FinishCompactionOutputFile(
     }
   }
 
+  if (s.ok() && ShouldUpdateSubcompactionProgress(sub_compact,
+                                                  prev_table_last_internal_key,
+                                                  next_table_min_key, meta)) {
+    UpdateSubcompactionProgress(c_iter, next_table_min_key, sub_compact);
+    s = PersistSubcompactionProgress(sub_compact);
+  }
   outputs.ResetBuilder();
   return s;
 }
 
+bool CompactionJob::ShouldUpdateSubcompactionProgress(
+    const SubcompactionState* sub_compact,
+    const ParsedInternalKey& prev_table_last_internal_key,
+    const Slice& next_table_min_internal_key, const FileMetaData* meta) const {
+  const auto* cfd = sub_compact->compaction->column_family_data();
+  // No need to update when the output will not get persisted
+  if (compaction_progress_writer_ == nullptr) {
+    return false;
+  }
+
+  // No need to update for a new empty output
+  if (meta == nullptr) {
+    return false;
+  }
+
+  // TODO(hx235): save progress even on the last output file
+  if (next_table_min_internal_key.empty()) {
+    return false;
+  }
+
+  // LIMITATION: Persisting compaction progress with timestamp
+  // is not supported since the feature of persisting timestamp of the key in
+  // SST files itself is still experimental
+  size_t ts_sz = cfd->user_comparator()->timestamp_size();
+  if (ts_sz > 0) {
+    return false;
+  }
+
+  // LIMITATION: Compaction progress persistence disabled for file boundaries
+  // contaning range deletions. Range deletions can span file boundaries, making
+  // it difficult (but possible) to ensure adjacent output tables have different
+  // user keys. See the last check for why different users keys of adjacent
+  // output tables are needed
+  const ValueType next_table_min_internal_key_type =
+      ExtractValueType(next_table_min_internal_key);
+  const ValueType prev_table_last_internal_key_type =
+      prev_table_last_internal_key.user_key.empty()
+          ? ValueType::kTypeValue
+          : prev_table_last_internal_key.type;
+
+  if (next_table_min_internal_key_type == ValueType::kTypeRangeDeletion ||
+      prev_table_last_internal_key_type == ValueType::kTypeRangeDeletion) {
+    return false;
+  }
+
+  // LIMITATION: Compaction progress persistence disabled when adjacent output
+  // tables share the same user key at boundaries. This ensures a simple Seek()
+  // of the next key when resuming can process all versions of a user key
+  const Slice next_table_min_user_key =
+      ExtractUserKey(next_table_min_internal_key);
+  const Slice prev_table_last_user_key =
+      prev_table_last_internal_key.user_key.empty()
+          ? Slice()
+          : prev_table_last_internal_key.user_key;
+
+  if (cfd->user_comparator()->EqualWithoutTimestamp(next_table_min_user_key,
+                                                    prev_table_last_user_key)) {
+    return false;
+  }
+
+  return true;
+}
+
 Status CompactionJob::InstallCompactionResults(bool* compaction_released) {
   assert(compact_);
 
@@ -2508,6 +2622,333 @@ Env::IOPriority CompactionJob::GetRateLimiterPriority() {
   return Env::IO_LOW;
 }
 
+Status CompactionJob::ReadTablePropertiesDirectly(
+    const ImmutableOptions& ioptions, const MutableCFOptions& moptions,
+    const FileMetaData* file_meta, const ReadOptions& read_options,
+    std::shared_ptr<const TableProperties>* tp) {
+  std::unique_ptr<FSRandomAccessFile> file;
+  std::string file_name = GetTableFileName(file_meta->fd.GetNumber());
+  Status s = ioptions.fs->NewRandomAccessFile(file_name, file_options_, &file,
+                                              nullptr /* dbg */);
+  if (!s.ok()) {
+    return s;
+  }
+
+  std::unique_ptr<RandomAccessFileReader> file_reader(
+      new RandomAccessFileReader(
+          std::move(file), file_name, ioptions.clock, io_tracer_,
+          ioptions.stats, Histograms::SST_READ_MICROS /* hist_type */,
+          nullptr /* file_read_hist */, ioptions.rate_limiter.get(),
+          ioptions.listeners));
+
+  std::unique_ptr<TableProperties> props;
+
+  uint64_t magic_number = kBlockBasedTableMagicNumber;
+
+  const auto* table_factory = moptions.table_factory.get();
+  if (table_factory == nullptr) {
+    return Status::Incomplete("Table factory is not set");
+  } else {
+    const auto& table_factory_name = table_factory->Name();
+    if (table_factory_name == TableFactory::kPlainTableName()) {
+      magic_number = kPlainTableMagicNumber;
+    } else if (table_factory_name == TableFactory::kCuckooTableName()) {
+      magic_number = kCuckooTableMagicNumber;
+    }
+  }
+
+  s = ReadTableProperties(file_reader.get(), file_meta->fd.GetFileSize(),
+                          magic_number, ioptions, read_options, &props);
+  if (!s.ok()) {
+    return s;
+  }
+
+  *tp = std::move(props);
+  return s;
+}
+
+Status CompactionJob::ReadOutputFilesTableProperties(
+    const autovector<FileMetaData>& output_files,
+    const ReadOptions& read_options,
+    std::vector<std::shared_ptr<const TableProperties>>&
+        output_files_table_properties,
+    bool is_proximal_level) {
+  assert(!output_files.empty());
+
+  static const char* level_type =
+      is_proximal_level ? "proximal output" : "output";
+
+  output_files_table_properties.reserve(output_files.size());
+
+  Status s;
+
+  for (const FileMetaData& metadata : output_files) {
+    std::shared_ptr<const TableProperties> tp;
+    s = ReadTablePropertiesDirectly(compact_->compaction->immutable_options(),
+                                    compact_->compaction->mutable_cf_options(),
+                                    &metadata, read_options, &tp);
+    if (!s.ok()) {
+      ROCKS_LOG_ERROR(
+          db_options_.info_log,
+          "Failed to read table properties for %s level output file #%" PRIu64
+          ": %s",
+          level_type, metadata.fd.GetNumber(), s.ToString().c_str());
+      return s;
+    }
+
+    if (tp == nullptr) {
+      ROCKS_LOG_ERROR(db_options_.info_log,
+                      "Empty table property for %s level output file #%" PRIu64
+                      "",
+                      level_type, metadata.fd.GetNumber());
+
+      s = Status::Corruption("Empty table property for " +
+                             std::string(level_type) +
+                             " level output files during resuming");
+      return s;
+    }
+    output_files_table_properties.push_back(tp);
+  }
+  return s;
+}
+
+void CompactionJob::RestoreCompactionOutputs(
+    const ColumnFamilyData* cfd,
+    const std::vector<std::shared_ptr<const TableProperties>>&
+        output_files_table_properties,
+    SubcompactionProgressPerLevel& subcompaction_progress_per_level,
+    CompactionOutputs* outputs_to_restore) {
+  assert(outputs_to_restore->GetOutputs().size() == 0);
+
+  const auto& output_files = subcompaction_progress_per_level.GetOutputFiles();
+
+  for (size_t i = 0; i < output_files.size(); i++) {
+    FileMetaData file_copy = output_files[i];
+
+    outputs_to_restore->AddOutput(std::move(file_copy),
+                                  cfd->internal_comparator(),
+                                  paranoid_file_checks_, true /* finished */);
+
+    outputs_to_restore->UpdateTableProperties(
+        *output_files_table_properties[i]);
+  }
+
+  outputs_to_restore->SetNumOutputRecords(
+      subcompaction_progress_per_level.GetNumProcessedOutputRecords());
+}
+
+// Attempt to resume compaction from a previously persisted compaction progress.
+//
+// RETURNS:
+// - Status::OK():
+// * Input iterator positioned at next unprocessed key
+// * CompactionOutputs objects fully restored for both output and proximal
+// output levels in SubcompactionState
+// * Compaction job statistics accurately reflect input and output records
+// processed for record count verification
+// * File number generation advanced to prevent conflicts with existing outputs
+// - Status::NotFound(): No valid progress to resume from
+// - Status::Corruption(): Resume key is invalid, beyond input range, or output
+// restoration failed
+// - Other non-OK status: Iterator errors or file system issues during
+// restoration
+//
+// The caller must check for Status::IsIncomplete() to distinguish between
+// "no resume needed" (proceed with `InternalIterator::SeekToFirst()`) vs
+// "resume failed" scenarios.
+Status CompactionJob::MaybeResumeSubcompactionProgressOnInputIterator(
+    SubcompactionState* sub_compact, InternalIterator* input_iter) {
+  const ReadOptions read_options(Env::IOActivity::kCompaction);
+  ColumnFamilyData* cfd = sub_compact->compaction->column_family_data();
+  SubcompactionProgress& subcompaction_progress =
+      sub_compact->GetSubcompactionProgressRef();
+
+  if (subcompaction_progress.output_level_progress
+              .GetNumProcessedOutputRecords() == 0 &&
+      subcompaction_progress.proximal_output_level_progress
+              .GetNumProcessedOutputRecords() == 0) {
+    return Status::NotFound("No subcompaction progress to resume");
+  }
+
+  ROCKS_LOG_INFO(db_options_.info_log, "[%s] [JOB %d] Resuming compaction",
+                 cfd->GetName().c_str(), job_id_);
+
+  input_iter->Seek(subcompaction_progress.next_internal_key_to_compact);
+
+  if (!input_iter->Valid()) {
+    ROCKS_LOG_ERROR(db_options_.info_log,
+                    "[%s] [JOB %d] Iterator is invalid after "
+                    "seeking to the key to resume. This indicates the key is "
+                    "incorrectly beyond the input data range.",
+                    cfd->GetName().c_str(), job_id_);
+    return Status::Corruption(
+        "The key to resume is beyond the input data range");
+  } else if (!input_iter->status().ok()) {
+    ROCKS_LOG_ERROR(db_options_.info_log,
+                    "[%s] [JOB %d] Iterator has error after seeking to "
+                    "the key to resume: %s",
+                    cfd->GetName().c_str(), job_id_,
+                    input_iter->status().ToString().c_str());
+    return Status::Corruption(
+        "Iterator has error status after seeking to the key: " +
+        input_iter->status().ToString());
+  }
+
+  sub_compact->compaction_job_stats.has_accurate_num_input_records =
+      subcompaction_progress.num_processed_input_records != 0;
+
+  sub_compact->compaction_job_stats.num_input_records =
+      subcompaction_progress.num_processed_input_records;
+
+  for (const bool& is_proximal_level : {false, true}) {
+    if (is_proximal_level &&
+        !sub_compact->compaction->SupportsPerKeyPlacement()) {
+      continue;
+    }
+
+    Status s;
+    SubcompactionProgressPerLevel& subcompaction_progress_per_level =
+        is_proximal_level
+            ? subcompaction_progress.proximal_output_level_progress
+            : subcompaction_progress.output_level_progress;
+
+    const auto& output_files =
+        subcompaction_progress_per_level.GetOutputFiles();
+
+    std::vector<std::shared_ptr<const TableProperties>>
+        output_files_table_properties;
+
+    // TODO(hx235): investigate if we can skip reading properties to save read
+    // IO
+    s = ReadOutputFilesTableProperties(output_files, read_options,
+                                       output_files_table_properties);
+    if (!s.ok()) {
+      ROCKS_LOG_ERROR(
+          db_options_.info_log,
+          "[%s] [JOB %d] Failed to read table properties for %s output level"
+          "files "
+          "during resume: %s.",
+          cfd->GetName().c_str(), job_id_, is_proximal_level ? "proximal" : "",
+          s.ToString().c_str());
+      return Status::Corruption(
+          "Not able to resume due to table property reading error " +
+          s.ToString());
+    }
+
+    RestoreCompactionOutputs(cfd, output_files_table_properties,
+                             subcompaction_progress_per_level,
+                             sub_compact->Outputs(is_proximal_level));
+
+    // Skip past all the used file numbers to avoid creating new output files
+    // after resumption that conflict with the existing output files
+    for (const auto& file_meta : output_files) {
+      uint64_t file_number = file_meta.fd.GetNumber();
+      while (versions_->NewFileNumber() <= file_number) {
+        versions_->FetchAddFileNumber(1);
+      }
+    }
+  }
+
+  return Status::OK();
+}
+
+void CompactionJob::UpdateSubcompactionProgress(
+    const CompactionIterator* c_iter, const Slice next_table_min_key,
+    SubcompactionState* sub_compact) {
+  assert(c_iter);
+  SubcompactionProgress& subcompaction_progress =
+      sub_compact->GetSubcompactionProgressRef();
+
+  IterKey next_ikey_to_compact;
+  next_ikey_to_compact.SetInternalKey(ExtractUserKey(next_table_min_key),
+                                      kMaxSequenceNumber, kValueTypeForSeek);
+  subcompaction_progress.next_internal_key_to_compact =
+      next_ikey_to_compact.GetInternalKey().ToString();
+
+  subcompaction_progress.num_processed_input_records =
+      c_iter->HasNumInputEntryScanned() ? c_iter->NumInputEntryScanned() : 0;
+
+  UpdateSubcompactionProgressPerLevel(
+      sub_compact, false /* is_proximal_level */, subcompaction_progress);
+
+  if (sub_compact->compaction->SupportsPerKeyPlacement()) {
+    UpdateSubcompactionProgressPerLevel(
+        sub_compact, true /* is_proximal_level */, subcompaction_progress);
+  }
+}
+
+void CompactionJob::UpdateSubcompactionProgressPerLevel(
+    SubcompactionState* sub_compact, bool is_proximal_level,
+    SubcompactionProgress& subcompaction_progress) {
+  SubcompactionProgressPerLevel& subcompaction_progress_per_level =
+      is_proximal_level ? subcompaction_progress.proximal_output_level_progress
+                        : subcompaction_progress.output_level_progress;
+
+  subcompaction_progress_per_level.SetNumProcessedOutputRecords(
+      sub_compact->OutputStats(is_proximal_level)->num_output_records);
+
+  const auto& prev_output_files =
+      subcompaction_progress_per_level.GetOutputFiles();
+
+  const auto& current_output_files =
+      sub_compact->Outputs(is_proximal_level)->GetOutputs();
+
+  for (size_t i = prev_output_files.size(); i < current_output_files.size();
+       i++) {
+    subcompaction_progress_per_level.AddToOutputFiles(
+        current_output_files[i].meta);
+  }
+}
+
+Status CompactionJob::PersistSubcompactionProgress(
+    SubcompactionState* sub_compact) {
+  SubcompactionProgress& subcompaction_progress =
+      sub_compact->GetSubcompactionProgressRef();
+
+  assert(compaction_progress_writer_);
+
+  VersionEdit edit;
+  edit.SetSubcompactionProgress(subcompaction_progress);
+
+  std::string record;
+  if (!edit.EncodeTo(&record)) {
+    ROCKS_LOG_ERROR(
+        db_options_.info_log,
+        "[%s] [JOB %d] Failed to encode subcompaction "
+        "progress",
+        compact_->compaction->column_family_data()->GetName().c_str(), job_id_);
+    return Status::Corruption("Failed to encode subcompaction progress");
+  }
+
+  WriteOptions write_options(Env::IOActivity::kCompaction);
+  Status s = compaction_progress_writer_->AddRecord(write_options, record);
+  IOOptions opts;
+  if (s.ok()) {
+    s = WritableFileWriter::PrepareIOOptions(write_options, opts);
+  }
+  if (s.ok()) {
+    s = compaction_progress_writer_->file()->Sync(opts, db_options_.use_fsync);
+  }
+
+  if (!s.ok()) {
+    ROCKS_LOG_ERROR(
+        db_options_.info_log,
+        "[%s] [JOB %d] Failed to persist subcompaction "
+        "progress: %s",
+        compact_->compaction->column_family_data()->GetName().c_str(), job_id_,
+        s.ToString().c_str());
+    return s;
+  }
+
+  subcompaction_progress.output_level_progress
+      .UpdateLastPersistedOutputFilesCount();
+
+  subcompaction_progress.proximal_output_level_progress
+      .UpdateLastPersistedOutputFilesCount();
+
+  return Status::OK();
+}
+
 Status CompactionJob::VerifyInputRecordCount(
     uint64_t num_input_range_del) const {
   size_t ts_sz = compact_->compaction->column_family_data()
@@ -2578,5 +3019,4 @@ Status CompactionJob::VerifyOutputRecordCount() const {
   }
   return Status::OK();
 }
-
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/db/compaction/compaction_job.h b/db/compaction/compaction_job.h
index 87a9ccd11619..d2e3e4c5d3c3 100644
--- a/db/compaction/compaction_job.h
+++ b/db/compaction/compaction_job.h
@@ -176,9 +176,20 @@ class CompactionJob {
   // and organizing seqno <-> time info. `known_single_subcompact` is non-null
   // if we already have a known single subcompaction, with optional key bounds
   // (currently for executing a remote compaction).
+  //
+  // @param compaction_progress Previously saved compaction progress
+  //   to resume from. If empty, compaction starts fresh from the
+  //   beginning.
+  //
+  // @param compaction_progress_writer Writer for persisting
+  //   subcompaction progress periodically during compaction
+  //   execution. If nullptr, progress tracking is disabled and compaction
+  //   cannot be resumed later.
   void Prepare(
       std::optional<std::pair<std::optional<Slice>, std::optional<Slice>>>
-          known_single_subcompact);
+          known_single_subcompact,
+      const CompactionProgress& compaction_progress = CompactionProgress{},
+      log::Writer* compaction_progress_writer = nullptr);
 
   // REQUIRED mutex not held
   // Launch threads for each subcompaction and wait for them to finish. After
@@ -259,6 +270,10 @@ class CompactionJob {
   // consecutive groups such that each group has a similar size.
   void GenSubcompactionBoundaries();
 
+  void MaybeAssignCompactionProgressAndWriter(
+      const CompactionProgress& compaction_progress,
+      log::Writer* compaction_progress_writer);
+
   // Get the number of planned subcompactions based on max_subcompactions and
   // extra reserved resources
   uint64_t GetSubcompactionsLimit();
@@ -359,8 +374,9 @@ class CompactionJob {
       const CompactionFilter* configured_compaction_filter,
       const CompactionFilter*& compaction_filter,
       std::unique_ptr<CompactionFilter>& compaction_filter_from_factory);
-  void InitializeReadOptions(ColumnFamilyData* cfd, ReadOptions& read_options,
-                             SubcompactionKeyBoundaries& boundaries);
+  void InitializeReadOptionsAndBoundaries(
+      size_t ts_sz, ReadOptions& read_options,
+      SubcompactionKeyBoundaries& boundaries);
   InternalIterator* CreateInputIterator(
       SubcompactionState* sub_compact, ColumnFamilyData* cfd,
       SubcompactionInternalIterators& iterators,
@@ -411,12 +427,12 @@ class CompactionJob {
   // update the thread status for starting a compaction.
   void ReportStartedCompaction(Compaction* compaction);
 
-  Status FinishCompactionOutputFile(const Status& input_status,
-                                    SubcompactionState* sub_compact,
-                                    CompactionOutputs& outputs,
-                                    const Slice& next_table_min_key,
-                                    const Slice* comp_start_user_key,
-                                    const Slice* comp_end_user_key);
+  Status FinishCompactionOutputFile(
+      const Status& input_status,
+      const ParsedInternalKey& prev_table_last_internal_key,
+      const Slice& next_table_min_key, const Slice* comp_start_user_key,
+      const Slice* comp_end_user_key, const CompactionIterator* c_iter,
+      SubcompactionState* sub_compact, CompactionOutputs& outputs);
   Status InstallCompactionResults(bool* compaction_released);
   Status OpenCompactionOutputFile(SubcompactionState* sub_compact,
                                   CompactionOutputs& outputs);
@@ -493,6 +509,9 @@ class CompactionJob {
   // Setting this requires DBMutex.
   uint64_t options_file_number_ = 0;
 
+  // Writer for persisting compaction progress during compaction
+  log::Writer* compaction_progress_writer_ = nullptr;
+
   // Get table file name in where it's outputting to, which should also be in
   // `output_directory_`.
   virtual std::string GetTableFileName(uint64_t file_number);
@@ -500,6 +519,43 @@ class CompactionJob {
   // The Compaction Read and Write priorities are the same for different
   // scenarios, such as write stalled.
   Env::IOPriority GetRateLimiterPriority();
+
+  Status MaybeResumeSubcompactionProgressOnInputIterator(
+      SubcompactionState* sub_compact, InternalIterator* input_iter);
+
+  Status ReadOutputFilesTableProperties(
+      const autovector<FileMetaData>& temporary_output_file_allocation,
+      const ReadOptions& read_options,
+      std::vector<std::shared_ptr<const TableProperties>>&
+          output_files_table_properties,
+      bool is_proximal_level = false);
+
+  Status ReadTablePropertiesDirectly(
+      const ImmutableOptions& ioptions, const MutableCFOptions& moptions,
+      const FileMetaData* file_meta, const ReadOptions& read_options,
+      std::shared_ptr<const TableProperties>* tp);
+
+  void RestoreCompactionOutputs(
+      const ColumnFamilyData* cfd,
+      const std::vector<std::shared_ptr<const TableProperties>>&
+          output_files_table_properties,
+      SubcompactionProgressPerLevel& subcompaction_progress_per_level,
+      CompactionOutputs* outputs_to_restore);
+
+  bool ShouldUpdateSubcompactionProgress(
+      const SubcompactionState* sub_compact,
+      const ParsedInternalKey& prev_table_last_internal_key,
+      const Slice& next_table_min_internal_key, const FileMetaData* meta) const;
+
+  void UpdateSubcompactionProgress(const CompactionIterator* c_iter,
+                                   const Slice next_table_min_key,
+                                   SubcompactionState* sub_compact);
+
+  Status PersistSubcompactionProgress(SubcompactionState* sub_compact);
+
+  void UpdateSubcompactionProgressPerLevel(
+      SubcompactionState* sub_compact, bool is_proximal_level,
+      SubcompactionProgress& subcompaction_progress);
 };
 
 // CompactionServiceInput is used the pass compaction information between two
@@ -649,7 +705,9 @@ class CompactionServiceCompactionJob : private CompactionJob {
 
   // REQUIRED: mutex held
   // Like CompactionJob::Prepare()
-  void Prepare();
+  void Prepare(
+      const CompactionProgress& compaction_progress = CompactionProgress{},
+      log::Writer* compaction_progress_writer = nullptr);
 
   // Run the compaction in current thread and return the result
   Status Run();
diff --git a/db/compaction/compaction_job_test.cc b/db/compaction/compaction_job_test.cc
index c8178feb1b6a..6fb071f6d58a 100644
--- a/db/compaction/compaction_job_test.cc
+++ b/db/compaction/compaction_job_test.cc
@@ -17,6 +17,7 @@
 #include "db/db_impl/db_impl.h"
 #include "db/error_handler.h"
 #include "db/version_set.h"
+#include "file/filename.h"
 #include "file/random_access_file_reader.h"
 #include "file/writable_file_writer.h"
 #include "options/options_helper.h"
@@ -2409,6 +2410,378 @@ TEST_F(CompactionJobIOPriorityTest, GetRateLimiterPriority) {
                 Env::IO_LOW, Env::IO_LOW);
 }
 
+class ResumeCompactionJobTest : public CompactionJobTestBase {
+ public:
+  ResumeCompactionJobTest()
+      : CompactionJobTestBase(
+            test::PerThreadDBPath("resume_compaction_job_test"),
+            BytewiseComparator(), [](uint64_t /*ts*/) { return ""; },
+            /*test_io_priority=*/false, TableTypeForTest::kBlockBasedTable) {}
+
+ protected:
+  std::string progress_dir_ = "";
+  bool enable_cancel_ = false;
+  std::atomic<int> stop_count_{0};
+  std::atomic<bool> cancel_{false};
+
+  void SetUp() override {
+    CompactionJobTestBase::SetUp();
+    SyncPoint::GetInstance()->SetCallBack(
+        "CompactionOutputs::ShouldStopBefore::manual_decision",
+        [this](void* p) {
+          auto* pair = static_cast<std::pair<bool*, const Slice>*>(p);
+          *(pair->first) = true;
+          if (enable_cancel_ && stop_count_.fetch_add(1) == 3) {
+            cancel_.store(true);
+          }
+        });
+    SyncPoint::GetInstance()->EnableProcessing();
+  }
+
+  void TearDown() override {
+    SyncPoint::GetInstance()->DisableProcessing();
+    SyncPoint::GetInstance()->ClearAllCallBacks();
+
+    if (env_->FileExists(progress_dir_).ok()) {
+      std::vector<std::string> files;
+      EXPECT_OK(env_->GetChildren(progress_dir_, &files));
+      for (const auto& file : files) {
+        if (file != "." && file != "..") {
+          EXPECT_OK(env_->DeleteFile(progress_dir_ + "/" + file));
+        }
+      }
+      EXPECT_OK(env_->DeleteDir(progress_dir_));
+    }
+
+    CompactionJobTestBase::TearDown();
+  }
+
+  void NewDB() {
+    if (env_->FileExists(progress_dir_).ok()) {
+      std::vector<std::string> files;
+      EXPECT_OK(env_->GetChildren(progress_dir_, &files));
+      for (const auto& file : files) {
+        if (file != "." && file != "..") {
+          EXPECT_OK(env_->DeleteFile(progress_dir_ + "/" + file));
+        }
+      }
+      EXPECT_OK(env_->DeleteDir(progress_dir_));
+    }
+
+    CompactionJobTestBase::NewDB();
+
+    progress_dir_ = test::PerThreadDBPath("compaction_progress");
+    ASSERT_OK(env_->CreateDirIfMissing(progress_dir_));
+  }
+
+  void EnableCompactionCancel() { enable_cancel_ = true; }
+
+  void DisableCompactionCancel() {
+    enable_cancel_ = false;
+    cancel_.store(false);
+  }
+
+  std::unique_ptr<log::Writer> CreateCompactionProgressWriter(
+      const std::string& compaction_progress_file) {
+    std::unique_ptr<FSWritableFile> file;
+    EXPECT_OK(fs_->NewWritableFile(compaction_progress_file, FileOptions(),
+                                   &file, nullptr));
+    auto file_writer = std::make_unique<WritableFileWriter>(
+        std::move(file), compaction_progress_file, FileOptions());
+    auto compaction_progress_writer =
+        std::make_unique<log::Writer>(std::move(file_writer), 0, false);
+    return compaction_progress_writer;
+  }
+
+  Status RunCompactionWithProgressTracking(
+      const CompactionProgress& compaction_progress,
+      log::Writer* compaction_progress_writer,
+      std::vector<SequenceNumber> snapshots = {},
+      std::shared_ptr<Statistics> stats = nullptr) {
+    mutex_.Lock();
+
+    auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+    auto files = cfd->current()->storage_info()->LevelFiles(0);
+
+    db_options_.statistics = stats;
+    db_options_.stats = db_options_.statistics.get();
+
+    std::vector<CompactionInputFiles> compaction_input_files;
+    CompactionInputFiles level;
+    level.level = 0;
+    level.files = files;
+    compaction_input_files.push_back(level);
+
+    Compaction compaction(
+        cfd->current()->storage_info(), cfd->ioptions(),
+        cfd->GetLatestMutableCFOptions(), mutable_db_options_,
+        compaction_input_files, 1, mutable_cf_options_.target_file_size_base,
+        mutable_cf_options_.max_compaction_bytes, 0, kNoCompression,
+        cfd->GetLatestMutableCFOptions().compression_opts,
+        Temperature::kUnknown, 0, {}, std::nullopt, nullptr,
+        CompactionReason::kManualCompaction);
+    compaction.FinalizeInputInfo(cfd->current());
+
+    LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, db_options_.info_log.get());
+    EventLogger event_logger(db_options_.info_log.get());
+    JobContext job_context(1, false);
+    job_context.InitSnapshotContext(nullptr, nullptr, kMaxSequenceNumber,
+                                    std::move(snapshots));
+    CompactionJobStats job_stats;
+
+    CompactionJob compaction_job(
+        0, &compaction, db_options_, mutable_db_options_, env_options_,
+        versions_.get(), &shutting_down_, &log_buffer, nullptr, nullptr,
+        nullptr, stats.get(), &mutex_, &error_handler_, &job_context,
+        table_cache_, &event_logger, false, false, dbname_, &job_stats,
+        Env::Priority::USER, nullptr, cancel_, env_->GenerateUniqueId(),
+        DBImpl::GenerateDbSessionId(nullptr), "");
+
+    compaction_job.Prepare(std::nullopt, compaction_progress,
+                           compaction_progress_writer);
+    mutex_.Unlock();
+
+    compaction_job.Run().PermitUncheckedError();
+    EXPECT_OK(compaction_job.io_status());
+
+    mutex_.Lock();
+
+    bool compaction_released = false;
+    Status s = compaction_job.Install(&compaction_released);
+
+    mutex_.Unlock();
+    if (!compaction_released) {
+      compaction.ReleaseCompactionFiles(s);
+    }
+
+    return s;
+  }
+
+  SubcompactionProgress ReadAndParseProgress(
+      const std::string& compaction_progress_file) {
+    std::unique_ptr<FSSequentialFile> seq_file;
+    EXPECT_OK(fs_->NewSequentialFile(compaction_progress_file, FileOptions(),
+                                     &seq_file, nullptr));
+    auto file_reader = std::make_unique<SequentialFileReader>(
+        std::move(seq_file), compaction_progress_file, 0, nullptr);
+    log::Reader reader(nullptr, std::move(file_reader), nullptr, true, 0);
+
+    SubcompactionProgressBuilder builder;
+    std::string record;
+    Slice slice;
+
+    while (reader.ReadRecord(&slice, &record)) {
+      VersionEdit edit;
+      if (!edit.DecodeFrom(slice).ok()) continue;
+      builder.ProcessVersionEdit(edit);
+    }
+
+    EXPECT_TRUE(builder.HasAccumulatedSubcompactionProgress());
+
+    return builder.GetAccumulatedSubcompactionProgress();
+  }
+
+  // Test utility function to verify that compaction progress was correctly
+  // persisted to the progress file after compaction interruption.
+  //
+  // VERIFIES:
+  // - Progress file exists and has expected size (empty if no progress
+  // expected)
+  // - Next internal key to compact matches expected user key with proper format
+  // - Number of processed input records matches position in ordered input keys
+  // - Number of processed output records equals number of processed input
+  // records (by test design to simplify verification)
+  // - Each output file contains exactly one user key (by test design to
+  // simplify verification)
+  void VerifyCompactionProgressPersisted(
+      const std::string& compaction_progress_file,
+      const std::string& next_user_key_to_compact,
+      const std::vector<std::string>& ordered_intput_keys) {
+    ASSERT_OK(env_->FileExists(compaction_progress_file));
+
+    uint64_t file_size;
+    ASSERT_OK(env_->GetFileSize(compaction_progress_file, &file_size));
+
+    if (next_user_key_to_compact.empty()) {
+      ASSERT_EQ(file_size, 0);
+      return;
+    }
+
+    const auto& subcompaction_progress =
+        ReadAndParseProgress(compaction_progress_file);
+
+    ASSERT_FALSE(subcompaction_progress.next_internal_key_to_compact.empty());
+    ParsedInternalKey parsed_next_key;
+    ASSERT_OK(
+        ParseInternalKey(subcompaction_progress.next_internal_key_to_compact,
+                         &parsed_next_key, true /* log_err_key */));
+    ASSERT_EQ(parsed_next_key.user_key, next_user_key_to_compact);
+    ASSERT_EQ(parsed_next_key.sequence, kMaxSequenceNumber);
+    ASSERT_EQ(parsed_next_key.type, kValueTypeForSeek);
+
+    auto it = std::find(ordered_intput_keys.begin(), ordered_intput_keys.end(),
+                        next_user_key_to_compact);
+    ASSERT_TRUE(it != ordered_intput_keys.end());
+
+    auto next_key_index = std::distance(ordered_intput_keys.begin(), it);
+
+    ASSERT_EQ(subcompaction_progress.num_processed_input_records,
+              next_key_index);
+
+    ASSERT_EQ(subcompaction_progress.output_level_progress
+                  .GetNumProcessedOutputRecords(),
+              next_key_index);
+
+    ASSERT_EQ(
+        subcompaction_progress.output_level_progress.GetOutputFiles().size(),
+
+        next_key_index);
+
+    for (size_t i = 0;
+         i <
+         subcompaction_progress.output_level_progress.GetOutputFiles().size();
+         ++i) {
+      const auto& output_file =
+          subcompaction_progress.output_level_progress.GetOutputFiles()[i];
+      ASSERT_EQ(output_file.smallest.user_key().ToString(),
+                output_file.largest.user_key().ToString());
+      ASSERT_EQ(output_file.largest.user_key().ToString(),
+                ordered_intput_keys[i]);
+    }
+  }
+};
+
+TEST_F(ResumeCompactionJobTest, BasicProgressPersistence) {
+  NewDB();
+
+  auto file1 = mock::MakeMockFile({
+      {KeyStr("a", 1U, kTypeValue), "val1"},
+      {KeyStr("b", 2U, kTypeValue), "val2"},
+  });
+  AddMockFile(file1);
+
+  auto file2 = mock::MakeMockFile({
+      {KeyStr("c", 3U, kTypeValue), "val3"},
+      {KeyStr("d", 4U, kTypeValue), "val4"},
+  });
+  AddMockFile(file2);
+
+  SetLastSequence(4U);
+
+  std::string compaction_progress_file =
+      CompactionProgressFileName(progress_dir_, 123);
+
+  std::unique_ptr<log::Writer> compaction_progress_writer =
+      CreateCompactionProgressWriter(compaction_progress_file);
+
+  Status status = RunCompactionWithProgressTracking(
+      CompactionProgress(), compaction_progress_writer.get());
+
+  ASSERT_OK(status);
+
+  VerifyCompactionProgressPersisted(
+      compaction_progress_file, "d" /* next_user_key_to_compact */,
+      {"a", "b", "c", "d"} /* ordered_intput_keys */);
+}
+
+TEST_F(ResumeCompactionJobTest, CondtionallySkipProgressPersistence) {
+  for (auto type : {kTypeValue, kTypeRangeDeletion}) {
+    NewDB();
+
+    auto file1 = mock::MakeMockFile({
+        {KeyStr("a", 1U, kTypeValue), "val1"},
+    });
+    AddMockFile(file1);
+
+    auto file2 =
+        (type == kTypeValue ? mock::MakeMockFile({
+                                  {KeyStr("a", 2U, kTypeValue), "val2"},
+                              }) /* same user keys spanning the file boundary */
+                            : mock::MakeMockFile({
+                                  {KeyStr("b", 2U, kTypeRangeDeletion), "val2"},
+                              })); /* deletion range in the file boundary */
+    AddMockFile(file2);
+    SetLastSequence(2U);
+
+    std::string compaction_progress_file =
+        CompactionProgressFileName(progress_dir_, 123);
+    std::unique_ptr<log::Writer> compaction_progress_writer =
+        CreateCompactionProgressWriter(compaction_progress_file);
+
+    Status status = RunCompactionWithProgressTracking(
+        CompactionProgress{}, compaction_progress_writer.get(),
+        {1U} /* snapshots */);
+
+    ASSERT_OK(status);
+
+    VerifyCompactionProgressPersisted(compaction_progress_file,
+                                      "" /* next_user_key_to_compact */,
+                                      {"a", "b"} /* ordered_intput_keys */);
+  }
+}
+
+TEST_F(ResumeCompactionJobTest, BasicProgressResume) {
+  std::shared_ptr<Statistics> stats = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  NewDB();
+
+  auto file1 = mock::MakeMockFile({
+      {KeyStr("a", 1U, kTypeValue), "val1"},
+      {KeyStr("b", 2U, kTypeValue), "val2"},
+  });
+  AddMockFile(file1);
+
+  auto file2 = mock::MakeMockFile({
+      {KeyStr("c", 3U, kTypeValue), "val3"},
+      {KeyStr("d", 4U, kTypeValue), "val4"},
+  });
+  AddMockFile(file2);
+  SetLastSequence(4U);
+
+  std::string compaction_progress_file =
+      CompactionProgressFileName(progress_dir_, 123);
+  std::unique_ptr<log::Writer> compaction_progress_writer =
+      CreateCompactionProgressWriter(compaction_progress_file);
+
+  ASSERT_OK(stats->Reset());
+
+  EnableCompactionCancel();
+
+  Status status = RunCompactionWithProgressTracking(
+      CompactionProgress{}, compaction_progress_writer.get(), {} /* snapshots*/,
+      stats);
+
+  ASSERT_TRUE(status.IsManualCompactionPaused());
+
+  DisableCompactionCancel();
+
+  HistogramData cancelled_compaction_stats;
+  stats->histogramData(FILE_WRITE_COMPACTION_MICROS,
+                       &cancelled_compaction_stats);
+
+  VerifyCompactionProgressPersisted(
+      compaction_progress_file, "d" /* next_user_key_to_compact */,
+      {"a", "b", "c", "d"} /* ordered_intput_keys */);
+
+  CompactionProgress compaction_progress;
+  compaction_progress.push_back(ReadAndParseProgress(compaction_progress_file));
+
+  std::string compaction_progress_file_2 =
+      CompactionProgressFileName(progress_dir_, 234);
+  std::unique_ptr<log::Writer> compaction_progress_writer_2 =
+      CreateCompactionProgressWriter(compaction_progress_file_2);
+
+  ASSERT_OK(stats->Reset());
+
+  status = RunCompactionWithProgressTracking(compaction_progress,
+                                             compaction_progress_writer_2.get(),
+                                             {} /* snapshots */, stats);
+
+  HistogramData resumed_compaction_stats;
+  stats->histogramData(FILE_WRITE_COMPACTION_MICROS, &resumed_compaction_stats);
+
+  ASSERT_OK(status);
+  ASSERT_LT(resumed_compaction_stats.count, cancelled_compaction_stats.count);
+}
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/db/compaction/compaction_outputs.cc b/db/compaction/compaction_outputs.cc
index 5351e7d33edf..67ea73567ae8 100644
--- a/db/compaction/compaction_outputs.cc
+++ b/db/compaction/compaction_outputs.cc
@@ -359,7 +359,8 @@ bool CompactionOutputs::ShouldStopBefore(const CompactionIterator& c_iter) {
 Status CompactionOutputs::AddToOutput(
     const CompactionIterator& c_iter,
     const CompactionFileOpenFunc& open_file_func,
-    const CompactionFileCloseFunc& close_file_func) {
+    const CompactionFileCloseFunc& close_file_func,
+    const ParsedInternalKey& prev_table_last_internal_key) {
   Status s;
   bool is_range_del = c_iter.IsDeleteRangeSentinelKey();
   if (is_range_del && compaction_->bottommost_level()) {
@@ -370,7 +371,8 @@ Status CompactionOutputs::AddToOutput(
   }
   const Slice& key = c_iter.key();
   if (ShouldStopBefore(c_iter) && HasBuilder()) {
-    s = close_file_func(*this, c_iter.InputStatus(), key);
+    s = close_file_func(c_iter.InputStatus(), prev_table_last_internal_key, key,
+                        &c_iter, *this);
     if (!s.ok()) {
       return s;
     }
diff --git a/db/compaction/compaction_outputs.h b/db/compaction/compaction_outputs.h
index ed7b8a3cdea4..d2f94a5c50da 100644
--- a/db/compaction/compaction_outputs.h
+++ b/db/compaction/compaction_outputs.h
@@ -21,7 +21,8 @@ namespace ROCKSDB_NAMESPACE {
 class CompactionOutputs;
 using CompactionFileOpenFunc = std::function<Status(CompactionOutputs&)>;
 using CompactionFileCloseFunc =
-    std::function<Status(CompactionOutputs&, const Status&, const Slice&)>;
+    std::function<Status(const Status&, const ParsedInternalKey&, const Slice&,
+                         const CompactionIterator*, CompactionOutputs&)>;
 
 // Files produced by subcompaction, most of the functions are used by
 // compaction_job Open/Close compaction file functions.
@@ -58,6 +59,8 @@ class CompactionOutputs {
                           precalculated_hash, is_proximal_level_);
   }
 
+  const std::vector<Output>& GetOutputs() const { return outputs_; }
+
   // Set new table builder for the current output
   void NewBuilder(const TableBuilderOptions& tboptions);
 
@@ -195,6 +198,10 @@ class CompactionOutputs {
       std::pair<SequenceNumber, SequenceNumber> keep_seqno_range,
       const Slice& next_table_min_key, const std::string& full_history_ts_low);
 
+  void SetNumOutputRecords(uint64_t num_output_records) {
+    stats_.num_output_records = num_output_records;
+  }
+
  private:
   friend class SubcompactionState;
 
@@ -254,7 +261,8 @@ class CompactionOutputs {
   // close and open new compaction output with the functions provided.
   Status AddToOutput(const CompactionIterator& c_iter,
                      const CompactionFileOpenFunc& open_file_func,
-                     const CompactionFileCloseFunc& close_file_func);
+                     const CompactionFileCloseFunc& close_file_func,
+                     const ParsedInternalKey& prev_table_last_internal_key);
 
   // Close the current output. `open_file_func` is needed for creating new file
   // for range-dels only output file.
@@ -270,9 +278,12 @@ class CompactionOutputs {
         !range_del_agg->IsEmpty()) {
       status = open_file_func(*this);
     }
+
     if (HasBuilder()) {
+      const ParsedInternalKey empty_internal_key{};
       const Slice empty_key{};
-      Status s = close_file_func(*this, status, empty_key);
+      Status s = close_file_func(status, empty_internal_key, empty_key,
+                                 nullptr /* c_iter */, *this);
       if (!s.ok() && status.ok()) {
         status = s;
       }
diff --git a/db/compaction/compaction_service_job.cc b/db/compaction/compaction_service_job.cc
index fc21cb127025..8355a9be9682 100644
--- a/db/compaction/compaction_service_job.cc
+++ b/db/compaction/compaction_service_job.cc
@@ -326,7 +326,9 @@ CompactionServiceCompactionJob::CompactionServiceCompactionJob(
       compaction_input_(compaction_service_input),
       compaction_result_(compaction_service_result) {}
 
-void CompactionServiceCompactionJob::Prepare() {
+void CompactionServiceCompactionJob::Prepare(
+    const CompactionProgress& compaction_progress,
+    log::Writer* compaction_progress_writer) {
   std::optional<Slice> begin;
   if (compaction_input_.has_begin) {
     begin = compaction_input_.begin;
@@ -335,7 +337,8 @@ void CompactionServiceCompactionJob::Prepare() {
   if (compaction_input_.has_end) {
     end = compaction_input_.end;
   }
-  CompactionJob::Prepare(std::make_pair(begin, end));
+  CompactionJob::Prepare(std::make_pair(begin, end), compaction_progress,
+                         compaction_progress_writer);
 }
 
 Status CompactionServiceCompactionJob::Run() {
diff --git a/db/compaction/subcompaction_state.cc b/db/compaction/subcompaction_state.cc
index 6aab80445647..910c0bff7f03 100644
--- a/db/compaction/subcompaction_state.cc
+++ b/db/compaction/subcompaction_state.cc
@@ -108,11 +108,13 @@ Slice SubcompactionState::LargestUserKey() const {
 Status SubcompactionState::AddToOutput(
     const CompactionIterator& iter, bool use_proximal_output,
     const CompactionFileOpenFunc& open_file_func,
-    const CompactionFileCloseFunc& close_file_func) {
+    const CompactionFileCloseFunc& close_file_func,
+    const ParsedInternalKey& prev_table_last_internal_key) {
   // update target output
   current_outputs_ =
       use_proximal_output ? &proximal_level_outputs_ : &compaction_outputs_;
-  return current_outputs_->AddToOutput(iter, open_file_func, close_file_func);
+  return current_outputs_->AddToOutput(iter, open_file_func, close_file_func,
+                                       prev_table_last_internal_key);
 }
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/db/compaction/subcompaction_state.h b/db/compaction/subcompaction_state.h
index 3f417b97eaa9..944841f75ba3 100644
--- a/db/compaction/subcompaction_state.h
+++ b/db/compaction/subcompaction_state.h
@@ -208,10 +208,20 @@ class SubcompactionState {
     return range_del_agg_ && !range_del_agg_->IsEmpty();
   }
 
+  void SetSubcompactionProgress(
+      const SubcompactionProgress& subcompaction_progress) {
+    subcompaction_progress_ = subcompaction_progress;
+  }
+
+  SubcompactionProgress& GetSubcompactionProgressRef() {
+    return subcompaction_progress_;
+  }
+
   // Add compaction_iterator key/value to the `Current` output group.
   Status AddToOutput(const CompactionIterator& iter, bool use_proximal_output,
                      const CompactionFileOpenFunc& open_file_func,
-                     const CompactionFileCloseFunc& close_file_func);
+                     const CompactionFileCloseFunc& close_file_func,
+                     const ParsedInternalKey& prev_table_last_internal_key);
 
   // Close all compaction output files, both output_to_proximal_level outputs
   // and normal outputs.
@@ -241,6 +251,8 @@ class SubcompactionState {
   CompactionOutputs proximal_level_outputs_;
   CompactionOutputs* current_outputs_ = &compaction_outputs_;
   std::unique_ptr<CompactionRangeDelAggregator> range_del_agg_;
+
+  SubcompactionProgress subcompaction_progress_;
 };
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/db/db_impl/db_impl_files.cc b/db/db_impl/db_impl_files.cc
index e2dc53e7d4ab..445f7338d1f7 100644
--- a/db/db_impl/db_impl_files.cc
+++ b/db/db_impl/db_impl_files.cc
@@ -614,6 +614,11 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) {
       case kOptionsFile:
         keep = (number >= optsfile_num2);
         break;
+      case kCompactionProgressFile:
+        // Keep compaction progress files - they are managed
+        // separately by DBImplSecondary for now
+        keep = true;
+        break;
       case kCurrentFile:
       case kDBLockFile:
       case kIdentityFile:
diff --git a/db/version_edit.cc b/db/version_edit.cc
index f76706fd7f52..822dedb54d0c 100644
--- a/db/version_edit.cc
+++ b/db/version_edit.cc
@@ -1144,11 +1144,6 @@ void SubcompactionProgressPerLevel::EncodeTo(std::string* dst) const {
     std::string files_data;
     EncodeOutputFiles(&files_data);
     PutLengthPrefixedSlice(dst, files_data);
-  } else if (!temp_output_files_allocation_.empty()) {
-    PutVarint32(dst, SubcompactionProgressPerLevelCustomTag::kOutputFilesDelta);
-    std::string files_data;
-    EncodeTemporaryOutputFilesAllocation(&files_data);
-    PutLengthPrefixedSlice(dst, files_data);
   }
 
   PutVarint32(dst, SubcompactionProgressPerLevelCustomTag::
@@ -1185,7 +1180,7 @@ Status SubcompactionProgressPerLevel::DecodeFrom(Slice* input) {
       }
 
       case SubcompactionProgressPerLevelCustomTag::kOutputFilesDelta: {
-        Status s = DecodeOutputFiles(&field, temp_output_files_allocation_);
+        Status s = DecodeOutputFiles(&field, output_files_);
         if (!s.ok()) {
           return s;
         }
@@ -1220,60 +1215,29 @@ void SubcompactionProgressPerLevel::EncodeOutputFiles(std::string* dst) const {
 
   for (size_t i = last_persisted_output_files_count_; i < output_files_.size();
        ++i) {
-    const FileMetaData* file_ptr = output_files_[i];
-    assert(file_ptr != nullptr);
-
-    std::string file_dst;
-    bool ignored_min_log_written = false;
-
-    VersionEdit::EncodeToNewFile4(*file_ptr, -1 /* level */, 0 /* ts_sz */,
-                                  false /* has_min_log_number_to_keep */,
-                                  0 /* min_log_number_to_keep */,
-                                  ignored_min_log_written, &file_dst);
-
-    PutLengthPrefixedSlice(dst, file_dst);
-  }
-}
-
-void SubcompactionProgressPerLevel::EncodeTemporaryOutputFilesAllocation(
-    std::string* dst) const {
-  size_t new_files_count =
-      temp_output_files_allocation_.size() > last_persisted_output_files_count_
-          ? temp_output_files_allocation_.size() -
-                last_persisted_output_files_count_
-          : 0;
-
-  assert(new_files_count > 0);
-
-  PutVarint32(dst, static_cast<uint32_t>(new_files_count));
-
-  for (size_t i = last_persisted_output_files_count_;
-       i < temp_output_files_allocation_.size(); ++i) {
-    const FileMetaData& file = temp_output_files_allocation_[i];
-
     std::string file_dst;
     bool ignored_min_log_written = false;
 
-    VersionEdit::EncodeToNewFile4(file, -1 /* level */, 0 /* ts_sz */,
-                                  false /* has_min_log_number_to_keep */,
-                                  0 /* min_log_number_to_keep */,
-                                  ignored_min_log_written, &file_dst);
+    VersionEdit::EncodeToNewFile4(
+        output_files_[i], -1 /* level */, 0 /* ts_sz */,
+        false /* has_min_log_number_to_keep */, 0 /* min_log_number_to_keep */,
+        ignored_min_log_written, &file_dst);
 
     PutLengthPrefixedSlice(dst, file_dst);
   }
 }
 
 Status SubcompactionProgressPerLevel::DecodeOutputFiles(
-    Slice* input, autovector<FileMetaData>& temporary_output_files_allocation) {
+    Slice* input, autovector<FileMetaData>& output_files) {
   uint32_t new_file_count = 0;
   if (!GetVarint32(input, &new_file_count)) {
     return Status::Corruption("SubcompactionProgressPerLevel",
                               "new output file count");
   }
 
-  assert(temporary_output_files_allocation.size() == 0);
+  assert(output_files.size() == 0);
 
-  temporary_output_files_allocation.reserve(new_file_count);
+  output_files.reserve(new_file_count);
 
   for (uint32_t i = 0; i < new_file_count; ++i) {
     Slice file_input;
@@ -1302,7 +1266,7 @@ Status SubcompactionProgressPerLevel::DecodeOutputFiles(
       return Status::Corruption("SubcompactionProgressPerLevel", err);
     }
 
-    temporary_output_files_allocation.push_back(std::move(file));
+    output_files.push_back(std::move(file));
   }
 
   return Status::OK();
@@ -1314,12 +1278,10 @@ void SubcompactionProgress::EncodeTo(std::string* dst) const {
     PutLengthPrefixedSlice(dst, next_internal_key_to_compact);
   }
 
-  if (num_processed_input_records > 0) {
-    PutVarint32(dst, SubcompactionProgressCustomTag::kNumProcessedInputRecords);
-    std::string varint_records;
-    PutVarint64(&varint_records, num_processed_input_records);
-    PutLengthPrefixedSlice(dst, varint_records);
-  }
+  PutVarint32(dst, SubcompactionProgressCustomTag::kNumProcessedInputRecords);
+  std::string varint_records;
+  PutVarint64(&varint_records, num_processed_input_records);
+  PutLengthPrefixedSlice(dst, varint_records);
 
   if (output_level_progress.GetOutputFiles().size() >
       output_level_progress.GetLastPersistedOutputFilesCount()) {
@@ -1439,27 +1401,16 @@ void SubcompactionProgressBuilder::MergeDeltaProgress(
 void SubcompactionProgressBuilder::MaybeMergeDeltaProgressPerLevel(
     SubcompactionProgressPerLevel& accumulated_level_progress,
     const SubcompactionProgressPerLevel& delta_level_progress) {
-  assert(delta_level_progress.GetOutputFiles().empty());
-
-  if (delta_level_progress.GetTempOutputFilesAllocation().empty()) {
+  const auto& delta_files = delta_level_progress.GetOutputFiles();
+  if (delta_files.empty()) {
     return;
   }
+  for (const FileMetaData& file : delta_files) {
+    accumulated_level_progress.AddToOutputFiles(file);  // Stored as copy
+  }
 
   accumulated_level_progress.SetNumProcessedOutputRecords(
       delta_level_progress.GetNumProcessedOutputRecords());
-
-  auto& accumulated_temp_files =
-      accumulated_level_progress.TempOutputFilesAllocation();
-
-  const auto& delta_temp_files =
-      delta_level_progress.GetTempOutputFilesAllocation();
-
-  accumulated_temp_files.reserve(accumulated_temp_files.size() +
-                                 delta_temp_files.size());
-
-  for (const auto& file_allocation : delta_temp_files) {
-    accumulated_temp_files.push_back(file_allocation);
-  }
 }
 
 void SubcompactionProgressBuilder::Clear() {
diff --git a/db/version_edit.h b/db/version_edit.h
index 83b04e577510..9d7d11265694 100644
--- a/db/version_edit.h
+++ b/db/version_edit.h
@@ -466,29 +466,20 @@ struct SubcompactionProgressPerLevel {
     num_processed_output_records_ = num;
   }
 
-  const autovector<const FileMetaData*>& GetOutputFiles() const {
+  const autovector<FileMetaData>& GetOutputFiles() const {
     return output_files_;
   }
 
-  void AddToOutputFiles(const FileMetaData* file) {
+  void AddToOutputFiles(const FileMetaData& file) {
     output_files_.push_back(file);
   }
 
-  const autovector<FileMetaData>& GetTempOutputFilesAllocation() const {
-    return temp_output_files_allocation_;
-  }
-
-  autovector<FileMetaData>& TempOutputFilesAllocation() {
-    return temp_output_files_allocation_;
-  }
-
   size_t GetLastPersistedOutputFilesCount() const {
     return last_persisted_output_files_count_;
   }
 
   void UpdateLastPersistedOutputFilesCount() {
-    last_persisted_output_files_count_ =
-        std::max(output_files_.size(), temp_output_files_allocation_.size());
+    last_persisted_output_files_count_ = output_files_.size();
   }
 
   void EncodeTo(std::string* dst) const;
@@ -498,7 +489,6 @@ struct SubcompactionProgressPerLevel {
   void Clear() {
     num_processed_output_records_ = 0;
     output_files_.clear();
-    temp_output_files_allocation_.clear();
     last_persisted_output_files_count_ = 0;
   }
 
@@ -507,8 +497,6 @@ struct SubcompactionProgressPerLevel {
     oss << "SubcompactionProgressPerLevel{";
     oss << " num_processed_output_records=" << num_processed_output_records_;
     oss << ", output_files_count=" << output_files_.size();
-    oss << ", temp_output_files_allocation_count="
-        << temp_output_files_allocation_.size();
     oss << ", last_persisted_output_files_count="
         << last_persisted_output_files_count_;
     oss << " }";
@@ -520,15 +508,7 @@ struct SubcompactionProgressPerLevel {
  private:
   uint64_t num_processed_output_records_ = 0;
 
-  // These pointers ONLY point to FileMetaData objects owned by compaction
-  // outputs. They are NEVER set to point to objects in
-  // `temp_output_files_allocation` This ensures stable pointers that don't get
-  // invalidated by copy/move operations on `SubcompactionProgress`
-  autovector<const FileMetaData*> output_files_ = {};
-
-  // These are ONLY used during deserialization from VersionEdit.
-  // They provide temporary storage before being moved to compaction outputs.
-  autovector<FileMetaData> temp_output_files_allocation_ = {};
+  autovector<FileMetaData> output_files_ = {};
 
   // Number of files already persisted to help calculate the new output files to
   // persist in the future. This is to prevent having to persist all the output
@@ -540,16 +520,11 @@ struct SubcompactionProgressPerLevel {
 
   void EncodeOutputFiles(std::string* dst) const;
 
-  void EncodeTemporaryOutputFilesAllocation(std::string* dst) const;
-
   Status DecodeOutputFiles(Slice* input,
                            autovector<FileMetaData>& temp_storage);
 };
 
 struct SubcompactionProgress {
-  static constexpr uint64_t kInaccurateNumProcessedInputRecords =
-      std::numeric_limits<uint64_t>::max();
-
   std::string next_internal_key_to_compact;
 
   uint64_t num_processed_input_records = 0;
diff --git a/db/version_edit_test.cc b/db/version_edit_test.cc
index 6b1df759a266..d5f6beee93cc 100644
--- a/db/version_edit_test.cc
+++ b/db/version_edit_test.cc
@@ -825,27 +825,6 @@ class SubcompactionProgressTest : public VersionEditTest {
   std::vector<FileMetaData> compaction_output_files_;
   std::vector<FileMetaData> proximal_level_compaction_output_files_;
 
-  void SetupOutputFilePointers(
-      SubcompactionProgress& progress,
-      const std::vector<FileMetaData>& compaction_output_files,
-      const std::vector<FileMetaData>& proximal_level_compaction_output_files) {
-    if (!compaction_output_files.empty()) {
-      progress.output_level_progress.TEST_ClearOutputFiles();
-    }
-
-    for (const auto& file : compaction_output_files) {
-      progress.output_level_progress.AddToOutputFiles(&file);
-    }
-
-    if (!proximal_level_compaction_output_files.empty()) {
-      progress.proximal_output_level_progress.TEST_ClearOutputFiles();
-    }
-
-    for (const auto& file : proximal_level_compaction_output_files) {
-      progress.proximal_output_level_progress.AddToOutputFiles(&file);
-    }
-  }
-
   SubcompactionProgress CreateSubcompactionProgress(
       const std::string& next_key, uint64_t num_processed_input_records,
       uint64_t num_processed_output_records,
@@ -862,17 +841,15 @@ class SubcompactionProgressTest : public VersionEditTest {
         num_processed_proximal_level_output_records);
 
     for (uint64_t file_num : output_file_numbers) {
-      compaction_output_files_.push_back(
-          CreateTestFile(file_num, file_prefix + "output_"));
+      FileMetaData file = CreateTestFile(file_num, file_prefix + "output_");
+      progress.output_level_progress.AddToOutputFiles(file);
     }
+
     for (uint64_t file_num : proximal_file_numbers) {
-      proximal_level_compaction_output_files_.push_back(
-          CreateTestFile(file_num, file_prefix + "proximal_"));
+      FileMetaData file = CreateTestFile(file_num, file_prefix + "proximal_");
+      progress.proximal_output_level_progress.AddToOutputFiles(file);
     }
 
-    SetupOutputFilePointers(progress, compaction_output_files_,
-                            proximal_level_compaction_output_files_);
-
     return progress;
   }
 
@@ -935,17 +912,15 @@ class SubcompactionProgressTest : public VersionEditTest {
               .GetNumProcessedOutputRecords());
 
       ASSERT_EQ(
-          actual_subcompaction_progress_by_level.GetTempOutputFilesAllocation()
-              .size(),
+          actual_subcompaction_progress_by_level.GetOutputFiles().size(),
           expected_subcompaction_progress_by_level.GetOutputFiles().size());
 
       for (size_t i = 0;
            i < expected_subcompaction_progress_by_level.GetOutputFiles().size();
            ++i) {
         VerifyFileMetaDataEquality(
-            *expected_subcompaction_progress_by_level.GetOutputFiles()[i],
-            actual_subcompaction_progress_by_level
-                .GetTempOutputFilesAllocation()[i]);
+            expected_subcompaction_progress_by_level.GetOutputFiles()[i],
+            actual_subcompaction_progress_by_level.GetOutputFiles()[i]);
       }
     }
   }
@@ -992,18 +967,16 @@ TEST_F(SubcompactionProgressTest, OutputFilesDeltaEncodeDecode) {
   SubcompactionProgress updated_progress = initial_progress;
   updated_progress.next_internal_key_to_compact = "key_300";
   updated_progress.num_processed_input_records = 1000;
-
   updated_progress.output_level_progress.SetNumProcessedOutputRecords(400);
-  FileMetaData new_file = CreateTestFile(3, "new_");
-  compaction_output_files_.push_back(new_file);
-
   updated_progress.proximal_output_level_progress.SetNumProcessedOutputRecords(
       600);
-  FileMetaData new_file_proximal = CreateTestFile(4, "new_");
-  proximal_level_compaction_output_files_.push_back(new_file_proximal);
 
-  SetupOutputFilePointers(updated_progress, compaction_output_files_,
-                          proximal_level_compaction_output_files_);
+  FileMetaData new_file = CreateTestFile(3, "new_");
+  updated_progress.output_level_progress.AddToOutputFiles(new_file);
+
+  FileMetaData new_file_proximal = CreateTestFile(4, "new_");
+  updated_progress.proximal_output_level_progress.AddToOutputFiles(
+      new_file_proximal);
 
   auto [delta_decoded_edit, delta_decoded_progress] =
       EncodeDecodeProgress(updated_progress);
@@ -1027,13 +1000,10 @@ TEST_F(SubcompactionProgressTest, OutputFilesDeltaEncodeDecode) {
     ASSERT_EQ(delta_progress_per_level.GetNumProcessedOutputRecords(),
               updated_progress_per_level.GetNumProcessedOutputRecords());
 
-    // Delta encoding: only the one newly added file is present, not the
-    // previously persisted file
-    ASSERT_EQ(delta_progress_per_level.GetTempOutputFilesAllocation().size(),
-              1);
+    // Only the newly added file since last persistence should be present
+    ASSERT_EQ(delta_progress_per_level.GetOutputFiles().size(), 1);
 
-    ASSERT_EQ(delta_progress_per_level.GetTempOutputFilesAllocation()[0]
-                  .fd.GetNumber(),
+    ASSERT_EQ(delta_progress_per_level.GetOutputFiles()[0].fd.GetNumber(),
               is_proximal_level ? new_file_proximal.fd.GetNumber()
                                 : new_file.fd.GetNumber());
   }
@@ -1057,31 +1027,27 @@ TEST_F(SubcompactionProgressTest, OutputFilesDeltaEncodeDecode) {
 
   for (const bool& is_proximal_level : {false, true}) {
     const SubcompactionProgressPerLevel& accumulated_progress_per_level =
-        is_proximal_level ? accumulated_progress.output_level_progress
-                          : accumulated_progress.proximal_output_level_progress;
+        is_proximal_level ? accumulated_progress.proximal_output_level_progress
+                          : accumulated_progress.output_level_progress;
 
     const SubcompactionProgressPerLevel& updated_progress_per_level =
-        is_proximal_level ? updated_progress.output_level_progress
-                          : updated_progress.proximal_output_level_progress;
+        is_proximal_level ? updated_progress.proximal_output_level_progress
+                          : updated_progress.output_level_progress;
 
     ASSERT_EQ(accumulated_progress_per_level.GetNumProcessedOutputRecords(),
               updated_progress_per_level.GetNumProcessedOutputRecords());
 
-    ASSERT_EQ(
-        accumulated_progress_per_level.GetTempOutputFilesAllocation().size(),
-        updated_progress_per_level.GetOutputFiles().size());
+    ASSERT_EQ(accumulated_progress_per_level.GetOutputFiles().size(),
+              updated_progress_per_level.GetOutputFiles().size());
 
     std::set<uint64_t> accumulated_file_numbers;
-
-    for (const auto& file :
-         accumulated_progress_per_level.GetTempOutputFilesAllocation()) {
+    for (const auto& file : accumulated_progress_per_level.GetOutputFiles()) {
       accumulated_file_numbers.insert(file.fd.GetNumber());
     }
 
     std::set<uint64_t> expected_file_numbers;
-
     for (const auto& file : updated_progress_per_level.GetOutputFiles()) {
-      expected_file_numbers.insert(file->fd.GetNumber());
+      expected_file_numbers.insert(file.fd.GetNumber());
     }
 
     ASSERT_EQ(accumulated_file_numbers, expected_file_numbers);
diff --git a/file/filename.cc b/file/filename.cc
index 45cbf9d76a98..d1d9c815a440 100644
--- a/file/filename.cc
+++ b/file/filename.cc
@@ -25,6 +25,7 @@ namespace ROCKSDB_NAMESPACE {
 
 const std::string kCurrentFileName = "CURRENT";
 const std::string kOptionsFileNamePrefix = "OPTIONS-";
+const std::string kCompactionProgressFileNamePrefix = "COMPACTION_PROGRESS-";
 const std::string kTempFileNameSuffix = "dbtmp";
 
 static const std::string kRocksDbTFileExt = "sst";
@@ -242,6 +243,25 @@ std::string TempOptionsFileName(const std::string& dbname, uint64_t file_num) {
   return dbname + "/" + buffer;
 }
 
+std::string CompactionProgressFileName(const std::string& dbname,
+                                       uint64_t timestamp) {
+  char buffer[256];
+  snprintf(buffer, sizeof(buffer), "%s%llu",
+           kCompactionProgressFileNamePrefix.c_str(),
+           static_cast<unsigned long long>(timestamp));
+  return dbname + "/" + buffer;
+}
+
+std::string TempCompactionProgressFileName(const std::string& dbname,
+                                           uint64_t timestamp) {
+  char buffer[256];
+  snprintf(buffer, sizeof(buffer), "%s%llu.%s",
+           kCompactionProgressFileNamePrefix.c_str(),
+           static_cast<unsigned long long>(timestamp),
+           kTempFileNameSuffix.c_str());
+  return dbname + "/" + buffer;
+}
+
 std::string MetaDatabaseName(const std::string& dbname, uint64_t number) {
   char buf[100];
   snprintf(buf, sizeof(buf), "/METADB-%llu",
@@ -264,6 +284,8 @@ std::string IdentityFileName(const std::string& dbname) {
 //    dbname/METADB-[0-9]+
 //    dbname/OPTIONS-[0-9]+
 //    dbname/OPTIONS-[0-9]+.dbtmp
+//    dbname/COMPACTION_PROGRESS-[timestamp]
+//    dbname/COMPACTION_PROGRESS-[timestamp].dbtmp
 //    Disregards / at the beginning
 bool ParseFileName(const std::string& fname, uint64_t* number, FileType* type,
                    WalFileType* log_type) {
@@ -339,6 +361,24 @@ bool ParseFileName(const std::string& fname, uint64_t* number,
     }
     *number = ts_suffix;
     *type = is_temp_file ? kTempFile : kOptionsFile;
+  } else if (rest.starts_with(kCompactionProgressFileNamePrefix)) {
+    uint64_t timestamp;
+    bool is_temp_file = false;
+    rest.remove_prefix(kCompactionProgressFileNamePrefix.size());
+    const std::string kTempFileNameSuffixWithDot =
+        std::string(".") + kTempFileNameSuffix;
+    if (rest.ends_with(kTempFileNameSuffixWithDot)) {
+      rest.remove_suffix(kTempFileNameSuffixWithDot.size());
+      is_temp_file = true;
+    }
+    if (!ConsumeDecimalNumber(&rest, &timestamp)) {
+      return false;
+    }
+    if (!rest.empty()) {
+      return false;
+    }
+    *number = timestamp;
+    *type = is_temp_file ? kTempFile : kCompactionProgressFile;
   } else {
     // Avoid strtoull() to keep filename format independent of the
     // current locale
diff --git a/file/filename.h b/file/filename.h
index 5a52c745ac6d..399a20f23cfa 100644
--- a/file/filename.h
+++ b/file/filename.h
@@ -124,7 +124,10 @@ std::string OldInfoLogFileName(const std::string& dbname, uint64_t ts,
                                const std::string& log_dir = "");
 
 extern const std::string kOptionsFileNamePrefix;  // = "OPTIONS-"
-extern const std::string kTempFileNameSuffix;     // = "dbtmp"
+extern const std::string
+    kCompactionProgressFileNamePrefix;         // =
+                                               // "COMPACTION_PROGRESS-"
+extern const std::string kTempFileNameSuffix;  // = "dbtmp"
 
 // Return a options file name given the "dbname" and file number.
 // Format:  OPTIONS-[number].dbtmp
@@ -135,6 +138,16 @@ std::string OptionsFileName(uint64_t file_num);
 // Format:  OPTIONS-[number]
 std::string TempOptionsFileName(const std::string& dbname, uint64_t file_num);
 
+// Return a compaction progress file name given the timestamp.
+// Format:  COMPACTION_PROGRESS-[timestamp]
+std::string CompactionProgressFileName(const std::string& dbname,
+                                       uint64_t timestamp);
+
+// Return a temp compaction progress file name given the timestamp.
+// Format:  COMPACTION_PROGRESS-[timestamp].dbtmp
+std::string TempCompactionProgressFileName(const std::string& dbname,
+                                           uint64_t timestamp);
+
 // Return the name to use for a metadatabase. The result will be prefixed with
 // "dbname".
 std::string MetaDatabaseName(const std::string& dbname, uint64_t number);
diff --git a/include/rocksdb/types.h b/include/rocksdb/types.h
index d9b902ff0835..982f497fdf55 100644
--- a/include/rocksdb/types.h
+++ b/include/rocksdb/types.h
@@ -53,7 +53,8 @@ enum FileType {
   kMetaDatabase,
   kIdentityFile,
   kOptionsFile,
-  kBlobFile
+  kBlobFile,
+  kCompactionProgressFile
 };
 
 // User-oriented representation of internal key types.

From 742741b175c5f238374c1714f9db3340d49de569 Mon Sep 17 00:00:00 2001
From: Xingbo Wang <xbw@fb.com>
Date: Wed, 1 Oct 2025 18:20:35 -0700
Subject: [PATCH 321/500] Support Super Block Alignment (#13909)

Summary:
Pad block based table based on super block alignment

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13909

Test Plan:
Unit Test

No impact on perf observed due to change in the inner loop of flush.

upstream/main branch 202.15 MB/s
```
for i in `seq 1 10`; do ./db_bench --benchmarks=fillseq -num=10000000 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=1000 -fifo_compaction_allow_compaction=0 -disable_wal -write_buffer_size=12000000 -format_version=7 >> /tmp/x1 2>&1; grep fillseq /tmp/x1 | grep -Po "\d+\.\d+ MB/s" | grep -Po "\d+\.\d+" | awk '{sum+=$1} END {print sum/NR}'
```

After the change without super block alignment 203.44 MB/s
```
for i in `seq 1 10`; do ./db_bench --benchmarks=fillseq -num=10000000 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=1000 -fifo_compaction_allow_compaction=0 -disable_wal -write_buffer_size=12000000 -format_version=7 >> /tmp/x1 2>&1
```

After the change with super block alignment 204.47 MB/s
```
for i in `seq 1 10`; do ./db_bench --benchmarks=fillseq -num=10000000 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=1000 -fifo_compaction_allow_compaction=0 -disable_wal -write_buffer_size=12000000 -format_version=7 --super_block_alignment_size=131072 --super_block_alignment_max_padding_size=4096 >> /tmp/x1 2>&1;
```

Reviewed By: pdillinger

Differential Revision: D83068913

Pulled By: xingbowang

fbshipit-source-id: eecd65088ab3e9dbc7902aab8c2580f1bc8575df
---
 db/db_flush_test.cc                           | 146 +++
 db_stress_tool/db_stress_common.h             |   2 +
 db_stress_tool/db_stress_gflags.cc            |  11 +
 db_stress_tool/db_stress_test_base.cc         |   4 +
 file/writable_file_writer.cc                  |   7 +-
 file/writable_file_writer.h                   |   3 +-
 include/rocksdb/table.h                       |  24 +
 java/rocksjni/portal.h                        |   5 +-
 java/rocksjni/table.cc                        |  11 +-
 .../org/rocksdb/BlockBasedTableConfig.java    |  61 +-
 .../rocksdb/BlockBasedTableConfigTest.java    |  14 +
 .../java/org/rocksdb/OptionsUtilTest.java     |   5 +
 options/options_settable_test.cc              |   2 +
 options/options_test.cc                       |   5 +
 .../block_based/block_based_table_builder.cc  |  89 +-
 table/block_based/block_based_table_builder.h |  15 +-
 .../block_based/block_based_table_factory.cc  |  32 +
 .../block_based_table_reader_test.cc          | 831 ++++++++++++------
 table/block_based/block_builder.cc            |  17 +-
 table/block_based/block_builder.h             |   8 +-
 table/block_based/index_builder.cc            |  14 +-
 table/block_based/index_builder.h             |  51 +-
 table/block_based/index_builder_test.cc       |   6 +-
 .../partitioned_filter_block_test.cc          |   6 +-
 .../block_based/user_defined_index_wrapper.h  |   9 +-
 table/table_test.cc                           |  21 +-
 tools/db_bench_tool.cc                        |  10 +
 tools/db_crashtest.py                         |   4 +
 util/file_reader_writer_test.cc               |   2 +-
 .../lock/point/point_lock_manager.cc          |   2 -
 30 files changed, 1059 insertions(+), 358 deletions(-)

diff --git a/db/db_flush_test.cc b/db/db_flush_test.cc
index f8353974fb46..21f88d795171 100644
--- a/db/db_flush_test.cc
+++ b/db/db_flush_test.cc
@@ -3561,6 +3561,152 @@ TEST_F(DBFlushTest, VerifyOutputRecordCount) {
     ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
   }
 }
+
+class DBFlushSuperBlockTest
+    : public DBFlushTest,
+      public ::testing::WithParamInterface<std::tuple<bool, size_t, size_t>> {
+ public:
+  DBFlushSuperBlockTest() : DBFlushTest() {}
+
+  std::string formatKey(int i) {
+    int desired_length = 10;
+    char buffer[64];
+    snprintf(buffer, 64, "%0*d", desired_length, i);
+    return buffer;
+  }
+
+  void VerifyReadWithGet(int key_count) {
+    for (int i = 0; i < key_count; ++i) {
+      PinnableSlice value;
+      ASSERT_OK(Get(formatKey(i), &value));
+      ASSERT_EQ(value.ToString(), added_data[formatKey(i)]);
+    }
+  }
+
+  void VerifyReadWithIterator(int key_count) {
+    {
+      std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
+      int i = 0;
+      for (it->SeekToFirst(); it->Valid(); it->Next()) {
+        ASSERT_OK(it->status());
+        ASSERT_EQ((it->key()).ToString(), formatKey(i));
+        ASSERT_EQ((it->value()).ToString(), added_data[formatKey(i)]);
+        i++;
+      }
+      ASSERT_OK(it->status());
+      ASSERT_EQ(i, key_count);
+    }
+  }
+
+ protected:
+  Random rnd{123};
+  std::unordered_map<std::string, std::string> added_data;
+};
+
+constexpr size_t kLowSpaceOverheadRatio = 256;
+
+TEST_P(DBFlushSuperBlockTest, SuperBlock) {
+  constexpr int key_count = 12345;
+  Options options;
+  options.env = env_;
+  options.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory();
+  options.paranoid_file_checks = true;
+  options.write_buffer_size = 1024 * 1024;
+  BlockBasedTableOptions block_options;
+  block_options.block_align = get<0>(GetParam());
+  block_options.index_block_restart_interval = 3;
+  block_options.super_block_alignment_size = get<1>(GetParam());
+  block_options.super_block_alignment_space_overhead_ratio = get<2>(GetParam());
+  options.table_factory.reset(NewBlockBasedTableFactory(block_options));
+  if (block_options.block_align) {
+    // When block align is enabled, disable compression
+    options.compression = kNoCompression;
+  }
+
+  ASSERT_OK(options.table_factory->ValidateOptions(
+      DBOptions(options), ColumnFamilyOptions(options)));
+
+  Reopen(options);
+
+  int super_block_pad_count = 0;
+  int super_block_pad_exceed_limit_count = 0;
+  SyncPoint::GetInstance()->SetCallBack(
+      "BlockBasedTableBuilder::WriteMaybeCompressedBlock:"
+      "SuperBlockAlignment",
+      [&super_block_pad_count](void* /*arg*/) { super_block_pad_count++; });
+  SyncPoint::GetInstance()->SetCallBack(
+      "BlockBasedTableBuilder::WriteMaybeCompressedBlock:"
+      "SuperBlockAlignmentPaddingBytesExceedLimit",
+      [&super_block_pad_exceed_limit_count](void* /*arg*/) {
+        super_block_pad_exceed_limit_count++;
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  // Add lots of keys
+  for (int i = 0; i < key_count; ++i) {
+    added_data[formatKey(i)] = std::string(rnd.RandomString(rnd.Next() % 1000));
+    ASSERT_OK(Put(formatKey(i), added_data[formatKey(i)]));
+  }
+
+  // flush the data in memory to disk to verify with super block alignment, the
+  // data could be read back properly
+  Reopen(options);
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  // When block_align is enabled, super block is always aligned, so there should
+  // be 0 padding for super block alignment
+  if (block_options.super_block_alignment_size != 0 &&
+      !block_options.block_align) {
+    ASSERT_GT(super_block_pad_count, 0);
+  } else {
+    ASSERT_EQ(super_block_pad_count, 0);
+  }
+
+  if (!block_options.block_align &&
+      block_options.super_block_alignment_size != 0 &&
+      block_options.super_block_alignment_space_overhead_ratio ==
+          kLowSpaceOverheadRatio) {
+    ASSERT_GT(super_block_pad_exceed_limit_count, 0);
+  }
+
+  // verify the values are correct
+  VerifyReadWithGet(key_count);
+  Reopen(options);
+  VerifyReadWithIterator(key_count);
+
+  // verify checksum
+  ASSERT_OK(db_->VerifyFileChecksums(ReadOptions()));
+
+  // Reopen options and flip the option of super block configuration, read still
+  // works. This verifies the forward/backward compatibility
+  if (block_options.super_block_alignment_size == 0) {
+    block_options.super_block_alignment_size = 16 * 1024;
+  } else {
+    block_options.super_block_alignment_size = 0;
+  }
+  options.table_factory.reset(NewBlockBasedTableFactory(block_options));
+
+  Reopen(options);
+
+  // verify the values are correct
+  VerifyReadWithGet(key_count);
+  Reopen(options);
+  VerifyReadWithIterator(key_count);
+
+  // verify checksum
+  ASSERT_OK(db_->VerifyFileChecksums(ReadOptions()));
+}
+
+INSTANTIATE_TEST_CASE_P(
+    SuperBlockTests, DBFlushSuperBlockTest,
+    testing::Combine(testing::Bool(), testing::Values(0, 32 * 1024, 16 * 1024),
+                     // Use very low space overhead ratio to test
+                     // the case where required padded bytes is
+                     // larger than the max allowed padding size
+                     testing::Values(4, kLowSpaceOverheadRatio)));
+
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/db_stress_tool/db_stress_common.h b/db_stress_tool/db_stress_common.h
index 58d7cf08e3b8..fe993451cf79 100644
--- a/db_stress_tool/db_stress_common.h
+++ b/db_stress_tool/db_stress_common.h
@@ -410,6 +410,8 @@ DECLARE_uint64(max_sequential_skip_in_iterations);
 DECLARE_bool(enable_sst_partitioner_factory);
 DECLARE_bool(enable_do_not_compress_roles);
 DECLARE_bool(block_align);
+DECLARE_uint64(super_block_alignment_size);
+DECLARE_uint64(super_block_alignment_space_overhead_ratio);
 DECLARE_uint32(lowest_used_cache_tier);
 DECLARE_bool(enable_custom_split_merge);
 DECLARE_uint32(adm_policy);
diff --git a/db_stress_tool/db_stress_gflags.cc b/db_stress_tool/db_stress_gflags.cc
index 1cc8d8e1e610..e2dd696ac4e3 100644
--- a/db_stress_tool/db_stress_gflags.cc
+++ b/db_stress_tool/db_stress_gflags.cc
@@ -1432,6 +1432,17 @@ DEFINE_bool(block_align,
             ROCKSDB_NAMESPACE::BlockBasedTableOptions().block_align,
             "BlockBasedTableOptions.block_align");
 
+DEFINE_uint64(
+    super_block_alignment_size,
+    ROCKSDB_NAMESPACE::BlockBasedTableOptions().super_block_alignment_size,
+    "BlockBasedTableOptions.super_block_alignment_size");
+
+DEFINE_uint64(
+    super_block_alignment_space_overhead_ratio,
+    ROCKSDB_NAMESPACE::BlockBasedTableOptions()
+        .super_block_alignment_space_overhead_ratio,
+    "BlockBasedTableOptions.super_block_alignment_space_overhead_ratio");
+
 DEFINE_uint32(
     lowest_used_cache_tier,
     static_cast<uint32_t>(ROCKSDB_NAMESPACE::Options().lowest_used_cache_tier),
diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc
index 080ada88f207..d61caddc8263 100644
--- a/db_stress_tool/db_stress_test_base.cc
+++ b/db_stress_tool/db_stress_test_base.cc
@@ -4319,6 +4319,10 @@ void InitializeOptionsFromFlags(
       static_cast<BlockBasedTableOptions::IndexShorteningMode>(
           FLAGS_index_shortening);
   block_based_options.block_align = FLAGS_block_align;
+  block_based_options.super_block_alignment_size =
+      fLU64::FLAGS_super_block_alignment_size;
+  block_based_options.super_block_alignment_space_overhead_ratio =
+      fLU64::FLAGS_super_block_alignment_space_overhead_ratio;
   options.table_factory.reset(NewBlockBasedTableFactory(block_based_options));
   options.db_write_buffer_size = FLAGS_db_write_buffer_size;
   options.write_buffer_size = FLAGS_write_buffer_size;
diff --git a/file/writable_file_writer.cc b/file/writable_file_writer.cc
index dad64fb22424..2a92c0754dcd 100644
--- a/file/writable_file_writer.cc
+++ b/file/writable_file_writer.cc
@@ -204,13 +204,14 @@ IOStatus WritableFileWriter::Append(const IOOptions& opts, const Slice& data,
   return s;
 }
 
-IOStatus WritableFileWriter::Pad(const IOOptions& opts,
-                                 const size_t pad_bytes) {
+IOStatus WritableFileWriter::Pad(const IOOptions& opts, const size_t pad_bytes,
+                                 const size_t max_pad_size) {
+  (void)max_pad_size;
   if (seen_error()) {
     return GetWriterHasPreviousErrorStatus();
   }
   const IOOptions io_options = FinalizeIOOptions(opts);
-  assert(pad_bytes < kDefaultPageSize);
+  assert(pad_bytes < max_pad_size);
   size_t left = pad_bytes;
   size_t cap = buf_.Capacity() - buf_.CurrentSize();
 
diff --git a/file/writable_file_writer.h b/file/writable_file_writer.h
index b880e1f216b2..619821204b3e 100644
--- a/file/writable_file_writer.h
+++ b/file/writable_file_writer.h
@@ -256,7 +256,8 @@ class WritableFileWriter {
   IOStatus Append(const IOOptions& opts, const Slice& data,
                   uint32_t crc32c_checksum = 0);
 
-  IOStatus Pad(const IOOptions& opts, const size_t pad_bytes);
+  IOStatus Pad(const IOOptions& opts, const size_t pad_bytes,
+               const size_t max_pad_size);
 
   IOStatus Flush(const IOOptions& opts);
 
diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h
index 51dffe7cc4f6..9727d30a3017 100644
--- a/include/rocksdb/table.h
+++ b/include/rocksdb/table.h
@@ -603,6 +603,30 @@ struct BlockBasedTableOptions {
   // Align data blocks on lesser of page size and block size
   bool block_align = false;
 
+  // Align data blocks on super block alignment. Avoid a data block split across
+  // super block boundaries. Works with/without compression.
+  //
+  // Here a "super block" refers to an aligned unit of underlying Filesystem
+  // storage for which there is an extra cost when a random read involves two
+  // such super blocks instead of just one. Configuring that size here suggests
+  // inserting padding in the SST file to avoid a single SST block splitting
+  // across two super blocks. Only power-of-two sizes are supported. See also
+  // super_block_alignment_space_overhead_ratio. Default to 0, which means super
+  // block alignment is disabled.
+  //
+  // Super block alignment size. Default to 0, which means super block alignment
+  // is disabled. If it is enabled, it needs to be a power of 2 and higher than
+  // block size.
+  size_t super_block_alignment_size = 0;
+
+  // This option constrols the storage space overhead of super block alignment.
+  // It is used to calculate the max padding size allowed for super block
+  // alignment. It is calculated in this way. If super_block_alignment_size is
+  // 2MB, and super_block_alignment_overhead_ratio is 128, then the max padding
+  // size allowed for super block alignment is 2MB / 128 = 16KB.
+  // Note that, when it is set to 0, super block alignment is disabled.
+  size_t super_block_alignment_space_overhead_ratio = 128;
+
   // This enum allows trading off increased index size for improved iterator
   // seek performance in some situations, particularly when block cache is
   // disabled (ReadOptions::fill_cache = false) and direct IO is
diff --git a/java/rocksjni/portal.h b/java/rocksjni/portal.h
index 7ed6d6b1ff89..86248606b248 100644
--- a/java/rocksjni/portal.h
+++ b/java/rocksjni/portal.h
@@ -9152,7 +9152,7 @@ class BlockBasedTableOptionsJni
     }
 
     jmethodID method_id_init =
-        env->GetMethodID(jclazz, "<init>", "(ZZZZBBDBZJIIIJZZZZZIIZZBBJD)V");
+        env->GetMethodID(jclazz, "<init>", "(ZZZZBBDBZJIIIJZZZZZIIZZJJBBJD)V");
     if (method_id_init == nullptr) {
       // exception thrown: NoSuchMethodException or OutOfMemoryError
       return nullptr;
@@ -9197,6 +9197,9 @@ class BlockBasedTableOptionsJni
         table_factory_options->format_version,
         table_factory_options->enable_index_compression,
         table_factory_options->block_align,
+        static_cast<jlong>(table_factory_options->super_block_alignment_size),
+        static_cast<jlong>(
+            table_factory_options->super_block_alignment_space_overhead_ratio),
         IndexShorteningModeJni::toJavaIndexShorteningMode(
             table_factory_options->index_shortening),
         FilterPolicyJni::toJavaIndexType(filter_policy_type),
diff --git a/java/rocksjni/table.cc b/java/rocksjni/table.cc
index eb5de1695e6c..63eb3feca324 100644
--- a/java/rocksjni/table.cc
+++ b/java/rocksjni/table.cc
@@ -45,7 +45,7 @@ jlong Java_org_rocksdb_PlainTableConfig_newTableFactoryHandle(
 /*
  * Class:     org_rocksdb_BlockBasedTableConfig
  * Method:    newTableFactoryHandle
- * Signature: (ZZZZBBDBZJJJJIIIJZZZJZZIIZZBJIJI)J
+ * Signature: (ZZZZBBDBZJJJIIIJZZZJZZIIZZJJBJI)J
  */
 jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle(
     JNIEnv *, jclass, jboolean jcache_index_and_filter_blocks,
@@ -63,8 +63,9 @@ jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle(
     jboolean jwhole_key_filtering, jboolean jverify_compression,
     jint jread_amp_bytes_per_bit, jint jformat_version,
     jboolean jenable_index_compression, jboolean jblock_align,
-    jbyte jindex_shortening, jlong jblock_cache_size,
-    jint jblock_cache_num_shard_bits) {
+    jlong jsuper_block_alignment_size,
+    jlong jsuper_block_alignment_space_overhead_ratio, jbyte jindex_shortening,
+    jlong jblock_cache_size, jint jblock_cache_num_shard_bits) {
   ROCKSDB_NAMESPACE::BlockBasedTableOptions options;
   options.cache_index_and_filter_blocks =
       static_cast<bool>(jcache_index_and_filter_blocks);
@@ -136,6 +137,10 @@ jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle(
   options.enable_index_compression =
       static_cast<bool>(jenable_index_compression);
   options.block_align = static_cast<bool>(jblock_align);
+  options.super_block_alignment_size =
+      static_cast<size_t>(jsuper_block_alignment_size);
+  options.super_block_alignment_space_overhead_ratio =
+      static_cast<size_t>(jsuper_block_alignment_space_overhead_ratio);
   options.index_shortening =
       ROCKSDB_NAMESPACE::IndexShorteningModeJni::toCppIndexShorteningMode(
           jindex_shortening);
diff --git a/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java b/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java
index c8159db2ddca..18d1bebacbd6 100644
--- a/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java
+++ b/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java
@@ -40,6 +40,8 @@ public BlockBasedTableConfig() {
     formatVersion = 6;
     enableIndexCompression = true;
     blockAlign = false;
+    superBlockAlignmentSize = 0;
+    superBlockAlignmentSpaceOverheadRatio = 128;
     indexShortening = IndexShorteningMode.kShortenSeparators;
 
     // NOTE: ONLY used if blockCache == null
@@ -60,8 +62,9 @@ private BlockBasedTableConfig(final boolean cacheIndexAndFilterBlocks,
       final boolean partitionFilters, final boolean optimizeFiltersForMemory,
       final boolean useDeltaEncoding, final boolean wholeKeyFiltering,
       final boolean verifyCompression, final int readAmpBytesPerBit, final int formatVersion,
-      final boolean enableIndexCompression, final boolean blockAlign, final byte indexShortening,
-      final byte filterPolicyType, final long filterPolicyHandle,
+      final boolean enableIndexCompression, final boolean blockAlign,
+      final long superBlockAlignmentSize, final long superBlockAlignmentSpaceOverheadRatio,
+      final byte indexShortening, final byte filterPolicyType, final long filterPolicyHandle,
       final double filterPolicyConfigValue) {
     this.cacheIndexAndFilterBlocks = cacheIndexAndFilterBlocks;
     this.cacheIndexAndFilterBlocksWithHighPriority = cacheIndexAndFilterBlocksWithHighPriority;
@@ -86,6 +89,8 @@ private BlockBasedTableConfig(final boolean cacheIndexAndFilterBlocks,
     this.formatVersion = formatVersion;
     this.enableIndexCompression = enableIndexCompression;
     this.blockAlign = blockAlign;
+    this.superBlockAlignmentSize = superBlockAlignmentSize;
+    this.superBlockAlignmentSpaceOverheadRatio = superBlockAlignmentSpaceOverheadRatio;
     this.indexShortening = IndexShorteningMode.values()[indexShortening];
     try (Filter filterPolicy = FilterPolicyType.values()[filterPolicyType].createFilter(
              filterPolicyHandle, filterPolicyConfigValue)) {
@@ -799,6 +804,50 @@ public BlockBasedTableConfig setBlockAlign(final boolean blockAlign) {
     return this;
   }
 
+  /**
+   * Get the super block alignment size.
+   *
+   * @return the super block alignment size.
+   */
+  public long superBlockAlignmentSize() {
+    return superBlockAlignmentSize;
+  }
+
+  /**
+   * Set the super block alignment size.
+   * When set to 0, super block alignment is disabled.
+   *
+   * @param superBlockAlignmentSize the super block alignment size.
+   *
+   * @return the reference to the current option.
+   */
+  public BlockBasedTableConfig setSuperBlockAlignmentSize(final long superBlockAlignmentSize) {
+    this.superBlockAlignmentSize = superBlockAlignmentSize;
+    return this;
+  }
+
+  /**
+   * Get the space overhead ratio of super block alignment.
+   *
+   * @return space overhead ratio of super block alignment.
+   */
+  public long superBlockAlignmentSpaceOverheadRatio() {
+    return superBlockAlignmentSpaceOverheadRatio;
+  }
+
+  /**
+   * Set the space overhead ratio of super block alignment.
+   *
+   * @param superBlockAlignmentSpaceOverheadRatio the space overhead ratio of super block alignment.
+   *
+   * @return the reference to the current option.
+   */
+  public BlockBasedTableConfig setSuperBlockAlignmentSpaceOverheadRatio(
+      final long superBlockAlignmentSpaceOverheadRatio) {
+    this.superBlockAlignmentSpaceOverheadRatio = superBlockAlignmentSpaceOverheadRatio;
+    return this;
+  }
+
   /**
    * Get the index shortening mode.
    *
@@ -946,7 +995,8 @@ public BlockBasedTableConfig setHashIndexAllowCollision(
         indexBlockRestartInterval, metadataBlockSize, partitionFilters, optimizeFiltersForMemory,
         useDeltaEncoding, filterPolicyHandle, wholeKeyFiltering, verifyCompression,
         readAmpBytesPerBit, formatVersion, enableIndexCompression, blockAlign,
-        indexShortening.getValue(), blockCacheSize, blockCacheNumShardBits);
+        superBlockAlignmentSize, superBlockAlignmentSpaceOverheadRatio, indexShortening.getValue(),
+        blockCacheSize, blockCacheNumShardBits);
   }
 
   private static native long newTableFactoryHandle(final boolean cacheIndexAndFilterBlocks,
@@ -961,7 +1011,8 @@ private static native long newTableFactoryHandle(final boolean cacheIndexAndFilt
       final boolean useDeltaEncoding, final long filterPolicyHandle,
       final boolean wholeKeyFiltering, final boolean verifyCompression,
       final int readAmpBytesPerBit, final int formatVersion, final boolean enableIndexCompression,
-      final boolean blockAlign, final byte indexShortening,
+      final boolean blockAlign, final long superBlockAlignmentSize,
+      final long superBlockAlignmentSpaceOverheadRatio, final byte indexShortening,
 
       @Deprecated final long blockCacheSize, @Deprecated final int blockCacheNumShardBits);
 
@@ -992,6 +1043,8 @@ private static native long newTableFactoryHandle(final boolean cacheIndexAndFilt
   private int formatVersion;
   private boolean enableIndexCompression;
   private boolean blockAlign;
+  private long superBlockAlignmentSize;
+  private long superBlockAlignmentSpaceOverheadRatio;
   private IndexShorteningMode indexShortening;
 
   // NOTE: ONLY used if blockCache == null
diff --git a/java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java b/java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java
index 13247d1e6635..be2a7b46ec87 100644
--- a/java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java
+++ b/java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java
@@ -377,6 +377,20 @@ public void blockAlign() {
         isTrue();
   }
 
+  @Test
+  public void superBlockAlignmentSize() {
+    final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig();
+    blockBasedTableConfig.setSuperBlockAlignmentSize(1024 * 1024);
+    assertThat(blockBasedTableConfig.superBlockAlignmentSize()).isEqualTo(1024 * 1024);
+  }
+
+  @Test
+  public void superBlockAlignmentSpaceOverheadRatio() {
+    final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig();
+    blockBasedTableConfig.setSuperBlockAlignmentSpaceOverheadRatio(4096);
+    assertThat(blockBasedTableConfig.superBlockAlignmentSpaceOverheadRatio()).isEqualTo(4096);
+  }
+
   @Test
   public void indexShortening() {
     final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig();
diff --git a/java/src/test/java/org/rocksdb/OptionsUtilTest.java b/java/src/test/java/org/rocksdb/OptionsUtilTest.java
index 0998ae83fa73..0cdccbb91ba4 100644
--- a/java/src/test/java/org/rocksdb/OptionsUtilTest.java
+++ b/java/src/test/java/org/rocksdb/OptionsUtilTest.java
@@ -298,6 +298,8 @@ private void verifyTableFormatOptions(final LoaderUnderTest loaderUnderTest)
     altCFTableConfig.setFormatVersion(8);
     altCFTableConfig.setEnableIndexCompression(false);
     altCFTableConfig.setBlockAlign(true);
+    altCFTableConfig.setSuperBlockAlignmentSize(1024 * 1024);
+    altCFTableConfig.setSuperBlockAlignmentSpaceOverheadRatio(4 * 1024);
     altCFTableConfig.setIndexShortening(IndexShorteningMode.kShortenSeparatorsAndSuccessor);
     altCFTableConfig.setBlockCacheSize(3 * 1024 * 1024);
     // Note cache objects are not set here, as they are not read back when reading config.
@@ -365,6 +367,9 @@ private void verifyBlockBasedTableConfig(
     assertThat(actual.formatVersion()).isEqualTo(expected.formatVersion());
     assertThat(actual.enableIndexCompression()).isEqualTo(expected.enableIndexCompression());
     assertThat(actual.blockAlign()).isEqualTo(expected.blockAlign());
+    assertThat(actual.superBlockAlignmentSize()).isEqualTo(expected.superBlockAlignmentSize());
+    assertThat(actual.superBlockAlignmentSpaceOverheadRatio())
+        .isEqualTo(expected.superBlockAlignmentSpaceOverheadRatio());
     assertThat(actual.indexShortening()).isEqualTo(expected.indexShortening());
     if (expected.filterPolicy() == null) {
       assertThat(actual.filterPolicy()).isNull();
diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc
index cacb1d2be316..8266adbb8ba4 100644
--- a/options/options_settable_test.cc
+++ b/options/options_settable_test.cc
@@ -200,6 +200,8 @@ TEST_F(OptionsSettableTest, BlockBasedTableOptionsAllFieldsSettable) {
       "verify_compression=true;read_amp_bytes_per_bit=0;"
       "enable_index_compression=false;"
       "block_align=true;"
+      "super_block_alignment_size=65536;"
+      "super_block_alignment_space_overhead_ratio=4096;"
       "max_auto_readahead_size=0;"
       "prepopulate_block_cache=kDisable;"
       "initial_auto_readahead_size=0;"
diff --git a/options/options_test.cc b/options/options_test.cc
index c045266380d3..fc05e64ed79e 100644
--- a/options/options_test.cc
+++ b/options/options_test.cc
@@ -1721,6 +1721,9 @@ TEST_F(OptionsTest, MutableCFOptions) {
   ASSERT_OK(GetColumnFamilyOptionsFromString(
       config_options, cf_opts,
       "paranoid_file_checks=true; block_based_table_factory.block_align=false; "
+      "block_based_table_factory.super_block_alignment_size=65536; "
+      "block_based_table_factory.super_block_alignment_space_overhead_ratio="
+      "4096; "
       "block_based_table_factory.block_size=8192;",
       &cf_opts));
   ASSERT_TRUE(cf_opts.paranoid_file_checks);
@@ -1729,6 +1732,8 @@ TEST_F(OptionsTest, MutableCFOptions) {
   ASSERT_NE(bbto, nullptr);
   ASSERT_EQ(bbto->block_size, 8192);
   ASSERT_EQ(bbto->block_align, false);
+  ASSERT_EQ(bbto->super_block_alignment_size, 65536);
+  ASSERT_EQ(bbto->super_block_alignment_space_overhead_ratio, 4096);
   std::unordered_map<std::string, std::string> unused_opts;
   ASSERT_OK(GetColumnFamilyOptionsFromMap(
       config_options, cf_opts, {{"paranoid_file_checks", "false"}}, &cf_opts));
diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc
index 0fa6879316ea..d85a73a11553 100644
--- a/table/block_based/block_based_table_builder.cc
+++ b/table/block_based/block_based_table_builder.cc
@@ -1686,7 +1686,11 @@ void BlockBasedTableBuilder::EmitBlock(std::string& uncompressed,
   // Single-threaded context only
   assert(!r->IsParallelCompressionActive());
   assert(uncompressed.size() > 0);
-  WriteBlock(uncompressed, &r->pending_handle, BlockType::kData);
+  // When data blocks are aligned with super block alignment, delta encoding
+  // needs to be skipped for the first block after padding.
+  bool skip_delta_encoding = false;
+  WriteBlock(uncompressed, &r->pending_handle, BlockType::kData,
+             &skip_delta_encoding);
   if (LIKELY(ok())) {
     // We do not emit the index entry for a block until we have seen the
     // first key for the next data block.  This allows us to use shorter
@@ -1695,15 +1699,16 @@ void BlockBasedTableBuilder::EmitBlock(std::string& uncompressed,
     // "the r" as the key for the index block entry since it is >= all
     // entries in the first block and < all entries in subsequent
     // blocks.
-    r->index_builder->AddIndexEntry(last_key_in_current_block,
-                                    first_key_in_next_block, r->pending_handle,
-                                    &r->index_separator_scratch);
+    r->index_builder->AddIndexEntry(
+        last_key_in_current_block, first_key_in_next_block, r->pending_handle,
+        &r->index_separator_scratch, skip_delta_encoding);
   }
 }
 
 void BlockBasedTableBuilder::WriteBlock(const Slice& uncompressed_block_data,
                                         BlockHandle* handle,
-                                        BlockType block_type) {
+                                        BlockType block_type,
+                                        bool* skip_delta_encoding) {
   Rep* r = rep_.get();
   assert(r->state == Rep::State::kUnbuffered);
   // Single-threaded context only
@@ -1722,10 +1727,10 @@ void BlockBasedTableBuilder::WriteBlock(const Slice& uncompressed_block_data,
   TEST_SYNC_POINT_CALLBACK(
       "BlockBasedTableBuilder::WriteBlock:TamperWithCompressedData",
       &r->single_threaded_compressed_output);
-  WriteMaybeCompressedBlock(type == kNoCompression
-                                ? uncompressed_block_data
-                                : Slice(r->single_threaded_compressed_output),
-                            type, handle, block_type, &uncompressed_block_data);
+  WriteMaybeCompressedBlock(
+      type == kNoCompression ? uncompressed_block_data
+                             : Slice(r->single_threaded_compressed_output),
+      type, handle, block_type, &uncompressed_block_data, skip_delta_encoding);
   r->single_threaded_compressed_output.Reset();
   if (is_data_block) {
     r->props.data_size = r->get_offset();
@@ -1770,18 +1775,20 @@ void BlockBasedTableBuilder::BGWorker(WorkingAreaPair& working_area) {
     auto write_fn = [this, block_rep, &ios]() {
       Slice compressed = block_rep->compressed;
       Slice uncompressed = block_rep->uncompressed;
+      bool skip_delta_encoding = false;
       ios = WriteMaybeCompressedBlockImpl(
           block_rep->compression_type == kNoCompression ? uncompressed
                                                         : compressed,
           block_rep->compression_type, &rep_->pending_handle, BlockType::kData,
-          &uncompressed);
+          &uncompressed, &skip_delta_encoding);
       if (LIKELY(ios.ok())) {
         rep_->props.data_size = rep_->get_offset();
         rep_->props.uncompressed_data_size += block_rep->uncompressed.size();
         ++rep_->props.num_data_blocks;
 
         rep_->index_builder->FinishIndexEntry(
-            rep_->pending_handle, block_rep->prepared_index_entry.get());
+            rep_->pending_handle, block_rep->prepared_index_entry.get(),
+            skip_delta_encoding);
       }
     };
     switch (thread_state) {
@@ -1931,20 +1938,30 @@ Status BlockBasedTableBuilder::CompressAndVerifyBlock(
 
 void BlockBasedTableBuilder::WriteMaybeCompressedBlock(
     const Slice& block_contents, CompressionType comp_type, BlockHandle* handle,
-    BlockType block_type, const Slice* uncompressed_block_data) {
+    BlockType block_type, const Slice* uncompressed_block_data,
+    bool* skip_delta_encoding) {
   rep_->SetIOStatus(WriteMaybeCompressedBlockImpl(
-      block_contents, comp_type, handle, block_type, uncompressed_block_data));
+      block_contents, comp_type, handle, block_type, uncompressed_block_data,
+      skip_delta_encoding));
 }
 
 IOStatus BlockBasedTableBuilder::WriteMaybeCompressedBlockImpl(
     const Slice& block_contents, CompressionType comp_type, BlockHandle* handle,
-    BlockType block_type, const Slice* uncompressed_block_data) {
+    BlockType block_type, const Slice* uncompressed_block_data,
+    bool* skip_delta_encoding) {
   // File format contains a sequence of blocks where each block has:
   //    block_data: uint8[n]
   //    compression_type: uint8
   //    checksum: uint32
   Rep* r = rep_.get();
   bool is_data_block = block_type == BlockType::kData;
+  // For data block, skip_delta_encoding must be non null
+  if (is_data_block) {
+    assert(skip_delta_encoding != nullptr);
+  }
+  if (skip_delta_encoding != nullptr) {
+    *skip_delta_encoding = false;
+  }
   IOOptions io_options;
   // Always return io_s for NRVO
   IOStatus io_s =
@@ -1954,7 +1971,47 @@ IOStatus BlockBasedTableBuilder::WriteMaybeCompressedBlockImpl(
   }
   // Old, misleading name of this function: WriteRawBlock
   StopWatch sw(r->ioptions.clock, r->ioptions.stats, WRITE_RAW_BLOCK_MICROS);
-  const uint64_t offset = r->get_offset();
+
+  auto offset = r->get_offset();
+  // try to align the data block page to the super alignment size, if enabled
+  if ((r->table_options.super_block_alignment_size != 0) && is_data_block) {
+    auto super_block_alignment_mask =
+        r->table_options.super_block_alignment_size - 1;
+    if ((r->table_options.super_block_alignment_space_overhead_ratio != 0) &&
+        (offset & (~super_block_alignment_mask)) !=
+            ((offset + block_contents.size()) &
+             (~super_block_alignment_mask))) {
+      auto allowed_max_padding_size =
+          r->table_options.super_block_alignment_size /
+          r->table_options.super_block_alignment_space_overhead_ratio;
+      // new block would cross the super block boundary
+      auto pad_bytes = r->table_options.super_block_alignment_size -
+                       (offset & super_block_alignment_mask);
+      if (pad_bytes < allowed_max_padding_size) {
+        io_s = r->file->Pad(io_options, pad_bytes, allowed_max_padding_size);
+        if (UNLIKELY(!io_s.ok())) {
+          r->SetIOStatus(io_s);
+          return io_s;
+        }
+        r->pre_compression_size += pad_bytes;
+        offset += pad_bytes;
+        r->set_offset(offset);
+        if (skip_delta_encoding != nullptr) {
+          // Skip delta encoding in index block builder when a super block
+          // alignment padding is added for data block.
+          *skip_delta_encoding = true;
+        }
+        TEST_SYNC_POINT(
+            "BlockBasedTableBuilder::WriteMaybeCompressedBlock:"
+            "SuperBlockAlignment");
+      } else {
+        TEST_SYNC_POINT(
+            "BlockBasedTableBuilder::WriteMaybeCompressedBlock:"
+            "SuperBlockAlignmentPaddingBytesExceedLimit");
+      }
+    }
+  }
+
   handle->set_offset(offset);
   handle->set_size(block_contents.size());
   assert(status().ok());
@@ -2018,7 +2075,7 @@ IOStatus BlockBasedTableBuilder::WriteMaybeCompressedBlockImpl(
          ((block_contents.size() + kBlockTrailerSize) & (r->alignment - 1))) &
         (r->alignment - 1);
 
-    io_s = r->file->Pad(io_options, pad_bytes);
+    io_s = r->file->Pad(io_options, pad_bytes, kDefaultPageSize);
     if (LIKELY(io_s.ok())) {
       r->pre_compression_size += pad_bytes;
       r->set_offset(r->get_offset() + pad_bytes);
diff --git a/table/block_based/block_based_table_builder.h b/table/block_based/block_based_table_builder.h
index 29a35c5135b3..db96a8929fa4 100644
--- a/table/block_based/block_based_table_builder.h
+++ b/table/block_based/block_based_table_builder.h
@@ -133,15 +133,20 @@ class BlockBasedTableBuilder : public TableBuilder {
 
   // Compress and write block content to the file, from a single-threaded
   // context
+  // @skip_delta_encoding : This is set to non null for data blocks, so that
+  //     caller would know whether the index entry of this data block should
+  //     skip delta encoding or not
   void WriteBlock(const Slice& block_contents, BlockHandle* handle,
-                  BlockType block_type);
+                  BlockType block_type, bool* skip_delta_encoding = nullptr);
   // Directly write data to the file.
-  void WriteMaybeCompressedBlock(
-      const Slice& block_contents, CompressionType, BlockHandle* handle,
-      BlockType block_type, const Slice* uncompressed_block_data = nullptr);
+  void WriteMaybeCompressedBlock(const Slice& block_contents, CompressionType,
+                                 BlockHandle* handle, BlockType block_type,
+                                 const Slice* uncompressed_block_data = nullptr,
+                                 bool* skip_delta_encoding = nullptr);
   IOStatus WriteMaybeCompressedBlockImpl(
       const Slice& block_contents, CompressionType, BlockHandle* handle,
-      BlockType block_type, const Slice* uncompressed_block_data = nullptr);
+      BlockType block_type, const Slice* uncompressed_block_data = nullptr,
+      bool* skip_delta_encoding = nullptr);
 
   void SetupCacheKeyPrefix(const TableBuilderOptions& tbo);
 
diff --git a/table/block_based/block_based_table_factory.cc b/table/block_based/block_based_table_factory.cc
index 1a1ace7d1ef8..64ae8b0e19eb 100644
--- a/table/block_based/block_based_table_factory.cc
+++ b/table/block_based/block_based_table_factory.cc
@@ -364,6 +364,13 @@ static struct BlockBasedTableTypeInfo {
         {"block_align",
          {offsetof(struct BlockBasedTableOptions, block_align),
           OptionType::kBoolean, OptionVerificationType::kNormal}},
+        {"super_block_alignment_size",
+         {offsetof(struct BlockBasedTableOptions, super_block_alignment_size),
+          OptionType::kSizeT, OptionVerificationType::kNormal}},
+        {"super_block_alignment_space_overhead_ratio",
+         {offsetof(struct BlockBasedTableOptions,
+                   super_block_alignment_space_overhead_ratio),
+          OptionType::kSizeT, OptionVerificationType::kNormal}},
         {"pin_top_level_index_and_filter",
          {offsetof(struct BlockBasedTableOptions,
                    pin_top_level_index_and_filter),
@@ -693,6 +700,22 @@ Status BlockBasedTableFactory::ValidateOptions(
     return Status::InvalidArgument(
         "block size exceeds maximum number (4GiB) allowed");
   }
+  if ((table_options_.super_block_alignment_size &
+       (table_options_.super_block_alignment_size - 1))) {
+    return Status::InvalidArgument(
+        "Super Block alignment requested but super block alignment size is not "
+        "a power of 2");
+  }
+  if (table_options_.super_block_alignment_size >
+      std::numeric_limits<uint32_t>::max()) {
+    return Status::InvalidArgument(
+        "Super block alignment size exceeds maximum number (4GiB) allowed");
+  }
+  if (table_options_.super_block_alignment_space_overhead_ratio > 0 &&
+      table_options_.super_block_alignment_space_overhead_ratio < 4) {
+    return Status::InvalidArgument(
+        "Super block alignment space overhead is too high");
+  }
   if (table_options_.data_block_index_type ==
           BlockBasedTableOptions::kDataBlockBinaryAndHash &&
       table_options_.data_block_hash_table_util_ratio <= 0) {
@@ -903,6 +926,15 @@ std::string BlockBasedTableFactory::GetPrintableOptions() const {
   snprintf(buffer, kBufferSize, "  block_align: %d\n",
            table_options_.block_align);
   ret.append(buffer);
+  snprintf(buffer, kBufferSize,
+           "  super_block_alignment_size: %" ROCKSDB_PRIszt "\n",
+           table_options_.super_block_alignment_size);
+  ret.append(buffer);
+  snprintf(buffer, kBufferSize,
+           "  super_block_alignment_space_overhead_ratio: %" ROCKSDB_PRIszt
+           "\n",
+           table_options_.super_block_alignment_space_overhead_ratio);
+  ret.append(buffer);
   snprintf(buffer, kBufferSize,
            "  max_auto_readahead_size: %" ROCKSDB_PRIszt "\n",
            table_options_.max_auto_readahead_size);
diff --git a/table/block_based/block_based_table_reader_test.cc b/table/block_based/block_based_table_reader_test.cc
index 00749636c579..d8426ed0cb7e 100644
--- a/table/block_based/block_based_table_reader_test.cc
+++ b/table/block_based/block_based_table_reader_test.cc
@@ -246,6 +246,70 @@ class BlockBasedTableReaderBaseTest : public testing::Test {
   }
 };
 
+struct BlockBasedTableReaderTestParam {
+  BlockBasedTableReaderTestParam(
+      CompressionType _compression_type, bool _use_direct_reads,
+      BlockBasedTableOptions::IndexType _index_type, bool _no_block_cache,
+      test::UserDefinedTimestampTestMode _udt_test_mode,
+      uint32_t _compression_parallel_threads, uint32_t _compression_dict_bytes,
+      bool _same_key_diff_ts, const Comparator* _comparator, bool _fill_cache,
+      bool _use_async_io, bool _block_align, size_t _super_block_alignment_size,
+      size_t _super_block_alignment_space_overhead_ratio)
+      : compression_type(_compression_type),
+        use_direct_reads(_use_direct_reads),
+        index_type(_index_type),
+        no_block_cache(_no_block_cache),
+        udt_test_mode(_udt_test_mode),
+        compression_parallel_threads(_compression_parallel_threads),
+        compression_dict_bytes(_compression_dict_bytes),
+        same_key_diff_ts(_same_key_diff_ts),
+        comparator(_comparator),
+        fill_cache(_fill_cache),
+        use_async_io(_use_async_io),
+        block_align(_block_align),
+        super_block_alignment_size(_super_block_alignment_size),
+        super_block_alignment_space_overhead_ratio(
+            _super_block_alignment_space_overhead_ratio) {}
+
+  CompressionType compression_type;
+  bool use_direct_reads;
+  BlockBasedTableOptions::IndexType index_type;
+  bool no_block_cache;
+  test::UserDefinedTimestampTestMode udt_test_mode;
+  uint32_t compression_parallel_threads;
+  uint32_t compression_dict_bytes;
+  bool same_key_diff_ts;
+  const Comparator* comparator;
+  bool fill_cache;
+  bool use_async_io;
+  bool block_align;
+  size_t super_block_alignment_size;
+  size_t super_block_alignment_space_overhead_ratio;
+};
+
+// Define operator<< for SpotLockManagerTestParam to stop valgrind from
+// complaining uinitialized value when printing SpotLockManagerTestParam.
+std::ostream& operator<<(std::ostream& os,
+                         const BlockBasedTableReaderTestParam& param) {
+  os << "compression_type: " << CompressionTypeToString(param.compression_type)
+     << " use_direct_reads: " << param.use_direct_reads
+     << " index_type: " << static_cast<int>(param.index_type)
+     << " no_block_cache: " << param.no_block_cache
+     << " udt_test_mode: " << static_cast<int>(param.udt_test_mode)
+     << " compression_parallel_threads: " << param.compression_parallel_threads
+     << " compression_dict_bytes: " << param.compression_dict_bytes
+     << " same_key_diff_ts: " << param.same_key_diff_ts
+     << " comparator: " << param.comparator->Name()
+     << " fill_cache: " << param.fill_cache
+     << " use_async_io: " << param.use_async_io
+     << " block_align: " << param.block_align
+     << " super_block_alignment_size: " << param.super_block_alignment_size
+     << " super_block_alignment_space_overhead_ratio: "
+     << param.super_block_alignment_space_overhead_ratio;
+
+  return os;
+}
+
 // Param 1: compression type
 // Param 2: whether to use direct reads
 // Param 3: Block Based Table Index type
@@ -265,28 +329,30 @@ class BlockBasedTableReaderBaseTest : public testing::Test {
 // Param 9: test both the default comparator and a reverse comparator.
 class BlockBasedTableReaderTest
     : public BlockBasedTableReaderBaseTest,
-      public testing::WithParamInterface<
-          std::tuple<CompressionType, bool, BlockBasedTableOptions::IndexType,
-                     bool, test::UserDefinedTimestampTestMode, uint32_t,
-                     uint32_t, bool, const Comparator*>> {
+      public testing::WithParamInterface<BlockBasedTableReaderTestParam> {
  protected:
   void SetUp() override {
-    compression_type_ = std::get<0>(GetParam());
-    use_direct_reads_ = std::get<1>(GetParam());
-    test::UserDefinedTimestampTestMode udt_test_mode = std::get<4>(GetParam());
+    auto param = GetParam();
+    compression_type_ = param.compression_type;
+    use_direct_reads_ = param.use_direct_reads;
+    test::UserDefinedTimestampTestMode udt_test_mode = param.udt_test_mode;
     udt_enabled_ = test::IsUDTEnabled(udt_test_mode);
     persist_udt_ = test::ShouldPersistUDT(udt_test_mode);
-    compression_parallel_threads_ = std::get<5>(GetParam());
-    compression_dict_bytes_ = std::get<6>(GetParam());
-    same_key_diff_ts_ = std::get<7>(GetParam());
-    comparator_ = std::get<8>(GetParam());
+    compression_parallel_threads_ = param.compression_parallel_threads;
+    compression_dict_bytes_ = param.compression_dict_bytes;
+    same_key_diff_ts_ = param.same_key_diff_ts;
+    comparator_ = param.comparator;
     BlockBasedTableReaderBaseTest::SetUp();
   }
 
   void ConfigureTableFactory() override {
     BlockBasedTableOptions opts;
-    opts.index_type = std::get<2>(GetParam());
-    opts.no_block_cache = std::get<3>(GetParam());
+    auto param = GetParam();
+    opts.index_type = param.index_type;
+    opts.no_block_cache = param.no_block_cache;
+    opts.super_block_alignment_size = param.super_block_alignment_size;
+    opts.super_block_alignment_space_overhead_ratio =
+        param.super_block_alignment_space_overhead_ratio;
     opts.filter_policy.reset(NewBloomFilterPolicy(10, false));
     opts.partition_filters =
         opts.index_type ==
@@ -1009,222 +1075,210 @@ TEST_P(BlockBasedTableReaderTestVerifyChecksum, ChecksumMismatch) {
   ASSERT_EQ(s.code(), Status::kCorruption);
 }
 
+class BlockBasedTableReaderMultiScanTest : public BlockBasedTableReaderTest {
+ public:
+  void SetUp() override {
+    BlockBasedTableReaderTest::SetUp();
+    options_.comparator = comparator_;
+  }
+};
+
+class BlockBasedTableReaderMultiScanAsyncIOTest
+    : public BlockBasedTableReaderMultiScanTest {};
+
 // TODO: test no block cache case
-TEST_P(BlockBasedTableReaderTest, MultiScanPrepare) {
-  std::ostringstream param_trace;
-  param_trace << "[MultiScanPrepare] Test params: " << "CompressionType="
-              << CompressionTypeToString(compression_type_)
-              << ", UseDirectReads=" << (use_direct_reads_ ? "true" : "false")
-              << ", UDTEnabled=" << (udt_enabled_ ? "true" : "false")
-              << ", PersistUDT=" << (persist_udt_ ? "true" : "false")
-              << ", CompressionParallelThreads="
-              << compression_parallel_threads_
-              << ", CompressionDictBytes=" << compression_dict_bytes_
-              << ", SameKeyDiffTs=" << (same_key_diff_ts_ ? "true" : "false");
-  SCOPED_TRACE(param_trace.str());
-
-  for (bool fill_cache : {false, true}) {
-    SCOPED_TRACE(std::string("fill_cache=") + std::to_string(fill_cache));
-    for (bool use_async_io : {false,
-#ifdef ROCKSDB_IOURING_PRESENT
-                              true
-#endif
-         }) {
-      SCOPED_TRACE(std::string("use_async_io=") + std::to_string(use_async_io));
-      Options options;
-      options.statistics = CreateDBStatistics();
-      options.comparator = comparator_;
-      std::shared_ptr<FileSystem> fs = options.env->GetFileSystem();
-      ReadOptions read_opts;
-      read_opts.fill_cache = fill_cache;
-      size_t ts_sz = options.comparator->timestamp_size();
-      std::vector<std::pair<std::string, std::string>> kv =
-          BlockBasedTableReaderBaseTest::GenerateKVMap(
-              100 /* num_block */,
-              true /* mixed_with_human_readable_string_value */, ts_sz,
-              same_key_diff_ts_, comparator_);
-      std::string table_name = "BlockBasedTableReaderTest_NewIterator" +
-                               CompressionTypeToString(compression_type_) +
-                               "_async" + std::to_string(use_async_io);
-      ImmutableOptions ioptions(options);
-      CreateTable(table_name, ioptions, compression_type_, kv,
-                  compression_parallel_threads_, compression_dict_bytes_);
-
-      std::unique_ptr<BlockBasedTable> table;
-      FileOptions foptions;
-      foptions.use_direct_reads = use_direct_reads_;
-      InternalKeyComparator comparator(options.comparator);
-      NewBlockBasedTableReader(
-          foptions, ioptions, comparator, table_name, &table,
-          true /* bool prefetch_index_and_filter_in_cache */,
-          nullptr /* status */, persist_udt_);
-
-      // 1. Should coalesce into a single I/O
-      std::unique_ptr<InternalIterator> iter;
-      iter.reset(table->NewIterator(
-          read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
-          /*skip_filters=*/false, TableReaderCaller::kUncategorized));
-
-      MultiScanArgs scan_options(comparator_);
-      scan_options.use_async_io = use_async_io;
-      scan_options.insert(ExtractUserKey(kv[0].first),
-                          ExtractUserKey(kv[kEntriesPerBlock].first));
-      scan_options.insert(ExtractUserKey(kv[2 * kEntriesPerBlock].first),
-                          ExtractUserKey(kv[3 * kEntriesPerBlock].first));
-      auto read_count_before =
-          options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
-
-      iter->Prepare(&scan_options);
-      iter->Seek(kv[0].first);
-      for (size_t i = 0; i < kEntriesPerBlock + 1; ++i) {
-        ASSERT_TRUE(iter->status().ok()) << iter->status().ToString();
-        ASSERT_TRUE(iter->Valid()) << i;
-        ASSERT_EQ(iter->key().ToString(), kv[i].first);
-        iter->Next();
-      }
-      // Iter may still be valid after scan range. Upper layer (DBIter) handles
-      // exact upper bound checking. So we don't check !iter->Valid() here.
-      ASSERT_OK(iter->status());
-      iter->Seek(kv[2 * kEntriesPerBlock].first);
-      for (size_t i = 2 * kEntriesPerBlock; i < 3 * kEntriesPerBlock; ++i) {
-        ASSERT_TRUE(iter->Valid());
-        ASSERT_EQ(iter->key().ToString(), kv[i].first);
-        iter->Next();
-      }
-      ASSERT_OK(iter->status());
-      auto read_count_after =
-          options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
-      ASSERT_EQ(read_count_before + 1, read_count_after);
+TEST_P(BlockBasedTableReaderMultiScanAsyncIOTest, MultiScanPrepare) {
+  auto param = GetParam();
+  auto fill_cache = param.fill_cache;
+  auto use_async_io = param.use_async_io;
 
-      // 2. No IO coalesce, should do MultiRead/ReadAsync with 2 read requests.
-      iter.reset(table->NewIterator(
-          read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
-          /*skip_filters=*/false, TableReaderCaller::kUncategorized));
-      scan_options = MultiScanArgs(comparator_);
-      scan_options.insert(ExtractUserKey(kv[70 * kEntriesPerBlock].first),
-                          ExtractUserKey(kv[75 * kEntriesPerBlock].first));
-      scan_options.insert(ExtractUserKey(kv[90 * kEntriesPerBlock].first),
-                          ExtractUserKey(kv[95 * kEntriesPerBlock].first));
-
-      read_count_before =
-          options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
-      iter->Prepare(&scan_options);
-
-      iter->Seek(kv[70 * kEntriesPerBlock].first);
-      for (size_t i = 70 * kEntriesPerBlock; i < 75 * kEntriesPerBlock; ++i) {
-        ASSERT_TRUE(iter->Valid());
-        ASSERT_EQ(iter->key().ToString(), kv[i].first);
-        iter->Next();
-      }
-      ASSERT_OK(iter->status());
-      iter->Seek(kv[90 * kEntriesPerBlock].first);
-      for (size_t i = 90 * kEntriesPerBlock; i < 95 * kEntriesPerBlock; ++i) {
-        ASSERT_TRUE(iter->Valid());
-        ASSERT_EQ(iter->key().ToString(), kv[i].first);
-        iter->Next();
-      }
-      ASSERT_OK(iter->status());
-
-      read_count_after =
-          options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
-      ASSERT_EQ(read_count_before + 2, read_count_after);
-
-      iter.reset(table->NewIterator(
-          read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
-          /*skip_filters=*/false, TableReaderCaller::kUncategorized));
-
-      // 3. Tests I/O excludes blocks already in cache.
-      // Reading blocks from 50-99
-      // From reads above, blocks 70-75 and 90-95 already in cache
-      // So we should read 50-70 76-89 96-99 in three I/Os.
-      // If fill_cache is false, then we'll do one giant I/O.
-      scan_options = MultiScanArgs(comparator_);
-      scan_options.use_async_io = use_async_io;
-      scan_options.insert(ExtractUserKey(kv[50 * kEntriesPerBlock].first));
-      read_count_before =
-          options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
-      iter->Prepare(&scan_options);
-      read_count_after =
-          options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
-      if (!use_async_io) {
-        if (!fill_cache) {
-          ASSERT_EQ(read_count_before + 1, read_count_after);
-        } else {
-          ASSERT_EQ(read_count_before + 3, read_count_after);
-        }
-      } else {
-        // stat is recorded in async callback which happens in Poll(), and
-        // Poll() happens during scanning.
-        ASSERT_EQ(read_count_before, read_count_after);
-      }
+  options_.statistics = CreateDBStatistics();
+  std::shared_ptr<FileSystem> fs = options_.env->GetFileSystem();
+  ReadOptions read_opts;
+  read_opts.fill_cache = fill_cache;
+  size_t ts_sz = options_.comparator->timestamp_size();
+  std::vector<std::pair<std::string, std::string>> kv =
+      BlockBasedTableReaderBaseTest::GenerateKVMap(
+          100 /* num_block */,
+          true /* mixed_with_human_readable_string_value */, ts_sz,
+          same_key_diff_ts_, comparator_);
+  std::string table_name = "BlockBasedTableReaderTest_NewIterator" +
+                           CompressionTypeToString(compression_type_) +
+                           "_async" + std::to_string(use_async_io);
+  ImmutableOptions ioptions(options_);
+  CreateTable(table_name, ioptions, compression_type_, kv,
+              compression_parallel_threads_, compression_dict_bytes_);
 
-      iter->Seek(kv[50 * kEntriesPerBlock].first);
-      for (size_t i = 50 * kEntriesPerBlock; i < 100 * kEntriesPerBlock; ++i) {
-        ASSERT_TRUE(iter->Valid());
-        ASSERT_EQ(iter->key().ToString(), kv[i].first);
-        iter->Next();
-      }
-      ASSERT_FALSE(iter->Valid());
-      ASSERT_OK(iter->status());
-      read_count_after =
-          options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
-      if (!fill_cache) {
-        ASSERT_EQ(read_count_before + 1, read_count_after);
-      } else {
-        ASSERT_EQ(read_count_before + 3, read_count_after);
-      }
+  std::unique_ptr<BlockBasedTable> table;
+  FileOptions foptions;
+  foptions.use_direct_reads = use_direct_reads_;
+  InternalKeyComparator comparator(options_.comparator);
+  NewBlockBasedTableReader(foptions, ioptions, comparator, table_name, &table,
+                           true /* bool prefetch_index_and_filter_in_cache */,
+                           nullptr /* status */, persist_udt_);
 
-      // 4. Check cases when Seek key does not match start key in ScanOptions
-      iter.reset(table->NewIterator(
-          read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
-          /*skip_filters=*/false, TableReaderCaller::kUncategorized));
-      scan_options = MultiScanArgs(comparator_);
-      scan_options.use_async_io = use_async_io;
-      scan_options.insert(ExtractUserKey(kv[10 * kEntriesPerBlock].first),
-                          ExtractUserKey(kv[20 * kEntriesPerBlock].first));
-      scan_options.insert(ExtractUserKey(kv[30 * kEntriesPerBlock].first),
-                          ExtractUserKey(kv[40 * kEntriesPerBlock].first));
-      iter->Prepare(&scan_options);
-      // Match start key
-      iter->Seek(kv[10 * kEntriesPerBlock].first);
-      for (size_t i = 10 * kEntriesPerBlock; i < 20 * kEntriesPerBlock; ++i) {
-        ASSERT_TRUE(iter->Valid());
-        ASSERT_EQ(iter->key().ToString(), kv[i].first);
-        iter->Next();
-      }
-      ASSERT_OK(iter->status());
-
-      // Does not match start key of the second ScanOptions.
-      iter->Seek(kv[50 * kEntriesPerBlock + 1].first);
-      ASSERT_NOK(iter->status());
-
-      iter.reset(table->NewIterator(
-          read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
-          /*skip_filters=*/false, TableReaderCaller::kUncategorized));
-      scan_options = MultiScanArgs(comparator_);
-      scan_options.use_async_io = use_async_io;
-      scan_options.insert(ExtractUserKey(kv[10 * kEntriesPerBlock].first));
-      scan_options.insert(ExtractUserKey(kv[11 * kEntriesPerBlock].first));
-      iter->Prepare(&scan_options);
-      // Does not match the first ScanOptions.
-      iter->SeekToFirst();
-      ASSERT_NOK(iter->status());
-      iter->Seek(kv[10 * kEntriesPerBlock].first);
-      ASSERT_NOK(iter->status());
+  // 1. Should coalesce into a single I/O
+  std::unique_ptr<InternalIterator> iter;
+  iter.reset(table->NewIterator(
+      read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
+      /*skip_filters=*/false, TableReaderCaller::kUncategorized));
+
+  MultiScanArgs scan_options(comparator_);
+  scan_options.use_async_io = use_async_io;
+  scan_options.insert(ExtractUserKey(kv[0].first),
+                      ExtractUserKey(kv[kEntriesPerBlock].first));
+  scan_options.insert(ExtractUserKey(kv[2 * kEntriesPerBlock].first),
+                      ExtractUserKey(kv[3 * kEntriesPerBlock].first));
+  auto read_count_before =
+      options_.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
+
+  iter->Prepare(&scan_options);
+  iter->Seek(kv[0].first);
+  for (size_t i = 0; i < kEntriesPerBlock + 1; ++i) {
+    ASSERT_TRUE(iter->status().ok()) << iter->status().ToString();
+    ASSERT_TRUE(iter->Valid()) << i;
+    ASSERT_EQ(iter->key().ToString(), kv[i].first);
+    iter->Next();
+  }
+  // Iter may still be valid after scan range. Upper layer (DBIter) handles
+  // exact upper bound checking. So we don't check !iter->Valid() here.
+  ASSERT_OK(iter->status());
+  iter->Seek(kv[2 * kEntriesPerBlock].first);
+  for (size_t i = 2 * kEntriesPerBlock; i < 3 * kEntriesPerBlock; ++i) {
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().ToString(), kv[i].first);
+    iter->Next();
+  }
+  ASSERT_OK(iter->status());
+  auto read_count_after =
+      options_.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
+  ASSERT_EQ(read_count_before + 1, read_count_after);
+
+  // 2. No IO coalesce, should do MultiRead/ReadAsync with 2 read requests.
+  iter.reset(table->NewIterator(
+      read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
+      /*skip_filters=*/false, TableReaderCaller::kUncategorized));
+  scan_options = MultiScanArgs(comparator_);
+  scan_options.insert(ExtractUserKey(kv[70 * kEntriesPerBlock].first),
+                      ExtractUserKey(kv[75 * kEntriesPerBlock].first));
+  scan_options.insert(ExtractUserKey(kv[90 * kEntriesPerBlock].first),
+                      ExtractUserKey(kv[95 * kEntriesPerBlock].first));
+
+  read_count_before =
+      options_.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
+  iter->Prepare(&scan_options);
+
+  iter->Seek(kv[70 * kEntriesPerBlock].first);
+  for (size_t i = 70 * kEntriesPerBlock; i < 75 * kEntriesPerBlock; ++i) {
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().ToString(), kv[i].first);
+    iter->Next();
+  }
+  ASSERT_OK(iter->status());
+  iter->Seek(kv[90 * kEntriesPerBlock].first);
+  for (size_t i = 90 * kEntriesPerBlock; i < 95 * kEntriesPerBlock; ++i) {
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().ToString(), kv[i].first);
+    iter->Next();
+  }
+  ASSERT_OK(iter->status());
+
+  read_count_after =
+      options_.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
+  ASSERT_EQ(read_count_before + 2, read_count_after);
+
+  iter.reset(table->NewIterator(
+      read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
+      /*skip_filters=*/false, TableReaderCaller::kUncategorized));
+
+  // 3. Tests I/O excludes blocks already in cache.
+  // Reading blocks from 50-99
+  // From reads above, blocks 70-75 and 90-95 already in cache
+  // So we should read 50-70 76-89 96-99 in three I/Os.
+  // If fill_cache is false, then we'll do one giant I/O.
+  scan_options = MultiScanArgs(comparator_);
+  scan_options.use_async_io = use_async_io;
+  scan_options.insert(ExtractUserKey(kv[50 * kEntriesPerBlock].first));
+  read_count_before =
+      options_.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
+  iter->Prepare(&scan_options);
+  read_count_after =
+      options_.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
+  if (!use_async_io) {
+    if (!fill_cache) {
+      ASSERT_EQ(read_count_before + 1, read_count_after);
+    } else {
+      ASSERT_EQ(read_count_before + 3, read_count_after);
     }
+  } else {
+    // stat is recorded in async callback which happens in Poll(), and
+    // Poll() happens during scanning.
+    ASSERT_EQ(read_count_before, read_count_after);
+  }
+
+  iter->Seek(kv[50 * kEntriesPerBlock].first);
+  for (size_t i = 50 * kEntriesPerBlock; i < 100 * kEntriesPerBlock; ++i) {
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().ToString(), kv[i].first);
+    iter->Next();
+  }
+  ASSERT_FALSE(iter->Valid());
+  ASSERT_OK(iter->status());
+  read_count_after =
+      options_.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
+  if (!fill_cache) {
+    ASSERT_EQ(read_count_before + 1, read_count_after);
+  } else {
+    ASSERT_EQ(read_count_before + 3, read_count_after);
+  }
+
+  // 4. Check cases when Seek key does not match start key in ScanOptions
+  iter.reset(table->NewIterator(
+      read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
+      /*skip_filters=*/false, TableReaderCaller::kUncategorized));
+  scan_options = MultiScanArgs(comparator_);
+  scan_options.use_async_io = use_async_io;
+  scan_options.insert(ExtractUserKey(kv[10 * kEntriesPerBlock].first),
+                      ExtractUserKey(kv[20 * kEntriesPerBlock].first));
+  scan_options.insert(ExtractUserKey(kv[30 * kEntriesPerBlock].first),
+                      ExtractUserKey(kv[40 * kEntriesPerBlock].first));
+  iter->Prepare(&scan_options);
+  // Match start key
+  iter->Seek(kv[10 * kEntriesPerBlock].first);
+  for (size_t i = 10 * kEntriesPerBlock; i < 20 * kEntriesPerBlock; ++i) {
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().ToString(), kv[i].first);
+    iter->Next();
   }
+  ASSERT_OK(iter->status());
+
+  // Does not match start key of the second ScanOptions.
+  iter->Seek(kv[50 * kEntriesPerBlock + 1].first);
+  ASSERT_NOK(iter->status());
+
+  iter.reset(table->NewIterator(
+      read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
+      /*skip_filters=*/false, TableReaderCaller::kUncategorized));
+  scan_options = MultiScanArgs(comparator_);
+  scan_options.use_async_io = use_async_io;
+  scan_options.insert(ExtractUserKey(kv[10 * kEntriesPerBlock].first));
+  scan_options.insert(ExtractUserKey(kv[11 * kEntriesPerBlock].first));
+  iter->Prepare(&scan_options);
+  // Does not match the first ScanOptions.
+  iter->SeekToFirst();
+  ASSERT_NOK(iter->status());
+  iter->Seek(kv[10 * kEntriesPerBlock].first);
+  ASSERT_NOK(iter->status());
 }
 
-TEST_P(BlockBasedTableReaderTest, MultiScanPrefetchSizeLimit) {
+TEST_P(BlockBasedTableReaderMultiScanTest, MultiScanPrefetchSizeLimit) {
   if (compression_type_ != kNoCompression) {
     // This test relies on block sizes to be close to what's set in option.
     ROCKSDB_GTEST_BYPASS("This test assumes no compression.");
     return;
   }
-  Options options;
-  options.comparator = comparator_;
   ReadOptions read_opts;
-  size_t ts_sz = options.comparator->timestamp_size();
+  size_t ts_sz = options_.comparator->timestamp_size();
 
   // Generate data that spans multiple blocks
   std::vector<std::pair<std::string, std::string>> kv =
@@ -1235,14 +1289,14 @@ TEST_P(BlockBasedTableReaderTest, MultiScanPrefetchSizeLimit) {
   std::string table_name = "BlockBasedTableReaderTest_PrefetchSizeLimit" +
                            CompressionTypeToString(compression_type_);
 
-  ImmutableOptions ioptions(options);
+  ImmutableOptions ioptions(options_);
   CreateTable(table_name, ioptions, compression_type_, kv,
               compression_parallel_threads_, compression_dict_bytes_);
 
   std::unique_ptr<BlockBasedTable> table;
   FileOptions foptions;
   foptions.use_direct_reads = use_direct_reads_;
-  InternalKeyComparator comparator(options.comparator);
+  InternalKeyComparator comparator(options_.comparator);
   NewBlockBasedTableReader(foptions, ioptions, comparator, table_name, &table,
                            true /* bool prefetch_index_and_filter_in_cache */,
                            nullptr /* status */, persist_udt_);
@@ -1415,11 +1469,11 @@ TEST_P(BlockBasedTableReaderTest, MultiScanPrefetchSizeLimit) {
   }
 }
 
-TEST_P(BlockBasedTableReaderTest, MultiScanUnpinPreviousBlocks) {
+TEST_P(BlockBasedTableReaderMultiScanTest, MultiScanUnpinPreviousBlocks) {
   std::vector<std::pair<std::string, std::string>> kv =
       BlockBasedTableReaderBaseTest::GenerateKVMap(
-          30 /* num_block */,
-          true /* mixed_with_human_readable_string_value */);
+          30 /* num_block */, true /* mixed_with_human_readable_string_value */,
+          comparator_->timestamp_size(), same_key_diff_ts_, comparator_);
   std::string table_name = "BlockBasedTableReaderTest_UnpinPreviousBlocks" +
                            CompressionTypeToString(compression_type_);
   ImmutableOptions ioptions(options_);
@@ -1492,16 +1546,17 @@ TEST_P(BlockBasedTableReaderTest, MultiScanUnpinPreviousBlocks) {
   }
 }
 
-TEST_P(BlockBasedTableReaderTest, MultiScanOptFileOverlapChecking) {
+TEST_P(BlockBasedTableReaderMultiScanTest, MultiScanOptFileOverlapChecking) {
   std::vector<std::pair<std::string, std::string>> kv =
       BlockBasedTableReaderBaseTest::GenerateKVMap(
-          20 /* num_block */,
-          true /* mixed_with_human_readable_string_value */);
+          20 /* num_block */, true /* mixed_with_human_readable_string_value */,
+          comparator_->timestamp_size(), same_key_diff_ts_, comparator_);
   std::vector<std::pair<std::string, std::string>> actual_kv(
       kv.begin(), kv.begin() + 15 * kEntriesPerBlock);
 
-  std::string table_name = "BlockBasedTableReaderTest_UnpinPreviousBlocks" +
-                           CompressionTypeToString(compression_type_);
+  std::string table_name =
+      "BlockBasedTableReaderMultiScanTest_UnpinPreviousBlocks" +
+      CompressionTypeToString(compression_type_);
   ImmutableOptions ioptions(options_);
   CreateTable(table_name, ioptions, compression_type_, actual_kv,
               compression_parallel_threads_, compression_dict_bytes_);
@@ -1538,64 +1593,290 @@ TEST_P(BlockBasedTableReaderTest, MultiScanOptFileOverlapChecking) {
   ASSERT_TRUE(iter->status().IsInvalidArgument());
 }
 
-// Param 1: compression type
-// Param 2: whether to use direct reads
-// Param 3: Block Based Table Index type, partitioned filters are also enabled
-//          when index type is kTwoLevelIndexSearch
-// Param 4: BBTO no_block_cache option
-// Param 5: test mode for the user-defined timestamp feature
-// Param 6: number of parallel compression threads
-// Param 7: CompressionOptions.max_dict_bytes and
-//          CompressionOptions.max_dict_buffer_bytes. This enable/disables
-//          compression dictionary.
-// Param 8: test mode to specify the pattern for generating key / value pairs.
+std::vector<BlockBasedTableReaderTestParam> GenerateCombinedParameters(
+    const std::vector<CompressionType>& compression_types,
+    const std::vector<bool>& use_direct_read_flags,
+    const std::vector<BlockBasedTableOptions::IndexType>& index_types,
+    const std::vector<bool>& no_block_cache_flags,
+    const std::vector<test::UserDefinedTimestampTestMode>& udt_test_modes,
+    const std::vector<int>& parallel_compression_thread_counts,
+    const std::vector<uint32_t>& compression_dict_byte_counts,
+    const std::vector<bool>& same_key_diff_ts_flags,
+    const std::vector<const Comparator*>& comparators,
+    const std::vector<bool>& fill_cache_flags,
+    const std::vector<bool>& use_async_io_flags,
+    const std::vector<bool>& block_align_flags,
+    const std::vector<size_t>& super_block_alignment_sizes,
+    const std::vector<size_t>& super_block_alignment_space_overhead_ratios) {
+  std::vector<BlockBasedTableReaderTestParam> params;
+  for (const auto& compression_type : compression_types) {
+    for (auto use_direct_read : use_direct_read_flags) {
+      for (const auto& index_type : index_types) {
+        for (auto no_block_cache : no_block_cache_flags) {
+          for (const auto& udt_test_mode : udt_test_modes) {
+            for (auto parallel_compression_thread_count :
+                 parallel_compression_thread_counts) {
+              for (auto compression_dict_byte_count :
+                   compression_dict_byte_counts) {
+                for (auto same_key_diff_ts_flag : same_key_diff_ts_flags) {
+                  for (const auto& comparator : comparators) {
+                    for (auto fill_cache : fill_cache_flags) {
+                      for (auto use_async_io : use_async_io_flags) {
+                        for (auto block_align : block_align_flags) {
+                          for (auto super_block_alignment_size :
+                               super_block_alignment_sizes) {
+                            for (
+                                auto
+                                    super_block_alignment_space_overhead_ratio :
+                                super_block_alignment_space_overhead_ratios) {
+                              if (super_block_alignment_size == 0) {
+                                // Override padding size to 0 if alignment size
+                                // is 0, which means no super block alignment
+                                super_block_alignment_space_overhead_ratio = 0;
+                              }
+                              params.emplace_back(
+                                  compression_type, use_direct_read, index_type,
+                                  no_block_cache, udt_test_mode,
+                                  parallel_compression_thread_count,
+                                  compression_dict_byte_count,
+                                  same_key_diff_ts_flag, comparator, fill_cache,
+                                  use_async_io, block_align,
+                                  super_block_alignment_size,
+                                  super_block_alignment_space_overhead_ratio);
+                            }
+                          }
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  return params;
+}
+
+std::vector<bool> Bool() { return {true, false}; }
+
+struct BlockBasedTableReaderTestParamBuilder {
+  BlockBasedTableReaderTestParamBuilder() {
+    // Default values
+    compression_types = GetSupportedCompressions();
+    use_direct_read_flags = Bool();
+    index_types = {
+        BlockBasedTableOptions::IndexType::kBinarySearch,
+        BlockBasedTableOptions::IndexType::kHashSearch,
+        BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch,
+        BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey};
+    no_block_cache_flags = {false};
+    udt_test_modes = {
+        test::UserDefinedTimestampTestMode::kStripUserDefinedTimestamp};
+    parallel_compression_thread_counts = {1, 2};
+    compression_dict_byte_counts = {0, 4096};
+    same_key_diff_ts_flags = {false};
+    comparators = {BytewiseComparator()};
+    fill_cache_flags = {true};
+    use_async_io_flags = {false};
+    block_align_flags = {false};
+    super_block_alignment_sizes = {0};
+    super_block_alignment_space_overhead_ratios = {128};
+  }
+
+  // builder methods for each member
+  BlockBasedTableReaderTestParamBuilder& WithCompressionTypes(
+      const std::vector<CompressionType>& _compression_types) {
+    compression_types = _compression_types;
+    return *this;
+  }
+
+  BlockBasedTableReaderTestParamBuilder& WithUseDirectReadFlags(
+      const std::vector<bool>& _use_direct_read_flags) {
+    use_direct_read_flags = _use_direct_read_flags;
+    return *this;
+  }
+
+  BlockBasedTableReaderTestParamBuilder& WithIndexTypes(
+      const std::vector<BlockBasedTableOptions::IndexType>& _index_types) {
+    index_types = _index_types;
+    return *this;
+  }
+
+  BlockBasedTableReaderTestParamBuilder& WithNoBlockCacheFlags(
+      const std::vector<bool>& _no_block_cache_flags) {
+    no_block_cache_flags = _no_block_cache_flags;
+    return *this;
+  }
+
+  BlockBasedTableReaderTestParamBuilder& WithUDTTestModes(
+      const std::vector<test::UserDefinedTimestampTestMode>& _udt_test_modes) {
+    udt_test_modes = _udt_test_modes;
+    return *this;
+  }
+
+  BlockBasedTableReaderTestParamBuilder& WithParallelCompressionThreadCounts(
+      const std::vector<int>& _parallel_compression_thread_counts) {
+    parallel_compression_thread_counts = _parallel_compression_thread_counts;
+    return *this;
+  }
+
+  BlockBasedTableReaderTestParamBuilder& WithCompressionDictByteCounts(
+      const std::vector<uint32_t>& _compression_dict_byte_counts) {
+    compression_dict_byte_counts = _compression_dict_byte_counts;
+    return *this;
+  }
+
+  BlockBasedTableReaderTestParamBuilder& WithSameKeyDiffTsFlags(
+      const std::vector<bool>& _same_key_diff_ts_flags) {
+    same_key_diff_ts_flags = _same_key_diff_ts_flags;
+    return *this;
+  }
+
+  BlockBasedTableReaderTestParamBuilder& WithComparators(
+      const std::vector<const Comparator*>& _comparators) {
+    comparators = _comparators;
+    return *this;
+  }
+
+  BlockBasedTableReaderTestParamBuilder& WithFillCacheFlags(
+      const std::vector<bool>& _fill_cache_flags) {
+    fill_cache_flags = _fill_cache_flags;
+    return *this;
+  }
+
+  BlockBasedTableReaderTestParamBuilder& WithUseAsyncIoFlags(
+      const std::vector<bool>& _use_async_io_flags) {
+    use_async_io_flags = _use_async_io_flags;
+    return *this;
+  }
+
+  BlockBasedTableReaderTestParamBuilder& WithBlockAlignFlags(
+      const std::vector<bool>& _block_align_flags) {
+    block_align_flags = _block_align_flags;
+    return *this;
+  }
+
+  BlockBasedTableReaderTestParamBuilder& WithSuperBlockAlignmentSizes(
+      const std::vector<size_t>& _super_block_alignment_sizes) {
+    super_block_alignment_sizes = _super_block_alignment_sizes;
+    return *this;
+  }
+
+  BlockBasedTableReaderTestParamBuilder&
+  WithSuperBlockAlignmentSpaceOverheadRatios(
+      const std::vector<size_t>& _super_block_alignment_space_overhead_ratios) {
+    super_block_alignment_space_overhead_ratios =
+        _super_block_alignment_space_overhead_ratios;
+    return *this;
+  }
+
+  std::vector<BlockBasedTableReaderTestParam> build() {
+    return GenerateCombinedParameters(
+        compression_types, use_direct_read_flags, index_types,
+        no_block_cache_flags, udt_test_modes,
+        parallel_compression_thread_counts, compression_dict_byte_counts,
+        same_key_diff_ts_flags, comparators, fill_cache_flags,
+        use_async_io_flags, block_align_flags, super_block_alignment_sizes,
+        super_block_alignment_space_overhead_ratios);
+  }
+
+  std::vector<CompressionType> compression_types;
+  std::vector<bool> use_direct_read_flags;
+  std::vector<BlockBasedTableOptions::IndexType> index_types;
+  std::vector<bool> no_block_cache_flags;
+  std::vector<test::UserDefinedTimestampTestMode> udt_test_modes;
+  std::vector<int> parallel_compression_thread_counts;
+  std::vector<uint32_t> compression_dict_byte_counts;
+  std::vector<bool> same_key_diff_ts_flags;
+  std::vector<const Comparator*> comparators;
+  std::vector<bool> fill_cache_flags;
+  std::vector<bool> use_async_io_flags;
+  std::vector<bool> block_align_flags;
+  std::vector<size_t> super_block_alignment_sizes;
+  std::vector<size_t> super_block_alignment_space_overhead_ratios;
+};
+
+std::vector<bool> IOUringFlags() {
+#ifdef ROCKSDB_IOURING_PRESENT
+  return {false, true};
+#else
+  return {false};
+#endif
+}
+
 INSTANTIATE_TEST_CASE_P(
     BlockBasedTableReaderTest, BlockBasedTableReaderTest,
-    ::testing::Combine(
-        ::testing::ValuesIn(GetSupportedCompressions()), ::testing::Bool(),
-        ::testing::Values(
-            BlockBasedTableOptions::IndexType::kBinarySearch,
-            BlockBasedTableOptions::IndexType::kHashSearch,
-            BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch,
-            BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey),
-        ::testing::Values(false), ::testing::ValuesIn(test::GetUDTTestModes()),
-        ::testing::Values(1, 2), ::testing::Values(0, 4096),
-        ::testing::Values(false),
-        ::testing::Values(BytewiseComparator(), ReverseBytewiseComparator())));
+    ::testing::ValuesIn(BlockBasedTableReaderTestParamBuilder()
+                            .WithUDTTestModes(test::GetUDTTestModes())
+                            .build()));
+
+INSTANTIATE_TEST_CASE_P(
+    BlockBasedTableReaderMultiScanAsyncIOTest,
+    BlockBasedTableReaderMultiScanAsyncIOTest,
+    ::testing::ValuesIn(BlockBasedTableReaderTestParamBuilder()
+                            .WithComparators({BytewiseComparator(),
+                                              ReverseBytewiseComparator()})
+                            .WithFillCacheFlags(Bool())
+                            .WithUseAsyncIoFlags(IOUringFlags())
+                            .build()));
+
+INSTANTIATE_TEST_CASE_P(
+    BlockBasedTableReaderMultiScanTest, BlockBasedTableReaderMultiScanTest,
+    ::testing::ValuesIn(BlockBasedTableReaderTestParamBuilder()
+                            .WithComparators({BytewiseComparator(),
+                                              ReverseBytewiseComparator()})
+                            .build()));
+
 INSTANTIATE_TEST_CASE_P(
     BlockBasedTableReaderGetTest, BlockBasedTableReaderGetTest,
-    ::testing::Combine(
-        ::testing::ValuesIn(GetSupportedCompressions()), ::testing::Bool(),
-        ::testing::Values(
-            BlockBasedTableOptions::IndexType::kBinarySearch,
-            BlockBasedTableOptions::IndexType::kHashSearch,
-            BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch,
-            BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey),
-        ::testing::Values(false), ::testing::ValuesIn(test::GetUDTTestModes()),
-        ::testing::Values(1, 2), ::testing::Values(0, 4096),
-        ::testing::Values(false, true),
-        ::testing::Values(BytewiseComparator(), ReverseBytewiseComparator())));
+    ::testing::ValuesIn(BlockBasedTableReaderTestParamBuilder()
+                            .WithUDTTestModes(test::GetUDTTestModes())
+                            .WithSameKeyDiffTsFlags(Bool())
+                            .WithComparators({BytewiseComparator(),
+                                              ReverseBytewiseComparator()})
+                            .WithFillCacheFlags({false})
+                            .build()));
+
+INSTANTIATE_TEST_CASE_P(
+    BlockBasedTableReaderSuperBlockAlignTest, BlockBasedTableReaderGetTest,
+    ::testing::ValuesIn(
+        BlockBasedTableReaderTestParamBuilder()
+            .WithIndexTypes(
+                {BlockBasedTableOptions::IndexType::kBinarySearch,
+                 BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch})
+            .WithFillCacheFlags({false})
+            .WithBlockAlignFlags(Bool())
+            .WithSuperBlockAlignmentSizes({0, 32 * 1024, 16 * 1024})
+            .WithSuperBlockAlignmentSpaceOverheadRatios({0, 4, 256})
+            .build()));
+
 INSTANTIATE_TEST_CASE_P(
     StrictCapacityLimitReaderTest, StrictCapacityLimitReaderTest,
-    ::testing::Combine(
-        ::testing::ValuesIn(GetSupportedCompressions()), ::testing::Bool(),
-        ::testing::Values(
-            BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch),
-        ::testing::Values(false), ::testing::ValuesIn(test::GetUDTTestModes()),
-        ::testing::Values(1, 2), ::testing::Values(0),
-        ::testing::Values(false, true),
-        ::testing::Values(BytewiseComparator(), ReverseBytewiseComparator())));
+    ::testing::ValuesIn(
+        BlockBasedTableReaderTestParamBuilder()
+            .WithIndexTypes(
+                {BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch})
+            .WithUDTTestModes(test::GetUDTTestModes())
+            .WithCompressionDictByteCounts({0})
+            .WithSameKeyDiffTsFlags(Bool())
+            .WithFillCacheFlags({false})
+            .build()));
+
 INSTANTIATE_TEST_CASE_P(
     VerifyChecksum, BlockBasedTableReaderTestVerifyChecksum,
-    ::testing::Combine(
-        ::testing::ValuesIn(GetSupportedCompressions()),
-        ::testing::Values(false),
-        ::testing::Values(
-            BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch),
-        ::testing::Values(true), ::testing::ValuesIn(test::GetUDTTestModes()),
-        ::testing::Values(1, 2), ::testing::Values(0), ::testing::Values(false),
-        ::testing::Values(BytewiseComparator(), ReverseBytewiseComparator())));
-
+    ::testing::ValuesIn(
+        BlockBasedTableReaderTestParamBuilder()
+            .WithUseDirectReadFlags({false})
+            .WithIndexTypes(
+                {BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch})
+            .WithNoBlockCacheFlags({true})
+            .WithUDTTestModes(test::GetUDTTestModes())
+            .WithCompressionDictByteCounts({0})
+            .WithFillCacheFlags({false})
+            .build()));
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/table/block_based/block_builder.cc b/table/block_based/block_builder.cc
index f3a2834ab1d0..4c2cb40094d7 100644
--- a/table/block_based/block_builder.cc
+++ b/table/block_based/block_builder.cc
@@ -151,11 +151,13 @@ Slice BlockBuilder::Finish() {
 }
 
 void BlockBuilder::Add(const Slice& key, const Slice& value,
-                       const Slice* const delta_value) {
+                       const Slice* const delta_value,
+                       bool skip_delta_encoding) {
   // Ensure no unsafe mixing of Add and AddWithLastKey
   assert(!add_with_last_key_called_);
 
-  AddWithLastKeyImpl(key, value, last_key_, delta_value, buffer_.size());
+  AddWithLastKeyImpl(key, value, last_key_, delta_value, skip_delta_encoding,
+                     buffer_.size());
   if (use_delta_encoding_) {
     // Update state
     // We used to just copy the changed data, but it appears to be
@@ -166,7 +168,8 @@ void BlockBuilder::Add(const Slice& key, const Slice& value,
 
 void BlockBuilder::AddWithLastKey(const Slice& key, const Slice& value,
                                   const Slice& last_key_param,
-                                  const Slice* const delta_value) {
+                                  const Slice* const delta_value,
+                                  bool skip_delta_encoding) {
   // Ensure no unsafe mixing of Add and AddWithLastKey
   assert(last_key_.empty());
 #ifndef NDEBUG
@@ -185,17 +188,18 @@ void BlockBuilder::AddWithLastKey(const Slice& key, const Slice& value,
 
   Slice last_key(last_key_param.data(), last_key_size * (buffer_size > 0));
 
-  AddWithLastKeyImpl(key, value, last_key, delta_value, buffer_size);
+  AddWithLastKeyImpl(key, value, last_key, delta_value, skip_delta_encoding,
+                     buffer_size);
 }
 
 inline void BlockBuilder::AddWithLastKeyImpl(const Slice& key,
                                              const Slice& value,
                                              const Slice& last_key,
                                              const Slice* const delta_value,
+                                             bool skip_delta_encoding,
                                              size_t buffer_size) {
   assert(!finished_);
   assert(counter_ <= block_restart_interval_);
-  assert(!use_value_delta_encoding_ || delta_value);
   std::string key_buf;
   std::string last_key_buf;
   const Slice key_to_persist = MaybeStripTimestampFromKey(&key_buf, key);
@@ -211,7 +215,7 @@ inline void BlockBuilder::AddWithLastKeyImpl(const Slice& key,
     restarts_.push_back(static_cast<uint32_t>(buffer_size));
     estimate_ += sizeof(uint32_t);
     counter_ = 0;
-  } else if (use_delta_encoding_) {
+  } else if (use_delta_encoding_ && !skip_delta_encoding) {
     // See how much sharing to do with previous string
     shared = key_to_persist.difference_offset(last_key_persisted);
   }
@@ -235,6 +239,7 @@ inline void BlockBuilder::AddWithLastKeyImpl(const Slice& key,
   // simplify the decoding, where it can figure which decoding to use simply by
   // looking at the shared bytes size.
   if (shared != 0 && use_value_delta_encoding_) {
+    assert(delta_value != nullptr);
     buffer_.append(delta_value->data(), delta_value->size());
   } else {
     buffer_.append(value.data(), value.size());
diff --git a/table/block_based/block_builder.h b/table/block_based/block_builder.h
index 37e2c8ee69d2..6cc9d836ab31 100644
--- a/table/block_based/block_builder.h
+++ b/table/block_based/block_builder.h
@@ -46,7 +46,8 @@ class BlockBuilder {
   // AddWithLastKey() in contexts where previous added key is already known
   // and delta encoding might be used.
   void Add(const Slice& key, const Slice& value,
-           const Slice* const delta_value = nullptr);
+           const Slice* const delta_value = nullptr,
+           bool skip_delta_encoding = false);
 
   // A faster version of Add() if the previous key is already known for all
   // Add()s.
@@ -59,7 +60,8 @@ class BlockBuilder {
   // DO NOT mix with Add() between Resets.
   void AddWithLastKey(const Slice& key, const Slice& value,
                       const Slice& last_key,
-                      const Slice* const delta_value = nullptr);
+                      const Slice* const delta_value = nullptr,
+                      bool skip_delta_encoding = false);
 
   // Finish building the block and return a slice that refers to the
   // block contents.  The returned slice will remain valid for the
@@ -86,7 +88,7 @@ class BlockBuilder {
   inline void AddWithLastKeyImpl(const Slice& key, const Slice& value,
                                  const Slice& last_key,
                                  const Slice* const delta_value,
-                                 size_t buffer_size);
+                                 bool skip_delta_encoding, size_t buffer_size);
 
   inline const Slice MaybeStripTimestampFromKey(std::string* key_buf,
                                                 const Slice& key);
diff --git a/table/block_based/index_builder.cc b/table/block_based/index_builder.cc
index 2124579f82c4..56e539da1eb5 100644
--- a/table/block_based/index_builder.cc
+++ b/table/block_based/index_builder.cc
@@ -243,13 +243,15 @@ void PartitionedIndexBuilder::MaybeFlush(const Slice& index_key,
 }
 
 void PartitionedIndexBuilder::FinishIndexEntry(const BlockHandle& block_handle,
-                                               PreparedIndexEntry* base_entry) {
+                                               PreparedIndexEntry* base_entry,
+                                               bool skip_delta_encoding) {
   using SPIE = ShortenedIndexBuilder::ShortenedPreparedIndexEntry;
   SPIE* entry = static_cast<SPIE*>(base_entry);
 
   MaybeFlush(entry->separator_with_seq, block_handle);
 
-  sub_index_builder_->FinishIndexEntry(block_handle, base_entry);
+  sub_index_builder_->FinishIndexEntry(block_handle, base_entry,
+                                       skip_delta_encoding);
   std::swap(entries_.back().key, entry->separator_with_seq);
 
   if (!must_use_separator_with_seq_ && entry->must_use_separator_with_seq) {
@@ -264,16 +266,16 @@ void PartitionedIndexBuilder::FinishIndexEntry(const BlockHandle& block_handle,
 Slice PartitionedIndexBuilder::AddIndexEntry(
     const Slice& last_key_in_current_block,
     const Slice* first_key_in_next_block, const BlockHandle& block_handle,
-    std::string* separator_scratch) {
+    std::string* separator_scratch, bool skip_delta_encoding) {
   // At least when running without parallel compression, maintain behavior of
   // avoiding a last index partition with just one entry
   if (first_key_in_next_block) {
     MaybeFlush(last_key_in_current_block, block_handle);
   }
 
-  auto sep = sub_index_builder_->AddIndexEntry(last_key_in_current_block,
-                                               first_key_in_next_block,
-                                               block_handle, separator_scratch);
+  auto sep = sub_index_builder_->AddIndexEntry(
+      last_key_in_current_block, first_key_in_next_block, block_handle,
+      separator_scratch, skip_delta_encoding);
   entries_.back().key.assign(sep.data(), sep.size());
 
   if (!must_use_separator_with_seq_ &&
diff --git a/table/block_based/index_builder.h b/table/block_based/index_builder.h
index 9cf498ea25d3..b1e9dea46cfb 100644
--- a/table/block_based/index_builder.h
+++ b/table/block_based/index_builder.h
@@ -68,6 +68,9 @@ class IndexBuilder {
   //                           the last one in the table
   // @separator_scratch: a scratch buffer to back a computed separator between
   //                     those, as needed. May be modified on each call.
+  // @skip_delta_encoding: whether to skip delta encoding for this index entry
+  //                       for cases of violating the assumption that this
+  //                       block_handle starts where the last one ended.
   // @return: the key or separator stored in the index, which could be
   //          last_key_in_current_block or a computed separator backed by
   //          separator_scratch or last_key_in_current_block.
@@ -75,7 +78,8 @@ class IndexBuilder {
   virtual Slice AddIndexEntry(const Slice& last_key_in_current_block,
                               const Slice* first_key_in_next_block,
                               const BlockHandle& block_handle,
-                              std::string* separator_scratch) = 0;
+                              std::string* separator_scratch,
+                              bool skip_delta_encoding) = 0;
 
   // An abstract (extensible) holder for passing data from PrepareIndexEntry to
   // FinishIndexEntry (see below).
@@ -118,7 +122,8 @@ class IndexBuilder {
   // External synchronization ensures Finish is only called after all the
   // FinishIndexEntry calls have completed.
   virtual void FinishIndexEntry(const BlockHandle& block_handle,
-                                PreparedIndexEntry* entry) = 0;
+                                PreparedIndexEntry* entry,
+                                bool skip_delta_encoding) = 0;
 
   // This method will be called whenever a key is added. The subclasses may
   // override OnKeyAdded() if they need to collect additional information.
@@ -293,12 +298,14 @@ class ShortenedIndexBuilder : public IndexBuilder {
   void AddIndexEntryImpl(const Slice& separator_with_seq,
                          const Slice& first_internal_key,
                          const BlockHandle& block_handle,
-                         bool must_use_separator_with_seq) {
+                         bool must_use_separator_with_seq,
+                         bool skip_delta_encoding) {
     IndexValue entry(block_handle, first_internal_key);
     std::string encoded_entry;
     std::string delta_encoded_entry;
     entry.EncodeTo(&encoded_entry, include_first_key_, nullptr);
-    if (use_value_delta_encoding_ && !last_encoded_handle_.IsNull()) {
+    if (use_value_delta_encoding_ && !last_encoded_handle_.IsNull() &&
+        !skip_delta_encoding) {
       entry.EncodeTo(&delta_encoded_entry, include_first_key_,
                      &last_encoded_handle_);
     } else {
@@ -318,11 +325,11 @@ class ShortenedIndexBuilder : public IndexBuilder {
     // What are the implications if a "FindShortInternalKeySuccessor"
     // optimization is provided.
     index_block_builder_.Add(separator_with_seq, encoded_entry,
-                             &delta_encoded_entry_slice);
+                             &delta_encoded_entry_slice, skip_delta_encoding);
     if (!must_use_separator_with_seq) {
-      index_block_builder_without_seq_.Add(ExtractUserKey(separator_with_seq),
-                                           encoded_entry,
-                                           &delta_encoded_entry_slice);
+      index_block_builder_without_seq_.Add(
+          ExtractUserKey(separator_with_seq), encoded_entry,
+          &delta_encoded_entry_slice, skip_delta_encoding);
     }
 
     ++num_index_entries_;
@@ -331,7 +338,8 @@ class ShortenedIndexBuilder : public IndexBuilder {
   Slice AddIndexEntry(const Slice& last_key_in_current_block,
                       const Slice* first_key_in_next_block,
                       const BlockHandle& block_handle,
-                      std::string* separator_scratch) override {
+                      std::string* separator_scratch,
+                      bool skip_delta_encoding) override {
     Slice separator_with_seq = GetSeparatorWithSeq(
         last_key_in_current_block, first_key_in_next_block, separator_scratch);
 
@@ -339,7 +347,7 @@ class ShortenedIndexBuilder : public IndexBuilder {
     Slice first_internal_key = GetFirstInternalKey(&first_internal_key_buf);
 
     AddIndexEntryImpl(separator_with_seq, first_internal_key, block_handle,
-                      must_use_separator_with_seq_);
+                      must_use_separator_with_seq_, skip_delta_encoding);
     current_block_first_internal_key_.clear();
     return separator_with_seq;
   }
@@ -393,11 +401,13 @@ class ShortenedIndexBuilder : public IndexBuilder {
   }
 
   void FinishIndexEntry(const BlockHandle& block_handle,
-                        PreparedIndexEntry* base_entry) override {
+                        PreparedIndexEntry* base_entry,
+                        bool skip_delta_encoding) override {
     ShortenedPreparedIndexEntry* entry =
         static_cast<ShortenedPreparedIndexEntry*>(base_entry);
     AddIndexEntryImpl(entry->separator_with_seq, entry->first_internal_key,
-                      block_handle, entry->must_use_separator_with_seq);
+                      block_handle, entry->must_use_separator_with_seq,
+                      skip_delta_encoding);
   }
 
   using IndexBuilder::Finish;
@@ -495,11 +505,12 @@ class HashIndexBuilder : public IndexBuilder {
   Slice AddIndexEntry(const Slice& last_key_in_current_block,
                       const Slice* first_key_in_next_block,
                       const BlockHandle& block_handle,
-                      std::string* separator_scratch) override {
+                      std::string* separator_scratch,
+                      bool skip_delta_encoding) override {
     ++current_restart_index_;
     return primary_index_builder_.AddIndexEntry(
         last_key_in_current_block, first_key_in_next_block, block_handle,
-        separator_scratch);
+        separator_scratch, skip_delta_encoding);
   }
 
   std::unique_ptr<PreparedIndexEntry> CreatePreparedIndexEntry() override {
@@ -515,8 +526,10 @@ class HashIndexBuilder : public IndexBuilder {
   }
 
   void FinishIndexEntry(const BlockHandle& block_handle,
-                        PreparedIndexEntry* entry) override {
-    primary_index_builder_.FinishIndexEntry(block_handle, entry);
+                        PreparedIndexEntry* entry,
+                        bool skip_delta_encoding) override {
+    primary_index_builder_.FinishIndexEntry(block_handle, entry,
+                                            skip_delta_encoding);
   }
 
   void OnKeyAdded(const Slice& key,
@@ -626,14 +639,16 @@ class PartitionedIndexBuilder : public IndexBuilder {
   Slice AddIndexEntry(const Slice& last_key_in_current_block,
                       const Slice* first_key_in_next_block,
                       const BlockHandle& block_handle,
-                      std::string* separator_scratch) override;
+                      std::string* separator_scratch,
+                      bool skip_delta_encoding) override;
 
   std::unique_ptr<PreparedIndexEntry> CreatePreparedIndexEntry() override;
   void PrepareIndexEntry(const Slice& last_key_in_current_block,
                          const Slice* first_key_in_next_block,
                          PreparedIndexEntry* out) override;
   void FinishIndexEntry(const BlockHandle& block_handle,
-                        PreparedIndexEntry* entry) override;
+                        PreparedIndexEntry* entry,
+                        bool skip_delta_encoding) override;
   void MaybeFlush(const Slice& index_key, const BlockHandle& index_value);
 
   Status Finish(IndexBlocks* index_blocks,
diff --git a/table/block_based/index_builder_test.cc b/table/block_based/index_builder_test.cc
index 28b138b53f5e..d398c214b70b 100644
--- a/table/block_based/index_builder_test.cc
+++ b/table/block_based/index_builder_test.cc
@@ -55,13 +55,13 @@ class IndexBuilderTest
 
       if (i == num_entries) {
         // Last entry - no next key
-        builder->AddIndexEntry(key_current, nullptr, handle,
-                               &separator_scratch);
+        builder->AddIndexEntry(key_current, nullptr, handle, &separator_scratch,
+                               false);
       } else {
         std::string key_next = MakeKey(i + 1);
         Slice key_next_slice(key_next);
         builder->AddIndexEntry(key_current, &key_next_slice, handle,
-                               &separator_scratch);
+                               &separator_scratch, false);
       }
 
       if (estimates) {
diff --git a/table/block_based/partitioned_filter_block_test.cc b/table/block_based/partitioned_filter_block_test.cc
index a5aa94a8e334..958b195c48d0 100644
--- a/table/block_based/partitioned_filter_block_test.cc
+++ b/table/block_based/partitioned_filter_block_test.cc
@@ -315,7 +315,8 @@ class PartitionedFilterBlockTest
         std::string(*InternalKey(user_key, 0, ValueType::kTypeValue).rep());
     BlockHandle dont_care_block_handle(1, 1);
     std::string scratch;
-    builder->AddIndexEntry(key, nullptr, dont_care_block_handle, &scratch);
+    builder->AddIndexEntry(key, nullptr, dont_care_block_handle, &scratch,
+                           false);
   }
 
   void CutABlock(PartitionedIndexBuilder* builder, const std::string& user_key,
@@ -327,7 +328,8 @@ class PartitionedFilterBlockTest
     BlockHandle dont_care_block_handle(1, 1);
     Slice slice = Slice(next_key.data(), next_key.size());
     std::string scratch;
-    builder->AddIndexEntry(key, &slice, dont_care_block_handle, &scratch);
+    builder->AddIndexEntry(key, &slice, dont_care_block_handle, &scratch,
+                           false);
   }
 
   int CountNumOfIndexPartitions(PartitionedIndexBuilder* builder) {
diff --git a/table/block_based/user_defined_index_wrapper.h b/table/block_based/user_defined_index_wrapper.h
index 4c37289c6c92..37860eef38e6 100644
--- a/table/block_based/user_defined_index_wrapper.h
+++ b/table/block_based/user_defined_index_wrapper.h
@@ -41,7 +41,8 @@ class UserDefinedIndexBuilderWrapper : public IndexBuilder {
   Slice AddIndexEntry(const Slice& last_key_in_current_block,
                       const Slice* first_key_in_next_block,
                       const BlockHandle& block_handle,
-                      std::string* separator_scratch) override {
+                      std::string* separator_scratch,
+                      bool skip_delta_encoding) override {
     UserDefinedIndexBuilder::BlockHandle handle;
     handle.offset = block_handle.offset();
     handle.size = block_handle.size();
@@ -66,7 +67,7 @@ class UserDefinedIndexBuilderWrapper : public IndexBuilder {
     }
     return internal_index_builder_->AddIndexEntry(
         last_key_in_current_block, first_key_in_next_block, block_handle,
-        separator_scratch);
+        separator_scratch, skip_delta_encoding);
   }
 
   // Not supported with parallel compression
@@ -82,9 +83,11 @@ class UserDefinedIndexBuilderWrapper : public IndexBuilder {
     assert(false);
   }
   void FinishIndexEntry(const BlockHandle& block_handle,
-                        PreparedIndexEntry* entry) override {
+                        PreparedIndexEntry* entry,
+                        bool skip_delta_encoding) override {
     (void)block_handle;
     (void)entry;
+    (void)skip_delta_encoding;
     assert(false);
   }
 
diff --git a/table/table_test.cc b/table/table_test.cc
index efb805f0e404..37abb0d824e5 100644
--- a/table/table_test.cc
+++ b/table/table_test.cc
@@ -1140,15 +1140,20 @@ class TableTest : public testing::Test {
 
 class GeneralTableTest : public TableTest {};
 class BlockBasedTableTestBase : public TableTest {};
-class BlockBasedTableTest
-    : public BlockBasedTableTestBase,
-      virtual public ::testing::WithParamInterface<uint32_t> {
+class BlockBasedTableTest : public BlockBasedTableTestBase,
+                            virtual public ::testing::WithParamInterface<
+                                std::tuple<uint32_t, size_t, size_t>> {
  public:
-  BlockBasedTableTest() : format_(GetParam()) { env_ = Env::Default(); }
+  BlockBasedTableTest() : format_(std::get<0>(GetParam())) {
+    env_ = Env::Default();
+  }
 
   BlockBasedTableOptions GetBlockBasedTableOptions() {
     BlockBasedTableOptions options;
     options.format_version = format_;
+    auto param = GetParam();
+    options.super_block_alignment_size = std::get<1>(param);
+    options.super_block_alignment_space_overhead_ratio = std::get<2>(param);
     return options;
   }
 
@@ -1380,8 +1385,12 @@ class FileChecksumTestHelper {
 
 uint64_t FileChecksumTestHelper::checksum_file_num_ = 1;
 
-INSTANTIATE_TEST_CASE_P(FormatVersions, BlockBasedTableTest,
-                        testing::ValuesIn(test::kFooterFormatVersionsToTest));
+INSTANTIATE_TEST_CASE_P(
+    FormatVersions, BlockBasedTableTest,
+    testing::Combine(testing::ValuesIn(test::kFooterFormatVersionsToTest),
+                     testing::Values(0, 128 * 1024, 512 * 1024,
+                                     2 * 1024 * 1024),
+                     testing::Values(2048, 32, 128)));
 
 // This test serves as the living tutorial for the prefix scan of user collected
 // properties.
diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc
index fd07e1d1b63e..c8354840239f 100644
--- a/tools/db_bench_tool.cc
+++ b/tools/db_bench_tool.cc
@@ -717,6 +717,16 @@ DEFINE_bool(block_align,
             ROCKSDB_NAMESPACE::BlockBasedTableOptions().block_align,
             "Align data blocks on page size");
 
+DEFINE_uint64(
+    super_block_alignment_size,
+    ROCKSDB_NAMESPACE::BlockBasedTableOptions().super_block_alignment_size,
+    "Configure super block size");
+
+DEFINE_uint64(super_block_alignment_space_overhead_ratio,
+              ROCKSDB_NAMESPACE::BlockBasedTableOptions()
+                  .super_block_alignment_space_overhead_ratio,
+              "Configure space overhead for super block alignment");
+
 DEFINE_int64(prepopulate_block_cache, 0,
              "Pre-populate hot/warm blocks in block cache. 0 to disable and 1 "
              "to insert during flush");
diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index d75eb57fff34..e739611c0614 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -373,6 +373,10 @@ def apply_random_seed_per_iteration():
     "enable_sst_partitioner_factory": lambda: random.choice([0, 1]),
     "enable_do_not_compress_roles": lambda: random.choice([0, 1]),
     "block_align": lambda: random.choice([0, 1]),
+    "super_block_alignment_size": lambda: random.choice(
+        [0, 128 * 1024, 512 * 1024, 2 * 1024 * 1024]
+    ),
+    "super_block_alignment_space_overhead_ratio": lambda: random.choice([0, 32, 4096]),
     "lowest_used_cache_tier": lambda: random.choice([0, 1, 2]),
     "enable_custom_split_merge": lambda: random.choice([0, 1]),
     "adm_policy": lambda: random.choice([0, 1, 2, 3]),
diff --git a/util/file_reader_writer_test.cc b/util/file_reader_writer_test.cc
index 35708aa7d61b..3ac8b9fe782b 100644
--- a/util/file_reader_writer_test.cc
+++ b/util/file_reader_writer_test.cc
@@ -1118,7 +1118,7 @@ TEST_F(WritableFileWriterIOPriorityTest, Append) {
 }
 
 TEST_F(WritableFileWriterIOPriorityTest, Pad) {
-  ASSERT_OK(writer_->Pad(IOOptions(), 500));
+  ASSERT_OK(writer_->Pad(IOOptions(), 500, kDefaultPageSize));
 }
 
 TEST_F(WritableFileWriterIOPriorityTest, Flush) {
diff --git a/utilities/transactions/lock/point/point_lock_manager.cc b/utilities/transactions/lock/point/point_lock_manager.cc
index 78f6073082b6..05386b16bce0 100644
--- a/utilities/transactions/lock/point/point_lock_manager.cc
+++ b/utilities/transactions/lock/point/point_lock_manager.cc
@@ -782,9 +782,7 @@ Status PointLockManager::AcquireLocked(LockMap* lock_map, LockMapStripe* stripe,
 void PointLockManager::UnLockKey(PessimisticTransaction* txn,
                                  const std::string& key, LockMapStripe* stripe,
                                  LockMap* lock_map, Env* env) {
-#ifdef NDEBUG
   (void)env;
-#endif
   TransactionID txn_id = txn->GetID();
 
   auto stripe_iter = stripe->keys.find(key);

From 9d3afcf543087692e5419069fb2372aaa04c7aaf Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Thu, 2 Oct 2025 08:34:08 -0700
Subject: [PATCH 322/500] Fix regression in LZ4 compression performance since
 10.6 (#14017)

Summary:
In RocksDB 10.6 with https://github.com/facebook/rocksdb/issues/13805, due to inaccurate testing of an async system, it went undetected at the time that LZ4 compression was using more CPU despite making a change to reuse stream objects which dramatically improved LZ4HC compression efficiency.

This change switches to using a basic LZ4 compress API which appears to be faster than all of these:
* Legacy behavior of creating LZ4_stream_t for each compression
* 10.6-10.7 behavior of re-using streams between compressions for the same file (with stream-as-WorkingArea)
* using LZ4's extState APIs without streams (with extState-as-WorkingArea) (data not shown in below results)

Also in this PR: more improvements to sst_dump --recompress, which is arguably the best SST construction benchmark right now since db_bench seems to be so noisy due to backgroun flush+compaction, even with no compaction (FIFO). Streamlined some output and added a SST read time test, mostly for decompression performance.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14017

Test Plan:
Performance test using sst_dump --recompress with newer sst_dump back-ported to 10.5:
```
./sst_dump --command=recompress --compression_types=kLZ4Compression
test5.sst --compression_level_from=-6 --compression_level_to=-1
```
and with default compression level.

10.5:
```
Cx level: -6    Cx size:   61608137 Write usec:     880404
Cx level: -5    Cx size:   60793749 Write usec:     840903
Cx level: -4    Cx size:   58134030 Write usec:     836365
Cx level: -3    Cx size:   55193773 Write usec:     857113
Cx level: -2    Cx size:   54013891 Write usec:     855642
Cx level: -1    Cx size:   50400393 Write usec:     865194
Cx level: 32767 Cx size:   50400393 Write usec:     886310
```

Before this change (showing the regression, more time, from 10.6:
```
Cx level: -6    Cx size:   61608137 Write usec:     933448
Cx level: -5    Cx size:   60793749 Write usec:     893826
Cx level: -4    Cx size:   58134030 Write usec:     891138
Cx level: -3    Cx size:   55193773 Write usec:     898461
Cx level: -2    Cx size:   54013891 Write usec:     897485
Cx level: -1    Cx size:   50400393 Write usec:     936970
Cx level: 32767 Cx size:   50400393 Write usec:     958764
```

After this change (faster than both the above):
```
Cx level: -6    Cx size:   63641883 Write usec:     874190
Cx level: -5    Cx size:   58860032 Write usec:     834662
Cx level: -4    Cx size:   57150188 Write usec:     832707
Cx level: -3    Cx size:   58791894 Write usec:     850305
Cx level: -2    Cx size:   53145885 Write usec:     839574
Cx level: -1    Cx size:   49809139 Write usec:     845639
Cx level: 32767 Cx size:   49809139 Write usec:     875199
```

Similar tests with dictionary compression show essentially no difference (need to use stream APIs and reuse doesn't seem to matter). LZ4HC also unaffected (still improved vs. 10.5)

Reviewed By: hx235

Differential Revision: D83722880

Pulled By: pdillinger

fbshipit-source-id: 30149dd187686d5dd98321e6aa7d74bd7653a905
---
 table/sst_file_dumper.cc                      | 91 +++++++++++++++----
 table/sst_file_dumper.h                       |  4 +-
 tools/sst_dump_tool.cc                        | 29 +++---
 .../performance_improvements/lz4.md           |  1 +
 util/compression.cc                           | 74 ++++++++++++++-
 5 files changed, 162 insertions(+), 37 deletions(-)
 create mode 100644 unreleased_history/performance_improvements/lz4.md

diff --git a/table/sst_file_dumper.cc b/table/sst_file_dumper.cc
index b095073b8f37..5197eb5383cc 100644
--- a/table/sst_file_dumper.cc
+++ b/table/sst_file_dumper.cc
@@ -231,7 +231,9 @@ Status SstFileDumper::DumpTable(const std::string& out_filename) {
 }
 
 Status SstFileDumper::CalculateCompressedTableSize(
-    const TableBuilderOptions& tb_options, TableProperties* props) {
+    const TableBuilderOptions& tb_options, TableProperties* props,
+    std::chrono::microseconds* write_time,
+    std::chrono::microseconds* read_time) {
   std::unique_ptr<Env> env(NewMemEnv(options_.env));
   std::unique_ptr<WritableFileWriter> dest_writer;
   Status s =
@@ -240,6 +242,8 @@ Status SstFileDumper::CalculateCompressedTableSize(
   if (!s.ok()) {
     return s;
   }
+  std::chrono::steady_clock::time_point start =
+      std::chrono::steady_clock::now();
   std::unique_ptr<TableBuilder> table_builder{
       tb_options.moptions.table_factory->NewTableBuilder(tb_options,
                                                          dest_writer.get())};
@@ -253,17 +257,69 @@ Status SstFileDumper::CalculateCompressedTableSize(
   if (!s.ok()) {
     return s;
   }
+  iter.reset();
   s = table_builder->Finish();
+  *write_time = std::chrono::duration_cast<std::chrono::microseconds>(
+      std::chrono::steady_clock::now() - start);
   if (!s.ok()) {
     return s;
   }
+  s = dest_writer->Close({});
+  if (!s.ok()) {
+    return s;
+  }
+  dest_writer.reset();
   *props = table_builder->GetTableProperties();
+  start = std::chrono::steady_clock::now();
+  TableReaderOptions reader_options(ioptions_, moptions_.prefix_extractor,
+                                    moptions_.compression_manager.get(),
+                                    soptions_, internal_comparator_,
+                                    0 /* block_protection_bytes_per_key */);
+  std::unique_ptr<RandomAccessFileReader> file_reader;
+  s = RandomAccessFileReader::Create(env->GetFileSystem(), testFileName,
+                                     soptions_, &file_reader, /*dbg=*/nullptr);
+  if (!s.ok()) {
+    return s;
+  }
+  std::unique_ptr<TableReader> table_reader;
+  s = tb_options.moptions.table_factory->NewTableReader(
+      reader_options, std::move(file_reader), table_builder->FileSize(),
+      &table_reader);
+  if (!s.ok()) {
+    return s;
+  }
+  iter.reset(table_reader->NewIterator(
+      read_options_, moptions_.prefix_extractor.get(), /*arena=*/nullptr,
+      /*skip_filters=*/false, TableReaderCaller::kSSTDumpTool));
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+  }
+  s = iter->status();
+  if (!s.ok()) {
+    return s;
+  }
+  iter.reset();
+  table_reader.reset();
+  file_reader.reset();
+  *read_time = std::chrono::duration_cast<std::chrono::microseconds>(
+      std::chrono::steady_clock::now() - start);
   return env->DeleteFile(testFileName);
 }
 
 Status SstFileDumper::ShowAllCompressionSizes(
     const std::vector<CompressionType>& compression_types,
     int32_t compress_level_from, int32_t compress_level_to) {
+#ifndef NDEBUG
+  fprintf(stdout,
+          "WARNING: Assertions are enabled; benchmarks unnecessarily slow\n");
+#endif
+  BlockBasedTableOptions bbto;
+  if (options_.table_factory->IsInstanceOf(
+          TableFactory::kBlockBasedTableName())) {
+    bbto = *(static_cast_with_check<BlockBasedTableFactory>(
+                 options_.table_factory.get()))
+                ->GetOptions<BlockBasedTableOptions>();
+  }
+
   for (CompressionType ctype : compression_types) {
     std::string cname;
     if (!GetStringFromCompressionType(&cname, ctype).ok()) {
@@ -273,10 +329,12 @@ Status SstFileDumper::ShowAllCompressionSizes(
     if (options_.compression_manager
             ? options_.compression_manager->SupportsCompressionType(ctype)
             : CompressionTypeSupported(ctype)) {
-      fprintf(stdout, "Compression: %-24s\n", cname.c_str());
       CompressionOptions compress_opt = options_.compression_opts;
+      fprintf(stdout,
+              "Compression: %-24s Block Size: %" PRIu64 "  Threads: %u\n",
+              cname.c_str(), bbto.block_size, compress_opt.parallel_threads);
       for (int32_t j = compress_level_from; j <= compress_level_to; j++) {
-        fprintf(stdout, "Compression level: %d", j);
+        fprintf(stdout, "Cx level: %d", j);
         compress_opt.level = j;
         Status s = ShowCompressionSize(ctype, compress_opt);
         if (!s.ok()) {
@@ -320,27 +378,26 @@ Status SstFileDumper::ShowCompressionSize(
       TablePropertiesCollectorFactory::Context::kUnknownColumnFamily,
       column_family_name, unknown_level, kUnknownNewestKeyTime);
   TableProperties props;
-  std::chrono::steady_clock::time_point start =
-      std::chrono::steady_clock::now();
-  Status s = CalculateCompressedTableSize(tb_opts, &props);
+  std::chrono::microseconds write_time;
+  std::chrono::microseconds read_time;
+  Status s =
+      CalculateCompressedTableSize(tb_opts, &props, &write_time, &read_time);
   if (!s.ok()) {
     return s;
   }
 
   uint64_t num_data_blocks = props.num_data_blocks;
 
-  std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now();
-  fprintf(stdout, " Comp size: %10" PRIu64, props.data_size);
-  fprintf(stdout, " Uncompressed: %10" PRIu64, props.uncompressed_data_size);
+  fprintf(stdout, " Cx size: %10" PRIu64, props.data_size);
+  fprintf(stdout, " Uncx size: %10" PRIu64, props.uncompressed_data_size);
   fprintf(stdout, " Ratio: %10s",
           std::to_string(static_cast<double>(props.uncompressed_data_size) /
                          static_cast<double>(props.data_size))
               .c_str());
-  fprintf(stdout, " Microsecs: %10s ",
-          std::to_string(
-              std::chrono::duration_cast<std::chrono::microseconds>(end - start)
-                  .count())
-              .c_str());
+  fprintf(stdout, " Write usec: %10s ",
+          std::to_string(write_time.count()).c_str());
+  fprintf(stdout, " Read usec: %10s ",
+          std::to_string(read_time.count()).c_str());
   const uint64_t compressed_blocks =
       opts.statistics->getAndResetTickerCount(NUMBER_BLOCK_COMPRESSED);
   const uint64_t not_compressed_blocks =
@@ -370,11 +427,11 @@ Status SstFileDumper::ShowCompressionSize(
                              : ((static_cast<double>(not_compressed_blocks) /
                                  static_cast<double>(num_data_blocks)) *
                                 100.0);
-  fprintf(stdout, " Comp count: %6" PRIu64 " (%5.1f%%)", compressed_blocks,
+  fprintf(stdout, " Cx count: %6" PRIu64 " (%5.1f%%)", compressed_blocks,
           compressed_pcnt);
-  fprintf(stdout, " Not compressed (ratio): %6" PRIu64 " (%5.1f%%)",
+  fprintf(stdout, " Not cx for ratio: %6" PRIu64 " (%5.1f%%)",
           ratio_not_compressed_blocks, ratio_not_compressed_pcnt);
-  fprintf(stdout, " Not compressed (abort): %6" PRIu64 " (%5.1f%%)\n",
+  fprintf(stdout, " Not cx otherwise: %6" PRIu64 " (%5.1f%%)\n",
           not_compressed_blocks, not_compressed_pcnt);
   return Status::OK();
 }
diff --git a/table/sst_file_dumper.h b/table/sst_file_dumper.h
index 22b1e860b4ee..329915fdd662 100644
--- a/table/sst_file_dumper.h
+++ b/table/sst_file_dumper.h
@@ -59,7 +59,9 @@ class SstFileDumper {
                              FilePrefetchBuffer* prefetch_buffer);
 
   Status CalculateCompressedTableSize(const TableBuilderOptions& tb_options,
-                                      TableProperties* props);
+                                      TableProperties* props,
+                                      std::chrono::microseconds* write_time,
+                                      std::chrono::microseconds* read_time);
 
   Status SetTableOptionsByMagicNumber(uint64_t table_magic_number);
   Status SetOldTableOptions();
diff --git a/tools/sst_dump_tool.cc b/tools/sst_dump_tool.cc
index 3e4a05b0473e..f09712838f96 100644
--- a/tools/sst_dump_tool.cc
+++ b/tools/sst_dump_tool.cc
@@ -181,7 +181,6 @@ int SSTDumpTool::Run(int argc, char const* const* argv, Options options) {
   bool list_meta_blocks = false;
   bool has_compression_level_from = false;
   bool has_compression_level_to = false;
-  bool has_specified_compression_types = false;
   std::string from_key;
   std::string to_key;
   std::string block_size_str;
@@ -258,7 +257,6 @@ int SSTDumpTool::Run(int argc, char const* const* argv, Options options) {
       std::string compression_types_csv = argv[i] + 20;
       std::istringstream iss(compression_types_csv);
       std::string compression_type;
-      has_specified_compression_types = true;
 
       while (std::getline(iss, compression_type, ',')) {
         auto iter =
@@ -392,12 +390,7 @@ int SSTDumpTool::Run(int argc, char const* const* argv, Options options) {
     }
   }
 
-  if (has_compression_level_from && has_compression_level_to) {
-    if (!has_specified_compression_types || compression_types.size() != 1) {
-      fprintf(stderr, "Specify one compression type.\n\n");
-      exit(1);
-    }
-  } else if (has_compression_level_from || has_compression_level_to) {
+  if (has_compression_level_from ^ has_compression_level_to) {
     fprintf(stderr,
             "Specify both --compression_level_from and "
             "--compression_level_to.\n\n");
@@ -536,14 +529,20 @@ int SSTDumpTool::Run(int argc, char const* const* argv, Options options) {
     }
 
     if (command == "recompress") {
-      fprintf(stdout, "Block Size: %zu  Threads: %u\n", block_size,
-              (unsigned)compression_parallel_threads);
-      // TODO: consider getting supported compressions from the compression
-      // manager
+      if (compression_types.empty()) {
+        if (options.compression_manager != nullptr) {
+          for (int c = 0; c < kDisableCompressionOption; ++c) {
+            if (options.compression_manager->SupportsCompressionType(
+                    static_cast<CompressionType>(c))) {
+              compression_types.emplace_back(static_cast<CompressionType>(c));
+            }
+          }
+        } else {
+          compression_types = GetSupportedCompressions();
+        }
+      }
       st = dumper.ShowAllCompressionSizes(
-          compression_types.empty() ? GetSupportedCompressions()
-                                    : compression_types,
-          compress_level_from, compress_level_to);
+          compression_types, compress_level_from, compress_level_to);
       if (!st.ok()) {
         fprintf(stderr, "Failed to recompress: %s\n", st.ToString().c_str());
         exit(1);
diff --git a/unreleased_history/performance_improvements/lz4.md b/unreleased_history/performance_improvements/lz4.md
new file mode 100644
index 000000000000..5ae1656dfa75
--- /dev/null
+++ b/unreleased_history/performance_improvements/lz4.md
@@ -0,0 +1 @@
+* Fixed a performance regression in LZ4 compression that started in version 10.6.0
diff --git a/util/compression.cc b/util/compression.cc
index f259bc947815..5831643d462f 100644
--- a/util/compression.cc
+++ b/util/compression.cc
@@ -516,7 +516,7 @@ class BuiltinBZip2CompressorV2 : public CompressorWithSimpleDictBase {
   }
 };
 
-class BuiltinLZ4CompressorV2 : public CompressorWithSimpleDictBase {
+class BuiltinLZ4CompressorV2WithDict : public CompressorWithSimpleDictBase {
  public:
   using CompressorWithSimpleDictBase::CompressorWithSimpleDictBase;
 
@@ -527,8 +527,8 @@ class BuiltinLZ4CompressorV2 : public CompressorWithSimpleDictBase {
   }
 
   std::unique_ptr<Compressor> CloneForDict(std::string&& dict_data) override {
-    return std::make_unique<BuiltinLZ4CompressorV2>(opts_,
-                                                    std::move(dict_data));
+    return std::make_unique<BuiltinLZ4CompressorV2WithDict>(
+        opts_, std::move(dict_data));
   }
 
   ManagedWorkingArea ObtainWorkingArea() override {
@@ -611,6 +611,72 @@ class BuiltinLZ4CompressorV2 : public CompressorWithSimpleDictBase {
   }
 };
 
+class BuiltinLZ4CompressorV2NoDict : public BuiltinLZ4CompressorV2WithDict {
+ public:
+  BuiltinLZ4CompressorV2NoDict(const CompressionOptions& opts)
+      : BuiltinLZ4CompressorV2WithDict(opts, /*dict_data=*/{}) {}
+
+  ManagedWorkingArea ObtainWorkingArea() override {
+    // Using an LZ4_stream_t between compressions and resetting with
+    // LZ4_resetStream_fast is actually slower than using a fresh LZ4_stream_t
+    // each time, or not involving a stream at all. Similarly, using an extState
+    // does not seem to offer a performance boost, perhaps a small regression.
+    return {};
+  }
+
+  void ReleaseWorkingArea(WorkingArea* wa) override {
+    // Should not be called
+    (void)wa;
+    assert(wa == nullptr);
+  }
+
+  Status CompressBlock(Slice uncompressed_data, char* compressed_output,
+                       size_t* compressed_output_size,
+                       CompressionType* out_compression_type,
+                       ManagedWorkingArea* wa) override {
+#ifdef LZ4
+    (void)wa;
+    auto [alg_output, alg_max_output_size] = StartCompressBlockV2(
+        uncompressed_data, compressed_output, *compressed_output_size);
+    if (alg_max_output_size == 0) {
+      // Compression bypassed
+      *compressed_output_size = 0;
+      *out_compression_type = kNoCompression;
+      return Status::OK();
+    }
+    int acceleration;
+    if (opts_.level < 0) {
+      acceleration = -opts_.level;
+    } else {
+      acceleration = 1;
+    }
+    auto outlen =
+        LZ4_compress_fast(uncompressed_data.data(), alg_output,
+                          static_cast<int>(uncompressed_data.size()),
+                          static_cast<int>(alg_max_output_size), acceleration);
+    if (outlen > 0) {
+      // Compression kept/successful
+      size_t output_size = static_cast<size_t>(
+          outlen + /*header size*/ (alg_output - compressed_output));
+      assert(output_size <= *compressed_output_size);
+      *compressed_output_size = output_size;
+      *out_compression_type = kLZ4Compression;
+      return Status::OK();
+    }
+    // Compression rejected
+    *compressed_output_size = 1;
+#else
+    (void)uncompressed_data;
+    (void)compressed_output;
+    (void)wa;
+    // Compression bypassed (not supported)
+    *compressed_output_size = 0;
+#endif
+    *out_compression_type = kNoCompression;
+    return Status::OK();
+  }
+};
+
 class BuiltinLZ4HCCompressorV2 : public CompressorWithSimpleDictBase {
  public:
   using CompressorWithSimpleDictBase::CompressorWithSimpleDictBase;
@@ -1508,7 +1574,7 @@ class BuiltinCompressionManagerV2 : public CompressionManager {
       case kBZip2Compression:
         return std::make_unique<BuiltinBZip2CompressorV2>(opts);
       case kLZ4Compression:
-        return std::make_unique<BuiltinLZ4CompressorV2>(opts);
+        return std::make_unique<BuiltinLZ4CompressorV2NoDict>(opts);
       case kLZ4HCCompression:
         return std::make_unique<BuiltinLZ4HCCompressorV2>(opts);
       case kXpressCompression:

From 7c22fbe0d511c5108d48ec16392e6cfa37edbc5d Mon Sep 17 00:00:00 2001
From: Xingbo Wang <xbw@fb.com>
Date: Fri, 3 Oct 2025 00:01:02 -0700
Subject: [PATCH 323/500] Disable a param combo in crash test to fix a data
 race (#14023)

Summary:
When inplace_update_support and memtable_veirfy_per_key_checksum_on_seek are enabled at the same time, it would cause data race in memtable.

inplace_update_support allows key/value pair in place update in memtable.

memtable_veirfy_per_key_checksum_on_seek performs key checksum verification during seek. It is possible that one thread is updating the key/value pair in place, while another thread is reading the key/value pair for checksum verification during seek.

Therefore, there these 2 configurations could not be enabled at the same time

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14023

Test Plan: local stress test run stops reporting race condition

Reviewed By: anand1976

Differential Revision: D83812322

Pulled By: xingbowang

fbshipit-source-id: 6cb9f0f3faa8deba97305bfe87266f2fe78e0501
---
 tools/db_crashtest.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index e739611c0614..40fb17a5f24a 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -1197,6 +1197,12 @@ def finalize_and_sanitize(src_params):
         dest_params["prefixpercent"] = 0
         dest_params["read_fault_one_in"] = 0
         dest_params["memtable_prefix_bloom_size_ratio"] = 0
+
+    # inplace update and key checksum verification during seek would cause race condition
+    # Therefore, when inplace_update_support is enabled, disable memtable_veirfy_per_key_checksum_on_seek
+    if dest_params["inplace_update_support"] == 1:
+        dest_params["memtable_veirfy_per_key_checksum_on_seek"] = 0
+
     return dest_params
 
 

From 2fab774697c453f807a640bbf27c2a50421f2361 Mon Sep 17 00:00:00 2001
From: Pierre Moulon <pierrem@meta.com>
Date: Fri, 3 Oct 2025 14:28:37 -0700
Subject: [PATCH 324/500] Typo fix (#14024)

Summary:
Pull Request resolved: https://github.com/facebook/rocksdb/pull/14024

Fix some typo found along the codebase

Reviewed By: pdillinger

Differential Revision: D83789182

fbshipit-source-id: feb24d7d47a6faaf735fcfd50dd3ecce4a6c8cd5
---
 HISTORY.md                                    | 16 ++++++-------
 cache/secondary_cache_adapter.cc              |  8 +++----
 db/db_follower_test.cc                        |  4 ++--
 db/log_reader.h                               |  8 +++----
 db/log_writer.h                               |  2 +-
 db/write_thread.h                             |  2 +-
 db_stress_tool/expected_value.h               |  8 +++----
 .../transactions/write_prepared_txn_db.cc     | 24 +++++++++----------
 8 files changed, 36 insertions(+), 36 deletions(-)

diff --git a/HISTORY.md b/HISTORY.md
index b9ba1074c3d9..01cdf940907b 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -4,7 +4,7 @@
 ## 10.7.0 (09/19/2025)
 ### New Features
 * Add the fail_if_no_udi_on_open flag in BlockBasedTableOption to control whether a missing user defined index block in a SST is a hard error or not.
-* A new flag memtable_veirfy_per_key_checksum_on_seek is added to AdvancedColumnFamilyOptions. When it is enabled, it will validate key checksum along the binary search path on skiplist based memtable during seek operation.
+* A new flag memtable_verify_per_key_checksum_on_seek is added to AdvancedColumnFamilyOptions. When it is enabled, it will validate key checksum along the binary search path on skiplist based memtable during seek operation.
 * Introduce option MultiScanArgs::use_async_io to enable asynchronous I/O during MultiScan, instead of waiting for I/O to be done in Prepare().
 * Add new option `MultiScanArgs::max_prefetch_size` that limits the memory usage of per file pinning of prefetched blocks.
 * Improved `sst_dump` by allowing standalone file and directory arguments without `--file=`. Also added new options and better output for `sst_dump --command=recompress`. See `sst_dump --help`
@@ -62,7 +62,7 @@ Together, a new configuration TransactionOptions::deadlock_timeout_us is added,
 * DB option skip_checking_sst_file_sizes_on_db_open is deprecated, in favor of validating file size in parallel in a thread pool, when db is opened. When DB is opened, with paranoid check enabled, a file with the wrong size would fail the DB open. With paranoid check disabled, the DB open would succeed, the column family with the corrupted file would not be read or write, while the other healthy column families could be read and write normally. When max_open_files option is not set to -1, only a subset of the files will be opened and checked. The rest of the files will be opened and checked when they are accessed.
 
 ### Behavior Changes
-* PessimisticTransaction::GetWaitingTxns now returns waiting transaction information even if the current transaction has timed out. This allows the information to be surfaced to users for debugging purposes once it is known that the timeout has occured.
+* PessimisticTransaction::GetWaitingTxns now returns waiting transaction information even if the current transaction has timed out. This allows the information to be surfaced to users for debugging purposes once it is known that the timeout has occurred.
 * A new API GetFileSize is added to FSRandomAccessFile interface class. It uses fstat vs stat on the posix implementation which is more efficient. Caller could use it to get file size faster. This function might be required in the future for FileSystem implementation outside of the RocksDB code base.
 * RocksDB now triggers eligible compactions every 12 hours when periodic compaction is configured. This solves a limitation of the compaction trigger mechanism, which would only trigger compaction after specific events like flush, compaction, or SetOptions.
 
@@ -119,7 +119,7 @@ system's prefetch) on SST file during compaction read
 * Deprecated API `DB::MaxMemCompactionLevel()`.
 * Deprecated `ReadOptions::ignore_range_deletions`.
 * Deprecated API `experimental::PromoteL0()`.
-* Added arbitrary string map for additional options to be overriden for remote compactions
+* Added arbitrary string map for additional options to be overridden for remote compactions
 * The fail_if_options_file_error option in DBOptions has been removed. The behavior now is to always return failure in any API that fails to persist the OPTIONS file.
 
 ### Behavior Changes
@@ -269,7 +269,7 @@ system's prefetch) on SST file during compaction read
 * In FIFO compaction, compactions for changing file temperature (configured by option `file_temperature_age_thresholds`) will compact one file at a time, instead of merging multiple eligible file together (#13018).
 * Support ingesting db generated files using hard link, i.e. IngestExternalFileOptions::move_files/link_files and IngestExternalFileOptions::allow_db_generated_files.
 * Add a new file ingestion option `IngestExternalFileOptions::link_files` to hard link input files and preserve original files links after ingestion.
-* DB::Close now untracks files in SstFileManager, making avaialble any space used
+* DB::Close now untracks files in SstFileManager, making available any space used
 by them. Prior to this change they would be orphaned until the DB is re-opened.
 
 ### Bug Fixes
@@ -465,7 +465,7 @@ MultiGetBenchmarks.multiGetList10 no_column_family 10000 16 100 1024 thrpt 25 76
 * Removed deprecated option `ColumnFamilyOptions::check_flush_compaction_key_order`
 * Remove the default `WritableFile::GetFileSize` and `FSWritableFile::GetFileSize` implementation that returns 0 and make it pure virtual, so that subclasses are enforced to explicitly provide an implementation.
 * Removed deprecated option `ColumnFamilyOptions::level_compaction_dynamic_file_size`
-* Removed tickers with typos "rocksdb.error.handler.bg.errro.count", "rocksdb.error.handler.bg.io.errro.count", "rocksdb.error.handler.bg.retryable.io.errro.count".
+* Removed tickers with typos "rocksdb.error.handler.bg.error.count", "rocksdb.error.handler.bg.io.error.count", "rocksdb.error.handler.bg.retryable.io.error.count".
 * Remove the force mode for `EnableFileDeletions` API because it is unsafe with no known legitimate use.
 * Removed deprecated option `ColumnFamilyOptions::ignore_max_compaction_bytes_for_input`
 * `sst_dump --command=check` now compares the number of records in a table with `num_entries` in table property, and reports corruption if there is a mismatch. API `SstFileDumper::ReadSequential()` is updated to optionally do this verification. (#12322)
@@ -492,7 +492,7 @@ MultiGetBenchmarks.multiGetList10 no_column_family 10000 16 100 1024 thrpt 25 76
 * Exposed options ttl via c api.
 
 ### Behavior Changes
-* `rocksdb.blobdb.blob.file.write.micros` expands to also measure time writing the header and footer. Therefore the COUNT may be higher and values may be smaller than before. For stacked BlobDB, it no longer measures the time of explictly flushing blob file.
+* `rocksdb.blobdb.blob.file.write.micros` expands to also measure time writing the header and footer. Therefore the COUNT may be higher and values may be smaller than before. For stacked BlobDB, it no longer measures the time of explicitly flushing blob file.
 * Files will be compacted to the next level if the data age exceeds periodic_compaction_seconds except for the last level.
 * Reduced the compaction debt ratio trigger for scheduling parallel compactions
 * For leveled compaction with default compaction pri (kMinOverlappingRatio), files marked for compaction will be prioritized over files not marked when picking a file from a level for compaction.
@@ -557,7 +557,7 @@ want to continue to use force enabling, they need to explicitly pass a `true` to
 
 ### Behavior Changes
 * During off-peak hours defined by `daily_offpeak_time_utc`, the compaction picker will select a larger number of files for periodic compaction. This selection will include files that are projected to expire by the next off-peak start time, ensuring that these files are not chosen for periodic compaction outside of off-peak hours.
-* If an error occurs when writing to a trace file after `DB::StartTrace()`, the subsequent trace writes are skipped to avoid writing to a file that has previously seen error. In this case, `DB::EndTrace()` will also return a non-ok status with info about the error occured previously in its status message.
+* If an error occurs when writing to a trace file after `DB::StartTrace()`, the subsequent trace writes are skipped to avoid writing to a file that has previously seen error. In this case, `DB::EndTrace()` will also return a non-ok status with info about the error occurred previously in its status message.
 * Deleting stale files upon recovery are delegated to SstFileManger if available so they can be rate limited.
 * Make RocksDB only call `TablePropertiesCollector::Finish()` once.
 * When `WAL_ttl_seconds > 0`, we now process archived WALs for deletion at least every `WAL_ttl_seconds / 2` seconds. Previously it could be less frequent in case of small `WAL_ttl_seconds` values when size-based expiration (`WAL_size_limit_MB > 0 `) was simultaneously enabled.
@@ -1345,7 +1345,7 @@ Note: The next release will be major release 7.0. See https://github.com/faceboo
 ### Public API change
 * Extend WriteBatch::AssignTimestamp and AssignTimestamps API so that both functions can accept an optional `checker` argument that performs additional checking on timestamp sizes.
 * Introduce a new EventListener callback that will be called upon the end of automatic error recovery.
-* Add IncreaseFullHistoryTsLow API so users can advance each column family's full_history_ts_low seperately.
+* Add IncreaseFullHistoryTsLow API so users can advance each column family's full_history_ts_low separately.
 * Add GetFullHistoryTsLow API so users can query current full_history_low value of specified column family.
 
 ### Performance Improvements
diff --git a/cache/secondary_cache_adapter.cc b/cache/secondary_cache_adapter.cc
index 2db601d2ecf8..c02e31227308 100644
--- a/cache/secondary_cache_adapter.cc
+++ b/cache/secondary_cache_adapter.cc
@@ -33,7 +33,7 @@ const char* kTieredCacheName = "TieredCache";
 // proportionally across the primary/secondary caches.
 //
 // The primary block cache is initially sized to the sum of the primary cache
-// budget + teh secondary cache budget, as follows -
+// budget + the secondary cache budget, as follows -
 //   |---------    Primary Cache Configured Capacity  -----------|
 //   |---Secondary Cache Budget----|----Primary Cache Budget-----|
 //
@@ -51,7 +51,7 @@ const char* kTieredCacheName = "TieredCache";
 // placeholder is counted against the primary cache. To compensate and count
 // a portion of it against the secondary cache, the secondary cache Deflate()
 // method is called to shrink it. Since the Deflate() causes the secondary
-// actual usage to shrink, it is refelcted here by releasing an equal amount
+// actual usage to shrink, it is reflected here by releasing an equal amount
 // from the pri_cache_res_ reservation. The Deflate() in the secondary cache
 // can be, but is not required to be, implemented using its own cache
 // reservation manager.
@@ -72,7 +72,7 @@ const char* kTieredCacheName = "TieredCache";
 // reservation is increased by an equal amount.
 //
 // Another way of implementing this would have been to simply split the user
-// reservation into primary and seconary components. However, this would
+// reservation into primary and secondary components. However, this would
 // require allocating a structure to track the associated secondary cache
 // reservation, which adds some complexity and overhead.
 //
@@ -621,7 +621,7 @@ Status CacheWithSecondaryAdapter::UpdateCacheReservationRatio(
   } else {
     // We're shrinking the ratio. Try to avoid unnecessary evictions -
     // 1. Lower the secondary cache capacity
-    // 2. Decrease pri_cache_res_ reservation to relect lower secondary
+    // 2. Decrease pri_cache_res_ reservation to reflect lower secondary
     //    cache utilization (decrease in capacity - decrease in share of cache
     //    reservations)
     // 3. Inflate the secondary cache to give it back the reduction in its
diff --git a/db/db_follower_test.cc b/db/db_follower_test.cc
index a0f35a46b619..c032464052c2 100644
--- a/db/db_follower_test.cc
+++ b/db/db_follower_test.cc
@@ -370,10 +370,10 @@ TEST_F(DBFollowerTest, RetryCatchupManifestRollover) {
 
 // This test creates 4 L0 files and compacts them. The follower, during catchup,
 // successfully instantiates 4 Versions corresponding to the 4 files (but
-// donesn't install them yet), followed by deleting those 4 and adding a new
+// doesn't install them yet), followed by deleting those 4 and adding a new
 // file from compaction. The test verifies that the 4 L0 files are deleted
 // correctly by the follower.
-// We use teh Barrier* functions to ensure that the follower first sees the 4
+// We use the Barrier* functions to ensure that the follower first sees the 4
 // L0 files and is able to link them, and then sees the compaction that
 // obsoletes those L0 files (so those L0 files are intermediates that it has
 // to explicitly delete). Suppose we don't have any barriers, its possible
diff --git a/db/log_reader.h b/db/log_reader.h
index dfcd6b7690f3..b2c43f076414 100644
--- a/db/log_reader.h
+++ b/db/log_reader.h
@@ -59,7 +59,7 @@ class Reader {
   // live while this Reader is in use.
   //
   // If "checksum" is true, verify checksums if available.
-  // TODO(hx235): seperate WAL related parameters from general `Reader`
+  // TODO(hx235): separate WAL related parameters from general `Reader`
   // parameters
   Reader(std::shared_ptr<Logger> info_log,
          std::unique_ptr<SequentialFileReader>&& file, Reporter* reporter,
@@ -155,7 +155,7 @@ class Reader {
   // which log number this is
   uint64_t const log_number_;
 
-  // See `Optinos::track_and_verify_wals`
+  // See `Options::track_and_verify_wals`
   bool track_and_verify_wals_;
   // Below variables are used for WAL verification
   // TODO(hx235): To revise `stop_replay_for_corruption_` inside `LogReader`
@@ -208,8 +208,8 @@ class Reader {
   };
 
   // Return type, or one of the preceding special values
-  // If WAL compressioned is enabled, fragment_checksum is the checksum of the
-  // fragment computed from the orginal buffer containinng uncompressed
+  // If WAL compression is enabled, fragment_checksum is the checksum of the
+  // fragment computed from the original buffer containing uncompressed
   // fragment.
   uint8_t ReadPhysicalRecord(Slice* result, size_t* drop_size,
                              uint64_t* fragment_checksum = nullptr);
diff --git a/db/log_writer.h b/db/log_writer.h
index f7aef75197d5..3a76faab771b 100644
--- a/db/log_writer.h
+++ b/db/log_writer.h
@@ -77,7 +77,7 @@ class Writer {
   // Create a writer that will append data to "*dest".
   // "*dest" must be initially empty.
   // "*dest" must remain live while this Writer is in use.
-  // TODO(hx235): seperate WAL related parameters from general `Reader`
+  // TODO(hx235): separate WAL related parameters from general `Reader`
   // parameters
   explicit Writer(std::unique_ptr<WritableFileWriter>&& dest,
                   uint64_t log_number, bool recycle_log_files,
diff --git a/db/write_thread.h b/db/write_thread.h
index 7adf362dcba7..6c2dc5dcd02a 100644
--- a/db/write_thread.h
+++ b/db/write_thread.h
@@ -179,7 +179,7 @@ class WriteThread {
            PostMemTableCallback* _post_memtable_callback = nullptr,
            bool _ingest_wbwi = false)
         : batch(_batch),
-          // TODO: store a copy of WriteOptions instead of its seperated data
+          // TODO: store a copy of WriteOptions instead of its separated data
           // members
           sync(write_options.sync),
           no_slowdown(write_options.no_slowdown),
diff --git a/db_stress_tool/expected_value.h b/db_stress_tool/expected_value.h
index 428c389cb66e..7aed38240f09 100644
--- a/db_stress_tool/expected_value.h
+++ b/db_stress_tool/expected_value.h
@@ -253,20 +253,20 @@ class PendingExpectedValue {
 class ExpectedValueHelper {
  public:
   // Return whether the key associated with `pre_read_expected_value` and
-  // `post_read_expected_value` is expected not to exist from begining till the
+  // `post_read_expected_value` is expected not to exist from beginning till the
   // end of the read
   //
   // The negation of `MustHaveNotExisted()` is "may have not existed".
-  // To assert some key must have existsed, please use `MustHaveExisted()`
+  // To assert some key must have existed, please use `MustHaveExisted()`
   static bool MustHaveNotExisted(ExpectedValue pre_read_expected_value,
                                  ExpectedValue post_read_expected_value);
 
   // Return whether the key associated with `pre_read_expected_value` and
-  // `post_read_expected_value` is expected to exist from begining till the end
+  // `post_read_expected_value` is expected to exist from beginning till the end
   // of the read.
   //
   // The negation of `MustHaveExisted()` is "may have existed".
-  // To assert some key must have not existsed, please use
+  // To assert some key must have not existed, please use
   // `MustHaveNotExisted()`
   static bool MustHaveExisted(ExpectedValue pre_read_expected_value,
                               ExpectedValue post_read_expected_value);
diff --git a/utilities/transactions/write_prepared_txn_db.cc b/utilities/transactions/write_prepared_txn_db.cc
index 26b413bf8b20..54cc2511cc78 100644
--- a/utilities/transactions/write_prepared_txn_db.cc
+++ b/utilities/transactions/write_prepared_txn_db.cc
@@ -107,7 +107,7 @@ Status WritePreparedTxnDB::VerifyCFOptions(
   if (!cf_options.memtable_factory->CanHandleDuplicatedKey()) {
     return Status::InvalidArgument(
         "memtable_factory->CanHandleDuplicatedKey() cannot be false with "
-        "WritePrpeared transactions");
+        "WritePrepared transactions");
   }
   return Status::OK();
 }
@@ -196,14 +196,14 @@ Status WritePreparedTxnDB::WriteInternal(const WriteOptions& write_options_orig,
   const uint64_t no_log_ref = 0;
   uint64_t seq_used = kMaxSequenceNumber;
   const size_t ZERO_PREPARES = 0;
-  const bool kSeperatePrepareCommitBatches = true;
+  const bool kSeparatePrepareCommitBatches = true;
   // Since this is not 2pc, there is no need for AddPrepared but having it in
   // the PreReleaseCallback enables an optimization. Refer to
   // SmallestUnCommittedSeq for more details.
   AddPreparedCallback add_prepared_callback(
       this, db_impl_, batch_cnt,
       db_impl_->immutable_db_options().two_write_queues,
-      !kSeperatePrepareCommitBatches);
+      !kSeparatePrepareCommitBatches);
   WritePreparedCommitEntryPreReleaseCallback update_commit_map(
       this, db_impl_, kMaxSequenceNumber, ZERO_PREPARES, batch_cnt);
   PreReleaseCallback* pre_release_callback;
@@ -484,7 +484,7 @@ Status WritePreparedTxnDB::NewIterators(
 }
 
 void WritePreparedTxnDB::Init(const TransactionDBOptions& txn_db_opts) {
-  // Adcance max_evicted_seq_ no more than 100 times before the cache wraps
+  // Advance max_evicted_seq_ no more than 100 times before the cache wraps
   // around.
   INC_STEP_FOR_MAX_EVICTED =
       std::max(COMMIT_CACHE_SIZE / 100, static_cast<size_t>(1));
@@ -731,7 +731,7 @@ void WritePreparedTxnDB::AdvanceMaxEvictedSeq(const SequenceNumber& prev_max,
   bool update_snapshots = false;
   if (new_snapshots_version > snapshots_version_) {
     // This is to avoid updating the snapshots_ if it already updated
-    // with a more recent vesion by a concrrent thread
+    // with a more recent version by a concurrent thread
     update_snapshots = true;
     // We only care about snapshots lower then max
     snapshots = GetSnapshotListFromDB(new_max);
@@ -807,7 +807,7 @@ SnapshotImpl* WritePreparedTxnDB::GetSnapshotInternal(
       throw std::runtime_error(
           "Snapshot seq " + std::to_string(snap_impl->GetSequenceNumber()) +
           " after " + std::to_string(retry) +
-          " retries is still less than futre_max_evicted_seq_" +
+          " retries is still less than future_max_evicted_seq_" +
           std::to_string(max));
     }
   }
@@ -930,9 +930,9 @@ void WritePreparedTxnDB::UpdateSnapshots(
   // both new and old lists, it will appear upper in the new list. So if
   // we simply insert the new snapshots in order, if an overwritten item
   // is still valid in the new list is either written to the same place in
-  // the array or it is written in a higher palce before it gets
-  // overwritten by another item. This guarantess a reader that reads the
-  // list bottom-up will eventaully see a snapshot that repeats in the
+  // the array or it is written in a higher place before it gets
+  // overwritten by another item. This guarantee a reader that reads the
+  // list bottom-up will eventually see a snapshot that repeats in the
   // update, either before it gets overwritten by the writer or
   // afterwards.
   size_t i = 0;
@@ -981,7 +981,7 @@ void WritePreparedTxnDB::CheckAgainstSnapshots(const CommitEntry& evicted) {
   // reader should be able to read all the snapshots that are still valid
   // after the update. Since the survived snapshots are written in a higher
   // place before gets overwritten the reader that reads bottom-up will
-  // eventully see it.
+  // eventually see it.
   const bool next_is_larger = true;
   // We will set to true if the border line snapshot suggests that.
   bool search_larger_list = false;
@@ -1003,7 +1003,7 @@ void WritePreparedTxnDB::CheckAgainstSnapshots(const CommitEntry& evicted) {
     }
   }
 #ifndef NDEBUG
-  // Release the remaining sync points before accquiring the lock
+  // Release the remaining sync points before acquiring the lock
   for (++sync_i; sync_i <= 10; ++sync_i) {
     TEST_IDX_SYNC_POINT("WritePreparedTxnDB::CheckAgainstSnapshots:p:", sync_i);
     TEST_IDX_SYNC_POINT("WritePreparedTxnDB::CheckAgainstSnapshots:s:", sync_i);
@@ -1020,7 +1020,7 @@ void WritePreparedTxnDB::CheckAgainstSnapshots(const CommitEntry& evicted) {
                    evicted.prep_seq, evicted.commit_seq, cnt);
     ReadLock rl(&snapshots_mutex_);
     // Items could have moved from the snapshots_ to snapshot_cache_ before
-    // accquiring the lock. To make sure that we do not miss a valid snapshot,
+    // acquiring the lock. To make sure that we do not miss a valid snapshot,
     // read snapshot_cache_ again while holding the lock.
     for (size_t i = 0; i < SNAPSHOT_CACHE_SIZE; i++) {
       SequenceNumber snapshot_seq =

From bdf5a8fffbc271dba1868e391409839aeee9b546 Mon Sep 17 00:00:00 2001
From: anand76 <anand1976@users.noreply.github.com>
Date: Fri, 3 Oct 2025 18:16:33 -0700
Subject: [PATCH 325/500] Avoid reseeking upon skipping too many keys in crash
 tests (#14015)

Summary:
Implicit reseek in the middle of an iteration is not supported with MultiScan. Avoid this for now in crash tests by setting max_sequential_skip_in_iterations to an absurdly high value.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14015

Reviewed By: xingbowang

Differential Revision: D83761612

Pulled By: anand1976

fbshipit-source-id: 16f4e856374b79170c0a79c11c275cbb0fc83a70
---
 tools/db_crashtest.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index 40fb17a5f24a..f788ef59727c 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -1197,6 +1197,7 @@ def finalize_and_sanitize(src_params):
         dest_params["prefixpercent"] = 0
         dest_params["read_fault_one_in"] = 0
         dest_params["memtable_prefix_bloom_size_ratio"] = 0
+        dest_params["max_sequential_skip_in_iterations"] = sys.maxsize
 
     # inplace update and key checksum verification during seek would cause race condition
     # Therefore, when inplace_update_support is enabled, disable memtable_veirfy_per_key_checksum_on_seek

From 27625f4fc24b3816f90abec79a5ea4910e0745dd Mon Sep 17 00:00:00 2001
From: Xingbo Wang <xbw@fb.com>
Date: Mon, 6 Oct 2025 14:35:15 -0700
Subject: [PATCH 326/500] Fix range delete file caused MultiScan issue (#14028)

Summary:
When there is an ingested SST file that only contains delete range operations, MultiScan may return error "Scan does not intersect with file". This is due to file selection during Prepare uses the file smallest and largest key without considering whether there is any key in the file. This is only a temporary fix.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14028

Test Plan: Unit test

Reviewed By: anand1976

Differential Revision: D83986964

Pulled By: xingbowang

fbshipit-source-id: e0961ca854e2062c2457be4324817ba073ae785d
---
 db/version_set.cc   |   5 ++
 table/table_test.cc | 119 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 124 insertions(+)

diff --git a/db/version_set.cc b/db/version_set.cc
index a16e5232336b..d265f51f3e5b 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -1187,6 +1187,11 @@ class LevelIterator final : public InternalIterator {
                   iend.Encode(), flevel_->files[i].smallest_key) < 0) {
             continue;
           }
+          auto const metadata = flevel_->files[i].file_metadata;
+          if (metadata->num_entries == metadata->num_range_deletions) {
+            // Skip range deletion only files.
+            continue;
+          }
           auto& args = GetMultiScanArgForFile(i);
           args.insert(start.value(), end.value(), opt.property_bag);
         }
diff --git a/table/table_test.cc b/table/table_test.cc
index 37abb0d824e5..8cbd0ac1677b 100644
--- a/table/table_test.cc
+++ b/table/table_test.cc
@@ -8621,6 +8621,125 @@ TEST_P(UserDefinedIndexTest, ConfigTest) {
   ASSERT_OK(DestroyDB(dbname, options_));
 }
 
+TEST_P(UserDefinedIndexTest, RangeDelete) {
+  BlockBasedTableOptions table_options;
+  options_.num_levels = 50;
+  options_.compaction_style = kCompactionStyleUniversal;
+  options_.disable_auto_compactions = true;
+  std::string dbname = test::PerThreadDBPath("user_defined_index_test");
+  std::string ingest_file = dbname + "test.sst";
+
+  // Set up the user-defined index factory
+  auto user_defined_index_factory =
+      std::make_shared<TestUserDefinedIndexFactory>();
+  table_options.user_defined_index_factory = user_defined_index_factory;
+
+  // Set up custom flush block policy that flushes every 3 keys
+  table_options.flush_block_policy_factory =
+      std::make_shared<CustomFlushBlockPolicyFactory>();
+
+  options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  auto create_ingestion_data_file = [&](const std::string& filename) {
+    std::unique_ptr<SstFileWriter> writer;
+    writer.reset(new SstFileWriter(EnvOptions(), options_));
+    ASSERT_OK(writer->Open(filename));
+    auto kvs = generateKVs(100);
+
+    for (const auto& kv : kvs) {
+      ASSERT_OK(writer->Put(kv.first, kv.second));
+    }
+    ASSERT_OK(writer->Finish());
+    writer.reset();
+  };
+
+  // Create first ingestion file with data
+  create_ingestion_data_file(ingest_file + "_0");
+
+  // Create second ingestion file with range delete only that covers the first
+  // file to delete all of its keys.
+  {
+    std::unique_ptr<SstFileWriter> writer;
+    writer.reset(new SstFileWriter(EnvOptions(), options_));
+    ASSERT_OK(writer->Open(ingest_file + "_1"));
+    if (is_reverse_comparator_) {
+      ASSERT_OK(writer->DeleteRange("keyz", "key"));
+    } else {
+      ASSERT_OK(writer->DeleteRange("key", "keyz"));
+    }
+    ASSERT_OK(writer->Finish());
+    writer.reset();
+  }
+
+  // Create the second ingestion file with data
+  create_ingestion_data_file(ingest_file + "_2");
+
+  std::unique_ptr<DB> db;
+  options_.create_if_missing = true;
+  Status s = DB::Open(options_, dbname, &db);
+  ASSERT_OK(s);
+  ASSERT_TRUE(db != nullptr);
+  ColumnFamilyHandle* cfh = nullptr;
+  ASSERT_OK(db->CreateColumnFamily(options_, "new_cf", &cfh));
+
+  IngestExternalFileOptions ifo;
+  // ingest first data file key00~key99
+  s = db->IngestExternalFile(cfh, {ingest_file + "_0"}, ifo);
+  ASSERT_OK(s);
+  // ingest delete range (key-keyz) and new data file (key00-key99) together
+  s = db->IngestExternalFile(cfh, {ingest_file + "_1", ingest_file + "_2"},
+                             ifo);
+  ASSERT_OK(s);
+
+  ReadOptions ro;
+  std::unique_ptr<Iterator> iter(db->NewIterator(ro, cfh));
+  ASSERT_NE(iter, nullptr);
+  ASSERT_OK(iter->status());
+
+  std::vector<Slice> range = {
+      Slice("key10"),
+      Slice("key25"),
+      Slice("key80"),
+      Slice("key95"),
+  };
+
+  if (is_reverse_comparator_) {
+    std::reverse(range.begin(), range.end());
+  }
+
+  Slice ub("");
+  ro.iterate_upper_bound = &ub;
+  iter.reset(db->NewIterator(ro, cfh));
+  ASSERT_NE(iter, nullptr);
+  MultiScanArgs scan_opts(options_.comparator);
+  std::unordered_map<std::string, std::string> property_bag;
+  property_bag["count"] = std::to_string(9);
+
+  std::vector<std::vector<char>> decoded_ranges;
+  for (size_t i = 0; i < range.size() / 2; i++) {
+    scan_opts.insert(range[i * 2], range[i * 2 + 1],
+                     std::optional(property_bag));
+  }
+  iter->Prepare(scan_opts);
+
+  for (size_t i = 0; i < range.size() / 2; i++) {
+    // Update upper bound before each seek
+    ub = range[2 * i + 1];
+    auto key_count = 0;
+    for (iter->Seek(range[i * 2]); iter->Valid(); iter->Next()) {
+      key_count++;
+    }
+    ASSERT_OK(iter->status());
+    ASSERT_EQ(key_count, 15);
+  }
+
+  iter.reset();
+
+  ASSERT_OK(db->DestroyColumnFamilyHandle(cfh));
+  ASSERT_OK(db->Close());
+  ASSERT_OK(DestroyDB(dbname, options_));
+}
+
 INSTANTIATE_TEST_CASE_P(UserDefinedIndexTest, UserDefinedIndexTest,
                         ::testing::Values(BytewiseComparator(),
                                           ReverseBytewiseComparator()));

From 4ab1bc865c0d4f91992e1a5654a9cafe7266ab1d Mon Sep 17 00:00:00 2001
From: anand76 <anand1976@users.noreply.github.com>
Date: Mon, 6 Oct 2025 18:47:24 -0700
Subject: [PATCH 327/500] Disable standlone delete range file ingest in
 db_crashtest.py if multiscan enabled (#14026)

Summary:
MultiScan currently doesn't handle delete range properly. In this specific case, a file with only delete range will have an empty index resulting in BlockBasedTableIterator wrongly thinking that a scan doesn't intersect the file due to empty result.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14026

Test Plan: Run crash test

Reviewed By: xingbowang

Differential Revision: D83881266

Pulled By: anand1976

fbshipit-source-id: dc1faa494ea23f36391b700dd1ee0430a1f20ac5
---
 tools/db_crashtest.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index f788ef59727c..12482c0ba85f 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -1198,6 +1198,10 @@ def finalize_and_sanitize(src_params):
         dest_params["read_fault_one_in"] = 0
         dest_params["memtable_prefix_bloom_size_ratio"] = 0
         dest_params["max_sequential_skip_in_iterations"] = sys.maxsize
+        # This option ingests a delete range that might partially overlap with
+        # existing key range, which will cause a reseek that's currently not
+        # supported by multiscan
+        dest_params["test_ingest_standalone_range_deletion_one_in"] = 0
 
     # inplace update and key checksum verification during seek would cause race condition
     # Therefore, when inplace_update_support is enabled, disable memtable_veirfy_per_key_checksum_on_seek

From 194160d534e1272d6d3d3728a27b1ce35b1b315f Mon Sep 17 00:00:00 2001
From: anand76 <anand1976@users.noreply.github.com>
Date: Tue, 7 Oct 2025 02:13:35 -0700
Subject: [PATCH 328/500] Use wget for folly dependency download (#14030)

Summary:
Fix the binutils truncated download issue by switching to wget in the folly build scripts for downloading dependencies.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14030

Test Plan: make build_folly

Reviewed By: jaykorean

Differential Revision: D84033126

Pulled By: anand1976

fbshipit-source-id: bc6706d7e57c97d6edff149a965aa12c7959825f
---
 .github/actions/setup-folly/action.yml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.github/actions/setup-folly/action.yml b/.github/actions/setup-folly/action.yml
index 41cec847ce60..438d8e8e8183 100644
--- a/.github/actions/setup-folly/action.yml
+++ b/.github/actions/setup-folly/action.yml
@@ -3,5 +3,7 @@ runs:
   using: composite
   steps:
   - name: Checkout folly sources
-    run: make checkout_folly
+    run: |
+      make checkout_folly
+      echo "GETDEPS_USE_WGET=1" >> "$GITHUB_ENV"
     shell: bash

From 5ace84ebae0cd42c7c5fa7125294dca9bd456bce Mon Sep 17 00:00:00 2001
From: anand76 <anand1976@users.noreply.github.com>
Date: Tue, 7 Oct 2025 08:35:28 -0700
Subject: [PATCH 329/500] Pass the correct comparator to MultiScanArgs (#14033)

Summary:
Fix assertion failure in crash tests with timestamp due to the wrong comparator passed to MultiScanArgs

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14033

Reviewed By: xingbowang

Differential Revision: D84036954

Pulled By: anand1976

fbshipit-source-id: 526be21c0754dcccf8e4d2b9fba33716fe35860a
---
 db_stress_tool/db_stress_test_base.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc
index d61caddc8263..c4abbf96995b 100644
--- a/db_stress_tool/db_stress_test_base.cc
+++ b/db_stress_tool/db_stress_test_base.cc
@@ -1694,7 +1694,7 @@ Status StressTest::TestMultiScan(ThreadState* thread,
   std::vector<std::string> start_key_strs;
   std::vector<std::string> end_key_strs;
   // TODO support reverse BytewiseComparator in the stress test
-  MultiScanArgs scan_opts(BytewiseComparator());
+  MultiScanArgs scan_opts(options_.comparator);
   scan_opts.use_async_io = FLAGS_multiscan_use_async_io;
   start_key_strs.reserve(num_scans);
   end_key_strs.reserve(num_scans);

From cbfcac8d1d02d5389a6105f9bd0fa25a39873f02 Mon Sep 17 00:00:00 2001
From: Jay Huh <jewoongh@meta.com>
Date: Tue, 7 Oct 2025 17:31:59 -0700
Subject: [PATCH 330/500] Stress Test Improvement (#14022)

Summary:
- Include Status in RemoteCompactionResultMap in SharedState so that we can directly check the status of the remote compaction in `DbStressCompactionService::Wait()`
- If result is empty, populate the result with the status that was returned from `GetRemoteCompactionResult()` so that the status can be bubbled up to the primary (main db thread)
- Get rid of Timeout in `Wait()`

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14022

Test Plan:
With fall-back
```
python3 -u tools/db_crashtest.py blackbox --remote_compaction_worker_threads=8 --remote_compaction_failure_fall_back_to_local=1
```

Without fall-back
```
python3 -u tools/db_crashtest.py blackbox --remote_compaction_worker_threads=8 --remote_compaction_failure_fall_back_to_local=0
```

Reviewed By: hx235

Differential Revision: D83789172

Pulled By: jaykorean

fbshipit-source-id: 08f710c4ece5fcc1d4b95b3f9c353831882851b7
---
 db_stress_tool/db_stress_common.cc            |  2 +-
 db_stress_tool/db_stress_compaction_service.h | 47 ++++++++++++-------
 db_stress_tool/db_stress_shared_state.h       | 18 ++++---
 3 files changed, 42 insertions(+), 25 deletions(-)

diff --git a/db_stress_tool/db_stress_common.cc b/db_stress_tool/db_stress_common.cc
index 19f5eeec9e86..99ff429f443f 100644
--- a/db_stress_tool/db_stress_common.cc
+++ b/db_stress_tool/db_stress_common.cc
@@ -280,7 +280,7 @@ void RemoteCompactionWorkerThread(void* v) {
       // Add the output regardless of status, so that primary DB doesn't rely on
       // the timeout to finish waiting. The actual failure from the
       // deserialization can fail the compaction properly
-      shared->AddRemoteCompactionResult(job_id, serialized_output);
+      shared->AddRemoteCompactionResult(job_id, s, serialized_output);
     }
     db_stress_env->SleepForMicroseconds(
         thread->rand.Next() % FLAGS_remote_compaction_worker_interval * 1000 +
diff --git a/db_stress_tool/db_stress_compaction_service.h b/db_stress_tool/db_stress_compaction_service.h
index a47963e261f9..f45198fe48c7 100644
--- a/db_stress_tool/db_stress_compaction_service.h
+++ b/db_stress_tool/db_stress_compaction_service.h
@@ -25,8 +25,6 @@ class DbStressCompactionService : public CompactionService {
   const char* Name() const override { return kClassName(); }
 
   static constexpr uint64_t kWaitIntervalInMicros = 10 * 1000;  // 10ms
-  static constexpr uint64_t kWaitTimeoutInMicros =
-      30 * 1000 * 1000;  // 30 seconds
 
   CompactionServiceScheduleResponse Schedule(
       const CompactionServiceJobInfo& info,
@@ -45,24 +43,39 @@ class DbStressCompactionService : public CompactionService {
 
   CompactionServiceJobStatus Wait(const std::string& scheduled_job_id,
                                   std::string* result) override {
-    auto start = Env::Default()->NowMicros();
-    while (Env::Default()->NowMicros() - start < kWaitTimeoutInMicros) {
+    while (true) {
       if (aborted_.load()) {
-        return CompactionServiceJobStatus::kUseLocal;
+        return CompactionServiceJobStatus::kAborted;
       }
-      if (shared_->GetRemoteCompactionResult(scheduled_job_id, result).ok()) {
-        if (result && result->empty()) {
-          // Race: Remote worker aborted before client sets aborted_ = true
-          return CompactionServiceJobStatus::kUseLocal;
+      const auto& maybeResultStatus =
+          shared_->GetRemoteCompactionResult(scheduled_job_id, result);
+      if (maybeResultStatus.has_value()) {
+        auto s = maybeResultStatus.value();
+        if (s.ok()) {
+          assert(result);
+          assert(!result->empty());
+          return CompactionServiceJobStatus::kSuccess;
+        } else {
+          // Remote Compaction failed
+          if (failure_should_fall_back_to_local_) {
+            return CompactionServiceJobStatus::kUseLocal;
+          }
+          if (result && result->empty()) {
+            // If result is empty, set the compaction status in the result so
+            // that it can be bubbled up to main thread
+            CompactionServiceResult compaction_result;
+            compaction_result.status = s;
+            if (compaction_result.Write(result).ok()) {
+              assert(result);
+              assert(!result->empty());
+            }
+          }
+          return CompactionServiceJobStatus::kFailure;
         }
-        return CompactionServiceJobStatus::kSuccess;
+      } else {
+        // Remote Compaction is still running
+        Env::Default()->SleepForMicroseconds(kWaitIntervalInMicros);
       }
-      Env::Default()->SleepForMicroseconds(kWaitIntervalInMicros);
-    }
-    if (failure_should_fall_back_to_local_) {
-      fprintf(stdout,
-              "Remote Compaction failed - fall back to local compaction!\n");
-      return CompactionServiceJobStatus::kUseLocal;
     }
     return CompactionServiceJobStatus::kFailure;
   }
@@ -73,7 +86,7 @@ class DbStressCompactionService : public CompactionService {
     std::string serialized;
     CompactionServiceResult result;
     if (shared_->GetRemoteCompactionResult(scheduled_job_id, &serialized)
-            .ok()) {
+            .has_value()) {
       if (CompactionServiceResult::Read(serialized, &result).ok()) {
         std::vector<std::string> filenames;
         Status s = Env::Default()->GetChildren(result.output_path, &filenames);
diff --git a/db_stress_tool/db_stress_shared_state.h b/db_stress_tool/db_stress_shared_state.h
index 4da55a513d9e..d48610c6e5b1 100644
--- a/db_stress_tool/db_stress_shared_state.h
+++ b/db_stress_tool/db_stress_shared_state.h
@@ -302,20 +302,23 @@ class SharedState {
   }
 
   void AddRemoteCompactionResult(const std::string& job_id,
+                                 const Status& status,
                                  const std::string& result) {
     MutexLock l(&remote_compaction_result_map_mu_);
-    remote_compaction_result_map_.emplace(job_id, result);
+    remote_compaction_result_map_.emplace(
+        job_id, std::pair<Status, std::string>{status, result});
   }
 
-  Status GetRemoteCompactionResult(const std::string& job_id,
-                                   std::string* result) {
+  std::optional<Status> GetRemoteCompactionResult(const std::string& job_id,
+                                                  std::string* result) {
     MutexLock l(&remote_compaction_result_map_mu_);
     if (remote_compaction_result_map_.find(job_id) !=
         remote_compaction_result_map_.end()) {
-      *result = remote_compaction_result_map_.at(job_id);
-      return Status::OK();
+      const auto& pair = remote_compaction_result_map_.at(job_id);
+      *result = pair.second;
+      return pair.first;
     }
-    return Status::NotFound();
+    return std::nullopt;
   }
 
   void RemoveRemoteCompactionResult(const std::string& job_id) {
@@ -485,7 +488,8 @@ class SharedState {
   // Result Map for the remote compaciton. Key is the scheduled_job_id and value
   // is serialized compaction_service_result
   port::Mutex remote_compaction_result_map_mu_;
-  std::unordered_map<std::string, std::string> remote_compaction_result_map_;
+  std::unordered_map<std::string, std::pair<Status, std::string>>
+      remote_compaction_result_map_;
 
   // Keys that should not be overwritten
   const std::unordered_set<int64_t> no_overwrite_ids_;

From f722e68d88683df0988e275056262219292e4b7d Mon Sep 17 00:00:00 2001
From: Hui Xiao <huixiao@fb.com>
Date: Thu, 9 Oct 2025 14:31:47 -0700
Subject: [PATCH 331/500] New FlushWAL() API to take extra fields such as rate
 limiter priority (#14037)

Summary:
**Context/Summary:**
There is no way to tag or rate-limit write IO occurs during FlushWAL() with priority. Under `Options::manual_wal_flush=true`, it is the major source of write IO during user writes so we decide to add that support. A new option struct `FlushWALOptions` is introduced to avoid making the API ugly for future new fields.

Also, we can't use the WriteOptions (https://github.com/facebook/rocksdb/blob/main/include/rocksdb/options.h#L2293-L2302 i) since is associated with that particular Put/Merge/.. associated with that option but FlushWAL() can happen after that write. There is no way to carry that write option over in RocksDB. I also avoided using the WriteOptions since it's mostly for live write.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14037

Test Plan: New UTs `TEST_P(DBRateLimiterOnManualWALFlushTest, ManualWALFlush)`

Reviewed By: archang19

Differential Revision: D84193522

Pulled By: hx235

fbshipit-source-id: 18feb5235672010d19a101ce52c8abdcc4a789f2
---
 db/db_impl/db_impl.cc                         |   6 ++
 db/db_impl/db_impl.h                          |   7 +-
 db/db_rate_limiter_test.cc                    | 101 ++++++++++++++++++
 include/rocksdb/db.h                          |   5 +
 include/rocksdb/options.h                     |  17 +++
 include/rocksdb/utilities/stackable_db.h      |   4 +
 .../manual_wal_flush_priority                 |   1 +
 utilities/backup/backup_engine_test.cc        |   1 +
 8 files changed, 140 insertions(+), 2 deletions(-)
 create mode 100644 unreleased_history/public_api_changes/manual_wal_flush_priority

diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc
index 3bf0def15359..55bf299c3bec 100644
--- a/db/db_impl/db_impl.cc
+++ b/db/db_impl/db_impl.cc
@@ -1480,6 +1480,12 @@ int DBImpl::FindMinimumEmptyLevelFitting(
   return minimum_level;
 }
 
+Status DBImpl::FlushWAL(const FlushWALOptions& options) {
+  WriteOptions write_options;
+  write_options.rate_limiter_priority = options.rate_limiter_priority;
+  return FlushWAL(write_options, options.sync);
+}
+
 Status DBImpl::FlushWAL(const WriteOptions& write_options, bool sync) {
   if (manual_wal_flush_) {
     IOStatus io_s;
diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h
index 702a8b9e648a..da1879688e56 100644
--- a/db/db_impl/db_impl.h
+++ b/db/db_impl/db_impl.h
@@ -484,10 +484,13 @@ class DBImpl : public DB {
       const FlushOptions& options,
       const std::vector<ColumnFamilyHandle*>& column_families) override;
   Status FlushWAL(bool sync) override {
-    // TODO: plumb Env::IOActivity, Env::IOPriority
-    return FlushWAL(WriteOptions(), sync);
+    FlushWALOptions options;
+    options.sync = sync;
+    return FlushWAL(options);
   }
 
+  Status FlushWAL(const FlushWALOptions& options) override;
+
   virtual Status FlushWAL(const WriteOptions& write_options, bool sync);
   bool WALBufferIsEmpty();
   Status SyncWAL() override;
diff --git a/db/db_rate_limiter_test.cc b/db/db_rate_limiter_test.cc
index b28055225a0f..210e3c49ac32 100644
--- a/db/db_rate_limiter_test.cc
+++ b/db/db_rate_limiter_test.cc
@@ -442,6 +442,107 @@ TEST_P(DBRateLimiterOnWriteWALTest, AutoWalFlush) {
   EXPECT_EQ(actual_auto_wal_flush_request,
             options_.rate_limiter->GetTotalRequests(Env::IO_USER));
 }
+
+class DBRateLimiterOnManualWALFlushTest
+    : public DBRateLimiterOnWriteTest,
+      public ::testing::WithParamInterface<Env::IOPriority> {
+ public:
+  static std::string GetTestNameSuffix(
+      ::testing::TestParamInfo<Env::IOPriority> info) {
+    std::ostringstream oss;
+    if (info.param == Env::IO_USER) {
+      oss << "RateLimitManualWALFlush";
+    } else if (info.param == Env::IO_TOTAL) {
+      oss << "NoRateLimitManualWALFlush";
+    } else if (info.param == Env::IO_HIGH) {
+      oss << "RateLimitManualWALFlushWithHighPriority";
+    } else {
+      oss << "RateLimitManualWALFlushWithLowPriority";
+    }
+    return oss.str();
+  }
+
+  explicit DBRateLimiterOnManualWALFlushTest()
+      : rate_limiter_priority_(GetParam()) {}
+
+  void Init() {
+    options_ = GetOptions();
+    // Enable manual WAL flush mode
+    options_.manual_wal_flush = true;
+    Reopen(options_);
+  }
+
+  WriteOptions GetWriteOptions() {
+    WriteOptions write_options;
+    // WAL must be enabled for manual WAL flush to work
+    write_options.disableWAL = false;
+    // In manual WAL flush mode, WAL write rate limiting should be done through
+    // FlushWAL(), not WriteOptions::rate_limiter_priority
+    write_options.rate_limiter_priority = Env::IO_TOTAL;
+    return write_options;
+  }
+
+ protected:
+  Env::IOPriority rate_limiter_priority_;
+};
+
+INSTANTIATE_TEST_CASE_P(DBRateLimiterOnManualWALFlushTest,
+                        DBRateLimiterOnManualWALFlushTest,
+                        ::testing::Values(Env::IO_TOTAL, Env::IO_USER,
+                                          Env::IO_HIGH, Env::IO_LOW),
+                        DBRateLimiterOnManualWALFlushTest::GetTestNameSuffix);
+
+TEST_P(DBRateLimiterOnManualWALFlushTest, ManualWALFlush) {
+  Init();
+
+  const bool no_rate_limit = (rate_limiter_priority_ == Env::IO_TOTAL);
+
+  ASSERT_EQ(0, options_.rate_limiter->GetTotalRequests(Env::IO_TOTAL));
+
+  for (bool sync : {false, true}) {
+    std::int64_t prev_total_request =
+        options_.rate_limiter->GetTotalRequests(Env::IO_TOTAL);
+
+    Status put_status = Put("key_" + std::to_string(sync),
+                            "value_" + std::to_string(sync), GetWriteOptions());
+
+    EXPECT_TRUE(put_status.ok());
+
+    // Since manual_wal_flush is enabled and write_options.rate_limiter_priority
+    // is IO_TOTAL, no rate limiting should have occurred for this user write
+    EXPECT_EQ(0, options_.rate_limiter->GetTotalRequests(Env::IO_TOTAL) -
+                     prev_total_request);
+
+    // Now explicitly flush the WAL with the test's rate_limiter_priority
+    prev_total_request = options_.rate_limiter->GetTotalRequests(Env::IO_TOTAL);
+    std::int64_t prev_priority_request =
+        options_.rate_limiter->GetTotalRequests(rate_limiter_priority_);
+
+    FlushWALOptions flush_options;
+    flush_options.sync = sync;
+    flush_options.rate_limiter_priority = rate_limiter_priority_;
+    Status flush_status = db_->FlushWAL(flush_options);
+
+    EXPECT_TRUE(flush_status.ok());
+
+    std::int64_t manual_wal_flush_requests_total =
+        options_.rate_limiter->GetTotalRequests(Env::IO_TOTAL) -
+        prev_total_request;
+    std::int64_t manual_wal_flush_requests_for_priority =
+        options_.rate_limiter->GetTotalRequests(rate_limiter_priority_) -
+        prev_priority_request;
+
+    if (no_rate_limit) {
+      EXPECT_EQ(0, manual_wal_flush_requests_total);
+      EXPECT_EQ(0, manual_wal_flush_requests_for_priority);
+    } else {
+      EXPECT_EQ(manual_wal_flush_requests_total,
+                manual_wal_flush_requests_for_priority);
+      EXPECT_GT(manual_wal_flush_requests_for_priority, 0);
+    }
+  }
+}
+
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h
index cad566fd5d72..7bc50ad890f2 100644
--- a/include/rocksdb/db.h
+++ b/include/rocksdb/db.h
@@ -49,6 +49,7 @@ struct CompactRangeOptions;
 struct DBOptions;
 struct ExternalSstFileInfo;
 struct FlushOptions;
+struct FlushWALOptions;
 struct Options;
 struct ReadOptions;
 struct TableProperties;
@@ -1777,6 +1778,10 @@ class DB {
     return Status::NotSupported("FlushWAL not implemented");
   }
 
+  virtual Status FlushWAL(const FlushWALOptions& /*options*/) {
+    return Status::NotSupported("FlushWAL not implemented");
+  }
+
   // Ensure all WAL writes have been synced to storage, so that (assuming OS
   // and hardware support) data will survive power loss. This function does
   // not imply FlushWAL, so `FlushWAL(true)` is recommended if using
diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index ba7521cf591b..e8cc0d43fb94 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -2335,6 +2335,23 @@ struct FlushOptions {
   FlushOptions() : wait(true), allow_write_stall(false) {}
 };
 
+struct FlushWALOptions {
+  // If true, it calls `SyncWAL()` afterwards.
+  // Default: false
+  bool sync;
+
+  // For IO operations associated with flushing the WAL, charge the internal
+  // rate limiter (see `DBOptions::rate_limiter`) at the specified priority and
+  // pass the priority down to the file system through
+  // `IOOptions::rate_limiter_priority`. The special value `Env::IO_TOTAL`
+  // disables charging the rate limiter.
+  //
+  // Default: `Env::IO_TOTAL`
+  Env::IOPriority rate_limiter_priority;
+
+  FlushWALOptions() : sync(false), rate_limiter_priority(Env::IO_TOTAL) {}
+};
+
 // Create a Logger from provided DBOptions
 Status CreateLoggerFromOptions(const std::string& dbname,
                                const DBOptions& options,
diff --git a/include/rocksdb/utilities/stackable_db.h b/include/rocksdb/utilities/stackable_db.h
index 06c5d1f7d8e5..0710c713de0b 100644
--- a/include/rocksdb/utilities/stackable_db.h
+++ b/include/rocksdb/utilities/stackable_db.h
@@ -423,7 +423,11 @@ class StackableDB : public DB {
 
   Status SyncWAL() override { return db_->SyncWAL(); }
 
+  using DB::FlushWAL;
   Status FlushWAL(bool sync) override { return db_->FlushWAL(sync); }
+  Status FlushWAL(const FlushWALOptions& options) override {
+    return db_->FlushWAL(options);
+  }
 
   Status LockWAL() override { return db_->LockWAL(); }
 
diff --git a/unreleased_history/public_api_changes/manual_wal_flush_priority b/unreleased_history/public_api_changes/manual_wal_flush_priority
new file mode 100644
index 000000000000..3dc34c8f146a
--- /dev/null
+++ b/unreleased_history/public_api_changes/manual_wal_flush_priority
@@ -0,0 +1 @@
+Added `DB::FlushWAL(const FlushWALOptions&)` as an alternative to `DB::FlushWAL(bool sync)`, where `FlushWALOptions` includes a new `rate_limiter_priority` field (default `Env::IO_TOTAL`) that allows rate limiting and priority passing of manual WAL flush's IO operations.
diff --git a/utilities/backup/backup_engine_test.cc b/utilities/backup/backup_engine_test.cc
index e01911be29c9..51581fb00dda 100644
--- a/utilities/backup/backup_engine_test.cc
+++ b/utilities/backup/backup_engine_test.cc
@@ -135,6 +135,7 @@ class DummyDB : public StackableDB {
   }
 
   // To avoid FlushWAL called on stacked db which is nullptr
+  using DB::FlushWAL;
   Status FlushWAL(bool /*sync*/) override { return Status::OK(); }
 
   std::vector<std::string> live_files_;

From 2d331cc125f915a4f540ccb14e421a1d80dd738e Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Thu, 9 Oct 2025 16:55:27 -0700
Subject: [PATCH 332/500] Blog post for parallel compression revamp (#14035)

Summary:
self-explanatory. First drafts using AI then heavily revised. AI help with diagram also.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14035

Test Plan: https://pdillinger.github.io/rocksdb/blog/2025/10/08/parallel-compression-revamp.html

Reviewed By: hx235

Differential Revision: D84277660

Pulled By: pdillinger

fbshipit-source-id: 4d76f60f3f7304392836fa4df7a819e67d531a52
---
 ...10-08-parallel-compression-revamp.markdown |  89 ++++++++++++
 .../ring-buffer-architecture.svg              | 136 ++++++++++++++++++
 2 files changed, 225 insertions(+)
 create mode 100644 docs/_posts/2025-10-08-parallel-compression-revamp.markdown
 create mode 100644 docs/static/images/parallel-compression/ring-buffer-architecture.svg

diff --git a/docs/_posts/2025-10-08-parallel-compression-revamp.markdown b/docs/_posts/2025-10-08-parallel-compression-revamp.markdown
new file mode 100644
index 000000000000..435c409415f6
--- /dev/null
+++ b/docs/_posts/2025-10-08-parallel-compression-revamp.markdown
@@ -0,0 +1,89 @@
+---
+title: "Parallel Compression Revamp: Dramatically Reduced CPU Overhead"
+layout: post
+author: peterd
+category: blog
+---
+
+The upcoming RocksDB 10.7 release includes a major revamp of parallel compression that **dramatically reduces the feature's CPU overhead by up to 65%** while maintaining or improving throughput for compression-heavy workloads. We expect this to broaden the set of workloads that could benefit from parallel compression, especially for **bulk SST generation and remote compaction use cases** that are less sensitive to CPU responsiveness.
+
+## Background
+
+Parallel compression in RocksDB (`CompressionOptions::parallel_threads > 1`) allows multiple threads to compress different blocks simultaneously during SST file generation, which can significantly improve compaction throughput for workloads where compression is a bottleneck. However, the original implementation had substantial CPU overhead that often outweighed the benefits, limiting its practical adoption.
+
+## What's New: A Complete Reimplementation
+
+The parallel compression framework has been completely rewritten from the ground up in [pull request #13910](https://github.com/facebook/rocksdb/pull/13910) to address the core inefficiencies:
+
+### Ring Buffer Architecture
+Instead of separate compression and write queues with complex thread coordination, the new implementation uses a ring buffer of blocks-in-progress that enables efficient work distribution across threads. This bounds working memory while enabling high throughput with minimal cross-thread synchronization.
+
+![Ring Buffer Architecture](/static/images/parallel-compression/ring-buffer-architecture.svg)
+
+### Work-Stealing Design
+Previously, the calling thread could only generate uncompressed blocks, dedicated compression threads could only compress, and a writer thread could only write the SST file to storage. Now, all threads can participate in compression work in a quasi-work-stealing manner, dramatically reducing the need for threads to block waiting for work. While only one thread (the calling thread or "emit thread") can generate uncompressed SST blocks in the new implementation, feeding compression work to other threads and itself, all other threads are compatible with writing compressed blocks to storage.
+
+### Auto-Scaling Thread Management
+The ring buffer enables another key feature: auto-scaling of active threads based on ring buffer utilization. The framework intelligently wakes up idle worker threads only when there's sufficient work to justify the overhead, achieving near-maximum throughput while minimizing CPU waste from unnecessary thread wake-ups.
+
+### Lock-Free Synchronization
+The entire framework is now lock-free (and wait-free as long as compatible work units are available for each thread), based primarily on atomic operations. To cleanly pack and leverage many data fields into a single atomic value, I've developed a new `BitFields` utility API. This is proving useful for cleaning up the HyperClockCache implementation as well, and will be the topic of a later blog post.
+
+Semaphores are used for lock-free management of idle threads (assuming a lock-free semaphore implementation, which is likely the case with `ROCKSDB_USE_STD_SEMAPHORES` but that is untrustworthy; see below).
+
+## Performance Improvements
+
+The results speak for themselves. Here's a comparison using `db_bench` fillseq benchmarks with various compression configurations:
+
+### ZSTD Compression (Default Level)
+Note:
+* "throughput" = how quickly a given CPU-bound flush or compaction can complete
+* "CPU increase" = total CPU usage in amount of time that each core was used
+* "PT" = parallel_threads setting.
+
+**Before:**
+- PT=3: ~38% throughput increase for ~73% CPU increase
+- PT=6: No throughput increase for ~70% CPU increase
+
+**After:**
+- PT=3: ~58% throughput increase for ~25% CPU increase
+- PT=6: ~58% throughput increase for ~28% CPU increase
+
+### High Compression Scenarios
+For ZSTD compression level 8, the improvements are even more dramatic:
+
+**Before:**
+- PT=4: 2.6x throughput increase for 139% CPU increase
+- PT=8: 3.6x throughput increase for 135% CPU increase
+
+**After:**
+- PT=4: 2.8x throughput increase for 114% CPU increase
+- PT=8: 3.7x throughput increase for 116% CPU increase
+
+## Compression Algorithm Optimizations
+
+Alongside the parallel compression revamp, some optimizations have gone into the underlying compression implementations/integrations. Most notably, **LZ4HC received dramatic performance improvements** through better reuse of internal data structures between compression calls (detailed in [pull request #13805](https://github.com/facebook/rocksdb/pull/13805)). A small regression in LZ4 performance from that change was fixed in [pull request #14017](https://github.com/facebook/rocksdb/pull/14017).
+
+While **ZSTD remains the gold standard** for medium-to-high compression ratios in RocksDB, these LZ4HC optimizations make it an increasingly attractive option for read-heavy workloads where LZ4's faster decompression can provide overall performance benefits.
+
+## Production Ready
+
+With these efficiency improvements, parallel compression is now considered **production-ready**. The feature has been thoroughly tested in both unit tests and stress testing, including validation on high-load scenarios with hundreds of concurrent compression jobs and thousands of threads.
+
+Some notes on current limitations:
+- Parallel compression is currently incompatible with `UserDefinedIndex` and with the deprecated `decouple_partitioned_filters=false` setting
+- Maximum performance is available with `-DROCKSDB_USE_STD_SEMAPHORES` at compile time, though this is not currently recommended due to reported bugs in some implementations of C++20 semaphores
+
+## Configuration Recommendations
+
+The dramatically reduced CPU overhead means parallel compression is now viable for a broader range of workloads, particularly those using higher compression levels or compression-heavy scenarios like time-series data. However, simply enabling parallel compression could result in more *spiky* CPU loads for hosts serving live DB data. **Parallel compression might be most useful for bulk SST file generation and/or remote compaction workloads** because they are less sensitive to CPU responsiveness. In these scenarios there is little danger in setting `parallel_threads=8` even with the possibility of over-subscribing CPU cores, though the potentially safer "sweet spot" is typically around `parallel_threads=3`, depending on compression level, etc.
+
+## Limitations and Future
+
+Although this offers a great improvement in the implementation of an existing option, we recognize that this setup is suboptimal in a number of ways:
+* There is no work sharing / thread pooling for these SST compression/writer threads among compactions in the same process, so not well able to fit the workload to available CPU cores and not able to use other SST file compression work to avoid a worker thread going to sleep.
+* We are not (yet) using a framework that would allow micro-work sharing with things other than SST generation on a set of threads. That would be a good direction for effective sharing of CPU resources without spikes in usage, but might incur intolerable CPU overhead in managing work. With this "hand optimized" and specialized framework, we can at least evaluate such future endeavors against a perhaps ideal framework in terms of parallelizing with minimal overhead.
+
+## Try It Out
+
+Parallel compression revamp will be available in RocksDB 10.7. As always, we recommend testing in your specific environment to determine the optimal configuration for your workload.
diff --git a/docs/static/images/parallel-compression/ring-buffer-architecture.svg b/docs/static/images/parallel-compression/ring-buffer-architecture.svg
new file mode 100644
index 000000000000..75ee489cf243
--- /dev/null
+++ b/docs/static/images/parallel-compression/ring-buffer-architecture.svg
@@ -0,0 +1,136 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 800 500" style="background-color: #fafafa;">
+  <defs>
+    <marker id="arrowhead" markerWidth="10" markerHeight="7"
+     refX="10" refY="3.5" orient="auto">
+      <polygon points="0 0, 10 3.5, 0 7" fill="#333" />
+    </marker>
+    <marker id="red-arrowhead" markerWidth="10" markerHeight="7"
+     refX="10" refY="3.5" orient="auto">
+      <polygon points="0 0, 10 3.5, 0 7" fill="#d32f2f" />
+    </marker>
+    <marker id="blue-arrowhead" markerWidth="10" markerHeight="7"
+     refX="10" refY="3.5" orient="auto">
+      <polygon points="0 0, 10 3.5, 0 7" fill="#1976d2" />
+    </marker>
+    <marker id="green-arrowhead" markerWidth="10" markerHeight="7"
+     refX="10" refY="3.5" orient="auto">
+      <polygon points="0 0, 10 3.5, 0 7" fill="#388e3c" />
+    </marker>
+    <filter id="shadow" x="-20%" y="-20%" width="140%" height="140%">
+      <feDropShadow dx="2" dy="2" stdDeviation="2" flood-color="#00000020"/>
+    </filter>
+  </defs>
+
+  <!-- Title -->
+  <text x="400" y="30" font-family="Arial, sans-serif" font-size="18" font-weight="bold" text-anchor="middle" fill="#333">
+    Ring Buffer Architecture (8 slots shown) for Parallel Compression
+  </text>
+
+  <!-- Ring Buffer Array -->
+  <g transform="translate(100,150)">
+    <!-- Array slots laid out horizontally -->
+
+    <!-- Slot 0 -->
+    <rect x="0" y="0" width="70" height="50" fill="#f5f5f5" stroke="#666" stroke-width="1" rx="5" filter="url(#shadow)"/>
+    <text x="35" y="20" font-family="Arial, sans-serif" font-size="10" font-weight="bold" text-anchor="middle" fill="#333">Slot 0</text>
+    <text x="35" y="35" font-family="Arial, sans-serif" font-size="9" text-anchor="middle" fill="#666">Empty</text>
+
+    <!-- Slot 1 - NextToWrite=1, being written by Worker Thread 2 -->
+    <rect x="75" y="0" width="70" height="50" fill="#e8f5e8" stroke="#388e3c" stroke-width="3" rx="5" filter="url(#shadow)"/>
+    <text x="110" y="20" font-family="Arial, sans-serif" font-size="10" font-weight="bold" text-anchor="middle" fill="#333">Slot 1</text>
+    <text x="110" y="35" font-family="Arial, sans-serif" font-size="9" text-anchor="middle" fill="#388e3c">Writing...</text>
+
+    <!-- Slot 2 - Compressed -->
+    <rect x="150" y="0" width="70" height="50" fill="#fff3e0" stroke="#f57c00" stroke-width="2" rx="5" filter="url(#shadow)"/>
+    <text x="185" y="20" font-family="Arial, sans-serif" font-size="10" font-weight="bold" text-anchor="middle" fill="#333">Slot 2</text>
+    <text x="185" y="35" font-family="Arial, sans-serif" font-size="9" text-anchor="middle" fill="#666">Compressed</text>
+
+    <!-- Slot 3 - Being compressed by Worker Thread 1 -->
+    <rect x="225" y="0" width="70" height="50" fill="#e3f2fd" stroke="#1976d2" stroke-width="3" rx="5" filter="url(#shadow)"/>
+    <text x="260" y="20" font-family="Arial, sans-serif" font-size="10" font-weight="bold" text-anchor="middle" fill="#333">Slot 3</text>
+    <text x="260" y="35" font-family="Arial, sans-serif" font-size="9" text-anchor="middle" fill="#1976d2">Compressing...</text>
+
+    <!-- Slot 4 - NextToCompress=4, uncompressed -->
+    <rect x="300" y="0" width="70" height="50" fill="#ffebee" stroke="#d32f2f" stroke-width="2" rx="5" filter="url(#shadow)"/>
+    <text x="335" y="20" font-family="Arial, sans-serif" font-size="10" font-weight="bold" text-anchor="middle" fill="#333">Slot 4</text>
+    <text x="335" y="35" font-family="Arial, sans-serif" font-size="9" text-anchor="middle" fill="#666">Uncompressed</text>
+
+    <!-- Slot 5 - NextToEmit=5, adding block -->
+    <rect x="375" y="0" width="70" height="50" fill="#e1f5fe" stroke="#4fc3f7" stroke-width="3" rx="5" filter="url(#shadow)"/>
+    <text x="410" y="20" font-family="Arial, sans-serif" font-size="10" font-weight="bold" text-anchor="middle" fill="#333">Slot 5</text>
+    <text x="410" y="35" font-family="Arial, sans-serif" font-size="9" text-anchor="middle" fill="#4fc3f7">Adding block...</text>
+
+    <!-- Slot 6 -->
+    <rect x="450" y="0" width="70" height="50" fill="#f5f5f5" stroke="#666" stroke-width="1" rx="5" filter="url(#shadow)"/>
+    <text x="485" y="20" font-family="Arial, sans-serif" font-size="10" font-weight="bold" text-anchor="middle" fill="#333">Slot 6</text>
+    <text x="485" y="35" font-family="Arial, sans-serif" font-size="9" text-anchor="middle" fill="#666">Empty</text>
+
+    <!-- Slot 7 -->
+    <rect x="525" y="0" width="70" height="50" fill="#f5f5f5" stroke="#666" stroke-width="1" rx="5" filter="url(#shadow)"/>
+    <text x="560" y="20" font-family="Arial, sans-serif" font-size="10" font-weight="bold" text-anchor="middle" fill="#333">Slot 7</text>
+    <text x="560" y="35" font-family="Arial, sans-serif" font-size="9" text-anchor="middle" fill="#666">Empty</text>
+  </g>
+
+  <!-- Arrows pointing to array positions - staggered vertically -->
+  <!-- NextToWrite=1 arrow (red) -->
+  <line x1="210" y1="100" x2="210" y2="145" stroke="#d32f2f" stroke-width="3" marker-end="url(#red-arrowhead)"/>
+  <text x="210" y="90" font-family="Arial, sans-serif" font-size="12" font-weight="bold" text-anchor="middle" fill="#d32f2f">NextToWrite=1</text>
+
+  <!-- NextToCompress=4 arrow (blue) -->
+  <line x1="435" y1="110" x2="435" y2="145" stroke="#1976d2" stroke-width="3" marker-end="url(#blue-arrowhead)"/>
+  <text x="435" y="100" font-family="Arial, sans-serif" font-size="12" font-weight="bold" text-anchor="middle" fill="#1976d2">NextToCompress=4</text>
+
+  <!-- NextToEmit=5 arrow (light blue) -->
+  <line x1="510" y1="120" x2="510" y2="145" stroke="#4fc3f7" stroke-width="3" marker-end="url(#arrowhead)"/>
+  <text x="510" y="110" font-family="Arial, sans-serif" font-size="12" font-weight="bold" text-anchor="middle" fill="#4fc3f7">NextToEmit=5</text>
+
+  <!-- Worker Thread 2 (wider box) -->
+  <g transform="translate(30,250)">
+    <rect x="0" y="0" width="160" height="70" fill="#e8f5e8" stroke="#388e3c" stroke-width="2" rx="8" filter="url(#shadow)"/>
+    <text x="80" y="25" font-family="Arial, sans-serif" font-size="14" font-weight="bold" text-anchor="middle" fill="#333">Worker Thread 2</text>
+    <text x="80" y="40" font-family="Arial, sans-serif" font-size="11" text-anchor="middle" fill="#666">Currently writing</text>
+    <text x="80" y="55" font-family="Arial, sans-serif" font-size="11" text-anchor="middle" fill="#666">Can also compress</text>
+  </g>
+
+  <!-- Worker Thread 1 (wider box) -->
+  <g transform="translate(240,280)">
+    <rect x="0" y="0" width="160" height="70" fill="#e3f2fd" stroke="#1976d2" stroke-width="2" rx="8" filter="url(#shadow)"/>
+    <text x="80" y="25" font-family="Arial, sans-serif" font-size="14" font-weight="bold" text-anchor="middle" fill="#333">Worker Thread 1</text>
+    <text x="80" y="40" font-family="Arial, sans-serif" font-size="11" text-anchor="middle" fill="#666">Currently compressing</text>
+    <text x="80" y="55" font-family="Arial, sans-serif" font-size="11" text-anchor="middle" fill="#666">Can also write</text>
+  </g>
+
+  <!-- Emit Thread (wider box) -->
+  <g transform="translate(460,250)">
+    <rect x="0" y="0" width="180" height="70" fill="#e1f5fe" stroke="#4fc3f7" stroke-width="2" rx="8" filter="url(#shadow)"/>
+    <text x="90" y="25" font-family="Arial, sans-serif" font-size="14" font-weight="bold" text-anchor="middle" fill="#333">Emit Thread</text>
+    <text x="90" y="40" font-family="Arial, sans-serif" font-size="11" text-anchor="middle" fill="#666">Generates uncompressed blocks</text>
+    <text x="90" y="55" font-family="Arial, sans-serif" font-size="11" text-anchor="middle" fill="#666">Can help with compression</text>
+  </g>
+
+  <!-- Arrows from threads to array (no labels) -->
+  <!-- Worker Thread 2 to Slot 1 -->
+  <line x1="190" y1="285" x2="210" y2="210" stroke="#388e3c" stroke-width="2" marker-end="url(#green-arrowhead)"/>
+
+  <!-- Worker Thread 1 to Slot 3 -->
+  <line x1="320" y1="280" x2="360" y2="210" stroke="#1976d2" stroke-width="2" marker-end="url(#blue-arrowhead)"/>
+
+  <!-- Emit Thread to Slot 5 -->
+  <line x1="550" y1="250" x2="510" y2="210" stroke="#4fc3f7" stroke-width="2" marker-end="url(#arrowhead)"/>
+
+  <!-- SST File Output (centered below Worker Thread 2) -->
+  <g transform="translate(50,380)">
+    <rect x="0" y="0" width="120" height="40" fill="#e8f5e8" stroke="#388e3c" stroke-width="2" rx="5" filter="url(#shadow)"/>
+    <text x="60" y="25" font-family="Arial, sans-serif" font-size="12" font-weight="bold" text-anchor="middle" fill="#333">SST File</text>
+  </g>
+
+  <!-- Arrow from Worker Thread 2 to SST file -->
+  <line x1="110" y1="320" x2="110" y2="380" stroke="#388e3c" stroke-width="2" marker-end="url(#green-arrowhead)"/>
+
+  <!-- Invariant (moved to the right) -->
+  <g transform="translate(400,430)">
+    <text x="0" y="0" font-family="Arial, sans-serif" font-size="12" font-weight="bold" fill="#333">Invariant:</text>
+    <text x="0" y="20" font-family="Arial, sans-serif" font-size="11" fill="#666">NextToWrite ≤ NextToCompress ≤ NextToEmit (modulo ring buffer size)</text>
+  </g>
+</svg>

From 04c085a3faab360747615fb9975495bfa5a3360a Mon Sep 17 00:00:00 2001
From: anand76 <anand1976@users.noreply.github.com>
Date: Thu, 9 Oct 2025 17:31:54 -0700
Subject: [PATCH 333/500] Disable skip_stats_update_on_db_open in crash tests
 for multiscan (#14039)

Summary:
Multi scan crash/stress tests are failing when skip_stats_update_on_db_open is true, because LevelIterator::Prepare relies on these stats in FileMetaData to make decisions. Disable it in crash tests until the proper fix is ready.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14039

Reviewed By: archang19

Differential Revision: D84280059

Pulled By: anand1976

fbshipit-source-id: f9f58b94c24d1f455432b05f3bf97f25c7233e3c
---
 tools/db_crashtest.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index 12482c0ba85f..2b456efab9f4 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -1202,6 +1202,9 @@ def finalize_and_sanitize(src_params):
         # existing key range, which will cause a reseek that's currently not
         # supported by multiscan
         dest_params["test_ingest_standalone_range_deletion_one_in"] = 0
+        # LevelIterator multiscan currently relies on num_entries and num_range_deletions,
+        # which are not updated if skip_stats_update_on_db_open is true
+        dest_params["skip_stats_update_on_db_open"] = 0
 
     # inplace update and key checksum verification during seek would cause race condition
     # Therefore, when inplace_update_support is enabled, disable memtable_veirfy_per_key_checksum_on_seek

From 1585f2240c3af0f1b6e81706277b83d6b155e1e4 Mon Sep 17 00:00:00 2001
From: Xingbo Wang <xbw@fb.com>
Date: Mon, 13 Oct 2025 12:48:04 -0700
Subject: [PATCH 334/500] Move the MultiScan seek key check to upper layer
 (#14040)

Summary:
The current seek key validation is too strict. This change relaxes it at block iterator level, and add additional check at DB iterator level. The new contract is that when MultiScan is used, after prepared is called, each following seek must seek the start key of the prepared scan range in order. Otherwise, the iterator is set with error status.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14040

Test Plan: Unit test

Reviewed By: anand1976

Differential Revision: D84292297

Pulled By: xingbowang

fbshipit-source-id: 7b31f727e67e7c0bfc53c2f9a6552e0c3d324869
---
 db/db_iter.cc                                 | 104 +++
 db/db_iter.h                                  |  13 +-
 db/db_iterator_test.cc                        |  12 -
 db/multi_scan.cc                              |   5 +
 db/version_set.cc                             |   2 -
 include/rocksdb/multi_scan.h                  |   4 +
 .../block_based/block_based_table_iterator.cc | 318 ++++---
 .../block_based/block_based_table_iterator.h  |  50 +-
 .../block_based_table_reader_test.cc          | 149 +++-
 table/table_test.cc                           | 812 ++++++++++++++++--
 .../multi_scan_api_contract.md                |   1 +
 11 files changed, 1228 insertions(+), 242 deletions(-)
 create mode 100644 unreleased_history/public_api_changes/multi_scan_api_contract.md

diff --git a/db/db_iter.cc b/db/db_iter.cc
index 7258913e765d..52a357247f06 100644
--- a/db/db_iter.cc
+++ b/db/db_iter.cc
@@ -1565,11 +1565,115 @@ void DBIter::SetSavedKeyToSeekForPrevTarget(const Slice& target) {
   }
 }
 
+Status DBIter::ValidateScanOptions(const MultiScanArgs& multiscan_opts) const {
+  if (multiscan_opts.empty()) {
+    return Status::InvalidArgument("Empty MultiScanArgs");
+  }
+
+  const std::vector<ScanOptions>& scan_opts = multiscan_opts.GetScanRanges();
+  const bool has_limit = scan_opts.front().range.limit.has_value();
+  if (!has_limit && scan_opts.size() > 1) {
+    return Status::InvalidArgument("Scan has no upper bound");
+  }
+
+  for (size_t i = 0; i < scan_opts.size(); ++i) {
+    const auto& scan_range = scan_opts[i].range;
+    if (!scan_range.start.has_value()) {
+      return Status::InvalidArgument("Scan has no start key at index " +
+                                     std::to_string(i));
+    }
+
+    if (scan_range.limit.has_value()) {
+      if (user_comparator_.CompareWithoutTimestamp(
+              scan_range.start.value(), /*a_has_ts=*/false,
+              scan_range.limit.value(), /*b_has_ts=*/false) >= 0) {
+        return Status::InvalidArgument(
+            "Scan start key is large or equal than limit at index " +
+            std::to_string(i));
+      }
+    }
+
+    if (i > 0) {
+      if (!scan_range.limit.has_value()) {
+        // multiple scan without limit scan ranges
+        return Status::InvalidArgument("Scan has no upper bound at index " +
+                                       std::to_string(i));
+      }
+
+      const auto& last_end_key = scan_opts[i - 1].range.limit.value();
+      if (user_comparator_.CompareWithoutTimestamp(
+              scan_range.start.value(), /*a_has_ts=*/false, last_end_key,
+              /*b_has_ts=*/false) < 0) {
+        return Status::InvalidArgument("Overlapping ranges at index " +
+                                       std::to_string(i));
+      }
+    }
+  }
+  return Status::OK();
+}
+
+void DBIter::Prepare(const MultiScanArgs& scan_opts) {
+  status_ = ValidateScanOptions(scan_opts);
+  if (!status_.ok()) {
+    return;
+  }
+  std::optional<MultiScanArgs> new_scan_opts;
+  new_scan_opts.emplace(scan_opts);
+  scan_opts_.swap(new_scan_opts);
+  scan_index_ = 0;
+  if (!scan_opts.empty()) {
+    iter_.Prepare(&scan_opts_.value());
+  } else {
+    iter_.Prepare(nullptr);
+  }
+}
+
 void DBIter::Seek(const Slice& target) {
   PERF_COUNTER_ADD(iter_seek_count, 1);
   PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, clock_);
   StopWatch sw(clock_, statistics_, DB_SEEK);
 
+  if (scan_opts_.has_value()) {
+    // Validate the seek target is as expected in the previously prepared range
+    auto const& scan_ranges = scan_opts_.value().GetScanRanges();
+    if (scan_index_ >= scan_ranges.size()) {
+      status_ = Status::InvalidArgument(
+          "Seek called after exhausting all of the scan ranges");
+      valid_ = false;
+      return;
+    }
+
+    // Validate start key of next prepare range matches the seek target
+    auto const& range = scan_ranges[scan_index_];
+    auto const& start = range.range.start;
+    assert(start.has_value());
+    if (user_comparator_.CompareWithoutTimestamp(target, *start) != 0) {
+      status_ = Status::InvalidArgument(
+          "Seek target does not match the start of the next prepared range at "
+          "index " +
+          std::to_string(scan_index_));
+      valid_ = false;
+      return;
+    }
+
+    // validate the upper bound is set to the same value of limit, if limit
+    // exists
+    auto const& limit = range.range.limit;
+    if (limit.has_value()) {
+      if (iterate_upper_bound_ == nullptr ||
+          user_comparator_.CompareWithoutTimestamp(
+              limit.value(), *iterate_upper_bound_) != 0) {
+        status_ = Status::InvalidArgument(
+            "Upper bound is not set to the same limit value of the next "
+            "prepared range at index " +
+            std::to_string(scan_index_));
+        valid_ = false;
+        return;
+      }
+    }
+    scan_index_++;
+  }
+
   if (cfh_ != nullptr) {
     // TODO: What do we do if this returns an error?
     Slice lower_bound, upper_bound;
diff --git a/db/db_iter.h b/db/db_iter.h
index 28a5b22db7fa..575dc455eedc 100644
--- a/db/db_iter.h
+++ b/db/db_iter.h
@@ -240,16 +240,8 @@ class DBIter final : public Iterator {
 
   bool PrepareValue() override;
 
-  void Prepare(const MultiScanArgs& scan_opts) override {
-    std::optional<MultiScanArgs> new_scan_opts;
-    new_scan_opts.emplace(scan_opts);
-    scan_opts_.swap(new_scan_opts);
-    if (!scan_opts.empty()) {
-      iter_.Prepare(&scan_opts_.value());
-    } else {
-      iter_.Prepare(nullptr);
-    }
-  }
+  void Prepare(const MultiScanArgs& scan_opts) override;
+  Status ValidateScanOptions(const MultiScanArgs& multiscan_opts) const;
 
  private:
   DBIter(Env* _env, const ReadOptions& read_options,
@@ -506,6 +498,7 @@ class DBIter final : public Iterator {
   const size_t timestamp_size_;
   std::string saved_timestamp_;
   std::optional<MultiScanArgs> scan_opts_;
+  size_t scan_index_{0};
   ReadOnlyMemTable* const active_mem_;
   SequenceNumber memtable_seqno_lb_;
   uint32_t memtable_op_scan_flush_trigger_;
diff --git a/db/db_iterator_test.cc b/db/db_iterator_test.cc
index 1bf83d8e230d..92108d37eec7 100644
--- a/db/db_iterator_test.cc
+++ b/db/db_iterator_test.cc
@@ -4382,18 +4382,6 @@ TEST_P(DBMultiScanIteratorTest, FailureTest) {
   iter->Seek(key_ranges[2]);
   ASSERT_NOK(iter->status());
   iter.reset();
-
-  // Test the case of overlapping ranges
-  iter.reset(dbfull()->NewIterator(ro, cfh));
-  ASSERT_NE(iter, nullptr);
-  (*scan_options).clear();
-  scan_options.insert(key_ranges[0]);
-  scan_options.insert(key_ranges[2], key_ranges[3]);
-  iter->Prepare(scan_options);
-  ub = key_ranges[3];
-  iter->Seek(key_ranges[2]);
-  ASSERT_NOK(iter->status());
-  iter.reset();
 }
 
 TEST_P(DBMultiScanIteratorTest, OutOfL0FileRange) {
diff --git a/db/multi_scan.cc b/db/multi_scan.cc
index ae31c4882d62..3d3855e0946d 100644
--- a/db/multi_scan.cc
+++ b/db/multi_scan.cc
@@ -40,6 +40,11 @@ MultiScan::MultiScan(const ReadOptions& read_options,
 }
 
 MultiScanIterator& MultiScanIterator::operator++() {
+  status_ = db_iter_->status();
+  if (!status_.ok()) {
+    throw MultiScanException(status_);
+  }
+
   if (idx_ >= scan_opts_.size()) {
     throw std::logic_error("Index out of range");
   }
diff --git a/db/version_set.cc b/db/version_set.cc
index d265f51f3e5b..1dd22c071825 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -1668,8 +1668,6 @@ void LevelIterator::SetFileIterator(InternalIterator* iter) {
     if (FileHasMultiScanArg(file_index_)) {
       const MultiScanArgs& new_opts = GetMultiScanArgForFile(file_index_);
       file_iter_.Prepare(&new_opts);
-    } else {
-      file_iter_.Prepare(scan_opts_);
     }
   }
 
diff --git a/include/rocksdb/multi_scan.h b/include/rocksdb/multi_scan.h
index a1c87d57fff5..4b0917173701 100644
--- a/include/rocksdb/multi_scan.h
+++ b/include/rocksdb/multi_scan.h
@@ -191,6 +191,10 @@ class MultiScan {
       if (scan_opts_.empty()) {
         throw std::logic_error("Zero scans in multi-scan");
       }
+      status_ = db_iter_->status();
+      if (!status_.ok()) {
+        throw MultiScanException(status_);
+      }
       db_iter_->Seek(*scan_opts_[idx_].range.start);
       status_ = db_iter_->status();
       if (!status_.ok()) {
diff --git a/table/block_based/block_based_table_iterator.cc b/table/block_based/block_based_table_iterator.cc
index ea288022a4e2..9398b4f1772b 100644
--- a/table/block_based/block_based_table_iterator.cc
+++ b/table/block_based/block_based_table_iterator.cc
@@ -988,19 +988,16 @@ void BlockBasedTableIterator::Prepare(const MultiScanArgs* multiscan_opts) {
     multi_scan_status_ = Status::InvalidArgument("Prepare already called");
     return;
   }
-  multi_scan_status_ = ValidateScanOptions(multiscan_opts);
-  if (!multi_scan_status_.ok()) {
-    return;
-  }
 
   index_iter_->Prepare(multiscan_opts);
 
   std::vector<BlockHandle> scan_block_handles;
+  std::vector<std::string> data_block_separators;
   std::vector<std::tuple<size_t, size_t>> block_index_ranges_per_scan;
   const std::vector<ScanOptions>& scan_opts = multiscan_opts->GetScanRanges();
-  multi_scan_status_ =
-      CollectBlockHandles(scan_opts, multiscan_opts->RequireFileOverlap(),
-                          &scan_block_handles, &block_index_ranges_per_scan);
+  multi_scan_status_ = CollectBlockHandles(
+      scan_opts, multiscan_opts->RequireFileOverlap(), &scan_block_handles,
+      &block_index_ranges_per_scan, &data_block_separators);
   if (!multi_scan_status_.ok()) {
     return;
   }
@@ -1039,7 +1036,7 @@ void BlockBasedTableIterator::Prepare(const MultiScanArgs* multiscan_opts) {
   // blocks.
   multi_scan_ = std::make_unique<MultiScanState>(
       table_->get_rep()->ioptions.env->GetFileSystem(), multiscan_opts,
-      std::move(pinned_data_blocks_guard),
+      std::move(pinned_data_blocks_guard), std::move(data_block_separators),
       std::move(block_index_ranges_per_scan),
       std::move(block_idx_to_readreq_idx), std::move(async_states),
       prefetched_max_idx);
@@ -1048,25 +1045,120 @@ void BlockBasedTableIterator::Prepare(const MultiScanArgs* multiscan_opts) {
   block_iter_points_to_real_block_ = false;
 }
 
-void BlockBasedTableIterator::SeekMultiScan(const Slice* target) {
+void BlockBasedTableIterator::SeekMultiScan(const Slice* seek_target) {
+  if (SeekMultiScanImpl(seek_target)) {
+    is_out_of_bound_ = true;
+    assert(!Valid());
+  }
+}
+
+bool BlockBasedTableIterator::SeekMultiScanImpl(const Slice* seek_target) {
   assert(multi_scan_ && multi_scan_status_.ok());
   // This is a MultiScan and Preapre() has been called.
-  //
+
+  // Reset out of bound on seek, if it is out of bound again, it will be set
+  // properly later in the code path
+  is_out_of_bound_ = false;
+
   // Validate seek key with scan options
-  if (multi_scan_->next_scan_idx >= multi_scan_->scan_opts->size()) {
-    multi_scan_status_ = Status::InvalidArgument("Outside MultiScan range");
-  } else if (!target) {
+  if (!seek_target) {
     // start key must be set for multi-scan
     multi_scan_status_ = Status::InvalidArgument("No seek key for MultiScan");
-  } else if (user_comparator_.CompareWithoutTimestamp(
-                 ExtractUserKey(*target), /*a_has_ts=*/true,
-                 multi_scan_->scan_opts
-                     ->GetScanRanges()[multi_scan_->next_scan_idx]
-                     .range.start.value(),
-                 /*b_has_ts=*/false) != 0) {
-    // Unexpected seek key
-    multi_scan_status_ = Status::InvalidArgument("Unexpected seek key");
+    return false;
+  }
+
+  constexpr auto out_of_bound = true;
+
+  // Check the case where there is no range prepared on this table
+  if (multi_scan_->scan_opts->size() == 0) {
+    // out of bound
+    return out_of_bound;
+  }
+
+  // Check whether seek key is moving forward.
+  if (!multi_scan_->prev_seek_key_.empty()) {
+    if (user_comparator_.CompareWithoutTimestamp(ExtractUserKey(*seek_target),
+                                                 /*a_has_ts=*/true,
+                                                 multi_scan_->prev_seek_key_,
+                                                 /*b_has_ts=*/false) < 0) {
+      // The seek target moved backward
+      multi_scan_status_ =
+          Status::InvalidArgument("Unexpected seek key moving backward");
+      return false;
+    }
+  }
+  multi_scan_->prev_seek_key_ = ExtractUserKey(*seek_target).ToString();
+
+  // There are still a few cases we need to handle
+  // table: _____[prepared range 1]_____[prepared range 2]_____
+  // seek :   1  2        3          4                      5
+  // Case 1: seek before the first prepared ranges, return out of bound
+  // Case 2: seek at the beginning of a prepared range (expected case)
+  // Case 3: seek within a prepared range (unexpected, but supported)
+  // Case 4: seek between 2 of the prepared ranges, return out of bound
+  // Case 5: seek after all of the prepared ranges, should move on to next file
+  // The reason this could happen is due to seek key adjustment due to delete
+  // range file.
+  // E.g. LSM has 3 levels, each level has only 1 file:
+  // L1 : key :              0---10
+  // L2 : Delete range key : 0-5
+  // L3 : key :              0---10
+  // When a range 2-8 was prepared, the prepared key would be 2 on L3 file, but
+  // the seek key would be 5, as the seek key was updated by the largest key of
+  // delete range. This causes all of the cases above to be possible, when the
+  // ranges are adjusted in the above examples.
+
+  // Allow reseek on the start of the last prepared range due to too many
+  // tombstone
+  multi_scan_->next_scan_idx =
+      std::min(multi_scan_->next_scan_idx,
+               multi_scan_->block_index_ranges_per_scan.size() - 1);
+
+  auto compare_next_scan_start_result =
+      user_comparator_.CompareWithoutTimestamp(
+          ExtractUserKey(*seek_target), /*a_has_ts=*/true,
+          multi_scan_->scan_opts->GetScanRanges()[multi_scan_->next_scan_idx]
+              .range.start.value(),
+          /*b_has_ts=*/false);
+
+  if (compare_next_scan_start_result != 0) {
+    // The seek key is not exactly same as what was prepared.
+    if (compare_next_scan_start_result < 0) {
+      // Needs to handle Cases: 1, 3, 4
+      //
+      // next_scan_idx :                    |
+      //                                    V
+      // table: _____[prepared range 1]_____[prepared range 2]_____
+      // seek :   1           3          4
+
+      // Case 1: Seek key is before the start key of the first range
+      if (multi_scan_->next_scan_idx == 0) {
+        return out_of_bound;
+      }
+      // Case: 3, 4
+      MultiScanUnexpectedSeekTarget(
+          seek_target, std::get<0>(multi_scan_->block_index_ranges_per_scan
+                                       [multi_scan_->next_scan_idx - 1]));
+
+    } else {
+      // Needs to handle Cases: 3, 4, 5
+      // next_scan_idx :|
+      //                V
+      // table:     ____[prepared range 1]_____[prepared range 2]_____
+      // seek :                 3           4                      5
+      MultiScanUnexpectedSeekTarget(
+          seek_target,
+          std::get<0>(
+              multi_scan_
+                  ->block_index_ranges_per_scan[multi_scan_->next_scan_idx]));
+    }
   } else {
+    if (multi_scan_->next_scan_idx >=
+        multi_scan_->block_index_ranges_per_scan.size()) {
+      // Seeking a range that is out side of prepared ranges.
+      return out_of_bound;
+    }
+    // unpin block, then do a seek.
     if (multi_scan_->next_scan_idx > 0) {
       UnpinPreviousScanBlocks(multi_scan_->next_scan_idx);
     }
@@ -1076,36 +1168,82 @@ void BlockBasedTableIterator::SeekMultiScan(const Slice* target) {
     // We should have the data block already loaded
     ++multi_scan_->next_scan_idx;
     if (cur_scan_start_idx >= cur_scan_end_idx) {
-      is_out_of_bound_ = true;
-      assert(!Valid());
-      return;
+      return out_of_bound;
     } else {
       is_out_of_bound_ = false;
     }
 
-    if (!block_iter_points_to_real_block_ ||
-        multi_scan_->cur_data_block_idx != cur_scan_start_idx) {
-      if (block_iter_points_to_real_block_) {
-        // Should be scan in increasing key range.
-        // All blocks before cur_data_block_idx_ are not pinned anymore.
-        assert(multi_scan_->cur_data_block_idx < cur_scan_start_idx);
-      }
+    MultiScanSeekTargetFromBlock(seek_target, cur_scan_start_idx);
+  }
 
-      ResetDataIter();
+  return false;
+}
 
-      multi_scan_->cur_data_block_idx = cur_scan_start_idx;
-      multi_scan_status_ = MultiScanLoadDataBlock(cur_scan_start_idx);
-      if (!multi_scan_status_.ok()) {
-        assert(!Valid());
-        assert(status() == multi_scan_status_);
-        return;
-      }
+void BlockBasedTableIterator::MultiScanUnexpectedSeekTarget(
+    const Slice* seek_target, size_t block_idx) {
+  // linear search the block that contains the seek target, and unpin blocks
+  // that are before it.
+  auto const& data_block_separators = multi_scan_->data_block_separators;
+  while (block_idx < data_block_separators.size() &&
+         (user_comparator_.CompareWithoutTimestamp(
+              ExtractUserKey(*seek_target), /*a_has_ts=*/true,
+              data_block_separators[block_idx],
+              /*b_has_ts=*/false) > 0)) {
+    if (!multi_scan_->pinned_data_blocks[block_idx].IsEmpty()) {
+      multi_scan_->pinned_data_blocks[block_idx].Reset();
     }
-    multi_scan_->cur_data_block_idx = cur_scan_start_idx;
-    block_iter_points_to_real_block_ = true;
-    block_iter_.Seek(*target);
-    FindKeyForward();
+    block_idx++;
   }
+
+  if (block_idx >= data_block_separators.size()) {
+    // Handle case 5, when seek key is larger than the last block in the last
+    // prepared range.
+    ResetDataIter();
+    assert(!Valid());
+    return;
+  }
+
+  // // The iterator from previous seek may have moved forward a few blocks,
+  // // In that case, have block_idx catch up the cur_data_block_idx
+  // // Note no need to handle block unpin, as it has been handled during
+  // iterating block_idx = std::max(block_idx, multi_scan_->cur_data_block_idx);
+
+  // advance to the right prepared range
+  while (
+      multi_scan_->next_scan_idx <
+          multi_scan_->block_index_ranges_per_scan.size() &&
+      (user_comparator_.CompareWithoutTimestamp(
+           ExtractUserKey(*seek_target), /*a_has_ts=*/true,
+           multi_scan_->scan_opts->GetScanRanges()[multi_scan_->next_scan_idx]
+               .range.start.value(),
+           /*b_has_ts=*/false) >= 0)) {
+    multi_scan_->next_scan_idx++;
+  }
+
+  // The current block may contain the data for the target key
+  MultiScanSeekTargetFromBlock(seek_target, block_idx);
+}
+
+void BlockBasedTableIterator::MultiScanSeekTargetFromBlock(
+    const Slice* seek_target, size_t block_idx) {
+  if (!block_iter_points_to_real_block_ ||
+      multi_scan_->cur_data_block_idx != block_idx) {
+    if (block_iter_points_to_real_block_) {
+      // Should be scan in increasing key range.
+      // All blocks before cur_data_block_idx_ are not pinned anymore.
+      assert(multi_scan_->cur_data_block_idx < block_idx);
+    }
+
+    ResetDataIter();
+
+    if (MultiScanLoadDataBlock(block_idx)) {
+      return;
+    }
+  }
+  multi_scan_->cur_data_block_idx = block_idx;
+  block_iter_points_to_real_block_ = true;
+  block_iter_.Seek(*seek_target);
+  FindKeyForward();
 }
 
 void BlockBasedTableIterator::UnpinPreviousScanBlocks(size_t current_scan_idx) {
@@ -1115,13 +1253,13 @@ void BlockBasedTableIterator::UnpinPreviousScanBlocks(size_t current_scan_idx) {
   assert(current_scan_idx < multi_scan_->block_index_ranges_per_scan.size());
   if (current_scan_idx == 0) return;
 
-  auto [prev_start_block_idx, prev_end_block_idx] =
-      multi_scan_->block_index_ranges_per_scan[current_scan_idx - 1];
+  auto prev_start_block_idx = std::get<0>(
+      multi_scan_->block_index_ranges_per_scan[current_scan_idx - 1]);
   // Since a block can be shared between consecutive scans, we need
   // curr_start_block_idx here instead of just release blocks
-  // up to prev_end_block_idx.
-  auto [curr_start_block_idx, curr_end_block_idx] =
-      multi_scan_->block_index_ranges_per_scan[current_scan_idx];
+  // up to the end of previous range block index.
+  auto curr_start_block_idx =
+      std::get<0>(multi_scan_->block_index_ranges_per_scan[current_scan_idx]);
   for (size_t block_idx = prev_start_block_idx;
        block_idx < curr_start_block_idx; ++block_idx) {
     if (!multi_scan_->pinned_data_blocks[block_idx].IsEmpty()) {
@@ -1148,8 +1286,8 @@ void BlockBasedTableIterator::FindBlockForwardInMultiScan() {
     if (multi_scan_->cur_data_block_idx + 1 >= cur_scan_end_idx) {
       if (multi_scan_->next_scan_idx >=
           multi_scan_->block_index_ranges_per_scan.size()) {
-        // We are done with this file, should let LevelIter advance to the next
-        // file instead of ending the scan
+        // We are done with this file, should let LevelIter advance to the
+        // next file instead of ending the scan
         ResetDataIter();
         assert(!is_out_of_bound_);
         assert(!Valid());
@@ -1166,11 +1304,7 @@ void BlockBasedTableIterator::FindBlockForwardInMultiScan() {
     ResetDataIter();
     ++multi_scan_->cur_data_block_idx;
 
-    multi_scan_status_ =
-        MultiScanLoadDataBlock(multi_scan_->cur_data_block_idx);
-    if (!multi_scan_status_.ok()) {
-      assert(!Valid());
-      assert(status() == multi_scan_status_);
+    if (MultiScanLoadDataBlock(multi_scan_->cur_data_block_idx)) {
       return;
     }
 
@@ -1281,54 +1415,19 @@ Status BlockBasedTableIterator::CreateAndPinBlockFromBuffer(
       &pinned_block_entry.As<Block_kData>());
 }
 
-Status BlockBasedTableIterator::ValidateScanOptions(
-    const MultiScanArgs* multiscan_opts) {
-  if (multiscan_opts == nullptr || multiscan_opts->empty()) {
-    return Status::InvalidArgument("Empty MultiScanArgs");
-  }
-
-  const std::vector<ScanOptions>& scan_opts = multiscan_opts->GetScanRanges();
-  const bool has_limit = scan_opts.front().range.limit.has_value();
-  if (!has_limit && scan_opts.size() > 1) {
-    // Abort: overlapping ranges
-    return Status::InvalidArgument("Scan has no upper bound");
-  }
-
-  for (size_t i = 0; i < scan_opts.size(); ++i) {
-    const auto& scan_range = scan_opts[i].range;
-    if (!scan_range.start.has_value()) {
-      // Abort: no start key
-      return Status::InvalidArgument("Scan has no start key");
-    }
-
-    if (scan_range.limit.has_value()) {
-      assert(user_comparator_.CompareWithoutTimestamp(
-                 scan_range.start.value(), /*a_has_ts=*/false,
-                 scan_range.limit.value(), /*b_has_ts=*/false) <= 0);
-    }
-
-    if (i > 0) {
-      if (!scan_range.limit.has_value()) {
-        // multiple no limit scan ranges
-        return Status::InvalidArgument("Scan has no upper bound");
-      }
-
-      const auto& last_end_key = scan_opts[i - 1].range.limit.value();
-      if (user_comparator_.CompareWithoutTimestamp(
-              scan_range.start.value(), /*a_has_ts=*/false, last_end_key,
-              /*b_has_ts=*/false) < 0) {
-        // Abort: overlapping ranges
-        return Status::InvalidArgument("Overlapping ranges");
-      }
-    }
-  }
-  return Status::OK();
-}
+constexpr auto kVerbose = false;
 
 Status BlockBasedTableIterator::CollectBlockHandles(
     const std::vector<ScanOptions>& scan_opts, bool require_file_overlap,
     std::vector<BlockHandle>* scan_block_handles,
-    std::vector<std::tuple<size_t, size_t>>* block_index_ranges_per_scan) {
+    std::vector<std::tuple<size_t, size_t>>* block_index_ranges_per_scan,
+    std::vector<std::string>* data_block_separators) {
+  // print file name and level
+  if (kVerbose) {
+    auto file_name = table_->get_rep()->file->file_name();
+    auto level = table_->get_rep()->level;
+    printf("file name : %s, level %d\n", file_name.c_str(), level);
+  }
   for (const auto& scan_opt : scan_opts) {
     size_t num_blocks = 0;
     bool check_overlap = !scan_block_handles->empty();
@@ -1348,15 +1447,19 @@ Status BlockBasedTableIterator::CollectBlockHandles(
     index_iter_->Seek(start_key.Encode());
     while (index_iter_->status().ok() && index_iter_->Valid() &&
            (!scan_opt.range.limit.has_value() ||
-            user_comparator_.CompareWithoutTimestamp(
-                index_iter_->user_key(),
-                /*a_has_ts*/ true, *scan_opt.range.limit,
-                /*b_has_ts=*/false) <= 0)) {
+            user_comparator_.CompareWithoutTimestamp(index_iter_->user_key(),
+                                                     /*a_has_ts*/ true,
+                                                     *scan_opt.range.limit,
+                                                     /*b_has_ts=*/false) < 0)) {
+      // Only add the block if the index separator is smaller than limit. When
+      // they are equal or larger, it will be handled later below.
       if (check_overlap &&
           scan_block_handles->back() == index_iter_->value().handle) {
         // Skip the current block since it's already in the list
       } else {
         scan_block_handles->push_back(index_iter_->value().handle);
+        // clone the Slice to avoid the lifetime issue
+        data_block_separators->push_back(index_iter_->user_key().ToString());
       }
       ++num_blocks;
       index_iter_->Next();
@@ -1369,11 +1472,13 @@ Status BlockBasedTableIterator::CollectBlockHandles(
     }
 
     if (index_iter_->Valid()) {
+      // Handle the last block when its separator is equal or larger than limit
       if (check_overlap &&
           scan_block_handles->back() == index_iter_->value().handle) {
         // Skip adding the current block since it's already in the list
       } else {
         scan_block_handles->push_back(index_iter_->value().handle);
+        data_block_separators->push_back(index_iter_->user_key().ToString());
       }
       ++num_blocks;
     } else if (num_blocks == 0 && index_iter_->UpperBoundCheckResult() !=
@@ -1389,6 +1494,13 @@ Status BlockBasedTableIterator::CollectBlockHandles(
     }
     block_index_ranges_per_scan->emplace_back(
         scan_block_handles->size() - num_blocks, scan_block_handles->size());
+    if (kVerbose) {
+      printf("separators :");
+      for (const auto& separator : *data_block_separators) {
+        printf("%s, ", separator.c_str());
+      }
+      printf("\n");
+    }
   }
   return Status::OK();
 }
diff --git a/table/block_based/block_based_table_iterator.h b/table/block_based/block_based_table_iterator.h
index 9cb2e407c5da..16f4f74f2bc0 100644
--- a/table/block_based/block_based_table_iterator.h
+++ b/table/block_based/block_based_table_iterator.h
@@ -438,6 +438,15 @@ class BlockBasedTableIterator : public InternalIteratorBase<Slice> {
     const std::shared_ptr<FileSystem> fs;
     const MultiScanArgs* scan_opts;
     std::vector<CachableEntry<Block>> pinned_data_blocks;
+    // The separator of each data block in above pinned_data_blocks vector.
+    // Its size is same as pinned_data_blocks.
+    // The value of separator is larger than or equal to the last key in the
+    // corresponding data block.
+    std::vector<std::string> data_block_separators;
+    // Track previously seeked key in multi-scan.
+    // This is used to ensure that the seek key is keep moving forward, as
+    // blocks that are smaller than the seek key are unpinned from memory.
+    std::string prev_seek_key_;
 
     // Indicies into pinned_data_blocks for data blocks for each scan range.
     // inclusive start, exclusive end
@@ -464,12 +473,14 @@ class BlockBasedTableIterator : public InternalIteratorBase<Slice> {
     MultiScanState(
         const std::shared_ptr<FileSystem>& _fs, const MultiScanArgs* _scan_opts,
         std::vector<CachableEntry<Block>>&& _pinned_data_blocks,
+        std::vector<std::string>&& _data_block_separators,
         std::vector<std::tuple<size_t, size_t>>&& _block_index_ranges_per_scan,
         UnorderedMap<size_t, size_t>&& _block_idx_to_readreq_idx,
         std::vector<AsyncReadState>&& _async_states, size_t _prefetch_max_idx)
         : fs(_fs),
           scan_opts(_scan_opts),
           pinned_data_blocks(std::move(_pinned_data_blocks)),
+          data_block_separators(std::move(_data_block_separators)),
           block_index_ranges_per_scan(std::move(_block_index_ranges_per_scan)),
           next_scan_idx(0),
           cur_data_block_idx(0),
@@ -601,8 +612,12 @@ class BlockBasedTableIterator : public InternalIteratorBase<Slice> {
 
   // *** BEGIN APIs relevant to multiscan ***
 
+  // Wrapper for SeekMultiScanImpl for handling out of bound
   void SeekMultiScan(const Slice* target);
 
+  // Return true if the result is out of bound
+  bool SeekMultiScanImpl(const Slice* seek_target);
+
   void FindBlockForwardInMultiScan();
 
   // Unpins blocks from the immediately previous scan range.
@@ -628,15 +643,32 @@ class BlockBasedTableIterator : public InternalIteratorBase<Slice> {
     }
   }
 
-  Status MultiScanLoadDataBlock(size_t idx) {
+  void MultiScanSeekTargetFromBlock(const Slice* seek_target, size_t block_idx);
+  void MultiScanUnexpectedSeekTarget(const Slice* seek_target,
+                                     size_t block_idx);
+
+  // Return true, if there is an error, or end of file
+  bool MultiScanLoadDataBlock(size_t idx) {
     if (idx >= multi_scan_->prefetch_max_idx) {
-      return Status::PrefetchLimitReached();
+      // TODO: Fix the max_prefetch_size support for multiple files.
+      // The goal is to limit the memory usage, prefetch could be done
+      // incrementally.
+      if (multi_scan_->scan_opts->max_prefetch_size == 0) {
+        // If max_prefetch_size is not set, treat this as end of file.
+        ResetDataIter();
+        assert(!is_out_of_bound_);
+        assert(!Valid());
+      } else {
+        // If max_prefetch_size is set, treat this as error.
+        multi_scan_status_ = Status::PrefetchLimitReached();
+      }
+      return true;
     }
 
     if (!multi_scan_->async_states.empty()) {
-      Status s = PollForBlock(idx);
-      if (!s.ok()) {
-        return s;
+      multi_scan_status_ = PollForBlock(idx);
+      if (!multi_scan_status_.ok()) {
+        return true;
       }
     }
     // This block should have been initialized
@@ -647,7 +679,7 @@ class BlockBasedTableIterator : public InternalIteratorBase<Slice> {
     table_->NewDataBlockIterator<DataBlockIter>(
         read_options_, multi_scan_->pinned_data_blocks[idx], &block_iter_,
         Status::OK());
-    return Status::OK();
+    return false;
   }
 
   // After PollForBlock(idx), the async request that contains
@@ -665,13 +697,11 @@ class BlockBasedTableIterator : public InternalIteratorBase<Slice> {
                                      const Slice& buffer_data,
                                      CachableEntry<Block>& pinned_block_entry);
 
-  // Helper functions for Prepare():
-  Status ValidateScanOptions(const MultiScanArgs* multiscan_opts);
-
   Status CollectBlockHandles(
       const std::vector<ScanOptions>& scan_opts, bool require_file_overlap,
       std::vector<BlockHandle>* scan_block_handles,
-      std::vector<std::tuple<size_t, size_t>>* block_index_ranges_per_scan);
+      std::vector<std::tuple<size_t, size_t>>* block_index_ranges_per_scan,
+      std::vector<std::string>* data_block_boundary_keys);
 
   Status FilterAndPinCachedBlocks(
       const std::vector<BlockHandle>& scan_block_handles,
diff --git a/table/block_based/block_based_table_reader_test.cc b/table/block_based/block_based_table_reader_test.cc
index d8426ed0cb7e..99f3b0164bd7 100644
--- a/table/block_based/block_based_table_reader_test.cc
+++ b/table/block_based/block_based_table_reader_test.cc
@@ -1106,7 +1106,11 @@ TEST_P(BlockBasedTableReaderMultiScanAsyncIOTest, MultiScanPrepare) {
                            CompressionTypeToString(compression_type_) +
                            "_async" + std::to_string(use_async_io);
   ImmutableOptions ioptions(options_);
-  CreateTable(table_name, ioptions, compression_type_, kv,
+  // Only insert 60 out of 100 blocks
+  CreateTable(table_name, ioptions, compression_type_,
+              std::vector<std::pair<std::string, std::string>>{
+                  kv.begin() + 20 * kEntriesPerBlock,
+                  kv.begin() + 80 * kEntriesPerBlock},
               compression_parallel_threads_, compression_dict_bytes_);
 
   std::unique_ptr<BlockBasedTable> table;
@@ -1125,16 +1129,16 @@ TEST_P(BlockBasedTableReaderMultiScanAsyncIOTest, MultiScanPrepare) {
 
   MultiScanArgs scan_options(comparator_);
   scan_options.use_async_io = use_async_io;
-  scan_options.insert(ExtractUserKey(kv[0].first),
-                      ExtractUserKey(kv[kEntriesPerBlock].first));
-  scan_options.insert(ExtractUserKey(kv[2 * kEntriesPerBlock].first),
-                      ExtractUserKey(kv[3 * kEntriesPerBlock].first));
+  scan_options.insert(ExtractUserKey(kv[30 * kEntriesPerBlock].first),
+                      ExtractUserKey(kv[31 * kEntriesPerBlock].first));
+  scan_options.insert(ExtractUserKey(kv[32 * kEntriesPerBlock].first),
+                      ExtractUserKey(kv[33 * kEntriesPerBlock].first));
   auto read_count_before =
       options_.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
 
   iter->Prepare(&scan_options);
-  iter->Seek(kv[0].first);
-  for (size_t i = 0; i < kEntriesPerBlock + 1; ++i) {
+  iter->Seek(kv[30 * kEntriesPerBlock].first);
+  for (size_t i = 30 * kEntriesPerBlock; i <= 31 * kEntriesPerBlock; ++i) {
     ASSERT_TRUE(iter->status().ok()) << iter->status().ToString();
     ASSERT_TRUE(iter->Valid()) << i;
     ASSERT_EQ(iter->key().ToString(), kv[i].first);
@@ -1143,8 +1147,8 @@ TEST_P(BlockBasedTableReaderMultiScanAsyncIOTest, MultiScanPrepare) {
   // Iter may still be valid after scan range. Upper layer (DBIter) handles
   // exact upper bound checking. So we don't check !iter->Valid() here.
   ASSERT_OK(iter->status());
-  iter->Seek(kv[2 * kEntriesPerBlock].first);
-  for (size_t i = 2 * kEntriesPerBlock; i < 3 * kEntriesPerBlock; ++i) {
+  iter->Seek(kv[32 * kEntriesPerBlock].first);
+  for (size_t i = 32 * kEntriesPerBlock; i < 33 * kEntriesPerBlock; ++i) {
     ASSERT_TRUE(iter->Valid());
     ASSERT_EQ(iter->key().ToString(), kv[i].first);
     iter->Next();
@@ -1159,24 +1163,24 @@ TEST_P(BlockBasedTableReaderMultiScanAsyncIOTest, MultiScanPrepare) {
       read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
       /*skip_filters=*/false, TableReaderCaller::kUncategorized));
   scan_options = MultiScanArgs(comparator_);
+  scan_options.insert(ExtractUserKey(kv[40 * kEntriesPerBlock].first),
+                      ExtractUserKey(kv[45 * kEntriesPerBlock].first));
   scan_options.insert(ExtractUserKey(kv[70 * kEntriesPerBlock].first),
                       ExtractUserKey(kv[75 * kEntriesPerBlock].first));
-  scan_options.insert(ExtractUserKey(kv[90 * kEntriesPerBlock].first),
-                      ExtractUserKey(kv[95 * kEntriesPerBlock].first));
 
   read_count_before =
       options_.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
   iter->Prepare(&scan_options);
 
-  iter->Seek(kv[70 * kEntriesPerBlock].first);
-  for (size_t i = 70 * kEntriesPerBlock; i < 75 * kEntriesPerBlock; ++i) {
+  iter->Seek(kv[40 * kEntriesPerBlock].first);
+  for (size_t i = 40 * kEntriesPerBlock; i < 45 * kEntriesPerBlock; ++i) {
     ASSERT_TRUE(iter->Valid());
     ASSERT_EQ(iter->key().ToString(), kv[i].first);
     iter->Next();
   }
   ASSERT_OK(iter->status());
-  iter->Seek(kv[90 * kEntriesPerBlock].first);
-  for (size_t i = 90 * kEntriesPerBlock; i < 95 * kEntriesPerBlock; ++i) {
+  iter->Seek(kv[70 * kEntriesPerBlock].first);
+  for (size_t i = 70 * kEntriesPerBlock; i < 75 * kEntriesPerBlock; ++i) {
     ASSERT_TRUE(iter->Valid());
     ASSERT_EQ(iter->key().ToString(), kv[i].first);
     iter->Next();
@@ -1192,13 +1196,14 @@ TEST_P(BlockBasedTableReaderMultiScanAsyncIOTest, MultiScanPrepare) {
       /*skip_filters=*/false, TableReaderCaller::kUncategorized));
 
   // 3. Tests I/O excludes blocks already in cache.
-  // Reading blocks from 50-99
-  // From reads above, blocks 70-75 and 90-95 already in cache
-  // So we should read 50-70 76-89 96-99 in three I/Os.
+  // Reading blocks from 40-79
+  // From reads above, blocks 40-44 and 70-74 already in cache
+  // So we should read 45-69, 75-79 in two I/Os.
   // If fill_cache is false, then we'll do one giant I/O.
   scan_options = MultiScanArgs(comparator_);
   scan_options.use_async_io = use_async_io;
-  scan_options.insert(ExtractUserKey(kv[50 * kEntriesPerBlock].first));
+  scan_options.insert(ExtractUserKey(kv[40 * kEntriesPerBlock].first),
+                      ExtractUserKey(kv[80 * kEntriesPerBlock].first));
   read_count_before =
       options_.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
   iter->Prepare(&scan_options);
@@ -1208,7 +1213,7 @@ TEST_P(BlockBasedTableReaderMultiScanAsyncIOTest, MultiScanPrepare) {
     if (!fill_cache) {
       ASSERT_EQ(read_count_before + 1, read_count_after);
     } else {
-      ASSERT_EQ(read_count_before + 3, read_count_after);
+      ASSERT_EQ(read_count_before + 2, read_count_after);
     }
   } else {
     // stat is recorded in async callback which happens in Poll(), and
@@ -1216,8 +1221,8 @@ TEST_P(BlockBasedTableReaderMultiScanAsyncIOTest, MultiScanPrepare) {
     ASSERT_EQ(read_count_before, read_count_after);
   }
 
-  iter->Seek(kv[50 * kEntriesPerBlock].first);
-  for (size_t i = 50 * kEntriesPerBlock; i < 100 * kEntriesPerBlock; ++i) {
+  iter->Seek(kv[40 * kEntriesPerBlock].first);
+  for (size_t i = 40 * kEntriesPerBlock; i < 80 * kEntriesPerBlock; ++i) {
     ASSERT_TRUE(iter->Valid());
     ASSERT_EQ(iter->key().ToString(), kv[i].first);
     iter->Next();
@@ -1229,7 +1234,7 @@ TEST_P(BlockBasedTableReaderMultiScanAsyncIOTest, MultiScanPrepare) {
   if (!fill_cache) {
     ASSERT_EQ(read_count_before + 1, read_count_after);
   } else {
-    ASSERT_EQ(read_count_before + 3, read_count_after);
+    ASSERT_EQ(read_count_before + 2, read_count_after);
   }
 
   // 4. Check cases when Seek key does not match start key in ScanOptions
@@ -1238,37 +1243,109 @@ TEST_P(BlockBasedTableReaderMultiScanAsyncIOTest, MultiScanPrepare) {
       /*skip_filters=*/false, TableReaderCaller::kUncategorized));
   scan_options = MultiScanArgs(comparator_);
   scan_options.use_async_io = use_async_io;
-  scan_options.insert(ExtractUserKey(kv[10 * kEntriesPerBlock].first),
-                      ExtractUserKey(kv[20 * kEntriesPerBlock].first));
   scan_options.insert(ExtractUserKey(kv[30 * kEntriesPerBlock].first),
                       ExtractUserKey(kv[40 * kEntriesPerBlock].first));
+  scan_options.insert(ExtractUserKey(kv[50 * kEntriesPerBlock].first),
+                      ExtractUserKey(kv[60 * kEntriesPerBlock].first));
   iter->Prepare(&scan_options);
   // Match start key
-  iter->Seek(kv[10 * kEntriesPerBlock].first);
-  for (size_t i = 10 * kEntriesPerBlock; i < 20 * kEntriesPerBlock; ++i) {
+  iter->Seek(kv[30 * kEntriesPerBlock].first);
+  for (size_t i = 30 * kEntriesPerBlock; i < 40 * kEntriesPerBlock; ++i) {
     ASSERT_TRUE(iter->Valid());
     ASSERT_EQ(iter->key().ToString(), kv[i].first);
     iter->Next();
   }
   ASSERT_OK(iter->status());
 
-  // Does not match start key of the second ScanOptions.
+  // Seek a key that is larger than next start key is allowed, as long as it is
+  // larger than the previous key
   iter->Seek(kv[50 * kEntriesPerBlock + 1].first);
-  ASSERT_NOK(iter->status());
+  ASSERT_OK(iter->status());
 
+  // Check seek key going backward
   iter.reset(table->NewIterator(
       read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
       /*skip_filters=*/false, TableReaderCaller::kUncategorized));
   scan_options = MultiScanArgs(comparator_);
   scan_options.use_async_io = use_async_io;
-  scan_options.insert(ExtractUserKey(kv[10 * kEntriesPerBlock].first));
-  scan_options.insert(ExtractUserKey(kv[11 * kEntriesPerBlock].first));
+  scan_options.insert(ExtractUserKey(kv[30 * kEntriesPerBlock].first),
+                      ExtractUserKey(kv[31 * kEntriesPerBlock].first));
+  scan_options.insert(ExtractUserKey(kv[32 * kEntriesPerBlock].first),
+                      ExtractUserKey(kv[33 * kEntriesPerBlock].first));
   iter->Prepare(&scan_options);
-  // Does not match the first ScanOptions.
-  iter->SeekToFirst();
-  ASSERT_NOK(iter->status());
-  iter->Seek(kv[10 * kEntriesPerBlock].first);
-  ASSERT_NOK(iter->status());
+  iter->Seek(kv[32 * kEntriesPerBlock].first);
+  ASSERT_OK(iter->status());
+  iter->Seek(kv[34 * kEntriesPerBlock].first);
+  ASSERT_OK(iter->status());
+  // Seek key could not going backward
+  iter->Seek(kv[30 * kEntriesPerBlock].first);
+  ASSERT_EQ(iter->status(),
+            Status::InvalidArgument("Unexpected seek key moving backward"));
+
+  // Test prefetch limit reached.
+  iter.reset(table->NewIterator(
+      read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
+      /*skip_filters=*/false, TableReaderCaller::kUncategorized));
+  scan_options = MultiScanArgs(comparator_);
+  scan_options.use_async_io = use_async_io;
+  scan_options.max_prefetch_size = 1024;  // less than block size
+  scan_options.insert(ExtractUserKey(kv[30 * kEntriesPerBlock].first),
+                      ExtractUserKey(kv[40 * kEntriesPerBlock].first));
+  iter->Prepare(&scan_options);
+  iter->Seek(kv[31 * kEntriesPerBlock].first);
+  ASSERT_TRUE(iter->status().IsIncomplete());
+
+  // Randomly seek keys on the file, as long as the key is moving forward, it
+  // is allowed
+
+  if (use_async_io) {
+    // Skip following test when async io is enabled. There is some issue with
+    // IO_uring that I am still trying to root cause.
+    // TODO : enable the test again with async IO
+    return;
+  }
+  for (int i = 0; i < 100; i++) {
+    iter.reset(table->NewIterator(
+        read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
+        /*skip_filters=*/false, TableReaderCaller::kUncategorized));
+    scan_options = MultiScanArgs(comparator_);
+    scan_options.use_async_io = use_async_io;
+    scan_options.insert(ExtractUserKey(kv[5 * kEntriesPerBlock].first),
+                        ExtractUserKey(kv[10 * kEntriesPerBlock].first));
+    scan_options.insert(ExtractUserKey(kv[25 * kEntriesPerBlock].first),
+                        ExtractUserKey(kv[35 * kEntriesPerBlock].first));
+    scan_options.insert(ExtractUserKey(kv[35 * kEntriesPerBlock].first),
+                        ExtractUserKey(kv[40 * kEntriesPerBlock].first));
+    scan_options.insert(ExtractUserKey(kv[45 * kEntriesPerBlock].first),
+                        ExtractUserKey(kv[50 * kEntriesPerBlock].first));
+    scan_options.insert(ExtractUserKey(kv[75 * kEntriesPerBlock].first),
+                        ExtractUserKey(kv[85 * kEntriesPerBlock].first));
+    scan_options.insert(ExtractUserKey(kv[85 * kEntriesPerBlock].first),
+                        ExtractUserKey(kv[95 * kEntriesPerBlock].first));
+
+    iter->Prepare(&scan_options);
+
+    auto random_seed = static_cast<uint32_t>(
+        std::chrono::duration_cast<std::chrono::nanoseconds>(
+            std::chrono::system_clock::now().time_since_epoch())
+            .count());
+    Random rnd(random_seed);
+    std::cout << random_seed << std::endl;
+    SCOPED_TRACE("Random seed " + std::to_string(random_seed));
+
+    int last_read_key_index = rnd.Uniform(100);
+    while (last_read_key_index < 100) {
+      iter->Seek(kv[last_read_key_index * kEntriesPerBlock].first);
+      EXPECT_OK(iter->status());
+      // iterate for a few keys
+      while (iter->Valid()) {
+        iter->Next();
+        last_read_key_index++;
+        EXPECT_OK(iter->status());
+      }
+      last_read_key_index += rnd.Uniform(100);
+    }
+  }
 }
 
 TEST_P(BlockBasedTableReaderMultiScanTest, MultiScanPrefetchSizeLimit) {
diff --git a/table/table_test.cc b/table/table_test.cc
index 8cbd0ac1677b..f207387d2500 100644
--- a/table/table_test.cc
+++ b/table/table_test.cc
@@ -7257,8 +7257,6 @@ TEST_F(ExternalTableTest, DBMultiScanTest) {
   } catch (MultiScanException& ex) {
     // Make sure exception contains the status
     ASSERT_NOK(ex.status());
-    std::cerr << "Iterator returned status " << ex.what();
-    abort();
   } catch (std::logic_error& ex) {
     std::cerr << "Iterator returned logic error " << ex.what();
     abort();
@@ -7287,8 +7285,6 @@ TEST_F(ExternalTableTest, DBMultiScanTest) {
   } catch (MultiScanException& ex) {
     // Make sure exception contains the status
     ASSERT_NOK(ex.status());
-    std::cerr << "Iterator returned status " << ex.what();
-    abort();
   } catch (std::logic_error& ex) {
     std::cerr << "Iterator returned logic error " << ex.what();
     abort();
@@ -7313,7 +7309,7 @@ TEST_F(ExternalTableTest, DBMultiScanTest) {
     }
   } catch (MultiScanException& ex) {
     // Make sure exception contains the status
-    ASSERT_EQ(ex.status(), Status::IOError());
+    ASSERT_NOK(ex.status());
   } catch (std::logic_error& ex) {
     std::cerr << "Iterator returned logic error " << ex.what();
     abort();
@@ -7431,9 +7427,7 @@ TEST_F(ExternalTableTest, IngestionTest) {
   ASSERT_OK(db->Close());
 }
 
-class UserDefinedIndexTest
-    : public BlockBasedTableTestBase,
-      public testing::WithParamInterface<const Comparator*> {
+class UserDefinedIndexTestBase : public BlockBasedTableTestBase {
  public:
   class CustomFlushBlockPolicy : public FlushBlockPolicy {
    public:
@@ -7763,13 +7757,22 @@ class UserDefinedIndexTest
     };
   };
 
-  void SetUp() override {
-    comparator_ = GetParam();
-    options_.comparator = comparator_;
-    is_reverse_comparator_ = comparator_ == ReverseBytewiseComparator();
+ protected:
+  std::vector<std::pair<std::string, std::string>> generateKVWithValue(
+      int key_count, const std::string& value) {
+    std::vector<std::pair<std::string, std::string>> kvs(key_count);
+    for (int i = 0; i < key_count; i++) {
+      std::stringstream ss;
+      ss << std::setw(2) << std::setfill('0') << i;
+      std::string key = "key" + ss.str();
+      kvs[i] = std::make_pair(key, value);
+    }
+    if (is_reverse_comparator_) {
+      std::reverse(kvs.begin(), kvs.end());
+    }
+    return kvs;
   }
 
- protected:
   std::vector<std::pair<std::string, std::string>> generateKVs(
       int key_count, int value_size = 0) {
     std::vector<std::pair<std::string, std::string>> kvs(key_count);
@@ -7862,7 +7865,17 @@ class UserDefinedIndexTest
   Random rnd{301};
 };
 
-void UserDefinedIndexTest::BasicTest(bool use_partitioned_index) {
+class UserDefinedIndexTest
+    : public UserDefinedIndexTestBase,
+      public testing::WithParamInterface<const Comparator*> {
+  void SetUp() override {
+    comparator_ = GetParam();
+    options_.comparator = comparator_;
+    is_reverse_comparator_ = comparator_ == ReverseBytewiseComparator();
+  }
+};
+
+void UserDefinedIndexTestBase::BasicTest(bool use_partitioned_index) {
   BlockBasedTableOptions table_options;
   std::string dbname = test::PerThreadDBPath("user_defined_index_test");
   std::string ingest_file = dbname + "test.sst";
@@ -7995,24 +8008,29 @@ void UserDefinedIndexTest::BasicTest(bool use_partitioned_index) {
   ASSERT_NOK(iter->status());
   user_defined_index_factory->next_error_count_ = 0;
 
-  ro.iterate_upper_bound = nullptr;
+  ro.iterate_upper_bound = &ub;
   iter.reset(reader->NewIterator(ro));
   ASSERT_NE(iter, nullptr);
   MultiScanArgs scan_opts(comparator_);
 
   std::unordered_map<std::string, std::string> property_bag;
   property_bag["count"] = std::to_string(25);
-  scan_opts.insert("key40", property_bag);
+  std::vector<std::string> boundaries = {"key10", "key50"};
+  if (is_reverse_comparator_) {
+    std::reverse(boundaries.begin(), boundaries.end());
+  }
+
+  scan_opts.insert(boundaries[0], boundaries[1], std::optional(property_bag));
   iter->Prepare(scan_opts);
-  // Test that we can read all the keys
+  // Test that UDI is used to help fetch the number of keys
   key_count = 0;
+  ub = boundaries[1];
   for (iter->Seek(scan_opts.GetScanRanges()[0].range.start.value());
        iter->Valid(); iter->Next()) {
     key_count++;
   }
-  ASSERT_GE(key_count, 25);
   // The index may undercount by 2 blocks
-  ASSERT_LE(key_count, 30);
+  ASSERT_EQ(key_count, 29);
   ASSERT_OK(iter->status());
 }
 
@@ -8158,25 +8176,6 @@ TEST_P(UserDefinedIndexTest, IngestTest) {
   }
   ASSERT_EQ(key_count, is_reverse_comparator_ ? 15 : 35);
   ASSERT_OK(iter->status());
-
-  ro.iterate_upper_bound = nullptr;
-  iter.reset(db->NewIterator(ro, cfh));
-  ASSERT_NE(iter, nullptr);
-  MultiScanArgs scan_opts(options_.comparator);
-  std::unordered_map<std::string, std::string> property_bag;
-  property_bag["count"] = std::to_string(25);
-  scan_opts.insert(Slice("key40"), std::optional(property_bag));
-  iter->Prepare(scan_opts);
-  // Test that we can read all the keys
-  key_count = 0;
-  for (iter->Seek(scan_opts.GetScanRanges()[0].range.start.value());
-       iter->Valid(); iter->Next()) {
-    key_count++;
-  }
-  ASSERT_GE(key_count, 25);
-  // The index may undercount by 2 blocks
-  ASSERT_LE(key_count, 30);
-  ASSERT_OK(iter->status());
   iter.reset();
 
   ASSERT_OK(db->DestroyColumnFamilyHandle(cfh));
@@ -8491,48 +8490,111 @@ TEST_P(UserDefinedIndexTest, MultiScanFailureTest) {
   ASSERT_EQ(iter->status(), Status::Incomplete());
   iter.reset();
 
+  // Empty range multiscan error
+  iter.reset(db->NewIterator(ro, cfh));
+  scan_options = MultiScanArgs(comparator_);
+  iter->Prepare(scan_options);
+  ASSERT_EQ(iter->status(), Status::InvalidArgument("Empty MultiScanArgs"));
+
+  // Check no seek key error
+  iter.reset(db->NewIterator(ro, cfh));
+  scan_options = MultiScanArgs(comparator_);
+  scan_options.insert(key_ranges[0], key_ranges[2], property_bag);
+  iter->Prepare(scan_options);
+  iter->SeekToFirst();
+  ASSERT_EQ(iter->status(),
+            Status::InvalidArgument("No seek key for MultiScan"));
+
+  // Seek is not allowed to seen a key that is not following the prepare order
   iter.reset(db->NewIterator(ro, cfh));
   ASSERT_NE(iter, nullptr);
   scan_options.max_prefetch_size = 0;
   iter->Prepare(scan_options);
   ub = key_ranges[3];
   iter->Seek(key_ranges[2]);
-  // Seek should fail as its not in the order specified in scan_options
-  ASSERT_EQ(iter->status(), Status::InvalidArgument());
+  ASSERT_EQ(
+      iter->status(),
+      Status::InvalidArgument(
+          "Seek target does not match the start of the next prepared range at "
+          "index 0"));
   ASSERT_FALSE(iter->Valid());
   iter.reset();
 
+  // limit is equal to start error
   iter.reset(db->NewIterator(ro, cfh));
   ASSERT_NE(iter, nullptr);
-  scan_options.max_prefetch_size = 0;
+  (*scan_options).clear();
+  scan_options.insert(key_ranges[0], key_ranges[0], property_bag);
   iter->Prepare(scan_options);
-  ub = key_ranges[1];
-  iter->Seek(key_ranges[0]);
-  ASSERT_OK(iter->status()) << iter->status().ToString();
-  ASSERT_TRUE(iter->Valid());
-  ub = key_ranges[3];
-  iter->Seek("key13");
-  // Seek should fail as its not in the order specified in scan_options
-  ASSERT_EQ(iter->status(), Status::InvalidArgument());
-  ASSERT_FALSE(iter->Valid());
+  ASSERT_EQ(iter->status(),
+            Status::InvalidArgument(
+                "Scan start key is large or equal than limit at index 0"));
   iter.reset();
 
+  // overlapping ranges error
   iter.reset(db->NewIterator(ro, cfh));
   ASSERT_NE(iter, nullptr);
   (*scan_options).clear();
-  if (is_reverse_comparator_) {
-    key_ranges[2] = "key20";
-  } else {
-    key_ranges[1] = "key20";
-  }
+  scan_options.insert(key_ranges[0], key_ranges[2], property_bag);
+  scan_options.insert(key_ranges[1], key_ranges[3], property_bag);
+  iter->Prepare(scan_options);
+  ASSERT_EQ(iter->status(),
+            Status::InvalidArgument("Overlapping ranges at index 1"));
+  iter.reset();
 
+  // Validate an error is returned if upper bound is not set to the same value
+  // as limit
+  iter.reset(db->NewIterator(ro, cfh));
+  scan_options = MultiScanArgs(comparator_);
   scan_options.insert(key_ranges[0], key_ranges[1], property_bag);
-  scan_options.insert(key_ranges[2], key_ranges[3], property_bag);
   iter->Prepare(scan_options);
-  ub = key_ranges[3];
+  ub = "";
+  iter->Seek(key_ranges[0]);
+  ASSERT_EQ(iter->status(),
+            Status::InvalidArgument(
+                "Upper bound is not set to the same limit value of the next "
+                "prepared range at index 0"));
+  ASSERT_FALSE(iter->Valid());
+
+  // Validate an error is returned when seek more keys than prepared
+  iter.reset(db->NewIterator(ro, cfh));
+  scan_options = MultiScanArgs(comparator_);
+  scan_options.insert(key_ranges[0], key_ranges[1], property_bag);
+  iter->Prepare(scan_options);
+  ub = key_ranges[1];
+  iter->Seek(key_ranges[0]);
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
   iter->Seek(key_ranges[2]);
-  // Should fail due to overlapping ranges
-  ASSERT_EQ(iter->status(), Status::InvalidArgument());
+  ASSERT_EQ(iter->status(),
+            Status::InvalidArgument(
+                "Seek called after exhausting all of the scan ranges"));
+  ASSERT_FALSE(iter->Valid());
+  iter.reset();
+
+  // Check error is returned if upper bound is not set and limit is set
+  ro.iterate_upper_bound = nullptr;
+  iter.reset(db->NewIterator(ro, cfh));
+  scan_options = MultiScanArgs(comparator_);
+  scan_options.insert(key_ranges[0], key_ranges[1], property_bag);
+  iter->Prepare(scan_options);
+  iter->Seek(key_ranges[0]);
+  ASSERT_EQ(iter->status(),
+            Status::InvalidArgument(
+                "Upper bound is not set to the same limit value of the next "
+                "prepared range at index 0"));
+  ASSERT_FALSE(iter->Valid());
+  iter.reset();
+
+  // Upper bound is allowed to be empty, if limit is not set
+  ro.iterate_upper_bound = nullptr;
+  iter.reset(db->NewIterator(ro, cfh));
+  scan_options = MultiScanArgs(comparator_);
+  scan_options.insert(key_ranges[0], property_bag);
+  iter->Prepare(scan_options);
+  iter->Seek(key_ranges[0]);
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
   iter.reset();
 
   ASSERT_OK(db->DestroyColumnFamilyHandle(cfh));
@@ -8596,23 +8658,33 @@ TEST_P(UserDefinedIndexTest, ConfigTest) {
   ASSERT_OK(s);
 
   ReadOptions ro;
+  Slice ub;
+  ro.iterate_upper_bound = &ub;
   ro.table_index_factory = user_defined_index_factory.get();
   std::unique_ptr<Iterator> iter(db->NewIterator(ro, cfh));
   ASSERT_NE(iter, nullptr);
   MultiScanArgs scan_opts(options_.comparator);
   std::unordered_map<std::string, std::string> property_bag;
   property_bag["count"] = std::to_string(25);
-  scan_opts.insert(Slice("key40"), std::optional(property_bag));
+
+  std::vector<std::string> boundaries = {"key10", "key50"};
+  if (is_reverse_comparator_) {
+    std::reverse(boundaries.begin(), boundaries.end());
+  }
+
+  scan_opts.insert(boundaries[0], boundaries[1], std::optional(property_bag));
   iter->Prepare(scan_opts);
-  // Test that we can read all the keys
+  // Test that UDI is used to help fetch the number of keys
+  ub = boundaries[1];
   int key_count = 0;
   for (iter->Seek(scan_opts.GetScanRanges()[0].range.start.value());
        iter->Valid(); iter->Next()) {
     key_count++;
   }
-  ASSERT_GE(key_count, 25);
+  // Number of blocks prepared is based on UDI, it would be slightly higher than
+  // the limit
   // The index may undercount by 2 blocks
-  ASSERT_LE(key_count, 30);
+  ASSERT_EQ(key_count, 29);
   ASSERT_OK(iter->status());
   iter.reset();
 
@@ -8691,11 +8763,6 @@ TEST_P(UserDefinedIndexTest, RangeDelete) {
                              ifo);
   ASSERT_OK(s);
 
-  ReadOptions ro;
-  std::unique_ptr<Iterator> iter(db->NewIterator(ro, cfh));
-  ASSERT_NE(iter, nullptr);
-  ASSERT_OK(iter->status());
-
   std::vector<Slice> range = {
       Slice("key10"),
       Slice("key25"),
@@ -8708,9 +8775,11 @@ TEST_P(UserDefinedIndexTest, RangeDelete) {
   }
 
   Slice ub("");
+  ReadOptions ro;
   ro.iterate_upper_bound = &ub;
-  iter.reset(db->NewIterator(ro, cfh));
+  std::unique_ptr<Iterator> iter(db->NewIterator(ro, cfh));
   ASSERT_NE(iter, nullptr);
+
   MultiScanArgs scan_opts(options_.comparator);
   std::unordered_map<std::string, std::string> property_bag;
   property_bag["count"] = std::to_string(9);
@@ -8740,10 +8809,615 @@ TEST_P(UserDefinedIndexTest, RangeDelete) {
   ASSERT_OK(DestroyDB(dbname, options_));
 }
 
+TEST_P(UserDefinedIndexTest, QueryCrossTwoFiles) {
+  BlockBasedTableOptions table_options;
+  options_.num_levels = 50;
+  options_.compaction_style = kCompactionStyleUniversal;
+  options_.disable_auto_compactions = true;
+  options_.sst_partitioner_factory = NewSstPartitionerFixedPrefixFactory(4);
+  std::string dbname = test::PerThreadDBPath("user_defined_index_test");
+  std::string ingest_file = dbname + "test.sst";
+
+  // Set up the user-defined index factory
+  auto user_defined_index_factory =
+      std::make_shared<TestUserDefinedIndexFactory>();
+  table_options.user_defined_index_factory = user_defined_index_factory;
+
+  // Set up custom flush block policy that flushes every 3 keys
+  table_options.flush_block_policy_factory =
+      std::make_shared<CustomFlushBlockPolicyFactory>();
+
+  options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  auto create_ingestion_data_file = [&](const std::string& filename,
+                                        const std::string& value) {
+    std::unique_ptr<SstFileWriter> writer;
+    writer.reset(new SstFileWriter(EnvOptions(), options_));
+    ASSERT_OK(writer->Open(filename));
+    auto kvs = generateKVWithValue(100, value);
+
+    for (const auto& kv : kvs) {
+      ASSERT_OK(writer->Put(kv.first, kv.second));
+    }
+    ASSERT_OK(writer->Finish());
+    writer.reset();
+  };
+
+  // Create first ingestion file with data
+  create_ingestion_data_file(ingest_file + "_0", "old");
+
+  std::unique_ptr<DB> db;
+  options_.create_if_missing = true;
+  Status s = DB::Open(options_, dbname, &db);
+  ASSERT_OK(s);
+  ASSERT_TRUE(db != nullptr);
+  ColumnFamilyHandle* cfh = nullptr;
+  ASSERT_OK(db->CreateColumnFamily(options_, "new_cf", &cfh));
+
+  IngestExternalFileOptions ifo;
+  // ingest data file key00~key99
+  s = db->IngestExternalFile(cfh, {ingest_file + "_0"}, ifo);
+  ASSERT_OK(s);
+
+  // Compact the file with SST partitioner, so that files are split into
+  // multiple ones
+  s = db->CompactRange(
+      {.exclusive_manual_compaction = true,
+       .bottommost_level_compaction = BottommostLevelCompaction::kForce},
+      cfh, nullptr, nullptr);
+  ASSERT_OK(s);
+
+  std::vector<Slice> range = {
+      // Each range span across 2 files
+      Slice("key16"),
+      Slice("key24"),
+      Slice("key26"),
+      Slice("key34"),
+  };
+
+  if (is_reverse_comparator_) {
+    std::reverse(range.begin(), range.end());
+  }
+
+  Slice ub("");
+  ReadOptions ro;
+  ro.iterate_upper_bound = &ub;
+  std::unique_ptr<Iterator> iter(db->NewIterator(ro, cfh));
+  ASSERT_NE(iter, nullptr);
+
+  MultiScanArgs scan_opts(options_.comparator);
+  std::unordered_map<std::string, std::string> property_bag;
+  auto read_key_per_range_limit = 2;
+  property_bag["count"] = std::to_string(read_key_per_range_limit);
+
+  for (size_t i = 0; i < range.size() / 2; i++) {
+    scan_opts.insert(range[i * 2], range[i * 2 + 1],
+                     std::optional(property_bag));
+  }
+  iter->Prepare(scan_opts);
+
+  for (size_t i = 0; i < range.size() / 2; i++) {
+    // Update upper bound before each seek
+    ub = range[2 * i + 1];
+    auto key_count = 0;
+    for (iter->Seek(range[i * 2]); iter->Valid(); iter->Next()) {
+      key_count++;
+      ASSERT_EQ(iter->value(), "old");
+      if (key_count >= read_key_per_range_limit) {
+        break;
+      }
+    }
+    ASSERT_OK(iter->status());
+    ASSERT_EQ(key_count, read_key_per_range_limit);
+  }
+
+  // Create another ingestion file with range delete only that covers the first
+  // file to delete all of its keys.
+  {
+    std::unique_ptr<SstFileWriter> writer;
+    writer.reset(new SstFileWriter(EnvOptions(), options_));
+    ASSERT_OK(writer->Open(ingest_file + "_1"));
+    if (is_reverse_comparator_) {
+      ASSERT_OK(writer->DeleteRange("keyz", "key"));
+    } else {
+      ASSERT_OK(writer->DeleteRange("key", "keyz"));
+    }
+    ASSERT_OK(writer->Finish());
+    writer.reset();
+  }
+  s = db->IngestExternalFile(cfh, {ingest_file + "_1"}, ifo);
+  ASSERT_OK(s);
+
+  // ingest new data
+  create_ingestion_data_file(ingest_file + "_2", "new");
+  s = db->IngestExternalFile(cfh, {ingest_file + "_2"}, ifo);
+  ASSERT_OK(s);
+
+  iter.reset(db->NewIterator(ro, cfh));
+  ASSERT_NE(iter, nullptr);
+  ASSERT_OK(iter->status());
+
+  iter->Prepare(scan_opts);
+
+  for (size_t i = 0; i < range.size() / 2; i++) {
+    // Update upper bound before each seek
+    ub = range[2 * i + 1];
+    auto key_count = 0;
+    for (iter->Seek(range[i * 2]); iter->Valid(); iter->Next()) {
+      key_count++;
+      ASSERT_EQ(iter->value(), "new");
+      if (key_count >= read_key_per_range_limit) {
+        break;
+      }
+    }
+    ASSERT_OK(iter->status());
+    ASSERT_EQ(key_count, read_key_per_range_limit);
+  }
+
+  iter.reset();
+
+  ASSERT_OK(db->DestroyColumnFamilyHandle(cfh));
+  ASSERT_OK(db->Close());
+  ASSERT_OK(DestroyDB(dbname, options_));
+}
+
 INSTANTIATE_TEST_CASE_P(UserDefinedIndexTest, UserDefinedIndexTest,
                         ::testing::Values(BytewiseComparator(),
                                           ReverseBytewiseComparator()));
 
+struct UserDefinedIndexStressTestParam {
+  const Comparator* comparator;
+  bool enable_udi;
+  bool enable_compaction_with_sst_partitioner;
+
+  using UserDefinedIndexStressTestTuple =
+      std::tuple<const Comparator*, bool, bool>;
+
+  UserDefinedIndexStressTestParam(const UserDefinedIndexStressTestTuple& tuple)
+      : comparator(std::get<0>(tuple)),
+        enable_udi(std::get<1>(tuple)),
+        enable_compaction_with_sst_partitioner(std::get<2>(tuple)) {}
+};
+
+std::ostream& operator<<(std::ostream& os,
+                         const UserDefinedIndexStressTestParam& param) {
+  return os << "UserDefinedIndexStressTestParam{comparator="
+            << (param.comparator ? param.comparator->Name() : "nullptr")
+            << ", enable_udi=" << param.enable_udi
+            << ", enable_compaction_with_sst_partitioner="
+            << param.enable_compaction_with_sst_partitioner << "}";
+}
+
+constexpr auto kVerbose = false;
+
+struct DataRange {
+  size_t start;  // inclusive
+  size_t end;    // exclusive
+  std::string value;
+  bool is_range_delete;
+  bool skipped;
+  size_t scan_key_count_limit;
+  std::string start_key;
+  std::string end_key;
+
+  // print the range in human readable format
+  std::string ToString() const {
+    std::ostringstream oss;
+    oss << "[" << start << ", " << end << "), value: " << value
+        << ", is_range_delete: " << is_range_delete << ", skipped: " << skipped
+        << ", scan_key_count_limit: " << scan_key_count_limit
+        << ", start_key: " << start_key << ", end_key: " << end_key;
+    return oss.str();
+  }
+};
+class UserDefinedIndexStressTest
+    : public UserDefinedIndexTestBase,
+      public testing::WithParamInterface<
+          UserDefinedIndexStressTestParam::UserDefinedIndexStressTestTuple> {
+ public:
+  void SetUp() override {
+    rand_seed_ = static_cast<uint32_t>(
+        std::chrono::duration_cast<std::chrono::nanoseconds>(
+            std::chrono::system_clock::now().time_since_epoch())
+            .count());
+
+    std::cout << "Random seed: " << rand_seed_ << std::endl;
+
+    rnd = Random(rand_seed_);
+    UserDefinedIndexStressTestParam param = GetParam();
+    comparator_ = param.comparator;
+    enable_udi_ = param.enable_udi;
+    enable_compaction_with_sst_partitioner_ =
+        param.enable_compaction_with_sst_partitioner;
+    options_.comparator = comparator_;
+    is_reverse_comparator_ = comparator_ == ReverseBytewiseComparator();
+    options_.compaction_style = kCompactionStyleUniversal;
+
+    // Set up custom flush block policy that flushes every 3 keys
+    table_options_.flush_block_policy_factory =
+        std::make_shared<CustomFlushBlockPolicyFactory>();
+
+    options_.table_factory.reset(NewBlockBasedTableFactory(table_options_));
+  }
+
+  void TearDown() override {
+    ASSERT_OK(db_->DestroyColumnFamilyHandle(ingest_cfh_));
+    ASSERT_OK(db_->DestroyColumnFamilyHandle(regular_cfh_));
+
+    ASSERT_OK(db_->Close());
+    ASSERT_OK(DestroyDB(dbname_, options_));
+  }
+
+ protected:
+  static constexpr auto kKeyRange = 100;
+  bool enable_udi_{};
+  bool enable_compaction_with_sst_partitioner_{};
+  uint32_t rand_seed_{};
+  std::shared_ptr<UserDefinedIndexFactory> user_defined_index_factory_;
+  BlockBasedTableOptions table_options_;
+  const Comparator* comparator_{};
+  bool is_reverse_comparator_{};
+  Random rnd{0};
+  ColumnFamilyHandle* ingest_cfh_ = nullptr;
+  ColumnFamilyHandle* regular_cfh_ = nullptr;
+  std::unique_ptr<DB> db_;
+  std::vector<std::vector<DataRange>> ranges_in_levels_;
+  std::string dbname_;
+
+  void SetupDB(const std::string& dbname) {
+    options_.create_if_missing = true;
+    options_.disable_auto_compactions = true;
+    Status s = DB::Open(options_, dbname, &db_);
+    ASSERT_OK(s);
+    ASSERT_TRUE(db_ != nullptr);
+    if (enable_compaction_with_sst_partitioner_) {
+      // Use a SST partitioner to create multiple files, use the first 4 bytes
+      // of key to partition the file, The key is formatted with 2 digit
+      // following "key" string, e.g. key01, key99
+      options_.sst_partitioner_factory = NewSstPartitionerFixedPrefixFactory(4);
+    }
+
+    ASSERT_OK(db_->CreateColumnFamily(options_, "regular_cf", &regular_cfh_));
+
+    if (enable_udi_) {
+      // Set up the user-defined index factory
+      user_defined_index_factory_ =
+          std::make_shared<TestUserDefinedIndexFactory>();
+      table_options_.user_defined_index_factory = user_defined_index_factory_;
+    }
+
+    options_.table_factory.reset(NewBlockBasedTableFactory(table_options_));
+    ASSERT_OK(db_->CreateColumnFamily(options_, "ingest_cf", &ingest_cfh_));
+  }
+
+  template <typename T>
+  std::string FormatKey(T i) {
+    std::stringstream ss;
+    ss << std::setw(2) << std::setfill('0') << i;
+    return "key" + ss.str();
+  }
+
+  std::vector<DataRange> GenerateKeyRanges(size_t range_count,
+                                           int skip_range_count,
+                                           const std::string& value) {
+    std::set<size_t> boundaries;
+    // generate n + 1 number of unique boundaries to form n contiguoes ranges
+    while (boundaries.size() < range_count + 1) {
+      boundaries.insert(rnd.Uniform(kKeyRange));
+    }
+    std::vector<size_t> sorted_boundaries(boundaries.begin(), boundaries.end());
+    if (is_reverse_comparator_) {
+      std::reverse(sorted_boundaries.begin(), sorted_boundaries.end());
+    }
+    auto ranges = std::vector<DataRange>();
+    std::optional<size_t> prev_bound;
+    for (auto it = sorted_boundaries.begin(); it != sorted_boundaries.end();
+         it++) {
+      if (prev_bound.has_value()) {
+        ranges.push_back({.start = prev_bound.value(),
+                          .end = *it,
+                          .value = value,
+                          .is_range_delete = rnd.OneIn(6),
+                          .skipped = false,
+                          .scan_key_count_limit = rnd.Uniform(10) + 1,
+                          .start_key = FormatKey(prev_bound.value()),
+                          .end_key = FormatKey(*it)});
+      }
+      prev_bound = *it;
+    }
+    // skipped some of them
+    for (int j = 0; j < skip_range_count; j++) {
+      ranges[rnd.Uniform(static_cast<uint32_t>(range_count))].skipped = true;
+    }
+
+    if (kVerbose) {
+      for (auto const& range : ranges) {
+        std::cout << range.ToString() << std::endl;
+      }
+    }
+
+    return ranges;
+  }
+
+  void CreateSstFileWithRanges(const std::string& ingest_file,
+                               const DataRange& range) {
+    std::unique_ptr<SstFileWriter> writer =
+        std::make_unique<SstFileWriter>(EnvOptions(), options_);
+    ASSERT_OK(writer->Open(ingest_file));
+
+    assert(range.start != range.end);
+
+    if (range.is_range_delete) {
+      ASSERT_OK(writer->DeleteRange(range.start_key, range.end_key));
+    } else {
+      for (size_t i = range.start; i != range.end;) {
+        auto key = FormatKey(i);
+        range.start < range.end ? i++ : i--;
+        ASSERT_OK(writer->Put(key, range.value));
+      }
+    }
+    ASSERT_OK(writer->Finish()) << range.ToString();
+  }
+
+  void RangeScan(std::unique_ptr<Iterator>& iter,
+                 const std::vector<DataRange>& ranges, Slice& upper_bound,
+                 std::vector<std::pair<std::string, std::string>>& result,
+                 bool use_multi_scan) {
+    ASSERT_NE(iter, nullptr);
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(!ranges.empty());
+
+    MultiScanArgs scan_opts(options_.comparator);
+    std::unordered_map<std::string, std::string> property_bag;
+    if (use_multi_scan) {
+      for (auto const& range : ranges) {
+        if (range.skipped) {
+          continue;
+        }
+        property_bag["count"] = std::to_string(range.scan_key_count_limit);
+        scan_opts.insert(range.start_key, range.end_key, property_bag);
+        // print range start end key
+        if (kVerbose) {
+          std::cout << "range start " << range.start_key << " end "
+                    << range.end_key << std::endl;
+        }
+      }
+      iter->Prepare(scan_opts);
+      ASSERT_OK(iter->status());
+    }
+
+    for (auto const& range : ranges) {
+      if (range.skipped) {
+        continue;
+      }
+      size_t scan_key_count = 0;
+      if (kVerbose) {
+        std::cout << "seek key " << range.start_key << std::endl;
+      }
+      upper_bound = range.end_key;
+      for (iter->Seek(range.start_key);
+           iter->Valid() && scan_key_count < range.scan_key_count_limit;
+           iter->Next()) {
+        if (kVerbose) {
+          std::cout << "key " << iter->key().ToString() << " value "
+                    << iter->value().ToString() << std::endl;
+        }
+        result.emplace_back(iter->key().ToString(), iter->value().ToString());
+        scan_key_count++;
+      }
+      ASSERT_OK(iter->status());
+    }
+  }
+
+  void AddDataToRegularCF() {
+    for (auto const& ranges_in_level : ranges_in_levels_) {
+      for (auto const& range : ranges_in_level) {
+        if (!range.skipped) {
+          for (auto i = range.start; i != range.end;
+               range.start < range.end ? i++ : i--) {
+            if (range.is_range_delete) {
+              ASSERT_OK(
+                  db_->Delete(WriteOptions(), regular_cfh_, FormatKey(i)));
+            } else {
+              ASSERT_OK(db_->Put(WriteOptions(), regular_cfh_, FormatKey(i),
+                                 range.value));
+            }
+          }
+        }
+      }
+    }
+    ASSERT_OK(db_->Flush(FlushOptions(), regular_cfh_));
+  }
+
+  void ValidateQueryResult() {
+    // Query both CF with same range scan and validate result are same
+    for (auto i = 0; i < 200; i++) {
+      if (kVerbose) {
+        std::cout << "iteration " << i << std::endl;
+      }
+      // randomly generate 1 to 3 ranges
+      auto ranges = GenerateKeyRanges(rnd.Uniform(3) + 4, 2, "");
+
+      // Query regular CF
+      std::vector<std::pair<std::string, std::string>> expected_result;
+      Slice upper_bound("");
+      ReadOptions ro;
+      ro.iterate_upper_bound = &upper_bound;
+
+      std::unique_ptr<Iterator> iter(db_->NewIterator(ro, regular_cfh_));
+      ASSERT_NO_FATAL_FAILURE(
+          RangeScan(iter, ranges, upper_bound, expected_result, false));
+      ASSERT_OK(iter->status());
+
+      // Query ingest CF
+      iter.reset(db_->NewIterator(ro, ingest_cfh_));
+      std::vector<std::pair<std::string, std::string>> ingest_cf_result;
+      ASSERT_NO_FATAL_FAILURE(
+          RangeScan(iter, ranges, upper_bound, ingest_cf_result, false));
+
+      ASSERT_EQ(expected_result, ingest_cf_result);
+      ASSERT_OK(iter->status());
+
+      // Query ingest CF with UDI if it is enabled
+      if (enable_udi_) {
+        ro.table_index_factory = user_defined_index_factory_.get();
+      }
+
+      iter.reset(db_->NewIterator(ro, ingest_cfh_));
+      std::vector<std::pair<std::string, std::string>>
+          ingest_cf_multi_scan_result;
+      ASSERT_NO_FATAL_FAILURE(RangeScan(iter, ranges, upper_bound,
+                                        ingest_cf_multi_scan_result, true));
+      ASSERT_EQ(expected_result, ingest_cf_multi_scan_result);
+      ASSERT_OK(iter->status());
+    }
+  }
+
+  void IngestFilesInOneLevel(const std::vector<DataRange>& ranges_in_level,
+                             const std::string& ingest_file_name_prefix,
+                             size_t& ingest_file_count,
+                             const IngestExternalFileOptions& ifo) {
+    std::vector<std::string> ingest_files;
+    // Generate SST file and bulk load them one level at a time
+    for (auto const& range : ranges_in_level) {
+      if (!range.skipped) {
+        ASSERT_NO_FATAL_FAILURE(CreateSstFileWithRanges(
+            ingest_file_name_prefix + std::to_string(ingest_file_count),
+            range));
+        ingest_files.push_back(ingest_file_name_prefix +
+                               std::to_string(ingest_file_count));
+        ingest_file_count++;
+      }
+    }
+
+    ASSERT_OK(db_->IngestExternalFile(ingest_cfh_, ingest_files, ifo));
+  }
+
+  void IngestDataToCF() {
+    IngestExternalFileOptions ifo;
+    ifo.snapshot_consistency = false;
+    auto ingest_file_name_prefix = dbname_ + "ingest_file_";
+    size_t ingest_file_count = 0;
+    for (auto const& ranges_in_level : ranges_in_levels_) {
+      ASSERT_NO_FATAL_FAILURE(IngestFilesInOneLevel(
+          ranges_in_level, ingest_file_name_prefix, ingest_file_count, ifo));
+    }
+
+    ASSERT_GE(ingest_file_count, 0);
+  }
+
+  void CompactIngestedCF() {
+    auto s = db_->CompactRange(
+        {.exclusive_manual_compaction = true,
+         .bottommost_level_compaction = BottommostLevelCompaction::kForce},
+        ingest_cfh_, nullptr, nullptr);
+    ASSERT_OK(s);
+  }
+};
+
+// TODO(xingbo)
+// This test is disabled due to following test case condition:
+// level n:   delete range 4-6
+// level n+1: data range 0-------10
+// query: 3-9, count=2.
+// Becuase query count == 2, level n+1 would only prepare 3-5. but since 4-6
+// got deleted in the upper level, they are not returned, so only 3 is
+// returned. Meantime the query should have return [3, 6]
+// One way to fix this is by preparing more data blocks once prepared blocks are
+// exhausted, but upper bound is not reached yet.
+// This requires following changes:
+// 1. Fix out of bound flag in block table iterator. Only set it if the key is
+// larger than the upper bound.
+// 2. Refactor the prepared block single dimension vector into 2 dimension of
+// vectors, so that more blocks could be prepared if needed.
+TEST_P(UserDefinedIndexStressTest, DISABLED_PartialDeleteRange) {
+  // Create 2 column families. One use normal put/del, the other uses sst
+  // ingest Randomly generate multiple non overlapping range for multiple
+  // levels Range scan same range between the 2 CF and validate the result is
+  // same
+  SCOPED_TRACE("Start with random seed: " + std::to_string(rand_seed_));
+  dbname_ =
+      test::PerThreadDBPath("UserDefinedIndexStressTest_PartialDeleteRange");
+  SCOPED_TRACE("dbname: " + dbname_);
+  ASSERT_NO_FATAL_FAILURE(SetupDB(dbname_));
+
+  for (int i = 0; i < 5; i++) {
+    ranges_in_levels_.push_back(
+        GenerateKeyRanges(rnd.Uniform(3) + 4, 2,
+                          "L" + std::to_string(options_.num_levels - 1 - i)));
+  }
+
+  ASSERT_NO_FATAL_FAILURE(IngestDataToCF());
+
+  if (enable_compaction_with_sst_partitioner_) {
+    ASSERT_NO_FATAL_FAILURE(CompactIngestedCF());
+  }
+
+  ASSERT_NO_FATAL_FAILURE(AddDataToRegularCF());
+
+  ASSERT_NO_FATAL_FAILURE(ValidateQueryResult());
+}
+
+TEST_P(UserDefinedIndexStressTest, DeleteRange) {
+  // Create 2 column families. One use normal put/del, the other uses sst
+  // ingest.
+  // Test the case where there are 3 levels, the middle level is a delete range
+  // file that span across the entire key space.
+  // Range scan same range between the 2 CF and validate the result is same
+  SCOPED_TRACE("Start with random seed: " + std::to_string(rand_seed_));
+  dbname_ = test::PerThreadDBPath("UserDefinedIndexStressTest_DeleteRange");
+  SCOPED_TRACE("dbname: " + dbname_);
+  ASSERT_NO_FATAL_FAILURE(SetupDB(dbname_));
+
+  // Test 3 levels.
+  // bottom level is normal data files.
+  ranges_in_levels_.push_back(GenerateKeyRanges(rnd.Uniform(3) + 4, 2, "L6"));
+  // middle level delete range between each level
+  if (is_reverse_comparator_) {
+    ranges_in_levels_.push_back({{.start = 100,
+                                  .end = 0,
+                                  .is_range_delete = true,
+                                  .skipped = false,
+                                  .start_key = "keyz",
+                                  .end_key = "key"}});
+  } else {
+    ranges_in_levels_.push_back({{.start = 0,
+                                  .end = 100,
+                                  .is_range_delete = true,
+                                  .skipped = false,
+                                  .start_key = "key",
+                                  .end_key = "keyz"}});
+  }
+  // Top level is normal data files
+  ranges_in_levels_.push_back(GenerateKeyRanges(rnd.Uniform(3) + 4, 2, "L4"));
+
+  IngestExternalFileOptions ifo;
+  ifo.snapshot_consistency = false;
+  auto ingest_file_name_prefix = dbname_ + "ingest_file_";
+  size_t ingest_file_count = 0;
+  auto first_level = true;
+  for (auto const& ranges_in_level : ranges_in_levels_) {
+    ASSERT_NO_FATAL_FAILURE(IngestFilesInOneLevel(
+        ranges_in_level, ingest_file_name_prefix, ingest_file_count, ifo));
+    if (first_level) {
+      first_level = false;
+      if (enable_compaction_with_sst_partitioner_) {
+        // When compaction is enabled, do a compaction at the first level
+        ASSERT_NO_FATAL_FAILURE(CompactIngestedCF());
+      }
+    }
+  }
+
+  ASSERT_NO_FATAL_FAILURE(AddDataToRegularCF());
+
+  ASSERT_NO_FATAL_FAILURE(ValidateQueryResult());
+}
+
+INSTANTIATE_TEST_CASE_P(
+    UserDefinedIndexStressTest, UserDefinedIndexStressTest,
+    testing::Combine(testing::Values(BytewiseComparator(),
+                                     ReverseBytewiseComparator()),
+                     testing::Bool(), testing::Bool()));
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/unreleased_history/public_api_changes/multi_scan_api_contract.md b/unreleased_history/public_api_changes/multi_scan_api_contract.md
new file mode 100644
index 000000000000..f988794c6973
--- /dev/null
+++ b/unreleased_history/public_api_changes/multi_scan_api_contract.md
@@ -0,0 +1 @@
+The MultiScan API contract is updated. After a multi scan range got prepared with Prepare API call, the following seeks must seek the start of each prepared scan range in order. In addition, when limit is set, upper bound must be set to the same value of limit before each seek

From 112ff5bb703787186b01d496bc5b32e9477bddbb Mon Sep 17 00:00:00 2001
From: anand76 <anand1976@users.noreply.github.com>
Date: Tue, 14 Oct 2025 14:22:29 -0700
Subject: [PATCH 335/500] Allow empty MultiScan result in
 BlockBasedTableIterator Prepare (#14046)

Summary:
Currently in BlockBasedTableIterator's Prepare(), the index lookup for a MultiScan range is expected to return atleast 1 data block (unless UDI is in use). This is because there's an implicit assumption that only ranges intersecting with the keys in the file will be prepared. This assumption, however, doesn't hold if there are range deletions and the smallest and/or largest keys in the file extend beyond the keys in the file. The LevelIterator prunes the MultiScan ranges based on the smallest/largest key, so its possible for a range to only overlap the range deletion portion of the file and not overlap any of the data blocks. Furthermore, the BlockBasedTableIterator is now much more forgiving of Seek to targets outside of prepared ranges after https://github.com/facebook/rocksdb/issues/14040 .

Keeping the above in mind, this PR removes the check in BlockBasedTableIterator for non-empty index result. It adds assertions in LevelIterator to verify that ranges are being properly pruned. Another side effect is we can no longer rely solely on a scan range having 0 data blocks (i.e cur_scan_start_idx >= cur_scan_end_idx) to decide if the iterator is out of bound. We can only do so for all but the last range prepared range.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14046

Test Plan:
1. Add unit test in db_iterator_test
2. Run crash test

Reviewed By: xingbowang

Differential Revision: D84623871

Pulled By: anand1976

fbshipit-source-id: 2418e629f92b1c46c555ddea3761140f700819e4
---
 db/db_iterator_test.cc                        | 134 ++++++++++++++++++
 db/version_set.cc                             |  20 ++-
 include/rocksdb/options.h                     |  11 --
 .../block_based/block_based_table_iterator.cc |  33 ++---
 .../block_based/block_based_table_iterator.h  |   2 +-
 .../block_based_table_reader_test.cc          |  47 ------
 6 files changed, 168 insertions(+), 79 deletions(-)

diff --git a/db/db_iterator_test.cc b/db/db_iterator_test.cc
index 92108d37eec7..97f73026f1dc 100644
--- a/db/db_iterator_test.cc
+++ b/db/db_iterator_test.cc
@@ -4532,6 +4532,140 @@ TEST_P(DBMultiScanIteratorTest, RangeBetweenFiles) {
   iter.reset();
 }
 
+// This test case tests multiscan in the presence of fragmented range
+// tombstones in the LSM.
+TEST_P(DBMultiScanIteratorTest, FragmentedRangeTombstones) {
+  auto options = CurrentOptions();
+  // Compaction may create files 2x the target_file_size_base,
+  // so set this to 50KB so we atleast end up with 2 files of
+  // 100KB
+  options.target_file_size_base = 50 << 10;  // 50KB
+  options.compaction_style = kCompactionStyleUniversal;
+  options.num_levels = 50;
+  options.compression = kNoCompression;
+  DestroyAndReopen(options);
+
+  // Setup the LSM as follows -
+  // 1. Ingest a file with 100 keys
+  // 2. Ingest a file with one overlapping key
+  // 3. Do a Put and flush a file to L0 with one overlapping key
+  // 4. Ingest a standalone delete range file that covers the full key space
+  //    and a file with the same 100 keys with new values. This will ingest
+  //    into L0 due to the presence of an existing file in L0
+  // The final LSM will have an SST in Lmax with 100 keys, and 2 SST files
+  // in Lmax-1 with half the keys each and completely overlapping delete ranges
+  std::unordered_map<std::string, std::string> kvs;
+  auto rnd = Random::GetTLSInstance();
+  auto create_ingestion_data_file_and_update_key_value =
+      [&](const std::string& filename, int start_key, int end_key) {
+        std::unique_ptr<SstFileWriter> writer;
+        writer.reset(new SstFileWriter(EnvOptions(), options));
+        ASSERT_OK(writer->Open(filename));
+        for (int i = start_key; i < end_key; ++i) {
+          auto kiter = kvs.find(Key(i));
+          if (kiter != kvs.end()) {
+            kvs.erase(kiter);
+          }
+          auto res =
+              kvs.emplace(std::make_pair(Key(i), rnd->RandomString(2 << 10)));
+          ASSERT_OK(writer->Put(res.first->first, res.first->second));
+        }
+        ASSERT_OK(writer->Finish());
+        writer.reset();
+      };
+
+  CreateColumnFamilies({"new_cf"}, options);
+  std::string ingest_file = dbname_ + "test.sst";
+  // Write ~200KB data
+  create_ingestion_data_file_and_update_key_value(ingest_file + "_0", 0, 100);
+  create_ingestion_data_file_and_update_key_value(ingest_file + "_1", 50, 51);
+  ColumnFamilyHandle* cfh = handles_[0];
+  IngestExternalFileOptions ifo;
+  Status s = dbfull()->IngestExternalFile(
+      cfh, {ingest_file + "_0", ingest_file + "_1"}, ifo);
+  ASSERT_OK(s);
+
+  ASSERT_OK(Put(0, Key(50), rnd->RandomString(2 << 10)));
+  ASSERT_OK(Flush());
+
+  {
+    std::unique_ptr<SstFileWriter> writer;
+    writer.reset(new SstFileWriter(EnvOptions(), options));
+    ASSERT_OK(writer->Open(ingest_file + "_2"));
+    ASSERT_OK(writer->DeleteRange("a", "z"));
+    ASSERT_OK(writer->Finish());
+    writer.reset();
+  }
+  create_ingestion_data_file_and_update_key_value(ingest_file + "_3", 0, 100);
+  s = dbfull()->IngestExternalFile(
+      cfh, {ingest_file + "_2", ingest_file + "_3"}, ifo);
+  ASSERT_OK(s);
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // The first scan range overlaps the DB key range, while the second extends
+  // beyond but overlaps the delete range
+  std::vector<std::string> key_ranges({"key000085", "key000090", "l", "n"});
+  ReadOptions ro;
+  ro.fill_cache = GetParam();
+  MultiScanArgs scan_options(BytewiseComparator());
+  scan_options.insert(key_ranges[0], key_ranges[1]);
+  scan_options.insert(key_ranges[2], key_ranges[3]);
+  std::unique_ptr<MultiScan> iter =
+      dbfull()->NewMultiScan(ro, cfh, scan_options);
+  try {
+    int i = 0;
+    int count = 0;
+    for (auto range : *iter) {
+      for (auto it : range) {
+        ASSERT_GE(it.first.ToString(), key_ranges[i]);
+        ASSERT_LT(it.first.ToString(), key_ranges[i + 1]);
+        auto kiter = kvs.find(it.first.ToString());
+        ASSERT_NE(kiter, kvs.end());
+        ASSERT_EQ(kiter->second, it.second.ToString());
+        count++;
+      }
+      i += 2;
+    }
+    ASSERT_EQ(i, 4);
+    ASSERT_EQ(count, 5);
+  } catch (MultiScanException& ex) {
+    ASSERT_OK(ex.status());
+  }
+  iter.reset();
+
+  // The second scan range start overlaps the delete range in the first file
+  // in Lmax-1, while the end overlaps the keys in the second file
+  (*scan_options).clear();
+  key_ranges[0] = "key000010";
+  key_ranges[1] = "key000020";
+  key_ranges[2] = "key0000500";
+  key_ranges[3] = "key000060";
+  scan_options.insert(key_ranges[0], key_ranges[1]);
+  scan_options.insert(key_ranges[2], key_ranges[3]);
+  iter = dbfull()->NewMultiScan(ro, cfh, scan_options);
+  try {
+    int i = 0;
+    int count = 0;
+    for (auto range : *iter) {
+      for (auto it : range) {
+        ASSERT_GE(it.first.ToString(), key_ranges[i]);
+        ASSERT_LT(it.first.ToString(), key_ranges[i + 1]);
+        auto kiter = kvs.find(it.first.ToString());
+        ASSERT_NE(kiter, kvs.end());
+        ASSERT_EQ(kiter->second, it.second.ToString());
+        count++;
+      }
+      i += 2;
+    }
+    ASSERT_EQ(i, 4);
+    ASSERT_EQ(count, 19);
+  } catch (MultiScanException& ex) {
+    ASSERT_OK(ex.status());
+  }
+  iter.reset();
+}
+
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/db/version_set.cc b/db/version_set.cc
index 1dd22c071825..90c8e1a8bc8e 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -1200,7 +1200,6 @@ class LevelIterator final : public InternalIterator {
     // Propagate multiscan configs
     for (auto& file_to_arg : *file_to_scan_opts_) {
       file_to_arg.second.CopyConfigFrom(*so);
-      file_to_arg.second.SetRequireFileOverlap(true);
     }
   }
 
@@ -1276,6 +1275,10 @@ class LevelIterator final : public InternalIterator {
     }
   }
 
+#ifndef NDEBUG
+  bool OverlapRange(const ScanOptions& opts);
+#endif
+
   TableCache* table_cache_;
   const ReadOptions& read_options_;
   const FileOptions& file_options_;
@@ -1658,6 +1661,19 @@ void LevelIterator::SkipEmptyFileBackward() {
   }
 }
 
+#ifndef NDEBUG
+bool LevelIterator::OverlapRange(const ScanOptions& opts) {
+  return (user_comparator_.CompareWithoutTimestamp(
+              opts.range.start.value(), /*a_has_ts=*/false,
+              ExtractUserKey(flevel_->files[file_index_].largest_key),
+              /*b_has_ts=*/true) <= 0 &&
+          user_comparator_.CompareWithoutTimestamp(
+              opts.range.limit.value(), /*a_has_ts=*/false,
+              ExtractUserKey(flevel_->files[file_index_].smallest_key),
+              /*b_has_ts=*/true) > 0);
+}
+#endif
+
 void LevelIterator::SetFileIterator(InternalIterator* iter) {
   if (pinned_iters_mgr_ && iter) {
     iter->SetPinnedItersMgr(pinned_iters_mgr_);
@@ -1667,6 +1683,8 @@ void LevelIterator::SetFileIterator(InternalIterator* iter) {
   if (iter && scan_opts_) {
     if (FileHasMultiScanArg(file_index_)) {
       const MultiScanArgs& new_opts = GetMultiScanArgForFile(file_index_);
+      assert(OverlapRange(*new_opts.GetScanRanges().begin()) &&
+             OverlapRange(*new_opts.GetScanRanges().rbegin()));
       file_iter_.Prepare(&new_opts);
     }
   }
diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index e8cc0d43fb94..ebd3f4f727e5 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -1857,12 +1857,6 @@ class MultiScanArgs {
 
   const Comparator* GetComparator() const { return comp_; }
 
-  void SetRequireFileOverlap(bool require_overlap) {
-    require_file_overlap_ = require_overlap;
-  }
-
-  bool RequireFileOverlap() const { return require_file_overlap_; }
-
   // Copies the configurations (excluding actual scan ranges) from another
   // MultiScanArgs.
   void CopyConfigFrom(const MultiScanArgs& other) {
@@ -1894,11 +1888,6 @@ class MultiScanArgs {
   // The comparator used for ordering ranges
   const Comparator* comp_;
   std::vector<ScanOptions> original_ranges_;
-
-  // Internal use only.
-  // Fail the Prepare() on a file if a scan range does not overlap
-  // with the file.
-  bool require_file_overlap_{false};
 };
 
 // Options that control read operations
diff --git a/table/block_based/block_based_table_iterator.cc b/table/block_based/block_based_table_iterator.cc
index 9398b4f1772b..e3d16ba4337f 100644
--- a/table/block_based/block_based_table_iterator.cc
+++ b/table/block_based/block_based_table_iterator.cc
@@ -970,13 +970,12 @@ BlockBasedTableIterator::MultiScanState::~MultiScanState() {
 // scan opt. If we reach the end of the last scan opt, UpperBoundCheckResult()
 // will return kUnknown instead of kOutOfBound. This mechanism requires that
 // scan opts are properly pruned such that there is no scan opt that is after
-// this file's key range. This check can be enforeced by setting
-// MultiScanArgs::require_file_overlap to true.
+// this file's key range.
 // FIXME: DBIter and MergingIterator may
 // internally do Seek() on child iterators, e.g. due to
 // ReadOptions::max_skippable_internal_keys or reseeking into range deletion
-// end key. So these Seeks can cause iterator to fall back to normal
-// (non-prepared) iterator and ignore the optimizations done in Prepare().
+// end key. These Seeks will be handled properly, as long as the target is
+// moving forward.
 void BlockBasedTableIterator::Prepare(const MultiScanArgs* multiscan_opts) {
   assert(!multi_scan_);
   if (!index_iter_->status().ok()) {
@@ -995,9 +994,9 @@ void BlockBasedTableIterator::Prepare(const MultiScanArgs* multiscan_opts) {
   std::vector<std::string> data_block_separators;
   std::vector<std::tuple<size_t, size_t>> block_index_ranges_per_scan;
   const std::vector<ScanOptions>& scan_opts = multiscan_opts->GetScanRanges();
-  multi_scan_status_ = CollectBlockHandles(
-      scan_opts, multiscan_opts->RequireFileOverlap(), &scan_block_handles,
-      &block_index_ranges_per_scan, &data_block_separators);
+  multi_scan_status_ =
+      CollectBlockHandles(scan_opts, &scan_block_handles,
+                          &block_index_ranges_per_scan, &data_block_separators);
   if (!multi_scan_status_.ok()) {
     return;
   }
@@ -1168,7 +1167,13 @@ bool BlockBasedTableIterator::SeekMultiScanImpl(const Slice* seek_target) {
     // We should have the data block already loaded
     ++multi_scan_->next_scan_idx;
     if (cur_scan_start_idx >= cur_scan_end_idx) {
-      return out_of_bound;
+      if (multi_scan_->next_scan_idx <
+          multi_scan_->block_index_ranges_per_scan.size()) {
+        return out_of_bound;
+      } else {
+        ResetDataIter();
+        return false;
+      }
     } else {
       is_out_of_bound_ = false;
     }
@@ -1418,7 +1423,7 @@ Status BlockBasedTableIterator::CreateAndPinBlockFromBuffer(
 constexpr auto kVerbose = false;
 
 Status BlockBasedTableIterator::CollectBlockHandles(
-    const std::vector<ScanOptions>& scan_opts, bool require_file_overlap,
+    const std::vector<ScanOptions>& scan_opts,
     std::vector<BlockHandle>* scan_block_handles,
     std::vector<std::tuple<size_t, size_t>>* block_index_ranges_per_scan,
     std::vector<std::string>* data_block_separators) {
@@ -1481,16 +1486,6 @@ Status BlockBasedTableIterator::CollectBlockHandles(
         data_block_separators->push_back(index_iter_->user_key().ToString());
       }
       ++num_blocks;
-    } else if (num_blocks == 0 && index_iter_->UpperBoundCheckResult() !=
-                                      IterBoundCheck::kOutOfBound) {
-      // If require_file_overlap is set, then the scan ranges for this file
-      // must intersect with the file. Otherwise, allow empty intersection.
-      if (require_file_overlap) {
-        // This is important for FindBlockForwardInMultiScan() which only
-        // lets the upper layer (LevelIterator) advance to the next SST file
-        // when the last scan range is exhausted.
-        return Status::InvalidArgument("Scan does not intersect with file");
-      }
     }
     block_index_ranges_per_scan->emplace_back(
         scan_block_handles->size() - num_blocks, scan_block_handles->size());
diff --git a/table/block_based/block_based_table_iterator.h b/table/block_based/block_based_table_iterator.h
index 16f4f74f2bc0..299c54f74b40 100644
--- a/table/block_based/block_based_table_iterator.h
+++ b/table/block_based/block_based_table_iterator.h
@@ -698,7 +698,7 @@ class BlockBasedTableIterator : public InternalIteratorBase<Slice> {
                                      CachableEntry<Block>& pinned_block_entry);
 
   Status CollectBlockHandles(
-      const std::vector<ScanOptions>& scan_opts, bool require_file_overlap,
+      const std::vector<ScanOptions>& scan_opts,
       std::vector<BlockHandle>* scan_block_handles,
       std::vector<std::tuple<size_t, size_t>>* block_index_ranges_per_scan,
       std::vector<std::string>* data_block_boundary_keys);
diff --git a/table/block_based/block_based_table_reader_test.cc b/table/block_based/block_based_table_reader_test.cc
index 99f3b0164bd7..2785de86db82 100644
--- a/table/block_based/block_based_table_reader_test.cc
+++ b/table/block_based/block_based_table_reader_test.cc
@@ -1623,53 +1623,6 @@ TEST_P(BlockBasedTableReaderMultiScanTest, MultiScanUnpinPreviousBlocks) {
   }
 }
 
-TEST_P(BlockBasedTableReaderMultiScanTest, MultiScanOptFileOverlapChecking) {
-  std::vector<std::pair<std::string, std::string>> kv =
-      BlockBasedTableReaderBaseTest::GenerateKVMap(
-          20 /* num_block */, true /* mixed_with_human_readable_string_value */,
-          comparator_->timestamp_size(), same_key_diff_ts_, comparator_);
-  std::vector<std::pair<std::string, std::string>> actual_kv(
-      kv.begin(), kv.begin() + 15 * kEntriesPerBlock);
-
-  std::string table_name =
-      "BlockBasedTableReaderMultiScanTest_UnpinPreviousBlocks" +
-      CompressionTypeToString(compression_type_);
-  ImmutableOptions ioptions(options_);
-  CreateTable(table_name, ioptions, compression_type_, actual_kv,
-              compression_parallel_threads_, compression_dict_bytes_);
-
-  std::unique_ptr<BlockBasedTable> table;
-  FileOptions foptions;
-  foptions.use_direct_reads = use_direct_reads_;
-  InternalKeyComparator comparator(options_.comparator);
-  NewBlockBasedTableReader(foptions, ioptions, comparator, table_name, &table,
-                           true /* bool prefetch_index_and_filter_in_cache */,
-                           nullptr /* status */, persist_udt_);
-
-  ReadOptions read_opts;
-  std::unique_ptr<InternalIterator> iter;
-  iter.reset(table->NewIterator(
-      read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
-      /*skip_filters=*/false, TableReaderCaller::kUncategorized));
-
-  MultiScanArgs scan_options(BytewiseComparator());
-  scan_options.SetRequireFileOverlap(false);
-  scan_options.insert(ExtractUserKey(kv[5 * kEntriesPerBlock].first),
-                      ExtractUserKey(kv[6 * kEntriesPerBlock].first));
-  scan_options.insert(ExtractUserKey(kv[16 * kEntriesPerBlock].first),
-                      ExtractUserKey(kv[17 * kEntriesPerBlock].first));
-
-  iter->Prepare(&scan_options);
-  ASSERT_OK(iter->status());
-
-  iter.reset(table->NewIterator(
-      read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr,
-      /*skip_filters=*/false, TableReaderCaller::kUncategorized));
-  scan_options.SetRequireFileOverlap(true);
-  iter->Prepare(&scan_options);
-  ASSERT_TRUE(iter->status().IsInvalidArgument());
-}
-
 std::vector<BlockBasedTableReaderTestParam> GenerateCombinedParameters(
     const std::vector<CompressionType>& compression_types,
     const std::vector<bool>& use_direct_read_flags,

From f7e4009de1d16421a254dd7e799dd91c522d832c Mon Sep 17 00:00:00 2001
From: Hui Xiao <huixiao@fb.com>
Date: Wed, 15 Oct 2025 13:43:53 -0700
Subject: [PATCH 336/500] Integrate compaction resumption with
 DB::OpenAndCompact() (#13984)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Summary:
### Context/Summary:
This is stacked on top of https://github.com/facebook/rocksdb/pull/13983 and integrate compaction resumption with OpenAndCompact().

Flow of resuming: DB::OpenAndCompact() -> Compaction progress file  -> SubcompactionProgress -> CompactionJob
Flow of persistence: CompactionJob -> SubcompactionProgress -> Compaction progress file  -> DB that is called with OpenAndCompact()

This PR focuses on DB::OpenAndCompact() -> Compaction progress file  -> SubcompactionProgress  and Compaction progress file -> DB that is called with OpenAndCompact()

**Resume Flow**
1. Check configuration. Right now paranoid_file_check=true (by default false) is not yet compatible with allow_resumption=true. Also only single subcompaction is supported as OpenAndCompact() does not partition compaction anyway
2. Scan compaction output files for latest, old and temporary compaction progress file and output files. If latest compaction progress file exists, we should resume.
3. Clean up older or temporary progress files if any. They can exist if the last OpenAndCompact() crashed during resume flow
4. If any, parse the latest progress file into CompactionProgress and clean up extra compaction output files that are not yet tracked. These compaction output files can exist as tracking every output file is just best-effort and interrupted output files in the middle is not tracked as progress yet.
5. If allow_resumption=false or no valid compaction progress is found or parsed, clean up the latest progress file and existing compaction output files to start fresh compaction. If the clean up itself fails, fail the OpenAndCompact() call to prevent resuming with inconsistency between output files and progress file.

 **Progress File Creation**
1. Create temporary progress file
2. Persist the progress from latest compaction progress file to the temporary progress file. This is to simplify resuming from an interrupted compaction that was just resumed. Similar to how manifest recovery works.
3. Rename the temporary progress file to the newer compaction progress so it atomically becomes the "new" latest progress file
4. Delete the "old" latest progress file since it's useless now.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13984

Test Plan:
- Integrated unit tests to simulate OpenAndCompact gets canceled and optionally resumed for remote compaction
- Existing UTs and stress/crash test
- Manual stress test with https://github.com/facebook/rocksdb/pull/14041

### Performance testing:
**1. Latency**
Using
```
./db_bench --benchmarks=OpenAndCompact[X5] --openandcompact_test_cancel_on_odd=false --openandcompact_cancel_after_seconds=0 --openandcompact_allow_resumption=$openandcompact_allow_resumption  --use_existing_db=true --db=$db --disable_auto_compactions=true --compression_type=none --secondary_path=$secondary_path  --target_file_size_base=268435456
```
**allow_resumption = false**
Input files: 101 files, 10000 keys
OpenAndCompact() API call : 26766256.000 micros/op 0 ops/sec 26.766 seconds 1 operations;
OpenAndCompact status: OK
Output: 92 files, average size: 271747380 bytes (259.16 MB)
OpenAndCompact : 27837249.000 micros/op 0 ops/sec 27.837 seconds 1 operations;

Input files: 101 files, 10000 keys
OpenAndCompact() API call : 26546234.000 micros/op 0 ops/sec 26.546 seconds 1 operations;
OpenAndCompact status: OK
Output: 92 files, average size: 271747380 bytes (259.16 MB)
OpenAndCompact : 27918621.000 micros/op 0 ops/sec 27.919 seconds 1 operations;
OpenAndCompact [AVG 2 runs] : 0 (± 0) ops/sec

Input files: 101 files, 10000 keys
OpenAndCompact() API call : 42243571.000 micros/op 0 ops/sec 42.244 seconds 1 operations;
OpenAndCompact status: OK
Output: 92 files, average size: 271747380 bytes (259.16 MB)
OpenAndCompact : 43497581.000 micros/op 0 ops/sec 43.498 seconds 1 operations;
OpenAndCompact [AVG 3 runs] : 0 (± 0) ops/sec

Input files: 101 files, 10000 keys
OpenAndCompact() API call : 34241357.000 micros/op 0 ops/sec 34.241 seconds 1 operations;
OpenAndCompact status: OK
Output: 92 files, average size: 271747380 bytes (259.16 MB)
OpenAndCompact : 35655346.000 micros/op 0 ops/sec 35.655 seconds 1 operations;
OpenAndCompact [AVG 4 runs] : 0 (± 0) ops/sec

Input files: 101 files, 10000 keys
OpenAndCompact() API call : 27083361.000 micros/op 0 ops/sec 27.083 seconds 1 operations;
OpenAndCompact status: OK
Output: 92 files, average size: 271747380 bytes (259.16 MB)
OpenAndCompact : 28487999.000 micros/op 0 ops/sec 28.488 seconds 1 operations;
OpenAndCompact [AVG 5 runs] : 0 (± 0) ops/sec

OpenAndCompact [AVG    5 runs] : 0 (± 0) ops/sec; 31669.681 ms/op
OpenAndCompact [MEDIAN 5 runs] : 0 ops/sec

**allow_resumption= true**
Input files: 101 files, 10000 keys
OpenAndCompact() API call : 25446470.000 micros/op 0 ops/sec 25.446 seconds 1 operations;
OpenAndCompact status: OK
Output: 92 files, average size: 271747380 bytes (259.16 MB)
OpenAndCompact : 26833415.000 micros/op 0 ops/sec 26.833 seconds 1 operations;

Input files: 101 files, 10000 keys
OpenAndCompact() API call : 240745.000 micros/op 0 ops/sec 0.241 seconds 1 operations;
OpenAndCompact status: OK
Output: 92 files, average size: 271747380 bytes (259.16 MB)
OpenAndCompact :  244934.000 micros/op 4 ops/sec 0.245 seconds 1 operations;
OpenAndCompact [AVG 2 runs] : 2 (± 3) ops/sec

Input files: 101 files, 10000 keys
OpenAndCompact() API call : 24843383.000 micros/op 0 ops/sec 24.843 seconds 1 operations;
OpenAndCompact status: OK
Output: 92 files, average size: 271747380 bytes (259.16 MB)
OpenAndCompact : 26192235.000 micros/op 0 ops/sec 26.192 seconds 1 operations;
OpenAndCompact [AVG 3 runs] : 1 (± 2) ops/sec

Input files: 101 files, 10000 keys
OpenAndCompact() API call : 270819.000 micros/op 0 ops/sec 0.271 seconds 1 operations;
OpenAndCompact status: OK
Output: 92 files, average size: 271747380 bytes (259.16 MB)
OpenAndCompact :  275140.000 micros/op 3 ops/sec 0.275 seconds 1 operations;
OpenAndCompact [AVG 4 runs] : 1 (± 2) ops/sec

Input files: 101 files, 10000 keys
OpenAndCompact() API call : 23038311.000 micros/op 0 ops/sec 23.038 seconds 1 operations;
OpenAndCompact status: OK
Output: 92 files, average size: 271747380 bytes (259.16 MB)
OpenAndCompact : 24439097.000 micros/op 0 ops/sec 24.439 seconds 1 operations;
OpenAndCompact [AVG 5 runs] : 1 (± 1) ops/sec

OpenAndCompact [AVG    5 runs] : 1 (± 1) ops/sec; 638.417 ms/op
OpenAndCompact [MEDIAN 5 runs] : 0 ops/sec

**Persistence cost:** If we compare the odd number of OpenAndCompact() API, it's actually faster.
**Resumption saving:** (0.2 - 26.766 ) / 26.766 * 100 = 99.25% improvement when all the compaction progress is redone without the allow_resumption feature

**2.  Memory usage** (in case SubcompactionProgress storing its own memory copies of output filemetadata in https://github.com/facebook/rocksdb/pull/13983/files is a trouble)
 ```
// ~= 90 output files
/usr/bin/time -f "
Resource Summary:
  Wall time: %e seconds
  CPU time: %U user + %S system (%P total)
  Peak memory: %M KB
  Page faults: %F major + %R minor
" ./db_bench --benchmarks=OpenAndCompact[X1] --openandcompact_test_cancel_on_odd=false --openandcompact_cancel_after_seconds=0 --openandcompact_allow_resumption=$openandcompact_allow_resumption  --use_existing_db=true --db=$db --disable_auto_compactions=true --compression_type=none --secondary_path=$secondary_path  --target_file_size_base=268435456
```
**allow_resumption = false**
Peak memory: 275828 KB
**allow_resumption = true**
Peak memory: 277204 KB (regress 0.49% memory usage, most likely due to storing own copies of output files' file metadata in subcompaction progress)

### Near-term follow up:
- Add statistics to record the successfully resumed compaction output files bytes
- Add stress/crash test support to cover error paths (including progress file sync error), crash/cancel OpenAndCompact() at random compaction progress point and surface feature incompatibility
   - See https://github.com/facebook/rocksdb/pull/14041
- Resolve the TODO https://github.com/facebook/rocksdb/pull/13984/files#diff-17fbdec07244b1f07d1a4e5aed0a6feecf4474d20b3129818c10fc0ff9f3d547R1303-R1314
   - See https://github.com/facebook/rocksdb/pull/14042

Reviewed By: jaykorean

Differential Revision: D84299662

Pulled By: hx235

fbshipit-source-id: 69bbf395401604172a1a5c557ca834011a3d51d7
---
 db/compaction/compaction_iterator.h           |  10 +
 db/compaction/compaction_job.cc               |  39 +-
 db/compaction/compaction_job.h                |   2 +-
 db/compaction/compaction_job_test.cc          | 248 ++++---
 db/compaction/compaction_service_test.cc      | 381 ++++++++++-
 db/db_impl/db_impl_secondary.cc               | 633 +++++++++++++++++-
 db/db_impl/db_impl_secondary.h                |  80 +++
 db/merge_helper.h                             |   2 +-
 db/version_edit.h                             |  10 +-
 include/rocksdb/db.h                          |  24 +-
 include/rocksdb/options.h                     |  37 +
 .../new_features/resume_compaction.md         |   1 +
 12 files changed, 1321 insertions(+), 146 deletions(-)
 create mode 100644 unreleased_history/new_features/resume_compaction.md

diff --git a/db/compaction/compaction_iterator.h b/db/compaction/compaction_iterator.h
index 6117d23f9e18..a851e35f93d5 100644
--- a/db/compaction/compaction_iterator.h
+++ b/db/compaction/compaction_iterator.h
@@ -270,6 +270,16 @@ class CompactionIterator {
   // true, unless `must_count_input_entries=true` was specified during iterator
   // creation (which ensures the count is always accurate).
   uint64_t NumInputEntryScanned() const { return input_.NumItered(); }
+
+  // Returns true if the current valid key was already scanned/counted during
+  // a lookahead operation in a previous iteration.
+  //
+  // REQUIRED: Valid() must be true
+  bool IsCurrentKeyAlreadyScanned() const {
+    assert(Valid());
+    return at_next_ || merge_out_iter_.Valid();
+  }
+
   Status InputStatus() const { return input_.status(); }
 
   bool IsDeleteRangeSentinelKey() const { return is_range_del_; }
diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc
index 80fc92b98c4b..06d608fb4f09 100644
--- a/db/compaction/compaction_job.cc
+++ b/db/compaction/compaction_job.cc
@@ -1427,7 +1427,8 @@ std::unique_ptr<CompactionIterator> CompactionJob::CreateCompactionIterator(
       env_, ShouldReportDetailedTime(env_, stats_), sub_compact->RangeDelAgg(),
       blob_resources.blob_file_builder.get(), db_options_.allow_data_in_errors,
       db_options_.enforce_single_del_contracts, manual_compaction_canceled_,
-      sub_compact->compaction->DoesInputReferenceBlobFiles(),
+      sub_compact->compaction
+          ->DoesInputReferenceBlobFiles() /* must_count_input_entries */,
       sub_compact->compaction, compaction_filter, shutting_down_,
       db_options_.info_log, full_history_ts_low, preserve_seqno_after_);
 }
@@ -2016,7 +2017,7 @@ Status CompactionJob::FinishCompactionOutputFile(
     }
   }
 
-  if (s.ok() && ShouldUpdateSubcompactionProgress(sub_compact,
+  if (s.ok() && ShouldUpdateSubcompactionProgress(sub_compact, c_iter,
                                                   prev_table_last_internal_key,
                                                   next_table_min_key, meta)) {
     UpdateSubcompactionProgress(c_iter, next_table_min_key, sub_compact);
@@ -2027,7 +2028,7 @@ Status CompactionJob::FinishCompactionOutputFile(
 }
 
 bool CompactionJob::ShouldUpdateSubcompactionProgress(
-    const SubcompactionState* sub_compact,
+    const SubcompactionState* sub_compact, const CompactionIterator* c_iter,
     const ParsedInternalKey& prev_table_last_internal_key,
     const Slice& next_table_min_internal_key, const FileMetaData* meta) const {
   const auto* cfd = sub_compact->compaction->column_family_data();
@@ -2086,6 +2087,21 @@ bool CompactionJob::ShouldUpdateSubcompactionProgress(
     return false;
   }
 
+  // LIMITATION: Don't save progress if the current key has already been scanned
+  // (looked ahead) in the input but not yet output. This can happen with merge
+  // operations, single deletes, and deletes at the bottommost level where
+  // CompactionIterator needs to look ahead to process multiple entries for the
+  // same user key before outputting a result. If we saved progress and resumed
+  // at this boundary, the resumed session would see and process the same input
+  // key again through Seek(), leading to incorrect double-counting in
+  // number of processed input entries and input count verification failure
+  //
+  // TODO(hx235): Offset num_processed_input_records to avoid double counting
+  // instead of disabling progress persistence.
+  if (c_iter->IsCurrentKeyAlreadyScanned()) {
+    return false;
+  }
+
   return true;
 }
 
@@ -2770,8 +2786,9 @@ Status CompactionJob::MaybeResumeSubcompactionProgressOnInputIterator(
     return Status::NotFound("No subcompaction progress to resume");
   }
 
-  ROCKS_LOG_INFO(db_options_.info_log, "[%s] [JOB %d] Resuming compaction",
-                 cfd->GetName().c_str(), job_id_);
+  ROCKS_LOG_INFO(db_options_.info_log, "[%s] [JOB %d] Resuming compaction : %s",
+                 cfd->GetName().c_str(), job_id_,
+                 subcompaction_progress.ToString().c_str());
 
   input_iter->Seek(subcompaction_progress.next_internal_key_to_compact);
 
@@ -2865,8 +2882,18 @@ void CompactionJob::UpdateSubcompactionProgress(
   subcompaction_progress.next_internal_key_to_compact =
       next_ikey_to_compact.GetInternalKey().ToString();
 
+  // Track total processed input records for progress reporting by combining:
+  // - Resumed count: records already processed before compaction was
+  // interrupted
+  // - Current count: records scanned in the current compaction session
+  // Only update when both tracking mechanisms provide accurate counts to ensure
+  // reliability.
   subcompaction_progress.num_processed_input_records =
-      c_iter->HasNumInputEntryScanned() ? c_iter->NumInputEntryScanned() : 0;
+      c_iter->HasNumInputEntryScanned() &&
+              sub_compact->compaction_job_stats.has_accurate_num_input_records
+          ? c_iter->NumInputEntryScanned() +
+                sub_compact->compaction_job_stats.num_input_records
+          : 0;
 
   UpdateSubcompactionProgressPerLevel(
       sub_compact, false /* is_proximal_level */, subcompaction_progress);
diff --git a/db/compaction/compaction_job.h b/db/compaction/compaction_job.h
index d2e3e4c5d3c3..bff25f465f4d 100644
--- a/db/compaction/compaction_job.h
+++ b/db/compaction/compaction_job.h
@@ -543,7 +543,7 @@ class CompactionJob {
       CompactionOutputs* outputs_to_restore);
 
   bool ShouldUpdateSubcompactionProgress(
-      const SubcompactionState* sub_compact,
+      const SubcompactionState* sub_compact, const CompactionIterator* c_iter,
       const ParsedInternalKey& prev_table_last_internal_key,
       const Slice& next_table_min_internal_key, const FileMetaData* meta) const;
 
diff --git a/db/compaction/compaction_job_test.cc b/db/compaction/compaction_job_test.cc
index 6fb071f6d58a..4c5f889de847 100644
--- a/db/compaction/compaction_job_test.cc
+++ b/db/compaction/compaction_job_test.cc
@@ -2410,15 +2410,16 @@ TEST_F(CompactionJobIOPriorityTest, GetRateLimiterPriority) {
                 Env::IO_LOW, Env::IO_LOW);
 }
 
-class ResumeCompactionJobTest : public CompactionJobTestBase {
+class ResumableCompactionJobTest : public CompactionJobTestBase {
  public:
-  ResumeCompactionJobTest()
+  ResumableCompactionJobTest()
       : CompactionJobTestBase(
-            test::PerThreadDBPath("resume_compaction_job_test"),
+            test::PerThreadDBPath("allow_resumption_job_test"),
             BytewiseComparator(), [](uint64_t /*ts*/) { return ""; },
             /*test_io_priority=*/false, TableTypeForTest::kBlockBasedTable) {}
 
  protected:
+  static constexpr const char* kCancelBeforeThisKey = "cancel_before_this_key";
   std::string progress_dir_ = "";
   bool enable_cancel_ = false;
   std::atomic<int> stop_count_{0};
@@ -2431,8 +2432,15 @@ class ResumeCompactionJobTest : public CompactionJobTestBase {
         [this](void* p) {
           auto* pair = static_cast<std::pair<bool*, const Slice>*>(p);
           *(pair->first) = true;
-          if (enable_cancel_ && stop_count_.fetch_add(1) == 3) {
-            cancel_.store(true);
+
+          // Cancel after outputting a specific key
+          if (enable_cancel_) {
+            ParsedInternalKey parsed_key;
+            if (ParseInternalKey(pair->second, &parsed_key, true).ok()) {
+              if (parsed_key.user_key == kCancelBeforeThisKey) {
+                cancel_.store(true);
+              }
+            }
           }
         });
     SyncPoint::GetInstance()->EnableProcessing();
@@ -2649,79 +2657,77 @@ class ResumeCompactionJobTest : public CompactionJobTestBase {
                 ordered_intput_keys[i]);
     }
   }
-};
-
-TEST_F(ResumeCompactionJobTest, BasicProgressPersistence) {
-  NewDB();
-
-  auto file1 = mock::MakeMockFile({
-      {KeyStr("a", 1U, kTypeValue), "val1"},
-      {KeyStr("b", 2U, kTypeValue), "val2"},
-  });
-  AddMockFile(file1);
-
-  auto file2 = mock::MakeMockFile({
-      {KeyStr("c", 3U, kTypeValue), "val3"},
-      {KeyStr("d", 4U, kTypeValue), "val4"},
-  });
-  AddMockFile(file2);
-
-  SetLastSequence(4U);
-
-  std::string compaction_progress_file =
-      CompactionProgressFileName(progress_dir_, 123);
-
-  std::unique_ptr<log::Writer> compaction_progress_writer =
-      CreateCompactionProgressWriter(compaction_progress_file);
 
-  Status status = RunCompactionWithProgressTracking(
-      CompactionProgress(), compaction_progress_writer.get());
+  void RunCancelAndResumeTest(
+      const std::initializer_list<mock::KVPair>& input_file_1,
+      const std::initializer_list<mock::KVPair>& input_file_2,
+      uint64_t last_sequence, const std::vector<uint64_t>& snapshots,
+      const std::string& expected_next_key_to_compact,
+      const std::vector<std::string>& expected_input_keys, bool exists_progress,
+      bool cancelled_past_mid_point = false) {
+    std::shared_ptr<Statistics> stats = ROCKSDB_NAMESPACE::CreateDBStatistics();
 
-  ASSERT_OK(status);
-
-  VerifyCompactionProgressPersisted(
-      compaction_progress_file, "d" /* next_user_key_to_compact */,
-      {"a", "b", "c", "d"} /* ordered_intput_keys */);
-}
-
-TEST_F(ResumeCompactionJobTest, CondtionallySkipProgressPersistence) {
-  for (auto type : {kTypeValue, kTypeRangeDeletion}) {
-    NewDB();
-
-    auto file1 = mock::MakeMockFile({
-        {KeyStr("a", 1U, kTypeValue), "val1"},
-    });
+    auto file1 = mock::MakeMockFile(input_file_1);
     AddMockFile(file1);
-
-    auto file2 =
-        (type == kTypeValue ? mock::MakeMockFile({
-                                  {KeyStr("a", 2U, kTypeValue), "val2"},
-                              }) /* same user keys spanning the file boundary */
-                            : mock::MakeMockFile({
-                                  {KeyStr("b", 2U, kTypeRangeDeletion), "val2"},
-                              })); /* deletion range in the file boundary */
+    auto file2 = mock::MakeMockFile(input_file_2);
     AddMockFile(file2);
-    SetLastSequence(2U);
+    SetLastSequence(last_sequence);
 
+    // First compaction (will be cancelled)
     std::string compaction_progress_file =
         CompactionProgressFileName(progress_dir_, 123);
     std::unique_ptr<log::Writer> compaction_progress_writer =
         CreateCompactionProgressWriter(compaction_progress_file);
 
+    ASSERT_OK(stats->Reset());
+    EnableCompactionCancel();
+
     Status status = RunCompactionWithProgressTracking(
-        CompactionProgress{}, compaction_progress_writer.get(),
-        {1U} /* snapshots */);
+        CompactionProgress{}, compaction_progress_writer.get(), snapshots,
+        stats);
 
-    ASSERT_OK(status);
+    ASSERT_TRUE(status.IsManualCompactionPaused());
+    DisableCompactionCancel();
+
+    HistogramData cancelled_compaction_stats;
+    stats->histogramData(FILE_WRITE_COMPACTION_MICROS,
+                         &cancelled_compaction_stats);
 
     VerifyCompactionProgressPersisted(compaction_progress_file,
-                                      "" /* next_user_key_to_compact */,
-                                      {"a", "b"} /* ordered_intput_keys */);
+                                      expected_next_key_to_compact,
+                                      expected_input_keys);
+
+    // Resume compaction
+    CompactionProgress compaction_progress;
+    if (exists_progress) {
+      compaction_progress.push_back(
+          ReadAndParseProgress(compaction_progress_file));
+    }
+
+    std::string compaction_progress_file_2 =
+        CompactionProgressFileName(progress_dir_, 234);
+    std::unique_ptr<log::Writer> compaction_progress_writer_2 =
+        CreateCompactionProgressWriter(compaction_progress_file_2);
+
+    ASSERT_OK(stats->Reset());
+
+    status = RunCompactionWithProgressTracking(
+        compaction_progress, compaction_progress_writer_2.get(),
+        {} /* snapshots */, stats);
+
+    ASSERT_OK(status);
+
+    if (cancelled_past_mid_point) {
+      HistogramData resumed_compaction_stats;
+      stats->histogramData(FILE_WRITE_COMPACTION_MICROS,
+                           &resumed_compaction_stats);
+      ASSERT_GT(cancelled_compaction_stats.count,
+                resumed_compaction_stats.count);
+    }
   }
-}
+};
 
-TEST_F(ResumeCompactionJobTest, BasicProgressResume) {
-  std::shared_ptr<Statistics> stats = ROCKSDB_NAMESPACE::CreateDBStatistics();
+TEST_F(ResumableCompactionJobTest, BasicProgressPersistence) {
   NewDB();
 
   auto file1 = mock::MakeMockFile({
@@ -2735,52 +2741,116 @@ TEST_F(ResumeCompactionJobTest, BasicProgressResume) {
       {KeyStr("d", 4U, kTypeValue), "val4"},
   });
   AddMockFile(file2);
+
   SetLastSequence(4U);
 
   std::string compaction_progress_file =
       CompactionProgressFileName(progress_dir_, 123);
+
   std::unique_ptr<log::Writer> compaction_progress_writer =
       CreateCompactionProgressWriter(compaction_progress_file);
 
-  ASSERT_OK(stats->Reset());
-
-  EnableCompactionCancel();
-
   Status status = RunCompactionWithProgressTracking(
-      CompactionProgress{}, compaction_progress_writer.get(), {} /* snapshots*/,
-      stats);
-
-  ASSERT_TRUE(status.IsManualCompactionPaused());
-
-  DisableCompactionCancel();
+      CompactionProgress(), compaction_progress_writer.get());
 
-  HistogramData cancelled_compaction_stats;
-  stats->histogramData(FILE_WRITE_COMPACTION_MICROS,
-                       &cancelled_compaction_stats);
+  ASSERT_OK(status);
 
   VerifyCompactionProgressPersisted(
       compaction_progress_file, "d" /* next_user_key_to_compact */,
       {"a", "b", "c", "d"} /* ordered_intput_keys */);
+}
 
-  CompactionProgress compaction_progress;
-  compaction_progress.push_back(ReadAndParseProgress(compaction_progress_file));
+TEST_F(ResumableCompactionJobTest, BasicProgressResume) {
+  NewDB();
 
-  std::string compaction_progress_file_2 =
-      CompactionProgressFileName(progress_dir_, 234);
-  std::unique_ptr<log::Writer> compaction_progress_writer_2 =
-      CreateCompactionProgressWriter(compaction_progress_file_2);
+  RunCancelAndResumeTest(
+      {{KeyStr("a", 1U, kTypeValue), "val1"},
+       {KeyStr("b", 2U, kTypeValue), "val2"}} /* input_file_1 */,
+      {{KeyStr("bb", 3U, kTypeValue), "val3"},
+       {KeyStr(kCancelBeforeThisKey, 4U, kTypeValue),
+        "val4"}} /* input_file_2 */,
+      4U /* last_sequence */, {} /* snapshots */,
+      kCancelBeforeThisKey /* expected_next_key_to_compact */,
+      {"a", "b", "bb", kCancelBeforeThisKey} /* expected_input_keys */,
+      true /* exists_progress */, true /* cancelled_past_mid_point*/);
+}
 
-  ASSERT_OK(stats->Reset());
+TEST_F(ResumableCompactionJobTest, NoProgressResumeOnSameKey) {
+  NewDB();
+
+  RunCancelAndResumeTest(
+      {{KeyStr(kCancelBeforeThisKey, 1U, kTypeValue),
+        "val1"}} /* input_file_1 */,
+      {{KeyStr(kCancelBeforeThisKey, 2U, kTypeValue),
+        "val2"}} /* input_file_2 */,
+      2U /* last_sequence */, {1U} /* snapshots */,
+      "" /* expected_next_key_to_compact */,
+      {kCancelBeforeThisKey, kCancelBeforeThisKey} /* expected_input_keys */,
+      false /* exists_progress */);
+}
 
-  status = RunCompactionWithProgressTracking(compaction_progress,
-                                             compaction_progress_writer_2.get(),
-                                             {} /* snapshots */, stats);
+TEST_F(ResumableCompactionJobTest, NoProgressResumeOnDeleteRange) {
+  NewDB();
 
-  HistogramData resumed_compaction_stats;
-  stats->histogramData(FILE_WRITE_COMPACTION_MICROS, &resumed_compaction_stats);
+  RunCancelAndResumeTest(
+      {{KeyStr(kCancelBeforeThisKey, 1U, kTypeValue),
+        "val1"}} /* input_file_1 */,
+      {{KeyStr(kCancelBeforeThisKey, 2U, kTypeRangeDeletion),
+        "val2"}} /* input_file_2 */,
+      2U /* last_sequence */, {1U} /* snapshots */,
+      "" /* expected_next_key_to_compact */,
+      {kCancelBeforeThisKey, kCancelBeforeThisKey} /* expected_input_keys */,
+      false /* exists_progress */);
+}
 
-  ASSERT_OK(status);
-  ASSERT_LT(resumed_compaction_stats.count, cancelled_compaction_stats.count);
+TEST_F(ResumableCompactionJobTest, NoProgressResumeOnMerge) {
+  merge_op_ = MergeOperators::CreateStringAppendOperator();
+  NewDB();
+
+  RunCancelAndResumeTest(
+      {{KeyStr("a", 1U, kTypeValue), "val1"},
+       {KeyStr("b", 2U, kTypeValue), "val2"}} /* input_file_1 */,
+      {{KeyStr("bb", 3U, kTypeValue), "val3"},
+       {KeyStr(kCancelBeforeThisKey, 4U, kTypeMerge),
+        "val4"}} /* input_file_2 */,
+      4U /* last_sequence */, {} /* snapshots */,
+      "bb" /* expected_next_key_to_compact */,
+      {"a", "b", "bb", kCancelBeforeThisKey} /* expected_input_keys */,
+      true /* exists_progress */);
+}
+
+TEST_F(ResumableCompactionJobTest, NoProgressResumeOnSingleDelete) {
+  NewDB();
+
+  RunCancelAndResumeTest(
+      {{KeyStr("a", 1U, kTypeValue), "val1"},
+       {KeyStr("b", 2U, kTypeValue), "val2"},
+       {KeyStr(kCancelBeforeThisKey, 3U, kTypeValue),
+        "val3"}} /* input_file_1 */,
+      {{KeyStr(kCancelBeforeThisKey, 4U, kTypeSingleDeletion), ""},
+       {KeyStr("d", 5U, kTypeValue), "val4"}} /* input_file_2 */,
+      5U /* last_sequence */, {3U} /* snapshots */,
+      "b" /* expected_next_key_to_compact */,
+      {"a", "b", kCancelBeforeThisKey, kCancelBeforeThisKey,
+       "d"} /* expected_input_keys */,
+      true /* exists_progress */);
+}
+
+TEST_F(ResumableCompactionJobTest, NoProgressResumeOnDeletionAtBottom) {
+  NewDB();
+
+  RunCancelAndResumeTest(
+      {{KeyStr("a", 1U, kTypeValue), "val1"},
+       {KeyStr("b", 2U, kTypeValue), "val2"},
+       {KeyStr(kCancelBeforeThisKey, 3U, kTypeValue),
+        "val3"}} /* input_file_1 */,
+      {{KeyStr(kCancelBeforeThisKey, 4U, kTypeDeletion), ""},
+       {KeyStr("d", 5U, kTypeValue), "val4"}} /* input_file_2 */,
+      5U /* last_sequence */, {3U} /* snapshots */,
+      "b" /* expected_next_key_to_compact */,
+      {"a", "b", kCancelBeforeThisKey, kCancelBeforeThisKey,
+       "d"} /* expected_input_keys */,
+      true /* exists_progress */);
 }
 }  // namespace ROCKSDB_NAMESPACE
 
diff --git a/db/compaction/compaction_service_test.cc b/db/compaction/compaction_service_test.cc
index 88de6d0e48d0..7414b52bb609 100644
--- a/db/compaction/compaction_service_test.cc
+++ b/db/compaction/compaction_service_test.cc
@@ -4,6 +4,7 @@
 //  (found in the LICENSE.Apache file in the root directory).
 
 #include "db/db_test_util.h"
+#include "file/file_util.h"
 #include "port/stack_trace.h"
 #include "rocksdb/utilities/options_util.h"
 #include "table/unique_id_impl.h"
@@ -16,17 +17,17 @@ class MyTestCompactionService : public CompactionService {
   MyTestCompactionService(
       std::string db_path, Options& options,
       std::shared_ptr<Statistics>& statistics,
-      std::vector<std::shared_ptr<EventListener>>& listeners,
+      std::vector<std::shared_ptr<EventListener>> listeners,
       std::vector<std::shared_ptr<TablePropertiesCollectorFactory>>
           table_properties_collector_factories)
       : db_path_(std::move(db_path)),
-        options_(options),
         statistics_(statistics),
+        options_(options),
         start_info_("na", "na", "na", 0, "na", 0, Env::TOTAL,
                     CompactionReason::kUnknown, false, false, false, -1, -1),
         wait_info_("na", "na", "na", 0, "na", 0, Env::TOTAL,
                    CompactionReason::kUnknown, false, false, false, -1, -1),
-        listeners_(listeners),
+        listeners_(std::move(listeners)),
         table_properties_collector_factories_(
             std::move(table_properties_collector_factories)) {}
 
@@ -72,6 +73,31 @@ class MyTestCompactionService : public CompactionService {
     if (is_override_wait_status_) {
       return override_wait_status_;
     }
+
+    CompactionServiceOptionsOverride options_override = GetOptionsOverride();
+
+    OpenAndCompactOptions options;
+    options.canceled = &canceled_;
+
+    Status s =
+        DB::OpenAndCompact(options, db_path_, GetOutputPath(scheduled_job_id),
+                           compaction_input, result, options_override);
+    {
+      InstrumentedMutexLock l(&mutex_);
+      if (is_override_wait_result_) {
+        *result = override_wait_result_;
+      }
+      result_ = *result;
+    }
+    compaction_num_.fetch_add(1);
+    if (s.ok()) {
+      return CompactionServiceJobStatus::kSuccess;
+    } else {
+      return CompactionServiceJobStatus::kFailure;
+    }
+  }
+
+  CompactionServiceOptionsOverride GetOptionsOverride() {
     CompactionServiceOptionsOverride options_override;
     options_override.env = options_.env;
     options_override.file_checksum_gen_factory =
@@ -94,26 +120,7 @@ class MyTestCompactionService : public CompactionService {
       options_override.table_properties_collector_factories =
           table_properties_collector_factories_;
     }
-
-    OpenAndCompactOptions options;
-    options.canceled = &canceled_;
-
-    Status s =
-        DB::OpenAndCompact(options, db_path_, db_path_ + "/" + scheduled_job_id,
-                           compaction_input, result, options_override);
-    {
-      InstrumentedMutexLock l(&mutex_);
-      if (is_override_wait_result_) {
-        *result = override_wait_result_;
-      }
-      result_ = *result;
-    }
-    compaction_num_.fetch_add(1);
-    if (s.ok()) {
-      return CompactionServiceJobStatus::kSuccess;
-    } else {
-      return CompactionServiceJobStatus::kFailure;
-    }
+    return options_override;
   }
 
   void CancelAwaitingJobs() override { canceled_ = true; }
@@ -160,14 +167,21 @@ class MyTestCompactionService : public CompactionService {
     return final_updated_status_.load();
   }
 
- private:
+ protected:
   InstrumentedMutex mutex_;
-  std::atomic_int compaction_num_{0};
+  const std::string db_path_;
+  std::shared_ptr<Statistics> statistics_;
   std::map<std::string, std::string> jobs_;
   std::map<std::string, CompactionServiceJobInfo> infos_;
-  const std::string db_path_;
+  std::string result_;
+
+  std::string GetOutputPath(const std::string& scheduled_job_id) {
+    return db_path_ + "/" + scheduled_job_id;
+  }
+
+ private:
+  std::atomic_int compaction_num_{0};
   Options options_;
-  std::shared_ptr<Statistics> statistics_;
   CompactionServiceJobInfo start_info_;
   CompactionServiceJobInfo wait_info_;
   bool is_override_start_status_ = false;
@@ -177,7 +191,6 @@ class MyTestCompactionService : public CompactionService {
   CompactionServiceJobStatus override_wait_status_ =
       CompactionServiceJobStatus::kFailure;
   bool is_override_wait_result_ = false;
-  std::string result_;
   std::string override_wait_result_;
   std::vector<std::shared_ptr<EventListener>> listeners_;
   std::vector<std::shared_ptr<TablePropertiesCollectorFactory>>
@@ -2005,6 +2018,318 @@ TEST_F(CompactionServiceTest, TablePropertiesCollector) {
   ASSERT_TRUE(has_user_property);
 }
 
+class ResumableCompactionService : public MyTestCompactionService {
+ public:
+  enum class TestScenario {
+    // Test scenario 1: Two-phase compaction with resumption
+    // - Phase 1: Cancel the compaction running with resumption enabled (saves
+    // progress)
+    // - Phase 2: Resume from saved progress and complete
+    // Validates: Resumption reduces redundant work
+    kCancelThenResume,
+
+    // Test scenario 2: Two-phase compaction without resumption
+    // - Phase 1: Cancel the compaction running with resumption enabled (saves
+    // progress)
+    // - Phase 2: Start fresh without resumption (ignores saved progress) and
+    // complete
+    // Validates: Disabling resumption causes full reprocessing
+    kCancelThenFreshStart,
+
+    // Test scenario 3: Three-phase compaction toggling resumption on/off/on
+    // - Phase 1: Cancel the compaction running with resumption enabled (saves
+    // progress)
+    // - Phase 2: Start fresh wtihout resumption (ignores saved progress) and
+    // cancel agains
+    // - Phase 3: Resume with resumption support (loads Phase 1's progress) and
+    // complete
+    // Validates: Resumption state can be toggled;
+    kMultipleCancelToggleResumption
+  };
+
+  ResumableCompactionService(const std::string& db_path, Options& options,
+                             std::shared_ptr<Statistics> statistics,
+                             TestScenario scenario)
+      : MyTestCompactionService(db_path, options, statistics,
+                                {} /* listeners */,
+                                {} /* table_properties_collector_factories */),
+        scenario_(scenario) {}
+
+  CompactionServiceJobStatus Wait(const std::string& scheduled_job_id,
+                                  std::string* result) override {
+    std::string compaction_input = ExtractCompactionInput(scheduled_job_id);
+    EXPECT_FALSE(compaction_input.empty());
+
+    OpenAndCompactOptions open_and_compaction_options;
+    auto override_options = GetOptionsOverride();
+
+    // Force creation of one key per output file for test simplicity.
+    // ASSUMPTION: This makes stats.count directly proportional to keys
+    // processed.
+    SyncPoint::GetInstance()->SetCallBack(
+        "CompactionOutputs::ShouldStopBefore::manual_decision", [](void* p) {
+          auto* pair = static_cast<std::pair<bool*, const Slice>*>(p);
+          *(pair->first) = true;
+        });
+    // Simulate cancelled compaction by overriding status at completion. So
+    // compaction processes all keys before this point to make stats.count
+    // comparison straightforward.
+    SyncPoint::GetInstance()->SetCallBack(
+        "DBImplSecondary::CompactWithoutInstallation::End", [&](void* status) {
+          auto s = static_cast<Status*>(status);
+          *s = Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+        });
+    SyncPoint::GetInstance()->EnableProcessing();
+
+    // Phase 1: Run compaction with resumption enabled and cancel it
+    // - Processes all input keys
+    // - Creates output files and saves progress
+    // - Status overridden to "paused"
+    open_and_compaction_options.allow_resumption = true;
+    auto phase1_stats =
+        RunCancelledCompaction(open_and_compaction_options, scheduled_job_id,
+                               compaction_input, override_options);
+
+    HistogramData phase2_stats;
+
+    if (scenario_ == TestScenario::kMultipleCancelToggleResumption) {
+      // Phase 2: Run compaction WITHOUT resumption (fresh start) and cancel it
+      // - Delete all files left behind Phase 1 before calling OpenAndCompact()
+      // - Processes all input keys again from scratch
+      // - Creates output files but does NOT save progress
+      // - Status overridden to "paused"
+      open_and_compaction_options.allow_resumption = false;
+
+      // Clean up output folder for fresh start
+      std::string output_dir = GetOutputPath(scheduled_job_id);
+      Status cleanup_status = DestroyDir(override_options.env, output_dir);
+      EXPECT_TRUE(cleanup_status.ok());
+      EXPECT_OK(override_options.env->CreateDir(output_dir));
+
+      phase2_stats =
+          RunCancelledCompaction(open_and_compaction_options, scheduled_job_id,
+                                 compaction_input, override_options);
+
+      // Validation: Phase 2 starts from scratch, so it processes the same
+      // input keys as Phase 1.
+      // ASSUMPTION: With fixed input (10 keys) and deterministic cancellation
+      // (after processing), both phases create the same number of output files.
+      EXPECT_EQ(phase2_stats.count, phase1_stats.count);
+    }
+
+    SyncPoint::GetInstance()->ClearCallBack(
+        "DBImplSecondary::CompactWithoutInstallation::End");
+
+    // Final phase: Run compaction to completion (no cancellation)
+    if (scenario_ == TestScenario::kMultipleCancelToggleResumption) {
+      // Attempt to resume but it ends up starting fresh
+      open_and_compaction_options.allow_resumption = true;
+    } else if (scenario_ == TestScenario::kCancelThenResume) {
+      // Resume from Phase 1's saved progress
+      open_and_compaction_options.allow_resumption = true;
+    } else {  // kCancelThenFreshStart
+      // Start fresh without resumption
+      open_and_compaction_options.allow_resumption = false;
+
+      // Clean up output folder for fresh start
+      std::string output_dir = GetOutputPath(scheduled_job_id);
+      Status cleanup_status = DestroyDir(override_options.env, output_dir);
+      EXPECT_TRUE(cleanup_status.ok());
+      EXPECT_OK(override_options.env->CreateDir(output_dir));
+    }
+
+    auto final_phase_stats =
+        RunCompaction(open_and_compaction_options, scheduled_job_id,
+                      compaction_input, override_options, result);
+
+    SyncPoint::GetInstance()->DisableProcessing();
+    SyncPoint::GetInstance()->ClearAllCallBacks();
+
+    // Validate statistics based on scenario
+    if (scenario_ == TestScenario::kMultipleCancelToggleResumption) {
+      // ASSUMPTION: Phase 1 processes all keys before cancellation
+      EXPECT_GT(phase1_stats.count, 0);
+
+      // ASSUMPTION: Phase 2 runs with allow_resumption=false and an empty
+      // folder. Phase 2 then creates its own output files (but doesn't save
+      // progress). When Phase 3 starts with allow_resumption=true, it finds no
+      // progress file exists, so it cannot resume and must start from scratch,
+      // processing all input keys again.
+      // Result: Phase 3 does the same amount of work as Phase 1.
+      EXPECT_EQ(final_phase_stats.count, phase1_stats.count);
+
+    } else if (scenario_ == TestScenario::kCancelThenResume) {
+      // ASSUMPTION: Phase 1 processes all keys before cancellation
+      EXPECT_GT(phase1_stats.count, 0);
+
+      // ASSUMPTION: Phase 1 processes all keys and saves progress before
+      // cancellation. Final phase resumes from Phase 1's saved progress.
+      // Since Phase 1 completed all processing before being cancelled, the
+      // final phase should do less work than Phase 1.
+      EXPECT_LT(final_phase_stats.count, phase1_stats.count);
+
+    } else {  // kCancelThenFreshStart
+      // ASSUMPTION: Phase 1 processes all keys before cancellation
+      EXPECT_GT(phase1_stats.count, 0);
+
+      // ASSUMPTION: Final phase starts fresh without resumption, so it
+      // processes all input keys again and creates the same number of files
+      EXPECT_EQ(final_phase_stats.count, phase1_stats.count);
+    }
+
+    StoreResult(*result);
+
+    return CompactionServiceJobStatus::kSuccess;
+  }
+
+ private:
+  std::string ExtractCompactionInput(const std::string& scheduled_job_id) {
+    InstrumentedMutexLock l(&mutex_);
+
+    auto job_index = jobs_.find(scheduled_job_id);
+    if (job_index == jobs_.end()) {
+      return "";
+    }
+    std::string compaction_input = std::move(job_index->second);
+    jobs_.erase(job_index);
+
+    auto info_index = infos_.find(scheduled_job_id);
+    if (info_index == infos_.end()) {
+      return "";
+    }
+    infos_.erase(info_index);
+
+    return compaction_input;
+  }
+
+  HistogramData RunCancelledCompaction(
+      const OpenAndCompactOptions& options, const std::string& scheduled_job_id,
+      const std::string& compaction_input,
+      const CompactionServiceOptionsOverride& override_options) {
+    std::string temp_result;
+    EXPECT_OK(statistics_->Reset());
+
+    Status s =
+        DB::OpenAndCompact(options, db_path_, GetOutputPath(scheduled_job_id),
+                           compaction_input, &temp_result, override_options);
+
+    EXPECT_TRUE(s.IsManualCompactionPaused());
+
+    HistogramData stats;
+    statistics_->histogramData(FILE_WRITE_COMPACTION_MICROS, &stats);
+    return stats;
+  }
+
+  HistogramData RunCompaction(
+      const OpenAndCompactOptions& options, const std::string& scheduled_job_id,
+      const std::string& compaction_input,
+      const CompactionServiceOptionsOverride& override_options,
+      std::string* result) {
+    EXPECT_OK(statistics_->Reset());
+
+    Status s =
+        DB::OpenAndCompact(options, db_path_, GetOutputPath(scheduled_job_id),
+                           compaction_input, result, override_options);
+
+    EXPECT_TRUE(s.ok());
+
+    HistogramData stats;
+    statistics_->histogramData(FILE_WRITE_COMPACTION_MICROS, &stats);
+    return stats;
+  }
+
+  void StoreResult(const std::string& result) {
+    InstrumentedMutexLock l(&mutex_);
+    result_ = result;
+  }
+
+  TestScenario scenario_;
+};
+
+class ResumableCompactionServiceTest : public CompactionServiceTest {
+ public:
+  explicit ResumableCompactionServiceTest() : CompactionServiceTest() {}
+
+  void RunCompactionCancelTest(
+      ResumableCompactionService::TestScenario scenario) {
+    Options options = CurrentOptions();
+    options.disable_auto_compactions = true;
+    std::shared_ptr<Statistics> statistics = CreateDBStatistics();
+
+    options.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory();
+    BlockBasedTableOptions table_options;
+    table_options.verify_compression = true;
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+    auto resume_cs = std::make_shared<ResumableCompactionService>(
+        dbname_, options, statistics, scenario);
+    options.compaction_service = resume_cs;
+
+    DestroyAndReopen(options);
+
+    GenerateTestData();
+
+    CompactRangeOptions cro;
+    cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+    Status s = db_->CompactRange(cro, nullptr, nullptr);
+    ASSERT_OK(s);
+
+    VerifyTestData();
+
+    s = db_->VerifyChecksum();
+    ASSERT_OK(s);
+
+    s = db_->VerifyFileChecksums(ReadOptions());
+    ASSERT_OK(s);
+
+    CompactionServiceResult result;
+    resume_cs->GetResult(&result);
+    ASSERT_OK(result.status);
+    ASSERT_TRUE(result.stats.is_manual_compaction);
+    ASSERT_TRUE(result.stats.is_remote_compaction);
+    ASSERT_GT(result.output_files.size(), 0);
+  }
+
+  void GenerateTestData() {
+    for (int i = 0; i < kNumKeys; ++i) {
+      ASSERT_OK(Put(Key(i), "value"));
+      ASSERT_OK(Flush());
+      if (i % 2 == 0) {
+        ASSERT_OK(Delete(Key(i)));
+        ASSERT_OK(Flush());
+      }
+    }
+  }
+
+  void VerifyTestData() {
+    for (int i = 0; i < kNumKeys; ++i) {
+      if (i % 2 == 0) {
+        ASSERT_EQ("NOT_FOUND", Get((Key(i))));
+      } else {
+        ASSERT_EQ("value", Get((Key(i))));
+      }
+    }
+  }
+
+ private:
+  static constexpr int kNumKeys = 10;
+};
+
+TEST_F(ResumableCompactionServiceTest, CompactionCancelThenResume) {
+  RunCompactionCancelTest(
+      ResumableCompactionService::TestScenario::kCancelThenResume);
+}
+
+TEST_F(ResumableCompactionServiceTest, CompactionCancelThenFreshStart) {
+  RunCompactionCancelTest(
+      ResumableCompactionService::TestScenario::kCancelThenFreshStart);
+}
+
+TEST_F(ResumableCompactionServiceTest,
+       CompactionMultipleCancelToggleResumption) {
+  RunCompactionCancelTest(ResumableCompactionService::TestScenario::
+                              kMultipleCancelToggleResumption);
+}
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/db/db_impl/db_impl_secondary.cc b/db/db_impl/db_impl_secondary.cc
index 69c40eefed57..b13af01999cd 100644
--- a/db/db_impl/db_impl_secondary.cc
+++ b/db/db_impl/db_impl_secondary.cc
@@ -8,7 +8,12 @@
 #include <cinttypes>
 
 #include "db/arena_wrapped_db_iter.h"
+#include "db/log_reader.h"
+#include "db/log_writer.h"
 #include "db/merge_context.h"
+#include "db/version_edit.h"
+#include "file/filename.h"
+#include "file/writable_file_writer.h"
 #include "logging/auto_roll_logger.h"
 #include "logging/logging.h"
 #include "monitoring/perf_context_imp.h"
@@ -823,18 +828,487 @@ Status DB::OpenAsSecondary(
   return s;
 }
 
+Status DBImplSecondary::ScanCompactionProgressFiles(
+    CompactionProgressFilesScan* scan_result) {
+  assert(scan_result != nullptr);
+  scan_result->Clear();
+
+  WriteOptions write_options(Env::IOActivity::kCompaction);
+  IOOptions opts;
+  Status s = WritableFileWriter::PrepareIOOptions(write_options, opts);
+  if (!s.ok()) {
+    return s;
+  }
+
+  std::vector<std::string> all_filenames;
+  s = fs_->GetChildren(secondary_path_, opts, &all_filenames, nullptr /* dbg*/);
+  if (!s.ok()) {
+    return s;
+  }
+
+  for (const auto& filename : all_filenames) {
+    if (filename == "." || filename == "..") {
+      continue;
+    }
+
+    uint64_t number;
+    FileType type;
+
+    if (!ParseFileName(filename, &number, &type)) {
+      continue;
+    }
+
+    // Categorize compaction progress files
+    if (type == kCompactionProgressFile) {
+      if (number > scan_result->latest_progress_timestamp) {
+        // Found a newer progress file
+        if (scan_result->HasLatestProgressFile()) {
+          // Previous "latest" becomes "old"
+          scan_result->old_progress_filenames.push_back(
+              scan_result->latest_progress_filename.value());
+        }
+        scan_result->latest_progress_timestamp = number;
+        scan_result->latest_progress_filename = filename;
+      } else {
+        // This is an older progress file
+        scan_result->old_progress_filenames.push_back(filename);
+      }
+    } else if (type == kTempFile &&
+               filename.find(kCompactionProgressFileNamePrefix) == 0) {
+      // Temporary progress files
+      scan_result->temp_progress_filenames.push_back(filename);
+    } else if (type == kTableFile) {
+      // Collect table file numbers for CleanupPhysicalCompactionOutputFiles
+      scan_result->table_file_numbers.push_back(number);
+    }
+  }
+
+  return Status::OK();
+}
+
+Status DBImplSecondary::DeleteCompactionProgressFiles(
+    const std::vector<std::string>& filenames) {
+  WriteOptions write_options(Env::IOActivity::kCompaction);
+  IOOptions opts;
+  Status s = WritableFileWriter::PrepareIOOptions(write_options, opts);
+  if (!s.ok()) {
+    return s;
+  }
+
+  for (const auto& filename : filenames) {
+    std::string file_path = secondary_path_ + "/" + filename;
+    Status delete_status = fs_->DeleteFile(file_path, opts, nullptr /* dbg */);
+    if (!delete_status.ok()) {
+      return delete_status;
+    }
+  }
+
+  return Status::OK();
+}
+
+Status DBImplSecondary::CleanupOldAndTemporaryCompactionProgressFiles(
+    bool preserve_latest, const CompactionProgressFilesScan& scan_result) {
+  std::vector<std::string> filenames_to_delete;
+
+  // Always delete old progress files
+  filenames_to_delete.insert(filenames_to_delete.end(),
+                             scan_result.old_progress_filenames.begin(),
+                             scan_result.old_progress_filenames.end());
+
+  // Always delete temp files
+  filenames_to_delete.insert(filenames_to_delete.end(),
+                             scan_result.temp_progress_filenames.begin(),
+                             scan_result.temp_progress_filenames.end());
+
+  // Conditionally delete latest file
+  if (!preserve_latest && scan_result.HasLatestProgressFile()) {
+    filenames_to_delete.push_back(scan_result.latest_progress_filename.value());
+  }
+
+  return DeleteCompactionProgressFiles(filenames_to_delete);
+}
+
+// Loads compaction progress from a file and cleans up extra output
+// files. After loading the progress, this function identifies and deletes any
+// SST files in the output folder that are NOT tracked in the
+// progress. This ensures consistency between the progress file and
+// actual output files on disk.
+Status DBImplSecondary::LoadCompactionProgressAndCleanupExtraOutputFiles(
+    const std::string& compaction_progress_file_path,
+    const CompactionProgressFilesScan& scan_result) {
+  Status s = ParseCompactionProgressFile(compaction_progress_file_path,
+                                         &compaction_progress_);
+  if (s.ok()) {
+    s = CleanupPhysicalCompactionOutputFiles(true /* preserve_tracked_files */,
+                                             scan_result);
+  }
+  return s;
+}
+
+Status DBImplSecondary::ParseCompactionProgressFile(
+    const std::string& compaction_progress_file_path,
+    CompactionProgress* compaction_progress) {
+  std::unique_ptr<FSSequentialFile> file;
+  Status s = fs_->NewSequentialFile(compaction_progress_file_path,
+                                    FileOptions(), &file, nullptr /* dbg */);
+  if (!s.ok()) {
+    return s;
+  }
+
+  std::unique_ptr<SequentialFileReader> file_reader(new SequentialFileReader(
+      std::move(file), compaction_progress_file_path,
+      immutable_db_options_.log_readahead_size, io_tracer_, {} /* listeners */,
+      immutable_db_options_.rate_limiter.get()));
+
+  Status reader_status;
+
+  struct CompactionProgressReaderReporter : public log::Reader::Reporter {
+    Status* status;
+    explicit CompactionProgressReaderReporter(Status* s) : status(s) {}
+
+    void Corruption(size_t /*bytes*/, const Status& s,
+                    uint64_t /*log_number*/) override {
+      if (status->ok()) {
+        *status = s;
+      }
+    }
+
+    void OldLogRecord(size_t /*bytes*/) override {
+      // Ignore old records
+    }
+  } progress_reporter(&reader_status);
+
+  log::Reader compaction_progress_reader(
+      immutable_db_options_.info_log, std::move(file_reader),
+      &progress_reporter, true /* checksum */, 0 /* log_num */);
+
+  // LIMITATION: Only supports resuming single subcompaction
+  SubcompactionProgressBuilder progress_builder;
+  Slice slice;
+  std::string record;
+
+  while (compaction_progress_reader.ReadRecord(&slice, &record)) {
+    if (!reader_status.ok()) {
+      return reader_status;
+    }
+
+    VersionEdit edit;
+    s = edit.DecodeFrom(slice);
+    if (!s.ok()) {
+      break;
+    }
+
+    bool res = progress_builder.ProcessVersionEdit(edit);
+    if (!res) {
+      break;
+    }
+  }
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  if (progress_builder.HasAccumulatedSubcompactionProgress()) {
+    compaction_progress->clear();
+    compaction_progress->push_back(
+        progress_builder.GetAccumulatedSubcompactionProgress());
+  } else {
+    s = Status::NotFound("No compaction progress was persisted yet");
+  }
+
+  return s;
+}
+
+Status DBImplSecondary::RenameCompactionProgressFile(
+    const std::string& temp_file_path, std::string* final_file_path) {
+  uint64_t current_time = env_->NowMicros();
+  *final_file_path = CompactionProgressFileName(secondary_path_, current_time);
+
+  WriteOptions write_options(Env::IOActivity::kCompaction);
+  IOOptions opts;
+  Status s = WritableFileWriter::PrepareIOOptions(write_options, opts);
+  if (!s.ok()) {
+    return s;
+  }
+
+  s = fs_->RenameFile(temp_file_path, *final_file_path, opts,
+                      nullptr /* dbg */);
+
+  return s;
+}
+
+Status DBImplSecondary::CleanupPhysicalCompactionOutputFiles(
+    bool preserve_tracked_files,
+    const CompactionProgressFilesScan& scan_result) {
+  std::unordered_set<uint64_t> files_to_preserve;
+
+  if (preserve_tracked_files) {
+    for (const auto& subcompaction_progress : compaction_progress_) {
+      for (const auto& file_metadata :
+           subcompaction_progress.output_level_progress.GetOutputFiles()) {
+        files_to_preserve.insert(file_metadata.fd.GetNumber());
+      }
+      for (const auto& file_metadata :
+           subcompaction_progress.proximal_output_level_progress
+               .GetOutputFiles()) {
+        files_to_preserve.insert(file_metadata.fd.GetNumber());
+      }
+    }
+  }
+
+  WriteOptions write_options(Env::IOActivity::kCompaction);
+  IOOptions opts;
+  Status s = WritableFileWriter::PrepareIOOptions(write_options, opts);
+  if (!s.ok()) {
+    return s;
+  }
+
+  for (uint64_t file_number : scan_result.table_file_numbers) {
+    bool should_delete =
+        !preserve_tracked_files ||
+        (files_to_preserve.find(file_number) == files_to_preserve.end());
+
+    if (should_delete) {
+      std::string file_path = MakeTableFileName(secondary_path_, file_number);
+      Status delete_status =
+          fs_->DeleteFile(file_path, opts, nullptr /* dbg */);
+      if (!delete_status.ok()) {
+        return delete_status;
+      }
+    }
+  }
+
+  return Status::OK();
+}
+
+Status DBImplSecondary::InitializeCompactionWorkspace(
+    bool allow_resumption, std::unique_ptr<FSDirectory>* output_dir,
+    std::unique_ptr<log::Writer>* compaction_progress_writer) {
+  // Create output directory if it doest exist yet
+  Status s = CreateAndNewDirectory(fs_.get(), secondary_path_, output_dir);
+  if (!s.ok() || !allow_resumption) {
+    return s;
+  }
+
+  s = PrepareCompactionProgressState();
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  s = FinalizeCompactionProgressWriter(compaction_progress_writer);
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                 "Initialized compaction workspace with %zu subcompaction "
+                 "progress to resume",
+                 compaction_progress_.size());
+
+  return Status::OK();
+}
+
+// PrepareCompactionProgressState() manages compaction progress files and output
+// files to ensure a clean, consistent state for resuming or starting fresh
+// compaction.
+//
+// PRECONDITION:
+// - This function is ONLY called when allow_resumption = true
+// - The caller wants resumption support for this compaction attempt
+//
+// FILE SYSTEM STATE (before entering this function):
+// - 0 or more compaction progress files may exist in `secondary_path_`:
+//   * Latest progress file (from the most recent compaction attempt)
+//   * Older progress files (left by crashing during a previous
+//     InitializeCompactionWorkspace() call)
+//   * Temporary progress files (left by crashing during a previous
+//     InitializeCompactionWorkspace() call)
+// - 0 or more compaction output files may exist in `secondary_path_`
+//
+// POSTCONDITIONS (after this function):
+// - IF the latest progress file exists AND it parses successfully AND
+//   actually contains valid compaction progress:
+//   * Exactly one latest progress file remains
+//   * All older and temporary compaction progress files are deleted
+//   * All corresponding compaction output files are preserved
+//   * All extra compaction output files are deleted (files left by
+//   compaction
+//     crashing before persisting the progress)
+//   * Result: Ready to resume compaction from the saved progress
+// - OTHERWISE (no latest progress file OR it fails to parse OR it's
+// invalid):
+//   * ALL compaction progress files are deleted (latest + older +
+//   temporary)
+//   * ALL compaction output files are deleted
+//   * Result: Ready to start fresh compaction (despite allow_resumption =
+//   true, we cannot resume because there's no valid progress to resume from)
+//
+// ERROR HANDLING:
+// - ON ERROR (if any of the postconditions cannot be achieved):
+//   * Function returns error status
+//   * File system may be left in a partially modified state
+//   * Caller should manually clean up secondary_path_ before retrying
+//   * Subsequent OpenAndCompact() calls to this clean secondary_path_ will
+//     effectively start fresh compaction
+Status DBImplSecondary::PrepareCompactionProgressState() {
+  Status s;
+
+  // STEP 1: Scan directory ONCE (includes progress files + table files)
+  CompactionProgressFilesScan scan_result;
+  s = ScanCompactionProgressFiles(&scan_result);
+  if (!s.ok()) {
+    ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+                    "Encountered error when scanning for compaction "
+                    "progress files: %s",
+                    s.ToString().c_str());
+    return s;
+  }
+
+  std::optional<std::string> latest_progress_file =
+      scan_result.latest_progress_filename;
+
+  // STEP 2: Determine if we should resume
+  bool should_resume = false;
+  if (latest_progress_file.has_value()) {
+    should_resume = true;
+  } else {
+    ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                   "Did not find any latest compaction progress file. "
+                   "Will perform clean up to start fresh compaction");
+  }
+
+  // STEP 3: Cleanup using pre-scanned results
+  if (should_resume) {
+    // Keep latest, delete old/temp
+    s = CleanupOldAndTemporaryCompactionProgressFiles(
+        true /* preserve_latest */, scan_result);
+  } else {
+    // Delete everything including latest
+    s = CleanupOldAndTemporaryCompactionProgressFiles(
+        false /* preserve_latest */, scan_result);
+    latest_progress_file.reset();
+  }
+
+  if (!s.ok()) {
+    ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+                    "Failed to clean up compaction progress file(s): %s. "
+                    "Will fail the compaction",
+                    s.ToString().c_str());
+    return s;
+  }
+
+  // STEP 4: Load progress if resuming
+  if (latest_progress_file.has_value()) {
+    uint64_t timestamp = scan_result.latest_progress_timestamp;
+
+    std::string compaction_progress_file_path =
+        CompactionProgressFileName(secondary_path_, timestamp);
+
+    s = LoadCompactionProgressAndCleanupExtraOutputFiles(
+        compaction_progress_file_path, scan_result);
+
+    if (!s.ok()) {
+      ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                     "Failed to load the latest compaction "
+                     "progress from %s: %s. Will perform clean up "
+                     "to start fresh compaction",
+                     latest_progress_file.value().c_str(),
+                     s.ToString().c_str());
+      return HandleInvalidOrNoCompactionProgress(compaction_progress_file_path,
+                                                 scan_result);
+    }
+    return s;
+  } else {
+    return HandleInvalidOrNoCompactionProgress(
+        std::nullopt /* compaction_progress_file_path */, scan_result);
+  }
+}
+
+Status DBImplSecondary::HandleInvalidOrNoCompactionProgress(
+    const std::optional<std::string>& compaction_progress_file_path,
+    const CompactionProgressFilesScan& scan_result) {
+  compaction_progress_.clear();
+
+  Status s;
+  if (compaction_progress_file_path.has_value()) {
+    WriteOptions write_options(Env::IOActivity::kCompaction);
+    IOOptions opts;
+    s = WritableFileWriter::PrepareIOOptions(write_options, opts);
+    if (s.ok()) {
+      s = fs_->DeleteFile(compaction_progress_file_path.value(), opts,
+                          nullptr /* dbg */);
+    }
+    if (!s.ok()) {
+      ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+                      "Failed to remove invalid progress file: %s",
+                      s.ToString().c_str());
+      return s;
+    }
+  }
+
+  s = CleanupPhysicalCompactionOutputFiles(false /* preserve_tracked_files */,
+                                           scan_result);
+  if (!s.ok()) {
+    ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+                    "Failed to cleanup existing compaction output files: %s",
+                    s.ToString().c_str());
+    return s;
+  }
+
+  return Status::OK();
+}
+
 Status DBImplSecondary::CompactWithoutInstallation(
     const OpenAndCompactOptions& options, ColumnFamilyHandle* cfh,
     const CompactionServiceInput& input, CompactionServiceResult* result) {
   if (options.canceled && options.canceled->load(std::memory_order_acquire)) {
     return Status::Incomplete(Status::SubCode::kManualCompactionPaused);
   }
+
+  std::unique_ptr<FSDirectory> output_dir;
+  std::unique_ptr<log::Writer> compaction_progress_writer;
+
   InstrumentedMutexLock l(&mutex_);
+
   auto cfd = static_cast_with_check<ColumnFamilyHandleImpl>(cfh)->cfd();
   if (!cfd) {
     return Status::InvalidArgument("Cannot find column family" +
                                    cfh->GetName());
   }
+  Status s;
+
+  // TODO(hx235): Resuming compaction is currently incompatible with
+  // paranoid_file_checks=true because OutputValidator hash verification would
+  // fail during compaction resumption. Before interruption, resuming
+  // compaction needs to persist the hash of each output file to enable
+  // validation after resumption. Alternatively and preferably, we could move
+  // the output verification to happen immediately after each output file is
+  // created. This workaround currently disables resuming compaction when
+  // paranoid_file_checks is enabled. Note that paranoid_file_checks is
+  // disabled by default.
+  bool allow_resumption =
+      options.allow_resumption &&
+      !cfd->GetLatestMutableCFOptions().paranoid_file_checks;
+
+  if (options.allow_resumption &&
+      cfd->GetLatestMutableCFOptions().paranoid_file_checks) {
+    ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                   "Resume compaction configured but disabled due to "
+                   "incompatible with paranoid_file_checks=true");
+  }
+
+  mutex_.Unlock();
+
+  s = InitializeCompactionWorkspace(allow_resumption, &output_dir,
+                                    &compaction_progress_writer);
+
+  mutex_.Lock();
+
+  if (!s.ok()) {
+    return s;
+  }
 
   std::unordered_set<uint64_t> input_set;
   for (const auto& file_name : input.input_files) {
@@ -856,7 +1330,7 @@ Status DBImplSecondary::CompactWithoutInstallation(
       cfd->ioptions().level_compaction_dynamic_level_bytes);
 
   std::vector<CompactionInputFiles> input_files;
-  Status s = cfd->compaction_picker()->GetCompactionInputsFromFileNumbers(
+  s = cfd->compaction_picker()->GetCompactionInputsFromFileNumbers(
       &input_files, &input_set, vstorage, comp_options);
   if (!s.ok()) {
     ROCKS_LOG_ERROR(
@@ -895,13 +1369,6 @@ Status DBImplSecondary::CompactWithoutInstallation(
   assert(c != nullptr);
   c->FinalizeInputInfo(version);
 
-  // Create output directory if it's not existed yet
-  std::unique_ptr<FSDirectory> output_dir;
-  s = CreateAndNewDirectory(fs_.get(), secondary_path_, &output_dir);
-  if (!s.ok()) {
-    return s;
-  }
-
   LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL,
                        immutable_db_options_.info_log.get());
 
@@ -919,13 +1386,15 @@ Status DBImplSecondary::CompactWithoutInstallation(
       options.canceled ? *options.canceled : kManualCompactionCanceledFalse_,
       input.db_id, db_session_id_, secondary_path_, input, result);
 
-  compaction_job.Prepare();
+  compaction_job.Prepare(compaction_progress_,
+                         compaction_progress_writer.get());
 
   mutex_.Unlock();
   s = compaction_job.Run();
   mutex_.Lock();
 
-  // clean up
+  // These cleanup functions handle metadata and state cleanup only and
+  // not the physical files
   compaction_job.io_status().PermitUncheckedError();
   compaction_job.CleanupCompaction();
   c->ReleaseCompactionFiles(s);
@@ -1088,4 +1557,148 @@ Status DB::OpenAndCompact(
                         output, override_options);
 }
 
+Status DBImplSecondary::CreateCompactionProgressWriter(
+    const std::string& file_path,
+    std::unique_ptr<log::Writer>* compaction_progress_writer) {
+  std::unique_ptr<FSWritableFile> file;
+  Status s =
+      fs_->NewWritableFile(file_path, FileOptions(), &file, nullptr /* dbg */);
+  if (!s.ok()) {
+    return s;
+  }
+
+  std::unique_ptr<WritableFileWriter> file_writer(
+      new WritableFileWriter(std::move(file), file_path, FileOptions()));
+
+  compaction_progress_writer->reset(
+      new log::Writer(std::move(file_writer), 0 /* log_number */,
+                      false /* recycle_log_files */));
+
+  return Status::OK();
+}
+
+Status DBImplSecondary::PersistInitialCompactionProgress(
+    log::Writer* compaction_progress_writer,
+    const CompactionProgress& compaction_progress) {
+  assert(compaction_progress_writer);
+
+  // LIMITATION: Only supports resuming single subcompaction
+  assert(compaction_progress.size() == 1);
+  const SubcompactionProgress& subcompaction_progress = compaction_progress[0];
+
+  VersionEdit edit;
+  edit.SetSubcompactionProgress(subcompaction_progress);
+
+  std::string record;
+  if (!edit.EncodeTo(&record)) {
+    return Status::IOError("Failed to encode the initial compaction progress");
+  }
+
+  WriteOptions write_options(Env::IOActivity::kCompaction);
+  Status s = compaction_progress_writer->AddRecord(write_options, record);
+  if (!s.ok()) {
+    return s;
+  }
+  IOOptions opts;
+  s = WritableFileWriter::PrepareIOOptions(write_options, opts);
+  if (!s.ok()) {
+    return s;
+  }
+
+  s = compaction_progress_writer->file()->Sync(opts,
+                                               immutable_db_options_.use_fsync);
+
+  return s;
+}
+
+Status DBImplSecondary::HandleCompactionProgressWriterCreationFailure(
+    const std::string& temp_file_path, const std::string& final_file_path,
+    std::unique_ptr<log::Writer>* compaction_progress_writer) {
+  compaction_progress_writer->reset();
+
+  const std::vector<std::string> paths_to_delete = {final_file_path,
+                                                    temp_file_path};
+
+  Status s;
+  for (const auto& file_path : paths_to_delete) {
+    WriteOptions write_options(Env::IOActivity::kCompaction);
+    IOOptions opts;
+    s = WritableFileWriter::PrepareIOOptions(write_options, opts);
+    if (s.ok()) {
+      s = fs_->DeleteFile(file_path, opts, nullptr /* dbg */);
+    }
+
+    if (!s.ok()) {
+      ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+                      "Failed to cleanup the compaction progress file "
+                      "during writer creation failure: %s",
+                      s.ToString().c_str());
+      return s;
+    }
+  }
+
+  return s;
+}
+
+Status DBImplSecondary::FinalizeCompactionProgressWriter(
+    std::unique_ptr<log::Writer>* compaction_progress_writer) {
+  uint64_t timestamp = env_->NowMicros();
+  const std::string temp_file_path =
+      TempCompactionProgressFileName(secondary_path_, timestamp);
+
+  Status s = CreateCompactionProgressWriter(temp_file_path,
+                                            compaction_progress_writer);
+  if (!s.ok()) {
+    ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                   "Failed to create compaction progress writer at "
+                   "temp path %s: %s. Will perform clean up "
+                   "to start compaction without progress persistence",
+                   temp_file_path.c_str(), s.ToString().c_str());
+    return HandleCompactionProgressWriterCreationFailure(
+        temp_file_path, "" /* final_file_path */, compaction_progress_writer);
+  }
+
+  if (!compaction_progress_.empty()) {
+    s = PersistInitialCompactionProgress(compaction_progress_writer->get(),
+                                         compaction_progress_);
+    if (!s.ok()) {
+      ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                     "Failed to persist the initial copmaction "
+                     "progress: %s. Will perform clean up "
+                     "to start compaction without progress persistence",
+                     s.ToString().c_str());
+      return HandleCompactionProgressWriterCreationFailure(
+          temp_file_path, "" /* final_file_path */, compaction_progress_writer);
+    }
+  }
+
+  compaction_progress_writer->reset();
+
+  std::string final_file_path;
+  s = RenameCompactionProgressFile(temp_file_path, &final_file_path);
+
+  if (!s.ok()) {
+    ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                   "Failed to rename temporary compaction progress "
+                   "file from %s to %s: %s.  Will perform clean up "
+                   "to start compaction without progress persistence",
+                   temp_file_path.c_str(), final_file_path.c_str(),
+                   s.ToString().c_str());
+    return HandleCompactionProgressWriterCreationFailure(
+        temp_file_path, final_file_path, compaction_progress_writer);
+  }
+
+  s = CreateCompactionProgressWriter(final_file_path,
+                                     compaction_progress_writer);
+  if (!s.ok()) {
+    ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                   "Failed to create the final compaction progress "
+                   "writer: %s. Will attempt clean to start the compaction "
+                   "without progress persistence",
+                   s.ToString().c_str());
+    return HandleCompactionProgressWriterCreationFailure(
+        "" /* temp_file_path */, final_file_path, compaction_progress_writer);
+  }
+  return Status::OK();
+}
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/db/db_impl/db_impl_secondary.h b/db/db_impl/db_impl_secondary.h
index b18822b171b3..0476cf60be53 100644
--- a/db/db_impl/db_impl_secondary.h
+++ b/db/db_impl/db_impl_secondary.h
@@ -303,6 +303,84 @@ class DBImplSecondary : public DBImpl {
                                     const CompactionServiceInput& input,
                                     CompactionServiceResult* result);
 
+ private:
+  // Holds results of compaction progress files and output files from a single
+  // directory scan
+  struct CompactionProgressFilesScan {
+    // The latest (newest) progress file filename
+    std::optional<std::string> latest_progress_filename;
+    uint64_t latest_progress_timestamp = 0;
+
+    // Older progress file filenames (to be deleted)
+    autovector<std::string> old_progress_filenames;
+
+    // Temporary progress file filenames (to be deleted)
+    autovector<std::string> temp_progress_filenames;
+
+    // All output file numbers - for cleanup optimization
+    std::vector<uint64_t> table_file_numbers;
+
+    bool HasLatestProgressFile() const {
+      return latest_progress_filename.has_value();
+    }
+
+    void Clear() {
+      latest_progress_filename.reset();
+      latest_progress_timestamp = 0;
+      old_progress_filenames.clear();
+      temp_progress_filenames.clear();
+      table_file_numbers.clear();
+    }
+  };
+
+  Status InitializeCompactionWorkspace(
+      bool allow_resumption, std::unique_ptr<FSDirectory>* output_dir,
+      std::unique_ptr<log::Writer>* compaction_progress_writer);
+
+  Status PrepareCompactionProgressState();
+
+  Status ScanCompactionProgressFiles(CompactionProgressFilesScan* scan_result);
+
+  Status DeleteCompactionProgressFiles(
+      const std::vector<std::string>& filenames);
+
+  Status CleanupOldAndTemporaryCompactionProgressFiles(
+      bool preserve_latest, const CompactionProgressFilesScan& scan_result);
+
+  Status LoadCompactionProgressAndCleanupExtraOutputFiles(
+      const std::string& compaction_progress_file_path,
+      const CompactionProgressFilesScan& scan_result);
+
+  Status ParseCompactionProgressFile(
+      const std::string& compaction_progress_file_path,
+      CompactionProgress* compaction_progress);
+
+  Status HandleInvalidOrNoCompactionProgress(
+      const std::optional<std::string>& compaction_progress_file_path,
+      const CompactionProgressFilesScan& scan_result);
+
+  Status CleanupPhysicalCompactionOutputFiles(
+      bool preserve_tracked_files,
+      const CompactionProgressFilesScan& scan_result);
+
+  Status FinalizeCompactionProgressWriter(
+      std::unique_ptr<log::Writer>* compaction_progress_writer);
+
+  Status CreateCompactionProgressWriter(
+      const std::string& file_path,
+      std::unique_ptr<log::Writer>* compaction_progress_writer);
+
+  Status PersistInitialCompactionProgress(
+      log::Writer* compaction_progress_writer,
+      const CompactionProgress& compaction_progress);
+
+  Status RenameCompactionProgressFile(const std::string& temp_file_path,
+                                      std::string* final_file_path);
+
+  Status HandleCompactionProgressWriterCreationFailure(
+      const std::string& temp_file_path, const std::string& final_file_path,
+      std::unique_ptr<log::Writer>* compaction_progress_writer);
+
   // Cache log readers for each log number, used for continue WAL replay
   // after recovery
   std::map<uint64_t, std::unique_ptr<LogReaderContainer>> log_readers_;
@@ -311,6 +389,8 @@ class DBImplSecondary : public DBImpl {
   std::unordered_map<ColumnFamilyData*, uint64_t> cfd_to_current_log_;
 
   const std::string secondary_path_;
+
+  CompactionProgress compaction_progress_;
 };
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/db/merge_helper.h b/db/merge_helper.h
index 3c016e6753e7..098b9b5baba6 100644
--- a/db/merge_helper.h
+++ b/db/merge_helper.h
@@ -307,7 +307,7 @@ class MergeOutputIterator {
 
   Slice key() { return Slice(*it_keys_); }
   Slice value() { return Slice(*it_values_); }
-  bool Valid() { return it_keys_ != merge_helper_->keys().rend(); }
+  bool Valid() const { return it_keys_ != merge_helper_->keys().rend(); }
 
  private:
   const MergeHelper* merge_helper_;
diff --git a/db/version_edit.h b/db/version_edit.h
index 9d7d11265694..0963b5754814 100644
--- a/db/version_edit.h
+++ b/db/version_edit.h
@@ -549,13 +549,11 @@ struct SubcompactionProgress {
   std::string ToString() const {
     std::ostringstream oss;
     oss << "SubcompactionProgress{";
-    oss << " next_internal_key_to_compact="
-        << (next_internal_key_to_compact.empty()
-                ? "NONE"
-                : next_internal_key_to_compact);
+    oss << " next_internal_key_to_compact"
+        << (next_internal_key_to_compact.empty() ? " empty" : " non-empty");
     oss << ", num_processed_input_records=" << num_processed_input_records;
-    oss << ", output_level_progress" << output_level_progress.ToString();
-    oss << ", proximal_output_level_progress"
+    oss << ", output_level_progress=" << output_level_progress.ToString();
+    oss << ", proximal_output_level_progress="
         << proximal_output_level_progress.ToString();
     oss << " }";
     return oss.str();
diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h
index 7bc50ad890f2..ff62188795d1 100644
--- a/include/rocksdb/db.h
+++ b/include/rocksdb/db.h
@@ -353,16 +353,30 @@ class DB {
       std::vector<ColumnFamilyHandle*>* handles, std::unique_ptr<DB>* dbptr);
   // End EXPERIMENTAL
 
-  // Open DB and run the compaction.
-  // It's a read-only operation, the result won't be installed to the DB, it
-  // will be output to the `output_directory`. The API should only be used with
-  // `options.CompactionService` to run compaction triggered by
-  // `CompactionService`.
   static Status OpenAndCompact(
       const std::string& name, const std::string& output_directory,
       const std::string& input, std::string* output,
       const CompactionServiceOptionsOverride& override_options);
 
+  // Opens a database and runs compaction without modifying the original DB.
+  //
+  // This read-only operation outputs compaction results to `output_directory`
+  // instead of installing them back to the source database. Designed primarily
+  // for use with `CompactionService` to process remote compaction jobs.
+  //
+  // Parameters:
+  // - `options`: Additional controls
+  //   * When `allow_resumption = false`: The `output_directory` MUST be empty
+  //     before calling this function. Any existing files (including resume
+  //     state or output files from previous runs) in the directory may
+  //     cause correctness errors as the compaction will start from scratch.
+  // - `name`: Source database path
+  // - `output_directory`: Where compaction output files are written
+  // - `input`: Serialized compaction input information
+  // - `output`: Serialized compaction result
+  // - `override_options`: Configuration overrides for the operation
+  //
+  // Returns: Status of the compaction operation
   static Status OpenAndCompact(
       const OpenAndCompactOptions& options, const std::string& name,
       const std::string& output_directory, const std::string& input,
diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index ebd3f4f727e5..aeca38ec2487 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -2808,6 +2808,43 @@ struct CompactionServiceOptionsOverride {
 struct OpenAndCompactOptions {
   // Allows cancellation of an in-progress compaction.
   std::atomic<bool>* canceled = nullptr;
+
+  // EXPERIMENTAL
+  //
+  // Controls whether OpenAndCompact() should attempt to resume from previously
+  // persisted compaction progress or start fresh.
+  //
+  // When `allow_resumption = true`:
+  // - OpenAndCompact() attempts to resume from previously persisted compaction
+  //   progress stored in `output_directory`
+  // - During execution, it periodically persists new progress to the same
+  //   directory, allowing future calls to continue from where the previous
+  //   compaction left off.
+  // - Fallback behavior: If resumption cannot be fulfilled (e.g., due to
+  //   corrupted or missing resume state), the system will attempt to start a
+  //   fresh compaction as a best-effort fallback by cleaning related files in
+  //   the `output_directory` to achieve a clean state. If even the fresh
+  //   compaction cannot be started, a non-OK status will be returned.
+  // - Important: Resume attempts will be ineffective if the underlying
+  //   conditions that caused the previous OpenAndCompact() failure still
+  //   persist. The same non-OK status will likely be returned unless the root
+  //   cause has been resolved.
+  // - Progress persistence is sequential and best-effort, triggered upon
+  //   completion of each new output file. If compaction is interrupted while
+  //   creating an output file (before its completion), that partial work will
+  //   need to be redone upon resumption.
+  //
+  // When `allow_resumption = false`:
+  // - OpenAndCompact() starts a fresh compaction from scratch.
+  // - No progress will be saved during execution, so interruptions require
+  //   starting over completely.
+  // - CRITICAL REQUIREMENT: The `output_directory` associated MUST be empty
+  //   before calling OpenAndCompact(). Any existing files (including resume
+  //   state or output files from previous runs) may cause correctness errors.
+  //
+  // Limitation: Currently incompatible with paranoid_file_checks=true. The
+  // option is effectively disabled when `paranoid_file_checks` is enabled.
+  bool allow_resumption = false;
 };
 
 struct LiveFilesStorageInfoOptions {
diff --git a/unreleased_history/new_features/resume_compaction.md b/unreleased_history/new_features/resume_compaction.md
new file mode 100644
index 000000000000..3960b3126095
--- /dev/null
+++ b/unreleased_history/new_features/resume_compaction.md
@@ -0,0 +1 @@
+Added experimental support `OpenAndCompactOptions::allow_resumption` for resumable compaction that persists progress during `OpenAndCompact()`, allowing interrupted compactions to resume from the last progress persitence. The default behavior is to not persist progress.

From 1d18c4ed0177f184f228a7cdfb78eb85d0dab540 Mon Sep 17 00:00:00 2001
From: Xingbo Wang <xbw@fb.com>
Date: Wed, 15 Oct 2025 20:40:05 -0700
Subject: [PATCH 337/500] Reduce macos github CI build time (#14048)

Summary:
We are adding more and more tests, so we need to increase the number of shards in macos build to reduce overall CI time.

macos-15-xlarge image is ARM, which has 5 vCPU cores, but is still 50% faster than the intel x86 12 vCPU.

Test time reduced from 1h 37m to 14m.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14048

Reviewed By: archang19

Differential Revision: D84741917

Pulled By: xingbowang

fbshipit-source-id: 9ba9bd696d3b2152f11dec2fb4280572b98233d5
---
 .github/workflows/pr-jobs.yml | 32 +++++++++++++++++++-------------
 CMakeLists.txt                |  2 +-
 2 files changed, 20 insertions(+), 14 deletions(-)

diff --git a/.github/workflows/pr-jobs.yml b/.github/workflows/pr-jobs.yml
index 1ae487a4e1bd..f766ae26b0a6 100644
--- a/.github/workflows/pr-jobs.yml
+++ b/.github/workflows/pr-jobs.yml
@@ -328,45 +328,51 @@ jobs:
   # ========================= MacOS build only ======================== #
   build-macos:
     if: ${{ github.repository_owner == 'facebook' }}
-    runs-on: macos-13
+    runs-on: macos-15-xlarge
     env:
       ROCKSDB_DISABLE_JEMALLOC: 1
     steps:
     - uses: actions/checkout@v4.1.0
     - uses: maxim-lobanov/setup-xcode@v1.6.0
       with:
-        xcode-version: 14.3.1
+        xcode-version: 16.4.0
     - uses: "./.github/actions/increase-max-open-files-on-macos"
     - uses: "./.github/actions/install-gflags-on-macos"
     - uses: "./.github/actions/pre-steps-macos"
     - name: Build
-      run: ulimit -S -n `ulimit -H -n` && make V=1 J=16 -j16 all
+      run: ulimit -S -n `ulimit -H -n` && make V=1 J=16 -j8 all
     - uses: "./.github/actions/post-steps"
   # ========================= MacOS with Tests ======================== #
   build-macos-cmake:
     if: ${{ github.repository_owner == 'facebook' }}
-    runs-on: macos-13
+    runs-on: macos-15-xlarge
     strategy:
       matrix:
-        run_even_tests: [true, false]
+        run_sharded_tests: [0, 1, 2, 3]
     steps:
     - uses: actions/checkout@v4.1.0
     - uses: maxim-lobanov/setup-xcode@v1.6.0
       with:
-        xcode-version: 14.3.1
+        xcode-version: 16.4.0
     - uses: "./.github/actions/increase-max-open-files-on-macos"
     - uses: "./.github/actions/install-gflags-on-macos"
     - uses: "./.github/actions/pre-steps-macos"
     - name: cmake generate project file
       run: ulimit -S -n `ulimit -H -n` && mkdir build && cd build && cmake -DWITH_GFLAGS=1 ..
     - name: Build tests
-      run: cd build && make V=1 -j16
-    - name: Run even tests
-      run: ulimit -S -n `ulimit -H -n` && cd build && ctest -j16 -I 0,,2
-      if: ${{ matrix.run_even_tests }}
-    - name: Run odd tests
-      run: ulimit -S -n `ulimit -H -n` && cd build && ctest -j16 -I 1,,2
-      if: ${{ ! matrix.run_even_tests  }}
+      run: cd build && make V=1 -j8
+    - name: Run shard 0 out of 4 test shards
+      run: ulimit -S -n `ulimit -H -n` && cd build && ctest -j8 -I 0,,4
+      if: ${{ matrix.run_sharded_tests == 0 }}
+    - name: Run shard 1 out of 4 test shards
+      run: ulimit -S -n `ulimit -H -n` && cd build && ctest -j8 -I 1,,4
+      if: ${{ matrix.run_sharded_tests == 1 }}
+    - name: Run shard 2 out of 4 test shards
+      run: ulimit -S -n `ulimit -H -n` && cd build && ctest -j8 -I 2,,4
+      if: ${{ matrix.run_sharded_tests == 2 }}
+    - name: Run shard 3 out of 4 test shards
+      run: ulimit -S -n `ulimit -H -n` && cd build && ctest -j8 -I 3,,4
+      if: ${{ matrix.run_sharded_tests == 3 }}
     - uses: "./.github/actions/post-steps"
   # ======================== Windows with Tests ======================= #
   # NOTE: some windows jobs are in "nightly" to save resources
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 502e6929aac8..44c564481589 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -313,7 +313,7 @@ if(NOT MSVC)
 endif()
 
 # Check if -latomic is required or not
-if (NOT MSVC)
+if (NOT MSVC AND NOT APPLE)
   CHECK_CXX_SOURCE_COMPILES("
 #include <atomic>
 std::atomic<uint64_t> x(0);

From 42842edc8d1767d8c0a3404fe8c2931eda2981a2 Mon Sep 17 00:00:00 2001
From: Jay Huh <jewoongh@meta.com>
Date: Wed, 15 Oct 2025 22:01:49 -0700
Subject: [PATCH 338/500] Use new TableFactory for each remote compaction in
 stress test (#14050)

Summary:
We simulate remote compaction in our stress test by running a separate set of worker threads to run compactions. In reality, these remote compactions run on a different host or (at least in a different process) where we cannot share the TableFactory and BlockCache with the main DB process.

To make this simulated remote compaction closer to reality, create a new TableFactory for each remote compaction in stress test.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14050

Test Plan:
```
python3 -u tools/db_crashtest.py --cleanup_cmd='' --simple blackbox --remote_compaction_worker_threads=8 --interval=10
```

Reviewed By: hx235

Differential Revision: D84775656

Pulled By: jaykorean

fbshipit-source-id: d6203fcbe0eca3539e008a19fd47b742553537ed
---
 db_stress_tool/db_stress_common.cc | 54 +++++++++++++++++++++++-------
 1 file changed, 42 insertions(+), 12 deletions(-)

diff --git a/db_stress_tool/db_stress_common.cc b/db_stress_tool/db_stress_common.cc
index 99ff429f443f..0f06d4937e17 100644
--- a/db_stress_tool/db_stress_common.cc
+++ b/db_stress_tool/db_stress_common.cc
@@ -258,27 +258,57 @@ void RemoteCompactionWorkerThread(void* v) {
           .compaction_filter = options.compaction_filter,
           .compaction_filter_factory = options.compaction_filter_factory,
           .prefix_extractor = options.prefix_extractor,
-          .table_factory = options.table_factory,
           .sst_partitioner_factory = options.sst_partitioner_factory,
           .listeners = {},
           .statistics = options.statistics,
           .table_properties_collector_factories =
               options.table_properties_collector_factories};
+      std::string serialized_output;
       std::string tmp_output_dir = job_info.db_name + "/" + "tmp_output_" +
                                    db_stress_env->GenerateUniqueId();
-      std::string serialized_output;
-      Status s = DB::OpenAndCompact(OpenAndCompactOptions{}, job_info.db_name,
-                                    tmp_output_dir, serialized_input,
-                                    &serialized_output, override_options);
+
+      // Set up Table Factory
+      ConfigOptions config_options;
+      config_options.ignore_unknown_options = false;
+      config_options.ignore_unsupported_options = false;
+
+      Status s = TableFactory::CreateFromString(
+          config_options, options.table_factory->Name(),
+          &override_options.table_factory);
+      if (s.ok()) {
+        std::string optionsStr;
+        s = options.table_factory->GetOptionString(config_options, &optionsStr);
+        if (s.ok()) {
+          s = override_options.table_factory->ConfigureFromString(
+              config_options, optionsStr);
+        }
+      }
       if (!s.ok()) {
-        // Print in stdout instead of stderr to avoid stress test failure,
-        // because OpenAndCompact() failure doesn't necessarily mean
-        // primary db instance failure.
-        fprintf(stdout, "Failed to run OpenAndCompact(%s): %s\n",
-                job_info.db_name.c_str(), s.ToString().c_str());
+        fprintf(
+            stdout,
+            "Failed to set up TableFactory for remote compaction - (%s): %s\n",
+            job_info.db_name.c_str(), s.ToString().c_str());
       }
-      // Add the output regardless of status, so that primary DB doesn't rely on
-      // the timeout to finish waiting. The actual failure from the
+
+      // TODO(jaykorean) - create a new compaction filter / merge operator and
+      // others for remote compactions
+
+      // Run Remote Compaction
+      if (s.ok()) {
+        s = DB::OpenAndCompact(OpenAndCompactOptions{}, job_info.db_name,
+                               tmp_output_dir, serialized_input,
+                               &serialized_output, override_options);
+        if (!s.ok()) {
+          // Print in stdout instead of stderr to avoid stress test failure,
+          // because OpenAndCompact() failure doesn't necessarily mean
+          // primary db instance failure.
+          fprintf(stdout, "Failed to run OpenAndCompact(%s): %s\n",
+                  job_info.db_name.c_str(), s.ToString().c_str());
+        }
+      }
+
+      // Add the output regardless of status, so that primary DB doesn't rely
+      // on the timeout to finish waiting. The actual failure from the
       // deserialization can fail the compaction properly
       shared->AddRemoteCompactionResult(job_id, s, serialized_output);
     }

From a1dad12c8c9a7a65fa19d3bc78a5f7687ce6c1bd Mon Sep 17 00:00:00 2001
From: Xingbo Wang <xbw@fb.com>
Date: Thu, 16 Oct 2025 17:51:55 -0700
Subject: [PATCH 339/500] Reduce github CI build time (#14057)

Summary:
* Reduce build time of folly from 45m~1hr down to 25m. This is achieved by caching folly build artifact from previous build.
* Reduce windows build time of folly from 1hr 15m down to 50m. This is done by increase windows build machine size.
* Fix build on macos on other macos target.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14057

Test Plan: github CI

Reviewed By: archang19, nmk70

Differential Revision: D84848041

Pulled By: xingbowang

fbshipit-source-id: 00306750737070e7e446ee436d607ed6ecae79ae
---
 .github/actions/build-folly/action.yml | 10 ++++++++++
 .github/actions/cache-folly/action.yml | 26 ++++++++++++++++++++++++++
 .github/workflows/pr-jobs.yml          | 26 +++++++++++++++++++-------
 Makefile                               |  4 +++-
 4 files changed, 58 insertions(+), 8 deletions(-)
 create mode 100644 .github/actions/cache-folly/action.yml

diff --git a/.github/actions/build-folly/action.yml b/.github/actions/build-folly/action.yml
index 70229199958b..84f99de18d25 100644
--- a/.github/actions/build-folly/action.yml
+++ b/.github/actions/build-folly/action.yml
@@ -1,7 +1,17 @@
 name: build-folly
+description: Build folly and dependencies (skipped if cache hit)
+inputs:
+  cache-hit:
+    description: Whether the folly cache was hit
+    required: true
 runs:
   using: composite
   steps:
   - name: Build folly and dependencies
+    if: ${{ inputs.cache-hit != 'true' }}
     run: make build_folly
     shell: bash
+  - name: Skip folly build (using cached version)
+    if: ${{ inputs.cache-hit == 'true' }}
+    run: echo "Folly build skipped - using cached version"
+    shell: bash
diff --git a/.github/actions/cache-folly/action.yml b/.github/actions/cache-folly/action.yml
new file mode 100644
index 000000000000..7ec394eb2391
--- /dev/null
+++ b/.github/actions/cache-folly/action.yml
@@ -0,0 +1,26 @@
+name: cache-folly
+description: Cache folly build to speed up CI
+outputs:
+  cache-hit:
+    description: Whether the cache was hit
+    value: ${{ steps.cache-folly-build.outputs.cache-hit }}
+runs:
+  using: composite
+  steps:
+  - name: Extract FOLLY_COMMIT_HASH from Makefile
+    id: extract-folly-hash
+    shell: bash
+    run: |
+      FOLLY_COMMIT_HASH=$(grep '^FOLLY_COMMIT_HASH' Makefile | awk '{print $3}')
+      echo "hash=$FOLLY_COMMIT_HASH" >> $GITHUB_OUTPUT
+  - name: Cache folly build
+    id: cache-folly-build
+    uses: actions/cache@v4
+    with:
+      # Cache the folly build directory
+      path: /tmp/fbcode_builder_getdeps-Z__wZrocksdbZrocksdbZthird-partyZfollyZbuildZfbcode_builder-root/installed
+      # Key is based on:
+      # - OS and architecture
+      # - The specific folly commit hash from Makefile
+      # - The container image version to account for different compiler/library versions
+      key: folly-build-${{ runner.os }}-${{ runner.arch }}-${{ steps.extract-folly-hash.outputs.hash }}-ubuntu22.1-v1
diff --git a/.github/workflows/pr-jobs.yml b/.github/workflows/pr-jobs.yml
index f766ae26b0a6..98c4cfa22e19 100644
--- a/.github/workflows/pr-jobs.yml
+++ b/.github/workflows/pr-jobs.yml
@@ -103,7 +103,11 @@ jobs:
     - uses: actions/checkout@v4.1.0
     - uses: "./.github/actions/pre-steps"
     - uses: "./.github/actions/setup-folly"
+    - uses: "./.github/actions/cache-folly"
+      id: cache-folly
     - uses: "./.github/actions/build-folly"
+      with:
+        cache-hit: ${{ steps.cache-folly.outputs.cache-hit }}
     - run: "(mkdir build && cd build && cmake -DUSE_FOLLY=1 -DWITH_GFLAGS=1 -DROCKSDB_BUILD_SHARED=0 .. && make V=1 -j20 && ctest -j20)"
     - uses: "./.github/actions/post-steps"
   build-linux-make-with-folly:
@@ -117,7 +121,11 @@ jobs:
     - uses: actions/checkout@v4.1.0
     - uses: "./.github/actions/pre-steps"
     - uses: "./.github/actions/setup-folly"
+    - uses: "./.github/actions/cache-folly"
+      id: cache-folly
     - uses: "./.github/actions/build-folly"
+      with:
+        cache-hit: ${{ steps.cache-folly.outputs.cache-hit }}
     - run: USE_FOLLY=1 LIB_MODE=static V=1 make -j32 check
     - uses: "./.github/actions/post-steps"
   build-linux-make-with-folly-lite-no-test:
@@ -144,7 +152,11 @@ jobs:
     - uses: actions/checkout@v4.1.0
     - uses: "./.github/actions/pre-steps"
     - uses: "./.github/actions/setup-folly"
+    - uses: "./.github/actions/cache-folly"
+      id: cache-folly
     - uses: "./.github/actions/build-folly"
+      with:
+        cache-hit: ${{ steps.cache-folly.outputs.cache-hit }}
     - run: "(mkdir build && cd build && cmake -DUSE_COROUTINES=1 -DWITH_GFLAGS=1 -DROCKSDB_BUILD_SHARED=0 .. && make V=1 -j20 && ctest -j20)"
     - uses: "./.github/actions/post-steps"
   build-linux-cmake-with-benchmark:
@@ -378,7 +390,7 @@ jobs:
   # NOTE: some windows jobs are in "nightly" to save resources
   build-windows-vs2022:
     if: ${{ github.repository_owner == 'facebook' }}
-    runs-on: windows-2022
+    runs-on: windows-8-core
     env:
       CMAKE_GENERATOR: Visual Studio 17 2022
       CMAKE_PORTABLE: 1
@@ -450,7 +462,7 @@ jobs:
     # post-steps skipped because of compatibility issues with docker image
   build-macos-java:
     if: ${{ github.repository_owner == 'facebook' }}
-    runs-on: macos-13
+    runs-on: macos-15-xlarge
     env:
       JAVA_HOME: "/Library/Java/JavaVirtualMachines/liberica-jdk-8.jdk/Contents/Home"
       ROCKSDB_DISABLE_JEMALLOC: 1
@@ -458,7 +470,7 @@ jobs:
     - uses: actions/checkout@v4.1.0
     - uses: maxim-lobanov/setup-xcode@v1.6.0
       with:
-        xcode-version: 14.3.1
+        xcode-version: 16.4.0
     - uses: "./.github/actions/increase-max-open-files-on-macos"
     - uses: "./.github/actions/install-gflags-on-macos"
     - uses: "./.github/actions/install-jdk8-on-macos"
@@ -473,14 +485,14 @@ jobs:
     - uses: "./.github/actions/post-steps"
   build-macos-java-static:
     if: ${{ github.repository_owner == 'facebook' }}
-    runs-on: macos-13
+    runs-on: macos-15-xlarge
     env:
       JAVA_HOME: "/Library/Java/JavaVirtualMachines/liberica-jdk-8.jdk/Contents/Home"
     steps:
     - uses: actions/checkout@v4.1.0
     - uses: maxim-lobanov/setup-xcode@v1.6.0
       with:
-        xcode-version: 14.3.1
+        xcode-version: 16.4.0
     - uses: "./.github/actions/increase-max-open-files-on-macos"
     - uses: "./.github/actions/install-gflags-on-macos"
     - uses: "./.github/actions/install-jdk8-on-macos"
@@ -495,14 +507,14 @@ jobs:
     - uses: "./.github/actions/post-steps"
   build-macos-java-static-universal:
     if: ${{ github.repository_owner == 'facebook' }}
-    runs-on: macos-13
+    runs-on: macos-15-xlarge
     env:
       JAVA_HOME: "/Library/Java/JavaVirtualMachines/liberica-jdk-8.jdk/Contents/Home"
     steps:
     - uses: actions/checkout@v4.1.0
     - uses: maxim-lobanov/setup-xcode@v1.6.0
       with:
-        xcode-version: 14.3.1
+        xcode-version: 16.4.0
     - uses: "./.github/actions/increase-max-open-files-on-macos"
     - uses: "./.github/actions/install-gflags-on-macos"
     - uses: "./.github/actions/install-jdk8-on-macos"
diff --git a/Makefile b/Makefile
index f014aca2ec00..a6a5dd3b4708 100644
--- a/Makefile
+++ b/Makefile
@@ -2495,6 +2495,8 @@ commit_prereq:
 	false # J=$(J) build_tools/precommit_checker.py unit clang_unit release clang_release tsan asan ubsan lite unit_non_shm
 	# $(MAKE) clean && $(MAKE) jclean && $(MAKE) rocksdbjava;
 
+FOLLY_COMMIT_HASH = e95383b7c8b5b1e46cf47acf2f317d54f93c8268
+
 # For public CI runs, checkout folly in a way that can build with RocksDB.
 # This is mostly intended as a test-only simulation of Meta-internal folly
 # integration.
@@ -2506,7 +2508,7 @@ checkout_folly:
 	fi
 	@# Pin to a particular version for public CI, so that PR authors don't
 	@# need to worry about folly breaking our integration. Update periodically
-	cd third-party/folly && git reset --hard e95383b7c8b5b1e46cf47acf2f317d54f93c8268
+	cd third-party/folly && git reset --hard $(FOLLY_COMMIT_HASH)
 	@# Apparently missing include
 	perl -pi -e 's/(#include <atomic>)/$$1\n#include <cstring>/' third-party/folly/folly/lang/Exception.h
 	@# Warning-as-error on memcpy

From ad83352c3900dde08cfe79e727c1083e99fdae2a Mon Sep 17 00:00:00 2001
From: Hui Xiao <huixiao@fb.com>
Date: Fri, 17 Oct 2025 10:54:25 -0700
Subject: [PATCH 340/500] Support dumping compaction progress file in ldb
 (#14058)

Summary:
**Context/Summary:**

This PR adds support to dump compaction progress file in ldb for debugging resumable compaction issue

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14058

Test Plan:
```
/data/users/huixiao/rocksdb$ ./ldb compaction_progress_dump --path=/home/huixiao/COMPACTION_PROGRESS-123
Compaction Progress File: /home/huixiao/COMPACTION_PROGRESS-123
============================================
Progress Record 0:
SubcompactionProgress{ next_internal_key_to_compact=user_key="b" (hex:62), seq=kMaxSequenceNumber, type=24, num_processed_input_records=1, output_level_progress=SubcompactionProgressPerLevel{ num_processed_output_records=1, output_files_count=1, last_persisted_output_files_count=0 }, proximal_output_level_progress=SubcompactionProgressPerLevel{ num_processed_output_records=0, output_files_count=0, last_persisted_output_files_count=0 } }
Progress Record 1:
SubcompactionProgress{ next_internal_key_to_compact=user_key="bb" (hex:6262), seq=kMaxSequenceNumber, type=24, num_processed_input_records=2, output_level_progress=SubcompactionProgressPerLevel{ num_processed_output_records=2, output_files_count=1, last_persisted_output_files_count=0 }, proximal_output_level_progress=SubcompactionProgressPerLevel{ num_processed_output_records=0, output_files_count=0, last_persisted_output_files_count=0 } }
Progress Record 2:
SubcompactionProgress{ next_internal_key_to_compact=user_key="cancel_before_this_key" (hex:63616E63656C5F6265666F72655F746869735F6B6579), seq=kMaxSequenceNumber, type=24, num_processed_input_records=3, output_level_progress=SubcompactionProgressPerLevel{ num_processed_output_records=3, output_files_count=1, last_persisted_output_files_count=0 }, proximal_output_level_progress=SubcompactionProgressPerLevel{ num_processed_output_records=0, output_files_count=0, last_persisted_output_files_count=0 } }

Total records: 3
```

Reviewed By: jaykorean

Differential Revision: D84840680

Pulled By: hx235

fbshipit-source-id: 8e448c50348eb1dba92c4ffdbd2d1bb6306288d6
---
 db/version_edit.cc   |  8 +++++
 db/version_edit.h    | 24 +++++++++++--
 tools/ldb_cmd.cc     | 86 ++++++++++++++++++++++++++++++++++++++++++++
 tools/ldb_cmd_impl.h | 21 +++++++++++
 tools/ldb_tool.cc    |  1 +
 5 files changed, 138 insertions(+), 2 deletions(-)

diff --git a/db/version_edit.cc b/db/version_edit.cc
index 822dedb54d0c..88150181bf4c 100644
--- a/db/version_edit.cc
+++ b/db/version_edit.cc
@@ -975,6 +975,10 @@ std::string VersionEdit::DebugString(bool hex_key) const {
     r.append("\n FullHistoryTsLow: ");
     r.append(Slice(full_history_ts_low_).ToString(hex_key));
   }
+  if (HasSubcompactionProgress()) {
+    r.append("\n SubcompactionProgress: ");
+    r.append(subcompaction_progress_.ToString());
+  }
   r.append("\n}\n");
   return r;
 }
@@ -1124,6 +1128,10 @@ std::string VersionEdit::DebugJSON(int edit_num, bool hex_key) const {
     jw << "FullHistoryTsLow" << Slice(full_history_ts_low_).ToString(hex_key);
   }
 
+  if (HasSubcompactionProgress()) {
+    jw << "SubcompactionProgress" << subcompaction_progress_.ToString();
+  }
+
   jw.EndObject();
 
   return jw.Get();
diff --git a/db/version_edit.h b/db/version_edit.h
index 0963b5754814..8ed83cc4a8ed 100644
--- a/db/version_edit.h
+++ b/db/version_edit.h
@@ -549,8 +549,28 @@ struct SubcompactionProgress {
   std::string ToString() const {
     std::ostringstream oss;
     oss << "SubcompactionProgress{";
-    oss << " next_internal_key_to_compact"
-        << (next_internal_key_to_compact.empty() ? " empty" : " non-empty");
+    oss << " next_internal_key_to_compact=";
+    if (next_internal_key_to_compact.empty()) {
+      oss << "";
+    } else {
+      ParsedInternalKey parsed_key;
+      Slice key_slice(next_internal_key_to_compact);
+      if (ParseInternalKey(key_slice, &parsed_key, false /* log_err_key */)
+              .ok()) {
+        oss << "user_key=\"" << parsed_key.user_key.ToString(false /* hex */)
+            << "\" (hex:" << parsed_key.user_key.ToString(true /* hex */)
+            << ")";
+        oss << ", seq=";
+        if (parsed_key.sequence == kMaxSequenceNumber) {
+          oss << "kMaxSequenceNumber";
+        } else {
+          oss << parsed_key.sequence;
+        }
+        oss << ", type=" << static_cast<int>(parsed_key.type);
+      } else {
+        oss << "raw=" << key_slice.ToString(true /* hex */);
+      }
+    }
     oss << ", num_processed_input_records=" << num_processed_input_records;
     oss << ", output_level_progress=" << output_level_progress.ToString();
     oss << ", proximal_output_level_progress="
diff --git a/tools/ldb_cmd.cc b/tools/ldb_cmd.cc
index 50051198d770..96842a86e3a5 100644
--- a/tools/ldb_cmd.cc
+++ b/tools/ldb_cmd.cc
@@ -427,6 +427,10 @@ LDBCommand* LDBCommand::SelectCommand(const ParsedParams& parsed_params) {
     return new UpdateManifestCommand(parsed_params.cmd_params,
                                      parsed_params.option_map,
                                      parsed_params.flags);
+  } else if (parsed_params.cmd == CompactionProgressDumpCommand::Name()) {
+    return new CompactionProgressDumpCommand(parsed_params.cmd_params,
+                                             parsed_params.option_map,
+                                             parsed_params.flags);
   }
   return nullptr;
 }
@@ -1618,6 +1622,57 @@ void DumpManifestFile(Options options, std::string file, bool verbose, bool hex,
   }
 }
 
+void DumpCompactionProgressFile(const std::string& file_path) {
+  Status s;
+  std::unique_ptr<SequentialFileReader> file_reader;
+
+  std::unique_ptr<FSSequentialFile> file;
+  const std::shared_ptr<FileSystem>& fs = Env::Default()->GetFileSystem();
+  s = fs->NewSequentialFile(file_path, FileOptions(), &file, nullptr);
+  if (!s.ok()) {
+    fprintf(stderr, "Failed to open compaction progress file %s: %s\n",
+            file_path.c_str(), s.ToString().c_str());
+    return;
+  }
+
+  file_reader = std::make_unique<SequentialFileReader>(std::move(file),
+                                                       file_path, 0, nullptr);
+
+  log::Reader reader(nullptr, std::move(file_reader), nullptr,
+                     true /* checksum */, 0);
+
+  Slice record;
+  std::string scratch;
+  int count = 0;
+
+  fprintf(stdout, "Compaction Progress File: %s\n", file_path.c_str());
+  fprintf(stdout, "============================================\n");
+
+  while (reader.ReadRecord(&record, &scratch)) {
+    VersionEdit edit;
+    s = edit.DecodeFrom(record);
+    if (!s.ok()) {
+      fprintf(stderr, "Failed to decode VersionEdit: %s\n",
+              s.ToString().c_str());
+      continue;
+    }
+
+    if (edit.HasSubcompactionProgress()) {
+      fprintf(stdout, "Progress Record %d:\n", count);
+      const auto& progress = edit.GetSubcompactionProgress();
+      fprintf(stdout, "%s\n", progress.ToString().c_str());
+      ++count;
+    }
+  }
+
+  if (count == 0) {
+    fprintf(stdout,
+            "No valid records found in the compaction progress file.\n");
+  } else {
+    fprintf(stdout, "\nTotal records: %d\n", count);
+  }
+}
+
 }  // namespace
 
 const std::string ManifestDumpCommand::ARG_VERBOSE = "verbose";
@@ -5303,4 +5358,35 @@ void UpdateManifestCommand::DoCommand() {
   }
 }
 
+const std::string CompactionProgressDumpCommand::ARG_PATH = "path";
+
+CompactionProgressDumpCommand::CompactionProgressDumpCommand(
+    const std::vector<std::string>& /*params*/,
+    const std::map<std::string, std::string>& options,
+    const std::vector<std::string>& flags)
+    : LDBCommand(options, flags, false, BuildCmdLineOptions({ARG_PATH})) {
+  auto itr = options.find(ARG_PATH);
+  if (itr != options.end()) {
+    path_ = itr->second;
+  } else {
+    path_ = "";
+  }
+
+  if (path_.empty()) {
+    exec_state_ = LDBCommandExecuteResult::Failed(
+        "The --path option is required for compaction_progress_dump command");
+  }
+}
+
+void CompactionProgressDumpCommand::Help(std::string& ret) {
+  ret.append("  ");
+  ret.append(CompactionProgressDumpCommand::Name());
+  ret.append(" [--" + ARG_PATH + "=<path_to_compaction_progress_file>]");
+  ret.append("\n");
+}
+
+void CompactionProgressDumpCommand::DoCommand() {
+  DumpCompactionProgressFile(path_);
+}
+
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/tools/ldb_cmd_impl.h b/tools/ldb_cmd_impl.h
index 3f7273dd5447..1a30d402cee9 100644
--- a/tools/ldb_cmd_impl.h
+++ b/tools/ldb_cmd_impl.h
@@ -814,4 +814,25 @@ class UnsafeRemoveSstFileCommand : public LDBCommand {
   uint64_t sst_file_number_;
 };
 
+class CompactionProgressDumpCommand : public LDBCommand {
+ public:
+  static std::string Name() { return "compaction_progress_dump"; }
+
+  CompactionProgressDumpCommand(
+      const std::vector<std::string>& params,
+      const std::map<std::string, std::string>& options,
+      const std::vector<std::string>& flags);
+
+  static void Help(std::string& ret);
+
+  void DoCommand() override;
+
+  bool NoDBOpen() override { return true; }
+
+ private:
+  std::string path_;
+
+  static const std::string ARG_PATH;
+};
+
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/tools/ldb_tool.cc b/tools/ldb_tool.cc
index ebf40e25d8ab..b2e19524e834 100644
--- a/tools/ldb_tool.cc
+++ b/tools/ldb_tool.cc
@@ -119,6 +119,7 @@ void LDBCommandRunner::PrintHelp(const LDBOptions& ldb_options,
   DBDumperCommand::Help(ret);
   DBLoaderCommand::Help(ret);
   ManifestDumpCommand::Help(ret);
+  CompactionProgressDumpCommand::Help(ret);
   UpdateManifestCommand::Help(ret);
   FileChecksumDumpCommand::Help(ret);
   GetPropertyCommand::Help(ret);

From 622186adecbb688c6d86a21b57795ade0a9cb8a3 Mon Sep 17 00:00:00 2001
From: Andrew Chang <andrewrchang@meta.com>
Date: Fri, 17 Oct 2025 11:12:35 -0700
Subject: [PATCH 341/500] Update error message for plain table reader max file
 size (#14056)

Summary:
Currently we return `File is too large for PlainTableReader!` when the file size exceeds our pre-defined constant. There was a request to have the file size information logged when this error is returned.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14056

Reviewed By: nmk70

Differential Revision: D84834869

Pulled By: archang19

fbshipit-source-id: 8f332b6a31d51f320c7e2db06ad49f50798ff70e
---
 table/plain/plain_table_reader.cc | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/table/plain/plain_table_reader.cc b/table/plain/plain_table_reader.cc
index 578e92aa3126..b90f24da6898 100644
--- a/table/plain/plain_table_reader.cc
+++ b/table/plain/plain_table_reader.cc
@@ -120,7 +120,9 @@ Status PlainTableReader::Open(
     bool full_scan_mode, const bool immortal_table,
     const SliceTransform* prefix_extractor) {
   if (file_size > PlainTableIndex::kMaxFileSize) {
-    return Status::NotSupported("File is too large for PlainTableReader!");
+    return Status::NotSupported("File size " + std::to_string(file_size) +
+                                " exceeds PlainTableReader max file size " +
+                                std::to_string(PlainTableIndex::kMaxFileSize));
   }
 
   std::unique_ptr<TableProperties> props;

From 8edb99f904d0c05f10878f0b137debe6c9d5a524 Mon Sep 17 00:00:00 2001
From: Hui Xiao <huixiao@fb.com>
Date: Fri, 17 Oct 2025 11:38:20 -0700
Subject: [PATCH 342/500] Statistics for successfully resumed compaction output
 bytes (#14054)

Summary:
Context/Summary: as titled

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14054

Test Plan: new UT, manually checking

Reviewed By: jaykorean

Differential Revision: D84828431

Pulled By: hx235

fbshipit-source-id: 56e1a9159f7597a10d6c549657d8b22788aa0599
---
 db/compaction/compaction_service_test.cc      | 20 ++++++++++++
 db/db_impl/db_impl_secondary.cc               | 32 +++++++++++++++++++
 db/db_impl/db_impl_secondary.h                |  3 ++
 include/rocksdb/statistics.h                  |  3 ++
 java/rocksjni/portal.h                        |  4 +++
 .../src/main/java/org/rocksdb/TickerType.java |  5 +++
 monitoring/statistics.cc                      |  1 +
 7 files changed, 68 insertions(+)

diff --git a/db/compaction/compaction_service_test.cc b/db/compaction/compaction_service_test.cc
index 7414b52bb609..1127244d6247 100644
--- a/db/compaction/compaction_service_test.cc
+++ b/db/compaction/compaction_service_test.cc
@@ -2269,6 +2269,8 @@ class ResumableCompactionServiceTest : public CompactionServiceTest {
 
     GenerateTestData();
 
+    ASSERT_OK(statistics->Reset());
+
     CompactRangeOptions cro;
     cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
     Status s = db_->CompactRange(cro, nullptr, nullptr);
@@ -2288,6 +2290,24 @@ class ResumableCompactionServiceTest : public CompactionServiceTest {
     ASSERT_TRUE(result.stats.is_manual_compaction);
     ASSERT_TRUE(result.stats.is_remote_compaction);
     ASSERT_GT(result.output_files.size(), 0);
+
+    uint64_t resumed_bytes =
+        statistics->getTickerCount(REMOTE_COMPACT_RESUMED_BYTES);
+    if (scenario ==
+        ResumableCompactionService::TestScenario::kCancelThenResume) {
+      // When resuming compaction, some bytes should be resumed from previous
+      // progress
+      ASSERT_GT(resumed_bytes, 0);
+    } else if (scenario == ResumableCompactionService::TestScenario::
+                               kCancelThenFreshStart) {
+      // When starting fresh (ignoring existing progress), no bytes should be
+      // resumed
+      ASSERT_EQ(resumed_bytes, 0);
+    } else {  // kMultipleCancelToggleResumption
+      // Phase 2 ran without resumption (fresh start), so Phase 3 has no
+      // progress to resume from. It processes all keys again from scratch.
+      ASSERT_EQ(resumed_bytes, 0);
+    }
   }
 
   void GenerateTestData() {
diff --git a/db/db_impl/db_impl_secondary.cc b/db/db_impl/db_impl_secondary.cc
index b13af01999cd..05337a019f3e 100644
--- a/db/db_impl/db_impl_secondary.cc
+++ b/db/db_impl/db_impl_secondary.cc
@@ -1226,6 +1226,26 @@ Status DBImplSecondary::PrepareCompactionProgressState() {
   }
 }
 
+uint64_t DBImplSecondary::CalculateResumedCompactionBytes(
+    const CompactionProgress& compaction_progress) const {
+  uint64_t total_resumed_bytes = 0;
+
+  for (const auto& subcompaction_progress : compaction_progress) {
+    for (const auto& file_meta :
+         subcompaction_progress.output_level_progress.GetOutputFiles()) {
+      total_resumed_bytes += file_meta.fd.file_size;
+    }
+
+    for (const auto& file_meta :
+         subcompaction_progress.proximal_output_level_progress
+             .GetOutputFiles()) {
+      total_resumed_bytes += file_meta.fd.file_size;
+    }
+  }
+
+  return total_resumed_bytes;
+}
+
 Status DBImplSecondary::HandleInvalidOrNoCompactionProgress(
     const std::optional<std::string>& compaction_progress_file_path,
     const CompactionProgressFilesScan& scan_result) {
@@ -1402,6 +1422,18 @@ Status DBImplSecondary::CompactWithoutInstallation(
 
   TEST_SYNC_POINT_CALLBACK("DBImplSecondary::CompactWithoutInstallation::End",
                            &s);
+
+  if (!compaction_progress_.empty() && s.ok()) {
+    uint64_t total_resumed_bytes =
+        CalculateResumedCompactionBytes(compaction_progress_);
+
+    if (total_resumed_bytes > 0 &&
+        immutable_db_options_.statistics != nullptr) {
+      RecordTick(immutable_db_options_.statistics.get(),
+                 REMOTE_COMPACT_RESUMED_BYTES, total_resumed_bytes);
+    }
+  }
+
   result->status = s;
   return s;
 }
diff --git a/db/db_impl/db_impl_secondary.h b/db/db_impl/db_impl_secondary.h
index 0476cf60be53..c523fd9b873f 100644
--- a/db/db_impl/db_impl_secondary.h
+++ b/db/db_impl/db_impl_secondary.h
@@ -381,6 +381,9 @@ class DBImplSecondary : public DBImpl {
       const std::string& temp_file_path, const std::string& final_file_path,
       std::unique_ptr<log::Writer>* compaction_progress_writer);
 
+  uint64_t CalculateResumedCompactionBytes(
+      const CompactionProgress& compaction_progress) const;
+
   // Cache log readers for each log number, used for continue WAL replay
   // after recovery
   std::map<uint64_t, std::unique_ptr<LogReaderContainer>> log_readers_;
diff --git a/include/rocksdb/statistics.h b/include/rocksdb/statistics.h
index db2ef6f79ade..1bd4f382b7a4 100644
--- a/include/rocksdb/statistics.h
+++ b/include/rocksdb/statistics.h
@@ -440,6 +440,9 @@ enum Tickers : uint32_t {
   REMOTE_COMPACT_READ_BYTES,
   REMOTE_COMPACT_WRITE_BYTES,
 
+  // Bytes of output files successfully resumed during compaction
+  REMOTE_COMPACT_RESUMED_BYTES,
+
   // Tiered storage related statistics
   HOT_FILE_READ_BYTES,
   WARM_FILE_READ_BYTES,
diff --git a/java/rocksjni/portal.h b/java/rocksjni/portal.h
index 86248606b248..094ac379b174 100644
--- a/java/rocksjni/portal.h
+++ b/java/rocksjni/portal.h
@@ -5195,6 +5195,8 @@ class TickerTypeJni {
         return -0x2F;
       case ROCKSDB_NAMESPACE::Tickers::REMOTE_COMPACT_WRITE_BYTES:
         return -0x30;
+      case ROCKSDB_NAMESPACE::Tickers::REMOTE_COMPACT_RESUMED_BYTES:
+        return -0x5F;
       case ROCKSDB_NAMESPACE::Tickers::HOT_FILE_READ_BYTES:
         return -0x31;
       case ROCKSDB_NAMESPACE::Tickers::WARM_FILE_READ_BYTES:
@@ -5668,6 +5670,8 @@ class TickerTypeJni {
         return ROCKSDB_NAMESPACE::Tickers::REMOTE_COMPACT_READ_BYTES;
       case -0x30:
         return ROCKSDB_NAMESPACE::Tickers::REMOTE_COMPACT_WRITE_BYTES;
+      case -0x5F:
+        return ROCKSDB_NAMESPACE::Tickers::REMOTE_COMPACT_RESUMED_BYTES;
       case -0x31:
         return ROCKSDB_NAMESPACE::Tickers::HOT_FILE_READ_BYTES;
       case -0x32:
diff --git a/java/src/main/java/org/rocksdb/TickerType.java b/java/src/main/java/org/rocksdb/TickerType.java
index 32c4cea2f974..e5fb81a138ba 100644
--- a/java/src/main/java/org/rocksdb/TickerType.java
+++ b/java/src/main/java/org/rocksdb/TickerType.java
@@ -901,6 +901,11 @@ public enum TickerType {
      */
     SST_USER_DEFINED_INDEX_LOAD_FAIL_COUNT((byte) -0x5E),
 
+    /**
+     * Bytes of output files successfully resumed during remote compaction
+     */
+    REMOTE_COMPACT_RESUMED_BYTES((byte) -0x5F),
+
     TICKER_ENUM_MAX((byte) -0x54);
 
     private final byte value;
diff --git a/monitoring/statistics.cc b/monitoring/statistics.cc
index d2d316bedae2..652080e59d85 100644
--- a/monitoring/statistics.cc
+++ b/monitoring/statistics.cc
@@ -224,6 +224,7 @@ const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
     {BACKUP_WRITE_BYTES, "rocksdb.backup.write.bytes"},
     {REMOTE_COMPACT_READ_BYTES, "rocksdb.remote.compact.read.bytes"},
     {REMOTE_COMPACT_WRITE_BYTES, "rocksdb.remote.compact.write.bytes"},
+    {REMOTE_COMPACT_RESUMED_BYTES, "rocksdb.remote.compact.resumed.bytes"},
     {HOT_FILE_READ_BYTES, "rocksdb.hot.file.read.bytes"},
     {WARM_FILE_READ_BYTES, "rocksdb.warm.file.read.bytes"},
     {COOL_FILE_READ_BYTES, "rocksdb.cool.file.read.bytes"},

From 3687dc4ad3963c4d10ded587080bfc2153ad2bdd Mon Sep 17 00:00:00 2001
From: ngina <221624547+nmk70@users.noreply.github.com>
Date: Fri, 17 Oct 2025 19:54:49 -0700
Subject: [PATCH 343/500] Add prefetch feature enum to FSSupportedOps (#13917)

Summary:
**Problem:** RocksDB was making unnecessary prefetch system calls on file systems that don't support prefetch operations, potentially leading to wasted CPU cycles.

**Fix:** Add kFSPrefetch to FSSupportedOps enum to allow file systems to indicate prefetch support capability. File systems can now opt out of prefetch calls by not setting this field.

**Backwards compatibility:** File systems that don't override SupportedOps() continue to receive prefetch calls exactly as before. Only file systems that explicitly opt out by not setting kFSPrefetch will avoid the calls.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13917

Test Plan:
- Added a new test in block_based_table_reader.
- Run existing tests: ```make prefetch_test && ./prefetch_test```

Reviewed By: anand1976

Differential Revision: D81607145

Pulled By: nmk70

fbshipit-source-id: 3bbefa05919034e8776ea4e4540cdc695cdc6d3f
---
 env/fs_posix.cc                               |  1 +
 include/rocksdb/file_system.h                 |  8 +-
 table/block_based/block_based_table_reader.h  |  6 +-
 .../block_based_table_reader_test.cc          | 85 +++++++++++++++++++
 table/block_based/block_prefetcher.cc         | 46 +++++-----
 .../new_features/fs_prefetch_support.md       |  1 +
 6 files changed, 124 insertions(+), 23 deletions(-)
 create mode 100644 unreleased_history/new_features/fs_prefetch_support.md

diff --git a/env/fs_posix.cc b/env/fs_posix.cc
index dcadafde1a0e..c93d9ce8675f 100644
--- a/env/fs_posix.cc
+++ b/env/fs_posix.cc
@@ -1272,6 +1272,7 @@ class PosixFileSystem : public FileSystem {
       supported_ops |= (1 << FSSupportedOps::kAsyncIO);
     }
 #endif
+    supported_ops |= (1 << FSSupportedOps::kFSPrefetch);
   }
 
 #if defined(ROCKSDB_IOURING_PRESENT)
diff --git a/include/rocksdb/file_system.h b/include/rocksdb/file_system.h
index b19c4786d482..8fbb8c4ab55a 100644
--- a/include/rocksdb/file_system.h
+++ b/include/rocksdb/file_system.h
@@ -90,6 +90,7 @@ enum FSSupportedOps {
   kVerifyAndReconstructRead,  // Supports a higher level of data integrity. See
                               // the verify_and_reconstruct_read flag in
                               // IOOptions.
+  kFSPrefetch,                // Supports prefetch operations
 };
 
 // Per-request options that can be passed down to the FileSystem
@@ -771,12 +772,13 @@ class FileSystem : public Customizable {
   //  If async_io is supported by the underlying FileSystem, then supported_ops
   //  will have corresponding bit (i.e FSSupportedOps::kAsyncIO) set to 1.
   //
-  // By default, async_io operation is set and FS should override this API and
-  // set all the operations they support provided in FSSupportedOps (including
-  // async_io).
+  // By default, async_io and prefetch operation are set and FS should override
+  // this API and set all the operations they support provided in FSSupportedOps
+  // (including async_io and prefetch).
   virtual void SupportedOps(int64_t& supported_ops) {
     supported_ops = 0;
     supported_ops |= (1 << FSSupportedOps::kAsyncIO);
+    supported_ops |= (1 << FSSupportedOps::kFSPrefetch);
   }
 
   // If you're adding methods here, remember to add them to EnvWrapper too.
diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h
index 946d7263485c..fb4bc998300b 100644
--- a/table/block_based/block_based_table_reader.h
+++ b/table/block_based/block_based_table_reader.h
@@ -610,7 +610,9 @@ struct BlockBasedTable::Rep {
         file_size(_file_size),
         level(_level),
         immortal_table(_immortal_table),
-        user_defined_timestamps_persisted(_user_defined_timestamps_persisted) {}
+        user_defined_timestamps_persisted(_user_defined_timestamps_persisted),
+        fs_prefetch_support(CheckFSFeatureSupport(
+            _ioptions.fs.get(), FSSupportedOps::kFSPrefetch)) {}
   ~Rep() { status.PermitUncheckedError(); }
   const ImmutableOptions& ioptions;
   const EnvOptions& env_options;
@@ -699,6 +701,8 @@ struct BlockBasedTable::Rep {
   // `end_key` for range deletion entries.
   const bool user_defined_timestamps_persisted;
 
+  const bool fs_prefetch_support;
+
   // Set to >0 when the file is known to be obsolete and should have its block
   // cache entries evicted on close. NOTE: when the file becomes obsolete,
   // there could be multiple table cache references that all mark this file as
diff --git a/table/block_based/block_based_table_reader_test.cc b/table/block_based/block_based_table_reader_test.cc
index 2785de86db82..9b40dd1d5f42 100644
--- a/table/block_based/block_based_table_reader_test.cc
+++ b/table/block_based/block_based_table_reader_test.cc
@@ -1623,6 +1623,91 @@ TEST_P(BlockBasedTableReaderMultiScanTest, MultiScanUnpinPreviousBlocks) {
   }
 }
 
+// Test that fs_prefetch_support flag is correctly initialized during table
+// construction based on filesystem capabilities
+TEST_P(BlockBasedTableReaderTest, FSPrefetchSupportInitializedCorrectly) {
+  class ConfigurablePrefetchFS : public FileSystemWrapper {
+   public:
+    ConfigurablePrefetchFS(const std::shared_ptr<FileSystem>& target,
+                           bool support_prefetch)
+        : FileSystemWrapper(target), support_prefetch_(support_prefetch) {}
+
+    static const char* kClassName() { return "ConfigurablePrefetchFS"; }
+    const char* Name() const override { return kClassName(); }
+
+    void SupportedOps(int64_t& supported_ops) override {
+      target()->SupportedOps(supported_ops);
+      if (!support_prefetch_) {  // Disable prefetch support if requested
+        supported_ops &= ~(1 << FSSupportedOps::kFSPrefetch);
+      }
+    }
+
+   private:
+    bool support_prefetch_;
+  };
+
+  // Prepare test table
+  Options options;
+  options.persist_user_defined_timestamps = persist_udt_;
+  if (udt_enabled_) {
+    options.comparator = test::BytewiseComparatorWithU64TsWrapper();
+  }
+  size_t ts_sz = options.comparator->timestamp_size();
+  auto kv = BlockBasedTableReaderBaseTest::GenerateKVMap(5, true, ts_sz);
+  std::string table_name = "BlockBasedTableReaderTest_BlockPrefetcherTest" +
+                           CompressionTypeToString(compression_type_);
+  ImmutableOptions ioptions(options);
+  CreateTable(table_name, ioptions, compression_type_, kv,
+              compression_parallel_threads_, compression_dict_bytes_);
+
+  // Test Case 1: Filesystem supports prefetch, fs_prefetch_support should be
+  // true
+  {
+    auto fs_with_prefetch = std::make_shared<ConfigurablePrefetchFS>(
+        env_->GetFileSystem(), true /* support_prefetch */);
+    std::unique_ptr<Env> env_wrapper(
+        new CompositeEnvWrapper(env_, fs_with_prefetch));
+    options.env = env_wrapper.get();
+
+    FileOptions fopts;
+    fopts.use_direct_reads = use_direct_reads_;
+    InternalKeyComparator cmp(options.comparator);
+    ImmutableOptions iopts(options);
+
+    std::unique_ptr<BlockBasedTable> table;
+    NewBlockBasedTableReader(fopts, iopts, cmp, table_name, &table,
+                             false /* prefetch_index_and_filter_in_cache */,
+                             nullptr, persist_udt_);
+
+    ASSERT_TRUE(table->get_rep()->fs_prefetch_support);
+    ASSERT_TRUE(CheckFSFeatureSupport(fs_with_prefetch.get(),
+                                      FSSupportedOps::kFSPrefetch));
+  }
+
+  // Test Case 2: Filesystem doesn't support prefetch, fs_prefetch_support
+  // should be false
+  {
+    auto fs_without_prefetch = std::make_shared<ConfigurablePrefetchFS>(
+        env_->GetFileSystem(), false /* support_prefetch */);
+    std::unique_ptr<Env> env_wrapper(
+        new CompositeEnvWrapper(env_, fs_without_prefetch));
+    options.env = env_wrapper.get();
+
+    FileOptions fopts;
+    fopts.use_direct_reads = use_direct_reads_;
+    InternalKeyComparator cmp(options.comparator);
+    ImmutableOptions iopts(options);
+
+    std::unique_ptr<BlockBasedTable> table;
+    NewBlockBasedTableReader(fopts, iopts, cmp, table_name, &table,
+                             false /* prefetch_index_and_filter_in_cache */,
+                             nullptr, persist_udt_);
+
+    ASSERT_FALSE(table->get_rep()->fs_prefetch_support);
+    ASSERT_FALSE(CheckFSFeatureSupport(fs_without_prefetch.get(),
+                                       FSSupportedOps::kFSPrefetch));
+  }
+}
 std::vector<BlockBasedTableReaderTestParam> GenerateCombinedParameters(
     const std::vector<CompressionType>& compression_types,
     const std::vector<bool>& use_direct_read_flags,
diff --git a/table/block_based/block_prefetcher.cc b/table/block_based/block_prefetcher.cc
index 38ec3a044179..bcebf5d36db0 100644
--- a/table/block_based/block_prefetcher.cc
+++ b/table/block_based/block_prefetcher.cc
@@ -44,12 +44,16 @@ void BlockPrefetcher::PrefetchIfNeeded(
       if (!s.ok()) {
         return;
       }
-      s = rep->file->Prefetch(opts, offset, len + compaction_readahead_size_);
-      if (s.ok()) {
-        readahead_limit_ = offset + len + compaction_readahead_size_;
-        return;
-      } else if (!s.IsNotSupported()) {
-        return;
+      if (rep->fs_prefetch_support) {
+        s = rep->file->Prefetch(opts, offset, len + compaction_readahead_size_);
+        if (s.ok()) {
+          readahead_limit_ = offset + len + compaction_readahead_size_;
+          return;
+        } else if (!s.IsNotSupported()) {
+          return;
+        }
+        // If FS prefetch returned NotSupported despite feature bit being set,
+        // fall through to use internal prefetch buffer.
       }
     }
     // If FS prefetch is not supported, fall back to use internal prefetch
@@ -142,19 +146,23 @@ void BlockPrefetcher::PrefetchIfNeeded(
   if (!s.ok()) {
     return;
   }
-  s = rep->file->Prefetch(
-      opts, handle.offset(),
-      BlockBasedTable::BlockSizeWithTrailer(handle) + readahead_size_);
-  if (s.IsNotSupported()) {
-    rep->CreateFilePrefetchBufferIfNotExists(
-        readahead_params, &prefetch_buffer_, readaheadsize_cb,
-        /*usage=*/FilePrefetchBufferUsage::kUserScanPrefetch);
-    return;
-  }
 
-  readahead_limit_ = offset + len + readahead_size_;
-  // Keep exponentially increasing readahead size until
-  // max_auto_readahead_size.
-  readahead_size_ = std::min(max_auto_readahead_size, readahead_size_ * 2);
+  if (rep->fs_prefetch_support) {
+    s = rep->file->Prefetch(
+        opts, handle.offset(),
+        BlockBasedTable::BlockSizeWithTrailer(handle) + readahead_size_);
+    if (s.ok()) {
+      readahead_limit_ = offset + len + readahead_size_;
+      // Keep exponentially increasing readahead size until
+      // max_auto_readahead_size.
+      readahead_size_ = std::min(max_auto_readahead_size, readahead_size_ * 2);
+      return;
+    }
+  }
+  // If FS prefetch is not supported or returned NotSupported, fall back to use
+  // internal prefetch buffer.
+  rep->CreateFilePrefetchBufferIfNotExists(
+      readahead_params, &prefetch_buffer_, readaheadsize_cb,
+      /*usage=*/FilePrefetchBufferUsage::kUserScanPrefetch);
 }
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/unreleased_history/new_features/fs_prefetch_support.md b/unreleased_history/new_features/fs_prefetch_support.md
new file mode 100644
index 000000000000..2dace9301ad2
--- /dev/null
+++ b/unreleased_history/new_features/fs_prefetch_support.md
@@ -0,0 +1 @@
+Add kFSPrefetch to FSSupportedOps enum to allow file systems to indicate prefetch support capability, avoiding unnecessary prefetch system calls on file systems that don't support them.

From a8a5ade6fabf56146d020012e700b229c86e9c37 Mon Sep 17 00:00:00 2001
From: Xingbo Wang <xbw@fb.com>
Date: Sun, 19 Oct 2025 21:24:17 -0700
Subject: [PATCH 344/500] Fix a nullptr access bug in MultiScan (#14062)

Summary:
Fixing a nullptr access in multiscan, under following situation.

```
Block Based Table: blk1:[k1,k2], blk2:[k3,                k8], blk3:[k9]
Scan ranges:            [k1,             k4), [k5,k6), [k7,            k10)
Prepared block ranges:  [0,2],                [2,2],   [1,3]
```

1. Seek key k1 on the first range, read key k1, k2.
2. Seek key k4 on the 2nd range, blocks 0,1 would be unpinned.
3. Seek key k9, block 1 would be accessed, but it is unpinned, which trigger assert failure in debug mode and nullptr access on release build.

This fix changes how blocks are unpinned. It is now only unpinning the block, when the cur_data_block_idx has passed it.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14062

Test Plan:
Unit Test
rand_seed 304010984 on UserDefinedIndexStressTest

Reviewed By: cbi42

Differential Revision: D84976410

Pulled By: xingbowang

fbshipit-source-id: 6b99bf85fc9d4108c5267ae77be77ccfe08923cd
---
 .../block_based/block_based_table_iterator.cc |  45 ++--
 .../block_based/block_based_table_iterator.h  |   3 -
 table/table_test.cc                           | 237 +++++++++++++++---
 .../multi_scan_page_unpin_bug_fix.md          |   1 +
 4 files changed, 226 insertions(+), 60 deletions(-)
 create mode 100644 unreleased_history/bug_fixes/multi_scan_page_unpin_bug_fix.md

diff --git a/table/block_based/block_based_table_iterator.cc b/table/block_based/block_based_table_iterator.cc
index e3d16ba4337f..10b4a70897f2 100644
--- a/table/block_based/block_based_table_iterator.cc
+++ b/table/block_based/block_based_table_iterator.cc
@@ -1157,10 +1157,6 @@ bool BlockBasedTableIterator::SeekMultiScanImpl(const Slice* seek_target) {
       // Seeking a range that is out side of prepared ranges.
       return out_of_bound;
     }
-    // unpin block, then do a seek.
-    if (multi_scan_->next_scan_idx > 0) {
-      UnpinPreviousScanBlocks(multi_scan_->next_scan_idx);
-    }
 
     auto [cur_scan_start_idx, cur_scan_end_idx] =
         multi_scan_->block_index_ranges_per_scan[multi_scan_->next_scan_idx];
@@ -1231,6 +1227,8 @@ void BlockBasedTableIterator::MultiScanUnexpectedSeekTarget(
 
 void BlockBasedTableIterator::MultiScanSeekTargetFromBlock(
     const Slice* seek_target, size_t block_idx) {
+  assert(multi_scan_->cur_data_block_idx <= block_idx);
+
   if (!block_iter_points_to_real_block_ ||
       multi_scan_->cur_data_block_idx != block_idx) {
     if (block_iter_points_to_real_block_) {
@@ -1245,32 +1243,20 @@ void BlockBasedTableIterator::MultiScanSeekTargetFromBlock(
       return;
     }
   }
-  multi_scan_->cur_data_block_idx = block_idx;
-  block_iter_points_to_real_block_ = true;
-  block_iter_.Seek(*seek_target);
-  FindKeyForward();
-}
 
-void BlockBasedTableIterator::UnpinPreviousScanBlocks(size_t current_scan_idx) {
-  // TODO: support aborting and clearn up async IO requests, currently
-  // only unpins already initialized blocks
-  assert(multi_scan_);
-  assert(current_scan_idx < multi_scan_->block_index_ranges_per_scan.size());
-  if (current_scan_idx == 0) return;
-
-  auto prev_start_block_idx = std::get<0>(
-      multi_scan_->block_index_ranges_per_scan[current_scan_idx - 1]);
-  // Since a block can be shared between consecutive scans, we need
-  // curr_start_block_idx here instead of just release blocks
-  // up to the end of previous range block index.
-  auto curr_start_block_idx =
-      std::get<0>(multi_scan_->block_index_ranges_per_scan[current_scan_idx]);
-  for (size_t block_idx = prev_start_block_idx;
-       block_idx < curr_start_block_idx; ++block_idx) {
-    if (!multi_scan_->pinned_data_blocks[block_idx].IsEmpty()) {
-      multi_scan_->pinned_data_blocks[block_idx].Reset();
+  // Move current data block index forward until block_idx, meantime, unpin all
+  // the blocks in between
+  while (multi_scan_->cur_data_block_idx < block_idx) {
+    // unpin block
+    if (!multi_scan_->pinned_data_blocks[multi_scan_->cur_data_block_idx]
+             .IsEmpty()) {
+      multi_scan_->pinned_data_blocks[multi_scan_->cur_data_block_idx].Reset();
     }
+    multi_scan_->cur_data_block_idx++;
   }
+  block_iter_points_to_real_block_ = true;
+  block_iter_.Seek(*seek_target);
+  FindKeyForward();
 }
 
 void BlockBasedTableIterator::FindBlockForwardInMultiScan() {
@@ -1307,6 +1293,11 @@ void BlockBasedTableIterator::FindBlockForwardInMultiScan() {
     }
     // Move to the next pinned data block
     ResetDataIter();
+    // Unpin previous block if it is not reset by data iterator
+    if (!multi_scan_->pinned_data_blocks[multi_scan_->cur_data_block_idx]
+             .IsEmpty()) {
+      multi_scan_->pinned_data_blocks[multi_scan_->cur_data_block_idx].Reset();
+    }
     ++multi_scan_->cur_data_block_idx;
 
     if (MultiScanLoadDataBlock(multi_scan_->cur_data_block_idx)) {
diff --git a/table/block_based/block_based_table_iterator.h b/table/block_based/block_based_table_iterator.h
index 299c54f74b40..85e2f8d90923 100644
--- a/table/block_based/block_based_table_iterator.h
+++ b/table/block_based/block_based_table_iterator.h
@@ -620,9 +620,6 @@ class BlockBasedTableIterator : public InternalIteratorBase<Slice> {
 
   void FindBlockForwardInMultiScan();
 
-  // Unpins blocks from the immediately previous scan range.
-  void UnpinPreviousScanBlocks(size_t current_scan_idx);
-
   void PrepareReadAsyncCallBack(FSReadRequest& req, void* cb_arg) {
     // Record status, result and sanity check offset from `req`.
     AsyncReadState* async_state = static_cast<AsyncReadState*>(cb_arg);
diff --git a/table/table_test.cc b/table/table_test.cc
index f207387d2500..869e6030ddb4 100644
--- a/table/table_test.cc
+++ b/table/table_test.cc
@@ -87,6 +87,7 @@ namespace ROCKSDB_NAMESPACE {
 namespace {
 
 const std::string kDummyValue(10000, 'o');
+constexpr auto kVerbose = false;
 
 // DummyPropertiesCollector used to test BlockBasedTableProperties
 class DummyPropertiesCollector : public TablePropertiesCollector {
@@ -934,7 +935,6 @@ class HarnessTest : public testing::Test {
 
   void TestRandomAccess(Random* rnd, const std::vector<std::string>& keys,
                         const stl_wrappers::KVMap& data) {
-    static const bool kVerbose = false;
     InternalIterator* iter = constructor_->NewIterator();
     ASSERT_TRUE(!iter->Valid());
     stl_wrappers::KVMap::const_iterator model_iter = data.begin();
@@ -7836,7 +7836,6 @@ class UserDefinedIndexTestBase : public BlockBasedTableTestBase {
     read_opts.iterate_upper_bound = &ub;
     std::unique_ptr<Iterator> iter(db->NewIterator(read_opts, cfh));
     iter->Prepare(scan_opts);
-    static const bool kVerbose = false;
     for (auto opt : opts) {
       ub = opt.range.limit.value();
       iter->Seek(opt.range.start.value());
@@ -8988,8 +8987,6 @@ std::ostream& operator<<(std::ostream& os,
             << param.enable_compaction_with_sst_partitioner << "}";
 }
 
-constexpr auto kVerbose = false;
-
 struct DataRange {
   size_t start;  // inclusive
   size_t end;    // exclusive
@@ -9140,23 +9137,47 @@ class UserDefinedIndexStressTest
   }
 
   void CreateSstFileWithRanges(const std::string& ingest_file,
-                               const DataRange& range) {
-    std::unique_ptr<SstFileWriter> writer =
-        std::make_unique<SstFileWriter>(EnvOptions(), options_);
-    ASSERT_OK(writer->Open(ingest_file));
+                               const std::vector<DataRange>& ranges,
+                               bool& data_added) {
+    std::unique_ptr<SstFileWriter> writer;
+    data_added = false;
 
-    assert(range.start != range.end);
+    std::vector<DataRange> ranges_in_file;
 
-    if (range.is_range_delete) {
-      ASSERT_OK(writer->DeleteRange(range.start_key, range.end_key));
-    } else {
-      for (size_t i = range.start; i != range.end;) {
-        auto key = FormatKey(i);
-        range.start < range.end ? i++ : i--;
-        ASSERT_OK(writer->Put(key, range.value));
+    for (auto const& range : ranges) {
+      assert(range.start != range.end);
+      if (range.skipped) {
+        continue;
+      }
+
+      if (writer == nullptr) {
+        writer = std::make_unique<SstFileWriter>(EnvOptions(), options_);
+        ASSERT_OK(writer->Open(ingest_file));
+      }
+      ranges_in_file.push_back(range);
+
+      data_added = true;
+
+      if (range.is_range_delete) {
+        ASSERT_OK(writer->DeleteRange(range.start_key, range.end_key));
+      } else {
+        for (size_t i = range.start; i != range.end;) {
+          auto key = FormatKey(i);
+          range.start < range.end ? i++ : i--;
+          ASSERT_OK(writer->Put(key, range.value));
+        }
+      }
+    }
+    if (kVerbose) {
+      std::cout << "Ingested file: " + ingest_file + "; Range: {" << std::endl;
+      for (const auto& range : ranges_in_file) {
+        std::cout << "    " << range.ToString() << "," << std::endl;
       }
+      std::cout << "}" << std::endl;
+    }
+    if (data_added) {
+      ASSERT_OK(writer->Finish());
     }
-    ASSERT_OK(writer->Finish()) << range.ToString();
   }
 
   void RangeScan(std::unique_ptr<Iterator>& iter,
@@ -9276,17 +9297,42 @@ class UserDefinedIndexStressTest
   void IngestFilesInOneLevel(const std::vector<DataRange>& ranges_in_level,
                              const std::string& ingest_file_name_prefix,
                              size_t& ingest_file_count,
-                             const IngestExternalFileOptions& ifo) {
+                             const IngestExternalFileOptions& ifo,
+                             bool combine_ranges = false) {
     std::vector<std::string> ingest_files;
     // Generate SST file and bulk load them one level at a time
-    for (auto const& range : ranges_in_level) {
-      if (!range.skipped) {
+    if (combine_ranges) {
+      size_t i = 0;
+      while (i < ranges_in_level.size()) {
+        // if combine ranges, generate 1 SST file that combines muliple ranges
+        // together
+        size_t batch_end_idx =
+            std::min(i + rnd.Uniform(3) + 2, ranges_in_level.size());
+        bool data_added = false;
         ASSERT_NO_FATAL_FAILURE(CreateSstFileWithRanges(
             ingest_file_name_prefix + std::to_string(ingest_file_count),
-            range));
-        ingest_files.push_back(ingest_file_name_prefix +
-                               std::to_string(ingest_file_count));
-        ingest_file_count++;
+            {ranges_in_level.begin() + i,
+             ranges_in_level.begin() + batch_end_idx},
+            data_added));
+        if (data_added) {
+          ingest_files.push_back(ingest_file_name_prefix +
+                                 std::to_string(ingest_file_count));
+          ingest_file_count++;
+        }
+        i = batch_end_idx;
+      }
+    } else {
+      for (auto const& range : ranges_in_level) {
+        if (!range.skipped) {
+          bool data_added = false;
+          ASSERT_NO_FATAL_FAILURE(CreateSstFileWithRanges(
+              ingest_file_name_prefix + std::to_string(ingest_file_count),
+              {range}, data_added));
+          ASSERT_TRUE(data_added);
+          ingest_files.push_back(ingest_file_name_prefix +
+                                 std::to_string(ingest_file_count));
+          ingest_file_count++;
+        }
       }
     }
 
@@ -9323,9 +9369,9 @@ class UserDefinedIndexStressTest
 // Becuase query count == 2, level n+1 would only prepare 3-5. but since 4-6
 // got deleted in the upper level, they are not returned, so only 3 is
 // returned. Meantime the query should have return [3, 6]
-// One way to fix this is by preparing more data blocks once prepared blocks are
-// exhausted, but upper bound is not reached yet.
-// This requires following changes:
+// One way to fix this is by preparing more data blocks once prepared blocks
+// are exhausted, but upper bound is not reached yet. This requires following
+// changes:
 // 1. Fix out of bound flag in block table iterator. Only set it if the key is
 // larger than the upper bound.
 // 2. Refactor the prepared block single dimension vector into 2 dimension of
@@ -9358,12 +9404,126 @@ TEST_P(UserDefinedIndexStressTest, DISABLED_PartialDeleteRange) {
   ASSERT_NO_FATAL_FAILURE(ValidateQueryResult());
 }
 
+TEST_P(UserDefinedIndexStressTest, DeleteRangeMixedWithDataFile) {
+  // Create 2 column families. One use normal put/del, the other uses sst
+  // ingest.
+  // Test the case where there are 3 levels, the middle level is a delete
+  // range file that span across the entire key space. The top level file have
+  // multiple files and each one has both data and delete range Scan same
+  // range between the 2 CF and validate the result is same
+  SCOPED_TRACE("Start with random seed: " + std::to_string(rand_seed_));
+  dbname_ = test::PerThreadDBPath(
+      "UserDefinedIndexStressTest_DeleteRangeMixedWithDataFile");
+  SCOPED_TRACE("dbname: " + dbname_);
+  ASSERT_NO_FATAL_FAILURE(SetupDB(dbname_));
+
+  // Test 3 levels.
+  // bottom level is normal data files.
+  ranges_in_levels_.push_back(GenerateKeyRanges(rnd.Uniform(3) + 4, 2, "L6"));
+  // middle level delete range between each level
+  if (is_reverse_comparator_) {
+    ranges_in_levels_.push_back({{.start = 100,
+                                  .end = 0,
+                                  .is_range_delete = true,
+                                  .skipped = false,
+                                  .start_key = "keyz",
+                                  .end_key = "key"}});
+  } else {
+    ranges_in_levels_.push_back({{.start = 0,
+                                  .end = 100,
+                                  .is_range_delete = true,
+                                  .skipped = false,
+                                  .start_key = "key",
+                                  .end_key = "keyz"}});
+  }
+
+  // Top level is normal data files
+  ranges_in_levels_.push_back(GenerateKeyRanges(rnd.Uniform(3) + 4, 2, "L4"));
+
+  IngestExternalFileOptions ifo;
+  ifo.snapshot_consistency = false;
+  auto ingest_file_name_prefix = dbname_ + "ingest_file_";
+  size_t ingest_file_count = 0;
+  auto first_level = true;
+  for (auto const& ranges_in_level : ranges_in_levels_) {
+    ASSERT_NO_FATAL_FAILURE(
+        IngestFilesInOneLevel(ranges_in_level, ingest_file_name_prefix,
+                              ingest_file_count, ifo, true));
+    if (first_level) {
+      first_level = false;
+      if (enable_compaction_with_sst_partitioner_) {
+        // When compaction is enabled, do a compaction at the first level
+        ASSERT_NO_FATAL_FAILURE(CompactIngestedCF());
+      }
+    }
+  }
+
+  ASSERT_NO_FATAL_FAILURE(AddDataToRegularCF());
+
+  ASSERT_NO_FATAL_FAILURE(ValidateQueryResult());
+}
+
 TEST_P(UserDefinedIndexStressTest, DeleteRange) {
   // Create 2 column families. One use normal put/del, the other uses sst
   // ingest.
-  // Test the case where there are 3 levels, the middle level is a delete range
-  // file that span across the entire key space.
-  // Range scan same range between the 2 CF and validate the result is same
+  // Test the case where there are 3 levels, the middle level is a delete
+  // range file that span across the entire key space. Range scan same range
+  // between the 2 CF and validate the result is same
+  SCOPED_TRACE("Start with random seed: " + std::to_string(rand_seed_));
+  dbname_ = test::PerThreadDBPath("UserDefinedIndexStressTest_DeleteRange");
+  SCOPED_TRACE("dbname: " + dbname_);
+  ASSERT_NO_FATAL_FAILURE(SetupDB(dbname_));
+
+  // Test 3 levels.
+  // bottom level is normal data files.
+  ranges_in_levels_.push_back(GenerateKeyRanges(rnd.Uniform(3) + 4, 2, "L6"));
+  // middle level delete range between each level
+  if (is_reverse_comparator_) {
+    ranges_in_levels_.push_back({{.start = 100,
+                                  .end = 0,
+                                  .is_range_delete = true,
+                                  .skipped = false,
+                                  .start_key = "keyz",
+                                  .end_key = "key"}});
+  } else {
+    ranges_in_levels_.push_back({{.start = 0,
+                                  .end = 100,
+                                  .is_range_delete = true,
+                                  .skipped = false,
+                                  .start_key = "key",
+                                  .end_key = "keyz"}});
+  }
+  // Top level is normal data files
+  ranges_in_levels_.push_back(GenerateKeyRanges(rnd.Uniform(3) + 4, 2, "L4"));
+
+  IngestExternalFileOptions ifo;
+  ifo.snapshot_consistency = false;
+  auto ingest_file_name_prefix = dbname_ + "ingest_file_";
+  size_t ingest_file_count = 0;
+  auto first_level = true;
+  for (auto const& ranges_in_level : ranges_in_levels_) {
+    ASSERT_NO_FATAL_FAILURE(IngestFilesInOneLevel(
+        ranges_in_level, ingest_file_name_prefix, ingest_file_count, ifo));
+    if (first_level) {
+      first_level = false;
+      if (enable_compaction_with_sst_partitioner_) {
+        // When compaction is enabled, do a compaction at the first level
+        ASSERT_NO_FATAL_FAILURE(CompactIngestedCF());
+      }
+    }
+  }
+
+  ASSERT_NO_FATAL_FAILURE(AddDataToRegularCF());
+
+  ASSERT_NO_FATAL_FAILURE(ValidateQueryResult());
+}
+
+TEST_P(UserDefinedIndexStressTest, AtomicReplaceBulkLoad) {
+  // Create 2 column families. One use normal put/del, the other uses sst
+  // ingest.
+  // Test the case where there are 3 levels, the middle level is a delete
+  // range file that span across the entire key space. Range scan same range
+  // between the 2 CF and validate the result is same
   SCOPED_TRACE("Start with random seed: " + std::to_string(rand_seed_));
   dbname_ = test::PerThreadDBPath("UserDefinedIndexStressTest_DeleteRange");
   SCOPED_TRACE("dbname: " + dbname_);
@@ -9408,6 +9568,23 @@ TEST_P(UserDefinedIndexStressTest, DeleteRange) {
     }
   }
 
+  // Ingest the a new file with atomic replace with full key space, this layer
+  // is exactly same as the one at Level 4
+  bool data_added;
+  ASSERT_NO_FATAL_FAILURE(CreateSstFileWithRanges(
+      ingest_file_name_prefix + std::to_string(++ingest_file_count),
+      ranges_in_levels_[2], data_added));
+
+  IngestExternalFileArg ingest_arg;
+  ingest_arg.column_family = ingest_cfh_;
+  ingest_arg.options = ifo;
+  ingest_arg.external_files.push_back(ingest_file_name_prefix +
+                                      std::to_string(ingest_file_count));
+  ingest_arg.atomic_replace_range = RangeOpt(nullptr, nullptr);
+
+  ASSERT_OK(db_->IngestExternalFiles(
+      std::vector<IngestExternalFileArg>({ingest_arg})));
+
   ASSERT_NO_FATAL_FAILURE(AddDataToRegularCF());
 
   ASSERT_NO_FATAL_FAILURE(ValidateQueryResult());
diff --git a/unreleased_history/bug_fixes/multi_scan_page_unpin_bug_fix.md b/unreleased_history/bug_fixes/multi_scan_page_unpin_bug_fix.md
new file mode 100644
index 000000000000..3fed513995fa
--- /dev/null
+++ b/unreleased_history/bug_fixes/multi_scan_page_unpin_bug_fix.md
@@ -0,0 +1 @@
+Fix a bug in Page unpinning in MultiScan

From f343f7ecdc1ad2c13858b62b6e6409fd6df6c0a0 Mon Sep 17 00:00:00 2001
From: Xingbo Wang <xbw@fb.com>
Date: Mon, 20 Oct 2025 10:37:08 -0700
Subject: [PATCH 345/500] Use ccache to accelerate windows build (#14064)

Summary:
With cache hit and compiler option optimization, the compilation time build time is reduced from 40 min to 2 min. Overall build time is reduced from 60 min to less 20 minutes on cache hit on majority of the source file. On 100% cache miss, it would be around 40 minutes.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14064

Test Plan: Github CI

Reviewed By: mszeszko-meta

Differential Revision: D85023882

Pulled By: xingbowang

fbshipit-source-id: 98551880c98f14d36133ff43e6af8c3be94ab465
---
 .../actions/windows-build-steps/action.yml    | 19 ++++++++-
 CMakeLists.txt                                | 40 +++++++++++++------
 Directory.Build.props                         |  9 +++++
 ccache_msvc_compiler.bat                      |  1 +
 4 files changed, 54 insertions(+), 15 deletions(-)
 create mode 100644 Directory.Build.props
 create mode 100644 ccache_msvc_compiler.bat

diff --git a/.github/actions/windows-build-steps/action.yml b/.github/actions/windows-build-steps/action.yml
index 0986099ce9a2..699d4aa0e580 100644
--- a/.github/actions/windows-build-steps/action.yml
+++ b/.github/actions/windows-build-steps/action.yml
@@ -4,6 +4,16 @@ runs:
   steps:
   - name: Add msbuild to PATH
     uses: microsoft/setup-msbuild@v1.3.1
+  - name: Cache ccache directory
+    id: ccache-cache
+    uses: actions/cache@v4
+    with:
+      path: C:\a\rocksdb\rocksdb\.ccache
+      key: rocksdb-build-${{ runner.os }}-${{ runner.arch }}-ccache-${{ hashFiles('CMakeLists.txt', 'cmake/**/*.cmake') }}-v1
+  - name: ccache
+    uses: hendrikmuhs/ccache-action@v1.2
+    with:
+      max-size: "10GB"
   - name: Custom steps
     env:
       THIRDPARTY_HOME: ${{ github.workspace }}/thirdparty
@@ -38,11 +48,12 @@ runs:
       $env:Path = $env:JAVA_HOME + ";" + $env:Path
       mkdir build
       cd build
-      & cmake -G "$Env:CMAKE_GENERATOR" -DCMAKE_BUILD_TYPE=Debug -DOPTDBG=1 -DPORTABLE="$Env:CMAKE_PORTABLE" -DSNAPPY=1 -DXPRESS=1 -DJNI=1 ..
+      & cmake -G "$Env:CMAKE_GENERATOR" -DCMAKE_BUILD_TYPE=Debug -DWIN_CI=1 -DPORTABLE="$Env:CMAKE_PORTABLE" -DSNAPPY=1 -DXPRESS=1 -DJNI=1 ..
       if(!$?) { Exit $LASTEXITCODE }
       cd ..
       echo "Building with VS version: $Env:CMAKE_GENERATOR"
-      msbuild build/rocksdb.sln -maxCpuCount -property:Configuration=Debug -property:Platform=x64
+      # use more parallel processes than the number of processes available, as most of the compile command would be cache hit
+      msbuild build/rocksdb.sln /m:32 /p:LinkIncremental=false -property:Configuration=Debug -property:Platform=x64
       if(!$?) { Exit $LASTEXITCODE }
       echo ========================= Test RocksDB =========================
       build_tools\run_ci_db_test.ps1 -SuiteRun arena_test,db_basic_test,db_test,db_test2,db_merge_operand_test,bloom_test,c_test,coding_test,crc32c_test,dynamic_bloom_test,env_basic_test,env_test,hash_test,random_test -Concurrency 16
@@ -52,3 +63,7 @@ runs:
       & ctest -C Debug -j 16
       if(!$?) { Exit $LASTEXITCODE }
     shell: pwsh
+  - name: Show ccache stats
+    shell: pwsh
+    run: |
+      ccache --show-stats -v
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 44c564481589..e0bbbc4c5cfc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -203,9 +203,16 @@ if(WIN32 AND MSVC)
   endif()
 endif()
 
+option(WIN_CI "Accelerate build speed and reduce build artifect size for github CI with MSVC" OFF)
+
 if(MSVC)
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zi /nologo /EHsc /GS /Gd /GR /GF /fp:precise /Zc:wchar_t /Zc:forScope /errorReport:queue")
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /FC /d2Zi+ /W4 /wd4127 /wd4996 /wd4100 /wd4324")
+  if(WIN_CI)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP /nologo /EHsc /Gd /GR /GF /fp:precise /Zc:wchar_t /Zc:forScope /errorReport:queue")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /FC /W4 /wd4127 /wd4996 /wd4100 /wd4324 /wd4702")
+  else()
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zi /nologo /EHsc /GS /Gd /GR /GF /fp:precise /Zc:wchar_t /Zc:forScope /errorReport:queue")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /FC /d2Zi+ /W4 /wd4127 /wd4996 /wd4100 /wd4324")
+  endif()
 else()
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -W -Wextra -Wall -pthread")
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wsign-compare -Wshadow -Wno-unused-parameter -Wno-unused-variable -Woverloaded-virtual -Wnon-virtual-dtor -Wno-missing-field-initializers -Wno-strict-aliasing -Wno-invalid-offsetof")
@@ -450,24 +457,33 @@ else()
   endif()
 endif()
 
-# Used to run CI build and tests so we can run faster
+# Used to run optimized debug build and tests so we can run faster
 option(OPTDBG "Build optimized debug build with MSVC" OFF)
 option(WITH_RUNTIME_DEBUG "build with debug version of runtime library" ON)
 if(MSVC)
-  if(OPTDBG)
+  if (WIN_CI)
     message(STATUS "Debug optimization is enabled")
     set(CMAKE_CXX_FLAGS_DEBUG "/Oxt")
+    set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} /DEBUG:FASTLINK")
+    set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /DEBUG:FASTLINK")
   else()
-    set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /Od /RTC1")
-
-    # Minimal Build is deprecated after MSVC 2015
-    if( MSVC_VERSION GREATER 1900 )
-      set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /Gm-")
+    if(OPTDBG)
+      message(STATUS "Debug optimization is enabled")
+      set(CMAKE_CXX_FLAGS_DEBUG "/Oxt")
     else()
-      set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /Gm")
-    endif()
+      set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /Od /RTC1")
 
+      # Minimal Build is deprecated after MSVC 2015
+      if( MSVC_VERSION GREATER 1900 )
+        set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /Gm-")
+      else()
+        set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /Gm")
+      endif()
+    endif()
+    set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} /DEBUG")
+    set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /DEBUG")
   endif()
+
   if(WITH_RUNTIME_DEBUG)
     set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /${RUNTIME_LIBRARY}d")
   else()
@@ -475,8 +491,6 @@ if(MSVC)
   endif()
   set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Oxt /Zp8 /Gm- /Gy /${RUNTIME_LIBRARY}")
 
-  set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} /DEBUG")
-  set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /DEBUG")
 endif()
 
 if(CMAKE_COMPILER_IS_GNUCXX)
diff --git a/Directory.Build.props b/Directory.Build.props
new file mode 100644
index 000000000000..5862fb2c2f45
--- /dev/null
+++ b/Directory.Build.props
@@ -0,0 +1,9 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project>
+    <PropertyGroup>
+        <CLToolExe>ccache_msvc_compiler.bat</CLToolExe>
+        <CLToolPath>$(MSBuildThisFileDirectory)</CLToolPath>
+        <UseMultiToolTask>true</UseMultiToolTask>
+        <EnforceProcessCountAcrossBuilds>true</EnforceProcessCountAcrossBuilds>
+    </PropertyGroup>
+</Project>
diff --git a/ccache_msvc_compiler.bat b/ccache_msvc_compiler.bat
new file mode 100644
index 000000000000..9501ec592bc4
--- /dev/null
+++ b/ccache_msvc_compiler.bat
@@ -0,0 +1 @@
+ccache.exe cl.exe %*

From 6d9b526551ef660dd02bfe7ed41cfb8e50c374dd Mon Sep 17 00:00:00 2001
From: Hui Xiao <huixiao@fb.com>
Date: Mon, 20 Oct 2025 15:11:45 -0700
Subject: [PATCH 346/500] Add OpenAndCompact() to db_bench (#14003)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Summary:
**Context/Summary:** as titled.

This can be used to benchmark OpenAndCompact() and OpenAndCompactionOptions::allow_resumption. See below for usage.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14003

Test Plan:
1. Simple OpenAndCompact()
```
openandcompact_allow_resumption=false

./db_bench --use_existing_db=true --db=$db --benchmarks=openandcompact --openandcompact_test_cancel_on_odd=false --openandcompact_cancel_after_millseconds=0 --openandcompact_allow_resumption=$openandcompact_allow_resumption  --disable_auto_compactions=true --compression_type=none --secondary_path=$secondary_path

...
DB path: [/dev/shm/test]

Input files: 101 files, 10000 keys
OpenAndCompact() API call : 39746440.000 micros/op 39.746 seconds/op
OpenAndCompact status: OK
Output: 358 files, average size: 69835396 bytes (66.60 MB)
openandcompact : 39977603.000 micros/op 0 ops/sec 39.978 seconds 1 operations;
```

2. OpenAndCompact() with cancellation (after the whole compaction essentially finishes) and resumption
```
openandcompact_allow_resumption=true
./db_bench --use_existing_db=true --db=$db --benchmarks=openandcompact[X2] --openandcompact_test_cancel_on_odd=false --openandcompact_cancel_after_millseconds=0 --openandcompact_allow_resumption=$openandcompact_allow_resumption  --disable_auto_compactions=true --compression_type=none --secondary_path=$secondary_path

..
DB path: [/dev/shm/test]
Running benchmark for 2 times

Input files: 101 files, 10000 keys
OpenAndCompact() API call : 40095045.000 micros/op 40.095 seconds/op
OpenAndCompact status: OK
Output: 358 files, average size: 69835396 bytes (66.60 MB)
openandcompact : 41471226.000 micros/op 0 ops/sec 41.471 seconds 1 operations;

Input files: 101 files, 10000 keys
OpenAndCompact() API call : 336588.000 micros/op 0.337 seconds/op // Resume
OpenAndCompact status: OK
Output: 358 files, average size: 69835396 bytes (66.60 MB)
openandcompact :  573885.000 micros/op 1 ops/sec 0.574 seconds 1 operations;
openandcompact [AVG 2 runs] : 0 (± 1) ops/sec

openandcompact [AVG    2 runs] : 0 (± 1) ops/sec; 1132.236 ms/op
openandcompact [MEDIAN 2 runs] : 0 ops/sec
```

3. OpenAndCompact() with cancellation at a fixed point and resumption
```
openandcompact_allow_resumption=true
./db_bench --use_existing_db=true --db=$db --benchmarks=openandcompact[X2] --openandcompact_test_cancel_on_odd=true --openandcompact_cancel_after_millseconds=6000 --openandcompact_allow_resumption=$openandcompact_allow_resumption  --disable_auto_compactions=true --compression_type=none --secondary_path=$secondary_path

...
DB path: [/dev/shm/test]
Running benchmark for 2 times

 --- Run 1 (odd - will cancel) ---

Input files: 101 files, 10000 keys
OpenAndCompact() API call : 6005787.000 micros/op 6.006 seconds/op // Cancel accordingly
OpenAndCompact status: Result incomplete: Manual compaction paused
openandcompact : 7255346.000 micros/op 0 ops/sec 7.255 seconds 1 operations;

 --- Run 2 (even - resume) ---

Input files: 101 files, 10000 keys
OpenAndCompact() API call : 33013725.000 micros/op 33.014 seconds/op // Resume
OpenAndCompact status: OK
Output: 358 files, average size: 69835396 bytes (66.60 MB)
openandcompact : 33244026.000 micros/op 0 ops/sec 33.244 seconds 1 operations;
openandcompact [AVG 2 runs] : 0 (± 0) ops/sec

openandcompact [AVG    2 runs] : 0 (± 0) ops/sec; 11911.234 ms/op
openandcompact [MEDIAN 2 runs] : 0 ops/sec
```

Reviewed By: jaykorean

Differential Revision: D84839965

Pulled By: hx235

fbshipit-source-id: 21c4cd01be67da0a128e2de1c3aae93aa97662bd
---
 tools/db_bench_tool.cc | 242 ++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 236 insertions(+), 6 deletions(-)

diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc
index c8354840239f..caf23ee61e7d 100644
--- a/tools/db_bench_tool.cc
+++ b/tools/db_bench_tool.cc
@@ -159,6 +159,7 @@ DEFINE_string(
     "readrandomoperands,"
     "backup,"
     "restore,"
+    "openandcompact,"
     "approximatememtablestats",
 
     "Comma-separated list of operations to run in the specified"
@@ -230,6 +231,9 @@ DEFINE_string(
     "\tcompact1  -- compact L1 into L2\n"
     "\twaitforcompaction - pause until compaction is (probably) done\n"
     "\tflush - flush the memtable\n"
+    "\topenandcompact -- Open DB and compact all files to bottommost level, "
+    "writing output to separate directory without modifying source DB. "
+    "Designed for remote compaction service testing\n"
     "\tstats       -- Print DB stats\n"
     "\tresetstats  -- Reset DB stats\n"
     "\tlevelstats  -- Print the number of files and bytes per level\n"
@@ -1872,6 +1876,18 @@ DEFINE_bool(
         .use_async_io,
     "Sets MultiScanArgs::use_async_io");
 
+DEFINE_bool(openandcompact_allow_resumption, false,
+            "Whether to keep existing progress and enable resume compaction in "
+            "OpenAndCompact benchmark");
+
+DEFINE_bool(openandcompact_test_cancel_on_odd, false,
+            "During OpenAndCompact[Xn], odd runs gets cancelled "
+            "after specified `openandcompact_cancel_after_millseconds`");
+
+DEFINE_uint32(openandcompact_cancel_after_millseconds, 1,
+              "Time to wait before cancelling compaction in odd runs when "
+              "openandcompact_test_cancel_on_odd is true");
+
 namespace ROCKSDB_NAMESPACE {
 namespace {
 static Status CreateMemTableRepFactory(
@@ -2625,24 +2641,33 @@ class CombinedStats {
     const char* name = bench_name.c_str();
     int num_runs = static_cast<int>(throughput_ops_.size());
 
+    double avg_ops_per_sec = CalcAvg(throughput_ops_);
+    double avg_millis_per_op =
+        (avg_ops_per_sec > 0) ? (1000.0 / avg_ops_per_sec) : 0;
+
+    printf("\n");
+
     if (throughput_mbs_.size() == throughput_ops_.size()) {
       // \xC2\xB1 is +/- character in UTF-8
       fprintf(stdout,
-              "%s [AVG    %d runs] : %d (\xC2\xB1 %d) ops/sec; %6.1f (\xC2\xB1 "
+              "%s [AVG    %d runs] : %d (\xC2\xB1 %d) ops/sec; %.3f ms/op; "
+              "%6.1f (\xC2\xB1 "
               "%.1f) MB/sec\n"
               "%s [MEDIAN %d runs] : %d ops/sec; %6.1f MB/sec\n",
               name, num_runs, static_cast<int>(CalcAvg(throughput_ops_)),
               static_cast<int>(CalcConfidence95(throughput_ops_)),
-              CalcAvg(throughput_mbs_), CalcConfidence95(throughput_mbs_), name,
-              num_runs, static_cast<int>(CalcMedian(throughput_ops_)),
+              avg_millis_per_op, CalcAvg(throughput_mbs_),
+              CalcConfidence95(throughput_mbs_), name, num_runs,
+              static_cast<int>(CalcMedian(throughput_ops_)),
               CalcMedian(throughput_mbs_));
     } else {
       fprintf(stdout,
-              "%s [AVG    %d runs] : %d (\xC2\xB1 %d) ops/sec\n"
+              "%s [AVG    %d runs] : %d (\xC2\xB1 %d) ops/sec; %.3f ms/op\n"
               "%s [MEDIAN %d runs] : %d ops/sec\n",
               name, num_runs, static_cast<int>(CalcAvg(throughput_ops_)),
-              static_cast<int>(CalcConfidence95(throughput_ops_)), name,
-              num_runs, static_cast<int>(CalcMedian(throughput_ops_)));
+              static_cast<int>(CalcConfidence95(throughput_ops_)),
+              avg_millis_per_op, name, num_runs,
+              static_cast<int>(CalcMedian(throughput_ops_)));
     }
   }
 
@@ -2801,6 +2826,8 @@ class Duration {
   uint64_t start_at_;
 };
 
+// Global run counter for cancel/resume-OpenAndCompact() testing
+static std::atomic<int> openandcompact_run_counter{0};
 class Benchmark {
  private:
   std::shared_ptr<Cache> cache_;
@@ -3853,6 +3880,9 @@ class Benchmark {
         method = &Benchmark::Backup;
       } else if (name == "restore") {
         method = &Benchmark::Restore;
+      } else if (name == "openandcompact") {
+        fresh_db = false;
+        method = &Benchmark::OpenAndCompact;
       } else if (!name.empty()) {  // No error message for empty name
         fprintf(stderr, "unknown benchmark '%s'\n", name.c_str());
         ErrorExit();
@@ -5182,6 +5212,206 @@ class Benchmark {
     DoWrite(thread, UNIQUE_RANDOM);
   }
 
+  void OpenAndCompact(ThreadState* thread) {
+    if (thread->tid != 0) {
+      return;
+    }
+
+    int current_run = ++openandcompact_run_counter;
+    bool is_odd_run = (current_run % 2 == 1);
+
+    if (FLAGS_openandcompact_test_cancel_on_odd) {
+      const char* even_description = FLAGS_openandcompact_allow_resumption
+                                         ? "even - resume"
+                                         : "even - normal";
+      fprintf(stdout, "\n--- Run %d (%s) ---\n", current_run,
+              is_odd_run ? "odd - will cancel" : even_description);
+    }
+
+    Status create_status =
+        db_.db->GetEnv()->CreateDirIfMissing(FLAGS_secondary_path);
+    if (!create_status.ok()) {
+      fprintf(stderr, "Failed to create secondary path: %s\n",
+              create_status.ToString().c_str());
+      return;
+    }
+
+    std::string options_file;
+    Status options_status =
+        GetLatestOptionsFileName(FLAGS_db, db_.db->GetEnv(), &options_file);
+    if (!options_status.ok()) {
+      fprintf(stderr, "FAILED: Cannot find OPTIONS file in %s: %s\n",
+              FLAGS_db.c_str(), options_status.ToString().c_str());
+      return;
+    }
+
+    uint64_t options_file_number;
+    FileType type;
+    if (!ParseFileName(options_file, &options_file_number, &type) ||
+        type != kOptionsFile) {
+      fprintf(stderr, "FAILED: Cannot parse OPTIONS file number from %s\n",
+              options_file.c_str());
+      return;
+    }
+
+    CompactionServiceInput compaction_input;
+    compaction_input.cf_name = kDefaultColumnFamilyName;
+
+    std::vector<std::string> input_file_names;
+    ColumnFamilyMetaData cf_meta;
+    db_.db->GetColumnFamilyMetaData(&cf_meta);
+
+    uint64_t total_input_keys = 0;
+    uint64_t total_input_files = 0;
+
+    // Collect files from all levels for full compaction
+    for (const auto& level : cf_meta.levels) {
+      for (const auto& file : level.files) {
+        input_file_names.push_back(file.name);
+        total_input_keys += file.num_entries;
+        total_input_files++;
+      }
+    }
+
+    // Set output level to configured bottom level (num_levels - 1)
+    compaction_input.output_level = FLAGS_num_levels - 1;
+    compaction_input.db_id = "db_bench_openandcompact";
+    compaction_input.options_file_number = options_file_number;
+
+    compaction_input.input_files = input_file_names;
+
+    std::string input_string;
+    Status serialize_status = compaction_input.Write(&input_string);
+    if (!serialize_status.ok()) {
+      fprintf(stderr, "FAILED: Cannot serialize compaction input: %s\n",
+              serialize_status.ToString().c_str());
+      return;
+    }
+
+    fprintf(stdout, "\nInput files: %" PRIu64 " files, %" PRIu64 " keys\n",
+            total_input_files, total_input_keys);
+
+    std::string output_directory =
+        FLAGS_secondary_path + "/openandcompact_" + std::to_string(thread->tid);
+
+    // Always clean up in odd run, depending on
+    // !FLAGS_openandcompact_allow_resumption in even run
+    bool should_cleanup = is_odd_run || !FLAGS_openandcompact_allow_resumption;
+
+    if (should_cleanup) {
+      std::vector<std::string> children;
+      Status list_status = FLAGS_env->GetChildren(output_directory, &children);
+      if (list_status.ok()) {
+        for (const auto& child : children) {
+          if (child != "." && child != "..") {
+            std::string child_path = output_directory + "/" + child;
+            Status del_status = FLAGS_env->DeleteFile(child_path);
+            if (!del_status.ok()) {
+              fprintf(stderr, "Warning: Failed to delete file %s: %s\n",
+                      child_path.c_str(), del_status.ToString().c_str());
+            }
+          }
+        }
+        Status del_dir_status = FLAGS_env->DeleteDir(output_directory);
+        if (!del_dir_status.ok()) {
+          fprintf(stderr, "Warning: Failed to delete directory %s: %s\n",
+                  output_directory.c_str(), del_dir_status.ToString().c_str());
+        }
+      }
+    }
+
+    Status create_output_status =
+        FLAGS_env->CreateDirIfMissing(output_directory);
+    if (!create_output_status.ok()) {
+      fprintf(stderr, "Failed to create output directory %s: %s\n",
+              output_directory.c_str(),
+              create_output_status.ToString().c_str());
+      return;
+    }
+
+    std::string result_string;
+
+    CompactionServiceOptionsOverride options_override;
+    options_override.env = FLAGS_env;
+    BlockBasedTableOptions table_options;
+    options_override.table_factory.reset(
+        NewBlockBasedTableFactory(table_options));
+
+    OpenAndCompactOptions options;
+    std::atomic<bool> should_cancel{false};
+    options.canceled = &should_cancel;
+    options.allow_resumption = FLAGS_openandcompact_allow_resumption;
+
+    Status s;
+    uint64_t start_time = FLAGS_env->NowMicros();
+    uint64_t end_time = start_time;
+
+    if (FLAGS_openandcompact_test_cancel_on_odd && is_odd_run) {
+      std::thread compaction_thread([&]() {
+        s = DB::OpenAndCompact(options, FLAGS_db, output_directory,
+                               input_string, &result_string, options_override);
+        end_time = FLAGS_env->NowMicros();
+      });
+
+      std::thread cancellation_timer([&]() {
+        std::this_thread::sleep_for(std::chrono::milliseconds(
+            FLAGS_openandcompact_cancel_after_millseconds));
+        should_cancel.store(true);
+      });
+
+      compaction_thread.join();
+      cancellation_timer.join();
+    } else {
+      // Normal synchronous operation for even runs or when test_cancel_on_odd
+      // is false
+      s = DB::OpenAndCompact(options, FLAGS_db, output_directory, input_string,
+                             &result_string, options_override);
+      end_time = FLAGS_env->NowMicros();
+    }
+
+    uint64_t latency_micros = end_time - start_time;
+    double latency_seconds = latency_micros / 1000000.0;
+
+    fprintf(stdout,
+            "OpenAndCompact() API call : %.3f micros/op %.3f seconds/op\n",
+            (double)latency_micros, latency_seconds);
+
+    fprintf(stdout, "OpenAndCompact status: %s\n", s.ToString().c_str());
+
+    if (FLAGS_openandcompact_test_cancel_on_odd && is_odd_run) {
+      if (!s.IsManualCompactionPaused()) {
+        fprintf(stdout, "Fail to cancel compaction");
+      }
+      return;
+    } else if (!s.ok()) {
+      fprintf(stderr, "OpenAndCompact failed: %s\n", s.ToString().c_str());
+      return;
+    }
+
+    CompactionServiceResult compaction_result;
+    Status parse_status =
+        CompactionServiceResult::Read(result_string, &compaction_result);
+    if (parse_status.ok()) {
+      uint64_t total_output_size = 0;
+      for (const auto& output_file : compaction_result.output_files) {
+        total_output_size += output_file.file_size;
+      }
+
+      uint64_t num_output_files = compaction_result.output_files.size();
+      uint64_t avg_output_file_size =
+          num_output_files > 0 ? total_output_size / num_output_files : 0;
+
+      fprintf(stdout,
+              "Output: %" PRIu64 " files, average size: %" PRIu64
+              " bytes (%.2f MB)\n",
+              num_output_files, avg_output_file_size,
+              avg_output_file_size / (1024.0 * 1024.0));
+    } else {
+      fprintf(stderr, "Failed to parse compaction result: %s\n",
+              parse_status.ToString().c_str());
+    }
+  }
+
   class KeyGenerator {
    public:
     KeyGenerator(Random64* rand, WriteMode mode, uint64_t num,

From e32c14eb5646e8f629df43e819ad8ffa529d8c3a Mon Sep 17 00:00:00 2001
From: Hui Xiao <huixiao@fb.com>
Date: Tue, 21 Oct 2025 12:13:57 -0700
Subject: [PATCH 347/500] Stress/crash test improvement to remote compaction
 with resumable compaction (#14041)

Summary:
**Context/Summary:**
- Add resumable compaction to stress test with adaptive progress cancellation
- Add fault injection to remote compaction
- Fix a real minor bug in a couple testing framework bugs with remote compaction

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14041

Test Plan: - Rehearsal stress test, finding bugs for https://github.com/facebook/rocksdb/pull/13984 effectively and did not create new failures.

Reviewed By: jaykorean

Differential Revision: D84524194

Pulled By: hx235

fbshipit-source-id: 42b4264e428c6739631ed9aa5eb02723367510bc
---
 BUCK                                          |   1 +
 db/compaction/compaction_job.cc               |   6 +-
 db/compaction/compaction_job.h                |   1 +
 db/compaction/compaction_service_job.cc       |   2 +-
 db_stress_tool/CMakeLists.txt                 |   1 +
 db_stress_tool/db_stress_common.cc            | 309 ++++++++++++++----
 db_stress_tool/db_stress_common.h             |   1 +
 .../db_stress_compaction_service.cc           |  61 ++++
 db_stress_tool/db_stress_compaction_service.h |  56 +---
 db_stress_tool/db_stress_gflags.cc            |   5 +
 db_stress_tool/db_stress_listener.h           |   7 +-
 db_stress_tool/db_stress_shared_state.h       |  46 ++-
 db_stress_tool/db_stress_test_base.cc         |  15 +-
 db_stress_tool/db_stress_test_base.h          |  15 +-
 src.mk                                        |   1 +
 tools/db_crashtest.py                         |  17 +
 16 files changed, 419 insertions(+), 125 deletions(-)
 create mode 100644 db_stress_tool/db_stress_compaction_service.cc

diff --git a/BUCK b/BUCK
index 8a85587abf2b..c4327a3f724b 100644
--- a/BUCK
+++ b/BUCK
@@ -425,6 +425,7 @@ rocks_cpp_library_wrapper(name="rocksdb_stress_lib", srcs=[
         "db_stress_tool/batched_ops_stress.cc",
         "db_stress_tool/cf_consistency_stress.cc",
         "db_stress_tool/db_stress_common.cc",
+        "db_stress_tool/db_stress_compaction_service.cc",
         "db_stress_tool/db_stress_compression_manager.cc",
         "db_stress_tool/db_stress_driver.cc",
         "db_stress_tool/db_stress_filters.cc",
diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc
index 06d608fb4f09..c21306e65cde 100644
--- a/db/compaction/compaction_job.cc
+++ b/db/compaction/compaction_job.cc
@@ -950,7 +950,8 @@ void CompactionJob::FinalizeCompactionRun(
     UpdateCompactionJobInputStatsFromInternalStats(internal_stats_,
                                                    num_input_range_del);
   }
-  UpdateCompactionJobOutputStatsFromInternalStats(internal_stats_);
+  UpdateCompactionJobOutputStatsFromInternalStats(input_status,
+                                                  internal_stats_);
   RecordCompactionIOStats();
 
   LogFlush(db_options_.info_log);
@@ -2527,6 +2528,7 @@ void CompactionJob::UpdateCompactionJobInputStatsFromInternalStats(
 }
 
 void CompactionJob::UpdateCompactionJobOutputStatsFromInternalStats(
+    const Status& status,
     const InternalStats::CompactionStatsFull& internal_stats) const {
   assert(job_stats_);
   job_stats_->elapsed_micros = internal_stats.output_level_stats.micros;
@@ -2557,7 +2559,7 @@ void CompactionJob::UpdateCompactionJobOutputStatsFromInternalStats(
         internal_stats.proximal_level_stats.num_output_files_blob;
   }
 
-  if (job_stats_->num_output_files > 0) {
+  if (status.ok() && job_stats_->num_output_files > 0) {
     CopyPrefix(compact_->SmallestUserKey(),
                CompactionJobStats::kMaxPrefixLength,
                &job_stats_->smallest_output_key_prefix);
diff --git a/db/compaction/compaction_job.h b/db/compaction/compaction_job.h
index bff25f465f4d..ca933f7d4814 100644
--- a/db/compaction/compaction_job.h
+++ b/db/compaction/compaction_job.h
@@ -208,6 +208,7 @@ class CompactionJob {
 
  protected:
   void UpdateCompactionJobOutputStatsFromInternalStats(
+      const Status& status,
       const InternalStats::CompactionStatsFull& internal_stats) const;
 
   void LogCompaction();
diff --git a/db/compaction/compaction_service_job.cc b/db/compaction/compaction_service_job.cc
index 8355a9be9682..d9eea538193f 100644
--- a/db/compaction/compaction_service_job.cc
+++ b/db/compaction/compaction_service_job.cc
@@ -411,7 +411,7 @@ Status CompactionServiceCompactionJob::Run() {
   // 2. Update job-level output stats with the aggregated internal_stats_
   // Please note that input stats will be updated by primary host when all
   // subcompactions are finished
-  UpdateCompactionJobOutputStatsFromInternalStats(internal_stats_);
+  UpdateCompactionJobOutputStatsFromInternalStats(status, internal_stats_);
   // and set fields that are not propagated as part of the update
   compaction_result_->stats.is_manual_compaction = c->is_manual_compaction();
   compaction_result_->stats.is_full_compaction = c->is_full_compaction();
diff --git a/db_stress_tool/CMakeLists.txt b/db_stress_tool/CMakeLists.txt
index 80b46330514f..90200f342bf4 100644
--- a/db_stress_tool/CMakeLists.txt
+++ b/db_stress_tool/CMakeLists.txt
@@ -2,6 +2,7 @@ add_executable(db_stress${ARTIFACT_SUFFIX}
   batched_ops_stress.cc
   cf_consistency_stress.cc
   db_stress.cc
+  db_stress_compaction_service.cc
   db_stress_compression_manager.cc
   db_stress_common.cc
   db_stress_driver.cc
diff --git a/db_stress_tool/db_stress_common.cc b/db_stress_tool/db_stress_common.cc
index 0f06d4937e17..ee7fc1cf5edc 100644
--- a/db_stress_tool/db_stress_common.cc
+++ b/db_stress_tool/db_stress_common.cc
@@ -13,6 +13,7 @@
 
 #include <cmath>
 
+#include "file/file_util.h"
 #include "rocksdb/secondary_cache.h"
 #include "util/file_checksum_helper.h"
 #include "util/xxhash.h"
@@ -228,14 +229,250 @@ void CompressedCacheSetCapacityThread(void* v) {
   }
 }
 
+#ifndef NDEBUG
+static void SetupFaultInjectionForRemoteCompaction(SharedState* shared) {
+  if (!fault_fs_guard) {
+    return;
+  }
+
+  fault_fs_guard->SetThreadLocalErrorContext(
+      FaultInjectionIOType::kRead, shared->GetSeed(), FLAGS_read_fault_one_in,
+      FLAGS_inject_error_severity == 1 /* retryable */,
+      FLAGS_inject_error_severity == 2 /* has_data_loss*/);
+  fault_fs_guard->EnableThreadLocalErrorInjection(FaultInjectionIOType::kRead);
+
+  fault_fs_guard->SetThreadLocalErrorContext(
+      FaultInjectionIOType::kWrite, shared->GetSeed(), FLAGS_write_fault_one_in,
+      FLAGS_inject_error_severity == 1 /* retryable */,
+      FLAGS_inject_error_severity == 2 /* has_data_loss*/);
+  fault_fs_guard->EnableThreadLocalErrorInjection(FaultInjectionIOType::kWrite);
+
+  fault_fs_guard->SetThreadLocalErrorContext(
+      FaultInjectionIOType::kMetadataRead, shared->GetSeed(),
+      FLAGS_metadata_read_fault_one_in,
+      FLAGS_inject_error_severity == 1 /* retryable */,
+      FLAGS_inject_error_severity == 2 /* has_data_loss*/);
+  fault_fs_guard->EnableThreadLocalErrorInjection(
+      FaultInjectionIOType::kMetadataRead);
+
+  fault_fs_guard->SetThreadLocalErrorContext(
+      FaultInjectionIOType::kMetadataWrite, shared->GetSeed(),
+      FLAGS_metadata_write_fault_one_in,
+      FLAGS_inject_error_severity == 1 /* retryable */,
+      FLAGS_inject_error_severity == 2 /* has_data_loss*/);
+  fault_fs_guard->EnableThreadLocalErrorInjection(
+      FaultInjectionIOType::kMetadataWrite);
+}
+#endif  // NDEBUG
+
+static CompactionServiceOptionsOverride CreateOverrideOptions(
+    const Options& options, const CompactionServiceJobInfo& job_info) {
+  CompactionServiceOptionsOverride override_options{
+      .env = db_stress_env,
+      .file_checksum_gen_factory = options.file_checksum_gen_factory,
+      .merge_operator = options.merge_operator,
+      .compaction_filter = options.compaction_filter,
+      .compaction_filter_factory = options.compaction_filter_factory,
+      .prefix_extractor = options.prefix_extractor,
+      .sst_partitioner_factory = options.sst_partitioner_factory,
+      .listeners = options.listeners,
+      .statistics = options.statistics,
+      .table_properties_collector_factories =
+          options.table_properties_collector_factories};
+
+  // TODO(jaykorean) - create a new compaction filter / merge operator and
+  // others for remote compactions
+  //
+  // Create a new Table Factory
+  ConfigOptions config_options;
+  config_options.ignore_unknown_options = false;
+  config_options.ignore_unsupported_options = false;
+
+  Status s = TableFactory::CreateFromString(config_options,
+                                            options.table_factory->Name(),
+                                            &override_options.table_factory);
+
+  if (s.ok()) {
+    std::string options_str;
+    s = options.table_factory->GetOptionString(config_options, &options_str);
+    if (s.ok()) {
+      s = override_options.table_factory->ConfigureFromString(config_options,
+                                                              options_str);
+    }
+  }
+
+  if (!s.ok()) {
+    fprintf(stdout,
+            "Failed to set up TableFactory for remote compaction - (%s): %s\n",
+            job_info.db_name.c_str(), s.ToString().c_str());
+  }
+
+  return override_options;
+}
+
+static Status CleanupOutputDirectory(const std::string& output_directory) {
+#ifndef NDEBUG
+  // Temporarily disable fault injection to ensure deletion always succeeds
+  if (fault_fs_guard) {
+    fault_fs_guard->DisableAllThreadLocalErrorInjection();
+  }
+#endif  // NDEBUG
+
+  Status s = DestroyDir(db_stress_env, output_directory);
+  if (!s.ok()) {
+    fprintf(stderr,
+            "Failed to destroy output directory %s when allow_resumption is "
+            "false: %s\n",
+            output_directory.c_str(), s.ToString().c_str());
+  }
+
+  if (s.ok()) {
+    s = db_stress_env->CreateDir(output_directory);
+    if (!s.ok()) {
+      fprintf(stderr,
+              "Failed to recreate output directory %s when allow_resumption is "
+              "false: %s\n",
+              output_directory.c_str(), s.ToString().c_str());
+    }
+  }
+
+#ifndef NDEBUG
+  // Re-enable fault injection after deletion
+  if (fault_fs_guard) {
+    fault_fs_guard->EnableAllThreadLocalErrorInjection();
+  }
+#endif  // NDEBUG
+
+  return s;
+}
+
+// Set up cancellation mechanism for testing resumable remote compactions.
+// Spawns a detached thread to trigger cancellation after a delay (50ms
+// initially, or 2/3 of the previous successful compaction time for adaptive
+// timing). First-time jobs are always canceled; retries have a 10% chance
+// to test consecutive cancellation scenarios.
+static std::shared_ptr<std::atomic<bool>> SetupCancellation(
+    OpenAndCompactOptions& open_compact_options, bool was_canceled,
+    Random& rand, uint64_t successful_compaction_end_to_end_micros) {
+  auto canceled = std::make_shared<std::atomic<bool>>(false);
+  open_compact_options.canceled = canceled.get();
+
+  bool should_cancel = !was_canceled || rand.OneIn(10);
+
+  if (should_cancel) {
+    std::thread interruption_thread(
+        [canceled, successful_compaction_end_to_end_micros]() {
+          uint64_t sleep_micros =
+              successful_compaction_end_to_end_micros == 0
+                  ? 50000
+                  : successful_compaction_end_to_end_micros * 2 / 3;
+          std::this_thread::sleep_for(std::chrono::microseconds(sleep_micros));
+          canceled->store(true);
+        });
+    interruption_thread.detach();
+  }
+
+  return canceled;
+}
+
+// Process the result of OpenAndCompact operation
+static void ProcessCompactionResult(
+    const Status& s, const std::string& job_id,
+    const CompactionServiceJobInfo& job_info,
+    const std::string& serialized_input, const std::string& output_directory,
+    const std::string& serialized_output, SharedState* shared,
+    uint64_t& successful_compaction_end_to_end_micros, uint64_t start_micros,
+    Env* env) {
+  if (s.IsManualCompactionPaused() && FLAGS_allow_resumption_one_in > 0) {
+    // Re-enqueue for retry
+    shared->EnqueueRemoteCompaction(job_id, job_info, serialized_input,
+                                    output_directory, true /* was_cancelled */);
+    return;
+  }
+
+  if (!s.ok()) {
+    if (!StressTest::IsErrorInjectedAndRetryable(s)) {
+      // Print in stdout instead of stderr to avoid stress test failure,
+      // because OpenAndCompact() failure doesn't necessarily mean
+      // primary db instance failure.
+      fprintf(stdout, "Failed to run OpenAndCompact(%s): %s\n",
+              job_info.db_name.c_str(), s.ToString().c_str());
+    }
+  } else {
+    // Track successful completion time
+    successful_compaction_end_to_end_micros = env->NowMicros() - start_micros;
+  }
+
+  // Add the output regardless of status, so that primary DB doesn't rely
+  // on the timeout to finish waiting. The actual failure from the
+  // deserialization can fail the compaction properly
+  shared->AddRemoteCompactionResult(job_id, s, serialized_output);
+}
+
+static void ProcessRemoteCompactionJob(
+    const std::string& job_id, const CompactionServiceJobInfo& job_info,
+    const std::string& serialized_input, const std::string& output_directory,
+    bool was_canceled, SharedState* shared, StressTest* stress_test,
+    Random& rand, uint64_t& successful_compaction_end_to_end_micros) {
+  auto options = stress_test->GetOptions(job_info.cf_id);
+  assert(options.env != nullptr);
+
+  auto override_options = CreateOverrideOptions(options, job_info);
+
+  OpenAndCompactOptions open_compact_options;
+  if (FLAGS_allow_resumption_one_in > 0) {
+    open_compact_options.allow_resumption =
+        rand.OneIn(FLAGS_allow_resumption_one_in);
+  } else {
+    open_compact_options.allow_resumption = false;
+  }
+
+  if (!open_compact_options.allow_resumption) {
+    CleanupOutputDirectory(output_directory);
+  }
+
+  std::shared_ptr<std::atomic<bool>> canceled = nullptr;
+  if (FLAGS_allow_resumption_one_in > 0) {
+    canceled = SetupCancellation(open_compact_options, was_canceled, rand,
+                                 successful_compaction_end_to_end_micros);
+  }
+
+  std::string serialized_output;
+  uint64_t start_micros = options.env->NowMicros();
+
+  Status s = DB::OpenAndCompact(open_compact_options, job_info.db_name,
+                                output_directory, serialized_input,
+                                &serialized_output, override_options);
+
+  ProcessCompactionResult(s, job_id, job_info, serialized_input,
+                          output_directory, serialized_output, shared,
+                          successful_compaction_end_to_end_micros, start_micros,
+                          options.env);
+}
+
 void RemoteCompactionWorkerThread(void* v) {
   assert(FLAGS_remote_compaction_worker_threads > 0);
   assert(FLAGS_remote_compaction_worker_interval > 0);
+
   auto* thread = static_cast<ThreadState*>(v);
   SharedState* shared = thread->shared;
   StressTest* stress_test = shared->GetStressTest();
   assert(stress_test != nullptr);
+
+#ifndef NDEBUG
+  SetupFaultInjectionForRemoteCompaction(shared);
+#endif  // NDEBUG
+
+  // Tracks the duration (in microseconds) of the most recent successfully
+  // completed compaction from start to finish. This value is used in
+  // SetupCancellation() to adaptively set up cancellation point for a
+  // compaction
+  uint64_t successful_compaction_end_to_end_micros = 0;
+  Random rand(static_cast<uint32_t>(FLAGS_seed));
+
+  // Main worker loop
   while (true) {
+    // Check if we should stop
     {
       MutexLock l(shared->GetMutex());
       if (shared->ShouldStopBgThread()) {
@@ -246,72 +483,20 @@ void RemoteCompactionWorkerThread(void* v) {
         return;
       }
     }
+
     std::string job_id;
     CompactionServiceJobInfo job_info;
     std::string serialized_input;
-    if (shared->DequeueRemoteCompaction(&job_id, &job_info,
-                                        &serialized_input)) {
-      auto options = stress_test->GetOptions(job_info.cf_id);
-      CompactionServiceOptionsOverride override_options{
-          .file_checksum_gen_factory = options.file_checksum_gen_factory,
-          .merge_operator = options.merge_operator,
-          .compaction_filter = options.compaction_filter,
-          .compaction_filter_factory = options.compaction_filter_factory,
-          .prefix_extractor = options.prefix_extractor,
-          .sst_partitioner_factory = options.sst_partitioner_factory,
-          .listeners = {},
-          .statistics = options.statistics,
-          .table_properties_collector_factories =
-              options.table_properties_collector_factories};
-      std::string serialized_output;
-      std::string tmp_output_dir = job_info.db_name + "/" + "tmp_output_" +
-                                   db_stress_env->GenerateUniqueId();
-
-      // Set up Table Factory
-      ConfigOptions config_options;
-      config_options.ignore_unknown_options = false;
-      config_options.ignore_unsupported_options = false;
-
-      Status s = TableFactory::CreateFromString(
-          config_options, options.table_factory->Name(),
-          &override_options.table_factory);
-      if (s.ok()) {
-        std::string optionsStr;
-        s = options.table_factory->GetOptionString(config_options, &optionsStr);
-        if (s.ok()) {
-          s = override_options.table_factory->ConfigureFromString(
-              config_options, optionsStr);
-        }
-      }
-      if (!s.ok()) {
-        fprintf(
-            stdout,
-            "Failed to set up TableFactory for remote compaction - (%s): %s\n",
-            job_info.db_name.c_str(), s.ToString().c_str());
-      }
-
-      // TODO(jaykorean) - create a new compaction filter / merge operator and
-      // others for remote compactions
-
-      // Run Remote Compaction
-      if (s.ok()) {
-        s = DB::OpenAndCompact(OpenAndCompactOptions{}, job_info.db_name,
-                               tmp_output_dir, serialized_input,
-                               &serialized_output, override_options);
-        if (!s.ok()) {
-          // Print in stdout instead of stderr to avoid stress test failure,
-          // because OpenAndCompact() failure doesn't necessarily mean
-          // primary db instance failure.
-          fprintf(stdout, "Failed to run OpenAndCompact(%s): %s\n",
-                  job_info.db_name.c_str(), s.ToString().c_str());
-        }
-      }
-
-      // Add the output regardless of status, so that primary DB doesn't rely
-      // on the timeout to finish waiting. The actual failure from the
-      // deserialization can fail the compaction properly
-      shared->AddRemoteCompactionResult(job_id, s, serialized_output);
+    std::string output_directory;
+    bool was_canceled;
+
+    if (shared->DequeueRemoteCompaction(&job_id, &job_info, &serialized_input,
+                                        &output_directory, &was_canceled)) {
+      ProcessRemoteCompactionJob(
+          job_id, job_info, serialized_input, output_directory, was_canceled,
+          shared, stress_test, rand, successful_compaction_end_to_end_micros);
     }
+
     db_stress_env->SleepForMicroseconds(
         thread->rand.Next() % FLAGS_remote_compaction_worker_interval * 1000 +
         1);
diff --git a/db_stress_tool/db_stress_common.h b/db_stress_tool/db_stress_common.h
index fe993451cf79..9c3b6563f2f2 100644
--- a/db_stress_tool/db_stress_common.h
+++ b/db_stress_tool/db_stress_common.h
@@ -429,6 +429,7 @@ DECLARE_bool(track_and_verify_wals);
 DECLARE_int32(remote_compaction_worker_threads);
 DECLARE_int32(remote_compaction_worker_interval);
 DECLARE_bool(remote_compaction_failure_fall_back_to_local);
+DECLARE_int32(allow_resumption_one_in);
 DECLARE_bool(auto_refresh_iterator_with_snapshot);
 DECLARE_uint32(memtable_op_scan_flush_trigger);
 DECLARE_uint32(memtable_avg_op_scan_flush_trigger);
diff --git a/db_stress_tool/db_stress_compaction_service.cc b/db_stress_tool/db_stress_compaction_service.cc
new file mode 100644
index 000000000000..b64fe56095e6
--- /dev/null
+++ b/db_stress_tool/db_stress_compaction_service.cc
@@ -0,0 +1,61 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#ifdef GFLAGS
+
+#include "db_stress_tool/db_stress_compaction_service.h"
+
+#include <string>
+
+#include "db_stress_tool/db_stress_test_base.h"
+#include "rocksdb/env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+CompactionServiceJobStatus DbStressCompactionService::Wait(
+    const std::string& scheduled_job_id, std::string* result) {
+  while (true) {
+    if (aborted_.load()) {
+      return CompactionServiceJobStatus::kAborted;
+    }
+    const auto& maybeResultStatus =
+        shared_->GetRemoteCompactionResult(scheduled_job_id, result);
+    if (maybeResultStatus.has_value()) {
+      auto s = maybeResultStatus.value();
+      if (s.ok()) {
+        assert(result);
+        assert(!result->empty());
+        return CompactionServiceJobStatus::kSuccess;
+      } else {
+        // Remote Compaction failed
+        if (failure_should_fall_back_to_local_) {
+          return CompactionServiceJobStatus::kUseLocal;
+        }
+        if (StressTest::IsErrorInjectedAndRetryable(s)) {
+          return CompactionServiceJobStatus::kUseLocal;
+        }
+        if (result && result->empty()) {
+          // If result is empty, set the compaction status in the result so
+          // that it can be bubbled up to main thread
+          CompactionServiceResult compaction_result;
+          compaction_result.status = s;
+          if (compaction_result.Write(result).ok()) {
+            assert(result);
+            assert(!result->empty());
+          }
+        }
+        return CompactionServiceJobStatus::kFailure;
+      }
+    } else {
+      // Remote Compaction is still running
+      Env::Default()->SleepForMicroseconds(kWaitIntervalInMicros);
+    }
+  }
+  return CompactionServiceJobStatus::kFailure;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+#endif  // GFLAGS
diff --git a/db_stress_tool/db_stress_compaction_service.h b/db_stress_tool/db_stress_compaction_service.h
index f45198fe48c7..a3566cef52a2 100644
--- a/db_stress_tool/db_stress_compaction_service.h
+++ b/db_stress_tool/db_stress_compaction_service.h
@@ -3,11 +3,13 @@
 //  COPYING file in the root directory) and Apache 2.0 License
 //  (found in the LICENSE.Apache file in the root directory).
 
+#ifdef GFLAGS
 #pragma once
 
+#include "db/compaction/compaction_job.h"
 #include "db_stress_shared_state.h"
-#include "db_stress_tool/db_stress_common.h"
 #include "rocksdb/options.h"
+#include "utilities/fault_injection_fs.h"
 
 namespace ROCKSDB_NAMESPACE {
 
@@ -26,59 +28,32 @@ class DbStressCompactionService : public CompactionService {
 
   static constexpr uint64_t kWaitIntervalInMicros = 10 * 1000;  // 10ms
 
+  static constexpr const char* kTempOutputDirectoryPrefix = "tmp_output_";
+
   CompactionServiceScheduleResponse Schedule(
       const CompactionServiceJobInfo& info,
       const std::string& compaction_service_input) override {
     std::string job_id = info.db_id + "_" + info.db_session_id + "_" +
                          std::to_string(info.job_id);
+
     if (aborted_.load()) {
       return CompactionServiceScheduleResponse(
           job_id, CompactionServiceJobStatus::kUseLocal);
     }
-    shared_->EnqueueRemoteCompaction(job_id, info, compaction_service_input);
+    std::string output_directory = info.db_name + "/" +
+                                   kTempOutputDirectoryPrefix +
+                                   Env::Default()->GenerateUniqueId();
+
+    shared_->EnqueueRemoteCompaction(
+        job_id, info, compaction_service_input, output_directory,
+        false /* was_cancelled */);  // Not canceled initially
     CompactionServiceScheduleResponse response(
         job_id, CompactionServiceJobStatus::kSuccess);
     return response;
   }
 
   CompactionServiceJobStatus Wait(const std::string& scheduled_job_id,
-                                  std::string* result) override {
-    while (true) {
-      if (aborted_.load()) {
-        return CompactionServiceJobStatus::kAborted;
-      }
-      const auto& maybeResultStatus =
-          shared_->GetRemoteCompactionResult(scheduled_job_id, result);
-      if (maybeResultStatus.has_value()) {
-        auto s = maybeResultStatus.value();
-        if (s.ok()) {
-          assert(result);
-          assert(!result->empty());
-          return CompactionServiceJobStatus::kSuccess;
-        } else {
-          // Remote Compaction failed
-          if (failure_should_fall_back_to_local_) {
-            return CompactionServiceJobStatus::kUseLocal;
-          }
-          if (result && result->empty()) {
-            // If result is empty, set the compaction status in the result so
-            // that it can be bubbled up to main thread
-            CompactionServiceResult compaction_result;
-            compaction_result.status = s;
-            if (compaction_result.Write(result).ok()) {
-              assert(result);
-              assert(!result->empty());
-            }
-          }
-          return CompactionServiceJobStatus::kFailure;
-        }
-      } else {
-        // Remote Compaction is still running
-        Env::Default()->SleepForMicroseconds(kWaitIntervalInMicros);
-      }
-    }
-    return CompactionServiceJobStatus::kFailure;
-  }
+                                  std::string* result) override;
 
   void OnInstallation(const std::string& scheduled_job_id,
                       CompactionServiceJobStatus /*status*/) override {
@@ -113,5 +88,6 @@ class DbStressCompactionService : public CompactionService {
   std::atomic_bool aborted_{false};
   bool failure_should_fall_back_to_local_;
 };
-
 }  // namespace ROCKSDB_NAMESPACE
+
+#endif  // GFLAGS
diff --git a/db_stress_tool/db_stress_gflags.cc b/db_stress_tool/db_stress_gflags.cc
index e2dd696ac4e3..47b5f715fdb1 100644
--- a/db_stress_tool/db_stress_gflags.cc
+++ b/db_stress_tool/db_stress_gflags.cc
@@ -861,6 +861,11 @@ DEFINE_bool(remote_compaction_failure_fall_back_to_local, true,
             "If true, remote compaction failures will be ignored and "
             "compactions will fall back to local and retried");
 
+DEFINE_int32(allow_resumption_one_in, 0,
+             "If non-zero, enable resumable compaction with 1/N probability "
+             "for each OpenAndCompact call.Requires "
+             "remote_compaction_worker_threads > 0");
+
 DEFINE_uint32(ingest_wbwi_one_in, 0,
               "If set, will call"
               "IngestWriteBatchWithIndex() instead of regular write operations "
diff --git a/db_stress_tool/db_stress_listener.h b/db_stress_tool/db_stress_listener.h
index 6edbaf7896d5..fd28d5b4ced0 100644
--- a/db_stress_tool/db_stress_listener.h
+++ b/db_stress_tool/db_stress_listener.h
@@ -9,6 +9,7 @@
 #include <mutex>
 #include <unordered_set>
 
+#include "db_stress_tool/db_stress_compaction_service.h"
 #include "db_stress_tool/db_stress_shared_state.h"
 #include "file/filename.h"
 #include "file/writable_file_writer.h"
@@ -21,7 +22,6 @@
 #include "util/gflags_compat.h"
 #include "util/random.h"
 #include "utilities/fault_injection_fs.h"
-
 DECLARE_int32(compact_files_one_in);
 
 extern std::shared_ptr<ROCKSDB_NAMESPACE::FaultInjectionTestFS> fault_fs_guard;
@@ -310,6 +310,11 @@ class DbStressListener : public EventListener {
         }
       }
     }
+    // We can't do exact matching since remote workers use dynamic temp paths
+    if (file_dir.find(DbStressCompactionService::kTempOutputDirectoryPrefix) !=
+        std::string::npos) {
+      return;
+    }
     assert(false);
 #else
     (void)file_dir;
diff --git a/db_stress_tool/db_stress_shared_state.h b/db_stress_tool/db_stress_shared_state.h
index d48610c6e5b1..b4546cd3bad2 100644
--- a/db_stress_tool/db_stress_shared_state.h
+++ b/db_stress_tool/db_stress_shared_state.h
@@ -51,6 +51,24 @@ DECLARE_bool(enable_compaction_filter);
 namespace ROCKSDB_NAMESPACE {
 class StressTest;
 
+struct RemoteCompactionQueueItem {
+  std::string job_id;
+  CompactionServiceJobInfo job_info;
+  std::string serialized_input;
+  std::string output_directory;
+  bool canceled;
+
+  RemoteCompactionQueueItem(const std::string& id,
+                            const CompactionServiceJobInfo& info,
+                            const std::string& input,
+                            const std::string& output_dir, bool was_canceled)
+      : job_id(id),
+        job_info(info),
+        serialized_input(input),
+        output_directory(output_dir),
+        canceled(was_canceled) {}
+};
+
 // State shared by all concurrent executions of the same benchmark.
 class SharedState {
  public:
@@ -278,23 +296,31 @@ class SharedState {
 
   void EnqueueRemoteCompaction(const std::string& job_id,
                                const CompactionServiceJobInfo& job_info,
-                               const std::string& serialized_input) {
+                               const std::string& serialized_input,
+                               const std::string& output_directory,
+                               bool canceled) {
     MutexLock l(&remote_compaction_queue_mu_);
-    remote_compaction_queue_.emplace(job_id, job_info, serialized_input);
+    remote_compaction_queue_.emplace(job_id, job_info, serialized_input,
+                                     output_directory, canceled);
   }
 
   bool DequeueRemoteCompaction(std::string* job_id,
                                CompactionServiceJobInfo* job_info,
-                               std::string* serialized_input) {
+                               std::string* serialized_input,
+                               std::string* output_directory, bool* canceled) {
     assert(job_id);
     assert(job_info);
     assert(serialized_input);
+    assert(output_directory);
+    assert(canceled);
     MutexLock l(&remote_compaction_queue_mu_);
     if (!remote_compaction_queue_.empty()) {
-      const auto [id, info, input] = remote_compaction_queue_.front();
-      *job_id = id;
-      *job_info = info;
-      *serialized_input = input;
+      const RemoteCompactionQueueItem& item = remote_compaction_queue_.front();
+      *job_id = item.job_id;
+      *job_info = item.job_info;
+      *serialized_input = item.serialized_input;
+      *output_directory = item.output_directory;
+      *canceled = item.canceled;
       remote_compaction_queue_.pop();
       return true;
     }
@@ -480,11 +506,9 @@ class SharedState {
   std::atomic<bool> verification_failure_;
   std::atomic<bool> should_stop_test_;
 
-  // Queue for the remote compaction. Tuple of job id, job info and serialized
-  // compaction_service_input
+  // Queue for the remote compaction.
   port::Mutex remote_compaction_queue_mu_;
-  std::queue<std::tuple<std::string, CompactionServiceJobInfo, std::string>>
-      remote_compaction_queue_;
+  std::queue<RemoteCompactionQueueItem> remote_compaction_queue_;
   // Result Map for the remote compaciton. Key is the scheduled_job_id and value
   // is serialized compaction_service_result
   port::Mutex remote_compaction_result_map_mu_;
diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc
index c4abbf96995b..b37a0307ff45 100644
--- a/db_stress_tool/db_stress_test_base.cc
+++ b/db_stress_tool/db_stress_test_base.cc
@@ -3675,8 +3675,21 @@ void StressTest::Open(SharedState* shared, bool reopen) {
               "Compaction\n");
       exit(1);
     }
-    options_.compaction_service = std::make_shared<DbStressCompactionService>(
+    // Each DB open/reopen gets a fresh compaction service instance with a clean
+    // aborted_ state
+    auto compaction_service = std::make_shared<DbStressCompactionService>(
         shared, FLAGS_remote_compaction_failure_fall_back_to_local);
+
+    options_.compaction_service = compaction_service;
+  }
+
+  if (FLAGS_allow_resumption_one_in > 0) {
+    if (FLAGS_remote_compaction_worker_threads == 0) {
+      fprintf(stderr,
+              "allow_resumption or randomize_allow_resumption requires "
+              "remote_compaction_worker_threads > 0\n");
+      exit(1);
+    }
   }
 
   if ((options_.enable_blob_files || options_.enable_blob_garbage_collection ||
diff --git a/db_stress_tool/db_stress_test_base.h b/db_stress_tool/db_stress_test_base.h
index dd72d5e2ea7e..da1589be541a 100644
--- a/db_stress_tool/db_stress_test_base.h
+++ b/db_stress_tool/db_stress_test_base.h
@@ -14,6 +14,7 @@
 #include "db_stress_tool/db_stress_common.h"
 #include "db_stress_tool/db_stress_shared_state.h"
 #include "rocksdb/experimental.h"
+#include "utilities/fault_injection_fs.h"
 
 namespace ROCKSDB_NAMESPACE {
 class SystemClock;
@@ -25,6 +26,13 @@ using experimental::SstQueryFilterConfigsManager;
 
 class StressTest {
  public:
+  static bool IsErrorInjectedAndRetryable(const Status& error_s) {
+    assert(!error_s.ok());
+    return error_s.getState() &&
+           FaultInjectionTestFS::IsInjectedError(error_s) &&
+           !status_to_io_status(Status(error_s)).GetDataLoss();
+  }
+
   StressTest();
 
   virtual ~StressTest() {}
@@ -350,13 +358,6 @@ class StressTest {
     return Status::NotSupported("TestCustomOperations() must be overridden");
   }
 
-  bool IsErrorInjectedAndRetryable(const Status& error_s) const {
-    assert(!error_s.ok());
-    return error_s.getState() &&
-           FaultInjectionTestFS::IsInjectedError(error_s) &&
-           !status_to_io_status(Status(error_s)).GetDataLoss();
-  }
-
   void ProcessStatus(SharedState* shared, std::string msg, const Status& s,
                      bool ignore_injected_error = true) const;
 
diff --git a/src.mk b/src.mk
index 3f465c4562a3..06310de3d3ab 100644
--- a/src.mk
+++ b/src.mk
@@ -396,6 +396,7 @@ STRESS_LIB_SOURCES =                                           \
   db_stress_tool/batched_ops_stress.cc                         \
   db_stress_tool/cf_consistency_stress.cc                      \
   db_stress_tool/db_stress_common.cc                           \
+  db_stress_tool/db_stress_compaction_service.cc               \
   db_stress_tool/db_stress_compression_manager.cc              \
   db_stress_tool/db_stress_driver.cc                           \
   db_stress_tool/db_stress_filters.cc                          \
diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index 2b456efab9f4..cab484f089f7 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -402,6 +402,7 @@ def apply_random_seed_per_iteration():
     # TODO(hx235): enable `track_and_verify_wals` after stabalizing the stress test
     "track_and_verify_wals": lambda: random.choice([0]),
     "remote_compaction_worker_threads": lambda: random.choice([0, 8]),
+    "allow_resumption_one_in": lambda: random.choice([0, 1, 2, 20]),
     # TODO(jaykorean): Change to lambda: random.choice([0, 1]) after addressing all remote compaction failures
     "remote_compaction_failure_fall_back_to_local": 1,
     "auto_refresh_iterator_with_snapshot": lambda: random.choice([0, 1]),
@@ -845,6 +846,22 @@ def finalize_and_sanitize(src_params):
         dest_params["checkpoint_one_in"] = 0
         dest_params["use_timed_put_one_in"] = 0
         dest_params["test_secondary"] = 0
+        # Disable database open fault injection to prevent test inefficiency described below.
+        # When fault injection occurs during DB open, the db will wait for compaction
+        # to finish to clean up the database before retrying without injected error.
+        # However remote compaction threads are not yet created at that point
+        # so the db has to wait for the timeout (currently 30 seconds) to fall back to
+        # local compaction in order for the compaction to finish.
+        #
+        # TODO: Consider moving compaction thread creation earlier in the startup sequence
+        # to allow db open fault injection testing without this performance penalty
+        dest_params["open_metadata_write_fault_one_in"] = 0
+        dest_params["open_metadata_read_fault_one_in"] = 0
+        dest_params["open_write_fault_one_in"] = 0
+        dest_params["open_read_fault_one_in"] = 0
+        dest_params["sync_fault_injection"] = 0
+    else:
+        dest_params["allow_resumption_one_in"] = 0
 
     # Multi-key operations are not currently compatible with transactions or
     # timestamp.

From e691965558238c8c79a58d37ce7ebf97916a2765 Mon Sep 17 00:00:00 2001
From: Jay Huh <jewoongh@meta.com>
Date: Wed, 22 Oct 2025 12:48:31 -0700
Subject: [PATCH 348/500] Start 10.9.0 development (#14067)

Summary:
10.8.0 branch has been cut.

Updated
- HISTORY.md
- include/rocksdb/version.h
- tools/check_format_compatible.sh

To follow up
- folly update

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14067

Test Plan: CI

Reviewed By: pdillinger

Differential Revision: D85186398

Pulled By: jaykorean

fbshipit-source-id: 44920156aa2a62ba40626766dc4ebdbc02f23fa8
---
 HISTORY.md                                    | 21 +++++++++++++++++++
 include/rocksdb/version.h                     |  2 +-
 tools/check_format_compatible.sh              |  2 +-
 .../fifo_compaction_temperature               |  1 -
 .../udi_non_bytewise_comparator.md            |  1 -
 .../mscan_range_limit_between_files.md        |  1 -
 .../multi_scan_page_unpin_bug_fix.md          |  1 -
 .../new_features/fs_prefetch_support.md       |  1 -
 .../new_features/resume_compaction.md         |  1 -
 .../performance_improvements/lz4.md           |  1 -
 .../manual_compaction_output_temperature      |  1 -
 .../manual_wal_flush_priority                 |  1 -
 .../multi_scan_api_contract.md                |  1 -
 13 files changed, 23 insertions(+), 12 deletions(-)
 delete mode 100644 unreleased_history/behavior_changes/fifo_compaction_temperature
 delete mode 100644 unreleased_history/behavior_changes/udi_non_bytewise_comparator.md
 delete mode 100644 unreleased_history/bug_fixes/mscan_range_limit_between_files.md
 delete mode 100644 unreleased_history/bug_fixes/multi_scan_page_unpin_bug_fix.md
 delete mode 100644 unreleased_history/new_features/fs_prefetch_support.md
 delete mode 100644 unreleased_history/new_features/resume_compaction.md
 delete mode 100644 unreleased_history/performance_improvements/lz4.md
 delete mode 100644 unreleased_history/public_api_changes/manual_compaction_output_temperature
 delete mode 100644 unreleased_history/public_api_changes/manual_wal_flush_priority
 delete mode 100644 unreleased_history/public_api_changes/multi_scan_api_contract.md

diff --git a/HISTORY.md b/HISTORY.md
index 01cdf940907b..c601e2cf8213 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -1,6 +1,27 @@
 # Rocksdb Change Log
 > NOTE: Entries for next release do not go here. Follow instructions in `unreleased_history/README.txt`
 
+## 10.8.0 (10/21/2025)
+### New Features
+* Add kFSPrefetch to FSSupportedOps enum to allow file systems to indicate prefetch support capability, avoiding unnecessary prefetch system calls on file systems that don't support them.
+* Added experimental support `OpenAndCompactOptions::allow_resumption` for resumable compaction that persists progress during `OpenAndCompact()`, allowing interrupted compactions to resume from the last progress persitence. The default behavior is to not persist progress.
+
+### Public API Changes
+* Allow specifying output temperature in CompactionOptions
+* Added `DB::FlushWAL(const FlushWALOptions&)` as an alternative to `DB::FlushWAL(bool sync)`, where `FlushWALOptions` includes a new `rate_limiter_priority` field (default `Env::IO_TOTAL`) that allows rate limiting and priority passing of manual WAL flush's IO operations.
+* The MultiScan API contract is updated. After a multi scan range got prepared with Prepare API call, the following seeks must seek the start of each prepared scan range in order. In addition, when limit is set, upper bound must be set to the same value of limit before each seek
+
+### Behavior Changes
+* `kChangeTemperature` FIFO compaction will now honor `compaction_target_temp` to all levels regardless of `cf_options::last_level_temperature`
+* Allow UDIs with a non BytewiseComparator
+
+### Bug Fixes
+* Fix incorrect MultiScan seek error status due to bugs in handling range limit falling between adjacent SST files key range.
+* Fix a bug in Page unpinning in MultiScan
+
+### Performance Improvements
+* Fixed a performance regression in LZ4 compression that started in version 10.6.0
+
 ## 10.7.0 (09/19/2025)
 ### New Features
 * Add the fail_if_no_udi_on_open flag in BlockBasedTableOption to control whether a missing user defined index block in a SST is a hard error or not.
diff --git a/include/rocksdb/version.h b/include/rocksdb/version.h
index 31f293484c4c..58d5119989a0 100644
--- a/include/rocksdb/version.h
+++ b/include/rocksdb/version.h
@@ -12,7 +12,7 @@
 // NOTE: in 'main' development branch, this should be the *next*
 // minor or major version number planned for release.
 #define ROCKSDB_MAJOR 10
-#define ROCKSDB_MINOR 8
+#define ROCKSDB_MINOR 9
 #define ROCKSDB_PATCH 0
 
 // Make it easy to do conditional compilation based on version checks, i.e.
diff --git a/tools/check_format_compatible.sh b/tools/check_format_compatible.sh
index 41c768fff442..37051c77bb5e 100755
--- a/tools/check_format_compatible.sh
+++ b/tools/check_format_compatible.sh
@@ -137,7 +137,7 @@ EOF
 
 # To check for DB forward compatibility with loading options (old version
 # reading data from new), as well as backward compatibility
-declare -a db_forward_with_options_refs=("8.6.fb" "8.7.fb" "8.8.fb" "8.9.fb" "8.10.fb" "8.11.fb" "9.0.fb" "9.1.fb" "9.2.fb" "9.3.fb" "9.4.fb" "9.5.fb" "9.6.fb" "9.7.fb" "9.8.fb" "9.9.fb" "9.10.fb" "9.11.fb" "10.0.fb" "10.1.fb" "10.2.fb" "10.3.fb" "10.4.fb" "10.5.fb" "10.6.fb" "10.7.fb")
+declare -a db_forward_with_options_refs=("8.6.fb" "8.7.fb" "8.8.fb" "8.9.fb" "8.10.fb" "8.11.fb" "9.0.fb" "9.1.fb" "9.2.fb" "9.3.fb" "9.4.fb" "9.5.fb" "9.6.fb" "9.7.fb" "9.8.fb" "9.9.fb" "9.10.fb" "9.11.fb" "10.0.fb" "10.1.fb" "10.2.fb" "10.3.fb" "10.4.fb" "10.5.fb" "10.6.fb" "10.7.fb" "10.8.fb")
 # To check for DB forward compatibility without loading options (in addition
 # to the "with loading options" set), as well as backward compatibility
 declare -a db_forward_no_options_refs=() # N/A at the moment
diff --git a/unreleased_history/behavior_changes/fifo_compaction_temperature b/unreleased_history/behavior_changes/fifo_compaction_temperature
deleted file mode 100644
index ff0ab32e0555..000000000000
--- a/unreleased_history/behavior_changes/fifo_compaction_temperature
+++ /dev/null
@@ -1 +0,0 @@
-* `kChangeTemperature` FIFO compaction will now honor `compaction_target_temp` to all levels regardless of `cf_options::last_level_temperature`
diff --git a/unreleased_history/behavior_changes/udi_non_bytewise_comparator.md b/unreleased_history/behavior_changes/udi_non_bytewise_comparator.md
deleted file mode 100644
index f1494fc1eb36..000000000000
--- a/unreleased_history/behavior_changes/udi_non_bytewise_comparator.md
+++ /dev/null
@@ -1 +0,0 @@
-Allow UDIs with a non BytewiseComparator
diff --git a/unreleased_history/bug_fixes/mscan_range_limit_between_files.md b/unreleased_history/bug_fixes/mscan_range_limit_between_files.md
deleted file mode 100644
index be94aa90ed06..000000000000
--- a/unreleased_history/bug_fixes/mscan_range_limit_between_files.md
+++ /dev/null
@@ -1 +0,0 @@
-Fix incorrect MultiScan seek error status due to bugs in handling range limit falling between adjacent SST files key range.
diff --git a/unreleased_history/bug_fixes/multi_scan_page_unpin_bug_fix.md b/unreleased_history/bug_fixes/multi_scan_page_unpin_bug_fix.md
deleted file mode 100644
index 3fed513995fa..000000000000
--- a/unreleased_history/bug_fixes/multi_scan_page_unpin_bug_fix.md
+++ /dev/null
@@ -1 +0,0 @@
-Fix a bug in Page unpinning in MultiScan
diff --git a/unreleased_history/new_features/fs_prefetch_support.md b/unreleased_history/new_features/fs_prefetch_support.md
deleted file mode 100644
index 2dace9301ad2..000000000000
--- a/unreleased_history/new_features/fs_prefetch_support.md
+++ /dev/null
@@ -1 +0,0 @@
-Add kFSPrefetch to FSSupportedOps enum to allow file systems to indicate prefetch support capability, avoiding unnecessary prefetch system calls on file systems that don't support them.
diff --git a/unreleased_history/new_features/resume_compaction.md b/unreleased_history/new_features/resume_compaction.md
deleted file mode 100644
index 3960b3126095..000000000000
--- a/unreleased_history/new_features/resume_compaction.md
+++ /dev/null
@@ -1 +0,0 @@
-Added experimental support `OpenAndCompactOptions::allow_resumption` for resumable compaction that persists progress during `OpenAndCompact()`, allowing interrupted compactions to resume from the last progress persitence. The default behavior is to not persist progress.
diff --git a/unreleased_history/performance_improvements/lz4.md b/unreleased_history/performance_improvements/lz4.md
deleted file mode 100644
index 5ae1656dfa75..000000000000
--- a/unreleased_history/performance_improvements/lz4.md
+++ /dev/null
@@ -1 +0,0 @@
-* Fixed a performance regression in LZ4 compression that started in version 10.6.0
diff --git a/unreleased_history/public_api_changes/manual_compaction_output_temperature b/unreleased_history/public_api_changes/manual_compaction_output_temperature
deleted file mode 100644
index a9ac7ac2a017..000000000000
--- a/unreleased_history/public_api_changes/manual_compaction_output_temperature
+++ /dev/null
@@ -1 +0,0 @@
-* Allow specifying output temperature in CompactionOptions
diff --git a/unreleased_history/public_api_changes/manual_wal_flush_priority b/unreleased_history/public_api_changes/manual_wal_flush_priority
deleted file mode 100644
index 3dc34c8f146a..000000000000
--- a/unreleased_history/public_api_changes/manual_wal_flush_priority
+++ /dev/null
@@ -1 +0,0 @@
-Added `DB::FlushWAL(const FlushWALOptions&)` as an alternative to `DB::FlushWAL(bool sync)`, where `FlushWALOptions` includes a new `rate_limiter_priority` field (default `Env::IO_TOTAL`) that allows rate limiting and priority passing of manual WAL flush's IO operations.
diff --git a/unreleased_history/public_api_changes/multi_scan_api_contract.md b/unreleased_history/public_api_changes/multi_scan_api_contract.md
deleted file mode 100644
index f988794c6973..000000000000
--- a/unreleased_history/public_api_changes/multi_scan_api_contract.md
+++ /dev/null
@@ -1 +0,0 @@
-The MultiScan API contract is updated. After a multi scan range got prepared with Prepare API call, the following seeks must seek the start of each prepared scan range in order. In addition, when limit is set, upper bound must be set to the same value of limit before each seek

From 144e9f1e4298f0fe3641d0f98ed68c238037e10d Mon Sep 17 00:00:00 2001
From: Changyu Bi <changyubi@meta.com>
Date: Thu, 23 Oct 2025 13:34:07 -0700
Subject: [PATCH 349/500] Fix compaction picking with L0 standalone range
 deletion file (#14061)

Summary:
When a standalone range deletion file is ingested in L0, currently it is compacted with any overlapping L0 files. This is not desirable when we ingest new data on top of the range deletion file. This PR fixes the compaction picking logic to only consider L0 files older than the standalone range deletion file.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14061

Test Plan: added a new unit test and updated an existing one.

Reviewed By: xingbowang

Differential Revision: D84930780

Pulled By: cbi42

fbshipit-source-id: 65f4403ccb40ba964b9e65b09e2f7f7efebe81df
---
 db/compaction/compaction.h                    |  4 ++
 db/compaction/compaction_picker.cc            | 17 ++++---
 db/compaction/compaction_picker.h             | 10 +++-
 db/compaction/compaction_picker_test.cc       | 47 +++++++++++++++++++
 db/compaction/compaction_picker_universal.cc  | 14 +++++-
 db/db_iterator_test.cc                        |  5 ++
 db/version_set.cc                             | 17 ++++++-
 db/version_set.h                              | 15 ++++--
 .../behavior_changes/standalone-range-del.md  |  1 +
 9 files changed, 114 insertions(+), 16 deletions(-)
 create mode 100644 unreleased_history/behavior_changes/standalone-range-del.md

diff --git a/db/compaction/compaction.h b/db/compaction/compaction.h
index 46870fbb7835..44eb876ac71a 100644
--- a/db/compaction/compaction.h
+++ b/db/compaction/compaction.h
@@ -180,6 +180,10 @@ class Compaction {
   const std::vector<CompactionInputFiles>* inputs() { return &inputs_; }
 
   // Returns the LevelFilesBrief of the specified compaction input level.
+  // Note that if the compaction includes standalone range deletion file,
+  // this function returns the result after filtering out input files covered
+  // by the range deletion file.
+  // Use inputs() if you want to get the original input files.
   const LevelFilesBrief* input_levels(size_t compaction_input_level) const {
     return &input_levels_[compaction_input_level];
   }
diff --git a/db/compaction/compaction_picker.cc b/db/compaction/compaction_picker.cc
index a59a28e819b4..b92a507ce2d4 100644
--- a/db/compaction/compaction_picker.cc
+++ b/db/compaction/compaction_picker.cc
@@ -242,7 +242,7 @@ bool CompactionPicker::ExpandInputsToCleanCut(const std::string& /*cf_name*/,
     GetRange(*inputs, &smallest, &largest);
     inputs->clear();
     vstorage->GetOverlappingInputs(level, &smallest, &largest, &inputs->files,
-                                   hint_index, &hint_index, true,
+                                   hint_index, &hint_index, true, nullptr,
                                    next_smallest);
   } while (inputs->size() > old_size);
 
@@ -465,7 +465,8 @@ bool CompactionPicker::SetupOtherInputs(
     const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
     VersionStorageInfo* vstorage, CompactionInputFiles* inputs,
     CompactionInputFiles* output_level_inputs, int* parent_index,
-    int base_index, bool only_expand_towards_right) {
+    int base_index, bool only_expand_towards_right,
+    const FileMetaData* starting_l0_file) {
   assert(!inputs->empty());
   assert(output_level_inputs->empty());
   const int input_level = inputs->level;
@@ -521,11 +522,11 @@ bool CompactionPicker::SetupOtherInputs(
       // Round-robin compaction only allows expansion towards the larger side.
       vstorage->GetOverlappingInputs(input_level, &smallest, &all_limit,
                                      &expanded_inputs.files, base_index,
-                                     nullptr);
+                                     nullptr, true, starting_l0_file);
     } else {
       vstorage->GetOverlappingInputs(input_level, &all_start, &all_limit,
                                      &expanded_inputs.files, base_index,
-                                     nullptr);
+                                     nullptr, true, starting_l0_file);
     }
     uint64_t expanded_inputs_size = TotalFileSize(expanded_inputs.files);
     if (!ExpandInputsToCleanCut(cf_name, vstorage, &expanded_inputs)) {
@@ -1231,7 +1232,7 @@ void CompactionPicker::PickFilesMarkedForCompaction(
 
 bool CompactionPicker::GetOverlappingL0Files(
     VersionStorageInfo* vstorage, CompactionInputFiles* start_level_inputs,
-    int output_level, int* parent_index) {
+    int output_level, int* parent_index, const FileMetaData* starting_l0_file) {
   // Two level 0 compaction won't run at the same time, so don't need to worry
   // about files on level 0 being compacted.
   assert(level0_compactions_in_progress()->empty());
@@ -1242,7 +1243,11 @@ bool CompactionPicker::GetOverlappingL0Files(
   // which will include the picked file.
   start_level_inputs->files.clear();
   vstorage->GetOverlappingInputs(0, &smallest, &largest,
-                                 &(start_level_inputs->files));
+                                 &(start_level_inputs->files),
+                                 /*hint_index=*/-1,
+                                 /*file_index=*/nullptr,
+                                 /*expand_range=*/true,
+                                 /*starting_l0_file=*/starting_l0_file);
 
   // If we include more L0 files in the same compaction run it can
   // cause the 'smallest' and 'largest' key to get extended to a
diff --git a/db/compaction/compaction_picker.h b/db/compaction/compaction_picker.h
index 9591d8f0d23b..f5cfdb16f4c8 100644
--- a/db/compaction/compaction_picker.h
+++ b/db/compaction/compaction_picker.h
@@ -203,13 +203,16 @@ class CompactionPicker {
       const std::vector<CompactionInputFiles>& inputs, int level,
       int proximal_level) const;
 
+  // @param starting_l0_file If not null, restricts L0 file selection to only
+  //                         include files at or older than starting_l0_file.
   bool SetupOtherInputs(const std::string& cf_name,
                         const MutableCFOptions& mutable_cf_options,
                         VersionStorageInfo* vstorage,
                         CompactionInputFiles* inputs,
                         CompactionInputFiles* output_level_inputs,
                         int* parent_index, int base_index,
-                        bool only_expand_towards_right = false);
+                        bool only_expand_towards_right = false,
+                        const FileMetaData* starting_l0_file = nullptr);
 
   void GetGrandparents(VersionStorageInfo* vstorage,
                        const CompactionInputFiles& inputs,
@@ -222,9 +225,12 @@ class CompactionPicker {
       CompactionInputFiles* start_level_inputs,
       std::function<bool(const FileMetaData*)> skip_marked_file);
 
+  // @param starting_l0_file If not null, restricts L0 file selection to only
+  //                         include files at or older than starting_l0_file.
   bool GetOverlappingL0Files(VersionStorageInfo* vstorage,
                              CompactionInputFiles* start_level_inputs,
-                             int output_level, int* parent_index);
+                             int output_level, int* parent_index,
+                             const FileMetaData* starting_l0_file = nullptr);
 
   // Register this compaction in the set of running compactions
   void RegisterCompaction(Compaction* c);
diff --git a/db/compaction/compaction_picker_test.cc b/db/compaction/compaction_picker_test.cc
index 605678295cb9..ddc4792d6b56 100644
--- a/db/compaction/compaction_picker_test.cc
+++ b/db/compaction/compaction_picker_test.cc
@@ -4716,6 +4716,53 @@ TEST_F(CompactionPickerTest, UniversalMaxReadAmpSmallDB) {
   }
 }
 
+TEST_F(CompactionPickerTest, StandaloneRangeDeletionOnlyPicksOlderFiles) {
+  NewVersionStorage(6, kCompactionStyleUniversal);
+
+  // Create L0 files with overlapping ranges
+  // File 1: newest regular file (epoch 5), keys [100, 200]
+  Add(0, 1U, "100", "200", 1U, 0, 100, 100, 0, false, Temperature::kUnknown,
+      kUnknownOldestAncesterTime, kUnknownNewestKeyTime, Slice(), Slice(), 5);
+
+  // File 2: standalone range deletion (epoch 4), keys [150, 250]
+  // This file should be marked as having only range deletions
+  Add(0, 2U, "150", "250", 1U, 0, 200, 200, 0, true, Temperature::kUnknown,
+      kUnknownOldestAncesterTime, kUnknownNewestKeyTime, Slice(), Slice(), 4);
+
+  // Manually set file 2 as standalone range deletion
+  FileMetaData* range_del_file = file_map_[2U].first;
+  range_del_file->num_entries = 1;
+  range_del_file->num_range_deletions = 1;
+  ASSERT_TRUE(range_del_file->FileIsStandAloneRangeTombstone());
+
+  Add(4, 10U, "000", "400", 1U);
+  Add(5, 20U, "000", "400", 100);
+
+  UpdateVersionStorageInfo();
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+  ASSERT_TRUE(universal_compaction_picker.NeedsCompaction(vstorage_.get()));
+
+  std::unique_ptr<Compaction> compaction(
+      universal_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, mutable_db_options_,
+          /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+          vstorage_.get(), &log_buffer_));
+
+  ASSERT_NE(nullptr, compaction);
+  ASSERT_EQ(2U, compaction->num_input_levels());
+  // First input level should be L0 with only the standalone range del file
+  // (file 2)
+  ASSERT_EQ(0, compaction->level(0));
+  ASSERT_EQ(1U, compaction->num_input_files(0));
+  ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_TRUE(compaction->input(0, 0)->FileIsStandAloneRangeTombstone());
+
+  // Second input level should be L4 with file 10
+  ASSERT_EQ(4, compaction->level(1));
+  ASSERT_EQ(1U, compaction->num_input_files(1));
+  ASSERT_EQ(10U, compaction->input(1, 0)->fd.GetNumber());
+}
+
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/db/compaction/compaction_picker_universal.cc b/db/compaction/compaction_picker_universal.cc
index c7223fd9ed8f..13f2831c4a16 100644
--- a/db/compaction/compaction_picker_universal.cc
+++ b/db/compaction/compaction_picker_universal.cc
@@ -1544,9 +1544,18 @@ Compaction* UniversalCompactionBuilder::PickDeleteTriggeredCompaction() {
     }
 
     if (output_level != 0) {
+      // For standalone range deletion, we don't want to compact it with newer
+      // L0 files that it doesn't cover.
+      const FileMetaData* starting_l0_file =
+          (start_level == 0 && start_level_inputs.size() == 1 &&
+           start_level_inputs.files[0]->FileIsStandAloneRangeTombstone())
+              ? start_level_inputs.files[0]
+              : nullptr;
+
       if (start_level == 0) {
         if (!picker_->GetOverlappingL0Files(vstorage_, &start_level_inputs,
-                                            output_level, nullptr)) {
+                                            output_level, nullptr,
+                                            starting_l0_file)) {
           return nullptr;
         }
       }
@@ -1557,7 +1566,8 @@ Compaction* UniversalCompactionBuilder::PickDeleteTriggeredCompaction() {
       output_level_inputs.level = output_level;
       if (!picker_->SetupOtherInputs(cf_name_, mutable_cf_options_, vstorage_,
                                      &start_level_inputs, &output_level_inputs,
-                                     &parent_index, -1)) {
+                                     &parent_index, -1, false,
+                                     starting_l0_file)) {
         return nullptr;
       }
       inputs.push_back(start_level_inputs);
diff --git a/db/db_iterator_test.cc b/db/db_iterator_test.cc
index 97f73026f1dc..842a38f4b35e 100644
--- a/db/db_iterator_test.cc
+++ b/db/db_iterator_test.cc
@@ -4602,6 +4602,11 @@ TEST_P(DBMultiScanIteratorTest, FragmentedRangeTombstones) {
   ASSERT_OK(s);
 
   ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  ColumnFamilyMetaData cf_meta;
+  dbfull()->GetColumnFamilyMetaData(cfh, &cf_meta);
+  // Only the L0 with range deletion is compacted.
+  ASSERT_EQ(1, cf_meta.levels[0].files.size());
+  ASSERT_EQ(0, cf_meta.levels[0].files[0].num_deletions);
 
   // The first scan range overlaps the DB key range, while the second extends
   // beyond but overlaps the delete range
diff --git a/db/version_set.cc b/db/version_set.cc
index 90c8e1a8bc8e..ed43e0c98571 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -4487,7 +4487,8 @@ bool VersionStorageInfo::OverlapInLevel(int level,
 void VersionStorageInfo::GetOverlappingInputs(
     int level, const InternalKey* begin, const InternalKey* end,
     std::vector<FileMetaData*>* inputs, int hint_index, int* file_index,
-    bool expand_range, InternalKey** next_smallest) const {
+    bool expand_range, const FileMetaData* starting_l0_file,
+    InternalKey** next_smallest) const {
   if (level >= num_non_empty_levels_) {
     // this level is empty, no overlapping inputs
     return;
@@ -4520,7 +4521,19 @@ void VersionStorageInfo::GetOverlappingInputs(
 
   // index stores the file index need to check.
   std::list<size_t> index;
-  for (size_t i = 0; i < level_files_brief_[level].num_files; i++) {
+  size_t start_index = 0;
+  if (starting_l0_file != nullptr) {
+    uint64_t starting_file_number = starting_l0_file->fd.GetNumber();
+    for (size_t i = 0; i < level_files_brief_[level].num_files; i++) {
+      if (level_files_brief_[level].files[i].fd.GetNumber() ==
+          starting_file_number) {
+        start_index = i;
+        break;
+      }
+    }
+    assert(start_index < level_files_brief_[level].num_files);
+  }
+  for (size_t i = start_index; i < level_files_brief_[level].num_files; i++) {
     index.emplace_back(i);
   }
 
diff --git a/db/version_set.h b/db/version_set.h
index b20ab972f20f..cff81717bc63 100644
--- a/db/version_set.h
+++ b/db/version_set.h
@@ -268,8 +268,13 @@ class VersionStorageInfo {
       bool expand_range = true,   // if set, returns files which overlap the
                                   // range and overlap each other. If false,
                                   // then just files intersecting the range
-      InternalKey** next_smallest = nullptr)  // if non-null, returns the
-      const;  // smallest key of next file not included
+      const FileMetaData* starting_l0_file =
+          nullptr,  // If not null, restricts L0 file selection to only include
+                    // files at or older than starting_l0_file.
+      InternalKey** next_smallest =
+          nullptr  // if non-null, returns the
+                   // smallest key of next file not included
+  ) const;
   void GetCleanInputsWithinInterval(
       int level, const InternalKey* begin,  // nullptr means before all keys
       const InternalKey* end,               // nullptr means after all keys
@@ -286,8 +291,10 @@ class VersionStorageInfo {
       int hint_index,                // index of overlap file
       int* file_index,               // return index of overlap file
       bool within_interval = false,  // if set, force the inputs within interval
-      InternalKey** next_smallest = nullptr)  // if non-null, returns the
-      const;  // smallest key of next file not included
+      InternalKey** next_smallest =
+          nullptr  // if non-null, returns the
+                   // smallest key of next file not included
+  ) const;
 
   // Returns true iff some file in the specified level overlaps
   // some part of [*smallest_user_key,*largest_user_key].
diff --git a/unreleased_history/behavior_changes/standalone-range-del.md b/unreleased_history/behavior_changes/standalone-range-del.md
new file mode 100644
index 000000000000..6d95bece1964
--- /dev/null
+++ b/unreleased_history/behavior_changes/standalone-range-del.md
@@ -0,0 +1 @@
+* Updated standalone range deletion L0 file compaction behavior to avoid compacting with any newer L0 files (which is expensive and not useful).

From fac8222bfe434491473c608dfb04477897493b80 Mon Sep 17 00:00:00 2001
From: Jay Huh <jewoongh@meta.com>
Date: Thu, 23 Oct 2025 18:10:12 -0700
Subject: [PATCH 350/500] Make Meta Internal Linter happy (#14074)

Summary:
Linter complains like this
```
  void foo(Arg parameter_name) {}
    void bar() {
    Arg a;
    foo(/*some_other_name=*/ a); // Wrong! Comment/parameter name mismatch
    foo(/*parameter_name=*/ a);  // This is OK; the names match.
  }
```
```
Argument name in comment (`read_only`) does not match parameter name (`unchanging`).
```

This used to be warning, but now treated as an error :(

Fixing a few other linter warnings before they become errors in the future.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14074

Test Plan: CI

Reviewed By: archang19

Differential Revision: D85370353

Pulled By: jaykorean

fbshipit-source-id: 20e96aad740d516a29c0424282674e655f99c0a2
---
 db/compaction/compaction_job_test.cc | 10 ++++++----
 tools/ldb_cmd.cc                     |  4 ++--
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/db/compaction/compaction_job_test.cc b/db/compaction/compaction_job_test.cc
index 4c5f889de847..e1e11e76fe36 100644
--- a/db/compaction/compaction_job_test.cc
+++ b/db/compaction/compaction_job_test.cc
@@ -216,7 +216,7 @@ class CompactionJobTestBase : public testing::Test {
             /*block_cache_tracer=*/nullptr,
             /*io_tracer=*/nullptr, /*db_id=*/"", /*db_session_id=*/"",
             /*daily_offpeak_time_utc=*/"",
-            /*error_handler=*/nullptr, /*read_only=*/false)),
+            /*error_handler=*/nullptr, /*unchanging=*/false)),
         shutting_down_(false),
         mock_table_factory_(new mock::MockTableFactory()),
         error_handler_(nullptr, db_options_, &mutex_),
@@ -552,7 +552,7 @@ class CompactionJobTestBase : public testing::Test {
                        /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
                        test::kUnitTestDbId, /*db_session_id=*/"",
                        /*daily_offpeak_time_utc=*/"",
-                       /*error_handler=*/nullptr, /*read_only=*/false));
+                       /*error_handler=*/nullptr, /*unchanging=*/false));
     compaction_job_stats_.Reset();
 
     VersionEdit new_db;
@@ -2420,7 +2420,7 @@ class ResumableCompactionJobTest : public CompactionJobTestBase {
 
  protected:
   static constexpr const char* kCancelBeforeThisKey = "cancel_before_this_key";
-  std::string progress_dir_ = "";
+  std::string progress_dir_;
   bool enable_cancel_ = false;
   std::atomic<int> stop_count_{0};
   std::atomic<bool> cancel_{false};
@@ -2580,7 +2580,9 @@ class ResumableCompactionJobTest : public CompactionJobTestBase {
 
     while (reader.ReadRecord(&slice, &record)) {
       VersionEdit edit;
-      if (!edit.DecodeFrom(slice).ok()) continue;
+      if (!edit.DecodeFrom(slice).ok()) {
+        continue;
+      }
       builder.ProcessVersionEdit(edit);
     }
 
diff --git a/tools/ldb_cmd.cc b/tools/ldb_cmd.cc
index 96842a86e3a5..9ab70b97410b 100644
--- a/tools/ldb_cmd.cc
+++ b/tools/ldb_cmd.cc
@@ -1614,7 +1614,7 @@ void DumpManifestFile(Options options, std::string file, bool verbose, bool hex,
                       /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
                       /*db_id=*/"", /*db_session_id=*/"",
                       options.daily_offpeak_time_utc,
-                      /*error_handler=*/nullptr, /*read_only=*/true);
+                      /*error_handler=*/nullptr, /*unchanging=*/true);
   Status s = versions.DumpManifest(options, file, verbose, hex, json, cf_descs);
   if (!s.ok()) {
     fprintf(stderr, "Error in processing file %s %s\n", file.c_str(),
@@ -1809,7 +1809,7 @@ Status GetLiveFilesChecksumInfoFromVersionSet(Options options,
                       /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
                       /*db_id=*/"", /*db_session_id=*/"",
                       options.daily_offpeak_time_utc,
-                      /*error_handler=*/nullptr, /*read_only=*/true);
+                      /*error_handler=*/nullptr, /*unchanging=*/true);
   std::vector<std::string> cf_name_list;
   s = versions.ListColumnFamilies(&cf_name_list, db_path,
                                   immutable_db_options.fs.get());

From dce33f9443815dcbe1d9a98d4d34776dfdf1112e Mon Sep 17 00:00:00 2001
From: Xingbo Wang <xbw@fb.com>
Date: Thu, 23 Oct 2025 20:34:21 -0700
Subject: [PATCH 351/500] Follow up on MultiScan change in #14040 (#14055)

Summary:
* Address feedback from https://github.com/facebook/rocksdb/issues/14040
* Add additional test for MultiScan
* Fix a bug when del range and data are in same file for multi-scan
* Rewrite the cases need to be handled in SeekMultiScan

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14055

Test Plan: Unit test

Reviewed By: cbi42, anand1976

Differential Revision: D84851788

Pulled By: xingbowang

fbshipit-source-id: 0f69632733afb99685f6341badbf239681010c38
---
 db/version_set.cc                             |   4 +-
 .../block_based/block_based_table_iterator.cc | 282 ++++++++++--------
 .../block_based/block_based_table_iterator.h  |  26 +-
 .../block_based_table_reader_test.cc          |  17 +-
 table/table_test.cc                           |  87 +++---
 5 files changed, 245 insertions(+), 171 deletions(-)

diff --git a/db/version_set.cc b/db/version_set.cc
index ed43e0c98571..8b835e75c704 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -1188,8 +1188,8 @@ class LevelIterator final : public InternalIterator {
             continue;
           }
           auto const metadata = flevel_->files[i].file_metadata;
-          if (metadata->num_entries == metadata->num_range_deletions) {
-            // Skip range deletion only files.
+          if (metadata->FileIsStandAloneRangeTombstone()) {
+            // Skip stand alone range deletion files.
             continue;
           }
           auto& args = GetMultiScanArgForFile(i);
diff --git a/table/block_based/block_based_table_iterator.cc b/table/block_based/block_based_table_iterator.cc
index 10b4a70897f2..a505a8449329 100644
--- a/table/block_based/block_based_table_iterator.cc
+++ b/table/block_based/block_based_table_iterator.cc
@@ -1045,13 +1045,6 @@ void BlockBasedTableIterator::Prepare(const MultiScanArgs* multiscan_opts) {
 }
 
 void BlockBasedTableIterator::SeekMultiScan(const Slice* seek_target) {
-  if (SeekMultiScanImpl(seek_target)) {
-    is_out_of_bound_ = true;
-    assert(!Valid());
-  }
-}
-
-bool BlockBasedTableIterator::SeekMultiScanImpl(const Slice* seek_target) {
   assert(multi_scan_ && multi_scan_status_.ok());
   // This is a MultiScan and Preapre() has been called.
 
@@ -1063,49 +1056,59 @@ bool BlockBasedTableIterator::SeekMultiScanImpl(const Slice* seek_target) {
   if (!seek_target) {
     // start key must be set for multi-scan
     multi_scan_status_ = Status::InvalidArgument("No seek key for MultiScan");
-    return false;
+    return;
   }
 
-  constexpr auto out_of_bound = true;
-
   // Check the case where there is no range prepared on this table
   if (multi_scan_->scan_opts->size() == 0) {
     // out of bound
-    return out_of_bound;
+    MarkPreparedRangeExhausted();
+    return;
   }
 
   // Check whether seek key is moving forward.
-  if (!multi_scan_->prev_seek_key_.empty()) {
-    if (user_comparator_.CompareWithoutTimestamp(ExtractUserKey(*seek_target),
-                                                 /*a_has_ts=*/true,
-                                                 multi_scan_->prev_seek_key_,
-                                                 /*b_has_ts=*/false) < 0) {
-      // The seek target moved backward
-      multi_scan_status_ =
-          Status::InvalidArgument("Unexpected seek key moving backward");
-      return false;
-    }
-  }
-  multi_scan_->prev_seek_key_ = ExtractUserKey(*seek_target).ToString();
-
-  // There are still a few cases we need to handle
-  // table: _____[prepared range 1]_____[prepared range 2]_____
-  // seek :   1  2        3          4                      5
-  // Case 1: seek before the first prepared ranges, return out of bound
-  // Case 2: seek at the beginning of a prepared range (expected case)
-  // Case 3: seek within a prepared range (unexpected, but supported)
-  // Case 4: seek between 2 of the prepared ranges, return out of bound
-  // Case 5: seek after all of the prepared ranges, should move on to next file
-  // The reason this could happen is due to seek key adjustment due to delete
-  // range file.
-  // E.g. LSM has 3 levels, each level has only 1 file:
-  // L1 : key :              0---10
-  // L2 : Delete range key : 0-5
-  // L3 : key :              0---10
-  // When a range 2-8 was prepared, the prepared key would be 2 on L3 file, but
-  // the seek key would be 5, as the seek key was updated by the largest key of
-  // delete range. This causes all of the cases above to be possible, when the
-  // ranges are adjusted in the above examples.
+  if (multi_scan_->prev_seek_key_.empty() ||
+      icomp_.Compare(*seek_target, multi_scan_->prev_seek_key_) > 0) {
+    // If seek key is empty or is larger than previous seek key, update the
+    // previous seek key. Otherwise use the previous seek key as the adjusted
+    // seek target moving forward. This prevents seek target going backward,
+    // which would visit pages that have been unpinned.
+    // This issue is caused by sub-optimal range delete handling inside merge
+    // iterator.
+    // TODO xingbo issues:14068 : Optimize the handling of range delete iterator
+    // inside merge iterator, so that it doesn't move seek key backward. After
+    // that we could return error if the key moves backward here.
+    multi_scan_->prev_seek_key_ = seek_target->ToString();
+  } else {
+    // Seek key is adjusted to previous one, we can return here directly.
+    return;
+  }
+
+  // There are 3 different Cases we need to handle:
+  // The following diagram explain different seek targets seeking at various
+  // position on the table, while the next_scan_idx points to the PreparedRange
+  // 2.
+  //
+  // next_scan_idx: -------------------┐
+  //                                   ▼
+  // table:     : __[PreparedRange 1]__[PreparedRange 2]__[PreparedRange 3]__
+  // Seek target: <----- Case 1 ------>▲<------------- Case 2 -------------->
+  //                                   │
+  //                                 Case 3
+  //
+  // Case 1: seek before the start of next prepared ranges. This could happen
+  //    due to too many delete tomestone triggered reseek or delete range.
+  // Case 2: seek after the start of next prepared range.
+  //    This could happen due to seek key adjustment from delete range file.
+  //    E.g. LSM has 3 levels, each level has only 1 file:
+  //    L1 : key :              0---10
+  //    L2 : Delete range key : 0-5
+  //    L3 : key :              0---10
+  //    When a range 2-8 was prepared, the prepared key would be 2 on L3 file,
+  //    but the seek key would be 5, as the seek key was updated by the largest
+  //    key of delete range. This causes all of the cases above to be possible,
+  //    when the ranges are adjusted in the above examples.
+  // Case 3: seek at the beginning of a prepared range (expected case)
 
   // Allow reseek on the start of the last prepared range due to too many
   // tombstone
@@ -1113,83 +1116,152 @@ bool BlockBasedTableIterator::SeekMultiScanImpl(const Slice* seek_target) {
       std::min(multi_scan_->next_scan_idx,
                multi_scan_->block_index_ranges_per_scan.size() - 1);
 
+  auto user_seek_target = ExtractUserKey(*seek_target);
+
   auto compare_next_scan_start_result =
       user_comparator_.CompareWithoutTimestamp(
-          ExtractUserKey(*seek_target), /*a_has_ts=*/true,
+          user_seek_target, /*a_has_ts=*/true,
           multi_scan_->scan_opts->GetScanRanges()[multi_scan_->next_scan_idx]
               .range.start.value(),
           /*b_has_ts=*/false);
 
   if (compare_next_scan_start_result != 0) {
-    // The seek key is not exactly same as what was prepared.
+    // The seek target is not exactly same as what was prepared.
     if (compare_next_scan_start_result < 0) {
-      // Needs to handle Cases: 1, 3, 4
-      //
-      // next_scan_idx :                    |
-      //                                    V
-      // table: _____[prepared range 1]_____[prepared range 2]_____
-      // seek :   1           3          4
-
-      // Case 1: Seek key is before the start key of the first range
+      // Case 1:
       if (multi_scan_->next_scan_idx == 0) {
-        return out_of_bound;
+        // This should not happen, even when seek target is adjusted by delete
+        // range. The reason is that if the seek target is before the start key
+        // of the first prepared range, its end key needs to be >= the smallest
+        // key of this file, otherwise it is skipped in level iterator. If its
+        // end key is >= the smallest key of this file, then this range will be
+        // prepared for this file. As delete range could only adjust seek
+        // target forward, so it would never be before the start key of the
+        // first prepared range.
+        assert(false && "Seek target before the first prepared range");
+        MarkPreparedRangeExhausted();
+        return;
+      }
+      auto seek_target_before_previous_prepared_range =
+          user_comparator_.CompareWithoutTimestamp(
+              user_seek_target, /*a_has_ts=*/true,
+              multi_scan_->scan_opts
+                  ->GetScanRanges()[multi_scan_->next_scan_idx - 1]
+                  .range.start.value(),
+              /*b_has_ts=*/false) < 0;
+      // Not expected to happen
+      // This should never happen, the reason is that the
+      // multi_scan_->next_scan_idx is set to a non zero value is due to a seek
+      // target larger or equal to the start key of multi_scan_->next_scan_idx-1
+      // happended earlier. If a seek happens before the start key of
+      // multi_scan_->next_scan_idx-1, it would seek a key that is less than
+      // what was seeked before.
+      assert(!seek_target_before_previous_prepared_range);
+      if (seek_target_before_previous_prepared_range) {
+        multi_scan_status_ = Status::InvalidArgument(
+            "Seek target is before the previous prepared range at index " +
+            std::to_string(multi_scan_->next_scan_idx));
+        return;
       }
-      // Case: 3, 4
+      // It should only be possible to seek a key between the start of current
+      // prepared scan and start of next prepared range.
       MultiScanUnexpectedSeekTarget(
-          seek_target, std::get<0>(multi_scan_->block_index_ranges_per_scan
-                                       [multi_scan_->next_scan_idx - 1]));
-
+          seek_target, &user_seek_target,
+          std::get<0>(multi_scan_->block_index_ranges_per_scan
+                          [multi_scan_->next_scan_idx - 1]));
     } else {
-      // Needs to handle Cases: 3, 4, 5
-      // next_scan_idx :|
-      //                V
-      // table:     ____[prepared range 1]_____[prepared range 2]_____
-      // seek :                 3           4                      5
+      // Case 2:
       MultiScanUnexpectedSeekTarget(
-          seek_target,
+          seek_target, &user_seek_target,
           std::get<0>(
               multi_scan_
                   ->block_index_ranges_per_scan[multi_scan_->next_scan_idx]));
     }
   } else {
-    if (multi_scan_->next_scan_idx >=
-        multi_scan_->block_index_ranges_per_scan.size()) {
-      // Seeking a range that is out side of prepared ranges.
-      return out_of_bound;
-    }
+    // Case 2:
+    assert(multi_scan_->next_scan_idx <
+           multi_scan_->block_index_ranges_per_scan.size());
 
     auto [cur_scan_start_idx, cur_scan_end_idx] =
         multi_scan_->block_index_ranges_per_scan[multi_scan_->next_scan_idx];
     // We should have the data block already loaded
     ++multi_scan_->next_scan_idx;
     if (cur_scan_start_idx >= cur_scan_end_idx) {
-      if (multi_scan_->next_scan_idx <
-          multi_scan_->block_index_ranges_per_scan.size()) {
-        return out_of_bound;
-      } else {
-        ResetDataIter();
-        return false;
-      }
-    } else {
-      is_out_of_bound_ = false;
+      // No blocks are prepared for this range at current file.
+      MarkPreparedRangeExhausted();
+      return;
     }
 
     MultiScanSeekTargetFromBlock(seek_target, cur_scan_start_idx);
   }
-
-  return false;
 }
 
 void BlockBasedTableIterator::MultiScanUnexpectedSeekTarget(
-    const Slice* seek_target, size_t block_idx) {
+    const Slice* seek_target, const Slice* user_seek_target, size_t block_idx) {
   // linear search the block that contains the seek target, and unpin blocks
   // that are before it.
+
+  // The logic here could be confusing when there is a delete range involved.
+  // E.g. we have an LSM with 3 levels, each level has only 1 file:
+  // L1: data file :    0---10
+  // L2: Delete range : 0-5
+  // L3: data file :    0---10
+  //
+  // MultiScan on ranges 1-2, 3-4, and 5-6.
+  // When user first do Seek(1), on level 2, due to delete range 0-5, the seek
+  // key is adjusted to 5 at level 3. Therefore, we will internally do Seek(5)
+  // and unpins all blocks until 5 at level 3. Then the next scan's blocks from
+  // 3-4 are unpinned at level 3. It is confusing that maybe block 3-4 should
+  // not be unpinned, as next scan would need it. But Seek(5) implies that these
+  // keys are all covered by some range deletion, so the next Seek(3) will also
+  // do Seek(5) internally, so the blocks from 3-4 could be safely unpinned.
+
+  // advance to the right prepared range
+  while (
+      multi_scan_->next_scan_idx <
+          multi_scan_->block_index_ranges_per_scan.size() &&
+      (user_comparator_.CompareWithoutTimestamp(
+           *user_seek_target, /*a_has_ts=*/true,
+           multi_scan_->scan_opts->GetScanRanges()[multi_scan_->next_scan_idx]
+               .range.start.value(),
+           /*b_has_ts=*/false) >= 0)) {
+    multi_scan_->next_scan_idx++;
+  }
+
+  // next_scan_idx is guaranteed to be higher than 0. If the seek key is before
+  // the start key of first prepared range, it is already handled by caller
+  // SeekMultiScan. It is equal, it would not call this funciton. If it is
+  // after, next_scan_idx would be advanced by the loop above.
+  assert(multi_scan_->next_scan_idx > 0);
+  // Get the current range
+  auto cur_scan_idx = multi_scan_->next_scan_idx - 1;
+  auto [cur_scan_start_idx, cur_scan_end_idx] =
+      multi_scan_->block_index_ranges_per_scan[cur_scan_idx];
+
+  if (cur_scan_start_idx >= cur_scan_end_idx) {
+    // No blocks are prepared for this range at current file.
+    MarkPreparedRangeExhausted();
+    return;
+  }
+
+  // Unpin all the blocks from multi_scan_->cur_data_block_idx to
+  // cur_scan_start_idx
+  for (auto unpin_block_idx = multi_scan_->cur_data_block_idx;
+       unpin_block_idx < cur_scan_start_idx; unpin_block_idx++) {
+    if (!multi_scan_->pinned_data_blocks[unpin_block_idx].IsEmpty()) {
+      multi_scan_->pinned_data_blocks[unpin_block_idx].Reset();
+    }
+  }
+
+  // Find the right block_idx;
+  block_idx = cur_scan_start_idx;
   auto const& data_block_separators = multi_scan_->data_block_separators;
   while (block_idx < data_block_separators.size() &&
          (user_comparator_.CompareWithoutTimestamp(
-              ExtractUserKey(*seek_target), /*a_has_ts=*/true,
+              *user_seek_target, /*a_has_ts=*/true,
               data_block_separators[block_idx],
               /*b_has_ts=*/false) > 0)) {
+    // Unpin the blocks that are passed
     if (!multi_scan_->pinned_data_blocks[block_idx].IsEmpty()) {
       multi_scan_->pinned_data_blocks[block_idx].Reset();
     }
@@ -1197,30 +1269,11 @@ void BlockBasedTableIterator::MultiScanUnexpectedSeekTarget(
   }
 
   if (block_idx >= data_block_separators.size()) {
-    // Handle case 5, when seek key is larger than the last block in the last
-    // prepared range.
-    ResetDataIter();
-    assert(!Valid());
+    // All of the prepared blocks for this file is exhausted.
+    MarkPreparedRangeExhausted();
     return;
   }
 
-  // // The iterator from previous seek may have moved forward a few blocks,
-  // // In that case, have block_idx catch up the cur_data_block_idx
-  // // Note no need to handle block unpin, as it has been handled during
-  // iterating block_idx = std::max(block_idx, multi_scan_->cur_data_block_idx);
-
-  // advance to the right prepared range
-  while (
-      multi_scan_->next_scan_idx <
-          multi_scan_->block_index_ranges_per_scan.size() &&
-      (user_comparator_.CompareWithoutTimestamp(
-           ExtractUserKey(*seek_target), /*a_has_ts=*/true,
-           multi_scan_->scan_opts->GetScanRanges()[multi_scan_->next_scan_idx]
-               .range.start.value(),
-           /*b_has_ts=*/false) >= 0)) {
-    multi_scan_->next_scan_idx++;
-  }
-
   // The current block may contain the data for the target key
   MultiScanSeekTargetFromBlock(seek_target, block_idx);
 }
@@ -1257,6 +1310,7 @@ void BlockBasedTableIterator::MultiScanSeekTargetFromBlock(
   block_iter_points_to_real_block_ = true;
   block_iter_.Seek(*seek_target);
   FindKeyForward();
+  CheckOutOfBound();
 }
 
 void BlockBasedTableIterator::FindBlockForwardInMultiScan() {
@@ -1275,20 +1329,7 @@ void BlockBasedTableIterator::FindBlockForwardInMultiScan() {
     // for this file, it may need to continue to scan into the next file, so
     // we do not set is_out_of_bound_ in this case.
     if (multi_scan_->cur_data_block_idx + 1 >= cur_scan_end_idx) {
-      if (multi_scan_->next_scan_idx >=
-          multi_scan_->block_index_ranges_per_scan.size()) {
-        // We are done with this file, should let LevelIter advance to the
-        // next file instead of ending the scan
-        ResetDataIter();
-        assert(!is_out_of_bound_);
-        assert(!Valid());
-        return;
-      }
-      // We don't ResetDataIter() here since next scan might be reading from
-      // the same block. ResetDataIter() will free the underlying block cache
-      // handle and we don't want the block to be unpinned.
-      is_out_of_bound_ = true;
-      assert(!Valid());
+      MarkPreparedRangeExhausted();
       return;
     }
     // Move to the next pinned data block
@@ -1419,7 +1460,7 @@ Status BlockBasedTableIterator::CollectBlockHandles(
     std::vector<std::tuple<size_t, size_t>>* block_index_ranges_per_scan,
     std::vector<std::string>* data_block_separators) {
   // print file name and level
-  if (kVerbose) {
+  if (UNLIKELY(kVerbose)) {
     auto file_name = table_->get_rep()->file->file_name();
     auto level = table_->get_rep()->level;
     printf("file name : %s, level %d\n", file_name.c_str(), level);
@@ -1480,11 +1521,16 @@ Status BlockBasedTableIterator::CollectBlockHandles(
     }
     block_index_ranges_per_scan->emplace_back(
         scan_block_handles->size() - num_blocks, scan_block_handles->size());
-    if (kVerbose) {
+    if (UNLIKELY(kVerbose)) {
       printf("separators :");
       for (const auto& separator : *data_block_separators) {
         printf("%s, ", separator.c_str());
       }
+      printf("\nblock_index_ranges_per_scan :");
+      for (auto const& block_index_range : *block_index_ranges_per_scan) {
+        printf("[%zu, %zu], ", std::get<0>(block_index_range),
+               std::get<1>(block_index_range));
+      }
       printf("\n");
     }
   }
diff --git a/table/block_based/block_based_table_iterator.h b/table/block_based/block_based_table_iterator.h
index 85e2f8d90923..a28133261559 100644
--- a/table/block_based/block_based_table_iterator.h
+++ b/table/block_based/block_based_table_iterator.h
@@ -381,6 +381,27 @@ class BlockBasedTableIterator : public InternalIteratorBase<Slice> {
   bool block_iter_points_to_real_block_;
   // See InternalIteratorBase::IsOutOfBound().
   bool is_out_of_bound_ = false;
+
+  // Mark prepared ranges as exhausted for multiscan.
+  void MarkPreparedRangeExhausted() {
+    assert(multi_scan_ != nullptr);
+    if (multi_scan_->next_scan_idx <
+        multi_scan_->block_index_ranges_per_scan.size()) {
+      // If there are more prepared ranges, we don't ResetDataIter() here,
+      // because next scan might be reading from the same block. ResetDataIter()
+      // will free the underlying block cache handle and we don't want the
+      // block to be unpinned.
+      // Set out of bound to mark the current prepared range as exhausted.
+      is_out_of_bound_ = true;
+    } else {
+      // This is the last prepared range of this file, there might be more
+      // data on next file. Reset data iterator to indicate the iterator is
+      // no longer valid on this file. Let LevelIter advance to the next file
+      // instead of ending the scan.
+      ResetDataIter();
+    }
+  }
+
   // During cache lookup to find readahead size, index_iter_ is iterated and it
   // can point to a different block.
   // If Prepare() is called, index_iter_ is used to prefetch data blocks for the
@@ -612,12 +633,8 @@ class BlockBasedTableIterator : public InternalIteratorBase<Slice> {
 
   // *** BEGIN APIs relevant to multiscan ***
 
-  // Wrapper for SeekMultiScanImpl for handling out of bound
   void SeekMultiScan(const Slice* target);
 
-  // Return true if the result is out of bound
-  bool SeekMultiScanImpl(const Slice* seek_target);
-
   void FindBlockForwardInMultiScan();
 
   void PrepareReadAsyncCallBack(FSReadRequest& req, void* cb_arg) {
@@ -642,6 +659,7 @@ class BlockBasedTableIterator : public InternalIteratorBase<Slice> {
 
   void MultiScanSeekTargetFromBlock(const Slice* seek_target, size_t block_idx);
   void MultiScanUnexpectedSeekTarget(const Slice* seek_target,
+                                     const Slice* user_seek_target,
                                      size_t block_idx);
 
   // Return true, if there is an error, or end of file
diff --git a/table/block_based/block_based_table_reader_test.cc b/table/block_based/block_based_table_reader_test.cc
index 9b40dd1d5f42..7b20759caa54 100644
--- a/table/block_based/block_based_table_reader_test.cc
+++ b/table/block_based/block_based_table_reader_test.cc
@@ -1274,13 +1274,13 @@ TEST_P(BlockBasedTableReaderMultiScanAsyncIOTest, MultiScanPrepare) {
                       ExtractUserKey(kv[33 * kEntriesPerBlock].first));
   iter->Prepare(&scan_options);
   iter->Seek(kv[32 * kEntriesPerBlock].first);
+  auto key = iter->key();
   ASSERT_OK(iter->status());
-  iter->Seek(kv[34 * kEntriesPerBlock].first);
-  ASSERT_OK(iter->status());
-  // Seek key could not going backward
   iter->Seek(kv[30 * kEntriesPerBlock].first);
-  ASSERT_EQ(iter->status(),
-            Status::InvalidArgument("Unexpected seek key moving backward"));
+  // When seek key goes backward, it is adjusted to the last seeked position.
+  // Assert the key read is same as before.
+  ASSERT_EQ(key, iter->key());
+  ASSERT_OK(iter->status());
 
   // Test prefetch limit reached.
   iter.reset(table->NewIterator(
@@ -1333,9 +1333,10 @@ TEST_P(BlockBasedTableReaderMultiScanAsyncIOTest, MultiScanPrepare) {
     std::cout << random_seed << std::endl;
     SCOPED_TRACE("Random seed " + std::to_string(random_seed));
 
-    int last_read_key_index = rnd.Uniform(100);
-    while (last_read_key_index < 100) {
-      iter->Seek(kv[last_read_key_index * kEntriesPerBlock].first);
+    // Search key always start from the start key of first prepared range.
+    int last_read_key_index = rnd.Uniform(100) + 5 * kEntriesPerBlock;
+    while (last_read_key_index < 100 * kEntriesPerBlock) {
+      iter->Seek(kv[last_read_key_index].first);
       EXPECT_OK(iter->status());
       // iterate for a few keys
       while (iter->Valid()) {
diff --git a/table/table_test.cc b/table/table_test.cc
index 869e6030ddb4..ae4fbea0e85c 100644
--- a/table/table_test.cc
+++ b/table/table_test.cc
@@ -9140,6 +9140,7 @@ class UserDefinedIndexStressTest
                                const std::vector<DataRange>& ranges,
                                bool& data_added) {
     std::unique_ptr<SstFileWriter> writer;
+
     data_added = false;
 
     std::vector<DataRange> ranges_in_file;
@@ -9151,9 +9152,12 @@ class UserDefinedIndexStressTest
       }
 
       if (writer == nullptr) {
+        // lazy create writer until there is data to be written to avoid
+        // unchecked status error
         writer = std::make_unique<SstFileWriter>(EnvOptions(), options_);
         ASSERT_OK(writer->Open(ingest_file));
       }
+
       ranges_in_file.push_back(range);
 
       data_added = true;
@@ -9256,6 +9260,7 @@ class UserDefinedIndexStressTest
       if (kVerbose) {
         std::cout << "iteration " << i << std::endl;
       }
+      SCOPED_TRACE("Iteration " + std::to_string(i));
       // randomly generate 1 to 3 ranges
       auto ranges = GenerateKeyRanges(rnd.Uniform(3) + 4, 2, "");
 
@@ -9299,13 +9304,14 @@ class UserDefinedIndexStressTest
                              size_t& ingest_file_count,
                              const IngestExternalFileOptions& ifo,
                              bool combine_ranges = false) {
-    std::vector<std::string> ingest_files;
     // Generate SST file and bulk load them one level at a time
+    std::vector<std::string> ingest_files;
     if (combine_ranges) {
       size_t i = 0;
       while (i < ranges_in_level.size()) {
         // if combine ranges, generate 1 SST file that combines muliple ranges
         // together
+        // Randomly combine ranges to SST file.
         size_t batch_end_idx =
             std::min(i + rnd.Uniform(3) + 2, ranges_in_level.size());
         bool data_added = false;
@@ -9361,22 +9367,7 @@ class UserDefinedIndexStressTest
   }
 };
 
-// TODO(xingbo)
-// This test is disabled due to following test case condition:
-// level n:   delete range 4-6
-// level n+1: data range 0-------10
-// query: 3-9, count=2.
-// Becuase query count == 2, level n+1 would only prepare 3-5. but since 4-6
-// got deleted in the upper level, they are not returned, so only 3 is
-// returned. Meantime the query should have return [3, 6]
-// One way to fix this is by preparing more data blocks once prepared blocks
-// are exhausted, but upper bound is not reached yet. This requires following
-// changes:
-// 1. Fix out of bound flag in block table iterator. Only set it if the key is
-// larger than the upper bound.
-// 2. Refactor the prepared block single dimension vector into 2 dimension of
-// vectors, so that more blocks could be prepared if needed.
-TEST_P(UserDefinedIndexStressTest, DISABLED_PartialDeleteRange) {
+TEST_P(UserDefinedIndexStressTest, PartialDeleteRange) {
   // Create 2 column families. One use normal put/del, the other uses sst
   // ingest Randomly generate multiple non overlapping range for multiple
   // levels Range scan same range between the 2 CF and validate the result is
@@ -9387,6 +9378,22 @@ TEST_P(UserDefinedIndexStressTest, DISABLED_PartialDeleteRange) {
   SCOPED_TRACE("dbname: " + dbname_);
   ASSERT_NO_FATAL_FAILURE(SetupDB(dbname_));
 
+  if (enable_udi_) {
+    // Skip UDI for now.
+    // The issue is that with UDI enabled, prepare might not prepare enough keys
+    // at lower level due to range delete from upper level.
+    // E.g. consider a LSM tree:
+    // L1: Data         [0-1]
+    // L2: Delete Range [0-6]
+    // L3: Data         [0-9]
+    // When multiscan queries range [0-9) with UDI count as 3, the L3 file
+    // will only prepare range [0-3). However, this range is masked out by upper
+    // layer delete range from [0-6] from L2. This causes query to only return
+    // [0,1], while [0,1,7] is the right result. Until prepare is able to
+    // preparing additional block supported, UDI is skipped.
+    return;
+  }
+
   for (int i = 0; i < 5; i++) {
     ranges_in_levels_.push_back(
         GenerateKeyRanges(rnd.Uniform(3) + 4, 2,
@@ -9408,9 +9415,9 @@ TEST_P(UserDefinedIndexStressTest, DeleteRangeMixedWithDataFile) {
   // Create 2 column families. One use normal put/del, the other uses sst
   // ingest.
   // Test the case where there are 3 levels, the middle level is a delete
-  // range file that span across the entire key space. The top level file have
-  // multiple files and each one has both data and delete range Scan same
-  // range between the 2 CF and validate the result is same
+  // range file that span across the entire key space. The top and bottom level
+  // file have multiple files and each one has both data and delete range. Scan
+  // same range between the 2 CF and validate the result is same
   SCOPED_TRACE("Start with random seed: " + std::to_string(rand_seed_));
   dbname_ = test::PerThreadDBPath(
       "UserDefinedIndexStressTest_DeleteRangeMixedWithDataFile");
@@ -9418,9 +9425,9 @@ TEST_P(UserDefinedIndexStressTest, DeleteRangeMixedWithDataFile) {
   ASSERT_NO_FATAL_FAILURE(SetupDB(dbname_));
 
   // Test 3 levels.
-  // bottom level is normal data files.
-  ranges_in_levels_.push_back(GenerateKeyRanges(rnd.Uniform(3) + 4, 2, "L6"));
-  // middle level delete range between each level
+  // Bottom level is mixed data with delete range.
+  ranges_in_levels_.push_back(GenerateKeyRanges(rnd.Uniform(3) + 6, 2, "L6"));
+  // Middle level delete range across entire key space.
   if (is_reverse_comparator_) {
     ranges_in_levels_.push_back({{.start = 100,
                                   .end = 0,
@@ -9437,8 +9444,8 @@ TEST_P(UserDefinedIndexStressTest, DeleteRangeMixedWithDataFile) {
                                   .end_key = "keyz"}});
   }
 
-  // Top level is normal data files
-  ranges_in_levels_.push_back(GenerateKeyRanges(rnd.Uniform(3) + 4, 2, "L4"));
+  // Top level is mixed data with delete range.
+  ranges_in_levels_.push_back(GenerateKeyRanges(rnd.Uniform(3) + 6, 2, "L4"));
 
   IngestExternalFileOptions ifo;
   ifo.snapshot_consistency = false;
@@ -9448,7 +9455,7 @@ TEST_P(UserDefinedIndexStressTest, DeleteRangeMixedWithDataFile) {
   for (auto const& ranges_in_level : ranges_in_levels_) {
     ASSERT_NO_FATAL_FAILURE(
         IngestFilesInOneLevel(ranges_in_level, ingest_file_name_prefix,
-                              ingest_file_count, ifo, true));
+                              ingest_file_count, ifo, /*combine_ranges=*/true));
     if (first_level) {
       first_level = false;
       if (enable_compaction_with_sst_partitioner_) {
@@ -9475,9 +9482,10 @@ TEST_P(UserDefinedIndexStressTest, DeleteRange) {
   ASSERT_NO_FATAL_FAILURE(SetupDB(dbname_));
 
   // Test 3 levels.
-  // bottom level is normal data files.
+  // bottom level constains multiple files, each could have data or delete
+  // ranges or both.
   ranges_in_levels_.push_back(GenerateKeyRanges(rnd.Uniform(3) + 4, 2, "L6"));
-  // middle level delete range between each level
+  // middle level delete range across entire key space
   if (is_reverse_comparator_) {
     ranges_in_levels_.push_back({{.start = 100,
                                   .end = 0,
@@ -9493,7 +9501,8 @@ TEST_P(UserDefinedIndexStressTest, DeleteRange) {
                                   .start_key = "key",
                                   .end_key = "keyz"}});
   }
-  // Top level is normal data files
+  // Top level constains multiple files, each could have data or delete
+  // ranges or both.
   ranges_in_levels_.push_back(GenerateKeyRanges(rnd.Uniform(3) + 4, 2, "L4"));
 
   IngestExternalFileOptions ifo;
@@ -9519,20 +9528,19 @@ TEST_P(UserDefinedIndexStressTest, DeleteRange) {
 }
 
 TEST_P(UserDefinedIndexStressTest, AtomicReplaceBulkLoad) {
-  // Create 2 column families. One use normal put/del, the other uses sst
-  // ingest.
-  // Test the case where there are 3 levels, the middle level is a delete
-  // range file that span across the entire key space. Range scan same range
-  // between the 2 CF and validate the result is same
+  // Create 2 column families. One use normal put/del, the other uses SST
+  // ingest. The SST ingest uses atomic range replace.
   SCOPED_TRACE("Start with random seed: " + std::to_string(rand_seed_));
-  dbname_ = test::PerThreadDBPath("UserDefinedIndexStressTest_DeleteRange");
+  dbname_ =
+      test::PerThreadDBPath("UserDefinedIndexStressTest_AtomicReplaceBulkLoad");
   SCOPED_TRACE("dbname: " + dbname_);
   ASSERT_NO_FATAL_FAILURE(SetupDB(dbname_));
 
   // Test 3 levels.
-  // bottom level is normal data files.
+  // bottom level constains multiple files, each could have data or delete
+  // ranges or both.
   ranges_in_levels_.push_back(GenerateKeyRanges(rnd.Uniform(3) + 4, 2, "L6"));
-  // middle level delete range between each level
+  // middle level delete range across entire key space
   if (is_reverse_comparator_) {
     ranges_in_levels_.push_back({{.start = 100,
                                   .end = 0,
@@ -9548,7 +9556,8 @@ TEST_P(UserDefinedIndexStressTest, AtomicReplaceBulkLoad) {
                                   .start_key = "key",
                                   .end_key = "keyz"}});
   }
-  // Top level is normal data files
+  // Top level constains multiple files, each could have data or delete
+  // ranges or both.
   ranges_in_levels_.push_back(GenerateKeyRanges(rnd.Uniform(3) + 4, 2, "L4"));
 
   IngestExternalFileOptions ifo;
@@ -9569,7 +9578,7 @@ TEST_P(UserDefinedIndexStressTest, AtomicReplaceBulkLoad) {
   }
 
   // Ingest the a new file with atomic replace with full key space, this layer
-  // is exactly same as the one at Level 4
+  // is exactly same as the one at the top level
   bool data_added;
   ASSERT_NO_FATAL_FAILURE(CreateSstFileWithRanges(
       ingest_file_name_prefix + std::to_string(++ingest_file_count),

From 2edc660e28b62951d7ea134d27d456e040aa638a Mon Sep 17 00:00:00 2001
From: Changyu Bi <changyubi@meta.com>
Date: Fri, 24 Oct 2025 12:51:43 -0700
Subject: [PATCH 352/500] Fix multiscan assert failure in stress test (#14077)

Summary:
should not use async_io when not supported to avoid the assert failure here: https://github.com/facebook/rocksdb/blob/dce33f9443815dcbe1d9a98d4d34776dfdf1112e/table/block_based/block_based_table_iterator.cc#L1710.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14077

Test Plan: monitor future CI failure.

Reviewed By: anand1976

Differential Revision: D85456447

Pulled By: cbi42

fbshipit-source-id: dccc865a5aedf194029a53616f4bbc99d0162691
---
 db_stress_tool/db_stress_test_base.cc | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc
index b37a0307ff45..9b5203dc19fb 100644
--- a/db_stress_tool/db_stress_test_base.cc
+++ b/db_stress_tool/db_stress_test_base.cc
@@ -25,6 +25,7 @@
 #include "db_stress_tool/db_stress_filters.h"
 #include "db_stress_tool/db_stress_table_properties_collector.h"
 #include "db_stress_tool/db_stress_wide_merge_operator.h"
+#include "file/file_util.h"
 #include "options/options_parser.h"
 #include "rocksdb/convenience.h"
 #include "rocksdb/filter_policy.h"
@@ -1695,7 +1696,10 @@ Status StressTest::TestMultiScan(ThreadState* thread,
   std::vector<std::string> end_key_strs;
   // TODO support reverse BytewiseComparator in the stress test
   MultiScanArgs scan_opts(options_.comparator);
-  scan_opts.use_async_io = FLAGS_multiscan_use_async_io;
+  scan_opts.use_async_io =
+      FLAGS_multiscan_use_async_io &&
+      CheckFSFeatureSupport(options_.env->GetFileSystem().get(),
+                            FSSupportedOps::kAsyncIO);
   start_key_strs.reserve(num_scans);
   end_key_strs.reserve(num_scans);
 

From e687ca79b42ca8673de8ad50c97f3e8b9eefe414 Mon Sep 17 00:00:00 2001
From: Andrew Kryczka <ajkryczka+github@gmail.com>
Date: Fri, 24 Oct 2025 13:11:26 -0700
Subject: [PATCH 353/500] Fix a missing CV signal in `FindObsoleteFiles()`
 (#14069)

Summary:
Fixed a missing CV signal when `FindObsoleteFiles()` decides there is nothing to purge and then decrements `pending_purge_obsolete_files_` to zero.  This bug could cause `DB::GetSortedWalFiles()` to hang, at least.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14069

Test Plan: unit test repro

Reviewed By: hx235

Differential Revision: D85453534

Pulled By: cbi42

fbshipit-source-id: cf5cfe7f5087459ca1f1f28ce81ea6afc84178f0
---
 db/db_filesnapshot.cc                         |  1 +
 db/db_impl/db_impl_files.cc                   |  3 ++
 db/obsolete_files_test.cc                     | 41 +++++++++++++++++++
 .../get_sorted_wal_files_noop_purge_hang.md   |  1 +
 4 files changed, 46 insertions(+)
 create mode 100644 unreleased_history/bug_fixes/get_sorted_wal_files_noop_purge_hang.md

diff --git a/db/db_filesnapshot.cc b/db/db_filesnapshot.cc
index d5244877503e..7bf821170031 100644
--- a/db/db_filesnapshot.cc
+++ b/db/db_filesnapshot.cc
@@ -109,6 +109,7 @@ Status DBImpl::GetSortedWalFilesImpl(VectorWalPtr& files, bool need_seqnos) {
   {
     InstrumentedMutexLock l(&mutex_);
     while (pending_purge_obsolete_files_ > 0 || bg_purge_scheduled_ > 0) {
+      TEST_SYNC_POINT("DBImpl::GetSortedWalFilesImpl:WaitPurge");
       bg_cv_.Wait();
     }
 
diff --git a/db/db_impl/db_impl_files.cc b/db/db_impl/db_impl_files.cc
index 445f7338d1f7..d9d56a1f447b 100644
--- a/db/db_impl/db_impl_files.cc
+++ b/db/db_impl/db_impl_files.cc
@@ -267,6 +267,9 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force,
     if (!job_context->HaveSomethingToDelete()) {
       mutex_.AssertHeld();
       --pending_purge_obsolete_files_;
+      if (pending_purge_obsolete_files_ == 0) {
+        bg_cv_.SignalAll();
+      }
     }
   });
 
diff --git a/db/obsolete_files_test.cc b/db/obsolete_files_test.cc
index eb3ed078c79e..818bcc4b5901 100644
--- a/db/obsolete_files_test.cc
+++ b/db/obsolete_files_test.cc
@@ -303,6 +303,47 @@ TEST_F(ObsoleteFilesTest, BlobFiles) {
   ASSERT_EQ(deleted_files, expected_deleted_files);
 }
 
+TEST_F(ObsoleteFilesTest, GetSortedWalFilesHangsAfterNoopPurge) {
+  // This test used to trigger a hang in `DB::GetSortedWalFiles()`, where it
+  // would wait for a no-op purge that did not signal the CV upon completion.
+
+  // Grab an iterator and flush to switch the super version. That way, when the
+  // iterator is destroyed, it will go through the purge path.
+  DB* db = db_;  // Only using `db` makes it clear we only use DB-level APIs.
+  ASSERT_OK(db->Put(WriteOptions(), "key", "value"));
+  std::unique_ptr<Iterator> iter(db->NewIterator(ReadOptions()));
+  ASSERT_OK(db->Flush(FlushOptions()));
+
+  // Sync points ensure `GetSortedWalFiles()` waits for a purge after
+  // `FindObsoleteFiles()` releases the mutex but before its corresponding purge
+  // completes.
+  SyncPoint::GetInstance()->SetCallBack(
+      "FindObsoleteFiles::PostMutexUnlock", [&](void* /* arg */) {
+        TEST_SYNC_POINT(
+            "ObsoleteFilesTest::GetSortedWalFilesHangsAfterNoopPurge:"
+            "InCallback:1");
+        TEST_SYNC_POINT(
+            "ObsoleteFilesTest::GetSortedWalFilesHangsAfterNoopPurge:"
+            "InCallback:2");
+      });
+  SyncPoint::GetInstance()->LoadDependency({
+      {"ObsoleteFilesTest::GetSortedWalFilesHangsAfterNoopPurge:InCallback:1",
+       "ObsoleteFilesTest::GetSortedWalFilesHangsAfterNoopPurge:Thread:Begin"},
+      {"DBImpl::GetSortedWalFilesImpl:WaitPurge",
+       "ObsoleteFilesTest::GetSortedWalFilesHangsAfterNoopPurge:InCallback:2"},
+  });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  port::Thread get_sorted_wal_files_thread([db]() {
+    TEST_SYNC_POINT(
+        "ObsoleteFilesTest::GetSortedWalFilesHangsAfterNoopPurge:Thread:Begin");
+    VectorWalPtr files;
+    ASSERT_OK(db->GetSortedWalFiles(files));
+  });
+  iter.reset();
+  get_sorted_wal_files_thread.join();
+}
+
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/unreleased_history/bug_fixes/get_sorted_wal_files_noop_purge_hang.md b/unreleased_history/bug_fixes/get_sorted_wal_files_noop_purge_hang.md
new file mode 100644
index 000000000000..48f6efea4cbe
--- /dev/null
+++ b/unreleased_history/bug_fixes/get_sorted_wal_files_noop_purge_hang.md
@@ -0,0 +1 @@
+Fixed a bug where `DB::GetSortedWalFiles()` could hang when waiting for a purge operation that found nothing to do (potentially triggered by iterator release, flush, compaction, etc.).
\ No newline at end of file

From 10478b98a595cdcb26e649a91df7249914b0b383 Mon Sep 17 00:00:00 2001
From: Andrew Kryczka <ajkryczka+github@gmail.com>
Date: Fri, 24 Oct 2025 17:10:48 -0700
Subject: [PATCH 354/500] Fix unsigned underflow in WAL TTL logic when system
 clock goes backwards (#14016)

Summary:
The TTL-based WAL archive cleanup logic could incorrectly delete an archived WAL if the system clock moved backwards between the last write to that WAL and `WALManager::PurgeObsoleteWALFiles()`. This happened due to unsigned underflow in subtraction of two wall clock based timestamps: `now_seconds - file_m_time`.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14016

Test Plan: unit test repro

Reviewed By: pdillinger

Differential Revision: D83879806

Pulled By: hx235

fbshipit-source-id: 643e7f623c6b5c31711565854314cfd6cbbcf3a7
---
 db/wal_manager.cc                             |  8 ++-
 db/wal_manager_test.cc                        | 54 ++++++++++++++++---
 .../wal_ttl_clock_regression_underflow.md     |  1 +
 3 files changed, 54 insertions(+), 9 deletions(-)
 create mode 100644 unreleased_history/bug_fixes/wal_ttl_clock_regression_underflow.md

diff --git a/db/wal_manager.cc b/db/wal_manager.cc
index a0a7a8d9027d..67582c80552f 100644
--- a/db/wal_manager.cc
+++ b/db/wal_manager.cc
@@ -192,7 +192,13 @@ void WalManager::PurgeObsoleteWALFiles() {
                          s.ToString().c_str());
           continue;
         }
-        if (now_seconds - file_m_time > db_options_.WAL_ttl_seconds) {
+
+        // Avoid expression `now_seconds - file_m_time` when
+        // `file_m_time > now_seconds` to prevent unsigned underflow in case
+        // system clock goes backwards. Both timestamps are based on wall clock
+        // time, which is not guaranteed to be monotonic.
+        if (file_m_time <= now_seconds &&
+            now_seconds - file_m_time > db_options_.WAL_ttl_seconds) {
           s = DeleteDBFile(&db_options_, file_path, archival_dir, false,
                            /*force_fg=*/!wal_in_db_path_);
           if (!s.ok()) {
diff --git a/db/wal_manager_test.cc b/db/wal_manager_test.cc
index 5b5ba7c0a872..55736f2fdb5c 100644
--- a/db/wal_manager_test.cc
+++ b/db/wal_manager_test.cc
@@ -19,6 +19,7 @@
 #include "rocksdb/write_batch.h"
 #include "rocksdb/write_buffer_manager.h"
 #include "table/mock_table.h"
+#include "test_util/mock_time_env.h"
 #include "test_util/testharness.h"
 #include "test_util/testutil.h"
 #include "util/string_util.h"
@@ -39,7 +40,7 @@ class WalManagerTest : public testing::Test {
     EXPECT_OK(DestroyDB(dbname_, Options()));
   }
 
-  void Init() {
+  void Init(SystemClock* clock_override) {
     ASSERT_OK(env_->CreateDirIfMissing(dbname_));
     ASSERT_OK(env_->CreateDirIfMissing(ArchivalDirectory(dbname_)));
     db_options_.db_paths.emplace_back(dbname_,
@@ -47,7 +48,11 @@ class WalManagerTest : public testing::Test {
     db_options_.wal_dir = dbname_;
     db_options_.env = env_.get();
     db_options_.fs = env_->GetFileSystem();
-    db_options_.clock = env_->GetSystemClock().get();
+    if (clock_override == nullptr) {
+      db_options_.clock = env_->GetSystemClock().get();
+    } else {
+      db_options_.clock = clock_override;
+    }
 
     versions_.reset(new VersionSet(
         dbname_, &db_options_, env_options_, table_cache_.get(),
@@ -124,7 +129,7 @@ class WalManagerTest : public testing::Test {
 };
 
 TEST_F(WalManagerTest, ReadFirstRecordCache) {
-  Init();
+  Init(nullptr /* clock_override */);
   std::string path = dbname_ + "/000001.log";
   std::unique_ptr<FSWritableFile> file;
   ASSERT_OK(env_->GetFileSystem()->NewWritableFile(path, FileOptions(), &file,
@@ -221,7 +226,7 @@ int CountRecords(TransactionLogIterator* iter) {
 TEST_F(WalManagerTest, WALArchivalSizeLimit) {
   db_options_.WAL_ttl_seconds = 0;
   db_options_.WAL_size_limit_MB = 1000;
-  Init();
+  Init(nullptr /* clock_override */);
 
   // TEST : Create WalManager with huge size limit and no ttl.
   // Create some archived files and call PurgeObsoleteWALFiles().
@@ -258,7 +263,7 @@ TEST_F(WalManagerTest, WALArchivalSizeLimit) {
 
 TEST_F(WalManagerTest, WALArchivalTtl) {
   db_options_.WAL_ttl_seconds = 1000;
-  Init();
+  Init(nullptr /* clock_override */);
 
   // TEST : Create WalManager with a ttl and no size limit.
   // Create some archived log files and call PurgeObsoleteWALFiles().
@@ -282,8 +287,41 @@ TEST_F(WalManagerTest, WALArchivalTtl) {
   ASSERT_TRUE(log_files.empty());
 }
 
+TEST_F(WalManagerTest, WALArchivalTtlClockGoesBackwards) {
+  // This test used to trigger an unsigned underflow bug, where WAL files were
+  // incorrectly deleted when the system time moved backwards between writing
+  // to a WAL and running `WalManager::PurgeObsoleteWALFiles()`.
+  constexpr int kNumLogs = 5;
+  constexpr int kEntriesPerLog = 100;
+
+  db_options_.WAL_ttl_seconds = 86400;  // One day
+
+  // Configure mock clock to lag one second behind system time. That way, the
+  // WAL file's mtime will appear to be in the future when
+  // `WalManager::PurgeObsoleteWALFiles()` runs.
+  int64_t now_seconds;
+  ASSERT_OK(env_->GetSystemClock()->GetCurrentTime(&now_seconds));
+  auto mock_clock = std::make_shared<MockSystemClock>(env_->GetSystemClock());
+  mock_clock->SetCurrentTime(static_cast<uint64_t>(now_seconds - 1));
+  db_options_.clock = mock_clock.get();
+
+  Init(mock_clock.get() /* clock */);
+
+  CreateArchiveLogs(kNumLogs, kEntriesPerLog);
+
+  const std::string archive_dir = ArchivalDirectory(dbname_);
+  ASSERT_EQ(kNumLogs,
+            ListSpecificFiles(env_.get(), archive_dir, kWalFile).size());
+
+  wal_manager_->PurgeObsoleteWALFiles();
+
+  // All files must still be present because TTL has not elapsed.
+  ASSERT_EQ(kNumLogs,
+            ListSpecificFiles(env_.get(), archive_dir, kWalFile).size());
+}
+
 TEST_F(WalManagerTest, TransactionLogIteratorMoveOverZeroFiles) {
-  Init();
+  Init(nullptr /* clock_override */);
   RollTheLog(false);
   Put("key1", std::string(1024, 'a'));
   // Create a zero record WAL file.
@@ -297,7 +335,7 @@ TEST_F(WalManagerTest, TransactionLogIteratorMoveOverZeroFiles) {
 }
 
 TEST_F(WalManagerTest, TransactionLogIteratorJustEmptyFile) {
-  Init();
+  Init(nullptr /* clock_override */);
   RollTheLog(false);
   auto iter = OpenTransactionLogIter(0);
   // Check that an empty iterator is returned
@@ -305,7 +343,7 @@ TEST_F(WalManagerTest, TransactionLogIteratorJustEmptyFile) {
 }
 
 TEST_F(WalManagerTest, TransactionLogIteratorNewFileWhileScanning) {
-  Init();
+  Init(nullptr /* clock_override */);
   CreateArchiveLogs(2, 100);
   auto iter = OpenTransactionLogIter(0);
   CreateArchiveLogs(1, 100);
diff --git a/unreleased_history/bug_fixes/wal_ttl_clock_regression_underflow.md b/unreleased_history/bug_fixes/wal_ttl_clock_regression_underflow.md
new file mode 100644
index 000000000000..4cdf0a07bada
--- /dev/null
+++ b/unreleased_history/bug_fixes/wal_ttl_clock_regression_underflow.md
@@ -0,0 +1 @@
+Fixed a bug for `WAL_ttl_seconds > 0` use cases where the newest archived WAL files could be incorrectly deleted when the system clock moved backwards.
\ No newline at end of file

From 32f66712c85f602b9b49b92c4dc13b6d60fd2608 Mon Sep 17 00:00:00 2001
From: zaidoon <zaidoon@cloudflare.com>
Date: Mon, 27 Oct 2025 13:16:33 -0700
Subject: [PATCH 355/500] optimize C API to reduce memory allocations and using
 PinnableSlice for zero-copy reads (#13911)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Summary:
### Problem
The current C API implementation has inefficiencies that impact performance in production environments:

1. **Double allocations in Get operations**: Values are first copied into a `std::string`, then copied again into a malloc'd buffer
2. **Unnecessary string temporaries**: Using `std::string` as intermediate storage adds allocation/deallocation overhead
3. **No zero-copy read path**: All read operations require at least one allocation and copy
4. **Redundant operations**: CopyString performed unnecessary `sizeof(char)` multiplication

### Solution

#### 1. Use PinnableSlice for Get Operations
- **Before**: `DB::Get() → std::string → malloc'd buffer` (2 allocations, 2 copies)
- **After**: `DB::Get() → PinnableSlice → malloc'd buffer` (1 allocation, 1 copy)
- **Impact**: 50% reduction in allocations and copies

#### 2. Optimize CopyString Helper
- Removed redundant `sizeof(char)` multiplication
- Single implementation using `Slice` parameter (works with all types via implicit conversion)
- Added `inline` for better optimization

#### 3. New Zero-Copy API Functions
Added high-performance alternatives for allocation-sensitive workloads:
- rocksdb_get_pinned_v2/ rocksdb_get_pinned_cf_v2 - Zero-copy read access
- rocksdb_get_into_buffer/ rocksdb_get_into_buffer_cf - Copy into user-provided buffer
- `rocksdb_pinnable_handle_*` - Handle management functions

### Performance Improvements

| Operation | Allocations | Improvement |
|-----------|------------|-------------|
| [rocksdb_get](cci:1://file:///Users/zaidoon/public%20repos/rocksdb/db/c.cc:1391:0-1411:1) | 2 → 1 | **50% reduction** |
| [rocksdb_get_cf](cci:1://file:///Users/zaidoon/public%20repos/rocksdb/db/c.cc:1411:0-1431:1) | 2 → 1 | **50% reduction** |
| [rocksdb_multi_get](cci:1://file:///Users/zaidoon/public%20repos/rocksdb/db/c.cc:1495:0-1520:1) (per key) | 2 → 1 | **50% reduction** |
| [rocksdb_transaction_get](cci:1://file:///Users/zaidoon/public%20repos/rocksdb/db/c.cc:6730:0-6748:1) | 2 → 1 | **50% reduction** |
| [rocksdb_writebatch_wi_get_from_batch](cci:1://file:///Users/zaidoon/public%20repos/rocksdb/db/c.cc:2714:0-2732:1) | 2 → 1 | **50% reduction** |
| [rocksdb_get_pinned_v2](cci:1://file:///Users/zaidoon/public%20repos/rocksdb/db/c.cc:7761:0-7775:1) (new) | 0 | **100% reduction** |

### Functions Optimized (30+)
- All Get variants (regular, CF, with timestamps)
- All MultiGet variants
- All Transaction Get/MultiGet operations
- All WriteBatch Get operations
- KeyMayExist operations
- Metadata getters (column family names, SST file keys, transaction names, DB identity)

### Testing
- Added tests for new zero-copy functions
- Added tests for previously untested functions rocksdb_column_family_handle_get_name, rocksdb_transaction_get_name

### Migration Path
Applications can adopt improvements in three ways:
1. **No changes needed** - Existing code automatically benefits from 50% allocation reduction
2. **Incremental adoption** - Replace hot-path calls with zero-copy variants
3. **Full optimization** - Use rocksdb_get_into_buffer

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13911

Reviewed By: cbi42

Differential Revision: D83508431

Pulled By: jaykorean

fbshipit-source-id: 96146a59b0f9e839f6603b376d4e51f0e97c3a8c
---
 db/c.cc             | 138 +++++++++++++++++++++++++++++++++++++++-----
 db/c_test.c         | 107 ++++++++++++++++++++++++++++++++++
 include/rocksdb/c.h |  42 ++++++++++++++
 3 files changed, 274 insertions(+), 13 deletions(-)

diff --git a/db/c.cc b/db/c.cc
index 177343b889ba..d6d476bd311f 100644
--- a/db/c.cc
+++ b/db/c.cc
@@ -635,10 +635,11 @@ static bool SaveError(char** errptr, const Status& s) {
   return true;
 }
 
-// Copies str to a new malloc()-ed buffer. The buffer is not NUL terminated.
-static char* CopyString(const std::string& str) {
-  char* result = reinterpret_cast<char*>(malloc(sizeof(char) * str.size()));
-  memcpy(result, str.data(), sizeof(char) * str.size());
+// Helper function to copy string data to a malloc'd buffer
+// Works with std::string, Slice, and PinnableSlice through implicit conversion
+static inline char* CopyString(const Slice& slice) {
+  char* result = reinterpret_cast<char*>(malloc(slice.size()));
+  memcpy(result, slice.data(), slice.size());
   return result;
 }
 
@@ -1440,11 +1441,14 @@ char* rocksdb_get(rocksdb_t* db, const rocksdb_readoptions_t* options,
                   const char* key, size_t keylen, size_t* vallen,
                   char** errptr) {
   char* result = nullptr;
-  std::string tmp;
-  Status s = db->rep->Get(options->rep, Slice(key, keylen), &tmp);
+  // Use PinnableSlice to avoid unnecessary copy
+  PinnableSlice pinnable_val;
+  Status s = db->rep->Get(options->rep, db->rep->DefaultColumnFamily(),
+                          Slice(key, keylen), &pinnable_val);
   if (s.ok()) {
-    *vallen = tmp.size();
-    result = CopyString(tmp);
+    *vallen = pinnable_val.size();
+    // Only one copy: from PinnableSlice to malloc'd buffer
+    result = CopyString(pinnable_val);
   } else {
     *vallen = 0;
     if (!s.IsNotFound()) {
@@ -1459,12 +1463,14 @@ char* rocksdb_get_cf(rocksdb_t* db, const rocksdb_readoptions_t* options,
                      const char* key, size_t keylen, size_t* vallen,
                      char** errptr) {
   char* result = nullptr;
-  std::string tmp;
-  Status s =
-      db->rep->Get(options->rep, column_family->rep, Slice(key, keylen), &tmp);
+  // Use PinnableSlice to avoid unnecessary copy
+  PinnableSlice pinnable_val;
+  Status s = db->rep->Get(options->rep, column_family->rep, Slice(key, keylen),
+                          &pinnable_val);
   if (s.ok()) {
-    *vallen = tmp.size();
-    result = CopyString(tmp);
+    *vallen = pinnable_val.size();
+    // Only one copy: from PinnableSlice to malloc'd buffer
+    result = CopyString(pinnable_val);
   } else {
     *vallen = 0;
     if (!s.IsNotFound()) {
@@ -7900,4 +7906,110 @@ uint64_t rocksdb_wait_for_compact_options_get_timeout(
   return opt->rep.timeout.count();
 }
 
+/* High-performance zero-copy Get implementations */
+
+struct rocksdb_pinnable_handle_t {
+  PinnableSlice rep;
+};
+
+rocksdb_pinnable_handle_t* rocksdb_get_pinned_v2(
+    rocksdb_t* db, const rocksdb_readoptions_t* options, const char* key,
+    size_t keylen, char** errptr) {
+  rocksdb_pinnable_handle_t* handle = new rocksdb_pinnable_handle_t;
+  Status s = db->rep->Get(options->rep, db->rep->DefaultColumnFamily(),
+                          Slice(key, keylen), &handle->rep);
+  if (!s.ok()) {
+    delete handle;
+    if (!s.IsNotFound()) {
+      SaveError(errptr, s);
+    }
+    return nullptr;
+  }
+  return handle;
+}
+
+rocksdb_pinnable_handle_t* rocksdb_get_pinned_cf_v2(
+    rocksdb_t* db, const rocksdb_readoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* key,
+    size_t keylen, char** errptr) {
+  rocksdb_pinnable_handle_t* handle = new rocksdb_pinnable_handle_t;
+  Status s = db->rep->Get(options->rep, column_family->rep, Slice(key, keylen),
+                          &handle->rep);
+  if (!s.ok()) {
+    delete handle;
+    if (!s.IsNotFound()) {
+      SaveError(errptr, s);
+    }
+    return nullptr;
+  }
+  return handle;
+}
+
+const char* rocksdb_pinnable_handle_get_value(
+    const rocksdb_pinnable_handle_t* handle, size_t* vallen) {
+  if (!handle) {
+    *vallen = 0;
+    return nullptr;
+  }
+  *vallen = handle->rep.size();
+  return handle->rep.data();
+}
+
+void rocksdb_pinnable_handle_destroy(rocksdb_pinnable_handle_t* handle) {
+  delete handle;
+}
+
+unsigned char rocksdb_get_into_buffer(rocksdb_t* db,
+                                      const rocksdb_readoptions_t* options,
+                                      const char* key, size_t keylen,
+                                      char* buffer, size_t buffer_size,
+                                      size_t* vallen, unsigned char* found,
+                                      char** errptr) {
+  PinnableSlice pinnable_val;
+  Status s = db->rep->Get(options->rep, db->rep->DefaultColumnFamily(),
+                          Slice(key, keylen), &pinnable_val);
+  if (s.ok()) {
+    *found = 1;
+    *vallen = pinnable_val.size();
+    if (buffer_size >= pinnable_val.size()) {
+      memcpy(buffer, pinnable_val.data(), pinnable_val.size());
+      return 1;  // Success - data copied
+    }
+    return 0;  // Buffer too small
+  } else {
+    *found = 0;
+    *vallen = 0;
+    if (!s.IsNotFound()) {
+      SaveError(errptr, s);
+    }
+    return 0;
+  }
+}
+
+unsigned char rocksdb_get_into_buffer_cf(
+    rocksdb_t* db, const rocksdb_readoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* key,
+    size_t keylen, char* buffer, size_t buffer_size, size_t* vallen,
+    unsigned char* found, char** errptr) {
+  PinnableSlice pinnable_val;
+  Status s = db->rep->Get(options->rep, column_family->rep, Slice(key, keylen),
+                          &pinnable_val);
+  if (s.ok()) {
+    *found = 1;
+    *vallen = pinnable_val.size();
+    if (buffer_size >= pinnable_val.size()) {
+      memcpy(buffer, pinnable_val.data(), pinnable_val.size());
+      return 1;  // Success - data copied
+    }
+    return 0;  // Buffer too small
+  } else {
+    *found = 0;
+    *vallen = 0;
+    if (!s.IsNotFound()) {
+      SaveError(errptr, s);
+    }
+    return 0;
+  }
+}
+
 }  // end extern "C"
diff --git a/db/c_test.c b/db/c_test.c
index a06c8a74d2e7..2ac1c77617d4 100644
--- a/db/c_test.c
+++ b/db/c_test.c
@@ -1505,6 +1505,53 @@ int main(int argc, char** argv) {
     CheckMultiGetValues(3, vals, vals_sizes, errs, expected);
   }
 
+  StartPhase("zero_copy_get_pinned_v2");
+  {
+    // Test new zero-copy get functions
+
+    // Test rocksdb_get_pinned_v2
+    rocksdb_pinnable_handle_t* handle =
+        rocksdb_get_pinned_v2(db, roptions, "foo", 3, &err);
+    CheckNoError(err);
+    CheckCondition(handle != NULL);
+    size_t val_len;
+    const char* val = rocksdb_pinnable_handle_get_value(handle, &val_len);
+    CheckEqual("hello", val, val_len);
+    rocksdb_pinnable_handle_destroy(handle);
+
+    // Test with non-existent key
+    handle = rocksdb_get_pinned_v2(db, roptions, "notfound", 8, &err);
+    CheckNoError(err);
+    CheckCondition(handle == NULL);
+
+    // Test rocksdb_get_into_buffer
+    char buffer[100];
+    unsigned char found;
+    unsigned char success = rocksdb_get_into_buffer(
+        db, roptions, "foo", 3, buffer, sizeof(buffer), &val_len, &found, &err);
+    CheckNoError(err);
+    CheckCondition(success == 1);
+    CheckCondition(found == 1);
+    CheckCondition(val_len == 5);
+    CheckCondition(memcmp(buffer, "hello", 5) == 0);
+
+    // Test with buffer too small
+    success = rocksdb_get_into_buffer(db, roptions, "foo", 3, buffer,
+                                      2,  // Buffer too small
+                                      &val_len, &found, &err);
+    CheckNoError(err);
+    CheckCondition(success == 0);  // Should fail due to small buffer
+    CheckCondition(found == 1);
+    CheckCondition(val_len == 5);  // Should still report actual size
+
+    // Test with non-existent key
+    success = rocksdb_get_into_buffer(db, roptions, "notfound", 8, buffer,
+                                      sizeof(buffer), &val_len, &found, &err);
+    CheckNoError(err);
+    CheckCondition(success == 0);
+    CheckCondition(found == 0);
+  }
+
   StartPhase("pin_get");
   {
     CheckPinGet(db, roptions, "box", "c");
@@ -1922,6 +1969,55 @@ int main(int argc, char** argv) {
     rocksdb_flush_wal(db, 1, &err);
     CheckNoError(err);
 
+    // Test column family handle get name
+    {
+      size_t name_len;
+      char* cf_name =
+          rocksdb_column_family_handle_get_name(handles[1], &name_len);
+      CheckCondition(name_len == 3);
+      CheckCondition(memcmp(cf_name, "cf1", 3) == 0);
+      rocksdb_free(cf_name);
+    }
+
+    // Test zero-copy get with column families
+    {
+      rocksdb_pinnable_handle_t* handle =
+          rocksdb_get_pinned_cf_v2(db, roptions, handles[1], "box", 3, &err);
+      CheckNoError(err);
+      CheckCondition(handle != NULL);
+      size_t val_len;
+      const char* val = rocksdb_pinnable_handle_get_value(handle, &val_len);
+      CheckEqual("c", val, val_len);
+      rocksdb_pinnable_handle_destroy(handle);
+
+      // Test with non-existent key
+      handle = rocksdb_get_pinned_cf_v2(db, roptions, handles[1], "notfound", 8,
+                                        &err);
+      CheckNoError(err);
+      CheckCondition(handle == NULL);
+
+      // Test rocksdb_get_into_buffer_cf
+      char buffer[100];
+      unsigned char found;
+      unsigned char success = rocksdb_get_into_buffer_cf(
+          db, roptions, handles[1], "buff", 4, buffer, sizeof(buffer), &val_len,
+          &found, &err);
+      CheckNoError(err);
+      CheckCondition(success == 1);
+      CheckCondition(found == 1);
+      CheckCondition(val_len == 7);
+      CheckCondition(memcmp(buffer, "rocksdb", 7) == 0);
+
+      // Test with buffer too small
+      success = rocksdb_get_into_buffer_cf(db, roptions, handles[1], "buff", 4,
+                                           buffer, 3,  // Buffer too small
+                                           &val_len, &found, &err);
+      CheckNoError(err);
+      CheckCondition(success == 0);  // Should fail due to small buffer
+      CheckCondition(found == 1);
+      CheckCondition(val_len == 7);  // Should still report actual size
+    }
+
     // Test WriteBatchWithIndex iteration with Column Family
     rocksdb_writebatch_wi_t* wbwi = rocksdb_writebatch_wi_create(0, true);
     rocksdb_writebatch_wi_put_cf(wbwi, handles[1], "boat", 4, "row",
@@ -3469,6 +3565,17 @@ int main(int argc, char** argv) {
     rocksdb_transaction_put(txn, "foo", 3, "hello", 5, &err);
     CheckNoError(err);
 
+    // test transaction get/set name (before commit)
+    {
+      rocksdb_transaction_set_name(txn, "test_txn", 8, &err);
+      CheckNoError(err);
+      size_t name_len;
+      char* txn_name = rocksdb_transaction_get_name(txn, &name_len);
+      CheckCondition(name_len == 8);
+      CheckCondition(memcmp(txn_name, "test_txn", 8) == 0);
+      rocksdb_free(txn_name);
+    }
+
     // read from outside transaction, before commit
     CheckTxnDBGet(txn_db, roptions, "foo", NULL);
     CheckTxnDBPinGet(txn_db, roptions, "foo", NULL);
diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h
index e615c8da521c..a5334baf6680 100644
--- a/include/rocksdb/c.h
+++ b/include/rocksdb/c.h
@@ -3440,6 +3440,48 @@ extern ROCKSDB_LIBRARY_API uint64_t
 rocksdb_wait_for_compact_options_get_timeout(
     rocksdb_wait_for_compact_options_t* opt);
 
+/* High-performance zero-copy Get variants
+   These functions avoid unnecessary memory allocations and copies.
+   The returned buffer is valid until the handle is destroyed.
+   Bindings should migrate to these for better performance. */
+
+/* Zero-copy get that returns a handle to pinned data.
+   The data remains valid until rocksdb_pinnable_handle_destroy is called.
+   Returns NULL on error or not found. Check errptr to distinguish. */
+typedef struct rocksdb_pinnable_handle_t rocksdb_pinnable_handle_t;
+
+extern ROCKSDB_LIBRARY_API rocksdb_pinnable_handle_t* rocksdb_get_pinned_v2(
+    rocksdb_t* db, const rocksdb_readoptions_t* options, const char* key,
+    size_t keylen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API rocksdb_pinnable_handle_t* rocksdb_get_pinned_cf_v2(
+    rocksdb_t* db, const rocksdb_readoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* key,
+    size_t keylen, char** errptr);
+
+/* Get the data pointer and size from a pinnable handle.
+   The data pointer is valid until the handle is destroyed. */
+extern ROCKSDB_LIBRARY_API const char* rocksdb_pinnable_handle_get_value(
+    const rocksdb_pinnable_handle_t* handle, size_t* vallen);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_pinnable_handle_destroy(
+    rocksdb_pinnable_handle_t* handle);
+
+/* Direct get into caller-provided buffer.
+   Returns 1 if value fits in buffer, 0 if buffer too small.
+   Sets *vallen to actual value size.
+   If buffer is too small, no data is copied but *vallen is set. */
+extern ROCKSDB_LIBRARY_API unsigned char rocksdb_get_into_buffer(
+    rocksdb_t* db, const rocksdb_readoptions_t* options, const char* key,
+    size_t keylen, char* buffer, size_t buffer_size, size_t* vallen,
+    unsigned char* found, char** errptr);
+
+extern ROCKSDB_LIBRARY_API unsigned char rocksdb_get_into_buffer_cf(
+    rocksdb_t* db, const rocksdb_readoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* key,
+    size_t keylen, char* buffer, size_t buffer_size, size_t* vallen,
+    unsigned char* found, char** errptr);
+
 #ifdef __cplusplus
 } /* end extern "C" */
 #endif

From a3aa44a7167b8336f9bc15c8aba063260268ff68 Mon Sep 17 00:00:00 2001
From: Jay Huh <jewoongh@meta.com>
Date: Mon, 27 Oct 2025 14:22:47 -0700
Subject: [PATCH 356/500] Fix regression test script for internal use (#14079)

Summary:
Due to some internal requirements, what's being used for`$SSH` and `$SCP` has changed and it broke the regression test. (e.g. tarball streaming to remote host no longer works)

Minor behavior changes to the script to make the internal workflow work.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14079

Test Plan:
```
./tools/regression_test.sh
```
Meta Internal automation

Reviewed By: pdillinger

Differential Revision: D85502798

Pulled By: jaykorean

fbshipit-source-id: d294c2ee47661fbe368ccc318062e891f3ac7c81
---
 tools/regression_test.sh | 38 ++++++++++++++++----------------------
 1 file changed, 16 insertions(+), 22 deletions(-)

diff --git a/tools/regression_test.sh b/tools/regression_test.sh
index 26380f61439c..a823f39b735a 100755
--- a/tools/regression_test.sh
+++ b/tools/regression_test.sh
@@ -127,17 +127,15 @@ function main {
 
   setup_test_directory
   if [ $TEST_MODE -le 1 ]; then
-      test_remote "test -d $ORIGIN_PATH"
-      if [[ $? -ne 0 ]]; then
-          echo "Building DB..."
-          # compactall alone will not print ops or threads, which will fail update_report
-          run_db_bench "fillseq,compactall" $NUM_KEYS 1 0 0
-          # only save for future use on success
-          test_remote "mv $DB_PATH $ORIGIN_PATH"
-      fi
+      echo "Building DB..."
+      # compactall alone will not print ops or threads, which will fail update_report
+     run_db_bench "fillseq,compactall" $NUM_KEYS 1 0 0
+     # only save for future use on success
+     test_remote "mv $DB_PATH $ORIGIN_PATH"
   fi
   if [ $TEST_MODE -ge 1 ]; then
       build_checkpoint
+
       # run_db_bench benchmark_name NUM_OPS NUM_THREADS USED_EXISTING_DB UPDATE_REPORT ASYNC_IO
       run_db_bench "seekrandom_asyncio" $NUM_OPS $NUM_THREADS  1 1 true
       run_db_bench "multireadrandom_asyncio" $NUM_OPS $NUM_THREADS  1 1 true
@@ -332,25 +330,22 @@ function set_async_io_parameters {
 }
 
 function build_checkpoint {
-    cmd_prefix=""
-    if ! [ -z "$REMOTE_USER_AT_HOST" ]; then
-        cmd_prefix="$SSH $REMOTE_USER_AT_HOST "
-    fi
+    echo "NUM_MULTI_DB=$NUM_MULTI_DB"
     if [ $NUM_MULTI_DB -gt 1 ]; then
-        dirs=$($cmd_prefix find $ORIGIN_PATH -type d -links 2)
+        run_remote "mkdir -p $DB_PATH"
+        run_remote "find $ORIGIN_PATH -type d -links 2"
+        dirs=$?
         for dir in $dirs; do
             db_index=$(basename $dir)
             echo "Building checkpoints: $ORIGIN_PATH/$db_index -> $DB_PATH/$db_index ..."
-            $cmd_prefix $DB_BENCH_DIR/ldb checkpoint --checkpoint_dir=$DB_PATH/$db_index \
-                        --db=$ORIGIN_PATH/$db_index --try_load_options 2>&1
+            run_remote "$DB_BENCH_DIR/ldb checkpoint --checkpoint_dir=$DB_PATH/$db_index --db=$ORIGIN_PATH/$db_index --try_load_options 2>&1"
             exit_on_error $?
         done
     else
         # checkpoint cannot build in directory already exists
-        $cmd_prefix rm -rf $DB_PATH
+        run_remote "rm -rf $DB_PATH"
         echo "Building checkpoint: $ORIGIN_PATH -> $DB_PATH ..."
-        $cmd_prefix $DB_BENCH_DIR/ldb checkpoint --checkpoint_dir=$DB_PATH \
-                    --db=$ORIGIN_PATH --try_load_options 2>&1
+        run_remote "$DB_BENCH_DIR/ldb checkpoint --checkpoint_dir=$DB_PATH --db=$ORIGIN_PATH --try_load_options 2>&1"
         exit_on_error $?
     fi
 }
@@ -453,7 +448,7 @@ function setup_options_file {
  if ! [ -z "$OPTIONS_FILE" ]; then
     if ! [ -z "$REMOTE_USER_AT_HOST" ]; then
       options_file="$DB_BENCH_DIR/OPTIONS_FILE"
-      run_local "$SCP $OPTIONS_FILE $REMOTE_USER_AT_HOST:$options_file"
+      $SCP $OPTIONS_FILE $REMOTE_USER_AT_HOST:$options_file
     else
       options_file="$OPTIONS_FILE"
     fi
@@ -486,9 +481,8 @@ function setup_test_directory {
   run_remote "ls -l $DB_BENCH_DIR"
 
   if ! [ -z "$REMOTE_USER_AT_HOST" ]; then
-      shopt -s nullglob # allow missing librocksdb*.so* for static lib build
-      run_local "tar cz db_bench ldb librocksdb*.so* | $SSH $REMOTE_USER_AT_HOST 'cd $DB_BENCH_DIR/ && tar xzv'"
-      shopt -u nullglob
+    run_local "$SCP db_bench $REMOTE_USER_AT_HOST:$DB_BENCH_DIR/."
+    run_local "$SCP ldb $REMOTE_USER_AT_HOST:$DB_BENCH_DIR/."
   fi
 
   run_local "mkdir -p $RESULT_PATH"

From 12b85c8ce99c1fc79fa23b8785f1e1b148985779 Mon Sep 17 00:00:00 2001
From: Changyu Bi <changyubi@meta.com>
Date: Tue, 28 Oct 2025 11:15:42 -0700
Subject: [PATCH 357/500] Fix timestamp handling in LevelIterator MultiScan
 seeks (#14085)

Summary:
As titled, this fixes some internal crash test failures when UDT is enabled.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14085

Test Plan: monitor crash tests.

Reviewed By: anand1976

Differential Revision: D85617949

Pulled By: cbi42

fbshipit-source-id: da6fb21c0ca5803ea24e8daf7de8558321babcf4
---
 db/version_set.cc | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/db/version_set.cc b/db/version_set.cc
index 8b835e75c704..3a90c7afece1 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -1613,8 +1613,19 @@ bool LevelIterator::SkipEmptyFileForward() {
         const ScanOptions& opts =
             GetMultiScanArgForFile(file_index_).GetScanRanges().front();
         if (opts.range.start.has_value()) {
-          InternalKey target(*opts.range.start.AsPtr(), kMaxSequenceNumber,
-                             kValueTypeForSeek);
+          InternalKey target;
+          const size_t ts_size =
+              user_comparator_.user_comparator()->timestamp_size();
+          if (ts_size == 0) {
+            target = InternalKey(opts.range.start.value(), kMaxSequenceNumber,
+                                 kValueTypeForSeek);
+          } else {
+            std::string seek_key;
+            AppendKeyWithMaxTimestamp(&seek_key, opts.range.start.value(),
+                                      ts_size);
+            target =
+                InternalKey(seek_key, kMaxSequenceNumber, kValueTypeForSeek);
+          }
           file_iter_.Seek(target.Encode());
         }
       } else {

From fd0b4e0cf08315f6a644d54d585fe70ca958d4ba Mon Sep 17 00:00:00 2001
From: Jay Huh <jewoongh@meta.com>
Date: Tue, 28 Oct 2025 12:59:00 -0700
Subject: [PATCH 358/500] Disable mmap_read in Stress Test (#14083)

Summary:
All remote compaction test failures had `mmap_read=1` in common. Unfortunately, the failure hasn't been very reproducible. Try disabling `mmap_read` to see if that shed some light.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14083

Test Plan: CI

Reviewed By: hx235

Differential Revision: D85622229

Pulled By: jaykorean

fbshipit-source-id: bbe9e08efc369813f0fec388c910446089e43650
---
 tools/db_crashtest.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index cab484f089f7..1b25f6a8ea43 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -846,6 +846,8 @@ def finalize_and_sanitize(src_params):
         dest_params["checkpoint_one_in"] = 0
         dest_params["use_timed_put_one_in"] = 0
         dest_params["test_secondary"] = 0
+        dest_params["mmap_read"] = 0
+
         # Disable database open fault injection to prevent test inefficiency described below.
         # When fault injection occurs during DB open, the db will wait for compaction
         # to finish to clean up the database before retrying without injected error.

From 64817ae6048575a1a7f6e1978ffd4a08b40867a1 Mon Sep 17 00:00:00 2001
From: Changyu Bi <changyubi@meta.com>
Date: Wed, 29 Oct 2025 12:42:34 -0700
Subject: [PATCH 359/500] Disable internal reseeking for multiscan stress test
 (#14087)

Summary:
Stress test can fail with assertion inside MultiScan in some reseek scenario. E.g., data block 1 ends with k@9, data block 2 starts with k@8, when a DB iter seeks to k@0 (see option `max_sequential_skip_in_iterations`), MultiScan will land in data block 1 due to https://github.com/facebook/rocksdb/blob/fd0b4e0cf08315f6a644d54d585fe70ca958d4ba/table/block_based/block_based_table_iterator.cc#L1258-L1263.

We can't just use internal key as separator since index block might not use it. I plan to follow up with a fix that never moves `cur_data_block_idx` backward within a MultiScan.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14087

Test Plan: CI and internal crash tests

Reviewed By: anand1976

Differential Revision: D85701668

Pulled By: cbi42

fbshipit-source-id: d3f1aaff40a12be4e3d1b4b7160bf2547f43b849
---
 db_stress_tool/db_stress_test_base.cc | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc
index 9b5203dc19fb..2d40136df4ab 100644
--- a/db_stress_tool/db_stress_test_base.cc
+++ b/db_stress_tool/db_stress_test_base.cc
@@ -350,7 +350,6 @@ bool StressTest::BuildOptionsTable() {
            "1",
            "2",
        }},
-      {"max_sequential_skip_in_iterations", {"4", "8", "12"}},
       {"block_based_table_factory",
        {
            keepRibbonFilterPolicyOnly ? "{filter_policy=ribbonfilter:2.35}"
@@ -363,6 +362,13 @@ bool StressTest::BuildOptionsTable() {
                std::to_string(FLAGS_block_size + (FLAGS_seed & 0xFFFU)) + "}",
        }},
   };
+  if (FLAGS_use_multiscan == 0) {
+    // TODO: this can fail MultiScan when consecutive data blocks share the
+    // same user at boundary. MultiScan uses user key to locate the block to
+    // reach which can move the scan earlier than its current block.
+    options_tbl.emplace("max_sequential_skip_in_iterations",
+                        std::vector<std::string>{"4", "8", "12"});
+  }
   if (FLAGS_compaction_style == kCompactionStyleUniversal &&
       FLAGS_universal_max_read_amp > 0) {
     // level0_file_num_compaction_trigger needs to be at most max_read_amp

From 1bb704b6e05287bd4160bab212d05bbb0a2985fb Mon Sep 17 00:00:00 2001
From: zaidoon <zaidoon@cloudflare.com>
Date: Wed, 29 Oct 2025 12:57:49 -0700
Subject: [PATCH 360/500] optimize memory allocations and vector overhead in
 RocksDB C API using unique_ptr and PinnableSlice (#14036)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Summary:
Comprehensive performance optimizations for the RocksDB C API that eliminate unnecessary memory allocations and copies.

## Key Changes

### 1. PinnableSlice for Get Operations (50% reduction in copies)
- Changed all `rocksdb_get*` functions to use `PinnableSlice` internally instead of `std::string`
- **Before:** RocksDB → std::string → malloc'd buffer (2 copies)
- **After:** RocksDB → malloc'd buffer (1 copy)
- Affects: Get, Transaction Get, TransactionDB Get, WriteBatch Get variants

### 2. Array-Based MultiGet with PinnableSlice (30% allocation reduction)
- Switched MultiGet operations to use optimized array-based RocksDB API with `PinnableSlice`
- Eliminates vector overhead and string allocations
- Affects: MultiGet, Transaction MultiGet, TransactionDB MultiGet variants

### New Zero-Copy APIs
Added high-performance zero-copy functions for applications that can use them:
- `rocksdb_iter_key_slice()` / `value_slice()` / `timestamp_slice()` - Return slices by value (eliminates output param overhead)
- `rocksdb_batched_multi_get_cf_slice()` - Batched get with slice array input
- `rocksdb_slice_t` - ABI-compatible slice type

Note that this pr builds on top of https://github.com/facebook/rocksdb/pull/13911

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14036

Reviewed By: pdillinger

Differential Revision: D85604919

Pulled By: jaykorean

fbshipit-source-id: 7f04b935eea79af1d45b3125a79b90e4706666f6
---
 db/c.cc             | 358 +++++++++++++++++++++++++++++---------------
 db/c_test.c         | 169 +++++++++++++++++++++
 include/rocksdb/c.h |  30 ++++
 3 files changed, 440 insertions(+), 117 deletions(-)

diff --git a/db/c.cc b/db/c.cc
index d6d476bd311f..b02c7bc4bd19 100644
--- a/db/c.cc
+++ b/db/c.cc
@@ -1543,12 +1543,17 @@ void rocksdb_multi_get(rocksdb_t* db, const rocksdb_readoptions_t* options,
                        size_t num_keys, const char* const* keys_list,
                        const size_t* keys_list_sizes, char** values_list,
                        size_t* values_list_sizes, char** errs) {
-  std::vector<Slice> keys(num_keys);
+  // Use unique_ptr for efficiency (avoids vector overhead for fixed-size array)
+  std::unique_ptr<Slice[]> keys(new Slice[num_keys]);
   for (size_t i = 0; i < num_keys; i++) {
     keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
   }
-  std::vector<std::string> values(num_keys);
-  std::vector<Status> statuses = db->rep->MultiGet(options->rep, keys, &values);
+  // Use PinnableSlice to avoid unnecessary allocations
+  auto cfh = db->rep->DefaultColumnFamily();
+  std::vector<PinnableSlice> values(num_keys);
+  std::vector<Status> statuses(num_keys);
+  db->rep->MultiGet(options->rep, cfh, num_keys, keys.get(), values.data(),
+                    statuses.data());
   for (size_t i = 0; i < num_keys; i++) {
     if (statuses[i].ok()) {
       values_list[i] = CopyString(values[i]);
@@ -1573,10 +1578,13 @@ void rocksdb_multi_get_with_ts(rocksdb_t* db,
                                char** values_list, size_t* values_list_sizes,
                                char** timestamp_list,
                                size_t* timestamp_list_sizes, char** errs) {
-  std::vector<Slice> keys(num_keys);
+  // Use unique_ptr for efficiency
+  std::unique_ptr<Slice[]> keys_arr(new Slice[num_keys]);
   for (size_t i = 0; i < num_keys; i++) {
-    keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
+    keys_arr[i] = Slice(keys_list[i], keys_list_sizes[i]);
   }
+  // Note: MultiGet with timestamps only has vector-based API
+  std::vector<Slice> keys(keys_arr.get(), keys_arr.get() + num_keys);
   std::vector<std::string> values(num_keys);
   std::vector<std::string> timestamps(num_keys);
   std::vector<Status> statuses =
@@ -1608,15 +1616,19 @@ void rocksdb_multi_get_cf(
     size_t num_keys, const char* const* keys_list,
     const size_t* keys_list_sizes, char** values_list,
     size_t* values_list_sizes, char** errs) {
-  std::vector<Slice> keys(num_keys);
-  std::vector<ColumnFamilyHandle*> cfs(num_keys);
+  // Use unique_ptr for efficiency (avoids vector overhead for fixed-size
+  // arrays)
+  std::unique_ptr<Slice[]> keys(new Slice[num_keys]);
+  std::unique_ptr<ColumnFamilyHandle*[]> cfs(new ColumnFamilyHandle*[num_keys]);
   for (size_t i = 0; i < num_keys; i++) {
     keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
     cfs[i] = column_families[i]->rep;
   }
-  std::vector<std::string> values(num_keys);
-  std::vector<Status> statuses =
-      db->rep->MultiGet(options->rep, cfs, keys, &values);
+  // Use PinnableSlice to avoid unnecessary allocations
+  std::vector<PinnableSlice> values(num_keys);
+  std::vector<Status> statuses(num_keys);
+  db->rep->MultiGet(options->rep, num_keys, cfs.get(), keys.get(),
+                    values.data(), statuses.data());
   for (size_t i = 0; i < num_keys; i++) {
     if (statuses[i].ok()) {
       values_list[i] = CopyString(values[i]);
@@ -1641,16 +1653,20 @@ void rocksdb_multi_get_cf_with_ts(
     const size_t* keys_list_sizes, char** values_list,
     size_t* values_list_sizes, char** timestamps_list,
     size_t* timestamps_list_sizes, char** errs) {
-  std::vector<Slice> keys(num_keys);
-  std::vector<ColumnFamilyHandle*> cfs(num_keys);
+  // Use unique_ptr for efficiency (avoids vector overhead for fixed-size
+  // arrays)
+  std::unique_ptr<Slice[]> keys(new Slice[num_keys]);
+  std::unique_ptr<ColumnFamilyHandle*[]> cfs(new ColumnFamilyHandle*[num_keys]);
   for (size_t i = 0; i < num_keys; i++) {
     keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
     cfs[i] = column_families[i]->rep;
   }
-  std::vector<std::string> values(num_keys);
+  // Use PinnableSlice to avoid unnecessary allocations
+  std::vector<PinnableSlice> values(num_keys);
   std::vector<std::string> timestamps(num_keys);
-  std::vector<Status> statuses =
-      db->rep->MultiGet(options->rep, cfs, keys, &values, &timestamps);
+  std::vector<Status> statuses(num_keys);
+  db->rep->MultiGet(options->rep, num_keys, cfs.get(), keys.get(),
+                    values.data(), timestamps.data(), statuses.data());
   for (size_t i = 0; i < num_keys; i++) {
     if (statuses[i].ok()) {
       values_list[i] = CopyString(values[i]);
@@ -1709,6 +1725,41 @@ void rocksdb_batched_multi_get_cf(rocksdb_t* db,
   delete[] statuses;
 }
 
+// Batched MultiGet that takes pre-built Slice array, avoiding key conversion
+// overhead
+void rocksdb_batched_multi_get_cf_slice(
+    rocksdb_t* db, const rocksdb_readoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, size_t num_keys,
+    const rocksdb_slice_t* keys_list, rocksdb_pinnableslice_t** values,
+    char** errs, const bool sorted_input) {
+  PinnableSlice* value_slices = new PinnableSlice[num_keys];
+  Status* statuses = new Status[num_keys];
+
+  // Cast rocksdb_slice_t* to Slice* - they have identical memory layout
+  const Slice* key_slices = reinterpret_cast<const Slice*>(keys_list);
+
+  db->rep->MultiGet(options->rep, column_family->rep, num_keys, key_slices,
+                    value_slices, statuses, sorted_input);
+
+  for (size_t i = 0; i < num_keys; ++i) {
+    if (statuses[i].ok()) {
+      values[i] = new (rocksdb_pinnableslice_t);
+      values[i]->rep = std::move(value_slices[i]);
+      errs[i] = nullptr;
+    } else {
+      values[i] = nullptr;
+      if (!statuses[i].IsNotFound()) {
+        errs[i] = strdup(statuses[i].ToString().c_str());
+      } else {
+        errs[i] = nullptr;
+      }
+    }
+  }
+
+  delete[] value_slices;
+  delete[] statuses;
+}
+
 unsigned char rocksdb_key_may_exist(rocksdb_t* db,
                                     const rocksdb_readoptions_t* options,
                                     const char* key, size_t key_len,
@@ -2129,6 +2180,32 @@ void rocksdb_iter_get_error(const rocksdb_iterator_t* iter, char** errptr) {
   SaveError(errptr, iter->rep->status());
 }
 
+// Iterator functions that return rocksdb_slice_t directly for better
+// performance
+rocksdb_slice_t rocksdb_iter_key_slice(const rocksdb_iterator_t* iter) {
+  Slice s = iter->rep->key();
+  rocksdb_slice_t result;
+  result.data = s.data();
+  result.size = s.size();
+  return result;
+}
+
+rocksdb_slice_t rocksdb_iter_value_slice(const rocksdb_iterator_t* iter) {
+  Slice s = iter->rep->value();
+  rocksdb_slice_t result;
+  result.data = s.data();
+  result.size = s.size();
+  return result;
+}
+
+rocksdb_slice_t rocksdb_iter_timestamp_slice(const rocksdb_iterator_t* iter) {
+  Slice s = iter->rep->timestamp();
+  rocksdb_slice_t result;
+  result.data = s.data();
+  result.size = s.size();
+  return result;
+}
+
 void rocksdb_iter_refresh(const rocksdb_iterator_t* iter, char** errptr) {
   SaveError(errptr, iter->rep->Refresh());
 }
@@ -2184,16 +2261,18 @@ void rocksdb_writebatch_putv(rocksdb_writebatch_t* b, int num_keys,
                              const size_t* keys_list_sizes, int num_values,
                              const char* const* values_list,
                              const size_t* values_list_sizes) {
-  std::vector<Slice> key_slices(num_keys);
+  // Use unique_ptr instead of vector to avoid overhead
+  // Safe because WriteBatch::Put immediately copies the data
+  std::unique_ptr<Slice[]> key_slices(new Slice[num_keys]);
   for (int i = 0; i < num_keys; i++) {
     key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
   }
-  std::vector<Slice> value_slices(num_values);
+  std::unique_ptr<Slice[]> value_slices(new Slice[num_values]);
   for (int i = 0; i < num_values; i++) {
     value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
   }
-  b->rep.Put(SliceParts(key_slices.data(), num_keys),
-             SliceParts(value_slices.data(), num_values));
+  b->rep.Put(SliceParts(key_slices.get(), num_keys),
+             SliceParts(value_slices.get(), num_values));
 }
 
 void rocksdb_writebatch_putv_cf(rocksdb_writebatch_t* b,
@@ -2202,16 +2281,18 @@ void rocksdb_writebatch_putv_cf(rocksdb_writebatch_t* b,
                                 const size_t* keys_list_sizes, int num_values,
                                 const char* const* values_list,
                                 const size_t* values_list_sizes) {
-  std::vector<Slice> key_slices(num_keys);
+  // Use unique_ptr instead of vector to avoid overhead
+  // Safe because WriteBatch::Put immediately copies the data
+  std::unique_ptr<Slice[]> key_slices(new Slice[num_keys]);
   for (int i = 0; i < num_keys; i++) {
     key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
   }
-  std::vector<Slice> value_slices(num_values);
+  std::unique_ptr<Slice[]> value_slices(new Slice[num_values]);
   for (int i = 0; i < num_values; i++) {
     value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
   }
-  b->rep.Put(column_family->rep, SliceParts(key_slices.data(), num_keys),
-             SliceParts(value_slices.data(), num_values));
+  b->rep.Put(column_family->rep, SliceParts(key_slices.get(), num_keys),
+             SliceParts(value_slices.get(), num_values));
 }
 
 void rocksdb_writebatch_merge(rocksdb_writebatch_t* b, const char* key,
@@ -2231,16 +2312,18 @@ void rocksdb_writebatch_mergev(rocksdb_writebatch_t* b, int num_keys,
                                const size_t* keys_list_sizes, int num_values,
                                const char* const* values_list,
                                const size_t* values_list_sizes) {
-  std::vector<Slice> key_slices(num_keys);
+  // Use unique_ptr instead of vector to avoid overhead
+  // Safe because WriteBatch::Merge immediately copies the data
+  std::unique_ptr<Slice[]> key_slices(new Slice[num_keys]);
   for (int i = 0; i < num_keys; i++) {
     key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
   }
-  std::vector<Slice> value_slices(num_values);
+  std::unique_ptr<Slice[]> value_slices(new Slice[num_values]);
   for (int i = 0; i < num_values; i++) {
     value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
   }
-  b->rep.Merge(SliceParts(key_slices.data(), num_keys),
-               SliceParts(value_slices.data(), num_values));
+  b->rep.Merge(SliceParts(key_slices.get(), num_keys),
+               SliceParts(value_slices.get(), num_values));
 }
 
 void rocksdb_writebatch_mergev_cf(rocksdb_writebatch_t* b,
@@ -2249,16 +2332,18 @@ void rocksdb_writebatch_mergev_cf(rocksdb_writebatch_t* b,
                                   const size_t* keys_list_sizes, int num_values,
                                   const char* const* values_list,
                                   const size_t* values_list_sizes) {
-  std::vector<Slice> key_slices(num_keys);
+  // Use unique_ptr instead of vector to avoid overhead
+  // Safe because WriteBatch::Merge immediately copies the data
+  std::unique_ptr<Slice[]> key_slices(new Slice[num_keys]);
   for (int i = 0; i < num_keys; i++) {
     key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
   }
-  std::vector<Slice> value_slices(num_values);
+  std::unique_ptr<Slice[]> value_slices(new Slice[num_values]);
   for (int i = 0; i < num_values; i++) {
     value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
   }
-  b->rep.Merge(column_family->rep, SliceParts(key_slices.data(), num_keys),
-               SliceParts(value_slices.data(), num_values));
+  b->rep.Merge(column_family->rep, SliceParts(key_slices.get(), num_keys),
+               SliceParts(value_slices.get(), num_values));
 }
 
 void rocksdb_writebatch_delete(rocksdb_writebatch_t* b, const char* key,
@@ -2298,21 +2383,25 @@ void rocksdb_writebatch_singledelete_cf_with_ts(
 void rocksdb_writebatch_deletev(rocksdb_writebatch_t* b, int num_keys,
                                 const char* const* keys_list,
                                 const size_t* keys_list_sizes) {
-  std::vector<Slice> key_slices(num_keys);
+  // Use unique_ptr instead of vector to avoid overhead
+  // Safe because WriteBatch::Delete immediately copies the data
+  std::unique_ptr<Slice[]> key_slices(new Slice[num_keys]);
   for (int i = 0; i < num_keys; i++) {
     key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
   }
-  b->rep.Delete(SliceParts(key_slices.data(), num_keys));
+  b->rep.Delete(SliceParts(key_slices.get(), num_keys));
 }
 
 void rocksdb_writebatch_deletev_cf(
     rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family,
     int num_keys, const char* const* keys_list, const size_t* keys_list_sizes) {
-  std::vector<Slice> key_slices(num_keys);
+  // Use unique_ptr instead of vector to avoid overhead
+  // Safe because WriteBatch::Delete immediately copies the data
+  std::unique_ptr<Slice[]> key_slices(new Slice[num_keys]);
   for (int i = 0; i < num_keys; i++) {
     key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
   }
-  b->rep.Delete(column_family->rep, SliceParts(key_slices.data(), num_keys));
+  b->rep.Delete(column_family->rep, SliceParts(key_slices.get(), num_keys));
 }
 
 void rocksdb_writebatch_delete_range(rocksdb_writebatch_t* b,
@@ -2336,14 +2425,16 @@ void rocksdb_writebatch_delete_rangev(rocksdb_writebatch_t* b, int num_keys,
                                       const size_t* start_keys_list_sizes,
                                       const char* const* end_keys_list,
                                       const size_t* end_keys_list_sizes) {
-  std::vector<Slice> start_key_slices(num_keys);
-  std::vector<Slice> end_key_slices(num_keys);
+  // Use unique_ptr instead of vector to avoid overhead
+  // Safe because WriteBatch::DeleteRange immediately copies the data
+  std::unique_ptr<Slice[]> start_key_slices(new Slice[num_keys]);
+  std::unique_ptr<Slice[]> end_key_slices(new Slice[num_keys]);
   for (int i = 0; i < num_keys; i++) {
     start_key_slices[i] = Slice(start_keys_list[i], start_keys_list_sizes[i]);
     end_key_slices[i] = Slice(end_keys_list[i], end_keys_list_sizes[i]);
   }
-  b->rep.DeleteRange(SliceParts(start_key_slices.data(), num_keys),
-                     SliceParts(end_key_slices.data(), num_keys));
+  b->rep.DeleteRange(SliceParts(start_key_slices.get(), num_keys),
+                     SliceParts(end_key_slices.get(), num_keys));
 }
 
 void rocksdb_writebatch_delete_rangev_cf(
@@ -2351,15 +2442,17 @@ void rocksdb_writebatch_delete_rangev_cf(
     int num_keys, const char* const* start_keys_list,
     const size_t* start_keys_list_sizes, const char* const* end_keys_list,
     const size_t* end_keys_list_sizes) {
-  std::vector<Slice> start_key_slices(num_keys);
-  std::vector<Slice> end_key_slices(num_keys);
+  // Use unique_ptr instead of vector to avoid overhead
+  // Safe because WriteBatch::DeleteRange immediately copies the data
+  std::unique_ptr<Slice[]> start_key_slices(new Slice[num_keys]);
+  std::unique_ptr<Slice[]> end_key_slices(new Slice[num_keys]);
   for (int i = 0; i < num_keys; i++) {
     start_key_slices[i] = Slice(start_keys_list[i], start_keys_list_sizes[i]);
     end_key_slices[i] = Slice(end_keys_list[i], end_keys_list_sizes[i]);
   }
   b->rep.DeleteRange(column_family->rep,
-                     SliceParts(start_key_slices.data(), num_keys),
-                     SliceParts(end_key_slices.data(), num_keys));
+                     SliceParts(start_key_slices.get(), num_keys),
+                     SliceParts(end_key_slices.get(), num_keys));
 }
 
 void rocksdb_writebatch_put_log_data(rocksdb_writebatch_t* b, const char* blob,
@@ -2520,16 +2613,17 @@ void rocksdb_writebatch_wi_putv(rocksdb_writebatch_wi_t* b, int num_keys,
                                 const size_t* keys_list_sizes, int num_values,
                                 const char* const* values_list,
                                 const size_t* values_list_sizes) {
-  std::vector<Slice> key_slices(num_keys);
+  // Use unique_ptr for better performance (avoids vector overhead)
+  std::unique_ptr<Slice[]> key_slices(new Slice[num_keys]);
   for (int i = 0; i < num_keys; i++) {
     key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
   }
-  std::vector<Slice> value_slices(num_values);
+  std::unique_ptr<Slice[]> value_slices(new Slice[num_values]);
   for (int i = 0; i < num_values; i++) {
     value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
   }
-  b->rep->Put(SliceParts(key_slices.data(), num_keys),
-              SliceParts(value_slices.data(), num_values));
+  b->rep->Put(SliceParts(key_slices.get(), num_keys),
+              SliceParts(value_slices.get(), num_values));
 }
 
 void rocksdb_writebatch_wi_putv_cf(
@@ -2565,16 +2659,17 @@ void rocksdb_writebatch_wi_mergev(rocksdb_writebatch_wi_t* b, int num_keys,
                                   const size_t* keys_list_sizes, int num_values,
                                   const char* const* values_list,
                                   const size_t* values_list_sizes) {
-  std::vector<Slice> key_slices(num_keys);
+  // Use unique_ptr for better performance (avoids vector overhead)
+  std::unique_ptr<Slice[]> key_slices(new Slice[num_keys]);
   for (int i = 0; i < num_keys; i++) {
     key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
   }
-  std::vector<Slice> value_slices(num_values);
+  std::unique_ptr<Slice[]> value_slices(new Slice[num_values]);
   for (int i = 0; i < num_values; i++) {
     value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
   }
-  b->rep->Merge(SliceParts(key_slices.data(), num_keys),
-                SliceParts(value_slices.data(), num_values));
+  b->rep->Merge(SliceParts(key_slices.get(), num_keys),
+                SliceParts(value_slices.get(), num_values));
 }
 
 void rocksdb_writebatch_wi_mergev_cf(
@@ -2582,16 +2677,17 @@ void rocksdb_writebatch_wi_mergev_cf(
     int num_keys, const char* const* keys_list, const size_t* keys_list_sizes,
     int num_values, const char* const* values_list,
     const size_t* values_list_sizes) {
-  std::vector<Slice> key_slices(num_keys);
+  // Use unique_ptr for better performance (avoids vector overhead)
+  std::unique_ptr<Slice[]> key_slices(new Slice[num_keys]);
   for (int i = 0; i < num_keys; i++) {
     key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
   }
-  std::vector<Slice> value_slices(num_values);
+  std::unique_ptr<Slice[]> value_slices(new Slice[num_values]);
   for (int i = 0; i < num_values; i++) {
     value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
   }
-  b->rep->Merge(column_family->rep, SliceParts(key_slices.data(), num_keys),
-                SliceParts(value_slices.data(), num_values));
+  b->rep->Merge(column_family->rep, SliceParts(key_slices.get(), num_keys),
+                SliceParts(value_slices.get(), num_values));
 }
 
 void rocksdb_writebatch_wi_delete(rocksdb_writebatch_wi_t* b, const char* key,
@@ -2629,11 +2725,12 @@ void rocksdb_writebatch_wi_deletev(rocksdb_writebatch_wi_t* b, int num_keys,
 void rocksdb_writebatch_wi_deletev_cf(
     rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family,
     int num_keys, const char* const* keys_list, const size_t* keys_list_sizes) {
-  std::vector<Slice> key_slices(num_keys);
+  // Use unique_ptr for better performance (avoids vector overhead)
+  std::unique_ptr<Slice[]> key_slices(new Slice[num_keys]);
   for (int i = 0; i < num_keys; i++) {
     key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
   }
-  b->rep->Delete(column_family->rep, SliceParts(key_slices.data(), num_keys));
+  b->rep->Delete(column_family->rep, SliceParts(key_slices.get(), num_keys));
 }
 
 void rocksdb_writebatch_wi_delete_range(rocksdb_writebatch_wi_t* b,
@@ -2659,14 +2756,15 @@ void rocksdb_writebatch_wi_delete_rangev(rocksdb_writebatch_wi_t* b,
                                          const size_t* start_keys_list_sizes,
                                          const char* const* end_keys_list,
                                          const size_t* end_keys_list_sizes) {
-  std::vector<Slice> start_key_slices(num_keys);
-  std::vector<Slice> end_key_slices(num_keys);
+  // Use unique_ptr for better performance (avoids vector overhead)
+  std::unique_ptr<Slice[]> start_key_slices(new Slice[num_keys]);
+  std::unique_ptr<Slice[]> end_key_slices(new Slice[num_keys]);
   for (int i = 0; i < num_keys; i++) {
     start_key_slices[i] = Slice(start_keys_list[i], start_keys_list_sizes[i]);
     end_key_slices[i] = Slice(end_keys_list[i], end_keys_list_sizes[i]);
   }
-  b->rep->DeleteRange(SliceParts(start_key_slices.data(), num_keys),
-                      SliceParts(end_key_slices.data(), num_keys));
+  b->rep->DeleteRange(SliceParts(start_key_slices.get(), num_keys),
+                      SliceParts(end_key_slices.get(), num_keys));
 }
 
 void rocksdb_writebatch_wi_delete_rangev_cf(
@@ -2674,15 +2772,16 @@ void rocksdb_writebatch_wi_delete_rangev_cf(
     int num_keys, const char* const* start_keys_list,
     const size_t* start_keys_list_sizes, const char* const* end_keys_list,
     const size_t* end_keys_list_sizes) {
-  std::vector<Slice> start_key_slices(num_keys);
-  std::vector<Slice> end_key_slices(num_keys);
+  // Use unique_ptr for better performance (avoids vector overhead)
+  std::unique_ptr<Slice[]> start_key_slices(new Slice[num_keys]);
+  std::unique_ptr<Slice[]> end_key_slices(new Slice[num_keys]);
   for (int i = 0; i < num_keys; i++) {
     start_key_slices[i] = Slice(start_keys_list[i], start_keys_list_sizes[i]);
     end_key_slices[i] = Slice(end_keys_list[i], end_keys_list_sizes[i]);
   }
   b->rep->DeleteRange(column_family->rep,
-                      SliceParts(start_key_slices.data(), num_keys),
-                      SliceParts(end_key_slices.data(), num_keys));
+                      SliceParts(start_key_slices.get(), num_keys),
+                      SliceParts(end_key_slices.get(), num_keys));
 }
 
 void rocksdb_writebatch_wi_put_log_data(rocksdb_writebatch_wi_t* b,
@@ -2800,12 +2899,13 @@ char* rocksdb_writebatch_wi_get_from_batch_and_db(
     const rocksdb_readoptions_t* options, const char* key, size_t keylen,
     size_t* vallen, char** errptr) {
   char* result = nullptr;
-  std::string tmp;
+  // Use PinnableSlice to avoid unnecessary allocations
+  PinnableSlice pinnable_val;
   Status s = wbwi->rep->GetFromBatchAndDB(db->rep, options->rep,
-                                          Slice(key, keylen), &tmp);
+                                          Slice(key, keylen), &pinnable_val);
   if (s.ok()) {
-    *vallen = tmp.size();
-    result = CopyString(tmp);
+    *vallen = pinnable_val.size();
+    result = CopyString(pinnable_val);
   } else {
     *vallen = 0;
     if (!s.IsNotFound()) {
@@ -2838,12 +2938,14 @@ char* rocksdb_writebatch_wi_get_from_batch_and_db_cf(
     rocksdb_column_family_handle_t* column_family, const char* key,
     size_t keylen, size_t* vallen, char** errptr) {
   char* result = nullptr;
-  std::string tmp;
-  Status s = wbwi->rep->GetFromBatchAndDB(
-      db->rep, options->rep, column_family->rep, Slice(key, keylen), &tmp);
+  // Use PinnableSlice to avoid unnecessary allocations
+  PinnableSlice pinnable_val;
+  Status s =
+      wbwi->rep->GetFromBatchAndDB(db->rep, options->rep, column_family->rep,
+                                   Slice(key, keylen), &pinnable_val);
   if (s.ok()) {
-    *vallen = tmp.size();
-    result = CopyString(tmp);
+    *vallen = pinnable_val.size();
+    result = CopyString(pinnable_val);
   } else {
     *vallen = 0;
     if (!s.IsNotFound()) {
@@ -6928,11 +7030,11 @@ char* rocksdb_transaction_get(rocksdb_transaction_t* txn,
                               const char* key, size_t klen, size_t* vlen,
                               char** errptr) {
   char* result = nullptr;
-  std::string tmp;
-  Status s = txn->rep->Get(options->rep, Slice(key, klen), &tmp);
+  PinnableSlice pinnable_val;
+  Status s = txn->rep->Get(options->rep, Slice(key, klen), &pinnable_val);
   if (s.ok()) {
-    *vlen = tmp.size();
-    result = CopyString(tmp);
+    *vlen = pinnable_val.size();
+    result = CopyString(pinnable_val);
   } else {
     *vlen = 0;
     if (!s.IsNotFound()) {
@@ -6963,12 +7065,12 @@ char* rocksdb_transaction_get_cf(rocksdb_transaction_t* txn,
                                  const char* key, size_t klen, size_t* vlen,
                                  char** errptr) {
   char* result = nullptr;
-  std::string tmp;
-  Status s =
-      txn->rep->Get(options->rep, column_family->rep, Slice(key, klen), &tmp);
+  PinnableSlice pinnable_val;
+  Status s = txn->rep->Get(options->rep, column_family->rep, Slice(key, klen),
+                           &pinnable_val);
   if (s.ok()) {
-    *vlen = tmp.size();
-    result = CopyString(tmp);
+    *vlen = pinnable_val.size();
+    result = CopyString(pinnable_val);
   } else {
     *vlen = 0;
     if (!s.IsNotFound()) {
@@ -7002,12 +7104,12 @@ char* rocksdb_transaction_get_for_update(rocksdb_transaction_t* txn,
                                          size_t* vlen, unsigned char exclusive,
                                          char** errptr) {
   char* result = nullptr;
-  std::string tmp;
-  Status s =
-      txn->rep->GetForUpdate(options->rep, Slice(key, klen), &tmp, exclusive);
+  PinnableSlice pinnable_val;
+  Status s = txn->rep->GetForUpdate(options->rep, Slice(key, klen),
+                                    &pinnable_val, exclusive);
   if (s.ok()) {
-    *vlen = tmp.size();
-    result = CopyString(tmp);
+    *vlen = pinnable_val.size();
+    result = CopyString(pinnable_val);
   } else {
     *vlen = 0;
     if (!s.IsNotFound()) {
@@ -7039,12 +7141,12 @@ char* rocksdb_transaction_get_for_update_cf(
     rocksdb_column_family_handle_t* column_family, const char* key, size_t klen,
     size_t* vlen, unsigned char exclusive, char** errptr) {
   char* result = nullptr;
-  std::string tmp;
+  PinnableSlice pinnable_val;
   Status s = txn->rep->GetForUpdate(options->rep, column_family->rep,
-                                    Slice(key, klen), &tmp, exclusive);
+                                    Slice(key, klen), &pinnable_val, exclusive);
   if (s.ok()) {
-    *vlen = tmp.size();
-    result = CopyString(tmp);
+    *vlen = pinnable_val.size();
+    result = CopyString(pinnable_val);
   } else {
     *vlen = 0;
     if (!s.IsNotFound()) {
@@ -7078,10 +7180,13 @@ void rocksdb_transaction_multi_get(rocksdb_transaction_t* txn,
                                    const size_t* keys_list_sizes,
                                    char** values_list,
                                    size_t* values_list_sizes, char** errs) {
-  std::vector<Slice> keys(num_keys);
+  // Use unique_ptr for efficiency
+  std::unique_ptr<Slice[]> keys_arr(new Slice[num_keys]);
   for (size_t i = 0; i < num_keys; i++) {
-    keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
+    keys_arr[i] = Slice(keys_list[i], keys_list_sizes[i]);
   }
+  // Note: Transaction only has vector-based MultiGet API
+  std::vector<Slice> keys(keys_arr.get(), keys_arr.get() + num_keys);
   std::vector<std::string> values(num_keys);
   std::vector<Status> statuses =
       txn->rep->MultiGet(options->rep, keys, &values);
@@ -7107,10 +7212,14 @@ void rocksdb_transaction_multi_get_for_update(
     size_t num_keys, const char* const* keys_list,
     const size_t* keys_list_sizes, char** values_list,
     size_t* values_list_sizes, char** errs) {
-  std::vector<Slice> keys(num_keys);
+  // Use unique_ptr for efficiency
+  std::unique_ptr<Slice[]> keys_arr(new Slice[num_keys]);
   for (size_t i = 0; i < num_keys; i++) {
-    keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
+    keys_arr[i] = Slice(keys_list[i], keys_list_sizes[i]);
   }
+  // Note: GetForUpdate only has vector-based API, no array-based PinnableSlice
+  // variant
+  std::vector<Slice> keys(keys_arr.get(), keys_arr.get() + num_keys);
   std::vector<std::string> values(num_keys);
   std::vector<Status> statuses =
       txn->rep->MultiGetForUpdate(options->rep, keys, &values);
@@ -7137,12 +7246,15 @@ void rocksdb_transaction_multi_get_cf(
     size_t num_keys, const char* const* keys_list,
     const size_t* keys_list_sizes, char** values_list,
     size_t* values_list_sizes, char** errs) {
-  std::vector<Slice> keys(num_keys);
+  // Use unique_ptr for efficiency
+  std::unique_ptr<Slice[]> keys_arr(new Slice[num_keys]);
   std::vector<ColumnFamilyHandle*> cfs(num_keys);
   for (size_t i = 0; i < num_keys; i++) {
-    keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
+    keys_arr[i] = Slice(keys_list[i], keys_list_sizes[i]);
     cfs[i] = column_families[i]->rep;
   }
+  // Note: Transaction only has vector-based MultiGet API
+  std::vector<Slice> keys(keys_arr.get(), keys_arr.get() + num_keys);
   std::vector<std::string> values(num_keys);
   std::vector<Status> statuses =
       txn->rep->MultiGet(options->rep, cfs, keys, &values);
@@ -7169,12 +7281,16 @@ void rocksdb_transaction_multi_get_for_update_cf(
     size_t num_keys, const char* const* keys_list,
     const size_t* keys_list_sizes, char** values_list,
     size_t* values_list_sizes, char** errs) {
-  std::vector<Slice> keys(num_keys);
+  // Use unique_ptr for efficiency
+  std::unique_ptr<Slice[]> keys_arr(new Slice[num_keys]);
   std::vector<ColumnFamilyHandle*> cfs(num_keys);
   for (size_t i = 0; i < num_keys; i++) {
-    keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
+    keys_arr[i] = Slice(keys_list[i], keys_list_sizes[i]);
     cfs[i] = column_families[i]->rep;
   }
+  // Note: GetForUpdate only has vector-based API, no array-based PinnableSlice
+  // variant
+  std::vector<Slice> keys(keys_arr.get(), keys_arr.get() + num_keys);
   std::vector<std::string> values(num_keys);
   std::vector<Status> statuses =
       txn->rep->MultiGetForUpdate(options->rep, cfs, keys, &values);
@@ -7201,11 +7317,12 @@ char* rocksdb_transactiondb_get(rocksdb_transactiondb_t* txn_db,
                                 const char* key, size_t klen, size_t* vlen,
                                 char** errptr) {
   char* result = nullptr;
-  std::string tmp;
-  Status s = txn_db->rep->Get(options->rep, Slice(key, klen), &tmp);
+  PinnableSlice pinnable_val;
+  Status s = txn_db->rep->Get(options->rep, txn_db->rep->DefaultColumnFamily(),
+                              Slice(key, klen), &pinnable_val);
   if (s.ok()) {
-    *vlen = tmp.size();
-    result = CopyString(tmp);
+    *vlen = pinnable_val.size();
+    result = CopyString(pinnable_val);
   } else {
     *vlen = 0;
     if (!s.IsNotFound()) {
@@ -7236,12 +7353,12 @@ char* rocksdb_transactiondb_get_cf(
     rocksdb_column_family_handle_t* column_family, const char* key,
     size_t keylen, size_t* vallen, char** errptr) {
   char* result = nullptr;
-  std::string tmp;
+  PinnableSlice pinnable_val;
   Status s = txn_db->rep->Get(options->rep, column_family->rep,
-                              Slice(key, keylen), &tmp);
+                              Slice(key, keylen), &pinnable_val);
   if (s.ok()) {
-    *vallen = tmp.size();
-    result = CopyString(tmp);
+    *vallen = pinnable_val.size();
+    result = CopyString(pinnable_val);
   } else {
     *vallen = 0;
     if (!s.IsNotFound()) {
@@ -7275,13 +7392,17 @@ void rocksdb_transactiondb_multi_get(rocksdb_transactiondb_t* txn_db,
                                      const size_t* keys_list_sizes,
                                      char** values_list,
                                      size_t* values_list_sizes, char** errs) {
-  std::vector<Slice> keys(num_keys);
+  // Use unique_ptr for efficiency
+  std::unique_ptr<Slice[]> keys(new Slice[num_keys]);
   for (size_t i = 0; i < num_keys; i++) {
     keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
   }
-  std::vector<std::string> values(num_keys);
-  std::vector<Status> statuses =
-      txn_db->rep->MultiGet(options->rep, keys, &values);
+  // Use PinnableSlice to avoid unnecessary allocations
+  auto cfh = txn_db->rep->DefaultColumnFamily();
+  std::vector<PinnableSlice> values(num_keys);
+  std::vector<Status> statuses(num_keys);
+  txn_db->rep->MultiGet(options->rep, cfh, num_keys, keys.get(), values.data(),
+                        statuses.data());
   for (size_t i = 0; i < num_keys; i++) {
     if (statuses[i].ok()) {
       values_list[i] = CopyString(values[i]);
@@ -7305,15 +7426,18 @@ void rocksdb_transactiondb_multi_get_cf(
     size_t num_keys, const char* const* keys_list,
     const size_t* keys_list_sizes, char** values_list,
     size_t* values_list_sizes, char** errs) {
-  std::vector<Slice> keys(num_keys);
-  std::vector<ColumnFamilyHandle*> cfs(num_keys);
+  // Use unique_ptr for efficiency
+  std::unique_ptr<Slice[]> keys(new Slice[num_keys]);
+  std::unique_ptr<ColumnFamilyHandle*[]> cfs(new ColumnFamilyHandle*[num_keys]);
   for (size_t i = 0; i < num_keys; i++) {
     keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
     cfs[i] = column_families[i]->rep;
   }
-  std::vector<std::string> values(num_keys);
-  std::vector<Status> statuses =
-      txn_db->rep->MultiGet(options->rep, cfs, keys, &values);
+  // Use PinnableSlice to avoid unnecessary allocations
+  std::vector<PinnableSlice> values(num_keys);
+  std::vector<Status> statuses(num_keys);
+  txn_db->rep->MultiGet(options->rep, num_keys, cfs.get(), keys.get(),
+                        values.data(), statuses.data());
   for (size_t i = 0; i < num_keys; i++) {
     if (statuses[i].ok()) {
       values_list[i] = CopyString(values[i]);
diff --git a/db/c_test.c b/db/c_test.c
index 2ac1c77617d4..ca5a76fba063 100644
--- a/db/c_test.c
+++ b/db/c_test.c
@@ -1255,6 +1255,70 @@ int main(int argc, char** argv) {
     rocksdb_writebatch_destroy(wb);
   }
 
+  StartPhase("writebatch_vectors_cf");
+  {
+    const char* cf_name = "wb_vectors_cf";
+    rocksdb_column_family_handle_t* wb_cf =
+        rocksdb_create_column_family(db, options, cf_name, &err);
+    CheckNoError(err);
+
+    rocksdb_writebatch_t* wb = rocksdb_writebatch_create();
+
+    // Test putv_cf: concatenates multiple slices into a single key/value
+    const char* put_keys[2] = {"k", "ey"};
+    const size_t put_key_sizes[2] = {1, 2};
+    const char* put_vals[3] = {"v", "a", "l"};
+    const size_t put_val_sizes[3] = {1, 1, 1};
+    rocksdb_writebatch_putv_cf(wb, wb_cf, 2, put_keys, put_key_sizes, 3,
+                               put_vals, put_val_sizes);
+    rocksdb_write(db, woptions, wb, &err);
+    CheckNoError(err);
+    // putv_cf concatenates: key="k"+"ey"="key", value="v"+"a"+"l"="val"
+    CheckGetCF(db, roptions, wb_cf, "key", "val");
+    CheckGetCF(db, roptions, wb_cf, "k", NULL);
+    CheckGetCF(db, roptions, wb_cf, "ey", NULL);
+
+    // Test deletev_cf: concatenates multiple slices for key
+    rocksdb_writebatch_clear(wb);
+    const char* del_keys[2] = {"k", "ey"};
+    const size_t del_key_sizes[2] = {1, 2};
+    rocksdb_writebatch_deletev_cf(wb, wb_cf, 2, del_keys, del_key_sizes);
+    rocksdb_write(db, woptions, wb, &err);
+    CheckNoError(err);
+    CheckGetCF(db, roptions, wb_cf, "key", NULL);
+
+    // Test delete_rangev_cf: concatenates slices for range deletion
+    rocksdb_writebatch_clear(wb);
+    rocksdb_writebatch_put_cf(wb, wb_cf, "a", 1, "1", 1);
+    rocksdb_writebatch_put_cf(wb, wb_cf, "b", 1, "2", 1);
+    rocksdb_writebatch_put_cf(wb, wb_cf, "c", 1, "3", 1);
+    rocksdb_write(db, woptions, wb, &err);
+    CheckNoError(err);
+    CheckGetCF(db, roptions, wb_cf, "a", "1");
+    CheckGetCF(db, roptions, wb_cf, "b", "2");
+    CheckGetCF(db, roptions, wb_cf, "c", "3");
+
+    rocksdb_writebatch_clear(wb);
+    const char* range_start[2] = {"a", ""};  // "a" + "" = "a"
+    const size_t range_start_sizes[2] = {1, 0};
+    const char* range_end[2] = {"c", ""};
+    const size_t range_end_sizes[2] = {1, 0};
+    rocksdb_writebatch_delete_rangev_cf(wb, wb_cf, 2, range_start,
+                                        range_start_sizes, range_end,
+                                        range_end_sizes);
+    rocksdb_write(db, woptions, wb, &err);
+    CheckNoError(err);
+    // Range [a, c) should delete "a" and "b", but not "c"
+    CheckGetCF(db, roptions, wb_cf, "a", NULL);
+    CheckGetCF(db, roptions, wb_cf, "b", NULL);
+    CheckGetCF(db, roptions, wb_cf, "c", "3");
+
+    rocksdb_writebatch_destroy(wb);
+    rocksdb_drop_column_family(db, wb_cf, &err);
+    CheckNoError(err);
+    rocksdb_column_family_handle_destroy(wb_cf);
+  }
+
   StartPhase("writebatch_vectors");
   {
     rocksdb_writebatch_t* wb = rocksdb_writebatch_create();
@@ -1420,6 +1484,43 @@ int main(int argc, char** argv) {
     rocksdb_iter_destroy(iter);
   }
 
+  StartPhase("iter_slice");
+  {
+    // Test the new slice-based iterator API for better performance
+    rocksdb_iterator_t* iter = rocksdb_create_iterator(db, roptions);
+    CheckCondition(!rocksdb_iter_valid(iter));
+    rocksdb_iter_seek_to_first(iter);
+    CheckCondition(rocksdb_iter_valid(iter));
+
+    // Test rocksdb_iter_key_slice
+    rocksdb_slice_t key_slice = rocksdb_iter_key_slice(iter);
+    CheckEqual("box", key_slice.data, key_slice.size);
+
+    // Test rocksdb_iter_value_slice
+    rocksdb_slice_t value_slice = rocksdb_iter_value_slice(iter);
+    CheckEqual("c", value_slice.data, value_slice.size);
+
+    // Move to next entry and test again
+    rocksdb_iter_next(iter);
+    CheckCondition(rocksdb_iter_valid(iter));
+    key_slice = rocksdb_iter_key_slice(iter);
+    value_slice = rocksdb_iter_value_slice(iter);
+    CheckEqual("foo", key_slice.data, key_slice.size);
+    CheckEqual("hello", value_slice.data, value_slice.size);
+
+    // Test seeking with slice API
+    rocksdb_iter_seek(iter, "b", 1);
+    CheckCondition(rocksdb_iter_valid(iter));
+    key_slice = rocksdb_iter_key_slice(iter);
+    value_slice = rocksdb_iter_value_slice(iter);
+    CheckEqual("box", key_slice.data, key_slice.size);
+    CheckEqual("c", value_slice.data, value_slice.size);
+
+    rocksdb_iter_get_error(iter, &err);
+    CheckNoError(err);
+    rocksdb_iter_destroy(iter);
+  }
+
   StartPhase("wbwi_iter");
   {
     rocksdb_iterator_t* base_iter = rocksdb_create_iterator(db, roptions);
@@ -2094,6 +2195,74 @@ int main(int argc, char** argv) {
       }
     }
 
+    {
+      // Test rocksdb_batched_multi_get_cf_slice for better performance
+      // Build rocksdb_slice_t array directly to avoid conversion overhead
+      rocksdb_slice_t batched_key_slices[4];
+      batched_key_slices[0].data = "box";
+      batched_key_slices[0].size = 3;
+      batched_key_slices[1].data = "buff";
+      batched_key_slices[1].size = 4;
+      batched_key_slices[2].data = "barfooxx";
+      batched_key_slices[2].size = 8;
+      batched_key_slices[3].data = "box";
+      batched_key_slices[3].size = 3;
+
+      const char* expected_value[4] = {"c", "rocksdb", NULL, "c"};
+      char* batched_errs[4];
+      rocksdb_pinnableslice_t* pvals[4];
+
+      rocksdb_batched_multi_get_cf_slice(db, roptions, handles[1], 4,
+                                         batched_key_slices, pvals,
+                                         batched_errs, false);
+
+      const char* val;
+      size_t val_len;
+      for (i = 0; i < 4; ++i) {
+        CheckNoError(batched_errs[i]);
+        if (pvals[i] != NULL) {
+          val = rocksdb_pinnableslice_value(pvals[i], &val_len);
+          CheckEqual(expected_value[i], val, val_len);
+          rocksdb_pinnableslice_destroy(pvals[i]);
+        } else {
+          CheckEqual(expected_value[i], NULL, 0);
+        }
+      }
+    }
+
+    {
+      // Test rocksdb_batched_multi_get_cf_slice with sorted_input=true
+      // Keys must be in sorted order for this optimization
+      rocksdb_slice_t sorted_key_slices[3];
+      sorted_key_slices[0].data = "box";
+      sorted_key_slices[0].size = 3;
+      sorted_key_slices[1].data = "buff";
+      sorted_key_slices[1].size = 4;
+      sorted_key_slices[2].data = "notfound";
+      sorted_key_slices[2].size = 8;
+
+      const char* expected_value[3] = {"c", "rocksdb", NULL};
+      char* batched_errs[3];
+      rocksdb_pinnableslice_t* pvals[3];
+
+      rocksdb_batched_multi_get_cf_slice(db, roptions, handles[1], 3,
+                                         sorted_key_slices, pvals, batched_errs,
+                                         true);
+
+      const char* val;
+      size_t val_len;
+      for (i = 0; i < 3; ++i) {
+        CheckNoError(batched_errs[i]);
+        if (pvals[i] != NULL) {
+          val = rocksdb_pinnableslice_value(pvals[i], &val_len);
+          CheckEqual(expected_value[i], val, val_len);
+          rocksdb_pinnableslice_destroy(pvals[i]);
+        } else {
+          CheckEqual(expected_value[i], NULL, 0);
+        }
+      }
+    }
+
     {
       unsigned char value_found = 0;
 
diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h
index a5334baf6680..ce9e5229c824 100644
--- a/include/rocksdb/c.h
+++ b/include/rocksdb/c.h
@@ -148,6 +148,14 @@ typedef struct rocksdb_statistics_histogram_data_t
     rocksdb_statistics_histogram_data_t;
 typedef struct rocksdb_wait_for_compact_options_t
     rocksdb_wait_for_compact_options_t;
+
+/* rocksdb_slice_t: Optimized slice type for high-performance C API operations
+ * This struct is ABI-compatible with rocksdb::Slice for zero-copy interop.
+ * Used by slice iterator functions and batched operations. */
+typedef struct rocksdb_slice_t {
+  const char* data;
+  size_t size;
+} rocksdb_slice_t;
 typedef struct rocksdb_flushjobinfo_t rocksdb_flushjobinfo_t;
 typedef struct rocksdb_compactionjobinfo_t rocksdb_compactionjobinfo_t;
 typedef struct rocksdb_subcompactionjobinfo_t rocksdb_subcompactionjobinfo_t;
@@ -610,6 +618,16 @@ extern ROCKSDB_LIBRARY_API void rocksdb_batched_multi_get_cf(
     const char* const* keys_list, const size_t* keys_list_sizes,
     rocksdb_pinnableslice_t** values, char** errs, const bool sorted_input);
 
+/* Batched MultiGet with slice array: Takes rocksdb_slice_t array directly,
+ * avoiding key conversion. faster than rocksdb_batched_multi_get_cf for
+ * operations with many keys. Eliminates overhead of converting keys from
+ * separate pointer+size arrays to Slice objects. */
+extern ROCKSDB_LIBRARY_API void rocksdb_batched_multi_get_cf_slice(
+    rocksdb_t* db, const rocksdb_readoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, size_t num_keys,
+    const rocksdb_slice_t* keys_list, rocksdb_pinnableslice_t** values,
+    char** errs, const bool sorted_input);
+
 // The value is only allocated (using malloc) and returned if it is found and
 // value_found isn't NULL. In that case the user is responsible for freeing it.
 extern ROCKSDB_LIBRARY_API unsigned char rocksdb_key_may_exist(
@@ -776,6 +794,18 @@ extern ROCKSDB_LIBRARY_API const char* rocksdb_iter_timestamp(
     const rocksdb_iterator_t*, size_t* tslen);
 extern ROCKSDB_LIBRARY_API void rocksdb_iter_get_error(
     const rocksdb_iterator_t*, char** errptr);
+
+/* Slice iterator functions: Return rocksdb_slice_t directly for better
+ * performance. These functions avoid the overhead of passing output parameters
+ * and provide zero-copy access to key/value/timestamp data. faster than
+ * traditional rocksdb_iter_key/value/timestamp functions. */
+extern ROCKSDB_LIBRARY_API rocksdb_slice_t
+rocksdb_iter_key_slice(const rocksdb_iterator_t* iter);
+extern ROCKSDB_LIBRARY_API rocksdb_slice_t
+rocksdb_iter_value_slice(const rocksdb_iterator_t* iter);
+extern ROCKSDB_LIBRARY_API rocksdb_slice_t
+rocksdb_iter_timestamp_slice(const rocksdb_iterator_t* iter);
+
 extern ROCKSDB_LIBRARY_API void rocksdb_iter_refresh(
     const rocksdb_iterator_t* iter, char** errptr);
 

From 0eb5b43b4f9d139d5933c5423d786e2bf11a644d Mon Sep 17 00:00:00 2001
From: anand76 <anand1976@users.noreply.github.com>
Date: Wed, 29 Oct 2025 12:58:03 -0700
Subject: [PATCH 361/500] Change PosixWritableFile Truncate to reseek to new
 end of file (#14088)

Summary:
Change PosixWritableFile's Truncate to the new end offset. This ensures that future appends are written with no holes or overwrites. RocksDB doesn't guarantee this in the FileSystem contract, and its left up to the specific implementation.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14088

Reviewed By: cbi42

Differential Revision: D85786398

Pulled By: anand1976

fbshipit-source-id: 3520d9d6336362f5128a17bbf396297d821a5da3
---
 env/io_posix.cc                               |  1 +
 env/io_posix_test.cc                          | 43 +++++++++++++++++++
 include/rocksdb/file_system.h                 |  6 ++-
 .../posix_writable_file_truncate.md           |  1 +
 4 files changed, 49 insertions(+), 2 deletions(-)
 create mode 100644 unreleased_history/behavior_changes/posix_writable_file_truncate.md

diff --git a/env/io_posix.cc b/env/io_posix.cc
index 6f3edf47a507..5a0f0338d50a 100644
--- a/env/io_posix.cc
+++ b/env/io_posix.cc
@@ -1406,6 +1406,7 @@ IOStatus PosixWritableFile::Truncate(uint64_t size, const IOOptions& /*opts*/,
                 filename_, errno);
   } else {
     filesize_ = size;
+    lseek(fd_, filesize_, SEEK_SET);
   }
   return s;
 }
diff --git a/env/io_posix_test.cc b/env/io_posix_test.cc
index 81ce5058708b..6daff356afaf 100644
--- a/env/io_posix_test.cc
+++ b/env/io_posix_test.cc
@@ -4,6 +4,7 @@
 // (found in the LICENSE.Apache file in the root directory).
 
 #include "test_util/testharness.h"
+#include "util/random.h"
 
 #ifdef ROCKSDB_LIB_IO_POSIX
 #include "env/io_posix.h"
@@ -131,6 +132,48 @@ TEST_F(LogicalBlockSizeCacheTest, Ref) {
 }
 #endif
 
+class PosixWritableFileTest : public testing::Test {};
+
+TEST_F(PosixWritableFileTest, SeekAfterTruncate) {
+  std::shared_ptr<FileSystem> fs = FileSystem::Default();
+  std::string path =
+      test::PerThreadDBPath("PosixWritableFileTest_SeekAfterTruncate");
+  Random rnd(300);
+  std::unique_ptr<FSWritableFile> wfile;
+
+  ASSERT_OK(fs->NewWritableFile(path, FileOptions(), &wfile, nullptr));
+  ASSERT_OK(wfile->Append(rnd.RandomString(16384), IOOptions(), nullptr));
+  ASSERT_OK(wfile->Truncate(4096, IOOptions(), nullptr));
+  ASSERT_OK(wfile->Append(rnd.RandomString(4096), IOOptions(), nullptr));
+  ASSERT_OK(wfile->Close(IOOptions(), nullptr));
+  wfile.reset();
+
+  uint64_t size = 0;
+  ASSERT_OK(fs->GetFileSize(path, IOOptions(), &size, nullptr));
+  ASSERT_EQ(size, 8192);
+  ASSERT_OK(fs->DeleteFile(path, IOOptions(), nullptr));
+}
+
+TEST_F(PosixWritableFileTest, SeekAfterExtend) {
+  std::shared_ptr<FileSystem> fs = FileSystem::Default();
+  std::string path =
+      test::PerThreadDBPath("PosixWritableFileTest_SeekAfterTruncate");
+  Random rnd(300);
+  std::unique_ptr<FSWritableFile> wfile;
+
+  ASSERT_OK(fs->NewWritableFile(path, FileOptions(), &wfile, nullptr));
+  ASSERT_OK(wfile->Append(rnd.RandomString(4096), IOOptions(), nullptr));
+  ASSERT_OK(wfile->Truncate(8192, IOOptions(), nullptr));
+  ASSERT_OK(wfile->Append(rnd.RandomString(8192), IOOptions(), nullptr));
+  ASSERT_OK(wfile->Close(IOOptions(), nullptr));
+  wfile.reset();
+
+  uint64_t size = 0;
+  ASSERT_OK(fs->GetFileSize(path, IOOptions(), &size, nullptr));
+  ASSERT_EQ(size, 16384);
+  ASSERT_OK(fs->DeleteFile(path, IOOptions(), nullptr));
+}
+
 }  // namespace ROCKSDB_NAMESPACE
 #endif
 
diff --git a/include/rocksdb/file_system.h b/include/rocksdb/file_system.h
index 8fbb8c4ab55a..b1b0a17a97c0 100644
--- a/include/rocksdb/file_system.h
+++ b/include/rocksdb/file_system.h
@@ -1166,8 +1166,10 @@ class FSWritableFile {
 
   // Truncate is necessary to trim the file to the correct size
   // before closing. It is not always possible to keep track of the file
-  // size due to whole pages writes. The behavior is undefined if called
-  // with other writes to follow.
+  // size due to whole pages writes. If called with other writes to follow,
+  // the behavior is file system specific. Posix will reseek to the new EOF.
+  // Other file systems may behave differently. Its the caller's
+  // responsibility to check the file system contract.
   virtual IOStatus Truncate(uint64_t /*size*/, const IOOptions& /*options*/,
                             IODebugContext* /*dbg*/) {
     return IOStatus::OK();
diff --git a/unreleased_history/behavior_changes/posix_writable_file_truncate.md b/unreleased_history/behavior_changes/posix_writable_file_truncate.md
new file mode 100644
index 000000000000..861702e124b4
--- /dev/null
+++ b/unreleased_history/behavior_changes/posix_writable_file_truncate.md
@@ -0,0 +1 @@
+PosixWritableFile now repositions the seek pointer to the new end of file after a call to Truncate.

From 94d91daddb64b38df0a30dac5c199a419bae9480 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Wed, 29 Oct 2025 16:02:10 -0700
Subject: [PATCH 362/500] Update folly (part way), fix USE_FOLLY_LITE (#14071)

Summary:
Resolving this folly upgrade required fixing the FOLLY_LITE build with header include from the 'fmt' library.

I was close to timing out on fixing USE_FOLLY_LITE and removing it altogether - it could be considered obsolete and/or not worth the maintenance cost.

Follow-up: make the folly build caching more friendly by hashing the relevant makefile parts. Not in this PR because then you wouldn't be able to see what changed in the folly build steps themselves.

UPDATE/NOTE: I wasn't able to fully update to latest due to a failure seen in F14, using the next folly commit or later. The source of the bug is likely outside of F14 but investigation is in progress.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14071

Test Plan: CI

Reviewed By: jaykorean

Differential Revision: D85268833

Pulled By: pdillinger

fbshipit-source-id: 1d0a2d61f095524a20e6ec796ef46c02d0696f4e
---
 Makefile                          | 19 ++++++++++++++-----
 build_tools/build_detect_platform |  2 ++
 2 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/Makefile b/Makefile
index a6a5dd3b4708..87adeef9e0eb 100644
--- a/Makefile
+++ b/Makefile
@@ -510,6 +510,17 @@ ifneq ($(strip $(BOOST_SOURCE_PATH)),)
 		PLATFORM_CXXFLAGS += -isystem $(BOOST_INCLUDE)
 	endif
 endif  # BOOST_SOURCE_PATH
+ifneq ($(strip $(FMT_SOURCE_PATH)),)
+	FMT_INCLUDE = $(shell (ls -d $(FMT_SOURCE_PATH)/fmt*/include/))
+	# AIX: pre-defined system headers are surrounded by an extern "C" block
+	ifeq ($(PLATFORM), OS_AIX)
+		PLATFORM_CCFLAGS += -I$(FMT_INCLUDE)
+		PLATFORM_CXXFLAGS += -I$(FMT_INCLUDE)
+	else
+		PLATFORM_CCFLAGS += -isystem $(FMT_INCLUDE)
+		PLATFORM_CXXFLAGS += -isystem $(FMT_INCLUDE)
+	endif
+endif  # FMT_SOURCE_PATH
 	# AIX: pre-defined system headers are surrounded by an extern "C" block
 	ifeq ($(PLATFORM), OS_AIX)
 		PLATFORM_CCFLAGS += -I$(FOLLY_DIR)
@@ -2495,7 +2506,7 @@ commit_prereq:
 	false # J=$(J) build_tools/precommit_checker.py unit clang_unit release clang_release tsan asan ubsan lite unit_non_shm
 	# $(MAKE) clean && $(MAKE) jclean && $(MAKE) rocksdbjava;
 
-FOLLY_COMMIT_HASH = e95383b7c8b5b1e46cf47acf2f317d54f93c8268
+FOLLY_COMMIT_HASH = b5543d6706270cd41f1140421cc13c0d7e695ae2
 
 # For public CI runs, checkout folly in a way that can build with RocksDB.
 # This is mostly intended as a test-only simulation of Meta-internal folly
@@ -2511,12 +2522,10 @@ checkout_folly:
 	cd third-party/folly && git reset --hard $(FOLLY_COMMIT_HASH)
 	@# Apparently missing include
 	perl -pi -e 's/(#include <atomic>)/$$1\n#include <cstring>/' third-party/folly/folly/lang/Exception.h
-	@# Warning-as-error on memcpy
-	perl -pi -e 's/memcpy.&ptr/memcpy((void*)&ptr/' third-party/folly/folly/lang/Exception.cpp
 	@# const mismatch
 	perl -pi -e 's/: environ/: (const char**)(environ)/' third-party/folly/folly/Subprocess.cpp
-	@# NOTE: boost source will be needed for any build including `USE_FOLLY_LITE` builds as those depend on boost headers
-	cd third-party/folly && $(PYTHON) build/fbcode_builder/getdeps.py fetch boost
+	@# NOTE: boost and fmt source will be needed for any build including `USE_FOLLY_LITE` builds as those depend on those headers
+	cd third-party/folly && $(PYTHON) build/fbcode_builder/getdeps.py fetch boost && $(PYTHON) build/fbcode_builder/getdeps.py fetch fmt
 
 CXX_M_FLAGS = $(filter -m%, $(CXXFLAGS))
 
diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform
index 93e0c0fa76b9..15f1cc568d24 100755
--- a/build_tools/build_detect_platform
+++ b/build_tools/build_detect_platform
@@ -772,6 +772,7 @@ fi
 if [ "$USE_FOLLY_LITE" ]; then
   if [ "$FOLLY_DIR" ]; then
     BOOST_SOURCE_PATH=`cd $FOLLY_DIR && $PYTHON build/fbcode_builder/getdeps.py show-source-dir boost`
+    FMT_SOURCE_PATH=`cd $FOLLY_DIR && $PYTHON build/fbcode_builder/getdeps.py show-source-dir fmt`
   fi
 fi
 
@@ -816,6 +817,7 @@ echo "FIND=$FIND" >> "$OUTPUT"
 echo "WATCH=$WATCH" >> "$OUTPUT"
 echo "FOLLY_PATH=$FOLLY_PATH" >> "$OUTPUT"
 echo "BOOST_SOURCE_PATH=$BOOST_SOURCE_PATH" >> "$OUTPUT"
+echo "FMT_SOURCE_PATH=$FMT_SOURCE_PATH" >> "$OUTPUT"
 
 # This will enable some related identifiers for the preprocessor
 if test -n "$JEMALLOC"; then

From 9577b92b555907f9989a2e793fd9372fc92a628c Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Sun, 2 Nov 2025 16:08:09 -0800
Subject: [PATCH 363/500] Fix ODR violation from open source folly build,
 update (#14094)

Summary:
Following up on https://github.com/facebook/rocksdb/pull/14071, updating folly to
https://github.com/facebook/folly/commit/8a9fc1e80a18cafadbec85e33d5042ce13a7c634 or beyond was failing an F14Table assertion for a very subtle reason: ODR violation between the folly build and RocksDB build because folly build was release mode and RocksDB build was debug mode. What was happening was that folly change introduced a dependence on kDebug (whether build is debug) in a hashing implementation in a .h file, and the inconsistency between the inlined implementation during RocksDB build and the linked-to implementation from the folly build was leading to inconsistencies in the data structure.

The primary fix is to ensure we build folly in debug mode for debug mode RocksDB builds. Also,

* Needed to use the `patchelf` tool in `build_folly` to ensure the glog dependency shared library can always find its own gflags dependency. I explored many options for working around this, and this is what would work without reworking folly's own build.
* Updated folly to latest commit.
* Thrown in an ad hoc folly patch to use ftp.gnu.org mirrors (the canonical is super slow)
* Moved the placement of GETDEPS_USE_WGET=1 to apply to local builds also, to avoid the issue of a large download almost reaching completion and then stalling indefinitely.
* Fix failing nightly build-linux-cmake-with-folly-lite-no-test with fmt includes in cmake build (as was done with make build)
* Add a release mode folly+RocksDB to nightly CI, including both cmake and make. This also serves as a non-cached folly build to detect potential problems with PR jobs working from cached folly build.
* Move build-linux-cmake-with-folly to nightly because it's mostly covered by build-linux-cmake-with-folly-coroutines

Intended follow-up:
* folly-lite build with tests
* Make the folly build caching more friendly+accurate by hashing the relevant Makefile parts and tagging whether debug or release. Not in this PR because then you wouldn't be able to see what changed in the folly build steps themselves.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14094

Test Plan: manual + CI

Reviewed By: mszeszko-meta

Differential Revision: D85864871

Pulled By: pdillinger

fbshipit-source-id: 50009b33422d5781074fcbbdf18089be9e36800d
---
 .github/actions/setup-folly/action.yml |  4 ++-
 .github/workflows/nightly.yml          | 33 +++++++++++++++++++++++++
 .github/workflows/pr-jobs.yml          | 18 --------------
 CMakeLists.txt                         | 23 ++++++++++++++++-
 Makefile                               | 34 +++++++++++++++++++++-----
 build_tools/build_detect_platform      |  6 +++--
 6 files changed, 90 insertions(+), 28 deletions(-)

diff --git a/.github/actions/setup-folly/action.yml b/.github/actions/setup-folly/action.yml
index 438d8e8e8183..af1d4b727be6 100644
--- a/.github/actions/setup-folly/action.yml
+++ b/.github/actions/setup-folly/action.yml
@@ -5,5 +5,7 @@ runs:
   - name: Checkout folly sources
     run: |
       make checkout_folly
-      echo "GETDEPS_USE_WGET=1" >> "$GITHUB_ENV"
+    shell: bash
+  - name: Install patchelf
+    run: apt-get update -y && apt-get install -y patchelf
     shell: bash
diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml
index 4ea230899737..1ca6c69818c4 100644
--- a/.github/workflows/nightly.yml
+++ b/.github/workflows/nightly.yml
@@ -58,6 +58,39 @@ jobs:
     - uses: "./.github/actions/build-folly"
     - run: LIB_MODE=static USE_CLANG=1 USE_FOLLY=1 COMPILE_WITH_UBSAN=1 COMPILE_WITH_ASAN=1 make -j32 check
     - uses: "./.github/actions/post-steps"
+  build-linux-cmake-with-folly:
+    if: ${{ github.repository_owner == 'facebook' }}
+    runs-on:
+      labels: 16-core-ubuntu
+    container:
+      image: ghcr.io/facebook/rocksdb_ubuntu:22.1
+      options: --shm-size=16gb
+    steps:
+    - uses: actions/checkout@v4.1.0
+    - uses: "./.github/actions/pre-steps"
+    - uses: "./.github/actions/setup-folly"
+    - uses: "./.github/actions/cache-folly"
+      id: cache-folly
+    - uses: "./.github/actions/build-folly"
+      with:
+        cache-hit: ${{ steps.cache-folly.outputs.cache-hit }}
+    - run: "(mkdir build && cd build && cmake -DUSE_FOLLY=1 -DWITH_GFLAGS=1 -DROCKSDB_BUILD_SHARED=0 .. && make V=1 -j20 && ctest -j20)"
+    - uses: "./.github/actions/post-steps"
+  build-linux-release-with-folly:
+    if: ${{ github.repository_owner == 'facebook' }}
+    runs-on:
+      labels: 16-core-ubuntu
+    container:
+      image: ghcr.io/facebook/rocksdb_ubuntu:22.1
+      options: --shm-size=16gb
+    steps:
+    - uses: actions/checkout@v4.1.0
+    - uses: "./.github/actions/pre-steps"
+    - uses: "./.github/actions/setup-folly"
+    - run: "DEBUG_LEVEL=0 make -j20 build_folly"
+    - run: "USE_FOLLY=1 LIB_MODE=static DEBUG_LEVEL=0 V=1 make -j20 release"
+    - run: "(mkdir build && cd build && cmake -DUSE_FOLLY=1 -DWITH_GFLAGS=1 -DROCKSDB_BUILD_SHARED=0 -DCMAKE_BUILD_TYPE=Release .. && make V=1 -j20 && ctest -j20)"
+    - uses: "./.github/actions/post-steps"
   build-linux-valgrind:
     if: ${{ github.repository_owner == 'facebook' }}
     runs-on:
diff --git a/.github/workflows/pr-jobs.yml b/.github/workflows/pr-jobs.yml
index 98c4cfa22e19..af5fe4689541 100644
--- a/.github/workflows/pr-jobs.yml
+++ b/.github/workflows/pr-jobs.yml
@@ -92,24 +92,6 @@ jobs:
         which javac && javac -version
         mkdir build && cd build && cmake -DJNI=1 -DWITH_GFLAGS=OFF .. -DCMAKE_C_COMPILER=x86_64-w64-mingw32-gcc -DCMAKE_CXX_COMPILER=x86_64-w64-mingw32-g++ -DCMAKE_SYSTEM_NAME=Windows && make -j4 rocksdb rocksdbjni
     - uses: "./.github/actions/post-steps"
-  build-linux-cmake-with-folly:
-    if: ${{ github.repository_owner == 'facebook' }}
-    runs-on:
-      labels: 16-core-ubuntu
-    container:
-      image: ghcr.io/facebook/rocksdb_ubuntu:22.1
-      options: --shm-size=16gb
-    steps:
-    - uses: actions/checkout@v4.1.0
-    - uses: "./.github/actions/pre-steps"
-    - uses: "./.github/actions/setup-folly"
-    - uses: "./.github/actions/cache-folly"
-      id: cache-folly
-    - uses: "./.github/actions/build-folly"
-      with:
-        cache-hit: ${{ steps.cache-folly.outputs.cache-hit }}
-    - run: "(mkdir build && cd build && cmake -DUSE_FOLLY=1 -DWITH_GFLAGS=1 -DROCKSDB_BUILD_SHARED=0 .. && make V=1 -j20 && ctest -j20)"
-    - uses: "./.github/actions/post-steps"
   build-linux-make-with-folly:
     if: ${{ github.repository_owner == 'facebook' }}
     runs-on:
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e0bbbc4c5cfc..8664c1c1d865 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -80,6 +80,7 @@ if(NOT CMAKE_BUILD_TYPE)
   set(CMAKE_BUILD_TYPE "${default_build_type}" CACHE STRING
     "Default BUILD_TYPE is ${default_build_type}" FORCE)
 endif()
+message(STATUS "CMAKE_BUILD_TYPE is set to ${CMAKE_BUILD_TYPE}")
 
 find_program(CCACHE_FOUND ccache)
 if(CCACHE_FOUND)
@@ -132,7 +133,9 @@ else()
     option(WITH_GFLAGS "build with GFlags" ON)
   endif()
   set(GFLAGS_LIB)
-  if(WITH_GFLAGS)
+  # Skip all gflags detection and setup when USE_FOLLY or USE_COROUTINES is enabled
+  # since Folly provides its own gflags (USE_COROUTINES automatically sets USE_FOLLY)
+  if(WITH_GFLAGS AND NOT USE_FOLLY AND NOT USE_COROUTINES)
     # Config with namespace available since gflags 2.2.2
     option(GFLAGS_USE_TARGET_NAMESPACE "Use gflags import target with namespace." ON)
     find_package(gflags CONFIG)
@@ -151,6 +154,9 @@ else()
     include_directories(${GFLAGS_INCLUDE_DIR})
     list(APPEND THIRDPARTY_LIBS ${GFLAGS_LIB})
     add_definitions(-DGFLAGS=1)
+  elseif(WITH_GFLAGS AND (USE_FOLLY OR USE_COROUTINES))
+    # Still set the DGFLAGS=1 define when using Folly since Folly provides gflags
+    add_definitions(-DGFLAGS=1)
   endif()
 
   if(WITH_SNAPPY)
@@ -642,6 +648,12 @@ if(USE_FOLLY)
     ${FOLLY_INST_PATH}/lib/cmake/folly/folly-targets.cmake)
 
     include(${FOLLY_INST_PATH}/lib/cmake/folly/folly-config.cmake)
+
+    # Fix gflags library name for debug builds
+    if(CMAKE_BUILD_TYPE STREQUAL "Debug")
+      set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-rpath=${GFLAGS_INST_PATH}/lib")
+      set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${GFLAGS_INST_PATH}/lib/libgflags_debug.so.2.2")
+    endif()
   endif()
 
   add_compile_definitions(USE_FOLLY FOLLY_NO_CONFIG HAVE_CXX11_ATOMIC)
@@ -1082,12 +1094,21 @@ if(USE_FOLLY_LITE)
     third-party/folly/folly/synchronization/DistributedMutex.cpp
     third-party/folly/folly/synchronization/ParkingLot.cpp)
   include_directories(${PROJECT_SOURCE_DIR}/third-party/folly)
+  # Add boost to the include path
   exec_program(python3 ${PROJECT_SOURCE_DIR}/third-party/folly ARGS
   build/fbcode_builder/getdeps.py show-source-dir boost OUTPUT_VARIABLE
   BOOST_SOURCE_PATH)
   exec_program(ls ARGS -d ${BOOST_SOURCE_PATH}/boost* OUTPUT_VARIABLE
   BOOST_INCLUDE_DIR)
   include_directories(${BOOST_INCLUDE_DIR})
+  # Add fmt to the include path
+  exec_program(python3 ${PROJECT_SOURCE_DIR}/third-party/folly ARGS
+  build/fbcode_builder/getdeps.py show-source-dir fmt OUTPUT_VARIABLE
+  FMT_SOURCE_PATH)
+  exec_program(ls ARGS -d ${FMT_SOURCE_PATH}/fmt* OUTPUT_VARIABLE
+  FMT_INCLUDE_DIR)
+  include_directories(${FMT_INCLUDE_DIR})
+
   add_definitions(-DUSE_FOLLY -DFOLLY_NO_CONFIG)
   list(APPEND THIRDPARTY_LIBS glog)
 endif()
diff --git a/Makefile b/Makefile
index 87adeef9e0eb..e08a35396eea 100644
--- a/Makefile
+++ b/Makefile
@@ -489,8 +489,14 @@ ifneq ($(strip $(FOLLY_PATH)),)
 
 	# Add -ldl at the end as gcc resolves a symbol in a library by searching only in libraries specified later
 	# in the command line
-	PLATFORM_LDFLAGS += $(FOLLY_PATH)/lib/libfolly.a $(BOOST_PATH)/lib/libboost_context.a $(BOOST_PATH)/lib/libboost_filesystem.a $(BOOST_PATH)/lib/libboost_atomic.a $(BOOST_PATH)/lib/libboost_program_options.a $(BOOST_PATH)/lib/libboost_regex.a $(BOOST_PATH)/lib/libboost_system.a $(BOOST_PATH)/lib/libboost_thread.a $(DBL_CONV_PATH)/lib/libdouble-conversion.a $(FMT_LIB_PATH)/libfmt.a $(GLOG_LIB_PATH)/libglog.so $(GFLAGS_PATH)/lib/libgflags.so.2.2 $(LIBEVENT_PATH)/lib/libevent-2.1.so -ldl
-	PLATFORM_LDFLAGS += -Wl,-rpath=$(GFLAGS_PATH)/lib -Wl,-rpath=$(GLOG_LIB_PATH) -Wl,-rpath=$(LIBEVENT_PATH)/lib -Wl,-rpath=$(LIBSODIUM_PATH)/lib -Wl,-rpath=$(LIBEVENT_PATH)/lib
+
+	PLATFORM_LDFLAGS += $(FOLLY_PATH)/lib/libfolly.a $(BOOST_PATH)/lib/libboost_context.a $(BOOST_PATH)/lib/libboost_filesystem.a $(BOOST_PATH)/lib/libboost_atomic.a $(BOOST_PATH)/lib/libboost_program_options.a $(BOOST_PATH)/lib/libboost_regex.a $(BOOST_PATH)/lib/libboost_system.a $(BOOST_PATH)/lib/libboost_thread.a $(DBL_CONV_PATH)/lib/libdouble-conversion.a $(LIBEVENT_PATH)/lib/libevent-2.1.so $(LIBSODIUM_PATH)/lib/libsodium.a -ldl
+ifneq ($(DEBUG_LEVEL),0)
+	PLATFORM_LDFLAGS += $(FMT_LIB_PATH)/libfmtd.a $(GLOG_LIB_PATH)/libglogd.so $(GFLAGS_PATH)/lib/libgflags_debug.so.2.2
+else
+	PLATFORM_LDFLAGS += $(FMT_LIB_PATH)/libfmt.a $(GLOG_LIB_PATH)/libglog.so $(GFLAGS_PATH)/lib/libgflags.so.2.2
+endif
+	PLATFORM_LDFLAGS += -Wl,-rpath=$(LIBEVENT_PATH)/lib -Wl,-rpath=$(GLOG_LIB_PATH) -Wl,-rpath=$(GFLAGS_PATH)/lib
 endif
 	PLATFORM_CCFLAGS += -DUSE_FOLLY -DFOLLY_NO_CONFIG
 	PLATFORM_CXXFLAGS += -DUSE_FOLLY -DFOLLY_NO_CONFIG
@@ -2506,7 +2512,7 @@ commit_prereq:
 	false # J=$(J) build_tools/precommit_checker.py unit clang_unit release clang_release tsan asan ubsan lite unit_non_shm
 	# $(MAKE) clean && $(MAKE) jclean && $(MAKE) rocksdbjava;
 
-FOLLY_COMMIT_HASH = b5543d6706270cd41f1140421cc13c0d7e695ae2
+FOLLY_COMMIT_HASH = abe68f7e917e8b7a0ee2fe066c972dc98fd35aa1
 
 # For public CI runs, checkout folly in a way that can build with RocksDB.
 # This is mostly intended as a test-only simulation of Meta-internal folly
@@ -2524,13 +2530,23 @@ checkout_folly:
 	perl -pi -e 's/(#include <atomic>)/$$1\n#include <cstring>/' third-party/folly/folly/lang/Exception.h
 	@# const mismatch
 	perl -pi -e 's/: environ/: (const char**)(environ)/' third-party/folly/folly/Subprocess.cpp
+	@# Use gnu.org mirrors to improve download speed (ftp.gnu.org is often super slow)
+	cd third-party/folly && perl -pi -e 's/ftp.gnu.org/ftpmirror.gnu.org/' `git grep -l ftp.gnu.org` README.md
 	@# NOTE: boost and fmt source will be needed for any build including `USE_FOLLY_LITE` builds as those depend on those headers
-	cd third-party/folly && $(PYTHON) build/fbcode_builder/getdeps.py fetch boost && $(PYTHON) build/fbcode_builder/getdeps.py fetch fmt
+	cd third-party/folly && GETDEPS_USE_WGET=1 $(PYTHON) build/fbcode_builder/getdeps.py fetch boost && GETDEPS_USE_WGET=1 $(PYTHON) build/fbcode_builder/getdeps.py fetch fmt
 
 CXX_M_FLAGS = $(filter -m%, $(CXXFLAGS))
 
+FOLLY_BUILD_FLAGS = --no-tests
+# NOTE: To avoid ODR violations, we must build folly in debug mode iff
+# building RocksDB in debug mode.
+ifneq ($(DEBUG_LEVEL),0)
+FOLLY_BUILD_FLAGS += --build-type Debug
+endif
+
+
 build_folly:
-	FOLLY_INST_PATH=`cd third-party/folly; $(PYTHON) build/fbcode_builder/getdeps.py show-inst-dir`; \
+	FOLLY_INST_PATH=`cd third-party/folly && $(PYTHON) build/fbcode_builder/getdeps.py show-inst-dir`; \
 	if [ "$$FOLLY_INST_PATH" ]; then \
 		rm -rf $${FOLLY_INST_PATH}/../../*; \
 	else \
@@ -2538,7 +2554,13 @@ build_folly:
 		false; \
 	fi
 	cd third-party/folly && \
-		CXXFLAGS=" $(CXX_M_FLAGS) -DHAVE_CXX11_ATOMIC " $(PYTHON) build/fbcode_builder/getdeps.py build --no-tests
+		CXXFLAGS=" $(CXX_M_FLAGS) -DHAVE_CXX11_ATOMIC " GETDEPS_USE_WGET=1 $(PYTHON) build/fbcode_builder/getdeps.py build $(FOLLY_BUILD_FLAGS)
+	@# In the folly build, glog and gflags are only built as dynamic libraries,
+	@# not static. This patchelf command is needed to reliably have the glog
+	@# library find its dependency gflags, because apparently the rpath of the
+	@# final binary is not used in resolving that transitive dependency.
+	FOLLY_INST_PATH=`cd third-party/folly && $(PYTHON) build/fbcode_builder/getdeps.py show-inst-dir`; \
+	cd "$$FOLLY_INST_PATH" && patchelf --add-rpath $$PWD/../gflags-*/lib ../glog-*/lib*/libglog*.so.*.*.*
 
 # ---------------------------------------------------------------------------
 #   Build size testing
diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform
index 15f1cc568d24..ff7ceeece8d8 100755
--- a/build_tools/build_detect_platform
+++ b/build_tools/build_detect_platform
@@ -311,7 +311,8 @@ EOF
 EOF
         then
           COMMON_FLAGS="$COMMON_FLAGS -DGFLAGS=1"
-          PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lgflags"
+          # Hack: don't link extra gflags assuming it comes with folly
+          [ "$USE_FOLLY" ] || PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lgflags"
         # check if namespace is gflags
         elif $CXX $PLATFORM_CXXFLAGS -x c++ - -o test.o 2>/dev/null << EOF
             #include <gflags/gflags.h>
@@ -320,7 +321,8 @@ EOF
 EOF
         then
           COMMON_FLAGS="$COMMON_FLAGS -DGFLAGS=1 -DGFLAGS_NAMESPACE=gflags"
-          PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lgflags"
+          # Hack: don't link extra gflags assuming it comes with folly
+          [ "$USE_FOLLY" ] || PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lgflags"
         # check if namespace is google
         elif $CXX $PLATFORM_CXXFLAGS -x c++ - -o test.o 2>/dev/null << EOF
             #include <gflags/gflags.h>

From befa6b8050c3d51ae2e9b0bc42ba2076357e9dc8 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Tue, 4 Nov 2025 19:47:42 -0800
Subject: [PATCH 364/500] Fix and check for potential ODR violations (#14096)

Summary:
... caused by public headers depending on build parameters (macro definitions). This change also adds a check under 'make check-headers' (already in CI) looking for potential future violations.

I've audited the uses of '#if' in public headers and either
* Eliminated them
* Systematically excluded them because they are intentional or similar (details in comments in check-public-header.sh
* Manually excluded them as being ODR-SAFE

In the case of ROCKSDB_USING_THREAD_STATUS, there was no good reason for this to appear in public headers so I've replaced it with a static bool ThreadStatus::kEnabled. I considered getting rid of the ability to disable this code but some relatively recent PRs have been submitted for fixing that case. I've added a release note and updated one of the CI jobs to use this build configuration. (I didn't want to combine with some jobs like no_compression and status_checked because the interaction might limit what is checked.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14096

Test Plan: manual 'make check-headers' + manual cmake as in new CI config + CI

Reviewed By: jaykorean

Differential Revision: D86241864

Pulled By: pdillinger

fbshipit-source-id: d16addc9e3480706b174a006720a4def0740bf2e
---
 .github/workflows/pr-jobs.yml                 |  4 +--
 Makefile                                      |  1 +
 build_tools/check-public-header.sh            | 31 +++++++++++++++++++
 db/db_impl/db_impl.cc                         |  4 +--
 db/db_test.cc                                 |  8 ++---
 db/listener_test.cc                           | 24 +++++---------
 include/rocksdb/compaction_filter.h           |  2 --
 include/rocksdb/db.h                          |  6 ----
 include/rocksdb/env.h                         |  2 +-
 include/rocksdb/file_system.h                 |  4 +--
 include/rocksdb/options.h                     |  2 +-
 include/rocksdb/sst_file_writer.h             |  8 +----
 include/rocksdb/thread_status.h               | 20 ++++--------
 include/rocksdb/utilities/env_mirror.h        |  4 +--
 .../utilities/ldb_cmd_execute_result.h        |  4 ---
 monitoring/thread_status_impl.cc              |  7 +++--
 monitoring/thread_status_updater.cc           |  4 +--
 monitoring/thread_status_updater.h            | 12 +++----
 monitoring/thread_status_updater_debug.cc     |  4 +--
 monitoring/thread_status_util.cc              | 15 ++++++---
 monitoring/thread_status_util.h               |  4 +--
 .../public_api_changes/odr_thread_status.md   |  1 +
 util/thread_list_test.cc                      |  4 +--
 util/thread_operation.h                       |  4 +--
 util/threadpool_imp.cc                        |  4 +--
 25 files changed, 94 insertions(+), 89 deletions(-)
 create mode 100755 build_tools/check-public-header.sh
 create mode 100644 unreleased_history/public_api_changes/odr_thread_status.md

diff --git a/.github/workflows/pr-jobs.yml b/.github/workflows/pr-jobs.yml
index af5fe4689541..23d71687d255 100644
--- a/.github/workflows/pr-jobs.yml
+++ b/.github/workflows/pr-jobs.yml
@@ -141,7 +141,7 @@ jobs:
         cache-hit: ${{ steps.cache-folly.outputs.cache-hit }}
     - run: "(mkdir build && cd build && cmake -DUSE_COROUTINES=1 -DWITH_GFLAGS=1 -DROCKSDB_BUILD_SHARED=0 .. && make V=1 -j20 && ctest -j20)"
     - uses: "./.github/actions/post-steps"
-  build-linux-cmake-with-benchmark:
+  build-linux-cmake-with-benchmark-no-thread-status:
     if: ${{ github.repository_owner == 'facebook' }}
     runs-on:
       labels: 16-core-ubuntu
@@ -151,7 +151,7 @@ jobs:
     steps:
     - uses: actions/checkout@v4.1.0
     - uses: "./.github/actions/pre-steps"
-    - run: mkdir build && cd build && cmake -DWITH_GFLAGS=1 -DWITH_BENCHMARK=1 .. && make V=1 -j20 && ctest -j20
+    - run: mkdir build && cd build && cmake -DWITH_GFLAGS=1 -DWITH_BENCHMARK=1 -DCMAKE_CXX_FLAGS=-DNROCKSDB_THREAD_STATUS .. && make V=1 -j20 && ctest -j20
     - uses: "./.github/actions/post-steps"
   build-linux-encrypted_env-no_compression:
     if: ${{ github.repository_owner == 'facebook' }}
diff --git a/Makefile b/Makefile
index e08a35396eea..6c053a6d0e4b 100644
--- a/Makefile
+++ b/Makefile
@@ -700,6 +700,7 @@ am__v_CCH_1 =
 %.h.pub: %.h # .h.pub not actually created, so re-checked on each invocation
 	$(AM_V_CCH) cd include/ && echo '#include "$(patsubst include/%,%,$<)"' | \
 	  $(CXX) -std=$(or $(ROCKSDB_CXX_STANDARD),c++20) -I. -DROCKSDB_NAMESPACE=42 -x c++ -c - -o /dev/null
+	build_tools/check-public-header.sh $<
 
 check-headers: $(HEADER_OK_FILES)
 
diff --git a/build_tools/check-public-header.sh b/build_tools/check-public-header.sh
new file mode 100755
index 000000000000..027758a801f0
--- /dev/null
+++ b/build_tools/check-public-header.sh
@@ -0,0 +1,31 @@
+#!/usr/bin/env bash
+# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved.
+#
+# Check for some simple mistakes in public headers (on the command line)
+# that should prevent commit or push
+
+BAD=""
+
+# Look for potential for ODR violations caused by public headers depending on
+# build parameters that could vary between RocksDB build and application build.
+# * Cases like LUA, ROCKSDB_NAMESPACE, and ROCKSDB_ASSERT_STATUS_CHECKED are
+#   intentional, hard to avoid. (We expect definitions to change and the user
+#   should also.)
+# * Cases like _WIN32, OS_WIN, and __cplusplus are essentially ODR-safe.
+# * Cases like
+#   #ifdef BLAH  // ODR-SAFE
+#   #undef BLAH
+#   #endif
+#   that should not cause ODR violations can be exempted with the ODR-SAFE
+#   marker recognized here.
+
+grep -nHE '^#if' -- "$@" | grep -vE 'ROCKSDB_NAMESPACE|ROCKSDB_ASSERT_STATUS_CHECKED|LUA|_WIN32|OS_WIN|ODR-SAFE|__cplusplus|ROCKSDB_DLL|ROCKSDB_LIBRARY_EXPORTS'
+if [ "$?" != "1" ]; then
+  echo "^^^^^ #if in public API could cause an ODR violation."
+  echo "      Add // ODR-SAFE if verified safe."
+  BAD=1
+fi
+
+if [ "$BAD" ]; then
+  exit 1
+fi
diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc
index 55bf299c3bec..f5ade39fc89e 100644
--- a/db/db_impl/db_impl.cc
+++ b/db/db_impl/db_impl.cc
@@ -5506,7 +5506,7 @@ Status DBImpl::RenameTempFileToOptionsFile(const std::string& file_name,
   return s;
 }
 
-#ifdef ROCKSDB_USING_THREAD_STATUS
+#ifndef NROCKSDB_THREAD_STATUS
 
 void DBImpl::NewThreadStatusCfInfo(ColumnFamilyData* cfd) const {
   if (immutable_db_options_.enable_thread_tracking) {
@@ -5533,7 +5533,7 @@ void DBImpl::NewThreadStatusCfInfo(ColumnFamilyData* /*cfd*/) const {}
 void DBImpl::EraseThreadStatusCfInfo(ColumnFamilyData* /*cfd*/) const {}
 
 void DBImpl::EraseThreadStatusDbInfo() const {}
-#endif  // ROCKSDB_USING_THREAD_STATUS
+#endif  // !NROCKSDB_THREAD_STATUS
 
 //
 // A global method that can dump out the build version
diff --git a/db/db_test.cc b/db/db_test.cc
index 1919be904c23..ab8757291834 100644
--- a/db/db_test.cc
+++ b/db/db_test.cc
@@ -4934,7 +4934,7 @@ TEST_F(DBTest, DynamicMemtableOptions) {
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
 }
 
-#ifdef ROCKSDB_USING_THREAD_STATUS
+#ifndef NROCKSDB_THREAD_STATUS
 namespace {
 bool VerifyOperationCount(Env* env, ThreadStatus::OperationType op_type,
                           int expected_count) {
@@ -5392,7 +5392,7 @@ TEST_P(DBTestWithParam, PreShutdownCompactionMiddle) {
   ASSERT_EQ(operation_count[ThreadStatus::OP_COMPACTION], 0);
 }
 
-#endif  // ROCKSDB_USING_THREAD_STATUS
+#endif  // !NROCKSDB_THREAD_STATUS
 
 TEST_F(DBTest, FlushOnDestroy) {
   WriteOptions wo;
@@ -6127,9 +6127,9 @@ TEST_F(DBTest, MergeTestTime) {
 
   ASSERT_EQ(1, count);
   ASSERT_EQ(4000000, TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME));
-#ifdef ROCKSDB_USING_THREAD_STATUS
+#ifndef NROCKSDB_THREAD_STATUS
   ASSERT_GT(TestGetTickerCount(options, FLUSH_WRITE_BYTES), 0);
-#endif  // ROCKSDB_USING_THREAD_STATUS
+#endif  // !NROCKSDB_THREAD_STATUS
 }
 
 TEST_P(DBTestWithParam, MergeCompactionTimeTest) {
diff --git a/db/listener_test.cc b/db/listener_test.cc
index 033e86d2023d..f587717c6f26 100644
--- a/db/listener_test.cc
+++ b/db/listener_test.cc
@@ -163,9 +163,7 @@ TEST_F(EventListenerTest, OnSingleDBCompactionTest) {
   options.max_bytes_for_level_base = options.target_file_size_base * 2;
   options.max_bytes_for_level_multiplier = 2;
   options.compression = kNoCompression;
-#ifdef ROCKSDB_USING_THREAD_STATUS
-  options.enable_thread_tracking = true;
-#endif  // ROCKSDB_USING_THREAD_STATUS
+  options.enable_thread_tracking = ThreadStatus::kEnabled;
   options.level0_file_num_compaction_trigger = kNumL0Files;
   options.table_properties_collector_factories.push_back(
       std::make_shared<TestPropertiesCollectorFactory>());
@@ -229,7 +227,7 @@ class TestFlushListener : public EventListener {
     ASSERT_EQ(info.file_checksum, kUnknownFileChecksum);
     ASSERT_EQ(info.file_checksum_func_name, kUnknownFileChecksumFuncName);
 
-#ifdef ROCKSDB_USING_THREAD_STATUS
+#ifndef NROCKSDB_THREAD_STATUS
     // Verify the id of the current thread that created this table
     // file matches the id of any active flush or compaction thread.
     uint64_t thread_id = env_->GetThreadID();
@@ -246,7 +244,7 @@ class TestFlushListener : public EventListener {
       }
     }
     ASSERT_TRUE(found_match);
-#endif  // ROCKSDB_USING_THREAD_STATUS
+#endif  // !NROCKSDB_THREAD_STATUS
   }
 
   void OnFlushCompleted(DB* db, const FlushJobInfo& info) override {
@@ -310,9 +308,7 @@ TEST_F(EventListenerTest, OnSingleDBFlushTest) {
   Options options;
   options.env = CurrentOptions().env;
   options.write_buffer_size = k110KB;
-#ifdef ROCKSDB_USING_THREAD_STATUS
-  options.enable_thread_tracking = true;
-#endif  // ROCKSDB_USING_THREAD_STATUS
+  options.enable_thread_tracking = ThreadStatus::kEnabled;
   TestFlushListener* listener = new TestFlushListener(options.env, this);
   options.listeners.emplace_back(listener);
   std::vector<std::string> cf_names = {"pikachu",  "ilya",     "muromec",
@@ -357,9 +353,7 @@ TEST_F(EventListenerTest, MultiCF) {
     Options options;
     options.env = CurrentOptions().env;
     options.write_buffer_size = k110KB;
-#ifdef ROCKSDB_USING_THREAD_STATUS
-    options.enable_thread_tracking = true;
-#endif  // ROCKSDB_USING_THREAD_STATUS
+    options.enable_thread_tracking = ThreadStatus::kEnabled;
     options.atomic_flush = atomic_flush;
     options.create_if_missing = true;
     DestroyAndReopen(options);
@@ -407,9 +401,7 @@ TEST_F(EventListenerTest, MultiCF) {
 TEST_F(EventListenerTest, MultiDBMultiListeners) {
   Options options;
   options.env = CurrentOptions().env;
-#ifdef ROCKSDB_USING_THREAD_STATUS
-  options.enable_thread_tracking = true;
-#endif  // ROCKSDB_USING_THREAD_STATUS
+  options.enable_thread_tracking = ThreadStatus::kEnabled;
   options.table_properties_collector_factories.push_back(
       std::make_shared<TestPropertiesCollectorFactory>());
   std::vector<TestFlushListener*> listeners;
@@ -497,9 +489,7 @@ TEST_F(EventListenerTest, MultiDBMultiListeners) {
 TEST_F(EventListenerTest, DisableBGCompaction) {
   Options options;
   options.env = CurrentOptions().env;
-#ifdef ROCKSDB_USING_THREAD_STATUS
-  options.enable_thread_tracking = true;
-#endif  // ROCKSDB_USING_THREAD_STATUS
+  options.enable_thread_tracking = ThreadStatus::kEnabled;
   TestFlushListener* listener = new TestFlushListener(options.env, this);
   const int kCompactionTrigger = 1;
   const int kSlowdownTrigger = 5;
diff --git a/include/rocksdb/compaction_filter.h b/include/rocksdb/compaction_filter.h
index 66f2f390e7d1..68a7116de9bd 100644
--- a/include/rocksdb/compaction_filter.h
+++ b/include/rocksdb/compaction_filter.h
@@ -284,9 +284,7 @@ class CompactionFilter : public Customizable {
       std::string* new_value,
       std::vector<std::pair<std::string, std::string>>* /* new_columns */,
       std::string* skip_until) const {
-#ifdef NDEBUG
     (void)existing_columns;
-#endif
 
     assert(!existing_value || !existing_columns);
     assert(value_type == ValueType::kWideColumnEntity || existing_value);
diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h
index ff62188795d1..0b9c506e5ce3 100644
--- a/include/rocksdb/db.h
+++ b/include/rocksdb/db.h
@@ -35,12 +35,6 @@
 #include "rocksdb/version.h"
 #include "rocksdb/wide_columns.h"
 
-#if defined(__GNUC__) || defined(__clang__)
-#define ROCKSDB_DEPRECATED_FUNC __attribute__((__deprecated__))
-#elif _WIN32
-#define ROCKSDB_DEPRECATED_FUNC __declspec(deprecated)
-#endif
-
 namespace ROCKSDB_NAMESPACE {
 
 struct ColumnFamilyOptions;
diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h
index 03a64b968982..6dbfa7537bac 100644
--- a/include/rocksdb/env.h
+++ b/include/rocksdb/env.h
@@ -39,7 +39,7 @@
 #undef LoadLibrary
 #endif
 
-#if defined(__GNUC__) || defined(__clang__)
+#if defined(__GNUC__) || defined(__clang__)  // ODR-SAFE (essentially)
 #define ROCKSDB_PRINTF_FORMAT_ATTR(format_param, dots_param) \
   __attribute__((__format__(__printf__, format_param, dots_param)))
 #else
diff --git a/include/rocksdb/file_system.h b/include/rocksdb/file_system.h
index b1b0a17a97c0..1a08d43041bb 100644
--- a/include/rocksdb/file_system.h
+++ b/include/rocksdb/file_system.h
@@ -558,7 +558,7 @@ class FileSystem : public Customizable {
   }
 
 // This seems to clash with a macro on Windows, so #undef it here
-#ifdef DeleteFile
+#ifdef DeleteFile  // ODR-SAFE
 #undef DeleteFile
 #endif
   // Delete the named file.
@@ -719,7 +719,7 @@ class FileSystem : public Customizable {
       const ImmutableDBOptions& db_options) const;
 
 // This seems to clash with a macro on Windows, so #undef it here
-#ifdef GetFreeSpace
+#ifdef GetFreeSpace  // ODR-SAFE
 #undef GetFreeSpace
 #endif
 
diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index aeca38ec2487..e932af5628c7 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -32,7 +32,7 @@
 #include "rocksdb/version.h"
 #include "rocksdb/write_buffer_manager.h"
 
-#ifdef max
+#ifdef max  // ODR-SAFE
 #undef max
 #endif
 
diff --git a/include/rocksdb/sst_file_writer.h b/include/rocksdb/sst_file_writer.h
index d893cb1e2afb..6da739cf38b2 100644
--- a/include/rocksdb/sst_file_writer.h
+++ b/include/rocksdb/sst_file_writer.h
@@ -15,12 +15,6 @@
 #include "rocksdb/types.h"
 #include "rocksdb/wide_columns.h"
 
-#if defined(__GNUC__) || defined(__clang__)
-#define ROCKSDB_DEPRECATED_FUNC __attribute__((__deprecated__))
-#elif _WIN32
-#define ROCKSDB_DEPRECATED_FUNC __declspec(deprecated)
-#endif
-
 namespace ROCKSDB_NAMESPACE {
 
 class Comparator;
@@ -117,7 +111,7 @@ class SstFileWriter {
   // REQUIRES: user_key is after any previously added point (Put/Merge/Delete)
   //           key according to the comparator.
   // REQUIRES: comparator is *not* timestamp-aware.
-  ROCKSDB_DEPRECATED_FUNC Status Add(const Slice& user_key, const Slice& value);
+  [[deprecated]] Status Add(const Slice& user_key, const Slice& value);
 
   // Add a Put key with value to currently opened file
   // REQUIRES: user_key is after any previously added point (Put/Merge/Delete)
diff --git a/include/rocksdb/thread_status.h b/include/rocksdb/thread_status.h
index 3c4bbe9a01ad..07c872c0e9b5 100644
--- a/include/rocksdb/thread_status.h
+++ b/include/rocksdb/thread_status.h
@@ -22,24 +22,16 @@
 
 #include "rocksdb/rocksdb_namespace.h"
 
-#if !defined(NROCKSDB_THREAD_STATUS)
-#define ROCKSDB_USING_THREAD_STATUS
-#endif
-
 namespace ROCKSDB_NAMESPACE {
 
-// TODO(yhchiang): remove this function once c++14 is available
-//                 as std::max will be able to cover this.
-// Current MS compiler does not support constexpr
-template <int A, int B>
-struct constexpr_max {
-  static const int result = (A > B) ? A : B;
-};
-
 // A structure that describes the current status of a thread.
 // The status of active threads can be fetched using
 // ROCKSDB_NAMESPACE::GetThreadList().
 struct ThreadStatus {
+  // Whether RocksDB was built with !NROCKSDB_THREAD_STATUS for
+  // ROCKSDB_NAMESPACE::GetThreadList() to be supported.
+  static const bool kEnabled;
+
   // The type of a thread.
   enum ThreadType : int {
     HIGH_PRIORITY = 0,  // RocksDB BG thread in high-pri thread pool
@@ -102,8 +94,8 @@ struct ThreadStatus {
 
   // The maximum number of properties of an operation.
   // This number should be set to the biggest NUM_XXX_PROPERTIES.
-  static const int kNumOperationProperties =
-      constexpr_max<NUM_COMPACTION_PROPERTIES, NUM_FLUSH_PROPERTIES>::result;
+  static constexpr int kNumOperationProperties =
+      std::max(int{NUM_COMPACTION_PROPERTIES}, int{NUM_FLUSH_PROPERTIES});
 
   // The type used to refer to a thread state.
   // A state describes lower-level action of a thread
diff --git a/include/rocksdb/utilities/env_mirror.h b/include/rocksdb/utilities/env_mirror.h
index 40c04095bde9..68cce77dad4e 100644
--- a/include/rocksdb/utilities/env_mirror.h
+++ b/include/rocksdb/utilities/env_mirror.h
@@ -68,7 +68,7 @@ class EnvMirror : public EnvWrapper {
     assert(as == bs);
     return as;
   }
-#if defined(_MSC_VER)
+#if defined(_MSC_VER)  // ODR-SAFE
 #pragma warning(push)
 // logical operation on address of string constant
 #pragma warning(disable : 4130)
@@ -87,7 +87,7 @@ class EnvMirror : public EnvWrapper {
     *r = ar;
     return as;
   }
-#if defined(_MSC_VER)
+#if defined(_MSC_VER)  // ODR-SAFE
 #pragma warning(pop)
 #endif
   Status DeleteFile(const std::string& f) override {
diff --git a/include/rocksdb/utilities/ldb_cmd_execute_result.h b/include/rocksdb/utilities/ldb_cmd_execute_result.h
index 57bac334682b..2af07eeba55f 100644
--- a/include/rocksdb/utilities/ldb_cmd_execute_result.h
+++ b/include/rocksdb/utilities/ldb_cmd_execute_result.h
@@ -9,10 +9,6 @@
 
 #include "rocksdb/rocksdb_namespace.h"
 
-#ifdef FAILED
-#undef FAILED
-#endif
-
 namespace ROCKSDB_NAMESPACE {
 
 class LDBCommandExecuteResult {
diff --git a/monitoring/thread_status_impl.cc b/monitoring/thread_status_impl.cc
index 153753682cfa..2b3041c4c61d 100644
--- a/monitoring/thread_status_impl.cc
+++ b/monitoring/thread_status_impl.cc
@@ -13,7 +13,9 @@
 
 namespace ROCKSDB_NAMESPACE {
 
-#ifdef ROCKSDB_USING_THREAD_STATUS
+#ifndef NROCKSDB_THREAD_STATUS
+const bool ThreadStatus::kEnabled = true;
+
 std::string ThreadStatus::GetThreadTypeName(
     ThreadStatus::ThreadType thread_type) {
   switch (thread_type) {
@@ -117,6 +119,7 @@ std::map<std::string, uint64_t> ThreadStatus::InterpretOperationProperties(
 }
 
 #else
+const bool ThreadStatus::kEnabled = false;
 
 std::string ThreadStatus::GetThreadTypeName(
     ThreadStatus::ThreadType /*thread_type*/) {
@@ -159,5 +162,5 @@ std::map<std::string, uint64_t> ThreadStatus::InterpretOperationProperties(
   return std::map<std::string, uint64_t>();
 }
 
-#endif  // ROCKSDB_USING_THREAD_STATUS
+#endif  // !NROCKSDB_THREAD_STATUS
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/monitoring/thread_status_updater.cc b/monitoring/thread_status_updater.cc
index 37fcef62b0f9..7df2b2c6fa4b 100644
--- a/monitoring/thread_status_updater.cc
+++ b/monitoring/thread_status_updater.cc
@@ -14,7 +14,7 @@
 
 namespace ROCKSDB_NAMESPACE {
 
-#ifdef ROCKSDB_USING_THREAD_STATUS
+#ifndef NROCKSDB_THREAD_STATUS
 
 thread_local ThreadStatusData* ThreadStatusUpdater::thread_status_data_ =
     nullptr;
@@ -324,5 +324,5 @@ void ThreadStatusUpdater::SetThreadOperationProperty(int /*i*/,
 void ThreadStatusUpdater::IncreaseThreadOperationProperty(int /*i*/,
                                                           uint64_t /*delta*/) {}
 
-#endif  // ROCKSDB_USING_THREAD_STATUS
+#endif  // !NROCKSDB_THREAD_STATUS
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/monitoring/thread_status_updater.h b/monitoring/thread_status_updater.h
index 696063cb46cd..6d3bc74c4510 100644
--- a/monitoring/thread_status_updater.h
+++ b/monitoring/thread_status_updater.h
@@ -47,7 +47,7 @@ class ColumnFamilyHandle;
 
 // The structure that keeps constant information about a column family.
 struct ConstantColumnFamilyInfo {
-#ifdef ROCKSDB_USING_THREAD_STATUS
+#ifndef NROCKSDB_THREAD_STATUS
  public:
   ConstantColumnFamilyInfo(const void* _db_key, const std::string& _db_name,
                            const std::string& _cf_name)
@@ -55,13 +55,13 @@ struct ConstantColumnFamilyInfo {
   const void* db_key;
   const std::string db_name;
   const std::string cf_name;
-#endif  // ROCKSDB_USING_THREAD_STATUS
+#endif  // !NROCKSDB_THREAD_STATUS
 };
 
 // the internal data-structure that is used to reflect the current
 // status of a thread using a set of atomic pointers.
 struct ThreadStatusData {
-#ifdef ROCKSDB_USING_THREAD_STATUS
+#ifndef NROCKSDB_THREAD_STATUS
   explicit ThreadStatusData() {
     enable_tracking.store(false);
     thread_id.store(0);
@@ -86,7 +86,7 @@ struct ThreadStatusData {
   std::atomic<ThreadStatus::OperationStage> operation_stage;
   std::atomic<uint64_t> op_properties[ThreadStatus::kNumOperationProperties];
   std::atomic<ThreadStatus::StateType> state_type;
-#endif  // ROCKSDB_USING_THREAD_STATUS
+#endif  // !NROCKSDB_THREAD_STATUS
 };
 
 // The class that stores and updates the status of the current thread
@@ -190,7 +190,7 @@ class ThreadStatusUpdater {
       const std::vector<ColumnFamilyHandle*>& handles, bool check_exist);
 
  protected:
-#ifdef ROCKSDB_USING_THREAD_STATUS
+#ifndef NROCKSDB_THREAD_STATUS
   // The thread-local variable for storing thread status.
   static thread_local ThreadStatusData* thread_status_data_;
 
@@ -220,7 +220,7 @@ class ThreadStatusUpdater {
 
 #else
   static ThreadStatusData* thread_status_data_;
-#endif  // ROCKSDB_USING_THREAD_STATUS
+#endif  // !NROCKSDB_THREAD_STATUS
 };
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/monitoring/thread_status_updater_debug.cc b/monitoring/thread_status_updater_debug.cc
index 464c23bbaa89..39b3ef2d0167 100644
--- a/monitoring/thread_status_updater_debug.cc
+++ b/monitoring/thread_status_updater_debug.cc
@@ -12,7 +12,7 @@
 namespace ROCKSDB_NAMESPACE {
 
 #ifndef NDEBUG
-#ifdef ROCKSDB_USING_THREAD_STATUS
+#ifndef NROCKSDB_THREAD_STATUS
 void ThreadStatusUpdater::TEST_VerifyColumnFamilyInfoMap(
     const std::vector<ColumnFamilyHandle*>& handles, bool check_exist) {
   std::unique_lock<std::mutex> lock(thread_list_mutex_);
@@ -37,7 +37,7 @@ void ThreadStatusUpdater::TEST_VerifyColumnFamilyInfoMap(
     const std::vector<ColumnFamilyHandle*>& /*handles*/, bool /*check_exist*/) {
 }
 
-#endif  // ROCKSDB_USING_THREAD_STATUS
+#endif  // !NROCKSDB_THREAD_STATUS
 #endif  // !NDEBUG
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/monitoring/thread_status_util.cc b/monitoring/thread_status_util.cc
index d61bcba1ce55..d84f46a681bd 100644
--- a/monitoring/thread_status_util.cc
+++ b/monitoring/thread_status_util.cc
@@ -11,7 +11,7 @@
 
 namespace ROCKSDB_NAMESPACE {
 
-#ifdef ROCKSDB_USING_THREAD_STATUS
+#ifndef NROCKSDB_THREAD_STATUS
 thread_local ThreadStatusUpdater*
     ThreadStatusUtil::thread_updater_local_cache_ = nullptr;
 thread_local bool ThreadStatusUtil::thread_updater_initialized_ = false;
@@ -171,9 +171,10 @@ AutoThreadOperationStageUpdater::~AutoThreadOperationStageUpdater() {
 ThreadStatusUpdater* ThreadStatusUtil::thread_updater_local_cache_ = nullptr;
 bool ThreadStatusUtil::thread_updater_initialized_ = false;
 
-bool ThreadStatusUtil::MaybeInitThreadLocalUpdater(const Env* /*env*/) {
-  return false;
-}
+void ThreadStatusUtil::RegisterThread(
+    const Env* /*env*/, ThreadStatus::ThreadType /*thread_type*/) {}
+
+void ThreadStatusUtil::UnregisterThread() {}
 
 void ThreadStatusUtil::SetEnableTracking(bool /*enable_tracking*/) {}
 
@@ -204,11 +205,15 @@ void ThreadStatusUtil::EraseDatabaseInfo(const DB* /*db*/) {}
 
 void ThreadStatusUtil::ResetThreadStatus() {}
 
+bool ThreadStatusUtil::MaybeInitThreadLocalUpdater(const Env* /*env*/) {
+  return false;
+}
+
 AutoThreadOperationStageUpdater::AutoThreadOperationStageUpdater(
     ThreadStatus::OperationStage /*stage*/) {}
 
 AutoThreadOperationStageUpdater::~AutoThreadOperationStageUpdater() {}
 
-#endif  // ROCKSDB_USING_THREAD_STATUS
+#endif  // !NROCKSDB_THREAD_STATUS
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/monitoring/thread_status_util.h b/monitoring/thread_status_util.h
index df148a039565..082dbd7324b3 100644
--- a/monitoring/thread_status_util.h
+++ b/monitoring/thread_status_util.h
@@ -90,7 +90,7 @@ class ThreadStatusUtil {
   // a non-null pointer.
   static bool MaybeInitThreadLocalUpdater(const Env* env);
 
-#ifdef ROCKSDB_USING_THREAD_STATUS
+#ifndef NROCKSDB_THREAD_STATUS
   // A boolean flag indicating whether thread_updater_local_cache_
   // is initialized.  It is set to true when an Env uses any
   // ThreadStatusUtil functions using the current thread other
@@ -130,7 +130,7 @@ class AutoThreadOperationStageUpdater {
   explicit AutoThreadOperationStageUpdater(ThreadStatus::OperationStage stage);
   ~AutoThreadOperationStageUpdater();
 
-#ifdef ROCKSDB_USING_THREAD_STATUS
+#ifndef NROCKSDB_THREAD_STATUS
  private:
   ThreadStatus::OperationStage prev_stage_;
 #endif
diff --git a/unreleased_history/public_api_changes/odr_thread_status.md b/unreleased_history/public_api_changes/odr_thread_status.md
new file mode 100644
index 000000000000..c3af9367a55d
--- /dev/null
+++ b/unreleased_history/public_api_changes/odr_thread_status.md
@@ -0,0 +1 @@
+* To reduce risk of ODR violations or similar, `ROCKSDB_USING_THREAD_STATUS` has been removed from public headers and replaced with static `const bool ThreadStatus::kEnabled`. Some other uses of conditional compilation have been removed from public API headers to reduce risk of ODR violations or other issues.
diff --git a/util/thread_list_test.cc b/util/thread_list_test.cc
index 4899b98ac4d9..76170768a146 100644
--- a/util/thread_list_test.cc
+++ b/util/thread_list_test.cc
@@ -10,7 +10,7 @@
 #include "rocksdb/db.h"
 #include "test_util/testharness.h"
 
-#ifdef ROCKSDB_USING_THREAD_STATUS
+#ifndef NROCKSDB_THREAD_STATUS
 
 namespace ROCKSDB_NAMESPACE {
 
@@ -359,4 +359,4 @@ int main(int argc, char** argv) {
   return 0;
 }
 
-#endif  // ROCKSDB_USING_THREAD_STATUS
+#endif  // !NROCKSDB_THREAD_STATUS
diff --git a/util/thread_operation.h b/util/thread_operation.h
index 84911ddc82ff..91c26f99079b 100644
--- a/util/thread_operation.h
+++ b/util/thread_operation.h
@@ -19,7 +19,7 @@
 
 namespace ROCKSDB_NAMESPACE {
 
-#ifdef ROCKSDB_USING_THREAD_STATUS
+#ifndef NROCKSDB_THREAD_STATUS
 
 // The structure that describes a major thread operation.
 struct OperationInfo {
@@ -120,5 +120,5 @@ struct OperationInfo {};
 
 struct StateInfo {};
 
-#endif  // ROCKSDB_USING_THREAD_STATUS
+#endif  // !NROCKSDB_THREAD_STATUS
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/util/threadpool_imp.cc b/util/threadpool_imp.cc
index 8397c4b39072..901de90555f4 100644
--- a/util/threadpool_imp.cc
+++ b/util/threadpool_imp.cc
@@ -324,7 +324,7 @@ void ThreadPoolImpl::Impl::BGThreadWrapper(void* arg) {
   BGThreadMetadata* meta = static_cast<BGThreadMetadata*>(arg);
   size_t thread_id = meta->thread_id_;
   ThreadPoolImpl::Impl* tp = meta->thread_pool_;
-#ifdef ROCKSDB_USING_THREAD_STATUS
+#ifndef NROCKSDB_THREAD_STATUS
   // initialize it because compiler isn't good enough to see we don't use it
   // uninitialized
   ThreadStatus::ThreadType thread_type = ThreadStatus::NUM_THREAD_TYPES;
@@ -350,7 +350,7 @@ void ThreadPoolImpl::Impl::BGThreadWrapper(void* arg) {
 #endif
   delete meta;
   tp->BGThread(thread_id);
-#ifdef ROCKSDB_USING_THREAD_STATUS
+#ifndef NROCKSDB_THREAD_STATUS
   ThreadStatusUtil::UnregisterThread();
 #endif
   return;

From d56da8c112b4e6968fd79ce2bf15e6435df40656 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Wed, 5 Nov 2025 11:39:21 -0800
Subject: [PATCH 365/500] More folly build updates (#14099)

Summary:
* Fix nightly build-linux-cmake-with-folly-lite-no-test for real this time
  with correct include directory. (CMakeLists.txt)
* Add test runs to that build (and rename)
* Improve folly build caching with a folly.mk file with most of the relevant
  parts of Makefile that contribute to the checkout_folly and
  build_folly builds. This reduces the risk of false passing of CI job with
  cache folly build. This caching is still only for folly debug builds, (which
  is probably OK with just a single nightly build relying on release folly
  build, which also serves as a rough canary against false passing
  because of caching).
* Use `make VERBOSE=1` after cmake calls for detailed output

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14099

Test Plan:
temporary CI change to put the relevant parts in pr-jobs,
then back to homes including in nightly

Reviewed By: mszeszko-meta

Differential Revision: D86243363

Pulled By: pdillinger

fbshipit-source-id: f7975fa190ef45195c6d0b74417f7886e551516a
---
 .github/actions/cache-folly/action.yml |  21 ++--
 .github/workflows/nightly.yml          |   8 +-
 .github/workflows/pr-jobs.yml          |   6 +-
 CMakeLists.txt                         |   2 +-
 Makefile                               | 145 +-----------------------
 folly.mk                               | 148 +++++++++++++++++++++++++
 6 files changed, 171 insertions(+), 159 deletions(-)
 create mode 100644 folly.mk

diff --git a/.github/actions/cache-folly/action.yml b/.github/actions/cache-folly/action.yml
index 7ec394eb2391..f54a5a9a5a2e 100644
--- a/.github/actions/cache-folly/action.yml
+++ b/.github/actions/cache-folly/action.yml
@@ -7,20 +7,27 @@ outputs:
 runs:
   using: composite
   steps:
-  - name: Extract FOLLY_COMMIT_HASH from Makefile
+  - name: Extract FOLLY_MK_HASH
     id: extract-folly-hash
     shell: bash
     run: |
-      FOLLY_COMMIT_HASH=$(grep '^FOLLY_COMMIT_HASH' Makefile | awk '{print $3}')
-      echo "hash=$FOLLY_COMMIT_HASH" >> $GITHUB_OUTPUT
+      FOLLY_MK_HASH=$(md5sum folly.mk | cut -d' ' -f1)
+      echo "hash=$FOLLY_MK_HASH" >> $GITHUB_OUTPUT
+  - name: Extract FOLLY_INSTALL_DIR
+    id: extract-folly-install-dir
+    shell: bash
+    run: |
+      FOLLY_INSTALL_DIR=$(cd third-party/folly && python3 build/fbcode_builder/getdeps.py show-inst-dir)
+      echo "dir=$(echo $FOLLY_INSTALL_DIR | sed 's|installed/folly|installed|')" >> $GITHUB_OUTPUT
   - name: Cache folly build
     id: cache-folly-build
     uses: actions/cache@v4
     with:
       # Cache the folly build directory
-      path: /tmp/fbcode_builder_getdeps-Z__wZrocksdbZrocksdbZthird-partyZfollyZbuildZfbcode_builder-root/installed
+      path: ${{ steps.extract-folly-install-dir.outputs.dir }}
       # Key is based on:
       # - OS and architecture
-      # - The specific folly commit hash from Makefile
-      # - The container image version to account for different compiler/library versions
-      key: folly-build-${{ runner.os }}-${{ runner.arch }}-${{ steps.extract-folly-hash.outputs.hash }}-ubuntu22.1-v1
+      # - The docker image, which may not always be specified/known
+      # - Hash of folly.mk, which includes the folly repository commit hash
+      # NOTE: this is still only intended for DEBUG folly builds
+      key: folly-build-${{ runner.os }}-${{ runner.arch }}-${{ github.job_container.image }}-${{ steps.extract-folly-hash.outputs.hash }}
diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml
index 1ca6c69818c4..f8583e44244a 100644
--- a/.github/workflows/nightly.yml
+++ b/.github/workflows/nightly.yml
@@ -74,7 +74,7 @@ jobs:
     - uses: "./.github/actions/build-folly"
       with:
         cache-hit: ${{ steps.cache-folly.outputs.cache-hit }}
-    - run: "(mkdir build && cd build && cmake -DUSE_FOLLY=1 -DWITH_GFLAGS=1 -DROCKSDB_BUILD_SHARED=0 .. && make V=1 -j20 && ctest -j20)"
+    - run: "(mkdir build && cd build && cmake -DUSE_FOLLY=1 -DWITH_GFLAGS=1 -DROCKSDB_BUILD_SHARED=0 .. && make VERBOSE=1 -j20 && ctest -j20)"
     - uses: "./.github/actions/post-steps"
   build-linux-release-with-folly:
     if: ${{ github.repository_owner == 'facebook' }}
@@ -89,7 +89,7 @@ jobs:
     - uses: "./.github/actions/setup-folly"
     - run: "DEBUG_LEVEL=0 make -j20 build_folly"
     - run: "USE_FOLLY=1 LIB_MODE=static DEBUG_LEVEL=0 V=1 make -j20 release"
-    - run: "(mkdir build && cd build && cmake -DUSE_FOLLY=1 -DWITH_GFLAGS=1 -DROCKSDB_BUILD_SHARED=0 -DCMAKE_BUILD_TYPE=Release .. && make V=1 -j20 && ctest -j20)"
+    - run: "(mkdir build && cd build && cmake -DUSE_FOLLY=1 -DWITH_GFLAGS=1 -DROCKSDB_BUILD_SHARED=0 -DCMAKE_BUILD_TYPE=Release .. && make VERBOSE=1 -j20 && ctest -j20)"
     - uses: "./.github/actions/post-steps"
   build-linux-valgrind:
     if: ${{ github.repository_owner == 'facebook' }}
@@ -150,7 +150,7 @@ jobs:
     - name: Build fuzzers
       run: cd fuzz && make sst_file_writer_fuzzer db_fuzzer db_map_fuzzer
     - uses: "./.github/actions/post-steps"
-  build-linux-cmake-with-folly-lite-no-test:
+  build-linux-cmake-with-folly-lite:
     if: ${{ github.repository_owner == 'facebook' }}
     runs-on:
       labels: 16-core-ubuntu
@@ -161,5 +161,5 @@ jobs:
     - uses: actions/checkout@v4.1.0
     - uses: "./.github/actions/pre-steps"
     - uses: "./.github/actions/setup-folly"
-    - run: "(mkdir build && cd build && cmake -DUSE_FOLLY_LITE=1 -DWITH_GFLAGS=1 -DCMAKE_CXX_FLAGS=-DGLOG_USE_GLOG_EXPORT .. && make V=1 -j20)"
+    - run: "(mkdir build && cd build && cmake -DUSE_FOLLY_LITE=1 -DWITH_GFLAGS=1 -DCMAKE_CXX_FLAGS=-DGLOG_USE_GLOG_EXPORT .. && make VERBOSE=1 -j20 && ctest -j20)"
     - uses: "./.github/actions/post-steps"
diff --git a/.github/workflows/pr-jobs.yml b/.github/workflows/pr-jobs.yml
index 23d71687d255..cfd7b0343b8d 100644
--- a/.github/workflows/pr-jobs.yml
+++ b/.github/workflows/pr-jobs.yml
@@ -139,7 +139,7 @@ jobs:
     - uses: "./.github/actions/build-folly"
       with:
         cache-hit: ${{ steps.cache-folly.outputs.cache-hit }}
-    - run: "(mkdir build && cd build && cmake -DUSE_COROUTINES=1 -DWITH_GFLAGS=1 -DROCKSDB_BUILD_SHARED=0 .. && make V=1 -j20 && ctest -j20)"
+    - run: "(mkdir build && cd build && cmake -DUSE_COROUTINES=1 -DWITH_GFLAGS=1 -DROCKSDB_BUILD_SHARED=0 .. && make VERBOSE=1 -j20 && ctest -j20)"
     - uses: "./.github/actions/post-steps"
   build-linux-cmake-with-benchmark-no-thread-status:
     if: ${{ github.repository_owner == 'facebook' }}
@@ -151,7 +151,7 @@ jobs:
     steps:
     - uses: actions/checkout@v4.1.0
     - uses: "./.github/actions/pre-steps"
-    - run: mkdir build && cd build && cmake -DWITH_GFLAGS=1 -DWITH_BENCHMARK=1 -DCMAKE_CXX_FLAGS=-DNROCKSDB_THREAD_STATUS .. && make V=1 -j20 && ctest -j20
+    - run: mkdir build && cd build && cmake -DWITH_GFLAGS=1 -DWITH_BENCHMARK=1 -DCMAKE_CXX_FLAGS=-DNROCKSDB_THREAD_STATUS .. && make VERBOSE=1 -j20 && ctest -j20
     - uses: "./.github/actions/post-steps"
   build-linux-encrypted_env-no_compression:
     if: ${{ github.repository_owner == 'facebook' }}
@@ -354,7 +354,7 @@ jobs:
     - name: cmake generate project file
       run: ulimit -S -n `ulimit -H -n` && mkdir build && cd build && cmake -DWITH_GFLAGS=1 ..
     - name: Build tests
-      run: cd build && make V=1 -j8
+      run: cd build && make VERBOSE=1 -j8
     - name: Run shard 0 out of 4 test shards
       run: ulimit -S -n `ulimit -H -n` && cd build && ctest -j8 -I 0,,4
       if: ${{ matrix.run_sharded_tests == 0 }}
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8664c1c1d865..c1e459337a40 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1105,7 +1105,7 @@ if(USE_FOLLY_LITE)
   exec_program(python3 ${PROJECT_SOURCE_DIR}/third-party/folly ARGS
   build/fbcode_builder/getdeps.py show-source-dir fmt OUTPUT_VARIABLE
   FMT_SOURCE_PATH)
-  exec_program(ls ARGS -d ${FMT_SOURCE_PATH}/fmt* OUTPUT_VARIABLE
+  exec_program(ls ARGS -d ${FMT_SOURCE_PATH}/fmt*/include OUTPUT_VARIABLE
   FMT_INCLUDE_DIR)
   include_directories(${FMT_INCLUDE_DIR})
 
diff --git a/Makefile b/Makefile
index 6c053a6d0e4b..5e07b0cb50e0 100644
--- a/Makefile
+++ b/Makefile
@@ -446,100 +446,7 @@ else
 	PLATFORM_CXXFLAGS += -isystem $(GTEST_DIR)
 endif
 
-# This provides a Makefile simulation of a Meta-internal folly integration.
-# It is not validated for general use.
-#
-# USE_FOLLY links the build targets with libfolly.a. The latter could be
-# built using 'make build_folly', or built externally and specified in
-# the CXXFLAGS and EXTRA_LDFLAGS env variables. The build_detect_platform
-# script tries to detect if an external folly dependency has been specified.
-# If not, it exports FOLLY_PATH to the path of the installed Folly and
-# dependency libraries.
-#
-# USE_FOLLY_LITE cherry picks source files from Folly to include in the
-# RocksDB library. Its faster and has fewer dependencies on 3rd party
-# libraries, but with limited functionality. For example, coroutine
-# functionality is not available.
-ifeq ($(USE_FOLLY),1)
-ifeq ($(USE_FOLLY_LITE),1)
-$(error Please specify only one of USE_FOLLY and USE_FOLLY_LITE)
-endif
-ifneq ($(strip $(FOLLY_PATH)),)
-	BOOST_PATH = $(shell (ls -d $(FOLLY_PATH)/../boost*))
-	DBL_CONV_PATH = $(shell (ls -d $(FOLLY_PATH)/../double-conversion*))
-	GFLAGS_PATH = $(shell (ls -d $(FOLLY_PATH)/../gflags*))
-	GLOG_PATH = $(shell (ls -d $(FOLLY_PATH)/../glog*))
-	LIBEVENT_PATH = $(shell (ls -d $(FOLLY_PATH)/../libevent*))
-	XZ_PATH = $(shell (ls -d $(FOLLY_PATH)/../xz*))
-	LIBSODIUM_PATH = $(shell (ls -d $(FOLLY_PATH)/../libsodium*))
-	FMT_PATH = $(shell (ls -d $(FOLLY_PATH)/../fmt*))
-
-	# For some reason, glog and fmt libraries are under either lib or lib64
-	GLOG_LIB_PATH = $(shell (ls -d $(GLOG_PATH)/lib*))
-	FMT_LIB_PATH = $(shell (ls -d $(FMT_PATH)/lib*))
-
-	# AIX: pre-defined system headers are surrounded by an extern "C" block
-	ifeq ($(PLATFORM), OS_AIX)
-		PLATFORM_CCFLAGS += -I$(BOOST_PATH)/include -I$(DBL_CONV_PATH)/include -I$(GLOG_PATH)/include -I$(LIBEVENT_PATH)/include -I$(XZ_PATH)/include -I$(LIBSODIUM_PATH)/include -I$(FOLLY_PATH)/include -I$(FMT_PATH)/include
-		PLATFORM_CXXFLAGS += -I$(BOOST_PATH)/include -I$(DBL_CONV_PATH)/include -I$(GLOG_PATH)/include -I$(LIBEVENT_PATH)/include -I$(XZ_PATH)/include -I$(LIBSODIUM_PATH)/include -I$(FOLLY_PATH)/include -I$(FMT_PATH)/include
-	else
-		PLATFORM_CCFLAGS += -isystem $(BOOST_PATH)/include -isystem $(DBL_CONV_PATH)/include -isystem $(GLOG_PATH)/include -isystem $(LIBEVENT_PATH)/include -isystem $(XZ_PATH)/include -isystem $(LIBSODIUM_PATH)/include -isystem $(FOLLY_PATH)/include -isystem $(FMT_PATH)/include
-		PLATFORM_CXXFLAGS += -isystem $(BOOST_PATH)/include -isystem $(DBL_CONV_PATH)/include -isystem $(GLOG_PATH)/include -isystem $(LIBEVENT_PATH)/include -isystem $(XZ_PATH)/include -isystem $(LIBSODIUM_PATH)/include -isystem $(FOLLY_PATH)/include -isystem $(FMT_PATH)/include
-	endif
-
-	# Add -ldl at the end as gcc resolves a symbol in a library by searching only in libraries specified later
-	# in the command line
-
-	PLATFORM_LDFLAGS += $(FOLLY_PATH)/lib/libfolly.a $(BOOST_PATH)/lib/libboost_context.a $(BOOST_PATH)/lib/libboost_filesystem.a $(BOOST_PATH)/lib/libboost_atomic.a $(BOOST_PATH)/lib/libboost_program_options.a $(BOOST_PATH)/lib/libboost_regex.a $(BOOST_PATH)/lib/libboost_system.a $(BOOST_PATH)/lib/libboost_thread.a $(DBL_CONV_PATH)/lib/libdouble-conversion.a $(LIBEVENT_PATH)/lib/libevent-2.1.so $(LIBSODIUM_PATH)/lib/libsodium.a -ldl
-ifneq ($(DEBUG_LEVEL),0)
-	PLATFORM_LDFLAGS += $(FMT_LIB_PATH)/libfmtd.a $(GLOG_LIB_PATH)/libglogd.so $(GFLAGS_PATH)/lib/libgflags_debug.so.2.2
-else
-	PLATFORM_LDFLAGS += $(FMT_LIB_PATH)/libfmt.a $(GLOG_LIB_PATH)/libglog.so $(GFLAGS_PATH)/lib/libgflags.so.2.2
-endif
-	PLATFORM_LDFLAGS += -Wl,-rpath=$(LIBEVENT_PATH)/lib -Wl,-rpath=$(GLOG_LIB_PATH) -Wl,-rpath=$(GFLAGS_PATH)/lib
-endif
-	PLATFORM_CCFLAGS += -DUSE_FOLLY -DFOLLY_NO_CONFIG
-	PLATFORM_CXXFLAGS += -DUSE_FOLLY -DFOLLY_NO_CONFIG
-endif
-
-ifeq ($(USE_FOLLY_LITE),1)
-	# Path to the Folly source code and include files
-	FOLLY_DIR = ./third-party/folly
-ifneq ($(strip $(BOOST_SOURCE_PATH)),)
-	BOOST_INCLUDE = $(shell (ls -d $(BOOST_SOURCE_PATH)/boost*/))
-	# AIX: pre-defined system headers are surrounded by an extern "C" block
-	ifeq ($(PLATFORM), OS_AIX)
-		PLATFORM_CCFLAGS += -I$(BOOST_INCLUDE)
-		PLATFORM_CXXFLAGS += -I$(BOOST_INCLUDE)
-	else
-		PLATFORM_CCFLAGS += -isystem $(BOOST_INCLUDE)
-		PLATFORM_CXXFLAGS += -isystem $(BOOST_INCLUDE)
-	endif
-endif  # BOOST_SOURCE_PATH
-ifneq ($(strip $(FMT_SOURCE_PATH)),)
-	FMT_INCLUDE = $(shell (ls -d $(FMT_SOURCE_PATH)/fmt*/include/))
-	# AIX: pre-defined system headers are surrounded by an extern "C" block
-	ifeq ($(PLATFORM), OS_AIX)
-		PLATFORM_CCFLAGS += -I$(FMT_INCLUDE)
-		PLATFORM_CXXFLAGS += -I$(FMT_INCLUDE)
-	else
-		PLATFORM_CCFLAGS += -isystem $(FMT_INCLUDE)
-		PLATFORM_CXXFLAGS += -isystem $(FMT_INCLUDE)
-	endif
-endif  # FMT_SOURCE_PATH
-	# AIX: pre-defined system headers are surrounded by an extern "C" block
-	ifeq ($(PLATFORM), OS_AIX)
-		PLATFORM_CCFLAGS += -I$(FOLLY_DIR)
-		PLATFORM_CXXFLAGS += -I$(FOLLY_DIR)
-	else
-		PLATFORM_CCFLAGS += -isystem $(FOLLY_DIR)
-		PLATFORM_CXXFLAGS += -isystem $(FOLLY_DIR)
-	endif
-	PLATFORM_CCFLAGS += -DUSE_FOLLY -DFOLLY_NO_CONFIG
-	PLATFORM_CXXFLAGS += -DUSE_FOLLY -DFOLLY_NO_CONFIG
-# TODO: fix linking with fbcode compiler config
-	PLATFORM_LDFLAGS += -lglog
-endif
+include folly.mk
 
 ifdef TEST_CACHE_LINE_SIZE
   PLATFORM_CCFLAGS += -DTEST_CACHE_LINE_SIZE=$(TEST_CACHE_LINE_SIZE)
@@ -2513,56 +2420,6 @@ commit_prereq:
 	false # J=$(J) build_tools/precommit_checker.py unit clang_unit release clang_release tsan asan ubsan lite unit_non_shm
 	# $(MAKE) clean && $(MAKE) jclean && $(MAKE) rocksdbjava;
 
-FOLLY_COMMIT_HASH = abe68f7e917e8b7a0ee2fe066c972dc98fd35aa1
-
-# For public CI runs, checkout folly in a way that can build with RocksDB.
-# This is mostly intended as a test-only simulation of Meta-internal folly
-# integration.
-checkout_folly:
-	if [ -e third-party/folly ]; then \
-		cd third-party/folly && ${GIT_COMMAND} fetch origin; \
-	else \
-		cd third-party && ${GIT_COMMAND} clone https://github.com/facebook/folly.git; \
-	fi
-	@# Pin to a particular version for public CI, so that PR authors don't
-	@# need to worry about folly breaking our integration. Update periodically
-	cd third-party/folly && git reset --hard $(FOLLY_COMMIT_HASH)
-	@# Apparently missing include
-	perl -pi -e 's/(#include <atomic>)/$$1\n#include <cstring>/' third-party/folly/folly/lang/Exception.h
-	@# const mismatch
-	perl -pi -e 's/: environ/: (const char**)(environ)/' third-party/folly/folly/Subprocess.cpp
-	@# Use gnu.org mirrors to improve download speed (ftp.gnu.org is often super slow)
-	cd third-party/folly && perl -pi -e 's/ftp.gnu.org/ftpmirror.gnu.org/' `git grep -l ftp.gnu.org` README.md
-	@# NOTE: boost and fmt source will be needed for any build including `USE_FOLLY_LITE` builds as those depend on those headers
-	cd third-party/folly && GETDEPS_USE_WGET=1 $(PYTHON) build/fbcode_builder/getdeps.py fetch boost && GETDEPS_USE_WGET=1 $(PYTHON) build/fbcode_builder/getdeps.py fetch fmt
-
-CXX_M_FLAGS = $(filter -m%, $(CXXFLAGS))
-
-FOLLY_BUILD_FLAGS = --no-tests
-# NOTE: To avoid ODR violations, we must build folly in debug mode iff
-# building RocksDB in debug mode.
-ifneq ($(DEBUG_LEVEL),0)
-FOLLY_BUILD_FLAGS += --build-type Debug
-endif
-
-
-build_folly:
-	FOLLY_INST_PATH=`cd third-party/folly && $(PYTHON) build/fbcode_builder/getdeps.py show-inst-dir`; \
-	if [ "$$FOLLY_INST_PATH" ]; then \
-		rm -rf $${FOLLY_INST_PATH}/../../*; \
-	else \
-		echo "Please run checkout_folly first"; \
-		false; \
-	fi
-	cd third-party/folly && \
-		CXXFLAGS=" $(CXX_M_FLAGS) -DHAVE_CXX11_ATOMIC " GETDEPS_USE_WGET=1 $(PYTHON) build/fbcode_builder/getdeps.py build $(FOLLY_BUILD_FLAGS)
-	@# In the folly build, glog and gflags are only built as dynamic libraries,
-	@# not static. This patchelf command is needed to reliably have the glog
-	@# library find its dependency gflags, because apparently the rpath of the
-	@# final binary is not used in resolving that transitive dependency.
-	FOLLY_INST_PATH=`cd third-party/folly && $(PYTHON) build/fbcode_builder/getdeps.py show-inst-dir`; \
-	cd "$$FOLLY_INST_PATH" && patchelf --add-rpath $$PWD/../gflags-*/lib ../glog-*/lib*/libglog*.so.*.*.*
-
 # ---------------------------------------------------------------------------
 #   Build size testing
 # ---------------------------------------------------------------------------
diff --git a/folly.mk b/folly.mk
new file mode 100644
index 000000000000..590abf0226f8
--- /dev/null
+++ b/folly.mk
@@ -0,0 +1,148 @@
+# This file contains the vast majority of folly-related build configuration
+# for the checkout_folly and build_folly targets, so that this file can be
+# hashed for purposes of caching folly builds and not hitting that cache when
+# something here changes.
+
+# This provides a Makefile simulation of a Meta-internal folly integration.
+# It is not validated for general use.
+#
+# USE_FOLLY links the build targets with libfolly.a. The latter could be
+# built using 'make build_folly', or built externally and specified in
+# the CXXFLAGS and EXTRA_LDFLAGS env variables. The build_detect_platform
+# script tries to detect if an external folly dependency has been specified.
+# If not, it exports FOLLY_PATH to the path of the installed Folly and
+# dependency libraries.
+#
+# USE_FOLLY_LITE cherry picks source files from Folly to include in the
+# RocksDB library. Its faster and has fewer dependencies on 3rd party
+# libraries, but with limited functionality. For example, coroutine
+# functionality is not available.
+ifeq ($(USE_FOLLY),1)
+ifeq ($(USE_FOLLY_LITE),1)
+$(error Please specify only one of USE_FOLLY and USE_FOLLY_LITE)
+endif
+ifneq ($(strip $(FOLLY_PATH)),)
+	BOOST_PATH = $(shell (ls -d $(FOLLY_PATH)/../boost*))
+	DBL_CONV_PATH = $(shell (ls -d $(FOLLY_PATH)/../double-conversion*))
+	GFLAGS_PATH = $(shell (ls -d $(FOLLY_PATH)/../gflags*))
+	GLOG_PATH = $(shell (ls -d $(FOLLY_PATH)/../glog*))
+	LIBEVENT_PATH = $(shell (ls -d $(FOLLY_PATH)/../libevent*))
+	XZ_PATH = $(shell (ls -d $(FOLLY_PATH)/../xz*))
+	LIBSODIUM_PATH = $(shell (ls -d $(FOLLY_PATH)/../libsodium*))
+	FMT_PATH = $(shell (ls -d $(FOLLY_PATH)/../fmt*))
+
+	# For some reason, glog and fmt libraries are under either lib or lib64
+	GLOG_LIB_PATH = $(shell (ls -d $(GLOG_PATH)/lib*))
+	FMT_LIB_PATH = $(shell (ls -d $(FMT_PATH)/lib*))
+
+	# AIX: pre-defined system headers are surrounded by an extern "C" block
+	ifeq ($(PLATFORM), OS_AIX)
+		PLATFORM_CCFLAGS += -I$(BOOST_PATH)/include -I$(DBL_CONV_PATH)/include -I$(GLOG_PATH)/include -I$(LIBEVENT_PATH)/include -I$(XZ_PATH)/include -I$(LIBSODIUM_PATH)/include -I$(FOLLY_PATH)/include -I$(FMT_PATH)/include
+		PLATFORM_CXXFLAGS += -I$(BOOST_PATH)/include -I$(DBL_CONV_PATH)/include -I$(GLOG_PATH)/include -I$(LIBEVENT_PATH)/include -I$(XZ_PATH)/include -I$(LIBSODIUM_PATH)/include -I$(FOLLY_PATH)/include -I$(FMT_PATH)/include
+	else
+		PLATFORM_CCFLAGS += -isystem $(BOOST_PATH)/include -isystem $(DBL_CONV_PATH)/include -isystem $(GLOG_PATH)/include -isystem $(LIBEVENT_PATH)/include -isystem $(XZ_PATH)/include -isystem $(LIBSODIUM_PATH)/include -isystem $(FOLLY_PATH)/include -isystem $(FMT_PATH)/include
+		PLATFORM_CXXFLAGS += -isystem $(BOOST_PATH)/include -isystem $(DBL_CONV_PATH)/include -isystem $(GLOG_PATH)/include -isystem $(LIBEVENT_PATH)/include -isystem $(XZ_PATH)/include -isystem $(LIBSODIUM_PATH)/include -isystem $(FOLLY_PATH)/include -isystem $(FMT_PATH)/include
+	endif
+
+	# Add -ldl at the end as gcc resolves a symbol in a library by searching only in libraries specified later
+	# in the command line
+
+	PLATFORM_LDFLAGS += $(FOLLY_PATH)/lib/libfolly.a $(BOOST_PATH)/lib/libboost_context.a $(BOOST_PATH)/lib/libboost_filesystem.a $(BOOST_PATH)/lib/libboost_atomic.a $(BOOST_PATH)/lib/libboost_program_options.a $(BOOST_PATH)/lib/libboost_regex.a $(BOOST_PATH)/lib/libboost_system.a $(BOOST_PATH)/lib/libboost_thread.a $(DBL_CONV_PATH)/lib/libdouble-conversion.a $(LIBEVENT_PATH)/lib/libevent-2.1.so $(LIBSODIUM_PATH)/lib/libsodium.a -ldl
+ifneq ($(DEBUG_LEVEL),0)
+	PLATFORM_LDFLAGS += $(FMT_LIB_PATH)/libfmtd.a $(GLOG_LIB_PATH)/libglogd.so $(GFLAGS_PATH)/lib/libgflags_debug.so.2.2
+else
+	PLATFORM_LDFLAGS += $(FMT_LIB_PATH)/libfmt.a $(GLOG_LIB_PATH)/libglog.so $(GFLAGS_PATH)/lib/libgflags.so.2.2
+endif
+	PLATFORM_LDFLAGS += -Wl,-rpath=$(LIBEVENT_PATH)/lib -Wl,-rpath=$(GLOG_LIB_PATH) -Wl,-rpath=$(GFLAGS_PATH)/lib
+endif
+	PLATFORM_CCFLAGS += -DUSE_FOLLY -DFOLLY_NO_CONFIG
+	PLATFORM_CXXFLAGS += -DUSE_FOLLY -DFOLLY_NO_CONFIG
+endif
+
+ifeq ($(USE_FOLLY_LITE),1)
+	# Path to the Folly source code and include files
+	FOLLY_DIR = ./third-party/folly
+ifneq ($(strip $(BOOST_SOURCE_PATH)),)
+	BOOST_INCLUDE = $(shell (ls -d $(BOOST_SOURCE_PATH)/boost*/))
+	# AIX: pre-defined system headers are surrounded by an extern "C" block
+	ifeq ($(PLATFORM), OS_AIX)
+		PLATFORM_CCFLAGS += -I$(BOOST_INCLUDE)
+		PLATFORM_CXXFLAGS += -I$(BOOST_INCLUDE)
+	else
+		PLATFORM_CCFLAGS += -isystem $(BOOST_INCLUDE)
+		PLATFORM_CXXFLAGS += -isystem $(BOOST_INCLUDE)
+	endif
+endif  # BOOST_SOURCE_PATH
+ifneq ($(strip $(FMT_SOURCE_PATH)),)
+	FMT_INCLUDE = $(shell (ls -d $(FMT_SOURCE_PATH)/fmt*/include/))
+	# AIX: pre-defined system headers are surrounded by an extern "C" block
+	ifeq ($(PLATFORM), OS_AIX)
+		PLATFORM_CCFLAGS += -I$(FMT_INCLUDE)
+		PLATFORM_CXXFLAGS += -I$(FMT_INCLUDE)
+	else
+		PLATFORM_CCFLAGS += -isystem $(FMT_INCLUDE)
+		PLATFORM_CXXFLAGS += -isystem $(FMT_INCLUDE)
+	endif
+endif  # FMT_SOURCE_PATH
+	# AIX: pre-defined system headers are surrounded by an extern "C" block
+	ifeq ($(PLATFORM), OS_AIX)
+		PLATFORM_CCFLAGS += -I$(FOLLY_DIR)
+		PLATFORM_CXXFLAGS += -I$(FOLLY_DIR)
+	else
+		PLATFORM_CCFLAGS += -isystem $(FOLLY_DIR)
+		PLATFORM_CXXFLAGS += -isystem $(FOLLY_DIR)
+	endif
+	PLATFORM_CCFLAGS += -DUSE_FOLLY -DFOLLY_NO_CONFIG
+	PLATFORM_CXXFLAGS += -DUSE_FOLLY -DFOLLY_NO_CONFIG
+# TODO: fix linking with fbcode compiler config
+	PLATFORM_LDFLAGS += -lglog
+endif
+
+FOLLY_COMMIT_HASH = abe68f7e917e8b7a0ee2fe066c972dc98fd35aa1
+
+# For public CI runs, checkout folly in a way that can build with RocksDB.
+# This is mostly intended as a test-only simulation of Meta-internal folly
+# integration.
+checkout_folly:
+	if [ -e third-party/folly ]; then \
+		cd third-party/folly && ${GIT_COMMAND} fetch origin; \
+	else \
+		cd third-party && ${GIT_COMMAND} clone https://github.com/facebook/folly.git; \
+	fi
+	@# Pin to a particular version for public CI, so that PR authors don't
+	@# need to worry about folly breaking our integration. Update periodically
+	cd third-party/folly && git reset --hard $(FOLLY_COMMIT_HASH)
+	@# Apparently missing include
+	perl -pi -e 's/(#include <atomic>)/$$1\n#include <cstring>/' third-party/folly/folly/lang/Exception.h
+	@# const mismatch
+	perl -pi -e 's/: environ/: (const char**)(environ)/' third-party/folly/folly/Subprocess.cpp
+	@# Use gnu.org mirrors to improve download speed (ftp.gnu.org is often super slow)
+	cd third-party/folly && perl -pi -e 's/ftp.gnu.org/ftpmirror.gnu.org/' `git grep -l ftp.gnu.org` README.md
+	@# NOTE: boost and fmt source will be needed for any build including `USE_FOLLY_LITE` builds as those depend on those headers
+	cd third-party/folly && GETDEPS_USE_WGET=1 $(PYTHON) build/fbcode_builder/getdeps.py fetch boost && GETDEPS_USE_WGET=1 $(PYTHON) build/fbcode_builder/getdeps.py fetch fmt
+
+CXX_M_FLAGS = $(filter -m%, $(CXXFLAGS))
+
+FOLLY_BUILD_FLAGS = --no-tests
+# NOTE: To avoid ODR violations, we must build folly in debug mode iff
+# building RocksDB in debug mode.
+ifneq ($(DEBUG_LEVEL),0)
+FOLLY_BUILD_FLAGS += --build-type Debug
+endif
+
+build_folly:
+	FOLLY_INST_PATH=`cd third-party/folly && $(PYTHON) build/fbcode_builder/getdeps.py show-inst-dir`; \
+	if [ "$$FOLLY_INST_PATH" ]; then \
+		rm -rf $${FOLLY_INST_PATH}/../../*; \
+	else \
+		echo "Please run checkout_folly first"; \
+		false; \
+	fi
+	cd third-party/folly && \
+		CXXFLAGS=" $(CXX_M_FLAGS) -DHAVE_CXX11_ATOMIC " GETDEPS_USE_WGET=1 $(PYTHON) build/fbcode_builder/getdeps.py build $(FOLLY_BUILD_FLAGS)
+	@# In the folly build, glog and gflags are only built as dynamic libraries,
+	@# not static. This patchelf command is needed to reliably have the glog
+	@# library find its dependency gflags, because apparently the rpath of the
+	@# final binary is not used in resolving that transitive dependency.
+	FOLLY_INST_PATH=`cd third-party/folly && $(PYTHON) build/fbcode_builder/getdeps.py show-inst-dir`; \
+	cd "$$FOLLY_INST_PATH" && patchelf --add-rpath $$PWD/../gflags-*/lib ../glog-*/lib*/libglog*.so.*.*.*

From 7603712a88a947717ce281c47fcb75e1bd8fd29c Mon Sep 17 00:00:00 2001
From: ngina <221624547+nmk70@users.noreply.github.com>
Date: Wed, 5 Nov 2025 20:00:00 -0800
Subject: [PATCH 366/500] Introduce tail estimation to prevent oversized
 compaction files (#14051)

Summary:
**Summary:**
This change introduces tail size estimation during SST construction to improve compaction file cutting accuracy to prevent oversized files. The BlockBasedTableBuilder now estimates the SST tail size (index and filter blocks) and uses this estimate, in addition to the data size, to determine when to cut files during compaction.

**Problem:**
Currently, file cutting logic only considers data size when determining where to cut a file, failing to reserve space for index and filter blocks that are added when the file is finalized. This often leads to SST files that exceed target file size limits.

**Behavior Change:**
Implement size estimation methods for index and filter builders, and integrate these estimates into BlockBasedTableBuilder via a new EstimatedTailSize() method. This method aggregates estimates from all tail components and is used for file cutting decisions during compaction.

**Performance Considerations:**
To minimize CPU overhead, size estimates are updated when data blocks are finalized rather than on every key add. For index builders, estimates are updated when index entries are added (one per data block). For filter builders, the OnDataBlockFinalized() hook triggers estimate updates when data blocks are cut/finalized.

This approach provides:
* Minimal impact to compaction hot path (key additions)
* Near real-time estimates for file cutting decisions
* Meaningful estimate changes only when data blocks are finalized

**Usage:**
* Set true mutable cf option `compaction_use_tail_size_estimation`
to use tail size estimation for compaction file cutting decisions.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14051

Test Plan:
* Assert tail size estimate is an overestimate in BlockBasedTableBuilder::Finish
* Add new test to verify compaction output file is below target file size

**Next steps:**
* Enable tail size estimation for compaction file cutting by default (and other improvements)

Reviewed By: pdillinger, cbi42

Differential Revision: D84852285

Pulled By: nmk70

fbshipit-source-id: c43cf5dbd2cb2f623a0622591ef24eee30ce0c87
---
 BUCK                                          |   6 -
 CMakeLists.txt                                |   1 -
 Makefile                                      |   3 -
 db/compaction/compaction_outputs.cc           |   6 +-
 db/db_compaction_test.cc                      |  65 +++++++
 include/rocksdb/advanced_options.h            |  11 ++
 options/cf_options.cc                         |   6 +
 options/cf_options.h                          |   4 +
 options/options.cc                            |   4 +
 options/options_helper.cc                     |   2 +
 options/options_settable_test.cc              |   1 +
 src.mk                                        |   1 -
 .../block_based/block_based_table_builder.cc  |  79 +++++++-
 table/block_based/block_based_table_builder.h |  10 +-
 table/block_based/filter_block.h              |  17 ++
 table/block_based/filter_policy.cc            |  14 +-
 table/block_based/full_filter_block.cc        |  31 +++
 table/block_based/full_filter_block.h         |   6 +
 table/block_based/index_builder.cc            |  98 ++++++++--
 table/block_based/index_builder.h             |  68 +++++--
 table/block_based/index_builder_test.cc       | 183 ------------------
 table/block_based/partitioned_filter_block.cc |  51 +++++
 table/block_based/partitioned_filter_block.h  |  19 ++
 .../block_based/user_defined_index_wrapper.h  |   2 +-
 table/table_builder.h                         |   5 +
 .../target_file_size_is_upper_bound.md        |   1 +
 26 files changed, 453 insertions(+), 241 deletions(-)
 delete mode 100644 table/block_based/index_builder_test.cc
 create mode 100644 unreleased_history/new_features/target_file_size_is_upper_bound.md

diff --git a/BUCK b/BUCK
index c4327a3f724b..d52c03f7bf0e 100644
--- a/BUCK
+++ b/BUCK
@@ -5194,12 +5194,6 @@ cpp_unittest_wrapper(name="import_column_family_test",
             extra_compiler_flags=[])
 
 
-cpp_unittest_wrapper(name="index_builder_test",
-            srcs=["table/block_based/index_builder_test.cc"],
-            deps=[":rocksdb_test_lib"],
-            extra_compiler_flags=[])
-
-
 cpp_unittest_wrapper(name="inlineskiplist_test",
             srcs=["memtable/inlineskiplist_test.cc"],
             deps=[":rocksdb_test_lib"],
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c1e459337a40..fd60f833222b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1473,7 +1473,6 @@ if(WITH_TESTS)
         table/block_based/block_based_table_reader_test.cc
         table/block_based/block_test.cc
         table/block_based/data_block_hash_index_test.cc
-        table/block_based/index_builder_test.cc
         table/block_based/full_filter_block_test.cc
         table/block_based/partitioned_filter_block_test.cc
         table/cleanable_test.cc
diff --git a/Makefile b/Makefile
index 5e07b0cb50e0..987dd6fe175e 100644
--- a/Makefile
+++ b/Makefile
@@ -1664,9 +1664,6 @@ block_test: $(OBJ_DIR)/table/block_based/block_test.o $(TEST_LIBRARY) $(LIBRARY)
 data_block_hash_index_test: $(OBJ_DIR)/table/block_based/data_block_hash_index_test.o $(TEST_LIBRARY) $(LIBRARY)
 	$(AM_LINK)
 
-index_builder_test: $(OBJ_DIR)/table/block_based/index_builder_test.o $(TEST_LIBRARY) $(LIBRARY)
-	$(AM_LINK)
-
 inlineskiplist_test: $(OBJ_DIR)/memtable/inlineskiplist_test.o $(TEST_LIBRARY) $(LIBRARY)
 	$(AM_LINK)
 
diff --git a/db/compaction/compaction_outputs.cc b/db/compaction/compaction_outputs.cc
index 67ea73567ae8..ff1e446a6953 100644
--- a/db/compaction/compaction_outputs.cc
+++ b/db/compaction/compaction_outputs.cc
@@ -278,7 +278,11 @@ bool CompactionOutputs::ShouldStopBefore(const CompactionIterator& c_iter) {
   }
 
   // reach the max file size
-  if (current_output_file_size_ >= compaction_->max_output_file_size()) {
+  uint64_t estimated_file_size = current_output_file_size_;
+  if (compaction_->mutable_cf_options().target_file_size_is_upper_bound) {
+    estimated_file_size += builder_->EstimatedTailSize();
+  }
+  if (estimated_file_size >= compaction_->max_output_file_size()) {
     return true;
   }
 
diff --git a/db/db_compaction_test.cc b/db/db_compaction_test.cc
index 7e3f61662220..749acda11c8b 100644
--- a/db/db_compaction_test.cc
+++ b/db/db_compaction_test.cc
@@ -11541,6 +11541,71 @@ TEST_F(DBCompactionTest, RecordNewestKeyTimeForTtlCompaction) {
   ASSERT_EQ(NumTableFilesAtLevel(0), 0);
 }
 
+// Test verifies compaction file cutting logic when using tail size estimation
+// maintains output files at or below the target file size.
+TEST_F(DBCompactionTest, CompactionRespectsTargetSizeWithTailEstimation) {
+  const int kInitialKeyCount = 10000;  // 10k keys
+  const int kValueSize = 100;          // 100 bytes per key
+  const int kSeed = 301;
+
+  Options options = CurrentOptions();
+  options.target_file_size_is_upper_bound = true;
+  options.target_file_size_base = 256 * 1024;
+  options.write_buffer_size = 2 * 1024 * 1024;
+  options.level0_file_num_compaction_trigger = 100;  // Never trigger L0->L1
+  options.compression = kNoCompression;
+
+  BlockBasedTableOptions table_options;
+  table_options.partition_filters = true;
+  table_options.metadata_block_size = 4 * 1024;
+  table_options.index_type = BlockBasedTableOptions::kBinarySearch;
+  table_options.filter_policy.reset(NewBloomFilterPolicy(10));
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  DestroyAndReopen(options);
+
+  // Generate 2 L0 files
+  // Generate first file with 10k keys (each ~100 bytes) approx 1.2MB total
+  Random rnd(kSeed);
+  for (int i = 0; i < kInitialKeyCount; i++) {
+    ASSERT_OK(Put(Key(i), rnd.RandomString(kValueSize)));
+  }
+  ASSERT_OK(Flush());
+
+  // Generate second file with overlapping keys to force compaction (prevent
+  // trivial move)
+  for (int i = kInitialKeyCount / 2; i < kInitialKeyCount * 1.5; i++) {
+    ASSERT_OK(Put(Key(i), rnd.RandomString(kValueSize)));
+  }
+  ASSERT_OK(Flush());
+
+  // Capture file metadata and assert two L0 files
+  std::vector<LiveFileMetaData> file_metadata;
+  db_->GetLiveFilesMetaData(&file_metadata);
+  ASSERT_EQ(file_metadata.size(), 2);
+  for (const auto& file : file_metadata) {
+    ASSERT_EQ(file.level, 0);
+  };
+
+  // Manually compact LO files to L1
+  CompactRangeOptions cro;
+  cro.change_level = true;
+  cro.target_level = 1;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // Verify that compacted output files are under target file size
+  for (const auto& file : file_metadata) {
+    if (file.level > 0) {
+      EXPECT_LE(file.size, options.target_file_size_base)
+          << "Output file size exceeds target size: " << " File: " << file.name
+          << " level: " << file.level << " File size: " << file.size
+          << " Target size: " << options.target_file_size_base;
+    }
+  }
+}
+
 class PeriodicCompactionListener : public EventListener {
  public:
   explicit PeriodicCompactionListener() {}
diff --git a/include/rocksdb/advanced_options.h b/include/rocksdb/advanced_options.h
index b4e7a30e9523..dba041b0f4eb 100644
--- a/include/rocksdb/advanced_options.h
+++ b/include/rocksdb/advanced_options.h
@@ -473,6 +473,17 @@ struct AdvancedColumnFamilyOptions {
   // Dynamically changeable through SetOptions() API
   int target_file_size_multiplier = 1;
 
+  // If true, RocksDB will consider the estimated tail size (filter + index +
+  // meta blocks) when deciding whether to cut a compaction output file. This
+  // helps prevent output files from exceeding the target_file_size_base due to
+  // large tail blocks. When disabled, only the data block size is considered,
+  // which may result in SST files exceeding the target_file_size_base.
+  //
+  // Default: false
+  //
+  // Dynamically changeable through SetOptions() API
+  bool target_file_size_is_upper_bound = false;
+
   // If true, RocksDB will pick target size of each level dynamically.
   // We will pick a base level b >= 1. L0 will be directly merged into level b,
   // instead of always into level 1. Level 1 to b-1 need to be empty.
diff --git a/options/cf_options.cc b/options/cf_options.cc
index 475e0d7a4386..f0e9e26b43b5 100644
--- a/options/cf_options.cc
+++ b/options/cf_options.cc
@@ -450,6 +450,10 @@ static std::unordered_map<std::string, OptionTypeInfo>
          {offsetof(struct MutableCFOptions, target_file_size_multiplier),
           OptionType::kInt, OptionVerificationType::kNormal,
           OptionTypeFlags::kMutable}},
+        {"target_file_size_is_upper_bound",
+         {offsetof(struct MutableCFOptions, target_file_size_is_upper_bound),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
         {"arena_block_size",
          {offsetof(struct MutableCFOptions, arena_block_size),
           OptionType::kSizeT, OptionVerificationType::kNormal,
@@ -1168,6 +1172,8 @@ void MutableCFOptions::Dump(Logger* log) const {
                  target_file_size_base);
   ROCKS_LOG_INFO(log, "              target_file_size_multiplier: %d",
                  target_file_size_multiplier);
+  ROCKS_LOG_INFO(log, "         target_file_size_is_upper_bound: %d",
+                 target_file_size_is_upper_bound);
   ROCKS_LOG_INFO(log, "                 max_bytes_for_level_base: %" PRIu64,
                  max_bytes_for_level_base);
   ROCKS_LOG_INFO(log, "           max_bytes_for_level_multiplier: %f",
diff --git a/options/cf_options.h b/options/cf_options.h
index 815c60f54c52..539ddf494f75 100644
--- a/options/cf_options.h
+++ b/options/cf_options.h
@@ -132,6 +132,8 @@ struct MutableCFOptions {
         max_compaction_bytes(options.max_compaction_bytes),
         target_file_size_base(options.target_file_size_base),
         target_file_size_multiplier(options.target_file_size_multiplier),
+        target_file_size_is_upper_bound(
+            options.target_file_size_is_upper_bound),
         max_bytes_for_level_base(options.max_bytes_for_level_base),
         max_bytes_for_level_multiplier(options.max_bytes_for_level_multiplier),
         ttl(options.ttl),
@@ -206,6 +208,7 @@ struct MutableCFOptions {
         max_compaction_bytes(0),
         target_file_size_base(0),
         target_file_size_multiplier(0),
+        target_file_size_is_upper_bound(false),
         max_bytes_for_level_base(0),
         max_bytes_for_level_multiplier(0),
         ttl(0),
@@ -304,6 +307,7 @@ struct MutableCFOptions {
   uint64_t max_compaction_bytes;
   uint64_t target_file_size_base;
   int target_file_size_multiplier;
+  bool target_file_size_is_upper_bound;
   uint64_t max_bytes_for_level_base;
   double max_bytes_for_level_multiplier;
   uint64_t ttl;
diff --git a/options/options.cc b/options/options.cc
index 0ce071573a4e..d9f64f93d235 100644
--- a/options/options.cc
+++ b/options/options.cc
@@ -63,6 +63,7 @@ AdvancedColumnFamilyOptions::AdvancedColumnFamilyOptions(const Options& options)
       level0_stop_writes_trigger(options.level0_stop_writes_trigger),
       target_file_size_base(options.target_file_size_base),
       target_file_size_multiplier(options.target_file_size_multiplier),
+      target_file_size_is_upper_bound(options.target_file_size_is_upper_bound),
       level_compaction_dynamic_level_bytes(
           options.level_compaction_dynamic_level_bytes),
       max_bytes_for_level_multiplier(options.max_bytes_for_level_multiplier),
@@ -269,6 +270,9 @@ void ColumnFamilyOptions::Dump(Logger* log) const {
                    target_file_size_base);
   ROCKS_LOG_HEADER(log, "            Options.target_file_size_multiplier: %d",
                    target_file_size_multiplier);
+  ROCKS_LOG_HEADER(log,
+                   "           Options.target_file_size_is_upper_bound: %d",
+                   target_file_size_is_upper_bound);
   ROCKS_LOG_HEADER(log,
                    "               Options.max_bytes_for_level_base: %" PRIu64,
                    max_bytes_for_level_base);
diff --git a/options/options_helper.cc b/options/options_helper.cc
index f2081ef8259f..efc91aa9f2f8 100644
--- a/options/options_helper.cc
+++ b/options/options_helper.cc
@@ -252,6 +252,8 @@ void UpdateColumnFamilyOptions(const MutableCFOptions& moptions,
   cf_opts->max_compaction_bytes = moptions.max_compaction_bytes;
   cf_opts->target_file_size_base = moptions.target_file_size_base;
   cf_opts->target_file_size_multiplier = moptions.target_file_size_multiplier;
+  cf_opts->target_file_size_is_upper_bound =
+      moptions.target_file_size_is_upper_bound;
   cf_opts->max_bytes_for_level_base = moptions.max_bytes_for_level_base;
   cf_opts->max_bytes_for_level_multiplier =
       moptions.max_bytes_for_level_multiplier;
diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc
index 8266adbb8ba4..3a00c768b6ed 100644
--- a/options/options_settable_test.cc
+++ b/options/options_settable_test.cc
@@ -610,6 +610,7 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) {
       "max_sequential_skip_in_iterations=4294971408;"
       "arena_block_size=1893;"
       "target_file_size_multiplier=35;"
+      "target_file_size_is_upper_bound=false;"
       "min_write_buffer_number_to_merge=9;"
       "max_write_buffer_number=84;"
       "write_buffer_size=1653;"
diff --git a/src.mk b/src.mk
index 06310de3d3ab..f4efad68bbc3 100644
--- a/src.mk
+++ b/src.mk
@@ -589,7 +589,6 @@ TEST_MAIN_SOURCES =                                                     \
   table/block_based/block_based_table_reader_test.cc                    \
   table/block_based/block_test.cc                                       \
   table/block_based/data_block_hash_index_test.cc                       \
-  table/block_based/index_builder_test.cc                               \
   table/block_based/full_filter_block_test.cc                           \
   table/block_based/partitioned_filter_block_test.cc                    \
   table/cleanable_test.cc                                               \
diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc
index d85a73a11553..5b36e02dff66 100644
--- a/table/block_based/block_based_table_builder.cc
+++ b/table/block_based/block_based_table_builder.cc
@@ -894,6 +894,7 @@ struct BlockBasedTableBuilder::Rep {
   std::unique_ptr<FilterBlockBuilder> filter_builder;
   OffsetableCacheKey base_cache_key;
   const TableFileCreationReason reason;
+  const bool target_file_size_is_upper_bound;
 
   BlockHandle pending_handle;  // Handle to add to index block
 
@@ -1041,6 +1042,8 @@ struct BlockBasedTableBuilder::Rep {
         use_delta_encoding_for_index_values(table_opt.format_version >= 4 &&
                                             !table_opt.block_align),
         reason(tbo.reason),
+        target_file_size_is_upper_bound(
+            tbo.moptions.target_file_size_is_upper_bound),
         flush_block_policy(
             table_options.flush_block_policy_factory->NewFlushBlockPolicy(
                 table_options, data_block)),
@@ -1611,6 +1614,17 @@ void BlockBasedTableBuilder::Flush(const Slice* first_key_in_next_block) {
     rep_->data_begin_offset += uncompressed_block_data.size();
     MaybeEnterUnbuffered(first_key_in_next_block);
   } else {
+    // Increment num_data_blocks when a data block is finalized in the
+    // emit thread to avoid data races with write worker threads
+    ++r->props.num_data_blocks;
+
+    // Notify filter builder that a data block has been finalized
+    // This must happen on the emit thread before the block is added to the
+    // ring buffer to avoid race conditions with worker threads
+    if (r->filter_builder) {
+      r->filter_builder->OnDataBlockFinalized(r->props.num_data_blocks);
+    }
+
     if (r->IsParallelCompressionActive()) {
       EmitBlockForParallel(r->data_block.MutableBuffer(), r->last_ikey,
                            first_key_in_next_block);
@@ -1735,7 +1749,6 @@ void BlockBasedTableBuilder::WriteBlock(const Slice& uncompressed_block_data,
   if (is_data_block) {
     r->props.data_size = r->get_offset();
     r->props.uncompressed_data_size += uncompressed_block_data.size();
-    ++r->props.num_data_blocks;
   }
 }
 
@@ -1784,7 +1797,6 @@ void BlockBasedTableBuilder::BGWorker(WorkingAreaPair& working_area) {
       if (LIKELY(ios.ok())) {
         rep_->props.data_size = rep_->get_offset();
         rep_->props.uncompressed_data_size += block_rep->uncompressed.size();
-        ++rep_->props.num_data_blocks;
 
         rep_->index_builder->FinishIndexEntry(
             rep_->pending_handle, block_rep->prepared_index_entry.get(),
@@ -2701,6 +2713,8 @@ Status BlockBasedTableBuilder::Finish() {
 
   r->props.tail_start_offset = r->offset.LoadRelaxed();
 
+  uint64_t last_estimated_tail_size = EstimatedTailSize();
+
   // Write meta blocks, metaindex block and footer in the following order.
   //    1. [meta block: filter]
   //    2. [meta block: index]
@@ -2727,6 +2741,24 @@ Status BlockBasedTableBuilder::Finish() {
   r->state = Rep::State::kClosed;
   r->tail_size = r->offset.LoadRelaxed() - r->props.tail_start_offset;
 
+  // Assert tail size estimation is an overestimate only when tail size
+  // estimation option is enabled for compaction files with supported
+  // index/filter types:
+  // - Shortened indexes (kBinarySearch, kBinarySearchWithFirstKey)
+  // - Partitioned indexes (kTwoLevelIndexSearch)
+  // - Full filters
+  // - Partitioned filters
+  if (r->target_file_size_is_upper_bound &&
+      r->reason == TableFileCreationReason::kCompaction &&
+      r->table_options.index_type != BlockBasedTableOptions::kHashSearch) {
+    ROCKS_LOG_WARN(r->ioptions.info_log,
+                   "File number: %" PRIu64 ", Estimated tail size = %" PRIu64
+                   " bytes, Actual tail size = %" PRIu64 " bytes",
+                   r->props.orig_file_number, last_estimated_tail_size,
+                   r->tail_size);
+    assert(r->tail_size <= last_estimated_tail_size);
+  }
+
   return r->GetStatus();
 }
 
@@ -2764,6 +2796,49 @@ uint64_t BlockBasedTableBuilder::EstimatedFileSize() const {
   }
 }
 
+uint64_t BlockBasedTableBuilder::EstimatedTailSize() const {
+  uint64_t estimated_tail_size = 0;
+
+  // 1. Estimate index size
+  if (rep_->table_options.index_type ==
+      BlockBasedTableOptions::kTwoLevelIndexSearch) {
+    assert(rep_->p_index_builder_);
+    estimated_tail_size += rep_->p_index_builder_->CurrentIndexSizeEstimate();
+  } else {
+    assert(rep_->index_builder);
+    estimated_tail_size += rep_->index_builder->CurrentIndexSizeEstimate();
+  }
+
+  // 2. Estimate filter size
+  if (rep_->filter_builder) {
+    estimated_tail_size += rep_->filter_builder->CurrentFilterSizeEstimate();
+  }
+
+  // 3. Estimate compression dictionary size
+  if (rep_->compressor_with_dict) {
+    Slice dict = rep_->compressor_with_dict->GetSerializedDict();
+    if (!dict.empty()) {
+      estimated_tail_size += dict.size();
+    }
+  }
+
+  // 4. Estimate range deletion block size
+  if (!rep_->range_del_block.empty()) {
+    estimated_tail_size += rep_->range_del_block.CurrentSizeEstimate();
+  }
+
+  // 5. Estimate properties block size conservatively (~1-2KB)
+  estimated_tail_size += 2048;
+
+  // 6. Estimate meta-index block size conservatively (~1KB)
+  estimated_tail_size += 1024;
+
+  // 7. Add footer size
+  estimated_tail_size += Footer::kMaxEncodedLength;
+
+  return estimated_tail_size;
+}
+
 uint64_t BlockBasedTableBuilder::GetTailSize() const { return rep_->tail_size; }
 
 bool BlockBasedTableBuilder::NeedCompact() const {
diff --git a/table/block_based/block_based_table_builder.h b/table/block_based/block_based_table_builder.h
index db96a8929fa4..1bd2bcc2b30a 100644
--- a/table/block_based/block_based_table_builder.h
+++ b/table/block_based/block_based_table_builder.h
@@ -89,11 +89,15 @@ class BlockBasedTableBuilder : public TableBuilder {
   // Finish() call, returns the size of the final generated file.
   uint64_t FileSize() const override;
 
-  // Estimated size of the file generated so far. This is used when
-  // FileSize() cannot estimate final SST size, e.g. parallel compression
-  // is enabled.
+  // Estimated size of the file generated so far (based on data blocks, this
+  // estimate does not include meta blocks). This is used when FileSize() cannot
+  // estimate final SST size, e.g. parallel compression is enabled.
   uint64_t EstimatedFileSize() const override;
 
+  // Estimated tail size of the SST file generated so far. The "tail" refers to
+  // all blocks written after data blocks (index + filter).
+  uint64_t EstimatedTailSize() const override;
+
   // Get the size of the "tail" part of a SST file. "Tail" refers to
   // all blocks after data blocks till the end of the SST file.
   uint64_t GetTailSize() const override;
diff --git a/table/block_based/filter_block.h b/table/block_based/filter_block.h
index 6f502cc0e59b..e0c0d094554e 100644
--- a/table/block_based/filter_block.h
+++ b/table/block_based/filter_block.h
@@ -68,6 +68,18 @@ class FilterBlockBuilder {
   // For reporting stats on how many entries the builder considered unique
   virtual size_t EstimateEntriesAdded() = 0;
 
+  // Returns an estimate of the current filter size based on the builder's
+  // state. Implementations should cache the estimate and update it via
+  // UpdateFilterSizeEstimate() to avoid recalculating on every key add.
+  //
+  // Can be called at any time during table construction, even before calling
+  // Finish(). Used during table construction to determine when to cut files.
+  virtual size_t CurrentFilterSizeEstimate() = 0;
+
+  // Provides a hook for filter builder when a data block is finalized, such as
+  // to update cached filter size estimates.
+  virtual void OnDataBlockFinalized(uint64_t /* num_data_blocks */) {}
+
   // When using AddWithPrevKey, this must be called before Finish(). (May also
   // be called without AddWithPrevKey, but prev_key_without_ts must be
   // accurate regardless.)
@@ -110,6 +122,11 @@ class FilterBlockBuilder {
     return filter;
   }
 #endif  // NDEBUG
+
+ protected:
+  // Update cached filter size estimate. Subclasses should override to update
+  // estimates based on their internal state.
+  virtual void UpdateFilterSizeEstimate(uint64_t /* num_data_blocks */) {}
 };
 
 // A FilterBlockReader is used to parse filter from SST table.
diff --git a/table/block_based/filter_policy.cc b/table/block_based/filter_policy.cc
index 08314ccc9db0..cf83cf084575 100644
--- a/table/block_based/filter_policy.cc
+++ b/table/block_based/filter_policy.cc
@@ -29,6 +29,7 @@
 #include "table/block_based/block_based_table_reader.h"
 #include "table/block_based/filter_policy_internal.h"
 #include "table/block_based/full_filter_block.h"
+#include "util/atomic.h"
 #include "util/bloom_impl.h"
 #include "util/coding.h"
 #include "util/hash.h"
@@ -126,8 +127,11 @@ class XXPH3FilterBitsBuilder : public BuiltinFilterBitsBuilder {
     }
   }
 
+  // Returns an estimate of the number of entries added to the
+  // filter. This method is thread-safe and can be safely called
+  // from background threads during parallel compression.
   size_t EstimateEntriesAdded() override {
-    return hash_entries_info_.entries.size();
+    return hash_entries_info_.entries_count.LoadRelaxed();
   }
 
   Status MaybePostVerify(const Slice& filter_content) override;
@@ -147,6 +151,7 @@ class XXPH3FilterBitsBuilder : public BuiltinFilterBitsBuilder {
       hash_entries_info_.xor_checksum ^= hash;
     }
     hash_entries_info_.entries.push_back(hash);
+    hash_entries_info_.entries_count.FetchAddRelaxed(1);
     if (cache_res_mgr_ &&
         // Traditional rounding to whole bucket size
         ((hash_entries_info_.entries.size() %
@@ -314,6 +319,10 @@ class XXPH3FilterBitsBuilder : public BuiltinFilterBitsBuilder {
     // and has near-minimal peak memory use.
     std::deque<uint64_t> entries;
 
+    // Tracks the number of entries added for thread-safe
+    // size estimation.
+    RelaxedAtomic<size_t> entries_count{0};
+
     // If cache_res_mgr_ != nullptr,
     // it manages cache charge for buckets of hash entries in (new) Bloom
     // or Ribbon Filter construction.
@@ -332,6 +341,8 @@ class XXPH3FilterBitsBuilder : public BuiltinFilterBitsBuilder {
     void Swap(HashEntriesInfo* other) {
       assert(other != nullptr);
       std::swap(entries, other->entries);
+      entries_count.StoreRelaxed(
+          other->entries_count.ExchangeRelaxed(entries_count.LoadRelaxed()));
       std::swap(cache_res_bucket_handles, other->cache_res_bucket_handles);
       std::swap(xor_checksum, other->xor_checksum);
       std::swap(prev_alt_hash, other->prev_alt_hash);
@@ -339,6 +350,7 @@ class XXPH3FilterBitsBuilder : public BuiltinFilterBitsBuilder {
 
     void Reset() {
       entries.clear();
+      entries_count.StoreRelaxed(0);
       cache_res_bucket_handles.clear();
       xor_checksum = 0;
       prev_alt_hash = {};
diff --git a/table/block_based/full_filter_block.cc b/table/block_based/full_filter_block.cc
index af741787a32d..f0f1a958ae15 100644
--- a/table/block_based/full_filter_block.cc
+++ b/table/block_based/full_filter_block.cc
@@ -30,6 +30,37 @@ size_t FullFilterBlockBuilder::EstimateEntriesAdded() {
   return filter_bits_builder_->EstimateEntriesAdded();
 }
 
+void FullFilterBlockBuilder::OnDataBlockFinalized(uint64_t num_data_blocks) {
+  UpdateFilterSizeEstimate(num_data_blocks);
+}
+
+size_t FullFilterBlockBuilder::CurrentFilterSizeEstimate() {
+  return estimated_filter_size_;
+}
+
+void FullFilterBlockBuilder::UpdateFilterSizeEstimate(
+    uint64_t num_data_blocks) {
+  size_t entries_added = filter_bits_builder_->EstimateEntriesAdded();
+
+  if (entries_added == 0) {
+    estimated_filter_size_ = 0;
+    return;
+  }
+
+  // Calculate the estimated filter size in bytes.
+  // Estimate ~15 bits per key for bloom filters.
+  size_t filter_size = ((entries_added * 15) + 7) / 8;
+
+  // Reserve filter space for next data block ~2x the average.
+  size_t buffer_size = 0;
+  if (num_data_blocks > 0) {
+    buffer_size = (filter_size / num_data_blocks) * 2;
+    estimated_filter_size_ = filter_size + buffer_size;
+  } else {
+    estimated_filter_size_ = filter_size;
+  }
+}
+
 void FullFilterBlockBuilder::AddWithPrevKey(
     const Slice& key_without_ts, const Slice& /*prev_key_without_ts*/) {
   FullFilterBlockBuilder::Add(key_without_ts);
diff --git a/table/block_based/full_filter_block.h b/table/block_based/full_filter_block.h
index 784f0eb881c3..96e8300b2086 100644
--- a/table/block_based/full_filter_block.h
+++ b/table/block_based/full_filter_block.h
@@ -57,6 +57,8 @@ class FullFilterBlockBuilder : public FilterBlockBuilder {
     return filter_bits_builder_->EstimateEntriesAdded() == 0;
   }
   size_t EstimateEntriesAdded() override;
+  size_t CurrentFilterSizeEstimate() override;
+  void OnDataBlockFinalized(uint64_t num_data_blocks) override;
   Status Finish(const BlockHandle& last_partition_block_handle, Slice* filter,
                 std::unique_ptr<const char[]>* filter_owner = nullptr) override;
   using FilterBlockBuilder::Finish;
@@ -73,6 +75,8 @@ class FullFilterBlockBuilder : public FilterBlockBuilder {
 
   std::unique_ptr<FilterBitsBuilder> filter_bits_builder_;
 
+  void UpdateFilterSizeEstimate(uint64_t num_data_blocks_written) override;
+
  private:
   // important: all of these might point to invalid addresses
   // at the time of destruction of this filter block. destructor
@@ -80,6 +84,8 @@ class FullFilterBlockBuilder : public FilterBlockBuilder {
   const SliceTransform* const prefix_extractor_;
   const bool whole_key_filtering_;
   std::unique_ptr<const char[]> filter_data_;
+
+  size_t estimated_filter_size_ = 0;
 };
 
 // A FilterBlockReader is used to parse filter from SST table.
diff --git a/table/block_based/index_builder.cc b/table/block_based/index_builder.cc
index 56e539da1eb5..7731f42790fa 100644
--- a/table/block_based/index_builder.cc
+++ b/table/block_based/index_builder.cc
@@ -117,20 +117,18 @@ Slice ShortenedIndexBuilder::FindShortInternalKeySuccessor(
   }
 }
 
-uint64_t ShortenedIndexBuilder::EstimateCurrentIndexSize() const {
+void ShortenedIndexBuilder::UpdateIndexSizeEstimate() {
   uint64_t current_size =
-      must_use_separator_with_seq_
+      must_use_separator_with_seq_.LoadRelaxed()
           ? index_block_builder_.CurrentSizeEstimate()
           : index_block_builder_without_seq_.CurrentSizeEstimate();
 
-  if (num_index_entries_ == 0) {
-    return current_size;
+  uint64_t final_estimate = current_size;
+  if (num_index_entries_ > 0) {
+    // Add buffer to generously account (in most cases) for the next index entry
+    final_estimate += (2 * (current_size / num_index_entries_));
   }
-
-  uint64_t avg_entry_size = current_size / num_index_entries_;
-
-  // Add buffer to generously account (in most cases) for the next index entry
-  return current_size + (2 * avg_entry_size);
+  estimated_index_size_.StoreRelaxed(final_estimate);
 }
 
 PartitionedIndexBuilder* PartitionedIndexBuilder::CreateIndexBuilder(
@@ -188,8 +186,8 @@ void PartitionedIndexBuilder::MakeNewSubIndexBuilder() {
   // must_use_separator_with_seq_ is true (internal-key mode) (set to false by
   // default on Creation) so that flush policy can point to
   // sub_index_builder_->index_block_builder_
-  if (must_use_separator_with_seq_) {
-    sub_index_builder_->must_use_separator_with_seq_ = true;
+  if (must_use_separator_with_seq_.LoadRelaxed()) {
+    sub_index_builder_->must_use_separator_with_seq_.StoreRelaxed(true);
     builder_to_monitor = &sub_index_builder_->index_block_builder_;
   } else {
     builder_to_monitor = &sub_index_builder_->index_block_builder_without_seq_;
@@ -237,6 +235,11 @@ void PartitionedIndexBuilder::MaybeFlush(const Slice& index_key,
                        index_key, EncodedBlockHandle(index_value).AsSlice()));
   if (do_flush) {
     assert(entries_.back().value.get() == sub_index_builder_);
+
+    // Update estimate of completed partitions when a partition is flushed
+    estimated_completed_partitions_size_.FetchAddRelaxed(
+        sub_index_builder_->CurrentIndexSizeEstimate());
+
     cut_filter_block = true;
     MakeNewSubIndexBuilder();
   }
@@ -254,9 +257,15 @@ void PartitionedIndexBuilder::FinishIndexEntry(const BlockHandle& block_handle,
                                        skip_delta_encoding);
   std::swap(entries_.back().key, entry->separator_with_seq);
 
-  if (!must_use_separator_with_seq_ && entry->must_use_separator_with_seq) {
+  // Update cached size estimate when data blocks are finalized for more
+  // accurate tail size estimation. This is needed for parallel compression
+  // which uses FinishIndexEntry() instead of AddIndexEntry().
+  UpdateIndexSizeEstimate();
+
+  if (!must_use_separator_with_seq_.LoadRelaxed() &&
+      entry->must_use_separator_with_seq) {
     // We need to apply !must_use_separator_with_seq to all sub-index builders
-    must_use_separator_with_seq_ = true;
+    must_use_separator_with_seq_.StoreRelaxed(true);
     flush_policy_->Retarget(sub_index_builder_->index_block_builder_);
   }
   // NOTE: not compatible with coupled partitioned filters so don't need to
@@ -278,10 +287,15 @@ Slice PartitionedIndexBuilder::AddIndexEntry(
       separator_scratch, skip_delta_encoding);
   entries_.back().key.assign(sep.data(), sep.size());
 
-  if (!must_use_separator_with_seq_ &&
-      sub_index_builder_->must_use_separator_with_seq_) {
+  // Update cached size estimate when data blocks are finalized for more
+  // accurate tail size estimation. This ensures the estimate reflects current
+  // state after each data block is added.
+  UpdateIndexSizeEstimate();
+
+  if (!must_use_separator_with_seq_.LoadRelaxed() &&
+      sub_index_builder_->must_use_separator_with_seq_.LoadRelaxed()) {
     // We need to apply !must_use_separator_with_seq to all sub-index builders
-    must_use_separator_with_seq_ = true;
+    must_use_separator_with_seq_.StoreRelaxed(true);
     flush_policy_->Retarget(sub_index_builder_->index_block_builder_);
   }
   if (UNLIKELY(first_key_in_next_block == nullptr)) {
@@ -315,7 +329,7 @@ Status PartitionedIndexBuilder::Finish(
     const Slice handle_delta_encoding_slice(handle_delta_encoding);
     index_block_builder_.Add(last_entry.key, handle_encoding.AsSlice(),
                              &handle_delta_encoding_slice);
-    if (!must_use_separator_with_seq_) {
+    if (!must_use_separator_with_seq_.LoadRelaxed()) {
       index_block_builder_without_seq_.Add(ExtractUserKey(last_entry.key),
                                            handle_encoding.AsSlice(),
                                            &handle_delta_encoding_slice);
@@ -324,7 +338,7 @@ Status PartitionedIndexBuilder::Finish(
   }
   // If there is no sub_index left, then return the 2nd level index.
   if (UNLIKELY(entries_.empty())) {
-    if (must_use_separator_with_seq_) {
+    if (must_use_separator_with_seq_.LoadRelaxed()) {
       index_blocks->index_block_contents = index_block_builder_.Finish();
     } else {
       index_blocks->index_block_contents =
@@ -338,7 +352,8 @@ Status PartitionedIndexBuilder::Finish(
     // expect more calls to Finish
     Entry& entry = entries_.front();
     // Apply the policy to all sub-indexes
-    entry.value->must_use_separator_with_seq_ = must_use_separator_with_seq_;
+    entry.value->must_use_separator_with_seq_.StoreRelaxed(
+        must_use_separator_with_seq_.LoadRelaxed());
     auto s = entry.value->Finish(index_blocks);
     index_size_ += index_blocks->index_block_contents.size();
     finishing_indexes_ = true;
@@ -347,4 +362,49 @@ Status PartitionedIndexBuilder::Finish(
 }
 
 size_t PartitionedIndexBuilder::NumPartitions() const { return partition_cnt_; }
+
+void PartitionedIndexBuilder::UpdateIndexSizeEstimate() {
+  uint64_t total_size = 0;
+
+  // Ignore last entry which is a placeholder for the partition being built
+  size_t completed_partitions = entries_.size() > 0 ? entries_.size() - 1 : 0;
+
+  // Use running estimate of completed partitions instead of IndexSize() which
+  // is only available after calling Finish().
+  uint64_t completed_partitions_size =
+      estimated_completed_partitions_size_.LoadRelaxed();
+  total_size += completed_partitions_size;
+
+  // Add current active partition size if it exists
+  uint64_t current_sub_index_size = 0;
+  if (sub_index_builder_ != nullptr) {
+    current_sub_index_size = sub_index_builder_->CurrentIndexSizeEstimate();
+    total_size += current_sub_index_size;
+  }
+
+  // Add buffer for top-level index and next partition
+  uint64_t buffer_size = 0;
+  if (completed_partitions > 0) {
+    // Calculate top-level index size. Each top-level entry consists of:
+    // separator key (~20-50 bytes) + BlockHandle (~20 bytes) + overhead
+    // Estimate ~70 bytes per top-level entry as a reasonable average
+    auto estimated_top_level_size = completed_partitions * 70;
+    total_size += completed_partitions * 70;
+
+    // Buffer for next partition + next top-level entry
+    uint64_t avg_partition_size =
+        completed_partitions_size / completed_partitions;
+    uint64_t avg_top_level_entry_size =
+        estimated_top_level_size / completed_partitions;
+
+    buffer_size = 2 * (avg_partition_size + avg_top_level_entry_size);
+    total_size += buffer_size;
+  } else if (sub_index_builder_ != nullptr) {
+    // For the first partition, estimate using the current partition's state
+    buffer_size = 2 * current_sub_index_size;
+    total_size += buffer_size;
+  }
+  estimated_index_size_.StoreRelaxed(total_size);
+}
+
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/table/block_based/index_builder.h b/table/block_based/index_builder.h
index b1e9dea46cfb..fea042b1f8be 100644
--- a/table/block_based/index_builder.h
+++ b/table/block_based/index_builder.h
@@ -158,12 +158,15 @@ class IndexBuilder {
   // Get the size for index block. Must be called after ::Finish.
   virtual size_t IndexSize() const = 0;
 
-  // Get an estimate for current total index size based on current builder
-  // state.
+  // Returns an estimate of the current index size based on the builder's state.
+  // Implementations should cache the estimate and update it via
+  // UpdateIndexSizeEstimate() to avoid recalculating on every key add,
+  // which is critical for performance in the compaction hot path.
   //
-  // Called during compaction to estimate final index size for file cutting
-  // decisions.
-  virtual uint64_t EstimateCurrentIndexSize() const = 0;
+  // This function is only called by the SST "emit thread" but must be
+  // thread safe with concurrent calls to UpdateIndexSizeEstimate() from another
+  // thread (such as during parallel compression).
+  virtual uint64_t CurrentIndexSizeEstimate() const = 0;
 
   virtual bool separator_is_key_plus_seq() { return true; }
 
@@ -187,6 +190,13 @@ class IndexBuilder {
                      l_user_key, r_user_key) == 0;
   }
 
+  // Updates the cached index size estimate used by CurrentIndexSizeEstimate().
+  //
+  // This function can be called from the SST "write thread" (via
+  // FinishIndexEntry()), and needs to be thread safe with
+  // CurrentIndexSizeEstimate() called from the SST "emit thread".
+  virtual void UpdateIndexSizeEstimate() {}
+
   const InternalKeyComparator* comparator_;
   // Size of user-defined timestamp in bytes.
   size_t ts_sz_;
@@ -234,7 +244,7 @@ class ShortenedIndexBuilder : public IndexBuilder {
         include_first_key_(include_first_key),
         shortening_mode_(shortening_mode) {
     // Making the default true will disable the feature for old versions
-    must_use_separator_with_seq_ = (format_version <= 2);
+    must_use_separator_with_seq_.StoreRelaxed(format_version <= 2);
   }
 
   void OnKeyAdded(const Slice& key,
@@ -257,10 +267,10 @@ class ShortenedIndexBuilder : public IndexBuilder {
       } else {
         separator_with_seq = last_key_in_current_block;
       }
-      if (!must_use_separator_with_seq_ &&
+      if (!must_use_separator_with_seq_.LoadRelaxed() &&
           ShouldUseKeyPlusSeqAsSeparator(last_key_in_current_block,
                                          *first_key_in_next_block)) {
-        must_use_separator_with_seq_ = true;
+        must_use_separator_with_seq_.StoreRelaxed(true);
       }
     } else {
       if (shortening_mode_ == BlockBasedTableOptions::IndexShorteningMode::
@@ -333,6 +343,7 @@ class ShortenedIndexBuilder : public IndexBuilder {
     }
 
     ++num_index_entries_;
+    UpdateIndexSizeEstimate();
   }
 
   Slice AddIndexEntry(const Slice& last_key_in_current_block,
@@ -347,7 +358,8 @@ class ShortenedIndexBuilder : public IndexBuilder {
     Slice first_internal_key = GetFirstInternalKey(&first_internal_key_buf);
 
     AddIndexEntryImpl(separator_with_seq, first_internal_key, block_handle,
-                      must_use_separator_with_seq_, skip_delta_encoding);
+                      must_use_separator_with_seq_.LoadRelaxed(),
+                      skip_delta_encoding);
     current_block_first_internal_key_.clear();
     return separator_with_seq;
   }
@@ -396,7 +408,7 @@ class ShortenedIndexBuilder : public IndexBuilder {
                             &entry->separator_with_seq);
     Slice first_internal_key = GetFirstInternalKey(&entry->first_internal_key);
     entry->SaveFrom(separator, first_internal_key,
-                    must_use_separator_with_seq_);
+                    must_use_separator_with_seq_.LoadRelaxed());
     current_block_first_internal_key_.clear();
   }
 
@@ -413,7 +425,7 @@ class ShortenedIndexBuilder : public IndexBuilder {
   using IndexBuilder::Finish;
   Status Finish(IndexBlocks* index_blocks,
                 const BlockHandle& /*last_partition_block_handle*/) override {
-    if (must_use_separator_with_seq_) {
+    if (must_use_separator_with_seq_.LoadRelaxed()) {
       index_blocks->index_block_contents = index_block_builder_.Finish();
     } else {
       index_blocks->index_block_contents =
@@ -425,10 +437,15 @@ class ShortenedIndexBuilder : public IndexBuilder {
 
   size_t IndexSize() const override { return index_size_; }
 
-  uint64_t EstimateCurrentIndexSize() const override;
+  uint64_t CurrentIndexSizeEstimate() const override {
+    return estimated_index_size_.LoadRelaxed();
+  }
+
+  // Updates the cached size estimate to minimize CPU usage in hot path
+  void UpdateIndexSizeEstimate() override;
 
   bool separator_is_key_plus_seq() override {
-    return must_use_separator_with_seq_;
+    return must_use_separator_with_seq_.LoadRelaxed();
   }
 
   // Changes *key to a short string >= *key.
@@ -452,12 +469,14 @@ class ShortenedIndexBuilder : public IndexBuilder {
   // before).
   BlockBuilder index_block_builder_without_seq_;
   const bool use_value_delta_encoding_;
-  bool must_use_separator_with_seq_;
+  RelaxedAtomic<bool> must_use_separator_with_seq_;
   const bool include_first_key_;
   BlockBasedTableOptions::IndexShorteningMode shortening_mode_;
   BlockHandle last_encoded_handle_ = BlockHandle::NullBlockHandle();
   std::string current_block_first_internal_key_;
   uint64_t num_index_entries_ = 0;
+  // Cache for index size estimate to avoid recalculating in hot path
+  RelaxedAtomic<uint64_t> estimated_index_size_{0};
 };
 
 // HashIndexBuilder contains a binary-searchable primary index and the
@@ -579,8 +598,7 @@ class HashIndexBuilder : public IndexBuilder {
            prefix_meta_block_.size();
   }
 
-  // TODO: implement
-  uint64_t EstimateCurrentIndexSize() const override { return 0; }
+  uint64_t CurrentIndexSizeEstimate() const override { return 0; }
 
   bool separator_is_key_plus_seq() override {
     return primary_index_builder_.separator_is_key_plus_seq();
@@ -658,8 +676,11 @@ class PartitionedIndexBuilder : public IndexBuilder {
   size_t TopLevelIndexSize(uint64_t) const { return top_level_index_size_; }
   size_t NumPartitions() const;
 
-  // TODO: implement
-  uint64_t EstimateCurrentIndexSize() const override { return 0; }
+  // Returns a cached estimate of the current index size. This
+  // estimate is updated when data blocks are added.
+  uint64_t CurrentIndexSizeEstimate() const override {
+    return estimated_index_size_.LoadRelaxed();
+  }
 
   inline bool ShouldCutFilterBlock() {
     // Current policy is to align the partitions of index and filters
@@ -679,8 +700,10 @@ class PartitionedIndexBuilder : public IndexBuilder {
   // cutting the next partition
   void RequestPartitionCut();
 
+  // This function must be thread safe because multiple worker threads might
+  // update the index builder state during parallel compression.
   bool separator_is_key_plus_seq() override {
-    return must_use_separator_with_seq_;
+    return must_use_separator_with_seq_.LoadRelaxed();
   }
 
   bool get_use_value_delta_encoding() const {
@@ -694,6 +717,7 @@ class PartitionedIndexBuilder : public IndexBuilder {
   size_t partition_cnt_ = 0;
 
   void MakeNewSubIndexBuilder();
+  void UpdateIndexSizeEstimate() override;
 
   struct Entry {
     std::string key;
@@ -713,7 +737,7 @@ class PartitionedIndexBuilder : public IndexBuilder {
   // true if Finish is called once but not complete yet.
   bool finishing_indexes_ = false;
   const BlockBasedTableOptions& table_opt_;
-  bool must_use_separator_with_seq_;
+  RelaxedAtomic<bool> must_use_separator_with_seq_;
   bool use_value_delta_encoding_;
   // true if an external entity (such as filter partition builder) request
   // cutting the next partition
@@ -721,5 +745,9 @@ class PartitionedIndexBuilder : public IndexBuilder {
   // true if it should cut the next filter partition block
   bool cut_filter_block = false;
   BlockHandle last_encoded_handle_;
+  // Cached estimate of current index size, updated when data blocks are added
+  RelaxedAtomic<uint64_t> estimated_index_size_{0};
+  // Running estimate of completed partitions total size
+  RelaxedAtomic<uint64_t> estimated_completed_partitions_size_{0};
 };
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/table/block_based/index_builder_test.cc b/table/block_based/index_builder_test.cc
deleted file mode 100644
index d398c214b70b..000000000000
--- a/table/block_based/index_builder_test.cc
+++ /dev/null
@@ -1,183 +0,0 @@
-//  Copyright (c) Meta Platforms, Inc. and affiliates.
-//  This source code is licensed under both the GPLv2 (found in the
-//  COPYING file in the root directory) and Apache 2.0 License
-//  (found in the LICENSE.Apache file in the root directory).
-
-#include "table/block_based/index_builder.h"
-
-#include <gtest/gtest.h>
-
-#include <memory>
-#include <string>
-
-#include "db/dbformat.h"
-#include "rocksdb/comparator.h"
-#include "table/format.h"
-#include "test_util/testharness.h"
-
-namespace ROCKSDB_NAMESPACE {
-
-class IndexBuilderTest
-    : public testing::Test,
-      public testing::WithParamInterface<BlockBasedTableOptions::IndexType> {
- public:
-  IndexBuilderTest() : icomp_(BytewiseComparator()) {}
-
-  std::unique_ptr<IndexBuilder> CreateIndexBuilder() {
-    BlockBasedTableOptions table_options;
-    BlockBasedTableOptions::IndexType index_type = GetParam();
-    return std::unique_ptr<IndexBuilder>(IndexBuilder::CreateIndexBuilder(
-        index_type, &icomp_, nullptr, false /* use_value_delta_encoding */,
-        table_options, 0 /* ts_sz */,
-        true /* persist_user_defined_timestamps */));
-  }
-
-  std::string MakeKey(int i) {
-    return InternalKey(std::string("key") + std::to_string(i), 100 - i,
-                       kTypeValue)
-        .Encode()
-        .ToString();
-  }
-
-  BlockHandle MakeBlockHandle(uint64_t offset, uint64_t size) {
-    BlockHandle handle;
-    handle.set_offset(offset);
-    handle.set_size(size);
-    return handle;
-  }
-
-  void AddEntriesToBuilder(IndexBuilder* builder, int num_entries,
-                           std::vector<uint64_t>* estimates = nullptr) {
-    for (int i = 1; i <= num_entries; ++i) {
-      std::string key_current = MakeKey(i);
-      BlockHandle handle = MakeBlockHandle(i * kBlockOffset, kBlockSize);
-      std::string separator_scratch;
-
-      if (i == num_entries) {
-        // Last entry - no next key
-        builder->AddIndexEntry(key_current, nullptr, handle, &separator_scratch,
-                               false);
-      } else {
-        std::string key_next = MakeKey(i + 1);
-        Slice key_next_slice(key_next);
-        builder->AddIndexEntry(key_current, &key_next_slice, handle,
-                               &separator_scratch, false);
-      }
-
-      if (estimates) {
-        uint64_t current_estimate = builder->EstimateCurrentIndexSize();
-        estimates->push_back(current_estimate);
-      }
-    }
-  }
-
- protected:
-  InternalKeyComparator icomp_;
-  static const uint64_t kBlockOffset = 1000;
-  static const uint64_t kBlockSize = 4096;
-  // BlockBuilder initial overhead
-  // See BlockBuilder constructor and Reset()
-  static const uint64_t kBlockBuilderInitialOverhead = 2 * sizeof(uint32_t);
-};
-
-const uint64_t IndexBuilderTest::kBlockOffset;
-const uint64_t IndexBuilderTest::kBlockSize;
-const uint64_t IndexBuilderTest::kBlockBuilderInitialOverhead;
-
-TEST_P(IndexBuilderTest, EstimateCurrentIndexSize) {
-  auto builder = CreateIndexBuilder();
-  BlockBasedTableOptions::IndexType index_type = GetParam();
-
-  // Empty builder
-  uint64_t empty_size = builder->EstimateCurrentIndexSize();
-  if (index_type == BlockBasedTableOptions::kBinarySearch) {
-    EXPECT_EQ(empty_size, kBlockBuilderInitialOverhead)
-        << "Empty ShortenedIndexBuilder should return BlockBuilder initial "
-           "overhead ("
-        << kBlockBuilderInitialOverhead;
-  } else {
-    EXPECT_EQ(empty_size, 0) << "Other builders should return 0 when empty";
-  }
-
-  // Add one entry
-  AddEntriesToBuilder(builder.get(), 1);
-  uint64_t size_after_one = builder->EstimateCurrentIndexSize();
-
-  if (index_type == BlockBasedTableOptions::kBinarySearch) {
-    EXPECT_GT(size_after_one, kBlockBuilderInitialOverhead)
-        << "Estimate should be greater than initial overhead";
-  } else {
-    // Other builders currently return 0 (which is expected)
-    EXPECT_EQ(size_after_one, 0) << "Other index builders currently return 0";
-  }
-
-  // Add multiple entries and capture all estimates
-  std::vector<uint64_t> estimates;
-  auto new_builder = CreateIndexBuilder();
-  AddEntriesToBuilder(new_builder.get(), 5, &estimates);
-
-  // Validate reported estimates
-  for (size_t i = 0; i < estimates.size(); ++i) {
-    uint64_t estimate = estimates[i];
-
-    if (index_type == BlockBasedTableOptions::kBinarySearch) {
-      EXPECT_GT(estimate, 0)
-          << "Estimate should be positive for " << i << " entry";
-      if (i > 0) {
-        EXPECT_GT(estimate, estimates[i - 1])
-            << "Estimate should not decrease with more entries (entry " << i - 1
-            << ": " << estimates[i - 1] << ", entry " << i << ": " << estimate
-            << ")";
-      }
-    } else {
-      EXPECT_EQ(estimate, 0) << "Other index builders currently return 0";
-    }
-  }
-
-  // Multiple calls should return the same value if the builder state is not
-  // modified
-  uint64_t estimate1 = builder->EstimateCurrentIndexSize();
-  uint64_t estimate2 = builder->EstimateCurrentIndexSize();
-  uint64_t estimate3 = builder->EstimateCurrentIndexSize();
-
-  EXPECT_EQ(estimate1, estimate2);
-  EXPECT_EQ(estimate2, estimate3);
-
-  // Test behavior after Finish() - only for builders that can be finished
-  // successfully
-  if (index_type == BlockBasedTableOptions::kBinarySearch) {
-    uint64_t estimate_before_finish = builder->EstimateCurrentIndexSize();
-
-    IndexBuilder::IndexBlocks index_blocks;
-    Status s = builder->Finish(&index_blocks);
-    EXPECT_TRUE(s.ok()) << "ShortenedIndexBuilder should finish successfully: "
-                        << s.ToString();
-
-    uint64_t estimate_after_finish = builder->EstimateCurrentIndexSize();
-    EXPECT_GT(estimate_after_finish, 0);
-    EXPECT_LE(estimate_before_finish, estimate_after_finish)
-        << "Estimate should not decrease after finish";
-
-    // Ensure that the actual index size is not greater than the estimated size
-    // after finish is called to prevent underestimation.
-    uint64_t actual_index_size = builder->IndexSize();
-    EXPECT_LE(actual_index_size, estimate_after_finish)
-        << "Actual index size should not be greater than estimated size: "
-           "actual size:  "
-        << actual_index_size << ", estimated size: " << estimate_after_finish;
-  }
-}
-
-INSTANTIATE_TEST_CASE_P(
-    IndexBuilderTypes, IndexBuilderTest,
-    ::testing::Values(BlockBasedTableOptions::kBinarySearch,
-                      BlockBasedTableOptions::kHashSearch,
-                      BlockBasedTableOptions::kTwoLevelIndexSearch));
-
-}  // namespace ROCKSDB_NAMESPACE
-
-int main(int argc, char** argv) {
-  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
-  ::testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/table/block_based/partitioned_filter_block.cc b/table/block_based/partitioned_filter_block.cc
index c7024895453f..95c1cf32a2e8 100644
--- a/table/block_based/partitioned_filter_block.cc
+++ b/table/block_based/partitioned_filter_block.cc
@@ -143,6 +143,7 @@ void PartitionedFilterBlockBuilder::CutAFilterBlock(const Slice* next_key,
     ikey = p_index_builder_->GetPartitionKey();
   }
   filters_.push_back({std::move(ikey), std::move(filter_data), filter});
+  completed_partitions_size_.FetchAddRelaxed(filter.size());
   partitioned_filters_construction_status_.UpdateIfOk(
       filter_construction_status);
 
@@ -209,6 +210,56 @@ size_t PartitionedFilterBlockBuilder::EstimateEntriesAdded() {
   return total_added_in_built_ + filter_bits_builder_->EstimateEntriesAdded();
 }
 
+size_t PartitionedFilterBlockBuilder::CurrentFilterSizeEstimate() {
+  size_t active_partition_size =
+      filter_bits_builder_->EstimateEntriesAdded() * 2;  // 2 bytes per key
+
+  return estimated_filter_size_.LoadRelaxed() + active_partition_size;
+}
+
+void PartitionedFilterBlockBuilder::OnDataBlockFinalized(
+    uint64_t num_data_blocks) {
+  UpdateFilterSizeEstimate(num_data_blocks);
+}
+
+void PartitionedFilterBlockBuilder::UpdateFilterSizeEstimate(
+    uint64_t num_data_blocks) {
+  size_t partitions_size = completed_partitions_size_.LoadRelaxed();
+
+  // Reserve space if no partitions have been cut
+  size_t active_filter_estimate = 0;
+  if (partitions_size == 0) {
+    size_t avg_bytes_per_entry =
+        2;  // 2 bytes per entry, approx 15 bits per key
+
+    // Estimate using keys_per_partition_ since we expect to cut the first
+    // partition once it reaches approx. this many entries.
+    active_filter_estimate = keys_per_partition_ * avg_bytes_per_entry;
+
+    // Add a 2x buffer (for top-level index, etc.)
+    active_filter_estimate = active_filter_estimate * 2;
+  }
+  size_t filter_estimate = std::max(partitions_size, active_filter_estimate);
+
+  // Estimate top-level partition index size
+  if (p_index_builder_->separator_is_key_plus_seq()) {
+    filter_estimate += index_on_filter_block_builder_.CurrentSizeEstimate();
+  } else {
+    filter_estimate +=
+        index_on_filter_block_builder_without_seq_.CurrentSizeEstimate();
+  }
+
+  // Reserve filter space for the next data block
+  size_t reserved = 0;
+  if (num_data_blocks > 0) {
+    reserved = (filter_estimate / num_data_blocks) *
+               2;  // 2x average size per data block
+    estimated_filter_size_.StoreRelaxed(filter_estimate + reserved);
+  } else {
+    estimated_filter_size_.StoreRelaxed(filter_estimate);
+  }
+}
+
 void PartitionedFilterBlockBuilder::PrevKeyBeforeFinish(
     const Slice& prev_key_without_ts) {
   assert(prev_key_without_ts.compare(DEBUG_add_with_prev_key_called_
diff --git a/table/block_based/partitioned_filter_block.h b/table/block_based/partitioned_filter_block.h
index 8faed24a92db..96f39dd4f01a 100644
--- a/table/block_based/partitioned_filter_block.h
+++ b/table/block_based/partitioned_filter_block.h
@@ -18,6 +18,7 @@
 #include "table/block_based/filter_block_reader_common.h"
 #include "table/block_based/full_filter_block.h"
 #include "table/block_based/index_builder.h"
+#include "util/atomic.h"
 #include "util/autovector.h"
 #include "util/hash_containers.h"
 
@@ -46,6 +47,8 @@ class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder {
   }
 
   size_t EstimateEntriesAdded() override;
+  size_t CurrentFilterSizeEstimate() override;
+  void OnDataBlockFinalized(uint64_t num_data_blocks) override;
 
   void PrevKeyBeforeFinish(const Slice& prev_key_without_ts) override;
   Status Finish(const BlockHandle& last_partition_block_handle, Slice* filter,
@@ -67,6 +70,11 @@ class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder {
     return Status::OK();
   }
 
+ protected:
+  // Needs to be thread-safe to be invoked from background worker
+  // thread when parallel compression is enabled.
+  void UpdateFilterSizeEstimate(uint64_t num_data_blocks) override;
+
  private:  // fns
   // Whether to cut a filter block before the next key
   bool DecideCutAFilterBlock();
@@ -92,6 +100,11 @@ class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder {
   };
   std::deque<FilterEntry> filters_;  // list of partitioned filters and keys
                                      // used in building the index
+  // Running total of completed filter partition sizes to avoid
+  // iterating over filters_ deque, which can be concurrently modified by
+  // the main thread when parallel compression is enabled.
+  RelaxedAtomic<size_t> completed_partitions_size_{0};
+
   // The desired number of keys per partition
   uint32_t keys_per_partition_;
   // According to the bits builders, how many keys/prefixes added
@@ -107,6 +120,12 @@ class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder {
   // For Add without prev key
   std::string prev_key_without_ts_;
 
+  // Cached filter size estimate for hot path performance - updated only when
+  // data blocks are written for meaningful estimate updates.
+  // Must be atomic since UpdateFilterSizeEstimate() can be called from
+  // background worker threads when parallel compression is enabled.
+  RelaxedAtomic<size_t> estimated_filter_size_{0};
+
 #ifndef NDEBUG
   // For verifying accurate previous keys are provided by the caller, so that
   // release code can be fast
diff --git a/table/block_based/user_defined_index_wrapper.h b/table/block_based/user_defined_index_wrapper.h
index 37860eef38e6..b65ba147e2fc 100644
--- a/table/block_based/user_defined_index_wrapper.h
+++ b/table/block_based/user_defined_index_wrapper.h
@@ -158,7 +158,7 @@ class UserDefinedIndexBuilderWrapper : public IndexBuilder {
 
   size_t IndexSize() const override { return index_size_; }
 
-  uint64_t EstimateCurrentIndexSize() const override { return 0; }
+  uint64_t CurrentIndexSizeEstimate() const override { return 0; }
 
   bool separator_is_key_plus_seq() override {
     return internal_index_builder_->separator_is_key_plus_seq();
diff --git a/table/table_builder.h b/table/table_builder.h
index 64a1ab02791d..63ab175b5f60 100644
--- a/table/table_builder.h
+++ b/table/table_builder.h
@@ -225,6 +225,11 @@ class TableBuilder {
   // is enabled.
   virtual uint64_t EstimatedFileSize() const { return FileSize(); }
 
+  // Estimated tail size of the SST file generated so far. The "tail" refers to
+  // all blocks written after data blocks (index + filter). This value helps
+  // estimate the total file size when deciding when to cut files.
+  virtual uint64_t EstimatedTailSize() const { return 0; }
+
   virtual uint64_t GetTailSize() const { return 0; }
 
   // If the user defined table properties collector suggest the file to
diff --git a/unreleased_history/new_features/target_file_size_is_upper_bound.md b/unreleased_history/new_features/target_file_size_is_upper_bound.md
new file mode 100644
index 000000000000..4dc578949f7a
--- /dev/null
+++ b/unreleased_history/new_features/target_file_size_is_upper_bound.md
@@ -0,0 +1 @@
+Added new option target_file_size_is_upper_bound  that makes most compaction output SST files come close to the target file size without exceeding it, rather than commonly exceeding it by some fraction (current behavior). For now the new behavior is off by default, but we expect to enable it by default in the future.

From 37176a4a440cf1acd784ac5cf5b8ed81783e9481 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Fri, 7 Nov 2025 09:04:52 -0800
Subject: [PATCH 367/500] Auto-tune manifest file size (#14076)

Summary:
Adds auto-tuning of manifest file size to avoid the need to scale `max_manifest_file_size` in proportion to things like number of SST files to properly balance (a) manifest file write amp and new file creation, vs. (b) manifest file space amp and replay time, including non-incremental space usage in backups. (Manifest file write amp comes from re-writing a "live" record when the manifest file is re-created, or "compacted"; space amp is usage beyond what would be used by a compacted manifest file.) In more detail,

* Add new option `max_manifest_space_amp_pct` with default value of 500, which defaults to 0.2 write amp and up to roughly 5.0 space amp, except `max_manifest_file_size` is treated as the "minimum" size before re-creating ("compacting") the manifest file.
* `max_manifest_file_size` in a way means the same thing, with the same default of 1GB, but in a way has taken on a new role. What is the same is that we do not re-create the manifest file before reaching this size (except for DB re-open), and so users are very unlikely to see a change in default behavior (auto-tuning only kicking in if auto-tuning would exceed 1GB for effective max size for the current manifest file). The new role is as a file size lower bound before auto-tuning kicks in, to minimize churn in files considered "negligibly small." We recommend a new setting of around 1MB or even smaller like 64KB, and expect something like this to become the default soon.
* These two options along with `manifest_preallocation_size` are now mutable with SetDBOptions. The effect is nearly immediate, affecting the next write to the current manifest file.

Also in this PR:
* Refactoring of VersionSet to allow it to get (more) settings from MutableDBOptions. This touches a number of files in not very interesting ways, but notably we have to be careful about thread-safe access to MutableDBOptions fields, and even fields within VersionSet. I have decided to save copies of relevant fields from MutableDBOptions to simplify testing, etc. by not saving a reference to MutableDBOptions but getting notified of updates.
* Updated some logging in VersionSet to provide some basic data about final and compacted manifest sizes (effects of auto-tuning), making sure to avoid I/O while holding DB mutex.
* Added db_etc3_test.cc which is intended as a successor to db_test and db_test2, but having "test.cc" in its name for easier exclusion of test files when using `git grep`. Intended follow-up: rename db_test2 to db_etc2_test
* Moved+updated `ManifestRollOver` test to the new file to be closer to other manifest file rollover testing.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14076

Test Plan:
As for correctness, new unit test AutoTuneManifestSize is pretty thorough. Some other unit tests updated appropriately. Manual tests in the performance section were also audited for expected behavior based on the new logging in the DB LOG. Example LOG data with -max_manifest_file_size=2048 -max_manifest_space_amp_pct=500:

```
2025/10/24-11:12:48.979472 2150678 [/version_set.cc:5927] Created manifest 5, compacted+appended from 52 to 116
2025/10/24-11:12:49.626441 2150682 [/version_set.cc:5927] Created manifest 24, compacted+appended from 2169 to 1801
2025/10/24-11:12:52.194592 2150682 [/version_set.cc:5927] Created manifest 91, compacted+appended from 10913 to 8707
2025/10/24-11:13:02.969944 2150682 [/version_set.cc:5927] Created manifest 362, compacted+appended from 52259 to 13321
2025/10/24-11:13:18.815120 2150681 [/version_set.cc:5927] Created manifest 765, compacted+appended from 80064 to 13304
2025/10/24-11:13:35.590905 2150681 [/version_set.cc:5927] Created manifest 1167, compacted+appended from 79863 to 13304
```

As you can see, it only took a few iterations of ramp-up to settle on the auto-tuned max manifest size for tracking ~122 live SST files, around 80KB and compacting down to about 13KB. (13KB * (500 + 100) / 100 = 78KB). With the default large setting for max_manifest_file_size, we end up with a 232KB manifest, which is more than 90% wasted space. (A long-running DB would be much worse.)

As for performance, we don't expect a difference, even with TransactionDB because actual writing of the manifest is done without holding the DB mutex. I was not able to see a performance regression using db_bench with FIFO compaction and >1000 ~10MB SST files, including settings of -max_manifest_file_size=2048 -max_manifest_space_amp_pct={500,10,0}. No "hiccups" visible with -histogram either.

I also tried seeding a 1 second delay in writing new manifest files (other than the first). This had no significant effect at -max_manifest_space_amp_pct=500 but at 100 started causing write stalls in my test. In many ways this is kind of a worst case scenario and out-of-proportion test, but gives me more confidence that a higher number like 500 is probably the best balance in general.

Reviewed By: xingbowang

Differential Revision: D85445178

Pulled By: pdillinger

fbshipit-source-id: 1e6e07e89c586762dd65c65bb7cb2b8b719513f9
---
 BUCK                                          |   6 +
 CMakeLists.txt                                |   1 +
 Makefile                                      |   3 +
 db/compaction/compaction_job_test.cc          |  18 +-
 db/db_basic_test.cc                           |  24 ---
 db/db_etc3_test.cc                            | 161 ++++++++++++++++++
 db/db_impl/db_impl.cc                         |  10 +-
 db/db_impl/db_impl_follower.cc                |   6 +-
 db/db_impl/db_impl_open.cc                    |   2 +-
 db/db_impl/db_impl_secondary.cc               |   6 +-
 db/db_options_test.cc                         |  35 ++++
 db/db_test2.cc                                |   3 +
 db/db_test_util.cc                            |   3 +-
 db/db_wal_test.cc                             |   5 +-
 db/flush_job_test.cc                          |  14 +-
 db/memtable_list_test.cc                      |   6 +-
 db/repair.cc                                  |   4 +-
 db/version_set.cc                             |  64 +++++--
 db/version_set.h                              |  46 ++++-
 db/version_set_test.cc                        |  80 ++++-----
 db/version_util.h                             |   5 +-
 db/wal_manager_test.cc                        |   4 +-
 db_stress_tool/db_stress_common.h             |   1 +
 db_stress_tool/db_stress_gflags.cc            |   6 +-
 db_stress_tool/db_stress_test_base.cc         |   1 +
 include/rocksdb/options.h                     |  63 ++++++-
 options/db_options.cc                         |  63 +++----
 options/db_options.h                          |   5 +-
 options/options_helper.cc                     |   6 +-
 options/options_settable_test.cc              |   1 +
 options/options_test.cc                       |   8 +-
 src.mk                                        |   1 +
 tools/db_bench_tool.cc                        |  10 ++
 tools/db_crashtest.py                         |   3 +-
 tools/ldb_cmd.cc                              |   9 +-
 tools/ldb_cmd_test.cc                         |   5 +-
 .../new_features/auto_tune_manifest.md        |   1 +
 utilities/backup/backup_engine_test.cc        |   1 +
 utilities/checkpoint/checkpoint_test.cc       |   1 +
 39 files changed, 519 insertions(+), 172 deletions(-)
 create mode 100644 db/db_etc3_test.cc
 create mode 100644 unreleased_history/new_features/auto_tune_manifest.md

diff --git a/BUCK b/BUCK
index d52c03f7bf0e..e9e6de2f6b16 100644
--- a/BUCK
+++ b/BUCK
@@ -4844,6 +4844,12 @@ cpp_unittest_wrapper(name="db_encryption_test",
             extra_compiler_flags=[])
 
 
+cpp_unittest_wrapper(name="db_etc3_test",
+            srcs=["db/db_etc3_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
 cpp_unittest_wrapper(name="db_flush_test",
             srcs=["db/db_flush_test.cc"],
             deps=[":rocksdb_test_lib"],
diff --git a/CMakeLists.txt b/CMakeLists.txt
index fd60f833222b..03837b672ac4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1380,6 +1380,7 @@ if(WITH_TESTS)
         db/db_clip_test.cc
         db/db_dynamic_level_test.cc
         db/db_encryption_test.cc
+        db/db_etc3_test.cc
         db/db_flush_test.cc
         db/db_inplace_update_test.cc
         db/db_io_failure_test.cc
diff --git a/Makefile b/Makefile
index 987dd6fe175e..403c804c17f7 100644
--- a/Makefile
+++ b/Makefile
@@ -1421,6 +1421,9 @@ db_test: $(OBJ_DIR)/db/db_test.o $(TEST_LIBRARY) $(LIBRARY)
 db_test2: $(OBJ_DIR)/db/db_test2.o $(TEST_LIBRARY) $(LIBRARY)
 	$(AM_LINK)
 
+db_etc3_test: $(OBJ_DIR)/db/db_etc3_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
 compression_test: $(OBJ_DIR)/util/compression_test.o $(TEST_LIBRARY) $(LIBRARY)
 	$(AM_LINK)
 
diff --git a/db/compaction/compaction_job_test.cc b/db/compaction/compaction_job_test.cc
index e1e11e76fe36..2836ed20e3ba 100644
--- a/db/compaction/compaction_job_test.cc
+++ b/db/compaction/compaction_job_test.cc
@@ -211,8 +211,8 @@ class CompactionJobTestBase : public testing::Test {
         table_cache_(NewLRUCache(50000, 16)),
         write_buffer_manager_(db_options_.db_write_buffer_size),
         versions_(new VersionSet(
-            dbname_, &db_options_, env_options_, table_cache_.get(),
-            &write_buffer_manager_, &write_controller_,
+            dbname_, &db_options_, mutable_db_options_, env_options_,
+            table_cache_.get(), &write_buffer_manager_, &write_controller_,
             /*block_cache_tracer=*/nullptr,
             /*io_tracer=*/nullptr, /*db_id=*/"", /*db_session_id=*/"",
             /*daily_offpeak_time_utc=*/"",
@@ -546,13 +546,13 @@ class CompactionJobTestBase : public testing::Test {
     ASSERT_OK(s);
     db_options_.info_log = info_log;
 
-    versions_.reset(
-        new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(),
-                       &write_buffer_manager_, &write_controller_,
-                       /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
-                       test::kUnitTestDbId, /*db_session_id=*/"",
-                       /*daily_offpeak_time_utc=*/"",
-                       /*error_handler=*/nullptr, /*unchanging=*/false));
+    versions_.reset(new VersionSet(
+        dbname_, &db_options_, mutable_db_options_, env_options_,
+        table_cache_.get(), &write_buffer_manager_, &write_controller_,
+        /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
+        test::kUnitTestDbId, /*db_session_id=*/"",
+        /*daily_offpeak_time_utc=*/"",
+        /*error_handler=*/nullptr, /*unchanging=*/false));
     compaction_job_stats_.Reset();
 
     VersionEdit new_db;
diff --git a/db/db_basic_test.cc b/db/db_basic_test.cc
index 9a4a5b983621..b115e7069d14 100644
--- a/db/db_basic_test.cc
+++ b/db/db_basic_test.cc
@@ -675,30 +675,6 @@ TEST_F(DBBasicTest, Flush) {
   } while (ChangeCompactOptions());
 }
 
-TEST_F(DBBasicTest, ManifestRollOver) {
-  do {
-    Options options;
-    options.max_manifest_file_size = 10;  // 10 bytes
-    options = CurrentOptions(options);
-    CreateAndReopenWithCF({"pikachu"}, options);
-    {
-      ASSERT_OK(Put(1, "manifest_key1", std::string(1000, '1')));
-      ASSERT_OK(Put(1, "manifest_key2", std::string(1000, '2')));
-      ASSERT_OK(Put(1, "manifest_key3", std::string(1000, '3')));
-      uint64_t manifest_before_flush = dbfull()->TEST_Current_Manifest_FileNo();
-      ASSERT_OK(Flush(1));  // This should trigger LogAndApply.
-      uint64_t manifest_after_flush = dbfull()->TEST_Current_Manifest_FileNo();
-      ASSERT_GT(manifest_after_flush, manifest_before_flush);
-      ReopenWithColumnFamilies({"default", "pikachu"}, options);
-      ASSERT_GT(dbfull()->TEST_Current_Manifest_FileNo(), manifest_after_flush);
-      // check if a new manifest file got inserted or not.
-      ASSERT_EQ(std::string(1000, '1'), Get(1, "manifest_key1"));
-      ASSERT_EQ(std::string(1000, '2'), Get(1, "manifest_key2"));
-      ASSERT_EQ(std::string(1000, '3'), Get(1, "manifest_key3"));
-    }
-  } while (ChangeCompactOptions());
-}
-
 TEST_F(DBBasicTest, IdentityAcrossRestarts) {
   constexpr size_t kMinIdSize = 10;
   do {
diff --git a/db/db_etc3_test.cc b/db/db_etc3_test.cc
new file mode 100644
index 000000000000..e5152fcd58d2
--- /dev/null
+++ b/db/db_etc3_test.cc
@@ -0,0 +1,161 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "db/db_test_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBEtc3Test : public DBTestBase {
+ public:
+  DBEtc3Test() : DBTestBase("db_etc3_test", /*env_do_fsync=*/true) {}
+};
+
+TEST_F(DBEtc3Test, ManifestRollOver) {
+  do {
+    Options options;
+    // Force new manifest on each manifest write
+    options.max_manifest_file_size = 0;
+    options.max_manifest_space_amp_pct = 0;
+    options = CurrentOptions(options);
+    CreateAndReopenWithCF({"pikachu"}, options);
+    {
+      ASSERT_OK(Put(1, "key1", std::string(1000, '1')));
+      ASSERT_OK(Put(1, "key2", std::string(1000, '2')));
+      ASSERT_OK(Put(1, "key3", std::string(1000, '3')));
+      uint64_t manifest_before_flush = dbfull()->TEST_Current_Manifest_FileNo();
+      ASSERT_OK(Flush(1));  // This should trigger LogAndApply.
+      uint64_t manifest_after_flush = dbfull()->TEST_Current_Manifest_FileNo();
+      ASSERT_GT(manifest_after_flush, manifest_before_flush);
+      // Re-open should always re-create manifest file
+      ReopenWithColumnFamilies({"default", "pikachu"}, options);
+      ASSERT_GT(dbfull()->TEST_Current_Manifest_FileNo(), manifest_after_flush);
+      ASSERT_EQ(std::string(1000, '1'), Get(1, "key1"));
+      ASSERT_EQ(std::string(1000, '2'), Get(1, "key2"));
+      ASSERT_EQ(std::string(1000, '3'), Get(1, "key3"));
+    }
+  } while (ChangeCompactOptions());
+}
+
+TEST_F(DBEtc3Test, AutoTuneManifestSize) {
+  // Ensure we have auto-tuning beyond max_manifest_file_size by default
+  ASSERT_EQ(DBOptions{}.max_manifest_space_amp_pct, 500);
+
+  Options options = CurrentOptions();
+  ASSERT_OK(db_->SetOptions({{"level0_file_num_compaction_trigger", "20"}}));
+
+  // Use large column family names to essentially control the amount of payload
+  // data needed for the manifest file. Drop manifest entries don't include the
+  // CF name so are small.
+  uint64_t prev_manifest_num = 0, cur_manifest_num = 0;
+  std::deque<ColumnFamilyHandle*> handles;
+  int counter = 5;
+  auto AddCfFn = [&]() {
+    std::string name = "cf" + std::to_string(counter++);
+    name.resize(1000, 'a');
+    ASSERT_OK(db_->CreateColumnFamily(options, name, &handles.emplace_back()));
+    prev_manifest_num = cur_manifest_num;
+    cur_manifest_num = dbfull()->TEST_Current_Manifest_FileNo();
+  };
+  auto DropCfFn = [&]() {
+    ASSERT_OK(db_->DropColumnFamily(handles.front()));
+    ASSERT_OK(db_->DestroyColumnFamilyHandle(handles.front()));
+    handles.pop_front();
+    prev_manifest_num = cur_manifest_num;
+    cur_manifest_num = dbfull()->TEST_Current_Manifest_FileNo();
+  };
+  auto TrivialManifestWriteFn = [&]() {
+    ASSERT_OK(Put("x", std::to_string(counter++)));
+    ASSERT_OK(Flush());
+    prev_manifest_num = cur_manifest_num;
+    cur_manifest_num = dbfull()->TEST_Current_Manifest_FileNo();
+  };
+
+  options.max_manifest_file_size = 1000000;
+  options.max_manifest_space_amp_pct = 0;  // no auto-tuning yet
+  DestroyAndReopen(options);
+
+  // With the generous (minimum) maximum manifest size, should not be rotated
+  AddCfFn();
+  AddCfFn();
+  AddCfFn();
+  ASSERT_EQ(prev_manifest_num, cur_manifest_num);
+
+  // Change options for small max and (still) no auto-tuning
+  ASSERT_OK(db_->SetDBOptions({{"max_manifest_file_size", "3000"}}));
+
+  // Takes effect on the next manifest write
+  TrivialManifestWriteFn();
+  ASSERT_LT(prev_manifest_num, cur_manifest_num);
+
+  // Now we have to rewrite the whole manifest on each write because the
+  // compacted size exceeds the "max" size.
+  AddCfFn();
+  ASSERT_LT(prev_manifest_num, cur_manifest_num);
+  DropCfFn();
+  ASSERT_LT(prev_manifest_num, cur_manifest_num);
+  AddCfFn();
+  ASSERT_LT(prev_manifest_num, cur_manifest_num);
+  TrivialManifestWriteFn();
+  ASSERT_LT(prev_manifest_num, cur_manifest_num);
+
+  // Enabling auto-tuning should fix this, immediately for next manifest writes.
+  // This will allow up to double-ish the size of the compacted manifest,
+  // which last should have been 4000 + some bytes.
+  ASSERT_EQ(handles.size(), 4U);
+  ASSERT_OK(db_->SetDBOptions({{"max_manifest_space_amp_pct", "105"}}));
+
+  // After 9 CF names should be enough to rotate the manifest
+  for (int i = 1; i <= 5; ++i) {
+    if ((i % 2) == 1) {
+      DropCfFn();
+    }
+    AddCfFn();
+    ASSERT_EQ(prev_manifest_num, cur_manifest_num);
+  }
+  TrivialManifestWriteFn();
+  ASSERT_LT(prev_manifest_num, cur_manifest_num);
+
+  // We now have a different last compacted manifest size, should be
+  // able to go beyond 9 CFs named in manifest this time.
+  ASSERT_EQ(handles.size(), 6U);
+
+  DropCfFn();
+  DropCfFn();
+  for (int i = 1; i <= 4; ++i) {
+    DropCfFn();
+    AddCfFn();
+    ASSERT_EQ(prev_manifest_num, cur_manifest_num);
+  }
+  // We've written 10 named CFs to the manifest. We should be able to
+  // dynamically change the auto-tuning still based on the last "compacted"
+  // manifest size of 7000 + some bytes.
+  ASSERT_OK(db_->SetDBOptions({{"max_manifest_space_amp_pct", "51"}}));
+  TrivialManifestWriteFn();
+  ASSERT_LT(prev_manifest_num, cur_manifest_num);
+  // And the "compacted" manifest size has reset again, so should be changed
+  // again sooner.
+  ASSERT_EQ(handles.size(), 4U);
+  for (int i = 1; i <= 2; ++i) {
+    AddCfFn();
+    ASSERT_EQ(prev_manifest_num, cur_manifest_num);
+  }
+  // Enough for manifest change
+  AddCfFn();
+  ASSERT_LT(prev_manifest_num, cur_manifest_num);
+
+  // Wrap up
+  while (!handles.empty()) {
+    DropCfFn();
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  RegisterCustomObjects(argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc
index f5ade39fc89e..5676bb2cd588 100644
--- a/db/db_impl/db_impl.cc
+++ b/db/db_impl/db_impl.cc
@@ -258,10 +258,10 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname,
       [this]() { this->TriggerPeriodicCompaction(); });
 
   versions_.reset(new VersionSet(
-      dbname_, &immutable_db_options_, file_options_, table_cache_.get(),
-      write_buffer_manager_, &write_controller_, &block_cache_tracer_,
-      io_tracer_, db_id_, db_session_id_, options.daily_offpeak_time_utc,
-      &error_handler_, read_only));
+      dbname_, &immutable_db_options_, mutable_db_options_, file_options_,
+      table_cache_.get(), write_buffer_manager_, &write_controller_,
+      &block_cache_tracer_, io_tracer_, db_id_, db_session_id_,
+      options.daily_offpeak_time_utc, &error_handler_, read_only));
   column_family_memtables_.reset(
       new ColumnFamilyMemTablesImpl(versions_->GetColumnFamilySet()));
 
@@ -1412,7 +1412,7 @@ Status DBImpl::SetDBOptions(
       file_options_for_compaction_ = FileOptions(new_db_options);
       file_options_for_compaction_ = fs_->OptimizeForCompactionTableWrite(
           file_options_for_compaction_, immutable_db_options_);
-      versions_->ChangeFileOptions(mutable_db_options_);
+      versions_->UpdatedMutableDbOptions(mutable_db_options_, &mutex_);
       // TODO(xiez): clarify why apply optimize for read to write options
       file_options_for_compaction_ = fs_->OptimizeForCompactionTableRead(
           file_options_for_compaction_, immutable_db_options_);
diff --git a/db/db_impl/db_impl_follower.cc b/db/db_impl/db_impl_follower.cc
index 1ff12cec0153..1262c5bdfdb6 100644
--- a/db/db_impl/db_impl_follower.cc
+++ b/db/db_impl/db_impl_follower.cc
@@ -293,9 +293,9 @@ Status DB::OpenAsFollower(
   DBImplFollower* impl =
       new DBImplFollower(tmp_opts, std::move(new_env), dbname, src_path);
   impl->versions_.reset(new ReactiveVersionSet(
-      dbname, &impl->immutable_db_options_, impl->file_options_,
-      impl->table_cache_.get(), impl->write_buffer_manager_,
-      &impl->write_controller_, impl->io_tracer_));
+      dbname, &impl->immutable_db_options_, impl->mutable_db_options_,
+      impl->file_options_, impl->table_cache_.get(),
+      impl->write_buffer_manager_, &impl->write_controller_, impl->io_tracer_));
   impl->column_family_memtables_.reset(
       new ColumnFamilyMemTablesImpl(impl->versions_->GetColumnFamilySet()));
   impl->wal_in_db_path_ = impl->immutable_db_options_.IsWalDirSameAsDBPath();
diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc
index a9871d6bb2f5..cccc3ea2c708 100644
--- a/db/db_impl/db_impl_open.cc
+++ b/db/db_impl/db_impl_open.cc
@@ -329,7 +329,7 @@ Status DBImpl::NewDB(std::vector<std::string>* new_filenames) {
     }
     FileTypeSet tmp_set = immutable_db_options_.checksum_handoff_file_types;
     file->SetPreallocationBlockSize(
-        immutable_db_options_.manifest_preallocation_size);
+        mutable_db_options_.manifest_preallocation_size);
     std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
         std::move(file), manifest, file_options, immutable_db_options_.clock,
         io_tracer_, nullptr /* stats */,
diff --git a/db/db_impl/db_impl_secondary.cc b/db/db_impl/db_impl_secondary.cc
index 05337a019f3e..5e6de87c586f 100644
--- a/db/db_impl/db_impl_secondary.cc
+++ b/db/db_impl/db_impl_secondary.cc
@@ -783,9 +783,9 @@ Status DB::OpenAsSecondary(
   handles->clear();
   DBImplSecondary* impl = new DBImplSecondary(tmp_opts, dbname, secondary_path);
   impl->versions_.reset(new ReactiveVersionSet(
-      dbname, &impl->immutable_db_options_, impl->file_options_,
-      impl->table_cache_.get(), impl->write_buffer_manager_,
-      &impl->write_controller_, impl->io_tracer_));
+      dbname, &impl->immutable_db_options_, impl->mutable_db_options_,
+      impl->file_options_, impl->table_cache_.get(),
+      impl->write_buffer_manager_, &impl->write_controller_, impl->io_tracer_));
   impl->column_family_memtables_.reset(
       new ColumnFamilyMemTablesImpl(impl->versions_->GetColumnFamilySet()));
   impl->wal_in_db_path_ = impl->immutable_db_options_.IsWalDirSameAsDBPath();
diff --git a/db/db_options_test.cc b/db/db_options_test.cc
index cfe0b8f96522..36c4f211af76 100644
--- a/db/db_options_test.cc
+++ b/db/db_options_test.cc
@@ -432,12 +432,47 @@ TEST_F(DBOptionsTest, SetWalBytesPerSync) {
   ASSERT_GT(low_bytes_per_sync, counter);
 }
 
+TEST_F(DBOptionsTest, MutableManifestOptions) {
+  // These aren't end-to-end tests, but sufficient to ensure the VersionSet
+  // receives the updates with SetDBOptions
+  for (int64_t i : {0, 1, 100, 100000, 10000000}) {
+    ASSERT_OK(
+        db_->SetDBOptions({{"max_manifest_file_size", std::to_string(i)}}));
+    ASSERT_EQ(i,
+              static_cast<int64_t>(db_->GetDBOptions().max_manifest_file_size));
+    ASSERT_EQ(i,
+              static_cast<int64_t>(
+                  dbfull()->GetVersionSet()->TEST_GetMinMaxManifestFileSize()));
+    if (i > 1) {
+      ++i;
+    }
+    ASSERT_OK(
+        db_->SetDBOptions({{"max_manifest_space_amp_pct", std::to_string(i)}}));
+    ASSERT_EQ(i, static_cast<int64_t>(
+                     db_->GetDBOptions().max_manifest_space_amp_pct));
+    ASSERT_EQ(i,
+              static_cast<int64_t>(
+                  dbfull()->GetVersionSet()->TEST_GetMaxManifestSpaceAmpPct()));
+    if (i > 1) {
+      ++i;
+    }
+    ASSERT_OK(db_->SetDBOptions(
+        {{"manifest_preallocation_size", std::to_string(i)}}));
+    ASSERT_EQ(i, static_cast<int64_t>(
+                     db_->GetDBOptions().manifest_preallocation_size));
+    ASSERT_EQ(
+        i, static_cast<int64_t>(
+               dbfull()->GetVersionSet()->TEST_GetManifestPreallocationSize()));
+  }
+}
+
 TEST_F(DBOptionsTest, WritableFileMaxBufferSize) {
   Options options;
   options.create_if_missing = true;
   options.writable_file_max_buffer_size = 1024 * 1024;
   options.level0_file_num_compaction_trigger = 3;
   options.max_manifest_file_size = 1;
+  options.max_manifest_space_amp_pct = 0;
   options.env = env_;
   int buffer_size = 1024 * 1024;
   Reopen(options);
diff --git a/db/db_test2.cc b/db/db_test2.cc
index 1a565c8e1630..33da1ffaf12f 100644
--- a/db/db_test2.cc
+++ b/db/db_test2.cc
@@ -5205,6 +5205,7 @@ TEST_F(DBTest2, SwitchMemtableRaceWithNewManifest) {
   Options options = CurrentOptions();
   DestroyAndReopen(options);
   options.max_manifest_file_size = 10;
+  options.max_manifest_space_amp_pct = 0;
   options.create_if_missing = true;
   CreateAndReopenWithCF({"pikachu"}, options);
   ASSERT_EQ(2, handles_.size());
@@ -5896,6 +5897,7 @@ TEST_P(RenameCurrentTest, Flush) {
   Destroy(last_options_);
   Options options = GetDefaultOptions();
   options.max_manifest_file_size = 1;
+  options.max_manifest_space_amp_pct = 0;
   options.create_if_missing = true;
   Reopen(options);
   ASSERT_OK(Put("key", "value"));
@@ -5915,6 +5917,7 @@ TEST_P(RenameCurrentTest, Compaction) {
   Destroy(last_options_);
   Options options = GetDefaultOptions();
   options.max_manifest_file_size = 1;
+  options.max_manifest_space_amp_pct = 0;
   options.create_if_missing = true;
   Reopen(options);
   ASSERT_OK(Put("a", "a_value"));
diff --git a/db/db_test_util.cc b/db/db_test_util.cc
index 0cefcfd41d73..a0608b30b4b8 100644
--- a/db/db_test_util.cc
+++ b/db/db_test_util.cc
@@ -454,7 +454,8 @@ Options DBTestBase::GetOptions(
       options.allow_mmap_reads = can_allow_mmap;
       break;
     case kManifestFileSize:
-      options.max_manifest_file_size = 50;  // 50 bytes
+      options.max_manifest_file_size = 50;     // 50 bytes
+      options.max_manifest_space_amp_pct = 0;  // old behavior
       break;
     case kPerfOptions:
       options.delayed_write_rate = 8 * 1024 * 1024;
diff --git a/db/db_wal_test.cc b/db/db_wal_test.cc
index da9ef31587f7..75e13724a75e 100644
--- a/db/db_wal_test.cc
+++ b/db/db_wal_test.cc
@@ -1746,8 +1746,8 @@ class RecoveryTestHelper {
     WriteController write_controller;
 
     versions.reset(new VersionSet(
-        test->dbname_, &db_options, file_options, table_cache.get(),
-        &write_buffer_manager, &write_controller,
+        test->dbname_, &db_options, MutableDBOptions{options}, file_options,
+        table_cache.get(), &write_buffer_manager, &write_controller,
         /*block_cache_tracer=*/nullptr,
         /*io_tracer=*/nullptr, /*db_id=*/"", /*db_session_id=*/"",
         options.daily_offpeak_time_utc,
@@ -2277,6 +2277,7 @@ TEST_F(DBWALTest, FixSyncWalOnObseletedWalWithNewManifestCausingMissingWAL) {
   Options options = CurrentOptions();
   // Small size to force manifest creation
   options.max_manifest_file_size = 1;
+  options.max_manifest_space_amp_pct = 0;
   options.track_and_verify_wals_in_manifest = true;
   DestroyAndReopen(options);
 
diff --git a/db/flush_job_test.cc b/db/flush_job_test.cc
index b84bb3d8bcb7..3d4cf1d8debd 100644
--- a/db/flush_job_test.cc
+++ b/db/flush_job_test.cc
@@ -142,13 +142,13 @@ class FlushJobTestBase : public testing::Test {
       column_families.emplace_back(cf_name, cf_options_);
     }
 
-    versions_.reset(
-        new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(),
-                       &write_buffer_manager_, &write_controller_,
-                       /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
-                       test::kUnitTestDbId, /*db_session_id=*/"",
-                       /*daily_offpeak_time_utc=*/"",
-                       /*error_handler=*/nullptr, /*read_only=*/false));
+    versions_.reset(new VersionSet(
+        dbname_, &db_options_, MutableDBOptions{options_}, env_options_,
+        table_cache_.get(), &write_buffer_manager_, &write_controller_,
+        /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
+        test::kUnitTestDbId, /*db_session_id=*/"",
+        /*daily_offpeak_time_utc=*/"",
+        /*error_handler=*/nullptr, /*read_only=*/false));
     EXPECT_OK(versions_->Recover(column_families, false));
   }
 
diff --git a/db/memtable_list_test.cc b/db/memtable_list_test.cc
index 97e36e00f60c..7065b125babe 100644
--- a/db/memtable_list_test.cc
+++ b/db/memtable_list_test.cc
@@ -112,7 +112,8 @@ class MemTableListTest : public testing::Test {
     WriteBufferManager write_buffer_manager(db_options.db_write_buffer_size);
     WriteController write_controller(10000000u);
 
-    VersionSet versions(dbname, &immutable_db_options, env_options,
+    VersionSet versions(dbname, &immutable_db_options,
+                        MutableDBOptions{db_options}, env_options,
                         table_cache.get(), &write_buffer_manager,
                         &write_controller, /*block_cache_tracer=*/nullptr,
                         /*io_tracer=*/nullptr, /*db_id=*/"",
@@ -163,7 +164,8 @@ class MemTableListTest : public testing::Test {
     WriteBufferManager write_buffer_manager(db_options.db_write_buffer_size);
     WriteController write_controller(10000000u);
 
-    VersionSet versions(dbname, &immutable_db_options, env_options,
+    VersionSet versions(dbname, &immutable_db_options,
+                        MutableDBOptions{db_options}, env_options,
                         table_cache.get(), &write_buffer_manager,
                         &write_controller, /*block_cache_tracer=*/nullptr,
                         /*io_tracer=*/nullptr, /*db_id=*/"",
diff --git a/db/repair.cc b/db/repair.cc
index 6d184eba8b1c..05672957f805 100644
--- a/db/repair.cc
+++ b/db/repair.cc
@@ -120,8 +120,8 @@ class Repairer {
                                     /*io_tracer=*/nullptr, db_session_id_)),
         wb_(db_options_.db_write_buffer_size),
         wc_(db_options_.delayed_write_rate),
-        vset_(dbname_, &immutable_db_options_, file_options_,
-              raw_table_cache_.get(), &wb_, &wc_,
+        vset_(dbname_, &immutable_db_options_, MutableDBOptions{db_options_},
+              file_options_, raw_table_cache_.get(), &wb_, &wc_,
               /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
               /*db_id=*/"", db_session_id_, db_options.daily_offpeak_time_utc,
               /*error_handler=*/nullptr, /*read_only=*/false),
diff --git a/db/version_set.cc b/db/version_set.cc
index 3a90c7afece1..e2dce0e8c80b 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -5331,6 +5331,7 @@ void AtomicGroupReadBuffer::Clear() {
 
 VersionSet::VersionSet(
     const std::string& dbname, const ImmutableDBOptions* _db_options,
+    const MutableDBOptions& mutable_db_options,
     const FileOptions& storage_options, Cache* table_cache,
     WriteBufferManager* write_buffer_manager, WriteController* write_controller,
     BlockCacheTracer* const block_cache_tracer,
@@ -5359,6 +5360,7 @@ VersionSet::VersionSet(
       prev_log_number_(0),
       current_version_number_(0),
       manifest_file_size_(0),
+      last_compacted_manifest_file_size_(0),
       file_options_(storage_options),
       block_cache_tracer_(block_cache_tracer),
       io_tracer_(io_tracer),
@@ -5366,7 +5368,9 @@ VersionSet::VersionSet(
       offpeak_time_option_(OffpeakTimeOption(daily_offpeak_time_utc)),
       error_handler_(error_handler),
       unchanging_(unchanging),
-      closed_(false) {}
+      closed_(false) {
+  UpdatedMutableDbOptions(mutable_db_options, /*mu=*/nullptr);
+}
 
 Status VersionSet::Close(FSDirectory* db_dir, InstrumentedMutex* mu) {
   Status s;
@@ -5462,11 +5466,35 @@ void VersionSet::Reset() {
   current_version_number_ = 0;
   manifest_writers_.clear();
   manifest_file_size_ = 0;
+  last_compacted_manifest_file_size_ = 0;
+  TuneMaxManifestFileSize();
   obsolete_files_.clear();
   obsolete_manifests_.clear();
   wals_.Reset();
 }
 
+void VersionSet::UpdatedMutableDbOptions(
+    const MutableDBOptions& updated_options, InstrumentedMutex* mu) {
+  // Must be holding mutex if not called during initialization
+  if (manifest_file_size_ > 0) {
+    mu->AssertHeld();
+  }
+  file_options_.writable_file_max_buffer_size =
+      updated_options.writable_file_max_buffer_size;
+  min_max_manifest_file_size_ = updated_options.max_manifest_file_size;
+  max_manifest_space_amp_pct_ = static_cast<unsigned>(
+      std::max(updated_options.max_manifest_space_amp_pct, 0));
+  manifest_preallocation_size_ = updated_options.manifest_preallocation_size;
+  TuneMaxManifestFileSize();
+}
+
+void VersionSet::TuneMaxManifestFileSize() {
+  tuned_max_manifest_file_size_ =
+      std::max(min_max_manifest_file_size_,
+               last_compacted_manifest_file_size_ *
+                   (100U + max_manifest_space_amp_pct_) / 100U);
+}
+
 void VersionSet::AppendVersion(ColumnFamilyData* column_family_data,
                                Version* v) {
   // compute new compaction score
@@ -5710,10 +5738,11 @@ Status VersionSet::ProcessManifestWrites(
   }
 #endif  // NDEBUG
 
+  uint64_t prev_manifest_file_size = manifest_file_size_;
   assert(pending_manifest_file_number_ == 0);
   if (!skip_manifest_write &&
       (!descriptor_log_ ||
-       manifest_file_size_ > db_options_->max_manifest_file_size)) {
+       prev_manifest_file_size >= tuned_max_manifest_file_size_)) {
     TEST_SYNC_POINT("VersionSet::ProcessManifestWrites:BeforeNewManifest");
     new_descriptor_log = true;
   } else {
@@ -5753,6 +5782,8 @@ Status VersionSet::ProcessManifestWrites(
   IOStatus manifest_io_status;
   manifest_io_status.PermitUncheckedError();
   std::unique_ptr<log::Writer> new_desc_log_ptr;
+  // Save before releasing mu
+  uint64_t manifest_preallocation_size = manifest_preallocation_size_;
   if (skip_manifest_write) {
     if (s.ok()) {
       constexpr bool update_stats = true;
@@ -5796,16 +5827,13 @@ Status VersionSet::ProcessManifestWrites(
       // This is fine because everything inside of this block is serialized --
       // only one thread can be here at the same time
       // create new manifest file
-      ROCKS_LOG_INFO(db_options_->info_log, "Creating manifest %" PRIu64 "\n",
-                     pending_manifest_file_number_);
       std::string descriptor_fname =
           DescriptorFileName(dbname_, pending_manifest_file_number_);
       std::unique_ptr<FSWritableFile> descriptor_file;
       io_s = NewWritableFile(fs_.get(), descriptor_fname, &descriptor_file,
                              opt_file_opts);
       if (io_s.ok()) {
-        descriptor_file->SetPreallocationBlockSize(
-            db_options_->manifest_preallocation_size);
+        descriptor_file->SetPreallocationBlockSize(manifest_preallocation_size);
         FileTypeSet tmp_set = db_options_->checksum_handoff_file_types;
         std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
             std::move(descriptor_file), descriptor_fname, opt_file_opts, clock_,
@@ -5906,6 +5934,13 @@ Status VersionSet::ProcessManifestWrites(
     if (s.ok()) {
       // find offset in manifest file where this version is stored.
       new_manifest_file_size = raw_desc_log_ptr->file()->GetFileSize();
+      if (new_descriptor_log) {
+        ROCKS_LOG_INFO(db_options_->info_log,
+                       "Created manifest %" PRIu64
+                       ", compacted+appended from %" PRIu64 " to %" PRIu64 "\n",
+                       pending_manifest_file_number_, prev_manifest_file_size,
+                       new_manifest_file_size);
+      }
     }
 
     if (first_writer.edit_list.front()->IsColumnFamilyDrop()) {
@@ -5954,6 +5989,8 @@ Status VersionSet::ProcessManifestWrites(
     descriptor_log_ = std::move(new_desc_log_ptr);
     obsolete_manifests_.emplace_back(
         DescriptorFileName("", manifest_file_number_));
+    last_compacted_manifest_file_size_ = new_manifest_file_size;
+    TuneMaxManifestFileSize();
   }
 
   // Install the new versions
@@ -6587,14 +6624,16 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname,
   const ReadOptions read_options;
   const WriteOptions write_options;
 
-  ImmutableDBOptions db_options(*options);
+  ImmutableDBOptions imm_db_options(*options);
+  MutableDBOptions mutable_db_options(*options);
   ColumnFamilyOptions cf_options(*options);
   std::shared_ptr<Cache> tc(NewLRUCache(options->max_open_files - 10,
                                         options->table_cache_numshardbits));
   WriteController wc(options->delayed_write_rate);
   WriteBufferManager wb(options->db_write_buffer_size);
-  VersionSet versions(dbname, &db_options, file_options, tc.get(), &wb, &wc,
-                      nullptr /*BlockCacheTracer*/, nullptr /*IOTracer*/,
+  VersionSet versions(dbname, &imm_db_options, mutable_db_options, file_options,
+                      tc.get(), &wb, &wc, nullptr /*BlockCacheTracer*/,
+                      nullptr /*IOTracer*/,
                       /*db_id*/ "",
                       /*db_session_id*/ "", options->daily_offpeak_time_utc,
                       /*error_handler_*/ nullptr, /*unchanging=*/false);
@@ -7646,12 +7685,13 @@ Status VersionSet::VerifyFileMetadata(const ReadOptions& read_options,
 }
 
 ReactiveVersionSet::ReactiveVersionSet(
-    const std::string& dbname, const ImmutableDBOptions* _db_options,
+    const std::string& dbname, const ImmutableDBOptions* imm_db_options,
+    const MutableDBOptions& mutable_db_options,
     const FileOptions& _file_options, Cache* table_cache,
     WriteBufferManager* write_buffer_manager, WriteController* write_controller,
     const std::shared_ptr<IOTracer>& io_tracer)
-    : VersionSet(dbname, _db_options, _file_options, table_cache,
-                 write_buffer_manager, write_controller,
+    : VersionSet(dbname, imm_db_options, mutable_db_options, _file_options,
+                 table_cache, write_buffer_manager, write_controller,
                  /*block_cache_tracer=*/nullptr, io_tracer, /*db_id*/ "",
                  /*db_session_id*/ "", /*daily_offpeak_time_utc*/ "",
                  /*error_handler=*/nullptr, /*unchanging=*/false) {}
diff --git a/db/version_set.h b/db/version_set.h
index cff81717bc63..85759f82f5a3 100644
--- a/db/version_set.h
+++ b/db/version_set.h
@@ -1193,7 +1193,9 @@ class AtomicGroupReadBuffer {
 // but false for secondary instance or writable DB).
 class VersionSet {
  public:
-  VersionSet(const std::string& dbname, const ImmutableDBOptions* db_options,
+  VersionSet(const std::string& dbname,
+             const ImmutableDBOptions* imm_db_options,
+             const MutableDBOptions& mutable_db_options,
              const FileOptions& file_options, Cache* table_cache,
              WriteBufferManager* write_buffer_manager,
              WriteController* write_controller,
@@ -1210,6 +1212,13 @@ class VersionSet {
 
   virtual Status Close(FSDirectory* db_dir, InstrumentedMutex* mu);
 
+  // Requires: already holding DB mutex `mu`, to ensure
+  // * Safely read values from `updated_options`
+  // * Safely update fields on `this` (must be read elsewhere while holding mu)
+  // except `mu` can be nullptr during initialization
+  void UpdatedMutableDbOptions(const MutableDBOptions& updated_options,
+                               InstrumentedMutex* mu);
+
   Status LogAndApplyToDefaultColumnFamily(
       const ReadOptions& read_options, const WriteOptions& write_options,
       VersionEdit* edit, InstrumentedMutex* mu,
@@ -1555,10 +1564,6 @@ class VersionSet {
   }
 
   const FileOptions& file_options() { return file_options_; }
-  void ChangeFileOptions(const MutableDBOptions& new_options) {
-    file_options_.writable_file_max_buffer_size =
-        new_options.writable_file_max_buffer_size;
-  }
 
   // TODO - Consider updating together when file options change in SetDBOptions
   const OffpeakTimeOption& offpeak_time_option() {
@@ -1597,6 +1602,16 @@ class VersionSet {
 
   bool& TEST_unchanging() { return const_cast<bool&>(unchanging_); }
 
+  uint64_t TEST_GetMinMaxManifestFileSize() {
+    return min_max_manifest_file_size_;
+  }
+  unsigned TEST_GetMaxManifestSpaceAmpPct() {
+    return max_manifest_space_amp_pct_;
+  }
+  size_t TEST_GetManifestPreallocationSize() {
+    return manifest_preallocation_size_;
+  }
+
  protected:
   struct ManifestWriter;
 
@@ -1617,6 +1632,7 @@ class VersionSet {
     }
   };
 
+  // Revert back to a post-construction state (keep same options/settings)
   void Reset();
 
   // Returns approximated offset of a key in a file for a given version.
@@ -1655,6 +1671,11 @@ class VersionSet {
                             ColumnFamilyData* cfd, const std::string& fpath,
                             int level, const FileMetaData& meta);
 
+  // Auto-tune next max size for the current manifest file based on its initial
+  // "compacted" size and other parameters saved in this VersionSet. Must be
+  // holding DB mutex if outside of DB startup.
+  void TuneMaxManifestFileSize();
+
   // Protected by DB mutex.
   WalSet wals_;
 
@@ -1706,6 +1727,20 @@ class VersionSet {
   // Current size of manifest file
   uint64_t manifest_file_size_;
 
+  // Size of the populated manifest file last time it was re-written from
+  // scratch.
+  uint64_t last_compacted_manifest_file_size_;
+
+  // Auto-tuned max allowed size for the current manifest file
+  uint64_t tuned_max_manifest_file_size_;
+
+  // Saved copy of max_manifest_file_size in (Mutable)DBOptions
+  uint64_t min_max_manifest_file_size_;
+  // Saved, sanitized copy from (Mutable)DBOptions
+  unsigned max_manifest_space_amp_pct_;
+  // Saved copy from (Mutable)DBOptions
+  size_t manifest_preallocation_size_;
+
   // Obsolete files, or during DB shutdown any files not referenced by what's
   // left of the in-memory LSM state.
   std::vector<ObsoleteFileInfo> obsolete_files_;
@@ -1758,6 +1793,7 @@ class ReactiveVersionSet : public VersionSet {
  public:
   ReactiveVersionSet(const std::string& dbname,
                      const ImmutableDBOptions* _db_options,
+                     const MutableDBOptions& mutable_db_options,
                      const FileOptions& _file_options, Cache* table_cache,
                      WriteBufferManager* write_buffer_manager,
                      WriteController* write_controller,
diff --git a/db/version_set_test.cc b/db/version_set_test.cc
index 65cee38de10d..fefde1170ae5 100644
--- a/db/version_set_test.cc
+++ b/db/version_set_test.cc
@@ -1159,12 +1159,12 @@ class VersionSetTestBase {
       : env_(nullptr),
         dbname_(test::PerThreadDBPath(name)),
         options_(),
-        db_options_(options_),
+        imm_db_options_(options_),
         cf_options_(options_),
-        immutable_options_(db_options_, cf_options_),
+        immutable_options_(imm_db_options_, cf_options_),
         mutable_cf_options_(cf_options_),
         table_cache_(NewLRUCache(50000, 16)),
-        write_buffer_manager_(db_options_.db_write_buffer_size),
+        write_buffer_manager_(imm_db_options_.db_write_buffer_size),
         shutting_down_(false),
         table_factory_(std::make_shared<mock::MockTableFactory>()) {
     EXPECT_OK(test::CreateEnvFromSystem(ConfigOptions(), &env_, &env_guard_));
@@ -1178,8 +1178,8 @@ class VersionSetTestBase {
     EXPECT_OK(fs_->CreateDirIfMissing(dbname_, IOOptions(), nullptr));
 
     options_.env = env_;
-    db_options_.env = env_;
-    db_options_.fs = fs_;
+    imm_db_options_.env = env_;
+    imm_db_options_.fs = fs_;
     immutable_options_.env = env_;
     immutable_options_.fs = fs_;
     immutable_options_.clock = env_->GetSystemClock().get();
@@ -1188,16 +1188,17 @@ class VersionSetTestBase {
     mutable_cf_options_.table_factory = table_factory_;
 
     versions_.reset(new VersionSet(
-        dbname_, &db_options_, env_options_, table_cache_.get(),
-        &write_buffer_manager_, &write_controller_,
+        dbname_, &imm_db_options_, mutable_db_options_, env_options_,
+        table_cache_.get(), &write_buffer_manager_, &write_controller_,
         /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
         /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"",
         /*error_handler=*/nullptr, /*read_only=*/false));
     reactive_versions_ = std::make_shared<ReactiveVersionSet>(
-        dbname_, &db_options_, env_options_, table_cache_.get(),
-        &write_buffer_manager_, &write_controller_, nullptr);
-    db_options_.db_paths.emplace_back(dbname_,
-                                      std::numeric_limits<uint64_t>::max());
+        dbname_, &imm_db_options_, mutable_db_options_, env_options_,
+        table_cache_.get(), &write_buffer_manager_, &write_controller_,
+        nullptr);
+    imm_db_options_.db_paths.emplace_back(dbname_,
+                                          std::numeric_limits<uint64_t>::max());
   }
 
   virtual ~VersionSetTestBase() {
@@ -1220,7 +1221,7 @@ class VersionSetTestBase {
     ASSERT_OK(
         SetIdentityFile(WriteOptions(), env_, dbname_, Temperature::kUnknown));
     VersionEdit new_db;
-    if (db_options_.write_dbid_to_manifest) {
+    if (imm_db_options_.write_dbid_to_manifest) {
       DBOptions tmp_db_options;
       tmp_db_options.env = env_;
       std::unique_ptr<DBImpl> impl(new DBImpl(tmp_db_options, dbname_));
@@ -1381,8 +1382,8 @@ class VersionSetTestBase {
 
   void ReopenDB() {
     versions_.reset(new VersionSet(
-        dbname_, &db_options_, env_options_, table_cache_.get(),
-        &write_buffer_manager_, &write_controller_,
+        dbname_, &imm_db_options_, mutable_db_options_, env_options_,
+        table_cache_.get(), &write_buffer_manager_, &write_controller_,
         /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
         /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"",
         /*error_handler=*/nullptr, /*read_only=*/false));
@@ -1471,7 +1472,8 @@ class VersionSetTestBase {
   const std::string dbname_;
   EnvOptions env_options_;
   Options options_;
-  ImmutableDBOptions db_options_;
+  ImmutableDBOptions imm_db_options_;
+  MutableDBOptions mutable_db_options_;
   ColumnFamilyOptions cf_options_;
   ImmutableOptions immutable_options_;
   MutableCFOptions mutable_cf_options_;
@@ -1902,8 +1904,8 @@ TEST_F(VersionSetTest, WalAddition) {
   // Recover a new VersionSet.
   {
     std::unique_ptr<VersionSet> new_versions(new VersionSet(
-        dbname_, &db_options_, env_options_, table_cache_.get(),
-        &write_buffer_manager_, &write_controller_,
+        dbname_, &imm_db_options_, mutable_db_options_, env_options_,
+        table_cache_.get(), &write_buffer_manager_, &write_controller_,
         /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
         /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"",
         /*error_handler=*/nullptr, /*unchanging=*/false));
@@ -1970,8 +1972,8 @@ TEST_F(VersionSetTest, WalCloseWithoutSync) {
   // Recover a new VersionSet.
   {
     std::unique_ptr<VersionSet> new_versions(new VersionSet(
-        dbname_, &db_options_, env_options_, table_cache_.get(),
-        &write_buffer_manager_, &write_controller_,
+        dbname_, &imm_db_options_, mutable_db_options_, env_options_,
+        table_cache_.get(), &write_buffer_manager_, &write_controller_,
         /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
         /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"",
         /*error_handler=*/nullptr, /*unchanging=*/false));
@@ -2024,8 +2026,8 @@ TEST_F(VersionSetTest, WalDeletion) {
   // Recover a new VersionSet, only the non-closed WAL should show up.
   {
     std::unique_ptr<VersionSet> new_versions(new VersionSet(
-        dbname_, &db_options_, env_options_, table_cache_.get(),
-        &write_buffer_manager_, &write_controller_,
+        dbname_, &imm_db_options_, mutable_db_options_, env_options_,
+        table_cache_.get(), &write_buffer_manager_, &write_controller_,
         /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
         /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"",
         /*error_handler=*/nullptr, /*unchanging=*/false));
@@ -2063,8 +2065,8 @@ TEST_F(VersionSetTest, WalDeletion) {
   // Recover from the new MANIFEST, only the non-closed WAL should show up.
   {
     std::unique_ptr<VersionSet> new_versions(new VersionSet(
-        dbname_, &db_options_, env_options_, table_cache_.get(),
-        &write_buffer_manager_, &write_controller_,
+        dbname_, &imm_db_options_, mutable_db_options_, env_options_,
+        table_cache_.get(), &write_buffer_manager_, &write_controller_,
         /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
         /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"",
         /*error_handler=*/nullptr, /*unchanging=*/false));
@@ -2184,8 +2186,8 @@ TEST_F(VersionSetTest, DeleteWalsBeforeNonExistingWalNumber) {
   // Recover a new VersionSet, WAL0 is deleted, WAL1 is not.
   {
     std::unique_ptr<VersionSet> new_versions(new VersionSet(
-        dbname_, &db_options_, env_options_, table_cache_.get(),
-        &write_buffer_manager_, &write_controller_,
+        dbname_, &imm_db_options_, mutable_db_options_, env_options_,
+        table_cache_.get(), &write_buffer_manager_, &write_controller_,
         /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
         /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"",
         /*error_handler=*/nullptr, /*unchanging=*/false));
@@ -2221,8 +2223,8 @@ TEST_F(VersionSetTest, DeleteAllWals) {
   // Recover a new VersionSet, all WALs are deleted.
   {
     std::unique_ptr<VersionSet> new_versions(new VersionSet(
-        dbname_, &db_options_, env_options_, table_cache_.get(),
-        &write_buffer_manager_, &write_controller_,
+        dbname_, &imm_db_options_, mutable_db_options_, env_options_,
+        table_cache_.get(), &write_buffer_manager_, &write_controller_,
         /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
         /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"",
         /*error_handler=*/nullptr, /*unchanging=*/false));
@@ -2264,8 +2266,8 @@ TEST_F(VersionSetTest, AtomicGroupWithWalEdits) {
   // kept.
   {
     std::unique_ptr<VersionSet> new_versions(new VersionSet(
-        dbname_, &db_options_, env_options_, table_cache_.get(),
-        &write_buffer_manager_, &write_controller_,
+        dbname_, &imm_db_options_, mutable_db_options_, env_options_,
+        table_cache_.get(), &write_buffer_manager_, &write_controller_,
         /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
         /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"",
         /*error_handler=*/nullptr, /*unchanging=*/false));
@@ -2444,8 +2446,8 @@ class VersionSetWithTimestampTest : public VersionSetTest {
 
   void VerifyFullHistoryTsLow(uint64_t expected_ts_low) {
     std::unique_ptr<VersionSet> vset(new VersionSet(
-        dbname_, &db_options_, env_options_, table_cache_.get(),
-        &write_buffer_manager_, &write_controller_,
+        dbname_, &imm_db_options_, mutable_db_options_, env_options_,
+        table_cache_.get(), &write_buffer_manager_, &write_controller_,
         /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
         /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"",
         /*error_handler=*/nullptr, /*unchanging=*/false));
@@ -3500,7 +3502,7 @@ class VersionSetTestEmptyDb
                        std::unique_ptr<log::Writer>* log_writer) override {
     assert(nullptr != log_writer);
     VersionEdit new_db;
-    if (db_options_.write_dbid_to_manifest) {
+    if (imm_db_options_.write_dbid_to_manifest) {
       ASSERT_OK(SetIdentityFile(WriteOptions(), env_, dbname_,
                                 Temperature::kUnknown));
       DBOptions tmp_db_options;
@@ -3532,7 +3534,7 @@ class VersionSetTestEmptyDb
 const std::string VersionSetTestEmptyDb::kUnknownColumnFamilyName = "unknown";
 
 TEST_P(VersionSetTestEmptyDb, OpenFromIncompleteManifest0) {
-  db_options_.write_dbid_to_manifest = std::get<0>(GetParam());
+  imm_db_options_.write_dbid_to_manifest = std::get<0>(GetParam());
   PrepareManifest(nullptr, nullptr, &log_writer_);
   log_writer_.reset();
   CreateCurrentFile();
@@ -3564,7 +3566,7 @@ TEST_P(VersionSetTestEmptyDb, OpenFromIncompleteManifest0) {
 }
 
 TEST_P(VersionSetTestEmptyDb, OpenFromIncompleteManifest1) {
-  db_options_.write_dbid_to_manifest = std::get<0>(GetParam());
+  imm_db_options_.write_dbid_to_manifest = std::get<0>(GetParam());
   PrepareManifest(nullptr, nullptr, &log_writer_);
   // Only a subset of column families in the MANIFEST.
   VersionEdit new_cf1;
@@ -3605,7 +3607,7 @@ TEST_P(VersionSetTestEmptyDb, OpenFromIncompleteManifest1) {
 }
 
 TEST_P(VersionSetTestEmptyDb, OpenFromInCompleteManifest2) {
-  db_options_.write_dbid_to_manifest = std::get<0>(GetParam());
+  imm_db_options_.write_dbid_to_manifest = std::get<0>(GetParam());
   PrepareManifest(nullptr, nullptr, &log_writer_);
   // Write all column families but no log_number, next_file_number and
   // last_sequence.
@@ -3651,7 +3653,7 @@ TEST_P(VersionSetTestEmptyDb, OpenFromInCompleteManifest2) {
 }
 
 TEST_P(VersionSetTestEmptyDb, OpenManifestWithUnknownCF) {
-  db_options_.write_dbid_to_manifest = std::get<0>(GetParam());
+  imm_db_options_.write_dbid_to_manifest = std::get<0>(GetParam());
   PrepareManifest(nullptr, nullptr, &log_writer_);
   // Write all column families but no log_number, next_file_number and
   // last_sequence.
@@ -3708,7 +3710,7 @@ TEST_P(VersionSetTestEmptyDb, OpenManifestWithUnknownCF) {
 }
 
 TEST_P(VersionSetTestEmptyDb, OpenCompleteManifest) {
-  db_options_.write_dbid_to_manifest = std::get<0>(GetParam());
+  imm_db_options_.write_dbid_to_manifest = std::get<0>(GetParam());
   PrepareManifest(nullptr, nullptr, &log_writer_);
   // Write all column families but no log_number, next_file_number and
   // last_sequence.
@@ -3828,7 +3830,7 @@ class VersionSetTestMissingFiles : public VersionSetTestBase,
     ASSERT_OK(s);
     log_writer->reset(new log::Writer(std::move(file_writer), 0, false));
     VersionEdit new_db;
-    if (db_options_.write_dbid_to_manifest) {
+    if (imm_db_options_.write_dbid_to_manifest) {
       DBOptions tmp_db_options;
       tmp_db_options.env = env_;
       std::unique_ptr<DBImpl> impl(new DBImpl(tmp_db_options, dbname_));
@@ -4088,7 +4090,7 @@ TEST_F(VersionSetTestMissingFiles, NoFileMissing) {
 }
 
 TEST_F(VersionSetTestMissingFiles, MinLogNumberToKeep2PC) {
-  db_options_.allow_2pc = true;
+  imm_db_options_.allow_2pc = true;
   NewDB();
 
   SstInfo sst(100, kDefaultColumnFamilyName, "a", 0 /* level */,
diff --git a/db/version_util.h b/db/version_util.h
index 2690a00f48d9..7219d11c854b 100644
--- a/db/version_util.h
+++ b/db/version_util.h
@@ -1,4 +1,4 @@
-//  Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
 //  This source code is licensed under both the GPLv2 (found in the
 //  COPYING file in the root directory) and Apache 2.0 License
 //  (found in the LICENSE.Apache file in the root directory).
@@ -23,7 +23,8 @@ class OfflineManifestWriter {
         immutable_db_options_(WithDbPath(options, db_path)),
         tc_(NewLRUCache(1 << 20 /* capacity */,
                         options.table_cache_numshardbits)),
-        versions_(db_path, &immutable_db_options_, sopt_, tc_.get(), &wb_, &wc_,
+        versions_(db_path, &immutable_db_options_, MutableDBOptions{options},
+                  sopt_, tc_.get(), &wb_, &wc_,
                   /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
                   /*db_id=*/"", /*db_session_id=*/"",
                   options.daily_offpeak_time_utc,
diff --git a/db/wal_manager_test.cc b/db/wal_manager_test.cc
index 55736f2fdb5c..e674e7b778c9 100644
--- a/db/wal_manager_test.cc
+++ b/db/wal_manager_test.cc
@@ -55,8 +55,8 @@ class WalManagerTest : public testing::Test {
     }
 
     versions_.reset(new VersionSet(
-        dbname_, &db_options_, env_options_, table_cache_.get(),
-        &write_buffer_manager_, &write_controller_,
+        dbname_, &db_options_, MutableDBOptions{}, env_options_,
+        table_cache_.get(), &write_buffer_manager_, &write_controller_,
         /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
         /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"",
         /*error_handler=*/nullptr, /*read_only=*/false));
diff --git a/db_stress_tool/db_stress_common.h b/db_stress_tool/db_stress_common.h
index 9c3b6563f2f2..619c24e75b40 100644
--- a/db_stress_tool/db_stress_common.h
+++ b/db_stress_tool/db_stress_common.h
@@ -248,6 +248,7 @@ DECLARE_string(fs_uri);
 DECLARE_uint64(ops_per_thread);
 DECLARE_uint64(log2_keys_per_lock);
 DECLARE_uint64(max_manifest_file_size);
+DECLARE_int32(max_manifest_space_amp_pct);
 DECLARE_bool(in_place_update);
 DECLARE_string(memtablerep);
 DECLARE_int32(prefix_size);
diff --git a/db_stress_tool/db_stress_gflags.cc b/db_stress_tool/db_stress_gflags.cc
index 47b5f715fdb1..e9f7e172bd15 100644
--- a/db_stress_tool/db_stress_gflags.cc
+++ b/db_stress_tool/db_stress_gflags.cc
@@ -978,7 +978,11 @@ DEFINE_uint64(log2_keys_per_lock, 2, "Log2 of number of keys per lock");
 static const bool FLAGS_log2_keys_per_lock_dummy __attribute__((__unused__)) =
     RegisterFlagValidator(&FLAGS_log2_keys_per_lock, &ValidateUint32Range);
 
-DEFINE_uint64(max_manifest_file_size, 16384, "Maximum size of a MANIFEST file");
+DEFINE_uint64(max_manifest_file_size, 16384,
+              "Maximum size of a MANIFEST file (without auto-tuning)");
+
+DEFINE_int32(max_manifest_space_amp_pct, 500,
+             "Max manifest space amp percentage for auto-tuning");
 
 DEFINE_bool(in_place_update, false, "On true, does inplace update in memtable");
 
diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc
index 2d40136df4ab..6a37af5a4c66 100644
--- a/db_stress_tool/db_stress_test_base.cc
+++ b/db_stress_tool/db_stress_test_base.cc
@@ -4430,6 +4430,7 @@ void InitializeOptionsFromFlags(
     options.compression_opts.checksum = true;
   }
   options.max_manifest_file_size = FLAGS_max_manifest_file_size;
+  options.max_manifest_space_amp_pct = FLAGS_max_manifest_space_amp_pct;
   options.max_subcompactions = static_cast<uint32_t>(FLAGS_subcompactions);
   options.allow_concurrent_memtable_write =
       FLAGS_allow_concurrent_memtable_write;
diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index e932af5628c7..96342647d432 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -958,12 +958,67 @@ struct DBOptions {
   // Default: 0
   size_t recycle_log_file_num = 0;
 
-  // manifest file is rolled over on reaching this limit.
-  // The older manifest file be deleted.
-  // The default value is 1GB so that the manifest file can grow, but not
-  // reach the limit of storage capacity.
+  // The manifest file is rolled over on reaching this limit AND the
+  // space amp limit described in max_manifest_space_amp_pct. More trade-off
+  // details there.
+  //
+  // NOTE: this option used to be a hard limit, but that made this a dangerous
+  // tuning parameter for optimizing manifest file size because the best
+  // size really depends on the DB size and average SST file size (and other
+  // settings). Now it is essentially a minimum for the auto-tuned max manifest
+  // file size.
+  //
+  // Until the max_manifest_space_amp_pct feature is fully validated to show a
+  // smaller default here like 1MB is appropriate, the default value is 1GB to
+  // match historical behavior (without it being a hard limit in case of giant
+  // compacted manifest size).
+  //
+  // This option is mutable with SetDBOptions(), taking effect on the next
+  // manifest write (e.g. completed DB compaction or flush).
   uint64_t max_manifest_file_size = 1024 * 1024 * 1024;
 
+  // This option mostly replaces max_manifest_file_size to control an auto-tuned
+  // balance of manifest write amplification and space amplification. A new
+  // manifest file is created with the "compacted" contents of the old one when
+  //  current_manifest_size
+  //    >
+  //  max(max_manifest_file_size,
+  //      est_compacted_manifest_size * (1 + max_manifest_space_amp_pct/100))
+  //
+  // where est_compacted_manifest_size is an estimate of how big a new compacted
+  // version of the current manifest would be. Currently, the estimate used is
+  // the last newly-written manifest, in its "compacted" form.
+  //
+  // Space amplification in the manifest file might be less of a concern for
+  // primary storage space and more of a concern for DB recover time and size of
+  // backup files that aren't incremental between backups. To minimize manifest
+  // churn on initial DB population, setting max_manifest_file_size to something
+  // not too small, like 1MB, should suffice. Similarly, write amp on the
+  // manifest file is likely not a direct concern but completed compactions and
+  // flushes cannot (currently) be committed while the (relatively small)
+  // manifest file is being compacted. Manifest compactions should not
+  // interfere with user write latency or throughput unless the DB is
+  // chronically stalling or close to stalling writes already.
+  //
+  // For this option to have a meaningful effect, it is recommended to set
+  // max_manifest_file_size to something modest like 1MB. Then we can interpret
+  // values for this option as follows, starting with minimum space amp and
+  // maximum write amp:
+  // * 0 - Every manifest write (flush, compaction, etc.) generates a whole new
+  // manifest. Only useful for testing.
+  // * very small - Doesn't take many manifest writes to generate a whole new
+  // manifest.
+  // * 100 - In a DB with pretty consistent number of SST files, etc., achieves
+  // about 1.0 write amp (writing about 2x the theoretical minimum) and a max of
+  // about 1.0 space amp (manifest up to 2x the compacted size).
+  // * 500 - Recommended and default: 0.2 write amp and up to roughly 5.0 space
+  // amp.
+  // * 10000 - 0.01 write amp and up to 100 space amp on the manifest.
+  //
+  // This option is mutable with SetDBOptions(), taking effect on the next
+  // manifest write (e.g. completed DB compaction or flush).
+  int max_manifest_space_amp_pct = 500;
+
   // Number of shards used for table cache.
   int table_cache_numshardbits = 6;
 
diff --git a/options/db_options.cc b/options/db_options.cc
index 3e06c4ceb687..dfacea8e5b22 100644
--- a/options/db_options.cc
+++ b/options/db_options.cc
@@ -124,6 +124,18 @@ static std::unordered_map<std::string, OptionTypeInfo>
          {offsetof(struct MutableDBOptions, max_background_flushes),
           OptionType::kInt, OptionVerificationType::kNormal,
           OptionTypeFlags::kMutable}},
+        {"max_manifest_file_size",
+         {offsetof(struct MutableDBOptions, max_manifest_file_size),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"max_manifest_space_amp_pct",
+         {offsetof(struct MutableDBOptions, max_manifest_space_amp_pct),
+          OptionType::kInt, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"manifest_preallocation_size",
+         {offsetof(struct MutableDBOptions, manifest_preallocation_size),
+          OptionType::kSizeT, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
         {"daily_offpeak_time_utc",
          {offsetof(struct MutableDBOptions, daily_offpeak_time_utc),
           OptionType::kString, OptionVerificationType::kNormal,
@@ -288,10 +300,6 @@ static std::unordered_map<std::string, OptionTypeInfo>
          {offsetof(struct ImmutableDBOptions, log_file_time_to_roll),
           OptionType::kSizeT, OptionVerificationType::kNormal,
           OptionTypeFlags::kNone}},
-        {"manifest_preallocation_size",
-         {offsetof(struct ImmutableDBOptions, manifest_preallocation_size),
-          OptionType::kSizeT, OptionVerificationType::kNormal,
-          OptionTypeFlags::kNone}},
         {"max_log_file_size",
          {offsetof(struct ImmutableDBOptions, max_log_file_size),
           OptionType::kSizeT, OptionVerificationType::kNormal,
@@ -310,10 +318,6 @@ static std::unordered_map<std::string, OptionTypeInfo>
          {offsetof(struct ImmutableDBOptions, WAL_ttl_seconds),
           OptionType::kUInt64T, OptionVerificationType::kNormal,
           OptionTypeFlags::kNone}},
-        {"max_manifest_file_size",
-         {offsetof(struct ImmutableDBOptions, max_manifest_file_size),
-          OptionType::kUInt64T, OptionVerificationType::kNormal,
-          OptionTypeFlags::kNone}},
         {"persist_stats_to_disk",
          {offsetof(struct ImmutableDBOptions, persist_stats_to_disk),
           OptionType::kBoolean, OptionVerificationType::kNormal,
@@ -657,7 +661,7 @@ class DBOptionsConfigurable : public MutableDBConfigurable {
   explicit DBOptionsConfigurable(
       const DBOptions& opts,
       const std::unordered_map<std::string, std::string>* map = nullptr)
-      : MutableDBConfigurable(MutableDBOptions(opts), map), db_options_(opts) {
+      : MutableDBConfigurable(MutableDBOptions{opts}, map), db_options_(opts) {
     // The ImmutableDBOptions currently requires the env to be non-null.  Make
     // sure it is
     if (opts.env != nullptr) {
@@ -708,7 +712,7 @@ std::unique_ptr<Configurable> DBOptionsAsConfigurable(
   return ptr;
 }
 
-ImmutableDBOptions::ImmutableDBOptions() : ImmutableDBOptions(Options()) {}
+ImmutableDBOptions::ImmutableDBOptions() : ImmutableDBOptions(DBOptions{}) {}
 
 ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options)
     : create_if_missing(options.create_if_missing),
@@ -737,13 +741,11 @@ ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options)
       log_file_time_to_roll(options.log_file_time_to_roll),
       keep_log_file_num(options.keep_log_file_num),
       recycle_log_file_num(options.recycle_log_file_num),
-      max_manifest_file_size(options.max_manifest_file_size),
       table_cache_numshardbits(options.table_cache_numshardbits),
       WAL_ttl_seconds(options.WAL_ttl_seconds),
       WAL_size_limit_MB(options.WAL_size_limit_MB),
       max_write_batch_group_size_bytes(
           options.max_write_batch_group_size_bytes),
-      manifest_preallocation_size(options.manifest_preallocation_size),
       allow_mmap_reads(options.allow_mmap_reads),
       allow_mmap_writes(options.allow_mmap_writes),
       use_direct_reads(options.use_direct_reads),
@@ -850,9 +852,6 @@ void ImmutableDBOptions::Dump(Logger* log) const {
   ROCKS_LOG_HEADER(
       log, "                      Options.max_log_file_size: %" ROCKSDB_PRIszt,
       max_log_file_size);
-  ROCKS_LOG_HEADER(log,
-                   "                 Options.max_manifest_file_size: %" PRIu64,
-                   max_manifest_file_size);
   ROCKS_LOG_HEADER(
       log, "                  Options.log_file_time_to_roll: %" ROCKSDB_PRIszt,
       log_file_time_to_roll);
@@ -892,9 +891,6 @@ void ImmutableDBOptions::Dump(Logger* log) const {
                    "                       "
                    "Options.max_write_batch_group_size_bytes: %" PRIu64,
                    max_write_batch_group_size_bytes);
-  ROCKS_LOG_HEADER(
-      log, "            Options.manifest_preallocation_size: %" ROCKSDB_PRIszt,
-      manifest_preallocation_size);
   ROCKS_LOG_HEADER(log, "                    Options.is_fd_close_on_exec: %d",
                    is_fd_close_on_exec);
   ROCKS_LOG_HEADER(log, "                  Options.advise_random_on_open: %d",
@@ -1025,24 +1021,7 @@ const std::string& ImmutableDBOptions::GetWalDir(
   }
 }
 
-MutableDBOptions::MutableDBOptions()
-    : max_background_jobs(2),
-      max_background_compactions(-1),
-      max_subcompactions(0),
-      avoid_flush_during_shutdown(false),
-      writable_file_max_buffer_size(1024 * 1024),
-      delayed_write_rate(2 * 1024U * 1024U),
-      max_total_wal_size(0),
-      delete_obsolete_files_period_micros(6ULL * 60 * 60 * 1000000),
-      stats_dump_period_sec(600),
-      stats_persist_period_sec(600),
-      stats_history_buffer_size(1024 * 1024),
-      max_open_files(-1),
-      bytes_per_sync(0),
-      wal_bytes_per_sync(0),
-      strict_bytes_per_sync(false),
-      compaction_readahead_size(0),
-      max_background_flushes(-1) {}
+MutableDBOptions::MutableDBOptions() : MutableDBOptions(DBOptions{}) {}
 
 MutableDBOptions::MutableDBOptions(const DBOptions& options)
     : max_background_jobs(options.max_background_jobs),
@@ -1063,6 +1042,9 @@ MutableDBOptions::MutableDBOptions(const DBOptions& options)
       strict_bytes_per_sync(options.strict_bytes_per_sync),
       compaction_readahead_size(options.compaction_readahead_size),
       max_background_flushes(options.max_background_flushes),
+      max_manifest_file_size(options.max_manifest_file_size),
+      max_manifest_space_amp_pct(options.max_manifest_space_amp_pct),
+      manifest_preallocation_size(options.manifest_preallocation_size),
       daily_offpeak_time_utc(options.daily_offpeak_time_utc) {}
 
 void MutableDBOptions::Dump(Logger* log) const {
@@ -1107,6 +1089,15 @@ void MutableDBOptions::Dump(Logger* log) const {
                    compaction_readahead_size);
   ROCKS_LOG_HEADER(log, "                 Options.max_background_flushes: %d",
                    max_background_flushes);
+  ROCKS_LOG_HEADER(log,
+                   "                 Options.max_manifest_file_size: %" PRIu64,
+                   max_manifest_file_size);
+  ROCKS_LOG_HEADER(log,
+                   "                 Options.max_manifest_space_amp_pct: %d",
+                   max_manifest_space_amp_pct);
+  ROCKS_LOG_HEADER(
+      log, "            Options.manifest_preallocation_size: %" ROCKSDB_PRIszt,
+      manifest_preallocation_size);
   ROCKS_LOG_HEADER(log, "Options.daily_offpeak_time_utc: %s",
                    daily_offpeak_time_utc.c_str());
 }
diff --git a/options/db_options.h b/options/db_options.h
index c23a6f1c945f..ef8607d8bba1 100644
--- a/options/db_options.h
+++ b/options/db_options.h
@@ -47,12 +47,10 @@ struct ImmutableDBOptions {
   size_t log_file_time_to_roll;
   size_t keep_log_file_num;
   size_t recycle_log_file_num;
-  uint64_t max_manifest_file_size;
   int table_cache_numshardbits;
   uint64_t WAL_ttl_seconds;
   uint64_t WAL_size_limit_MB;
   uint64_t max_write_batch_group_size_bytes;
-  size_t manifest_preallocation_size;
   bool allow_mmap_reads;
   bool allow_mmap_writes;
   bool use_direct_reads;
@@ -146,6 +144,9 @@ struct MutableDBOptions {
   bool strict_bytes_per_sync;
   size_t compaction_readahead_size;
   int max_background_flushes;
+  uint64_t max_manifest_file_size;
+  int max_manifest_space_amp_pct;
+  size_t manifest_preallocation_size;
   std::string daily_offpeak_time_utc;
 };
 
diff --git a/options/options_helper.cc b/options/options_helper.cc
index efc91aa9f2f8..65404f112f26 100644
--- a/options/options_helper.cc
+++ b/options/options_helper.cc
@@ -99,13 +99,15 @@ void BuildDBOptions(const ImmutableDBOptions& immutable_db_options,
   options.log_file_time_to_roll = immutable_db_options.log_file_time_to_roll;
   options.keep_log_file_num = immutable_db_options.keep_log_file_num;
   options.recycle_log_file_num = immutable_db_options.recycle_log_file_num;
-  options.max_manifest_file_size = immutable_db_options.max_manifest_file_size;
+  options.max_manifest_file_size = mutable_db_options.max_manifest_file_size;
+  options.max_manifest_space_amp_pct =
+      mutable_db_options.max_manifest_space_amp_pct;
   options.table_cache_numshardbits =
       immutable_db_options.table_cache_numshardbits;
   options.WAL_ttl_seconds = immutable_db_options.WAL_ttl_seconds;
   options.WAL_size_limit_MB = immutable_db_options.WAL_size_limit_MB;
   options.manifest_preallocation_size =
-      immutable_db_options.manifest_preallocation_size;
+      mutable_db_options.manifest_preallocation_size;
   options.allow_mmap_reads = immutable_db_options.allow_mmap_reads;
   options.allow_mmap_writes = immutable_db_options.allow_mmap_writes;
   options.use_direct_reads = immutable_db_options.use_direct_reads;
diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc
index 3a00c768b6ed..b4880b754aa4 100644
--- a/options/options_settable_test.cc
+++ b/options/options_settable_test.cc
@@ -408,6 +408,7 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) {
                              "skip_stats_update_on_db_open=false;"
                              "skip_checking_sst_file_sizes_on_db_open=false;"
                              "max_manifest_file_size=4295009941;"
+                             "max_manifest_space_amp_pct=321;"
                              "db_log_dir=path/to/db_log_dir;"
                              "writable_file_max_buffer_size=1048576;"
                              "paranoid_checks=true;"
diff --git a/options/options_test.cc b/options/options_test.cc
index fc05e64ed79e..7ecde7885ba2 100644
--- a/options/options_test.cc
+++ b/options/options_test.cc
@@ -160,6 +160,7 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) {
       {"keep_log_file_num", "39"},
       {"recycle_log_file_num", "5"},
       {"max_manifest_file_size", "40"},
+      {"max_manifest_space_amp_pct", "42"},
       {"table_cache_numshardbits", "41"},
       {"WAL_ttl_seconds", "43"},
       {"WAL_size_limit_MB", "44"},
@@ -341,7 +342,8 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) {
   ASSERT_EQ(new_db_opt.log_file_time_to_roll, 38U);
   ASSERT_EQ(new_db_opt.keep_log_file_num, 39U);
   ASSERT_EQ(new_db_opt.recycle_log_file_num, 5U);
-  ASSERT_EQ(new_db_opt.max_manifest_file_size, static_cast<uint64_t>(40));
+  ASSERT_EQ(new_db_opt.max_manifest_file_size, uint64_t{40});
+  ASSERT_EQ(new_db_opt.max_manifest_space_amp_pct, 42);
   ASSERT_EQ(new_db_opt.table_cache_numshardbits, 41);
   ASSERT_EQ(new_db_opt.WAL_ttl_seconds, static_cast<uint64_t>(43));
   ASSERT_EQ(new_db_opt.WAL_size_limit_MB, static_cast<uint64_t>(44));
@@ -2468,6 +2470,7 @@ TEST_F(OptionsOldApiTest, GetOptionsFromMapTest) {
       {"keep_log_file_num", "39"},
       {"recycle_log_file_num", "5"},
       {"max_manifest_file_size", "40"},
+      {"max_manifest_space_amp_pct", "42"},
       {"table_cache_numshardbits", "41"},
       {"WAL_ttl_seconds", "43"},
       {"WAL_size_limit_MB", "44"},
@@ -2653,7 +2656,8 @@ TEST_F(OptionsOldApiTest, GetOptionsFromMapTest) {
   ASSERT_EQ(new_db_opt.log_file_time_to_roll, 38U);
   ASSERT_EQ(new_db_opt.keep_log_file_num, 39U);
   ASSERT_EQ(new_db_opt.recycle_log_file_num, 5U);
-  ASSERT_EQ(new_db_opt.max_manifest_file_size, static_cast<uint64_t>(40));
+  ASSERT_EQ(new_db_opt.max_manifest_file_size, uint64_t{40});
+  ASSERT_EQ(new_db_opt.max_manifest_space_amp_pct, 42);
   ASSERT_EQ(new_db_opt.table_cache_numshardbits, 41);
   ASSERT_EQ(new_db_opt.WAL_ttl_seconds, static_cast<uint64_t>(43));
   ASSERT_EQ(new_db_opt.WAL_size_limit_MB, static_cast<uint64_t>(44));
diff --git a/src.mk b/src.mk
index f4efad68bbc3..5eac640572d1 100644
--- a/src.mk
+++ b/src.mk
@@ -494,6 +494,7 @@ TEST_MAIN_SOURCES =                                                     \
   db/db_clip_test.cc                                                    \
   db/db_dynamic_level_test.cc                                           \
   db/db_encryption_test.cc                                              \
+  db/db_etc3_test.cc                                                    \
   db/db_flush_test.cc                                                   \
   db/db_follower_test.cc						                                    \
   db/db_readonly_with_timestamp_test.cc                                 \
diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc
index caf23ee61e7d..655bba868f6e 100644
--- a/tools/db_bench_tool.cc
+++ b/tools/db_bench_tool.cc
@@ -446,6 +446,14 @@ DEFINE_int64(db_write_buffer_size,
              ROCKSDB_NAMESPACE::Options().db_write_buffer_size,
              "Number of bytes to buffer in all memtables before compacting");
 
+DEFINE_int64(max_manifest_file_size,
+             ROCKSDB_NAMESPACE::Options().max_manifest_file_size,
+             "Max manifest file size (or minimum max with auto-tuning)");
+
+DEFINE_int32(max_manifest_space_amp_pct,
+             ROCKSDB_NAMESPACE::Options().max_manifest_space_amp_pct,
+             "Max manifest space amp percentage for auto-tuning");
+
 DEFINE_bool(cost_write_buffer_to_cache, false,
             "The usage of memtable is costed to the block cache");
 
@@ -4368,6 +4376,8 @@ class Benchmark {
       options.write_buffer_manager.reset(
           new WriteBufferManager(FLAGS_db_write_buffer_size, cache_));
     }
+    options.max_manifest_file_size = FLAGS_max_manifest_file_size;
+    options.max_manifest_space_amp_pct = FLAGS_max_manifest_space_amp_pct;
     options.arena_block_size = FLAGS_arena_block_size;
     options.write_buffer_size = FLAGS_write_buffer_size;
     options.max_write_buffer_number = FLAGS_max_write_buffer_number;
diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index 1b25f6a8ea43..64eb676d7cfc 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -245,8 +245,9 @@ def apply_random_seed_per_iteration():
     # Test small max_manifest_file_size in a smaller chance, as most of the
     # time we wnat manifest history to be preserved to help debug
     "max_manifest_file_size": lambda: random.choice(
-        [t * 16384 if t < 3 else 1024 * 1024 * 1024 for t in range(1, 30)]
+        [t * 2048 if t < 5 else 1024 * 1024 * 1024 for t in range(1, 30)]
     ),
+    "max_manifest_space_amp_pct": lambda: random.choice([0, 10, 100, 1000]),
     # Sync mode might make test runs slower so running it in a smaller chance
     "sync": lambda: random.choice([1 if t == 0 else 0 for t in range(0, 20)]),
     "bytes_per_sync": lambda: random.choice([0, 262144]),
diff --git a/tools/ldb_cmd.cc b/tools/ldb_cmd.cc
index 9ab70b97410b..328f7d875414 100644
--- a/tools/ldb_cmd.cc
+++ b/tools/ldb_cmd.cc
@@ -1610,7 +1610,8 @@ void DumpManifestFile(Options options, std::string file, bool verbose, bool hex,
   WriteController wc(options.delayed_write_rate);
   WriteBufferManager wb(options.db_write_buffer_size);
   ImmutableDBOptions immutable_db_options(options);
-  VersionSet versions(dbname, &immutable_db_options, sopt, tc.get(), &wb, &wc,
+  VersionSet versions(dbname, &immutable_db_options, MutableDBOptions{}, sopt,
+                      tc.get(), &wb, &wc,
                       /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
                       /*db_id=*/"", /*db_session_id=*/"",
                       options.daily_offpeak_time_utc,
@@ -1805,7 +1806,8 @@ Status GetLiveFilesChecksumInfoFromVersionSet(Options options,
   WriteController wc(options.delayed_write_rate);
   WriteBufferManager wb(options.db_write_buffer_size);
   ImmutableDBOptions immutable_db_options(options);
-  VersionSet versions(dbname, &immutable_db_options, sopt, tc.get(), &wb, &wc,
+  VersionSet versions(dbname, &immutable_db_options, MutableDBOptions{options},
+                      sopt, tc.get(), &wb, &wc,
                       /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
                       /*db_id=*/"", /*db_session_id=*/"",
                       options.daily_offpeak_time_utc,
@@ -2660,7 +2662,8 @@ Status ReduceDBLevelsCommand::GetOldNumOfLevels(Options& opt, int* levels) {
   const InternalKeyComparator cmp(opt.comparator);
   WriteController wc(opt.delayed_write_rate);
   WriteBufferManager wb(opt.db_write_buffer_size);
-  VersionSet versions(db_path_, &db_options, soptions, tc.get(), &wb, &wc,
+  VersionSet versions(db_path_, &db_options, MutableDBOptions{opt}, soptions,
+                      tc.get(), &wb, &wc,
                       /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
                       /*db_id=*/"", /*db_session_id=*/"",
                       opt.daily_offpeak_time_utc,
diff --git a/tools/ldb_cmd_test.cc b/tools/ldb_cmd_test.cc
index 5715f93db337..6943780f74cc 100644
--- a/tools/ldb_cmd_test.cc
+++ b/tools/ldb_cmd_test.cc
@@ -208,8 +208,9 @@ class FileChecksumTestHelper {
     WriteController wc(options_.delayed_write_rate);
     WriteBufferManager wb(options_.db_write_buffer_size);
     ImmutableDBOptions immutable_db_options(options_);
-    VersionSet versions(dbname_, &immutable_db_options, sopt, tc.get(), &wb,
-                        &wc, nullptr, nullptr, "", "",
+    VersionSet versions(dbname_, &immutable_db_options,
+                        MutableDBOptions{options_}, sopt, tc.get(), &wb, &wc,
+                        nullptr, nullptr, "", "",
                         options_.daily_offpeak_time_utc, nullptr,
                         /*read_only=*/false);
     std::vector<std::string> cf_name_list;
diff --git a/unreleased_history/new_features/auto_tune_manifest.md b/unreleased_history/new_features/auto_tune_manifest.md
new file mode 100644
index 000000000000..9bc95a05e2ee
--- /dev/null
+++ b/unreleased_history/new_features/auto_tune_manifest.md
@@ -0,0 +1 @@
+* Added an auto-tuning feature for DB manifest file size that also (by default) improves the safety of existing configurations in case `max_manifest_file_size` is repeatedly exceeded. The new recommendation is to set `max_manifest_file_size` to something small like 1MB and tune `max_manifest_space_amp_pct` as needed to balance write amp and space amp in the manifest. Refer to comments on those options in `DBOptions` for details. Both options are (now) mutable.
diff --git a/utilities/backup/backup_engine_test.cc b/utilities/backup/backup_engine_test.cc
index 51581fb00dda..9438b8574451 100644
--- a/utilities/backup/backup_engine_test.cc
+++ b/utilities/backup/backup_engine_test.cc
@@ -3541,6 +3541,7 @@ TEST_F(BackupEngineTest, EnvFailures) {
 TEST_F(BackupEngineTest, ChangeManifestDuringBackupCreation) {
   DestroyDBWithoutCheck(dbname_, options_);
   options_.max_manifest_file_size = 0;  // always rollover manifest for file add
+  options_.max_manifest_space_amp_pct = 0;
   OpenDBAndBackupEngine(true);
   FillDB(db_.get(), 0, 100, kAutoFlushOnly);
 
diff --git a/utilities/checkpoint/checkpoint_test.cc b/utilities/checkpoint/checkpoint_test.cc
index a514c3400f2d..f7ca4136e7d9 100644
--- a/utilities/checkpoint/checkpoint_test.cc
+++ b/utilities/checkpoint/checkpoint_test.cc
@@ -596,6 +596,7 @@ TEST_F(CheckpointTest, CheckpointCFNoFlush) {
 TEST_F(CheckpointTest, CurrentFileModifiedWhileCheckpointing) {
   Options options = CurrentOptions();
   options.max_manifest_file_size = 0;  // always rollover manifest for file add
+  options.max_manifest_space_amp_pct = 0;
   Reopen(options);
 
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(

From 2bee29729a208dd9863e39fb4400a42b60d89682 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Fri, 7 Nov 2025 10:36:34 -0800
Subject: [PATCH 368/500] CI: move valgrind to weekly (#14110)

Summary:
This test is now taking > 6 hours, timing out, and has low signal, so creating a weekly job for it, with an explicit timeout of 12 hours.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14110

Test Plan: watch CI

Reviewed By: virajthakur

Differential Revision: D86428262

Pulled By: pdillinger

fbshipit-source-id: 44103518064ca378f3fd2ff8d21967ede698c8ea
---
 .github/workflows/nightly.yml | 12 ------------
 .github/workflows/weekly.yml  | 20 ++++++++++++++++++++
 2 files changed, 20 insertions(+), 12 deletions(-)
 create mode 100644 .github/workflows/weekly.yml

diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml
index f8583e44244a..2fe599ef1a1a 100644
--- a/.github/workflows/nightly.yml
+++ b/.github/workflows/nightly.yml
@@ -91,18 +91,6 @@ jobs:
     - run: "USE_FOLLY=1 LIB_MODE=static DEBUG_LEVEL=0 V=1 make -j20 release"
     - run: "(mkdir build && cd build && cmake -DUSE_FOLLY=1 -DWITH_GFLAGS=1 -DROCKSDB_BUILD_SHARED=0 -DCMAKE_BUILD_TYPE=Release .. && make VERBOSE=1 -j20 && ctest -j20)"
     - uses: "./.github/actions/post-steps"
-  build-linux-valgrind:
-    if: ${{ github.repository_owner == 'facebook' }}
-    runs-on:
-      labels: 16-core-ubuntu
-    container:
-      image: ghcr.io/facebook/rocksdb_ubuntu:22.1
-      options: --shm-size=16gb
-    steps:
-    - uses: actions/checkout@v4.1.0
-    - uses: "./.github/actions/pre-steps"
-    - run: make V=1 -j32 valgrind_test
-    - uses: "./.github/actions/post-steps"
   build-windows-vs2022-avx2:
     if: ${{ github.repository_owner == 'facebook' }}
     runs-on: windows-2022
diff --git a/.github/workflows/weekly.yml b/.github/workflows/weekly.yml
new file mode 100644
index 000000000000..37d36513a783
--- /dev/null
+++ b/.github/workflows/weekly.yml
@@ -0,0 +1,20 @@
+name: facebook/rocksdb/weekly
+on:
+  schedule:
+  - cron: 0 9 * * 0
+  workflow_dispatch:
+permissions: {}
+jobs:
+  build-linux-valgrind:
+    if: ${{ github.repository_owner == 'facebook' }}
+    runs-on:
+      labels: 16-core-ubuntu
+    timeout-minutes: 840
+    container:
+      image: ghcr.io/facebook/rocksdb_ubuntu:22.1
+      options: --shm-size=16gb
+    steps:
+    - uses: actions/checkout@v4.1.0
+    - uses: "./.github/actions/pre-steps"
+    - run: make V=1 -j20 valgrind_test
+    - uses: "./.github/actions/post-steps"

From ea75cdc4939ddba5ed51eb93ca90eeaa0bc5a0b5 Mon Sep 17 00:00:00 2001
From: Changyu Bi <changyubi@meta.com>
Date: Fri, 7 Nov 2025 11:04:57 -0800
Subject: [PATCH 369/500] Fix a bug in MultiScan that moves iterator backward
 (#14106)

Summary:
MultiScanUnexpectedSeekTarget() currently uses user key comparison to decide on the next data block for multiscan. This can cause a multiscan to move backward in the following scenario:

data block 1: ..., k@7, k@6
data block 2: k@5, ...

DB iter scan through k@7, k@6 and k@5 and decides to seek to k@0 due to option [`max_sequential_skip_in_iterations`](https://github.com/facebook/rocksdb/blob/d56da8c112b4e6968fd79ce2bf15e6435df40656/include/rocksdb/advanced_options.h#L621-L629). Multiscan was on data block 2, but moves to data block 1 after the seek.

This can cause assertion failure in debug mode and seg fault in prod since older data blocks are unpinned and freed as we advanced a multiscan. This PR fixes the issue by forcing a multiscan to never go backward.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14106

Test Plan: - added a new unit test that reproduces the scenario: `./db_iterator_test --gtest_filter="*ReseekAcrossBlocksSameUserKey*"`

Reviewed By: xingbowang

Differential Revision: D86428845

Pulled By: cbi42

fbshipit-source-id: ab623f93e73298a60857fb2ff268366f289092a0
---
 db/db_iterator_test.cc                        | 63 +++++++++++++++++++
 .../block_based/block_based_table_iterator.cc | 25 ++++----
 .../block_based/block_based_table_iterator.h  |  3 +-
 .../bug_fixes/multiscan_backward_seek.md      |  1 +
 4 files changed, 77 insertions(+), 15 deletions(-)
 create mode 100644 unreleased_history/bug_fixes/multiscan_backward_seek.md

diff --git a/db/db_iterator_test.cc b/db/db_iterator_test.cc
index 842a38f4b35e..dc73938dad21 100644
--- a/db/db_iterator_test.cc
+++ b/db/db_iterator_test.cc
@@ -4671,6 +4671,69 @@ TEST_P(DBMultiScanIteratorTest, FragmentedRangeTombstones) {
   iter.reset();
 }
 
+TEST_P(DBMultiScanIteratorTest, ReseekAcrossBlocksSameUserKey) {
+  // This test exposes a bug where multiscan reseeks backwards when
+  // max_sequential_skip_in_iterations is triggered with the same user key
+  // spanning multiple data blocks.
+
+  auto options = CurrentOptions();
+  options.max_sequential_skip_in_iterations = 3;
+  options.compression = kNoCompression;
+
+  // Force each internal key into its own block
+  BlockBasedTableOptions table_options;
+  table_options.flush_block_policy_factory =
+      std::make_shared<FlushBlockEveryKeyPolicyFactory>();
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  DestroyAndReopen(options);
+
+  // Taking a snapshot after each Put to preserve all versions during flush.
+  std::vector<const Snapshot*> snapshots;
+  for (int i = 0; i < 7; ++i) {
+    ASSERT_OK(Put("key_a", "value_" + std::to_string(i)));
+    snapshots.push_back(db_->GetSnapshot());
+  }
+  ASSERT_OK(Put("key_b", "value_b"));
+
+  ASSERT_OK(Flush());
+  ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+  // Setup multiscan range covering both keys
+  std::vector<std::string> key_ranges({"key_a", "key_c"});
+  ReadOptions ro;
+  Slice ub = key_ranges[1];
+  ro.iterate_upper_bound = &ub;
+  ro.fill_cache = GetParam();
+
+  MultiScanArgs scan_options(BytewiseComparator());
+  scan_options.insert(key_ranges[0], key_ranges[1]);
+
+  ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily();
+  std::unique_ptr<Iterator> iter(dbfull()->NewIterator(ro, cfh));
+  ASSERT_NE(iter, nullptr);
+  iter->Prepare(scan_options);
+
+  std::vector<std::string> seen_keys;
+  std::vector<std::string> seen_values;
+  iter->Seek(key_ranges[0]);
+  while (iter->status().ok() && iter->Valid()) {
+    seen_keys.push_back(iter->key().ToString());
+    seen_values.push_back(iter->value().ToString());
+    iter->Next();
+  }
+  ASSERT_OK(iter->status()) << iter->status().ToString();
+
+  ASSERT_EQ(seen_keys.size(), 2) << "Should see key_a and key_b";
+  ASSERT_EQ(seen_keys[0], "key_a");
+  ASSERT_EQ(seen_keys[1], "key_b");
+  ASSERT_EQ(seen_values[0], "value_6");
+  ASSERT_EQ(seen_values[1], "value_b");
+
+  for (auto* snapshot : snapshots) {
+    db_->ReleaseSnapshot(snapshot);
+  }
+}
+
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/table/block_based/block_based_table_iterator.cc b/table/block_based/block_based_table_iterator.cc
index a505a8449329..65d1750cddf1 100644
--- a/table/block_based/block_based_table_iterator.cc
+++ b/table/block_based/block_based_table_iterator.cc
@@ -1165,17 +1165,10 @@ void BlockBasedTableIterator::SeekMultiScan(const Slice* seek_target) {
       }
       // It should only be possible to seek a key between the start of current
       // prepared scan and start of next prepared range.
-      MultiScanUnexpectedSeekTarget(
-          seek_target, &user_seek_target,
-          std::get<0>(multi_scan_->block_index_ranges_per_scan
-                          [multi_scan_->next_scan_idx - 1]));
+      MultiScanUnexpectedSeekTarget(seek_target, &user_seek_target);
     } else {
       // Case 2:
-      MultiScanUnexpectedSeekTarget(
-          seek_target, &user_seek_target,
-          std::get<0>(
-              multi_scan_
-                  ->block_index_ranges_per_scan[multi_scan_->next_scan_idx]));
+      MultiScanUnexpectedSeekTarget(seek_target, &user_seek_target);
     }
   } else {
     // Case 2:
@@ -1192,12 +1185,17 @@ void BlockBasedTableIterator::SeekMultiScan(const Slice* seek_target) {
       return;
     }
 
-    MultiScanSeekTargetFromBlock(seek_target, cur_scan_start_idx);
+    // max_sequential_skip_in_iterations can trigger a reseek on the start
+    // key of a scan range, even though the multiscan is already past
+    // `cur_scan_start_idx` (e.g., a user key spans multiple data blocks).
+    size_t block_idx =
+        std::max(cur_scan_start_idx, multi_scan_->cur_data_block_idx);
+    MultiScanSeekTargetFromBlock(seek_target, block_idx);
   }
 }
 
 void BlockBasedTableIterator::MultiScanUnexpectedSeekTarget(
-    const Slice* seek_target, const Slice* user_seek_target, size_t block_idx) {
+    const Slice* seek_target, const Slice* user_seek_target) {
   // linear search the block that contains the seek target, and unpin blocks
   // that are before it.
 
@@ -1253,8 +1251,9 @@ void BlockBasedTableIterator::MultiScanUnexpectedSeekTarget(
     }
   }
 
-  // Find the right block_idx;
-  block_idx = cur_scan_start_idx;
+  // Take the max here to ensure we don't move backwards.
+  size_t block_idx =
+      std::max(cur_scan_start_idx, multi_scan_->cur_data_block_idx);
   auto const& data_block_separators = multi_scan_->data_block_separators;
   while (block_idx < data_block_separators.size() &&
          (user_comparator_.CompareWithoutTimestamp(
diff --git a/table/block_based/block_based_table_iterator.h b/table/block_based/block_based_table_iterator.h
index a28133261559..8c44d0c9be0e 100644
--- a/table/block_based/block_based_table_iterator.h
+++ b/table/block_based/block_based_table_iterator.h
@@ -659,8 +659,7 @@ class BlockBasedTableIterator : public InternalIteratorBase<Slice> {
 
   void MultiScanSeekTargetFromBlock(const Slice* seek_target, size_t block_idx);
   void MultiScanUnexpectedSeekTarget(const Slice* seek_target,
-                                     const Slice* user_seek_target,
-                                     size_t block_idx);
+                                     const Slice* user_seek_target);
 
   // Return true, if there is an error, or end of file
   bool MultiScanLoadDataBlock(size_t idx) {
diff --git a/unreleased_history/bug_fixes/multiscan_backward_seek.md b/unreleased_history/bug_fixes/multiscan_backward_seek.md
new file mode 100644
index 000000000000..e800b2c067d1
--- /dev/null
+++ b/unreleased_history/bug_fixes/multiscan_backward_seek.md
@@ -0,0 +1 @@
+Fixed a bug in MultiScan where `max_sequential_skip_in_iterations` could cause the iterator to seek backward to already-unpinned blocks when the same user key spans multiple data blocks, leading to assertion failures or seg fault.

From 5879f8b62b86687da4fcf71601262d71a97dd006 Mon Sep 17 00:00:00 2001
From: Jay Huh <jewoongh@meta.com>
Date: Fri, 7 Nov 2025 14:22:00 -0800
Subject: [PATCH 370/500] Add option to verify block checksums of output files
 (#14103)

Summary:
For all compactions, RocksDB performs a lightweight sanity check on output SST files before installation (in `CompactionJob::VerifyOutputFiles()`). However, this lightweight check may not catch corruption that is small enough to allow the SST files to still be opened.

There is an existing feature, `paranoid_file_check`, which opens the SST file, iterates through all keys, and checks the hash of each key. While this provides the ultimate level of data integrity checking, it comes at a high computational cost.

In this PR, we introduce a new mutable CF option, `verify_output_flags`. The `verify_output_flags` is a bitmask enum that allows users to select various verification types, including block checksum verification, full key iteration, and file checksum verification (to be added in subsequent PRs). Note that the existing `paranoid_file_check` option is equivalent to a full key iteration check. Block-level checksum verification is much lighter than the full key iteration check.

Please note that the previously deprecated `verify_checksums_in_compaction` option (removed in version 5.3.0) was for verifying the checksum of **input SST files**. RocksDB continues to perform this verification for both local and remote compactions, and this behavior remains unchanged. In contrast, this PR focuses on verifying the **output SST files**.

## To follow up
- File-level Checksum verification for output SST files
- Deprecate `paranoid_file_checks` option in favor of the new option
- Add to stress test / db_bench

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14103

Test Plan:
New Unit Test added. The corruption is both detected by `paranoid_file_check` and various types of verification set by this new option, `verify_output_flags`
```
./compaction_service_test --gtest_filter="*CompactionServiceTest.CorruptedOutput*"
```

Reviewed By: pdillinger

Differential Revision: D86357924

Pulled By: jaykorean

fbshipit-source-id: a9e04798f249c7e977231e179622a0830d6675fe
---
 db/compaction/compaction_job.cc               | 96 ++++++++++++-------
 db/compaction/compaction_service_test.cc      | 82 ++++++++++++++++
 include/rocksdb/advanced_options.h            | 62 ++++++++++++
 options/cf_options.cc                         |  4 +
 options/cf_options.h                          |  3 +
 options/options_helper.cc                     |  2 +
 options/options_settable_test.cc              |  3 +-
 options/options_test.cc                       | 14 ++-
 table/block_based/block_based_table_reader.cc | 28 +++---
 table/block_based/block_based_table_reader.h  |  5 +-
 table/external_table.cc                       |  4 +-
 table/table_reader.h                          |  3 +-
 12 files changed, 254 insertions(+), 52 deletions(-)

diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc
index c21306e65cde..3f1fd9546d43 100644
--- a/db/compaction/compaction_job.cc
+++ b/db/compaction/compaction_job.cc
@@ -812,20 +812,19 @@ Status CompactionJob::SyncOutputDirectories() {
 Status CompactionJob::VerifyOutputFiles() {
   Status status;
   std::vector<port::Thread> thread_pool;
-  std::vector<const CompactionOutputs::Output*> files_output;
-  for (const auto& state : compact_->sub_compact_states) {
-    for (const auto& output : state.GetOutputs()) {
-      files_output.emplace_back(&output);
-    }
-  }
   ColumnFamilyData* cfd = compact_->compaction->column_family_data();
-  std::atomic<size_t> next_file_idx(0);
-  auto verify_table = [&](Status& output_status) {
-    while (true) {
-      size_t file_idx = next_file_idx.fetch_add(1);
-      if (file_idx >= files_output.size()) {
-        break;
-      }
+  VerifyOutputFlags verify_output_flags =
+      compact_->compaction->mutable_cf_options().verify_output_flags;
+
+  // For backward compatibility
+  if (paranoid_file_checks_) {
+    verify_output_flags |= VerifyOutputFlags::kVerifyIteration;
+    verify_output_flags |= VerifyOutputFlags::kEnableForLocalCompaction;
+    verify_output_flags |= VerifyOutputFlags::kEnableForRemoteCompaction;
+  }
+
+  auto verify_table = [&](SubcompactionState& subcompaction_state) {
+    for (const auto& output_file : subcompaction_state.GetOutputs()) {
       // Verify that the table is usable
       // We set for_compaction to false and don't
       // OptimizeForCompactionTableRead here because this is a special case
@@ -834,13 +833,19 @@ Status CompactionJob::VerifyOutputFiles() {
       // verification as user reads since the goal is to cache it here for
       // further user reads
       ReadOptions verify_table_read_options(Env::IOActivity::kCompaction);
+      verify_table_read_options.verify_checksums = true;
+      verify_table_read_options.readahead_size =
+          file_options_for_read_.compaction_readahead_size;
+
+      std::unique_ptr<TableReader> table_reader_guard;
+      TableReader* table_reader_ptr = table_reader_guard.get();
       verify_table_read_options.rate_limiter_priority =
           GetRateLimiterPriority();
       InternalIterator* iter = cfd->table_cache()->NewIterator(
           verify_table_read_options, file_options_, cfd->internal_comparator(),
-          files_output[file_idx]->meta,
+          output_file.meta,
           /*range_del_agg=*/nullptr, compact_->compaction->mutable_cf_options(),
-          /*table_reader_ptr=*/nullptr,
+          /*table_reader_ptr=*/&table_reader_ptr,
           cfd->internal_stats()->GetFileReadHist(
               compact_->compaction->output_level()),
           TableReaderCaller::kCompactionRefill, /*arena=*/nullptr,
@@ -850,38 +855,63 @@ Status CompactionJob::VerifyOutputFiles() {
           /*largest_compaction_key=*/nullptr,
           /*allow_unprepared_value=*/false);
       auto s = iter->status();
-
-      if (s.ok() && paranoid_file_checks_) {
-        OutputValidator validator(cfd->internal_comparator(),
-                                  /*_enable_hash=*/true);
-        for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
-          s = validator.Add(iter->key(), iter->value());
-          if (!s.ok()) {
-            break;
+      if (s.ok()) {
+        // Check for remote/local compaction and verify_output_flags flags
+        const bool should_verify =
+            (subcompaction_state.compaction_job_stats.is_remote_compaction &&
+             !!(verify_output_flags &
+                VerifyOutputFlags::kEnableForRemoteCompaction)) ||
+            (!subcompaction_state.compaction_job_stats.is_remote_compaction &&
+             !!(verify_output_flags &
+                VerifyOutputFlags::kEnableForLocalCompaction));
+
+        if (should_verify) {
+          const bool should_verify_block_checksum =
+              !!(verify_output_flags & VerifyOutputFlags::kVerifyBlockChecksum);
+          const bool should_verify_iteration =
+              !!(verify_output_flags & VerifyOutputFlags::kVerifyIteration);
+          if (should_verify_block_checksum) {
+            assert(table_reader_ptr != nullptr);
+            // If verifying iteration as well, verify meta blocks here only to
+            // avoid redundant checks on data blocks
+            s = table_reader_ptr->VerifyChecksum(
+                verify_table_read_options, TableReaderCaller::kCompaction,
+                /*meta_blocks_only=*/should_verify_iteration);
+          }
+          if (s.ok() && should_verify_iteration) {
+            OutputValidator validator(cfd->internal_comparator(),
+                                      /*_enable_hash=*/true);
+            for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+              s = validator.Add(iter->key(), iter->value());
+              if (!s.ok()) {
+                break;
+              }
+            }
+            if (s.ok()) {
+              s = iter->status();
+            }
+            if (s.ok() && !validator.CompareValidator(output_file.validator)) {
+              s = Status::Corruption(
+                  "Key-value checksum of compaction output doesn't match what "
+                  "was computed when written");
+            }
           }
-        }
-        if (s.ok()) {
-          s = iter->status();
-        }
-        if (s.ok() &&
-            !validator.CompareValidator(files_output[file_idx]->validator)) {
-          s = Status::Corruption("Paranoid checksums do not match");
         }
       }
 
       delete iter;
 
       if (!s.ok()) {
-        output_status = s;
+        subcompaction_state.status = s;
         break;
       }
     }
   };
   for (size_t i = 1; i < compact_->sub_compact_states.size(); i++) {
     thread_pool.emplace_back(verify_table,
-                             std::ref(compact_->sub_compact_states[i].status));
+                             std::ref(compact_->sub_compact_states[i]));
   }
-  verify_table(compact_->sub_compact_states[0].status);
+  verify_table(compact_->sub_compact_states[0]);
   for (auto& thread : thread_pool) {
     thread.join();
   }
diff --git a/db/compaction/compaction_service_test.cc b/db/compaction/compaction_service_test.cc
index 1127244d6247..bd0a52e4559c 100644
--- a/db/compaction/compaction_service_test.cc
+++ b/db/compaction/compaction_service_test.cc
@@ -1047,6 +1047,7 @@ TEST_F(CompactionServiceTest, CorruptedOutputParanoidFileCheck) {
     Destroy(options);
     options.disable_auto_compactions = true;
     options.paranoid_file_checks = paranoid_file_check_enabled;
+    options.verify_output_flags = VerifyOutputFlags::kVerifyNone;
     ReopenWithCompactionService(&options);
     GenerateTestData();
 
@@ -1101,6 +1102,87 @@ TEST_F(CompactionServiceTest, CorruptedOutputParanoidFileCheck) {
   }
 }
 
+TEST_F(CompactionServiceTest, CorruptedOutputVerifyOutputFlags) {
+  for (VerifyOutputFlags verify_output_flags :
+       {VerifyOutputFlags::kVerifyNone,
+        VerifyOutputFlags::kEnableForLocalCompaction |
+            VerifyOutputFlags::kVerifyBlockChecksum,
+        VerifyOutputFlags::kEnableForRemoteCompaction |
+            VerifyOutputFlags::kVerifyBlockChecksum,
+        VerifyOutputFlags::kEnableForRemoteCompaction |
+            VerifyOutputFlags::kVerifyIteration,
+        VerifyOutputFlags::kVerifyAll}) {
+    SCOPED_TRACE(
+        "verify_output_flags=" +
+        std::to_string(static_cast<std::underlying_type_t<VerifyOutputFlags>>(
+            verify_output_flags)));
+
+    Options options = CurrentOptions();
+    Destroy(options);
+    options.disable_auto_compactions = true;
+    options.paranoid_file_checks = false;
+    options.verify_output_flags = verify_output_flags;
+    ReopenWithCompactionService(&options);
+    GenerateTestData();
+
+    auto my_cs = GetCompactionService();
+
+    std::string start_str = Key(15);
+    std::string end_str = Key(45);
+    Slice start(start_str);
+    Slice end(end_str);
+    uint64_t comp_num = my_cs->GetCompactionNum();
+
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+        "CompactionServiceCompactionJob::Run:0", [&](void* arg) {
+          CompactionServiceResult* compaction_result =
+              *(static_cast<CompactionServiceResult**>(arg));
+          ASSERT_TRUE(compaction_result != nullptr &&
+                      !compaction_result->output_files.empty());
+          // Corrupt files here
+          for (const auto& output_file : compaction_result->output_files) {
+            std::string file_name =
+                compaction_result->output_path + "/" + output_file.file_name;
+
+            // Corrupt very small range of bytes. This corruption is so small
+            // that this isn't caught by default light-weight check
+            ASSERT_OK(test::CorruptFile(env_, file_name, 0, 1,
+                                        false /* verifyChecksum */));
+          }
+        });
+    SyncPoint::GetInstance()->EnableProcessing();
+    const bool is_enabled_for_remote_compaction =
+        !!(verify_output_flags & VerifyOutputFlags::kEnableForRemoteCompaction);
+    const bool should_verify_block_checksum =
+        !!(verify_output_flags & VerifyOutputFlags::kVerifyBlockChecksum);
+    const bool should_verify_iteration =
+        !!(verify_output_flags & VerifyOutputFlags::kVerifyIteration);
+
+    Status s = db_->CompactRange(CompactRangeOptions(), &start, &end);
+    if (is_enabled_for_remote_compaction &&
+        (should_verify_block_checksum || should_verify_iteration)) {
+      ASSERT_NOK(s);
+      ASSERT_TRUE(s.IsCorruption());
+    } else {
+      // CompactRange() goes through if block checksum wasn't verified
+      ASSERT_OK(s);
+    }
+
+    ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1);
+
+    SyncPoint::GetInstance()->DisableProcessing();
+    SyncPoint::GetInstance()->ClearAllCallBacks();
+
+    // On the worker side, the compaction is considered success
+    // Verification is done on the primary side
+    CompactionServiceResult result;
+    my_cs->GetResult(&result);
+    ASSERT_OK(result.status);
+    ASSERT_TRUE(result.stats.is_manual_compaction);
+    ASSERT_TRUE(result.stats.is_remote_compaction);
+  }
+}
+
 TEST_F(CompactionServiceTest, TruncatedOutput) {
   Options options = CurrentOptions();
   options.disable_auto_compactions = true;
diff --git a/include/rocksdb/advanced_options.h b/include/rocksdb/advanced_options.h
index dba041b0f4eb..ffa5d5a2659c 100644
--- a/include/rocksdb/advanced_options.h
+++ b/include/rocksdb/advanced_options.h
@@ -156,6 +156,61 @@ enum class PrepopulateBlobCache : uint8_t {
   kFlushOnly = 0x1,  // Prepopulate blobs during flush only
 };
 
+// Bitmask enum for verify output flags during compaction.
+// This allows fine-grained control over what verification is performed
+// on compaction output files and when it's enabled.
+enum class VerifyOutputFlags : uint32_t {
+  kVerifyNone = 0x0,  // No verification
+
+  // First set of bits: type of verifications
+  kVerifyBlockChecksum = 1 << 0,  // Verify block checksums
+  kVerifyIteration = 1 << 1,      // Verify iteration and full key/value hash
+                                  // by comparing the one inserted into a
+                                  // file, and what is read back.
+
+  // TODO - Implement
+  // kVerifyFileChecksum = 1 << 2,   // Verify file-level checksum
+
+  // Second set of bits: when to enable verification
+  kEnableForLocalCompaction = 1 << 10,   // Enable for local compaction
+  kEnableForRemoteCompaction = 1 << 11,  // Enable for remote compaction
+
+  // TODO - Implement
+  // kEnableForFlush = 1 << 12,  // Enable for flush
+
+  kVerifyAll = 0xFFFFFFFF,
+};
+
+inline VerifyOutputFlags operator|(VerifyOutputFlags lhs,
+                                   VerifyOutputFlags rhs) {
+  using T = std::underlying_type_t<VerifyOutputFlags>;
+  return static_cast<VerifyOutputFlags>(static_cast<T>(lhs) |
+                                        static_cast<T>(rhs));
+}
+
+inline VerifyOutputFlags& operator|=(VerifyOutputFlags& lhs,
+                                     VerifyOutputFlags rhs) {
+  lhs = lhs | rhs;
+  return lhs;
+}
+
+inline VerifyOutputFlags operator&(VerifyOutputFlags lhs,
+                                   VerifyOutputFlags rhs) {
+  using T = std::underlying_type_t<VerifyOutputFlags>;
+  return static_cast<VerifyOutputFlags>(static_cast<T>(lhs) &
+                                        static_cast<T>(rhs));
+}
+
+inline VerifyOutputFlags& operator&=(VerifyOutputFlags& lhs,
+                                     VerifyOutputFlags rhs) {
+  lhs = lhs & rhs;
+  return lhs;
+}
+
+inline bool operator!(VerifyOutputFlags flag) {
+  return flag == VerifyOutputFlags::kVerifyNone;
+}
+
 struct AdvancedColumnFamilyOptions {
   // The maximum number of write buffers that are built up in memory.
   // The default and the minimum number is 2, so that when 1 write buffer
@@ -715,6 +770,13 @@ struct AdvancedColumnFamilyOptions {
   // Dynamically changeable through SetOptions() API
   bool paranoid_file_checks = false;
 
+  // Bitmask enum for output verification option.
+  //
+  // Default: 0 (kVerifyNone)
+  //
+  // Dynamically changeable (as a uint32_t) through SetOptions() API.
+  VerifyOutputFlags verify_output_flags = VerifyOutputFlags::kVerifyNone;
+
   // In debug mode, RocksDB runs consistency checks on the LSM every time the
   // LSM changes (Flush, Compaction, AddFile). When this option is true, these
   // checks are also enabled in release mode. These checks were historically
diff --git a/options/cf_options.cc b/options/cf_options.cc
index f0e9e26b43b5..eca2cd930966 100644
--- a/options/cf_options.cc
+++ b/options/cf_options.cc
@@ -395,6 +395,10 @@ static std::unordered_map<std::string, OptionTypeInfo>
          {offsetof(struct MutableCFOptions, paranoid_file_checks),
           OptionType::kBoolean, OptionVerificationType::kNormal,
           OptionTypeFlags::kMutable}},
+        {"verify_output_flags",
+         {offsetof(struct MutableCFOptions, verify_output_flags),
+          OptionType::kUInt32T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
         {"verify_checksums_in_compaction",
          {0, OptionType::kBoolean, OptionVerificationType::kDeprecated,
           OptionTypeFlags::kMutable}},
diff --git a/options/cf_options.h b/options/cf_options.h
index 539ddf494f75..3f5804445142 100644
--- a/options/cf_options.h
+++ b/options/cf_options.h
@@ -145,6 +145,7 @@ struct MutableCFOptions {
         preclude_last_level_data_seconds(
             options.preclude_last_level_data_seconds),
         preserve_internal_time_seconds(options.preserve_internal_time_seconds),
+        verify_output_flags(options.verify_output_flags),
         enable_blob_files(options.enable_blob_files),
         min_blob_size(options.min_blob_size),
         blob_file_size(options.blob_file_size),
@@ -216,6 +217,7 @@ struct MutableCFOptions {
         compaction_options_fifo(),
         preclude_last_level_data_seconds(0),
         preserve_internal_time_seconds(0),
+        verify_output_flags(VerifyOutputFlags::kVerifyNone),
         enable_blob_files(false),
         min_blob_size(0),
         blob_file_size(0),
@@ -317,6 +319,7 @@ struct MutableCFOptions {
   CompactionOptionsUniversal compaction_options_universal;
   uint64_t preclude_last_level_data_seconds;
   uint64_t preserve_internal_time_seconds;
+  VerifyOutputFlags verify_output_flags;
 
   // Blob file related options
   bool enable_blob_files;
diff --git a/options/options_helper.cc b/options/options_helper.cc
index 65404f112f26..e5622d0a3238 100644
--- a/options/options_helper.cc
+++ b/options/options_helper.cc
@@ -274,6 +274,8 @@ void UpdateColumnFamilyOptions(const MutableCFOptions& moptions,
   cf_opts->compaction_options_fifo = moptions.compaction_options_fifo;
   cf_opts->compaction_options_universal = moptions.compaction_options_universal;
 
+  cf_opts->verify_output_flags = moptions.verify_output_flags;
+
   // Blob file related options
   cf_opts->enable_blob_files = moptions.enable_blob_files;
   cf_opts->min_blob_size = moptions.min_blob_size;
diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc
index b4880b754aa4..c752b2401718 100644
--- a/options/options_settable_test.cc
+++ b/options/options_settable_test.cc
@@ -689,7 +689,8 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) {
       "memtable_veirfy_per_key_checksum_on_seek=1;"
       "memtable_op_scan_flush_trigger=123;"
       "memtable_avg_op_scan_flush_trigger=12;"
-      "cf_allow_ingest_behind=1;",
+      "cf_allow_ingest_behind=1;"
+      "verify_output_flags=2049;",
       new_options));
 
   ASSERT_NE(new_options->blob_cache.get(), nullptr);
diff --git a/options/options_test.cc b/options/options_test.cc
index 7ecde7885ba2..7111872f541b 100644
--- a/options/options_test.cc
+++ b/options/options_test.cc
@@ -1722,13 +1722,24 @@ TEST_F(OptionsTest, MutableCFOptions) {
 
   ASSERT_OK(GetColumnFamilyOptionsFromString(
       config_options, cf_opts,
-      "paranoid_file_checks=true; block_based_table_factory.block_align=false; "
+      "paranoid_file_checks=true; "
+      "verify_output_flags=2049; "
+      "block_based_table_factory.block_align=false; "
       "block_based_table_factory.super_block_alignment_size=65536; "
       "block_based_table_factory.super_block_alignment_space_overhead_ratio="
       "4096; "
       "block_based_table_factory.block_size=8192;",
       &cf_opts));
   ASSERT_TRUE(cf_opts.paranoid_file_checks);
+  ASSERT_NE(
+      (cf_opts.verify_output_flags & VerifyOutputFlags::kVerifyBlockChecksum),
+      VerifyOutputFlags::kVerifyNone);
+  ASSERT_NE((cf_opts.verify_output_flags &
+             VerifyOutputFlags::kEnableForRemoteCompaction),
+            VerifyOutputFlags::kVerifyNone);
+  ASSERT_EQ((cf_opts.verify_output_flags &
+             VerifyOutputFlags::kEnableForLocalCompaction),
+            VerifyOutputFlags::kVerifyNone);
   ASSERT_NE(cf_opts.table_factory.get(), nullptr);
   auto* bbto = cf_opts.table_factory->GetOptions<BlockBasedTableOptions>();
   ASSERT_NE(bbto, nullptr);
@@ -2584,6 +2595,7 @@ TEST_F(OptionsOldApiTest, GetOptionsFromMapTest) {
   ASSERT_EQ(new_cf_opt.optimize_filters_for_hits, true);
   ASSERT_EQ(new_cf_opt.prefix_extractor->AsString(), "rocksdb.FixedPrefix.31");
   ASSERT_EQ(new_cf_opt.experimental_mempurge_threshold, 0.003);
+  ASSERT_EQ(new_cf_opt.verify_output_flags, VerifyOutputFlags::kVerifyNone);
   ASSERT_EQ(new_cf_opt.enable_blob_files, true);
   ASSERT_EQ(new_cf_opt.min_blob_size, 1ULL << 10);
   ASSERT_EQ(new_cf_opt.blob_file_size, 1ULL << 30);
diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc
index a0eba4f009b5..581eecc0d90f 100644
--- a/table/block_based/block_based_table_reader.cc
+++ b/table/block_based/block_based_table_reader.cc
@@ -2705,7 +2705,7 @@ Status BlockBasedTable::Prefetch(const ReadOptions& read_options,
   }
   BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch};
   IndexBlockIter iiter_on_stack;
-  auto iiter = NewIndexIterator(read_options, /*need_upper_bound_check=*/false,
+  auto iiter = NewIndexIterator(read_options, /*disable_prefix_seek=*/false,
                                 &iiter_on_stack, /*get_context=*/nullptr,
                                 &lookup_context);
   std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
@@ -2742,7 +2742,7 @@ Status BlockBasedTable::Prefetch(const ReadOptions& read_options,
     DataBlockIter biter;
     Status tmp_status;
     NewDataBlockIterator<DataBlockIter>(
-        read_options, block_handle, &biter, /*type=*/BlockType::kData,
+        read_options, block_handle, &biter, /*block_type=*/BlockType::kData,
         /*get_context=*/nullptr, &lookup_context,
         /*prefetch_buffer=*/nullptr, /*for_compaction=*/false,
         /*async_read=*/false, tmp_status, /*use_block_cache_for_lookup=*/true);
@@ -2757,7 +2757,8 @@ Status BlockBasedTable::Prefetch(const ReadOptions& read_options,
 }
 
 Status BlockBasedTable::VerifyChecksum(const ReadOptions& read_options,
-                                       TableReaderCaller caller) {
+                                       TableReaderCaller caller,
+                                       bool meta_blocks_only) {
   Status s;
   // Check Meta blocks
   std::unique_ptr<Block> metaindex;
@@ -2772,6 +2773,9 @@ Status BlockBasedTable::VerifyChecksum(const ReadOptions& read_options,
   } else {
     return s;
   }
+  if (meta_blocks_only) {
+    return s;
+  }
   // Check Data blocks
   IndexBlockIter iiter_on_stack;
   BlockCacheLookupContext context{caller};
@@ -2967,7 +2971,7 @@ bool BlockBasedTable::TEST_BlockInCache(const BlockHandle& handle) const {
 bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options,
                                       const Slice& key) {
   std::unique_ptr<InternalIteratorBase<IndexValue>> iiter(NewIndexIterator(
-      options, /*need_upper_bound_check=*/false, /*input_iter=*/nullptr,
+      options, /*disable_prefix_seek=*/false, /*input_iter=*/nullptr,
       /*get_context=*/nullptr, /*lookup_context=*/nullptr));
   iiter->Seek(key);
   assert(iiter->status().ok());
@@ -3174,9 +3178,9 @@ bool BlockBasedTable::TEST_IndexBlockInCache() const {
 Status BlockBasedTable::GetKVPairsFromDataBlocks(
     const ReadOptions& read_options, std::vector<KVPairBlock>* kv_pair_blocks) {
   std::unique_ptr<InternalIteratorBase<IndexValue>> blockhandles_iter(
-      NewIndexIterator(read_options, /*need_upper_bound_check=*/false,
+      NewIndexIterator(read_options, /*disable_prefix_seek=*/false,
                        /*input_iter=*/nullptr, /*get_context=*/nullptr,
-                       /*lookup_contex=*/nullptr));
+                       /*lookup_context=*/nullptr));
 
   Status s = blockhandles_iter->status();
   if (!s.ok()) {
@@ -3196,7 +3200,7 @@ Status BlockBasedTable::GetKVPairsFromDataBlocks(
     Status tmp_status;
     datablock_iter.reset(NewDataBlockIterator<DataBlockIter>(
         read_options, blockhandles_iter->value().handle,
-        /*input_iter=*/nullptr, /*type=*/BlockType::kData,
+        /*input_iter=*/nullptr, /*block_type=*/BlockType::kData,
         /*get_context=*/nullptr, /*lookup_context=*/nullptr,
         /*prefetch_buffer=*/nullptr, /*for_compaction=*/false,
         /*async_read=*/false, tmp_status, /*use_block_cache_for_lookup=*/true));
@@ -3347,9 +3351,9 @@ Status BlockBasedTable::DumpIndexBlock(std::ostream& out_stream) {
   // TODO: plumb Env::IOActivity, Env::IOPriority
   const ReadOptions read_options;
   std::unique_ptr<InternalIteratorBase<IndexValue>> blockhandles_iter(
-      NewIndexIterator(read_options, /*need_upper_bound_check=*/false,
+      NewIndexIterator(read_options, /*disable_prefix_seek=*/false,
                        /*input_iter=*/nullptr, /*get_context=*/nullptr,
-                       /*lookup_contex=*/nullptr));
+                       /*lookup_context=*/nullptr));
   Status s = blockhandles_iter->status();
   if (!s.ok()) {
     out_stream << "Can not read Index Block \n\n";
@@ -3398,9 +3402,9 @@ Status BlockBasedTable::DumpDataBlocks(std::ostream& out_stream) {
   // TODO: plumb Env::IOActivity, Env::IOPriority
   const ReadOptions read_options;
   std::unique_ptr<InternalIteratorBase<IndexValue>> blockhandles_iter(
-      NewIndexIterator(read_options, /*need_upper_bound_check=*/false,
+      NewIndexIterator(read_options, /*disable_prefix_seek=*/false,
                        /*input_iter=*/nullptr, /*get_context=*/nullptr,
-                       /*lookup_contex=*/nullptr));
+                       /*lookup_context=*/nullptr));
   Status s = blockhandles_iter->status();
   if (!s.ok()) {
     out_stream << "Can not read Index Block \n\n";
@@ -3433,7 +3437,7 @@ Status BlockBasedTable::DumpDataBlocks(std::ostream& out_stream) {
     Status tmp_status;
     datablock_iter.reset(NewDataBlockIterator<DataBlockIter>(
         read_options, blockhandles_iter->value().handle,
-        /*input_iter=*/nullptr, /*type=*/BlockType::kData,
+        /*input_iter=*/nullptr, /*block_type=*/BlockType::kData,
         /*get_context=*/nullptr, /*lookup_context=*/nullptr,
         /*prefetch_buffer=*/nullptr, /*for_compaction=*/false,
         /*async_read=*/false, tmp_status, /*use_block_cache_for_lookup=*/true));
diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h
index fb4bc998300b..bac27ff18f78 100644
--- a/table/block_based/block_based_table_reader.h
+++ b/table/block_based/block_based_table_reader.h
@@ -211,7 +211,8 @@ class BlockBasedTable : public TableReader {
   Status DumpTable(WritableFile* out_file) override;
 
   Status VerifyChecksum(const ReadOptions& readOptions,
-                        TableReaderCaller caller) override;
+                        TableReaderCaller caller,
+                        bool meta_blocks_only = false) override;
 
   void MarkObsolete(uint32_t uncache_aggressiveness) override;
 
@@ -429,7 +430,7 @@ class BlockBasedTable : public TableReader {
   //  3. We disallowed any io to be performed, that is, read_options ==
   //     kBlockCacheTier
   InternalIteratorBase<IndexValue>* NewIndexIterator(
-      const ReadOptions& read_options, bool need_upper_bound_check,
+      const ReadOptions& read_options, bool disable_prefix_seek,
       IndexBlockIter* input_iter, GetContext* get_context,
       BlockCacheLookupContext* lookup_context) const;
 
diff --git a/table/external_table.cc b/table/external_table.cc
index 514cf14b1e62..5fc20f406929 100644
--- a/table/external_table.cc
+++ b/table/external_table.cc
@@ -239,8 +239,8 @@ class ExternalTableReaderAdapter : public TableReader {
         "Get() not supported on external file iterator");
   }
 
-  virtual Status VerifyChecksum(const ReadOptions& /*ro*/,
-                                TableReaderCaller /*caller*/) override {
+  Status VerifyChecksum(const ReadOptions& /*ro*/, TableReaderCaller /*caller*/,
+                        bool /*meta_blocks_only*/ = false) override {
     return Status::OK();
   }
 
diff --git a/table/table_reader.h b/table/table_reader.h
index a9d46499bd06..354557db4aa5 100644
--- a/table/table_reader.h
+++ b/table/table_reader.h
@@ -185,7 +185,8 @@ class TableReader {
 
   // check whether there is corruption in this db file
   virtual Status VerifyChecksum(const ReadOptions& /*read_options*/,
-                                TableReaderCaller /*caller*/) {
+                                TableReaderCaller /*caller*/,
+                                bool /*meta_blocks_only*/ = false) {
     return Status::NotSupported("VerifyChecksum() not supported");
   }
 

From b897c3789b886f7e6dee6f22b8d29034dc3559b0 Mon Sep 17 00:00:00 2001
From: ngina <221624547+nmk70@users.noreply.github.com>
Date: Mon, 10 Nov 2025 14:47:36 -0800
Subject: [PATCH 371/500] Merge BuiltinFilterBitsBuilder into FilterBitsBuilder
 for accurate filter size estimation (#14111)

Summary:
**Summary:**
Merge the BuiltinFilterBitsBuilder into FilterBitsBuilder.  This enables using
CalculateSpace() for accurate filter size estimation instead of hardcoded
bits-per-key which could result in incorrect estimations for different filter types.
The previous hardcoded estimate of 15 bits per key was in the filter block builders UpdateFilterSizeEstimate().

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14111

Test Plan: - Existing filter tests pass (bloom_test, full_filter_block_test, filter_bench, db_bloom_filter_test)

Reviewed By: pdillinger

Differential Revision: D86473287

Pulled By: nmk70

fbshipit-source-id: cd4a47351e67444e944d5b1b375b3b13274dd6e3
---
 db/db_bloom_filter_test.cc                  | 10 ++++++-
 table/block_based/filter_policy.cc          |  6 ++--
 table/block_based/filter_policy_internal.h  | 31 +++++++++------------
 table/block_based/full_filter_block.cc      |  4 +--
 table/block_based/full_filter_block_test.cc | 15 ++++++++++
 util/bloom_test.cc                          |  7 ++---
 util/filter_bench.cc                        |  7 ++---
 7 files changed, 45 insertions(+), 35 deletions(-)

diff --git a/db/db_bloom_filter_test.cc b/db/db_bloom_filter_test.cc
index edb02920e72d..c268400c78fe 100644
--- a/db/db_bloom_filter_test.cc
+++ b/db/db_bloom_filter_test.cc
@@ -710,12 +710,20 @@ class AlwaysTrueBitsBuilder : public FilterBitsBuilder {
     count_ = 0;
     // Interpreted as "always true" filter (0 probes over 1 byte of
     // payload, 5 bytes metadata)
-    return Slice("\0\0\0\0\0\0", 6);
+    return Slice("\0\0\0\0\0\0", kAlwaysTrueFilterBytes);
   }
   using FilterBitsBuilder::Finish;
   size_t ApproximateNumEntries(size_t) override { return SIZE_MAX; }
+  size_t CalculateSpace(size_t /* num_entries */) override {
+    return kAlwaysTrueFilterBytes;
+  }
+  double EstimatedFpRate(size_t /* num_entries */,
+                         size_t /* bytes */) override {
+    return 1.0;
+  }
 
  private:
+  static constexpr size_t kAlwaysTrueFilterBytes = 6;
   size_t count_ = 0;
 };
 
diff --git a/table/block_based/filter_policy.cc b/table/block_based/filter_policy.cc
index cf83cf084575..cdc4c144c369 100644
--- a/table/block_based/filter_policy.cc
+++ b/table/block_based/filter_policy.cc
@@ -17,7 +17,6 @@
 #include <limits>
 #include <memory>
 
-#include "cache/cache_entry_roles.h"
 #include "cache/cache_reservation_manager.h"
 #include "logging/logging.h"
 #include "port/lang.h"
@@ -31,7 +30,6 @@
 #include "table/block_based/full_filter_block.h"
 #include "util/atomic.h"
 #include "util/bloom_impl.h"
-#include "util/coding.h"
 #include "util/hash.h"
 #include "util/math.h"
 #include "util/ribbon_config.h"
@@ -62,7 +60,7 @@ Slice FinishAlwaysTrue(std::unique_ptr<const char[]>* /*buf*/) {
 
 // Base class for filter builders using the XXH3 preview hash,
 // also known as Hash64 or GetSliceHash64.
-class XXPH3FilterBitsBuilder : public BuiltinFilterBitsBuilder {
+class XXPH3FilterBitsBuilder : public FilterBitsBuilder {
  public:
   explicit XXPH3FilterBitsBuilder(
       std::atomic<int64_t>* aggregate_rounding_balance,
@@ -1078,7 +1076,7 @@ class Standard128RibbonBitsReader : public BuiltinFilterBitsReader {
 
 using LegacyBloomImpl = LegacyLocalityBloomImpl</*ExtraRotates*/ false>;
 
-class LegacyBloomBitsBuilder : public BuiltinFilterBitsBuilder {
+class LegacyBloomBitsBuilder : public FilterBitsBuilder {
  public:
   explicit LegacyBloomBitsBuilder(const int bits_per_key, Logger* info_log);
 
diff --git a/table/block_based/filter_policy_internal.h b/table/block_based/filter_policy_internal.h
index a823bf059732..3e6df57194dc 100644
--- a/table/block_based/filter_policy_internal.h
+++ b/table/block_based/filter_policy_internal.h
@@ -90,6 +90,19 @@ class FilterBitsBuilder {
   // <= the specified number of bytes. Callers (including RocksDB) should
   // only use this result for optimizing performance and not as a guarantee.
   virtual size_t ApproximateNumEntries(size_t bytes) = 0;
+
+  // Calculate number of bytes needed for a new filter, including
+  // metadata. Passing the result to ApproximateNumEntries should
+  // (ideally, usually) return >= the num_entry passed in.
+  // When optimize_filters_for_memory is enabled, this function
+  // is not authoritative but represents a target size that should
+  // be close to the average size.
+  virtual size_t CalculateSpace(size_t num_entries) = 0;
+
+  // Returns an estimate of the FP rate of the returned filter if
+  // `num_entries` keys are added and the filter returned by Finish
+  // is `bytes` bytes.
+  virtual double EstimatedFpRate(size_t num_entries, size_t bytes) = 0;
 };
 
 // A class that checks if a key can be in filter
@@ -109,24 +122,6 @@ class FilterBitsReader {
   }
 };
 
-// Exposes any extra information needed for testing built-in
-// FilterBitsBuilders
-class BuiltinFilterBitsBuilder : public FilterBitsBuilder {
- public:
-  // Calculate number of bytes needed for a new filter, including
-  // metadata. Passing the result to ApproximateNumEntries should
-  // (ideally, usually) return >= the num_entry passed in.
-  // When optimize_filters_for_memory is enabled, this function
-  // is not authoritative but represents a target size that should
-  // be close to the average size.
-  virtual size_t CalculateSpace(size_t num_entries) = 0;
-
-  // Returns an estimate of the FP rate of the returned filter if
-  // `num_entries` keys are added and the filter returned by Finish
-  // is `bytes` bytes.
-  virtual double EstimatedFpRate(size_t num_entries, size_t bytes) = 0;
-};
-
 // Base class for RocksDB built-in filter reader with
 // extra useful functionalities for inernal.
 class BuiltinFilterBitsReader : public FilterBitsReader {
diff --git a/table/block_based/full_filter_block.cc b/table/block_based/full_filter_block.cc
index f0f1a958ae15..c7d069f3e524 100644
--- a/table/block_based/full_filter_block.cc
+++ b/table/block_based/full_filter_block.cc
@@ -47,9 +47,7 @@ void FullFilterBlockBuilder::UpdateFilterSizeEstimate(
     return;
   }
 
-  // Calculate the estimated filter size in bytes.
-  // Estimate ~15 bits per key for bloom filters.
-  size_t filter_size = ((entries_added * 15) + 7) / 8;
+  size_t filter_size = filter_bits_builder_->CalculateSpace(entries_added);
 
   // Reserve filter space for next data block ~2x the average.
   size_t buffer_size = 0;
diff --git a/table/block_based/full_filter_block_test.cc b/table/block_based/full_filter_block_test.cc
index f90492d8583b..1ce6844741eb 100644
--- a/table/block_based/full_filter_block_test.cc
+++ b/table/block_based/full_filter_block_test.cc
@@ -52,6 +52,13 @@ class TestFilterBitsBuilder : public FilterBitsBuilder {
 
   size_t ApproximateNumEntries(size_t bytes) override { return bytes / 4; }
 
+  size_t CalculateSpace(size_t num_entries) override { return num_entries * 4; }
+
+  double EstimatedFpRate(size_t /* num_entries */,
+                         size_t /* bytes */) override {
+    return 0.0;
+  }
+
  private:
   std::vector<uint32_t> hash_entries_;
 };
@@ -229,6 +236,14 @@ class CountUniqueFilterBitsBuilderWrapper : public FilterBitsBuilder {
     return b_->ApproximateNumEntries(bytes);
   }
 
+  size_t CalculateSpace(size_t num_entries) override {
+    return b_->CalculateSpace(num_entries);
+  }
+
+  double EstimatedFpRate(size_t num_entries, size_t bytes) override {
+    return b_->EstimatedFpRate(num_entries, bytes);
+  }
+
   size_t CountUnique() { return uniq_.size(); }
 };
 
diff --git a/util/bloom_test.cc b/util/bloom_test.cc
index f3dbe6373532..d4d9fb87e5d7 100644
--- a/util/bloom_test.cc
+++ b/util/bloom_test.cc
@@ -86,10 +86,7 @@ class FullBloomTest : public testing::TestWithParam<std::string> {
     ResetPolicy();
   }
 
-  BuiltinFilterBitsBuilder* GetBuiltinFilterBitsBuilder() {
-    // Throws on bad cast
-    return dynamic_cast<BuiltinFilterBitsBuilder*>(bits_builder_.get());
-  }
+  FilterBitsBuilder* GetFilterBitsBuilder() { return bits_builder_.get(); }
 
   const BloomLikeFilterPolicy* GetBloomLikeFilterPolicy() {
     // Throws on bad cast
@@ -239,7 +236,7 @@ TEST_P(FullBloomTest, FilterSize) {
     EXPECT_EQ(bpk.second, bfp->GetMillibitsPerKey());
     EXPECT_EQ((bpk.second + 500) / 1000, bfp->GetWholeBitsPerKey());
 
-    auto bits_builder = GetBuiltinFilterBitsBuilder();
+    auto bits_builder = GetFilterBitsBuilder();
     if (bpk.second == 0) {
       ASSERT_EQ(bits_builder, nullptr);
       continue;
diff --git a/util/filter_bench.cc b/util/filter_bench.cc
index c94d58194c39..0afe8c2fd6bf 100644
--- a/util/filter_bench.cc
+++ b/util/filter_bench.cc
@@ -145,7 +145,6 @@ using ROCKSDB_NAMESPACE::BlockContents;
 using ROCKSDB_NAMESPACE::BloomFilterPolicy;
 using ROCKSDB_NAMESPACE::BloomHash;
 using ROCKSDB_NAMESPACE::BloomLikeFilterPolicy;
-using ROCKSDB_NAMESPACE::BuiltinFilterBitsBuilder;
 using ROCKSDB_NAMESPACE::CachableEntry;
 using ROCKSDB_NAMESPACE::Cache;
 using ROCKSDB_NAMESPACE::CacheEntryRole;
@@ -153,6 +152,7 @@ using ROCKSDB_NAMESPACE::CacheEntryRoleOptions;
 using ROCKSDB_NAMESPACE::EncodeFixed32;
 using ROCKSDB_NAMESPACE::Env;
 using ROCKSDB_NAMESPACE::FastRange32;
+using ROCKSDB_NAMESPACE::FilterBitsBuilder;
 using ROCKSDB_NAMESPACE::FilterBitsReader;
 using ROCKSDB_NAMESPACE::FilterBuildingContext;
 using ROCKSDB_NAMESPACE::FilterPolicy;
@@ -393,7 +393,7 @@ void FilterBench::Go() {
 
   std::cout << "Building..." << std::endl;
 
-  std::unique_ptr<BuiltinFilterBitsBuilder> builder;
+  std::unique_ptr<FilterBitsBuilder> builder;
 
   size_t total_memory_used = 0;
   size_t total_size = 0;
@@ -440,8 +440,7 @@ void FilterBench::Go() {
       info.filter_ = info.plain_table_bloom_->GetRawData();
     } else {
       if (!builder) {
-        builder.reset(
-            static_cast_with_check<BuiltinFilterBitsBuilder>(GetBuilder()));
+        builder.reset(GetBuilder());
       }
       for (uint32_t i = 0; i < keys_to_add; ++i) {
         builder->AddKey(kms_[0].Get(filter_id, i));

From b33c547b069c87eed1558215bbb1c10441490325 Mon Sep 17 00:00:00 2001
From: Xingbo Wang <xbw@fb.com>
Date: Mon, 10 Nov 2025 15:20:50 -0800
Subject: [PATCH 372/500] Add trivial move support in CompactFiles API (#14112)

Summary:
Support trivial move in CompactFiles API, which is not supported previously.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14112

Test Plan: Unit test

Reviewed By: cbi42

Differential Revision: D86546150

Pulled By: xingbowang

fbshipit-source-id: 08a3ae9a055f3d3d41711403b1695f44977e6ea8
---
 db/compact_files_test.cc                      | 226 ++++++++++++++++++
 db/db_impl/db_impl.h                          |   8 +
 db/db_impl/db_impl_compaction_flush.cc        | 153 +++++++++---
 include/rocksdb/options.h                     |   8 +-
 ...rivial_move_support_in_CompactFiles_API.md |   1 +
 5 files changed, 356 insertions(+), 40 deletions(-)
 create mode 100644 unreleased_history/new_features/Trivial_move_support_in_CompactFiles_API.md

diff --git a/db/compact_files_test.cc b/db/compact_files_test.cc
index 83bec82b94c7..b1331d1ccff9 100644
--- a/db/compact_files_test.cc
+++ b/db/compact_files_test.cc
@@ -534,6 +534,232 @@ TEST_F(CompactFilesTest, GetCompactionJobInfo) {
   delete db;
 }
 
+// Helper function to generate zero-padded keys
+// e.g., MakeKey("a", 5) -> "a05", MakeKey("b", 42) -> "b42"
+static std::string MakeKey(const std::string& prefix, int index) {
+  return prefix + (index < 10 ? "0" : "") + std::to_string(index);
+}
+
+TEST_F(CompactFilesTest, TrivialMoveNonOverlappingFiles) {
+  Options options;
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  options.compression = kNoCompression;
+  options.level_compaction_dynamic_level_bytes = false;
+
+  DB* db = nullptr;
+  ASSERT_OK(DestroyDB(db_name_, options));
+  Status s = DB::Open(options, db_name_, &db);
+  ASSERT_OK(s);
+  ASSERT_NE(db, nullptr);
+
+  // Create 3 non-overlapping files in L0
+  // File 1: keys [a00-a99]
+  for (int i = 0; i < 100; i++) {
+    std::string key = MakeKey("a", i);
+    ASSERT_OK(db->Put(WriteOptions(), key, "value_" + key));
+  }
+  ASSERT_OK(db->Flush(FlushOptions()));
+
+  // File 2: keys [b00-b99]
+  for (int i = 0; i < 100; i++) {
+    std::string key = MakeKey("b", i);
+    ASSERT_OK(db->Put(WriteOptions(), key, "value_" + key));
+  }
+  ASSERT_OK(db->Flush(FlushOptions()));
+
+  // File 3: keys [c00-c99]
+  for (int i = 0; i < 100; i++) {
+    std::string key = MakeKey("c", i);
+    ASSERT_OK(db->Put(WriteOptions(), key, "value_" + key));
+  }
+  ASSERT_OK(db->Flush(FlushOptions()));
+
+  // Verify files are in L0
+  ColumnFamilyMetaData meta;
+  db->GetColumnFamilyMetaData(&meta);
+  ASSERT_EQ(meta.levels[0].files.size(), 3);
+  ASSERT_EQ(meta.levels[1].files.size(), 0);
+
+  // Get L0 files
+  std::vector<std::string> l0_files;
+  for (const auto& file : meta.levels[0].files) {
+    l0_files.push_back(file.db_path + "/" + file.name);
+  }
+
+  CompactionOptions compact_option;
+  compact_option.allow_trivial_move = true;
+  // Compact all L0 files to L1 (non-overlapping in L1)
+  ASSERT_OK(db->CompactFiles(compact_option, l0_files, 1));
+
+  // Verify files are now in L1
+  db->GetColumnFamilyMetaData(&meta);
+  ASSERT_EQ(meta.levels[0].files.size(), 0);
+  ASSERT_EQ(meta.levels[1].files.size(), 3);
+
+  // Get the first file from L1 (should be the one with keys a00-a99)
+  std::string l1_file_to_move;
+  std::vector<std::string> l1_files_to_move_later;
+  uint64_t l1_file_number = 0;
+  for (const auto& file : meta.levels[1].files) {
+    if (file.smallestkey[0] == 'a') {
+      l1_file_to_move = file.db_path + "/" + file.name;
+      l1_file_number = file.file_number;
+    } else {
+      l1_files_to_move_later.push_back(file.db_path + "/" + file.name);
+    }
+  }
+  ASSERT_FALSE(l1_file_to_move.empty());
+
+  // Set up sync point to verify trivial move path is taken
+  bool trivial_move_executed = false;
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::CompactFilesImpl:TrivialMove",
+      [&](void* /*arg*/) { trivial_move_executed = true; });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  // Move the file from L1 to L6 - this should be a trivial move
+  // because the file doesn't overlap with anything in L6
+  std::vector<std::string> files_to_move = {l1_file_to_move};
+  ASSERT_OK(db->CompactFiles(compact_option, files_to_move, 6));
+
+  // Verify trivial move was executed
+  ASSERT_TRUE(trivial_move_executed);
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  // Verify the file is now in L6
+  db->GetColumnFamilyMetaData(&meta);
+  ASSERT_EQ(meta.levels[1].files.size(), 2);  // Two files remain in L1
+  ASSERT_EQ(meta.levels[6].files.size(), 1);  // One file in L6
+
+  // Verify it's the correct file in L6
+  bool found_file_in_l6 = false;
+  for (const auto& file : meta.levels[6].files) {
+    if (file.file_number == l1_file_number) {
+      found_file_in_l6 = true;
+      // Verify key range hasn't changed
+      ASSERT_EQ(file.smallestkey[0], 'a');
+      ASSERT_EQ(file.largestkey[0], 'a');
+      break;
+    }
+  }
+  ASSERT_TRUE(found_file_in_l6);
+
+  // Move the other 2 files from L1 to L6, with allow_trivial_move set to false.
+  // This will trigger a normal compaction, so the 2 files will be compacted
+  // into a single file in L6.
+  ASSERT_OK(db->CompactFiles(CompactionOptions(), l1_files_to_move_later, 6));
+
+  // Verify files in L6
+  db->GetColumnFamilyMetaData(&meta);
+  ASSERT_EQ(meta.levels[1].files.size(), 0);  // Zero files remain in L1
+  ASSERT_EQ(meta.levels[6].files.size(), 2);  // Two file in L6
+
+  // Verify data integrity - all keys should still be readable
+  for (int i = 0; i < 100; i++) {
+    std::string key = MakeKey("a", i);
+    std::string value;
+    ASSERT_OK(db->Get(ReadOptions(), key, &value));
+    ASSERT_EQ(value, "value_" + key);
+  }
+
+  delete db;
+}
+
+TEST_F(CompactFilesTest, TrivialMoveBlockedByOverlap) {
+  Options options;
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  options.compression = kNoCompression;
+  options.level_compaction_dynamic_level_bytes = false;
+  options.num_levels = 7;
+
+  DB* db = nullptr;
+  ASSERT_OK(DestroyDB(db_name_, options));
+  Status s = DB::Open(options, db_name_, &db);
+  ASSERT_OK(s);
+  ASSERT_NE(db, nullptr);
+
+  // Create a file in L6 with keys [m00-m99] (wide range)
+  for (int i = 0; i < 100; i++) {
+    std::string key = MakeKey("m", i);
+    ASSERT_OK(db->Put(WriteOptions(), key, "value_" + key));
+  }
+  ASSERT_OK(db->Flush(FlushOptions()));
+
+  // Get L0 file
+  ColumnFamilyMetaData meta;
+  db->GetColumnFamilyMetaData(&meta);
+  std::vector<std::string> l0_files;
+  for (const auto& file : meta.levels[0].files) {
+    l0_files.push_back(file.db_path + "/" + file.name);
+  }
+
+  CompactionOptions compact_option;
+  compact_option.allow_trivial_move = true;
+
+  // Move to L6
+  ASSERT_OK(db->CompactFiles(compact_option, l0_files, 6));
+
+  // Now create a file in L1 with overlapping keys [m50-m60]
+  for (int i = 50; i <= 60; i++) {
+    std::string key = "m" + std::to_string(i);
+    ASSERT_OK(db->Put(WriteOptions(), key, "updated_value_" + key));
+  }
+  ASSERT_OK(db->Flush(FlushOptions()));
+
+  // Get the L0 file
+  db->GetColumnFamilyMetaData(&meta);
+  std::vector<std::string> l0_files_2;
+  for (const auto& file : meta.levels[0].files) {
+    l0_files_2.push_back(file.db_path + "/" + file.name);
+  }
+
+  // Move to L1
+  ASSERT_OK(db->CompactFiles(compact_option, l0_files_2, 1));
+
+  // Get the L1 file
+  db->GetColumnFamilyMetaData(&meta);
+  ASSERT_EQ(meta.levels[1].files.size(), 1);
+  std::string l1_file =
+      meta.levels[1].files[0].db_path + "/" + meta.levels[1].files[0].name;
+
+  // Set up sync point to verify full compaction path is taken
+  bool trivial_move_executed = false;
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::CompactFilesImpl:TrivialMove",
+      [&](void* /*arg*/) { trivial_move_executed = true; });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  // Try to move from L1 to L6 - this should NOT be a trivial move
+  // because the file overlaps with the existing file in L6
+  ASSERT_OK(db->CompactFiles(compact_option, {l1_file}, 6));
+
+  // Verify trivial move was NOT executed (full compaction happened)
+  ASSERT_FALSE(trivial_move_executed);
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  // Verify the result - should have merged data in L6
+  db->GetColumnFamilyMetaData(&meta);
+  ASSERT_EQ(meta.levels[1].files.size(), 0);  // L1 should be empty
+  // L6 should have the merged file (may be 1 file if merged, or 2 if not)
+  ASSERT_GE(meta.levels[6].files.size(), 1);
+
+  // Verify updated values are present
+  for (int i = 50; i <= 60; i++) {
+    std::string key = "m" + std::to_string(i);
+    std::string value;
+    ASSERT_OK(db->Get(ReadOptions(), key, &value));
+    ASSERT_EQ(value, "updated_value_" + key);
+  }
+
+  delete db;
+}
+
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h
index da1879688e56..c3d045725242 100644
--- a/db/db_impl/db_impl.h
+++ b/db/db_impl/db_impl.h
@@ -2396,6 +2396,14 @@ class DBImpl : public DB {
                           JobContext* job_context, LogBuffer* log_buffer,
                           CompactionJobInfo* compaction_job_info);
 
+  // Helper function to perform trivial move by updating manifest metadata
+  // without rewriting data files. This is called when IsTrivialMove() is true.
+  // REQUIRES: mutex held
+  // Returns: Status of the trivial move operation
+  Status PerformTrivialMove(Compaction& c, LogBuffer* log_buffer,
+                            bool& compaction_released, size_t& moved_files,
+                            size_t& moved_bytes);
+
   // REQUIRES: mutex unlocked
   void TrackOrUntrackFiles(const std::vector<std::string>& existing_data_files,
                            bool track);
diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc
index 5e883874715e..9f4d08e938ee 100644
--- a/db/db_impl/db_impl_compaction_flush.cc
+++ b/db/db_impl/db_impl_compaction_flush.cc
@@ -1424,6 +1424,56 @@ Status DBImpl::CompactFiles(const CompactionOptions& compact_options,
   return s;
 }
 
+Status DBImpl::PerformTrivialMove(Compaction& c, LogBuffer* log_buffer,
+                                  bool& compaction_released,
+                                  size_t& moved_files, size_t& moved_bytes) {
+  mutex_.AssertHeld();
+
+  ROCKS_LOG_BUFFER(log_buffer, "[%s] Moving %d files to level-%d\n",
+                   c.column_family_data()->GetName().c_str(),
+                   static_cast<int>(c.num_input_files(0)), c.output_level());
+
+  // Move files to the output level by editing the manifest
+  for (unsigned int l = 0; l < c.num_input_levels(); l++) {
+    if (c.level(l) == c.output_level()) {
+      continue;
+    }
+    for (size_t i = 0; i < c.num_input_files(l); i++) {
+      FileMetaData* f = c.input(l, i);
+      c.edit()->DeleteFile(c.level(l), f->fd.GetNumber());
+      c.edit()->AddFile(c.output_level(), f->fd.GetNumber(), f->fd.GetPathId(),
+                        f->fd.GetFileSize(), f->smallest, f->largest,
+                        f->fd.smallest_seqno, f->fd.largest_seqno,
+                        f->marked_for_compaction, f->temperature,
+                        f->oldest_blob_file_number, f->oldest_ancester_time,
+                        f->file_creation_time, f->epoch_number,
+                        f->file_checksum, f->file_checksum_func_name,
+                        f->unique_id, f->compensated_range_deletion_size,
+                        f->tail_size, f->user_defined_timestamps_persisted);
+      moved_bytes += static_cast<size_t>(c.input(l, i)->fd.GetFileSize());
+      ROCKS_LOG_BUFFER(
+          log_buffer, "[%s] Moved #%" PRIu64 " to level-%d %" PRIu64 " bytes\n",
+          c.column_family_data()->GetName().c_str(), f->fd.GetNumber(),
+          c.output_level(), f->fd.GetFileSize());
+    }
+    moved_files += c.num_input_files(l);
+  }
+
+  // Install the new version
+  const ReadOptions read_options(Env::IOActivity::kCompaction);
+  const WriteOptions write_options(Env::IOActivity::kCompaction);
+  Status status = versions_->LogAndApply(
+      c.column_family_data(), read_options, write_options, c.edit(), &mutex_,
+      directories_.GetDbDir(), /*new_descriptor_log=*/false,
+      /*column_family_options=*/nullptr,
+      [&c, &compaction_released](const Status& s) {
+        c.ReleaseCompactionFiles(s);
+        compaction_released = true;
+      });
+
+  return status;
+}
+
 Status DBImpl::CompactFilesImpl(
     const CompactionOptions& compact_options, ColumnFamilyData* cfd,
     Version* version, const std::vector<std::string>& input_file_names,
@@ -1511,6 +1561,63 @@ Status DBImpl::CompactFilesImpl(
   // deletion compaction currently not allowed in CompactFiles.
   assert(!c->deletion_compaction());
 
+  // Check if this can be a trivial move (metadata-only update)
+  // Similar to the logic in DBImpl::BackgroundCompaction
+  // Note: We disable trivial move when compaction_service is present because
+  // the service expects all compactions to go through CompactionJob for
+  // tracking
+  bool is_trivial_move = compact_options.allow_trivial_move &&
+                         c->IsTrivialMove() &&
+                         immutable_db_options().compaction_service == nullptr;
+
+  if (is_trivial_move) {
+    // Perform trivial move: just update manifest without rewriting data
+    TEST_SYNC_POINT("DBImpl::CompactFilesImpl:TrivialMove");
+
+    bool compaction_released = false;
+    size_t moved_files = 0;
+    size_t moved_bytes = 0;
+    Status status = PerformTrivialMove(
+        *c.get(), log_buffer, compaction_released, moved_files, moved_bytes);
+
+    if (status.ok()) {
+      InstallSuperVersionAndScheduleWork(
+          c->column_family_data(), job_context->superversion_contexts.data());
+
+      // Populate output file names for trivial move
+      if (output_file_names != nullptr) {
+        for (const auto& newf : c->edit()->GetNewFiles()) {
+          output_file_names->push_back(TableFileName(
+              c->immutable_options().cf_paths, newf.second.fd.GetNumber(),
+              newf.second.fd.GetPathId()));
+        }
+      }
+
+      ROCKS_LOG_BUFFER(
+          log_buffer,
+          "[%s] Trivial move succeeded for %zu files, %zu bytes total\n",
+          c->column_family_data()->GetName().c_str(), moved_files, moved_bytes);
+    } else {
+      if (!compaction_released) {
+        c->ReleaseCompactionFiles(status);
+      }
+      ROCKS_LOG_BUFFER(log_buffer, "[%s] Trivial move failed: %s\n",
+                       c->column_family_data()->GetName().c_str(),
+                       status.ToString().c_str());
+      error_handler_.SetBGError(status, BackgroundErrorReason::kCompaction);
+    }
+
+    c.reset();
+    bg_compaction_scheduled_--;
+    if (bg_compaction_scheduled_ == 0) {
+      bg_cv_.SignalAll();
+    }
+    MaybeScheduleFlushOrCompaction();
+
+    return status;
+  }
+
+  // Not a trivial move, proceed with full compaction
   InitSnapshotContext(job_context);
 
   std::unique_ptr<std::list<uint64_t>::iterator> pending_outputs_inserted_elem(
@@ -4074,35 +4181,6 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
     NotifyOnCompactionBegin(c->column_family_data(), c.get(), status,
                             compaction_job_stats, job_context->job_id);
 
-    // Move files to next level
-    int32_t moved_files = 0;
-    int64_t moved_bytes = 0;
-    for (unsigned int l = 0; l < c->num_input_levels(); l++) {
-      if (c->level(l) == c->output_level()) {
-        continue;
-      }
-      for (size_t i = 0; i < c->num_input_files(l); i++) {
-        FileMetaData* f = c->input(l, i);
-        c->edit()->DeleteFile(c->level(l), f->fd.GetNumber());
-        c->edit()->AddFile(
-            c->output_level(), f->fd.GetNumber(), f->fd.GetPathId(),
-            f->fd.GetFileSize(), f->smallest, f->largest, f->fd.smallest_seqno,
-            f->fd.largest_seqno, f->marked_for_compaction, f->temperature,
-            f->oldest_blob_file_number, f->oldest_ancester_time,
-            f->file_creation_time, f->epoch_number, f->file_checksum,
-            f->file_checksum_func_name, f->unique_id,
-            f->compensated_range_deletion_size, f->tail_size,
-            f->user_defined_timestamps_persisted);
-
-        ROCKS_LOG_BUFFER(
-            log_buffer,
-            "[%s] Moving #%" PRIu64 " to level-%d %" PRIu64 " bytes\n",
-            c->column_family_data()->GetName().c_str(), f->fd.GetNumber(),
-            c->output_level(), f->fd.GetFileSize());
-        ++moved_files;
-        moved_bytes += f->fd.GetFileSize();
-      }
-    }
     if (c->compaction_reason() == CompactionReason::kLevelMaxLevelSize &&
         c->immutable_options().compaction_pri == kRoundRobin) {
       int start_level = c->start_level();
@@ -4113,14 +4191,12 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
             vstorage->GetNextCompactCursor(start_level, c->num_input_files(0)));
       }
     }
-    status = versions_->LogAndApply(
-        c->column_family_data(), read_options, write_options, c->edit(),
-        &mutex_, directories_.GetDbDir(),
-        /*new_descriptor_log=*/false, /*column_family_options=*/nullptr,
-        [&c, &compaction_released](const Status& s) {
-          c->ReleaseCompactionFiles(s);
-          compaction_released = true;
-        });
+
+    // Perform the trivial move
+    size_t moved_files = 0;
+    size_t moved_bytes = 0;
+    status = PerformTrivialMove(*c.get(), log_buffer, compaction_released,
+                                moved_files, moved_bytes);
     io_s = versions_->io_status();
     InstallSuperVersionAndScheduleWork(
         c->column_family_data(), job_context->superversion_contexts.data());
@@ -4135,8 +4211,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
           << "total_files_size" << moved_bytes;
     }
     ROCKS_LOG_BUFFER(
-        log_buffer,
-        "[%s] Moved #%d files to level-%d %" PRIu64 " bytes %s: %s\n",
+        log_buffer, "[%s] Moved #%d files to level-%zu %zu bytes %s: %s\n",
         c->column_family_data()->GetName().c_str(), moved_files,
         c->output_level(), moved_bytes, status.ToString().c_str(),
         c->column_family_data()->current()->storage_info()->LevelSummary(&tmp));
diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index 96342647d432..d8acfe8f7175 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -2438,11 +2438,17 @@ struct CompactionOptions {
   // "default_write_temperature"
   Temperature output_temperature_override = Temperature::kUnknown;
 
+  // Option to optimize the manual compaction by enabling trivial move for non
+  // overlapping files.
+  // Default: false
+  bool allow_trivial_move;
+
   CompactionOptions()
       : compression(kDisableCompressionOption),
         output_file_size_limit(std::numeric_limits<uint64_t>::max()),
         max_subcompactions(0),
-        canceled(nullptr) {}
+        canceled(nullptr),
+        allow_trivial_move(false) {}
 };
 
 // For level based compaction, we can configure if we want to skip/force
diff --git a/unreleased_history/new_features/Trivial_move_support_in_CompactFiles_API.md b/unreleased_history/new_features/Trivial_move_support_in_CompactFiles_API.md
new file mode 100644
index 000000000000..4c52fc3abf5d
--- /dev/null
+++ b/unreleased_history/new_features/Trivial_move_support_in_CompactFiles_API.md
@@ -0,0 +1 @@
+Add a new option allow_trivial_move in CompactionOptions to allow CompactFiles to perform trivial move if possible. By default the flag of allow_trivial_move is false, so it preserve the original behavior.

From 9fbb68be173bf24dfc97e482150bc735a0ca6ce8 Mon Sep 17 00:00:00 2001
From: Ranjan Banerjee <rbanerjee2@meta.com>
Date: Mon, 10 Nov 2025 17:13:34 -0800
Subject: [PATCH 373/500] Api to get SST file with key ranges for a particular
 level and key range (startKey, EndKey)rocksdb [Internal version] (#14009)

Summary:
There are instances where  an application might be interested in knowing the distribution in SST files for a key range in a particular level.

This implementation creates an overloaded GetColumnFamilyMetaData api where  (startKey, EndKey) can be passed along with level information to filter the necessary sst files along with the keyranges for each sst file

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14009

Reviewed By: anand1976

Differential Revision: D83389707

fbshipit-source-id: 6df1dc1f9233efe9000b03cc1831b3c618cbcef3
---
 db/db_impl/db_impl.cc                    |  13 ++
 db/db_impl/db_impl.h                     |   5 +
 db/db_test.cc                            | 245 +++++++++++++++++++++++
 db/version_set.cc                        |  73 +++++++
 db/version_set.h                         |   4 +
 include/rocksdb/db.h                     |  13 ++
 include/rocksdb/metadata.h               |  14 ++
 include/rocksdb/utilities/stackable_db.h |   6 +
 8 files changed, 373 insertions(+)

diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc
index 5676bb2cd588..2aa7be859081 100644
--- a/db/db_impl/db_impl.cc
+++ b/db/db_impl/db_impl.cc
@@ -5047,6 +5047,19 @@ void DBImpl::GetColumnFamilyMetaData(ColumnFamilyHandle* column_family,
   }
 }
 
+void DBImpl::GetColumnFamilyMetaData(
+    ColumnFamilyHandle* column_family,
+    const GetColumnFamilyMetaDataOptions& options,
+    ColumnFamilyMetaData* metadata) {
+  assert(column_family);
+  auto* cfd =
+      static_cast_with_check<ColumnFamilyHandleImpl>(column_family)->cfd();
+  {
+    InstrumentedMutexLock l(&mutex_);
+    cfd->current()->GetColumnFamilyMetaData(options, metadata);
+  }
+}
+
 void DBImpl::GetAllColumnFamilyMetaData(
     std::vector<ColumnFamilyMetaData>* metadata) {
   InstrumentedMutexLock l(&mutex_);
diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h
index c3d045725242..9168c94f6810 100644
--- a/db/db_impl/db_impl.h
+++ b/db/db_impl/db_impl.h
@@ -573,6 +573,11 @@ class DBImpl : public DB {
   void GetColumnFamilyMetaData(ColumnFamilyHandle* column_family,
                                ColumnFamilyMetaData* metadata) override;
 
+  // Get column family metadata with filtering based on key range and level
+  void GetColumnFamilyMetaData(ColumnFamilyHandle* column_family,
+                               const GetColumnFamilyMetaDataOptions& options,
+                               ColumnFamilyMetaData* metadata) override;
+
   void GetAllColumnFamilyMetaData(
       std::vector<ColumnFamilyMetaData>* metadata) override;
 
diff --git a/db/db_test.cc b/db/db_test.cc
index ab8757291834..7456679a152a 100644
--- a/db/db_test.cc
+++ b/db/db_test.cc
@@ -1492,6 +1492,246 @@ TEST_F(DBTest, MetaDataTest) {
   CheckLiveFilesMeta(live_file_meta, files_by_level);
 }
 
+TEST_F(DBTest, GetColumnFamilyMetaDataWithKeyRangeAndLevel) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+
+  int64_t temp_time = 0;
+  ASSERT_OK(options.env->GetCurrentTime(&temp_time));
+
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+  int key_index = 0;
+  for (int i = 0; i < 100; ++i) {
+    // Add a single blob reference to each file
+    std::string blob_index;
+    BlobIndex::EncodeBlob(&blob_index, /* blob_file_number */ i + 1000,
+                          /* offset */ 1234, /* size */ 5678, kNoCompression);
+
+    WriteBatch batch;
+    ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 0, Key(key_index),
+                                               blob_index));
+    ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
+
+    ++key_index;
+
+    // Fill up the rest of the file with random values.
+    GenerateNewFile(&rnd, &key_index, /* nowait */ true);
+
+    ASSERT_OK(Flush());
+  }
+
+  std::vector<std::vector<FileMetaData>> files_by_level;
+  dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &files_by_level);
+
+  ASSERT_OK(options.env->GetCurrentTime(&temp_time));
+
+  ColumnFamilyMetaData cf_meta;
+  // Keys in the SST files are distributed
+  // (key000000, key000100) ->File 1
+  // (key000101, key000201) -> File 2
+  // (key000202, key000302) -> File 3
+  // (key009999, key010099) -> File 100
+
+  // With keySlice (key000050, key000150) => should only pick 2 files(instead of
+  // default 100 that is in the level)
+  auto startKey = Slice("key000050");
+  auto endKey = Slice("key000150");
+  GetColumnFamilyMetaDataOptions cf_options(startKey, endKey, 0);
+  db_->GetColumnFamilyMetaData(cf_options, &cf_meta);
+  ASSERT_EQ(cf_meta.levels.size(), 1);
+  const auto& level_meta_from_cf = cf_meta.levels[0];
+  ASSERT_EQ(level_meta_from_cf.files.size(), 2);
+  ASSERT_LT(level_meta_from_cf.files[1].smallestkey,
+            std::string(startKey.data()));
+  ASSERT_GT(level_meta_from_cf.files[0].largestkey, std::string(endKey.data()));
+
+  GetColumnFamilyMetaDataOptions cf_option_default;
+  db_->GetColumnFamilyMetaData(cf_option_default, &cf_meta);
+  ASSERT_EQ(cf_meta.levels.size(), 1);
+  ASSERT_EQ(cf_meta.levels[0].files.size(), 100);
+
+  // Test with start key valid and end key unbounded
+  // This should get all files from key000150 onwards (99 files)
+  auto startKeyUnbounded = Slice("key000150");
+  GetColumnFamilyMetaDataOptions cf_options_unbounded_end(startKeyUnbounded,
+                                                          OptSlice(), 0);
+  db_->GetColumnFamilyMetaData(cf_options_unbounded_end, &cf_meta);
+  ASSERT_EQ(cf_meta.levels.size(), 1);
+  ASSERT_EQ(cf_meta.levels[0].files.size(), 99);
+
+  // Test with end key valid and start key unbounded
+  // This should get all files from beginning to key000250 ( 3 files)
+  auto endKeyUnbounded = Slice("key000250");
+  GetColumnFamilyMetaDataOptions cf_options_unbounded_start(OptSlice(),
+                                                            endKeyUnbounded, 0);
+  db_->GetColumnFamilyMetaData(cf_options_unbounded_start, &cf_meta);
+  ASSERT_EQ(cf_meta.levels.size(), 1);
+  ASSERT_EQ(cf_meta.levels[0].files.size(), 3);
+}
+
+TEST_F(DBTest, GetColumnFamilyMetaDataBottommostLevel) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  options.num_levels = 7;
+
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+  int key_index = 0;
+
+  for (int i = 0; i < 100; ++i) {
+    GenerateNewFile(&rnd, &key_index, /* nowait */ true);
+    ASSERT_OK(Flush());
+  }
+
+  CompactRangeOptions compact_options;
+  compact_options.bottommost_level_compaction =
+      BottommostLevelCompaction::kForce;
+  compact_options.change_level = true;
+  compact_options.target_level = 6;
+  ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+
+  // Nothing on Level 0 after compaction
+  ColumnFamilyMetaData cf_meta;
+  GetColumnFamilyMetaDataOptions cf_options_0(OptSlice(), OptSlice(), 0);
+  db_->GetColumnFamilyMetaData(cf_options_0, &cf_meta);
+
+  ASSERT_EQ(cf_meta.levels.size(), 0);
+  ASSERT_EQ(cf_meta.file_count, 0);
+
+  // Data should be in Level 6
+  GetColumnFamilyMetaDataOptions cf_options(OptSlice(), OptSlice(), 6);
+  db_->GetColumnFamilyMetaData(cf_options, &cf_meta);
+
+  ASSERT_EQ(cf_meta.levels.size(), 1);
+  ASSERT_EQ(cf_meta.levels[0].level, 6);
+  ASSERT_GT(cf_meta.levels[0].files.size(), 0);
+  size_t all_files = cf_meta.levels[0].files.size();
+
+  // Keys in the SST files are distributed across level 6
+  // Test with key range - should only return files within the range
+  auto startKey = Slice("key000050");
+  auto endKey = Slice("key000150");
+  GetColumnFamilyMetaDataOptions cf_options_range(startKey, endKey, 6);
+  db_->GetColumnFamilyMetaData(cf_options_range, &cf_meta);
+
+  ASSERT_EQ(cf_meta.levels.size(), 1);
+  ASSERT_EQ(cf_meta.levels[0].level, 6);
+  ASSERT_GT(cf_meta.levels[0].files.size(), 0);
+  size_t files_in_range = cf_meta.levels[0].files.size();
+
+  // Files in range should be less than or equal to all files
+  ASSERT_LE(files_in_range, all_files);
+}
+
+TEST_F(DBTest, GetColumnFamilyMetaDataMultipleLevels) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  options.num_levels = 7;
+
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+  int key_index = 0;
+
+  for (int i = 0; i < 50; ++i) {
+    GenerateNewFile(&rnd, &key_index, /* nowait */ true);
+    ASSERT_OK(Flush());
+  }
+
+  CompactRangeOptions compact_options;
+  compact_options.bottommost_level_compaction =
+      BottommostLevelCompaction::kForce;
+  compact_options.change_level = true;
+  compact_options.target_level = 6;
+  ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+
+  for (int i = 0; i < 30; ++i) {
+    GenerateNewFile(&rnd, &key_index, /* nowait */ true);
+    ASSERT_OK(Flush());
+  }
+
+  // First verify both levels have files without key range filter
+  ColumnFamilyMetaData cf_meta_all_no_range;
+  GetColumnFamilyMetaDataOptions cf_options_all_no_range;
+  db_->GetColumnFamilyMetaData(cf_options_all_no_range, &cf_meta_all_no_range);
+
+  bool has_level_0 = false;
+  bool has_level_6 = false;
+  for (const auto& level : cf_meta_all_no_range.levels) {
+    if (level.level == 0 && level.files.size() > 0) {
+      has_level_0 = true;
+    }
+    if (level.level == 6 && level.files.size() > 0) {
+      has_level_6 = true;
+    }
+  }
+
+  ASSERT_TRUE(has_level_0);
+  ASSERT_TRUE(has_level_6);
+
+  // Test querying bottommost level only with key range
+  // Use a range that should be in the first set of files (now in level 6)
+  auto startKey = Slice("key000050");
+  auto endKey = Slice("key000150");
+  ColumnFamilyMetaData cf_meta_bottommost;
+  GetColumnFamilyMetaDataOptions cf_options_bottommost(startKey, endKey, 6);
+  db_->GetColumnFamilyMetaData(cf_options_bottommost, &cf_meta_bottommost);
+
+  ASSERT_EQ(cf_meta_bottommost.levels.size(), 1);
+  ASSERT_EQ(cf_meta_bottommost.levels[0].level, 6);
+  ASSERT_GT(cf_meta_bottommost.levels[0].files.size(), 0);
+  size_t level_6_files_in_range = cf_meta_bottommost.levels[0].files.size();
+
+  // Test querying all levels with same key range
+  ColumnFamilyMetaData cf_meta_all;
+  GetColumnFamilyMetaDataOptions cf_options_all(startKey, endKey);
+  db_->GetColumnFamilyMetaData(cf_options_all, &cf_meta_all);
+
+  size_t level_6_files_in_range_from_all = 0;
+  for (const auto& level : cf_meta_all.levels) {
+    if (level.level == 6) {
+      level_6_files_in_range_from_all = level.files.size();
+    }
+  }
+
+  ASSERT_GT(level_6_files_in_range_from_all, 0);
+  ASSERT_EQ(level_6_files_in_range, level_6_files_in_range_from_all);
+}
+
+TEST_F(DBTest, GetColumnFamilyMetaDataEmptyDB) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.num_levels = 7;
+
+  DestroyAndReopen(options);
+
+  // Test on empty database
+  ColumnFamilyMetaData cf_meta_empty_db;
+  GetColumnFamilyMetaDataOptions cf_options_empty_db;
+  db_->GetColumnFamilyMetaData(cf_options_empty_db, &cf_meta_empty_db);
+
+  ASSERT_EQ(cf_meta_empty_db.levels.size(), 0);
+  ASSERT_EQ(cf_meta_empty_db.file_count, 0);
+  ASSERT_EQ(cf_meta_empty_db.size, 0);
+
+  // Test on empty database with key range
+  auto startKey = Slice("key000050");
+  auto endKey = Slice("key000150");
+  ColumnFamilyMetaData cf_meta_empty_range;
+  GetColumnFamilyMetaDataOptions cf_options_empty_range(startKey, endKey);
+  db_->GetColumnFamilyMetaData(cf_options_empty_range, &cf_meta_empty_range);
+
+  ASSERT_EQ(cf_meta_empty_range.levels.size(), 0);
+  ASSERT_EQ(cf_meta_empty_range.file_count, 0);
+  ASSERT_EQ(cf_meta_empty_range.size, 0);
+}
+
 TEST_F(DBTest, AllMetaDataTest) {
   Options options = CurrentOptions();
   options.create_if_missing = true;
@@ -3535,6 +3775,11 @@ class ModelDB : public DB {
   void GetColumnFamilyMetaData(ColumnFamilyHandle* /*column_family*/,
                                ColumnFamilyMetaData* /*metadata*/) override {}
 
+  void GetColumnFamilyMetaData(
+      ColumnFamilyHandle* /*column_family*/,
+      const GetColumnFamilyMetaDataOptions& /*options*/,
+      ColumnFamilyMetaData* /*metadata*/) override {}
+
   Status GetDbIdentity(std::string& /*identity*/) const override {
     return Status::OK();
   }
diff --git a/db/version_set.cc b/db/version_set.cc
index e2dce0e8c80b..84fe95a9b535 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -2035,6 +2035,79 @@ void Version::GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta) {
   }
 }
 
+void Version::GetColumnFamilyMetaData(
+    const GetColumnFamilyMetaDataOptions& options,
+    ColumnFamilyMetaData* cf_meta) {
+  assert(cf_meta);
+  assert(cfd_);
+
+  cf_meta->name = cfd_->GetName();
+  cf_meta->size = 0;
+  cf_meta->file_count = 0;
+  cf_meta->levels.clear();
+  cf_meta->blob_file_size = 0;
+  cf_meta->blob_file_count = 0;
+  cf_meta->blob_files.clear();
+
+  const auto& ioptions = cfd_->ioptions();
+  auto* vstorage = storage_info();
+
+  int first_level = (options.level >= 0) ? options.level : 0;
+  int last_level =
+      (options.level >= 0) ? options.level + 1 : cfd_->NumberLevels();
+
+  InternalKey ikey_start, ikey_end;
+  const InternalKey* begin = nullptr;
+  const InternalKey* end = nullptr;
+
+  if (options.range.start.has_value()) {
+    ikey_start = InternalKey(options.range.start.value(), kMaxSequenceNumber,
+                             kValueTypeForSeek);
+    begin = &ikey_start;
+  }
+
+  if (options.range.limit.has_value()) {
+    ikey_end = InternalKey(options.range.limit.value(), kMaxSequenceNumber,
+                           kValueTypeForSeek);
+    end = &ikey_end;
+  }
+
+  for (int l = first_level; l < last_level; ++l) {
+    uint64_t level_size = 0;
+    std::vector<SstFileMetaData> files;
+    std::vector<FileMetaData*> overlapping_files;
+    vstorage->GetOverlappingInputs(l, begin, end, &overlapping_files);
+
+    for (const auto& file : overlapping_files) {
+      uint32_t path_id = file->fd.GetPathId();
+      const auto& file_path = (path_id < ioptions.cf_paths.size())
+                                  ? ioptions.cf_paths[path_id].path
+                                  : ioptions.cf_paths.back().path;
+      const uint64_t file_number = file->fd.GetNumber();
+      files.emplace_back(
+          MakeTableFileName("", file_number), file_number, file_path,
+          file->fd.GetFileSize(), file->fd.smallest_seqno,
+          file->fd.largest_seqno, file->smallest.user_key().ToString(),
+          file->largest.user_key().ToString(),
+          file->stats.num_reads_sampled.load(std::memory_order_relaxed),
+          file->being_compacted, file->temperature,
+          file->oldest_blob_file_number, file->TryGetOldestAncesterTime(),
+          file->TryGetFileCreationTime(), file->epoch_number,
+          file->file_checksum, file->file_checksum_func_name);
+      files.back().num_entries = file->num_entries;
+      files.back().num_deletions = file->num_deletions;
+      files.back().smallest = file->smallest.Encode().ToString();
+      files.back().largest = file->largest.Encode().ToString();
+      level_size += file->fd.GetFileSize();
+      cf_meta->file_count++;
+    }
+    if (!files.empty()) {
+      cf_meta->levels.emplace_back(l, level_size, std::move(files));
+      cf_meta->size += level_size;
+    }
+  }
+}
+
 uint64_t Version::GetSstFilesSize() {
   uint64_t sst_files_size = 0;
   for (int level = 0; level < storage_info_.num_levels_; level++) {
diff --git a/db/version_set.h b/db/version_set.h
index 85759f82f5a3..365d2838183d 100644
--- a/db/version_set.h
+++ b/db/version_set.h
@@ -1051,6 +1051,10 @@ class Version {
 
   void GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta);
 
+  // Get column family metadata with optional filtering by key range and level.
+  void GetColumnFamilyMetaData(const GetColumnFamilyMetaDataOptions& options,
+                               ColumnFamilyMetaData* cf_meta);
+
   void GetSstFilesBoundaryKeys(Slice* smallest_user_key,
                                Slice* largest_user_key);
 
diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h
index 0b9c506e5ce3..2ac8aa99c543 100644
--- a/include/rocksdb/db.h
+++ b/include/rocksdb/db.h
@@ -1929,11 +1929,24 @@ class DB {
   virtual void GetColumnFamilyMetaData(ColumnFamilyHandle* /*column_family*/,
                                        ColumnFamilyMetaData* /*metadata*/) {}
 
+  // Obtains the LSM-tree meta data of the specified column family of the DB
+  // with optional filtering by key range and level.
+  virtual void GetColumnFamilyMetaData(
+      ColumnFamilyHandle* /*column_family*/,
+      const GetColumnFamilyMetaDataOptions& /*options*/,
+      ColumnFamilyMetaData* /*metadata*/) {}
+
   // Get the metadata of the default column family.
   void GetColumnFamilyMetaData(ColumnFamilyMetaData* metadata) {
     GetColumnFamilyMetaData(DefaultColumnFamily(), metadata);
   }
 
+  // Get the metadata of the default column family with optional filtering.
+  void GetColumnFamilyMetaData(const GetColumnFamilyMetaDataOptions& options,
+                               ColumnFamilyMetaData* metadata) {
+    GetColumnFamilyMetaData(DefaultColumnFamily(), options, metadata);
+  }
+
   // Obtains the LSM-tree meta data of all column families of the DB, including
   // metadata for each live table (SST) file and each blob file in the DB.
   virtual void GetAllColumnFamilyMetaData(
diff --git a/include/rocksdb/metadata.h b/include/rocksdb/metadata.h
index 4c6c79f4c6fb..29e6b6dc575d 100644
--- a/include/rocksdb/metadata.h
+++ b/include/rocksdb/metadata.h
@@ -224,6 +224,20 @@ struct LevelMetaData {
   const std::vector<SstFileMetaData> files;
 };
 
+// Options for filtering column family metadata by key range.
+struct GetColumnFamilyMetaDataOptions {
+  RangeOpt range;
+
+  // The level to filter on. If -1, all levels are included.
+  int level = -1;
+
+  GetColumnFamilyMetaDataOptions() = default;
+
+  GetColumnFamilyMetaDataOptions(const OptSlice& _start_key,
+                                 const OptSlice& _end_key, int _level = -1)
+      : range(_start_key, _end_key), level(_level) {}
+};
+
 // The metadata that describes a column family.
 struct ColumnFamilyMetaData {
   ColumnFamilyMetaData() : size(0), file_count(0), name("") {}
diff --git a/include/rocksdb/utilities/stackable_db.h b/include/rocksdb/utilities/stackable_db.h
index 0710c713de0b..c84dc06b8168 100644
--- a/include/rocksdb/utilities/stackable_db.h
+++ b/include/rocksdb/utilities/stackable_db.h
@@ -456,6 +456,12 @@ class StackableDB : public DB {
     db_->GetColumnFamilyMetaData(column_family, cf_meta);
   }
 
+  void GetColumnFamilyMetaData(ColumnFamilyHandle* column_family,
+                               const GetColumnFamilyMetaDataOptions& options,
+                               ColumnFamilyMetaData* metadata) override {
+    db_->GetColumnFamilyMetaData(column_family, options, metadata);
+  }
+
   using DB::StartBlockCacheTrace;
   Status StartBlockCacheTrace(
       const TraceOptions& trace_options,

From c757f5b4e30718487ae06f9e5cb1be1fedbe1340 Mon Sep 17 00:00:00 2001
From: Siying Dong <dong.sy@gmail.com>
Date: Tue, 11 Nov 2025 15:58:00 -0800
Subject: [PATCH 374/500] Java's Get() to directly return for NotFound (#14095)

Summary:
Right now, in Java's Get() calls, the way Get() is treated is inefficient. Status.NotFound is turned into an exception in the JNI layer, and is caught in the same function to turn into not found return. This causes significant overhead in the scenario where most of the queries ending up with not found. For example, in Spark's deduplication query, this exception creation overhead is higher than Get() itself. With the proposed change, if return status is NotFound, we directly return, rather than going through the exception path

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14095

Test Plan: Existing tests should cover all Get() cases, and they are passing.

Reviewed By: jaykorean

Differential Revision: D86797594

Pulled By: cbi42

fbshipit-source-id: 1202d24e46a2358976bb7c8ff38a2fd4783d0f99
---
 java/rocksjni/rocksjni.cc | 87 +++++++++++++++++++++++++--------------
 1 file changed, 57 insertions(+), 30 deletions(-)

diff --git a/java/rocksjni/rocksjni.cc b/java/rocksjni/rocksjni.cc
index 5aad46fa4926..57272170c326 100644
--- a/java/rocksjni/rocksjni.cc
+++ b/java/rocksjni/rocksjni.cc
@@ -1210,6 +1210,9 @@ jint Java_org_rocksdb_RocksDB_getDirect(JNIEnv* env, jclass /*jdb*/,
           db->DefaultColumnFamily(), key.slice(), &value.pinnable_slice());
     }
 
+    if (s.IsNotFound()) {
+      return ROCKSDB_NAMESPACE::KVException::kNotFound;
+    }
     ROCKSDB_NAMESPACE::KVException::ThrowOnError(env, s);
     return value.Fetch();
   } catch (ROCKSDB_NAMESPACE::KVException& e) {
@@ -1453,10 +1456,13 @@ jbyteArray Java_org_rocksdb_RocksDB_get__J_3BII(JNIEnv* env, jclass,
   try {
     ROCKSDB_NAMESPACE::JByteArraySlice key(env, jkey, jkey_off, jkey_len);
     ROCKSDB_NAMESPACE::JByteArrayPinnableSlice value(env);
-    ROCKSDB_NAMESPACE::KVException::ThrowOnError(
-        env,
+    ROCKSDB_NAMESPACE::Status s =
         db->Get(ROCKSDB_NAMESPACE::ReadOptions(), db->DefaultColumnFamily(),
-                key.slice(), &value.pinnable_slice()));
+                key.slice(), &value.pinnable_slice());
+    if (s.IsNotFound()) {
+      return nullptr;
+    }
+    ROCKSDB_NAMESPACE::KVException::ThrowOnError(env, s);
     return value.NewByteArray();
 
   } catch (ROCKSDB_NAMESPACE::KVException&) {
@@ -1484,9 +1490,13 @@ jbyteArray Java_org_rocksdb_RocksDB_get__J_3BIIJ(JNIEnv* env, jclass,
   try {
     ROCKSDB_NAMESPACE::JByteArraySlice key(env, jkey, jkey_off, jkey_len);
     ROCKSDB_NAMESPACE::JByteArrayPinnableSlice value(env);
-    ROCKSDB_NAMESPACE::KVException::ThrowOnError(
-        env, db->Get(ROCKSDB_NAMESPACE::ReadOptions(), cf_handle, key.slice(),
-                     &value.pinnable_slice()));
+    ROCKSDB_NAMESPACE::Status s =
+        db->Get(ROCKSDB_NAMESPACE::ReadOptions(), cf_handle, key.slice(),
+                &value.pinnable_slice());
+    if (s.IsNotFound()) {
+      return nullptr;
+    }
+    ROCKSDB_NAMESPACE::KVException::ThrowOnError(env, s);
     return value.NewByteArray();
 
   } catch (ROCKSDB_NAMESPACE::KVException&) {
@@ -1509,11 +1519,13 @@ jbyteArray Java_org_rocksdb_RocksDB_get__JJ_3BII(JNIEnv* env, jclass,
   try {
     ROCKSDB_NAMESPACE::JByteArraySlice key(env, jkey, jkey_off, jkey_len);
     ROCKSDB_NAMESPACE::JByteArrayPinnableSlice value(env);
-    ROCKSDB_NAMESPACE::KVException::ThrowOnError(
-        env,
-        db->Get(
-            *reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jropt_handle),
-            db->DefaultColumnFamily(), key.slice(), &value.pinnable_slice()));
+    ROCKSDB_NAMESPACE::Status s = db->Get(
+        *reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jropt_handle),
+        db->DefaultColumnFamily(), key.slice(), &value.pinnable_slice());
+    if (s.IsNotFound()) {
+      return nullptr;
+    }
+    ROCKSDB_NAMESPACE::KVException::ThrowOnError(env, s);
     return value.NewByteArray();
   } catch (ROCKSDB_NAMESPACE::KVException&) {
     return nullptr;
@@ -1538,10 +1550,13 @@ jbyteArray Java_org_rocksdb_RocksDB_get__JJ_3BIIJ(
   try {
     ROCKSDB_NAMESPACE::JByteArraySlice key(env, jkey, jkey_off, jkey_len);
     ROCKSDB_NAMESPACE::JByteArrayPinnableSlice value(env);
-    ROCKSDB_NAMESPACE::KVException::ThrowOnError(
-        env, db->Get(*reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(
-                         jropt_handle),
-                     cf_handle, key.slice(), &value.pinnable_slice()));
+    ROCKSDB_NAMESPACE::Status s = db->Get(
+        *reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jropt_handle),
+        cf_handle, key.slice(), &value.pinnable_slice());
+    if (s.IsNotFound()) {
+      return nullptr;
+    }
+    ROCKSDB_NAMESPACE::KVException::ThrowOnError(env, s);
     return value.NewByteArray();
   } catch (ROCKSDB_NAMESPACE::KVException&) {
     return nullptr;
@@ -1563,10 +1578,13 @@ jint Java_org_rocksdb_RocksDB_get__J_3BII_3BII(JNIEnv* env, jclass,
     ROCKSDB_NAMESPACE::JByteArraySlice key(env, jkey, jkey_off, jkey_len);
     ROCKSDB_NAMESPACE::JByteArrayPinnableSlice value(env, jval, jval_off,
                                                      jval_len);
-    ROCKSDB_NAMESPACE::KVException::ThrowOnError(
-        env,
+    ROCKSDB_NAMESPACE::Status s =
         db->Get(ROCKSDB_NAMESPACE::ReadOptions(), db->DefaultColumnFamily(),
-                key.slice(), &value.pinnable_slice()));
+                key.slice(), &value.pinnable_slice());
+    if (s.IsNotFound()) {
+      return ROCKSDB_NAMESPACE::KVException::kNotFound;
+    }
+    ROCKSDB_NAMESPACE::KVException::ThrowOnError(env, s);
     return value.Fetch();
 
   } catch (ROCKSDB_NAMESPACE::KVException& e) {
@@ -1595,9 +1613,13 @@ jint Java_org_rocksdb_RocksDB_get__J_3BII_3BIIJ(JNIEnv* env, jclass,
     ROCKSDB_NAMESPACE::JByteArraySlice key(env, jkey, jkey_off, jkey_len);
     ROCKSDB_NAMESPACE::JByteArrayPinnableSlice value(env, jval, jval_off,
                                                      jval_len);
-    ROCKSDB_NAMESPACE::KVException::ThrowOnError(
-        env, db->Get(ROCKSDB_NAMESPACE::ReadOptions(), cf_handle, key.slice(),
-                     &value.pinnable_slice()));
+    ROCKSDB_NAMESPACE::Status s =
+        db->Get(ROCKSDB_NAMESPACE::ReadOptions(), cf_handle, key.slice(),
+                &value.pinnable_slice());
+    if (s.IsNotFound()) {
+      return ROCKSDB_NAMESPACE::KVException::kNotFound;
+    }
+    ROCKSDB_NAMESPACE::KVException::ThrowOnError(env, s);
     return value.Fetch();
 
   } catch (ROCKSDB_NAMESPACE::KVException& e) {
@@ -1621,11 +1643,13 @@ jint Java_org_rocksdb_RocksDB_get__JJ_3BII_3BII(JNIEnv* env, jclass,
     ROCKSDB_NAMESPACE::JByteArraySlice key(env, jkey, jkey_off, jkey_len);
     ROCKSDB_NAMESPACE::JByteArrayPinnableSlice value(env, jval, jval_off,
                                                      jval_len);
-    ROCKSDB_NAMESPACE::KVException::ThrowOnError(
-        env,
-        db->Get(
-            *reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jropt_handle),
-            db->DefaultColumnFamily(), key.slice(), &value.pinnable_slice()));
+    ROCKSDB_NAMESPACE::Status s = db->Get(
+        *reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jropt_handle),
+        db->DefaultColumnFamily(), key.slice(), &value.pinnable_slice());
+    if (s.IsNotFound()) {
+      return ROCKSDB_NAMESPACE::KVException::kNotFound;
+    }
+    ROCKSDB_NAMESPACE::KVException::ThrowOnError(env, s);
     return value.Fetch();
 
   } catch (ROCKSDB_NAMESPACE::KVException& e) {
@@ -1652,10 +1676,13 @@ jint Java_org_rocksdb_RocksDB_get__JJ_3BII_3BIIJ(
     ROCKSDB_NAMESPACE::JByteArraySlice key(env, jkey, jkey_off, jkey_len);
     ROCKSDB_NAMESPACE::JByteArrayPinnableSlice value(env, jval, jval_off,
                                                      jval_len);
-    ROCKSDB_NAMESPACE::KVException::ThrowOnError(
-        env, db->Get(*reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(
-                         jropt_handle),
-                     cf_handle, key.slice(), &value.pinnable_slice()));
+    ROCKSDB_NAMESPACE::Status s = db->Get(
+        *reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jropt_handle),
+        cf_handle, key.slice(), &value.pinnable_slice());
+    if (s.IsNotFound()) {
+      return ROCKSDB_NAMESPACE::KVException::kNotFound;
+    }
+    ROCKSDB_NAMESPACE::KVException::ThrowOnError(env, s);
     return value.Fetch();
 
   } catch (ROCKSDB_NAMESPACE::KVException& e) {

From 2cf81e0a20ddc232c6706700cbdbd4d475db3b91 Mon Sep 17 00:00:00 2001
From: Viraj Thakur <virajthakur@fb.com>
Date: Wed, 12 Nov 2025 10:29:44 -0800
Subject: [PATCH 375/500] fix compiler warning for mutex->AssertHeld (#14115)

Summary:
We are seeing Github actions failures due to a compiler error:

https://github.com/facebook/rocksdb/actions/runs/19190877461/job/54865138898?fbclid=IwY2xjawN_Hc9leHRuA2FlbQIxMQBicmlkETFZeGlpZXZXMGlDTVhTYldwc3J0YwZhcHBfaWQBMAABHp6JoIoMBbZq-8Kgfc1honBdkAbHAZzW2ORiCM2Br2D9utxtMlq6IIqUUQnu_aem_SOU-DDsjDDMB3mTncKfLwQ&brid=VRqQ-asf2myW425wX1qqhg

When UpdatedMutableDbOptions is called from the VersionSet constructor, manifest_file_size_ is 0, and mu is nullptr. This is expected and fine, and we never enter the block where AssertHeld is called.

All other times UpdatedMutableDbOptions is called, the mutex must be held. This PR just checks that mu is not null, to satisfy the compiler. We could alternatively intentionally crash if there is concern over a silent failure if mu is passed as nullptr

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14115

Reviewed By: pdillinger

Differential Revision: D86733318

Pulled By: virajthakur

fbshipit-source-id: ce9ed6275c9495a3ea2a12f984dbceef7b441e24
---
 db/version_set.cc | 112 +++++++++++++++++++++++++---------------------
 1 file changed, 60 insertions(+), 52 deletions(-)

diff --git a/db/version_set.cc b/db/version_set.cc
index 84fe95a9b535..960b897d00ff 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -5549,8 +5549,11 @@ void VersionSet::Reset() {
 void VersionSet::UpdatedMutableDbOptions(
     const MutableDBOptions& updated_options, InstrumentedMutex* mu) {
   // Must be holding mutex if not called during initialization
-  if (manifest_file_size_ > 0) {
+  if (mu) {
     mu->AssertHeld();
+  } else {
+    // manifest_file_size_ must be 0 if called from the constructor
+    assert(manifest_file_size_ == 0);
   }
   file_options_.writable_file_max_buffer_size =
       updated_options.writable_file_max_buffer_size;
@@ -5650,8 +5653,8 @@ Status VersionSet::ProcessManifestWrites(
         // the preceding version edits in the same atomic group, and update
         // their `remaining_entries_` member variable because we are NOT going
         // to write the version edits' of dropped CF to the MANIFEST. If we
-        // don't update, then Recover can report corrupted atomic group because
-        // the `remaining_entries_` do not match.
+        // don't update, then Recover can report corrupted atomic group
+        // because the `remaining_entries_` do not match.
         if (!batch_edits.empty()) {
           if (batch_edits.back()->IsInAtomicGroup() &&
               batch_edits.back()->GetRemainingEntries() > 0) {
@@ -5956,10 +5959,12 @@ Status VersionSet::ProcessManifestWrites(
 #ifndef NDEBUG
         if (batch_edits.size() > 1 && batch_edits.size() - 1 == idx) {
           TEST_SYNC_POINT_CALLBACK(
-              "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:0",
+              "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:"
+              "0",
               nullptr);
           TEST_SYNC_POINT(
-              "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:1");
+              "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:"
+              "1");
         }
         ++idx;
 #endif /* !NDEBUG */
@@ -5996,8 +6001,8 @@ Status VersionSet::ProcessManifestWrites(
           file_options_.temperature, dir_contains_current_file);
       if (!io_s.ok()) {
         s = io_s;
-        // Quarantine old manifest file in case new manifest file's CURRENT file
-        // wasn't created successfully and the old manifest is needed.
+        // Quarantine old manifest file in case new manifest file's CURRENT
+        // file wasn't created successfully and the old manifest is needed.
         limbo_descriptor_log_file_number.push_back(manifest_file_number_);
         files_to_quarantine_if_commit_fail.push_back(
             &limbo_descriptor_log_file_number);
@@ -6146,21 +6151,21 @@ Status VersionSet::ProcessManifestWrites(
     // that renaming tmp file to CURRENT failed.
     //
     // On local POSIX-compliant FS, the CURRENT must point to the original
-    // MANIFEST. We can delete the new MANIFEST for simplicity, but we can also
-    // keep it. Future recovery will ignore this MANIFEST. It's also ok for the
-    // process not to crash and continue using the db. Any future LogAndApply()
-    // call will switch to a new MANIFEST and update CURRENT, still ignoring
-    // this one.
+    // MANIFEST. We can delete the new MANIFEST for simplicity, but we can
+    // also keep it. Future recovery will ignore this MANIFEST. It's also ok
+    // for the process not to crash and continue using the db. Any future
+    // LogAndApply() call will switch to a new MANIFEST and update CURRENT,
+    // still ignoring this one.
     //
     // On non-local FS, it is
     // possible that the rename operation succeeded on the server (remote)
     // side, but the client somehow returns a non-ok status to RocksDB. Note
     // that this does not violate atomicity. Should we delete the new MANIFEST
     // successfully, a subsequent recovery attempt will likely see the CURRENT
-    // pointing to the new MANIFEST, thus fail. We will not be able to open the
-    // DB again. Therefore, if manifest operations succeed, we should keep the
-    // the new MANIFEST. If the process proceeds, any future LogAndApply() call
-    // will switch to a new MANIFEST and update CURRENT. If user tries to
+    // pointing to the new MANIFEST, thus fail. We will not be able to open
+    // the DB again. Therefore, if manifest operations succeed, we should keep
+    // the the new MANIFEST. If the process proceeds, any future LogAndApply()
+    // call will switch to a new MANIFEST and update CURRENT. If user tries to
     // re-open the DB,
     // a) CURRENT points to the new MANIFEST, and the new MANIFEST is present.
     // b) CURRENT points to the original MANIFEST, and the original MANIFEST
@@ -6289,9 +6294,9 @@ Status VersionSet::LogAndApply(
     first_writer.cv.Wait();
   }
   if (first_writer.done) {
-    // All non-CF-manipulation operations can be grouped together and committed
-    // to MANIFEST. They should all have finished. The status code is stored in
-    // the first manifest writer.
+    // All non-CF-manipulation operations can be grouped together and
+    // committed to MANIFEST. They should all have finished. The status code
+    // is stored in the first manifest writer.
 #ifndef NDEBUG
     for (const auto& writer : writers) {
       assert(writer.done);
@@ -6345,8 +6350,8 @@ void VersionSet::LogAndApplyCFHelper(VersionEdit* edit,
   assert(!edit->HasLastSequence());
   edit->SetLastSequence(*max_last_sequence);
   if (edit->IsColumnFamilyDrop()) {
-    // if we drop column family, we have to make sure to save max column family,
-    // so that we don't reuse existing ID
+    // if we drop column family, we have to make sure to save max column
+    // family, so that we don't reuse existing ID
     edit->SetMaxColumnFamily(column_family_set_->GetMaxColumnFamily());
   }
 }
@@ -6635,7 +6640,8 @@ void VersionSet::RecoverEpochNumbers() {
 Status VersionSet::ListColumnFamilies(std::vector<std::string>* column_families,
                                       const std::string& dbname,
                                       FileSystem* fs) {
-  // Read "CURRENT" file, which contains a pointer to the current manifest file
+  // Read "CURRENT" file, which contains a pointer to the current manifest
+  // file
   std::string manifest_path;
   uint64_t manifest_file_number;
   Status s = GetCurrentManifestPath(dbname, fs, /*is_retry=*/false,
@@ -6792,9 +6798,9 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname,
 
 // Get the checksum information including the checksum and checksum function
 // name of all SST and blob files in VersionSet. Store the information in
-// FileChecksumList which contains a map from file number to its checksum info.
-// If DB is not running, make sure call VersionSet::Recover() to load the file
-// metadata from Manifest to VersionSet before calling this function.
+// FileChecksumList which contains a map from file number to its checksum
+// info. If DB is not running, make sure call VersionSet::Recover() to load
+// the file metadata from Manifest to VersionSet before calling this function.
 Status VersionSet::GetLiveFilesChecksumInfo(FileChecksumList* checksum_list) {
   // Clean the previously stored checksum information if any.
   Status s;
@@ -6936,8 +6942,8 @@ Status VersionSet::WriteCurrentStateToManifest(
   // WARNING: This method doesn't hold a mutex!!
 
   // This is done without DB mutex lock held, but only within single-threaded
-  // LogAndApply. Column family manipulations can only happen within LogAndApply
-  // (the same single thread), so we're safe to iterate.
+  // LogAndApply. Column family manipulations can only happen within
+  // LogAndApply (the same single thread), so we're safe to iterate.
 
   assert(io_s.ok());
   if (db_options_->write_dbid_to_manifest) {
@@ -6971,9 +6977,9 @@ Status VersionSet::WriteCurrentStateToManifest(
   }
 
   // New manifest should rollover the WAL deletion record from previous
-  // manifest. Otherwise, when an addition record of a deleted WAL gets added to
-  // this new manifest later (which can happens in e.g, SyncWAL()), this new
-  // manifest creates an illusion that such WAL hasn't been deleted.
+  // manifest. Otherwise, when an addition record of a deleted WAL gets added
+  // to this new manifest later (which can happens in e.g, SyncWAL()), this
+  // new manifest creates an illusion that such WAL hasn't been deleted.
   VersionEdit wal_deletions;
   wal_deletions.DeleteWalsBefore(min_log_number_to_keep());
   std::string wal_deletions_record;
@@ -7105,9 +7111,9 @@ Status VersionSet::WriteCurrentStateToManifest(
 // TODO(aekmekji): in CompactionJob::GenSubcompactionBoundaries(), this
 // function is called repeatedly with consecutive pairs of slices. For example
 // if the slice list is [a, b, c, d] this function is called with arguments
-// (a,b) then (b,c) then (c,d). Knowing this, an optimization is possible where
-// we avoid doing binary search for the keys b and c twice and instead somehow
-// maintain state of where they first appear in the files.
+// (a,b) then (b,c) then (c,d). Knowing this, an optimization is possible
+// where we avoid doing binary search for the keys b and c twice and instead
+// somehow maintain state of where they first appear in the files.
 uint64_t VersionSet::ApproximateSize(const SizeApproximationOptions& options,
                                      const ReadOptions& read_options,
                                      Version* v, const Slice& start,
@@ -7128,19 +7134,20 @@ uint64_t VersionSet::ApproximateSize(const SizeApproximationOptions& options,
   }
 
   // Outline of the optimization that uses options.files_size_error_margin.
-  // When approximating the files total size that is used to store a keys range,
-  // we first sum up the sizes of the files that fully fall into the range.
-  // Then we sum up the sizes of all the files that may intersect with the range
-  // (this includes all files in L0 as well). Then, if total_intersecting_size
-  // is smaller than total_full_size * options.files_size_error_margin - we can
-  // infer that the intersecting files have a sufficiently negligible
-  // contribution to the total size, and we can approximate the storage required
-  // for the keys in range as just half of the intersecting_files_size.
-  // E.g., if the value of files_size_error_margin is 0.1, then the error of the
-  // approximation is limited to only ~10% of the total size of files that fully
-  // fall into the keys range. In such case, this helps to avoid a costly
-  // process of binary searching the intersecting files that is required only
-  // for a more precise calculation of the total size.
+  // When approximating the files total size that is used to store a keys
+  // range, we first sum up the sizes of the files that fully fall into the
+  // range. Then we sum up the sizes of all the files that may intersect with
+  // the range (this includes all files in L0 as well). Then, if
+  // total_intersecting_size is smaller than total_full_size *
+  // options.files_size_error_margin - we can infer that the intersecting
+  // files have a sufficiently negligible contribution to the total size, and
+  // we can approximate the storage required for the keys in range as just
+  // half of the intersecting_files_size. E.g., if the value of
+  // files_size_error_margin is 0.1, then the error of the approximation is
+  // limited to only ~10% of the total size of files that fully fall into the
+  // keys range. In such case, this helps to avoid a costly process of binary
+  // searching the intersecting files that is required only for a more precise
+  // calculation of the total size.
 
   autovector<FdWithKeyRange*, 32> first_files;
   autovector<FdWithKeyRange*, 16> last_files;
@@ -7212,10 +7219,11 @@ uint64_t VersionSet::ApproximateSize(const SizeApproximationOptions& options,
     total_intersecting_size += file_ptr->fd.GetFileSize();
   }
 
-  // Now scan all the first & last files at each level, and estimate their size.
-  // If the total_intersecting_size is less than X% of the total_full_size - we
-  // want to approximate the result in order to avoid the costly binary search
-  // inside ApproximateSize. We use half of file size as an approximation below.
+  // Now scan all the first & last files at each level, and estimate their
+  // size. If the total_intersecting_size is less than X% of the
+  // total_full_size - we want to approximate the result in order to avoid the
+  // costly binary search inside ApproximateSize. We use half of file size as
+  // an approximation below.
 
   const double margin = options.files_size_error_margin;
   if (margin > 0 && total_intersecting_size <
@@ -7888,8 +7896,8 @@ Status ReactiveVersionSet::MaybeSwitchManifest(
     }
   } else if (s.IsPathNotFound()) {
     // This can happen if the primary switches to a new MANIFEST after the
-    // secondary reads the CURRENT file but before the secondary actually tries
-    // to open the MANIFEST.
+    // secondary reads the CURRENT file but before the secondary actually
+    // tries to open the MANIFEST.
     s = Status::TryAgain(
         "The primary may have switched to a new MANIFEST and deleted the old "
         "one.");

From f6c9c3bf1cf05096e8ff8c03ded60c1e199edbb7 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Wed, 12 Nov 2025 21:40:15 -0800
Subject: [PATCH 376/500] Use AutoHCC by default in tools (#14120)

Summary:
Oversight in https://github.com/facebook/rocksdb/issues/13964. More detail:
* Applies to cache_bench and db_bench (db_stress already using it)
* Make sure those along with db_stress treat "hyper_clock_cache" as "auto_hyper_clock_cache" because this is now the blessed implementation.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14120

Test Plan: manual runs of the tools

Reviewed By: krhancoc

Differential Revision: D86913202

Pulled By: pdillinger

fbshipit-source-id: 07b425d3522103417f4b034735376b9d759af5fb
---
 cache/cache_bench_tool.cc             | 2 +-
 db_stress_tool/db_stress_test_base.cc | 6 +++---
 tools/db_bench_tool.cc                | 8 ++++----
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/cache/cache_bench_tool.cc b/cache/cache_bench_tool.cc
index 6de9c00818b6..7b62fbae662a 100644
--- a/cache/cache_bench_tool.cc
+++ b/cache/cache_bench_tool.cc
@@ -119,7 +119,7 @@ DEFINE_uint32(seed, 0, "Hashing/random seed to use. 0 = choose at random");
 DEFINE_string(secondary_cache_uri, "",
               "Full URI for creating a custom secondary cache object");
 
-DEFINE_string(cache_type, "lru_cache", "Type of block cache.");
+DEFINE_string(cache_type, "hyper_clock_cache", "Type of block cache.");
 
 DEFINE_bool(use_jemalloc_no_dump_allocator, false,
             "Whether to use JemallocNoDumpAllocator");
diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc
index 6a37af5a4c66..c2c000f506fb 100644
--- a/db_stress_tool/db_stress_test_base.cc
+++ b/db_stress_tool/db_stress_test_base.cc
@@ -195,10 +195,10 @@ std::shared_ptr<Cache> StressTest::NewCache(size_t capacity,
     exit(1);
   } else if (EndsWith(cache_type, "hyper_clock_cache")) {
     size_t estimated_entry_charge;
-    if (cache_type == "fixed_hyper_clock_cache" ||
-        cache_type == "hyper_clock_cache") {
+    if (cache_type == "fixed_hyper_clock_cache") {
       estimated_entry_charge = FLAGS_block_size;
-    } else if (cache_type == "auto_hyper_clock_cache") {
+    } else if (cache_type == "auto_hyper_clock_cache" ||
+               cache_type == "hyper_clock_cache") {
       estimated_entry_charge = 0;
     } else {
       fprintf(stderr, "Cache type not supported.");
diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc
index 655bba868f6e..16033434f564 100644
--- a/tools/db_bench_tool.cc
+++ b/tools/db_bench_tool.cc
@@ -587,7 +587,7 @@ DEFINE_double(cache_high_pri_pool_ratio, 0.0,
 DEFINE_double(cache_low_pri_pool_ratio, 0.0,
               "Ratio of block cache reserve for low pri blocks.");
 
-DEFINE_string(cache_type, "lru_cache", "Type of block cache.");
+DEFINE_string(cache_type, "hyper_clock_cache", "Type of block cache.");
 
 DEFINE_bool(use_compressed_secondary_cache, false,
             "Use the CompressedSecondaryCache as the secondary cache.");
@@ -3260,10 +3260,10 @@ class Benchmark {
       db_bench_exit(1);
     } else if (EndsWith(FLAGS_cache_type, "hyper_clock_cache")) {
       size_t estimated_entry_charge;
-      if (FLAGS_cache_type == "fixed_hyper_clock_cache" ||
-          FLAGS_cache_type == "hyper_clock_cache") {
+      if (FLAGS_cache_type == "fixed_hyper_clock_cache") {
         estimated_entry_charge = FLAGS_block_size;
-      } else if (FLAGS_cache_type == "auto_hyper_clock_cache") {
+      } else if (FLAGS_cache_type == "auto_hyper_clock_cache" ||
+                 FLAGS_cache_type == "hyper_clock_cache") {
         estimated_entry_charge = 0;
       } else {
         fprintf(stderr, "Cache type not supported.");

From b9951ded37ba48d3fdcd1f2b484973b40b3d245e Mon Sep 17 00:00:00 2001
From: Ryan Hancock <krhancock@meta.com>
Date: Tue, 18 Nov 2025 15:57:03 -0800
Subject: [PATCH 377/500] Introducing Prepare all iterators for LevelIterator
 (#14100)

Summary:
This diff introduces the async prepare of all iterators within a MultiScan. The current state has each iterator be prepared as its needed, and with this diff, we prepare all iterators during the prepare phase of the Level Iterator, this will allow more time for each IO to be dispatched and serviced, increasing the odds that a block is ready as the scan seeks to it.

Benchmark is prefilled using
```
KEYSIZE=64
VALUESIZE=512
NUMKEYS=5000000
SCAN_SIZE=100
DISTANCE=25000
NUM_SCANS=15
THREADS=1

./db_bench --db=$DB \
    --benchmarks="fillseq" \
    --write_buffer_size=5242880 \
    --max_write_buffer_number=4 \
    --target_file_size_base=5242880 \
    --disable_wal=1 --key_size=$KEYSIZE \
    --value_size=$VALUESIZE --num=$NUMKEYS --threads=32

}
```

And benchmark ran is
```
run() {
echo 1 | sudo tee /proc/sys/vm/drop_caches
./db_bench --db=$DB --use_existing_db=1 \
    --benchmarks=multiscan \
    --disable_auto_compactions=1 --seek_nexts=$SCAN_SIZE \
    --multiscan-use-async-io=1 \
    --multiscan-size=$NUM_SCANS --multiscan-stride=$DISTANCE \
    --key_size=$KEYSIZE --value_size=$VALUESIZE \
    --num=$NUMKEYS --threads=$THREADS --duration=60 --statistics
}
```

The benchmark uses large stride sides to ensure that two scans would touch separate files. We reduce the size of the block cache to increase likelyhood of reads (and simulate larger data sets)

**Branch:**

```
Integrated BlobDB: blob cache disabled
RocksDB:    version 10.8.0
Date:       Tue Nov 11 13:26:29 2025
CPU:        166 * AMD EPYC-Milan Processor
CPUCache:   512 KB
Keys:       64 bytes each (+ 0 bytes user-defined timestamp)
Values:     512 bytes each (256 bytes after compression)
Entries:    5000000
Prefix:    0 bytes
Keys per prefix:    0
RawSize:    2746.6 MB (estimated)
FileSize:   1525.9 MB (estimated)
Write rate: 0 bytes/second
Read rate: 0 ops/second
Compression: Snappy
Compression sampling rate: 0
Memtablerep: SkipListFactory
Perf Level: 1
------------------------------------------------
multiscan_stride = 25000
multiscan_size = 15
seek_nexts = 100
DB path: [/data/rocksdb/mydb]
multiscan    :     837.941 micros/op 1193 ops/sec 60.001 seconds 71605 operations; (multscans:71605)
```

**Baseline:**

```
Set seed to 1762898809121995 because --seed was 0
Initializing RocksDB Options from the specified file
Initializing RocksDB Options from command-line flags
Integrated BlobDB: blob cache disabled
RocksDB:    version 10.9.0
Date:       Tue Nov 11 14:06:49 2025
CPU:        166 * AMD EPYC-Milan Processor
CPUCache:   512 KB
Keys:       64 bytes each (+ 0 bytes user-defined timestamp)
Values:     512 bytes each (256 bytes after compression)
Entries:    5000000
Prefix:    0 bytes
Keys per prefix:    0
RawSize:    2746.6 MB (estimated)
FileSize:   1525.9 MB (estimated)
Write rate: 0 bytes/second
Read rate: 0 ops/second
Compression: Snappy
Compression sampling rate: 0
Memtablerep: SkipListFactory
Perf Level: 1
------------------------------------------------
multiscan_stride = 25000
multiscan_size = 15
seek_nexts = 100
DB path: [/data/rocksdb/mydb]
multiscan    :    1129.916 micros/op 885 ops/sec 60.001 seconds 53102 operations; (multscans:53102)
```
Repeated for confirmation.

This introduces a ~20% improvement in latency and op/s.

Note: Benchmarks are single threaded as, when increasing thread count, we start seeing large amounts of overhead being induced by block cache contention, finally resulting in both baseline and branch becoming equal.

Further on network attached storage with high latency, the level iterator, preparing all iterators so a 20% improvement even at high thread counts.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14100

Reviewed By: anand1976

Differential Revision: D86913584

Pulled By: krhancoc

fbshipit-source-id: da9d0c890e25e392a33389ce6b80f9bfb84d3f85
---
 db/db_iterator_test.cc                        | 293 ++++++++++++++++++
 db/version_set.cc                             |  97 ++++--
 include/rocksdb/statistics.h                  |   3 +
 monitoring/statistics.cc                      |   2 +
 .../block_based/block_based_table_iterator.cc |   1 -
 ...ll_iterators_in_level_iterators_prepare.md |   1 +
 6 files changed, 375 insertions(+), 22 deletions(-)
 create mode 100644 unreleased_history/performance_improvements/Prefetch_all_iterators_in_level_iterators_prepare.md

diff --git a/db/db_iterator_test.cc b/db/db_iterator_test.cc
index dc73938dad21..20d9d9fa2d59 100644
--- a/db/db_iterator_test.cc
+++ b/db/db_iterator_test.cc
@@ -4154,6 +4154,9 @@ INSTANTIATE_TEST_CASE_P(DBMultiScanIteratorTest, DBMultiScanIteratorTest,
                         ::testing::Bool());
 
 TEST_P(DBMultiScanIteratorTest, BasicTest) {
+  auto options = CurrentOptions();
+  DestroyAndReopen(options);
+
   // Create a file
   for (int i = 0; i < 100; ++i) {
     std::stringstream ss;
@@ -4196,6 +4199,8 @@ TEST_P(DBMultiScanIteratorTest, BasicTest) {
 }
 
 TEST_P(DBMultiScanIteratorTest, MixedBoundsTest) {
+  auto options = CurrentOptions();
+  DestroyAndReopen(options);
   // Create a file
   for (int i = 0; i < 100; ++i) {
     std::stringstream ss;
@@ -4734,6 +4739,294 @@ TEST_P(DBMultiScanIteratorTest, ReseekAcrossBlocksSameUserKey) {
   }
 }
 
+TEST_P(DBMultiScanIteratorTest, AsyncPrefetchAcrossMultipleFiles) {
+  // Test async prefetch with multiple ranges within a single file
+  auto options = CurrentOptions();
+  options.target_file_size_base = 1 << 15;  // 32KiB
+  options.compaction_style = kCompactionStyleUniversal;
+  options.num_levels = 50;
+  options.compression = kNoCompression;
+  options.statistics = CreateDBStatistics();
+  DestroyAndReopen(options);
+
+  Random rnd(303);
+
+  // Create a single large file with many keys
+  // ~1MiB of data
+  // Should be lots of files now
+  for (int i = 0; i < 1000; ++i) {
+    std::stringstream ss;
+    ss << "k" << std::setw(5) << std::setfill('0') << i;
+    // 1KiB values
+    ASSERT_OK(Put(ss.str(), rnd.RandomString(1 << 10)));
+  }
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(db_->CompactRange({}, nullptr, nullptr));
+
+  ASSERT_GT(NumTableFilesAtLevel(49), 3);
+
+  // Set up multiple non-overlapping ranges in the same file
+  // Every 32 values should be a file or so
+  std::vector<std::string> key_ranges(
+      {"k00000", "k00100", "k00500", "k00600", "k00800", "k00900"});
+  ReadOptions ro;
+  ro.fill_cache = GetParam();
+
+  ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily();
+
+  MultiScanArgs scan_options(BytewiseComparator());
+  scan_options.use_async_io = true;
+  scan_options.insert(key_ranges[0], key_ranges[1]);
+  scan_options.insert(key_ranges[2], key_ranges[3]);
+  scan_options.insert(key_ranges[4], key_ranges[5]);
+
+  auto read_count_before =
+      options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
+  std::unique_ptr<MultiScan> iter =
+      dbfull()->NewMultiScan(ro, cfh, scan_options);
+  ASSERT_NE(iter, nullptr);
+  auto read_count_after =
+      options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
+  ASSERT_EQ(read_count_after, read_count_before);
+
+  // Verify all three ranges can be scanned successfully
+  try {
+    for (auto range : *iter) {
+      for (auto it : range) {
+        it.first.ToString();
+      }
+    }
+  } catch (MultiScanException& ex) {
+    // Make sure exception contains the status
+    ASSERT_NOK(ex.status());
+    std::cerr << "Iterator returned status " << ex.what();
+    abort();
+  } catch (std::logic_error& ex) {
+    std::cerr << "Iterator returned logic error " << ex.what();
+    abort();
+  }
+
+  iter.reset();
+}
+
+TEST_P(DBMultiScanIteratorTest, AsyncPrefetchMultipleLevels) {
+  // Test async prefetch with files in L0 and non-L0 levels
+  // Similar setup to AsyncPrefetchAcrossMultipleFiles but with L0 files
+  auto options = CurrentOptions();
+  options.target_file_size_base = 1 << 15;  // 32KiB
+  options.compaction_style = kCompactionStyleUniversal;
+  options.num_levels = 50;
+  options.compression = kNoCompression;
+  options.statistics = CreateDBStatistics();
+  DestroyAndReopen(options);
+
+  Random rnd(304);
+
+  // Create base files and compact to bottom level - ~500KiB of data
+  for (int i = 0; i < 500; ++i) {
+    std::stringstream ss;
+    ss << "k" << std::setw(5) << std::setfill('0') << i;
+    ASSERT_OK(Put(ss.str(), rnd.RandomString(1 << 10)));
+  }
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->CompactRange({}, nullptr, nullptr));
+
+  // Verify we have files at bottom level
+  ASSERT_GT(NumTableFilesAtLevel(49), 0);
+
+  // Create additional L0 files with overlapping key ranges
+  for (int i = 100; i < 150; ++i) {
+    std::stringstream ss;
+    ss << "k" << std::setw(5) << std::setfill('0') << i;
+    ASSERT_OK(Put(ss.str(), rnd.RandomString(1 << 10)));
+  }
+  ASSERT_OK(Flush());
+
+  // Verify we now have files in both L0 and bottom level
+  ASSERT_GT(NumTableFilesAtLevel(0), 0);
+  ASSERT_GT(NumTableFilesAtLevel(49), 0);
+
+  // Set up multiple non-overlapping ranges
+  std::vector<std::string> key_ranges(
+      {"k00000", "k00100", "k00200", "k00300", "k00400", "k00500"});
+  ReadOptions ro;
+  ro.fill_cache = GetParam();
+
+  ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily();
+
+  MultiScanArgs scan_options(BytewiseComparator());
+  scan_options.use_async_io = true;
+  scan_options.insert(key_ranges[0], key_ranges[1]);
+  scan_options.insert(key_ranges[2], key_ranges[3]);
+  scan_options.insert(key_ranges[4], key_ranges[5]);
+
+  std::unique_ptr<MultiScan> iter =
+      dbfull()->NewMultiScan(ro, cfh, scan_options);
+  ASSERT_NE(iter, nullptr);
+
+  // Verify all three ranges can be scanned successfully
+  int total_keys = 0;
+  try {
+    for (auto range : *iter) {
+      for (auto it : range) {
+        it.first.ToString();
+        total_keys++;
+      }
+    }
+  } catch (MultiScanException& ex) {
+    ASSERT_NOK(ex.status());
+    std::cerr << "Iterator returned status " << ex.what();
+    abort();
+  } catch (std::logic_error& ex) {
+    std::cerr << "Iterator returned logic error " << ex.what();
+    abort();
+  }
+
+  // Should have keys from all three ranges
+  ASSERT_GT(total_keys, 0);
+  iter.reset();
+}
+
+TEST_P(DBMultiScanIteratorTest, AsyncPrefetchWithDeleteRange) {
+  // Test async prefetch with delete ranges
+  auto options = CurrentOptions();
+  options.target_file_size_base = 1 << 15;  // 32KiB
+  options.compaction_style = kCompactionStyleUniversal;
+  options.num_levels = 50;
+  options.compression = kNoCompression;
+  DestroyAndReopen(options);
+
+  Random rnd(305);
+
+  // Create base data - ~500KiB
+  for (int i = 0; i < 500; ++i) {
+    std::stringstream ss;
+    ss << "k" << std::setw(5) << std::setfill('0') << i;
+    ASSERT_OK(Put(ss.str(), rnd.RandomString(1 << 10)));
+  }
+  ASSERT_OK(Flush());
+
+  // Add delete ranges
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), dbfull()->DefaultColumnFamily(),
+                             "k00100", "k00200"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(db_->CompactRange({}, nullptr, nullptr));
+  ASSERT_GT(NumTableFilesAtLevel(49), 0);
+
+  // Set up scan ranges that interact with delete ranges
+  std::vector<std::string> key_ranges({"k00000", "k00500"});
+  ReadOptions ro;
+  ro.fill_cache = GetParam();
+
+  ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily();
+
+  MultiScanArgs scan_options(BytewiseComparator());
+  scan_options.use_async_io = true;
+  scan_options.insert(key_ranges[0], key_ranges[1]);
+
+  std::unique_ptr<MultiScan> iter =
+      dbfull()->NewMultiScan(ro, cfh, scan_options);
+  ASSERT_NE(iter, nullptr);
+
+  // Verify ranges can be scanned successfully
+  int total_keys = 0;
+  try {
+    for (auto range : *iter) {
+      for (auto it : range) {
+        std::string key = it.first.ToString();
+        // Verify deleted keys are not returned
+        ASSERT_TRUE((key < "k00100" || key >= "k00200"));
+        total_keys++;
+      }
+    }
+  } catch (MultiScanException& ex) {
+    ASSERT_NOK(ex.status());
+    std::cerr << "Iterator returned status " << ex.what();
+    abort();
+  } catch (std::logic_error& ex) {
+    std::cerr << "Iterator returned logic error " << ex.what();
+    abort();
+  }
+
+  // Should have keys excluding deleted ranges
+  ASSERT_EQ(total_keys, 400);
+  iter.reset();
+}
+
+TEST_P(DBMultiScanIteratorTest, AsyncPrefetchWithExternalFileIngestion) {
+  // Test async prefetch with externally ingested files
+  auto options = CurrentOptions();
+  options.target_file_size_base = 1 << 15;  // 32KiB
+  options.compaction_style = kCompactionStyleUniversal;
+  options.num_levels = 50;
+  options.compression = kNoCompression;
+  DestroyAndReopen(options);
+
+  Random rnd(306);
+
+  // Create base data - ~200KiB
+  for (int i = 0; i < 200; ++i) {
+    std::stringstream ss;
+    ss << "k" << std::setw(5) << std::setfill('0') << i;
+    ASSERT_OK(Put(ss.str(), rnd.RandomString(1 << 10)));
+  }
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->CompactRange({}, nullptr, nullptr));
+
+  // Create and ingest external SST file with new data
+  std::string ingest_file = dbname_ + "/test_ingest.sst";
+  {
+    std::unique_ptr<SstFileWriter> writer;
+    writer.reset(new SstFileWriter(EnvOptions(), options));
+    ASSERT_OK(writer->Open(ingest_file));
+    for (int i = 300; i < 500; ++i) {
+      std::stringstream ss;
+      ss << "k" << std::setw(5) << std::setfill('0') << i;
+      ASSERT_OK(writer->Put(ss.str(), rnd.RandomString(1 << 10)));
+    }
+    ASSERT_OK(writer->Finish());
+  }
+
+  IngestExternalFileOptions ifo;
+  ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily();
+  ASSERT_OK(dbfull()->IngestExternalFile(cfh, {ingest_file}, ifo));
+
+  // Set up scan ranges that span both regular and ingested files
+  std::vector<std::string> key_ranges({"k00000", "k00500"});
+  ReadOptions ro;
+  ro.fill_cache = GetParam();
+
+  MultiScanArgs scan_options(BytewiseComparator());
+  scan_options.use_async_io = true;
+  scan_options.insert(key_ranges[0], key_ranges[1]);
+
+  std::unique_ptr<MultiScan> iter =
+      dbfull()->NewMultiScan(ro, cfh, scan_options);
+  ASSERT_NE(iter, nullptr);
+
+  // Verify all ranges can be scanned successfully
+  int total_keys = 0;
+  try {
+    for (auto range : *iter) {
+      for (auto it : range) {
+        it.first.ToString();
+        total_keys++;
+      }
+    }
+  } catch (MultiScanException& ex) {
+    ASSERT_NOK(ex.status());
+    std::cerr << "Iterator returned status " << ex.what();
+    abort();
+  } catch (std::logic_error& ex) {
+    std::cerr << "Iterator returned logic error " << ex.what();
+    abort();
+  }
+
+  ASSERT_EQ(total_keys, 400);
+  iter.reset();
+}
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/db/version_set.cc b/db/version_set.cc
index 960b897d00ff..b9339929acd0 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -16,6 +16,7 @@
 #include <list>
 #include <map>
 #include <set>
+#include <stdexcept>
 #include <string>
 #include <unordered_map>
 #include <vector>
@@ -980,7 +981,8 @@ class LevelIterator final : public InternalIterator {
           nullptr,
       bool allow_unprepared_value = false,
       std::unique_ptr<TruncatedRangeDelIterator>*** range_tombstone_iter_ptr_ =
-          nullptr)
+          nullptr,
+      Statistics* db_statistics = nullptr, SystemClock* clock = nullptr)
       : table_cache_(table_cache),
         read_options_(read_options),
         file_options_(file_options),
@@ -1005,7 +1007,9 @@ class LevelIterator final : public InternalIterator {
         allow_unprepared_value_(allow_unprepared_value),
         is_next_read_sequential_(false),
         to_return_sentinel_(false),
-        scan_opts_(nullptr) {
+        scan_opts_(nullptr),
+        db_statistics_(db_statistics),
+        clock_(clock) {
     // Empty level is not supported.
     assert(flevel_ != nullptr && flevel_->num_files > 0);
     if (range_tombstone_iter_ptr_) {
@@ -1013,7 +1017,15 @@ class LevelIterator final : public InternalIterator {
     }
   }
 
-  ~LevelIterator() override { delete file_iter_.Set(nullptr); }
+  ~LevelIterator() override {
+    delete file_iter_.Set(nullptr);
+    // Clean up any prepared iterators that weren't used
+    assert(prepared_iters_.size() == 0);
+    for (auto& entry : prepared_iters_) {
+      delete entry.second;
+    }
+    prepared_iters_.clear();
+  }
 
   // Seek to the first file with a key >= target.
   // If range_tombstone_iter_ is not nullptr, then we pretend that file
@@ -1124,10 +1136,12 @@ class LevelIterator final : public InternalIterator {
 
   void Prepare(const MultiScanArgs* so) override {
     // We assume here that scan_opts is sorted such that
-    // scan_opts[0].range.start < scan_opts[1].range.start, and non overlapping
+    // scan_opts[0].range.start < scan_opts[1].range.start, and non
+    // overlapping
     if (so == nullptr) {
       return;
     }
+
     scan_opts_ = so;
 
     // Verify comparator is consistent
@@ -1197,9 +1211,35 @@ class LevelIterator final : public InternalIterator {
         }
       }
     }
+
+    StopWatch timer(clock_, db_statistics_, MULTISCAN_PREPARE_ITERATORS);
+
     // Propagate multiscan configs
     for (auto& file_to_arg : *file_to_scan_opts_) {
       file_to_arg.second.CopyConfigFrom(*so);
+      assert(OverlapRange(*file_to_arg.second.GetScanRanges().begin(),
+                          file_to_arg.first) &&
+             OverlapRange(*file_to_arg.second.GetScanRanges().rbegin(),
+                          file_to_arg.first));
+    }
+
+    if (so->use_async_io) {
+      auto before = file_index_;
+      // Pre-create and prepare only relevant file iterators
+      for (auto& file_to_arg : *file_to_scan_opts_) {
+        size_t file_index = file_to_arg.first;
+
+        file_index_ = file_index;
+        // Create iterator for this file
+        auto iter = NewFileIterator();
+        if (iter != nullptr) {
+          // If we have async enabled, lets prepare all our iterators.
+          iter->Prepare(&file_to_arg.second);
+          // Store the prepared iterator
+          prepared_iters_[file_index] = iter;
+        }
+      }
+      file_index_ = before;
     }
   }
 
@@ -1276,7 +1316,7 @@ class LevelIterator final : public InternalIterator {
   }
 
 #ifndef NDEBUG
-  bool OverlapRange(const ScanOptions& opts);
+  bool OverlapRange(const ScanOptions& opts, size_t file_index);
 #endif
 
   TableCache* table_cache_;
@@ -1334,9 +1374,15 @@ class LevelIterator final : public InternalIterator {
   bool to_return_sentinel_ = false;
   const MultiScanArgs* scan_opts_ = nullptr;
 
+  Statistics* db_statistics_ = nullptr;
+  SystemClock* clock_ = nullptr;
+
   // Our stored scan_opts for each prefix
   std::unique_ptr<ScanOptionsMap> file_to_scan_opts_ = nullptr;
 
+  // Map to store pre-created iterators by file index
+  std::unordered_map<size_t, InternalIterator*> prepared_iters_;
+
   // Sets flags for if we should return the sentinel key next.
   // The condition for returning sentinel is reaching the end of current
   // file_iter_: !Valid() && status.().ok().
@@ -1673,14 +1719,14 @@ void LevelIterator::SkipEmptyFileBackward() {
 }
 
 #ifndef NDEBUG
-bool LevelIterator::OverlapRange(const ScanOptions& opts) {
+bool LevelIterator::OverlapRange(const ScanOptions& opts, size_t file_index) {
   return (user_comparator_.CompareWithoutTimestamp(
               opts.range.start.value(), /*a_has_ts=*/false,
-              ExtractUserKey(flevel_->files[file_index_].largest_key),
+              ExtractUserKey(flevel_->files[file_index].largest_key),
               /*b_has_ts=*/true) <= 0 &&
           user_comparator_.CompareWithoutTimestamp(
               opts.range.limit.value(), /*a_has_ts=*/false,
-              ExtractUserKey(flevel_->files[file_index_].smallest_key),
+              ExtractUserKey(flevel_->files[file_index].smallest_key),
               /*b_has_ts=*/true) > 0);
 }
 #endif
@@ -1691,15 +1737,6 @@ void LevelIterator::SetFileIterator(InternalIterator* iter) {
   }
 
   InternalIterator* old_iter = file_iter_.Set(iter);
-  if (iter && scan_opts_) {
-    if (FileHasMultiScanArg(file_index_)) {
-      const MultiScanArgs& new_opts = GetMultiScanArgForFile(file_index_);
-      assert(OverlapRange(*new_opts.GetScanRanges().begin()) &&
-             OverlapRange(*new_opts.GetScanRanges().rbegin()));
-      file_iter_.Prepare(&new_opts);
-    }
-  }
-
   // Update the read pattern for PrefetchBuffer.
   if (is_next_read_sequential_) {
     file_iter_.UpdateReadaheadState(old_iter);
@@ -1729,7 +1766,24 @@ void LevelIterator::InitFileIterator(size_t new_file_index) {
       // no need to change anything
     } else {
       file_index_ = new_file_index;
+      if (!prepared_iters_.empty()) {
+        auto prepared_it = prepared_iters_.find(file_index_);
+        if (prepared_it != prepared_iters_.end()) {
+          InternalIterator* iter = prepared_it->second;
+          prepared_iters_.erase(prepared_it);
+          SetFileIterator(iter);
+          return;
+        }
+      }
+
       InternalIterator* iter = NewFileIterator();
+      if (FileHasMultiScanArg(file_index_)) {
+        auto& args = GetMultiScanArgForFile(file_index_);
+        assert(OverlapRange(*args.GetScanRanges().begin(), file_index_) &&
+               OverlapRange(*args.GetScanRanges().rbegin(), file_index_));
+        iter->Prepare(&args);
+      }
+
       SetFileIterator(iter);
     }
   }
@@ -2192,7 +2246,7 @@ InternalIterator* Version::TEST_GetLevelIterator(
       cfd_->internal_stats()->GetFileReadHist(level),
       TableReaderCaller::kUserIterator, IsFilterSkipped(level), level,
       nullptr /* range_del_agg */, nullptr /* compaction_boundaries */,
-      allow_unprepared_value, &tombstone_iter_ptr);
+      allow_unprepared_value, &tombstone_iter_ptr, db_statistics_, clock_);
   if (read_options.ignore_range_deletions) {
     merge_iter_builder->AddIterator(level_iter);
   } else {
@@ -2332,7 +2386,7 @@ void Version::AddIteratorsForLevel(const ReadOptions& read_options,
         TableReaderCaller::kUserIterator, IsFilterSkipped(level), level,
         /*range_del_agg=*/nullptr,
         /*compaction_boundaries=*/nullptr, allow_unprepared_value,
-        &tombstone_iter_ptr);
+        &tombstone_iter_ptr, db_statistics_, clock_);
     if (read_options.ignore_range_deletions) {
       merge_iter_builder->AddIterator(level_iter);
     } else {
@@ -2389,7 +2443,7 @@ Status Version::OverlapWithLevelIterator(const ReadOptions& read_options,
         mutable_cf_options_, should_sample_file_read(),
         cfd_->internal_stats()->GetFileReadHist(level),
         TableReaderCaller::kUserIterator, IsFilterSkipped(level), level,
-        &range_del_agg, nullptr, false));
+        &range_del_agg, nullptr, false, nullptr, db_statistics_, clock_));
     status = OverlapWithIterator(ucmp, smallest_user_key, largest_user_key,
                                  iter.get(), overlap);
   }
@@ -7491,7 +7545,8 @@ InternalIterator* VersionSet::MakeInputIterator(
             /*no per level latency histogram=*/nullptr,
             TableReaderCaller::kCompaction, /*skip_filters=*/false,
             /*level=*/static_cast<int>(c->level(which)), range_del_agg,
-            c->boundaries(which), false, &tombstone_iter_ptr);
+            c->boundaries(which), false, &tombstone_iter_ptr,
+            db_options_->statistics.get(), clock_);
         range_tombstones.emplace_back(nullptr, tombstone_iter_ptr);
       }
     }
diff --git a/include/rocksdb/statistics.h b/include/rocksdb/statistics.h
index 1bd4f382b7a4..6438ff70556b 100644
--- a/include/rocksdb/statistics.h
+++ b/include/rocksdb/statistics.h
@@ -692,6 +692,9 @@ enum Histograms : uint32_t {
   // Number of operations per transaction.
   NUM_OP_PER_TRANSACTION,
 
+  // MultiScan Prefill iterator Prepare cost
+  MULTISCAN_PREPARE_ITERATORS,
+
   HISTOGRAM_ENUM_MAX
 };
 
diff --git a/monitoring/statistics.cc b/monitoring/statistics.cc
index 652080e59d85..28d4278b2197 100644
--- a/monitoring/statistics.cc
+++ b/monitoring/statistics.cc
@@ -352,6 +352,8 @@ const std::vector<std::pair<Histograms, std::string>> HistogramsNameMap = {
     {TABLE_OPEN_PREFETCH_TAIL_READ_BYTES,
      "rocksdb.table.open.prefetch.tail.read.bytes"},
     {NUM_OP_PER_TRANSACTION, "rocksdb.num.op.per.transaction"},
+    {MULTISCAN_PREPARE_ITERATORS,
+     "rocksdb.multiscan.op.prepare.iterators.micros"},
 };
 
 std::shared_ptr<Statistics> CreateDBStatistics() {
diff --git a/table/block_based/block_based_table_iterator.cc b/table/block_based/block_based_table_iterator.cc
index 65d1750cddf1..c507497244f2 100644
--- a/table/block_based/block_based_table_iterator.cc
+++ b/table/block_based/block_based_table_iterator.cc
@@ -1709,7 +1709,6 @@ Status BlockBasedTableIterator::ExecuteIO(
         assert(false);
         return s;
       }
-      assert(async_read.io_handle);
       for (auto& req : *read_reqs) {
         if (!req.status.ok()) {
           assert(false);
diff --git a/unreleased_history/performance_improvements/Prefetch_all_iterators_in_level_iterators_prepare.md b/unreleased_history/performance_improvements/Prefetch_all_iterators_in_level_iterators_prepare.md
new file mode 100644
index 000000000000..ed141b0ceae8
--- /dev/null
+++ b/unreleased_history/performance_improvements/Prefetch_all_iterators_in_level_iterators_prepare.md
@@ -0,0 +1 @@
+Added optimization that allowed for the asynchronous prefetching of all data outlined in a multiscan iterator. This optimization was applied to the level iterator, which prefetches all data through each of the block-based iterators.

From 57a6fb9e3a8d507948bc71fb9b436a68d3e62e9d Mon Sep 17 00:00:00 2001
From: Hui Xiao <huixiao@fb.com>
Date: Wed, 19 Nov 2025 05:10:03 -0800
Subject: [PATCH 378/500] Refactor and support option migration for db with
 multiple CFs (#14059)

Summary:
**Context/Summary:**
This PR adds multi-cf support to option migration. The original implementation sets options, opens db, compacts files and reopens the db in almost all the three branches below. Such design makes expanding to multi-cf difficult as it needs to change all these places within each of the branch causing code redundancy.
```
Status OptionChangeMigration(std::string dbname, const Options& old_opts,
                             const Options& new_opts) {
  if (old_opts.compaction_style == CompactionStyle::kCompactionStyleFIFO) {
    // LSM generated by FIFO compaction can be opened by any compaction.
    return Status::OK();
  } else if (new_opts.compaction_style ==
             CompactionStyle::kCompactionStyleUniversal) {
    return MigrateToUniversal(dbname, old_opts, new_opts);
  } else if (new_opts.compaction_style ==
             CompactionStyle::kCompactionStyleLevel) {
    return MigrateToLevelBase(dbname, old_opts, new_opts);
  } else if (new_opts.compaction_style ==
             CompactionStyle::kCompactionStyleFIFO) {
    return CompactToLevel(old_opts, dbname, 0, 0 /* l0_file_size */, true);
  } else {
    return Status::NotSupported(
        "Do not how to migrate to this compaction style");
  }
}
```

Therefore this PR
-  Refactor the option migration implementation by moving the common parts into the high-level `OptionChangeMigration()` through `PrepareNoCompactionCFDescriptors()` and `OpenDBWithCFs()` so `MigrateAllCFs()` can focus on compaction only.
-  Treat the original OptionChangeMigration() API as a special case of the multi-cf version option migration
- Add multiple-cf support

A few notes:
- CompactToLevel() originally modifies the compaction-related options conditionally before doing compaction. This is moved into earlier steps through `ApplySpecialSingleLevelSettings()` in `PrepareNoCompactionCFDescriptors()`
- MigrateToUniversal() originally opens the db twice with essentially the same option. This PR reduces that to one open
- Option migration does not always use the old option to compact the db and reopen the db after migration, see `  return CompactToLevel(new_opts, dbname, new_opts.num_levels - 1,/*l0_file_size=*/0, false);`. `PrepareNoCompactionCFDescriptors()` is where we handle those decisions.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14059

Test Plan:
- Existing UTs
- New UTs

Reviewed By: cbi42

Differential Revision: D84852970

Pulled By: hx235

fbshipit-source-id: 936b456cf9fb4c3ccb687e5d1387f2d67a1448be
---
 .../utilities/option_change_migration.h       |  38 +-
 .../new_features/multi-cf-option-migration.md |   1 +
 .../option_change_migration.cc                | 414 +++++++++++++-----
 .../option_change_migration_test.cc           | 379 ++++++++++++++++
 4 files changed, 716 insertions(+), 116 deletions(-)
 create mode 100644 unreleased_history/new_features/multi-cf-option-migration.md

diff --git a/include/rocksdb/utilities/option_change_migration.h b/include/rocksdb/utilities/option_change_migration.h
index 0ad00cc860e3..ff941e0cf9b8 100644
--- a/include/rocksdb/utilities/option_change_migration.h
+++ b/include/rocksdb/utilities/option_change_migration.h
@@ -6,19 +6,47 @@
 #pragma once
 
 #include <string>
+#include <vector>
 
+#include "rocksdb/db.h"
 #include "rocksdb/options.h"
 #include "rocksdb/status.h"
 
 namespace ROCKSDB_NAMESPACE {
-// Try to migrate DB created with old_opts to be use new_opts.
-// Multiple column families is not supported.
-// It is best-effort. No guarantee to succeed.
-// A full compaction may be executed.
+// Prepares a database to be compatible with new_opts after using old_opts.
+// Restructures the LSM tree but does NOT apply new_opts - you must call
+// DB::Open(new_opts, dbname) afterward to actually use the new configuration.
+// It is best-effort with no guarantee to succeed. A full compaction may be
+// executed.
+//
+// Limitations: single column family only
+//
 // WARNING: using this to migrate from non-FIFO to FIFO compaction
 // with `Options::compaction_options_fifo.max_table_files_size` > 0 can cause
 // the whole DB to be dropped right after migration if the migrated data is
 // larger than `max_table_files_size`
-Status OptionChangeMigration(std::string dbname, const Options& old_opts,
+Status OptionChangeMigration(std::string& dbname, const Options& old_opts,
                              const Options& new_opts);
+
+// Multi-CF version: Prepares a database with multiple column families to be
+// compatible with new options after using old options.
+//
+// REQUIREMENTS:
+// - old_cf_descs and new_cf_descs MUST have the same number of CFs
+// - old_cf_descs and new_cf_descs MUST have the same CF names IN THE SAME ORDER
+// - Adding or dropping CFs is NOT supported - use CreateColumnFamily() or
+//   DropColumnFamily() separately before/after migration
+//
+// The function will return InvalidArgument status if these requirements are
+// violated.
+//
+// WARNING: using this to migrate from non-FIFO to FIFO compaction
+// with `max_table_files_size` > 0 can cause the whole DB to be dropped right
+// after migration if the migrated data is larger than `max_table_files_size`
+Status OptionChangeMigration(
+    const std::string& dbname, const DBOptions& old_db_opts,
+    const std::vector<ColumnFamilyDescriptor>& old_cf_descs,
+    const DBOptions& new_db_opts,
+    const std::vector<ColumnFamilyDescriptor>& new_cf_descs);
+
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/unreleased_history/new_features/multi-cf-option-migration.md b/unreleased_history/new_features/multi-cf-option-migration.md
new file mode 100644
index 000000000000..817286fb0ba0
--- /dev/null
+++ b/unreleased_history/new_features/multi-cf-option-migration.md
@@ -0,0 +1 @@
+Added a new API to support option migration for multiple column families
diff --git a/utilities/option_change_migration/option_change_migration.cc b/utilities/option_change_migration/option_change_migration.cc
index a08c5b59292c..66703034bb6d 100644
--- a/utilities/option_change_migration/option_change_migration.cc
+++ b/utilities/option_change_migration/option_change_migration.cc
@@ -23,147 +23,339 @@ Options GetNoCompactionOptions(const Options& opts) {
   return ret_opts;
 }
 
-Status OpenDb(const Options& options, const std::string& dbname,
-              std::unique_ptr<DB>* db) {
-  db->reset();
-  DB* tmpdb;
-  Status s = DB::Open(options, dbname, &tmpdb);
-  if (s.ok()) {
-    db->reset(tmpdb);
-  }
-  return s;
-}
+// Compact a specific CF to a specific level
+//  cf_handle should not be null
+Status CompactToLevel(DB* db, ColumnFamilyHandle* cf_handle, int dest_level) {
+  assert(cf_handle != nullptr);
 
-// l0_file_size specifies size of file on L0. Files will be range partitioned
-// after a full compaction so they are likely qualified to put on L0. If
-// left as 0, the files are compacted in a single file and put to L0. Otherwise,
-// will try to compact the files as size l0_file_size.
-Status CompactToLevel(const Options& options, const std::string& dbname,
-                      int dest_level, uint64_t l0_file_size, bool need_reopen) {
-  std::unique_ptr<DB> db;
-  Options no_compact_opts = GetNoCompactionOptions(options);
-  if (dest_level == 0) {
-    if (l0_file_size == 0) {
-      // Single file.
-      l0_file_size = 999999999999999;
-    }
-    // L0 has strict sequenceID requirements to files to it. It's safer
-    // to only put one compacted file to there.
-    // This is only used for converting to universal compaction with
-    // only one level. In this case, compacting to one file is also
-    // optimal.
-    no_compact_opts.target_file_size_base = l0_file_size;
-    no_compact_opts.max_compaction_bytes = l0_file_size;
-  }
-  Status s = OpenDb(no_compact_opts, dbname, &db);
-  if (!s.ok()) {
-    return s;
-  }
   CompactRangeOptions cro;
   cro.change_level = true;
   cro.target_level = dest_level;
+
   if (dest_level == 0) {
     // cannot use kForceOptimized because the compaction is expected to
-    // generate one output file
+    // generate one output file so to force the full compaction to skip trivial
+    // move to L0
     cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
   }
-  s = db->CompactRange(cro, nullptr, nullptr);
 
-  if (s.ok() && need_reopen) {
-    // Need to restart DB to rewrite the manifest file.
-    // In order to open a DB with specific num_levels, the manifest file should
-    // contain no record that mentiones any level beyond num_levels. Issuing a
-    // full compaction will move all the data to a level not exceeding
-    // num_levels, but the manifest may still contain previous record mentioning
-    // a higher level. Reopening the DB will force the manifest to be rewritten
-    // so that those records will be cleared.
-    db.reset();
-    s = OpenDb(no_compact_opts, dbname, &db);
-  }
-  return s;
+  return db->CompactRange(cro, cf_handle, nullptr, nullptr);
 }
 
-Status MigrateToUniversal(std::string dbname, const Options& old_opts,
-                          const Options& new_opts) {
-  if (old_opts.num_levels <= new_opts.num_levels ||
-      old_opts.compaction_style == CompactionStyle::kCompactionStyleFIFO) {
-    return Status::OK();
-  } else {
-    bool need_compact = false;
-    {
-      std::unique_ptr<DB> db;
-      Options opts = GetNoCompactionOptions(old_opts);
-      Status s = OpenDb(opts, dbname, &db);
-      if (!s.ok()) {
-        return s;
-      }
-      ColumnFamilyMetaData metadata;
-      db->GetColumnFamilyMetaData(&metadata);
-      if (!metadata.levels.empty() &&
-          metadata.levels.back().level >= new_opts.num_levels) {
-        need_compact = true;
-      }
-    }
-    if (need_compact) {
-      return CompactToLevel(old_opts, dbname, new_opts.num_levels - 1,
-                            /*l0_file_size=*/0, true);
-    }
+Status MigrateToUniversal(DB* db, ColumnFamilyHandle* cf_handle,
+                          int old_num_levels, int new_num_levels) {
+  assert(cf_handle != nullptr);
+
+  if (old_num_levels <= new_num_levels) {
     return Status::OK();
   }
+
+  // Check if compaction is needed
+  ColumnFamilyMetaData metadata;
+  db->GetColumnFamilyMetaData(cf_handle, &metadata);
+
+  if (!metadata.levels.empty() &&
+      metadata.levels.back().level >= new_num_levels) {
+    // Need to compact to fit new num_levels
+    return CompactToLevel(db, cf_handle, new_num_levels - 1);
+  }
+
+  return Status::OK();
 }
 
-Status MigrateToLevelBase(std::string dbname, const Options& old_opts,
-                          const Options& new_opts) {
-  if (!new_opts.level_compaction_dynamic_level_bytes) {
-    if (old_opts.num_levels == 1) {
+Status MigrateToLevelBase(DB* db, ColumnFamilyHandle* cf_handle,
+                          int old_num_levels, int new_num_levels,
+                          bool dynamic_level_bytes) {
+  assert(cf_handle != nullptr);
+
+  if (!dynamic_level_bytes) {
+    // Non-dynamic level mode
+    if (old_num_levels == 1) {
       return Status::OK();
     }
-    // Compact everything to level 1 to guarantee it can be safely opened.
-    Options opts = old_opts;
-    opts.target_file_size_base = new_opts.target_file_size_base;
-    // Although sometimes we can open the DB with the new option without error,
-    // We still want to compact the files to avoid the LSM tree to stuck
-    // in bad shape. For example, if the user changed the level size
-    // multiplier from 4 to 8, with the same data, we will have fewer
-    // levels. Unless we issue a full comaction, the LSM tree may stuck
-    // with more levels than needed and it won't recover automatically.
-    return CompactToLevel(opts, dbname, 1, /*l0_file_size=*/0, true);
+    // Compact to L1
+    return CompactToLevel(db, cf_handle, 1);
+
   } else {
-    // Compact everything to the last level to guarantee it can be safely
-    // opened.
-    if (old_opts.num_levels == 1) {
+    // Dynamic level mode
+    if (old_num_levels == 1) {
       return Status::OK();
-    } else if (new_opts.num_levels > old_opts.num_levels) {
-      // Dynamic level mode requires data to be put in the last level first.
-      return CompactToLevel(new_opts, dbname, new_opts.num_levels - 1,
-                            /*l0_file_size=*/0, false);
-    } else {
-      Options opts = old_opts;
-      opts.target_file_size_base = new_opts.target_file_size_base;
-      return CompactToLevel(opts, dbname, new_opts.num_levels - 1,
-                            /*l0_file_size=*/0, true);
     }
+    // Compact to last level
+    return CompactToLevel(db, cf_handle, new_num_levels - 1);
   }
 }
-}  // namespace
 
-Status OptionChangeMigration(std::string dbname, const Options& old_opts,
-                             const Options& new_opts) {
+Status MigrateToFIFO(DB* db, ColumnFamilyHandle* cf_handle) {
+  assert(cf_handle != nullptr);
+  return CompactToLevel(db, cf_handle, 0);
+}
+
+Status MigrateSingleColumnFamily(DB* db, ColumnFamilyHandle* cf_handle,
+                                 const Options& old_opts,
+                                 const Options& new_opts) {
+  assert(cf_handle != nullptr);
+
   if (old_opts.compaction_style == CompactionStyle::kCompactionStyleFIFO) {
-    // LSM generated by FIFO compaction can be opened by any compaction.
     return Status::OK();
-  } else if (new_opts.compaction_style ==
-             CompactionStyle::kCompactionStyleUniversal) {
-    return MigrateToUniversal(dbname, old_opts, new_opts);
+  }
+
+  if (new_opts.compaction_style == CompactionStyle::kCompactionStyleUniversal) {
+    return MigrateToUniversal(db, cf_handle, old_opts.num_levels,
+                              new_opts.num_levels);
   } else if (new_opts.compaction_style ==
              CompactionStyle::kCompactionStyleLevel) {
-    return MigrateToLevelBase(dbname, old_opts, new_opts);
+    return MigrateToLevelBase(db, cf_handle, old_opts.num_levels,
+                              new_opts.num_levels,
+                              new_opts.level_compaction_dynamic_level_bytes);
   } else if (new_opts.compaction_style ==
              CompactionStyle::kCompactionStyleFIFO) {
-    return CompactToLevel(old_opts, dbname, 0, 0 /* l0_file_size */, true);
+    return MigrateToFIFO(db, cf_handle);
+  }
+
+  return Status::NotSupported(
+      "Do not know how to migrate to this compaction style");
+}
+
+Status ValidateCFDescriptors(
+    const std::vector<ColumnFamilyDescriptor>& old_cf_descs,
+    const std::vector<ColumnFamilyDescriptor>& new_cf_descs) {
+  if (old_cf_descs.size() != new_cf_descs.size()) {
+    return Status::InvalidArgument(
+        "old_cf_descs and new_cf_descs must have the same number of column "
+        "families. Got " +
+        std::to_string(old_cf_descs.size()) + " old CFs and " +
+        std::to_string(new_cf_descs.size()) +
+        " new CFs. Adding or dropping CFs is not supported.");
+  }
+
+  for (size_t i = 0; i < old_cf_descs.size(); ++i) {
+    if (old_cf_descs[i].name != new_cf_descs[i].name) {
+      return Status::InvalidArgument(
+          "Column family mismatch at index " + std::to_string(i) + ": " +
+          "old has '" + old_cf_descs[i].name + "', " + "new has '" +
+          new_cf_descs[i].name + "'. CF names and order must match exactly.");
+    }
+  }
+
+  return Status::OK();
+}
+
+struct BaseOptionsResult {
+  ColumnFamilyOptions base_opts;
+  bool need_reopen = true;
+};
+
+BaseOptionsResult DetermineBaseOptions(const ColumnFamilyOptions& old_opts,
+                                       const ColumnFamilyOptions& new_opts) {
+  BaseOptionsResult result;
+
+  if (new_opts.compaction_style == CompactionStyle::kCompactionStyleLevel) {
+    if (!new_opts.level_compaction_dynamic_level_bytes) {
+      result.base_opts = old_opts;
+      result.base_opts.target_file_size_base = new_opts.target_file_size_base;
+    } else {
+      if (new_opts.num_levels > old_opts.num_levels) {
+        result.base_opts = new_opts;
+        result.need_reopen = false;
+      } else {
+        result.base_opts = old_opts;
+        result.base_opts.target_file_size_base = new_opts.target_file_size_base;
+      }
+    }
+  } else {
+    result.base_opts = old_opts;
+  }
+
+  return result;
+}
+
+void ApplySpecialSingleLevelSettings(const ColumnFamilyOptions& new_opts,
+                                     ColumnFamilyOptions* base_opts) {
+  if (((new_opts.compaction_style ==
+            CompactionStyle::kCompactionStyleUniversal ||
+        new_opts.compaction_style == CompactionStyle::kCompactionStyleLevel) &&
+       new_opts.num_levels == 1) ||
+      new_opts.compaction_style == CompactionStyle::kCompactionStyleFIFO) {
+    base_opts->target_file_size_base = 999999999999999;
+    base_opts->max_compaction_bytes = 999999999999999;
+  }
+}
+
+std::vector<ColumnFamilyDescriptor> PrepareNoCompactionCFDescriptors(
+    const DBOptions& old_db_opts,
+    const std::vector<ColumnFamilyDescriptor>& old_cf_descs,
+    const std::vector<ColumnFamilyDescriptor>& new_cf_descs,
+    bool* any_need_reopen) {
+  assert(old_cf_descs.size() == new_cf_descs.size());
+
+  std::vector<ColumnFamilyDescriptor> no_compact_cf_descs;
+  *any_need_reopen = false;
+
+  for (size_t i = 0; i < old_cf_descs.size(); ++i) {
+    const std::string& cf_name = old_cf_descs[i].name;
+    const ColumnFamilyOptions& old_opts = old_cf_descs[i].options;
+    const ColumnFamilyOptions& new_opts = new_cf_descs[i].options;
+
+    BaseOptionsResult result = DetermineBaseOptions(old_opts, new_opts);
+    ColumnFamilyOptions base_opts = result.base_opts;
+
+    if (result.need_reopen) {
+      *any_need_reopen = true;
+    }
+
+    ApplySpecialSingleLevelSettings(new_opts, &base_opts);
+
+    Options tmp_opts(old_db_opts, base_opts);
+    Options no_compact_opts = GetNoCompactionOptions(tmp_opts);
+
+    no_compact_cf_descs.emplace_back(cf_name,
+                                     ColumnFamilyOptions(no_compact_opts));
+  }
+
+  return no_compact_cf_descs;
+}
+
+Status OpenDBWithCFs(const DBOptions& db_opts, const std::string& dbname,
+                     const std::vector<ColumnFamilyDescriptor>& cf_descs,
+                     std::unique_ptr<DB>* db,
+                     std::vector<ColumnFamilyHandle*>* handles) {
+  handles->clear();
+  DB* tmpdb;
+  Status s = DB::Open(db_opts, dbname, cf_descs, handles, &tmpdb);
+
+  if (s.ok()) {
+    db->reset(tmpdb);
   } else {
-    return Status::NotSupported(
-        "Do not how to migrate to this compaction style");
+    for (auto* handle : *handles) {
+      delete handle;
+    }
+    handles->clear();
   }
+
+  return s;
+}
+
+Status CleanupCFHandles(DB* db, std::vector<ColumnFamilyHandle*>* handles) {
+  Status s;
+  for (auto* handle : *handles) {
+    if (handle != db->DefaultColumnFamily()) {
+      Status destroy_status = db->DestroyColumnFamilyHandle(handle);
+      if (!destroy_status.ok() && s.ok()) {
+        s = destroy_status;
+      }
+    }
+  }
+  handles->clear();
+  return s;
+}
+
+Status MigrateAllCFs(DB* db, const std::vector<ColumnFamilyHandle*>& handles,
+                     const DBOptions& old_db_opts, const DBOptions& new_db_opts,
+                     const std::vector<ColumnFamilyDescriptor>& old_cf_descs,
+                     const std::vector<ColumnFamilyDescriptor>& new_cf_descs) {
+  assert(handles.size() == old_cf_descs.size());
+  assert(old_cf_descs.size() == new_cf_descs.size());
+
+  for (size_t i = 0; i < handles.size(); ++i) {
+    const ColumnFamilyOptions& old_cf_opts = old_cf_descs[i].options;
+    const ColumnFamilyOptions& new_cf_opts = new_cf_descs[i].options;
+
+    Options old_opts(old_db_opts, old_cf_opts);
+    Options new_opts(new_db_opts, new_cf_opts);
+
+    Status s = MigrateSingleColumnFamily(db, handles[i], old_opts, new_opts);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  return Status::OK();
+}
+}  // namespace
+
+Status OptionChangeMigration(
+    const std::string& dbname, const DBOptions& old_db_opts,
+    const std::vector<ColumnFamilyDescriptor>& old_cf_descs,
+    const DBOptions& new_db_opts,
+    const std::vector<ColumnFamilyDescriptor>& new_cf_descs) {
+  // Step 1: Validate that old and new have same CFs in same order
+  Status s = ValidateCFDescriptors(old_cf_descs, new_cf_descs);
+  if (!s.ok()) {
+    return s;
+  }
+
+  // Step 2: Prepare no-compaction CF descriptors
+  bool any_need_reopen = false;
+  std::vector<ColumnFamilyDescriptor> no_compact_cf_descs =
+      PrepareNoCompactionCFDescriptors(old_db_opts, old_cf_descs, new_cf_descs,
+                                       &any_need_reopen);
+
+  // Step 3: Open DB with all CFs
+  std::unique_ptr<DB> db;
+  std::vector<ColumnFamilyHandle*> handles;
+  s = OpenDBWithCFs(old_db_opts, dbname, no_compact_cf_descs, &db, &handles);
+  if (!s.ok()) {
+    return s;
+  }
+  assert(db != nullptr);
+
+  // Step 4: Migrate all CFs
+  s = MigrateAllCFs(db.get(), handles, old_db_opts, new_db_opts, old_cf_descs,
+                    new_cf_descs);
+
+  // Step 5: Cleanup CF handles
+  Status cleanup_status = CleanupCFHandles(db.get(), &handles);
+  if (s.ok() && !cleanup_status.ok()) {
+    s = cleanup_status;
+  }
+
+  // Step 6: Close and reopen DB if needed to rewrite manifest
+  if (s.ok() && any_need_reopen) {
+    Status close_status = db->Close();
+    if (!close_status.ok()) {
+      return close_status;
+    }
+    db.reset();
+
+    s = OpenDBWithCFs(old_db_opts, dbname, no_compact_cf_descs, &db, &handles);
+    if (!s.ok()) {
+      return s;
+    }
+
+    // Cleanup CF handles before final close
+    cleanup_status = CleanupCFHandles(db.get(), &handles);
+    if (!cleanup_status.ok() && s.ok()) {
+      s = cleanup_status;
+    }
+  }
+
+  // Final step: Close DB (either after reopening or without reopening)
+  Status close_status = db->Close();
+  if (!close_status.ok() && s.ok()) {
+    s = close_status;
+  }
+
+  db.reset();
+
+  return s;
+}
+
+Status OptionChangeMigration(std::string& dbname, const Options& old_opts,
+                             const Options& new_opts) {
+  DBOptions old_db_opts(old_opts);
+  DBOptions new_db_opts(new_opts);
+
+  ColumnFamilyOptions old_cf_opts(old_opts);
+  ColumnFamilyOptions new_cf_opts(new_opts);
+
+  std::vector<ColumnFamilyDescriptor> old_cf_descs = {
+      {kDefaultColumnFamilyName, old_cf_opts}};
+
+  std::vector<ColumnFamilyDescriptor> new_cf_descs = {
+      {kDefaultColumnFamilyName, new_cf_opts}};
+
+  return OptionChangeMigration(dbname, old_db_opts, old_cf_descs, new_db_opts,
+                               new_cf_descs);
 }
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/utilities/option_change_migration/option_change_migration_test.cc b/utilities/option_change_migration/option_change_migration_test.cc
index 9984f0dd456e..4a78e9fe1111 100644
--- a/utilities/option_change_migration/option_change_migration_test.cc
+++ b/utilities/option_change_migration/option_change_migration_test.cc
@@ -556,6 +556,385 @@ TEST_F(DBOptionChangeMigrationTest, CompactedSrcToUniversal) {
   }
 }
 
+class DBOptionChangeMigrationMultiCFTest : public DBTestBase {
+ public:
+  DBOptionChangeMigrationMultiCFTest()
+      : DBTestBase("db_option_change_migration_multi_cf_test",
+                   /*env_do_fsync=*/true) {}
+};
+
+TEST_F(DBOptionChangeMigrationMultiCFTest, BasicMultiCF) {
+  Options options = CurrentOptions();
+  options.compaction_style = CompactionStyle::kCompactionStyleLevel;
+  options.level_compaction_dynamic_level_bytes = false;
+  options.num_levels = 4;
+  options.write_buffer_size = 64 * 1024;
+  options.target_file_size_base = 128 * 1024;
+
+  // Create DB with default CF
+  Reopen(options);
+
+  // Create additional CF
+  ColumnFamilyHandle* cf_handle;
+  ASSERT_OK(db_->CreateColumnFamily(options, "cf1", &cf_handle));
+
+  // Write data to both CFs
+  Random rnd(301);
+  for (int i = 0; i < 100; i++) {
+    ASSERT_OK(Put("key" + std::to_string(i), rnd.RandomString(900)));
+    ASSERT_OK(db_->Put(WriteOptions(), cf_handle, "key" + std::to_string(i),
+                       rnd.RandomString(900)));
+  }
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->Flush(FlushOptions(), cf_handle));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // Collect keys from both CFs
+  std::set<std::string> keys_default;
+  std::set<std::string> keys_cf1;
+  {
+    std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
+    for (it->SeekToFirst(); it->Valid(); it->Next()) {
+      keys_default.insert(it->key().ToString());
+    }
+    ASSERT_OK(it->status());
+  }
+  {
+    std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions(), cf_handle));
+    for (it->SeekToFirst(); it->Valid(); it->Next()) {
+      keys_cf1.insert(it->key().ToString());
+    }
+    ASSERT_OK(it->status());
+  }
+
+  delete cf_handle;
+  Close();
+
+  // Prepare old and new options
+  DBOptions old_db_opts(options);
+  ColumnFamilyOptions old_cf_opts(options);
+
+  std::vector<ColumnFamilyDescriptor> old_cf_descs = {
+      {kDefaultColumnFamilyName, old_cf_opts}, {"cf1", old_cf_opts}};
+
+  // New options: migrate to Universal compaction
+  Options new_options = options;
+  new_options.compaction_style = CompactionStyle::kCompactionStyleUniversal;
+  new_options.num_levels = 5;
+  new_options.target_file_size_base = 256 * 1024;
+
+  DBOptions new_db_opts(new_options);
+  ColumnFamilyOptions new_cf_opts(new_options);
+
+  std::vector<ColumnFamilyDescriptor> new_cf_descs = {
+      {kDefaultColumnFamilyName, new_cf_opts}, {"cf1", new_cf_opts}};
+
+  // Perform multi-CF migration
+  ASSERT_OK(OptionChangeMigration(dbname_, old_db_opts, old_cf_descs,
+                                  new_db_opts, new_cf_descs));
+
+  // Reopen with new options
+  std::vector<ColumnFamilyHandle*> handles;
+  ASSERT_OK(DB::Open(new_db_opts, dbname_, new_cf_descs, &handles, &db_));
+  ASSERT_EQ(handles.size(), 2);
+
+  // Verify data in default CF
+  {
+    std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
+    it->SeekToFirst();
+    for (const std::string& key : keys_default) {
+      ASSERT_TRUE(it->Valid());
+      ASSERT_EQ(key, it->key().ToString());
+      it->Next();
+    }
+    ASSERT_TRUE(!it->Valid());
+    ASSERT_OK(it->status());
+  }
+
+  // Verify data in cf1
+  {
+    std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions(), handles[1]));
+    it->SeekToFirst();
+    for (const std::string& key : keys_cf1) {
+      ASSERT_TRUE(it->Valid());
+      ASSERT_EQ(key, it->key().ToString());
+      it->Next();
+    }
+    ASSERT_TRUE(!it->Valid());
+    ASSERT_OK(it->status());
+  }
+
+  // Cleanup
+  for (auto* handle : handles) {
+    if (handle != db_->DefaultColumnFamily()) {
+      ASSERT_OK(db_->DestroyColumnFamilyHandle(handle));
+    }
+  }
+}
+
+TEST_F(DBOptionChangeMigrationMultiCFTest, DifferentStylesPerCF) {
+  // Create DB with 2 CFs, both using Level compaction
+  Options options1 = CurrentOptions();
+  options1.compaction_style = CompactionStyle::kCompactionStyleLevel;
+  options1.num_levels = 4;
+  options1.write_buffer_size = 64 * 1024;
+
+  Reopen(options1);
+
+  ColumnFamilyHandle* cf_handle;
+  ASSERT_OK(db_->CreateColumnFamily(options1, "cf1", &cf_handle));
+
+  // Write data
+  Random rnd(301);
+  for (int i = 0; i < 50; i++) {
+    ASSERT_OK(Put("key" + std::to_string(i), rnd.RandomString(900)));
+    ASSERT_OK(db_->Put(WriteOptions(), cf_handle, "key" + std::to_string(i),
+                       rnd.RandomString(900)));
+  }
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->Flush(FlushOptions(), cf_handle));
+
+  // Collect keys from both CFs
+  std::set<std::string> keys_default;
+  std::set<std::string> keys_cf1;
+  {
+    std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
+    for (it->SeekToFirst(); it->Valid(); it->Next()) {
+      keys_default.insert(it->key().ToString());
+    }
+    ASSERT_OK(it->status());
+  }
+  {
+    std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions(), cf_handle));
+    for (it->SeekToFirst(); it->Valid(); it->Next()) {
+      keys_cf1.insert(it->key().ToString());
+    }
+    ASSERT_OK(it->status());
+  }
+
+  delete cf_handle;
+  Close();
+
+  // Old descriptors
+  DBOptions old_db_opts(options1);
+  ColumnFamilyOptions old_cf_opts(options1);
+
+  std::vector<ColumnFamilyDescriptor> old_cf_descs = {
+      {kDefaultColumnFamilyName, old_cf_opts}, {"cf1", old_cf_opts}};
+
+  // New descriptors: default CF to Universal, cf1 to Level with dynamic
+  Options new_opts_default = options1;
+  new_opts_default.compaction_style =
+      CompactionStyle::kCompactionStyleUniversal;
+  new_opts_default.num_levels = 5;
+
+  Options new_opts_cf1 = options1;
+  new_opts_cf1.compaction_style = CompactionStyle::kCompactionStyleLevel;
+  new_opts_cf1.level_compaction_dynamic_level_bytes = true;
+  new_opts_cf1.num_levels = 5;
+
+  DBOptions new_db_opts(new_opts_default);
+
+  std::vector<ColumnFamilyDescriptor> new_cf_descs = {
+      {kDefaultColumnFamilyName, ColumnFamilyOptions(new_opts_default)},
+      {"cf1", ColumnFamilyOptions(new_opts_cf1)}};
+
+  // Perform migration
+  ASSERT_OK(OptionChangeMigration(dbname_, old_db_opts, old_cf_descs,
+                                  new_db_opts, new_cf_descs));
+
+  // Reopen and verify
+  std::vector<ColumnFamilyHandle*> handles;
+  ASSERT_OK(DB::Open(new_db_opts, dbname_, new_cf_descs, &handles, &db_));
+  ASSERT_EQ(handles.size(), 2);
+
+  // Verify data in default CF
+  {
+    std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
+    it->SeekToFirst();
+    for (const std::string& key : keys_default) {
+      ASSERT_TRUE(it->Valid());
+      ASSERT_EQ(key, it->key().ToString());
+      it->Next();
+    }
+    ASSERT_TRUE(!it->Valid());
+    ASSERT_OK(it->status());
+  }
+
+  // Verify data in cf1
+  {
+    std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions(), handles[1]));
+    it->SeekToFirst();
+    for (const std::string& key : keys_cf1) {
+      ASSERT_TRUE(it->Valid());
+      ASSERT_EQ(key, it->key().ToString());
+      it->Next();
+    }
+    ASSERT_TRUE(!it->Valid());
+    ASSERT_OK(it->status());
+  }
+
+  // Cleanup
+  for (auto* handle : handles) {
+    if (handle != db_->DefaultColumnFamily()) {
+      ASSERT_OK(db_->DestroyColumnFamilyHandle(handle));
+    }
+  }
+}
+
+TEST_F(DBOptionChangeMigrationMultiCFTest, ValidationMismatched) {
+  Options options = CurrentOptions();
+  DBOptions db_opts(options);
+  ColumnFamilyOptions cf_opts(options);
+
+  // Test 1: Mismatched CF count (missing cf1)
+  {
+    std::vector<ColumnFamilyDescriptor> old_cf_descs = {
+        {kDefaultColumnFamilyName, cf_opts}, {"cf1", cf_opts}};
+
+    std::vector<ColumnFamilyDescriptor> new_cf_descs = {
+        {kDefaultColumnFamilyName, cf_opts}};  // Missing cf1
+
+    Status s = OptionChangeMigration(dbname_, db_opts, old_cf_descs, db_opts,
+                                     new_cf_descs);
+    ASSERT_TRUE(s.IsInvalidArgument());
+    ASSERT_TRUE(s.ToString().find("same number") != std::string::npos);
+  }
+
+  // Test 2: Mismatched CF names (cf2 instead of cf1)
+  {
+    std::vector<ColumnFamilyDescriptor> old_cf_descs = {
+        {kDefaultColumnFamilyName, cf_opts}, {"cf1", cf_opts}};
+
+    std::vector<ColumnFamilyDescriptor> new_cf_descs = {
+        {kDefaultColumnFamilyName, cf_opts},
+        {"cf2", cf_opts}};  // Different name
+
+    Status s = OptionChangeMigration(dbname_, db_opts, old_cf_descs, db_opts,
+                                     new_cf_descs);
+    ASSERT_TRUE(s.IsInvalidArgument());
+    ASSERT_TRUE(s.ToString().find("mismatch") != std::string::npos);
+  }
+
+  // Test 3: Mismatched CF order (swapped)
+  {
+    std::vector<ColumnFamilyDescriptor> old_cf_descs = {
+        {kDefaultColumnFamilyName, cf_opts}, {"cf1", cf_opts}};
+
+    std::vector<ColumnFamilyDescriptor> new_cf_descs = {
+        {"cf1", cf_opts},  // Swapped order
+        {kDefaultColumnFamilyName, cf_opts}};
+
+    Status s = OptionChangeMigration(dbname_, db_opts, old_cf_descs, db_opts,
+                                     new_cf_descs);
+    ASSERT_TRUE(s.IsInvalidArgument());
+    ASSERT_TRUE(s.ToString().find("mismatch") != std::string::npos);
+  }
+}
+
+TEST_F(DBOptionChangeMigrationMultiCFTest, FromFIFOMultiCF) {
+  Options options = CurrentOptions();
+  options.compaction_style = CompactionStyle::kCompactionStyleFIFO;
+  options.num_levels = 1;
+  options.max_open_files = -1;
+
+  Reopen(options);
+
+  ColumnFamilyHandle* cf_handle;
+  ASSERT_OK(db_->CreateColumnFamily(options, "cf1", &cf_handle));
+
+  // Write some data
+  Random rnd(301);
+  for (int i = 0; i < 50; i++) {
+    ASSERT_OK(Put("key" + std::to_string(i), rnd.RandomString(900)));
+    ASSERT_OK(db_->Put(WriteOptions(), cf_handle, "key" + std::to_string(i),
+                       rnd.RandomString(900)));
+  }
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->Flush(FlushOptions(), cf_handle));
+
+  // Collect keys from both CFs
+  std::set<std::string> keys_default;
+  std::set<std::string> keys_cf1;
+  {
+    std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
+    for (it->SeekToFirst(); it->Valid(); it->Next()) {
+      keys_default.insert(it->key().ToString());
+    }
+    ASSERT_OK(it->status());
+  }
+  {
+    std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions(), cf_handle));
+    for (it->SeekToFirst(); it->Valid(); it->Next()) {
+      keys_cf1.insert(it->key().ToString());
+    }
+    ASSERT_OK(it->status());
+  }
+
+  delete cf_handle;
+  Close();
+
+  // Migrate from FIFO to Level
+  DBOptions old_db_opts(options);
+  ColumnFamilyOptions old_cf_opts(options);
+
+  std::vector<ColumnFamilyDescriptor> old_cf_descs = {
+      {kDefaultColumnFamilyName, old_cf_opts}, {"cf1", old_cf_opts}};
+
+  Options new_options = options;
+  new_options.compaction_style = CompactionStyle::kCompactionStyleLevel;
+  new_options.num_levels = 4;
+  new_options.max_open_files = 1000;
+
+  DBOptions new_db_opts(new_options);
+  ColumnFamilyOptions new_cf_opts(new_options);
+
+  std::vector<ColumnFamilyDescriptor> new_cf_descs = {
+      {kDefaultColumnFamilyName, new_cf_opts}, {"cf1", new_cf_opts}};
+
+  // Migration should succeed (FIFO is special case)
+  ASSERT_OK(OptionChangeMigration(dbname_, old_db_opts, old_cf_descs,
+                                  new_db_opts, new_cf_descs));
+
+  // Reopen and verify
+  std::vector<ColumnFamilyHandle*> handles;
+  ASSERT_OK(DB::Open(new_db_opts, dbname_, new_cf_descs, &handles, &db_));
+  ASSERT_EQ(handles.size(), 2);
+
+  // Verify data in default CF
+  {
+    std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
+    it->SeekToFirst();
+    for (const std::string& key : keys_default) {
+      ASSERT_TRUE(it->Valid());
+      ASSERT_EQ(key, it->key().ToString());
+      it->Next();
+    }
+    ASSERT_TRUE(!it->Valid());
+    ASSERT_OK(it->status());
+  }
+
+  // Verify data in cf1
+  {
+    std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions(), handles[1]));
+    it->SeekToFirst();
+    for (const std::string& key : keys_cf1) {
+      ASSERT_TRUE(it->Valid());
+      ASSERT_EQ(key, it->key().ToString());
+      it->Next();
+    }
+    ASSERT_TRUE(!it->Valid());
+    ASSERT_OK(it->status());
+  }
+
+  // Cleanup
+  for (auto* handle : handles) {
+    if (handle != db_->DefaultColumnFamily()) {
+      ASSERT_OK(db_->DestroyColumnFamilyHandle(handle));
+    }
+  }
+}
+
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {

From 0762586067a25e040685b085fcb70c3e9e8dd8b0 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Wed, 19 Nov 2025 09:23:41 -0800
Subject: [PATCH 379/500] Relax an assertion related to parallel compression
 (#14130)

Summary:
Saw a mysterious failure of assertion
`assert(rep_->props.num_data_blocks == 0)` in
DBCompressionTest/CompressionFailuresTest.CompressionFailures/45. This seems to be caused by a parallel compression failure arriving after the emit thread has started Finish() but before the Flush() at the start of Finish(). We can fix this by relaxing the assertion to allow for the !ok() case. Testing revealed more ok() assertions that needed to be relaxed/moved.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14130

Test Plan: Added a sync point to inject a failure status in the right place and added to unit test to be sure the case is essentially covered. It would arguably be a more realistic test to force a particular thread interleaving but I believe simple is good here.

Reviewed By: hx235

Differential Revision: D87377709

Pulled By: pdillinger

fbshipit-source-id: 4bd465673b084afcc235688503d1c2f464eed32d
---
 .../block_based/block_based_table_builder.cc  | 24 +++++++++++---
 util/compression_test.cc                      | 31 ++++++++++++++++---
 2 files changed, 46 insertions(+), 9 deletions(-)

diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc
index 5b36e02dff66..3b1befdfbc46 100644
--- a/table/block_based/block_based_table_builder.cc
+++ b/table/block_based/block_based_table_builder.cc
@@ -1952,6 +1952,9 @@ void BlockBasedTableBuilder::WriteMaybeCompressedBlock(
     const Slice& block_contents, CompressionType comp_type, BlockHandle* handle,
     BlockType block_type, const Slice* uncompressed_block_data,
     bool* skip_delta_encoding) {
+  // Must have pre-checked status in single-threaded context
+  assert(status().ok());
+  assert(io_status().ok());
   rep_->SetIOStatus(WriteMaybeCompressedBlockImpl(
       block_contents, comp_type, handle, block_type, uncompressed_block_data,
       skip_delta_encoding));
@@ -2026,8 +2029,6 @@ IOStatus BlockBasedTableBuilder::WriteMaybeCompressedBlockImpl(
 
   handle->set_offset(offset);
   handle->set_size(block_contents.size());
-  assert(status().ok());
-  assert(io_status().ok());
   if (uncompressed_block_data == nullptr) {
     uncompressed_block_data = &block_contents;
     assert(comp_type == kNoCompression);
@@ -2152,8 +2153,9 @@ void BlockBasedTableBuilder::StopParallelCompression(bool abort) {
     pc_rep.SetAbort(pc_rep.emit_thread_state);
   } else if (pc_rep.emit_thread_state !=
              ParallelCompressionRep::ThreadState::kEnd) {
-    // In case we didn't do a final flush with no next key
-    assert(rep_->props.num_data_blocks == 0);
+    // In case we didn't do a final flush with no next key, which might have
+    // been skipped if !ok() was set after the start of Finish()
+    assert(rep_->props.num_data_blocks == 0 || !ok());
     pc_rep.SetNoMoreToEmit(pc_rep.emit_thread_state, pc_rep.emit_slot);
   }
 #ifdef BBTB_PC_WATCHDOG
@@ -2700,6 +2702,20 @@ void BlockBasedTableBuilder::MaybeEnterUnbuffered(
 Status BlockBasedTableBuilder::Finish() {
   Rep* r = rep_.get();
   assert(r->state != Rep::State::kClosed);
+
+#ifndef NDEBUG
+  {
+    // This sync point callback is a simple approximation of a failure detected
+    // in parallel compression after the start of calling Finish() but before
+    // Finish() calls Flush()
+    IOStatus s = rep_->GetIOStatus();
+    TEST_SYNC_POINT_CALLBACK("BlockBasedTableBuilder::Finish:ParallelIOStatus",
+                             &s);
+    if (!s.ok()) {
+      rep_->SetIOStatus(s);
+    }
+  }
+#endif  // !NDEBUG
   // To make sure properties block is able to keep the accurate size of index
   // block, we will finish writing all index entries first, in Flush().
   Flush(/*first_key_in_next_block=*/nullptr);
diff --git a/util/compression_test.cc b/util/compression_test.cc
index b51c872f1452..da95a91af210 100644
--- a/util/compression_test.cc
+++ b/util/compression_test.cc
@@ -769,7 +769,8 @@ class CompactionCompressionListener : public EventListener {
 enum CompressionFailureType {
   kTestCompressionFail,
   kTestDecompressionFail,
-  kTestDecompressionCorruption
+  kTestDecompressionCorruption,
+  kTestStartOfFinishFail,
 };
 
 class CompressionFailuresTest
@@ -793,7 +794,8 @@ INSTANTIATE_TEST_CASE_P(
     DBCompressionTest, CompressionFailuresTest,
     ::testing::Combine(::testing::Values(kTestCompressionFail,
                                          kTestDecompressionFail,
-                                         kTestDecompressionCorruption),
+                                         kTestDecompressionCorruption,
+                                         kTestStartOfFinishFail),
                        ::testing::ValuesIn(GetSupportedCompressions()),
                        ::testing::Values(0, 10), ::testing::Values(1, 4)));
 
@@ -845,6 +847,17 @@ TEST_P(CompressionFailuresTest, CompressionFailures) {
           std::unique_ptr<char[]> fake_data(new char[len]());
           *contents = BlockContents(std::move(fake_data), len);
         });
+  } else if (compression_failure_type_ == kTestStartOfFinishFail) {
+    if (compression_parallel_threads_ <= 1) {
+      // skip this configuration
+      return;
+    }
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+        "BlockBasedTableBuilder::Finish:ParallelIOStatus", [&](void* arg) {
+          *static_cast<IOStatus*>(arg) = IOStatus::Corruption("Seeded failure");
+        });
+  } else {
+    abort();
   }
 
   std::map<std::string, std::string> key_value_written;
@@ -888,6 +901,7 @@ TEST_P(CompressionFailuresTest, CompressionFailures) {
   }
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
 
+  auto st = s.getState();
   if (compression_failure_type_ == kTestCompressionFail) {
     // Should be kNoCompression, check content consistency
     std::unique_ptr<Iterator> db_iter(db_->NewIterator(ReadOptions()));
@@ -901,11 +915,18 @@ TEST_P(CompressionFailuresTest, CompressionFailures) {
     ASSERT_OK(db_iter->status());
     ASSERT_EQ(0, key_value_written.size());
   } else if (compression_failure_type_ == kTestDecompressionFail) {
-    ASSERT_EQ(std::string(s.getState()),
-              "Could not decompress: kTestDecompressionFail");
+    ASSERT_EQ(s.code(), Status::kCorruption);
+    ASSERT_NE(st, nullptr);
+    ASSERT_EQ(std::string(st), "Could not decompress: kTestDecompressionFail");
   } else if (compression_failure_type_ == kTestDecompressionCorruption) {
-    ASSERT_EQ(std::string(s.getState()),
+    ASSERT_EQ(s.code(), Status::kCorruption);
+    ASSERT_NE(st, nullptr);
+    ASSERT_EQ(std::string(st),
               "Decompressed block did not match pre-compression block");
+  } else if (compression_failure_type_ == kTestStartOfFinishFail) {
+    ASSERT_EQ(s.code(), Status::kCorruption);
+    ASSERT_NE(st, nullptr);
+    ASSERT_EQ(std::string(st), "Seeded failure");
   }
 }
 

From 678690274dd83fe3bcecfd97c0cdcaa1eba9cad0 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Wed, 19 Nov 2025 13:16:06 -0800
Subject: [PATCH 380/500] More options for sst_dump recompress (#14133)

Summary:
I have been using sst_dump --command=recompress for some ad hoc automation for compression engineering and these new options help with that.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14133

Test Plan: manual

Reviewed By: hx235

Differential Revision: D87453635

Pulled By: pdillinger

fbshipit-source-id: 2ae54e13a9221ec27c6637fea16623465a9163ae
---
 tools/sst_dump_tool.cc | 66 +++++++++++++++++++++++++++++++++++-------
 1 file changed, 55 insertions(+), 11 deletions(-)

diff --git a/tools/sst_dump_tool.cc b/tools/sst_dump_tool.cc
index f09712838f96..67142b0967ce 100644
--- a/tools/sst_dump_tool.cc
+++ b/tools/sst_dump_tool.cc
@@ -8,6 +8,7 @@
 
 #include <cinttypes>
 #include <iostream>
+#include <regex>
 
 #include "db_stress_tool/db_stress_compression_manager.h"
 #include "options/options_helper.h"
@@ -100,6 +101,10 @@ void print_help(bool to_stderr) {
       instead of the built-in compression manager, which may support a
       different set of compression types.
 
+    --enable_index_compression=<bool>
+      Used with --command=recompress to specify whether to compress index
+      blocks (in addition to data blocks).
+
     --parse_internal_key=<0xKEY>
       Convenience option to parse an internal key on the command line. Dumps the
       internal key in hex format {'key' @ SN: type}
@@ -188,8 +193,15 @@ int SSTDumpTool::Run(int argc, char const* const* argv, Options options) {
   std::string compression_level_to_str;
   size_t block_size = 16384;  // A popular choice for default
   size_t readahead_size = 2 * 1024 * 1024;
+  // These two options are intentionally secret options because they are
+  // niche ways to select files to get the "recompress" treatment. And even
+  // if std::regex is flawed, it should be good enough for these niche uses.
+  std::unique_ptr<std::regex> require_property_regex;
+  std::unique_ptr<std::regex> exclude_property_regex;
   std::vector<CompressionType> compression_types;
   std::shared_ptr<CompressionManager> compression_manager;
+  bool enable_index_compression =
+      BlockBasedTableOptions{}.enable_index_compression;
   uint64_t total_num_files = 0;
   uint64_t total_num_data_blocks = 0;
   uint64_t total_data_block_size = 0;
@@ -268,6 +280,12 @@ int SSTDumpTool::Run(int argc, char const* const* argv, Options options) {
         }
         compression_types.emplace_back(iter->second);
       }
+    } else if (strncmp(argv[i], "--require_property_regex=", 25) == 0) {
+      require_property_regex = std::make_unique<std::regex>(
+          argv[i] + 25, std::regex_constants::egrep);
+    } else if (strncmp(argv[i], "--exclude_property_regex=", 25) == 0) {
+      exclude_property_regex = std::make_unique<std::regex>(
+          argv[i] + 25, std::regex_constants::egrep);
     } else if (strncmp(argv[i], "--compression_manager=", 22) == 0) {
       std::string compression_manager_str = argv[i] + 22;
       ConfigOptions config_options;
@@ -287,6 +305,11 @@ int SSTDumpTool::Run(int argc, char const* const* argv, Options options) {
       options.compression_manager = compression_manager;
       printf("Using compression manager: %s\n",
              compression_manager->GetId().c_str());
+    } else if (strncmp(argv[i], "--enable_index_compression=", 27) == 0) {
+      if (strlen(argv[i]) > 27) {
+        enable_index_compression =
+            argv[i][27] == '1' || argv[i][27] == 't' || argv[i][27] == 'T';
+      }
     } else if (strncmp(argv[i], "--parse_internal_key=", 21) == 0) {
       std::string in_key(argv[i] + 21);
       try {
@@ -492,6 +515,7 @@ int SSTDumpTool::Run(int argc, char const* const* argv, Options options) {
         bbto = *options.table_factory->GetOptions<BlockBasedTableOptions>();
       }
       bbto.block_size = block_size;
+      bbto.enable_index_compression = enable_index_compression;
       // Maximize compression features available
       bbto.format_version = kLatestFormatVersion;
       options.table_factory = std::make_shared<BlockBasedTableFactory>(bbto);
@@ -514,17 +538,37 @@ int SSTDumpTool::Run(int argc, char const* const* argv, Options options) {
       fprintf(stderr, "%s: %s\n", filename.c_str(),
               dumper.getStatus().ToString().c_str());
       continue;
-    } else {
-      valid_sst_files.push_back(filename);
-      // Print out from and to key information once
-      // where there is at least one valid SST
-      if (valid_sst_files.size() == 1) {
-        // from_key and to_key are only used for "check", "scan", or ""
-        if (command == "check" || command == "scan" || command == "") {
-          fprintf(stdout, "from [%s] to [%s]\n",
-                  ROCKSDB_NAMESPACE::Slice(from_key).ToString(true).c_str(),
-                  ROCKSDB_NAMESPACE::Slice(to_key).ToString(true).c_str());
-        }
+    }
+    auto props_ptr = dumper.GetInitTableProperties();
+    if (props_ptr && (require_property_regex || exclude_property_regex)) {
+      // Call should match with show_properties below
+      auto props_str = props_ptr->ToString("\n  ", ": ");
+      if (require_property_regex &&
+          !std::regex_search(props_str, *require_property_regex)) {
+        fprintf(stderr,
+                "%s: skipping because properties string doesn't match required "
+                "regex\n",
+                filename.c_str());
+        continue;
+      }
+      if (exclude_property_regex &&
+          std::regex_search(props_str, *exclude_property_regex)) {
+        fprintf(
+            stderr,
+            "%s: skipping because properties string matches excluded regex\n",
+            filename.c_str());
+        continue;
+      }
+    }
+    valid_sst_files.push_back(filename);
+    // Print out from and to key information once
+    // where there is at least one valid SST
+    if (valid_sst_files.size() == 1) {
+      // from_key and to_key are only used for "check", "scan", or ""
+      if (command == "check" || command == "scan" || command == "") {
+        fprintf(stdout, "from [%s] to [%s]\n",
+                ROCKSDB_NAMESPACE::Slice(from_key).ToString(true).c_str(),
+                ROCKSDB_NAMESPACE::Slice(to_key).ToString(true).c_str());
       }
     }
 

From 8c8586aa23dff524bdd883c5635fd6c8326b8b11 Mon Sep 17 00:00:00 2001
From: Jay Huh <jewoongh@meta.com>
Date: Wed, 19 Nov 2025 14:04:58 -0800
Subject: [PATCH 381/500] Add oncall to BUCK file (#14134)

Summary:
As title

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14134

Test Plan:
The following command generated the BUCK file correctly
```
python3 buckifier/buckify_rocksdb.py
```

Reviewed By: anand1976

Differential Revision: D87469877

Pulled By: jaykorean

fbshipit-source-id: 9ec330084cfe96ad9b71aa13c8eb16593256a5ac
---
 BUCK                         |  6 ++++--
 buckifier/buckify_rocksdb.py |  3 +++
 buckifier/targets_builder.py |  5 +++++
 buckifier/targets_cfg.py     | 11 ++++++++---
 4 files changed, 20 insertions(+), 5 deletions(-)

diff --git a/BUCK b/BUCK
index e9e6de2f6b16..c7fd89eeb18d 100644
--- a/BUCK
+++ b/BUCK
@@ -1,12 +1,14 @@
 # This file @generated by:
 #$ python3 buckifier/buckify_rocksdb.py
 # --> DO NOT EDIT MANUALLY <--
-# This file is a Facebook-specific integration for buck builds, so can
-# only be validated by Facebook employees.
+# This file is a Meta-specific integration for buck builds, so can
+# only be validated by Meta employees.
 load("//rocks/buckifier:defs.bzl", "cpp_library_wrapper","rocks_cpp_library_wrapper","cpp_binary_wrapper","cpp_unittest_wrapper","fancy_bench_wrapper","add_c_test_wrapper")
 load("@fbcode_macros//build_defs:export_files.bzl", "export_file")
 
 
+oncall("rocksdb_point_of_contact")
+
 cpp_library_wrapper(name="rocksdb_lib", srcs=[
         "cache/cache.cc",
         "cache/cache_entry_roles.cc",
diff --git a/buckifier/buckify_rocksdb.py b/buckifier/buckify_rocksdb.py
index 113d58e11655..647353e44f3c 100755
--- a/buckifier/buckify_rocksdb.py
+++ b/buckifier/buckify_rocksdb.py
@@ -135,6 +135,9 @@ def generate_buck(repo_path, deps_map):
 
     BUCK = TARGETSBuilder("%s/BUCK" % repo_path, extra_argv)
 
+    # Add oncall("rocksdb_point_of_contact") at the top
+    BUCK.add_oncall("rocksdb_point_of_contact")
+
     # rocksdb_lib
     BUCK.add_library(
         "rocksdb_lib",
diff --git a/buckifier/targets_builder.py b/buckifier/targets_builder.py
index e62eaf958504..1f0f412e18e3 100644
--- a/buckifier/targets_builder.py
+++ b/buckifier/targets_builder.py
@@ -45,6 +45,11 @@ def __init__(self, path, extra_argv):
         self.total_bin = 0
         self.total_test = 0
         self.tests_cfg = ""
+    
+    def add_oncall(self, oncall):
+       with open(self.path, "ab") as targets_file:
+            targets_file.write(targets_cfg.oncall_template.format(name=oncall).encode("utf-8"))
+                
 
     def add_library(
         self,
diff --git a/buckifier/targets_cfg.py b/buckifier/targets_cfg.py
index 4e58d1210200..7306ee0cd09d 100644
--- a/buckifier/targets_cfg.py
+++ b/buckifier/targets_cfg.py
@@ -1,10 +1,10 @@
-# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+# Copyright (c) Meta Platforms, Inc. and its affiliates. All Rights Reserved.
 
 rocksdb_target_header_template = """# This file \100generated by:
 #$ python3 buckifier/buckify_rocksdb.py{extra_argv}
 # --> DO NOT EDIT MANUALLY <--
-# This file is a Facebook-specific integration for buck builds, so can
-# only be validated by Facebook employees.
+# This file is a Meta-specific integration for buck builds, so can
+# only be validated by Meta employees.
 load("//rocks/buckifier:defs.bzl", "cpp_library_wrapper","rocks_cpp_library_wrapper","cpp_binary_wrapper","cpp_unittest_wrapper","fancy_bench_wrapper","add_c_test_wrapper")
 load("@fbcode_macros//build_defs:export_files.bzl", "export_file")
 
@@ -41,3 +41,8 @@
 export_file_template = """
 export_file(name = "{name}")
 """
+
+
+oncall_template = """
+oncall("{name}")
+"""

From c76cacc696b50c3a407b3df537646a6535059ab5 Mon Sep 17 00:00:00 2001
From: Hui Xiao <huixiao@fb.com>
Date: Wed, 19 Nov 2025 16:25:53 -0800
Subject: [PATCH 382/500] Fix overflow in MultiplyCheckOverflow() due to
 std::numeric_limits<uint64_t>::max()'s promotion to double (#14132)

Summary:
**Context/Summary:**
Due to double's 53-bit mantissa limitation, large uint64_t values lose precision when converted to double. Value equals to or smaller than UINT64_MAX (but greater than 2^64 - 1024) round up to 2^64 since rounding up results in less error than rounding down, which exceeds UINT64_MAX. `std::numeric_limits<uint64_t>::max() / op1 < op2` won't catch those cases. Casting such out-of-range doubles back to uint64_t causes undefined behavior. T

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14132
UndefinedBehaviorSanitizer: undefined-behavior options/cf_options.cc:1087:32 in
```
before the fix but not after.

Test Plan:
```
COMPILE_WITH_ASAN=1 COMPILE_WITH_UBSAN=1 CC=clang-18 CXX=clang++-18 ROCKSDB_DISABLE_ALIGNED_NEW=1 USE_CLANG=1 make V=1 -j55 db_stress

python3 tools/db_crashtest.py --simple blackbox --compact_range_one_in=5 --target_file_size_base=9223372036854775807 // Half of std::numeric_limits<uint64_t>::max()
```
It fails with
```
stderr:
 options/cf_options.cc:1087:32: runtime error: 1.84467e+19 is outside the range of representable values of type 'unsigned long'

Reviewed By: pdillinger

Differential Revision: D87434936

Pulled By: hx235

fbshipit-source-id: 65563edf9faf732410bdba8b9e4b7fd61b958169
---
 options/cf_options.cc | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/options/cf_options.cc b/options/cf_options.cc
index eca2cd930966..ba1360aa841e 100644
--- a/options/cf_options.cc
+++ b/options/cf_options.cc
@@ -1078,10 +1078,12 @@ uint64_t MultiplyCheckOverflow(uint64_t op1, double op2) {
   if (op1 == 0 || op2 <= 0) {
     return 0;
   }
-  if (std::numeric_limits<uint64_t>::max() / op1 < op2) {
-    return op1;
+
+  if (op1 * op2 < static_cast<double>(std::numeric_limits<uint64_t>::max())) {
+    return static_cast<uint64_t>(op1 * op2);
   }
-  return static_cast<uint64_t>(op1 * op2);
+
+  return op1;
 }
 
 // when level_compaction_dynamic_level_bytes is true and leveled compaction

From dc33c1adaf21a589953e6e7c6b0a0d2b7b3a57c0 Mon Sep 17 00:00:00 2001
From: Hui Xiao <huixiao@fb.com>
Date: Fri, 21 Nov 2025 11:32:00 -0800
Subject: [PATCH 383/500] Include verify_output_flags to check resumable
 compaction compatibility (#14139)

Summary:
**Context/Summary:**

.. because verify_output_flags contains information of usage of paranoid_file_check that is currently not yet compatible with resumable remote compaction

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14139

Test Plan: Existing tests

Reviewed By: jaykorean

Differential Revision: D87582635

Pulled By: hx235

fbshipit-source-id: ef21223da53a0696fa3ca9b1617c2c1ee2e19878
---
 db/db_impl/db_impl_secondary.cc | 43 ++++++++++++++++++++-------------
 1 file changed, 26 insertions(+), 17 deletions(-)

diff --git a/db/db_impl/db_impl_secondary.cc b/db/db_impl/db_impl_secondary.cc
index 5e6de87c586f..b73de2b350ae 100644
--- a/db/db_impl/db_impl_secondary.cc
+++ b/db/db_impl/db_impl_secondary.cc
@@ -1299,24 +1299,33 @@ Status DBImplSecondary::CompactWithoutInstallation(
   }
   Status s;
 
+  const auto& mutable_cf_options = cfd->GetLatestMutableCFOptions();
+
   // TODO(hx235): Resuming compaction is currently incompatible with
-  // paranoid_file_checks=true because OutputValidator hash verification would
-  // fail during compaction resumption. Before interruption, resuming
-  // compaction needs to persist the hash of each output file to enable
-  // validation after resumption. Alternatively and preferably, we could move
-  // the output verification to happen immediately after each output file is
-  // created. This workaround currently disables resuming compaction when
-  // paranoid_file_checks is enabled. Note that paranoid_file_checks is
-  // disabled by default.
+  // output hash verification (enabled via paranoid_file_checks=true or
+  // verify_output_flags containing kVerifyIteration) because resumed compaction
+  // will lose the hash computed before interruption.
+  // Potential solutions:
+  // 1. Persist the hash state: Before interruption, save the current hash value
+  //    of each output file to disk, allowing validation to continue correctly
+  //    after resumption.
+  // 2. Immediate verification: Move output verification to happen
+  //    immediately after each output file is created and closed, eliminating
+  //    the need to maintain hash state across resumption boundaries.
+  bool output_hash_verification_enabled =
+      mutable_cf_options.paranoid_file_checks ||
+      !!(mutable_cf_options.verify_output_flags &
+         VerifyOutputFlags::kVerifyIteration);
+
   bool allow_resumption =
-      options.allow_resumption &&
-      !cfd->GetLatestMutableCFOptions().paranoid_file_checks;
+      options.allow_resumption && !output_hash_verification_enabled;
 
-  if (options.allow_resumption &&
-      cfd->GetLatestMutableCFOptions().paranoid_file_checks) {
+  if (options.allow_resumption && output_hash_verification_enabled) {
     ROCKS_LOG_WARN(immutable_db_options_.info_log,
                    "Resume compaction configured but disabled due to "
-                   "incompatible with paranoid_file_checks=true");
+                   "incompatibility with output hash verification "
+                   "(paranoid_file_checks=true or verify_output_flags "
+                   "containing kVerifyIteration)");
   }
 
   mutex_.Unlock();
@@ -1345,8 +1354,8 @@ Status DBImplSecondary::CompactWithoutInstallation(
   CompactionOptions comp_options;
   comp_options.compression = kDisableCompressionOption;
   comp_options.output_file_size_limit = MaxFileSizeForLevel(
-      cfd->GetLatestMutableCFOptions(), input.output_level,
-      cfd->ioptions().compaction_style, vstorage->base_level(),
+      mutable_cf_options, input.output_level, cfd->ioptions().compaction_style,
+      vstorage->base_level(),
       cfd->ioptions().level_compaction_dynamic_level_bytes);
 
   std::vector<CompactionInputFiles> input_files;
@@ -1384,8 +1393,8 @@ Status DBImplSecondary::CompactWithoutInstallation(
   }
   c.reset(cfd->compaction_picker()->PickCompactionForCompactFiles(
       comp_options, input_files, input.output_level, vstorage,
-      cfd->GetLatestMutableCFOptions(), mutable_db_options_, 0,
-      earliest_snapshot, job_context.snapshot_checker));
+      mutable_cf_options, mutable_db_options_, 0, earliest_snapshot,
+      job_context.snapshot_checker));
   assert(c != nullptr);
   c->FinalizeInputInfo(version);
 

From c4bbad4dfe7ce6e9f38689bb9281277f374dbebc Mon Sep 17 00:00:00 2001
From: Jay Huh <jewoongh@meta.com>
Date: Fri, 21 Nov 2025 11:32:10 -0800
Subject: [PATCH 384/500] Update format-diff script to add text to new files
 (#14143)

Summary:
Fixing internal validator failure

```
Every project specific source file must contain a doc block with an appropriate copyright header. Unrelated files must be listed as exceptions in the Copyright Headers Exceptions page in the repo dashboard.
A copyright header clearly indicates that the code is owned by Meta. Every open source file must start with a comment containing "Meta Platforms, Inc. and affiliates"
https://github.com/facebook/rocksdb/blob/main/buckifier/targets_cfg.py:
The first 16 lines of 'buckifier/targets_cfg.py' do not contain the patterns:
	(Meta Platforms, Inc. and affiliates)|(Facebook, Inc(\.|,)? and its affiliates)|([0-9]{4}-present(\.|,)? Facebook)|([0-9]{4}(\.|,)? Facebook)
```

While fixing the text to pass the linter, I took the opportunity to modify `format-diff.sh` script to add the copyright header automatically if missing in new files.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14143

Test Plan:
```
$> make format
```
**new python file**
```
build_tools/format-diff.sh
Checking format of uncommitted changes...
Checking for copyright headers in new files...
Added copyright header to build_tools/test.py
Copyright headers were added to new files.
Nothing needs to be reformatted!
```
**new header file**
```
build_tools/format-diff.sh
Checking format of uncommitted changes...
Checking for copyright headers in new files...
Added copyright header to db/db_impl/db_impl_jewoongh.h
Copyright headers were added to new files.
Nothing needs to be reformatted!
```

Reviewed By: hx235

Differential Revision: D87653124

Pulled By: jaykorean

fbshipit-source-id: 164322cfcd2c162bb3b41bb8f3bafefa3f20b695
---
 buckifier/targets_cfg.py   |  4 ++-
 build_tools/format-diff.sh | 64 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 67 insertions(+), 1 deletion(-)

diff --git a/buckifier/targets_cfg.py b/buckifier/targets_cfg.py
index 7306ee0cd09d..e9ff129a604a 100644
--- a/buckifier/targets_cfg.py
+++ b/buckifier/targets_cfg.py
@@ -1,4 +1,6 @@
-# Copyright (c) Meta Platforms, Inc. and its affiliates. All Rights Reserved.
+#  Copyright (c) Meta Platforms, Inc. and affiliates.
+#  This source code is licensed under both the GPLv2 (found in the COPYING file in the root directory)
+#  and the Apache 2.0 License (found in the LICENSE.Apache file in the root directory).
 
 rocksdb_target_header_template = """# This file \100generated by:
 #$ python3 buckifier/buckify_rocksdb.py{extra_argv}
diff --git a/build_tools/format-diff.sh b/build_tools/format-diff.sh
index aa6b634563da..91cbb46a3412 100755
--- a/build_tools/format-diff.sh
+++ b/build_tools/format-diff.sh
@@ -148,6 +148,70 @@ else
   echo "Checking format of uncommitted changes..."
 fi
 
+# Check for missing copyright in new files
+echo "Checking for copyright headers in new files..."
+
+# Get list of new files (added, not just modified)
+if [ -z "$uncommitted_code" ]; then
+  # Post-commit: check files added since merge base
+  new_files=$(git diff --name-only --diff-filter=A "$FORMAT_UPSTREAM_MERGE_BASE" -- '*.h' '*.cc' '*.py' $EXCLUDE)
+else
+  # Pre-commit: check staged new files
+  new_files=$(git diff --name-only --diff-filter=A --cached HEAD -- '*.h' '*.cc' '*.py' $EXCLUDE)
+fi
+
+if [ -n "$new_files" ]; then
+  files_missing_copyright=""
+
+  for file in $new_files; do
+    if [ -f "$file" ]; then
+      # Check if file is missing copyright
+      # For .py files, check for Python-style comment
+      # For .h and .cc files, check for C++-style comment
+      if [[ "$file" == *.py ]]; then
+        if ! grep -q "Copyright (c) Meta Platforms, Inc. and affiliates" "$file"; then
+          files_missing_copyright="$files_missing_copyright $file"
+          # Add copyright header to Python file
+          temp_file=$(mktemp)
+          {
+            echo "#  Copyright (c) Meta Platforms, Inc. and affiliates."
+            echo "#  This source code is licensed under both the GPLv2 (found in the COPYING file in the root directory)"
+            echo "#  and the Apache 2.0 License (found in the LICENSE.Apache file in the root directory)."
+            echo
+            cat "$file"
+          } > "$temp_file"
+          mv "$temp_file" "$file"
+          echo "Added copyright header to $file"
+        fi
+      elif [[ "$file" == *.h ]] || [[ "$file" == *.cc ]]; then
+        if ! grep -q "Copyright (c) Meta Platforms, Inc. and affiliates" "$file"; then
+          files_missing_copyright="$files_missing_copyright $file"
+          # Add copyright header to C++ file
+          temp_file=$(mktemp)
+          {
+            echo "//  Copyright (c) Meta Platforms, Inc. and affiliates. "
+            echo "//  This source code is licensed under both the GPLv2 (found in the "
+            echo "//  COPYING file in the root directory) and Apache 2.0 License "
+            echo "//  (found in the LICENSE.Apache file in the root directory)."
+            echo
+            cat "$file"
+          } > "$temp_file"
+          mv "$temp_file" "$file"
+          echo "Added copyright header to $file"
+        fi
+      fi
+    fi
+  done
+
+  if [ -n "$files_missing_copyright" ]; then
+    echo "Copyright headers were added to new files."
+  else
+    echo "All new files have copyright headers."
+  fi
+else
+  echo "No new files to check for copyright headers."
+fi
+
 if [ -z "$diffs" ]
 then
   echo "Nothing needs to be reformatted!"

From 2f583aed8f66e36d5feb7a5923da9440345d1bba Mon Sep 17 00:00:00 2001
From: Jay Huh <jewoongh@meta.com>
Date: Fri, 21 Nov 2025 13:30:31 -0800
Subject: [PATCH 385/500] Move prepared_iter size assertion after cleanup
 (#14144)

Summary:
Fixing crash test failure caused by `prepared_iters_.size() == 0`

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14144

Test Plan:
```
python3 -u tools/db_crashtest.py --stress_cmd=./db_stress --cleanup_cmd='' --simple blackbox
```

Reviewed By: krhancoc

Differential Revision: D87656914

Pulled By: jaykorean

fbshipit-source-id: 9ef7cf4ea5d34fe9dee6219b32323e91a2ea3e5f
---
 db/version_set.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/db/version_set.cc b/db/version_set.cc
index b9339929acd0..baf12b9ba359 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -1020,11 +1020,11 @@ class LevelIterator final : public InternalIterator {
   ~LevelIterator() override {
     delete file_iter_.Set(nullptr);
     // Clean up any prepared iterators that weren't used
-    assert(prepared_iters_.size() == 0);
     for (auto& entry : prepared_iters_) {
       delete entry.second;
     }
     prepared_iters_.clear();
+    assert(prepared_iters_.size() == 0);
   }
 
   // Seek to the first file with a key >= target.

From 8c7c8b8dab04f945a3574941185bdddc3d15a1be Mon Sep 17 00:00:00 2001
From: Changyu Bi <changyubi@meta.com>
Date: Fri, 21 Nov 2025 14:18:38 -0800
Subject: [PATCH 386/500] Fix a bug where compaction with range deletion can
 persist kTypeMaxValid in file metadata (#14122)

Summary:
Range deletion start keys are considered during compaction for cutting output files. Due to some ordering requirement (see comment above InsertNextValidRangeTombstoneAtLevel()) between truncated range deletion start key and a file's point keys, there was logic in https://github.com/facebook/rocksdb/blob/f6c9c3bf1cf05096e8ff8c03ded60c1e199edbb7/db/range_del_aggregator.cc#L39 that changes the value type to be kTypeMaxValid. However, kTypeMaxValid is not supposed to be persisted per https://github.com/facebook/rocksdb/blob/f6c9c3bf1cf05096e8ff8c03ded60c1e199edbb7/db/dbformat.h#L75-L76. This can cause forward compatibility issues reported in https://github.com/facebook/rocksdb/issues/14101. This PR fixes this issue by removing the logic that sets kTypeMaxValid and always skip truncated range deletion start key in CompactionMergingIterator.

For existing SST files, we want to avoid using this kTypeMaxValid, so this PR also introduces a new placeholder value type. This allows us to re-strengthen the relevant value type checks (IsExtendedValueType()) that was loosen for kTypeMaxValid.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14122

Test Plan:
- a unit test that persists kTypeMaxValid before this fix
- crash test with frequent range deletion: `python3 ./tools/db_crashtest.py blackbox --delrangepercent=11 --readpercent=35`
- Generate SST files with 0x1A as value type (kTypeMaxValid before this change) in file metadata. Run ldb with the strengthened check in IsExtendedValueType() to dump the MANIFEST. It failed to parse MANIFEST as expected before this PR and succeeds after this PR.
```
Error in processing file /tmp/rocksdbtest-543376/db_range_del_test_2549357_6547198162080866792/MANIFEST-000005 Corruption: VersionEdit: new-file4 entry  The file /tmp/rocksdbtest-543376/db_range_del_test_2549357_6547198162080866792/MANIFEST-000005 may be corrupted.
```

Reviewed By: pdillinger

Differential Revision: D87016541

Pulled By: cbi42

fbshipit-source-id: 9957a095db2cd9947463b403f352bd9a1fd70a76
---
 db/db_range_del_test.cc                       | 128 ++++++++++++++++++
 db/dbformat.h                                 |  16 ++-
 db/range_del_aggregator.cc                    |   4 -
 db/range_del_aggregator_test.cc               |  38 +++---
 db/version_edit.cc                            |   1 +
 db/version_edit.h                             |   3 +
 table/compaction_merging_iterator.cc          |  31 +++--
 .../bug_fixes/fix-range-del-boundary.md       |   1 +
 utilities/debug.cc                            |   2 +
 9 files changed, 185 insertions(+), 39 deletions(-)
 create mode 100644 unreleased_history/bug_fixes/fix-range-del-boundary.md

diff --git a/db/db_range_del_test.cc b/db/db_range_del_test.cc
index 5122aedc97a3..289f783ab5e2 100644
--- a/db/db_range_del_test.cc
+++ b/db/db_range_del_test.cc
@@ -3825,6 +3825,134 @@ TEST_F(DBRangeDelTest, RowCache) {
   // and should not turn db into read-only mdoe.
   ASSERT_OK(Put(Key(5), "foo"));
 }
+
+TEST_F(DBRangeDelTest, FileCutWithTruncatedRangeDelKey) {
+  // Test for a bug that used to generate files with meta.smallest
+  // containing kMaxValid.
+  //
+  // Setup:
+  // - Write Key(2), Key(3) and DeleteRange(Key(1), Key(4))
+  // - Flush to L0
+  // - Use SingleKeySstPartitioner to force each user key into its own file
+  // - Compact files from L0 to L1 will generate files
+  // File[0]:
+  //   smallest=[user_key=key000001, seq=4, type=15],
+  //   largest= [user_key=key000002, seq=72057594037927935, type=15]
+  // File[1]:
+  //   smallest=[user_key=key000002, seq=2, type=1],
+  //   largest= [user_key=key000003, seq=72057594037927935, type=15]
+  // File[2]:
+  //   smallest=[user_key=key000003, seq=3, type=1],
+  //   largest= [user_key=key000004, seq=72057594037927935, type=15]
+  // With range deletions truncated to each files key range.
+  //
+  // - Compact these files again into L2. RocksDB usede to set truncated
+  // range deletion start key to have value type kMaxValid. The range deletion
+  // start key is used in compaction file cutting decision.
+  // - Verify the file boundary keys after compaction have valid boundary keys
+  //
+  // Before the fix:
+  // File[0]:
+  //   smallest=[user_key=key000001, seq=4, type=15],
+  //   largest= [user_key=key000002, seq=72057594037927935, type=15]
+  // File[1]:
+  //   smallest=[user_key=key000002, seq=2, type=26],
+  //   largest= [user_key=key000003, seq=72057594037927935, type=15]
+  // File[2]:
+  //   smallest=[user_key=key000003, seq=3, type=26],
+  //   largest= [user_key=key000004, seq=72057594037927935, type=15]
+  //
+  // After the fix:
+  // File[0]:
+  //   smallest=[user_key=key000001, seq=4, type=15],
+  //   largest= [user_key=key000002, seq=72057594037927935, type=15]
+  // File[1]:
+  //   smallest=[user_key=key000002, seq=2, type=1],
+  //   largest= [user_key=key000003, seq=72057594037927935, type=15]
+  // File[2]:
+  //   smallest=[user_key=key000003, seq=3, type=1],
+  //   largest= [user_key=key000004, seq=72057594037927935, type=15]
+
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+
+  // Use partitioner that cuts before every new user key.
+  // Key(x) generates keys of length 9.
+  auto factory = std::shared_ptr<SstPartitionerFactory>(
+      NewSstPartitionerFixedPrefixFactory(10));
+  options.sst_partitioner_factory = factory;
+
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+
+  // Create a file in a lower level so the compactions below are not
+  // bottommost compactions. Range deletion start keys are not considered
+  // in bottommost compaction.
+  ASSERT_OK(Put(Key(3), rnd.RandomBinaryString(100)));
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(6);
+  ASSERT_EQ(1, NumTableFilesAtLevel(6));
+
+  ASSERT_OK(Put(Key(2), rnd.RandomString(100)));
+  // Snapshots keep point keys alive.
+  ManagedSnapshot snapshot1(db_);
+  ASSERT_OK(Put(Key(3), rnd.RandomString(100)));
+  ManagedSnapshot snapshot2(db_);
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(1),
+                             Key(4)));
+  ASSERT_OK(Flush());
+  ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+  ColumnFamilyMetaData cf_meta_l0;
+  db_->GetColumnFamilyMetaData(db_->DefaultColumnFamily(), &cf_meta_l0);
+  ASSERT_EQ(1, cf_meta_l0.levels[0].files.size());
+  std::vector<std::string> l0_filenames;
+  for (const auto& sst_file : cf_meta_l0.levels[0].files) {
+    l0_filenames.push_back(sst_file.name);
+  }
+
+  // Compact L0 files to L1
+  CompactionOptions compact_options_l0;
+  ASSERT_OK(db_->CompactFiles(compact_options_l0, l0_filenames, 1));
+  ASSERT_EQ(3, NumTableFilesAtLevel(1));
+
+  // Check L1 file metadata
+  std::vector<std::vector<FileMetaData>> files_l1;
+  dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &files_l1);
+
+  for (const auto& file : files_l1[1]) {
+    ASSERT_LT(ExtractValueType(file.smallest.Encode()), kTypeMaxValid);
+    ASSERT_LT(ExtractValueType(file.largest.Encode()), kTypeMaxValid);
+  }
+
+  // Get file names from level 1
+  ColumnFamilyMetaData cf_meta;
+  db_->GetColumnFamilyMetaData(db_->DefaultColumnFamily(), &cf_meta);
+  std::vector<std::string> input_filenames;
+  for (const auto& sst_file : cf_meta.levels[1].files) {
+    input_filenames.push_back(sst_file.name);
+  }
+
+  // Compact files from L1 to L2
+  CompactionOptions compact_options;
+  ASSERT_OK(db_->CompactFiles(compact_options, input_filenames, 2));
+
+  // Check L2 file metadata
+  std::vector<std::vector<FileMetaData>> files;
+  dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &files);
+
+  for (const auto& file : files[2]) {
+    ASSERT_LT(ExtractValueType(file.smallest.Encode()), kTypeMaxValid);
+    ASSERT_LT(ExtractValueType(file.largest.Encode()), kTypeMaxValid);
+  }
+
+  // // Verify iteration works correctly
+  std::unique_ptr<Iterator> iter{db_->NewIterator(ReadOptions())};
+  iter->SeekToFirst();
+  ASSERT_OK(iter->status());
+  ASSERT_FALSE(iter->Valid());
+}
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/db/dbformat.h b/db/dbformat.h
index 0ee6e9272b5f..e50380858774 100644
--- a/db/dbformat.h
+++ b/db/dbformat.h
@@ -72,6 +72,12 @@ enum ValueType : unsigned char {
   kTypeColumnFamilyWideColumnEntity = 0x17,     // WAL only
   kTypeValuePreferredSeqno = 0x18,              // Value with a unix write time
   kTypeColumnFamilyValuePreferredSeqno = 0x19,  // WAL only
+  // Placeholder value type for legacy SST files with incorrectly persisted
+  // file boundaries. Prior to the fix, TruncatedRangeDelIterator assigned
+  // kTypeMaxValid to truncated range deletion keys, which was then
+  // incorrectly persisted to SST file metadata. This placeholder value allows
+  // reading such legacy files for without using kTypeMaxValid.
+  kTypeTruncatedRangeDeletionSentinel = 0x1A,
   kTypeMaxValid,    // Should be after the last valid type, only used for
                     // validation
   kMaxValue = 0x7F  // Not used for storing records.
@@ -118,10 +124,11 @@ inline bool IsValueType(ValueType t) {
 
 // Checks whether a type is from user operation
 // kTypeRangeDeletion is in meta block so this API is separated from above
-// kTypeMaxValid can be from keys generated by
-// TruncatedRangeDelIterator::start_key()
+// kTypeTruncatedRangeDeletionSentinel is for legacy files with incorrectly
+// persisted file boundaries.
 inline bool IsExtendedValueType(ValueType t) {
-  return IsValueType(t) || t == kTypeRangeDeletion || t == kTypeMaxValid;
+  return IsValueType(t) || t == kTypeRangeDeletion ||
+         t == kTypeTruncatedRangeDeletionSentinel;
 }
 
 // We leave eight bits empty at the bottom so a type and sequence#
@@ -180,8 +187,7 @@ inline size_t InternalKeyEncodingLength(const ParsedInternalKey& key) {
 // Pack a sequence number and a ValueType into a uint64_t
 inline uint64_t PackSequenceAndType(uint64_t seq, ValueType t) {
   assert(seq <= kMaxSequenceNumber);
-  // kTypeMaxValid is used in TruncatedRangeDelIterator, see its constructor.
-  assert(IsExtendedValueType(t) || t == kTypeMaxValid);
+  assert(IsExtendedValueType(t));
   return (seq << 8) | t;
 }
 
diff --git a/db/range_del_aggregator.cc b/db/range_del_aggregator.cc
index f41521e1162a..8b389bac5468 100644
--- a/db/range_del_aggregator.cc
+++ b/db/range_del_aggregator.cc
@@ -36,7 +36,6 @@ TruncatedRangeDelIterator::TruncatedRangeDelIterator(
     Status pik_status = ParseInternalKey(smallest->Encode(), &parsed_smallest,
                                          false /* log_err_key */);  // TODO
     pik_status.PermitUncheckedError();
-    parsed_smallest.type = kTypeMaxValid;
     assert(pik_status.ok());
     smallest_ = &parsed_smallest;
   }
@@ -71,9 +70,6 @@ TruncatedRangeDelIterator::TruncatedRangeDelIterator(
       // the truncated end key can cover the largest key in this sstable, reduce
       // its sequence number by 1.
       parsed_largest.sequence -= 1;
-      // This line is not needed for correctness, but it ensures that the
-      // truncated end key is not covering keys from the next SST file.
-      parsed_largest.type = kTypeMaxValid;
     }
     largest_ = &parsed_largest;
   }
diff --git a/db/range_del_aggregator_test.cc b/db/range_del_aggregator_test.cc
index 89391c924d93..41fbfbb9249d 100644
--- a/db/range_del_aggregator_test.cc
+++ b/db/range_del_aggregator_test.cc
@@ -89,7 +89,9 @@ void VerifyIterator(
   for (size_t i = 0; i < expected_range_dels.size(); i++, iter->Next()) {
     ASSERT_TRUE(iter->Valid());
     EXPECT_EQ(0, icmp.Compare(iter->start_key(), expected_range_dels[i].start));
-    EXPECT_EQ(0, icmp.Compare(iter->end_key(), expected_range_dels[i].end));
+    EXPECT_EQ(0, icmp.Compare(iter->end_key(), expected_range_dels[i].end))
+        << iter->end_key().DebugString(false, false) << " "
+        << expected_range_dels[i].end.DebugString(false, false);
     EXPECT_EQ(expected_range_dels[i].seq, iter->seq());
   }
   EXPECT_FALSE(iter->Valid());
@@ -305,28 +307,28 @@ TEST_F(RangeDelAggregatorTest, TruncatedIterPartiallyCutTombstones) {
 
   VerifyIterator(
       &iter, bytewise_icmp,
-      {{InternalValue("d", 7, kTypeMaxValid), UncutEndpoint("e"), 10},
+      {{InternalValue("d", 7, kTypeValue), UncutEndpoint("e"), 10},
        {InternalValue("e", 8, kTypeRangeDeletion), UncutEndpoint("g"), 8},
        {InternalValue("j", 4, kTypeRangeDeletion),
-        InternalValue("m", 8, kTypeMaxValid), 4}});
+        InternalValue("m", 8, kTypeValue), 4}});
 
   VerifySeek(
       &iter, bytewise_icmp,
-      {{"d", InternalValue("d", 7, kTypeMaxValid), UncutEndpoint("e"), 10},
+      {{"d", InternalValue("d", 7, kTypeValue), UncutEndpoint("e"), 10},
        {"e", InternalValue("e", 8, kTypeRangeDeletion), UncutEndpoint("g"), 8},
        {"ia", InternalValue("j", 4, kTypeRangeDeletion),
-        InternalValue("m", 8, kTypeMaxValid), 4, false /* invalid */},
+        InternalValue("m", 8, kTypeValue), 4, false /* invalid */},
        {"n", InternalValue("", 0, kTypeRangeDeletion), UncutEndpoint(""), 0,
         true /* invalid */},
-       {"", InternalValue("d", 7, kTypeMaxValid), UncutEndpoint("e"), 10}});
+       {"", InternalValue("d", 7, kTypeValue), UncutEndpoint("e"), 10}});
 
   VerifySeekForPrev(
       &iter, bytewise_icmp,
-      {{"d", InternalValue("d", 7, kTypeMaxValid), UncutEndpoint("e"), 10},
+      {{"d", InternalValue("d", 7, kTypeValue), UncutEndpoint("e"), 10},
        {"e", InternalValue("e", 8, kTypeRangeDeletion), UncutEndpoint("g"), 8},
        {"ia", InternalValue("e", 8, kTypeRangeDeletion), UncutEndpoint("g"), 8},
        {"n", InternalValue("j", 4, kTypeRangeDeletion),
-        InternalValue("m", 8, kTypeMaxValid), 4, false /* invalid */},
+        InternalValue("m", 8, kTypeValue), 4, false /* invalid */},
        {"", InternalValue("", 0, kTypeRangeDeletion), UncutEndpoint(""), 0,
         true /* invalid */}});
 }
@@ -345,23 +347,21 @@ TEST_F(RangeDelAggregatorTest, TruncatedIterFullyCutTombstones) {
   TruncatedRangeDelIterator iter(std::move(input_iter), &bytewise_icmp,
                                  &smallest, &largest);
 
-  VerifyIterator(
-      &iter, bytewise_icmp,
-      {{InternalValue("f", 7, kTypeMaxValid), UncutEndpoint("g"), 8}});
+  VerifyIterator(&iter, bytewise_icmp,
+                 {{InternalValue("f", 7, kTypeValue), UncutEndpoint("g"), 8}});
 
-  VerifySeek(
-      &iter, bytewise_icmp,
-      {{"d", InternalValue("f", 7, kTypeMaxValid), UncutEndpoint("g"), 8},
-       {"f", InternalValue("f", 7, kTypeMaxValid), UncutEndpoint("g"), 8},
-       {"j", InternalValue("", 0, kTypeRangeDeletion), UncutEndpoint(""), 0,
-        true /* invalid */}});
+  VerifySeek(&iter, bytewise_icmp,
+             {{"d", InternalValue("f", 7, kTypeValue), UncutEndpoint("g"), 8},
+              {"f", InternalValue("f", 7, kTypeValue), UncutEndpoint("g"), 8},
+              {"j", InternalValue("", 0, kTypeRangeDeletion), UncutEndpoint(""),
+               0, true /* invalid */}});
 
   VerifySeekForPrev(
       &iter, bytewise_icmp,
       {{"d", InternalValue("", 0, kTypeRangeDeletion), UncutEndpoint(""), 0,
         true /* invalid */},
-       {"f", InternalValue("f", 7, kTypeMaxValid), UncutEndpoint("g"), 8},
-       {"j", InternalValue("f", 7, kTypeMaxValid), UncutEndpoint("g"), 8}});
+       {"f", InternalValue("f", 7, kTypeValue), UncutEndpoint("g"), 8},
+       {"j", InternalValue("f", 7, kTypeValue), UncutEndpoint("g"), 8}});
 }
 
 TEST_F(RangeDelAggregatorTest, SingleIterInAggregator) {
diff --git a/db/version_edit.cc b/db/version_edit.cc
index 88150181bf4c..afc9128d45ad 100644
--- a/db/version_edit.cc
+++ b/db/version_edit.cc
@@ -30,6 +30,7 @@ uint64_t PackFileNumberAndPathId(uint64_t number, uint64_t path_id) {
 Status FileMetaData::UpdateBoundaries(const Slice& key, const Slice& value,
                                       SequenceNumber seqno,
                                       ValueType value_type) {
+  assert(value_type < kTypeMaxValid);
   if (value_type == kTypeBlobIndex) {
     BlobIndex blob_index;
     const Status s = blob_index.DecodeFrom(value);
diff --git a/db/version_edit.h b/db/version_edit.h
index 8ed83cc4a8ed..2f0543b19d34 100644
--- a/db/version_edit.h
+++ b/db/version_edit.h
@@ -317,6 +317,9 @@ struct FileMetaData {
   void UpdateBoundariesForRange(const InternalKey& start,
                                 const InternalKey& end, SequenceNumber seqno,
                                 const InternalKeyComparator& icmp) {
+    assert(ExtractValueType(start.Encode()) < kTypeMaxValid);
+    assert(ExtractValueType(end.Encode()) < kTypeMaxValid);
+
     if (smallest.size() == 0 || icmp.Compare(start, smallest) < 0) {
       smallest = start;
     }
diff --git a/table/compaction_merging_iterator.cc b/table/compaction_merging_iterator.cc
index 6c9dabb3ec12..b67b63472d63 100644
--- a/table/compaction_merging_iterator.cc
+++ b/table/compaction_merging_iterator.cc
@@ -191,13 +191,6 @@ class CompactionMergingIterator : public InternalIterator {
 
     bool operator()(HeapItem* a, HeapItem* b) const {
       int r = comparator_->Compare(a->key(), b->key());
-      // For each file, we assume all range tombstone start keys come before
-      // its file boundary sentinel key (file's meta.largest key).
-      // In the case when meta.smallest = meta.largest and range tombstone start
-      // key is truncated at meta.smallest, the start key will have op_type =
-      // kMaxValid to make it smaller (see TruncatedRangeDelIterator
-      // constructor). The following assertion validates this assumption.
-      assert(a->type == b->type || r != 0);
       return r > 0;
     }
 
@@ -242,8 +235,24 @@ class CompactionMergingIterator : public InternalIterator {
     return !minHeap_.empty() ? minHeap_.top() : nullptr;
   }
 
-  void InsertRangeTombstoneAtLevel(size_t level) {
+  // For each file under a LevelIterator, the lifetime of range tombstone
+  // iterator is tied to the point key iterator. So we want scan through
+  // all range tombstone start keys before the file boundary sentinel key
+  // (file's meta.largest). When meta.smallest == meta.largest, the truncated
+  // range del start key may be ordered after meta.largest.
+  // Here we skip the first range deletion start key if it's truncated.
+  // This range deletion start key is redundant for compaction file cutting
+  // decision anyway, since the same point key will be considered for file
+  // cutting too.
+  void InsertNextValidRangeTombstoneAtLevel(size_t level) {
     if (range_tombstone_iters_[level]->Valid()) {
+      if (range_tombstone_iters_[level]->start_key().type !=
+          kTypeRangeDeletion) {
+        range_tombstone_iters_[level]->Next();
+        if (!range_tombstone_iters_[level]->Valid()) {
+          return;
+        }
+      }
       pinned_heap_item_[level].SetTombstoneForCompaction(
           range_tombstone_iters_[level]->start_key());
       minHeap_.push(&pinned_heap_item_[level]);
@@ -262,7 +271,7 @@ void CompactionMergingIterator::SeekToFirst() {
   for (size_t i = 0; i < range_tombstone_iters_.size(); ++i) {
     if (range_tombstone_iters_[i]) {
       range_tombstone_iters_[i]->SeekToFirst();
-      InsertRangeTombstoneAtLevel(i);
+      InsertNextValidRangeTombstoneAtLevel(i);
     }
   }
 
@@ -290,7 +299,7 @@ void CompactionMergingIterator::Seek(const Slice& target) {
                  0) {
         range_tombstone_iters_[i]->Next();
       }
-      InsertRangeTombstoneAtLevel(i);
+      InsertNextValidRangeTombstoneAtLevel(i);
     }
   }
 
@@ -357,7 +366,7 @@ void CompactionMergingIterator::FindNextVisibleKey() {
       minHeap_.pop();
     }
     if (range_tombstone_iters_[current->level]) {
-      InsertRangeTombstoneAtLevel(current->level);
+      InsertNextValidRangeTombstoneAtLevel(current->level);
     }
   }
 }
diff --git a/unreleased_history/bug_fixes/fix-range-del-boundary.md b/unreleased_history/bug_fixes/fix-range-del-boundary.md
new file mode 100644
index 000000000000..b75bc6d11d60
--- /dev/null
+++ b/unreleased_history/bug_fixes/fix-range-del-boundary.md
@@ -0,0 +1 @@
+* Fix a bug where compaction with range deletion can persist kTypeMaxValid in MANIFEST as file metadata. kTypeMaxValid is not supposed to be persisted and can change as new value types are introduced. This can cause a forward compatibility issue where older versions of RocksDB don't recognize kTypeMaxValid from newer versions. A new placeholder value type kTypeTruncatedRangeDeletionSentinel is also introduced to replace kTypeMaxValid when reading existing SST files' metadata from MANIFEST. This allows us to strengthen some checks to avoid using kTypeMaxValid in the future.
diff --git a/utilities/debug.cc b/utilities/debug.cc
index 59e6d46880f5..6bfd00b72c7a 100644
--- a/utilities/debug.cc
+++ b/utilities/debug.cc
@@ -41,6 +41,8 @@ static std::unordered_map<std::string, ValueType> value_type_string_map = {
     {"TypeValuePreferredSeqno", ValueType::kTypeValuePreferredSeqno},
     {"TypeColumnFamilyValuePreferredSeqno",
      ValueType::kTypeColumnFamilyValuePreferredSeqno},
+    {"kTypeTruncatedRangeDeletionSentinel",
+     ValueType::kTypeTruncatedRangeDeletionSentinel},
 };
 
 std::string KeyVersion::GetTypeName() const {

From 35148aca91cda84d6fa9b295eb5500d6d965dca6 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Fri, 21 Nov 2025 16:34:49 -0800
Subject: [PATCH 387/500] Improve distinct compression for index and data
 blocks (#14140)

Summary:
This change enables a custom CompressionManager / Compressor to adopt custom handling for data and index blocks. In particular, index blocks for format_version >= 4 use a distinct variant of the block format. Thus, a potentially format-aware compression algorithm such as OpenZL should be told which kind of block we are compressing. (And previously I avoided passing block type in CompressBlock for efficient handling of things like dictionaries but also avoiding checks on every CompressBlock call.)

Most of the change is in BlockBasedTableBuilder to call MaybeCloneSpecialized for both kDataBlock and for kIndexBlock. But I also needed some small tweaks/additions to the public API also:
* Require a Clone() function from Compressors, to support proper implementations of MaybeCloneSpecialized() in wrapper Compressors.
* Assert that the default implementation of CompressorWrapper::MaybeCloneSpecialized() is only used in allowable cases.
* Convenience function Compressor::CloneMaybeSpecialized()

This also fixes a serious bug/oversight in ManagedPtr for (ManagedWorkingArea) that somehow wasn't showing up before. It probably doesn't need a release note because CompressionManager stuff is still considered experimental.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14140

Test Plan: Greatly expanded DBCompressionTest.CompressionManagerWrapper to make sure the distinction between data blocks and index blocks is properly communicated to a custom CompressionManager/Compressor. The test includes processing the expected structure of data and index blocks, to serve as a tested example for structure-aware compressors.

Reviewed By: hx235

Differential Revision: D87600019

Pulled By: pdillinger

fbshipit-source-id: 252ef78910073a0e45f2c81dd45ac87ff8a41fc6
---
 include/rocksdb/advanced_compression.h        |  28 +-
 include/rocksdb/data_structure.h              |  24 +-
 .../block_based/block_based_table_builder.cc  |  88 ++-
 test_util/testutil.h                          |   6 +-
 util/auto_tune_compressor.cc                  |   8 +
 util/auto_tune_compressor.h                   |   2 +
 util/compression.cc                           |  40 +-
 util/compression_test.cc                      | 706 ++++++++++++------
 util/simple_mixed_compressor.cc               |  11 +-
 util/simple_mixed_compressor.h                |   3 +
 10 files changed, 631 insertions(+), 285 deletions(-)

diff --git a/include/rocksdb/advanced_compression.h b/include/rocksdb/advanced_compression.h
index bd0294949827..be3da68f5177 100644
--- a/include/rocksdb/advanced_compression.h
+++ b/include/rocksdb/advanced_compression.h
@@ -90,6 +90,10 @@ class Compressor {
     return CompressionType::kDisableCompressionOption;
   }
 
+  // Return a distinct but functionally equivalent Compressor. This is often
+  // needed to implement MaybeCloneSpecialized() in wrapper compressors.
+  virtual std::unique_ptr<Compressor> Clone() const = 0;
+
   // Utility struct for providing sample data for the compression dictionary.
   // Potentially extensible by callers of Compressor (but not recommended)
   struct DictSampleArgs {
@@ -131,6 +135,18 @@ class Compressor {
     return nullptr;
   }
 
+  // A convenience function when a clone is needed and may or may not be
+  // specialized.
+  std::unique_ptr<Compressor> CloneMaybeSpecialized(
+      CacheEntryRole block_type, DictSampleArgs&& dict_samples) {
+    auto clone = MaybeCloneSpecialized(block_type, std::move(dict_samples));
+    if (clone == nullptr) {
+      clone = Clone();
+      assert(clone != nullptr);
+    }
+    return clone;
+  }
+
   // A WorkingArea is an optional structure (both for callers and
   // implementations) that can enable optimizing repeated compressions by
   // reusing working space or thread-local tracking of statistics or trends.
@@ -473,9 +489,19 @@ class CompressorWrapper : public Compressor {
     return wrapped_->GetPreferredCompressionType();
   }
 
+  // NOTE: Clone() not implemented here because it needs to be in the derived
+  // class
+
+  // NOTE: MaybeCloneSpecialized() is only implemented here for convenience
+  // when the wrapped Compressor uses the default implementation of
+  // MaybeCloneSpecialized(). This needs to be overridden if not.
   std::unique_ptr<Compressor> MaybeCloneSpecialized(
       CacheEntryRole block_type, DictSampleArgs&& dict_samples) override {
-    return wrapped_->MaybeCloneSpecialized(block_type, std::move(dict_samples));
+    auto clone =
+        wrapped_->MaybeCloneSpecialized(block_type, std::move(dict_samples));
+    // Assert default no-op MaybeCloneSpecialized()
+    assert(clone == nullptr);
+    return clone;
   }
 
   ManagedWorkingArea ObtainWorkingArea() override {
diff --git a/include/rocksdb/data_structure.h b/include/rocksdb/data_structure.h
index bf0144cd2904..7563a83abfcf 100644
--- a/include/rocksdb/data_structure.h
+++ b/include/rocksdb/data_structure.h
@@ -247,15 +247,7 @@ class ManagedPtr {
  public:
   ManagedPtr() = default;
   ManagedPtr(T* ptr, Owner* owner) : ptr_(ptr), owner_(owner) {}
-  ~ManagedPtr() {
-    if (ptr_ && owner_) {
-      if constexpr (std::is_member_function_pointer_v<decltype(Fn)>) {
-        (owner_->*Fn)(ptr_);
-      } else {
-        Fn(owner_, ptr_);
-      }
-    }
-  }
+  ~ManagedPtr() { Free(); }
   // No copies
   ManagedPtr(const ManagedPtr&) = delete;
   ManagedPtr& operator=(const ManagedPtr&) = delete;
@@ -267,6 +259,10 @@ class ManagedPtr {
     other.owner_ = nullptr;
   }
   ManagedPtr& operator=(ManagedPtr&& other) noexcept {
+    if (this == &other) {
+      return *this;
+    }
+    Free();
     ptr_ = other.ptr_;
     owner_ = other.owner_;
     other.ptr_ = nullptr;
@@ -284,6 +280,16 @@ class ManagedPtr {
  private:
   T* ptr_ = nullptr;
   Owner* owner_ = nullptr;
+
+  void Free() {
+    if (ptr_ && owner_) {
+      if constexpr (std::is_member_function_pointer_v<decltype(Fn)>) {
+        (owner_->*Fn)(ptr_);
+      } else {
+        Fn(owner_, ptr_);
+      }
+    }
+  }
 };
 
 template <typename T, typename comp>
diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc
index 3b1befdfbc46..74c90edea01b 100644
--- a/table/block_based/block_based_table_builder.cc
+++ b/table/block_based/block_based_table_builder.cc
@@ -110,6 +110,19 @@ FilterBlockBuilder* CreateFilterBlockBuilder(
   }
 }
 
+// A convenience function for populating the Compressor* fields; see ~Rep()
+Compressor* MaybeCloneSpecialized(
+    Compressor* compressor, CacheEntryRole block_type,
+    Compressor::DictSampleArgs&& dict_samples = {}) {
+  auto specialized =
+      compressor->MaybeCloneSpecialized(block_type, std::move(dict_samples));
+  if (specialized) {
+    // Caller is responsible for freeing when distinct
+    return specialized.release();
+  } else {
+    return compressor;
+  }
+}
 }  // namespace
 
 // kBlockBasedTableMagicNumber was picked by running
@@ -824,15 +837,17 @@ struct BlockBasedTableBuilder::Rep {
 
   // *** Compressors & decompressors - Yes, it seems like a lot here but ***
   // *** these are distinct fields to minimize extra conditionals and    ***
-  // *** field reads on hot code paths.                                  ***
+  // *** field reads on hot code paths. And to avoid interlocked         ***
+  // *** instructions associated with shared_ptr.                        ***
 
   // A compressor for blocks in general, without dictionary compression
   std::unique_ptr<Compressor> basic_compressor;
-  // A compressor using dictionary compression (when applicable)
-  std::unique_ptr<Compressor> compressor_with_dict;
-  // Once configured/determined, points to one of the above Compressors to
-  // use on data blocks.
-  Compressor* data_block_compressor = nullptr;
+  // A compressor for data blocks, which might be tuned differently and might
+  // use dictionary compression (when applicable). See ~Rep() for some details.
+  UnownedPtr<Compressor> data_block_compressor = nullptr;
+  // A compressor for index blocks, which might be tuned differently from
+  // basic_compressor. See ~Rep() for some details.
+  UnownedPtr<Compressor> index_block_compressor = nullptr;
   // A decompressor corresponding to basic_compressor (when non-nullptr).
   // Used for verification and cache warming.
   std::shared_ptr<Decompressor> basic_decompressor;
@@ -853,7 +868,7 @@ struct BlockBasedTableBuilder::Rep {
       compression_types_used;
 
   // Working area for basic_compressor when compression_parallel_threads==1
-  WorkingAreaPair basic_working_area;
+  WorkingAreaPair index_block_working_area;
   // Working area for data_block_compressor, for emit/compaction thread
   WorkingAreaPair data_block_working_area;
 
@@ -1099,7 +1114,10 @@ struct BlockBasedTableBuilder::Rep {
         filter_context, tbo.compression_opts, tbo.compression_type);
     if (basic_compressor) {
       if (table_options.enable_index_compression) {
-        basic_working_area.compress = basic_compressor->ObtainWorkingArea();
+        index_block_compressor = MaybeCloneSpecialized(
+            basic_compressor.get(), CacheEntryRole::kIndexBlock);
+        index_block_working_area.compress =
+            index_block_compressor->ObtainWorkingArea();
       }
       max_dict_sample_bytes = basic_compressor->GetMaxSampleSizeIfWantDict(
           CacheEntryRole::kDataBlock);
@@ -1114,8 +1132,10 @@ struct BlockBasedTableBuilder::Rep {
                                   tbo.compression_opts.max_dict_buffer_bytes);
         }
       } else {
-        // No distinct data block compressor using dictionary
-        data_block_compressor = basic_compressor.get();
+        // No distinct data block compressor using dictionary, but
+        // implementation might still want to specialize for data blocks
+        data_block_compressor = MaybeCloneSpecialized(
+            basic_compressor.get(), CacheEntryRole::kDataBlock);
         data_block_working_area.compress =
             data_block_compressor->ObtainWorkingArea();
       }
@@ -1129,8 +1149,9 @@ struct BlockBasedTableBuilder::Rep {
       if (table_options.verify_compression) {
         verify_decompressor = basic_decompressor.get();
         if (table_options.enable_index_compression) {
-          basic_working_area.verify = verify_decompressor->ObtainWorkingArea(
-              basic_compressor->GetPreferredCompressionType());
+          index_block_working_area.verify =
+              verify_decompressor->ObtainWorkingArea(
+                  index_block_compressor->GetPreferredCompressionType());
         }
         if (state == State::kUnbuffered) {
           assert(data_block_compressor);
@@ -1295,8 +1316,19 @@ struct BlockBasedTableBuilder::Rep {
   }
 
   ~Rep() {
+    // Delete working areas before their compressors.
+    index_block_working_area = {};
+    data_block_working_area = {};
     // Must have been cleaned up by StopParallelCompression
     assert(pc_rep == nullptr);
+    // Delete specialized compressors if they were distinct (avoiding extra
+    // fields and interlocked instructions with shared_ptr)
+    if (data_block_compressor.get() != basic_compressor.get()) {
+      delete data_block_compressor.get();
+    }
+    if (index_block_compressor.get() != basic_compressor.get()) {
+      delete index_block_compressor.get();
+    }
   }
 
   Rep(const Rep&) = delete;
@@ -1729,9 +1761,11 @@ void BlockBasedTableBuilder::WriteBlock(const Slice& uncompressed_block_data,
   assert(!r->IsParallelCompressionActive());
   CompressionType type;
   bool is_data_block = block_type == BlockType::kData;
+  // NOTE: only index and data blocks are currently compressed
+  assert(is_data_block || block_type == BlockType::kIndex);
   Status compress_status = CompressAndVerifyBlock(
       uncompressed_block_data, is_data_block,
-      is_data_block ? r->data_block_working_area : r->basic_working_area,
+      is_data_block ? r->data_block_working_area : r->index_block_working_area,
       &r->single_threaded_compressed_output, &type);
   r->SetStatus(compress_status);
   if (UNLIKELY(!ok())) {
@@ -1845,13 +1879,13 @@ Status BlockBasedTableBuilder::CompressAndVerifyBlock(
   Rep* r = rep_.get();
   Status status;
 
-  Compressor* compressor = nullptr;
+  UnownedPtr<Compressor> compressor = nullptr;
   Decompressor* verify_decomp = nullptr;
   if (is_data_block) {
     compressor = r->data_block_compressor;
     verify_decomp = r->data_block_verify_decompressor.get();
   } else {
-    compressor = r->basic_compressor.get();
+    compressor = r->index_block_compressor;
     verify_decomp = r->verify_decompressor.get();
   }
 
@@ -2116,7 +2150,7 @@ void BlockBasedTableBuilder::MaybeStartParallelCompression() {
   // that latency. So even with some optimizations, turning on the parallel
   // framework when compression is disabled just eats more CPU with little-to-no
   // improvement in throughput.
-  if (rep_->data_block_compressor == nullptr) {
+  if (!rep_->data_block_compressor) {
     // Force the generally best configuration for no compression: no parallelism
     return;
   }
@@ -2463,8 +2497,8 @@ void BlockBasedTableBuilder::WritePropertiesBlock(
 void BlockBasedTableBuilder::WriteCompressionDictBlock(
     MetaIndexBuilder* meta_index_builder) {
   Slice compression_dict;
-  if (rep_->compressor_with_dict) {
-    compression_dict = rep_->compressor_with_dict->GetSerializedDict();
+  if (rep_->data_block_compressor) {
+    compression_dict = rep_->data_block_compressor->GetSerializedDict();
   }
   if (!compression_dict.empty()) {
     BlockHandle compression_dict_block_handle;
@@ -2559,6 +2593,7 @@ void BlockBasedTableBuilder::MaybeEnterUnbuffered(
     // The below code is neither safe nor necessary for handling zero data
     // blocks.
     // For PostPopulateCompressionProperties()
+    assert(!r->data_block_compressor);
     r->data_block_compressor = r->basic_compressor.get();
     return;
   }
@@ -2600,15 +2635,12 @@ void BlockBasedTableBuilder::MaybeEnterUnbuffered(
 
   assert(samples.sample_data.size() > 0);
 
-  // final sample data block flushed, now we can generate dictionary
-  r->compressor_with_dict = r->basic_compressor->MaybeCloneSpecialized(
-      CacheEntryRole::kDataBlock, std::move(samples));
+  // final sample data block flushed, now we can generate dictionary (or it
+  // might opt not to use a dictionary and that's ok)
+  r->data_block_compressor =
+      MaybeCloneSpecialized(r->basic_compressor.get(),
+                            CacheEntryRole::kDataBlock, std::move(samples));
 
-  // The compressor might opt not to use a dictionary, in which case we
-  // can use the same compressor as for e.g. index blocks.
-  r->data_block_compressor = r->compressor_with_dict
-                                 ? r->compressor_with_dict.get()
-                                 : r->basic_compressor.get();
   Slice serialized_dict = r->data_block_compressor->GetSerializedDict();
   if (r->verify_decompressor) {
     if (serialized_dict.empty()) {
@@ -2831,8 +2863,8 @@ uint64_t BlockBasedTableBuilder::EstimatedTailSize() const {
   }
 
   // 3. Estimate compression dictionary size
-  if (rep_->compressor_with_dict) {
-    Slice dict = rep_->compressor_with_dict->GetSerializedDict();
+  if (rep_->data_block_compressor) {
+    Slice dict = rep_->data_block_compressor->GetSerializedDict();
     if (!dict.empty()) {
       estimated_tail_size += dict.size();
     }
diff --git a/test_util/testutil.h b/test_util/testutil.h
index fc172b8e4b39..aa837a972b24 100644
--- a/test_util/testutil.h
+++ b/test_util/testutil.h
@@ -766,6 +766,10 @@ struct CompressorCustomAlg : public CompressorWrapper {
     return kCompression;
   }
 
+  std::unique_ptr<Compressor> Clone() const override {
+    return std::make_unique<CompressorCustomAlg>(wrapped_->Clone());
+  }
+
   Status CompressBlock(Slice uncompressed_data, char* compressed_output,
                        size_t* compressed_output_size,
                        CompressionType* out_compression_type,
@@ -794,7 +798,7 @@ struct CompressorCustomAlg : public CompressorWrapper {
   std::unique_ptr<Compressor> MaybeCloneSpecialized(
       CacheEntryRole block_type, DictSampleArgs&& dict_samples) override {
     auto clone =
-        wrapped_->MaybeCloneSpecialized(block_type, std::move(dict_samples));
+        wrapped_->CloneMaybeSpecialized(block_type, std::move(dict_samples));
     return std::make_unique<CompressorCustomAlg>(std::move(clone));
   }
 
diff --git a/util/auto_tune_compressor.cc b/util/auto_tune_compressor.cc
index ed3bff812791..221ebe30073b 100644
--- a/util/auto_tune_compressor.cc
+++ b/util/auto_tune_compressor.cc
@@ -59,6 +59,10 @@ const char* AutoSkipCompressorWrapper::Name() const {
   return "AutoSkipCompressorWrapper";
 }
 
+std::unique_ptr<Compressor> AutoSkipCompressorWrapper::Clone() const {
+  return std::make_unique<AutoSkipCompressorWrapper>(wrapped_->Clone(), opts_);
+}
+
 Status AutoSkipCompressorWrapper::CompressBlock(
     Slice uncompressed_data, char* compressed_output,
     size_t* compressed_output_size, CompressionType* out_compression_type,
@@ -174,6 +178,10 @@ CostAwareCompressor::CostAwareCompressor(const CompressionOptions& opts)
 }
 
 const char* CostAwareCompressor::Name() const { return "CostAwareCompressor"; }
+
+std::unique_ptr<Compressor> CostAwareCompressor::Clone() const {
+  return std::make_unique<CostAwareCompressor>(opts_);
+}
 size_t CostAwareCompressor::GetMaxSampleSizeIfWantDict(
     CacheEntryRole block_type) const {
   auto idx = allcompressors_index_.back();
diff --git a/util/auto_tune_compressor.h b/util/auto_tune_compressor.h
index 818d8c43e753..ebd09f1c1e6a 100644
--- a/util/auto_tune_compressor.h
+++ b/util/auto_tune_compressor.h
@@ -64,6 +64,7 @@ class AutoSkipCompressorWrapper : public CompressorWrapper {
   explicit AutoSkipCompressorWrapper(std::unique_ptr<Compressor> compressor,
                                      const CompressionOptions& opts);
 
+  std::unique_ptr<Compressor> Clone() const override;
   Status CompressBlock(Slice uncompressed_data, char* compressed_output,
                        size_t* compressed_output_size,
                        CompressionType* out_compression_type,
@@ -149,6 +150,7 @@ class CostAwareCompressor : public Compressor {
  public:
   explicit CostAwareCompressor(const CompressionOptions& opts);
   const char* Name() const override;
+  std::unique_ptr<Compressor> Clone() const override;
   size_t GetMaxSampleSizeIfWantDict(CacheEntryRole block_type) const override;
   Slice GetSerializedDict() const override;
   CompressionType GetPreferredCompressionType() const override;
diff --git a/util/compression.cc b/util/compression.cc
index 5831643d462f..afa1e3d5357e 100644
--- a/util/compression.cc
+++ b/util/compression.cc
@@ -174,6 +174,10 @@ class BuiltinCompressorV1 : public CompressorBase {
 
   CompressionType GetPreferredCompressionType() const override { return type_; }
 
+  std::unique_ptr<Compressor> Clone() const override {
+    return std::make_unique<BuiltinCompressorV1>(opts_, type_);
+  }
+
   Status CompressBlock(Slice uncompressed_data, char* compressed_output,
                        size_t* compressed_output_size,
                        CompressionType* out_compression_type,
@@ -226,6 +230,10 @@ class CompressorWithSimpleDictBase : public CompressorBase {
   // NOTE: empty dict is equivalent to no dict
   Slice GetSerializedDict() const override { return dict_data_; }
 
+  std::unique_ptr<Compressor> Clone() const override {
+    return CloneForDict(std::string{dict_data_});
+  }
+
   std::unique_ptr<Compressor> MaybeCloneSpecialized(
       CacheEntryRole /*block_type*/,
       DictSampleArgs&& dict_samples) final override {
@@ -238,7 +246,8 @@ class CompressorWithSimpleDictBase : public CompressorBase {
     }
   }
 
-  virtual std::unique_ptr<Compressor> CloneForDict(std::string&& dict_data) = 0;
+  virtual std::unique_ptr<Compressor> CloneForDict(
+      std::string&& dict_data) const = 0;
 
  protected:
   const std::string dict_data_;
@@ -257,7 +266,8 @@ class BuiltinSnappyCompressorV2 : public CompressorWithSimpleDictBase {
     return kSnappyCompression;
   }
 
-  std::unique_ptr<Compressor> CloneForDict(std::string&& dict_data) override {
+  std::unique_ptr<Compressor> CloneForDict(
+      std::string&& dict_data) const override {
     return std::make_unique<BuiltinSnappyCompressorV2>(opts_,
                                                        std::move(dict_data));
   }
@@ -349,7 +359,8 @@ class BuiltinZlibCompressorV2 : public CompressorWithSimpleDictBase {
     return kZlibCompression;
   }
 
-  std::unique_ptr<Compressor> CloneForDict(std::string&& dict_data) override {
+  std::unique_ptr<Compressor> CloneForDict(
+      std::string&& dict_data) const override {
     return std::make_unique<BuiltinZlibCompressorV2>(opts_,
                                                      std::move(dict_data));
   }
@@ -447,7 +458,8 @@ class BuiltinBZip2CompressorV2 : public CompressorWithSimpleDictBase {
     return kBZip2Compression;
   }
 
-  std::unique_ptr<Compressor> CloneForDict(std::string&& dict_data) override {
+  std::unique_ptr<Compressor> CloneForDict(
+      std::string&& dict_data) const override {
     return std::make_unique<BuiltinBZip2CompressorV2>(opts_,
                                                       std::move(dict_data));
   }
@@ -526,7 +538,8 @@ class BuiltinLZ4CompressorV2WithDict : public CompressorWithSimpleDictBase {
     return kLZ4Compression;
   }
 
-  std::unique_ptr<Compressor> CloneForDict(std::string&& dict_data) override {
+  std::unique_ptr<Compressor> CloneForDict(
+      std::string&& dict_data) const override {
     return std::make_unique<BuiltinLZ4CompressorV2WithDict>(
         opts_, std::move(dict_data));
   }
@@ -616,6 +629,10 @@ class BuiltinLZ4CompressorV2NoDict : public BuiltinLZ4CompressorV2WithDict {
   BuiltinLZ4CompressorV2NoDict(const CompressionOptions& opts)
       : BuiltinLZ4CompressorV2WithDict(opts, /*dict_data=*/{}) {}
 
+  std::unique_ptr<Compressor> Clone() const override {
+    return std::make_unique<BuiltinLZ4CompressorV2NoDict>(opts_);
+  }
+
   ManagedWorkingArea ObtainWorkingArea() override {
     // Using an LZ4_stream_t between compressions and resetting with
     // LZ4_resetStream_fast is actually slower than using a fresh LZ4_stream_t
@@ -687,7 +704,8 @@ class BuiltinLZ4HCCompressorV2 : public CompressorWithSimpleDictBase {
     return kLZ4HCCompression;
   }
 
-  std::unique_ptr<Compressor> CloneForDict(std::string&& dict_data) override {
+  std::unique_ptr<Compressor> CloneForDict(
+      std::string&& dict_data) const override {
     return std::make_unique<BuiltinLZ4HCCompressorV2>(opts_,
                                                       std::move(dict_data));
   }
@@ -782,7 +800,8 @@ class BuiltinXpressCompressorV2 : public CompressorWithSimpleDictBase {
     return kXpressCompression;
   }
 
-  std::unique_ptr<Compressor> CloneForDict(std::string&& dict_data) override {
+  std::unique_ptr<Compressor> CloneForDict(
+      std::string&& dict_data) const override {
     return std::make_unique<BuiltinXpressCompressorV2>(opts_,
                                                        std::move(dict_data));
   }
@@ -831,6 +850,13 @@ class BuiltinZSTDCompressorV2 : public CompressorBase {
 
   CompressionType GetPreferredCompressionType() const override { return kZSTD; }
 
+  std::unique_ptr<Compressor> Clone() const override {
+    CompressionDict dict_copy{dict_.GetRawDict().ToString(), kZSTD,
+                              opts_.level};
+    return std::make_unique<BuiltinZSTDCompressorV2>(opts_,
+                                                     std::move(dict_copy));
+  }
+
   size_t GetMaxSampleSizeIfWantDict(
       CacheEntryRole /*block_type*/) const override {
     if (opts_.max_dict_bytes == 0) {
diff --git a/util/compression_test.cc b/util/compression_test.cc
index da95a91af210..c4a3baa7a49e 100644
--- a/util/compression_test.cc
+++ b/util/compression_test.cc
@@ -15,10 +15,12 @@
 #include "table/block_based/block_builder.h"
 #include "test_util/testutil.h"
 #include "util/auto_tune_compressor.h"
+#include "util/coding.h"
 #include "util/random.h"
 #include "util/simple_mixed_compressor.h"
 
 namespace ROCKSDB_NAMESPACE {
+
 class DBCompressionTest : public DBTestBase {
  public:
   DBCompressionTest() : DBTestBase("compression_test", /*env_do_fsync=*/true) {}
@@ -1108,7 +1110,131 @@ TEST_F(DBCompressionTest, RandomMixedCompressionManager) {
   }
 }
 
-TEST_F(DBCompressionTest, CompressionManagerWrapper) {
+namespace {
+// Template parameter to distinguish data blocks vs. v4+ index blocks
+template <bool kIndexBlockV4>
+static Status ValidateRocksBlock(Slice data) {
+  const char* src = data.data();
+  size_t srcSize = data.size();
+  const char* const block_type_str =
+      kIndexBlockV4 ? "Index block" : "Data block";
+
+  // Minimum RocksDB block content size: at least 1 entry + restarts
+  if (srcSize < 8) {
+    return Status::Corruption(std::string(block_type_str) + " too small");
+  }
+
+  uint32_t numRestarts = DecodeFixed32(src + srcSize - sizeof(uint32_t));
+
+  // Sanity check: num_restarts should be reasonable
+  // TODO: also support data block hash index
+  if (numRestarts > srcSize / 4 || numRestarts == 0) {
+    return Status::Corruption(std::string("Invalid num_restarts in ") +
+                              block_type_str);
+  }
+
+  size_t restartsSize = numRestarts * sizeof(uint32_t) + sizeof(uint32_t);
+  if (srcSize < restartsSize) {
+    return Status::Corruption(std::string(block_type_str) +
+                              " too small for restarts array");
+  }
+
+  size_t entriesSize = srcSize - restartsSize;
+  const char* entriesEnd = src + entriesSize;
+
+  // Parse entries
+  const char* p = src;
+  while (p < entriesEnd) {
+    // Parse shared_bytes varint
+    uint32_t shared;
+    const char* next = GetVarint32Ptr(p, entriesEnd, &shared);
+    if (next == nullptr) {
+      return Status::Corruption(std::string("Invalid shared_bytes varint in ") +
+                                block_type_str);
+    }
+    p = next;
+
+    // Parse unshared_bytes varint
+    uint32_t unshared;
+    next = GetVarint32Ptr(p, entriesEnd, &unshared);
+    if (next == nullptr) {
+      return Status::Corruption(
+          std::string("Invalid unshared_bytes varint in ") + block_type_str);
+    }
+    p = next;
+
+    uint32_t valueLen = 0;
+    if constexpr (!kIndexBlockV4) {
+      // For data blocks, parse value_length varint
+      next = GetVarint32Ptr(p, entriesEnd, &valueLen);
+      if (next == nullptr) {
+        return Status::Corruption(
+            std::string("Invalid value_length varint in ") + block_type_str);
+      }
+      p = next;
+    }
+
+    // Validate key delta
+    if (p + unshared > entriesEnd) {
+      return Status::Corruption(
+          std::string("Key delta exceeds end of entries in ") + block_type_str);
+    }
+    p += unshared;
+
+    if constexpr (kIndexBlockV4) {
+      // For v4 index blocks, value is self-describing (varints)
+      // Parse first varint (always present)
+      uint32_t v1;
+      next = GetVarint32Ptr(p, entriesEnd, &v1);
+      if (next == nullptr) {
+        return Status::Corruption(std::string("Invalid value varint in ") +
+                                  block_type_str);
+      }
+      p = next;
+
+      // If shared_bytes == 0, there's a second varint
+      if (shared == 0) {
+        uint32_t v2;
+        next = GetVarint32Ptr(p, entriesEnd, &v2);
+        if (next == nullptr) {
+          return Status::Corruption(
+              std::string("Invalid second value varint in ") + block_type_str);
+        }
+        p = next;
+      }
+    } else {
+      // For data blocks, validate value
+      if (p + valueLen > entriesEnd) {
+        return Status::Corruption(
+            std::string("Value exceeds end of entries in ") + block_type_str);
+      }
+      p += valueLen;
+    }
+  }
+
+  return Status::OK();
+}
+}  // anonymous namespace
+
+class DBCompressionTestMaybeParallel
+    : public DBCompressionTest,
+      public testing::WithParamInterface<std::tuple<int, bool>> {
+ public:
+  DBCompressionTestMaybeParallel()
+      : DBCompressionTest(),
+        parallel_threads_(std::get<0>(GetParam())),
+        use_dict_(std::get<1>(GetParam())) {}
+
+ protected:
+  int parallel_threads_;
+  bool use_dict_;
+};
+
+INSTANTIATE_TEST_CASE_P(DBCompressionTest, DBCompressionTestMaybeParallel,
+                        ::testing::Combine(::testing::Values(1, 4),
+                                           ::testing::Values(false, true)));
+
+TEST_P(DBCompressionTestMaybeParallel, CompressionManagerWrapper) {
   // Test that we can use a custom CompressionManager to wrap the built-in
   // CompressionManager, thus adopting a custom *strategy* based on existing
   // algorithms. This will "mark" some blocks (in their contents) as "do not
@@ -1119,14 +1245,81 @@ TEST_F(DBCompressionTest, CompressionManagerWrapper) {
   static std::string kDoNotCompress = "do_not_compress";
   static std::string kRejectCompression = "reject_compression";
 
+  static RelaxedAtomic<int> dataCheckedCount{0};
+  static RelaxedAtomic<int> indexCheckedCount{0};
+  static RelaxedAtomic<int> compressCalledCount{0};
+
+  // We also have wrappers here to help verify that when RocksDB asks to
+  // specialize the Compressor for a particular kind of block, it only passes in
+  // that kind of block to ensure proper grouping of related data for
+  // compression. We check this by parsing the subtly distinct schemas of data
+  // blocks vs. v4+ index blocks. This also ensures that structure-aware
+  // compressions like OpenZL can parse the data block and index block formats.
+  struct CheckDataBlockCompressorWrapper : public CompressorWrapper {
+    using CompressorWrapper::CompressorWrapper;
+    const char* Name() const override { return "CheckDataBlockCompressor"; }
+
+    std::unique_ptr<Compressor> Clone() const override {
+      return std::make_unique<CheckDataBlockCompressorWrapper>(
+          wrapped_->Clone());
+    }
+
+    Status CompressBlock(Slice uncompressed_data, char* compressed_output,
+                         size_t* compressed_output_size,
+                         CompressionType* out_compression_type,
+                         ManagedWorkingArea* working_area) override {
+      dataCheckedCount.FetchAddRelaxed(1);
+      // Parse and validate data block format before compressing
+      Status s = ValidateRocksBlock</*kIndexBlockV4=*/false>(uncompressed_data);
+      if (!s.ok()) {
+        return s;
+      }
+      // Delegate to wrapped compressor on success
+      return wrapped_->CompressBlock(uncompressed_data, compressed_output,
+                                     compressed_output_size,
+                                     out_compression_type, working_area);
+    }
+  };
+
+  struct CheckIndexBlockCompressorWrapper : public CompressorWrapper {
+    using CompressorWrapper::CompressorWrapper;
+    const char* Name() const override { return "CheckIndexBlockCompressor"; }
+
+    std::unique_ptr<Compressor> Clone() const override {
+      return std::make_unique<CheckIndexBlockCompressorWrapper>(
+          wrapped_->Clone());
+    }
+
+    Status CompressBlock(Slice uncompressed_data, char* compressed_output,
+                         size_t* compressed_output_size,
+                         CompressionType* out_compression_type,
+                         ManagedWorkingArea* working_area) override {
+      indexCheckedCount.FetchAddRelaxed(1);
+      // Parse and validate index block v4 format before compressing
+      Status s = ValidateRocksBlock</*kIndexBlockV4=*/true>(uncompressed_data);
+      if (!s.ok()) {
+        return s;
+      }
+      // Delegate to wrapped compressor on success
+      return wrapped_->CompressBlock(uncompressed_data, compressed_output,
+                                     compressed_output_size,
+                                     out_compression_type, working_area);
+    }
+  };
+
   struct MyCompressor : public CompressorWrapper {
     using CompressorWrapper::CompressorWrapper;
     const char* Name() const override { return "MyCompressor"; }
 
+    std::unique_ptr<Compressor> Clone() const override {
+      return std::make_unique<MyCompressor>(wrapped_->Clone());
+    }
+
     Status CompressBlock(Slice uncompressed_data, char* compressed_output,
                          size_t* compressed_output_size,
                          CompressionType* out_compression_type,
                          ManagedWorkingArea* working_area) override {
+      compressCalledCount.FetchAddRelaxed(1);
       auto begin = uncompressed_data.data();
       auto end = uncompressed_data.data() + uncompressed_data.size();
       if (std::search(begin, end, kDoNotCompress.begin(),
@@ -1154,6 +1347,7 @@ TEST_F(DBCompressionTest, CompressionManagerWrapper) {
           : wrapped_(std::move(wrapped)) {}
       ManagedWorkingArea wrapped_;
     };
+
     ManagedWorkingArea ObtainWorkingArea() override {
       ManagedWorkingArea rv{
           new MyWorkingArea{CompressorWrapper::ObtainWorkingArea()}, this};
@@ -1168,6 +1362,20 @@ TEST_F(DBCompressionTest, CompressionManagerWrapper) {
     void ReleaseWorkingArea(WorkingArea* wa) override {
       delete static_cast<MyWorkingArea*>(wa);
     }
+
+    std::unique_ptr<Compressor> MaybeCloneSpecialized(
+        CacheEntryRole block_type, DictSampleArgs&& dict_samples) override {
+      std::unique_ptr<Compressor> result = std::make_unique<MyCompressor>(
+          wrapped_->CloneMaybeSpecialized(block_type, std::move(dict_samples)));
+      if (block_type == CacheEntryRole::kDataBlock) {
+        result = std::make_unique<CheckDataBlockCompressorWrapper>(
+            std::move(result));
+      } else if (block_type == CacheEntryRole::kIndexBlock) {
+        result = std::make_unique<CheckIndexBlockCompressorWrapper>(
+            std::move(result));
+      }
+      return result;
+    }
   };
   struct MyManager : public CompressionManagerWrapper {
     using CompressionManagerWrapper::CompressionManagerWrapper;
@@ -1191,10 +1399,15 @@ TEST_F(DBCompressionTest, CompressionManagerWrapper) {
 
       Options options = CurrentOptions();
       options.compression = type;
+      options.compression_opts.parallel_threads = parallel_threads_;
+      options.compression_opts.max_dict_bytes = use_dict_ ? 4096 : 0;
       options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
       options.statistics->set_stats_level(StatsLevel::kExceptTimeForMutex);
       BlockBasedTableOptions bbto;
-      bbto.enable_index_compression = false;
+      bbto.enable_index_compression = true;
+      bbto.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch;
+      bbto.partition_filters = true;
+      bbto.filter_policy.reset(NewBloomFilterPolicy(5));
       options.table_factory.reset(NewBlockBasedTableFactory(bbto));
       options.compression_manager = use_wrapper ? mgr : nullptr;
       DestroyAndReopen(options);
@@ -1228,14 +1441,21 @@ TEST_F(DBCompressionTest, CompressionManagerWrapper) {
       }
       ASSERT_OK(Flush());
 
-      if (use_wrapper) {
-        EXPECT_EQ(kCount / 2 - 1, PopStat(NUMBER_BLOCK_COMPRESSED));
+      // Index partition is compressed
+      constexpr int kIdxComp = 1;
+      // Top level index block is rejected for compression
+      constexpr int kIdxRej = 1;
+
+      if (use_dict_) {
+        // FIXME: why don't the stats match? (for now, checking for crashes)
+      } else if (use_wrapper) {
+        EXPECT_EQ(kCount / 2 - 1 + kIdxComp, PopStat(NUMBER_BLOCK_COMPRESSED));
         EXPECT_EQ(kCount / 2, PopStat(NUMBER_BLOCK_COMPRESSION_BYPASSED));
-        EXPECT_EQ(1 + 1, PopStat(NUMBER_BLOCK_COMPRESSION_REJECTED));
+        EXPECT_EQ(1 + 1 + kIdxRej, PopStat(NUMBER_BLOCK_COMPRESSION_REJECTED));
       } else {
-        EXPECT_EQ(kCount - 1, PopStat(NUMBER_BLOCK_COMPRESSED));
+        EXPECT_EQ(kCount - 1 + kIdxComp, PopStat(NUMBER_BLOCK_COMPRESSED));
         EXPECT_EQ(0, PopStat(NUMBER_BLOCK_COMPRESSION_BYPASSED));
-        EXPECT_EQ(1, PopStat(NUMBER_BLOCK_COMPRESSION_REJECTED));
+        EXPECT_EQ(1 + kIdxRej, PopStat(NUMBER_BLOCK_COMPRESSION_REJECTED));
       }
 
       // Ensure well-formed for reads
@@ -1243,11 +1463,27 @@ TEST_F(DBCompressionTest, CompressionManagerWrapper) {
         ASSERT_NE(Get(Key(i)), "NOT_FOUND");
       }
       ASSERT_EQ(Get(Key(kCount)), "NOT_FOUND");
+
+      // Ensure expected checks were performed
+      EXPECT_EQ(indexCheckedCount.ExchangeRelaxed(0),
+                use_wrapper ? kIdxComp + kIdxRej : 0);
+      EXPECT_EQ(dataCheckedCount.ExchangeRelaxed(0), use_wrapper ? kCount : 0);
+      // And every use of MyCompressor went through either the data block
+      // checker or index block checker
+      EXPECT_EQ(compressCalledCount.ExchangeRelaxed(0),
+                use_wrapper ? kIdxComp + kIdxRej + kCount : 0);
     }
   }
 }
 
-TEST_F(DBCompressionTest, CompressionManagerCustomCompression) {
+namespace {
+std::string UniqueName(const std::string& base) {
+  static RelaxedAtomic<int> counter{0};
+  return base + std::to_string(counter.FetchAddRelaxed(1));
+}
+}  // anonymous namespace
+
+TEST_P(DBCompressionTestMaybeParallel, CompressionManagerCustomCompression) {
   // Test that we can use a custom CompressionManager to implement custom
   // compression algorithms, and that there are appropriate schema guard rails
   // to ensure data is not processed by the wrong algorithm.
@@ -1263,9 +1499,12 @@ TEST_F(DBCompressionTest, CompressionManagerCustomCompression) {
 
   class MyManager : public CompressionManager {
    public:
-    explicit MyManager(const char* compat_name) : compat_name_(compat_name) {}
+    explicit MyManager(const std::string& compat_name)
+        : compat_name_(compat_name), name_("MyManager:" + compat_name_) {}
     const char* Name() const override { return name_.c_str(); }
-    const char* CompatibilityName() const override { return compat_name_; }
+    const char* CompatibilityName() const override {
+      return compat_name_.c_str();
+    }
 
     bool SupportsCompressionType(CompressionType type) const override {
       return type == kCustomCompression8A || type == kCustomCompression8B ||
@@ -1330,246 +1569,237 @@ TEST_F(DBCompressionTest, CompressionManagerCustomCompression) {
     }
 
    private:
-    const char* compat_name_;
+    std::string compat_name_;
     std::string name_;
     // weak_ptr to avoid cycles
     std::map<std::string, std::weak_ptr<CompressionManager>> friends_;
   };
 
-  for (bool use_dict : {false, true}) {
-    SCOPED_TRACE(use_dict ? "With dict" : "No dict");
+  // Although these compression managers are actually compatible, we must
+  // respect their distinct compatibility names and treat them as incompatible
+  // (or else risk processing data incorrectly)
+  // NOTE: these are not registered in ObjectRegistry to test what happens
+  // when the original CompressionManager might not be available, but
+  // mgr_bar will be registered during the test, with different names to
+  // prevent interference between iterations.
+  auto mgr_foo = std::make_shared<MyManager>("Foo");
+  auto mgr_bar = std::make_shared<MyManager>(UniqueName("Bar"));
 
-    // Although these compression managers are actually compatible, we must
-    // respect their distinct compatibility names and treat them as incompatible
-    // (or else risk processing data incorrectly)
-    // NOTE: these are not registered in ObjectRegistry to test what happens
-    // when the original CompressionManager might not be available, but
-    // mgr_bar will be registered during the test, with different names to
-    // prevent interference between iterations.
-    auto mgr_foo = std::make_shared<MyManager>("Foo");
-    auto mgr_bar = std::make_shared<MyManager>(use_dict ? "Bar1" : "Bar2");
+  // And this one claims to be fully compatible with the built-in compression
+  // manager when it's not fully compatible (for custom CompressionTypes)
+  auto mgr_claim_compatible = std::make_shared<MyManager>("BuiltinV2");
 
-    // And this one claims to be fully compatible with the built-in compression
-    // manager when it's not fully compatible (for custom CompressionTypes)
-    auto mgr_claim_compatible = std::make_shared<MyManager>("BuiltinV2");
+  constexpr uint16_t kValueSize = 10000;
 
-    constexpr uint16_t kValueSize = 10000;
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = 20;
+  BlockBasedTableOptions bbto;
+  bbto.enable_index_compression = false;
+  bbto.format_version = 6;  // Before custom compression alg support
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  // Claims not to use custom compression (and doesn't unless setting a custom
+  // CompressionType)
+  options.compression_manager = mgr_claim_compatible;
+  // Use a built-in compression type with dictionary support
+  options.compression = kLZ4Compression;
+  options.compression_opts.max_dict_bytes = use_dict_ ? kValueSize / 2 : 0;
+  options.compression_opts.parallel_threads = parallel_threads_;
+  DestroyAndReopen(options);
 
-    Options options = CurrentOptions();
-    options.level0_file_num_compaction_trigger = 20;
-    BlockBasedTableOptions bbto;
-    bbto.enable_index_compression = false;
-    bbto.format_version = 6;  // Before custom compression alg support
-    options.table_factory.reset(NewBlockBasedTableFactory(bbto));
-    // Claims not to use custom compression (and doesn't unless setting a custom
-    // CompressionType)
-    options.compression_manager = mgr_claim_compatible;
-    // Use a built-in compression type with dictionary support
-    options.compression = kLZ4Compression;
-    options.compression_opts.max_dict_bytes = kValueSize / 2;
-    DestroyAndReopen(options);
+  Random rnd(404);
+  std::string value;
+  ASSERT_OK(Put("a", test::CompressibleString(&rnd, 0.1, kValueSize, &value)));
+  ASSERT_OK(Flush());
 
-    Random rnd(404);
-    std::string value;
-    ASSERT_OK(
-        Put("a", test::CompressibleString(&rnd, 0.1, kValueSize, &value)));
-    ASSERT_OK(Flush());
+  // That data should be readable without access to the original compression
+  // manager, because it used the built-in CompatibilityName and a built-in
+  // CompressionType
+  options.compression_manager = nullptr;
+  Reopen(options);
+  ASSERT_EQ(Get("a"), value);
+
+  // Verify it was compressed
+  Range r = {"a", "a0"};
+  TablePropertiesCollection tables_properties;
+  ASSERT_OK(db_->GetPropertiesOfTablesInRange(db_->DefaultColumnFamily(), &r, 1,
+                                              &tables_properties));
+  ASSERT_EQ(tables_properties.size(), 1U);
+  EXPECT_LT(tables_properties.begin()->second->data_size, kValueSize / 2);
+  EXPECT_EQ(tables_properties.begin()->second->compression_name, "LZ4");
+
+  // Disallow setting a custom CompressionType with a CompressionManager
+  // claiming to be built-in compatible.
+  options.compression_manager = mgr_claim_compatible;
+  options.compression = kCustomCompression8A;
+  ASSERT_EQ(TryReopen(options).code(), Status::Code::kInvalidArgument);
+
+  options.compression_manager = nullptr;
+  options.compression = kCustomCompressionFE;
+  ASSERT_EQ(TryReopen(options).code(), Status::Code::kInvalidArgument);
+  options.compression =
+      static_cast<CompressionType>(kLastBuiltinCompression + 1);
+  ASSERT_EQ(TryReopen(options).code(), Status::Code::kInvalidArgument);
+
+  // Custom compression schema (different CompatibilityName) not supported
+  // before format_version=7
+  options.compression_manager = mgr_foo;
+  options.compression = kLZ4Compression;
+  ASSERT_EQ(TryReopen(options).code(), Status::Code::kInvalidArgument);
+
+  // Set format version supporting custom compression
+  bbto.format_version = 7;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
 
-    // That data should be readable without access to the original compression
-    // manager, because it used the built-in CompatibilityName and a built-in
-    // CompressionType
-    options.compression_manager = nullptr;
-    Reopen(options);
-    ASSERT_EQ(Get("a"), value);
-
-    // Verify it was compressed
-    Range r = {"a", "a0"};
-    TablePropertiesCollection tables_properties;
-    ASSERT_OK(db_->GetPropertiesOfTablesInRange(db_->DefaultColumnFamily(), &r,
-                                                1, &tables_properties));
-    ASSERT_EQ(tables_properties.size(), 1U);
-    EXPECT_LT(tables_properties.begin()->second->data_size, kValueSize / 2);
-    EXPECT_EQ(tables_properties.begin()->second->compression_name, "LZ4");
-
-    // Disallow setting a custom CompressionType with a CompressionManager
-    // claiming to be built-in compatible.
-    options.compression_manager = mgr_claim_compatible;
-    options.compression = kCustomCompression8A;
-    ASSERT_EQ(TryReopen(options).code(), Status::Code::kInvalidArgument);
-
-    options.compression_manager = nullptr;
-    options.compression = kCustomCompressionFE;
-    ASSERT_EQ(TryReopen(options).code(), Status::Code::kInvalidArgument);
-    options.compression =
-        static_cast<CompressionType>(kLastBuiltinCompression + 1);
-    ASSERT_EQ(TryReopen(options).code(), Status::Code::kInvalidArgument);
-
-    // Custom compression schema (different CompatibilityName) not supported
-    // before format_version=7
-    options.compression_manager = mgr_foo;
-    options.compression = kLZ4Compression;
-    ASSERT_EQ(TryReopen(options).code(), Status::Code::kInvalidArgument);
-
-    // Set format version supporting custom compression
-    bbto.format_version = 7;
-    options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  // Custom compression type not supported with built-in schema name, even
+  // with format_version=7
+  options.compression_manager = mgr_claim_compatible;
+  options.compression = kCustomCompression8B;
+  ASSERT_EQ(TryReopen(options).code(), Status::Code::kInvalidArgument);
 
-    // Custom compression type not supported with built-in schema name, even
-    // with format_version=7
-    options.compression_manager = mgr_claim_compatible;
-    options.compression = kCustomCompression8B;
-    ASSERT_EQ(TryReopen(options).code(), Status::Code::kInvalidArgument);
-
-    // Custom compression schema, but specifying a custom compression type it
-    // doesn't support.
-    options.compression_manager = mgr_foo;
-    options.compression = kCustomCompressionF0;
-    ASSERT_EQ(TryReopen(options).code(), Status::Code::kNotSupported);
-
-    // Using a built-in compression type with fv=7 but named custom schema
-    options.compression = kLZ4Compression;
-    Reopen(options);
-    ASSERT_OK(
-        Put("b", test::CompressibleString(&rnd, 0.1, kValueSize, &value)));
-    ASSERT_OK(Flush());
-    ASSERT_EQ(NumTableFilesAtLevel(0), 2);
-    ASSERT_EQ(Get("b"), value);
-
-    // Verify it was compressed with LZ4
-    r = {"b", "b0"};
-    tables_properties.clear();
-    ASSERT_OK(db_->GetPropertiesOfTablesInRange(db_->DefaultColumnFamily(), &r,
-                                                1, &tables_properties));
-    ASSERT_EQ(tables_properties.size(), 1U);
-    EXPECT_LT(tables_properties.begin()->second->data_size, kValueSize / 2);
-    // Uses new format for "compression_name" property
-    EXPECT_EQ(tables_properties.begin()->second->compression_name, "Foo;04;");
-    EXPECT_EQ(mgr_foo->last_specific_decompressor_type_.LoadRelaxed(),
-              kLZ4Compression);
-
-    // Custom compression type
-    options.compression = kCustomCompression8A;
-    Reopen(options);
-    ASSERT_OK(
-        Put("c", test::CompressibleString(&rnd, 0.1, kValueSize, &value)));
-    EXPECT_EQ(mgr_foo->used_compressor8A_count_, 0);
-    ASSERT_OK(Flush());
-    ASSERT_EQ(NumTableFilesAtLevel(0), 3);
-    ASSERT_EQ(Get("c"), value);
-    EXPECT_EQ(mgr_foo->used_compressor8A_count_, 1);
-
-    // Verify it was compressed with custom format
-    r = {"c", "c0"};
-    tables_properties.clear();
-    ASSERT_OK(db_->GetPropertiesOfTablesInRange(db_->DefaultColumnFamily(), &r,
-                                                1, &tables_properties));
-    ASSERT_EQ(tables_properties.size(), 1U);
-    EXPECT_LT(tables_properties.begin()->second->data_size, kValueSize / 2);
-    EXPECT_EQ(tables_properties.begin()->second->compression_name, "Foo;8A;");
-    EXPECT_EQ(mgr_foo->last_specific_decompressor_type_.LoadRelaxed(),
-              kCustomCompression8A);
-
-    // Also dynamically changeable, because the compression manager will respect
-    // the current setting as reported under the legacy logic
-    ASSERT_OK(dbfull()->SetOptions({{"compression", "kLZ4Compression"}}));
-    ASSERT_OK(
-        Put("d", test::CompressibleString(&rnd, 0.1, kValueSize, &value)));
-    ASSERT_OK(Flush());
-    ASSERT_EQ(NumTableFilesAtLevel(0), 4);
-    ASSERT_EQ(Get("d"), value);
-
-    // Verify it was compressed with LZ4
-    r = {"d", "d0"};
-    tables_properties.clear();
-    ASSERT_OK(db_->GetPropertiesOfTablesInRange(db_->DefaultColumnFamily(), &r,
-                                                1, &tables_properties));
-    ASSERT_EQ(tables_properties.size(), 1U);
-    EXPECT_LT(tables_properties.begin()->second->data_size, kValueSize / 2);
-    EXPECT_EQ(tables_properties.begin()->second->compression_name, "Foo;04;");
-    EXPECT_EQ(mgr_foo->last_specific_decompressor_type_.LoadRelaxed(),
-              kLZ4Compression);
-
-    // Dynamically changeable to custom compressions also
-    ASSERT_OK(dbfull()->SetOptions({{"compression", "kCustomCompression8B"}}));
-    ASSERT_OK(
-        Put("e", test::CompressibleString(&rnd, 0.1, kValueSize, &value)));
-    ASSERT_OK(Flush());
-    ASSERT_EQ(NumTableFilesAtLevel(0), 5);
-    ASSERT_EQ(Get("e"), value);
-
-    // Verify it was compressed with custom format
-    r = {"e", "e0"};
-    tables_properties.clear();
-    ASSERT_OK(db_->GetPropertiesOfTablesInRange(db_->DefaultColumnFamily(), &r,
-                                                1, &tables_properties));
-    ASSERT_EQ(tables_properties.size(), 1U);
-    EXPECT_LT(tables_properties.begin()->second->data_size, kValueSize / 2);
-    EXPECT_EQ(tables_properties.begin()->second->compression_name, "Foo;8B;");
-    EXPECT_EQ(mgr_foo->last_specific_decompressor_type_.LoadRelaxed(),
-              kCustomCompression8B);
-
-    // Fails to re-open with incompatible compression manager (can't find
-    // compression manager Foo because it's not registered nor known by Bar)
-    options.compression_manager = mgr_bar;
-    options.compression = kLZ4Compression;
-    ASSERT_EQ(TryReopen(options).code(), Status::Code::kNotSupported);
-
-    // But should re-open if we make Bar aware of the Foo compression manager
-    mgr_bar->AddFriend(mgr_foo);
-    Reopen(options);
-
-    // Can still read everything
-    ASSERT_EQ(Get("a").size(), kValueSize);
-    ASSERT_EQ(Get("b").size(), kValueSize);
-    ASSERT_EQ(Get("c").size(), kValueSize);
-    ASSERT_EQ(Get("d").size(), kValueSize);
-    ASSERT_EQ(Get("e").size(), kValueSize);
-
-    // Add a file using mgr_bar
-    ASSERT_OK(
-        Put("f", test::CompressibleString(&rnd, 0.1, kValueSize, &value)));
-    ASSERT_OK(Flush());
-    ASSERT_EQ(NumTableFilesAtLevel(0), 6);
-    ASSERT_EQ(Get("f"), value);
-
-    // Verify it was compressed appropriately
-    r = {"f", "f0"};
-    tables_properties.clear();
-    ASSERT_OK(db_->GetPropertiesOfTablesInRange(db_->DefaultColumnFamily(), &r,
-                                                1, &tables_properties));
-    ASSERT_EQ(tables_properties.size(), 1U);
-    EXPECT_LT(tables_properties.begin()->second->data_size, kValueSize / 2);
-    EXPECT_EQ(mgr_bar->last_specific_decompressor_type_.LoadRelaxed(),
-              kLZ4Compression);
-
-    // Fails to re-open with incompatible compression manager (can't find
-    // compression manager Bar because it's not registered nor known by Foo)
-    options.compression_manager = mgr_foo;
-    ASSERT_EQ(TryReopen(options).code(), Status::Code::kNotSupported);
-
-    // Register and re-open
-    auto& library = *ObjectLibrary::Default();
-    library.AddFactory<CompressionManager>(
-        mgr_bar->CompatibilityName(),
-        [mgr_bar](const std::string& /*uri*/,
-                  std::unique_ptr<CompressionManager>* guard,
-                  std::string* /*errmsg*/) {
-          *guard = std::make_unique<MyManager>(mgr_bar->CompatibilityName());
-          return guard->get();
-        });
-    Reopen(options);
-
-    // Can still read everything
-    ASSERT_EQ(Get("a").size(), kValueSize);
-    ASSERT_EQ(Get("b").size(), kValueSize);
-    ASSERT_EQ(Get("c").size(), kValueSize);
-    ASSERT_EQ(Get("d").size(), kValueSize);
-    ASSERT_EQ(Get("e").size(), kValueSize);
-    ASSERT_EQ(Get("f").size(), kValueSize);
-
-    // TODO: test old version of a compression manager unable to read a
-    // compression type
-  }
+  // Custom compression schema, but specifying a custom compression type it
+  // doesn't support.
+  options.compression_manager = mgr_foo;
+  options.compression = kCustomCompressionF0;
+  ASSERT_EQ(TryReopen(options).code(), Status::Code::kNotSupported);
+
+  // Using a built-in compression type with fv=7 but named custom schema
+  options.compression = kLZ4Compression;
+  Reopen(options);
+  ASSERT_OK(Put("b", test::CompressibleString(&rnd, 0.1, kValueSize, &value)));
+  ASSERT_OK(Flush());
+  ASSERT_EQ(NumTableFilesAtLevel(0), 2);
+  ASSERT_EQ(Get("b"), value);
+
+  // Verify it was compressed with LZ4
+  r = {"b", "b0"};
+  tables_properties.clear();
+  ASSERT_OK(db_->GetPropertiesOfTablesInRange(db_->DefaultColumnFamily(), &r, 1,
+                                              &tables_properties));
+  ASSERT_EQ(tables_properties.size(), 1U);
+  EXPECT_LT(tables_properties.begin()->second->data_size, kValueSize / 2);
+  // Uses new format for "compression_name" property
+  EXPECT_EQ(tables_properties.begin()->second->compression_name, "Foo;04;");
+  EXPECT_EQ(mgr_foo->last_specific_decompressor_type_.LoadRelaxed(),
+            kLZ4Compression);
+
+  // Custom compression type
+  options.compression = kCustomCompression8A;
+  Reopen(options);
+  ASSERT_OK(Put("c", test::CompressibleString(&rnd, 0.1, kValueSize, &value)));
+  EXPECT_EQ(mgr_foo->used_compressor8A_count_, 0);
+  ASSERT_OK(Flush());
+  ASSERT_EQ(NumTableFilesAtLevel(0), 3);
+  ASSERT_EQ(Get("c"), value);
+  EXPECT_EQ(mgr_foo->used_compressor8A_count_, 1);
+
+  // Verify it was compressed with custom format
+  r = {"c", "c0"};
+  tables_properties.clear();
+  ASSERT_OK(db_->GetPropertiesOfTablesInRange(db_->DefaultColumnFamily(), &r, 1,
+                                              &tables_properties));
+  ASSERT_EQ(tables_properties.size(), 1U);
+  EXPECT_LT(tables_properties.begin()->second->data_size, kValueSize / 2);
+  EXPECT_EQ(tables_properties.begin()->second->compression_name, "Foo;8A;");
+  EXPECT_EQ(mgr_foo->last_specific_decompressor_type_.LoadRelaxed(),
+            kCustomCompression8A);
+
+  // Also dynamically changeable, because the compression manager will respect
+  // the current setting as reported under the legacy logic
+  ASSERT_OK(dbfull()->SetOptions({{"compression", "kLZ4Compression"}}));
+  ASSERT_OK(Put("d", test::CompressibleString(&rnd, 0.1, kValueSize, &value)));
+  ASSERT_OK(Flush());
+  ASSERT_EQ(NumTableFilesAtLevel(0), 4);
+  ASSERT_EQ(Get("d"), value);
+
+  // Verify it was compressed with LZ4
+  r = {"d", "d0"};
+  tables_properties.clear();
+  ASSERT_OK(db_->GetPropertiesOfTablesInRange(db_->DefaultColumnFamily(), &r, 1,
+                                              &tables_properties));
+  ASSERT_EQ(tables_properties.size(), 1U);
+  EXPECT_LT(tables_properties.begin()->second->data_size, kValueSize / 2);
+  EXPECT_EQ(tables_properties.begin()->second->compression_name, "Foo;04;");
+  EXPECT_EQ(mgr_foo->last_specific_decompressor_type_.LoadRelaxed(),
+            kLZ4Compression);
+
+  // Dynamically changeable to custom compressions also
+  ASSERT_OK(dbfull()->SetOptions({{"compression", "kCustomCompression8B"}}));
+  ASSERT_OK(Put("e", test::CompressibleString(&rnd, 0.1, kValueSize, &value)));
+  ASSERT_OK(Flush());
+  ASSERT_EQ(NumTableFilesAtLevel(0), 5);
+  ASSERT_EQ(Get("e"), value);
+
+  // Verify it was compressed with custom format
+  r = {"e", "e0"};
+  tables_properties.clear();
+  ASSERT_OK(db_->GetPropertiesOfTablesInRange(db_->DefaultColumnFamily(), &r, 1,
+                                              &tables_properties));
+  ASSERT_EQ(tables_properties.size(), 1U);
+  EXPECT_LT(tables_properties.begin()->second->data_size, kValueSize / 2);
+  EXPECT_EQ(tables_properties.begin()->second->compression_name, "Foo;8B;");
+  EXPECT_EQ(mgr_foo->last_specific_decompressor_type_.LoadRelaxed(),
+            kCustomCompression8B);
+
+  // Fails to re-open with incompatible compression manager (can't find
+  // compression manager Foo because it's not registered nor known by Bar)
+  options.compression_manager = mgr_bar;
+  options.compression = kLZ4Compression;
+  ASSERT_EQ(TryReopen(options).code(), Status::Code::kNotSupported);
+
+  // But should re-open if we make Bar aware of the Foo compression manager
+  mgr_bar->AddFriend(mgr_foo);
+  Reopen(options);
+
+  // Can still read everything
+  ASSERT_EQ(Get("a").size(), kValueSize);
+  ASSERT_EQ(Get("b").size(), kValueSize);
+  ASSERT_EQ(Get("c").size(), kValueSize);
+  ASSERT_EQ(Get("d").size(), kValueSize);
+  ASSERT_EQ(Get("e").size(), kValueSize);
+
+  // Add a file using mgr_bar
+  ASSERT_OK(Put("f", test::CompressibleString(&rnd, 0.1, kValueSize, &value)));
+  ASSERT_OK(Flush());
+  ASSERT_EQ(NumTableFilesAtLevel(0), 6);
+  ASSERT_EQ(Get("f"), value);
+
+  // Verify it was compressed appropriately
+  r = {"f", "f0"};
+  tables_properties.clear();
+  ASSERT_OK(db_->GetPropertiesOfTablesInRange(db_->DefaultColumnFamily(), &r, 1,
+                                              &tables_properties));
+  ASSERT_EQ(tables_properties.size(), 1U);
+  EXPECT_LT(tables_properties.begin()->second->data_size, kValueSize / 2);
+  EXPECT_EQ(mgr_bar->last_specific_decompressor_type_.LoadRelaxed(),
+            kLZ4Compression);
+
+  // Fails to re-open with incompatible compression manager (can't find
+  // compression manager Bar because it's not registered nor known by Foo)
+  options.compression_manager = mgr_foo;
+  ASSERT_EQ(TryReopen(options).code(), Status::Code::kNotSupported);
+
+  // Register and re-open
+  auto& library = *ObjectLibrary::Default();
+  library.AddFactory<CompressionManager>(
+      mgr_bar->CompatibilityName(),
+      [mgr_bar](const std::string& /*uri*/,
+                std::unique_ptr<CompressionManager>* guard,
+                std::string* /*errmsg*/) {
+        *guard = std::make_unique<MyManager>(mgr_bar->CompatibilityName());
+        return guard->get();
+      });
+  Reopen(options);
+
+  // Can still read everything
+  ASSERT_EQ(Get("a").size(), kValueSize);
+  ASSERT_EQ(Get("b").size(), kValueSize);
+  ASSERT_EQ(Get("c").size(), kValueSize);
+  ASSERT_EQ(Get("d").size(), kValueSize);
+  ASSERT_EQ(Get("e").size(), kValueSize);
+  ASSERT_EQ(Get("f").size(), kValueSize);
+
+  // TODO: test old version of a compression manager unable to read a
+  // compression type
 }
 
 TEST_F(DBCompressionTest, FailWhenCompressionNotSupportedTest) {
diff --git a/util/simple_mixed_compressor.cc b/util/simple_mixed_compressor.cc
index a1ee40481d8c..282fcd97a5a9 100644
--- a/util/simple_mixed_compressor.cc
+++ b/util/simple_mixed_compressor.cc
@@ -16,7 +16,8 @@ namespace ROCKSDB_NAMESPACE {
 
 // MultiCompressorWrapper implementation
 MultiCompressorWrapper::MultiCompressorWrapper(const CompressionOptions& opts,
-                                               CompressionDict&& dict) {
+                                               CompressionDict&& dict)
+    : opts_(opts) {
   // TODO: make the compression manager a field
   auto builtInManager = GetBuiltinV2CompressionManager();
   const auto& compressions = GetSupportedCompressions();
@@ -59,6 +60,10 @@ const char* RandomMixedCompressor::Name() const {
   return "RandomMixedCompressor";
 }
 
+std::unique_ptr<Compressor> RandomMixedCompressor::Clone() const {
+  return std::make_unique<RandomMixedCompressor>(opts_);
+}
+
 Status RandomMixedCompressor::CompressBlock(
     Slice uncompressed_data, char* compressed_output,
     size_t* compressed_output_size, CompressionType* out_compression_type,
@@ -86,6 +91,10 @@ const char* RoundRobinCompressor::Name() const {
   return "RoundRobinCompressor";
 }
 
+std::unique_ptr<Compressor> RoundRobinCompressor::Clone() const {
+  return std::make_unique<RoundRobinCompressor>(opts_);
+}
+
 Status RoundRobinCompressor::CompressBlock(
     Slice uncompressed_data, char* compressed_output,
     size_t* compressed_output_size, CompressionType* out_compression_type,
diff --git a/util/simple_mixed_compressor.h b/util/simple_mixed_compressor.h
index 0c12d88a0ac3..e55b724976d9 100644
--- a/util/simple_mixed_compressor.h
+++ b/util/simple_mixed_compressor.h
@@ -28,12 +28,14 @@ class MultiCompressorWrapper : public Compressor {
       CacheEntryRole block_type, DictSampleArgs&& dict_samples) override;
 
  protected:
+  const CompressionOptions opts_;
   std::vector<std::unique_ptr<Compressor>> compressors_;
 };
 
 struct RandomMixedCompressor : public MultiCompressorWrapper {
   using MultiCompressorWrapper::MultiCompressorWrapper;
   const char* Name() const override;
+  std::unique_ptr<Compressor> Clone() const override;
   Status CompressBlock(Slice uncompressed_data, char* compressed_output,
                        size_t* compressed_output_size,
                        CompressionType* out_compression_type,
@@ -51,6 +53,7 @@ class RandomMixedCompressionManager : public CompressionManagerWrapper {
 struct RoundRobinCompressor : public MultiCompressorWrapper {
   using MultiCompressorWrapper::MultiCompressorWrapper;
   const char* Name() const override;
+  std::unique_ptr<Compressor> Clone() const override;
   Status CompressBlock(Slice uncompressed_data, char* compressed_output,
                        size_t* compressed_output_size,
                        CompressionType* out_compression_type,

From 42ba71fbbf2d65d57f2ada4b39fad8cca5baf500 Mon Sep 17 00:00:00 2001
From: Xingbo Wang <xbw@fb.com>
Date: Mon, 24 Nov 2025 08:45:40 -0800
Subject: [PATCH 388/500] Start 10.10.0 development (#14148)

Summary:
10.9.0 branch has been cut.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14148

Reviewed By: nmk70

Differential Revision: D87688882

Pulled By: xingbowang

fbshipit-source-id: 5fe95d3c64851b4f9490aed5d92451b38abe008d
---
 HISTORY.md                                    | 23 +++++++++++++++++++
 include/rocksdb/version.h                     |  2 +-
 tools/check_format_compatible.sh              |  2 +-
 .../posix_writable_file_truncate.md           |  1 -
 .../behavior_changes/standalone-range-del.md  |  1 -
 .../bug_fixes/fix-range-del-boundary.md       |  1 -
 .../get_sorted_wal_files_noop_purge_hang.md   |  1 -
 .../bug_fixes/multiscan_backward_seek.md      |  1 -
 .../wal_ttl_clock_regression_underflow.md     |  1 -
 ...rivial_move_support_in_CompactFiles_API.md |  1 -
 .../new_features/auto_tune_manifest.md        |  1 -
 .../new_features/multi-cf-option-migration.md |  1 -
 .../target_file_size_is_upper_bound.md        |  1 -
 ...ll_iterators_in_level_iterators_prepare.md |  1 -
 .../public_api_changes/odr_thread_status.md   |  1 -
 15 files changed, 25 insertions(+), 14 deletions(-)
 delete mode 100644 unreleased_history/behavior_changes/posix_writable_file_truncate.md
 delete mode 100644 unreleased_history/behavior_changes/standalone-range-del.md
 delete mode 100644 unreleased_history/bug_fixes/fix-range-del-boundary.md
 delete mode 100644 unreleased_history/bug_fixes/get_sorted_wal_files_noop_purge_hang.md
 delete mode 100644 unreleased_history/bug_fixes/multiscan_backward_seek.md
 delete mode 100644 unreleased_history/bug_fixes/wal_ttl_clock_regression_underflow.md
 delete mode 100644 unreleased_history/new_features/Trivial_move_support_in_CompactFiles_API.md
 delete mode 100644 unreleased_history/new_features/auto_tune_manifest.md
 delete mode 100644 unreleased_history/new_features/multi-cf-option-migration.md
 delete mode 100644 unreleased_history/new_features/target_file_size_is_upper_bound.md
 delete mode 100644 unreleased_history/performance_improvements/Prefetch_all_iterators_in_level_iterators_prepare.md
 delete mode 100644 unreleased_history/public_api_changes/odr_thread_status.md

diff --git a/HISTORY.md b/HISTORY.md
index c601e2cf8213..551314d7e494 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -1,6 +1,29 @@
 # Rocksdb Change Log
 > NOTE: Entries for next release do not go here. Follow instructions in `unreleased_history/README.txt`
 
+## 10.9.0 (11/21/2025)
+### New Features
+* Added an auto-tuning feature for DB manifest file size that also (by default) improves the safety of existing configurations in case `max_manifest_file_size` is repeatedly exceeded. The new recommendation is to set `max_manifest_file_size` to something small like 1MB and tune `max_manifest_space_amp_pct` as needed to balance write amp and space amp in the manifest. Refer to comments on those options in `DBOptions` for details. Both options are (now) mutable.
+* Added a new API to support option migration for multiple column families
+* Added new option target_file_size_is_upper_bound  that makes most compaction output SST files come close to the target file size without exceeding it, rather than commonly exceeding it by some fraction (current behavior). For now the new behavior is off by default, but we expect to enable it by default in the future.
+* Add a new option allow_trivial_move in CompactionOptions to allow CompactFiles to perform trivial move if possible. By default the flag of allow_trivial_move is false, so it preserve the original behavior.
+
+### Public API Changes
+* To reduce risk of ODR violations or similar, `ROCKSDB_USING_THREAD_STATUS` has been removed from public headers and replaced with static `const bool ThreadStatus::kEnabled`. Some other uses of conditional compilation have been removed from public API headers to reduce risk of ODR violations or other issues.
+
+### Behavior Changes
+* PosixWritableFile now repositions the seek pointer to the new end of file after a call to Truncate.
+* Updated standalone range deletion L0 file compaction behavior to avoid compacting with any newer L0 files (which is expensive and not useful).
+
+### Bug Fixes
+* Fix a bug where compaction with range deletion can persist kTypeMaxValid in MANIFEST as file metadata. kTypeMaxValid is not supposed to be persisted and can change as new value types are introduced. This can cause a forward compatibility issue where older versions of RocksDB don't recognize kTypeMaxValid from newer versions. A new placeholder value type kTypeTruncatedRangeDeletionSentinel is also introduced to replace kTypeMaxValid when reading existing SST files' metadata from MANIFEST. This allows us to strengthen some checks to avoid using kTypeMaxValid in the future.
+* Fixed a bug where `DB::GetSortedWalFiles()` could hang when waiting for a purge operation that found nothing to do (potentially triggered by iterator release, flush, compaction, etc.).
+* Fixed a bug in MultiScan where `max_sequential_skip_in_iterations` could cause the iterator to seek backward to already-unpinned blocks when the same user key spans multiple data blocks, leading to assertion failures or seg fault.
+* Fixed a bug for `WAL_ttl_seconds > 0` use cases where the newest archived WAL files could be incorrectly deleted when the system clock moved backwards.
+
+### Performance Improvements
+* Added optimization that allowed for the asynchronous prefetching of all data outlined in a multiscan iterator. This optimization was applied to the level iterator, which prefetches all data through each of the block-based iterators.
+
 ## 10.8.0 (10/21/2025)
 ### New Features
 * Add kFSPrefetch to FSSupportedOps enum to allow file systems to indicate prefetch support capability, avoiding unnecessary prefetch system calls on file systems that don't support them.
diff --git a/include/rocksdb/version.h b/include/rocksdb/version.h
index 58d5119989a0..7cc7a3de9873 100644
--- a/include/rocksdb/version.h
+++ b/include/rocksdb/version.h
@@ -12,7 +12,7 @@
 // NOTE: in 'main' development branch, this should be the *next*
 // minor or major version number planned for release.
 #define ROCKSDB_MAJOR 10
-#define ROCKSDB_MINOR 9
+#define ROCKSDB_MINOR 10
 #define ROCKSDB_PATCH 0
 
 // Make it easy to do conditional compilation based on version checks, i.e.
diff --git a/tools/check_format_compatible.sh b/tools/check_format_compatible.sh
index 37051c77bb5e..37fab422c312 100755
--- a/tools/check_format_compatible.sh
+++ b/tools/check_format_compatible.sh
@@ -137,7 +137,7 @@ EOF
 
 # To check for DB forward compatibility with loading options (old version
 # reading data from new), as well as backward compatibility
-declare -a db_forward_with_options_refs=("8.6.fb" "8.7.fb" "8.8.fb" "8.9.fb" "8.10.fb" "8.11.fb" "9.0.fb" "9.1.fb" "9.2.fb" "9.3.fb" "9.4.fb" "9.5.fb" "9.6.fb" "9.7.fb" "9.8.fb" "9.9.fb" "9.10.fb" "9.11.fb" "10.0.fb" "10.1.fb" "10.2.fb" "10.3.fb" "10.4.fb" "10.5.fb" "10.6.fb" "10.7.fb" "10.8.fb")
+declare -a db_forward_with_options_refs=("8.6.fb" "8.7.fb" "8.8.fb" "8.9.fb" "8.10.fb" "8.11.fb" "9.0.fb" "9.1.fb" "9.2.fb" "9.3.fb" "9.4.fb" "9.5.fb" "9.6.fb" "9.7.fb" "9.8.fb" "9.9.fb" "9.10.fb" "9.11.fb" "10.0.fb" "10.1.fb" "10.2.fb" "10.3.fb" "10.4.fb" "10.5.fb" "10.6.fb" "10.7.fb" "10.8.fb" "10.9.fb")
 # To check for DB forward compatibility without loading options (in addition
 # to the "with loading options" set), as well as backward compatibility
 declare -a db_forward_no_options_refs=() # N/A at the moment
diff --git a/unreleased_history/behavior_changes/posix_writable_file_truncate.md b/unreleased_history/behavior_changes/posix_writable_file_truncate.md
deleted file mode 100644
index 861702e124b4..000000000000
--- a/unreleased_history/behavior_changes/posix_writable_file_truncate.md
+++ /dev/null
@@ -1 +0,0 @@
-PosixWritableFile now repositions the seek pointer to the new end of file after a call to Truncate.
diff --git a/unreleased_history/behavior_changes/standalone-range-del.md b/unreleased_history/behavior_changes/standalone-range-del.md
deleted file mode 100644
index 6d95bece1964..000000000000
--- a/unreleased_history/behavior_changes/standalone-range-del.md
+++ /dev/null
@@ -1 +0,0 @@
-* Updated standalone range deletion L0 file compaction behavior to avoid compacting with any newer L0 files (which is expensive and not useful).
diff --git a/unreleased_history/bug_fixes/fix-range-del-boundary.md b/unreleased_history/bug_fixes/fix-range-del-boundary.md
deleted file mode 100644
index b75bc6d11d60..000000000000
--- a/unreleased_history/bug_fixes/fix-range-del-boundary.md
+++ /dev/null
@@ -1 +0,0 @@
-* Fix a bug where compaction with range deletion can persist kTypeMaxValid in MANIFEST as file metadata. kTypeMaxValid is not supposed to be persisted and can change as new value types are introduced. This can cause a forward compatibility issue where older versions of RocksDB don't recognize kTypeMaxValid from newer versions. A new placeholder value type kTypeTruncatedRangeDeletionSentinel is also introduced to replace kTypeMaxValid when reading existing SST files' metadata from MANIFEST. This allows us to strengthen some checks to avoid using kTypeMaxValid in the future.
diff --git a/unreleased_history/bug_fixes/get_sorted_wal_files_noop_purge_hang.md b/unreleased_history/bug_fixes/get_sorted_wal_files_noop_purge_hang.md
deleted file mode 100644
index 48f6efea4cbe..000000000000
--- a/unreleased_history/bug_fixes/get_sorted_wal_files_noop_purge_hang.md
+++ /dev/null
@@ -1 +0,0 @@
-Fixed a bug where `DB::GetSortedWalFiles()` could hang when waiting for a purge operation that found nothing to do (potentially triggered by iterator release, flush, compaction, etc.).
\ No newline at end of file
diff --git a/unreleased_history/bug_fixes/multiscan_backward_seek.md b/unreleased_history/bug_fixes/multiscan_backward_seek.md
deleted file mode 100644
index e800b2c067d1..000000000000
--- a/unreleased_history/bug_fixes/multiscan_backward_seek.md
+++ /dev/null
@@ -1 +0,0 @@
-Fixed a bug in MultiScan where `max_sequential_skip_in_iterations` could cause the iterator to seek backward to already-unpinned blocks when the same user key spans multiple data blocks, leading to assertion failures or seg fault.
diff --git a/unreleased_history/bug_fixes/wal_ttl_clock_regression_underflow.md b/unreleased_history/bug_fixes/wal_ttl_clock_regression_underflow.md
deleted file mode 100644
index 4cdf0a07bada..000000000000
--- a/unreleased_history/bug_fixes/wal_ttl_clock_regression_underflow.md
+++ /dev/null
@@ -1 +0,0 @@
-Fixed a bug for `WAL_ttl_seconds > 0` use cases where the newest archived WAL files could be incorrectly deleted when the system clock moved backwards.
\ No newline at end of file
diff --git a/unreleased_history/new_features/Trivial_move_support_in_CompactFiles_API.md b/unreleased_history/new_features/Trivial_move_support_in_CompactFiles_API.md
deleted file mode 100644
index 4c52fc3abf5d..000000000000
--- a/unreleased_history/new_features/Trivial_move_support_in_CompactFiles_API.md
+++ /dev/null
@@ -1 +0,0 @@
-Add a new option allow_trivial_move in CompactionOptions to allow CompactFiles to perform trivial move if possible. By default the flag of allow_trivial_move is false, so it preserve the original behavior.
diff --git a/unreleased_history/new_features/auto_tune_manifest.md b/unreleased_history/new_features/auto_tune_manifest.md
deleted file mode 100644
index 9bc95a05e2ee..000000000000
--- a/unreleased_history/new_features/auto_tune_manifest.md
+++ /dev/null
@@ -1 +0,0 @@
-* Added an auto-tuning feature for DB manifest file size that also (by default) improves the safety of existing configurations in case `max_manifest_file_size` is repeatedly exceeded. The new recommendation is to set `max_manifest_file_size` to something small like 1MB and tune `max_manifest_space_amp_pct` as needed to balance write amp and space amp in the manifest. Refer to comments on those options in `DBOptions` for details. Both options are (now) mutable.
diff --git a/unreleased_history/new_features/multi-cf-option-migration.md b/unreleased_history/new_features/multi-cf-option-migration.md
deleted file mode 100644
index 817286fb0ba0..000000000000
--- a/unreleased_history/new_features/multi-cf-option-migration.md
+++ /dev/null
@@ -1 +0,0 @@
-Added a new API to support option migration for multiple column families
diff --git a/unreleased_history/new_features/target_file_size_is_upper_bound.md b/unreleased_history/new_features/target_file_size_is_upper_bound.md
deleted file mode 100644
index 4dc578949f7a..000000000000
--- a/unreleased_history/new_features/target_file_size_is_upper_bound.md
+++ /dev/null
@@ -1 +0,0 @@
-Added new option target_file_size_is_upper_bound  that makes most compaction output SST files come close to the target file size without exceeding it, rather than commonly exceeding it by some fraction (current behavior). For now the new behavior is off by default, but we expect to enable it by default in the future.
diff --git a/unreleased_history/performance_improvements/Prefetch_all_iterators_in_level_iterators_prepare.md b/unreleased_history/performance_improvements/Prefetch_all_iterators_in_level_iterators_prepare.md
deleted file mode 100644
index ed141b0ceae8..000000000000
--- a/unreleased_history/performance_improvements/Prefetch_all_iterators_in_level_iterators_prepare.md
+++ /dev/null
@@ -1 +0,0 @@
-Added optimization that allowed for the asynchronous prefetching of all data outlined in a multiscan iterator. This optimization was applied to the level iterator, which prefetches all data through each of the block-based iterators.
diff --git a/unreleased_history/public_api_changes/odr_thread_status.md b/unreleased_history/public_api_changes/odr_thread_status.md
deleted file mode 100644
index c3af9367a55d..000000000000
--- a/unreleased_history/public_api_changes/odr_thread_status.md
+++ /dev/null
@@ -1 +0,0 @@
-* To reduce risk of ODR violations or similar, `ROCKSDB_USING_THREAD_STATUS` has been removed from public headers and replaced with static `const bool ThreadStatus::kEnabled`. Some other uses of conditional compilation have been removed from public API headers to reduce risk of ODR violations or other issues.

From 9c2c8f54fa1d8e9960369311706e8be0a20869e3 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Mon, 24 Nov 2025 10:36:12 -0800
Subject: [PATCH 389/500] Fix AutoSkipCompressorWrapper with new logic (#14150)

Summary:
... from https://github.com/facebook/rocksdb/issues/14140. The assertion in the default implementation of CompressorWrapper::MaybeCloneSpecialized() could fail because this wrapper wasn't overriding it when it should. (See the NOTE on that implementation.)

Because this release already has a breaking modification to the Compressor API (adding Clone()), I took this opportunity to add 'const' to MaybeCloneSpecialized(). Also marked some compression classes as 'final' that could be marked as such.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14150

Test Plan: unit test expanded to cover this case (verified failing before). Audited the rest of our CompressorWrappers.

Reviewed By: archang19

Differential Revision: D87793987

Pulled By: pdillinger

fbshipit-source-id: 61c4469b84e4a47451a9942df09277faeeccfe63
---
 include/rocksdb/advanced_compression.h |  6 +--
 test_util/testutil.h                   |  2 +-
 util/auto_tune_compressor.cc           |  9 +++-
 util/auto_tune_compressor.h            |  4 +-
 util/compression.cc                    | 34 ++++++++-------
 util/compression_test.cc               | 60 ++++++++++++++------------
 util/simple_mixed_compressor.cc        |  2 +-
 util/simple_mixed_compressor.h         |  2 +-
 8 files changed, 67 insertions(+), 52 deletions(-)

diff --git a/include/rocksdb/advanced_compression.h b/include/rocksdb/advanced_compression.h
index be3da68f5177..ae707b6479da 100644
--- a/include/rocksdb/advanced_compression.h
+++ b/include/rocksdb/advanced_compression.h
@@ -125,7 +125,7 @@ class Compressor {
   // dictionary associated with a returned compressor must be read from
   // GetSerializedDict().
   virtual std::unique_ptr<Compressor> MaybeCloneSpecialized(
-      CacheEntryRole block_type, DictSampleArgs&& dict_samples) {
+      CacheEntryRole block_type, DictSampleArgs&& dict_samples) const {
     // Default implementation: no specialization
     (void)block_type;
     (void)dict_samples;
@@ -138,7 +138,7 @@ class Compressor {
   // A convenience function when a clone is needed and may or may not be
   // specialized.
   std::unique_ptr<Compressor> CloneMaybeSpecialized(
-      CacheEntryRole block_type, DictSampleArgs&& dict_samples) {
+      CacheEntryRole block_type, DictSampleArgs&& dict_samples) const {
     auto clone = MaybeCloneSpecialized(block_type, std::move(dict_samples));
     if (clone == nullptr) {
       clone = Clone();
@@ -496,7 +496,7 @@ class CompressorWrapper : public Compressor {
   // when the wrapped Compressor uses the default implementation of
   // MaybeCloneSpecialized(). This needs to be overridden if not.
   std::unique_ptr<Compressor> MaybeCloneSpecialized(
-      CacheEntryRole block_type, DictSampleArgs&& dict_samples) override {
+      CacheEntryRole block_type, DictSampleArgs&& dict_samples) const override {
     auto clone =
         wrapped_->MaybeCloneSpecialized(block_type, std::move(dict_samples));
     // Assert default no-op MaybeCloneSpecialized()
diff --git a/test_util/testutil.h b/test_util/testutil.h
index aa837a972b24..3bd97ef14b76 100644
--- a/test_util/testutil.h
+++ b/test_util/testutil.h
@@ -796,7 +796,7 @@ struct CompressorCustomAlg : public CompressorWrapper {
   }
 
   std::unique_ptr<Compressor> MaybeCloneSpecialized(
-      CacheEntryRole block_type, DictSampleArgs&& dict_samples) override {
+      CacheEntryRole block_type, DictSampleArgs&& dict_samples) const override {
     auto clone =
         wrapped_->CloneMaybeSpecialized(block_type, std::move(dict_samples));
     return std::make_unique<CompressorCustomAlg>(std::move(clone));
diff --git a/util/auto_tune_compressor.cc b/util/auto_tune_compressor.cc
index 221ebe30073b..58d6ee968a43 100644
--- a/util/auto_tune_compressor.cc
+++ b/util/auto_tune_compressor.cc
@@ -63,6 +63,13 @@ std::unique_ptr<Compressor> AutoSkipCompressorWrapper::Clone() const {
   return std::make_unique<AutoSkipCompressorWrapper>(wrapped_->Clone(), opts_);
 }
 
+std::unique_ptr<Compressor> AutoSkipCompressorWrapper::MaybeCloneSpecialized(
+    CacheEntryRole block_type, DictSampleArgs&& dict_samples) const {
+  auto clone =
+      wrapped_->CloneMaybeSpecialized(block_type, std::move(dict_samples));
+  return std::make_unique<AutoSkipCompressorWrapper>(std::move(clone), opts_);
+}
+
 Status AutoSkipCompressorWrapper::CompressBlock(
     Slice uncompressed_data, char* compressed_output,
     size_t* compressed_output_size, CompressionType* out_compression_type,
@@ -198,7 +205,7 @@ CompressionType CostAwareCompressor::GetPreferredCompressionType() const {
   return kZSTD;
 }
 std::unique_ptr<Compressor> CostAwareCompressor::MaybeCloneSpecialized(
-    CacheEntryRole block_type, DictSampleArgs&& dict_samples) {
+    CacheEntryRole block_type, DictSampleArgs&& dict_samples) const {
   // TODO: full dictionary compression support. Currently this just falls
   // back on a non-multi compressor when asked to use a dictionary.
   auto idx = allcompressors_index_.back();
diff --git a/util/auto_tune_compressor.h b/util/auto_tune_compressor.h
index ebd09f1c1e6a..e3653fd45205 100644
--- a/util/auto_tune_compressor.h
+++ b/util/auto_tune_compressor.h
@@ -65,6 +65,8 @@ class AutoSkipCompressorWrapper : public CompressorWrapper {
                                      const CompressionOptions& opts);
 
   std::unique_ptr<Compressor> Clone() const override;
+  std::unique_ptr<Compressor> MaybeCloneSpecialized(
+      CacheEntryRole block_type, DictSampleArgs&& dict_samples) const override;
   Status CompressBlock(Slice uncompressed_data, char* compressed_output,
                        size_t* compressed_output_size,
                        CompressionType* out_compression_type,
@@ -156,7 +158,7 @@ class CostAwareCompressor : public Compressor {
   CompressionType GetPreferredCompressionType() const override;
   ManagedWorkingArea ObtainWorkingArea() override;
   std::unique_ptr<Compressor> MaybeCloneSpecialized(
-      CacheEntryRole block_type, DictSampleArgs&& dict_samples) override;
+      CacheEntryRole block_type, DictSampleArgs&& dict_samples) const override;
 
   Status CompressBlock(Slice uncompressed_data, char* compressed_output,
                        size_t* compressed_output_size,
diff --git a/util/compression.cc b/util/compression.cc
index afa1e3d5357e..30b7e8b09e1d 100644
--- a/util/compression.cc
+++ b/util/compression.cc
@@ -162,7 +162,7 @@ class CompressorBase : public Compressor {
   CompressionOptions opts_;
 };
 
-class BuiltinCompressorV1 : public CompressorBase {
+class BuiltinCompressorV1 final : public CompressorBase {
  public:
   const char* Name() const override { return "BuiltinCompressorV1"; }
 
@@ -236,7 +236,7 @@ class CompressorWithSimpleDictBase : public CompressorBase {
 
   std::unique_ptr<Compressor> MaybeCloneSpecialized(
       CacheEntryRole /*block_type*/,
-      DictSampleArgs&& dict_samples) final override {
+      DictSampleArgs&& dict_samples) const final override {
     assert(dict_samples.Verify());
     if (dict_samples.empty()) {
       // Nothing to specialize on
@@ -256,7 +256,7 @@ class CompressorWithSimpleDictBase : public CompressorBase {
 // NOTE: the legacy behavior is to pretend to use dictionary compression when
 // enabled, including storing a dictionary block, but to ignore it. That is
 // matched here.
-class BuiltinSnappyCompressorV2 : public CompressorWithSimpleDictBase {
+class BuiltinSnappyCompressorV2 final : public CompressorWithSimpleDictBase {
  public:
   using CompressorWithSimpleDictBase::CompressorWithSimpleDictBase;
 
@@ -349,7 +349,7 @@ std::pair<char*, size_t> StartCompressBlockV2(Slice uncompressed_data,
   return {alg_output, alg_max_output_size};
 }
 
-class BuiltinZlibCompressorV2 : public CompressorWithSimpleDictBase {
+class BuiltinZlibCompressorV2 final : public CompressorWithSimpleDictBase {
  public:
   using CompressorWithSimpleDictBase::CompressorWithSimpleDictBase;
 
@@ -448,7 +448,7 @@ class BuiltinZlibCompressorV2 : public CompressorWithSimpleDictBase {
   }
 };
 
-class BuiltinBZip2CompressorV2 : public CompressorWithSimpleDictBase {
+class BuiltinBZip2CompressorV2 final : public CompressorWithSimpleDictBase {
  public:
   using CompressorWithSimpleDictBase::CompressorWithSimpleDictBase;
 
@@ -624,7 +624,8 @@ class BuiltinLZ4CompressorV2WithDict : public CompressorWithSimpleDictBase {
   }
 };
 
-class BuiltinLZ4CompressorV2NoDict : public BuiltinLZ4CompressorV2WithDict {
+class BuiltinLZ4CompressorV2NoDict final
+    : public BuiltinLZ4CompressorV2WithDict {
  public:
   BuiltinLZ4CompressorV2NoDict(const CompressionOptions& opts)
       : BuiltinLZ4CompressorV2WithDict(opts, /*dict_data=*/{}) {}
@@ -694,7 +695,7 @@ class BuiltinLZ4CompressorV2NoDict : public BuiltinLZ4CompressorV2WithDict {
   }
 };
 
-class BuiltinLZ4HCCompressorV2 : public CompressorWithSimpleDictBase {
+class BuiltinLZ4HCCompressorV2 final : public CompressorWithSimpleDictBase {
  public:
   using CompressorWithSimpleDictBase::CompressorWithSimpleDictBase;
 
@@ -790,7 +791,7 @@ class BuiltinLZ4HCCompressorV2 : public CompressorWithSimpleDictBase {
   }
 };
 
-class BuiltinXpressCompressorV2 : public CompressorWithSimpleDictBase {
+class BuiltinXpressCompressorV2 final : public CompressorWithSimpleDictBase {
  public:
   using CompressorWithSimpleDictBase::CompressorWithSimpleDictBase;
 
@@ -840,7 +841,7 @@ class BuiltinXpressCompressorV2 : public CompressorWithSimpleDictBase {
   }
 };
 
-class BuiltinZSTDCompressorV2 : public CompressorBase {
+class BuiltinZSTDCompressorV2 final : public CompressorBase {
  public:
   explicit BuiltinZSTDCompressorV2(const CompressionOptions& opts,
                                    CompressionDict&& dict = {})
@@ -972,7 +973,8 @@ class BuiltinZSTDCompressorV2 : public CompressorBase {
   }
 
   std::unique_ptr<Compressor> MaybeCloneSpecialized(
-      CacheEntryRole /*block_type*/, DictSampleArgs&& dict_samples) override {
+      CacheEntryRole /*block_type*/,
+      DictSampleArgs&& dict_samples) const override {
     assert(dict_samples.Verify());
     if (dict_samples.empty()) {
       // Nothing to specialize on
@@ -1009,7 +1011,7 @@ class BuiltinZSTDCompressorV2 : public CompressorBase {
 
 // NOTE: this implementation is intentionally SIMPLE based on existing code
 // and NOT EFFICIENT because this is an old/deprecated format.
-class BuiltinDecompressorV1 : public Decompressor {
+class BuiltinDecompressorV1 final : public Decompressor {
  public:
   const char* Name() const override { return "BuiltinDecompressorV1"; }
 
@@ -1061,7 +1063,7 @@ class BuiltinDecompressorV1 : public Decompressor {
   }
 };
 
-class BuiltinCompressionManagerV1 : public CompressionManager {
+class BuiltinCompressionManagerV1 final : public CompressionManager {
  public:
   BuiltinCompressionManagerV1() = default;
   ~BuiltinCompressionManagerV1() override = default;
@@ -1384,7 +1386,7 @@ class BuiltinDecompressorV2 : public Decompressor {
   }
 };
 
-class BuiltinDecompressorV2SnappyOnly : public BuiltinDecompressorV2 {
+class BuiltinDecompressorV2SnappyOnly final : public BuiltinDecompressorV2 {
  public:
   const char* Name() const override {
     return "BuiltinDecompressorV2SnappyOnly";
@@ -1412,7 +1414,7 @@ class BuiltinDecompressorV2SnappyOnly : public BuiltinDecompressorV2 {
   }
 };
 
-class BuiltinDecompressorV2WithDict : public BuiltinDecompressorV2 {
+class BuiltinDecompressorV2WithDict final : public BuiltinDecompressorV2 {
  public:
   explicit BuiltinDecompressorV2WithDict(const Slice& dict) : dict_(dict) {}
 
@@ -1505,7 +1507,7 @@ class BuiltinDecompressorV2OptimizeZstd : public BuiltinDecompressorV2 {
                            std::unique_ptr<Decompressor>* /*out*/) override;
 };
 
-class BuiltinDecompressorV2OptimizeZstdWithDict
+class BuiltinDecompressorV2OptimizeZstdWithDict final
     : public BuiltinDecompressorV2OptimizeZstd {
  public:
   explicit BuiltinDecompressorV2OptimizeZstdWithDict(const Slice& dict)
@@ -1569,7 +1571,7 @@ Status BuiltinDecompressorV2OptimizeZstd::MaybeCloneForDict(
       serialized_dict);
   return Status::OK();
 }
-class BuiltinCompressionManagerV2 : public CompressionManager {
+class BuiltinCompressionManagerV2 final : public CompressionManager {
  public:
   BuiltinCompressionManagerV2() = default;
   ~BuiltinCompressionManagerV2() override = default;
diff --git a/util/compression_test.cc b/util/compression_test.cc
index c4a3baa7a49e..06571f233bf3 100644
--- a/util/compression_test.cc
+++ b/util/compression_test.cc
@@ -1364,7 +1364,8 @@ TEST_P(DBCompressionTestMaybeParallel, CompressionManagerWrapper) {
     }
 
     std::unique_ptr<Compressor> MaybeCloneSpecialized(
-        CacheEntryRole block_type, DictSampleArgs&& dict_samples) override {
+        CacheEntryRole block_type,
+        DictSampleArgs&& dict_samples) const override {
       std::unique_ptr<Compressor> result = std::make_unique<MyCompressor>(
           wrapped_->CloneMaybeSpecialized(block_type, std::move(dict_samples)));
       if (block_type == CacheEntryRole::kDataBlock) {
@@ -1991,34 +1992,37 @@ class DBAutoSkip : public DBTestBase {
 };
 
 TEST_F(DBAutoSkip, AutoSkipCompressionManager) {
-  for (auto type : GetSupportedCompressions()) {
-    if (type == kNoCompression) {
-      continue;
+  for (uint32_t max_dict_bytes : {0, 10000}) {
+    for (auto type : GetSupportedCompressions()) {
+      if (type == kNoCompression) {
+        continue;
+      }
+      options.compression = type;
+      options.bottommost_compression = type;
+      options.compression_opts.max_dict_bytes = max_dict_bytes;
+      DestroyAndReopen(options);
+      const int kValueSize = 20000;
+      // This will set the rejection ratio to 60%
+      CompressionUnfriendlyPut(6, kValueSize);
+      CompressionFriendlyPut(4, kValueSize);
+      // This will verify all the data block compressions are bypassed based on
+      // previous prediction
+      CompressionUnfriendlyPut(6, kValueSize);
+      CompressionFriendlyPut(4, kValueSize);
+      // This will set the rejection ratio to 40%
+      CompressionUnfriendlyPut(4, kValueSize);
+      CompressionFriendlyPut(6, kValueSize);
+      // This will verify all the data block compression are attempted based on
+      // previous prediction
+      // Compression will be rejected for 6 compression unfriendly blocks
+      // Compression will be accepted for 4 compression friendly blocks
+      CompressionUnfriendlyPut(6, kValueSize);
+      CompressionFriendlyPut(4, kValueSize);
+      // Extra block write to ensure that the all above cases are checked
+      CompressionFriendlyPut(6, kValueSize);
+      CompressionFriendlyPut(4, kValueSize);
+      ASSERT_OK(Flush());
     }
-    options.compression = type;
-    options.bottommost_compression = type;
-    DestroyAndReopen(options);
-    const int kValueSize = 20000;
-    // This will set the rejection ratio to 60%
-    CompressionUnfriendlyPut(6, kValueSize);
-    CompressionFriendlyPut(4, kValueSize);
-    // This will verify all the data block compressions are bypassed based on
-    // previous prediction
-    CompressionUnfriendlyPut(6, kValueSize);
-    CompressionFriendlyPut(4, kValueSize);
-    // This will set the rejection ratio to 40%
-    CompressionUnfriendlyPut(4, kValueSize);
-    CompressionFriendlyPut(6, kValueSize);
-    // This will verify all the data block compression are attempted based on
-    // previous prediction
-    // Compression will be rejected for 6 compression unfriendly blocks
-    // Compression will be accepted for 4 compression friendly blocks
-    CompressionUnfriendlyPut(6, kValueSize);
-    CompressionFriendlyPut(4, kValueSize);
-    // Extra block write to ensure that the all above cases are checked
-    CompressionFriendlyPut(6, kValueSize);
-    CompressionFriendlyPut(4, kValueSize);
-    ASSERT_OK(Flush());
   }
 }
 class CostAwareTestFlushBlockPolicy : public FlushBlockPolicy {
diff --git a/util/simple_mixed_compressor.cc b/util/simple_mixed_compressor.cc
index 282fcd97a5a9..381cf2ec52c4 100644
--- a/util/simple_mixed_compressor.cc
+++ b/util/simple_mixed_compressor.cc
@@ -48,7 +48,7 @@ Compressor::ManagedWorkingArea MultiCompressorWrapper::ObtainWorkingArea() {
 }
 
 std::unique_ptr<Compressor> MultiCompressorWrapper::MaybeCloneSpecialized(
-    CacheEntryRole block_type, DictSampleArgs&& dict_samples) {
+    CacheEntryRole block_type, DictSampleArgs&& dict_samples) const {
   // TODO: full dictionary compression support. Currently this just falls
   // back on a non-multi compressor when asked to use a dictionary.
   return compressors_.back()->MaybeCloneSpecialized(block_type,
diff --git a/util/simple_mixed_compressor.h b/util/simple_mixed_compressor.h
index e55b724976d9..79ba7b130c86 100644
--- a/util/simple_mixed_compressor.h
+++ b/util/simple_mixed_compressor.h
@@ -25,7 +25,7 @@ class MultiCompressorWrapper : public Compressor {
   CompressionType GetPreferredCompressionType() const override;
   ManagedWorkingArea ObtainWorkingArea() override;
   std::unique_ptr<Compressor> MaybeCloneSpecialized(
-      CacheEntryRole block_type, DictSampleArgs&& dict_samples) override;
+      CacheEntryRole block_type, DictSampleArgs&& dict_samples) const override;
 
  protected:
   const CompressionOptions opts_;

From 9e14d06143dae681d252cb0434bea667995eaede Mon Sep 17 00:00:00 2001
From: Xingbo Wang <xbw@fb.com>
Date: Mon, 24 Nov 2025 10:48:09 -0800
Subject: [PATCH 390/500] Support ccache in make file (#14123)

Summary:
Support ccache in make file

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14123

Test Plan: local build

Reviewed By: cbi42

Differential Revision: D87332892

Pulled By: xingbowang

fbshipit-source-id: 2088bd19bdab1bd7070734c886200be80f1a65af
---
 Makefile | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/Makefile b/Makefile
index 403c804c17f7..cfbeb2a90460 100644
--- a/Makefile
+++ b/Makefile
@@ -296,6 +296,28 @@ $(info $(shell $(CC) --version))
 $(info $(shell $(CXX) --version))
 endif
 
+# ccache support
+# Set USE_CCACHE=1 to enable ccache, or let it auto-detect
+ifndef USE_CCACHE
+  CCACHE := $(shell which ccache 2>/dev/null)
+  ifneq ($(CCACHE),)
+    USE_CCACHE := 1
+  else
+    USE_CCACHE := 0
+  endif
+endif
+
+ifeq ($(USE_CCACHE), 1)
+  CCACHE := $(shell which ccache 2>/dev/null)
+  ifneq ($(CCACHE),)
+    $(info Using ccache: $(CCACHE))
+    CC := $(CCACHE) $(CC)
+    CXX := $(CCACHE) $(CXX)
+  else
+    $(warning ccache requested but not found in PATH)
+  endif
+endif
+
 missing_make_config_paths := $(shell				\
 	grep "\./\S*\|/\S*" -o $(CURDIR)/make_config.mk | 	\
 	while read path;					\

From ac412b10955d5a1d3d99aff8edf94eae1e4a22d5 Mon Sep 17 00:00:00 2001
From: Andrew Chang <andrewrchang@meta.com>
Date: Tue, 25 Nov 2025 09:01:20 -0800
Subject: [PATCH 391/500] Add checks to terminate early when backup is stopped
 (#14129)

Summary:
I want to reduce the time from when we call `StopBackup` to `CreateNewBackup` returning `BackupStopped`. We already check for the `stop_backup_` inside `CopyOrCreateFile` and `ReadFileAndComputeChecksum`, but we should add a check at the top of these methods to abort immediately. This could help save some latency from the file system metadata operations, like creating the sequential file and writable file.

We also want to update the API documentation for `StopBackup` which currently does not indicate that once it is called, all subsequent requests to create backups will fail.

In a follow up PR, we should also add coverage of `StopBackup` to the crash tests.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14129

Test Plan:
We were missing unit test coverage for `StopBackup`. I added test cases which cancel backups at different points in time.

Once this change is rolled out to production, we can monitor the DB close latencies, which depend on first cancelling ongoing backups

Reviewed By: pdillinger

Differential Revision: D87356536

Pulled By: archang19

fbshipit-source-id: 687094a41f096f6a156be65b2cce0b5054fb26f2
---
 include/rocksdb/utilities/backup_engine.h |  9 ++-
 utilities/backup/backup_engine.cc         | 21 ++++++-
 utilities/backup/backup_engine_test.cc    | 74 +++++++++++++++++++++++
 3 files changed, 101 insertions(+), 3 deletions(-)

diff --git a/include/rocksdb/utilities/backup_engine.h b/include/rocksdb/utilities/backup_engine.h
index 045fdb06aa49..1961691be15e 100644
--- a/include/rocksdb/utilities/backup_engine.h
+++ b/include/rocksdb/utilities/backup_engine.h
@@ -621,7 +621,14 @@ class BackupEngineAppendOnlyBase {
   // The backup will stop ASAP and the call to CreateNewBackup will
   // return Status::Incomplete(). It will not clean up after itself, but
   // the state will remain consistent. The state will be cleaned up the
-  // next time you call CreateNewBackup or GarbageCollect.
+  // next time you call CreateNewBackup or GarbageCollect for the same backup
+  // directory on a new BackupEngine object.
+  //
+  // NOTE: This is a one-way operation. Once StopBackup() is called on a
+  // BackupEngine instance, all subsequent backup requests (CreateNewBackup,
+  // CreateNewBackupWithMetadata) will fail with Status::Incomplete().
+  // To create new backups after calling StopBackup(), you must open a new
+  // BackupEngine instance.
   virtual void StopBackup() = 0;
 
   // Will delete any files left over from incomplete creation or deletion of
diff --git a/utilities/backup/backup_engine.cc b/utilities/backup/backup_engine.cc
index 420dc8155e2c..3eedfa13c6c6 100644
--- a/utilities/backup/backup_engine.cc
+++ b/utilities/backup/backup_engine.cc
@@ -615,6 +615,10 @@ class BackupEngineImpl {
                                       std::string* checksum_hex,
                                       const Temperature src_temperature) const;
 
+  // Helper method to check if backup should be stopped. Can be overridden
+  // via sync points for testing.
+  bool ShouldStopBackup() const;
+
   // Obtain db_id and db_session_id from the table properties of file_path
   Status GetFileDbIdentities(Env* src_env, const EnvOptions& src_env_options,
                              const std::string& file_path,
@@ -2353,6 +2357,10 @@ IOStatus BackupEngineImpl::CopyOrCreateFile(
     Temperature dst_temperature, uint64_t* bytes_toward_next_callback,
     uint64_t* size, std::string* checksum_hex) {
   assert(src.empty() != contents.empty());
+  if (ShouldStopBackup()) {
+    return status_to_io_status(Status::Incomplete("Backup stopped"));
+  }
+
   IOStatus io_s;
   std::unique_ptr<FSWritableFile> dst_file;
   std::unique_ptr<FSSequentialFile> src_file;
@@ -2413,7 +2421,7 @@ IOStatus BackupEngineImpl::CopyOrCreateFile(
   Slice data;
   const IOOptions opts;
   do {
-    if (stop_backup_.load(std::memory_order_acquire)) {
+    if (ShouldStopBackup()) {
       return status_to_io_status(Status::Incomplete("Backup stopped"));
     }
     if (!src.empty()) {
@@ -2749,6 +2757,12 @@ IOStatus BackupEngineImpl::AddBackupFileWorkItem(
   return IOStatus::OK();
 }
 
+bool BackupEngineImpl::ShouldStopBackup() const {
+  bool should_stop = stop_backup_.load(std::memory_order_acquire);
+  TEST_SYNC_POINT_CALLBACK("BackupEngineImpl::ShouldStopBackup", &should_stop);
+  return should_stop;
+}
+
 IOStatus BackupEngineImpl::ReadFileAndComputeChecksum(
     const std::string& src, const std::shared_ptr<FileSystem>& src_fs,
     const EnvOptions& src_env_options, uint64_t size_limit,
@@ -2756,6 +2770,9 @@ IOStatus BackupEngineImpl::ReadFileAndComputeChecksum(
   if (checksum_hex == nullptr) {
     return status_to_io_status(Status::Aborted("Checksum pointer is null"));
   }
+  if (ShouldStopBackup()) {
+    return status_to_io_status(Status::Incomplete("Backup stopped"));
+  }
   uint32_t checksum_value = 0;
   if (size_limit == 0) {
     size_limit = std::numeric_limits<uint64_t>::max();
@@ -2783,7 +2800,7 @@ IOStatus BackupEngineImpl::ReadFileAndComputeChecksum(
   Slice data;
 
   do {
-    if (stop_backup_.load(std::memory_order_acquire)) {
+    if (ShouldStopBackup()) {
       return status_to_io_status(Status::Incomplete("Backup stopped"));
     }
     size_t buffer_to_read =
diff --git a/utilities/backup/backup_engine_test.cc b/utilities/backup/backup_engine_test.cc
index 9438b8574451..d31f34ef887a 100644
--- a/utilities/backup/backup_engine_test.cc
+++ b/utilities/backup/backup_engine_test.cc
@@ -43,6 +43,7 @@
 #include "test_util/sync_point.h"
 #include "test_util/testharness.h"
 #include "test_util/testutil.h"
+#include "util/atomic.h"
 #include "util/cast_util.h"
 #include "util/mutexlock.h"
 #include "util/random.h"
@@ -4790,6 +4791,79 @@ TEST_F(BackupEngineTest, IOBufferSize) {
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
 }
 
+// Test stopping backup at different points in the backup lifecycle
+// Uses randomized stop points with geometric distribution to better catch
+// edge cases across multiple iterations.
+TEST_F(BackupEngineTest, StopBackupAtDifferentStages) {
+  const int keys_iteration = 5000;
+  const int num_iterations = 10;
+
+  // Enable multi-threaded backup
+  engine_options_->max_background_operations = 7;
+
+  // Generate DB once and reuse across iterations
+  OpenDBAndBackupEngine(true);
+  FillDB(db_.get(), 0, keys_iteration);
+
+  Random rnd(301);
+
+  for (int iteration = 0; iteration < num_iterations; iteration++) {
+    // Generate stop threshold using skewed distribution
+    // Smaller numbers are more likely, which is more interesting for testing
+    // Range: [0, 2^7-1] = [0, 127] with exponential bias towards 0
+    int stop_after_calls = rnd.Skewed(7);
+
+    RelaxedAtomic<int> call_count{0};
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+        "BackupEngineImpl::ShouldStopBackup", [&](void* arg) {
+          call_count.FetchAddRelaxed(1);
+          if (call_count.LoadRelaxed() > stop_after_calls) {
+            bool* should_stop = static_cast<bool*>(arg);
+            *should_stop = true;
+          }
+        });
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+    // Create backup - it may complete successfully or be stopped
+    IOStatus s = backup_engine_->CreateNewBackup(db_.get());
+
+    // Verify that ShouldStopBackup was called
+    ASSERT_GT(call_count.LoadRelaxed(), 0);
+
+    if (s.IsIncomplete()) {
+      // Backup was stopped - verify it's the expected error
+      ASSERT_TRUE(s.ToString().find("Backup stopped") != std::string::npos)
+          << "Unexpected incomplete status for threshold " << stop_after_calls
+          << ": " << s.ToString();
+      ASSERT_GT(call_count.LoadRelaxed(), stop_after_calls)
+          << "Expected call_count > stop_after_calls";
+
+      // Verify that no valid backup was created
+      std::vector<BackupInfo> backup_info;
+      backup_engine_->GetBackupInfo(&backup_info);
+      ASSERT_EQ(0, backup_info.size());
+    } else {
+      // Backup completed successfully before reaching the stop threshold
+      ASSERT_OK(s) << "Unexpected error for threshold " << stop_after_calls;
+      ASSERT_LE(call_count.LoadRelaxed(), stop_after_calls)
+          << "Backup completed but call_count exceeded threshold";
+
+      // Verify a backup was created
+      std::vector<BackupInfo> backup_info;
+      backup_engine_->GetBackupInfo(&backup_info);
+      ASSERT_EQ(1, backup_info.size());
+
+      // Clean up the successful backup for next iteration
+      ASSERT_OK(backup_engine_->DeleteBackup(backup_info[0].backup_id));
+    }
+
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+  }
+
+  CloseDBAndBackupEngine();
+}
+
 }  // namespace
 
 }  // namespace ROCKSDB_NAMESPACE

From 4951494a27cf1c4c886297a1a4e1e5fdd2473ebe Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Mon, 1 Dec 2025 13:21:34 -0800
Subject: [PATCH 392/500] Continue migration of HCC impl to BitFields (#14027)

Summary:
Continuing work from https://github.com/facebook/rocksdb/issues/13965. Here I'm migrating the "next with shift" kind of bit field and for that I've added an API for atomic additive transformations that can be combined into a single atomic update for multiple fields. (I implemented more features than needed, just in case they are needed someday and to demonstrate what is possible.)

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14027

Test Plan: BitFields unit test updated/added, existing HCC tests

Reviewed By: xingbowang

Differential Revision: D83895094

Pulled By: pdillinger

fbshipit-source-id: e4487f34f5607b20f94b85a645ca654e6401e35d
---
 cache/clock_cache.cc    | 225 ++++++++++++++++++----------------------
 cache/clock_cache.h     |  77 ++++++++------
 test_util/sync_point.cc |   4 +
 test_util/sync_point.h  |  32 ++++++
 util/bit_fields.h       | 156 ++++++++++++++++++++++++----
 util/slice_test.cc      |  92 ++++++++++++++++
 6 files changed, 409 insertions(+), 177 deletions(-)

diff --git a/cache/clock_cache.cc b/cache/clock_cache.cc
index e65a3cf12f44..0ef599857d6a 100644
--- a/cache/clock_cache.cc
+++ b/cache/clock_cache.cc
@@ -1752,26 +1752,6 @@ inline void GetHomeIndexAndShift(uint64_t length_info, uint64_t hash,
   assert(*home < LengthInfoToUsedLength(length_info));
 }
 
-inline int GetShiftFromNextWithShift(uint64_t next_with_shift) {
-  return BitwiseAnd(next_with_shift,
-                    AutoHyperClockTable::HandleImpl::kShiftMask);
-}
-
-inline size_t GetNextFromNextWithShift(uint64_t next_with_shift) {
-  return static_cast<size_t>(next_with_shift >>
-                             AutoHyperClockTable::HandleImpl::kNextShift);
-}
-
-inline uint64_t MakeNextWithShift(size_t next, int shift) {
-  return (uint64_t{next} << AutoHyperClockTable::HandleImpl::kNextShift) |
-         static_cast<uint64_t>(shift);
-}
-
-inline uint64_t MakeNextWithShiftEnd(size_t head, int shift) {
-  return AutoHyperClockTable::HandleImpl::kNextEndFlags |
-         MakeNextWithShift(head, shift);
-}
-
 // Helper function for Lookup
 inline bool MatchAndRef(const UniqueId64x2* hashed_key, const ClockHandle& h,
                         int shift = 0, size_t home = 0,
@@ -1821,36 +1801,39 @@ inline bool MatchAndRef(const UniqueId64x2* hashed_key, const ClockHandle& h,
   }
 }
 
+using NextWithShift = AutoHyperClockTable::HandleImpl::NextWithShift;
+
 // Assumes a chain rewrite lock prevents concurrent modification of
 // these chain pointers
 void UpgradeShiftsOnRange(AutoHyperClockTable::HandleImpl* arr,
-                          size_t& frontier, uint64_t stop_before_or_new_tail,
-                          int old_shift, int new_shift) {
+                          size_t& frontier,
+                          NextWithShift stop_before_or_new_tail, int old_shift,
+                          int new_shift) {
   assert(frontier != SIZE_MAX);
   assert(new_shift == old_shift + 1);
   (void)old_shift;
   (void)new_shift;
-  using HandleImpl = AutoHyperClockTable::HandleImpl;
   for (;;) {
-    uint64_t next_with_shift = arr[frontier].chain_next_with_shift.Load();
-    assert(GetShiftFromNextWithShift(next_with_shift) == old_shift);
+    NextWithShift next_with_shift = arr[frontier].chain_next_with_shift.Load();
+    assert(next_with_shift.GetShift() == old_shift);
     if (next_with_shift == stop_before_or_new_tail) {
       // Stopping at entry with pointer matching "stop before"
-      assert(!HandleImpl::IsEnd(next_with_shift));
+      assert(!next_with_shift.IsEnd());
       return;
     }
-    if (HandleImpl::IsEnd(next_with_shift)) {
+    if (next_with_shift.IsEnd()) {
       // Also update tail to new tail
-      assert(HandleImpl::IsEnd(stop_before_or_new_tail));
+      assert(stop_before_or_new_tail.IsEnd());
       arr[frontier].chain_next_with_shift.Store(stop_before_or_new_tail);
       // Mark nothing left to upgrade
       frontier = SIZE_MAX;
       return;
     }
     // Next is another entry to process, so upgrade and advance frontier
-    arr[frontier].chain_next_with_shift.FetchAdd(1U);
-    assert(GetShiftFromNextWithShift(next_with_shift + 1) == new_shift);
-    frontier = GetNextFromNextWithShift(next_with_shift);
+    arr[frontier].chain_next_with_shift.Apply(
+        NextWithShift::Shift::PlusTransformPromiseNoOverflow(1U));
+    assert(next_with_shift.GetShift() + 1 == new_shift);
+    frontier = next_with_shift.GetNext();
   }
 }
 
@@ -1888,19 +1871,19 @@ class AutoHyperClockTable::ChainRewriteLock {
   // RAII wrap existing lock held (or end)
   explicit ChainRewriteLock(HandleImpl* h,
                             RelaxedAtomic<uint64_t>& /*yield_count*/,
-                            uint64_t already_locked_or_end)
+                            NextWithShift already_locked_or_end)
       : head_ptr_(&h->head_next_with_shift) {
     saved_head_ = already_locked_or_end;
     // already locked or end
-    assert(saved_head_ & HandleImpl::kHeadLocked);
+    assert(saved_head_.IsLocked());
   }
 
   ~ChainRewriteLock() {
     if (!IsEnd()) {
       // Release lock
-      uint64_t old = head_ptr_->FetchAnd(~HandleImpl::kHeadLocked);
-      (void)old;
-      assert((old & HandleImpl::kNextEndFlags) == HandleImpl::kHeadLocked);
+      NextWithShift old;
+      head_ptr_->Apply(NextWithShift::LockedFlag::ClearTransform(), &old);
+      assert(old.IsLockedNotEnd());
     }
   }
 
@@ -1910,12 +1893,13 @@ class AutoHyperClockTable::ChainRewriteLock {
   }
 
   // Expected current state, assuming no parallel updates.
-  uint64_t GetSavedHead() const { return saved_head_; }
+  NextWithShift GetSavedHead() const { return saved_head_; }
 
-  bool CasUpdate(uint64_t next_with_shift,
+  bool CasUpdate(NextWithShift next_with_shift,
                  RelaxedAtomic<uint64_t>& yield_count) {
-    uint64_t new_head = next_with_shift | HandleImpl::kHeadLocked;
-    uint64_t expected = GetSavedHead();
+    NextWithShift new_head =
+        next_with_shift.With<NextWithShift::LockedFlag>(true);
+    NextWithShift expected = GetSavedHead();
     bool success = head_ptr_->CasStrong(expected, new_head);
     if (success) {
       // Ensure IsEnd() is kept up-to-date, including for dtor
@@ -1924,7 +1908,7 @@ class AutoHyperClockTable::ChainRewriteLock {
       // Parallel update to head, such as Insert()
       if (IsEnd()) {
         // Didn't previously hold a lock
-        if (HandleImpl::IsEnd(expected)) {
+        if (expected.IsEnd()) {
           // Still don't need to
           saved_head_ = expected;
         } else {
@@ -1933,28 +1917,25 @@ class AutoHyperClockTable::ChainRewriteLock {
         }
       } else {
         // Parallel update must preserve our lock
-        assert((expected & HandleImpl::kNextEndFlags) ==
-               HandleImpl::kHeadLocked);
+        assert(expected.IsLockedNotEnd());
         saved_head_ = expected;
       }
     }
     return success;
   }
 
-  bool IsEnd() const { return HandleImpl::IsEnd(saved_head_); }
+  bool IsEnd() const { return saved_head_.IsEnd(); }
 
  private:
   void Acquire(RelaxedAtomic<uint64_t>& yield_count) {
     for (;;) {
       // Acquire removal lock on the chain
-      uint64_t old_head = head_ptr_->FetchOr(HandleImpl::kHeadLocked);
-      if ((old_head & HandleImpl::kNextEndFlags) != HandleImpl::kHeadLocked) {
+      NextWithShift old_head;
+      head_ptr_->Apply(NextWithShift::LockedFlag::SetTransform(), &old_head,
+                       &saved_head_);
+      if (!old_head.IsLockedNotEnd()) {
         // Either acquired the lock or lock not needed (end)
-        assert((old_head & HandleImpl::kNextEndFlags) == 0 ||
-               (old_head & HandleImpl::kNextEndFlags) ==
-                   HandleImpl::kNextEndFlags);
-
-        saved_head_ = old_head | HandleImpl::kHeadLocked;
+        assert(old_head.IsEnd() == old_head.IsLocked());
         break;
       }
       // NOTE: one of the few yield-wait loops, which is rare enough in practice
@@ -1965,8 +1946,8 @@ class AutoHyperClockTable::ChainRewriteLock {
     }
   }
 
-  AcqRelAtomic<uint64_t>* head_ptr_;
-  uint64_t saved_head_;
+  AcqRelBitFieldsAtomic<NextWithShift>* head_ptr_;
+  NextWithShift saved_head_;
 };
 
 AutoHyperClockTable::AutoHyperClockTable(
@@ -2021,9 +2002,9 @@ AutoHyperClockTable::AutoHyperClockTable(
 #endif
     if (major + i < used_length) {
       array_[i].head_next_with_shift.StoreRelaxed(
-          MakeNextWithShiftEnd(i, max_shift));
+          NextWithShift::MakeEnd(i, max_shift));
       array_[major + i].head_next_with_shift.StoreRelaxed(
-          MakeNextWithShiftEnd(major + i, max_shift));
+          NextWithShift::MakeEnd(major + i, max_shift));
 #ifndef NDEBUG  // Extra invariant checking
       GetHomeIndexAndShift(length_info, i, &home, &shift);
       assert(home == i);
@@ -2034,7 +2015,7 @@ AutoHyperClockTable::AutoHyperClockTable(
 #endif
     } else {
       array_[i].head_next_with_shift.StoreRelaxed(
-          MakeNextWithShiftEnd(i, min_shift));
+          NextWithShift::MakeEnd(i, min_shift));
 #ifndef NDEBUG  // Extra invariant checking
       GetHomeIndexAndShift(length_info, i, &home, &shift);
       assert(home == i);
@@ -2066,8 +2047,10 @@ AutoHyperClockTable::~AutoHyperClockTable() {
   // just a reasonable frontier past what we expect to have written.
 #ifdef MUST_FREE_HEAP_ALLOCATIONS
   for (size_t i = used_end; i < array_.Count() && i < used_end + 64U; i++) {
-    assert(array_[i].head_next_with_shift.LoadRelaxed() == 0);
-    assert(array_[i].chain_next_with_shift.LoadRelaxed() == 0);
+    assert(array_[i].head_next_with_shift.LoadRelaxed() ==
+           HandleImpl::kUnusedMarker);
+    assert(array_[i].chain_next_with_shift.LoadRelaxed() ==
+           HandleImpl::kUnusedMarker);
     assert(array_[i].meta.LoadRelaxed() == 0);
   }
 #endif          // MUST_FREE_HEAP_ALLOCATIONS
@@ -2089,11 +2072,9 @@ AutoHyperClockTable::~AutoHyperClockTable() {
         usage_.FetchSubRelaxed(h.total_charge);
         occupancy_.FetchSubRelaxed(1U);
         was_populated[i] = true;
-        if (!HandleImpl::IsEnd(h.chain_next_with_shift.LoadRelaxed())) {
-          assert((h.chain_next_with_shift.LoadRelaxed() &
-                  HandleImpl::kHeadLocked) == 0);
-          size_t next =
-              GetNextFromNextWithShift(h.chain_next_with_shift.LoadRelaxed());
+        if (!h.chain_next_with_shift.LoadRelaxed().IsEnd()) {
+          assert(!h.chain_next_with_shift.LoadRelaxed().IsLocked());
+          size_t next = h.chain_next_with_shift.LoadRelaxed().GetNext();
           assert(!was_pointed_to[next]);
           was_pointed_to[next] = true;
         }
@@ -2105,9 +2086,8 @@ AutoHyperClockTable::~AutoHyperClockTable() {
         break;
     }
 #ifndef NDEBUG  // Extra invariant checking
-    if (!HandleImpl::IsEnd(h.head_next_with_shift.LoadRelaxed())) {
-      size_t next =
-          GetNextFromNextWithShift(h.head_next_with_shift.LoadRelaxed());
+    if (!h.head_next_with_shift.LoadRelaxed().IsEnd()) {
+      size_t next = h.head_next_with_shift.LoadRelaxed().GetNext();
       assert(!was_pointed_to[next]);
       was_pointed_to[next] = true;
     }
@@ -2222,10 +2202,10 @@ bool AutoHyperClockTable::Grow(InsertState& state) {
   // chain rewrite lock has been released.
   size_t old_old_home = BottomNBits(grow_home, old_shift - 1);
   for (;;) {
-    uint64_t old_old_head = array_[old_old_home].head_next_with_shift.Load();
-    if (GetShiftFromNextWithShift(old_old_head) >= old_shift) {
-      if ((old_old_head & HandleImpl::kNextEndFlags) !=
-          HandleImpl::kHeadLocked) {
+    NextWithShift old_old_head =
+        array_[old_old_home].head_next_with_shift.Load();
+    if (old_old_head.GetShift() >= old_shift) {
+      if (!old_old_head.IsLockedNotEnd()) {
         break;
       }
     }
@@ -2285,8 +2265,7 @@ void AutoHyperClockTable::CatchUpLengthInfoNoWait(
     if (published_usable_size < known_usable_grow_home) {
       int old_shift = FloorLog2(next_usable_size - 1);
       size_t old_home = BottomNBits(published_usable_size, old_shift);
-      int shift = GetShiftFromNextWithShift(
-          array_[old_home].head_next_with_shift.Load());
+      int shift = array_[old_home].head_next_with_shift.Load().GetShift();
       if (shift <= old_shift) {
         // Not ready
         break;
@@ -2437,9 +2416,10 @@ void AutoHyperClockTable::SplitForGrow(size_t grow_home, size_t old_home,
   ChainRewriteLock zero_head_lock(&arr[old_home], yield_count_);
 
   // Used for locking the one chain below
-  uint64_t saved_one_head;
+  NextWithShift saved_one_head;
   // One head has not been written to
-  assert(arr[grow_home].head_next_with_shift.Load() == 0);
+  assert(arr[grow_home].head_next_with_shift.Load() ==
+         HandleImpl::kUnusedMarker);
 
   // old_home will also the head of the new "zero chain" -- all entries in the
   // "from" chain whose next hash bit is 0. grow_home will be head of the new
@@ -2461,7 +2441,7 @@ void AutoHyperClockTable::SplitForGrow(size_t grow_home, size_t old_home,
     assert(cur == SIZE_MAX);
     assert(chain_frontier_first == -1);
 
-    uint64_t next_with_shift = zero_head_lock.GetSavedHead();
+    NextWithShift next_with_shift = zero_head_lock.GetSavedHead();
 
     // Find a single representative for each target chain, or scan the whole
     // chain if some target chain has no representative.
@@ -2474,16 +2454,16 @@ void AutoHyperClockTable::SplitForGrow(size_t grow_home, size_t old_home,
       assert((cur == SIZE_MAX) == (zero_chain_frontier == SIZE_MAX &&
                                    one_chain_frontier == SIZE_MAX));
 
-      assert(GetShiftFromNextWithShift(next_with_shift) == old_shift);
+      assert(next_with_shift.GetShift() == old_shift);
 
       // Check for end of original chain
-      if (HandleImpl::IsEnd(next_with_shift)) {
+      if (next_with_shift.IsEnd()) {
         cur = SIZE_MAX;
         break;
       }
 
       // next_with_shift is not End
-      cur = GetNextFromNextWithShift(next_with_shift);
+      cur = next_with_shift.GetNext();
 
       if (BottomNBits(arr[cur].hashed_key[1], new_shift) == old_home) {
         // Entry for zero chain
@@ -2522,10 +2502,10 @@ void AutoHyperClockTable::SplitForGrow(size_t grow_home, size_t old_home,
            (zero_chain_frontier == SIZE_MAX && one_chain_frontier == SIZE_MAX));
 
     // Always update one chain's head first (safe), and mark it as locked
-    saved_one_head = HandleImpl::kHeadLocked |
-                     (one_chain_frontier != SIZE_MAX
-                          ? MakeNextWithShift(one_chain_frontier, new_shift)
-                          : MakeNextWithShiftEnd(grow_home, new_shift));
+    saved_one_head = one_chain_frontier != SIZE_MAX
+                         ? NextWithShift::Make(one_chain_frontier, new_shift)
+                         : NextWithShift::MakeEnd(grow_home, new_shift);
+    saved_one_head.Set<NextWithShift::LockedFlag>(true);
     arr[grow_home].head_next_with_shift.Store(saved_one_head);
 
     // Make sure length_info_ hasn't been updated too early, as we're about
@@ -2535,8 +2515,8 @@ void AutoHyperClockTable::SplitForGrow(size_t grow_home, size_t old_home,
     // Try to set zero's head.
     if (zero_head_lock.CasUpdate(
             zero_chain_frontier != SIZE_MAX
-                ? MakeNextWithShift(zero_chain_frontier, new_shift)
-                : MakeNextWithShiftEnd(old_home, new_shift),
+                ? NextWithShift::Make(zero_chain_frontier, new_shift)
+                : NextWithShift::MakeEnd(old_home, new_shift),
             yield_count_)) {
       // Both heads successfully updated to new shift
       break;
@@ -2570,10 +2550,10 @@ void AutoHyperClockTable::SplitForGrow(size_t grow_home, size_t old_home,
     size_t& other_frontier = chain_frontier_first != 0
                                  ? /*&*/ zero_chain_frontier
                                  : /*&*/ one_chain_frontier;
-    uint64_t stop_before_or_new_tail =
+    NextWithShift stop_before_or_new_tail =
         other_frontier != SIZE_MAX
-            ? /*stop before*/ MakeNextWithShift(other_frontier, old_shift)
-            : /*new tail*/ MakeNextWithShiftEnd(
+            ? /*stop before*/ NextWithShift::Make(other_frontier, old_shift)
+            : /*new tail*/ NextWithShift::MakeEnd(
                   chain_frontier_first == 0 ? old_home : grow_home, new_shift);
     UpgradeShiftsOnRange(arr, first_frontier, stop_before_or_new_tail,
                          old_shift, new_shift);
@@ -2599,20 +2579,19 @@ void AutoHyperClockTable::SplitForGrow(size_t grow_home, size_t old_home,
                                    ? /*&*/ zero_chain_frontier
                                    : /*&*/ one_chain_frontier;
       assert(cur != first_frontier);
-      assert(GetNextFromNextWithShift(
-                 arr[first_frontier].chain_next_with_shift.Load()) ==
+      assert(arr[first_frontier].chain_next_with_shift.Load().GetNext() ==
              other_frontier);
 
-      uint64_t next_with_shift = arr[cur].chain_next_with_shift.Load();
+      NextWithShift next_with_shift = arr[cur].chain_next_with_shift.Load();
 
       // Check for end of original chain
-      if (HandleImpl::IsEnd(next_with_shift)) {
+      if (next_with_shift.IsEnd()) {
         // Can set upgraded tail on first chain
-        uint64_t first_new_tail = MakeNextWithShiftEnd(
+        NextWithShift first_new_tail = NextWithShift::MakeEnd(
             chain_frontier_first == 0 ? old_home : grow_home, new_shift);
         arr[first_frontier].chain_next_with_shift.Store(first_new_tail);
         // And upgrade remainder of other chain
-        uint64_t other_new_tail = MakeNextWithShiftEnd(
+        NextWithShift other_new_tail = NextWithShift::MakeEnd(
             chain_frontier_first != 0 ? old_home : grow_home, new_shift);
         UpgradeShiftsOnRange(arr, other_frontier, other_new_tail, old_shift,
                              new_shift);
@@ -2621,7 +2600,7 @@ void AutoHyperClockTable::SplitForGrow(size_t grow_home, size_t old_home,
       }
 
       // next_with_shift is not End
-      cur = GetNextFromNextWithShift(next_with_shift);
+      cur = next_with_shift.GetNext();
 
       int target_chain;
       if (BottomNBits(arr[cur].hashed_key[1], new_shift) == old_home) {
@@ -2634,7 +2613,7 @@ void AutoHyperClockTable::SplitForGrow(size_t grow_home, size_t old_home,
       }
       if (target_chain == chain_frontier_first) {
         // Found next entry to skip to on the first chain
-        uint64_t skip_to = MakeNextWithShift(cur, new_shift);
+        NextWithShift skip_to = NextWithShift::Make(cur, new_shift);
         arr[first_frontier].chain_next_with_shift.Store(skip_to);
         first_frontier = cur;
         // Upgrade other chain up to entry before that one
@@ -2675,17 +2654,17 @@ void AutoHyperClockTable::PurgeImplLocked(OpData* op_data,
 
   HandleImpl* const arr = array_.Get();
 
-  uint64_t next_with_shift = rewrite_lock.GetSavedHead();
-  assert(!HandleImpl::IsEnd(next_with_shift));
-  int home_shift = GetShiftFromNextWithShift(next_with_shift);
+  NextWithShift next_with_shift = rewrite_lock.GetSavedHead();
+  assert(!next_with_shift.IsEnd());
+  int home_shift = next_with_shift.GetShift();
   (void)home;
   (void)home_shift;
-  size_t next = GetNextFromNextWithShift(next_with_shift);
+  size_t next = next_with_shift.GetNext();
   assert(next < array_.Count());
   HandleImpl* h = &arr[next];
   HandleImpl* prev_to_keep = nullptr;
 #ifndef NDEBUG
-  uint64_t prev_to_keep_next_with_shift = 0;
+  NextWithShift prev_to_keep_next_with_shift{};
 #endif
   // Whether there are entries between h and prev_to_keep that should be
   // purged from the chain.
@@ -2743,13 +2722,13 @@ void AutoHyperClockTable::PurgeImplLocked(OpData* op_data,
         // update any new entries just inserted in parallel.
         // Can simply restart (GetSavedHead() already updated from CAS failure).
         next_with_shift = rewrite_lock.GetSavedHead();
-        assert(!HandleImpl::IsEnd(next_with_shift));
-        next = GetNextFromNextWithShift(next_with_shift);
+        assert(!next_with_shift.IsEnd());
+        next = next_with_shift.GetNext();
         assert(next < array_.Count());
         h = &arr[next];
         pending_purge = false;
         assert(prev_to_keep == nullptr);
-        assert(GetShiftFromNextWithShift(next_with_shift) == home_shift);
+        assert(next_with_shift.GetShift() == home_shift);
         continue;
       }
       pending_purge = false;
@@ -2771,13 +2750,13 @@ void AutoHyperClockTable::PurgeImplLocked(OpData* op_data,
     }
 #endif
 
-    assert(GetShiftFromNextWithShift(next_with_shift) == home_shift);
+    assert(next_with_shift.GetShift() == home_shift);
 
     // Check for end marker
-    if (HandleImpl::IsEnd(next_with_shift)) {
+    if (next_with_shift.IsEnd()) {
       h = nullptr;
     } else {
-      next = GetNextFromNextWithShift(next_with_shift);
+      next = next_with_shift.GetNext();
       assert(next < array_.Count());
       h = &arr[next];
       assert(h != prev_to_keep);
@@ -2849,7 +2828,7 @@ void AutoHyperClockTable::PurgeImpl(OpData* op_data, size_t home,
     // Ensure we are at the correct home for the shift in effect for the
     // chain head.
     for (;;) {
-      int shift = GetShiftFromNextWithShift(rewrite_lock.GetSavedHead());
+      int shift = rewrite_lock.GetSavedHead().GetShift();
 
       if (shift > home_shift) {
         // Found a newer shift at candidate head, which must apply to us.
@@ -3045,14 +3024,14 @@ AutoHyperClockTable::HandleImpl* AutoHyperClockTable::DoInsert(
   }
 
   // Now insert into chain using head pointer
-  uint64_t next_with_shift;
+  NextWithShift next_with_shift;
   int home_shift = orig_home_shift;
 
   // Might need to retry
   for (int i = 0;; ++i) {
     CHECK_TOO_MANY_ITERATIONS(i);
     next_with_shift = arr[home].head_next_with_shift.Load();
-    int shift = GetShiftFromNextWithShift(next_with_shift);
+    int shift = next_with_shift.GetShift();
 
     if (UNLIKELY(shift != home_shift)) {
       // NOTE: shift increases with table growth
@@ -3079,15 +3058,14 @@ AutoHyperClockTable::HandleImpl* AutoHyperClockTable::DoInsert(
     }
 
     // Values to update to
-    uint64_t head_next_with_shift = MakeNextWithShift(idx, home_shift);
-    uint64_t chain_next_with_shift = next_with_shift;
+    NextWithShift head_next_with_shift = NextWithShift::Make(idx, home_shift);
+    NextWithShift chain_next_with_shift = next_with_shift;
 
     // Preserve the locked state in head, without propagating to chain next
     // where it is meaningless (and not allowed)
-    if (UNLIKELY((next_with_shift & HandleImpl::kNextEndFlags) ==
-                 HandleImpl::kHeadLocked)) {
-      head_next_with_shift |= HandleImpl::kHeadLocked;
-      chain_next_with_shift &= ~HandleImpl::kHeadLocked;
+    if (UNLIKELY(next_with_shift.IsLockedNotEnd())) {
+      head_next_with_shift.Set<NextWithShift::LockedFlag>(true);
+      chain_next_with_shift.Set<NextWithShift::LockedFlag>(false);
     }
 
     arr[idx].chain_next_with_shift.Store(chain_next_with_shift);
@@ -3156,9 +3134,9 @@ AutoHyperClockTable::HandleImpl* AutoHyperClockTable::Lookup(
   // of a loop as possible.
 
   HandleImpl* const arr = array_.Get();
-  uint64_t next_with_shift = arr[home].head_next_with_shift.LoadRelaxed();
-  for (size_t i = 0; !HandleImpl::IsEnd(next_with_shift) && i < 10; ++i) {
-    HandleImpl* h = &arr[GetNextFromNextWithShift(next_with_shift)];
+  NextWithShift next_with_shift = arr[home].head_next_with_shift.LoadRelaxed();
+  for (size_t i = 0; !next_with_shift.IsEnd() && i < 10; ++i) {
+    HandleImpl* h = &arr[next_with_shift.IsEnd()];
     // Attempt cheap key match without acquiring a read ref. This could give a
     // false positive, which is re-checked after acquiring read ref, or false
     // negative, which is re-checked in the full Lookup. Also, this is a
@@ -3203,7 +3181,7 @@ AutoHyperClockTable::HandleImpl* AutoHyperClockTable::Lookup(
     // Read head or chain pointer
     next_with_shift = h ? h->chain_next_with_shift.Load()
                         : arr[home].head_next_with_shift.Load();
-    int shift = GetShiftFromNextWithShift(next_with_shift);
+    int shift = next_with_shift.GetShift();
 
     // Make sure it's usable
     size_t effective_home = home;
@@ -3257,10 +3235,10 @@ AutoHyperClockTable::HandleImpl* AutoHyperClockTable::Lookup(
     }
 
     // Check for end marker
-    if (HandleImpl::IsEnd(next_with_shift)) {
+    if (next_with_shift.IsEnd()) {
       // To ensure we didn't miss anything in the chain, the end marker must
       // point back to the correct home.
-      if (LIKELY(GetNextFromNextWithShift(next_with_shift) == effective_home)) {
+      if (LIKELY(next_with_shift.GetNext() == effective_home)) {
         // Complete, clean iteration of the chain, not found.
         // Clean up.
         if (read_ref_on_chain) {
@@ -3276,7 +3254,7 @@ AutoHyperClockTable::HandleImpl* AutoHyperClockTable::Lookup(
     }
 
     // Follow the next and check for full key match, home match, or neither
-    h = &arr[GetNextFromNextWithShift(next_with_shift)];
+    h = &arr[next_with_shift.GetNext()];
     bool full_match_or_unknown = false;
     if (MatchAndRef(&hashed_key, *h, shift, effective_home,
                     &full_match_or_unknown)) {
@@ -3600,8 +3578,7 @@ size_t AutoHyperClockTable::CalcMaxUsableLength(
 
 namespace {
 bool IsHeadNonempty(const AutoHyperClockTable::HandleImpl& h) {
-  return !AutoHyperClockTable::HandleImpl::IsEnd(
-      h.head_next_with_shift.LoadRelaxed());
+  return !h.head_next_with_shift.LoadRelaxed().IsEnd();
 }
 bool IsEntryAtHome(const AutoHyperClockTable::HandleImpl& h, int shift,
                    size_t home) {
diff --git a/cache/clock_cache.h b/cache/clock_cache.h
index 5ac8467bd3a3..f2d6c7fe6c58 100644
--- a/cache/clock_cache.h
+++ b/cache/clock_cache.h
@@ -775,6 +775,7 @@ class AutoHyperClockTable : public BaseClockTable {
     // chain--specifically the next entry in the chain.
     // * The end of a chain is given a special "end" marker and refers back
     // to the head of the chain.
+    // These decorated pointers use the NextWithShift bit field struct below.
     //
     // Why do we need shift on each pointer? To make Lookup wait-free, we need
     // to be able to query a chain without missing anything, and preferably
@@ -794,47 +795,63 @@ class AutoHyperClockTable : public BaseClockTable {
     // it is normal to see "under construction" entries on the chain, and it
     // is not safe to read their hashed key without either a read reference
     // on the entry or a rewrite lock on the chain.
-
-    // Marker in a "with_shift" head pointer for some thread owning writes
-    // to the chain structure (except for inserts), but only if not an
-    // "end" pointer. Also called the "rewrite lock."
-    static constexpr uint64_t kHeadLocked = uint64_t{1} << 7;
-
-    // Marker in a "with_shift" pointer for the end of a chain. Must also
-    // point back to the head of the chain (with end marker removed).
-    // Also includes the "locked" bit so that attempting to lock an empty
-    // chain has no effect (not needed, as the lock is only needed for
-    // removals).
-    static constexpr uint64_t kNextEndFlags = (uint64_t{1} << 6) | kHeadLocked;
-
-    static inline bool IsEnd(uint64_t next_with_shift) {
-      // Assuming certain values never used, suffices to check this one bit
-      constexpr auto kCheckBit = kNextEndFlags ^ kHeadLocked;
-      return next_with_shift & kCheckBit;
-    }
-
-    // Bottom bits to right shift away to get an array index from a
-    // "with_shift" pointer.
-    static constexpr int kNextShift = 8;
-
-    // A bit mask for the "shift" associated with each "with_shift" pointer.
-    // Always bottommost bits.
-    static constexpr int kShiftMask = 63;
+    struct NextWithShift : public BitFields<uint64_t, NextWithShift> {
+      // The "shift" associated with this decorated pointer (see description
+      // above).
+      using Shift = UnsignedBitField<NextWithShift, 6, NoPrevBitField>;
+      // Marker for the end of a chain. Must also (a) point back to the head of
+      // the chain (with end marker removed), and (b) set the LockedFlag
+      // (below), so that attempting to lock an empty chain has no effect (not
+      // needed, as the lock is only needed for removals).
+      using EndFlag = BoolBitField<NextWithShift, Shift>;
+      // Marker that some thread owning writes to the chain structure (except
+      // for inserts), but only if not an "end" pointer. Also called the
+      // "rewrite lock."
+      using LockedFlag = BoolBitField<NextWithShift, EndFlag>;
+      // The "next" associated with this decorated pointer, which is an index
+      // into the table's array_ (see description above).
+      using Next = UnsignedBitField<NextWithShift, 56, LockedFlag>;
+
+      bool IsLocked() const { return Get<LockedFlag>(); }
+      bool IsEnd() const {
+        // End flag should imply locked flag
+        assert(!Get<EndFlag>() || Get<LockedFlag>());
+        return Get<EndFlag>();
+      }
+      bool IsLockedNotEnd() const {
+        // NOTE: helping GCC to optimize this simpler code:
+        // return IsLocked() && !IsEnd();
+        constexpr U kEndFlag = U{1} << EndFlag::kBitOffset;
+        constexpr U kLockedFlag = U{1} << LockedFlag::kBitOffset;
+        return (underlying & (kEndFlag | kLockedFlag)) == kLockedFlag;
+      }
+      auto GetNext() const { return Get<Next>(); }
+      auto GetShift() const { return Get<Shift>(); }
+
+      static NextWithShift Make(size_t next, int shift) {
+        return NextWithShift{}.With<Next>(next).With<Shift>(
+            static_cast<uint8_t>(shift));
+      }
+
+      static NextWithShift MakeEnd(size_t next, int shift) {
+        return Make(next, shift).With<EndFlag>(true).With<LockedFlag>(true);
+      }
+    };
 
     // A marker for head_next_with_shift that indicates this HandleImpl is
     // heap allocated (standalone) rather than in the table.
-    static constexpr uint64_t kStandaloneMarker = UINT64_MAX;
+    static constexpr NextWithShift kStandaloneMarker{UINT64_MAX};
 
     // A marker for head_next_with_shift indicating the head is not yet part
     // of the usable table, or for chain_next_with_shift indicating that the
     // entry is not present or is not yet part of a chain (must not be
     // "shareable" state).
-    static constexpr uint64_t kUnusedMarker = 0;
+    static constexpr NextWithShift kUnusedMarker{0};
 
     // See above. The head pointer is logically independent of the rest of
     // the entry, including the chain next pointer.
-    AcqRelAtomic<uint64_t> head_next_with_shift{kUnusedMarker};
-    AcqRelAtomic<uint64_t> chain_next_with_shift{kUnusedMarker};
+    AcqRelBitFieldsAtomic<NextWithShift> head_next_with_shift{kUnusedMarker};
+    AcqRelBitFieldsAtomic<NextWithShift> chain_next_with_shift{kUnusedMarker};
 
     // For supporting CreateStandalone and some fallback cases.
     inline bool IsStandalone() const {
diff --git a/test_util/sync_point.cc b/test_util/sync_point.cc
index bec02d4f67a3..2b9ab2f69625 100644
--- a/test_util/sync_point.cc
+++ b/test_util/sync_point.cc
@@ -79,4 +79,8 @@ void SetupSyncPointsToMockDirectIO() {
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
 #endif
 }
+
+#ifndef NDEBUG
+std::atomic<int> g_throw_on_testable_assertion_failure{0};
+#endif  // NDEBUG
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/test_util/sync_point.h b/test_util/sync_point.h
index 6022073e573a..6bfb841926e9 100644
--- a/test_util/sync_point.h
+++ b/test_util/sync_point.h
@@ -180,3 +180,35 @@ void SetupSyncPointsToMockDirectIO();
     }                                               \
   }
 #endif  // NDEBUG
+
+// An alternative to assert() that is more test-friendly than using
+// ASSERT_DEATH. Relies on exception propagation.
+#ifdef NDEBUG
+#define testable_assert(cond)
+#else
+namespace ROCKSDB_NAMESPACE {
+// Intentionally not based on std::exception to reduce places where this
+// would be caught
+struct TestableAssertionFailure {};
+extern std::atomic<int> g_throw_on_testable_assertion_failure;
+}  // namespace ROCKSDB_NAMESPACE
+#define testable_assert(cond)                                          \
+  do {                                                                 \
+    if (ROCKSDB_NAMESPACE::g_throw_on_testable_assertion_failure.load( \
+            std::memory_order_relaxed) > 0) {                          \
+      if (cond) {                                                      \
+      } else                                                           \
+        throw ROCKSDB_NAMESPACE::TestableAssertionFailure();           \
+    } else {                                                           \
+      assert(cond);                                                    \
+    }                                                                  \
+  } while (0)
+#define ASSERT_TESTABLE_FAILURE(expr)                                   \
+  do {                                                                  \
+    ROCKSDB_NAMESPACE::g_throw_on_testable_assertion_failure.fetch_add( \
+        1, std::memory_order_relaxed);                                  \
+    ASSERT_THROW(expr, ROCKSDB_NAMESPACE::TestableAssertionFailure);    \
+    ROCKSDB_NAMESPACE::g_throw_on_testable_assertion_failure.fetch_sub( \
+        1, std::memory_order_relaxed);                                  \
+  } while (0)
+#endif
diff --git a/util/bit_fields.h b/util/bit_fields.h
index c2aeaf86ff8a..aa49cc0e0b76 100644
--- a/util/bit_fields.h
+++ b/util/bit_fields.h
@@ -6,8 +6,10 @@
 #pragma once
 
 #include <atomic>
+#include <vector>
 
 #include "rocksdb/rocksdb_namespace.h"
+#include "test_util/sync_point.h"
 #include "util/math.h"
 
 namespace ROCKSDB_NAMESPACE {
@@ -144,8 +146,44 @@ struct AndTransform {
   }
 };
 
-// TODO: AddTransfrom, which is more complicated due to possible overflow into
-// other fields etc.
+// Can represent a combination of both subtractions and additions, representing
+// subtractions as the addition of a negated value. To ensure we don't create a
+// net overflow or underflow between fields, in debug builds we track the
+// corresponding preconditions. (NOTE that when representing a subtraction, we
+// rely on overflow of the unsigned representation.)
+template <typename BitFieldsT>
+struct AddTransform {
+  using U = typename BitFieldsT::U;
+  U to_add = 0;
+#ifndef NDEBUG
+  struct Precondition {
+    U mask;   // for bits of the target field
+    U piece;  // component of to_add for the target field
+  };
+  std::vector<Precondition> preconditions;
+#endif  // NDEBUG
+  void AssertPreconditions([[maybe_unused]] U from) {
+#ifndef NDEBUG
+    for (auto p : preconditions) {
+      U tmp = (from & p.mask) + p.piece;
+      // Assert no under/overflow (unless the field is at the top bits of the
+      // representation in U, which is allowed because it doesn't lead to
+      // leakage into other fields)
+      testable_assert((tmp & ~p.mask) == 0);
+    }
+#endif  // NDEBUG
+  }
+  // + for general combine
+  AddTransform<BitFieldsT> operator+(AddTransform<BitFieldsT> other) const {
+    AddTransform<BitFieldsT> rv{to_add + other.to_add};
+#ifndef NDEBUG
+    rv.preconditions = preconditions;
+    rv.preconditions.insert(rv.preconditions.end(), other.preconditions.begin(),
+                            other.preconditions.end());
+#endif  // NDEBUG
+    return rv;
+  }
+};
 
 // Placeholder for PrevField for the first field
 struct NoPrevBitField {
@@ -204,6 +242,7 @@ struct UnsignedBitField {
   static_assert(kBitCount >= 1);
   static_assert(kBitCount <= 64);
   static_assert(kBitOffset >= 0 && kEndBit <= BitFieldsT::kBitCount);
+  static constexpr bool kIncludesTopBit = (kEndBit == BitFieldsT::kBitCount);
 
   static constexpr V kMask = (V{1} << (kBitCount - 1) << 1) - 1;
 
@@ -219,9 +258,59 @@ struct UnsignedBitField {
     bf.underlying |= static_cast<U>(value & kMask) << kBitOffset;
   }
 
+  // Create a transfor for clearing this field to zero.
   static AndTransform<BitFieldsT> ClearTransform() {
     return AndTransform<BitFieldsT>{~(static_cast<U>(kMask) << kBitOffset)};
   }
+
+  // Create a transform for adding a particular value, but with the precondition
+  // that adding the value will not overflow the field. This applies for fields
+  // that do not include the top bit of the underlying representation. Can be
+  // combined with other additive transforms for other fields.
+  static AddTransform<BitFieldsT> PlusTransformPromiseNoOverflow(V value) {
+    static_assert(!kIncludesTopBit);
+    AddTransform<BitFieldsT> rv{static_cast<U>(value) << kBitOffset};
+#ifndef NDEBUG
+    rv.preconditions.push_back(
+        {static_cast<U>(kMask) << kBitOffset, rv.to_add});
+#endif  // NDEBUG
+    return rv;
+  }
+
+  // Create a transform for adding a particular value, but ignoring any overflow
+  // in that field. This applies for fields that include the top bit of the
+  // underlying representation. Can be combined with other additive transforms
+  // for other fields.
+  static AddTransform<BitFieldsT> PlusTransformIgnoreOverflow(V value) {
+    static_assert(kIncludesTopBit);
+    AddTransform<BitFieldsT> rv{static_cast<U>(value) << kBitOffset};
+    return rv;
+  }
+
+  // Create a transform for subtracting a particular value, but with the
+  // precondition that subtracting the value will not underflow the field. This
+  // applies for fields that do not include the top bit of the underlying
+  // representation. Can be combined with other additive transforms for other
+  // fields.
+  static AddTransform<BitFieldsT> MinusTransformPromiseNoUnderflow(V value) {
+    static_assert(!kIncludesTopBit);
+    AddTransform<BitFieldsT> rv{U{0} - (static_cast<U>(value) << kBitOffset)};
+#ifndef NDEBUG
+    rv.preconditions.push_back(
+        {static_cast<U>(kMask) << kBitOffset, rv.to_add});
+#endif  // NDEBUG
+    return rv;
+  }
+
+  // Create a transform for subtracting a particular value, but ignoring any
+  // underflow in that field. This applies for fields that include the top bit
+  // of the underlying representation. Can be combined with other additive
+  // transforms for other fields.
+  static AddTransform<BitFieldsT> MinusTransformIgnoreUnderflow(V value) {
+    static_assert(kIncludesTopBit);
+    AddTransform<BitFieldsT> rv{U{0} - (static_cast<U>(value) << kBitOffset)};
+    return rv;
+  }
 };
 
 // A handy wrapper for a relaxed atomic on some BitFields type (unlike
@@ -260,7 +349,22 @@ class RelaxedBitFieldsAtomic {
   }
   void ApplyRelaxed(OrTransform<BitFieldsT> transform,
                     BitFieldsT* before = nullptr, BitFieldsT* after = nullptr) {
-    U before_val = v_.fetch_or(transform.to_or, std::memory_order_relaxed);
+    ApplyImpl<std::memory_order_relaxed>(transform, before, after);
+  }
+  void ApplyRelaxed(AndTransform<BitFieldsT> transform,
+                    BitFieldsT* before = nullptr, BitFieldsT* after = nullptr) {
+    ApplyImpl<std::memory_order_relaxed>(transform, before, after);
+  }
+  void ApplyRelaxed(AddTransform<BitFieldsT> transform,
+                    BitFieldsT* before = nullptr, BitFieldsT* after = nullptr) {
+    ApplyImpl<std::memory_order_relaxed>(transform, before, after);
+  }
+
+ protected:  // fns
+  template <std::memory_order kOrder>
+  void ApplyImpl(OrTransform<BitFieldsT> transform,
+                 BitFieldsT* before = nullptr, BitFieldsT* after = nullptr) {
+    U before_val = v_.fetch_or(transform.to_or, kOrder);
     if (before) {
       before->underlying = before_val;
     }
@@ -268,9 +372,10 @@ class RelaxedBitFieldsAtomic {
       after->underlying = before_val | transform.to_or;
     }
   }
-  void ApplyRelaxed(AndTransform<BitFieldsT> transform,
-                    BitFieldsT* before = nullptr, BitFieldsT* after = nullptr) {
-    U before_val = v_.fetch_and(transform.to_and, std::memory_order_relaxed);
+  template <std::memory_order kOrder>
+  void ApplyImpl(AndTransform<BitFieldsT> transform,
+                 BitFieldsT* before = nullptr, BitFieldsT* after = nullptr) {
+    U before_val = v_.fetch_and(transform.to_and, kOrder);
     if (before) {
       before->underlying = before_val;
     }
@@ -278,8 +383,20 @@ class RelaxedBitFieldsAtomic {
       after->underlying = before_val & transform.to_and;
     }
   }
+  template <std::memory_order kOrder>
+  void ApplyImpl(AddTransform<BitFieldsT> transform,
+                 BitFieldsT* before = nullptr, BitFieldsT* after = nullptr) {
+    U before_val = v_.fetch_add(transform.to_add, kOrder);
+    transform.AssertPreconditions(before_val);
+    if (before) {
+      before->underlying = before_val;
+    }
+    if (after) {
+      after->underlying = before_val + transform.to_add;
+    }
+  }
 
- protected:
+ protected:  // data
   std::atomic<U> v_;
 };
 
@@ -313,25 +430,18 @@ class AcqRelBitFieldsAtomic : public RelaxedBitFieldsAtomic<BitFieldsT> {
   }
   void Apply(OrTransform<BitFieldsT> transform, BitFieldsT* before = nullptr,
              BitFieldsT* after = nullptr) {
-    U before_val =
-        Base::v_.fetch_or(transform.to_or, std::memory_order_acq_rel);
-    if (before) {
-      before->underlying = before_val;
-    }
-    if (after) {
-      after->underlying = before_val | transform.to_or;
-    }
+    Base::template ApplyImpl<std::memory_order_acq_rel>(transform, before,
+                                                        after);
   }
   void Apply(AndTransform<BitFieldsT> transform, BitFieldsT* before = nullptr,
              BitFieldsT* after = nullptr) {
-    U before_val =
-        Base::v_.fetch_and(transform.to_and, std::memory_order_acq_rel);
-    if (before) {
-      before->underlying = before_val;
-    }
-    if (after) {
-      after->underlying = before_val & transform.to_and;
-    }
+    Base::template ApplyImpl<std::memory_order_acq_rel>(transform, before,
+                                                        after);
+  }
+  void Apply(AddTransform<BitFieldsT> transform, BitFieldsT* before = nullptr,
+             BitFieldsT* after = nullptr) {
+    Base::template ApplyImpl<std::memory_order_acq_rel>(transform, before,
+                                                        after);
   }
 };
 
diff --git a/util/slice_test.cc b/util/slice_test.cc
index 72b9f19376d9..58de6b1612c8 100644
--- a/util/slice_test.cc
+++ b/util/slice_test.cc
@@ -527,6 +527,67 @@ TEST(BitFieldsTest, BitFields) {
     ASSERT_EQ(before.Get<Field2>(), false);
     ASSERT_EQ(before.Get<Field3>(), false);
     ASSERT_EQ(after, state);
+
+    ASSERT_EQ(state.Get<Field1>(), 45U);
+    ASSERT_EQ(after.Get<Field2>(), true);
+    ASSERT_EQ(after.Get<Field3>(), true);
+    ASSERT_EQ(state.Get<Field4>(), 3U);
+
+    auto transform3 = Field1::PlusTransformPromiseNoOverflow(10000U) +
+                      Field4::MinusTransformPromiseNoUnderflow(3U);
+    relaxed.ApplyRelaxed(transform3, &before, &after);
+    ASSERT_EQ(before, state);
+    ASSERT_NE(after, state);
+    ASSERT_EQ(after.Get<Field1>(), 10045U);
+    ASSERT_EQ(after.Get<Field4>(), 0U);
+
+    auto transform4 = Field1::MinusTransformPromiseNoUnderflow(999U) +
+                      Field4::PlusTransformPromiseNoOverflow(31U);
+    relaxed.ApplyRelaxed(transform4, &before, &after);
+    ASSERT_EQ(after.Get<Field1>(), 9046U);
+    ASSERT_EQ(after.Get<Field4>(), 31U);
+
+    // Unmodified
+    ASSERT_EQ(after.Get<Field2>(), true);
+    ASSERT_EQ(after.Get<Field3>(), true);
+
+    // Test overflow/underflow detection
+    relaxed.StoreRelaxed(MyState{}.With<Field1>(65535U));  // Field1 max value
+    ASSERT_TESTABLE_FAILURE(
+        relaxed.ApplyRelaxed(Field1::PlusTransformPromiseNoOverflow(1U)));
+    relaxed.StoreRelaxed(MyState{}.With<Field4>(31U));  // Field4 max value
+    ASSERT_TESTABLE_FAILURE(
+        relaxed.ApplyRelaxed(Field4::PlusTransformPromiseNoOverflow(1U)));
+    relaxed.StoreRelaxed(MyState{}.With<Field1>(0U));
+    ASSERT_TESTABLE_FAILURE(
+        relaxed.ApplyRelaxed(Field1::MinusTransformPromiseNoUnderflow(1U)));
+    relaxed.StoreRelaxed(MyState{}.With<Field4>(0U));
+    ASSERT_TESTABLE_FAILURE(
+        relaxed.ApplyRelaxed(Field4::MinusTransformPromiseNoUnderflow(1U)));
+    ASSERT_TESTABLE_FAILURE(relaxed.ApplyRelaxed(
+        Field4::MinusTransformPromiseNoUnderflow(64U)));  // Too big
+    ASSERT_TESTABLE_FAILURE(relaxed.ApplyRelaxed(
+        Field4::PlusTransformPromiseNoOverflow(64U)));  // Too big
+
+    // Including combinations
+    relaxed.StoreRelaxed(MyState{}.With<Field4>(31U));  // Field4 max value
+    relaxed.StoreRelaxed(MyState{}.With<Field1>(0U));
+    ASSERT_TESTABLE_FAILURE(
+        relaxed.ApplyRelaxed(Field4::PlusTransformPromiseNoOverflow(1U) +
+                             Field1::MinusTransformPromiseNoUnderflow(1U)));
+
+    // But a field at the limit of upper bits is allowed to over/underflow
+    using Field5 = UnsignedBitField<MyState, 9, Field4>;
+    relaxed.StoreRelaxed(MyState{}.With<Field5>(0));  // Field5 max value
+    relaxed.ApplyRelaxed(Field5::MinusTransformIgnoreUnderflow(1U), &before,
+                         &after);  // "Safe" underflow
+    ASSERT_EQ(after.Get<Field5>(), 511U);
+    relaxed.ApplyRelaxed(Field5::PlusTransformIgnoreOverflow(1U), &before,
+                         &after);  // "Safe" overflow
+    ASSERT_EQ(after.Get<Field5>(), 0U);
+    relaxed.ApplyRelaxed(Field5::PlusTransformIgnoreOverflow(2048U), &before,
+                         &after);  // "Safe" overflow
+    ASSERT_EQ(after.Get<Field5>(), 0U);
   }
   {
     AcqRelBitFieldsAtomic<MyState> acqrel{state};
@@ -555,6 +616,37 @@ TEST(BitFieldsTest, BitFields) {
     ASSERT_EQ(before.Get<Field2>(), false);
     ASSERT_EQ(before.Get<Field3>(), false);
     ASSERT_EQ(after, state);
+
+    ASSERT_EQ(state.Get<Field1>(), 45U);
+    ASSERT_EQ(after.Get<Field2>(), true);
+    ASSERT_EQ(after.Get<Field3>(), true);
+    ASSERT_EQ(state.Get<Field4>(), 3U);
+
+    auto transform3 = Field1::PlusTransformPromiseNoOverflow(10000U) +
+                      Field4::MinusTransformPromiseNoUnderflow(3U);
+    acqrel.Apply(transform3, &before, &after);
+    ASSERT_EQ(before, state);
+    ASSERT_NE(after, state);
+    ASSERT_EQ(after.Get<Field1>(), 10045U);
+    ASSERT_EQ(after.Get<Field4>(), 0U);
+
+    auto transform4 = Field1::MinusTransformPromiseNoUnderflow(999U) +
+                      Field4::PlusTransformPromiseNoOverflow(31U);
+    acqrel.Apply(transform4, &before, &after);
+    ASSERT_EQ(after.Get<Field1>(), 9046U);
+    ASSERT_EQ(after.Get<Field4>(), 31U);
+
+    // Unmodified
+    ASSERT_EQ(after.Get<Field2>(), true);
+    ASSERT_EQ(after.Get<Field3>(), true);
+
+    // Test overflow/underflow detection
+    acqrel.Store(MyState{}.With<Field1>(65535U));
+    ASSERT_TESTABLE_FAILURE(
+        acqrel.Apply(Field1::PlusTransformPromiseNoOverflow(1U)));
+    acqrel.Store(MyState{}.With<Field4>(0U));
+    ASSERT_TESTABLE_FAILURE(
+        acqrel.Apply(Field4::MinusTransformPromiseNoUnderflow(1U)));
   }
 }
 

From d2fe0ee389ce4695e00f4816b599b77eb1cf1135 Mon Sep 17 00:00:00 2001
From: Hui Xiao <huixiao@fb.com>
Date: Tue, 2 Dec 2025 19:26:42 -0800
Subject: [PATCH 393/500] Fix use-after-free in BlockBasedTable after
 best-efforts recovery retry (#14155)

Summary:
**Context/Summary:**

Best-efforts recovery can cause a use-after-free bug after retrying for a failed recovery attempt. The issue occurs in VersionSet::Reset():
- First recovery attempt: Opens SST files, caching BlockBasedTable objects in table_cache_
https://github.com/facebook/rocksdb/blob/ac412b10955d5a1d3d99aff8edf94eae1e4a22d5/db/version_edit_handler.cc#L565
- Recovery fails: Calls Reset() which deletes the old ColumnFamilySet (and all CFDs)
https://github.com/facebook/rocksdb/blob/ac412b10955d5a1d3d99aff8edf94eae1e4a22d5/db/version_set.cc#L6631
- Creates new CFDs: But reuses the same table_cache_
https://github.com/facebook/rocksdb/blob/ac412b10955d5a1d3d99aff8edf94eae1e4a22d5/db/version_set.cc#L5579
- Bug: Cached BlockBasedTable objects contain now-dangling reference to previous CFD's member such as rep_->internal_comparator or rep_->ioptions as below. References instead of object copies are used for memory efficiency
```
struct BlockBasedTable::Rep {
  Rep(const ImmutableOptions& _ioptions, ..
      const InternalKeyComparator& _internal_comparato...)) {}
  ~Rep() { status.PermitUncheckedError(); }
  const ImmutableOptions& ioptions;
  ...
  const InternalKeyComparator& internal_comparator;
```
- Crash: Accessing any of the above reference in cached tables during read or compaction after recovery finishes triggers use-after-free

This PR calls table_cache_->EraseUnRefEntries()  to clear tables containing the dangling reference in VersionSet::Reset() before creating the new ColumnFamilySet.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14155

Test Plan:
- Add new unit test that fails before the fix under ASAN run and pass after
```
[ RUN      ] DBBasicTest.BestEffortRecoveryFailureWithTableCacheUseAfterFree
=================================================================
==1976446==ERROR: AddressSanitizer: heap-use-after-free on address 0x61e00000a8c8 at pc 0x7f6b21beae57 bp 0x7ffd65bacec0 sp 0x7ffd65baceb8
READ of size 8 at 0x61e00000a8c8 thread T0
    #0 0x7f6b21beae56 in rocksdb::UserComparatorWrapper::user_comparator() const util/user_comparator_wrapper.h:29 // rep_->ioptions
    https://github.com/facebook/rocksdb/issues/1 0x7f6b21beb02b in rocksdb::InternalKeyComparator::user_comparator() const db/dbformat.h:421
    https://github.com/facebook/rocksdb/issues/2 0x7f6b229a7a50 in rocksdb::BinarySearchIndexReader::NewIterator(rocksdb::ReadOptions const&, bool, rocksdb::IndexBlockIter*, rocksdb::GetContext*, rocksdb::BlockCacheLookupContext*) table/block_based/binary_search_index_reader.cc:62
    https://github.com/facebook/rocksdb/issues/3 0x7f6b22a9a649 in rocksdb::BlockBasedTable::NewIndexIterator(rocksdb::ReadOptions const&, bool, rocksdb::IndexBlockIter*, rocksdb::GetContext*, rocksdb::BlockCacheLookupContext*) const table/block_based/block_based_table_reader.cc:1683
    https://github.com/facebook/rocksdb/issues/4 0x7f6b22aa39be in rocksdb::BlockBasedTable::Get(rocksdb::ReadOptions const&, rocksdb::Slice const&, rocksdb::GetContext*, rocksdb::SliceTransform const*, bool) table/block_based/block_based_table_reader.cc:2533
    https://github.com/facebook/rocksdb/issues/5 0x7f6b2241201c in rocksdb::TableCache::Get(rocksdb::ReadOptions const&, rocksdb::InternalKeyComparator const&, rocksdb::FileMetaData const&, rocksdb::Slice const&, rocksdb::GetContext*, rocksdb::MutableCFOptions const&, rocksdb::HistogramImpl*, bool, int, unsigned long) db/table_cache.cc:492

0x61e00000a8c8 is located 72 bytes inside of 2784-byte region [0x61e00000a880,0x61e00000b360)
freed by thread T0 here:
    #0 0x7f6b248d20d7 in operator delete(void*, unsigned long) /home/engshare/third-party2/gcc/11.x/src/gcc-11.x/libsanitizer/asan/asan_new_delete.cpp:172
    https://github.com/facebook/rocksdb/issues/1 0x7f6b21ca8703 in rocksdb::ColumnFamilyData::UnrefAndTryDelete() db/column_family.cc:785
    https://github.com/facebook/rocksdb/issues/2 0x7f6b21cb25ee in rocksdb::ColumnFamilySet::~ColumnFamilySet() db/column_family.cc:1771
    https://github.com/facebook/rocksdb/issues/3 0x7f6b225683df in std::default_delete<rocksdb::ColumnFamilySet>::operator()(rocksdb::ColumnFamilySet*) const (/data/users/huixiao/rocksdb/librocksdb.so.10.10+0x1f683df)
    https://github.com/facebook/rocksdb/issues/4 0x7f6b22568ceb in std::__uniq_ptr_impl<rocksdb::ColumnFamilySet, std::default_delete<rocksdb::ColumnFamilySet> >::reset(rocksdb::ColumnFamilySet*) /mnt/gvfs/third-party2/libgcc/d1129753c8361ac8e9453c0f4291337a4507ebe6/11.x/platform010/5684a5a/include/c++/trunk/bits/unique_ptr.h:182
    https://github.com/facebook/rocksdb/issues/5 0x7f6b22550c52 in std::unique_ptr<rocksdb::ColumnFamilySet, std::default_delete<rocksdb::ColumnFamilySet> >::reset(rocksdb::ColumnFamilySet*) /mnt/gvfs/third-party2/libgcc/d1129753c8361ac8e9453c0f4291337a4507ebe6/11.x/platform010/5684a5a/include/c++/trunk/bits/unique_ptr.h:456
    https://github.com/facebook/rocksdb/issues/6 0x7f6b224fa09e in rocksdb::VersionSet::Reset() db/version_set.cc:5587
    https://github.com/facebook/rocksdb/issues/7 0x7f6b2250752c in rocksdb::VersionSet::TryRecover(std::vector<rocksdb::ColumnFamilyDescriptor, std::allocator<rocksdb::ColumnFamilyDescriptor> > const&, bool, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >*, bool*) db/version_set.cc:6640
    https://github.com/facebook/rocksdb/issues/8 0x7f6b220c5a88 in rocksdb::DBImpl::Recover(std::vector<rocksdb::ColumnFamilyDescriptor, std::allocator<rocksdb::ColumnFamilyDescriptor> > const&, bool, bool, bool, bool, unsigned long*, rocksdb::DBImpl::RecoveryContext*, bool*) db/db_impl/db_impl_open.cc:565

previously allocated by thread T0 here:
    #0 0x7f6b248d1257 in operator new(unsigned long) /home/engshare/third-party2/gcc/11.x/src/gcc-11.x/libsanitizer/asan/asan_new_delete.cpp:99
    https://github.com/facebook/rocksdb/issues/1 0x7f6b21cb30e0 in rocksdb::ColumnFamilySet::CreateColumnFamily(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, unsigned int, rocksdb::Version*, rocksdb::ColumnFamilyOptions const&, bool) db/column_family.cc:1827
    https://github.com/facebook/rocksdb/issues/2 0x7f6b22516a11 in rocksdb::VersionSet::CreateColumnFamily(rocksdb::ColumnFamilyOptions const&, rocksdb::ReadOptions const&, rocksdb::VersionEdit const*, bool) db/version_set.cc:7715
    https://github.com/facebook/rocksdb/issues/3 0x7f6b22494910 in rocksdb::VersionEditHandler::CreateCfAndInit(rocksdb::ColumnFamilyOptions const&, rocksdb::VersionEdit const&) db/version_edit_handler.cc:494
    https://github.com/facebook/rocksdb/issues/4 0x7f6b2249005f in rocksdb::VersionEditHandler::Initialize() db/version_edit_handler.cc:209
    https://github.com/facebook/rocksdb/issues/5 0x7f6b2248cd13 in rocksdb::VersionEditHandlerBase::Iterate(rocksdb::log::Reader&, rocksdb::Status*) db/version_edit_handler.cc:32
    https://github.com/facebook/rocksdb/issues/6 0x7f6b225081db in rocksdb::VersionSet::TryRecoverFromOneManifest(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::vector<rocksdb::ColumnFamilyDescriptor, std::allocator<rocksdb::ColumnFamilyDescriptor> > const&, bool, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >*, bool*) db/version_set.cc:6679
    https://github.com/facebook/rocksdb/issues/7 0x7f6b225074a1 in rocksdb::VersionSet::TryRecover(std::vector<rocksdb::ColumnFamilyDescriptor, std::allocator<rocksdb::ColumnFamilyDescriptor> > const&, bool, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >*, bool*) db/version_set.cc:6635
    https://github.com/facebook/rocksdb/issues/8 0x7f6b220c5a88 in rocksdb::DBImpl::Recover(std::vector<rocksdb::ColumnFamilyDescriptor, std::allocator<rocksdb::ColumnFamilyDescriptor> > const&, bool, bool, bool, bool, unsigned long*, rocksdb::DBImpl::RecoveryContext*, bool*) db/db_impl/db_impl_open.cc:565
```

Reviewed By: anand1976

Differential Revision: D87991593

Pulled By: hx235

fbshipit-source-id: 2379b297ff592cadf02659e355cdc8e170917cfc
---
 db/db_basic_test.cc                           | 69 +++++++++++++++++++
 db/version_set.cc                             |  9 +++
 .../bug_fixes/ber_table_cache_uaf.md          |  1 +
 3 files changed, 79 insertions(+)
 create mode 100644 unreleased_history/bug_fixes/ber_table_cache_uaf.md

diff --git a/db/db_basic_test.cc b/db/db_basic_test.cc
index b115e7069d14..003e761466b9 100644
--- a/db/db_basic_test.cc
+++ b/db/db_basic_test.cc
@@ -3868,6 +3868,75 @@ TEST_F(DBBasicTest, SkipWALIfMissingTableFiles) {
   ASSERT_OK(iter->status());
 }
 
+TEST_F(DBBasicTest, BestEffortRecoveryFailureWithTableCacheUseAfterFree) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.env = env_;
+  // Force multiple manifest files
+  options.max_manifest_file_size = 1;
+  options.max_manifest_space_amp_pct = 0;
+
+  DestroyAndReopen(options);
+
+  // Disable file deletions to preserve old manifest files for
+  // best-efforts recovery to succeed
+  ASSERT_OK(db_->DisableFileDeletions());
+
+  // Create multiple SST files to populate TableCache during
+  // best-efforts recovery
+  for (int i = 0; i < 10; i++) {
+    ASSERT_OK(Put("key" + std::to_string(i),
+                  std::string(1000, static_cast<char>('a' + i))));
+    ASSERT_OK(Flush());
+  }
+
+  // Verify we have multiple manifest files
+  std::vector<std::string> files;
+  ASSERT_OK(env_->GetChildren(dbname_, &files));
+  int manifest_count = 0;
+  for (const auto& file : files) {
+    if (file.find("MANIFEST") != std::string::npos) {
+      manifest_count++;
+    }
+  }
+  ASSERT_GE(manifest_count, 2);
+
+  // Inject corruption after TableCache is populated (count > 3), but only once
+  // (injected flag) to allow best-effort recovery to trigger retry and succeed.
+  // This coerce the bug: first recovery caches SSTs with reference to column
+  // family's options in table cache and retry deletes column family so the
+  // reference becomes dangling.
+  int count = 0;
+  bool injected = false;
+  SyncPoint::GetInstance()->SetCallBack(
+      "VersionBuilder::CheckConsistencyBeforeReturn", [&](void* arg) {
+        count++;
+        if (count > 3 && !injected) {
+          ASSERT_NE(nullptr, arg);
+          *(static_cast<Status*>(arg)) =
+              Status::Corruption("Injected corruption");
+          injected = true;
+        }
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  options.best_efforts_recovery = true;
+
+  Status s = TryReopen(options);
+  ASSERT_OK(s);
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  for (int i = 0; i < 10; i++) {
+    std::string value;
+    // Without the fix, ASAN detects use-after-free when accessing cached SST
+    // files that hold dangling references to deleted ioptions.
+    s = db_->Get(ReadOptions(), "key" + std::to_string(i), &value);
+    ASSERT_TRUE(s.ok() || s.IsNotFound());
+  }
+}
+
 TEST_F(DBBasicTest, DisableTrackWal) {
   // If WAL tracking was enabled, and then disabled during reopen,
   // the previously tracked WALs should be removed from MANIFEST.
diff --git a/db/version_set.cc b/db/version_set.cc
index baf12b9ba359..cf89ec8ad735 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -5571,6 +5571,15 @@ void VersionSet::Reset() {
   if (column_family_set_) {
     WriteBufferManager* wbm = column_family_set_->write_buffer_manager();
     WriteController* wc = column_family_set_->write_controller();
+
+    // Clear TableCache to prevent use-after-free: Reset() deletes old
+    // ColumnFamilySet but reuses table_cache_, which may contain
+    // BlockBasedTable entries with dangling references to deleted CFD's
+    // ioptions.
+    if (table_cache_) {
+      table_cache_->EraseUnRefEntries();
+    }
+
     // db_id becomes the source of truth after DBImpl::Recover():
     // https://github.com/facebook/rocksdb/blob/v7.3.1/db/db_impl/db_impl_open.cc#L527
     // Note: we may not be able to recover db_id from MANIFEST if
diff --git a/unreleased_history/bug_fixes/ber_table_cache_uaf.md b/unreleased_history/bug_fixes/ber_table_cache_uaf.md
new file mode 100644
index 000000000000..de2a96638bb3
--- /dev/null
+++ b/unreleased_history/bug_fixes/ber_table_cache_uaf.md
@@ -0,0 +1 @@
+Fixed a bug in best-efforts recovery that causes use-after-free crashes when accessing SST files that were cached during the recovery.

From 340ac7ea6be0fc0e1626bfba991d0983c563d850 Mon Sep 17 00:00:00 2001
From: Xingbo Wang <xbw@meta.com>
Date: Thu, 4 Dec 2025 11:52:58 -0800
Subject: [PATCH 394/500] Improve sst_dump raw mode dump result (#14166)

Summary:
Add a new option in sst_dump command to show seq no and value type in raw mode

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14166

Test Plan:
Sample output

```
sst_dump --file=rocksdb_crashtest_blackbox/000010.sst  --command=raw --show_sequence_number_type

...

Range deletions:
--------------------------------------
  HEX    000000000000038D000000000000012B000000000000029A  seq: 3016892  type: 15 : 000000000000038D000000000000012B000000000000029E
  ASCII  \0 \0 \0 \0 \0 \0   \0 \0 \0 \0 \0 \0  + \0 \0 \0 \0 \0 \0   : \0 \0 \0 \0 \0 \0   \0 \0 \0 \0 \0 \0  + \0 \0 \0 \0 \0 \0
  ------

Data Block # 1 @ 0073
--------------------------------------
  HEX    000000000000038D000000000000012B000000000000029D  seq: 3004554  type: 0 :
  ASCII  \0 \0 \0 \0 \0 \0   \0 \0 \0 \0 \0 \0  + \0 \0 \0 \0 \0 \0   :
  ------
  HEX    000000000000038D000000000000012B000000000000029D  seq: 0  type: 1 : 03000000070605040B0A09080F0E0D0C13121110171615141B1A19181F1E1D1C
  ASCII  \0 \0 \0 \0 \0 \0   \0 \0 \0 \0 \0 \0  + \0 \0 \0 \0 \0 \0   :  \0 \0 \0
```

Reviewed By: hx235

Differential Revision: D88396223

Pulled By: xingbowang

fbshipit-source-id: b006cd7f51f941951349e4ec60ed5ef1e838919d
---
 table/block_based/block_based_table_reader.cc | 39 +++++++++++++------
 table/block_based/block_based_table_reader.h  |  9 +++--
 table/sst_file_dumper.cc                      |  5 ++-
 table/sst_file_dumper.h                       |  4 +-
 table/table_reader.h                          |  3 +-
 tools/sst_dump_tool.cc                        |  9 ++++-
 6 files changed, 49 insertions(+), 20 deletions(-)

diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc
index 581eecc0d90f..094b0e0cb01b 100644
--- a/table/block_based/block_based_table_reader.cc
+++ b/table/block_based/block_based_table_reader.cc
@@ -3232,7 +3232,8 @@ Status BlockBasedTable::GetKVPairsFromDataBlocks(
   return Status::OK();
 }
 
-Status BlockBasedTable::DumpTable(WritableFile* out_file) {
+Status BlockBasedTable::DumpTable(WritableFile* out_file,
+                                  bool show_sequence_number_type) {
   WritableFileStringStreamAdapter out_file_wrapper(out_file);
   std::ostream out_stream(&out_file_wrapper);
   // Output Footer
@@ -3325,15 +3326,15 @@ Status BlockBasedTable::DumpTable(WritableFile* out_file) {
       out_stream << "Range deletions:\n"
                     "--------------------------------------\n";
       for (; range_del_iter->Valid(); range_del_iter->Next()) {
-        DumpKeyValue(range_del_iter->key(), range_del_iter->value(),
-                     out_stream);
+        DumpKeyValue(range_del_iter->key(), range_del_iter->value(), out_stream,
+                     show_sequence_number_type);
       }
       out_stream << "\n";
     }
     delete range_del_iter;
   }
   // Output Data blocks
-  s = DumpDataBlocks(out_stream);
+  s = DumpDataBlocks(out_stream, show_sequence_number_type);
 
   if (!s.ok()) {
     return s;
@@ -3398,7 +3399,8 @@ Status BlockBasedTable::DumpIndexBlock(std::ostream& out_stream) {
   return Status::OK();
 }
 
-Status BlockBasedTable::DumpDataBlocks(std::ostream& out_stream) {
+Status BlockBasedTable::DumpDataBlocks(std::ostream& out_stream,
+                                       bool show_sequence_number_type) {
   // TODO: plumb Env::IOActivity, Env::IOPriority
   const ReadOptions read_options;
   std::unique_ptr<InternalIteratorBase<IndexValue>> blockhandles_iter(
@@ -3455,7 +3457,8 @@ Status BlockBasedTable::DumpDataBlocks(std::ostream& out_stream) {
         out_stream << "Error reading the block - Skipped \n";
         break;
       }
-      DumpKeyValue(datablock_iter->key(), datablock_iter->value(), out_stream);
+      DumpKeyValue(datablock_iter->key(), datablock_iter->value(), out_stream,
+                   show_sequence_number_type);
     }
     out_stream << "\n";
   }
@@ -3477,14 +3480,26 @@ Status BlockBasedTable::DumpDataBlocks(std::ostream& out_stream) {
 }
 
 void BlockBasedTable::DumpKeyValue(const Slice& key, const Slice& value,
-                                   std::ostream& out_stream) {
-  InternalKey ikey;
-  ikey.DecodeFrom(key);
+                                   std::ostream& out_stream,
+                                   bool show_sequence_number_type) {
+  ParsedInternalKey result;
+  auto s = ParseInternalKey(key, &result, true);
+  if (!s.ok()) {
+    out_stream << "Error parsing internal key - Skipped \n";
+    return;
+  }
 
-  out_stream << "  HEX    " << ikey.user_key().ToString(true) << ": "
-             << value.ToString(true) << "\n";
+  if (show_sequence_number_type) {
+    out_stream << "  HEX    " << result.user_key.ToString(true)
+               << "  seq: " << result.sequence
+               << "  type: " << std::to_string(result.type) << " : "
+               << value.ToString(true) << "\n";
+  } else {
+    out_stream << "  HEX    " << result.user_key.ToString(true) << ": "
+               << value.ToString(true) << "\n";
+  }
 
-  std::string str_key = ikey.user_key().ToString();
+  std::string str_key = result.user_key.ToString();
   std::string str_value = value.ToString();
   std::string res_key, res_value;
   char cspace = ' ';
diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h
index bac27ff18f78..b20d0db194e6 100644
--- a/table/block_based/block_based_table_reader.h
+++ b/table/block_based/block_based_table_reader.h
@@ -208,7 +208,8 @@ class BlockBasedTable : public TableReader {
   size_t ApproximateMemoryUsage() const override;
 
   // convert SST file to a human readable form
-  Status DumpTable(WritableFile* out_file) override;
+  Status DumpTable(WritableFile* out_file,
+                   bool show_sequence_number_type = false) override;
 
   Status VerifyChecksum(const ReadOptions& readOptions,
                         TableReaderCaller caller,
@@ -549,9 +550,11 @@ class BlockBasedTable : public TableReader {
 
   // Helper functions for DumpTable()
   Status DumpIndexBlock(std::ostream& out_stream);
-  Status DumpDataBlocks(std::ostream& out_stream);
+  Status DumpDataBlocks(std::ostream& out_stream,
+                        bool show_sequence_number_type = false);
   void DumpKeyValue(const Slice& key, const Slice& value,
-                    std::ostream& out_stream);
+                    std::ostream& out_stream,
+                    bool show_sequence_number_type = false);
 
   // Returns false if prefix_extractor exists and is compatible with that used
   // in building the table file, otherwise true.
diff --git a/table/sst_file_dumper.cc b/table/sst_file_dumper.cc
index 5197eb5383cc..712f8fb0ccc5 100644
--- a/table/sst_file_dumper.cc
+++ b/table/sst_file_dumper.cc
@@ -47,12 +47,13 @@ SstFileDumper::SstFileDumper(const Options& options,
                              Temperature file_temp, size_t readahead_size,
                              bool verify_checksum, bool output_hex,
                              bool decode_blob_index, const EnvOptions& soptions,
-                             bool silent)
+                             bool silent, bool show_sequence_number_type)
     : file_name_(file_path),
       read_num_(0),
       file_temp_(file_temp),
       output_hex_(output_hex),
       decode_blob_index_(decode_blob_index),
+      show_sequence_number_type_(show_sequence_number_type),
       soptions_(soptions),
       silent_(silent),
       options_(options),
@@ -220,7 +221,7 @@ Status SstFileDumper::DumpTable(const std::string& out_filename) {
   Env* env = options_.env;
   Status s = env->NewWritableFile(out_filename, &out_file, soptions_);
   if (s.ok()) {
-    s = table_reader_->DumpTable(out_file.get());
+    s = table_reader_->DumpTable(out_file.get(), show_sequence_number_type_);
   }
   if (!s.ok()) {
     // close the file before return error, ignore the close error if there's any
diff --git a/table/sst_file_dumper.h b/table/sst_file_dumper.h
index 329915fdd662..23a878ba07f5 100644
--- a/table/sst_file_dumper.h
+++ b/table/sst_file_dumper.h
@@ -21,7 +21,8 @@ class SstFileDumper {
                          bool verify_checksum, bool output_hex,
                          bool decode_blob_index,
                          const EnvOptions& soptions = EnvOptions(),
-                         bool silent = false);
+                         bool silent = false,
+                         bool show_sequence_number_type = false);
 
   // read_num_limit limits the total number of keys read. If read_num_limit = 0,
   // then there is no limit. If read_num_limit = 0 or
@@ -79,6 +80,7 @@ class SstFileDumper {
   Temperature file_temp_;
   bool output_hex_;
   bool decode_blob_index_;
+  bool show_sequence_number_type_;
   EnvOptions soptions_;
   // less verbose in stdout/stderr
   bool silent_;
diff --git a/table/table_reader.h b/table/table_reader.h
index 354557db4aa5..4363755210fa 100644
--- a/table/table_reader.h
+++ b/table/table_reader.h
@@ -179,7 +179,8 @@ class TableReader {
   }
 
   // convert db file to a human readable form
-  virtual Status DumpTable(WritableFile* /*out_file*/) {
+  virtual Status DumpTable(WritableFile* /*out_file*/,
+                           bool /*show_sequence_number_type*/ = false) {
     return Status::NotSupported("DumpTable() not supported");
   }
 
diff --git a/tools/sst_dump_tool.cc b/tools/sst_dump_tool.cc
index 67142b0967ce..c288397d34b4 100644
--- a/tools/sst_dump_tool.cc
+++ b/tools/sst_dump_tool.cc
@@ -63,6 +63,9 @@ void print_help(bool to_stderr) {
     --decode_blob_index
       Decode blob indexes and print them in a human-readable format during scans.
 
+    --show_sequence_number_type
+      Show sequence number and value type when executing raw command
+
     --from=<user_key>
       Key to start reading from when executing check|scan
 
@@ -177,6 +180,7 @@ int SSTDumpTool::Run(int argc, char const* const* argv, Options options) {
   bool verify_checksum = false;
   bool output_hex = false;
   bool decode_blob_index = false;
+  bool show_sequence_number_type = false;
   bool input_key_hex = false;
   bool has_from = false;
   bool has_to = false;
@@ -235,6 +239,8 @@ int SSTDumpTool::Run(int argc, char const* const* argv, Options options) {
       output_hex = true;
     } else if (strcmp(argv[i], "--decode_blob_index") == 0) {
       decode_blob_index = true;
+    } else if (strcmp(argv[i], "--show_sequence_number_type") == 0) {
+      show_sequence_number_type = true;
     } else if (strcmp(argv[i], "--input_key_hex") == 0) {
       input_key_hex = true;
     } else if (sscanf(argv[i], "--read_num=%lu%c", (unsigned long*)&n, &junk) ==
@@ -531,7 +537,8 @@ int SSTDumpTool::Run(int argc, char const* const* argv, Options options) {
 
     ROCKSDB_NAMESPACE::SstFileDumper dumper(
         options, filename, Temperature::kUnknown, readahead_size,
-        verify_checksum, output_hex, decode_blob_index);
+        verify_checksum, output_hex, decode_blob_index, EnvOptions(), false,
+        show_sequence_number_type);
 
     // Not a valid SST
     if (!dumper.getStatus().ok()) {

From 707e405492a92d3466ac685c3820e6c55de91f4c Mon Sep 17 00:00:00 2001
From: Xingbo Wang <xbw@meta.com>
Date: Thu, 4 Dec 2025 12:28:01 -0800
Subject: [PATCH 395/500] Revert #14122 "Fix a bug where compaction ..."
 (#14170)

Summary:
Revert "Fix a bug where compaction with range deletion can persist kTypeMaxValid in file metadata (https://github.com/facebook/rocksdb/issues/14122)"

Add a new unit test to capture the situation found by stress test

This reverts commit 8c7c8b8dab04f945a3574941185bdddc3d15a1be.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14170

Test Plan: Unit Test

Reviewed By: anand1976

Differential Revision: D88395956

Pulled By: xingbowang

fbshipit-source-id: 226649dc79a86010ad326ffb2eae35109dc96bc4
---
 db/db_range_del_test.cc              | 187 ++++++++++-----------------
 db/dbformat.h                        |  16 +--
 db/range_del_aggregator.cc           |   4 +
 db/range_del_aggregator_test.cc      |  38 +++---
 db/version_edit.cc                   |   1 -
 db/version_edit.h                    |   3 -
 table/compaction_merging_iterator.cc |  31 ++---
 utilities/debug.cc                   |   2 -
 8 files changed, 110 insertions(+), 172 deletions(-)

diff --git a/db/db_range_del_test.cc b/db/db_range_del_test.cc
index 289f783ab5e2..e22cd5845b09 100644
--- a/db/db_range_del_test.cc
+++ b/db/db_range_del_test.cc
@@ -3826,133 +3826,88 @@ TEST_F(DBRangeDelTest, RowCache) {
   ASSERT_OK(Put(Key(5), "foo"));
 }
 
-TEST_F(DBRangeDelTest, FileCutWithTruncatedRangeDelKey) {
-  // Test for a bug that used to generate files with meta.smallest
-  // containing kMaxValid.
-  //
-  // Setup:
-  // - Write Key(2), Key(3) and DeleteRange(Key(1), Key(4))
-  // - Flush to L0
-  // - Use SingleKeySstPartitioner to force each user key into its own file
-  // - Compact files from L0 to L1 will generate files
-  // File[0]:
-  //   smallest=[user_key=key000001, seq=4, type=15],
-  //   largest= [user_key=key000002, seq=72057594037927935, type=15]
-  // File[1]:
-  //   smallest=[user_key=key000002, seq=2, type=1],
-  //   largest= [user_key=key000003, seq=72057594037927935, type=15]
-  // File[2]:
-  //   smallest=[user_key=key000003, seq=3, type=1],
-  //   largest= [user_key=key000004, seq=72057594037927935, type=15]
-  // With range deletions truncated to each files key range.
-  //
-  // - Compact these files again into L2. RocksDB usede to set truncated
-  // range deletion start key to have value type kMaxValid. The range deletion
-  // start key is used in compaction file cutting decision.
-  // - Verify the file boundary keys after compaction have valid boundary keys
-  //
-  // Before the fix:
-  // File[0]:
-  //   smallest=[user_key=key000001, seq=4, type=15],
-  //   largest= [user_key=key000002, seq=72057594037927935, type=15]
-  // File[1]:
-  //   smallest=[user_key=key000002, seq=2, type=26],
-  //   largest= [user_key=key000003, seq=72057594037927935, type=15]
-  // File[2]:
-  //   smallest=[user_key=key000003, seq=3, type=26],
-  //   largest= [user_key=key000004, seq=72057594037927935, type=15]
-  //
-  // After the fix:
-  // File[0]:
-  //   smallest=[user_key=key000001, seq=4, type=15],
-  //   largest= [user_key=key000002, seq=72057594037927935, type=15]
-  // File[1]:
-  //   smallest=[user_key=key000002, seq=2, type=1],
-  //   largest= [user_key=key000003, seq=72057594037927935, type=15]
-  // File[2]:
-  //   smallest=[user_key=key000003, seq=3, type=1],
-  //   largest= [user_key=key000004, seq=72057594037927935, type=15]
-
-  Options options = CurrentOptions();
-  options.disable_auto_compactions = true;
-
-  // Use partitioner that cuts before every new user key.
-  // Key(x) generates keys of length 9.
-  auto factory = std::shared_ptr<SstPartitionerFactory>(
-      NewSstPartitionerFixedPrefixFactory(10));
-  options.sst_partitioner_factory = factory;
-
-  DestroyAndReopen(options);
-
-  Random rnd(301);
+TEST_F(DBRangeDelTest, SeekForPrevTest) {
+  // open db
+  Options options = GetDefaultOptions();
+  options.create_if_missing = true;
+  options.compaction_style = kCompactionStyleUniversal;
 
-  // Create a file in a lower level so the compactions below are not
-  // bottommost compactions. Range deletion start keys are not considered
-  // in bottommost compaction.
-  ASSERT_OK(Put(Key(3), rnd.RandomBinaryString(100)));
-  ASSERT_OK(Flush());
-  MoveFilesToLevel(6);
-  ASSERT_EQ(1, NumTableFilesAtLevel(6));
+  // add SST partitioner, split sst file with prefix length 2
+  options.sst_partitioner_factory = NewSstPartitionerFixedPrefixFactory(2);
+  Reopen(options);
 
-  ASSERT_OK(Put(Key(2), rnd.RandomString(100)));
-  // Snapshots keep point keys alive.
-  ManagedSnapshot snapshot1(db_);
-  ASSERT_OK(Put(Key(3), rnd.RandomString(100)));
-  ManagedSnapshot snapshot2(db_);
-  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(1),
-                             Key(4)));
-  ASSERT_OK(Flush());
-  ASSERT_EQ(1, NumTableFilesAtLevel(0));
+  // File uses SST partitioner, so it will be split into 3 files
+  // SST file 1: ka1, ka2
+  // SST file 2: kb1
+  // SST file 3: kc1, kc2
+  // Delete range covers from ka2 to kc2, which means record ka2 and kb1, kc1
+  // are covered by the delete range
 
-  ColumnFamilyMetaData cf_meta_l0;
-  db_->GetColumnFamilyMetaData(db_->DefaultColumnFamily(), &cf_meta_l0);
-  ASSERT_EQ(1, cf_meta_l0.levels[0].files.size());
-  std::vector<std::string> l0_filenames;
-  for (const auto& sst_file : cf_meta_l0.levels[0].files) {
-    l0_filenames.push_back(sst_file.name);
+  std::vector<std::pair<std::string, std::string>> kv = {{"ka1", "value_1"},
+                                                         {"ka2", "value_2"},
+                                                         {"kb1", "value_3"},
+                                                         {"kc1", "value_4"},
+                                                         {"kc2", "value_5"}};
+  for (auto& p : kv) {
+    ASSERT_OK(Put(p.first, p.second));
   }
 
-  // Compact L0 files to L1
-  CompactionOptions compact_options_l0;
-  ASSERT_OK(db_->CompactFiles(compact_options_l0, l0_filenames, 1));
-  ASSERT_EQ(3, NumTableFilesAtLevel(1));
-
-  // Check L1 file metadata
-  std::vector<std::vector<FileMetaData>> files_l1;
-  dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &files_l1);
-
-  for (const auto& file : files_l1[1]) {
-    ASSERT_LT(ExtractValueType(file.smallest.Encode()), kTypeMaxValid);
-    ASSERT_LT(ExtractValueType(file.largest.Encode()), kTypeMaxValid);
-  }
+  ASSERT_OK(Flush());
+  // Compact to Lmax, it should have seq 0 now.
+  ASSERT_OK(CompactRange(CompactRangeOptions(), nullptr, nullptr));
 
-  // Get file names from level 1
-  ColumnFamilyMetaData cf_meta;
-  db_->GetColumnFamilyMetaData(db_->DefaultColumnFamily(), &cf_meta);
-  std::vector<std::string> input_filenames;
-  for (const auto& sst_file : cf_meta.levels[1].files) {
-    input_filenames.push_back(sst_file.name);
+  // Open an iterator and create a snapshot, so that keys are not deleted
+  // completely by delete range in SST
+  ReadOptions read_opts;
+  read_opts.snapshot = db_->GetSnapshot();
+  std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+  iter->SeekToFirst();
+  // iterate all the keys and validate the value
+  for (int i = 0; iter->Valid(); iter->Next()) {
+    ASSERT_EQ(kv[i].first, iter->key().ToString());
+    ASSERT_EQ(kv[i].second, iter->value().ToString());
+    i++;
   }
 
-  // Compact files from L1 to L2
-  CompactionOptions compact_options;
-  ASSERT_OK(db_->CompactFiles(compact_options, input_filenames, 2));
-
-  // Check L2 file metadata
-  std::vector<std::vector<FileMetaData>> files;
-  dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &files);
-
-  for (const auto& file : files[2]) {
-    ASSERT_LT(ExtractValueType(file.smallest.Encode()), kTypeMaxValid);
-    ASSERT_LT(ExtractValueType(file.largest.Encode()), kTypeMaxValid);
-  }
+  // use delete range to delete the record
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "ka2",
+                             "kc2"));
+  // Flush
+  ASSERT_OK(Flush());
+  // Compact to Lmax
+  ASSERT_OK(CompactRange(CompactRangeOptions(), nullptr, nullptr));
 
-  // // Verify iteration works correctly
-  std::unique_ptr<Iterator> iter{db_->NewIterator(ReadOptions())};
-  iter->SeekToFirst();
+  // Close the iterator and release the snapshot.
   ASSERT_OK(iter->status());
-  ASSERT_FALSE(iter->Valid());
+  iter.reset();
+  db_->ReleaseSnapshot(read_opts.snapshot);
+
+  // create second iterator, seek each key and validate result
+  std::unique_ptr<Iterator> iter2(db_->NewIterator(ReadOptions()));
+  // Validate keys are deleted
+  iter2->SeekToFirst();
+  ASSERT_TRUE(iter2->Valid());
+  ASSERT_EQ("ka1", iter2->key().ToString());
+  iter2->Next();
+  ASSERT_TRUE(iter2->Valid());
+  ASSERT_EQ("kc2", iter2->key().ToString());
+  iter2->Next();
+  ASSERT_FALSE(iter2->Valid());
+
+  // Validate seek for prev result
+  for (auto& p : kv) {
+    iter2->SeekForPrev(p.first);
+    ASSERT_TRUE(iter2->Valid());
+    if (p.first == "kc2") {
+      ASSERT_EQ("kc2", iter2->key().ToString());
+    } else {
+      ASSERT_EQ("ka1", iter2->key().ToString());
+    }
+  }
+  ASSERT_OK(iter2->status());
+  iter2.reset();
 }
+
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/db/dbformat.h b/db/dbformat.h
index e50380858774..0ee6e9272b5f 100644
--- a/db/dbformat.h
+++ b/db/dbformat.h
@@ -72,12 +72,6 @@ enum ValueType : unsigned char {
   kTypeColumnFamilyWideColumnEntity = 0x17,     // WAL only
   kTypeValuePreferredSeqno = 0x18,              // Value with a unix write time
   kTypeColumnFamilyValuePreferredSeqno = 0x19,  // WAL only
-  // Placeholder value type for legacy SST files with incorrectly persisted
-  // file boundaries. Prior to the fix, TruncatedRangeDelIterator assigned
-  // kTypeMaxValid to truncated range deletion keys, which was then
-  // incorrectly persisted to SST file metadata. This placeholder value allows
-  // reading such legacy files for without using kTypeMaxValid.
-  kTypeTruncatedRangeDeletionSentinel = 0x1A,
   kTypeMaxValid,    // Should be after the last valid type, only used for
                     // validation
   kMaxValue = 0x7F  // Not used for storing records.
@@ -124,11 +118,10 @@ inline bool IsValueType(ValueType t) {
 
 // Checks whether a type is from user operation
 // kTypeRangeDeletion is in meta block so this API is separated from above
-// kTypeTruncatedRangeDeletionSentinel is for legacy files with incorrectly
-// persisted file boundaries.
+// kTypeMaxValid can be from keys generated by
+// TruncatedRangeDelIterator::start_key()
 inline bool IsExtendedValueType(ValueType t) {
-  return IsValueType(t) || t == kTypeRangeDeletion ||
-         t == kTypeTruncatedRangeDeletionSentinel;
+  return IsValueType(t) || t == kTypeRangeDeletion || t == kTypeMaxValid;
 }
 
 // We leave eight bits empty at the bottom so a type and sequence#
@@ -187,7 +180,8 @@ inline size_t InternalKeyEncodingLength(const ParsedInternalKey& key) {
 // Pack a sequence number and a ValueType into a uint64_t
 inline uint64_t PackSequenceAndType(uint64_t seq, ValueType t) {
   assert(seq <= kMaxSequenceNumber);
-  assert(IsExtendedValueType(t));
+  // kTypeMaxValid is used in TruncatedRangeDelIterator, see its constructor.
+  assert(IsExtendedValueType(t) || t == kTypeMaxValid);
   return (seq << 8) | t;
 }
 
diff --git a/db/range_del_aggregator.cc b/db/range_del_aggregator.cc
index 8b389bac5468..f41521e1162a 100644
--- a/db/range_del_aggregator.cc
+++ b/db/range_del_aggregator.cc
@@ -36,6 +36,7 @@ TruncatedRangeDelIterator::TruncatedRangeDelIterator(
     Status pik_status = ParseInternalKey(smallest->Encode(), &parsed_smallest,
                                          false /* log_err_key */);  // TODO
     pik_status.PermitUncheckedError();
+    parsed_smallest.type = kTypeMaxValid;
     assert(pik_status.ok());
     smallest_ = &parsed_smallest;
   }
@@ -70,6 +71,9 @@ TruncatedRangeDelIterator::TruncatedRangeDelIterator(
       // the truncated end key can cover the largest key in this sstable, reduce
       // its sequence number by 1.
       parsed_largest.sequence -= 1;
+      // This line is not needed for correctness, but it ensures that the
+      // truncated end key is not covering keys from the next SST file.
+      parsed_largest.type = kTypeMaxValid;
     }
     largest_ = &parsed_largest;
   }
diff --git a/db/range_del_aggregator_test.cc b/db/range_del_aggregator_test.cc
index 41fbfbb9249d..89391c924d93 100644
--- a/db/range_del_aggregator_test.cc
+++ b/db/range_del_aggregator_test.cc
@@ -89,9 +89,7 @@ void VerifyIterator(
   for (size_t i = 0; i < expected_range_dels.size(); i++, iter->Next()) {
     ASSERT_TRUE(iter->Valid());
     EXPECT_EQ(0, icmp.Compare(iter->start_key(), expected_range_dels[i].start));
-    EXPECT_EQ(0, icmp.Compare(iter->end_key(), expected_range_dels[i].end))
-        << iter->end_key().DebugString(false, false) << " "
-        << expected_range_dels[i].end.DebugString(false, false);
+    EXPECT_EQ(0, icmp.Compare(iter->end_key(), expected_range_dels[i].end));
     EXPECT_EQ(expected_range_dels[i].seq, iter->seq());
   }
   EXPECT_FALSE(iter->Valid());
@@ -307,28 +305,28 @@ TEST_F(RangeDelAggregatorTest, TruncatedIterPartiallyCutTombstones) {
 
   VerifyIterator(
       &iter, bytewise_icmp,
-      {{InternalValue("d", 7, kTypeValue), UncutEndpoint("e"), 10},
+      {{InternalValue("d", 7, kTypeMaxValid), UncutEndpoint("e"), 10},
        {InternalValue("e", 8, kTypeRangeDeletion), UncutEndpoint("g"), 8},
        {InternalValue("j", 4, kTypeRangeDeletion),
-        InternalValue("m", 8, kTypeValue), 4}});
+        InternalValue("m", 8, kTypeMaxValid), 4}});
 
   VerifySeek(
       &iter, bytewise_icmp,
-      {{"d", InternalValue("d", 7, kTypeValue), UncutEndpoint("e"), 10},
+      {{"d", InternalValue("d", 7, kTypeMaxValid), UncutEndpoint("e"), 10},
        {"e", InternalValue("e", 8, kTypeRangeDeletion), UncutEndpoint("g"), 8},
        {"ia", InternalValue("j", 4, kTypeRangeDeletion),
-        InternalValue("m", 8, kTypeValue), 4, false /* invalid */},
+        InternalValue("m", 8, kTypeMaxValid), 4, false /* invalid */},
        {"n", InternalValue("", 0, kTypeRangeDeletion), UncutEndpoint(""), 0,
         true /* invalid */},
-       {"", InternalValue("d", 7, kTypeValue), UncutEndpoint("e"), 10}});
+       {"", InternalValue("d", 7, kTypeMaxValid), UncutEndpoint("e"), 10}});
 
   VerifySeekForPrev(
       &iter, bytewise_icmp,
-      {{"d", InternalValue("d", 7, kTypeValue), UncutEndpoint("e"), 10},
+      {{"d", InternalValue("d", 7, kTypeMaxValid), UncutEndpoint("e"), 10},
        {"e", InternalValue("e", 8, kTypeRangeDeletion), UncutEndpoint("g"), 8},
        {"ia", InternalValue("e", 8, kTypeRangeDeletion), UncutEndpoint("g"), 8},
        {"n", InternalValue("j", 4, kTypeRangeDeletion),
-        InternalValue("m", 8, kTypeValue), 4, false /* invalid */},
+        InternalValue("m", 8, kTypeMaxValid), 4, false /* invalid */},
        {"", InternalValue("", 0, kTypeRangeDeletion), UncutEndpoint(""), 0,
         true /* invalid */}});
 }
@@ -347,21 +345,23 @@ TEST_F(RangeDelAggregatorTest, TruncatedIterFullyCutTombstones) {
   TruncatedRangeDelIterator iter(std::move(input_iter), &bytewise_icmp,
                                  &smallest, &largest);
 
-  VerifyIterator(&iter, bytewise_icmp,
-                 {{InternalValue("f", 7, kTypeValue), UncutEndpoint("g"), 8}});
+  VerifyIterator(
+      &iter, bytewise_icmp,
+      {{InternalValue("f", 7, kTypeMaxValid), UncutEndpoint("g"), 8}});
 
-  VerifySeek(&iter, bytewise_icmp,
-             {{"d", InternalValue("f", 7, kTypeValue), UncutEndpoint("g"), 8},
-              {"f", InternalValue("f", 7, kTypeValue), UncutEndpoint("g"), 8},
-              {"j", InternalValue("", 0, kTypeRangeDeletion), UncutEndpoint(""),
-               0, true /* invalid */}});
+  VerifySeek(
+      &iter, bytewise_icmp,
+      {{"d", InternalValue("f", 7, kTypeMaxValid), UncutEndpoint("g"), 8},
+       {"f", InternalValue("f", 7, kTypeMaxValid), UncutEndpoint("g"), 8},
+       {"j", InternalValue("", 0, kTypeRangeDeletion), UncutEndpoint(""), 0,
+        true /* invalid */}});
 
   VerifySeekForPrev(
       &iter, bytewise_icmp,
       {{"d", InternalValue("", 0, kTypeRangeDeletion), UncutEndpoint(""), 0,
         true /* invalid */},
-       {"f", InternalValue("f", 7, kTypeValue), UncutEndpoint("g"), 8},
-       {"j", InternalValue("f", 7, kTypeValue), UncutEndpoint("g"), 8}});
+       {"f", InternalValue("f", 7, kTypeMaxValid), UncutEndpoint("g"), 8},
+       {"j", InternalValue("f", 7, kTypeMaxValid), UncutEndpoint("g"), 8}});
 }
 
 TEST_F(RangeDelAggregatorTest, SingleIterInAggregator) {
diff --git a/db/version_edit.cc b/db/version_edit.cc
index afc9128d45ad..88150181bf4c 100644
--- a/db/version_edit.cc
+++ b/db/version_edit.cc
@@ -30,7 +30,6 @@ uint64_t PackFileNumberAndPathId(uint64_t number, uint64_t path_id) {
 Status FileMetaData::UpdateBoundaries(const Slice& key, const Slice& value,
                                       SequenceNumber seqno,
                                       ValueType value_type) {
-  assert(value_type < kTypeMaxValid);
   if (value_type == kTypeBlobIndex) {
     BlobIndex blob_index;
     const Status s = blob_index.DecodeFrom(value);
diff --git a/db/version_edit.h b/db/version_edit.h
index 2f0543b19d34..8ed83cc4a8ed 100644
--- a/db/version_edit.h
+++ b/db/version_edit.h
@@ -317,9 +317,6 @@ struct FileMetaData {
   void UpdateBoundariesForRange(const InternalKey& start,
                                 const InternalKey& end, SequenceNumber seqno,
                                 const InternalKeyComparator& icmp) {
-    assert(ExtractValueType(start.Encode()) < kTypeMaxValid);
-    assert(ExtractValueType(end.Encode()) < kTypeMaxValid);
-
     if (smallest.size() == 0 || icmp.Compare(start, smallest) < 0) {
       smallest = start;
     }
diff --git a/table/compaction_merging_iterator.cc b/table/compaction_merging_iterator.cc
index b67b63472d63..6c9dabb3ec12 100644
--- a/table/compaction_merging_iterator.cc
+++ b/table/compaction_merging_iterator.cc
@@ -191,6 +191,13 @@ class CompactionMergingIterator : public InternalIterator {
 
     bool operator()(HeapItem* a, HeapItem* b) const {
       int r = comparator_->Compare(a->key(), b->key());
+      // For each file, we assume all range tombstone start keys come before
+      // its file boundary sentinel key (file's meta.largest key).
+      // In the case when meta.smallest = meta.largest and range tombstone start
+      // key is truncated at meta.smallest, the start key will have op_type =
+      // kMaxValid to make it smaller (see TruncatedRangeDelIterator
+      // constructor). The following assertion validates this assumption.
+      assert(a->type == b->type || r != 0);
       return r > 0;
     }
 
@@ -235,24 +242,8 @@ class CompactionMergingIterator : public InternalIterator {
     return !minHeap_.empty() ? minHeap_.top() : nullptr;
   }
 
-  // For each file under a LevelIterator, the lifetime of range tombstone
-  // iterator is tied to the point key iterator. So we want scan through
-  // all range tombstone start keys before the file boundary sentinel key
-  // (file's meta.largest). When meta.smallest == meta.largest, the truncated
-  // range del start key may be ordered after meta.largest.
-  // Here we skip the first range deletion start key if it's truncated.
-  // This range deletion start key is redundant for compaction file cutting
-  // decision anyway, since the same point key will be considered for file
-  // cutting too.
-  void InsertNextValidRangeTombstoneAtLevel(size_t level) {
+  void InsertRangeTombstoneAtLevel(size_t level) {
     if (range_tombstone_iters_[level]->Valid()) {
-      if (range_tombstone_iters_[level]->start_key().type !=
-          kTypeRangeDeletion) {
-        range_tombstone_iters_[level]->Next();
-        if (!range_tombstone_iters_[level]->Valid()) {
-          return;
-        }
-      }
       pinned_heap_item_[level].SetTombstoneForCompaction(
           range_tombstone_iters_[level]->start_key());
       minHeap_.push(&pinned_heap_item_[level]);
@@ -271,7 +262,7 @@ void CompactionMergingIterator::SeekToFirst() {
   for (size_t i = 0; i < range_tombstone_iters_.size(); ++i) {
     if (range_tombstone_iters_[i]) {
       range_tombstone_iters_[i]->SeekToFirst();
-      InsertNextValidRangeTombstoneAtLevel(i);
+      InsertRangeTombstoneAtLevel(i);
     }
   }
 
@@ -299,7 +290,7 @@ void CompactionMergingIterator::Seek(const Slice& target) {
                  0) {
         range_tombstone_iters_[i]->Next();
       }
-      InsertNextValidRangeTombstoneAtLevel(i);
+      InsertRangeTombstoneAtLevel(i);
     }
   }
 
@@ -366,7 +357,7 @@ void CompactionMergingIterator::FindNextVisibleKey() {
       minHeap_.pop();
     }
     if (range_tombstone_iters_[current->level]) {
-      InsertNextValidRangeTombstoneAtLevel(current->level);
+      InsertRangeTombstoneAtLevel(current->level);
     }
   }
 }
diff --git a/utilities/debug.cc b/utilities/debug.cc
index 6bfd00b72c7a..59e6d46880f5 100644
--- a/utilities/debug.cc
+++ b/utilities/debug.cc
@@ -41,8 +41,6 @@ static std::unordered_map<std::string, ValueType> value_type_string_map = {
     {"TypeValuePreferredSeqno", ValueType::kTypeValuePreferredSeqno},
     {"TypeColumnFamilyValuePreferredSeqno",
      ValueType::kTypeColumnFamilyValuePreferredSeqno},
-    {"kTypeTruncatedRangeDeletionSentinel",
-     ValueType::kTypeTruncatedRangeDeletionSentinel},
 };
 
 std::string KeyVersion::GetTypeName() const {

From 7c48905ecd9e385b878d3a1b4611823e680daf39 Mon Sep 17 00:00:00 2001
From: Xingbo Wang <xbw@meta.com>
Date: Thu, 4 Dec 2025 17:04:43 -0800
Subject: [PATCH 396/500] Fix missing const for arg of OptionChangeMigration
 (#14173)

Summary:
Fix missing const for arg of OptionChangeMigration

We switched from std::string to std::string & for API OptionChangeMigration, which caused const qualifier to be lost at call site, which causes compilation failure.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14173

Test Plan: Unit test

Reviewed By: pdillinger

Differential Revision: D88431457

Pulled By: xingbowang

fbshipit-source-id: a705f3b80cc5ff56dab73aa6a31c940798d8df45
---
 include/rocksdb/utilities/option_change_migration.h          | 2 +-
 utilities/option_change_migration/option_change_migration.cc | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/rocksdb/utilities/option_change_migration.h b/include/rocksdb/utilities/option_change_migration.h
index ff941e0cf9b8..5c13329dc130 100644
--- a/include/rocksdb/utilities/option_change_migration.h
+++ b/include/rocksdb/utilities/option_change_migration.h
@@ -25,7 +25,7 @@ namespace ROCKSDB_NAMESPACE {
 // with `Options::compaction_options_fifo.max_table_files_size` > 0 can cause
 // the whole DB to be dropped right after migration if the migrated data is
 // larger than `max_table_files_size`
-Status OptionChangeMigration(std::string& dbname, const Options& old_opts,
+Status OptionChangeMigration(const std::string& dbname, const Options& old_opts,
                              const Options& new_opts);
 
 // Multi-CF version: Prepares a database with multiple column families to be
diff --git a/utilities/option_change_migration/option_change_migration.cc b/utilities/option_change_migration/option_change_migration.cc
index 66703034bb6d..9a17daca5ced 100644
--- a/utilities/option_change_migration/option_change_migration.cc
+++ b/utilities/option_change_migration/option_change_migration.cc
@@ -341,7 +341,7 @@ Status OptionChangeMigration(
   return s;
 }
 
-Status OptionChangeMigration(std::string& dbname, const Options& old_opts,
+Status OptionChangeMigration(const std::string& dbname, const Options& old_opts,
                              const Options& new_opts) {
   DBOptions old_db_opts(old_opts);
   DBOptions new_db_opts(new_opts);

From e3b54647859670105c0699566d6ef733542ae4d7 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Thu, 4 Dec 2025 20:39:10 -0800
Subject: [PATCH 397/500] GitHub Actions nightly crash test runs on ARM
 (#14172)

Summary:
To help find potential issues not showing up in ARM unit tests. I'm running it with and without TransactionDB (write-committed) for better coverage. The job expands the size of /dev/shm for adequate space on maximum performance storage, and adds swap space to reduce risk of OOM in case we fill that up.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14172

Test Plan: earlier drafts of this PR added the job to PR jobs, and the last before putting in "nightly" can be seen here: https://github.com/facebook/rocksdb/actions/runs/19945493840/job/57193797390?pr=14172

Reviewed By: archang19

Differential Revision: D88429479

Pulled By: pdillinger

fbshipit-source-id: bd4d9cda9256950c3c6c126c299a44dbbbc30c7e
---
 .github/workflows/nightly.yml | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml
index 2fe599ef1a1a..8fe6172da05a 100644
--- a/.github/workflows/nightly.yml
+++ b/.github/workflows/nightly.yml
@@ -110,6 +110,20 @@ jobs:
       - run: sudo apt-get update && sudo apt-get install -y build-essential libgflags-dev
       - run: make V=1 J=4 -j4 check
       - uses: "./.github/actions/post-steps"
+  build-linux-arm-crashtest:
+    if: ${{ github.repository_owner == 'facebook' }}
+    runs-on:
+      labels: 4-core-ubuntu-arm
+    steps:
+    - uses: actions/checkout@v4.1.0
+    - uses: "./.github/actions/pre-steps"
+    - run: sudo apt-get update && sudo apt-get install -y build-essential libgflags-dev libsnappy-dev zlib1g-dev libbz2-dev liblz4-dev libzstd-dev
+    - run: sudo mount -o remount,size=16G /dev/shm
+    - run: sudo dd bs=1048576 count=4096 if=/dev/zero of=/swapfile && sudo chmod 600 /swapfile && sudo mkswap /swapfile && sudo swapon /swapfile
+    - run: ulimit -S -n `ulimit -H -n` && make V=1 -j8 CRASH_TEST_EXT_ARGS='--duration=1800 --max_key=2500000' blackbox_crash_test_with_atomic_flush
+    - run: rm -rf /dev/shm/rocksdb.*
+    - run: ulimit -S -n `ulimit -H -n` && make V=1 -j8 CRASH_TEST_EXT_ARGS='--duration=1800 --max_key=2500000' blackbox_crash_test_with_multiops_wc_txn
+    - uses: "./.github/actions/post-steps"
   build-examples:
     if: ${{ github.repository_owner == 'facebook' }}
     runs-on:

From 5d0cf98e6cea7b5b3be7716d531bc1967b25f0fb Mon Sep 17 00:00:00 2001
From: anand76 <anand1976@users.noreply.github.com>
Date: Fri, 5 Dec 2025 10:45:26 -0800
Subject: [PATCH 398/500] Surface MultiScan async read failure instead of
 asserting (#14171)

Summary:
Crash tests have been failing of late with this assertion failure - db_stress: `./table/block_based/block_based_table_iterator.h:656: void rocksdb::BlockBasedTableIterator::PrepareReadAsyncCallBack(rocksdb::FSReadRequest &, void *): Assertion `async_state->status.IsAborted()' failed.` Instead of asserting, surface the failure status so we can troubleshoot.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14171

Reviewed By: xingbowang

Differential Revision: D88396654

Pulled By: anand1976

fbshipit-source-id: 8d59d7ace0c522c17b7af17c50e16af876911bad
---
 table/block_based/block_based_table_iterator.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/table/block_based/block_based_table_iterator.h b/table/block_based/block_based_table_iterator.h
index 8c44d0c9be0e..a12570d0e78c 100644
--- a/table/block_based/block_based_table_iterator.h
+++ b/table/block_based/block_based_table_iterator.h
@@ -652,8 +652,6 @@ class BlockBasedTableIterator : public InternalIteratorBase<Slice> {
             std::to_string(async_state->offset) + " and async callback " +
             std::to_string(req.offset));
       }
-    } else {
-      assert(async_state->status.IsAborted());
     }
   }
 

From 80c4a67d6aaaee1755a8ccb88c6a02eb75fb91ff Mon Sep 17 00:00:00 2001
From: nsaji-stripe <nsaji@stripe.com>
Date: Mon, 8 Dec 2025 10:08:19 -0800
Subject: [PATCH 399/500] Remote Compaction C API (#14136)

Summary:
r? cbi42

Exposes RocksDB's remote compaction functionality through the C API, enabling C/FFI clients (Go, Rust, Python, etc.) to offload compaction work to remote workers.

## API Components
### Compaction Service

Create service with schedule, wait, cancel, and on_installation callbacks
Ownership transfers to options object (auto-destroyed, no manual cleanup)

### Job Info (13 getters)

DB/CF metadata and compaction details (priority, reason, levels, flags)

### Schedule Response

Create with job ID and status (validated with errptr)
Status: success, failure, aborted, use_local

### OpenAndCompact (for remote workers)

Execute compaction on worker node with environment/comparator overrides
Cancellation support via atomic flags

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14136

Reviewed By: hx235

Differential Revision: D88316558

Pulled By: jaykorean

fbshipit-source-id: 60a0fee69ff1e650dd785d96ec656649263214f8
---
 db/c.cc             | 428 +++++++++++++++++++++++++++++++++++++++++++-
 db/c_test.c         | 295 ++++++++++++++++++++++++++++++
 include/rocksdb/c.h | 174 ++++++++++++++++++
 3 files changed, 896 insertions(+), 1 deletion(-)

diff --git a/db/c.cc b/db/c.cc
index b02c7bc4bd19..9f058c55aba4 100644
--- a/db/c.cc
+++ b/db/c.cc
@@ -71,6 +71,11 @@ using ROCKSDB_NAMESPACE::CompactionFilterFactory;
 using ROCKSDB_NAMESPACE::CompactionJobInfo;
 using ROCKSDB_NAMESPACE::CompactionOptionsFIFO;
 using ROCKSDB_NAMESPACE::CompactionReason;
+using ROCKSDB_NAMESPACE::CompactionService;
+using ROCKSDB_NAMESPACE::CompactionServiceJobInfo;
+using ROCKSDB_NAMESPACE::CompactionServiceJobStatus;
+using ROCKSDB_NAMESPACE::CompactionServiceOptionsOverride;
+using ROCKSDB_NAMESPACE::CompactionServiceScheduleResponse;
 using ROCKSDB_NAMESPACE::CompactRangeOptions;
 using ROCKSDB_NAMESPACE::Comparator;
 using ROCKSDB_NAMESPACE::CompressionType;
@@ -107,6 +112,7 @@ using ROCKSDB_NAMESPACE::NewCompactOnDeletionCollectorFactory;
 using ROCKSDB_NAMESPACE::NewGenericRateLimiter;
 using ROCKSDB_NAMESPACE::NewLRUCache;
 using ROCKSDB_NAMESPACE::NewRibbonFilterPolicy;
+using ROCKSDB_NAMESPACE::OpenAndCompactOptions;
 using ROCKSDB_NAMESPACE::OptimisticTransactionDB;
 using ROCKSDB_NAMESPACE::OptimisticTransactionOptions;
 using ROCKSDB_NAMESPACE::Options;
@@ -349,6 +355,22 @@ struct rocksdb_statistics_histogram_data_t {
   HistogramData rep;
 };
 
+struct rocksdb_compactionservice_scheduleresponse_t {
+  CompactionServiceScheduleResponse rep;
+};
+
+struct rocksdb_compactionservice_jobinfo_t {
+  CompactionServiceJobInfo rep;
+};
+
+struct rocksdb_compaction_service_options_override_t {
+  CompactionServiceOptionsOverride rep;
+};
+
+struct rocksdb_open_and_compact_options_t {
+  OpenAndCompactOptions rep;
+};
+
 struct rocksdb_compactionfilter_t : public CompactionFilter {
   void* state_;
   void (*destructor_)(void*);
@@ -643,6 +665,410 @@ static inline char* CopyString(const Slice& slice) {
   return result;
 }
 
+const char* rocksdb_compactionservice_jobinfo_t_get_db_name(
+    const rocksdb_compactionservice_jobinfo_t* info, size_t* len) {
+  *len = info->rep.db_name.size();
+  return info->rep.db_name.data();
+}
+
+const char* rocksdb_compactionservice_jobinfo_t_get_db_id(
+    const rocksdb_compactionservice_jobinfo_t* info, size_t* len) {
+  *len = info->rep.db_id.size();
+  return info->rep.db_id.data();
+}
+
+const char* rocksdb_compactionservice_jobinfo_t_get_db_session_id(
+    const rocksdb_compactionservice_jobinfo_t* info, size_t* len) {
+  *len = info->rep.db_session_id.size();
+  return info->rep.db_session_id.data();
+}
+
+const char* rocksdb_compactionservice_jobinfo_t_get_cf_name(
+    const rocksdb_compactionservice_jobinfo_t* info, size_t* len) {
+  *len = info->rep.cf_name.size();
+  return info->rep.cf_name.data();
+}
+
+uint32_t rocksdb_compactionservice_jobinfo_t_get_cf_id(
+    const rocksdb_compactionservice_jobinfo_t* info) {
+  return info->rep.cf_id;
+}
+
+uint64_t rocksdb_compactionservice_jobinfo_t_get_job_id(
+    const rocksdb_compactionservice_jobinfo_t* info) {
+  return info->rep.job_id;
+}
+
+int rocksdb_compactionservice_jobinfo_t_get_priority(
+    const rocksdb_compactionservice_jobinfo_t* info) {
+  return static_cast<int>(info->rep.priority);
+}
+
+int rocksdb_compactionservice_jobinfo_t_get_compaction_reason(
+    const rocksdb_compactionservice_jobinfo_t* info) {
+  return static_cast<int>(info->rep.compaction_reason);
+}
+
+int rocksdb_compactionservice_jobinfo_t_get_base_input_level(
+    const rocksdb_compactionservice_jobinfo_t* info) {
+  return info->rep.base_input_level;
+}
+
+int rocksdb_compactionservice_jobinfo_t_get_output_level(
+    const rocksdb_compactionservice_jobinfo_t* info) {
+  return info->rep.output_level;
+}
+
+unsigned char rocksdb_compactionservice_jobinfo_t_is_full_compaction(
+    const rocksdb_compactionservice_jobinfo_t* info) {
+  return info->rep.is_full_compaction;
+}
+
+unsigned char rocksdb_compactionservice_jobinfo_t_is_manual_compaction(
+    const rocksdb_compactionservice_jobinfo_t* info) {
+  return info->rep.is_manual_compaction;
+}
+
+unsigned char rocksdb_compactionservice_jobinfo_t_is_bottommost_level(
+    const rocksdb_compactionservice_jobinfo_t* info) {
+  return info->rep.bottommost_level;
+}
+
+// Helper function to validate compaction service job status
+static inline bool IsValidCompactionServiceJobStatus(int status) {
+  return status >= rocksdb_compactionservice_jobstatus_success &&
+         status <= rocksdb_compactionservice_jobstatus_use_local;
+}
+
+rocksdb_compactionservice_scheduleresponse_t*
+rocksdb_compactionservice_scheduleresponse_create(const char* scheduled_job_id,
+                                                  int status, char** errptr) {
+  // Validate status is in range [success=0, failure=1, aborted=2, use_local=3]
+  if (!IsValidCompactionServiceJobStatus(status)) {
+    SaveError(errptr,
+              Status::InvalidArgument("Invalid status value. Must be 0-3."));
+    return nullptr;
+  }
+
+  rocksdb_compactionservice_scheduleresponse_t* response =
+      new rocksdb_compactionservice_scheduleresponse_t{
+          CompactionServiceScheduleResponse(
+              scheduled_job_id ? std::string(scheduled_job_id) : "",
+              static_cast<CompactionServiceJobStatus>(status))};
+  return response;
+}
+
+rocksdb_compactionservice_scheduleresponse_t*
+rocksdb_compactionservice_scheduleresponse_create_with_status(int status,
+                                                              char** errptr) {
+  // Validate status is in range [success=0, failure=1, aborted=2, use_local=3]
+  if (!IsValidCompactionServiceJobStatus(status)) {
+    SaveError(errptr,
+              Status::InvalidArgument("Invalid status value. Must be 0-3."));
+    return nullptr;
+  }
+
+  rocksdb_compactionservice_scheduleresponse_t* response =
+      new rocksdb_compactionservice_scheduleresponse_t{
+          CompactionServiceScheduleResponse(
+              static_cast<CompactionServiceJobStatus>(status))};
+  return response;
+}
+
+void rocksdb_compactionservice_scheduleresponse_t_destroy(
+    rocksdb_compactionservice_scheduleresponse_t* response) {
+  if (response) {
+    delete response;
+  }
+}
+
+int rocksdb_compactionservice_scheduleresponse_getstatus(
+    const rocksdb_compactionservice_scheduleresponse_t* response) {
+  if (!response) {
+    return rocksdb_compactionservice_jobstatus_failure;
+  }
+  return static_cast<int>(response->rep.status);
+}
+
+const char* rocksdb_compactionservice_scheduleresponse_get_scheduled_job_id(
+    const rocksdb_compactionservice_scheduleresponse_t* response, size_t* len) {
+  if (!response || !len) {
+    if (len) {
+      *len = 0;
+    }
+    return "";
+  }
+  *len = response->rep.scheduled_job_id.size();
+  return response->rep.scheduled_job_id.data();
+}
+
+struct rocksdb_compactionservice_t : public CompactionService {
+  void* state_;
+  void (*destructor_)(void*);
+  rocksdb_compaction_service_schedule_cb schedule_;
+  std::string name_;
+  rocksdb_compaction_service_wait_cb wait_;
+  rocksdb_compaction_service_cancel_awaiting_jobs_cb cancel_awaiting_jobs_;
+  rocksdb_compaction_service_on_installation_cb on_installation_;
+
+  rocksdb_compactionservice_t(
+      void* state, void (*destructor)(void*),
+      rocksdb_compaction_service_schedule_cb
+          rocksdb_compaction_service_schedule_ptr,
+      const char* name, rocksdb_compaction_service_wait_cb wait,
+      rocksdb_compaction_service_cancel_awaiting_jobs_cb cancel_awaiting_jobs,
+      rocksdb_compaction_service_on_installation_cb on_installation)
+      : state_(state),
+        destructor_(destructor),
+        schedule_(rocksdb_compaction_service_schedule_ptr),
+        name_(name ? name : "CompactionService"),
+        wait_(wait),
+        cancel_awaiting_jobs_(cancel_awaiting_jobs),
+        on_installation_(on_installation) {}
+
+  ~rocksdb_compactionservice_t() override {
+    if (destructor_) {
+      (*destructor_)(state_);
+    }
+  }
+
+  const char* Name() const override { return name_.c_str(); }
+
+  CompactionServiceScheduleResponse Schedule(
+      const CompactionServiceJobInfo& info,
+      const std::string& compaction_service_input) override {
+    if (schedule_ == nullptr) {
+      return CompactionServiceScheduleResponse(
+          CompactionServiceJobStatus::kUseLocal);
+    }
+
+    rocksdb_compactionservice_scheduleresponse_t* c_response = (*schedule_)(
+        state_,
+        reinterpret_cast<const rocksdb_compactionservice_jobinfo_t*>(&info),
+        compaction_service_input.data(), compaction_service_input.size());
+
+    if (c_response == nullptr) {
+      return CompactionServiceScheduleResponse(
+          CompactionServiceJobStatus::kFailure);
+    }
+
+    CompactionServiceScheduleResponse response = std::move(c_response->rep);
+    delete c_response;
+    return response;
+  }
+
+  CompactionServiceJobStatus Wait(const std::string& scheduled_job_id,
+                                  std::string* result) override {
+    if (wait_ == nullptr) {
+      return CompactionServiceJobStatus::kUseLocal;
+    }
+
+    char* c_result = nullptr;
+    size_t result_len = 0;
+
+    int status =
+        (*wait_)(state_, scheduled_job_id.c_str(), &c_result, &result_len);
+
+    if (c_result != nullptr) {
+      if (result != nullptr) {
+        result->assign(c_result, result_len);
+      }
+      free(c_result);
+    }
+
+    return static_cast<CompactionServiceJobStatus>(status);
+  }
+
+  void CancelAwaitingJobs() override {
+    if (cancel_awaiting_jobs_ != nullptr) {
+      (*cancel_awaiting_jobs_)(state_);
+    }
+  }
+
+  void OnInstallation(const std::string& scheduled_job_id,
+                      CompactionServiceJobStatus status) override {
+    if (on_installation_ != nullptr) {
+      (*on_installation_)(state_, scheduled_job_id.c_str(),
+                          static_cast<int>(status));
+    }
+  }
+};
+
+rocksdb_compactionservice_t* rocksdb_compactionservice_create(
+    void* state, void (*destructor)(void*),
+    rocksdb_compaction_service_schedule_cb schedule, const char* name,
+    rocksdb_compaction_service_wait_cb wait,
+    rocksdb_compaction_service_cancel_awaiting_jobs_cb cancel_awaiting_jobs,
+    rocksdb_compaction_service_on_installation_cb on_installation) {
+  return new rocksdb_compactionservice_t(state, destructor, schedule, name,
+                                         wait, cancel_awaiting_jobs,
+                                         on_installation);
+}
+
+void rocksdb_options_set_compaction_service(
+    rocksdb_options_t* opt, rocksdb_compactionservice_t* service) {
+  if (!opt || !service) {
+    return;
+  }
+
+  opt->rep.compaction_service = std::shared_ptr<CompactionService>(service);
+}
+
+// CompactionServiceOptionsOverride functions
+rocksdb_compaction_service_options_override_t*
+rocksdb_compaction_service_options_override_create() {
+  return new rocksdb_compaction_service_options_override_t;
+}
+
+void rocksdb_compaction_service_options_override_destroy(
+    rocksdb_compaction_service_options_override_t* override_options) {
+  if (override_options) {
+    delete override_options;
+  }
+}
+
+void rocksdb_compaction_service_options_override_set_env(
+    rocksdb_compaction_service_options_override_t* override_options,
+    rocksdb_env_t* env) {
+  if (override_options && env) {
+    override_options->rep.env = env->rep;
+  }
+}
+
+void rocksdb_compaction_service_options_override_set_comparator(
+    rocksdb_compaction_service_options_override_t* override_options,
+    rocksdb_comparator_t* comparator) {
+  if (override_options && comparator) {
+    override_options->rep.comparator = comparator;
+  }
+}
+
+// Atomic bool management for cancellation
+unsigned char* rocksdb_open_and_compact_canceled_create() {
+  return reinterpret_cast<unsigned char*>(new std::atomic<bool>(false));
+}
+
+void rocksdb_open_and_compact_canceled_destroy(unsigned char* canceled) {
+  if (canceled) {
+    delete reinterpret_cast<std::atomic<bool>*>(canceled);
+  }
+}
+
+void rocksdb_open_and_compact_canceled_set(unsigned char* canceled,
+                                           unsigned char value) {
+  if (canceled) {
+    reinterpret_cast<std::atomic<bool>*>(canceled)->store(value != 0);
+  }
+}
+
+// OpenAndCompactOptions functions
+rocksdb_open_and_compact_options_t* rocksdb_open_and_compact_options_create() {
+  return new rocksdb_open_and_compact_options_t;
+}
+
+void rocksdb_open_and_compact_options_destroy(
+    rocksdb_open_and_compact_options_t* options) {
+  if (options) {
+    delete options;
+  }
+}
+
+void rocksdb_open_and_compact_options_set_canceled(
+    rocksdb_open_and_compact_options_t* options, unsigned char* canceled) {
+  if (options && canceled) {
+    options->rep.canceled = reinterpret_cast<std::atomic<bool>*>(canceled);
+  }
+}
+
+void rocksdb_open_and_compact_options_set_allow_resumption(
+    rocksdb_open_and_compact_options_t* options,
+    unsigned char allow_resumption) {
+  if (options) {
+    options->rep.allow_resumption = allow_resumption != 0;
+  }
+}
+
+// OpenAndCompact functions
+char* rocksdb_open_and_compact(
+    const char* db_path, const char* output_directory, const char* input,
+    size_t input_len, size_t* output_len,
+    const rocksdb_compaction_service_options_override_t* override_options,
+    char** errptr) {
+  if (!db_path || !output_directory || !input || !override_options) {
+    SaveError(errptr, Status::InvalidArgument("Invalid arguments"));
+    return nullptr;
+  }
+
+  std::string input_str(input, input_len);
+  std::string output_str;
+
+  Status s = DB::OpenAndCompact(db_path, output_directory, input_str,
+                                &output_str, override_options->rep);
+
+  if (!s.ok()) {
+    SaveError(errptr, s);
+    return nullptr;
+  }
+
+  // Allocate +1 for null terminator
+  char* result = static_cast<char*>(malloc(output_str.size() + 1));
+  if (!result) {
+    SaveError(errptr, Status::MemoryLimit("Failed to allocate output buffer"));
+    return nullptr;
+  }
+
+  memcpy(result, output_str.data(), output_str.size());
+  result[output_str.size()] = '\0';
+
+  // Only set output_len after successful allocation
+  if (output_len) {
+    *output_len = output_str.size();
+  }
+
+  return result;
+}
+
+char* rocksdb_open_and_compact_with_options(
+    const rocksdb_open_and_compact_options_t* options, const char* db_path,
+    const char* output_directory, const char* input, size_t input_len,
+    size_t* output_len,
+    const rocksdb_compaction_service_options_override_t* override_options,
+    char** errptr) {
+  if (!options || !db_path || !output_directory || !input ||
+      !override_options) {
+    SaveError(errptr, Status::InvalidArgument("Invalid arguments"));
+    return nullptr;
+  }
+
+  std::string input_str(input, input_len);
+  std::string output_str;
+
+  Status s = DB::OpenAndCompact(options->rep, db_path, output_directory,
+                                input_str, &output_str, override_options->rep);
+
+  if (!s.ok()) {
+    SaveError(errptr, s);
+    return nullptr;
+  }
+
+  // Allocate +1 for null terminator
+  char* result = static_cast<char*>(malloc(output_str.size() + 1));
+  if (!result) {
+    SaveError(errptr, Status::MemoryLimit("Failed to allocate output buffer"));
+    return nullptr;
+  }
+
+  memcpy(result, output_str.data(), output_str.size());
+  result[output_str.size()] = '\0';  // Null terminate
+
+  // Only set output_len after successful allocation
+  if (output_len) {
+    *output_len = output_str.size();
+  }
+
+  return result;
+}
+
 rocksdb_t* rocksdb_open(const rocksdb_options_t* options, const char* name,
                         char** errptr) {
   DB* db;
@@ -7841,7 +8267,7 @@ rocksdb_memory_usage_t* rocksdb_approximate_memory_usage_create(
     dbs.push_back(db->rep);
   }
 
-  unordered_set<const Cache*> cache_set;
+  std::unordered_set<const Cache*> cache_set;
   for (auto cache : consumers->caches) {
     cache_set.insert(const_cast<const Cache*>(cache->rep.get()));
   }
diff --git a/db/c_test.c b/db/c_test.c
index ca5a76fba063..7a0612a224b3 100644
--- a/db/c_test.c
+++ b/db/c_test.c
@@ -722,6 +722,88 @@ static void LoadAndCheckLatestOptions(const char* db_name, rocksdb_env_t* env,
                                       num_column_families);
 }
 
+// Global state for tracking remote compaction calls
+typedef struct {
+  int schedule_called;
+  int wait_called;
+  int cancel_called;
+  char last_scheduled_job_id[256];
+  char last_db_name[256];
+} RemoteCompactionState;
+
+// Schedule callback - gets called when compaction is scheduled
+static rocksdb_compactionservice_scheduleresponse_t* RemoteCompactionSchedule(
+    void* state, const rocksdb_compactionservice_jobinfo_t* info,
+    const char* input, size_t input_len) {
+  (void)input;
+  (void)input_len;
+  RemoteCompactionState* rcs = (RemoteCompactionState*)state;
+  rcs->schedule_called++;
+
+  // Extract job info
+  size_t db_name_len;
+  const char* db_name =
+      rocksdb_compactionservice_jobinfo_t_get_db_name(info, &db_name_len);
+  memcpy(rcs->last_db_name, db_name, db_name_len);
+  rcs->last_db_name[db_name_len] = '\0';
+
+  // Generate a job ID
+  snprintf(rcs->last_scheduled_job_id, sizeof(rcs->last_scheduled_job_id),
+           "job-%d", rcs->schedule_called);
+
+  // Create response with success status
+  char* err = NULL;
+  rocksdb_compactionservice_scheduleresponse_t* response =
+      rocksdb_compactionservice_scheduleresponse_create(
+          rcs->last_scheduled_job_id,
+          rocksdb_compactionservice_jobstatus_success, &err);
+  if (err) {
+    free(err);
+  }
+  return response;
+}
+
+// Wait callback - simulates waiting for remote compaction to complete
+static int RemoteCompactionWait(void* state, const char* scheduled_job_id,
+                                char** result, size_t* result_len) {
+  RemoteCompactionState* rcs = (RemoteCompactionState*)state;
+  rcs->wait_called++;
+
+  if (strcmp(scheduled_job_id, rcs->last_scheduled_job_id) != 0) {
+    return rocksdb_compactionservice_jobstatus_failure;
+  }
+
+  // For testing purposes, return kUseLocal to cause RocksDB to fall back to
+  // local compaction. This tests the callback mechanism without needing a fully
+  // serialized result. In a real scenario, this would communicate with a remote
+  // worker that calls rocksdb_open_and_compact() and returns a properly
+  // serialized CompactionServiceResult
+  *result = NULL;
+  *result_len = 0;
+
+  return rocksdb_compactionservice_jobstatus_use_local;
+}
+
+// Cancel callback - cancels pending jobs
+static void RemoteCompactionCancel(void* state) {
+  RemoteCompactionState* rcs = (RemoteCompactionState*)state;
+  rcs->cancel_called++;
+}
+
+// Destructor callback
+static void RemoteCompactionDestroy(void* state) { (void)state; }
+
+// NULL schedule callback for testing failure handling
+static rocksdb_compactionservice_scheduleresponse_t* NullSchedule(
+    void* state, const rocksdb_compactionservice_jobinfo_t* info,
+    const char* input, size_t input_len) {
+  (void)state;
+  (void)info;
+  (void)input;
+  (void)input_len;
+  return NULL;  // Return NULL to simulate failure
+}
+
 int main(int argc, char** argv) {
   (void)argc;
   (void)argv;
@@ -4483,6 +4565,219 @@ int main(int argc, char** argv) {
     rocksdb_cache_destroy(lru);
   }
 
+  StartPhase("remote_compaction_service");
+  {
+    RemoteCompactionState remote_state = {0, 0, 0, "", ""};
+
+    // Create compaction service
+    rocksdb_compactionservice_t* service = rocksdb_compactionservice_create(
+        &remote_state,             // state
+        RemoteCompactionDestroy,   // destructor
+        RemoteCompactionSchedule,  // schedule callback
+        "TestRemoteCompaction",    // name
+        RemoteCompactionWait,      // wait callback
+        RemoteCompactionCancel,    // cancel_awaiting_jobs
+        NULL);                     // on_installation
+
+    // Create options with remote compaction
+    rocksdb_options_t* remote_options = rocksdb_options_create();
+    rocksdb_options_set_create_if_missing(remote_options, 1);
+    rocksdb_options_set_level0_file_num_compaction_trigger(remote_options, 2);
+    rocksdb_options_set_write_buffer_size(remote_options,
+                                          64 * 1024);  // 64KB buffer
+    rocksdb_options_set_max_bytes_for_level_base(remote_options,
+                                                 256 * 1024);  // 256KB
+    rocksdb_options_set_target_file_size_base(
+        remote_options, 64 * 1024);  // 64KB target file size
+    // Disable automatic compactions to test manual compaction only
+    rocksdb_options_set_disable_auto_compactions(remote_options, 1);
+    rocksdb_options_set_compaction_service(remote_options, service);
+
+    // Destroy old DB and create new one
+    rocksdb_close(db);
+    rocksdb_destroy_db(remote_options, dbname, &err);
+    CheckNoError(err);
+
+    db = rocksdb_open(remote_options, dbname, &err);
+    CheckNoError(err);
+
+    // Create multiple SST files to trigger compaction
+    rocksdb_flushoptions_t* flush_opts = rocksdb_flushoptions_create();
+    rocksdb_flushoptions_set_wait(flush_opts, 1);
+
+    // Write and flush multiple times to create multiple L0 files
+    // Write more data with larger values to ensure files are substantial
+    for (int batch = 0; batch < 5; batch++) {
+      for (int i = 0; i < 200; i++) {
+        char key[20], val[1000];
+        snprintf(key, sizeof(key), "key%d_%d", batch, i);
+        // Fill value with repeated data to make it larger
+        memset(val, 'a' + (batch % 26), sizeof(val) - 1);
+        val[sizeof(val) - 1] = '\0';
+        rocksdb_put(db, woptions, key, strlen(key), val, strlen(val), &err);
+        CheckNoError(err);
+      }
+      rocksdb_flush(db, flush_opts, &err);
+      CheckNoError(err);
+    }
+    rocksdb_flushoptions_destroy(flush_opts);
+
+    // Trigger manual compaction to invoke remote compaction service
+    rocksdb_compact_range(db, NULL, 0, NULL, 0);
+
+    rocksdb_wait_for_compact_options_t* wco =
+        rocksdb_wait_for_compact_options_create();
+    rocksdb_wait_for_compact(db, wco, &err);
+    CheckNoError(err);
+    rocksdb_wait_for_compact_options_destroy(wco);
+
+    // Verify that callbacks were actually called
+    CheckCondition(remote_state.schedule_called > 0);
+    CheckCondition(remote_state.wait_called > 0);
+    CheckCondition(strlen(remote_state.last_db_name) > 0);
+    CheckCondition(strstr(remote_state.last_db_name, "rocksdb_c_test") != NULL);
+
+    // Verify data is still accessible after remote compaction
+    // Just check a few keys to verify data integrity
+    for (int batch = 0; batch < 5; batch++) {
+      char key[20];
+      snprintf(key, sizeof(key), "key%d_0", batch);
+      size_t vallen;
+      char* val = rocksdb_get(db, roptions, key, strlen(key), &vallen, &err);
+      CheckNoError(err);
+      CheckCondition(val != NULL);
+      CheckCondition(vallen == 999);  // strlen of 1000-byte string
+      free(val);
+    }
+
+    // Test cancellation API directly
+    RemoteCompactionCancel(&remote_state);
+    CheckCondition(remote_state.cancel_called > 0);
+
+    // Cleanup
+    rocksdb_close(db);
+    rocksdb_destroy_db(remote_options, dbname, &err);
+    CheckNoError(err);
+    rocksdb_options_destroy(remote_options);
+
+    // Reopen DB with original options for subsequent tests
+    db = rocksdb_open(options, dbname, &err);
+    CheckNoError(err);
+  }
+
+  StartPhase("remote_compaction_scheduleresponse");
+  {
+    // Test scheduleresponse creation and getters
+    rocksdb_compactionservice_scheduleresponse_t* response;
+
+    // Test success response
+    err = NULL;
+    response = rocksdb_compactionservice_scheduleresponse_create(
+        "test-job-123", rocksdb_compactionservice_jobstatus_success, &err);
+    CheckNoError(err);
+    CheckCondition(response != NULL);
+    CheckCondition(
+        rocksdb_compactionservice_scheduleresponse_getstatus(response) ==
+        rocksdb_compactionservice_jobstatus_success);
+
+    size_t job_id_len;
+    const char* job_id =
+        rocksdb_compactionservice_scheduleresponse_get_scheduled_job_id(
+            response, &job_id_len);
+    CheckCondition(job_id_len == strlen("test-job-123"));
+    CheckCondition(memcmp(job_id, "test-job-123", job_id_len) == 0);
+    rocksdb_compactionservice_scheduleresponse_t_destroy(response);
+
+    // Test failure response
+    response = rocksdb_compactionservice_scheduleresponse_create_with_status(
+        rocksdb_compactionservice_jobstatus_failure, &err);
+    CheckCondition(response != NULL);
+    CheckCondition(
+        rocksdb_compactionservice_scheduleresponse_getstatus(response) ==
+        rocksdb_compactionservice_jobstatus_failure);
+    rocksdb_compactionservice_scheduleresponse_t_destroy(response);
+
+    response = rocksdb_compactionservice_scheduleresponse_create_with_status(
+        999, &err);
+    CheckCondition(response == NULL);  // Invalid status
+    if (err) {
+      Free(&err);
+    }
+  }
+
+  StartPhase("remote_compaction_options_override");
+  {
+    // Test CompactionServiceOptionsOverride API
+    rocksdb_compaction_service_options_override_t* override_opts =
+        rocksdb_compaction_service_options_override_create();
+    CheckCondition(override_opts != NULL);
+
+    // Set up override options
+    rocksdb_compaction_service_options_override_set_env(override_opts, env);
+    rocksdb_compaction_service_options_override_set_comparator(override_opts,
+                                                               cmp);
+
+    rocksdb_compaction_service_options_override_destroy(override_opts);
+  }
+
+  StartPhase("remote_compaction_null_callback_handling");
+  {
+    // Test that NULL callback returns are handled gracefully
+    // This simulates a failure in the remote compaction service
+    rocksdb_compactionservice_t* null_service =
+        rocksdb_compactionservice_create(NULL, NULL, NullSchedule,
+                                         "NullTestService", NULL, NULL, NULL);
+
+    rocksdb_options_t* null_opts = rocksdb_options_create();
+    rocksdb_options_set_create_if_missing(null_opts, 1);
+    rocksdb_options_set_compaction_service(null_opts, null_service);
+
+    const char* null_db = "rocksdb_c_test_null_service";
+
+    rocksdb_t* null_db_handle = rocksdb_open(null_opts, null_db, &err);
+    CheckNoError(err);
+
+    // Write data and trigger compaction
+    for (int i = 0; i < 100; i++) {
+      char key[20], val[50];
+      snprintf(key, sizeof(key), "key%d", i);
+      snprintf(val, sizeof(val), "val%d", i);
+      rocksdb_put(null_db_handle, woptions, key, strlen(key), val, strlen(val),
+                  &err);
+      CheckNoError(err);
+    }
+
+    // This should fall back to local compaction (not crash)
+    rocksdb_compact_range(null_db_handle, NULL, 0, NULL, 0);
+
+    // Data should still be readable
+    CheckGet(null_db_handle, roptions, "key50", "val50");
+
+    rocksdb_close(null_db_handle);
+    rocksdb_destroy_db(null_opts, null_db, &err);
+    rocksdb_options_destroy(null_opts);
+  }
+
+  StartPhase("remote_compaction_canceled_flag");
+  {
+    // Test atomic cancellation flag API
+    unsigned char* canceled = rocksdb_open_and_compact_canceled_create();
+    CheckCondition(canceled != NULL);
+
+    // Set cancellation
+    rocksdb_open_and_compact_canceled_set(canceled, 1);
+
+    // Use with OpenAndCompactOptions
+    rocksdb_open_and_compact_options_t* oac_opts =
+        rocksdb_open_and_compact_options_create();
+    rocksdb_open_and_compact_options_set_canceled(oac_opts, canceled);
+    rocksdb_open_and_compact_options_set_allow_resumption(oac_opts, 1);
+
+    // Cleanup
+    rocksdb_open_and_compact_options_destroy(oac_opts);
+    rocksdb_open_and_compact_canceled_destroy(canceled);
+  }
+
   StartPhase("sst_file_manager");
   {
     rocksdb_sst_file_manager_t* sst_file_manager;
diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h
index ce9e5229c824..ffb5583e4aca 100644
--- a/include/rocksdb/c.h
+++ b/include/rocksdb/c.h
@@ -166,6 +166,31 @@ typedef struct rocksdb_writestallinfo_t rocksdb_writestallinfo_t;
 typedef struct rocksdb_writestallcondition_t rocksdb_writestallcondition_t;
 typedef struct rocksdb_memtableinfo_t rocksdb_memtableinfo_t;
 
+// Remote Compaction typedef
+typedef struct rocksdb_compactionservice_scheduleresponse_t
+    rocksdb_compactionservice_scheduleresponse_t;
+typedef struct rocksdb_compactionservice_jobinfo_t
+    rocksdb_compactionservice_jobinfo_t;
+typedef struct rocksdb_compactionservice_t rocksdb_compactionservice_t;
+typedef struct rocksdb_compaction_service_options_override_t
+    rocksdb_compaction_service_options_override_t;
+typedef struct rocksdb_open_and_compact_options_t
+    rocksdb_open_and_compact_options_t;
+typedef rocksdb_compactionservice_scheduleresponse_t* (
+    *rocksdb_compaction_service_schedule_cb)(
+    void* state, const rocksdb_compactionservice_jobinfo_t* info,
+    const char* compaction_service_input, size_t input_len);
+
+typedef int (*rocksdb_compaction_service_wait_cb)(void* state,
+                                                  const char* scheduled_job_id,
+                                                  char** result,
+                                                  size_t* result_len);
+
+typedef void (*rocksdb_compaction_service_cancel_awaiting_jobs_cb)(void* state);
+
+typedef void (*rocksdb_compaction_service_on_installation_cb)(
+    void* state, const char* scheduled_job_id, int status);
+
 /* DB operations */
 
 extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open(
@@ -3512,6 +3537,155 @@ extern ROCKSDB_LIBRARY_API unsigned char rocksdb_get_into_buffer_cf(
     size_t keylen, char* buffer, size_t buffer_size, size_t* vallen,
     unsigned char* found, char** errptr);
 
+// Remote compaction
+enum {
+  rocksdb_compactionservice_jobstatus_success = 0,
+  rocksdb_compactionservice_jobstatus_failure = 1,
+  rocksdb_compactionservice_jobstatus_aborted = 2,
+  rocksdb_compactionservice_jobstatus_use_local = 3,
+};
+
+extern ROCKSDB_LIBRARY_API rocksdb_compactionservice_scheduleresponse_t*
+rocksdb_compactionservice_scheduleresponse_create(const char* scheduled_job_id,
+                                                  int status, char** errptr);
+
+extern ROCKSDB_LIBRARY_API rocksdb_compactionservice_scheduleresponse_t*
+rocksdb_compactionservice_scheduleresponse_create_with_status(int status,
+                                                              char** errptr);
+
+extern ROCKSDB_LIBRARY_API int
+rocksdb_compactionservice_scheduleresponse_getstatus(
+    const rocksdb_compactionservice_scheduleresponse_t* response);
+
+extern ROCKSDB_LIBRARY_API const char*
+rocksdb_compactionservice_scheduleresponse_get_scheduled_job_id(
+    const rocksdb_compactionservice_scheduleresponse_t* response, size_t* len);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_compactionservice_scheduleresponse_t_destroy(
+    rocksdb_compactionservice_scheduleresponse_t* response);
+
+extern ROCKSDB_LIBRARY_API const char*
+rocksdb_compactionservice_jobinfo_t_get_db_name(
+    const rocksdb_compactionservice_jobinfo_t* info, size_t* len);
+
+extern ROCKSDB_LIBRARY_API const char*
+rocksdb_compactionservice_jobinfo_t_get_db_id(
+    const rocksdb_compactionservice_jobinfo_t* info, size_t* len);
+
+extern ROCKSDB_LIBRARY_API const char*
+rocksdb_compactionservice_jobinfo_t_get_db_session_id(
+    const rocksdb_compactionservice_jobinfo_t* info, size_t* len);
+
+extern ROCKSDB_LIBRARY_API const char*
+rocksdb_compactionservice_jobinfo_t_get_cf_name(
+    const rocksdb_compactionservice_jobinfo_t* info, size_t* len);
+
+extern ROCKSDB_LIBRARY_API uint32_t
+rocksdb_compactionservice_jobinfo_t_get_cf_id(
+    const rocksdb_compactionservice_jobinfo_t* info);
+
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_compactionservice_jobinfo_t_get_job_id(
+    const rocksdb_compactionservice_jobinfo_t* info);
+
+extern ROCKSDB_LIBRARY_API int rocksdb_compactionservice_jobinfo_t_get_priority(
+    const rocksdb_compactionservice_jobinfo_t* info);
+
+extern ROCKSDB_LIBRARY_API int
+rocksdb_compactionservice_jobinfo_t_get_compaction_reason(
+    const rocksdb_compactionservice_jobinfo_t* info);
+
+extern ROCKSDB_LIBRARY_API int
+rocksdb_compactionservice_jobinfo_t_get_base_input_level(
+    const rocksdb_compactionservice_jobinfo_t* info);
+
+extern ROCKSDB_LIBRARY_API int
+rocksdb_compactionservice_jobinfo_t_get_output_level(
+    const rocksdb_compactionservice_jobinfo_t* info);
+
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_compactionservice_jobinfo_t_is_full_compaction(
+    const rocksdb_compactionservice_jobinfo_t* info);
+
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_compactionservice_jobinfo_t_is_manual_compaction(
+    const rocksdb_compactionservice_jobinfo_t* info);
+
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_compactionservice_jobinfo_t_is_bottommost_level(
+    const rocksdb_compactionservice_jobinfo_t* info);
+
+extern ROCKSDB_LIBRARY_API rocksdb_compactionservice_t*
+rocksdb_compactionservice_create(
+    void* state, void (*destructor)(void*),
+    rocksdb_compaction_service_schedule_cb schedule, const char* name,
+    rocksdb_compaction_service_wait_cb wait,
+    rocksdb_compaction_service_cancel_awaiting_jobs_cb cancel_awaiting_jobs,
+    rocksdb_compaction_service_on_installation_cb on_installation);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compaction_service(
+    rocksdb_options_t* options, rocksdb_compactionservice_t* service);
+
+// CompactionServiceOptionsOverride
+extern ROCKSDB_LIBRARY_API rocksdb_compaction_service_options_override_t*
+rocksdb_compaction_service_options_override_create(void);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_compaction_service_options_override_destroy(
+    rocksdb_compaction_service_options_override_t* override_options);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_compaction_service_options_override_set_env(
+    rocksdb_compaction_service_options_override_t* override_options,
+    rocksdb_env_t* env);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_compaction_service_options_override_set_comparator(
+    rocksdb_compaction_service_options_override_t* override_options,
+    rocksdb_comparator_t* comparator);
+
+// Atomic bool management for cancellation
+// Creates an atomic bool that can be used for cancellation.
+// User must call rocksdb_open_and_compact_canceled_destroy() to free it.
+extern ROCKSDB_LIBRARY_API unsigned char*
+rocksdb_open_and_compact_canceled_create(void);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_open_and_compact_canceled_destroy(
+    unsigned char* canceled);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_open_and_compact_canceled_set(
+    unsigned char* canceled, unsigned char value);
+
+// OpenAndCompactOptions
+extern ROCKSDB_LIBRARY_API rocksdb_open_and_compact_options_t*
+rocksdb_open_and_compact_options_create(void);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_open_and_compact_options_destroy(
+    rocksdb_open_and_compact_options_t* options);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_open_and_compact_options_set_canceled(
+    rocksdb_open_and_compact_options_t* options, unsigned char* canceled);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_open_and_compact_options_set_allow_resumption(
+    rocksdb_open_and_compact_options_t* options,
+    unsigned char allow_resumption);
+
+// OpenAndCompact - main functions
+extern ROCKSDB_LIBRARY_API char* rocksdb_open_and_compact(
+    const char* db_path, const char* output_directory, const char* input,
+    size_t input_len, size_t* output_len,
+    const rocksdb_compaction_service_options_override_t* override_options,
+    char** errptr);
+
+extern ROCKSDB_LIBRARY_API char* rocksdb_open_and_compact_with_options(
+    const rocksdb_open_and_compact_options_t* options, const char* db_path,
+    const char* output_directory, const char* input, size_t input_len,
+    size_t* output_len,
+    const rocksdb_compaction_service_options_override_t* override_options,
+    char** errptr);
+
 #ifdef __cplusplus
 } /* end extern "C" */
 #endif

From eedf1fe06828a9e530cc20d55060c887112189f5 Mon Sep 17 00:00:00 2001
From: Hui Xiao <huixiao@fb.com>
Date: Tue, 9 Dec 2025 10:35:41 -0800
Subject: [PATCH 400/500] Display copy-paste friendly flag value in
 db_crashtest.py (#14180)

Summary:
**Context/Summary:**
Stress test flag printed by db_crashtest.py like `./db_stres ....-secondary_cache_uri=compressed_secondary_cache://capacity=8388608;enable_custom_split_merge=true --otherflags=xxxx` is not copy-paste-run friendly. Directly running this command will cause parsing hiccups due to special characters like // or ;. This PR made the db_crashtest.py print a single-quoted value so at least the copy-paste-run works for unix-like shell (the most common case).

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14180

Test Plan:
`python3 tools/db_crashtest.py --simple blackbox ...` display the following

Before fix, no single-quoted
```
Use random seed for iteration 9698536012932546857
Running db_stress with pid=1280640:./db_stress --secondary_cache_uri=compressed_secondary_cache://capacity=8388608;enable_custom_split_merge=true  ...

// Directly copy, paste and run the ./db_stress command will encounter
Error: Read(-readpercent=0)+Prefix(-prefixpercent=0)+Write(-writepercent=45)+Delete(-delpercent=0)+DeleteRange(-delrangepercent=30)+Iterate(-iterpercent=40)+CustomOps(-customopspercent=0) percents != 100!
bash: --set_options_one_in=0: command not found
```
After fix, has single-quoted
```
se random seed for iteration 6017815530972723112
Running db_stress with pid=1234632: ./db_stress --secondary_cache_uri='compressed_secondary_cache://capacity=8388608;enable_custom_split_merge=true' ....

// Directly copy, paste and run the ./db_stress command is fine
```

Reviewed By: archang19

Differential Revision: D88688584

Pulled By: hx235

fbshipit-source-id: 88b8b2de7c2c5619b6e19900f4144dcd8e032f7b
---
 tools/db_crashtest.py | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index 64eb676d7cfc..1606679404eb 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -5,6 +5,7 @@
 import math
 import os
 import random
+import shlex
 import shutil
 import subprocess
 import sys
@@ -22,6 +23,18 @@ def get_random_seed(override):
         return override
 
 
+def quote_arg_for_display(arg):
+    """
+    Quote only the value after '=' for shell display.
+    This makes the printed command safe to copy/paste into a Unix shell.
+    Note: shlex is Unix-focused; Non-Unix shell users may need to adjust quoting after copying.
+    """
+    if "=" not in arg:
+        return arg
+    flag, value = arg.split("=", 1)
+    return f"{flag}={shlex.quote(value)}"
+
+
 def setup_random_seed_before_main():
     parser = argparse.ArgumentParser()
     parser.add_argument(
@@ -1324,7 +1337,10 @@ def gen_cmd(params, unknown_params):
 
 def execute_cmd(cmd, timeout=None, timeout_pstack=False):
     child = subprocess.Popen(cmd, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
-    print("Running db_stress with pid=%d: %s\n\n" % (child.pid, " ".join(cmd)))
+    print(
+        "Running db_stress with pid=%d: %s\n\n"
+        % (child.pid, " ".join(quote_arg_for_display(arg) for arg in cmd))
+    )
     pid = child.pid
 
     try:

From a1d83185635b05a907767e481ac0449980daeb73 Mon Sep 17 00:00:00 2001
From: Hui Xiao <huixiao@fb.com>
Date: Thu, 11 Dec 2025 16:50:42 -0800
Subject: [PATCH 401/500] Fix resumable compaction to prevent resumption at
 truncated range deletion boundaries (#14184)

Summary:
**Context/Summary:**

Truncated range deletion in input files can be output by CompactionIterator with type kMaxValid instead of kTypeRangeDeletion, to satisfy ordering requirement between the truncated range deletion start key and a file's point keys. There was a plan to skip such key in https://github.com/facebook/rocksdb/pull/14122 but blockers remain to fulfill the plan.

Resumable compaction is not able to handle resumption from range deletion well at this point and should consider kMaxValid type same as kTypeRangeDeletion for resumption. Previously, it didn't and mistakenly allow resumption from a delete range. That led to an assertion failure, complaining about lacking information to update file boundaries in the presence of range deletion needed during cutting an output file, after the compaction resumes from that delete range and happens to cut the output file shortly after without any point keys in between.

```
frame https://github.com/facebook/rocksdb/issues/9: 0x00007f4f4743bc93 libc.so.6`__GI___assert_fail(assertion="meta.smallest.size() > 0", file="db/compaction/compaction_outputs.cc", line=530, function="rocksdb::Status rocksdb::CompactionOutputs::AddRangeDels(rocksdb::CompactionRangeDelAggregator&, const rocksdb::Slice*, const rocksdb::Slice*, rocksdb::CompactionIterationStats&, bool, const rocksdb::InternalKeyComparator&, rocksdb::SequenceNumber, std::pair<long unsigned int, long unsigned int>, const rocksdb::Slice&, const string&)") at assert.c:101:3
frame https://github.com/facebook/rocksdb/issues/10: 0x00007f4f4808c68c librocksdb.so.10.9`rocksdb::CompactionOutputs::AddRangeDels(this=0x00007f4f0c27e1a0, range_del_agg=0x00007f4f0c21ecc0, comp_start_user_key=0x0000000000000000, comp_end_user_key=0x0000000000000000, range_del_out_stats=0x00007f4f0dffa140, bottommost_level=false, icmp=0x00007f4ef4c93040, earliest_snapshot=13108729, keep_seqno_range=<unavailable>, next_table_min_key=0x00007f4ef4c8f540, full_history_ts_low="") at compaction_outputs.cc:530:7
frame https://github.com/facebook/rocksdb/issues/11: 0x00007f4f480480dd librocksdb.so.10.9`rocksdb::CompactionJob::FinishCompactionOutputFile(this=0x00007f4f0dffb890, input_status=<unavailable>, prev_table_last_internal_key=0x00007f4f0dffa650, next_table_min_key=0x00007f4ef4c8f540, comp_start_user_key=0x0000000000000000, comp_end_user_key=0x0000000000000000, c_iter=0x00007f4ef4c8f400, sub_compact=0x00007f4f0c27e000, outputs=0x00007f4f0c27e1a0) at compaction_job.cc:1917:31
```

This PR simply prevents  MaxValid from being a resumption point like regular range deletion - see commit 842d66eb18ea67e965d6acb1fce12c18eeb778d2

Besides that, the PR also improves the testing, variable naming, logging in resumable compaction codes that were needed to debug this assertion failure - see commit https://github.com/facebook/rocksdb/pull/14184/commits/aecd4e7f971f6dd4df672d9e5f1409fe4747c561. These improvements are covered by existing tests.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14184

Test Plan:
- The stress initially surfaced the error. Using the exact same LSM shapes and files that were used in stress test but in a unit test, I'm able to get a deterministic repro and confirmed the fix resolves the error.  This is the repro test https://github.com/hx235/rocksdb/commit/1075936e693c68c960761855900c53f5b894f57a
```
./compaction_service_test --gtest_filter=ResumableCompactionServiceTest.CompactSpecificFilesFromExistingDBWithCancelAndResume
# Before fix
[==========] Running 1 test from 1 test case.
[----------] Global test environment set-up.
[----------] 1 test from ResumableCompactionServiceTest
[ RUN      ] ResumableCompactionServiceTest.CompactSpecificFilesFromExistingDBWithCancelAndResume
compaction_service_test: db/compaction/compaction_outputs.cc:530: rocksdb::Status rocksdb::CompactionOutputs::AddRangeDels(rocksdb::CompactionRangeDelAggregator&, const rocksdb::Slice*, const rocksdb::Slice*, rocksdb::CompactionIterationStats&, bool, const rocksdb::InternalKeyComparator&, rocksdb::SequenceNumber, std::pair<long unsigned int, long unsigned int>, const rocksdb::Slice&, const string&): Assertion `meta.smallest.size() > 0' failed.
Received signal 6 (Aborted)
Invoking GDB for stack trace...
[New LWP 2621610]
[New LWP 2621611]
[New LWP 2621612]
[New LWP 2621613]
[New LWP 2621614]
[New LWP 2621630]
[New LWP 2621631]

# After fix
Note: Google Test filter = ResumableCompactionServiceTest.CompactSpecificFilesFromExistingDBWithCancelAndResume
[==========] Running 1 test from 1 test case.
[----------] Global test environment set-up.
[----------] 1 test from ResumableCompactionServiceTest
[ RUN      ] ResumableCompactionServiceTest.CompactSpecificFilesFromExistingDBWithCancelAndResume
[       OK ] ResumableCompactionServiceTest.CompactSpecificFilesFromExistingDBWithCancelAndResume (4722 ms)
[----------] 1 test from ResumableCompactionServiceTest (4722 ms total)

[----------] Global test environment tear-down
[==========] 1 test from 1 test case ran. (4722 ms total)
[  PASSED  ] 1 test.

```
- Follow-up: I tried a couple time to coerce the truncated range delete from scratch in the unit test but failed doing so. Considering kMaxValid may not be outputted by compaction iterator anymore after https://github.com/facebook/rocksdb/pull/14122/files gets landed again (and obsolete the bug) ADN the simple nature of this fix 842d66eb18ea67e965d6acb1fce12c18eeb778d2 AND the worst case of such fix going wrong is just less resumption, I decided to leave writing a unit test to coerce truncated ranged deletion from scratch a follow-up. Maybe I will draw inspiration from https://github.com/facebook/rocksdb/pull/14122/files.

Reviewed By: jaykorean

Differential Revision: D88912663

Pulled By: hx235

fbshipit-source-id: 80a01135684c8fea659650faaa00c2dc452c482a
---
 db/compaction/compaction_job.cc               | 58 ++++++++++---------
 db/compaction/compaction_job.h                |  4 +-
 db/compaction/compaction_job_test.cc          | 54 +++++++++--------
 db/compaction/compaction_outputs.cc           |  6 +-
 db/compaction/compaction_outputs.h            |  2 +-
 db/compaction/subcompaction_state.cc          |  4 +-
 db/compaction/subcompaction_state.h           |  2 +-
 db/db_impl/db_impl_secondary.cc               | 15 +++--
 db/version_edit.h                             | 11 ++--
 .../truncated_range_del_resume_compaction.md  |  1 +
 10 files changed, 90 insertions(+), 67 deletions(-)
 create mode 100644 unreleased_history/bug_fixes/truncated_range_del_resume_compaction.md

diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc
index 3f1fd9546d43..3d51f8fd5410 100644
--- a/db/compaction/compaction_job.cc
+++ b/db/compaction/compaction_job.cc
@@ -1480,11 +1480,11 @@ CompactionJob::CreateFileHandlers(SubcompactionState* sub_compact,
   const CompactionFileCloseFunc close_file_func =
       [this, sub_compact, start_user_key, end_user_key](
           const Status& status,
-          const ParsedInternalKey& prev_table_last_internal_key,
+          const ParsedInternalKey& prev_iter_output_internal_key,
           const Slice& next_table_min_key, const CompactionIterator* c_iter,
           CompactionOutputs& outputs) {
         return this->FinishCompactionOutputFile(
-            status, prev_table_last_internal_key, next_table_min_key,
+            status, prev_iter_output_internal_key, next_table_min_key,
             start_user_key, end_user_key, c_iter, sub_compact, outputs);
       };
 
@@ -1499,8 +1499,8 @@ Status CompactionJob::ProcessKeyValue(
   const uint64_t kRecordStatsEvery = 1000;
   [[maybe_unused]] const std::optional<const Slice> end = sub_compact->end;
 
-  IterKey last_output_key;
-  ParsedInternalKey last_output_ikey;
+  IterKey prev_iter_output_key;
+  ParsedInternalKey prev_iter_output_internal_key;
 
   TEST_SYNC_POINT_CALLBACK(
       "CompactionJob::ProcessKeyValueCompaction()::Processing",
@@ -1551,9 +1551,9 @@ Status CompactionJob::ProcessKeyValue(
     // and `close_file_func`.
     // TODO: it would be better to have the compaction file open/close moved
     // into `CompactionOutputs` which has the output file information.
-    status =
-        sub_compact->AddToOutput(*c_iter, use_proximal_output, open_file_func,
-                                 close_file_func, last_output_ikey);
+    status = sub_compact->AddToOutput(*c_iter, use_proximal_output,
+                                      open_file_func, close_file_func,
+                                      prev_iter_output_internal_key);
     if (!status.ok()) {
       break;
     }
@@ -1562,9 +1562,10 @@ Status CompactionJob::ProcessKeyValue(
                              static_cast<void*>(const_cast<std::atomic<bool>*>(
                                  &manual_compaction_canceled_)));
 
-    last_output_key.SetInternalKey(c_iter->key(), &last_output_ikey);
-    last_output_ikey.sequence = ikey.sequence;
-    last_output_ikey.type = ikey.type;
+    prev_iter_output_key.SetInternalKey(c_iter->key(),
+                                        &prev_iter_output_internal_key);
+    prev_iter_output_internal_key.sequence = ikey.sequence;
+    prev_iter_output_internal_key.type = ikey.type;
     c_iter->Next();
 
 #ifndef NDEBUG
@@ -1871,7 +1872,7 @@ void CompactionJob::RecordDroppedKeys(
 
 Status CompactionJob::FinishCompactionOutputFile(
     const Status& input_status,
-    const ParsedInternalKey& prev_table_last_internal_key,
+    const ParsedInternalKey& prev_iter_output_internal_key,
     const Slice& next_table_min_key, const Slice* comp_start_user_key,
     const Slice* comp_end_user_key, const CompactionIterator* c_iter,
     SubcompactionState* sub_compact, CompactionOutputs& outputs) {
@@ -2049,7 +2050,7 @@ Status CompactionJob::FinishCompactionOutputFile(
   }
 
   if (s.ok() && ShouldUpdateSubcompactionProgress(sub_compact, c_iter,
-                                                  prev_table_last_internal_key,
+                                                  prev_iter_output_internal_key,
                                                   next_table_min_key, meta)) {
     UpdateSubcompactionProgress(c_iter, next_table_min_key, sub_compact);
     s = PersistSubcompactionProgress(sub_compact);
@@ -2060,10 +2061,10 @@ Status CompactionJob::FinishCompactionOutputFile(
 
 bool CompactionJob::ShouldUpdateSubcompactionProgress(
     const SubcompactionState* sub_compact, const CompactionIterator* c_iter,
-    const ParsedInternalKey& prev_table_last_internal_key,
+    const ParsedInternalKey& prev_iter_output_internal_key,
     const Slice& next_table_min_internal_key, const FileMetaData* meta) const {
   const auto* cfd = sub_compact->compaction->column_family_data();
-  // No need to update when the output will not get persisted
+  // No need to update when the progress will not get persisted
   if (compaction_progress_writer_ == nullptr) {
     return false;
   }
@@ -2087,19 +2088,24 @@ bool CompactionJob::ShouldUpdateSubcompactionProgress(
   }
 
   // LIMITATION: Compaction progress persistence disabled for file boundaries
-  // contaning range deletions. Range deletions can span file boundaries, making
-  // it difficult (but possible) to ensure adjacent output tables have different
-  // user keys. See the last check for why different users keys of adjacent
-  // output tables are needed
+  // containing range deletions. Range deletions can span file boundaries,
+  // making it difficult to ensure adjacent output tables have different user
+  // keys. See the last check for why different users keys of adjacent output
+  // tables are needed
   const ValueType next_table_min_internal_key_type =
       ExtractValueType(next_table_min_internal_key);
-  const ValueType prev_table_last_internal_key_type =
-      prev_table_last_internal_key.user_key.empty()
+  const ValueType prev_iter_output_internal_key_type =
+      prev_iter_output_internal_key.user_key.empty()
           ? ValueType::kTypeValue
-          : prev_table_last_internal_key.type;
-
-  if (next_table_min_internal_key_type == ValueType::kTypeRangeDeletion ||
-      prev_table_last_internal_key_type == ValueType::kTypeRangeDeletion) {
+          : prev_iter_output_internal_key.type;
+
+  // Range deletes truncated to align with file boundaries may be output by the
+  // compaction iterator with `ValueType::kTypeMaxValid` instead of the original
+  // type.
+  if ((next_table_min_internal_key_type == ValueType::kTypeRangeDeletion ||
+       next_table_min_internal_key_type == ValueType::kTypeMaxValid) ||
+      (prev_iter_output_internal_key_type == ValueType::kTypeRangeDeletion ||
+       prev_iter_output_internal_key_type == ValueType::kTypeMaxValid)) {
     return false;
   }
 
@@ -2109,9 +2115,9 @@ bool CompactionJob::ShouldUpdateSubcompactionProgress(
   const Slice next_table_min_user_key =
       ExtractUserKey(next_table_min_internal_key);
   const Slice prev_table_last_user_key =
-      prev_table_last_internal_key.user_key.empty()
+      prev_iter_output_internal_key.user_key.empty()
           ? Slice()
-          : prev_table_last_internal_key.user_key;
+          : prev_iter_output_internal_key.user_key;
 
   if (cfd->user_comparator()->EqualWithoutTimestamp(next_table_min_user_key,
                                                     prev_table_last_user_key)) {
diff --git a/db/compaction/compaction_job.h b/db/compaction/compaction_job.h
index ca933f7d4814..c9dac611cd6f 100644
--- a/db/compaction/compaction_job.h
+++ b/db/compaction/compaction_job.h
@@ -430,7 +430,7 @@ class CompactionJob {
 
   Status FinishCompactionOutputFile(
       const Status& input_status,
-      const ParsedInternalKey& prev_table_last_internal_key,
+      const ParsedInternalKey& prev_iter_output_internal_key,
       const Slice& next_table_min_key, const Slice* comp_start_user_key,
       const Slice* comp_end_user_key, const CompactionIterator* c_iter,
       SubcompactionState* sub_compact, CompactionOutputs& outputs);
@@ -545,7 +545,7 @@ class CompactionJob {
 
   bool ShouldUpdateSubcompactionProgress(
       const SubcompactionState* sub_compact, const CompactionIterator* c_iter,
-      const ParsedInternalKey& prev_table_last_internal_key,
+      const ParsedInternalKey& prev_iter_output_internal_key,
       const Slice& next_table_min_internal_key, const FileMetaData* meta) const;
 
   void UpdateSubcompactionProgress(const CompactionIterator* c_iter,
diff --git a/db/compaction/compaction_job_test.cc b/db/compaction/compaction_job_test.cc
index 2836ed20e3ba..95d74be4d485 100644
--- a/db/compaction/compaction_job_test.cc
+++ b/db/compaction/compaction_job_test.cc
@@ -2424,6 +2424,7 @@ class ResumableCompactionJobTest : public CompactionJobTestBase {
   bool enable_cancel_ = false;
   std::atomic<int> stop_count_{0};
   std::atomic<bool> cancel_{false};
+  SequenceNumber cancel_before_seqno = kMaxSequenceNumber;
 
   void SetUp() override {
     CompactionJobTestBase::SetUp();
@@ -2437,7 +2438,9 @@ class ResumableCompactionJobTest : public CompactionJobTestBase {
           if (enable_cancel_) {
             ParsedInternalKey parsed_key;
             if (ParseInternalKey(pair->second, &parsed_key, true).ok()) {
-              if (parsed_key.user_key == kCancelBeforeThisKey) {
+              if (parsed_key.user_key == kCancelBeforeThisKey &&
+                  (cancel_before_seqno == kMaxSequenceNumber ||
+                   parsed_key.sequence == cancel_before_seqno)) {
                 cancel_.store(true);
               }
             }
@@ -2665,7 +2668,7 @@ class ResumableCompactionJobTest : public CompactionJobTestBase {
       const std::initializer_list<mock::KVPair>& input_file_2,
       uint64_t last_sequence, const std::vector<uint64_t>& snapshots,
       const std::string& expected_next_key_to_compact,
-      const std::vector<std::string>& expected_input_keys, bool exists_progress,
+      const std::vector<std::string>& expected_input_keys,
       bool cancelled_past_mid_point = false) {
     std::shared_ptr<Statistics> stats = ROCKSDB_NAMESPACE::CreateDBStatistics();
 
@@ -2701,7 +2704,7 @@ class ResumableCompactionJobTest : public CompactionJobTestBase {
 
     // Resume compaction
     CompactionProgress compaction_progress;
-    if (exists_progress) {
+    if (expected_next_key_to_compact != "") {
       compaction_progress.push_back(
           ReadAndParseProgress(compaction_progress_file));
     }
@@ -2774,35 +2777,43 @@ TEST_F(ResumableCompactionJobTest, BasicProgressResume) {
       4U /* last_sequence */, {} /* snapshots */,
       kCancelBeforeThisKey /* expected_next_key_to_compact */,
       {"a", "b", "bb", kCancelBeforeThisKey} /* expected_input_keys */,
-      true /* exists_progress */, true /* cancelled_past_mid_point*/);
+      true /* cancelled_past_mid_point */);
 }
 
 TEST_F(ResumableCompactionJobTest, NoProgressResumeOnSameKey) {
   NewDB();
 
+  // `cancel_before_seqno` is set to 0U to force cancellation after
+  // `kCancelBeforeThisKey@1` instead of `kCancelBeforeThisKey@2`.
+  // The seqno is 0 because `kCancelBeforeThisKey@1` will have its sequence
+  // number zeroed during compaction while `kCancelBeforeThisKey@2` won't be
+  cancel_before_seqno = 0U;
   RunCancelAndResumeTest(
       {{KeyStr(kCancelBeforeThisKey, 1U, kTypeValue),
         "val1"}} /* input_file_1 */,
-      {{KeyStr(kCancelBeforeThisKey, 2U, kTypeValue),
-        "val2"}} /* input_file_2 */,
-      2U /* last_sequence */, {1U} /* snapshots */,
+      {{KeyStr(kCancelBeforeThisKey, 2U, kTypeValue), "val11"},
+       {KeyStr("d", 3U, kTypeValue), "val2"}} /* input_file_2 */,
+      3U /* last_sequence */, {1U} /* snapshots */,
       "" /* expected_next_key_to_compact */,
-      {kCancelBeforeThisKey, kCancelBeforeThisKey} /* expected_input_keys */,
-      false /* exists_progress */);
+      {kCancelBeforeThisKey, kCancelBeforeThisKey,
+       "d"} /* expected_input_keys */);
 }
 
 TEST_F(ResumableCompactionJobTest, NoProgressResumeOnDeleteRange) {
   NewDB();
 
   RunCancelAndResumeTest(
-      {{KeyStr(kCancelBeforeThisKey, 1U, kTypeValue),
-        "val1"}} /* input_file_1 */,
-      {{KeyStr(kCancelBeforeThisKey, 2U, kTypeRangeDeletion),
-        "val2"}} /* input_file_2 */,
-      2U /* last_sequence */, {1U} /* snapshots */,
-      "" /* expected_next_key_to_compact */,
-      {kCancelBeforeThisKey, kCancelBeforeThisKey} /* expected_input_keys */,
-      false /* exists_progress */);
+      {{KeyStr("a", 1U, kTypeValue), "val1"},
+       {KeyStr("b", 2U, kTypeValue), "val2"},
+       {KeyStr(kCancelBeforeThisKey, 3U, kTypeValue),
+        "val3"}} /* input_file_1 */,
+      {{KeyStr(kCancelBeforeThisKey, 4U, kTypeRangeDeletion),
+        "range_deletion_end_key"},
+       {KeyStr("d", 5U, kTypeValue), "val4"}} /* input_file_2 */,
+      5U /* last_sequence */, {3U} /* snapshots */,
+      "b" /* expected_next_key_to_compact */,
+      {"a", "b", kCancelBeforeThisKey, kCancelBeforeThisKey,
+       "d"} /* expected_input_keys */);
 }
 
 TEST_F(ResumableCompactionJobTest, NoProgressResumeOnMerge) {
@@ -2817,8 +2828,7 @@ TEST_F(ResumableCompactionJobTest, NoProgressResumeOnMerge) {
         "val4"}} /* input_file_2 */,
       4U /* last_sequence */, {} /* snapshots */,
       "bb" /* expected_next_key_to_compact */,
-      {"a", "b", "bb", kCancelBeforeThisKey} /* expected_input_keys */,
-      true /* exists_progress */);
+      {"a", "b", "bb", kCancelBeforeThisKey} /* expected_input_keys */);
 }
 
 TEST_F(ResumableCompactionJobTest, NoProgressResumeOnSingleDelete) {
@@ -2834,8 +2844,7 @@ TEST_F(ResumableCompactionJobTest, NoProgressResumeOnSingleDelete) {
       5U /* last_sequence */, {3U} /* snapshots */,
       "b" /* expected_next_key_to_compact */,
       {"a", "b", kCancelBeforeThisKey, kCancelBeforeThisKey,
-       "d"} /* expected_input_keys */,
-      true /* exists_progress */);
+       "d"} /* expected_input_keys */);
 }
 
 TEST_F(ResumableCompactionJobTest, NoProgressResumeOnDeletionAtBottom) {
@@ -2851,8 +2860,7 @@ TEST_F(ResumableCompactionJobTest, NoProgressResumeOnDeletionAtBottom) {
       5U /* last_sequence */, {3U} /* snapshots */,
       "b" /* expected_next_key_to_compact */,
       {"a", "b", kCancelBeforeThisKey, kCancelBeforeThisKey,
-       "d"} /* expected_input_keys */,
-      true /* exists_progress */);
+       "d"} /* expected_input_keys */);
 }
 }  // namespace ROCKSDB_NAMESPACE
 
diff --git a/db/compaction/compaction_outputs.cc b/db/compaction/compaction_outputs.cc
index ff1e446a6953..34dc5f9ed135 100644
--- a/db/compaction/compaction_outputs.cc
+++ b/db/compaction/compaction_outputs.cc
@@ -364,7 +364,7 @@ Status CompactionOutputs::AddToOutput(
     const CompactionIterator& c_iter,
     const CompactionFileOpenFunc& open_file_func,
     const CompactionFileCloseFunc& close_file_func,
-    const ParsedInternalKey& prev_table_last_internal_key) {
+    const ParsedInternalKey& prev_iter_output_internal_key) {
   Status s;
   bool is_range_del = c_iter.IsDeleteRangeSentinelKey();
   if (is_range_del && compaction_->bottommost_level()) {
@@ -375,8 +375,8 @@ Status CompactionOutputs::AddToOutput(
   }
   const Slice& key = c_iter.key();
   if (ShouldStopBefore(c_iter) && HasBuilder()) {
-    s = close_file_func(c_iter.InputStatus(), prev_table_last_internal_key, key,
-                        &c_iter, *this);
+    s = close_file_func(c_iter.InputStatus(), prev_iter_output_internal_key,
+                        key, &c_iter, *this);
     if (!s.ok()) {
       return s;
     }
diff --git a/db/compaction/compaction_outputs.h b/db/compaction/compaction_outputs.h
index d2f94a5c50da..6f9de28efcfd 100644
--- a/db/compaction/compaction_outputs.h
+++ b/db/compaction/compaction_outputs.h
@@ -262,7 +262,7 @@ class CompactionOutputs {
   Status AddToOutput(const CompactionIterator& c_iter,
                      const CompactionFileOpenFunc& open_file_func,
                      const CompactionFileCloseFunc& close_file_func,
-                     const ParsedInternalKey& prev_table_last_internal_key);
+                     const ParsedInternalKey& prev_iter_output_internal_key);
 
   // Close the current output. `open_file_func` is needed for creating new file
   // for range-dels only output file.
diff --git a/db/compaction/subcompaction_state.cc b/db/compaction/subcompaction_state.cc
index 910c0bff7f03..0e8f673c1124 100644
--- a/db/compaction/subcompaction_state.cc
+++ b/db/compaction/subcompaction_state.cc
@@ -109,12 +109,12 @@ Status SubcompactionState::AddToOutput(
     const CompactionIterator& iter, bool use_proximal_output,
     const CompactionFileOpenFunc& open_file_func,
     const CompactionFileCloseFunc& close_file_func,
-    const ParsedInternalKey& prev_table_last_internal_key) {
+    const ParsedInternalKey& prev_iter_output_internal_key) {
   // update target output
   current_outputs_ =
       use_proximal_output ? &proximal_level_outputs_ : &compaction_outputs_;
   return current_outputs_->AddToOutput(iter, open_file_func, close_file_func,
-                                       prev_table_last_internal_key);
+                                       prev_iter_output_internal_key);
 }
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/db/compaction/subcompaction_state.h b/db/compaction/subcompaction_state.h
index 944841f75ba3..09af46540ca9 100644
--- a/db/compaction/subcompaction_state.h
+++ b/db/compaction/subcompaction_state.h
@@ -221,7 +221,7 @@ class SubcompactionState {
   Status AddToOutput(const CompactionIterator& iter, bool use_proximal_output,
                      const CompactionFileOpenFunc& open_file_func,
                      const CompactionFileCloseFunc& close_file_func,
-                     const ParsedInternalKey& prev_table_last_internal_key);
+                     const ParsedInternalKey& prev_iter_output_internal_key);
 
   // Close all compaction output files, both output_to_proximal_level outputs
   // and normal outputs.
diff --git a/db/db_impl/db_impl_secondary.cc b/db/db_impl/db_impl_secondary.cc
index b73de2b350ae..c8517be25d73 100644
--- a/db/db_impl/db_impl_secondary.cc
+++ b/db/db_impl/db_impl_secondary.cc
@@ -1102,11 +1102,6 @@ Status DBImplSecondary::InitializeCompactionWorkspace(
     return s;
   }
 
-  ROCKS_LOG_INFO(immutable_db_options_.info_log,
-                 "Initialized compaction workspace with %zu subcompaction "
-                 "progress to resume",
-                 compaction_progress_.size());
-
   return Status::OK();
 }
 
@@ -1219,6 +1214,11 @@ Status DBImplSecondary::PrepareCompactionProgressState() {
       return HandleInvalidOrNoCompactionProgress(compaction_progress_file_path,
                                                  scan_result);
     }
+
+    ROCKS_LOG_DEBUG(
+        immutable_db_options_.info_log,
+        "Loaded compaction progress with %zu subcompaction(s) from %s",
+        compaction_progress_.size(), compaction_progress_file_path.c_str());
     return s;
   } else {
     return HandleInvalidOrNoCompactionProgress(
@@ -1740,6 +1740,11 @@ Status DBImplSecondary::FinalizeCompactionProgressWriter(
     return HandleCompactionProgressWriterCreationFailure(
         "" /* temp_file_path */, final_file_path, compaction_progress_writer);
   }
+
+  ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
+                  "Finalized compaction progress writer onto %s",
+                  final_file_path.c_str());
+
   return Status::OK();
 }
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/db/version_edit.h b/db/version_edit.h
index 8ed83cc4a8ed..4f60a86fa0e4 100644
--- a/db/version_edit.h
+++ b/db/version_edit.h
@@ -557,16 +557,19 @@ struct SubcompactionProgress {
       Slice key_slice(next_internal_key_to_compact);
       if (ParseInternalKey(key_slice, &parsed_key, false /* log_err_key */)
               .ok()) {
-        oss << "user_key=\"" << parsed_key.user_key.ToString(false /* hex */)
-            << "\" (hex:" << parsed_key.user_key.ToString(true /* hex */)
-            << ")";
+        oss << "user_key(hex)=" << parsed_key.user_key.ToString(true /* hex */);
         oss << ", seq=";
         if (parsed_key.sequence == kMaxSequenceNumber) {
           oss << "kMaxSequenceNumber";
         } else {
           oss << parsed_key.sequence;
         }
-        oss << ", type=" << static_cast<int>(parsed_key.type);
+        oss << ", type=";
+        if (parsed_key.type == kValueTypeForSeek) {
+          oss << "kValueTypeForSeek";
+        } else {
+          oss << static_cast<int>(parsed_key.type);
+        }
       } else {
         oss << "raw=" << key_slice.ToString(true /* hex */);
       }
diff --git a/unreleased_history/bug_fixes/truncated_range_del_resume_compaction.md b/unreleased_history/bug_fixes/truncated_range_del_resume_compaction.md
new file mode 100644
index 000000000000..72eb33e41f8d
--- /dev/null
+++ b/unreleased_history/bug_fixes/truncated_range_del_resume_compaction.md
@@ -0,0 +1 @@
+Fix resumable compaction incorrectly allowing resumption from a truncated range deletion that is not well handled currently.

From 5a06787a26ee4035b5c7d46ea4b3d80fc9bc02c9 Mon Sep 17 00:00:00 2001
From: Maciej Szeszko <mszeszko@meta.com>
Date: Fri, 12 Dec 2025 14:25:40 -0800
Subject: [PATCH 402/500] IO uring improvements (#14158)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Summary:
`PosixRandomAccessFile::MultiRead` was introduced in Dec 2019 in https://github.com/facebook/rocksdb/pull/5881. Subsequently, 2 years after, we introduced the `PosixRandomAccessFile::ReadAsync` API in https://github.com/facebook/rocksdb/pull/9578, which was reusing the same `PosixFileSystem` IO ring as `MultiRead` API, consequently writing to the very same ring's submission queue (without waiting!). This 'shared ring' design is problematic, since sequentially interleaving `ReadAsync` and `MultiRead` API calls on the very same thread might result in reading 'unknown' events in `MultiRead` leading to `Bad cqe data` errors (and therefore falsely perceived  as a corruption) - which, for some services (running on local flash), in itself is a hard blocker for adopting RocksDB async prefetching ('async IO') that heavily relies on the `ReadAsync` API. This change aims to solve this problem by maintaining separate thread local IO rings for `async reads` and `multi reads` assuring correct execution. In addition, we're adding more robust error handling in form of retries for kernel interrupts and draining the queue when process is experiencing terse memory condition. Separately, we're enhancing the performance aspect by explicitly marking the rings to be written to / read from by a single thread (`IORING_SETUP_SINGLE_ISSUER` [if available]) and defer the task just before the application intends to process completions (`IORING_SETUP_DEFER_TASKRUN` [if available]). See https://man7.org/linux/man-pages/man2/io_uring_setup.2.html for reference.

## Benchmark

**TLDR**
There's no evident advantage of using `io_uring_submit` (relative to proposed `io_uring_submit_and_wait`) across batches of size 10, 250 and 1000 simulating significantly-less, close-to and 4x-above `kIoUringDepth` batch size. `io_uring_submit` might be more appealing if (at least) one of the IOs is slow (which was NOT the case during the benchmark). More notably, with this PR switching from `io_uring_submit_and_wait` -> `io_uring_submit` can be done with a single line change due to implemented guardrails (we can followup with adding optional config for true ring semantics [if needed]).

**Compilation**
```
DEBUG_LEVEL=0 make db_bench
```

**Create DB**

```
./db_bench \
    --db=/db/testdb_2.5m_k100_v6144_16kB_LZ4 \
    --benchmarks=fillseq \
    --num=2500000 \
    --key_size=100 \
    --value_size=6144 \
    --compression_type=LZ4 \
    --block_size=16384 \
    --seed=1723056275
```

**LSM**

* L0: 2 files, L1: 5, L2: 49, L3: 79
* Each file is roughly ~35M in size

### MultiReadRandom (with caching disabled)

Each run was preceded by OS page cache cleanup with `echo 1 | sudo tee /proc/sys/vm/drop_caches`.

```
./db_bench \
    --use_existing_db=true \
    --db=/db/testdb_2.5m_k100_v6144_16kB_LZ4 \
    --compression_type=LZ4 \
    --benchmarks=multireadrandom \
    --num= **<N>** \
    --batch_size= **<B>** \
    --io_uring_enabled=true \
    --async_io=false \
    --optimize_multiget_for_io=false \
    --threads=4 \
    --cache_size=0 \
    --use_direct_reads=true \
    --use_direct_io_for_flush_and_compaction=true \
    --cache_index_and_filter_blocks=false \
    --pin_l0_filter_and_index_blocks_in_cache=false \
    --pin_top_level_index_and_filter=false \
    --prepopulate_block_cache=0 \
    --row_cache_size=0 \
    --use_blob_cache=false \
    --use_compressed_secondary_cache=false
```

  | B=10; N=100,000 | B = 250; N=80,000  | B = 1,000; N=20,000
-- | -- | -- | --
baseline | 31.5 (± 0.4) us/op | 17.5 (± 0.5) us/op | 13.5 (± 0.4) us/op
io_uring_submit_and_wait |  31.5 (± 0.6) us/op |  17.7 (± 0.4) us/op |  13.6 (± 0.4) us/op
io_uring_submit | 31.5 (± 0.6) us/op | 17.5 (± 0.5) us/op | 13.4 (± 0.45) us/op

### Specs

  | Property | Value
-- | --
RocksDB | version 10.9.0
Date | Tue Dec 9 15:57:03 2025
CPU | 56 * Intel Sapphire Rapids (T10 SPR)
Kernel version | 6.9.0-0_fbk12_0_g28f2d09ad102

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14158

Reviewed By: anand1976

Differential Revision: D88172809

Pulled By: mszeszko-meta

fbshipit-source-id: 5198de3d2f18f76fee661a2ec5f447e79ba06fbd
---
 env/env_test.cc | 167 ++++++++++++++-----
 env/fs_posix.cc |  21 ++-
 env/io_posix.cc | 415 +++++++++++++++++++++++++++++++++++-------------
 env/io_posix.h  |  20 ++-
 4 files changed, 459 insertions(+), 164 deletions(-)

diff --git a/env/env_test.cc b/env/env_test.cc
index 30cfdde51055..e6f56402ea77 100644
--- a/env/env_test.cc
+++ b/env/env_test.cc
@@ -1655,42 +1655,6 @@ void GenerateFilesAndRequest(Env* env, const std::string& fname,
   }
 }
 
-TEST_F(EnvPosixTest, MultiReadIOUringError) {
-  // In this test we don't do aligned read, so we can't do direct I/O.
-  EnvOptions soptions;
-  soptions.use_direct_reads = soptions.use_direct_writes = false;
-  std::string fname = test::PerThreadDBPath(env_, "testfile");
-
-  std::vector<std::string> scratches;
-  std::vector<ReadRequest> reqs;
-  GenerateFilesAndRequest(env_, fname, &reqs, &scratches);
-  // Query the data
-  std::unique_ptr<RandomAccessFile> file;
-  ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions));
-
-  bool io_uring_wait_cqe_called = false;
-  SyncPoint::GetInstance()->SetCallBack(
-      "PosixRandomAccessFile::MultiRead:io_uring_wait_cqe:return",
-      [&](void* arg) {
-        if (!io_uring_wait_cqe_called) {
-          io_uring_wait_cqe_called = true;
-          ssize_t& ret = *(static_cast<ssize_t*>(arg));
-          ret = 1;
-        }
-      });
-  SyncPoint::GetInstance()->EnableProcessing();
-
-  Status s = file->MultiRead(reqs.data(), reqs.size());
-  if (io_uring_wait_cqe_called) {
-    ASSERT_NOK(s);
-  } else {
-    s.PermitUncheckedError();
-  }
-
-  SyncPoint::GetInstance()->DisableProcessing();
-  SyncPoint::GetInstance()->ClearAllCallBacks();
-}
-
 TEST_F(EnvPosixTest, MultiReadIOUringError2) {
   // In this test we don't do aligned read, so we can't do direct I/O.
   EnvOptions soptions;
@@ -1706,19 +1670,20 @@ TEST_F(EnvPosixTest, MultiReadIOUringError2) {
 
   bool io_uring_submit_and_wait_called = false;
   SyncPoint::GetInstance()->SetCallBack(
-      "PosixRandomAccessFile::MultiRead:io_uring_submit_and_wait:return1",
+      "PosixRandomAccessFile::MultiRead:io_uring_sq_ready:return1",
       [&](void* arg) {
         io_uring_submit_and_wait_called = true;
-        ssize_t* ret = static_cast<ssize_t*>(arg);
-        (*ret)--;
+        unsigned* ret = static_cast<unsigned*>(arg);
+        *ret = 1;
       });
   SyncPoint::GetInstance()->SetCallBack(
       "PosixRandomAccessFile::MultiRead:io_uring_submit_and_wait:return2",
       [&](void* arg) {
         struct io_uring* iu = static_cast<struct io_uring*>(arg);
         struct io_uring_cqe* cqe;
-        assert(io_uring_wait_cqe(iu, &cqe) == 0);
-        io_uring_cqe_seen(iu, cqe);
+        // CQ should be empty after drain - peek should fail
+        int ret = io_uring_peek_cqe(iu, &cqe);
+        assert(-EAGAIN == ret);  // No CQEs available
       });
   SyncPoint::GetInstance()->EnableProcessing();
 
@@ -3640,6 +3605,126 @@ TEST_F(TestAsyncRead, ReadAsync) {
   }
 }
 
+// Test ReadAsync -> MultiRead -> Poll with real io_uring (not mock).
+// This verifies that MultiRead doesn't interfere with async read buffers.
+TEST_F(TestAsyncRead, InterleavingIOUringOperations) {
+#if defined(ROCKSDB_IOURING_PRESENT)
+  // Use the real filesystem directly (not the mock ReadAsyncFS).
+  std::shared_ptr<FileSystem> fs = env_->GetFileSystem();
+  std::string fname = test::PerThreadDBPath(env_, "testfile_iouring");
+
+  constexpr size_t kSectorSize = 4096;
+  constexpr size_t kNumSectors = 8;
+
+  // 1. Create & write to a file.
+  {
+    std::unique_ptr<FSWritableFile> wfile;
+    ASSERT_OK(
+        fs->NewWritableFile(fname, FileOptions(), &wfile, nullptr /*dbg*/));
+
+    for (size_t i = 0; i < kNumSectors; ++i) {
+      auto data = NewAligned(kSectorSize * 8, static_cast<char>(i + 1));
+      Slice slice(data.get(), kSectorSize);
+      ASSERT_OK(wfile->Append(slice, IOOptions(), nullptr));
+    }
+    ASSERT_OK(wfile->Close(IOOptions(), nullptr));
+  }
+
+  // 2. Test interleaved ReadAsync and MultiRead operations.
+  {
+    std::unique_ptr<FSRandomAccessFile> file;
+    ASSERT_OK(fs->NewRandomAccessFile(fname, FileOptions(), &file, nullptr));
+
+    IOOptions opts;
+    std::vector<void*> io_handles(kNumSectors);
+    std::vector<FSReadRequest> async_reqs(kNumSectors);
+    std::vector<std::unique_ptr<char, Deleter>> async_data;
+    std::vector<size_t> vals;
+    IOHandleDeleter del_fn;
+
+    // Initialize async read requests.
+    for (size_t i = 0; i < kNumSectors; i++) {
+      async_reqs[i].offset = i * kSectorSize;
+      async_reqs[i].len = kSectorSize;
+      async_data.emplace_back(NewAligned(kSectorSize, 0));
+      async_reqs[i].scratch = async_data.back().get();
+      vals.push_back(i);
+    }
+
+    // Callback function for async reads.
+    std::function<void(FSReadRequest&, void*)> callback =
+        [&](FSReadRequest& req, void* cb_arg) {
+          assert(cb_arg != nullptr);
+          size_t i = *(reinterpret_cast<size_t*>(cb_arg));
+          async_reqs[i].offset = req.offset;
+          async_reqs[i].result = req.result;
+          async_reqs[i].status = req.status;
+        };
+
+    // Submit asynchronous read requests.
+    for (size_t i = 0; i < kNumSectors; i++) {
+      void* cb_arg = static_cast<void*>(&(vals[i]));
+      IOStatus s = file->ReadAsync(async_reqs[i], opts, callback, cb_arg,
+                                   &(io_handles[i]), &del_fn, nullptr);
+      if (s.IsNotSupported()) {
+        // io_uring not supported on this system, skip the test.
+        fprintf(stderr, "Skipping test - io_uring not supported: %s\n",
+                s.ToString().c_str());
+        for (size_t j = 0; j < i; j++) {
+          if (io_handles[j] != nullptr) {
+            del_fn(io_handles[j]);
+          }
+        }
+        return;
+      }
+      // For any other error, fail the test.
+      ASSERT_OK(s);
+    }
+
+    // Do a MultiRead on same sectors while async reads are submitted.
+    std::vector<FSReadRequest> multi_reqs(kNumSectors);
+    std::vector<std::unique_ptr<char, Deleter>> multi_data;
+    for (size_t i = 0; i < kNumSectors; i++) {
+      multi_reqs[i].offset = i * kSectorSize;
+      multi_reqs[i].len = kSectorSize;
+      multi_data.emplace_back(NewAligned(kSectorSize, 0));
+      multi_reqs[i].scratch = multi_data.back().get();
+    }
+    ASSERT_OK(file->MultiRead(multi_reqs.data(), kNumSectors, opts, nullptr));
+
+    // Check the status of MultiRead requests (should all succeed).
+    for (size_t i = 0; i < kNumSectors; i++) {
+      auto buf = NewAligned(kSectorSize * 8, static_cast<char>(i + 1));
+      Slice expected_data(buf.get(), kSectorSize);
+
+      ASSERT_EQ(multi_reqs[i].offset, i * kSectorSize);
+      ASSERT_OK(multi_reqs[i].status);
+      ASSERT_EQ(expected_data.ToString(), multi_reqs[i].result.ToString());
+    }
+
+    // Poll for the submitted async requests.
+    ASSERT_OK(fs->Poll(io_handles, kNumSectors));
+
+    // Check the status of async read requests (should all succeed).
+    for (size_t i = 0; i < kNumSectors; i++) {
+      auto buf = NewAligned(kSectorSize * 8, static_cast<char>(i + 1));
+      Slice expected_data(buf.get(), kSectorSize);
+
+      ASSERT_EQ(async_reqs[i].offset, i * kSectorSize);
+      ASSERT_OK(async_reqs[i].status);
+      ASSERT_EQ(expected_data.ToString(), async_reqs[i].result.ToString());
+    }
+
+    // Delete io_handles.
+    for (size_t i = 0; i < io_handles.size(); i++) {
+      del_fn(io_handles[i]);
+    }
+  }
+#else
+  fprintf(stderr, "Skipping test - ROCKSDB_IOURING_PRESENT not defined\n");
+#endif
+}
+
 struct StaticDestructionTester {
   bool activated = false;
   ~StaticDestructionTester() {
diff --git a/env/fs_posix.cc b/env/fs_posix.cc
index c93d9ce8675f..34efe1204f6d 100644
--- a/env/fs_posix.cc
+++ b/env/fs_posix.cc
@@ -270,7 +270,10 @@ class PosixFileSystem : public FileSystem {
           options
 #if defined(ROCKSDB_IOURING_PRESENT)
           ,
-          !IsIOUringEnabled() ? nullptr : thread_local_io_urings_.get()
+          !IsIOUringEnabled() ? nullptr
+                              : thread_local_async_read_io_urings_.get(),
+          !IsIOUringEnabled() ? nullptr
+                              : thread_local_multi_read_io_urings_.get()
 #endif
               ));
     }
@@ -1087,8 +1090,9 @@ class PosixFileSystem : public FileSystem {
 #if defined(ROCKSDB_IOURING_PRESENT)
     // io_uring_queue_init.
     struct io_uring* iu = nullptr;
-    if (thread_local_io_urings_) {
-      iu = static_cast<struct io_uring*>(thread_local_io_urings_->Get());
+    if (thread_local_async_read_io_urings_) {
+      iu = static_cast<struct io_uring*>(
+          thread_local_async_read_io_urings_->Get());
     }
 
     // Init failed, platform doesn't support io_uring.
@@ -1161,8 +1165,9 @@ class PosixFileSystem : public FileSystem {
 #if defined(ROCKSDB_IOURING_PRESENT)
     // io_uring_queue_init.
     struct io_uring* iu = nullptr;
-    if (thread_local_io_urings_) {
-      iu = static_cast<struct io_uring*>(thread_local_io_urings_->Get());
+    if (thread_local_async_read_io_urings_) {
+      iu = static_cast<struct io_uring*>(
+          thread_local_async_read_io_urings_->Get());
     }
 
     // Init failed, platform doesn't support io_uring.
@@ -1277,7 +1282,8 @@ class PosixFileSystem : public FileSystem {
 
 #if defined(ROCKSDB_IOURING_PRESENT)
   // io_uring instance
-  std::unique_ptr<ThreadLocalPtr> thread_local_io_urings_;
+  std::unique_ptr<ThreadLocalPtr> thread_local_async_read_io_urings_;
+  std::unique_ptr<ThreadLocalPtr> thread_local_multi_read_io_urings_;
 #endif
 
   size_t page_size_;
@@ -1337,7 +1343,8 @@ PosixFileSystem::PosixFileSystem()
   // io_uring can be created.
   struct io_uring* new_io_uring = CreateIOUring();
   if (new_io_uring != nullptr) {
-    thread_local_io_urings_.reset(new ThreadLocalPtr(DeleteIOUring));
+    thread_local_async_read_io_urings_.reset(new ThreadLocalPtr(DeleteIOUring));
+    thread_local_multi_read_io_urings_.reset(new ThreadLocalPtr(DeleteIOUring));
     delete new_io_uring;
   }
 #endif
diff --git a/env/io_posix.cc b/env/io_posix.cc
index 5a0f0338d50a..489e5b3a9e50 100644
--- a/env/io_posix.cc
+++ b/env/io_posix.cc
@@ -589,7 +589,8 @@ PosixRandomAccessFile::PosixRandomAccessFile(
     const EnvOptions& options
 #if defined(ROCKSDB_IOURING_PRESENT)
     ,
-    ThreadLocalPtr* thread_local_io_urings
+    ThreadLocalPtr* thread_local_async_read_io_urings,
+    ThreadLocalPtr* thread_local_multi_read_io_urings
 #endif
     )
     : filename_(fname),
@@ -598,7 +599,8 @@ PosixRandomAccessFile::PosixRandomAccessFile(
       logical_sector_size_(logical_block_size)
 #if defined(ROCKSDB_IOURING_PRESENT)
       ,
-      thread_local_io_urings_(thread_local_io_urings)
+      thread_local_async_read_io_urings_(thread_local_async_read_io_urings),
+      thread_local_multi_read_io_urings_(thread_local_multi_read_io_urings)
 #endif
 {
   assert(!options.use_direct_reads || !options.use_mmap_reads);
@@ -659,6 +661,83 @@ IOStatus PosixRandomAccessFile::Read(uint64_t offset, size_t n,
   return s;
 }
 
+// MultiRead: Perform multiple concurrent read requests using io_uring.
+//
+// OVERVIEW:
+// This function batches multiple read requests and submits them concurrently
+// to io_uring for improved I/O performance. It operates synchronously from the
+// caller's perspective (blocks until all reads complete) but uses io_uring's
+// async capabilities internally for parallel I/O execution.
+//
+// IO_URING LIFECYCLE:
+// 1. Preparation Phase:
+//    - Allocate SQEs (Submission Queue Entries) for read requests
+//    - Limited by: min(pending_work, io_uring_sq_space_left(), kIoUringDepth -
+//    inflight)
+//    - Uses io_uring_sq_space_left() to query available SQ slots
+//    - Each SQE is tracked in wrap_cache for completion matching
+//
+// 2. Submission Phase:
+//    - Loop: while io_uring_sq_ready() > 0 (SQEs pending submission)
+//    - Call io_uring_submit_and_wait() to submit SQEs and wait for CQEs
+//    - Handles retryable errors (EINTR, EAGAIN) by continuing
+//    - Breaks on terminal errors (logs error, sets err variable)
+//
+// 3. Completion Phase:
+//    - Non-blocking CQE reaping via io_uring_for_each_cqe()
+//    - Matches CQEs to requests using user_data pointer
+//    - Processes results: updates bytes read, handles partial reads
+//    - Removes completed requests from wrap_cache
+//
+// 4. Loop Iteration:
+//    - Repeats until: all requests submitted AND all completions reaped
+//    - Termination condition: (num_reqs == reqs_off) &&
+//    resubmit_rq_list.empty() && wrap_cache.empty()
+//
+// ERROR HANDLING STRATEGY:
+// - Retryable submission errors (-EINTR, -EAGAIN): Retry submission
+// - Memory pressure (-ENOMEM): Mark memory_pressure_on_submission, attempt
+// recovery
+// - Terminal submission errors: Break, enter teardown path
+// - Retryable CQE errors (-EINTR, -EAGAIN): Add to resubmit_rq_list for retry
+// - Terminal CQE errors: Set ios to IOError, continue processing other CQEs
+// - Teardown path: If SQEs remain unsubmitted after error, reap submitted CQEs,
+//   destroy io_uring instance, return error
+//
+// PARTIAL READ HANDLING:
+// - Short reads (bytes_read < requested): Request added to resubmit_rq_list
+// - finished_len tracks cumulative bytes read across resubmissions
+// - iov.iov_base/iov_len adjusted on each resubmission attempt
+// - UpdateResult() determines if read should be retried based on:
+//   * Direct I/O alignment requirements
+//   * EOF detection
+//   * Error conditions
+//
+// RESUBMISSION LOGIC:
+// - resubmit_rq_list: Requests needing retry (short reads, EINTR/EAGAIN errors)
+// - Prioritized in SQE allocation loop: resubmits before new requests
+// - List cleared after SQE preparation
+// - Requests remain in wrap_cache across resubmissions until fully complete
+//
+// CONCURRENCY CONTROL:
+// - wrap_cache.size(): Tracks total inflight requests (SQ + CQ)
+// - io_uring_sq_ready(): Queries SQEs prepared but not yet submitted
+// - io_uring_sq_space_left(): Queries available SQ slots
+// - Max concurrency: kIoUringDepth (256)
+//
+// ACCOUNTING CORRECTNESS:
+// - Uses io_uring native APIs (io_uring_sq_ready, io_uring_sq_space_left)
+//   instead of manual counters for robustness
+// - wrap_cache is the authoritative source for inflight request tracking
+// - Re-query io_uring_sq_ready() after submission loop to detect
+//   unsubmitted SQEs (indicates submission errors)
+//
+// THREAD SAFETY:
+// - Uses thread-local io_uring instance (thread_local_multi_read_io_urings_)
+// - IORING_SETUP_SINGLE_ISSUER: Only one thread submits to this ring
+// - IORING_SETUP_DEFER_TASKRUN: Task work runs in submitting thread
+// - No cross-thread coordination required
+//
 IOStatus PosixRandomAccessFile::MultiRead(FSReadRequest* reqs, size_t num_reqs,
                                           const IOOptions& options,
                                           IODebugContext* dbg) {
@@ -672,12 +751,16 @@ IOStatus PosixRandomAccessFile::MultiRead(FSReadRequest* reqs, size_t num_reqs,
 
 #if defined(ROCKSDB_IOURING_PRESENT)
   struct io_uring* iu = nullptr;
-  if (thread_local_io_urings_) {
-    iu = static_cast<struct io_uring*>(thread_local_io_urings_->Get());
+  if (thread_local_multi_read_io_urings_) {
+    iu = static_cast<struct io_uring*>(
+        thread_local_multi_read_io_urings_->Get());
     if (iu == nullptr) {
-      iu = CreateIOUring();
+      unsigned int flags = 0;
+      flags |= IORING_SETUP_SINGLE_ISSUER;
+      flags |= IORING_SETUP_DEFER_TASKRUN;
+      iu = CreateIOUring(flags);
       if (iu != nullptr) {
-        thread_local_io_urings_->Reset(iu);
+        thread_local_multi_read_io_urings_->Reset(iu);
       }
     }
   }
@@ -688,8 +771,6 @@ IOStatus PosixRandomAccessFile::MultiRead(FSReadRequest* reqs, size_t num_reqs,
     return FSRandomAccessFile::MultiRead(reqs, num_reqs, options, dbg);
   }
 
-  IOStatus ios = IOStatus::OK();
-
   struct WrappedReadRequest {
     FSReadRequest* req;
     struct iovec iov;
@@ -698,118 +779,199 @@ IOStatus PosixRandomAccessFile::MultiRead(FSReadRequest* reqs, size_t num_reqs,
   };
 
   autovector<WrappedReadRequest, 32> req_wraps;
-  autovector<WrappedReadRequest*, 4> incomplete_rq_list;
+  autovector<WrappedReadRequest*, 4> resubmit_rq_list;
   std::unordered_set<WrappedReadRequest*> wrap_cache;
 
   for (size_t i = 0; i < num_reqs; i++) {
     req_wraps.emplace_back(&reqs[i]);
   }
 
+  IOStatus ios = IOStatus::OK();
   size_t reqs_off = 0;
-  while (num_reqs > reqs_off || !incomplete_rq_list.empty()) {
-    size_t this_reqs = (num_reqs - reqs_off) + incomplete_rq_list.size();
-
-    // If requests exceed depth, split it into batches
-    if (this_reqs > kIoUringDepth) {
-      this_reqs = kIoUringDepth;
-    }
-
-    assert(incomplete_rq_list.size() <= this_reqs);
-    for (size_t i = 0; i < this_reqs; i++) {
-      WrappedReadRequest* rep_to_submit;
-      if (i < incomplete_rq_list.size()) {
-        rep_to_submit = incomplete_rq_list[i];
+  while ((num_reqs > reqs_off) || !resubmit_rq_list.empty() ||
+         !wrap_cache.empty()) {
+    assert(resubmit_rq_list.size() + wrap_cache.size() <= kIoUringDepth);
+    // Total number of requests that still need to be submitted, includes:
+    //
+    //  1) requests NOT yet submitted (num_reqs - reqs_off)
+    //  2) requests on resubmission list (resubmit_rq_list)
+    //
+    // capped by min of the # of remaining entries in IO ring submission queue
+    // and the max IO ring depth less the inflight requests.
+    size_t new_sqe_reqs_count = std::min({
+        num_reqs - reqs_off + resubmit_rq_list.size(),
+        static_cast<size_t>(io_uring_sq_space_left(iu)),
+        kIoUringDepth - wrap_cache.size()  // queue depth less inflight requests
+    });
+    for (size_t i = 0; i < new_sqe_reqs_count; i++) {
+      WrappedReadRequest* req;
+      if (i < resubmit_rq_list.size()) {
+        req = resubmit_rq_list[i];
       } else {
-        rep_to_submit = &req_wraps[reqs_off++];
+        req = &req_wraps[reqs_off++];
       }
-      assert(rep_to_submit->req->len > rep_to_submit->finished_len);
-      rep_to_submit->iov.iov_base =
-          rep_to_submit->req->scratch + rep_to_submit->finished_len;
-      rep_to_submit->iov.iov_len =
-          rep_to_submit->req->len - rep_to_submit->finished_len;
+      assert(req->req->len > req->finished_len);
+      req->iov.iov_base = req->req->scratch + req->finished_len;
+      req->iov.iov_len = req->req->len - req->finished_len;
 
       struct io_uring_sqe* sqe;
       sqe = io_uring_get_sqe(iu);
-      io_uring_prep_readv(
-          sqe, fd_, &rep_to_submit->iov, 1,
-          rep_to_submit->req->offset + rep_to_submit->finished_len);
-      io_uring_sqe_set_data(sqe, rep_to_submit);
-      wrap_cache.emplace(rep_to_submit);
+      // NULL is unexpected as we do maintain proper ring accounting.
+      assert(sqe);
+      io_uring_prep_readv(sqe, fd_, &req->iov, 1,
+                          req->req->offset + req->finished_len);
+      io_uring_sqe_set_data(sqe, req);
+      wrap_cache.emplace(req);
     }
-    incomplete_rq_list.clear();
+    resubmit_rq_list.clear();
+
+    struct io_uring_cqe* cqe = nullptr;
+    unsigned head;
+    ssize_t err = 0;
+    bool memory_pressure_on_submission = false;
+    unsigned reqs_pending_submission;
+    unsigned reqs_submitted = 0;
+    while ((reqs_pending_submission = io_uring_sq_ready(iu))) {
+      // MultiRead is synchronous in nature. io_uring_submit_and_wait provides
+      // batching semantics (submit + best effort wait in one syscall), while
+      // io_uring_submit enables async producer/consumer semantics (submit
+      // only, requires separate reaping). We chose batching approach to
+      // reduce the volume of syscalls and context switches.
+      ssize_t ret = io_uring_submit_and_wait(iu, reqs_pending_submission);
+      if (ret < 0) {
+        if (-EINTR == ret || -EAGAIN == ret) {
+          // Submission failed due to rare, retryable syscall error. Try again.
+          continue;
+        }
+        if (-ENOMEM == ret) {
+          fprintf(stderr,
+                  "PosixRandomAccessFile::MultiRead: io_uring_submit_and_wait "
+                  "experienced terse memory condition.\n");
+          // Best effort to reclaim resources in terse condition.
+          memory_pressure_on_submission = true;
+        } else {
+          fprintf(stderr,
+                  "PosixRandomAccessFile::MultiRead: "
+                  "io_uring_submit_and_wait returned terminal error: %zd.\n",
+                  ret);
+          err = ret;
+        }
+        break;
+      }
+      if (0 == ret) {
+        // This scenario is unexpected for any modern kernel!
+        // We deliberately error out to avoid bugs around infinite loops.
+        fprintf(stderr,
+                "PosixRandomAccessFile::MultiRead: "
+                "io_uring_submit_and_wait returned 0 submissions!\n");
+        break;
+      }
+      reqs_submitted += static_cast<unsigned int>(ret);
+    };
+    reqs_pending_submission = io_uring_sq_ready(iu);
 
-    ssize_t ret =
-        io_uring_submit_and_wait(iu, static_cast<unsigned int>(this_reqs));
     TEST_SYNC_POINT_CALLBACK(
-        "PosixRandomAccessFile::MultiRead:io_uring_submit_and_wait:return1",
-        &ret);
-    TEST_SYNC_POINT_CALLBACK(
-        "PosixRandomAccessFile::MultiRead:io_uring_submit_and_wait:return2",
-        iu);
-
-    if (static_cast<size_t>(ret) != this_reqs) {
-      fprintf(stderr, "ret = %ld this_reqs: %ld\n", (long)ret, (long)this_reqs);
-      // If error happens and we submitted fewer than expected, it is an
-      // exception case and we don't retry here. We should still consume
-      // what is is submitted in the ring.
-      for (ssize_t i = 0; i < ret; i++) {
-        struct io_uring_cqe* cqe = nullptr;
-        io_uring_wait_cqe(iu, &cqe);
-        if (cqe != nullptr) {
-          io_uring_cqe_seen(iu, cqe);
+        "PosixRandomAccessFile::MultiRead:io_uring_sq_ready:return1",
+        &reqs_pending_submission);
+
+    // Error occurred or IO uring stopped submitting outstanding requests.
+    if (reqs_pending_submission && !memory_pressure_on_submission) {
+      // IO ring is initialized once in thread-local variable and then reused
+      // to handle the consecutive MultiRead API calls. Therefore, it's crucial
+      // to reap all the submitted requests.
+      //
+      // NOTE: Loop will run indefinitely until we reap all the completions!!!
+      size_t nr = 0;
+      assert(reqs_pending_submission <= wrap_cache.size());
+      size_t nr_await_cqe = wrap_cache.size() - reqs_pending_submission;
+      while (nr < nr_await_cqe) {
+        // blocking
+        io_uring_wait_cqes(iu, &cqe,
+                           static_cast<unsigned int>(nr_await_cqe - nr),
+                           nullptr, nullptr);
+        size_t reaped_cqe_count = 0;
+        io_uring_for_each_cqe(iu, head, cqe) { reaped_cqe_count++; }
+        if (reaped_cqe_count > 0) {
+          io_uring_cq_advance(iu, static_cast<unsigned int>(reaped_cqe_count));
+          nr += reaped_cqe_count;
         }
       }
-      return IOStatus::IOError("io_uring_submit_and_wait() requested " +
-                               std::to_string(this_reqs) + " but returned " +
-                               std::to_string(ret));
-    }
-
-    for (size_t i = 0; i < this_reqs; i++) {
-      struct io_uring_cqe* cqe = nullptr;
-      WrappedReadRequest* req_wrap;
 
-      // We could use the peek variant here, but this seems safer in terms
-      // of our initial wait not reaping all completions
-      ret = io_uring_wait_cqe(iu, &cqe);
       TEST_SYNC_POINT_CALLBACK(
-          "PosixRandomAccessFile::MultiRead:io_uring_wait_cqe:return", &ret);
-      if (ret) {
-        ios = IOStatus::IOError("io_uring_wait_cqe() returns " +
-                                std::to_string(ret));
-
-        if (cqe != nullptr) {
-          io_uring_cqe_seen(iu, cqe);
-        }
-        continue;
+          "PosixRandomAccessFile::MultiRead:io_uring_submit_and_wait:return2",
+          iu);
+
+      // While all the submitted completions have been reaped successfully,
+      // IO ring submission queue still contains at least one non-submitted
+      // request. Destroy io_uring (discards unsubmitted SQEs).
+      //
+      // NOTE: This is a rare scenario and should not happen in normal cases.
+      //       Hence, this should NOT materially impact the performance metrics.
+      io_uring_queue_exit(iu);
+      delete iu;
+      thread_local_multi_read_io_urings_->Reset(nullptr);
+
+      if (err < 0) {
+        return IOStatus::IOError(
+            "io_uring_submit_and_wait() failed with an error " +
+            std::to_string(err));
       }
+      return IOStatus::IOError(
+          "io_uring_submit_and_wait() requested " +
+          std::to_string(reqs_submitted + reqs_pending_submission) +
+          " but returned " + std::to_string(reqs_submitted));
+    }
 
-      req_wrap = static_cast<WrappedReadRequest*>(io_uring_cqe_get_data(cqe));
-      // Reset cqe data to catch any stray reuse of it
-      static_cast<struct io_uring_cqe*>(cqe)->user_data = 0xd5d5d5d5d5d5d5d5;
-      // Check that we got a valid unique cqe data
-      auto wrap_check = wrap_cache.find(req_wrap);
-      if (wrap_check == wrap_cache.end()) {
-        fprintf(stderr,
-                "PosixRandomAccessFile::MultiRead: "
-                "Bad cqe data from IO uring - %p\n",
-                req_wrap);
-        port::PrintStack();
-        ios = IOStatus::IOError("io_uring_cqe_get_data() returned " +
-                                std::to_string((uint64_t)req_wrap));
-        continue;
-      }
-      wrap_cache.erase(wrap_check);
-
-      FSReadRequest* req = req_wrap->req;
-      size_t bytes_read = 0;
-      bool read_again = false;
-      UpdateResult(cqe, filename_, req->len, req_wrap->iov.iov_len,
-                   false /*async_read*/, use_direct_io(),
-                   GetRequiredBufferAlignment(), req_wrap->finished_len, req,
-                   bytes_read, read_again);
-      int32_t res = cqe->res;
-      if (res >= 0) {
-        if (bytes_read == 0) {
+    if ((0 == reqs_submitted) && wrap_cache.size() > reqs_pending_submission) {
+      // If no requests have been submitted and there is at least one request
+      // pending completion, wait for at least one completion to arrive.
+      // This is a guardrail to prevent the busy CPU loops.
+      //
+      // NOTE: it's not really a tight CPU-burning loop in the traditional sense
+      // as it's naturally throttled by the io_uring_submit_and_wait() syscall.
+      io_uring_wait_cqe(iu, &cqe);
+    }
+
+    unsigned int nr = 0;
+    io_uring_for_each_cqe(iu, head, cqe) {  // non-blocking
+      if (cqe->user_data) {  // non-discarded, valid user data only!
+        nr++;
+        WrappedReadRequest* req_wrap =
+            static_cast<WrappedReadRequest*>(io_uring_cqe_get_data(cqe));
+        // Reset cqe data to catch any stray reuse of it
+        static_cast<struct io_uring_cqe*>(cqe)->user_data = 0xd5d5d5d5d5d5d5d5;
+        // Check that we got a valid unique cqe data
+        auto wrap_check = wrap_cache.find(req_wrap);
+        if (wrap_check == wrap_cache.end()) {
+          fprintf(stderr,
+                  "PosixRandomAccessFile::MultiRead: "
+                  "Bad cqe data from IO uring - %p\n",
+                  req_wrap);
+          port::PrintStack();
+          ios = IOStatus::IOError("io_uring_cqe_get_data() returned " +
+                                  std::to_string((uint64_t)req_wrap));
+          continue;
+        }
+        wrap_cache.erase(wrap_check);
+        if (cqe->res < 0) {
+          if (-EINTR == cqe->res || -EAGAIN == cqe->res) {
+            resubmit_rq_list.push_back(req_wrap);
+          } else {
+            ios = IOStatus::IOError("io_uring_for_each_cqe() returns " +
+                                    std::to_string(cqe->res));
+          }
+          continue;
+        }
+        // cqe->res >= 0
+        FSReadRequest* req = req_wrap->req;
+        size_t bytes_read = 0;
+        bool read_again = false;
+        UpdateResult(cqe, filename_, req->len, req_wrap->iov.iov_len,
+                     false /*async_read*/, use_direct_io(),
+                     GetRequiredBufferAlignment(), req_wrap->finished_len, req,
+                     bytes_read, read_again);
+
+        if (0 == bytes_read) {
           if (read_again) {
             Slice tmp_slice;
             req->status =
@@ -819,14 +981,15 @@ IOStatus PosixRandomAccessFile::MultiRead(FSReadRequest* reqs, size_t num_reqs,
             req->result =
                 Slice(req->scratch, req_wrap->finished_len + tmp_slice.size());
           }
-          // else It means EOF so no need to do anything.
+          // else it means EOF so no need to do anything.
         } else if (bytes_read < req_wrap->iov.iov_len) {
-          incomplete_rq_list.push_back(req_wrap);
+          resubmit_rq_list.push_back(req_wrap);
         }
       }
-      io_uring_cqe_seen(iu, cqe);
     }
-    wrap_cache.clear();
+    if (nr > 0) {
+      io_uring_cq_advance(iu, nr);
+    }
   }
   return ios;
 #else
@@ -923,12 +1086,16 @@ IOStatus PosixRandomAccessFile::ReadAsync(
 #if defined(ROCKSDB_IOURING_PRESENT)
   // io_uring_queue_init.
   struct io_uring* iu = nullptr;
-  if (thread_local_io_urings_) {
-    iu = static_cast<struct io_uring*>(thread_local_io_urings_->Get());
+  if (thread_local_async_read_io_urings_) {
+    iu = static_cast<struct io_uring*>(
+        thread_local_async_read_io_urings_->Get());
     if (iu == nullptr) {
-      iu = CreateIOUring();
+      unsigned int flags = 0;
+      flags |= IORING_SETUP_SINGLE_ISSUER;
+      flags |= IORING_SETUP_DEFER_TASKRUN;
+      iu = CreateIOUring(flags);
       if (iu != nullptr) {
-        thread_local_io_urings_->Reset(iu);
+        thread_local_async_read_io_urings_->Reset(iu);
       }
     }
   }
@@ -966,11 +1133,35 @@ IOStatus PosixRandomAccessFile::ReadAsync(
   io_uring_sqe_set_data(sqe, posix_handle);
 
   // Step 4: io_uring_submit
-  ssize_t ret = io_uring_submit(iu);
-  if (ret < 0) {
-    fprintf(stderr, "io_uring_submit error: %ld\n", long(ret));
-    return IOStatus::IOError("io_uring_submit() requested but returned " +
-                             std::to_string(ret));
+  ssize_t ret;
+  do {
+    ret = io_uring_submit(iu);
+    if (ret < 0) {
+      if (-EINTR == ret || -EAGAIN == ret) {
+        // Submission failed due to transient error. Try again.
+        continue;
+      }
+      fprintf(stderr,
+              "PosixRandomAccessFile::ReadAsync: "
+              "io_uring_submit returned terminal error = %zd\n",
+              ret);
+      break;
+    }
+    if (0 == ret) {
+      // Unexpected. Will be reported as error.
+      break;
+    }
+  } while (ret < 1);
+  if (ret <= 0) {
+    return IOStatus::IOError(
+        "PosixRandomAccessFile::ReadAsync: io_uring_submit() returned " +
+        std::to_string(ret));
+  }
+  if (ret > 1) {
+    fprintf(stderr,
+            "PosixRandomAccessFile::ReadAsync: "
+            "io_uring_submit() returned = %zd\n",
+            ret);
   }
   return IOStatus::OK();
 #else
diff --git a/env/io_posix.h b/env/io_posix.h
index 39fd8c0f49d1..ca33b8e3e948 100644
--- a/env/io_posix.h
+++ b/env/io_posix.h
@@ -11,6 +11,16 @@
 #if defined(ROCKSDB_IOURING_PRESENT)
 #include <liburing.h>
 #include <sys/uio.h>
+
+// Compatibility defines for io_uring flags that may not be present in older
+// kernel headers. These values are fixed and won't change, so it's safe to
+// define them even if the running kernel doesn't support them.
+#ifndef IORING_SETUP_SINGLE_ISSUER
+#define IORING_SETUP_SINGLE_ISSUER (1U << 12)
+#endif
+#ifndef IORING_SETUP_DEFER_TASKRUN
+#define IORING_SETUP_DEFER_TASKRUN (1U << 13)
+#endif
 #endif
 #include <unistd.h>
 
@@ -297,9 +307,9 @@ inline void DeleteIOUring(void* p) {
   delete iu;
 }
 
-inline struct io_uring* CreateIOUring() {
+inline struct io_uring* CreateIOUring(unsigned int flags = 0) {
   struct io_uring* new_io_uring = new struct io_uring;
-  int ret = io_uring_queue_init(kIoUringDepth, new_io_uring, 0);
+  int ret = io_uring_queue_init(kIoUringDepth, new_io_uring, flags);
   if (ret) {
     delete new_io_uring;
     new_io_uring = nullptr;
@@ -315,7 +325,8 @@ class PosixRandomAccessFile : public FSRandomAccessFile {
   bool use_direct_io_;
   size_t logical_sector_size_;
 #if defined(ROCKSDB_IOURING_PRESENT)
-  ThreadLocalPtr* thread_local_io_urings_;
+  ThreadLocalPtr* thread_local_async_read_io_urings_;
+  ThreadLocalPtr* thread_local_multi_read_io_urings_;
 #endif
 
  public:
@@ -323,7 +334,8 @@ class PosixRandomAccessFile : public FSRandomAccessFile {
                         size_t logical_block_size, const EnvOptions& options
 #if defined(ROCKSDB_IOURING_PRESENT)
                         ,
-                        ThreadLocalPtr* thread_local_io_urings
+                        ThreadLocalPtr* thread_local_async_read_io_urings,
+                        ThreadLocalPtr* thread_local_multi_read_io_urings
 #endif
   );
   virtual ~PosixRandomAccessFile();

From 9065ace05ada92e0a435fbb973ef957fc4401f43 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Tue, 16 Dec 2025 12:36:07 -0800
Subject: [PATCH 403/500] Disable multiscan+timestamp in crash test (#14189)

Summary:
Causing failures and not yet supported. Also putting a note in db.h about the combination being unsupported.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14189

Test Plan: started up blackbox_crash_test_with_ts many times and checked command line to be confident it's excluded.

Reviewed By: hx235

Differential Revision: D89297971

Pulled By: pdillinger

fbshipit-source-id: c5134351d9ecb37879c7e3319c17dd9228d7f12a
---
 include/rocksdb/db.h  | 9 ++++++---
 tools/db_crashtest.py | 2 ++
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h
index 2ac8aa99c543..9a753ae3eafd 100644
--- a/include/rocksdb/db.h
+++ b/include/rocksdb/db.h
@@ -1105,9 +1105,12 @@ class DB {
   // details. For optimal performance, ensure that either all entries in
   // scan_opts specify the range limit, or none of them do.
   //
-  // NOTE: iterate_upper_bound in ReadOptions will be ignored. Instead, the
-  // range.limit in ScanOptions is consulted to determine the upper bound key,
-  // if specified.
+  // NOTE: NOT YET SUPPORTED in DBs using user timestamp (see
+  // Comparator::timestamp_size())
+  //
+  // NOTE: iterate_upper_bound in ReadOptions will
+  // be ignored. Instead, the range.limit in ScanOptions is consulted to
+  // determine the upper bound key, if specified.
   //
   // Example usage -
   //  std::vector<ScanOptions> scans{{.start = Slice("bar")},
diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index 1606679404eb..621795ceb34c 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -692,6 +692,8 @@ def is_direct_io_supported(dbname):
     # Below flag is randomly picked once and kept consistent in following runs.
     "persist_user_defined_timestamps": random.choice([0, 1, 1]),
     "use_merge": 0,
+    # Causing failures and not yet compatible
+    "use_multiscan": 0,
     "use_full_merge_v1": 0,
     "use_txn": 0,
     "ingest_external_file_one_in": 0,

From 41beb1422f7d13039a984fbd3ad72ef0c1bd5ae3 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Mon, 22 Dec 2025 13:03:00 -0800
Subject: [PATCH 404/500] Improve db_crashtest.py for remote DB (#14195)

Summary:
Let db_crashtest.py work with TEST_TMPDIR on remote filesystem, by infering whether it's remote from the env_uri argument. Note that some other paths passed to db_stress are local paths and we can't reuse TEST_TMPDIR for those cases when it's remote.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14195

Test Plan: public and private CI

Reviewed By: archang19

Differential Revision: D89590246

Pulled By: pdillinger

fbshipit-source-id: db6eb9c16d4e76617183780747353c798cc9bef6
---
 tools/db_crashtest.py | 72 ++++++++++++++++++++++---------------------
 1 file changed, 37 insertions(+), 35 deletions(-)

diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index 621795ceb34c..1de1525c06ab 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -14,6 +14,7 @@
 
 per_iteration_random_seed_override = 0
 remain_argv = None
+is_remote_db = False
 
 
 def get_random_seed(override):
@@ -35,7 +36,7 @@ def quote_arg_for_display(arg):
     return f"{flag}={shlex.quote(value)}"
 
 
-def setup_random_seed_before_main():
+def early_argument_parsing_before_main():
     parser = argparse.ArgumentParser()
     parser.add_argument(
         "--initial_random_seed_override",
@@ -58,21 +59,28 @@ def setup_random_seed_before_main():
     init_random_seed = get_random_seed(args.initial_random_seed_override)
     global per_iteration_random_seed_override
     per_iteration_random_seed_override = args.per_iteration_random_seed_override
+    global is_remote_db
+    # Set is_remote_db if remain_args has a non-empty --env_uri= argument
+    for arg in remain_args:
+        parts = arg.split("=", 1)
+        if parts[0] == "--env_uri" and len(parts) > 1 and parts[1]:
+            is_remote_db = True
+            break
 
     print(f"Start with random seed {init_random_seed}")
     random.seed(init_random_seed)
 
 
 def apply_random_seed_per_iteration():
-    global per_iteration_random_seed_override
     per_iteration_random_seed = get_random_seed(per_iteration_random_seed_override)
     print(f"Use random seed for iteration {per_iteration_random_seed}")
     random.seed(per_iteration_random_seed)
 
 
 # Random seed has to be setup before the rest of the script, so that the random
-# value selected in the global variable uses the random seed specified
-setup_random_seed_before_main()
+# value selected in the global variable uses the random seed specified. More
+# arguments can also be parsed early.
+early_argument_parsing_before_main()
 
 # params overwrite priority:
 #   for default:
@@ -441,6 +449,7 @@ def apply_random_seed_per_iteration():
 
 _TEST_DIR_ENV_VAR = "TEST_TMPDIR"
 # If TEST_TMPDIR_EXPECTED is not specified, default value will be TEST_TMPDIR
+# except on remote filesystem
 _TEST_EXPECTED_DIR_ENV_VAR = "TEST_TMPDIR_EXPECTED"
 _DEBUG_LEVEL_ENV_VAR = "DEBUG_LEVEL"
 
@@ -459,15 +468,16 @@ def get_dbname(test_name):
         dbname = tempfile.mkdtemp(prefix=test_dir_name)
     else:
         dbname = test_tmpdir + "/" + test_dir_name
-        shutil.rmtree(dbname, True)
-        if cleanup_cmd is not None:
-            print("Running DB cleanup command - %s\n" % cleanup_cmd)
-            # Ignore failure
-            os.system(cleanup_cmd)
-        try:
-            os.mkdir(dbname)
-        except OSError:
-            pass
+        if not is_remote_db:
+            shutil.rmtree(dbname, True)
+            if cleanup_cmd is not None:
+                print("Running DB cleanup command - %s\n" % cleanup_cmd)
+                # Ignore failure
+                os.system(cleanup_cmd)
+            try:
+                os.mkdir(dbname)
+            except OSError:
+                pass
     return dbname
 
 
@@ -481,9 +491,7 @@ def setup_expected_values_dir():
     expected_dir_prefix = "rocksdb_crashtest_expected_"
     test_exp_tmpdir = os.environ.get(_TEST_EXPECTED_DIR_ENV_VAR)
 
-    # set the value to _TEST_DIR_ENV_VAR if _TEST_EXPECTED_DIR_ENV_VAR is not
-    # specified.
-    if test_exp_tmpdir is None or test_exp_tmpdir == "":
+    if not is_remote_db and (test_exp_tmpdir is None or test_exp_tmpdir == ""):
         test_exp_tmpdir = os.environ.get(_TEST_DIR_ENV_VAR)
 
     if test_exp_tmpdir is None or test_exp_tmpdir == "":
@@ -507,9 +515,7 @@ def setup_multiops_txn_key_spaces_file():
     key_spaces_file_prefix = "rocksdb_crashtest_multiops_txn_key_spaces"
     test_exp_tmpdir = os.environ.get(_TEST_EXPECTED_DIR_ENV_VAR)
 
-    # set the value to _TEST_DIR_ENV_VAR if _TEST_EXPECTED_DIR_ENV_VAR is not
-    # specified.
-    if test_exp_tmpdir is None or test_exp_tmpdir == "":
+    if not is_remote_db and (test_exp_tmpdir is None or test_exp_tmpdir == ""):
         test_exp_tmpdir = os.environ.get(_TEST_DIR_ENV_VAR)
 
     if test_exp_tmpdir is None or test_exp_tmpdir == "":
@@ -526,12 +532,15 @@ def setup_multiops_txn_key_spaces_file():
 
 
 def is_direct_io_supported(dbname):
-    with tempfile.NamedTemporaryFile(dir=dbname) as f:
-        try:
-            os.open(f.name, os.O_DIRECT)
-        except BaseException:
-            return False
-        return True
+    if is_remote_db:
+        return False
+    else:
+        with tempfile.NamedTemporaryFile(dir=dbname) as f:
+            try:
+                os.open(f.name, os.O_DIRECT)
+            except BaseException:
+                return False
+            return True
 
 
 blackbox_default_params = {
@@ -1326,7 +1335,6 @@ def gen_cmd(params, unknown_params):
                 "stress_cmd",
                 "test_tiered_storage",
                 "cleanup_cmd",
-                "skip_tmpdir_check",
                 "print_stderr_separately",
                 "verify_timeout",
             }
@@ -1374,7 +1382,8 @@ def print_output_and_exit_on_error(stdout, stderr, print_stderr_separately=False
 
 
 def cleanup_after_success(dbname):
-    shutil.rmtree(dbname, True)
+    if not is_remote_db:
+        shutil.rmtree(dbname, True)
     if cleanup_cmd is not None:
         print("Running DB cleanup command - %s\n" % cleanup_cmd)
         ret = os.system(cleanup_cmd)
@@ -1604,14 +1613,9 @@ def whitebox_crash_main(args, unknown_args):
         # try different modes.
         if time.time() > half_time:
             cleanup_after_success(dbname)
-            try:
-                os.mkdir(dbname)
-            except OSError:
-                pass
             if expected_values_dir is not None:
                 shutil.rmtree(expected_values_dir, True)
                 os.mkdir(expected_values_dir)
-
             check_mode = (check_mode + 1) % total_check_mode
 
         time.sleep(1)  # time to stabilize after a kill
@@ -1641,7 +1645,6 @@ def main():
     parser.add_argument("--stress_cmd")
     parser.add_argument("--test_tiered_storage", action="store_true")
     parser.add_argument("--cleanup_cmd")
-    parser.add_argument("--skip_tmpdir_check", action="store_true")
     parser.add_argument("--print_stderr_separately", action="store_true", default=False)
 
     all_params = dict(
@@ -1665,10 +1668,9 @@ def main():
         parser.add_argument("--" + k, type=type(v() if callable(v) else v))
     # unknown_args are passed directly to db_stress
 
-    global remain_args
     args, unknown_args = parser.parse_known_args(remain_args)
     test_tmpdir = os.environ.get(_TEST_DIR_ENV_VAR)
-    if test_tmpdir is not None and not args.skip_tmpdir_check:
+    if test_tmpdir is not None and not is_remote_db:
         isdir = False
         try:
             isdir = os.path.isdir(test_tmpdir)

From 7e9f54d56b2aabc2295529b8342e6b66a39d2f45 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Tue, 23 Dec 2025 09:10:12 -0800
Subject: [PATCH 405/500] Remove remaining pieces of Lua integration (#14200)

Summary:
Going from deprecated and partly removed to fully removed

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14200

Test Plan: existing tests

Reviewed By: archang19

Differential Revision: D89697543

Pulled By: pdillinger

fbshipit-source-id: bbf6161c04322e9756a9479758488cac6d03473a
---
 Makefile                                      | 32 +----------
 build_tools/build_detect_platform             |  1 -
 build_tools/check-public-header.sh            |  4 +-
 build_tools/dependencies_platform010.sh       |  1 -
 build_tools/fbcode_config.sh                  | 10 +---
 build_tools/fbcode_config_platform010.sh      |  2 +-
 build_tools/update_dependencies.sh            |  1 -
 .../utilities/lua/rocks_lua_custom_library.h  | 43 ---------------
 .../rocksdb/utilities/lua/rocks_lua_util.h    | 55 -------------------
 unreleased_history/public_api_changes/lua.md  |  1 +
 10 files changed, 7 insertions(+), 143 deletions(-)
 delete mode 100644 include/rocksdb/utilities/lua/rocks_lua_custom_library.h
 delete mode 100644 include/rocksdb/utilities/lua/rocks_lua_util.h
 create mode 100644 unreleased_history/public_api_changes/lua.md

diff --git a/Makefile b/Makefile
index cfbeb2a90460..0c31657d76f5 100644
--- a/Makefile
+++ b/Makefile
@@ -390,8 +390,6 @@ ifdef COMPILE_WITH_TSAN
         # Turn off -pg when enabling TSAN testing, because that induces
         # a link failure.  TODO: find the root cause
 	PROFILING_FLAGS =
-	# LUA is not supported under TSAN
-	LUA_PATH =
 	# Limit keys for crash test under TSAN to avoid error:
 	# "ThreadSanitizer: DenseSlabAllocator overflow. Dying."
 	CRASH_TEST_EXT_ARGS += --max_key=1000000
@@ -508,32 +506,6 @@ ifndef DISABLE_WARNING_AS_ERROR
 endif
 
 
-ifdef LUA_PATH
-
-ifndef LUA_INCLUDE
-LUA_INCLUDE=$(LUA_PATH)/include
-endif
-
-LUA_INCLUDE_FILE=$(LUA_INCLUDE)/lualib.h
-
-ifeq ("$(wildcard $(LUA_INCLUDE_FILE))", "")
-# LUA_INCLUDE_FILE does not exist
-$(error Cannot find lualib.h under $(LUA_INCLUDE).  Try to specify both LUA_PATH and LUA_INCLUDE manually)
-endif
-LUA_FLAGS = -I$(LUA_INCLUDE) -DLUA -DLUA_COMPAT_ALL
-CFLAGS += $(LUA_FLAGS)
-CXXFLAGS += $(LUA_FLAGS)
-
-ifndef LUA_LIB
-LUA_LIB = $(LUA_PATH)/lib/liblua.a
-endif
-ifeq ("$(wildcard $(LUA_LIB))", "") # LUA_LIB does not exist
-$(error $(LUA_LIB) does not exist.  Try to specify both LUA_PATH and LUA_LIB manually)
-endif
-EXEC_LDFLAGS += $(LUA_LIB)
-
-endif
-
 ifeq ($(NO_THREEWAY_CRC32C), 1)
 	CXXFLAGS += -DNO_THREEWAY_CRC32C
 endif
@@ -604,8 +576,8 @@ ifneq ($(filter check-headers, $(MAKECMDGOALS)),)
 # TODO: add/support JNI headers
 	DEV_HEADER_DIRS := $(sort include/ $(dir $(ALL_SOURCES)))
 # Some headers like in port/ are platform-specific
-	DEV_HEADERS_TO_CHECK := $(shell $(FIND) $(DEV_HEADER_DIRS) -type f -name '*.h' | grep -E -v 'port/|plugin/|lua/|range_tree/|secondary_index/')
-	PUBLIC_HEADERS_TO_CHECK := $(shell $(FIND) include/ -type f -name '*.h' | grep -E -v 'lua/')
+	DEV_HEADERS_TO_CHECK := $(shell $(FIND) $(DEV_HEADER_DIRS) -type f -name '*.h' | grep -E -v 'port/|plugin/|range_tree/|secondary_index/')
+	PUBLIC_HEADERS_TO_CHECK := $(shell $(FIND) include/ -type f -name '*.h')
 else
 	DEV_HEADERS_TO_CHECK :=
 	PUBLIC_HEADERS_TO_CHECK :=
diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform
index ff7ceeece8d8..cfb8d143664b 100755
--- a/build_tools/build_detect_platform
+++ b/build_tools/build_detect_platform
@@ -831,7 +831,6 @@ fi
 if test -n "$WITH_JEMALLOC_FLAG"; then
   echo "WITH_JEMALLOC_FLAG=$WITH_JEMALLOC_FLAG" >> "$OUTPUT"
 fi
-echo "LUA_PATH=$LUA_PATH" >> "$OUTPUT"
 if test -n "$USE_FOLLY"; then
   echo "USE_FOLLY=$USE_FOLLY" >> "$OUTPUT"
 fi
diff --git a/build_tools/check-public-header.sh b/build_tools/check-public-header.sh
index 027758a801f0..bb1bc147dc0a 100755
--- a/build_tools/check-public-header.sh
+++ b/build_tools/check-public-header.sh
@@ -8,7 +8,7 @@ BAD=""
 
 # Look for potential for ODR violations caused by public headers depending on
 # build parameters that could vary between RocksDB build and application build.
-# * Cases like LUA, ROCKSDB_NAMESPACE, and ROCKSDB_ASSERT_STATUS_CHECKED are
+# * Cases like ROCKSDB_NAMESPACE, and ROCKSDB_ASSERT_STATUS_CHECKED are
 #   intentional, hard to avoid. (We expect definitions to change and the user
 #   should also.)
 # * Cases like _WIN32, OS_WIN, and __cplusplus are essentially ODR-safe.
@@ -19,7 +19,7 @@ BAD=""
 #   that should not cause ODR violations can be exempted with the ODR-SAFE
 #   marker recognized here.
 
-grep -nHE '^#if' -- "$@" | grep -vE 'ROCKSDB_NAMESPACE|ROCKSDB_ASSERT_STATUS_CHECKED|LUA|_WIN32|OS_WIN|ODR-SAFE|__cplusplus|ROCKSDB_DLL|ROCKSDB_LIBRARY_EXPORTS'
+grep -nHE '^#if' -- "$@" | grep -vE 'ROCKSDB_NAMESPACE|ROCKSDB_ASSERT_STATUS_CHECKED|_WIN32|OS_WIN|ODR-SAFE|__cplusplus|ROCKSDB_DLL|ROCKSDB_LIBRARY_EXPORTS'
 if [ "$?" != "1" ]; then
   echo "^^^^^ #if in public API could cause an ODR violation."
   echo "      Add // ODR-SAFE if verified safe."
diff --git a/build_tools/dependencies_platform010.sh b/build_tools/dependencies_platform010.sh
index 9b19a801c85f..a55663cb25da 100644
--- a/build_tools/dependencies_platform010.sh
+++ b/build_tools/dependencies_platform010.sh
@@ -19,4 +19,3 @@ BENCHMARK_BASE=/mnt/gvfs/third-party2/benchmark/780c7a0f9cf0967961e69ad08e61cddd
 KERNEL_HEADERS_BASE=/mnt/gvfs/third-party2/kernel-headers/624a2f8f6c93c3c1df8aa4a6255d8202631a6c80/fb/platform010/da39a3e
 BINUTILS_BASE=/mnt/gvfs/third-party2/binutils/39579e8603b48b3540f8b0633f43adf29acccb8b/2.37/centos8-native/da39a3e
 VALGRIND_BASE=/mnt/gvfs/third-party2/valgrind/cd9cc656d49ecb53797ce4d055e49fde29fd57ff/3.19.0/platform010/76ebdda
-LUA_BASE=/mnt/gvfs/third-party2/lua/363787fa5cac2a8aa20638909210443278fa138e/5.3.4/platform010/9079c97
diff --git a/build_tools/fbcode_config.sh b/build_tools/fbcode_config.sh
index 02732bde3d1c..802e757795c7 100644
--- a/build_tools/fbcode_config.sh
+++ b/build_tools/fbcode_config.sh
@@ -164,12 +164,4 @@ EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $ZSTD_LIBS $GF
 
 VALGRIND_VER="$VALGRIND_BASE/bin/"
 
-LUA_PATH="$LUA_BASE"
-
-if test -z $PIC_BUILD; then
-  LUA_LIB=" $LUA_PATH/lib/liblua.a"
-else
-  LUA_LIB=" $LUA_PATH/lib/liblua_pic.a"
-fi
-
-export CC CXX AR CFLAGS CXXFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER JEMALLOC_LIB JEMALLOC_INCLUDE CLANG_ANALYZER CLANG_SCAN_BUILD LUA_PATH LUA_LIB
+export CC CXX AR CFLAGS CXXFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER JEMALLOC_LIB JEMALLOC_INCLUDE CLANG_ANALYZER CLANG_SCAN_BUILD
diff --git a/build_tools/fbcode_config_platform010.sh b/build_tools/fbcode_config_platform010.sh
index 87a28b4f92d0..0fc99ecad159 100644
--- a/build_tools/fbcode_config_platform010.sh
+++ b/build_tools/fbcode_config_platform010.sh
@@ -172,4 +172,4 @@ EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $ZSTD_LIBS $GF
 
 VALGRIND_VER="$VALGRIND_BASE/bin/"
 
-export CC CXX AR AS CFLAGS CXXFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER JEMALLOC_LIB JEMALLOC_INCLUDE CLANG_ANALYZER CLANG_SCAN_BUILD LUA_PATH LUA_LIB
+export CC CXX AR AS CFLAGS CXXFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER JEMALLOC_LIB JEMALLOC_INCLUDE CLANG_ANALYZER CLANG_SCAN_BUILD
diff --git a/build_tools/update_dependencies.sh b/build_tools/update_dependencies.sh
index afc39ab8009a..6584cd6edaca 100755
--- a/build_tools/update_dependencies.sh
+++ b/build_tools/update_dependencies.sh
@@ -101,6 +101,5 @@ get_lib_base benchmark  LATEST  platform010
 get_lib_base kernel-headers fb platform010
 get_lib_base binutils   LATEST centos8-native
 get_lib_base valgrind   LATEST platform010
-get_lib_base lua        5.3.4  platform010
 
 git diff $OUTPUT
diff --git a/include/rocksdb/utilities/lua/rocks_lua_custom_library.h b/include/rocksdb/utilities/lua/rocks_lua_custom_library.h
deleted file mode 100644
index f617da02bea6..000000000000
--- a/include/rocksdb/utilities/lua/rocks_lua_custom_library.h
+++ /dev/null
@@ -1,43 +0,0 @@
-//  Copyright (c) 2016, Facebook, Inc.  All rights reserved.
-//  This source code is licensed under both the GPLv2 (found in the
-//  COPYING file in the root directory) and Apache 2.0 License
-//  (found in the LICENSE.Apache file in the root directory).
-
-#pragma once
-#ifdef LUA
-
-// lua headers
-extern "C" {
-#include <lauxlib.h>
-#include <lua.h>
-#include <lualib.h>
-}
-
-namespace ROCKSDB_NAMESPACE {
-namespace lua {
-// A class that used to define custom C Library that is callable
-// from Lua script
-class RocksLuaCustomLibrary {
- public:
-  virtual ~RocksLuaCustomLibrary() {}
-  // The name of the C library.  This name will also be used as the table
-  // (namespace) in Lua that contains the C library.
-  virtual const char* Name() const = 0;
-
-  // Returns a "static const struct luaL_Reg[]", which includes a list of
-  // C functions.  Note that the last entry of this static array must be
-  // {nullptr, nullptr} as required by Lua.
-  //
-  // More details about how to implement Lua C libraries can be found
-  // in the official Lua document http://www.lua.org/pil/26.2.html
-  virtual const struct luaL_Reg* Lib() const = 0;
-
-  // A function that will be called right after the library has been created
-  // and pushed on the top of the lua_State.  This custom setup function
-  // allows developers to put additional table or constant values inside
-  // the same table / namespace.
-  virtual void CustomSetup(lua_State* /*L*/) const {}
-};
-}  // namespace lua
-}  // namespace ROCKSDB_NAMESPACE
-#endif  // LUA
diff --git a/include/rocksdb/utilities/lua/rocks_lua_util.h b/include/rocksdb/utilities/lua/rocks_lua_util.h
deleted file mode 100644
index 3427b65ef674..000000000000
--- a/include/rocksdb/utilities/lua/rocks_lua_util.h
+++ /dev/null
@@ -1,55 +0,0 @@
-//  Copyright (c) 2016, Facebook, Inc.  All rights reserved.
-//  This source code is licensed under both the GPLv2 (found in the
-//  COPYING file in the root directory) and Apache 2.0 License
-//  (found in the LICENSE.Apache file in the root directory).
-
-#pragma once
-// lua headers
-extern "C" {
-#include <lauxlib.h>
-#include <lua.h>
-#include <lualib.h>
-}
-
-#ifdef LUA
-#include <string>
-#include <vector>
-
-#include "rocksdb/utilities/lua/rocks_lua_custom_library.h"
-
-namespace ROCKSDB_NAMESPACE {
-namespace lua {
-class LuaStateWrapper {
- public:
-  explicit LuaStateWrapper(const std::string& lua_script) {
-    lua_state_ = luaL_newstate();
-    Init(lua_script, {});
-  }
-  LuaStateWrapper(
-      const std::string& lua_script,
-      const std::vector<std::shared_ptr<RocksLuaCustomLibrary>>& libraries) {
-    lua_state_ = luaL_newstate();
-    Init(lua_script, libraries);
-  }
-  lua_State* GetLuaState() const { return lua_state_; }
-  ~LuaStateWrapper() { lua_close(lua_state_); }
-
- private:
-  void Init(
-      const std::string& lua_script,
-      const std::vector<std::shared_ptr<RocksLuaCustomLibrary>>& libraries) {
-    if (lua_state_) {
-      luaL_openlibs(lua_state_);
-      for (const auto& library : libraries) {
-        luaL_openlib(lua_state_, library->Name(), library->Lib(), 0);
-        library->CustomSetup(lua_state_);
-      }
-      luaL_dostring(lua_state_, lua_script.c_str());
-    }
-  }
-
-  lua_State* lua_state_;
-};
-}  // namespace lua
-}  // namespace ROCKSDB_NAMESPACE
-#endif  // LUA
diff --git a/unreleased_history/public_api_changes/lua.md b/unreleased_history/public_api_changes/lua.md
new file mode 100644
index 000000000000..be62aef54e31
--- /dev/null
+++ b/unreleased_history/public_api_changes/lua.md
@@ -0,0 +1 @@
+* Remove remaining pieces of Lua integration

From e77ba4bc95a3ad06fa87a14d7b6d9fc64f4fd0eb Mon Sep 17 00:00:00 2001
From: Maciej Szeszko <mszeszko@meta.com>
Date: Tue, 23 Dec 2025 12:18:44 -0800
Subject: [PATCH 406/500] Start 10.11.0 development (#14192)

Summary:
10.10.0 branch has been cut.

### Updated:

HISTORY.md
include/rocksdb/version.h
tools/check_format_compatible.sh

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14192

Reviewed By: virajthakur

Differential Revision: D89736225

Pulled By: pdillinger

fbshipit-source-id: d7fc592b33a5c60dc2b53aa72ceafaa507730f20
---
 HISTORY.md                                                  | 6 ++++++
 folly.mk                                                    | 2 +-
 include/rocksdb/version.h                                   | 2 +-
 tools/check_format_compatible.sh                            | 2 +-
 unreleased_history/bug_fixes/ber_table_cache_uaf.md         | 1 -
 .../bug_fixes/truncated_range_del_resume_compaction.md      | 1 -
 6 files changed, 9 insertions(+), 5 deletions(-)
 delete mode 100644 unreleased_history/bug_fixes/ber_table_cache_uaf.md
 delete mode 100644 unreleased_history/bug_fixes/truncated_range_del_resume_compaction.md

diff --git a/HISTORY.md b/HISTORY.md
index 551314d7e494..9f440849b35a 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -1,6 +1,12 @@
 # Rocksdb Change Log
 > NOTE: Entries for next release do not go here. Follow instructions in `unreleased_history/README.txt`
 
+## 10.10.0 (12/16/2025)
+### Bug Fixes
+* Fixed a bug in best-efforts recovery that causes use-after-free crashes when accessing SST files that were cached during the recovery.
+* Fix resumable compaction incorrectly allowing resumption from a truncated range deletion that is not well handled currently.
+* Fixed a bug in `PosixRandomFileAccess` IO uring submission queue ownership & management. Fix eliminates the false positive 'Bad cqe data' IO errors in `PosixRandomFileAccess::MultiRead` when interleaved with `PosixRandomFileAccess::ReadAsync` on the same thread.
+
 ## 10.9.0 (11/21/2025)
 ### New Features
 * Added an auto-tuning feature for DB manifest file size that also (by default) improves the safety of existing configurations in case `max_manifest_file_size` is repeatedly exceeded. The new recommendation is to set `max_manifest_file_size` to something small like 1MB and tune `max_manifest_space_amp_pct` as needed to balance write amp and space amp in the manifest. Refer to comments on those options in `DBOptions` for details. Both options are (now) mutable.
diff --git a/folly.mk b/folly.mk
index 590abf0226f8..edd7ab1c9ca7 100644
--- a/folly.mk
+++ b/folly.mk
@@ -98,7 +98,7 @@ endif  # FMT_SOURCE_PATH
 	PLATFORM_LDFLAGS += -lglog
 endif
 
-FOLLY_COMMIT_HASH = abe68f7e917e8b7a0ee2fe066c972dc98fd35aa1
+FOLLY_COMMIT_HASH = 94a8e82cf16a0e229fc4fc89140219434ba78fa2
 
 # For public CI runs, checkout folly in a way that can build with RocksDB.
 # This is mostly intended as a test-only simulation of Meta-internal folly
diff --git a/include/rocksdb/version.h b/include/rocksdb/version.h
index 7cc7a3de9873..4b7a720f1ae4 100644
--- a/include/rocksdb/version.h
+++ b/include/rocksdb/version.h
@@ -12,7 +12,7 @@
 // NOTE: in 'main' development branch, this should be the *next*
 // minor or major version number planned for release.
 #define ROCKSDB_MAJOR 10
-#define ROCKSDB_MINOR 10
+#define ROCKSDB_MINOR 11
 #define ROCKSDB_PATCH 0
 
 // Make it easy to do conditional compilation based on version checks, i.e.
diff --git a/tools/check_format_compatible.sh b/tools/check_format_compatible.sh
index 37fab422c312..d764467403d0 100755
--- a/tools/check_format_compatible.sh
+++ b/tools/check_format_compatible.sh
@@ -137,7 +137,7 @@ EOF
 
 # To check for DB forward compatibility with loading options (old version
 # reading data from new), as well as backward compatibility
-declare -a db_forward_with_options_refs=("8.6.fb" "8.7.fb" "8.8.fb" "8.9.fb" "8.10.fb" "8.11.fb" "9.0.fb" "9.1.fb" "9.2.fb" "9.3.fb" "9.4.fb" "9.5.fb" "9.6.fb" "9.7.fb" "9.8.fb" "9.9.fb" "9.10.fb" "9.11.fb" "10.0.fb" "10.1.fb" "10.2.fb" "10.3.fb" "10.4.fb" "10.5.fb" "10.6.fb" "10.7.fb" "10.8.fb" "10.9.fb")
+declare -a db_forward_with_options_refs=("8.6.fb" "8.7.fb" "8.8.fb" "8.9.fb" "8.10.fb" "8.11.fb" "9.0.fb" "9.1.fb" "9.2.fb" "9.3.fb" "9.4.fb" "9.5.fb" "9.6.fb" "9.7.fb" "9.8.fb" "9.9.fb" "9.10.fb" "9.11.fb" "10.0.fb" "10.1.fb" "10.2.fb" "10.3.fb" "10.4.fb" "10.5.fb" "10.6.fb" "10.7.fb" "10.8.fb" "10.9.fb" "10.10.fb")
 # To check for DB forward compatibility without loading options (in addition
 # to the "with loading options" set), as well as backward compatibility
 declare -a db_forward_no_options_refs=() # N/A at the moment
diff --git a/unreleased_history/bug_fixes/ber_table_cache_uaf.md b/unreleased_history/bug_fixes/ber_table_cache_uaf.md
deleted file mode 100644
index de2a96638bb3..000000000000
--- a/unreleased_history/bug_fixes/ber_table_cache_uaf.md
+++ /dev/null
@@ -1 +0,0 @@
-Fixed a bug in best-efforts recovery that causes use-after-free crashes when accessing SST files that were cached during the recovery.
diff --git a/unreleased_history/bug_fixes/truncated_range_del_resume_compaction.md b/unreleased_history/bug_fixes/truncated_range_del_resume_compaction.md
deleted file mode 100644
index 72eb33e41f8d..000000000000
--- a/unreleased_history/bug_fixes/truncated_range_del_resume_compaction.md
+++ /dev/null
@@ -1 +0,0 @@
-Fix resumable compaction incorrectly allowing resumption from a truncated range deletion that is not well handled currently.

From 45690d0f6af5318b5aa546cd6601bfbd257dc182 Mon Sep 17 00:00:00 2001
From: nsaji-stripe <nsaji@stripe.com>
Date: Mon, 29 Dec 2025 10:15:38 -0800
Subject: [PATCH 407/500] CompactionServiceOptionsOverride setters C API
 (#14183)

Summary:
## Context
1. OpenAndCompact required CompactionServiceOptionsOverride
2. Currently there are no C APIs to create CompactionServiceOptionsOverride

## Changes
1. Create C API for compactionServiceOptionsOverride
2. Create helper function to create compactionServiceOptionsOverride from Options.  This was added in because The C API lacks getter methods for non-serializable options (comparator, table_factory, etc.). Without this, users would need to maintain separate references to all these options just to pass them to the override. If the user need to create a new comparator or table factory then C API for compactionServiceOptionsOverride already as the setters for the same.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14183

Reviewed By: hx235

Differential Revision: D89690005

Pulled By: jaykorean

fbshipit-source-id: efe8211feec9d144b32be0f5e66c8cf8bde8dac0
---
 db/c.cc             | 219 ++++++++++++++++++++++++++++++++++++++++++++
 db/c_test.c         |  77 ++++++++++++++++
 include/rocksdb/c.h | 100 ++++++++++++++++++++
 3 files changed, 396 insertions(+)

diff --git a/db/c.cc b/db/c.cc
index 9f058c55aba4..894e35ef8ef8 100644
--- a/db/c.cc
+++ b/db/c.cc
@@ -89,10 +89,12 @@ using ROCKSDB_NAMESPACE::EnvOptions;
 using ROCKSDB_NAMESPACE::EventListener;
 using ROCKSDB_NAMESPACE::ExportImportFilesMetaData;
 using ROCKSDB_NAMESPACE::ExternalFileIngestionInfo;
+using ROCKSDB_NAMESPACE::FileChecksumGenFactory;
 using ROCKSDB_NAMESPACE::FileLock;
 using ROCKSDB_NAMESPACE::FilterPolicy;
 using ROCKSDB_NAMESPACE::FlushJobInfo;
 using ROCKSDB_NAMESPACE::FlushOptions;
+using ROCKSDB_NAMESPACE::GetFileChecksumGenCrc32cFactory;
 using ROCKSDB_NAMESPACE::HistogramData;
 using ROCKSDB_NAMESPACE::HyperClockCacheOptions;
 using ROCKSDB_NAMESPACE::ImportColumnFamilyOptions;
@@ -112,6 +114,7 @@ using ROCKSDB_NAMESPACE::NewCompactOnDeletionCollectorFactory;
 using ROCKSDB_NAMESPACE::NewGenericRateLimiter;
 using ROCKSDB_NAMESPACE::NewLRUCache;
 using ROCKSDB_NAMESPACE::NewRibbonFilterPolicy;
+using ROCKSDB_NAMESPACE::NewSstPartitionerFixedPrefixFactory;
 using ROCKSDB_NAMESPACE::OpenAndCompactOptions;
 using ROCKSDB_NAMESPACE::OptimisticTransactionDB;
 using ROCKSDB_NAMESPACE::OptimisticTransactionOptions;
@@ -133,9 +136,11 @@ using ROCKSDB_NAMESPACE::Snapshot;
 using ROCKSDB_NAMESPACE::SstFileManager;
 using ROCKSDB_NAMESPACE::SstFileMetaData;
 using ROCKSDB_NAMESPACE::SstFileWriter;
+using ROCKSDB_NAMESPACE::SstPartitionerFactory;
 using ROCKSDB_NAMESPACE::Status;
 using ROCKSDB_NAMESPACE::StderrLogger;
 using ROCKSDB_NAMESPACE::SubcompactionJobInfo;
+using ROCKSDB_NAMESPACE::TableFactory;
 using ROCKSDB_NAMESPACE::TablePropertiesCollectorFactory;
 using ROCKSDB_NAMESPACE::Transaction;
 using ROCKSDB_NAMESPACE::TransactionDB;
@@ -235,6 +240,15 @@ struct rocksdb_filelock_t {
 struct rocksdb_logger_t {
   std::shared_ptr<Logger> rep;
 };
+struct rocksdb_file_checksum_gen_factory_t {
+  std::shared_ptr<FileChecksumGenFactory> rep;
+};
+struct rocksdb_sst_partitioner_factory_t {
+  std::shared_ptr<SstPartitionerFactory> rep;
+};
+struct rocksdb_table_properties_collector_factory_t {
+  std::shared_ptr<TablePropertiesCollectorFactory> rep;
+};
 struct rocksdb_lru_cache_options_t {
   LRUCacheOptions rep;
 };
@@ -920,6 +934,38 @@ rocksdb_compaction_service_options_override_create() {
   return new rocksdb_compaction_service_options_override_t;
 }
 
+rocksdb_compaction_service_options_override_t*
+rocksdb_compaction_service_options_override_create_from_options(
+    rocksdb_options_t* options) {
+  if (!options) {
+    return nullptr;
+  }
+
+  rocksdb_compaction_service_options_override_t* override_opts =
+      new rocksdb_compaction_service_options_override_t;
+
+  // Copy all relevant options from rocksdb_options_t
+  override_opts->rep.env = options->rep.env;
+  override_opts->rep.file_checksum_gen_factory =
+      options->rep.file_checksum_gen_factory;
+  override_opts->rep.comparator = options->rep.comparator;
+  override_opts->rep.merge_operator = options->rep.merge_operator;
+  override_opts->rep.compaction_filter = options->rep.compaction_filter;
+  override_opts->rep.compaction_filter_factory =
+      options->rep.compaction_filter_factory;
+  override_opts->rep.prefix_extractor = options->rep.prefix_extractor;
+  override_opts->rep.table_factory = options->rep.table_factory;
+  override_opts->rep.sst_partitioner_factory =
+      options->rep.sst_partitioner_factory;
+  override_opts->rep.listeners = options->rep.listeners;
+  override_opts->rep.statistics = options->rep.statistics;
+  override_opts->rep.info_log = options->rep.info_log;
+  override_opts->rep.table_properties_collector_factories =
+      options->rep.table_properties_collector_factories;
+
+  return override_opts;
+}
+
 void rocksdb_compaction_service_options_override_destroy(
     rocksdb_compaction_service_options_override_t* override_options) {
   if (override_options) {
@@ -943,6 +989,111 @@ void rocksdb_compaction_service_options_override_set_comparator(
   }
 }
 
+void rocksdb_compaction_service_options_override_set_merge_operator(
+    rocksdb_compaction_service_options_override_t* override_options,
+    rocksdb_mergeoperator_t* merge_operator) {
+  if (override_options && merge_operator) {
+    override_options->rep.merge_operator =
+        std::shared_ptr<MergeOperator>(merge_operator);
+  }
+}
+
+void rocksdb_compaction_service_options_override_set_compaction_filter(
+    rocksdb_compaction_service_options_override_t* override_options,
+    rocksdb_compactionfilter_t* compaction_filter) {
+  if (override_options && compaction_filter) {
+    override_options->rep.compaction_filter = compaction_filter;
+  }
+}
+
+void rocksdb_compaction_service_options_override_set_compaction_filter_factory(
+    rocksdb_compaction_service_options_override_t* override_options,
+    rocksdb_compactionfilterfactory_t* compaction_filter_factory) {
+  if (override_options && compaction_filter_factory) {
+    override_options->rep.compaction_filter_factory =
+        std::shared_ptr<CompactionFilterFactory>(compaction_filter_factory);
+  }
+}
+
+void rocksdb_compaction_service_options_override_set_prefix_extractor(
+    rocksdb_compaction_service_options_override_t* override_options,
+    rocksdb_slicetransform_t* prefix_extractor) {
+  if (override_options && prefix_extractor) {
+    override_options->rep.prefix_extractor =
+        std::shared_ptr<const SliceTransform>(prefix_extractor);
+  }
+}
+
+void rocksdb_compaction_service_options_override_set_block_based_table_factory(
+    rocksdb_compaction_service_options_override_t* override_options,
+    rocksdb_block_based_table_options_t* table_options) {
+  if (override_options && table_options) {
+    override_options->rep.table_factory = std::shared_ptr<TableFactory>(
+        NewBlockBasedTableFactory(table_options->rep));
+  }
+}
+
+void rocksdb_compaction_service_options_override_set_cuckoo_table_factory(
+    rocksdb_compaction_service_options_override_t* override_options,
+    rocksdb_cuckoo_table_options_t* table_options) {
+  if (override_options && table_options) {
+    override_options->rep.table_factory = std::shared_ptr<TableFactory>(
+        NewCuckooTableFactory(table_options->rep));
+  }
+}
+
+// Note: add_event_listener is defined later after rocksdb_eventlistener_t
+// struct
+
+void rocksdb_compaction_service_options_override_set_statistics(
+    rocksdb_compaction_service_options_override_t* override_options,
+    rocksdb_options_t* options) {
+  if (override_options && options) {
+    override_options->rep.statistics = options->rep.statistics;
+  }
+}
+
+void rocksdb_compaction_service_options_override_set_info_log(
+    rocksdb_compaction_service_options_override_t* override_options,
+    rocksdb_logger_t* logger) {
+  if (override_options && logger) {
+    override_options->rep.info_log = logger->rep;
+  }
+}
+
+void rocksdb_compaction_service_options_override_set_option(
+    rocksdb_compaction_service_options_override_t* override_options,
+    const char* key, const char* value) {
+  if (override_options && key && value) {
+    override_options->rep.options_map[std::string(key)] = std::string(value);
+  }
+}
+
+void rocksdb_compaction_service_options_override_set_file_checksum_gen_factory(
+    rocksdb_compaction_service_options_override_t* override_options,
+    rocksdb_file_checksum_gen_factory_t* factory) {
+  if (override_options && factory) {
+    override_options->rep.file_checksum_gen_factory = factory->rep;
+  }
+}
+
+void rocksdb_compaction_service_options_override_set_sst_partitioner_factory(
+    rocksdb_compaction_service_options_override_t* override_options,
+    rocksdb_sst_partitioner_factory_t* factory) {
+  if (override_options && factory) {
+    override_options->rep.sst_partitioner_factory = factory->rep;
+  }
+}
+
+void rocksdb_compaction_service_options_override_add_table_properties_collector_factory(
+    rocksdb_compaction_service_options_override_t* override_options,
+    rocksdb_table_properties_collector_factory_t* factory) {
+  if (override_options && factory) {
+    override_options->rep.table_properties_collector_factories.push_back(
+        factory->rep);
+  }
+}
+
 // Atomic bool management for cancellation
 unsigned char* rocksdb_open_and_compact_canceled_create() {
   return reinterpret_cast<unsigned char*>(new std::atomic<bool>(false));
@@ -3967,6 +4118,15 @@ void rocksdb_options_add_eventlistener(rocksdb_options_t* opt,
   opt->rep.listeners.emplace_back(std::shared_ptr<EventListener>(t));
 }
 
+void rocksdb_compaction_service_options_override_add_event_listener(
+    rocksdb_compaction_service_options_override_t* override_options,
+    rocksdb_eventlistener_t* event_listener) {
+  if (override_options && event_listener) {
+    override_options->rep.listeners.emplace_back(
+        std::shared_ptr<EventListener>(event_listener));
+  }
+}
+
 rocksdb_cuckoo_table_options_t* rocksdb_cuckoo_options_create() {
   return new rocksdb_cuckoo_table_options_t;
 }
@@ -4178,6 +4338,65 @@ rocksdb_logger_t* rocksdb_logger_create_callback_logger(
 
 void rocksdb_logger_destroy(rocksdb_logger_t* logger) { delete logger; }
 
+/* File Checksum Gen Factory */
+
+rocksdb_file_checksum_gen_factory_t*
+rocksdb_file_checksum_gen_crc32c_factory_create() {
+  rocksdb_file_checksum_gen_factory_t* factory =
+      new rocksdb_file_checksum_gen_factory_t;
+  factory->rep = GetFileChecksumGenCrc32cFactory();
+  return factory;
+}
+
+void rocksdb_file_checksum_gen_factory_destroy(
+    rocksdb_file_checksum_gen_factory_t* factory) {
+  delete factory;
+}
+
+void rocksdb_options_set_file_checksum_gen_factory(
+    rocksdb_options_t* opt, rocksdb_file_checksum_gen_factory_t* factory) {
+  if (opt && factory) {
+    opt->rep.file_checksum_gen_factory = factory->rep;
+  }
+}
+
+/* SST Partitioner Factory */
+
+rocksdb_sst_partitioner_factory_t*
+rocksdb_sst_partitioner_fixed_prefix_factory_create(size_t prefix_len) {
+  rocksdb_sst_partitioner_factory_t* factory =
+      new rocksdb_sst_partitioner_factory_t;
+  factory->rep = NewSstPartitionerFixedPrefixFactory(prefix_len);
+  return factory;
+}
+
+void rocksdb_sst_partitioner_factory_destroy(
+    rocksdb_sst_partitioner_factory_t* factory) {
+  delete factory;
+}
+
+void rocksdb_options_set_sst_partitioner_factory(
+    rocksdb_options_t* opt, rocksdb_sst_partitioner_factory_t* factory) {
+  if (opt && factory) {
+    opt->rep.sst_partitioner_factory = factory->rep;
+  }
+}
+
+/* Table Properties Collector Factory */
+
+void rocksdb_table_properties_collector_factory_destroy(
+    rocksdb_table_properties_collector_factory_t* factory) {
+  delete factory;
+}
+
+void rocksdb_options_add_table_properties_collector_factory(
+    rocksdb_options_t* opt,
+    rocksdb_table_properties_collector_factory_t* factory) {
+  if (opt && factory) {
+    opt->rep.table_properties_collector_factories.push_back(factory->rep);
+  }
+}
+
 void rocksdb_options_set_env(rocksdb_options_t* opt, rocksdb_env_t* env) {
   opt->rep.env = (env ? env->rep : nullptr);
 }
diff --git a/db/c_test.c b/db/c_test.c
index 7a0612a224b3..7f05dd2ab4b2 100644
--- a/db/c_test.c
+++ b/db/c_test.c
@@ -4717,9 +4717,86 @@ int main(int argc, char** argv) {
     rocksdb_compaction_service_options_override_set_comparator(override_opts,
                                                                cmp);
 
+    // Test file checksum gen factory
+    rocksdb_file_checksum_gen_factory_t* checksum_factory =
+        rocksdb_file_checksum_gen_crc32c_factory_create();
+    CheckCondition(checksum_factory != NULL);
+    rocksdb_compaction_service_options_override_set_file_checksum_gen_factory(
+        override_opts, checksum_factory);
+
+    // Test SST partitioner factory
+    rocksdb_sst_partitioner_factory_t* partitioner_factory =
+        rocksdb_sst_partitioner_fixed_prefix_factory_create(4);
+    CheckCondition(partitioner_factory != NULL);
+    rocksdb_compaction_service_options_override_set_sst_partitioner_factory(
+        override_opts, partitioner_factory);
+
+    // Test merge operator
+    rocksdb_compaction_service_options_override_set_merge_operator(
+        override_opts, NULL);
+
+    // Test compaction filter
+    rocksdb_compaction_service_options_override_set_compaction_filter(
+        override_opts, NULL);
+
+    // Test prefix extractor
+    rocksdb_compaction_service_options_override_set_prefix_extractor(
+        override_opts, NULL);
+
+    // Test table factory - block based
+    rocksdb_block_based_table_options_t* table_opts =
+        rocksdb_block_based_options_create();
+    rocksdb_compaction_service_options_override_set_block_based_table_factory(
+        override_opts, table_opts);
+    rocksdb_block_based_options_destroy(table_opts);
+
+    // Test statistics via options
+    rocksdb_options_t* stats_opts = rocksdb_options_create();
+    rocksdb_options_enable_statistics(stats_opts);
+    rocksdb_compaction_service_options_override_set_statistics(override_opts,
+                                                               stats_opts);
+    rocksdb_options_destroy(stats_opts);
+
+    // Test info log
+    rocksdb_logger_t* logger =
+        rocksdb_logger_create_stderr_logger(1, "test_prefix");
+    rocksdb_compaction_service_options_override_set_info_log(override_opts,
+                                                             logger);
+    rocksdb_logger_destroy(logger);
+
+    // Test options map
+    rocksdb_compaction_service_options_override_set_option(
+        override_opts, "max_bytes_for_level_base", "67108864");
+
+    // Cleanup
+    rocksdb_file_checksum_gen_factory_destroy(checksum_factory);
+    rocksdb_sst_partitioner_factory_destroy(partitioner_factory);
     rocksdb_compaction_service_options_override_destroy(override_opts);
   }
 
+  StartPhase("factory_options_on_regular_options");
+  {
+    // Test that the new factory types work with regular rocksdb_options_t
+    rocksdb_options_t* test_opts = rocksdb_options_create();
+
+    // Test file checksum gen factory on regular options
+    rocksdb_file_checksum_gen_factory_t* checksum_factory =
+        rocksdb_file_checksum_gen_crc32c_factory_create();
+    CheckCondition(checksum_factory != NULL);
+    rocksdb_options_set_file_checksum_gen_factory(test_opts, checksum_factory);
+
+    // Test SST partitioner factory on regular options
+    rocksdb_sst_partitioner_factory_t* partitioner_factory =
+        rocksdb_sst_partitioner_fixed_prefix_factory_create(8);
+    CheckCondition(partitioner_factory != NULL);
+    rocksdb_options_set_sst_partitioner_factory(test_opts, partitioner_factory);
+
+    // Cleanup
+    rocksdb_file_checksum_gen_factory_destroy(checksum_factory);
+    rocksdb_sst_partitioner_factory_destroy(partitioner_factory);
+    rocksdb_options_destroy(test_opts);
+  }
+
   StartPhase("remote_compaction_null_callback_handling");
   {
     // Test that NULL callback returns are handled gracefully
diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h
index ffb5583e4aca..08557be3c03b 100644
--- a/include/rocksdb/c.h
+++ b/include/rocksdb/c.h
@@ -86,6 +86,12 @@ typedef struct rocksdb_compactionfiltercontext_t
     rocksdb_compactionfiltercontext_t;
 typedef struct rocksdb_compactionfilterfactory_t
     rocksdb_compactionfilterfactory_t;
+typedef struct rocksdb_file_checksum_gen_factory_t
+    rocksdb_file_checksum_gen_factory_t;
+typedef struct rocksdb_sst_partitioner_factory_t
+    rocksdb_sst_partitioner_factory_t;
+typedef struct rocksdb_table_properties_collector_factory_t
+    rocksdb_table_properties_collector_factory_t;
 typedef struct rocksdb_comparator_t rocksdb_comparator_t;
 typedef struct rocksdb_dbpath_t rocksdb_dbpath_t;
 typedef struct rocksdb_env_t rocksdb_env_t;
@@ -1471,6 +1477,31 @@ rocksdb_logger_create_callback_logger(int log_level,
                                       void* priv);
 extern ROCKSDB_LIBRARY_API void rocksdb_logger_destroy(
     rocksdb_logger_t* logger);
+
+/* File Checksum Gen Factory */
+extern ROCKSDB_LIBRARY_API rocksdb_file_checksum_gen_factory_t*
+rocksdb_file_checksum_gen_crc32c_factory_create(void);
+extern ROCKSDB_LIBRARY_API void rocksdb_file_checksum_gen_factory_destroy(
+    rocksdb_file_checksum_gen_factory_t* factory);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_file_checksum_gen_factory(
+    rocksdb_options_t*, rocksdb_file_checksum_gen_factory_t*);
+
+/* SST Partitioner Factory */
+extern ROCKSDB_LIBRARY_API rocksdb_sst_partitioner_factory_t*
+rocksdb_sst_partitioner_fixed_prefix_factory_create(size_t prefix_len);
+extern ROCKSDB_LIBRARY_API void rocksdb_sst_partitioner_factory_destroy(
+    rocksdb_sst_partitioner_factory_t* factory);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_sst_partitioner_factory(
+    rocksdb_options_t*, rocksdb_sst_partitioner_factory_t*);
+
+/* Table Properties Collector Factory */
+extern ROCKSDB_LIBRARY_API void
+rocksdb_table_properties_collector_factory_destroy(
+    rocksdb_table_properties_collector_factory_t* factory);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_add_table_properties_collector_factory(
+    rocksdb_options_t*, rocksdb_table_properties_collector_factory_t*);
+
 extern ROCKSDB_LIBRARY_API void rocksdb_options_set_write_buffer_size(
     rocksdb_options_t*, size_t);
 extern ROCKSDB_LIBRARY_API size_t
@@ -3631,6 +3662,10 @@ extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compaction_service(
 extern ROCKSDB_LIBRARY_API rocksdb_compaction_service_options_override_t*
 rocksdb_compaction_service_options_override_create(void);
 
+extern ROCKSDB_LIBRARY_API rocksdb_compaction_service_options_override_t*
+rocksdb_compaction_service_options_override_create_from_options(
+    rocksdb_options_t* option);
+
 extern ROCKSDB_LIBRARY_API void
 rocksdb_compaction_service_options_override_destroy(
     rocksdb_compaction_service_options_override_t* override_options);
@@ -3645,6 +3680,71 @@ rocksdb_compaction_service_options_override_set_comparator(
     rocksdb_compaction_service_options_override_t* override_options,
     rocksdb_comparator_t* comparator);
 
+extern ROCKSDB_LIBRARY_API void
+rocksdb_compaction_service_options_override_set_merge_operator(
+    rocksdb_compaction_service_options_override_t* override_options,
+    rocksdb_mergeoperator_t* merge_operator);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_compaction_service_options_override_set_compaction_filter(
+    rocksdb_compaction_service_options_override_t* override_options,
+    rocksdb_compactionfilter_t* compaction_filter);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_compaction_service_options_override_set_compaction_filter_factory(
+    rocksdb_compaction_service_options_override_t* override_options,
+    rocksdb_compactionfilterfactory_t* compaction_filter_factory);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_compaction_service_options_override_set_prefix_extractor(
+    rocksdb_compaction_service_options_override_t* override_options,
+    rocksdb_slicetransform_t* prefix_extractor);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_compaction_service_options_override_set_block_based_table_factory(
+    rocksdb_compaction_service_options_override_t* override_options,
+    rocksdb_block_based_table_options_t* table_options);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_compaction_service_options_override_set_cuckoo_table_factory(
+    rocksdb_compaction_service_options_override_t* override_options,
+    rocksdb_cuckoo_table_options_t* table_options);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_compaction_service_options_override_add_event_listener(
+    rocksdb_compaction_service_options_override_t* override_options,
+    rocksdb_eventlistener_t* event_listener);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_compaction_service_options_override_set_statistics(
+    rocksdb_compaction_service_options_override_t* override_options,
+    rocksdb_options_t* options);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_compaction_service_options_override_set_info_log(
+    rocksdb_compaction_service_options_override_t* override_options,
+    rocksdb_logger_t* logger);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_compaction_service_options_override_set_option(
+    rocksdb_compaction_service_options_override_t* override_options,
+    const char* key, const char* value);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_compaction_service_options_override_set_file_checksum_gen_factory(
+    rocksdb_compaction_service_options_override_t* override_options,
+    rocksdb_file_checksum_gen_factory_t* factory);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_compaction_service_options_override_set_sst_partitioner_factory(
+    rocksdb_compaction_service_options_override_t* override_options,
+    rocksdb_sst_partitioner_factory_t* factory);
+
+extern ROCKSDB_LIBRARY_API void
+rocksdb_compaction_service_options_override_add_table_properties_collector_factory(
+    rocksdb_compaction_service_options_override_t* override_options,
+    rocksdb_table_properties_collector_factory_t* factory);
+
 // Atomic bool management for cancellation
 // Creates an atomic bool that can be used for cancellation.
 // User must call rocksdb_open_and_compact_canceled_destroy() to free it.

From 276721cd106d87cf5eebe432d77362a50e90ba58 Mon Sep 17 00:00:00 2001
From: zaidoon <zaidoon@cloudflare.com>
Date: Mon, 29 Dec 2025 10:32:36 -0800
Subject: [PATCH 408/500] eliminate per-iterator heap allocation by
 constructing InternalKeyComparator in-place (#14044)

Summary:
resolve [13951](https://github.com/facebook/rocksdb/issues/13951)

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14044

Reviewed By: xingbowang

Differential Revision: D86217603

Pulled By: jaykorean

fbshipit-source-id: 8ed62503cfcfdfb26f7af7b0a5641cd47dd9e54c
---
 table/block_based/block.cc |  4 ++--
 table/block_based/block.h  | 19 +++++++++----------
 2 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/table/block_based/block.cc b/table/block_based/block.cc
index d2a5d8d70a17..7b531f959879 100644
--- a/table/block_based/block.cc
+++ b/table/block_based/block.cc
@@ -440,8 +440,8 @@ bool DataBlockIter::SeekForGetImpl(const Slice& target) {
     return true;
   }
 
-  if (icmp_->user_comparator()->Compare(raw_key_.GetUserKey(),
-                                        target_user_key) != 0) {
+  if (icmp_.user_comparator()->Compare(raw_key_.GetUserKey(),
+                                       target_user_key) != 0) {
     // the key is not in this block and cannot be at the next block either.
     return false;
   }
diff --git a/table/block_based/block.h b/table/block_based/block.h
index 7d7011d40571..afd0d302ce76 100644
--- a/table/block_based/block.h
+++ b/table/block_based/block.h
@@ -426,7 +426,7 @@ class BlockIter : public InternalIteratorBase<TValue> {
   Cache::Handle* cache_handle() { return cache_handle_; }
 
  protected:
-  std::unique_ptr<InternalKeyComparator> icmp_;
+  InternalKeyComparator icmp_;
   const char* data_;       // underlying block contents
   uint32_t num_restarts_;  // Number of uint32_t entries in restart array
 
@@ -528,17 +528,15 @@ class BlockIter : public InternalIteratorBase<TValue> {
                       uint32_t block_restart_interval) {
     assert(data_ == nullptr);  // Ensure it is called only once
     assert(num_restarts > 0);  // Ensure the param is valid
-
-    icmp_ = std::make_unique<InternalKeyComparator>(raw_ucmp);
+    assert(raw_ucmp != nullptr);
+    icmp_ = InternalKeyComparator(raw_ucmp);
     data_ = data;
     restarts_ = restarts;
     num_restarts_ = num_restarts;
     current_ = restarts_;
     restart_index_ = num_restarts_;
     global_seqno_ = global_seqno;
-    if (raw_ucmp != nullptr) {
-      ts_sz_ = raw_ucmp->timestamp_size();
-    }
+    ts_sz_ = raw_ucmp->timestamp_size();
     pad_min_timestamp_ = ts_sz_ > 0 && !user_defined_timestamp_persisted;
     block_contents_pinned_ = block_contents_pinned;
     cache_handle_ = nullptr;
@@ -622,14 +620,15 @@ class BlockIter : public InternalIteratorBase<TValue> {
   // comparator is used for the block contents, the LHS argument is the current
   // key with global seqno applied, and the RHS argument is `other`.
   int CompareCurrentKey(const Slice& other) {
+    assert(icmp_.user_comparator() != nullptr);
     if (raw_key_.IsUserKey()) {
       assert(global_seqno_ == kDisableGlobalSequenceNumber);
-      return icmp_->user_comparator()->Compare(raw_key_.GetUserKey(), other);
+      return icmp_.user_comparator()->Compare(raw_key_.GetUserKey(), other);
     } else if (global_seqno_ == kDisableGlobalSequenceNumber) {
-      return icmp_->Compare(raw_key_.GetInternalKey(), other);
+      return icmp_.Compare(raw_key_.GetInternalKey(), other);
     }
-    return icmp_->Compare(raw_key_.GetInternalKey(), global_seqno_, other,
-                          kDisableGlobalSequenceNumber);
+    return icmp_.Compare(raw_key_.GetInternalKey(), global_seqno_, other,
+                         kDisableGlobalSequenceNumber);
   }
 
  private:

From 3818cc1acad4e7fbbfb9a1b2eb284bdb4f75d3ac Mon Sep 17 00:00:00 2001
From: Xingbo Wang <xbw@meta.com>
Date: Mon, 29 Dec 2025 10:53:58 -0800
Subject: [PATCH 409/500] Fix a bug in seqno zeroing logic with UDT (#14207)

Summary:
This bug caused seqno to be incorrectly zeroed when UDT is enabled. This is one of the contributing factor that caused tombstones to be accumulated at bottommost level, causing high space amp.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14207

Test Plan: Unit test

Reviewed By: pdillinger

Differential Revision: D89826564

Pulled By: xingbowang

fbshipit-source-id: 62ab1e37c36ae1ed95f26213c97a591a17e962a6
---
 db/compaction/compaction_iterator.cc          |  18 +-
 db/db_with_timestamp_compaction_test.cc       | 201 ++++++++++++++++++
 .../bug_fixes/udt_seqno_zero_bug_fix.md       |   1 +
 3 files changed, 215 insertions(+), 5 deletions(-)
 create mode 100644 unreleased_history/bug_fixes/udt_seqno_zero_bug_fix.md

diff --git a/db/compaction/compaction_iterator.cc b/db/compaction/compaction_iterator.cc
index 81d6266bdf61..e76490225c26 100644
--- a/db/compaction/compaction_iterator.cc
+++ b/db/compaction/compaction_iterator.cc
@@ -1310,14 +1310,14 @@ void CompactionIterator::PrepareOutput() {
             validity_info_.rep);
         assert(false);
       }
-      ikey_.sequence = 0;
-      last_key_seq_zeroed_ = true;
-      TEST_SYNC_POINT_CALLBACK("CompactionIterator::PrepareOutput:ZeroingSeq",
-                               &ikey_);
+
+      bool zeroed_seqno = false;
       if (!timestamp_size_) {
         current_key_.UpdateInternalKey(0, ikey_.type);
+        zeroed_seqno = true;
       } else if (full_history_ts_low_ && cmp_with_history_ts_low_ < 0) {
-        // We can also zero out timestamp for better compression.
+        // For UDT, the seqno and timestamp could only be zeroed out after the
+        // key is below history_ts_low_.
         // For the same user key (excluding timestamp), the timestamp-based
         // history can be collapsed to save some space if the timestamp is
         // older than *full_history_ts_low_.
@@ -1325,6 +1325,14 @@ void CompactionIterator::PrepareOutput() {
         const Slice ts_slice = kTsMin;
         ikey_.SetTimestamp(ts_slice);
         current_key_.UpdateInternalKey(0, ikey_.type, &ts_slice);
+        zeroed_seqno = true;
+      }
+
+      if (zeroed_seqno) {
+        ikey_.sequence = 0;
+        last_key_seq_zeroed_ = true;
+        TEST_SYNC_POINT_CALLBACK("CompactionIterator::PrepareOutput:ZeroingSeq",
+                                 &ikey_);
       }
     }
   }
diff --git a/db/db_with_timestamp_compaction_test.cc b/db/db_with_timestamp_compaction_test.cc
index 783140cbf7d9..8348316b6c90 100644
--- a/db/db_with_timestamp_compaction_test.cc
+++ b/db/db_with_timestamp_compaction_test.cc
@@ -7,6 +7,8 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
+#include <set>
+
 #include "db/compaction/compaction.h"
 #include "db/db_test_util.h"
 #include "port/stack_trace.h"
@@ -344,6 +346,205 @@ TEST_F(TimestampCompatibleCompactionTest, EmptyCompactionOutput) {
   ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
 }
 
+TEST_F(TimestampCompatibleCompactionTest, SeqnoZeroingWithUDT) {
+  // This test validates that seqno is only zeroed when the timestamp is older
+  // than full_history_ts_low_. Before the fix, seqno was incorrectly zeroed
+  // even when UDT was enabled but timestamp wasn't old enough.
+
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.comparator = test::BytewiseComparatorWithU64TsWrapper();
+  options.disable_auto_compactions = true;
+  DestroyAndReopen(options);
+
+  // Track seqno zeroing events and which keys are zeroed
+  std::set<std::string> zeroed_keys;
+  SyncPoint::GetInstance()->SetCallBack(
+      "CompactionIterator::PrepareOutput:ZeroingSeq", [&](void* arg) {
+        auto* ikey = static_cast<ParsedInternalKey*>(arg);
+        ASSERT_EQ(0, ikey->sequence);
+        // Extract user key without timestamp (last 8 bytes)
+        Slice user_key_with_ts = ikey->user_key;
+        std::string user_key =
+            user_key_with_ts.ToString().substr(0, user_key_with_ts.size() - 8);
+        zeroed_keys.insert(user_key);
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  // Case 1: Test that seqno is NOT zeroed when full_history_ts_low is not set
+  // Write a key with timestamp 100
+  std::string ts_str = Timestamp(100);
+  ASSERT_OK(db_->Put(WriteOptions(), "key1", ts_str, "value1"));
+  ASSERT_OK(Flush());
+
+  zeroed_keys.clear();
+  {
+    CompactRangeOptions cro;
+    cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+    ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  }
+  // With UDT enabled and no full_history_ts_low, seqno should NOT be zeroed
+  ASSERT_TRUE(zeroed_keys.empty());
+
+  // Case 2: Test that seqno IS zeroed when timestamp < full_history_ts_low
+  // Write a new key with timestamp 200
+  ts_str = Timestamp(200);
+  ASSERT_OK(db_->Put(WriteOptions(), "key2", ts_str, "value2"));
+  ASSERT_OK(Flush());
+
+  zeroed_keys.clear();
+  {
+    // Set full_history_ts_low to 300, so ts < 300 should be zeroed
+    std::string full_history_ts_low = Timestamp(300);
+    Slice ts_slice = full_history_ts_low;
+    CompactRangeOptions cro;
+    cro.full_history_ts_low = &ts_slice;
+    cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+    ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  }
+  // key1 (ts=100) and key2 (ts=200) both have ts < 300, so both should be
+  // zeroed
+  ASSERT_EQ(2u, zeroed_keys.size());
+  ASSERT_TRUE(zeroed_keys.count("key1") > 0);
+  ASSERT_TRUE(zeroed_keys.count("key2") > 0);
+
+  // Case 3: Write a new key with timestamp >= full_history_ts_low
+  // and verify it is NOT zeroed while old keys are re-zeroed
+  ts_str = Timestamp(500);
+  ASSERT_OK(db_->Put(WriteOptions(), "key3", ts_str, "value3"));
+  ASSERT_OK(Flush());
+
+  zeroed_keys.clear();
+  {
+    // Set full_history_ts_low to 400
+    // key1 (ts=100) and key2 (ts=200) have ts < 400, will be re-processed
+    // key3 (ts=500) has ts >= 400, should NOT be zeroed
+    std::string full_history_ts_low = Timestamp(400);
+    Slice ts_slice = full_history_ts_low;
+    CompactRangeOptions cro;
+    cro.full_history_ts_low = &ts_slice;
+    cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+    ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  }
+  // key3 should NOT appear in zeroed_keys since ts=500 >= 400
+  ASSERT_TRUE(zeroed_keys.count("key3") == 0);
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  // Verify data is still readable
+  std::string value;
+  ts_str = Timestamp(600);
+  Slice read_ts = ts_str;
+  ReadOptions read_opts;
+  read_opts.timestamp = &read_ts;
+  ASSERT_OK(db_->Get(read_opts, "key1", &value));
+  ASSERT_EQ("value1", value);
+  ASSERT_OK(db_->Get(read_opts, "key2", &value));
+  ASSERT_EQ("value2", value);
+  ASSERT_OK(db_->Get(read_opts, "key3", &value));
+  ASSERT_EQ("value3", value);
+}
+
+TEST_F(TimestampCompatibleCompactionTest, UdtTombstoneCollapsingTest) {
+  // This test validate tombstones accumulated at bottommost level due to UDT is
+  // cleaned up properly, avoiding high space amplification.
+
+  // Create a new column family with UDT enabled
+  Options options = GetDefaultOptions();
+  ColumnFamilyHandle* cfh = nullptr;
+  options = GetDefaultOptions();
+  options.compaction_style = kCompactionStyleLevel;
+  options.num_levels = 7;
+  options.level0_file_num_compaction_trigger = 10;
+  options.persist_user_defined_timestamps = true;
+  options.comparator = BytewiseComparatorWithU64Ts();
+  options.target_file_size_base = 2 * 1024 * 1024;
+  options.max_bytes_for_level_base = 4 * 1024 * 1024;
+  options.max_bytes_for_level_multiplier = 2;
+
+  ASSERT_OK(db_->CreateColumnFamily(options, "new_cf", &cfh));
+
+  std::string ts_buf;
+  uint64_t timestamp = 1000;
+  constexpr auto kBatchSize = 1000;
+  constexpr auto kTotalRecords = 100000;
+
+  int record_count = 0;
+  auto kValueSize = 1024;
+
+  Random rnd(0);
+  while (record_count < kTotalRecords) {
+    // Create rows with timestamp
+    for (int i = 0; i < kBatchSize; i++) {
+      timestamp = 1000 + record_count + i;
+      ts_buf = "";
+      PutFixed64(&ts_buf, timestamp);
+      Slice ts(ts_buf);
+      // generate a random value, so that they are not easily compressable
+      auto value = rnd.RandomString(kValueSize);
+      ASSERT_OK(
+          db_->Put(WriteOptions(), cfh, Key(record_count + i), ts, value));
+    }
+    ASSERT_OK(db_->Flush(FlushOptions(), cfh));
+
+    // Create a snapshot for read, then release it, so that
+    // oldest_snapshot_seqnum_ is advanced periodically
+    auto snapshot = db_->GetSnapshot();
+    ReadOptions read_options;
+    std::string read_ts_buf = "";
+    timestamp = 1000 + record_count + kBatchSize;
+    PutFixed64(&read_ts_buf, timestamp);
+    Slice read_ts(read_ts_buf);
+    read_options.timestamp = &read_ts;
+    read_options.snapshot = snapshot;
+    std::string value;
+    ASSERT_OK(db_->Get(read_options, cfh, Key(record_count), &value, &ts_buf));
+    db_->ReleaseSnapshot(snapshot);
+
+    // Delete all of the rows created
+    for (int i = 0; i < kBatchSize; i++) {
+      timestamp = 2000 + record_count + i;
+      ts_buf = "";
+      PutFixed64(&ts_buf, timestamp);
+      Slice ts(ts_buf);
+      ASSERT_OK(db_->Delete(WriteOptions(), cfh, Key(record_count + i), ts));
+    }
+    ASSERT_OK(db_->Flush(FlushOptions(), cfh));
+    record_count += kBatchSize;
+
+    // Advance full_history_ts_low with some delay periodically
+    timestamp = 1000 + record_count - kBatchSize;
+    ts_buf = "";
+    PutFixed64(&ts_buf, timestamp);
+    ASSERT_OK(db_->IncreaseFullHistoryTsLow(cfh, ts_buf));
+
+    constexpr bool debug = false;
+    if (debug) {
+      // Print stats from time to time
+      if (record_count % (kTotalRecords / 10) == 0) {
+        std::string cf_stats;
+        ASSERT_TRUE(db_->GetProperty(cfh, "rocksdb.cfstats-no-file-histogram",
+                                     &cf_stats));
+        printf("%s\n", cf_stats.c_str());
+        printf("db path %s\n", dbname_.c_str());
+        printf("completed record count %d\n", record_count);
+        printf("completed record percentage %f%%\n",
+               100 * (float)record_count / kTotalRecords);
+      }
+    }
+  }
+
+  // Validate CF size is less than 20% of the total data created to validate the
+  // tombstones has collapsed
+  uint64_t cf_size = 0;
+  ASSERT_TRUE(
+      db_->GetIntProperty(cfh, DB::Properties::kTotalSstFilesSize, &cf_size));
+  ASSERT_LE(cf_size, 0.2 * kTotalRecords * kValueSize);
+
+  delete cfh;
+}
+
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/unreleased_history/bug_fixes/udt_seqno_zero_bug_fix.md b/unreleased_history/bug_fixes/udt_seqno_zero_bug_fix.md
new file mode 100644
index 000000000000..244fed53dcda
--- /dev/null
+++ b/unreleased_history/bug_fixes/udt_seqno_zero_bug_fix.md
@@ -0,0 +1 @@
+Bugfix for persisted UDT record sequence number zeroing logic.

From 1cc1df8dab6a400146188d35aee1b847a0dcc530 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Mon, 29 Dec 2025 17:13:50 -0800
Subject: [PATCH 410/500] Finish migrating HCC to BitFields API (#14154)

Summary:
This change builds on https://github.com/facebook/rocksdb/issues/14027 and https://github.com/facebook/rocksdb/issues/13965 to complete migration
of the HyperClockCache implementation to using the hygienic BitFields API.
No semantic change in the implementation details is intended, just
greatly improving readability and safety of the code while maintaining
the same performance.

In more detail,
* Refactor the main metadata atomic for each slot in an HCC table into
SlotMeta using BitFields.
* Extended BitFields APIs with some additional features, and renamed
  BlahTransform classes to BlahTransformer to resolve potential naming
  conflicts with member functions to create them.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14154

Test Plan:
for correctness, mostly existing tests. but also added tests
for new BitFields features. I especially ran local TSAN whitebox crash
test extensively which caught a couple of refactoring errors.

For performance, I verified with release builds of cache_bench, using
default options, that there was no noticeable/consistent difference
after all these HCC migrations vs. backing them out. That test was with
GCC 11 and -O2, which is a reasonable baseline for expected compiler
optimizations.

Reviewed By: xingbowang

Differential Revision: D87960540

Pulled By: pdillinger

fbshipit-source-id: e0257b7fea8a5c7709daef18911959201ce4e0f3
---
 cache/clock_cache.cc   | 550 +++++++++++++++++++----------------------
 cache/clock_cache.h    | 135 ++++++----
 test_util/sync_point.h |   9 +-
 util/bit_fields.h      |  96 ++++---
 util/slice_test.cc     |  34 +++
 5 files changed, 446 insertions(+), 378 deletions(-)

diff --git a/cache/clock_cache.cc b/cache/clock_cache.cc
index 0ef599857d6a..dbd2a5b8fccb 100644
--- a/cache/clock_cache.cc
+++ b/cache/clock_cache.cc
@@ -38,13 +38,11 @@ namespace ROCKSDB_NAMESPACE {
 namespace clock_cache {
 
 namespace {
-inline uint64_t GetRefcount(uint64_t meta) {
-  return ((meta >> ClockHandle::kAcquireCounterShift) -
-          (meta >> ClockHandle::kReleaseCounterShift)) &
-         ClockHandle::kCounterMask;
-}
+using SlotMeta = ClockHandle::SlotMeta;
+using AcquireCounter = SlotMeta::AcquireCounter;
+using ReleaseCounter = SlotMeta::ReleaseCounter;
 
-inline uint64_t GetInitialCountdown(Cache::Priority priority) {
+inline uint32_t GetInitialCountdown(Cache::Priority priority) {
   // Set initial clock data from priority
   // TODO: configuration parameters for priority handling and clock cycle
   // count?
@@ -65,11 +63,11 @@ inline uint64_t GetInitialCountdown(Cache::Priority priority) {
 inline void MarkEmpty(ClockHandle& h) {
 #ifndef NDEBUG
   // Mark slot as empty, with assertion
-  uint64_t meta = h.meta.Exchange(0);
-  assert(meta >> ClockHandle::kStateShift == ClockHandle::kStateConstruction);
+  auto old_meta = h.meta.Exchange({});
+  assert(old_meta.IsUnderConstruction());
 #else
   // Mark slot as empty
-  h.meta.Store(0);
+  h.meta.Store({});
 #endif
 }
 
@@ -85,18 +83,20 @@ inline void FreeDataMarkEmpty(ClockHandle& h, MemoryAllocator* allocator) {
 
 // Called to undo the effect of referencing an entry for internal purposes,
 // so it should not be marked as having been used.
-inline void Unref(const ClockHandle& h, uint64_t count = 1) {
+inline void Unref(const ClockHandle& h, uint32_t count = 1) {
   // Pretend we never took the reference
   // WART: there's a tiny chance we release last ref to invisible
   // entry here. If that happens, we let eviction take care of it.
-  uint64_t old_meta = h.meta.FetchSub(ClockHandle::kAcquireIncrement * count);
-  assert(GetRefcount(old_meta) != 0);
+  SlotMeta old_meta;
+  h.meta.Apply(AcquireCounter::MinusTransformPromiseNoUnderflow(count),
+               &old_meta);
+  assert(old_meta.GetRefcount() != 0);
   (void)old_meta;
 }
 
 inline bool ClockUpdate(ClockHandle& h, BaseClockTable::EvictionData* data,
                         bool* purgeable = nullptr) {
-  uint64_t meta;
+  SlotMeta meta;
   if (purgeable) {
     assert(*purgeable == false);
     // In AutoHCC, our eviction process follows the chain structure, so we
@@ -110,46 +110,40 @@ inline bool ClockUpdate(ClockHandle& h, BaseClockTable::EvictionData* data,
     meta = h.meta.LoadRelaxed();
   }
 
-  if (((meta >> ClockHandle::kStateShift) & ClockHandle::kStateShareableBit) ==
-      0) {
+  if (!meta.IsShareable()) {
     // Only clock update Shareable entries
     if (purgeable) {
       *purgeable = true;
       // AutoHCC only: make sure we only attempt to update non-empty slots
-      assert((meta >> ClockHandle::kStateShift) &
-             ClockHandle::kStateOccupiedBit);
+      assert(!meta.IsEmpty());
     }
     return false;
   }
-  uint64_t acquire_count =
-      (meta >> ClockHandle::kAcquireCounterShift) & ClockHandle::kCounterMask;
-  uint64_t release_count =
-      (meta >> ClockHandle::kReleaseCounterShift) & ClockHandle::kCounterMask;
+  uint32_t acquire_count = meta.GetAcquireCounter();
+  uint32_t release_count = meta.GetReleaseCounter();
   if (acquire_count != release_count) {
     // Only clock update entries with no outstanding refs
     data->seen_pinned_count++;
     return false;
   }
-  if ((meta >> ClockHandle::kStateShift == ClockHandle::kStateVisible) &&
-      acquire_count > 0) {
+  if (meta.IsVisible() && acquire_count > 0) {
     // Decrement clock
-    uint64_t new_count =
-        std::min(acquire_count - 1, uint64_t{ClockHandle::kMaxCountdown} - 1);
+    uint32_t new_count =
+        std::min(acquire_count - 1, uint32_t{ClockHandle::kMaxCountdown} - 1);
     // Compare-exchange in the decremented clock info, but
     // not aggressively
-    uint64_t new_meta =
-        (uint64_t{ClockHandle::kStateVisible} << ClockHandle::kStateShift) |
-        (meta & ClockHandle::kHitBitMask) |
-        (new_count << ClockHandle::kReleaseCounterShift) |
-        (new_count << ClockHandle::kAcquireCounterShift);
+    SlotMeta new_meta = meta;
+    new_meta.SetReleaseCounter(new_count);
+    new_meta.SetAcquireCounter(new_count);
     h.meta.CasStrongRelaxed(meta, new_meta);
     return false;
   }
   // Otherwise, remove entry (either unreferenced invisible or
   // unreferenced and expired visible).
-  if (h.meta.CasStrong(meta, (uint64_t{ClockHandle::kStateConstruction}
-                              << ClockHandle::kStateShift) |
-                                 (meta & ClockHandle::kHitBitMask))) {
+  SlotMeta construction_meta;
+  construction_meta.SetUnderConstruction();
+  construction_meta.SetHit(meta.GetHit());
+  if (h.meta.CasStrong(meta, construction_meta)) {
     // Took ownership.
     data->freed_charge += h.GetTotalCharge();
     data->freed_count += 1;
@@ -215,39 +209,39 @@ inline bool ClockUpdate(ClockHandle& h, BaseClockTable::EvictionData* data,
 // counter to reach "high" state again and bumped back to "medium." (This
 // motivates only checking for release counter in high state, not both in high
 // state.)
-inline void CorrectNearOverflow(uint64_t old_meta,
-                                AcqRelAtomic<uint64_t>& meta) {
+inline void CorrectNearOverflow(SlotMeta old_meta,
+                                AcqRelBitFieldsAtomic<SlotMeta>& meta) {
   // We clear both top-most counter bits at the same time.
-  constexpr uint64_t kCounterTopBit = uint64_t{1}
-                                      << (ClockHandle::kCounterNumBits - 1);
-  constexpr uint64_t kClearBits =
-      (kCounterTopBit << ClockHandle::kAcquireCounterShift) |
-      (kCounterTopBit << ClockHandle::kReleaseCounterShift);
-  // A simple check that allows us to initiate clearing the top bits for
-  // a large portion of the "high" state space on release counter.
-  constexpr uint64_t kCheckBits =
-      (kCounterTopBit | (ClockHandle::kMaxCountdown + 1))
-      << ClockHandle::kReleaseCounterShift;
+  constexpr uint32_t kCounterTopBit = uint32_t{1}
+                                      << (SlotMeta::kCounterNumBits - 1);
+  // The threshold for correcting "near overflow" is to ensure
+  // (a) the value has a top bit set that can be cleared
+  // (b) when we clear the top bit, the eviction state will be preserved
+  //     (everything >= kMaxCountdown is treated equivalently)
+  // As mentioned above, we only check the release count.
+  constexpr uint32_t kThreshold = kCounterTopBit + ClockHandle::kMaxCountdown;
 
-  if (UNLIKELY(old_meta & kCheckBits)) {
-    meta.FetchAndRelaxed(~kClearBits);
+  if (UNLIKELY(old_meta.GetReleaseCounter() > kThreshold)) {
+    auto clear_transform = AcquireCounter::AndTransform(kCounterTopBit - 1) +
+                           ReleaseCounter::AndTransform(kCounterTopBit - 1);
+    meta.ApplyRelaxed(clear_transform);
   }
 }
 
 inline bool BeginSlotInsert(const ClockHandleBasicData& proto, ClockHandle& h,
-                            uint64_t initial_countdown, bool* already_matches) {
+                            uint32_t initial_countdown, bool* already_matches) {
   assert(*already_matches == false);
   // Optimistically transition the slot from "empty" to
   // "under construction" (no effect on other states)
-  uint64_t old_meta = h.meta.FetchOr(uint64_t{ClockHandle::kStateOccupiedBit}
-                                     << ClockHandle::kStateShift);
-  uint64_t old_state = old_meta >> ClockHandle::kStateShift;
+  auto set_occupied = SlotMeta::OccupiedFlag::SetTransform();
+  SlotMeta old_meta;
+  h.meta.Apply(set_occupied, &old_meta);
 
-  if (old_state == ClockHandle::kStateEmpty) {
+  if (old_meta.IsEmpty()) {
     // We've started inserting into an available slot, and taken
     // ownership.
     return true;
-  } else if (old_state != ClockHandle::kStateVisible) {
+  } else if (!old_meta.IsVisible()) {
     // Slot not usable / touchable now
     return false;
   }
@@ -255,15 +249,17 @@ inline bool BeginSlotInsert(const ClockHandleBasicData& proto, ClockHandle& h,
   // But first, we need to acquire a ref to read it. In fact, number of
   // refs for initial countdown, so that we boost the clock state if
   // this is a match.
-  old_meta =
-      h.meta.FetchAdd(ClockHandle::kAcquireIncrement * initial_countdown);
+  auto add_acquire =
+      AcquireCounter::PlusTransformPromiseNoOverflow(initial_countdown);
+  h.meta.Apply(add_acquire, &old_meta);
   // Like Lookup
-  if ((old_meta >> ClockHandle::kStateShift) == ClockHandle::kStateVisible) {
+  if (old_meta.IsVisible()) {
     // Acquired a read reference
     if (h.hashed_key == proto.hashed_key) {
       // Match. Release in a way that boosts the clock state
-      old_meta =
-          h.meta.FetchAdd(ClockHandle::kReleaseIncrement * initial_countdown);
+      auto add_release =
+          ReleaseCounter::PlusTransformPromiseNoOverflow(initial_countdown);
+      h.meta.Apply(add_release, &old_meta);
       // Correct for possible (but rare) overflow
       CorrectNearOverflow(old_meta, h.meta);
       // Insert detached instead (only if return handle needed)
@@ -273,8 +269,7 @@ inline bool BeginSlotInsert(const ClockHandleBasicData& proto, ClockHandle& h,
       // Mismatch.
       Unref(h, initial_countdown);
     }
-  } else if (UNLIKELY((old_meta >> ClockHandle::kStateShift) ==
-                      ClockHandle::kStateInvisible)) {
+  } else if (UNLIKELY(old_meta.IsInvisible())) {
     // Pretend we never took the reference
     Unref(h, initial_countdown);
   } else {
@@ -286,25 +281,23 @@ inline bool BeginSlotInsert(const ClockHandleBasicData& proto, ClockHandle& h,
 }
 
 inline void FinishSlotInsert(const ClockHandleBasicData& proto, ClockHandle& h,
-                             uint64_t initial_countdown, bool keep_ref) {
+                             uint32_t initial_countdown, bool keep_ref) {
   // Save data fields
   ClockHandleBasicData* h_alias = &h;
   *h_alias = proto;
 
   // Transition from "under construction" state to "visible" state
-  uint64_t new_meta = uint64_t{ClockHandle::kStateVisible}
-                      << ClockHandle::kStateShift;
+  SlotMeta new_meta;
+  new_meta.SetVisible();
 
   // Maybe with an outstanding reference
-  new_meta |= initial_countdown << ClockHandle::kAcquireCounterShift;
-  new_meta |= (initial_countdown - keep_ref)
-              << ClockHandle::kReleaseCounterShift;
+  new_meta.SetAcquireCounter(initial_countdown);
+  new_meta.SetReleaseCounter(initial_countdown - (keep_ref ? 1 : 0));
 
 #ifndef NDEBUG
   // Save the state transition, with assertion
-  uint64_t old_meta = h.meta.Exchange(new_meta);
-  assert(old_meta >> ClockHandle::kStateShift ==
-         ClockHandle::kStateConstruction);
+  auto old_meta = h.meta.Exchange(new_meta);
+  assert(old_meta.IsUnderConstruction());
 #else
   // Save the state transition
   h.meta.Store(new_meta);
@@ -312,7 +305,7 @@ inline void FinishSlotInsert(const ClockHandleBasicData& proto, ClockHandle& h,
 }
 
 bool TryInsert(const ClockHandleBasicData& proto, ClockHandle& h,
-               uint64_t initial_countdown, bool keep_ref,
+               uint32_t initial_countdown, bool keep_ref,
                bool* already_matches) {
   bool b = BeginSlotInsert(proto, h, initial_countdown, already_matches);
   if (b) {
@@ -326,35 +319,32 @@ template <class HandleImpl, class Func>
 void ConstApplyToEntriesRange(const Func& func, const HandleImpl* begin,
                               const HandleImpl* end,
                               bool apply_if_will_be_deleted) {
-  uint64_t check_state_mask = ClockHandle::kStateShareableBit;
-  if (!apply_if_will_be_deleted) {
-    check_state_mask |= ClockHandle::kStateVisibleBit;
-  }
-
   for (const HandleImpl* h = begin; h < end; ++h) {
     // Note: to avoid using compare_exchange, we have to be extra careful.
-    uint64_t old_meta = h->meta.LoadRelaxed();
+    SlotMeta old_meta = h->meta.LoadRelaxed();
     // Check if it's an entry visible to lookups
-    if ((old_meta >> ClockHandle::kStateShift) & check_state_mask) {
-      // Increment acquire counter. Note: it's possible that the entry has
-      // completely changed since we loaded old_meta, but incrementing acquire
-      // count is always safe. (Similar to optimistic Lookup here.)
-      old_meta = h->meta.FetchAdd(ClockHandle::kAcquireIncrement);
-      // Check whether we actually acquired a reference.
-      if ((old_meta >> ClockHandle::kStateShift) &
-          ClockHandle::kStateShareableBit) {
-        // Apply func if appropriate
-        if ((old_meta >> ClockHandle::kStateShift) & check_state_mask) {
-          func(*h);
+    if (apply_if_will_be_deleted || old_meta.IsVisible()) {
+      if (old_meta.IsShareable()) {
+        // Increment acquire counter. Note: it's possible that the entry has
+        // completely changed since we loaded old_meta, but incrementing acquire
+        // count is always safe. (Similar to optimistic Lookup here.)
+        auto add_acquire = AcquireCounter::PlusTransformPromiseNoOverflow(1);
+        h->meta.Apply(add_acquire, &old_meta);
+        // Check whether we actually acquired a reference.
+        if (old_meta.IsShareable()) {
+          // Apply func if appropriate
+          if (apply_if_will_be_deleted || old_meta.IsVisible()) {
+            func(*h);
+          }
+          // Pretend we never took the reference
+          Unref(*h);
+          // No net change, so don't need to check for overflow
+        } else {
+          // For other states, incrementing the acquire counter has no effect
+          // so we don't need to undo it. Furthermore, we cannot safely undo
+          // it because we did not acquire a read reference to lock the
+          // entry in a Shareable state.
         }
-        // Pretend we never took the reference
-        Unref(*h);
-        // No net change, so don't need to check for overflow
-      } else {
-        // For other states, incrementing the acquire counter has no effect
-        // so we don't need to undo it. Furthermore, we cannot safely undo
-        // it because we did not acquire a read reference to lock the
-        // entry in a Shareable state.
       }
     }
   }
@@ -399,9 +389,9 @@ HandleImpl* BaseClockTable::StandaloneInsert(
   h->SetStandalone();
   // Single reference (standalone entries only created if returning a refed
   // Handle back to user)
-  uint64_t meta = uint64_t{ClockHandle::kStateInvisible}
-                  << ClockHandle::kStateShift;
-  meta |= uint64_t{1} << ClockHandle::kAcquireCounterShift;
+  SlotMeta meta;
+  meta.SetInvisible();
+  meta.SetAcquireCounter(1);
   h->meta.Store(meta);
   // Keep track of how much of usage is standalone
   standalone_usage_.FetchAddRelaxed(proto.GetTotalCharge());
@@ -564,11 +554,10 @@ void BaseClockTable::TrackAndReleaseEvictedEntry(ClockHandle* h) {
   if (eviction_callback_) {
     // For key reconstructed from hash
     UniqueId64x2 unhashed;
-    took_value_ownership =
-        eviction_callback_(ClockCacheShard<FixedHyperClockTable>::ReverseHash(
-                               h->GetHash(), &unhashed, hash_seed_),
-                           static_cast<Cache::Handle*>(h),
-                           h->meta.LoadRelaxed() & ClockHandle::kHitBitMask);
+    took_value_ownership = eviction_callback_(
+        ClockCacheShard<FixedHyperClockTable>::ReverseHash(
+            h->GetHash(), &unhashed, hash_seed_),
+        static_cast<Cache::Handle*>(h), h->meta.LoadRelaxed().GetHit());
   }
   if (!took_value_ownership) {
     h->FreeData(allocator_);
@@ -648,7 +637,7 @@ Status BaseClockTable::Insert(const ClockHandleBasicData& proto,
     // * Have to insert into a suboptimal location (more probes) so that the
     // old entry can be kept around as well.
 
-    uint64_t initial_countdown = GetInitialCountdown(priority);
+    uint32_t initial_countdown = GetInitialCountdown(priority);
     assert(initial_countdown > 0);
 
     HandleImpl* e =
@@ -693,34 +682,34 @@ Status BaseClockTable::Insert(const ClockHandleBasicData& proto,
 
 void BaseClockTable::Ref(ClockHandle& h) {
   // Increment acquire counter
-  uint64_t old_meta = h.meta.FetchAdd(ClockHandle::kAcquireIncrement);
+  SlotMeta old_meta;
+  h.meta.Apply(AcquireCounter::PlusTransformPromiseNoOverflow(1), &old_meta);
 
-  assert((old_meta >> ClockHandle::kStateShift) &
-         ClockHandle::kStateShareableBit);
+  assert(old_meta.IsShareable());
   // Must have already had a reference
-  assert(GetRefcount(old_meta) > 0);
+  assert(old_meta.GetRefcount() > 0);
   (void)old_meta;
 }
 
 #ifndef NDEBUG
-void BaseClockTable::TEST_RefN(ClockHandle& h, size_t n) {
+void BaseClockTable::TEST_RefN(ClockHandle& h, uint32_t n) {
   // Increment acquire counter
-  uint64_t old_meta = h.meta.FetchAdd(n * ClockHandle::kAcquireIncrement);
+  SlotMeta old_meta;
+  h.meta.Apply(AcquireCounter::PlusTransformPromiseNoOverflow(n), &old_meta);
 
-  assert((old_meta >> ClockHandle::kStateShift) &
-         ClockHandle::kStateShareableBit);
+  assert(old_meta.IsShareable());
   (void)old_meta;
 }
 
-void BaseClockTable::TEST_ReleaseNMinus1(ClockHandle* h, size_t n) {
+void BaseClockTable::TEST_ReleaseNMinus1(ClockHandle* h, uint32_t n) {
   assert(n > 0);
 
   // Like n-1 Releases, but assumes one more will happen in the caller to take
   // care of anything like erasing an unreferenced, invisible entry.
-  uint64_t old_meta =
-      h->meta.FetchAdd((n - 1) * ClockHandle::kReleaseIncrement);
-  assert((old_meta >> ClockHandle::kStateShift) &
-         ClockHandle::kStateShareableBit);
+  SlotMeta old_meta;
+  h->meta.Apply(ReleaseCounter::PlusTransformPromiseNoOverflow(n - 1),
+                &old_meta);
+  assert(old_meta.IsShareable());
   (void)old_meta;
 }
 #endif
@@ -754,23 +743,20 @@ FixedHyperClockTable::~FixedHyperClockTable() {
   // in the table.
   for (size_t i = 0; i < GetTableSize(); i++) {
     HandleImpl& h = array_[i];
-    switch (h.meta.LoadRelaxed() >> ClockHandle::kStateShift) {
-      case ClockHandle::kStateEmpty:
-        // noop
-        break;
-      case ClockHandle::kStateInvisible:  // rare but possible
-      case ClockHandle::kStateVisible:
-        assert(GetRefcount(h.meta.LoadRelaxed()) == 0);
-        h.FreeData(allocator_);
+    SlotMeta meta = h.meta.LoadRelaxed();
+    if (meta.IsShareable()) {
+      // NOTE: Reaching here invisible is rare but possible
+      assert(meta.GetRefcount() == 0);
+      h.FreeData(allocator_);
 #ifndef NDEBUG
-        Rollback(h.hashed_key, &h);
-        ReclaimEntryUsage(h.GetTotalCharge());
+      Rollback(h.hashed_key, &h);
+      ReclaimEntryUsage(h.GetTotalCharge());
 #endif
-        break;
-      // otherwise
-      default:
-        assert(false);
-        break;
+    } else {
+      // Should be no transient "under construction" states unless a thread
+      // was killed or we are being destructed while another thread is still
+      // operating on the structure
+      assert(meta.IsEmpty());
     }
   }
 
@@ -792,7 +778,7 @@ bool FixedHyperClockTable::GrowIfNeeded(size_t new_occupancy, InsertState&) {
 }
 
 FixedHyperClockTable::HandleImpl* FixedHyperClockTable::DoInsert(
-    const ClockHandleBasicData& proto, uint64_t initial_countdown,
+    const ClockHandleBasicData& proto, uint32_t initial_countdown,
     bool keep_ref, InsertState&) {
   bool already_matches = false;
   HandleImpl* e = FindSlot(
@@ -843,47 +829,46 @@ FixedHyperClockTable::HandleImpl* FixedHyperClockTable::Lookup(
   HandleImpl* e = FindSlot(
       hashed_key,
       [&](HandleImpl* h) {
+        SlotMeta old_meta;
         // Mostly branch-free version (similar performance)
         /*
-        uint64_t old_meta = h->meta.FetchAdd(ClockHandle::kAcquireIncrement,
-                                     std::memory_order_acquire);
-        bool Shareable = (old_meta >> (ClockHandle::kStateShift + 1)) & 1U;
-        bool visible = (old_meta >> ClockHandle::kStateShift) & 1U;
-        bool match = (h->key == key) & visible;
-        h->meta.FetchSub(static_cast<uint64_t>(Shareable & !match) <<
-        ClockHandle::kAcquireCounterShift); return
-        match;
+        h->meta.Apply(AcquireCounter::PlusTransformPromiseNoOverflow(1),
+                      &old_meta);
+        bool shareable = old_meta.IsShareable();
+        bool visible = old_meta.IsVisible();
+        bool match = (h->hashed_key == hashed_key) & visible;
+        h->meta.Apply(AcquireCounter::MinusTransformPromiseNoUnderflow(
+            uint32_t{shareable} & uint32_t{!match}));
+        h->meta.Apply(SlotMeta::HitFlag::Or(match));
+        return match;
         */
         // Optimistic lookup should pay off when the table is relatively
         // sparse.
         constexpr bool kOptimisticLookup = true;
-        uint64_t old_meta;
         if (!kOptimisticLookup) {
           old_meta = h->meta.Load();
-          if ((old_meta >> ClockHandle::kStateShift) !=
-              ClockHandle::kStateVisible) {
+          if (!old_meta.IsVisible()) {
             return false;
           }
         }
         // (Optimistically) increment acquire counter
-        old_meta = h->meta.FetchAdd(ClockHandle::kAcquireIncrement);
+        h->meta.Apply(AcquireCounter::PlusTransformPromiseNoOverflow(1),
+                      &old_meta);
         // Check if it's an entry visible to lookups
-        if ((old_meta >> ClockHandle::kStateShift) ==
-            ClockHandle::kStateVisible) {
+        if (old_meta.IsVisible()) {
           // Acquired a read reference
           if (h->hashed_key == hashed_key) {
             // Match
             // Update the hit bit
             if (eviction_callback_) {
-              h->meta.FetchOrRelaxed(uint64_t{1} << ClockHandle::kHitBitShift);
+              h->meta.ApplyRelaxed(SlotMeta::HitFlag::SetTransform());
             }
             return true;
           } else {
             // Mismatch. Pretend we never took the reference
             Unref(*h);
           }
-        } else if (UNLIKELY((old_meta >> ClockHandle::kStateShift) ==
-                            ClockHandle::kStateInvisible)) {
+        } else if (UNLIKELY(old_meta.IsInvisible())) {
           // Pretend we never took the reference
           Unref(*h);
         } else {
@@ -907,53 +892,49 @@ bool FixedHyperClockTable::Release(HandleImpl* h, bool useful,
   // is only freed up by EvictFromClock (called by Insert when space is needed)
   // and Erase. We do this to avoid an extra atomic read of the variable usage_.
 
-  uint64_t old_meta;
+  SlotMeta old_meta;
   if (useful) {
     // Increment release counter to indicate was used
-    old_meta = h->meta.FetchAdd(ClockHandle::kReleaseIncrement);
+    auto add_release = ReleaseCounter::PlusTransformPromiseNoOverflow(1);
+    h->meta.Apply(add_release, &old_meta);
   } else {
     // Decrement acquire counter to pretend it never happened
-    old_meta = h->meta.FetchSub(ClockHandle::kAcquireIncrement);
+    auto sub_acquire = AcquireCounter::MinusTransformPromiseNoUnderflow(1);
+    h->meta.Apply(sub_acquire, &old_meta);
   }
 
-  assert((old_meta >> ClockHandle::kStateShift) &
-         ClockHandle::kStateShareableBit);
+  assert(old_meta.IsShareable());
   // No underflow
-  assert(((old_meta >> ClockHandle::kAcquireCounterShift) &
-          ClockHandle::kCounterMask) !=
-         ((old_meta >> ClockHandle::kReleaseCounterShift) &
-          ClockHandle::kCounterMask));
+  assert(old_meta.GetAcquireCounter() != old_meta.GetReleaseCounter());
 
-  if (erase_if_last_ref || UNLIKELY(old_meta >> ClockHandle::kStateShift ==
-                                    ClockHandle::kStateInvisible)) {
+  if (erase_if_last_ref || UNLIKELY(old_meta.IsInvisible())) {
     // FIXME: There's a chance here that another thread could replace this
     // entry and we end up erasing the wrong one.
 
-    // Update for last FetchAdd op
+    // Update for last Apply op
     if (useful) {
-      old_meta += ClockHandle::kReleaseIncrement;
+      old_meta.SetReleaseCounter(old_meta.GetReleaseCounter() + 1);
     } else {
-      old_meta -= ClockHandle::kAcquireIncrement;
+      old_meta.SetAcquireCounter(old_meta.GetAcquireCounter() - 1);
     }
     // Take ownership if no refs
+    SlotMeta construction_meta;
+    construction_meta.SetUnderConstruction();
     do {
-      if (GetRefcount(old_meta) != 0) {
+      if (old_meta.GetRefcount() != 0) {
         // Not last ref at some point in time during this Release call
         // Correct for possible (but rare) overflow
         CorrectNearOverflow(old_meta, h->meta);
         return false;
       }
-      if ((old_meta & (uint64_t{ClockHandle::kStateShareableBit}
-                       << ClockHandle::kStateShift)) == 0) {
+      if (!old_meta.IsShareable()) {
         // Someone else took ownership
         return false;
       }
       // Note that there's a small chance that we release, another thread
       // replaces this entry with another, reaches zero refs, and then we end
       // up erasing that other entry. That's an acceptable risk / imprecision.
-    } while (
-        !h->meta.CasWeak(old_meta, uint64_t{ClockHandle::kStateConstruction}
-                                       << ClockHandle::kStateShift));
+    } while (!h->meta.CasWeak(old_meta, construction_meta));
     // Took ownership
     size_t total_charge = h->GetTotalCharge();
     if (UNLIKELY(h->IsStandalone())) {
@@ -976,7 +957,7 @@ bool FixedHyperClockTable::Release(HandleImpl* h, bool useful,
 }
 
 #ifndef NDEBUG
-void FixedHyperClockTable::TEST_ReleaseN(HandleImpl* h, size_t n) {
+void FixedHyperClockTable::TEST_ReleaseN(HandleImpl* h, uint32_t n) {
   if (n > 0) {
     // Do n-1 simple releases first
     TEST_ReleaseNMinus1(h, n);
@@ -993,30 +974,29 @@ void FixedHyperClockTable::Erase(const UniqueId64x2& hashed_key) {
       [&](HandleImpl* h) {
         // Could be multiple entries in rare cases. Erase them all.
         // Optimistically increment acquire counter
-        uint64_t old_meta = h->meta.FetchAdd(ClockHandle::kAcquireIncrement);
+        auto add_acquire = AcquireCounter::PlusTransformPromiseNoOverflow(1);
+        SlotMeta old_meta, meta;
+        h->meta.Apply(add_acquire, &old_meta, &meta);
         // Check if it's an entry visible to lookups
-        if ((old_meta >> ClockHandle::kStateShift) ==
-            ClockHandle::kStateVisible) {
+        if (meta.IsVisible()) {
           // Acquired a read reference
           if (h->hashed_key == hashed_key) {
-            // Match. Set invisible.
-            old_meta =
-                h->meta.FetchAnd(~(uint64_t{ClockHandle::kStateVisibleBit}
-                                   << ClockHandle::kStateShift));
-            // Apply update to local copy
-            old_meta &= ~(uint64_t{ClockHandle::kStateVisibleBit}
-                          << ClockHandle::kStateShift);
+            // Match. Take ownership if no other refs, or set invisible other
+            // refs exist.
             for (;;) {
-              uint64_t refcount = GetRefcount(old_meta);
+              uint32_t refcount = meta.GetRefcount();
               assert(refcount > 0);
               if (refcount > 1) {
                 // Not last ref at some point in time during this Erase call
-                // Pretend we never took the reference
+                // Set invisible
+                h->meta.Apply(SlotMeta::VisibleFlag::ClearTransform());
+                // And pretend we never took the reference
                 Unref(*h);
                 break;
-              } else if (h->meta.CasWeak(
-                             old_meta, uint64_t{ClockHandle::kStateConstruction}
-                                           << ClockHandle::kStateShift)) {
+              }
+              SlotMeta construction_meta;
+              construction_meta.SetUnderConstruction();
+              if (h->meta.CasWeak(meta, construction_meta)) {
                 // Took ownership
                 assert(hashed_key == h->hashed_key);
                 size_t total_charge = h->GetTotalCharge();
@@ -1032,8 +1012,7 @@ void FixedHyperClockTable::Erase(const UniqueId64x2& hashed_key) {
             // Mismatch. Pretend we never took the reference
             Unref(*h);
           }
-        } else if (UNLIKELY((old_meta >> ClockHandle::kStateShift) ==
-                            ClockHandle::kStateInvisible)) {
+        } else if (UNLIKELY(old_meta.IsInvisible())) {
           // Pretend we never took the reference
           Unref(*h);
         } else {
@@ -1050,17 +1029,17 @@ void FixedHyperClockTable::EraseUnRefEntries() {
   for (size_t i = 0; i <= this->length_bits_mask_; i++) {
     HandleImpl& h = array_[i];
 
-    uint64_t old_meta = h.meta.LoadRelaxed();
-    if (old_meta & (uint64_t{ClockHandle::kStateShareableBit}
-                    << ClockHandle::kStateShift) &&
-        GetRefcount(old_meta) == 0 &&
-        h.meta.CasStrong(old_meta, uint64_t{ClockHandle::kStateConstruction}
-                                       << ClockHandle::kStateShift)) {
-      // Took ownership
-      size_t total_charge = h.GetTotalCharge();
-      Rollback(h.hashed_key, &h);
-      FreeDataMarkEmpty(h, allocator_);
-      ReclaimEntryUsage(total_charge);
+    SlotMeta old_meta = h.meta.LoadRelaxed();
+    if (old_meta.IsShareable() && old_meta.GetRefcount() == 0) {
+      SlotMeta construction_meta;
+      construction_meta.SetUnderConstruction();
+      if (h.meta.CasStrong(old_meta, construction_meta)) {
+        // Took ownership
+        size_t total_charge = h.GetTotalCharge();
+        Rollback(h.hashed_key, &h);
+        FreeDataMarkEmpty(h, allocator_);
+        ReclaimEntryUsage(total_charge);
+      }
     }
   }
 }
@@ -1320,12 +1299,12 @@ bool ClockCacheShard<Table>::Release(HandleImpl* handle, bool useful,
 
 #ifndef NDEBUG
 template <class Table>
-void ClockCacheShard<Table>::TEST_RefN(HandleImpl* h, size_t n) {
+void ClockCacheShard<Table>::TEST_RefN(HandleImpl* h, uint32_t n) {
   table_.TEST_RefN(*h, n);
 }
 
 template <class Table>
-void ClockCacheShard<Table>::TEST_ReleaseN(HandleImpl* h, size_t n) {
+void ClockCacheShard<Table>::TEST_ReleaseN(HandleImpl* h, uint32_t n) {
   table_.TEST_ReleaseN(h, n);
 }
 #endif
@@ -1373,8 +1352,8 @@ size_t ClockCacheShard<Table>::GetPinnedUsage() const {
       metadata_charge_policy_ == kFullChargeCacheMetadata;
   ConstApplyToEntriesRange(
       [&table_pinned_usage, charge_metadata](const HandleImpl& h) {
-        uint64_t meta = h.meta.LoadRelaxed();
-        uint64_t refcount = GetRefcount(meta);
+        SlotMeta meta = h.meta.LoadRelaxed();
+        uint32_t refcount = meta.GetRefcount();
         // Holding one ref for ConstApplyToEntriesRange
         assert(refcount > 0);
         if (refcount > 1) {
@@ -1494,7 +1473,7 @@ void AddShardEvaluation(const FixedHyperClockCache::Shard& shard,
 }
 
 bool IsSlotOccupied(const ClockHandle& h) {
-  return (h.meta.LoadRelaxed() >> ClockHandle::kStateShift) != 0;
+  return !h.meta.LoadRelaxed().IsEmpty();
 }
 }  // namespace
 
@@ -1759,12 +1738,12 @@ inline bool MatchAndRef(const UniqueId64x2* hashed_key, const ClockHandle& h,
   // Must be at least something to match
   assert(hashed_key || shift > 0);
 
-  uint64_t old_meta;
+  SlotMeta old_meta, new_meta;
   // (Optimistically) increment acquire counter.
-  old_meta = h.meta.FetchAdd(ClockHandle::kAcquireIncrement);
+  auto add_acquire = AcquireCounter::PlusTransformPromiseNoOverflow(1);
+  h.meta.Apply(add_acquire, &old_meta, &new_meta);
   // Check if it's a referencable (sharable) entry
-  if ((old_meta & (uint64_t{ClockHandle::kStateShareableBit}
-                   << ClockHandle::kStateShift)) == 0) {
+  if (!old_meta.IsShareable()) {
     // For non-sharable states, incrementing the acquire counter has no effect
     // so we don't need to undo it. Furthermore, we cannot safely undo
     // it because we did not acquire a read reference to lock the
@@ -1775,10 +1754,9 @@ inline bool MatchAndRef(const UniqueId64x2* hashed_key, const ClockHandle& h,
     return false;
   }
   // Else acquired a read reference
-  assert(GetRefcount(old_meta + ClockHandle::kAcquireIncrement) > 0);
+  assert(new_meta.GetRefcount() > 0);
   if (hashed_key && h.hashed_key == *hashed_key &&
-      LIKELY(old_meta & (uint64_t{ClockHandle::kStateVisibleBit}
-                         << ClockHandle::kStateShift))) {
+      LIKELY(old_meta.IsVisible())) {
     // Match on full key, visible
     if (full_match_or_unknown) {
       *full_match_or_unknown = true;
@@ -2051,7 +2029,7 @@ AutoHyperClockTable::~AutoHyperClockTable() {
            HandleImpl::kUnusedMarker);
     assert(array_[i].chain_next_with_shift.LoadRelaxed() ==
            HandleImpl::kUnusedMarker);
-    assert(array_[i].meta.LoadRelaxed() == 0);
+    assert(array_[i].meta.LoadRelaxed() == SlotMeta{});
   }
 #endif          // MUST_FREE_HEAP_ALLOCATIONS
 #ifndef NDEBUG  // Extra invariant checking
@@ -2060,30 +2038,27 @@ AutoHyperClockTable::~AutoHyperClockTable() {
 #endif  // !NDEBUG
   for (size_t i = 0; i < used_end; i++) {
     HandleImpl& h = array_[i];
-    switch (h.meta.LoadRelaxed() >> ClockHandle::kStateShift) {
-      case ClockHandle::kStateEmpty:
-        // noop
-        break;
-      case ClockHandle::kStateInvisible:  // rare but possible
-      case ClockHandle::kStateVisible:
-        assert(GetRefcount(h.meta.LoadRelaxed()) == 0);
-        h.FreeData(allocator_);
+    SlotMeta meta = h.meta.LoadRelaxed();
+    if (meta.IsShareable()) {
+      // NOTE: Reaching here invisible is rare but possible
+      assert(meta.GetRefcount() == 0);
+      h.FreeData(allocator_);
 #ifndef NDEBUG  // Extra invariant checking
-        usage_.FetchSubRelaxed(h.total_charge);
-        occupancy_.FetchSubRelaxed(1U);
-        was_populated[i] = true;
-        if (!h.chain_next_with_shift.LoadRelaxed().IsEnd()) {
-          assert(!h.chain_next_with_shift.LoadRelaxed().IsLocked());
-          size_t next = h.chain_next_with_shift.LoadRelaxed().GetNext();
-          assert(!was_pointed_to[next]);
-          was_pointed_to[next] = true;
-        }
+      usage_.FetchSubRelaxed(h.total_charge);
+      occupancy_.FetchSubRelaxed(1U);
+      was_populated[i] = true;
+      if (!h.chain_next_with_shift.LoadRelaxed().IsEnd()) {
+        assert(!h.chain_next_with_shift.LoadRelaxed().IsLocked());
+        size_t next = h.chain_next_with_shift.LoadRelaxed().GetNext();
+        assert(!was_pointed_to[next]);
+        was_pointed_to[next] = true;
+      }
 #endif  // !NDEBUG
-        break;
-      // otherwise
-      default:
-        assert(false);
-        break;
+    } else {
+      // Should be no transient "under construction" states unless a thread
+      // was killed or we are being destructed while another thread is still
+      // operating on the structure
+      assert(meta.IsEmpty());
     }
 #ifndef NDEBUG  // Extra invariant checking
     if (!h.head_next_with_shift.LoadRelaxed().IsEnd()) {
@@ -2691,20 +2666,17 @@ void AutoHyperClockTable::PurgeImplLocked(OpData* op_data,
           op_data->push_back(h);
           // Entries for eviction become purgeable
           purgeable = true;
-          assert((h->meta.Load() >> ClockHandle::kStateShift) ==
-                 ClockHandle::kStateConstruction);
+          assert(h->meta.Load().IsUnderConstruction());
         }
       } else {
         (void)op_data;
         (void)data;
-        purgeable = ((h->meta.Load() >> ClockHandle::kStateShift) &
-                     ClockHandle::kStateShareableBit) == 0;
+        purgeable = !h->meta.Load().IsShareable();
       }
     }
 
     if (purgeable) {
-      assert((h->meta.Load() >> ClockHandle::kStateShift) ==
-             ClockHandle::kStateConstruction);
+      assert(h->meta.Load().IsUnderConstruction());
       pending_purge = true;
     } else if (pending_purge) {
       if (prev_to_keep) {
@@ -2864,7 +2836,7 @@ void AutoHyperClockTable::PurgeImpl(OpData* op_data, size_t home,
 }
 
 AutoHyperClockTable::HandleImpl* AutoHyperClockTable::DoInsert(
-    const ClockHandleBasicData& proto, uint64_t initial_countdown,
+    const ClockHandleBasicData& proto, uint32_t initial_countdown,
     bool take_ref, InsertState& state) {
   size_t home;
   int orig_home_shift;
@@ -3149,14 +3121,14 @@ AutoHyperClockTable::HandleImpl* AutoHyperClockTable::Lookup(
 #endif
     if (probably_equal) {
       // Increment acquire counter for definitive check
-      uint64_t old_meta = h->meta.FetchAdd(ClockHandle::kAcquireIncrement);
+      auto add_acquire = AcquireCounter::PlusTransformPromiseNoOverflow(1);
+      SlotMeta old_meta, new_meta;
+      h->meta.Apply(add_acquire, &old_meta, &new_meta);
       // Check if it's a referencable (sharable) entry
-      if (LIKELY(old_meta & (uint64_t{ClockHandle::kStateShareableBit}
-                             << ClockHandle::kStateShift))) {
-        assert(GetRefcount(old_meta + ClockHandle::kAcquireIncrement) > 0);
+      if (LIKELY(old_meta.IsShareable())) {
+        assert(new_meta.GetRefcount() > 0);
         if (LIKELY(h->hashed_key == hashed_key) &&
-            LIKELY(old_meta & (uint64_t{ClockHandle::kStateVisibleBit}
-                               << ClockHandle::kStateShift))) {
+            LIKELY(old_meta.IsVisible())) {
           return h;
         } else {
           Unref(*h);
@@ -3277,7 +3249,7 @@ AutoHyperClockTable::HandleImpl* AutoHyperClockTable::Lookup(
         }
         // Update the hit bit
         if (eviction_callback_) {
-          h->meta.FetchOrRelaxed(uint64_t{1} << ClockHandle::kHitBitShift);
+          h->meta.ApplyRelaxed(SlotMeta::HitFlag::SetTransform());
         }
         // All done.
         return h;
@@ -3317,8 +3289,7 @@ AutoHyperClockTable::HandleImpl* AutoHyperClockTable::Lookup(
 }
 
 void AutoHyperClockTable::Remove(HandleImpl* h) {
-  assert((h->meta.Load() >> ClockHandle::kStateShift) ==
-         ClockHandle::kStateConstruction);
+  assert(h->meta.Load().IsUnderConstruction());
 
   const HandleImpl& c_h = *h;
   PurgeImpl(&c_h.hashed_key);
@@ -3326,26 +3297,23 @@ void AutoHyperClockTable::Remove(HandleImpl* h) {
 
 bool AutoHyperClockTable::TryEraseHandle(HandleImpl* h, bool holding_ref,
                                          bool mark_invisible) {
-  uint64_t meta;
-  if (mark_invisible) {
-    // Set invisible
-    meta = h->meta.FetchAnd(
-        ~(uint64_t{ClockHandle::kStateVisibleBit} << ClockHandle::kStateShift));
-    // To local variable also
-    meta &=
-        ~(uint64_t{ClockHandle::kStateVisibleBit} << ClockHandle::kStateShift);
-  } else {
-    meta = h->meta.Load();
-  }
+  SlotMeta meta = h->meta.Load();
+  assert(!holding_ref || meta.IsShareable());
 
-  // Take ownership if no other refs
+  // Take ownership if no other refs, or set invisible if other refs exist (and
+  // mark_invisible is set).
+  SlotMeta construction_meta;
+  construction_meta.SetUnderConstruction();
   do {
-    if (GetRefcount(meta) != uint64_t{holding_ref}) {
+    if (meta.GetRefcount() != uint32_t{holding_ref}) {
       // Not last ref at some point in time during this call
+      if (mark_invisible) {
+        // Set invisible
+        h->meta.Apply(SlotMeta::VisibleFlag::ClearTransform());
+      }
       return false;
     }
-    if ((meta & (uint64_t{ClockHandle::kStateShareableBit}
-                 << ClockHandle::kStateShift)) == 0) {
+    if (!meta.IsShareable()) {
       // Someone else took ownership
       return false;
     }
@@ -3353,8 +3321,7 @@ bool AutoHyperClockTable::TryEraseHandle(HandleImpl* h, bool holding_ref,
     // another thread replaces this entry with another, reaches zero refs, and
     // then we end up erasing that other entry. That's an acceptable risk /
     // imprecision.
-  } while (!h->meta.CasWeak(meta, uint64_t{ClockHandle::kStateConstruction}
-                                      << ClockHandle::kStateShift));
+  } while (!h->meta.CasWeak(meta, construction_meta));
   // Took ownership
   // TODO? Delay freeing?
   h->FreeData(allocator_);
@@ -3381,27 +3348,24 @@ bool AutoHyperClockTable::Release(HandleImpl* h, bool useful,
   // is needed) and Erase. We do this to avoid an extra atomic read of the
   // variable usage_.
 
-  uint64_t old_meta;
+  SlotMeta old_meta;
   if (useful) {
     // Increment release counter to indicate was used
-    old_meta = h->meta.FetchAdd(ClockHandle::kReleaseIncrement);
+    auto add_release = ReleaseCounter::PlusTransformPromiseNoOverflow(1);
+    h->meta.Apply(add_release, &old_meta);
     // Correct for possible (but rare) overflow
     CorrectNearOverflow(old_meta, h->meta);
   } else {
     // Decrement acquire counter to pretend it never happened
-    old_meta = h->meta.FetchSub(ClockHandle::kAcquireIncrement);
+    auto sub_acquire = AcquireCounter::MinusTransformPromiseNoUnderflow(1);
+    h->meta.Apply(sub_acquire, &old_meta);
   }
 
-  assert((old_meta >> ClockHandle::kStateShift) &
-         ClockHandle::kStateShareableBit);
+  assert(old_meta.IsShareable());
   // No underflow
-  assert(((old_meta >> ClockHandle::kAcquireCounterShift) &
-          ClockHandle::kCounterMask) !=
-         ((old_meta >> ClockHandle::kReleaseCounterShift) &
-          ClockHandle::kCounterMask));
+  assert(old_meta.GetAcquireCounter() != old_meta.GetReleaseCounter());
 
-  if ((erase_if_last_ref || UNLIKELY(old_meta >> ClockHandle::kStateShift ==
-                                     ClockHandle::kStateInvisible))) {
+  if ((erase_if_last_ref || UNLIKELY(old_meta.IsInvisible()))) {
     // FIXME: There's a chance here that another thread could replace this
     // entry and we end up erasing the wrong one.
     return TryEraseHandle(h, /*holding_ref=*/false, /*mark_invisible=*/false);
@@ -3411,7 +3375,7 @@ bool AutoHyperClockTable::Release(HandleImpl* h, bool useful,
 }
 
 #ifndef NDEBUG
-void AutoHyperClockTable::TEST_ReleaseN(HandleImpl* h, size_t n) {
+void AutoHyperClockTable::TEST_ReleaseN(HandleImpl* h, uint32_t n) {
   if (n > 0) {
     // Do n-1 simple releases first
     TEST_ReleaseNMinus1(h, n);
@@ -3441,20 +3405,20 @@ void AutoHyperClockTable::EraseUnRefEntries() {
   for (size_t i = 0; i < usable_size; i++) {
     HandleImpl& h = array_[i];
 
-    uint64_t old_meta = h.meta.LoadRelaxed();
-    if (old_meta & (uint64_t{ClockHandle::kStateShareableBit}
-                    << ClockHandle::kStateShift) &&
-        GetRefcount(old_meta) == 0 &&
-        h.meta.CasStrong(old_meta, uint64_t{ClockHandle::kStateConstruction}
-                                       << ClockHandle::kStateShift)) {
-      // Took ownership
-      h.FreeData(allocator_);
-      usage_.FetchSubRelaxed(h.total_charge);
-      // NOTE: could be more efficient with a dedicated variant of
-      // PurgeImpl, but this is not a common operation
-      Remove(&h);
-      MarkEmpty(h);
-      occupancy_.FetchSub(1U);
+    SlotMeta old_meta = h.meta.LoadRelaxed();
+    if (old_meta.IsShareable() && old_meta.GetRefcount() == 0) {
+      SlotMeta construction_meta;
+      construction_meta.SetUnderConstruction();
+      if (h.meta.CasStrong(old_meta, construction_meta)) {
+        // Took ownership
+        h.FreeData(allocator_);
+        usage_.FetchSubRelaxed(h.total_charge);
+        // NOTE: could be more efficient with a dedicated variant of
+        // PurgeImpl, but this is not a common operation
+        Remove(&h);
+        MarkEmpty(h);
+        occupancy_.FetchSub(1U);
+      }
     }
   }
 }
diff --git a/cache/clock_cache.h b/cache/clock_cache.h
index f2d6c7fe6c58..cdee0e93e5a7 100644
--- a/cache/clock_cache.h
+++ b/cache/clock_cache.h
@@ -317,40 +317,89 @@ struct ClockHandle : public ClockHandleBasicData {
   // | acquire counter      | release counter     | hit bit | state marker |
   // -----------------------------------------------------------------------
 
-  // For reading or updating counters in meta word.
-  static constexpr uint8_t kCounterNumBits = 30;
-  static constexpr uint64_t kCounterMask = (uint64_t{1} << kCounterNumBits) - 1;
-
-  static constexpr uint8_t kAcquireCounterShift = 0;
-  static constexpr uint64_t kAcquireIncrement = uint64_t{1}
-                                                << kAcquireCounterShift;
-  static constexpr uint8_t kReleaseCounterShift = kCounterNumBits;
-  static constexpr uint64_t kReleaseIncrement = uint64_t{1}
-                                                << kReleaseCounterShift;
-
-  // For setting the hit bit
-  static constexpr uint8_t kHitBitShift = 2U * kCounterNumBits;
-  static constexpr uint64_t kHitBitMask = uint64_t{1} << kHitBitShift;
-
-  // For reading or updating the state marker in meta word
-  static constexpr uint8_t kStateShift = kHitBitShift + 1;
-
-  // Bits contribution to state marker.
-  // Occupied means any state other than empty
-  static constexpr uint8_t kStateOccupiedBit = 0b100;
-  // Shareable means the entry is reference counted (visible or invisible)
-  // (only set if also occupied)
-  static constexpr uint8_t kStateShareableBit = 0b010;
-  // Visible is only set if also shareable
-  static constexpr uint8_t kStateVisibleBit = 0b001;
-
-  // Complete state markers (not shifted into full word)
-  static constexpr uint8_t kStateEmpty = 0b000;
-  static constexpr uint8_t kStateConstruction = kStateOccupiedBit;
-  static constexpr uint8_t kStateInvisible =
-      kStateOccupiedBit | kStateShareableBit;
-  static constexpr uint8_t kStateVisible =
-      kStateOccupiedBit | kStateShareableBit | kStateVisibleBit;
+  struct SlotMeta : public BitFields<uint64_t, SlotMeta> {
+    // For reading or updating counters in meta word.
+    static constexpr uint8_t kCounterNumBits = 30;
+    // Number of times the a reference has been acquired (or attempted)
+    // since last reset by eviction processing
+    using AcquireCounter =
+        UnsignedBitField<SlotMeta, kCounterNumBits, NoPrevBitField>;
+    // Number of times the a reference has been released (or attempted)
+    // since last reset by eviction processing
+    using ReleaseCounter =
+        UnsignedBitField<SlotMeta, kCounterNumBits, AcquireCounter>;
+    // Metadata bit in support of secondary cache
+    using HitFlag = BoolBitField<SlotMeta, ReleaseCounter>;
+    // Occupied means any state other than empty
+    using OccupiedFlag = BoolBitField<SlotMeta, HitFlag>;
+    // Shareable means the entry is reference counted (visible or invisible)
+    // (only set if also occupied)
+    using ShareableFlag = BoolBitField<SlotMeta, OccupiedFlag>;
+    // Visible is only set if also shareable (invisible can't be found by
+    // Lookup)
+    using VisibleFlag = BoolBitField<SlotMeta, ShareableFlag>;
+
+    // Convenience functions
+    uint32_t GetAcquireCounter() const { return Get<AcquireCounter>(); }
+    void SetAcquireCounter(uint32_t val) { Set<AcquireCounter>(val); }
+    uint32_t GetReleaseCounter() const { return Get<ReleaseCounter>(); }
+    void SetReleaseCounter(uint32_t val) { Set<ReleaseCounter>(val); }
+    uint32_t GetRefcount() const {
+      return Get<AcquireCounter>() - Get<ReleaseCounter>();
+    }
+    bool GetHit() const { return Get<HitFlag>(); }
+    void SetHit(bool val) { Set<HitFlag>(val); }
+
+    // Some distinct states for the various state flags
+    bool IsEmpty() const {
+      bool rv = !Get<OccupiedFlag>();
+      if (rv) {
+        assert(!Get<ShareableFlag>());
+        assert(!Get<VisibleFlag>());
+      }
+      return rv;
+    }
+
+    bool IsUnderConstruction() const {
+      bool rv = Get<OccupiedFlag>() && !Get<ShareableFlag>();
+      if (rv) {
+        assert(!Get<VisibleFlag>());
+      }
+      return rv;
+    }
+    void SetUnderConstruction() {
+      Set<OccupiedFlag>(true);
+      Set<ShareableFlag>(false);
+      Set<VisibleFlag>(false);
+    }
+
+    bool IsShareable() const { return Get<ShareableFlag>(); }
+    bool IsInvisible() const {
+      bool rv = Get<ShareableFlag>() && !Get<VisibleFlag>();
+      if (rv) {
+        assert(Get<OccupiedFlag>());
+      }
+      return rv;
+    }
+    void SetInvisible() {
+      Set<OccupiedFlag>(true);
+      Set<ShareableFlag>(true);
+      Set<VisibleFlag>(false);
+    }
+
+    bool IsVisible() const {
+      bool rv = Get<ShareableFlag>() && Get<VisibleFlag>();
+      if (rv) {
+        assert(Get<OccupiedFlag>());
+      }
+      return rv;
+    }
+    void SetVisible() {
+      Set<OccupiedFlag>(true);
+      Set<ShareableFlag>(true);
+      Set<VisibleFlag>(true);
+    }
+  };
 
   // Constants for initializing the countdown clock. (Countdown clock is only
   // in effect with zero refs, acquire counter == release counter, and in that
@@ -364,7 +413,7 @@ struct ClockHandle : public ClockHandleBasicData {
   // TODO: make these coundown values tuning parameters for eviction?
 
   // See above. Mutable for read reference counting.
-  mutable AcqRelAtomic<uint64_t> meta{};
+  mutable AcqRelBitFieldsAtomic<SlotMeta> meta{};
 };  // struct ClockHandle
 
 class BaseClockTable {
@@ -431,9 +480,9 @@ class BaseClockTable {
   bool IsEvictionEffortExceeded(const BaseClockTable::EvictionData& data) const;
 #ifndef NDEBUG
   // Acquire N references
-  void TEST_RefN(ClockHandle& handle, size_t n);
+  void TEST_RefN(ClockHandle& handle, uint32_t n);
   // Helper for TEST_ReleaseN
-  void TEST_ReleaseNMinus1(ClockHandle* handle, size_t n);
+  void TEST_ReleaseNMinus1(ClockHandle* handle, uint32_t n);
 #endif
 
  private:  // fns
@@ -586,7 +635,7 @@ class FixedHyperClockTable : public BaseClockTable {
   bool GrowIfNeeded(size_t new_occupancy, InsertState& state);
 
   HandleImpl* DoInsert(const ClockHandleBasicData& proto,
-                       uint64_t initial_countdown, bool take_ref,
+                       uint32_t initial_countdown, bool take_ref,
                        InsertState& state);
 
   // Runs the clock eviction algorithm trying to reclaim at least
@@ -614,7 +663,7 @@ class FixedHyperClockTable : public BaseClockTable {
   }
 
   // Release N references
-  void TEST_ReleaseN(HandleImpl* handle, size_t n);
+  void TEST_ReleaseN(HandleImpl* handle, uint32_t n);
 #endif
 
   // The load factor p is a real number in (0, 1) such that at all
@@ -897,7 +946,7 @@ class AutoHyperClockTable : public BaseClockTable {
   bool GrowIfNeeded(size_t new_occupancy, InsertState& state);
 
   HandleImpl* DoInsert(const ClockHandleBasicData& proto,
-                       uint64_t initial_countdown, bool take_ref,
+                       uint32_t initial_countdown, bool take_ref,
                        InsertState& state);
 
   // Runs the clock eviction algorithm trying to reclaim at least
@@ -925,7 +974,7 @@ class AutoHyperClockTable : public BaseClockTable {
   }
 
   // Release N references
-  void TEST_ReleaseN(HandleImpl* handle, size_t n);
+  void TEST_ReleaseN(HandleImpl* handle, uint32_t n);
 #endif
 
   // Maximum ratio of number of occupied slots to number of usable slots. The
@@ -1130,8 +1179,8 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShardBase {
     return table_.TEST_MutableOccupancyLimit();
   }
   // Acquire/release N references
-  void TEST_RefN(HandleImpl* handle, size_t n);
-  void TEST_ReleaseN(HandleImpl* handle, size_t n);
+  void TEST_RefN(HandleImpl* handle, uint32_t n);
+  void TEST_ReleaseN(HandleImpl* handle, uint32_t n);
 #endif
 
  private:  // data
diff --git a/test_util/sync_point.h b/test_util/sync_point.h
index 6bfb841926e9..081e90cb1231 100644
--- a/test_util/sync_point.h
+++ b/test_util/sync_point.h
@@ -6,10 +6,9 @@
 
 #include <assert.h>
 
+#include <atomic>
 #include <functional>
-#include <mutex>
 #include <string>
-#include <thread>
 #include <vector>
 
 #include "rocksdb/rocksdb_namespace.h"
@@ -190,6 +189,8 @@ namespace ROCKSDB_NAMESPACE {
 // Intentionally not based on std::exception to reduce places where this
 // would be caught
 struct TestableAssertionFailure {};
+// Tracks whether to throw on testable_assert failure instead of aborting.
+// This is an atomic counter for re-entrancy / thread-safety.
 extern std::atomic<int> g_throw_on_testable_assertion_failure;
 }  // namespace ROCKSDB_NAMESPACE
 #define testable_assert(cond)                                          \
@@ -202,7 +203,7 @@ extern std::atomic<int> g_throw_on_testable_assertion_failure;
     } else {                                                           \
       assert(cond);                                                    \
     }                                                                  \
-  } while (0)
+  } while (0)  // require ; in caller
 #define ASSERT_TESTABLE_FAILURE(expr)                                   \
   do {                                                                  \
     ROCKSDB_NAMESPACE::g_throw_on_testable_assertion_failure.fetch_add( \
@@ -210,5 +211,5 @@ extern std::atomic<int> g_throw_on_testable_assertion_failure;
     ASSERT_THROW(expr, ROCKSDB_NAMESPACE::TestableAssertionFailure);    \
     ROCKSDB_NAMESPACE::g_throw_on_testable_assertion_failure.fetch_sub( \
         1, std::memory_order_relaxed);                                  \
-  } while (0)
+  } while (0)  // require ; in caller
 #endif
diff --git a/util/bit_fields.h b/util/bit_fields.h
index aa49cc0e0b76..1f2eb50b664c 100644
--- a/util/bit_fields.h
+++ b/util/bit_fields.h
@@ -70,7 +70,7 @@ struct BitFields {
 
   // Return a copy with the given field modified
   template <typename BitFieldT>
-  Derived With(typename BitFieldT::V value) const {
+  constexpr Derived With(typename BitFieldT::V value) const {
     static_assert(std::is_same_v<typename BitFieldT::Parent, Derived>);
     Derived rv = static_cast<const Derived&>(*this);
     BitFieldT::SetIn(rv, value);
@@ -125,24 +125,26 @@ struct BitFields {
 // For building atomic updates affecting one or more fields, assuming all the
 // updates are bitwise-or.
 template <typename BitFieldsT>
-struct OrTransform {
+struct OrTransformer {
   using U = typename BitFieldsT::U;
   U to_or = 0;
   // + for general combine
-  OrTransform<BitFieldsT> operator+(OrTransform<BitFieldsT> other) const {
-    return OrTransform<BitFieldsT>{to_or | other.to_or};
+  OrTransformer<BitFieldsT> operator+(
+      const OrTransformer<BitFieldsT>& other) const {
+    return OrTransformer<BitFieldsT>{to_or | other.to_or};
   }
 };
 
 // For building atomic updates affecting one or more fields, assuming all the
 // updates are bitwise-and.
 template <typename BitFieldsT>
-struct AndTransform {
+struct AndTransformer {
   using U = typename BitFieldsT::U;
   U to_and = 0;
   // + for general combine
-  AndTransform<BitFieldsT> operator+(AndTransform<BitFieldsT> other) const {
-    return AndTransform<BitFieldsT>{to_and & other.to_and};
+  AndTransformer<BitFieldsT> operator+(
+      const AndTransformer<BitFieldsT>& other) const {
+    return AndTransformer<BitFieldsT>{to_and & other.to_and};
   }
 };
 
@@ -152,7 +154,7 @@ struct AndTransform {
 // corresponding preconditions. (NOTE that when representing a subtraction, we
 // rely on overflow of the unsigned representation.)
 template <typename BitFieldsT>
-struct AddTransform {
+struct AddTransformer {
   using U = typename BitFieldsT::U;
   U to_add = 0;
 #ifndef NDEBUG
@@ -162,7 +164,7 @@ struct AddTransform {
   };
   std::vector<Precondition> preconditions;
 #endif  // NDEBUG
-  void AssertPreconditions([[maybe_unused]] U from) {
+  void AssertPreconditions([[maybe_unused]] U from) const {
 #ifndef NDEBUG
     for (auto p : preconditions) {
       U tmp = (from & p.mask) + p.piece;
@@ -174,8 +176,9 @@ struct AddTransform {
 #endif  // NDEBUG
   }
   // + for general combine
-  AddTransform<BitFieldsT> operator+(AddTransform<BitFieldsT> other) const {
-    AddTransform<BitFieldsT> rv{to_add + other.to_add};
+  AddTransformer<BitFieldsT> operator+(
+      const AddTransformer<BitFieldsT>& other) const {
+    AddTransformer<BitFieldsT> rv{to_add + other.to_add};
 #ifndef NDEBUG
     rv.preconditions = preconditions;
     rv.preconditions.insert(rv.preconditions.end(), other.preconditions.begin(),
@@ -214,14 +217,18 @@ struct BoolBitField {
     return (bf.underlying & (U{1} << kBitOffset)) != 0;
   }
   static void SetIn(ParentBase& bf, bool value) {
+    // NOTE: avoiding conditional branches is usually best for speed on modern
+    // processors
     bf.underlying =
         (bf.underlying & ~(U{1} << kBitOffset)) | (U{value} << kBitOffset);
   }
-  static OrTransform<BitFieldsT> SetTransform() {
-    return OrTransform<BitFieldsT>{U{1} << kBitOffset};
+  static OrTransformer<BitFieldsT> SetTransform() { return Or(true); }
+  static OrTransformer<BitFieldsT> Or(bool b) {
+    return OrTransformer<BitFieldsT>{U{b} << kBitOffset};
   }
-  static AndTransform<BitFieldsT> ClearTransform() {
-    return AndTransform<BitFieldsT>{~(U{1} << kBitOffset)};
+  static AndTransformer<BitFieldsT> ClearTransform() { return And(false); }
+  static AndTransformer<BitFieldsT> And(bool b) {
+    return AndTransformer<BitFieldsT>{~(U{!b} << kBitOffset)};
   }
 };
 
@@ -258,18 +265,31 @@ struct UnsignedBitField {
     bf.underlying |= static_cast<U>(value & kMask) << kBitOffset;
   }
 
-  // Create a transfor for clearing this field to zero.
-  static AndTransform<BitFieldsT> ClearTransform() {
-    return AndTransform<BitFieldsT>{~(static_cast<U>(kMask) << kBitOffset)};
+  // Create a transform for clearing this field to zero.
+  static AndTransformer<BitFieldsT> ClearTransform() {
+    return AndTransformer<BitFieldsT>{~(static_cast<U>(kMask) << kBitOffset)};
+  }
+
+  // Create a transform for bitwise-and
+  static AndTransformer<BitFieldsT> AndTransform(V value) {
+    assert((value & ~kMask) == 0);
+    return AndTransformer<BitFieldsT>{
+        ~(static_cast<U>(value ^ kMask) << kBitOffset)};
+  }
+
+  // Create a transform for bitwise-or
+  static OrTransformer<BitFieldsT> OrTransform(V value) {
+    assert((value & ~kMask) == 0);
+    return OrTransformer<BitFieldsT>{static_cast<U>(value) << kBitOffset};
   }
 
   // Create a transform for adding a particular value, but with the precondition
   // that adding the value will not overflow the field. This applies for fields
   // that do not include the top bit of the underlying representation. Can be
   // combined with other additive transforms for other fields.
-  static AddTransform<BitFieldsT> PlusTransformPromiseNoOverflow(V value) {
+  static AddTransformer<BitFieldsT> PlusTransformPromiseNoOverflow(V value) {
     static_assert(!kIncludesTopBit);
-    AddTransform<BitFieldsT> rv{static_cast<U>(value) << kBitOffset};
+    AddTransformer<BitFieldsT> rv{static_cast<U>(value) << kBitOffset};
 #ifndef NDEBUG
     rv.preconditions.push_back(
         {static_cast<U>(kMask) << kBitOffset, rv.to_add});
@@ -281,9 +301,9 @@ struct UnsignedBitField {
   // in that field. This applies for fields that include the top bit of the
   // underlying representation. Can be combined with other additive transforms
   // for other fields.
-  static AddTransform<BitFieldsT> PlusTransformIgnoreOverflow(V value) {
+  static AddTransformer<BitFieldsT> PlusTransformIgnoreOverflow(V value) {
     static_assert(kIncludesTopBit);
-    AddTransform<BitFieldsT> rv{static_cast<U>(value) << kBitOffset};
+    AddTransformer<BitFieldsT> rv{static_cast<U>(value) << kBitOffset};
     return rv;
   }
 
@@ -292,9 +312,9 @@ struct UnsignedBitField {
   // applies for fields that do not include the top bit of the underlying
   // representation. Can be combined with other additive transforms for other
   // fields.
-  static AddTransform<BitFieldsT> MinusTransformPromiseNoUnderflow(V value) {
+  static AddTransformer<BitFieldsT> MinusTransformPromiseNoUnderflow(V value) {
     static_assert(!kIncludesTopBit);
-    AddTransform<BitFieldsT> rv{U{0} - (static_cast<U>(value) << kBitOffset)};
+    AddTransformer<BitFieldsT> rv{U{0} - (static_cast<U>(value) << kBitOffset)};
 #ifndef NDEBUG
     rv.preconditions.push_back(
         {static_cast<U>(kMask) << kBitOffset, rv.to_add});
@@ -306,9 +326,9 @@ struct UnsignedBitField {
   // underflow in that field. This applies for fields that include the top bit
   // of the underlying representation. Can be combined with other additive
   // transforms for other fields.
-  static AddTransform<BitFieldsT> MinusTransformIgnoreUnderflow(V value) {
+  static AddTransformer<BitFieldsT> MinusTransformIgnoreUnderflow(V value) {
     static_assert(kIncludesTopBit);
-    AddTransform<BitFieldsT> rv{U{0} - (static_cast<U>(value) << kBitOffset)};
+    AddTransformer<BitFieldsT> rv{U{0} - (static_cast<U>(value) << kBitOffset)};
     return rv;
   }
 };
@@ -347,22 +367,22 @@ class RelaxedBitFieldsAtomic {
     return BitFieldsT{
         v_.exchange(desired.underlying, std::memory_order_relaxed)};
   }
-  void ApplyRelaxed(OrTransform<BitFieldsT> transform,
+  void ApplyRelaxed(const OrTransformer<BitFieldsT>& transform,
                     BitFieldsT* before = nullptr, BitFieldsT* after = nullptr) {
     ApplyImpl<std::memory_order_relaxed>(transform, before, after);
   }
-  void ApplyRelaxed(AndTransform<BitFieldsT> transform,
+  void ApplyRelaxed(const AndTransformer<BitFieldsT>& transform,
                     BitFieldsT* before = nullptr, BitFieldsT* after = nullptr) {
     ApplyImpl<std::memory_order_relaxed>(transform, before, after);
   }
-  void ApplyRelaxed(AddTransform<BitFieldsT> transform,
+  void ApplyRelaxed(const AddTransformer<BitFieldsT>& transform,
                     BitFieldsT* before = nullptr, BitFieldsT* after = nullptr) {
     ApplyImpl<std::memory_order_relaxed>(transform, before, after);
   }
 
  protected:  // fns
   template <std::memory_order kOrder>
-  void ApplyImpl(OrTransform<BitFieldsT> transform,
+  void ApplyImpl(const OrTransformer<BitFieldsT>& transform,
                  BitFieldsT* before = nullptr, BitFieldsT* after = nullptr) {
     U before_val = v_.fetch_or(transform.to_or, kOrder);
     if (before) {
@@ -373,7 +393,7 @@ class RelaxedBitFieldsAtomic {
     }
   }
   template <std::memory_order kOrder>
-  void ApplyImpl(AndTransform<BitFieldsT> transform,
+  void ApplyImpl(const AndTransformer<BitFieldsT>& transform,
                  BitFieldsT* before = nullptr, BitFieldsT* after = nullptr) {
     U before_val = v_.fetch_and(transform.to_and, kOrder);
     if (before) {
@@ -384,7 +404,7 @@ class RelaxedBitFieldsAtomic {
     }
   }
   template <std::memory_order kOrder>
-  void ApplyImpl(AddTransform<BitFieldsT> transform,
+  void ApplyImpl(const AddTransformer<BitFieldsT>& transform,
                  BitFieldsT* before = nullptr, BitFieldsT* after = nullptr) {
     U before_val = v_.fetch_add(transform.to_add, kOrder);
     transform.AssertPreconditions(before_val);
@@ -428,18 +448,18 @@ class AcqRelBitFieldsAtomic : public RelaxedBitFieldsAtomic<BitFieldsT> {
     return BitFieldsT{
         Base::v_.exchange(desired.underlying, std::memory_order_acq_rel)};
   }
-  void Apply(OrTransform<BitFieldsT> transform, BitFieldsT* before = nullptr,
-             BitFieldsT* after = nullptr) {
+  void Apply(const OrTransformer<BitFieldsT>& transform,
+             BitFieldsT* before = nullptr, BitFieldsT* after = nullptr) {
     Base::template ApplyImpl<std::memory_order_acq_rel>(transform, before,
                                                         after);
   }
-  void Apply(AndTransform<BitFieldsT> transform, BitFieldsT* before = nullptr,
-             BitFieldsT* after = nullptr) {
+  void Apply(const AndTransformer<BitFieldsT>& transform,
+             BitFieldsT* before = nullptr, BitFieldsT* after = nullptr) {
     Base::template ApplyImpl<std::memory_order_acq_rel>(transform, before,
                                                         after);
   }
-  void Apply(AddTransform<BitFieldsT> transform, BitFieldsT* before = nullptr,
-             BitFieldsT* after = nullptr) {
+  void Apply(const AddTransformer<BitFieldsT>& transform,
+             BitFieldsT* before = nullptr, BitFieldsT* after = nullptr) {
     Base::template ApplyImpl<std::memory_order_acq_rel>(transform, before,
                                                         after);
   }
diff --git a/util/slice_test.cc b/util/slice_test.cc
index 58de6b1612c8..cbc72891172e 100644
--- a/util/slice_test.cc
+++ b/util/slice_test.cc
@@ -622,6 +622,29 @@ TEST(BitFieldsTest, BitFields) {
     ASSERT_EQ(after.Get<Field3>(), true);
     ASSERT_EQ(state.Get<Field4>(), 3U);
 
+    auto transform2a = Field2::And(true) + Field3::And(false);
+    acqrel.Apply(transform2a, &before, &after);
+    ASSERT_EQ(after.Get<Field2>(), true);
+    ASSERT_EQ(after.Get<Field3>(), false);
+
+    auto transform2b = Field2::And(false) + Field3::And(true);
+    acqrel.Apply(transform2b, &before, &after);
+    ASSERT_EQ(after.Get<Field2>(), false);
+    ASSERT_EQ(after.Get<Field3>(), false);
+
+    auto transform2c = Field2::Or(true) + Field3::Or(false);
+    acqrel.Apply(transform2c, &before, &after);
+    ASSERT_EQ(after.Get<Field2>(), true);
+    ASSERT_EQ(after.Get<Field3>(), false);
+
+    auto transform2d = Field2::Or(false) + Field3::Or(true);
+    acqrel.Apply(transform2d, &before, &after);
+    ASSERT_EQ(after.Get<Field2>(), true);
+    ASSERT_EQ(after.Get<Field3>(), true);
+
+    ASSERT_EQ(state.Get<Field1>(), 45U);
+    ASSERT_EQ(state.Get<Field4>(), 3U);
+
     auto transform3 = Field1::PlusTransformPromiseNoOverflow(10000U) +
                       Field4::MinusTransformPromiseNoUnderflow(3U);
     acqrel.Apply(transform3, &before, &after);
@@ -636,6 +659,17 @@ TEST(BitFieldsTest, BitFields) {
     ASSERT_EQ(after.Get<Field1>(), 9046U);
     ASSERT_EQ(after.Get<Field4>(), 31U);
 
+    auto transform4a =
+        Field1::AndTransform(8192U + 4096U) + Field4::AndTransform(15U);
+    acqrel.Apply(transform4a, &before, &after);
+    ASSERT_EQ(after.Get<Field1>(), 8192U);
+    ASSERT_EQ(after.Get<Field4>(), 15U);
+
+    auto transform4b = Field1::OrTransform(127U) + Field4::OrTransform(16U);
+    acqrel.Apply(transform4b, &before, &after);
+    ASSERT_EQ(after.Get<Field1>(), 8192U + 127U);
+    ASSERT_EQ(after.Get<Field4>(), 31U);
+
     // Unmodified
     ASSERT_EQ(after.Get<Field2>(), true);
     ASSERT_EQ(after.Get<Field3>(), true);

From 8b6f98cdb4fcb2d8eae9666d3ef141695fa52c8a Mon Sep 17 00:00:00 2001
From: Pierre Moulon <pierrem@meta.com>
Date: Fri, 2 Jan 2026 17:25:57 -0800
Subject: [PATCH 411/500] Fixing typos in comments and documentation (#14205)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Summary:
Pull Request resolved: https://github.com/facebook/rocksdb/pull/14205

Fixed various spelling errors throughout RocksDB codebase including:
- assiciated → associated
- disucssion → discussion
- satisifed → satisfied
- supoort → support
- capacit_limit → capacity_limit
- direclty → directly
- diable → disable
- opeartions → operations
- paylaod → payload
- happenning/happended → happening/happened
- intialized/initiallized → initialized
- asynchronosuly → asynchronously
- exisiting → existing
- persitence → persistence
- and several others

These changes are in comments, test code, and documentation only.

Reviewed By: pdillinger

Differential Revision: D89800154

fbshipit-source-id: 1681ec95a687b038c2bad48856f1abb4dbeb42cf
---
 cache/cache_test.cc                           |  2 +-
 cache/lru_cache_test.cc                       | 14 +++++++-------
 db/db_flush_test.cc                           | 14 +++++++-------
 file/file_prefetch_buffer.h                   |  6 +++---
 file/prefetch_test.cc                         | 19 +++++++++----------
 include/rocksdb/advanced_cache.h              |  2 +-
 include/rocksdb/experimental.h                |  6 +++---
 .../block_based/block_based_table_iterator.cc |  6 +++---
 .../block_cache_trace_analyzer_test.cc        |  2 +-
 9 files changed, 35 insertions(+), 36 deletions(-)

diff --git a/cache/cache_test.cc b/cache/cache_test.cc
index 12bcfe6cd437..b762fe4f8af7 100644
--- a/cache/cache_test.cc
+++ b/cache/cache_test.cc
@@ -644,7 +644,7 @@ using TypedHandle = SharedCache::TypedHandle;
 
 TEST_P(CacheTest, SetCapacity) {
   if (IsHyperClock()) {
-    // TODO: update test & code for limited supoort
+    // TODO: update test & code for limited support
     ROCKSDB_GTEST_BYPASS(
         "HyperClockCache doesn't support arbitrary capacity "
         "adjustments.");
diff --git a/cache/lru_cache_test.cc b/cache/lru_cache_test.cc
index 486e595e12b4..efdef44bac0b 100644
--- a/cache/lru_cache_test.cc
+++ b/cache/lru_cache_test.cc
@@ -1503,7 +1503,7 @@ TEST_P(BasicSecondaryCacheTest, FullCapacityTest) {
         /*context*/ this, Cache::Priority::LOW);
     ASSERT_EQ(handle1, nullptr);
 
-    // k1 promotion can fail with strict_capacit_limit=true, but Lookup still
+    // k1 promotion can fail with strict_capacity_limit=true, but Lookup still
     // succeeds using a standalone handle
     handle1 = cache->Lookup(k1.AsSlice(), GetHelper(),
                             /*context*/ this, Cache::Priority::LOW);
@@ -1680,7 +1680,7 @@ TEST_P(DBSecondaryCacheTest, TestSecondaryCacheCorrectness2) {
   // After Flush is successful, RocksDB will do the paranoid check for the new
   // SST file. Meta blocks are always cached in the block cache and they
   // will not be evicted. When block_2 is cache miss and read out, it is
-  // inserted to the block cache. Thefore, block_1 is evicted from block
+  // inserted to the block cache. Therefore, block_1 is evicted from block
   // cache and successfully inserted to the secondary cache. Here are 2
   // lookups in the secondary cache for block_1 and block_2.
   ASSERT_EQ(secondary_cache->num_inserts(), 1u);
@@ -1721,7 +1721,7 @@ TEST_P(DBSecondaryCacheTest, TestSecondaryCacheCorrectness2) {
   v = Get(Key(0));
   ASSERT_EQ(1007, v.size());
   // This Get needs to access block_1, since block_1 is not in block cache
-  // there is one econdary cache lookup. Then, block_1 is cached in the
+  // there is one secondary cache lookup. Then, block_1 is cached in the
   // block cache.
   ASSERT_EQ(secondary_cache->num_inserts(), 2u);
   ASSERT_EQ(secondary_cache->num_lookups(), 5u);
@@ -1785,7 +1785,7 @@ TEST_P(DBSecondaryCacheTest, NoSecondaryCacheInsertion) {
   std::string v = Get(Key(0));
   ASSERT_EQ(1000, v.size());
   // Since the block cache is large enough, all the blocks are cached. we
-  // do not need to lookup the seondary cache.
+  // do not need to lookup the secondary cache.
   ASSERT_EQ(secondary_cache->num_inserts(), 0u);
   ASSERT_EQ(secondary_cache->num_lookups(), 2u);
 
@@ -2150,7 +2150,7 @@ TEST_P(DBSecondaryCacheTest, LRUCacheDumpLoadBasic) {
   ASSERT_OK(Flush());
   Compact("a", "z");
 
-  // do th eread for all the key value pairs, so all the blocks should be in
+  // do the read for all the key value pairs, so all the blocks should be in
   // cache
   uint32_t start_insert = cache->GetInsertCount();
   uint32_t start_lookup = cache->GetLookupcount();
@@ -2464,7 +2464,7 @@ TEST_P(DBSecondaryCacheTest, TestSecondaryCacheOptionBasic) {
   std::string v = Get(Key(0));
   ASSERT_EQ(1007, v.size());
 
-  // Check the data in first block. Cache miss, direclty read from SST file.
+  // Check the data in first block. Cache miss, directly read from SST file.
   ASSERT_EQ(secondary_cache->num_inserts(), 0u);
   ASSERT_EQ(secondary_cache->num_lookups(), 0u);
 
@@ -2598,7 +2598,7 @@ TEST_P(DBSecondaryCacheTest, TestSecondaryCacheOptionChange) {
 }
 
 // Two DB test. We create 2 DBs sharing the same block cache and secondary
-// cache. We diable the secondary cache option for DB2.
+// cache. We disable the secondary cache option for DB2.
 TEST_P(DBSecondaryCacheTest, TestSecondaryCacheOptionTwoDB) {
   if (IsHyperClock()) {
     ROCKSDB_GTEST_BYPASS("Test depends on LRUCache-specific behaviors");
diff --git a/db/db_flush_test.cc b/db/db_flush_test.cc
index 21f88d795171..61f9b5757acc 100644
--- a/db/db_flush_test.cc
+++ b/db/db_flush_test.cc
@@ -101,7 +101,7 @@ TEST_F(DBFlushTest, SyncFail) {
   TEST_SYNC_POINT("DBFlushTest::SyncFail:2");
   fault_injection_env->SetFilesystemActive(true);
   // Now the background job will do the flush; wait for it.
-  // Returns the IO error happend during flush.
+  // Returns the IO error happened during flush.
   ASSERT_NOK(dbfull()->TEST_WaitForFlushMemTable());
   ASSERT_EQ("", FilesPerLevel());  // flush failed.
   Destroy(options);
@@ -518,11 +518,11 @@ TEST_F(DBFlushTest, StatisticsGarbageInsertAndDeletes) {
   // Note : one set of delete for KEY1, KEY2, KEY3 is written to
   // SSTable to propagate the delete operations to K-V pairs
   // that could have been inserted into the database during past Flush
-  // opeartions.
+  // operations.
   EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH -=
       KEY1.size() + KEY2.size() + KEY3.size() + 3 * sizeof(uint64_t);
 
-  // Additional useful paylaod.
+  // Additional useful payload.
   ASSERT_OK(Delete(KEY4));
   ASSERT_OK(Delete(KEY5));
   ASSERT_OK(Delete(KEY6));
@@ -614,7 +614,7 @@ TEST_F(DBFlushTest, StatisticsGarbageRangeDeletes) {
 
   // Note : one set of deleteRange for (KEY1, KEY2) and (KEY2, KEY3) is written
   // to SSTable to propagate the deleteRange operations to K-V pairs that could
-  // have been inserted into the database during past Flush opeartions.
+  // have been inserted into the database during past Flush operations.
   EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH -=
       (KEY1.size() + KEY2.size() + sizeof(uint64_t)) +
       (KEY2.size() + KEY3.size() + sizeof(uint64_t));
@@ -842,7 +842,7 @@ TEST_F(DBFlushTest, FixFlushReasonRaceFromConcurrentFlushes) {
     ASSERT_OK(Put(1, Key(idx), std::string(1, 'v')));
   }
 
-  // To coerce a manual flush happenning in the middle of GetLiveFiles's flush,
+  // To coerce a manual flush happening in the middle of GetLiveFiles's flush,
   // we need to pause background flush thread and enable it later.
   std::shared_ptr<test::SleepingBackgroundTask> sleeping_task =
       std::make_shared<test::SleepingBackgroundTask>();
@@ -851,7 +851,7 @@ TEST_F(DBFlushTest, FixFlushReasonRaceFromConcurrentFlushes) {
                  sleeping_task.get(), Env::Priority::HIGH);
   sleeping_task->WaitUntilSleeping();
 
-  // Coerce a manual flush happenning in the middle of GetLiveFiles's flush
+  // Coerce a manual flush happening in the middle of GetLiveFiles's flush
   bool get_live_files_paused_at_sync_point = false;
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
       "DBImpl::AtomicFlushMemTables:AfterScheduleFlush", [&](void* /* arg */) {
@@ -1428,7 +1428,7 @@ TEST_F(DBFlushTest, MemPurgeDeleteAndDeleteRange) {
   Close();
 }
 
-// Create a Compaction Fitler that will be invoked
+// Create a Compaction Filter that will be invoked
 // at flush time and will update the value of a KV pair
 // if the key string is "lower" than the filter_key_ string.
 class ConditionalUpdateFilter : public CompactionFilter {
diff --git a/file/file_prefetch_buffer.h b/file/file_prefetch_buffer.h
index 51c0b4441a06..575e9ebcd795 100644
--- a/file/file_prefetch_buffer.h
+++ b/file/file_prefetch_buffer.h
@@ -93,8 +93,8 @@ struct BufferInfo {
   //
   // For example - if end offset of previous buffer was 100 and because of
   // readahead_size optimization, end_offset was trimmed to 60. Then for next
-  // prefetch call, start_offset should be intialized to 100 i.e  start_offset =
-  // buf->initial_end_offset_.
+  // prefetch call, start_offset should be initialized to 100 i.e  start_offset
+  // = buf->initial_end_offset_.
   uint64_t initial_end_offset_ = 0;
 
   bool IsDataBlockInBuffer(uint64_t offset, size_t length) {
@@ -155,7 +155,7 @@ enum class FilePrefetchBufferUsage {
 // When reusing the file system allocated buffer, overlap_buf_ is used if the
 // main buffer only contains part of the requested data. It is returned to
 // the caller after the remaining data is fetched.
-// If num_buffers_ > 1, then the data is prefetched asynchronosuly in the
+// If num_buffers_ > 1, then the data is prefetched asynchronously in the
 // buffers whenever the data is consumed from the buffers and that buffer is
 // freed.
 // If num_buffers > 1, then requested data can be overlapping between 2 buffers.
diff --git a/file/prefetch_test.cc b/file/prefetch_test.cc
index c651046dd246..bcfeb38edc75 100644
--- a/file/prefetch_test.cc
+++ b/file/prefetch_test.cc
@@ -669,7 +669,7 @@ TEST_P(PrefetchTest, ConfigureAutoMaxReadaheadSize) {
     MoveFilesToLevel(level);
   }
   Close();
-  std::vector<int> buff_prefectch_level_count = {0, 0, 0};
+  std::vector<int> buff_prefetch_level_count = {0, 0, 0};
   ASSERT_OK(TryReopen(options));
   {
     auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ReadOptions()));
@@ -707,7 +707,7 @@ TEST_P(PrefetchTest, ConfigureAutoMaxReadaheadSize) {
         iter->Next();
       }
 
-      buff_prefectch_level_count[level] = buff_prefetch_count;
+      buff_prefetch_level_count[level] = buff_prefetch_count;
       if (support_prefetch && !use_direct_io) {
         if (level == 0) {
           ASSERT_FALSE(fs->IsPrefetchCalled());
@@ -728,7 +728,7 @@ TEST_P(PrefetchTest, ConfigureAutoMaxReadaheadSize) {
   }
 
   if (!support_prefetch) {
-    ASSERT_GT(buff_prefectch_level_count[1], buff_prefectch_level_count[2]);
+    ASSERT_GT(buff_prefetch_level_count[1], buff_prefetch_level_count[2]);
   }
 
   SyncPoint::GetInstance()->DisableProcessing();
@@ -814,7 +814,7 @@ TEST_P(PrefetchTest, ConfigureInternalAutoReadaheadSize) {
                                       "{initial_auto_readahead_size=0;}"}}));
           break;
         case 1:
-          // intial_auto_readahead_size and max_auto_readahead_size are set
+          // initial_auto_readahead_size and max_auto_readahead_size are set
           // same so readahead_size remains same.
           ASSERT_OK(db_->SetOptions({{"block_based_table_factory",
                                       "{initial_auto_readahead_size=4096;max_"
@@ -1081,7 +1081,7 @@ TEST_P(PrefetchTest, PrefetchWhenReseek) {
   }
   {
     /*
-     * Reesek keys from Single Data Block.
+     * Reseek keys from Single Data Block.
      */
     auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ReadOptions()));
     iter->Seek(BuildKey(0));
@@ -1116,9 +1116,8 @@ TEST_P(PrefetchTest, PrefetchWhenReseek) {
     ASSERT_TRUE(iter->Valid());
     iter->Seek(BuildKey(1008));
     ASSERT_TRUE(iter->Valid());
-    iter->Seek(
-        BuildKey(996));  // Reseek won't prefetch any data and
-                         // readahead_size will be initiallized to 8*1024.
+    iter->Seek(BuildKey(996));  // Reseek won't prefetch any data and
+                                // readahead_size will be initialized to 8*1024.
     ASSERT_TRUE(iter->Valid());
     iter->Seek(BuildKey(992));
     ASSERT_TRUE(iter->Valid());
@@ -1590,7 +1589,7 @@ INSTANTIATE_TEST_CASE_P(
     ::testing::Combine(
         // Params are as follows -
         // Param 0 - TableOptions::index_shortening
-        // Param 2 - ReadOptinos::auto_readahead_size
+        // Param 2 - ReadOptions::auto_readahead_size
         ::testing::Values(
             BlockBasedTableOptions::IndexShorteningMode::kNoShortening,
             BlockBasedTableOptions::IndexShorteningMode::kShortenSeparators,
@@ -3303,7 +3302,7 @@ TEST_F(FilePrefetchBufferTest, SyncReadaheadStats) {
   ASSERT_EQ(stats->getAndResetTickerCount(PREFETCH_HITS), 1);
   ASSERT_EQ(stats->getAndResetTickerCount(PREFETCH_BYTES_USEFUL), 8192);
 
-  // Now read some data with length doesn't align with aligment and it needs
+  // Now read some data with length doesn't align with alignment and it needs
   // prefetching. Read from 16000 with length 10000 (i.e. requested end offset -
   // 26000).
   ASSERT_TRUE(
diff --git a/include/rocksdb/advanced_cache.h b/include/rocksdb/advanced_cache.h
index d8eeb7d2e381..8142228205e4 100644
--- a/include/rocksdb/advanced_cache.h
+++ b/include/rocksdb/advanced_cache.h
@@ -318,7 +318,7 @@ class Cache : public Customizable {
   // REQUIRES: handle must have been returned by a method on *this.
   virtual bool Release(Handle* handle, bool erase_if_last_ref = false) = 0;
 
-  // Return the object assiciated with a handle returned by a successful
+  // Return the object associated with a handle returned by a successful
   // Lookup(). For historical reasons, this is also known at the "value"
   // associated with the key.
   // REQUIRES: handle must not have been released yet.
diff --git a/include/rocksdb/experimental.h b/include/rocksdb/experimental.h
index d6a34c025728..42b40cfa4754 100644
--- a/include/rocksdb/experimental.h
+++ b/include/rocksdb/experimental.h
@@ -86,7 +86,7 @@ Status UpdateManifestForFilesState(
 // keys in a category to return an empty sequence of segments.
 //
 // To eliminate a confusing distinction between a segment that is empty vs.
-// "not present" for a particular key, each key is logically assiciated with
+// "not present" for a particular key, each key is logically associated with
 // an infinite sequence of segments, including some infinite tail of 0-length
 // segments. In practice, we only represent a finite sequence that (at least)
 // covers the non-trivial segments.
@@ -220,7 +220,7 @@ Status UpdateManifestForFilesState(
 // whole key.
 // * Range query - Whether there {definitely isn't, might be} any entries
 // within a lower and upper key bound, in an SST file (or partition, etc.).
-//    NOTE: For this disucssion, we ignore the detail of inclusive vs.
+//    NOTE: For this discussion, we ignore the detail of inclusive vs.
 //    exclusive bounds by assuming a generalized notion of "bound" (vs. key)
 //    that conveniently represents spaces between keys. For details, see
 //    https://github.com/facebook/rocksdb/pull/11434
@@ -300,7 +300,7 @@ Status UpdateManifestForFilesState(
 //     * Keys x and z are in categories in category set s, and
 //     * Key y is ordered x < y < z according to the CF comparator,
 // then both
-//     * The common segment prefix property is satisifed through ordinal i-1
+//     * The common segment prefix property is satisfied through ordinal i-1
 //     and with category set s
 //     * x_i..j <= y_i..j <= z_i..j according to segment comparator c, where
 //     x_i..j is the concatenation of segments i through j of key x (etc.).
diff --git a/table/block_based/block_based_table_iterator.cc b/table/block_based/block_based_table_iterator.cc
index c507497244f2..0c6fa65834db 100644
--- a/table/block_based/block_based_table_iterator.cc
+++ b/table/block_based/block_based_table_iterator.cc
@@ -767,7 +767,7 @@ void BlockBasedTableIterator::InitializeStartAndEndOffsets(
       // It can be when Reseek is from block cache (which doesn't clear the
       // buffers in FilePrefetchBuffer but clears block handles from queue) and
       // reseek also lies within the buffer. So Next will get data from
-      // exisiting buffers untill this callback is made to prefetch additional
+      // existing buffers until this callback is made to prefetch additional
       // data. All handles need to be added to the queue starting from
       // index_iter_.
       assert(index_iter_->Valid());
@@ -1046,7 +1046,7 @@ void BlockBasedTableIterator::Prepare(const MultiScanArgs* multiscan_opts) {
 
 void BlockBasedTableIterator::SeekMultiScan(const Slice* seek_target) {
   assert(multi_scan_ && multi_scan_status_.ok());
-  // This is a MultiScan and Preapre() has been called.
+  // This is a MultiScan and Prepare() has been called.
 
   // Reset out of bound on seek, if it is out of bound again, it will be set
   // properly later in the code path
@@ -1153,7 +1153,7 @@ void BlockBasedTableIterator::SeekMultiScan(const Slice* seek_target) {
       // This should never happen, the reason is that the
       // multi_scan_->next_scan_idx is set to a non zero value is due to a seek
       // target larger or equal to the start key of multi_scan_->next_scan_idx-1
-      // happended earlier. If a seek happens before the start key of
+      // happened earlier. If a seek happens before the start key of
       // multi_scan_->next_scan_idx-1, it would seek a key that is less than
       // what was seeked before.
       assert(!seek_target_before_previous_prepared_range);
diff --git a/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc b/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc
index 77a6d1b2bb3b..0b954617bdd3 100644
--- a/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc
+++ b/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc
@@ -88,7 +88,7 @@ class BlockCacheTracerTest : public testing::Test {
       case 4:
         return TableReaderCaller::kUserIterator;
     }
-    // This cannot happend.
+    // This cannot happen.
     assert(false);
     return TableReaderCaller::kMaxBlockCacheLookupCaller;
   }

From 387cb4aae7fefbf0fa07ebb0f993936ceaae729c Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Mon, 5 Jan 2026 20:47:46 -0800
Subject: [PATCH 412/500] Clarify/rename atomic wrapper stuff + blog post
 (#14213)

Summary:
* Some existing commentary and motivation around my atomic wrappers in atomic.h was based on a misreading of documentation. seq_cst *is* a safe substitute for acq_rel in all cases. I still like having a distinct type for RelaxedAtomic (as folly does) and a wrapper also for other cases to avoid readability traps like implicit conversion and implicit memory order. This PR is only comment changes and renaming.
* Create a blog post about bit fields API to help with lock-free (and low-lock) programming.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14213

Test Plan: esiting tests

Reviewed By: xingbowang

Differential Revision: D89971581

Pulled By: pdillinger

fbshipit-source-id: 9bd1181d692258d668189c2da8bd0e5d98fd6230
---
 cache/clock_cache.cc                          |   4 +-
 cache/clock_cache.h                           |  14 +-
 db/db_wal_test.cc                             |   2 +-
 db/db_write_test.cc                           |   6 +-
 .../_posts/2025-12-31-bit-fields-api.markdown | 279 ++++++++++++++++++
 memtable/inlineskiplist.h                     |   4 +-
 memtable/skiplist.h                           |   6 +-
 .../block_based/block_based_table_builder.cc  |   2 +-
 util/atomic.h                                 |  53 ++--
 util/bit_fields.h                             |  18 +-
 util/slice_test.cc                            |   2 +-
 11 files changed, 339 insertions(+), 51 deletions(-)
 create mode 100644 docs/_posts/2025-12-31-bit-fields-api.markdown

diff --git a/cache/clock_cache.cc b/cache/clock_cache.cc
index dbd2a5b8fccb..70155791a41c 100644
--- a/cache/clock_cache.cc
+++ b/cache/clock_cache.cc
@@ -210,7 +210,7 @@ inline bool ClockUpdate(ClockHandle& h, BaseClockTable::EvictionData* data,
 // motivates only checking for release counter in high state, not both in high
 // state.)
 inline void CorrectNearOverflow(SlotMeta old_meta,
-                                AcqRelBitFieldsAtomic<SlotMeta>& meta) {
+                                BitFieldsAtomic<SlotMeta>& meta) {
   // We clear both top-most counter bits at the same time.
   constexpr uint32_t kCounterTopBit = uint32_t{1}
                                       << (SlotMeta::kCounterNumBits - 1);
@@ -1924,7 +1924,7 @@ class AutoHyperClockTable::ChainRewriteLock {
     }
   }
 
-  AcqRelBitFieldsAtomic<NextWithShift>* head_ptr_;
+  BitFieldsAtomic<NextWithShift>* head_ptr_;
   NextWithShift saved_head_;
 };
 
diff --git a/cache/clock_cache.h b/cache/clock_cache.h
index cdee0e93e5a7..efce8a69e352 100644
--- a/cache/clock_cache.h
+++ b/cache/clock_cache.h
@@ -413,7 +413,7 @@ struct ClockHandle : public ClockHandleBasicData {
   // TODO: make these coundown values tuning parameters for eviction?
 
   // See above. Mutable for read reference counting.
-  mutable AcqRelBitFieldsAtomic<SlotMeta> meta{};
+  mutable BitFieldsAtomic<SlotMeta> meta{};
 };  // struct ClockHandle
 
 class BaseClockTable {
@@ -538,13 +538,13 @@ class BaseClockTable {
   // TODO: is this separation needed if we don't do background evictions?
   ALIGN_AS(CACHE_LINE_SIZE)
   // Number of elements in the table.
-  AcqRelAtomic<size_t> occupancy_{};
+  Atomic<size_t> occupancy_{};
 
   // Memory usage by entries tracked by the cache (including standalone)
-  AcqRelAtomic<size_t> usage_{};
+  Atomic<size_t> usage_{};
 
   // Part of usage by standalone entries (not in table)
-  AcqRelAtomic<size_t> standalone_usage_{};
+  Atomic<size_t> standalone_usage_{};
 
   // Maximum total charge of all elements stored in the table.
   // (Relaxed: eventual consistency/update is OK)
@@ -899,8 +899,8 @@ class AutoHyperClockTable : public BaseClockTable {
 
     // See above. The head pointer is logically independent of the rest of
     // the entry, including the chain next pointer.
-    AcqRelBitFieldsAtomic<NextWithShift> head_next_with_shift{kUnusedMarker};
-    AcqRelBitFieldsAtomic<NextWithShift> chain_next_with_shift{kUnusedMarker};
+    BitFieldsAtomic<NextWithShift> head_next_with_shift{kUnusedMarker};
+    BitFieldsAtomic<NextWithShift> chain_next_with_shift{kUnusedMarker};
 
     // For supporting CreateStandalone and some fallback cases.
     inline bool IsStandalone() const {
@@ -1056,7 +1056,7 @@ class AutoHyperClockTable : public BaseClockTable {
   // To maximize parallelization of Grow() operations, this field is only
   // updated opportunistically after Grow() operations and in DoInsert() where
   // it is found to be out-of-date. See CatchUpLengthInfoNoWait().
-  AcqRelAtomic<uint64_t> length_info_;
+  Atomic<uint64_t> length_info_;
 
   // An already-computed version of the usable length times the max load
   // factor. Could be slightly out of date but GrowIfNeeded()/Grow() handle
diff --git a/db/db_wal_test.cc b/db/db_wal_test.cc
index 75e13724a75e..641daeb0bfd6 100644
--- a/db/db_wal_test.cc
+++ b/db/db_wal_test.cc
@@ -1613,7 +1613,7 @@ TEST_F(DBWALTest, SyncWalPartialFailure) {
       return s;
     }
 
-    AcqRelAtomic<uint32_t> syncs_before_failure_{UINT32_MAX};
+    Atomic<uint32_t> syncs_before_failure_{UINT32_MAX};
 
    protected:
     class MyTestWritableFile : public FSWritableFileOwnerWrapper {
diff --git a/db/db_write_test.cc b/db/db_write_test.cc
index f6eeac7c5ee6..97fb86c14c2c 100644
--- a/db/db_write_test.cc
+++ b/db/db_write_test.cc
@@ -741,7 +741,7 @@ TEST_P(DBWriteTest, LockWALConcurrentRecursive) {
     ExternalSstFileInfo external_info;
     ASSERT_OK(sst_file_writer.Finish(&external_info));
   }
-  AcqRelAtomic<bool> parallel_ingest_completed{false};
+  Atomic<bool> parallel_ingest_completed{false};
   port::Thread parallel_ingest{[&]() {
     IngestExternalFileOptions ingest_opts;
     ingest_opts.move_files = true;  // faster than copy
@@ -750,7 +750,7 @@ TEST_P(DBWriteTest, LockWALConcurrentRecursive) {
     parallel_ingest_completed.Store(true);
   }};
 
-  AcqRelAtomic<bool> flush_completed{false};
+  Atomic<bool> flush_completed{false};
   port::Thread parallel_flush{[&]() {
     FlushOptions flush_opts;
     // NB: Flush with wait=false case is tested above in LockWALInEffect
@@ -762,7 +762,7 @@ TEST_P(DBWriteTest, LockWALConcurrentRecursive) {
     flush_completed.Store(true);
   }};
 
-  AcqRelAtomic<bool> parallel_put_completed{false};
+  Atomic<bool> parallel_put_completed{false};
   port::Thread parallel_put{[&]() {
     // This can make certain failure scenarios more likely:
     //   sleep(1);
diff --git a/docs/_posts/2025-12-31-bit-fields-api.markdown b/docs/_posts/2025-12-31-bit-fields-api.markdown
new file mode 100644
index 000000000000..40d1b60f5326
--- /dev/null
+++ b/docs/_posts/2025-12-31-bit-fields-api.markdown
@@ -0,0 +1,279 @@
+---
+title: "BitFields API: Type-Safe Bit Packing for Lock-Free Data Structures"
+layout: post
+author: pdillinger
+category: blog
+---
+
+Modern concurrent data structures increasingly rely on [atomic operations](https://en.cppreference.com/w/cpp/atomic/atomic) to avoid the overhead of locking. A valuable but under-utilized technique for maximizing the effectiveness of atomic operations is [bit packing](https://en.wikipedia.org/wiki/Bit_field)---fitting multiple logical fields into a single atomic variable for algorithmic simplicity and efficiency. However, language support for bit packing does not guarantee dense packing, and manually managing bit manipulation quickly becomes error-prone, especially when dealing with complex state machines.
+
+To address this in RocksDB, we have developed a reusable **BitFields API**, a type-safe, zero-overhead abstraction for bit packing in C++. This works in conjunction with clean wrappers for `std::atomic` for powerful and relatively safe bit-packing of atomic data. For broader use, a [variant of the code](https://github.com/facebook/folly/pull/2549) has been proposed for adding to folly.
+
+## The Problem: Managing Packed Bit Fields
+
+Consider HyperClockCache, an essentially lock-free cache implementation in RocksDB, which was [refactored to use this BitFields API](https://github.com/facebook/rocksdb/pull/14154). It is a hash table built on *slots* that can each hold a cache entry and relevant metadata. For atomic simplicity and efficiency, all the essential metadata for each slot is packed into a single 64-bit value:
+- The reference count and eviction metadata are together encoded into *acquire* and *release* counters, 30 bits each.
+- The possible states of {*empty*, *under construction/destruction*, *occupied+visible*, and *occupied+invisible*} are encoded into three state bits (instead of two, for easier decoding and manipulation).
+- A *hit* bit is used for secondary cache integration.
+
+Traditionally, you might write code like this:
+
+```cpp
+// Old approach: manual bit manipulation
+constexpr uint64_t kAcquireCounterShift = 0;
+constexpr uint64_t kReleaseCounterShift = 30;
+constexpr uint64_t kCounterMask = 0x3FFFFFFF;
+constexpr uint64_t kHitBitShift = 60;
+constexpr uint64_t kOccupiedShift = 61;
+constexpr uint64_t kShareableShift = 62;
+constexpr uint64_t kVisibleShift = 63;
+constexpr uint64_t kStateShift = kOccupiedShift;
+
+std::atomic<uint64_t> meta_;
+
+bool IsUnderConstruction(uint64_t meta) const {
+    return (meta & (uint64_t{1} << kOccupiedShift)) && !(meta & (uint64_t{1} << kShareableShift));
+}
+
+// Getting fields
+uint64_t meta = meta_.load(std::memory_order_acquire);
+if (IsUnderConstruction(meta)) {
+  // ...
+} else if ((meta >> kVisibleShift) & 1) {
+  uint32_t refcount =
+      static_cast<uint32_t>(((meta >> kAcquireCounterShift) -
+                             (meta >> kReleaseCounterShift)) & kCounterMask);
+  // ...
+}
+
+
+// Setting fields
+
+// Set the hit bit (relaxed)
+meta_.fetch_or(uint64_t{1} << kHitBitShift, std::memory_order_relaxed);
+
+// Set both counters to `new_count` (as in eviction processing)
+uint64_t meta = meta_.load(std::memory_order_relaxed);
+uint64_t new_meta =
+    (meta & ((uint64_t{1} << kHitBitShift) | (uint64_t{7} << kStateShift))) |
+    (new_count << kReleaseCounterShift) |
+    (new_count << kAcquireCounterShift);
+bool success = meta_.compare_exchange_strong(meta, new_meta,
+                                             std::memory_order_acq_rel);
+
+// Increment acquire counter by initial_countdown
+old_meta = meta_.fetch_add((uint64_t{1} << kAcquireCounterShift) * initial_countdown,
+                           std::memory_order_acq_rel);
+```
+
+This approach has several problems:
+1. **Error-prone**: Easy to get masks and shifts wrong
+2. **Maintenance burden**: Changes to field sizes require updating multiple constants
+3. **Abstraction challenges**: Even if writing a full set of well-tested getters and setters to hide all the details, details can leak in to do things like update multiple fields in one non-CAS (compare-and-swap) atomic operation.
+
+## New Solution: BitFields API
+
+The BitFields API provides a declarative, type-safe way to define bit-packed structures. Here's how the same example looks with BitFields:
+
+```cpp
+// New approach: declarative bit fields. (Each field must reference the
+// previous, so that the declaration machinery is simply stateless.)
+struct SlotMeta : public BitFields<uint64_t, SlotMeta> {
+  using AcquireCounter = UnsignedBitField<SlotMeta, 30, NoPrevBitField>;
+  using ReleaseCounter = UnsignedBitField<SlotMeta, 30, AcquireCounter>;
+  using HitFlag = BoolBitField<SlotMeta, ReleaseCounter>;
+  using OccupiedFlag = BoolBitField<SlotMeta, HitFlag>;
+  using ShareableFlag = BoolBitField<SlotMeta, OccupiedFlag>;
+  using VisibleFlag = BoolBitField<SlotMeta, ShareableFlag>;
+
+  // Convenience helpers
+  bool IsUnderConstruction() const {
+    return Get<OccupiedFlag>() && !Get<ShareableFlag>();
+  }
+};
+
+BitFieldsAtomic<SlotMeta> meta_;
+
+// Getting fields
+SlotMeta state = meta_.Load();
+if (state.IsUnderConstruction()) {
+  // ...
+} else if (state.Get<SlotMeta::VisibleFlag>()) {
+  uint32_t refcount = state.Get<SlotMeta::AcquireCounter>() -
+                      state.Get<SlotMeta::ReleaseCounter>();
+  // ...
+}
+
+// Setting fields
+
+// Set the hit bit (relaxed)
+meta_.ApplyRelaxed(SlotMeta::HitFlag::SetTransform());
+
+// Set both counters to `new_count` (as in eviction processing)
+SlotMeta meta = meta_.LoadRelaxed();
+SlotMeta new_meta = meta;
+new_meta.Set<SlotMeta::ReleaseCounter>(new_count);
+new_meta.Set<SlotMeta::AcquireCounter>(new_count);
+meta_.CasStrongRelaxed(meta, new_meta);
+
+// Increment acquire counter by initial_countdown
+auto add_acquire =
+    AcquireCounter::PlusTransformPromiseNoOverflow(initial_countdown);
+meta_.Apply(add_acquire, &old_meta);
+
+// Bonus: Atomic multi-field updates without compare-exchange
+auto transform = AcquireCounter::PlusTransformPromiseNoOverflow(1) +
+                 ReleaseCounter::PlusTransformPromiseNoOverflow(1);
+meta_.Apply(transform);
+```
+
+## Key Features
+
+### Type Safety and Self-Documentation
+
+Each field has a specific type (`bool` for `BoolBitField`, appropriately-sized unsigned int for `UnsignedBitField`) and clear semantic meaning. The field definitions are self-documenting: you can immediately see how many bits each field occupies and in what order.
+
+### [Zero Overhead](https://en.cppreference.com/w/cpp/language/Zero-overhead_principle)
+
+Because of heavy use of templates and constexpr operations and the ability to satisfy multiple field reads or writes from a single atomic operation, we have seen no runtime overhead vs. hand-written bit manipulation, in RocksDB. In one case, we verified the assembly code was identical.
+
+[For folly's LifoSem](https://github.com/facebook/folly/pull/2550), there was one case where an optimization hack with detected overflow from one field to another couldn't be replicated as efficiently with the BitFields API because it would violate overflow checking. For that case I dove into the underlying representation to bypass the BitFields overflow check.
+
+### Atomic Operations with Transforms
+
+One of the most powerful features is the ability to combine multiple field updates into a single atomic operation using "transforms", if they are all either (a) some combination of addition and subtraction, (b) bitwise-and, or (c) bitwise-or. For example:
+
+```cpp
+// Clear several but not all fields atomically
+auto and_transform = Field1::AndTransform(0) +
+                 Field2::ClearTransform() +
+                 Field4::ClearTransform();
+atomic_bitfields.Apply(and_transform, &old_state, &new_state);
+...
+// Set more than one boolean field atomically
+auto or_transform = Field2::SetTransform() +
+                 Field4::SetTransform();
+atomic_bitfields.Apply(or_transform, &old_state, &new_state);
+...
+auto add_transform = Field1::PlusTransformPromiseNoOverflow(1) +
+                     Field3::MinusTransformPromiseNoUnderflow(1);
+atomic_bitfields.Apply(add_transform, &old_state, &new_state);
+```
+
+Each `Apply()` generates a single atomic operation (e.g., `fetch_add` or `fetch_or`) that updates all the specified fields, and optionally returns both the old and new values. This enables a number of hacks for atomic updates without CAS.
+
+### Overflow Protection
+
+The API includes built-in overflow detection in debug builds:
+
+```cpp
+// An assertion will fail in debug builds if the counter overflows
+auto transform = Counter::PlusTransformPromiseNoOverflow(value);
+atomic.Apply(transform);
+```
+
+For fields at the top of the underlying representation (where overflow doesn't affect other fields), overflow is explicitly ignored. (A compile time error is generated if you try to use `PlusTransformPromiseNoOverflow` on a field at the top of the representation or `PlusTransformIgnoreOverflow` on a field not at the top of the representation.)
+
+```cpp
+// For wraparound counters
+auto transform = Counter::PlusTransformIgnoreOverflow(value);
+```
+
+This capability is used in a folly data structure called LifoSem, which [I have proposed to refactor](https://github.com/facebook/folly/pull/2550) to a proposed BitFields API variant for folly.
+
+### Compare-and-Swap (CAS) Support
+
+The atomic wrappers provide full CAS support for lock-free algorithms:
+
+```cpp
+SlotMeta expected = current_state;
+SlotMeta desired = expected.With<Field1>(new_value).With<Field2>(true);
+if (meta_.CasStrong(expected, desired)) {
+  // Successfully updated
+  ...
+}
+```
+
+### Atomic wrappers
+
+The BitFields API includes two atomic wrappers: `RelaxedBitFieldsAtomic` and `BitFieldsAtomic`. However, RocksDB also has versions of these wrappers for regular `std::atomic` variables that help with memory ordering discipline: `RelaxedAtomic` and `Atomic` in `util/atomic.h`.
+
+These wrappers help in a couple of ways:
+* **Self-document intended memory order**: An atomic field generally has a single memory order that all or most operations should use, typically either `std::memory_order_relaxed` or `std::memory_order_acq_rel`.
+* **More intentional memory orders and atomic operations**: The standard library's implicit conversions and default memory ordering (`memory_order_seq_cst`) make it easy to accidentally use sequential consistency with acquire/release ordering or even relaxed, which could hurt performance, and tend to hide where atomic operations are actually happening (e.g. implicit vs. explicit load).
+
+For example, instead of writing:
+```cpp
+std::atomic<uint64_t> stat_counter;
+stat_counter++;  // Uses memory_order_seq_cst implicitly - maybe inefficient
+```
+
+You write:
+```cpp
+RelaxedAtomic<uint64_t> stat_counter;
+stat_counter.FetchAddRelaxed(1);  // Explicitly relaxed - appropriate for a diagnostic counter
+```
+
+Or for data providing synchronization:
+```cpp
+Atomic<size_t> refcount;
+refcount.FetchAdd(1);  // Standard acquire-release semantics for coordinating with other threads
+```
+
+These wrappers complement the BitFields atomic wrappers by providing the same ordering discipline for non-packed atomic variables throughout much of RocksDB, creating a more readable and less clunky approach to concurrent programming. Migrating remaining uses of `std::atomic` is an ongoing effort.
+
+## Real-World Usage in RocksDB
+
+The BitFields API was developed along with the revamped parallel compression in RocksDB, but with the intention to also clean up the HyperClockCache (HCC) implementation. With that migration complete, we can see the benefits. Specifically, **by packing more of the state machine into a single atomic value, the parallel algorithms became both simpler and more efficient.** Concurrent algorithms that could have blown up in their state space with elaborate interleavings between threads trying not to block each other, e.g. because of multi-step consensus on work assignments, were instead able to quickly and more easily make progress, e.g. with atomically clear work assignments.
+
+### Before: Manual Bit Manipulation
+
+The old HCC code was difficult to read and maintain. Many of the common read and update operations had manually written helper functions, but it was not practical to develop the full set of functions needed for rare cases. Consider this code that clears the "visible" flag on a slot when an entry is erased from subsequent lookups but might still be referenced:
+
+```cpp
+// Old HCC code, without atomic wrappers
+uint64_t old_meta =
+        h->meta.fetch_and(~(uint64_t{ClockHandle::kStateVisibleBit}
+                                   << ClockHandle::kStateShift), std::memory_order_acq_rel);
+// Apply update to local copy
+uint64_t new_meta = old_meta & ~(uint64_t{ClockHandle::kStateVisibleBit}
+                            << ClockHandle::kStateShift);
+
+// New HCC code
+SlotMeta old_meta, new_meta;
+h->meta.Apply(SlotMeta::VisibleFlag::ClearTransform(), &old_meta, &new_meta);
+```
+
+Or this assertion that the acquire and release counters are different:
+
+```cpp
+// Old HCC code
+uint64_t old_meta = ...;
+assert(((old_meta >> ClockHandle::kAcquireCounterShift) &
+        ClockHandle::kCounterMask) !=
+        ((old_meta >> ClockHandle::kReleaseCounterShift) &
+        ClockHandle::kCounterMask));
+
+// New HCC code without single-purpose helper functions
+SlotMeta old_meta = ...;
+assert(old_meta.Get<SlotMeta::AcquireCounter>() !=
+       old_meta.Get<SlotMeta::ReleaseCounter>());
+
+// New HCC code, with single-purpose helper functions
+SlotMeta old_meta = ...;
+assert(old_meta.GetAcquireCounter() != old_meta.GetReleaseCounter());
+```
+
+Some hand-written helper functions or using directives are still useful for brevity, but even without them all the bit manipulation details are hidden in the BitFields implementation.
+
+## Future Directions
+
+We hope the proposed folly version is accepted to make the BitFields API available for broader usage. Additionally, some quality-of-life improvements are likely possible, perhaps including easier declaration and usage syntax, hopefully without delving into boost-like macro hell. Better runtime and compile time checks might also be possible.
+
+## Conclusion
+
+The BitFields API demonstrates that zero-overhead abstractions can significantly improve code quality without sacrificing performance. By providing type safety, self-documentation, and convenience features around bit manipulation and atomic operations, it makes lock-free programming more accessible and maintainable. Bit-packed atomics are arguably essential for *slaying the complexity dragon* of efficient lock-free and low-lock algorithms, because they reduce explosion in algorithm states.
+
+For RocksDB specifically, the migration to BitFields has made the HyperClockCache implementation substantially easier to understand and modify, while maintaining the same high-performance characteristics. Combined with the recent [parallel compression revamp](/blog/2025/10/08/parallel-compression-revamp.html), these improvements showcase our ongoing commitment to writing clean, efficient, and maintainable code.
+
+The BitFields API is available in RocksDB's util/bit_fields.h and can be adapted for use in other projects requiring efficient, type-safe bit packing. For those building high-performance concurrent systems, it offers a compelling alternative to manual bit manipulation—proving that safe abstractions and peak performance are not mutually exclusive.
diff --git a/memtable/inlineskiplist.h b/memtable/inlineskiplist.h
index a25436af495b..d39091ec6d43 100644
--- a/memtable/inlineskiplist.h
+++ b/memtable/inlineskiplist.h
@@ -372,7 +372,7 @@ struct InlineSkipList<Comparator>::Node {
  private:
   // next_[0] is the lowest level link (level 0).  Higher levels are
   // stored _earlier_, so level 1 is at next_[-1].
-  AcqRelAtomic<Node*> next_[1];
+  Atomic<Node*> next_[1];
 };
 
 template <class Comparator>
@@ -813,7 +813,7 @@ char* InlineSkipList<Comparator>::AllocateKey(size_t key_size) {
 template <class Comparator>
 typename InlineSkipList<Comparator>::Node*
 InlineSkipList<Comparator>::AllocateNode(size_t key_size, int height) {
-  auto prefix = sizeof(AcqRelAtomic<Node*>) * (height - 1);
+  auto prefix = sizeof(Atomic<Node*>) * (height - 1);
 
   // prefix is space for the height - 1 pointers that we store before
   // the Node instance (next_[-(height - 1) .. -1]).  Node starts at
diff --git a/memtable/skiplist.h b/memtable/skiplist.h
index 979cffd111c7..594c6ec43ce4 100644
--- a/memtable/skiplist.h
+++ b/memtable/skiplist.h
@@ -197,14 +197,14 @@ struct SkipList<Key, Comparator>::Node {
 
  private:
   // Array of length equal to the node height.  next_[0] is lowest level link.
-  AcqRelAtomic<Node*> next_[1];
+  Atomic<Node*> next_[1];
 };
 
 template <typename Key, class Comparator>
 typename SkipList<Key, Comparator>::Node* SkipList<Key, Comparator>::NewNode(
     const Key& key, int height) {
-  char* mem = allocator_->AllocateAligned(
-      sizeof(Node) + sizeof(AcqRelAtomic<Node*>) * (height - 1));
+  char* mem = allocator_->AllocateAligned(sizeof(Node) +
+                                          sizeof(Atomic<Node*>) * (height - 1));
   return new (mem) Node(key);
 }
 
diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc
index 74c90edea01b..c4e6895a2163 100644
--- a/table/block_based/block_based_table_builder.cc
+++ b/table/block_based/block_based_table_builder.cc
@@ -318,7 +318,7 @@ struct BlockBasedTableBuilder::ParallelCompressionRep {
   // simplify the interesting interleavings that have to be considered and
   // accommodated.
   struct State : public BitFields<uint64_t, State> {};
-  ALIGN_AS(CACHE_LINE_SIZE) AcqRelBitFieldsAtomic<State> atomic_state;
+  ALIGN_AS(CACHE_LINE_SIZE) BitFieldsAtomic<State> atomic_state;
 
   // The first field is a bit for each ring buffer slot (max 32) for whether
   // that slot is ready to be claimed for writing by a worker thread. Because
diff --git a/util/atomic.h b/util/atomic.h
index 94575fc7ca8e..209be20f3f50 100644
--- a/util/atomic.h
+++ b/util/atomic.h
@@ -13,28 +13,28 @@ namespace ROCKSDB_NAMESPACE {
 
 // Background:
 // std::atomic is somewhat easy to misuse:
-// * Implicit conversion to T using std::memory_order_seq_cst, along with
-// memory order parameter defaults, make it easy to accidentally mix sequential
-// consistency ordering with acquire/release memory ordering. See
-// "The single total order might not be consistent with happens-before" at
-// https://en.cppreference.com/w/cpp/atomic/memory_order
+// * Implicit conversion to T makes it easy to use an unnecessarily strong
+// memory ordering (std::memory_order_seq_cst) and to hide atomic operations
+// that should be evident on reading the code.
+// * Similarly, defaulting to std::memory_order_seq_cst for atomic operations
+// makes it easy to use unnecessarily strong orderings. (It's always safe if
+// some ordering is safe, but it's better to be intentional and thoughtful when
+// carefully optimizing code with atomics.) Legitimate needs for seq_cst vs.
+// acq_rel are rare, such as drawing inferences across two atomics in
+// implementing hazard pointers.
 // * It's easy to use nonsensical (UB) combinations like store with
-// std::memory_order_acquire.
-// * It is unlikely that anything in RocksDB will need std::memory_order_seq_cst
-// because sequential consistency for the user, potentially writing from
-// multiple threads, is provided by explicit versioning with sequence numbers.
-// If threads A & B update separate atomics, it's typically OK if threads C & D
-// see those updates in different orders.
+// std::memory_order_acquire. Getting these right in development is an
+// unnecessary cognitive overhead even if they are caught by UBSAN.
 //
-// For such reasons, we provide wrappers below to make safe usage easier.
+// For such reasons, we provide wrappers below to make clear and explicit
+// usage of atomics easier.
 
-// Wrapper around std::atomic to avoid certain bugs (see Background above).
+// Wrapper around std::atomic for better code clarity (see Background above).
 //
-// This relaxed-only wrapper is intended for atomics that do not need
-// ordering constraints with other data reads/writes aside from those
-// necessary for computing data values or given by other happens-before
-// relationships. For example, a cross-thread counter that never returns
-// the same result can be a RelaxedAtomic.
+// This relaxed-only wrapper is intended for atomics that are not used to
+// synchronize other data across threads (only the atomic data), so can always
+// used relaxed memory ordering. For example, a cross-thread counter that never
+// returns the same result can be a RelaxedAtomic.
 template <typename T>
 class RelaxedAtomic {
  public:
@@ -72,14 +72,21 @@ class RelaxedAtomic {
   std::atomic<T> v_;
 };
 
-// Wrapper around std::atomic to avoid certain bugs (see Background above).
+// A reasonably general-purpose wrapper around std::atomic for better code
+// clarity (see Background above).
 //
-// Except for some unusual cases requiring sequential consistency, this is
-// a general-purpose atomic. Relaxed operations can be mixed in as appropriate.
+// Operations use std::memory_order_acq_rel by default (or just acquire or just
+// release for read-only and write-only operations), but relaxed operations are
+// also available and can be mixed in when appropriate.
+//
+// Future: add std::memory_order_seqcst variants like StoreSeqCst if/when
+// there's a need for them (rare). No distinct type is needed because the
+// distinction between acq_rel and seq_cst is more about where it is used in
+// combination with other atomics than the atomic itself.
 template <typename T>
-class AcqRelAtomic : public RelaxedAtomic<T> {
+class Atomic : public RelaxedAtomic<T> {
  public:
-  explicit AcqRelAtomic(T initial = {}) : RelaxedAtomic<T>(initial) {}
+  explicit Atomic(T initial = {}) : RelaxedAtomic<T>(initial) {}
   void Store(T desired) {
     RelaxedAtomic<T>::v_.store(desired, std::memory_order_release);
   }
diff --git a/util/bit_fields.h b/util/bit_fields.h
index 1f2eb50b664c..00a43ed90118 100644
--- a/util/bit_fields.h
+++ b/util/bit_fields.h
@@ -19,7 +19,7 @@ namespace ROCKSDB_NAMESPACE {
 // fields into atomic variables to reduce the need for locking in concurrent
 // code and/or to simplify reasoning on and accommodation of different
 // interesting, bug-prone interleavings. Convenient atomic wrappers
-// (RelaxedAtomic, AcqRelAtomic) are provided below to aid usage with atomics,
+// (RelaxedAtomic, Atomic) are provided below to aid usage with atomics,
 // especially for CAS updates, but it is even possible to combine operations on
 // multiple bit fields into a single non-CAS atomic operation using Transforms
 // below.
@@ -333,10 +333,11 @@ struct UnsignedBitField {
   }
 };
 
-// A handy wrapper for a relaxed atomic on some BitFields type (unlike
-// RelaxedAtomic for arithmetic types). For encapsulation, usual arithmetic
-// atomic operations are only available by calling Apply[Relaxed]() on
-// Transforms returned from field classes. Extending an example from BitFields:
+// A handy wrapper for a relaxed atomic on some BitFields type, like
+// RelaxedAtomic but without direct arithmetic operations. For encapsulation,
+// usual arithmetic atomic operations are only available by calling
+// ApplyRelaxed() on Transforms returned from field classes. Extending an
+// example from BitFields:
 //
 // auto transform = Field2::ClearTransform() + Field4::ClearTransform();
 // MyState old_state;
@@ -421,14 +422,15 @@ class RelaxedBitFieldsAtomic {
 };
 
 // A handy wrapper for an aquire-release atomic (also relaxed semantics
-// available) on some BitFields type. See RelaxedBitFieldsAtomic for more info.
+// available) on some BitFields type. See RelaxedBitFieldsAtomic and
+// Atomic in atomic.h for more info.
 template <typename BitFieldsT>
-class AcqRelBitFieldsAtomic : public RelaxedBitFieldsAtomic<BitFieldsT> {
+class BitFieldsAtomic : public RelaxedBitFieldsAtomic<BitFieldsT> {
  public:
   using Base = RelaxedBitFieldsAtomic<BitFieldsT>;
   using U = typename BitFieldsT::U;
 
-  explicit AcqRelBitFieldsAtomic(BitFieldsT initial = {}) : Base(initial) {}
+  explicit BitFieldsAtomic(BitFieldsT initial = {}) : Base(initial) {}
 
   void Store(BitFieldsT desired) {
     Base::v_.store(desired.underlying, std::memory_order_release);
diff --git a/util/slice_test.cc b/util/slice_test.cc
index cbc72891172e..9106ec3c6e58 100644
--- a/util/slice_test.cc
+++ b/util/slice_test.cc
@@ -590,7 +590,7 @@ TEST(BitFieldsTest, BitFields) {
     ASSERT_EQ(after.Get<Field5>(), 0U);
   }
   {
-    AcqRelBitFieldsAtomic<MyState> acqrel{state};
+    BitFieldsAtomic<MyState> acqrel{state};
     ASSERT_EQ(state, acqrel.Load());
     acqrel.Store(state2);
     ASSERT_EQ(state2, acqrel.Load());

From 4e10e0bcac85634a73fb55ae8ab41fae201eaadd Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Tue, 6 Jan 2026 10:09:07 -0800
Subject: [PATCH 413/500] Fix cases of db dir going missing in db_crashtest.py
 (#14219)

Summary:
My PR https://github.com/facebook/rocksdb/issues/14195 regressed a case in which db_crashtest.py calling db_stress with --destroy_db_initially=1 could lead to dbname directory being nonexistant for subsequent calls to gen_cmd -> finalize_and_sanitize -> is_direct_io_supported which would fail in creating a temporary file. Fix this (and clean up existing related code) using os.makedirs.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14219

Test Plan: I don't have a good reproducer for the error but some manual testing indicates this change is at least safe

Reviewed By: virajthakur

Differential Revision: D90138248

Pulled By: pdillinger

fbshipit-source-id: 0ed6524cd50f8632346a8583f26bf1f4941817ce
---
 tools/db_crashtest.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index 1de1525c06ab..b63347eeee7d 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -474,10 +474,7 @@ def get_dbname(test_name):
                 print("Running DB cleanup command - %s\n" % cleanup_cmd)
                 # Ignore failure
                 os.system(cleanup_cmd)
-            try:
-                os.mkdir(dbname)
-            except OSError:
-                pass
+            os.makedirs(dbname, exist_ok=True)
     return dbname
 
 
@@ -535,6 +532,8 @@ def is_direct_io_supported(dbname):
     if is_remote_db:
         return False
     else:
+        # Note: db dir might be removed on check_mode change. Re-create it
+        os.makedirs(dbname, exist_ok=True)
         with tempfile.NamedTemporaryFile(dir=dbname) as f:
             try:
                 os.open(f.name, os.O_DIRECT)

From a7c1acbe9f808d0b9d6da3d6d5d822b2a2e26af1 Mon Sep 17 00:00:00 2001
From: Xingbo Wang <xbw@meta.com>
Date: Tue, 6 Jan 2026 10:30:34 -0800
Subject: [PATCH 414/500] Mark RateLimiter::GetMode method as const (#14221)

Summary:
Mark RateLimiter::GetMode method as const

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14221

Test Plan: existing unit test

Reviewed By: jaykorean

Differential Revision: D90182630

Pulled By: xingbowang

fbshipit-source-id: 119f3cf0082e285a84ecdca224535f03f2afbf12
---
 include/rocksdb/rate_limiter.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/rocksdb/rate_limiter.h b/include/rocksdb/rate_limiter.h
index 51383ba20adc..ede742aba6ac 100644
--- a/include/rocksdb/rate_limiter.h
+++ b/include/rocksdb/rate_limiter.h
@@ -132,7 +132,7 @@ class RateLimiter {
   }
 
  protected:
-  Mode GetMode() { return mode_; }
+  Mode GetMode() const { return mode_; }
 
  private:
   const Mode mode_;

From 6f03c3dfeaeef60c52d5cb48f278982c24d62312 Mon Sep 17 00:00:00 2001
From: Xingbo Wang <xbw@meta.com>
Date: Tue, 6 Jan 2026 14:29:02 -0800
Subject: [PATCH 415/500] Fix a flaky unit test UdtTombstoneCollapsingTest
 (#14220)

Summary:
As compaction scheduling is not deterministic, the existing check is too strict sometimes, causing test to be flaky.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14220

Test Plan: Unit test

Reviewed By: pdillinger

Differential Revision: D90143556

Pulled By: xingbowang

fbshipit-source-id: 6780423c63324a4b20fc8b8ccac2051a094c9f4a
---
 db/db_with_timestamp_compaction_test.cc | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/db/db_with_timestamp_compaction_test.cc b/db/db_with_timestamp_compaction_test.cc
index 8348316b6c90..08802738f0b8 100644
--- a/db/db_with_timestamp_compaction_test.cc
+++ b/db/db_with_timestamp_compaction_test.cc
@@ -538,9 +538,26 @@ TEST_F(TimestampCompatibleCompactionTest, UdtTombstoneCollapsingTest) {
   // Validate CF size is less than 20% of the total data created to validate the
   // tombstones has collapsed
   uint64_t cf_size = 0;
-  ASSERT_TRUE(
-      db_->GetIntProperty(cfh, DB::Properties::kTotalSstFilesSize, &cf_size));
-  ASSERT_LE(cf_size, 0.2 * kTotalRecords * kValueSize);
+
+  // use TEST_WaitForCompact to wait for compaction to run for a while
+  WaitForCompactOptions wait_for_compact_options;
+  wait_for_compact_options.timeout = std::chrono::seconds(1);
+
+  // For some reason the background compaction never ends when calling
+  // TEST_WaitForCompact without timeout, which causes the test to timeout. This
+  // likely indicates a bug in the compaction picking logic.
+  // TODO (issue #14223, fix potential bug in compaction picking logic)
+  int timeout = 60;
+  auto threshold = kTotalRecords * kValueSize * 0.2;
+
+  do {
+    auto s = dbfull()->TEST_WaitForCompact(wait_for_compact_options);
+    ASSERT_TRUE(s.ok() || s.IsTimedOut());
+    ASSERT_TRUE(
+        db_->GetIntProperty(cfh, DB::Properties::kTotalSstFilesSize, &cf_size));
+  } while (cf_size > threshold && timeout-- > 0);
+
+  ASSERT_LE(cf_size, threshold);
 
   delete cfh;
 }

From 429b36c22d76403d275dd0e6877b08d4cea2bc90 Mon Sep 17 00:00:00 2001
From: zaidoon <zaidoon@cloudflare.com>
Date: Tue, 6 Jan 2026 19:03:18 -0800
Subject: [PATCH 416/500] Add C API for block_align option in
 BlockBasedTableOptions (#14153)

Summary: Pull Request resolved: https://github.com/facebook/rocksdb/pull/14153

Reviewed By: archang19

Differential Revision: D90211012

Pulled By: jaykorean

fbshipit-source-id: fd87d3d74664f75fbe47946764b1d25aa731c020
---
 db/c.cc             | 5 +++++
 include/rocksdb/c.h | 2 ++
 2 files changed, 7 insertions(+)

diff --git a/db/c.cc b/db/c.cc
index 894e35ef8ef8..7abab13a6fda 100644
--- a/db/c.cc
+++ b/db/c.cc
@@ -3763,6 +3763,11 @@ void rocksdb_block_based_options_set_unpartitioned_pinning_tier(
       static_cast<ROCKSDB_NAMESPACE::PinningTier>(v);
 }
 
+void rocksdb_block_based_options_set_block_align(
+    rocksdb_block_based_table_options_t* options, unsigned char v) {
+  options->rep.block_align = v;
+}
+
 /* FlushJobInfo */
 
 const char* rocksdb_flushjobinfo_cf_name(const rocksdb_flushjobinfo_t* info,
diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h
index 08557be3c03b..a50edd2f7ef6 100644
--- a/include/rocksdb/c.h
+++ b/include/rocksdb/c.h
@@ -1231,6 +1231,8 @@ rocksdb_block_based_options_set_partition_pinning_tier(
 extern ROCKSDB_LIBRARY_API void
 rocksdb_block_based_options_set_unpartitioned_pinning_tier(
     rocksdb_block_based_table_options_t*, int);
+extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_block_align(
+    rocksdb_block_based_table_options_t*, unsigned char);
 extern ROCKSDB_LIBRARY_API void rocksdb_options_set_write_buffer_manager(
     rocksdb_options_t* opt, rocksdb_write_buffer_manager_t* wbm);
 extern ROCKSDB_LIBRARY_API void rocksdb_options_set_sst_file_manager(

From 2b28885c805c16506eb6c8cbb75752a8a62c4021 Mon Sep 17 00:00:00 2001
From: Ryan Hancock <krhancock@meta.com>
Date: Wed, 7 Jan 2026 10:34:21 -0800
Subject: [PATCH 417/500] Introducing IO Dispatcher (#14135)

Summary:
This diff introduces the IO Dispatcher, which will be used to simplify the code path for MultiScan, while further providing a centralized place to enact policy on how MultiScan is done (i.e., limit memory usage and pinned buffers for example). Right now this diff only encapsulates the functionality done during the Prepare of MultiScan.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14135

Reviewed By: anand1976

Differential Revision: D87837261

Pulled By: krhancoc

fbshipit-source-id: 2698910ade02bc3d182413ae07ce69fe7abb7ec5
---
 BUCK                                         |   7 +
 Makefile                                     |   3 +
 include/rocksdb/io_dispatcher.h              | 207 ++++
 src.mk                                       |   2 +
 table/block_based/block_based_table_reader.h |   2 +
 util/io_dispatcher_imp.cc                    | 576 +++++++++++
 util/io_dispatcher_imp.h                     |  35 +
 util/io_dispatcher_test.cc                   | 985 +++++++++++++++++++
 8 files changed, 1817 insertions(+)
 create mode 100644 include/rocksdb/io_dispatcher.h
 create mode 100644 util/io_dispatcher_imp.cc
 create mode 100644 util/io_dispatcher_imp.h
 create mode 100644 util/io_dispatcher_test.cc

diff --git a/BUCK b/BUCK
index c7fd89eeb18d..8fa8f35d3d9e 100644
--- a/BUCK
+++ b/BUCK
@@ -268,6 +268,7 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[
         "util/dynamic_bloom.cc",
         "util/file_checksum_helper.cc",
         "util/hash.cc",
+        "util/io_dispatcher_imp.cc",
         "util/murmurhash.cc",
         "util/random.cc",
         "util/rate_limiter.cc",
@@ -5214,6 +5215,12 @@ cpp_unittest_wrapper(name="interval_test",
             extra_compiler_flags=[])
 
 
+cpp_unittest_wrapper(name="io_dispatcher_test",
+            srcs=["util/io_dispatcher_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
 cpp_unittest_wrapper(name="io_posix_test",
             srcs=["env/io_posix_test.cc"],
             deps=[":rocksdb_test_lib"],
diff --git a/Makefile b/Makefile
index 0c31657d76f5..4f62ad5b576e 100644
--- a/Makefile
+++ b/Makefile
@@ -1922,6 +1922,9 @@ blob_source_test: $(OBJ_DIR)/db/blob/blob_source_test.o $(TEST_LIBRARY) $(LIBRAR
 blob_garbage_meter_test: $(OBJ_DIR)/db/blob/blob_garbage_meter_test.o $(TEST_LIBRARY) $(LIBRARY)
 	$(AM_LINK)
 
+io_dispatcher_test: $(OBJ_DIR)/util/io_dispatcher_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
 timer_test: $(OBJ_DIR)/util/timer_test.o $(TEST_LIBRARY) $(LIBRARY)
 	$(AM_LINK)
 
diff --git a/include/rocksdb/io_dispatcher.h b/include/rocksdb/io_dispatcher.h
new file mode 100644
index 000000000000..520be86abf31
--- /dev/null
+++ b/include/rocksdb/io_dispatcher.h
@@ -0,0 +1,207 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <atomic>
+#include <memory>
+#include <unordered_map>
+#include <vector>
+
+#include "rocksdb/options.h"
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+/*
+ * IODispatcher is a class that allows users to submit groups of IO jobs to be
+ * dispatched asynchronously (or synchronously), upon submission the
+ * IODispatcher will return a ReadSet which act as an ownership object of those
+ * IOs. Users read from their readset when they require the data, and either
+ * poll for completion of the block, or read synchronously if the block is not
+ * in cache at that point.
+ *
+ * ReadSets have RAII semantics, meaning on destruction they will cancel any on
+ * going IO, and release the underlying pinned blocks.
+ *
+ * IODispatcher main goal is to act as control plane for all readers using the
+ * dispatcher, allowing for future ratelimiting and smarter dispatching policies
+ * in the future.
+ *
+* Example:
+ // Submitting an IO job and reading blocks:
+ //
+ // std::shared_ptr<IOJob> job = std::make_shared<IOJob>();
+ // job->table = table_reader;  // Provided BlockBasedTable*
+ // job->job_options.io_coalesce_threshold = 32 * 1024;
+ // job->job_options.read_options = read_options;  // Provided ReadOptions
+ //
+ // // Populate the job with block handles (e.g., from an index/iterator)
+ // job->block_handles.push_back(handle1);
+ // job->block_handles.push_back(handle2);
+ // job->block_handles.push_back(handle3);
+ //
+ // std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher());
+ // std::shared_ptr<ReadSet> read_set;
+ // Status s = dispatcher->SubmitJob(job, &read_set);
+ // if (!s.ok()) {
+ //   // Handle submit error
+ // }
+ //
+ // // Read by index
+ // for (size_t i = 1; i < job->block_handles.size(); ++i) {
+ //   CachableEntry<Block> block_entry;
+ //   Status rs = read_set->ReadIndex(i, &block_entry);
+ //   if (!rs.ok()) {
+ //     // Handle read error
+ //     continue;
+ //   }
+ //   // Use block_entry (block contents are pinned here)
+ // }
+ //
+ // // Or read by byte offset
+ // {
+ //   size_t offset = static_cast<size_t>(job->block_handles.front().offset());
+ //   CachableEntry<Block> block_entry;
+ //   Status rs = read_set->ReadOffset(offset, &block_entry);
+ //   if (rs.ok()) {
+ //     // Use block_entry
+ //   }
+ // }
+ //
+ // // Stats
+ // uint64_t cache_hits = read_set->GetNumCacheHits();
+ // uint64_t async_reads = read_set->GetNumAsyncReads();
+ // uint64_t sync_reads = read_set->GetNumSyncReads();
+
+ */
+
+class BlockHandle;
+struct ReadOptions;
+struct AsyncIOState;
+
+template <typename T>
+class CachableEntry;
+class Block;
+class BlockBasedTable;
+
+struct JobOptions {
+  uint64_t io_coalesce_threshold = 16 * 1024;
+  ReadOptions read_options;
+};
+
+class IOJob {
+ public:
+  std::vector<BlockHandle> block_handles;
+
+  // Table reader for accessing block cache and index
+  BlockBasedTable* table = nullptr;
+
+  // Job execution options
+  JobOptions job_options;
+};
+
+/*
+ * ReadSet represents a set of blocks that may be in cache, being read
+ * asynchronously, or need to be read synchronously. The Read() method
+ * transparently handles all three cases.
+ */
+class ReadSet {
+ public:
+  ReadSet() = default;
+  ~ReadSet();
+
+  ReadSet(const ReadSet&) = delete;
+  ReadSet& operator=(const ReadSet&) = delete;
+  ReadSet(ReadSet&&) noexcept = delete;
+  ReadSet& operator=(ReadSet&&) noexcept = delete;
+
+  // Read a block by index
+  // - If the block is in cache, returns it immediately
+  // - If the block is being read asynchronously, polls for completion and
+  // returns it
+  // - If the block needs to be read, performs a synchronous read and returns it
+  //
+  // block_index: Index into the original IOJob's block_handles vector
+  // out: Output parameter for the pinned block entry
+  //
+  // Returns: Status::OK() on success, error status otherwise
+  Status ReadIndex(size_t block_index, CachableEntry<Block>* out);
+  // Read a block by offset
+  // - If the block is in cache, returns it immediately
+  // - If the block is being read asynchronously, polls for completion and
+  // returns it
+  // - If the block needs to be read, performs a synchronous read and returns it
+
+  // block_offset: Byte Offset into the SST file of the block.
+
+  // out: Output parameter for the pinned block entry
+  Status ReadOffset(size_t offset, CachableEntry<Block>* out);
+
+  // Statistics accessors
+  uint64_t GetNumSyncReads() const { return num_sync_reads_; }
+  uint64_t GetNumAsyncReads() const { return num_async_reads_; }
+  uint64_t GetNumCacheHits() const { return num_cache_hits_; }
+
+ private:
+  friend class IODispatcherImpl;
+
+  // Job data
+  std::shared_ptr<IOJob> job_;
+
+  // Storage for pinned blocks (one per block handle in the job)
+  std::vector<CachableEntry<Block>> pinned_blocks_;
+
+  // Sorted index for binary search in ReadOffset.
+  // sorted_block_indices_[i] is the original index of the i-th smallest block
+  // by offset. Built once during SubmitJob for O(log n) ReadOffset lookups.
+  std::vector<size_t> sorted_block_indices_;
+
+  // Map from block index to async IO state for blocks being read
+  // asynchronously. Multiple block indices may map to the same async state when
+  // blocks are coalesced into a single IO request.
+  std::unordered_map<size_t, std::shared_ptr<AsyncIOState>> async_io_map_;
+
+  // Statistics counters
+  std::atomic<uint64_t> num_sync_reads_ = 0;
+  std::atomic<uint64_t> num_async_reads_ = 0;
+  std::atomic<uint64_t> num_cache_hits_ = 0;
+
+  // Poll and process a specific async IO request
+  Status PollAndProcessAsyncIO(
+      const std::shared_ptr<AsyncIOState>& async_state);
+
+  // Perform synchronous read for a specific block
+  Status SyncRead(size_t block_index);
+};
+
+/*
+ * IODispatcher handles IO operations synchronously or asynchronously based
+ * on JobOptions. When async is true, it uses ReadAsync; when false, it uses
+ * standard synchronous reads.
+ * */
+class IODispatcher {
+ protected:
+  IODispatcher() = default;
+
+ public:
+  virtual ~IODispatcher() {}
+
+  IODispatcher(const IODispatcher&) = delete;
+  IODispatcher& operator=(const IODispatcher&) = delete;
+  IODispatcher(IODispatcher&&) = delete;
+  IODispatcher& operator=(IODispatcher&&) = delete;
+
+  // Submit a job for IO processing
+  // job: The IO job to submit
+  // read_set: Output parameter that will be populated with the ReadSet on
+  // success Returns: Status::OK() on success, error status otherwise
+  virtual Status SubmitJob(const std::shared_ptr<IOJob>& job,
+                           std::shared_ptr<ReadSet>* read_set) = 0;
+};
+
+IODispatcher* NewIODispatcher();
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src.mk b/src.mk
index 5eac640572d1..fc54f2804f90 100644
--- a/src.mk
+++ b/src.mk
@@ -264,6 +264,7 @@ LIB_SOURCES =                                                   \
   util/string_util.cc                                           \
   util/thread_local.cc                                          \
   util/threadpool_imp.cc                                        \
+  util/io_dispatcher_imp.cc                                     \
   util/udt_util.cc                                              \
   util/write_batch_util.cc                                      \
   util/xxhash.cc                                                \
@@ -620,6 +621,7 @@ TEST_MAIN_SOURCES =                                                     \
   util/hash_test.cc                                                     \
   util/heap_test.cc                                                     \
   util/interval_test.cc                                                 \
+  util/io_dispatcher_test.cc                                            \
   util/random_test.cc                                                   \
   util/rate_limiter_test.cc                                             \
   util/repeatable_thread_test.cc                                        \
diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h
index b20d0db194e6..b1dfa0c7e0c9 100644
--- a/table/block_based/block_based_table_reader.h
+++ b/table/block_based/block_based_table_reader.h
@@ -575,6 +575,8 @@ class BlockBasedTable : public TableReader {
   friend class PartitionedFilterBlockReader;
   friend class PartitionedFilterBlockTest;
   friend class DBBasicTest_MultiGetIOBufferOverrun_Test;
+  friend class ReadSet;
+  friend class IODispatcherTest;
 };
 
 // Maintaining state of a two-level iteration on a partitioned index structure.
diff --git a/util/io_dispatcher_imp.cc b/util/io_dispatcher_imp.cc
new file mode 100644
index 000000000000..1f247399ddb8
--- /dev/null
+++ b/util/io_dispatcher_imp.cc
@@ -0,0 +1,576 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/io_dispatcher_imp.h"
+
+#include <memory>
+#include <unordered_map>
+#include <vector>
+
+#include "file/random_access_file_reader.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/io_dispatcher.h"
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "table/block_based/cachable_entry.h"
+#include "table/block_based/reader_common.h"
+#include "table/format.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Helper function to create and pin a block from a buffer
+// Used by both ReadSet::PollAndProcessAsyncIO and IODispatcherImpl::Impl
+static Status CreateAndPinBlockFromBuffer(
+    const std::shared_ptr<IOJob>& job, const BlockHandle& block,
+    uint64_t buffer_start_offset, const Slice& buffer_data,
+    CachableEntry<Block>& pinned_block_entry) {
+  auto* rep = job->table->get_rep();
+
+  // Get decompressor
+  UnownedPtr<Decompressor> decompressor = rep->decompressor.get();
+  CachableEntry<DecompressorDict> cached_dict;
+
+  if (rep->uncompression_dict_reader) {
+    Status s = rep->uncompression_dict_reader->GetOrReadUncompressionDictionary(
+        nullptr, job->job_options.read_options, nullptr, nullptr, &cached_dict);
+    if (!s.ok()) {
+      return s;
+    }
+    if (cached_dict.GetValue()) {
+      decompressor = cached_dict.GetValue()->decompressor_.get();
+    }
+  }
+
+  // Create block from buffer data
+  const auto block_size_with_trailer =
+      BlockBasedTable::BlockSizeWithTrailer(block);
+  const auto block_offset_in_buffer = block.offset() - buffer_start_offset;
+
+  CacheAllocationPtr data = AllocateBlock(
+      block_size_with_trailer, GetMemoryAllocator(rep->table_options));
+  memcpy(data.get(), buffer_data.data() + block_offset_in_buffer,
+         block_size_with_trailer);
+  BlockContents tmp_contents(std::move(data), block.size());
+
+#ifndef NDEBUG
+  tmp_contents.has_trailer = rep->footer.GetBlockTrailerSize() > 0;
+#endif
+
+  return job->table->CreateAndPinBlockInCache<Block_kData>(
+      job->job_options.read_options, block, decompressor, &tmp_contents,
+      &pinned_block_entry.As<Block_kData>());
+}
+
+// State for async IO operations (implementation detail)
+struct AsyncIOState {
+  AsyncIOState() : offset(static_cast<uint64_t>(-1)) {}
+  ~AsyncIOState() { read_req.status.PermitUncheckedError(); }
+
+  AsyncIOState(const AsyncIOState&) = delete;
+  AsyncIOState& operator=(const AsyncIOState&) = delete;
+  AsyncIOState(AsyncIOState&&) = default;
+  AsyncIOState& operator=(AsyncIOState&&) = default;
+
+  std::unique_ptr<char[]> buf;
+  AlignedBuf aligned_buf;
+  void* io_handle = nullptr;
+  IOHandleDeleter del_fn;
+  uint64_t offset;
+  std::vector<size_t> block_indices;
+  std::vector<BlockHandle> blocks;
+  FSReadRequest read_req;
+};
+
+// ReadSet destructor - clean up IO handles
+ReadSet::~ReadSet() {
+  for (auto& pair : async_io_map_) {
+    auto& async_state = pair.second;
+    if (async_state->io_handle != nullptr && async_state->del_fn != nullptr) {
+      async_state->del_fn(async_state->io_handle);
+      async_state->io_handle = nullptr;
+    }
+  }
+}
+
+// Main Read() method - transparently handles cache, async IO, and sync reads
+Status ReadSet::ReadIndex(size_t block_index, CachableEntry<Block>* out) {
+  // Bounds check
+  if (block_index >= pinned_blocks_.size()) {
+    return Status::InvalidArgument("Block index out of range");
+  }
+
+  // Case 1: Block is already available (from cache or sync read during
+  // SubmitJob)
+  if (pinned_blocks_[block_index].GetValue()) {
+    *out = std::move(pinned_blocks_[block_index]);
+    // Note: Statistics for this block were already counted during SubmitJob
+    // (either as cache hit or sync read)
+    return Status::OK();
+  }
+
+  // Case 2: Block has async IO in progress - poll and process
+  if (job_->job_options.read_options.async_io) {
+    auto it = async_io_map_.find(block_index);
+    if (it != async_io_map_.end()) {
+      // Get the number of blocks in this coalesced async request BEFORE polling
+      // (since PollAndProcessAsyncIO will remove entries from the map)
+      size_t num_blocks_in_request = it->second->block_indices.size();
+
+      if (Status s = PollAndProcessAsyncIO(it->second); !s.ok()) {
+        return s;
+      }
+      // Count all blocks that were read in this async request
+      num_async_reads_ += num_blocks_in_request;
+
+      // After polling, the block should be in pinned_blocks_
+      if (pinned_blocks_[block_index].GetValue()) {
+        *out = std::move(pinned_blocks_[block_index]);
+        return Status::OK();
+      }
+
+      return Status::IOError("Failed to process async IO result");
+    }
+  }
+
+  // Case 3: Block needs synchronous read
+  Status s = SyncRead(block_index);
+  if (s.ok()) {
+    *out = std::move(pinned_blocks_[block_index]);
+    num_sync_reads_++;
+  }
+  return s;
+}
+
+Status ReadSet::ReadOffset(size_t offset, CachableEntry<Block>* out) {
+  if (sorted_block_indices_.empty()) {
+    return Status::InvalidArgument("ReadSet not initialized");
+  }
+
+  // Use binary search on the sorted index to find the block containing offset.
+  // sorted_block_indices_ contains original indices sorted by block offset.
+  const auto& block_handles = job_->block_handles;
+
+  // Binary search for the first block whose offset is > offset, then back up
+  auto it = std::upper_bound(sorted_block_indices_.begin(),
+                             sorted_block_indices_.end(), offset,
+                             [&block_handles](size_t off, size_t idx) {
+                               return off < block_handles[idx].offset();
+                             });
+
+  // If it == begin(), offset is before all blocks
+  if (it == sorted_block_indices_.begin()) {
+    return Status::InvalidArgument("Offset not found in any block");
+  }
+
+  // Back up to the candidate block (largest offset <= our offset)
+  --it;
+  size_t candidate_idx = *it;
+  const auto& handle = block_handles[candidate_idx];
+
+  // Check if offset falls within this block
+  if (offset >= handle.offset() && offset < (handle.offset() + handle.size())) {
+    return ReadIndex(candidate_idx, out);
+  }
+
+  return Status::InvalidArgument("Offset not found in any block");
+}
+
+// Poll and process async IO for a specific block
+Status ReadSet::PollAndProcessAsyncIO(
+    const std::shared_ptr<AsyncIOState>& async_state) {
+  auto* rep = job_->table->get_rep();
+
+  // Poll for IO completion using FileSystem Poll API
+  std::vector<void*> io_handles = {async_state->io_handle};
+  IOStatus io_s = rep->ioptions.env->GetFileSystem()->Poll(io_handles, 1);
+  if (!io_s.ok()) {
+    return io_s;
+  }
+
+  // Check for read errors
+  if (!async_state->read_req.status.ok()) {
+    return async_state->read_req.status;
+  }
+
+  // Determine which buffer to use
+  const Slice buffer_data =
+      rep->file->use_direct_io()
+          ? Slice(static_cast<const char*>(async_state->aligned_buf.get()),
+                  async_state->read_req.len)
+          : Slice(async_state->buf.get(), async_state->read_req.len);
+
+  // Process all blocks in this async request
+  for (size_t i = 0; i < async_state->block_indices.size(); ++i) {
+    const size_t idx = async_state->block_indices[i];
+    const auto& block_handle = async_state->blocks[i];
+
+    Status s =
+        CreateAndPinBlockFromBuffer(job_, block_handle, async_state->offset,
+                                    buffer_data, pinned_blocks_[idx]);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  // Clean up IO handle
+  if (async_state->io_handle != nullptr && async_state->del_fn != nullptr) {
+    async_state->del_fn(async_state->io_handle);
+    async_state->io_handle = nullptr;
+  }
+
+  // Remove from map - all blocks in this request have been processed
+  // Store indices in a temporary vector to avoid iterator invalidation
+  std::vector<size_t> indices_to_remove = async_state->block_indices;
+  for (const auto idx : indices_to_remove) {
+    async_io_map_.erase(idx);
+  }
+
+  return Status::OK();
+}
+
+// Perform synchronous read for a specific block
+// This performs a direct synchronous read from disk when the block is not in
+// cache
+Status ReadSet::SyncRead(size_t block_index) {
+  const auto& block_handle = job_->block_handles[block_index];
+  auto* rep = job_->table->get_rep();
+
+  return job_->table->RetrieveBlock<Block_kData>(
+      /*prefetch_buffer=*/nullptr, job_->job_options.read_options, block_handle,
+      rep->decompressor.get(), &pinned_blocks_[block_index].As<Block_kData>(),
+      /*get_context=*/nullptr, /*lookup_context=*/nullptr,
+      /*for_compaction=*/false, /*use_cache=*/true,
+      /*async_read=*/false, /*use_block_cache_for_lookup=*/true);
+}
+
+struct IODispatcherImpl::Impl {
+  Impl();
+  ~Impl();
+
+  // Non-copyable and non-movable
+  Impl(const Impl&) = delete;
+  Impl& operator=(const Impl&) = delete;
+  Impl(Impl&&) = delete;
+  Impl& operator=(Impl&&) = delete;
+
+  Status SubmitJob(const std::shared_ptr<IOJob>& job,
+                   std::shared_ptr<ReadSet>* read_set);
+
+ private:
+  void PrepareIORequests(
+      const std::shared_ptr<IOJob>& job,
+      const std::vector<size_t>& block_indices_to_read,
+      const std::vector<BlockHandle>& block_handles,
+      std::vector<FSReadRequest>* read_reqs,
+      std::vector<std::vector<size_t>>* coalesced_block_indices);
+
+  void ExecuteAsyncIO(
+      const std::shared_ptr<IOJob>& job,
+      const std::shared_ptr<ReadSet>& read_set,
+      std::vector<FSReadRequest>& read_reqs,
+      const std::vector<std::vector<size_t>>& coalesced_block_indices);
+
+  Status ExecuteSyncIO(
+      const std::shared_ptr<IOJob>& job,
+      const std::shared_ptr<ReadSet>& read_set,
+      std::vector<FSReadRequest>& read_reqs,
+      const std::vector<std::vector<size_t>>& coalesced_block_indices);
+};
+
+IODispatcherImpl::Impl::Impl() {}
+
+IODispatcherImpl::Impl::~Impl() {}
+
+Status IODispatcherImpl::Impl::SubmitJob(const std::shared_ptr<IOJob>& job,
+                                         std::shared_ptr<ReadSet>* read_set) {
+  if (!read_set) {
+    return Status::InvalidArgument("read_set output parameter is null");
+  }
+
+  auto rs = std::make_shared<ReadSet>();
+
+  // Initialize ReadSet
+  rs->job_ = job;
+  rs->pinned_blocks_.resize(job->block_handles.size());
+
+  // Build sorted index for O(log n) ReadOffset lookups via binary search.
+  // sorted_block_indices_[i] = original index of i-th smallest block by offset.
+  rs->sorted_block_indices_.resize(job->block_handles.size());
+  for (size_t i = 0; i < job->block_handles.size(); ++i) {
+    rs->sorted_block_indices_[i] = i;
+  }
+  std::sort(rs->sorted_block_indices_.begin(), rs->sorted_block_indices_.end(),
+            [&job](size_t a, size_t b) {
+              return job->block_handles[a].offset() <
+                     job->block_handles[b].offset();
+            });
+
+  // Step 1: Check cache and pin cached blocks
+  std::vector<size_t> block_indices_to_read;
+
+  for (size_t i = 0; i < job->block_handles.size(); ++i) {
+    const auto& data_block_handle = job->block_handles[i];
+
+    // Lookup and pin block in cache
+    Status s = job->table->LookupAndPinBlocksInCache<Block_kData>(
+        job->job_options.read_options, data_block_handle,
+        &(rs->pinned_blocks_)[i].As<Block_kData>());
+
+    if (!s.ok()) {
+      continue;
+    }
+
+    if (!(rs->pinned_blocks_)[i].GetValue()) {
+      // Block not in cache - needs to be read from disk
+      block_indices_to_read.emplace_back(i);
+    }
+  }
+
+  // Step 2: Prepare IO requests for blocks not in cache
+  if (block_indices_to_read.empty()) {
+    // All blocks found in cache - count them as cache hits
+    rs->num_cache_hits_ = job->block_handles.size();
+    *read_set = std::move(rs);
+    return Status::OK();
+  }
+
+  // Count cache hits (blocks that were found in cache during lookup above)
+  rs->num_cache_hits_ =
+      job->block_handles.size() - block_indices_to_read.size();
+
+  // Prepare read requests - coalesce adjacent blocks
+  std::vector<FSReadRequest> read_reqs;
+  std::vector<std::vector<size_t>> coalesced_block_indices;
+  PrepareIORequests(job, block_indices_to_read, job->block_handles, &read_reqs,
+                    &coalesced_block_indices);
+
+  // Step 3: Execute IO requests based on JobOptions
+  if (job->job_options.read_options.async_io) {
+    ExecuteAsyncIO(job, rs, read_reqs, coalesced_block_indices);
+  } else {
+    Status s = ExecuteSyncIO(job, rs, read_reqs, coalesced_block_indices);
+    if (!s.ok()) {
+      return s;
+    }
+    // We bump this for sync reads
+    rs->num_sync_reads_ += block_indices_to_read.size();
+  }
+
+  *read_set = std::move(rs);
+  return Status::OK();
+}
+
+void IODispatcherImpl::Impl::PrepareIORequests(
+    const std::shared_ptr<IOJob>& job,
+    const std::vector<size_t>& block_indices_to_read,
+    const std::vector<BlockHandle>& block_handles,
+    std::vector<FSReadRequest>* read_reqs,
+    std::vector<std::vector<size_t>>* coalesced_block_indices) {
+  // This is necessary because block handles may not be in sorted order
+  std::vector<size_t> sorted_block_indices = block_indices_to_read;
+  std::sort(sorted_block_indices.begin(), sorted_block_indices.end(),
+            [&block_handles](size_t a, size_t b) {
+              return block_handles[a].offset() < block_handles[b].offset();
+            });
+
+  assert(coalesced_block_indices->empty());
+  coalesced_block_indices->resize(1);
+
+  for (const auto& block_idx : sorted_block_indices) {
+    if (!coalesced_block_indices->back().empty()) {
+      // Check if we can coalesce with previous block
+      const auto& last_block_handle =
+          block_handles[coalesced_block_indices->back().back()];
+      uint64_t last_block_end =
+          last_block_handle.offset() +
+          BlockBasedTable::BlockSizeWithTrailer(last_block_handle);
+      uint64_t current_start = block_handles[block_idx].offset();
+
+      if (current_start >
+          last_block_end + job->job_options.io_coalesce_threshold) {
+        // Gap too large - start new IO request
+        coalesced_block_indices->emplace_back();
+      }
+    }
+    coalesced_block_indices->back().emplace_back(block_idx);
+  }
+
+  // Create FSReadRequest for each coalesced group
+  assert(read_reqs->empty());
+  read_reqs->reserve(coalesced_block_indices->size());
+
+  for (const auto& block_indices : *coalesced_block_indices) {
+    assert(!block_indices.empty());
+
+    // Find the min and max offsets in this coalesced group
+    // Since blocks are now sorted, first has min offset and last has max
+    const auto& first_block_handle = block_handles[block_indices[0]];
+    const auto& last_block_handle = block_handles[block_indices.back()];
+
+    const auto start_offset = first_block_handle.offset();
+    const auto end_offset =
+        last_block_handle.offset() +
+        BlockBasedTable::BlockSizeWithTrailer(last_block_handle);
+
+    assert(end_offset > start_offset);
+
+    read_reqs->emplace_back();
+    read_reqs->back().offset = start_offset;
+    read_reqs->back().len = end_offset - start_offset;
+    read_reqs->back().scratch = nullptr;
+  }
+}
+
+void IODispatcherImpl::Impl::ExecuteAsyncIO(
+    const std::shared_ptr<IOJob>& job, const std::shared_ptr<ReadSet>& read_set,
+    std::vector<FSReadRequest>& read_reqs,
+    const std::vector<std::vector<size_t>>& coalesced_block_indices) {
+  // Get file and IO options
+  auto* rep = job->table->get_rep();
+  IOOptions io_opts;
+  Status s =
+      rep->file->PrepareIOOptions(job->job_options.read_options, io_opts);
+  if (!s.ok()) {
+    return;
+  }
+
+  const bool direct_io = rep->file->use_direct_io();
+
+  // Submit async read requests and store them in the ReadSet
+  for (size_t i = 0; i < read_reqs.size(); ++i) {
+    auto async_state = std::make_shared<AsyncIOState>();
+
+    async_state->offset = read_reqs[i].offset;
+    async_state->block_indices = coalesced_block_indices[i];
+    async_state->read_req = std::move(read_reqs[i]);
+
+    for (const auto idx : coalesced_block_indices[i]) {
+      async_state->blocks.emplace_back(job->block_handles[idx]);
+    }
+
+    if (direct_io) {
+      async_state->read_req.scratch = nullptr;
+    } else {
+      async_state->buf.reset(new char[async_state->read_req.len]);
+      async_state->read_req.scratch = async_state->buf.get();
+    }
+
+    // Callback for async read completion
+    // TODO: Probably need to make this more useful.
+    auto cb = [](const FSReadRequest& /*req*/, void* /*cb_arg*/) {
+      // Placeholder callback - currently does nothing
+    };
+
+    s = rep->file->ReadAsync(async_state->read_req, io_opts, cb,
+                             async_state.get(), &async_state->io_handle,
+                             &async_state->del_fn,
+                             direct_io ? &async_state->aligned_buf : nullptr);
+
+    if (!s.ok()) {
+      continue;
+    }
+    assert(async_state->io_handle);
+
+    // Mark the status as permitted unchecked since we'll check it later
+    // in PollAndProcessAsyncIO
+
+    // Add async state to map for all blocks in this request
+    for (const auto idx : async_state->block_indices) {
+      read_set->async_io_map_[idx] = async_state;
+    }
+  }
+}
+
+Status IODispatcherImpl::Impl::ExecuteSyncIO(
+    const std::shared_ptr<IOJob>& job, const std::shared_ptr<ReadSet>& read_set,
+    std::vector<FSReadRequest>& read_reqs,
+    const std::vector<std::vector<size_t>>& coalesced_block_indices) {
+  // Get file and IO options
+  auto* rep = job->table->get_rep();
+  IOOptions io_opts;
+  if (Status s =
+          rep->file->PrepareIOOptions(job->job_options.read_options, io_opts);
+      !s.ok()) {
+    return s;
+  }
+
+  const bool direct_io = rep->file->use_direct_io();
+
+  // Setup scratch buffers for MultiRead
+  std::unique_ptr<char[]> buf;
+
+  if (direct_io) {
+    for (auto& read_req : read_reqs) {
+      read_req.scratch = nullptr;
+    }
+  } else {
+    // Allocate a single contiguous buffer for all requests
+    size_t total_len = 0;
+    for (const auto& req : read_reqs) {
+      total_len += req.len;
+    }
+    buf.reset(new char[total_len]);
+    size_t offset = 0;
+    for (auto& read_req : read_reqs) {
+      read_req.scratch = buf.get() + offset;
+      offset += read_req.len;
+    }
+  }
+
+  // Execute MultiRead
+  AlignedBuf aligned_buf;
+  if (Status s =
+          rep->file->MultiRead(io_opts, read_reqs.data(), read_reqs.size(),
+                               direct_io ? &aligned_buf : nullptr);
+      !s.ok()) {
+    return s;
+  }
+
+  for (const auto& rq : read_reqs) {
+    if (!rq.status.ok()) {
+      return rq.status;
+    }
+  }
+
+  // Process all blocks from the MultiRead results
+  for (size_t i = 0; i < coalesced_block_indices.size(); ++i) {
+    const auto& read_req = read_reqs[i];
+    for (const auto& block_idx : coalesced_block_indices[i]) {
+      const auto& block_handle = job->block_handles[block_idx];
+
+      Status create_status = CreateAndPinBlockFromBuffer(
+          job, block_handle, read_req.offset, read_req.result,
+          read_set->pinned_blocks_[block_idx]);
+      if (!create_status.ok()) {
+        return create_status;
+      }
+    }
+  }
+
+  return Status::OK();
+}
+
+IODispatcherImpl::IODispatcherImpl() : impl_(new Impl()) {}
+
+IODispatcherImpl::~IODispatcherImpl() = default;
+
+Status IODispatcherImpl::SubmitJob(const std::shared_ptr<IOJob>& job,
+                                   std::shared_ptr<ReadSet>* read_set) {
+  return impl_->SubmitJob(job, read_set);
+}
+
+IODispatcher* NewIODispatcher() { return new IODispatcherImpl(); }
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/util/io_dispatcher_imp.h b/util/io_dispatcher_imp.h
new file mode 100644
index 000000000000..3324705ada3f
--- /dev/null
+++ b/util/io_dispatcher_imp.h
@@ -0,0 +1,35 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <memory>
+
+#include "rocksdb/io_dispatcher.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class IODispatcherImpl : public IODispatcher {
+ public:
+  explicit IODispatcherImpl();
+  ~IODispatcherImpl() override;
+
+  Status SubmitJob(const std::shared_ptr<IOJob>& job,
+                   std::shared_ptr<ReadSet>* read_set) override;
+
+ private:
+  struct Impl;
+  std::unique_ptr<Impl> impl_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/util/io_dispatcher_test.cc b/util/io_dispatcher_test.cc
new file mode 100644
index 000000000000..9677b4c51897
--- /dev/null
+++ b/util/io_dispatcher_test.cc
@@ -0,0 +1,985 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/io_dispatcher.h"
+
+#include <memory>
+#include <mutex>
+
+#include "db/db_test_util.h"
+#include "db/dbformat.h"
+#include "file/writable_file_writer.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/table.h"
+#include "table/block_based/block_based_table_builder.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/block_based/block_based_table_reader.h"
+
+// Enable io_uring support for this test
+extern "C" bool RocksDbIOUringEnable() { return true; }
+
+namespace ROCKSDB_NAMESPACE {
+
+// Represents a single read operation recorded by the tracking file system
+struct ReadOp {
+  enum Type { kMultiRead, kReadAsync };
+  Type type;
+  // For MultiRead: contains all (offset, len) pairs in the request
+  // For ReadAsync: contains a single (offset, len) pair
+  std::vector<std::pair<uint64_t, size_t>> requests;
+};
+
+// Forward declaration
+class ReadTrackingFS;
+
+// Wrapper around FSRandomAccessFile that tracks read operations
+class ReadTrackingRandomAccessFile : public FSRandomAccessFileOwnerWrapper {
+ public:
+  ReadTrackingRandomAccessFile(std::unique_ptr<FSRandomAccessFile>&& file,
+                               ReadTrackingFS* fs)
+      : FSRandomAccessFileOwnerWrapper(std::move(file)), fs_(fs) {}
+
+  IOStatus MultiRead(FSReadRequest* reqs, size_t num_reqs,
+                     const IOOptions& options, IODebugContext* dbg) override;
+
+  IOStatus ReadAsync(FSReadRequest& req, const IOOptions& opts,
+                     std::function<void(FSReadRequest&, void*)> cb,
+                     void* cb_arg, void** io_handle, IOHandleDeleter* del_fn,
+                     IODebugContext* dbg) override;
+
+ private:
+  ReadTrackingFS* fs_;
+};
+
+// FileSystem wrapper that tracks all read operations for verification
+class ReadTrackingFS : public FileSystemWrapper {
+ public:
+  explicit ReadTrackingFS(const std::shared_ptr<FileSystem>& target)
+      : FileSystemWrapper(target) {}
+
+  static const char* kClassName() { return "ReadTrackingFS"; }
+  const char* Name() const override { return kClassName(); }
+
+  IOStatus NewRandomAccessFile(const std::string& fname,
+                               const FileOptions& opts,
+                               std::unique_ptr<FSRandomAccessFile>* result,
+                               IODebugContext* dbg) override {
+    std::unique_ptr<FSRandomAccessFile> file;
+    IOStatus s = target()->NewRandomAccessFile(fname, opts, &file, dbg);
+    if (s.ok()) {
+      result->reset(new ReadTrackingRandomAccessFile(std::move(file), this));
+    }
+    return s;
+  }
+
+  // Record a MultiRead operation
+  void RecordMultiRead(const std::vector<std::pair<uint64_t, size_t>>& reqs) {
+    std::lock_guard<std::mutex> lock(mutex_);
+    ReadOp op;
+    op.type = ReadOp::kMultiRead;
+    op.requests = reqs;
+    read_ops_.push_back(std::move(op));
+  }
+
+  // Record a ReadAsync operation
+  void RecordReadAsync(uint64_t offset, size_t len) {
+    std::lock_guard<std::mutex> lock(mutex_);
+    ReadOp op;
+    op.type = ReadOp::kReadAsync;
+    op.requests.push_back({offset, len});
+    read_ops_.push_back(std::move(op));
+  }
+
+  // Get all recorded read operations
+  std::vector<ReadOp> GetReadOps() const {
+    std::lock_guard<std::mutex> lock(mutex_);
+    return read_ops_;
+  }
+
+  // Clear recorded read operations
+  void ClearReadOps() {
+    std::lock_guard<std::mutex> lock(mutex_);
+    read_ops_.clear();
+  }
+
+  // Get count of MultiRead operations
+  size_t GetMultiReadCount() const {
+    std::lock_guard<std::mutex> lock(mutex_);
+    size_t count = 0;
+    for (const auto& op : read_ops_) {
+      if (op.type == ReadOp::kMultiRead) {
+        count++;
+      }
+    }
+    return count;
+  }
+
+  // Get count of ReadAsync operations
+  size_t GetReadAsyncCount() const {
+    std::lock_guard<std::mutex> lock(mutex_);
+    size_t count = 0;
+    for (const auto& op : read_ops_) {
+      if (op.type == ReadOp::kReadAsync) {
+        count++;
+      }
+    }
+    return count;
+  }
+
+ private:
+  mutable std::mutex mutex_;
+  std::vector<ReadOp> read_ops_;
+};
+
+IOStatus ReadTrackingRandomAccessFile::MultiRead(FSReadRequest* reqs,
+                                                 size_t num_reqs,
+                                                 const IOOptions& options,
+                                                 IODebugContext* dbg) {
+  // Record the read operation before executing it
+  std::vector<std::pair<uint64_t, size_t>> recorded_reqs;
+  recorded_reqs.reserve(num_reqs);
+  for (size_t i = 0; i < num_reqs; i++) {
+    recorded_reqs.push_back({reqs[i].offset, reqs[i].len});
+  }
+  fs_->RecordMultiRead(recorded_reqs);
+
+  // Delegate to underlying file
+  return target()->MultiRead(reqs, num_reqs, options, dbg);
+}
+
+IOStatus ReadTrackingRandomAccessFile::ReadAsync(
+    FSReadRequest& req, const IOOptions& opts,
+    std::function<void(FSReadRequest&, void*)> cb, void* cb_arg,
+    void** io_handle, IOHandleDeleter* del_fn, IODebugContext* dbg) {
+  // Record the read operation before executing it
+  fs_->RecordReadAsync(req.offset, req.len);
+
+  // Delegate to underlying file
+  return target()->ReadAsync(req, opts, cb, cb_arg, io_handle, del_fn, dbg);
+}
+
+class IODispatcherTest : public DBTestBase {
+ public:
+  IODispatcherTest()
+      : DBTestBase("io_dispatcher_test", /*env_do_fsync=*/false) {}
+
+  ~IODispatcherTest() override {
+    // Close any open tables
+    for (auto& table : tables_) {
+      table.reset();
+    }
+    tables_.clear();
+  }
+
+  // Helper to collect block handles from a table
+  // We use TEST_GetDataBlockHandle to get handles for specific keys
+  // Since we know the keys we inserted, we can collect their block handles
+  Status CollectBlockHandles(BlockBasedTable* table, size_t num_keys,
+                             std::vector<BlockHandle>* block_handles_out) {
+    block_handles_out->clear();
+
+    ReadOptions read_options;
+    std::unordered_set<uint64_t> seen_offsets;
+
+    // Iterate through all keys and get their block handles
+    // We collect unique block handles (same block might contain multiple keys)
+    IndexBlockIter iiter_on_stack;
+    BlockCacheLookupContext context{TableReaderCaller::kUserVerifyChecksum};
+    auto iiter = table->NewIndexIterator(read_options, false, &iiter_on_stack,
+                                         nullptr, &context);
+    std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
+    if (iiter != &iiter_on_stack) {
+      iiter_unique_ptr.reset(iiter);
+    }
+
+    // Position the iterator at the first entry
+    iiter->SeekToFirst();
+
+    while (iiter->Valid()) {
+      auto handle = iiter->value().handle;
+      if (seen_offsets.find(handle.offset()) == seen_offsets.end()) {
+        block_handles_out->push_back(handle);
+        seen_offsets.insert(handle.offset());
+        if (block_handles_out->size() >= num_keys) {
+          break;
+        }
+      }
+      iiter->Next();
+    }
+
+    return Status::OK();
+  }
+
+  std::string test_dir_{};
+  Env* env_{};
+  std::shared_ptr<FileSystem> base_fs_;
+  std::shared_ptr<ReadTrackingFS> tracking_fs_;
+
+  std::string Path(const std::string& fname) { return test_dir_ + "/" + fname; }
+
+  void SetUp() override {
+    SetupSyncPointsToMockDirectIO();
+    test_dir_ = test::PerThreadDBPath("block_based_table_reader_test");
+    env_ = Env::Default();
+    base_fs_ = FileSystem::Default();
+    tracking_fs_ = std::make_shared<ReadTrackingFS>(base_fs_);
+    ASSERT_OK(base_fs_->CreateDir(test_dir_, IOOptions(), nullptr));
+  }
+
+  void TearDown() override { EXPECT_OK(DestroyDir(env_, test_dir_)); }
+
+  void NewFileWriter(const std::string& filename,
+                     std::unique_ptr<WritableFileWriter>* writer) {
+    std::string path = Path(filename);
+    EnvOptions env_options;
+    FileOptions foptions;
+    std::unique_ptr<FSWritableFile> file;
+    ASSERT_OK(base_fs_->NewWritableFile(path, foptions, &file, nullptr));
+    writer->reset(new WritableFileWriter(std::move(file), path, env_options));
+  }
+
+  void NewFileReader(const std::string& filename, const FileOptions& opt,
+                     std::unique_ptr<RandomAccessFileReader>* reader,
+                     Statistics* stats = nullptr) {
+    std::string path = Path(filename);
+    std::unique_ptr<FSRandomAccessFile> f;
+    // Use tracking_fs_ to record read operations
+    ASSERT_OK(tracking_fs_->NewRandomAccessFile(path, opt, &f, nullptr));
+    reader->reset(new RandomAccessFileReader(std::move(f), path,
+                                             env_->GetSystemClock().get(),
+                                             /*io_tracer=*/nullptr,
+                                             /*stats=*/stats));
+  }
+
+  std::vector<std::shared_ptr<Statistics>> all_stats_;
+  std::vector<std::unique_ptr<BlockBasedTable>> tables_;
+
+  // Options must be stored as member variables to avoid use-after-scope
+  // The BlockBasedTable keeps references to these options
+  std::vector<std::unique_ptr<ImmutableOptions>> all_ioptions_;
+  std::vector<std::unique_ptr<EnvOptions>> all_env_options_;
+
+  // Helper to create an SST file and open it as a table
+  // Following pattern from table_test.cc TableConstructor
+  Status CreateAndOpenSST(int num_blocks,
+                          std::unique_ptr<BlockBasedTable>* table,
+                          std::vector<BlockHandle>* block_handles_out) {
+    // Create options - store in member variables to avoid use-after-scope
+    // The BlockBasedTable will keep references to these options
+    Options options{};
+    options.statistics = nullptr;
+    BlockBasedTableOptions table_options;
+    table_options.block_cache = NewLRUCache(8 * 1024 * 1024);
+    table_options.block_size = 16 * 1024;
+    table_options.no_block_cache = false;
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+    // Store these in member variables so they outlive the function
+    auto ioptions = std::make_unique<ImmutableOptions>(options);
+    auto moptions = MutableCFOptions{options};
+    InternalKeyComparator internal_comparator(options.comparator);
+
+    // Create in-memory file using StringSink (like table_test.cc)
+    auto table_name = "test_table";
+    std::unique_ptr<WritableFileWriter> file_writer;
+    NewFileWriter(table_name, &file_writer);
+
+    // Create table builder
+    std::string column_family_name;
+    const ReadOptions read_options;
+    const WriteOptions write_options;
+    std::vector<std::unique_ptr<InternalTblPropCollFactory>>
+        int_tbl_prop_coll_factories;
+    TableBuilderOptions builder_options(
+        *ioptions, moptions, read_options, write_options, internal_comparator,
+        &int_tbl_prop_coll_factories, kNoCompression, options.compression_opts,
+        0 /* column_family_id */, column_family_name, -1 /* level */,
+        kUnknownNewestKeyTime);
+
+    std::unique_ptr<TableBuilder> builder(
+        options.table_factory->NewTableBuilder(builder_options,
+                                               file_writer.get()));
+
+    Status s;
+    auto rnd = Random::GetTLSInstance();
+    // Add keys to the table
+    // 10k * 1Kib = ~10MiB
+    for (int i = 0; i < 10000; i++) {
+      std::string value = rnd->RandomString(2 << 10);
+      InternalKey ikey(Key(i), i, kTypeValue);
+      builder->Add(ikey.Encode(), value);
+    }
+    s = builder->Finish();
+    if (!s.ok()) {
+      return s;
+    }
+
+    uint64_t file_size = builder->FileSize();
+
+    IOOptions io_options;
+    s = file_writer->Flush(io_options);
+    if (!s.ok()) {
+      return s;
+    }
+
+    // Now open the file for reading using StringSource (like table_test.cc)
+    std::unique_ptr<RandomAccessFileReader> file;
+    FileOptions foptions;
+    foptions.use_direct_reads = false;
+
+    NewFileReader(table_name, foptions, &file, nullptr);
+
+    // Store EnvOptions and InternalKeyComparator to avoid use-after-scope
+    auto soptions = std::make_unique<EnvOptions>();
+    BlockCacheTracer block_cache_tracer;
+    std::unique_ptr<TableReader> table_reader;
+
+    auto ikc = InternalKeyComparator(options.comparator);
+    TableReaderOptions reader_options(*ioptions, moptions.prefix_extractor,
+                                      moptions.compression_manager.get(),
+                                      *soptions, ikc,
+                                      0 /* block_protection_bytes_per_key */);
+
+    s = options.table_factory->NewTableReader(reader_options, std::move(file),
+                                              file_size, &table_reader);
+
+    if (!s.ok()) {
+      return s;
+    }
+
+    table->reset(static_cast<BlockBasedTable*>(table_reader.release()));
+
+    // Collect actual block handles from the table's index
+    // This is similar to how block_based_table_iterator.cc CollectBlockHandles
+    // works
+    s = CollectBlockHandles(table->get(), num_blocks, block_handles_out);
+    if (!s.ok()) {
+      return s;
+    }
+
+    // Store all options in member variables to keep them alive
+    all_ioptions_.push_back(std::move(ioptions));
+    all_env_options_.push_back(std::move(soptions));
+
+    return Status::OK();
+  }
+
+  static uint64_t cur_file_num_;
+};
+
+uint64_t IODispatcherTest::cur_file_num_ = 1;
+
+TEST_F(IODispatcherTest, BasicSSTRead) {
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher());
+
+  std::unique_ptr<BlockBasedTable> table;
+  std::vector<BlockHandle> block_handles;
+  Status s = CreateAndOpenSST(50, &table, &block_handles);
+  ASSERT_OK(s);
+  ASSERT_NE(table, nullptr);
+  ASSERT_GT(block_handles.size(), 0);
+
+  auto job = std::make_shared<IOJob>();
+  job->block_handles = block_handles;
+  job->table = table.get();
+  ReadOptions read_options;
+  job->job_options.read_options.async_io = true;
+
+  std::shared_ptr<ReadSet> read_set;
+  s = dispatcher->SubmitJob(job, &read_set);
+  ASSERT_OK(s);
+  ASSERT_NE(read_set, nullptr);
+
+  // Read blocks using the new ReadSet API and verify they are valid
+  // ReadIndex will poll for async IO completion internally, no need to sleep
+  for (size_t i = 0; i < block_handles.size(); ++i) {
+    CachableEntry<Block> block;
+    Status read_status = read_set->ReadIndex(i, &block);
+    ASSERT_OK(read_status);
+    ASSERT_NE(block.GetValue(), nullptr);
+
+    // Verify the block has reasonable content
+    const Block* block_ptr = block.GetValue();
+    ASSERT_GT(block_ptr->size(), 0);
+  }
+
+  // Verify statistics - some blocks should have been read asynchronously
+  // Note: actual counts depend on cache behavior and IO completion
+  uint64_t total_reads = read_set->GetNumSyncReads() +
+                         read_set->GetNumAsyncReads() +
+                         read_set->GetNumCacheHits();
+  ASSERT_EQ(total_reads, block_handles.size());
+}
+
+TEST_F(IODispatcherTest, MultipleSSTFiles) {
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher());
+
+  std::vector<std::shared_ptr<ReadSet>> read_sets;
+  std::vector<std::vector<BlockHandle>> all_block_handles;
+
+  // Create and submit jobs for multiple SST files
+  for (int i = 0; i < 3; i++) {
+    std::unique_ptr<BlockBasedTable> table;
+    std::vector<BlockHandle> block_handles;
+
+    Status s = CreateAndOpenSST(30 + i * 10, &table, &block_handles);
+    ASSERT_OK(s);
+
+    auto job = std::make_shared<IOJob>();
+    job->block_handles = block_handles;
+    job->table = table.get();
+    tables_.push_back(std::move(table));
+
+    all_block_handles.push_back(block_handles);
+    std::shared_ptr<ReadSet> read_set;
+    s = dispatcher->SubmitJob(job, &read_set);
+    ASSERT_OK(s);
+    read_sets.push_back(read_set);
+  }
+
+  // Verify all ReadSets can read their blocks successfully
+  // ReadIndex will poll for async IO completion internally, no need to sleep
+  for (size_t i = 0; i < read_sets.size(); ++i) {
+    for (size_t j = 0; j < all_block_handles[i].size(); ++j) {
+      CachableEntry<Block> block;
+      Status read_status = read_sets[i]->ReadIndex(j, &block);
+      ASSERT_OK(read_status);
+      ASSERT_NE(block.GetValue(), nullptr);
+    }
+  }
+}
+
+TEST_F(IODispatcherTest, StatisticsTracking) {
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher());
+
+  std::unique_ptr<BlockBasedTable> table;
+  std::vector<BlockHandle> block_handles;
+  Status s = CreateAndOpenSST(30, &table, &block_handles);
+  ASSERT_OK(s);
+  ASSERT_NE(table, nullptr);
+  ASSERT_GT(block_handles.size(), 0);
+
+  auto job = std::make_shared<IOJob>();
+  job->block_handles = block_handles;
+  job->table = table.get();
+  job->job_options.read_options.async_io = true;
+
+  std::shared_ptr<ReadSet> read_set;
+  s = dispatcher->SubmitJob(job, &read_set);
+  ASSERT_OK(s);
+  ASSERT_NE(read_set, nullptr);
+
+  // Read all blocks - ReadIndex handles polling for async IO completion
+  for (size_t i = 0; i < block_handles.size(); ++i) {
+    CachableEntry<Block> block;
+    Status read_status = read_set->ReadIndex(i, &block);
+    ASSERT_OK(read_status);
+    ASSERT_NE(block.GetValue(), nullptr);
+  }
+
+  // Read the same blocks again - should all be cache hits now
+  std::shared_ptr<ReadSet> read_set2;
+  s = dispatcher->SubmitJob(job, &read_set2);
+  ASSERT_OK(s);
+
+  for (size_t i = 0; i < block_handles.size(); ++i) {
+    CachableEntry<Block> block;
+    Status read_status = read_set2->ReadIndex(i, &block);
+    ASSERT_OK(read_status);
+    ASSERT_NE(block.GetValue(), nullptr);
+  }
+
+  // After reading all blocks, verify statistics
+  uint64_t num_sync = read_set->GetNumSyncReads();
+  uint64_t num_async = read_set->GetNumAsyncReads();
+  uint64_t num_cache = read_set->GetNumCacheHits();
+
+  // Total reads should equal number of blocks
+  uint64_t total_reads = num_sync + num_async + num_cache;
+  ASSERT_EQ(total_reads, block_handles.size());
+}
+TEST_F(IODispatcherTest, AsyncAndSyncRead) {
+  // This test verifies the difference between async_io=true and async_io=false
+  // by checking the statistics after reading all blocks.
+  // Note: When io_uring is not available, async_io=true will fall back to sync.
+
+  for (auto async : {true, false}) {
+    std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher());
+
+    std::unique_ptr<BlockBasedTable> table;
+    std::vector<BlockHandle> block_handles;
+    Status s = CreateAndOpenSST(40, &table, &block_handles);
+    ASSERT_OK(s);
+    ASSERT_NE(table, nullptr);
+    ASSERT_GT(block_handles.size(), 0);
+
+    auto job = std::make_shared<IOJob>();
+    job->block_handles = block_handles;
+    job->table = table.get();
+    ReadOptions read_options;
+    // Ensure we don't use cache for this test - we want fresh reads
+    read_options.fill_cache = false;
+    job->job_options.read_options.async_io = async;
+
+    std::shared_ptr<ReadSet> read_set;
+    s = dispatcher->SubmitJob(job, &read_set);
+    ASSERT_OK(s);
+    ASSERT_NE(read_set, nullptr);
+
+    // Read all blocks - ReadIndex handles polling for async IO internally
+    for (size_t i = 0; i < block_handles.size(); ++i) {
+      CachableEntry<Block> block;
+      Status read_status = read_set->ReadIndex(i, &block);
+      ASSERT_OK(read_status);
+      ASSERT_NE(block.GetValue(), nullptr);
+
+      // Verify the block has reasonable content
+      const Block* block_ptr = block.GetValue();
+      ASSERT_GT(block_ptr->size(), 0);
+    }
+
+    // Verify statistics
+    uint64_t num_sync = read_set->GetNumSyncReads();
+    uint64_t num_async = read_set->GetNumAsyncReads();
+    uint64_t num_cache = read_set->GetNumCacheHits();
+
+    // Total reads should equal number of blocks
+    uint64_t total_reads = num_sync + num_async + num_cache;
+    EXPECT_EQ(total_reads, block_handles.size());
+
+    // When async_io is false, we always expect sync reads
+    if (!async) {
+      EXPECT_GT(num_sync, 0) << "Expected sync reads when async_io=false";
+      EXPECT_EQ(num_async, 0) << "Expected no async reads when async_io=false";
+    }
+    // When async_io is true:
+    // - If io_uring is available, we expect async reads
+    // - If io_uring is NOT available, ReadAsync returns NotSupported and
+    //   we fall back to sync reads. This is valid behavior.
+    // So we only verify that ALL blocks were read (checked above).
+  }
+}
+
+TEST_F(IODispatcherTest, VerifyBlockContent) {
+  // Test that blocks retrieved through ReadSet contain the correct data
+  // that was written to the SST file
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher());
+
+  std::unique_ptr<BlockBasedTable> table;
+  std::vector<BlockHandle> block_handles;
+  Status s = CreateAndOpenSST(50, &table, &block_handles);
+  ASSERT_OK(s);
+  ASSERT_NE(table, nullptr);
+  ASSERT_GT(block_handles.size(), 0);
+
+  auto job = std::make_shared<IOJob>();
+  job->block_handles = block_handles;
+  job->table = table.get();
+  ReadOptions read_options;
+  job->job_options.read_options.async_io = false;
+
+  std::shared_ptr<ReadSet> read_set;
+  s = dispatcher->SubmitJob(job, &read_set);
+  ASSERT_OK(s);
+  ASSERT_NE(read_set, nullptr);
+
+  // Read each block and verify its content
+  int t = 0;
+  for (size_t i = 0; i < block_handles.size(); ++i) {
+    CachableEntry<Block> block_entry;
+    Status read_status = read_set->ReadIndex(i, &block_entry);
+    ASSERT_OK(read_status);
+    ASSERT_NE(block_entry.GetValue(), nullptr);
+
+    Block* block = block_entry.GetValue();
+    ASSERT_GT(block->size(), 0);
+
+    // Create an iterator to walk through the block's keys
+    // We use InternalKeyComparator for data blocks
+    InternalKeyComparator internal_comparator(BytewiseComparator());
+    std::unique_ptr<DataBlockIter> iter(block->NewDataIterator(
+        internal_comparator.user_comparator(), kDisableGlobalSequenceNumber));
+
+    // Iterate through all keys in this block
+    size_t num_keys_in_block = 0;
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      num_keys_in_block++;
+
+      // Verify key is not empty
+      ASSERT_GT(iter->key().size(), 0)
+          << "Block " << i << " contains empty key";
+
+      // Verify value is not empty (we wrote 1KB values)
+      ASSERT_GT(iter->value().size(), 2 ^ 10)
+          << "Block " << i << " contains empty value";
+
+      // Parse the internal key
+      ParsedInternalKey parsed_key;
+      Status parse_status =
+          ParseInternalKey(iter->key(), &parsed_key, true /* log_err */);
+      ASSERT_OK(parse_status) << "Failed to parse internal key in block " << i;
+
+      // Verify the key matches the expected format from CreateAndOpenSST
+      // Keys are created with Key(i) which generates keys like "key000000"
+      std::string user_key = parsed_key.user_key.ToString();
+      auto check = Key(t);
+      t++;
+      ASSERT_TRUE(user_key.find("key") == 0)
+          << "Unexpected key format in block " << i << ": " << user_key;
+
+      ASSERT_EQ(check.c_str(), user_key);
+
+      // Verify value type is correct (should be kTypeValue)
+      ASSERT_EQ(parsed_key.type, kTypeValue)
+          << "Unexpected value type in block " << i;
+    }
+
+    // Verify iterator status after iteration
+    ASSERT_OK(iter->status()) << "Iterator error in block " << i;
+
+    // Each block should contain at least one key
+    ASSERT_GT(num_keys_in_block, 0) << "Block " << i << " contains no keys";
+  }
+}
+
+// We want to test here that even when we DONT read from the readset that all
+// pinned blocks will be unpinned.
+TEST_F(IODispatcherTest, ReadSetDestroysUnpinsBlocks) {
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher());
+
+  std::unique_ptr<BlockBasedTable> table;
+  std::vector<BlockHandle> block_handles;
+  Status s = CreateAndOpenSST(30, &table, &block_handles);
+  ASSERT_OK(s);
+  ASSERT_NE(table, nullptr);
+  ASSERT_EQ(block_handles.size(), 30);
+
+  auto job = std::make_shared<IOJob>();
+  job->block_handles = block_handles;
+  job->table = table.get();
+  ReadOptions read_options;
+  job->job_options.read_options.async_io =
+      false;  // Use sync IO so blocks are pinned immediately
+
+  auto* rep = table->get_rep();
+  auto cache = rep->table_options.block_cache.get();
+  ASSERT_NE(cache, nullptr);
+
+  auto initial_pinned_usage = cache->GetPinnedUsage();
+  ASSERT_EQ(initial_pinned_usage, 0);
+
+  {
+    std::shared_ptr<ReadSet> read_set;
+    Status t = dispatcher->SubmitJob(job, &read_set);
+    ASSERT_OK(t);
+    ASSERT_NE(read_set, nullptr);
+
+    // With sync IO, blocks are already pinned in read_set->pinned_blocks_
+    // We do NOT call read_set->Read() - blocks should remain in pinned_blocks_
+
+    // At this point, blocks should be pinned in the ReadSet
+    auto pinned_usage_with_blocks = cache->GetPinnedUsage();
+    ASSERT_GT(pinned_usage_with_blocks, initial_pinned_usage)
+        << "Expected pinned usage to increase after SubmitJob, but "
+        << "initial=" << initial_pinned_usage
+        << " current=" << pinned_usage_with_blocks;
+
+    // ReadSet goes out of scope here, its destructor should unpin all blocks
+  }
+
+  // ReadSet destroyed - all blocks should be unpinned
+  auto final_pinned_usage = cache->GetPinnedUsage();
+  ASSERT_EQ(final_pinned_usage, initial_pinned_usage)
+      << "Expected pinned usage to return to initial value after ReadSet "
+      << "destruction, but initial=" << initial_pinned_usage
+      << " final=" << final_pinned_usage;
+}
+
+// Test that verifies the exact sequence of reads issued by the IO dispatcher.
+// This uses the ReadTrackingFS to capture all read operations and verify
+// that async_io=true uses ReadAsync while async_io=false uses MultiRead.
+TEST_F(IODispatcherTest, VerifyReadSequence) {
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher());
+
+  std::unique_ptr<BlockBasedTable> table;
+  std::vector<BlockHandle> block_handles;
+  Status s = CreateAndOpenSST(20, &table, &block_handles);
+  ASSERT_OK(s);
+  ASSERT_NE(table, nullptr);
+  ASSERT_GE(block_handles.size(), 10);
+
+  // Clear any reads from table opening
+  tracking_fs_->ClearReadOps();
+
+  // Test 1: Synchronous reads should use MultiRead
+  {
+    auto job = std::make_shared<IOJob>();
+    job->block_handles = block_handles;
+    job->table = table.get();
+    job->job_options.read_options.async_io = false;
+
+    std::shared_ptr<ReadSet> read_set;
+    s = dispatcher->SubmitJob(job, &read_set);
+    ASSERT_OK(s);
+    ASSERT_NE(read_set, nullptr);
+
+    // Read all blocks
+    for (size_t i = 0; i < block_handles.size(); ++i) {
+      CachableEntry<Block> block;
+      Status read_status = read_set->ReadIndex(i, &block);
+      ASSERT_OK(read_status);
+      ASSERT_NE(block.GetValue(), nullptr);
+    }
+
+    // Verify that MultiRead was used for sync reads
+    auto read_ops = tracking_fs_->GetReadOps();
+    ASSERT_GT(tracking_fs_->GetMultiReadCount(), 0)
+        << "Expected MultiRead to be called for sync reads";
+    ASSERT_EQ(tracking_fs_->GetReadAsyncCount(), 0)
+        << "Expected no ReadAsync calls for sync reads";
+
+    // Verify MultiRead requests cover all blocks
+    size_t total_blocks_in_multireads = 0;
+    for (const auto& op : read_ops) {
+      if (op.type == ReadOp::kMultiRead) {
+        // Each MultiRead request may contain multiple coalesced blocks
+        total_blocks_in_multireads += op.requests.size();
+      }
+    }
+    // Note: blocks may be coalesced, so we check that reads were issued
+    ASSERT_GT(total_blocks_in_multireads, 0);
+  }
+
+  // Clear reads and test async mode
+  tracking_fs_->ClearReadOps();
+
+  // Test 2: Async reads should use ReadAsync
+  {
+    // Create a new table to avoid cache hits
+    std::unique_ptr<BlockBasedTable> table2;
+    std::vector<BlockHandle> block_handles2;
+    s = CreateAndOpenSST(20, &table2, &block_handles2);
+    ASSERT_OK(s);
+
+    tracking_fs_->ClearReadOps();
+
+    auto job = std::make_shared<IOJob>();
+    job->block_handles = block_handles2;
+    job->table = table2.get();
+    job->job_options.read_options.async_io = true;
+
+    std::shared_ptr<ReadSet> read_set;
+    s = dispatcher->SubmitJob(job, &read_set);
+    ASSERT_OK(s);
+    ASSERT_NE(read_set, nullptr);
+
+    // Verify that ReadAsync was used
+    ASSERT_GT(tracking_fs_->GetReadAsyncCount(), 0)
+        << "Expected ReadAsync to be called for async reads";
+    ASSERT_EQ(tracking_fs_->GetMultiReadCount(), 0)
+        << "Expected no MultiRead calls for async reads";
+
+    // Read blocks - ReadIndex will poll for async IO completion internally
+    for (size_t i = 0; i < block_handles2.size(); ++i) {
+      CachableEntry<Block> block;
+      Status read_status = read_set->ReadIndex(i, &block);
+      ASSERT_OK(read_status);
+      ASSERT_NE(block.GetValue(), nullptr);
+    }
+  }
+}
+
+// Test that verifies the coalescing logic: adjacent blocks within the
+// coalesce threshold should be combined into a single read request.
+TEST_F(IODispatcherTest, VerifyCoalescing) {
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher());
+
+  std::unique_ptr<BlockBasedTable> table;
+  std::vector<BlockHandle> block_handles;
+  // Get many blocks so we can test coalescing behavior
+  Status s = CreateAndOpenSST(50, &table, &block_handles);
+  ASSERT_OK(s);
+  ASSERT_NE(table, nullptr);
+  ASSERT_GE(block_handles.size(), 20);
+
+  tracking_fs_->ClearReadOps();
+
+  // Test coalescing with sync reads (uses MultiRead)
+  {
+    auto job = std::make_shared<IOJob>();
+    // Use a subset of adjacent blocks
+    std::vector<BlockHandle> adjacent_blocks;
+    for (size_t i = 0; i < 10 && i < block_handles.size(); ++i) {
+      adjacent_blocks.push_back(block_handles[i]);
+    }
+    job->block_handles = adjacent_blocks;
+    job->table = table.get();
+    job->job_options.read_options.async_io = false;
+    // Set a large coalesce threshold so all adjacent blocks are combined
+    job->job_options.io_coalesce_threshold = 1024 * 1024;  // 1MB
+
+    std::shared_ptr<ReadSet> read_set;
+    s = dispatcher->SubmitJob(job, &read_set);
+    ASSERT_OK(s);
+
+    for (size_t i = 0; i < adjacent_blocks.size(); ++i) {
+      CachableEntry<Block> block;
+      Status read_status = read_set->ReadIndex(i, &block);
+      ASSERT_OK(read_status);
+      ASSERT_NE(block.GetValue(), nullptr);
+    }
+
+    // With a large coalesce threshold and adjacent blocks, we expect
+    // all blocks to be coalesced into a single MultiRead request
+    auto read_ops = tracking_fs_->GetReadOps();
+    size_t multiread_count = 0;
+    size_t total_requests_in_multireads = 0;
+    for (const auto& op : read_ops) {
+      if (op.type == ReadOp::kMultiRead) {
+        multiread_count++;
+        total_requests_in_multireads += op.requests.size();
+      }
+    }
+
+    // Adjacent blocks should be coalesced into a single read request
+    // (assuming they're within the coalesce threshold)
+    EXPECT_EQ(multiread_count, 1)
+        << "Expected 1 MultiRead call with coalesced blocks";
+    EXPECT_EQ(total_requests_in_multireads, 1)
+        << "Expected all adjacent blocks to be coalesced into 1 request";
+  }
+
+  tracking_fs_->ClearReadOps();
+
+  // Test with zero coalesce threshold and non-adjacent blocks
+  // Non-adjacent blocks (with gaps) should NOT be coalesced with threshold=0
+  {
+    // Create new table to avoid cache hits
+    std::unique_ptr<BlockBasedTable> table2;
+    std::vector<BlockHandle> block_handles2;
+    s = CreateAndOpenSST(50, &table2, &block_handles2);
+    ASSERT_OK(s);
+    ASSERT_GE(block_handles2.size(), 20);
+
+    tracking_fs_->ClearReadOps();
+
+    auto job = std::make_shared<IOJob>();
+    // Skip every other block to create gaps between requested blocks
+    // This ensures there are gaps that won't be bridged with threshold=0
+    std::vector<BlockHandle> non_adjacent_blocks;
+    for (size_t i = 0;
+         i < block_handles2.size() && non_adjacent_blocks.size() < 5; i += 2) {
+      non_adjacent_blocks.push_back(block_handles2[i]);
+    }
+    job->block_handles = non_adjacent_blocks;
+    job->table = table2.get();
+    job->job_options.read_options.async_io = false;
+    // Set zero coalesce threshold - blocks with gaps should not be coalesced
+    job->job_options.io_coalesce_threshold = 0;
+
+    std::shared_ptr<ReadSet> read_set;
+    s = dispatcher->SubmitJob(job, &read_set);
+    ASSERT_OK(s);
+
+    for (size_t i = 0; i < non_adjacent_blocks.size(); ++i) {
+      CachableEntry<Block> block;
+      Status read_status = read_set->ReadIndex(i, &block);
+      ASSERT_OK(read_status);
+      ASSERT_NE(block.GetValue(), nullptr);
+    }
+
+    // With zero coalesce threshold and non-adjacent blocks (with gaps),
+    // each block should be a separate request
+    auto read_ops = tracking_fs_->GetReadOps();
+    size_t total_requests_in_multireads = 0;
+    for (const auto& op : read_ops) {
+      if (op.type == ReadOp::kMultiRead) {
+        total_requests_in_multireads += op.requests.size();
+      }
+    }
+
+    // Each non-adjacent block should be a separate request since there are
+    // gaps between them and threshold=0 means no gap tolerance
+    EXPECT_EQ(total_requests_in_multireads, non_adjacent_blocks.size())
+        << "Expected each non-adjacent block to be a separate request with "
+           "zero coalesce threshold";
+  }
+}
+
+// Test that verifies the read request offsets and lengths match the
+// expected block handles.
+TEST_F(IODispatcherTest, VerifyReadRequestDetails) {
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher());
+
+  std::unique_ptr<BlockBasedTable> table;
+  std::vector<BlockHandle> block_handles;
+  Status s = CreateAndOpenSST(10, &table, &block_handles);
+  ASSERT_OK(s);
+  ASSERT_NE(table, nullptr);
+  ASSERT_GE(block_handles.size(), 5);
+
+  tracking_fs_->ClearReadOps();
+
+  // Use just a few non-adjacent blocks to avoid coalescing
+  std::vector<BlockHandle> test_blocks;
+  // Pick every other block to ensure they're not adjacent
+  for (size_t i = 0; i < block_handles.size(); i += 2) {
+    test_blocks.push_back(block_handles[i]);
+  }
+
+  auto job = std::make_shared<IOJob>();
+  job->block_handles = test_blocks;
+  job->table = table.get();
+  job->job_options.read_options.async_io = false;
+  // Small coalesce threshold to minimize coalescing for this test
+  job->job_options.io_coalesce_threshold = 0;
+
+  std::shared_ptr<ReadSet> read_set;
+  s = dispatcher->SubmitJob(job, &read_set);
+  ASSERT_OK(s);
+
+  for (size_t i = 0; i < test_blocks.size(); ++i) {
+    CachableEntry<Block> block;
+    Status read_status = read_set->ReadIndex(i, &block);
+    ASSERT_OK(read_status);
+  }
+
+  // Verify the read requests match the block handles
+  auto read_ops = tracking_fs_->GetReadOps();
+  std::unordered_set<uint64_t> expected_offsets;
+  for (const auto& handle : test_blocks) {
+    expected_offsets.insert(handle.offset());
+  }
+
+  std::unordered_set<uint64_t> actual_offsets;
+  for (const auto& op : read_ops) {
+    if (op.type == ReadOp::kMultiRead) {
+      for (const auto& req : op.requests) {
+        actual_offsets.insert(req.first);
+      }
+    }
+  }
+
+  // Verify all expected offsets were read
+  for (const auto& expected : expected_offsets) {
+    EXPECT_TRUE(actual_offsets.count(expected) > 0)
+        << "Expected read at offset " << expected << " but it was not found";
+  }
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}

From 4bf2bcdbb3d4d0a33b5078487388679320e4c373 Mon Sep 17 00:00:00 2001
From: Josh Kang <jkangs@fb.com>
Date: Thu, 8 Jan 2026 15:15:31 -0800
Subject: [PATCH 418/500] Allow setting options for multiple column families
 (#14201)

Summary:
Currently to set options for multiple CFs, the caller must repeatedly call SetOptions() for each CF. This in turn serializes the entire options file each time. This PR exposes a new API that allows SetOptions to be called on multiple CFs at once, thus only paying the OPTIONS file serialization once.

Also added a new unit test for SetOptions.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14201

Reviewed By: pdillinger

Differential Revision: D89735181

Pulled By: joshkang97

fbshipit-source-id: 9b7a721b7e8769b653243b1581678ffd05d038e8
---
 db/db_impl/db_impl.cc                         | 141 ++++++++++++------
 db/db_impl/db_impl.h                          |   5 +-
 db/db_impl/db_impl_compaction_flush.cc        |  12 +-
 db/db_impl/db_impl_secondary.h                |   6 +-
 db/db_options_test.cc                         |  40 +++++
 include/rocksdb/db.h                          |  30 +++-
 include/rocksdb/utilities/stackable_db.h      |   9 +-
 .../public_api_changes/set_options.md         |   1 +
 8 files changed, 177 insertions(+), 67 deletions(-)
 create mode 100644 unreleased_history/public_api_changes/set_options.md

diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc
index 2aa7be859081..bc9d4adc4946 100644
--- a/db/db_impl/db_impl.cc
+++ b/db/db_impl/db_impl.cc
@@ -1177,23 +1177,38 @@ FSDirectory* DBImpl::GetDataDir(ColumnFamilyData* cfd, size_t path_id) const {
 }
 
 Status DBImpl::SetOptions(
-    ColumnFamilyHandle* column_family,
-    const std::unordered_map<std::string, std::string>& options_map) {
+    const std::unordered_map<ColumnFamilyHandle*,
+                             std::unordered_map<std::string, std::string>>&
+        column_families_opts_map) {
   // TODO: plumb Env::IOActivity, Env::IOPriority
   const ReadOptions read_options;
   const WriteOptions write_options;
 
-  auto* cfd =
-      static_cast_with_check<ColumnFamilyHandleImpl>(column_family)->cfd();
-  if (options_map.empty()) {
-    ROCKS_LOG_WARN(immutable_db_options_.info_log,
-                   "SetOptions() on column family [%s], empty input",
-                   cfd->GetName().c_str());
-    return Status::InvalidArgument("empty input");
+  if (column_families_opts_map.empty()) {
+    return Status::OK();
+  }
+
+  for (const auto& cf_opts : column_families_opts_map) {
+    if (cf_opts.second.empty()) {
+      ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                     "SetOptions() on column family [%s], empty input",
+                     cf_opts.first->GetName().c_str());
+      return Status::InvalidArgument("empty input");
+    }
+  }
+
+  autovector<std::pair<ColumnFamilyData*,
+                       const std::unordered_map<std::string, std::string>*>>
+      column_family_datas;
+  for (const auto& cf_opts : column_families_opts_map) {
+    column_family_datas.push_back(
+        {static_cast_with_check<ColumnFamilyHandleImpl>(cf_opts.first)->cfd(),
+         &cf_opts.second});
   }
 
   InstrumentedMutexLock ol(&options_mutex_);
-  MutableCFOptions new_options_copy;  // For logging outside of DB mutex
+  autovector<MutableCFOptions>
+      new_options_copy;  // For logging outside of DB mutex
   Status s;
   Status persist_options_status;
   SuperVersionContext sv_context(/* create_superversion */ true);
@@ -1216,68 +1231,104 @@ Status DBImpl::SetOptions(
     // Thus aren't releasing the DB mutex from LogAndApply calling pre_cb,
     // through installing the new Version until the end of this block, after
     // installing the new SuperVersion.
-    auto pre_cb = [&]() -> Status {
-      Status cb_s = cfd->SetOptions(db_options, options_map);
-      if (cb_s.ok()) {
-        new_options_copy = cfd->GetLatestMutableCFOptions();
-      }
-      return cb_s;
-    };
     VersionEdit dummy_edit;
     dummy_edit.MarkNoManifestWriteDummy();
     TEST_SYNC_POINT_CALLBACK("DBImpl::SetOptions:dummy_edit", &dummy_edit);
-    s = versions_->LogAndApply(
-        cfd, read_options, write_options, &dummy_edit, &mutex_,
-        directories_.GetDbDir(), false /*new_descriptor_log=*/,
-        nullptr /*new_opts*/, {} /*manifest_wcb*/, pre_cb);
-    if (!versions_->io_status().ok()) {
-      assert(!s.ok());
-      error_handler_.SetBGError(versions_->io_status(),
-                                BackgroundErrorReason::kManifestWrite);
+    for (const auto& cfd_opts : column_family_datas) {
+      auto* cfd = cfd_opts.first;
+      const auto* options_map_ptr = cfd_opts.second;
+      auto pre_cb = [&]() -> Status {
+        Status cb_s = cfd->SetOptions(db_options, *options_map_ptr);
+        if (cb_s.ok()) {
+          new_options_copy.emplace_back(cfd->GetLatestMutableCFOptions());
+        }
+        return cb_s;
+      };
+
+      s = versions_->LogAndApply(
+          cfd, read_options, write_options, &dummy_edit, &mutex_,
+          directories_.GetDbDir(), false /*new_descriptor_log=*/,
+          nullptr /*new_opts*/, {} /*manifest_wcb*/, pre_cb);
+      if (!versions_->io_status().ok()) {
+        assert(!s.ok());
+        error_handler_.SetBGError(versions_->io_status(),
+                                  BackgroundErrorReason::kManifestWrite);
+      }
+      if (!s.ok()) {
+        break;
+      }
     }
 
     if (s.ok()) {
       // Trigger possible flush/compactions. This has to be before we persist
       // options to file, otherwise there will be a deadlock with writer
       // thread.
-      InstallSuperVersionForConfigChange(cfd, &sv_context);
+      for (const auto& cfd_opts : column_family_datas) {
+        InstallSuperVersionForConfigChange(cfd_opts.first, &sv_context);
+      }
       persist_options_status =
           WriteOptionsFile(write_options, true /*db_mutex_already_held*/);
       bg_cv_.SignalAll();
 
-      assert(new_options_copy == cfd->GetLatestMutableCFOptions());
-      assert(cfd->GetLatestMutableCFOptions() ==
-             cfd->GetCurrentMutableCFOptions());
-      assert(cfd->GetCurrentMutableCFOptions() ==
-             cfd->current()->GetMutableCFOptions());
+#ifndef NDEBUG
+      for (size_t i = 0; i < column_family_datas.size(); ++i) {
+        auto* cfd = column_family_datas[i].first;
+        assert(new_options_copy[i] == cfd->GetLatestMutableCFOptions());
+        assert(cfd->GetLatestMutableCFOptions() ==
+               cfd->GetCurrentMutableCFOptions());
+        assert(cfd->GetCurrentMutableCFOptions() ==
+               cfd->current()->GetMutableCFOptions());
+      }
+#endif
     }
   }
   sv_context.Clean();
 
-  if (s.ok() && (options_map.count("preserve_internal_time_seconds") > 0 ||
-                 options_map.count("preclude_last_level_data_seconds") > 0)) {
-    s = RegisterRecordSeqnoTimeWorker();
+  if (s.ok()) {
+    bool needs_seqno_worker = false;
+    for (const auto& cf_opts : column_families_opts_map) {
+      if (cf_opts.second.count("preserve_internal_time_seconds") > 0 ||
+          cf_opts.second.count("preclude_last_level_data_seconds") > 0) {
+        needs_seqno_worker = true;
+        break;
+      }
+    }
+    if (needs_seqno_worker) {
+      s = RegisterRecordSeqnoTimeWorker();
+    }
   }
 
-  ROCKS_LOG_INFO(
-      immutable_db_options_.info_log,
-      "SetOptions() on column family [%s], inputs:", cfd->GetName().c_str());
-  for (const auto& o : options_map) {
-    ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s: %s\n", o.first.c_str(),
-                   o.second.c_str());
+  ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                 "SetOptions() on [%zu] column families, inputs:",
+                 column_family_datas.size());
+  for (size_t i = 0; i < column_family_datas.size(); ++i) {
+    const auto* cfd = column_family_datas[i].first;
+    const auto* options_map_ptr = column_family_datas[i].second;
+    ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                   "Set options on column family [%s] (%zu/%zu), inputs:",
+                   cfd->GetName().c_str(), i, column_family_datas.size());
+    for (const auto& o : *options_map_ptr) {
+      ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s: %s\n",
+                     o.first.c_str(), o.second.c_str());
+    }
   }
   if (s.ok()) {
-    ROCKS_LOG_INFO(immutable_db_options_.info_log,
-                   "[%s] SetOptions() succeeded", cfd->GetName().c_str());
-    new_options_copy.Dump(immutable_db_options_.info_log.get());
+    for (size_t i = 0; i < column_family_datas.size(); ++i) {
+      const auto* cfd = column_family_datas[i].first;
+      ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                     "Set options on column family [%s] (%zu/%zu) succeeded, "
+                     "updated CF options:",
+                     cfd->GetName().c_str(), i, column_family_datas.size());
+      new_options_copy[i].Dump(immutable_db_options_.info_log.get());
+    }
     if (!persist_options_status.ok()) {
       // NOTE: WriteOptionsFile already logs on failure
       s = persist_options_status;
     }
   } else {
     persist_options_status.PermitUncheckedError();  // less important
-    ROCKS_LOG_WARN(immutable_db_options_.info_log, "[%s] SetOptions() failed",
-                   cfd->GetName().c_str());
+    ROCKS_LOG_WARN(immutable_db_options_.info_log, "SetOptions() failed: %s",
+                   s.ToString().c_str());
   }
   LogFlush(immutable_db_options_.info_log);
   return s;
diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h
index 9168c94f6810..451fbd41c70e 100644
--- a/db/db_impl/db_impl.h
+++ b/db/db_impl/db_impl.h
@@ -458,8 +458,9 @@ class DBImpl : public DB {
 
   using DB::SetOptions;
   Status SetOptions(
-      ColumnFamilyHandle* column_family,
-      const std::unordered_map<std::string, std::string>& options_map) override;
+      const std::unordered_map<ColumnFamilyHandle*,
+                               std::unordered_map<std::string, std::string>>&
+          column_families_opts_map) override;
 
   Status SetDBOptions(
       const std::unordered_map<std::string, std::string>& options_map) override;
diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc
index 9f4d08e938ee..2d3ee60bb0fb 100644
--- a/db/db_impl/db_impl_compaction_flush.cc
+++ b/db/db_impl/db_impl_compaction_flush.cc
@@ -2889,16 +2889,8 @@ Status DBImpl::WaitForFlushMemTables(
 
 Status DBImpl::EnableAutoCompaction(
     const std::vector<ColumnFamilyHandle*>& column_family_handles) {
-  Status s;
-  for (auto cf_ptr : column_family_handles) {
-    Status status =
-        this->SetOptions(cf_ptr, {{"disable_auto_compactions", "false"}});
-    if (!status.ok()) {
-      s = status;
-    }
-  }
-
-  return s;
+  return SetOptions(column_family_handles,
+                    {{"disable_auto_compactions", "false"}});
 }
 
 // NOTE: Calling DisableManualCompaction() may overwrite the
diff --git a/db/db_impl/db_impl_secondary.h b/db/db_impl/db_impl_secondary.h
index c523fd9b873f..583b4081b3bc 100644
--- a/db/db_impl/db_impl_secondary.h
+++ b/db/db_impl/db_impl_secondary.h
@@ -216,9 +216,9 @@ class DBImplSecondary : public DBImpl {
 
   using DBImpl::SetOptions;
   Status SetOptions(
-      ColumnFamilyHandle* /*cfd*/,
-      const std::unordered_map<std::string, std::string>& /*options_map*/)
-      override {
+      const std::unordered_map<ColumnFamilyHandle*,
+                               std::unordered_map<std::string, std::string>>&
+      /*column_families_opts_map*/) override {
     // Currently not supported because changing certain options may cause
     // flush/compaction and/or write to MANIFEST.
     return Status::NotSupported("Not supported operation in secondary mode.");
diff --git a/db/db_options_test.cc b/db/db_options_test.cc
index 36c4f211af76..07e5d27f23e8 100644
--- a/db/db_options_test.cc
+++ b/db/db_options_test.cc
@@ -1689,6 +1689,46 @@ TEST_F(DBOptionsTest, SetOptionsNoManifestWrite) {
   ASSERT_EQ(Get("x"), "x");
 }
 
+TEST_F(DBOptionsTest, SetOptionsMultipleColumnFamilies) {
+  Options options;
+  options.create_if_missing = true;
+  options.env = CurrentOptions().env;
+  options.disable_auto_compactions = true;
+  Reopen(options);
+
+  // Create two additional column families
+  CreateColumnFamilies({"cf1", "cf2"}, options);
+  ReopenWithColumnFamilies({"default", "cf1", "cf2"}, options);
+
+  // Verify initial state - auto compaction should be disabled
+  ASSERT_TRUE(dbfull()->GetOptions(handles_[0]).disable_auto_compactions);
+  ASSERT_TRUE(dbfull()->GetOptions(handles_[1]).disable_auto_compactions);
+  ASSERT_TRUE(dbfull()->GetOptions(handles_[2]).disable_auto_compactions);
+
+  // Set options on multiple column families at once
+  ASSERT_OK(dbfull()->SetOptions({handles_[1], handles_[2]},
+                                 {{"disable_auto_compactions", "false"}}));
+
+  ASSERT_TRUE(
+      dbfull()->GetOptions(handles_[0]).disable_auto_compactions);  // unchanged
+  ASSERT_FALSE(
+      dbfull()->GetOptions(handles_[1]).disable_auto_compactions);  // changed
+  ASSERT_FALSE(
+      dbfull()->GetOptions(handles_[2]).disable_auto_compactions);  // changed
+
+  std::unordered_map<ColumnFamilyHandle*,
+                     std::unordered_map<std::string, std::string>>
+      options_map;
+  options_map[handles_[0]] = {{"disable_auto_compactions", "false"}};
+  options_map[handles_[1]] = {{"disable_auto_compactions", "true"}};
+  options_map[handles_[2]] = {{"disable_auto_compactions", "true"}};
+  ASSERT_OK(dbfull()->SetOptions(options_map));
+
+  ASSERT_FALSE(dbfull()->GetOptions(handles_[0]).disable_auto_compactions);
+  ASSERT_TRUE(dbfull()->GetOptions(handles_[1]).disable_auto_compactions);
+  ASSERT_TRUE(dbfull()->GetOptions(handles_[2]).disable_auto_compactions);
+}
+
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h
index 9a753ae3eafd..2abb7eb02513 100644
--- a/include/rocksdb/db.h
+++ b/include/rocksdb/db.h
@@ -1628,15 +1628,39 @@ class DB {
   //  s = db->SetOptions(cfh, {{"block_based_table_factory",
   //                            "{prepopulate_block_cache=kDisable;}"}});
   virtual Status SetOptions(
-      ColumnFamilyHandle* /*column_family*/,
-      const std::unordered_map<std::string, std::string>& /*opts_map*/) {
-    return Status::NotSupported("Not implemented");
+      ColumnFamilyHandle* column_family,
+      const std::unordered_map<std::string, std::string>& opts_map) {
+    return SetOptions(std::vector<ColumnFamilyHandle*>{column_family},
+                      opts_map);
   }
   // Shortcut for SetOptions on the default column family handle.
   virtual Status SetOptions(
       const std::unordered_map<std::string, std::string>& new_options) {
     return SetOptions(DefaultColumnFamily(), new_options);
   }
+  // Shortcut where you want to apply the same options to multiple column
+  // families. Beneficial for avoiding reserialization of OPTIONS file.
+  virtual Status SetOptions(
+      const std::vector<ColumnFamilyHandle*>& column_families,
+      const std::unordered_map<std::string, std::string>& opts_map) {
+    std::unordered_map<ColumnFamilyHandle*,
+                       std::unordered_map<std::string, std::string>>
+        column_families_opts_map;
+    column_families_opts_map.reserve(column_families.size());
+    for (auto* cf : column_families) {
+      column_families_opts_map[cf] = opts_map;
+    }
+    return SetOptions(column_families_opts_map);
+  }
+  // SetOptions with potentially different options per column family. It is
+  // typically better to batch all option changes together as the OPTIONS file
+  // is written to once per SetOptions call.
+  virtual Status SetOptions(
+      const std::unordered_map<ColumnFamilyHandle*,
+                               std::unordered_map<std::string, std::string>>&
+      /*column_families_opts_map*/) {
+    return Status::NotSupported("Not implemented");
+  }
 
   // Like SetOptions but for DBOptions, including the same caveats for
   // usability, reliability, and performance. See GetDBOptionsFromMap() (and
diff --git a/include/rocksdb/utilities/stackable_db.h b/include/rocksdb/utilities/stackable_db.h
index c84dc06b8168..8cd4057fd553 100644
--- a/include/rocksdb/utilities/stackable_db.h
+++ b/include/rocksdb/utilities/stackable_db.h
@@ -549,10 +549,11 @@ class StackableDB : public DB {
   }
 
   using DB::SetOptions;
-  Status SetOptions(ColumnFamilyHandle* column_family_handle,
-                    const std::unordered_map<std::string, std::string>&
-                        new_options) override {
-    return db_->SetOptions(column_family_handle, new_options);
+  Status SetOptions(
+      const std::unordered_map<ColumnFamilyHandle*,
+                               std::unordered_map<std::string, std::string>>&
+          column_families_opts_map) override {
+    return db_->SetOptions(column_families_opts_map);
   }
 
   Status SetDBOptions(const std::unordered_map<std::string, std::string>&
diff --git a/unreleased_history/public_api_changes/set_options.md b/unreleased_history/public_api_changes/set_options.md
new file mode 100644
index 000000000000..eadc2620f7e1
--- /dev/null
+++ b/unreleased_history/public_api_changes/set_options.md
@@ -0,0 +1 @@
+New SetOptions API that allows setting options for multiple CFs, avoiding the need to reserialize OPTIONS file for each CF

From 4bcec5ae897f3403e52a5f854133fec70244f97b Mon Sep 17 00:00:00 2001
From: Xingbo Wang <xbw@meta.com>
Date: Fri, 9 Jan 2026 10:43:54 -0800
Subject: [PATCH 419/500] Fix autoconf download failure in folly build (#14226)

Summary:
Folly download dependencies directly from external source. Sometimes, this could fail due to external website instability. To solve this, we added github cache to cache the dependencies. We also added a python script to try different sources during download to reduce the chance of failure.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14226

Test Plan: github CI

Reviewed By: krhancoc, archang19

Differential Revision: D90343051

Pulled By: xingbowang

fbshipit-source-id: 3faad6aaa6c1bfd361b9e405c298856cd64bf457
---
 .../cache-getdeps-downloads/action.yml        |  21 +++
 .github/workflows/nightly.yml                 |   4 +
 .github/workflows/pr-jobs.yml                 |   3 +
 build_tools/getdeps_fallback_mirror.py        | 123 ++++++++++++++++++
 folly.mk                                      |  21 ++-
 5 files changed, 170 insertions(+), 2 deletions(-)
 create mode 100644 .github/actions/cache-getdeps-downloads/action.yml
 create mode 100644 build_tools/getdeps_fallback_mirror.py

diff --git a/.github/actions/cache-getdeps-downloads/action.yml b/.github/actions/cache-getdeps-downloads/action.yml
new file mode 100644
index 000000000000..ca871bf1c8cd
--- /dev/null
+++ b/.github/actions/cache-getdeps-downloads/action.yml
@@ -0,0 +1,21 @@
+name: cache-getdeps-downloads
+description: Cache getdeps downloads to avoid unreliable mirrors and speed up builds
+outputs:
+  cache-hit:
+    description: Whether the cache was hit
+    value: ${{ steps.cache-downloads.outputs.cache-hit }}
+runs:
+  using: composite
+  steps:
+  - name: Cache getdeps downloads
+    id: cache-downloads
+    uses: actions/cache@v4
+    with:
+      # Use a fixed path that we control - folly.mk will sync with getdeps downloads dir
+      path: /tmp/rocksdb-getdeps-cache
+      # Use a rolling cache key - the cache accumulates downloads over time
+      # The key includes a weekly timestamp to ensure periodic refresh
+      key: getdeps-downloads-${{ runner.os }}-${{ runner.arch }}-week-${{ github.run_id }}
+      restore-keys: |
+        getdeps-downloads-${{ runner.os }}-${{ runner.arch }}-week-
+        getdeps-downloads-${{ runner.os }}-${{ runner.arch }}-
diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml
index 8fe6172da05a..e10a95ecd0a0 100644
--- a/.github/workflows/nightly.yml
+++ b/.github/workflows/nightly.yml
@@ -54,6 +54,7 @@ jobs:
     steps:
     - uses: actions/checkout@v4.1.0
     - uses: "./.github/actions/pre-steps"
+    - uses: "./.github/actions/cache-getdeps-downloads"
     - uses: "./.github/actions/setup-folly"
     - uses: "./.github/actions/build-folly"
     - run: LIB_MODE=static USE_CLANG=1 USE_FOLLY=1 COMPILE_WITH_UBSAN=1 COMPILE_WITH_ASAN=1 make -j32 check
@@ -68,6 +69,7 @@ jobs:
     steps:
     - uses: actions/checkout@v4.1.0
     - uses: "./.github/actions/pre-steps"
+    - uses: "./.github/actions/cache-getdeps-downloads"
     - uses: "./.github/actions/setup-folly"
     - uses: "./.github/actions/cache-folly"
       id: cache-folly
@@ -86,6 +88,7 @@ jobs:
     steps:
     - uses: actions/checkout@v4.1.0
     - uses: "./.github/actions/pre-steps"
+    - uses: "./.github/actions/cache-getdeps-downloads"
     - uses: "./.github/actions/setup-folly"
     - run: "DEBUG_LEVEL=0 make -j20 build_folly"
     - run: "USE_FOLLY=1 LIB_MODE=static DEBUG_LEVEL=0 V=1 make -j20 release"
@@ -162,6 +165,7 @@ jobs:
     steps:
     - uses: actions/checkout@v4.1.0
     - uses: "./.github/actions/pre-steps"
+    - uses: "./.github/actions/cache-getdeps-downloads"
     - uses: "./.github/actions/setup-folly"
     - run: "(mkdir build && cd build && cmake -DUSE_FOLLY_LITE=1 -DWITH_GFLAGS=1 -DCMAKE_CXX_FLAGS=-DGLOG_USE_GLOG_EXPORT .. && make VERBOSE=1 -j20 && ctest -j20)"
     - uses: "./.github/actions/post-steps"
diff --git a/.github/workflows/pr-jobs.yml b/.github/workflows/pr-jobs.yml
index cfd7b0343b8d..a3cfcdbce73e 100644
--- a/.github/workflows/pr-jobs.yml
+++ b/.github/workflows/pr-jobs.yml
@@ -102,6 +102,7 @@ jobs:
     steps:
     - uses: actions/checkout@v4.1.0
     - uses: "./.github/actions/pre-steps"
+    - uses: "./.github/actions/cache-getdeps-downloads"
     - uses: "./.github/actions/setup-folly"
     - uses: "./.github/actions/cache-folly"
       id: cache-folly
@@ -120,6 +121,7 @@ jobs:
     steps:
     - uses: actions/checkout@v4.1.0
     - uses: "./.github/actions/pre-steps"
+    - uses: "./.github/actions/cache-getdeps-downloads"
     - uses: "./.github/actions/setup-folly"
     - run: USE_FOLLY_LITE=1 EXTRA_CXXFLAGS=-DGLOG_USE_GLOG_EXPORT V=1 make -j32 all
     - uses: "./.github/actions/post-steps"
@@ -133,6 +135,7 @@ jobs:
     steps:
     - uses: actions/checkout@v4.1.0
     - uses: "./.github/actions/pre-steps"
+    - uses: "./.github/actions/cache-getdeps-downloads"
     - uses: "./.github/actions/setup-folly"
     - uses: "./.github/actions/cache-folly"
       id: cache-folly
diff --git a/build_tools/getdeps_fallback_mirror.py b/build_tools/getdeps_fallback_mirror.py
new file mode 100644
index 000000000000..7b3bb31b584d
--- /dev/null
+++ b/build_tools/getdeps_fallback_mirror.py
@@ -0,0 +1,123 @@
+#!/usr/bin/env python3
+"""
+Pre-download packages with unreliable mirrors using fallback mirrors.
+Reads package info from folly's getdeps manifest files.
+"""
+import sys
+import os
+import hashlib
+import subprocess
+import configparser
+
+def sha256_file(path):
+    """Calculate SHA256 hash of a file."""
+    h = hashlib.sha256()
+    try:
+        with open(path, 'rb') as f:
+            for chunk in iter(lambda: f.read(65536), b''):
+                h.update(chunk)
+        return h.hexdigest()
+    except Exception:
+        return None
+
+def parse_manifest(manifest_path):
+    """Parse a getdeps manifest file to extract download info."""
+    config = configparser.ConfigParser()
+    try:
+        config.read(manifest_path)
+        if 'download' in config:
+            return {
+                'url': config['download'].get('url', ''),
+                'sha256': config['download'].get('sha256', ''),
+            }
+    except Exception:
+        pass
+    return None
+
+def get_fallback_mirrors(url):
+    """Get fallback mirror URLs for a given URL."""
+    # Fallback mirror patterns for known unreliable hosts
+    mirror_fallbacks = {
+        "ftp.gnu.org/gnu/": [
+            "https://mirrors.kernel.org/gnu/",
+            "https://ftpmirror.gnu.org/gnu/",
+            "https://ftp.gnu.org/gnu/",
+        ],
+        "ftpmirror.gnu.org/gnu/": [
+            "https://mirrors.kernel.org/gnu/",
+            "https://ftpmirror.gnu.org/gnu/",
+            "https://ftp.gnu.org/gnu/",
+        ],
+    }
+
+    for pattern, mirrors in mirror_fallbacks.items():
+        if pattern in url:
+            # Extract the path after the pattern
+            path_start = url.find(pattern) + len(pattern)
+            path = url[path_start:]
+            return [mirror + path for mirror in mirrors]
+    return [url]  # No fallback, use original
+
+def main():
+    if len(sys.argv) != 4:
+        print(f"Usage: {sys.argv[0]} <download_dir> <cache_dir> <manifests_dir>")
+        sys.exit(1)
+
+    download_dir, cache_dir, manifests_dir = sys.argv[1], sys.argv[2], sys.argv[3]
+
+    # Packages known to have unreliable mirrors
+    packages_to_check = ["autoconf", "automake", "libtool"]
+
+    for package in packages_to_check:
+        manifest_path = os.path.join(manifests_dir, package)
+        if not os.path.exists(manifest_path):
+            continue
+
+        info = parse_manifest(manifest_path)
+        if not info or not info['url'] or not info['sha256']:
+            continue
+
+        # Determine filename from URL
+        url = info['url']
+        expected_sha256 = info['sha256']
+        url_filename = os.path.basename(url)
+
+        # getdeps uses format: {package}-{filename}
+        filename = f"{package}-{url_filename}"
+        filepath = os.path.join(download_dir, filename)
+        cache_path = os.path.join(cache_dir, filename)
+
+        # Check if already valid
+        if os.path.exists(filepath) and sha256_file(filepath) == expected_sha256:
+            print(f"  {filename}: OK (already downloaded)")
+            continue
+
+        # Check cache
+        if os.path.exists(cache_path) and sha256_file(cache_path) == expected_sha256:
+            print(f"  {filename}: OK (from cache)")
+            subprocess.run(['cp', cache_path, filepath], check=True)
+            continue
+
+        # Try fallback mirrors
+        mirrors = get_fallback_mirrors(url)
+        downloaded = False
+        for mirror_url in mirrors:
+            print(f"  {filename}: trying {mirror_url}...")
+            try:
+                subprocess.run(['wget', '-q', '-O', filepath, mirror_url], check=True, timeout=120)
+                if sha256_file(filepath) == expected_sha256:
+                    print(f"  {filename}: OK (downloaded)")
+                    subprocess.run(['cp', filepath, cache_path], check=False)
+                    downloaded = True
+                    break
+                else:
+                    os.remove(filepath)
+            except Exception:
+                if os.path.exists(filepath):
+                    os.remove(filepath)
+
+        if not downloaded:
+            print(f"  {filename}: WARNING - all mirrors failed")
+
+if __name__ == "__main__":
+    main()
diff --git a/folly.mk b/folly.mk
index edd7ab1c9ca7..b253c25b64be 100644
--- a/folly.mk
+++ b/folly.mk
@@ -116,10 +116,27 @@ checkout_folly:
 	perl -pi -e 's/(#include <atomic>)/$$1\n#include <cstring>/' third-party/folly/folly/lang/Exception.h
 	@# const mismatch
 	perl -pi -e 's/: environ/: (const char**)(environ)/' third-party/folly/folly/Subprocess.cpp
-	@# Use gnu.org mirrors to improve download speed (ftp.gnu.org is often super slow)
-	cd third-party/folly && perl -pi -e 's/ftp.gnu.org/ftpmirror.gnu.org/' `git grep -l ftp.gnu.org` README.md
+	@# Restore cached downloads and handle unreliable mirrors with fallback
+	@cd third-party/folly && \
+		DOWNLOAD_DIR=`$(PYTHON) build/fbcode_builder/getdeps.py show-inst-dir | sed 's|/installed/.*|/downloads|'` && \
+		mkdir -p "$$DOWNLOAD_DIR" && \
+		CACHE_DIR="/tmp/rocksdb-getdeps-cache" && \
+		mkdir -p "$$CACHE_DIR" && \
+		echo "Restoring cached downloads..." && \
+		if ls "$$CACHE_DIR"/*.tar.gz "$$CACHE_DIR"/*.tar.xz "$$CACHE_DIR"/*.zip >/dev/null 2>&1; then \
+			cp -n "$$CACHE_DIR"/*.tar.gz "$$CACHE_DIR"/*.tar.xz "$$CACHE_DIR"/*.zip "$$DOWNLOAD_DIR/" 2>/dev/null || true; \
+		fi && \
+		echo "Handling known unreliable downloads with fallback mirrors..." && \
+		$(PYTHON) ../../build_tools/getdeps_fallback_mirror.py "$$DOWNLOAD_DIR" "$$CACHE_DIR" build/fbcode_builder/manifests
 	@# NOTE: boost and fmt source will be needed for any build including `USE_FOLLY_LITE` builds as those depend on those headers
 	cd third-party/folly && GETDEPS_USE_WGET=1 $(PYTHON) build/fbcode_builder/getdeps.py fetch boost && GETDEPS_USE_WGET=1 $(PYTHON) build/fbcode_builder/getdeps.py fetch fmt
+	@# Update cache with any new downloads
+	@cd third-party/folly && \
+		DOWNLOAD_DIR=`$(PYTHON) build/fbcode_builder/getdeps.py show-inst-dir | sed 's|/installed/.*|/downloads|'` && \
+		CACHE_DIR="/tmp/rocksdb-getdeps-cache" && \
+		if ls "$$DOWNLOAD_DIR"/*.tar.gz "$$DOWNLOAD_DIR"/*.tar.xz "$$DOWNLOAD_DIR"/*.zip >/dev/null 2>&1; then \
+			cp -n "$$DOWNLOAD_DIR"/*.tar.gz "$$DOWNLOAD_DIR"/*.tar.xz "$$DOWNLOAD_DIR"/*.zip "$$CACHE_DIR/" 2>/dev/null || true; \
+		fi
 
 CXX_M_FLAGS = $(filter -m%, $(CXXFLAGS))
 

From a6325e9564d7eedec3993d2aef40829943222418 Mon Sep 17 00:00:00 2001
From: Xingbo Wang <xbw@meta.com>
Date: Fri, 9 Jan 2026 14:37:46 -0800
Subject: [PATCH 420/500] Add block type to corruption error message (#14225)

Summary:
Add block type to corruption error message

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14225

Test Plan: Unit test

Reviewed By: jaykorean

Differential Revision: D90329899

Pulled By: xingbowang

fbshipit-source-id: 6fa925d1704c7c19c98d6067628b73a7c0904c3e
---
 .../block_based_table_reader_sync_and_async.h |  6 ++--
 table/block_based/block_type.h                | 30 +++++++++++++++++++
 table/block_based/reader_common.cc            |  5 ++--
 table/block_based/reader_common.h             |  5 +++-
 table/block_fetcher.cc                        |  6 ++--
 table/meta_blocks.cc                          |  6 ++--
 6 files changed, 48 insertions(+), 10 deletions(-)

diff --git a/table/block_based/block_based_table_reader_sync_and_async.h b/table/block_based/block_based_table_reader_sync_and_async.h
index be0b05808067..dc9e66214022 100644
--- a/table/block_based/block_based_table_reader_sync_and_async.h
+++ b/table/block_based/block_based_table_reader_sync_and_async.h
@@ -220,7 +220,8 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::RetrieveMultipleBlocks)
         // in each read request. Checksum is stored in the block trailer,
         // beyond the payload size.
         s = VerifyBlockChecksum(footer, data, handle.size(),
-                                rep_->file->file_name(), handle.offset());
+                                rep_->file->file_name(), handle.offset(),
+                                BlockType::kData);
         RecordTick(ioptions.stats, BLOCK_CHECKSUM_COMPUTE_COUNT);
         if (!s.ok()) {
           RecordTick(ioptions.stats, BLOCK_CHECKSUM_MISMATCH_COUNT);
@@ -248,7 +249,8 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::RetrieveMultipleBlocks)
             assert(result.data() == data);
             assert(result.size() == BlockSizeWithTrailer(handle));
             s = VerifyBlockChecksum(footer, data, handle.size(),
-                                    rep_->file->file_name(), handle.offset());
+                                    rep_->file->file_name(), handle.offset(),
+                                    BlockType::kData);
             if (s.ok()) {
               RecordTick(ioptions.stats,
                          FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT);
diff --git a/table/block_based/block_type.h b/table/block_based/block_type.h
index 0098491c5dfc..b96f27385493 100644
--- a/table/block_based/block_type.h
+++ b/table/block_based/block_type.h
@@ -32,4 +32,34 @@ enum class BlockType : uint8_t {
   kInvalid
 };
 
+inline const char* BlockTypeToString(BlockType block_type) {
+  switch (block_type) {
+    case BlockType::kData:
+      return "Data";
+    case BlockType::kFilter:
+      return "Filter";
+    case BlockType::kFilterPartitionIndex:
+      return "FilterPartitionIndex";
+    case BlockType::kProperties:
+      return "Properties";
+    case BlockType::kCompressionDictionary:
+      return "CompressionDictionary";
+    case BlockType::kRangeDeletion:
+      return "RangeDeletion";
+    case BlockType::kHashIndexPrefixes:
+      return "HashIndexPrefixes";
+    case BlockType::kHashIndexMetadata:
+      return "HashIndexMetadata";
+    case BlockType::kMetaIndex:
+      return "MetaIndex";
+    case BlockType::kIndex:
+      return "Index";
+    case BlockType::kUserDefinedIndex:
+      return "UserDefinedIndex";
+    case BlockType::kInvalid:
+      return "Invalid";
+  }
+  return "Unknown";
+}
+
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/table/block_based/reader_common.cc b/table/block_based/reader_common.cc
index 8f8c82ff43ac..fbafe414dd9a 100644
--- a/table/block_based/reader_common.cc
+++ b/table/block_based/reader_common.cc
@@ -25,7 +25,7 @@ void ForceReleaseCachedEntry(void* arg, void* h) {
 // WART: this is specific to block-based table
 Status VerifyBlockChecksum(const Footer& footer, const char* data,
                            size_t block_size, const std::string& file_name,
-                           uint64_t offset) {
+                           uint64_t offset, BlockType block_type) {
   PERF_TIMER_GUARD(block_checksum_time);
 
   assert(footer.GetBlockTrailerSize() == 5);
@@ -58,7 +58,8 @@ Status VerifyBlockChecksum(const Footer& footer, const char* data,
         std::string(modifier ? "(context removed)" : "") + " = " +
         std::to_string(stored) + ", computed = " + std::to_string(computed) +
         ", type = " + std::to_string(type) + "  in " + file_name + " offset " +
-        std::to_string(offset) + " size " + std::to_string(block_size));
+        std::to_string(offset) + " size " + std::to_string(block_size) +
+        ", block_type = " + BlockTypeToString(block_type));
   }
 }
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/table/block_based/reader_common.h b/table/block_based/reader_common.h
index 89518fd8c2a4..6d16f4069413 100644
--- a/table/block_based/reader_common.h
+++ b/table/block_based/reader_common.h
@@ -10,6 +10,7 @@
 
 #include "rocksdb/advanced_cache.h"
 #include "rocksdb/table.h"
+#include "table/block_based/block_type.h"
 
 namespace ROCKSDB_NAMESPACE {
 class Footer;
@@ -27,10 +28,12 @@ inline MemoryAllocator* GetMemoryAllocator(
 // Assumes block has a trailer past `data + block_size` as in format.h.
 // `file_name` provided for generating diagnostic message in returned status.
 // `offset` might be required for proper verification (also used for message).
+// `block_type` is included in the error message to provide context about
+// which type of block failed checksum verification.
 //
 // Returns Status::OK() on checksum match, or Status::Corruption() on checksum
 // mismatch.
 Status VerifyBlockChecksum(const Footer& footer, const char* data,
                            size_t block_size, const std::string& file_name,
-                           uint64_t offset);
+                           uint64_t offset, BlockType block_type);
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/table/block_fetcher.cc b/table/block_fetcher.cc
index 6c73df23bee2..2f4ee64b19fc 100644
--- a/table/block_fetcher.cc
+++ b/table/block_fetcher.cc
@@ -33,9 +33,9 @@ inline void BlockFetcher::ProcessTrailerIfPresent() {
   if (footer_.GetBlockTrailerSize() > 0) {
     assert(footer_.GetBlockTrailerSize() == BlockBasedTable::kBlockTrailerSize);
     if (read_options_.verify_checksums) {
-      io_status_ = status_to_io_status(
-          VerifyBlockChecksum(footer_, slice_.data(), block_size_,
-                              file_->file_name(), handle_.offset()));
+      io_status_ = status_to_io_status(VerifyBlockChecksum(
+          footer_, slice_.data(), block_size_, file_->file_name(),
+          handle_.offset(), block_type_));
       RecordTick(ioptions_.stats, BLOCK_CHECKSUM_COMPUTE_COUNT);
       if (!io_status_.ok()) {
         assert(io_status_.IsCorruption());
diff --git a/table/meta_blocks.cc b/table/meta_blocks.cc
index 84f3a5343b46..d8be37e58b39 100644
--- a/table/meta_blocks.cc
+++ b/table/meta_blocks.cc
@@ -476,7 +476,8 @@ Status ReadTablePropertiesHelper(
     // (See write_global_seqno comment above)
     if (s.ok() && footer.GetBlockTrailerSize() > 0) {
       s = VerifyBlockChecksum(footer, properties_block.data(), block_size,
-                              file->file_name(), handle.offset());
+                              file->file_name(), handle.offset(),
+                              BlockType::kProperties);
       if (s.IsCorruption()) {
         if (new_table_properties->external_sst_file_global_seqno_offset != 0) {
           std::string tmp_buf(properties_block.data(), len);
@@ -485,7 +486,8 @@ Status ReadTablePropertiesHelper(
               handle.offset();
           EncodeFixed64(&tmp_buf[static_cast<size_t>(global_seqno_offset)], 0);
           s = VerifyBlockChecksum(footer, tmp_buf.data(), block_size,
-                                  file->file_name(), handle.offset());
+                                  file->file_name(), handle.offset(),
+                                  BlockType::kProperties);
         }
       }
     }

From 2893c25ca2a21590781066d77e6af4faf0ac8d0e Mon Sep 17 00:00:00 2001
From: Xingbo Wang <xbw@meta.com>
Date: Sat, 10 Jan 2026 05:05:18 -0800
Subject: [PATCH 421/500] Support printing block checksum in sst_dump (#14222)

Summary:
Support printing block checksum in sst_dump

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14222

Test Plan:
manual test, sample output

```
Data Block # 1 @ 00B021
   Data block checksum type: 4  checksum value: 0x40614fb3  offset: 0  size: 4272  compression type: 0
```

Reviewed By: jaykorean

Differential Revision: D90286789

Pulled By: xingbowang

fbshipit-source-id: 71324e04549bea070d80b45a81b562ad331a7840
---
 table/block_based/block_based_table_reader.cc | 87 +++++++++++++++++++
 table/block_based/block_based_table_reader.h  |  4 +
 2 files changed, 91 insertions(+)

diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc
index 094b0e0cb01b..b2d6a1e55813 100644
--- a/table/block_based/block_based_table_reader.cc
+++ b/table/block_based/block_based_table_reader.cc
@@ -3241,6 +3241,17 @@ Status BlockBasedTable::DumpTable(WritableFile* out_file,
                 "--------------------------------------\n";
   out_stream << "  " << rep_->footer.ToString() << "\n";
 
+  // Output Checksum Type Legend
+  out_stream << "Block Checksum Type Legend:\n"
+                "--------------------------------------\n";
+  out_stream << "  0 = kNoChecksum\n";
+  out_stream << "  1 = kCRC32c\n";
+  out_stream << "  2 = kxxHash\n";
+  out_stream << "  3 = kxxHash64\n";
+  out_stream << "  4 = kXXH3\n";
+  out_stream << "  (This file uses checksum type: "
+             << static_cast<int>(rep_->footer.checksum_type()) << ")\n\n";
+
   // Output MetaIndex
   out_stream << "Metaindex Details:\n"
                 "--------------------------------------\n";
@@ -3251,25 +3262,47 @@ Status BlockBasedTable::DumpTable(WritableFile* out_file,
   Status s = ReadMetaIndexBlock(ro, nullptr /* prefetch_buffer */, &metaindex,
                                 &metaindex_iter);
   if (s.ok()) {
+    // Print metaindex block checksum
+    DumpBlockChecksumInfo(rep_->footer.metaindex_handle(), ro,
+                          "Metaindex block", out_stream);
+
     for (metaindex_iter->SeekToFirst(); metaindex_iter->Valid();
          metaindex_iter->Next()) {
       s = metaindex_iter->status();
       if (!s.ok()) {
         return s;
       }
+      // Parse block handle from metaindex value
+      BlockHandle block_handle;
+      Slice input = metaindex_iter->value();
+      Status handle_status = block_handle.DecodeFrom(&input);
+
+      if (!handle_status.ok()) {
+        out_stream << "  Skip the block with type "
+                   << metaindex_iter->key().ToString()
+                   << " due to error: " << handle_status.ToString() << "\n\n";
+        continue;
+      }
+
       if (metaindex_iter->key() == kPropertiesBlockName) {
         out_stream << "  Properties block handle: "
                    << metaindex_iter->value().ToString(true) << "\n";
+        DumpBlockChecksumInfo(block_handle, ro, "Properties block", out_stream);
       } else if (metaindex_iter->key() == kCompressionDictBlockName) {
         out_stream << "  Compression dictionary block handle: "
                    << metaindex_iter->value().ToString(true) << "\n";
+        DumpBlockChecksumInfo(block_handle, ro, "Compression dictionary block",
+                              out_stream);
       } else if (strstr(metaindex_iter->key().ToString().c_str(),
                         "filter.rocksdb.") != nullptr) {
         out_stream << "  Filter block handle: "
                    << metaindex_iter->value().ToString(true) << "\n";
+        DumpBlockChecksumInfo(block_handle, ro, "Filter block", out_stream);
       } else if (metaindex_iter->key() == kRangeDelBlockName) {
         out_stream << "  Range deletion block handle: "
                    << metaindex_iter->value().ToString(true) << "\n";
+        DumpBlockChecksumInfo(block_handle, ro, "Range deletion block",
+                              out_stream);
       }
     }
     out_stream << "\n";
@@ -3346,11 +3379,61 @@ Status BlockBasedTable::DumpTable(WritableFile* out_file,
   return Status::OK();
 }
 
+void BlockBasedTable::DumpBlockChecksumInfo(const BlockHandle& block_handle,
+                                            const ReadOptions& read_options,
+                                            const char* block_name,
+                                            std::ostream& out_stream) const {
+  if (rep_->footer.GetBlockTrailerSize() == 0) {
+    return;
+  }
+
+  size_t block_size = static_cast<size_t>(block_handle.size());
+  size_t block_size_with_trailer = block_size + kBlockTrailerSize;
+  std::unique_ptr<char[]> raw_block(new char[block_size_with_trailer]);
+  Slice raw_block_slice;
+  IOOptions opts;
+  IODebugContext dbg;
+  IOStatus io_s = rep_->file->PrepareIOOptions(read_options, opts, &dbg);
+  if (io_s.ok()) {
+    io_s = rep_->file->Read(opts, block_handle.offset(),
+                            block_size_with_trailer, &raw_block_slice,
+                            raw_block.get(), /*aligned_buf=*/nullptr, &dbg);
+  }
+  if (io_s.ok() && raw_block_slice.size() == block_size_with_trailer) {
+    const char* data = raw_block_slice.data();
+    uint8_t compression_type_byte = static_cast<uint8_t>(data[block_size]);
+    uint32_t stored_checksum = DecodeFixed32(data + block_size + 1);
+    uint32_t modifier = ChecksumModifierForContext(
+        rep_->footer.base_context_checksum(), block_handle.offset());
+    uint32_t actual_checksum = stored_checksum - modifier;
+    out_stream << "  " << block_name << " checksum type: "
+               << static_cast<int>(rep_->footer.checksum_type())
+               << "  checksum value: 0x" << std::hex << actual_checksum
+               << std::dec << "  offset: " << block_handle.offset()
+               << "  size: " << block_size << "  compression type: "
+               << static_cast<int>(compression_type_byte) << "\n";
+  } else {
+    out_stream << "  ERROR: Failed to read " << block_name << " checksum info";
+    if (!io_s.ok()) {
+      out_stream << " - " << io_s.ToString();
+    } else if (raw_block_slice.size() != block_size_with_trailer) {
+      out_stream << " - read " << raw_block_slice.size() << " bytes, expected "
+                 << block_size_with_trailer;
+    }
+    out_stream << "\n";
+  }
+}
+
 Status BlockBasedTable::DumpIndexBlock(std::ostream& out_stream) {
   out_stream << "Index Details:\n"
                 "--------------------------------------\n";
   // TODO: plumb Env::IOActivity, Env::IOPriority
   const ReadOptions read_options;
+
+  // Print index block checksum information
+  DumpBlockChecksumInfo(rep_->index_handle, read_options, "Index block",
+                        out_stream);
+
   std::unique_ptr<InternalIteratorBase<IndexValue>> blockhandles_iter(
       NewIndexIterator(read_options, /*disable_prefix_seek=*/false,
                        /*input_iter=*/nullptr, /*get_context=*/nullptr,
@@ -3433,6 +3516,10 @@ Status BlockBasedTable::DumpDataBlocks(std::ostream& out_stream,
 
     out_stream << "Data Block # " << block_id << " @ "
                << blockhandles_iter->value().handle.ToString(true) << "\n";
+
+    // Read block checksum information
+    DumpBlockChecksumInfo(bh, read_options, "Data block", out_stream);
+
     out_stream << "--------------------------------------\n";
 
     std::unique_ptr<InternalIterator> datablock_iter;
diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h
index b1dfa0c7e0c9..4663a83d5721 100644
--- a/table/block_based/block_based_table_reader.h
+++ b/table/block_based/block_based_table_reader.h
@@ -555,6 +555,10 @@ class BlockBasedTable : public TableReader {
   void DumpKeyValue(const Slice& key, const Slice& value,
                     std::ostream& out_stream,
                     bool show_sequence_number_type = false);
+  void DumpBlockChecksumInfo(const BlockHandle& block_handle,
+                             const ReadOptions& read_options,
+                             const char* block_name,
+                             std::ostream& out_stream) const;
 
   // Returns false if prefix_extractor exists and is compatible with that used
   // in building the table file, otherwise true.

From 256838180eeb11a6c72c5879a55f8f705e3b908c Mon Sep 17 00:00:00 2001
From: Xingbo Wang <xbw@meta.com>
Date: Wed, 14 Jan 2026 06:44:01 -0800
Subject: [PATCH 422/500] Fix stress test deadlock failure in TestPut (#14235)

Summary:
Deadlock or timeout is possible in TestPut, when TestMultiGet was executed at the same time, because it executes MaybeAddKeyToTxnForRYW, which writes to the same key space but does not acquire stress test level mutex. Therefore, RocksDB could return deadlock error.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14235

Test Plan: Stress test

Reviewed By: hx235

Differential Revision: D90621772

Pulled By: xingbowang

fbshipit-source-id: eb808193ded06b69a8161320f88d5ba4e20b4901
---
 db_stress_tool/no_batched_ops_stress.cc | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/db_stress_tool/no_batched_ops_stress.cc b/db_stress_tool/no_batched_ops_stress.cc
index 471c24a64ce8..becda50ec3e8 100644
--- a/db_stress_tool/no_batched_ops_stress.cc
+++ b/db_stress_tool/no_batched_ops_stress.cc
@@ -1904,6 +1904,17 @@ class NonBatchedOpsStressTest : public StressTest {
     } while (!s.ok() && IsErrorInjectedAndRetryable(s) &&
              initial_wal_write_may_succeed);
 
+    if ((s.IsDeadlock() || s.IsTimedOut()) &&
+        (FLAGS_use_multiget || FLAGS_use_multi_get_entity)) {
+      // Deadlock or timeout is ok, when multi get is tested. Because multi get
+      // tests execute MaybeAddKeyToTxnForRYW function which writes to the
+      // same key space but does not acquire stress test level mutex. So it is
+      // possible RocksDB returns deadlock or timeout. Return OK() for these
+      // cases
+      pending_expected_value.Rollback();
+      return Status::OK();
+    }
+
     if (!s.ok()) {
       pending_expected_value.Rollback();
       if (IsErrorInjectedAndRetryable(s)) {

From 57036b68d90c55b49fb6755f24c21e9ba99eb057 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Wed, 14 Jan 2026 09:35:16 -0800
Subject: [PATCH 423/500] Migrate blob handling to new compression APIs
 (#14234)

Summary:
as part of the effort to get rid of OLD_CompressData and OLD_UncompressData and the old implementations in compression.h.

It's unfortunate the the existing blob file schema doesn't allow storing blobs uncompressed when the compressed version is larger, so we have to work around that.

Note that use of GrowableBuffer in place of std::string is intended to avoid the potential performance overhead of zeroing out memory before overwriting it.

Also includes some cleanup of includes

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14234

Test Plan:
some unit test updates as needed. Crash test covers integrated blob support.

I'm not too concerned about performance, as until a future schema change, this code is committing the grave performance error of storing compressed data larger than uncompressed.

Reviewed By: mszeszko-meta, hx235

Differential Revision: D90544049

Pulled By: pdillinger

fbshipit-source-id: 2f2ed16de63990b797cc06c8dad36b5869dac302
---
 db/blob/blob_file_builder.cc                  | 45 +++++++-------
 db/blob/blob_file_builder.h                   |  7 ++-
 db/blob/blob_file_builder_test.cc             | 13 ++--
 db/blob/blob_file_reader.cc                   | 62 +++++++++++--------
 db/blob/blob_file_reader.h                    |  6 +-
 db/blob/blob_file_reader_test.cc              | 33 +++++-----
 db/blob/blob_source_test.cc                   | 15 ++---
 .../block_based/block_based_table_builder.cc  |  4 ++
 table/block_based/index_builder.h             |  1 +
 util/compression.cc                           | 36 +++++++++++
 util/compression.h                            | 13 +++-
 util/compression_test.cc                      |  7 ++-
 util/simple_mixed_compressor.cc               |  4 +-
 util/simple_mixed_compressor.h                |  5 +-
 utilities/blob_db/blob_compaction_filter.cc   | 22 ++++---
 utilities/blob_db/blob_db_impl.cc             | 57 +++++++++--------
 utilities/blob_db/blob_db_impl.h              |  6 +-
 17 files changed, 207 insertions(+), 129 deletions(-)

diff --git a/db/blob/blob_file_builder.cc b/db/blob/blob_file_builder.cc
index 3a32269d8eb0..5e71c8a38236 100644
--- a/db/blob/blob_file_builder.cc
+++ b/db/blob/blob_file_builder.cc
@@ -67,6 +67,16 @@ BlobFileBuilder::BlobFileBuilder(
       min_blob_size_(mutable_cf_options->min_blob_size),
       blob_file_size_(mutable_cf_options->blob_file_size),
       blob_compression_type_(mutable_cf_options->blob_compression_type),
+      // TODO: support most CompressionOptions with a new CF option
+      // blob_compression_opts
+      // TODO with schema change: support custom compression manager and options
+      // such as max_compressed_bytes_per_kb
+      // NOTE: returns nullptr for kNoCompression
+      blob_compressor_(GetBuiltinV2CompressionManager()->GetCompressor(
+          CompressionOptions{}, blob_compression_type_)),
+      blob_compressor_wa_(blob_compressor_
+                              ? blob_compressor_->ObtainWorkingArea()
+                              : Compressor::ManagedWorkingArea{}),
       prepopulate_blob_cache_(mutable_cf_options->prepopulate_blob_cache),
       file_options_(file_options),
       write_options_(write_options),
@@ -113,7 +123,7 @@ Status BlobFileBuilder::Add(const Slice& key, const Slice& value,
   }
 
   Slice blob = value;
-  std::string compressed_blob;
+  GrowableBuffer compressed_blob;
 
   {
     const Status s = CompressBlobIfNeeded(&blob, &compressed_blob);
@@ -254,36 +264,27 @@ Status BlobFileBuilder::OpenBlobFileIfNeeded() {
 }
 
 Status BlobFileBuilder::CompressBlobIfNeeded(
-    Slice* blob, std::string* compressed_blob) const {
+    Slice* blob, GrowableBuffer* compressed_blob) const {
   assert(blob);
   assert(compressed_blob);
   assert(compressed_blob->empty());
   assert(immutable_options_);
 
-  if (blob_compression_type_ == kNoCompression) {
+  if (!blob_compressor_) {
+    assert(blob_compression_type_ == kNoCompression);
     return Status::OK();
   }
+  assert(blob_compression_type_ != kNoCompression);
 
-  // TODO: allow user CompressionOptions, including max_compressed_bytes_per_kb
-  CompressionOptions opts;
-  CompressionContext context(blob_compression_type_, opts);
+  // WART: always stored as compressed even when that increases the size.
 
-  CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(),
-                       blob_compression_type_);
-
-  constexpr uint32_t compression_format_version = 2;
-
-  bool success = false;
-
-  {
-    StopWatch stop_watch(immutable_options_->clock, immutable_options_->stats,
-                         BLOB_DB_COMPRESSION_MICROS);
-    success = OLD_CompressData(*blob, info, compression_format_version,
-                               compressed_blob);
-  }
-
-  if (!success) {
-    return Status::Corruption("Error compressing blob");
+  Status s;
+  StopWatch stop_watch(immutable_options_->clock, immutable_options_->stats,
+                       BLOB_DB_COMPRESSION_MICROS);
+  s = LegacyForceBuiltinCompression(*blob_compressor_, &blob_compressor_wa_,
+                                    *blob, compressed_blob);
+  if (!s.ok()) {
+    return s;
   }
 
   *blob = Slice(*compressed_blob);
diff --git a/db/blob/blob_file_builder.h b/db/blob/blob_file_builder.h
index 6ba7181aa09f..95d55f6bd9b6 100644
--- a/db/blob/blob_file_builder.h
+++ b/db/blob/blob_file_builder.h
@@ -10,12 +10,14 @@
 #include <string>
 #include <vector>
 
+#include "rocksdb/advanced_compression.h"
 #include "rocksdb/advanced_options.h"
 #include "rocksdb/compression_type.h"
 #include "rocksdb/env.h"
 #include "rocksdb/options.h"
 #include "rocksdb/rocksdb_namespace.h"
 #include "rocksdb/types.h"
+#include "util/aligned_buffer.h"
 
 namespace ROCKSDB_NAMESPACE {
 
@@ -76,7 +78,8 @@ class BlobFileBuilder {
  private:
   bool IsBlobFileOpen() const;
   Status OpenBlobFileIfNeeded();
-  Status CompressBlobIfNeeded(Slice* blob, std::string* compressed_blob) const;
+  Status CompressBlobIfNeeded(Slice* blob,
+                              GrowableBuffer* compressed_blob) const;
   Status WriteBlobToFile(const Slice& key, const Slice& blob,
                          uint64_t* blob_file_number, uint64_t* blob_offset);
   Status CloseBlobFile();
@@ -91,6 +94,8 @@ class BlobFileBuilder {
   uint64_t min_blob_size_;
   uint64_t blob_file_size_;
   CompressionType blob_compression_type_;
+  std::unique_ptr<Compressor> blob_compressor_;
+  mutable Compressor::ManagedWorkingArea blob_compressor_wa_;
   PrepopulateBlobCache prepopulate_blob_cache_;
   const FileOptions* file_options_;
   const WriteOptions* write_options_;
diff --git a/db/blob/blob_file_builder_test.cc b/db/blob/blob_file_builder_test.cc
index 0981029d09de..c7b830717998 100644
--- a/db/blob/blob_file_builder_test.cc
+++ b/db/blob/blob_file_builder_test.cc
@@ -457,11 +457,12 @@ TEST_F(BlobFileBuilderTest, CompressionError) {
       nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/,
       BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions);
 
-  SyncPoint::GetInstance()->SetCallBack("CompressData:TamperWithReturnValue",
-                                        [](void* arg) {
-                                          bool* ret = static_cast<bool*>(arg);
-                                          *ret = false;
-                                        });
+  SyncPoint::GetInstance()->SetCallBack(
+      "LegacyForceBuiltinCompression:TamperWithStatus", [](void* arg) {
+        Status* ret = static_cast<Status*>(arg);
+        ASSERT_OK(*ret);
+        *ret = Status::Corruption("Tampered result");
+      });
   SyncPoint::GetInstance()->EnableProcessing();
 
   constexpr char key[] = "1";
@@ -469,7 +470,7 @@ TEST_F(BlobFileBuilderTest, CompressionError) {
 
   std::string blob_index;
 
-  ASSERT_TRUE(builder.Add(key, value, &blob_index).IsCorruption());
+  ASSERT_EQ(builder.Add(key, value, &blob_index).code(), Status::kCorruption);
 
   SyncPoint::GetInstance()->DisableProcessing();
   SyncPoint::GetInstance()->ClearAllCallBacks();
diff --git a/db/blob/blob_file_reader.cc b/db/blob/blob_file_reader.cc
index 447f090b5070..3f419c5a0814 100644
--- a/db/blob/blob_file_reader.cc
+++ b/db/blob/blob_file_reader.cc
@@ -17,10 +17,10 @@
 #include "rocksdb/file_system.h"
 #include "rocksdb/slice.h"
 #include "rocksdb/status.h"
+#include "table/format.h"
 #include "table/multiget_context.h"
 #include "test_util/sync_point.h"
 #include "util/compression.h"
-#include "util/crc32c.h"
 #include "util/stop_watch.h"
 
 namespace ROCKSDB_NAMESPACE {
@@ -69,9 +69,16 @@ Status BlobFileReader::Create(
     }
   }
 
-  blob_file_reader->reset(
-      new BlobFileReader(std::move(file_reader), file_size, compression_type,
-                         immutable_options.clock, statistics));
+  std::shared_ptr<Decompressor> decompressor;
+  if (compression_type != kNoCompression) {
+    // The blob format has always used compression format 2
+    decompressor = GetBuiltinV2CompressionManager()->GetDecompressorOptimizeFor(
+        compression_type);
+  }
+
+  blob_file_reader->reset(new BlobFileReader(
+      std::move(file_reader), file_size, compression_type,
+      std::move(decompressor), immutable_options.clock, statistics));
 
   return Status::OK();
 }
@@ -282,11 +289,13 @@ Status BlobFileReader::ReadFromFile(const RandomAccessFileReader* file_reader,
 
 BlobFileReader::BlobFileReader(
     std::unique_ptr<RandomAccessFileReader>&& file_reader, uint64_t file_size,
-    CompressionType compression_type, SystemClock* clock,
+    CompressionType compression_type,
+    std::shared_ptr<Decompressor> decompressor, SystemClock* clock,
     Statistics* statistics)
     : file_reader_(std::move(file_reader)),
       file_size_(file_size),
       compression_type_(compression_type),
+      decompressor_(std::move(decompressor)),
       clock_(clock),
       statistics_(statistics) {
   assert(file_reader_);
@@ -375,8 +384,9 @@ Status BlobFileReader::GetBlob(
   const Slice value_slice(record_slice.data() + adjustment, value_size);
 
   {
-    const Status s = UncompressBlobIfNeeded(
-        value_slice, compression_type, allocator, clock_, statistics_, result);
+    const Status s = UncompressBlobIfNeeded(value_slice, compression_type,
+                                            decompressor_.get(), allocator,
+                                            clock_, statistics_, result);
     if (!s.ok()) {
       return s;
     }
@@ -524,9 +534,9 @@ void BlobFileReader::MultiGetBlob(
 
     // Uncompress blob if needed
     Slice value_slice(record_slice.data() + adjustments[i], req->len);
-    *req->status =
-        UncompressBlobIfNeeded(value_slice, compression_type_, allocator,
-                               clock_, statistics_, &blob_reqs[i].second);
+    *req->status = UncompressBlobIfNeeded(
+        value_slice, compression_type_, decompressor_.get(), allocator, clock_,
+        statistics_, &blob_reqs[i].second);
     if (req->status->ok()) {
       total_bytes += record_slice.size();
     }
@@ -583,8 +593,8 @@ Status BlobFileReader::VerifyBlob(const Slice& record_slice,
 
 Status BlobFileReader::UncompressBlobIfNeeded(
     const Slice& value_slice, CompressionType compression_type,
-    MemoryAllocator* allocator, SystemClock* clock, Statistics* statistics,
-    std::unique_ptr<BlobContents>* result) {
+    Decompressor* decompressor, MemoryAllocator* allocator, SystemClock* clock,
+    Statistics* statistics, std::unique_ptr<BlobContents>* result) {
   assert(result);
 
   if (compression_type == kNoCompression) {
@@ -593,31 +603,33 @@ Status BlobFileReader::UncompressBlobIfNeeded(
     return Status::OK();
   }
 
-  UncompressionContext context(compression_type);
-  UncompressionInfo info(context, UncompressionDict::GetEmptyDict(),
-                         compression_type);
+  assert(decompressor);
 
-  size_t uncompressed_size = 0;
-  constexpr uint32_t compression_format_version = 2;
+  Decompressor::Args args;
+  args.compression_type = compression_type;
+  args.compressed_data = value_slice;
 
-  CacheAllocationPtr output;
+  Status s = decompressor->ExtractUncompressedSize(args);
+  if (!s.ok()) {
+    return Status::Corruption(s.ToString());
+  }
+
+  CacheAllocationPtr output = AllocateBlock(args.uncompressed_size, allocator);
 
   {
     PERF_TIMER_GUARD(blob_decompress_time);
     StopWatch stop_watch(clock, statistics, BLOB_DB_DECOMPRESSION_MICROS);
-    output = OLD_UncompressData(info, value_slice.data(), value_slice.size(),
-                                &uncompressed_size, compression_format_version,
-                                allocator);
+    s = decompressor->DecompressBlock(args, output.get());
   }
 
   TEST_SYNC_POINT_CALLBACK(
-      "BlobFileReader::UncompressBlobIfNeeded:TamperWithResult", &output);
+      "BlobFileReader::UncompressBlobIfNeeded:TamperWithResult", &s);
 
-  if (!output) {
-    return Status::Corruption("Unable to uncompress blob");
+  if (!s.ok()) {
+    return Status::Corruption(s.ToString());
   }
 
-  result->reset(new BlobContents(std::move(output), uncompressed_size));
+  result->reset(new BlobContents(std::move(output), args.uncompressed_size));
 
   return Status::OK();
 }
diff --git a/db/blob/blob_file_reader.h b/db/blob/blob_file_reader.h
index fa8aa501d45f..e13e3380302a 100644
--- a/db/blob/blob_file_reader.h
+++ b/db/blob/blob_file_reader.h
@@ -10,6 +10,7 @@
 
 #include "db/blob/blob_read_request.h"
 #include "file/random_access_file_reader.h"
+#include "rocksdb/advanced_compression.h"
 #include "rocksdb/compression_type.h"
 #include "rocksdb/rocksdb_namespace.h"
 #include "util/autovector.h"
@@ -64,7 +65,8 @@ class BlobFileReader {
  private:
   BlobFileReader(std::unique_ptr<RandomAccessFileReader>&& file_reader,
                  uint64_t file_size, CompressionType compression_type,
-                 SystemClock* clock, Statistics* statistics);
+                 std::shared_ptr<Decompressor> decompressor, SystemClock* clock,
+                 Statistics* statistics);
 
   static Status OpenFile(const ImmutableOptions& immutable_options,
                          const FileOptions& file_opts,
@@ -96,6 +98,7 @@ class BlobFileReader {
 
   static Status UncompressBlobIfNeeded(const Slice& value_slice,
                                        CompressionType compression_type,
+                                       Decompressor* decompressor,
                                        MemoryAllocator* allocator,
                                        SystemClock* clock,
                                        Statistics* statistics,
@@ -104,6 +107,7 @@ class BlobFileReader {
   std::unique_ptr<RandomAccessFileReader> file_reader_;
   uint64_t file_size_;
   CompressionType compression_type_;
+  std::shared_ptr<Decompressor> decompressor_;
   SystemClock* clock_;
   Statistics* statistics_;
 };
diff --git a/db/blob/blob_file_reader_test.cc b/db/blob/blob_file_reader_test.cc
index 6297dd461c80..0e98d2619b02 100644
--- a/db/blob/blob_file_reader_test.cc
+++ b/db/blob/blob_file_reader_test.cc
@@ -65,7 +65,7 @@ void WriteBlobFile(const ImmutableOptions& immutable_options,
 
   ASSERT_OK(blob_log_writer.WriteHeader(WriteOptions(), header));
 
-  std::vector<std::string> compressed_blobs(num);
+  std::vector<GrowableBuffer> compressed_blobs(num);
   std::vector<Slice> blobs_to_write(num);
   if (kNoCompression == compression) {
     for (size_t i = 0; i < num; ++i) {
@@ -73,16 +73,13 @@ void WriteBlobFile(const ImmutableOptions& immutable_options,
       blob_sizes[i] = blobs[i].size();
     }
   } else {
-    CompressionOptions opts;
-    CompressionContext context(compression, opts);
-    CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(),
-                         compression);
-
-    constexpr uint32_t compression_format_version = 2;
+    auto compressor =
+        GetBuiltinV2CompressionManager()->GetCompressor({}, compression);
 
     for (size_t i = 0; i < num; ++i) {
-      ASSERT_TRUE(OLD_CompressData(blobs[i], info, compression_format_version,
-                                   &compressed_blobs[i]));
+      ASSERT_OK(LegacyForceBuiltinCompression(*compressor,
+                                              /*working_area=*/nullptr,
+                                              blobs[i], &compressed_blobs[i]));
       blobs_to_write[i] = compressed_blobs[i];
       blob_sizes[i] = compressed_blobs[i].size();
     }
@@ -809,11 +806,10 @@ TEST_F(BlobFileReaderTest, UncompressionError) {
 
   SyncPoint::GetInstance()->SetCallBack(
       "BlobFileReader::UncompressBlobIfNeeded:TamperWithResult", [](void* arg) {
-        CacheAllocationPtr* const output =
-            static_cast<CacheAllocationPtr*>(arg);
-        assert(output);
+        auto* result = static_cast<Status*>(arg);
+        assert(result);
 
-        output->reset();
+        *result = Status::Corruption("Injected result");
       });
 
   SyncPoint::GetInstance()->EnableProcessing();
@@ -824,11 +820,12 @@ TEST_F(BlobFileReaderTest, UncompressionError) {
   std::unique_ptr<BlobContents> value;
   uint64_t bytes_read = 0;
 
-  ASSERT_TRUE(reader
-                  ->GetBlob(ReadOptions(), key, blob_offset, blob_size,
-                            kSnappyCompression, prefetch_buffer, allocator,
-                            &value, &bytes_read)
-                  .IsCorruption());
+  ASSERT_EQ(reader
+                ->GetBlob(ReadOptions(), key, blob_offset, blob_size,
+                          kSnappyCompression, prefetch_buffer, allocator,
+                          &value, &bytes_read)
+                .code(),
+            Status::Code::kCorruption);
   ASSERT_EQ(value, nullptr);
   ASSERT_EQ(bytes_read, 0);
 
diff --git a/db/blob/blob_source_test.cc b/db/blob/blob_source_test.cc
index 01c61ac5e6d3..07c47ee50256 100644
--- a/db/blob/blob_source_test.cc
+++ b/db/blob/blob_source_test.cc
@@ -67,7 +67,7 @@ void WriteBlobFile(const ImmutableOptions& immutable_options,
 
   ASSERT_OK(blob_log_writer.WriteHeader(WriteOptions(), header));
 
-  std::vector<std::string> compressed_blobs(num);
+  std::vector<GrowableBuffer> compressed_blobs(num);
   std::vector<Slice> blobs_to_write(num);
   if (kNoCompression == compression) {
     for (size_t i = 0; i < num; ++i) {
@@ -75,16 +75,13 @@ void WriteBlobFile(const ImmutableOptions& immutable_options,
       blob_sizes[i] = blobs[i].size();
     }
   } else {
-    CompressionOptions opts;
-    CompressionContext context(compression, opts);
-    CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(),
-                         compression);
-
-    constexpr uint32_t compression_format_version = 2;
+    auto compressor =
+        GetBuiltinV2CompressionManager()->GetCompressor({}, compression);
 
     for (size_t i = 0; i < num; ++i) {
-      ASSERT_TRUE(OLD_CompressData(blobs[i], info, compression_format_version,
-                                   &compressed_blobs[i]));
+      ASSERT_OK(LegacyForceBuiltinCompression(*compressor,
+                                              /*working_area=*/nullptr,
+                                              blobs[i], &compressed_blobs[i]));
       blobs_to_write[i] = compressed_blobs[i];
       blob_sizes[i] = compressed_blobs[i].size();
     }
diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc
index c4e6895a2163..a4864912984f 100644
--- a/table/block_based/block_based_table_builder.cc
+++ b/table/block_based/block_based_table_builder.cc
@@ -1911,6 +1911,10 @@ Status BlockBasedTableBuilder::CompressAndVerifyBlock(
       assert(type == kNoCompression ||
              r->table_options.verify_compression == (verify_decomp != nullptr));
 
+      TEST_SYNC_POINT_CALLBACK(
+          "BlockBasedTableBuilder::CompressAndVerifyBlock:TamperWithResultType",
+          &type);
+
       // Some of the compression algorithms are known to be unreliable. If
       // the verify_compression flag is set then try to de-compress the
       // compressed data and compare to the input.
diff --git a/table/block_based/index_builder.h b/table/block_based/index_builder.h
index fea042b1f8be..a33935c051d3 100644
--- a/table/block_based/index_builder.h
+++ b/table/block_based/index_builder.h
@@ -20,6 +20,7 @@
 #include "table/block_based/block_builder.h"
 #include "table/block_based/flush_block_policy_impl.h"
 #include "table/format.h"
+#include "util/atomic.h"
 
 namespace ROCKSDB_NAMESPACE {
 // The interface for building index.
diff --git a/util/compression.cc b/util/compression.cc
index 30b7e8b09e1d..3cde7c4c32ac 100644
--- a/util/compression.cc
+++ b/util/compression.cc
@@ -1776,4 +1776,40 @@ const std::shared_ptr<CompressionManager>& GetBuiltinV2CompressionManager() {
 // END built-in implementation of customization interface
 // ***********************************************************************
 
+Status LegacyForceBuiltinCompression(
+    Compressor& builtin_compressor,
+    Compressor::ManagedWorkingArea* working_area, Slice from,
+    GrowableBuffer* to) {
+  // For legacy cases that store compressed data even when it's larger than the
+  // uncompressed data (!!!), we need a reliable upper bound on the compressed
+  // size. This is based on consulting various algorithms documentation etc.
+  // and adding ~4 bytes for encoded uncompressed size. (Snappy is the worst
+  // case for multiplicative overhead at n + n/6, bounded by 19*n/16 to avoid
+  // costly division. Bzip2 is the worst case for additive overhead at 600
+  // bytes.)
+  size_t n = from.size();
+  size_t upper_bound = ((19 * n) >> 4) + 604;
+  // The upper bound has only been established considering built-in compression
+  // types through kZSTD. (Might need updating if this fails.)
+  assert(builtin_compressor.GetPreferredCompressionType() <= kZSTD);
+
+  to->ResetForSize(upper_bound);
+  CompressionType actual_type = kNoCompression;
+  Status s = builtin_compressor.CompressBlock(
+      from, to->data(), &to->MutableSize(), &actual_type, working_area);
+  TEST_SYNC_POINT_CALLBACK("LegacyForceBuiltinCompression:TamperWithStatus",
+                           &s);
+
+  if (!s.ok()) {
+    return s;
+  }
+  if (actual_type == kNoCompression) {
+    // abort in debug builds
+    assert(actual_type != kNoCompression);
+    return Status::Corruption("Compression unexpectedly declined or aborted");
+  }
+  assert(actual_type == builtin_compressor.GetPreferredCompressionType());
+  return Status::OK();
+}
+
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/util/compression.h b/util/compression.h
index c99bbba4d0d9..ff261d3ad513 100644
--- a/util/compression.h
+++ b/util/compression.h
@@ -27,7 +27,7 @@
 #include "rocksdb/options.h"
 #include "table/block_based/block_type.h"
 #include "test_util/sync_point.h"
-#include "util/atomic.h"
+#include "util/aligned_buffer.h"
 #include "util/cast_util.h"
 #include "util/coding.h"
 #include "util/compression_context_cache.h"
@@ -1831,6 +1831,17 @@ const std::shared_ptr<CompressionManager>& GetBuiltinCompressionManager(
 // END built-in implementation of customization interface
 // ***********************************************************************
 
+// The new compression APIs intentionally make it difficult to generate
+// compressed data larger than the original. (It is better to store the
+// uncompressed version in that case.) For legacy cases that must store
+// compressed data even when larger than the uncompressed, this is a convenient
+// wrapper to support that, with a compressor from BuiltinCompressionManager and
+// a GrowableBuffer.
+Status LegacyForceBuiltinCompression(
+    Compressor& builtin_compressor,
+    Compressor::ManagedWorkingArea* working_area, Slice from,
+    GrowableBuffer* to);
+
 // Records the compression type for subsequent WAL records.
 class CompressionTypeRecord {
  public:
diff --git a/util/compression_test.cc b/util/compression_test.cc
index 06571f233bf3..c40503b00ed9 100644
--- a/util/compression_test.cc
+++ b/util/compression_test.cc
@@ -829,9 +829,10 @@ TEST_P(CompressionFailuresTest, CompressionFailures) {
 
   if (compression_failure_type_ == kTestCompressionFail) {
     ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
-        "CompressData:TamperWithReturnValue", [](void* arg) {
-          bool* ret = static_cast<bool*>(arg);
-          *ret = false;
+        "BlockBasedTableBuilder::CompressAndVerifyBlock:TamperWithResultType",
+        [](void* arg) {
+          CompressionType* ret = static_cast<CompressionType*>(arg);
+          *ret = kNoCompression;
         });
   } else if (compression_failure_type_ == kTestDecompressionFail) {
     ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
diff --git a/util/simple_mixed_compressor.cc b/util/simple_mixed_compressor.cc
index 381cf2ec52c4..73e09e0ee6b0 100644
--- a/util/simple_mixed_compressor.cc
+++ b/util/simple_mixed_compressor.cc
@@ -15,8 +15,7 @@
 namespace ROCKSDB_NAMESPACE {
 
 // MultiCompressorWrapper implementation
-MultiCompressorWrapper::MultiCompressorWrapper(const CompressionOptions& opts,
-                                               CompressionDict&& dict)
+MultiCompressorWrapper::MultiCompressorWrapper(const CompressionOptions& opts)
     : opts_(opts) {
   // TODO: make the compression manager a field
   auto builtInManager = GetBuiltinV2CompressionManager();
@@ -27,7 +26,6 @@ MultiCompressorWrapper::MultiCompressorWrapper(const CompressionOptions& opts,
     }
     compressors_.push_back(builtInManager->GetCompressor(opts, type));
   }
-  (void)dict;
 }
 
 size_t MultiCompressorWrapper::GetMaxSampleSizeIfWantDict(
diff --git a/util/simple_mixed_compressor.h b/util/simple_mixed_compressor.h
index 79ba7b130c86..0d435394db05 100644
--- a/util/simple_mixed_compressor.h
+++ b/util/simple_mixed_compressor.h
@@ -10,15 +10,14 @@
 #include <memory>
 #include <vector>
 
-#include "compression.h"
 #include "rocksdb/advanced_compression.h"
+#include "util/atomic.h"
 
 namespace ROCKSDB_NAMESPACE {
 
 class MultiCompressorWrapper : public Compressor {
  public:
-  explicit MultiCompressorWrapper(const CompressionOptions& opts,
-                                  CompressionDict&& dict = {});
+  explicit MultiCompressorWrapper(const CompressionOptions& opts);
 
   size_t GetMaxSampleSizeIfWantDict(CacheEntryRole block_type) const override;
   Slice GetSerializedDict() const override;
diff --git a/utilities/blob_db/blob_compaction_filter.cc b/utilities/blob_db/blob_compaction_filter.cc
index 1ab292f2534a..9201c53ad9ef 100644
--- a/utilities/blob_db/blob_compaction_filter.cc
+++ b/utilities/blob_db/blob_compaction_filter.cc
@@ -123,10 +123,14 @@ CompactionFilter::Decision BlobIndexCompactionFilterBase::HandleValueChange(
     return Decision::kIOError;
   }
   Slice new_blob_value(*new_value);
-  std::string compression_output;
+  GrowableBuffer compressed_output;
   if (blob_db_impl->bdb_options_.compression != kNoCompression) {
-    new_blob_value =
-        blob_db_impl->GetCompressedSlice(new_blob_value, &compression_output);
+    Status s = blob_db_impl->CompressBlob(new_blob_value, &compressed_output);
+    if (!s.ok()) {
+      // Best approximation
+      return Decision::kIOError;
+    }
+    new_blob_value = compressed_output.AsSlice();
   }
   uint64_t new_blob_file_number = 0;
   uint64_t new_blob_offset = 0;
@@ -336,7 +340,7 @@ CompactionFilter::BlobDecision BlobIndexCompactionFilterGC::PrepareBlobOutput(
   assert(blob_db_impl->bdb_options_.enable_garbage_collection);
 
   BlobIndex blob_index;
-  const Status s = blob_index.DecodeFrom(existing_value);
+  Status s = blob_index.DecodeFrom(existing_value);
   if (!s.ok()) {
     gc_stats_.SetError();
     return BlobDecision::kCorruption;
@@ -369,7 +373,7 @@ CompactionFilter::BlobDecision BlobIndexCompactionFilterGC::PrepareBlobOutput(
 
   PinnableSlice blob;
   CompressionType compression_type = kNoCompression;
-  std::string compression_output;
+  GrowableBuffer compressed_output;
   if (!ReadBlobFromOldFile(key, blob_index, &blob, false, &compression_type)) {
     gc_stats_.SetError();
     return BlobDecision::kIOError;
@@ -387,9 +391,11 @@ CompactionFilter::BlobDecision BlobIndexCompactionFilterGC::PrepareBlobOutput(
       }
     }
     if (blob_db_impl->bdb_options_.compression != kNoCompression) {
-      blob_db_impl->GetCompressedSlice(blob, &compression_output);
-      blob = PinnableSlice(&compression_output);
-      blob.PinSelf();
+      s = blob_db_impl->CompressBlob(blob, &compressed_output);
+      if (!s.ok()) {
+        return BlobDecision::kCorruption;
+      }
+      blob.PinSelf(compressed_output.AsSlice());
     }
   }
 
diff --git a/utilities/blob_db/blob_db_impl.cc b/utilities/blob_db/blob_db_impl.cc
index 7bc20f7bf5d2..58d51471cd98 100644
--- a/utilities/blob_db/blob_db_impl.cc
+++ b/utilities/blob_db/blob_db_impl.cc
@@ -41,10 +41,6 @@
 #include "utilities/blob_db/blob_db_iterator.h"
 #include "utilities/blob_db/blob_db_listener.h"
 
-namespace {
-int kBlockBasedTableVersionFormat = 2;
-}  // end namespace
-
 namespace ROCKSDB_NAMESPACE::blob_db {
 
 bool BlobFileComparator::operator()(
@@ -87,7 +83,10 @@ BlobDBImpl::BlobDBImpl(const std::string& dbname,
       live_sst_size_(0),
       fifo_eviction_seq_(0),
       evict_expiration_up_to_(0),
-      debug_level_(0) {
+      debug_level_(0),
+      // NOTE: returns nullptr for kNoCompression
+      blob_compressor_(GetBuiltinV2CompressionManager()->GetCompressor(
+          CompressionOptions{}, bdb_options_.compression)) {
   clock_ = env_->GetSystemClock().get();
   blob_dir_ = (bdb_options_.path_relative)
                   ? dbname + "/" + bdb_options_.blob_dir
@@ -1087,18 +1086,32 @@ Status BlobDBImpl::PutBlobValue(const WriteOptions& write_options,
       RecordTick(statistics_, BLOB_DB_WRITE_INLINED_TTL);
     }
   } else {
-    std::string compression_output;
-    Slice value_compressed = GetCompressedSlice(value, &compression_output);
+    GrowableBuffer compression_output;
+    Slice value_maybe_compressed;
+    if (blob_compressor_) {
+      assert(bdb_options_.compression != kNoCompression);
+      assert(bdb_options_.compression ==
+             blob_compressor_->GetPreferredCompressionType());
+      s = CompressBlob(value, &compression_output);
+      if (!s.ok()) {
+        return s;
+      }
+      value_maybe_compressed = compression_output.AsSlice();
+    } else {
+      assert(bdb_options_.compression == kNoCompression);
+      value_maybe_compressed = value;
+    }
 
     std::string headerbuf;
-    BlobLogWriter::ConstructBlobHeader(&headerbuf, key, value_compressed,
+    BlobLogWriter::ConstructBlobHeader(&headerbuf, key, value_maybe_compressed,
                                        expiration);
 
     // Check DB size limit before selecting blob file to
     // Since CheckSizeAndEvictBlobFiles() can close blob files, it needs to be
     // done before calling SelectBlobFile().
     s = CheckSizeAndEvictBlobFiles(
-        write_options, headerbuf.size() + key.size() + value_compressed.size());
+        write_options,
+        headerbuf.size() + key.size() + value_maybe_compressed.size());
     if (!s.ok()) {
       return s;
     }
@@ -1112,8 +1125,8 @@ Status BlobDBImpl::PutBlobValue(const WriteOptions& write_options,
     if (s.ok()) {
       assert(blob_file != nullptr);
       assert(blob_file->GetCompressionType() == bdb_options_.compression);
-      s = AppendBlob(write_options, blob_file, headerbuf, key, value_compressed,
-                     expiration, &index_entry);
+      s = AppendBlob(write_options, blob_file, headerbuf, key,
+                     value_maybe_compressed, expiration, &index_entry);
     }
     if (s.ok()) {
       if (expiration != kNoExpiration) {
@@ -1150,26 +1163,16 @@ Status BlobDBImpl::PutBlobValue(const WriteOptions& write_options,
   return s;
 }
 
-Slice BlobDBImpl::GetCompressedSlice(const Slice& raw,
-                                     std::string* compression_output) const {
-  if (bdb_options_.compression == kNoCompression) {
-    return raw;
-  }
+Status BlobDBImpl::CompressBlob(const Slice& raw,
+                                GrowableBuffer* compression_output) const {
   StopWatch compression_sw(clock_, statistics_, BLOB_DB_COMPRESSION_MICROS);
-  CompressionType type = bdb_options_.compression;
-  CompressionOptions opts;
-  CompressionContext context(type, opts);
-  CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(), type);
-  OLD_CompressData(raw, info,
-                   GetCompressFormatForVersion(kBlockBasedTableVersionFormat),
-                   compression_output);
-  return *compression_output;
+  return LegacyForceBuiltinCompression(
+      *blob_compressor_, /*working_area=*/nullptr, raw, compression_output);
 }
 
 Decompressor& BlobDecompressor() {
-  static auto mgr = GetBuiltinCompressionManager(
-      GetCompressFormatForVersion(kBlockBasedTableVersionFormat));
-  static auto decompressor = mgr->GetDecompressor();
+  static auto decompressor =
+      GetBuiltinV2CompressionManager()->GetDecompressor();
 
   return *decompressor;
 }
diff --git a/utilities/blob_db/blob_db_impl.h b/utilities/blob_db/blob_db_impl.h
index 42eefd0149b0..b19c546f4848 100644
--- a/utilities/blob_db/blob_db_impl.h
+++ b/utilities/blob_db/blob_db_impl.h
@@ -234,8 +234,8 @@ class BlobDBImpl : public BlobDB {
                             PinnableSlice* value,
                             CompressionType* compression_type);
 
-  Slice GetCompressedSlice(const Slice& raw,
-                           std::string* compression_output) const;
+  Status CompressBlob(const Slice& raw,
+                      GrowableBuffer* compression_output) const;
 
   Status DecompressSlice(const Slice& compressed_value,
                          CompressionType compression_type,
@@ -507,6 +507,8 @@ class BlobDBImpl : public BlobDB {
   int disable_file_deletions_ = 0;
 
   uint32_t debug_level_;
+
+  std::unique_ptr<Compressor> blob_compressor_;
 };
 
 Decompressor& BlobDecompressor();

From a1af6f9f6456b1d1f2aa3e61ad8ec94e676c8512 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Wed, 14 Jan 2026 09:35:54 -0800
Subject: [PATCH 424/500] Use new compression APIs internally for
 sample_for_compression (#14230)

Summary:
Trying to get rid of uses of OLD_CompressData / OLD_UncompressData. Some performance optimizations and corrections for better accounting also.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14230

Test Plan:
* exanded unit test to be more complete / rigorous
* manual before-and-after db_bench runs with the option, seeing table properties as expected

Reviewed By: hx235

Differential Revision: D90545476

Pulled By: pdillinger

fbshipit-source-id: 2f7c577574bcc4b2acafa002761ec1cad7fdb093
---
 db/db_properties_test.cc                      | 39 +++++---
 .../block_based/block_based_table_builder.cc  | 88 +++++++++++--------
 2 files changed, 79 insertions(+), 48 deletions(-)

diff --git a/db/db_properties_test.cc b/db/db_properties_test.cc
index 771c2dc8d881..160a5fcd774e 100644
--- a/db/db_properties_test.cc
+++ b/db/db_properties_test.cc
@@ -1519,16 +1519,14 @@ TEST_F(DBPropertiesTest, NeedCompactHintPersistentTest) {
 
 // Excluded from RocksDB lite tests due to `GetPropertiesOfAllTables()` usage.
 TEST_F(DBPropertiesTest, BlockAddForCompressionSampling) {
-  // Sampled compression requires at least one of the following four types.
-  if (!Snappy_Supported() && !Zlib_Supported() && !LZ4_Supported() &&
-      !ZSTD_Supported()) {
-    return;
-  }
-
   Options options = CurrentOptions();
   options.disable_auto_compactions = true;
   options.table_properties_collector_factories.emplace_back(
       std::make_shared<BlockCountingTablePropertiesCollectorFactory>());
+  options.compression = kNoCompression;
+
+  bool fast_sampling_supported = Snappy_Supported() || LZ4_Supported();
+  bool slow_sampling_supported = ZSTD_Supported() || Zlib_Supported();
 
   for (bool sample_for_compression : {false, true}) {
     // For simplicity/determinism, sample 100% when enabled, or 0% when disabled
@@ -1542,10 +1540,11 @@ TEST_F(DBPropertiesTest, BlockAddForCompressionSampling) {
     // L1_0 ["a", "b"]
     //
     // L0_0 was created by flush. L1_0 was created by compaction. Each file
-    // contains one data block.
+    // contains one data block with enough data to be compressible.
     for (int i = 0; i < 3; ++i) {
-      ASSERT_OK(Put("a", "val"));
-      ASSERT_OK(Put("b", "val"));
+      for (int j = 0; j < 50; ++j) {
+        ASSERT_OK(Put(std::to_string(j), "thisismyvalue"));
+      }
       ASSERT_OK(Flush());
       if (i == 1) {
         ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
@@ -1558,13 +1557,33 @@ TEST_F(DBPropertiesTest, BlockAddForCompressionSampling) {
     ASSERT_OK(db_->GetPropertiesOfAllTables(&file_to_props));
     ASSERT_EQ(2, file_to_props.size());
     for (const auto& file_and_props : file_to_props) {
-      auto& user_props = file_and_props.second->user_collected_properties;
+      auto& props = *file_and_props.second;
+      auto& user_props = props.user_collected_properties;
       ASSERT_TRUE(user_props.find(BlockCountingTablePropertiesCollector::
                                       kNumSampledBlocksPropertyName) !=
                   user_props.end());
       ASSERT_EQ(user_props.at(BlockCountingTablePropertiesCollector::
                                   kNumSampledBlocksPropertyName),
                 std::to_string(sample_for_compression ? 1 : 0));
+      if (sample_for_compression) {
+        EXPECT_GT(props.fast_compression_estimated_data_size, 0);
+        EXPECT_GT(props.slow_compression_estimated_data_size, 0);
+        if (fast_sampling_supported) {
+          EXPECT_LT(props.fast_compression_estimated_data_size,
+                    props.data_size);
+          if (slow_sampling_supported) {
+            EXPECT_LT(props.slow_compression_estimated_data_size,
+                      props.fast_compression_estimated_data_size);
+          }
+        }
+        if (slow_sampling_supported) {
+          EXPECT_LT(props.slow_compression_estimated_data_size,
+                    props.data_size);
+        }
+      } else {
+        EXPECT_EQ(props.fast_compression_estimated_data_size, 0);
+        EXPECT_EQ(props.slow_compression_estimated_data_size, 0);
+      }
     }
   }
 }
diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc
index a4864912984f..9c11e7e7253e 100644
--- a/table/block_based/block_based_table_builder.cc
+++ b/table/block_based/block_based_table_builder.cc
@@ -842,6 +842,9 @@ struct BlockBasedTableBuilder::Rep {
 
   // A compressor for blocks in general, without dictionary compression
   std::unique_ptr<Compressor> basic_compressor;
+  // Built-in compressors for compression size sampling
+  std::unique_ptr<Compressor> fast_sample_compressor;
+  std::unique_ptr<Compressor> slow_sample_compressor;
   // A compressor for data blocks, which might be tuned differently and might
   // use dictionary compression (when applicable). See ~Rep() for some details.
   UnownedPtr<Compressor> data_block_compressor = nullptr;
@@ -1163,6 +1166,23 @@ struct BlockBasedTableBuilder::Rep {
       }
     }
 
+    if (sample_for_compression > 0) {
+      auto builtin = GetBuiltinCompressionManager(
+          GetCompressFormatForVersion(table_opt.format_version));
+      if (builtin->SupportsCompressionType(kLZ4Compression)) {
+        fast_sample_compressor = builtin->GetCompressor({}, kLZ4Compression);
+      } else if (builtin->SupportsCompressionType(kSnappyCompression)) {
+        fast_sample_compressor = builtin->GetCompressor({}, kSnappyCompression);
+      }
+      if (builtin->SupportsCompressionType(kZSTD)) {
+        slow_sample_compressor = builtin->GetCompressor({}, kZSTD);
+      } else if (builtin->SupportsCompressionType(kZlibCompression)) {
+        slow_sample_compressor = builtin->GetCompressor({}, kZlibCompression);
+      }
+      // NOTE: even if both sampling compressors are nullptr, we still populate
+      // the table properties with placeholder info
+    }
+
     switch (table_options.prepopulate_block_cache) {
       case BlockBasedTableOptions::PrepopulateBlockCache::kFlushOnly:
         warm_cache = (reason == TableFileCreationReason::kFlush);
@@ -1586,51 +1606,43 @@ void BlockBasedTableBuilder::Flush(const Slice* first_key_in_next_block) {
   if (r->sample_for_compression > 0 &&
       Random::GetTLSInstance()->OneIn(
           static_cast<int>(r->sample_for_compression))) {
-    std::string sampled_output_fast;
-    std::string sampled_output_slow;
+    GrowableBuffer sampled_output;
+    sampled_output.ResetForSize(uncompressed_block_data.size());
+    size_t fast_size = uncompressed_block_data.size();
+    size_t slow_size = uncompressed_block_data.size();
 
     // Sampling with a fast compression algorithm
-    if (LZ4_Supported() || Snappy_Supported()) {
-      CompressionType c =
-          LZ4_Supported() ? kLZ4Compression : kSnappyCompression;
-      CompressionOptions options;
-      CompressionContext context(c, options);
-      CompressionInfo info_tmp(options, context,
-                               CompressionDict::GetEmptyDict(), c);
-
-      OLD_CompressData(
-          uncompressed_block_data, info_tmp,
-          GetCompressFormatForVersion(r->table_options.format_version),
-          &sampled_output_fast);
+    if (r->fast_sample_compressor) {
+      CompressionType result_type = kNoCompression;
+      Status s = r->fast_sample_compressor->CompressBlock(
+          uncompressed_block_data, sampled_output.data(), &fast_size,
+          &result_type, /*working_area=*/nullptr);
+      if (!s.ok() || result_type == kNoCompression) {
+        // For accounting, fall back on no compression
+        fast_size = uncompressed_block_data.size();
+      }
     }
 
     // Sampling with a slow but high-compression algorithm
-    if (ZSTD_Supported() || Zlib_Supported()) {
-      CompressionType c = ZSTD_Supported() ? kZSTD : kZlibCompression;
-      CompressionOptions options;
-      CompressionContext context(c, options);
-      CompressionInfo info_tmp(options, context,
-                               CompressionDict::GetEmptyDict(), c);
-
-      OLD_CompressData(
-          uncompressed_block_data, info_tmp,
-          GetCompressFormatForVersion(r->table_options.format_version),
-          &sampled_output_slow);
-    }
-
-    if (sampled_output_slow.size() > 0 || sampled_output_fast.size() > 0) {
-      // Currently compression sampling is only enabled for data block.
-      r->sampled_input_data_bytes.FetchAddRelaxed(
-          uncompressed_block_data.size());
-      r->sampled_output_slow_data_bytes.FetchAddRelaxed(
-          sampled_output_slow.size());
-      r->sampled_output_fast_data_bytes.FetchAddRelaxed(
-          sampled_output_fast.size());
+    if (r->slow_sample_compressor) {
+      CompressionType result_type = kNoCompression;
+      Status s = r->slow_sample_compressor->CompressBlock(
+          uncompressed_block_data, sampled_output.data(), &slow_size,
+          &result_type, /*working_area=*/nullptr);
+      if (!s.ok() || result_type == kNoCompression) {
+        // For accounting, fall back on no compression
+        slow_size = uncompressed_block_data.size();
+      }
     }
 
-    NotifyCollectTableCollectorsOnBlockAdd(
-        r->table_properties_collectors, uncompressed_block_data.size(),
-        sampled_output_slow.size(), sampled_output_fast.size());
+    // NOTE: Currently compression sampling is only enabled for data block.
+    r->sampled_input_data_bytes.FetchAddRelaxed(uncompressed_block_data.size());
+    r->sampled_output_slow_data_bytes.FetchAddRelaxed(slow_size);
+    r->sampled_output_fast_data_bytes.FetchAddRelaxed(fast_size);
+
+    NotifyCollectTableCollectorsOnBlockAdd(r->table_properties_collectors,
+                                           uncompressed_block_data.size(),
+                                           slow_size, fast_size);
   } else {
     NotifyCollectTableCollectorsOnBlockAdd(
         r->table_properties_collectors, uncompressed_block_data.size(),

From c6d08d3efe0686a90b5ca877ca9577b8e1032f68 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Fri, 16 Jan 2026 10:01:42 -0800
Subject: [PATCH 425/500] Use new compression APIs in db_bench (#14241)

Summary:
To move away from OLD_CompressData / OLD_UncompressData. Also improved some error/warning messages.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14241

Test Plan: manual tests showing similar performance, runs with ASAN/UBSAN to check for issues

Reviewed By: hx235

Differential Revision: D90793708

Pulled By: pdillinger

fbshipit-source-id: e0655f7bed8d85e5ea110167dca73c6664f7465b
---
 tools/db_bench_tool.cc | 164 +++++++++++++++++++++++++++--------------
 1 file changed, 109 insertions(+), 55 deletions(-)

diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc
index 16033434f564..f2c2798695c5 100644
--- a/tools/db_bench_tool.cc
+++ b/tools/db_bench_tool.cc
@@ -47,6 +47,7 @@
 #include "options/cf_options.h"
 #include "port/port.h"
 #include "port/stack_trace.h"
+#include "rocksdb/advanced_compression.h"
 #include "rocksdb/cache.h"
 #include "rocksdb/convenience.h"
 #include "rocksdb/db.h"
@@ -2930,12 +2931,18 @@ class Benchmark {
     return true;
   }
 
-  inline bool CompressSlice(const CompressionInfo& compression_info,
-                            const Slice& input, std::string* compressed) {
-    constexpr uint32_t compress_format_version = 2;
-
-    return OLD_CompressData(input, compression_info, compress_format_version,
-                            compressed);
+  std::unique_ptr<Compressor> GetCompressor() {
+    CompressionOptions opts;
+    opts.level = FLAGS_compression_level;
+    // TODO: inter-operate with FLAGS_compression_manager
+    auto compressor = GetBuiltinV2CompressionManager()->GetCompressor(
+        opts, FLAGS_compression_type_e);
+    if (compressor &&
+        compressor->GetPreferredCompressionType() != FLAGS_compression_type_e) {
+      // For benchmarking, don't fall back on a different compression type
+      compressor.reset();
+    }
+    return compressor;
   }
 
   void PrintHeader(const Options& options) {
@@ -3021,18 +3028,30 @@ class Benchmark {
       // The test string should not be too small.
       const int len = FLAGS_block_size;
       std::string input_str(len, 'y');
-      std::string compressed;
-      CompressionOptions opts;
-      CompressionContext context(FLAGS_compression_type_e, opts);
-      CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(),
-                           FLAGS_compression_type_e);
-      bool result = CompressSlice(info, Slice(input_str), &compressed);
-
-      if (!result) {
-        fprintf(stdout, "WARNING: %s compression is not enabled\n",
-                compression);
-      } else if (compressed.size() >= input_str.size()) {
-        fprintf(stdout, "WARNING: %s compression is not effective\n",
+      auto compressor = GetCompressor();
+      if (compressor) {
+        GrowableBuffer compressed;
+        compressed.ResetForSize(input_str.size());
+        CompressionType actual_type = kNoCompression;
+        auto working_area = compressor->ObtainWorkingArea();
+        Status s = compressor->CompressBlock(
+            Slice(input_str), compressed.data(), &compressed.MutableSize(),
+            &actual_type, &working_area);
+        if (!s.ok()) {
+          fprintf(stdout, "WARNING: compression test run failure: %s\n",
+                  s.ToString().c_str());
+        } else if (actual_type == kNoCompression) {
+          fprintf(stdout,
+                  "WARNING: %s compression is not effective or declined\n",
+                  compression);
+        } else if (actual_type != FLAGS_compression_type_e) {
+          fprintf(
+              stdout,
+              "WARNING: using %s compression in place of %s (unsupported?)\n",
+              CompressionTypeToString(actual_type).c_str(), compression);
+        }
+      } else {
+        fprintf(stdout, "WARNING: %s compression is not available\n",
                 compression);
       }
     }
@@ -4248,24 +4267,37 @@ class Benchmark {
     Slice input = gen.Generate(FLAGS_block_size);
     int64_t bytes = 0;
     int64_t produced = 0;
-    bool ok = true;
-    std::string compressed;
-    CompressionOptions opts;
-    opts.level = FLAGS_compression_level;
-    CompressionContext context(FLAGS_compression_type_e, opts);
-    CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(),
-                         FLAGS_compression_type_e);
+    Status s;
+
+    auto compressor = GetCompressor();
+    if (!compressor) {
+      thread->stats.AddMessage("(compression type not supported)");
+      return;
+    }
+    auto working_area = compressor->ObtainWorkingArea();
+
+    GrowableBuffer compressed;
     // Compress 1G
-    while (ok && bytes < int64_t(1) << 30) {
-      compressed.clear();
-      ok = CompressSlice(info, input, &compressed);
+    while (bytes < int64_t(1) << 30) {
+      compressed.ResetForSize(input.size());
+      CompressionType actual_type = kNoCompression;
+      s = compressor->CompressBlock(input, compressed.data(),
+                                    &compressed.MutableSize(), &actual_type,
+                                    &working_area);
+      if (UNLIKELY(!s.ok())) {
+        break;
+      }
+      if (UNLIKELY(actual_type == kNoCompression)) {
+        s = Status::Aborted("Unable to compress smaller than input");
+        break;
+      }
       produced += compressed.size();
       bytes += input.size();
       thread->stats.FinishedOps(nullptr, nullptr, 1, kCompress);
     }
 
-    if (!ok) {
-      thread->stats.AddMessage("(compression failure)");
+    if (!s.ok()) {
+      thread->stats.AddMessage("(compression failure: " + s.ToString() + ")");
     } else {
       char buf[340];
       snprintf(buf, sizeof(buf), "(output: %.1f%%)",
@@ -4278,37 +4310,59 @@ class Benchmark {
   void Uncompress(ThreadState* thread) {
     RandomGenerator gen;
     Slice input = gen.Generate(FLAGS_block_size);
-    std::string compressed;
-
-    CompressionOptions compression_opts;
-    compression_opts.level = FLAGS_compression_level;
-    CompressionContext compression_ctx(FLAGS_compression_type_e,
-                                       compression_opts);
-    CompressionInfo compression_info(compression_opts, compression_ctx,
-                                     CompressionDict::GetEmptyDict(),
-                                     FLAGS_compression_type_e);
-    UncompressionContext uncompression_ctx(FLAGS_compression_type_e);
-    UncompressionInfo uncompression_info(uncompression_ctx,
-                                         UncompressionDict::GetEmptyDict(),
-                                         FLAGS_compression_type_e);
-
-    bool ok = CompressSlice(compression_info, input, &compressed);
-    int64_t bytes = 0;
-    size_t uncompressed_size = 0;
-    while (ok && bytes < 1024 * 1048576) {
-      constexpr uint32_t compress_format_version = 2;
 
-      CacheAllocationPtr uncompressed = OLD_UncompressData(
-          uncompression_info, compressed.data(), compressed.size(),
-          &uncompressed_size, compress_format_version);
+    auto compressor = GetCompressor();
+    if (!compressor) {
+      thread->stats.AddMessage("(compression type not supported)");
+      return;
+    }
 
-      ok = uncompressed.get() != nullptr;
+    // Compress the input first
+    GrowableBuffer compressed;
+    compressed.ResetForSize(input.size());
+    CompressionType actual_type = kNoCompression;
+    Status s = compressor->CompressBlock(
+        input, compressed.data(), &compressed.MutableSize(), &actual_type,
+        /*working_area=*/nullptr);
+    if (!s.ok()) {
+      thread->stats.AddMessage("(compression failure: " + s.ToString() + ")");
+      return;
+    }
+    if (actual_type != FLAGS_compression_type_e) {
+      thread->stats.AddMessage("(failed to compress smaller than input)");
+      return;
+    }
+
+    // TODO: inter-operate with FLAGS_compression_manager
+    auto decompressor =
+        GetBuiltinV2CompressionManager()->GetDecompressorOptimizeFor(
+            actual_type);
+    auto decomp_working_area = decompressor->ObtainWorkingArea(actual_type);
+
+    int64_t bytes = 0;
+    while (bytes < 1024 * 1048576) {
+      Decompressor::Args args;
+      args.compression_type = actual_type;
+      args.compressed_data = compressed.AsSlice();
+      args.working_area = &decomp_working_area;
+
+      s = decompressor->ExtractUncompressedSize(args);
+      if (UNLIKELY(!s.ok())) {
+        break;
+      }
+
+      CacheAllocationPtr uncompressed = AllocateBlock(args.uncompressed_size,
+                                                      /*allocator=*/nullptr);
+      s = decompressor->DecompressBlock(args, uncompressed.get());
+      if (UNLIKELY(!s.ok())) {
+        break;
+      }
       bytes += input.size();
       thread->stats.FinishedOps(nullptr, nullptr, 1, kUncompress);
     }
 
-    if (!ok) {
-      thread->stats.AddMessage("(compression failure)");
+    if (!s.ok()) {
+      thread->stats.AddMessage("(decompression failure: " + s.ToString() + ")");
     } else {
       thread->stats.AddBytes(bytes);
     }

From 88aff40c97bec8c4bb6a836ced5cb7111892e52b Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Fri, 16 Jan 2026 11:01:37 -0800
Subject: [PATCH 426/500] New io stats for unknown file temperature last vs.
 non-last (#14243)

Summary:
These will be useful for qualifying non-tiered workloads for tiered storage.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14243

Test Plan:
unit test included

I'm not concerned about performance because this fits pretty nicely into some existing code and only adds overhead when (expensive) IOs are done.

Reviewed By: jaykorean

Differential Revision: D90870348

Pulled By: pdillinger

fbshipit-source-id: 984411123bcd54c249a949da813ff04fedacc6a4
---
 db/db_test2.cc                    | 81 ++++++++++++++++++++++++++++++
 file/random_access_file_reader.cc | 82 ++++++++++++++++++-------------
 include/rocksdb/iostats_context.h | 13 +++++
 3 files changed, 141 insertions(+), 35 deletions(-)

diff --git a/db/db_test2.cc b/db/db_test2.cc
index 33da1ffaf12f..67230d846a29 100644
--- a/db/db_test2.cc
+++ b/db/db_test2.cc
@@ -6544,6 +6544,9 @@ TEST_F(DBTest2, LastLevelStatistics) {
 
     DestroyAndReopen(options);
 
+    get_iostats_context()->Reset();
+    IOStatsContext* iostats = get_iostats_context();
+
     // generate 1 sst on level 0
     ASSERT_OK(Put("foo1", "bar"));
     ASSERT_OK(Put("bar", "bar"));
@@ -6644,9 +6647,87 @@ TEST_F(DBTest2, LastLevelStatistics) {
     // Control
     ASSERT_NE(options.statistics->getTickerCount(LAST_LEVEL_READ_COUNT),
               options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT));
+
+    // Control: unknown temperature iostats should be zero since files have
+    // explicit temperatures (mapped or written)
+    EXPECT_EQ(
+        iostats->file_io_stats_by_temperature.unknown_non_last_level_bytes_read,
+        0);
+    EXPECT_EQ(
+        iostats->file_io_stats_by_temperature.unknown_non_last_level_read_count,
+        0);
+    EXPECT_EQ(
+        iostats->file_io_stats_by_temperature.unknown_last_level_bytes_read, 0);
+    EXPECT_EQ(
+        iostats->file_io_stats_by_temperature.unknown_last_level_read_count, 0);
   }
 }
 
+// Test the iostats for files with Temperature::kUnknown that is not mapped
+// to another temperature. These stats are used to indicate which non-tiered
+// workloads are most promising for tiering (so this test doesn't set
+// temperatures).
+TEST_F(DBTest2, UnknownLastLevelStatistics) {
+  Options options = CurrentOptions();
+  options.statistics = CreateDBStatistics();
+  BlockBasedTableOptions bbto;
+  bbto.no_block_cache = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+  DestroyAndReopen(options);
+
+  get_iostats_context()->Reset();
+  IOStatsContext* iostats = get_iostats_context();
+
+  // Generate 1 sst file on level 0 with kUnknown temperature
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Flush());
+
+  // Read from the kUnknown file on non-last level
+  ASSERT_EQ("bar", Get("foo"));
+
+  // Verify unknown_non_last_level stats are populated
+  EXPECT_GT(
+      iostats->file_io_stats_by_temperature.unknown_non_last_level_bytes_read,
+      0);
+  EXPECT_GT(
+      iostats->file_io_stats_by_temperature.unknown_non_last_level_read_count,
+      0);
+  // No reads from last level yet
+  EXPECT_EQ(iostats->file_io_stats_by_temperature.unknown_last_level_bytes_read,
+            0);
+  EXPECT_EQ(iostats->file_io_stats_by_temperature.unknown_last_level_read_count,
+            0);
+
+  // Compact to the last level (level 6) explicitly using MoveFilesToLevel
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  MoveFilesToLevel(6);
+
+  // Reopen DB to ensure table cache is cleared and files are re-opened
+  // with correct is_last_level flag
+  Reopen(options);
+
+  // Reset iostats to measure only the following reads
+  get_iostats_context()->Reset();
+
+  // Read from the file now on last level (still kUnknown since
+  // last_level_temperature is not set)
+  ASSERT_EQ("bar", Get("foo"));
+
+  // Verify unknown_last_level stats are populated
+  EXPECT_GT(iostats->file_io_stats_by_temperature.unknown_last_level_bytes_read,
+            0);
+  EXPECT_GT(iostats->file_io_stats_by_temperature.unknown_last_level_read_count,
+            0);
+  // No new reads from non-last level
+  EXPECT_EQ(
+      iostats->file_io_stats_by_temperature.unknown_non_last_level_bytes_read,
+      0);
+  EXPECT_EQ(
+      iostats->file_io_stats_by_temperature.unknown_non_last_level_read_count,
+      0);
+}
+
 TEST_F(DBTest2, CheckpointFileTemperature) {
   class NoLinkTestFS : public FileTemperatureTestFS {
     using FileTemperatureTestFS::FileTemperatureTestFS;
diff --git a/file/random_access_file_reader.cc b/file/random_access_file_reader.cc
index f96609a01df3..ba376249d9da 100644
--- a/file/random_access_file_reader.cc
+++ b/file/random_access_file_reader.cc
@@ -66,41 +66,53 @@ inline void RecordIOStats(Statistics* stats, Temperature file_temperature,
   }
 
   // record for temperature file
-  if (file_temperature != Temperature::kUnknown) {
-    switch (file_temperature) {
-      case Temperature::kHot:
-        IOSTATS_ADD(file_io_stats_by_temperature.hot_file_bytes_read, size);
-        IOSTATS_ADD(file_io_stats_by_temperature.hot_file_read_count, 1);
-        RecordTick(stats, HOT_FILE_READ_BYTES, size);
-        RecordTick(stats, HOT_FILE_READ_COUNT, 1);
-        break;
-      case Temperature::kWarm:
-        IOSTATS_ADD(file_io_stats_by_temperature.warm_file_bytes_read, size);
-        IOSTATS_ADD(file_io_stats_by_temperature.warm_file_read_count, 1);
-        RecordTick(stats, WARM_FILE_READ_BYTES, size);
-        RecordTick(stats, WARM_FILE_READ_COUNT, 1);
-        break;
-      case Temperature::kCool:
-        IOSTATS_ADD(file_io_stats_by_temperature.cool_file_bytes_read, size);
-        IOSTATS_ADD(file_io_stats_by_temperature.cool_file_read_count, 1);
-        RecordTick(stats, COOL_FILE_READ_BYTES, size);
-        RecordTick(stats, COOL_FILE_READ_COUNT, 1);
-        break;
-      case Temperature::kCold:
-        IOSTATS_ADD(file_io_stats_by_temperature.cold_file_bytes_read, size);
-        IOSTATS_ADD(file_io_stats_by_temperature.cold_file_read_count, 1);
-        RecordTick(stats, COLD_FILE_READ_BYTES, size);
-        RecordTick(stats, COLD_FILE_READ_COUNT, 1);
-        break;
-      case Temperature::kIce:
-        IOSTATS_ADD(file_io_stats_by_temperature.ice_file_bytes_read, size);
-        IOSTATS_ADD(file_io_stats_by_temperature.ice_file_read_count, 1);
-        RecordTick(stats, ICE_FILE_READ_BYTES, size);
-        RecordTick(stats, ICE_FILE_READ_COUNT, 1);
-        break;
-      default:
-        break;
-    }
+  switch (file_temperature) {
+    case Temperature::kHot:
+      IOSTATS_ADD(file_io_stats_by_temperature.hot_file_bytes_read, size);
+      IOSTATS_ADD(file_io_stats_by_temperature.hot_file_read_count, 1);
+      RecordTick(stats, HOT_FILE_READ_BYTES, size);
+      RecordTick(stats, HOT_FILE_READ_COUNT, 1);
+      break;
+    case Temperature::kWarm:
+      IOSTATS_ADD(file_io_stats_by_temperature.warm_file_bytes_read, size);
+      IOSTATS_ADD(file_io_stats_by_temperature.warm_file_read_count, 1);
+      RecordTick(stats, WARM_FILE_READ_BYTES, size);
+      RecordTick(stats, WARM_FILE_READ_COUNT, 1);
+      break;
+    case Temperature::kCool:
+      IOSTATS_ADD(file_io_stats_by_temperature.cool_file_bytes_read, size);
+      IOSTATS_ADD(file_io_stats_by_temperature.cool_file_read_count, 1);
+      RecordTick(stats, COOL_FILE_READ_BYTES, size);
+      RecordTick(stats, COOL_FILE_READ_COUNT, 1);
+      break;
+    case Temperature::kCold:
+      IOSTATS_ADD(file_io_stats_by_temperature.cold_file_bytes_read, size);
+      IOSTATS_ADD(file_io_stats_by_temperature.cold_file_read_count, 1);
+      RecordTick(stats, COLD_FILE_READ_BYTES, size);
+      RecordTick(stats, COLD_FILE_READ_COUNT, 1);
+      break;
+    case Temperature::kIce:
+      IOSTATS_ADD(file_io_stats_by_temperature.ice_file_bytes_read, size);
+      IOSTATS_ADD(file_io_stats_by_temperature.ice_file_read_count, 1);
+      RecordTick(stats, ICE_FILE_READ_BYTES, size);
+      RecordTick(stats, ICE_FILE_READ_COUNT, 1);
+      break;
+    case Temperature::kUnknown:
+      if (is_last_level) {
+        IOSTATS_ADD(file_io_stats_by_temperature.unknown_last_level_bytes_read,
+                    size);
+        IOSTATS_ADD(file_io_stats_by_temperature.unknown_last_level_read_count,
+                    1);
+      } else {
+        IOSTATS_ADD(
+            file_io_stats_by_temperature.unknown_non_last_level_bytes_read,
+            size);
+        IOSTATS_ADD(
+            file_io_stats_by_temperature.unknown_non_last_level_read_count, 1);
+      }
+      break;
+    default:
+      break;
   }
 }
 
diff --git a/include/rocksdb/iostats_context.h b/include/rocksdb/iostats_context.h
index c9ebad1b7043..8fce6181c0b4 100644
--- a/include/rocksdb/iostats_context.h
+++ b/include/rocksdb/iostats_context.h
@@ -38,6 +38,10 @@ struct FileIOByTemperature {
   uint64_t cold_file_bytes_read;
   // the number of bytes read to Temperature::kIce file
   uint64_t ice_file_bytes_read;
+  // the number of bytes read to Temperature::kUnknown file not in last level
+  uint64_t unknown_non_last_level_bytes_read;
+  // the number of bytes read to Temperature::kUnknown file in last level
+  uint64_t unknown_last_level_bytes_read;
   // total number of reads to Temperature::kHot file
   uint64_t hot_file_read_count;
   // total number of reads to Temperature::kWarm file
@@ -48,6 +52,11 @@ struct FileIOByTemperature {
   uint64_t cold_file_read_count;
   // total number of reads to Temperature::kIce file
   uint64_t ice_file_read_count;
+  // total number of reads to Temperature::kUnknown file not in last level
+  uint64_t unknown_non_last_level_read_count;
+  // total number of reads to Temperature::kUnknown file in last level
+  uint64_t unknown_last_level_read_count;
+
   // reset all the statistics to 0.
   void Reset() {
     hot_file_bytes_read = 0;
@@ -55,11 +64,15 @@ struct FileIOByTemperature {
     cool_file_bytes_read = 0;
     cold_file_bytes_read = 0;
     ice_file_bytes_read = 0;
+    unknown_non_last_level_bytes_read = 0;
+    unknown_last_level_bytes_read = 0;
     hot_file_read_count = 0;
     warm_file_read_count = 0;
     cool_file_read_count = 0;
     cold_file_read_count = 0;
     ice_file_read_count = 0;
+    unknown_non_last_level_read_count = 0;
+    unknown_last_level_read_count = 0;
   }
 };
 

From 2a7a6a6d72bc18da6bef62ee5bdf7b8fd249fb19 Mon Sep 17 00:00:00 2001
From: anand76 <anand1976@users.noreply.github.com>
Date: Tue, 20 Jan 2026 12:18:19 -0800
Subject: [PATCH 427/500] Don't assert on async_read.status in MultiScan code
 path (#14244)

Summary:
Surface async read errors instead of asserting on them. This makes it easier to debug stress test failures. Async reads can fail for legitimate reasons, such as fs errors.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14244

Reviewed By: hx235

Differential Revision: D90878515

Pulled By: anand1976

fbshipit-source-id: 6335d4b06ddf250b26842ce94e3f5263356b2695
---
 table/block_based/block_based_table_iterator.cc | 2 --
 1 file changed, 2 deletions(-)

diff --git a/table/block_based/block_based_table_iterator.cc b/table/block_based/block_based_table_iterator.cc
index 0c6fa65834db..e822eb8af13e 100644
--- a/table/block_based/block_based_table_iterator.cc
+++ b/table/block_based/block_based_table_iterator.cc
@@ -1361,7 +1361,6 @@ Status BlockBasedTableIterator::PollForBlock(size_t idx) {
   AsyncReadState& async_read = multi_scan_->async_states[async_idx->second];
   if (async_read.finished) {
     assert(async_read.io_handle == nullptr);
-    assert(async_read.status.ok());
     return async_read.status;
   }
 
@@ -1373,7 +1372,6 @@ Status BlockBasedTableIterator::PollForBlock(size_t idx) {
       return poll_s;
     }
   }
-  assert(async_read.status.ok());
   if (!async_read.status.ok()) {
     return async_read.status;
   }

From ea5e649225737b41db98315c39ebd9cb6ccc988f Mon Sep 17 00:00:00 2001
From: Xingbo Wang <xbw@meta.com>
Date: Tue, 20 Jan 2026 14:10:41 -0800
Subject: [PATCH 428/500] Fix an infinite compaction loop bug with udt (#14228)

Summary:
Problem

The TEST_WaitForCompact in
TimestampCompatibleCompactionTest.UdtTombstoneCollapsingTest would sometimes
run forever, indicating an infinite compaction loop.
Issue https://github.com/facebook/rocksdb/issues/14223

Root Cause

In ComputeBottommostFilesMarkedForCompaction(), files were marked for
bottommost compaction based only on the condition largest_seqno <
oldest_snapshot_seqnum. However, for User-Defined Timestamps (UDT) columns,
compaction can only zero sequence numbers when the file's maximum timestamp is
below full_history_ts_low.

When timestamps were above this threshold:
1. File gets marked for compaction (seqno condition met)
2. Compaction runs but cannot zero seqno (timestamp condition not met)
3. Output file immediately gets re-marked for compaction
4. Infinite loop

Solution

Added timestamp range tracking to FileMetaData and updated the marking logic to
check timestamps before marking files.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14228

Test Plan: Unit test

Reviewed By: pdillinger

Differential Revision: D90586045

Pulled By: xingbowang

fbshipit-source-id: addfa4f988db8c87fb513a1bf58ee54623a6c210
---
 db/builder.cc                                 |  13 +
 db/builder.h                                  |   7 +
 db/column_family.cc                           |   4 +-
 db/compaction/compaction_outputs.cc           |   6 +-
 db/compaction/compaction_picker.cc            |   9 +-
 db/compaction/compaction_picker.h             |  13 +-
 db/compaction/compaction_picker_fifo.cc       |  20 +-
 db/compaction/compaction_picker_fifo.h        |   4 +-
 db/compaction/compaction_picker_level.cc      |  15 +-
 db/compaction/compaction_picker_level.h       |   1 +
 db/compaction/compaction_picker_test.cc       | 404 +++++++++++----
 db/compaction/compaction_picker_universal.cc  |  17 +-
 db/compaction/compaction_picker_universal.h   |   3 +-
 db/db_impl/db_impl.cc                         |   9 +-
 db/db_impl/db_impl_compaction_flush.cc        |  25 +-
 db/db_impl/db_impl_experimental.cc            |   6 +-
 db/db_impl/db_impl_open.cc                    |   3 +-
 db/db_with_timestamp_compaction_test.cc       | 483 ++++++++++++++----
 db/experimental.cc                            |  20 +-
 db/external_sst_file_ingestion_job.cc         |   7 +-
 db/flush_job.cc                               |   3 +-
 db/repair.cc                                  |  20 +-
 db/version_builder_test.cc                    |   3 +-
 db/version_edit.cc                            |  15 +
 db/version_edit.h                             |  37 +-
 db/version_set.cc                             |  51 +-
 db/version_set.h                              |  15 +-
 db/version_set_test.cc                        |  27 +-
 .../fix_udt_infinite_compaction_loop.md       |   1 +
 29 files changed, 938 insertions(+), 303 deletions(-)
 create mode 100644 unreleased_history/bug_fixes/fix_udt_infinite_compaction_loop.md

diff --git a/db/builder.cc b/db/builder.cc
index 14e943f3212e..0ca00a45bd5f 100644
--- a/db/builder.cc
+++ b/db/builder.cc
@@ -56,6 +56,18 @@ TableBuilder* NewTableBuilder(const TableBuilderOptions& tboptions,
   return tboptions.moptions.table_factory->NewTableBuilder(tboptions, file);
 }
 
+void ExtractTimestampFromTableProperties(const TableProperties& tp,
+                                         FileMetaData* meta) {
+  auto min_ts_iter = tp.user_collected_properties.find("rocksdb.timestamp_min");
+  if (min_ts_iter != tp.user_collected_properties.end()) {
+    meta->min_timestamp = min_ts_iter->second;
+  }
+  auto max_ts_iter = tp.user_collected_properties.find("rocksdb.timestamp_max");
+  if (max_ts_iter != tp.user_collected_properties.end()) {
+    meta->max_timestamp = max_ts_iter->second;
+  }
+}
+
 Status BuildTable(
     const std::string& dbname, VersionSet* versions,
     const ImmutableDBOptions& db_options, const TableBuilderOptions& tboptions,
@@ -355,6 +367,7 @@ Status BuildTable(
       assert(meta->fd.GetFileSize() > 0);
       tp = builder
                ->GetTableProperties();  // refresh now that builder is finished
+      ExtractTimestampFromTableProperties(tp, meta);
       if (memtable_payload_bytes != nullptr &&
           memtable_garbage_bytes != nullptr) {
         const CompactionIterationStats& ci_stats = c_iter.iter_stats();
diff --git a/db/builder.h b/db/builder.h
index 93e66c76e0a0..9f83a6f5dc16 100644
--- a/db/builder.h
+++ b/db/builder.h
@@ -41,6 +41,13 @@ class BlobFileCompletionCallback;
 TableBuilder* NewTableBuilder(const TableBuilderOptions& tboptions,
                               WritableFileWriter* file);
 
+// Extract min/max timestamps from table properties and populate FileMetaData.
+// This is used by both flush (BuildTable) and compaction (CompactionOutputs)
+// to populate timestamp range in FileMetaData from the TimestampTableProperties
+// collector output.
+void ExtractTimestampFromTableProperties(const TableProperties& tp,
+                                         FileMetaData* meta);
+
 // Build a Table file from the contents of *iter.  The generated file
 // will be named according to number specified in meta. On success, the rest of
 // *meta will be filled with metadata about the generated table.
diff --git a/db/column_family.cc b/db/column_family.cc
index 406fd09767ac..3a34bae1f653 100644
--- a/db/column_family.cc
+++ b/db/column_family.cc
@@ -1248,7 +1248,7 @@ Compaction* ColumnFamilyData::PickCompaction(
   auto* result = compaction_picker_->PickCompaction(
       GetName(), mutable_options, mutable_db_options, existing_snapshots,
       snapshot_checker, current_->storage_info(), log_buffer,
-      require_max_output_level);
+      GetFullHistoryTsLow(), require_max_output_level);
   if (result != nullptr) {
     result->FinalizeInputInfo(current_);
   }
@@ -1336,7 +1336,7 @@ Compaction* ColumnFamilyData::CompactRange(
       GetName(), mutable_cf_options, mutable_db_options,
       current_->storage_info(), input_level, output_level,
       compact_range_options, begin, end, compaction_end, conflict,
-      max_file_num_to_ignore, trim_ts);
+      max_file_num_to_ignore, trim_ts, GetFullHistoryTsLow());
   if (result != nullptr) {
     result->FinalizeInputInfo(current_);
   }
diff --git a/db/compaction/compaction_outputs.cc b/db/compaction/compaction_outputs.cc
index 34dc5f9ed135..8c86df870dee 100644
--- a/db/compaction/compaction_outputs.cc
+++ b/db/compaction/compaction_outputs.cc
@@ -49,8 +49,10 @@ Status CompactionOutputs::Finish(
     meta->fd.file_size = current_bytes;
     meta->tail_size = builder_->GetTailSize();
     meta->marked_for_compaction = builder_->NeedCompact();
-    meta->user_defined_timestamps_persisted = static_cast<bool>(
-        builder_->GetTableProperties().user_defined_timestamps_persisted);
+    const TableProperties& tp = builder_->GetTableProperties();
+    meta->user_defined_timestamps_persisted =
+        static_cast<bool>(tp.user_defined_timestamps_persisted);
+    ExtractTimestampFromTableProperties(tp, meta);
   }
   current_output().finished = true;
   stats_.bytes_written += current_bytes;
diff --git a/db/compaction/compaction_picker.cc b/db/compaction/compaction_picker.cc
index b92a507ce2d4..230cdd643967 100644
--- a/db/compaction/compaction_picker.cc
+++ b/db/compaction/compaction_picker.cc
@@ -611,7 +611,8 @@ Compaction* CompactionPicker::PickCompactionForCompactRange(
     int input_level, int output_level,
     const CompactRangeOptions& compact_range_options, const InternalKey* begin,
     const InternalKey* end, InternalKey** compaction_end, bool* manual_conflict,
-    uint64_t max_file_num_to_ignore, const std::string& trim_ts) {
+    uint64_t max_file_num_to_ignore, const std::string& trim_ts,
+    const std::string& full_history_ts_low) {
   // CompactionPickerFIFO has its own implementation of compact range
   assert(ioptions_.compaction_style != kCompactionStyleFIFO);
 
@@ -690,7 +691,8 @@ Compaction* CompactionPicker::PickCompactionForCompactRange(
         compact_range_options.blob_garbage_collection_age_cutoff);
 
     RegisterCompaction(c);
-    vstorage->ComputeCompactionScore(ioptions_, mutable_cf_options);
+    vstorage->ComputeCompactionScore(ioptions_, mutable_cf_options,
+                                     full_history_ts_low);
     return c;
   }
 
@@ -887,7 +889,8 @@ Compaction* CompactionPicker::PickCompactionForCompactRange(
   // takes running compactions into account (by skipping files that are already
   // being compacted). Since we just changed compaction score, we recalculate it
   // here
-  vstorage->ComputeCompactionScore(ioptions_, mutable_cf_options);
+  vstorage->ComputeCompactionScore(ioptions_, mutable_cf_options,
+                                   full_history_ts_low);
 
   return compaction;
 }
diff --git a/db/compaction/compaction_picker.h b/db/compaction/compaction_picker.h
index f5cfdb16f4c8..89d5c1841265 100644
--- a/db/compaction/compaction_picker.h
+++ b/db/compaction/compaction_picker.h
@@ -65,7 +65,8 @@ class CompactionPicker {
       const MutableDBOptions& mutable_db_options,
       const std::vector<SequenceNumber>& existing_snapshots,
       const SnapshotChecker* snapshot_checker, VersionStorageInfo* vstorage,
-      LogBuffer* log_buffer, bool require_max_output_level) = 0;
+      LogBuffer* log_buffer, const std::string& full_history_ts_low,
+      bool require_max_output_level = false) = 0;
 
   // The returned Compaction might not include the whole requested range.
   // In that case, compaction_end will be set to the next key that needs
@@ -82,7 +83,8 @@ class CompactionPicker {
       const CompactRangeOptions& compact_range_options,
       const InternalKey* begin, const InternalKey* end,
       InternalKey** compaction_end, bool* manual_conflict,
-      uint64_t max_file_num_to_ignore, const std::string& trim_ts);
+      uint64_t max_file_num_to_ignore, const std::string& trim_ts,
+      const std::string& full_history_ts_low);
 
   // The maximum allowed output level.  Default value is NumberLevels() - 1.
   virtual int MaxOutputLevel() const { return NumberLevels() - 1; }
@@ -284,7 +286,8 @@ class NullCompactionPicker : public CompactionPicker {
       const std::vector<SequenceNumber>& /*existing_snapshots*/,
       const SnapshotChecker* /*snapshot_checker*/,
       VersionStorageInfo* /*vstorage*/, LogBuffer* /* log_buffer */,
-      bool /*require_max_output_level*/ = false) override {
+      const std::string& /*full_history_ts_low*/,
+      bool /*require_max_output_level*/) override {
     return nullptr;
   }
 
@@ -298,8 +301,8 @@ class NullCompactionPicker : public CompactionPicker {
       const CompactRangeOptions& /*compact_range_options*/,
       const InternalKey* /*begin*/, const InternalKey* /*end*/,
       InternalKey** /*compaction_end*/, bool* /*manual_conflict*/,
-      uint64_t /*max_file_num_to_ignore*/,
-      const std::string& /*trim_ts*/) override {
+      uint64_t /*max_file_num_to_ignore*/, const std::string& /*trim_ts*/,
+      const std::string& /*full_history_ts_low*/) override {
     return nullptr;
   }
 
diff --git a/db/compaction/compaction_picker_fifo.cc b/db/compaction/compaction_picker_fifo.cc
index a569fc12a360..a1c4df368d1d 100644
--- a/db/compaction/compaction_picker_fifo.cc
+++ b/db/compaction/compaction_picker_fifo.cc
@@ -419,12 +419,19 @@ Compaction* FIFOCompactionPicker::PickTemperatureChangeCompaction(
   return c;
 }
 
+// The full_history_ts_low parameter is used to control bottommost file marking
+// for compaction when user-defined timestamps (UDT) are enabled.
+
+// TODO leverage full_history_ts_low for FIFO compaction, by trigggerring
+// compaction early for data that has already expired to achieve the goal of TTL
+// enforced compliance.
 Compaction* FIFOCompactionPicker::PickCompaction(
     const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
     const MutableDBOptions& mutable_db_options,
     const std::vector<SequenceNumber>& /* existing_snapshots */,
     const SnapshotChecker* /* snapshot_checker */, VersionStorageInfo* vstorage,
-    LogBuffer* log_buffer, bool /* require_max_output_level*/) {
+    LogBuffer* log_buffer, const std::string& /* full_history_ts_low */,
+    bool /* require_max_output_level*/) {
   Compaction* c = nullptr;
   if (mutable_cf_options.ttl > 0) {
     c = PickTTLCompaction(cf_name, mutable_cf_options, mutable_db_options,
@@ -449,7 +456,8 @@ Compaction* FIFOCompactionPicker::PickCompactionForCompactRange(
     const CompactRangeOptions& /*compact_range_options*/,
     const InternalKey* /*begin*/, const InternalKey* /*end*/,
     InternalKey** compaction_end, bool* /*manual_conflict*/,
-    uint64_t /*max_file_num_to_ignore*/, const std::string& /*trim_ts*/) {
+    uint64_t /*max_file_num_to_ignore*/, const std::string& /*trim_ts*/,
+    const std::string& full_history_ts_low) {
 #ifdef NDEBUG
   (void)input_level;
   (void)output_level;
@@ -458,10 +466,10 @@ Compaction* FIFOCompactionPicker::PickCompactionForCompactRange(
   assert(output_level == 0);
   *compaction_end = nullptr;
   LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, ioptions_.logger);
-  Compaction* c =
-      PickCompaction(cf_name, mutable_cf_options, mutable_db_options,
-                     /*existing_snapshots*/ {}, /*snapshot_checker*/ nullptr,
-                     vstorage, &log_buffer);
+  Compaction* c = PickCompaction(
+      cf_name, mutable_cf_options, mutable_db_options,
+      /*existing_snapshots*/ {}, /*snapshot_checker*/ nullptr, vstorage,
+      &log_buffer, full_history_ts_low, /* require_max_output_level */ false);
   log_buffer.FlushBufferToLog();
   return c;
 }
diff --git a/db/compaction/compaction_picker_fifo.h b/db/compaction/compaction_picker_fifo.h
index f1538506163b..2ddbd54b28ee 100644
--- a/db/compaction/compaction_picker_fifo.h
+++ b/db/compaction/compaction_picker_fifo.h
@@ -24,6 +24,7 @@ class FIFOCompactionPicker : public CompactionPicker {
       const std::vector<SequenceNumber>& /* existing_snapshots */,
       const SnapshotChecker* /* snapshot_checker */,
       VersionStorageInfo* version, LogBuffer* log_buffer,
+      const std::string& /* full_history_ts_low */,
       bool /* require_max_output_level*/ = false) override;
 
   Compaction* PickCompactionForCompactRange(
@@ -33,7 +34,8 @@ class FIFOCompactionPicker : public CompactionPicker {
       const CompactRangeOptions& compact_range_options,
       const InternalKey* begin, const InternalKey* end,
       InternalKey** compaction_end, bool* manual_conflict,
-      uint64_t max_file_num_to_ignore, const std::string& trim_ts) override;
+      uint64_t max_file_num_to_ignore, const std::string& trim_ts,
+      const std::string& full_history_ts_low) override;
 
   // The maximum allowed output level.  Always returns 0.
   int MaxOutputLevel() const override { return 0; }
diff --git a/db/compaction/compaction_picker_level.cc b/db/compaction/compaction_picker_level.cc
index 132c5a72a191..090ad0bbfa7d 100644
--- a/db/compaction/compaction_picker_level.cc
+++ b/db/compaction/compaction_picker_level.cc
@@ -61,14 +61,16 @@ class LevelCompactionBuilder {
                          LogBuffer* log_buffer,
                          const MutableCFOptions& mutable_cf_options,
                          const ImmutableOptions& ioptions,
-                         const MutableDBOptions& mutable_db_options)
+                         const MutableDBOptions& mutable_db_options,
+                         const std::string& full_history_ts_low)
       : cf_name_(cf_name),
         vstorage_(vstorage),
         compaction_picker_(compaction_picker),
         log_buffer_(log_buffer),
         mutable_cf_options_(mutable_cf_options),
         ioptions_(ioptions),
-        mutable_db_options_(mutable_db_options) {}
+        mutable_db_options_(mutable_db_options),
+        full_history_ts_low_(full_history_ts_low) {}
 
   // Pick and return a compaction.
   Compaction* PickCompaction();
@@ -155,6 +157,7 @@ class LevelCompactionBuilder {
   const MutableCFOptions& mutable_cf_options_;
   const ImmutableOptions& ioptions_;
   const MutableDBOptions& mutable_db_options_;
+  const std::string& full_history_ts_low_;
   // Pick a path ID to place a newly generated file, with its level
   static uint32_t GetPathId(const ImmutableCFOptions& ioptions,
                             const MutableCFOptions& mutable_cf_options,
@@ -571,7 +574,8 @@ Compaction* LevelCompactionBuilder::GetCompaction() {
   // takes running compactions into account (by skipping files that are already
   // being compacted). Since we just changed compaction score, we recalculate it
   // here
-  vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_);
+  vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_,
+                                    full_history_ts_low_);
   return c;
 }
 
@@ -976,10 +980,11 @@ Compaction* LevelCompactionPicker::PickCompaction(
     const MutableDBOptions& mutable_db_options,
     const std::vector<SequenceNumber>& /*existing_snapshots */,
     const SnapshotChecker* /*snapshot_checker*/, VersionStorageInfo* vstorage,
-    LogBuffer* log_buffer, bool /* require_max_output_level*/) {
+    LogBuffer* log_buffer, const std::string& full_history_ts_low,
+    bool /* require_max_output_level*/) {
   LevelCompactionBuilder builder(cf_name, vstorage, this, log_buffer,
                                  mutable_cf_options, ioptions_,
-                                 mutable_db_options);
+                                 mutable_db_options, full_history_ts_low);
   return builder.PickCompaction();
 }
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/db/compaction/compaction_picker_level.h b/db/compaction/compaction_picker_level.h
index 34419f279841..e86c821aa309 100644
--- a/db/compaction/compaction_picker_level.h
+++ b/db/compaction/compaction_picker_level.h
@@ -26,6 +26,7 @@ class LevelCompactionPicker : public CompactionPicker {
       const std::vector<SequenceNumber>& /* existing_snapshots */,
       const SnapshotChecker* /* snapshot_checker */,
       VersionStorageInfo* vstorage, LogBuffer* log_buffer,
+      const std::string& full_history_ts_low,
       bool /*require_max_output_level*/ = false) override;
 
   bool NeedsCompaction(const VersionStorageInfo* vstorage) const override;
diff --git a/db/compaction/compaction_picker_test.cc b/db/compaction/compaction_picker_test.cc
index ddc4792d6b56..5260ac5abbfc 100644
--- a/db/compaction/compaction_picker_test.cc
+++ b/db/compaction/compaction_picker_test.cc
@@ -160,11 +160,19 @@ class CompactionPickerTestBase : public testing::Test {
         kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
         kUnknownFileCreationTime, epoch_number, kUnknownFileChecksum,
         kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0,
-        true /* user_defined_timestamps_persisted */);
+        true /* user_defined_timestamps_persisted */, "" /* min timestamp */,
+        "" /* max timestamp */);
     f->compensated_file_size =
         (compensated_file_size != 0) ? compensated_file_size : file_size;
     // oldest_ancester_time is only used if newest_key_time is not available
     f->oldest_ancester_time = oldest_ancestor_time;
+    // Set min/max timestamps for UDT support
+    if (!ts_of_smallest.empty()) {
+      f->min_timestamp = ts_of_smallest.ToString();
+    }
+    if (!ts_of_largest.empty()) {
+      f->max_timestamp = ts_of_largest.ToString();
+    }
     TableProperties tp;
     tp.newest_key_time = newest_key_time;
     f->fd.table_reader = new mock::MockTableReader(mock::KVVector{}, tp);
@@ -195,6 +203,11 @@ class CompactionPickerTestBase : public testing::Test {
   }
 
   void UpdateVersionStorageInfo() {
+    UpdateVersionStorageInfoWithTsLow(/*full_history_ts_low=*/"");
+  }
+
+  void UpdateVersionStorageInfoWithTsLow(
+      const std::string& full_history_ts_low) {
     if (temp_vstorage_) {
       VersionBuilder builder(FileOptions(), &ioptions_, nullptr,
                              vstorage_.get(), nullptr);
@@ -202,7 +215,8 @@ class CompactionPickerTestBase : public testing::Test {
       vstorage_ = std::move(temp_vstorage_);
     }
     vstorage_->PrepareForVersionAppend(ioptions_, mutable_cf_options_);
-    vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_);
+    vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_,
+                                      full_history_ts_low);
     vstorage_->SetFinalized();
   }
 
@@ -242,6 +256,60 @@ class CompactionPickerU64TsTest : public CompactionPickerTestBase {
       : CompactionPickerTestBase(test::BytewiseComparatorWithU64TsWrapper()) {}
 
   ~CompactionPickerU64TsTest() override = default;
+
+ protected:
+  // Helper to create a U64 timestamp string from a uint64_t value
+  static std::string MakeU64Timestamp(uint64_t ts) {
+    std::string result;
+    PutFixed64(&result, ts);
+    return result;
+  }
+
+  // Helper to add a bottommost file with timestamps and setup version storage
+  // for testing bottommost file marking behavior
+  void SetupBottommostFileWithTimestamps(uint64_t min_ts, uint64_t max_ts,
+                                         uint64_t full_history_ts_low_val,
+                                         SequenceNumber oldest_snapshot_seqnum,
+                                         std::string* out_full_history_ts_low) {
+    std::string ts_small = MakeU64Timestamp(min_ts);
+    std::string ts_large = MakeU64Timestamp(max_ts);
+
+    Add(5, 1U, "100", "200", /*file_size=*/1000, /*path_id=*/0,
+        /*smallest_seq=*/10, /*largest_seq=*/40,
+        /*compensated_file_size=*/1000,
+        /*marked_for_compact=*/false, Temperature::kUnknown,
+        kUnknownOldestAncesterTime, kUnknownNewestKeyTime, ts_small, ts_large);
+
+    std::string full_history_ts_low = MakeU64Timestamp(full_history_ts_low_val);
+
+    UpdateVersionStorageInfoWithTsLow(full_history_ts_low);
+
+    vstorage_->UpdateOldestSnapshot(oldest_snapshot_seqnum,
+                                    /*allow_ingest_behind=*/false,
+                                    /*ucmp=*/ucmp_, full_history_ts_low);
+
+    if (out_full_history_ts_low) {
+      *out_full_history_ts_low = full_history_ts_low;
+    }
+  }
+
+  // Helper to add L0 files with timestamps for compaction trigger tests
+  void AddL0FilesWithTimestamps(uint64_t ts1_val, uint64_t ts2_val,
+                                uint64_t file_size = 1U) {
+    std::string ts1 = MakeU64Timestamp(ts1_val);
+    std::string ts2 = MakeU64Timestamp(ts2_val);
+
+    Add(0, 1U, "100", "200", file_size, /*path_id=*/0,
+        /*smallest_seq=*/100, /*largest_seq=*/100,
+        /*compensated_file_size=*/file_size,
+        /*marked_for_compact=*/false, Temperature::kUnknown,
+        kUnknownOldestAncesterTime, kUnknownNewestKeyTime, ts1, ts2);
+    Add(0, 2U, "150", "250", file_size, /*path_id=*/0,
+        /*smallest_seq=*/200, /*largest_seq=*/200,
+        /*compensated_file_size=*/file_size,
+        /*marked_for_compact=*/false, Temperature::kUnknown,
+        kUnknownOldestAncesterTime, kUnknownNewestKeyTime, ts1, ts2);
+  }
 };
 
 TEST_F(CompactionPickerTest, Empty) {
@@ -250,7 +318,7 @@ TEST_F(CompactionPickerTest, Empty) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() == nullptr);
 }
 
@@ -263,7 +331,7 @@ TEST_F(CompactionPickerTest, Single) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() == nullptr);
 }
 
@@ -278,7 +346,7 @@ TEST_F(CompactionPickerTest, Level0Trigger) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(2U, compaction->num_input_files(0));
   ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
@@ -293,7 +361,7 @@ TEST_F(CompactionPickerTest, Level1Trigger) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(1U, compaction->num_input_files(0));
   ASSERT_EQ(66U, compaction->input(0, 0)->fd.GetNumber());
@@ -313,7 +381,7 @@ TEST_F(CompactionPickerTest, Level1Trigger2) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(1U, compaction->num_input_files(0));
   ASSERT_EQ(2U, compaction->num_input_files(1));
@@ -346,7 +414,7 @@ TEST_F(CompactionPickerTest, LevelMaxScore) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(1U, compaction->num_input_files(0));
   ASSERT_EQ(7U, compaction->input(0, 0)->fd.GetNumber());
@@ -395,7 +463,7 @@ TEST_F(CompactionPickerTest, Level0TriggerDynamic) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(2U, compaction->num_input_files(0));
   ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
@@ -421,7 +489,7 @@ TEST_F(CompactionPickerTest, Level0TriggerDynamic2) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(2U, compaction->num_input_files(0));
   ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
@@ -448,7 +516,7 @@ TEST_F(CompactionPickerTest, Level0TriggerDynamic3) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(2U, compaction->num_input_files(0));
   ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
@@ -479,7 +547,7 @@ TEST_F(CompactionPickerTest, Level0TriggerDynamic4) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(2U, compaction->num_input_files(0));
   ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
@@ -513,7 +581,7 @@ TEST_F(CompactionPickerTest, LevelTriggerDynamic4) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(1U, compaction->num_input_files(0));
   ASSERT_EQ(5U, compaction->input(0, 0)->fd.GetNumber());
@@ -575,7 +643,7 @@ TEST_F(CompactionPickerTest, CompactionUniversalIngestBehindReservedLevel) {
         universal_compaction_picker.PickCompaction(
             cf_name_, mutable_cf_options_, mutable_db_options_,
             /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-            vstorage_.get(), &log_buffer_));
+            vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
 
     // output level should be the one above the bottom-most
     ASSERT_EQ(1, compaction->output_level());
@@ -620,7 +688,7 @@ TEST_F(CompactionPickerTest, CannotTrivialMoveUniversal) {
       universal_compaction_picker.PickCompaction(
           cf_name_, mutable_cf_options_, mutable_db_options_,
           /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-          vstorage_.get(), &log_buffer_));
+          vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
 
   ASSERT_TRUE(!compaction->is_trivial_move());
 }
@@ -648,7 +716,7 @@ TEST_F(CompactionPickerTest, AllowsTrivialMoveUniversal) {
       universal_compaction_picker.PickCompaction(
           cf_name_, mutable_cf_options_, mutable_db_options_,
           /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-          vstorage_.get(), &log_buffer_));
+          vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
 
   ASSERT_TRUE(compaction->is_trivial_move());
 }
@@ -678,7 +746,7 @@ TEST_F(CompactionPickerTest, UniversalPeriodicCompaction1) {
       universal_compaction_picker.PickCompaction(
           cf_name_, mutable_cf_options_, mutable_db_options_,
           /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-          vstorage_.get(), &log_buffer_));
+          vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
 
   ASSERT_TRUE(compaction);
   ASSERT_EQ(4, compaction->output_level());
@@ -710,7 +778,7 @@ TEST_F(CompactionPickerTest, UniversalPeriodicCompaction2) {
       universal_compaction_picker.PickCompaction(
           cf_name_, mutable_cf_options_, mutable_db_options_,
           /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-          vstorage_.get(), &log_buffer_));
+          vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
 
   ASSERT_FALSE(compaction);
 }
@@ -738,7 +806,7 @@ TEST_F(CompactionPickerTest, UniversalPeriodicCompaction3) {
       universal_compaction_picker.PickCompaction(
           cf_name_, mutable_cf_options_, mutable_db_options_,
           /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-          vstorage_.get(), &log_buffer_));
+          vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
 
   ASSERT_FALSE(compaction);
 }
@@ -770,7 +838,7 @@ TEST_F(CompactionPickerTest, UniversalPeriodicCompaction4) {
       universal_compaction_picker.PickCompaction(
           cf_name_, mutable_cf_options_, mutable_db_options_,
           /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-          vstorage_.get(), &log_buffer_));
+          vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(!compaction ||
               compaction->start_level() != compaction->output_level());
 }
@@ -792,7 +860,7 @@ TEST_F(CompactionPickerTest, UniversalPeriodicCompaction5) {
       universal_compaction_picker.PickCompaction(
           cf_name_, mutable_cf_options_, mutable_db_options_,
           /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-          vstorage_.get(), &log_buffer_));
+          vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction);
   ASSERT_EQ(0, compaction->start_level());
   ASSERT_EQ(1U, compaction->num_input_files(0));
@@ -818,7 +886,7 @@ TEST_F(CompactionPickerTest, UniversalPeriodicCompaction6) {
       universal_compaction_picker.PickCompaction(
           cf_name_, mutable_cf_options_, mutable_db_options_,
           /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-          vstorage_.get(), &log_buffer_));
+          vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction);
   ASSERT_EQ(4, compaction->start_level());
   ASSERT_EQ(2U, compaction->num_input_files(0));
@@ -857,7 +925,7 @@ TEST_F(CompactionPickerTest, UniversalIncrementalSpace1) {
       universal_compaction_picker.PickCompaction(
           cf_name_, mutable_cf_options_, mutable_db_options_,
           /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-          vstorage_.get(), &log_buffer_));
+          vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction);
   ASSERT_EQ(4, compaction->output_level());
   ASSERT_EQ(3, compaction->start_level());
@@ -900,7 +968,7 @@ TEST_F(CompactionPickerTest, UniversalIncrementalSpace2) {
       universal_compaction_picker.PickCompaction(
           cf_name_, mutable_cf_options_, mutable_db_options_,
           /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-          vstorage_.get(), &log_buffer_));
+          vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction);
   ASSERT_EQ(4, compaction->output_level());
   ASSERT_EQ(2, compaction->start_level());
@@ -943,7 +1011,7 @@ TEST_F(CompactionPickerTest, UniversalIncrementalSpace3) {
       universal_compaction_picker.PickCompaction(
           cf_name_, mutable_cf_options_, mutable_db_options_,
           /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-          vstorage_.get(), &log_buffer_));
+          vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction);
   ASSERT_EQ(4, compaction->output_level());
   ASSERT_EQ(2, compaction->start_level());
@@ -992,7 +1060,7 @@ TEST_F(CompactionPickerTest, UniversalIncrementalSpace4) {
       universal_compaction_picker.PickCompaction(
           cf_name_, mutable_cf_options_, mutable_db_options_,
           /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-          vstorage_.get(), &log_buffer_));
+          vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction);
   ASSERT_EQ(4, compaction->output_level());
   ASSERT_EQ(3, compaction->start_level());
@@ -1037,7 +1105,7 @@ TEST_F(CompactionPickerTest, UniversalIncrementalSpace5) {
       universal_compaction_picker.PickCompaction(
           cf_name_, mutable_cf_options_, mutable_db_options_,
           /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-          vstorage_.get(), &log_buffer_));
+          vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction);
   ASSERT_EQ(4, compaction->output_level());
   ASSERT_EQ(3, compaction->start_level());
@@ -1090,7 +1158,7 @@ TEST_F(CompactionPickerTest,
         universal_compaction_picker.PickCompaction(
             cf_name_, mutable_cf_options_, mutable_db_options_,
             /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-            vstorage_.get(), &log_buffer_));
+            vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
     ASSERT_TRUE(compaction.get() != nullptr);
     ASSERT_EQ(compaction->compaction_reason(),
               CompactionReason::kUniversalSizeAmplification);
@@ -1174,7 +1242,7 @@ TEST_F(CompactionPickerTest, FIFOToCold1) {
         fifo_compaction_picker.PickCompaction(
             cf_name_, mutable_cf_options_, mutable_db_options_,
             /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-            vstorage_.get(), &log_buffer_));
+            vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
     ASSERT_TRUE(compaction.get() != nullptr);
     ASSERT_EQ(compaction->compaction_reason(),
               CompactionReason::kChangeTemperature);
@@ -1243,7 +1311,7 @@ TEST_F(CompactionPickerTest, FIFOToColdMaxCompactionSize) {
         fifo_compaction_picker.PickCompaction(
             cf_name_, mutable_cf_options_, mutable_db_options_,
             /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-            vstorage_.get(), &log_buffer_));
+            vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
     ASSERT_TRUE(compaction.get() != nullptr);
     ASSERT_EQ(compaction->compaction_reason(),
               CompactionReason::kChangeTemperature);
@@ -1311,7 +1379,7 @@ TEST_F(CompactionPickerTest, FIFOToColdWithExistingCold) {
         fifo_compaction_picker.PickCompaction(
             cf_name_, mutable_cf_options_, mutable_db_options_,
             /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-            vstorage_.get(), &log_buffer_));
+            vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
     ASSERT_TRUE(compaction.get() != nullptr);
     ASSERT_EQ(compaction->compaction_reason(),
               CompactionReason::kChangeTemperature);
@@ -1379,7 +1447,7 @@ TEST_F(CompactionPickerTest, FIFOToColdWithHotBetweenCold) {
         fifo_compaction_picker.PickCompaction(
             cf_name_, mutable_cf_options_, mutable_db_options_,
             /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-            vstorage_.get(), &log_buffer_));
+            vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
     ASSERT_TRUE(compaction.get() != nullptr);
     ASSERT_EQ(compaction->compaction_reason(),
               CompactionReason::kChangeTemperature);
@@ -1459,7 +1527,7 @@ TEST_F(CompactionPickerTest, FIFOToHotAndWarm) {
         fifo_compaction_picker.PickCompaction(
             cf_name_, mutable_cf_options_, mutable_db_options_,
             /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-            vstorage_.get(), &log_buffer_));
+            vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
     ASSERT_TRUE(compaction.get() != nullptr);
     ASSERT_EQ(compaction->compaction_reason(),
               CompactionReason::kChangeTemperature);
@@ -1516,7 +1584,7 @@ TEST_F(CompactionPickerTest, CompactionPriMinOverlapping1) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(1U, compaction->num_input_files(0));
   // Pick file 8 because it overlaps with 0 files on level 3.
@@ -1550,7 +1618,7 @@ TEST_F(CompactionPickerTest, CompactionPriMinOverlapping2) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(1U, compaction->num_input_files(0));
   // Picking file 7 because overlapping ratio is the biggest.
@@ -1579,7 +1647,7 @@ TEST_F(CompactionPickerTest, CompactionPriMinOverlapping3) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(1U, compaction->num_input_files(0));
   // Picking file 8 because overlapping ratio is the biggest.
@@ -1608,7 +1676,7 @@ TEST_F(CompactionPickerTest, CompactionPriMinOverlapping4) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(1U, compaction->num_input_files(0));
   // Picking file 6 because overlapping ratio is the biggest.
@@ -1645,7 +1713,7 @@ TEST_F(CompactionPickerTest, CompactionPriRoundRobin) {
         local_level_compaction_picker.PickCompaction(
             cf_name_, mutable_cf_options_, mutable_db_options_,
             /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-            vstorage_.get(), &log_buffer_));
+            vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
     ASSERT_TRUE(compaction.get() != nullptr);
     // Since the max bytes for level 2 is 120M, picking one file to compact
     // makes the post-compaction level size less than 120M, there is exactly one
@@ -1686,7 +1754,7 @@ TEST_F(CompactionPickerTest, CompactionPriMultipleFilesRoundRobin1) {
       local_level_compaction_picker.PickCompaction(
           cf_name_, mutable_cf_options_, mutable_db_options_,
           /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-          vstorage_.get(), &log_buffer_));
+          vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
 
   // The maximum compaction bytes is very large in this case so we can igore its
@@ -1730,7 +1798,7 @@ TEST_F(CompactionPickerTest, CompactionPriMultipleFilesRoundRobin2) {
       local_level_compaction_picker.PickCompaction(
           cf_name_, mutable_cf_options_, mutable_db_options_,
           /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-          vstorage_.get(), &log_buffer_));
+          vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
 
   // The maximum compaction bytes is only 2500 bytes now. Even though we are
@@ -1775,7 +1843,7 @@ TEST_F(CompactionPickerTest, CompactionPriMultipleFilesRoundRobin3) {
       local_level_compaction_picker.PickCompaction(
           cf_name_, mutable_cf_options_, mutable_db_options_,
           /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-          vstorage_.get(), &log_buffer_));
+          vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
 
   // Cannot pick more files since we reach the last file in level 2
@@ -1835,7 +1903,7 @@ TEST_F(CompactionPickerTest, CompactionPriMinOverlappingManyFiles) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(1U, compaction->num_input_files(0));
   // Picking file 8 because overlapping ratio is the biggest.
@@ -1864,7 +1932,7 @@ TEST_F(CompactionPickerTest, ParentIndexResetBug) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
 }
 
 // This test checks ExpandWhileOverlapping() by having overlapping user keys
@@ -1883,7 +1951,7 @@ TEST_F(CompactionPickerTest, OverlappingUserKeys) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(1U, compaction->num_input_levels());
   ASSERT_EQ(2U, compaction->num_input_files(0));
@@ -1904,7 +1972,7 @@ TEST_F(CompactionPickerTest, OverlappingUserKeys2) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(2U, compaction->num_input_levels());
   ASSERT_EQ(2U, compaction->num_input_files(0));
@@ -1933,7 +2001,7 @@ TEST_F(CompactionPickerTest, OverlappingUserKeys3) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(2U, compaction->num_input_levels());
   ASSERT_EQ(5U, compaction->num_input_files(0));
@@ -1965,7 +2033,7 @@ TEST_F(CompactionPickerTest, OverlappingUserKeys4) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(2U, compaction->num_input_levels());
   ASSERT_EQ(1U, compaction->num_input_files(0));
@@ -1990,7 +2058,7 @@ TEST_F(CompactionPickerTest, OverlappingUserKeys5) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() == nullptr);
 }
 
@@ -2013,7 +2081,7 @@ TEST_F(CompactionPickerTest, OverlappingUserKeys6) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(2U, compaction->num_input_levels());
   ASSERT_EQ(1U, compaction->num_input_files(0));
@@ -2035,7 +2103,7 @@ TEST_F(CompactionPickerTest, OverlappingUserKeys7) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(2U, compaction->num_input_levels());
   ASSERT_GE(1U, compaction->num_input_files(0));
@@ -2065,7 +2133,7 @@ TEST_F(CompactionPickerTest, OverlappingUserKeys8) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(2U, compaction->num_input_levels());
   ASSERT_EQ(3U, compaction->num_input_files(0));
@@ -2099,7 +2167,7 @@ TEST_F(CompactionPickerTest, OverlappingUserKeys9) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(2U, compaction->num_input_levels());
   ASSERT_EQ(5U, compaction->num_input_files(0));
@@ -2141,7 +2209,7 @@ TEST_F(CompactionPickerTest, OverlappingUserKeys10) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(2U, compaction->num_input_levels());
   ASSERT_EQ(1U, compaction->num_input_files(0));
@@ -2181,7 +2249,7 @@ TEST_F(CompactionPickerTest, OverlappingUserKeys11) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(2U, compaction->num_input_levels());
   ASSERT_EQ(1U, compaction->num_input_files(0));
@@ -2289,7 +2357,7 @@ TEST_F(CompactionPickerTest, NotScheduleL1IfL0WithHigherPri1) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() == nullptr);
 }
 
@@ -2321,7 +2389,7 @@ TEST_F(CompactionPickerTest, NotScheduleL1IfL0WithHigherPri2) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
 }
 
@@ -2356,7 +2424,7 @@ TEST_F(CompactionPickerTest, NotScheduleL1IfL0WithHigherPri3) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
 }
 
@@ -2658,7 +2726,7 @@ TEST_F(CompactionPickerTest, CompactionLimitWhenAddFileFromInputLevel) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(2U, compaction->num_input_levels());
   ASSERT_EQ(4U, compaction->num_input_files(0));
@@ -2694,7 +2762,7 @@ TEST_F(CompactionPickerTest, HitCompactionLimitWhenAddFileFromInputLevel) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(2U, compaction->num_input_levels());
   ASSERT_EQ(1U, compaction->num_input_files(0));
@@ -2726,7 +2794,7 @@ TEST_F(CompactionPickerTest, CompactRangeMaxCompactionBytes) {
           /*compact_range_options*/ {}, /*begin=*/nullptr, /*end=*/nullptr,
           &manual_end_ptr, &manual_conflict,
           /*max_file_num_to_ignore=*/std::numeric_limits<uint64_t>::max(),
-          /*trim_ts=*/""));
+          /*trim_ts=*/"", /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(1U, compaction->num_input_levels());
   ASSERT_EQ(2, compaction->output_level());
@@ -2755,7 +2823,7 @@ TEST_F(CompactionPickerTest, IsTrivialMoveOn) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_TRUE(compaction->IsTrivialMove());
 }
@@ -2781,7 +2849,7 @@ TEST_F(CompactionPickerTest, L0TrivialMove1) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(1, compaction->num_input_levels());
   ASSERT_EQ(2, compaction->num_input_files(0));
@@ -2811,7 +2879,7 @@ TEST_F(CompactionPickerTest, L0TrivialMoveOneFile) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(1, compaction->num_input_levels());
   ASSERT_EQ(1, compaction->num_input_files(0));
@@ -2838,7 +2906,7 @@ TEST_F(CompactionPickerTest, L0TrivialMoveWholeL0) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(1, compaction->num_input_levels());
   ASSERT_EQ(4, compaction->num_input_files(0));
@@ -2867,7 +2935,7 @@ TEST_F(CompactionPickerTest, NonL0TrivialMoveExtendBothDirection) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(1, compaction->num_input_levels());
   ASSERT_EQ(3, compaction->num_input_files(0));
@@ -2898,7 +2966,7 @@ TEST_F(CompactionPickerTest, L0TrivialMoveToEmptyLevel) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(1, compaction->num_input_levels());
   ASSERT_EQ(1, compaction->num_input_files(0));
@@ -2927,7 +2995,7 @@ TEST_F(CompactionPickerTest, IsTrivialMoveOffSstPartitioned) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   // No trivial move, because partitioning is applied
   ASSERT_TRUE(!compaction->IsTrivialMove());
@@ -2951,7 +3019,7 @@ TEST_F(CompactionPickerTest, IsTrivialMoveOff) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_FALSE(compaction->IsTrivialMove());
 }
@@ -2981,7 +3049,7 @@ TEST_F(CompactionPickerTest, TrivialMoveMultipleFiles1) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_TRUE(compaction->IsTrivialMove());
   ASSERT_EQ(1, compaction->num_input_levels());
@@ -3016,7 +3084,7 @@ TEST_F(CompactionPickerTest, TrivialMoveMultipleFiles2) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_TRUE(compaction->IsTrivialMove());
   ASSERT_EQ(1, compaction->num_input_levels());
@@ -3050,7 +3118,7 @@ TEST_F(CompactionPickerTest, TrivialMoveMultipleFiles3) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_TRUE(compaction->IsTrivialMove());
   ASSERT_EQ(1, compaction->num_input_levels());
@@ -3077,7 +3145,7 @@ TEST_F(CompactionPickerTest, TrivialMoveMultipleFiles4) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_TRUE(compaction->IsTrivialMove());
   ASSERT_EQ(1, compaction->num_input_levels());
@@ -3108,7 +3176,7 @@ TEST_F(CompactionPickerTest, TrivialMoveMultipleFiles5) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_TRUE(compaction->IsTrivialMove());
   ASSERT_EQ(1, compaction->num_input_levels());
@@ -3143,7 +3211,7 @@ TEST_F(CompactionPickerTest, TrivialMoveMultipleFiles6) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_TRUE(compaction->IsTrivialMove());
   ASSERT_EQ(1, compaction->num_input_levels());
@@ -3179,7 +3247,7 @@ TEST_F(CompactionPickerTest, CacheNextCompactionIndex) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(2U, compaction->num_input_levels());
   ASSERT_EQ(1U, compaction->num_input_files(0));
@@ -3190,7 +3258,7 @@ TEST_F(CompactionPickerTest, CacheNextCompactionIndex) {
   compaction.reset(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(2U, compaction->num_input_levels());
   ASSERT_EQ(1U, compaction->num_input_files(0));
@@ -3201,7 +3269,7 @@ TEST_F(CompactionPickerTest, CacheNextCompactionIndex) {
   compaction.reset(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() == nullptr);
   ASSERT_EQ(4, vstorage_->NextCompactionIndex(1 /* level */));
 }
@@ -3228,7 +3296,7 @@ TEST_F(CompactionPickerTest, IntraL0MaxCompactionBytesNotHit) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(1U, compaction->num_input_levels());
   ASSERT_EQ(5U, compaction->num_input_files(0));
@@ -3260,7 +3328,7 @@ TEST_F(CompactionPickerTest, IntraL0MaxCompactionBytesHit) {
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(1U, compaction->num_input_levels());
   ASSERT_EQ(4U, compaction->num_input_files(0));
@@ -3310,7 +3378,7 @@ TEST_F(CompactionPickerTest, UniversalMarkedCompactionFullOverlap) {
       universal_compaction_picker.PickCompaction(
           cf_name_, mutable_cf_options_, mutable_db_options_,
           /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-          vstorage_.get(), &log_buffer_));
+          vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
 
   ASSERT_TRUE(compaction);
   // Validate that its a compaction to reduce sorted runs
@@ -3334,7 +3402,7 @@ TEST_F(CompactionPickerTest, UniversalMarkedCompactionFullOverlap) {
       universal_compaction_picker.PickCompaction(
           cf_name_, mutable_cf_options_, mutable_db_options_,
           /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-          vstorage_.get(), &log_buffer_));
+          vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_FALSE(compaction2);
 }
 
@@ -3365,7 +3433,7 @@ TEST_F(CompactionPickerTest, UniversalMarkedCompactionFullOverlap2) {
       universal_compaction_picker.PickCompaction(
           cf_name_, mutable_cf_options_, mutable_db_options_,
           /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-          vstorage_.get(), &log_buffer_));
+          vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
 
   ASSERT_TRUE(compaction);
   // Validate that its a delete triggered compaction
@@ -3396,7 +3464,7 @@ TEST_F(CompactionPickerTest, UniversalMarkedCompactionFullOverlap2) {
       universal_compaction_picker.PickCompaction(
           cf_name_, mutable_cf_options_, mutable_db_options_,
           /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-          vstorage_.get(), &log_buffer_));
+          vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_FALSE(compaction2);
 }
 
@@ -3438,7 +3506,7 @@ TEST_F(CompactionPickerTest, UniversalMarkedCompactionStartOutputOverlap) {
         universal_compaction_picker.PickCompaction(
             cf_name_, mutable_cf_options_, mutable_db_options_,
             /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-            vstorage_.get(), &log_buffer_));
+            vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
 
     ASSERT_TRUE(compaction);
     // Validate that its a delete triggered compaction
@@ -3463,14 +3531,15 @@ TEST_F(CompactionPickerTest, UniversalMarkedCompactionStartOutputOverlap) {
       ASSERT_EQ(1U, compaction->num_input_files(1));
     }
 
-    vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_);
+    vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_,
+                                      /*full_history_ts_low=*/"");
     // After recomputing the compaction score, only one marked file will remain
     random_index = 0;
     std::unique_ptr<Compaction> compaction2(
         universal_compaction_picker.PickCompaction(
             cf_name_, mutable_cf_options_, mutable_db_options_,
             /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-            vstorage_.get(), &log_buffer_));
+            vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
     ASSERT_FALSE(compaction2);
     DeleteVersionStorage();
   }
@@ -3497,7 +3566,7 @@ TEST_F(CompactionPickerTest, UniversalMarkedL0NoOverlap) {
       universal_compaction_picker.PickCompaction(
           cf_name_, mutable_cf_options_, mutable_db_options_,
           /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-          vstorage_.get(), &log_buffer_));
+          vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
 
   ASSERT_TRUE(compaction);
   // Validate that its a delete triggered compaction
@@ -3535,7 +3604,7 @@ TEST_F(CompactionPickerTest, UniversalMarkedL0WithOverlap) {
       universal_compaction_picker.PickCompaction(
           cf_name_, mutable_cf_options_, mutable_db_options_,
           /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-          vstorage_.get(), &log_buffer_));
+          vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
 
   ASSERT_TRUE(compaction);
   // Validate that its a delete triggered compaction
@@ -3593,7 +3662,7 @@ TEST_F(CompactionPickerTest, UniversalMarkedL0Overlap2) {
       universal_compaction_picker.PickCompaction(
           cf_name_, mutable_cf_options_, mutable_db_options_,
           /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-          vstorage_.get(), &log_buffer_));
+          vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
 
   ASSERT_TRUE(compaction);
   // Validate that its a delete triggered compaction
@@ -3627,7 +3696,7 @@ TEST_F(CompactionPickerTest, UniversalMarkedL0Overlap2) {
       universal_compaction_picker.PickCompaction(
           cf_name_, mutable_cf_options_, mutable_db_options_,
           /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-          vstorage_.get(), &log_buffer_));
+          vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction2);
   ASSERT_EQ(3U, compaction->num_input_files(0));
   ASSERT_TRUE(file_map_[1].first->being_compacted);
@@ -3662,7 +3731,8 @@ TEST_F(CompactionPickerTest, UniversalMarkedManualCompaction) {
           cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
           ColumnFamilyData::kCompactAllLevels, 6, CompactRangeOptions(),
           nullptr, nullptr, &manual_end, &manual_conflict,
-          std::numeric_limits<uint64_t>::max(), ""));
+          std::numeric_limits<uint64_t>::max(), "",
+          /*full_history_ts_low=*/""));
 
   ASSERT_TRUE(compaction);
 
@@ -3707,7 +3777,7 @@ TEST_F(CompactionPickerTest, UniversalSizeAmpTierCompactionNonLastLevel) {
       universal_compaction_picker.PickCompaction(
           cf_name_, mutable_cf_options_, mutable_db_options_,
           /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-          vstorage_.get(), &log_buffer_));
+          vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
 
   // Make sure it's a size amp compaction and includes all files
   ASSERT_EQ(compaction->compaction_reason(),
@@ -3744,7 +3814,7 @@ TEST_F(CompactionPickerTest, UniversalSizeRatioTierCompactionLastLevel) {
       universal_compaction_picker.PickCompaction(
           cf_name_, mutable_cf_options_, mutable_db_options_,
           /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-          vstorage_.get(), &log_buffer_));
+          vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
 
   // Internally, size amp compaction is evaluated before size ratio compaction.
   // Here to make sure it's size ratio compaction instead of size amp
@@ -3785,7 +3855,7 @@ TEST_F(CompactionPickerTest, UniversalSizeAmpTierCompactionNotSuport) {
       universal_compaction_picker.PickCompaction(
           cf_name_, mutable_cf_options_, mutable_db_options_,
           /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-          vstorage_.get(), &log_buffer_));
+          vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
 
   // size amp compaction is still triggered even preclude_last_level is set
   ASSERT_EQ(compaction->compaction_reason(),
@@ -3820,7 +3890,7 @@ TEST_F(CompactionPickerTest, UniversalSizeAmpTierCompactionLastLevel) {
       universal_compaction_picker.PickCompaction(
           cf_name_, mutable_cf_options_, mutable_db_options_,
           /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-          vstorage_.get(), &log_buffer_));
+          vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
 
   // It's a Size Amp compaction, but doesn't include the last level file and
   // output to the proximal level.
@@ -3933,7 +4003,7 @@ TEST_F(CompactionPickerU64TsTest, CannotTrivialMoveUniversal) {
       universal_compaction_picker.PickCompaction(
           cf_name_, mutable_cf_options_, mutable_db_options_,
           /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-          vstorage_.get(), &log_buffer_));
+          vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   assert(compaction);
   ASSERT_TRUE(!compaction->is_trivial_move());
 }
@@ -4478,7 +4548,7 @@ TEST_F(CompactionPickerTest,
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction);
   ASSERT_EQ(num_levels - 2, compaction->start_level());
   ASSERT_EQ(num_levels - 1, compaction->output_level());
@@ -4489,7 +4559,7 @@ TEST_F(CompactionPickerTest,
       level_compaction_picker.PickCompaction(
           cf_name_, mutable_cf_options_, mutable_db_options_,
           /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-          vstorage_.get(), &log_buffer_));
+          vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(second_compaction);
   ASSERT_EQ(num_levels - 1, compaction->output_level());
   ASSERT_EQ(num_levels - 2, compaction->start_level());
@@ -4536,7 +4606,7 @@ TEST_F(CompactionPickerTest,
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
       cf_name_, mutable_cf_options_, mutable_db_options_,
       /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-      vstorage_.get(), &log_buffer_));
+      vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
   ASSERT_TRUE(compaction);
   ASSERT_EQ(num_levels - 3, compaction->start_level());
   ASSERT_EQ(num_levels - 2, compaction->output_level());
@@ -4586,7 +4656,7 @@ TEST_F(CompactionPickerTest, IntraL0WhenL0IsSmall) {
     std::unique_ptr<Compaction> compaction(compaction_picker.PickCompaction(
         cf_name_, mutable_cf_options_, mutable_db_options_,
         /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-        vstorage_.get(), &log_buffer_));
+        vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
     ASSERT_TRUE(compaction.get() != nullptr);
     ASSERT_EQ(CompactionReason::kLevelL0FilesNum,
               compaction->compaction_reason());
@@ -4663,7 +4733,7 @@ TEST_F(CompactionPickerTest, UniversalMaxReadAmpLargeDB) {
           universal_compaction_picker.PickCompaction(
               cf_name_, mutable_cf_options_, mutable_db_options_,
               /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-              vstorage_.get(), &log_buffer_));
+              vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
       if (i == kMaxRuns) {
         // There are in total i + 1 > kMaxRuns sorted runs.
         // This triggers compaction ignoring size_ratio.
@@ -4711,7 +4781,7 @@ TEST_F(CompactionPickerTest, UniversalMaxReadAmpSmallDB) {
         universal_compaction_picker.PickCompaction(
             cf_name_, mutable_cf_options_, mutable_db_options_,
             /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-            vstorage_.get(), &log_buffer_));
+            vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
     ASSERT_EQ(nullptr, compaction);
   }
 }
@@ -4746,7 +4816,7 @@ TEST_F(CompactionPickerTest, StandaloneRangeDeletionOnlyPicksOlderFiles) {
       universal_compaction_picker.PickCompaction(
           cf_name_, mutable_cf_options_, mutable_db_options_,
           /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
-          vstorage_.get(), &log_buffer_));
+          vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
 
   ASSERT_NE(nullptr, compaction);
   ASSERT_EQ(2U, compaction->num_input_levels());
@@ -4763,6 +4833,134 @@ TEST_F(CompactionPickerTest, StandaloneRangeDeletionOnlyPicksOlderFiles) {
   ASSERT_EQ(10U, compaction->input(1, 0)->fd.GetNumber());
 }
 
+// Tests for full_history_ts_low parameter in compaction picker.
+// The full_history_ts_low parameter is used to control bottommost file marking
+// for compaction when user-defined timestamps (UDT) are enabled.
+
+// Level compaction tests for full_history_ts_low:
+// These tests verify that bottommost files are correctly marked/unmarked
+// for compaction based on their max timestamp relative to full_history_ts_low.
+
+TEST_F(CompactionPickerU64TsTest,
+       BottommostNotMarkedWhenTimestampAboveFullHistoryTsLow) {
+  // Test that bottommost files are NOT marked for compaction when their
+  // max timestamp is >= full_history_ts_low. This prevents infinite
+  // compaction loops where timestamp could not be collapsed.
+  NewVersionStorage(6, kCompactionStyleLevel);
+
+  // File has max_ts = 1000, full_history_ts_low = 500
+  // Since 1000 >= 500, the file should NOT be marked for compaction.
+  SetupBottommostFileWithTimestamps(
+      /*min_ts=*/500, /*max_ts=*/1000, /*full_history_ts_low_val=*/500,
+      /*oldest_snapshot_seqnum=*/50, /*out_full_history_ts_low=*/nullptr);
+
+  // File's max_ts (1000) >= full_history_ts_low (500), so it should NOT
+  // be marked for bottommost compaction
+  ASSERT_TRUE(vstorage_->BottommostFilesMarkedForCompaction().empty());
+}
+
+TEST_F(CompactionPickerU64TsTest,
+       BottommostMarkedWhenTimestampBelowFullHistoryTsLow) {
+  // Test that bottommost files ARE marked for compaction when their
+  // max timestamp is < full_history_ts_low.
+  NewVersionStorage(6, kCompactionStyleLevel);
+
+  // File has max_ts = 100, full_history_ts_low = 500
+  // Since 100 < 500, the file SHOULD be marked for compaction.
+  SetupBottommostFileWithTimestamps(
+      /*min_ts=*/50, /*max_ts=*/100, /*full_history_ts_low_val=*/500,
+      /*oldest_snapshot_seqnum=*/50, /*out_full_history_ts_low=*/nullptr);
+
+  // File's max_ts (100) < full_history_ts_low (500), so it SHOULD be
+  // marked for bottommost compaction
+  ASSERT_EQ(1U, vstorage_->BottommostFilesMarkedForCompaction().size());
+  ASSERT_EQ(5, vstorage_->BottommostFilesMarkedForCompaction()[0].first);
+  ASSERT_EQ(1U, vstorage_->BottommostFilesMarkedForCompaction()[0]
+                    .second->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerU64TsTest,
+       BottommostNotMarkedWithEmptyFullHistoryTsLow) {
+  // Test that when full_history_ts_low is empty, files are still marked
+  // based on seqno condition (backward compatibility behavior).
+  NewVersionStorage(6, kCompactionStyleLevel);
+
+  std::string ts_small = MakeU64Timestamp(500);
+  std::string ts_large = MakeU64Timestamp(1000);
+
+  // Add a file at bottommost level with seqno < oldest_snapshot
+  Add(5, 1U, "100", "200", /*file_size=*/1000, /*path_id=*/0,
+      /*smallest_seq=*/10, /*largest_seq=*/40,
+      /*compensated_file_size=*/1000,
+      /*marked_for_compact=*/false, Temperature::kUnknown,
+      kUnknownOldestAncesterTime, kUnknownNewestKeyTime, ts_small, ts_large);
+
+  // Update version storage with empty full_history_ts_low
+  UpdateVersionStorageInfo();
+
+  // Update oldest snapshot with empty full_history_ts_low
+  vstorage_->UpdateOldestSnapshot(
+      /*oldest_snapshot_seqnum=*/50,
+      /*allow_ingest_behind=*/false,
+      /*ucmp=*/ucmp_,
+      /*full_history_ts_low=*/"");
+
+  // With empty full_history_ts_low and UDT enabled, the file should NOT be
+  // marked. When full_history_ts_low is empty, it means it was never set,
+  // effectively 0, which is smaller than any valid timestamp. Since the file's
+  // max_timestamp would be >= full_history_ts_low, it won't be marked.
+  ASSERT_EQ(0U, vstorage_->BottommostFilesMarkedForCompaction().size());
+}
+
+TEST_F(CompactionPickerU64TsTest, LevelPickCompactionWithFullHistoryTsLow) {
+  // Test that level compaction correctly passes full_history_ts_low
+  // and picks compaction appropriately
+  NewVersionStorage(6, kCompactionStyleLevel);
+  mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+
+  AddL0FilesWithTimestamps(/*ts1_val=*/100, /*ts2_val=*/200);
+
+  UpdateVersionStorageInfo();
+
+  std::string full_history_ts_low = MakeU64Timestamp(150);
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_,
+      /*existing_snapshots=*/{}, /*snapshot_checker=*/nullptr, vstorage_.get(),
+      &log_buffer_, full_history_ts_low, /*require_max_output_level=*/false));
+
+  // Compaction should be picked for L0 files
+  ASSERT_NE(nullptr, compaction);
+  ASSERT_EQ(2U, compaction->num_input_files(0));
+  ASSERT_EQ(0, compaction->start_level());
+}
+
+TEST_F(CompactionPickerU64TsTest, UniversalPickCompactionWithFullHistoryTsLow) {
+  // Test that universal compaction correctly accepts full_history_ts_low
+  constexpr uint64_t kFileSize = 100000;
+
+  mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+  NewVersionStorage(1, kCompactionStyleUniversal);
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+  AddL0FilesWithTimestamps(/*ts1_val=*/100, /*ts2_val=*/200, kFileSize);
+
+  UpdateVersionStorageInfo();
+
+  std::string full_history_ts_low = MakeU64Timestamp(150);
+
+  std::unique_ptr<Compaction> compaction(
+      universal_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, mutable_db_options_,
+          /*existing_snapshots=*/{}, /*snapshot_checker=*/nullptr,
+          vstorage_.get(), &log_buffer_, full_history_ts_low,
+          /*require_max_output_level=*/false));
+
+  // Universal compaction should be picked
+  ASSERT_NE(nullptr, compaction);
+  ASSERT_EQ(2U, compaction->num_input_files(0));
+}
+
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/db/compaction/compaction_picker_universal.cc b/db/compaction/compaction_picker_universal.cc
index 13f2831c4a16..173e317a1006 100644
--- a/db/compaction/compaction_picker_universal.cc
+++ b/db/compaction/compaction_picker_universal.cc
@@ -39,7 +39,7 @@ class UniversalCompactionBuilder {
       const std::vector<SequenceNumber>& existing_snapshots,
       const SnapshotChecker* snapshot_checker, VersionStorageInfo* vstorage,
       UniversalCompactionPicker* picker, LogBuffer* log_buffer,
-      bool require_max_output_level)
+      bool require_max_output_level, const std::string& full_history_ts_low)
       : ioptions_(ioptions),
         icmp_(icmp),
         cf_name_(cf_name),
@@ -50,7 +50,8 @@ class UniversalCompactionBuilder {
         log_buffer_(log_buffer),
         require_max_output_level_(require_max_output_level),
         allow_ingest_behind_(ioptions.cf_allow_ingest_behind ||
-                             ioptions.allow_ingest_behind) {
+                             ioptions.allow_ingest_behind),
+        full_history_ts_low_(full_history_ts_low) {
     assert(icmp_);
     const auto* ucmp = icmp_->user_comparator();
     assert(ucmp);
@@ -450,6 +451,7 @@ class UniversalCompactionBuilder {
   std::map<uint64_t, size_t> file_marked_for_compaction_to_sorted_run_index_;
   bool require_max_output_level_;
   bool allow_ingest_behind_;
+  const std::string& full_history_ts_low_;
 
   std::vector<UniversalCompactionBuilder::SortedRun> CalculateSortedRuns(
       const VersionStorageInfo& vstorage, int last_level,
@@ -595,16 +597,20 @@ bool UniversalCompactionPicker::NeedsCompaction(
   return false;
 }
 
+// TODO leverage full_history_ts_low in universal compaction picking. It could
+// help reduce the same infinite compaction loop issue found in level
+// compaction.
 Compaction* UniversalCompactionPicker::PickCompaction(
     const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
     const MutableDBOptions& mutable_db_options,
     const std::vector<SequenceNumber>& existing_snapshots,
     const SnapshotChecker* snapshot_checker, VersionStorageInfo* vstorage,
-    LogBuffer* log_buffer, bool require_max_output_level) {
+    LogBuffer* log_buffer, const std::string& full_history_ts_low,
+    bool require_max_output_level) {
   UniversalCompactionBuilder builder(
       ioptions_, icmp_, cf_name, mutable_cf_options, mutable_db_options,
       existing_snapshots, snapshot_checker, vstorage, this, log_buffer,
-      require_max_output_level);
+      require_max_output_level, full_history_ts_low);
   return builder.PickCompaction();
 }
 
@@ -825,7 +831,8 @@ Compaction* UniversalCompactionBuilder::PickCompaction() {
   RecordInHistogram(ioptions_.stats, NUM_FILES_IN_SINGLE_COMPACTION, num_files);
 
   picker_->RegisterCompaction(c);
-  vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_);
+  vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_,
+                                    full_history_ts_low_);
 
   TEST_SYNC_POINT_CALLBACK("UniversalCompactionBuilder::PickCompaction:Return",
                            c);
diff --git a/db/compaction/compaction_picker_universal.h b/db/compaction/compaction_picker_universal.h
index d37fd65bb2a8..175c11c9f0c3 100644
--- a/db/compaction/compaction_picker_universal.h
+++ b/db/compaction/compaction_picker_universal.h
@@ -26,7 +26,8 @@ class UniversalCompactionPicker : public CompactionPicker {
       const MutableDBOptions& mutable_db_options,
       const std::vector<SequenceNumber>& existing_snapshots,
       const SnapshotChecker* snapshot_checker, VersionStorageInfo* vstorage,
-      LogBuffer* log_buffer, bool require_max_output_level = false) override;
+      LogBuffer* log_buffer, const std::string& full_history_ts_low,
+      bool require_max_output_level = false) override;
   int MaxOutputLevel() const override { return NumberLevels() - 1; }
 
   bool NeedsCompaction(const VersionStorageInfo* vstorage) const override;
diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc
index bc9d4adc4946..93cdbf5f36ad 100644
--- a/db/db_impl/db_impl.cc
+++ b/db/db_impl/db_impl.cc
@@ -4402,7 +4402,8 @@ void DBImpl::ReleaseSnapshot(const Snapshot* s) {
       for (auto* cfd : *versions_->GetColumnFamilySet()) {
         if (!cfd->AllowIngestBehind()) {
           cfd->current()->storage_info()->UpdateOldestSnapshot(
-              oldest_snapshot, /*allow_ingest_behind=*/false);
+              oldest_snapshot, /*allow_ingest_behind=*/false,
+              cfd->ioptions().user_comparator, cfd->GetFullHistoryTsLow());
           if (!cfd->current()
                    ->storage_info()
                    ->BottommostFilesMarkedForCompaction()
@@ -5038,7 +5039,8 @@ Status DBImpl::DeleteFilesInRanges(ColumnFamilyHandle* column_family,
     }
     if (!deleted_files.empty()) {
       vstorage->ComputeCompactionScore(cfd->ioptions(),
-                                       cfd->GetLatestMutableCFOptions());
+                                       cfd->GetLatestMutableCFOptions(),
+                                       cfd->GetFullHistoryTsLow());
     }
     if (edit.GetDeletedFiles().empty()) {
       job_context.Clean();
@@ -6902,7 +6904,8 @@ void DBImpl::TriggerPeriodicCompaction() {
       if (cfd->GetLatestCFOptions().periodic_compaction_seconds &&
           !cfd->queued_for_compaction()) {
         cfd->current()->storage_info()->ComputeCompactionScore(
-            cfd->ioptions(), cfd->GetLatestMutableCFOptions());
+            cfd->ioptions(), cfd->GetLatestMutableCFOptions(),
+            cfd->GetFullHistoryTsLow());
         EnqueuePendingCompaction(cfd);
         if (cfd->queued_for_compaction()) {
           ROCKS_LOG_INFO(immutable_db_options_.info_log,
diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc
index 2d3ee60bb0fb..877b61007b99 100644
--- a/db/db_impl/db_impl_compaction_flush.cc
+++ b/db/db_impl/db_impl_compaction_flush.cc
@@ -1449,7 +1449,8 @@ Status DBImpl::PerformTrivialMove(Compaction& c, LogBuffer* log_buffer,
                         f->file_creation_time, f->epoch_number,
                         f->file_checksum, f->file_checksum_func_name,
                         f->unique_id, f->compensated_range_deletion_size,
-                        f->tail_size, f->user_defined_timestamps_persisted);
+                        f->tail_size, f->user_defined_timestamps_persisted,
+                        f->min_timestamp, f->max_timestamp);
       moved_bytes += static_cast<size_t>(c.input(l, i)->fd.GetFileSize());
       ROCKS_LOG_BUFFER(
           log_buffer, "[%s] Moved #%" PRIu64 " to level-%d %" PRIu64 " bytes\n",
@@ -1645,8 +1646,8 @@ Status DBImpl::CompactFilesImpl(
   // takes running compactions into account (by skipping files that are already
   // being compacted). Since we just changed compaction score, we recalculate it
   // here.
-  version->storage_info()->ComputeCompactionScore(cfd->ioptions(),
-                                                  c->mutable_cf_options());
+  version->storage_info()->ComputeCompactionScore(
+      cfd->ioptions(), c->mutable_cf_options(), cfd->GetFullHistoryTsLow());
 
   compaction_job.Prepare(std::nullopt /*subcompact to be computed*/);
 
@@ -1971,7 +1972,8 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) {
           f->oldest_ancester_time, f->file_creation_time, f->epoch_number,
           f->file_checksum, f->file_checksum_func_name, f->unique_id,
           f->compensated_range_deletion_size, f->tail_size,
-          f->user_defined_timestamps_persisted);
+          f->user_defined_timestamps_persisted, f->min_timestamp,
+          f->max_timestamp);
     }
     ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
                     "[%s] Apply version edit:\n%s", cfd->GetName().c_str(),
@@ -3831,7 +3833,8 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
               ->current()
               ->storage_info()
               ->ComputeCompactionScore(c->immutable_options(),
-                                       c->mutable_cf_options());
+                                       c->mutable_cf_options(),
+                                       cfd->GetFullHistoryTsLow());
           EnqueuePendingCompaction(cfd);
 
           c.reset();
@@ -4088,7 +4091,9 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
           in_file->unique_id,
           in_file->compensated_range_deletion_size,
           in_file->tail_size,
-          in_file->user_defined_timestamps_persisted};
+          in_file->user_defined_timestamps_persisted,
+          in_file->min_timestamp,
+          in_file->max_timestamp};
 
       out_files.push_back(std::move(out_file_metadata));
     }
@@ -4393,7 +4398,8 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
           ->current()
           ->storage_info()
           ->ComputeCompactionScore(c->immutable_options(),
-                                   c->mutable_cf_options());
+                                   c->mutable_cf_options(),
+                                   cfd->GetFullHistoryTsLow());
       EnqueuePendingCompaction(cfd);
     }
   }
@@ -4499,7 +4505,7 @@ Compaction* DBImpl::CreateIntendedCompactionForwardedToBottomPriorityPool(
                      nullptr /* snapshot_checker */, c->compaction_reason());
 
   cfd->compaction_picker()->RegisterCompaction(intended_compaction);
-  vstorage->ComputeCompactionScore(io, mo);
+  vstorage->ComputeCompactionScore(io, mo, cfd->GetFullHistoryTsLow());
   intended_compaction->FinalizeInputInfo(cfd->current());
 
   return intended_compaction;
@@ -4871,7 +4877,8 @@ void DBImpl::ResetBottomPriCompactionIntent(ColumnFamilyData* cfd,
                                             std::unique_ptr<Compaction>& c) {
   c->ReleaseCompactionFiles(Status::OK());
   cfd->current()->storage_info()->ComputeCompactionScore(
-      c->immutable_options(), c->mutable_cf_options());
+      c->immutable_options(), c->mutable_cf_options(),
+      cfd->GetFullHistoryTsLow());
   c.reset();
 }
 
diff --git a/db/db_impl/db_impl_experimental.cc b/db/db_impl/db_impl_experimental.cc
index 49d583e6623d..bb6a9a2e409c 100644
--- a/db/db_impl/db_impl_experimental.cc
+++ b/db/db_impl/db_impl_experimental.cc
@@ -46,7 +46,8 @@ Status DBImpl::SuggestCompactRange(ColumnFamilyHandle* column_family,
     // Since we have some more files to compact, we should also recompute
     // compaction score
     vstorage->ComputeCompactionScore(cfd->ioptions(),
-                                     cfd->GetLatestMutableCFOptions());
+                                     cfd->GetLatestMutableCFOptions(),
+                                     cfd->GetFullHistoryTsLow());
     EnqueuePendingCompaction(cfd);
     MaybeScheduleFlushOrCompaction();
   }
@@ -143,7 +144,8 @@ Status DBImpl::PromoteL0(ColumnFamilyHandle* column_family, int target_level) {
                    f->file_creation_time, f->epoch_number, f->file_checksum,
                    f->file_checksum_func_name, f->unique_id,
                    f->compensated_range_deletion_size, f->tail_size,
-                   f->user_defined_timestamps_persisted);
+                   f->user_defined_timestamps_persisted, f->min_timestamp,
+                   f->max_timestamp);
     }
 
     status = versions_->LogAndApply(cfd, read_options, write_options, &edit,
diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc
index cccc3ea2c708..fb6ad5094e7a 100644
--- a/db/db_impl/db_impl_open.cc
+++ b/db/db_impl/db_impl_open.cc
@@ -657,7 +657,8 @@ Status DBImpl::Recover(
                            f->file_creation_time, f->epoch_number,
                            f->file_checksum, f->file_checksum_func_name,
                            f->unique_id, f->compensated_range_deletion_size,
-                           f->tail_size, f->user_defined_timestamps_persisted);
+                           f->tail_size, f->user_defined_timestamps_persisted,
+                           f->min_timestamp, f->max_timestamp);
               ROCKS_LOG_WARN(immutable_db_options_.info_log,
                              "[%s] Moving #%" PRIu64
                              " from from_level-%d to from_level-%d %" PRIu64
diff --git a/db/db_with_timestamp_compaction_test.cc b/db/db_with_timestamp_compaction_test.cc
index 08802738f0b8..1e35d43f829c 100644
--- a/db/db_with_timestamp_compaction_test.cc
+++ b/db/db_with_timestamp_compaction_test.cc
@@ -9,9 +9,11 @@
 
 #include <set>
 
+#include "db/column_family.h"
 #include "db/compaction/compaction.h"
 #include "db/db_test_util.h"
 #include "port/stack_trace.h"
+#include "rocksdb/sst_file_reader.h"
 #include "test_util/testutil.h"
 
 namespace ROCKSDB_NAMESPACE {
@@ -50,6 +52,122 @@ class TimestampCompatibleCompactionTest : public DBTestBase {
     }
     return value;
   }
+
+  // Helper to get all files with their level and timestamps
+  std::vector<std::tuple<int, std::string, std::string>>
+  GetAllFileTimestamps() {
+    std::vector<std::tuple<int, std::string, std::string>> results;
+    ColumnFamilyHandle* cfh = db_->DefaultColumnFamily();
+    auto* cfd = static_cast_with_check<ColumnFamilyHandleImpl>(cfh)->cfd();
+    auto* vstorage = cfd->current()->storage_info();
+
+    for (int level = 0; level < cfd->NumberLevels(); level++) {
+      for (auto* file : vstorage->LevelFiles(level)) {
+        results.emplace_back(level, file->min_timestamp, file->max_timestamp);
+      }
+    }
+    return results;
+  }
+
+  // Helper to compute overall min/max timestamps across all files
+  // Returns {min_ts, max_ts} as uint64_t values
+  // Asserts that all files have non-empty timestamps
+  std::pair<uint64_t, uint64_t> GetOverallTimestampRange() {
+    auto files = GetAllFileTimestamps();
+    EXPECT_GE(files.size(), 1U);
+
+    uint64_t overall_min = UINT64_MAX;
+    uint64_t overall_max = 0;
+    for (const auto& [level, min_ts, max_ts] : files) {
+      EXPECT_FALSE(min_ts.empty()) << "min_timestamp empty at level " << level;
+      EXPECT_FALSE(max_ts.empty()) << "max_timestamp empty at level " << level;
+
+      if (!min_ts.empty() && !max_ts.empty()) {
+        uint64_t file_min = DecodeFixed64(min_ts.data());
+        uint64_t file_max = DecodeFixed64(max_ts.data());
+        overall_min = std::min(overall_min, file_min);
+        overall_max = std::max(overall_max, file_max);
+      }
+    }
+    return {overall_min, overall_max};
+  }
+
+  // Helper to verify timestamp range matches expected values, including after
+  // reopen
+  void VerifyTimestampRangeWithPersistence(const Options& options,
+                                           uint64_t expected_min,
+                                           uint64_t expected_max) {
+    // Verify before reopen
+    auto [min_ts, max_ts] = GetOverallTimestampRange();
+    ASSERT_EQ(expected_min, min_ts);
+    ASSERT_EQ(expected_max, max_ts);
+
+    size_t file_count_before = GetAllFileTimestamps().size();
+
+    // Verify manifest persistence by reopening
+    Reopen(options);
+
+    // Verify after reopen
+    auto [reopened_min_ts, reopened_max_ts] = GetOverallTimestampRange();
+    ASSERT_EQ(expected_min, reopened_min_ts);
+    ASSERT_EQ(expected_max, reopened_max_ts);
+    ASSERT_EQ(file_count_before, GetAllFileTimestamps().size());
+  }
+
+  // Helper to create common options for UDT tests with level compaction
+  Options CreateTimestampOptions(bool disable_auto_compactions = false) {
+    Options options = CurrentOptions();
+    options.env = env_;
+    options.compaction_style = kCompactionStyleLevel;
+    options.num_levels = 4;
+    options.persist_user_defined_timestamps = true;
+    options.comparator = test::BytewiseComparatorWithU64TsWrapper();
+    options.disable_auto_compactions = disable_auto_compactions;
+    return options;
+  }
+
+  // Helper to write test data with alternating timestamps in a range
+  // Writes keys [start_key, end_key) with timestamps alternating between
+  // min_ts and max_ts
+  void WriteDataWithTimestampRange(int start_key, int end_key, uint64_t min_ts,
+                                   uint64_t max_ts) {
+    std::string ts_buf;
+    for (int i = start_key; i < end_key; i++) {
+      ts_buf.clear();
+      uint64_t ts = (i % 2 == 0) ? min_ts : max_ts;
+      PutFixed64(&ts_buf, ts);
+      ASSERT_OK(db_->Put(WriteOptions(), Key(i), ts_buf,
+                         "value" + std::to_string(i)));
+    }
+  }
+
+  // Helper to check if any file has the expected timestamp range
+  bool HasFileWithTimestampRange(uint64_t expected_min, uint64_t expected_max) {
+    auto file_timestamps = GetAllFileTimestamps();
+    for (const auto& [level, min_ts, max_ts] : file_timestamps) {
+      if (!min_ts.empty() && !max_ts.empty()) {
+        uint64_t file_min = DecodeFixed64(min_ts.data());
+        uint64_t file_max = DecodeFixed64(max_ts.data());
+        if (file_min == expected_min && file_max == expected_max) {
+          return true;
+        }
+      }
+    }
+    return false;
+  }
+
+  // Helper to verify data is readable with a given timestamp
+  void VerifyDataReadable(int key, const std::string& expected_value,
+                          uint64_t read_ts) {
+    std::string value;
+    std::string ts_buf;
+    PutFixed64(&ts_buf, read_ts);
+    ReadOptions read_opts;
+    Slice ts_slice(ts_buf);
+    read_opts.timestamp = &ts_slice;
+    ASSERT_OK(db_->Get(read_opts, Key(key), &value));
+    ASSERT_EQ(expected_value, value);
+  }
 };
 
 TEST_F(TimestampCompatibleCompactionTest, UserKeyCrossFileBoundary) {
@@ -446,120 +564,283 @@ TEST_F(TimestampCompatibleCompactionTest, SeqnoZeroingWithUDT) {
   ASSERT_EQ("value3", value);
 }
 
-TEST_F(TimestampCompatibleCompactionTest, UdtTombstoneCollapsingTest) {
-  // This test validate tombstones accumulated at bottommost level due to UDT is
-  // cleaned up properly, avoiding high space amplification.
+// Test that files with max_timestamp >= full_history_ts_low are not marked
+// for bottommost compaction, which prevents infinite compaction loops.
+TEST_F(TimestampCompatibleCompactionTest,
+       BottommostCompactionRespectsFullHistoryTsLow) {
+  Options options = CreateTimestampOptions();
+  options.level0_file_num_compaction_trigger = 4;
 
-  // Create a new column family with UDT enabled
-  Options options = GetDefaultOptions();
-  ColumnFamilyHandle* cfh = nullptr;
-  options = GetDefaultOptions();
-  options.compaction_style = kCompactionStyleLevel;
-  options.num_levels = 7;
-  options.level0_file_num_compaction_trigger = 10;
-  options.persist_user_defined_timestamps = true;
-  options.comparator = BytewiseComparatorWithU64Ts();
-  options.target_file_size_base = 2 * 1024 * 1024;
-  options.max_bytes_for_level_base = 4 * 1024 * 1024;
-  options.max_bytes_for_level_multiplier = 2;
+  DestroyAndReopen(options);
 
-  ASSERT_OK(db_->CreateColumnFamily(options, "new_cf", &cfh));
+  // Write some data with timestamps 100-199
+  std::string ts_buf;
+  for (int i = 0; i < 100; i++) {
+    ts_buf.clear();
+    PutFixed64(&ts_buf, 100 + i);
+    ASSERT_OK(
+        db_->Put(WriteOptions(), Key(i), ts_buf, "value" + std::to_string(i)));
+  }
+  ASSERT_OK(Flush());
+
+  // Compact to the bottommost level
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+  // Set full_history_ts_low to 150 - files with max_ts >= 150 should NOT be
+  // marked for bottommost compaction since seqno cannot be zeroed
+  ts_buf.clear();
+  PutFixed64(&ts_buf, 150);
+  ASSERT_OK(db_->IncreaseFullHistoryTsLow(db_->DefaultColumnFamily(), ts_buf));
+
+  // Release a snapshot to potentially trigger bottommost file marking
+  // but files should NOT be marked because max_ts (199) >= full_history_ts_low
+  // (150)
+  const Snapshot* snap = db_->GetSnapshot();
+  db_->ReleaseSnapshot(snap);
+
+  // Wait for any scheduled compactions - should complete without infinite loop
+  // Use a reasonable timeout to detect infinite loops
+  WaitForCompactOptions wfc_options;
+  wfc_options.timeout = std::chrono::microseconds(5000000);  // 5 seconds
+  Status s = dbfull()->WaitForCompact(wfc_options);
+  // Should succeed without timeout (no infinite compaction loop)
+  ASSERT_TRUE(s.ok() || s.IsTimedOut());
+  if (s.IsTimedOut()) {
+    // If timeout, the fix is not working - this should not happen
+    FAIL() << "WaitForCompact timed out - possible infinite compaction loop";
+  }
+
+  // Now set full_history_ts_low beyond max timestamp in the file (200+)
+  // This should allow the file to be properly marked and compacted
+  ts_buf.clear();
+  PutFixed64(&ts_buf, 300);
+  ASSERT_OK(db_->IncreaseFullHistoryTsLow(db_->DefaultColumnFamily(), ts_buf));
+
+  // Trigger another snapshot release to potentially mark files
+  snap = db_->GetSnapshot();
+  db_->ReleaseSnapshot(snap);
+
+  // Now compaction should clean up the file.
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+}
 
+// Test that files are NOT marked for bottommost compaction when UDT is enabled
+// and full_history_ts_low has never been set (empty).
+TEST_F(TimestampCompatibleCompactionTest,
+       BottommostCompactionSkipsWhenFullHistoryTsLowNotSet) {
+  Options options = CreateTimestampOptions();
+
+  DestroyAndReopen(options);
+
+  // Write some data with timestamps 100-199
   std::string ts_buf;
-  uint64_t timestamp = 1000;
-  constexpr auto kBatchSize = 1000;
-  constexpr auto kTotalRecords = 100000;
-
-  int record_count = 0;
-  auto kValueSize = 1024;
-
-  Random rnd(0);
-  while (record_count < kTotalRecords) {
-    // Create rows with timestamp
-    for (int i = 0; i < kBatchSize; i++) {
-      timestamp = 1000 + record_count + i;
-      ts_buf = "";
-      PutFixed64(&ts_buf, timestamp);
-      Slice ts(ts_buf);
-      // generate a random value, so that they are not easily compressable
-      auto value = rnd.RandomString(kValueSize);
+  for (int i = 0; i < 100; i++) {
+    ts_buf.clear();
+    PutFixed64(&ts_buf, 100 + i);
+    ASSERT_OK(
+        db_->Put(WriteOptions(), Key(i), ts_buf, "value" + std::to_string(i)));
+  }
+  ASSERT_OK(Flush());
+
+  // Compact to the bottommost level without setting full_history_ts_low
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+  ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+  // Verify files have valid max_timestamp
+  auto file_timestamps = GetAllFileTimestamps();
+  ASSERT_GE(file_timestamps.size(), 1U);
+  for (const auto& [level, min_ts, max_ts] : file_timestamps) {
+    ASSERT_FALSE(max_ts.empty()) << "max_timestamp should not be empty";
+  }
+
+  // full_history_ts_low is NOT set (empty), so files should NOT be marked
+  // for bottommost compaction even after releasing a snapshot.
+  // This tests the branch: if (full_history_ts_low.empty()) { continue; }
+  const Snapshot* snap = db_->GetSnapshot();
+  db_->ReleaseSnapshot(snap);
+
+  // Wait for any scheduled compactions
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // Now set full_history_ts_low to a value > max_timestamp (199) in the file
+  // This should allow the file to be properly marked and compacted
+  ts_buf.clear();
+  PutFixed64(&ts_buf, 300);
+  ASSERT_OK(db_->IncreaseFullHistoryTsLow(db_->DefaultColumnFamily(), ts_buf));
+
+  // Trigger another snapshot release to potentially mark files
+  snap = db_->GetSnapshot();
+  db_->ReleaseSnapshot(snap);
+
+  // Now compaction should be able to proceed since full_history_ts_low is set
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // Verify data is still readable
+  VerifyDataReadable(0, "value0", 250);
+}
+
+// Test that ingested SST files created with UDT have their min/max timestamps
+// properly extracted from table properties and populated in FileMetaData.
+// This verifies the fix in external_sst_file_ingestion_job.cc that calls
+// ExtractTimestampFromTableProperties after creating FileMetaData.
+TEST_F(TimestampCompatibleCompactionTest,
+       IngestedFileTimestampsExtractedFromTableProperties) {
+  Options options = CreateTimestampOptions();
+
+  DestroyAndReopen(options);
+
+  // Create an SST file WITH timestamps using SstFileWriter
+  std::string sst_file = dbname_ + "/ingested_udt_file.sst";
+  const uint64_t kMinTs = 100;
+  const uint64_t kMaxTs = 200;
+
+  {
+    SstFileWriter sst_file_writer(EnvOptions(), options);
+    ASSERT_OK(sst_file_writer.Open(sst_file));
+
+    std::string ts_buf;
+    for (int i = 0; i < 10; i++) {
+      // Alternate between min and max timestamps
+      uint64_t ts = (i % 2 == 0) ? kMinTs : kMaxTs;
+      ts_buf.clear();
+      PutFixed64(&ts_buf, ts);
+      // SstFileWriter with UDT comparator requires key with timestamp
       ASSERT_OK(
-          db_->Put(WriteOptions(), cfh, Key(record_count + i), ts, value));
-    }
-    ASSERT_OK(db_->Flush(FlushOptions(), cfh));
-
-    // Create a snapshot for read, then release it, so that
-    // oldest_snapshot_seqnum_ is advanced periodically
-    auto snapshot = db_->GetSnapshot();
-    ReadOptions read_options;
-    std::string read_ts_buf = "";
-    timestamp = 1000 + record_count + kBatchSize;
-    PutFixed64(&read_ts_buf, timestamp);
-    Slice read_ts(read_ts_buf);
-    read_options.timestamp = &read_ts;
-    read_options.snapshot = snapshot;
-    std::string value;
-    ASSERT_OK(db_->Get(read_options, cfh, Key(record_count), &value, &ts_buf));
-    db_->ReleaseSnapshot(snapshot);
-
-    // Delete all of the rows created
-    for (int i = 0; i < kBatchSize; i++) {
-      timestamp = 2000 + record_count + i;
-      ts_buf = "";
-      PutFixed64(&ts_buf, timestamp);
-      Slice ts(ts_buf);
-      ASSERT_OK(db_->Delete(WriteOptions(), cfh, Key(record_count + i), ts));
-    }
-    ASSERT_OK(db_->Flush(FlushOptions(), cfh));
-    record_count += kBatchSize;
-
-    // Advance full_history_ts_low with some delay periodically
-    timestamp = 1000 + record_count - kBatchSize;
-    ts_buf = "";
-    PutFixed64(&ts_buf, timestamp);
-    ASSERT_OK(db_->IncreaseFullHistoryTsLow(cfh, ts_buf));
-
-    constexpr bool debug = false;
-    if (debug) {
-      // Print stats from time to time
-      if (record_count % (kTotalRecords / 10) == 0) {
-        std::string cf_stats;
-        ASSERT_TRUE(db_->GetProperty(cfh, "rocksdb.cfstats-no-file-histogram",
-                                     &cf_stats));
-        printf("%s\n", cf_stats.c_str());
-        printf("db path %s\n", dbname_.c_str());
-        printf("completed record count %d\n", record_count);
-        printf("completed record percentage %f%%\n",
-               100 * (float)record_count / kTotalRecords);
-      }
+          sst_file_writer.Put(Key(i), ts_buf, "value" + std::to_string(i)));
     }
+    ASSERT_OK(sst_file_writer.Finish());
+  }
+
+  // Verify the SST file has timestamp properties before ingestion
+  {
+    std::unique_ptr<SstFileReader> reader(new SstFileReader(options));
+    ASSERT_OK(reader->Open(sst_file));
+    auto props = reader->GetTableProperties();
+    auto& user_collected = props->user_collected_properties;
+    ASSERT_TRUE(user_collected.find("rocksdb.timestamp_min") !=
+                user_collected.end())
+        << "SST file should have rocksdb.timestamp_min property";
+    ASSERT_TRUE(user_collected.find("rocksdb.timestamp_max") !=
+                user_collected.end())
+        << "SST file should have rocksdb.timestamp_max property";
+  }
+
+  // Ingest the SST file
+  IngestExternalFileOptions ifo;
+  ifo.move_files = false;
+  ASSERT_OK(db_->IngestExternalFile({sst_file}, ifo));
+
+  // Verify the ingested file has proper timestamps in FileMetaData
+  ASSERT_TRUE(HasFileWithTimestampRange(kMinTs, kMaxTs))
+      << "Ingested file should have min_timestamp=" << kMinTs
+      << " and max_timestamp=" << kMaxTs << " in FileMetaData";
+
+  // Verify timestamps persist after reopen
+  Reopen(options);
+
+  ASSERT_TRUE(HasFileWithTimestampRange(kMinTs, kMaxTs))
+      << "Ingested file timestamps should persist after reopen";
+
+  // Verify data is readable
+  VerifyDataReadable(0, "value0", kMaxTs);
+
+  // Clean up
+  ASSERT_OK(env_->DeleteFile(sst_file));
+}
+
+// Test that min/max timestamps are correctly tracked in FileMetaData and
+// persisted in the manifest during flush.
+TEST_F(TimestampCompatibleCompactionTest, TimestampRangePersistenceFlush) {
+  Options options = CreateTimestampOptions();
+
+  DestroyAndReopen(options);
+
+  // Expected timestamp range
+  const uint64_t kMinTs = 100;
+  const uint64_t kMaxTs = 200;
+
+  // Write data with specific timestamp range
+  WriteDataWithTimestampRange(0, 50, kMinTs, kMaxTs);
+  ASSERT_OK(Flush());
+
+  // First verify table properties have the timestamps
+  // (this confirms TimestampTablePropertiesCollector is working)
+  TablePropertiesCollection props;
+  ASSERT_OK(db_->GetPropertiesOfAllTables(&props));
+  ASSERT_EQ(1U, props.size());
+  for (const auto& item : props) {
+    auto& user_collected = item.second->user_collected_properties;
+    ASSERT_TRUE(user_collected.find("rocksdb.timestamp_min") !=
+                user_collected.end());
+    ASSERT_TRUE(user_collected.find("rocksdb.timestamp_max") !=
+                user_collected.end());
+    // Verify the collected timestamps match expected values
+    std::string collected_min_ts = user_collected.at("rocksdb.timestamp_min");
+    std::string collected_max_ts = user_collected.at("rocksdb.timestamp_max");
+    ASSERT_EQ(kMinTs, DecodeFixed64(collected_min_ts.data()));
+    ASSERT_EQ(kMaxTs, DecodeFixed64(collected_max_ts.data()));
   }
 
-  // Validate CF size is less than 20% of the total data created to validate the
-  // tombstones has collapsed
-  uint64_t cf_size = 0;
+  // Verify FileMetaData timestamps and persistence through reopen
+  VerifyTimestampRangeWithPersistence(options, kMinTs, kMaxTs);
 
-  // use TEST_WaitForCompact to wait for compaction to run for a while
-  WaitForCompactOptions wait_for_compact_options;
-  wait_for_compact_options.timeout = std::chrono::seconds(1);
+  // Verify we can still read the data
+  VerifyDataReadable(0, "value0", kMaxTs);
+}
+
+// Test that min/max timestamps are correctly merged during compaction
+// and persisted in the manifest.
+TEST_F(TimestampCompatibleCompactionTest, TimestampRangePersistenceCompaction) {
+  Options options = CreateTimestampOptions(true /* disable_auto_compactions */);
 
-  // For some reason the background compaction never ends when calling
-  // TEST_WaitForCompact without timeout, which causes the test to timeout. This
-  // likely indicates a bug in the compaction picking logic.
-  // TODO (issue #14223, fix potential bug in compaction picking logic)
-  int timeout = 60;
-  auto threshold = kTotalRecords * kValueSize * 0.2;
+  DestroyAndReopen(options);
 
-  do {
-    auto s = dbfull()->TEST_WaitForCompact(wait_for_compact_options);
-    ASSERT_TRUE(s.ok() || s.IsTimedOut());
-    ASSERT_TRUE(
-        db_->GetIntProperty(cfh, DB::Properties::kTotalSstFilesSize, &cf_size));
-  } while (cf_size > threshold && timeout-- > 0);
+  // Create multiple L0 files with different timestamp ranges
+  // File 1: timestamps 100-150
+  const uint64_t kFile1MinTs = 100;
+  const uint64_t kFile1MaxTs = 150;
+  WriteDataWithTimestampRange(0, 10, kFile1MinTs, kFile1MaxTs);
+  ASSERT_OK(Flush());
 
-  ASSERT_LE(cf_size, threshold);
+  // File 2: timestamps 50-80 (earlier range)
+  const uint64_t kFile2MinTs = 50;
+  const uint64_t kFile2MaxTs = 80;
+  WriteDataWithTimestampRange(10, 20, kFile2MinTs, kFile2MaxTs);
+  ASSERT_OK(Flush());
+
+  // File 3: timestamps 200-300 (later range)
+  const uint64_t kFile3MinTs = 200;
+  const uint64_t kFile3MaxTs = 300;
+  WriteDataWithTimestampRange(20, 30, kFile3MinTs, kFile3MaxTs);
+  ASSERT_OK(Flush());
 
-  delete cfh;
+  // Expected combined range: min=50, max=300
+  const uint64_t kExpectedMinTs = 50;
+  const uint64_t kExpectedMaxTs = 300;
+
+  // Verify we have 3 L0 files before compaction with valid timestamps
+  auto files_before = GetAllFileTimestamps();
+  ASSERT_EQ(3U, files_before.size());
+  for (const auto& [level, min_ts, max_ts] : files_before) {
+    ASSERT_EQ(0, level);  // All files should be in L0
+    ASSERT_FALSE(min_ts.empty());
+    ASSERT_FALSE(max_ts.empty());
+  }
+
+  // Trigger compaction
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // Verify timestamp range and persistence through reopen
+  VerifyTimestampRangeWithPersistence(options, kExpectedMinTs, kExpectedMaxTs);
+
+  // Verify data is still readable
+  VerifyDataReadable(0, "value0", kExpectedMaxTs);
+  VerifyDataReadable(15, "value15", kExpectedMaxTs);
+  VerifyDataReadable(25, "value25", kExpectedMaxTs);
 }
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/db/experimental.cc b/db/experimental.cc
index 597767b37b70..6350dede4ac3 100644
--- a/db/experimental.cc
+++ b/db/experimental.cc
@@ -158,15 +158,17 @@ Status UpdateManifestForFilesState(
               // Current state inconsistent with manifest
               ++files_updated;
               edit.DeleteFile(level, number);
-              edit.AddFile(
-                  level, number, lf->fd.GetPathId(), lf->fd.GetFileSize(),
-                  lf->smallest, lf->largest, lf->fd.smallest_seqno,
-                  lf->fd.largest_seqno, lf->marked_for_compaction, temp,
-                  lf->oldest_blob_file_number, lf->oldest_ancester_time,
-                  lf->file_creation_time, lf->epoch_number, lf->file_checksum,
-                  lf->file_checksum_func_name, lf->unique_id,
-                  lf->compensated_range_deletion_size, lf->tail_size,
-                  lf->user_defined_timestamps_persisted);
+              edit.AddFile(level, lf->fd.GetNumber(), lf->fd.GetPathId(),
+                           lf->fd.GetFileSize(), lf->smallest, lf->largest,
+                           lf->fd.smallest_seqno, lf->fd.largest_seqno,
+                           lf->marked_for_compaction, temp,
+                           lf->oldest_blob_file_number,
+                           lf->oldest_ancester_time, lf->file_creation_time,
+                           lf->epoch_number, lf->file_checksum,
+                           lf->file_checksum_func_name, lf->unique_id,
+                           lf->compensated_range_deletion_size, lf->tail_size,
+                           lf->user_defined_timestamps_persisted,
+                           lf->min_timestamp, lf->max_timestamp);
             }
           }
         } else {
diff --git a/db/external_sst_file_ingestion_job.cc b/db/external_sst_file_ingestion_job.cc
index fd7e195dd055..2b92bd2f09bd 100644
--- a/db/external_sst_file_ingestion_job.cc
+++ b/db/external_sst_file_ingestion_job.cc
@@ -11,6 +11,7 @@
 #include <unordered_set>
 #include <vector>
 
+#include "db/builder.h"
 #include "db/db_impl/db_impl.h"
 #include "db/version_edit.h"
 #include "file/file_util.h"
@@ -699,9 +700,13 @@ Status ExternalSstFileIngestionJob::AssignLevelsForOneBatch(
             ? kReservedEpochNumberForFileIngestedBehind
             : cfd_->NewEpochNumber(),  // orders files ingested to L0
         file->file_checksum, file->file_checksum_func_name, file->unique_id, 0,
-        tail_size, file->user_defined_timestamps_persisted);
+        tail_size, file->user_defined_timestamps_persisted, "", "");
     f_metadata.temperature = file->file_temperature;
     f_metadata.marked_for_compaction = marked_for_compaction;
+    // Extract min/max timestamps from table properties for UDT support.
+    // This ensures ingested files have proper timestamp ranges in FileMetaData,
+    // similar to files created by flush and compaction.
+    ExtractTimestampFromTableProperties(file->table_properties, &f_metadata);
     edit_.AddFile(file->picked_level, f_metadata);
 
     *batch_uppermost_level =
diff --git a/db/flush_job.cc b/db/flush_job.cc
index 12f94d7e7e7a..e5221afca878 100644
--- a/db/flush_job.cc
+++ b/db/flush_job.cc
@@ -1095,7 +1095,8 @@ Status FlushJob::WriteLevel0Table() {
                    meta_.file_creation_time, meta_.epoch_number,
                    meta_.file_checksum, meta_.file_checksum_func_name,
                    meta_.unique_id, meta_.compensated_range_deletion_size,
-                   meta_.tail_size, meta_.user_defined_timestamps_persisted);
+                   meta_.tail_size, meta_.user_defined_timestamps_persisted,
+                   meta_.min_timestamp, meta_.max_timestamp);
     edit_->SetBlobFileAdditions(std::move(blob_file_additions));
   }
   // Piggyback FlushJobInfo on the first first flushed memtable.
diff --git a/db/repair.cc b/db/repair.cc
index 05672957f805..941d69dedc11 100644
--- a/db/repair.cc
+++ b/db/repair.cc
@@ -704,17 +704,17 @@ class Repairer {
       VersionEdit dummy_edit;
       for (const auto* table : cf_id_and_tables.second) {
         // TODO(opt): separate out into multiple levels
+        const auto& meta = table->meta;
         dummy_edit.AddFile(
-            0, table->meta.fd.GetNumber(), table->meta.fd.GetPathId(),
-            table->meta.fd.GetFileSize(), table->meta.smallest,
-            table->meta.largest, table->meta.fd.smallest_seqno,
-            table->meta.fd.largest_seqno, table->meta.marked_for_compaction,
-            table->meta.temperature, table->meta.oldest_blob_file_number,
-            table->meta.oldest_ancester_time, table->meta.file_creation_time,
-            table->meta.epoch_number, table->meta.file_checksum,
-            table->meta.file_checksum_func_name, table->meta.unique_id,
-            table->meta.compensated_range_deletion_size, table->meta.tail_size,
-            table->meta.user_defined_timestamps_persisted);
+            0, meta.fd.GetNumber(), meta.fd.GetPathId(), meta.fd.GetFileSize(),
+            meta.smallest, meta.largest, meta.fd.smallest_seqno,
+            meta.fd.largest_seqno, meta.marked_for_compaction, meta.temperature,
+            meta.oldest_blob_file_number, meta.oldest_ancester_time,
+            meta.file_creation_time, meta.epoch_number, meta.file_checksum,
+            meta.file_checksum_func_name, meta.unique_id,
+            meta.compensated_range_deletion_size, meta.tail_size,
+            meta.user_defined_timestamps_persisted, meta.min_timestamp,
+            meta.max_timestamp);
       }
       s = dummy_version_builder.Apply(&dummy_edit);
       if (s.ok()) {
diff --git a/db/version_builder_test.cc b/db/version_builder_test.cc
index 3c7d8a61d739..a3e249887ab1 100644
--- a/db/version_builder_test.cc
+++ b/db/version_builder_test.cc
@@ -76,7 +76,8 @@ class VersionBuilderTest : public testing::Test {
         oldest_blob_file_number, kUnknownOldestAncesterTime,
         kUnknownFileCreationTime, epoch_number, kUnknownFileChecksum,
         kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0,
-        /* user_defined_timestamps_persisted */ true);
+        /* user_defined_timestamps_persisted */ true, /* min timestamp */ "",
+        /* max timestamp */ "");
     f->compensated_file_size = file_size;
     f->num_entries = num_entries;
     f->num_deletions = num_deletions;
diff --git a/db/version_edit.cc b/db/version_edit.cc
index 88150181bf4c..67a6f3cc5ba3 100644
--- a/db/version_edit.cc
+++ b/db/version_edit.cc
@@ -304,6 +304,15 @@ void VersionEdit::EncodeToNewFile4(const FileMetaData& f, int level,
     char p = static_cast<char>(0);
     PutLengthPrefixedSlice(dst, Slice(&p, 1));
   }
+  // Encode min/max timestamp if they are non-empty
+  if (!f.min_timestamp.empty()) {
+    PutVarint32(dst, NewFileCustomTag::kMinTimestamp);
+    PutLengthPrefixedSlice(dst, Slice(f.min_timestamp));
+  }
+  if (!f.max_timestamp.empty()) {
+    PutVarint32(dst, NewFileCustomTag::kMaxTimestamp);
+    PutLengthPrefixedSlice(dst, Slice(f.max_timestamp));
+  }
   TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:NewFile4:CustomizeFields",
                            dst);
 
@@ -443,6 +452,12 @@ const char* VersionEdit::DecodeNewFile4From(Slice* input, int& max_level,
           }
           f.user_defined_timestamps_persisted = (field[0] == 1);
           break;
+        case kMinTimestamp:
+          f.min_timestamp = field.ToString();
+          break;
+        case kMaxTimestamp:
+          f.max_timestamp = field.ToString();
+          break;
         default:
           if ((custom_tag & kCustomTagNonSafeIgnoreMask) != 0) {
             // Should not proceed if cannot understand it
diff --git a/db/version_edit.h b/db/version_edit.h
index 4f60a86fa0e4..742d2f8b0e52 100644
--- a/db/version_edit.h
+++ b/db/version_edit.h
@@ -276,6 +276,14 @@ struct FileMetaData {
   // false, it's explicitly written to Manifest.
   bool user_defined_timestamps_persisted = true;
 
+  // Minimum user-defined timestamp in the file. Empty if no UDT or unknown.
+  // This is populated from the table properties "rocksdb.timestamp_min".
+  std::string min_timestamp;
+
+  // Maximum user-defined timestamp in the file. Empty if no UDT or unknown.
+  // This is populated from the table properties "rocksdb.timestamp_max".
+  std::string max_timestamp;
+
   FileMetaData() = default;
 
   FileMetaData(uint64_t file, uint32_t file_path_id, uint64_t file_size,
@@ -288,7 +296,9 @@ struct FileMetaData {
                const std::string& _file_checksum_func_name,
                UniqueId64x2 _unique_id,
                const uint64_t _compensated_range_deletion_size,
-               uint64_t _tail_size, bool _user_defined_timestamps_persisted)
+               uint64_t _tail_size, bool _user_defined_timestamps_persisted,
+               const std::string& _min_timestamp,
+               const std::string& _max_timestamp)
       : fd(file, file_path_id, file_size, smallest_seq, largest_seq),
         smallest(smallest_key),
         largest(largest_key),
@@ -303,7 +313,9 @@ struct FileMetaData {
         file_checksum_func_name(_file_checksum_func_name),
         unique_id(std::move(_unique_id)),
         tail_size(_tail_size),
-        user_defined_timestamps_persisted(_user_defined_timestamps_persisted) {
+        user_defined_timestamps_persisted(_user_defined_timestamps_persisted),
+        min_timestamp(_min_timestamp),
+        max_timestamp(_max_timestamp) {
     TEST_SYNC_POINT_CALLBACK("FileMetaData::FileMetaData", this);
   }
 
@@ -386,7 +398,8 @@ struct FileMetaData {
     usage += sizeof(*this);
 #endif  // ROCKSDB_MALLOC_USABLE_SIZE
     usage += smallest.size() + largest.size() + file_checksum.size() +
-             file_checksum_func_name.size();
+             file_checksum_func_name.size() + min_timestamp.size() +
+             max_timestamp.size();
     return usage;
   }
 
@@ -737,17 +750,19 @@ class VersionEdit {
                const std::string& file_checksum_func_name,
                const UniqueId64x2& unique_id,
                const uint64_t compensated_range_deletion_size,
-               uint64_t tail_size, bool user_defined_timestamps_persisted) {
+               uint64_t tail_size, bool user_defined_timestamps_persisted,
+               const std::string& min_timestamp = "",
+               const std::string& max_timestamp = "") {
     assert(smallest_seqno <= largest_seqno);
     new_files_.emplace_back(
         level,
-        FileMetaData(file, file_path_id, file_size, smallest, largest,
-                     smallest_seqno, largest_seqno, marked_for_compaction,
-                     temperature, oldest_blob_file_number, oldest_ancester_time,
-                     file_creation_time, epoch_number, file_checksum,
-                     file_checksum_func_name, unique_id,
-                     compensated_range_deletion_size, tail_size,
-                     user_defined_timestamps_persisted));
+        FileMetaData(
+            file, file_path_id, file_size, smallest, largest, smallest_seqno,
+            largest_seqno, marked_for_compaction, temperature,
+            oldest_blob_file_number, oldest_ancester_time, file_creation_time,
+            epoch_number, file_checksum, file_checksum_func_name, unique_id,
+            compensated_range_deletion_size, tail_size,
+            user_defined_timestamps_persisted, min_timestamp, max_timestamp));
     files_to_quarantine_.push_back(file);
     if (!HasLastSequence() || largest_seqno > GetLastSequence()) {
       SetLastSequence(largest_seqno);
diff --git a/db/version_set.cc b/db/version_set.cc
index cf89ec8ad735..d716f6cbfcc1 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -3737,7 +3737,8 @@ bool ShouldChangeFileTemperature(const ImmutableOptions& ioptions,
 
 void VersionStorageInfo::ComputeCompactionScore(
     const ImmutableOptions& immutable_options,
-    const MutableCFOptions& mutable_cf_options) {
+    const MutableCFOptions& mutable_cf_options,
+    const std::string& full_history_ts_low) {
   double total_downcompact_bytes = 0.0;
   // Historically, score is defined as actual bytes in a level divided by
   // the level's target size, and 1.0 is the threshold for triggering
@@ -3936,7 +3937,8 @@ void VersionStorageInfo::ComputeCompactionScore(
   ComputeFilesMarkedForCompaction(max_output_level);
   ComputeBottommostFilesMarkedForCompaction(
       immutable_options.cf_allow_ingest_behind ||
-      immutable_options.allow_ingest_behind);
+          immutable_options.allow_ingest_behind,
+      immutable_options.user_comparator, full_history_ts_low);
   ComputeExpiredTtlFiles(immutable_options, mutable_cf_options.ttl);
   ComputeFilesMarkedForPeriodicCompaction(
       immutable_options, mutable_cf_options.periodic_compaction_seconds,
@@ -4527,17 +4529,20 @@ void VersionStorageInfo::GenerateFileLocationIndex() {
   }
 }
 
-void VersionStorageInfo::UpdateOldestSnapshot(SequenceNumber seqnum,
-                                              bool allow_ingest_behind) {
+void VersionStorageInfo::UpdateOldestSnapshot(
+    SequenceNumber seqnum, bool allow_ingest_behind, const Comparator* ucmp,
+    const std::string& full_history_ts_low) {
   assert(seqnum >= oldest_snapshot_seqnum_);
   oldest_snapshot_seqnum_ = seqnum;
   if (oldest_snapshot_seqnum_ > bottommost_files_mark_threshold_) {
-    ComputeBottommostFilesMarkedForCompaction(allow_ingest_behind);
+    ComputeBottommostFilesMarkedForCompaction(allow_ingest_behind, ucmp,
+                                              full_history_ts_low);
   }
 }
 
 void VersionStorageInfo::ComputeBottommostFilesMarkedForCompaction(
-    bool allow_ingest_behind) {
+    bool allow_ingest_behind, const Comparator* ucmp,
+    const std::string& full_history_ts_low) {
   bottommost_files_marked_for_compaction_.clear();
   bottommost_files_mark_threshold_ = kMaxSequenceNumber;
   if (allow_ingest_behind) {
@@ -4558,12 +4563,39 @@ void VersionStorageInfo::ComputeBottommostFilesMarkedForCompaction(
         current_time - static_cast<int64_t>(bottommost_file_compaction_delay_);
   }
 
+  // For UDT, we need to check if the file's max timestamp is below
+  // full_history_ts_low. If not, the compaction won't be able to collapse the
+  // timestamp to clean up the tombstone , so marking the file would be futile
+  // and could cause an infinite compaction loop.
+  const bool has_udt = ucmp && ucmp->timestamp_size() > 0;
+
   for (auto& level_and_file : bottommost_files_) {
     if (!level_and_file.second->being_compacted &&
         level_and_file.second->fd.largest_seqno != 0) {
       // largest_seqno might be nonzero due to containing the final key in an
       // earlier compaction, whose seqnum we didn't zero out.
       if (level_and_file.second->fd.largest_seqno < oldest_snapshot_seqnum_) {
+        if (has_udt) {
+          const std::string& max_ts = level_and_file.second->max_timestamp;
+          // If max_timestamp is empty, the file could come from very old
+          // version which does not have timestamp. In that case, we should pick
+          // the file for compaction. After compaction, the file will have
+          // max_timestamp set propertly.
+          if (!max_ts.empty()) {
+            // If full_history_ts_low is empty, it means it was never set, which
+            // means its value is 0. Therefore, it would be always smaller than
+            // max_timestamp
+            if (full_history_ts_low.empty()) {
+              continue;
+            }
+            // If max timestamp >= full_history_ts_low, skip this file
+            if (ucmp->CompareTimestamp(Slice(max_ts), full_history_ts_low) >=
+                0) {
+              continue;
+            }
+          }
+        }
+
         if (!needs_delay) {
           bottommost_files_marked_for_compaction_.push_back(level_and_file);
         } else if (creation_time_ub > 0) {
@@ -5639,7 +5671,8 @@ void VersionSet::AppendVersion(ColumnFamilyData* column_family_data,
   // compute new compaction score
   v->storage_info()->ComputeCompactionScore(
       column_family_data->ioptions(),
-      column_family_data->GetLatestMutableCFOptions());
+      column_family_data->GetLatestMutableCFOptions(),
+      column_family_data->GetFullHistoryTsLow());
 
   // Mark v finalized
   v->storage_info_.SetFinalized();
@@ -7102,7 +7135,6 @@ Status VersionSet::WriteCurrentStateToManifest(
 
         for (const auto& f : level_files) {
           assert(f);
-
           edit.AddFile(level, f->fd.GetNumber(), f->fd.GetPathId(),
                        f->fd.GetFileSize(), f->smallest, f->largest,
                        f->fd.smallest_seqno, f->fd.largest_seqno,
@@ -7111,7 +7143,8 @@ Status VersionSet::WriteCurrentStateToManifest(
                        f->file_creation_time, f->epoch_number, f->file_checksum,
                        f->file_checksum_func_name, f->unique_id,
                        f->compensated_range_deletion_size, f->tail_size,
-                       f->user_defined_timestamps_persisted);
+                       f->user_defined_timestamps_persisted, f->min_timestamp,
+                       f->max_timestamp);
         }
       }
 
diff --git a/db/version_set.h b/db/version_set.h
index 365d2838183d..800e55259872 100644
--- a/db/version_set.h
+++ b/db/version_set.h
@@ -200,7 +200,8 @@ class VersionStorageInfo {
   // REQUIRES: db_mutex held!!
   // TODO find a better way to pass compaction_options_fifo.
   void ComputeCompactionScore(const ImmutableOptions& immutable_options,
-                              const MutableCFOptions& mutable_cf_options);
+                              const MutableCFOptions& mutable_cf_options,
+                              const std::string& full_history_ts_low);
 
   // Estimate est_comp_needed_bytes_
   void EstimateCompactionBytesNeeded(
@@ -230,8 +231,15 @@ class VersionStorageInfo {
   // oldest snapshot changes as that is when bottom-level files can become
   // eligible for compaction.
   //
+  // For columns with User Defined Timestamps (UDT), also checks that the
+  // file's largest timestamp is below full_history_ts_low before marking,
+  // since compaction can only collapse timestamp when it is below this
+  // threshold.
+  //
   // REQUIRES: DB mutex held
-  void ComputeBottommostFilesMarkedForCompaction(bool allow_ingest_behind);
+  void ComputeBottommostFilesMarkedForCompaction(
+      bool allow_ingest_behind, const Comparator* ucmp,
+      const std::string& full_history_ts_low);
 
   // This computes files_marked_for_forced_blob_gc_ and is called by
   // ComputeCompactionScore()
@@ -248,7 +256,8 @@ class VersionStorageInfo {
   // files marked for compaction.
   // REQUIRES: DB mutex held
   void UpdateOldestSnapshot(SequenceNumber oldest_snapshot_seqnum,
-                            bool allow_ingest_behind);
+                            bool allow_ingest_behind, const Comparator* ucmp,
+                            const std::string& full_history_ts_low);
 
   int MaxInputLevel() const;
   int MaxOutputLevel(bool allow_ingest_behind) const;
diff --git a/db/version_set_test.cc b/db/version_set_test.cc
index fefde1170ae5..a4cf2698c078 100644
--- a/db/version_set_test.cc
+++ b/db/version_set_test.cc
@@ -56,7 +56,8 @@ class GenerateLevelFilesBriefTest : public testing::Test {
         kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
         kUnknownFileCreationTime, kUnknownEpochNumber, kUnknownFileChecksum,
         kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0,
-        /* user_defined_timestamps_persisted */ true);
+        /* user_defined_timestamps_persisted */ true, /* min timestamp */ "",
+        /* max timestamp */ "");
     files_.push_back(f);
   }
 
@@ -172,7 +173,8 @@ class VersionStorageInfoTestBase : public testing::Test {
         kUnknownOldestAncesterTime, kUnknownFileCreationTime,
         kUnknownEpochNumber, kUnknownFileChecksum, kUnknownFileChecksumFuncName,
         kNullUniqueId64x2, compensated_range_deletion_size, 0,
-        /* user_defined_timestamps_persisted */ true);
+        /* user_defined_timestamps_persisted */ true, /* min timestamp */ "",
+        /* max timestamp */ "");
     vstorage_.AddFile(level, f);
   }
 
@@ -391,7 +393,8 @@ TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicWithLargeL0_1) {
   ASSERT_EQ(51450U, vstorage_.MaxBytesForLevel(3));
   ASSERT_EQ(257250U, vstorage_.MaxBytesForLevel(4));
 
-  vstorage_.ComputeCompactionScore(ioptions_, mutable_cf_options_);
+  vstorage_.ComputeCompactionScore(ioptions_, mutable_cf_options_,
+                                   /*full_history_ts_low=*/"");
   // Only L0 hits compaction.
   ASSERT_EQ(vstorage_.CompactionScoreLevel(0), 0);
 }
@@ -421,7 +424,8 @@ TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicWithLargeL0_2) {
   ASSERT_EQ(51450U, vstorage_.MaxBytesForLevel(3));
   ASSERT_EQ(257250U, vstorage_.MaxBytesForLevel(4));
 
-  vstorage_.ComputeCompactionScore(ioptions_, mutable_cf_options_);
+  vstorage_.ComputeCompactionScore(ioptions_, mutable_cf_options_,
+                                   /*full_history_ts_low=*/"");
   // Although L2 and l3 have higher unadjusted compaction score, considering
   // a relatively large L0 being compacted down soon, L4 is picked up for
   // compaction.
@@ -453,7 +457,8 @@ TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicWithLargeL0_3) {
   ASSERT_EQ(2, vstorage_.base_level());
   ASSERT_EQ(20000U, vstorage_.MaxBytesForLevel(2));
 
-  vstorage_.ComputeCompactionScore(ioptions_, mutable_cf_options_);
+  vstorage_.ComputeCompactionScore(ioptions_, mutable_cf_options_,
+                                   /*full_history_ts_low=*/"");
   // Although L2 has higher unadjusted compaction score, considering
   // a relatively large L0 being compacted down soon, L3 is picked up for
   // compaction.
@@ -483,7 +488,8 @@ TEST_F(VersionStorageInfoTest, DrainUnnecessaryLevel) {
   ASSERT_EQ(1, vstorage_.base_level());
   ASSERT_EQ(1000, vstorage_.MaxBytesForLevel(1));
   ASSERT_EQ(10100, vstorage_.MaxBytesForLevel(3));
-  vstorage_.ComputeCompactionScore(ioptions_, mutable_cf_options_);
+  vstorage_.ComputeCompactionScore(ioptions_, mutable_cf_options_,
+                                   /*full_history_ts_low=*/"");
 
   // Tests that levels 1 and 3 are eligible for compaction.
   // Levels 1 and 3 are much smaller than target size,
@@ -1346,7 +1352,8 @@ class VersionSetTestBase {
           Temperature::kUnknown, info.oldest_blob_file_number, 0, 0,
           info.epoch_number, kUnknownFileChecksum, kUnknownFileChecksumFuncName,
           kNullUniqueId64x2, 0, 0,
-          /* user_defined_timestamps_persisted */ true);
+          /* user_defined_timestamps_persisted */ true, /* min timestamp */ "",
+          /* max timestamp */ "");
       if (info.file_missing) {
         ASSERT_OK(fs_->DeleteFile(fname, IOOptions(), nullptr));
       }
@@ -3940,7 +3947,8 @@ TEST_F(VersionSetTestMissingFiles, ManifestFarBehindSst) {
         largest_ikey, 0, 0, false, Temperature::kUnknown, 0, 0, 0,
         file_num /* epoch_number */, kUnknownFileChecksum,
         kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0,
-        /* user_defined_timestamps_persisted */ true);
+        /* user_defined_timestamps_persisted */ true, /* min timestamp */ "",
+        /* max timestamp */ "");
     added_files.emplace_back(0, meta);
   }
   WriteFileAdditionAndDeletionToManifest(
@@ -4001,7 +4009,8 @@ TEST_F(VersionSetTestMissingFiles, ManifestAheadofSst) {
         largest_ikey, 0, 0, false, Temperature::kUnknown, 0, 0, 0,
         file_num /* epoch_number */, kUnknownFileChecksum,
         kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0,
-        /* user_defined_timestamps_persisted */ true);
+        /* user_defined_timestamps_persisted */ true, /* min timestamp */ "",
+        /* max timestamp */ "");
     added_files.emplace_back(0, meta);
   }
   WriteFileAdditionAndDeletionToManifest(
diff --git a/unreleased_history/bug_fixes/fix_udt_infinite_compaction_loop.md b/unreleased_history/bug_fixes/fix_udt_infinite_compaction_loop.md
new file mode 100644
index 000000000000..ac08736c72b1
--- /dev/null
+++ b/unreleased_history/bug_fixes/fix_udt_infinite_compaction_loop.md
@@ -0,0 +1 @@
+Fixed an infinite compaction loop bug with User-Defined Timestamps (UDT) where bottommost files were repeatedly marked for compaction even though their timestamp could not be collapsed.

From eb5e1a2d1f7b739c8dc89aae8f9b66546f5932fe Mon Sep 17 00:00:00 2001
From: Xingbo Wang <xbw@meta.com>
Date: Tue, 20 Jan 2026 17:31:28 -0800
Subject: [PATCH 429/500] Use unique DB directory when TEST_TMPDIR is set
 (#14249)

Summary:
Some of the stress tests script run tests multiple times with TEST_TMPDIR set. When TEST_TMPDIR is set, the db directory is a fixed string. This caused the same DB directory was reused across db_crashtest.py script run. Typically, the DB folder is cleaned up after db_crashtest.py complete. But sometimes, the clean up command could fail. This caused the DB folder to be reused across different db_crashtest.py runs. Meantime, each db_crashtest.py run would randomize some of the parameters. This caused different parameters to be used with same DB directory, violating some of the assumption such as use_put_entity_one_in parameter to be not changed between runs. This change added a suffix to DB directory, so that each db_crashtest.py script run would generate a unique DB directory, which prevents the clean up failure issue causing test flaky.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14249

Test Plan:
Stress test local run

```
TEST_TMPDIR=/tmp/aaa /usr/local/bin/python3 -u tools/db_crashtest.py --stress_cmd=./db_stress --cleanup_cmd='' --simple blackbox  --duration 15 --interval 10

>>> Running db_stress with pid=113810: ./db_stress ... --db=/tmp/aaa/rocksdb_crashtest_blackbox_6967584463401575611 ...
```

Reviewed By: hx235

Differential Revision: D91069655

Pulled By: xingbowang

fbshipit-source-id: 327fc3cd0d8e3ef4b49e182e21bcd91a10647710
---
 tools/db_crashtest.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index b63347eeee7d..b681859ff5f8 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -461,13 +461,19 @@ def is_release_mode():
     return os.environ.get(_DEBUG_LEVEL_ENV_VAR) == "0"
 
 
+# Generate a unique run ID for this script execution. This ensures each run
+# gets a unique database directory when TEST_TMPDIR is set, avoiding issues
+# with parameter changes (like use_put_entity_one_in) between runs.
+run_id = str(random.randint(0, 2**63))
+
+
 def get_dbname(test_name):
     test_dir_name = "rocksdb_crashtest_" + test_name
     test_tmpdir = os.environ.get(_TEST_DIR_ENV_VAR)
     if test_tmpdir is None or test_tmpdir == "":
         dbname = tempfile.mkdtemp(prefix=test_dir_name)
     else:
-        dbname = test_tmpdir + "/" + test_dir_name
+        dbname = test_tmpdir + "/" + test_dir_name + "_" + run_id
         if not is_remote_db:
             shutil.rmtree(dbname, True)
             if cleanup_cmd is not None:

From a6af31747668307c146d012518bb9585c83f1d2f Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Wed, 21 Jan 2026 09:28:06 -0800
Subject: [PATCH 430/500] Use format_version=7 by default, fix perf bug
 (#14239)

Summary:
Since it's been > 6 months and we have production uses, migrate to fv=7 by default. One unit test needed an update for the change to table properties with fv=7.

On making this change, PresetCompressionDictTest tests detected extra memory usage by decompressing LZ4 with dictionary compression. This turned out to be a bug in `std::find` usage that led to using the ZSTD-optimized decompressor (with digested dictionary usage) in cases where it is not needed. I've fixed the bug and improved the unit tests that found the bug.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14239

Test Plan: existing tests, including format compatible CI job (updated, and run locally with SHORT_TEST=1)

Reviewed By: hx235

Differential Revision: D90728697

Pulled By: pdillinger

fbshipit-source-id: 8f1a0e9ca59a88c18eaa4cdfdea00309175ce30a
---
 db/compact_files_test.cc                      |  5 +-
 include/rocksdb/table.h                       |  2 +-
 .../org/rocksdb/BlockBasedTableConfig.java    |  2 +-
 tools/check_format_compatible.sh              |  4 +-
 unreleased_history/behavior_changes/fv7.md    |  1 +
 .../bug_fixes/compression_perf_fv7.md         |  1 +
 util/compression.cc                           |  5 +-
 util/compression_test.cc                      | 76 +++++++++----------
 8 files changed, 49 insertions(+), 47 deletions(-)
 create mode 100644 unreleased_history/behavior_changes/fv7.md
 create mode 100644 unreleased_history/bug_fixes/compression_perf_fv7.md

diff --git a/db/compact_files_test.cc b/db/compact_files_test.cc
index b1331d1ccff9..730921f9680b 100644
--- a/db/compact_files_test.cc
+++ b/db/compact_files_test.cc
@@ -433,8 +433,9 @@ TEST_F(CompactFilesTest, SentinelCompressionType) {
     ROCKSDB_NAMESPACE::TablePropertiesCollection all_tables_props;
     ASSERT_OK(db->GetPropertiesOfAllTables(&all_tables_props));
     for (const auto& name_and_table_props : all_tables_props) {
-      ASSERT_EQ(CompressionTypeToString(CompressionType::kZlibCompression),
-                name_and_table_props.second->compression_name);
+      // As of format_version 7, more elaborate information is encoded into the
+      // compression_name property
+      ASSERT_EQ("BuiltinV2;02;", name_and_table_props.second->compression_name);
     }
     delete db;
   }
diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h
index 9727d30a3017..1f46217fbca3 100644
--- a/include/rocksdb/table.h
+++ b/include/rocksdb/table.h
@@ -593,7 +593,7 @@ struct BlockBasedTableOptions {
   // validation and sufficient time and number of releases have elapsed
   // (6 months recommended) to ensure a clean downgrade/revert path for users
   // who might only upgrade a few times per year.
-  uint32_t format_version = 6;
+  uint32_t format_version = 7;
 
   // Store index blocks on disk in compressed format. Changing this option to
   // false  will avoid the overhead of decompression if index blocks are evicted
diff --git a/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java b/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java
index 18d1bebacbd6..df21d774484d 100644
--- a/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java
+++ b/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java
@@ -37,7 +37,7 @@ public BlockBasedTableConfig() {
     wholeKeyFiltering = true;
     verifyCompression = false;
     readAmpBytesPerBit = 0;
-    formatVersion = 6;
+    formatVersion = 7;
     enableIndexCompression = true;
     blockAlign = false;
     superBlockAlignmentSize = 0;
diff --git a/tools/check_format_compatible.sh b/tools/check_format_compatible.sh
index d764467403d0..ede9263ecd38 100755
--- a/tools/check_format_compatible.sh
+++ b/tools/check_format_compatible.sh
@@ -137,7 +137,7 @@ EOF
 
 # To check for DB forward compatibility with loading options (old version
 # reading data from new), as well as backward compatibility
-declare -a db_forward_with_options_refs=("8.6.fb" "8.7.fb" "8.8.fb" "8.9.fb" "8.10.fb" "8.11.fb" "9.0.fb" "9.1.fb" "9.2.fb" "9.3.fb" "9.4.fb" "9.5.fb" "9.6.fb" "9.7.fb" "9.8.fb" "9.9.fb" "9.10.fb" "9.11.fb" "10.0.fb" "10.1.fb" "10.2.fb" "10.3.fb" "10.4.fb" "10.5.fb" "10.6.fb" "10.7.fb" "10.8.fb" "10.9.fb" "10.10.fb")
+declare -a db_forward_with_options_refs=("10.4.fb" "10.5.fb" "10.6.fb" "10.7.fb" "10.8.fb" "10.9.fb" "10.10.fb")
 # To check for DB forward compatibility without loading options (in addition
 # to the "with loading options" set), as well as backward compatibility
 declare -a db_forward_no_options_refs=() # N/A at the moment
@@ -145,7 +145,7 @@ declare -a db_forward_no_options_refs=() # N/A at the moment
 # To check for SST ingestion backward compatibility (new version reading
 # data from old) (ldb ingest_extern_sst added in 5.16.x, back-ported to
 # 5.14.x, 5.15.x)
-declare -a ext_backward_only_refs=("5.14.fb" "5.15.fb" "5.16.fb" "5.17.fb" "5.18.fb" "6.0.fb" "6.1.fb" "6.2.fb" "6.3.fb" "6.4.fb" "6.5.fb" "6.6.fb" "6.7.fb" "6.8.fb" "6.9.fb" "6.10.fb" "6.11.fb" "6.12.fb" "6.13.fb" "6.14.fb" "6.15.fb" "6.16.fb" "6.17.fb" "6.18.fb" "6.19.fb" "6.20.fb" "6.21.fb" "6.22.fb" "6.23.fb" "6.24.fb" "6.25.fb" "6.26.fb" "6.27.fb" "6.28.fb" "6.29.fb" "7.0.fb" "7.1.fb" "7.2.fb" "7.3.fb" "7.4.fb" "7.5.fb" "7.6.fb" "7.7.fb" "7.8.fb" "7.9.fb" "7.10.fb" "8.0.fb" "8.1.fb" "8.2.fb" "8.3.fb" "8.4.fb" "8.5.fb")
+declare -a ext_backward_only_refs=("5.14.fb" "5.15.fb" "5.16.fb" "5.17.fb" "5.18.fb" "6.0.fb" "6.1.fb" "6.2.fb" "6.3.fb" "6.4.fb" "6.5.fb" "6.6.fb" "6.7.fb" "6.8.fb" "6.9.fb" "6.10.fb" "6.11.fb" "6.12.fb" "6.13.fb" "6.14.fb" "6.15.fb" "6.16.fb" "6.17.fb" "6.18.fb" "6.19.fb" "6.20.fb" "6.21.fb" "6.22.fb" "6.23.fb" "6.24.fb" "6.25.fb" "6.26.fb" "6.27.fb" "6.28.fb" "6.29.fb" "7.0.fb" "7.1.fb" "7.2.fb" "7.3.fb" "7.4.fb" "7.5.fb" "7.6.fb" "7.7.fb" "7.8.fb" "7.9.fb" "7.10.fb" "8.0.fb" "8.1.fb" "8.2.fb" "8.3.fb" "8.4.fb" "8.5.fb" "8.6.fb" "8.7.fb" "8.8.fb" "8.9.fb" "8.10.fb" "8.11.fb" "9.0.fb" "9.1.fb" "9.2.fb" "9.3.fb" "9.4.fb" "9.5.fb" "9.6.fb" "9.7.fb" "9.8.fb" "9.9.fb" "9.10.fb" "9.11.fb" "10.0.fb" "10.1.fb" "10.2.fb" "10.3.fb")
 # To check for SST ingestion forward compatibility (old version reading
 # data from new) as well as backward compatibility
 declare -a ext_forward_refs=("${db_forward_no_options_refs[@]}" "${db_forward_with_options_refs[@]}")
diff --git a/unreleased_history/behavior_changes/fv7.md b/unreleased_history/behavior_changes/fv7.md
new file mode 100644
index 000000000000..91be747f80d5
--- /dev/null
+++ b/unreleased_history/behavior_changes/fv7.md
@@ -0,0 +1 @@
+* The new default for `BlockBasedTableOptions::format_version` is 7, which has been supported since RocksDB 10.4.0 and is required in order to use CompressionManagers supporting custom compression types.
diff --git a/unreleased_history/bug_fixes/compression_perf_fv7.md b/unreleased_history/bug_fixes/compression_perf_fv7.md
new file mode 100644
index 000000000000..422e96bb7771
--- /dev/null
+++ b/unreleased_history/bug_fixes/compression_perf_fv7.md
@@ -0,0 +1 @@
+* Fixed a small performance bug with `format_version=7` when decompressing formats other than Snappy and ZSTD.
diff --git a/util/compression.cc b/util/compression.cc
index 3cde7c4c32ac..71eaa1393370 100644
--- a/util/compression.cc
+++ b/util/compression.cc
@@ -1632,10 +1632,13 @@ class BuiltinCompressionManagerV2 final : public CompressionManager {
       return nullptr;
     } else if (types_begin + 1 == types_end &&
                *types_begin == kSnappyCompression) {
+      // Exclusively Snappy
       return GetSnappyDecompressor();
-    } else if (std::find(types_begin, types_end, kZSTD)) {
+    } else if (std::find(types_begin, types_end, kZSTD) != types_end) {
+      // Includes ZSTD
       return GetZstdDecompressor();
     } else {
+      // Everything else
       return GetGeneralDecompressor();
     }
   }
diff --git a/util/compression_test.cc b/util/compression_test.cc
index c40503b00ed9..18144d99c93e 100644
--- a/util/compression_test.cc
+++ b/util/compression_test.cc
@@ -583,17 +583,15 @@ TEST_P(PresetCompressionDictTest, Flush) {
     ASSERT_GT(
         TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
         0);
-    // TODO(ajkr): fix the below assertion to work with ZSTD. The expectation on
-    // number of bytes needs to be adjusted in case the cached block is in
-    // ZSTD's digested dictionary format.
-    if (compression_type_ != kZSTD) {
-      // Although we limited buffering to `kBlockLen`, there may be up to two
-      // blocks of data included in the dictionary since we only check limit
-      // after each block is built.
-      ASSERT_LE(TestGetTickerCount(options,
-                                   BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
-                2 * kBlockLen);
-    }
+    ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_ADD), 1);
+    // Although we stop buffering after `kBlockLen` bytes, there may be up to
+    // two blocks of data included in the dictionary since we only check limit
+    // after each block is built. And because block cache charges for bytes used
+    // by ZSTD's digested dictionary, we need a larger factor for the memory
+    // overheads in that case.
+    ASSERT_LE(
+        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
+        (compression_type_ == kZSTD ? 10 : 2) * kBlockLen);
   }
 }
 
@@ -642,8 +640,9 @@ TEST_P(PresetCompressionDictTest, CompactNonBottommost) {
   }
   ASSERT_EQ("2,0,1", FilesPerLevel(0));
 
-  uint64_t prev_compression_dict_bytes_inserted =
-      TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT);
+  PopTicker(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT);
+  PopTicker(options, BLOCK_CACHE_COMPRESSION_DICT_ADD);
+
   // This L0->L1 compaction merges the two L0 files into L1. The produced L1
   // file is not bottommost due to the existing L2 file covering the same key-
   // range.
@@ -655,22 +654,20 @@ TEST_P(PresetCompressionDictTest, CompactNonBottommost) {
   if (bottommost_) {
     ASSERT_EQ(
         TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
-        prev_compression_dict_bytes_inserted);
+        0);
   } else {
     ASSERT_GT(
         TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
-        prev_compression_dict_bytes_inserted);
-    // TODO(ajkr): fix the below assertion to work with ZSTD. The expectation on
-    // number of bytes needs to be adjusted in case the cached block is in
-    // ZSTD's digested dictionary format.
-    if (compression_type_ != kZSTD) {
-      // Although we limited buffering to `kBlockLen`, there may be up to two
-      // blocks of data included in the dictionary since we only check limit
-      // after each block is built.
-      ASSERT_LE(TestGetTickerCount(options,
-                                   BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
-                prev_compression_dict_bytes_inserted + 2 * kBlockLen);
-    }
+        0);
+    ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_ADD), 1);
+    // Although we stop buffering after `kBlockLen` bytes, there may be up to
+    // two blocks of data included in the dictionary since we only check limit
+    // after each block is built. And because block cache charges for bytes used
+    // by ZSTD's digested dictionary, we need a larger factor for the memory
+    // overheads in that case.
+    ASSERT_LE(
+        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
+        (compression_type_ == kZSTD ? 10 : 2) * kBlockLen);
   }
 }
 
@@ -713,25 +710,24 @@ TEST_P(PresetCompressionDictTest, CompactBottommost) {
   }
   ASSERT_EQ("2", FilesPerLevel(0));
 
-  uint64_t prev_compression_dict_bytes_inserted =
-      TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT);
+  PopTicker(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT);
+  PopTicker(options, BLOCK_CACHE_COMPRESSION_DICT_ADD);
+
   CompactRangeOptions cro;
   ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
   ASSERT_EQ("0,1", FilesPerLevel(0));
   ASSERT_GT(
       TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
-      prev_compression_dict_bytes_inserted);
-  // TODO(ajkr): fix the below assertion to work with ZSTD. The expectation on
-  // number of bytes needs to be adjusted in case the cached block is in ZSTD's
-  // digested dictionary format.
-  if (compression_type_ != kZSTD) {
-    // Although we limited buffering to `kBlockLen`, there may be up to two
-    // blocks of data included in the dictionary since we only check limit after
-    // each block is built.
-    ASSERT_LE(
-        TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
-        prev_compression_dict_bytes_inserted + 2 * kBlockLen);
-  }
+      0);
+  ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_ADD), 1);
+  // Although we stop buffering after `kBlockLen` bytes, there may be up to
+  // two blocks of data included in the dictionary since we only check limit
+  // after each block is built. And because block cache charges for bytes used
+  // by ZSTD's digested dictionary, we need a larger factor for the memory
+  // overheads in that case.
+  ASSERT_LE(
+      TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
+      (compression_type_ == kZSTD ? 10 : 2) * kBlockLen);
 }
 
 class CompactionCompressionListener : public EventListener {

From f312633aff5329d396f48f40da540f9b48dff246 Mon Sep 17 00:00:00 2001
From: Andrew Chang <andrewrchang@meta.com>
Date: Wed, 21 Jan 2026 16:26:06 -0800
Subject: [PATCH 431/500] Fix AbortIO documentation to match actual behavior
 (#14251)

Summary:
Pull Request resolved: https://github.com/facebook/rocksdb/pull/14251

The AbortIO API documentation incorrectly stated that the callback
should NOT be called for aborted io_handles. However, the actual
implementation in fs_posix.cc does invoke the callback with
IOStatus::Aborted() status after cancelling requests:

```
// fs_posix.cc:1252-1260
if (posix_handle->req_count == 2 &&
    static_cast<Posix_IOHandle*>(io_handles[i]) == posix_handle) {
  posix_handle->is_finished = true;
  FSReadRequest req;
  req.status = IOStatus::Aborted();
  posix_handle->cb(req, posix_handle->cb_arg);

  break;
}
```

This change corrects the documentation to match the actual behavior
in RocksDB.

Reviewed By: anand1976

Differential Revision: D91073466

fbshipit-source-id: 47ae14a09e9386cc68049ca272d6b712f5a9bed7
---
 include/rocksdb/file_system.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/rocksdb/file_system.h b/include/rocksdb/file_system.h
index 1a08d43041bb..16f807e4f299 100644
--- a/include/rocksdb/file_system.h
+++ b/include/rocksdb/file_system.h
@@ -750,7 +750,7 @@ class FileSystem : public Customizable {
   // Abort the read IO requests submitted asynchronously. Underlying FS is
   // required to support AbortIO API. AbortIO implementation should ensure that
   // the all the read requests related to io_handles should be aborted and
-  // it shouldn't call the callback for these io_handles.
+  // it should call the callback for these io_handles.
   virtual IOStatus AbortIO(std::vector<void*>& /*io_handles*/) {
     return IOStatus::OK();
   }

From f84351de981a89d029bc2520ad945726f2f4b930 Mon Sep 17 00:00:00 2001
From: Andrew Chang <andrewrchang@meta.com>
Date: Wed, 21 Jan 2026 19:17:05 -0800
Subject: [PATCH 432/500] Fix AbortIO hang when aborting multiple io_uring
 handles (#14252)

Summary:
Pull Request resolved: https://github.com/facebook/rocksdb/pull/14252

Fixed a bug in PosixFileSystem::AbortIO that could cause an infinite hang
when aborting multiple concurrent async IO handles.

The bug occurred in the completion processing loop: when an io_uring
completion arrived for a handle other than the one currently being waited
for (io_handles[i]), the code would increment that handle's req_count but
only mark it as finished if it also matched io_handles[i]. This meant
completions for other handles were consumed but those handles were never
marked as finished.

Later, when iterating to those handles, the code would enter
io_uring_wait_cqe expecting more completions, but they had already been
consumed - causing an infinite hang.

The fix aligns AbortIO's completion handling with what Poll() already does:
mark handles as finished whenever their completions arrive, regardless of
which handle we're currently waiting for in the outer loop. Only the break
statement remains conditional on matching io_handles[i].

Reviewed By: anand1976

Differential Revision: D91070044

fbshipit-source-id: 47faf5f0df3e26a2aa83444bbac623f43f560933
---
 env/env_test.cc | 194 ++++++++++++++++++++++++++++++++++++++++++++++++
 env/fs_posix.cc |  12 ++-
 2 files changed, 203 insertions(+), 3 deletions(-)

diff --git a/env/env_test.cc b/env/env_test.cc
index e6f56402ea77..c035a526c881 100644
--- a/env/env_test.cc
+++ b/env/env_test.cc
@@ -3725,6 +3725,200 @@ TEST_F(TestAsyncRead, InterleavingIOUringOperations) {
 #endif
 }
 
+// Helper function to run AbortIO test with parameterized read requests.
+// Each request is specified as {offset, length}.
+// use_direct_io: if true, opens the file with O_DIRECT to bypass page cache.
+// iterations: number of times to repeat the test (useful for race conditions).
+void TestAbortIOWithRequests(
+    Env* env, size_t file_size,
+    const std::vector<std::pair<uint64_t, size_t>>& read_specs,
+    bool use_direct_io = false, int iterations = 1) {
+#if defined(ROCKSDB_IOURING_PRESENT)
+  fprintf(stderr,
+          "TestAbortIOWithRequests: file_size=%zu, num_reads=%zu, "
+          "direct_io=%d, iterations=%d\n",
+          file_size, read_specs.size(), use_direct_io, iterations);
+  std::shared_ptr<FileSystem> fs = env->GetFileSystem();
+  std::string fname = test::PerThreadDBPath(env, "testfile_abortio");
+
+  constexpr size_t kSectorSize = 4096;
+
+  for (int iter = 0; iter < iterations; iter++) {
+    // 1. Create test file of specified size using direct IO
+    {
+      std::unique_ptr<FSWritableFile> wfile;
+      FileOptions file_opts;
+      file_opts.use_direct_writes = true;
+      ASSERT_OK(fs->NewWritableFile(fname, file_opts, &wfile, nullptr));
+
+      // Round up to full sectors for direct IO writes
+      size_t num_sectors = (file_size + kSectorSize - 1) / kSectorSize;
+      for (size_t i = 0; i < num_sectors; ++i) {
+        auto data = NewAligned(kSectorSize, static_cast<char>(i + 1));
+        Slice slice(data.get(), kSectorSize);
+        ASSERT_OK(wfile->Append(slice, IOOptions(), nullptr));
+      }
+
+      // Truncate to exact file size if not aligned to sector boundary
+      if (file_size % kSectorSize != 0) {
+        ASSERT_OK(wfile->Truncate(file_size, IOOptions(), nullptr));
+      }
+
+      ASSERT_OK(wfile->Close(IOOptions(), nullptr));
+    }
+
+    // 2. Submit ReadAsync requests and immediately abort
+    {
+      FileOptions file_opts;
+      file_opts.use_direct_reads = use_direct_io;
+      std::unique_ptr<FSRandomAccessFile> file;
+      ASSERT_OK(fs->NewRandomAccessFile(fname, file_opts, &file, nullptr));
+
+      const size_t num_reads = read_specs.size();
+      IOOptions opts;
+      std::vector<void*> io_handles(num_reads);
+      std::vector<FSReadRequest> reqs(num_reads);
+      std::vector<std::unique_ptr<char, Deleter>> data;
+      std::vector<size_t> vals;
+      IOHandleDeleter del_fn;
+
+      // Initialize read requests from specs
+      for (size_t i = 0; i < num_reads; i++) {
+        reqs[i].offset = read_specs[i].first;
+        reqs[i].len = read_specs[i].second;
+        data.emplace_back(NewAligned(reqs[i].len, 0));
+        reqs[i].scratch = data.back().get();
+        vals.push_back(i);
+      }
+
+      // Callback
+      std::function<void(FSReadRequest&, void*)> callback =
+          [&](FSReadRequest& req, void* cb_arg) {
+            size_t i = *(reinterpret_cast<size_t*>(cb_arg));
+            reqs[i].status = req.status;
+          };
+
+      // Submit all ReadAsync requests
+      for (size_t i = 0; i < num_reads; i++) {
+        void* cb_arg = static_cast<void*>(&(vals[i]));
+        IOStatus s = file->ReadAsync(reqs[i], opts, callback, cb_arg,
+                                     &(io_handles[i]), &del_fn, nullptr);
+        if (s.IsNotSupported()) {
+          // io_uring not supported, clean up and skip
+          fprintf(stderr,
+                  "WARNING: io_uring not supported, skipping test: %s\n",
+                  s.ToString().c_str());
+          for (size_t j = 0; j < i; j++) {
+            if (io_handles[j]) {
+              del_fn(io_handles[j]);
+            }
+          }
+          ASSERT_OK(fs->DeleteFile(fname, IOOptions(), nullptr));
+          return;
+        }
+        ASSERT_OK(s);
+      }
+
+      // Immediately call AbortIO - this should NOT hang
+      ASSERT_OK(fs->AbortIO(io_handles));
+
+      // Clean up handles
+      for (size_t i = 0; i < num_reads; i++) {
+        if (io_handles[i]) {
+          del_fn(io_handles[i]);
+        }
+      }
+    }
+
+    ASSERT_OK(fs->DeleteFile(fname, IOOptions(), nullptr));
+  }
+
+  fprintf(stderr, "TestAbortIOWithRequests: completed %d iterations\n",
+          iterations);
+#else
+  fprintf(stderr,
+          "TestAbortIOWithRequests: SKIPPED (ROCKSDB_IOURING_PRESENT not "
+          "defined)\n");
+  (void)env;
+  (void)file_size;
+  (void)read_specs;
+  (void)use_direct_io;
+  (void)iterations;
+#endif
+}
+
+// Test overlapping reads at aligned offsets (multiples of 4KB)
+TEST_F(TestAsyncRead, AbortIOOverlappingAligned) {
+  // 4 reads of 16KB each, overlapping by 8KB, all at 4KB-aligned offsets
+  // Read 0: [0, 16KB), Read 1: [8KB, 24KB), Read 2: [16KB, 32KB), Read 3:
+  // [24KB, 40KB)
+  std::vector<std::pair<uint64_t, size_t>> specs = {
+      {0, 16384},
+      {8192, 16384},
+      {16384, 16384},
+      {24576, 16384},
+  };
+  TestAbortIOWithRequests(env_, 64 * 1024, specs);
+}
+
+// Test reads at unaligned offsets (not multiples of 4KB)
+TEST_F(TestAsyncRead, AbortIOUnalignedOffsets) {
+  // Reads starting at non-4KB-aligned offsets
+  std::vector<std::pair<uint64_t, size_t>> specs = {
+      {1000, 8192},    // starts at 1000 (unaligned)
+      {5000, 12288},   // starts at 5000 (unaligned), spans multiple sectors
+      {15000, 8192},   // starts at 15000 (unaligned)
+      {25500, 16384},  // starts at 25500 (unaligned)
+  };
+  TestAbortIOWithRequests(env_, 64 * 1024, specs);
+}
+
+// Test mix of aligned and unaligned, various sizes
+TEST_F(TestAsyncRead, AbortIOMixedOffsets) {
+  std::vector<std::pair<uint64_t, size_t>> specs = {
+      {0, 4096},       // aligned, 1 sector
+      {1500, 8192},    // unaligned, 2 sectors
+      {4096, 20480},   // aligned, 5 sectors
+      {7000, 4096},    // unaligned, spans 2 sectors
+      {16384, 32768},  // aligned, 8 sectors
+      {50000, 8192},   // unaligned
+  };
+  TestAbortIOWithRequests(env_, 128 * 1024, specs);
+}
+
+// Stress test with many concurrent handles
+TEST_F(TestAsyncRead, AbortIOStress) {
+  std::vector<std::pair<uint64_t, size_t>> specs;
+  // 16 overlapping reads with mixed alignment
+  for (int i = 0; i < 16; i++) {
+    uint64_t offset = i * 4000;          // Not aligned to 4KB
+    size_t len = 8192 + (i % 4) * 4096;  // 8KB to 20KB
+    specs.emplace_back(offset, len);
+  }
+  TestAbortIOWithRequests(env_, 256 * 1024, specs);
+}
+
+// Regression test for a fixed bug in AbortIO where out-of-order io_uring
+// completions could cause an infinite hang. The bug occurred when completions
+// for a different handle arrived while waiting for the current handle - the
+// code would consume those completions but not mark the handle as finished,
+// causing a hang when later iterating to that handle.
+//
+// Uses a large read (1MB) followed by a small read (4KB) with Direct I/O to
+// maximize the chance of out-of-order completions. Runs 100 iterations to
+// increase the likelihood of triggering the race condition.
+TEST_F(TestAsyncRead, AbortIOReversedHandles) {
+  // Request 0: LARGE (1MB) at offset 0
+  // Request 1: SMALL (4KB) at offset 1MB
+  std::vector<std::pair<uint64_t, size_t>> specs = {
+      {0, 1024 * 1024},     // 1MB read
+      {1024 * 1024, 4096},  // 4KB read at 1MB offset
+  };
+  // 2MB file, Direct I/O enabled, 100 iterations
+  TestAbortIOWithRequests(env_, 2 * 1024 * 1024, specs,
+                          /*use_direct_io=*/true, /*iterations=*/100);
+}
+
 struct StaticDestructionTester {
   bool activated = false;
   ~StaticDestructionTester() {
diff --git a/env/fs_posix.cc b/env/fs_posix.cc
index 34efe1204f6d..de8152d781bf 100644
--- a/env/fs_posix.cc
+++ b/env/fs_posix.cc
@@ -1249,14 +1249,20 @@ class PosixFileSystem : public FileSystem {
         //
         // Every handle has to wait for 2 requests completion: original one and
         // the cancel request which is tracked by PosixHandle::req_count.
-        if (posix_handle->req_count == 2 &&
-            static_cast<Posix_IOHandle*>(io_handles[i]) == posix_handle) {
+        // Note: We must mark is_finished and invoke the callback for ANY handle
+        // that reaches req_count == 2, not just the one we're currently waiting
+        // for (io_handles[i]). Otherwise, if completions arrive out of order,
+        // we consume another handle's completions without marking it finished,
+        // causing an infinite hang when we later wait for that handle.
+        if (posix_handle->req_count == 2) {
           posix_handle->is_finished = true;
           FSReadRequest req;
           req.status = IOStatus::Aborted();
           posix_handle->cb(req, posix_handle->cb_arg);
 
-          break;
+          if (static_cast<Posix_IOHandle*>(io_handles[i]) == posix_handle) {
+            break;
+          }
         }
       }
     }

From b89d290c20a3e18528118d2255a04f057fe34137 Mon Sep 17 00:00:00 2001
From: Anand Ananthabhotla <anand76@meta.com>
Date: Wed, 21 Jan 2026 23:23:38 -0800
Subject: [PATCH 433/500] Add MultiScan statistics (#14248)

Summary:
Pull Request resolved: https://github.com/facebook/rocksdb/pull/14248

### Overview

This diff introduces the addition of multi-scan statistics to RocksDB, enhancing the database's ability to monitor and analyze performance during multi-scan operations.

### Key Changes

#### Implemented Multi-Scan Statistics

The following statistics were implemented to provide deeper insights into multi-scan operations:

- **MULTISCAN_PREPARE_MICROS**: Measures the time (in microseconds) spent preparing for multi-scan operations.
- **MULTISCAN_BLOCKS_PER_PREPARE**: Tracks the number of blocks processed per multi-scan prepare operation.
- **Wasted Prefetch Blocks Count**: Counts the number of prefetched blocks that were not used (i.e., wasted) if the iterator is abandoned before accessing them.
- **MULTISCAN_TOTAL_BLOCKS_SCANNED**: Tracks the total number of blocks scanned during all multi-scan operations.
- **MULTISCAN_TOTAL_KEYS_SCANNED**: Measures the total number of keys scanned across all multi-scan operations.
- **MULTISCAN_TOTAL_MICROS**: Captures the total time (in microseconds) spent in multi-scan operations.
- **MULTISCAN_PREFETCHED_BLOCKS**: Counts the number of blocks that were prefetched during multi-scan operations.
- **MULTISCAN_USED_PREFETCH_BLOCKS**: Tracks the number of prefetched blocks that were actually used during multi-scan operations.

### Impact

This diff provides more fine-grained statistics for multi-scan operations, allowing developers and users to better understand and optimize the performance of their RocksDB instances.

Reviewed By: krhancoc

Differential Revision: D91053297

fbshipit-source-id: 7158741b9f026c0b5ce8ba1264dbd137e7fe985d
---
 db/db_iterator_test.cc                        | 146 ++++++++++++++++++
 include/rocksdb/statistics.h                  |  26 ++++
 java/rocksjni/portal.h                        |  44 ++++++
 .../main/java/org/rocksdb/HistogramType.java  |  14 ++
 .../src/main/java/org/rocksdb/TickerType.java |  49 ++++++
 monitoring/statistics.cc                      |  13 ++
 monitoring/stats_history_test.cc              |   4 +-
 .../block_based/block_based_table_iterator.cc |  65 +++++++-
 .../block_based/block_based_table_iterator.h  |  14 +-
 9 files changed, 367 insertions(+), 8 deletions(-)

diff --git a/db/db_iterator_test.cc b/db/db_iterator_test.cc
index 20d9d9fa2d59..48421e5b6dfb 100644
--- a/db/db_iterator_test.cc
+++ b/db/db_iterator_test.cc
@@ -5027,6 +5027,152 @@ TEST_P(DBMultiScanIteratorTest, AsyncPrefetchWithExternalFileIngestion) {
   ASSERT_EQ(total_keys, 400);
   iter.reset();
 }
+
+TEST_P(DBMultiScanIteratorTest, StatisticsTest) {
+  // Test that multi scan statistics are properly recorded
+  auto options = CurrentOptions();
+  options.statistics = CreateDBStatistics();
+  // Use small block size to ensure multiple blocks
+  BlockBasedTableOptions table_options;
+  table_options.block_size = 256;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  DestroyAndReopen(options);
+
+  // Create data across multiple blocks
+  for (int i = 0; i < 100; ++i) {
+    std::stringstream ss;
+    ss << std::setw(3) << std::setfill('0') << i;
+    // Use larger values to ensure multiple blocks
+    ASSERT_OK(Put("k" + ss.str(), std::string(100, 'v')));
+  }
+  ASSERT_OK(Flush());
+
+  // Reset stats before multi scan
+  ASSERT_OK(options.statistics->Reset());
+
+  // Set up two scan ranges
+  std::vector<std::string> key_ranges({"k010", "k030", "k060", "k080"});
+  ReadOptions ro;
+  ro.fill_cache = GetParam();
+  MultiScanArgs scan_options(BytewiseComparator());
+  scan_options.insert(key_ranges[0], key_ranges[1]);
+  scan_options.insert(key_ranges[2], key_ranges[3]);
+
+  ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily();
+  std::unique_ptr<MultiScan> iter =
+      dbfull()->NewMultiScan(ro, cfh, scan_options);
+
+  // Iterate through all ranges
+  int count = 0;
+  try {
+    for (auto range : *iter) {
+      for (auto it : range) {
+        (void)it;
+        count++;
+      }
+    }
+  } catch (MultiScanException& ex) {
+    ASSERT_NOK(ex.status());
+    std::cerr << "Iterator returned status " << ex.what();
+    abort();
+  }
+  ASSERT_EQ(count, 40);  // 20 keys per range
+  iter.reset();
+
+  // Check statistics
+  // MULTISCAN_PREPARE_CALLS should be at least 1
+  ASSERT_GE(TestGetTickerCount(options, MULTISCAN_PREPARE_CALLS), 1);
+
+  // MULTISCAN_PREPARE_ERRORS should be 0
+  ASSERT_EQ(TestGetTickerCount(options, MULTISCAN_PREPARE_ERRORS), 0);
+
+  // MULTISCAN_SEEK_ERRORS should be 0
+  ASSERT_EQ(TestGetTickerCount(options, MULTISCAN_SEEK_ERRORS), 0);
+
+  // Blocks should be prefetched or from cache
+  uint64_t blocks_prefetched =
+      TestGetTickerCount(options, MULTISCAN_BLOCKS_PREFETCHED);
+  uint64_t blocks_from_cache =
+      TestGetTickerCount(options, MULTISCAN_BLOCKS_FROM_CACHE);
+  ASSERT_GT(blocks_prefetched + blocks_from_cache, 0);
+
+  // If blocks were prefetched, prefetch bytes and IO requests should be > 0
+  if (blocks_prefetched > 0) {
+    ASSERT_GT(TestGetTickerCount(options, MULTISCAN_PREFETCH_BYTES), 0);
+    uint64_t io_requests = TestGetTickerCount(options, MULTISCAN_IO_REQUESTS);
+    ASSERT_GT(io_requests, 0);
+    ASSERT_LE(io_requests, blocks_prefetched);
+  }
+
+  // Wasted blocks should be 0 since we iterated through everything
+  ASSERT_EQ(TestGetTickerCount(options, MULTISCAN_PREFETCH_BLOCKS_WASTED), 0);
+}
+
+TEST_P(DBMultiScanIteratorTest, StatisticsWastedBlocksTest) {
+  // Test that wasted blocks are tracked when iteration is abandoned early
+  auto options = CurrentOptions();
+  options.statistics = CreateDBStatistics();
+  // Use small block size to ensure multiple blocks
+  BlockBasedTableOptions table_options;
+  table_options.block_size = 256;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  DestroyAndReopen(options);
+
+  // Create data across multiple blocks
+  for (int i = 0; i < 100; ++i) {
+    std::stringstream ss;
+    ss << std::setw(3) << std::setfill('0') << i;
+    ASSERT_OK(Put("k" + ss.str(), std::string(100, 'v')));
+  }
+  ASSERT_OK(Flush());
+
+  // Reset stats before multi scan
+  ASSERT_OK(options.statistics->Reset());
+
+  // Set up a large scan range
+  ReadOptions ro;
+  ro.fill_cache = GetParam();
+  MultiScanArgs scan_options(BytewiseComparator());
+  scan_options.insert("k000", "k099");
+
+  ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily();
+  std::unique_ptr<MultiScan> iter =
+      dbfull()->NewMultiScan(ro, cfh, scan_options);
+
+  // Only iterate through a few keys, then abandon
+  int count = 0;
+  try {
+    for (auto range : *iter) {
+      for (auto it : range) {
+        (void)it;
+        count++;
+        if (count >= 5) {
+          break;  // Abandon iteration early
+        }
+      }
+      if (count >= 5) {
+        break;
+      }
+    }
+  } catch (MultiScanException& ex) {
+    ASSERT_NOK(ex.status());
+    std::cerr << "Iterator returned status " << ex.what();
+    abort();
+  }
+  ASSERT_EQ(count, 5);
+
+  // Destroy iterator to trigger wasted blocks counting
+  iter.reset();
+
+  uint64_t blocks_prefetched =
+      TestGetTickerCount(options, MULTISCAN_BLOCKS_PREFETCHED);
+
+  // If blocks were prefetched, some should be wasted since we abandoned early
+  if (blocks_prefetched > 1) {
+    // We only read a few keys, so there should be wasted blocks
+    ASSERT_GT(TestGetTickerCount(options, MULTISCAN_PREFETCH_BLOCKS_WASTED), 0);
+  }
+}
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/include/rocksdb/statistics.h b/include/rocksdb/statistics.h
index 6438ff70556b..fb75ebee3fca 100644
--- a/include/rocksdb/statistics.h
+++ b/include/rocksdb/statistics.h
@@ -552,6 +552,27 @@ enum Tickers : uint32_t {
   // Failure to load the UDI during SST table open
   SST_USER_DEFINED_INDEX_LOAD_FAIL_COUNT,
 
+  // MultiScan statistics
+  // # of Prepare() calls
+  MULTISCAN_PREPARE_CALLS,
+  // # of Prepare() calls that failed
+  MULTISCAN_PREPARE_ERRORS,
+  // # of data blocks prefetched from storage during MultiScan
+  MULTISCAN_BLOCKS_PREFETCHED,
+  // # of blocks found already in cache during MultiScan Prepare
+  MULTISCAN_BLOCKS_FROM_CACHE,
+  // Total bytes prefetched during MultiScan
+  MULTISCAN_PREFETCH_BYTES,
+  // # of prefetched blocks that were never accessed
+  MULTISCAN_PREFETCH_BLOCKS_WASTED,
+  // # of actual I/O requests issued during MultiScan
+  MULTISCAN_IO_REQUESTS,
+  // # of non-adjacent blocks coalesced into single I/O (within
+  // io_coalesce_threshold)
+  MULTISCAN_IO_COALESCED_NONADJACENT,
+  // # of seeks that failed validation (out of order, etc.)
+  MULTISCAN_SEEK_ERRORS,
+
   TICKER_ENUM_MAX
 };
 
@@ -695,6 +716,11 @@ enum Histograms : uint32_t {
   // MultiScan Prefill iterator Prepare cost
   MULTISCAN_PREPARE_ITERATORS,
 
+  // Total Prepare() latency for MultiScan
+  MULTISCAN_PREPARE_MICROS,
+  // Distribution of blocks prefetched per MultiScan Prepare()
+  MULTISCAN_BLOCKS_PER_PREPARE,
+
   HISTOGRAM_ENUM_MAX
 };
 
diff --git a/java/rocksjni/portal.h b/java/rocksjni/portal.h
index 094ac379b174..b19f473cc388 100644
--- a/java/rocksjni/portal.h
+++ b/java/rocksjni/portal.h
@@ -5289,6 +5289,24 @@ class TickerTypeJni {
         return -0x5D;
       case ROCKSDB_NAMESPACE::Tickers::SST_USER_DEFINED_INDEX_LOAD_FAIL_COUNT:
         return -0x5E;
+      case ROCKSDB_NAMESPACE::Tickers::MULTISCAN_PREPARE_CALLS:
+        return -0x60;
+      case ROCKSDB_NAMESPACE::Tickers::MULTISCAN_PREPARE_ERRORS:
+        return -0x61;
+      case ROCKSDB_NAMESPACE::Tickers::MULTISCAN_BLOCKS_PREFETCHED:
+        return -0x62;
+      case ROCKSDB_NAMESPACE::Tickers::MULTISCAN_BLOCKS_FROM_CACHE:
+        return -0x63;
+      case ROCKSDB_NAMESPACE::Tickers::MULTISCAN_PREFETCH_BYTES:
+        return -0x64;
+      case ROCKSDB_NAMESPACE::Tickers::MULTISCAN_PREFETCH_BLOCKS_WASTED:
+        return -0x65;
+      case ROCKSDB_NAMESPACE::Tickers::MULTISCAN_IO_REQUESTS:
+        return -0x66;
+      case ROCKSDB_NAMESPACE::Tickers::MULTISCAN_IO_COALESCED_NONADJACENT:
+        return -0x67;
+      case ROCKSDB_NAMESPACE::Tickers::MULTISCAN_SEEK_ERRORS:
+        return -0x68;
       case ROCKSDB_NAMESPACE::Tickers::TICKER_ENUM_MAX:
         // -0x54 is the max value at this time. Since these values are exposed
         // directly to Java clients, we'll keep the value the same till the next
@@ -5768,6 +5786,24 @@ class TickerTypeJni {
       case -0x5E:
         return ROCKSDB_NAMESPACE::Tickers::
             SST_USER_DEFINED_INDEX_LOAD_FAIL_COUNT;
+      case -0x60:
+        return ROCKSDB_NAMESPACE::Tickers::MULTISCAN_PREPARE_CALLS;
+      case -0x61:
+        return ROCKSDB_NAMESPACE::Tickers::MULTISCAN_PREPARE_ERRORS;
+      case -0x62:
+        return ROCKSDB_NAMESPACE::Tickers::MULTISCAN_BLOCKS_PREFETCHED;
+      case -0x63:
+        return ROCKSDB_NAMESPACE::Tickers::MULTISCAN_BLOCKS_FROM_CACHE;
+      case -0x64:
+        return ROCKSDB_NAMESPACE::Tickers::MULTISCAN_PREFETCH_BYTES;
+      case -0x65:
+        return ROCKSDB_NAMESPACE::Tickers::MULTISCAN_PREFETCH_BLOCKS_WASTED;
+      case -0x66:
+        return ROCKSDB_NAMESPACE::Tickers::MULTISCAN_IO_REQUESTS;
+      case -0x67:
+        return ROCKSDB_NAMESPACE::Tickers::MULTISCAN_IO_COALESCED_NONADJACENT;
+      case -0x68:
+        return ROCKSDB_NAMESPACE::Tickers::MULTISCAN_SEEK_ERRORS;
       case -0x54:
         // -0x54 is the max value at this time. Since these values are exposed
         // directly to Java clients, we'll keep the value the same till the next
@@ -5924,6 +5960,10 @@ class HistogramTypeJni {
         return 0x3D;
       case ROCKSDB_NAMESPACE::Histograms::COMPACTION_PREFETCH_BYTES:
         return 0x3F;
+      case ROCKSDB_NAMESPACE::Histograms::MULTISCAN_PREPARE_MICROS:
+        return 0x40;
+      case ROCKSDB_NAMESPACE::Histograms::MULTISCAN_BLOCKS_PER_PREPARE:
+        return 0x41;
       case ROCKSDB_NAMESPACE::Histograms::HISTOGRAM_ENUM_MAX:
         // 0x3E is reserved for backwards compatibility on current minor
         // version.
@@ -6071,6 +6111,10 @@ class HistogramTypeJni {
             TABLE_OPEN_PREFETCH_TAIL_READ_BYTES;
       case 0x3F:
         return ROCKSDB_NAMESPACE::Histograms::COMPACTION_PREFETCH_BYTES;
+      case 0x40:
+        return ROCKSDB_NAMESPACE::Histograms::MULTISCAN_PREPARE_MICROS;
+      case 0x41:
+        return ROCKSDB_NAMESPACE::Histograms::MULTISCAN_BLOCKS_PER_PREPARE;
       case 0x3E:
         // 0x3E is reserved for backwards compatibility on current minor
         // version.
diff --git a/java/src/main/java/org/rocksdb/HistogramType.java b/java/src/main/java/org/rocksdb/HistogramType.java
index 3825c90a4515..b4a56cc07e0d 100644
--- a/java/src/main/java/org/rocksdb/HistogramType.java
+++ b/java/src/main/java/org/rocksdb/HistogramType.java
@@ -212,6 +212,20 @@ public enum HistogramType {
 
   COMPACTION_PREFETCH_BYTES((byte) 0x3F),
 
+  /**
+   * MultiScan histogram statistics
+   */
+
+  /**
+   * Time spent in Iterator::Prepare() for multi-scan (microseconds)
+   */
+  MULTISCAN_PREPARE_MICROS((byte) 0x40),
+
+  /**
+   * Number of blocks per multi-scan Prepare() call
+   */
+  MULTISCAN_BLOCKS_PER_PREPARE((byte) 0x41),
+
   // 0x3E is reserved for backwards compatibility on current minor version.
   HISTOGRAM_ENUM_MAX((byte) 0x3E);
 
diff --git a/java/src/main/java/org/rocksdb/TickerType.java b/java/src/main/java/org/rocksdb/TickerType.java
index e5fb81a138ba..bf1c73a129fb 100644
--- a/java/src/main/java/org/rocksdb/TickerType.java
+++ b/java/src/main/java/org/rocksdb/TickerType.java
@@ -906,6 +906,55 @@ public enum TickerType {
      */
     REMOTE_COMPACT_RESUMED_BYTES((byte) -0x5F),
 
+    /**
+     * MultiScan statistics
+     */
+
+    /**
+     * # of calls to Iterator::Prepare() for multi-scan
+     */
+    MULTISCAN_PREPARE_CALLS((byte) -0x60),
+
+    /**
+     * # of errors during Iterator::Prepare() for multi-scan
+     */
+    MULTISCAN_PREPARE_ERRORS((byte) -0x61),
+
+    /**
+     * # of data blocks prefetched during multi-scan Prepare()
+     */
+    MULTISCAN_BLOCKS_PREFETCHED((byte) -0x62),
+
+    /**
+     * # of data blocks found in cache during multi-scan Prepare()
+     */
+    MULTISCAN_BLOCKS_FROM_CACHE((byte) -0x63),
+
+    /**
+     * Total bytes prefetched during multi-scan Prepare()
+     */
+    MULTISCAN_PREFETCH_BYTES((byte) -0x64),
+
+    /**
+     * # of prefetched blocks that were never accessed (wasted)
+     */
+    MULTISCAN_PREFETCH_BLOCKS_WASTED((byte) -0x65),
+
+    /**
+     * # of I/O requests issued during multi-scan Prepare()
+     */
+    MULTISCAN_IO_REQUESTS((byte) -0x66),
+
+    /**
+     * # of non-adjacent blocks coalesced into single I/O request
+     */
+    MULTISCAN_IO_COALESCED_NONADJACENT((byte) -0x67),
+
+    /**
+     * # of seek errors during multi-scan iteration
+     */
+    MULTISCAN_SEEK_ERRORS((byte) -0x68),
+
     TICKER_ENUM_MAX((byte) -0x54);
 
     private final byte value;
diff --git a/monitoring/statistics.cc b/monitoring/statistics.cc
index 28d4278b2197..01b123d195e8 100644
--- a/monitoring/statistics.cc
+++ b/monitoring/statistics.cc
@@ -280,6 +280,17 @@ const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
     {NUMBER_WBWI_INGEST, "rocksdb.number.wbwi.ingest"},
     {SST_USER_DEFINED_INDEX_LOAD_FAIL_COUNT,
      "rocksdb.sst.user.defined.index.load.fail.count"},
+    {MULTISCAN_PREPARE_CALLS, "rocksdb.multiscan.prepare.calls"},
+    {MULTISCAN_PREPARE_ERRORS, "rocksdb.multiscan.prepare.errors"},
+    {MULTISCAN_BLOCKS_PREFETCHED, "rocksdb.multiscan.blocks.prefetched"},
+    {MULTISCAN_BLOCKS_FROM_CACHE, "rocksdb.multiscan.blocks.from.cache"},
+    {MULTISCAN_PREFETCH_BYTES, "rocksdb.multiscan.prefetch.bytes"},
+    {MULTISCAN_PREFETCH_BLOCKS_WASTED,
+     "rocksdb.multiscan.prefetch.blocks.wasted"},
+    {MULTISCAN_IO_REQUESTS, "rocksdb.multiscan.io.requests"},
+    {MULTISCAN_IO_COALESCED_NONADJACENT,
+     "rocksdb.multiscan.io.coalesced.nonadjacent"},
+    {MULTISCAN_SEEK_ERRORS, "rocksdb.multiscan.seek.errors"},
 };
 
 const std::vector<std::pair<Histograms, std::string>> HistogramsNameMap = {
@@ -354,6 +365,8 @@ const std::vector<std::pair<Histograms, std::string>> HistogramsNameMap = {
     {NUM_OP_PER_TRANSACTION, "rocksdb.num.op.per.transaction"},
     {MULTISCAN_PREPARE_ITERATORS,
      "rocksdb.multiscan.op.prepare.iterators.micros"},
+    {MULTISCAN_PREPARE_MICROS, "rocksdb.multiscan.prepare.micros"},
+    {MULTISCAN_BLOCKS_PER_PREPARE, "rocksdb.multiscan.blocks.per.prepare"},
 };
 
 std::shared_ptr<Statistics> CreateDBStatistics() {
diff --git a/monitoring/stats_history_test.cc b/monitoring/stats_history_test.cc
index fab5914742aa..ee29bd20921a 100644
--- a/monitoring/stats_history_test.cc
+++ b/monitoring/stats_history_test.cc
@@ -185,7 +185,7 @@ TEST_F(StatsHistoryTest, GetStatsHistoryInMemory) {
 
 TEST_F(StatsHistoryTest, InMemoryStatsHistoryPurging) {
   constexpr int kPeriodSec = 1;
-  constexpr int kEstimatedOneSliceSize = 16000;
+  constexpr int kEstimatedOneSliceSize = 22000;
 
   Options options;
   options.create_if_missing = true;
@@ -277,7 +277,7 @@ TEST_F(StatsHistoryTest, InMemoryStatsHistoryPurging) {
   // If `slice_count == 0` when new statistics are added, consider increasing
   // `kEstimatedOneSliceSize`
   ASSERT_EQ(slice_count, 1);
-  ASSERT_TRUE(stats_history_size_reopen < 16000 &&
+  ASSERT_TRUE(stats_history_size_reopen < kEstimatedOneSliceSize &&
               stats_history_size_reopen > 0);
   ASSERT_TRUE(stats_count_reopen < stats_count && stats_count_reopen > 0);
   Close();
diff --git a/table/block_based/block_based_table_iterator.cc b/table/block_based/block_based_table_iterator.cc
index e822eb8af13e..00c1d2102fc3 100644
--- a/table/block_based/block_based_table_iterator.cc
+++ b/table/block_based/block_based_table_iterator.cc
@@ -920,6 +920,21 @@ void BlockBasedTableIterator::BlockCacheLookupForReadAheadSize(
 }
 
 BlockBasedTableIterator::MultiScanState::~MultiScanState() {
+  // Count remaining non-empty blocks as wasted (iterator abandoned before
+  // accessing them). Start from cur_data_block_idx since blocks before that
+  // have already been processed and counted if skipped.
+  for (size_t i = cur_data_block_idx; i < pinned_data_blocks.size(); ++i) {
+    if (!pinned_data_blocks[i].IsEmpty()) {
+      ++wasted_blocks_count;
+    }
+  }
+
+  // Record wasted blocks stat
+  if (wasted_blocks_count > 0 && statistics != nullptr) {
+    RecordTick(statistics, MULTISCAN_PREFETCH_BLOCKS_WASTED,
+               wasted_blocks_count);
+  }
+
   // Abort any pending async IO operations to prevent callback being called
   // after async read states are destructed.
   if (!async_states.empty()) {
@@ -978,13 +993,19 @@ BlockBasedTableIterator::MultiScanState::~MultiScanState() {
 // moving forward.
 void BlockBasedTableIterator::Prepare(const MultiScanArgs* multiscan_opts) {
   assert(!multi_scan_);
+  RecordTick(table_->GetStatistics(), MULTISCAN_PREPARE_CALLS);
+  StopWatch sw(table_->get_rep()->ioptions.clock, table_->GetStatistics(),
+               MULTISCAN_PREPARE_MICROS);
+
   if (!index_iter_->status().ok()) {
     multi_scan_status_ = index_iter_->status();
+    RecordTick(table_->GetStatistics(), MULTISCAN_PREPARE_ERRORS);
     return;
   }
   if (multi_scan_) {
     multi_scan_.reset();
     multi_scan_status_ = Status::InvalidArgument("Prepare already called");
+    RecordTick(table_->GetStatistics(), MULTISCAN_PREPARE_ERRORS);
     return;
   }
 
@@ -998,6 +1019,7 @@ void BlockBasedTableIterator::Prepare(const MultiScanArgs* multiscan_opts) {
       CollectBlockHandles(scan_opts, &scan_block_handles,
                           &block_index_ranges_per_scan, &data_block_separators);
   if (!multi_scan_status_.ok()) {
+    RecordTick(table_->GetStatistics(), MULTISCAN_PREPARE_ERRORS);
     return;
   }
 
@@ -1010,23 +1032,44 @@ void BlockBasedTableIterator::Prepare(const MultiScanArgs* multiscan_opts) {
       scan_block_handles, multiscan_opts, &block_indices_to_read,
       &pinned_data_blocks_guard, &prefetched_max_idx);
   if (!multi_scan_status_.ok()) {
+    RecordTick(table_->GetStatistics(), MULTISCAN_PREPARE_ERRORS);
     return;
   }
 
+  // Record cache hit/miss stats
+  size_t blocks_from_cache =
+      scan_block_handles.size() - block_indices_to_read.size();
+  RecordTick(table_->GetStatistics(), MULTISCAN_BLOCKS_FROM_CACHE,
+             blocks_from_cache);
+  RecordTick(table_->GetStatistics(), MULTISCAN_BLOCKS_PREFETCHED,
+             block_indices_to_read.size());
+
   std::vector<AsyncReadState> async_states;
   // Maps from block index into async read request (index into async_states[])
   UnorderedMap<size_t, size_t> block_idx_to_readreq_idx;
   if (!block_indices_to_read.empty()) {
     std::vector<FSReadRequest> read_reqs;
     std::vector<std::vector<size_t>> coalesced_block_indices;
+    size_t nonadjacent_coalesced = 0;
+    uint64_t total_prefetch_bytes = 0;
     PrepareIORequests(block_indices_to_read, scan_block_handles, multiscan_opts,
                       &read_reqs, &block_idx_to_readreq_idx,
-                      &coalesced_block_indices);
+                      &coalesced_block_indices, &nonadjacent_coalesced,
+                      &total_prefetch_bytes);
+
+    // Record I/O stats
+    RecordTick(table_->GetStatistics(), MULTISCAN_IO_REQUESTS,
+               read_reqs.size());
+    RecordTick(table_->GetStatistics(), MULTISCAN_PREFETCH_BYTES,
+               total_prefetch_bytes);
+    RecordTick(table_->GetStatistics(), MULTISCAN_IO_COALESCED_NONADJACENT,
+               nonadjacent_coalesced);
 
     multi_scan_status_ =
         ExecuteIO(scan_block_handles, multiscan_opts, coalesced_block_indices,
                   &read_reqs, &async_states, &pinned_data_blocks_guard);
     if (!multi_scan_status_.ok()) {
+      RecordTick(table_->GetStatistics(), MULTISCAN_PREPARE_ERRORS);
       return;
     }
   }
@@ -1038,7 +1081,11 @@ void BlockBasedTableIterator::Prepare(const MultiScanArgs* multiscan_opts) {
       std::move(pinned_data_blocks_guard), std::move(data_block_separators),
       std::move(block_index_ranges_per_scan),
       std::move(block_idx_to_readreq_idx), std::move(async_states),
-      prefetched_max_idx);
+      prefetched_max_idx, table_->GetStatistics());
+
+  // Record histogram for blocks per prepare
+  RecordInHistogram(table_->GetStatistics(), MULTISCAN_BLOCKS_PER_PREPARE,
+                    scan_block_handles.size());
 
   is_index_at_curr_block_ = false;
   block_iter_points_to_real_block_ = false;
@@ -1056,6 +1103,7 @@ void BlockBasedTableIterator::SeekMultiScan(const Slice* seek_target) {
   if (!seek_target) {
     // start key must be set for multi-scan
     multi_scan_status_ = Status::InvalidArgument("No seek key for MultiScan");
+    RecordTick(table_->GetStatistics(), MULTISCAN_SEEK_ERRORS);
     return;
   }
 
@@ -1161,6 +1209,7 @@ void BlockBasedTableIterator::SeekMultiScan(const Slice* seek_target) {
         multi_scan_status_ = Status::InvalidArgument(
             "Seek target is before the previous prepared range at index " +
             std::to_string(multi_scan_->next_scan_idx));
+        RecordTick(table_->GetStatistics(), MULTISCAN_SEEK_ERRORS);
         return;
       }
       // It should only be possible to seek a key between the start of current
@@ -1248,6 +1297,7 @@ void BlockBasedTableIterator::MultiScanUnexpectedSeekTarget(
        unpin_block_idx < cur_scan_start_idx; unpin_block_idx++) {
     if (!multi_scan_->pinned_data_blocks[unpin_block_idx].IsEmpty()) {
       multi_scan_->pinned_data_blocks[unpin_block_idx].Reset();
+      ++multi_scan_->wasted_blocks_count;
     }
   }
 
@@ -1263,6 +1313,7 @@ void BlockBasedTableIterator::MultiScanUnexpectedSeekTarget(
     // Unpin the blocks that are passed
     if (!multi_scan_->pinned_data_blocks[block_idx].IsEmpty()) {
       multi_scan_->pinned_data_blocks[block_idx].Reset();
+      ++multi_scan_->wasted_blocks_count;
     }
     block_idx++;
   }
@@ -1303,6 +1354,7 @@ void BlockBasedTableIterator::MultiScanSeekTargetFromBlock(
     if (!multi_scan_->pinned_data_blocks[multi_scan_->cur_data_block_idx]
              .IsEmpty()) {
       multi_scan_->pinned_data_blocks[multi_scan_->cur_data_block_idx].Reset();
+      ++multi_scan_->wasted_blocks_count;
     }
     multi_scan_->cur_data_block_idx++;
   }
@@ -1578,9 +1630,12 @@ void BlockBasedTableIterator::PrepareIORequests(
     const std::vector<BlockHandle>& scan_block_handles,
     const MultiScanArgs* multiscan_opts, std::vector<FSReadRequest>* read_reqs,
     UnorderedMap<size_t, size_t>* block_idx_to_readreq_idx,
-    std::vector<std::vector<size_t>>* coalesced_block_indices) {
+    std::vector<std::vector<size_t>>* coalesced_block_indices,
+    size_t* nonadjacent_coalesced_count, uint64_t* total_prefetch_bytes) {
   assert(coalesced_block_indices->empty());
   coalesced_block_indices->resize(1);
+  *nonadjacent_coalesced_count = 0;
+  *total_prefetch_bytes = 0;
 
   for (const auto& block_idx : block_indices_to_read) {
     if (!coalesced_block_indices->back().empty()) {
@@ -1596,6 +1651,9 @@ void BlockBasedTableIterator::PrepareIORequests(
           last_block_end + multiscan_opts->io_coalesce_threshold) {
         // new IO
         coalesced_block_indices->emplace_back();
+      } else if (current_start > last_block_end) {
+        // Non-adjacent but within threshold, so coalesced
+        ++(*nonadjacent_coalesced_count);
       }
     }
     coalesced_block_indices->back().emplace_back(block_idx);
@@ -1648,6 +1706,7 @@ void BlockBasedTableIterator::PrepareIORequests(
     read_reqs->emplace_back();
     read_reqs->back().offset = start_offset;
     read_reqs->back().len = end_offset - start_offset;
+    *total_prefetch_bytes += read_reqs->back().len;
 
     if (multiscan_opts->use_async_io) {
       for (const auto& block_idx : block_indices) {
diff --git a/table/block_based/block_based_table_iterator.h b/table/block_based/block_based_table_iterator.h
index a12570d0e78c..de329351c67d 100644
--- a/table/block_based/block_based_table_iterator.h
+++ b/table/block_based/block_based_table_iterator.h
@@ -491,13 +491,18 @@ class BlockBasedTableIterator : public InternalIteratorBase<Slice> {
     UnorderedMap<size_t, size_t> block_idx_to_readreq_idx;
     size_t prefetch_max_idx;
 
+    // For tracking wasted prefetch blocks
+    Statistics* statistics;
+    size_t wasted_blocks_count;
+
     MultiScanState(
         const std::shared_ptr<FileSystem>& _fs, const MultiScanArgs* _scan_opts,
         std::vector<CachableEntry<Block>>&& _pinned_data_blocks,
         std::vector<std::string>&& _data_block_separators,
         std::vector<std::tuple<size_t, size_t>>&& _block_index_ranges_per_scan,
         UnorderedMap<size_t, size_t>&& _block_idx_to_readreq_idx,
-        std::vector<AsyncReadState>&& _async_states, size_t _prefetch_max_idx)
+        std::vector<AsyncReadState>&& _async_states, size_t _prefetch_max_idx,
+        Statistics* _statistics)
         : fs(_fs),
           scan_opts(_scan_opts),
           pinned_data_blocks(std::move(_pinned_data_blocks)),
@@ -507,7 +512,9 @@ class BlockBasedTableIterator : public InternalIteratorBase<Slice> {
           cur_data_block_idx(0),
           async_states(std::move(_async_states)),
           block_idx_to_readreq_idx(std::move(_block_idx_to_readreq_idx)),
-          prefetch_max_idx(_prefetch_max_idx) {}
+          prefetch_max_idx(_prefetch_max_idx),
+          statistics(_statistics),
+          wasted_blocks_count(0) {}
 
     ~MultiScanState();
   };
@@ -728,7 +735,8 @@ class BlockBasedTableIterator : public InternalIteratorBase<Slice> {
       const MultiScanArgs* multiscan_opts,
       std::vector<FSReadRequest>* read_reqs,
       UnorderedMap<size_t, size_t>* block_idx_to_readreq_idx,
-      std::vector<std::vector<size_t>>* coalesced_block_indices);
+      std::vector<std::vector<size_t>>* coalesced_block_indices,
+      size_t* nonadjacent_coalesced_count, uint64_t* total_prefetch_bytes);
 
   Status ExecuteIO(
       const std::vector<BlockHandle>& scan_block_handles,

From a9906f0dd0603de36a5f7ca2deead8c234572e92 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Thu, 22 Jan 2026 11:48:06 -0800
Subject: [PATCH 434/500] A better approach to clearing DBs for crash test
 (#14254)

Summary:
Clearing DB dir for crash test is currently a hodgepodge of
1. Caller of db_crashtest.py maybe tries to clear the dir
2. db_crashtest.py tries to clear the dir in get_dbname() (but ignoring failure)
3. db_crashtest.py passes --destroy_db_initially to some db_stress calls as needed
4. db_crashtest.py tries to clear the dir between some db_stress calls
5. db_crashtest.py tries to clear the dir after everything is done and successful (no artifacts to investigate or save) (but ignoring failure)
6. Try to add more uniqueness to the directory from https://github.com/facebook/rocksdb/issues/14249

This change reverts or replaces 2, 4, 5, and 6 by doubling-down on (expanding) 3 and a small variant of it:

* crash_test.mk passes --destroy_db_initially=1 so that the first run of db_stress clears the db dir.
* After each db_stress invocation, db_crashtest.py resets destroy_db_initially=0 so that the next invocation reuses the same DB, except in cases where there is an incompatibility that requires a fresh DB (from cases 3 and 4 above).
* On success, uses new `db_stress --destroy_db_and_exit` option to clean up the DB dir without needing a custom cleanup_cmd (now ignored)

Note that although case 1 is likely obsolete, it is out of control of an open source PR.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14254

Test Plan: some manual runs

Reviewed By: xingbowang

Differential Revision: D91164731

Pulled By: pdillinger

fbshipit-source-id: 0a66c8c0e130c9eeacc55af411a18a09bc9debdf
---
 crash_test.mk                         |  2 +-
 db_stress_tool/db_stress_common.cc    | 19 +++++++++
 db_stress_tool/db_stress_common.h     |  6 +++
 db_stress_tool/db_stress_gflags.cc    |  4 ++
 db_stress_tool/db_stress_test_base.cc | 17 +-------
 db_stress_tool/db_stress_tool.cc      | 13 +++++++
 tools/db_crashtest.py                 | 56 +++++++++++++--------------
 7 files changed, 72 insertions(+), 45 deletions(-)

diff --git a/crash_test.mk b/crash_test.mk
index 43cce994a23b..1b9960d581e1 100644
--- a/crash_test.mk
+++ b/crash_test.mk
@@ -8,7 +8,7 @@ DB_STRESS_CMD?=./db_stress
 include common.mk
 
 CRASHTEST_MAKE=$(MAKE) -f crash_test.mk
-CRASHTEST_PY=$(PYTHON) -u tools/db_crashtest.py --stress_cmd=$(DB_STRESS_CMD) --cleanup_cmd='$(DB_CLEANUP_CMD)'
+CRASHTEST_PY=$(PYTHON) -u tools/db_crashtest.py --stress_cmd=$(DB_STRESS_CMD) --cleanup_cmd='$(DB_CLEANUP_CMD)' --destroy_db_initially=1
 
 .PHONY: crash_test crash_test_with_atomic_flush crash_test_with_txn \
 	crash_test_with_wc_txn crash_test_with_wp_txn crash_test_with_wup_txn \
diff --git a/db_stress_tool/db_stress_common.cc b/db_stress_tool/db_stress_common.cc
index ee7fc1cf5edc..c26401352234 100644
--- a/db_stress_tool/db_stress_common.cc
+++ b/db_stress_tool/db_stress_common.cc
@@ -877,5 +877,24 @@ Status DestroyUnverifiedSubdir(const std::string& dirname) {
   return s;
 }
 
+Status DbStressDestroyDb(const std::string& db_path) {
+  Status s;
+  Options options;
+  // NOTE: using db_stress_listener_env in order to see obsolete MANIFEST files
+  options.env = db_stress_listener_env;
+  // Remove DB files in a principled way to avoid issues
+  if (FLAGS_use_blob_db) {
+    s = blob_db::DestroyBlobDB(db_path, options, blob_db::BlobDBOptions());
+  } else {
+    s = DestroyDB(db_path, options);
+  }
+  if (!s.ok()) {
+    return s;
+  }
+  // Remove everything else recursively, only reporting success if able to
+  // delete everything
+  return DestroyDir(db_stress_listener_env, db_path);
+}
+
 }  // namespace ROCKSDB_NAMESPACE
 #endif  // GFLAGS
diff --git a/db_stress_tool/db_stress_common.h b/db_stress_tool/db_stress_common.h
index 619c24e75b40..edf6d918aea3 100644
--- a/db_stress_tool/db_stress_common.h
+++ b/db_stress_tool/db_stress_common.h
@@ -100,6 +100,7 @@ DECLARE_bool(enable_pipelined_write);
 DECLARE_bool(verify_before_write);
 DECLARE_bool(histogram);
 DECLARE_bool(destroy_db_initially);
+DECLARE_bool(destroy_db_and_exit);
 DECLARE_bool(verbose);
 DECLARE_bool(progress_reports);
 DECLARE_uint64(db_write_buffer_size);
@@ -820,5 +821,10 @@ Status SaveFilesInDirectory(const std::string& src_dirname,
                             const std::string& dst_dirname);
 Status DestroyUnverifiedSubdir(const std::string& dirname);
 Status InitUnverifiedSubdir(const std::string& dirname);
+
+// Destroy the DB at the given path under the env configured for db_stress.
+// Handles both regular DB and BlobDB, and cleans and removes the entire dir.
+Status DbStressDestroyDb(const std::string& db_path);
+
 }  // namespace ROCKSDB_NAMESPACE
 #endif  // GFLAGS
diff --git a/db_stress_tool/db_stress_gflags.cc b/db_stress_tool/db_stress_gflags.cc
index e9f7e172bd15..f543a00bcbc8 100644
--- a/db_stress_tool/db_stress_gflags.cc
+++ b/db_stress_tool/db_stress_gflags.cc
@@ -135,6 +135,10 @@ DEFINE_bool(histogram, false, "Print histogram of operation timings");
 DEFINE_bool(destroy_db_initially, true,
             "Destroys the database dir before start if this is true");
 
+DEFINE_bool(destroy_db_and_exit, false,
+            "Destroys the database dir and exits. Useful for cleanup without "
+            "running stress test. Other options are mostly ignored.");
+
 DEFINE_bool(verbose, false, "Verbose");
 
 DEFINE_bool(progress_reports, true,
diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc
index c2c000f506fb..2dc952de2cb7 100644
--- a/db_stress_tool/db_stress_test_base.cc
+++ b/db_stress_tool/db_stress_test_base.cc
@@ -77,22 +77,7 @@ StressTest::StressTest()
       secondary_db_(nullptr),
       is_db_stopped_(false) {
   if (FLAGS_destroy_db_initially) {
-    std::vector<std::string> files;
-    db_stress_env->GetChildren(FLAGS_db, &files);
-    for (unsigned int i = 0; i < files.size(); i++) {
-      if (Slice(files[i]).starts_with("heap-")) {
-        db_stress_env->DeleteFile(FLAGS_db + "/" + files[i]);
-      }
-    }
-
-    Options options;
-    options.env = db_stress_env;
-    // Remove files without preserving manfiest files
-    const Status s = !FLAGS_use_blob_db
-                         ? DestroyDB(FLAGS_db, options)
-                         : blob_db::DestroyBlobDB(FLAGS_db, options,
-                                                  blob_db::BlobDBOptions());
-
+    const Status s = DbStressDestroyDb(FLAGS_db);
     if (!s.ok()) {
       fprintf(stderr, "Cannot destroy original db: %s\n", s.ToString().c_str());
       exit(1);
diff --git a/db_stress_tool/db_stress_tool.cc b/db_stress_tool/db_stress_tool.cc
index f22827e53fee..796a62b800bd 100644
--- a/db_stress_tool/db_stress_tool.cc
+++ b/db_stress_tool/db_stress_tool.cc
@@ -98,6 +98,19 @@ int db_stress_tool(int argc, char** argv) {
       raw_env, std::make_shared<DbStressFSWrapper>(raw_env->GetFileSystem()));
   db_stress_env = env_wrapper_guard.get();
 
+  // Handle --destroy_db_and_exit early, before other option validation
+  if (FLAGS_destroy_db_and_exit) {
+    s = DbStressDestroyDb(FLAGS_db);
+    if (s.ok()) {
+      fprintf(stdout, "Successfully destroyed db at %s\n", FLAGS_db.c_str());
+      return 0;
+    } else {
+      fprintf(stderr, "Failed to destroy db at %s: %s\n", FLAGS_db.c_str(),
+              s.ToString().c_str());
+      return 1;
+    }
+  }
+
   FLAGS_rep_factory = StringToRepFactory(FLAGS_memtablerep.c_str());
 
   // The number of background threads should be at least as much the
diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index b681859ff5f8..6c55f84b6011 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -60,10 +60,10 @@ def early_argument_parsing_before_main():
     global per_iteration_random_seed_override
     per_iteration_random_seed_override = args.per_iteration_random_seed_override
     global is_remote_db
-    # Set is_remote_db if remain_args has a non-empty --env_uri= argument
+    # Set is_remote_db if remain_args has a non-empty --env_uri= or --fs_uri= argument
     for arg in remain_args:
         parts = arg.split("=", 1)
-        if parts[0] == "--env_uri" and len(parts) > 1 and parts[1]:
+        if parts[0] in ["--env_uri", "--fs_uri"] and len(parts) > 1 and parts[1]:
             is_remote_db = True
             break
 
@@ -454,32 +454,20 @@ def apply_random_seed_per_iteration():
 _DEBUG_LEVEL_ENV_VAR = "DEBUG_LEVEL"
 
 stress_cmd = "./db_stress"
-cleanup_cmd = None
 
 
 def is_release_mode():
     return os.environ.get(_DEBUG_LEVEL_ENV_VAR) == "0"
 
 
-# Generate a unique run ID for this script execution. This ensures each run
-# gets a unique database directory when TEST_TMPDIR is set, avoiding issues
-# with parameter changes (like use_put_entity_one_in) between runs.
-run_id = str(random.randint(0, 2**63))
-
-
 def get_dbname(test_name):
     test_dir_name = "rocksdb_crashtest_" + test_name
     test_tmpdir = os.environ.get(_TEST_DIR_ENV_VAR)
     if test_tmpdir is None or test_tmpdir == "":
         dbname = tempfile.mkdtemp(prefix=test_dir_name)
     else:
-        dbname = test_tmpdir + "/" + test_dir_name + "_" + run_id
+        dbname = test_tmpdir + "/" + test_dir_name
         if not is_remote_db:
-            shutil.rmtree(dbname, True)
-            if cleanup_cmd is not None:
-                print("Running DB cleanup command - %s\n" % cleanup_cmd)
-                # Ignore failure
-                os.system(cleanup_cmd)
             os.makedirs(dbname, exist_ok=True)
     return dbname
 
@@ -1387,13 +1375,18 @@ def print_output_and_exit_on_error(stdout, stderr, print_stderr_separately=False
 
 
 def cleanup_after_success(dbname):
-    if not is_remote_db:
-        shutil.rmtree(dbname, True)
-    if cleanup_cmd is not None:
-        print("Running DB cleanup command - %s\n" % cleanup_cmd)
-        ret = os.system(cleanup_cmd)
-        if ret != 0:
-            print("WARNING: DB cleanup returned error %d\n" % ret)
+    # Use db_stress --destroy_db_and_exit, which simplifies remote DB cleanup
+    cleanup_cmd_parts = [stress_cmd, "--destroy_db_and_exit=1", "--db=" + dbname]
+    # Pass through relevant arguments for remote DB access
+    for arg in remain_args:
+        parts = arg.split("=", 1)
+        if parts[0] in ["--env_uri", "--fs_uri"]:
+            cleanup_cmd_parts.append(arg)
+    print("Running DB cleanup command - %s\n" % " ".join(cleanup_cmd_parts))
+    ret = subprocess.call(cleanup_cmd_parts)
+    if ret != 0:
+        print("ERROR: DB cleanup returned error %d\n" % ret)
+        sys.exit(2)
 
 
 # This script runs and kills db_stress multiple times. It checks consistency
@@ -1421,6 +1414,10 @@ def blackbox_crash_main(args, unknown_args):
 
         hit_timeout, retcode, outs, errs = execute_cmd(cmd, cmd_params["interval"])
 
+        # Reset destroy_db_initially after each run (it may have been set by
+        # command line for first run only)
+        cmd_params["destroy_db_initially"] = 0
+
         if not hit_timeout:
             print("Exit Before Killing")
             print_output_and_exit_on_error(outs, errs, args.print_stderr_separately)
@@ -1563,7 +1560,7 @@ def whitebox_crash_main(args, unknown_args):
                 "`compaction_style` is changed in current run so `destroy_db_initially` is set to 1 as a short-term solution to avoid cycling through previous db of different compaction style."
                 + "\n"
             )
-            additional_opts["destroy_db_initially"] = 1
+            cmd_params["destroy_db_initially"] = 1
         prev_compaction_style = cur_compaction_style
 
         cmd = gen_cmd(
@@ -1588,6 +1585,11 @@ def whitebox_crash_main(args, unknown_args):
         hit_timeout, retncode, stdoutdata, stderrdata = execute_cmd(
             cmd, exit_time - time.time() + 900
         )
+
+        # Reset destroy_db_initially after each run (it may have been set by
+        # command line for first run, or set for various reasons for a step)
+        cmd_params["destroy_db_initially"] = 0
+
         msg = "check_mode={}, kill option={}, exitcode={}\n".format(
             check_mode, additional_opts["kill_random_test"], retncode
         )
@@ -1617,7 +1619,8 @@ def whitebox_crash_main(args, unknown_args):
         # First half of the duration, keep doing kill test. For the next half,
         # try different modes.
         if time.time() > half_time:
-            cleanup_after_success(dbname)
+            # Set next iteration to destroy DB (works for remote DB)
+            cmd_params["destroy_db_initially"] = 1
             if expected_values_dir is not None:
                 shutil.rmtree(expected_values_dir, True)
                 os.mkdir(expected_values_dir)
@@ -1633,7 +1636,6 @@ def whitebox_crash_main(args, unknown_args):
 
 def main():
     global stress_cmd
-    global cleanup_cmd
 
     parser = argparse.ArgumentParser(
         description="This script runs and kills \
@@ -1649,7 +1651,7 @@ def main():
     parser.add_argument("--test_multiops_txn", action="store_true")
     parser.add_argument("--stress_cmd")
     parser.add_argument("--test_tiered_storage", action="store_true")
-    parser.add_argument("--cleanup_cmd")
+    parser.add_argument("--cleanup_cmd")  # ignore old option for now
     parser.add_argument("--print_stderr_separately", action="store_true", default=False)
 
     all_params = dict(
@@ -1690,8 +1692,6 @@ def main():
 
     if args.stress_cmd:
         stress_cmd = args.stress_cmd
-    if args.cleanup_cmd:
-        cleanup_cmd = args.cleanup_cmd
     if args.test_type == "blackbox":
         blackbox_crash_main(args, unknown_args)
     if args.test_type == "whitebox":

From 6a79e02ebd64425d6b2ee5cc45732e8898ff8dba Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Fri, 23 Jan 2026 10:01:50 -0800
Subject: [PATCH 435/500] Support pre-defined compression dictionaries (#14253)

Summary:
... in addition to those derived from samples. This could be useful when trade-offs favor an offline trained dictionary that's good for the whole work load, which can involve heavy-weight training, vs. on-the-fly training on samples for each file, which has limitations.

This involves some breaking changes to some deeper parts of the new compression API. I'm not concerned about performance because this doesn't touch the per-block parts of the API, just the per-file parts.

Bonus: change to
CompressionManagerWrapper::FindCompatibleCompressionManager to implement what is likely the preferred behavior.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14253

Test Plan: unit test included

Reviewed By: hx235

Differential Revision: D91082208

Pulled By: pdillinger

fbshipit-source-id: 1442db65e15c9435437204c19787c96f7a40a207
---
 include/rocksdb/advanced_compression.h        | 154 ++++++++----
 .../block_based/block_based_table_builder.cc  |  47 +++-
 test_util/testutil.h                          |   4 +-
 util/auto_tune_compressor.cc                  |  13 +-
 util/auto_tune_compressor.h                   |   6 +-
 util/compression.cc                           |  95 +++++---
 util/compression_test.cc                      | 230 +++++++++++++++++-
 util/simple_mixed_compressor.cc               |   8 +-
 util/simple_mixed_compressor.h                |   4 +-
 9 files changed, 442 insertions(+), 119 deletions(-)

diff --git a/include/rocksdb/advanced_compression.h b/include/rocksdb/advanced_compression.h
index ae707b6479da..a680d870464f 100644
--- a/include/rocksdb/advanced_compression.h
+++ b/include/rocksdb/advanced_compression.h
@@ -11,6 +11,8 @@
 
 #pragma once
 
+#include <variant>
+
 #include "rocksdb/cache.h"
 #include "rocksdb/compression_type.h"
 #include "rocksdb/data_structure.h"
@@ -56,7 +58,64 @@ class Decompressor;
 // because RocksDB is not exception-safe. This could cause undefined behavior
 // including data loss, unreported corruption, deadlocks, and more.
 class Compressor {
- public:
+ public:  // Auxiliary types
+  // No dictionary should be used (for a given block type).
+  struct DictDisabled {};
+
+  // A recommendation for dictionary compression by collecting samples from
+  // blocks. The caller should collect up to `max_sample_bytes` of sample data
+  // and pass it to MaybeCloneSpecialized() to create a specialized compressor.
+  struct DictSampling {
+    // Maximum total bytes of sample data to collect from blocks.
+    // This controls how much data is buffered before dictionary training.
+    size_t max_sample_bytes = 0;
+  };
+
+  // A pre-defined dictionary that is recommended or specified for direct use
+  // with MaybeCloneSpecialized(), without any sampling.
+  struct DictPreDefined {
+    // The owned raw/serialized dictionary bytes. Recommend std::move to
+    // MaybeCloneSpecialized()
+    std::string dict_data;
+  };
+
+  // The result type for GetDictGuidance() - indicates how dictionary
+  // compression should be configured for a given block type.
+  using DictConfig = std::variant<DictDisabled, DictSampling, DictPreDefined>;
+
+  // Sample data collected from blocks for dictionary training.
+  struct DictSamples {
+    // All the sample input blocks stored contiguously
+    std::string sample_data;
+    // The lengths of each of the sample blocks in `sample_data`
+    std::vector<size_t> sample_lens;
+
+    bool empty() const { return sample_data.empty(); }
+    bool Verify() const {
+      size_t total_len = 0;
+      for (auto len : sample_lens) {
+        total_len += len;
+      }
+      return total_len == sample_data.size();
+    }
+  };
+
+  // Arguments for MaybeCloneSpecialized() - provides either samples, a
+  // pre-defined dictionary, or indicates no dictionary should be used.
+  // NOTE: DictPreDefined here is the same type as above, allowing the
+  // pre-defined dictionary from GetDictGuidance() to be passed through.
+  using DictConfigArgs =
+      std::variant<DictDisabled, DictSamples, DictPreDefined>;
+
+  // A WorkingArea is an optional structure (both for callers and
+  // implementations) that can enable optimizing repeated compressions by
+  // reusing working space or thread-local tracking of statistics or trends.
+  // This enables use of ZSTD context, for example.
+  //
+  // EXTENSIBLE or reinterpret_cast-able by custom Compressor implementations
+  struct WorkingArea {};
+
+ public:  // Functions
   Compressor() = default;
   virtual ~Compressor() = default;
 
@@ -69,15 +128,17 @@ class Compressor {
     return id;
   }
 
-  // Returns the max total bytes of for all sampled blocks for creating the data
-  // dictionary, or zero indicating dictionary compression should not be
-  // used/configured. This will typically be called after
-  // CompressionManager::GetCompressor() to see if samples should be accumulated
-  // and passed to MaybeCloneSpecialized().
-  virtual size_t GetMaxSampleSizeIfWantDict(CacheEntryRole block_type) const {
+  // Returns the recommended dictionary configuration for the given block type.
+  // See the comments on DictConfig and variants for details.
+  //
+  // NOTE: This may be called on the "base" Compressor returned by
+  // CompressionManager, which is not yet configured with a dictionary,
+  // or it can be skipped by callers not intending to handle dictionary
+  // compression.
+  virtual DictConfig GetDictGuidance(CacheEntryRole block_type) const {
     // Default implementation: no dictionary
     (void)block_type;
-    return 0;
+    return DictDisabled{};
   }
 
   // Returns the serialized form of the data dictionary associated with this
@@ -94,52 +155,32 @@ class Compressor {
   // needed to implement MaybeCloneSpecialized() in wrapper compressors.
   virtual std::unique_ptr<Compressor> Clone() const = 0;
 
-  // Utility struct for providing sample data for the compression dictionary.
-  // Potentially extensible by callers of Compressor (but not recommended)
-  struct DictSampleArgs {
-    // All the sample input blocks stored contiguously
-    std::string sample_data;
-    // The lengths of each of the sample blocks in `sample_data`
-    std::vector<size_t> sample_lens;
-
-    bool empty() { return sample_data.empty(); }
-    bool Verify() {
-      size_t total_len = 0;
-      for (auto len : sample_lens) {
-        total_len += len;
-      }
-      return total_len == sample_data.size();
-    }
-  };
-
   // Create potential variants of the same Compressor that might be
   // (a) optimized for a particular block type (does not affect correct
   //     decompression), and/or
-  // (b) configured to use a compression dictionary, based on the given
-  //     samples (decompression must provide the dictionary from
-  //     GetSerializedDict())
+  // (b) configured to use a compression dictionary based on the provided
+  //     configuration (samples or pre-defined dictionary). See the comments on
+  //     DictConfigArgs and its variants for detail.
+  //
   // Return of nullptr indicates no specialization exists or was attempted
-  // and the caller is best to use the current Compressor for the desired
-  // scenario. Using CacheEntryRole:kMisc for block_type generally means
-  // "unspecified", and both parameters are merely suggestions. The exact
-  // dictionary associated with a returned compressor must be read from
-  // GetSerializedDict().
+  // and the caller should use the current Compressor for the desired scenario.
+  // Using CacheEntryRole::kMisc for block_type generally means "unspecified".
+  //
+  // The exact dictionary associated with a returned compressor must be read
+  // from GetSerializedDict().
   virtual std::unique_ptr<Compressor> MaybeCloneSpecialized(
-      CacheEntryRole block_type, DictSampleArgs&& dict_samples) const {
+      CacheEntryRole block_type, DictConfigArgs&& dict_config) const {
     // Default implementation: no specialization
     (void)block_type;
-    (void)dict_samples;
-    // Caller should have checked GetMaxSampleSizeIfWantDict before attempting
-    // to provide dictionary samples
-    assert(dict_samples.empty());
+    (void)dict_config;
     return nullptr;
   }
 
   // A convenience function when a clone is needed and may or may not be
   // specialized.
   std::unique_ptr<Compressor> CloneMaybeSpecialized(
-      CacheEntryRole block_type, DictSampleArgs&& dict_samples) const {
-    auto clone = MaybeCloneSpecialized(block_type, std::move(dict_samples));
+      CacheEntryRole block_type, DictConfigArgs&& dict_config) const {
+    auto clone = MaybeCloneSpecialized(block_type, std::move(dict_config));
     if (clone == nullptr) {
       clone = Clone();
       assert(clone != nullptr);
@@ -147,14 +188,6 @@ class Compressor {
     return clone;
   }
 
-  // A WorkingArea is an optional structure (both for callers and
-  // implementations) that can enable optimizing repeated compressions by
-  // reusing working space or thread-local tracking of statistics or trends.
-  // This enables use of ZSTD context, for example.
-  //
-  // EXTENSIBLE or reinterpret_cast-able by custom Compressor implementations
-  struct WorkingArea {};
-
   // To allow for flexible re-use / reclaimation, we have explicit Get and
   // Release functions, and usually wrap in a special RAII smart pointer.
   // For example, a WorkingArea could be saved/recycled in thread-local or
@@ -423,6 +456,12 @@ class CompressionManager
   // which is valid at the discretion of the CompressionManager. Returning
   // nullptr should normally be the result if preferred == kNoCompression.
   //
+  // Compressors returned here are configured WITHOUT a dictionary, so that
+  // it's always possible to get correct compression->decompression results
+  // if not opting-in to dictionary handling. The compressors may recommend
+  // dictionary usage via GetDictGuidance() and creating a modified Compressor
+  // for that. See Compressor::GetDictGuidance() etc. for details.
+  //
   // These functions must be thread-safe.
 
   // Get a compressor for an SST file.
@@ -477,8 +516,8 @@ class CompressorWrapper : public Compressor {
   CompressorWrapper(const CompressorWrapper&) = delete;
   CompressorWrapper& operator=(const CompressorWrapper&) = delete;
 
-  size_t GetMaxSampleSizeIfWantDict(CacheEntryRole block_type) const override {
-    return wrapped_->GetMaxSampleSizeIfWantDict(block_type);
+  DictConfig GetDictGuidance(CacheEntryRole block_type) const override {
+    return wrapped_->GetDictGuidance(block_type);
   }
 
   Slice GetSerializedDict() const override {
@@ -496,9 +535,9 @@ class CompressorWrapper : public Compressor {
   // when the wrapped Compressor uses the default implementation of
   // MaybeCloneSpecialized(). This needs to be overridden if not.
   std::unique_ptr<Compressor> MaybeCloneSpecialized(
-      CacheEntryRole block_type, DictSampleArgs&& dict_samples) const override {
+      CacheEntryRole block_type, DictConfigArgs&& dict_config) const override {
     auto clone =
-        wrapped_->MaybeCloneSpecialized(block_type, std::move(dict_samples));
+        wrapped_->MaybeCloneSpecialized(block_type, std::move(dict_config));
     // Assert default no-op MaybeCloneSpecialized()
     assert(clone == nullptr);
     return clone;
@@ -592,7 +631,14 @@ class CompressionManagerWrapper : public CompressionManager {
 
   std::shared_ptr<CompressionManager> FindCompatibleCompressionManager(
       Slice compatibility_name) override {
-    return wrapped_->FindCompatibleCompressionManager(compatibility_name);
+    // NOTE: We expect that the wrapped CompressionManager will generally
+    // be preferred if compatible, so the default implementation here does
+    // not purely defer to the wrapped instance
+    if (compatibility_name == CompatibilityName()) {
+      return shared_from_this();
+    } else {
+      return wrapped_->FindCompatibleCompressionManager(compatibility_name);
+    }
   }
 
   bool SupportsCompressionType(CompressionType type) const override {
diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc
index 9c11e7e7253e..bbd1ddde8135 100644
--- a/table/block_based/block_based_table_builder.cc
+++ b/table/block_based/block_based_table_builder.cc
@@ -113,9 +113,9 @@ FilterBlockBuilder* CreateFilterBlockBuilder(
 // A convenience function for populating the Compressor* fields; see ~Rep()
 Compressor* MaybeCloneSpecialized(
     Compressor* compressor, CacheEntryRole block_type,
-    Compressor::DictSampleArgs&& dict_samples = {}) {
+    Compressor::DictConfigArgs&& dict_config = Compressor::DictDisabled{}) {
   auto specialized =
-      compressor->MaybeCloneSpecialized(block_type, std::move(dict_samples));
+      compressor->MaybeCloneSpecialized(block_type, std::move(dict_config));
   if (specialized) {
     // Caller is responsible for freeing when distinct
     return specialized.release();
@@ -833,7 +833,8 @@ struct BlockBasedTableBuilder::Rep {
   RelaxedAtomic<uint64_t> sampled_output_fast_data_bytes{0};
   uint32_t compression_parallel_threads;
   int max_compressed_bytes_per_kb;
-  size_t max_dict_sample_bytes = 0;
+  // Dictionary guidance for data blocks (from GetDictGuidance())
+  Compressor::DictConfig data_block_dict_guidance;
 
   // *** Compressors & decompressors - Yes, it seems like a lot here but ***
   // *** these are distinct fields to minimize extra conditionals and    ***
@@ -1122,9 +1123,12 @@ struct BlockBasedTableBuilder::Rep {
         index_block_working_area.compress =
             index_block_compressor->ObtainWorkingArea();
       }
-      max_dict_sample_bytes = basic_compressor->GetMaxSampleSizeIfWantDict(
-          CacheEntryRole::kDataBlock);
-      if (max_dict_sample_bytes > 0) {
+      data_block_dict_guidance =
+          basic_compressor->GetDictGuidance(CacheEntryRole::kDataBlock);
+      if (auto* sampling =
+              std::get_if<Compressor::DictSampling>(&data_block_dict_guidance);
+          sampling != nullptr && sampling->max_sample_bytes > 0) {
+        // Sampling mode: collect samples up to max_sample_bytes
         state = State::kBuffered;
         if (tbo.target_file_size == 0) {
           buffer_limit = tbo.compression_opts.max_dict_buffer_bytes;
@@ -1134,7 +1138,22 @@ struct BlockBasedTableBuilder::Rep {
           buffer_limit = std::min(tbo.target_file_size,
                                   tbo.compression_opts.max_dict_buffer_bytes);
         }
+      } else if (auto* predef = std::get_if<Compressor::DictPreDefined>(
+                     &data_block_dict_guidance);
+                 predef != nullptr && !predef->dict_data.empty()) {
+        // Pre-defined dictionary mode: use it immediately, no buffering
+        data_block_compressor = MaybeCloneSpecialized(
+            basic_compressor.get(), CacheEntryRole::kDataBlock,
+            Compressor::DictPreDefined{std::string{predef->dict_data}});
+        data_block_working_area.compress =
+            data_block_compressor->ObtainWorkingArea();
       } else {
+        assert(std::holds_alternative<Compressor::DictSampling>(
+                   data_block_dict_guidance) ||
+               std::holds_alternative<Compressor::DictPreDefined>(
+                   data_block_dict_guidance) ||
+               std::holds_alternative<Compressor::DictDisabled>(
+                   data_block_dict_guidance));
         // No distinct data block compressor using dictionary, but
         // implementation might still want to specialize for data blocks
         data_block_compressor = MaybeCloneSpecialized(
@@ -2632,14 +2651,18 @@ void BlockBasedTableBuilder::MaybeEnterUnbuffered(
       kPrimeGenerator % static_cast<uint64_t>(kNumBlocksBuffered));
   const size_t kInitSampleIdx = kNumBlocksBuffered / 2;
 
-  Compressor::DictSampleArgs samples;
+  Compressor::DictSamples samples;
   size_t buffer_idx = kInitSampleIdx;
-  for (size_t i = 0; i < kNumBlocksBuffered &&
-                     samples.sample_data.size() < r->max_dict_sample_bytes;
+  // Get max_sample_bytes from the DictSampling guidance
+  auto* sampling =
+      std::get_if<Compressor::DictSampling>(&r->data_block_dict_guidance);
+  assert(sampling != nullptr);
+  size_t max_sample_bytes = sampling->max_sample_bytes;
+  for (size_t i = 0;
+       i < kNumBlocksBuffered && samples.sample_data.size() < max_sample_bytes;
        ++i) {
-    size_t copy_len =
-        std::min(r->max_dict_sample_bytes - samples.sample_data.size(),
-                 r->data_block_buffers[buffer_idx].size());
+    size_t copy_len = std::min(max_sample_bytes - samples.sample_data.size(),
+                               r->data_block_buffers[buffer_idx].size());
     samples.sample_data.append(r->data_block_buffers[buffer_idx], 0, copy_len);
     samples.sample_lens.emplace_back(copy_len);
 
diff --git a/test_util/testutil.h b/test_util/testutil.h
index 3bd97ef14b76..c07b0139a4d4 100644
--- a/test_util/testutil.h
+++ b/test_util/testutil.h
@@ -796,9 +796,9 @@ struct CompressorCustomAlg : public CompressorWrapper {
   }
 
   std::unique_ptr<Compressor> MaybeCloneSpecialized(
-      CacheEntryRole block_type, DictSampleArgs&& dict_samples) const override {
+      CacheEntryRole block_type, DictConfigArgs&& dict_config) const override {
     auto clone =
-        wrapped_->CloneMaybeSpecialized(block_type, std::move(dict_samples));
+        wrapped_->CloneMaybeSpecialized(block_type, std::move(dict_config));
     return std::make_unique<CompressorCustomAlg>(std::move(clone));
   }
 
diff --git a/util/auto_tune_compressor.cc b/util/auto_tune_compressor.cc
index 58d6ee968a43..c61ba97bbe9b 100644
--- a/util/auto_tune_compressor.cc
+++ b/util/auto_tune_compressor.cc
@@ -64,9 +64,9 @@ std::unique_ptr<Compressor> AutoSkipCompressorWrapper::Clone() const {
 }
 
 std::unique_ptr<Compressor> AutoSkipCompressorWrapper::MaybeCloneSpecialized(
-    CacheEntryRole block_type, DictSampleArgs&& dict_samples) const {
+    CacheEntryRole block_type, DictConfigArgs&& dict_config) const {
   auto clone =
-      wrapped_->CloneMaybeSpecialized(block_type, std::move(dict_samples));
+      wrapped_->CloneMaybeSpecialized(block_type, std::move(dict_config));
   return std::make_unique<AutoSkipCompressorWrapper>(std::move(clone), opts_);
 }
 
@@ -189,11 +189,10 @@ const char* CostAwareCompressor::Name() const { return "CostAwareCompressor"; }
 std::unique_ptr<Compressor> CostAwareCompressor::Clone() const {
   return std::make_unique<CostAwareCompressor>(opts_);
 }
-size_t CostAwareCompressor::GetMaxSampleSizeIfWantDict(
+Compressor::DictConfig CostAwareCompressor::GetDictGuidance(
     CacheEntryRole block_type) const {
   auto idx = allcompressors_index_.back();
-  return allcompressors_[idx.first][idx.second]->GetMaxSampleSizeIfWantDict(
-      block_type);
+  return allcompressors_[idx.first][idx.second]->GetDictGuidance(block_type);
 }
 
 Slice CostAwareCompressor::GetSerializedDict() const {
@@ -205,12 +204,12 @@ CompressionType CostAwareCompressor::GetPreferredCompressionType() const {
   return kZSTD;
 }
 std::unique_ptr<Compressor> CostAwareCompressor::MaybeCloneSpecialized(
-    CacheEntryRole block_type, DictSampleArgs&& dict_samples) const {
+    CacheEntryRole block_type, DictConfigArgs&& dict_config) const {
   // TODO: full dictionary compression support. Currently this just falls
   // back on a non-multi compressor when asked to use a dictionary.
   auto idx = allcompressors_index_.back();
   return allcompressors_[idx.first][idx.second]->MaybeCloneSpecialized(
-      block_type, std::move(dict_samples));
+      block_type, std::move(dict_config));
 }
 Status CostAwareCompressor::CompressBlock(Slice uncompressed_data,
                                           char* compressed_output,
diff --git a/util/auto_tune_compressor.h b/util/auto_tune_compressor.h
index e3653fd45205..791193eb6c6b 100644
--- a/util/auto_tune_compressor.h
+++ b/util/auto_tune_compressor.h
@@ -66,7 +66,7 @@ class AutoSkipCompressorWrapper : public CompressorWrapper {
 
   std::unique_ptr<Compressor> Clone() const override;
   std::unique_ptr<Compressor> MaybeCloneSpecialized(
-      CacheEntryRole block_type, DictSampleArgs&& dict_samples) const override;
+      CacheEntryRole block_type, DictConfigArgs&& dict_config) const override;
   Status CompressBlock(Slice uncompressed_data, char* compressed_output,
                        size_t* compressed_output_size,
                        CompressionType* out_compression_type,
@@ -153,12 +153,12 @@ class CostAwareCompressor : public Compressor {
   explicit CostAwareCompressor(const CompressionOptions& opts);
   const char* Name() const override;
   std::unique_ptr<Compressor> Clone() const override;
-  size_t GetMaxSampleSizeIfWantDict(CacheEntryRole block_type) const override;
+  DictConfig GetDictGuidance(CacheEntryRole block_type) const override;
   Slice GetSerializedDict() const override;
   CompressionType GetPreferredCompressionType() const override;
   ManagedWorkingArea ObtainWorkingArea() override;
   std::unique_ptr<Compressor> MaybeCloneSpecialized(
-      CacheEntryRole block_type, DictSampleArgs&& dict_samples) const override;
+      CacheEntryRole block_type, DictConfigArgs&& dict_config) const override;
 
   Status CompressBlock(Slice uncompressed_data, char* compressed_output,
                        size_t* compressed_output_size,
diff --git a/util/compression.cc b/util/compression.cc
index 71eaa1393370..612854b5ac19 100644
--- a/util/compression.cc
+++ b/util/compression.cc
@@ -222,9 +222,11 @@ class CompressorWithSimpleDictBase : public CompressorBase {
                                         std::string&& dict_data = {})
       : CompressorBase(opts), dict_data_(std::move(dict_data)) {}
 
-  size_t GetMaxSampleSizeIfWantDict(
-      CacheEntryRole /*block_type*/) const override {
-    return opts_.max_dict_bytes;
+  DictConfig GetDictGuidance(CacheEntryRole /*block_type*/) const override {
+    if (opts_.max_dict_bytes == 0) {
+      return DictDisabled{};
+    }
+    return DictSampling{opts_.max_dict_bytes};
   }
 
   // NOTE: empty dict is equivalent to no dict
@@ -236,13 +238,21 @@ class CompressorWithSimpleDictBase : public CompressorBase {
 
   std::unique_ptr<Compressor> MaybeCloneSpecialized(
       CacheEntryRole /*block_type*/,
-      DictSampleArgs&& dict_samples) const final override {
-    assert(dict_samples.Verify());
-    if (dict_samples.empty()) {
-      // Nothing to specialize on
-      return nullptr;
+      DictConfigArgs&& dict_config) const final override {
+    if (auto* samples = std::get_if<DictSamples>(&dict_config)) {
+      assert(samples->Verify());
+      if (samples->empty()) {
+        return nullptr;
+      }
+      return CloneForDict(std::move(samples->sample_data));
+    } else if (auto* predef = std::get_if<DictPreDefined>(&dict_config)) {
+      if (predef->dict_data.empty()) {
+        return nullptr;
+      }
+      return CloneForDict(std::move(predef->dict_data));
     } else {
-      return CloneForDict(std::move(dict_samples.sample_data));
+      assert(std::holds_alternative<DictDisabled>(dict_config));
+      return nullptr;
     }
   }
 
@@ -858,14 +868,15 @@ class BuiltinZSTDCompressorV2 final : public CompressorBase {
                                                      std::move(dict_copy));
   }
 
-  size_t GetMaxSampleSizeIfWantDict(
-      CacheEntryRole /*block_type*/) const override {
+  DictConfig GetDictGuidance(CacheEntryRole /*block_type*/) const override {
     if (opts_.max_dict_bytes == 0) {
       // Dictionary compression disabled
-      return 0;
+      return DictDisabled{};
     } else {
-      return opts_.zstd_max_train_bytes > 0 ? opts_.zstd_max_train_bytes
-                                            : opts_.max_dict_bytes;
+      size_t max_sample_bytes = opts_.zstd_max_train_bytes > 0
+                                    ? opts_.zstd_max_train_bytes
+                                    : opts_.max_dict_bytes;
+      return DictSampling{max_sample_bytes};
     }
   }
 
@@ -974,31 +985,49 @@ class BuiltinZSTDCompressorV2 final : public CompressorBase {
 
   std::unique_ptr<Compressor> MaybeCloneSpecialized(
       CacheEntryRole /*block_type*/,
-      DictSampleArgs&& dict_samples) const override {
-    assert(dict_samples.Verify());
-    if (dict_samples.empty()) {
-      // Nothing to specialize on
+      DictConfigArgs&& dict_config) const override {
+    // Handle DictDisabled
+    // TODO: use holds_alternative
+    if (auto* disabled = std::get_if<DictDisabled>(&dict_config)) {
+      (void)disabled;
       return nullptr;
     }
+
     std::string dict_data;
-    // Migrated from BlockBasedTableBuilder::EnterUnbuffered()
-    if (opts_.zstd_max_train_bytes > 0) {
-      assert(dict_samples.sample_data.size() <= opts_.zstd_max_train_bytes);
-      if (opts_.use_zstd_dict_trainer) {
-        dict_data = ZSTD_TrainDictionary(dict_samples.sample_data,
-                                         dict_samples.sample_lens,
-                                         opts_.max_dict_bytes);
+
+    // Handle DictPreDefined - use the pre-defined dictionary directly
+    if (auto* predef = std::get_if<DictPreDefined>(&dict_config)) {
+      if (predef->dict_data.empty()) {
+        return nullptr;
+      }
+      dict_data = std::move(predef->dict_data);
+    }
+
+    // Handle DictSamples - train dictionary from samples
+    if (auto* samples = std::get_if<DictSamples>(&dict_config)) {
+      assert(samples->Verify());
+      if (samples->empty()) {
+        return nullptr;
+      }
+      // Migrated from BlockBasedTableBuilder::EnterUnbuffered()
+      if (opts_.zstd_max_train_bytes > 0) {
+        assert(samples->sample_data.size() <= opts_.zstd_max_train_bytes);
+        if (opts_.use_zstd_dict_trainer) {
+          dict_data = ZSTD_TrainDictionary(
+              samples->sample_data, samples->sample_lens, opts_.max_dict_bytes);
+        } else {
+          dict_data = ZSTD_FinalizeDictionary(
+              samples->sample_data, samples->sample_lens, opts_.max_dict_bytes,
+              opts_.level);
+        }
       } else {
-        dict_data = ZSTD_FinalizeDictionary(dict_samples.sample_data,
-                                            dict_samples.sample_lens,
-                                            opts_.max_dict_bytes, opts_.level);
+        assert(samples->sample_data.size() <= opts_.max_dict_bytes);
+        // ZSTD "raw content dictionary" - "Any buffer is a valid raw content
+        // dictionary." Or similar for other compressions.
+        dict_data = std::move(samples->sample_data);
       }
-    } else {
-      assert(dict_samples.sample_data.size() <= opts_.max_dict_bytes);
-      // ZSTD "raw content dictionary" - "Any buffer is a valid raw content
-      // dictionary." Or similar for other compressions.
-      dict_data = std::move(dict_samples.sample_data);
     }
+
     CompressionDict dict{std::move(dict_data), kZSTD, opts_.level};
     return std::make_unique<BuiltinZSTDCompressorV2>(opts_, std::move(dict));
   }
diff --git a/util/compression_test.cc b/util/compression_test.cc
index 18144d99c93e..e87e4195feb2 100644
--- a/util/compression_test.cc
+++ b/util/compression_test.cc
@@ -1362,9 +1362,9 @@ TEST_P(DBCompressionTestMaybeParallel, CompressionManagerWrapper) {
 
     std::unique_ptr<Compressor> MaybeCloneSpecialized(
         CacheEntryRole block_type,
-        DictSampleArgs&& dict_samples) const override {
+        DictConfigArgs&& dict_config) const override {
       std::unique_ptr<Compressor> result = std::make_unique<MyCompressor>(
-          wrapped_->CloneMaybeSpecialized(block_type, std::move(dict_samples)));
+          wrapped_->CloneMaybeSpecialized(block_type, std::move(dict_config)));
       if (block_type == CacheEntryRole::kDataBlock) {
         result = std::make_unique<CheckDataBlockCompressorWrapper>(
             std::move(result));
@@ -2138,6 +2138,232 @@ TEST_F(DBCompressionCostPredictor, CostAwareCompressorManager) {
   ASSERT_OK(Flush());
 }
 
+// Test pre-defined dictionary compression with a custom CompressionManager
+TEST_F(DBCompressionTest, PreDefinedDictionaryCompression) {
+  if (!ZSTD_Supported()) {
+    ROCKSDB_GTEST_BYPASS("ZSTD compression not supported");
+    return;
+  }
+
+  // A custom compressor that returns a pre-defined dictionary
+  class PreDefinedDictCompressor : public CompressorWrapper {
+   public:
+    explicit PreDefinedDictCompressor(std::unique_ptr<Compressor> wrapped,
+                                      std::string dict_data)
+        : CompressorWrapper(std::move(wrapped)),
+          predefined_dict_(std::move(dict_data)) {}
+
+    const char* Name() const override { return "PreDefinedDictCompressor"; }
+
+    DictConfig GetDictGuidance(CacheEntryRole block_type) const override {
+      if (block_type == CacheEntryRole::kDataBlock &&
+          !predefined_dict_.empty()) {
+        return DictPreDefined{/*copy*/ predefined_dict_};
+      }
+      return DictDisabled{};
+    }
+
+    std::unique_ptr<Compressor> Clone() const override {
+      return std::make_unique<PreDefinedDictCompressor>(wrapped_->Clone(),
+                                                        predefined_dict_);
+    }
+
+    std::unique_ptr<Compressor> MaybeCloneSpecialized(
+        CacheEntryRole block_type,
+        DictConfigArgs&& dict_config) const override {
+      // Delegate to wrapped compressor for dictionary handling
+      auto specialized =
+          wrapped_->MaybeCloneSpecialized(block_type, std::move(dict_config));
+      if (specialized) {
+        return specialized;
+      }
+      return nullptr;
+    }
+
+   private:
+    std::string predefined_dict_;
+  };
+
+  // Custom CompatibilityName so the builtin compression manager won't be used
+  static const char* kTestCompatibilityName = "PreDefinedDictTest";
+
+  class PreDefinedDictManager : public CompressionManagerWrapper {
+   public:
+    explicit PreDefinedDictManager(std::shared_ptr<CompressionManager> wrapped,
+                                   std::string dict_data)
+        : CompressionManagerWrapper(std::move(wrapped)),
+          predefined_dict_(std::move(dict_data)) {}
+
+    const char* Name() const override { return "PreDefinedDictManager"; }
+
+    const char* CompatibilityName() const override {
+      return kTestCompatibilityName;
+    }
+
+    std::unique_ptr<Compressor> GetCompressorForSST(
+        const FilterBuildingContext& context, const CompressionOptions& opts,
+        CompressionType preferred) override {
+      auto base = wrapped_->GetCompressorForSST(context, opts, preferred);
+      if (base) {
+        return std::make_unique<PreDefinedDictCompressor>(std::move(base),
+                                                          predefined_dict_);
+      }
+      return nullptr;
+    }
+
+   private:
+    std::string predefined_dict_;
+  };
+
+  // A broken manager that ignores the dictionary when decompressing.
+  // This simulates a buggy decompressor that doesn't properly apply the
+  // dictionary, causing ZSTD to produce wrong output when decompressing
+  // dictionary-compressed data.
+  class BrokenDictManager : public CompressionManagerWrapper {
+   public:
+    explicit BrokenDictManager(std::shared_ptr<CompressionManager> wrapped)
+        : CompressionManagerWrapper(std::move(wrapped)) {}
+
+    const char* Name() const override { return "BrokenDictManager"; }
+
+    const char* CompatibilityName() const override {
+      return kTestCompatibilityName;
+    }
+
+    std::shared_ptr<Decompressor> GetDecompressor() override {
+      return std::make_shared<IgnoreDictDecompressor>(
+          wrapped_->GetDecompressor());
+    }
+
+    std::shared_ptr<Decompressor> GetDecompressorOptimizeFor(
+        CompressionType optimize_for_type) override {
+      return std::make_shared<IgnoreDictDecompressor>(
+          wrapped_->GetDecompressorOptimizeFor(optimize_for_type));
+    }
+
+    std::shared_ptr<Decompressor> GetDecompressorForTypes(
+        const CompressionType* types_begin,
+        const CompressionType* types_end) override {
+      return std::make_shared<IgnoreDictDecompressor>(
+          wrapped_->GetDecompressorForTypes(types_begin, types_end));
+    }
+
+   private:
+    // A decompressor that stores the dictionary (for GetSerializedDict) but
+    // ignores it during decompression, causing ZSTD to produce garbage
+    class IgnoreDictDecompressor : public DecompressorWrapper {
+     public:
+      explicit IgnoreDictDecompressor(std::shared_ptr<Decompressor> wrapped)
+          : DecompressorWrapper(std::move(wrapped)) {}
+
+      IgnoreDictDecompressor(std::shared_ptr<Decompressor> wrapped,
+                             std::string dict)
+          : DecompressorWrapper(std::move(wrapped)),
+            dict_(std::move(dict)),
+            dict_slice_(dict_) {}
+
+      const char* Name() const override { return "IgnoreDictDecompressor"; }
+
+      const Slice& GetSerializedDict() const override { return dict_slice_; }
+
+      Status MaybeCloneForDict(const Slice& serialized_dict,
+                               std::unique_ptr<Decompressor>* out) override {
+        // Store the dict but don't actually use it for decompression
+        *out = std::make_unique<IgnoreDictDecompressor>(
+            wrapped_,
+            std::string(serialized_dict.data(), serialized_dict.size()));
+        return Status::OK();
+      }
+
+     private:
+      std::string dict_;
+      Slice dict_slice_;
+    };
+  };
+
+  // Create a dictionary that will be heavily referenced. The key insight is
+  // that ZSTD dictionary compression works by finding matches between the input
+  // data and the dictionary content. To force ZSTD to create dictionary
+  // references, we need to use data that contains exact copies of dictionary
+  // content.
+  Random rnd(42);
+
+  // Create a dictionary with recognizable patterns
+  std::string predefined_dict;
+  std::vector<std::string> dict_patterns;
+  for (int i = 0; i < 50; i++) {
+    std::string pattern = rnd.RandomString(200);
+    dict_patterns.push_back(pattern);
+    predefined_dict += pattern;
+  }
+  // Total dict size: 50 * 200 = 10000 bytes
+  size_t kDictSize = predefined_dict.size();
+
+  auto mgr = std::make_shared<PreDefinedDictManager>(
+      GetBuiltinV2CompressionManager(), predefined_dict);
+
+  Options options = CurrentOptions();
+  options.compression = kZSTD;
+  options.compression_opts.max_dict_bytes = static_cast<int>(kDictSize);
+  options.compression_manager = mgr;
+  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  BlockBasedTableOptions bbto;
+  bbto.enable_index_compression = true;
+  // Need format_version >= 7 for custom CompatibilityName
+  bbto.format_version = 7;
+  // Need dictionary block load statistics
+  bbto.block_cache = NewLRUCache(1 << 20);
+  bbto.cache_index_and_filter_blocks = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  DestroyAndReopen(options);
+
+  // Write data that uses the same patterns from the dictionary.
+  // This forces ZSTD to create back-references to the dictionary.
+  std::vector<std::string> expected_values;
+  for (int i = 0; i < 100; i++) {
+    std::string value;
+    // Compose value from random dictionary patterns - same content as dict
+    for (int j = 0; j < 5; j++) {
+      value +=
+          dict_patterns[rnd.Uniform(static_cast<int>(dict_patterns.size()))];
+    }
+    expected_values.push_back(value);
+    ASSERT_OK(Put(Key(i), value));
+  }
+  ASSERT_OK(Flush());
+
+  // Verify dictionary was used by checking that dict bytes were inserted
+  ASSERT_GE(
+      TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
+      predefined_dict.size());
+
+  // Read back data and verify correctness
+  for (int i = 0; i < 100; i++) {
+    std::string value;
+    ASSERT_OK(db_->Get(ReadOptions(), Key(i), &value));
+    ASSERT_EQ(value, expected_values[i]);
+  }
+
+  // Now re-open with a broken decompressor that ignores dictionary.
+  // This should result in corruption on read because ZSTD will fail to
+  // decompress data that references the missing dictionary content.
+  Close();
+  auto broken_mgr =
+      std::make_shared<BrokenDictManager>(GetBuiltinV2CompressionManager());
+  options.compression_manager = broken_mgr;
+  // New block cache to ensure dictionary is re-loaded, because the
+  // dictionary block in cache is actually associated with a decompressor
+  bbto.block_cache = NewLRUCache(1 << 20);
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  ASSERT_OK(TryReopen(options));
+
+  // Read should fail with corruption because the decompressor ignores
+  // the dictionary, causing ZSTD to produce garbage output
+  std::string value;
+  ASSERT_EQ(db_->Get(ReadOptions(), Key(0), &value).code(),
+            Status::kCorruption);
+}
+
 }  // namespace ROCKSDB_NAMESPACE
 int main(int argc, char** argv) {
   ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
diff --git a/util/simple_mixed_compressor.cc b/util/simple_mixed_compressor.cc
index 73e09e0ee6b0..46b2e74c3091 100644
--- a/util/simple_mixed_compressor.cc
+++ b/util/simple_mixed_compressor.cc
@@ -28,9 +28,9 @@ MultiCompressorWrapper::MultiCompressorWrapper(const CompressionOptions& opts)
   }
 }
 
-size_t MultiCompressorWrapper::GetMaxSampleSizeIfWantDict(
+Compressor::DictConfig MultiCompressorWrapper::GetDictGuidance(
     CacheEntryRole block_type) const {
-  return compressors_.back()->GetMaxSampleSizeIfWantDict(block_type);
+  return compressors_.back()->GetDictGuidance(block_type);
 }
 
 Slice MultiCompressorWrapper::GetSerializedDict() const {
@@ -46,11 +46,11 @@ Compressor::ManagedWorkingArea MultiCompressorWrapper::ObtainWorkingArea() {
 }
 
 std::unique_ptr<Compressor> MultiCompressorWrapper::MaybeCloneSpecialized(
-    CacheEntryRole block_type, DictSampleArgs&& dict_samples) const {
+    CacheEntryRole block_type, DictConfigArgs&& dict_config) const {
   // TODO: full dictionary compression support. Currently this just falls
   // back on a non-multi compressor when asked to use a dictionary.
   return compressors_.back()->MaybeCloneSpecialized(block_type,
-                                                    std::move(dict_samples));
+                                                    std::move(dict_config));
 }
 
 // RandomMixedCompressor implementation
diff --git a/util/simple_mixed_compressor.h b/util/simple_mixed_compressor.h
index 0d435394db05..f2499a8f4e99 100644
--- a/util/simple_mixed_compressor.h
+++ b/util/simple_mixed_compressor.h
@@ -19,12 +19,12 @@ class MultiCompressorWrapper : public Compressor {
  public:
   explicit MultiCompressorWrapper(const CompressionOptions& opts);
 
-  size_t GetMaxSampleSizeIfWantDict(CacheEntryRole block_type) const override;
+  DictConfig GetDictGuidance(CacheEntryRole block_type) const override;
   Slice GetSerializedDict() const override;
   CompressionType GetPreferredCompressionType() const override;
   ManagedWorkingArea ObtainWorkingArea() override;
   std::unique_ptr<Compressor> MaybeCloneSpecialized(
-      CacheEntryRole block_type, DictSampleArgs&& dict_samples) const override;
+      CacheEntryRole block_type, DictConfigArgs&& dict_config) const override;
 
  protected:
   const CompressionOptions opts_;

From ad218cacf6a89bb6e82d2cc748b3a85f5f87e3ce Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Mon, 26 Jan 2026 09:06:33 -0800
Subject: [PATCH 436/500] Temporary disable multiscan_use_async_io in crash
 test (#14263)

Summary:
Seeing many errors like this

```
Iterator diverged from control iterator which has value ...
iterator is not valid with status: IO error: Req failed: Unknown error -14
VerifyIterator failed. Control CF default
```

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14263

Test Plan: CI

Reviewed By: archang19

Differential Revision: D91478886

Pulled By: pdillinger

fbshipit-source-id: 94b955b6ecdb7a3cab39dac8e7b0d1047d49a0bb
---
 tools/db_crashtest.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index 6c55f84b6011..30dd435980af 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -444,7 +444,8 @@ def apply_random_seed_per_iteration():
     "use_multiscan": random.choice([1] + [0] * 3),
     # By default, `statistics` use kExceptDetailedTimers level
     "statistics": random.choice([0, 1]),
-    "multiscan_use_async_io": random.randint(0, 1),
+    # TODO: re-enable after resolving "Req failed: Unknown error -14" errors
+    "multiscan_use_async_io": 0,  # random.randint(0, 1),
 }
 
 _TEST_DIR_ENV_VAR = "TEST_TMPDIR"

From cc691126e5ca7886f7face543ae8121878c59fb7 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Mon, 26 Jan 2026 09:50:59 -0800
Subject: [PATCH 437/500] Start version 10.12 development (#14259)

Summary:
Update HISTORY, version number, format compatible test, and folly version

folly build now depends on libaio

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14259

Test Plan: CI

Reviewed By: anand1976

Differential Revision: D91356493

Pulled By: pdillinger

fbshipit-source-id: 9d85960c647758d5cb33e3910e714e2f7785fd06
---
 .github/actions/setup-folly/action.yml              |  4 ++--
 HISTORY.md                                          | 13 +++++++++++++
 folly.mk                                            |  2 +-
 include/rocksdb/version.h                           |  2 +-
 tools/check_format_compatible.sh                    |  2 +-
 unreleased_history/behavior_changes/fv7.md          |  1 -
 .../bug_fixes/compression_perf_fv7.md               |  1 -
 .../bug_fixes/fix_udt_infinite_compaction_loop.md   |  1 -
 .../bug_fixes/udt_seqno_zero_bug_fix.md             |  1 -
 unreleased_history/public_api_changes/lua.md        |  1 -
 .../public_api_changes/set_options.md               |  1 -
 11 files changed, 18 insertions(+), 11 deletions(-)
 delete mode 100644 unreleased_history/behavior_changes/fv7.md
 delete mode 100644 unreleased_history/bug_fixes/compression_perf_fv7.md
 delete mode 100644 unreleased_history/bug_fixes/fix_udt_infinite_compaction_loop.md
 delete mode 100644 unreleased_history/bug_fixes/udt_seqno_zero_bug_fix.md
 delete mode 100644 unreleased_history/public_api_changes/lua.md
 delete mode 100644 unreleased_history/public_api_changes/set_options.md

diff --git a/.github/actions/setup-folly/action.yml b/.github/actions/setup-folly/action.yml
index af1d4b727be6..8702b92aa857 100644
--- a/.github/actions/setup-folly/action.yml
+++ b/.github/actions/setup-folly/action.yml
@@ -6,6 +6,6 @@ runs:
     run: |
       make checkout_folly
     shell: bash
-  - name: Install patchelf
-    run: apt-get update -y && apt-get install -y patchelf
+  - name: Install patchelf and libaio
+    run: apt-get update -y && apt-get install -y patchelf libaio-dev
     shell: bash
diff --git a/HISTORY.md b/HISTORY.md
index 9f440849b35a..277ade360676 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -1,6 +1,19 @@
 # Rocksdb Change Log
 > NOTE: Entries for next release do not go here. Follow instructions in `unreleased_history/README.txt`
 
+## 10.11.0 (01/23/2026)
+### Public API Changes
+* New SetOptions API that allows setting options for multiple CFs, avoiding the need to reserialize OPTIONS file for each CF
+* Remove remaining pieces of Lua integration
+
+### Behavior Changes
+* The new default for `BlockBasedTableOptions::format_version` is 7, which has been supported since RocksDB 10.4.0 and is required in order to use CompressionManagers supporting custom compression types.
+
+### Bug Fixes
+* Fixed a small performance bug with `format_version=7` when decompressing formats other than Snappy and ZSTD.
+* Fixed an infinite compaction loop bug with User-Defined Timestamps (UDT) where bottommost files were repeatedly marked for compaction even though their timestamp could not be collapsed.
+* Bugfix for persisted UDT record sequence number zeroing logic.
+
 ## 10.10.0 (12/16/2025)
 ### Bug Fixes
 * Fixed a bug in best-efforts recovery that causes use-after-free crashes when accessing SST files that were cached during the recovery.
diff --git a/folly.mk b/folly.mk
index b253c25b64be..7709485f4a6c 100644
--- a/folly.mk
+++ b/folly.mk
@@ -98,7 +98,7 @@ endif  # FMT_SOURCE_PATH
 	PLATFORM_LDFLAGS += -lglog
 endif
 
-FOLLY_COMMIT_HASH = 94a8e82cf16a0e229fc4fc89140219434ba78fa2
+FOLLY_COMMIT_HASH = d2d1e6f746faa9ae7a973381dbd017634d04a040
 
 # For public CI runs, checkout folly in a way that can build with RocksDB.
 # This is mostly intended as a test-only simulation of Meta-internal folly
diff --git a/include/rocksdb/version.h b/include/rocksdb/version.h
index 4b7a720f1ae4..0de620474ee1 100644
--- a/include/rocksdb/version.h
+++ b/include/rocksdb/version.h
@@ -12,7 +12,7 @@
 // NOTE: in 'main' development branch, this should be the *next*
 // minor or major version number planned for release.
 #define ROCKSDB_MAJOR 10
-#define ROCKSDB_MINOR 11
+#define ROCKSDB_MINOR 12
 #define ROCKSDB_PATCH 0
 
 // Make it easy to do conditional compilation based on version checks, i.e.
diff --git a/tools/check_format_compatible.sh b/tools/check_format_compatible.sh
index ede9263ecd38..075a512337c1 100755
--- a/tools/check_format_compatible.sh
+++ b/tools/check_format_compatible.sh
@@ -137,7 +137,7 @@ EOF
 
 # To check for DB forward compatibility with loading options (old version
 # reading data from new), as well as backward compatibility
-declare -a db_forward_with_options_refs=("10.4.fb" "10.5.fb" "10.6.fb" "10.7.fb" "10.8.fb" "10.9.fb" "10.10.fb")
+declare -a db_forward_with_options_refs=("10.4.fb" "10.5.fb" "10.6.fb" "10.7.fb" "10.8.fb" "10.9.fb" "10.10.fb" "10.11.fb")
 # To check for DB forward compatibility without loading options (in addition
 # to the "with loading options" set), as well as backward compatibility
 declare -a db_forward_no_options_refs=() # N/A at the moment
diff --git a/unreleased_history/behavior_changes/fv7.md b/unreleased_history/behavior_changes/fv7.md
deleted file mode 100644
index 91be747f80d5..000000000000
--- a/unreleased_history/behavior_changes/fv7.md
+++ /dev/null
@@ -1 +0,0 @@
-* The new default for `BlockBasedTableOptions::format_version` is 7, which has been supported since RocksDB 10.4.0 and is required in order to use CompressionManagers supporting custom compression types.
diff --git a/unreleased_history/bug_fixes/compression_perf_fv7.md b/unreleased_history/bug_fixes/compression_perf_fv7.md
deleted file mode 100644
index 422e96bb7771..000000000000
--- a/unreleased_history/bug_fixes/compression_perf_fv7.md
+++ /dev/null
@@ -1 +0,0 @@
-* Fixed a small performance bug with `format_version=7` when decompressing formats other than Snappy and ZSTD.
diff --git a/unreleased_history/bug_fixes/fix_udt_infinite_compaction_loop.md b/unreleased_history/bug_fixes/fix_udt_infinite_compaction_loop.md
deleted file mode 100644
index ac08736c72b1..000000000000
--- a/unreleased_history/bug_fixes/fix_udt_infinite_compaction_loop.md
+++ /dev/null
@@ -1 +0,0 @@
-Fixed an infinite compaction loop bug with User-Defined Timestamps (UDT) where bottommost files were repeatedly marked for compaction even though their timestamp could not be collapsed.
diff --git a/unreleased_history/bug_fixes/udt_seqno_zero_bug_fix.md b/unreleased_history/bug_fixes/udt_seqno_zero_bug_fix.md
deleted file mode 100644
index 244fed53dcda..000000000000
--- a/unreleased_history/bug_fixes/udt_seqno_zero_bug_fix.md
+++ /dev/null
@@ -1 +0,0 @@
-Bugfix for persisted UDT record sequence number zeroing logic.
diff --git a/unreleased_history/public_api_changes/lua.md b/unreleased_history/public_api_changes/lua.md
deleted file mode 100644
index be62aef54e31..000000000000
--- a/unreleased_history/public_api_changes/lua.md
+++ /dev/null
@@ -1 +0,0 @@
-* Remove remaining pieces of Lua integration
diff --git a/unreleased_history/public_api_changes/set_options.md b/unreleased_history/public_api_changes/set_options.md
deleted file mode 100644
index eadc2620f7e1..000000000000
--- a/unreleased_history/public_api_changes/set_options.md
+++ /dev/null
@@ -1 +0,0 @@
-New SetOptions API that allows setting options for multiple CFs, avoiding the need to reserialize OPTIONS file for each CF

From a3fe685cdc7c7b3a9caa01ff0f9d1283d4a05646 Mon Sep 17 00:00:00 2001
From: Evan Jones <evan.jones@datadoghq.com>
Date: Tue, 27 Jan 2026 01:26:28 -0800
Subject: [PATCH 438/500] math.h BottomNBits: Fix integer underflow (#14231)

Summary:
When running make check on aarch64, hash_test reports an integer underflows:

    util/math.h:44:46: runtime error: signed integer overflow:
    -2147483648 - 1 cannot be represented in type 'int'
    util/math.h:44:46: runtime error: signed integer overflow:
    -9223372036854775808 - 1 cannot be represented in type 'long long'
    util/math.h:44:46: runtime error: signed integer overflow:
    -9223372036854775808 - 1 cannot be represented in type 'long'

The issue is when BottomNBits(int32 value, 31) does not use BMI2, it executes the following:

    return static_cast<T>(v & ((T{1} << nbits) - 1));

For int32_t, (1 << 31) is the minimum value, and -1 is an integer underflow. The fix is to cast T to an unsigned type and use that for the bit manipulation.

I used Compiler Explorer to verify that this still compiles to the BZHI instruction mentioned in the comment with -march=x86-64-v3: https://godbolt.org/z/8bcTE8xbf

To reproduce these errors on x86-64, disable the BMI code path:
```
USE_CLANG=1 PORTABLE=x86-64-v2 LDFLAGS=-fsanitize=undefined CXXFLAGS=-fsanitize=undefined make -j20 hash_test
```

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14231

Reviewed By: mszeszko-meta

Differential Revision: D91353147

Pulled By: pdillinger

fbshipit-source-id: 64cc191ccb9ecba20c260fab759e8881e30d2352
---
 util/hash_test.cc | 12 ++++++++++++
 util/math.h       |  4 +++-
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/util/hash_test.cc b/util/hash_test.cc
index ccc283a24376..dffdae4ce598 100644
--- a/util/hash_test.cc
+++ b/util/hash_test.cc
@@ -615,6 +615,13 @@ static void test_BitOps() {
 
     // BottomNBits
     {
+      // build the mask the extremely slow way
+      T bottom_n_mask = 0x00;
+      for (int j = 0; j < i; j++) {
+        bottom_n_mask <<= 1;
+        bottom_n_mask |= 0x1;
+      }
+
       // An essentially full length value
       T x = everyOtherBit;
       if (i > 2) {
@@ -623,6 +630,11 @@ static void test_BitOps() {
       }
       auto a = BottomNBits(x, i);
       auto b = BottomNBits(~x, i);
+
+      // check that a and b match the expected values
+      EXPECT_EQ(a, x & bottom_n_mask);
+      EXPECT_EQ(b, (~x) & bottom_n_mask);
+
       EXPECT_EQ(x | a, x);
       EXPECT_EQ(a | b, vm1);
       EXPECT_EQ(a & b, T{0});
diff --git a/util/math.h b/util/math.h
index e1948e0a313e..112a54f9ffb6 100644
--- a/util/math.h
+++ b/util/math.h
@@ -41,7 +41,9 @@ inline T BottomNBits(T v, int nbits) {
 #endif
   // Newer compilers compile this down to bzhi on x86, but some older
   // ones don't, thus the need for the intrinsic above.
-  return static_cast<T>(v & ((T{1} << nbits) - 1));
+  using UnsignedT = std::make_unsigned_t<T>;
+  UnsignedT mask = (static_cast<UnsignedT>(1) << nbits) - 1;
+  return static_cast<T>(static_cast<UnsignedT>(v) & mask);
 }
 
 // Fast implementation of floor(log2(v)). Undefined for 0 or negative

From de06ce37db8236d29595183018591508e26d5102 Mon Sep 17 00:00:00 2001
From: Maciej Szeszko <mszeszko@meta.com>
Date: Tue, 27 Jan 2026 11:13:48 -0800
Subject: [PATCH 439/500] Remove PutUntil API (#14257)

Summary:
Pull Request resolved: https://github.com/facebook/rocksdb/pull/14257

Removes the 'unused' `PutUntil` API and updates `Put/PutWithTTL` to inline the previous implementation. Test helpers are updated to use `PutWithTTL` with computed TTL values instead.

Reviewed By: xingbowang

Differential Revision: D90900841

fbshipit-source-id: c6ab89fe32773f426b0bedc706bf5a2683ec31cf
---
 include/rocksdb/statistics.h      |   5 +-
 utilities/blob_db/blob_db.h       |  14 --
 utilities/blob_db/blob_db_impl.cc |  21 ++-
 utilities/blob_db/blob_db_impl.h  |   4 -
 utilities/blob_db/blob_db_test.cc | 258 +++++++++++++-----------------
 5 files changed, 125 insertions(+), 177 deletions(-)

diff --git a/include/rocksdb/statistics.h b/include/rocksdb/statistics.h
index fb75ebee3fca..66625fe2dc99 100644
--- a/include/rocksdb/statistics.h
+++ b/include/rocksdb/statistics.h
@@ -301,7 +301,7 @@ enum Tickers : uint32_t {
   NUMBER_RATE_LIMITER_DRAINS,
 
   // BlobDB specific stats
-  // # of Put/PutTTL/PutUntil to BlobDB. Only applicable to legacy BlobDB.
+  // # of Put/PutWithTTL to BlobDB. Only applicable to legacy BlobDB.
   BLOB_DB_NUM_PUT,
   // # of Write to BlobDB. Only applicable to legacy BlobDB.
   BLOB_DB_NUM_WRITE,
@@ -651,8 +651,7 @@ enum Histograms : uint32_t {
   BLOB_DB_KEY_SIZE,
   // Size of values written to BlobDB. Only applicable to legacy BlobDB.
   BLOB_DB_VALUE_SIZE,
-  // BlobDB Put/PutWithTTL/PutUntil/Write latency. Only applicable to legacy
-  // BlobDB.
+  // BlobDB Put/PutWithTTL/Write latency. Only applicable to legacy BlobDB.
   BLOB_DB_WRITE_MICROS,
   // BlobDB Get latency. Only applicable to legacy BlobDB.
   BLOB_DB_GET_MICROS,
diff --git a/utilities/blob_db/blob_db.h b/utilities/blob_db/blob_db.h
index 503d476fa51d..f480e0c1b5de 100644
--- a/utilities/blob_db/blob_db.h
+++ b/utilities/blob_db/blob_db.h
@@ -121,20 +121,6 @@ class BlobDB : public StackableDB {
     return PutWithTTL(options, key, value, ttl);
   }
 
-  // Put with expiration. Key with expiration time equal to
-  // std::numeric_limits<uint64_t>::max() means the key don't expire.
-  virtual Status PutUntil(const WriteOptions& options, const Slice& key,
-                          const Slice& value, uint64_t expiration) = 0;
-  virtual Status PutUntil(const WriteOptions& options,
-                          ColumnFamilyHandle* column_family, const Slice& key,
-                          const Slice& value, uint64_t expiration) {
-    if (column_family->GetID() != DefaultColumnFamily()->GetID()) {
-      return Status::NotSupported(
-          "Blob DB doesn't support non-default column family.");
-    }
-    return PutUntil(options, key, value, expiration);
-  }
-
   using ROCKSDB_NAMESPACE::StackableDB::Get;
   Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family,
              const Slice& key, PinnableSlice* value,
diff --git a/utilities/blob_db/blob_db_impl.cc b/utilities/blob_db/blob_db_impl.cc
index 58d51471cd98..e204d0e81940 100644
--- a/utilities/blob_db/blob_db_impl.cc
+++ b/utilities/blob_db/blob_db_impl.cc
@@ -1035,18 +1035,27 @@ Status BlobDBImpl::Write(const WriteOptions& options, WriteBatch* updates) {
 
 Status BlobDBImpl::Put(const WriteOptions& options, const Slice& key,
                        const Slice& value) {
-  return PutUntil(options, key, value, kNoExpiration);
+  StopWatch write_sw(clock_, statistics_, BLOB_DB_WRITE_MICROS);
+  RecordTick(statistics_, BLOB_DB_NUM_PUT);
+  Status s;
+  WriteBatch batch;
+  {
+    // Release write_mutex_ before DB write to avoid race condition with
+    // flush begin listener, which also require write_mutex_ to sync
+    // blob files.
+    MutexLock l(&write_mutex_);
+    s = PutBlobValue(options, key, value, kNoExpiration, &batch);
+  }
+  if (s.ok()) {
+    s = db_->Write(options, &batch);
+  }
+  return s;
 }
 
 Status BlobDBImpl::PutWithTTL(const WriteOptions& options, const Slice& key,
                               const Slice& value, uint64_t ttl) {
   uint64_t now = EpochNow();
   uint64_t expiration = kNoExpiration - now > ttl ? now + ttl : kNoExpiration;
-  return PutUntil(options, key, value, expiration);
-}
-
-Status BlobDBImpl::PutUntil(const WriteOptions& options, const Slice& key,
-                            const Slice& value, uint64_t expiration) {
   StopWatch write_sw(clock_, statistics_, BLOB_DB_WRITE_MICROS);
   RecordTick(statistics_, BLOB_DB_NUM_PUT);
   Status s;
diff --git a/utilities/blob_db/blob_db_impl.h b/utilities/blob_db/blob_db_impl.h
index b19c546f4848..6e3b830896a3 100644
--- a/utilities/blob_db/blob_db_impl.h
+++ b/utilities/blob_db/blob_db_impl.h
@@ -136,10 +136,6 @@ class BlobDBImpl : public BlobDB {
   Status PutWithTTL(const WriteOptions& options, const Slice& key,
                     const Slice& value, uint64_t ttl) override;
 
-  using BlobDB::PutUntil;
-  Status PutUntil(const WriteOptions& options, const Slice& key,
-                  const Slice& value, uint64_t expiration) override;
-
   using BlobDB::CompactFiles;
   Status CompactFiles(
       const CompactionOptions& compact_options,
diff --git a/utilities/blob_db/blob_db_test.cc b/utilities/blob_db/blob_db_test.cc
index d686c7bac264..4c0e75859756 100644
--- a/utilities/blob_db/blob_db_test.cc
+++ b/utilities/blob_db/blob_db_test.cc
@@ -116,10 +116,10 @@ class BlobDBTest : public testing::Test {
     }
   }
 
-  BlobDBImpl *blob_db_impl() { return static_cast<BlobDBImpl *>(blob_db_); }
+  BlobDBImpl* blob_db_impl() { return static_cast<BlobDBImpl*>(blob_db_); }
 
-  Status Put(const Slice &key, const Slice &value,
-             std::map<std::string, std::string> *data = nullptr) {
+  Status Put(const Slice& key, const Slice& value,
+             std::map<std::string, std::string>* data = nullptr) {
     Status s = blob_db_->Put(WriteOptions(), key, value);
     if (data != nullptr) {
       (*data)[key.ToString()] = value.ToString();
@@ -127,16 +127,16 @@ class BlobDBTest : public testing::Test {
     return s;
   }
 
-  void Delete(const std::string &key,
-              std::map<std::string, std::string> *data = nullptr) {
+  void Delete(const std::string& key,
+              std::map<std::string, std::string>* data = nullptr) {
     ASSERT_OK(blob_db_->Delete(WriteOptions(), key));
     if (data != nullptr) {
       data->erase(key);
     }
   }
 
-  Status PutWithTTL(const Slice &key, const Slice &value, uint64_t ttl,
-                    std::map<std::string, std::string> *data = nullptr) {
+  Status PutWithTTL(const Slice& key, const Slice& value, uint64_t ttl,
+                    std::map<std::string, std::string>* data = nullptr) {
     Status s = blob_db_->PutWithTTL(WriteOptions(), key, value, ttl);
     if (data != nullptr) {
       (*data)[key.ToString()] = value.ToString();
@@ -144,12 +144,8 @@ class BlobDBTest : public testing::Test {
     return s;
   }
 
-  Status PutUntil(const Slice &key, const Slice &value, uint64_t expiration) {
-    return blob_db_->PutUntil(WriteOptions(), key, value, expiration);
-  }
-
-  void PutRandomWithTTL(const std::string &key, uint64_t ttl, Random *rnd,
-                        std::map<std::string, std::string> *data = nullptr) {
+  void PutRandomWithTTL(const std::string& key, uint64_t ttl, Random* rnd,
+                        std::map<std::string, std::string>* data = nullptr) {
     int len = rnd->Next() % kMaxBlobSize + 1;
     std::string value = rnd->HumanReadableString(len);
     ASSERT_OK(
@@ -159,24 +155,13 @@ class BlobDBTest : public testing::Test {
     }
   }
 
-  void PutRandomUntil(const std::string &key, uint64_t expiration, Random *rnd,
-                      std::map<std::string, std::string> *data = nullptr) {
-    int len = rnd->Next() % kMaxBlobSize + 1;
-    std::string value = rnd->HumanReadableString(len);
-    ASSERT_OK(blob_db_->PutUntil(WriteOptions(), Slice(key), Slice(value),
-                                 expiration));
-    if (data != nullptr) {
-      (*data)[key] = value;
-    }
-  }
-
-  void PutRandom(const std::string &key, Random *rnd,
-                 std::map<std::string, std::string> *data = nullptr) {
+  void PutRandom(const std::string& key, Random* rnd,
+                 std::map<std::string, std::string>* data = nullptr) {
     PutRandom(blob_db_, key, rnd, data);
   }
 
-  void PutRandom(DB *db, const std::string &key, Random *rnd,
-                 std::map<std::string, std::string> *data = nullptr) {
+  void PutRandom(DB* db, const std::string& key, Random* rnd,
+                 std::map<std::string, std::string>* data = nullptr) {
     int len = rnd->Next() % kMaxBlobSize + 1;
     std::string value = rnd->HumanReadableString(len);
     ASSERT_OK(db->Put(WriteOptions(), Slice(key), Slice(value)));
@@ -186,8 +171,8 @@ class BlobDBTest : public testing::Test {
   }
 
   void PutRandomToWriteBatch(
-      const std::string &key, Random *rnd, WriteBatch *batch,
-      std::map<std::string, std::string> *data = nullptr) {
+      const std::string& key, Random* rnd, WriteBatch* batch,
+      std::map<std::string, std::string>* data = nullptr) {
     int len = rnd->Next() % kMaxBlobSize + 1;
     std::string value = rnd->HumanReadableString(len);
     ASSERT_OK(batch->Put(key, value));
@@ -197,14 +182,14 @@ class BlobDBTest : public testing::Test {
   }
 
   // Verify blob db contain expected data and nothing more.
-  void VerifyDB(const std::map<std::string, std::string> &data) {
+  void VerifyDB(const std::map<std::string, std::string>& data) {
     VerifyDB(blob_db_, data);
   }
 
-  void VerifyDB(DB *db, const std::map<std::string, std::string> &data) {
+  void VerifyDB(DB* db, const std::map<std::string, std::string>& data) {
     // Verify normal Get
-    auto *cfh = db->DefaultColumnFamily();
-    for (auto &p : data) {
+    auto* cfh = db->DefaultColumnFamily();
+    for (auto& p : data) {
       PinnableSlice value_slice;
       ASSERT_OK(db->Get(ReadOptions(), cfh, p.first, &value_slice));
       ASSERT_EQ(p.second, value_slice.ToString());
@@ -214,9 +199,9 @@ class BlobDBTest : public testing::Test {
     }
 
     // Verify iterators
-    Iterator *iter = db->NewIterator(ReadOptions());
+    Iterator* iter = db->NewIterator(ReadOptions());
     iter->SeekToFirst();
-    for (auto &p : data) {
+    for (auto& p : data) {
       ASSERT_TRUE(iter->Valid());
       ASSERT_EQ(p.first, iter->key().ToString());
       ASSERT_EQ(p.second, iter->value().ToString());
@@ -228,16 +213,16 @@ class BlobDBTest : public testing::Test {
   }
 
   void VerifyBaseDB(
-      const std::map<std::string, KeyVersion> &expected_versions) {
-    auto *bdb_impl = static_cast<BlobDBImpl *>(blob_db_);
-    DB *db = blob_db_->GetRootDB();
+      const std::map<std::string, KeyVersion>& expected_versions) {
+    auto* bdb_impl = static_cast<BlobDBImpl*>(blob_db_);
+    DB* db = blob_db_->GetRootDB();
     const size_t kMaxKeys = 10000;
     std::vector<KeyVersion> versions;
     ASSERT_OK(GetAllKeyVersions(db, {}, {}, kMaxKeys, &versions));
     ASSERT_EQ(expected_versions.size(), versions.size());
     size_t i = 0;
-    for (auto &key_version : expected_versions) {
-      const KeyVersion &expected_version = key_version.second;
+    for (auto& key_version : expected_versions) {
+      const KeyVersion& expected_version = key_version.second;
       ASSERT_EQ(expected_version.user_key, versions[i].user_key);
       ASSERT_EQ(expected_version.sequence, versions[i].sequence);
       ASSERT_EQ(expected_version.type, versions[i].type);
@@ -255,7 +240,7 @@ class BlobDBTest : public testing::Test {
   }
 
   void VerifyBaseDBBlobIndex(
-      const std::map<std::string, BlobIndexVersion> &expected_versions) {
+      const std::map<std::string, BlobIndexVersion>& expected_versions) {
     const size_t kMaxKeys = 10000;
     std::vector<KeyVersion> versions;
     ASSERT_OK(
@@ -263,8 +248,8 @@ class BlobDBTest : public testing::Test {
     ASSERT_EQ(versions.size(), expected_versions.size());
 
     size_t i = 0;
-    for (const auto &expected_pair : expected_versions) {
-      const BlobIndexVersion &expected_version = expected_pair.second;
+    for (const auto& expected_pair : expected_versions) {
+      const BlobIndexVersion& expected_version = expected_pair.second;
 
       ASSERT_EQ(versions[i].user_key, expected_version.user_key);
       ASSERT_EQ(versions[i].sequence, expected_version.sequence);
@@ -312,7 +297,7 @@ class BlobDBTest : public testing::Test {
   std::shared_ptr<MockSystemClock> mock_clock_;
   std::unique_ptr<Env> mock_env_;
   std::unique_ptr<FaultInjectionTestEnv> fault_injection_env_;
-  BlobDB *blob_db_;
+  BlobDB* blob_db_;
 };  // class BlobDBTest
 
 TEST_F(BlobDBTest, Put) {
@@ -346,33 +331,7 @@ TEST_F(BlobDBTest, PutWithTTL) {
                      (ttl <= 50 ? nullptr : &data));
   }
   mock_clock_->SetCurrentTime(100);
-  auto *bdb_impl = static_cast<BlobDBImpl *>(blob_db_);
-  auto blob_files = bdb_impl->TEST_GetBlobFiles();
-  ASSERT_EQ(1, blob_files.size());
-  ASSERT_TRUE(blob_files[0]->HasTTL());
-  ASSERT_OK(bdb_impl->TEST_CloseBlobFile(blob_files[0]));
-  VerifyDB(data);
-}
-
-TEST_F(BlobDBTest, PutUntil) {
-  Random rnd(301);
-  Options options;
-  options.env = mock_env_.get();
-  BlobDBOptions bdb_options;
-  bdb_options.ttl_range_secs = 1000;
-  bdb_options.min_blob_size = 0;
-  bdb_options.blob_file_size = 256 * 1000 * 1000;
-  bdb_options.disable_background_tasks = true;
-  Open(bdb_options, options);
-  std::map<std::string, std::string> data;
-  mock_clock_->SetCurrentTime(50);
-  for (size_t i = 0; i < 100; i++) {
-    uint64_t expiration = rnd.Next() % 100 + 50;
-    PutRandomUntil("key" + std::to_string(i), expiration, &rnd,
-                   (expiration <= 100 ? nullptr : &data));
-  }
-  mock_clock_->SetCurrentTime(100);
-  auto *bdb_impl = static_cast<BlobDBImpl *>(blob_db_);
+  auto* bdb_impl = static_cast<BlobDBImpl*>(blob_db_);
   auto blob_files = bdb_impl->TEST_GetBlobFiles();
   ASSERT_EQ(1, blob_files.size());
   ASSERT_TRUE(blob_files[0]->HasTTL());
@@ -391,8 +350,8 @@ TEST_F(BlobDBTest, StackableDBGet) {
     PutRandom("key" + std::to_string(i), &rnd, &data);
   }
   for (size_t i = 0; i < 100; i++) {
-    StackableDB *db = blob_db_;
-    ColumnFamilyHandle *column_family = db->DefaultColumnFamily();
+    StackableDB* db = blob_db_;
+    ColumnFamilyHandle* column_family = db->DefaultColumnFamily();
     std::string key = "key" + std::to_string(i);
     PinnableSlice pinnable_value;
     ASSERT_OK(db->Get(ReadOptions(), column_family, key, &pinnable_value));
@@ -429,7 +388,7 @@ TEST_F(BlobDBTest, GetIOError) {
   bdb_options.min_blob_size = 0;  // Make sure value write to blob file
   bdb_options.disable_background_tasks = true;
   Open(bdb_options, options);
-  ColumnFamilyHandle *column_family = blob_db_->DefaultColumnFamily();
+  ColumnFamilyHandle* column_family = blob_db_->DefaultColumnFamily();
   PinnableSlice value;
   ASSERT_OK(Put("foo", "bar"));
   fault_injection_env_->SetFilesystemActive(false, Status::IOError());
@@ -605,7 +564,7 @@ TEST_F(BlobDBTest, EnableDisableCompressionGC) {
   VerifyDB(data);
 
   blob_files = blob_db_impl()->TEST_GetBlobFiles();
-  for (const auto &bfile : blob_files) {
+  for (const auto& bfile : blob_files) {
     ASSERT_EQ(kNoCompression, bfile->GetCompressionType());
   }
 
@@ -625,7 +584,7 @@ TEST_F(BlobDBTest, EnableDisableCompressionGC) {
   VerifyDB(data);
 
   blob_files = blob_db_impl()->TEST_GetBlobFiles();
-  for (const auto &bfile : blob_files) {
+  for (const auto& bfile : blob_files) {
     ASSERT_EQ(kSnappyCompression, bfile->GetCompressionType());
   }
 }
@@ -676,7 +635,7 @@ TEST_F(BlobDBTest, ChangeCompressionGC) {
 
   blob_db_impl()->TEST_DeleteObsoleteFiles();
   blob_files = blob_db_impl()->TEST_GetBlobFiles();
-  for (const auto &bfile : blob_files) {
+  for (const auto& bfile : blob_files) {
     ASSERT_EQ(kSnappyCompression, bfile->GetCompressionType());
   }
 
@@ -693,7 +652,7 @@ TEST_F(BlobDBTest, ChangeCompressionGC) {
 
   blob_db_impl()->TEST_DeleteObsoleteFiles();
   blob_files = blob_db_impl()->TEST_GetBlobFiles();
-  for (const auto &bfile : blob_files) {
+  for (const auto& bfile : blob_files) {
     ASSERT_EQ(kNoCompression, bfile->GetCompressionType());
   }
 
@@ -717,7 +676,7 @@ TEST_F(BlobDBTest, ChangeCompressionGC) {
 
   blob_db_impl()->TEST_DeleteObsoleteFiles();
   blob_files = blob_db_impl()->TEST_GetBlobFiles();
-  for (const auto &bfile : blob_files) {
+  for (const auto& bfile : blob_files) {
     ASSERT_EQ(kLZ4Compression, bfile->GetCompressionType());
   }
 }
@@ -760,8 +719,8 @@ TEST_F(BlobDBTest, SstFileManager) {
   std::shared_ptr<SstFileManager> sst_file_manager(
       NewSstFileManager(mock_env_.get()));
   sst_file_manager->SetDeleteRateBytesPerSecond(1024 * 1024);
-  SstFileManagerImpl *sfm =
-      static_cast<SstFileManagerImpl *>(sst_file_manager.get());
+  SstFileManagerImpl* sfm =
+      static_cast<SstFileManagerImpl*>(sst_file_manager.get());
 
   BlobDBOptions bdb_options;
   bdb_options.min_blob_size = 0;
@@ -771,10 +730,10 @@ TEST_F(BlobDBTest, SstFileManager) {
 
   int files_scheduled_to_delete = 0;
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
-      "SstFileManagerImpl::ScheduleFileDeletion", [&](void *arg) {
+      "SstFileManagerImpl::ScheduleFileDeletion", [&](void* arg) {
         assert(arg);
-        const std::string *const file_path =
-            static_cast<const std::string *>(arg);
+        const std::string* const file_path =
+            static_cast<const std::string*>(arg);
         if (file_path->find(".blob") != std::string::npos) {
           ++files_scheduled_to_delete;
         }
@@ -805,10 +764,10 @@ TEST_F(BlobDBTest, SstFileManager) {
 TEST_F(BlobDBTest, SstFileManagerRestart) {
   int files_scheduled_to_delete = 0;
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
-      "SstFileManagerImpl::ScheduleFileDeletion", [&](void *arg) {
+      "SstFileManagerImpl::ScheduleFileDeletion", [&](void* arg) {
         assert(arg);
-        const std::string *const file_path =
-            static_cast<const std::string *>(arg);
+        const std::string* const file_path =
+            static_cast<const std::string*>(arg);
         if (file_path->find(".blob") != std::string::npos) {
           ++files_scheduled_to_delete;
         }
@@ -818,8 +777,8 @@ TEST_F(BlobDBTest, SstFileManagerRestart) {
   std::shared_ptr<SstFileManager> sst_file_manager(
       NewSstFileManager(mock_env_.get()));
   sst_file_manager->SetDeleteRateBytesPerSecond(1024 * 1024);
-  SstFileManagerImpl *sfm =
-      static_cast<SstFileManagerImpl *>(sst_file_manager.get());
+  SstFileManagerImpl* sfm =
+      static_cast<SstFileManagerImpl*>(sst_file_manager.get());
 
   BlobDBOptions bdb_options;
   bdb_options.min_blob_size = 0;
@@ -834,7 +793,7 @@ TEST_F(BlobDBTest, SstFileManagerRestart) {
   Close();
 
   // Create 3 dummy trash files under the blob_dir
-  const auto &fs = db_options.env->GetFileSystem();
+  const auto& fs = db_options.env->GetFileSystem();
   ASSERT_OK(CreateFile(fs, blob_dir + "/000666.blob.trash", "", false));
   ASSERT_OK(CreateFile(fs, blob_dir + "/000888.blob.trash", "", true));
   ASSERT_OK(CreateFile(fs, blob_dir + "/something_not_match.trash", "", false));
@@ -849,7 +808,7 @@ TEST_F(BlobDBTest, SstFileManagerRestart) {
   std::vector<std::string> all_files;
   ASSERT_OK(db_options.env->GetChildren(blob_dir, &all_files));
   int nfiles = 0;
-  for (const auto &f : all_files) {
+  for (const auto& f : all_files) {
     assert(!f.empty());
     if (f[0] == '.') {
       continue;
@@ -876,7 +835,7 @@ TEST_F(BlobDBTest, SnapshotAndGarbageCollection) {
     Destroy();
     Open(bdb_options, options);
 
-    const Snapshot *snapshot = nullptr;
+    const Snapshot* snapshot = nullptr;
 
     // First file
     ASSERT_OK(Put("key1", "value"));
@@ -938,8 +897,8 @@ TEST_F(BlobDBTest, ColumnFamilyNotSupported) {
   options.env = mock_env_.get();
   mock_clock_->SetCurrentTime(0);
   Open(BlobDBOptions(), options);
-  ColumnFamilyHandle *default_handle = blob_db_->DefaultColumnFamily();
-  ColumnFamilyHandle *handle = nullptr;
+  ColumnFamilyHandle* default_handle = blob_db_->DefaultColumnFamily();
+  ColumnFamilyHandle* handle = nullptr;
   std::string value;
   std::vector<std::string> values;
   // The call simply pass through to base db. It should succeed.
@@ -948,8 +907,6 @@ TEST_F(BlobDBTest, ColumnFamilyNotSupported) {
   ASSERT_TRUE(blob_db_->Put(WriteOptions(), handle, "k", "v").IsNotSupported());
   ASSERT_TRUE(blob_db_->PutWithTTL(WriteOptions(), handle, "k", "v", 60)
                   .IsNotSupported());
-  ASSERT_TRUE(blob_db_->PutUntil(WriteOptions(), handle, "k", "v", 100)
-                  .IsNotSupported());
   WriteBatch batch;
   ASSERT_OK(batch.Put("k1", "v1"));
   ASSERT_OK(batch.Put(handle, "k2", "v2"));
@@ -986,8 +943,9 @@ TEST_F(BlobDBTest, GetLiveFilesMetaData) {
     PutRandom("key" + std::to_string(i), &rnd, &data);
   }
 
-  constexpr uint64_t expiration = 1000ULL;
-  PutRandomUntil("key100", expiration, &rnd, &data);
+  // At time 0, the stored expiration equals TTL
+  constexpr uint64_t ttl = 1000ULL;
+  PutRandomWithTTL("key100", ttl, &rnd, &data);
 
   std::vector<LiveFileMetaData> metadata;
   blob_db_->GetLiveFilesMetaData(&metadata);
@@ -1003,7 +961,7 @@ TEST_F(BlobDBTest, GetLiveFilesMetaData) {
   const std::string filename2("/blob_dir/000002.blob");
   ASSERT_EQ(filename2, metadata[1].name);
   ASSERT_EQ(2, metadata[1].file_number);
-  ASSERT_EQ(expiration, metadata[1].oldest_ancester_time);
+  ASSERT_EQ(ttl, metadata[1].oldest_ancester_time);
   ASSERT_EQ(kDefaultColumnFamilyName, metadata[1].column_family_name);
 
   std::vector<std::string> livefile;
@@ -1046,7 +1004,7 @@ TEST_F(BlobDBTest, MigrateFromPlainRocksDB) {
   // Write to plain rocksdb.
   Options options;
   options.create_if_missing = true;
-  DB *db = nullptr;
+  DB* db = nullptr;
   ASSERT_OK(DB::Open(options, dbname_, &db));
   for (size_t i = 0; i < kNumIteration; i++) {
     auto key_index = rnd.Next() % kNumKey;
@@ -1122,8 +1080,7 @@ TEST_F(BlobDBTest, FIFOEviction) {
 
   std::atomic<int> evict_count{0};
   SyncPoint::GetInstance()->SetCallBack(
-      "BlobDBImpl::EvictOldestBlobFile:Evicted",
-      [&](void *) { evict_count++; });
+      "BlobDBImpl::EvictOldestBlobFile:Evicted", [&](void*) { evict_count++; });
   SyncPoint::GetInstance()->EnableProcessing();
 
   // Each stored blob has an overhead of 32 bytes currently.
@@ -1183,8 +1140,7 @@ TEST_F(BlobDBTest, FIFOEviction_NoOldestFileToEvict) {
 
   std::atomic<int> evict_count{0};
   SyncPoint::GetInstance()->SetCallBack(
-      "BlobDBImpl::EvictOldestBlobFile:Evicted",
-      [&](void *) { evict_count++; });
+      "BlobDBImpl::EvictOldestBlobFile:Evicted", [&](void*) { evict_count++; });
   SyncPoint::GetInstance()->EnableProcessing();
 
   std::string value(2000, 'v');
@@ -1319,7 +1275,7 @@ TEST_F(BlobDBTest, InlineSmallValues) {
   for (size_t i = 0; i < 1000; i++) {
     bool is_small_value = rnd.Next() % 2;
     bool has_ttl = rnd.Next() % 2;
-    uint64_t expiration = rnd.Next() % kMaxExpiration;
+    uint64_t ttl = rnd.Next() % kMaxExpiration;
     int len = is_small_value ? 50 : 200;
     std::string key = "key" + std::to_string(i);
     std::string value = rnd.HumanReadableString(len);
@@ -1329,7 +1285,7 @@ TEST_F(BlobDBTest, InlineSmallValues) {
     if (!has_ttl) {
       ASSERT_OK(blob_db_->Put(WriteOptions(), key, value));
     } else {
-      ASSERT_OK(blob_db_->PutUntil(WriteOptions(), key, value, expiration));
+      ASSERT_OK(blob_db_->PutWithTTL(WriteOptions(), key, value, ttl));
     }
     ASSERT_EQ(blob_db_->GetLatestSequenceNumber(), sequence);
     versions[key] =
@@ -1338,7 +1294,7 @@ TEST_F(BlobDBTest, InlineSmallValues) {
   }
   VerifyDB(data);
   VerifyBaseDB(versions);
-  auto *bdb_impl = static_cast<BlobDBImpl *>(blob_db_);
+  auto* bdb_impl = static_cast<BlobDBImpl*>(blob_db_);
   auto blob_files = bdb_impl->TEST_GetBlobFiles();
   ASSERT_EQ(2, blob_files.size());
   std::shared_ptr<BlobFile> non_ttl_file;
@@ -1357,8 +1313,8 @@ TEST_F(BlobDBTest, InlineSmallValues) {
 TEST_F(BlobDBTest, UserCompactionFilter) {
   class CustomerFilter : public CompactionFilter {
    public:
-    bool Filter(int /*level*/, const Slice & /*key*/, const Slice &value,
-                std::string *new_value, bool *value_changed) const override {
+    bool Filter(int /*level*/, const Slice& /*key*/, const Slice& value,
+                std::string* new_value, bool* value_changed) const override {
       *value_changed = false;
       // changing value size to test value transitions between inlined data
       // and stored-in-blob data
@@ -1380,12 +1336,12 @@ TEST_F(BlobDBTest, UserCompactionFilter) {
       return false;
     }
     bool IgnoreSnapshots() const override { return true; }
-    const char *Name() const override { return "CustomerFilter"; }
+    const char* Name() const override { return "CustomerFilter"; }
   };
   class CustomerFilterFactory : public CompactionFilterFactory {
-    const char *Name() const override { return "CustomerFilterFactory"; }
+    const char* Name() const override { return "CustomerFilterFactory"; }
     std::unique_ptr<CompactionFilter> CreateCompactionFilter(
-        const CompactionFilter::Context & /*context*/) override {
+        const CompactionFilter::Context& /*context*/) override {
       return std::unique_ptr<CompactionFilter>(new CustomerFilter());
     }
   };
@@ -1467,14 +1423,14 @@ TEST_F(BlobDBTest, UserCompactionFilter) {
 TEST_F(BlobDBTest, UserCompactionFilter_BlobIOError) {
   class CustomerFilter : public CompactionFilter {
    public:
-    bool Filter(int /*level*/, const Slice & /*key*/, const Slice &value,
-                std::string *new_value, bool *value_changed) const override {
+    bool Filter(int /*level*/, const Slice& /*key*/, const Slice& value,
+                std::string* new_value, bool* value_changed) const override {
       *new_value = value.ToString() + "_new";
       *value_changed = true;
       return false;
     }
     bool IgnoreSnapshots() const override { return true; }
-    const char *Name() const override { return "CustomerFilter"; }
+    const char* Name() const override { return "CustomerFilter"; }
   };
 
   constexpr size_t kNumPuts = 100;
@@ -1518,7 +1474,7 @@ TEST_F(BlobDBTest, UserCompactionFilter_BlobIOError) {
     VerifyDB(data);
 
     SyncPoint::GetInstance()->SetCallBack(
-        io_failure_cases[case_num], [&](void * /*arg*/) {
+        io_failure_cases[case_num], [&](void* /*arg*/) {
           fault_injection_env_->SetFilesystemActive(false, Status::IOError());
         });
     SyncPoint::GetInstance()->EnableProcessing();
@@ -1542,7 +1498,7 @@ TEST_F(BlobDBTest, UserCompactionFilter_BlobIOError) {
 TEST_F(BlobDBTest, FilterExpiredBlobIndex) {
   constexpr size_t kNumKeys = 100;
   constexpr size_t kNumPuts = 1000;
-  constexpr uint64_t kMaxExpiration = 1000;
+  constexpr uint64_t kMaxTTL = 1000;
   constexpr uint64_t kCompactTime = 500;
   constexpr uint64_t kMinBlobSize = 100;
   Random rnd(301);
@@ -1559,14 +1515,15 @@ TEST_F(BlobDBTest, FilterExpiredBlobIndex) {
   for (size_t i = 0; i < kNumPuts; i++) {
     bool is_small_value = rnd.Next() % 2;
     bool has_ttl = rnd.Next() % 2;
-    uint64_t expiration = rnd.Next() % kMaxExpiration;
+    // At time 0, stored expiration equals TTL
+    uint64_t ttl = rnd.Next() % kMaxTTL;
     int len = is_small_value ? 10 : 200;
     std::string key = "key" + std::to_string(rnd.Next() % kNumKeys);
     std::string value = rnd.HumanReadableString(len);
     if (!has_ttl) {
       if (is_small_value) {
         std::string blob_entry;
-        BlobIndex::EncodeInlinedTTL(&blob_entry, expiration, value);
+        BlobIndex::EncodeInlinedTTL(&blob_entry, ttl, value);
         // Fake blob index with TTL. See what it will do.
         ASSERT_GT(kMinBlobSize, blob_entry.size());
         value = blob_entry;
@@ -1574,8 +1531,8 @@ TEST_F(BlobDBTest, FilterExpiredBlobIndex) {
       ASSERT_OK(Put(key, value));
       data_after_compact[key] = value;
     } else {
-      ASSERT_OK(PutUntil(key, value, expiration));
-      if (expiration <= kCompactTime) {
+      ASSERT_OK(blob_db_->PutWithTTL(WriteOptions(), key, value, ttl));
+      if (ttl <= kCompactTime) {
         data_after_compact.erase(key);
       } else {
         data_after_compact[key] = value;
@@ -1588,7 +1545,7 @@ TEST_F(BlobDBTest, FilterExpiredBlobIndex) {
   mock_clock_->SetCurrentTime(kCompactTime);
   // Take a snapshot before compaction. Make sure expired blob indexes is
   // filtered regardless of snapshot.
-  const Snapshot *snapshot = blob_db_->GetSnapshot();
+  const Snapshot* snapshot = blob_db_->GetSnapshot();
   // Issue manual compaction to trigger compaction filter.
   ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
   blob_db_->ReleaseSnapshot(snapshot);
@@ -1597,7 +1554,7 @@ TEST_F(BlobDBTest, FilterExpiredBlobIndex) {
   const size_t kMaxKeys = 10000;
   ASSERT_OK(GetAllKeyVersions(blob_db_, {}, {}, kMaxKeys, &versions));
   ASSERT_EQ(data_after_compact.size(), versions.size());
-  for (auto &version : versions) {
+  for (auto& version : versions) {
     ASSERT_TRUE(data_after_compact.count(version.user_key) > 0);
   }
   VerifyDB(data_after_compact);
@@ -1627,7 +1584,7 @@ TEST_F(BlobDBTest, FilterFileNotAvailable) {
 
   const size_t kMaxKeys = 10000;
 
-  DB *base_db = blob_db_->GetRootDB();
+  DB* base_db = blob_db_->GetRootDB();
   std::vector<KeyVersion> versions;
   ASSERT_OK(GetAllKeyVersions(base_db, {}, {}, kMaxKeys, &versions));
   ASSERT_EQ(2, versions.size());
@@ -1759,7 +1716,8 @@ TEST_F(BlobDBTest, FilterForFIFOEviction) {
 TEST_F(BlobDBTest, GarbageCollection) {
   constexpr size_t kNumPuts = 1 << 10;
 
-  constexpr uint64_t kExpiration = 1000;
+  // At time 0, stored expiration equals TTL
+  constexpr uint64_t kTTL = 1000;
   constexpr uint64_t kCompactTime = 500;
 
   constexpr uint64_t kKeySize = 7;  // "key" + 4 digits
@@ -1822,13 +1780,13 @@ TEST_F(BlobDBTest, GarbageCollection) {
     const std::string value = rnd.HumanReadableString(kLargeValueSize);
     const SequenceNumber sequence = blob_db_->GetLatestSequenceNumber() + 1;
 
-    ASSERT_OK(PutUntil(key, value, kExpiration));
+    ASSERT_OK(blob_db_->PutWithTTL(WriteOptions(), key, value, kTTL));
     ASSERT_EQ(blob_db_->GetLatestSequenceNumber(), sequence);
 
     data[key] = value;
     blob_value_versions[key] = KeyVersion(key, value, sequence, kTypeBlobIndex);
     blob_index_versions[key] =
-        BlobIndexVersion(key, /* file_number */ kNumBlobFiles + 1, kExpiration,
+        BlobIndexVersion(key, /* file_number */ kNumBlobFiles + 1, kTTL,
                          sequence, kTypeBlobIndex);
   }
 
@@ -1838,13 +1796,13 @@ TEST_F(BlobDBTest, GarbageCollection) {
     const std::string value = rnd.HumanReadableString(kSmallValueSize);
     const SequenceNumber sequence = blob_db_->GetLatestSequenceNumber() + 1;
 
-    ASSERT_OK(PutUntil(key, value, kExpiration));
+    ASSERT_OK(blob_db_->PutWithTTL(WriteOptions(), key, value, kTTL));
     ASSERT_EQ(blob_db_->GetLatestSequenceNumber(), sequence);
 
     data[key] = value;
     blob_value_versions[key] = KeyVersion(key, value, sequence, kTypeBlobIndex);
-    blob_index_versions[key] = BlobIndexVersion(
-        key, kInvalidBlobFileNumber, kExpiration, sequence, kTypeBlobIndex);
+    blob_index_versions[key] = BlobIndexVersion(key, kInvalidBlobFileNumber,
+                                                kTTL, sequence, kTypeBlobIndex);
   }
 
   // Finally, add a small non-TTL value (which will be stored as a regular
@@ -1888,8 +1846,8 @@ TEST_F(BlobDBTest, GarbageCollection) {
   // compaction.
   VerifyDB(data);
 
-  for (auto &pair : blob_value_versions) {
-    KeyVersion &version = pair.second;
+  for (auto& pair : blob_value_versions) {
+    KeyVersion& version = pair.second;
     version.sequence = 0;
   }
 
@@ -1897,8 +1855,8 @@ TEST_F(BlobDBTest, GarbageCollection) {
 
   const uint64_t cutoff = static_cast<uint64_t>(
       bdb_options.garbage_collection_cutoff * kNumBlobFiles);
-  for (auto &pair : blob_index_versions) {
-    BlobIndexVersion &version = pair.second;
+  for (auto& pair : blob_index_versions) {
+    BlobIndexVersion& version = pair.second;
 
     version.sequence = 0;
 
@@ -1915,7 +1873,7 @@ TEST_F(BlobDBTest, GarbageCollection) {
 
   VerifyBaseDBBlobIndex(blob_index_versions);
 
-  const Statistics *const statistics = options.statistics.get();
+  const Statistics* const statistics = options.statistics.get();
   assert(statistics);
 
   ASSERT_EQ(statistics->getTickerCount(BLOB_DB_GC_NUM_FILES), cutoff);
@@ -1979,7 +1937,7 @@ TEST_F(BlobDBTest, GarbageCollectionFailure) {
   ASSERT_TRUE(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)
                   .IsIOError());
 
-  const Statistics *const statistics = db_options.statistics.get();
+  const Statistics* const statistics = db_options.statistics.get();
   assert(statistics);
 
   ASSERT_EQ(statistics->getTickerCount(BLOB_DB_GC_NUM_FILES), 0);
@@ -2116,7 +2074,7 @@ TEST_F(BlobDBTest, MaintainBlobFileToSstMapping) {
     const std::vector<bool> expected_obsolete{false, false, false, false,
                                               false};
     for (size_t i = 0; i < 5; ++i) {
-      const auto &blob_file = blob_files[i];
+      const auto& blob_file = blob_files[i];
       ASSERT_EQ(blob_file->GetLinkedSstFiles(), expected_sst_files[i]);
       ASSERT_EQ(blob_file->Obsolete(), expected_obsolete[i]);
     }
@@ -2144,7 +2102,7 @@ TEST_F(BlobDBTest, MaintainBlobFileToSstMapping) {
     const std::vector<bool> expected_obsolete{false, false, false, false,
                                               false};
     for (size_t i = 0; i < 5; ++i) {
-      const auto &blob_file = blob_files[i];
+      const auto& blob_file = blob_files[i];
       ASSERT_EQ(blob_file->GetLinkedSstFiles(), expected_sst_files[i]);
       ASSERT_EQ(blob_file->Obsolete(), expected_obsolete[i]);
     }
@@ -2173,7 +2131,7 @@ TEST_F(BlobDBTest, MaintainBlobFileToSstMapping) {
     const std::vector<bool> expected_obsolete{false, false, false, false,
                                               false};
     for (size_t i = 0; i < 5; ++i) {
-      const auto &blob_file = blob_files[i];
+      const auto& blob_file = blob_files[i];
       ASSERT_EQ(blob_file->GetLinkedSstFiles(), expected_sst_files[i]);
       ASSERT_EQ(blob_file->Obsolete(), expected_obsolete[i]);
     }
@@ -2211,7 +2169,7 @@ TEST_F(BlobDBTest, MaintainBlobFileToSstMapping) {
         {}, {7}, {3, 8, 23}, {4, 9}, {5, 10, 22}};
     const std::vector<bool> expected_obsolete{true, false, false, false, false};
     for (size_t i = 0; i < 5; ++i) {
-      const auto &blob_file = blob_files[i];
+      const auto& blob_file = blob_files[i];
       ASSERT_EQ(blob_file->GetLinkedSstFiles(), expected_sst_files[i]);
       ASSERT_EQ(blob_file->Obsolete(), expected_obsolete[i]);
     }
@@ -2241,7 +2199,7 @@ TEST_F(BlobDBTest, MaintainBlobFileToSstMapping) {
         {}, {7}, {3, 8, 23}, {4, 9}, {5, 10, 22}};
     const std::vector<bool> expected_obsolete{true, false, false, false, false};
     for (size_t i = 0; i < 5; ++i) {
-      const auto &blob_file = blob_files[i];
+      const auto& blob_file = blob_files[i];
       ASSERT_EQ(blob_file->GetLinkedSstFiles(), expected_sst_files[i]);
       ASSERT_EQ(blob_file->Obsolete(), expected_obsolete[i]);
     }
@@ -2272,7 +2230,7 @@ TEST_F(BlobDBTest, MaintainBlobFileToSstMapping) {
         {}, {}, {3, 8, 23, 25}, {4, 9}, {5, 10}};
     const std::vector<bool> expected_obsolete{true, false, false, false, false};
     for (size_t i = 0; i < 5; ++i) {
-      const auto &blob_file = blob_files[i];
+      const auto& blob_file = blob_files[i];
       ASSERT_EQ(blob_file->GetLinkedSstFiles(), expected_sst_files[i]);
       ASSERT_EQ(blob_file->Obsolete(), expected_obsolete[i]);
     }
@@ -2302,7 +2260,7 @@ TEST_F(BlobDBTest, MaintainBlobFileToSstMapping) {
         {}, {}, {3, 8, 23, 25}, {4, 9}, {5, 10}};
     const std::vector<bool> expected_obsolete{true, true, false, false, false};
     for (size_t i = 0; i < 5; ++i) {
-      const auto &blob_file = blob_files[i];
+      const auto& blob_file = blob_files[i];
       ASSERT_EQ(blob_file->GetLinkedSstFiles(), expected_sst_files[i]);
       ASSERT_EQ(blob_file->Obsolete(), expected_obsolete[i]);
     }
@@ -2336,15 +2294,15 @@ TEST_F(BlobDBTest, ShutdownWait) {
   });
   // Force all tasks to be scheduled immediately.
   SyncPoint::GetInstance()->SetCallBack(
-      "TimeQueue::Add:item.end", [&](void *arg) {
-        std::chrono::steady_clock::time_point *tp =
-            static_cast<std::chrono::steady_clock::time_point *>(arg);
+      "TimeQueue::Add:item.end", [&](void* arg) {
+        std::chrono::steady_clock::time_point* tp =
+            static_cast<std::chrono::steady_clock::time_point*>(arg);
         *tp =
             std::chrono::steady_clock::now() - std::chrono::milliseconds(10000);
       });
 
   SyncPoint::GetInstance()->SetCallBack(
-      "BlobDBImpl::EvictExpiredFiles:cb", [&](void * /*arg*/) {
+      "BlobDBImpl::EvictExpiredFiles:cb", [&](void* /*arg*/) {
         // Sleep 3 ms to increase the chance of data race.
         // We've synced up the code so that EvictExpiredFiles()
         // is called concurrently with ~BlobDBImpl().
@@ -2419,7 +2377,7 @@ TEST_F(BlobDBTest, SyncBlobFileBeforeCloseIOError) {
   ASSERT_EQ(blob_files.size(), 1);
 
   SyncPoint::GetInstance()->SetCallBack(
-      "BlobLogWriter::Sync", [this](void * /* arg */) {
+      "BlobLogWriter::Sync", [this](void* /* arg */) {
         fault_injection_env_->SetFilesystemActive(false, Status::IOError());
       });
   SyncPoint::GetInstance()->EnableProcessing();
@@ -2436,7 +2394,7 @@ TEST_F(BlobDBTest, SyncBlobFileBeforeCloseIOError) {
 }  // namespace ROCKSDB_NAMESPACE::blob_db
 
 // A black-box test for the ttl wrapper around rocksdb
-int main(int argc, char **argv) {
+int main(int argc, char** argv) {
   ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();

From acfea34c91ad975aa7be080fe1a68296609e99aa Mon Sep 17 00:00:00 2001
From: Maciej Szeszko <mszeszko@meta.com>
Date: Tue, 27 Jan 2026 13:06:39 -0800
Subject: [PATCH 440/500] Work around GCC 12 false positive warning for
 string::insert (#14265)

Summary:
Work around a warning/linter false positive related to the use of string::insert. The code in question is legal C++, but GCC 12's libstdc++ implementation of string::insert internally uses memcpy, which can trigger undefined behavior warnings when the source and destination overlap.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14265

Reviewed By: pdillinger

Differential Revision: D91594561

Pulled By: mszeszko-meta

fbshipit-source-id: faa1487aba11a6581bf9ac8eb89442b6e4120427
---
 table/unique_id.cc | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/table/unique_id.cc b/table/unique_id.cc
index 8bfa8bcfd383..758ad574e948 100644
--- a/table/unique_id.cc
+++ b/table/unique_id.cc
@@ -199,12 +199,16 @@ Status GetUniqueIdFromTableProperties(const TableProperties &props,
 }
 
 std::string UniqueIdToHumanString(const std::string &id) {
-  // Not so efficient, but that's OK
-  std::string str = Slice(id).ToString(/*hex*/ true);
-  for (size_t i = 16; i < str.size(); i += 17) {
-    str.insert(i, "-");
+  std::string hex = Slice(id).ToString(/*hex*/ true);
+  std::string result;
+  result.reserve(hex.size() + hex.size() / 16);
+  for (size_t i = 0; i < hex.size(); i++) {
+    if (i > 0 && i % 16 == 0) {
+      result.push_back('-');
+    }
+    result.push_back(hex[i]);
   }
-  return str;
+  return result;
 }
 
 std::string InternalUniqueIdToHumanString(UniqueIdPtr in) {

From 2366f63e4fd9ad807dda0ed0e905b7393364f56b Mon Sep 17 00:00:00 2001
From: Maciej Szeszko <mszeszko@meta.com>
Date: Wed, 28 Jan 2026 11:30:37 -0800
Subject: [PATCH 441/500] Remove compression support (#14266)

Summary:
Pull Request resolved: https://github.com/facebook/rocksdb/pull/14266

Compression is unused in production, making this dead code that adds unnecessary complexity.

Core changes:
- Remove `compression` field from `BlobDBOptions`
- Remove compression/decompression methods (`GetCompressedSlice`,
  `DecompressSlice`, `BlobDecompressor`)
- Simplify `ReadBlobFromOldFile` and `GetBlobValue` to handle only
  uncompressed blobs
- Update compaction filter to skip compression/decompression

CLI tool cleanup:
- Remove `--blob_db_compression_type` flag from db_bench (Stacked BlobDB)
- Remove `--show_uncompressed_blob` from blob_dump tool
- Remove `--dump_uncompressed_blobs` from ldb dump/file_dump commands
- blob_dump_tool now fails fast with NotSupported for compressed files

Tests:
- Remove compression-related tests (`Compression`, `DecompressAfterReopen`,
  `EnableDisableCompressionGC`, `ChangeCompressionGC`)

Reviewed By: xingbowang

Differential Revision: D91088957

fbshipit-source-id: 496ee41dcbd0023b794aa8a6d7dcc9c2451b7470
---
 include/rocksdb/utilities/ldb_cmd.h         |   1 -
 tools/blob_dump.cc                          |  17 +-
 tools/db_bench_tool.cc                      |  11 --
 tools/ldb_cmd.cc                            |  47 ++---
 tools/ldb_cmd_impl.h                        |   2 -
 tools/ldb_test.py                           |   4 +-
 utilities/blob_db/blob_compaction_filter.cc |  67 +------
 utilities/blob_db/blob_compaction_filter.h  |   3 +-
 utilities/blob_db/blob_db.cc                |   3 -
 utilities/blob_db/blob_db.h                 |   3 -
 utilities/blob_db/blob_db_impl.cc           | 110 ++---------
 utilities/blob_db/blob_db_impl.h            |  14 +-
 utilities/blob_db/blob_db_test.cc           | 206 --------------------
 utilities/blob_db/blob_dump_tool.cc         |  59 +-----
 utilities/blob_db/blob_dump_tool.h          |  12 +-
 utilities/blob_db/blob_file.cc              |   7 +-
 utilities/blob_db/blob_file.h               |   8 +-
 17 files changed, 54 insertions(+), 520 deletions(-)

diff --git a/include/rocksdb/utilities/ldb_cmd.h b/include/rocksdb/utilities/ldb_cmd.h
index e0a1f06a7c8a..313b4ea33281 100644
--- a/include/rocksdb/utilities/ldb_cmd.h
+++ b/include/rocksdb/utilities/ldb_cmd.h
@@ -71,7 +71,6 @@ class LDBCommand {
   static const std::string ARG_BLOB_FILE_STARTING_LEVEL;
   static const std::string ARG_PREPOPULATE_BLOB_CACHE;
   static const std::string ARG_DECODE_BLOB_INDEX;
-  static const std::string ARG_DUMP_UNCOMPRESSED_BLOBS;
   static const std::string ARG_READ_TIMESTAMP;
   static const std::string ARG_GET_WRITE_UNIX_TIME;
 
diff --git a/tools/blob_dump.cc b/tools/blob_dump.cc
index 23b5f8f7903a..520b194ee1a2 100644
--- a/tools/blob_dump.cc
+++ b/tools/blob_dump.cc
@@ -27,12 +27,10 @@ int main(int argc, char** argv) {
       {"file", required_argument, nullptr, 'f'},
       {"show_key", optional_argument, nullptr, 'k'},
       {"show_blob", optional_argument, nullptr, 'b'},
-      {"show_uncompressed_blob", optional_argument, nullptr, 'r'},
       {"show_summary", optional_argument, nullptr, 's'},
   };
   DisplayType show_key = DisplayType::kRaw;
   DisplayType show_blob = DisplayType::kNone;
-  DisplayType show_uncompressed_blob = DisplayType::kNone;
   bool show_summary = false;
   std::string file;
   while (true) {
@@ -47,7 +45,6 @@ int main(int argc, char** argv) {
                 "Usage: blob_dump --file=filename "
                 "[--show_key[=none|raw|hex|detail]] "
                 "[--show_blob[=none|raw|hex|detail]] "
-                "[--show_uncompressed_blob[=none|raw|hex|detail]] "
                 "[--show_summary]\n");
         return 0;
       case 'f':
@@ -73,17 +70,6 @@ int main(int argc, char** argv) {
           show_blob = DisplayType::kHex;
         }
         break;
-      case 'r':
-        if (optarg) {
-          if (display_types.count(arg_str) == 0) {
-            fprintf(stderr, "Unrecognized blob display type.\n");
-            return -1;
-          }
-          show_uncompressed_blob = display_types.at(arg_str);
-        } else {
-          show_uncompressed_blob = DisplayType::kHex;
-        }
-        break;
       case 's':
         show_summary = true;
         break;
@@ -93,8 +79,7 @@ int main(int argc, char** argv) {
     }
   }
   BlobDumpTool tool;
-  Status s =
-      tool.Run(file, show_key, show_blob, show_uncompressed_blob, show_summary);
+  Status s = tool.Run(file, show_key, show_blob, show_summary);
   if (!s.ok()) {
     fprintf(stderr, "Failed: %s\n", s.ToString().c_str());
     return -1;
diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc
index f2c2798695c5..9deb9e093eb2 100644
--- a/tools/db_bench_tool.cc
+++ b/tools/db_bench_tool.cc
@@ -1087,12 +1087,6 @@ DEFINE_uint64(blob_db_file_size,
               ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().blob_file_size,
               "[Stacked BlobDB] Target size of each blob file.");
 
-DEFINE_string(
-    blob_db_compression_type, "snappy",
-    "[Stacked BlobDB] Algorithm to use to compress blobs in blob files.");
-static enum ROCKSDB_NAMESPACE::CompressionType
-    FLAGS_blob_db_compression_type_e = ROCKSDB_NAMESPACE::kSnappyCompression;
-
 // Integrated BlobDB options
 DEFINE_bool(
     enable_blob_files,
@@ -5204,7 +5198,6 @@ class Benchmark {
       blob_db_options.min_blob_size = FLAGS_blob_db_min_blob_size;
       blob_db_options.bytes_per_sync = FLAGS_blob_db_bytes_per_sync;
       blob_db_options.blob_file_size = FLAGS_blob_db_file_size;
-      blob_db_options.compression = FLAGS_blob_db_compression_type_e;
       blob_db::BlobDB* ptr = nullptr;
       s = hooks.Open(options, blob_db_options, db_name, &ptr);
       if (s.ok()) {
@@ -9191,10 +9184,6 @@ int db_bench_tool(int argc, char** argv, ToolHooks& hooks) {
   FLAGS_compressed_secondary_cache_compression_type_e = StringToCompressionType(
       FLAGS_compressed_secondary_cache_compression_type.c_str());
 
-  // Stacked BlobDB
-  FLAGS_blob_db_compression_type_e =
-      StringToCompressionType(FLAGS_blob_db_compression_type.c_str());
-
   int env_opts = !FLAGS_env_uri.empty() + !FLAGS_fs_uri.empty();
   if (env_opts > 1) {
     fprintf(stderr, "Error: --env_uri and --fs_uri are mutually exclusive\n");
diff --git a/tools/ldb_cmd.cc b/tools/ldb_cmd.cc
index 328f7d875414..8fa6d244e643 100644
--- a/tools/ldb_cmd.cc
+++ b/tools/ldb_cmd.cc
@@ -110,8 +110,6 @@ const std::string LDBCommand::ARG_BLOB_FILE_STARTING_LEVEL =
 const std::string LDBCommand::ARG_PREPOPULATE_BLOB_CACHE =
     "prepopulate_blob_cache";
 const std::string LDBCommand::ARG_DECODE_BLOB_INDEX = "decode_blob_index";
-const std::string LDBCommand::ARG_DUMP_UNCOMPRESSED_BLOBS =
-    "dump_uncompressed_blobs";
 const std::string LDBCommand::ARG_READ_TIMESTAMP = "read_timestamp";
 const std::string LDBCommand::ARG_GET_WRITE_UNIX_TIME = "get_write_unix_time";
 
@@ -201,7 +199,7 @@ void DumpSstFile(Options options, std::string filename, bool output_hex,
                  std::string from_key = "", std::string to_key = "");
 
 void DumpBlobFile(const std::string& filename, bool is_key_hex,
-                  bool is_value_hex, bool dump_uncompressed_blobs);
+                  bool is_value_hex);
 
 Status EncodeUserProvidedTimestamp(const std::string& user_timestamp,
                                    std::string* ts_buf);
@@ -2288,13 +2286,12 @@ DBDumperCommand::DBDumperCommand(
     const std::vector<std::string>& /*params*/,
     const std::map<std::string, std::string>& options,
     const std::vector<std::string>& flags)
-    : LDBCommand(
-          options, flags, true /* is_read_only */,
-          BuildCmdLineOptions(
-              {ARG_TTL, ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX, ARG_FROM, ARG_TO,
-               ARG_MAX_KEYS, ARG_COUNT_ONLY, ARG_COUNT_DELIM, ARG_STATS,
-               ARG_TTL_START, ARG_TTL_END, ARG_TTL_BUCKET, ARG_TIMESTAMP,
-               ARG_PATH, ARG_DECODE_BLOB_INDEX, ARG_DUMP_UNCOMPRESSED_BLOBS})),
+    : LDBCommand(options, flags, true /* is_read_only */,
+                 BuildCmdLineOptions(
+                     {ARG_TTL, ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX, ARG_FROM,
+                      ARG_TO, ARG_MAX_KEYS, ARG_COUNT_ONLY, ARG_COUNT_DELIM,
+                      ARG_STATS, ARG_TTL_START, ARG_TTL_END, ARG_TTL_BUCKET,
+                      ARG_TIMESTAMP, ARG_PATH, ARG_DECODE_BLOB_INDEX})),
       null_from_(true),
       null_to_(true),
       max_keys_(-1),
@@ -2342,7 +2339,6 @@ DBDumperCommand::DBDumperCommand(
   print_stats_ = IsFlagPresent(flags, ARG_STATS);
   count_only_ = IsFlagPresent(flags, ARG_COUNT_ONLY);
   decode_blob_index_ = IsFlagPresent(flags, ARG_DECODE_BLOB_INDEX);
-  dump_uncompressed_blobs_ = IsFlagPresent(flags, ARG_DUMP_UNCOMPRESSED_BLOBS);
 
   if (is_key_hex_) {
     if (!null_from_) {
@@ -2377,7 +2373,6 @@ void DBDumperCommand::Help(std::string& ret) {
   ret.append(" [--" + ARG_TTL_END + "=<N>:- is exclusive]");
   ret.append(" [--" + ARG_PATH + "=<path_to_a_file>]");
   ret.append(" [--" + ARG_DECODE_BLOB_INDEX + "]");
-  ret.append(" [--" + ARG_DUMP_UNCOMPRESSED_BLOBS + "]");
   ret.append("\n");
 }
 
@@ -2424,8 +2419,7 @@ void DBDumperCommand::DoCommand() {
                          /*  json_ */ false, column_families_);
         break;
       case kBlobFile:
-        DumpBlobFile(path_, is_key_hex_, is_value_hex_,
-                     dump_uncompressed_blobs_);
+        DumpBlobFile(path_, is_key_hex_, is_value_hex_);
         break;
       default:
         exec_state_ = LDBCommandExecuteResult::Failed(
@@ -4718,22 +4712,16 @@ void DumpSstFile(Options options, std::string filename, bool output_hex,
 }
 
 void DumpBlobFile(const std::string& filename, bool is_key_hex,
-                  bool is_value_hex, bool dump_uncompressed_blobs) {
+                  bool is_value_hex) {
   using ROCKSDB_NAMESPACE::blob_db::BlobDumpTool;
   BlobDumpTool tool;
-  BlobDumpTool::DisplayType blob_type = is_value_hex
+  BlobDumpTool::DisplayType show_blob = is_value_hex
                                             ? BlobDumpTool::DisplayType::kHex
                                             : BlobDumpTool::DisplayType::kRaw;
-  BlobDumpTool::DisplayType show_uncompressed_blob =
-      dump_uncompressed_blobs ? blob_type : BlobDumpTool::DisplayType::kNone;
-  BlobDumpTool::DisplayType show_blob =
-      dump_uncompressed_blobs ? BlobDumpTool::DisplayType::kNone : blob_type;
-
   BlobDumpTool::DisplayType show_key = is_key_hex
                                            ? BlobDumpTool::DisplayType::kHex
                                            : BlobDumpTool::DisplayType::kRaw;
-  Status s = tool.Run(filename, show_key, show_blob, show_uncompressed_blob,
-                      /* show_summary */ true);
+  Status s = tool.Run(filename, show_key, show_blob, /* show_summary */ true);
   if (!s.ok()) {
     fprintf(stderr, "Failed: %s\n", s.ToString().c_str());
   }
@@ -4757,17 +4745,13 @@ DBFileDumperCommand::DBFileDumperCommand(
     const std::map<std::string, std::string>& options,
     const std::vector<std::string>& flags)
     : LDBCommand(options, flags, true /* is_read_only */,
-                 BuildCmdLineOptions(
-                     {ARG_DECODE_BLOB_INDEX, ARG_DUMP_UNCOMPRESSED_BLOBS})),
-      decode_blob_index_(IsFlagPresent(flags, ARG_DECODE_BLOB_INDEX)),
-      dump_uncompressed_blobs_(
-          IsFlagPresent(flags, ARG_DUMP_UNCOMPRESSED_BLOBS)) {}
+                 BuildCmdLineOptions({ARG_DECODE_BLOB_INDEX})),
+      decode_blob_index_(IsFlagPresent(flags, ARG_DECODE_BLOB_INDEX)) {}
 
 void DBFileDumperCommand::Help(std::string& ret) {
   ret.append("  ");
   ret.append(DBFileDumperCommand::Name());
-  ret.append(" [--" + ARG_DECODE_BLOB_INDEX + "] ");
-  ret.append(" [--" + ARG_DUMP_UNCOMPRESSED_BLOBS + "] ");
+  ret.append(" [--" + ARG_DECODE_BLOB_INDEX + "]");
   ret.append("\n");
 }
 
@@ -4835,8 +4819,7 @@ void DBFileDumperCommand::DoCommand() {
       filename = NormalizePath(filename);
       std::cout << filename << std::endl;
       std::cout << "------------------------------" << std::endl;
-      DumpBlobFile(filename, /* is_key_hex */ false, /* is_value_hex */ false,
-                   dump_uncompressed_blobs_);
+      DumpBlobFile(filename, /* is_key_hex */ false, /* is_value_hex */ false);
       std::cout << std::endl;
     }
   }
diff --git a/tools/ldb_cmd_impl.h b/tools/ldb_cmd_impl.h
index 1a30d402cee9..ee3122d805c0 100644
--- a/tools/ldb_cmd_impl.h
+++ b/tools/ldb_cmd_impl.h
@@ -47,7 +47,6 @@ class DBFileDumperCommand : public LDBCommand {
 
  private:
   bool decode_blob_index_;
-  bool dump_uncompressed_blobs_;
 };
 
 class DBLiveFilesMetadataDumperCommand : public LDBCommand {
@@ -109,7 +108,6 @@ class DBDumperCommand : public LDBCommand {
   bool print_stats_;
   std::string path_;
   bool decode_blob_index_;
-  bool dump_uncompressed_blobs_;
 
   static const std::string ARG_COUNT_ONLY;
   static const std::string ARG_COUNT_DELIM;
diff --git a/tools/ldb_test.py b/tools/ldb_test.py
index a8956f160f1d..1be7ae2cc9e9 100644
--- a/tools/ldb_test.py
+++ b/tools/ldb_test.py
@@ -613,7 +613,7 @@ def testDumpLiveFiles(self):
         # Call the dump_live_files function with the edited dbPath name.
         self.assertTrue(
             self.dumpLiveFiles(
-                "--db=%s --decode_blob_index --dump_uncompressed_blobs" % dbPath,
+                "--db=%s --decode_blob_index" % dbPath,
                 dumpFilePath,
             )
         )
@@ -881,7 +881,7 @@ def testBlobDump(self):
         expected_pattern = re.compile(regex)
         blob_files = self.getBlobFiles(dbPath)
         self.assertTrue(len(blob_files) >= 1)
-        cmd = "dump --path=%s --dump_uncompressed_blobs"
+        cmd = "dump --path=%s"
         self.assertRunOKFull(
             (cmd) % (blob_files[0]), expected_pattern, unexpected=False, isPattern=True
         )
diff --git a/utilities/blob_db/blob_compaction_filter.cc b/utilities/blob_db/blob_compaction_filter.cc
index 9201c53ad9ef..9925759fb73a 100644
--- a/utilities/blob_db/blob_compaction_filter.cc
+++ b/utilities/blob_db/blob_compaction_filter.cc
@@ -94,10 +94,7 @@ CompactionFilter::Decision BlobIndexCompactionFilterBase::FilterV2(
     }
     // Read value from blob file.
     PinnableSlice blob;
-    CompressionType compression_type = kNoCompression;
-    constexpr bool need_decompress = true;
-    if (!ReadBlobFromOldFile(ikey.user_key, blob_index, &blob, need_decompress,
-                             &compression_type)) {
+    if (!ReadBlobFromOldFile(ikey.user_key, blob_index, &blob)) {
       return Decision::kIOError;
     }
     CompactionFilter::Decision decision = ucf->FilterV2(
@@ -123,15 +120,6 @@ CompactionFilter::Decision BlobIndexCompactionFilterBase::HandleValueChange(
     return Decision::kIOError;
   }
   Slice new_blob_value(*new_value);
-  GrowableBuffer compressed_output;
-  if (blob_db_impl->bdb_options_.compression != kNoCompression) {
-    Status s = blob_db_impl->CompressBlob(new_blob_value, &compressed_output);
-    if (!s.ok()) {
-      // Best approximation
-      return Decision::kIOError;
-    }
-    new_blob_value = compressed_output.AsSlice();
-  }
   uint64_t new_blob_file_number = 0;
   uint64_t new_blob_offset = 0;
   if (!WriteBlobToNewFile(key, new_blob_value, &new_blob_file_number,
@@ -142,8 +130,7 @@ CompactionFilter::Decision BlobIndexCompactionFilterBase::HandleValueChange(
     return Decision::kIOError;
   }
   BlobIndex::EncodeBlob(new_value, new_blob_file_number, new_blob_offset,
-                        new_blob_value.size(),
-                        blob_db_impl->bdb_options_.compression);
+                        new_blob_value.size(), kNoCompression);
   return Decision::kChangeBlobIndex;
 }
 
@@ -205,14 +192,13 @@ bool BlobIndexCompactionFilterBase::OpenNewBlobFileIfNeeded() const {
 }
 
 bool BlobIndexCompactionFilterBase::ReadBlobFromOldFile(
-    const Slice& key, const BlobIndex& blob_index, PinnableSlice* blob,
-    bool need_decompress, CompressionType* compression_type) const {
+    const Slice& key, const BlobIndex& blob_index, PinnableSlice* blob) const {
   BlobDBImpl* const blob_db_impl = context_.blob_db_impl;
   assert(blob_db_impl);
 
-  Status s = blob_db_impl->GetRawBlobFromFile(
-      key, blob_index.file_number(), blob_index.offset(), blob_index.size(),
-      blob, compression_type);
+  Status s = blob_db_impl->GetRawBlobFromFile(key, blob_index.file_number(),
+                                              blob_index.offset(),
+                                              blob_index.size(), blob);
 
   if (!s.ok()) {
     ROCKS_LOG_ERROR(
@@ -225,21 +211,6 @@ bool BlobIndexCompactionFilterBase::ReadBlobFromOldFile(
     return false;
   }
 
-  if (need_decompress && *compression_type != kNoCompression) {
-    s = blob_db_impl->DecompressSlice(*blob, *compression_type, blob);
-    if (!s.ok()) {
-      ROCKS_LOG_ERROR(
-          blob_db_impl->db_options_.info_log,
-          "Uncompression error during blob read from file: %" PRIu64
-          " blob_offset: %" PRIu64 " blob_size: %" PRIu64
-          " key: %s status: '%s'",
-          blob_index.file_number(), blob_index.offset(), blob_index.size(),
-          key.ToString(/* output_hex */ true).c_str(), s.ToString().c_str());
-
-      return false;
-    }
-  }
-
   return true;
 }
 
@@ -372,33 +343,11 @@ CompactionFilter::BlobDecision BlobIndexCompactionFilterGC::PrepareBlobOutput(
   }
 
   PinnableSlice blob;
-  CompressionType compression_type = kNoCompression;
-  GrowableBuffer compressed_output;
-  if (!ReadBlobFromOldFile(key, blob_index, &blob, false, &compression_type)) {
+  if (!ReadBlobFromOldFile(key, blob_index, &blob)) {
     gc_stats_.SetError();
     return BlobDecision::kIOError;
   }
 
-  // If the compression_type is changed, re-compress it with the new compression
-  // type.
-  if (compression_type != blob_db_impl->bdb_options_.compression) {
-    if (compression_type != kNoCompression) {
-      const Status status =
-          blob_db_impl->DecompressSlice(blob, compression_type, &blob);
-      if (!status.ok()) {
-        gc_stats_.SetError();
-        return BlobDecision::kCorruption;
-      }
-    }
-    if (blob_db_impl->bdb_options_.compression != kNoCompression) {
-      s = blob_db_impl->CompressBlob(blob, &compressed_output);
-      if (!s.ok()) {
-        return BlobDecision::kCorruption;
-      }
-      blob.PinSelf(compressed_output.AsSlice());
-    }
-  }
-
   uint64_t new_blob_file_number = 0;
   uint64_t new_blob_offset = 0;
   if (!WriteBlobToNewFile(key, blob, &new_blob_file_number, &new_blob_offset)) {
@@ -412,7 +361,7 @@ CompactionFilter::BlobDecision BlobIndexCompactionFilterGC::PrepareBlobOutput(
   }
 
   BlobIndex::EncodeBlob(new_value, new_blob_file_number, new_blob_offset,
-                        blob.size(), compression_type);
+                        blob.size(), kNoCompression);
 
   gc_stats_.AddRelocatedBlob(blob_index.size());
 
diff --git a/utilities/blob_db/blob_compaction_filter.h b/utilities/blob_db/blob_compaction_filter.h
index cb83d0d034f5..dec84937c66c 100644
--- a/utilities/blob_db/blob_compaction_filter.h
+++ b/utilities/blob_db/blob_compaction_filter.h
@@ -59,8 +59,7 @@ class BlobIndexCompactionFilterBase : public LayeredCompactionFilterBase {
   bool IsBlobFileOpened() const;
   virtual bool OpenNewBlobFileIfNeeded() const;
   bool ReadBlobFromOldFile(const Slice& key, const BlobIndex& blob_index,
-                           PinnableSlice* blob, bool need_decompress,
-                           CompressionType* compression_type) const;
+                           PinnableSlice* blob) const;
   bool WriteBlobToNewFile(const Slice& key, const Slice& blob,
                           uint64_t* new_blob_file_number,
                           uint64_t* new_blob_offset) const;
diff --git a/utilities/blob_db/blob_db.cc b/utilities/blob_db/blob_db.cc
index 25960bdd6c84..f2397f0ef272 100644
--- a/utilities/blob_db/blob_db.cc
+++ b/utilities/blob_db/blob_db.cc
@@ -92,9 +92,6 @@ void BlobDBOptions::Dump(Logger* log) const {
   ROCKS_LOG_HEADER(
       log, "                            BlobDBOptions.blob_file_size: %" PRIu64,
       blob_file_size);
-  ROCKS_LOG_HEADER(
-      log, "                               BlobDBOptions.compression: %d",
-      static_cast<int>(compression));
   ROCKS_LOG_HEADER(
       log, "                 BlobDBOptions.enable_garbage_collection: %d",
       enable_garbage_collection);
diff --git a/utilities/blob_db/blob_db.h b/utilities/blob_db/blob_db.h
index f480e0c1b5de..3ede4c9d7f99 100644
--- a/utilities/blob_db/blob_db.h
+++ b/utilities/blob_db/blob_db.h
@@ -66,9 +66,6 @@ struct BlobDBOptions {
   // after it exceeds that size
   uint64_t blob_file_size = 256 * 1024 * 1024;
 
-  // what compression to use for Blob's
-  CompressionType compression = kNoCompression;
-
   // If enabled, BlobDB cleans up stale blobs in non-TTL files during compaction
   // by rewriting the remaining live blobs to new files.
   bool enable_garbage_collection = false;
diff --git a/utilities/blob_db/blob_db_impl.cc b/utilities/blob_db/blob_db_impl.cc
index e204d0e81940..d8a1be3b7246 100644
--- a/utilities/blob_db/blob_db_impl.cc
+++ b/utilities/blob_db/blob_db_impl.cc
@@ -83,10 +83,7 @@ BlobDBImpl::BlobDBImpl(const std::string& dbname,
       live_sst_size_(0),
       fifo_eviction_seq_(0),
       evict_expiration_up_to_(0),
-      debug_level_(0),
-      // NOTE: returns nullptr for kNoCompression
-      blob_compressor_(GetBuiltinV2CompressionManager()->GetCompressor(
-          CompressionOptions{}, bdb_options_.compression)) {
+      debug_level_(0) {
   clock_ = env_->GetSystemClock().get();
   blob_dir_ = (bdb_options_.path_relative)
                   ? dbname + "/" + bdb_options_.blob_dir
@@ -708,7 +705,7 @@ std::shared_ptr<BlobFile> BlobDBImpl::NewBlobFile(
       static_cast<ColumnFamilyHandleImpl*>(DefaultColumnFamily())->GetID();
   auto blob_file = std::make_shared<BlobFile>(
       this, blob_dir_, file_num, db_options_.info_log.get(), column_family_id,
-      bdb_options_.compression, has_ttl, expiration_range);
+      has_ttl, expiration_range);
 
   ROCKS_LOG_DEBUG(db_options_.info_log, "New blob file created: %s reason='%s'",
                   blob_file->PathName().c_str(), reason.c_str());
@@ -1095,32 +1092,14 @@ Status BlobDBImpl::PutBlobValue(const WriteOptions& write_options,
       RecordTick(statistics_, BLOB_DB_WRITE_INLINED_TTL);
     }
   } else {
-    GrowableBuffer compression_output;
-    Slice value_maybe_compressed;
-    if (blob_compressor_) {
-      assert(bdb_options_.compression != kNoCompression);
-      assert(bdb_options_.compression ==
-             blob_compressor_->GetPreferredCompressionType());
-      s = CompressBlob(value, &compression_output);
-      if (!s.ok()) {
-        return s;
-      }
-      value_maybe_compressed = compression_output.AsSlice();
-    } else {
-      assert(bdb_options_.compression == kNoCompression);
-      value_maybe_compressed = value;
-    }
-
     std::string headerbuf;
-    BlobLogWriter::ConstructBlobHeader(&headerbuf, key, value_maybe_compressed,
-                                       expiration);
+    BlobLogWriter::ConstructBlobHeader(&headerbuf, key, value, expiration);
 
     // Check DB size limit before selecting blob file to
     // Since CheckSizeAndEvictBlobFiles() can close blob files, it needs to be
     // done before calling SelectBlobFile().
     s = CheckSizeAndEvictBlobFiles(
-        write_options,
-        headerbuf.size() + key.size() + value_maybe_compressed.size());
+        write_options, headerbuf.size() + key.size() + value.size());
     if (!s.ok()) {
       return s;
     }
@@ -1133,9 +1112,8 @@ Status BlobDBImpl::PutBlobValue(const WriteOptions& write_options,
     }
     if (s.ok()) {
       assert(blob_file != nullptr);
-      assert(blob_file->GetCompressionType() == bdb_options_.compression);
-      s = AppendBlob(write_options, blob_file, headerbuf, key,
-                     value_maybe_compressed, expiration, &index_entry);
+      s = AppendBlob(write_options, blob_file, headerbuf, key, value,
+                     expiration, &index_entry);
     }
     if (s.ok()) {
       if (expiration != kNoExpiration) {
@@ -1172,44 +1150,6 @@ Status BlobDBImpl::PutBlobValue(const WriteOptions& write_options,
   return s;
 }
 
-Status BlobDBImpl::CompressBlob(const Slice& raw,
-                                GrowableBuffer* compression_output) const {
-  StopWatch compression_sw(clock_, statistics_, BLOB_DB_COMPRESSION_MICROS);
-  return LegacyForceBuiltinCompression(
-      *blob_compressor_, /*working_area=*/nullptr, raw, compression_output);
-}
-
-Decompressor& BlobDecompressor() {
-  static auto decompressor =
-      GetBuiltinV2CompressionManager()->GetDecompressor();
-
-  return *decompressor;
-}
-
-Status BlobDBImpl::DecompressSlice(const Slice& compressed_value,
-                                   CompressionType compression_type,
-                                   PinnableSlice* value_output) const {
-  assert(compression_type != kNoCompression);
-
-  BlockContents contents;
-  auto cfh = static_cast<ColumnFamilyHandleImpl*>(DefaultColumnFamily());
-
-  {
-    StopWatch decompression_sw(clock_, statistics_,
-                               BLOB_DB_DECOMPRESSION_MICROS);
-    Status s = DecompressBlockData(
-        compressed_value.data(), compressed_value.size(), compression_type,
-        BlobDecompressor(), &contents, cfh->cfd()->ioptions());
-    if (!s.ok()) {
-      return Status::Corruption("Unable to decompress blob.");
-    }
-  }
-
-  value_output->PinSelf(contents.data);
-
-  return Status::OK();
-}
-
 Status BlobDBImpl::CompactFiles(
     const CompactionOptions& compact_options,
     const std::vector<std::string>& input_file_names, const int output_level,
@@ -1409,11 +1349,10 @@ Status BlobDBImpl::AppendBlob(const WriteOptions& write_options,
 
   if (expiration == kNoExpiration) {
     BlobIndex::EncodeBlob(index_entry, bfile->BlobFileNumber(), blob_offset,
-                          value.size(), bdb_options_.compression);
+                          value.size(), kNoCompression);
   } else {
     BlobIndex::EncodeBlobTTL(index_entry, expiration, bfile->BlobFileNumber(),
-                             blob_offset, value.size(),
-                             bdb_options_.compression);
+                             blob_offset, value.size(), kNoCompression);
   }
 
   return s;
@@ -1511,39 +1450,14 @@ Status BlobDBImpl::GetBlobValue(const Slice& key, const Slice& index_entry,
     return Status::OK();
   }
 
-  CompressionType compression_type = kNoCompression;
-  s = GetRawBlobFromFile(key, blob_index.file_number(), blob_index.offset(),
-                         blob_index.size(), value, &compression_type);
-  if (!s.ok()) {
-    return s;
-  }
-
-  if (compression_type != kNoCompression) {
-    s = DecompressSlice(*value, compression_type, value);
-    if (!s.ok()) {
-      if (debug_level_ >= 2) {
-        ROCKS_LOG_ERROR(
-            db_options_.info_log,
-            "Uncompression error during blob read from file: %" PRIu64
-            " blob_offset: %" PRIu64 " blob_size: %" PRIu64
-            " key: %s status: '%s'",
-            blob_index.file_number(), blob_index.offset(), blob_index.size(),
-            key.ToString(/* output_hex */ true).c_str(), s.ToString().c_str());
-      }
-      return s;
-    }
-  }
-
-  return Status::OK();
+  return GetRawBlobFromFile(key, blob_index.file_number(), blob_index.offset(),
+                            blob_index.size(), value);
 }
 
 Status BlobDBImpl::GetRawBlobFromFile(const Slice& key, uint64_t file_number,
                                       uint64_t offset, uint64_t size,
-                                      PinnableSlice* value,
-                                      CompressionType* compression_type) {
+                                      PinnableSlice* value) {
   assert(value);
-  assert(compression_type);
-  assert(*compression_type == kNoCompression);
 
   if (!size) {
     value->PinSelf("");
@@ -1581,8 +1495,6 @@ Status BlobDBImpl::GetRawBlobFromFile(const Slice& key, uint64_t file_number,
     blob_file = it->second;
   }
 
-  *compression_type = blob_file->GetCompressionType();
-
   // takes locks when called
   std::shared_ptr<RandomAccessFileReader> reader;
   Status s = GetBlobFileReader(blob_file, &reader);
diff --git a/utilities/blob_db/blob_db_impl.h b/utilities/blob_db/blob_db_impl.h
index 6e3b830896a3..415f7ca6ee0c 100644
--- a/utilities/blob_db/blob_db_impl.h
+++ b/utilities/blob_db/blob_db_impl.h
@@ -227,15 +227,7 @@ class BlobDBImpl : public BlobDB {
 
   Status GetRawBlobFromFile(const Slice& key, uint64_t file_number,
                             uint64_t offset, uint64_t size,
-                            PinnableSlice* value,
-                            CompressionType* compression_type);
-
-  Status CompressBlob(const Slice& raw,
-                      GrowableBuffer* compression_output) const;
-
-  Status DecompressSlice(const Slice& compressed_value,
-                         CompressionType compression_type,
-                         PinnableSlice* value_output) const;
+                            PinnableSlice* value);
 
   // Close a file by appending a footer, and removes file from open files list.
   // REQUIRES: lock held on write_mutex_, write lock held on both the db mutex_
@@ -503,11 +495,7 @@ class BlobDBImpl : public BlobDB {
   int disable_file_deletions_ = 0;
 
   uint32_t debug_level_;
-
-  std::unique_ptr<Compressor> blob_compressor_;
 };
 
-Decompressor& BlobDecompressor();
-
 }  // namespace blob_db
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/utilities/blob_db/blob_db_test.cc b/utilities/blob_db/blob_db_test.cc
index 4c0e75859756..12ac524b22d0 100644
--- a/utilities/blob_db/blob_db_test.cc
+++ b/utilities/blob_db/blob_db_test.cc
@@ -481,208 +481,6 @@ TEST_F(BlobDBTest, Override) {
   VerifyDB(data);
 }
 
-#ifdef SNAPPY
-TEST_F(BlobDBTest, Compression) {
-  Random rnd(301);
-  BlobDBOptions bdb_options;
-  bdb_options.min_blob_size = 0;
-  bdb_options.disable_background_tasks = true;
-  bdb_options.compression = CompressionType::kSnappyCompression;
-  Open(bdb_options);
-  std::map<std::string, std::string> data;
-  for (size_t i = 0; i < 100; i++) {
-    PutRandom("put-key" + std::to_string(i), &rnd, &data);
-  }
-  for (int i = 0; i < 100; i++) {
-    WriteBatch batch;
-    for (size_t j = 0; j < 10; j++) {
-      PutRandomToWriteBatch("write-batch-key" + std::to_string(j * 100 + i),
-                            &rnd, &batch, &data);
-    }
-    ASSERT_OK(blob_db_->Write(WriteOptions(), &batch));
-  }
-  VerifyDB(data);
-}
-
-TEST_F(BlobDBTest, DecompressAfterReopen) {
-  Random rnd(301);
-  BlobDBOptions bdb_options;
-  bdb_options.min_blob_size = 0;
-  bdb_options.disable_background_tasks = true;
-  bdb_options.compression = CompressionType::kSnappyCompression;
-  Open(bdb_options);
-  std::map<std::string, std::string> data;
-  for (size_t i = 0; i < 100; i++) {
-    PutRandom("put-key" + std::to_string(i), &rnd, &data);
-  }
-  VerifyDB(data);
-  bdb_options.compression = CompressionType::kNoCompression;
-  Reopen(bdb_options);
-  VerifyDB(data);
-}
-
-TEST_F(BlobDBTest, EnableDisableCompressionGC) {
-  Random rnd(301);
-  BlobDBOptions bdb_options;
-  bdb_options.min_blob_size = 0;
-  bdb_options.garbage_collection_cutoff = 1.0;
-  bdb_options.disable_background_tasks = true;
-  bdb_options.compression = kSnappyCompression;
-  Open(bdb_options);
-  std::map<std::string, std::string> data;
-  size_t data_idx = 0;
-  for (; data_idx < 100; data_idx++) {
-    PutRandom("put-key" + std::to_string(data_idx), &rnd, &data);
-  }
-  VerifyDB(data);
-  auto blob_files = blob_db_impl()->TEST_GetBlobFiles();
-  ASSERT_EQ(1, blob_files.size());
-  ASSERT_EQ(kSnappyCompression, blob_files[0]->GetCompressionType());
-
-  // disable compression
-  bdb_options.compression = kNoCompression;
-  Reopen(bdb_options);
-
-  // Add more data with new compression type
-  for (; data_idx < 200; data_idx++) {
-    PutRandom("put-key" + std::to_string(data_idx), &rnd, &data);
-  }
-  VerifyDB(data);
-
-  blob_files = blob_db_impl()->TEST_GetBlobFiles();
-  ASSERT_EQ(2, blob_files.size());
-  ASSERT_EQ(kNoCompression, blob_files[1]->GetCompressionType());
-
-  // Enable GC. If we do it earlier the snapshot release triggered compaction
-  // may compact files and trigger GC before we can verify there are two files.
-  bdb_options.enable_garbage_collection = true;
-  Reopen(bdb_options);
-
-  // Trigger compaction
-  ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
-  blob_db_impl()->TEST_DeleteObsoleteFiles();
-  VerifyDB(data);
-
-  blob_files = blob_db_impl()->TEST_GetBlobFiles();
-  for (const auto& bfile : blob_files) {
-    ASSERT_EQ(kNoCompression, bfile->GetCompressionType());
-  }
-
-  // enabling the compression again
-  bdb_options.compression = kSnappyCompression;
-  Reopen(bdb_options);
-
-  // Add more data with new compression type
-  for (; data_idx < 300; data_idx++) {
-    PutRandom("put-key" + std::to_string(data_idx), &rnd, &data);
-  }
-  VerifyDB(data);
-
-  // Trigger compaction
-  ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
-  blob_db_impl()->TEST_DeleteObsoleteFiles();
-  VerifyDB(data);
-
-  blob_files = blob_db_impl()->TEST_GetBlobFiles();
-  for (const auto& bfile : blob_files) {
-    ASSERT_EQ(kSnappyCompression, bfile->GetCompressionType());
-  }
-}
-
-#ifdef LZ4
-// Test switch compression types and run GC, it needs both Snappy and LZ4
-// support.
-TEST_F(BlobDBTest, ChangeCompressionGC) {
-  Random rnd(301);
-  BlobDBOptions bdb_options;
-  bdb_options.min_blob_size = 0;
-  bdb_options.garbage_collection_cutoff = 1.0;
-  bdb_options.disable_background_tasks = true;
-  bdb_options.compression = kLZ4Compression;
-  Open(bdb_options);
-  std::map<std::string, std::string> data;
-  size_t data_idx = 0;
-  for (; data_idx < 100; data_idx++) {
-    PutRandom("put-key" + std::to_string(data_idx), &rnd, &data);
-  }
-  VerifyDB(data);
-  auto blob_files = blob_db_impl()->TEST_GetBlobFiles();
-  ASSERT_EQ(1, blob_files.size());
-  ASSERT_EQ(kLZ4Compression, blob_files[0]->GetCompressionType());
-
-  // Change compression type
-  bdb_options.compression = kSnappyCompression;
-  Reopen(bdb_options);
-
-  // Add more data with Snappy compression type
-  for (; data_idx < 200; data_idx++) {
-    PutRandom("put-key" + std::to_string(data_idx), &rnd, &data);
-  }
-  VerifyDB(data);
-
-  // Verify blob file compression type
-  blob_files = blob_db_impl()->TEST_GetBlobFiles();
-  ASSERT_EQ(2, blob_files.size());
-  ASSERT_EQ(kSnappyCompression, blob_files[1]->GetCompressionType());
-
-  // Enable GC. If we do it earlier the snapshot release triggered compaction
-  // may compact files and trigger GC before we can verify there are two files.
-  bdb_options.enable_garbage_collection = true;
-  Reopen(bdb_options);
-
-  ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
-  VerifyDB(data);
-
-  blob_db_impl()->TEST_DeleteObsoleteFiles();
-  blob_files = blob_db_impl()->TEST_GetBlobFiles();
-  for (const auto& bfile : blob_files) {
-    ASSERT_EQ(kSnappyCompression, bfile->GetCompressionType());
-  }
-
-  // Disable compression
-  bdb_options.compression = kNoCompression;
-  Reopen(bdb_options);
-  for (; data_idx < 300; data_idx++) {
-    PutRandom("put-key" + std::to_string(data_idx), &rnd, &data);
-  }
-  VerifyDB(data);
-
-  ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
-  VerifyDB(data);
-
-  blob_db_impl()->TEST_DeleteObsoleteFiles();
-  blob_files = blob_db_impl()->TEST_GetBlobFiles();
-  for (const auto& bfile : blob_files) {
-    ASSERT_EQ(kNoCompression, bfile->GetCompressionType());
-  }
-
-  // switching different compression types to generate mixed compression types
-  bdb_options.compression = kSnappyCompression;
-  Reopen(bdb_options);
-  for (; data_idx < 400; data_idx++) {
-    PutRandom("put-key" + std::to_string(data_idx), &rnd, &data);
-  }
-  VerifyDB(data);
-
-  bdb_options.compression = kLZ4Compression;
-  Reopen(bdb_options);
-  for (; data_idx < 500; data_idx++) {
-    PutRandom("put-key" + std::to_string(data_idx), &rnd, &data);
-  }
-  VerifyDB(data);
-
-  ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
-  VerifyDB(data);
-
-  blob_db_impl()->TEST_DeleteObsoleteFiles();
-  blob_files = blob_db_impl()->TEST_GetBlobFiles();
-  for (const auto& bfile : blob_files) {
-    ASSERT_EQ(kLZ4Compression, bfile->GetCompressionType());
-  }
-}
-#endif  // LZ4
-#endif  // SNAPPY
-
 TEST_F(BlobDBTest, MultipleWriters) {
   Open(BlobDBOptions());
 
@@ -1358,9 +1156,6 @@ TEST_F(BlobDBTest, UserCompactionFilter) {
   bdb_options.min_blob_size = kMinBlobSize;
   bdb_options.blob_file_size = kMaxValueSize * 10;
   bdb_options.disable_background_tasks = true;
-  if (Snappy_Supported()) {
-    bdb_options.compression = CompressionType::kSnappyCompression;
-  }
   // case_num == 0: Test user defined compaction filter
   // case_num == 1: Test user defined compaction filter factory
   for (int case_num = 0; case_num < 2; case_num++) {
@@ -1440,7 +1235,6 @@ TEST_F(BlobDBTest, UserCompactionFilter_BlobIOError) {
   bdb_options.min_blob_size = 0;
   bdb_options.blob_file_size = kValueSize * 10;
   bdb_options.disable_background_tasks = true;
-  bdb_options.compression = CompressionType::kNoCompression;
 
   std::vector<std::string> io_failure_cases = {
       "BlobDBImpl::CreateBlobFileAndWriter",
diff --git a/utilities/blob_db/blob_dump_tool.cc b/utilities/blob_db/blob_dump_tool.cc
index e42a2fa49ad2..535b36fdfa11 100644
--- a/utilities/blob_db/blob_dump_tool.cc
+++ b/utilities/blob_db/blob_dump_tool.cc
@@ -16,10 +16,8 @@
 #include "port/port.h"
 #include "rocksdb/convenience.h"
 #include "rocksdb/file_system.h"
-#include "table/format.h"
 #include "util/coding.h"
 #include "util/string_util.h"
-#include "utilities/blob_db/blob_db_impl.h"
 
 namespace ROCKSDB_NAMESPACE::blob_db {
 
@@ -27,9 +25,7 @@ BlobDumpTool::BlobDumpTool()
     : reader_(nullptr), buffer_(nullptr), buffer_size_(0) {}
 
 Status BlobDumpTool::Run(const std::string& filename, DisplayType show_key,
-                         DisplayType show_blob,
-                         DisplayType show_uncompressed_blob,
-                         bool show_summary) {
+                         DisplayType show_blob, bool show_summary) {
   constexpr size_t kReadaheadSize = 2 * 1024 * 1024;
   Status s;
   const auto fs = FileSystem::Default();
@@ -55,8 +51,7 @@ Status BlobDumpTool::Run(const std::string& filename, DisplayType show_key,
   reader_.reset(new RandomAccessFileReader(std::move(file), filename));
   uint64_t offset = 0;
   uint64_t footer_offset = 0;
-  CompressionType compression = kNoCompression;
-  s = DumpBlobLogHeader(&offset, &compression);
+  s = DumpBlobLogHeader(&offset);
   if (!s.ok()) {
     return s;
   }
@@ -67,12 +62,10 @@ Status BlobDumpTool::Run(const std::string& filename, DisplayType show_key,
   uint64_t total_records = 0;
   uint64_t total_key_size = 0;
   uint64_t total_blob_size = 0;
-  uint64_t total_uncompressed_blob_size = 0;
-  if (show_key != DisplayType::kNone || show_summary) {
+  if (show_key != DisplayType::kNone) {
     while (offset < footer_offset) {
-      s = DumpRecord(show_key, show_blob, show_uncompressed_blob, show_summary,
-                     compression, &offset, &total_records, &total_key_size,
-                     &total_blob_size, &total_uncompressed_blob_size);
+      s = DumpRecord(show_key, show_blob, &offset, &total_records,
+                     &total_key_size, &total_blob_size);
       if (!s.ok()) {
         break;
       }
@@ -83,10 +76,6 @@ Status BlobDumpTool::Run(const std::string& filename, DisplayType show_key,
     fprintf(stdout, "  total records: %" PRIu64 "\n", total_records);
     fprintf(stdout, "  total key size: %" PRIu64 "\n", total_key_size);
     fprintf(stdout, "  total blob size: %" PRIu64 "\n", total_blob_size);
-    if (compression != kNoCompression) {
-      fprintf(stdout, "  total raw blob size: %" PRIu64 "\n",
-              total_uncompressed_blob_size);
-    }
   }
   return s;
 }
@@ -112,8 +101,7 @@ Status BlobDumpTool::Read(uint64_t offset, size_t size, Slice* result) {
   return s;
 }
 
-Status BlobDumpTool::DumpBlobLogHeader(uint64_t* offset,
-                                       CompressionType* compression) {
+Status BlobDumpTool::DumpBlobLogHeader(uint64_t* offset) {
   Slice slice;
   Status s = Read(0, BlobLogHeader::kSize, &slice);
   if (!s.ok()) {
@@ -128,17 +116,10 @@ Status BlobDumpTool::DumpBlobLogHeader(uint64_t* offset,
   fprintf(stdout, "  Version          : %" PRIu32 "\n", header.version);
   fprintf(stdout, "  Column Family ID : %" PRIu32 "\n",
           header.column_family_id);
-  std::string compression_str;
-  if (!GetStringFromCompressionType(&compression_str, header.compression)
-           .ok()) {
-    compression_str = "Unrecongnized compression type (" +
-                      std::to_string((int)header.compression) + ")";
-  }
-  fprintf(stdout, "  Compression      : %s\n", compression_str.c_str());
+  fprintf(stdout, "  Compression      : kNoCompression\n");
   fprintf(stdout, "  Expiration range : %s\n",
           GetString(header.expiration_range).c_str());
   *offset = BlobLogHeader::kSize;
-  *compression = header.compression;
   return s;
 }
 
@@ -171,12 +152,9 @@ Status BlobDumpTool::DumpBlobLogFooter(uint64_t file_size,
 }
 
 Status BlobDumpTool::DumpRecord(DisplayType show_key, DisplayType show_blob,
-                                DisplayType show_uncompressed_blob,
-                                bool show_summary, CompressionType compression,
                                 uint64_t* offset, uint64_t* total_records,
                                 uint64_t* total_key_size,
-                                uint64_t* total_blob_size,
-                                uint64_t* total_uncompressed_blob_size) {
+                                uint64_t* total_blob_size) {
   if (show_key != DisplayType::kNone) {
     fprintf(stdout, "Read record with offset 0x%" PRIx64 " (%" PRIu64 "):\n",
             *offset, *offset);
@@ -203,22 +181,6 @@ Status BlobDumpTool::DumpRecord(DisplayType show_key, DisplayType show_blob,
   if (!s.ok()) {
     return s;
   }
-  // Decompress value
-  std::string uncompressed_value;
-  if (compression != kNoCompression &&
-      (show_uncompressed_blob != DisplayType::kNone || show_summary)) {
-    BlockContents contents;
-    UncompressionContext context(compression);
-    UncompressionInfo info(context, UncompressionDict::GetEmptyDict(),
-                           compression);
-    s = DecompressBlockData(
-        slice.data() + key_size, static_cast<size_t>(value_size), compression,
-        BlobDecompressor(), &contents, ImmutableOptions(Options()));
-    if (!s.ok()) {
-      return s;
-    }
-    uncompressed_value = contents.data.ToString();
-  }
   if (show_key != DisplayType::kNone) {
     fprintf(stdout, "  key        : ");
     DumpSlice(Slice(slice.data(), static_cast<size_t>(key_size)), show_key);
@@ -228,16 +190,11 @@ Status BlobDumpTool::DumpRecord(DisplayType show_key, DisplayType show_blob,
                       static_cast<size_t>(value_size)),
                 show_blob);
     }
-    if (show_uncompressed_blob != DisplayType::kNone) {
-      fprintf(stdout, "  raw blob   : ");
-      DumpSlice(Slice(uncompressed_value), show_uncompressed_blob);
-    }
   }
   *offset += key_size + value_size;
   *total_records += 1;
   *total_key_size += key_size;
   *total_blob_size += value_size;
-  *total_uncompressed_blob_size += uncompressed_value.size();
   return s;
 }
 
diff --git a/utilities/blob_db/blob_dump_tool.h b/utilities/blob_db/blob_dump_tool.h
index 9876245883ef..a538a38996d4 100644
--- a/utilities/blob_db/blob_dump_tool.h
+++ b/utilities/blob_db/blob_dump_tool.h
@@ -28,8 +28,7 @@ class BlobDumpTool {
   BlobDumpTool();
 
   Status Run(const std::string& filename, DisplayType show_key,
-             DisplayType show_blob, DisplayType show_uncompressed_blob,
-             bool show_summary);
+             DisplayType show_blob, bool show_summary);
 
  private:
   std::unique_ptr<RandomAccessFileReader> reader_;
@@ -37,14 +36,11 @@ class BlobDumpTool {
   size_t buffer_size_;
 
   Status Read(uint64_t offset, size_t size, Slice* result);
-  Status DumpBlobLogHeader(uint64_t* offset, CompressionType* compression);
+  Status DumpBlobLogHeader(uint64_t* offset);
   Status DumpBlobLogFooter(uint64_t file_size, uint64_t* footer_offset);
   Status DumpRecord(DisplayType show_key, DisplayType show_blob,
-                    DisplayType show_uncompressed_blob, bool show_summary,
-                    CompressionType compression, uint64_t* offset,
-                    uint64_t* total_records, uint64_t* total_key_size,
-                    uint64_t* total_blob_size,
-                    uint64_t* total_uncompressed_blob_size);
+                    uint64_t* offset, uint64_t* total_records,
+                    uint64_t* total_key_size, uint64_t* total_blob_size);
   void DumpSlice(const Slice s, DisplayType type);
 
   template <class T>
diff --git a/utilities/blob_db/blob_file.cc b/utilities/blob_db/blob_file.cc
index 5a479dc8bd4b..a076f166ba94 100644
--- a/utilities/blob_db/blob_file.cc
+++ b/utilities/blob_db/blob_file.cc
@@ -25,18 +25,16 @@ BlobFile::BlobFile(const BlobDBImpl* p, const std::string& bdir, uint64_t fn,
     : parent_(p), path_to_dir_(bdir), file_number_(fn), info_log_(info_log) {}
 
 BlobFile::BlobFile(const BlobDBImpl* p, const std::string& bdir, uint64_t fn,
-                   Logger* info_log, uint32_t column_family_id,
-                   CompressionType compression, bool has_ttl,
+                   Logger* info_log, uint32_t column_family_id, bool has_ttl,
                    const ExpirationRange& expiration_range)
     : parent_(p),
       path_to_dir_(bdir),
       file_number_(fn),
       info_log_(info_log),
       column_family_id_(column_family_id),
-      compression_(compression),
       has_ttl_(has_ttl),
       expiration_range_(expiration_range),
-      header_(column_family_id, compression, has_ttl, expiration_range),
+      header_(column_family_id, kNoCompression, has_ttl, expiration_range),
       header_valid_(true) {}
 
 BlobFile::~BlobFile() {
@@ -259,7 +257,6 @@ Status BlobFile::ReadMetadata(const std::shared_ptr<FileSystem>& fs,
     return s;
   }
   column_family_id_ = header.column_family_id;
-  compression_ = header.compression;
   has_ttl_ = header.has_ttl;
   if (has_ttl_) {
     expiration_range_ = header.expiration_range;
diff --git a/utilities/blob_db/blob_file.h b/utilities/blob_db/blob_file.h
index f0ec83ebe8af..61f4a094af6e 100644
--- a/utilities/blob_db/blob_file.h
+++ b/utilities/blob_db/blob_file.h
@@ -51,9 +51,6 @@ class BlobFile {
   // Column family id.
   uint32_t column_family_id_{std::numeric_limits<uint32_t>::max()};
 
-  // Compression type of blobs in the file
-  CompressionType compression_{kNoCompression};
-
   // If true, the keys in this file all has TTL. Otherwise all keys don't
   // have TTL.
   bool has_ttl_{false};
@@ -108,8 +105,7 @@ class BlobFile {
            Logger* info_log);
 
   BlobFile(const BlobDBImpl* parent, const std::string& bdir, uint64_t fnum,
-           Logger* info_log, uint32_t column_family_id,
-           CompressionType compression, bool has_ttl,
+           Logger* info_log, uint32_t column_family_id, bool has_ttl,
            const ExpirationRange& expiration_range);
 
   ~BlobFile();
@@ -201,8 +197,6 @@ class BlobFile {
 
   void SetHasTTL(bool has_ttl) { has_ttl_ = has_ttl; }
 
-  CompressionType GetCompressionType() const { return compression_; }
-
   std::shared_ptr<BlobLogWriter> GetWriter() const { return log_writer_; }
 
   // Read blob file header and footer. Return corruption if file header is

From 80f3d86f21e4893003ecb0bd3eadacc7ac97f89f Mon Sep 17 00:00:00 2001
From: Maciej Szeszko <mszeszko@meta.com>
Date: Wed, 28 Jan 2026 14:02:49 -0800
Subject: [PATCH 442/500] Remove FIFO eviction support (#14268)

Summary:
Pull Request resolved: https://github.com/facebook/rocksdb/pull/14268

FIFO eviction is unused in production, making this dead code that adds complexity.

Core changes:
- Remove `is_fifo` from `BlobDBOptions`
- Remove `CheckSizeAndEvictBlobFiles` and related methods
- Remove `fifo_eviction_seq` and `evict_expiration_up_to` from `BlobCompactionContext`

CLI tool cleanup:
- Remove `--blob_db_is_fifo` flag from db_bench
- Remove stale FIFO eviction comments

Tests:
- Remove FIFO-related tests (`FIFOEviction_*`, `FilterForFIFOEviction`)

Note: TTL-based expiration (`EvictExpiredFiles`) is preserved as it handles blob file cleanup based on TTL, which is separate from FIFO eviction.

Reviewed By: xingbowang

Differential Revision: D91088968

fbshipit-source-id: 123df98d1132095cef15473b76011de030c5df34
---
 tools/db_bench_tool.cc                      |   5 -
 utilities/blob_db/blob_compaction_filter.cc |  24 +-
 utilities/blob_db/blob_compaction_filter.h  |   2 -
 utilities/blob_db/blob_db.cc                |   3 -
 utilities/blob_db/blob_db.h                 |   5 -
 utilities/blob_db/blob_db_impl.cc           | 101 +------
 utilities/blob_db/blob_db_impl.h            |  39 +--
 utilities/blob_db/blob_db_listener.h        |   5 +-
 utilities/blob_db/blob_db_test.cc           | 285 --------------------
 9 files changed, 25 insertions(+), 444 deletions(-)

diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc
index 9deb9e093eb2..a46d288d5972 100644
--- a/tools/db_bench_tool.cc
+++ b/tools/db_bench_tool.cc
@@ -1055,10 +1055,6 @@ DEFINE_double(
     ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().garbage_collection_cutoff,
     "[Stacked BlobDB] Cutoff ratio for BlobDB garbage collection.");
 
-DEFINE_bool(blob_db_is_fifo,
-            ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().is_fifo,
-            "[Stacked BlobDB] Enable FIFO eviction strategy in BlobDB.");
-
 DEFINE_uint64(blob_db_max_db_size,
               ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().max_db_size,
               "[Stacked BlobDB] Max size limit of the directory where blob "
@@ -5192,7 +5188,6 @@ class Benchmark {
       blob_db::BlobDBOptions blob_db_options;
       blob_db_options.enable_garbage_collection = FLAGS_blob_db_enable_gc;
       blob_db_options.garbage_collection_cutoff = FLAGS_blob_db_gc_cutoff;
-      blob_db_options.is_fifo = FLAGS_blob_db_is_fifo;
       blob_db_options.max_db_size = FLAGS_blob_db_max_db_size;
       blob_db_options.ttl_range_secs = FLAGS_blob_db_ttl_range_secs;
       blob_db_options.min_blob_size = FLAGS_blob_db_min_blob_size;
diff --git a/utilities/blob_db/blob_compaction_filter.cc b/utilities/blob_db/blob_compaction_filter.cc
index 9925759fb73a..a87b5a614ca4 100644
--- a/utilities/blob_db/blob_compaction_filter.cc
+++ b/utilities/blob_db/blob_compaction_filter.cc
@@ -55,31 +55,10 @@ CompactionFilter::Decision BlobIndexCompactionFilterBase::FilterV2(
   if (!blob_index.IsInlined() &&
       blob_index.file_number() < context_.next_file_number &&
       context_.current_blob_files.count(blob_index.file_number()) == 0) {
-    // Corresponding blob file gone (most likely, evicted by FIFO eviction).
     evicted_count_++;
     evicted_size_ += key.size() + value.size();
     return Decision::kRemove;
   }
-  if (context_.fifo_eviction_seq > 0 && blob_index.HasTTL() &&
-      blob_index.expiration() < context_.evict_expiration_up_to) {
-    // Hack: Internal key is passed to BlobIndexCompactionFilter for it to
-    // get sequence number.
-    ParsedInternalKey ikey;
-    if (!ParseInternalKey(
-             key, &ikey,
-             context_.blob_db_impl->db_options_.allow_data_in_errors)
-             .ok()) {
-      assert(false);
-      return Decision::kKeep;
-    }
-    // Remove keys that could have been remove by last FIFO eviction.
-    // If get error while parsing key, ignore and continue.
-    if (ikey.sequence < context_.fifo_eviction_seq) {
-      evicted_count_++;
-      evicted_size_ += key.size() + value.size();
-      return Decision::kRemove;
-    }
-  }
   // Apply user compaction filter for all non-TTL blob data.
   if (ucf != nullptr && !blob_index.HasTTL()) {
     // Hack: Internal key is passed to BlobIndexCompactionFilter for it to
@@ -281,8 +260,7 @@ bool BlobIndexCompactionFilterBase::CloseAndRegisterNewBlobFile() const {
     // TODO: plumb Env::IOActivity, Env::IOPriority
     s = blob_db_impl->CloseBlobFile(WriteOptions(), blob_file_);
 
-    // Note: we delay registering the new blob file until it's closed to
-    // prevent FIFO eviction from processing it during compaction/GC.
+    // Note: we delay registering the new blob file until it's closed.
     blob_db_impl->RegisterBlobFile(blob_file_);
   }
 
diff --git a/utilities/blob_db/blob_compaction_filter.h b/utilities/blob_db/blob_compaction_filter.h
index dec84937c66c..1c55a53c2460 100644
--- a/utilities/blob_db/blob_compaction_filter.h
+++ b/utilities/blob_db/blob_compaction_filter.h
@@ -21,8 +21,6 @@ struct BlobCompactionContext {
   BlobDBImpl* blob_db_impl = nullptr;
   uint64_t next_file_number = 0;
   std::unordered_set<uint64_t> current_blob_files;
-  SequenceNumber fifo_eviction_seq = 0;
-  uint64_t evict_expiration_up_to = 0;
 };
 
 struct BlobCompactionContextGC {
diff --git a/utilities/blob_db/blob_db.cc b/utilities/blob_db/blob_db.cc
index f2397f0ef272..7ee0ce1492ab 100644
--- a/utilities/blob_db/blob_db.cc
+++ b/utilities/blob_db/blob_db.cc
@@ -74,9 +74,6 @@ void BlobDBOptions::Dump(Logger* log) const {
   ROCKS_LOG_HEADER(
       log, "                             BlobDBOptions.path_relative: %d",
       path_relative);
-  ROCKS_LOG_HEADER(
-      log, "                                   BlobDBOptions.is_fifo: %d",
-      is_fifo);
   ROCKS_LOG_HEADER(
       log, "                               BlobDBOptions.max_db_size: %" PRIu64,
       max_db_size);
diff --git a/utilities/blob_db/blob_db.h b/utilities/blob_db/blob_db.h
index 3ede4c9d7f99..15c9d25a6166 100644
--- a/utilities/blob_db/blob_db.h
+++ b/utilities/blob_db/blob_db.h
@@ -35,11 +35,6 @@ struct BlobDBOptions {
   // whether the blob_dir path is relative or absolute.
   bool path_relative = true;
 
-  // When max_db_size is reached, evict blob files to free up space
-  // instead of returnning NoSpace error on write. Blob files will be
-  // evicted from oldest to newest, based on file creation time.
-  bool is_fifo = false;
-
   // Maximum size of the database (including SST files and blob files).
   //
   // Default: 0 (no limits)
diff --git a/utilities/blob_db/blob_db_impl.cc b/utilities/blob_db/blob_db_impl.cc
index d8a1be3b7246..2a8f8873f13c 100644
--- a/utilities/blob_db/blob_db_impl.cc
+++ b/utilities/blob_db/blob_db_impl.cc
@@ -81,8 +81,6 @@ BlobDBImpl::BlobDBImpl(const std::string& dbname,
       open_file_count_(0),
       total_blob_size_(0),
       live_sst_size_(0),
-      fifo_eviction_seq_(0),
-      evict_expiration_up_to_(0),
       debug_level_(0) {
   clock_ = env_->GetSystemClock().get();
   blob_dir_ = (bdb_options_.path_relative)
@@ -277,7 +275,7 @@ Status BlobDBImpl::Open(std::vector<ColumnFamilyHandle*>* handles) {
     return s;
   }
 
-  UpdateLiveSSTSize(WriteOptions(Env::IOActivity::kDBOpen));
+  UpdateLiveSSTSize();
 
   // Start background jobs.
   if (!bdb_options_.disable_background_tasks) {
@@ -591,7 +589,6 @@ bool BlobDBImpl::MarkBlobFileObsoleteIfNeeded(
   assert(blob_file->Immutable());
   assert(bdb_options_.enable_garbage_collection);
 
-  // Note: FIFO eviction could have marked this file obsolete already.
   if (blob_file->Obsolete()) {
     return true;
   }
@@ -1095,11 +1092,8 @@ Status BlobDBImpl::PutBlobValue(const WriteOptions& write_options,
     std::string headerbuf;
     BlobLogWriter::ConstructBlobHeader(&headerbuf, key, value, expiration);
 
-    // Check DB size limit before selecting blob file to
-    // Since CheckSizeAndEvictBlobFiles() can close blob files, it needs to be
-    // done before calling SelectBlobFile().
-    s = CheckSizeAndEvictBlobFiles(
-        write_options, headerbuf.size() + key.size() + value.size());
+    // Check DB size limit before selecting blob file.
+    s = CheckDbSizeLimit(headerbuf.size() + key.size() + value.size());
     if (!s.ok()) {
       return s;
     }
@@ -1187,8 +1181,6 @@ void BlobDBImpl::GetCompactionContextCommon(BlobCompactionContext* context) {
   for (auto& p : blob_files_) {
     context->current_blob_files.insert(p.first);
   }
-  context->fifo_eviction_seq = fifo_eviction_seq_;
-  context->evict_expiration_up_to = evict_expiration_up_to_;
 }
 
 void BlobDBImpl::GetCompactionContext(BlobCompactionContext* context) {
@@ -1216,7 +1208,7 @@ void BlobDBImpl::GetCompactionContext(BlobCompactionContext* context,
   }
 }
 
-void BlobDBImpl::UpdateLiveSSTSize(const WriteOptions& write_options) {
+void BlobDBImpl::UpdateLiveSSTSize() {
   uint64_t live_sst_size = 0;
   bool ok = GetIntProperty(DB::Properties::kLiveSstFilesSize, &live_sst_size);
   if (ok) {
@@ -1229,90 +1221,21 @@ void BlobDBImpl::UpdateLiveSSTSize(const WriteOptions& write_options) {
         db_options_.info_log,
         "Failed to update total SST file size after flush or compaction.");
   }
-  {
-    // Trigger FIFO eviction if needed.
-    MutexLock l(&write_mutex_);
-    Status s = CheckSizeAndEvictBlobFiles(write_options, 0, true /*force*/);
-    if (s.IsNoSpace()) {
-      ROCKS_LOG_WARN(db_options_.info_log,
-                     "DB grow out-of-space after SST size updated. Current live"
-                     " SST size: %" PRIu64
-                     " , current blob files size: %" PRIu64 ".",
-                     live_sst_size_.load(), total_blob_size_.load());
-    }
-  }
 }
 
-Status BlobDBImpl::CheckSizeAndEvictBlobFiles(const WriteOptions& write_options,
-                                              uint64_t blob_size,
-                                              bool force_evict) {
-  write_mutex_.AssertHeld();
-
-  uint64_t live_sst_size = live_sst_size_.load();
-  if (bdb_options_.max_db_size == 0 ||
-      live_sst_size + total_blob_size_.load() + blob_size <=
-          bdb_options_.max_db_size) {
+Status BlobDBImpl::CheckDbSizeLimit(uint64_t blob_size) {
+  if (bdb_options_.max_db_size == 0) {
     return Status::OK();
   }
 
-  if (bdb_options_.is_fifo == false ||
-      (!force_evict && live_sst_size + blob_size > bdb_options_.max_db_size)) {
-    // FIFO eviction is disabled, or no space to insert new blob even we evict
-    // all blob files.
-    return Status::NoSpace(
-        "Write failed, as writing it would exceed max_db_size limit.");
+  uint64_t live_sst_size = live_sst_size_.load();
+  uint64_t total_blob_size = total_blob_size_.load();
+  if (live_sst_size + total_blob_size + blob_size <= bdb_options_.max_db_size) {
+    return Status::OK();
   }
 
-  std::vector<std::shared_ptr<BlobFile>> candidate_files;
-  CopyBlobFiles(&candidate_files);
-  std::sort(candidate_files.begin(), candidate_files.end(),
-            BlobFileComparator());
-  fifo_eviction_seq_ = GetLatestSequenceNumber();
-
-  WriteLock l(&mutex_);
-
-  while (!candidate_files.empty() &&
-         live_sst_size + total_blob_size_.load() + blob_size >
-             bdb_options_.max_db_size) {
-    std::shared_ptr<BlobFile> blob_file = candidate_files.back();
-    candidate_files.pop_back();
-    WriteLock file_lock(&blob_file->mutex_);
-    if (blob_file->Obsolete()) {
-      // File already obsoleted by someone else.
-      assert(blob_file->Immutable());
-      continue;
-    }
-    // FIFO eviction can evict open blob files.
-    if (!blob_file->Immutable()) {
-      Status s = CloseBlobFile(write_options, blob_file);
-      if (!s.ok()) {
-        return s;
-      }
-    }
-    assert(blob_file->Immutable());
-    auto expiration_range = blob_file->GetExpirationRange();
-    ROCKS_LOG_INFO(db_options_.info_log,
-                   "Evict oldest blob file since DB out of space. Current "
-                   "live SST file size: %" PRIu64 ", total blob size: %" PRIu64
-                   ", max db size: %" PRIu64 ", evicted blob file #%" PRIu64
-                   ".",
-                   live_sst_size, total_blob_size_.load(),
-                   bdb_options_.max_db_size, blob_file->BlobFileNumber());
-    ObsoleteBlobFile(blob_file, fifo_eviction_seq_, true /*update_size*/);
-    evict_expiration_up_to_ = expiration_range.first;
-    RecordTick(statistics_, BLOB_DB_FIFO_NUM_FILES_EVICTED);
-    RecordTick(statistics_, BLOB_DB_FIFO_NUM_KEYS_EVICTED,
-               blob_file->BlobCount());
-    RecordTick(statistics_, BLOB_DB_FIFO_BYTES_EVICTED,
-               blob_file->GetFileSize());
-    TEST_SYNC_POINT("BlobDBImpl::EvictOldestBlobFile:Evicted");
-  }
-  if (live_sst_size + total_blob_size_.load() + blob_size >
-      bdb_options_.max_db_size) {
-    return Status::NoSpace(
-        "Write failed, as writing it would exceed max_db_size limit.");
-  }
-  return Status::OK();
+  return Status::NoSpace(
+      "Write failed, as writing it would exceed max_db_size limit.");
 }
 
 Status BlobDBImpl::AppendBlob(const WriteOptions& write_options,
diff --git a/utilities/blob_db/blob_db_impl.h b/utilities/blob_db/blob_db_impl.h
index 415f7ca6ee0c..3144268886bd 100644
--- a/utilities/blob_db/blob_db_impl.h
+++ b/utilities/blob_db/blob_db_impl.h
@@ -93,10 +93,6 @@ class BlobDBImpl : public BlobDB {
   // how often to schedule expired files eviction.
   static constexpr uint32_t kEvictExpiredFilesPeriodMillisecs = 10 * 1000;
 
-  // when should oldest file be evicted:
-  // on reaching 90% of blob_dir_size
-  static constexpr double kEvictOldestFileAtSize = 0.9;
-
   using BlobDB::Put;
   Status Put(const WriteOptions& options, const Slice& key,
              const Slice& value) override;
@@ -194,10 +190,10 @@ class BlobDBImpl : public BlobDB {
                              SequenceNumber obsolete_seq = 0,
                              bool update_size = true);
 
-  void TEST_EvictExpiredFiles();
-
   void TEST_DeleteObsoleteFiles();
 
+  void TEST_EvictExpiredFiles();
+
   uint64_t TEST_live_sst_size();
 
   const std::string& TEST_blob_dir() const { return blob_dir_; }
@@ -287,15 +283,14 @@ class BlobDBImpl : public BlobDB {
   // or GC). Check whether any snapshots exist which refer to the same.
   std::pair<bool, int64_t> DeleteObsoleteFiles(bool aborted);
 
-  // periodically check if open blob files and their TTL's has expired
-  // if expired, close the sequential writer and make the file immutable
-  std::pair<bool, int64_t> EvictExpiredFiles(bool aborted);
-
   // if the number of open files, approaches ULIMIT's this
   // task will close random readers, which are kept around for
   // efficiency
   std::pair<bool, int64_t> ReclaimOpenFiles(bool aborted);
 
+  // Evict expired blob files from the TTL queue.
+  std::pair<bool, int64_t> EvictExpiredFiles(bool aborted);
+
   std::pair<bool, int64_t> RemoveTimerQ(TimerQueue* tq, bool aborted);
 
   // Adds the background tasks to the timer queue
@@ -359,7 +354,11 @@ class BlobDBImpl : public BlobDB {
   void MarkUnreferencedBlobFilesObsolete();
   void MarkUnreferencedBlobFilesObsoleteDuringOpen();
 
-  void UpdateLiveSSTSize(const WriteOptions& write_options);
+  void UpdateLiveSSTSize();
+
+  // Check if writing blob_size bytes would exceed max_db_size limit.
+  // Returns Status::NoSpace() if limit would be exceeded.
+  Status CheckDbSizeLimit(uint64_t blob_size);
 
   Status GetBlobFileReader(const std::shared_ptr<BlobFile>& blob_file,
                            std::shared_ptr<RandomAccessFileReader>* reader);
@@ -386,14 +385,6 @@ class BlobDBImpl : public BlobDB {
 
   uint64_t EpochNow() { return clock_->NowMicros() / 1000000; }
 
-  // Check if inserting a new blob will make DB grow out of space.
-  // If is_fifo = true, FIFO eviction will be triggered to make room for the
-  // new blob. If force_evict = true, FIFO eviction will evict blob files
-  // even eviction will not make enough room for the new blob.
-  Status CheckSizeAndEvictBlobFiles(const WriteOptions& write_options,
-                                    uint64_t blob_size,
-                                    bool force_evict = false);
-
   Status CloseImpl();
 
   // name of the database directory
@@ -462,16 +453,6 @@ class BlobDBImpl : public BlobDB {
   // total size of SST files.
   std::atomic<uint64_t> live_sst_size_;
 
-  // Latest FIFO eviction timestamp
-  //
-  // REQUIRES: access with metex_ lock held.
-  uint64_t fifo_eviction_seq_;
-
-  // The expiration up to which latest FIFO eviction evicts.
-  //
-  // REQUIRES: access with metex_ lock held.
-  uint64_t evict_expiration_up_to_;
-
   std::list<std::shared_ptr<BlobFile>> obsolete_files_;
 
   // DeleteObsoleteFiles, DiableFileDeletions and EnableFileDeletions block
diff --git a/utilities/blob_db/blob_db_listener.h b/utilities/blob_db/blob_db_listener.h
index ce2ec182f5dc..822f71363391 100644
--- a/utilities/blob_db/blob_db_listener.h
+++ b/utilities/blob_db/blob_db_listener.h
@@ -27,14 +27,13 @@ class BlobDBListener : public EventListener {
 
   void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& /*info*/) override {
     assert(blob_db_impl_ != nullptr);
-    blob_db_impl_->UpdateLiveSSTSize(WriteOptions(Env::IOActivity::kFlush));
+    blob_db_impl_->UpdateLiveSSTSize();
   }
 
   void OnCompactionCompleted(DB* /*db*/,
                              const CompactionJobInfo& /*info*/) override {
     assert(blob_db_impl_ != nullptr);
-    blob_db_impl_->UpdateLiveSSTSize(
-        WriteOptions(Env::IOActivity::kCompaction));
+    blob_db_impl_->UpdateLiveSSTSize();
   }
 
   const char* Name() const override { return kClassName(); }
diff --git a/utilities/blob_db/blob_db_test.cc b/utilities/blob_db/blob_db_test.cc
index 12ac524b22d0..b694e799b038 100644
--- a/utilities/blob_db/blob_db_test.cc
+++ b/utilities/blob_db/blob_db_test.cc
@@ -852,7 +852,6 @@ TEST_F(BlobDBTest, OutOfSpace) {
   options.env = mock_env_.get();
   BlobDBOptions bdb_options;
   bdb_options.max_db_size = 200;
-  bdb_options.is_fifo = false;
   bdb_options.disable_background_tasks = true;
   Open(bdb_options);
 
@@ -868,194 +867,6 @@ TEST_F(BlobDBTest, OutOfSpace) {
   ASSERT_TRUE(s.IsNoSpace());
 }
 
-TEST_F(BlobDBTest, FIFOEviction) {
-  BlobDBOptions bdb_options;
-  bdb_options.max_db_size = 200;
-  bdb_options.blob_file_size = 100;
-  bdb_options.is_fifo = true;
-  bdb_options.disable_background_tasks = true;
-  Open(bdb_options);
-
-  std::atomic<int> evict_count{0};
-  SyncPoint::GetInstance()->SetCallBack(
-      "BlobDBImpl::EvictOldestBlobFile:Evicted", [&](void*) { evict_count++; });
-  SyncPoint::GetInstance()->EnableProcessing();
-
-  // Each stored blob has an overhead of 32 bytes currently.
-  // So a 100 byte blob should take up 132 bytes.
-  std::string value(100, 'v');
-  ASSERT_OK(blob_db_->PutWithTTL(WriteOptions(), "key1", value, 10));
-  VerifyDB({{"key1", value}});
-
-  ASSERT_EQ(1, blob_db_impl()->TEST_GetBlobFiles().size());
-
-  // Adding another 100 bytes blob would take the total size to 264 bytes
-  // (2*132). max_db_size will be exceeded
-  // than max_db_size and trigger FIFO eviction.
-  ASSERT_OK(blob_db_->PutWithTTL(WriteOptions(), "key2", value, 60));
-  ASSERT_EQ(1, evict_count);
-  // key1 will exist until corresponding file be deleted.
-  VerifyDB({{"key1", value}, {"key2", value}});
-
-  // Adding another 100 bytes blob without TTL.
-  ASSERT_OK(blob_db_->Put(WriteOptions(), "key3", value));
-  ASSERT_EQ(2, evict_count);
-  // key1 and key2 will exist until corresponding file be deleted.
-  VerifyDB({{"key1", value}, {"key2", value}, {"key3", value}});
-
-  // The fourth blob file, without TTL.
-  ASSERT_OK(blob_db_->Put(WriteOptions(), "key4", value));
-  ASSERT_EQ(3, evict_count);
-  VerifyDB(
-      {{"key1", value}, {"key2", value}, {"key3", value}, {"key4", value}});
-
-  auto blob_files = blob_db_impl()->TEST_GetBlobFiles();
-  ASSERT_EQ(4, blob_files.size());
-  ASSERT_TRUE(blob_files[0]->Obsolete());
-  ASSERT_TRUE(blob_files[1]->Obsolete());
-  ASSERT_TRUE(blob_files[2]->Obsolete());
-  ASSERT_FALSE(blob_files[3]->Obsolete());
-  auto obsolete_files = blob_db_impl()->TEST_GetObsoleteFiles();
-  ASSERT_EQ(3, obsolete_files.size());
-  ASSERT_EQ(blob_files[0], obsolete_files[0]);
-  ASSERT_EQ(blob_files[1], obsolete_files[1]);
-  ASSERT_EQ(blob_files[2], obsolete_files[2]);
-
-  blob_db_impl()->TEST_DeleteObsoleteFiles();
-  obsolete_files = blob_db_impl()->TEST_GetObsoleteFiles();
-  ASSERT_TRUE(obsolete_files.empty());
-  VerifyDB({{"key4", value}});
-}
-
-TEST_F(BlobDBTest, FIFOEviction_NoOldestFileToEvict) {
-  Options options;
-  BlobDBOptions bdb_options;
-  bdb_options.max_db_size = 1000;
-  bdb_options.blob_file_size = 5000;
-  bdb_options.is_fifo = true;
-  bdb_options.disable_background_tasks = true;
-  Open(bdb_options);
-
-  std::atomic<int> evict_count{0};
-  SyncPoint::GetInstance()->SetCallBack(
-      "BlobDBImpl::EvictOldestBlobFile:Evicted", [&](void*) { evict_count++; });
-  SyncPoint::GetInstance()->EnableProcessing();
-
-  std::string value(2000, 'v');
-  ASSERT_TRUE(Put("foo", std::string(2000, 'v')).IsNoSpace());
-  ASSERT_EQ(0, evict_count);
-}
-
-TEST_F(BlobDBTest, FIFOEviction_NoEnoughBlobFilesToEvict) {
-  BlobDBOptions bdb_options;
-  bdb_options.is_fifo = true;
-  bdb_options.min_blob_size = 100;
-  bdb_options.disable_background_tasks = true;
-  Options options;
-  // Use mock env to stop wall clock.
-  options.env = mock_env_.get();
-  options.disable_auto_compactions = true;
-  auto statistics = CreateDBStatistics();
-  options.statistics = statistics;
-  Open(bdb_options, options);
-
-  SyncPoint::GetInstance()->LoadDependency(
-      {{"DBImpl::NotifyOnFlushCompleted::PostAllOnFlushCompleted",
-        "BlobDBTest.FIFOEviction_NoEnoughBlobFilesToEvict:AfterFlush"}});
-
-  SyncPoint::GetInstance()->EnableProcessing();
-
-  ASSERT_EQ(0, blob_db_impl()->TEST_live_sst_size());
-  std::string small_value(50, 'v');
-  std::map<std::string, std::string> data;
-  // Insert some data into LSM tree to make sure FIFO eviction take SST
-  // file size into account.
-  for (int i = 0; i < 1000; i++) {
-    ASSERT_OK(Put("key" + std::to_string(i), small_value, &data));
-  }
-  ASSERT_OK(blob_db_->Flush(FlushOptions()));
-
-  uint64_t live_sst_size = 0;
-  ASSERT_TRUE(blob_db_->GetIntProperty(DB::Properties::kTotalSstFilesSize,
-                                       &live_sst_size));
-  ASSERT_TRUE(live_sst_size > 0);
-
-  TEST_SYNC_POINT(
-      "BlobDBTest.FIFOEviction_NoEnoughBlobFilesToEvict:AfterFlush");
-
-  ASSERT_EQ(live_sst_size, blob_db_impl()->TEST_live_sst_size());
-
-  bdb_options.max_db_size = live_sst_size + 2000;
-  Reopen(bdb_options, options);
-  ASSERT_EQ(live_sst_size, blob_db_impl()->TEST_live_sst_size());
-
-  std::string value_1k(1000, 'v');
-  ASSERT_OK(PutWithTTL("large_key1", value_1k, 60, &data));
-  ASSERT_EQ(0, statistics->getTickerCount(BLOB_DB_FIFO_NUM_FILES_EVICTED));
-  VerifyDB(data);
-  // large_key2 evicts large_key1
-  ASSERT_OK(PutWithTTL("large_key2", value_1k, 60, &data));
-  ASSERT_EQ(1, statistics->getTickerCount(BLOB_DB_FIFO_NUM_FILES_EVICTED));
-  blob_db_impl()->TEST_DeleteObsoleteFiles();
-  data.erase("large_key1");
-  VerifyDB(data);
-  // large_key3 get no enough space even after evicting large_key2, so it
-  // instead return no space error.
-  std::string value_2k(2000, 'v');
-  ASSERT_TRUE(PutWithTTL("large_key3", value_2k, 60).IsNoSpace());
-  ASSERT_EQ(1, statistics->getTickerCount(BLOB_DB_FIFO_NUM_FILES_EVICTED));
-  // Verify large_key2 still exists.
-  VerifyDB(data);
-
-  SyncPoint::GetInstance()->DisableProcessing();
-}
-
-// Test flush or compaction will trigger FIFO eviction since they update
-// total SST file size.
-TEST_F(BlobDBTest, FIFOEviction_TriggerOnSSTSizeChange) {
-  BlobDBOptions bdb_options;
-  bdb_options.max_db_size = 1000;
-  bdb_options.is_fifo = true;
-  bdb_options.min_blob_size = 100;
-  bdb_options.disable_background_tasks = true;
-  Options options;
-  // Use mock env to stop wall clock.
-  options.env = mock_env_.get();
-  auto statistics = CreateDBStatistics();
-  options.statistics = statistics;
-  options.compression = kNoCompression;
-  Open(bdb_options, options);
-
-  SyncPoint::GetInstance()->LoadDependency(
-      {{"DBImpl::NotifyOnFlushCompleted::PostAllOnFlushCompleted",
-        "BlobDBTest.FIFOEviction_TriggerOnSSTSizeChange:AfterFlush"}});
-
-  SyncPoint::GetInstance()->EnableProcessing();
-
-  std::string value(800, 'v');
-  ASSERT_OK(PutWithTTL("large_key", value, 60));
-  ASSERT_EQ(1, blob_db_impl()->TEST_GetBlobFiles().size());
-  ASSERT_EQ(0, statistics->getTickerCount(BLOB_DB_FIFO_NUM_FILES_EVICTED));
-  VerifyDB({{"large_key", value}});
-
-  // Insert some small keys and flush to bring DB out of space.
-  std::map<std::string, std::string> data;
-  for (int i = 0; i < 10; i++) {
-    ASSERT_OK(Put("key" + std::to_string(i), "v", &data));
-  }
-  ASSERT_OK(blob_db_->Flush(FlushOptions()));
-
-  TEST_SYNC_POINT("BlobDBTest.FIFOEviction_TriggerOnSSTSizeChange:AfterFlush");
-
-  // Verify large_key is deleted by FIFO eviction.
-  blob_db_impl()->TEST_DeleteObsoleteFiles();
-  ASSERT_EQ(0, blob_db_impl()->TEST_GetBlobFiles().size());
-  ASSERT_EQ(1, statistics->getTickerCount(BLOB_DB_FIFO_NUM_FILES_EVICTED));
-  VerifyDB(data);
-
-  SyncPoint::GetInstance()->DisableProcessing();
-}
-
 TEST_F(BlobDBTest, InlineSmallValues) {
   constexpr uint64_t kMaxExpiration = 1000;
   Random rnd(301);
@@ -1411,102 +1222,6 @@ TEST_F(BlobDBTest, FilterFileNotAvailable) {
   VerifyDB({});
 }
 
-// Test compaction filter should filter any inlined TTL keys that would have
-// been dropped by last FIFO eviction if they are store out-of-line.
-TEST_F(BlobDBTest, FilterForFIFOEviction) {
-  Random rnd(215);
-  BlobDBOptions bdb_options;
-  bdb_options.min_blob_size = 100;
-  bdb_options.ttl_range_secs = 60;
-  bdb_options.max_db_size = 0;
-  bdb_options.disable_background_tasks = true;
-  Options options;
-  // Use mock env to stop wall clock.
-  mock_clock_->SetCurrentTime(0);
-  options.env = mock_env_.get();
-  auto statistics = CreateDBStatistics();
-  options.statistics = statistics;
-  options.disable_auto_compactions = true;
-  Open(bdb_options, options);
-
-  SyncPoint::GetInstance()->LoadDependency(
-      {{"DBImpl::NotifyOnFlushCompleted::PostAllOnFlushCompleted",
-        "BlobDBTest.FilterForFIFOEviction:AfterFlush"}});
-
-  SyncPoint::GetInstance()->EnableProcessing();
-
-  std::map<std::string, std::string> data;
-  std::map<std::string, std::string> data_after_compact;
-  // Insert some small values that will be inlined.
-  for (int i = 0; i < 1000; i++) {
-    std::string key = "key" + std::to_string(i);
-    std::string value = rnd.HumanReadableString(50);
-    uint64_t ttl = rnd.Next() % 120 + 1;
-    ASSERT_OK(PutWithTTL(key, value, ttl, &data));
-    if (ttl >= 60) {
-      data_after_compact[key] = value;
-    }
-  }
-  uint64_t num_keys_to_evict = data.size() - data_after_compact.size();
-  ASSERT_OK(blob_db_->Flush(FlushOptions()));
-
-  TEST_SYNC_POINT("BlobDBTest.FilterForFIFOEviction:AfterFlush");
-
-  uint64_t live_sst_size = blob_db_impl()->TEST_live_sst_size();
-  ASSERT_GT(live_sst_size, 0);
-  VerifyDB(data);
-
-  bdb_options.max_db_size = live_sst_size + 30000;
-  bdb_options.is_fifo = true;
-  Reopen(bdb_options, options);
-  VerifyDB(data);
-
-  // Put two large values, each on a different blob file.
-  std::string large_value(10000, 'v');
-  ASSERT_OK(PutWithTTL("large_key1", large_value, 90));
-  ASSERT_OK(PutWithTTL("large_key2", large_value, 150));
-  ASSERT_EQ(2, blob_db_impl()->TEST_GetBlobFiles().size());
-  ASSERT_EQ(0, statistics->getTickerCount(BLOB_DB_FIFO_NUM_FILES_EVICTED));
-  data["large_key1"] = large_value;
-  data["large_key2"] = large_value;
-  VerifyDB(data);
-
-  // Put a third large value which will bring the DB out of space.
-  // FIFO eviction will evict the file of large_key1.
-  ASSERT_OK(PutWithTTL("large_key3", large_value, 150));
-  ASSERT_EQ(1, statistics->getTickerCount(BLOB_DB_FIFO_NUM_FILES_EVICTED));
-  ASSERT_EQ(2, blob_db_impl()->TEST_GetBlobFiles().size());
-  blob_db_impl()->TEST_DeleteObsoleteFiles();
-  ASSERT_EQ(1, blob_db_impl()->TEST_GetBlobFiles().size());
-  data.erase("large_key1");
-  data["large_key3"] = large_value;
-  VerifyDB(data);
-
-  // Putting some more small values. These values shouldn't be evicted by
-  // compaction filter since they are inserted after FIFO eviction.
-  ASSERT_OK(PutWithTTL("foo", "v", 30, &data_after_compact));
-  ASSERT_OK(PutWithTTL("bar", "v", 30, &data_after_compact));
-
-  // FIFO eviction doesn't trigger again since there enough room for the flush.
-  ASSERT_OK(blob_db_->Flush(FlushOptions()));
-  ASSERT_EQ(1, statistics->getTickerCount(BLOB_DB_FIFO_NUM_FILES_EVICTED));
-
-  // Manual compact and check if compaction filter evict those keys with
-  // expiration < 60.
-  ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
-  // All keys with expiration < 60, plus large_key1 is filtered by
-  // compaction filter.
-  ASSERT_EQ(num_keys_to_evict + 1,
-            statistics->getTickerCount(BLOB_DB_BLOB_INDEX_EVICTED_COUNT));
-  ASSERT_EQ(1, statistics->getTickerCount(BLOB_DB_FIFO_NUM_FILES_EVICTED));
-  ASSERT_EQ(1, blob_db_impl()->TEST_GetBlobFiles().size());
-  data_after_compact["large_key2"] = large_value;
-  data_after_compact["large_key3"] = large_value;
-  VerifyDB(data_after_compact);
-
-  SyncPoint::GetInstance()->DisableProcessing();
-}
-
 TEST_F(BlobDBTest, GarbageCollection) {
   constexpr size_t kNumPuts = 1 << 10;
 

From 6b5ccbbec6ba65bc8c301aa082559b0ae47d633b Mon Sep 17 00:00:00 2001
From: Maciej Szeszko <mszeszko@meta.com>
Date: Wed, 28 Jan 2026 16:08:10 -0800
Subject: [PATCH 443/500] Remove inline values support (#14270)

Summary:
Pull Request resolved: https://github.com/facebook/rocksdb/pull/14270

Legacy BlobDB's inline values feature (storing small values directly in the LSM tree via `min_blob_size` threshold) is unused in production - all
deployments use `min_blob_size = 0`. This removes the functionality entirely.

Changes:
- Remove `min_blob_size` from `BlobDBOptions`
- Remove `IsInlined()` check from compaction filter (dead code path)
- Remove inline-related statistics (`BLOB_DB_WRITE_INLINED*`)
- Remove `InlineSmallValues` test
- Update stale comments referencing inlined data

Reviewed By: xingbowang

Differential Revision: D91088985

fbshipit-source-id: ec67848ece1a7dc071ca8e8a17faebb435394733
---
 db_stress_tool/db_stress_common.h             |   1 -
 db_stress_tool/db_stress_gflags.cc            |   6 -
 db_stress_tool/db_stress_test_base.cc         |   1 -
 include/rocksdb/statistics.h                  |  12 +-
 java/rocksjni/portal.h                        |   8 +-
 .../src/main/java/org/rocksdb/TickerType.java |   8 +-
 monitoring/statistics.cc                      |   4 +-
 tools/db_bench_tool.cc                        |   7 -
 utilities/blob_db/blob_compaction_filter.cc   |  18 +--
 utilities/blob_db/blob_db.cc                  |   3 -
 utilities/blob_db/blob_db.h                   |   4 -
 utilities/blob_db/blob_db_impl.cc             | 106 +++++-------
 utilities/blob_db/blob_db_test.cc             | 151 ++----------------
 13 files changed, 71 insertions(+), 258 deletions(-)

diff --git a/db_stress_tool/db_stress_common.h b/db_stress_tool/db_stress_common.h
index edf6d918aea3..ba5e30c891c5 100644
--- a/db_stress_tool/db_stress_common.h
+++ b/db_stress_tool/db_stress_common.h
@@ -297,7 +297,6 @@ DECLARE_uint32(occ_lock_bucket_count);
 
 // Options for StackableDB-based BlobDB
 DECLARE_bool(use_blob_db);
-DECLARE_uint64(blob_db_min_blob_size);
 DECLARE_uint64(blob_db_bytes_per_sync);
 DECLARE_uint64(blob_db_file_size);
 DECLARE_bool(blob_db_enable_gc);
diff --git a/db_stress_tool/db_stress_gflags.cc b/db_stress_tool/db_stress_gflags.cc
index f543a00bcbc8..22e417de4c69 100644
--- a/db_stress_tool/db_stress_gflags.cc
+++ b/db_stress_tool/db_stress_gflags.cc
@@ -425,12 +425,6 @@ DEFINE_bool(enable_write_thread_adaptive_yield,
 // Options for StackableDB-based BlobDB
 DEFINE_bool(use_blob_db, false, "[Stacked BlobDB] Use BlobDB.");
 
-DEFINE_uint64(
-    blob_db_min_blob_size,
-    ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().min_blob_size,
-    "[Stacked BlobDB] Smallest blob to store in a file. Blobs "
-    "smaller than this will be inlined with the key in the LSM tree.");
-
 DEFINE_uint64(
     blob_db_bytes_per_sync,
     ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().bytes_per_sync,
diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc
index 2dc952de2cb7..cf7a03ecffd4 100644
--- a/db_stress_tool/db_stress_test_base.cc
+++ b/db_stress_tool/db_stress_test_base.cc
@@ -3844,7 +3844,6 @@ void StressTest::Open(SharedState* shared, bool reopen) {
         // StackableDB-based BlobDB
         if (FLAGS_use_blob_db) {
           blob_db::BlobDBOptions blob_db_options;
-          blob_db_options.min_blob_size = FLAGS_blob_db_min_blob_size;
           blob_db_options.bytes_per_sync = FLAGS_blob_db_bytes_per_sync;
           blob_db_options.blob_file_size = FLAGS_blob_db_file_size;
           blob_db_options.enable_garbage_collection = FLAGS_blob_db_enable_gc;
diff --git a/include/rocksdb/statistics.h b/include/rocksdb/statistics.h
index 66625fe2dc99..bdffbbb25a03 100644
--- a/include/rocksdb/statistics.h
+++ b/include/rocksdb/statistics.h
@@ -326,12 +326,12 @@ enum Tickers : uint32_t {
   // # of bytes (keys + value) read from BlobDB. Only applicable to legacy
   // BlobDB.
   BLOB_DB_BYTES_READ,
-  // # of keys written by BlobDB as non-TTL inlined value. Only applicable to
-  // legacy BlobDB.
-  BLOB_DB_WRITE_INLINED,
-  // # of keys written by BlobDB as TTL inlined value. Only applicable to legacy
-  // BlobDB.
-  BLOB_DB_WRITE_INLINED_TTL,
+  // Deprecated: min_blob_size is no longer configurable. Retained to avoid
+  // shifting enum values.
+  BLOB_DB_WRITE_INLINED_DEPRECATED,
+  // Deprecated: min_blob_size is no longer configurable. Retained to avoid
+  // shifting enum values.
+  BLOB_DB_WRITE_INLINED_TTL_DEPRECATED,
   // # of keys written by BlobDB as non-TTL blob value. Only applicable to
   // legacy BlobDB.
   BLOB_DB_WRITE_BLOB,
diff --git a/java/rocksjni/portal.h b/java/rocksjni/portal.h
index b19f473cc388..c0adc5eb9f49 100644
--- a/java/rocksjni/portal.h
+++ b/java/rocksjni/portal.h
@@ -5101,9 +5101,9 @@ class TickerTypeJni {
         return -0x1;
       case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BYTES_READ:
         return -0x2;
-      case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_WRITE_INLINED:
+      case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_WRITE_INLINED_DEPRECATED:
         return -0x3;
-      case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_WRITE_INLINED_TTL:
+      case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_WRITE_INLINED_TTL_DEPRECATED:
         return -0x4;
       case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_WRITE_BLOB:
         return -0x5;
@@ -5594,9 +5594,9 @@ class TickerTypeJni {
       case -0x2:
         return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_BYTES_READ;
       case -0x3:
-        return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_WRITE_INLINED;
+        return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_WRITE_INLINED_DEPRECATED;
       case -0x4:
-        return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_WRITE_INLINED_TTL;
+        return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_WRITE_INLINED_TTL_DEPRECATED;
       case -0x5:
         return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_WRITE_BLOB;
       case -0x6:
diff --git a/java/src/main/java/org/rocksdb/TickerType.java b/java/src/main/java/org/rocksdb/TickerType.java
index bf1c73a129fb..41e6b7239425 100644
--- a/java/src/main/java/org/rocksdb/TickerType.java
+++ b/java/src/main/java/org/rocksdb/TickerType.java
@@ -550,14 +550,14 @@ public enum TickerType {
     BLOB_DB_BYTES_READ((byte) -0x2),
 
     /**
-     * # of keys written by BlobDB as non-TTL inlined value.
+     * Deprecated and unused. Retained to avoid shifting enum values.
      */
-    BLOB_DB_WRITE_INLINED((byte) -0x3),
+    @Deprecated BLOB_DB_WRITE_INLINED((byte) -0x3),
 
     /**
-     * # of keys written by BlobDB as TTL inlined value.
+     * Deprecated and unused. Retained to avoid shifting enum values.
      */
-    BLOB_DB_WRITE_INLINED_TTL((byte) -0x4),
+    @Deprecated BLOB_DB_WRITE_INLINED_TTL((byte) -0x4),
 
     /**
      * # of keys written by BlobDB as non-TTL blob value.
diff --git a/monitoring/statistics.cc b/monitoring/statistics.cc
index 01b123d195e8..231e5b400288 100644
--- a/monitoring/statistics.cc
+++ b/monitoring/statistics.cc
@@ -169,8 +169,8 @@ const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
     {BLOB_DB_NUM_KEYS_READ, "rocksdb.blobdb.num.keys.read"},
     {BLOB_DB_BYTES_WRITTEN, "rocksdb.blobdb.bytes.written"},
     {BLOB_DB_BYTES_READ, "rocksdb.blobdb.bytes.read"},
-    {BLOB_DB_WRITE_INLINED, "rocksdb.blobdb.write.inlined"},
-    {BLOB_DB_WRITE_INLINED_TTL, "rocksdb.blobdb.write.inlined.ttl"},
+    {BLOB_DB_WRITE_INLINED_DEPRECATED, "rocksdb.blobdb.write.inlined"},
+    {BLOB_DB_WRITE_INLINED_TTL_DEPRECATED, "rocksdb.blobdb.write.inlined.ttl"},
     {BLOB_DB_WRITE_BLOB, "rocksdb.blobdb.write.blob"},
     {BLOB_DB_WRITE_BLOB_TTL, "rocksdb.blobdb.write.blob.ttl"},
     {BLOB_DB_BLOB_FILE_BYTES_WRITTEN, "rocksdb.blobdb.blob.file.bytes.written"},
diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc
index a46d288d5972..1ac794ca2d3b 100644
--- a/tools/db_bench_tool.cc
+++ b/tools/db_bench_tool.cc
@@ -1069,12 +1069,6 @@ DEFINE_uint64(
     ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().ttl_range_secs,
     "[Stacked BlobDB] TTL bucket size to use when creating blob files.");
 
-DEFINE_uint64(
-    blob_db_min_blob_size,
-    ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().min_blob_size,
-    "[Stacked BlobDB] Smallest blob to store in a file. Blobs "
-    "smaller than this will be inlined with the key in the LSM tree.");
-
 DEFINE_uint64(blob_db_bytes_per_sync,
               ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().bytes_per_sync,
               "[Stacked BlobDB] Bytes to sync blob file at.");
@@ -5190,7 +5184,6 @@ class Benchmark {
       blob_db_options.garbage_collection_cutoff = FLAGS_blob_db_gc_cutoff;
       blob_db_options.max_db_size = FLAGS_blob_db_max_db_size;
       blob_db_options.ttl_range_secs = FLAGS_blob_db_ttl_range_secs;
-      blob_db_options.min_blob_size = FLAGS_blob_db_min_blob_size;
       blob_db_options.bytes_per_sync = FLAGS_blob_db_bytes_per_sync;
       blob_db_options.blob_file_size = FLAGS_blob_db_file_size;
       blob_db::BlobDB* ptr = nullptr;
diff --git a/utilities/blob_db/blob_compaction_filter.cc b/utilities/blob_db/blob_compaction_filter.cc
index a87b5a614ca4..069daa0a63d3 100644
--- a/utilities/blob_db/blob_compaction_filter.cc
+++ b/utilities/blob_db/blob_compaction_filter.cc
@@ -32,7 +32,7 @@ CompactionFilter::Decision BlobIndexCompactionFilterBase::FilterV2(
     if (ucf == nullptr) {
       return Decision::kKeep;
     }
-    // Apply user compaction filter for inlined data.
+    // Apply user compaction filter for non-blob data.
     CompactionFilter::Decision decision =
         ucf->FilterV2(level, key, value_type, value, new_value, skip_until);
     if (decision == Decision::kChangeValue) {
@@ -52,8 +52,7 @@ CompactionFilter::Decision BlobIndexCompactionFilterBase::FilterV2(
     expired_size_ += key.size() + value.size();
     return Decision::kRemove;
   }
-  if (!blob_index.IsInlined() &&
-      blob_index.file_number() < context_.next_file_number &&
+  if (blob_index.file_number() < context_.next_file_number &&
       context_.current_blob_files.count(blob_index.file_number()) == 0) {
     evicted_count_++;
     evicted_size_ += key.size() + value.size();
@@ -88,13 +87,6 @@ CompactionFilter::Decision BlobIndexCompactionFilterBase::FilterV2(
 
 CompactionFilter::Decision BlobIndexCompactionFilterBase::HandleValueChange(
     const Slice& key, std::string* new_value) const {
-  BlobDBImpl* const blob_db_impl = context_.blob_db_impl;
-  assert(blob_db_impl);
-
-  if (new_value->size() < blob_db_impl->bdb_options_.min_blob_size) {
-    // Keep new_value inlined.
-    return Decision::kChangeValue;
-  }
   if (!OpenNewBlobFileIfNeeded()) {
     return Decision::kIOError;
   }
@@ -295,12 +287,6 @@ CompactionFilter::BlobDecision BlobIndexCompactionFilterGC::PrepareBlobOutput(
     return BlobDecision::kCorruption;
   }
 
-  if (blob_index.IsInlined()) {
-    gc_stats_.AddBlob(blob_index.value().size());
-
-    return BlobDecision::kKeep;
-  }
-
   gc_stats_.AddBlob(blob_index.size());
 
   if (blob_index.HasTTL()) {
diff --git a/utilities/blob_db/blob_db.cc b/utilities/blob_db/blob_db.cc
index 7ee0ce1492ab..c159b44f2b2a 100644
--- a/utilities/blob_db/blob_db.cc
+++ b/utilities/blob_db/blob_db.cc
@@ -80,9 +80,6 @@ void BlobDBOptions::Dump(Logger* log) const {
   ROCKS_LOG_HEADER(
       log, "                            BlobDBOptions.ttl_range_secs: %" PRIu64,
       ttl_range_secs);
-  ROCKS_LOG_HEADER(
-      log, "                             BlobDBOptions.min_blob_size: %" PRIu64,
-      min_blob_size);
   ROCKS_LOG_HEADER(
       log, "                            BlobDBOptions.bytes_per_sync: %" PRIu64,
       bytes_per_sync);
diff --git a/utilities/blob_db/blob_db.h b/utilities/blob_db/blob_db.h
index 15c9d25a6166..2ec143459800 100644
--- a/utilities/blob_db/blob_db.h
+++ b/utilities/blob_db/blob_db.h
@@ -48,10 +48,6 @@ struct BlobDBOptions {
   // and so on
   uint64_t ttl_range_secs = 3600;
 
-  // The smallest value to store in blob log. Values smaller than this threshold
-  // will be inlined in base DB together with the key.
-  uint64_t min_blob_size = 0;
-
   // Allows OS to incrementally sync blob files to disk for every
   // bytes_per_sync bytes written. Users shouldn't rely on it for
   // persistency guarantee.
diff --git a/utilities/blob_db/blob_db_impl.cc b/utilities/blob_db/blob_db_impl.cc
index 2a8f8873f13c..f8d80589d235 100644
--- a/utilities/blob_db/blob_db_impl.cc
+++ b/utilities/blob_db/blob_db_impl.cc
@@ -1071,69 +1071,54 @@ Status BlobDBImpl::PutBlobValue(const WriteOptions& write_options,
                                 const Slice& key, const Slice& value,
                                 uint64_t expiration, WriteBatch* batch) {
   write_mutex_.AssertHeld();
-  Status s;
-  std::string index_entry;
-  uint32_t column_family_id =
-      static_cast_with_check<ColumnFamilyHandleImpl>(DefaultColumnFamily())
-          ->GetID();
-  if (value.size() < bdb_options_.min_blob_size) {
-    if (expiration == kNoExpiration) {
-      // Put as normal value
-      s = batch->Put(key, value);
-      RecordTick(statistics_, BLOB_DB_WRITE_INLINED);
-    } else {
-      // Inlined with TTL
-      BlobIndex::EncodeInlinedTTL(&index_entry, expiration, value);
-      s = WriteBatchInternal::PutBlobIndex(batch, column_family_id, key,
-                                           index_entry);
-      RecordTick(statistics_, BLOB_DB_WRITE_INLINED_TTL);
-    }
-  } else {
-    std::string headerbuf;
-    BlobLogWriter::ConstructBlobHeader(&headerbuf, key, value, expiration);
+  std::string headerbuf;
+  BlobLogWriter::ConstructBlobHeader(&headerbuf, key, value, expiration);
 
-    // Check DB size limit before selecting blob file.
-    s = CheckDbSizeLimit(headerbuf.size() + key.size() + value.size());
-    if (!s.ok()) {
-      return s;
-    }
+  // Check DB size limit before selecting blob file.
+  Status s = CheckDbSizeLimit(headerbuf.size() + key.size() + value.size());
+  if (!s.ok()) {
+    return s;
+  }
 
-    std::shared_ptr<BlobFile> blob_file;
+  std::shared_ptr<BlobFile> blob_file;
+  if (expiration != kNoExpiration) {
+    s = SelectBlobFileTTL(write_options, expiration, &blob_file);
+  } else {
+    s = SelectBlobFile(write_options, &blob_file);
+  }
+  std::string index_entry;
+  if (s.ok()) {
+    assert(blob_file != nullptr);
+    s = AppendBlob(write_options, blob_file, headerbuf, key, value, expiration,
+                   &index_entry);
+  }
+  if (s.ok()) {
     if (expiration != kNoExpiration) {
-      s = SelectBlobFileTTL(write_options, expiration, &blob_file);
-    } else {
-      s = SelectBlobFile(write_options, &blob_file);
-    }
-    if (s.ok()) {
-      assert(blob_file != nullptr);
-      s = AppendBlob(write_options, blob_file, headerbuf, key, value,
-                     expiration, &index_entry);
-    }
-    if (s.ok()) {
-      if (expiration != kNoExpiration) {
-        WriteLock file_lock(&blob_file->mutex_);
-        blob_file->ExtendExpirationRange(expiration);
-      }
-      s = CloseBlobFileIfNeeded(write_options, blob_file);
-    }
-    if (s.ok()) {
-      s = WriteBatchInternal::PutBlobIndex(batch, column_family_id, key,
-                                           index_entry);
+      WriteLock file_lock(&blob_file->mutex_);
+      blob_file->ExtendExpirationRange(expiration);
     }
-    if (s.ok()) {
-      if (expiration == kNoExpiration) {
-        RecordTick(statistics_, BLOB_DB_WRITE_BLOB);
-      } else {
-        RecordTick(statistics_, BLOB_DB_WRITE_BLOB_TTL);
-      }
+    s = CloseBlobFileIfNeeded(write_options, blob_file);
+  }
+  if (s.ok()) {
+    const uint32_t column_family_id =
+        static_cast_with_check<ColumnFamilyHandleImpl>(DefaultColumnFamily())
+            ->GetID();
+    s = WriteBatchInternal::PutBlobIndex(batch, column_family_id, key,
+                                         index_entry);
+  }
+  if (s.ok()) {
+    if (expiration == kNoExpiration) {
+      RecordTick(statistics_, BLOB_DB_WRITE_BLOB);
     } else {
-      ROCKS_LOG_ERROR(
-          db_options_.info_log,
-          "Failed to append blob to FILE: %s: KEY: %s VALSZ: %" ROCKSDB_PRIszt
-          " status: '%s' blob_file: '%s'",
-          blob_file->PathName().c_str(), key.ToString().c_str(), value.size(),
-          s.ToString().c_str(), blob_file->DumpState().c_str());
+      RecordTick(statistics_, BLOB_DB_WRITE_BLOB_TTL);
     }
+  } else {
+    ROCKS_LOG_ERROR(
+        db_options_.info_log,
+        "Failed to append blob to FILE: %s: KEY: %s VALSZ: %" ROCKSDB_PRIszt
+        " status: '%s' blob_file: '%s'",
+        blob_file->PathName().c_str(), key.ToString().c_str(), value.size(),
+        s.ToString().c_str(), blob_file->DumpState().c_str());
   }
 
   RecordTick(statistics_, BLOB_DB_NUM_KEYS_WRITTEN);
@@ -1366,13 +1351,6 @@ Status BlobDBImpl::GetBlobValue(const Slice& key, const Slice& index_entry,
     }
   }
 
-  if (blob_index.IsInlined()) {
-    // TODO(yiwu): If index_entry is a PinnableSlice, we can also pin the same
-    // memory buffer to avoid extra copy.
-    value->PinSelf(blob_index.value());
-    return Status::OK();
-  }
-
   return GetRawBlobFromFile(key, blob_index.file_number(), blob_index.offset(),
                             blob_index.size(), value);
 }
diff --git a/utilities/blob_db/blob_db_test.cc b/utilities/blob_db/blob_db_test.cc
index b694e799b038..007c1a5f649a 100644
--- a/utilities/blob_db/blob_db_test.cc
+++ b/utilities/blob_db/blob_db_test.cc
@@ -265,10 +265,7 @@ class BlobDBTest : public testing::Test {
       BlobIndex blob_index;
       ASSERT_OK(blob_index.DecodeFrom(versions[i].value));
 
-      const uint64_t file_number = !blob_index.IsInlined()
-                                       ? blob_index.file_number()
-                                       : kInvalidBlobFileNumber;
-      ASSERT_EQ(file_number, expected_version.file_number);
+      ASSERT_EQ(blob_index.file_number(), expected_version.file_number);
 
       const uint64_t expiration =
           blob_index.HasTTL() ? blob_index.expiration() : kNoExpiration;
@@ -303,7 +300,6 @@ class BlobDBTest : public testing::Test {
 TEST_F(BlobDBTest, Put) {
   Random rnd(301);
   BlobDBOptions bdb_options;
-  bdb_options.min_blob_size = 0;
   bdb_options.disable_background_tasks = true;
   Open(bdb_options);
   std::map<std::string, std::string> data;
@@ -319,8 +315,6 @@ TEST_F(BlobDBTest, PutWithTTL) {
   options.env = mock_env_.get();
   BlobDBOptions bdb_options;
   bdb_options.ttl_range_secs = 1000;
-  bdb_options.min_blob_size = 0;
-  bdb_options.blob_file_size = 256 * 1000 * 1000;
   bdb_options.disable_background_tasks = true;
   Open(bdb_options, options);
   std::map<std::string, std::string> data;
@@ -342,7 +336,6 @@ TEST_F(BlobDBTest, PutWithTTL) {
 TEST_F(BlobDBTest, StackableDBGet) {
   Random rnd(301);
   BlobDBOptions bdb_options;
-  bdb_options.min_blob_size = 0;
   bdb_options.disable_background_tasks = true;
   Open(bdb_options);
   std::map<std::string, std::string> data;
@@ -385,7 +378,6 @@ TEST_F(BlobDBTest, GetIOError) {
   Options options;
   options.env = fault_injection_env_.get();
   BlobDBOptions bdb_options;
-  bdb_options.min_blob_size = 0;  // Make sure value write to blob file
   bdb_options.disable_background_tasks = true;
   Open(bdb_options, options);
   ColumnFamilyHandle* column_family = blob_db_->DefaultColumnFamily();
@@ -402,7 +394,6 @@ TEST_F(BlobDBTest, PutIOError) {
   Options options;
   options.env = fault_injection_env_.get();
   BlobDBOptions bdb_options;
-  bdb_options.min_blob_size = 0;  // Make sure value write to blob file
   bdb_options.disable_background_tasks = true;
   Open(bdb_options, options);
   fault_injection_env_->SetFilesystemActive(false, Status::IOError());
@@ -414,7 +405,6 @@ TEST_F(BlobDBTest, PutIOError) {
 TEST_F(BlobDBTest, WriteBatch) {
   Random rnd(301);
   BlobDBOptions bdb_options;
-  bdb_options.min_blob_size = 0;
   bdb_options.disable_background_tasks = true;
   Open(bdb_options);
   std::map<std::string, std::string> data;
@@ -433,7 +423,6 @@ TEST_F(BlobDBTest, WriteBatch) {
 TEST_F(BlobDBTest, Delete) {
   Random rnd(301);
   BlobDBOptions bdb_options;
-  bdb_options.min_blob_size = 0;
   bdb_options.disable_background_tasks = true;
   Open(bdb_options);
   std::map<std::string, std::string> data;
@@ -449,7 +438,6 @@ TEST_F(BlobDBTest, Delete) {
 TEST_F(BlobDBTest, DeleteBatch) {
   Random rnd(301);
   BlobDBOptions bdb_options;
-  bdb_options.min_blob_size = 0;
   bdb_options.disable_background_tasks = true;
   Open(bdb_options);
   for (size_t i = 0; i < 100; i++) {
@@ -467,7 +455,6 @@ TEST_F(BlobDBTest, DeleteBatch) {
 TEST_F(BlobDBTest, Override) {
   Random rnd(301);
   BlobDBOptions bdb_options;
-  bdb_options.min_blob_size = 0;
   bdb_options.disable_background_tasks = true;
   Open(bdb_options);
   std::map<std::string, std::string> data;
@@ -521,7 +508,6 @@ TEST_F(BlobDBTest, SstFileManager) {
       static_cast<SstFileManagerImpl*>(sst_file_manager.get());
 
   BlobDBOptions bdb_options;
-  bdb_options.min_blob_size = 0;
   bdb_options.enable_garbage_collection = true;
   bdb_options.garbage_collection_cutoff = 1.0;
   Options db_options;
@@ -579,7 +565,6 @@ TEST_F(BlobDBTest, SstFileManagerRestart) {
       static_cast<SstFileManagerImpl*>(sst_file_manager.get());
 
   BlobDBOptions bdb_options;
-  bdb_options.min_blob_size = 0;
   Options db_options;
 
   SyncPoint::GetInstance()->EnableProcessing();
@@ -620,7 +605,6 @@ TEST_F(BlobDBTest, SstFileManagerRestart) {
 
 TEST_F(BlobDBTest, SnapshotAndGarbageCollection) {
   BlobDBOptions bdb_options;
-  bdb_options.min_blob_size = 0;
   bdb_options.enable_garbage_collection = true;
   bdb_options.garbage_collection_cutoff = 1.0;
   bdb_options.disable_background_tasks = true;
@@ -728,7 +712,6 @@ TEST_F(BlobDBTest, GetLiveFilesMetaData) {
   bdb_options.blob_dir = "blob_dir";
   bdb_options.path_relative = true;
   bdb_options.ttl_range_secs = 10;
-  bdb_options.min_blob_size = 0;
   bdb_options.disable_background_tasks = true;
 
   Options options;
@@ -867,66 +850,13 @@ TEST_F(BlobDBTest, OutOfSpace) {
   ASSERT_TRUE(s.IsNoSpace());
 }
 
-TEST_F(BlobDBTest, InlineSmallValues) {
-  constexpr uint64_t kMaxExpiration = 1000;
-  Random rnd(301);
-  BlobDBOptions bdb_options;
-  bdb_options.ttl_range_secs = kMaxExpiration;
-  bdb_options.min_blob_size = 100;
-  bdb_options.blob_file_size = 256 * 1000 * 1000;
-  bdb_options.disable_background_tasks = true;
-  Options options;
-  options.env = mock_env_.get();
-  mock_clock_->SetCurrentTime(0);
-  Open(bdb_options, options);
-  std::map<std::string, std::string> data;
-  std::map<std::string, KeyVersion> versions;
-  for (size_t i = 0; i < 1000; i++) {
-    bool is_small_value = rnd.Next() % 2;
-    bool has_ttl = rnd.Next() % 2;
-    uint64_t ttl = rnd.Next() % kMaxExpiration;
-    int len = is_small_value ? 50 : 200;
-    std::string key = "key" + std::to_string(i);
-    std::string value = rnd.HumanReadableString(len);
-    std::string blob_index;
-    data[key] = value;
-    SequenceNumber sequence = blob_db_->GetLatestSequenceNumber() + 1;
-    if (!has_ttl) {
-      ASSERT_OK(blob_db_->Put(WriteOptions(), key, value));
-    } else {
-      ASSERT_OK(blob_db_->PutWithTTL(WriteOptions(), key, value, ttl));
-    }
-    ASSERT_EQ(blob_db_->GetLatestSequenceNumber(), sequence);
-    versions[key] =
-        KeyVersion(key, value, sequence,
-                   (is_small_value && !has_ttl) ? kTypeValue : kTypeBlobIndex);
-  }
-  VerifyDB(data);
-  VerifyBaseDB(versions);
-  auto* bdb_impl = static_cast<BlobDBImpl*>(blob_db_);
-  auto blob_files = bdb_impl->TEST_GetBlobFiles();
-  ASSERT_EQ(2, blob_files.size());
-  std::shared_ptr<BlobFile> non_ttl_file;
-  std::shared_ptr<BlobFile> ttl_file;
-  if (blob_files[0]->HasTTL()) {
-    ttl_file = blob_files[0];
-    non_ttl_file = blob_files[1];
-  } else {
-    non_ttl_file = blob_files[0];
-    ttl_file = blob_files[1];
-  }
-  ASSERT_FALSE(non_ttl_file->HasTTL());
-  ASSERT_TRUE(ttl_file->HasTTL());
-}
-
 TEST_F(BlobDBTest, UserCompactionFilter) {
   class CustomerFilter : public CompactionFilter {
    public:
     bool Filter(int /*level*/, const Slice& /*key*/, const Slice& value,
                 std::string* new_value, bool* value_changed) const override {
       *value_changed = false;
-      // changing value size to test value transitions between inlined data
-      // and stored-in-blob data
+      // Test compaction filter modifying blob values
       if (value.size() % 4 == 1) {
         *new_value = value.ToString();
         // double size by duplicating value
@@ -956,15 +886,10 @@ TEST_F(BlobDBTest, UserCompactionFilter) {
   };
 
   constexpr size_t kNumPuts = 1 << 10;
-  // Generate both inlined and blob value
   constexpr uint64_t kMinValueSize = 1 << 6;
   constexpr uint64_t kMaxValueSize = 1 << 8;
-  constexpr uint64_t kMinBlobSize = 1 << 7;
-  static_assert(kMinValueSize < kMinBlobSize);
-  static_assert(kMaxValueSize > kMinBlobSize);
 
   BlobDBOptions bdb_options;
-  bdb_options.min_blob_size = kMinBlobSize;
   bdb_options.blob_file_size = kMaxValueSize * 10;
   bdb_options.disable_background_tasks = true;
   // case_num == 0: Test user defined compaction filter
@@ -1043,7 +968,6 @@ TEST_F(BlobDBTest, UserCompactionFilter_BlobIOError) {
   constexpr int kValueSize = 100;
 
   BlobDBOptions bdb_options;
-  bdb_options.min_blob_size = 0;
   bdb_options.blob_file_size = kValueSize * 10;
   bdb_options.disable_background_tasks = true;
 
@@ -1105,11 +1029,9 @@ TEST_F(BlobDBTest, FilterExpiredBlobIndex) {
   constexpr size_t kNumPuts = 1000;
   constexpr uint64_t kMaxTTL = 1000;
   constexpr uint64_t kCompactTime = 500;
-  constexpr uint64_t kMinBlobSize = 100;
   Random rnd(301);
   mock_clock_->SetCurrentTime(0);
   BlobDBOptions bdb_options;
-  bdb_options.min_blob_size = kMinBlobSize;
   bdb_options.disable_background_tasks = true;
   Options options;
   options.env = mock_env_.get();
@@ -1118,21 +1040,13 @@ TEST_F(BlobDBTest, FilterExpiredBlobIndex) {
   std::map<std::string, std::string> data;
   std::map<std::string, std::string> data_after_compact;
   for (size_t i = 0; i < kNumPuts; i++) {
-    bool is_small_value = rnd.Next() % 2;
     bool has_ttl = rnd.Next() % 2;
     // At time 0, stored expiration equals TTL
     uint64_t ttl = rnd.Next() % kMaxTTL;
-    int len = is_small_value ? 10 : 200;
+    int len = rnd.Next() % 200 + 10;
     std::string key = "key" + std::to_string(rnd.Next() % kNumKeys);
     std::string value = rnd.HumanReadableString(len);
     if (!has_ttl) {
-      if (is_small_value) {
-        std::string blob_entry;
-        BlobIndex::EncodeInlinedTTL(&blob_entry, ttl, value);
-        // Fake blob index with TTL. See what it will do.
-        ASSERT_GT(kMinBlobSize, blob_entry.size());
-        value = blob_entry;
-      }
       ASSERT_OK(Put(key, value));
       data_after_compact[key] = value;
     } else {
@@ -1169,7 +1083,6 @@ TEST_F(BlobDBTest, FilterExpiredBlobIndex) {
 // blob file has been removed.
 TEST_F(BlobDBTest, FilterFileNotAvailable) {
   BlobDBOptions bdb_options;
-  bdb_options.min_blob_size = 0;
   bdb_options.disable_background_tasks = true;
   Options options;
   options.disable_auto_compactions = true;
@@ -1230,21 +1143,15 @@ TEST_F(BlobDBTest, GarbageCollection) {
   constexpr uint64_t kCompactTime = 500;
 
   constexpr uint64_t kKeySize = 7;  // "key" + 4 digits
-
-  constexpr uint64_t kSmallValueSize = 1 << 6;
-  constexpr uint64_t kLargeValueSize = 1 << 8;
-  constexpr uint64_t kMinBlobSize = 1 << 7;
-  static_assert(kSmallValueSize < kMinBlobSize);
-  static_assert(kLargeValueSize > kMinBlobSize);
+  constexpr uint64_t kValueSize = 1 << 8;
 
   constexpr size_t kBlobsPerFile = 8;
   constexpr size_t kNumBlobFiles = kNumPuts / kBlobsPerFile;
   constexpr uint64_t kBlobFileSize =
       BlobLogHeader::kSize +
-      (BlobLogRecord::kHeaderSize + kKeySize + kLargeValueSize) * kBlobsPerFile;
+      (BlobLogRecord::kHeaderSize + kKeySize + kValueSize) * kBlobsPerFile;
 
   BlobDBOptions bdb_options;
-  bdb_options.min_blob_size = kMinBlobSize;
   bdb_options.blob_file_size = kBlobFileSize;
   bdb_options.enable_garbage_collection = true;
   bdb_options.garbage_collection_cutoff = 0.25;
@@ -1262,14 +1169,14 @@ TEST_F(BlobDBTest, GarbageCollection) {
 
   Random rnd(301);
 
-  // Add a bunch of large non-TTL values. These will be written to non-TTL
+  // Add a bunch of non-TTL values. These will be written to non-TTL
   // blob files and will be subject to GC.
   for (size_t i = 0; i < kNumPuts; ++i) {
     std::ostringstream oss;
     oss << "key" << std::setw(4) << std::setfill('0') << i;
 
     const std::string key(oss.str());
-    const std::string value = rnd.HumanReadableString(kLargeValueSize);
+    const std::string value = rnd.HumanReadableString(kValueSize);
     const SequenceNumber sequence = blob_db_->GetLatestSequenceNumber() + 1;
 
     ASSERT_OK(Put(key, value));
@@ -1282,11 +1189,11 @@ TEST_F(BlobDBTest, GarbageCollection) {
                          sequence, kTypeBlobIndex);
   }
 
-  // Add some small and/or TTL values that will be ignored during GC.
-  // First, add a large TTL value will be written to its own TTL blob file.
+  // Add a TTL value that will be written to its own TTL blob file (ignored
+  // during GC).
   {
     const std::string key("key2000");
-    const std::string value = rnd.HumanReadableString(kLargeValueSize);
+    const std::string value = rnd.HumanReadableString(kValueSize);
     const SequenceNumber sequence = blob_db_->GetLatestSequenceNumber() + 1;
 
     ASSERT_OK(blob_db_->PutWithTTL(WriteOptions(), key, value, kTTL));
@@ -1299,37 +1206,6 @@ TEST_F(BlobDBTest, GarbageCollection) {
                          sequence, kTypeBlobIndex);
   }
 
-  // Now add a small TTL value (which will be inlined).
-  {
-    const std::string key("key3000");
-    const std::string value = rnd.HumanReadableString(kSmallValueSize);
-    const SequenceNumber sequence = blob_db_->GetLatestSequenceNumber() + 1;
-
-    ASSERT_OK(blob_db_->PutWithTTL(WriteOptions(), key, value, kTTL));
-    ASSERT_EQ(blob_db_->GetLatestSequenceNumber(), sequence);
-
-    data[key] = value;
-    blob_value_versions[key] = KeyVersion(key, value, sequence, kTypeBlobIndex);
-    blob_index_versions[key] = BlobIndexVersion(key, kInvalidBlobFileNumber,
-                                                kTTL, sequence, kTypeBlobIndex);
-  }
-
-  // Finally, add a small non-TTL value (which will be stored as a regular
-  // value).
-  {
-    const std::string key("key4000");
-    const std::string value = rnd.HumanReadableString(kSmallValueSize);
-    const SequenceNumber sequence = blob_db_->GetLatestSequenceNumber() + 1;
-
-    ASSERT_OK(Put(key, value));
-    ASSERT_EQ(blob_db_->GetLatestSequenceNumber(), sequence);
-
-    data[key] = value;
-    blob_value_versions[key] = KeyVersion(key, value, sequence, kTypeValue);
-    blob_index_versions[key] = BlobIndexVersion(
-        key, kInvalidBlobFileNumber, kNoExpiration, sequence, kTypeValue);
-  }
-
   VerifyDB(data);
   VerifyBaseDB(blob_value_versions);
   VerifyBaseDBBlobIndex(blob_index_versions);
@@ -1391,7 +1267,7 @@ TEST_F(BlobDBTest, GarbageCollection) {
   ASSERT_EQ(statistics->getTickerCount(BLOB_DB_GC_NUM_KEYS_RELOCATED),
             cutoff * kBlobsPerFile);
   ASSERT_EQ(statistics->getTickerCount(BLOB_DB_GC_BYTES_RELOCATED),
-            cutoff * kBlobsPerFile * kLargeValueSize);
+            cutoff * kBlobsPerFile * kValueSize);
 
   // At this point, we should have 128 immutable non-TTL files with file numbers
   // 33..128 and 130..161. (129 was taken by the TTL blob file.)
@@ -1413,7 +1289,6 @@ TEST_F(BlobDBTest, GarbageCollection) {
 
 TEST_F(BlobDBTest, GarbageCollectionFailure) {
   BlobDBOptions bdb_options;
-  bdb_options.min_blob_size = 0;
   bdb_options.enable_garbage_collection = true;
   bdb_options.garbage_collection_cutoff = 1.0;
   bdb_options.disable_background_tasks = true;
@@ -1460,7 +1335,6 @@ TEST_F(BlobDBTest, GarbageCollectionFailure) {
 TEST_F(BlobDBTest, EvictExpiredFile) {
   BlobDBOptions bdb_options;
   bdb_options.ttl_range_secs = 100;
-  bdb_options.min_blob_size = 0;
   bdb_options.disable_background_tasks = true;
   Options options;
   options.env = mock_env_.get();
@@ -1790,7 +1664,6 @@ TEST_F(BlobDBTest, MaintainBlobFileToSstMapping) {
 TEST_F(BlobDBTest, ShutdownWait) {
   BlobDBOptions bdb_options;
   bdb_options.ttl_range_secs = 100;
-  bdb_options.min_blob_size = 0;
   bdb_options.disable_background_tasks = false;
   Options options;
   options.env = mock_env_.get();
@@ -1854,7 +1727,6 @@ TEST_F(BlobDBTest, SyncBlobFileBeforeClose) {
   options.statistics = CreateDBStatistics();
 
   BlobDBOptions blob_options;
-  blob_options.min_blob_size = 0;
   blob_options.bytes_per_sync = 1 << 20;
   blob_options.disable_background_tasks = true;
 
@@ -1874,7 +1746,6 @@ TEST_F(BlobDBTest, SyncBlobFileBeforeCloseIOError) {
   options.env = fault_injection_env_.get();
 
   BlobDBOptions blob_options;
-  blob_options.min_blob_size = 0;
   blob_options.bytes_per_sync = 1 << 20;
   blob_options.disable_background_tasks = true;
 

From 148f6c98455322edc472878b4881116d9b941338 Mon Sep 17 00:00:00 2001
From: Maciej Szeszko <mszeszko@meta.com>
Date: Wed, 28 Jan 2026 16:58:23 -0800
Subject: [PATCH 444/500] Replace GC cutoff threshold with a constant (#14272)

Summary:
Pull Request resolved: https://github.com/facebook/rocksdb/pull/14272

All production deployments have the hardcoded (non-configurable) default for`garbage_collection_cutoff = 0.25`. This change removes the configurable option and replaces it with a fixed constant `kGarbageCollectionCutoff = 0.25`, simplifying the configuration surface.

Changes:
- Remove `garbage_collection_cutoff` from `BlobDBOptions`
- Add `kGarbageCollectionCutoff` constant (0.25) in blob_db_impl.cc
- Remove `--blob_db_gc_cutoff` flag from db_bench tool and db_stress
- Update tests to work with the fixed cutoff value

Reviewed By: xingbowang

Differential Revision: D91088998

fbshipit-source-id: 820fc7f1ad4c3fe8a15f22a92cd53fb96c56c6e1
---
 db_stress_tool/db_stress_common.h     |   1 -
 db_stress_tool/db_stress_gflags.cc    |   5 -
 db_stress_tool/db_stress_test_base.cc |   1 -
 tools/db_bench_tool.cc                |   6 --
 utilities/blob_db/blob_db.cc          |   3 -
 utilities/blob_db/blob_db.h           |   5 -
 utilities/blob_db/blob_db_impl.cc     |  15 ++-
 utilities/blob_db/blob_db_test.cc     | 127 +++++++++++++++++++-------
 8 files changed, 101 insertions(+), 62 deletions(-)

diff --git a/db_stress_tool/db_stress_common.h b/db_stress_tool/db_stress_common.h
index ba5e30c891c5..e857c64d63a9 100644
--- a/db_stress_tool/db_stress_common.h
+++ b/db_stress_tool/db_stress_common.h
@@ -300,7 +300,6 @@ DECLARE_bool(use_blob_db);
 DECLARE_uint64(blob_db_bytes_per_sync);
 DECLARE_uint64(blob_db_file_size);
 DECLARE_bool(blob_db_enable_gc);
-DECLARE_double(blob_db_gc_cutoff);
 
 // Options for integrated BlobDB
 DECLARE_bool(allow_setting_blob_options_dynamically);
diff --git a/db_stress_tool/db_stress_gflags.cc b/db_stress_tool/db_stress_gflags.cc
index 22e417de4c69..49e51bf77136 100644
--- a/db_stress_tool/db_stress_gflags.cc
+++ b/db_stress_tool/db_stress_gflags.cc
@@ -439,11 +439,6 @@ DEFINE_bool(
     ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().enable_garbage_collection,
     "[Stacked BlobDB] Enable BlobDB garbage collection.");
 
-DEFINE_double(
-    blob_db_gc_cutoff,
-    ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().garbage_collection_cutoff,
-    "[Stacked BlobDB] Cutoff ratio for BlobDB garbage collection.");
-
 // Options for integrated BlobDB
 DEFINE_bool(allow_setting_blob_options_dynamically, false,
             "[Integrated BlobDB] Allow setting blob options dynamically.");
diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc
index cf7a03ecffd4..593cdceb9932 100644
--- a/db_stress_tool/db_stress_test_base.cc
+++ b/db_stress_tool/db_stress_test_base.cc
@@ -3847,7 +3847,6 @@ void StressTest::Open(SharedState* shared, bool reopen) {
           blob_db_options.bytes_per_sync = FLAGS_blob_db_bytes_per_sync;
           blob_db_options.blob_file_size = FLAGS_blob_db_file_size;
           blob_db_options.enable_garbage_collection = FLAGS_blob_db_enable_gc;
-          blob_db_options.garbage_collection_cutoff = FLAGS_blob_db_gc_cutoff;
 
           blob_db::BlobDB* blob_db = nullptr;
           s = blob_db::BlobDB::Open(options_, blob_db_options, FLAGS_db,
diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc
index 1ac794ca2d3b..3c6fefa98561 100644
--- a/tools/db_bench_tool.cc
+++ b/tools/db_bench_tool.cc
@@ -1050,11 +1050,6 @@ DEFINE_bool(
     ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().enable_garbage_collection,
     "[Stacked BlobDB] Enable BlobDB garbage collection.");
 
-DEFINE_double(
-    blob_db_gc_cutoff,
-    ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().garbage_collection_cutoff,
-    "[Stacked BlobDB] Cutoff ratio for BlobDB garbage collection.");
-
 DEFINE_uint64(blob_db_max_db_size,
               ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().max_db_size,
               "[Stacked BlobDB] Max size limit of the directory where blob "
@@ -5181,7 +5176,6 @@ class Benchmark {
       // Stacked BlobDB
       blob_db::BlobDBOptions blob_db_options;
       blob_db_options.enable_garbage_collection = FLAGS_blob_db_enable_gc;
-      blob_db_options.garbage_collection_cutoff = FLAGS_blob_db_gc_cutoff;
       blob_db_options.max_db_size = FLAGS_blob_db_max_db_size;
       blob_db_options.ttl_range_secs = FLAGS_blob_db_ttl_range_secs;
       blob_db_options.bytes_per_sync = FLAGS_blob_db_bytes_per_sync;
diff --git a/utilities/blob_db/blob_db.cc b/utilities/blob_db/blob_db.cc
index c159b44f2b2a..df1a65dad37d 100644
--- a/utilities/blob_db/blob_db.cc
+++ b/utilities/blob_db/blob_db.cc
@@ -89,9 +89,6 @@ void BlobDBOptions::Dump(Logger* log) const {
   ROCKS_LOG_HEADER(
       log, "                 BlobDBOptions.enable_garbage_collection: %d",
       enable_garbage_collection);
-  ROCKS_LOG_HEADER(
-      log, "                 BlobDBOptions.garbage_collection_cutoff: %f",
-      garbage_collection_cutoff);
   ROCKS_LOG_HEADER(
       log, "                  BlobDBOptions.disable_background_tasks: %d",
       disable_background_tasks);
diff --git a/utilities/blob_db/blob_db.h b/utilities/blob_db/blob_db.h
index 2ec143459800..8111543be89e 100644
--- a/utilities/blob_db/blob_db.h
+++ b/utilities/blob_db/blob_db.h
@@ -61,11 +61,6 @@ struct BlobDBOptions {
   // by rewriting the remaining live blobs to new files.
   bool enable_garbage_collection = false;
 
-  // The cutoff in terms of blob file age for garbage collection. Blobs in
-  // the oldest N non-TTL blob files will be rewritten when encountered during
-  // compaction, where N = garbage_collection_cutoff * number_of_non_TTL_files.
-  double garbage_collection_cutoff = 0.25;
-
   // Disable all background job. Used for test only.
   bool disable_background_tasks = false;
 
diff --git a/utilities/blob_db/blob_db_impl.cc b/utilities/blob_db/blob_db_impl.cc
index f8d80589d235..72c53e235978 100644
--- a/utilities/blob_db/blob_db_impl.cc
+++ b/utilities/blob_db/blob_db_impl.cc
@@ -43,6 +43,11 @@
 
 namespace ROCKSDB_NAMESPACE::blob_db {
 
+// The cutoff in terms of blob file age for garbage collection. Blobs in the
+// oldest N non-TTL blob files will be rewritten when encountered during
+// compaction, where N = kGarbageCollectionCutoff * number_of_non_TTL_files.
+constexpr double kGarbageCollectionCutoff = 0.25;
+
 bool BlobFileComparator::operator()(
     const std::shared_ptr<BlobFile>& lhs,
     const std::shared_ptr<BlobFile>& rhs) const {
@@ -138,12 +143,6 @@ Status BlobDBImpl::Open(std::vector<ColumnFamilyHandle*>* handles) {
     return Status::NotSupported("No blob directory in options");
   }
 
-  if (bdb_options_.garbage_collection_cutoff < 0.0 ||
-      bdb_options_.garbage_collection_cutoff > 1.0) {
-    return Status::InvalidArgument(
-        "Garbage collection cutoff must be in the interval [0.0, 1.0]");
-  }
-
   // Temporarily disable compactions in the base DB during open; save the user
   // defined value beforehand so we can restore it once BlobDB is initialized.
   // Note: this is only needed if garbage collection is enabled.
@@ -1185,8 +1184,8 @@ void BlobDBImpl::GetCompactionContext(BlobCompactionContext* context,
 
   if (!live_imm_non_ttl_blob_files_.empty()) {
     auto it = live_imm_non_ttl_blob_files_.begin();
-    std::advance(it, bdb_options_.garbage_collection_cutoff *
-                         live_imm_non_ttl_blob_files_.size());
+    std::advance(
+        it, kGarbageCollectionCutoff * live_imm_non_ttl_blob_files_.size());
     context_gc->cutoff_file_number = it != live_imm_non_ttl_blob_files_.end()
                                          ? it->first
                                          : std::numeric_limits<uint64_t>::max();
diff --git a/utilities/blob_db/blob_db_test.cc b/utilities/blob_db/blob_db_test.cc
index 007c1a5f649a..4f2122bcb307 100644
--- a/utilities/blob_db/blob_db_test.cc
+++ b/utilities/blob_db/blob_db_test.cc
@@ -509,7 +509,6 @@ TEST_F(BlobDBTest, SstFileManager) {
 
   BlobDBOptions bdb_options;
   bdb_options.enable_garbage_collection = true;
-  bdb_options.garbage_collection_cutoff = 1.0;
   Options db_options;
 
   int files_scheduled_to_delete = 0;
@@ -527,12 +526,22 @@ TEST_F(BlobDBTest, SstFileManager) {
 
   Open(bdb_options, db_options);
 
-  // Create one obselete file and clean it.
+  // Create 4 blob files. With GC cutoff of 0.25, the oldest file (file 1)
+  // will be in the GC zone: floor(0.25 * 4) = 1.
   ASSERT_OK(blob_db_->Put(WriteOptions(), "foo", "bar"));
   auto blob_files = blob_db_impl()->TEST_GetBlobFiles();
   ASSERT_EQ(1, blob_files.size());
   std::shared_ptr<BlobFile> bfile = blob_files[0];
   ASSERT_OK(blob_db_impl()->TEST_CloseBlobFile(bfile));
+
+  // Create 3 more blob files (files 2-4, outside GC zone).
+  for (int i = 1; i < 4; i++) {
+    ASSERT_OK(blob_db_->Put(WriteOptions(), "key" + std::to_string(i), "val"));
+    blob_files = blob_db_impl()->TEST_GetBlobFiles();
+    ASSERT_EQ(static_cast<size_t>(i + 1), blob_files.size());
+    ASSERT_OK(blob_db_impl()->TEST_CloseBlobFile(blob_files[i]));
+  }
+
   ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
   blob_db_impl()->TEST_DeleteObsoleteFiles();
 
@@ -540,7 +549,8 @@ TEST_F(BlobDBTest, SstFileManager) {
   ASSERT_EQ(1, files_scheduled_to_delete);
   Destroy();
   // Make sure that DestroyBlobDB() also goes through delete scheduler.
-  ASSERT_EQ(2, files_scheduled_to_delete);
+  // Remaining files: 3 original (files 2-4) + 1 GC output file = 4 files.
+  ASSERT_EQ(5, files_scheduled_to_delete);
   SyncPoint::GetInstance()->DisableProcessing();
   sfm->WaitForEmptyTrash();
 }
@@ -606,20 +616,27 @@ TEST_F(BlobDBTest, SstFileManagerRestart) {
 TEST_F(BlobDBTest, SnapshotAndGarbageCollection) {
   BlobDBOptions bdb_options;
   bdb_options.enable_garbage_collection = true;
-  bdb_options.garbage_collection_cutoff = 1.0;
   bdb_options.disable_background_tasks = true;
 
   Options options;
   options.disable_auto_compactions = true;
 
-  // i = when to take snapshot
+  // This test verifies that snapshots protect blob files from deletion during
+  // garbage collection. With fixed GC cutoff of 0.25 and 8 immutable files,
+  // floor(0.25 * 8) = 2 files are in the GC zone (files 1 and 2).
+  //
+  // We run 4 iterations with different snapshot timing:
+  //   i=0: snapshot after key1 (before key2) - protects file 1
+  //   i=1: snapshot after key2 (before key3) - protects files 1 and 2
+  //   i=2: snapshot after key9 (after all keys) - no protection needed
+  //   i=3: snapshot after Delete(key2) - no protection needed
   for (int i = 0; i < 4; i++) {
     Destroy();
     Open(bdb_options, options);
 
     const Snapshot* snapshot = nullptr;
 
-    // First file
+    // Create first blob file (will be in GC zone).
     ASSERT_OK(Put("key1", "value"));
     if (i == 0) {
       snapshot = blob_db_->GetSnapshot();
@@ -629,7 +646,8 @@ TEST_F(BlobDBTest, SnapshotAndGarbageCollection) {
     ASSERT_EQ(1, blob_files.size());
     ASSERT_OK(blob_db_impl()->TEST_CloseBlobFile(blob_files[0]));
 
-    // Second file
+    // Create second blob file (will be in GC zone). We track this file
+    // to verify it becomes obsolete after GC relocates its blob.
     ASSERT_OK(Put("key2", "value"));
     if (i == 1) {
       snapshot = blob_db_->GetSnapshot();
@@ -637,39 +655,66 @@ TEST_F(BlobDBTest, SnapshotAndGarbageCollection) {
 
     blob_files = blob_db_impl()->TEST_GetBlobFiles();
     ASSERT_EQ(2, blob_files.size());
-    auto bfile = blob_files[1];
-    ASSERT_FALSE(bfile->Immutable());
-    ASSERT_OK(blob_db_impl()->TEST_CloseBlobFile(bfile));
+    auto gc_target_file = blob_files[1];
+    ASSERT_FALSE(gc_target_file->Immutable());
+    ASSERT_OK(blob_db_impl()->TEST_CloseBlobFile(gc_target_file));
+
+    // Create files 3-8, all closed (these are outside GC zone).
+    for (int j = 3; j <= 8; j++) {
+      ASSERT_OK(Put("key" + std::to_string(j), "value"));
+      blob_files = blob_db_impl()->TEST_GetBlobFiles();
+      ASSERT_EQ(static_cast<size_t>(j), blob_files.size());
+      ASSERT_OK(blob_db_impl()->TEST_CloseBlobFile(blob_files[j - 1]));
+    }
 
-    // Third file
-    ASSERT_OK(Put("key3", "value"));
+    // Create file 9 but leave it open (mutable). Only immutable files are
+    // counted for GC cutoff calculation.
+    ASSERT_OK(Put("key9", "value"));
     if (i == 2) {
       snapshot = blob_db_->GetSnapshot();
     }
 
+    // Verify we have 9 total files (8 immutable + 1 mutable).
+    blob_files = blob_db_impl()->TEST_GetBlobFiles();
+    ASSERT_EQ(9, blob_files.size());
+
+    // Trigger GC via compaction. Blobs in files 1 and 2 will be relocated
+    // to a new GC output file.
     ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
-    ASSERT_TRUE(bfile->Obsolete());
+
+    // Verify gc_target_file (file 2) is now obsolete.
+    ASSERT_TRUE(gc_target_file->Obsolete());
+    // Verify the obsolete sequence matches the latest sequence number.
     ASSERT_EQ(blob_db_->GetLatestSequenceNumber(),
-              bfile->GetObsoleteSequence());
+              gc_target_file->GetObsoleteSequence());
 
     Delete("key2");
     if (i == 3) {
       snapshot = blob_db_->GetSnapshot();
     }
 
-    ASSERT_EQ(4, blob_db_impl()->TEST_GetBlobFiles().size());
+    // Verify we now have 10 files (9 original + 1 GC output file).
+    // Files 1 and 2 are obsolete but not yet deleted.
+    ASSERT_EQ(10, blob_db_impl()->TEST_GetBlobFiles().size());
     blob_db_impl()->TEST_DeleteObsoleteFiles();
 
     if (i >= 2) {
-      // The snapshot shouldn't see data in bfile
-      ASSERT_EQ(2, blob_db_impl()->TEST_GetBlobFiles().size());
+      // Snapshot was taken after all keys were written, so it sees the
+      // post-compaction state where blob indexes point to the GC output file.
+      // Obsolete files 1 and 2 can be deleted immediately.
+      // Verify 8 files remain (10 - 2 obsolete files deleted).
+      ASSERT_EQ(8, blob_db_impl()->TEST_GetBlobFiles().size());
       blob_db_->ReleaseSnapshot(snapshot);
     } else {
-      // The snapshot will see data in bfile, so the file shouldn't be deleted
-      ASSERT_EQ(4, blob_db_impl()->TEST_GetBlobFiles().size());
+      // Snapshot was taken before compaction completed, so it may still
+      // reference blobs in the obsolete files. Files cannot be deleted.
+      // Verify all 10 files still exist.
+      ASSERT_EQ(10, blob_db_impl()->TEST_GetBlobFiles().size());
       blob_db_->ReleaseSnapshot(snapshot);
       blob_db_impl()->TEST_DeleteObsoleteFiles();
-      ASSERT_EQ(2, blob_db_impl()->TEST_GetBlobFiles().size());
+      // After releasing the snapshot, obsolete files can be deleted.
+      // Verify 8 files remain.
+      ASSERT_EQ(8, blob_db_impl()->TEST_GetBlobFiles().size());
     }
   }
 }
@@ -1154,7 +1199,6 @@ TEST_F(BlobDBTest, GarbageCollection) {
   BlobDBOptions bdb_options;
   bdb_options.blob_file_size = kBlobFileSize;
   bdb_options.enable_garbage_collection = true;
-  bdb_options.garbage_collection_cutoff = 0.25;
   bdb_options.disable_background_tasks = true;
 
   Options options;
@@ -1238,8 +1282,9 @@ TEST_F(BlobDBTest, GarbageCollection) {
 
   VerifyBaseDB(blob_value_versions);
 
-  const uint64_t cutoff = static_cast<uint64_t>(
-      bdb_options.garbage_collection_cutoff * kNumBlobFiles);
+  // GC cutoff is fixed at 0.25
+  constexpr double kGCCutoff = 0.25;
+  const uint64_t cutoff = static_cast<uint64_t>(kGCCutoff * kNumBlobFiles);
   for (auto& pair : blob_index_versions) {
     BlobIndexVersion& version = pair.second;
 
@@ -1290,7 +1335,6 @@ TEST_F(BlobDBTest, GarbageCollection) {
 TEST_F(BlobDBTest, GarbageCollectionFailure) {
   BlobDBOptions bdb_options;
   bdb_options.enable_garbage_collection = true;
-  bdb_options.garbage_collection_cutoff = 1.0;
   bdb_options.disable_background_tasks = true;
 
   Options db_options;
@@ -1298,14 +1342,31 @@ TEST_F(BlobDBTest, GarbageCollectionFailure) {
 
   Open(bdb_options, db_options);
 
-  // Write a couple of valid blobs.
+  // Create 4 blob files. With fixed GC cutoff of 0.25, the oldest file
+  // (floor(0.25 * 4) = 1) will be in the GC zone.
+  // The first file contains valid blobs for "foo" and "dead".
   ASSERT_OK(Put("foo", "bar"));
   ASSERT_OK(Put("dead", "beef"));
 
-  // Write a fake blob reference into the base DB that points to a non-existing
-  // blob file.
+  auto blob_files = blob_db_impl()->TEST_GetBlobFiles();
+  ASSERT_EQ(blob_files.size(), 1);
+  auto first_file = blob_files[0];
+  uint64_t first_file_number = first_file->BlobFileNumber();
+  ASSERT_OK(blob_db_impl()->TEST_CloseBlobFile(first_file));
+
+  // Create 3 more blob files (files 2-4, outside GC zone).
+  for (int i = 1; i < 4; i++) {
+    ASSERT_OK(Put("key" + std::to_string(i), "value"));
+    blob_files = blob_db_impl()->TEST_GetBlobFiles();
+    ASSERT_EQ(static_cast<size_t>(i + 1), blob_files.size());
+    ASSERT_OK(blob_db_impl()->TEST_CloseBlobFile(blob_files[i]));
+  }
+
+  // Write a fake blob index that points to the first file (in GC zone)
+  // but with an invalid offset beyond the file size. This will cause
+  // GC to fail when it tries to read this blob.
   std::string blob_index;
-  BlobIndex::EncodeBlob(&blob_index, /* file_number */ 1000, /* offset */ 1234,
+  BlobIndex::EncodeBlob(&blob_index, first_file_number, /* offset */ 999999,
                         /* size */ 5678, kNoCompression);
 
   WriteBatch batch;
@@ -1313,17 +1374,17 @@ TEST_F(BlobDBTest, GarbageCollectionFailure) {
       &batch, blob_db_->DefaultColumnFamily()->GetID(), "key", blob_index));
   ASSERT_OK(blob_db_->GetRootDB()->Write(WriteOptions(), &batch));
 
-  auto blob_files = blob_db_impl()->TEST_GetBlobFiles();
-  ASSERT_EQ(blob_files.size(), 1);
-  auto blob_file = blob_files[0];
-  ASSERT_OK(blob_db_impl()->TEST_CloseBlobFile(blob_file));
-
+  // Verify compaction fails with IO error due to invalid blob offset.
   ASSERT_TRUE(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)
                   .IsIOError());
 
   const Statistics* const statistics = db_options.statistics.get();
   assert(statistics);
 
+  // Verify GC statistics:
+  // - Relocated 2 keys ("foo" and "dead") with 7 bytes ("bar" + "beef")
+  // - Failed on "key" which has invalid blob offset
+  // - Created 1 new GC output file before failing
   ASSERT_EQ(statistics->getTickerCount(BLOB_DB_GC_NUM_FILES), 0);
   ASSERT_EQ(statistics->getTickerCount(BLOB_DB_GC_NUM_NEW_FILES), 1);
   ASSERT_EQ(statistics->getTickerCount(BLOB_DB_GC_FAILURES), 1);

From 56d40be243f374a44139298d5d066ddbbaf352cf Mon Sep 17 00:00:00 2001
From: Maciej Szeszko <mszeszko@meta.com>
Date: Wed, 28 Jan 2026 18:03:05 -0800
Subject: [PATCH 445/500] Remove path_relative config option (#14273)

Summary:
Pull Request resolved: https://github.com/facebook/rocksdb/pull/14273

The `path_relative` option in `BlobDBOptions` was never used in practice - all
production deployments use the default value of `true` (relative path). The
absolute path mode (`path_relative = false`) was essentially unsupported:
- `GetLiveFiles()` returned `NotSupported` for absolute paths
- `GetLiveFilesMetaData()` had an assertion that would crash for absolute paths

This change removes the option and simplifies the code to always use relative
paths for the blob directory.

Changes:
- Remove `path_relative` field from `BlobDBOptions`
- Simplify `blob_dir_` construction in `BlobDBImpl` constructor
- Simplify path construction in `DestroyBlobDB()`
- Remove `NotSupported` check in `GetLiveFiles()`
- Remove assertion in `GetLiveFilesMetaData()`
- Remove logging of `path_relative` in `Dump()`
- Remove redundant `path_relative = true` in tests

Reviewed By: xingbowang

Differential Revision: D91089016

fbshipit-source-id: 947b129e405a315b94ac73bc48b23103ba12d73b
---
 utilities/blob_db/blob_db.cc                   | 3 ---
 utilities/blob_db/blob_db.h                    | 3 ---
 utilities/blob_db/blob_db_impl.cc              | 8 ++------
 utilities/blob_db/blob_db_impl_filesnapshot.cc | 6 ------
 utilities/blob_db/blob_db_test.cc              | 1 -
 5 files changed, 2 insertions(+), 19 deletions(-)

diff --git a/utilities/blob_db/blob_db.cc b/utilities/blob_db/blob_db.cc
index df1a65dad37d..3ddf650608a7 100644
--- a/utilities/blob_db/blob_db.cc
+++ b/utilities/blob_db/blob_db.cc
@@ -71,9 +71,6 @@ void BlobDBOptions::Dump(Logger* log) const {
   ROCKS_LOG_HEADER(
       log, "                                  BlobDBOptions.blob_dir: %s",
       blob_dir.c_str());
-  ROCKS_LOG_HEADER(
-      log, "                             BlobDBOptions.path_relative: %d",
-      path_relative);
   ROCKS_LOG_HEADER(
       log, "                               BlobDBOptions.max_db_size: %" PRIu64,
       max_db_size);
diff --git a/utilities/blob_db/blob_db.h b/utilities/blob_db/blob_db.h
index 8111543be89e..168a7399e7ed 100644
--- a/utilities/blob_db/blob_db.h
+++ b/utilities/blob_db/blob_db.h
@@ -32,9 +32,6 @@ struct BlobDBOptions {
   // Default is "blob_dir"
   std::string blob_dir = "blob_dir";
 
-  // whether the blob_dir path is relative or absolute.
-  bool path_relative = true;
-
   // Maximum size of the database (including SST files and blob files).
   //
   // Default: 0 (no limits)
diff --git a/utilities/blob_db/blob_db_impl.cc b/utilities/blob_db/blob_db_impl.cc
index 72c53e235978..1a256e1bd02b 100644
--- a/utilities/blob_db/blob_db_impl.cc
+++ b/utilities/blob_db/blob_db_impl.cc
@@ -88,9 +88,7 @@ BlobDBImpl::BlobDBImpl(const std::string& dbname,
       live_sst_size_(0),
       debug_level_(0) {
   clock_ = env_->GetSystemClock().get();
-  blob_dir_ = (bdb_options_.path_relative)
-                  ? dbname + "/" + bdb_options_.blob_dir
-                  : bdb_options_.blob_dir;
+  blob_dir_ = dbname + "/" + bdb_options_.blob_dir;
   file_options_.bytes_per_sync = blob_db_options.bytes_per_sync;
 }
 
@@ -1977,9 +1975,7 @@ Status DestroyBlobDB(const std::string& dbname, const Options& options,
   Env* env = soptions.env;
 
   Status status;
-  std::string blobdir;
-  blobdir = (bdb_options.path_relative) ? dbname + "/" + bdb_options.blob_dir
-                                        : bdb_options.blob_dir;
+  std::string blobdir = dbname + "/" + bdb_options.blob_dir;
 
   std::vector<std::string> filenames;
   if (env->GetChildren(blobdir, &filenames).ok()) {
diff --git a/utilities/blob_db/blob_db_impl_filesnapshot.cc b/utilities/blob_db/blob_db_impl_filesnapshot.cc
index 250297404570..85f29d96aa29 100644
--- a/utilities/blob_db/blob_db_impl_filesnapshot.cc
+++ b/utilities/blob_db/blob_db_impl_filesnapshot.cc
@@ -59,10 +59,6 @@ Status BlobDBImpl::EnableFileDeletions() {
 Status BlobDBImpl::GetLiveFiles(std::vector<std::string>& ret,
                                 uint64_t* manifest_file_size,
                                 bool flush_memtable) {
-  if (!bdb_options_.path_relative) {
-    return Status::NotSupported(
-        "Not able to get relative blob file path from absolute blob_dir.");
-  }
   // Hold a lock in the beginning to avoid updates to base DB during the call
   ReadLock rl(&mutex_);
   Status s = db_->GetLiveFiles(ret, manifest_file_size, flush_memtable);
@@ -80,8 +76,6 @@ Status BlobDBImpl::GetLiveFiles(std::vector<std::string>& ret,
 }
 
 void BlobDBImpl::GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata) {
-  // Path should be relative to db_name.
-  assert(bdb_options_.path_relative);
   // Hold a lock in the beginning to avoid updates to base DB during the call
   ReadLock rl(&mutex_);
   db_->GetLiveFilesMetaData(metadata);
diff --git a/utilities/blob_db/blob_db_test.cc b/utilities/blob_db/blob_db_test.cc
index 4f2122bcb307..51e3557716f5 100644
--- a/utilities/blob_db/blob_db_test.cc
+++ b/utilities/blob_db/blob_db_test.cc
@@ -755,7 +755,6 @@ TEST_F(BlobDBTest, GetLiveFilesMetaData) {
 
   BlobDBOptions bdb_options;
   bdb_options.blob_dir = "blob_dir";
-  bdb_options.path_relative = true;
   bdb_options.ttl_range_secs = 10;
   bdb_options.disable_background_tasks = true;
 

From 053b0d54dcf7d58236da3d1a182be6775668bf1b Mon Sep 17 00:00:00 2001
From: Maciej Szeszko <mszeszko@meta.com>
Date: Wed, 28 Jan 2026 22:05:33 -0800
Subject: [PATCH 446/500] Remove blob_dir config option (#14275)

Summary:
Pull Request resolved: https://github.com/facebook/rocksdb/pull/14275

No one ever sets `blob_dir` to a non-default value. Replace the configurable `blob_dir` option with a constant `kBlobDirName`. This simplifies the code and further reduces the configurability surface.

Reviewed By: xingbowang

Differential Revision: D91089039

fbshipit-source-id: 7d82e86415cc4bc89a7fe1399c29d4cc3058d1de
---
 utilities/blob_db/blob_db.cc                   | 3 ---
 utilities/blob_db/blob_db.h                    | 7 ++-----
 utilities/blob_db/blob_db_impl.cc              | 6 +++---
 utilities/blob_db/blob_db_impl_filesnapshot.cc | 4 ++--
 utilities/blob_db/blob_db_test.cc              | 1 -
 5 files changed, 7 insertions(+), 14 deletions(-)

diff --git a/utilities/blob_db/blob_db.cc b/utilities/blob_db/blob_db.cc
index 3ddf650608a7..9b0073483d7f 100644
--- a/utilities/blob_db/blob_db.cc
+++ b/utilities/blob_db/blob_db.cc
@@ -68,9 +68,6 @@ Status BlobDB::Open(const DBOptions& db_options,
 BlobDB::BlobDB() : StackableDB(nullptr) {}
 
 void BlobDBOptions::Dump(Logger* log) const {
-  ROCKS_LOG_HEADER(
-      log, "                                  BlobDBOptions.blob_dir: %s",
-      blob_dir.c_str());
   ROCKS_LOG_HEADER(
       log, "                               BlobDBOptions.max_db_size: %" PRIu64,
       max_db_size);
diff --git a/utilities/blob_db/blob_db.h b/utilities/blob_db/blob_db.h
index 168a7399e7ed..49b4f97a2f4f 100644
--- a/utilities/blob_db/blob_db.h
+++ b/utilities/blob_db/blob_db.h
@@ -25,13 +25,10 @@ namespace blob_db {
 // users to use blob DB.
 
 constexpr uint64_t kNoExpiration = std::numeric_limits<uint64_t>::max();
+// Name of the directory under the base DB where blobs will be stored.
+constexpr const char* kBlobDirName = "blob_dir";
 
 struct BlobDBOptions {
-  // Name of the directory under the base DB where blobs will be stored. Using
-  // a directory where the base DB stores its SST files is not supported.
-  // Default is "blob_dir"
-  std::string blob_dir = "blob_dir";
-
   // Maximum size of the database (including SST files and blob files).
   //
   // Default: 0 (no limits)
diff --git a/utilities/blob_db/blob_db_impl.cc b/utilities/blob_db/blob_db_impl.cc
index 1a256e1bd02b..4c8e3302c140 100644
--- a/utilities/blob_db/blob_db_impl.cc
+++ b/utilities/blob_db/blob_db_impl.cc
@@ -88,7 +88,7 @@ BlobDBImpl::BlobDBImpl(const std::string& dbname,
       live_sst_size_(0),
       debug_level_(0) {
   clock_ = env_->GetSystemClock().get();
-  blob_dir_ = dbname + "/" + bdb_options_.blob_dir;
+  blob_dir_ = dbname + "/" + kBlobDirName;
   file_options_.bytes_per_sync = blob_db_options.bytes_per_sync;
 }
 
@@ -1970,12 +1970,12 @@ Iterator* BlobDBImpl::NewIterator(const ReadOptions& _read_options) {
 }
 
 Status DestroyBlobDB(const std::string& dbname, const Options& options,
-                     const BlobDBOptions& bdb_options) {
+                     const BlobDBOptions& /*bdb_options*/) {
   const ImmutableDBOptions soptions(SanitizeOptions(dbname, options));
   Env* env = soptions.env;
 
   Status status;
-  std::string blobdir = dbname + "/" + bdb_options.blob_dir;
+  std::string blobdir = dbname + "/" + kBlobDirName;
 
   std::vector<std::string> filenames;
   if (env->GetChildren(blobdir, &filenames).ok()) {
diff --git a/utilities/blob_db/blob_db_impl_filesnapshot.cc b/utilities/blob_db/blob_db_impl_filesnapshot.cc
index 85f29d96aa29..e46f3c8fbf38 100644
--- a/utilities/blob_db/blob_db_impl_filesnapshot.cc
+++ b/utilities/blob_db/blob_db_impl_filesnapshot.cc
@@ -70,7 +70,7 @@ Status BlobDBImpl::GetLiveFiles(std::vector<std::string>& ret,
     auto blob_file = bfile_pair.second;
     // Path should be relative to db_name, but begin with slash.
     ret.emplace_back(
-        BlobFileName("", bdb_options_.blob_dir, blob_file->BlobFileNumber()));
+        BlobFileName("", kBlobDirName, blob_file->BlobFileNumber()));
   }
   return Status::OK();
 }
@@ -85,7 +85,7 @@ void BlobDBImpl::GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata) {
     filemetadata.size = blob_file->GetFileSize();
     const uint64_t file_number = blob_file->BlobFileNumber();
     // Path should be relative to db_name, but begin with slash.
-    filemetadata.name = BlobFileName("", bdb_options_.blob_dir, file_number);
+    filemetadata.name = BlobFileName("", kBlobDirName, file_number);
     filemetadata.file_number = file_number;
     if (blob_file->HasTTL()) {
       filemetadata.oldest_ancester_time = blob_file->GetExpirationRange().first;
diff --git a/utilities/blob_db/blob_db_test.cc b/utilities/blob_db/blob_db_test.cc
index 51e3557716f5..d5bcb3bf8baa 100644
--- a/utilities/blob_db/blob_db_test.cc
+++ b/utilities/blob_db/blob_db_test.cc
@@ -754,7 +754,6 @@ TEST_F(BlobDBTest, GetLiveFilesMetaData) {
   Random rnd(301);
 
   BlobDBOptions bdb_options;
-  bdb_options.blob_dir = "blob_dir";
   bdb_options.ttl_range_secs = 10;
   bdb_options.disable_background_tasks = true;
 

From 374c8dd63552ea99136e6bebefa3b6226b588175 Mon Sep 17 00:00:00 2001
From: Maciej Szeszko <mszeszko@meta.com>
Date: Wed, 28 Jan 2026 23:13:03 -0800
Subject: [PATCH 447/500] Remove bytes_per_sync config option (#14276)

Summary:
Pull Request resolved: https://github.com/facebook/rocksdb/pull/14276

No production user of Legacy overrides the default value of `bytes_per_sync`. Replace the option with a constant `kBytesPerSync` to further reduce legacy blob db customizability / configuration surface.

Reviewed By: xingbowang

Differential Revision: D91089096

fbshipit-source-id: 162df65646a4f3a3fab3586cf6ff223e1917d86e
---
 db_stress_tool/db_stress_common.h     | 1 -
 db_stress_tool/db_stress_gflags.cc    | 5 -----
 db_stress_tool/db_stress_test_base.cc | 1 -
 tools/db_bench_tool.cc                | 5 -----
 utilities/blob_db/blob_db.cc          | 3 ---
 utilities/blob_db/blob_db.h           | 9 ++++-----
 utilities/blob_db/blob_db_impl.cc     | 2 +-
 utilities/blob_db/blob_db_test.cc     | 2 --
 8 files changed, 5 insertions(+), 23 deletions(-)

diff --git a/db_stress_tool/db_stress_common.h b/db_stress_tool/db_stress_common.h
index e857c64d63a9..2768a1eff1df 100644
--- a/db_stress_tool/db_stress_common.h
+++ b/db_stress_tool/db_stress_common.h
@@ -297,7 +297,6 @@ DECLARE_uint32(occ_lock_bucket_count);
 
 // Options for StackableDB-based BlobDB
 DECLARE_bool(use_blob_db);
-DECLARE_uint64(blob_db_bytes_per_sync);
 DECLARE_uint64(blob_db_file_size);
 DECLARE_bool(blob_db_enable_gc);
 
diff --git a/db_stress_tool/db_stress_gflags.cc b/db_stress_tool/db_stress_gflags.cc
index 49e51bf77136..dead587f5945 100644
--- a/db_stress_tool/db_stress_gflags.cc
+++ b/db_stress_tool/db_stress_gflags.cc
@@ -425,11 +425,6 @@ DEFINE_bool(enable_write_thread_adaptive_yield,
 // Options for StackableDB-based BlobDB
 DEFINE_bool(use_blob_db, false, "[Stacked BlobDB] Use BlobDB.");
 
-DEFINE_uint64(
-    blob_db_bytes_per_sync,
-    ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().bytes_per_sync,
-    "[Stacked BlobDB] Sync blob files once per every N bytes written.");
-
 DEFINE_uint64(blob_db_file_size,
               ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().blob_file_size,
               "[Stacked BlobDB] Target size of each blob file.");
diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc
index 593cdceb9932..e826b2552a78 100644
--- a/db_stress_tool/db_stress_test_base.cc
+++ b/db_stress_tool/db_stress_test_base.cc
@@ -3844,7 +3844,6 @@ void StressTest::Open(SharedState* shared, bool reopen) {
         // StackableDB-based BlobDB
         if (FLAGS_use_blob_db) {
           blob_db::BlobDBOptions blob_db_options;
-          blob_db_options.bytes_per_sync = FLAGS_blob_db_bytes_per_sync;
           blob_db_options.blob_file_size = FLAGS_blob_db_file_size;
           blob_db_options.enable_garbage_collection = FLAGS_blob_db_enable_gc;
 
diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc
index 3c6fefa98561..ac7a8066b54b 100644
--- a/tools/db_bench_tool.cc
+++ b/tools/db_bench_tool.cc
@@ -1064,10 +1064,6 @@ DEFINE_uint64(
     ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().ttl_range_secs,
     "[Stacked BlobDB] TTL bucket size to use when creating blob files.");
 
-DEFINE_uint64(blob_db_bytes_per_sync,
-              ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().bytes_per_sync,
-              "[Stacked BlobDB] Bytes to sync blob file at.");
-
 DEFINE_uint64(blob_db_file_size,
               ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().blob_file_size,
               "[Stacked BlobDB] Target size of each blob file.");
@@ -5178,7 +5174,6 @@ class Benchmark {
       blob_db_options.enable_garbage_collection = FLAGS_blob_db_enable_gc;
       blob_db_options.max_db_size = FLAGS_blob_db_max_db_size;
       blob_db_options.ttl_range_secs = FLAGS_blob_db_ttl_range_secs;
-      blob_db_options.bytes_per_sync = FLAGS_blob_db_bytes_per_sync;
       blob_db_options.blob_file_size = FLAGS_blob_db_file_size;
       blob_db::BlobDB* ptr = nullptr;
       s = hooks.Open(options, blob_db_options, db_name, &ptr);
diff --git a/utilities/blob_db/blob_db.cc b/utilities/blob_db/blob_db.cc
index 9b0073483d7f..16e75417d510 100644
--- a/utilities/blob_db/blob_db.cc
+++ b/utilities/blob_db/blob_db.cc
@@ -74,9 +74,6 @@ void BlobDBOptions::Dump(Logger* log) const {
   ROCKS_LOG_HEADER(
       log, "                            BlobDBOptions.ttl_range_secs: %" PRIu64,
       ttl_range_secs);
-  ROCKS_LOG_HEADER(
-      log, "                            BlobDBOptions.bytes_per_sync: %" PRIu64,
-      bytes_per_sync);
   ROCKS_LOG_HEADER(
       log, "                            BlobDBOptions.blob_file_size: %" PRIu64,
       blob_file_size);
diff --git a/utilities/blob_db/blob_db.h b/utilities/blob_db/blob_db.h
index 49b4f97a2f4f..e799b3ee031f 100644
--- a/utilities/blob_db/blob_db.h
+++ b/utilities/blob_db/blob_db.h
@@ -28,6 +28,10 @@ constexpr uint64_t kNoExpiration = std::numeric_limits<uint64_t>::max();
 // Name of the directory under the base DB where blobs will be stored.
 constexpr const char* kBlobDirName = "blob_dir";
 
+// Allows OS to incrementally sync blob files to disk for every
+// kBytesPerSync bytes written.
+constexpr uint64_t kBytesPerSync = 512 * 1024;
+
 struct BlobDBOptions {
   // Maximum size of the database (including SST files and blob files).
   //
@@ -42,11 +46,6 @@ struct BlobDBOptions {
   // and so on
   uint64_t ttl_range_secs = 3600;
 
-  // Allows OS to incrementally sync blob files to disk for every
-  // bytes_per_sync bytes written. Users shouldn't rely on it for
-  // persistency guarantee.
-  uint64_t bytes_per_sync = 512 * 1024;
-
   // the target size of each blob file. File will become immutable
   // after it exceeds that size
   uint64_t blob_file_size = 256 * 1024 * 1024;
diff --git a/utilities/blob_db/blob_db_impl.cc b/utilities/blob_db/blob_db_impl.cc
index 4c8e3302c140..b1cbbbde0c96 100644
--- a/utilities/blob_db/blob_db_impl.cc
+++ b/utilities/blob_db/blob_db_impl.cc
@@ -89,7 +89,7 @@ BlobDBImpl::BlobDBImpl(const std::string& dbname,
       debug_level_(0) {
   clock_ = env_->GetSystemClock().get();
   blob_dir_ = dbname + "/" + kBlobDirName;
-  file_options_.bytes_per_sync = blob_db_options.bytes_per_sync;
+  file_options_.bytes_per_sync = kBytesPerSync;
 }
 
 BlobDBImpl::~BlobDBImpl() {
diff --git a/utilities/blob_db/blob_db_test.cc b/utilities/blob_db/blob_db_test.cc
index d5bcb3bf8baa..44e7e59e268c 100644
--- a/utilities/blob_db/blob_db_test.cc
+++ b/utilities/blob_db/blob_db_test.cc
@@ -1786,7 +1786,6 @@ TEST_F(BlobDBTest, SyncBlobFileBeforeClose) {
   options.statistics = CreateDBStatistics();
 
   BlobDBOptions blob_options;
-  blob_options.bytes_per_sync = 1 << 20;
   blob_options.disable_background_tasks = true;
 
   Open(blob_options, options);
@@ -1805,7 +1804,6 @@ TEST_F(BlobDBTest, SyncBlobFileBeforeCloseIOError) {
   options.env = fault_injection_env_.get();
 
   BlobDBOptions blob_options;
-  blob_options.bytes_per_sync = 1 << 20;
   blob_options.disable_background_tasks = true;
 
   Open(blob_options, options);

From 83d24db3d5061518e61226518d79db98d2e977f7 Mon Sep 17 00:00:00 2001
From: Maciej Szeszko <mszeszko@meta.com>
Date: Thu, 29 Jan 2026 00:04:09 -0800
Subject: [PATCH 448/500] Remove unused public APIs (#14277)

Summary:
Pull Request resolved: https://github.com/facebook/rocksdb/pull/14277

Remove `GetBlobDBOptions()` and `SyncBlobFiles()` from the public `BlobDB` interface. These methods were only used internally or in tests and are not needed by any production code. `GetBlobDBOptions()` is now replaced by storing bdb_options_ as a member in the test class. `SyncBlobFiles()` is moved to private in `BlobDBImpl` since it's only called internally. Also remove unused `kDeleteCheckPeriodMillisecs` constant.

Reviewed By: xingbowang

Differential Revision: D91089111

fbshipit-source-id: 9c92b6d9563cf241c69d8880b418e8bcb7acb6c5
---
 utilities/blob_db/blob_db.h       | 4 ----
 utilities/blob_db/blob_db_impl.cc | 2 --
 utilities/blob_db/blob_db_impl.h  | 9 ++-------
 utilities/blob_db/blob_db_test.cc | 5 +++--
 4 files changed, 5 insertions(+), 15 deletions(-)

diff --git a/utilities/blob_db/blob_db.h b/utilities/blob_db/blob_db.h
index e799b3ee031f..fc8b6e3099ab 100644
--- a/utilities/blob_db/blob_db.h
+++ b/utilities/blob_db/blob_db.h
@@ -174,10 +174,6 @@ class BlobDB : public StackableDB {
                      std::vector<ColumnFamilyHandle*>* handles,
                      BlobDB** blob_db);
 
-  virtual BlobDBOptions GetBlobDBOptions() const = 0;
-
-  virtual Status SyncBlobFiles(const WriteOptions& write_options) = 0;
-
   ~BlobDB() override {}
 
  protected:
diff --git a/utilities/blob_db/blob_db_impl.cc b/utilities/blob_db/blob_db_impl.cc
index b1cbbbde0c96..4437499b1fc1 100644
--- a/utilities/blob_db/blob_db_impl.cc
+++ b/utilities/blob_db/blob_db_impl.cc
@@ -131,8 +131,6 @@ Status BlobDBImpl::CloseImpl() {
   return s;
 }
 
-BlobDBOptions BlobDBImpl::GetBlobDBOptions() const { return bdb_options_; }
-
 Status BlobDBImpl::Open(std::vector<ColumnFamilyHandle*>* handles) {
   assert(handles != nullptr);
   assert(db_ == nullptr);
diff --git a/utilities/blob_db/blob_db_impl.h b/utilities/blob_db/blob_db_impl.h
index 3144268886bd..eb25fee7c18f 100644
--- a/utilities/blob_db/blob_db_impl.h
+++ b/utilities/blob_db/blob_db_impl.h
@@ -75,9 +75,6 @@ class BlobDBImpl : public BlobDB {
   friend class BlobIndexCompactionFilterGC;
 
  public:
-  // deletions check period
-  static constexpr uint32_t kDeleteCheckPeriodMillisecs = 2 * 1000;
-
   // sanity check task
   static constexpr uint32_t kSanityCheckPeriodMillisecs = 20 * 60 * 1000;
 
@@ -140,8 +137,6 @@ class BlobDBImpl : public BlobDB {
       std::vector<std::string>* const output_file_names = nullptr,
       CompactionJobInfo* compaction_job_info = nullptr) override;
 
-  BlobDBOptions GetBlobDBOptions() const override;
-
   BlobDBImpl(const std::string& dbname, const BlobDBOptions& bdb_options,
              const DBOptions& db_options,
              const ColumnFamilyOptions& cf_options);
@@ -161,8 +156,6 @@ class BlobDBImpl : public BlobDB {
 
   Status Open(std::vector<ColumnFamilyHandle*>* handles);
 
-  Status SyncBlobFiles(const WriteOptions& write_options) override;
-
   // Common part of the two GetCompactionContext methods below.
   // REQUIRES: read lock on mutex_
   void GetCompactionContextCommon(BlobCompactionContext* context);
@@ -214,6 +207,8 @@ class BlobDBImpl : public BlobDB {
   // Return true if a snapshot is created.
   bool SetSnapshotIfNeeded(ReadOptions* read_options);
 
+  Status SyncBlobFiles(const WriteOptions& write_options);
+
   Status GetImpl(const ReadOptions& read_options,
                  ColumnFamilyHandle* column_family, const Slice& key,
                  PinnableSlice* value, uint64_t* expiration = nullptr);
diff --git a/utilities/blob_db/blob_db_test.cc b/utilities/blob_db/blob_db_test.cc
index 44e7e59e268c..7d225047eff0 100644
--- a/utilities/blob_db/blob_db_test.cc
+++ b/utilities/blob_db/blob_db_test.cc
@@ -84,6 +84,7 @@ class BlobDBTest : public testing::Test {
       options.stats_dump_period_sec = 0;
       options.stats_persist_period_sec = 0;
     }
+    bdb_options_ = bdb_options;
     return BlobDB::Open(options, bdb_options, dbname_, &blob_db_);
   }
 
@@ -109,10 +110,9 @@ class BlobDBTest : public testing::Test {
   void Destroy() {
     if (blob_db_) {
       Options options = blob_db_->GetOptions();
-      BlobDBOptions bdb_options = blob_db_->GetBlobDBOptions();
       delete blob_db_;
       blob_db_ = nullptr;
-      ASSERT_OK(DestroyBlobDB(dbname_, options, bdb_options));
+      ASSERT_OK(DestroyBlobDB(dbname_, options, bdb_options_));
     }
   }
 
@@ -295,6 +295,7 @@ class BlobDBTest : public testing::Test {
   std::unique_ptr<Env> mock_env_;
   std::unique_ptr<FaultInjectionTestEnv> fault_injection_env_;
   BlobDB* blob_db_;
+  BlobDBOptions bdb_options_;
 };  // class BlobDBTest
 
 TEST_F(BlobDBTest, Put) {

From e94df3db52b13dbdda8e2e4c7e63fab466db03be Mon Sep 17 00:00:00 2001
From: Maciej Szeszko <mszeszko@meta.com>
Date: Thu, 29 Jan 2026 00:53:52 -0800
Subject: [PATCH 449/500] Remove dead code and unused includes (#14278)

Summary:
Pull Request resolved: https://github.com/facebook/rocksdb/pull/14278

Remove dead code from `BlobDBImpl`:
- `debug_level_ `member and associated unreachable debug logging
- `CopyBlobFiles()` - private method that was never called
- `FileDeleteOk_SnapshotCheckLocked()` - declared but never implemented
- `RemoveTimerQ()` - declared but never implemented

Remove unused includes:
- rocksdb/wal_filter.h from blob_db_impl.h
- rocksdb/utilities/transaction.h from blob_db_impl.cc
- table/meta_blocks.h from blob_db_impl.cc
- util/random.h from blob_db_impl.cc

Remove from BlobFile:
- `GetColumnFamilyId()` - declared/implemented but never called

Reviewed By: xingbowang

Differential Revision: D91089144

fbshipit-source-id: d9bce24122b3bb790644fe4e51ce4403c77a1abf
---
 utilities/blob_db/blob_db_impl.cc | 37 +------------------------------
 utilities/blob_db/blob_db_impl.h  |  8 -------
 utilities/blob_db/blob_file.cc    |  2 --
 utilities/blob_db/blob_file.h     |  2 --
 4 files changed, 1 insertion(+), 48 deletions(-)

diff --git a/utilities/blob_db/blob_db_impl.cc b/utilities/blob_db/blob_db_impl.cc
index 4437499b1fc1..22216fc82aae 100644
--- a/utilities/blob_db/blob_db_impl.cc
+++ b/utilities/blob_db/blob_db_impl.cc
@@ -28,13 +28,10 @@
 #include "rocksdb/env.h"
 #include "rocksdb/iterator.h"
 #include "rocksdb/utilities/stackable_db.h"
-#include "rocksdb/utilities/transaction.h"
-#include "table/meta_blocks.h"
 #include "test_util/sync_point.h"
 #include "util/cast_util.h"
 #include "util/crc32c.h"
 #include "util/mutexlock.h"
-#include "util/random.h"
 #include "util/stop_watch.h"
 #include "util/timer_queue.h"
 #include "utilities/blob_db/blob_compaction_filter.h"
@@ -85,8 +82,7 @@ BlobDBImpl::BlobDBImpl(const std::string& dbname,
       closed_(true),
       open_file_count_(0),
       total_blob_size_(0),
-      live_sst_size_(0),
-      debug_level_(0) {
+      live_sst_size_(0) {
   clock_ = env_->GetSystemClock().get();
   blob_dir_ = dbname + "/" + kBlobDirName;
   file_options_.bytes_per_sync = kBytesPerSync;
@@ -740,11 +736,6 @@ Status BlobDBImpl::CreateWriterLocked(const std::shared_ptr<BlobFile>& bfile) {
       statistics_, Histograms::BLOB_DB_BLOB_FILE_WRITE_MICROS));
 
   uint64_t boffset = bfile->GetFileSize();
-  if (debug_level_ >= 2 && boffset) {
-    ROCKS_LOG_DEBUG(db_options_.info_log,
-                    "Open blob file: %s with offset: %" PRIu64, fpath.c_str(),
-                    boffset);
-  }
 
   BlobLogWriter::ElemType et = BlobLogWriter::kEtNone;
   if (bfile->file_size_ == BlobLogHeader::kSize) {
@@ -1365,15 +1356,6 @@ Status BlobDBImpl::GetRawBlobFromFile(const Slice& key, uint64_t file_number,
   // valid offset.
   if (offset <
       (BlobLogHeader::kSize + BlobLogRecord::kHeaderSize + key.size())) {
-    if (debug_level_ >= 2) {
-      ROCKS_LOG_ERROR(db_options_.info_log,
-                      "Invalid blob index file_number: %" PRIu64
-                      " blob_offset: %" PRIu64 " blob_size: %" PRIu64
-                      " key: %s",
-                      file_number, offset, size,
-                      key.ToString(/* output_hex */ true).c_str());
-    }
-
     return Status::NotFound("Invalid blob offset");
   }
 
@@ -1463,15 +1445,6 @@ Status BlobDBImpl::GetRawBlobFromFile(const Slice& key, uint64_t file_number,
                                blob_record.size() - sizeof(uint32_t));
   crc = crc32c::Mask(crc);  // Adjust for storage
   if (crc != crc_exp) {
-    if (debug_level_ >= 2) {
-      ROCKS_LOG_ERROR(
-          db_options_.info_log,
-          "Blob crc mismatch file: %" PRIu64 " blob_offset: %" PRIu64
-          " blob_size: %" PRIu64 " key: %s status: '%s'",
-          file_number, offset, size,
-          key.ToString(/* output_hex */ true).c_str(), s.ToString().c_str());
-    }
-
     return Status::Corruption("Corruption. Blob CRC mismatch");
   }
 
@@ -1930,14 +1903,6 @@ std::pair<bool, int64_t> BlobDBImpl::DeleteObsoleteFiles(bool aborted) {
   return std::make_pair(!aborted, -1);
 }
 
-void BlobDBImpl::CopyBlobFiles(
-    std::vector<std::shared_ptr<BlobFile>>* bfiles_copy) {
-  ReadLock rl(&mutex_);
-  for (auto const& p : blob_files_) {
-    bfiles_copy->push_back(p.second);
-  }
-}
-
 Iterator* BlobDBImpl::NewIterator(const ReadOptions& _read_options) {
   if (_read_options.io_activity != Env::IOActivity::kUnknown &&
       _read_options.io_activity != Env::IOActivity::kDBIterator) {
diff --git a/utilities/blob_db/blob_db_impl.h b/utilities/blob_db/blob_db_impl.h
index eb25fee7c18f..227fc0726a8f 100644
--- a/utilities/blob_db/blob_db_impl.h
+++ b/utilities/blob_db/blob_db_impl.h
@@ -26,7 +26,6 @@
 #include "rocksdb/listener.h"
 #include "rocksdb/options.h"
 #include "rocksdb/statistics.h"
-#include "rocksdb/wal_filter.h"
 #include "util/mutexlock.h"
 #include "util/timer_queue.h"
 #include "utilities/blob_db/blob_db.h"
@@ -286,8 +285,6 @@ class BlobDBImpl : public BlobDB {
   // Evict expired blob files from the TTL queue.
   std::pair<bool, int64_t> EvictExpiredFiles(bool aborted);
 
-  std::pair<bool, int64_t> RemoveTimerQ(TimerQueue* tq, bool aborted);
-
   // Adds the background tasks to the timer queue
   void StartBackgroundTasks();
 
@@ -374,9 +371,6 @@ class BlobDBImpl : public BlobDB {
   // checks if there is no snapshot which is referencing the
   // blobs
   bool VisibleToActiveSnapshot(const std::shared_ptr<BlobFile>& file);
-  bool FileDeleteOk_SnapshotCheckLocked(const std::shared_ptr<BlobFile>& bfile);
-
-  void CopyBlobFiles(std::vector<std::shared_ptr<BlobFile>>* bfiles_copy);
 
   uint64_t EpochNow() { return clock_->NowMicros() / 1000000; }
 
@@ -469,8 +463,6 @@ class BlobDBImpl : public BlobDB {
   //
   // REQUIRES: access with delete_file_mutex_ held.
   int disable_file_deletions_ = 0;
-
-  uint32_t debug_level_;
 };
 
 }  // namespace blob_db
diff --git a/utilities/blob_db/blob_file.cc b/utilities/blob_db/blob_file.cc
index a076f166ba94..38b65c297bd6 100644
--- a/utilities/blob_db/blob_file.cc
+++ b/utilities/blob_db/blob_file.cc
@@ -48,8 +48,6 @@ BlobFile::~BlobFile() {
   }
 }
 
-uint32_t BlobFile::GetColumnFamilyId() const { return column_family_id_; }
-
 std::string BlobFile::PathName() const {
   return BlobFileName(path_to_dir_, file_number_);
 }
diff --git a/utilities/blob_db/blob_file.h b/utilities/blob_db/blob_file.h
index 61f4a094af6e..4110234d0a06 100644
--- a/utilities/blob_db/blob_file.h
+++ b/utilities/blob_db/blob_file.h
@@ -110,8 +110,6 @@ class BlobFile {
 
   ~BlobFile();
 
-  uint32_t GetColumnFamilyId() const;
-
   // Returns log file's absolute pathname.
   std::string PathName() const;
 

From 21a8b5f77fc25e1340fb0b9645af8929f1c77589 Mon Sep 17 00:00:00 2001
From: Adam Retter <adam.retter@googlemail.com>
Date: Thu, 29 Jan 2026 12:44:27 -0800
Subject: [PATCH 450/500] Fixes the Windows VS 2022 build (#14280)

Summary:
When building a Release on Windows RTTI is not available, so asserts that use dynamic_cast need to be disabled

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14280

Reviewed By: nmk70

Differential Revision: D91807791

Pulled By: mszeszko-meta

fbshipit-source-id: e29c19c757bcd076a1f09ed40b306bb50ba9e882
---
 CMakeLists.txt | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 03837b672ac4..c0194e58d61c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -219,6 +219,10 @@ if(MSVC)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zi /nologo /EHsc /GS /Gd /GR /GF /fp:precise /Zc:wchar_t /Zc:forScope /errorReport:queue")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /FC /d2Zi+ /W4 /wd4127 /wd4996 /wd4100 /wd4324")
   endif()
+  if(CMAKE_BUILD_TYPE STREQUAL "Release")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /DNDEBUG")
+    message(STATUS "Setting /DNDEBUG as CMAKE_BUILD_TYPE is set to ${CMAKE_BUILD_TYPE}")
+  endif()
 else()
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -W -Wextra -Wall -pthread")
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wsign-compare -Wshadow -Wno-unused-parameter -Wno-unused-variable -Woverloaded-virtual -Wnon-virtual-dtor -Wno-missing-field-initializers -Wno-strict-aliasing -Wno-invalid-offsetof")

From 656b734a5f871bc0d83a490cb1bad67448a7e8fe Mon Sep 17 00:00:00 2001
From: Xingbo Wang <xbw@meta.com>
Date: Fri, 30 Jan 2026 05:53:04 -0800
Subject: [PATCH 451/500] Support abort background compaction jobs. (#14227)

Summary:
This adds a new public API to allow applications to abort all running compactions and prevent new ones from starting. Unlike DisableManualCompaction() which only pauses manual compactions and waits for them to finish naturally, AbortAllCompactions() actively signals running compactions (both automatic and manual) to terminate early and waits for them to complete before returning.

The abort signal is checked periodically during compaction (every 100 keys), so ongoing compactions abort quickly. Any output files from aborted compactions are automatically cleaned up to prevent partial results from being installed.

This is useful for scenarios where applications need to quickly stop all compaction activity, such as during graceful shutdown or when performing maintenance operations.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14227

Test Plan:
- Unit tests in db_compaction_abort_test.cc cover various abort scenarios including: abort before/during compaction, abort with multiple subcompactions, nested abort/resume calls, abort with CompactFiles API, abort across multiple column families, and timing guarantees
- Updated compaction_job_test.cc to include the new parameter

Reviewed By: anand1976

Differential Revision: D91480994

Pulled By: xingbowang

fbshipit-source-id: 36837971d8a540cd34d3ec28a78bc94b582625b0
---
 BUCK                                          |   6 +
 CMakeLists.txt                                |   1 +
 Makefile                                      |   3 +
 db/c.cc                                       |   8 +
 db/c_test.c                                   |   2 +-
 db/compaction/compaction_job.cc               | 145 ++-
 db/compaction/compaction_job.h                |  21 +-
 db/compaction/compaction_job_test.cc          |   7 +-
 db/compaction/compaction_outputs.h            |  19 +
 db/compaction/compaction_service_job.cc       |  29 +-
 db/compaction/subcompaction_state.h           |   8 +
 db/db_compaction_abort_test.cc                | 993 ++++++++++++++++++
 db/db_impl/db_impl.h                          |  10 +
 db/db_impl/db_impl_compaction_flush.cc        | 117 ++-
 db/db_test.cc                                 |   5 +-
 db/internal_stats.cc                          |  13 +
 db/internal_stats.h                           |   2 +
 db_stress_tool/db_stress_common.h             |   1 +
 db_stress_tool/db_stress_gflags.cc            |   4 +
 db_stress_tool/db_stress_test_base.cc         |  26 +-
 db_stress_tool/db_stress_test_base.h          |   2 +
 include/rocksdb/db.h                          |  44 +
 include/rocksdb/listener.h                    |   3 +
 include/rocksdb/statistics.h                  |   2 +
 include/rocksdb/status.h                      |   8 +
 include/rocksdb/utilities/stackable_db.h      |   2 +
 java/rocksjni/rocksjni.cc                     |  22 +
 java/src/main/java/org/rocksdb/RocksDB.java   |  19 +
 monitoring/statistics.cc                      |   1 +
 monitoring/stats_history_test.cc              |   2 +-
 src.mk                                        |   1 +
 tools/db_crashtest.py                         |   1 +
 .../abort_compaction_apis.md                  |   1 +
 util/status.cc                                |   4 +-
 34 files changed, 1471 insertions(+), 61 deletions(-)
 create mode 100644 db/db_compaction_abort_test.cc
 create mode 100644 unreleased_history/public_api_changes/abort_compaction_apis.md

diff --git a/BUCK b/BUCK
index 8fa8f35d3d9e..7037c44e778f 100644
--- a/BUCK
+++ b/BUCK
@@ -4823,6 +4823,12 @@ cpp_unittest_wrapper(name="db_clip_test",
             extra_compiler_flags=[])
 
 
+cpp_unittest_wrapper(name="db_compaction_abort_test",
+            srcs=["db/db_compaction_abort_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
 cpp_unittest_wrapper(name="db_compaction_filter_test",
             srcs=["db/db_compaction_filter_test.cc"],
             deps=[":rocksdb_test_lib"],
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c0194e58d61c..b3fc440fe311 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1381,6 +1381,7 @@ if(WITH_TESTS)
         db/db_bloom_filter_test.cc
         db/db_compaction_filter_test.cc
         db/db_compaction_test.cc
+        db/db_compaction_abort_test.cc
         db/db_clip_test.cc
         db/db_dynamic_level_test.cc
         db/db_encryption_test.cc
diff --git a/Makefile b/Makefile
index 4f62ad5b576e..b2c3a8f6b741 100644
--- a/Makefile
+++ b/Makefile
@@ -1442,6 +1442,9 @@ db_compaction_filter_test: $(OBJ_DIR)/db/db_compaction_filter_test.o $(TEST_LIBR
 db_compaction_test: $(OBJ_DIR)/db/db_compaction_test.o $(TEST_LIBRARY) $(LIBRARY)
 	$(AM_LINK)
 
+db_compaction_abort_test: $(OBJ_DIR)/db/db_compaction_abort_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
 db_clip_test: $(OBJ_DIR)/db/db_clip_test.o $(TEST_LIBRARY) $(LIBRARY)
 	$(AM_LINK)
 
diff --git a/db/c.cc b/db/c.cc
index 7abab13a6fda..dae0d0ebb569 100644
--- a/db/c.cc
+++ b/db/c.cc
@@ -8570,6 +8570,14 @@ void rocksdb_enable_manual_compaction(rocksdb_t* db) {
   db->rep->EnableManualCompaction();
 }
 
+void rocksdb_abort_all_compactions(rocksdb_t* db) {
+  db->rep->AbortAllCompactions();
+}
+
+void rocksdb_resume_all_compactions(rocksdb_t* db) {
+  db->rep->ResumeAllCompactions();
+}
+
 rocksdb_statistics_histogram_data_t*
 rocksdb_statistics_histogram_data_create() {
   return new rocksdb_statistics_histogram_data_t{};
diff --git a/db/c_test.c b/db/c_test.c
index 7f05dd2ab4b2..6811fe4ae8cb 100644
--- a/db/c_test.c
+++ b/db/c_test.c
@@ -4447,7 +4447,7 @@ int main(int argc, char** argv) {
 
   StartPhase("statistics");
   {
-    const uint32_t BYTES_WRITTEN_TICKER = 60;
+    const uint32_t BYTES_WRITTEN_TICKER = 61;
     const uint32_t DB_WRITE_HIST = 1;
 
     rocksdb_statistics_histogram_data_t* hist =
diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc
index 3d51f8fd5410..d5ac5738527b 100644
--- a/db/compaction/compaction_job.cc
+++ b/db/compaction/compaction_job.cc
@@ -128,6 +128,10 @@ const char* GetCompactionProximalOutputRangeTypeString(
   }
 }
 
+// Static constant for compaction abort flag - always false, used for
+// compaction service jobs that don't support abort signaling
+const std::atomic<int> CompactionJob::kCompactionAbortedFalse{0};
+
 CompactionJob::CompactionJob(
     int job_id, Compaction* compaction, const ImmutableDBOptions& db_options,
     const MutableDBOptions& mutable_db_options, const FileOptions& file_options,
@@ -141,10 +145,10 @@ CompactionJob::CompactionJob(
     CompactionJobStats* compaction_job_stats, Env::Priority thread_pri,
     const std::shared_ptr<IOTracer>& io_tracer,
     const std::atomic<bool>& manual_compaction_canceled,
-    const std::string& db_id, const std::string& db_session_id,
-    std::string full_history_ts_low, std::string trim_ts,
-    BlobFileCompletionCallback* blob_callback, int* bg_compaction_scheduled,
-    int* bg_bottom_compaction_scheduled)
+    const std::atomic<int>& compaction_aborted, const std::string& db_id,
+    const std::string& db_session_id, std::string full_history_ts_low,
+    std::string trim_ts, BlobFileCompletionCallback* blob_callback,
+    int* bg_compaction_scheduled, int* bg_bottom_compaction_scheduled)
     : compact_(new CompactionState(compaction)),
       internal_stats_(compaction->compaction_reason(), 1),
       db_options_(db_options),
@@ -168,6 +172,7 @@ CompactionJob::CompactionJob(
       versions_(versions),
       shutting_down_(shutting_down),
       manual_compaction_canceled_(manual_compaction_canceled),
+      compaction_aborted_(compaction_aborted),
       db_directory_(db_directory),
       blob_output_directory_(blob_output_directory),
       db_mutex_(db_mutex),
@@ -708,6 +713,7 @@ void CompactionJob::InitializeCompactionRun() {
 }
 
 void CompactionJob::RunSubcompactions() {
+  TEST_SYNC_POINT("CompactionJob::RunSubcompactions:BeforeStart");
   const size_t num_threads = compact_->sub_compact_states.size();
   assert(num_threads > 0);
   compact_->compaction->GetOrInitInputTableProperties();
@@ -753,6 +759,71 @@ void CompactionJob::RemoveEmptyOutputs() {
   }
 }
 
+void CompactionJob::CleanupAbortedSubcompactions() {
+  ColumnFamilyData* cfd = compact_->compaction->column_family_data();
+
+  uint64_t total_sst_files_deleted = 0;
+  uint64_t total_blob_files_deleted = 0;
+
+  // Track the first file deletion error to report at the end
+  Status first_error;
+  int deletion_errors = 0;
+
+  // Mark all subcompactions as aborted and delete their output files
+  for (auto& sub_compact : compact_->sub_compact_states) {
+    // Mark this subcompaction as aborted
+    sub_compact.status =
+        Status::Incomplete(Status::SubCode::kCompactionAborted);
+
+    // Delete all files (SST and blob) tracked during compaction.
+    // GetOutputFilePaths() contains ALL file paths created, including
+    // in-progress files that may have been removed from outputs_ or
+    // blob_file_additions_.
+    for (const bool is_proximal_level : {false, true}) {
+      if (is_proximal_level &&
+          !compact_->compaction->SupportsPerKeyPlacement()) {
+        continue;
+      }
+      for (const std::string& file_path :
+           sub_compact.Outputs(is_proximal_level)->GetOutputFilePaths()) {
+        Status s = env_->DeleteFile(file_path);
+        if (s.ok()) {
+          // Count SST vs blob files by checking extension
+          if (file_path.find(".sst") != std::string::npos) {
+            total_sst_files_deleted++;
+          } else if (file_path.find(".blob") != std::string::npos) {
+            total_blob_files_deleted++;
+          }
+        } else if (!s.IsNotFound()) {
+          if (first_error.ok()) {
+            first_error = s;
+          }
+          deletion_errors++;
+        }
+      }
+    }
+    sub_compact.CleanupOutputs();
+  }
+
+  if (stats_) {
+    RecordTick(stats_, COMPACTION_ABORTED);
+  }
+
+  ROCKS_LOG_INFO(db_options_.info_log,
+                 "[%s] [JOB %d] Compaction aborted: deleted %" PRIu64
+                 " SST files and %" PRIu64 " blob files",
+                 cfd->GetName().c_str(), job_id_, total_sst_files_deleted,
+                 total_blob_files_deleted);
+
+  if (!first_error.ok()) {
+    ROCKS_LOG_ERROR(db_options_.info_log,
+                    "[%s] [JOB %d] Cleanup completed with %d file deletion "
+                    "errors. First error: %s",
+                    cfd->GetName().c_str(), job_id_, deletion_errors,
+                    first_error.ToString().c_str());
+  }
+}
+
 bool CompactionJob::HasNewBlobFiles() const {
   for (const auto& state : compact_->sub_compact_states) {
     if (state.Current().HasBlobFileAdditions()) {
@@ -1004,6 +1075,15 @@ Status CompactionJob::Run() {
 
   Status status = CollectSubcompactionErrors();
 
+  // If compaction was aborted or manually paused, clean up any output files
+  // from completed subcompactions to prevent orphaned files on disk.
+  // Skip cleanup for resumable compaction (when progress writer is set)
+  // because the output files are needed for resumption.
+  if ((status.IsCompactionAborted() || status.IsManualCompactionPaused()) &&
+      compaction_progress_writer_ == nullptr) {
+    CleanupAbortedSubcompactions();
+  }
+
   if (status.ok()) {
     status = SyncOutputDirectories();
   }
@@ -1415,10 +1495,10 @@ InternalIterator* CompactionJob::CreateInputIterator(
   return input;
 }
 
-void CompactionJob::CreateBlobFileBuilder(SubcompactionState* sub_compact,
-                                          ColumnFamilyData* cfd,
-                                          BlobFileResources& blob_resources,
-                                          const WriteOptions& write_options) {
+void CompactionJob::CreateBlobFileBuilder(
+    SubcompactionState* sub_compact, ColumnFamilyData* cfd,
+    std::unique_ptr<BlobFileBuilder>& blob_file_builder,
+    const WriteOptions& write_options) {
   const auto& mutable_cf_options =
       sub_compact->compaction->mutable_cf_options();
 
@@ -1427,24 +1507,24 @@ void CompactionJob::CreateBlobFileBuilder(SubcompactionState* sub_compact,
   if (mutable_cf_options.enable_blob_files &&
       sub_compact->compaction->output_level() >=
           mutable_cf_options.blob_file_starting_level) {
-    blob_resources.blob_file_builder = std::make_unique<BlobFileBuilder>(
+    blob_file_builder = std::make_unique<BlobFileBuilder>(
         versions_, fs_.get(), &sub_compact->compaction->immutable_options(),
         &mutable_cf_options, &file_options_, &write_options, db_id_,
         db_session_id_, job_id_, cfd->GetID(), cfd->GetName(), write_hint_,
         io_tracer_, blob_callback_, BlobFileCreationReason::kCompaction,
-        &blob_resources.blob_file_paths,
+        sub_compact->Current().GetOutputFilePathsPtr(),
         sub_compact->Current().GetBlobFileAdditionsPtr());
   } else {
-    blob_resources.blob_file_builder = nullptr;
+    blob_file_builder = nullptr;
   }
 }
 
 std::unique_ptr<CompactionIterator> CompactionJob::CreateCompactionIterator(
     SubcompactionState* sub_compact, ColumnFamilyData* cfd,
     InternalIterator* input, const CompactionFilter* compaction_filter,
-    MergeHelper& merge, BlobFileResources& blob_resources,
+    MergeHelper& merge, std::unique_ptr<BlobFileBuilder>& blob_file_builder,
     const WriteOptions& write_options) {
-  CreateBlobFileBuilder(sub_compact, cfd, blob_resources, write_options);
+  CreateBlobFileBuilder(sub_compact, cfd, blob_file_builder, write_options);
 
   const std::string* const full_history_ts_low =
       full_history_ts_low_.empty() ? nullptr : &full_history_ts_low_;
@@ -1456,7 +1536,7 @@ std::unique_ptr<CompactionIterator> CompactionJob::CreateCompactionIterator(
       job_context_->earliest_write_conflict_snapshot,
       job_context_->GetJobSnapshotSequence(), job_context_->snapshot_checker,
       env_, ShouldReportDetailedTime(env_, stats_), sub_compact->RangeDelAgg(),
-      blob_resources.blob_file_builder.get(), db_options_.allow_data_in_errors,
+      blob_file_builder.get(), db_options_.allow_data_in_errors,
       db_options_.enforce_single_del_contracts, manual_compaction_canceled_,
       sub_compact->compaction
           ->DoesInputReferenceBlobFiles() /* must_count_input_entries */,
@@ -1495,10 +1575,17 @@ Status CompactionJob::ProcessKeyValue(
     SubcompactionState* sub_compact, ColumnFamilyData* cfd,
     CompactionIterator* c_iter, const CompactionFileOpenFunc& open_file_func,
     const CompactionFileCloseFunc& close_file_func, uint64_t& prev_cpu_micros) {
-  Status status;
-  const uint64_t kRecordStatsEvery = 1000;
+  // Cron interval for periodic operations: stats update, abort check,
+  // and sync points. Uses 1024 (power of 2) for efficient bitwise check.
+  const uint64_t kCronEveryMask = (1 << 10) - 1;
   [[maybe_unused]] const std::optional<const Slice> end = sub_compact->end;
 
+  // Check for abort signal before starting key processing
+  if (compaction_aborted_.load(std::memory_order_acquire) > 0) {
+    return Status::Incomplete(Status::SubCode::kCompactionAborted);
+  }
+
+  Status status;
   IterKey prev_iter_output_key;
   ParsedInternalKey prev_iter_output_internal_key;
 
@@ -1511,8 +1598,16 @@ Status CompactionJob::ProcessKeyValue(
     assert(!end.has_value() ||
            cfd->user_comparator()->Compare(c_iter->user_key(), *end) < 0);
 
-    if (c_iter->iter_stats().num_input_records % kRecordStatsEvery ==
-        kRecordStatsEvery - 1) {
+    const uint64_t num_records = c_iter->iter_stats().num_input_records;
+
+    // Periodic cron operations: stats update, abort check.
+    if ((num_records & kCronEveryMask) == kCronEveryMask) {
+      // Check for abort signal periodically
+      if (compaction_aborted_.load(std::memory_order_acquire) > 0) {
+        status = Status::Incomplete(Status::SubCode::kCompactionAborted);
+        break;
+      }
+
       UpdateSubcompactionJobStatsIncrementally(
           c_iter, &sub_compact->compaction_job_stats,
           db_options_.clock->CPUMicros(), prev_cpu_micros);
@@ -1719,6 +1814,7 @@ Status CompactionJob::FinalizeBlobFiles(SubcompactionState* sub_compact,
 }
 
 void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
+  TEST_SYNC_POINT("CompactionJob::ProcessKeyValueCompaction:Start");
   assert(sub_compact);
   assert(sub_compact->compaction);
 
@@ -1772,11 +1868,11 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
       false /* internal key corruption is expected */,
       job_context_->GetLatestSnapshotSequence(), job_context_->snapshot_checker,
       compact_->compaction->level(), db_options_.stats);
-  BlobFileResources blob_resources;
+  std::unique_ptr<BlobFileBuilder> blob_file_builder;
 
   auto c_iter =
       CreateCompactionIterator(sub_compact, cfd, input_iter, compaction_filter,
-                               merge, blob_resources, write_options);
+                               merge, blob_file_builder, write_options);
   assert(c_iter);
   c_iter->SeekToFirst();
 
@@ -1794,9 +1890,8 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
   status = FinalizeProcessKeyValueStatus(cfd, input_iter, c_iter.get(), status);
 
   FinalizeSubcompaction(sub_compact, status, open_file_func, close_file_func,
-                        blob_resources.blob_file_builder.get(), c_iter.get(),
-                        input_iter, start_cpu_micros, prev_cpu_micros,
-                        io_stats);
+                        blob_file_builder.get(), c_iter.get(), input_iter,
+                        start_cpu_micros, prev_cpu_micros, io_stats);
 
   NotifyOnSubcompactionCompleted(sub_compact);
 }
@@ -2295,6 +2390,10 @@ Status CompactionJob::OpenCompactionOutputFile(SubcompactionState* sub_compact,
   Status s;
   IOStatus io_s = NewWritableFile(fs_.get(), fname, &writable_file, fo_copy);
   s = io_s;
+  if (io_s.ok()) {
+    // Track the SST file path for cleanup on abort.
+    outputs.AddOutputFilePath(fname);
+  }
   if (sub_compact->io_status.ok()) {
     sub_compact->io_status = io_s;
     // Since this error is really a copy of the io_s that is checked below as s,
diff --git a/db/compaction/compaction_job.h b/db/compaction/compaction_job.h
index c9dac611cd6f..8b942c6fe64d 100644
--- a/db/compaction/compaction_job.h
+++ b/db/compaction/compaction_job.h
@@ -142,6 +142,9 @@ class SubcompactionState;
 
 class CompactionJob {
  public:
+  // Constant false aborted flag, used for compaction service jobs
+  static const std::atomic<int> kCompactionAbortedFalse;
+
   CompactionJob(int job_id, Compaction* compaction,
                 const ImmutableDBOptions& db_options,
                 const MutableDBOptions& mutable_db_options,
@@ -157,6 +160,7 @@ class CompactionJob {
                 Env::Priority thread_pri,
                 const std::shared_ptr<IOTracer>& io_tracer,
                 const std::atomic<bool>& manual_compaction_canceled,
+                const std::atomic<int>& compaction_aborted,
                 const std::string& db_id = "",
                 const std::string& db_session_id = "",
                 std::string full_history_ts_low = "", std::string trim_ts = "",
@@ -299,6 +303,7 @@ class CompactionJob {
   void RunSubcompactions();
   void UpdateTimingStats(uint64_t start_micros);
   void RemoveEmptyOutputs();
+  void CleanupAbortedSubcompactions();
   bool HasNewBlobFiles() const;
   Status CollectSubcompactionErrors();
   Status SyncOutputDirectories();
@@ -363,11 +368,6 @@ class CompactionJob {
     std::unique_ptr<InternalIterator> trim_history_iter;
   };
 
-  struct BlobFileResources {
-    std::vector<std::string> blob_file_paths;
-    std::unique_ptr<BlobFileBuilder> blob_file_builder;
-  };
-
   bool ShouldUseLocalCompaction(SubcompactionState* sub_compact);
   CompactionIOStatsSnapshot InitializeIOStats();
   Status SetupAndValidateCompactionFilter(
@@ -382,14 +382,14 @@ class CompactionJob {
       SubcompactionState* sub_compact, ColumnFamilyData* cfd,
       SubcompactionInternalIterators& iterators,
       SubcompactionKeyBoundaries& boundaries, ReadOptions& read_options);
-  void CreateBlobFileBuilder(SubcompactionState* sub_compact,
-                             ColumnFamilyData* cfd,
-                             BlobFileResources& blob_resources,
-                             const WriteOptions& write_options);
+  void CreateBlobFileBuilder(
+      SubcompactionState* sub_compact, ColumnFamilyData* cfd,
+      std::unique_ptr<BlobFileBuilder>& blob_file_builder,
+      const WriteOptions& write_options);
   std::unique_ptr<CompactionIterator> CreateCompactionIterator(
       SubcompactionState* sub_compact, ColumnFamilyData* cfd,
       InternalIterator* input_iter, const CompactionFilter* compaction_filter,
-      MergeHelper& merge, BlobFileResources& blob_resources,
+      MergeHelper& merge, std::unique_ptr<BlobFileBuilder>& blob_file_builder,
       const WriteOptions& write_options);
   std::pair<CompactionFileOpenFunc, CompactionFileCloseFunc> CreateFileHandlers(
       SubcompactionState* sub_compact, SubcompactionKeyBoundaries& boundaries);
@@ -461,6 +461,7 @@ class CompactionJob {
   VersionSet* versions_;
   const std::atomic<bool>* shutting_down_;
   const std::atomic<bool>& manual_compaction_canceled_;
+  const std::atomic<int>& compaction_aborted_;
   FSDirectory* db_directory_;
   FSDirectory* blob_output_directory_;
   InstrumentedMutex* db_mutex_;
diff --git a/db/compaction/compaction_job_test.cc b/db/compaction/compaction_job_test.cc
index 95d74be4d485..ce55dfe4f8ee 100644
--- a/db/compaction/compaction_job_test.cc
+++ b/db/compaction/compaction_job_test.cc
@@ -676,8 +676,8 @@ class CompactionJobTestBase : public testing::Test {
         &event_logger, false, false, dbname_, &compaction_job_stats_,
         Env::Priority::USER, nullptr /* IOTracer */,
         /*manual_compaction_canceled=*/kManualCompactionCanceledFalse,
-        env_->GenerateUniqueId(), DBImpl::GenerateDbSessionId(nullptr),
-        full_history_ts_low_);
+        CompactionJob::kCompactionAbortedFalse, env_->GenerateUniqueId(),
+        DBImpl::GenerateDbSessionId(nullptr), full_history_ts_low_);
     VerifyInitializationOfCompactionJobStats(compaction_job_stats_);
 
     compaction_job.Prepare(std::nullopt /*subcompact to be computed*/);
@@ -2545,7 +2545,8 @@ class ResumableCompactionJobTest : public CompactionJobTestBase {
         versions_.get(), &shutting_down_, &log_buffer, nullptr, nullptr,
         nullptr, stats.get(), &mutex_, &error_handler_, &job_context,
         table_cache_, &event_logger, false, false, dbname_, &job_stats,
-        Env::Priority::USER, nullptr, cancel_, env_->GenerateUniqueId(),
+        Env::Priority::USER, nullptr, cancel_,
+        CompactionJob::kCompactionAbortedFalse, env_->GenerateUniqueId(),
         DBImpl::GenerateDbSessionId(nullptr), "");
 
     compaction_job.Prepare(std::nullopt, compaction_progress,
diff --git a/db/compaction/compaction_outputs.h b/db/compaction/compaction_outputs.h
index 6f9de28efcfd..757e1b6b85ed 100644
--- a/db/compaction/compaction_outputs.h
+++ b/db/compaction/compaction_outputs.h
@@ -84,6 +84,19 @@ class CompactionOutputs {
 
   bool HasBlobFileAdditions() const { return !blob_file_additions_.empty(); }
 
+  // Get all file paths (SST and blob) created during compaction.
+  const std::vector<std::string>& GetOutputFilePaths() const {
+    return output_file_paths_;
+  }
+
+  std::vector<std::string>* GetOutputFilePathsPtr() {
+    return &output_file_paths_;
+  }
+
+  void AddOutputFilePath(const std::string& path) {
+    output_file_paths_.push_back(path);
+  }
+
   BlobGarbageMeter* CreateBlobGarbageMeter() {
     assert(!is_proximal_level_);
     blob_garbage_meter_ = std::make_unique<BlobGarbageMeter>();
@@ -321,6 +334,12 @@ class CompactionOutputs {
   std::vector<BlobFileAddition> blob_file_additions_;
   std::unique_ptr<BlobGarbageMeter> blob_garbage_meter_;
 
+  // All file paths (SST and blob) created during compaction.
+  // Used for cleanup on abort - ensures orphan files are deleted even if
+  // they were removed from outputs_ or blob_file_additions_ (e.g., by
+  // RemoveLastEmptyOutput when file_size is 0 because builder was abandoned).
+  std::vector<std::string> output_file_paths_;
+
   // Per level's output stat
   InternalStats::CompactionStats stats_;
 
diff --git a/db/compaction/compaction_service_job.cc b/db/compaction/compaction_service_job.cc
index d9eea538193f..cb88c53d8f8d 100644
--- a/db/compaction/compaction_service_job.cc
+++ b/db/compaction/compaction_service_job.cc
@@ -117,6 +117,14 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService(
   std::string debug_str_before_wait =
       compaction->input_version()->DebugString(/*hex=*/true);
 
+  // TODO: Update CompactionService API to support abort and resume
+  // functionality. Currently, remote compaction jobs cannot be aborted via
+  // AbortAllCompactions() because the CompactionService interface lacks methods
+  // to signal abort to remote workers and to properly resume after an abort.
+  // The API needs to be extended with:
+  // - A method to signal abort to running remote compaction jobs
+  // - A method to resume/re-enable scheduling after an abort is lifted
+
   ROCKS_LOG_INFO(db_options_.info_log,
                  "[%s] [JOB %d] Waiting for remote compaction...",
                  compaction->column_family_data()->GetName().c_str(), job_id_);
@@ -312,16 +320,17 @@ CompactionServiceCompactionJob::CompactionServiceCompactionJob(
     std::string output_path,
     const CompactionServiceInput& compaction_service_input,
     CompactionServiceResult* compaction_service_result)
-    : CompactionJob(job_id, compaction, db_options, mutable_db_options,
-                    file_options, versions, shutting_down, log_buffer, nullptr,
-                    output_directory, nullptr, stats, db_mutex,
-                    db_error_handler, job_context, std::move(table_cache),
-                    event_logger,
-                    compaction->mutable_cf_options().paranoid_file_checks,
-                    compaction->mutable_cf_options().report_bg_io_stats, dbname,
-                    &(compaction_service_result->stats), Env::Priority::USER,
-                    io_tracer, manual_compaction_canceled, db_id, db_session_id,
-                    compaction->column_family_data()->GetFullHistoryTsLow()),
+    : CompactionJob(
+          job_id, compaction, db_options, mutable_db_options, file_options,
+          versions, shutting_down, log_buffer, nullptr, output_directory,
+          nullptr, stats, db_mutex, db_error_handler, job_context,
+          std::move(table_cache), event_logger,
+          compaction->mutable_cf_options().paranoid_file_checks,
+          compaction->mutable_cf_options().report_bg_io_stats, dbname,
+          &(compaction_service_result->stats), Env::Priority::USER, io_tracer,
+          manual_compaction_canceled, CompactionJob::kCompactionAbortedFalse,
+          db_id, db_session_id,
+          compaction->column_family_data()->GetFullHistoryTsLow()),
       output_path_(std::move(output_path)),
       compaction_input_(compaction_service_input),
       compaction_result_(compaction_service_result) {}
diff --git a/db/compaction/subcompaction_state.h b/db/compaction/subcompaction_state.h
index 09af46540ca9..38785f9ae085 100644
--- a/db/compaction/subcompaction_state.h
+++ b/db/compaction/subcompaction_state.h
@@ -95,6 +95,14 @@ class SubcompactionState {
     proximal_level_outputs_.RemoveLastEmptyOutput();
   }
 
+  // Cleanup output builders for abandoning in-progress files.
+  void CleanupOutputs() {
+    compaction_outputs_.Cleanup();
+    if (compaction->SupportsPerKeyPlacement()) {
+      proximal_level_outputs_.Cleanup();
+    }
+  }
+
   void BuildSubcompactionJobInfo(
       SubcompactionJobInfo& subcompaction_job_info) const {
     const Compaction* c = compaction;
diff --git a/db/db_compaction_abort_test.cc b/db/db_compaction_abort_test.cc
new file mode 100644
index 000000000000..a76e1d689f1f
--- /dev/null
+++ b/db/db_compaction_abort_test.cc
@@ -0,0 +1,993 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include <atomic>
+#include <set>
+#include <thread>
+#include <unordered_map>
+
+#include "db/compaction/compaction_job.h"
+#include "db/db_impl/db_impl_secondary.h"
+#include "db/db_test_util.h"
+#include "options/options_helper.h"
+#include "port/stack_trace.h"
+#include "rocksdb/db.h"
+#include "rocksdb/sst_file_writer.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Helper class to manage abort synchronization in tests.
+//
+// Compaction abort could happen at various stage of compaction.
+// To test this, we need to trigger abort at different stage. This requires
+// precise control on the timing of abort API invocation. To achieve this in a
+// consistent way across various tests, we invoke AbortAllCompactions() within
+// the sync point callback, that is added at various stages of compaction.
+// However as the abort API is a blocking call, calling it within the sync point
+// callback on the compaction thread would cause deadlock. This test helper
+// class is designed to solve this challenge.
+//
+// 1. Abort must happen from a different thread:
+//    AbortAllCompactions() is typically called from the compaction thread
+//    via a sync point callback, so that we could precisely control the time of
+//    API invocation to simulate abort at different stage of compaction.
+//    However, we can't block the compaction thread waiting for the abort to
+//    complete - the compaction needs to continue executing to actually check
+//    the abort flag and exit. So we spawn a separate thread to call
+//    AbortAllCompactions().
+//
+// 2. We need to know when abort completes:
+//    After compaction returns (with aborted status), we often need to:
+//    - Verify state (e.g., no output files created)
+//    - Call ResumeAllCompactions()
+//    - Run compaction again to verify it succeeds
+//    We must wait for the abort thread to finish before proceeding, otherwise
+//    we might call Resume before Abort completes, causing race conditions.
+//
+// 3. Sync point callbacks may fire multiple times:
+//    With multiple subcompactions, a callback like
+//    "CompactionJob::ProcessKeyValueCompaction:Start" fires once per
+//    subcompaction. We only want to trigger abort once, so we use
+//    abort_triggered_ as a guard.
+//
+// 4. Tests may need multiple abort cycles:
+//    Some tests (e.g., MultipleAbortResumeSequence) do abort->resume->abort
+//    multiple times. The class supports this by auto-resetting when a
+//    previous abort has completed.
+class AbortSynchronizer {
+ public:
+  AbortSynchronizer() : abort_cv_(&abort_mutex_) {}
+
+  ~AbortSynchronizer() {
+    // Join the thread if it was started - ensures clean shutdown
+    if (abort_thread_.joinable()) {
+      abort_thread_.join();
+    }
+  }
+
+  // Non-copyable, non-movable due to thread member
+  AbortSynchronizer(const AbortSynchronizer&) = delete;
+  AbortSynchronizer& operator=(const AbortSynchronizer&) = delete;
+
+  // Trigger abort from a separate thread.
+  // - Safe to call multiple times; only first call in each cycle spawns thread
+  // - If a previous abort has completed, automatically resets state first
+  // - The spawned thread calls AbortAllCompactions() and signals completion
+  void TriggerAbort(DBImpl* db) {
+    // If previous abort completed, reset state to allow new abort
+    if (abort_triggered_.load() && abort_completed_.load()) {
+      Reset();
+    }
+
+    if (!abort_triggered_.exchange(true)) {
+      abort_thread_ = std::thread([this, db]() {
+        db->AbortAllCompactions();
+        SignalAbortCompleted();
+      });
+    }
+  }
+
+  // Wait for the abort thread to complete.
+  // Call this AFTER compaction returns to ensure the abort thread has finished
+  // before proceeding with Resume or other operations.
+  void WaitForAbortCompletion() {
+    MutexLock l(&abort_mutex_);
+    while (!abort_completed_.load()) {
+      abort_cv_.Wait();
+    }
+  }
+
+  // Reset state for reuse. Joins any previous thread first.
+  // Called automatically by TriggerAbort() if previous abort completed,
+  // but can also be called explicitly for clarity.
+  void Reset() {
+    if (abort_thread_.joinable()) {
+      abort_thread_.join();
+    }
+    abort_triggered_.store(false);
+    abort_completed_.store(false);
+  }
+
+  bool IsAbortTriggered() const { return abort_triggered_.load(); }
+
+ private:
+  void SignalAbortCompleted() {
+    MutexLock l(&abort_mutex_);
+    abort_completed_.store(true);
+    abort_cv_.SignalAll();
+  }
+
+  std::atomic<bool> abort_triggered_{false};  // Guards against multiple spawns
+  std::atomic<bool> abort_completed_{false};  // Signals thread completion
+  port::Mutex abort_mutex_;
+  port::CondVar abort_cv_;
+  std::thread abort_thread_;  // The thread that calls AbortAllCompactions()
+};
+
+// Helper to clean up SyncPoint state after tests
+inline void CleanupSyncPoints() {
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+// Helper class that combines AbortSynchronizer with sync point setup for
+// deterministic abort triggering. This adds sync point coordination on top
+// of AbortSynchronizer:
+//
+// This is useful when you need deterministic timing - the callback won't
+// return until AbortAllCompactions() has actually set the abort flag,
+// guaranteeing the compaction will see it on the next check.
+class SyncPointAbortHelper {
+ public:
+  explicit SyncPointAbortHelper(const std::string& trigger_point)
+      : trigger_point_(trigger_point) {}
+
+  // Set up sync points and callbacks. Call this before starting compaction.
+  void Setup(DBImpl* db_impl) {
+    db_impl_ = db_impl;
+
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+        {"DBImpl::AbortAllCompactions:FlagSet", kWaitPointName},
+    });
+
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+        trigger_point_, [this](void* /*arg*/) {
+          // Use AbortSynchronizer to handle the abort in a separate thread
+          abort_sync_.TriggerAbort(db_impl_);
+
+          // Wait for abort flag to be set via sync point dependency
+          // This ensures deterministic timing - compaction will see the flag
+          TEST_SYNC_POINT_CALLBACK(kWaitPointName, nullptr);
+        });
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  }
+
+  // Wait for the abort to complete. Call this after compaction returns.
+  void WaitForAbortCompletion() { abort_sync_.WaitForAbortCompletion(); }
+
+  // Clean up sync points and wait for abort completion in one call
+  void CleanupAndWait() {
+    CleanupSyncPoints();
+    WaitForAbortCompletion();
+  }
+
+ private:
+  static constexpr const char* kWaitPointName =
+      "SyncPointAbortHelper::WaitForAbort";
+  std::string trigger_point_;
+  DBImpl* db_impl_{nullptr};
+  AbortSynchronizer abort_sync_;
+};
+
+class DBCompactionAbortTest : public DBTestBase {
+ public:
+  DBCompactionAbortTest()
+      : DBTestBase("db_compaction_abort_test", /*env_do_fsync=*/false) {}
+
+ protected:
+  // Map to track the latest value of each key for verification
+  std::unordered_map<std::string, std::string> expected_values_;
+
+  // Statistics object for verifying compaction metrics
+  std::shared_ptr<Statistics> stats_;
+
+  // Get current options with statistics enabled
+  Options GetOptionsWithStats() {
+    Options options = CurrentOptions();
+    stats_ = CreateDBStatistics();
+    options.statistics = stats_;
+    return options;
+  }
+
+  // Populate database with test data.
+  // If overlapping=true, uses the same key range (0 to keys_per_file-1) in each
+  // file to ensure compaction has work to do.
+  // If overlapping=false, uses non-overlapping keys across files.
+  void PopulateData(int num_files, int keys_per_file, int value_size,
+                    bool overlapping = true, int seed = 301) {
+    Random rnd(seed);
+    for (int i = 0; i < num_files; ++i) {
+      for (int j = 0; j < keys_per_file; ++j) {
+        int key_index = overlapping ? j : (j + i * keys_per_file);
+        std::string key = Key(key_index);
+        std::string value = rnd.RandomString(value_size);
+        ASSERT_OK(Put(key, value));
+        expected_values_[key] = value;
+      }
+      ASSERT_OK(Flush());
+    }
+  }
+
+  // Verify data integrity by reading all keys and comparing with expected
+  // values
+  void VerifyDataIntegrity(int num_keys, int start_key = 0) {
+    std::string val;
+    for (int j = start_key; j < start_key + num_keys; ++j) {
+      std::string key = Key(j);
+      ASSERT_OK(dbfull()->Get(ReadOptions(), key, &val));
+      auto it = expected_values_.find(key);
+      if (it != expected_values_.end()) {
+        ASSERT_EQ(it->second, val) << "Value mismatch for key: " << key;
+      }
+    }
+  }
+
+  // Clear expected values (useful when reopening DB or between tests)
+  void ClearExpectedValues() { expected_values_.clear(); }
+
+  // Run the common abort test pattern with SyncPointAbortHelper:
+  // 1. Set up sync point abort helper
+  // 2. Run compaction and verify it's aborted
+  // 3. Verify COMPACTION_ABORTED stat increased (if stats enabled)
+  // 4. Clean up, resume, and verify compaction succeeds
+  // 5. Verify COMPACT_WRITE_BYTES increased (if stats enabled)
+  void RunSyncPointAbortTest(const std::string& trigger_point,
+                             CompactRangeOptions cro = CompactRangeOptions()) {
+    // Capture stats and file counts before abort
+    uint64_t aborted_before = 0;
+    uint64_t write_bytes_before = 0;
+    if (stats_) {
+      aborted_before = stats_->getTickerCount(COMPACTION_ABORTED);
+      write_bytes_before = stats_->getTickerCount(COMPACT_WRITE_BYTES);
+    }
+
+    SyncPointAbortHelper helper(trigger_point);
+    helper.Setup(dbfull());
+
+    Status s = dbfull()->CompactRange(cro, nullptr, nullptr);
+    ASSERT_TRUE(s.IsIncomplete());
+    ASSERT_TRUE(s.IsCompactionAborted());
+
+    // Verify abort was counted
+    if (stats_) {
+      uint64_t aborted_after = stats_->getTickerCount(COMPACTION_ABORTED);
+      ASSERT_GT(aborted_after, aborted_before)
+          << "COMPACTION_ABORTED stat should increase after abort";
+    }
+
+    helper.CleanupAndWait();
+    dbfull()->ResumeAllCompactions();
+
+    ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+
+    // Verify compaction completed and wrote bytes
+    if (stats_) {
+      uint64_t write_bytes_after = stats_->getTickerCount(COMPACT_WRITE_BYTES);
+      ASSERT_GT(write_bytes_after, write_bytes_before)
+          << "COMPACT_WRITE_BYTES should increase after successful compaction";
+    }
+  }
+};
+
+// Parameterized test for abort with different number of max subcompactions.
+// This consolidates tests that were essentially duplicates with different
+// max_subcompactions values
+class DBCompactionAbortSubcompactionTest
+    : public DBCompactionAbortTest,
+      public ::testing::WithParamInterface<int> {};
+
+TEST_P(DBCompactionAbortSubcompactionTest, AbortWithVaryingSubcompactions) {
+  int max_subcompactions = GetParam();
+
+  Options options = GetOptionsWithStats();
+  options.level0_file_num_compaction_trigger = 4;
+  options.max_subcompactions = max_subcompactions;
+  options.disable_auto_compactions = true;
+  Reopen(options);
+
+  PopulateData(/*num_files=*/4, /*keys_per_file=*/100, /*value_size=*/100);
+
+  RunSyncPointAbortTest("CompactionJob::RunSubcompactions:BeforeStart");
+
+  VerifyDataIntegrity(/*num_keys=*/100);
+}
+
+INSTANTIATE_TEST_CASE_P(SubcompactionVariants,
+                        DBCompactionAbortSubcompactionTest,
+                        ::testing::Values(1, 2, 4),
+                        [](const ::testing::TestParamInfo<int>& param_info) {
+                          return "MaxSubcompactionCount_" +
+                                 std::to_string(param_info.param);
+                        });
+
+// Parameterized test for abort with different compaction styles
+// This consolidates tests for Level, Universal, and FIFO compaction styles
+class DBCompactionAbortStyleTest
+    : public DBCompactionAbortTest,
+      public ::testing::WithParamInterface<CompactionStyle> {
+ protected:
+  // Configure options based on compaction style
+  void ConfigureOptionsForStyle(Options& options, CompactionStyle style) {
+    options.compaction_style = style;
+    options.level0_file_num_compaction_trigger = 4;
+    options.disable_auto_compactions = true;
+
+    switch (style) {
+      case kCompactionStyleLevel:
+        // Level compaction uses default settings
+        break;
+      case kCompactionStyleUniversal:
+        options.compaction_options_universal.size_ratio = 10;
+        break;
+      case kCompactionStyleFIFO:
+        // Set a large max_table_files_size to avoid deletion compaction
+        options.compaction_options_fifo.max_table_files_size =
+            100 * 1024 * 1024;
+        // Enable intra-L0 compaction which goes through normal compaction path
+        options.compaction_options_fifo.allow_compaction = true;
+        options.max_open_files = -1;  // Required for FIFO compaction
+        break;
+      default:
+        break;
+    }
+  }
+};
+
+TEST_P(DBCompactionAbortStyleTest, AbortCompaction) {
+  CompactionStyle style = GetParam();
+
+  Options options = GetOptionsWithStats();
+  options.max_subcompactions = 1;
+  ConfigureOptionsForStyle(options, style);
+  Reopen(options);
+
+  PopulateData(/*num_files=*/4, /*keys_per_file=*/100, /*value_size=*/100);
+
+  RunSyncPointAbortTest("CompactionJob::RunSubcompactions:BeforeStart");
+
+  VerifyDataIntegrity(/*num_keys=*/100);
+}
+
+INSTANTIATE_TEST_CASE_P(
+    CompactionStyleVariants, DBCompactionAbortStyleTest,
+    ::testing::Values(kCompactionStyleLevel, kCompactionStyleUniversal,
+                      kCompactionStyleFIFO),
+    [](const ::testing::TestParamInfo<CompactionStyle>& param_info) {
+      return OptionsHelper::compaction_style_to_string.at(param_info.param);
+    });
+
+TEST_F(DBCompactionAbortTest, AbortManualCompaction) {
+  Options options = GetOptionsWithStats();
+  options.level0_file_num_compaction_trigger = 10;
+  options.disable_auto_compactions = true;
+  Reopen(options);
+
+  PopulateData(/*num_files=*/5, /*keys_per_file=*/100, /*value_size=*/1000);
+
+  CompactRangeOptions cro;
+  cro.exclusive_manual_compaction = true;
+  RunSyncPointAbortTest("CompactionJob::ProcessKeyValueCompaction:Start", cro);
+
+  VerifyDataIntegrity(/*num_keys=*/100);
+}
+
+TEST_F(DBCompactionAbortTest, AbortAutomaticCompaction) {
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = 4;
+  options.max_subcompactions = 2;
+  options.disable_auto_compactions = false;
+  Reopen(options);
+
+  Random rnd(301);
+  AbortSynchronizer abort_sync;
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "CompactionJob::ProcessKeyValueCompaction:Start",
+      [&](void* /*arg*/) { abort_sync.TriggerAbort(dbfull()); });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  for (int i = 0; i < 4; ++i) {
+    for (int j = 0; j < 100; ++j) {
+      ASSERT_OK(Put(Key(j), rnd.RandomString(1000)));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  CleanupSyncPoints();
+
+  abort_sync.WaitForAbortCompletion();
+  dbfull()->ResumeAllCompactions();
+
+  for (int j = 0; j < 100; ++j) {
+    ASSERT_OK(Put(Key(j), rnd.RandomString(1000)));
+  }
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  std::string val;
+  for (int j = 0; j < 100; ++j) {
+    ASSERT_OK(dbfull()->Get(ReadOptions(), Key(j), &val));
+  }
+}
+
+TEST_F(DBCompactionAbortTest, AbortAndVerifyNoOutputFiles) {
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = 4;
+  options.max_subcompactions = 2;
+  options.disable_auto_compactions = true;
+  Reopen(options);
+
+  PopulateData(/*num_files=*/4, /*keys_per_file=*/100, /*value_size=*/1000);
+
+  int num_l0_files_before = NumTableFilesAtLevel(0);
+  int num_l1_files_before = NumTableFilesAtLevel(1);
+
+  SyncPointAbortHelper helper("CompactionJob::ProcessKeyValueCompaction:Start");
+  helper.Setup(dbfull());
+
+  CompactRangeOptions cro;
+  Status s = dbfull()->CompactRange(cro, nullptr, nullptr);
+  ASSERT_TRUE(s.IsIncomplete());
+  ASSERT_TRUE(s.IsCompactionAborted());
+
+  CleanupSyncPoints();
+
+  int num_l0_files_after = NumTableFilesAtLevel(0);
+  int num_l1_files_after = NumTableFilesAtLevel(1);
+
+  ASSERT_EQ(num_l0_files_before, num_l0_files_after);
+  ASSERT_EQ(num_l1_files_before, num_l1_files_after);
+
+  helper.WaitForAbortCompletion();
+  dbfull()->ResumeAllCompactions();
+
+  ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+
+  int num_l0_files_final = NumTableFilesAtLevel(0);
+  int num_l1_files_final = NumTableFilesAtLevel(1);
+
+  ASSERT_EQ(0, num_l0_files_final);
+  ASSERT_GT(num_l1_files_final, 0);
+
+  VerifyDataIntegrity(/*num_keys=*/100);
+}
+
+TEST_F(DBCompactionAbortTest, MultipleAbortResumeSequence) {
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = 4;
+  options.max_subcompactions = 2;
+  options.disable_auto_compactions = true;
+  Reopen(options);
+
+  PopulateData(/*num_files=*/4, /*keys_per_file=*/100, /*value_size=*/1000);
+
+  for (int round = 0; round < 3; ++round) {
+    // Use SyncPointAbortHelper for deterministic abort timing - it waits
+    // for the abort flag to be set via sync point dependency
+    SyncPointAbortHelper helper(
+        "CompactionJob::ProcessKeyValueCompaction:Start");
+    helper.Setup(dbfull());
+
+    CompactRangeOptions cro;
+    Status s = dbfull()->CompactRange(cro, nullptr, nullptr);
+    ASSERT_TRUE(s.IsIncomplete());
+    ASSERT_TRUE(s.IsCompactionAborted());
+
+    helper.CleanupAndWait();
+    dbfull()->ResumeAllCompactions();
+  }
+
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+  VerifyDataIntegrity(/*num_keys=*/100);
+}
+
+TEST_F(DBCompactionAbortTest, AbortWithOutputFilesCleanup) {
+  Options options = CurrentOptions();
+  options.num_levels = 2;  // Ensure compaction output goes to L1
+  options.level0_file_num_compaction_trigger = 4;
+  options.max_subcompactions = 2;
+  options.disable_auto_compactions = true;
+  options.target_file_size_base = 50 * 1024;
+  Reopen(options);
+
+  PopulateData(/*num_files=*/4, /*keys_per_file=*/100, /*value_size=*/100);
+
+  SyncPointAbortHelper helper("CompactionJob::RunSubcompactions:BeforeStart");
+  helper.Setup(dbfull());
+
+  CompactRangeOptions cro;
+  Status s = dbfull()->CompactRange(cro, nullptr, nullptr);
+  ASSERT_TRUE(s.IsIncomplete());
+  ASSERT_TRUE(s.IsCompactionAborted());
+
+  CleanupSyncPoints();
+
+  int num_l1_files_after_abort = NumTableFilesAtLevel(1);
+  ASSERT_EQ(0, num_l1_files_after_abort);
+
+  helper.WaitForAbortCompletion();
+  dbfull()->ResumeAllCompactions();
+
+  ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+
+  // Verify L0 files are compacted and L1 has output files
+  int num_l0_files_final = NumTableFilesAtLevel(0);
+  int num_l1_files_final = NumTableFilesAtLevel(1);
+  ASSERT_EQ(0, num_l0_files_final)
+      << "L0 should be empty after successful compaction";
+  ASSERT_GT(num_l1_files_final, 0)
+      << "L1 should have files after successful compaction";
+
+  VerifyDataIntegrity(/*num_keys=*/100);
+}
+
+TEST_F(DBCompactionAbortTest, NestedAbortResumeCalls) {
+  // Test that nested AbortAllCompactions() calls work correctly with the
+  // counter
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = 4;
+  options.max_subcompactions = 2;
+  options.disable_auto_compactions = true;
+  Reopen(options);
+
+  PopulateData(/*num_files=*/4, /*keys_per_file=*/100, /*value_size=*/1000);
+
+  // First abort call
+  dbfull()->AbortAllCompactions();
+
+  // Nested abort call (counter should be 2)
+  dbfull()->AbortAllCompactions();
+
+  // Compaction should still be blocked after one resume
+  dbfull()->ResumeAllCompactions();
+
+  // Compaction should still return aborted because counter is still 1
+  CompactRangeOptions cro;
+  Status s = dbfull()->CompactRange(cro, nullptr, nullptr);
+  ASSERT_TRUE(s.IsIncomplete());
+  ASSERT_TRUE(s.IsCompactionAborted());
+
+  // Second resume - counter should be 0 now
+  dbfull()->ResumeAllCompactions();
+
+  // Compaction should succeed now
+  ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+
+  VerifyDataIntegrity(/*num_keys=*/100);
+}
+
+TEST_F(DBCompactionAbortTest, AbortCompactFilesAPI) {
+  // Test that AbortAllCompactions works with CompactFiles API
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = 100;  // Disable auto compaction
+  options.disable_auto_compactions = true;
+  Reopen(options);
+
+  PopulateData(/*num_files=*/4, /*keys_per_file=*/100, /*value_size=*/1000);
+
+  // Get the L0 file names
+  std::vector<std::string> files_to_compact;
+  ColumnFamilyMetaData cf_meta;
+  dbfull()->GetColumnFamilyMetaData(dbfull()->DefaultColumnFamily(), &cf_meta);
+  for (const auto& file : cf_meta.levels[0].files) {
+    files_to_compact.push_back(file.name);
+  }
+  ASSERT_GE(files_to_compact.size(), 2);
+
+  SyncPointAbortHelper helper("CompactionJob::ProcessKeyValueCompaction:Start");
+  helper.Setup(dbfull());
+
+  CompactionOptions compact_options;
+  Status s = dbfull()->CompactFiles(compact_options, files_to_compact, 1);
+  ASSERT_TRUE(s.IsIncomplete());
+  ASSERT_TRUE(s.IsCompactionAborted());
+
+  helper.CleanupAndWait();
+  dbfull()->ResumeAllCompactions();
+
+  // CompactFiles should work after resume
+  ASSERT_OK(dbfull()->CompactFiles(compact_options, files_to_compact, 1));
+
+  VerifyDataIntegrity(/*num_keys=*/100);
+}
+
+TEST_F(DBCompactionAbortTest, AbortDoesNotAffectFlush) {
+  // Test that AbortAllCompactions does not affect flush operations
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = 100;
+  options.disable_auto_compactions = true;
+  Reopen(options);
+
+  Random rnd(301);
+  for (int j = 0; j < 100; ++j) {
+    ASSERT_OK(Put(Key(j), rnd.RandomString(1000)));
+  }
+
+  // Abort compactions
+  dbfull()->AbortAllCompactions();
+
+  // Flush should still work
+  ASSERT_OK(Flush());
+
+  // Write more data
+  for (int j = 100; j < 200; ++j) {
+    ASSERT_OK(Put(Key(j), rnd.RandomString(1000)));
+  }
+
+  // Flush should still work
+  ASSERT_OK(Flush());
+
+  // Resume compactions
+  dbfull()->ResumeAllCompactions();
+
+  VerifyDataIntegrity(/*num_keys=*/200);
+}
+
+TEST_F(DBCompactionAbortTest, AbortBeforeCompactionStarts) {
+  // Test aborting before any compaction has started
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = 4;
+  options.disable_auto_compactions = true;
+  Reopen(options);
+
+  PopulateData(/*num_files=*/4, /*keys_per_file=*/100, /*value_size=*/1000);
+
+  // Abort before starting compaction
+  dbfull()->AbortAllCompactions();
+
+  // Compaction should immediately return aborted
+  CompactRangeOptions cro;
+  Status s = dbfull()->CompactRange(cro, nullptr, nullptr);
+  ASSERT_TRUE(s.IsIncomplete());
+  ASSERT_TRUE(s.IsCompactionAborted());
+
+  // Resume
+  dbfull()->ResumeAllCompactions();
+
+  // Now compaction should work
+  ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+
+  // Verify L0 files are compacted
+  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+}
+
+// Test that in-progress blob and SST files are properly cleaned up when
+// compaction is aborted. This specifically tests the case where abort happens
+// while files are being written (opened but not yet completed/closed).
+// This catches the bug where files exist on disk but are removed from the
+// outputs_ vector (e.g., by RemoveLastEmptyOutput when file_size is 0 because
+// the builder was abandoned), leaving orphan files.
+TEST_F(DBCompactionAbortTest, AbortWithInProgressFileCleanup) {
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = 4;
+  options.max_subcompactions =
+      1;  // Single subcompaction for deterministic behavior
+  options.disable_auto_compactions = true;
+  options.target_file_size_base = 32 * 1024;  // 32KB
+
+  // Enable BlobDB with garbage collection to force blob rewriting during
+  // compaction
+  options.enable_blob_files = true;
+  options.min_blob_size = 0;  // All values go to blob files
+  options.blob_file_size =
+      1024 * 1024;  // 1MB - large enough to not close during test
+  // Enable blob garbage collection - this forces blob data to be rewritten
+  // during compaction, creating new blob files
+  options.enable_blob_garbage_collection = true;
+  options.blob_garbage_collection_age_cutoff = 1.0;  // Include all blob files
+  options.blob_garbage_collection_force_threshold = 0.0;  // Always force GC
+
+  Reopen(options);
+
+  // Write enough data to trigger the periodic abort check (every 1000 records).
+  // 4 files * 2000 keys = 2000 unique overlapping keys processed during
+  // compaction. The sync point triggers at 999, 1999, etc.
+  PopulateData(/*num_files=*/4, /*keys_per_file=*/2000, /*value_size=*/500);
+
+  // Helper function to get blob files on disk with their names
+  auto GetBlobFilesOnDisk = [this]() -> std::vector<std::string> {
+    std::vector<std::string> blob_files;
+    std::vector<std::string> files;
+    EXPECT_OK(env_->GetChildren(dbname_, &files));
+    for (const auto& f : files) {
+      if (f.find(".blob") != std::string::npos) {
+        blob_files.push_back(f);
+      }
+    }
+    std::sort(blob_files.begin(), blob_files.end());
+    return blob_files;
+  };
+
+  // Helper function to get blob file count in metadata
+  auto GetBlobFilesInMetadata = [this]() -> std::vector<uint64_t> {
+    std::vector<uint64_t> blob_file_numbers;
+    ColumnFamilyMetaData cf_meta;
+    dbfull()->GetColumnFamilyMetaData(db_->DefaultColumnFamily(), &cf_meta);
+    for (const auto& blob_meta : cf_meta.blob_files) {
+      blob_file_numbers.push_back(blob_meta.blob_file_number);
+    }
+    std::sort(blob_file_numbers.begin(), blob_file_numbers.end());
+    return blob_file_numbers;
+  };
+
+  // Helper function to get SST files on disk
+  auto GetSstFilesOnDisk = [this]() -> std::vector<std::string> {
+    std::vector<std::string> sst_files;
+    std::vector<std::string> files;
+    EXPECT_OK(env_->GetChildren(dbname_, &files));
+    for (const auto& f : files) {
+      if (f.find(".sst") != std::string::npos) {
+        sst_files.push_back(f);
+      }
+    }
+    std::sort(sst_files.begin(), sst_files.end());
+    return sst_files;
+  };
+
+  // Helper function to get SST file numbers in metadata
+  auto GetSstFilesInMetadata = [this]() -> std::vector<uint64_t> {
+    std::vector<uint64_t> sst_file_numbers;
+    ColumnFamilyMetaData cf_meta;
+    dbfull()->GetColumnFamilyMetaData(db_->DefaultColumnFamily(), &cf_meta);
+    for (const auto& level : cf_meta.levels) {
+      for (const auto& file : level.files) {
+        // Extract file number from the file name (e.g., "000010.sst" -> 10)
+        uint64_t file_num = 0;
+        std::string fname = file.name;
+        // Remove leading path separators if present
+        size_t pos = fname.rfind('/');
+        if (pos != std::string::npos) {
+          fname = fname.substr(pos + 1);
+        }
+        if (sscanf(fname.c_str(), "%" PRIu64, &file_num) == 1) {
+          sst_file_numbers.push_back(file_num);
+        }
+      }
+    }
+    std::sort(sst_file_numbers.begin(), sst_file_numbers.end());
+    return sst_file_numbers;
+  };
+
+  std::vector<std::string> initial_blob_files = GetBlobFilesOnDisk();
+  std::vector<uint64_t> initial_meta_blobs = GetBlobFilesInMetadata();
+  std::vector<std::string> initial_sst_files = GetSstFilesOnDisk();
+  std::vector<uint64_t> initial_meta_ssts = GetSstFilesInMetadata();
+
+  ASSERT_GT(initial_blob_files.size(), 0u) << "Expected initial blob files";
+  ASSERT_EQ(initial_blob_files.size(), initial_meta_blobs.size())
+      << "Initial blob files should match between disk and metadata";
+  ASSERT_GT(initial_sst_files.size(), 0u) << "Expected initial SST files";
+  ASSERT_EQ(initial_sst_files.size(), initial_meta_ssts.size())
+      << "Initial SST files should match between disk and metadata";
+
+  // Tracking variables for blob file lifecycle
+  std::atomic<int> blob_writes{0};
+  std::atomic<bool> abort_triggered{false};
+  AbortSynchronizer abort_sync;
+
+  // Set up dependency: the wait point will block until FlagSet is hit
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+      {"DBImpl::AbortAllCompactions:FlagSet",
+       "DBCompactionAbortTest::InProgressBlob:WaitForAbort"},
+  });
+
+  // Trigger abort after some blob writes during compaction output.
+  // This ensures we have an in-progress blob file when abort happens.
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "BlobFileBuilder::WriteBlobToFile:AddRecord", [&](void* /*arg*/) {
+        int count = blob_writes.fetch_add(1) + 1;
+
+        // Trigger abort after 100 blob writes - this ensures:
+        // 1. A blob file has been opened (for writing)
+        // 2. Some data has been written to it
+        // 3. But it's not yet completed (blob_file_size is 1MB)
+        if (count == 100 && !abort_triggered.exchange(true)) {
+          abort_sync.TriggerAbort(dbfull());
+          // Wait for abort flag to be set - this sync point blocks until
+          // FlagSet is processed
+          TEST_SYNC_POINT_CALLBACK(
+              "DBCompactionAbortTest::InProgressBlob:WaitForAbort", nullptr);
+        }
+      });
+
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Run compaction - it should be aborted while blob file is in-progress
+  CompactRangeOptions cro;
+  Status s = dbfull()->CompactRange(cro, nullptr, nullptr);
+
+  ASSERT_TRUE(s.IsIncomplete())
+      << "Expected compaction to be aborted, got: " << s.ToString();
+
+  CleanupSyncPoints();
+  abort_sync.WaitForAbortCompletion();
+
+  // Check state after abort
+  std::vector<std::string> post_abort_disk_blobs = GetBlobFilesOnDisk();
+  std::vector<uint64_t> post_abort_meta_blobs = GetBlobFilesInMetadata();
+  std::vector<std::string> post_abort_disk_ssts = GetSstFilesOnDisk();
+  std::vector<uint64_t> post_abort_meta_ssts = GetSstFilesInMetadata();
+
+  // This is the key assertion for blob files: files on disk should match
+  // metadata. If the in-progress blob file was NOT cleaned up, there will be an
+  // extra file on disk that's not in metadata (orphan).
+  ASSERT_EQ(post_abort_disk_blobs.size(), post_abort_meta_blobs.size())
+      << "Orphan blob file detected! In-progress blob file was not cleaned up "
+         "after abort. Files on disk: "
+      << post_abort_disk_blobs.size()
+      << ", Files in metadata: " << post_abort_meta_blobs.size()
+      << ". The difference indicates orphaned in-progress blob file(s).";
+
+  // This is the key assertion for SST files: files on disk should match
+  // metadata. If the in-progress SST file was NOT cleaned up, there will be an
+  // extra file on disk that's not in metadata (orphan).
+  ASSERT_EQ(post_abort_disk_ssts.size(), post_abort_meta_ssts.size())
+      << "Orphan SST file detected! In-progress SST file was not cleaned up "
+         "after abort. Files on disk: "
+      << post_abort_disk_ssts.size()
+      << ", Files in metadata: " << post_abort_meta_ssts.size()
+      << ". The difference indicates orphaned in-progress SST file(s).";
+
+  // Resume and complete compaction to verify DB is still functional
+  dbfull()->ResumeAllCompactions();
+
+  ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+
+  // Verify data integrity - we wrote 4 files * 2000 keys with overlapping keys
+  VerifyDataIntegrity(/*num_keys=*/2000);
+}
+
+TEST_F(DBCompactionAbortTest, AbortBottommostLevelCompaction) {
+  Options options = CurrentOptions();
+  options.num_levels = 4;
+  options.level0_file_num_compaction_trigger = 2;
+  options.max_bytes_for_level_base = 1024 * 10;  // 10KB
+  options.max_bytes_for_level_multiplier = 2;
+  options.disable_auto_compactions = true;
+  Reopen(options);
+
+  // Write data to fill multiple levels (non-overlapping keys)
+  PopulateData(/*num_files=*/6, /*keys_per_file=*/100,
+               /*value_size=*/500, /*overlapping=*/false);
+
+  // First compact to push data to lower levels
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+  // Write more data to L0 (overlapping keys)
+  PopulateData(/*num_files=*/4, /*keys_per_file=*/100, /*value_size=*/500);
+
+  SyncPointAbortHelper helper("CompactionJob::ProcessKeyValueCompaction:Start");
+  helper.Setup(dbfull());
+
+  // Trigger bottommost level compaction
+  CompactRangeOptions cro;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+  Status s = dbfull()->CompactRange(cro, nullptr, nullptr);
+  ASSERT_TRUE(s.IsIncomplete());
+  ASSERT_TRUE(s.IsCompactionAborted());
+
+  helper.CleanupAndWait();
+  dbfull()->ResumeAllCompactions();
+
+  ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+
+  VerifyDataIntegrity(/*num_keys=*/600);
+}
+
+// Test that while compactions are aborted, atomic range replace
+// (IngestExternalFiles with atomic_replace_range) works correctly.
+// This verifies that the abort state doesn't block other write operations
+// like atomic range replace.
+TEST_F(DBCompactionAbortTest, AbortThenAtomicRangeReplace) {
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = 4;
+  options.max_subcompactions = 2;
+  options.disable_auto_compactions = true;
+  Reopen(options);
+
+  // Create a directory for SST files
+  std::string sst_files_dir = dbname_ + "_sst_files/";
+  ASSERT_OK(env_->CreateDirIfMissing(sst_files_dir));
+
+  // Populate initial data with overlapping keys
+  PopulateData(/*num_files=*/4, /*keys_per_file=*/100, /*value_size=*/500);
+
+  // Verify initial data
+  VerifyDataIntegrity(/*num_keys=*/100);
+
+  // Trigger compaction and abort it
+  SyncPointAbortHelper helper("CompactionJob::ProcessKeyValueCompaction:Start");
+  helper.Setup(dbfull());
+
+  CompactRangeOptions cro;
+  Status s = dbfull()->CompactRange(cro, nullptr, nullptr);
+  ASSERT_TRUE(s.IsIncomplete());
+  ASSERT_TRUE(s.IsCompactionAborted());
+
+  helper.CleanupAndWait();
+
+  // While compaction is still aborted, perform atomic range replace using
+  // IngestExternalFiles with atomic_replace_range. This verifies that the
+  // abort state doesn't block other write operations.
+  // Using RangeOpt() (empty range) means replace everything in the CF.
+
+  // Create an SST file with new data for keys 0-49 (replacing keys 0-99)
+  std::string sst_file_path = sst_files_dir + "atomic_replace_1.sst";
+  SstFileWriter sst_file_writer(EnvOptions(), options);
+  ASSERT_OK(sst_file_writer.Open(sst_file_path));
+
+  // Write new values for keys 0-49
+  Random rnd(42);
+  std::unordered_map<std::string, std::string> new_values;
+  for (int j = 0; j < 50; ++j) {
+    std::string key = Key(j);
+    std::string value = "replaced_" + rnd.RandomString(100);
+    ASSERT_OK(sst_file_writer.Put(key, value));
+    new_values[key] = value;
+  }
+  ASSERT_OK(sst_file_writer.Finish());
+
+  // Perform atomic range replace for the entire column family.
+  // Using RangeOpt() (default constructor) means replace everything in the CF.
+  IngestExternalFileArg arg;
+  arg.column_family = db_->DefaultColumnFamily();
+  arg.external_files = {sst_file_path};
+  arg.atomic_replace_range = RangeOpt();
+  // snapshot_consistency must be false when using atomic_replace_range
+  arg.options.snapshot_consistency = false;
+
+  // Atomic range replace should work even while compactions are aborted
+  ASSERT_OK(db_->IngestExternalFiles({arg}));
+
+  // Now resume compactions after the atomic range replace
+  dbfull()->ResumeAllCompactions();
+
+  // Verify that the atomic range replace worked correctly:
+  // 1. Keys 0-49 should have new replaced values
+  std::string val;
+  for (int j = 0; j < 50; ++j) {
+    std::string key = Key(j);
+    ASSERT_OK(db_->Get(ReadOptions(), key, &val));
+    auto it = new_values.find(key);
+    ASSERT_NE(it, new_values.end());
+    ASSERT_EQ(it->second, val) << "Value mismatch for replaced key: " << key;
+  }
+
+  // 2. Keys 50-99 should not exist (they were replaced/deleted by atomic
+  // replace)
+  for (int j = 50; j < 100; ++j) {
+    std::string key = Key(j);
+    Status get_status = db_->Get(ReadOptions(), key, &val);
+    ASSERT_TRUE(get_status.IsNotFound())
+        << "Key " << key << " should not exist after full CF replace";
+  }
+
+  // Clean up SST files directory
+  ASSERT_OK(DestroyDir(env_, sst_files_dir));
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h
index 451fbd41c70e..a3f25dd7788f 100644
--- a/db/db_impl/db_impl.h
+++ b/db/db_impl/db_impl.h
@@ -455,6 +455,8 @@ class DBImpl : public DB {
 
   void EnableManualCompaction() override;
   void DisableManualCompaction() override;
+  void AbortAllCompactions() override;
+  void ResumeAllCompactions() override;
 
   using DB::SetOptions;
   Status SetOptions(
@@ -2789,6 +2791,14 @@ class DBImpl : public DB {
   // compaction code paths.
   std::atomic<int> manual_compaction_paused_ = false;
 
+  // If non-zero, all compaction jobs (background automatic compactions,
+  // manual compactions via CompactRange, and foreground CompactFiles calls)
+  // are being aborted. Compactions will be signaled to stop. Any new
+  // compaction job would fail immediately. The value indicates how many threads
+  // have called AbortAllCompactions(). It is accessed in read mode outside the
+  // DB mutex in compaction code paths.
+  std::atomic<int> compaction_aborted_ = 0;
+
   // This condition variable is signaled on these conditions:
   // * whenever bg_compaction_scheduled_ goes down to 0
   // * if AnyManualCompaction, whenever a compaction finishes, even if it hasn't
diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc
index 877b61007b99..9cf25f639da0 100644
--- a/db/db_impl/db_impl_compaction_flush.cc
+++ b/db/db_impl/db_impl_compaction_flush.cc
@@ -955,6 +955,10 @@ Status DBImpl::CompactRange(const CompactRangeOptions& options,
     return Status::Incomplete(Status::SubCode::kManualCompactionPaused);
   }
 
+  if (compaction_aborted_.load(std::memory_order_acquire) > 0) {
+    return Status::Incomplete(Status::SubCode::kCompactionAborted);
+  }
+
   if (options.canceled && options.canceled->load(std::memory_order_acquire)) {
     return Status::Incomplete(Status::SubCode::kManualCompactionPaused);
   }
@@ -1487,6 +1491,11 @@ Status DBImpl::CompactFilesImpl(
     return Status::ShutdownInProgress();
   }
 
+  // triggered by AbortAllCompactions
+  if (compaction_aborted_.load(std::memory_order_acquire) > 0) {
+    return Status::Incomplete(Status::SubCode::kCompactionAborted);
+  }
+
   // triggered by DisableManualCompactions or by user-set canceled flag in
   // CompactionOptions
   if (manual_compaction_paused_.load(std::memory_order_acquire) > 0 ||
@@ -1637,9 +1646,9 @@ Status DBImpl::CompactFilesImpl(
       c->mutable_cf_options().paranoid_file_checks,
       c->mutable_cf_options().report_bg_io_stats, dbname_,
       &compaction_job_stats, Env::Priority::USER, io_tracer_,
-      kManualCompactionCanceledFalse_, db_id_, db_session_id_,
-      c->column_family_data()->GetFullHistoryTsLow(), c->trim_ts(),
-      &blob_callback_, &bg_compaction_scheduled_,
+      kManualCompactionCanceledFalse_, compaction_aborted_, db_id_,
+      db_session_id_, c->column_family_data()->GetFullHistoryTsLow(),
+      c->trim_ts(), &blob_callback_, &bg_compaction_scheduled_,
       &bg_bottom_compaction_scheduled_);
 
   // Creating a compaction influences the compaction score because the score
@@ -1710,6 +1719,11 @@ Status DBImpl::CompactFilesImpl(
                    "[%s] [JOB %d] Stopping manual compaction",
                    c->column_family_data()->GetName().c_str(),
                    job_context->job_id);
+  } else if (status.IsCompactionAborted()) {
+    // Don't report aborted compaction as error
+    ROCKS_LOG_INFO(
+        immutable_db_options_.info_log, "[%s] [JOB %d] Compaction aborted",
+        c->column_family_data()->GetName().c_str(), job_context->job_id);
   } else {
     ROCKS_LOG_WARN(immutable_db_options_.info_log,
                    "[%s] [JOB %d] Compaction error: %s",
@@ -2170,6 +2184,17 @@ Status DBImpl::RunManualCompaction(
     return manual.status;
   }
 
+  if (compaction_aborted_.load(std::memory_order_acquire) > 0) {
+    // All compactions are being aborted. Return immediately.
+    int counter = compaction_aborted_.load(std::memory_order_acquire);
+    ROCKS_LOG_INFO(
+        immutable_db_options_.info_log,
+        "RunManualCompaction: Aborting due to compaction_aborted_=%d", counter);
+    manual.status = Status::Incomplete(Status::SubCode::kCompactionAborted);
+    manual.done = true;
+    return manual.status;
+  }
+
   // When a manual compaction arrives, temporarily disable scheduling of
   // non-manual compactions and wait until the number of scheduled compaction
   // jobs drops to zero. This used to be needed to ensure that this manual
@@ -2194,6 +2219,13 @@ Status DBImpl::RunManualCompaction(
     // and `CompactRangeOptions::canceled` might not work well together.
     while (bg_bottom_compaction_scheduled_ > 0 ||
            bg_compaction_scheduled_ > 0) {
+      if (compaction_aborted_.load(std::memory_order_acquire) > 0) {
+        // Pretend the error came from compaction so the below cleanup/error
+        // handling code can process it.
+        manual.done = true;
+        manual.status = Status::Incomplete(Status::SubCode::kCompactionAborted);
+        break;
+      }
       if (manual_compaction_paused_ > 0 || manual.canceled == true) {
         // Pretend the error came from compaction so the below cleanup/error
         // handling code can process it.
@@ -2312,7 +2344,12 @@ Status DBImpl::RunManualCompaction(
     if (!scheduled) {
       // There is nothing scheduled to wait on, so any cancellation can end the
       // manual now.
-      if (manual_compaction_paused_ > 0 || manual.canceled == true) {
+      if (compaction_aborted_.load(std::memory_order_acquire) > 0) {
+        // Stop waiting since it was canceled. Pretend the error came from
+        // compaction so the below cleanup/error handling code can process it.
+        manual.done = true;
+        manual.status = Status::Incomplete(Status::SubCode::kCompactionAborted);
+      } else if (manual_compaction_paused_ > 0 || manual.canceled == true) {
         // Stop waiting since it was canceled. Pretend the error came from
         // compaction so the below cleanup/error handling code can process it.
         manual.done = true;
@@ -2930,6 +2967,61 @@ void DBImpl::EnableManualCompaction() {
   manual_compaction_paused_.fetch_sub(1, std::memory_order_release);
 }
 
+void DBImpl::AbortAllCompactions() {
+  InstrumentedMutexLock l(&mutex_);
+
+  // Increment the abort counter to signal all compactions to abort
+  compaction_aborted_.fetch_add(1, std::memory_order_release);
+
+  TEST_SYNC_POINT("DBImpl::AbortAllCompactions:FlagSet");
+
+  // Mark all manual compactions as canceled
+  for (const auto& manual_compaction : manual_compaction_dequeue_) {
+    manual_compaction->canceled = true;
+  }
+
+  // Wake up any waiting compaction threads to check the abort signal
+  bg_cv_.SignalAll();
+
+  // Wait for all running compactions (both manual and automatic) to finish
+  // or abort before returning.
+  // Note: bg_cv_.Wait() releases the mutex while waiting, so other threads
+  // can make progress and signal when compactions complete.
+  while (bg_bottom_compaction_scheduled_ > 0 || bg_compaction_scheduled_ > 0 ||
+         HasPendingManualCompaction()) {
+    bg_cv_.Wait();
+  }
+}
+
+void DBImpl::ResumeAllCompactions() {
+  InstrumentedMutexLock l(&mutex_);
+  int before = compaction_aborted_.load(std::memory_order_acquire);
+
+  // Guard against calling Resume without prior Abort
+  if (before <= 0) {
+    ROCKS_LOG_WARN(immutable_db_options_.info_log,
+                   "ResumeAllCompactions called without prior "
+                   "AbortAllCompactions (counter=%d)",
+                   before);
+    return;
+  }
+
+  // Decrement the abort counter
+  compaction_aborted_.fetch_sub(1, std::memory_order_release);
+
+  // As the operation is executed under db mutex, we could just use before value
+  // to calculate the current value.
+  int current = before - 1;
+  ROCKS_LOG_INFO(immutable_db_options_.info_log,
+                 "ResumeAllCompactions: counter %d -> %d", before, current);
+
+  // If this is the last resume call (abort counter back to 0), schedule
+  // compactions that may have been waiting
+  if (current == 0) {
+    MaybeScheduleFlushOrCompaction();
+  }
+}
+
 void DBImpl::MaybeScheduleFlushOrCompaction() {
   mutex_.AssertHeld();
   TEST_SYNC_POINT("DBImpl::MaybeScheduleFlushOrCompaction:Start");
@@ -2994,6 +3086,9 @@ void DBImpl::MaybeScheduleFlushOrCompaction() {
   if (bg_compaction_paused_ > 0) {
     // we paused the background compaction
     return;
+  } else if (compaction_aborted_.load(std::memory_order_acquire) > 0) {
+    // we are aborting all compactions
+    return;
   } else if (error_handler_.IsBGWorkStopped()) {
     // Compaction is not part of the recovery sequence from a hard error. We
     // might get here because recovery might do a flush and install a new
@@ -3531,7 +3626,8 @@ void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction,
           10000);  // prevent hot loop
       mutex_.Lock();
     } else if (!s.ok() && !s.IsShutdownInProgress() &&
-               !s.IsManualCompactionPaused() && !s.IsColumnFamilyDropped()) {
+               !s.IsManualCompactionPaused() && !s.IsColumnFamilyDropped() &&
+               !s.IsCompactionAborted()) {
       // Wait a little bit before retrying background compaction in
       // case this is an environmental problem and we do not want to
       // chew up resources for failed compactions for the duration of
@@ -3563,6 +3659,7 @@ void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction,
     // case of a failure). Thus, we force full scan in FindObsoleteFiles()
     FindObsoleteFiles(&job_context, !s.ok() && !s.IsShutdownInProgress() &&
                                         !s.IsManualCompactionPaused() &&
+                                        !s.IsCompactionAborted() &&
                                         !s.IsColumnFamilyDropped() &&
                                         !s.IsBusy());
     TEST_SYNC_POINT("DBImpl::BackgroundCallCompaction:FoundObsoleteFiles");
@@ -3667,6 +3764,8 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
   if (!error_handler_.IsBGWorkStopped()) {
     if (shutting_down_.load(std::memory_order_acquire)) {
       status = Status::ShutdownInProgress();
+    } else if (compaction_aborted_.load(std::memory_order_acquire) > 0) {
+      status = Status::Incomplete(Status::SubCode::kCompactionAborted);
     } else if (is_manual &&
                manual_compaction->canceled.load(std::memory_order_acquire)) {
       status = Status::Incomplete(Status::SubCode::kManualCompactionPaused);
@@ -4283,8 +4382,9 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
         &compaction_job_stats, thread_pri, io_tracer_,
         is_manual ? manual_compaction->canceled
                   : kManualCompactionCanceledFalse_,
-        db_id_, db_session_id_, c->column_family_data()->GetFullHistoryTsLow(),
-        c->trim_ts(), &blob_callback_, &bg_compaction_scheduled_,
+        compaction_aborted_, db_id_, db_session_id_,
+        c->column_family_data()->GetFullHistoryTsLow(), c->trim_ts(),
+        &blob_callback_, &bg_compaction_scheduled_,
         &bg_bottom_compaction_scheduled_);
     compaction_job.Prepare(std::nullopt /*subcompact to be computed*/);
 
@@ -4367,7 +4467,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
   }
 
   if (status.ok() || status.IsCompactionTooLarge() ||
-      status.IsManualCompactionPaused()) {
+      status.IsManualCompactionPaused() || status.IsCompactionAborted()) {
     // Done
   } else if (status.IsColumnFamilyDropped() || status.IsShutdownInProgress()) {
     // Ignore compaction errors found during shutting down
@@ -4630,6 +4730,7 @@ void DBImpl::BuildCompactionJobInfo(
   compaction_job_info->cf_id = cfd->GetID();
   compaction_job_info->cf_name = cfd->GetName();
   compaction_job_info->status = st;
+  compaction_job_info->aborted = st.IsCompactionAborted();
   compaction_job_info->thread_id = env_->GetThreadID();
   compaction_job_info->job_id = job_id;
   compaction_job_info->base_input_level = c->start_level();
diff --git a/db/db_test.cc b/db/db_test.cc
index 7456679a152a..7909763ed0a5 100644
--- a/db/db_test.cc
+++ b/db/db_test.cc
@@ -3691,6 +3691,8 @@ class ModelDB : public DB {
   void EnableManualCompaction() override {}
 
   void DisableManualCompaction() override {}
+  void AbortAllCompactions() override {}
+  void ResumeAllCompactions() override {}
 
   Status WaitForCompact(
       const WaitForCompactOptions& /* wait_for_compact_options */) override {
@@ -6484,7 +6486,8 @@ TEST_P(DBTestWithParam, CompactionTotalTimeTest) {
   ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
 
   // Hard-coded number in CompactionJob::ProcessKeyValueCompaction().
-  const int kRecordStatsEvery = 1000;
+  // Uses 1024 (power of 2) for efficient bitwise check.
+  const int kRecordStatsEvery = 1024;
   // The stat COMPACTION_CPU_TOTAL_TIME should be recorded
   // during compaction and once more after compaction.
   ASSERT_EQ(n / kRecordStatsEvery + 1, record_count);
diff --git a/db/internal_stats.cc b/db/internal_stats.cc
index c25f7c589b1f..6b2d75385ba4 100644
--- a/db/internal_stats.cc
+++ b/db/internal_stats.cc
@@ -310,6 +310,7 @@ static const std::string aggregated_table_properties_at_level =
 static const std::string num_running_compactions = "num-running-compactions";
 static const std::string num_running_compaction_sorted_runs =
     "num-running-compaction-sorted-runs";
+static const std::string compaction_abort_count = "compaction-abort-count";
 static const std::string num_running_flushes = "num-running-flushes";
 static const std::string actual_delayed_write_rate =
     "actual-delayed-write-rate";
@@ -362,6 +363,8 @@ const std::string DB::Properties::kNumRunningCompactions =
     rocksdb_prefix + num_running_compactions;
 const std::string DB::Properties::kNumRunningCompactionSortedRuns =
     rocksdb_prefix + num_running_compaction_sorted_runs;
+const std::string DB::Properties::kCompactionAbortCount =
+    rocksdb_prefix + compaction_abort_count;
 const std::string DB::Properties::kNumRunningFlushes =
     rocksdb_prefix + num_running_flushes;
 const std::string DB::Properties::kBackgroundErrors =
@@ -594,6 +597,9 @@ const UnorderedMap<std::string, DBPropertyInfo>
         {DB::Properties::kNumRunningCompactionSortedRuns,
          {false, nullptr, &InternalStats::HandleNumRunningCompactionSortedRuns,
           nullptr, nullptr}},
+        {DB::Properties::kCompactionAbortCount,
+         {false, nullptr, &InternalStats::HandleCompactionAbortCount, nullptr,
+          nullptr}},
         {DB::Properties::kActualDelayedWriteRate,
          {false, nullptr, &InternalStats::HandleActualDelayedWriteRate, nullptr,
           nullptr}},
@@ -1292,6 +1298,13 @@ bool InternalStats::HandleNumRunningCompactionSortedRuns(uint64_t* value,
   return true;
 }
 
+bool InternalStats::HandleCompactionAbortCount(uint64_t* value, DBImpl* db,
+                                               Version* /*version*/) {
+  *value = static_cast<uint64_t>(
+      db->compaction_aborted_.load(std::memory_order_acquire));
+  return true;
+}
+
 bool InternalStats::HandleBackgroundErrors(uint64_t* value, DBImpl* /*db*/,
                                            Version* /*version*/) {
   // Accumulated number of  errors in background flushes or compactions.
diff --git a/db/internal_stats.h b/db/internal_stats.h
index a1b4fbe6c555..347b3a617aae 100644
--- a/db/internal_stats.h
+++ b/db/internal_stats.h
@@ -852,6 +852,8 @@ class InternalStats {
                                    Version* version);
   bool HandleNumRunningCompactionSortedRuns(uint64_t* value, DBImpl* db,
                                             Version* version);
+  bool HandleCompactionAbortCount(uint64_t* value, DBImpl* db,
+                                  Version* version);
   bool HandleBackgroundErrors(uint64_t* value, DBImpl* db, Version* version);
   bool HandleCurSizeActiveMemTable(uint64_t* value, DBImpl* db,
                                    Version* version);
diff --git a/db_stress_tool/db_stress_common.h b/db_stress_tool/db_stress_common.h
index 2768a1eff1df..eca5656204f1 100644
--- a/db_stress_tool/db_stress_common.h
+++ b/db_stress_tool/db_stress_common.h
@@ -218,6 +218,7 @@ DECLARE_int32(reset_stats_one_in);
 DECLARE_int32(pause_background_one_in);
 DECLARE_int32(disable_file_deletions_one_in);
 DECLARE_int32(disable_manual_compaction_one_in);
+DECLARE_int32(abort_and_resume_compactions_one_in);
 DECLARE_int32(compact_range_width);
 DECLARE_int32(acquire_snapshot_one_in);
 DECLARE_bool(compare_full_db_state_snapshot);
diff --git a/db_stress_tool/db_stress_gflags.cc b/db_stress_tool/db_stress_gflags.cc
index dead587f5945..0678609ecaa8 100644
--- a/db_stress_tool/db_stress_gflags.cc
+++ b/db_stress_tool/db_stress_gflags.cc
@@ -796,6 +796,10 @@ DEFINE_int32(
     "If non-zero, then DisableManualCompaction()+Enable will be called "
     "once for every N ops on average.  0 disables.");
 
+DEFINE_int32(abort_and_resume_compactions_one_in, 0,
+             "If non-zero, then AbortAllCompactions()+Resume will be called "
+             "once for every N ops on average. 0 disables.");
+
 DEFINE_int32(compact_range_width, 10000,
              "The width of the ranges passed to CompactRange().");
 
diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc
index e826b2552a78..cbce41a89d2e 100644
--- a/db_stress_tool/db_stress_test_base.cc
+++ b/db_stress_tool/db_stress_test_base.cc
@@ -1271,6 +1271,11 @@ void StressTest::OperateDb(ThreadState* thread) {
         ProcessStatus(shared, "TestDisableManualCompaction", status);
       }
 
+      if (thread->rand.OneInOpt(FLAGS_abort_and_resume_compactions_one_in)) {
+        Status status = TestAbortAndResumeCompactions(thread);
+        ProcessStatus(shared, "TestAbortAndResumeCompactions", status);
+      }
+
       if (thread->rand.OneInOpt(FLAGS_verify_checksum_one_in)) {
         ThreadStatusUtil::SetEnableTracking(FLAGS_enable_thread_tracking);
         ThreadStatusUtil::SetThreadOperation(
@@ -3047,8 +3052,9 @@ void StressTest::TestCompactFiles(ThreadState* thread,
         // TOOD (hx235): allow an exact list of tolerable failures under stress
         // test
         bool non_ok_status_allowed =
-            s.IsManualCompactionPaused() || IsErrorInjectedAndRetryable(s) ||
-            s.IsAborted() || s.IsInvalidArgument() || s.IsNotSupported();
+            s.IsManualCompactionPaused() || s.IsCompactionAborted() ||
+            IsErrorInjectedAndRetryable(s) || s.IsAborted() ||
+            s.IsInvalidArgument() || s.IsNotSupported();
         if (!non_ok_status_allowed) {
           fprintf(stderr,
                   "Unable to perform CompactFiles(): %s under specified "
@@ -3141,6 +3147,20 @@ Status StressTest::TestDisableManualCompaction(ThreadState* thread) {
   return Status::OK();
 }
 
+Status StressTest::TestAbortAndResumeCompactions(ThreadState* thread) {
+  // Abort all running compactions and prevent new ones from starting
+  db_->AbortAllCompactions();
+  // Sleep to allow other threads to attempt operations while aborted
+  // Uses same sleep pattern as TestPauseBackground and
+  // TestDisableManualCompaction
+  int pwr2_micros =
+      std::min(thread->rand.Uniform(25), thread->rand.Uniform(25));
+  clock_->SleepForMicroseconds(1 << pwr2_micros);
+  // Resume compactions
+  db_->ResumeAllCompactions();
+  return Status::OK();
+}
+
 void StressTest::TestAcquireSnapshot(ThreadState* thread,
                                      int rand_column_family,
                                      const std::string& keystr, uint64_t i) {
@@ -3316,7 +3336,7 @@ void StressTest::TestCompactRange(ThreadState* thread, int64_t rand_key,
   if (!status.ok()) {
     // TOOD (hx235): allow an exact list of tolerable failures under stress test
     bool non_ok_status_allowed =
-        status.IsManualCompactionPaused() ||
+        status.IsManualCompactionPaused() || status.IsCompactionAborted() ||
         IsErrorInjectedAndRetryable(status) || status.IsAborted() ||
         status.IsInvalidArgument() || status.IsNotSupported();
     if (!non_ok_status_allowed) {
diff --git a/db_stress_tool/db_stress_test_base.h b/db_stress_tool/db_stress_test_base.h
index da1589be541a..3e8bc2af0def 100644
--- a/db_stress_tool/db_stress_test_base.h
+++ b/db_stress_tool/db_stress_test_base.h
@@ -332,6 +332,8 @@ class StressTest {
 
   Status TestDisableManualCompaction(ThreadState* thread);
 
+  Status TestAbortAndResumeCompactions(ThreadState* thread);
+
   void TestAcquireSnapshot(ThreadState* thread, int rand_column_family,
                            const std::string& keystr, uint64_t i);
 
diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h
index 2abb7eb02513..8b4be252cfd9 100644
--- a/include/rocksdb/db.h
+++ b/include/rocksdb/db.h
@@ -1253,6 +1253,10 @@ class DB {
     //  sorted runs being processed by currently running compactions.
     static const std::string kNumRunningCompactionSortedRuns;
 
+    //  "rocksdb.compaction-abort-count" - returns the current value of the
+    //      compaction abort counter.
+    static const std::string kCompactionAbortCount;
+
     //  "rocksdb.background-errors" - returns accumulated number of background
     //      errors.
     static const std::string kBackgroundErrors;
@@ -1731,6 +1735,46 @@ class DB {
   // DisableManualCompaction() has been called.
   virtual void EnableManualCompaction() = 0;
 
+  // Abort all compaction work/jobs. This function will signal all
+  // running compactions (both automatic and manual, background and foreground)
+  // to abort and will wait for them to finish or abort before returning. After
+  // this function returns, new compaction work will be aborted immediately
+  // until ResumeAllCompactions() is called.
+  //
+  // The compaction abort is checked periodically (every 1000 keys processed),
+  // so ongoing compactions should abort as well within a reasonable time.
+  // This function blocks until all compactions have completed or aborted.
+  //
+  // Any output files from aborted compactions are automatically cleaned up,
+  // ensuring no partial compaction results are installed, except for resumable
+  // compaction.
+  //
+  // This function supports concurrent abort requests from multiple callers
+  // without coordination between them. The call count is tracked, and
+  // compactions only resume after the number of ResumeAllCompactions() calls
+  // matches number of AbortAllCompactions() calls.
+  //
+  // Differences with other compaction control APIs:
+  // - DisableManualCompaction(): Only pauses manual compactions, waits for
+  //   them to finish naturally. AbortAllCompactions() actively cancels both
+  //   automatic and manual compactions.
+  // - PauseBackgroundWork(): Pauses all background work (flush + compaction),
+  //   waits for work to finish naturally. AbortAllCompactions() only affects
+  //   compactions and actively cancels them.
+  //
+  // Note: Compaction service (remote compaction) is not currently supported.
+  // Aborted compactions return Status::Incomplete with subcode
+  // kCompactionAborted.
+  virtual void AbortAllCompactions() = 0;
+
+  // Resume all compactions that were aborted by AbortAllCompactions().
+  // This function must be called as many times as AbortAllCompactions()
+  // has been called in order to resume compactions. This reference-counting
+  // behavior ensures that if multiple callers independently request an
+  // abort, compactions will not resume until all of them have called
+  // ResumeAllCompactions().
+  virtual void ResumeAllCompactions() = 0;
+
   // Wait for all flush and compactions jobs to finish. Jobs to wait include the
   // unscheduled (queued, but not scheduled yet). If the db is shutting down,
   // Status::ShutdownInProgress will be returned.
diff --git a/include/rocksdb/listener.h b/include/rocksdb/listener.h
index fe90a7b2ec94..1b41ca81f3d9 100644
--- a/include/rocksdb/listener.h
+++ b/include/rocksdb/listener.h
@@ -488,6 +488,9 @@ struct CompactionJobInfo {
   // Information about blob files deleted during compaction in Integrated
   // BlobDB.
   std::vector<BlobFileGarbageInfo> blob_file_garbage_infos;
+
+  // Whether this compaction was aborted via AbortAllCompactions()
+  bool aborted = false;
 };
 
 struct MemTableInfo {
diff --git a/include/rocksdb/statistics.h b/include/rocksdb/statistics.h
index bdffbbb25a03..ae4ef5792408 100644
--- a/include/rocksdb/statistics.h
+++ b/include/rocksdb/statistics.h
@@ -162,6 +162,8 @@ enum Tickers : uint32_t {
   COMPACTION_OPTIMIZED_DEL_DROP_OBSOLETE,
   // If a compaction was canceled in sfm to prevent ENOSPC
   COMPACTION_CANCELLED,
+  // Number of compactions aborted via AbortAllCompactions()
+  COMPACTION_ABORTED,
 
   // Number of keys written to the database via the Put and Write call's
   NUMBER_KEYS_WRITTEN,
diff --git a/include/rocksdb/status.h b/include/rocksdb/status.h
index afb9651faf27..c3eeb082c3ed 100644
--- a/include/rocksdb/status.h
+++ b/include/rocksdb/status.h
@@ -117,6 +117,7 @@ class Status {
     kMergeOperandThresholdExceeded = 16,
     kPrefetchLimitReached = 17,
     kNotExpectedCodePath = 18,
+    kCompactionAborted = 19,
     kMaxSubCode
   };
 
@@ -483,6 +484,13 @@ class Status {
     return (code() == kIncomplete) && (subcode() == kManualCompactionPaused);
   }
 
+  // Returns true iff the status indicates compaction aborted. This
+  // is caused by a call to AbortAllCompactions
+  bool IsCompactionAborted() const {
+    MarkChecked();
+    return (code() == kIncomplete) && (subcode() == kCompactionAborted);
+  }
+
   // Returns true iff the status indicates a TxnNotPrepared error.
   bool IsTxnNotPrepared() const {
     MarkChecked();
diff --git a/include/rocksdb/utilities/stackable_db.h b/include/rocksdb/utilities/stackable_db.h
index 8cd4057fd553..f48acb2433db 100644
--- a/include/rocksdb/utilities/stackable_db.h
+++ b/include/rocksdb/utilities/stackable_db.h
@@ -375,6 +375,8 @@ class StackableDB : public DB {
   void DisableManualCompaction() override {
     return db_->DisableManualCompaction();
   }
+  void AbortAllCompactions() override { return db_->AbortAllCompactions(); }
+  void ResumeAllCompactions() override { return db_->ResumeAllCompactions(); }
 
   Status WaitForCompact(
       const WaitForCompactOptions& wait_for_compact_options) override {
diff --git a/java/rocksjni/rocksjni.cc b/java/rocksjni/rocksjni.cc
index 57272170c326..af47c0e95644 100644
--- a/java/rocksjni/rocksjni.cc
+++ b/java/rocksjni/rocksjni.cc
@@ -2978,6 +2978,28 @@ void Java_org_rocksdb_RocksDB_continueBackgroundWork(JNIEnv* env, jclass,
   }
 }
 
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    abortAllCompactions
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_RocksDB_abortAllCompactions(JNIEnv*, jclass,
+                                                  jlong jdb_handle) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  db->AbortAllCompactions();
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    resumeAllCompactions
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_RocksDB_resumeAllCompactions(JNIEnv*, jclass,
+                                                   jlong jdb_handle) {
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  db->ResumeAllCompactions();
+}
+
 /*
  * Class:     org_rocksdb_RocksDB
  * Method:    enableAutoCompaction
diff --git a/java/src/main/java/org/rocksdb/RocksDB.java b/java/src/main/java/org/rocksdb/RocksDB.java
index fe2f38af64f9..ebe134726982 100644
--- a/java/src/main/java/org/rocksdb/RocksDB.java
+++ b/java/src/main/java/org/rocksdb/RocksDB.java
@@ -4084,6 +4084,23 @@ public void continueBackgroundWork() throws RocksDBException {
     continueBackgroundWork(nativeHandle_);
   }
 
+  /**
+   * Abort all running and pending compaction jobs. This method will signal
+   * all active compactions to terminate and wait for them to complete.
+   * No new compactions will be scheduled until {@link #resumeAllCompactions()} is called.
+   */
+  public void abortAllCompactions() {
+    abortAllCompactions(nativeHandle_);
+  }
+
+  /**
+   * Resume compaction scheduling after {@link #abortAllCompactions()} was called.
+   * Must be called the same number of times as {@link #abortAllCompactions()}.
+   */
+  public void resumeAllCompactions() {
+    resumeAllCompactions(nativeHandle_);
+  }
+
   /**
    * Enable automatic compactions for the given column
    * families if they were previously disabled.
@@ -5036,6 +5053,8 @@ private static native String[] compactFiles(final long handle, final long compac
   private static native void cancelAllBackgroundWork(final long handle, final boolean wait);
   private static native void pauseBackgroundWork(final long handle) throws RocksDBException;
   private static native void continueBackgroundWork(final long handle) throws RocksDBException;
+  private static native void abortAllCompactions(final long handle);
+  private static native void resumeAllCompactions(final long handle);
   private static native void enableAutoCompaction(
       final long handle, final long[] columnFamilyHandles) throws RocksDBException;
   private static native int numberLevels(final long handle, final long columnFamilyHandle);
diff --git a/monitoring/statistics.cc b/monitoring/statistics.cc
index 231e5b400288..ccc92bcb6152 100644
--- a/monitoring/statistics.cc
+++ b/monitoring/statistics.cc
@@ -93,6 +93,7 @@ const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
     {COMPACTION_OPTIMIZED_DEL_DROP_OBSOLETE,
      "rocksdb.compaction.optimized.del.drop.obsolete"},
     {COMPACTION_CANCELLED, "rocksdb.compaction.cancelled"},
+    {COMPACTION_ABORTED, "rocksdb.compaction.aborted"},
     {NUMBER_KEYS_WRITTEN, "rocksdb.number.keys.written"},
     {NUMBER_KEYS_READ, "rocksdb.number.keys.read"},
     {NUMBER_KEYS_UPDATED, "rocksdb.number.keys.updated"},
diff --git a/monitoring/stats_history_test.cc b/monitoring/stats_history_test.cc
index ee29bd20921a..f98917a5f4a3 100644
--- a/monitoring/stats_history_test.cc
+++ b/monitoring/stats_history_test.cc
@@ -185,7 +185,7 @@ TEST_F(StatsHistoryTest, GetStatsHistoryInMemory) {
 
 TEST_F(StatsHistoryTest, InMemoryStatsHistoryPurging) {
   constexpr int kPeriodSec = 1;
-  constexpr int kEstimatedOneSliceSize = 22000;
+  constexpr int kEstimatedOneSliceSize = 22100;
 
   Options options;
   options.create_if_missing = true;
diff --git a/src.mk b/src.mk
index fc54f2804f90..0bae5ee333fd 100644
--- a/src.mk
+++ b/src.mk
@@ -490,6 +490,7 @@ TEST_MAIN_SOURCES =                                                     \
   db/db_basic_test.cc                                                   \
   db/db_block_cache_test.cc                                             \
   db/db_bloom_filter_test.cc                                            \
+  db/db_compaction_abort_test.cc                                        \
   db/db_compaction_filter_test.cc                                       \
   db/db_compaction_test.cc                                              \
   db/db_clip_test.cc                                                    \
diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index 30dd435980af..707fdd27b594 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -199,6 +199,7 @@ def apply_random_seed_per_iteration():
     "pause_background_one_in": lambda: random.choice([10000, 1000000]),
     "disable_file_deletions_one_in": lambda: random.choice([10000, 1000000]),
     "disable_manual_compaction_one_in": lambda: random.choice([10000, 1000000]),
+    "abort_and_resume_compactions_one_in": lambda: random.choice([10000, 1000000]),
     "prefix_size": lambda: random.choice([-1, 1, 5, 7, 8]),
     "prefixpercent": 5,
     "progress_reports": 0,
diff --git a/unreleased_history/public_api_changes/abort_compaction_apis.md b/unreleased_history/public_api_changes/abort_compaction_apis.md
new file mode 100644
index 000000000000..d55882b3935d
--- /dev/null
+++ b/unreleased_history/public_api_changes/abort_compaction_apis.md
@@ -0,0 +1 @@
+Added new virtual methods `AbortAllCompactions()` and `ResumeAllCompactions()` to the `DB` class. Added new `Status::SubCode::kCompactionAborted` to indicate a compaction was aborted. Added `Status::IsCompactionAborted()` helper method to check if a status represents an aborted compaction.
diff --git a/util/status.cc b/util/status.cc
index 56d62b66190a..cf9e59e96757 100644
--- a/util/status.cc
+++ b/util/status.cc
@@ -46,7 +46,9 @@ static const char* msgs[static_cast<int>(Status::kMaxSubCode)] = {
     "IO fenced off",          // kIOFenced
     "Merge operator failed",  // kMergeOperatorFailed
     "Number of operands merged exceeded threshold",  // kMergeOperandThresholdExceeded
-    "MultiScan reached file prefetch limit",         // kMultiScanPrefetchLimit
+    "MultiScan reached file prefetch limit",         // kPrefetchLimitReached
+    "Not expected code path",                        // kNotExpectedCodePath
+    "All compactions aborted",                       // kCompactionAborted
 };
 
 Status::Status(Code _code, SubCode _subcode, const Slice& msg,

From 27d70ecd7498b4b17bb0ae0785cd55156a973bd3 Mon Sep 17 00:00:00 2001
From: Andrew Chang <andrewrchang@meta.com>
Date: Fri, 30 Jan 2026 12:18:44 -0800
Subject: [PATCH 452/500] Propagate Poll errors in
 FilePrefetchBuffer::PollIfNeeded (#14282)

Summary:
Pull Request resolved: https://github.com/facebook/rocksdb/pull/14282

Previously, `PollIfNeeded` returned `void` and silently ignored errors from
`fs_->Poll()` by calling `PermitUncheckedError()`. This could lead to silent
data corruption or unexpected behavior when Poll operations fail.

This diff changes `PollIfNeeded` to return `Status` and properly propagate
Poll errors to callers. When Poll fails:
1. The IO handle is cleaned up via `DestroyAndClearIOHandle`
2. The error status is returned to the caller
3. Callers (`HandleOverlappingAsyncData` and `PrefetchInternal`) now check
   and propagate this error

Also adds a `TEST_SYNC_POINT_CALLBACK` to allow tests to inject Poll errors.

Reviewed By: anand1976

Differential Revision: D91624185

fbshipit-source-id: 8dd0ee6588ed1ce4bf080bcf857b778c5140ccf5
---
 file/file_prefetch_buffer.cc |  24 +++-
 file/file_prefetch_buffer.h  |   2 +-
 file/prefetch_test.cc        | 251 +++++++++++++++++++++++++++++++++++
 3 files changed, 272 insertions(+), 5 deletions(-)

diff --git a/file/file_prefetch_buffer.cc b/file/file_prefetch_buffer.cc
index dadc8e46ec07..e8aa7d10c512 100644
--- a/file/file_prefetch_buffer.cc
+++ b/file/file_prefetch_buffer.cc
@@ -351,7 +351,7 @@ void FilePrefetchBuffer::ClearOutdatedData(uint64_t offset, size_t length) {
   assert(IsBufferQueueEmpty() || buf->IsOffsetInBuffer(offset));
 }
 
-void FilePrefetchBuffer::PollIfNeeded(uint64_t offset, size_t length) {
+Status FilePrefetchBuffer::PollIfNeeded(uint64_t offset, size_t length) {
   BufferInfo* buf = GetFirstBuffer();
 
   if (buf->async_read_in_progress_ && fs_ != nullptr) {
@@ -362,7 +362,16 @@ void FilePrefetchBuffer::PollIfNeeded(uint64_t offset, size_t length) {
       std::vector<void*> handles;
       handles.emplace_back(buf->io_handle_);
       StopWatch sw(clock_, stats_, POLL_WAIT_MICROS);
-      fs_->Poll(handles, 1).PermitUncheckedError();
+      IOStatus io_s = fs_->Poll(handles, 1);
+      // Allow tests to inject Poll errors
+      TEST_SYNC_POINT_CALLBACK("FilePrefetchBuffer::PollIfNeeded:IOStatus",
+                               &io_s);
+      if (!io_s.ok()) {
+        // On Poll failure, clean up the handle and abort.
+        // DestroyAndClearIOHandle also sets async_read_in_progress_ to false.
+        DestroyAndClearIOHandle(buf);
+        return io_s;
+      }
     }
 
     // Reset and Release io_handle after the Poll API as request has been
@@ -373,6 +382,7 @@ void FilePrefetchBuffer::PollIfNeeded(uint64_t offset, size_t length) {
   // Always call outdated data after Poll as Buffers might be out of sync w.r.t
   // offset and length.
   ClearOutdatedData(offset, length);
+  return Status::OK();
 }
 
 // ReadAheadSizeTuning API calls readaheadsize_cb_
@@ -511,7 +521,10 @@ Status FilePrefetchBuffer::HandleOverlappingAsyncData(
   // by Seek, but the next access is at another offset.
   if (buf->async_read_in_progress_ &&
       buf->IsOffsetInBufferWithAsyncProgress(offset)) {
-    PollIfNeeded(offset, length);
+    Status poll_status = PollIfNeeded(offset, length);
+    if (!poll_status.ok()) {
+      return poll_status;
+    }
   }
 
   if (IsBufferQueueEmpty() || NumBuffersAllocated() == 1) {
@@ -646,7 +659,10 @@ Status FilePrefetchBuffer::PrefetchInternal(const IOOptions& opts,
       return s;
     }
   } else {
-    PollIfNeeded(tmp_offset, tmp_length);
+    Status poll_status = PollIfNeeded(tmp_offset, tmp_length);
+    if (!poll_status.ok()) {
+      return poll_status;
+    }
   }
 
   AllocateBufferIfEmpty();
diff --git a/file/file_prefetch_buffer.h b/file/file_prefetch_buffer.h
index 575e9ebcd795..5ebf1f051df9 100644
--- a/file/file_prefetch_buffer.h
+++ b/file/file_prefetch_buffer.h
@@ -431,7 +431,7 @@ class FilePrefetchBuffer {
   void ClearOutdatedData(uint64_t offset, size_t len);
 
   // It calls Poll API to check for any pending asynchronous request.
-  void PollIfNeeded(uint64_t offset, size_t len);
+  Status PollIfNeeded(uint64_t offset, size_t len);
 
   Status PrefetchInternal(const IOOptions& opts, RandomAccessFileReader* reader,
                           uint64_t offset, size_t length, size_t readahead_size,
diff --git a/file/prefetch_test.cc b/file/prefetch_test.cc
index bcfeb38edc75..472fdc13bd1f 100644
--- a/file/prefetch_test.cc
+++ b/file/prefetch_test.cc
@@ -2517,6 +2517,187 @@ TEST_P(PrefetchTest1, SeekParallelizationTest) {
   Close();
 }
 
+TEST_P(PrefetchTest1, PollErrorRecoveryDuringIteration) {
+  // This end-to-end test verifies that Poll() errors during async prefetching
+  // are properly propagated to the iterator. When Poll() fails, the iterator
+  // should stop and return an IOError status.
+  //
+  // With error injection on the 3rd Poll call, the iterator reads ~231 keys
+  // (out of 500) before encountering the error.
+
+  if (mem_env_ || encrypted_env_) {
+    ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment");
+    return;
+  }
+
+  const int kNumKeys = 500;
+  std::shared_ptr<MockFS> fs = std::make_shared<MockFS>(
+      FileSystem::Default(), /*support_prefetch=*/false);
+  std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
+
+  bool use_direct_io = GetParam();
+  Options options;
+  SetGenericOptions(env.get(), use_direct_io, options);
+  options.statistics = CreateDBStatistics();
+  BlockBasedTableOptions table_options;
+  SetBlockBasedTableOptions(table_options);
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  Status s = TryReopen(options);
+  if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) {
+    ROCKSDB_GTEST_SKIP("Direct IO not supported");
+    return;
+  }
+  ASSERT_OK(s);
+
+  // Write keys with known values so we can verify correctness
+  std::map<std::string, std::string> expected_data;
+  {
+    WriteBatch batch;
+    for (int i = 0; i < kNumKeys; i++) {
+      std::string key = BuildKey(i);
+      std::string value = "value_" + std::to_string(i) + "_" +
+                          std::string(100, 'x');  // Make values ~110 bytes
+      ASSERT_OK(batch.Put(key, value));
+      expected_data[key] = value;
+    }
+    ASSERT_OK(db_->Write(WriteOptions(), &batch));
+    ASSERT_OK(Flush());
+  }
+
+  std::string start_key = BuildKey(0);
+  std::string end_key = BuildKey(kNumKeys - 1);
+  Slice least(start_key.data(), start_key.size());
+  Slice greatest(end_key.data(), end_key.size());
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &least, &greatest));
+
+  // Set up callbacks to track async IO and inject Poll errors
+  std::atomic<int> poll_call_count{0};
+  std::atomic<int> poll_error_injected_count{0};
+  bool read_async_called = false;
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "FilePrefetchBuffer::PollIfNeeded:IOStatus", [&](void* arg) {
+        poll_call_count++;
+        int current_count = poll_call_count.load();
+
+        // Inject error on the third Poll call to allow some keys to be read
+        // first
+        if (current_count == 3) {
+          IOStatus* io_s = static_cast<IOStatus*>(arg);
+          *io_s = IOStatus::IOError("Injected Poll error for e2e testing");
+          poll_error_injected_count++;
+          std::cout << "PollErrorRecoveryDuringIteration: Injected error on "
+                       "Poll call #"
+                    << current_count << std::endl;
+        }
+      });
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "UpdateResults::io_uring_result",
+      [&](void* /*arg*/) { read_async_called = true; });
+
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  // Iterate through all keys with async IO enabled
+  ReadOptions ro;
+  ro.async_io = true;
+  ro.adaptive_readahead = true;
+
+  int keys_read = 0;
+  int data_mismatches = 0;
+  Status iter_status;
+  {
+    auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ro));
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      std::string key = iter->key().ToString();
+      std::string value = iter->value().ToString();
+
+      auto it = expected_data.find(key);
+      if (it == expected_data.end()) {
+        std::cout << "PollErrorRecoveryDuringIteration: Unexpected key: " << key
+                  << std::endl;
+        data_mismatches++;
+      } else if (it->second != value) {
+        std::cout << "PollErrorRecoveryDuringIteration: Value mismatch for key "
+                  << key << std::endl;
+        data_mismatches++;
+      }
+      keys_read++;
+    }
+    iter_status = iter->status();
+  }
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  // Log results
+  std::cout << "PollErrorRecoveryDuringIteration: " << "read_async_called="
+            << read_async_called << ", poll_calls=" << poll_call_count.load()
+            << ", poll_errors_injected=" << poll_error_injected_count.load()
+            << ", keys_read=" << keys_read << ", expected_keys=" << kNumKeys
+            << ", data_mismatches=" << data_mismatches
+            << ", iter_status=" << iter_status.ToString() << std::endl;
+
+  // Verify no data mismatches occurred for keys that were read
+  ASSERT_EQ(data_mismatches, 0)
+      << "Found " << data_mismatches << " data mismatches";
+
+  if (read_async_called) {
+    // Async IO was used - verify Poll error was injected and propagated
+    ASSERT_EQ(poll_call_count.load(), 3)
+        << "Expected exactly 3 Poll calls when error injected on 3rd call";
+    ASSERT_EQ(poll_error_injected_count.load(), 1)
+        << "Expected exactly 1 Poll error to be injected";
+
+    // The iterator should have stopped with an error status
+    ASSERT_TRUE(iter_status.IsIOError())
+        << "Expected iterator to report IOError after Poll failure, got: "
+        << iter_status.ToString();
+
+    std::cout << "PollErrorRecoveryDuringIteration: Successfully verified "
+                 "Poll error was injected and propagated to iterator"
+              << std::endl;
+  } else {
+    // Async IO not supported - iterator should complete successfully
+    ASSERT_OK(iter_status);
+    ASSERT_EQ(keys_read, kNumKeys);
+    std::cout << "PollErrorRecoveryDuringIteration: Async IO (io_uring) not "
+                 "supported on this platform, verified data correctness"
+              << std::endl;
+  }
+
+  // Retry iteration without error injection - verify all data is still readable
+  // This confirms the Poll error didn't corrupt state
+  {
+    int retry_keys_read = 0;
+    int retry_data_mismatches = 0;
+    auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ro));
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      std::string key = iter->key().ToString();
+      std::string value = iter->value().ToString();
+
+      auto it = expected_data.find(key);
+      if (it == expected_data.end()) {
+        retry_data_mismatches++;
+      } else if (it->second != value) {
+        retry_data_mismatches++;
+      }
+      retry_keys_read++;
+    }
+    ASSERT_OK(iter->status())
+        << "Retry iteration failed: " << iter->status().ToString();
+    ASSERT_EQ(retry_keys_read, kNumKeys)
+        << "Retry should read all " << kNumKeys << " keys";
+    ASSERT_EQ(retry_data_mismatches, 0)
+        << "Retry found " << retry_data_mismatches << " data mismatches";
+    std::cout << "PollErrorRecoveryDuringIteration: Retry succeeded, read all "
+              << retry_keys_read << " keys correctly" << std::endl;
+  }
+
+  Close();
+}
+
 namespace {
 #ifdef GFLAGS
 const int kMaxArgCount = 100;
@@ -3376,6 +3557,76 @@ TEST_F(FilePrefetchBufferTest, ForCompaction) {
       0);
 }
 
+TEST_F(FilePrefetchBufferTest, PollErrorPropagation) {
+  // This test verifies that Poll() errors in PollIfNeeded are properly
+  // propagated rather than being silently ignored.
+
+  std::string fname = "poll-error-test";
+  Random rand(0);
+  std::string content = rand.RandomString(32768);
+  Write(fname, content);
+
+  FileOptions opts;
+  std::unique_ptr<RandomAccessFileReader> r;
+  Read(fname, opts, &r);
+
+  // Set up readahead params for async prefetching
+  ReadaheadParams readahead_params;
+  readahead_params.initial_readahead_size = 16384;
+  readahead_params.max_readahead_size = 16384;
+
+  FilePrefetchBuffer fpb(readahead_params, /*enable=*/true,
+                         /*track_min_offset=*/false, fs());
+
+  Slice result;
+  // Start an async prefetch to set up async_read_in_progress_ state
+  Status s = fpb.PrefetchAsync(IOOptions(), r.get(), 0, 4096, &result);
+
+  // Skip test on platforms that don't support async IO
+  if (s.IsNotSupported()) {
+    ROCKSDB_GTEST_SKIP("Async IO not supported on this platform");
+    return;
+  }
+  ASSERT_TRUE(s.IsTryAgain());
+  std::cout << "PollErrorPropagation: Async IO supported, proceeding with test"
+            << std::endl;
+
+  // Set up SyncPoint to inject Poll error
+  SyncPoint::GetInstance()->SetCallBack(
+      "FilePrefetchBuffer::PollIfNeeded:IOStatus", [&](void* arg) {
+        IOStatus* io_s = static_cast<IOStatus*>(arg);
+        *io_s = IOStatus::IOError("Injected Poll error for testing");
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  // TryReadFromCache will call PollIfNeeded to complete the async read
+  IOOptions io_opts;
+  io_opts.rate_limiter_priority = Env::IOPriority::IO_LOW;
+  Status read_status;
+  bool found =
+      fpb.TryReadFromCache(io_opts, r.get(), 0, 4096, &result, &read_status);
+
+  // When PollIfNeeded fails:
+  // 1. PrefetchInternal returns the error status
+  // 2. TryReadFromCacheUntracked sets *status to the error and returns false
+  // Therefore: found should be false, and read_status should contain the error
+  ASSERT_FALSE(found) << "Expected TryReadFromCache to return false on Poll "
+                         "error, but it returned true";
+  ASSERT_TRUE(read_status.IsIOError())
+      << "Expected IOError status, got: " << read_status.ToString();
+  ASSERT_TRUE(read_status.ToString().find("Injected Poll error") !=
+              std::string::npos)
+      << "Expected error message to contain 'Injected Poll error', got: "
+      << read_status.ToString();
+
+  std::cout << "PollErrorPropagation: Poll error correctly propagated - "
+            << "found=" << found << ", status=" << read_status.ToString()
+            << std::endl;
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
 class FSBufferPrefetchTest
     : public testing::Test,
       public ::testing::WithParamInterface<std::tuple<bool, bool>> {

From feffb67303b7d8f38fd91acb9a5b6f6f06c068c3 Mon Sep 17 00:00:00 2001
From: Ryan Hancock <krhancock@meta.com>
Date: Mon, 2 Feb 2026 13:18:02 -0800
Subject: [PATCH 453/500] Replace Prefetch Logic in BlockBasedTableIterator
 with IODispatcher. (#14255)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Summary:
This diff introduces the IODispatcher into the BlockBasedTableIterator. This replaces much of the prefetch logic with the logic found in IODispatcher.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14255

Test Plan:
I ran the following benchmark, %change is within noise tolerance. There shouldn't be any large performance improvement with this change, more so there should also not be any performance degradation.

MultiScan Benchmark: Current Branch vs Main

  Configuration:
  - Threads: 4
  - Ranges per scan: 10
  - Stride: 5000
  - Seek nexts: 100
  - Cache: Cold (dropped before each run)
  - Runs: 3

  Results:
  │ Mode  │ Main (ops/sec) │ Current (ops/sec)  │ Change │
  │ Sync   │ 8,901                 │ 9,032                      │ +1.5%  │
  │ Async │ 11,297                │ 11,947                     │ +5.8%  │

I further run db_stress test
```
make -j32 -f crash_test.mk J=32 blackbox_crash_test
```
Against my local machine for 60 minutes, on local flash, with async-io for multiscans always on.

Reviewed By: anand1976

Differential Revision: D91705195

Pulled By: krhancoc

fbshipit-source-id: acf2f944e8b715e99384c8cee79f8d241eadf5b8
---
 CMakeLists.txt                                |   1 +
 db/db_iter.cc                                 |   9 +
 db/db_iterator_test.cc                        | 386 ++++++++++---
 include/rocksdb/io_dispatcher.h               |  75 +++
 include/rocksdb/options.h                     |  12 +
 .../block_based/block_based_table_iterator.cc | 532 ++----------------
 .../block_based/block_based_table_iterator.h  | 175 +-----
 util/io_dispatcher_imp.cc                     | 124 +++-
 util/io_dispatcher_test.cc                    | 114 +---
 9 files changed, 612 insertions(+), 816 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b3fc440fe311..e9134aa01889 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -918,6 +918,7 @@ set(SOURCES
         util/data_structure.cc
         util/dynamic_bloom.cc
         util/hash.cc
+        util/io_dispatcher_imp.cc
         util/murmurhash.cc
         util/random.cc
         util/rate_limiter.cc
diff --git a/db/db_iter.cc b/db/db_iter.cc
index 52a357247f06..bd8f179655a6 100644
--- a/db/db_iter.cc
+++ b/db/db_iter.cc
@@ -23,6 +23,7 @@
 #include "memory/arena.h"
 #include "monitoring/perf_context_imp.h"
 #include "rocksdb/env.h"
+#include "rocksdb/io_dispatcher.h"
 #include "rocksdb/iterator.h"
 #include "rocksdb/merge_operator.h"
 #include "rocksdb/options.h"
@@ -1621,6 +1622,14 @@ void DBIter::Prepare(const MultiScanArgs& scan_opts) {
   new_scan_opts.emplace(scan_opts);
   scan_opts_.swap(new_scan_opts);
   scan_index_ = 0;
+
+  // Create a shared IODispatcher if not provided. This allows all
+  // BlockBasedTableIterators in this scan to share a single dispatcher,
+  // enabling better IO coordination and future rate limiting.
+  if (!scan_opts_.value().io_dispatcher) {
+    scan_opts_->io_dispatcher.reset(NewIODispatcher());
+  }
+
   if (!scan_opts.empty()) {
     iter_.Prepare(&scan_opts_.value());
   } else {
diff --git a/db/db_iterator_test.cc b/db/db_iterator_test.cc
index 48421e5b6dfb..645f0e7266c7 100644
--- a/db/db_iterator_test.cc
+++ b/db/db_iterator_test.cc
@@ -16,6 +16,7 @@
 #include "db/db_test_util.h"
 #include "port/port.h"
 #include "port/stack_trace.h"
+#include "rocksdb/io_dispatcher.h"
 #include "rocksdb/iostats_context.h"
 #include "rocksdb/perf_context.h"
 #include "table/block_based/flush_block_policy_impl.h"
@@ -5028,150 +5029,373 @@ TEST_P(DBMultiScanIteratorTest, AsyncPrefetchWithExternalFileIngestion) {
   iter.reset();
 }
 
-TEST_P(DBMultiScanIteratorTest, StatisticsTest) {
-  // Test that multi scan statistics are properly recorded
+TEST_P(DBMultiScanIteratorTest, IODispatcherStatsVerification) {
+  // Test that verifies all IOs go through the IODispatcher by checking stats
   auto options = CurrentOptions();
-  options.statistics = CreateDBStatistics();
-  // Use small block size to ensure multiple blocks
-  BlockBasedTableOptions table_options;
-  table_options.block_size = 256;
-  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  options.target_file_size_base = 1 << 15;  // 32KiB
+  options.compaction_style = kCompactionStyleUniversal;
+  options.num_levels = 50;
+  options.compression = kNoCompression;
   DestroyAndReopen(options);
 
-  // Create data across multiple blocks
-  for (int i = 0; i < 100; ++i) {
+  Random rnd(307);
+
+  // Create data - enough to create multiple data blocks
+  for (int i = 0; i < 500; ++i) {
     std::stringstream ss;
-    ss << std::setw(3) << std::setfill('0') << i;
-    // Use larger values to ensure multiple blocks
-    ASSERT_OK(Put("k" + ss.str(), std::string(100, 'v')));
+    ss << "k" << std::setw(5) << std::setfill('0') << i;
+    ASSERT_OK(Put(ss.str(), rnd.RandomString(1 << 10)));  // 1KiB values
   }
   ASSERT_OK(Flush());
+  ASSERT_OK(db_->CompactRange({}, nullptr, nullptr));
 
-  // Reset stats before multi scan
-  ASSERT_OK(options.statistics->Reset());
-
-  // Set up two scan ranges
-  std::vector<std::string> key_ranges({"k010", "k030", "k060", "k080"});
+  // Set up scan ranges
+  std::vector<std::string> key_ranges({"k00000", "k00200", "k00300", "k00400"});
   ReadOptions ro;
   ro.fill_cache = GetParam();
+
+  ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily();
+
+  // Create a tracking IODispatcher to verify IO statistics
+  auto tracking_dispatcher = std::make_shared<TrackingIODispatcher>();
+
   MultiScanArgs scan_options(BytewiseComparator());
+  scan_options.use_async_io = false;  // Use sync IO for predictable stats
+  scan_options.io_dispatcher = tracking_dispatcher;
   scan_options.insert(key_ranges[0], key_ranges[1]);
   scan_options.insert(key_ranges[2], key_ranges[3]);
 
-  ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily();
   std::unique_ptr<MultiScan> iter =
       dbfull()->NewMultiScan(ro, cfh, scan_options);
+  ASSERT_NE(iter, nullptr);
 
-  // Iterate through all ranges
-  int count = 0;
+  // Scan through all data
+  int total_keys = 0;
   try {
     for (auto range : *iter) {
       for (auto it : range) {
-        (void)it;
-        count++;
+        it.first.ToString();
+        total_keys++;
       }
     }
   } catch (MultiScanException& ex) {
     ASSERT_NOK(ex.status());
     std::cerr << "Iterator returned status " << ex.what();
     abort();
+  } catch (std::logic_error& ex) {
+    std::cerr << "Iterator returned logic error " << ex.what();
+    abort();
   }
-  ASSERT_EQ(count, 40);  // 20 keys per range
-  iter.reset();
 
-  // Check statistics
-  // MULTISCAN_PREPARE_CALLS should be at least 1
-  ASSERT_GE(TestGetTickerCount(options, MULTISCAN_PREPARE_CALLS), 1);
+  // We scanned ~200 keys in range 1 and ~100 keys in range 2
+  ASSERT_EQ(total_keys, 300);
 
-  // MULTISCAN_PREPARE_ERRORS should be 0
-  ASSERT_EQ(TestGetTickerCount(options, MULTISCAN_PREPARE_ERRORS), 0);
+  // Verify that IO operations went through the IODispatcher
+  // The total IO operations should be > 0 (either sync reads, async reads, or
+  // cache hits)
+  uint64_t total_ops = tracking_dispatcher->GetTotalIOOperations();
+  ASSERT_GT(total_ops, 0) << "Expected some IO operations through IODispatcher";
 
-  // MULTISCAN_SEEK_ERRORS should be 0
-  ASSERT_EQ(TestGetTickerCount(options, MULTISCAN_SEEK_ERRORS), 0);
+  // Verify that we have at least one ReadSet created
+  ASSERT_GT(tracking_dispatcher->GetReadSets().size(), 0)
+      << "Expected at least one ReadSet to be created";
 
-  // Blocks should be prefetched or from cache
-  uint64_t blocks_prefetched =
-      TestGetTickerCount(options, MULTISCAN_BLOCKS_PREFETCHED);
-  uint64_t blocks_from_cache =
-      TestGetTickerCount(options, MULTISCAN_BLOCKS_FROM_CACHE);
-  ASSERT_GT(blocks_prefetched + blocks_from_cache, 0);
+  // Since we used sync IO, we should have sync reads (or cache hits if cached)
+  uint64_t sync_reads = tracking_dispatcher->GetTotalSyncReads();
+  uint64_t cache_hits = tracking_dispatcher->GetTotalCacheHits();
+  ASSERT_GT(sync_reads + cache_hits, 0)
+      << "Expected sync reads or cache hits for sync IO mode";
 
-  // If blocks were prefetched, prefetch bytes and IO requests should be > 0
-  if (blocks_prefetched > 0) {
-    ASSERT_GT(TestGetTickerCount(options, MULTISCAN_PREFETCH_BYTES), 0);
-    uint64_t io_requests = TestGetTickerCount(options, MULTISCAN_IO_REQUESTS);
-    ASSERT_GT(io_requests, 0);
-    ASSERT_LE(io_requests, blocks_prefetched);
-  }
-
-  // Wasted blocks should be 0 since we iterated through everything
-  ASSERT_EQ(TestGetTickerCount(options, MULTISCAN_PREFETCH_BLOCKS_WASTED), 0);
+  iter.reset();
 }
 
-TEST_P(DBMultiScanIteratorTest, StatisticsWastedBlocksTest) {
-  // Test that wasted blocks are tracked when iteration is abandoned early
+TEST_P(DBMultiScanIteratorTest, IODispatcherPrefetchKnownBlocks) {
+  // Test that verifies we prefetch a known/expected number of blocks.
+  // Uses FlushBlockEveryKeyPolicyFactory to create exactly one block per key,
+  // making the block count predictable and verifiable.
   auto options = CurrentOptions();
-  options.statistics = CreateDBStatistics();
-  // Use small block size to ensure multiple blocks
+  options.compression = kNoCompression;
+  options.disable_auto_compactions = true;
+
+  // Configure to create exactly one block per key
   BlockBasedTableOptions table_options;
-  table_options.block_size = 256;
+  table_options.flush_block_policy_factory =
+      std::make_shared<FlushBlockEveryKeyPolicyFactory>();
+  // Use a block cache (required by IODispatcher), but use a fresh one
+  // that won't have any cached data
+  table_options.block_cache = NewLRUCache(10 * 1024 * 1024);  // 10MB cache
   options.table_factory.reset(NewBlockBasedTableFactory(table_options));
   DestroyAndReopen(options);
 
-  // Create data across multiple blocks
-  for (int i = 0; i < 100; ++i) {
+  // Create exactly 100 keys, each in its own block
+  const int kNumKeys = 100;
+  const int kValueSize = 100;  // Fixed value size for predictability
+  std::string value(kValueSize, 'v');
+
+  for (int i = 0; i < kNumKeys; ++i) {
     std::stringstream ss;
-    ss << std::setw(3) << std::setfill('0') << i;
-    ASSERT_OK(Put("k" + ss.str(), std::string(100, 'v')));
+    ss << "k" << std::setw(3) << std::setfill('0') << i;
+    ASSERT_OK(Put(ss.str(), value));
   }
   ASSERT_OK(Flush());
 
-  // Reset stats before multi scan
-  ASSERT_OK(options.statistics->Reset());
+  ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily();
+
+  // Create a tracking IODispatcher to verify IO statistics
+  auto tracking_dispatcher = std::make_shared<TrackingIODispatcher>();
 
-  // Set up a large scan range
-  ReadOptions ro;
-  ro.fill_cache = GetParam();
+  // Define scan ranges with known block counts:
+  // Range 1: k000 to k020 (20 keys = 20 blocks)
+  // Range 2: k050 to k060 (10 keys = 10 blocks)
+  // Total expected blocks to read: 30
   MultiScanArgs scan_options(BytewiseComparator());
-  scan_options.insert("k000", "k099");
+  scan_options.use_async_io = false;  // Use sync IO for predictable stats
+  scan_options.io_dispatcher = tracking_dispatcher;
+  scan_options.insert("k000", "k020");
+  scan_options.insert("k050", "k060");
+
+  ReadOptions ro;
+  ro.fill_cache = false;  // Don't fill cache, ensure fresh reads
 
-  ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily();
   std::unique_ptr<MultiScan> iter =
       dbfull()->NewMultiScan(ro, cfh, scan_options);
+  ASSERT_NE(iter, nullptr);
 
-  // Only iterate through a few keys, then abandon
-  int count = 0;
+  // Scan through all data and count keys
+  int total_keys = 0;
   try {
     for (auto range : *iter) {
       for (auto it : range) {
-        (void)it;
-        count++;
-        if (count >= 5) {
-          break;  // Abandon iteration early
-        }
-      }
-      if (count >= 5) {
-        break;
+        it.first.ToString();
+        total_keys++;
       }
     }
   } catch (MultiScanException& ex) {
     ASSERT_NOK(ex.status());
     std::cerr << "Iterator returned status " << ex.what();
     abort();
+  } catch (std::logic_error& ex) {
+    std::cerr << "Iterator returned logic error " << ex.what();
+    abort();
   }
-  ASSERT_EQ(count, 5);
 
-  // Destroy iterator to trigger wasted blocks counting
+  // Verify we scanned the expected number of keys
+  // Range 1: k000-k019 = 20 keys, Range 2: k050-k059 = 10 keys
+  ASSERT_EQ(total_keys, 30) << "Expected 30 keys from two ranges";
+
+  // Verify IODispatcher statistics
+  uint64_t total_ops = tracking_dispatcher->GetTotalIOOperations();
+  uint64_t sync_reads = tracking_dispatcher->GetTotalSyncReads();
+
+  // We should have at least as many IO operations as blocks we need to read
+  // (could be more due to index/filter blocks)
+  ASSERT_GE(total_ops, 30)
+      << "Expected at least 30 IO operations for 30 data blocks";
+
+  // Since cache is fresh and fill_cache=false, all should be sync reads
+  ASSERT_GE(sync_reads, 30)
+      << "Expected at least 30 sync reads for 30 data blocks";
+
+  // Verify we created ReadSets (one per range)
+  size_t num_readsets = tracking_dispatcher->GetReadSets().size();
+  ASSERT_GE(num_readsets, 1) << "Expected at least one ReadSet";
+
+  // Log the stats for debugging
+  std::cout << "IODispatcher Stats: total_ops=" << total_ops
+            << ", sync_reads=" << sync_reads
+            << ", async_reads=" << tracking_dispatcher->GetTotalAsyncReads()
+            << ", cache_hits=" << tracking_dispatcher->GetTotalCacheHits()
+            << ", readsets=" << num_readsets << std::endl;
+
   iter.reset();
+}
+
+TEST_P(DBMultiScanIteratorTest, IODispatcherCacheHitVerification) {
+  // Test that verifies cache hits are properly tracked through IODispatcher.
+  // First scan populates cache, second scan should show cache hits.
+  auto options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.disable_auto_compactions = true;
+
+  BlockBasedTableOptions table_options;
+  table_options.flush_block_policy_factory =
+      std::make_shared<FlushBlockEveryKeyPolicyFactory>();
+  // Enable block cache with enough space for all blocks
+  table_options.block_cache = NewLRUCache(10 * 1024 * 1024);  // 10MB cache
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  DestroyAndReopen(options);
+
+  // Create 50 keys, each in its own block
+  const int kNumKeys = 50;
+  std::string value(100, 'v');
+
+  for (int i = 0; i < kNumKeys; ++i) {
+    std::stringstream ss;
+    ss << "k" << std::setw(3) << std::setfill('0') << i;
+    ASSERT_OK(Put(ss.str(), value));
+  }
+  ASSERT_OK(Flush());
+
+  ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily();
+
+  // First scan: populate the cache
+  {
+    auto dispatcher1 = std::make_shared<TrackingIODispatcher>();
+    MultiScanArgs scan_options(BytewiseComparator());
+    scan_options.use_async_io = false;
+    scan_options.io_dispatcher = dispatcher1;
+    scan_options.insert("k000", "k025");  // 25 keys
+
+    ReadOptions ro;
+    ro.fill_cache = true;  // Fill cache on first scan
+
+    std::unique_ptr<MultiScan> iter =
+        dbfull()->NewMultiScan(ro, cfh, scan_options);
+    ASSERT_NE(iter, nullptr);
+
+    int count = 0;
+    try {
+      for (auto range : *iter) {
+        for (auto it : range) {
+          it.first.ToString();
+          count++;
+        }
+      }
+    } catch (MultiScanException& ex) {
+      FAIL() << "First scan failed: " << ex.what();
+    }
+    ASSERT_EQ(count, 25);
+
+    // First scan should have sync reads (cache was empty)
+    uint64_t first_sync = dispatcher1->GetTotalSyncReads();
+    ASSERT_GE(first_sync, 25) << "First scan should have sync reads";
+
+    std::cout << "First scan stats: sync_reads=" << first_sync
+              << ", cache_hits=" << dispatcher1->GetTotalCacheHits()
+              << std::endl;
+  }
+
+  // Second scan: should get cache hits
+  {
+    auto dispatcher2 = std::make_shared<TrackingIODispatcher>();
+    MultiScanArgs scan_options(BytewiseComparator());
+    scan_options.use_async_io = false;
+    scan_options.io_dispatcher = dispatcher2;
+    scan_options.insert("k000", "k025");  // Same range as before
+
+    ReadOptions ro;
+    ro.fill_cache = true;
+
+    std::unique_ptr<MultiScan> iter =
+        dbfull()->NewMultiScan(ro, cfh, scan_options);
+    ASSERT_NE(iter, nullptr);
+
+    int count = 0;
+    try {
+      for (auto range : *iter) {
+        for (auto it : range) {
+          it.first.ToString();
+          count++;
+        }
+      }
+    } catch (MultiScanException& ex) {
+      FAIL() << "Second scan failed: " << ex.what();
+    }
+    ASSERT_EQ(count, 25);
+
+    // Second scan should have cache hits (blocks were cached in first scan)
+    uint64_t second_cache_hits = dispatcher2->GetTotalCacheHits();
+    uint64_t second_sync = dispatcher2->GetTotalSyncReads();
+
+    std::cout << "Second scan stats: sync_reads=" << second_sync
+              << ", cache_hits=" << second_cache_hits << std::endl;
+
+    // We expect cache hits on the second scan for data blocks
+    // Note: Some blocks might still need sync reads (e.g., if cache was
+    // evicted)
+    ASSERT_GE(second_cache_hits, 20)
+        << "Second scan should have cache hits for most blocks";
+  }
+}
 
-  uint64_t blocks_prefetched =
-      TestGetTickerCount(options, MULTISCAN_BLOCKS_PREFETCHED);
+TEST_P(DBMultiScanIteratorTest, WastedBlocksTracking) {
+  // Test that verifies wasted prefetch blocks are properly tracked.
+  // When blocks are prefetched but skipped (e.g., due to seek), they should
+  // be counted as wasted and recorded to MULTISCAN_PREFETCH_BLOCKS_WASTED.
+  auto options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.disable_auto_compactions = true;
+  options.statistics = CreateDBStatistics();
 
-  // If blocks were prefetched, some should be wasted since we abandoned early
-  if (blocks_prefetched > 1) {
-    // We only read a few keys, so there should be wasted blocks
-    ASSERT_GT(TestGetTickerCount(options, MULTISCAN_PREFETCH_BLOCKS_WASTED), 0);
+  BlockBasedTableOptions table_options;
+  table_options.flush_block_policy_factory =
+      std::make_shared<FlushBlockEveryKeyPolicyFactory>();
+  table_options.block_cache = NewLRUCache(10 * 1024 * 1024);  // 10MB cache
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  DestroyAndReopen(options);
+
+  // Create 100 keys, each in its own block
+  const int kNumKeys = 100;
+  std::string value(100, 'v');
+
+  for (int i = 0; i < kNumKeys; ++i) {
+    std::stringstream ss;
+    ss << "k" << std::setw(3) << std::setfill('0') << i;
+    ASSERT_OK(Put(ss.str(), value));
   }
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->CompactRange({}, nullptr, nullptr));
+
+  ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily();
+
+  // Reset the wasted blocks counter before test
+  options.statistics->setTickerCount(MULTISCAN_PREFETCH_BLOCKS_WASTED, 0);
+
+  // Set up MultiScan with two non-contiguous ranges:
+  // Range 1: k000-k020 (20 keys/blocks)
+  // Range 2: k050-k070 (20 keys/blocks)
+  // The blocks between k020-k050 (30 blocks) should be wasted if prefetched
+  MultiScanArgs scan_options(BytewiseComparator());
+  scan_options.use_async_io = false;
+  scan_options.insert("k000", "k020");
+  scan_options.insert("k050", "k070");
+
+  ReadOptions ro;
+  ro.fill_cache = GetParam();
+
+  {
+    std::unique_ptr<MultiScan> iter =
+        dbfull()->NewMultiScan(ro, cfh, scan_options);
+    ASSERT_NE(iter, nullptr);
+
+    int count = 0;
+    try {
+      for (auto range : *iter) {
+        for (auto it : range) {
+          it.first.ToString();
+          count++;
+        }
+      }
+    } catch (MultiScanException& ex) {
+      FAIL() << "Scan failed: " << ex.what();
+    }
+
+    // We should have scanned 40 keys total (20 + 20)
+    ASSERT_EQ(count, 40);
+  }  // Iterator destroyed here, wasted blocks recorded
+
+  // Check that wasted blocks were recorded
+  // The exact count depends on how many blocks were prefetched between ranges
+  uint64_t wasted =
+      options.statistics->getTickerCount(MULTISCAN_PREFETCH_BLOCKS_WASTED);
+
+  // We expect some wasted blocks due to the gap between ranges
+  // The exact number depends on prefetch behavior, but should be > 0
+  // if blocks between k020-k050 were prefetched
+  std::cout << "Wasted blocks: " << wasted << std::endl;
+
+  // Note: The test verifies the tracking mechanism works.
+  // The actual count depends on prefetch heuristics which may vary.
 }
 }  // namespace ROCKSDB_NAMESPACE
 
diff --git a/include/rocksdb/io_dispatcher.h b/include/rocksdb/io_dispatcher.h
index 520be86abf31..9c3fefd640b8 100644
--- a/include/rocksdb/io_dispatcher.h
+++ b/include/rocksdb/io_dispatcher.h
@@ -15,6 +15,9 @@
 #include "rocksdb/status.h"
 
 namespace ROCKSDB_NAMESPACE {
+
+class FileSystem;
+
 /*
  * IODispatcher is a class that allows users to submit groups of IO jobs to be
  * dispatched asynchronously (or synchronously), upon submission the
@@ -140,6 +143,16 @@ class ReadSet {
   // out: Output parameter for the pinned block entry
   Status ReadOffset(size_t offset, CachableEntry<Block>* out);
 
+  // Release a block by index, unpinning it from cache.
+  // After this call, ReadIndex() for this block will return an error.
+  // This is useful for eager memory reclamation when blocks are no longer
+  // needed.
+  void ReleaseBlock(size_t block_index);
+
+  // Check if a block at the given index is still available (not released).
+  // Returns true if the block can be read, false otherwise.
+  bool IsBlockAvailable(size_t block_index) const;
+
   // Statistics accessors
   uint64_t GetNumSyncReads() const { return num_sync_reads_; }
   uint64_t GetNumAsyncReads() const { return num_async_reads_; }
@@ -151,6 +164,9 @@ class ReadSet {
   // Job data
   std::shared_ptr<IOJob> job_;
 
+  // FileSystem for calling AbortIO in destructor
+  std::shared_ptr<FileSystem> fs_;
+
   // Storage for pinned blocks (one per block handle in the job)
   std::vector<CachableEntry<Block>> pinned_blocks_;
 
@@ -204,4 +220,63 @@ class IODispatcher {
 
 IODispatcher* NewIODispatcher();
 
+// TrackingIODispatcher wraps another IODispatcher and tracks all ReadSets
+// created. This is useful for testing to verify IO statistics.
+class TrackingIODispatcher : public IODispatcher {
+ public:
+  TrackingIODispatcher() : impl_(NewIODispatcher()) {}
+  explicit TrackingIODispatcher(IODispatcher* impl) : impl_(impl) {}
+
+  Status SubmitJob(const std::shared_ptr<IOJob>& job,
+                   std::shared_ptr<ReadSet>* read_set) override {
+    Status s = impl_->SubmitJob(job, read_set);
+    if (s.ok() && read_set && *read_set) {
+      read_sets_.push_back(*read_set);
+    }
+    return s;
+  }
+
+  // Get all ReadSets created by this dispatcher
+  const std::vector<std::shared_ptr<ReadSet>>& GetReadSets() const {
+    return read_sets_;
+  }
+
+  // Get aggregated statistics from all ReadSets
+  uint64_t GetTotalSyncReads() const {
+    uint64_t total = 0;
+    for (const auto& rs : read_sets_) {
+      total += rs->GetNumSyncReads();
+    }
+    return total;
+  }
+
+  uint64_t GetTotalAsyncReads() const {
+    uint64_t total = 0;
+    for (const auto& rs : read_sets_) {
+      total += rs->GetNumAsyncReads();
+    }
+    return total;
+  }
+
+  uint64_t GetTotalCacheHits() const {
+    uint64_t total = 0;
+    for (const auto& rs : read_sets_) {
+      total += rs->GetNumCacheHits();
+    }
+    return total;
+  }
+
+  // Get total IO operations (sum of all types)
+  uint64_t GetTotalIOOperations() const {
+    return GetTotalSyncReads() + GetTotalAsyncReads() + GetTotalCacheHits();
+  }
+
+  // Clear tracked ReadSets
+  void ClearReadSets() { read_sets_.clear(); }
+
+ private:
+  std::unique_ptr<IODispatcher> impl_;
+  std::vector<std::shared_ptr<ReadSet>> read_sets_;
+};
+
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index d8acfe8f7175..fdbc5f530424 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -58,6 +58,7 @@ class InternalKeyComparator;
 class WalFilter;
 class FileSystem;
 class UserDefinedIndexFactory;
+class IODispatcher;
 
 struct Options;
 struct DbPath;
@@ -1847,11 +1848,13 @@ class MultiScanArgs {
     io_coalesce_threshold = other.io_coalesce_threshold;
     max_prefetch_size = other.max_prefetch_size;
     use_async_io = other.use_async_io;
+    io_dispatcher = other.io_dispatcher;
   }
   MultiScanArgs(MultiScanArgs&& other) noexcept
       : io_coalesce_threshold(other.io_coalesce_threshold),
         max_prefetch_size(other.max_prefetch_size),
         use_async_io(other.use_async_io),
+        io_dispatcher(std::move(other.io_dispatcher)),
         comp_(other.comp_),
         original_ranges_(std::move(other.original_ranges_)) {}
 
@@ -1861,6 +1864,7 @@ class MultiScanArgs {
     io_coalesce_threshold = other.io_coalesce_threshold;
     max_prefetch_size = other.max_prefetch_size;
     use_async_io = other.use_async_io;
+    io_dispatcher = other.io_dispatcher;
     return *this;
   }
 
@@ -1871,6 +1875,7 @@ class MultiScanArgs {
       io_coalesce_threshold = other.io_coalesce_threshold;
       max_prefetch_size = other.max_prefetch_size;
       use_async_io = other.use_async_io;
+      io_dispatcher = std::move(other.io_dispatcher);
     }
     return *this;
   }
@@ -1918,6 +1923,7 @@ class MultiScanArgs {
     io_coalesce_threshold = other.io_coalesce_threshold;
     max_prefetch_size = other.max_prefetch_size;
     use_async_io = other.use_async_io;
+    io_dispatcher = other.io_dispatcher;
   }
 
   uint64_t io_coalesce_threshold = 16 << 10;  // 16KB by default
@@ -1939,6 +1945,12 @@ class MultiScanArgs {
   // When false, it will use synchronous MultiRead().
   bool use_async_io = false;
 
+  // Optional IODispatcher for multi-scan operations.
+  // If nullptr (default), a new IODispatcher is created internally.
+  // Users can provide their own IODispatcher for custom IO scheduling
+  // or for testing/monitoring purposes (e.g., to check IO statistics).
+  std::shared_ptr<IODispatcher> io_dispatcher = nullptr;
+
  private:
   // The comparator used for ordering ranges
   const Comparator* comp_;
diff --git a/table/block_based/block_based_table_iterator.cc b/table/block_based/block_based_table_iterator.cc
index 00c1d2102fc3..e0e51469f6f3 100644
--- a/table/block_based/block_based_table_iterator.cc
+++ b/table/block_based/block_based_table_iterator.cc
@@ -919,57 +919,6 @@ void BlockBasedTableIterator::BlockCacheLookupForReadAheadSize(
   ResetPreviousBlockOffset();
 }
 
-BlockBasedTableIterator::MultiScanState::~MultiScanState() {
-  // Count remaining non-empty blocks as wasted (iterator abandoned before
-  // accessing them). Start from cur_data_block_idx since blocks before that
-  // have already been processed and counted if skipped.
-  for (size_t i = cur_data_block_idx; i < pinned_data_blocks.size(); ++i) {
-    if (!pinned_data_blocks[i].IsEmpty()) {
-      ++wasted_blocks_count;
-    }
-  }
-
-  // Record wasted blocks stat
-  if (wasted_blocks_count > 0 && statistics != nullptr) {
-    RecordTick(statistics, MULTISCAN_PREFETCH_BLOCKS_WASTED,
-               wasted_blocks_count);
-  }
-
-  // Abort any pending async IO operations to prevent callback being called
-  // after async read states are destructed.
-  if (!async_states.empty()) {
-    std::vector<void*> io_handles_to_abort;
-    std::vector<AsyncReadState*> states_to_cleanup;
-
-    // Collect all pending IO handles
-    for (size_t i = 0; i < async_states.size(); ++i) {
-      auto& async_read = async_states[i];
-
-      if (async_read.io_handle != nullptr) {
-        assert(!async_read.finished);
-        io_handles_to_abort.push_back(async_read.io_handle);
-        states_to_cleanup.push_back(&async_read);
-      }
-    }
-
-    if (!io_handles_to_abort.empty()) {
-      IOStatus abort_status = fs->AbortIO(io_handles_to_abort);
-      if (!abort_status.ok()) {
-#ifndef NDEBUG
-        fprintf(stderr, "Error aborting async IO operations: %s\n",
-                abort_status.ToString().c_str());
-#endif
-        assert(false);
-      }
-      (void)abort_status;  // Suppress unused variable warning
-    }
-
-    for (auto async_read : states_to_cleanup) {
-      async_read->CleanUpIOHandle();
-    }
-  }
-}
-
 // Note:
 // - Iterator should not be reused for multiple multiscans or mixing
 // multiscan with regular iterator usage.
@@ -1023,69 +972,57 @@ void BlockBasedTableIterator::Prepare(const MultiScanArgs* multiscan_opts) {
     return;
   }
 
-  // Pin already cached blocks, collect remaining blocks to read
-  std::vector<size_t> block_indices_to_read;
-  std::vector<CachableEntry<Block>> pinned_data_blocks_guard(
-      scan_block_handles.size());
-  size_t prefetched_max_idx;
-  multi_scan_status_ = FilterAndPinCachedBlocks(
-      scan_block_handles, multiscan_opts, &block_indices_to_read,
-      &pinned_data_blocks_guard, &prefetched_max_idx);
+  // Calculate prefetch_max_idx (enforces max_prefetch_size)
+  size_t prefetch_max_idx = scan_block_handles.size();
+  if (multiscan_opts->max_prefetch_size > 0) {
+    uint64_t total_size = 0;
+    for (size_t i = 0; i < scan_block_handles.size(); ++i) {
+      total_size +=
+          BlockBasedTable::BlockSizeWithTrailer(scan_block_handles[i]);
+      if (total_size > multiscan_opts->max_prefetch_size) {
+        prefetch_max_idx = i;
+        break;
+      }
+    }
+  }
+
+  // Create block handles vector for IODispatcher (limited to prefetch_max_idx)
+  std::vector<BlockHandle> blocks_to_prefetch;
+  if (prefetch_max_idx > 0) {
+    blocks_to_prefetch.assign(scan_block_handles.begin(),
+                              scan_block_handles.begin() + prefetch_max_idx);
+  }
+
+  // Submit to IODispatcher
+  auto job = std::make_shared<IOJob>();
+  job->table = const_cast<BlockBasedTable*>(table_);
+  job->block_handles = std::move(blocks_to_prefetch);
+  job->job_options.io_coalesce_threshold =
+      multiscan_opts->io_coalesce_threshold;
+  job->job_options.read_options = read_options_;
+  job->job_options.read_options.async_io = multiscan_opts->use_async_io;
+
+  std::shared_ptr<ReadSet> read_set;
+  // IODispatcher should be provided by DBIter::Prepare() to enable sharing
+  // across all BlockBasedTableIterators in the scan. Create one if not
+  // provided (for direct calls to Prepare, e.g., in unit tests).
+  std::shared_ptr<IODispatcher> dispatcher = multiscan_opts->io_dispatcher;
+  if (!dispatcher) {
+    dispatcher.reset(NewIODispatcher());
+  }
+  multi_scan_status_ = dispatcher->SubmitJob(job, &read_set);
   if (!multi_scan_status_.ok()) {
     RecordTick(table_->GetStatistics(), MULTISCAN_PREPARE_ERRORS);
     return;
   }
 
-  // Record cache hit/miss stats
-  size_t blocks_from_cache =
-      scan_block_handles.size() - block_indices_to_read.size();
-  RecordTick(table_->GetStatistics(), MULTISCAN_BLOCKS_FROM_CACHE,
-             blocks_from_cache);
-  RecordTick(table_->GetStatistics(), MULTISCAN_BLOCKS_PREFETCHED,
-             block_indices_to_read.size());
-
-  std::vector<AsyncReadState> async_states;
-  // Maps from block index into async read request (index into async_states[])
-  UnorderedMap<size_t, size_t> block_idx_to_readreq_idx;
-  if (!block_indices_to_read.empty()) {
-    std::vector<FSReadRequest> read_reqs;
-    std::vector<std::vector<size_t>> coalesced_block_indices;
-    size_t nonadjacent_coalesced = 0;
-    uint64_t total_prefetch_bytes = 0;
-    PrepareIORequests(block_indices_to_read, scan_block_handles, multiscan_opts,
-                      &read_reqs, &block_idx_to_readreq_idx,
-                      &coalesced_block_indices, &nonadjacent_coalesced,
-                      &total_prefetch_bytes);
-
-    // Record I/O stats
-    RecordTick(table_->GetStatistics(), MULTISCAN_IO_REQUESTS,
-               read_reqs.size());
-    RecordTick(table_->GetStatistics(), MULTISCAN_PREFETCH_BYTES,
-               total_prefetch_bytes);
-    RecordTick(table_->GetStatistics(), MULTISCAN_IO_COALESCED_NONADJACENT,
-               nonadjacent_coalesced);
-
-    multi_scan_status_ =
-        ExecuteIO(scan_block_handles, multiscan_opts, coalesced_block_indices,
-                  &read_reqs, &async_states, &pinned_data_blocks_guard);
-    if (!multi_scan_status_.ok()) {
-      RecordTick(table_->GetStatistics(), MULTISCAN_PREPARE_ERRORS);
-      return;
-    }
-  }
-
   // Successful Prepare, init related states so the iterator reads from prepared
-  // blocks.
+  // blocks. Note: data_block_separators keeps full size for seek logic.
   multi_scan_ = std::make_unique<MultiScanState>(
       table_->get_rep()->ioptions.env->GetFileSystem(), multiscan_opts,
-      std::move(pinned_data_blocks_guard), std::move(data_block_separators),
-      std::move(block_index_ranges_per_scan),
-      std::move(block_idx_to_readreq_idx), std::move(async_states),
-      prefetched_max_idx, table_->GetStatistics());
-
-  // Record histogram for blocks per prepare
-  RecordInHistogram(table_->GetStatistics(), MULTISCAN_BLOCKS_PER_PREPARE,
-                    scan_block_handles.size());
+      std::move(read_set), std::move(data_block_separators),
+      std::move(block_index_ranges_per_scan), prefetch_max_idx,
+      table_->GetStatistics());
 
   is_index_at_curr_block_ = false;
   block_iter_points_to_real_block_ = false;
@@ -1292,13 +1229,14 @@ void BlockBasedTableIterator::MultiScanUnexpectedSeekTarget(
   }
 
   // Unpin all the blocks from multi_scan_->cur_data_block_idx to
-  // cur_scan_start_idx
+  // cur_scan_start_idx - these are wasted (prefetched but skipped)
   for (auto unpin_block_idx = multi_scan_->cur_data_block_idx;
        unpin_block_idx < cur_scan_start_idx; unpin_block_idx++) {
-    if (!multi_scan_->pinned_data_blocks[unpin_block_idx].IsEmpty()) {
-      multi_scan_->pinned_data_blocks[unpin_block_idx].Reset();
-      ++multi_scan_->wasted_blocks_count;
+    // Count as wasted if it was prefetched
+    if (unpin_block_idx < multi_scan_->prefetch_max_idx) {
+      multi_scan_->wasted_blocks_count++;
     }
+    multi_scan_->read_set->ReleaseBlock(unpin_block_idx);
   }
 
   // Take the max here to ensure we don't move backwards.
@@ -1310,11 +1248,11 @@ void BlockBasedTableIterator::MultiScanUnexpectedSeekTarget(
               *user_seek_target, /*a_has_ts=*/true,
               data_block_separators[block_idx],
               /*b_has_ts=*/false) > 0)) {
-    // Unpin the blocks that are passed
-    if (!multi_scan_->pinned_data_blocks[block_idx].IsEmpty()) {
-      multi_scan_->pinned_data_blocks[block_idx].Reset();
-      ++multi_scan_->wasted_blocks_count;
+    // Unpin the blocks that are passed - count as wasted if prefetched
+    if (block_idx < multi_scan_->prefetch_max_idx) {
+      multi_scan_->wasted_blocks_count++;
     }
+    multi_scan_->read_set->ReleaseBlock(block_idx);
     block_idx++;
   }
 
@@ -1348,14 +1286,13 @@ void BlockBasedTableIterator::MultiScanSeekTargetFromBlock(
   }
 
   // Move current data block index forward until block_idx, meantime, unpin all
-  // the blocks in between
+  // the blocks in between - these are wasted (prefetched but skipped)
   while (multi_scan_->cur_data_block_idx < block_idx) {
-    // unpin block
-    if (!multi_scan_->pinned_data_blocks[multi_scan_->cur_data_block_idx]
-             .IsEmpty()) {
-      multi_scan_->pinned_data_blocks[multi_scan_->cur_data_block_idx].Reset();
-      ++multi_scan_->wasted_blocks_count;
+    // Count as wasted if it was prefetched
+    if (multi_scan_->cur_data_block_idx < multi_scan_->prefetch_max_idx) {
+      multi_scan_->wasted_blocks_count++;
     }
+    multi_scan_->read_set->ReleaseBlock(multi_scan_->cur_data_block_idx);
     multi_scan_->cur_data_block_idx++;
   }
   block_iter_points_to_real_block_ = true;
@@ -1385,11 +1322,8 @@ void BlockBasedTableIterator::FindBlockForwardInMultiScan() {
     }
     // Move to the next pinned data block
     ResetDataIter();
-    // Unpin previous block if it is not reset by data iterator
-    if (!multi_scan_->pinned_data_blocks[multi_scan_->cur_data_block_idx]
-             .IsEmpty()) {
-      multi_scan_->pinned_data_blocks[multi_scan_->cur_data_block_idx].Reset();
-    }
+    // Unpin previous block via ReadSet
+    multi_scan_->read_set->ReleaseBlock(multi_scan_->cur_data_block_idx);
     ++multi_scan_->cur_data_block_idx;
 
     if (MultiScanLoadDataBlock(multi_scan_->cur_data_block_idx)) {
@@ -1401,106 +1335,6 @@ void BlockBasedTableIterator::FindBlockForwardInMultiScan() {
   } while (!block_iter_.Valid());
 }
 
-Status BlockBasedTableIterator::PollForBlock(size_t idx) {
-  assert(multi_scan_);
-  const auto async_idx = multi_scan_->block_idx_to_readreq_idx.find(idx);
-  if (async_idx == multi_scan_->block_idx_to_readreq_idx.end()) {
-    // Did not require async read, should already be pinned.
-    assert(multi_scan_->pinned_data_blocks[idx].GetValue());
-    return Status::OK();
-  }
-
-  AsyncReadState& async_read = multi_scan_->async_states[async_idx->second];
-  if (async_read.finished) {
-    assert(async_read.io_handle == nullptr);
-    return async_read.status;
-  }
-
-  {
-    std::vector<void*> handles = {async_read.io_handle};
-    Status poll_s =
-        table_->get_rep()->ioptions.env->GetFileSystem()->Poll(handles, 1);
-    if (!poll_s.ok()) {
-      return poll_s;
-    }
-  }
-  if (!async_read.status.ok()) {
-    return async_read.status;
-  }
-  async_read.CleanUpIOHandle();
-
-  // Initialize and pin blocks from async read result.
-  for (size_t i = 0; i < async_read.blocks.size(); ++i) {
-    const auto& block = async_read.blocks[i];
-
-    Status s = CreateAndPinBlockFromBuffer(
-        block, async_read.offset, async_read.result,
-        multi_scan_->pinned_data_blocks[async_read.block_indices[i]]);
-
-    if (!s.ok()) {
-      return s;
-    }
-    assert(multi_scan_->pinned_data_blocks[async_read.block_indices[i]]
-               .GetValue());
-  }
-  assert(multi_scan_->pinned_data_blocks[idx].GetValue());
-  return Status::OK();
-}
-
-Status BlockBasedTableIterator::CreateAndPinBlockFromBuffer(
-    const BlockHandle& block, uint64_t buffer_start_offset,
-    const Slice& buffer_data, CachableEntry<Block>& pinned_block_entry) {
-  // Get decompressor and handle dictionary loading
-  UnownedPtr<Decompressor> decompressor = table_->get_rep()->decompressor.get();
-  CachableEntry<DecompressorDict> cached_dict;
-
-  if (table_->get_rep()->uncompression_dict_reader) {
-    {
-      Status s =
-          table_->get_rep()
-              ->uncompression_dict_reader->GetOrReadUncompressionDictionary(
-                  /* prefetch_buffer= */ nullptr, read_options_,
-                  /* get_context= */ nullptr, /* lookup_context= */ nullptr,
-                  &cached_dict);
-      if (!s.ok()) {
-#ifndef NDEBUG
-        fprintf(stdout, "Prepare dictionary loading failed with %s\n",
-                s.ToString().c_str());
-#endif
-        return s;
-      }
-    }
-    if (!cached_dict.GetValue()) {
-#ifndef NDEBUG
-      fprintf(stdout, "Success but no dictionary read\n");
-#endif
-      return Status::InvalidArgument("No dictionary found");
-    }
-    decompressor = cached_dict.GetValue()->decompressor_.get();
-  }
-
-  // Create block from buffer data
-  const auto block_size_with_trailer =
-      BlockBasedTable::BlockSizeWithTrailer(block);
-  const auto block_offset_in_buffer = block.offset() - buffer_start_offset;
-
-  CacheAllocationPtr data =
-      AllocateBlock(block_size_with_trailer,
-                    GetMemoryAllocator(table_->get_rep()->table_options));
-  memcpy(data.get(), buffer_data.data() + block_offset_in_buffer,
-         block_size_with_trailer);
-  BlockContents tmp_contents(std::move(data), block.size());
-
-#ifndef NDEBUG
-  tmp_contents.has_trailer =
-      table_->get_rep()->footer.GetBlockTrailerSize() > 0;
-#endif
-
-  return table_->CreateAndPinBlockInCache<Block_kData>(
-      read_options_, block, decompressor, &tmp_contents,
-      &pinned_block_entry.As<Block_kData>());
-}
-
 constexpr auto kVerbose = false;
 
 Status BlockBasedTableIterator::CollectBlockHandles(
@@ -1586,252 +1420,4 @@ Status BlockBasedTableIterator::CollectBlockHandles(
   return Status::OK();
 }
 
-Status BlockBasedTableIterator::FilterAndPinCachedBlocks(
-    const std::vector<BlockHandle>& scan_block_handles,
-    const MultiScanArgs* multiscan_opts,
-    std::vector<size_t>* block_indices_to_read,
-    std::vector<CachableEntry<Block>>* pinned_data_blocks_guard,
-    size_t* prefetched_max_idx) {
-  uint64_t total_prefetch_size = 0;
-  *prefetched_max_idx = scan_block_handles.size();
-
-  for (size_t i = 0; i < scan_block_handles.size(); ++i) {
-    const auto& data_block_handle = scan_block_handles[i];
-
-    total_prefetch_size +=
-        BlockBasedTable::BlockSizeWithTrailer(data_block_handle);
-    if (multiscan_opts->max_prefetch_size > 0 &&
-        total_prefetch_size > multiscan_opts->max_prefetch_size) {
-      for (size_t j = i; j < scan_block_handles.size(); ++j) {
-        assert((*pinned_data_blocks_guard)[j].IsEmpty());
-      }
-      *prefetched_max_idx = i;
-      break;
-    }
-
-    Status s = table_->LookupAndPinBlocksInCache<Block_kData>(
-        read_options_, data_block_handle,
-        &(*pinned_data_blocks_guard)[i].As<Block_kData>());
-
-    if (!s.ok()) {
-      // Abort: block cache look up failed.
-      return s;
-    }
-    if (!(*pinned_data_blocks_guard)[i].GetValue()) {
-      // Block not in cache
-      block_indices_to_read->emplace_back(i);
-    }
-  }
-  return Status::OK();
-}
-
-void BlockBasedTableIterator::PrepareIORequests(
-    const std::vector<size_t>& block_indices_to_read,
-    const std::vector<BlockHandle>& scan_block_handles,
-    const MultiScanArgs* multiscan_opts, std::vector<FSReadRequest>* read_reqs,
-    UnorderedMap<size_t, size_t>* block_idx_to_readreq_idx,
-    std::vector<std::vector<size_t>>* coalesced_block_indices,
-    size_t* nonadjacent_coalesced_count, uint64_t* total_prefetch_bytes) {
-  assert(coalesced_block_indices->empty());
-  coalesced_block_indices->resize(1);
-  *nonadjacent_coalesced_count = 0;
-  *total_prefetch_bytes = 0;
-
-  for (const auto& block_idx : block_indices_to_read) {
-    if (!coalesced_block_indices->back().empty()) {
-      // Check if we can coalesce.
-      const auto& last_block_handle =
-          scan_block_handles[coalesced_block_indices->back().back()];
-      uint64_t last_block_end =
-          last_block_handle.offset() +
-          BlockBasedTable::BlockSizeWithTrailer(last_block_handle);
-      uint64_t current_start = scan_block_handles[block_idx].offset();
-
-      if (current_start >
-          last_block_end + multiscan_opts->io_coalesce_threshold) {
-        // new IO
-        coalesced_block_indices->emplace_back();
-      } else if (current_start > last_block_end) {
-        // Non-adjacent but within threshold, so coalesced
-        ++(*nonadjacent_coalesced_count);
-      }
-    }
-    coalesced_block_indices->back().emplace_back(block_idx);
-  }
-
-  assert(read_reqs->empty());
-  read_reqs->reserve(coalesced_block_indices->size());
-  for (const auto& block_indices : *coalesced_block_indices) {
-    assert(block_indices.size());
-    const auto& first_block_handle = scan_block_handles[block_indices[0]];
-    const auto& last_block_handle = scan_block_handles[block_indices.back()];
-
-    const auto start_offset = first_block_handle.offset();
-    const auto end_offset =
-        last_block_handle.offset() +
-        BlockBasedTable::BlockSizeWithTrailer(last_block_handle);
-#ifndef NDEBUG
-    // Debug print for failing the assertion below.
-    if (start_offset >= end_offset) {
-      fprintf(stderr, "scan_block_handles: ");
-      for (const auto& block : scan_block_handles) {
-        fprintf(stderr, "offset: %" PRIu64 ", size: %" PRIu64 "; ",
-                block.offset(), block.size());
-      }
-      fprintf(stderr,
-              "\nfirst block - offset: %" PRIu64 ", size: %" PRIu64 "\n",
-              first_block_handle.offset(), first_block_handle.size());
-      fprintf(stderr, "last block - offset: %" PRIu64 ", size: %" PRIu64 "\n",
-              last_block_handle.offset(), last_block_handle.size());
-
-      fprintf(stderr, "coalesced_block_indices: ");
-      for (const auto& b : *coalesced_block_indices) {
-        fprintf(stderr, "[");
-        for (const auto& block_idx : b) {
-          fprintf(stderr, "%zu ", block_idx);
-        }
-        fprintf(stderr, "] ");
-      }
-      fprintf(stderr, "\ncurrent blocks: ");
-      for (const auto& block_idx : block_indices) {
-        fprintf(stderr, "offset: %" PRIu64 ", size: %" PRIu64 "; ",
-                scan_block_handles[block_idx].offset(),
-                scan_block_handles[block_idx].size());
-      }
-      fprintf(stderr, "\n");
-    }
-#endif  // NDEBUG
-    assert(end_offset > start_offset);
-
-    read_reqs->emplace_back();
-    read_reqs->back().offset = start_offset;
-    read_reqs->back().len = end_offset - start_offset;
-    *total_prefetch_bytes += read_reqs->back().len;
-
-    if (multiscan_opts->use_async_io) {
-      for (const auto& block_idx : block_indices) {
-        (*block_idx_to_readreq_idx)[block_idx] = read_reqs->size() - 1;
-      }
-    }
-  }
-}
-
-Status BlockBasedTableIterator::ExecuteIO(
-    const std::vector<BlockHandle>& scan_block_handles,
-    const MultiScanArgs* multiscan_opts,
-    const std::vector<std::vector<size_t>>& coalesced_block_indices,
-    std::vector<FSReadRequest>* read_reqs,
-    std::vector<AsyncReadState>* async_states,
-    std::vector<CachableEntry<Block>>* pinned_data_blocks_guard) {
-  IOOptions io_opts;
-  Status s;
-  s = table_->get_rep()->file->PrepareIOOptions(read_options_, io_opts);
-  if (!s.ok()) {
-    // Abort: PrepareIOOptions failed
-    return s;
-  }
-  const bool direct_io = table_->get_rep()->file->use_direct_io();
-
-  if (multiscan_opts->use_async_io) {
-    async_states->resize(read_reqs->size());
-    for (size_t i = 0; i < read_reqs->size(); ++i) {
-      auto& read_req = (*read_reqs)[i];
-      auto& async_read = (*async_states)[i];
-
-      async_read.finished = false;
-      async_read.offset = read_req.offset;
-      async_read.block_indices = coalesced_block_indices[i];
-      for (const auto idx : coalesced_block_indices[i]) {
-        async_read.blocks.emplace_back(scan_block_handles[idx]);
-      }
-
-      if (direct_io) {
-        read_req.scratch = nullptr;
-      } else {
-        async_read.buf.reset(new char[read_req.len]);
-        read_req.scratch = async_read.buf.get();
-      }
-
-      auto cb = std::bind(&BlockBasedTableIterator::PrepareReadAsyncCallBack,
-                          this, std::placeholders::_1, std::placeholders::_2);
-      // TODO: for mmap, io_handle will not be set but callback will already
-      // be called.
-      s = table_->get_rep()->file.get()->ReadAsync(
-          read_req, io_opts, cb, &async_read, &async_read.io_handle,
-          &async_read.del_fn, direct_io ? &async_read.aligned_buf : nullptr);
-      if (!s.ok()) {
-#ifndef NDEBUG
-        fprintf(stderr, "ReadAsync failed with %s\n", s.ToString().c_str());
-#endif
-        assert(false);
-        return s;
-      }
-      for (auto& req : *read_reqs) {
-        if (!req.status.ok()) {
-          assert(false);
-          // Silence compiler warning about NRVO
-          s = req.status;
-          return s;
-        }
-      }
-    }
-  } else {
-    // Synchronous IO using MultiRead
-    std::unique_ptr<char[]> buf;
-
-    if (direct_io) {
-      for (auto& read_req : *read_reqs) {
-        read_req.scratch = nullptr;
-      }
-    } else {
-      // TODO: optimize if FSSupportedOps::kFSBuffer is supported.
-      size_t total_len = 0;
-      for (const auto& req : *read_reqs) {
-        total_len += req.len;
-      }
-      buf.reset(new char[total_len]);
-      size_t offset = 0;
-      for (auto& read_req : *read_reqs) {
-        read_req.scratch = buf.get() + offset;
-        offset += read_req.len;
-      }
-    }
-
-    AlignedBuf aligned_buf;
-    s = table_->get_rep()->file->MultiRead(io_opts, read_reqs->data(),
-                                           read_reqs->size(),
-                                           direct_io ? &aligned_buf : nullptr);
-    if (!s.ok()) {
-      return s;
-    }
-    for (auto& req : *read_reqs) {
-      if (!req.status.ok()) {
-        // Silence compiler warning about NRVO
-        s = req.status;
-        return s;
-      }
-    }
-
-    // Init blocks and pin them in block cache.
-    assert(read_reqs->size() == coalesced_block_indices.size());
-    for (size_t i = 0; i < coalesced_block_indices.size(); i++) {
-      const auto& read_req = (*read_reqs)[i];
-      for (const auto& block_idx : coalesced_block_indices[i]) {
-        const auto& block = scan_block_handles[block_idx];
-
-        assert((*pinned_data_blocks_guard)[block_idx].IsEmpty());
-        s = CreateAndPinBlockFromBuffer(block, read_req.offset, read_req.result,
-                                        (*pinned_data_blocks_guard)[block_idx]);
-        if (!s.ok()) {
-          assert(false);
-          // Abort: failed to create and pin block in cache
-          return s;
-        }
-        assert((*pinned_data_blocks_guard)[block_idx].GetValue());
-      }
-    }
-  }
-  return s;
-}
-
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/table/block_based/block_based_table_iterator.h b/table/block_based/block_based_table_iterator.h
index de329351c67d..d7c4d409305b 100644
--- a/table/block_based/block_based_table_iterator.h
+++ b/table/block_based/block_based_table_iterator.h
@@ -10,6 +10,7 @@
 #include <deque>
 
 #include "db/seqno_to_time_mapping.h"
+#include "rocksdb/io_dispatcher.h"
 #include "table/block_based/block_based_table_reader.h"
 #include "table/block_based/block_based_table_reader_impl.h"
 #include "table/block_based/block_prefetcher.h"
@@ -244,13 +245,10 @@ class BlockBasedTableIterator : public InternalIteratorBase<Slice> {
   std::unique_ptr<InternalIteratorBase<IndexValue>> index_iter_;
 
   bool TEST_IsBlockPinnedByMultiScan(size_t block_idx) {
-    if (!multi_scan_) {
+    if (!multi_scan_ || !multi_scan_->read_set) {
       return false;
     }
-    if (block_idx >= multi_scan_->pinned_data_blocks.size()) {
-      return false;
-    }
-    return !multi_scan_->pinned_data_blocks[block_idx].IsEmpty();
+    return multi_scan_->read_set->IsBlockAvailable(block_idx);
   }
 
  private:
@@ -412,111 +410,56 @@ class BlockBasedTableIterator : public InternalIteratorBase<Slice> {
   // *** END States used by both regular scan and multiscan
 
   // *** BEGIN MultiScan related states ***
-  struct AsyncReadState {
-    std::unique_ptr<char[]> buf{nullptr};
-    // Indices into pinned_data_blocks that this request reads.
-    std::vector<size_t> block_indices;
-    // BlockHandle for each block in block_indices.
-    std::vector<BlockHandle> blocks;
-    void* io_handle{nullptr};
-    IOHandleDeleter del_fn{nullptr};
-    // offset for this async read request.
-    uint64_t offset{0};
-
-    // These two states are populated from the FSReadRequest
-    // by ReadAsync callback
-    Status status;
-    Slice result;
-
-    // For direct I/O support
-    AlignedBuf aligned_buf{nullptr};
-
-    bool finished{false};
-
-    AsyncReadState() = default;
-    DECLARE_DEFAULT_MOVES(AsyncReadState);
-    // Delete copy operations
-    AsyncReadState(const AsyncReadState&) = delete;
-    AsyncReadState& operator=(const AsyncReadState&) = delete;
-
-    void CleanUpIOHandle() {
-      if (io_handle != nullptr) {
-        assert(del_fn);
-        del_fn(io_handle);
-        io_handle = nullptr;
-      }
-      finished = true;
-    }
-
-    ~AsyncReadState() {
-      // Should be cleaned up before destruction.
-      assert(io_handle == nullptr);
-    }
-  };
-
   struct MultiScanState {
     // For Aborting async I/Os in destructor.
     const std::shared_ptr<FileSystem> fs;
     const MultiScanArgs* scan_opts;
-    std::vector<CachableEntry<Block>> pinned_data_blocks;
-    // The separator of each data block in above pinned_data_blocks vector.
-    // Its size is same as pinned_data_blocks.
-    // The value of separator is larger than or equal to the last key in the
-    // corresponding data block.
+    // ReadSet owns pinned data blocks and handles async I/O
+    std::shared_ptr<ReadSet> read_set;
+    // The separator of each data block.
+    // Its size is same as the number of block handles submitted to
+    // IODispatcher. The value of separator is larger than or equal to the last
+    // key in the corresponding data block.
     std::vector<std::string> data_block_separators;
     // Track previously seeked key in multi-scan.
     // This is used to ensure that the seek key is keep moving forward, as
     // blocks that are smaller than the seek key are unpinned from memory.
     std::string prev_seek_key_;
 
-    // Indicies into pinned_data_blocks for data blocks for each scan range.
+    // Indicies into block handles for data blocks for each scan range.
     // inclusive start, exclusive end
     std::vector<std::tuple<size_t, size_t>> block_index_ranges_per_scan;
     size_t next_scan_idx;
     size_t cur_data_block_idx;
-
-    // States for async reads.
-    //
-    // Each async state correspond to an async read request.
-    // Each async read request may read content for multiple blocks
-    // (potentially coalesced). In PollForBlock(idx), we will poll for the
-    // completion of the async read request responsible for
-    // pinned_data_blocks[idx], and populate `pinned_data_blocks` with all the
-    // blocks read. To find out the async read request responsible for
-    // pinned_data_blocks[idx], we store the mapping in
-    // block_idx_to_readreq_idx. Index i is in block_idx_to_readreq_idx and
-    // block_idx_to_readreq_idx[i] = j iff pinned_data_blocks[i] is read by
-    // async_states[j].
-    std::vector<AsyncReadState> async_states;
-    UnorderedMap<size_t, size_t> block_idx_to_readreq_idx;
     size_t prefetch_max_idx;
 
-    // For tracking wasted prefetch blocks
+    // For tracking wasted prefetch blocks (prefetched but never read)
     Statistics* statistics;
     size_t wasted_blocks_count;
 
     MultiScanState(
         const std::shared_ptr<FileSystem>& _fs, const MultiScanArgs* _scan_opts,
-        std::vector<CachableEntry<Block>>&& _pinned_data_blocks,
+        std::shared_ptr<ReadSet>&& _read_set,
         std::vector<std::string>&& _data_block_separators,
         std::vector<std::tuple<size_t, size_t>>&& _block_index_ranges_per_scan,
-        UnorderedMap<size_t, size_t>&& _block_idx_to_readreq_idx,
-        std::vector<AsyncReadState>&& _async_states, size_t _prefetch_max_idx,
-        Statistics* _statistics)
+        size_t _prefetch_max_idx, Statistics* _statistics)
         : fs(_fs),
           scan_opts(_scan_opts),
-          pinned_data_blocks(std::move(_pinned_data_blocks)),
+          read_set(std::move(_read_set)),
           data_block_separators(std::move(_data_block_separators)),
           block_index_ranges_per_scan(std::move(_block_index_ranges_per_scan)),
           next_scan_idx(0),
           cur_data_block_idx(0),
-          async_states(std::move(_async_states)),
-          block_idx_to_readreq_idx(std::move(_block_idx_to_readreq_idx)),
           prefetch_max_idx(_prefetch_max_idx),
           statistics(_statistics),
           wasted_blocks_count(0) {}
 
-    ~MultiScanState();
+    ~MultiScanState() {
+      if (statistics && wasted_blocks_count > 0) {
+        RecordTick(statistics, MULTISCAN_PREFETCH_BLOCKS_WASTED,
+                   wasted_blocks_count);
+      }
+    }
   };
 
   Status multi_scan_status_;
@@ -644,24 +587,6 @@ class BlockBasedTableIterator : public InternalIteratorBase<Slice> {
 
   void FindBlockForwardInMultiScan();
 
-  void PrepareReadAsyncCallBack(FSReadRequest& req, void* cb_arg) {
-    // Record status, result and sanity check offset from `req`.
-    AsyncReadState* async_state = static_cast<AsyncReadState*>(cb_arg);
-
-    async_state->status = req.status;
-    async_state->result = req.result;
-
-    if (async_state->status.ok()) {
-      assert(async_state->offset == req.offset);
-      if (async_state->offset != req.offset) {
-        async_state->status = Status::InvalidArgument(
-            "offset mismatch between async read request " +
-            std::to_string(async_state->offset) + " and async callback " +
-            std::to_string(req.offset));
-      }
-    }
-  }
-
   void MultiScanSeekTargetFromBlock(const Slice* seek_target, size_t block_idx);
   void MultiScanUnexpectedSeekTarget(const Slice* seek_target,
                                      const Slice* user_seek_target);
@@ -684,68 +609,26 @@ class BlockBasedTableIterator : public InternalIteratorBase<Slice> {
       return true;
     }
 
-    if (!multi_scan_->async_states.empty()) {
-      multi_scan_status_ = PollForBlock(idx);
-      if (!multi_scan_status_.ok()) {
-        return true;
-      }
+    // Use ReadSet to get block (handles cache/async/sync transparently)
+    CachableEntry<Block> block_entry;
+    multi_scan_status_ = multi_scan_->read_set->ReadIndex(idx, &block_entry);
+    if (!multi_scan_status_.ok()) {
+      return true;
     }
-    // This block should have been initialized
-    assert(multi_scan_->pinned_data_blocks[idx].GetValue());
+
+    assert(block_entry.GetValue());
     // Note that the block_iter_ takes ownership of the pinned data block
-    // TODO: we can delegate the clean up like with pinned_iters_mgr_ if
-    // need to pin blocks longer.
-    table_->NewDataBlockIterator<DataBlockIter>(
-        read_options_, multi_scan_->pinned_data_blocks[idx], &block_iter_,
-        Status::OK());
+    table_->NewDataBlockIterator<DataBlockIter>(read_options_, block_entry,
+                                                &block_iter_, Status::OK());
     return false;
   }
 
-  // After PollForBlock(idx), the async request that contains
-  // pinned_data_blocks[idx] should be done, and all blocks contained in this
-  // read request will be initialzed in pinned_data_blocks and pinned in block
-  // cache.
-  Status PollForBlock(size_t idx);
-
-  // Helper function to create and pin a block in cache from buffer data
-  // Handles decompressor setup with dictionary loading and block
-  // creation/pinning. The buffer_start_offset is the file offset where
-  // buffer_data starts.
-  Status CreateAndPinBlockFromBuffer(const BlockHandle& block,
-                                     uint64_t buffer_start_offset,
-                                     const Slice& buffer_data,
-                                     CachableEntry<Block>& pinned_block_entry);
-
   Status CollectBlockHandles(
       const std::vector<ScanOptions>& scan_opts,
       std::vector<BlockHandle>* scan_block_handles,
       std::vector<std::tuple<size_t, size_t>>* block_index_ranges_per_scan,
       std::vector<std::string>* data_block_boundary_keys);
 
-  Status FilterAndPinCachedBlocks(
-      const std::vector<BlockHandle>& scan_block_handles,
-      const MultiScanArgs* multiscan_opts,
-      std::vector<size_t>* block_indices_to_read,
-      std::vector<CachableEntry<Block>>* pinned_data_blocks_guard,
-      size_t* prefetched_max_idx);
-
-  void PrepareIORequests(
-      const std::vector<size_t>& block_indices_to_read,
-      const std::vector<BlockHandle>& scan_block_handles,
-      const MultiScanArgs* multiscan_opts,
-      std::vector<FSReadRequest>* read_reqs,
-      UnorderedMap<size_t, size_t>* block_idx_to_readreq_idx,
-      std::vector<std::vector<size_t>>* coalesced_block_indices,
-      size_t* nonadjacent_coalesced_count, uint64_t* total_prefetch_bytes);
-
-  Status ExecuteIO(
-      const std::vector<BlockHandle>& scan_block_handles,
-      const MultiScanArgs* multiscan_opts,
-      const std::vector<std::vector<size_t>>& coalesced_block_indices,
-      std::vector<FSReadRequest>* read_reqs,
-      std::vector<AsyncReadState>* async_states,
-      std::vector<CachableEntry<Block>>* pinned_data_blocks_guard);
-
   // *** END APIs relevant to multiscan ***
 };
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/util/io_dispatcher_imp.cc b/util/io_dispatcher_imp.cc
index 1f247399ddb8..fd7d590185fc 100644
--- a/util/io_dispatcher_imp.cc
+++ b/util/io_dispatcher_imp.cc
@@ -16,6 +16,7 @@
 
 #include <memory>
 #include <unordered_map>
+#include <unordered_set>
 #include <vector>
 
 #include "file/random_access_file_reader.h"
@@ -94,7 +95,34 @@ struct AsyncIOState {
 };
 
 // ReadSet destructor - clean up IO handles
+// Must call AbortIO before deleting handles to avoid use-after-free when
+// io_uring completions arrive for deleted handles.
 ReadSet::~ReadSet() {
+  if (async_io_map_.empty()) {
+    return;
+  }
+
+  // Collect unique pending IO handles (multiple block indices may share the
+  // same async_state due to coalescing)
+  std::vector<void*> pending_handles;
+  std::unordered_set<void*> seen_handles;
+  for (auto& pair : async_io_map_) {
+    auto& async_state = pair.second;
+    if (async_state->io_handle != nullptr &&
+        seen_handles.find(async_state->io_handle) == seen_handles.end()) {
+      pending_handles.push_back(async_state->io_handle);
+      seen_handles.insert(async_state->io_handle);
+    }
+  }
+
+  // Abort all pending IO operations before deleting handles
+  if (!pending_handles.empty() && fs_) {
+    // AbortIO cancels pending requests and waits for completions
+    IOStatus s = fs_->AbortIO(pending_handles);
+    (void)s;  // Ignore errors in destructor
+  }
+
+  // Now safe to delete the handles
   for (auto& pair : async_io_map_) {
     auto& async_state = pair.second;
     if (async_state->io_handle != nullptr && async_state->del_fn != nullptr) {
@@ -187,6 +215,26 @@ Status ReadSet::ReadOffset(size_t offset, CachableEntry<Block>* out) {
   return Status::InvalidArgument("Offset not found in any block");
 }
 
+void ReadSet::ReleaseBlock(size_t block_index) {
+  if (block_index >= pinned_blocks_.size()) {
+    return;
+  }
+  // Unpin the block from cache
+  pinned_blocks_[block_index].Reset();
+  // Clean up any pending async IO for this block
+  async_io_map_.erase(block_index);
+}
+
+bool ReadSet::IsBlockAvailable(size_t block_index) const {
+  if (block_index >= pinned_blocks_.size()) {
+    return false;
+  }
+  // Block is available if it hasn't been released (still has a value or
+  // has pending async IO)
+  return pinned_blocks_[block_index].GetValue() != nullptr ||
+         async_io_map_.find(block_index) != async_io_map_.end();
+}
+
 // Poll and process async IO for a specific block
 Status ReadSet::PollAndProcessAsyncIO(
     const std::shared_ptr<AsyncIOState>& async_state) {
@@ -204,12 +252,9 @@ Status ReadSet::PollAndProcessAsyncIO(
     return async_state->read_req.status;
   }
 
-  // Determine which buffer to use
-  const Slice buffer_data =
-      rep->file->use_direct_io()
-          ? Slice(static_cast<const char*>(async_state->aligned_buf.get()),
-                  async_state->read_req.len)
-          : Slice(async_state->buf.get(), async_state->read_req.len);
+  // Use the result slice from the callback which has been correctly set
+  // with any necessary alignment adjustments for direct IO
+  const Slice& buffer_data = async_state->read_req.result;
 
   // Process all blocks in this async request
   for (size_t i = 0; i < async_state->block_indices.size(); ++i) {
@@ -276,11 +321,14 @@ struct IODispatcherImpl::Impl {
       std::vector<FSReadRequest>* read_reqs,
       std::vector<std::vector<size_t>>* coalesced_block_indices);
 
-  void ExecuteAsyncIO(
+  // Surface actual async IO errors to caller, but allow fallback for
+  // unsupported cases. Returns block indices that need sync fallback.
+  std::vector<size_t> ExecuteAsyncIO(
       const std::shared_ptr<IOJob>& job,
       const std::shared_ptr<ReadSet>& read_set,
       std::vector<FSReadRequest>& read_reqs,
-      const std::vector<std::vector<size_t>>& coalesced_block_indices);
+      const std::vector<std::vector<size_t>>& coalesced_block_indices,
+      Status* out_status);
 
   Status ExecuteSyncIO(
       const std::shared_ptr<IOJob>& job,
@@ -303,6 +351,7 @@ Status IODispatcherImpl::Impl::SubmitJob(const std::shared_ptr<IOJob>& job,
 
   // Initialize ReadSet
   rs->job_ = job;
+  rs->fs_ = job->table->get_rep()->ioptions.env->GetFileSystem();
   rs->pinned_blocks_.resize(job->block_handles.size());
 
   // Build sorted index for O(log n) ReadOffset lookups via binary search.
@@ -358,7 +407,28 @@ Status IODispatcherImpl::Impl::SubmitJob(const std::shared_ptr<IOJob>& job,
 
   // Step 3: Execute IO requests based on JobOptions
   if (job->job_options.read_options.async_io) {
-    ExecuteAsyncIO(job, rs, read_reqs, coalesced_block_indices);
+    // Try async IO - get back any blocks that need sync fallback (not
+    // supported) and surface any actual errors to caller
+    Status async_status;
+    std::vector<size_t> fallback_indices = ExecuteAsyncIO(
+        job, rs, read_reqs, coalesced_block_indices, &async_status);
+    if (!async_status.ok()) {
+      return async_status;
+    }
+
+    // Fall back to sync IO for blocks where async is not supported
+    if (!fallback_indices.empty()) {
+      std::vector<FSReadRequest> sync_read_reqs;
+      std::vector<std::vector<size_t>> sync_coalesced_indices;
+      PrepareIORequests(job, fallback_indices, job->block_handles,
+                        &sync_read_reqs, &sync_coalesced_indices);
+
+      Status s = ExecuteSyncIO(job, rs, sync_read_reqs, sync_coalesced_indices);
+      if (!s.ok()) {
+        return s;
+      }
+      rs->num_sync_reads_ += fallback_indices.size();
+    }
   } else {
     Status s = ExecuteSyncIO(job, rs, read_reqs, coalesced_block_indices);
     if (!s.ok()) {
@@ -433,17 +503,22 @@ void IODispatcherImpl::Impl::PrepareIORequests(
   }
 }
 
-void IODispatcherImpl::Impl::ExecuteAsyncIO(
+std::vector<size_t> IODispatcherImpl::Impl::ExecuteAsyncIO(
     const std::shared_ptr<IOJob>& job, const std::shared_ptr<ReadSet>& read_set,
     std::vector<FSReadRequest>& read_reqs,
-    const std::vector<std::vector<size_t>>& coalesced_block_indices) {
+    const std::vector<std::vector<size_t>>& coalesced_block_indices,
+    Status* out_status) {
+  std::vector<size_t> fallback_block_indices;
+  *out_status = Status::OK();
+
   // Get file and IO options
   auto* rep = job->table->get_rep();
   IOOptions io_opts;
   Status s =
       rep->file->PrepareIOOptions(job->job_options.read_options, io_opts);
   if (!s.ok()) {
-    return;
+    *out_status = s;
+    return fallback_block_indices;
   }
 
   const bool direct_io = rep->file->use_direct_io();
@@ -468,9 +543,12 @@ void IODispatcherImpl::Impl::ExecuteAsyncIO(
     }
 
     // Callback for async read completion
-    // TODO: Probably need to make this more useful.
-    auto cb = [](const FSReadRequest& /*req*/, void* /*cb_arg*/) {
-      // Placeholder callback - currently does nothing
+    // Store the result slice and status back into async_state so we can access
+    // them after Poll() completes.
+    auto cb = [](const FSReadRequest& req, void* cb_arg) {
+      auto* state = static_cast<AsyncIOState*>(cb_arg);
+      state->read_req.result = req.result;
+      state->read_req.status = req.status;
     };
 
     s = rep->file->ReadAsync(async_state->read_req, io_opts, cb,
@@ -479,18 +557,26 @@ void IODispatcherImpl::Impl::ExecuteAsyncIO(
                              direct_io ? &async_state->aligned_buf : nullptr);
 
     if (!s.ok()) {
-      continue;
+      // Actual error - surface to caller
+      *out_status = s;
+      return fallback_block_indices;
     }
-    assert(async_state->io_handle);
 
-    // Mark the status as permitted unchecked since we'll check it later
-    // in PollAndProcessAsyncIO
+    if (async_state->io_handle == nullptr) {
+      // Async IO not supported - add to fallback list for sync IO
+      for (const auto idx : coalesced_block_indices[i]) {
+        fallback_block_indices.push_back(idx);
+      }
+      continue;
+    }
 
     // Add async state to map for all blocks in this request
     for (const auto idx : async_state->block_indices) {
       read_set->async_io_map_[idx] = async_state;
     }
   }
+
+  return fallback_block_indices;
 }
 
 Status IODispatcherImpl::Impl::ExecuteSyncIO(
diff --git a/util/io_dispatcher_test.cc b/util/io_dispatcher_test.cc
index 9677b4c51897..7f8e0a93115a 100644
--- a/util/io_dispatcher_test.cc
+++ b/util/io_dispatcher_test.cc
@@ -27,6 +27,13 @@
 // Enable io_uring support for this test
 extern "C" bool RocksDbIOUringEnable() { return true; }
 
+// Check if io_uring is available at compile time
+#ifdef ROCKSDB_IOURING_PRESENT
+static constexpr bool kIOUringPresent = true;
+#else
+static constexpr bool kIOUringPresent = false;
+#endif
+
 namespace ROCKSDB_NAMESPACE {
 
 // Represents a single read operation recorded by the tracking file system
@@ -392,7 +399,8 @@ TEST_F(IODispatcherTest, BasicSSTRead) {
   job->block_handles = block_handles;
   job->table = table.get();
   ReadOptions read_options;
-  job->job_options.read_options.async_io = true;
+  // Only use async IO when io_uring is available
+  job->job_options.read_options.async_io = kIOUringPresent;
 
   std::shared_ptr<ReadSet> read_set;
   s = dispatcher->SubmitJob(job, &read_set);
@@ -471,7 +479,8 @@ TEST_F(IODispatcherTest, StatisticsTracking) {
   auto job = std::make_shared<IOJob>();
   job->block_handles = block_handles;
   job->table = table.get();
-  job->job_options.read_options.async_io = true;
+  // Only use async IO when io_uring is available
+  job->job_options.read_options.async_io = kIOUringPresent;
 
   std::shared_ptr<ReadSet> read_set;
   s = dispatcher->SubmitJob(job, &read_set);
@@ -510,9 +519,13 @@ TEST_F(IODispatcherTest, StatisticsTracking) {
 TEST_F(IODispatcherTest, AsyncAndSyncRead) {
   // This test verifies the difference between async_io=true and async_io=false
   // by checking the statistics after reading all blocks.
-  // Note: When io_uring is not available, async_io=true will fall back to sync.
+  // Only test async_io=true when io_uring is available.
+  std::vector<bool> async_modes = {false};
+  if (kIOUringPresent) {
+    async_modes.push_back(true);
+  }
 
-  for (auto async : {true, false}) {
+  for (auto async : async_modes) {
     std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher());
 
     std::unique_ptr<BlockBasedTable> table;
@@ -704,99 +717,6 @@ TEST_F(IODispatcherTest, ReadSetDestroysUnpinsBlocks) {
       << " final=" << final_pinned_usage;
 }
 
-// Test that verifies the exact sequence of reads issued by the IO dispatcher.
-// This uses the ReadTrackingFS to capture all read operations and verify
-// that async_io=true uses ReadAsync while async_io=false uses MultiRead.
-TEST_F(IODispatcherTest, VerifyReadSequence) {
-  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher());
-
-  std::unique_ptr<BlockBasedTable> table;
-  std::vector<BlockHandle> block_handles;
-  Status s = CreateAndOpenSST(20, &table, &block_handles);
-  ASSERT_OK(s);
-  ASSERT_NE(table, nullptr);
-  ASSERT_GE(block_handles.size(), 10);
-
-  // Clear any reads from table opening
-  tracking_fs_->ClearReadOps();
-
-  // Test 1: Synchronous reads should use MultiRead
-  {
-    auto job = std::make_shared<IOJob>();
-    job->block_handles = block_handles;
-    job->table = table.get();
-    job->job_options.read_options.async_io = false;
-
-    std::shared_ptr<ReadSet> read_set;
-    s = dispatcher->SubmitJob(job, &read_set);
-    ASSERT_OK(s);
-    ASSERT_NE(read_set, nullptr);
-
-    // Read all blocks
-    for (size_t i = 0; i < block_handles.size(); ++i) {
-      CachableEntry<Block> block;
-      Status read_status = read_set->ReadIndex(i, &block);
-      ASSERT_OK(read_status);
-      ASSERT_NE(block.GetValue(), nullptr);
-    }
-
-    // Verify that MultiRead was used for sync reads
-    auto read_ops = tracking_fs_->GetReadOps();
-    ASSERT_GT(tracking_fs_->GetMultiReadCount(), 0)
-        << "Expected MultiRead to be called for sync reads";
-    ASSERT_EQ(tracking_fs_->GetReadAsyncCount(), 0)
-        << "Expected no ReadAsync calls for sync reads";
-
-    // Verify MultiRead requests cover all blocks
-    size_t total_blocks_in_multireads = 0;
-    for (const auto& op : read_ops) {
-      if (op.type == ReadOp::kMultiRead) {
-        // Each MultiRead request may contain multiple coalesced blocks
-        total_blocks_in_multireads += op.requests.size();
-      }
-    }
-    // Note: blocks may be coalesced, so we check that reads were issued
-    ASSERT_GT(total_blocks_in_multireads, 0);
-  }
-
-  // Clear reads and test async mode
-  tracking_fs_->ClearReadOps();
-
-  // Test 2: Async reads should use ReadAsync
-  {
-    // Create a new table to avoid cache hits
-    std::unique_ptr<BlockBasedTable> table2;
-    std::vector<BlockHandle> block_handles2;
-    s = CreateAndOpenSST(20, &table2, &block_handles2);
-    ASSERT_OK(s);
-
-    tracking_fs_->ClearReadOps();
-
-    auto job = std::make_shared<IOJob>();
-    job->block_handles = block_handles2;
-    job->table = table2.get();
-    job->job_options.read_options.async_io = true;
-
-    std::shared_ptr<ReadSet> read_set;
-    s = dispatcher->SubmitJob(job, &read_set);
-    ASSERT_OK(s);
-    ASSERT_NE(read_set, nullptr);
-
-    // Verify that ReadAsync was used
-    ASSERT_GT(tracking_fs_->GetReadAsyncCount(), 0)
-        << "Expected ReadAsync to be called for async reads";
-    ASSERT_EQ(tracking_fs_->GetMultiReadCount(), 0)
-        << "Expected no MultiRead calls for async reads";
-
-    // Read blocks - ReadIndex will poll for async IO completion internally
-    for (size_t i = 0; i < block_handles2.size(); ++i) {
-      CachableEntry<Block> block;
-      Status read_status = read_set->ReadIndex(i, &block);
-      ASSERT_OK(read_status);
-      ASSERT_NE(block.GetValue(), nullptr);
-    }
-  }
-}
 
 // Test that verifies the coalescing logic: adjacent blocks within the
 // coalesce threshold should be combined into a single read request.

From d1b63738e0043ff4cda3c15a5a8c6d52e10220d5 Mon Sep 17 00:00:00 2001
From: Hemal Shah <hemal@stripe.com>
Date: Mon, 2 Feb 2026 17:36:44 -0800
Subject: [PATCH 454/500] Add `WriteBatch::Handler::LogData` iteration callback
 function (#14245)

Summary:
Change adds `log_data_` function callback for when iterating over a `WriteBatch`. Previously only the `Put`, `Delete`, `Merge` operations were called into when iterating over an `WriteBatch` (and their `*_cf` equivalent through a different `WriteBatch::Handler` implementation).

To maintain backwards compatibility, previously exported function definitions remain the same, but new functions are exported for different languages to use the `LogData` callback on an iteration.

### Background

Hi - we use the [`rust-rocksdb`](https://github.com/rust-rocksdb/rust-rocksdb) bindings to work with `rocksdb` at Stripe. We are starting to make small contributions https://github.com/facebook/rocksdb/pull/14183 & https://github.com/facebook/rocksdb/pull/14136 and this adds on top of it. I saw that the `PutLogData` method is already exported for a `WriteBatch`, but there's no way to consume that. This change allows us to consume that information (with a follow up change on the [`rust-rocksdb`](https://github.com/rust-rocksdb/rust-rocksdb) repo.).

Thanks for your time looking into this. Previously we had trouble with meta's internal linters - I am happy to make appropriate change if something like that pops up again.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14245

Reviewed By: archang19

Differential Revision: D92069503

Pulled By: jaykorean

fbshipit-source-id: a4a3c885462f641c8df9e3401a0e4c1d38871c6f
---
 db/c.cc             | 46 +++++++++++++++++++++++++++++++++++++++++++++
 include/rocksdb/c.h | 13 +++++++++++++
 2 files changed, 59 insertions(+)

diff --git a/db/c.cc b/db/c.cc
index dae0d0ebb569..9a9c0c0f9aa4 100644
--- a/db/c.cc
+++ b/db/c.cc
@@ -3042,12 +3042,19 @@ class H : public WriteBatch::Handler {
   void* state_;
   void (*put_)(void*, const char* k, size_t klen, const char* v, size_t vlen);
   void (*deleted_)(void*, const char* k, size_t klen);
+  void (*log_data_)(void*, const char* blob, size_t blob_len);
+
   void Put(const Slice& key, const Slice& value) override {
     (*put_)(state_, key.data(), key.size(), value.data(), value.size());
   }
   void Delete(const Slice& key) override {
     (*deleted_)(state_, key.data(), key.size());
   }
+  void LogData(const Slice& blob) override {
+    if (log_data_) {
+      (*log_data_)(state_, blob.data(), blob.size());
+    }
+  }
 };
 
 class HCF : public WriteBatch::Handler {
@@ -3058,6 +3065,8 @@ class HCF : public WriteBatch::Handler {
   void (*deleted_cf_)(void*, uint32_t cfid, const char* k, size_t klen);
   void (*merge_cf_)(void*, uint32_t cfid, const char* k, size_t klen,
                     const char* v, size_t vlen);
+  void (*log_data_)(void*, const char* blob, size_t blob_len);
+
   Status PutCF(uint32_t column_family_id, const Slice& key,
                const Slice& value) override {
     (*put_cf_)(state_, column_family_id, key.data(), key.size(), value.data(),
@@ -3074,6 +3083,11 @@ class HCF : public WriteBatch::Handler {
                  value.size());
     return Status::OK();
   }
+  void LogData(const Slice& blob) override {
+    if (log_data_) {
+      (*log_data_)(state_, blob.data(), blob.size());
+    }
+  }
 };
 
 void rocksdb_writebatch_iterate(rocksdb_writebatch_t* b, void* state,
@@ -3085,6 +3099,20 @@ void rocksdb_writebatch_iterate(rocksdb_writebatch_t* b, void* state,
   handler.state_ = state;
   handler.put_ = put;
   handler.deleted_ = deleted;
+  handler.log_data_ = nullptr;
+  b->rep.Iterate(&handler);
+}
+
+void rocksdb_writebatch_iterate_ld(
+    rocksdb_writebatch_t* b, void* state,
+    void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen),
+    void (*deleted)(void*, const char* k, size_t klen),
+    void (*log_data)(void*, const char* blob, size_t blob_len)) {
+  H handler;
+  handler.state_ = state;
+  handler.put_ = put;
+  handler.deleted_ = deleted;
+  handler.log_data_ = log_data;
   b->rep.Iterate(&handler);
 }
 
@@ -3100,6 +3128,24 @@ void rocksdb_writebatch_iterate_cf(
   handler.put_cf_ = put_cf;
   handler.deleted_cf_ = deleted_cf;
   handler.merge_cf_ = merge_cf;
+  handler.log_data_ = nullptr;
+  b->rep.Iterate(&handler);
+}
+
+void rocksdb_writebatch_iterate_cf_ld(
+    rocksdb_writebatch_t* b, void* state,
+    void (*put_cf)(void*, uint32_t cfid, const char* k, size_t klen,
+                   const char* v, size_t vlen),
+    void (*deleted_cf)(void*, uint32_t cfid, const char* k, size_t klen),
+    void (*merge_cf)(void*, uint32_t cfid, const char* k, size_t klen,
+                     const char* v, size_t vlen),
+    void (*log_data)(void*, const char* blob, size_t blob_len)) {
+  HCF handler;
+  handler.state_ = state;
+  handler.put_cf_ = put_cf;
+  handler.deleted_cf_ = deleted_cf;
+  handler.merge_cf_ = merge_cf;
+  handler.log_data_ = log_data;
   b->rep.Iterate(&handler);
 }
 
diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h
index a50edd2f7ef6..5c88c8fe3015 100644
--- a/include/rocksdb/c.h
+++ b/include/rocksdb/c.h
@@ -950,6 +950,11 @@ extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_iterate(
     rocksdb_writebatch_t*, void* state,
     void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen),
     void (*deleted)(void*, const char* k, size_t klen));
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_iterate_ld(
+    rocksdb_writebatch_t*, void* state,
+    void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen),
+    void (*deleted)(void*, const char* k, size_t klen),
+    void (*log_data)(void*, const char* blob, size_t blob_len));
 extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_iterate_cf(
     rocksdb_writebatch_t*, void* state,
     void (*put_cf)(void*, uint32_t cfid, const char* k, size_t klen,
@@ -957,6 +962,14 @@ extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_iterate_cf(
     void (*deleted_cf)(void*, uint32_t cfid, const char* k, size_t klen),
     void (*merge_cf)(void*, uint32_t cfid, const char* k, size_t klen,
                      const char* v, size_t vlen));
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_iterate_cf_ld(
+    rocksdb_writebatch_t*, void* state,
+    void (*put_cf)(void*, uint32_t cfid, const char* k, size_t klen,
+                   const char* v, size_t vlen),
+    void (*deleted_cf)(void*, uint32_t cfid, const char* k, size_t klen),
+    void (*merge_cf)(void*, uint32_t cfid, const char* k, size_t klen,
+                     const char* v, size_t vlen),
+    void (*log_data)(void*, const char* blob, size_t blob_len));
 extern ROCKSDB_LIBRARY_API const char* rocksdb_writebatch_data(
     rocksdb_writebatch_t*, size_t* size);
 extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_set_save_point(

From 92378eb3b871cff878937b3c77a36e87d1f37688 Mon Sep 17 00:00:00 2001
From: Xingbo Wang <xbw@meta.com>
Date: Tue, 3 Feb 2026 05:53:50 -0800
Subject: [PATCH 455/500] Add CLAUDE.md and optimize tooling for claude code
 (#14293)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Summary:
* Add CLAUDE.md This CLAUDE.md is generated through the analysis of 9,012 commits from the RocksDB repository since 2016. It aggregate the commits into 8 major components based on the component it is changing. It selected top 100 most complex PR based on line of changes to collect the code review feedbacks. For each PR, we collected:

  • PR title and description to understand the change context
  • Files changed to identify affected components and measure complexity
  • Inline code review comments from reviewers
  • General review summaries and approval/change request feedback

The feedback was then categorized by RocksDB component and analyzed for recurring themes, patterns, and best practices.

This CLAUDE.md file could be used for guiding code generation and code review.

* Optimize tooling for claude code Add make check-progress and format-auto targets for automation

Add machine-parseable progress reporting for `make check` to support automated monitoring tools like Claude Code:

- Add `build_tools/check_progress.sh` script that outputs JSON progress
- Add `make check-progress` target to poll build/test progress
- Detects phases: compiling -> linking -> generating -> testing
- Reports failed tests with exit codes, signals, and log output
- Limits to 10 failures with last 50 lines of output each

Also add non-interactive formatting support:

- Add `-y` flag to format-diff.sh for auto-apply without prompts
- Add `make format-auto` target for CI/automation use

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14293

Test Plan: Local build

Reviewed By: pdillinger

Differential Revision: D92085763

Pulled By: xingbowang

fbshipit-source-id: ba122a4ff51087aec5c06bab804edfee34e13880
---
 Makefile                      |   9 +
 build_tools/check_progress.sh | 231 +++++++++++++++
 build_tools/format-diff.sh    |  22 +-
 claude_md/add_option.md       | 512 ++++++++++++++++++++++++++++++++++
 claude_md/add_public_api.md   | 504 +++++++++++++++++++++++++++++++++
 5 files changed, 1272 insertions(+), 6 deletions(-)
 create mode 100755 build_tools/check_progress.sh
 create mode 100644 claude_md/add_option.md
 create mode 100644 claude_md/add_public_api.md

diff --git a/Makefile b/Makefile
index b2c3a8f6b741..7c35b80d95f6 100644
--- a/Makefile
+++ b/Makefile
@@ -993,6 +993,11 @@ watch-log:
 dump-log:
 	bash -c '$(quoted_perl_command)' < LOG
 
+# Machine-parseable progress output for automated monitoring (e.g., Claude Code)
+# Outputs JSON: {"status":"running","completed":45,"total":100,"failed":0,"percent":45,"eta_seconds":120}
+check-progress:
+	@build_tools/check_progress.sh
+
 # If J != 1 and GNU parallel is installed, run the tests in parallel,
 # via the check_0 rule above.  Otherwise, run them sequentially.
 check: all
@@ -1204,6 +1209,10 @@ tags0:
 format:
 	build_tools/format-diff.sh
 
+# Non-interactive format (auto-apply without prompts, for CI/automation/Claude Code)
+format-auto:
+	build_tools/format-diff.sh -y
+
 check-format:
 	build_tools/format-diff.sh -c
 
diff --git a/build_tools/check_progress.sh b/build_tools/check_progress.sh
new file mode 100755
index 000000000000..d52a91dabd05
--- /dev/null
+++ b/build_tools/check_progress.sh
@@ -0,0 +1,231 @@
+#!/bin/bash
+# Output test progress in JSON format for machine parsing
+# Usage: build_tools/check_progress.sh
+
+LOG_FILE="LOG"
+T_DIR="t"
+SRC_MK="src.mk"
+
+# Maximum lines of test output to include per failed test
+MAX_OUTPUT_LINES=50
+
+# Helper to escape string for JSON (handles newlines, quotes, backslashes, tabs)
+json_escape() {
+    local str="$1"
+    # Use python for reliable JSON escaping if available, otherwise use sed
+    if command -v python3 &>/dev/null; then
+        printf '%s' "$str" | python3 -c 'import json,sys; print(json.dumps(sys.stdin.read())[1:-1], end="")'
+    else
+        printf '%s' "$str" | sed 's/\\/\\\\/g; s/"/\\"/g; s/\t/\\t/g; s/\r/\\r/g' | awk '{printf "%s\\n", $0}' | sed 's/\\n$//'
+    fi
+}
+
+# Helper to output JSON and exit
+output_json() {
+    local status="$1"
+    local completed="${2:-0}"
+    local total="${3:-0}"
+    local failed="${4:-0}"
+    local percent="${5:-0}"
+    local eta="${6:-0}"
+    local avg_time="${7:-0}"
+    local last_item="${8:-}"
+    local phase="${9:-}"
+    local failed_tests="${10:-}"
+
+    # Build JSON output
+    local json="{\"status\":\"$status\""
+
+    if [[ -n "$phase" ]]; then
+        json="$json,\"phase\":\"$phase\""
+    fi
+
+    json="$json,\"completed\":$completed,\"total\":$total,\"failed\":$failed,\"percent\":$percent"
+    json="$json,\"eta_seconds\":$eta,\"avg_time\":\"$avg_time\",\"last_item\":\"$(json_escape "$last_item")\""
+
+    if [[ -n "$failed_tests" ]]; then
+        json="$json,\"failed_tests\":[$failed_tests]"
+    fi
+
+    json="$json}"
+    echo "$json"
+}
+
+# Get failed test info with log output
+get_failed_tests_json() {
+    local log_file="$1"
+    local t_dir="$2"
+    local max_failures=10
+    local count=0
+    local first=true
+
+    # Get failed tests from LOG file
+    while IFS=$'\t' read -r seq host starttime runtime send recv exitval signal cmd; do
+        # Skip header line
+        [[ "$seq" == "Seq" ]] && continue
+
+        # Check if failed (exitval != 0 or signal != 0)
+        if [[ "$exitval" != "0" || "$signal" != "0" ]]; then
+            # Extract test name from command
+            test_name=$(echo "$cmd" | sed 's,.*/run-,,;s, .*,,')
+
+            # Get log file path
+            log_path="$t_dir/log-run-$test_name"
+
+            # Read test output (last N lines)
+            if [[ -f "$log_path" ]]; then
+                output=$(tail -n "$MAX_OUTPUT_LINES" "$log_path" 2>/dev/null)
+            else
+                output="(log file not found: $log_path)"
+            fi
+
+            # Escape output for JSON
+            escaped_output=$(json_escape "$output")
+
+            # Build JSON object for this failure
+            if [[ "$first" == "true" ]]; then
+                first=false
+            else
+                printf ","
+            fi
+            printf '{"test":"%s","exit_code":%d,"signal":%d,"output":"%s"}' \
+                "$test_name" "$exitval" "$signal" "$escaped_output"
+
+            ((count++))
+            if [[ $count -ge $max_failures ]]; then
+                break
+            fi
+        fi
+    done < "$log_file"
+}
+
+# Check if tests are running (LOG file exists)
+if [[ -f "$LOG_FILE" ]]; then
+    # Count total tests from t/run-* files
+    if [[ -d "$T_DIR" ]]; then
+        total=$(find "$T_DIR" -name 'run-*' -type f 2>/dev/null | wc -l)
+    else
+        total=0
+    fi
+
+    # If no parallel tests generated yet
+    if [[ "$total" -eq 0 ]]; then
+        output_json "running" 0 0 0 0 0 "0" "" "generating"
+        exit 0
+    fi
+
+    # Parse LOG file (skip header line)
+    # LOG format: Seq Host Starttime JobRuntime Send Receive Exitval Signal Command
+    completed=$(tail -n +2 "$LOG_FILE" 2>/dev/null | wc -l)
+
+    # Count failures
+    failed=$(awk -F'\t' 'NR>1 && ($7 != 0 || $8 != 0) {count++} END {print count+0}' "$LOG_FILE" 2>/dev/null)
+
+    # Get failed tests JSON with output (only if there are failures)
+    if [[ "$failed" -gt 0 ]]; then
+        failed_tests=$(get_failed_tests_json "$LOG_FILE" "$T_DIR")
+    else
+        failed_tests=""
+    fi
+
+    # Calculate percentage
+    if [[ "$total" -gt 0 ]]; then
+        percent=$((completed * 100 / total))
+    else
+        percent=0
+    fi
+
+    # Get last completed test name (extract from command column)
+    last_test=$(tail -1 "$LOG_FILE" 2>/dev/null | awk -F'\t' '{print $9}' | sed 's,.*/run-,,;s, .*,,;s,^./,,')
+
+    # Calculate ETA based on average time
+    if [[ "$completed" -gt 0 ]]; then
+        avg_time=$(awk -F'\t' 'NR>1 {sum+=$4; count++} END {if(count>0) printf "%.1f", sum/count; else print "0"}' "$LOG_FILE")
+        remaining=$((total - completed))
+        eta=$(awk "BEGIN {printf \"%.0f\", $avg_time * $remaining}")
+    else
+        avg_time="0"
+        eta="0"
+    fi
+
+    # Determine status
+    if [[ "$completed" -ge "$total" ]]; then
+        status="completed"
+    elif [[ "$completed" -gt 0 ]]; then
+        status="running"
+    else
+        status="starting"
+    fi
+
+    output_json "$status" "$completed" "$total" "$failed" "$percent" "$eta" "$avg_time" "$last_test" "testing" "$failed_tests"
+    exit 0
+fi
+
+# No LOG file - check if we're in compilation/linking phase
+# Count expected source files from src.mk
+if [[ -f "$SRC_MK" ]]; then
+    # Count LIB_SOURCES (library object files to compile)
+    expected_lib_objects=$(grep -E '\.cc\s*\\?$' "$SRC_MK" | grep -v '^#' | wc -l)
+
+    # Count TEST_MAIN_SOURCES (test binaries to link)
+    expected_test_binaries=$(sed -n '/^TEST_MAIN_SOURCES =/,/^[^ ]/p' "$SRC_MK" | grep -cE '\.cc\s*\\?$' 2>/dev/null || echo 0)
+else
+    expected_lib_objects=0
+    expected_test_binaries=0
+fi
+
+# Check for test generation phase (t/ directory being created)
+if [[ -d "$T_DIR" ]]; then
+    total=$(find "$T_DIR" -name 'run-*' -type f 2>/dev/null | wc -l)
+    if [[ "$total" -gt 0 ]]; then
+        output_json "running" 0 "$total" 0 0 0 "0" "" "generating"
+        exit 0
+    fi
+fi
+
+# Count compiled object files (in subdirectories matching source structure)
+# Object files are created as dir/file.o (e.g., cache/cache.o, db/db_impl.o)
+compiled_objects=0
+if [[ "$expected_lib_objects" -gt 0 ]]; then
+    # Count .o files in source directories
+    compiled_objects=$(find cache db env file logging memory memtable monitoring options port table test_util trace_replay util utilities -name '*.o' -type f 2>/dev/null | wc -l)
+fi
+
+# Count linked test binaries (test binaries are in current directory with _test suffix)
+linked_tests=0
+if [[ "$expected_test_binaries" -gt 0 ]]; then
+    linked_tests=$(find . -maxdepth 1 -name '*_test' -type f -executable 2>/dev/null | wc -l)
+fi
+
+# Determine phase based on what exists
+if [[ "$compiled_objects" -eq 0 && "$linked_tests" -eq 0 ]]; then
+    # Nothing compiled yet - not started or just beginning
+    output_json "not_started" 0 0 0 0 0 "0" ""
+    exit 0
+fi
+
+# Calculate total work units: compiling + linking
+total_work=$((expected_lib_objects + expected_test_binaries))
+completed_work=$((compiled_objects + linked_tests))
+
+if [[ "$total_work" -gt 0 ]]; then
+    percent=$((completed_work * 100 / total_work))
+else
+    percent=0
+fi
+
+# Determine phase
+if [[ "$compiled_objects" -lt "$expected_lib_objects" ]]; then
+    phase="compiling"
+    # Get most recently modified .o file as last_item
+    last_item=$(find cache db env file logging memory memtable monitoring options port table test_util trace_replay util utilities -name '*.o' -type f -printf '%T@ %p\n' 2>/dev/null | sort -rn | head -1 | cut -d' ' -f2- | sed 's,^\./,,;s,\.o$,,')
+elif [[ "$linked_tests" -lt "$expected_test_binaries" ]]; then
+    phase="linking"
+    # Get most recently modified test binary as last_item
+    last_item=$(find . -maxdepth 1 -name '*_test' -type f -executable -printf '%T@ %p\n' 2>/dev/null | sort -rn | head -1 | cut -d' ' -f2- | sed 's,^\./,,')
+else
+    phase="generating"
+    last_item=""
+fi
+
+output_json "running" "$completed_work" "$total_work" 0 "$percent" 0 "0" "$last_item" "$phase"
diff --git a/build_tools/format-diff.sh b/build_tools/format-diff.sh
index 91cbb46a3412..55ee4bd6e24f 100755
--- a/build_tools/format-diff.sh
+++ b/build_tools/format-diff.sh
@@ -7,14 +7,18 @@ print_usage () {
   echo "Usage:"
   echo "format-diff.sh [OPTIONS]"
   echo "-c: check only."
+  echo "-y: auto-apply formatting without prompts (non-interactive mode)."
   echo "-h: print this message."
 }
 
-while getopts ':ch' OPTION; do
+while getopts ':cyh' OPTION; do
   case "$OPTION" in
     c)
       CHECK_ONLY=1
       ;;
+    y)
+      AUTO_APPLY=1
+      ;;
     h)
       print_usage
       exit 1
@@ -240,11 +244,16 @@ echo "$diffs" |
   sed -e "s/\(^-.*$\)/`echo -e \"$COLOR_RED\1$COLOR_END\"`/" |
   sed -e "s/\(^+.*$\)/`echo -e \"$COLOR_GREEN\1$COLOR_END\"`/"
 
-echo -e "Would you like to fix the format automatically (y/n): \c"
+# Handle auto-apply mode (non-interactive)
+if [ "$AUTO_APPLY" ]; then
+  to_fix="y"
+else
+  echo -e "Would you like to fix the format automatically (y/n): \c"
 
-# Make sure under any mode, we can read user input.
-exec < /dev/tty
-read to_fix
+  # Make sure under any mode, we can read user input.
+  exec < /dev/tty
+  read to_fix
+fi
 
 if [ "$to_fix" != "y" ]
 then
@@ -261,7 +270,8 @@ fi
 echo "Files reformatted!"
 
 # Amend to last commit if user do the post-commit format check
-if [ -z "$uncommitted_code" ]; then
+# Skip amend prompt in auto-apply mode (user can amend manually if desired)
+if [ -z "$uncommitted_code" ] && [ -z "$AUTO_APPLY" ]; then
   echo -e "Would you like to amend the changes to last commit (`git log HEAD --oneline | head -1`)? (y/n): \c"
   read to_amend
 
diff --git a/claude_md/add_option.md b/claude_md/add_option.md
new file mode 100644
index 000000000000..77caa1dbeeeb
--- /dev/null
+++ b/claude_md/add_option.md
@@ -0,0 +1,512 @@
+# Adding New Options to RocksDB Public API
+
+This document provides guidance on how to add new options to RocksDB's public API. There are two main categories of options:
+
+1. **Standard Column Family Options** (Options/DBOptions/AdvancedColumnFamilyOptions)
+2. **BlockBasedTableOptions** (options specific to block-based table format)
+
+## Overview of Files to Modify
+
+### For Standard Column Family Options
+
+| File | Purpose |
+|------|---------|
+| `include/rocksdb/advanced_options.h` | Define the option with documentation |
+| `include/rocksdb/options.h` | Add reference in related option groups if needed |
+| `options/cf_options.h` | Add to `MutableCFOptions` or `ImmutableCFOptions` struct |
+| `options/cf_options.cc` | Register option for serialization/deserialization and logging |
+| `options/options_helper.cc` | Add to `UpdateColumnFamilyOptions()` for mutable options |
+| `options/options_settable_test.cc` | Add to test string for option parsing |
+| `db_stress_tool/db_stress_common.h` | Declare gflag |
+| `db_stress_tool/db_stress_gflags.cc` | Define gflag with default value |
+| `db_stress_tool/db_stress_test_base.cc` | Apply flag to options |
+| `tools/db_bench_tool.cc` | Add flag definition and apply to options |
+| `tools/db_crashtest.py` | Add randomized values for stress testing |
+| `unreleased_history/new_features/` | Add release note markdown file |
+
+### For BlockBasedTableOptions
+
+| File | Purpose |
+|------|---------|
+| `include/rocksdb/table.h` | Define the option in `BlockBasedTableOptions` struct |
+| `table/block_based/block_based_table_factory.cc` | Register for serialization, validation, and printing |
+| `options/options_settable_test.cc` | Add to `BlockBasedTableOptionsAllFieldsSettable` test |
+| `options/options_test.cc` | Add to `MutableCFOptions` test if applicable |
+| `db_stress_tool/db_stress_common.h` | Declare gflag |
+| `db_stress_tool/db_stress_gflags.cc` | Define gflag |
+| `db_stress_tool/db_stress_test_base.cc` | Apply flag to `block_based_options` |
+| `tools/db_bench_tool.cc` | Add flag definition and apply to `block_based_options` |
+| `tools/db_crashtest.py` | Add randomized values |
+| `java/src/main/java/org/rocksdb/BlockBasedTableConfig.java` | Java API |
+| `java/rocksjni/portal.h` | JNI portal for Java bindings |
+| `java/rocksjni/table.cc` | JNI implementation |
+| `java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java` | Java unit test |
+
+---
+
+## Pattern 1: Adding a Standard Column Family Option
+
+Example reference: commit `94e65a2e0b4f817aa4bfa4c96cdf867e7980d7bc` (memtable_veirfy_per_key_checksum_on_seek)
+
+### Step 1: Define the Option in Public Header
+
+**File: `include/rocksdb/advanced_options.h`**
+
+Add the option with documentation in `AdvancedColumnFamilyOptions` struct:
+
+```cpp
+// Enables additional integrity checks during seek.
+// Specifically, for skiplist-based memtables, key checksum validation could
+// be enabled during seek optionally. This is helpful to detect corrupted
+// memtable keys during reads. Enabling this feature incurs a performance
+// overhead due to additional key checksum validation during memtable seek
+// operation.
+// This option depends on memtable_protection_bytes_per_key to be non zero.
+// If memtable_protection_bytes_per_key is zero, no validation is performed.
+bool memtable_veirfy_per_key_checksum_on_seek = false;
+```
+
+### Step 2: Add to Internal Options Structs
+
+**File: `options/cf_options.h`**
+
+Add to `MutableCFOptions` struct (or `ImmutableCFOptions` for immutable options):
+
+```cpp
+// In MutableCFOptions constructor from Options:
+memtable_veirfy_per_key_checksum_on_seek(
+    options.memtable_veirfy_per_key_checksum_on_seek),
+
+// In MutableCFOptions default constructor:
+memtable_veirfy_per_key_checksum_on_seek(false),
+
+// In MutableCFOptions struct member declarations:
+bool memtable_veirfy_per_key_checksum_on_seek;
+```
+
+### Step 3: Register for Serialization/Deserialization
+
+**File: `options/cf_options.cc`**
+
+Add to the options type info map for serialization:
+
+```cpp
+{"memtable_veirfy_per_key_checksum_on_seek",
+ {offsetof(struct MutableCFOptions,
+           memtable_veirfy_per_key_checksum_on_seek),
+  OptionType::kBoolean, OptionVerificationType::kNormal,
+  OptionTypeFlags::kMutable}},
+```
+
+Add logging in `MutableCFOptions::Dump()`:
+
+```cpp
+ROCKS_LOG_INFO(log, "memtable_veirfy_per_key_checksum_on_seek: %d",
+               memtable_veirfy_per_key_checksum_on_seek);
+```
+
+### Step 4: Update Options Helper
+
+**File: `options/options_helper.cc`**
+
+Add to `UpdateColumnFamilyOptions()`:
+
+```cpp
+cf_opts->memtable_veirfy_per_key_checksum_on_seek =
+    moptions.memtable_veirfy_per_key_checksum_on_seek;
+```
+
+### Step 5: Add to Options Settable Test
+
+**File: `options/options_settable_test.cc`**
+
+Add to the test string in `ColumnFamilyOptionsAllFieldsSettable`:
+
+```cpp
+"memtable_veirfy_per_key_checksum_on_seek=1;"
+```
+
+### Step 6: Add db_stress Support
+
+**File: `db_stress_tool/db_stress_common.h`**
+
+```cpp
+DECLARE_bool(memtable_veirfy_per_key_checksum_on_seek);
+```
+
+**File: `db_stress_tool/db_stress_gflags.cc`**
+
+```cpp
+DEFINE_bool(
+    memtable_veirfy_per_key_checksum_on_seek,
+    ROCKSDB_NAMESPACE::Options().memtable_veirfy_per_key_checksum_on_seek,
+    "Sets CF option memtable_veirfy_per_key_checksum_on_seek.");
+```
+
+**File: `db_stress_tool/db_stress_test_base.cc`**
+
+```cpp
+options.memtable_veirfy_per_key_checksum_on_seek =
+    FLAGS_memtable_veirfy_per_key_checksum_on_seek;
+```
+
+### Step 7: Add db_bench Support
+
+**File: `tools/db_bench_tool.cc`**
+
+```cpp
+// Flag definition (near related flags):
+DEFINE_bool(memtable_veirfy_per_key_checksum_on_seek, false,
+            "Sets CF option memtable_veirfy_per_key_checksum_on_seek");
+
+// Apply flag to options (in InitializeOptionsFromFlags or similar):
+options.memtable_veirfy_per_key_checksum_on_seek =
+    FLAGS_memtable_veirfy_per_key_checksum_on_seek;
+```
+
+### Step 8: Add Crash Test Support
+
+**File: `tools/db_crashtest.py`**
+
+```python
+"memtable_veirfy_per_key_checksum_on_seek": lambda: random.choice([0] * 7 + [1]),
+```
+
+Also add constraint handling in `finalize_and_sanitize()` if needed:
+
+```python
+# only skip list memtable representation supports paranoid memory checks
+if dest_params.get("memtablerep") != "skip_list":
+    dest_params["memtable_veirfy_per_key_checksum_on_seek"] = 0
+```
+
+### Step 9: Add Release Note
+
+**File: `unreleased_history/new_features/<descriptive_name>.md`**
+
+```markdown
+A new flag memtable_veirfy_per_key_checksum_on_seek is added to AdvancedColumnFamilyOptions. When it is enabled, it will validate key checksum along the binary search path on skiplist based memtable during seek operation.
+```
+
+---
+
+## Pattern 2: Adding a BlockBasedTableOptions Option
+
+Example reference: commit `742741b175c5f238374c1714f9db3340d49de569` (super_block_alignment_size)
+
+### Step 1: Define the Option in Public Header
+
+**File: `include/rocksdb/table.h`**
+
+Add to `BlockBasedTableOptions` struct with documentation:
+
+```cpp
+// Align data blocks on super block alignment. Avoid a data block split across
+// super block boundaries. Works with/without compression.
+//
+// Here a "super block" refers to an aligned unit of underlying Filesystem
+// storage for which there is an extra cost when a random read involves two
+// such super blocks instead of just one. Configuring that size here suggests
+// inserting padding in the SST file to avoid a single SST block splitting
+// across two super blocks. Only power-of-two sizes are supported. See also
+// super_block_alignment_space_overhead_ratio. Default to 0, which means super
+// block alignment is disabled.
+size_t super_block_alignment_size = 0;
+
+// This option controls the storage space overhead of super block alignment.
+// It is used to calculate the max padding size allowed for super block
+// alignment. It is calculated in this way. If super_block_alignment_size is
+// 2MB, and super_block_alignment_overhead_ratio is 128, then the max padding
+// size allowed for super block alignment is 2MB / 128 = 16KB.
+// Note that, when it is set to 0, super block alignment is disabled.
+size_t super_block_alignment_space_overhead_ratio = 128;
+```
+
+### Step 2: Register for Serialization in Table Factory
+
+**File: `table/block_based/block_based_table_factory.cc`**
+
+Add to the type info map:
+
+```cpp
+{"super_block_alignment_size",
+ {offsetof(struct BlockBasedTableOptions, super_block_alignment_size),
+  OptionType::kSizeT, OptionVerificationType::kNormal}},
+{"super_block_alignment_space_overhead_ratio",
+ {offsetof(struct BlockBasedTableOptions,
+           super_block_alignment_space_overhead_ratio),
+  OptionType::kSizeT, OptionVerificationType::kNormal}},
+```
+
+Add validation in `ValidateOptions()`:
+
+```cpp
+if ((table_options_.super_block_alignment_size &
+     (table_options_.super_block_alignment_size - 1))) {
+  return Status::InvalidArgument(
+      "Super Block alignment requested but super block alignment size is not "
+      "a power of 2");
+}
+if (table_options_.super_block_alignment_size >
+    std::numeric_limits<uint32_t>::max()) {
+  return Status::InvalidArgument(
+      "Super block alignment size exceeds maximum number (4GiB) allowed");
+}
+```
+
+Add printing in `GetPrintableOptions()`:
+
+```cpp
+snprintf(buffer, kBufferSize,
+         "  super_block_alignment_size: %" ROCKSDB_PRIszt "\n",
+         table_options_.super_block_alignment_size);
+ret.append(buffer);
+```
+
+### Step 3: Add to Options Settable Test
+
+**File: `options/options_settable_test.cc`**
+
+Add to `BlockBasedTableOptionsAllFieldsSettable` test:
+
+```cpp
+"super_block_alignment_size=65536;"
+"super_block_alignment_space_overhead_ratio=4096;"
+```
+
+### Step 4: Add to Options Test
+
+**File: `options/options_test.cc`**
+
+```cpp
+ASSERT_OK(GetColumnFamilyOptionsFromString(
+    config_options, cf_opts,
+    "block_based_table_factory.super_block_alignment_size=65536; "
+    "block_based_table_factory.super_block_alignment_space_overhead_ratio=4096;",
+    &cf_opts));
+ASSERT_EQ(bbto->super_block_alignment_size, 65536);
+ASSERT_EQ(bbto->super_block_alignment_space_overhead_ratio, 4096);
+```
+
+### Step 5: Add db_stress Support
+
+**File: `db_stress_tool/db_stress_common.h`**
+
+```cpp
+DECLARE_uint64(super_block_alignment_size);
+DECLARE_uint64(super_block_alignment_space_overhead_ratio);
+```
+
+**File: `db_stress_tool/db_stress_gflags.cc`**
+
+```cpp
+DEFINE_uint64(
+    super_block_alignment_size,
+    ROCKSDB_NAMESPACE::BlockBasedTableOptions().super_block_alignment_size,
+    "BlockBasedTableOptions.super_block_alignment_size");
+
+DEFINE_uint64(
+    super_block_alignment_space_overhead_ratio,
+    ROCKSDB_NAMESPACE::BlockBasedTableOptions()
+        .super_block_alignment_space_overhead_ratio,
+    "BlockBasedTableOptions.super_block_alignment_space_overhead_ratio");
+```
+
+**File: `db_stress_tool/db_stress_test_base.cc`**
+
+```cpp
+block_based_options.super_block_alignment_size =
+    fLU64::FLAGS_super_block_alignment_size;
+block_based_options.super_block_alignment_space_overhead_ratio =
+    fLU64::FLAGS_super_block_alignment_space_overhead_ratio;
+```
+
+### Step 6: Add db_bench Support
+
+**File: `tools/db_bench_tool.cc`**
+
+```cpp
+// Flag definitions:
+DEFINE_uint64(
+    super_block_alignment_size,
+    ROCKSDB_NAMESPACE::BlockBasedTableOptions().super_block_alignment_size,
+    "Configure super block size");
+
+DEFINE_uint64(super_block_alignment_space_overhead_ratio,
+              ROCKSDB_NAMESPACE::BlockBasedTableOptions()
+                  .super_block_alignment_space_overhead_ratio,
+              "Configure space overhead for super block alignment");
+
+// Apply to block_based_options (in the block where other options are set):
+block_based_options.super_block_alignment_size = FLAGS_super_block_alignment_size;
+block_based_options.super_block_alignment_space_overhead_ratio =
+    FLAGS_super_block_alignment_space_overhead_ratio;
+```
+
+### Step 7: Add Crash Test Support
+
+**File: `tools/db_crashtest.py`**
+
+```python
+"super_block_alignment_size": lambda: random.choice(
+    [0, 128 * 1024, 512 * 1024, 2 * 1024 * 1024]
+),
+"super_block_alignment_space_overhead_ratio": lambda: random.choice([0, 32, 4096]),
+```
+
+### Step 8: Add Java API Support
+
+**File: `java/src/main/java/org/rocksdb/BlockBasedTableConfig.java`**
+
+Add getter and setter methods:
+
+```java
+/**
+ * Get the super block alignment size.
+ *
+ * @return the super block alignment size.
+ */
+public long superBlockAlignmentSize() {
+  return superBlockAlignmentSize;
+}
+
+/**
+ * Set the super block alignment size.
+ * When set to 0, super block alignment is disabled.
+ *
+ * @param superBlockAlignmentSize the super block alignment size.
+ *
+ * @return the reference to the current option.
+ */
+public BlockBasedTableConfig setSuperBlockAlignmentSize(final long superBlockAlignmentSize) {
+  this.superBlockAlignmentSize = superBlockAlignmentSize;
+  return this;
+}
+```
+
+Add member variable:
+
+```java
+private long superBlockAlignmentSize;
+```
+
+Update constructor and native method signature.
+
+**File: `java/rocksjni/portal.h`**
+
+Update `GetMethodID` signature and add fields to Java object construction.
+
+**File: `java/rocksjni/table.cc`**
+
+Add parameters to JNI function and apply to options.
+
+**File: `java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java`**
+
+Add unit tests:
+
+```java
+@Test
+public void superBlockAlignmentSize() {
+  final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig();
+  blockBasedTableConfig.setSuperBlockAlignmentSize(1024 * 1024);
+  assertThat(blockBasedTableConfig.superBlockAlignmentSize()).isEqualTo(1024 * 1024);
+}
+```
+
+---
+
+## Pattern 3: Adding C API for Existing Option
+
+Example reference: commit `429b36c22d76403d275dd0e6877b08d4cea2bc90` (block_align C API)
+
+If an option already exists but needs C API support:
+
+**File: `db/c.cc`**
+
+```cpp
+void rocksdb_block_based_options_set_block_align(
+    rocksdb_block_based_table_options_t* options, unsigned char v) {
+  options->rep.block_align = v;
+}
+```
+
+**File: `include/rocksdb/c.h`**
+
+```cpp
+extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_block_align(
+    rocksdb_block_based_table_options_t*, unsigned char);
+```
+
+---
+
+## Unit Testing Guidelines
+
+### For Standard Options
+
+Add tests in appropriate test files (e.g., `db/db_memtable_test.cc`, `db/db_options_test.cc`):
+
+```cpp
+TEST_F(DBMemTableTest, YourOptionTest) {
+  Options options;
+  options.your_new_option = true;
+  Reopen(options);
+  // Test the behavior
+}
+```
+
+### For BlockBasedTableOptions
+
+Add tests in `db/db_flush_test.cc`, `table/block_based/block_based_table_reader_test.cc`, or `table/table_test.cc`:
+
+```cpp
+TEST_P(DBFlushYourFeatureTest, YourFeature) {
+  Options options;
+  BlockBasedTableOptions block_options;
+  block_options.your_new_option = some_value;
+  options.table_factory.reset(NewBlockBasedTableFactory(block_options));
+
+  ASSERT_OK(options.table_factory->ValidateOptions(
+      DBOptions(options), ColumnFamilyOptions(options)));
+
+  Reopen(options);
+  // Test the behavior
+}
+```
+
+---
+
+## Option Type Reference
+
+Common option types used in serialization:
+
+| OptionType | C++ Type | Example |
+|------------|----------|---------|
+| `kBoolean` | `bool` | `paranoid_memory_checks` |
+| `kInt` | `int` | `max_write_buffer_number` |
+| `kInt32T` | `int32_t` | `level0_file_num_compaction_trigger` |
+| `kUInt32T` | `uint32_t` | `memtable_protection_bytes_per_key` |
+| `kUInt64T` | `uint64_t` | `target_file_size_base` |
+| `kSizeT` | `size_t` | `block_size` |
+| `kDouble` | `double` | `compression_ratio` |
+| `kString` | `std::string` | `db_log_dir` |
+
+---
+
+## Checklist Summary
+
+- [ ] Public header file with option definition and documentation
+- [ ] Internal options struct (MutableCFOptions or ImmutableCFOptions)
+- [ ] Options serialization/deserialization registration
+- [ ] Options logging in Dump() method
+- [ ] UpdateColumnFamilyOptions() for mutable options
+- [ ] options_settable_test.cc
+- [ ] db_stress_common.h (DECLARE)
+- [ ] db_stress_gflags.cc (DEFINE)
+- [ ] db_stress_test_base.cc (apply flag)
+- [ ] db_bench_tool.cc (DEFINE and apply)
+- [ ] db_crashtest.py (randomized values)
+- [ ] Unit tests
+- [ ] unreleased_history markdown file
+- [ ] Java API (for BlockBasedTableOptions)
+- [ ] C API (if needed)
+
diff --git a/claude_md/add_public_api.md b/claude_md/add_public_api.md
new file mode 100644
index 000000000000..684b89faeba5
--- /dev/null
+++ b/claude_md/add_public_api.md
@@ -0,0 +1,504 @@
+# RocksDB API Development Guide
+
+This document provides guidance for adding new public APIs to RocksDB, following the established patterns used by existing APIs like `CompactRange`.
+
+## API Layer Architecture
+
+RocksDB exposes public APIs through multiple layers. Users can access RocksDB through any of the three public APIs: C++ headers, C headers, or Java bindings.
+
+Here is an example for public header db.h:
+
+```
+┌─────────────────────────────────────────────────────────────────────────────┐
+│                     Level 1: Public APIs (User Entry Points)                │
+├───────────────────────┬─────────────────────────┬───────────────────────────┤
+│   C++ Public API      │     C API Bindings      │       Java/JNI API        │
+│ include/rocksdb/db.h  │   include/rocksdb/c.h   │ java/src/.../RocksDB.java │
+│ include/rocksdb/*.h   │                         │ java/src/.../*.java       │
+└───────────────────────┴────────────┬────────────┴───────────────────────────┘
+                                     ↓
+┌─────────────────────────────────────────────────────────────────────────────┐
+│              Level 2: C++ Implementation (Internal Core)                    │
+│              db/db_impl/db_impl*.cc, db/c.cc, java/rocksjni/*.cc            │
+└─────────────────────────────────────────────────────────────────────────────┘
+```
+
+## Step-by-Step Guide: Adding a New Public API
+
+### Step 1: Define the C++ Public Interface
+
+**File:** `include/rocksdb/db.h`
+
+Add the virtual method declaration in the `DB` class:
+
+\`\`\`cpp
+// Pure virtual - must be implemented by DBImpl
+virtual Status YourNewAPI(const YourAPIOptions& options,
+                          ColumnFamilyHandle* column_family,
+                          /* other params */) = 0;
+
+// Convenience overload for default column family
+virtual Status YourNewAPI(const YourAPIOptions& options,
+                          /* other params */) {
+  return YourNewAPI(options, DefaultColumnFamily(), /* other params */);
+}
+\`\`\`
+
+**Key Patterns:**
+- Use `Status` return type for error handling
+- Use `OptSlice` to avoid unnecessary levels of indirection and use of raw pointers.
+- Use `ColumnFamilyHandle*` for column family support
+- Provide convenience overloads for the default column family
+
+### Step 2: Define Options Struct (If Needed)
+
+**File:** `include/rocksdb/options.h`
+
+If your API has multiple configuration options, define an options struct:
+
+\`\`\`cpp
+struct YourAPIOptions {
+  // Document each option with clear comments
+  bool some_boolean_option = false;
+
+  // Default value explanation
+  int some_int_option = -1;
+
+  // Pointer options require careful lifetime management
+  std::atomic<bool>* canceled = nullptr;
+
+  // Enum options for multi-choice settings
+  YourEnumType some_enum = YourEnumType::kDefault;
+};
+\`\`\`
+
+**Key Patterns:**
+- Use sensible default values specified inline (e.g., `= false`, `= -1`)
+- Do NOT redundantly document the default value in comments; instead, document the rationale (why this default), historical context, and how different values are interpreted
+- Group related options logically
+- Consider thread-safety for pointer options
+
+### Step 3: Implement in DBImpl
+
+**Header:** `db/db_impl/db_impl.h`
+
+\`\`\`cpp
+using DB::YourNewAPI;
+Status YourNewAPI(const YourAPIOptions& options,
+                  ColumnFamilyHandle* column_family,
+                  /* other params */) override;
+
+// Private internal implementation if needed
+Status YourNewAPIInternal(const YourAPIOptions& options,
+                          ColumnFamilyHandle* column_family,
+                          /* other params */);
+\`\`\`
+
+**Implementation:** `db/db_impl/db_impl_<category>.cc`
+
+Choose the appropriate implementation file based on functionality:
+- `db_impl_compaction_flush.cc` - Compaction and flush operations
+- `db_impl_write.cc` - Write operations
+- `db_impl_open.cc` - DB opening/closing
+- `db_impl_files.cc` - File operations
+- `db_impl.cc` - General operations
+
+\`\`\`cpp
+Status DBImpl::YourNewAPI(const YourAPIOptions& options,
+                          ColumnFamilyHandle* column_family,
+                          /* other params */) {
+  // 1. Input validation
+  if (/* invalid input */) {
+    return Status::InvalidArgument("Error message");
+  }
+
+  // 2. Check for cancellation/abort conditions
+  if (options.canceled && options.canceled->load(std::memory_order_acquire)) {
+    return Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+  }
+
+  // 3. Get column family data
+  auto cfh = static_cast<ColumnFamilyHandleImpl*>(column_family);
+  auto cfd = cfh->cfd();
+
+  // 4. Core implementation logic
+  // ...
+
+  return Status::OK();
+}
+\`\`\`
+
+### Step 4: Handle Special DB Types
+
+**StackableDB (Wrapper DBs):**
+**File:** `include/rocksdb/utilities/stackable_db.h`
+
+\`\`\`cpp
+using DB::YourNewAPI;
+Status YourNewAPI(const YourAPIOptions& options,
+                  ColumnFamilyHandle* column_family,
+                  /* other params */) override {
+  return db_->YourNewAPI(options, column_family, /* other params */);
+}
+\`\`\`
+
+**Secondary DB (Read-Only):**
+**File:** `db/db_impl/db_impl_secondary.h`
+
+\`\`\`cpp
+using DBImpl::YourNewAPI;
+Status YourNewAPI(const YourAPIOptions& /*options*/,
+                  ColumnFamilyHandle* /*column_family*/,
+                  /* other params */) override {
+  return Status::NotSupported("Not supported in secondary DB");
+}
+\`\`\`
+
+**CompactedDB (Read-Only):**
+**File:** `db/db_impl/compacted_db_impl.h`
+
+\`\`\`cpp
+using DBImpl::YourNewAPI;
+Status YourNewAPI(const YourAPIOptions& /*options*/,
+                  ColumnFamilyHandle* /*column_family*/,
+                  /* other params */) override {
+  return Status::NotSupported("Not supported for read-only DB");
+}
+\`\`\`
+
+### Step 5: Add C API Bindings
+
+**Header:** `include/rocksdb/c.h`
+
+\`\`\`c
+// Basic version
+extern ROCKSDB_LIBRARY_API void rocksdb_your_new_api(
+    rocksdb_t* db,
+    const char* start_key, size_t start_key_len,
+    const char* limit_key, size_t limit_key_len);
+
+// Column family version
+extern ROCKSDB_LIBRARY_API void rocksdb_your_new_api_cf(
+    rocksdb_t* db, rocksdb_column_family_handle_t* column_family,
+    const char* start_key, size_t start_key_len,
+    const char* limit_key, size_t limit_key_len);
+
+// With options and error handling
+extern ROCKSDB_LIBRARY_API void rocksdb_your_new_api_opt(
+    rocksdb_t* db, rocksdb_your_api_options_t* opt,
+    const char* start_key, size_t start_key_len,
+    const char* limit_key, size_t limit_key_len,
+    char** errptr);
+\`\`\`
+
+**Implementation:** `db/c.cc`
+
+\`\`\`cpp
+void rocksdb_your_new_api(rocksdb_t* db, const char* start_key,
+                          size_t start_key_len, const char* limit_key,
+                          size_t limit_key_len) {
+  Slice a, b;
+  db->rep->YourNewAPI(
+      YourAPIOptions(),  // Default options
+      (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr),
+      (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr));
+}
+
+void rocksdb_your_new_api_cf(rocksdb_t* db,
+                             rocksdb_column_family_handle_t* column_family,
+                             const char* start_key, size_t start_key_len,
+                             const char* limit_key, size_t limit_key_len) {
+  Slice a, b;
+  db->rep->YourNewAPI(
+      YourAPIOptions(),
+      column_family->rep,
+      (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr),
+      (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr));
+}
+\`\`\`
+
+**If you have options, also add:**
+
+\`\`\`cpp
+// Options struct wrapper
+struct rocksdb_your_api_options_t {
+  YourAPIOptions rep;
+};
+
+rocksdb_your_api_options_t* rocksdb_your_api_options_create() {
+  return new rocksdb_your_api_options_t;
+}
+
+void rocksdb_your_api_options_destroy(rocksdb_your_api_options_t* opt) {
+  delete opt;
+}
+
+void rocksdb_your_api_options_set_some_option(
+    rocksdb_your_api_options_t* opt, unsigned char value) {
+  opt->rep.some_boolean_option = value;
+}
+\`\`\`
+
+### Step 6: Add Java Bindings
+
+**Java API:** `java/src/main/java/org/rocksdb/RocksDB.java`
+
+\`\`\`java
+// Basic version
+public void yourNewAPI() throws RocksDBException {
+  yourNewAPI(null);
+}
+
+// Column family version
+public void yourNewAPI(ColumnFamilyHandle columnFamilyHandle)
+    throws RocksDBException {
+  yourNewAPI(nativeHandle_, null, -1, null, -1, 0,
+      columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_);
+}
+
+// Range version
+public void yourNewAPI(final byte[] begin, final byte[] end)
+    throws RocksDBException {
+  yourNewAPI(null, begin, end);
+}
+
+// Full-featured version with options
+public void yourNewAPI(ColumnFamilyHandle columnFamilyHandle,
+                       final byte[] begin, final byte[] end,
+                       final YourAPIOptions options)
+    throws RocksDBException {
+  yourNewAPI(nativeHandle_,
+      begin, begin == null ? -1 : begin.length,
+      end, end == null ? -1 : end.length,
+      options.nativeHandle_,
+      columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_);
+}
+
+// Native method declaration
+private static native void yourNewAPI(final long handle,
+    /* @Nullable */ final byte[] begin, final int beginLen,
+    /* @Nullable */ final byte[] end, final int endLen,
+    final long optionsHandle,
+    final long cfHandle);
+\`\`\`
+
+**Options Class:** `java/src/main/java/org/rocksdb/YourAPIOptions.java`
+
+\`\`\`java
+public class YourAPIOptions extends RocksObject {
+
+  public YourAPIOptions() {
+    super(newYourAPIOptions());
+  }
+
+  // Builder pattern setters
+  public YourAPIOptions setSomeBooleanOption(boolean value) {
+    setSomeBooleanOption(nativeHandle_, value);
+    return this;
+  }
+
+  // Getters
+  public boolean someBooleanOption() {
+    return someBooleanOption(nativeHandle_);
+  }
+
+  // Native method declarations
+  private static native long newYourAPIOptions();
+  private static native void disposeInternalJni(long handle);
+  private static native void setSomeBooleanOption(long handle, boolean value);
+  private static native boolean someBooleanOption(long handle);
+
+  @Override
+  protected final void disposeInternal(final long handle) {
+    disposeInternalJni(handle);
+  }
+}
+\`\`\`
+
+**JNI Implementation:** `java/rocksjni/rocksjni.cc`
+
+\`\`\`cpp
+void Java_org_rocksdb_RocksDB_yourNewAPI(
+    JNIEnv* env, jclass,
+    jlong jdb_handle, jbyteArray jbegin, jint jbegin_len,
+    jbyteArray jend, jint jend_len,
+    jlong joptions_handle, jlong jcf_handle) {
+
+  // 1. Convert Java byte arrays to C++ strings
+  jboolean has_exception = JNI_FALSE;
+  std::string str_begin;
+  if (jbegin_len > 0) {
+    str_begin = ROCKSDB_NAMESPACE::JniUtil::byteString<std::string>(
+        env, jbegin, jbegin_len,
+        [](const char* str, const size_t len) { return std::string(str, len); },
+        &has_exception);
+    if (has_exception == JNI_TRUE) return;
+  }
+
+  std::string str_end;
+  if (jend_len > 0) {
+    str_end = ROCKSDB_NAMESPACE::JniUtil::byteString<std::string>(
+        env, jend, jend_len,
+        [](const char* str, const size_t len) { return std::string(str, len); },
+        &has_exception);
+    if (has_exception == JNI_TRUE) return;
+  }
+
+  // 2. Get or create options
+  ROCKSDB_NAMESPACE::YourAPIOptions* options = nullptr;
+  if (joptions_handle == 0) {
+    options = new ROCKSDB_NAMESPACE::YourAPIOptions();
+  } else {
+    options = reinterpret_cast<ROCKSDB_NAMESPACE::YourAPIOptions*>(joptions_handle);
+  }
+
+  // 3. Unwrap handles
+  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
+  ROCKSDB_NAMESPACE::ColumnFamilyHandle* cf_handle =
+      jcf_handle == 0 ? db->DefaultColumnFamily()
+                      : reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jcf_handle);
+
+  // 4. Create Slices
+  std::unique_ptr<ROCKSDB_NAMESPACE::Slice> begin;
+  std::unique_ptr<ROCKSDB_NAMESPACE::Slice> end;
+  if (jbegin_len > 0) begin.reset(new ROCKSDB_NAMESPACE::Slice(str_begin));
+  if (jend_len > 0) end.reset(new ROCKSDB_NAMESPACE::Slice(str_end));
+
+  // 5. Call C++ API
+  ROCKSDB_NAMESPACE::Status s = db->YourNewAPI(*options, cf_handle, begin.get(), end.get());
+
+  // 6. Cleanup if we created options
+  if (joptions_handle == 0) delete options;
+
+  // 7. Throw Java exception on error
+  ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
+}
+\`\`\`
+
+**Options JNI:** `java/rocksjni/your_api_options.cc`
+
+\`\`\`cpp
+jlong Java_org_rocksdb_YourAPIOptions_newYourAPIOptions(JNIEnv*, jclass) {
+  auto* options = new ROCKSDB_NAMESPACE::YourAPIOptions();
+  return GET_CPLUSPLUS_POINTER(options);
+}
+
+void Java_org_rocksdb_YourAPIOptions_disposeInternalJni(JNIEnv*, jclass, jlong jhandle) {
+  auto* options = reinterpret_cast<ROCKSDB_NAMESPACE::YourAPIOptions*>(jhandle);
+  delete options;
+}
+
+void Java_org_rocksdb_YourAPIOptions_setSomeBooleanOption(
+    JNIEnv*, jclass, jlong jhandle, jboolean value) {
+  auto* options = reinterpret_cast<ROCKSDB_NAMESPACE::YourAPIOptions*>(jhandle);
+  options->some_boolean_option = static_cast<bool>(value);
+}
+
+jboolean Java_org_rocksdb_YourAPIOptions_someBooleanOption(JNIEnv*, jclass, jlong jhandle) {
+  auto* options = reinterpret_cast<ROCKSDB_NAMESPACE::YourAPIOptions*>(jhandle);
+  return static_cast<jboolean>(options->some_boolean_option);
+}
+\`\`\`
+
+### Step 7: Update Build Files
+
+**Java CMakeLists.txt:** `java/CMakeLists.txt`
+
+Add your new Java source files:
+\`\`\`cmake
+src/main/java/org/rocksdb/YourAPIOptions.java
+src/test/java/org/rocksdb/YourAPIOptionsTest.java
+\`\`\`
+
+### Step 8: Add Release Notes
+
+**Directory:** `unreleased_history/`
+
+RocksDB uses individual files in the `unreleased_history/` directory rather than directly editing `HISTORY.md`. This avoids merge conflicts and ensures changes are attributed to the correct release version.
+
+Add a file to the appropriate subdirectory:
+- `unreleased_history/new_features/` - For new functionality
+- `unreleased_history/public_api_changes/` - For API changes
+- `unreleased_history/behavior_changes/` - For behavior modifications
+- `unreleased_history/bug_fixes/` - For bug fixes
+
+**Example:** `unreleased_history/new_features/your_new_api.md`
+
+\`\`\`markdown
+Added `YourNewAPI()` to support [describe functionality]. See `YourAPIOptions` for configuration.
+\`\`\`
+
+**Example:** `unreleased_history/public_api_changes/your_api_options.md`
+
+**Note:** Files should contain one line of markdown. The "* " prefix is automatically added if not included. These files are compiled into `HISTORY.md` during the release process.
+
+### Step 9: Add Tests
+
+**C++ Unit Tests:** `db/db_your_api_test.cc` or add to existing test file
+
+\`\`\`cpp
+TEST_F(DBTest, YourNewAPIBasic) {
+  Options options = CurrentOptions();
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  // Setup test data
+  ASSERT_OK(Put(1, "key1", "value1"));
+  ASSERT_OK(Put(1, "key2", "value2"));
+
+  // Test your API
+  YourAPIOptions api_options;
+  api_options.some_boolean_option = true;
+  ASSERT_OK(db_->YourNewAPI(api_options, handles_[1], nullptr, nullptr));
+
+  // Verify results
+  // ...
+}
+\`\`\`
+
+**Java Tests:** `java/src/test/java/org/rocksdb/YourAPIOptionsTest.java`
+
+\`\`\`java
+public class YourAPIOptionsTest {
+  @Test
+  public void yourAPIOptions() {
+    try (final YourAPIOptions options = new YourAPIOptions()) {
+      assertFalse(options.someBooleanOption());
+      options.setSomeBooleanOption(true);
+      assertTrue(options.someBooleanOption());
+    }
+  }
+}
+\`\`\`
+
+## File Summary Checklist
+
+
+| Component | File(s) | Required |
+|-----------|---------|----------|
+| C++ Public Interface | `include/rocksdb/db.h` | ✓ |
+| Options Struct | `include/rocksdb/options.h` | If needed |
+| DBImpl Declaration | `db/db_impl/db_impl.h` | ✓ |
+| DBImpl Implementation | `db/db_impl/db_impl_*.cc` | ✓ |
+| StackableDB | `include/rocksdb/utilities/stackable_db.h` | ✓ |
+| Secondary DB | `db/db_impl/db_impl_secondary.h` | If not supported |
+| Compacted DB | `db/db_impl/compacted_db_impl.h` | If not supported |
+| C API Header | `include/rocksdb/c.h` | ✓ |
+| C API Implementation | `db/c.cc` | ✓ |
+| Java API | `java/src/main/java/org/rocksdb/RocksDB.java` | ✓ |
+| Java Options | `java/src/main/java/org/rocksdb/YourAPIOptions.java` | If needed |
+| JNI Implementation | `java/rocksjni/rocksjni.cc` | ✓ |
+| JNI Options | `java/rocksjni/your_api_options.cc` | If needed |
+| Java CMake | `java/CMakeLists.txt` | If new files |
+| Changelog | `unreleased_history/*.md` | ✓ |
+| C++ Tests | `db/db_*_test.cc` | ✓ |
+| Java Tests | `java/src/test/java/org/rocksdb/*Test.java` | ✓ |
+
+## Best Practices
+
+1. **Error Handling**: Always return `Status` objects in C++, throw exceptions in Java
+2. **Default Values**: Provide sensible defaults for all options
+3. **Documentation**: Add clear comments for all public methods and options
+4. **Column Family Support**: Always support column family operations
+5. **Thread Safety**: Document thread-safety guarantees
+6. **Backward Compatibility**: Avoid breaking existing API contracts
+7. **Testing**: Add comprehensive unit tests for all code paths

From 82ff0678d4c9e67d83a4fcbe86ab232c2360e05b Mon Sep 17 00:00:00 2001
From: Anand Ananthabhotla <anand76@meta.com>
Date: Tue, 3 Feb 2026 12:22:09 -0800
Subject: [PATCH 456/500] Add a cleanup target to crash_test.mk (#14286)

Summary:
Pull Request resolved: https://github.com/facebook/rocksdb/pull/14286

Add the db_c leanup target which can be used by CI test scripts to delete the db on failure. The db_crashtest.py doesn't automatically delete on error.

Reviewed By: jaykorean

Differential Revision: D91912877

fbshipit-source-id: d36ec0896fba64faaafe055d8673e437e85d0c3a
---
 crash_test.mk                      |  4 ++++
 db_stress_tool/db_stress_common.h  |  1 +
 db_stress_tool/db_stress_gflags.cc |  4 ++++
 db_stress_tool/db_stress_tool.cc   | 14 ++++++++++++++
 4 files changed, 23 insertions(+)

diff --git a/crash_test.mk b/crash_test.mk
index 1b9960d581e1..02e15a862aae 100644
--- a/crash_test.mk
+++ b/crash_test.mk
@@ -34,6 +34,7 @@ CRASHTEST_PY=$(PYTHON) -u tools/db_crashtest.py --stress_cmd=$(DB_STRESS_CMD) --
 	whitebox_crash_test_with_txn whitebox_crash_test_with_ts \
 	whitebox_crash_test_with_optimistic_txn \
 	whitebox_crash_test_with_tiered_storage \
+	crash_test_db_cleanup \
 
 crash_test: $(DB_STRESS_CMD)
 # Do not parallelize
@@ -161,6 +162,9 @@ whitebox_crash_test_with_optimistic_txn: $(DB_STRESS_CMD)
 	$(CRASHTEST_PY) --optimistic_txn whitebox --random_kill_odd \
       $(CRASH_TEST_KILL_ODD) $(CRASH_TEST_EXT_ARGS)
 
+crash_test_db_cleanup: $(DB_STRESS_CMD)
+	$(DB_STRESS_CMD) --delete_dir_and_exit=$(TEST_TMPDIR)
+
 # Old names DEPRECATED
 crash_test_with_txn: crash_test_with_wc_txn
 whitebox_crash_test_with_txn: whitebox_crash_test_with_wc_txn
diff --git a/db_stress_tool/db_stress_common.h b/db_stress_tool/db_stress_common.h
index eca5656204f1..953e9a9dfd70 100644
--- a/db_stress_tool/db_stress_common.h
+++ b/db_stress_tool/db_stress_common.h
@@ -101,6 +101,7 @@ DECLARE_bool(verify_before_write);
 DECLARE_bool(histogram);
 DECLARE_bool(destroy_db_initially);
 DECLARE_bool(destroy_db_and_exit);
+DECLARE_string(delete_dir_and_exit);
 DECLARE_bool(verbose);
 DECLARE_bool(progress_reports);
 DECLARE_uint64(db_write_buffer_size);
diff --git a/db_stress_tool/db_stress_gflags.cc b/db_stress_tool/db_stress_gflags.cc
index 0678609ecaa8..c18ac0e11d79 100644
--- a/db_stress_tool/db_stress_gflags.cc
+++ b/db_stress_tool/db_stress_gflags.cc
@@ -139,6 +139,10 @@ DEFINE_bool(destroy_db_and_exit, false,
             "Destroys the database dir and exits. Useful for cleanup without "
             "running stress test. Other options are mostly ignored.");
 
+DEFINE_string(delete_dir_and_exit, "",
+              "Recursively deletes the specified directory and exits. "
+              "Useful for cleaning up TEST_TMPDIR after crash tests.");
+
 DEFINE_bool(verbose, false, "Verbose");
 
 DEFINE_bool(progress_reports, true,
diff --git a/db_stress_tool/db_stress_tool.cc b/db_stress_tool/db_stress_tool.cc
index 796a62b800bd..a32dcf557f52 100644
--- a/db_stress_tool/db_stress_tool.cc
+++ b/db_stress_tool/db_stress_tool.cc
@@ -111,6 +111,20 @@ int db_stress_tool(int argc, char** argv) {
     }
   }
 
+  // Handle --delete_dir_and_exit early, before other option validation
+  if (!FLAGS_delete_dir_and_exit.empty()) {
+    s = DestroyDir(raw_env, FLAGS_delete_dir_and_exit);
+    if (s.ok()) {
+      fprintf(stdout, "Successfully deleted directory %s\n",
+              FLAGS_delete_dir_and_exit.c_str());
+      return 0;
+    } else {
+      fprintf(stderr, "Failed to delete directory %s: %s\n",
+              FLAGS_delete_dir_and_exit.c_str(), s.ToString().c_str());
+      return 1;
+    }
+  }
+
   FLAGS_rep_factory = StringToRepFactory(FLAGS_memtablerep.c_str());
 
   // The number of background threads should be at least as much the

From c2fab4629b2e531f65d05613bf5390f62f48eff7 Mon Sep 17 00:00:00 2001
From: Facebook GitHub Bot <facebook-github-bot@users.noreply.github.com>
Date: Tue, 3 Feb 2026 11:16:23 -0800
Subject: [PATCH 457/500] Re-sync with internal repository

The internal and external repositories are out of sync. This Pull Request attempts to brings them back in sync by patching the GitHub repository. Please carefully review this patch. You must disable ShipIt for your project in order to merge this pull request. DO NOT IMPORT this pull request. Instead, merge it directly on GitHub using the MERGE BUTTON. Re-enable ShipIt after merging.

fbshipit-source-id: 08b287a3f343f6ac5872c2a059d91d1bed9ff0a8
---
 CLAUDE.md | 274 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 274 insertions(+)
 create mode 100644 CLAUDE.md

diff --git a/CLAUDE.md b/CLAUDE.md
new file mode 100644
index 000000000000..6cbdb32e1cbc
--- /dev/null
+++ b/CLAUDE.md
@@ -0,0 +1,274 @@
+# RocksDB Code Generation and Review Guidance
+
+This document provides guidance for generating and reviewing code in the RocksDB project, derived from analysis of code review feedback across hundreds of complex merged Pull Requests. Use this as a reference when writing code with AI assistants or conducting code reviews.
+
+---
+
+## General Best Practices
+
+### Code Quality and Maintainability
+
+**Clarity and Readability:** Write clear, self-documenting code. Use meaningful variable names, add comments for complex logic, and structure code to minimize cognitive load. Avoid clever tricks that sacrifice readability for marginal performance gains unless absolutely necessary.
+
+**Consistent Style:** Follow existing code style conventions. RocksDB uses `.clang-format` for formatting, specific naming conventions, and structural patterns. Deviations from these patterns are frequently flagged in reviews.
+
+**Error Handling:** Ensure robust error handling throughout the codebase. Use RocksDB's `Status` type consistently, propagate errors appropriately, and avoid silently ignoring failures. Reviewers pay close attention to edge cases and failure modes.
+
+### Testing Philosophy
+
+**Comprehensive Coverage:** Every change should include appropriate test coverage. This includes unit tests for isolated functionality, integration tests for component interactions, and stress tests for concurrency and performance validation. Reviewers will ask for additional tests if coverage is insufficient.
+
+**Edge Cases and Failure Modes:** Tests should explicitly cover edge cases, boundary conditions, and potential failure scenarios. This is especially important for changes affecting core database operations, compaction, or recovery logic.
+
+**Platform-Specific Testing:** RocksDB supports multiple platforms (Linux, Windows, macOS) and compilers (GCC, Clang, MSVC). Changes should be tested across relevant platforms, particularly when touching platform-specific code or using compiler-specific features.
+
+### Performance Considerations
+
+**Benchmarking and Profiling:** Performance claims should be backed by empirical evidence. Use RocksDB's benchmarking tools (e.g., `db_bench`) to validate improvements. Reviewers will request benchmark results for changes that could impact performance.
+
+**Avoid Premature Optimization:** Focus on correctness first, then optimize based on profiling data. Reviewers are skeptical of optimizations that add complexity without measurable benefit.
+
+**Memory and Resource Management:** Be mindful of memory allocations, especially in hot paths. Use RAII patterns, smart pointers, and RocksDB's memory management utilities appropriately.
+
+### API Design and Compatibility
+
+**Backwards Compatibility:** RocksDB maintains strong backwards compatibility guarantees. Breaking changes are rare and require extensive justification. When deprecating features, follow the project's deprecation policy (typically spanning multiple releases).
+
+**API Consistency:** New APIs should be consistent with existing patterns. Use similar naming conventions, parameter ordering, and return types. Reviewers will suggest changes to improve consistency with the broader codebase.
+
+**Documentation:** Public APIs must be thoroughly documented. Include usage examples, parameter descriptions, and notes on thread safety, performance characteristics, and compatibility considerations.
+
+---
+
+## Component-Specific Guidance
+
+### Database Core (`db`)
+
+The database core handles write-ahead logging (WAL), memtables, compaction, and recovery. This component receives the most scrutiny in code reviews.
+
+**Concurrency and Thread Safety:** Database operations are highly concurrent. Reviewers carefully examine locking strategies, atomic operations, and memory ordering. Document synchronization assumptions clearly. Use appropriate memory ordering semantics (`acquire`/`release` vs. `seq_cst`).
+
+**Compaction Logic:** Changes to compaction are complex and high-risk. Ensure that compaction logic respects configured parameters, handles edge cases (empty databases, single-file compactions), and maintains correctness under concurrent operations.
+
+**Error Propagation:** Database operations can fail in many ways (I/O errors, corruption, resource exhaustion). Ensure that errors are properly propagated, logged, and handled. Avoid assertions in production code paths.
+
+**Testing:** Database core changes require extensive testing, including unit tests, integration tests, and stress tests. Test with various configurations, compaction styles, and concurrent workloads.
+
+### Public Headers (`include`)
+
+Public headers define RocksDB's API surface. Changes here have the highest compatibility impact.
+
+**API Design:** New APIs should be intuitive, consistent with existing patterns, and well-documented. Consider how the API will be used in practice and avoid adding unnecessary complexity.
+
+**Backwards Compatibility:** Breaking changes to public APIs require extensive justification and a deprecation plan. Maintain ABI compatibility for bug fixes and patch releases.
+
+**Documentation:** Every public API must be thoroughly documented with usage examples, parameter descriptions, and notes on thread safety and performance characteristics.
+
+**Deprecation:** When deprecating APIs, follow the project's policy. Mark deprecated APIs clearly, provide migration guidance, and maintain support for at least one major release.
+
+### Internal Utilities (`util`)
+
+Internal utilities provide common functionality used throughout the codebase.
+
+**Code Reuse:** Utilities should be general-purpose and reusable. Avoid duplicating functionality that already exists elsewhere in the codebase.
+
+**Error Handling:** Utility functions should handle errors robustly and propagate them appropriately. Consider edge cases like overflow, underflow, and invalid inputs.
+
+**Testing:** Utility functions should have comprehensive test coverage, including edge cases and failure modes. Consider adding death tests for assertions.
+
+**Performance:** Utilities are often used in hot paths. Ensure that implementations are efficient and avoid unnecessary allocations or copies.
+
+### Table Management (`table`)
+
+Table management handles SST file format, block-based tables, and table readers/writers.
+
+**Block Format and Checksums:** Changes to block format require extreme care. Ensure that checksums are computed and verified correctly. Test with various compression algorithms and block sizes.
+
+**Iterator Correctness:** Table iterators are used throughout the codebase. Ensure that iterator semantics (Seek, Next, Prev) are correct, especially at boundaries and with deletions.
+
+**Caching and Prefetching:** Table readers interact with the block cache and prefetching logic. Ensure that cache keys are unique and that prefetching respects configured limits.
+
+**Performance:** Table operations are performance-critical. Benchmark changes that could impact read or write performance.
+
+### Utilities (`utilities`)
+
+Utilities include optional features like transactions, backup engine, and checkpoint.
+
+**Feature Isolation:** Utilities should be self-contained and not introduce unnecessary dependencies on core database internals.
+
+**Deprecation and Cleanup:** Legacy features are being phased out. When removing deprecated code, ensure that migration paths are documented and that users have sufficient warning.
+
+**Cross-Platform Compatibility:** Utilities often interact with OS-specific APIs. Ensure that code works on all supported platforms.
+
+### Options and Configuration (`options`)
+
+Options define RocksDB's configuration system.
+
+**Type Safety:** Use appropriate types for options (e.g., `uint32_t` for flags, scoped enums for enumerated values).
+
+**Deprecation Policy:** When deprecating options, follow the project's policy. Document the deprecation, provide migration guidance, and maintain support for at least one major release.
+
+**Dynamic Configuration:** Some options can be changed dynamically. Ensure that dynamic changes are thread-safe and take effect correctly.
+
+**Validation:** Validate option values and provide clear error messages for invalid configurations.
+
+### Cache (`cache`)
+
+Cache management is critical for RocksDB's performance.
+
+**Concurrency:** Cache operations are highly concurrent. Ensure that implementations are thread-safe and use appropriate synchronization primitives.
+
+**Performance:** Cache operations are in the hot path. Optimize for low latency and high throughput. Benchmark changes carefully.
+
+**Memory Management:** Cache implementations must manage memory carefully to avoid leaks and excessive allocations.
+
+**Eviction Policies:** Changes to eviction policies should be well-tested and benchmarked to ensure they improve overall performance.
+
+---
+
+## Code Review Checklist
+
+When reviewing RocksDB code (or preparing code for review), use this checklist:
+
+### Correctness
+- [ ] Does the change preserve database semantics (e.g., snapshot isolation, key ordering)?
+- [ ] Are all error cases handled appropriately?
+- [ ] Is the change thread-safe? Are synchronization primitives used correctly?
+- [ ] Are there any potential data races or deadlocks?
+
+### Testing
+- [ ] Does the change include appropriate test coverage?
+- [ ] Are edge cases and failure modes tested?
+- [ ] Have the tests been run on all supported platforms?
+- [ ] Are stress tests passing?
+
+### Performance
+- [ ] Are there benchmark results for performance-sensitive changes?
+- [ ] Does the change avoid unnecessary allocations or copies?
+- [ ] Are hot paths optimized appropriately?
+
+### API and Compatibility
+- [ ] Is the change backwards compatible?
+- [ ] Are new APIs consistent with existing patterns?
+- [ ] Is the public API documented?
+- [ ] Are deprecated features handled according to policy?
+
+### Code Quality
+- [ ] Does the code follow RocksDB's style conventions?
+- [ ] Is the code clear and maintainable?
+- [ ] Are comments and documentation sufficient?
+- [ ] Are there any code smells or anti-patterns?
+
+---
+
+## Common Review Feedback Patterns
+
+The following patterns emerged as frequent sources of review feedback:
+
+1. **Test Coverage:** Reviewers frequently request additional tests for edge cases, platform-specific behavior, and failure modes. Complex changes require comprehensive test coverage including unit tests, integration tests, and stress tests.
+
+2. **Error Handling:** Ensure proper error propagation using RocksDB's `Status` type. Avoid silent failures and provide clear error messages that include context about what failed and why.
+
+3. **API Design:** New APIs should be consistent with existing patterns. Use descriptive names that follow established conventions. Avoid breaking changes without strong justification and a clear deprecation plan.
+
+4. **Documentation:** Public APIs must be documented with usage examples and notes on thread safety, performance characteristics, and compatibility considerations. Complex internal logic should also be well-commented.
+
+5. **Performance:** Performance-sensitive changes require benchmark results to validate improvements. Use `db_bench` and other profiling tools to measure impact. Avoid premature optimization that adds complexity without measurable benefit.
+
+6. **Concurrency:** Thread safety is critical in RocksDB. Document synchronization assumptions clearly. Use appropriate memory ordering semantics. Consider potential race conditions and deadlocks.
+
+7. **Code Style:** Follow existing conventions for naming, formatting, and structure. Use `.clang-format` for consistent formatting. Prefer scoped enums (`enum class`) over unscoped enums.
+
+8. **Backwards Compatibility:** RocksDB maintains strong compatibility guarantees. Breaking changes require extensive justification. When deprecating features, provide migration guidance and maintain support across multiple releases.
+
+9. **Refactoring:** Reviewers appreciate refactoring that improves code readability and maintainability. Look for opportunities to deduplicate code and simplify complex logic.
+
+10. **Platform Compatibility:** Ensure changes work correctly on all supported platforms (Linux, Windows, macOS) and with all supported compilers (GCC, Clang, MSVC).
+
+---
+
+## Important tips
+
+### Build system
+* There are 3 build system. Make, CMake, BUCK(meta internal).
+* When a new .cc file is added, update Makefile, CMakeLists.txt, src.mk, BUCK.
+* Don't manually edit BUCK file, after updating src.mk, run
+    /usr/local/bin/python3 buckifier/buckify_rocksdb.py to update it
+* Use make to build and run the test. CMake and BUCK are not used locally.
+* Use `make dbg` command to build all of the unit test in debug mode.
+* For -j in make command, use the number of CPU cores to decide it.
+
+### Unit Test
+* After all of the unit tests are added, review them and try to extract common
+    reusable utility functions to reduce code duplication due to copy past between
+    unit tests. This should be done every time unit test is updated.
+* Don't use sleep to wait for certain events to happen. This will cause test to
+    be flaky. Instead, use sync point to synchronize thread progress.
+* Cap unit test execution with 60 seconds timeout.
+* When there are multiple unit tests need to be executed, try to use
+    gtest_parallel.py if available. E.g.
+    python3 ${GTEST_PARALLEL}/gtest_parallel.py ./table_test
+
+### Adding new public API
+    Refer to claude_md/add_public_api.md
+
+### Adding new option
+    Refer to claude_md/add_option.md
+
+### Metrics
+* When adding a new feature, evaluate whether there is opportunity to add
+    metrics. Try to avoid causing performance regression on hot path when adding
+    metrics.
+
+### Stress test
+* When adding a new feature, make sure stress test covers the new option.
+
+### DB bench update
+* When adding a performance related feature, support it in db_bench
+
+### Adding release note
+* Release note should be kept short at high level for external user consumption.
+
+### Final verification of the change
+* Execute make clean to clean all of the changes.
+* Execute make check to build all of the changes and execute all of the tests.
+    Note that executing all of the tests could take multiple minutes.
+
+### Monitoring make check progress
+* Use `make check-progress` to get machine-parseable JSON progress while
+    `make check` is running. This is useful for Claude Code to monitor long
+    builds without timeout issues.
+* Run `make check` in background, then poll progress:
+    ```bash
+    make check &
+    # Poll periodically:
+    make check-progress
+    ```
+* The output shows current phase and progress:
+    ```json
+    {"status":"running","phase":"compiling","completed":300,"total":919,...}
+    {"status":"running","phase":"testing","completed":1500,"total":29962,"failed":0,"percent":5,...}
+    {"status":"completed","phase":"testing","completed":29962,"total":29962,"failed":0,"percent":100,...}
+    ```
+* Phases: `compiling` -> `linking` -> `generating` -> `testing` -> `completed`
+* Key fields: `status`, `phase`, `completed`, `total`, `failed`, `percent`
+* When tests fail, `failed_tests` array shows details (up to 10 failures):
+    ```json
+    {"status":"running",...,"failed":3,"failed_tests":[
+      {"test":"cache_test-CacheTest.Usage","exit_code":1,"signal":0,"output":"...test log..."},
+      {"test":"env_test-EnvTest.Open","exit_code":0,"signal":11,"output":"...Segmentation fault..."}
+    ]}
+    ```
+* `exit_code`: non-zero means test assertion failed
+* `signal`: non-zero means test was killed (e.g., 9=SIGKILL, 6=SIGABRT, 11=SIGSEGV)
+* `output`: last 50 lines of test log including error messages and stack traces
+
+### Executing benchmark using db_bench
+* Since the goal is to measure performance, we need to build a release binary
+    using `make clean && DEBUG_LEVEL=0 make db_bench`. If there is an engine
+    crash due to bug, we need to switch back to debug build. Make sure to run
+    `make clean` before running `make dbg`.
+
+### Formatting code
+* After making change, use `make format-auto` to auto-apply formatting without
+    interactive prompts (Claude Code friendly).

From 48ec45d7bbc1d2c3526e1f1b6f0ac7495988a215 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Wed, 4 Feb 2026 15:11:09 -0800
Subject: [PATCH 458/500] Remove useless option
 CompressedSecondaryCacheOptions::compress_format_version (#14302)

Summary:
I don't think this option was ever useful. There was no compressed secondary cache compatibility issue that needed to accommodate compression format version 1. It was needlessly imported from legacy SST file formats. Version 1 is simply an inefficient format because it requires guessing the uncompressed size on decompression.

And as far as I know, we don't have any plans to make compressed secondary cache entries persistable across RocksDB versions. I.e. if persisting, we would simply tag the persistence layer with the version (perhaps major and minor) and throw out the cache whenever that changes. Then we don't have to deal with explicit schema versioning in persistenct caches. This is a workable approach because unlike SSTs, caches are not source-of-truth that need to survive version rollback.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14302

Test Plan: existing tests

Reviewed By: anand1976

Differential Revision: D92315003

Pulled By: pdillinger

fbshipit-source-id: 0b82cfdbd92bcd2b8fbddd6586824f53c88069c4
---
 cache/cache.cc                                      |  5 -----
 cache/compressed_secondary_cache.cc                 |  6 +-----
 cache/compressed_secondary_cache_test.cc            |  5 ++---
 db_stress_tool/db_stress_common.h                   |  1 -
 db_stress_tool/db_stress_gflags.cc                  |  6 ------
 db_stress_tool/db_stress_test_base.cc               |  1 -
 include/rocksdb/cache.h                             | 13 +------------
 tools/db_bench_tool.cc                              | 10 ----------
 tools/db_crashtest.py                               |  1 -
 .../remove_secondary_compress_format_version.md     |  1 +
 10 files changed, 5 insertions(+), 44 deletions(-)
 create mode 100644 unreleased_history/public_api_changes/remove_secondary_compress_format_version.md

diff --git a/cache/cache.cc b/cache/cache.cc
index 3556f61243e9..f94a379d200c 100644
--- a/cache/cache.cc
+++ b/cache/cache.cc
@@ -54,11 +54,6 @@ static std::unordered_map<std::string, OptionTypeInfo>
          {offsetof(struct CompressedSecondaryCacheOptions, compression_type),
           OptionType::kCompressionType, OptionVerificationType::kNormal,
           OptionTypeFlags::kMutable}},
-        {"compress_format_version",
-         {offsetof(struct CompressedSecondaryCacheOptions,
-                   compress_format_version),
-          OptionType::kUInt32T, OptionVerificationType::kNormal,
-          OptionTypeFlags::kMutable}},
         {"enable_custom_split_merge",
          {offsetof(struct CompressedSecondaryCacheOptions,
                    enable_custom_split_merge),
diff --git a/cache/compressed_secondary_cache.cc b/cache/compressed_secondary_cache.cc
index f570600339b8..5a53471725f2 100644
--- a/cache/compressed_secondary_cache.cc
+++ b/cache/compressed_secondary_cache.cc
@@ -50,8 +50,7 @@ CompressedSecondaryCache::CompressedSecondaryCache(
           std::make_shared<CacheReservationManagerImpl<CacheEntryRole::kMisc>>(
               cache_))),
       disable_cache_(opts.capacity == 0) {
-  auto mgr =
-      GetBuiltinCompressionManager(cache_options_.compress_format_version);
+  auto mgr = GetBuiltinCompressionManager(/*compression_format_version=*/2);
   compressor_ = mgr->GetCompressor(cache_options_.compression_opts,
                                    cache_options_.compression_type);
   decompressor_ =
@@ -356,9 +355,6 @@ std::string CompressedSecondaryCache::GetPrintableOptions() const {
                const_cast<CompressionOptions&>(cache_options_.compression_opts))
                .c_str());
   ret.append(buffer);
-  snprintf(buffer, kBufferSize, "    compress_format_version : %d\n",
-           cache_options_.compress_format_version);
-  ret.append(buffer);
   return ret;
 }
 
diff --git a/cache/compressed_secondary_cache_test.cc b/cache/compressed_secondary_cache_test.cc
index ebd7759bfd0b..845df62f72c0 100644
--- a/cache/compressed_secondary_cache_test.cc
+++ b/cache/compressed_secondary_cache_test.cc
@@ -856,8 +856,7 @@ TEST_P(CompressedSecondaryCacheTestWithCompressionParam, BasicTestFromString) {
     if (LZ4_Supported()) {
       sec_cache_uri =
           "compressed_secondary_cache://"
-          "capacity=2048;num_shard_bits=0;compression_type=kLZ4Compression;"
-          "compress_format_version=2";
+          "capacity=2048;num_shard_bits=0;compression_type=kLZ4Compression";
     } else {
       ROCKSDB_GTEST_SKIP("This test requires LZ4 support.");
       sec_cache_uri =
@@ -888,7 +887,7 @@ TEST_P(CompressedSecondaryCacheTestWithCompressionParam,
       sec_cache_uri =
           "compressed_secondary_cache://"
           "capacity=2048;num_shard_bits=0;compression_type=kLZ4Compression;"
-          "compress_format_version=2;enable_custom_split_merge=true";
+          "enable_custom_split_merge=true";
     } else {
       ROCKSDB_GTEST_SKIP("This test requires LZ4 support.");
       sec_cache_uri =
diff --git a/db_stress_tool/db_stress_common.h b/db_stress_tool/db_stress_common.h
index 953e9a9dfd70..5ec396235283 100644
--- a/db_stress_tool/db_stress_common.h
+++ b/db_stress_tool/db_stress_common.h
@@ -398,7 +398,6 @@ DECLARE_bool(enable_index_compression);
 DECLARE_uint32(index_shortening);
 DECLARE_uint32(metadata_charge_policy);
 DECLARE_bool(use_adaptive_mutex_lru);
-DECLARE_uint32(compress_format_version);
 DECLARE_uint64(manifest_preallocation_size);
 DECLARE_bool(enable_checksum_handoff);
 DECLARE_string(compression_manager);
diff --git a/db_stress_tool/db_stress_gflags.cc b/db_stress_tool/db_stress_gflags.cc
index c18ac0e11d79..eb2e7a7ca9a1 100644
--- a/db_stress_tool/db_stress_gflags.cc
+++ b/db_stress_tool/db_stress_gflags.cc
@@ -1390,12 +1390,6 @@ DEFINE_bool(use_adaptive_mutex_lru,
             ROCKSDB_NAMESPACE::LRUCacheOptions().use_adaptive_mutex,
             "LRUCacheOptions.use_adaptive_mutex");
 
-DEFINE_uint32(
-    compress_format_version,
-    static_cast<uint32_t>(ROCKSDB_NAMESPACE::CompressedSecondaryCacheOptions()
-                              .compress_format_version),
-    "CompressedSecondaryCacheOptions.compress_format_version");
-
 DEFINE_uint64(manifest_preallocation_size,
               ROCKSDB_NAMESPACE::Options().manifest_preallocation_size,
               "Options.manifest_preallocation_size");
diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc
index cbce41a89d2e..8f3737975501 100644
--- a/db_stress_tool/db_stress_test_base.cc
+++ b/db_stress_tool/db_stress_test_base.cc
@@ -152,7 +152,6 @@ std::shared_ptr<Cache> StressTest::NewCache(size_t capacity,
     }
     CompressedSecondaryCacheOptions opts;
     opts.capacity = FLAGS_compressed_secondary_cache_size;
-    opts.compress_format_version = FLAGS_compress_format_version;
     if (FLAGS_enable_do_not_compress_roles) {
       opts.do_not_compress_roles = {CacheEntryRoleSet::All()};
     }
diff --git a/include/rocksdb/cache.h b/include/rocksdb/cache.h
index 0d3603a8e262..f52d5246bbfe 100644
--- a/include/rocksdb/cache.h
+++ b/include/rocksdb/cache.h
@@ -306,13 +306,6 @@ struct CompressedSecondaryCacheOptions : LRUCacheOptions {
   // Options specific to the compression algorithm
   CompressionOptions compression_opts;
 
-  // compress_format_version can have two values:
-  // compress_format_version == 1 -- decompressed size is not included in the
-  // block header. DEPRECATED
-  // compress_format_version == 2 -- decompressed size is included in the block
-  // header in varint32 format.
-  uint32_t compress_format_version = 2;
-
   // Enable the custom split and merge feature, which split the compressed value
   // into chunks so that they may better fit jemalloc bins.
   bool enable_custom_split_merge = false;
@@ -330,7 +323,6 @@ struct CompressedSecondaryCacheOptions : LRUCacheOptions {
       CacheMetadataChargePolicy _metadata_charge_policy =
           kDefaultCacheMetadataChargePolicy,
       CompressionType _compression_type = CompressionType::kLZ4Compression,
-      uint32_t _compress_format_version = 2,
       bool _enable_custom_split_merge = false,
       const CacheEntryRoleSet& _do_not_compress_roles =
           {CacheEntryRole::kFilterBlock})
@@ -339,7 +331,6 @@ struct CompressedSecondaryCacheOptions : LRUCacheOptions {
                         _use_adaptive_mutex, _metadata_charge_policy,
                         _low_pri_pool_ratio),
         compression_type(_compression_type),
-        compress_format_version(_compress_format_version),
         enable_custom_split_merge(_enable_custom_split_merge),
         do_not_compress_roles(_do_not_compress_roles) {}
 
@@ -360,7 +351,6 @@ inline std::shared_ptr<SecondaryCache> NewCompressedSecondaryCache(
     CacheMetadataChargePolicy metadata_charge_policy =
         kDefaultCacheMetadataChargePolicy,
     CompressionType compression_type = CompressionType::kLZ4Compression,
-    uint32_t compress_format_version = 2,
     bool enable_custom_split_merge = false,
     const CacheEntryRoleSet& _do_not_compress_roles = {
         CacheEntryRole::kFilterBlock}) {
@@ -368,8 +358,7 @@ inline std::shared_ptr<SecondaryCache> NewCompressedSecondaryCache(
              capacity, num_shard_bits, strict_capacity_limit,
              high_pri_pool_ratio, low_pri_pool_ratio, memory_allocator,
              use_adaptive_mutex, metadata_charge_policy, compression_type,
-             compress_format_version, enable_custom_split_merge,
-             _do_not_compress_roles)
+             enable_custom_split_merge, _do_not_compress_roles)
       .MakeSharedSecondaryCache();
 }
 
diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc
index ac7a8066b54b..5eedc676f39a 100644
--- a/tools/db_bench_tool.cc
+++ b/tools/db_bench_tool.cc
@@ -626,14 +626,6 @@ DEFINE_int32(compressed_secondary_cache_compression_level,
              "dependent. If unset, we try to use the default for the library "
              "specified in `--compressed_secondary_cache_compression_type`");
 
-DEFINE_uint32(
-    compressed_secondary_cache_compress_format_version, 2,
-    "compress_format_version can have two values: "
-    "compress_format_version == 1 -- decompressed size is not included"
-    " in the block header."
-    "compress_format_version == 2 -- decompressed size is included"
-    " in the block header in varint32 format.");
-
 DEFINE_bool(use_tiered_cache, false,
             "If use_compressed_secondary_cache is true and "
             "use_tiered_volatile_cache is true, then allocate a tiered cache "
@@ -3215,8 +3207,6 @@ class Benchmark {
           FLAGS_compressed_secondary_cache_compression_type_e;
       secondary_cache_opts.compression_opts.level =
           FLAGS_compressed_secondary_cache_compression_level;
-      secondary_cache_opts.compress_format_version =
-          FLAGS_compressed_secondary_cache_compress_format_version;
       if (FLAGS_use_tiered_cache) {
         use_tiered_cache = true;
         adm_policy = StringToAdmissionPolicy(FLAGS_tiered_adm_policy.c_str());
diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index 707fdd27b594..970ba3939032 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -381,7 +381,6 @@ def apply_random_seed_per_iteration():
     "index_shortening": lambda: random.choice([0, 1, 2]),
     "metadata_charge_policy": lambda: random.choice([0, 1]),
     "use_adaptive_mutex_lru": lambda: random.choice([0, 1]),
-    "compress_format_version": lambda: random.choice([1, 2]),
     "manifest_preallocation_size": lambda: random.choice([0, 5 * 1024]),
     "enable_checksum_handoff": lambda: random.choice([0, 1]),
     "max_total_wal_size": lambda: random.choice([0] * 4 + [64 * 1024 * 1024]),
diff --git a/unreleased_history/public_api_changes/remove_secondary_compress_format_version.md b/unreleased_history/public_api_changes/remove_secondary_compress_format_version.md
new file mode 100644
index 000000000000..ecd17cfd7144
--- /dev/null
+++ b/unreleased_history/public_api_changes/remove_secondary_compress_format_version.md
@@ -0,0 +1 @@
+* Remove useless option `CompressedSecondaryCacheOptions::compress_format_version`

From 47344a0febad7626d21cf4aa75e7085a75cd17a2 Mon Sep 17 00:00:00 2001
From: Richard Barnes <rbarnes@meta.com>
Date: Thu, 5 Feb 2026 11:22:28 -0800
Subject: [PATCH 459/500] Fix string-conversion issue in
 internal_repo_rocksdb/repo/utilities/persistent_cache/volatile_tier_impl.cc
 +5 (#14296)

Summary:
Pull Request resolved: https://github.com/facebook/rocksdb/pull/14296

This could is triggering `-Wstring-conversion`, which presents as:
```
warning: implicit conversion turns string literal into bool: A to B
```
This is often a bug and what was intended. The most frequent cause is the code was:
```
void foo(bool) { ... }
void foo(std::string) { ... }
foo("this gets interpreted as a bool");
```

It is also possible the issue is innocuous as part of an assert:
```
assert(!"this string is true, so the assertion is false");
EXPECT_FALSE("this string is true, so the expect fails");
```
in these cases the use is to "cute", so we modify the code to make it more obvious.
```
assert(false && "the compiler recognizes and doesn't complain about this pattern");
FAIL() << "much more obvious";
```

Reviewed By: anand1976

Differential Revision: D92013593

fbshipit-source-id: 0b4e00339bef3f76fc5b9ad35e2383c5e4f828f9
---
 utilities/persistent_cache/volatile_tier_impl.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/utilities/persistent_cache/volatile_tier_impl.cc b/utilities/persistent_cache/volatile_tier_impl.cc
index eea119e6094b..44b6187d0417 100644
--- a/utilities/persistent_cache/volatile_tier_impl.cc
+++ b/utilities/persistent_cache/volatile_tier_impl.cc
@@ -106,7 +106,7 @@ Status VolatileCacheTier::Lookup(const Slice& page_key,
 }
 
 bool VolatileCacheTier::Erase(const Slice& /*key*/) {
-  assert(!"not supported");
+  assert(false && "not supported");
   return true;
 }
 

From 6ac0da313eb6e7dd0a1713fd1bf52e1fa107b670 Mon Sep 17 00:00:00 2001
From: Andrew Chang <andrewrchang@meta.com>
Date: Thu, 5 Feb 2026 15:22:52 -0800
Subject: [PATCH 460/500] Fix crash in GetLiveFilesStorageInfo on read-only DB
 (#14306)

Summary:
Pull Request resolved: https://github.com/facebook/rocksdb/pull/14306

GetLiveFilesStorageInfo crashes when called on a read-only RocksDB because
it calls FlushWAL(), which accesses logs_.back() on an empty deque.

Root cause: DBImplReadOnly overrides SyncWAL() to return NotSupported, but
does NOT override FlushWAL(). Read-only DBs have an empty logs_ deque
because they don't create WAL writers during recovery - there's nothing to
write, so no WAL infrastructure is initialized.

The reason SyncWAL was originally marked NotSupported is that these WAL
operations (SyncWAL syncs buffer to disk, FlushWAL flushes to OS buffer)
require an active WAL writer at logs_.back().writer. Since read-only DBs:
1. Cannot perform writes
2. Don't create WAL files for writing
3. Have an empty logs_ deque

...there's no WAL writer to sync or flush. The operations are semantically
meaningless, not just "forbidden write operations."

The fix adds a FlushWAL override matching the SyncWAL pattern. The caller
in db_filesnapshot.cc:403-405 already handles IsNotSupported() gracefully:
  if (s.IsNotSupported()) { s = Status::OK(); }

Reviewed By: pdillinger

Differential Revision: D92419557

fbshipit-source-id: 7079071209b3c7be41a2c98c9b691e68bc031595
---
 db/db_basic_test.cc           | 37 +++++++++++++++++++++++++++++++++++
 db/db_impl/db_impl_readonly.h |  5 +++++
 2 files changed, 42 insertions(+)

diff --git a/db/db_basic_test.cc b/db/db_basic_test.cc
index 003e761466b9..c33f08628d10 100644
--- a/db/db_basic_test.cc
+++ b/db/db_basic_test.cc
@@ -242,6 +242,43 @@ TEST_F(DBBasicTest, ReadOnlyDB) {
             Status::Code::kNotSupported);
 }
 
+TEST_F(DBBasicTest, ReadOnlyDBFlushWAL) {
+  // Test that FlushWAL returns NotSupported on read-only DB, and that
+  // GetLiveFilesStorageInfo works correctly even with manual_wal_flush=true.
+  // This is a regression test for a bug where GetLiveFilesStorageInfo would
+  // crash on read-only DBs with manual_wal_flush=true because FlushWAL
+  // accessed logs_.back() on an empty deque.
+  auto options = CurrentOptions();
+  options.manual_wal_flush = true;
+  DestroyAndReopen(options);
+  ASSERT_OK(Put("foo", "v1"));
+  ASSERT_OK(Put("bar", "v2"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put("baz", "v3"));  // Unflushed data in WAL
+  Close();
+
+  // Reopen as read-only
+  ASSERT_OK(ReadOnlyReopen(options));
+  ASSERT_EQ("v1", Get("foo"));
+  ASSERT_EQ("v2", Get("bar"));
+  ASSERT_EQ("v3", Get("baz"));
+
+  // FlushWAL should return NotSupported (not crash)
+  ASSERT_EQ(db_->FlushWAL(/*sync=*/false).code(), Status::Code::kNotSupported);
+  ASSERT_EQ(db_->FlushWAL(/*sync=*/true).code(), Status::Code::kNotSupported);
+
+  // GetLiveFilesStorageInfo should succeed (previously crashed with
+  // manual_wal_flush=true because it called FlushWAL which accessed
+  // logs_.back() on empty deque)
+  LiveFilesStorageInfoOptions lfsi_opts;
+  lfsi_opts.wal_size_for_flush = 0;
+  std::vector<LiveFileStorageInfo> files;
+  ASSERT_OK(db_->GetLiveFilesStorageInfo(lfsi_opts, &files));
+  ASSERT_GT(files.size(), 0);
+
+  Close();
+}
+
 TEST_F(DBBasicTest, ReadOnlyDBWithWriteDBIdToManifestSet) {
   auto options = CurrentOptions();
   options.write_dbid_to_manifest = false;
diff --git a/db/db_impl/db_impl_readonly.h b/db/db_impl/db_impl_readonly.h
index 3edfeb0e5508..2f456561cc30 100644
--- a/db/db_impl/db_impl_readonly.h
+++ b/db/db_impl/db_impl_readonly.h
@@ -121,6 +121,11 @@ class DBImplReadOnly : public DBImpl {
     return Status::NotSupported("Not supported operation in read only mode.");
   }
 
+  using DBImpl::FlushWAL;
+  Status FlushWAL(const FlushWALOptions& /*options*/) override {
+    return Status::NotSupported("Not supported operation in read only mode.");
+  }
+
   using DB::IngestExternalFile;
   Status IngestExternalFile(
       ColumnFamilyHandle* /*column_family*/,

From a668dcbe8c6823a987b04ab69c40d809af061000 Mon Sep 17 00:00:00 2001
From: anand76 <anand1976@users.noreply.github.com>
Date: Fri, 6 Feb 2026 10:47:19 -0800
Subject: [PATCH 461/500] Add Txn db support to ldb (#14304)

Summary:
This change adds the ability to open and operate on databases as TransactionDB in the ldb command-line tool.

  New Command-Line Options

  - --use_txn - Opens the database as a TransactionDB instead of a regular DB
  - --txn_write_policy=<0|1|2> - Sets the transaction write policy:
    - 0 = WRITE_COMMITTED (default)
    - 1 = WRITE_PREPARED
    - 2 = WRITE_UNPREPARED

  Use Case
                                                                                                                                                                                                                      This is needed to inspect or modify databases that were created with WritePrepared or WriteUnprepared transactions, which require opening via TransactionDB::Open() rather than the regular DB::Open().

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14304

Test Plan:
Tests (tools/ldb_test.py): Adds testTxnPutGet() covering:
    - Basic put/get/delete with TransactionDB
    - All three write policies
    - Validation that --use_txn and --ttl are mutually exclusive

Reviewed By: pdillinger

Differential Revision: D92323195

Pulled By: anand1976

fbshipit-source-id: 0a62b8ea4e2985feed977fad72595d6fff75db09
---
 include/rocksdb/utilities/ldb_cmd.h | 11 ++++++
 tools/ldb_cmd.cc                    | 53 +++++++++++++++++++++++++++--
 tools/ldb_test.py                   | 38 +++++++++++++++++++++
 tools/ldb_tool.cc                   |  7 ++++
 4 files changed, 107 insertions(+), 2 deletions(-)

diff --git a/include/rocksdb/utilities/ldb_cmd.h b/include/rocksdb/utilities/ldb_cmd.h
index 313b4ea33281..f5b1387d7042 100644
--- a/include/rocksdb/utilities/ldb_cmd.h
+++ b/include/rocksdb/utilities/ldb_cmd.h
@@ -23,6 +23,7 @@
 #include "rocksdb/slice.h"
 #include "rocksdb/utilities/db_ttl.h"
 #include "rocksdb/utilities/ldb_cmd_execute_result.h"
+#include "rocksdb/utilities/transaction_db.h"
 
 namespace ROCKSDB_NAMESPACE {
 
@@ -42,6 +43,8 @@ class LDBCommand {
   static const std::string ARG_TTL;
   static const std::string ARG_TTL_START;
   static const std::string ARG_TTL_END;
+  static const std::string ARG_USE_TXN;
+  static const std::string ARG_TXN_WRITE_POLICY;
   static const std::string ARG_TIMESTAMP;
   static const std::string ARG_TRY_LOAD_OPTIONS;
   static const std::string ARG_IGNORE_UNKNOWN_OPTIONS;
@@ -164,6 +167,7 @@ class LDBCommand {
   std::string column_family_name_;
   DB* db_;
   DBWithTTL* db_ttl_;
+  TransactionDB* db_txn_;
   std::map<std::string, ColumnFamilyHandle*> cf_handles_;
   std::map<uint32_t, const Comparator*> ucmps_;
 
@@ -182,6 +186,13 @@ class LDBCommand {
   /** If true, the value is treated as timestamp suffixed */
   bool is_db_ttl_;
 
+  /** If true, open the DB as TransactionDB */
+  bool is_db_txn_;
+
+  /** Transaction write policy (0=WRITE_COMMITTED, 1=WRITE_PREPARED,
+   * 2=WRITE_UNPREPARED) */
+  int txn_write_policy_;
+
   // If true, the kvs are output with their insert/modify timestamp in a ttl db
   bool timestamp_;
 
diff --git a/tools/ldb_cmd.cc b/tools/ldb_cmd.cc
index 8fa6d244e643..8d39987af1e3 100644
--- a/tools/ldb_cmd.cc
+++ b/tools/ldb_cmd.cc
@@ -71,6 +71,8 @@ const std::string LDBCommand::ARG_CF_NAME = "column_family";
 const std::string LDBCommand::ARG_TTL = "ttl";
 const std::string LDBCommand::ARG_TTL_START = "start_time";
 const std::string LDBCommand::ARG_TTL_END = "end_time";
+const std::string LDBCommand::ARG_USE_TXN = "use_txn";
+const std::string LDBCommand::ARG_TXN_WRITE_POLICY = "txn_write_policy";
 const std::string LDBCommand::ARG_TIMESTAMP = "timestamp";
 const std::string LDBCommand::ARG_TRY_LOAD_OPTIONS = "try_load_options";
 const std::string LDBCommand::ARG_DISABLE_CONSISTENCY_CHECKS =
@@ -479,10 +481,13 @@ LDBCommand::LDBCommand(const std::map<std::string, std::string>& options,
                        const std::vector<std::string>& valid_cmd_line_options)
     : db_(nullptr),
       db_ttl_(nullptr),
+      db_txn_(nullptr),
       is_read_only_(is_read_only),
       is_key_hex_(false),
       is_value_hex_(false),
       is_db_ttl_(false),
+      is_db_txn_(false),
+      txn_write_policy_(0),
       timestamp_(false),
       try_load_options_(false),
       create_if_missing_(false),
@@ -526,6 +531,21 @@ LDBCommand::LDBCommand(const std::map<std::string, std::string>& options,
   is_key_hex_ = IsKeyHex(options, flags);
   is_value_hex_ = IsValueHex(options, flags);
   is_db_ttl_ = IsFlagPresent(flags, ARG_TTL);
+  is_db_txn_ = IsFlagPresent(flags, ARG_USE_TXN);
+  itr = options.find(ARG_TXN_WRITE_POLICY);
+  if (itr != options.end()) {
+    try {
+      txn_write_policy_ = std::stoi(itr->second);
+      if (txn_write_policy_ < 0 || txn_write_policy_ > 2) {
+        fprintf(stderr, "Invalid txn_write_policy: %d. Must be 0, 1, or 2.\n",
+                txn_write_policy_);
+        txn_write_policy_ = 0;
+      }
+    } catch (const std::exception&) {
+      fprintf(stderr, "Invalid txn_write_policy value: %s\n",
+              itr->second.c_str());
+    }
+  }
   timestamp_ = IsFlagPresent(flags, ARG_TIMESTAMP);
   try_load_options_ = IsTryLoadOptions(options, flags);
   force_consistency_checks_ =
@@ -549,7 +569,34 @@ void LDBCommand::OpenDB() {
   // Open the DB.
   Status st;
   std::vector<ColumnFamilyHandle*> handles_opened;
-  if (is_db_ttl_) {
+  if (is_db_txn_) {
+    // TransactionDB mode
+    if (is_db_ttl_) {
+      exec_state_ = LDBCommandExecuteResult::Failed(
+          "Cannot use both --ttl and --use_txn flags together");
+      return;
+    }
+    if (!secondary_path_.empty() || !leader_path_.empty()) {
+      exec_state_ = LDBCommandExecuteResult::Failed(
+          "TransactionDB does not support secondary or follower mode");
+      return;
+    }
+    if (is_read_only_) {
+      exec_state_ = LDBCommandExecuteResult::Failed(
+          "TransactionDB does not support read-only mode");
+      return;
+    }
+    TransactionDBOptions txn_db_options;
+    txn_db_options.write_policy =
+        static_cast<TxnDBWritePolicy>(txn_write_policy_);
+    if (column_families_.empty()) {
+      st = TransactionDB::Open(options_, txn_db_options, db_path_, &db_txn_);
+    } else {
+      st = TransactionDB::Open(options_, txn_db_options, db_path_,
+                               column_families_, &handles_opened, &db_txn_);
+    }
+    db_ = db_txn_;
+  } else if (is_db_ttl_) {
     // ldb doesn't yet support TTL DB with multiple column families
     if (!column_family_name_.empty() || !column_families_.empty()) {
       exec_state_ = LDBCommandExecuteResult::Failed(
@@ -690,7 +737,9 @@ std::vector<std::string> LDBCommand::BuildCmdLineOptions(
                                   ARG_BLOB_FILE_STARTING_LEVEL,
                                   ARG_PREPOPULATE_BLOB_CACHE,
                                   ARG_IGNORE_UNKNOWN_OPTIONS,
-                                  ARG_CF_NAME};
+                                  ARG_CF_NAME,
+                                  ARG_USE_TXN,
+                                  ARG_TXN_WRITE_POLICY};
   ret.insert(ret.end(), options.begin(), options.end());
   return ret;
 }
diff --git a/tools/ldb_test.py b/tools/ldb_test.py
index 1be7ae2cc9e9..e91d521d5b5c 100644
--- a/tools/ldb_test.py
+++ b/tools/ldb_test.py
@@ -347,6 +347,44 @@ def testTtlPutGet(self):
         self.assertRunFAIL("get --ttl a3")
         self.assertRunOK("checkconsistency", "OK")
 
+    def testTxnPutGet(self):
+        print("Running testTxnPutGet...")
+        # Test basic put/get with TransactionDB (WriteCommitted - default)
+        self.assertRunOK("put t1 v1 --use_txn --create_if_missing", "OK")
+        self.assertRunOK("put t2 v2 --use_txn", "OK")
+        self.assertRunOK("put t3 v3 --use_txn", "OK")
+        # Verify data can be read back with TransactionDB
+        self.assertRunOK("batchput t4 v4 t5 v5 --use_txn", "OK")
+
+        # Test with WritePrepared policy (txn_write_policy=1)
+        self.assertRunOK("put t6 v6 --use_txn --txn_write_policy=1", "OK")
+
+        # Test with WriteUnprepared policy (txn_write_policy=2)
+        self.assertRunOK("put t7 v7 --use_txn --txn_write_policy=2", "OK")
+
+        # Verify all data persists and can be read without --use_txn
+        # (regular DB::Open should work for WriteCommitted data)
+        self.assertRunOK(
+            "scan",
+            "t1 ==> v1\nt2 ==> v2\nt3 ==> v3\nt4 ==> v4\nt5 ==> v5\nt6 ==> v6\nt7 ==> v7",
+        )
+
+        # Test delete with TransactionDB
+        self.assertRunOK("delete t3 --use_txn", "OK")
+        self.assertRunOK(
+            "scan",
+            "t1 ==> v1\nt2 ==> v2\nt4 ==> v4\nt5 ==> v5\nt6 ==> v6\nt7 ==> v7",
+        )
+
+        # Verify that --use_txn and --ttl cannot be used together
+        self.assertRunFAIL("put x1 y1 --use_txn --ttl --create_if_missing")
+
+        # Verify invalid txn_write_policy values are handled
+        # (values outside 0-2 should fall back to 0)
+        self.assertRunOK("put t8 v8 --use_txn --txn_write_policy=0", "OK")
+
+        self.assertRunOK("checkconsistency", "OK")
+
     def testInvalidCmdLines(self):  # noqa: F811 T25377293 Grandfathered in
         print("Running testInvalidCmdLines...")
         # db not specified
diff --git a/tools/ldb_tool.cc b/tools/ldb_tool.cc
index b2e19524e834..bee8d6f4f9ef 100644
--- a/tools/ldb_tool.cc
+++ b/tools/ldb_tool.cc
@@ -52,6 +52,13 @@ void LDBCommandRunner::PrintHelp(const LDBOptions& ldb_options,
   ret.append("  --" + LDBCommand::ARG_TTL +
              " with 'put','get','scan','dump','query','batchput'"
              " : DB supports ttl and value is internally timestamp-suffixed\n");
+  ret.append("  --" + LDBCommand::ARG_USE_TXN +
+             " : Open database as TransactionDB. Required for databases "
+             "created with WritePrepared or WriteUnprepared transactions.\n");
+  ret.append("  --" + LDBCommand::ARG_TXN_WRITE_POLICY +
+             "=<0|1|2> : Transaction write policy. "
+             "0=WRITE_COMMITTED (default), 1=WRITE_PREPARED, "
+             "2=WRITE_UNPREPARED\n");
   ret.append("  --" + LDBCommand::ARG_TRY_LOAD_OPTIONS +
              " : Try to load option file from DB. Default to true if " +
              LDBCommand::ARG_DB +

From 3695cb6767c0aaf7379616a702ff5ac352a17118 Mon Sep 17 00:00:00 2001
From: Anand Ananthabhotla <anand76@meta.com>
Date: Fri, 6 Feb 2026 10:51:21 -0800
Subject: [PATCH 462/500] Fix AbortIO consuming completions for non-aborted
 handles (#14301)

Summary:
Pull Request resolved: https://github.com/facebook/rocksdb/pull/14301

When AbortIO was called with a subset of outstanding async read handles,
it would consume io_uring completions for handles NOT in the abort set
but fail to finalize them. This caused subsequent Poll calls on those
handles to hang forever waiting for completions that had already been
consumed.

The fix adds an `is_being_aborted` flag to Posix_IOHandle that is set
when submitting the cancel request. When processing completions in
AbortIO, handles with this flag wait for req_count==2 (original + cancel),
while handles without the flag are finalized immediately at req_count==1.

Also refactored the completion finalization logic into a shared
FinalizeAsyncRead() helper function used by both Poll and AbortIO.

Reviewed By: mszeszko-meta, archang19

Differential Revision: D92230883

fbshipit-source-id: e6d11e009a4930e5608459771990f6cf7d46d827
---
 env/env_test.cc | 151 ++++++++++++++++++++++++++++++++++++++++++++++++
 env/fs_posix.cc |  38 ++++++------
 env/io_posix.h  |  26 +++++++++
 3 files changed, 194 insertions(+), 21 deletions(-)

diff --git a/env/env_test.cc b/env/env_test.cc
index c035a526c881..68c5c90e4c51 100644
--- a/env/env_test.cc
+++ b/env/env_test.cc
@@ -41,6 +41,9 @@
 #include "env/env_chroot.h"
 #include "env/env_encryption_ctr.h"
 #include "env/fs_readonly.h"
+#if defined(ROCKSDB_IOURING_PRESENT)
+#include "env/io_posix.h"
+#endif
 #include "env/mock_env.h"
 #include "env/unique_id_gen.h"
 #include "logging/log_buffer.h"
@@ -3919,6 +3922,154 @@ TEST_F(TestAsyncRead, AbortIOReversedHandles) {
                           /*use_direct_io=*/true, /*iterations=*/100);
 }
 
+// Test for bug fix: AbortIO with partial handles should correctly handle
+// completions for non-aborted handles.
+//
+// Previously, AbortIO would consume completions for non-aborted handles but
+// not set is_finished (since it expected req_count==2 for all handles).
+// This caused subsequent Poll calls to hang forever.
+//
+// The fix correctly detects handles not in the abort set and finalizes them
+// immediately when their completion arrives (at req_count==1).
+TEST_F(TestAsyncRead, AbortIOPartialHandlesBug) {
+#if defined(ROCKSDB_IOURING_PRESENT)
+  std::shared_ptr<FileSystem> fs = env_->GetFileSystem();
+  std::string fname = test::PerThreadDBPath(env_, "testfile_abortio_partial");
+
+  constexpr size_t kSectorSize = 4096;
+  constexpr size_t kFileSize = 2 * 1024 * 1024;  // 2MB
+
+  // 1. Create test file with direct I/O
+  {
+    std::unique_ptr<FSWritableFile> wfile;
+    FileOptions file_opts;
+    file_opts.use_direct_writes = true;
+    ASSERT_OK(fs->NewWritableFile(fname, file_opts, &wfile, nullptr));
+
+    size_t num_sectors = kFileSize / kSectorSize;
+    for (size_t i = 0; i < num_sectors; ++i) {
+      auto data = NewAligned(kSectorSize, static_cast<char>(i + 1));
+      Slice slice(data.get(), kSectorSize);
+      ASSERT_OK(wfile->Append(slice, IOOptions(), nullptr));
+    }
+    ASSERT_OK(wfile->Close(IOOptions(), nullptr));
+  }
+
+  // 2. Submit 3 ReadAsync requests, abort only the first one, then Poll the
+  // rest
+  {
+    FileOptions file_opts;
+    file_opts.use_direct_reads = true;
+    std::unique_ptr<FSRandomAccessFile> file;
+    ASSERT_OK(fs->NewRandomAccessFile(fname, file_opts, &file, nullptr));
+
+    IOOptions opts;
+    constexpr size_t kNumReads = 3;
+    std::vector<void*> io_handles(kNumReads);
+    std::vector<FSReadRequest> reqs(kNumReads);
+    std::vector<std::unique_ptr<char, Deleter>> data;
+    std::vector<size_t> vals;
+    IOHandleDeleter del_fn;
+    std::atomic<int> callbacks_invoked{0};
+
+    // H0: 1MB read, H1: 4KB read, H2: 4KB read
+    std::vector<std::pair<uint64_t, size_t>> read_specs = {
+        {0, 1024 * 1024},            // H0: 1MB at offset 0
+        {1024 * 1024, 4096},         // H1: 4KB at offset 1MB
+        {1024 * 1024 + 4096, 4096},  // H2: 4KB at offset 1MB+4KB
+    };
+
+    for (size_t i = 0; i < kNumReads; i++) {
+      reqs[i].offset = read_specs[i].first;
+      reqs[i].len = read_specs[i].second;
+      data.emplace_back(NewAligned(reqs[i].len, 0));
+      reqs[i].scratch = data.back().get();
+      vals.push_back(i);
+    }
+
+    std::function<void(FSReadRequest&, void*)> callback =
+        [&](FSReadRequest& req, void* cb_arg) {
+          size_t i = *(reinterpret_cast<size_t*>(cb_arg));
+          reqs[i].status = req.status;
+          callbacks_invoked++;
+        };
+
+    // Submit all ReadAsync requests
+    for (size_t i = 0; i < kNumReads; i++) {
+      void* cb_arg = static_cast<void*>(&(vals[i]));
+      IOStatus s = file->ReadAsync(reqs[i], opts, callback, cb_arg,
+                                   &(io_handles[i]), &del_fn, nullptr);
+      if (s.IsNotSupported()) {
+        // io_uring not supported, clean up and skip
+        for (size_t j = 0; j < i; j++) {
+          if (io_handles[j]) {
+            del_fn(io_handles[j]);
+          }
+        }
+        ASSERT_OK(fs->DeleteFile(fname, IOOptions(), nullptr));
+        return;
+      }
+      ASSERT_OK(s);
+    }
+
+    // Wait for reads to complete in io_uring (completions in queue but not
+    // consumed). 5 seconds should be plenty for direct I/O reads to complete.
+    std::this_thread::sleep_for(std::chrono::seconds(5));
+
+    // Abort ONLY H0 - this will consume all completions but should correctly
+    // finalize H1 and H2 (since they're not in the abort set).
+    std::vector<void*> abort_handles = {io_handles[0]};
+    ASSERT_OK(fs->AbortIO(abort_handles));
+
+    // Verify H0 is finished (aborted)
+    Posix_IOHandle* h0 = static_cast<Posix_IOHandle*>(io_handles[0]);
+    ASSERT_TRUE(h0->is_finished);
+    ASSERT_EQ(h0->req_count, 2u);  // original + cancel
+
+    // Verify H1 and H2 are finished (read completed, not aborted)
+    Posix_IOHandle* h1 = static_cast<Posix_IOHandle*>(io_handles[1]);
+    Posix_IOHandle* h2 = static_cast<Posix_IOHandle*>(io_handles[2]);
+    ASSERT_TRUE(h1->is_finished);
+    ASSERT_TRUE(h2->is_finished);
+    ASSERT_EQ(h1->req_count, 1u);  // only original (no cancel)
+    ASSERT_EQ(h2->req_count, 1u);  // only original (no cancel)
+
+    // Poll on H1, H2 - should return immediately since they're already finished
+    // Note: Poll must be called from the same thread (io_uring is thread-local)
+    std::vector<void*> poll_handles = {io_handles[1], io_handles[2]};
+
+    // Use a watchdog to detect hang (regression test)
+    std::atomic<bool> poll_completed{false};
+    std::thread watchdog([&]() {
+      for (int i = 0; i < 500; i++) {  // 5 seconds timeout
+        std::this_thread::sleep_for(std::chrono::milliseconds(10));
+        if (poll_completed) return;
+      }
+      // Bug regression: Poll hung
+      _exit(1);
+    });
+
+    fs->Poll(poll_handles, poll_handles.size());
+    poll_completed = true;
+    watchdog.join();
+
+    // Verify all callbacks were invoked
+    ASSERT_EQ(callbacks_invoked.load(), 3);
+
+    // Clean up handles
+    for (size_t i = 0; i < kNumReads; i++) {
+      if (io_handles[i]) {
+        del_fn(io_handles[i]);
+      }
+    }
+  }
+
+  ASSERT_OK(fs->DeleteFile(fname, IOOptions(), nullptr));
+#else
+  (void)env_;  // Suppress unused variable warning
+#endif
+}
+
 struct StaticDestructionTester {
   bool activated = false;
   ~StaticDestructionTester() {
diff --git a/env/fs_posix.cc b/env/fs_posix.cc
index de8152d781bf..7080eef1a09d 100644
--- a/env/fs_posix.cc
+++ b/env/fs_posix.cc
@@ -1129,25 +1129,7 @@ class PosixFileSystem : public FileSystem {
         // Reset cqe data to catch any stray reuse of it
         static_cast<struct io_uring_cqe*>(cqe)->user_data = 0xd5d5d5d5d5d5d5d5;
 
-        FSReadRequest req;
-        req.scratch = posix_handle->scratch;
-        req.offset = posix_handle->offset;
-        req.len = posix_handle->len;
-
-        size_t finished_len = 0;
-        size_t bytes_read = 0;
-        bool read_again = false;
-        UpdateResult(cqe, "", req.len, posix_handle->iov.iov_len,
-                     true /*async_read*/, posix_handle->use_direct_io,
-                     posix_handle->alignment, finished_len, &req, bytes_read,
-                     read_again);
-        posix_handle->is_finished = true;
-        io_uring_cqe_seen(iu, cqe);
-        posix_handle->cb(req, posix_handle->cb_arg);
-
-        (void)finished_len;
-        (void)bytes_read;
-        (void)read_again;
+        FinalizeAsyncRead(iu, cqe, posix_handle);
 
         if (static_cast<Posix_IOHandle*>(io_handles[i]) == posix_handle) {
           break;
@@ -1188,6 +1170,11 @@ class PosixFileSystem : public FileSystem {
         return IOStatus::IOError("");
       }
 
+      // Mark this handle as being aborted. This is used when processing
+      // completions to distinguish between aborted handles (expect 2
+      // completions: original + cancel) and non-aborted handles (expect 1).
+      posix_handle->is_being_aborted = true;
+
       // Prepare the cancel request.
       struct io_uring_sqe* sqe;
       sqe = io_uring_get_sqe(iu);
@@ -1234,6 +1221,14 @@ class PosixFileSystem : public FileSystem {
         }
         posix_handle->req_count++;
 
+        if (!posix_handle->is_being_aborted) {
+          // This is a completion for a handle NOT being aborted.
+          // It only has 1 outstanding request (the original read), so we
+          // should finalize it now.
+          FinalizeAsyncRead(iu, cqe, posix_handle);
+          continue;
+        }
+
         // Reset cqe data to catch any stray reuse of it
         static_cast<struct io_uring_cqe*>(cqe)->user_data = 0xd5d5d5d5d5d5d5d5;
         io_uring_cqe_seen(iu, cqe);
@@ -1247,8 +1242,9 @@ class PosixFileSystem : public FileSystem {
         // - And finally, if the request to cancel wasn't
         //   found, the cancel request is completed with -ENOENT.
         //
-        // Every handle has to wait for 2 requests completion: original one and
-        // the cancel request which is tracked by PosixHandle::req_count.
+        // Every handle being aborted has to wait for 2 requests completion:
+        // original one and the cancel request which is tracked by
+        // PosixHandle::req_count.
         // Note: We must mark is_finished and invoke the callback for ANY handle
         // that reaches req_count == 2, not just the one we're currently waiting
         // for (io_handles[i]). Otherwise, if completions arrive out of order,
diff --git a/env/io_posix.h b/env/io_posix.h
index ca33b8e3e948..f8acffd60892 100644
--- a/env/io_posix.h
+++ b/env/io_posix.h
@@ -127,6 +127,7 @@ struct Posix_IOHandle {
         use_direct_io(_use_direct_io),
         alignment(_alignment),
         is_finished(false),
+        is_being_aborted(false),
         req_count(0) {}
 
   struct iovec iov;
@@ -139,6 +140,10 @@ struct Posix_IOHandle {
   bool use_direct_io;
   size_t alignment;
   bool is_finished;
+  // is_being_aborted is set by AbortIO when a cancel request is submitted.
+  // Used to distinguish between aborted handles (expect 2 completions) and
+  // non-aborted handles (expect 1 completion) when processing completions.
+  bool is_being_aborted;
   // req_count is used by AbortIO API to keep track of number of requests.
   uint32_t req_count;
 };
@@ -197,6 +202,27 @@ inline void UpdateResult(struct io_uring_cqe* cqe, const std::string& file_name,
   (void)len;
 #endif
 }
+
+// Finalize a completed async read request.
+// Processes the CQE result, marks the handle as finished, and invokes the
+// callback. This is shared between Poll and AbortIO (for non-aborted handles).
+inline void FinalizeAsyncRead(struct io_uring* iu, struct io_uring_cqe* cqe,
+                              Posix_IOHandle* posix_handle) {
+  FSReadRequest req;
+  req.scratch = posix_handle->scratch;
+  req.offset = posix_handle->offset;
+  req.len = posix_handle->len;
+
+  size_t finished_len = 0;
+  size_t bytes_read = 0;
+  bool read_again = false;
+  UpdateResult(cqe, "", req.len, posix_handle->iov.iov_len, true /*async_read*/,
+               posix_handle->use_direct_io, posix_handle->alignment,
+               finished_len, &req, bytes_read, read_again);
+  posix_handle->is_finished = true;
+  io_uring_cqe_seen(iu, cqe);
+  posix_handle->cb(req, posix_handle->cb_arg);
+}
 #endif
 
 #ifdef OS_LINUX

From 8f9cb1a7086bdf6504c84cba66dbaa50ea7a084e Mon Sep 17 00:00:00 2001
From: Ryan Hancock <krhancock@meta.com>
Date: Fri, 6 Feb 2026 11:29:30 -0800
Subject: [PATCH 463/500] Introduce Memory restrictions for IO Dispatcher.
 (#14300)

Summary:
Introduction of memory limiter for IO Dispatch.

Currently, the user has no way of enacting policy with IO dispatcher. One important policy is the ability to restrict the amount of memory a multiscan or set of multiscans is allowed to pin. This PR introduces the max_prefetch_memory_bytes in the IODispatcherOptions, allowing for users to specify bounds on block cache memory usage.

There seems to be a minor performance increase however, I have found the scans to be a bit noisy. Each benchmark is run with a stride size of 30000 keys. This was done to ensure we maintain parity with trunk.
```
Configuration: 10 concurrent scans, 1024B values, 5242880 byte SST files
Scan sizes: 1024 keys = 1MiB, 2048 keys = 2MiB, 4096 keys = 4MiB per scan

| Keys/Scan | Mode  | Main (ops/sec)   | Main (us/op)     | limiter              (ops/sec) | limiter            (us/op) | Delta ops/sec |
|-----------|-------|------------------|------------------|----------------------|--------------------|---------------|
| 1024      | sync  |   151.6 +/- 8.0   | 6591.14 +/- 343.30 |    170.6 +/- 4.0      | 5855.32 +/- 136.19   | +12.00%       |
| 1024      | async |   156.4 +/- 24.7  | 6589.64 +/- 1345.73 |    173.8 +/- 2.7      | 5744.51 +/- 91.35   | +11.00%       |
| 2048      | sync  |    77.8 +/- 1.6   | 12785.64 +/- 286.49 |     87.6 +/- 3.4      | 11354.01 +/- 441.71   | +12.00%       |
| 2048      | async |    85.6 +/- 4.7   | 11658.11 +/- 618.49 |     91.4 +/- 1.2      | 10873.63 +/- 143.49   | +6.00%        |
| 4096      | sync  |    43.2 +/- 1.5   | 22932.27 +/- 730.66 |     43.8 +/- 0.7      | 22563.90 +/- 320.93   | +1.00%        |
| 4096      | async |    45.4 +/- 0.8   | 21875.64 +/- 357.04 |     46.2 +/- 0.7      | 21416.95 +/- 311.89   | +1.00%        |

```

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14300

Reviewed By: anand1976

Differential Revision: D92316556

Pulled By: krhancoc

fbshipit-source-id: dc0b7958a33b8ef5fa5af82b1c6d960041837fc1
---
 include/rocksdb/io_dispatcher.h | 166 ++++--
 include/rocksdb/statistics.h    |   8 +
 monitoring/statistics.cc        |   4 +
 util/io_dispatcher_imp.cc       | 471 +++++++++++++++--
 util/io_dispatcher_imp.h        |   5 +-
 util/io_dispatcher_test.cc      | 896 ++++++++++++++++++++++++++++++++
 6 files changed, 1468 insertions(+), 82 deletions(-)

diff --git a/include/rocksdb/io_dispatcher.h b/include/rocksdb/io_dispatcher.h
index 9c3fefd640b8..6354d72ad36d 100644
--- a/include/rocksdb/io_dispatcher.h
+++ b/include/rocksdb/io_dispatcher.h
@@ -6,8 +6,10 @@
 #pragma once
 
 #include <atomic>
+#include <functional>
 #include <memory>
 #include <unordered_map>
+#include <unordered_set>
 #include <vector>
 
 #include "rocksdb/options.h"
@@ -17,6 +19,22 @@
 namespace ROCKSDB_NAMESPACE {
 
 class FileSystem;
+class Statistics;
+
+// Forward declaration for internal implementation
+struct IODispatcherImplData;
+struct PendingPrefetchRequest;
+
+// Options for configuring IODispatcher behavior
+struct IODispatcherOptions {
+  // Maximum memory (in bytes) for prefetching across all ReadSets.
+  // When this limit is reached, SubmitJob() blocks until memory is released.
+  // Set to 0 (default) for unlimited prefetch memory.
+  size_t max_prefetch_memory_bytes = 0;
+
+  // Optional statistics for tracking memory limiter metrics
+  Statistics* statistics = nullptr;
+};
 
 /*
  * IODispatcher is a class that allows users to submit groups of IO jobs to be
@@ -33,51 +51,88 @@ class FileSystem;
  * dispatcher, allowing for future ratelimiting and smarter dispatching policies
  * in the future.
  *
-* Example:
- // Submitting an IO job and reading blocks:
- //
- // std::shared_ptr<IOJob> job = std::make_shared<IOJob>();
- // job->table = table_reader;  // Provided BlockBasedTable*
- // job->job_options.io_coalesce_threshold = 32 * 1024;
- // job->job_options.read_options = read_options;  // Provided ReadOptions
- //
- // // Populate the job with block handles (e.g., from an index/iterator)
- // job->block_handles.push_back(handle1);
- // job->block_handles.push_back(handle2);
- // job->block_handles.push_back(handle3);
- //
- // std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher());
- // std::shared_ptr<ReadSet> read_set;
- // Status s = dispatcher->SubmitJob(job, &read_set);
- // if (!s.ok()) {
- //   // Handle submit error
- // }
- //
- // // Read by index
- // for (size_t i = 1; i < job->block_handles.size(); ++i) {
- //   CachableEntry<Block> block_entry;
- //   Status rs = read_set->ReadIndex(i, &block_entry);
- //   if (!rs.ok()) {
- //     // Handle read error
- //     continue;
- //   }
- //   // Use block_entry (block contents are pinned here)
- // }
- //
- // // Or read by byte offset
- // {
- //   size_t offset = static_cast<size_t>(job->block_handles.front().offset());
- //   CachableEntry<Block> block_entry;
- //   Status rs = read_set->ReadOffset(offset, &block_entry);
- //   if (rs.ok()) {
- //     // Use block_entry
- //   }
- // }
- //
- // // Stats
- // uint64_t cache_hits = read_set->GetNumCacheHits();
- // uint64_t async_reads = read_set->GetNumAsyncReads();
- // uint64_t sync_reads = read_set->GetNumSyncReads();
+ * Example 1: Basic Usage
+ * ----------------------
+ * // Submitting an IO job and reading blocks:
+ * //
+ * // std::shared_ptr<IOJob> job = std::make_shared<IOJob>();
+ * // job->table = table_reader;  // Provided BlockBasedTable*
+ * // job->job_options.io_coalesce_threshold = 32 * 1024;
+ * // job->job_options.read_options = read_options;  // Provided ReadOptions
+ * //
+ * // // Populate the job with block handles (e.g., from an index/iterator)
+ * // job->block_handles.push_back(handle1);
+ * // job->block_handles.push_back(handle2);
+ * // job->block_handles.push_back(handle3);
+ * //
+ * // std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher());
+ * // std::shared_ptr<ReadSet> read_set;
+ * // Status s = dispatcher->SubmitJob(job, &read_set);
+ * // if (!s.ok()) {
+ * //   // Handle submit error
+ * // }
+ * //
+ * // // Read by index
+ * // for (size_t i = 1; i < job->block_handles.size(); ++i) {
+ * //   CachableEntry<Block> block_entry;
+ * //   Status rs = read_set->ReadIndex(i, &block_entry);
+ * //   if (!rs.ok()) {
+ * //     // Handle read error
+ * //     continue;
+ * //   }
+ * //   // Use block_entry (block contents are pinned here)
+ * // }
+ * //
+ * // // Or read by byte offset
+ * // {
+ * //   size_t offset =
+ static_cast<size_t>(job->block_handles.front().offset());
+ * //   CachableEntry<Block> block_entry;
+ * //   Status rs = read_set->ReadOffset(offset, &block_entry);
+ * //   if (rs.ok()) {
+ * //     // Use block_entry
+ * //   }
+ * // }
+ * //
+ * // // Stats
+ * // uint64_t cache_hits = read_set->GetNumCacheHits();
+ * // uint64_t async_reads = read_set->GetNumAsyncReads();
+ * // uint64_t sync_reads = read_set->GetNumSyncReads();
+ *
+ * Example 2: Memory-Limited Prefetching
+ * -------------------------------------
+ * // Configure a memory budget for prefetching to prevent unbounded memory use.
+ * // When the budget is exceeded, IODispatcher uses "partial prefetch":
+ * //   - Dispatches as many blocks as fit in available memory (earlier first)
+ * //   - Queues remaining blocks for later dispatch when memory is released
+ * //   - Never blocks on SubmitJob - remaining blocks are read on-demand
+ * //
+ * // IODispatcherOptions opts;
+ * // opts.max_prefetch_memory_bytes = 64 * 1024 * 1024;  // 64MB budget
+ * // opts.statistics = db_options.statistics.get();      // Optional metrics
+ * //
+ * // std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(opts));
+ * //
+ * // // Submit a job that needs more memory than available
+ * // // Partial prefetch will dispatch what fits immediately
+ * // std::shared_ptr<ReadSet> read_set;
+ * // Status s = dispatcher->SubmitJob(job, &read_set);  // Never blocks
+ * //
+ * // // Read blocks in order - earlier blocks are more likely to be prefetched
+ * // for (size_t i = 0; i < job->block_handles.size(); ++i) {
+ * //   CachableEntry<Block> block;
+ * //   Status rs = read_set->ReadIndex(i, &block);
+ * //   // Use block...
+ * //
+ * //   // Release block when done to free memory for pending prefetches
+ * //   read_set->ReleaseBlock(i);  // Triggers dispatch of queued blocks
+ * // }
+ * //
+ * // Memory limiting statistics (when statistics is configured):
+ * // - PREFETCH_MEMORY_BYTES_GRANTED: Total bytes acquired for prefetching
+ * // - PREFETCH_MEMORY_BYTES_RELEASED: Total bytes released after use
+ * // - PREFETCH_MEMORY_REQUESTS_BLOCKED: Number of blocks that couldn't be
+ * //   prefetched immediately due to memory pressure
 
  */
 
@@ -180,6 +235,13 @@ class ReadSet {
   // blocks are coalesced into a single IO request.
   std::unordered_map<size_t, std::shared_ptr<AsyncIOState>> async_io_map_;
 
+  // For memory release notifications back to dispatcher (weak ref to avoid
+  // cycles)
+  std::weak_ptr<IODispatcherImplData> dispatcher_data_;
+
+  // Size of each block (parallel to pinned_blocks_) for memory accounting
+  std::vector<size_t> block_sizes_;
+
   // Statistics counters
   std::atomic<uint64_t> num_sync_reads_ = 0;
   std::atomic<uint64_t> num_async_reads_ = 0;
@@ -191,6 +253,16 @@ class ReadSet {
 
   // Perform synchronous read for a specific block
   Status SyncRead(size_t block_index);
+
+  // Remove a block from pending prefetch (called by ReadIndex/ReleaseBlock)
+  void RemoveFromPending(size_t block_index);
+
+  // Atomic flags indicating if block is pending prefetch (lock-free check)
+  std::unique_ptr<std::atomic<bool>[]> pending_prefetch_flags_;
+  size_t pending_prefetch_flags_size_ = 0;
+
+  // Reference to pending request (for removal notification)
+  std::shared_ptr<PendingPrefetchRequest> pending_request_;
 };
 
 /*
@@ -218,8 +290,12 @@ class IODispatcher {
                            std::shared_ptr<ReadSet>* read_set) = 0;
 };
 
+// Create IODispatcher with default options (no memory limit)
 IODispatcher* NewIODispatcher();
 
+// Create IODispatcher with custom options
+IODispatcher* NewIODispatcher(const IODispatcherOptions& options);
+
 // TrackingIODispatcher wraps another IODispatcher and tracks all ReadSets
 // created. This is useful for testing to verify IO statistics.
 class TrackingIODispatcher : public IODispatcher {
diff --git a/include/rocksdb/statistics.h b/include/rocksdb/statistics.h
index ae4ef5792408..7cecac05f7a1 100644
--- a/include/rocksdb/statistics.h
+++ b/include/rocksdb/statistics.h
@@ -575,6 +575,14 @@ enum Tickers : uint32_t {
   // # of seeks that failed validation (out of order, etc.)
   MULTISCAN_SEEK_ERRORS,
 
+  // IODispatcher memory limiting statistics
+  // # of bytes granted to prefetch requests
+  PREFETCH_MEMORY_BYTES_GRANTED,
+  // # of bytes released from prefetch memory
+  PREFETCH_MEMORY_BYTES_RELEASED,
+  // # of prefetch requests that were blocked waiting for memory
+  PREFETCH_MEMORY_REQUESTS_BLOCKED,
+
   TICKER_ENUM_MAX
 };
 
diff --git a/monitoring/statistics.cc b/monitoring/statistics.cc
index ccc92bcb6152..e6060cbeac20 100644
--- a/monitoring/statistics.cc
+++ b/monitoring/statistics.cc
@@ -292,6 +292,10 @@ const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
     {MULTISCAN_IO_COALESCED_NONADJACENT,
      "rocksdb.multiscan.io.coalesced.nonadjacent"},
     {MULTISCAN_SEEK_ERRORS, "rocksdb.multiscan.seek.errors"},
+    {PREFETCH_MEMORY_BYTES_GRANTED, "rocksdb.prefetch.memory.bytes.granted"},
+    {PREFETCH_MEMORY_BYTES_RELEASED, "rocksdb.prefetch.memory.bytes.released"},
+    {PREFETCH_MEMORY_REQUESTS_BLOCKED,
+     "rocksdb.prefetch.memory.requests.blocked"},
 };
 
 const std::vector<std::pair<Histograms, std::string>> HistogramsNameMap = {
diff --git a/util/io_dispatcher_imp.cc b/util/io_dispatcher_imp.cc
index fd7d590185fc..2789414860c7 100644
--- a/util/io_dispatcher_imp.cc
+++ b/util/io_dispatcher_imp.cc
@@ -14,12 +14,15 @@
 
 #include "util/io_dispatcher_imp.h"
 
+#include <deque>
 #include <memory>
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
 
 #include "file/random_access_file_reader.h"
+#include "monitoring/statistics_impl.h"
+#include "port/port.h"
 #include "rocksdb/file_system.h"
 #include "rocksdb/io_dispatcher.h"
 #include "rocksdb/options.h"
@@ -28,9 +31,19 @@
 #include "table/block_based/cachable_entry.h"
 #include "table/block_based/reader_common.h"
 #include "table/format.h"
+#include "test_util/sync_point.h"
+#include "util/mutexlock.h"
 
 namespace ROCKSDB_NAMESPACE {
 
+// IODispatcherImplData is the base that provides ReleaseMemory interface
+// for ReadSets to call back when releasing blocks. Defined here so it's
+// visible to ReadSet methods.
+struct IODispatcherImplData {
+  virtual ~IODispatcherImplData() = default;
+  virtual void ReleaseMemory(size_t bytes) = 0;
+};
+
 // Helper function to create and pin a block from a buffer
 // Used by both ReadSet::PollAndProcessAsyncIO and IODispatcherImpl::Impl
 static Status CreateAndPinBlockFromBuffer(
@@ -98,6 +111,18 @@ struct AsyncIOState {
 // Must call AbortIO before deleting handles to avoid use-after-free when
 // io_uring completions arrive for deleted handles.
 ReadSet::~ReadSet() {
+  // Release memory for any blocks still pinned
+  // Note: block_sizes_[i] is only set for async IO reads where memory
+  // limiting applies. For sync reads, block_sizes_ remains 0, so this
+  // loop is effectively a no-op for sync reads.
+  if (auto dispatcher_data = dispatcher_data_.lock()) {
+    for (size_t i = 0; i < block_sizes_.size(); ++i) {
+      if (block_sizes_[i] > 0 && pinned_blocks_[i].GetValue()) {
+        dispatcher_data->ReleaseMemory(block_sizes_[i]);
+      }
+    }
+  }
+
   if (async_io_map_.empty()) {
     return;
   }
@@ -173,6 +198,9 @@ Status ReadSet::ReadIndex(size_t block_index, CachableEntry<Block>* out) {
   }
 
   // Case 3: Block needs synchronous read
+  // If this block was pending prefetch, remove it since we're reading it now
+  RemoveFromPending(block_index);
+
   Status s = SyncRead(block_index);
   if (s.ok()) {
     *out = std::move(pinned_blocks_[block_index]);
@@ -219,6 +247,22 @@ void ReadSet::ReleaseBlock(size_t block_index) {
   if (block_index >= pinned_blocks_.size()) {
     return;
   }
+
+  // Remove from pending if applicable
+  RemoveFromPending(block_index);
+
+  // Release memory BEFORE unpinning
+  // Note: block_sizes_[idx] is only set for async IO reads where memory
+  // limiting applies. For sync reads, block_sizes_ remains 0, so this
+  // check implicitly skips ReleaseMemory for sync reads.
+  if (pinned_blocks_[block_index].GetValue() &&
+      block_index < block_sizes_.size() && block_sizes_[block_index] > 0) {
+    if (auto dispatcher_data = dispatcher_data_.lock()) {
+      dispatcher_data->ReleaseMemory(block_sizes_[block_index]);
+    }
+    block_sizes_[block_index] = 0;  // Prevent double-release
+  }
+
   // Unpin the block from cache
   pinned_blocks_[block_index].Reset();
   // Clean up any pending async IO for this block
@@ -300,9 +344,51 @@ Status ReadSet::SyncRead(size_t block_index) {
       /*async_read=*/false, /*use_block_cache_for_lookup=*/true);
 }
 
-struct IODispatcherImpl::Impl {
-  Impl();
-  ~Impl();
+// A pre-coalesced group of blocks for prefetching
+struct CoalescedPrefetchGroup {
+  std::vector<size_t> block_indices;  // Blocks in this group (sorted by offset)
+  size_t total_bytes = 0;             // Total bytes for this IO
+};
+
+// State for a pending memory request waiting to be granted
+// Groups are pre-coalesced at queue time for efficient dispatch
+struct PendingPrefetchRequest {
+  std::weak_ptr<ReadSet> read_set;
+  std::shared_ptr<IOJob> job;
+
+  // Pre-coalesced groups ready for dispatch (ordered by first block index)
+  std::deque<CoalescedPrefetchGroup> coalesced_groups;
+
+  // Individual block indices still pending (for RemoveFromPending lookup)
+  std::unordered_set<size_t> block_indices_to_prefetch;
+
+  std::atomic<size_t> pending_bytes_{0};  // Track remaining bytes
+  mutable port::Mutex groups_mutex_;  // Protects groups and set modifications
+};
+
+// Remove a block from pending prefetch (called when block is read or released)
+void ReadSet::RemoveFromPending(size_t block_index) {
+  if (!pending_prefetch_flags_ || block_index >= pending_prefetch_flags_size_) {
+    return;
+  }
+
+  // Atomic exchange - returns true only if it was previously true
+  if (!pending_prefetch_flags_[block_index].exchange(false)) {
+    return;  // Already removed or never pending
+  }
+
+  if (pending_request_) {
+    MutexLock lock(&pending_request_->groups_mutex_);
+    pending_request_->block_indices_to_prefetch.erase(block_index);
+    pending_request_->pending_bytes_ -= block_sizes_[block_index];
+  }
+}
+
+// IODispatcherImpl::Impl inherits from IODispatcherImplData
+struct IODispatcherImpl::Impl : public IODispatcherImplData,
+                                public std::enable_shared_from_this<Impl> {
+  explicit Impl(const IODispatcherOptions& options);
+  ~Impl() override;
 
   // Non-copyable and non-movable
   Impl(const Impl&) = delete;
@@ -313,6 +399,18 @@ struct IODispatcherImpl::Impl {
   Status SubmitJob(const std::shared_ptr<IOJob>& job,
                    std::shared_ptr<ReadSet>* read_set);
 
+  // Memory management methods - non-blocking
+  bool TryAcquireMemory(size_t bytes);
+  void ReleaseMemory(size_t bytes) override;
+
+  // Memory limiting state
+  size_t max_prefetch_memory_bytes_ = 0;
+  std::atomic<size_t> memory_used_{0};  // Atomic for lock-free accounting
+  std::atomic<bool> has_pending_requests_{false};  // Fast-path check
+  port::Mutex memory_mutex_;  // Only for pending_prefetch_queue_ access
+  std::deque<std::shared_ptr<PendingPrefetchRequest>> pending_prefetch_queue_;
+  Statistics* statistics_ = nullptr;
+
  private:
   void PrepareIORequests(
       const std::shared_ptr<IOJob>& job,
@@ -335,12 +433,214 @@ struct IODispatcherImpl::Impl {
       const std::shared_ptr<ReadSet>& read_set,
       std::vector<FSReadRequest>& read_reqs,
       const std::vector<std::vector<size_t>>& coalesced_block_indices);
+
+  // Try to dispatch pending prefetch requests when memory becomes available
+  void TryDispatchPendingPrefetches();
+
+  // Dispatch prefetch for a specific ReadSet (called when memory is available)
+  void DispatchPrefetch(const std::shared_ptr<ReadSet>& read_set,
+                        const std::shared_ptr<IOJob>& job,
+                        const std::vector<size_t>& block_indices);
+
+  // Pre-coalesce blocks into groups, respecting max_group_bytes size limit.
+  // Returns groups ordered by first block index (earlier blocks first).
+  std::vector<CoalescedPrefetchGroup> PreCoalesceBlocks(
+      const std::shared_ptr<IOJob>& job, const std::shared_ptr<ReadSet>& rs,
+      const std::vector<size_t>& block_indices, size_t max_group_bytes);
 };
 
-IODispatcherImpl::Impl::Impl() {}
+IODispatcherImpl::Impl::Impl(const IODispatcherOptions& options)
+    : max_prefetch_memory_bytes_(options.max_prefetch_memory_bytes),
+      statistics_(options.statistics) {}
 
 IODispatcherImpl::Impl::~Impl() {}
 
+bool IODispatcherImpl::Impl::TryAcquireMemory(size_t bytes) {
+  if (max_prefetch_memory_bytes_ == 0) {
+    return true;  // No limit configured
+  }
+
+  // Lock-free memory acquisition using compare-exchange
+  size_t current = memory_used_.load(std::memory_order_relaxed);
+  while (true) {
+    if (current + bytes > max_prefetch_memory_bytes_) {
+      // Not enough memory - caller should queue for later
+      RecordTick(statistics_, PREFETCH_MEMORY_REQUESTS_BLOCKED);
+      return false;
+    }
+    if (memory_used_.compare_exchange_weak(current, current + bytes,
+                                           std::memory_order_release,
+                                           std::memory_order_relaxed)) {
+      RecordTick(statistics_, PREFETCH_MEMORY_BYTES_GRANTED, bytes);
+      return true;
+    }
+    // current is updated by compare_exchange_weak on failure, retry
+  }
+}
+
+void IODispatcherImpl::Impl::ReleaseMemory(size_t bytes) {
+  if (max_prefetch_memory_bytes_ == 0) {
+    return;  // No limit configured
+  }
+
+  // Lock-free memory release using atomic fetch_sub
+  size_t old_val = memory_used_.fetch_sub(bytes, std::memory_order_release);
+  assert(old_val >= bytes);
+  (void)old_val;  // Suppress unused warning in release builds
+  RecordTick(statistics_, PREFETCH_MEMORY_BYTES_RELEASED, bytes);
+
+  // Fast-path: skip dispatch attempt if no pending requests
+  // This avoids mutex contention in the common single-threaded iterator case
+  if (!has_pending_requests_.load(std::memory_order_acquire)) {
+    return;
+  }
+
+  // Try to dispatch pending prefetches now that memory is available
+  TryDispatchPendingPrefetches();
+}
+
+void IODispatcherImpl::Impl::TryDispatchPendingPrefetches() {
+  // Process pending prefetch requests - dispatch entire coalesced groups
+  while (true) {
+    std::shared_ptr<PendingPrefetchRequest> pending;
+
+    {
+      MutexLock lock(&memory_mutex_);
+      if (pending_prefetch_queue_.empty()) {
+        has_pending_requests_.store(false, std::memory_order_release);
+        return;
+      }
+
+      // Get the next pending request
+      pending = std::move(pending_prefetch_queue_.front());
+      pending_prefetch_queue_.pop_front();
+    }
+
+    // Check if the ReadSet is still alive
+    auto read_set = pending->read_set.lock();
+    if (!read_set) {
+      continue;  // ReadSet was destroyed, skip this request
+    }
+
+    // Try to acquire memory for coalesced groups (entire groups at a time)
+    std::vector<size_t> blocks_to_dispatch;
+    bool has_remaining_groups = false;
+
+    {
+      MutexLock lock(&pending->groups_mutex_);
+
+      while (!pending->coalesced_groups.empty()) {
+        auto& group = pending->coalesced_groups.front();
+
+        // Filter out blocks that were already read (not in pending set anymore)
+        std::vector<size_t> remaining_blocks;
+        size_t remaining_bytes = 0;
+        for (size_t idx : group.block_indices) {
+          if (pending->block_indices_to_prefetch.count(idx) > 0) {
+            remaining_blocks.push_back(idx);
+            remaining_bytes += read_set->block_sizes_[idx];
+          }
+        }
+
+        // Skip empty groups (all blocks were already read)
+        if (remaining_blocks.empty()) {
+          pending->coalesced_groups.pop_front();
+          continue;
+        }
+
+        // Try to acquire memory for remaining blocks only
+        if (TryAcquireMemory(remaining_bytes)) {
+          // Add all remaining blocks from this group to dispatch
+          for (size_t idx : remaining_blocks) {
+            blocks_to_dispatch.push_back(idx);
+            pending->block_indices_to_prefetch.erase(idx);
+          }
+          pending->pending_bytes_ -= remaining_bytes;
+          pending->coalesced_groups.pop_front();
+        } else {
+          // Not enough memory for this group - update with remaining blocks
+          group.block_indices = std::move(remaining_blocks);
+          group.total_bytes = remaining_bytes;
+          has_remaining_groups = true;
+          break;
+        }
+      }
+    }
+
+    // Save job before potential move of pending
+    auto job = pending->job;
+
+    // Requeue if groups remain
+    if (has_remaining_groups) {
+      MutexLock lock(&memory_mutex_);
+      pending_prefetch_queue_.push_front(std::move(pending));
+    } else {
+      // All groups dispatched, clear pending state
+      read_set->pending_request_.reset();
+    }
+
+    // Clear pending flags for dispatched blocks
+    if (read_set->pending_prefetch_flags_) {
+      for (size_t idx : blocks_to_dispatch) {
+        if (idx < read_set->pending_prefetch_flags_size_) {
+          read_set->pending_prefetch_flags_[idx].store(false);
+        }
+      }
+    }
+
+    // Dispatch acquired blocks
+    if (!blocks_to_dispatch.empty()) {
+      DispatchPrefetch(read_set, job, blocks_to_dispatch);
+    }
+
+    // If we dispatched nothing, stop (no memory available for any group)
+    if (blocks_to_dispatch.empty()) {
+      return;
+    }
+  }
+}
+
+void IODispatcherImpl::Impl::DispatchPrefetch(
+    const std::shared_ptr<ReadSet>& read_set, const std::shared_ptr<IOJob>& job,
+    const std::vector<size_t>& block_indices) {
+  // Sync point for testing partial prefetch - passes number of blocks being
+  // dispatched
+  TEST_SYNC_POINT_CALLBACK("IODispatcherImpl::DispatchPrefetch:BlockCount",
+                           const_cast<std::vector<size_t>*>(&block_indices));
+
+  // Prepare and execute IO for the given blocks
+  std::vector<FSReadRequest> read_reqs;
+  std::vector<std::vector<size_t>> coalesced_block_indices;
+  PrepareIORequests(job, block_indices, job->block_handles, &read_reqs,
+                    &coalesced_block_indices);
+
+  if (job->job_options.read_options.async_io) {
+    Status async_status;
+    std::vector<size_t> fallback_indices = ExecuteAsyncIO(
+        job, read_set, read_reqs, coalesced_block_indices, &async_status);
+
+    // For blocks where async is not supported, do sync IO
+    if (!fallback_indices.empty()) {
+      std::vector<FSReadRequest> sync_read_reqs;
+      std::vector<std::vector<size_t>> sync_coalesced_indices;
+      PrepareIORequests(job, fallback_indices, job->block_handles,
+                        &sync_read_reqs, &sync_coalesced_indices);
+      // Prefetch errors are ignored - user will get the error when reading
+      Status s =
+          ExecuteSyncIO(job, read_set, sync_read_reqs, sync_coalesced_indices);
+      s.PermitUncheckedError();
+      read_set->num_sync_reads_ += fallback_indices.size();
+    }
+    // Async errors are also ignored - user will get the error when reading
+    async_status.PermitUncheckedError();
+  } else {
+    // Prefetch errors are ignored - user will get the error when reading
+    Status s = ExecuteSyncIO(job, read_set, read_reqs, coalesced_block_indices);
+    s.PermitUncheckedError();
+    read_set->num_sync_reads_ += block_indices.size();
+  }
+}
+
 Status IODispatcherImpl::Impl::SubmitJob(const std::shared_ptr<IOJob>& job,
                                          std::shared_ptr<ReadSet>* read_set) {
   if (!read_set) {
@@ -353,6 +653,7 @@ Status IODispatcherImpl::Impl::SubmitJob(const std::shared_ptr<IOJob>& job,
   rs->job_ = job;
   rs->fs_ = job->table->get_rep()->ioptions.env->GetFileSystem();
   rs->pinned_blocks_.resize(job->block_handles.size());
+  rs->block_sizes_.resize(job->block_handles.size(), 0);
 
   // Build sorted index for O(log n) ReadOffset lookups via binary search.
   // sorted_block_indices_[i] = original index of i-th smallest block by offset.
@@ -399,43 +700,74 @@ Status IODispatcherImpl::Impl::SubmitJob(const std::shared_ptr<IOJob>& job,
   rs->num_cache_hits_ =
       job->block_handles.size() - block_indices_to_read.size();
 
-  // Prepare read requests - coalesce adjacent blocks
-  std::vector<FSReadRequest> read_reqs;
-  std::vector<std::vector<size_t>> coalesced_block_indices;
-  PrepareIORequests(job, block_indices_to_read, job->block_handles, &read_reqs,
-                    &coalesced_block_indices);
+  // Calculate block sizes for uncached blocks
+  for (const auto& idx : block_indices_to_read) {
+    size_t block_size =
+        BlockBasedTable::BlockSizeWithTrailer(job->block_handles[idx]);
+    rs->block_sizes_[idx] = block_size;
+  }
 
-  // Step 3: Execute IO requests based on JobOptions
-  if (job->job_options.read_options.async_io) {
-    // Try async IO - get back any blocks that need sync fallback (not
-    // supported) and surface any actual errors to caller
-    Status async_status;
-    std::vector<size_t> fallback_indices = ExecuteAsyncIO(
-        job, rs, read_reqs, coalesced_block_indices, &async_status);
-    if (!async_status.ok()) {
-      return async_status;
+  // Store dispatcher reference for release callbacks
+  rs->dispatcher_data_ = shared_from_this();
+
+  // Pre-coalesce blocks into groups, respecting memory budget per group
+  // This ensures we dispatch meaningful IO sizes, not tiny single-block IOs
+  // Both memory-limited and non-memory-limited paths use the same coalescing
+  auto coalesced_groups = PreCoalesceBlocks(job, rs, block_indices_to_read,
+                                            max_prefetch_memory_bytes_);
+
+  std::vector<size_t> blocks_to_dispatch;
+  std::deque<CoalescedPrefetchGroup> groups_to_queue;
+
+  // Try to acquire memory for entire coalesced groups
+  for (auto& group : coalesced_groups) {
+    if (TryAcquireMemory(group.total_bytes)) {
+      // Add all blocks from this group to dispatch
+      for (size_t idx : group.block_indices) {
+        blocks_to_dispatch.push_back(idx);
+      }
+    } else {
+      // Queue this group for later
+      groups_to_queue.push_back(std::move(group));
     }
+  }
 
-    // Fall back to sync IO for blocks where async is not supported
-    if (!fallback_indices.empty()) {
-      std::vector<FSReadRequest> sync_read_reqs;
-      std::vector<std::vector<size_t>> sync_coalesced_indices;
-      PrepareIORequests(job, fallback_indices, job->block_handles,
-                        &sync_read_reqs, &sync_coalesced_indices);
+  // Dispatch acquired blocks immediately
+  if (!blocks_to_dispatch.empty()) {
+    DispatchPrefetch(rs, job, blocks_to_dispatch);
+  }
 
-      Status s = ExecuteSyncIO(job, rs, sync_read_reqs, sync_coalesced_indices);
-      if (!s.ok()) {
-        return s;
+  // Queue remaining groups for later (only applies when memory limiting)
+  if (!groups_to_queue.empty()) {
+    auto pending = std::make_shared<PendingPrefetchRequest>();
+    pending->read_set = rs;
+    pending->job = job;
+
+    size_t pending_bytes = 0;
+    for (const auto& group : groups_to_queue) {
+      for (size_t idx : group.block_indices) {
+        pending->block_indices_to_prefetch.insert(idx);
       }
-      rs->num_sync_reads_ += fallback_indices.size();
+      pending_bytes += group.total_bytes;
     }
-  } else {
-    Status s = ExecuteSyncIO(job, rs, read_reqs, coalesced_block_indices);
-    if (!s.ok()) {
-      return s;
+    pending->coalesced_groups = std::move(groups_to_queue);
+    pending->pending_bytes_ = pending_bytes;
+
+    // Set up pending flags for queued blocks only
+    size_t num_blocks = job->block_handles.size();
+    rs->pending_prefetch_flags_ =
+        std::make_unique<std::atomic<bool>[]>(num_blocks);
+    rs->pending_prefetch_flags_size_ = num_blocks;
+    for (size_t idx : pending->block_indices_to_prefetch) {
+      rs->pending_prefetch_flags_[idx].store(true);
+    }
+    rs->pending_request_ = pending;
+
+    {
+      MutexLock lock(&memory_mutex_);
+      pending_prefetch_queue_.push_back(std::move(pending));
+      has_pending_requests_.store(true, std::memory_order_release);
     }
-    // We bump this for sync reads
-    rs->num_sync_reads_ += block_indices_to_read.size();
   }
 
   *read_set = std::move(rs);
@@ -503,6 +835,67 @@ void IODispatcherImpl::Impl::PrepareIORequests(
   }
 }
 
+std::vector<CoalescedPrefetchGroup> IODispatcherImpl::Impl::PreCoalesceBlocks(
+    const std::shared_ptr<IOJob>& job, const std::shared_ptr<ReadSet>& rs,
+    const std::vector<size_t>& block_indices, size_t max_group_bytes) {
+  std::vector<CoalescedPrefetchGroup> groups;
+
+  if (block_indices.empty()) {
+    return groups;
+  }
+
+  const auto& block_handles = job->block_handles;
+  const uint64_t coalesce_threshold = job->job_options.io_coalesce_threshold;
+
+  // Sort block indices by offset for coalescing
+  std::vector<size_t> sorted_indices = block_indices;
+  std::sort(sorted_indices.begin(), sorted_indices.end(),
+            [&block_handles](size_t a, size_t b) {
+              return block_handles[a].offset() < block_handles[b].offset();
+            });
+
+  // Build coalesced groups respecting max_group_bytes
+  groups.emplace_back();
+
+  for (size_t idx : sorted_indices) {
+    size_t block_size = rs->block_sizes_[idx];
+
+    // Skip blocks that are individually larger than the memory budget
+    // These will be read synchronously when needed (via ReadIndex fallback)
+    if (max_group_bytes > 0 && block_size > max_group_bytes) {
+      continue;
+    }
+
+    // Check if we need to start a new group
+    bool start_new_group = false;
+
+    if (!groups.back().block_indices.empty()) {
+      // Check gap with previous block
+      size_t last_idx = groups.back().block_indices.back();
+      const auto& last_handle = block_handles[last_idx];
+      uint64_t last_end = last_handle.offset() +
+                          BlockBasedTable::BlockSizeWithTrailer(last_handle);
+      uint64_t current_start = block_handles[idx].offset();
+
+      if (current_start > last_end + coalesce_threshold) {
+        start_new_group = true;  // Gap too large
+      } else if (max_group_bytes > 0 &&
+                 groups.back().total_bytes + block_size > max_group_bytes) {
+        start_new_group = true;  // Would exceed size limit
+      }
+    }
+
+    if (start_new_group) {
+      groups.emplace_back();
+    }
+
+    groups.back().block_indices.push_back(idx);
+    groups.back().total_bytes += block_size;
+  }
+
+  return groups;
+}
+
 std::vector<size_t> IODispatcherImpl::Impl::ExecuteAsyncIO(
     const std::shared_ptr<IOJob>& job, const std::shared_ptr<ReadSet>& read_set,
     std::vector<FSReadRequest>& read_reqs,
@@ -648,7 +1041,11 @@ Status IODispatcherImpl::Impl::ExecuteSyncIO(
   return Status::OK();
 }
 
-IODispatcherImpl::IODispatcherImpl() : impl_(new Impl()) {}
+IODispatcherImpl::IODispatcherImpl()
+    : impl_(std::make_shared<Impl>(IODispatcherOptions())) {}
+
+IODispatcherImpl::IODispatcherImpl(const IODispatcherOptions& options)
+    : impl_(std::make_shared<Impl>(options)) {}
 
 IODispatcherImpl::~IODispatcherImpl() = default;
 
@@ -659,4 +1056,8 @@ Status IODispatcherImpl::SubmitJob(const std::shared_ptr<IOJob>& job,
 
 IODispatcher* NewIODispatcher() { return new IODispatcherImpl(); }
 
+IODispatcher* NewIODispatcher(const IODispatcherOptions& options) {
+  return new IODispatcherImpl(options);
+}
+
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/util/io_dispatcher_imp.h b/util/io_dispatcher_imp.h
index 3324705ada3f..c4e52b86d546 100644
--- a/util/io_dispatcher_imp.h
+++ b/util/io_dispatcher_imp.h
@@ -21,7 +21,8 @@ namespace ROCKSDB_NAMESPACE {
 
 class IODispatcherImpl : public IODispatcher {
  public:
-  explicit IODispatcherImpl();
+  IODispatcherImpl();
+  explicit IODispatcherImpl(const IODispatcherOptions& options);
   ~IODispatcherImpl() override;
 
   Status SubmitJob(const std::shared_ptr<IOJob>& job,
@@ -29,7 +30,7 @@ class IODispatcherImpl : public IODispatcher {
 
  private:
   struct Impl;
-  std::unique_ptr<Impl> impl_;
+  std::shared_ptr<Impl> impl_;
 };
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/util/io_dispatcher_test.cc b/util/io_dispatcher_test.cc
index 7f8e0a93115a..d5ea665e5ce4 100644
--- a/util/io_dispatcher_test.cc
+++ b/util/io_dispatcher_test.cc
@@ -12,6 +12,7 @@
 
 #include <memory>
 #include <mutex>
+#include <thread>
 
 #include "db/db_test_util.h"
 #include "db/dbformat.h"
@@ -23,6 +24,7 @@
 #include "table/block_based/block_based_table_builder.h"
 #include "table/block_based/block_based_table_factory.h"
 #include "table/block_based/block_based_table_reader.h"
+#include "test_util/sync_point.h"
 
 // Enable io_uring support for this test
 extern "C" bool RocksDbIOUringEnable() { return true; }
@@ -896,6 +898,900 @@ TEST_F(IODispatcherTest, VerifyReadRequestDetails) {
   }
 }
 
+// Test that memory limiting blocks when the limit is exceeded
+TEST_F(IODispatcherTest, MemoryLimitBlocksWhenExceeded) {
+  // Create dispatcher with a small memory limit (1MB)
+  IODispatcherOptions opts;
+  opts.max_prefetch_memory_bytes = 1 * 1024 * 1024;  // 1MB
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(opts));
+
+  std::unique_ptr<BlockBasedTable> table;
+  std::vector<BlockHandle> block_handles;
+  Status s = CreateAndOpenSST(50, &table, &block_handles);
+  ASSERT_OK(s);
+  ASSERT_GT(block_handles.size(), 0);
+
+  // Submit a job - should succeed immediately (non-blocking)
+  auto job = std::make_shared<IOJob>();
+  job->block_handles = block_handles;
+  job->table = table.get();
+  job->job_options.read_options.async_io = false;
+
+  std::shared_ptr<ReadSet> read_set;
+  s = dispatcher->SubmitJob(job, &read_set);
+  ASSERT_OK(s);
+  ASSERT_NE(read_set, nullptr);
+
+  // Read all blocks - they may be read synchronously if prefetch was deferred
+  for (size_t i = 0; i < block_handles.size(); ++i) {
+    CachableEntry<Block> block;
+    Status read_status = read_set->ReadIndex(i, &block);
+    ASSERT_OK(read_status);
+    ASSERT_NE(block.GetValue(), nullptr);
+  }
+}
+
+// Test that SubmitJob never blocks even when memory is exhausted
+TEST_F(IODispatcherTest, SubmitJobNeverBlocks) {
+  // Create dispatcher with a tiny memory limit
+  IODispatcherOptions opts;
+  opts.max_prefetch_memory_bytes = 1024;  // 1KB - very small
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(opts));
+
+  std::unique_ptr<BlockBasedTable> table;
+  std::vector<BlockHandle> block_handles;
+  Status s = CreateAndOpenSST(50, &table, &block_handles);
+  ASSERT_OK(s);
+  ASSERT_GT(block_handles.size(), 0);
+
+  // Submit first job - uses up all memory
+  auto job1 = std::make_shared<IOJob>();
+  job1->block_handles = block_handles;
+  job1->table = table.get();
+  job1->job_options.read_options.async_io = false;
+
+  std::shared_ptr<ReadSet> read_set1;
+  s = dispatcher->SubmitJob(job1, &read_set1);
+  ASSERT_OK(s);  // Should succeed immediately
+
+  // Submit second job - should also succeed immediately (not block)
+  std::unique_ptr<BlockBasedTable> table2;
+  std::vector<BlockHandle> block_handles2;
+  s = CreateAndOpenSST(30, &table2, &block_handles2);
+  ASSERT_OK(s);
+
+  auto job2 = std::make_shared<IOJob>();
+  job2->block_handles = block_handles2;
+  job2->table = table2.get();
+  job2->job_options.read_options.async_io = false;
+
+  std::shared_ptr<ReadSet> read_set2;
+  s = dispatcher->SubmitJob(job2, &read_set2);
+  ASSERT_OK(s);  // Should succeed immediately - prefetch is just deferred
+
+  // Reads work - blocks are fetched synchronously on demand
+  for (size_t i = 0; i < block_handles2.size(); ++i) {
+    CachableEntry<Block> block;
+    Status read_status = read_set2->ReadIndex(i, &block);
+    ASSERT_OK(read_status);
+    ASSERT_NE(block.GetValue(), nullptr);
+  }
+}
+
+// Test that releasing blocks triggers pending prefetches
+TEST_F(IODispatcherTest, BlockReleaseTriggersWaitingJob) {
+  // Create dispatcher with a small memory limit
+  IODispatcherOptions opts;
+  opts.max_prefetch_memory_bytes = 100 * 1024;  // 100KB
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(opts));
+
+  std::unique_ptr<BlockBasedTable> table;
+  std::vector<BlockHandle> block_handles;
+  Status s = CreateAndOpenSST(30, &table, &block_handles);
+  ASSERT_OK(s);
+  ASSERT_GT(block_handles.size(), 0);
+
+  // Submit first job
+  auto job1 = std::make_shared<IOJob>();
+  job1->block_handles = block_handles;
+  job1->table = table.get();
+  job1->job_options.read_options.async_io = false;
+
+  std::shared_ptr<ReadSet> read_set1;
+  s = dispatcher->SubmitJob(job1, &read_set1);
+  ASSERT_OK(s);
+  ASSERT_NE(read_set1, nullptr);
+
+  // Read all blocks from first job
+  for (size_t i = 0; i < block_handles.size(); ++i) {
+    CachableEntry<Block> block;
+    Status read_status = read_set1->ReadIndex(i, &block);
+    ASSERT_OK(read_status);
+  }
+
+  // Submit second job - prefetch will be deferred due to memory limit
+  std::unique_ptr<BlockBasedTable> table2;
+  std::vector<BlockHandle> block_handles2;
+  s = CreateAndOpenSST(20, &table2, &block_handles2);
+  ASSERT_OK(s);
+
+  auto job2 = std::make_shared<IOJob>();
+  job2->block_handles = block_handles2;
+  job2->table = table2.get();
+  job2->job_options.read_options.async_io = false;
+
+  std::shared_ptr<ReadSet> read_set2;
+  s = dispatcher->SubmitJob(job2, &read_set2);
+  ASSERT_OK(s);  // Should succeed immediately
+  ASSERT_NE(read_set2, nullptr);
+
+  // Release blocks from first job - this should trigger pending prefetches
+  for (size_t i = 0; i < block_handles.size(); ++i) {
+    read_set1->ReleaseBlock(i);
+  }
+
+  // Read all blocks from second job - should work
+  for (size_t i = 0; i < block_handles2.size(); ++i) {
+    CachableEntry<Block> block;
+    Status read_status = read_set2->ReadIndex(i, &block);
+    ASSERT_OK(read_status);
+    ASSERT_NE(block.GetValue(), nullptr);
+  }
+}
+
+// Test that multiple ReadSets share the memory budget
+TEST_F(IODispatcherTest, MultipleReadSetsShareMemoryBudget) {
+  IODispatcherOptions opts;
+  opts.max_prefetch_memory_bytes = 10 * 1024 * 1024;  // 10MB
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(opts));
+
+  std::vector<std::shared_ptr<ReadSet>> read_sets;
+  std::vector<std::vector<BlockHandle>> all_block_handles;
+
+  // Create and submit multiple jobs
+  for (int i = 0; i < 3; i++) {
+    std::unique_ptr<BlockBasedTable> table;
+    std::vector<BlockHandle> block_handles;
+
+    Status s = CreateAndOpenSST(20 + i * 5, &table, &block_handles);
+    ASSERT_OK(s);
+
+    auto job = std::make_shared<IOJob>();
+    job->block_handles = block_handles;
+    job->table = table.get();
+    job->job_options.read_options.async_io = false;
+    tables_.push_back(std::move(table));
+
+    all_block_handles.push_back(block_handles);
+    std::shared_ptr<ReadSet> read_set;
+    s = dispatcher->SubmitJob(job, &read_set);
+    ASSERT_OK(s);
+    read_sets.push_back(read_set);
+  }
+
+  // Verify all ReadSets can read their blocks
+  for (size_t i = 0; i < read_sets.size(); ++i) {
+    for (size_t j = 0; j < all_block_handles[i].size(); ++j) {
+      CachableEntry<Block> block;
+      Status read_status = read_sets[i]->ReadIndex(j, &block);
+      ASSERT_OK(read_status);
+      ASSERT_NE(block.GetValue(), nullptr);
+    }
+  }
+
+  // Release all blocks from first ReadSet
+  for (size_t i = 0; i < all_block_handles[0].size(); ++i) {
+    read_sets[0]->ReleaseBlock(i);
+  }
+
+  // Create another job - should work because first ReadSet released memory
+  std::unique_ptr<BlockBasedTable> table_new;
+  std::vector<BlockHandle> block_handles_new;
+  Status s = CreateAndOpenSST(25, &table_new, &block_handles_new);
+  ASSERT_OK(s);
+
+  auto job_new = std::make_shared<IOJob>();
+  job_new->block_handles = block_handles_new;
+  job_new->table = table_new.get();
+  job_new->job_options.read_options.async_io = false;
+
+  std::shared_ptr<ReadSet> read_set_new;
+  s = dispatcher->SubmitJob(job_new, &read_set_new);
+  ASSERT_OK(s);
+  ASSERT_NE(read_set_new, nullptr);
+
+  for (size_t i = 0; i < block_handles_new.size(); ++i) {
+    CachableEntry<Block> block;
+    Status read_status = read_set_new->ReadIndex(i, &block);
+    ASSERT_OK(read_status);
+    ASSERT_NE(block.GetValue(), nullptr);
+  }
+}
+
+// Test that no memory limiting is applied when max_prefetch_memory_bytes is 0
+TEST_F(IODispatcherTest, NoMemoryLimitWhenZero) {
+  IODispatcherOptions opts;
+  opts.max_prefetch_memory_bytes = 0;  // No limit
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(opts));
+
+  std::unique_ptr<BlockBasedTable> table;
+  std::vector<BlockHandle> block_handles;
+  Status s = CreateAndOpenSST(50, &table, &block_handles);
+  ASSERT_OK(s);
+
+  auto job = std::make_shared<IOJob>();
+  job->block_handles = block_handles;
+  job->table = table.get();
+  job->job_options.read_options.async_io = false;
+
+  std::shared_ptr<ReadSet> read_set;
+  s = dispatcher->SubmitJob(job, &read_set);
+  ASSERT_OK(s);
+  ASSERT_NE(read_set, nullptr);
+
+  for (size_t i = 0; i < block_handles.size(); ++i) {
+    CachableEntry<Block> block;
+    Status read_status = read_set->ReadIndex(i, &block);
+    ASSERT_OK(read_status);
+    ASSERT_NE(block.GetValue(), nullptr);
+  }
+}
+
+// Test memory release on ReadSet destruction triggers pending prefetches
+TEST_F(IODispatcherTest, MemoryReleasedOnReadSetDestruction) {
+  IODispatcherOptions opts;
+  opts.max_prefetch_memory_bytes = 100 * 1024;  // 100KB
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(opts));
+
+  // Create table outside the scope so it outlives the ReadSet
+  std::unique_ptr<BlockBasedTable> table;
+  std::vector<BlockHandle> block_handles;
+  Status s = CreateAndOpenSST(30, &table, &block_handles);
+  ASSERT_OK(s);
+
+  // Second table - created now so it's available after first ReadSet is
+  // destroyed
+  std::unique_ptr<BlockBasedTable> table2;
+  std::vector<BlockHandle> block_handles2;
+  s = CreateAndOpenSST(30, &table2, &block_handles2);
+  ASSERT_OK(s);
+
+  std::shared_ptr<ReadSet> read_set2;
+
+  {
+    auto job = std::make_shared<IOJob>();
+    job->block_handles = block_handles;
+    job->table = table.get();
+    job->job_options.read_options.async_io = false;
+
+    std::shared_ptr<ReadSet> read_set;
+    s = dispatcher->SubmitJob(job, &read_set);
+    ASSERT_OK(s);
+    ASSERT_NE(read_set, nullptr);
+
+    // Submit second job while first is still alive - prefetch will be deferred
+    auto job2 = std::make_shared<IOJob>();
+    job2->block_handles = block_handles2;
+    job2->table = table2.get();
+    job2->job_options.read_options.async_io = false;
+
+    s = dispatcher->SubmitJob(job2, &read_set2);
+    ASSERT_OK(s);  // Should succeed immediately
+    ASSERT_NE(read_set2, nullptr);
+
+    // First ReadSet goes out of scope here and should release all memory,
+    // which triggers pending prefetches for second ReadSet
+  }
+
+  // Read all blocks from second job - should work because first ReadSet
+  // released its memory on destruction
+  for (size_t i = 0; i < block_handles2.size(); ++i) {
+    CachableEntry<Block> block;
+    Status read_status = read_set2->ReadIndex(i, &block);
+    ASSERT_OK(read_status);
+    ASSERT_NE(block.GetValue(), nullptr);
+  }
+}
+
+// Test that partial prefetch dispatches as many blocks as memory allows
+// and queues the rest for later dispatch
+TEST_F(IODispatcherTest, PartialPrefetchDispatchesWhatFits) {
+  // Skip this test if io_uring is not available since partial prefetch
+  // only applies to async IO
+  if (!kIOUringPresent) {
+    return;  // io_uring not available, skip async IO test
+  }
+
+  // Create dispatcher with memory limit that allows only some blocks
+  // Each block is ~16KB, so 50KB allows roughly 3 blocks
+  IODispatcherOptions opts;
+  opts.max_prefetch_memory_bytes = 50 * 1024;  // 50KB
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(opts));
+
+  std::unique_ptr<BlockBasedTable> table;
+  std::vector<BlockHandle> block_handles;
+  // Create 10 blocks - only ~3 should fit in memory
+  Status s = CreateAndOpenSST(10, &table, &block_handles);
+  ASSERT_OK(s);
+  ASSERT_GE(block_handles.size(), 5);
+
+  // Use sync point to count blocks dispatched during SubmitJob
+  size_t blocks_dispatched_on_submit = 0;
+  SyncPoint::GetInstance()->SetCallBack(
+      "IODispatcherImpl::DispatchPrefetch:BlockCount", [&](void* arg) {
+        auto* indices = static_cast<std::vector<size_t>*>(arg);
+        blocks_dispatched_on_submit += indices->size();
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  auto job = std::make_shared<IOJob>();
+  job->block_handles = block_handles;
+  job->table = table.get();
+  job->job_options.read_options.async_io = true;  // Use async IO
+
+  std::shared_ptr<ReadSet> read_set;
+  s = dispatcher->SubmitJob(job, &read_set);
+  ASSERT_OK(s);
+  ASSERT_NE(read_set, nullptr);
+
+  // With partial prefetch, we expect SOME blocks to have been dispatched
+  // (the ones that fit in memory), but not ALL blocks
+  // This is the key assertion: partial prefetch means > 0 blocks dispatched
+  // even though total memory needed exceeds the limit
+  EXPECT_GT(blocks_dispatched_on_submit, 0)
+      << "Expected some blocks to be dispatched with partial prefetch";
+  EXPECT_LT(blocks_dispatched_on_submit, block_handles.size())
+      << "Expected not all blocks to be dispatched (memory limit should apply)";
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  // Now read all blocks - remaining blocks will be fetched on demand
+  for (size_t i = 0; i < block_handles.size(); ++i) {
+    CachableEntry<Block> block;
+    Status read_status = read_set->ReadIndex(i, &block);
+    ASSERT_OK(read_status);
+    ASSERT_NE(block.GetValue(), nullptr);
+  }
+
+  // Verify all blocks were ultimately read
+  uint64_t total_reads = read_set->GetNumSyncReads() +
+                         read_set->GetNumAsyncReads() +
+                         read_set->GetNumCacheHits();
+  EXPECT_EQ(total_reads, block_handles.size());
+}
+
+// Test that earlier block indices are prioritized in partial prefetch
+TEST_F(IODispatcherTest, PartialPrefetchPrioritizesEarlierIndices) {
+  // Skip this test if io_uring is not available
+  if (!kIOUringPresent) {
+    return;  // io_uring not available, skip async IO test
+  }
+
+  // Create dispatcher with memory limit that allows only 1-2 blocks
+  IODispatcherOptions opts;
+  opts.max_prefetch_memory_bytes = 20 * 1024;  // 20KB - room for ~1 block
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(opts));
+
+  std::unique_ptr<BlockBasedTable> table;
+  std::vector<BlockHandle> block_handles;
+  Status s = CreateAndOpenSST(10, &table, &block_handles);
+  ASSERT_OK(s);
+  ASSERT_GE(block_handles.size(), 5);
+
+  tracking_fs_->ClearReadOps();
+
+  auto job = std::make_shared<IOJob>();
+  job->block_handles = block_handles;
+  job->table = table.get();
+  job->job_options.read_options.async_io = true;
+
+  std::shared_ptr<ReadSet> read_set;
+  s = dispatcher->SubmitJob(job, &read_set);
+  ASSERT_OK(s);
+
+  // Get the async reads that were dispatched
+  auto read_ops = tracking_fs_->GetReadOps();
+
+  // Find the offset of the first async read
+  uint64_t first_async_offset = UINT64_MAX;
+  for (const auto& op : read_ops) {
+    if (op.type == ReadOp::kReadAsync && !op.requests.empty()) {
+      first_async_offset = std::min(first_async_offset, op.requests[0].first);
+    }
+  }
+
+  // The first async read should be for the first block (lowest offset)
+  // This verifies that earlier indices are prioritized
+  if (first_async_offset != UINT64_MAX) {
+    EXPECT_EQ(first_async_offset, block_handles[0].offset())
+        << "Expected first async read to be for the first block (earliest "
+           "index)";
+  }
+
+  // Read all blocks to complete the test
+  for (size_t i = 0; i < block_handles.size(); ++i) {
+    CachableEntry<Block> block;
+    Status read_status = read_set->ReadIndex(i, &block);
+    ASSERT_OK(read_status);
+    ASSERT_NE(block.GetValue(), nullptr);
+  }
+}
+
+// Test that blocks larger than the memory budget are excluded from prefetch
+// and fall back to synchronous read
+TEST_F(IODispatcherTest, OversizedBlocksFallbackToSyncRead) {
+  // Skip this test if io_uring is not available since we need async IO
+  if (!kIOUringPresent) {
+    return;
+  }
+
+  std::unique_ptr<BlockBasedTable> table;
+  std::vector<BlockHandle> block_handles;
+  Status s = CreateAndOpenSST(10, &table, &block_handles);
+  ASSERT_OK(s);
+  ASSERT_GE(block_handles.size(), 3);
+
+  // Calculate the size of a single block
+  size_t single_block_size =
+      BlockBasedTable::BlockSizeWithTrailer(block_handles[0]);
+
+  // Create dispatcher with memory limit smaller than a single block
+  // This means ALL blocks are "oversized" and should fall back to sync read
+  IODispatcherOptions opts;
+  opts.max_prefetch_memory_bytes = single_block_size / 2;  // Half a block
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(opts));
+
+  // Track dispatches - with oversized blocks, nothing should be dispatched
+  size_t blocks_dispatched = 0;
+  SyncPoint::GetInstance()->SetCallBack(
+      "IODispatcherImpl::DispatchPrefetch:BlockCount", [&](void* arg) {
+        auto* indices = static_cast<std::vector<size_t>*>(arg);
+        blocks_dispatched += indices->size();
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  auto job = std::make_shared<IOJob>();
+  job->block_handles = block_handles;
+  job->table = table.get();
+  job->job_options.read_options.async_io = true;
+
+  std::shared_ptr<ReadSet> read_set;
+  s = dispatcher->SubmitJob(job, &read_set);
+  ASSERT_OK(s);
+  ASSERT_NE(read_set, nullptr);
+
+  // No blocks should have been dispatched since they're all oversized
+  EXPECT_EQ(blocks_dispatched, 0)
+      << "Expected no blocks to be dispatched when all blocks are oversized";
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  // All blocks should still be readable via sync fallback
+  for (size_t i = 0; i < block_handles.size(); ++i) {
+    CachableEntry<Block> block;
+    Status read_status = read_set->ReadIndex(i, &block);
+    ASSERT_OK(read_status);
+    ASSERT_NE(block.GetValue(), nullptr);
+  }
+
+  // All reads should be sync since blocks couldn't be prefetched
+  EXPECT_GT(read_set->GetNumSyncReads(), 0)
+      << "Expected sync reads for oversized blocks";
+}
+
+// Test that reading blocks before prefetch dispatch correctly updates
+// memory accounting for coalesced groups
+TEST_F(IODispatcherTest, PartialReadsUpdateCoalescedGroups) {
+  // Skip this test if io_uring is not available
+  if (!kIOUringPresent) {
+    return;
+  }
+
+  // Create dispatcher with memory limit that allows only some blocks
+  IODispatcherOptions opts;
+  opts.max_prefetch_memory_bytes = 50 * 1024;  // 50KB
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(opts));
+
+  std::unique_ptr<BlockBasedTable> table;
+  std::vector<BlockHandle> block_handles;
+  Status s = CreateAndOpenSST(20, &table, &block_handles);
+  ASSERT_OK(s);
+  ASSERT_GE(block_handles.size(), 10);
+
+  auto job = std::make_shared<IOJob>();
+  job->block_handles = block_handles;
+  job->table = table.get();
+  job->job_options.read_options.async_io = true;
+
+  std::shared_ptr<ReadSet> read_set;
+  s = dispatcher->SubmitJob(job, &read_set);
+  ASSERT_OK(s);
+  ASSERT_NE(read_set, nullptr);
+
+  // Read some blocks directly (simulating on-demand access before prefetch)
+  // This removes them from pending and should update coalesced group accounting
+  for (size_t i = 0; i < 5 && i < block_handles.size(); ++i) {
+    CachableEntry<Block> block;
+    Status read_status = read_set->ReadIndex(i, &block);
+    ASSERT_OK(read_status);
+    ASSERT_NE(block.GetValue(), nullptr);
+  }
+
+  // Release the blocks we read - this frees memory
+  for (size_t i = 0; i < 5 && i < block_handles.size(); ++i) {
+    read_set->ReleaseBlock(i);
+  }
+
+  // Now read the remaining blocks - these should work correctly
+  // The key test: memory accounting should be correct even though some blocks
+  // were removed from pending groups before dispatch
+  for (size_t i = 5; i < block_handles.size(); ++i) {
+    CachableEntry<Block> block;
+    Status read_status = read_set->ReadIndex(i, &block);
+    ASSERT_OK(read_status) << "Failed to read block " << i;
+    ASSERT_NE(block.GetValue(), nullptr) << "Block " << i << " is null";
+  }
+
+  // Verify all remaining blocks were read successfully
+  uint64_t total_reads = read_set->GetNumSyncReads() +
+                         read_set->GetNumAsyncReads() +
+                         read_set->GetNumCacheHits();
+  // We read 5 blocks initially, then the remaining blocks
+  EXPECT_GE(total_reads, block_handles.size() - 5)
+      << "Expected at least the remaining blocks to be counted";
+}
+
+// Test that a mix of oversized and normal blocks works correctly
+TEST_F(IODispatcherTest, MixedOversizedAndNormalBlocks) {
+  // Skip this test if io_uring is not available
+  if (!kIOUringPresent) {
+    return;
+  }
+
+  std::unique_ptr<BlockBasedTable> table;
+  std::vector<BlockHandle> block_handles;
+  Status s = CreateAndOpenSST(10, &table, &block_handles);
+  ASSERT_OK(s);
+  ASSERT_GE(block_handles.size(), 5);
+
+  // Calculate the size of a typical block
+  size_t typical_block_size =
+      BlockBasedTable::BlockSizeWithTrailer(block_handles[0]);
+
+  // Create dispatcher with memory limit that allows exactly 2 typical blocks
+  // This means groups of 3+ blocks become "oversized" as a group
+  IODispatcherOptions opts;
+  opts.max_prefetch_memory_bytes = typical_block_size * 2;
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(opts));
+
+  auto job = std::make_shared<IOJob>();
+  job->block_handles = block_handles;
+  job->table = table.get();
+  job->job_options.read_options.async_io = true;
+
+  std::shared_ptr<ReadSet> read_set;
+  s = dispatcher->SubmitJob(job, &read_set);
+  ASSERT_OK(s);
+  ASSERT_NE(read_set, nullptr);
+
+  // All blocks should be readable regardless of prefetch status
+  for (size_t i = 0; i < block_handles.size(); ++i) {
+    CachableEntry<Block> block;
+    Status read_status = read_set->ReadIndex(i, &block);
+    ASSERT_OK(read_status) << "Failed to read block " << i;
+    ASSERT_NE(block.GetValue(), nullptr) << "Block " << i << " is null";
+  }
+
+  // Verify total reads match
+  uint64_t total_reads = read_set->GetNumSyncReads() +
+                         read_set->GetNumAsyncReads() +
+                         read_set->GetNumCacheHits();
+  EXPECT_EQ(total_reads, block_handles.size());
+}
+
+// Test that memory is properly accounted when groups are partially consumed
+TEST_F(IODispatcherTest, MemoryAccountingWithPartialGroupConsumption) {
+  // Skip this test if io_uring is not available
+  if (!kIOUringPresent) {
+    return;
+  }
+
+  // Create dispatcher with a specific memory limit
+  IODispatcherOptions opts;
+  opts.max_prefetch_memory_bytes = 100 * 1024;  // 100KB
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(opts));
+
+  std::unique_ptr<BlockBasedTable> table;
+  std::vector<BlockHandle> block_handles;
+  Status s = CreateAndOpenSST(30, &table, &block_handles);
+  ASSERT_OK(s);
+  ASSERT_GE(block_handles.size(), 10);
+
+  auto job = std::make_shared<IOJob>();
+  job->block_handles = block_handles;
+  job->table = table.get();
+  job->job_options.read_options.async_io = true;
+
+  std::shared_ptr<ReadSet> read_set;
+  s = dispatcher->SubmitJob(job, &read_set);
+  ASSERT_OK(s);
+  ASSERT_NE(read_set, nullptr);
+
+  // Read blocks one at a time and release them
+  // This tests that RemoveFromPending correctly updates pending state
+  // and that TryDispatchPendingPrefetches filters correctly
+  for (size_t i = 0; i < block_handles.size(); ++i) {
+    CachableEntry<Block> block;
+    Status read_status = read_set->ReadIndex(i, &block);
+    ASSERT_OK(read_status) << "Failed to read block " << i;
+    ASSERT_NE(block.GetValue(), nullptr) << "Block " << i << " is null";
+
+    // Release the block immediately after reading
+    read_set->ReleaseBlock(i);
+  }
+
+  // Verify total reads match
+  uint64_t total_reads = read_set->GetNumSyncReads() +
+                         read_set->GetNumAsyncReads() +
+                         read_set->GetNumCacheHits();
+  EXPECT_EQ(total_reads, block_handles.size());
+}
+
+// Test that sync prefetching respects memory limits
+TEST_F(IODispatcherTest, SyncPrefetchWithMemoryLimit) {
+  // Create dispatcher with a small memory limit
+  IODispatcherOptions opts;
+  opts.max_prefetch_memory_bytes = 50 * 1024;  // 50KB
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(opts));
+
+  std::unique_ptr<BlockBasedTable> table;
+  std::vector<BlockHandle> block_handles;
+  Status s = CreateAndOpenSST(20, &table, &block_handles);
+  ASSERT_OK(s);
+  ASSERT_GE(block_handles.size(), 10);
+
+  auto job = std::make_shared<IOJob>();
+  job->block_handles = block_handles;
+  job->table = table.get();
+  job->job_options.read_options.async_io = false;  // Sync IO
+
+  std::shared_ptr<ReadSet> read_set;
+  s = dispatcher->SubmitJob(job, &read_set);
+  ASSERT_OK(s);
+  ASSERT_NE(read_set, nullptr);
+
+  // All blocks should be readable even with memory limits
+  for (size_t i = 0; i < block_handles.size(); ++i) {
+    CachableEntry<Block> block;
+    Status read_status = read_set->ReadIndex(i, &block);
+    ASSERT_OK(read_status) << "Failed to read block " << i;
+    ASSERT_NE(block.GetValue(), nullptr) << "Block " << i << " is null";
+  }
+
+  // Verify all were sync reads
+  EXPECT_GT(read_set->GetNumSyncReads(), 0)
+      << "Expected sync reads with async_io=false";
+  EXPECT_EQ(read_set->GetNumAsyncReads(), 0)
+      << "Expected no async reads with async_io=false";
+}
+
+// Test that oversized blocks work correctly with sync IO
+TEST_F(IODispatcherTest, OversizedBlocksWithSyncIO) {
+  std::unique_ptr<BlockBasedTable> table;
+  std::vector<BlockHandle> block_handles;
+  Status s = CreateAndOpenSST(10, &table, &block_handles);
+  ASSERT_OK(s);
+  ASSERT_GE(block_handles.size(), 3);
+
+  // Calculate the size of a single block
+  size_t single_block_size =
+      BlockBasedTable::BlockSizeWithTrailer(block_handles[0]);
+
+  // Create dispatcher with memory limit smaller than a single block
+  // This means ALL blocks are "oversized"
+  IODispatcherOptions opts;
+  opts.max_prefetch_memory_bytes = single_block_size / 2;  // Half a block
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(opts));
+
+  auto job = std::make_shared<IOJob>();
+  job->block_handles = block_handles;
+  job->table = table.get();
+  job->job_options.read_options.async_io = false;  // Sync IO
+
+  std::shared_ptr<ReadSet> read_set;
+  s = dispatcher->SubmitJob(job, &read_set);
+  ASSERT_OK(s);
+  ASSERT_NE(read_set, nullptr);
+
+  // All blocks should still be readable via sync fallback
+  for (size_t i = 0; i < block_handles.size(); ++i) {
+    CachableEntry<Block> block;
+    Status read_status = read_set->ReadIndex(i, &block);
+    ASSERT_OK(read_status) << "Failed to read block " << i;
+    ASSERT_NE(block.GetValue(), nullptr) << "Block " << i << " is null";
+  }
+
+  // All reads should be sync
+  EXPECT_GT(read_set->GetNumSyncReads(), 0)
+      << "Expected sync reads for oversized blocks";
+}
+
+// Test that a single block larger than total memory budget still works
+TEST_F(IODispatcherTest, SingleBlockLargerThanTotalMemory) {
+  std::unique_ptr<BlockBasedTable> table;
+  std::vector<BlockHandle> block_handles;
+  Status s = CreateAndOpenSST(5, &table, &block_handles);
+  ASSERT_OK(s);
+  ASSERT_GE(block_handles.size(), 1);
+
+  // Set memory limit to 1 byte - smaller than any block
+  IODispatcherOptions opts;
+  opts.max_prefetch_memory_bytes = 1;
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(opts));
+
+  // Test with both sync and async modes
+  for (bool async : {false, true}) {
+    // Skip async if io_uring not available
+    if (async && !kIOUringPresent) {
+      continue;
+    }
+
+    auto job = std::make_shared<IOJob>();
+    job->block_handles = block_handles;
+    job->table = table.get();
+    job->job_options.read_options.async_io = async;
+
+    std::shared_ptr<ReadSet> read_set;
+    s = dispatcher->SubmitJob(job, &read_set);
+    ASSERT_OK(s) << "SubmitJob failed with async=" << async;
+    ASSERT_NE(read_set, nullptr);
+
+    // All blocks should be readable
+    for (size_t i = 0; i < block_handles.size(); ++i) {
+      CachableEntry<Block> block;
+      Status read_status = read_set->ReadIndex(i, &block);
+      ASSERT_OK(read_status)
+          << "Failed to read block " << i << " with async=" << async;
+      ASSERT_NE(block.GetValue(), nullptr)
+          << "Block " << i << " is null with async=" << async;
+    }
+  }
+}
+
+// Test that sync prefetching defers later groups and dispatches them
+// when memory is released
+TEST_F(IODispatcherTest, SyncPrefetchDefersAndDispatchesLaterGroups) {
+  std::unique_ptr<BlockBasedTable> table;
+  std::vector<BlockHandle> block_handles;
+  // Create 10+ blocks so we have enough to test deferred dispatch
+  Status s = CreateAndOpenSST(20, &table, &block_handles);
+  ASSERT_OK(s);
+  ASSERT_GE(block_handles.size(), 10);
+
+  // Calculate typical block size
+  size_t typical_block_size =
+      BlockBasedTable::BlockSizeWithTrailer(block_handles[0]);
+
+  // Set memory limit to fit approximately 3 blocks
+  // This should cause groups to be split and some deferred
+  IODispatcherOptions opts;
+  opts.max_prefetch_memory_bytes = typical_block_size * 3;
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(opts));
+
+  // Track dispatch calls
+  std::vector<size_t> dispatch_counts;
+  SyncPoint::GetInstance()->SetCallBack(
+      "IODispatcherImpl::DispatchPrefetch:BlockCount", [&](void* arg) {
+        auto* indices = static_cast<std::vector<size_t>*>(arg);
+        dispatch_counts.push_back(indices->size());
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  auto job = std::make_shared<IOJob>();
+  job->block_handles = block_handles;
+  job->table = table.get();
+  job->job_options.read_options.async_io = false;  // Sync IO
+
+  std::shared_ptr<ReadSet> read_set;
+  s = dispatcher->SubmitJob(job, &read_set);
+  ASSERT_OK(s);
+  ASSERT_NE(read_set, nullptr);
+
+  // After SubmitJob, some blocks should have been dispatched (first group)
+  // and remaining groups should be queued
+  size_t initial_dispatch_count = dispatch_counts.size();
+  EXPECT_GT(initial_dispatch_count, 0)
+      << "Expected at least one dispatch during SubmitJob";
+
+  // Read and release first few blocks - this should trigger deferred dispatch
+  for (size_t i = 0; i < 3 && i < block_handles.size(); ++i) {
+    CachableEntry<Block> block;
+    Status read_status = read_set->ReadIndex(i, &block);
+    ASSERT_OK(read_status);
+    ASSERT_NE(block.GetValue(), nullptr);
+    // Release to free memory
+    read_set->ReleaseBlock(i);
+  }
+
+  // After releasing blocks, more dispatches should have occurred
+  // as the pending queue gets processed
+  size_t dispatch_count_after_release = dispatch_counts.size();
+  EXPECT_GE(dispatch_count_after_release, initial_dispatch_count)
+      << "Expected more dispatches after releasing blocks";
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  // All remaining blocks should still be readable
+  for (size_t i = 3; i < block_handles.size(); ++i) {
+    CachableEntry<Block> block;
+    Status read_status = read_set->ReadIndex(i, &block);
+    ASSERT_OK(read_status) << "Failed to read block " << i;
+    ASSERT_NE(block.GetValue(), nullptr) << "Block " << i << " is null";
+  }
+}
+
+// Test that coalesced groups are properly split based on memory budget
+TEST_F(IODispatcherTest, CoalescedGroupsSplitByMemoryBudget) {
+  std::unique_ptr<BlockBasedTable> table;
+  std::vector<BlockHandle> block_handles;
+  Status s = CreateAndOpenSST(15, &table, &block_handles);
+  ASSERT_OK(s);
+  ASSERT_GE(block_handles.size(), 10);
+
+  // Calculate typical block size
+  size_t typical_block_size =
+      BlockBasedTable::BlockSizeWithTrailer(block_handles[0]);
+
+  // Set memory limit to fit exactly 5 blocks
+  // With 10+ blocks, we should get at least 2 groups
+  IODispatcherOptions opts;
+  opts.max_prefetch_memory_bytes = typical_block_size * 5;
+  std::unique_ptr<IODispatcher> dispatcher(NewIODispatcher(opts));
+
+  // Track how many blocks are in each dispatch call
+  std::vector<size_t> blocks_per_dispatch;
+  SyncPoint::GetInstance()->SetCallBack(
+      "IODispatcherImpl::DispatchPrefetch:BlockCount", [&](void* arg) {
+        auto* indices = static_cast<std::vector<size_t>*>(arg);
+        blocks_per_dispatch.push_back(indices->size());
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  auto job = std::make_shared<IOJob>();
+  job->block_handles = block_handles;
+  job->table = table.get();
+  job->job_options.read_options.async_io = false;
+
+  std::shared_ptr<ReadSet> read_set;
+  s = dispatcher->SubmitJob(job, &read_set);
+  ASSERT_OK(s);
+
+  // First dispatch should have at most 5 blocks (memory limit)
+  ASSERT_GT(blocks_per_dispatch.size(), 0);
+  EXPECT_LE(blocks_per_dispatch[0], 5)
+      << "First dispatch should be limited by memory budget";
+
+  // Read and release all blocks to trigger remaining dispatches
+  for (size_t i = 0; i < block_handles.size(); ++i) {
+    CachableEntry<Block> block;
+    Status read_status = read_set->ReadIndex(i, &block);
+    ASSERT_OK(read_status);
+    read_set->ReleaseBlock(i);
+  }
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  // Verify each dispatch was limited by memory budget
+  for (size_t i = 0; i < blocks_per_dispatch.size(); ++i) {
+    EXPECT_LE(blocks_per_dispatch[i], 5)
+        << "Dispatch " << i << " exceeded memory budget";
+  }
+}
+
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {

From 6284a7984711d2661656e5a8e0fdd1a19e31a998 Mon Sep 17 00:00:00 2001
From: Josh Kang <jkangs@meta.com>
Date: Fri, 6 Feb 2026 13:41:32 -0800
Subject: [PATCH 464/500] Upgrade clang format in CI to 21.1.2 (#14311)

Summary:
To make CI consistent with internal meta clang version.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14311

Test Plan:
CI shows correct version
```
Successfully installed clang-format-21.1.2
clang-format version 21.1.2
```

Reviewed By: xingbowang

Differential Revision: D92535441

Pulled By: joshkang97

fbshipit-source-id: ea21ea97b13a35b286f0c2ce18b3f01ffbf49afd
---
 .github/workflows/pr-jobs.yml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.github/workflows/pr-jobs.yml b/.github/workflows/pr-jobs.yml
index a3cfcdbce73e..6e8080078095 100644
--- a/.github/workflows/pr-jobs.yml
+++ b/.github/workflows/pr-jobs.yml
@@ -44,6 +44,10 @@ jobs:
       run: python -m pip install --upgrade pip
     - name: Install argparse
       run: pip install argparse
+    - name: Install clang-format
+      run: |
+        pip install https://files.pythonhosted.org/packages/fb/ac/3c04772acc0257f5730e83adb542b2603c1a62d1315010ab593a980af404/clang_format-21.1.2-py2.py3-none-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl
+        clang-format --version
     - name: Download clang-format-diff.py
       run: wget https://rocksdb-deps.s3.us-west-2.amazonaws.com/llvm/llvm-project/release/12.x/clang/tools/clang-format/clang-format-diff.py
     - name: Check format

From 51feb255670d6659a2a2734a742bcde17c88a198 Mon Sep 17 00:00:00 2001
From: Richard Barnes <rbarnes@meta.com>
Date: Mon, 9 Feb 2026 09:21:12 -0800
Subject: [PATCH 465/500] Fix string-conversion issue in
 internal_repo_rocksdb/repo/utilities/persistent_cache/block_cache_tier_file.cc
 +2 (#14312)

Summary:
Pull Request resolved: https://github.com/facebook/rocksdb/pull/14312

This could is triggering `-Wstring-conversion`, which presents as:
```
warning: implicit conversion turns string literal into bool: A to B
```
This is often a bug and what was intended. The most frequent cause is the code was:
```
void foo(bool) { ... }
void foo(std::string) { ... }
foo("this gets interpreted as a bool");
```

It is also possible the issue is innocuous as part of an assert:
```
assert(!"this string is true, so the assertion is false");
EXPECT_FALSE("this string is true, so the expect fails");
```
in these cases the use is to "cute", so we modify the code to make it more obvious.
```
assert(false && "the compiler recognizes and doesn't complain about this pattern");
FAIL() << "much more obvious";
```

Reviewed By: dmm-fb

Differential Revision: D92528316

fbshipit-source-id: 93fbb624e8731c4cdb559746b44c1aa71d786304
---
 utilities/persistent_cache/block_cache_tier_file.cc | 4 ++--
 utilities/persistent_cache/block_cache_tier_file.h  | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/utilities/persistent_cache/block_cache_tier_file.cc b/utilities/persistent_cache/block_cache_tier_file.cc
index 493b92236753..110e74c9e0de 100644
--- a/utilities/persistent_cache/block_cache_tier_file.cc
+++ b/utilities/persistent_cache/block_cache_tier_file.cc
@@ -254,7 +254,7 @@ bool RandomAccessCacheFile::ParseRec(const LBA& lba, Slice* key, Slice* val,
 
   CacheRecord rec;
   if (!rec.Deserialize(data)) {
-    assert(!"Error deserializing data");
+    assert(false && "Error deserializing data");
     Error(log_, "Error de-serializing record from file %s off %d",
           Path().c_str(), lba.off_);
     return false;
@@ -339,7 +339,7 @@ bool WriteableCacheFile::Append(const Slice& key, const Slice& val, LBA* lba) {
   CacheRecord rec(key, val);
   if (!rec.Serialize(&bufs_, &buf_woff_)) {
     // unexpected error: unable to serialize the data
-    assert(!"Error serializing record");
+    assert(false && "Error serializing record");
     return false;
   }
 
diff --git a/utilities/persistent_cache/block_cache_tier_file.h b/utilities/persistent_cache/block_cache_tier_file.h
index 7f329695f52c..82ee40d07369 100644
--- a/utilities/persistent_cache/block_cache_tier_file.h
+++ b/utilities/persistent_cache/block_cache_tier_file.h
@@ -101,14 +101,14 @@ class BlockCacheFile : public LRUElement<BlockCacheFile> {
   // append key/value to file and return LBA locator to user
   virtual bool Append(const Slice& /*key*/, const Slice& /*val*/,
                       LBA* const /*lba*/) {
-    assert(!"not implemented");
+    assert(false && "not implemented");
     return false;
   }
 
   // read from the record locator (LBA) and return key, value and status
   virtual bool Read(const LBA& /*lba*/, Slice* /*key*/, Slice* /*block*/,
                     char* /*scratch*/) {
-    assert(!"not implemented");
+    assert(false && "not implemented");
     return false;
   }
 

From 56cb88ec7958bf8cae1a247a9d7b6b945350aa15 Mon Sep 17 00:00:00 2001
From: Anand Ananthabhotla <anand76@meta.com>
Date: Tue, 10 Feb 2026 15:05:12 -0800
Subject: [PATCH 466/500] Fix racy assertion in AbortIOPartialHandlesBug test
 (#14319)

Summary:
Pull Request resolved: https://github.com/facebook/rocksdb/pull/14319

The test asserted h1->is_finished and h2->is_finished immediately after AbortIO({H0}), before calling Poll. This is invalid because AbortIO only guarantees that handles in its abort set are finalized. Non-aborted handles' CQEs may or may not be consumed during AbortIO depending on io_uring completion ordering. If H0's two CQEs (original read + cancel) arrive before H1/H2's CQEs, AbortIO breaks out of its wait loop without processing them. Move the H1/H2 is_finished assertions to after Poll, which correctly handles either case. Also remove the racy req_count checks for non-aborted handles since Poll does not increment req_count.

Reviewed By: jaykorean

Differential Revision: D92848827

fbshipit-source-id: 0c09b44ceada99877e8311cff799fa94f1056545
---
 env/env_test.cc | 84 ++++++++++++++++++++++++++++++-------------------
 1 file changed, 51 insertions(+), 33 deletions(-)

diff --git a/env/env_test.cc b/env/env_test.cc
index 68c5c90e4c51..68efa41c2c0b 100644
--- a/env/env_test.cc
+++ b/env/env_test.cc
@@ -3744,32 +3744,35 @@ void TestAbortIOWithRequests(
   std::shared_ptr<FileSystem> fs = env->GetFileSystem();
   std::string fname = test::PerThreadDBPath(env, "testfile_abortio");
 
-  constexpr size_t kSectorSize = 4096;
+  // 1. Create test file once (content doesn't change between iterations)
+  {
+    std::unique_ptr<FSWritableFile> wfile;
+    FileOptions file_opts;
+    file_opts.use_direct_writes = true;
+    ASSERT_OK(fs->NewWritableFile(fname, file_opts, &wfile, nullptr));
 
-  for (int iter = 0; iter < iterations; iter++) {
-    // 1. Create test file of specified size using direct IO
-    {
-      std::unique_ptr<FSWritableFile> wfile;
-      FileOptions file_opts;
-      file_opts.use_direct_writes = true;
-      ASSERT_OK(fs->NewWritableFile(fname, file_opts, &wfile, nullptr));
-
-      // Round up to full sectors for direct IO writes
-      size_t num_sectors = (file_size + kSectorSize - 1) / kSectorSize;
-      for (size_t i = 0; i < num_sectors; ++i) {
-        auto data = NewAligned(kSectorSize, static_cast<char>(i + 1));
-        Slice slice(data.get(), kSectorSize);
-        ASSERT_OK(wfile->Append(slice, IOOptions(), nullptr));
-      }
+    // Query the file's required buffer alignment (logical block size)
+    // instead of hardcoding 4096, to support devices with different
+    // sector sizes.
+    size_t sector_size = wfile->GetRequiredBufferAlignment();
 
-      // Truncate to exact file size if not aligned to sector boundary
-      if (file_size % kSectorSize != 0) {
-        ASSERT_OK(wfile->Truncate(file_size, IOOptions(), nullptr));
-      }
+    // Round up to full sectors for direct IO writes
+    size_t num_sectors = (file_size + sector_size - 1) / sector_size;
+    for (size_t i = 0; i < num_sectors; ++i) {
+      auto data = NewAligned(sector_size, static_cast<char>(i + 1));
+      Slice slice(data.get(), sector_size);
+      ASSERT_OK(wfile->Append(slice, IOOptions(), nullptr));
+    }
 
-      ASSERT_OK(wfile->Close(IOOptions(), nullptr));
+    // Truncate to exact file size if not aligned to sector boundary
+    if (file_size % sector_size != 0) {
+      ASSERT_OK(wfile->Truncate(file_size, IOOptions(), nullptr));
     }
 
+    ASSERT_OK(wfile->Close(IOOptions(), nullptr));
+  }
+
+  for (int iter = 0; iter < iterations; iter++) {
     // 2. Submit ReadAsync requests and immediately abort
     {
       FileOptions file_opts;
@@ -3784,6 +3787,7 @@ void TestAbortIOWithRequests(
       std::vector<std::unique_ptr<char, Deleter>> data;
       std::vector<size_t> vals;
       IOHandleDeleter del_fn;
+      std::atomic<int> callbacks_invoked{0};
 
       // Initialize read requests from specs
       for (size_t i = 0; i < num_reads; i++) {
@@ -3799,6 +3803,7 @@ void TestAbortIOWithRequests(
           [&](FSReadRequest& req, void* cb_arg) {
             size_t i = *(reinterpret_cast<size_t*>(cb_arg));
             reqs[i].status = req.status;
+            callbacks_invoked++;
           };
 
       // Submit all ReadAsync requests
@@ -3825,6 +3830,15 @@ void TestAbortIOWithRequests(
       // Immediately call AbortIO - this should NOT hang
       ASSERT_OK(fs->AbortIO(io_handles));
 
+      // Verify all handles are finished and all callbacks were invoked.
+      // Since all handles are passed to AbortIO, every handle is guaranteed
+      // to be finalized (either completed or cancelled).
+      for (size_t i = 0; i < num_reads; i++) {
+        Posix_IOHandle* h = static_cast<Posix_IOHandle*>(io_handles[i]);
+        ASSERT_TRUE(h->is_finished);
+      }
+      ASSERT_EQ(callbacks_invoked.load(), static_cast<int>(num_reads));
+
       // Clean up handles
       for (size_t i = 0; i < num_reads; i++) {
         if (io_handles[i]) {
@@ -3832,10 +3846,10 @@ void TestAbortIOWithRequests(
         }
       }
     }
-
-    ASSERT_OK(fs->DeleteFile(fname, IOOptions(), nullptr));
   }
 
+  ASSERT_OK(fs->DeleteFile(fname, IOOptions(), nullptr));
+
   fprintf(stderr, "TestAbortIOWithRequests: completed %d iterations\n",
           iterations);
 #else
@@ -4026,19 +4040,17 @@ TEST_F(TestAsyncRead, AbortIOPartialHandlesBug) {
     ASSERT_TRUE(h0->is_finished);
     ASSERT_EQ(h0->req_count, 2u);  // original + cancel
 
-    // Verify H1 and H2 are finished (read completed, not aborted)
-    Posix_IOHandle* h1 = static_cast<Posix_IOHandle*>(io_handles[1]);
-    Posix_IOHandle* h2 = static_cast<Posix_IOHandle*>(io_handles[2]);
-    ASSERT_TRUE(h1->is_finished);
-    ASSERT_TRUE(h2->is_finished);
-    ASSERT_EQ(h1->req_count, 1u);  // only original (no cancel)
-    ASSERT_EQ(h2->req_count, 1u);  // only original (no cancel)
+    // Note: H1 and H2 may or may not be finished at this point. AbortIO
+    // finalizes non-aborted handles whose CQEs arrive while waiting for
+    // aborted handles, but CQE ordering is non-deterministic. If H0's
+    // completions arrived first, H1/H2's CQEs are still in the queue.
+    // Poll handles either case correctly.
 
-    // Poll on H1, H2 - should return immediately since they're already finished
-    // Note: Poll must be called from the same thread (io_uring is thread-local)
+    // Poll on H1, H2 - completes them if not already finalized by AbortIO
     std::vector<void*> poll_handles = {io_handles[1], io_handles[2]};
 
-    // Use a watchdog to detect hang (regression test)
+    // Use a watchdog to detect hang (regression test for the original bug
+    // where AbortIO consumed non-aborted CQEs without finalizing them)
     std::atomic<bool> poll_completed{false};
     std::thread watchdog([&]() {
       for (int i = 0; i < 500; i++) {  // 5 seconds timeout
@@ -4053,6 +4065,12 @@ TEST_F(TestAsyncRead, AbortIOPartialHandlesBug) {
     poll_completed = true;
     watchdog.join();
 
+    // After Poll, H1 and H2 must be finished
+    Posix_IOHandle* h1 = static_cast<Posix_IOHandle*>(io_handles[1]);
+    Posix_IOHandle* h2 = static_cast<Posix_IOHandle*>(io_handles[2]);
+    ASSERT_TRUE(h1->is_finished);
+    ASSERT_TRUE(h2->is_finished);
+
     // Verify all callbacks were invoked
     ASSERT_EQ(callbacks_invoked.load(), 3);
 

From 3148c6cad43e820680a4bf4ebeb48d39f4c44c63 Mon Sep 17 00:00:00 2001
From: Richard Barnes <rbarnes@meta.com>
Date: Wed, 11 Feb 2026 14:09:12 -0800
Subject: [PATCH 467/500] Fix string-conversion issue in
 internal_repo_rocksdb/repo/table/block_based/index_builder.cc +1

Summary:
This could is triggering `-Wstring-conversion`, which presents as:
```
warning: implicit conversion turns string literal into bool: A to B
```
This is often a bug and what was intended. The most frequent cause is the code was:
```
void foo(bool) { ... }
void foo(std::string) { ... }
foo("this gets interpreted as a bool");
```

It is also possible the issue is innocuous as part of an assert:
```
assert(false && "this string is true, so the assertion is false");
EXPECT_FALSE("this string is true, so the expect fails");
```
in these cases the use is to "cute", so we modify the code to make it more obvious.
```
assert(false && "the compiler recognizes and doesn't complain about this pattern");
FAIL() << "much more obvious";
```

Differential Revision: D92886041

fbshipit-source-id: 6adfaa102f12e293491cc579ec92b48834d1d0a8
---
 table/block_based/index_builder.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/table/block_based/index_builder.cc b/table/block_based/index_builder.cc
index 7731f42790fa..8de01f0b7a22 100644
--- a/table/block_based/index_builder.cc
+++ b/table/block_based/index_builder.cc
@@ -66,7 +66,7 @@ IndexBuilder* IndexBuilder::CreateIndexBuilder(
       break;
     }
     default: {
-      assert(!"Do not recognize the index type ");
+      assert(false && "Do not recognize the index type ");
       break;
     }
   }

From d8b1893c9d4e77a140d451405b6efd6583eac649 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Wed, 11 Feb 2026 14:43:41 -0800
Subject: [PATCH 468/500] DROP support for block-based SST format_version < 2
 (#14315)

Summary:
... and remove some old code and tech debt in the process.

This is arguably a great milestone and precendent in RocksDB history as for the first time we are explicitly dropping support for the ability to read source-of-truth data in old formats. (We previously dropped support for reading some old bloom filters, but those are performance optimizers not source-of-truth. https://github.com/facebook/rocksdb/issues/10184) However, DBs written with default settings since release 4.6.0, which is very nearly 10 years ago, can still be read. And by using compaction with intermediate versions, there's an upgrade path going back to (AFAIK) early releases of LevelDB (from which RocksDB was forked).

Some detail:
* The magic number for LevelDB SST files (0xdb4775248b80fb57, most recently called kLegacyBlockBasedTableMagicNumber) now only exists in the code to provide a good error message and to test that good error message.
* There is some notable refactoring and renaming around format_version handling. This is a bit of a messy area of code because the footer code being shared between different table formats (block-based, plain, cuckoo) means format_version in the footer is in ways tied to all of them, but in other ways is just tied to block-based table where we have been making updates. Hopefully code comments keep this clear.
* Now that there are old format_versions we can't read (and can't write authoritatively in tests), I've needed to split out kMinSupportedFormatVersion into a constant for reads and for writes, currently the same at format_version=2. Comments describe how to update these in the future.
* The idea of versioning the compression format is basically going away, though we're keeping BuiltinV2 in places just because it's already there. There's lots of room in the BuiltinV2 schema to expand to new built-in compression types, or new ways of handling existing compression algorithms. CompressionManager with CompatibilityName gives users the power to customize compression without the need for versions tied to format_version.

Immediate follow-up:
* Clean up compression loose ends like OLD_Compress, OLD_Uncompress

Suggested follow-up:
* Update plain table builder to migrate to new footer version so that we can drop support for legacy footer. We have to be careful that the (likely untested) forward compatibility path I put in place a while back works (or fix it and wait a while) before dropping support for plain table with legacy footer.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14315

Test Plan:
* Some tests updated / added
* A couple tests are obsolete: removed
* Also updated format compatible test, which now doesn't need to dig as far back into history building RocksDB.

Reviewed By: hx235

Differential Revision: D92577766

Pulled By: pdillinger

fbshipit-source-id: a23be846189d901ce087af4ca9a99cef18445cb7
---
 cache/compressed_secondary_cache.cc           |   2 +-
 db/column_family_test.cc                      |   8 +-
 db/comparator_db_test.cc                      |   2 +-
 db/db_bloom_filter_test.cc                    |   8 +-
 db/db_test.cc                                 |  66 -----
 db/db_test_util.cc                            |   2 +-
 include/rocksdb/table.h                       |   5 +-
 table/adaptive/adaptive_table_factory.cc      |   3 +-
 .../block_based/block_based_table_builder.cc  |  20 +-
 table/block_based/block_based_table_builder.h |   1 -
 .../block_based/block_based_table_factory.cc  |  31 +-
 table/block_based/block_based_table_factory.h |   2 -
 table/block_based/block_based_table_reader.cc |  18 +-
 table/block_based/block_test.cc               |   3 +-
 .../partitioned_filter_block_test.cc          |   2 +-
 table/block_fetcher_test.cc                   |   3 +-
 table/format.cc                               |  41 ++-
 table/format.h                                |  56 +++-
 table/sst_file_dumper.cc                      |   3 +-
 table/table_test.cc                           | 266 +++++++++++-------
 test_util/testutil.cc                         |   2 +-
 tools/check_format_compatible.sh              |   5 +-
 tools/sst_dump_tool.cc                        |   2 +-
 .../public_api_changes/remove_fv_1.md         |   1 +
 util/compression.cc                           | 175 +-----------
 util/compression.h                            |  13 -
 26 files changed, 289 insertions(+), 451 deletions(-)
 create mode 100644 unreleased_history/public_api_changes/remove_fv_1.md

diff --git a/cache/compressed_secondary_cache.cc b/cache/compressed_secondary_cache.cc
index 5a53471725f2..d07a099ec872 100644
--- a/cache/compressed_secondary_cache.cc
+++ b/cache/compressed_secondary_cache.cc
@@ -50,7 +50,7 @@ CompressedSecondaryCache::CompressedSecondaryCache(
           std::make_shared<CacheReservationManagerImpl<CacheEntryRole::kMisc>>(
               cache_))),
       disable_cache_(opts.capacity == 0) {
-  auto mgr = GetBuiltinCompressionManager(/*compression_format_version=*/2);
+  auto mgr = GetBuiltinV2CompressionManager();
   compressor_ = mgr->GetCompressor(cache_options_.compression_opts,
                                    cache_options_.compression_type);
   decompressor_ =
diff --git a/db/column_family_test.cc b/db/column_family_test.cc
index 3a2ca0617636..8579ab525076 100644
--- a/db/column_family_test.cc
+++ b/db/column_family_test.cc
@@ -517,7 +517,7 @@ class ColumnFamilyTest
 INSTANTIATE_TEST_CASE_P(FormatDef, ColumnFamilyTest,
                         testing::Values(test::kDefaultFormatVersion));
 INSTANTIATE_TEST_CASE_P(FormatLatest, ColumnFamilyTest,
-                        testing::Values(kLatestFormatVersion));
+                        testing::Values(kLatestBbtFormatVersion));
 
 TEST_P(ColumnFamilyTest, DontReuseColumnFamilyID) {
   for (int iter = 0; iter < 3; ++iter) {
@@ -707,8 +707,8 @@ INSTANTIATE_TEST_CASE_P(
                     std::make_tuple(test::kDefaultFormatVersion, false)));
 INSTANTIATE_TEST_CASE_P(
     FormatLatest, FlushEmptyCFTestWithParam,
-    testing::Values(std::make_tuple(kLatestFormatVersion, true),
-                    std::make_tuple(kLatestFormatVersion, false)));
+    testing::Values(std::make_tuple(kLatestBbtFormatVersion, true),
+                    std::make_tuple(kLatestBbtFormatVersion, false)));
 
 TEST_P(ColumnFamilyTest, AddDrop) {
   Open();
@@ -3636,7 +3636,7 @@ TEST(ColumnFamilyTest, ValidateMemtableKVChecksumOption) {
 // the behavior of manual flush is that it skips retaining UDTs.
 class ColumnFamilyRetainUDTTest : public ColumnFamilyTestBase {
  public:
-  ColumnFamilyRetainUDTTest() : ColumnFamilyTestBase(kLatestFormatVersion) {}
+  ColumnFamilyRetainUDTTest() : ColumnFamilyTestBase(kLatestBbtFormatVersion) {}
 
   void SetUp() override {
     db_options_.allow_concurrent_memtable_write = false;
diff --git a/db/comparator_db_test.cc b/db/comparator_db_test.cc
index f9c0f47ef7be..af7355c5144d 100644
--- a/db/comparator_db_test.cc
+++ b/db/comparator_db_test.cc
@@ -318,7 +318,7 @@ class ComparatorDBTest
 INSTANTIATE_TEST_CASE_P(FormatDef, ComparatorDBTest,
                         testing::Values(test::kDefaultFormatVersion));
 INSTANTIATE_TEST_CASE_P(FormatLatest, ComparatorDBTest,
-                        testing::Values(kLatestFormatVersion));
+                        testing::Values(kLatestBbtFormatVersion));
 
 TEST_P(ComparatorDBTest, Bytewise) {
   for (int rand_seed = 301; rand_seed < 306; rand_seed++) {
diff --git a/db/db_bloom_filter_test.cc b/db/db_bloom_filter_test.cc
index c268400c78fe..51b259aeb8d9 100644
--- a/db/db_bloom_filter_test.cc
+++ b/db/db_bloom_filter_test.cc
@@ -922,14 +922,14 @@ INSTANTIATE_TEST_CASE_P(
     ::testing::Values(
         std::make_tuple(kAutoBloom,
                         FilterPartitioning::kCoupledPartitionedFilter,
-                        kLatestFormatVersion),
+                        kLatestBbtFormatVersion),
         std::make_tuple(kAutoBloom,
                         FilterPartitioning::kDecoupledPartitionedFilter,
-                        kLatestFormatVersion),
+                        kLatestBbtFormatVersion),
         std::make_tuple(kAutoBloom, FilterPartitioning::kUnpartitionedFilter,
-                        kLatestFormatVersion),
+                        kLatestBbtFormatVersion),
         std::make_tuple(kAutoRibbon, FilterPartitioning::kUnpartitionedFilter,
-                        kLatestFormatVersion)));
+                        kLatestBbtFormatVersion)));
 #endif  // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
 
 TEST_F(DBBloomFilterTest, BloomFilterRate) {
diff --git a/db/db_test.cc b/db/db_test.cc
index 7909763ed0a5..928808cffee1 100644
--- a/db/db_test.cc
+++ b/db/db_test.cc
@@ -6184,72 +6184,6 @@ TEST_F(DBTest, L0L1L2AndUpHitCounter) {
                                TestGetTickerCount(options, GET_HIT_L2_AND_UP));
 }
 
-TEST_F(DBTest, EncodeDecompressedBlockSizeTest) {
-  // Allow testing format_version=1
-  bool& allow_unsupported_fv = TEST_AllowUnsupportedFormatVersion();
-  SaveAndRestore guard(&allow_unsupported_fv);
-  ASSERT_FALSE(allow_unsupported_fv);
-
-  // iter 0 -- zlib
-  // iter 1 -- bzip2
-  // iter 2 -- lz4
-  // iter 3 -- lz4HC
-  // iter 4 -- xpress
-  CompressionType compressions[] = {kZlibCompression, kBZip2Compression,
-                                    kLZ4Compression, kLZ4HCCompression,
-                                    kXpressCompression};
-  for (auto comp : compressions) {
-    if (!CompressionTypeSupported(comp)) {
-      continue;
-    }
-    // first_table_version 1 -- generate with table_version == 1, read with
-    // table_version == 2
-    // first_table_version 2 -- generate with table_version == 2, read with
-    // table_version == 1
-    for (int first_table_version = 1; first_table_version <= 2;
-         ++first_table_version) {
-      BlockBasedTableOptions table_options;
-      table_options.format_version = first_table_version;
-      table_options.filter_policy.reset(NewBloomFilterPolicy(10));
-      Options options = CurrentOptions();
-
-      // Hack to generate old files (checked in factory construction)
-      allow_unsupported_fv = true;
-      options.table_factory.reset(NewBlockBasedTableFactory(table_options));
-      ASSERT_EQ(options.table_factory->GetOptions<BlockBasedTableOptions>()
-                    ->format_version,
-                first_table_version);
-      // Able to read old files without the hack
-      allow_unsupported_fv = false;
-
-      options.create_if_missing = true;
-      options.compression = comp;
-      DestroyAndReopen(options);
-
-      int kNumKeysWritten = 1000;
-
-      Random rnd(301);
-      for (int i = 0; i < kNumKeysWritten; ++i) {
-        // compressible string
-        ASSERT_OK(Put(Key(i), rnd.RandomString(128) + std::string(128, 'a')));
-      }
-      ASSERT_OK(Flush());
-
-      table_options.format_version = first_table_version == 1 ? 2 : 1;
-      options.table_factory.reset(NewBlockBasedTableFactory(table_options));
-      // format_version (for writing) is sanitized to minimum supported
-      ASSERT_EQ(options.table_factory->GetOptions<BlockBasedTableOptions>()
-                    ->format_version,
-                BlockBasedTableFactory::kMinSupportedFormatVersion);
-      Reopen(options);
-      for (int i = 0; i < kNumKeysWritten; ++i) {
-        auto r = Get(Key(i));
-        ASSERT_EQ(r.substr(128), std::string(128, 'a'));
-      }
-    }
-  }
-}
-
 TEST_F(DBTest, CloseSpeedup) {
   Options options = CurrentOptions();
   options.compaction_style = kCompactionStyleLevel;
diff --git a/db/db_test_util.cc b/db/db_test_util.cc
index a0608b30b4b8..1d2c0f268dbd 100644
--- a/db/db_test_util.cc
+++ b/db/db_test_util.cc
@@ -520,7 +520,7 @@ Options DBTestBase::GetOptions(
     }
     case kBlockBasedTableWithLatestFormat: {
       // In case different from default
-      table_options.format_version = kLatestFormatVersion;
+      table_options.format_version = kLatestBbtFormatVersion;
       break;
     }
     case kOptimizeFiltersForHits: {
diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h
index 1f46217fbca3..9be34a0284e3 100644
--- a/include/rocksdb/table.h
+++ b/include/rocksdb/table.h
@@ -558,8 +558,9 @@ struct BlockBasedTableOptions {
   uint32_t read_amp_bytes_per_bit = 0;
 
   // We currently have these format versions:
-  // 0 - 1 -- Unsupported for writing new files and quietly sanitized to 2.
-  // Read support is deprecated and could be removed in the future.
+  // 0 - 1 -- No longer supported. Attempting to read files with these format
+  // versions will return an error. To upgrade, load the data with RocksDB
+  // >= 4.6.0 and < 11.0.0, then run a full compaction.
   // 2 -- Can be read by RocksDB's versions since 3.10. Changes the way we
   // encode compressed blocks with LZ4, BZip2 and Zlib compression. If you
   // don't plan to run RocksDB before version 3.10, you should probably use
diff --git a/table/adaptive/adaptive_table_factory.cc b/table/adaptive/adaptive_table_factory.cc
index f06b265328f8..db3f7625a710 100644
--- a/table/adaptive/adaptive_table_factory.cc
+++ b/table/adaptive/adaptive_table_factory.cc
@@ -51,8 +51,7 @@ Status AdaptiveTableFactory::NewTableReader(
       footer.table_magic_number() == kLegacyPlainTableMagicNumber) {
     return plain_table_factory_->NewTableReader(
         table_reader_options, std::move(file), file_size, table);
-  } else if (footer.table_magic_number() == kBlockBasedTableMagicNumber ||
-             footer.table_magic_number() == kLegacyBlockBasedTableMagicNumber) {
+  } else if (footer.table_magic_number() == kBlockBasedTableMagicNumber) {
     return block_based_table_factory_->NewTableReader(
         ro, table_reader_options, std::move(file), file_size, table,
         prefetch_index_and_filter_in_cache);
diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc
index bbd1ddde8135..46197d35dc24 100644
--- a/table/block_based/block_based_table_builder.cc
+++ b/table/block_based/block_based_table_builder.cc
@@ -134,9 +134,6 @@ Compressor* MaybeCloneSpecialized(
 // allocated
 // it must be not extern in one place.
 const uint64_t kBlockBasedTableMagicNumber = 0x88e241b785f4cff7ull;
-// We also support reading and writing legacy block based table format (for
-// backwards compatibility)
-const uint64_t kLegacyBlockBasedTableMagicNumber = 0xdb4775248b80fb57ull;
 
 // A collector that collects properties of interest to block-based table.
 // For now this class looks heavy-weight since we only write one additional
@@ -1096,10 +1093,7 @@ struct BlockBasedTableBuilder::Rep {
     auto* mgr = tbo.moptions.compression_manager.get();
     if (mgr == nullptr) {
       uses_explicit_compression_manager = false;
-      mgr = GetBuiltinCompressionManager(
-                GetCompressFormatForVersion(
-                    static_cast<uint32_t>(table_opt.format_version)))
-                .get();
+      mgr = GetBuiltinV2CompressionManager().get();
     } else {
       uses_explicit_compression_manager = true;
 
@@ -1186,8 +1180,7 @@ struct BlockBasedTableBuilder::Rep {
     }
 
     if (sample_for_compression > 0) {
-      auto builtin = GetBuiltinCompressionManager(
-          GetCompressFormatForVersion(table_opt.format_version));
+      auto builtin = GetBuiltinV2CompressionManager();
       if (builtin->SupportsCompressionType(kLZ4Compression)) {
         fast_sample_compressor = builtin->GetCompressor({}, kLZ4Compression);
       } else if (builtin->SupportsCompressionType(kSnappyCompression)) {
@@ -1388,11 +1381,10 @@ struct BlockBasedTableBuilder::Rep {
       // Use legacy compression_name property, populated at the end of
       // building the file. Not compatible with compression managers using
       // custom algorithms / compression types.
-      assert(Slice(mgr->CompatibilityName())
-                 .compare(GetBuiltinCompressionManager(
-                              GetCompressFormatForVersion(
-                                  static_cast<uint32_t>(props.format_version)))
-                              ->CompatibilityName()) == 0);
+      assert(
+          Slice(mgr->CompatibilityName())
+              .compare(GetBuiltinV2CompressionManager()->CompatibilityName()) ==
+          0);
     }
   }
   void PostPopulateCompressionProperties() {
diff --git a/table/block_based/block_based_table_builder.h b/table/block_based/block_based_table_builder.h
index 1bd2bcc2b30a..0988f2b959ae 100644
--- a/table/block_based/block_based_table_builder.h
+++ b/table/block_based/block_based_table_builder.h
@@ -35,7 +35,6 @@ class WritableFile;
 struct BlockBasedTableOptions;
 
 extern const uint64_t kBlockBasedTableMagicNumber;
-extern const uint64_t kLegacyBlockBasedTableMagicNumber;
 
 class BlockBasedTableBuilder : public TableBuilder {
  public:
diff --git a/table/block_based/block_based_table_factory.cc b/table/block_based/block_based_table_factory.cc
index 64ae8b0e19eb..3101a82cd50b 100644
--- a/table/block_based/block_based_table_factory.cc
+++ b/table/block_based/block_based_table_factory.cc
@@ -485,19 +485,20 @@ void BlockBasedTableFactory::InitializeOptions() {
     }
   }
 
-  if (table_options_.format_version < kMinSupportedFormatVersion) {
+  if (table_options_.format_version < kMinSupportedBbtFormatVersionForWrite) {
+    // In TEST mode, allow writing format versions that are at least supported
+    // for reading (so that we have a way of testing the read side).
     if (TEST_AllowUnsupportedFormatVersion()) {
-      // Allow old format version for testing.
-      // And relevant old sanitization.
-      if (table_options_.format_version == 0 &&
-          table_options_.checksum != kCRC32c) {
-        // silently convert format_version to 1 to support non-CRC32c checksum
-        table_options_.format_version = 1;
+      if (table_options_.format_version <
+          kMinSupportedBbtFormatVersionForRead) {
+        table_options_.format_version = kMinSupportedBbtFormatVersionForWrite;
       }
     } else {
-      table_options_.format_version = kMinSupportedFormatVersion;
+      table_options_.format_version = kMinSupportedBbtFormatVersionForWrite;
     }
   }
+  // NOTE: do not sanitize too high format_version, so that it can be rejected
+  // in validation
 }
 
 Status BlockBasedTableFactory::PrepareOptions(const ConfigOptions& opts) {
@@ -627,8 +628,14 @@ Status BlockBasedTableFactory::ValidateOptions(
         "Enable pin_l0_filter_and_index_blocks_in_cache, "
         ", but block cache is disabled");
   }
-  if (!IsSupportedFormatVersion(table_options_.format_version) &&
-      !TEST_AllowUnsupportedFormatVersion()) {
+  // In TEST mode, also allow writing
+  // (a) old format_versions that for users are only supported for reads
+  // (b) future "draft" format versions that are not yet published to users
+  if (!(IsSupportedFormatVersionForWrite(kBlockBasedTableMagicNumber,
+                                         table_options_.format_version) ||
+        (TEST_AllowUnsupportedFormatVersion() &&
+         table_options_.format_version >=
+             kMinSupportedBbtFormatVersionForRead))) {
     return Status::InvalidArgument(
         "Unsupported BlockBasedTable format_version. Please check "
         "include/rocksdb/table.h for more info");
@@ -636,9 +643,7 @@ Status BlockBasedTableFactory::ValidateOptions(
   bool using_builtin_compatible_compression = true;
   if (cf_opts.compression_manager &&
       strcmp(cf_opts.compression_manager->CompatibilityName(),
-             GetBuiltinCompressionManager(
-                 GetCompressFormatForVersion(table_options_.format_version))
-                 ->CompatibilityName()) != 0) {
+             GetBuiltinV2CompressionManager()->CompatibilityName()) != 0) {
     if (FormatVersionUsesCompressionManagerName(
             table_options_.format_version)) {
       using_builtin_compatible_compression = false;
diff --git a/table/block_based/block_based_table_factory.h b/table/block_based/block_based_table_factory.h
index d1d13f4e2108..b05b45660401 100644
--- a/table/block_based/block_based_table_factory.h
+++ b/table/block_based/block_based_table_factory.h
@@ -87,8 +87,6 @@ class BlockBasedTableFactory : public TableFactory {
     return &shared_state_->tail_prefetch_stats;
   }
 
-  static constexpr int kMinSupportedFormatVersion = 2;
-
  protected:
   const void* GetOptionsPtr(const std::string& name) const override;
   Status ParseOption(const ConfigOptions& config_options,
diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc
index b2d6a1e55813..b7e660f29ab0 100644
--- a/table/block_based/block_based_table_reader.cc
+++ b/table/block_based/block_based_table_reader.cc
@@ -576,9 +576,8 @@ Status GetDecompressor(const std::string& compression_name,
                        std::shared_ptr<Decompressor>* out_decompressor) {
   if (compression_name.empty()) {
     // Very old file (before RocksDB 4.9.0) that might contain compressed
-    // blocks. Get a general decompressor for the format version.
-    auto mgr_to_use = GetBuiltinCompressionManager(
-        GetCompressFormatForVersion(table_format_version));
+    // blocks. Get a general decompressor (for all supported format_versions)
+    auto mgr_to_use = GetBuiltinV2CompressionManager();
     *out_decompressor = mgr_to_use->GetDecompressor();
     return Status::OK();
   }
@@ -664,8 +663,7 @@ Status GetDecompressor(const std::string& compression_name,
                                 compression_name);
     } else if (saved_comp_type != kNoCompression) {
       // Use built-in compression manager
-      auto mgr_to_use = GetBuiltinCompressionManager(
-          GetCompressFormatForVersion(table_format_version));
+      auto mgr_to_use = GetBuiltinV2CompressionManager();
       *out_decompressor =
           mgr_to_use->GetDecompressorOptimizeFor(saved_comp_type);
     } else {
@@ -810,7 +808,8 @@ Status BlockBasedTable::Open(
     }
     return s;
   }
-  if (!IsSupportedFormatVersion(footer.format_version()) &&
+  if (!IsSupportedFormatVersionForRead(kBlockBasedTableMagicNumber,
+                                       footer.format_version()) &&
       !TEST_AllowUnsupportedFormatVersion()) {
     return Status::Corruption(
         "Unknown Footer version. Maybe this file was created with newer "
@@ -824,13 +823,6 @@ Status BlockBasedTable::Open(
   rep->file = std::move(file);
   rep->footer = footer;
 
-  // Some ancient versions (~2.5 - 2.7, format_version=1) could compress the
-  // metaindex block, so we need to allow for that
-  if (footer.format_version() < 2) {
-    auto mgr = GetBuiltinCompressionManager(/*compression_format_version=*/1);
-    rep->decompressor = mgr->GetDecompressor();
-  }
-
   // For fully portable/stable cache keys, we need to read the properties
   // block before setting up cache keys. TODO: consider setting up a bootstrap
   // cache key for PersistentCache to use for metaindex and properties blocks.
diff --git a/table/block_based/block_test.cc b/table/block_based/block_test.cc
index 5fcb0964da53..27a5bb7eb066 100644
--- a/table/block_based/block_test.cc
+++ b/table/block_based/block_test.cc
@@ -839,8 +839,7 @@ const BlockBasedTableOptions *kTableOptions() {
   return &opts;
 }
 Decompressor *kDecompressor() {
-  static auto mgr = GetBuiltinCompressionManager(
-      GetCompressFormatForVersion(kTableOptions()->format_version));
+  static auto mgr = GetBuiltinV2CompressionManager();
   static auto decomp = mgr->GetDecompressor();
   return decomp.get();
 }
diff --git a/table/block_based/partitioned_filter_block_test.cc b/table/block_based/partitioned_filter_block_test.cc
index 958b195c48d0..02869a879c61 100644
--- a/table/block_based/partitioned_filter_block_test.cc
+++ b/table/block_based/partitioned_filter_block_test.cc
@@ -350,7 +350,7 @@ INSTANTIATE_TEST_CASE_P(
     FormatVersions, PartitionedFilterBlockTest,
     testing::Combine(
         testing::ValuesIn(std::set<uint32_t>{
-            2, 3, 4, 5, test::kDefaultFormatVersion, kLatestFormatVersion}),
+            2, 3, 4, 5, test::kDefaultFormatVersion, kLatestBbtFormatVersion}),
         testing::ValuesIn(test::GetUDTTestModes()), testing::Bool()));
 
 TEST_P(PartitionedFilterBlockTest, EmptyBuilder) {
diff --git a/table/block_fetcher_test.cc b/table/block_fetcher_test.cc
index 0b1fa6c5a68e..e3d5dff735fd 100644
--- a/table/block_fetcher_test.cc
+++ b/table/block_fetcher_test.cc
@@ -319,8 +319,7 @@ class BlockFetcherTest : public testing::Test {
     PersistentCacheOptions persistent_cache_options;
     Footer footer;
     ReadFooter(file, &footer);
-    auto mgr = GetBuiltinCompressionManager(
-        GetCompressFormatForVersion(footer.format_version()));
+    auto mgr = GetBuiltinV2CompressionManager();
     std::unique_ptr<BlockFetcher> fetcher(new BlockFetcher(
         file, nullptr /* prefetch_buffer */, footer, roptions, block, contents,
         ioptions, do_uncompress, compressed, block_type,
diff --git a/table/format.cc b/table/format.cc
index 06a2135f5731..d0f80009d442 100644
--- a/table/format.cc
+++ b/table/format.cc
@@ -154,23 +154,18 @@ std::string IndexValue::ToString(bool hex, bool have_first_key) const {
 
 namespace {
 inline bool IsLegacyFooterFormat(uint64_t magic_number) {
-  return magic_number == kLegacyBlockBasedTableMagicNumber ||
-         magic_number == kLegacyPlainTableMagicNumber;
+  return magic_number == kLegacyPlainTableMagicNumber;
 }
+// Used when reading format_version=0 footers (plain tables)
 inline uint64_t UpconvertLegacyFooterFormat(uint64_t magic_number) {
-  if (magic_number == kLegacyBlockBasedTableMagicNumber) {
-    return kBlockBasedTableMagicNumber;
-  }
   if (magic_number == kLegacyPlainTableMagicNumber) {
     return kPlainTableMagicNumber;
   }
   assert(false);
   return magic_number;
 }
+// Used by plain tables to write format_version=0 footers
 inline uint64_t DownconvertToLegacyFooterFormat(uint64_t magic_number) {
-  if (magic_number == kBlockBasedTableMagicNumber) {
-    return kLegacyBlockBasedTableMagicNumber;
-  }
   if (magic_number == kPlainTableMagicNumber) {
     return kLegacyPlainTableMagicNumber;
   }
@@ -178,14 +173,18 @@ inline uint64_t DownconvertToLegacyFooterFormat(uint64_t magic_number) {
   return magic_number;
 }
 inline uint8_t BlockTrailerSizeForMagicNumber(uint64_t magic_number) {
-  if (magic_number == kBlockBasedTableMagicNumber ||
-      magic_number == kLegacyBlockBasedTableMagicNumber) {
+  if (magic_number == kBlockBasedTableMagicNumber) {
     return static_cast<uint8_t>(BlockBasedTable::kBlockTrailerSize);
   } else {
     return 0;
   }
 }
 
+// NOTE: format_version 0 is still used by plain tables and format_version 1 by
+// cuckoo table. For block-based tables, format_version < 2 is no longer
+// supported for reading or writing. Legacy magic numbers on block-based tables
+// are used only for good error reporting.
+//
 // Footer format, in three parts:
 // * Part1
 //   -> format_version == 0 (inferred from legacy magic number)
@@ -229,7 +228,7 @@ Status FooterBuilder::Build(uint64_t magic_number, uint32_t format_version,
                             const BlockHandle& index_handle,
                             uint32_t base_context_checksum) {
   assert(magic_number != Footer::kNullTableMagicNumber);
-  assert(IsSupportedFormatVersion(format_version) ||
+  assert(IsSupportedFormatVersionForWrite(magic_number, format_version) ||
          TEST_AllowUnsupportedFormatVersion());
 
   char* part2;
@@ -251,6 +250,7 @@ Status FooterBuilder::Build(uint64_t magic_number, uint32_t format_version,
     EncodeFixed64(cur, magic_number);
     assert(cur + 8 == slice_.data() + slice_.size());
   } else {
+    // format_version == 0 is used by plain tables
     slice_ = Slice(data_.data(), Footer::kVersion0EncodedLength);
     // Legacy SST files use kCRC32c checksum but it's not stored in footer.
     assert(checksum_type == kNoChecksum || checksum_type == kCRC32c);
@@ -337,9 +337,18 @@ Status Footer::DecodeFrom(Slice input, uint64_t input_offset,
   const char* magic_ptr = input.data() + input.size() - kMagicNumberLengthByte;
   uint64_t magic = DecodeFixed64(magic_ptr);
 
-  // We check for legacy formats here and silently upconvert them
+  // Legacy block-based tables (format_version < 2) are no longer supported.
+  // (This constant is only used here and in the corresponding test.)
+  if (magic == 0xdb4775248b80fb57ull) {
+    return Status::NotSupported(
+        "Unsupported legacy magic number for block-based SST format. Load with "
+        "RocksDB >= 4.6.0 and < 11.0.0 and run full compaction to upgrade.");
+  }
+
+  // Check for legacy formats
   bool legacy = IsLegacyFooterFormat(magic);
   if (legacy) {
+    // Legacy plain tables are still supported - upconvert magic
     magic = UpconvertLegacyFooterFormat(magic);
   }
   if (enforce_table_magic_number != 0 && enforce_table_magic_number != magic) {
@@ -355,6 +364,7 @@ Status Footer::DecodeFrom(Slice input, uint64_t input_offset,
   uint32_t computed_checksum = 0;
   uint64_t footer_offset = 0;
   if (legacy) {
+    // Legacy format (format_version=0, used by plain tables)
     // The size is already asserted to be at least kMinEncodedLength
     // at the beginning of the function
     input.remove_prefix(input.size() - kVersion0EncodedLength);
@@ -363,10 +373,11 @@ Status Footer::DecodeFrom(Slice input, uint64_t input_offset,
   } else {
     part3_ptr = magic_ptr - 4;
     format_version_ = DecodeFixed32(part3_ptr);
-    if (UNLIKELY(!IsSupportedFormatVersion(format_version_) &&
+    if (UNLIKELY(!IsSupportedFormatVersionForRead(magic, format_version_) &&
                  !TEST_AllowUnsupportedFormatVersion())) {
-      return Status::Corruption("Corrupt or unsupported format_version: " +
-                                std::to_string(format_version_));
+      return Status::Corruption("Corrupt or unsupported format_version " +
+                                std::to_string(format_version_) +
+                                " for magic " + std::to_string(magic));
     }
     // All known format versions >= 1 occupy exactly this many bytes.
     if (UNLIKELY(input.size() < kNewVersionsEncodedLength)) {
diff --git a/table/format.h b/table/format.h
index 38a5977abfd6..be7c0fa8abff 100644
--- a/table/format.h
+++ b/table/format.h
@@ -34,7 +34,6 @@ bool ShouldReportDetailedTime(Env* env, Statistics* stats);
 // the length of the magic number in bytes.
 constexpr uint32_t kMagicNumberLengthByte = 8;
 
-extern const uint64_t kLegacyBlockBasedTableMagicNumber;
 extern const uint64_t kBlockBasedTableMagicNumber;
 
 extern const uint64_t kLegacyPlainTableMagicNumber;
@@ -163,22 +162,49 @@ inline uint32_t ChecksumModifierForContext(uint32_t base_context_checksum,
   return modifier & all_or_nothing;
 }
 
-inline uint32_t GetCompressFormatForVersion(uint32_t format_version) {
-  // As of format_version 2, we encode compressed block with
-  // compress_format_version == 2. Before that, the version is 1.
-  // DO NOT CHANGE THIS FUNCTION, it affects disk format
-  // As of format_version 7 and opening up to custom compression, the
-  // compression format version is essentially independent of the block-based
-  // table format version, and encoded in the compression_name table property.
-  // Thus, this function can go away once we remove support for reading
-  // format_version=1.
-  return format_version >= 2 ? 2 : 1;
-}
+constexpr uint32_t kLatestBbtFormatVersion = 7;
 
-constexpr uint32_t kLatestFormatVersion = 7;
+// Minimum format version supported for reading SST files in block-based format.
+//
+// When phasing out old format versions, first increase the write minimum,
+// then later (>= 6 mo) increase the read minimum when removing the
+// implementation for both read and write.
+constexpr uint32_t kMinSupportedBbtFormatVersionForRead = 2;
 
-inline bool IsSupportedFormatVersion(uint32_t version) {
-  return version <= kLatestFormatVersion;
+// Minimum format version supported for writing new SST files in block-based
+// format. This should be >= kMinSupportedFormatVersionForRead.
+//
+// When phasing out old format versions, first increase the write minimum,
+// then later (>= 6 mo) increase the read minimum when removing the
+// implementation for both read and write.
+constexpr uint32_t kMinSupportedBbtFormatVersionForWrite = 2;
+static_assert(kMinSupportedBbtFormatVersionForWrite >=
+              kMinSupportedBbtFormatVersionForRead);
+
+inline bool IsSupportedFormatVersionForRead(uint64_t magic, uint32_t version) {
+  if (magic == kBlockBasedTableMagicNumber) {
+    return version >= kMinSupportedBbtFormatVersionForRead &&
+           version <= kLatestBbtFormatVersion;
+  } else if (magic == kPlainTableMagicNumber) {
+    return version == 0;
+  } else if (magic == kCuckooTableMagicNumber) {
+    return version == 1;
+  } else {
+    return false;
+  }
+}
+
+inline bool IsSupportedFormatVersionForWrite(uint64_t magic, uint32_t version) {
+  if (magic == kBlockBasedTableMagicNumber) {
+    return version >= kMinSupportedBbtFormatVersionForWrite &&
+           version <= kLatestBbtFormatVersion;
+  } else if (magic == kPlainTableMagicNumber) {
+    return version == 0;
+  } else if (magic == kCuckooTableMagicNumber) {
+    return version == 1;
+  } else {
+    return false;
+  }
 }
 
 // Same as having a unique id in footer.
diff --git a/table/sst_file_dumper.cc b/table/sst_file_dumper.cc
index 712f8fb0ccc5..6ccfe636e688 100644
--- a/table/sst_file_dumper.cc
+++ b/table/sst_file_dumper.cc
@@ -458,8 +458,7 @@ Status SstFileDumper::ReadTableProperties(uint64_t table_magic_number,
 Status SstFileDumper::SetTableOptionsByMagicNumber(
     uint64_t table_magic_number) {
   assert(table_properties_);
-  if (table_magic_number == kBlockBasedTableMagicNumber ||
-      table_magic_number == kLegacyBlockBasedTableMagicNumber) {
+  if (table_magic_number == kBlockBasedTableMagicNumber) {
     // Preserve BlockBasedTableOptions on options_ when possible
     if (!options_.table_factory->IsInstanceOf(
             TableFactory::kBlockBasedTableName())) {
diff --git a/table/table_test.cc b/table/table_test.cc
index ae4fbea0e85c..768ed9a0f8fd 100644
--- a/table/table_test.cc
+++ b/table/table_test.cc
@@ -76,6 +76,7 @@
 #include "test_util/testutil.h"
 #include "util/coding.h"
 #include "util/compression.h"
+#include "util/defer.h"
 #include "util/file_checksum_helper.h"
 #include "util/random.h"
 #include "util/string_util.h"
@@ -674,35 +675,6 @@ static std::vector<TestArgs> GenerateArgList() {
   std::vector<int> restart_intervals = {16, 1, 1024};
   std::vector<uint32_t> compression_parallel_threads = {1, 4};
 
-  // Only add compression if it is supported
-  std::vector<std::pair<CompressionType, bool>> compression_types;
-  compression_types.emplace_back(kNoCompression, false);
-  if (Snappy_Supported()) {
-    compression_types.emplace_back(kSnappyCompression, false);
-  }
-  if (Zlib_Supported()) {
-    compression_types.emplace_back(kZlibCompression, false);
-    compression_types.emplace_back(kZlibCompression, true);
-  }
-  if (BZip2_Supported()) {
-    compression_types.emplace_back(kBZip2Compression, false);
-    compression_types.emplace_back(kBZip2Compression, true);
-  }
-  if (LZ4_Supported()) {
-    compression_types.emplace_back(kLZ4Compression, false);
-    compression_types.emplace_back(kLZ4Compression, true);
-    compression_types.emplace_back(kLZ4HCCompression, false);
-    compression_types.emplace_back(kLZ4HCCompression, true);
-  }
-  if (XPRESS_Supported()) {
-    compression_types.emplace_back(kXpressCompression, false);
-    compression_types.emplace_back(kXpressCompression, true);
-  }
-  if (ZSTD_Supported()) {
-    compression_types.emplace_back(kZSTD, false);
-    compression_types.emplace_back(kZSTD, true);
-  }
-
   for (auto test_type : test_types) {
     for (auto reverse_compare : reverse_compare_types) {
       if (test_type == PLAIN_TABLE_SEMI_FIXED_PREFIX ||
@@ -713,9 +685,9 @@ static std::vector<TestArgs> GenerateArgList() {
         one_arg.type = test_type;
         one_arg.reverse_compare = reverse_compare;
         one_arg.restart_interval = restart_intervals[0];
-        one_arg.compression = compression_types[0].first;
+        one_arg.compression = kNoCompression;
         one_arg.compression_parallel_threads = 1;
-        one_arg.format_version = 0;
+        one_arg.format_version = 0;  // Plain tables use their own versioning
         one_arg.use_mmap = true;
         test_args.push_back(one_arg);
         one_arg.use_mmap = false;
@@ -724,17 +696,20 @@ static std::vector<TestArgs> GenerateArgList() {
       }
 
       for (auto restart_interval : restart_intervals) {
-        for (auto compression_type : compression_types) {
+        for (auto compression_type : GetSupportedCompressions()) {
           for (auto num_threads : compression_parallel_threads) {
-            TestArgs one_arg;
-            one_arg.type = test_type;
-            one_arg.reverse_compare = reverse_compare;
-            one_arg.restart_interval = restart_interval;
-            one_arg.compression = compression_type.first;
-            one_arg.compression_parallel_threads = num_threads;
-            one_arg.format_version = compression_type.second ? 2 : 1;
-            one_arg.use_mmap = false;
-            test_args.push_back(one_arg);
+            // format_version = 7 changes some compression handling
+            for (uint32_t fv : {kMinSupportedBbtFormatVersionForRead, 7U}) {
+              TestArgs one_arg;
+              one_arg.type = test_type;
+              one_arg.reverse_compare = reverse_compare;
+              one_arg.restart_interval = restart_interval;
+              one_arg.compression = compression_type;
+              one_arg.compression_parallel_threads = num_threads;
+              one_arg.format_version = fv;
+              one_arg.use_mmap = false;
+              test_args.push_back(one_arg);
+            }
           }
         }
       }
@@ -5002,30 +4977,11 @@ TEST(TableTest, FooterTests) {
   BlockHandle meta_index(data_size + index_size + 2 * 5, metaindex_size);
   uint64_t footer_offset = data_size + metaindex_size + index_size + 3 * 5;
   uint32_t base_context_checksum = 123456789;
-  {
-    // legacy block based
-    FooterBuilder footer;
-    ASSERT_OK(footer.Build(kBlockBasedTableMagicNumber, /* format_version */ 0,
-                           footer_offset, kCRC32c, meta_index, index));
-    Footer decoded_footer;
-    ASSERT_OK(decoded_footer.DecodeFrom(footer.GetSlice(), footer_offset));
-    ASSERT_EQ(decoded_footer.table_magic_number(), kBlockBasedTableMagicNumber);
-    ASSERT_EQ(decoded_footer.checksum_type(), kCRC32c);
-    ASSERT_EQ(decoded_footer.metaindex_handle().offset(), meta_index.offset());
-    ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size());
-    ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset());
-    ASSERT_EQ(decoded_footer.index_handle().size(), index.size());
-    ASSERT_EQ(decoded_footer.format_version(), 0U);
-    ASSERT_EQ(decoded_footer.base_context_checksum(), 0U);
-    ASSERT_EQ(decoded_footer.GetBlockTrailerSize(), 5U);
-    // Ensure serialized with legacy magic
-    ASSERT_EQ(
-        DecodeFixed64(footer.GetSlice().data() + footer.GetSlice().size() - 8),
-        kLegacyBlockBasedTableMagicNumber);
-  }
-  // block based, various checksums, various versions
+  // block based, various checksums, various versions (format_version >= 2)
   for (auto t : GetSupportedChecksums()) {
-    for (uint32_t fv = 1; IsSupportedFormatVersion(fv); ++fv) {
+    for (uint32_t fv = kMinSupportedBbtFormatVersionForWrite;
+         IsSupportedFormatVersionForWrite(kBlockBasedTableMagicNumber, fv);
+         ++fv) {
       uint32_t maybe_bcc =
           FormatVersionUsesContextChecksum(fv) ? base_context_checksum : 0U;
       FooterBuilder footer;
@@ -5072,41 +5028,154 @@ TEST(TableTest, FooterTests) {
     }
   }
 
+  // plain table, various checksums, various versions (format_version >= 2)
+  // Plain tables have no block trailer (size 0), so set up separate handles
+  // Note: format_version >= 6 has complex footer checksum requirements,
+  // so we only test format_version 2-5 for plain tables here
   {
-    // legacy plain table
-    FooterBuilder footer;
-    ASSERT_OK(footer.Build(kPlainTableMagicNumber, /* format_version */ 0,
-                           footer_offset, kNoChecksum, meta_index));
+    uint64_t plain_metaindex_size = r->Uniform(1000000);
+    // For plain tables: metaindex is at offset 0, footer immediately follows
+    BlockHandle plain_meta_index(0, plain_metaindex_size);
+    uint64_t plain_footer_offset = plain_metaindex_size;
+    for (auto t : GetSupportedChecksums()) {
+      for (uint32_t fv = kMinSupportedBbtFormatVersionForWrite; fv < 6; ++fv) {
+        FooterBuilder footer;
+        ASSERT_OK(footer.Build(kPlainTableMagicNumber, fv, plain_footer_offset,
+                               t, plain_meta_index));
+        Footer decoded_footer;
+        ASSERT_OK(
+            decoded_footer.DecodeFrom(footer.GetSlice(), plain_footer_offset));
+        ASSERT_EQ(decoded_footer.table_magic_number(), kPlainTableMagicNumber);
+        ASSERT_EQ(decoded_footer.checksum_type(), t);
+        ASSERT_EQ(decoded_footer.metaindex_handle().offset(),
+                  plain_meta_index.offset());
+        ASSERT_EQ(decoded_footer.metaindex_handle().size(),
+                  plain_meta_index.size());
+        ASSERT_EQ(decoded_footer.format_version(), fv);
+        ASSERT_EQ(decoded_footer.GetBlockTrailerSize(), 0U);
+      }
+    }
+  }
+}
+
+// Test that legacy SST formats (format_version < 2) are properly rejected
+TEST(TableTest, LegacyFormatRejectionTests) {
+  // Temporarily disable unsupported format version allowance for this test
+  bool& allow = TEST_AllowUnsupportedFormatVersion();
+  SaveAndRestore<bool> saved_allow(&allow, false);
+
+  // Test legacy block-based magic number from LevelDB should be rejected
+  {
+    // Construct a fake footer with legacy block-based magic number
+    std::array<char, Footer::kVersion0EncodedLength> fake_footer;
+    std::fill(fake_footer.begin(), fake_footer.end(), 0);
+    // Put legacy magic number at the end
+    EncodeFixed64(fake_footer.data() + fake_footer.size() - 8,
+                  0xdb4775248b80fb57ull /*legacy magic number*/);
+
     Footer decoded_footer;
-    ASSERT_OK(decoded_footer.DecodeFrom(footer.GetSlice(), footer_offset));
-    ASSERT_EQ(decoded_footer.table_magic_number(), kPlainTableMagicNumber);
-    ASSERT_EQ(decoded_footer.checksum_type(), kCRC32c);
-    ASSERT_EQ(decoded_footer.metaindex_handle().offset(), meta_index.offset());
-    ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size());
-    ASSERT_EQ(decoded_footer.index_handle().offset(), 0U);
-    ASSERT_EQ(decoded_footer.index_handle().size(), 0U);
-    ASSERT_EQ(decoded_footer.format_version(), 0U);
-    ASSERT_EQ(decoded_footer.GetBlockTrailerSize(), 0U);
-    // Ensure serialized with legacy magic
-    ASSERT_EQ(
-        DecodeFixed64(footer.GetSlice().data() + footer.GetSlice().size() - 8),
-        kLegacyPlainTableMagicNumber);
+    Status s = decoded_footer.DecodeFrom(
+        Slice(fake_footer.data(), fake_footer.size()), 0);
+    ASSERT_TRUE(s.IsNotSupported()) << s.ToString();
+    ASSERT_TRUE(s.ToString().find("nsupported legacy magic number") !=
+                std::string::npos)
+        << s.ToString();
+    ASSERT_TRUE(s.ToString().find("full compaction") != std::string::npos)
+        << s.ToString();
+  }
+
+  // Test format_version=1 with new magic number should be rejected
+  {
+    std::array<char, Footer::kNewVersionsEncodedLength> fake_footer;
+    std::fill(fake_footer.begin(), fake_footer.end(), 0);
+    // Part 1: checksum type
+    fake_footer[0] = kCRC32c;
+    // Part 3: format_version=1 and new magic number
+    char* part3 = fake_footer.data() + fake_footer.size() - 12;
+    EncodeFixed32(part3, 1);  // format_version = 1
+    EncodeFixed64(part3 + 4, kBlockBasedTableMagicNumber);
+
+    Footer decoded_footer;
+    Status s = decoded_footer.DecodeFrom(
+        Slice(fake_footer.data(), fake_footer.size()), 0);
+    // format_version=1 is not supported for read, should return Corruption
+    ASSERT_TRUE(s.IsCorruption()) << s.ToString();
+    ASSERT_TRUE(s.ToString().find("format_version") != std::string::npos)
+        << s.ToString();
   }
+
+  // Test format_version=0 with new magic number should be rejected
   {
-    // xxhash plain table (not currently used)
-    FooterBuilder footer;
-    ASSERT_OK(footer.Build(kPlainTableMagicNumber, /* format_version */ 1,
-                           footer_offset, kxxHash, meta_index));
+    std::array<char, Footer::kNewVersionsEncodedLength> fake_footer;
+    std::fill(fake_footer.begin(), fake_footer.end(), 0);
+    // Part 1: checksum type
+    fake_footer[0] = kCRC32c;
+    // Part 3: format_version=0 and new magic number
+    char* part3 = fake_footer.data() + fake_footer.size() - 12;
+    EncodeFixed32(part3, 0);  // format_version = 0
+    EncodeFixed64(part3 + 4, kBlockBasedTableMagicNumber);
+
     Footer decoded_footer;
-    ASSERT_OK(decoded_footer.DecodeFrom(footer.GetSlice(), footer_offset));
-    ASSERT_EQ(decoded_footer.table_magic_number(), kPlainTableMagicNumber);
-    ASSERT_EQ(decoded_footer.checksum_type(), kxxHash);
-    ASSERT_EQ(decoded_footer.metaindex_handle().offset(), meta_index.offset());
-    ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size());
-    ASSERT_EQ(decoded_footer.index_handle().offset(), 0U);
-    ASSERT_EQ(decoded_footer.index_handle().size(), 0U);
-    ASSERT_EQ(decoded_footer.format_version(), 1U);
-    ASSERT_EQ(decoded_footer.GetBlockTrailerSize(), 0U);
+    Status s = decoded_footer.DecodeFrom(
+        Slice(fake_footer.data(), fake_footer.size()), 0);
+    // format_version=0 is not supported for read, should return Corruption
+    ASSERT_TRUE(s.IsCorruption()) << s.ToString();
+    ASSERT_TRUE(s.ToString().find("format_version") != std::string::npos)
+        << s.ToString();
+  }
+}
+
+// Test that configuring unsupported format_version for writing is sanitized
+// or rejected as appropriate
+TEST(TableTest, UnsupportedFormatVersionConfigTest) {
+  // Temporarily disable unsupported format version allowance for this test
+  bool& allow = TEST_AllowUnsupportedFormatVersion();
+  SaveAndRestore<bool> saved_allow(&allow, false);
+
+  // Test that format_version < kMinSupportedBbtFormatVersionForWrite is
+  // sanitized to kMinSupportedBbtFormatVersionForWrite during initialization
+  for (uint32_t fv = 0; fv < kMinSupportedBbtFormatVersionForWrite; ++fv) {
+    BlockBasedTableOptions table_options;
+    table_options.format_version = fv;
+    BlockBasedTableFactory factory(table_options);
+
+    // After construction, format_version should be sanitized
+    auto* opts = factory.GetOptions<BlockBasedTableOptions>();
+    ASSERT_EQ(opts->format_version, kMinSupportedBbtFormatVersionForWrite)
+        << "format_version=" << fv << " should be sanitized to "
+        << kMinSupportedBbtFormatVersionForWrite;
+  }
+
+  // Test that supported format versions are not changed
+  for (uint32_t fv = kMinSupportedBbtFormatVersionForWrite;
+       IsSupportedFormatVersionForWrite(kBlockBasedTableMagicNumber, fv);
+       ++fv) {
+    BlockBasedTableOptions table_options;
+    table_options.format_version = fv;
+    BlockBasedTableFactory factory(table_options);
+
+    auto* opts = factory.GetOptions<BlockBasedTableOptions>();
+    ASSERT_EQ(opts->format_version, fv)
+        << "format_version=" << fv << " should not be changed";
+
+    ColumnFamilyOptions cf_opts;
+    DBOptions db_opts;
+    Status s = factory.ValidateOptions(db_opts, cf_opts);
+    ASSERT_OK(s) << "format_version=" << fv << ": " << s.ToString();
+  }
+
+  // Test that format_version > kLatestBbtFormatVersion is rejected by
+  // ValidateOptions (not sanitized, since it could be a future version that
+  // requires newer code)
+  {
+    BlockBasedTableOptions table_options;
+    table_options.format_version = kLatestBbtFormatVersion + 1;
+    BlockBasedTableFactory factory(table_options);
+
+    ColumnFamilyOptions cf_opts;
+    DBOptions db_opts;
+    Status s = factory.ValidateOptions(db_opts, cf_opts);
+    ASSERT_TRUE(s.IsInvalidArgument()) << s.ToString();
   }
 }
 
@@ -5707,8 +5776,7 @@ TEST_P(BlockBasedTableTest, PropertiesBlockRestartPointTest) {
       read_options_for_helper.verify_checksums = false;
       PersistentCacheOptions cache_options;
 
-      auto mgr = GetBuiltinCompressionManager(
-          GetCompressFormatForVersion(footer.format_version()));
+      auto mgr = GetBuiltinV2CompressionManager();
       BlockFetcher block_fetcher(file, nullptr /* prefetch_buffer */, footer,
                                  read_options_for_helper, handle, contents,
                                  ioptions, false /* decompress */,
@@ -5846,8 +5914,7 @@ TEST_P(BlockBasedTableTest, PropertiesMetaBlockLast) {
   auto metaindex_handle = footer.metaindex_handle();
   BlockContents metaindex_contents;
   PersistentCacheOptions pcache_opts;
-  auto mgr = GetBuiltinCompressionManager(
-      GetCompressFormatForVersion(footer.format_version()));
+  auto mgr = GetBuiltinV2CompressionManager();
   BlockFetcher block_fetcher(
       table_reader.get(), nullptr /* prefetch_buffer */, footer, ReadOptions(),
       metaindex_handle, &metaindex_contents, ioptions, false /* decompress */,
@@ -5929,8 +5996,7 @@ TEST_P(BlockBasedTableTest, SeekMetaBlocks) {
   auto metaindex_handle = footer.metaindex_handle();
   BlockContents metaindex_contents;
   PersistentCacheOptions pcache_opts;
-  auto mgr = GetBuiltinCompressionManager(
-      GetCompressFormatForVersion(footer.format_version()));
+  auto mgr = GetBuiltinV2CompressionManager();
   BlockFetcher block_fetcher(
       table_reader.get(), nullptr /* prefetch_buffer */, footer, ReadOptions(),
       metaindex_handle, &metaindex_contents, ioptions, false /* decompress */,
diff --git a/test_util/testutil.cc b/test_util/testutil.cc
index d5a786f0d735..acb8ec6aa1e7 100644
--- a/test_util/testutil.cc
+++ b/test_util/testutil.cc
@@ -44,7 +44,7 @@ const std::set<uint32_t> kFooterFormatVersionsToTest{
     6U,
     // In case any interesting future changes
     kDefaultFormatVersion,
-    kLatestFormatVersion,
+    kLatestBbtFormatVersion,
 };
 const ReadOptionsNoIo kReadOptionsNoIo;
 
diff --git a/tools/check_format_compatible.sh b/tools/check_format_compatible.sh
index 075a512337c1..67639a0ca5e7 100755
--- a/tools/check_format_compatible.sh
+++ b/tools/check_format_compatible.sh
@@ -159,8 +159,9 @@ declare -a bak_forward_refs=("${db_forward_no_options_refs[@]}" "${db_forward_wi
 
 # Branches (git refs) to check for DB backward compatibility (new version
 # reading data from old) (in addition to the "forward compatible" list)
-# NOTE: 2.7.fb.branch shows assertion violation in some configurations
-declare -a db_backward_only_refs=("2.2.fb.branch" "2.3.fb.branch" "2.4.fb.branch" "2.5.fb.branch" "2.6.fb.branch" "2.8.1.fb" "3.0.fb.branch" "3.1.fb" "3.2.fb" "3.3.fb" "3.4.fb" "3.5.fb" "3.6.fb" "3.7.fb" "3.8.fb" "3.9.fb" "4.2.fb" "4.3.fb" "4.4.fb" "4.5.fb" "4.6.fb" "4.7.fb" "4.8.fb" "4.9.fb" "4.10.fb" "${bak_backward_only_refs[@]}")
+# NOTE: format_version < 2 support was removed, so we only test back to 4.6.fb
+# (when format_version=2 became the default)
+declare -a db_backward_only_refs=("4.6.fb" "4.7.fb" "4.8.fb" "4.9.fb" "4.10.fb" "${bak_backward_only_refs[@]}")
 
 if [ "$SHORT_TEST" ]; then
   # Use only the first (if exists) of each list
diff --git a/tools/sst_dump_tool.cc b/tools/sst_dump_tool.cc
index c288397d34b4..c155fa01b1e0 100644
--- a/tools/sst_dump_tool.cc
+++ b/tools/sst_dump_tool.cc
@@ -523,7 +523,7 @@ int SSTDumpTool::Run(int argc, char const* const* argv, Options options) {
       bbto.block_size = block_size;
       bbto.enable_index_compression = enable_index_compression;
       // Maximize compression features available
-      bbto.format_version = kLatestFormatVersion;
+      bbto.format_version = kLatestBbtFormatVersion;
       options.table_factory = std::make_shared<BlockBasedTableFactory>(bbto);
     }
     options.compression_opts.max_dict_bytes = compression_max_dict_bytes;
diff --git a/unreleased_history/public_api_changes/remove_fv_1.md b/unreleased_history/public_api_changes/remove_fv_1.md
new file mode 100644
index 000000000000..dbeb3d870b69
--- /dev/null
+++ b/unreleased_history/public_api_changes/remove_fv_1.md
@@ -0,0 +1 @@
+* Drop support for reading (and writing) SST files using `BlockBasedTableOptions.format_version` < 2, which hasn't been the default format for about 10 years. An upgrade path is still possible with full compaction using a RocksDB version >= 4.6.0 and < 11.0.0 and then using the newer version.
diff --git a/util/compression.cc b/util/compression.cc
index 612854b5ac19..19e1e6584d65 100644
--- a/util/compression.cc
+++ b/util/compression.cc
@@ -162,60 +162,6 @@ class CompressorBase : public Compressor {
   CompressionOptions opts_;
 };
 
-class BuiltinCompressorV1 final : public CompressorBase {
- public:
-  const char* Name() const override { return "BuiltinCompressorV1"; }
-
-  explicit BuiltinCompressorV1(const CompressionOptions& opts,
-                               CompressionType type)
-      : CompressorBase(opts), type_(type) {
-    assert(type != kNoCompression);
-  }
-
-  CompressionType GetPreferredCompressionType() const override { return type_; }
-
-  std::unique_ptr<Compressor> Clone() const override {
-    return std::make_unique<BuiltinCompressorV1>(opts_, type_);
-  }
-
-  Status CompressBlock(Slice uncompressed_data, char* compressed_output,
-                       size_t* compressed_output_size,
-                       CompressionType* out_compression_type,
-                       ManagedWorkingArea* wa) override {
-    std::optional<CompressionContext> tmp_ctx;
-    CompressionContext* ctx = nullptr;
-    if (wa != nullptr && wa->owner() == this) {
-      ctx = static_cast<CompressionContext*>(wa->get());
-    }
-    if (ctx == nullptr) {
-      tmp_ctx.emplace(type_, opts_);
-      ctx = &*tmp_ctx;
-    }
-    CompressionInfo info(opts_, *ctx, CompressionDict::GetEmptyDict(), type_);
-    std::string str_output;
-    str_output.reserve(uncompressed_data.size());
-    if (!OLD_CompressData(uncompressed_data, info,
-                          1 /*compress_format_version*/, &str_output)) {
-      // Maybe rejected or bypassed
-      *compressed_output_size = str_output.size();
-      *out_compression_type = kNoCompression;
-      return Status::OK();
-    }
-    if (str_output.size() > *compressed_output_size) {
-      // Compression rejected
-      *out_compression_type = kNoCompression;
-      return Status::OK();
-    }
-    std::memcpy(compressed_output, str_output.data(), str_output.size());
-    *compressed_output_size = str_output.size();
-    *out_compression_type = type_;
-    return Status::OK();
-  }
-
- protected:
-  const CompressionType type_;
-};
-
 class CompressorWithSimpleDictBase : public CompressorBase {
  public:
   explicit CompressorWithSimpleDictBase(const CompressionOptions& opts,
@@ -1038,96 +984,6 @@ class BuiltinZSTDCompressorV2 final : public CompressorBase {
   const CompressionDict dict_;
 };
 
-// NOTE: this implementation is intentionally SIMPLE based on existing code
-// and NOT EFFICIENT because this is an old/deprecated format.
-class BuiltinDecompressorV1 final : public Decompressor {
- public:
-  const char* Name() const override { return "BuiltinDecompressorV1"; }
-
-  Status ExtractUncompressedSize(Args& args) override {
-    CacheAllocationPtr throw_away_output;
-    return DoUncompress(args, &throw_away_output, &args.uncompressed_size);
-  }
-
-  Status DecompressBlock(const Args& args, char* uncompressed_output) override {
-    uint64_t same_uncompressed_size = 0;
-    CacheAllocationPtr output;
-    Status s = DoUncompress(args, &output, &same_uncompressed_size);
-    if (same_uncompressed_size != args.uncompressed_size) {
-      s = Status::Corruption("Compressed block size mismatch");
-    }
-    if (s.ok()) {
-      // NOTE: simple but inefficient
-      memcpy(uncompressed_output, output.get(), args.uncompressed_size);
-    }
-    return s;
-  }
-
- protected:
-  Status DoUncompress(const Args& args, CacheAllocationPtr* out_data,
-                      uint64_t* out_uncompressed_size) {
-    assert(args.working_area == nullptr);
-    assert(*out_uncompressed_size == 0);
-
-    // NOTE: simple but inefficient
-    UncompressionContext dummy_ctx{args.compression_type};
-    UncompressionInfo info{dummy_ctx, UncompressionDict::GetEmptyDict(),
-                           args.compression_type};
-    const char* error_message = nullptr;
-    size_t size_t_uncompressed_size = 0;
-    *out_data = OLD_UncompressData(
-        info, args.compressed_data.data(), args.compressed_data.size(),
-        &size_t_uncompressed_size, 1 /*compress_format_version*/,
-        nullptr /*allocator*/, &error_message);
-    if (*out_data == nullptr) {
-      if (error_message != nullptr) {
-        return Status::Corruption(error_message);
-      } else {
-        return Status::Corruption("Corrupted compressed block contents");
-      }
-    }
-    *out_uncompressed_size = size_t_uncompressed_size;
-    assert(*out_uncompressed_size > 0);
-    return Status::OK();
-  }
-};
-
-class BuiltinCompressionManagerV1 final : public CompressionManager {
- public:
-  BuiltinCompressionManagerV1() = default;
-  ~BuiltinCompressionManagerV1() override = default;
-
-  const char* Name() const override { return "BuiltinCompressionManagerV1"; }
-
-  const char* CompatibilityName() const override { return "BuiltinV1"; }
-
-  std::unique_ptr<Compressor> GetCompressor(const CompressionOptions& opts,
-                                            CompressionType type) override {
-    // At the time of deprecating the writing of new format_version=1 files,
-    // ZSTD was the last supported built-in compression type.
-    if (type > kZSTD) {
-      // Unrecognized; fall back on default compression
-      type = ColumnFamilyOptions{}.compression;
-    }
-    if (type == kNoCompression) {
-      return nullptr;
-    } else {
-      return std::make_unique<BuiltinCompressorV1>(opts, type);
-    }
-  }
-
-  std::shared_ptr<Decompressor> GetDecompressor() override {
-    return std::shared_ptr<Decompressor>(shared_from_this(), &decompressor_);
-  }
-
-  bool SupportsCompressionType(CompressionType type) const override {
-    return CompressionTypeSupported(type);
-  }
-
- protected:
-  BuiltinDecompressorV1 decompressor_;
-};
-
 // Subroutines for BuiltinDecompressorV2
 
 Status Snappy_DecompressBlock(const Decompressor::Args& args,
@@ -1697,9 +1553,6 @@ class BuiltinCompressionManagerV2 final : public CompressionManager {
   }
 };
 
-const std::shared_ptr<BuiltinCompressionManagerV1>
-    kBuiltinCompressionManagerV1 =
-        std::make_shared<BuiltinCompressionManagerV1>();
 const std::shared_ptr<BuiltinCompressionManagerV2>
     kBuiltinCompressionManagerV2 =
         std::make_shared<BuiltinCompressionManagerV2>();
@@ -1728,14 +1581,6 @@ Status CompressionManager::CreateFromString(
   std::call_once(loaded, [&]() {
     auto& library = *ObjectLibrary::Default();
     // TODO: try to enhance ObjectLibrary to support singletons
-    library.AddFactory<CompressionManager>(
-        kBuiltinCompressionManagerV1->CompatibilityName(),
-        [](const std::string& /*uri*/,
-           std::unique_ptr<CompressionManager>* guard,
-           std::string* /*errmsg*/) {
-          *guard = std::make_unique<BuiltinCompressionManagerV1>();
-          return guard->get();
-        });
     library.AddFactory<CompressionManager>(
         kBuiltinCompressionManagerV2->CompatibilityName(),
         [](const std::string& /*uri*/,
@@ -1782,26 +1627,10 @@ CompressionManager::FindCompatibleCompressionManager(Slice compatibility_name) {
   }
 }
 
-const std::shared_ptr<CompressionManager>& GetBuiltinCompressionManager(
-    int compression_format_version) {
-  static const std::shared_ptr<CompressionManager> v1_as_base =
-      kBuiltinCompressionManagerV1;
+const std::shared_ptr<CompressionManager>& GetBuiltinV2CompressionManager() {
   static const std::shared_ptr<CompressionManager> v2_as_base =
       kBuiltinCompressionManagerV2;
-  static const std::shared_ptr<CompressionManager> none;
-  if (compression_format_version == 1) {
-    return v1_as_base;
-  } else if (compression_format_version == 2) {
-    return v2_as_base;
-  } else {
-    // Unrecognized. In some cases this is unexpected and the caller can
-    // rightfully crash.
-    return none;
-  }
-}
-
-const std::shared_ptr<CompressionManager>& GetBuiltinV2CompressionManager() {
-  return GetBuiltinCompressionManager(2);
+  return v2_as_base;
 }
 
 // ***********************************************************************
diff --git a/util/compression.h b/util/compression.h
index ff261d3ad513..6d6613a618d7 100644
--- a/util/compression.h
+++ b/util/compression.h
@@ -1818,19 +1818,6 @@ inline CacheAllocationPtr OLD_UncompressData(
   }
 }
 
-// ***********************************************************************
-// BEGIN built-in implementation of customization interface
-// ***********************************************************************
-
-// NOTE: to avoid compression API depending on block-based table API, uses
-// its own format version. See internal function GetCompressFormatForVersion()
-const std::shared_ptr<CompressionManager>& GetBuiltinCompressionManager(
-    int compression_format_version);
-
-// ***********************************************************************
-// END built-in implementation of customization interface
-// ***********************************************************************
-
 // The new compression APIs intentionally make it difficult to generate
 // compressed data larger than the original. (It is better to store the
 // uncompressed version in that case.) For legacy cases that must store

From c3184220b8eb7894ab8d6e5644429efc9160aaee Mon Sep 17 00:00:00 2001
From: Hui Xiao <huixiao@fb.com>
Date: Thu, 12 Feb 2026 21:29:10 -0800
Subject: [PATCH 469/500] Fix an internal comment about resumable compaction
 (#14215)

Summary:
**Context/Summary:** as titled

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14215

Test Plan: no code change

Reviewed By: jaykorean

Differential Revision: D90037003

Pulled By: hx235

fbshipit-source-id: 8621a8dedef474b02bb16531e0de4ea399659d21
---
 db/compaction/compaction_job.cc | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc
index d5ac5738527b..39d75a10ca0b 100644
--- a/db/compaction/compaction_job.cc
+++ b/db/compaction/compaction_job.cc
@@ -2903,12 +2903,6 @@ void CompactionJob::RestoreCompactionOutputs(
 // - Status::NotFound(): No valid progress to resume from
 // - Status::Corruption(): Resume key is invalid, beyond input range, or output
 // restoration failed
-// - Other non-OK status: Iterator errors or file system issues during
-// restoration
-//
-// The caller must check for Status::IsIncomplete() to distinguish between
-// "no resume needed" (proceed with `InternalIterator::SeekToFirst()`) vs
-// "resume failed" scenarios.
 Status CompactionJob::MaybeResumeSubcompactionProgressOnInputIterator(
     SubcompactionState* sub_compact, InternalIterator* input_iter) {
   const ReadOptions read_options(Env::IOActivity::kCompaction);

From d72a471749751cf192b056a8433b6193fc312636 Mon Sep 17 00:00:00 2001
From: Hui Xiao <huixiao@fb.com>
Date: Thu, 12 Feb 2026 22:30:55 -0800
Subject: [PATCH 470/500] Replace resumable compaction job unit test with
 compaction service unit test (#14191)

Summary:
**Context/Summary**:
compaction_job_test does low level assertions on what keys were saved and to resume in https://github.com/facebook/rocksdb/commit/1e5fa69c99ac8765783f5ce8a3a065b08f5b08a7 before the integration of the feature is done in a separate PR https://github.com/facebook/rocksdb/commit/f7e4009de1d16421a254dd7e799dd91c522d832c. Such low-level test makes it difficult to assert data correctness, is hard to understand by being tied to implementation details.

Therefore they are now replaced with db-level tests  with data correctness check, which is what we ultimately care out of those details. I also expand the test to cover wide column and TimedPut() which associates a key with write time.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14191

Test Plan:
- Only test change; I also manually traced every test to ensure correct resumption point; also removing
```
if (c_iter->IsCurrentKeyAlreadyScanned()) {
    return false;
  }
```
correctly leads to the expected error with resumption at merge, single delete and deletion at bottom

Reviewed By: jaykorean

Differential Revision: D89492846

Pulled By: hx235

fbshipit-source-id: 6c6ab3cbd643ca1b15d049a062da2c76165ef9db
---
 db/compaction/compaction_job_test.cc     | 454 --------------------
 db/compaction/compaction_service_test.cc | 509 +++++++++++++++++++++--
 db/db_impl/db_impl_secondary.cc          |  11 +-
 3 files changed, 473 insertions(+), 501 deletions(-)

diff --git a/db/compaction/compaction_job_test.cc b/db/compaction/compaction_job_test.cc
index ce55dfe4f8ee..7a6f77ee222a 100644
--- a/db/compaction/compaction_job_test.cc
+++ b/db/compaction/compaction_job_test.cc
@@ -2409,460 +2409,6 @@ TEST_F(CompactionJobIOPriorityTest, GetRateLimiterPriority) {
                 kMaxSequenceNumber, 1, false, {kInvalidBlobFileNumber}, true,
                 Env::IO_LOW, Env::IO_LOW);
 }
-
-class ResumableCompactionJobTest : public CompactionJobTestBase {
- public:
-  ResumableCompactionJobTest()
-      : CompactionJobTestBase(
-            test::PerThreadDBPath("allow_resumption_job_test"),
-            BytewiseComparator(), [](uint64_t /*ts*/) { return ""; },
-            /*test_io_priority=*/false, TableTypeForTest::kBlockBasedTable) {}
-
- protected:
-  static constexpr const char* kCancelBeforeThisKey = "cancel_before_this_key";
-  std::string progress_dir_;
-  bool enable_cancel_ = false;
-  std::atomic<int> stop_count_{0};
-  std::atomic<bool> cancel_{false};
-  SequenceNumber cancel_before_seqno = kMaxSequenceNumber;
-
-  void SetUp() override {
-    CompactionJobTestBase::SetUp();
-    SyncPoint::GetInstance()->SetCallBack(
-        "CompactionOutputs::ShouldStopBefore::manual_decision",
-        [this](void* p) {
-          auto* pair = static_cast<std::pair<bool*, const Slice>*>(p);
-          *(pair->first) = true;
-
-          // Cancel after outputting a specific key
-          if (enable_cancel_) {
-            ParsedInternalKey parsed_key;
-            if (ParseInternalKey(pair->second, &parsed_key, true).ok()) {
-              if (parsed_key.user_key == kCancelBeforeThisKey &&
-                  (cancel_before_seqno == kMaxSequenceNumber ||
-                   parsed_key.sequence == cancel_before_seqno)) {
-                cancel_.store(true);
-              }
-            }
-          }
-        });
-    SyncPoint::GetInstance()->EnableProcessing();
-  }
-
-  void TearDown() override {
-    SyncPoint::GetInstance()->DisableProcessing();
-    SyncPoint::GetInstance()->ClearAllCallBacks();
-
-    if (env_->FileExists(progress_dir_).ok()) {
-      std::vector<std::string> files;
-      EXPECT_OK(env_->GetChildren(progress_dir_, &files));
-      for (const auto& file : files) {
-        if (file != "." && file != "..") {
-          EXPECT_OK(env_->DeleteFile(progress_dir_ + "/" + file));
-        }
-      }
-      EXPECT_OK(env_->DeleteDir(progress_dir_));
-    }
-
-    CompactionJobTestBase::TearDown();
-  }
-
-  void NewDB() {
-    if (env_->FileExists(progress_dir_).ok()) {
-      std::vector<std::string> files;
-      EXPECT_OK(env_->GetChildren(progress_dir_, &files));
-      for (const auto& file : files) {
-        if (file != "." && file != "..") {
-          EXPECT_OK(env_->DeleteFile(progress_dir_ + "/" + file));
-        }
-      }
-      EXPECT_OK(env_->DeleteDir(progress_dir_));
-    }
-
-    CompactionJobTestBase::NewDB();
-
-    progress_dir_ = test::PerThreadDBPath("compaction_progress");
-    ASSERT_OK(env_->CreateDirIfMissing(progress_dir_));
-  }
-
-  void EnableCompactionCancel() { enable_cancel_ = true; }
-
-  void DisableCompactionCancel() {
-    enable_cancel_ = false;
-    cancel_.store(false);
-  }
-
-  std::unique_ptr<log::Writer> CreateCompactionProgressWriter(
-      const std::string& compaction_progress_file) {
-    std::unique_ptr<FSWritableFile> file;
-    EXPECT_OK(fs_->NewWritableFile(compaction_progress_file, FileOptions(),
-                                   &file, nullptr));
-    auto file_writer = std::make_unique<WritableFileWriter>(
-        std::move(file), compaction_progress_file, FileOptions());
-    auto compaction_progress_writer =
-        std::make_unique<log::Writer>(std::move(file_writer), 0, false);
-    return compaction_progress_writer;
-  }
-
-  Status RunCompactionWithProgressTracking(
-      const CompactionProgress& compaction_progress,
-      log::Writer* compaction_progress_writer,
-      std::vector<SequenceNumber> snapshots = {},
-      std::shared_ptr<Statistics> stats = nullptr) {
-    mutex_.Lock();
-
-    auto cfd = versions_->GetColumnFamilySet()->GetDefault();
-    auto files = cfd->current()->storage_info()->LevelFiles(0);
-
-    db_options_.statistics = stats;
-    db_options_.stats = db_options_.statistics.get();
-
-    std::vector<CompactionInputFiles> compaction_input_files;
-    CompactionInputFiles level;
-    level.level = 0;
-    level.files = files;
-    compaction_input_files.push_back(level);
-
-    Compaction compaction(
-        cfd->current()->storage_info(), cfd->ioptions(),
-        cfd->GetLatestMutableCFOptions(), mutable_db_options_,
-        compaction_input_files, 1, mutable_cf_options_.target_file_size_base,
-        mutable_cf_options_.max_compaction_bytes, 0, kNoCompression,
-        cfd->GetLatestMutableCFOptions().compression_opts,
-        Temperature::kUnknown, 0, {}, std::nullopt, nullptr,
-        CompactionReason::kManualCompaction);
-    compaction.FinalizeInputInfo(cfd->current());
-
-    LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, db_options_.info_log.get());
-    EventLogger event_logger(db_options_.info_log.get());
-    JobContext job_context(1, false);
-    job_context.InitSnapshotContext(nullptr, nullptr, kMaxSequenceNumber,
-                                    std::move(snapshots));
-    CompactionJobStats job_stats;
-
-    CompactionJob compaction_job(
-        0, &compaction, db_options_, mutable_db_options_, env_options_,
-        versions_.get(), &shutting_down_, &log_buffer, nullptr, nullptr,
-        nullptr, stats.get(), &mutex_, &error_handler_, &job_context,
-        table_cache_, &event_logger, false, false, dbname_, &job_stats,
-        Env::Priority::USER, nullptr, cancel_,
-        CompactionJob::kCompactionAbortedFalse, env_->GenerateUniqueId(),
-        DBImpl::GenerateDbSessionId(nullptr), "");
-
-    compaction_job.Prepare(std::nullopt, compaction_progress,
-                           compaction_progress_writer);
-    mutex_.Unlock();
-
-    compaction_job.Run().PermitUncheckedError();
-    EXPECT_OK(compaction_job.io_status());
-
-    mutex_.Lock();
-
-    bool compaction_released = false;
-    Status s = compaction_job.Install(&compaction_released);
-
-    mutex_.Unlock();
-    if (!compaction_released) {
-      compaction.ReleaseCompactionFiles(s);
-    }
-
-    return s;
-  }
-
-  SubcompactionProgress ReadAndParseProgress(
-      const std::string& compaction_progress_file) {
-    std::unique_ptr<FSSequentialFile> seq_file;
-    EXPECT_OK(fs_->NewSequentialFile(compaction_progress_file, FileOptions(),
-                                     &seq_file, nullptr));
-    auto file_reader = std::make_unique<SequentialFileReader>(
-        std::move(seq_file), compaction_progress_file, 0, nullptr);
-    log::Reader reader(nullptr, std::move(file_reader), nullptr, true, 0);
-
-    SubcompactionProgressBuilder builder;
-    std::string record;
-    Slice slice;
-
-    while (reader.ReadRecord(&slice, &record)) {
-      VersionEdit edit;
-      if (!edit.DecodeFrom(slice).ok()) {
-        continue;
-      }
-      builder.ProcessVersionEdit(edit);
-    }
-
-    EXPECT_TRUE(builder.HasAccumulatedSubcompactionProgress());
-
-    return builder.GetAccumulatedSubcompactionProgress();
-  }
-
-  // Test utility function to verify that compaction progress was correctly
-  // persisted to the progress file after compaction interruption.
-  //
-  // VERIFIES:
-  // - Progress file exists and has expected size (empty if no progress
-  // expected)
-  // - Next internal key to compact matches expected user key with proper format
-  // - Number of processed input records matches position in ordered input keys
-  // - Number of processed output records equals number of processed input
-  // records (by test design to simplify verification)
-  // - Each output file contains exactly one user key (by test design to
-  // simplify verification)
-  void VerifyCompactionProgressPersisted(
-      const std::string& compaction_progress_file,
-      const std::string& next_user_key_to_compact,
-      const std::vector<std::string>& ordered_intput_keys) {
-    ASSERT_OK(env_->FileExists(compaction_progress_file));
-
-    uint64_t file_size;
-    ASSERT_OK(env_->GetFileSize(compaction_progress_file, &file_size));
-
-    if (next_user_key_to_compact.empty()) {
-      ASSERT_EQ(file_size, 0);
-      return;
-    }
-
-    const auto& subcompaction_progress =
-        ReadAndParseProgress(compaction_progress_file);
-
-    ASSERT_FALSE(subcompaction_progress.next_internal_key_to_compact.empty());
-    ParsedInternalKey parsed_next_key;
-    ASSERT_OK(
-        ParseInternalKey(subcompaction_progress.next_internal_key_to_compact,
-                         &parsed_next_key, true /* log_err_key */));
-    ASSERT_EQ(parsed_next_key.user_key, next_user_key_to_compact);
-    ASSERT_EQ(parsed_next_key.sequence, kMaxSequenceNumber);
-    ASSERT_EQ(parsed_next_key.type, kValueTypeForSeek);
-
-    auto it = std::find(ordered_intput_keys.begin(), ordered_intput_keys.end(),
-                        next_user_key_to_compact);
-    ASSERT_TRUE(it != ordered_intput_keys.end());
-
-    auto next_key_index = std::distance(ordered_intput_keys.begin(), it);
-
-    ASSERT_EQ(subcompaction_progress.num_processed_input_records,
-              next_key_index);
-
-    ASSERT_EQ(subcompaction_progress.output_level_progress
-                  .GetNumProcessedOutputRecords(),
-              next_key_index);
-
-    ASSERT_EQ(
-        subcompaction_progress.output_level_progress.GetOutputFiles().size(),
-
-        next_key_index);
-
-    for (size_t i = 0;
-         i <
-         subcompaction_progress.output_level_progress.GetOutputFiles().size();
-         ++i) {
-      const auto& output_file =
-          subcompaction_progress.output_level_progress.GetOutputFiles()[i];
-      ASSERT_EQ(output_file.smallest.user_key().ToString(),
-                output_file.largest.user_key().ToString());
-      ASSERT_EQ(output_file.largest.user_key().ToString(),
-                ordered_intput_keys[i]);
-    }
-  }
-
-  void RunCancelAndResumeTest(
-      const std::initializer_list<mock::KVPair>& input_file_1,
-      const std::initializer_list<mock::KVPair>& input_file_2,
-      uint64_t last_sequence, const std::vector<uint64_t>& snapshots,
-      const std::string& expected_next_key_to_compact,
-      const std::vector<std::string>& expected_input_keys,
-      bool cancelled_past_mid_point = false) {
-    std::shared_ptr<Statistics> stats = ROCKSDB_NAMESPACE::CreateDBStatistics();
-
-    auto file1 = mock::MakeMockFile(input_file_1);
-    AddMockFile(file1);
-    auto file2 = mock::MakeMockFile(input_file_2);
-    AddMockFile(file2);
-    SetLastSequence(last_sequence);
-
-    // First compaction (will be cancelled)
-    std::string compaction_progress_file =
-        CompactionProgressFileName(progress_dir_, 123);
-    std::unique_ptr<log::Writer> compaction_progress_writer =
-        CreateCompactionProgressWriter(compaction_progress_file);
-
-    ASSERT_OK(stats->Reset());
-    EnableCompactionCancel();
-
-    Status status = RunCompactionWithProgressTracking(
-        CompactionProgress{}, compaction_progress_writer.get(), snapshots,
-        stats);
-
-    ASSERT_TRUE(status.IsManualCompactionPaused());
-    DisableCompactionCancel();
-
-    HistogramData cancelled_compaction_stats;
-    stats->histogramData(FILE_WRITE_COMPACTION_MICROS,
-                         &cancelled_compaction_stats);
-
-    VerifyCompactionProgressPersisted(compaction_progress_file,
-                                      expected_next_key_to_compact,
-                                      expected_input_keys);
-
-    // Resume compaction
-    CompactionProgress compaction_progress;
-    if (expected_next_key_to_compact != "") {
-      compaction_progress.push_back(
-          ReadAndParseProgress(compaction_progress_file));
-    }
-
-    std::string compaction_progress_file_2 =
-        CompactionProgressFileName(progress_dir_, 234);
-    std::unique_ptr<log::Writer> compaction_progress_writer_2 =
-        CreateCompactionProgressWriter(compaction_progress_file_2);
-
-    ASSERT_OK(stats->Reset());
-
-    status = RunCompactionWithProgressTracking(
-        compaction_progress, compaction_progress_writer_2.get(),
-        {} /* snapshots */, stats);
-
-    ASSERT_OK(status);
-
-    if (cancelled_past_mid_point) {
-      HistogramData resumed_compaction_stats;
-      stats->histogramData(FILE_WRITE_COMPACTION_MICROS,
-                           &resumed_compaction_stats);
-      ASSERT_GT(cancelled_compaction_stats.count,
-                resumed_compaction_stats.count);
-    }
-  }
-};
-
-TEST_F(ResumableCompactionJobTest, BasicProgressPersistence) {
-  NewDB();
-
-  auto file1 = mock::MakeMockFile({
-      {KeyStr("a", 1U, kTypeValue), "val1"},
-      {KeyStr("b", 2U, kTypeValue), "val2"},
-  });
-  AddMockFile(file1);
-
-  auto file2 = mock::MakeMockFile({
-      {KeyStr("c", 3U, kTypeValue), "val3"},
-      {KeyStr("d", 4U, kTypeValue), "val4"},
-  });
-  AddMockFile(file2);
-
-  SetLastSequence(4U);
-
-  std::string compaction_progress_file =
-      CompactionProgressFileName(progress_dir_, 123);
-
-  std::unique_ptr<log::Writer> compaction_progress_writer =
-      CreateCompactionProgressWriter(compaction_progress_file);
-
-  Status status = RunCompactionWithProgressTracking(
-      CompactionProgress(), compaction_progress_writer.get());
-
-  ASSERT_OK(status);
-
-  VerifyCompactionProgressPersisted(
-      compaction_progress_file, "d" /* next_user_key_to_compact */,
-      {"a", "b", "c", "d"} /* ordered_intput_keys */);
-}
-
-TEST_F(ResumableCompactionJobTest, BasicProgressResume) {
-  NewDB();
-
-  RunCancelAndResumeTest(
-      {{KeyStr("a", 1U, kTypeValue), "val1"},
-       {KeyStr("b", 2U, kTypeValue), "val2"}} /* input_file_1 */,
-      {{KeyStr("bb", 3U, kTypeValue), "val3"},
-       {KeyStr(kCancelBeforeThisKey, 4U, kTypeValue),
-        "val4"}} /* input_file_2 */,
-      4U /* last_sequence */, {} /* snapshots */,
-      kCancelBeforeThisKey /* expected_next_key_to_compact */,
-      {"a", "b", "bb", kCancelBeforeThisKey} /* expected_input_keys */,
-      true /* cancelled_past_mid_point */);
-}
-
-TEST_F(ResumableCompactionJobTest, NoProgressResumeOnSameKey) {
-  NewDB();
-
-  // `cancel_before_seqno` is set to 0U to force cancellation after
-  // `kCancelBeforeThisKey@1` instead of `kCancelBeforeThisKey@2`.
-  // The seqno is 0 because `kCancelBeforeThisKey@1` will have its sequence
-  // number zeroed during compaction while `kCancelBeforeThisKey@2` won't be
-  cancel_before_seqno = 0U;
-  RunCancelAndResumeTest(
-      {{KeyStr(kCancelBeforeThisKey, 1U, kTypeValue),
-        "val1"}} /* input_file_1 */,
-      {{KeyStr(kCancelBeforeThisKey, 2U, kTypeValue), "val11"},
-       {KeyStr("d", 3U, kTypeValue), "val2"}} /* input_file_2 */,
-      3U /* last_sequence */, {1U} /* snapshots */,
-      "" /* expected_next_key_to_compact */,
-      {kCancelBeforeThisKey, kCancelBeforeThisKey,
-       "d"} /* expected_input_keys */);
-}
-
-TEST_F(ResumableCompactionJobTest, NoProgressResumeOnDeleteRange) {
-  NewDB();
-
-  RunCancelAndResumeTest(
-      {{KeyStr("a", 1U, kTypeValue), "val1"},
-       {KeyStr("b", 2U, kTypeValue), "val2"},
-       {KeyStr(kCancelBeforeThisKey, 3U, kTypeValue),
-        "val3"}} /* input_file_1 */,
-      {{KeyStr(kCancelBeforeThisKey, 4U, kTypeRangeDeletion),
-        "range_deletion_end_key"},
-       {KeyStr("d", 5U, kTypeValue), "val4"}} /* input_file_2 */,
-      5U /* last_sequence */, {3U} /* snapshots */,
-      "b" /* expected_next_key_to_compact */,
-      {"a", "b", kCancelBeforeThisKey, kCancelBeforeThisKey,
-       "d"} /* expected_input_keys */);
-}
-
-TEST_F(ResumableCompactionJobTest, NoProgressResumeOnMerge) {
-  merge_op_ = MergeOperators::CreateStringAppendOperator();
-  NewDB();
-
-  RunCancelAndResumeTest(
-      {{KeyStr("a", 1U, kTypeValue), "val1"},
-       {KeyStr("b", 2U, kTypeValue), "val2"}} /* input_file_1 */,
-      {{KeyStr("bb", 3U, kTypeValue), "val3"},
-       {KeyStr(kCancelBeforeThisKey, 4U, kTypeMerge),
-        "val4"}} /* input_file_2 */,
-      4U /* last_sequence */, {} /* snapshots */,
-      "bb" /* expected_next_key_to_compact */,
-      {"a", "b", "bb", kCancelBeforeThisKey} /* expected_input_keys */);
-}
-
-TEST_F(ResumableCompactionJobTest, NoProgressResumeOnSingleDelete) {
-  NewDB();
-
-  RunCancelAndResumeTest(
-      {{KeyStr("a", 1U, kTypeValue), "val1"},
-       {KeyStr("b", 2U, kTypeValue), "val2"},
-       {KeyStr(kCancelBeforeThisKey, 3U, kTypeValue),
-        "val3"}} /* input_file_1 */,
-      {{KeyStr(kCancelBeforeThisKey, 4U, kTypeSingleDeletion), ""},
-       {KeyStr("d", 5U, kTypeValue), "val4"}} /* input_file_2 */,
-      5U /* last_sequence */, {3U} /* snapshots */,
-      "b" /* expected_next_key_to_compact */,
-      {"a", "b", kCancelBeforeThisKey, kCancelBeforeThisKey,
-       "d"} /* expected_input_keys */);
-}
-
-TEST_F(ResumableCompactionJobTest, NoProgressResumeOnDeletionAtBottom) {
-  NewDB();
-
-  RunCancelAndResumeTest(
-      {{KeyStr("a", 1U, kTypeValue), "val1"},
-       {KeyStr("b", 2U, kTypeValue), "val2"},
-       {KeyStr(kCancelBeforeThisKey, 3U, kTypeValue),
-        "val3"}} /* input_file_1 */,
-      {{KeyStr(kCancelBeforeThisKey, 4U, kTypeDeletion), ""},
-       {KeyStr("d", 5U, kTypeValue), "val4"}} /* input_file_2 */,
-      5U /* last_sequence */, {3U} /* snapshots */,
-      "b" /* expected_next_key_to_compact */,
-      {"a", "b", kCancelBeforeThisKey, kCancelBeforeThisKey,
-       "d"} /* expected_input_keys */);
-}
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/db/compaction/compaction_service_test.cc b/db/compaction/compaction_service_test.cc
index bd0a52e4559c..421663d8a0ae 100644
--- a/db/compaction/compaction_service_test.cc
+++ b/db/compaction/compaction_service_test.cc
@@ -195,9 +195,11 @@ class MyTestCompactionService : public CompactionService {
   std::vector<std::shared_ptr<EventListener>> listeners_;
   std::vector<std::shared_ptr<TablePropertiesCollectorFactory>>
       table_properties_collector_factories_;
-  std::atomic_bool canceled_{false};
   std::atomic<CompactionServiceJobStatus> final_updated_status_{
       CompactionServiceJobStatus::kUseLocal};
+
+ protected:
+  std::atomic_bool canceled_{false};
 };
 
 class CompactionServiceTest : public DBTestBase {
@@ -2137,6 +2139,12 @@ class ResumableCompactionService : public MyTestCompactionService {
                                 {} /* table_properties_collector_factories */),
         scenario_(scenario) {}
 
+  // Set the user key where cancellation should happen.
+  void SetCancelAtKey(const std::string& key, SequenceNumber seqno) {
+    cancel_at_key_ = key;
+    cancel_at_seqno_ = seqno;
+  }
+
   CompactionServiceJobStatus Wait(const std::string& scheduled_job_id,
                                   std::string* result) override {
     std::string compaction_input = ExtractCompactionInput(scheduled_job_id);
@@ -2149,25 +2157,50 @@ class ResumableCompactionService : public MyTestCompactionService {
     // ASSUMPTION: This makes stats.count directly proportional to keys
     // processed.
     SyncPoint::GetInstance()->SetCallBack(
-        "CompactionOutputs::ShouldStopBefore::manual_decision", [](void* p) {
+        "CompactionOutputs::ShouldStopBefore::manual_decision",
+        [this](void* p) {
           auto* pair = static_cast<std::pair<bool*, const Slice>*>(p);
-          *(pair->first) = true;
+          *(pair->first) = true;  // Force file cut at every key
+
+          // If cancel_at_key_ is set, cancel when we encounter that key
+          if (!cancel_at_key_.empty() && !already_canceled_) {
+            ParsedInternalKey parsed_key;
+            if (ParseInternalKey(pair->second, &parsed_key, true).ok()) {
+              if (parsed_key.user_key.ToString() == cancel_at_key_) {
+                // Check sequence number if specified
+                if (cancel_at_seqno_ == kMaxSequenceNumber ||
+                    parsed_key.sequence == cancel_at_seqno_) {
+                  canceled_ = true;
+                  already_canceled_ = true;
+                }
+              }
+            }
+          }
         });
+
+    // If no cancel_at_key_ is set, use the original behavior:
     // Simulate cancelled compaction by overriding status at completion. So
     // compaction processes all keys before this point to make stats.count
     // comparison straightforward.
-    SyncPoint::GetInstance()->SetCallBack(
-        "DBImplSecondary::CompactWithoutInstallation::End", [&](void* status) {
-          auto s = static_cast<Status*>(status);
-          *s = Status::Incomplete(Status::SubCode::kManualCompactionPaused);
-        });
+    if (cancel_at_key_.empty()) {
+      SyncPoint::GetInstance()->SetCallBack(
+          "DBImplSecondary::CompactWithoutInstallation::End",
+          [&](void* status) {
+            auto s = static_cast<Status*>(status);
+            *s = Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+          });
+    }
     SyncPoint::GetInstance()->EnableProcessing();
 
     // Phase 1: Run compaction with resumption enabled and cancel it
-    // - Processes all input keys
+    // - Processes input keys until cancellation point
     // - Creates output files and saves progress
     // - Status overridden to "paused"
     open_and_compaction_options.allow_resumption = true;
+    open_and_compaction_options.canceled = &canceled_;
+    already_canceled_ = false;
+    canceled_ = false;
+
     auto phase1_stats =
         RunCancelledCompaction(open_and_compaction_options, scheduled_job_id,
                                compaction_input, override_options);
@@ -2188,6 +2221,9 @@ class ResumableCompactionService : public MyTestCompactionService {
       EXPECT_TRUE(cleanup_status.ok());
       EXPECT_OK(override_options.env->CreateDir(output_dir));
 
+      already_canceled_ = false;
+      canceled_ = false;
+
       phase2_stats =
           RunCancelledCompaction(open_and_compaction_options, scheduled_job_id,
                                  compaction_input, override_options);
@@ -2199,9 +2235,6 @@ class ResumableCompactionService : public MyTestCompactionService {
       EXPECT_EQ(phase2_stats.count, phase1_stats.count);
     }
 
-    SyncPoint::GetInstance()->ClearCallBack(
-        "DBImplSecondary::CompactWithoutInstallation::End");
-
     // Final phase: Run compaction to completion (no cancellation)
     if (scenario_ == TestScenario::kMultipleCancelToggleResumption) {
       // Attempt to resume but it ends up starting fresh
@@ -2220,6 +2253,12 @@ class ResumableCompactionService : public MyTestCompactionService {
       EXPECT_OK(override_options.env->CreateDir(output_dir));
     }
 
+    // Prevent triggering of cancellation
+    SyncPoint::GetInstance()->ClearCallBack(
+        "DBImplSecondary::CompactWithoutInstallation::End");
+    already_canceled_ = true;
+    canceled_ = false;
+
     auto final_phase_stats =
         RunCompaction(open_and_compaction_options, scheduled_job_id,
                       compaction_input, override_options, result);
@@ -2227,36 +2266,38 @@ class ResumableCompactionService : public MyTestCompactionService {
     SyncPoint::GetInstance()->DisableProcessing();
     SyncPoint::GetInstance()->ClearAllCallBacks();
 
-    // Validate statistics based on scenario
-    if (scenario_ == TestScenario::kMultipleCancelToggleResumption) {
-      // ASSUMPTION: Phase 1 processes all keys before cancellation
-      EXPECT_GT(phase1_stats.count, 0);
-
-      // ASSUMPTION: Phase 2 runs with allow_resumption=false and an empty
-      // folder. Phase 2 then creates its own output files (but doesn't save
-      // progress). When Phase 3 starts with allow_resumption=true, it finds no
-      // progress file exists, so it cannot resume and must start from scratch,
-      // processing all input keys again.
-      // Result: Phase 3 does the same amount of work as Phase 1.
-      EXPECT_EQ(final_phase_stats.count, phase1_stats.count);
-
-    } else if (scenario_ == TestScenario::kCancelThenResume) {
-      // ASSUMPTION: Phase 1 processes all keys before cancellation
-      EXPECT_GT(phase1_stats.count, 0);
-
-      // ASSUMPTION: Phase 1 processes all keys and saves progress before
-      // cancellation. Final phase resumes from Phase 1's saved progress.
-      // Since Phase 1 completed all processing before being cancelled, the
-      // final phase should do less work than Phase 1.
-      EXPECT_LT(final_phase_stats.count, phase1_stats.count);
-
-    } else {  // kCancelThenFreshStart
-      // ASSUMPTION: Phase 1 processes all keys before cancellation
-      EXPECT_GT(phase1_stats.count, 0);
-
-      // ASSUMPTION: Final phase starts fresh without resumption, so it
-      // processes all input keys again and creates the same number of files
-      EXPECT_EQ(final_phase_stats.count, phase1_stats.count);
+    // Validate statistics based on scenario (only when cancelling at end)
+    if (cancel_at_key_.empty()) {
+      if (scenario_ == TestScenario::kMultipleCancelToggleResumption) {
+        // ASSUMPTION: Phase 1 processes all keys before cancellation
+        EXPECT_GT(phase1_stats.count, 0);
+
+        // ASSUMPTION: Phase 2 runs with allow_resumption=false and an empty
+        // folder. Phase 2 then creates its own output files (but doesn't save
+        // progress). When Phase 3 starts with allow_resumption=true, it finds
+        // no progress file exists, so it cannot resume and must start from
+        // scratch, processing all input keys again. Result: Phase 3 does the
+        // same amount of work as Phase 1.
+        EXPECT_EQ(final_phase_stats.count, phase1_stats.count);
+
+      } else if (scenario_ == TestScenario::kCancelThenResume) {
+        // ASSUMPTION: Phase 1 processes all keys before cancellation
+        EXPECT_GT(phase1_stats.count, 0);
+
+        // ASSUMPTION: Phase 1 processes all keys and saves progress before
+        // cancellation. Final phase resumes from Phase 1's saved progress.
+        // Since Phase 1 completed all processing before being cancelled, the
+        // final phase should do less work than Phase 1.
+        EXPECT_LT(final_phase_stats.count, phase1_stats.count);
+
+      } else {  // kCancelThenFreshStart
+                // ASSUMPTION: Phase 1 processes all keys before cancellation
+        EXPECT_GT(phase1_stats.count, 0);
+
+        // ASSUMPTION: Final phase starts fresh without resumption, so it
+        // processes all input keys again and creates the same number of files
+        EXPECT_EQ(final_phase_stats.count, phase1_stats.count);
+      }
     }
 
     StoreResult(*result);
@@ -2326,6 +2367,9 @@ class ResumableCompactionService : public MyTestCompactionService {
   }
 
   TestScenario scenario_;
+  std::string cancel_at_key_;
+  SequenceNumber cancel_at_seqno_ = kMaxSequenceNumber;
+  std::atomic<bool> already_canceled_{false};
 };
 
 class ResumableCompactionServiceTest : public CompactionServiceTest {
@@ -2432,6 +2476,387 @@ TEST_F(ResumableCompactionServiceTest,
   RunCompactionCancelTest(ResumableCompactionService::TestScenario::
                               kMultipleCancelToggleResumption);
 }
+
+class ResumableCompactionKeyTypeTest : public CompactionServiceTest {
+ public:
+  explicit ResumableCompactionKeyTypeTest() : CompactionServiceTest() {}
+
+ protected:
+  void SetupResumableCompactionService(
+      Options& options, const std::string& cancel_at_key = "",
+      SequenceNumber cancel_at_seqno = kMaxSequenceNumber) {
+    options.disable_auto_compactions = true;
+    statistics_ = CreateDBStatistics();
+
+    resume_cs_ = std::make_shared<ResumableCompactionService>(
+        dbname_, options, statistics_,
+        ResumableCompactionService::TestScenario::kCancelThenResume);
+
+    if (!cancel_at_key.empty()) {
+      resume_cs_->SetCancelAtKey(cancel_at_key, cancel_at_seqno);
+    }
+
+    options.compaction_service = resume_cs_;
+    DestroyAndReopen(options);
+  }
+
+  void ResetStatistics() { ASSERT_OK(statistics_->Reset()); }
+
+  void VerifyResumeBytes() {
+    uint64_t resumed_bytes =
+        statistics_->getTickerCount(REMOTE_COMPACT_RESUMED_BYTES);
+    ASSERT_GT(resumed_bytes, 0);
+  }
+
+ private:
+  std::shared_ptr<ResumableCompactionService> resume_cs_;
+  std::shared_ptr<Statistics> statistics_;
+};
+
+// Cancel compaction right before processing key "c" to test resumption at a
+//  deletion at the non-bottom level. When resumed, compaction will continue
+//  from this deletion.
+TEST_F(ResumableCompactionKeyTypeTest,
+       CancelAndResumeWithDeleteAtNonBottomLevel) {
+  Options options = CurrentOptions();
+
+  SetupResumableCompactionService(options, "c");
+
+  ASSERT_OK(Put("c", "old_value"));
+  ASSERT_OK(Put("c_placeholder", "placeholder"));
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(options.num_levels - 1);
+
+  ASSERT_OK(Put("a", "val1"));
+  ASSERT_OK(Put("b", "val2"));
+  ASSERT_OK(Put("d", "val4"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Delete("c"));
+  ASSERT_OK(Flush());
+
+  std::vector<std::string> input_files;
+  ColumnFamilyMetaData cf_meta;
+  db_->GetColumnFamilyMetaData(&cf_meta);
+
+  for (const auto& file : cf_meta.levels[0].files) {
+    input_files.push_back(file.name);
+  }
+
+  ASSERT_EQ(input_files.size(), 2);
+
+  ResetStatistics();
+
+  CompactionOptions compact_options;
+  ASSERT_OK(
+      db_->CompactFiles(compact_options, input_files, 1 /* output_level*/));
+
+  ASSERT_EQ(Get("a"), "val1");
+  ASSERT_EQ(Get("b"), "val2");
+  ASSERT_EQ(Get("c"), "NOT_FOUND");
+  ASSERT_EQ(Get("d"), "val4");
+
+  VerifyResumeBytes();
+}
+
+// Cancel compaction right before processing key "c" to test resumption at a
+//  deletion at the ottom level. When resumed, compaction will continue from
+//  the last saved progress point before the delete.
+TEST_F(ResumableCompactionKeyTypeTest, CancelAndResumeWithDeleteAtBottomLevel) {
+  Options options = CurrentOptions();
+
+  SetupResumableCompactionService(options, "c");
+
+  ASSERT_OK(Put("c", "old_value"));
+  const Snapshot* snapshot = db_->GetSnapshot();
+  ASSERT_OK(Delete("c"));
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(options.num_levels - 1);
+
+  ASSERT_OK(Put("a", "val1"));
+  ASSERT_OK(Put("b", "val2"));
+  ASSERT_OK(Put("d", "val4"));
+  ASSERT_OK(Flush());
+
+  ResetStatistics();
+
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+  ASSERT_EQ(Get("a"), "val1");
+  ASSERT_EQ(Get("b"), "val2");
+  ASSERT_EQ(Get("c"), "NOT_FOUND");
+  ASSERT_EQ(Get("c", snapshot), "old_value");
+  ASSERT_EQ(Get("d"), "val4");
+  db_->ReleaseSnapshot(snapshot);
+
+  VerifyResumeBytes();
+}
+
+// Cancel compaction right before processing key "c" to test resumption at a
+// merge operand. When resumed, compaction will continue from the last saved
+// progress point before the merge operand.
+TEST_F(ResumableCompactionKeyTypeTest, CancelAndResumeWithMerge) {
+  Options options = CurrentOptions();
+  options.merge_operator = MergeOperators::CreateStringAppendOperator();
+
+  SetupResumableCompactionService(options, "c");
+
+  ASSERT_OK(Put("c", "old_value"));
+  ASSERT_OK(Put("c_placeholder", "placeholder"));
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(options.num_levels - 1);
+
+  ASSERT_OK(Put("a", "val1"));
+  ASSERT_OK(Put("b", "val2"));
+  ASSERT_OK(Put("d", "val4"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Merge("c", "new_value"));
+  ASSERT_OK(Flush());
+
+  std::vector<std::string> input_files;
+  ColumnFamilyMetaData cf_meta;
+  db_->GetColumnFamilyMetaData(&cf_meta);
+
+  for (const auto& file : cf_meta.levels[0].files) {
+    input_files.push_back(file.name);
+  }
+
+  ASSERT_EQ(input_files.size(), 2);
+
+  ResetStatistics();
+
+  CompactionOptions compact_options;
+  ASSERT_OK(
+      db_->CompactFiles(compact_options, input_files, 1 /* output_level*/));
+
+  ASSERT_EQ(Get("a"), "val1");
+  ASSERT_EQ(Get("b"), "val2");
+  ASSERT_EQ(Get("c"), "old_value,new_value");
+  ASSERT_EQ(Get("d"), "val4");
+
+  VerifyResumeBytes();
+}
+
+// Cancel compaction right before processing key "c" to test resumption at a
+// single delete. When resumed, compaction will continue from the last saved
+// progress point before the single delete.
+TEST_F(ResumableCompactionKeyTypeTest, CancelAndResumeWithSingleDelete) {
+  Options options = CurrentOptions();
+
+  SetupResumableCompactionService(options, "c");
+
+  ASSERT_OK(Put("c", "old_value"));
+  ASSERT_OK(Put("c_placeholder", "placeholder"));
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(options.num_levels - 1);
+
+  ASSERT_OK(Put("a", "val1"));
+  ASSERT_OK(Put("b", "val2"));
+  ASSERT_OK(Put("d", "val4"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(SingleDelete("c"));
+  ASSERT_OK(Flush());
+
+  std::vector<std::string> input_files;
+  ColumnFamilyMetaData cf_meta;
+  db_->GetColumnFamilyMetaData(&cf_meta);
+
+  for (const auto& file : cf_meta.levels[0].files) {
+    input_files.push_back(file.name);
+  }
+
+  ASSERT_EQ(input_files.size(), 2);
+
+  ResetStatistics();
+
+  CompactionOptions compact_options;
+  ASSERT_OK(
+      db_->CompactFiles(compact_options, input_files, 1 /* output_level*/));
+
+  ASSERT_EQ(Get("a"), "val1");
+  ASSERT_EQ(Get("b"), "val2");
+  ASSERT_EQ(Get("c"), "NOT_FOUND");
+  ASSERT_EQ(Get("d"), "val4");
+
+  VerifyResumeBytes();
+}
+
+// Cancel compaction right before processing key "c" to test resumption at a
+// range delete. When resumed, compaction will continue from the last saved
+// progress point before the range delete.
+TEST_F(ResumableCompactionKeyTypeTest, CancelAndResumeWithRangeDelete) {
+  Options options = CurrentOptions();
+
+  SetupResumableCompactionService(options, "c");
+
+  ASSERT_OK(Put("c", "old_value"));
+  ASSERT_OK(Put("c_placeholder", "placeholder"));
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(options.num_levels - 1);
+
+  ASSERT_OK(Put("a", "val1"));
+  ASSERT_OK(Put("b", "val2"));
+  ASSERT_OK(Put("d", "val4"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(
+      db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "c", "c_"));
+  ASSERT_OK(Flush());
+
+  std::vector<std::string> input_files;
+  ColumnFamilyMetaData cf_meta;
+  db_->GetColumnFamilyMetaData(&cf_meta);
+
+  for (const auto& file : cf_meta.levels[0].files) {
+    input_files.push_back(file.name);
+  }
+
+  ASSERT_EQ(input_files.size(), 2);
+
+  ResetStatistics();
+
+  CompactionOptions compact_options;
+  ASSERT_OK(
+      db_->CompactFiles(compact_options, input_files, 1 /* output_level*/));
+
+  ASSERT_EQ(Get("a"), "val1");
+  ASSERT_EQ(Get("b"), "val2");
+  ASSERT_EQ(Get("c"), "NOT_FOUND");
+  ASSERT_EQ(Get("d"), "val4");
+
+  VerifyResumeBytes();
+}
+
+// Test resumption when a key has multiple versions spanning across file
+// boundaries (i.e., the same key exists in multiple SST files).
+//
+// Scenario:
+//   File 1 largest key: key "b"
+//   File 2 smallest key: key "c" with seqno=4 (older version)
+//   File 3 largest key: key "c" with seqno=5 (newer version)
+//
+// Cancel compaction right before processing the older version of key "c".
+// Upon resumption, compaction continues from the saved progress point "b" and
+// correctly processes both versions
+TEST_F(ResumableCompactionKeyTypeTest,
+       CancelAndResumeWithKeySpanningFileBoundaries) {
+  Options options = CurrentOptions();
+
+  // Set up cancellation at the older version of the key which will have
+  // sequence number zero-ed out
+  SetupResumableCompactionService(options, "c" /*cancel_at_key*/, 0 /*seqno*/);
+
+  ASSERT_OK(Put("a", "val1"));
+  ASSERT_OK(Put("b", "val2"));
+  ASSERT_OK(Put("d", "val4"));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put("c", "old_value"));
+  const Snapshot* snapshot = db_->GetSnapshot();
+  ASSERT_OK(Put("c", "new_value"));
+  ASSERT_OK(Flush());
+
+  ResetStatistics();
+
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+  ASSERT_EQ(Get("a"), "val1");
+  ASSERT_EQ(Get("b"), "val2");
+  ASSERT_EQ(Get("c"), "new_value");
+  ASSERT_EQ(Get("c", snapshot), "old_value");
+  ASSERT_EQ(Get("d"), "val4");
+  db_->ReleaseSnapshot(snapshot);
+
+  VerifyResumeBytes();
+}
+
+// Cancel compaction right before processing key "c" to test resumption at a
+// wide column. When resumed, compaction will continue
+// from the wide column.
+TEST_F(ResumableCompactionKeyTypeTest, CancelAndResumeWithWideColumn) {
+  Options options = CurrentOptions();
+
+  SetupResumableCompactionService(options, "c" /*cancel_at_key*/);
+
+  ASSERT_OK(Put("a", "val1"));
+  ASSERT_OK(Put("b", "val2"));
+  ASSERT_OK(Put("d", "val4"));
+  ASSERT_OK(Flush());
+
+  WideColumns columns{{"col1", "value1"}, {"col2", "value2"}};
+  ASSERT_OK(
+      db_->PutEntity(WriteOptions(), db_->DefaultColumnFamily(), "c", columns));
+  ASSERT_OK(Flush());
+
+  ResetStatistics();
+
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+  ASSERT_EQ(Get("a"), "val1");
+  ASSERT_EQ(Get("b"), "val2");
+
+  PinnableWideColumns result;
+  ASSERT_OK(
+      db_->GetEntity(ReadOptions(), db_->DefaultColumnFamily(), "c", &result));
+  WideColumns expected{{"col1", "value1"}, {"col2", "value2"}};
+  ASSERT_EQ(result.columns(), expected);
+
+  ASSERT_EQ(Get("d"), "val4");
+
+  VerifyResumeBytes();
+}
+
+// Cancel compaction right before processing key "c" to test resumption at a
+// timed put. When resumed, compaction will continue
+// from the timed put.
+TEST_F(ResumableCompactionKeyTypeTest, CancelAndResumeWithTimedPut) {
+  Options options = CurrentOptions();
+  options.preclude_last_level_data_seconds = 86400;  // Enable TimedPut feature
+  options.preserve_internal_time_seconds = 86400;    // Preserve write time
+
+  SetupResumableCompactionService(options, "c" /*cancel_at_key*/);
+
+  ASSERT_OK(Put("c", "old_value"));
+  ASSERT_OK(Put("c_placeholder", "placeholder"));
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(options.num_levels - 1);
+
+  ASSERT_OK(Put("a", "val1"));
+  ASSERT_OK(Put("b", "val2"));
+  ASSERT_OK(Put("d", "val4"));
+  ASSERT_OK(Flush());
+
+  // Use TimedPut for key "c" with current write time
+  uint64_t write_time = env_->NowMicros() / 1000000;
+  ASSERT_OK(TimedPut("c", "val3", write_time /*write_unix_time*/));
+  ASSERT_OK(Put("d", "val4"));
+  ASSERT_OK(Flush());
+
+  std::vector<std::string> input_files;
+  ColumnFamilyMetaData cf_meta;
+  db_->GetColumnFamilyMetaData(&cf_meta);
+
+  for (const auto& file : cf_meta.levels[0].files) {
+    input_files.push_back(file.name);
+  }
+
+  ASSERT_EQ(input_files.size(), 2);
+
+  ResetStatistics();
+
+  CompactionOptions compact_options;
+  ASSERT_OK(
+      db_->CompactFiles(compact_options, input_files, 1 /* output_level*/));
+
+  ASSERT_EQ(Get("a"), "val1");
+  ASSERT_EQ(Get("b"), "val2");
+  ASSERT_EQ(Get("c"), "val3");
+  ASSERT_EQ(Get("d"), "val4");
+
+  VerifyResumeBytes();
+}
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/db/db_impl/db_impl_secondary.cc b/db/db_impl/db_impl_secondary.cc
index c8517be25d73..b60c615f880c 100644
--- a/db/db_impl/db_impl_secondary.cc
+++ b/db/db_impl/db_impl_secondary.cc
@@ -987,11 +987,8 @@ Status DBImplSecondary::ParseCompactionProgressFile(
   Slice slice;
   std::string record;
 
-  while (compaction_progress_reader.ReadRecord(&slice, &record)) {
-    if (!reader_status.ok()) {
-      return reader_status;
-    }
-
+  while (compaction_progress_reader.ReadRecord(&slice, &record) &&
+         reader_status.ok()) {
     VersionEdit edit;
     s = edit.DecodeFrom(slice);
     if (!s.ok()) {
@@ -1004,6 +1001,10 @@ Status DBImplSecondary::ParseCompactionProgressFile(
     }
   }
 
+  if (!reader_status.ok()) {
+    return reader_status;
+  }
+
   if (!s.ok()) {
     return s;
   }

From 7ecc12110c0360f01ed830f3cb9ea559100ae627 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Fri, 13 Feb 2026 09:18:40 -0800
Subject: [PATCH 471/500] Fix format compatibility issues, extend test (#14323)

Summary:
See https://github.com/facebook/rocksdb/issues/14240 which brought this to my attention. Here I've added range deletions and compactions to the format compatible test, and fixed or worked-around compatibility issues (likely longstanding).

The first fix was in Version::MaybeInitializeFileMetaData for an assertion failure simply from adding range deletions from some 5.x version.

The second fix is a broader work-around for older SST files with unreliable num_entries/num_range_deletions/num_deletions statistics in their table properties. We depend on them only for some paranoid checks for compaction, so in my assessment the best way to deal with those files is to exclude the paranoid checks when dealing with the files with unrelaible data. (Details in code comments.) The important part is that compacting old files is exceptionally rare, so we aren't really interefering with the paranoid checks doing thier job on an ongoing basis.

This depends on https://github.com/facebook/rocksdb/issues/14315 (just landed) because there is a remaining undiagnosed problem with some very early releases, but I'm not fixing that because its support is being dropped.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14323

Test Plan: test extended (ran locally excluding some releases)

Reviewed By: xingbowang

Differential Revision: D93032653

Pulled By: pdillinger

fbshipit-source-id: f90b32f30ba4764692e68d23705f42c778e0dc1d
---
 db/compaction/compaction_job.cc               | 26 ++++++++++++++
 db/compaction/compaction_job.h                |  4 +++
 db/version_set.cc                             |  2 +-
 tools/check_format_compatible.sh              | 19 ++++++++++-
 tools/compact_db.sh                           | 31 +++++++++++++++++
 tools/generate_random_db.sh                   | 34 +++++++++++++++++++
 .../deleterange_format_compatible.md          |  1 +
 7 files changed, 115 insertions(+), 2 deletions(-)
 create mode 100755 tools/compact_db.sh
 create mode 100644 unreleased_history/bug_fixes/deleterange_format_compatible.md

diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc
index 39d75a10ca0b..a8baddee5dda 100644
--- a/db/compaction/compaction_job.cc
+++ b/db/compaction/compaction_job.cc
@@ -2537,6 +2537,32 @@ bool CompactionJob::UpdateInternalStatsFromInputFiles(
   bool has_error = false;
   const ReadOptions read_options(Env::IOActivity::kCompaction);
   const auto& input_table_properties = compaction->GetInputTableProperties();
+
+  // Check all input files for old block-based SST format_version. Why? Old
+  // block-based SST files from roughly version 5.0 to 5.18 could produce
+  // inaccurate num_entries counts due to the evolution of its handling along
+  // with num_range_deletions. We have to disable some paranoid checks when
+  // compacting files from such an old release. However, we don't have great
+  // information to identify those files, so we heuristically over-approximate
+  // that set of files using
+  // (a) format_version < 5, which will be true for any files from RocksDB <
+  // 6.6.0 and should not be true for any recent production files
+  // (b) to avoid including non-block-based SST files (which still use older
+  // format_version markers, and do not support DeleteRange), we also require
+  // the presence of the user property "rocksdb.block.based.table.index.type",
+  // which was added in RocksDB 2.8 and is always present in block-based tables.
+  for (const auto& tp_pair : input_table_properties) {
+    if (tp_pair.second && tp_pair.second->format_version < 5) {
+      // Check for block-based table by looking for its index type property
+      const auto& user_props = tp_pair.second->user_collected_properties;
+      if (user_props.find(BlockBasedTablePropertyNames::kIndexType) !=
+          user_props.end()) {
+        job_stats_->has_accurate_num_input_records = false;
+        break;
+      }
+    }
+  }
+
   for (int input_level = 0;
        input_level < static_cast<int>(compaction->num_input_levels());
        ++input_level) {
diff --git a/db/compaction/compaction_job.h b/db/compaction/compaction_job.h
index 8b942c6fe64d..21486f89538e 100644
--- a/db/compaction/compaction_job.h
+++ b/db/compaction/compaction_job.h
@@ -254,6 +254,10 @@ class CompactionJob {
   // @param num_input_range_del if non-null, will be set to the number of range
   // deletion entries in this compaction input.
   //
+  // If any input file has potentially unreliable num_entries count (old SST
+  // files - details in implementation),
+  // job_stats_->has_accurate_num_input_records is set to false.
+  //
   // Returns true iff internal_stats_.output_level_stats.num_input_records and
   // num_input_range_del are calculated successfully.
   //
diff --git a/db/version_set.cc b/db/version_set.cc
index d716f6cbfcc1..f3f1ee727cf8 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -3444,7 +3444,7 @@ bool Version::MaybeInitializeFileMetaData(const ReadOptions& read_options,
   // Ensure new invariants on old files
   file_meta->num_deletions =
       std::max(tp->num_deletions, tp->num_range_deletions);
-  file_meta->num_entries = std::max(tp->num_entries, tp->num_deletions);
+  file_meta->num_entries = std::max(tp->num_entries, file_meta->num_deletions);
   return true;
 }
 
diff --git a/tools/check_format_compatible.sh b/tools/check_format_compatible.sh
index 67639a0ca5e7..8b4c5ccdd7c1 100755
--- a/tools/check_format_compatible.sh
+++ b/tools/check_format_compatible.sh
@@ -224,6 +224,17 @@ compare_db()
     set -e
 }
 
+compact_db()
+{
+    set +e
+    [ "$SANITY_CHECK" ] || bash "$script_copy_dir"/compact_db.sh "$1" "$2" "$3"
+    if [ $? -ne 0 ]; then
+        echo ==== Error compacting DB at $1 ====
+        exit 1
+    fi
+    set -e
+}
+
 write_external_sst()
 {
     set +e
@@ -388,10 +399,16 @@ DISABLE_WARNING_AS_ERROR=1 invoke_make ldb -j$J
 
 for checkout_ref in "${checkout_refs[@]}"
 do
-  # We currently assume DB backward compatibility for every branch listed
+  # We assume DB backward compatibility for every branch listed
   echo "== Use $current_checkout_name to open DB generated using $checkout_ref..."
   compare_db $db_test_dir/$checkout_ref $current_db_test_dir db_dump.txt 1 0
 
+  echo "== Use $current_checkout_name to compact DB generated using $checkout_ref..."
+  compact_db $db_test_dir/$checkout_ref 1 0
+
+  echo "== After compaction, re-verify DB originally from $checkout_ref..."
+  compare_db $db_test_dir/$checkout_ref $current_db_test_dir db_dump_after_compact.txt 1 0
+
   if member_of_array "$checkout_ref" "${ext_backward_only_refs[@]}" ||
     member_of_array "$checkout_ref" "${ext_forward_refs[@]}"
   then
diff --git a/tools/compact_db.sh b/tools/compact_db.sh
new file mode 100755
index 000000000000..8bcd95c0e906
--- /dev/null
+++ b/tools/compact_db.sh
@@ -0,0 +1,31 @@
+#!/usr/bin/env bash
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+#
+# A shell script to compact DB generated by generate_random_db.sh.
+# ./ldb needs to be available to be executed.
+#
+# Usage: <SCRIPT> <DB Path> [if_try_load_options] [if_ignore_unknown_options]
+
+if [ "$#" -lt 1 ]; then
+  echo "usage: $BASH_SOURCE <db_directory> [if_try_load_options] [if_ignore_unknown_options]"
+  exit 1
+fi
+
+db_dir=$1
+try_load_options=${2:-"1"}
+ignore_unknown_options=${3:-"0"}
+extra_params=
+
+if [ "$try_load_options" = "0" ]; then
+  extra_params=" --try_load_options=false"
+elif [ "$try_load_options" = "1" ]; then
+  extra_params=" --try_load_options=true"
+fi
+
+if [ "$ignore_unknown_options" = "1" ]; then
+  extra_params="$extra_params --ignore_unknown_options"
+fi
+
+set -e
+echo == Compacting DB at $db_dir
+./ldb compact --db=$db_dir $extra_params
diff --git a/tools/generate_random_db.sh b/tools/generate_random_db.sh
index a05c1f5a2133..e668d56a2393 100755
--- a/tools/generate_random_db.sh
+++ b/tools/generate_random_db.sh
@@ -29,6 +29,12 @@ if ./ldb --version 2>/dev/null >/dev/null; then
   rm -rf $db_dir
 fi
 
+# Check if deleterange command is supported by grepping ldb --help
+deleterange_support=
+if ./ldb --help 2>&1 | grep -q deleterange; then
+  deleterange_support=1
+fi
+
 echo == Loading data from $input_data_dir to $db_dir
 
 declare -a compression_opts=("no" "snappy" "zlib" "bzip2")
@@ -65,5 +71,33 @@ do
   fi
   ./ldb load --db=$db_dir --compression_type=$c $d_arg --bloom_bits=10 \
     --auto_compaction=false --create_if_missing < $input_data_dir/$f
+
+  # Use md5sum of file to deterministically decide whether to add a range
+  # tombstone (approximately 1/4 of files) and which key to delete
+  file_path=$input_data_dir/$f
+  hash=$(md5sum "$file_path" | cut -c1-8)
+  hash_int=$((16#$hash))
+
+  if [ $((hash_int % 4)) -eq 0 ]; then
+    # Pick a key from this file based on the hash
+    line_count=$(wc -l < "$file_path")
+    if [ "$line_count" -gt 0 ]; then
+      line_num=$((hash_int % line_count + 1))
+      key=$(sed -n "${line_num}p" "$file_path" | cut -d' ' -f1)
+      if [ -n "$key" ]; then
+        # Create end key by appending a character to make a small range
+        end_key="${key}0"
+        if [ "$deleterange_support" == "1" ]; then
+          echo "== Deleting range [$key, $end_key) from $f"
+          ./ldb deleterange --db=$db_dir "$key" "$end_key"
+        else
+          # Fall back to point delete for equivalent logical contents
+          echo "== Deleting key $key from $f"
+          ./ldb delete --db=$db_dir "$key"
+        fi
+      fi
+    fi
+  fi
+
   let "n = n + 1"
 done
diff --git a/unreleased_history/bug_fixes/deleterange_format_compatible.md b/unreleased_history/bug_fixes/deleterange_format_compatible.md
new file mode 100644
index 000000000000..150faffc3695
--- /dev/null
+++ b/unreleased_history/bug_fixes/deleterange_format_compatible.md
@@ -0,0 +1 @@
+* Fix longstanding failures that can arise from reading and/or compacting old DB dirs with range deletions (likely from version < 5.19.0) in many newer versions.

From c33a4989ad80f199fccc00f40c9803f638dc66c6 Mon Sep 17 00:00:00 2001
From: Hui Xiao <huixiao@fb.com>
Date: Fri, 13 Feb 2026 10:37:34 -0800
Subject: [PATCH 472/500] Default
 CompactionOptionsUniversal::reduce_file_locking to be true (#14329)

Summary:
**Context/Summary:**

Internal adoption has demonstrated stability and measurable improvements of this feature without much cost so we can turn it on by default. Eventually we'd like to remove this configuration and make this an expected behavior.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14329

Test Plan: Existing unit test

Reviewed By: mszeszko-meta

Differential Revision: D93210059

Pulled By: hx235

fbshipit-source-id: 04f77954e6624c8e60a2db030eb19eb341dd0fcf
---
 include/rocksdb/universal_compaction.h                      | 6 ++----
 .../behavior_changes/reduce_file_locking_default_true.md    | 1 +
 2 files changed, 3 insertions(+), 4 deletions(-)
 create mode 100644 unreleased_history/behavior_changes/reduce_file_locking_default_true.md

diff --git a/include/rocksdb/universal_compaction.h b/include/rocksdb/universal_compaction.h
index d94e9653aa61..9a52ee539db2 100644
--- a/include/rocksdb/universal_compaction.h
+++ b/include/rocksdb/universal_compaction.h
@@ -111,8 +111,6 @@ class CompactionOptionsUniversal {
   // Default: false
   bool incremental;
 
-  // EXPERIMENTAL
-  //
   // If true, auto universal compaction picking will adjust to minimize locking
   // of input files when bottom priority compactions are waiting to run. This
   // can increase the likelihood of existing L0s being selected for compaction,
@@ -120,7 +118,7 @@ class CompactionOptionsUniversal {
   // the overrall write amplification and compaction load on low priority
   // threads.
   //
-  // Default: false (disabled)
+  // Default: true (enabled)
   //
   // This options does not apply to manual compactions.
   //
@@ -142,7 +140,7 @@ class CompactionOptionsUniversal {
         stop_style(kCompactionStopStyleTotalSize),
         allow_trivial_move(false),
         incremental(false),
-        reduce_file_locking(false) {}
+        reduce_file_locking(true) {}
 
   bool operator==(const CompactionOptionsUniversal& rhs) const = default;
 };
diff --git a/unreleased_history/behavior_changes/reduce_file_locking_default_true.md b/unreleased_history/behavior_changes/reduce_file_locking_default_true.md
new file mode 100644
index 000000000000..31968f307888
--- /dev/null
+++ b/unreleased_history/behavior_changes/reduce_file_locking_default_true.md
@@ -0,0 +1 @@
+Change the default value of `CompactionOptionsUniversal::reduce_file_locking` from `false` to `true` to improve write stall and reduce read regression

From 672389fd8c9f7c4efa74a8ba3d3945ff79bd149a Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Fri, 13 Feb 2026 11:18:05 -0800
Subject: [PATCH 473/500] Remove obsolete compression code and some .h->.cc
 movement (#14325)

Summary:
In follow-up to https://github.com/facebook/rocksdb/issues/14315

Remove obsolete code replaced by new Compressor/Decompressor interface:
* OLD_CompressData and OLD_UncompressData
* Individual compression/decompression functions (Snappy_*, Zlib_*, BZip2_*, LZ4_*, LZ4HC_*, XPRESS_*, ZSTD_Compress, ZSTD_Uncompress)
* CompressionInfo and UncompressionInfo classes
* UncompressionDict class
* compression::PutDecompressedSizeInfo and GetDecompressedSizeInfo

The only small refactoring in this change that is not pure code removal or movement is in blob_file_builder_test.cc.

Move some function implementations etc. from compression.h to compression.cc:
* CompressionTypeToString, CompressionTypeFromString, CompressionOptionsToString
* ZSTD_TrainDictionary (both overloads), ZSTD_FinalizeDictionary
* DecompressorDict::Populate
* Most compression library includes

Also cleaned up other includes of compression.h, which caused some other files to need new includes.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14325

Test Plan: existing tests

Reviewed By: hx235

Differential Revision: D93120580

Pulled By: pdillinger

fbshipit-source-id: ab5c50db7379c0387a8c0e379642c9ea2799eae5
---
 db/blob/blob_file_builder_test.cc             |   17 +-
 db/version_edit.h                             |    1 +
 env/env.cc                                    |    1 +
 options/cf_options.cc                         |    1 +
 port/port_example.h                           |   23 -
 table/block_based/uncompression_dict_reader.h |    1 -
 test_util/testutil.cc                         |    1 +
 .../block_cache_trace_analyzer_test.cc        |    1 +
 util/compression.cc                           |  261 ++++
 util/compression.h                            | 1290 +----------------
 10 files changed, 278 insertions(+), 1319 deletions(-)

diff --git a/db/blob/blob_file_builder_test.cc b/db/blob/blob_file_builder_test.cc
index c7b830717998..ad09238e2f4f 100644
--- a/db/blob/blob_file_builder_test.cc
+++ b/db/blob/blob_file_builder_test.cc
@@ -403,22 +403,19 @@ TEST_F(BlobFileBuilderTest, Compression) {
   ASSERT_EQ(blob_file_addition.GetBlobFileNumber(), blob_file_number);
   ASSERT_EQ(blob_file_addition.GetTotalBlobCount(), 1);
 
-  CompressionOptions opts;
-  CompressionContext context(kSnappyCompression, opts);
-
-  CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(),
-                       kSnappyCompression);
-
-  std::string compressed_value;
-  ASSERT_TRUE(Snappy_Compress(info, uncompressed_value.data(),
-                              uncompressed_value.size(), &compressed_value));
+  auto compressor =
+      GetBuiltinV2CompressionManager()->GetCompressor({}, kSnappyCompression);
+  GrowableBuffer compressed_value;
+  ASSERT_OK(LegacyForceBuiltinCompression(*compressor, /*working_area=*/nullptr,
+                                          uncompressed_value,
+                                          &compressed_value));
 
   ASSERT_EQ(blob_file_addition.GetTotalBlobBytes(),
             BlobLogRecord::kHeaderSize + key_size + compressed_value.size());
 
   // Verify the contents of the new blob file as well as the blob reference
   std::vector<std::pair<std::string, std::string>> expected_key_value_pairs{
-      {key, compressed_value}};
+      {key, compressed_value.AsSlice().ToString()}};
   std::vector<std::string> blob_indexes{blob_index};
 
   VerifyBlobFile(blob_file_number, blob_file_path, column_family_id,
diff --git a/db/version_edit.h b/db/version_edit.h
index 742d2f8b0e52..ee6a6b01be43 100644
--- a/db/version_edit.h
+++ b/db/version_edit.h
@@ -25,6 +25,7 @@
 #include "rocksdb/advanced_options.h"
 #include "table/table_reader.h"
 #include "table/unique_id_impl.h"
+#include "test_util/sync_point.h"
 #include "util/autovector.h"
 
 namespace ROCKSDB_NAMESPACE {
diff --git a/env/env.cc b/env/env.cc
index bfc226a20928..80d65cced3a5 100644
--- a/env/env.cc
+++ b/env/env.cc
@@ -27,6 +27,7 @@
 #include "rocksdb/utilities/object_registry.h"
 #include "rocksdb/utilities/options_type.h"
 #include "util/autovector.h"
+#include "util/string_util.h"
 
 namespace ROCKSDB_NAMESPACE {
 namespace {
diff --git a/options/cf_options.cc b/options/cf_options.cc
index ba1360aa841e..98c010406b43 100644
--- a/options/cf_options.cc
+++ b/options/cf_options.cc
@@ -30,6 +30,7 @@
 #include "rocksdb/utilities/object_registry.h"
 #include "rocksdb/utilities/options_type.h"
 #include "util/cast_util.h"
+#include "util/string_util.h"
 
 // NOTE: in this file, many option flags that were deprecated
 // and removed from the rest of the code have to be kept here
diff --git a/port/port_example.h b/port/port_example.h
index f9e94d00f865..6bbb5b2e330b 100644
--- a/port/port_example.h
+++ b/port/port_example.h
@@ -74,28 +74,5 @@ using OnceType = intptr_t;
 #define LEVELDB_ONCE_INIT 0
 void InitOnce(port::OnceType*, void (*initializer)());
 
-// ------------------ Compression -------------------
-
-// Store the snappy compression of "input[0,input_length-1]" in *output.
-// Returns false if snappy is not supported by this port.
-bool Snappy_Compress(const char* input, size_t input_length,
-                     std::string* output);
-
-// If input[0,input_length-1] looks like a valid snappy compressed
-// buffer, store the size of the uncompressed data in *result and
-// return true.  Else return false.
-bool Snappy_GetUncompressedLength(const char* input, size_t length,
-                                  size_t* result);
-
-// Attempt to snappy uncompress input[0,input_length-1] into *output.
-// Returns true if successful, false if the input is invalid lightweight
-// compressed data.
-//
-// REQUIRES: at least the first "n" bytes of output[] must be writable
-// where "n" is the result of a successful call to
-// Snappy_GetUncompressedLength.
-bool Snappy_Uncompress(const char* input_data, size_t input_length,
-                       char* output);
-
 }  // namespace port
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/table/block_based/uncompression_dict_reader.h b/table/block_based/uncompression_dict_reader.h
index 30ec81482b6e..d0579a66055c 100644
--- a/table/block_based/uncompression_dict_reader.h
+++ b/table/block_based/uncompression_dict_reader.h
@@ -18,7 +18,6 @@ struct BlockCacheLookupContext;
 class FilePrefetchBuffer;
 class GetContext;
 struct ReadOptions;
-struct UncompressionDict;
 
 // Provides access to the uncompression dictionary regardless of whether
 // it is owned by the reader or stored in the cache, or whether it is pinned
diff --git a/test_util/testutil.cc b/test_util/testutil.cc
index acb8ec6aa1e7..f9f9e0bf680a 100644
--- a/test_util/testutil.cc
+++ b/test_util/testutil.cc
@@ -29,6 +29,7 @@
 #include "test_util/mock_time_env.h"
 #include "test_util/sync_point.h"
 #include "util/random.h"
+#include "util/string_util.h"
 
 #ifndef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS
 void RegisterCustomObjects(int /*argc*/, char** /*argv*/) {}
diff --git a/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc b/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc
index 0b954617bdd3..146e1d5c174e 100644
--- a/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc
+++ b/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc
@@ -26,6 +26,7 @@ int main() {
 #include "test_util/testutil.h"
 #include "tools/block_cache_analyzer/block_cache_trace_analyzer.h"
 #include "trace_replay/block_cache_tracer.h"
+#include "util/string_util.h"
 
 namespace ROCKSDB_NAMESPACE {
 
diff --git a/util/compression.cc b/util/compression.cc
index 19e1e6584d65..f5ceb7a149df 100644
--- a/util/compression.cc
+++ b/util/compression.cc
@@ -5,12 +5,156 @@
 
 #include "util/compression.h"
 
+#ifdef BZIP2
+#include <bzlib.h>
+#endif  // BZIP2
+
+#include <limits>
+
+#ifdef LZ4
+#include <lz4.h>
+#include <lz4hc.h>
+#if LZ4_VERSION_NUMBER < 10700  // < r129
+#error "LZ4 support requires version >= 1.7.0 (lz4-devel)"
+#endif  // LZ4_VERSION_NUMBER < 10700
+#endif  // LZ4
+
+#ifdef SNAPPY
+#include <snappy-sinksource.h>
+#include <snappy.h>
+#endif  // SNAPPY
+
+#ifdef ZLIB
+#include <zlib.h>
+#endif  // ZLIB
+
 #include "options/options_helper.h"
+#include "port/likely.h"
 #include "rocksdb/convenience.h"
 #include "rocksdb/utilities/object_registry.h"
+#include "test_util/sync_point.h"
+#include "util/cast_util.h"
+#include "util/string_util.h"
 
 namespace ROCKSDB_NAMESPACE {
 
+// WART: does not match OptionsHelper::compression_type_string_map
+std::string CompressionTypeToString(CompressionType compression_type) {
+  switch (compression_type) {
+    case kNoCompression:
+      return "NoCompression";
+    case kSnappyCompression:
+      return "Snappy";
+    case kZlibCompression:
+      return "Zlib";
+    case kBZip2Compression:
+      return "BZip2";
+    case kLZ4Compression:
+      return "LZ4";
+    case kLZ4HCCompression:
+      return "LZ4HC";
+    case kXpressCompression:
+      return "Xpress";
+    case kZSTD:
+      return "ZSTD";
+    case kDisableCompressionOption:
+      return "DisableOption";
+    default: {
+      bool is_custom = compression_type >= kFirstCustomCompression &&
+                       compression_type <= kLastCustomCompression;
+      unsigned char c = lossless_cast<unsigned char>(compression_type);
+      return (is_custom ? "Custom" : "Reserved") +
+             ToBaseCharsString<16>(2, c, /*uppercase=*/true);
+    }
+  }
+}
+
+// WART: does not match OptionsHelper::compression_type_string_map
+CompressionType CompressionTypeFromString(std::string compression_type_str) {
+  if (!compression_type_str.empty()) {
+    switch (compression_type_str[0]) {
+      case 'N':
+        if (compression_type_str == "NoCompression") {
+          return kNoCompression;
+        }
+        break;
+      case 'S':
+        if (compression_type_str == "Snappy") {
+          return kSnappyCompression;
+        }
+        break;
+      case 'Z':
+        if (compression_type_str == "ZSTD") {
+          return kZSTD;
+        }
+        if (compression_type_str == "Zlib") {
+          return kZlibCompression;
+        }
+        break;
+      case 'B':
+        if (compression_type_str == "BZip2") {
+          return kBZip2Compression;
+        }
+        break;
+      case 'L':
+        if (compression_type_str == "LZ4") {
+          return kLZ4Compression;
+        }
+        if (compression_type_str == "LZ4HC") {
+          return kLZ4HCCompression;
+        }
+        break;
+      case 'X':
+        if (compression_type_str == "Xpress") {
+          return kXpressCompression;
+        }
+        break;
+      default:;
+    }
+  }
+  // unrecognized
+  return kDisableCompressionOption;
+}
+
+std::string CompressionOptionsToString(
+    const CompressionOptions& compression_options) {
+  std::string result;
+  result.reserve(512);
+  result.append("window_bits=")
+      .append(std::to_string(compression_options.window_bits))
+      .append("; ");
+  result.append("level=")
+      .append(std::to_string(compression_options.level))
+      .append("; ");
+  result.append("strategy=")
+      .append(std::to_string(compression_options.strategy))
+      .append("; ");
+  result.append("max_dict_bytes=")
+      .append(std::to_string(compression_options.max_dict_bytes))
+      .append("; ");
+  result.append("zstd_max_train_bytes=")
+      .append(std::to_string(compression_options.zstd_max_train_bytes))
+      .append("; ");
+  // NOTE: parallel_threads is skipped because it doesn't really affect the file
+  // contents written, arguably doesn't belong in CompressionOptions
+  result.append("enabled=")
+      .append(std::to_string(compression_options.enabled))
+      .append("; ");
+  result.append("max_dict_buffer_bytes=")
+      .append(std::to_string(compression_options.max_dict_buffer_bytes))
+      .append("; ");
+  result.append("use_zstd_dict_trainer=")
+      .append(std::to_string(compression_options.use_zstd_dict_trainer))
+      .append("; ");
+  result.append("max_compressed_bytes_per_kb=")
+      .append(std::to_string(compression_options.max_compressed_bytes_per_kb))
+      .append("; ");
+  result.append("checksum=")
+      .append(std::to_string(compression_options.checksum))
+      .append("; ");
+  return result;
+}
+
 StreamingCompress* StreamingCompress::Create(CompressionType compression_type,
                                              const CompressionOptions& opts,
                                              uint32_t compress_format_version,
@@ -123,6 +267,123 @@ void ZSTDStreamingUncompress::Reset() {
 #endif
 }
 
+void DecompressorDict::Populate(Decompressor& from_decompressor, Slice dict) {
+  if (UNLIKELY(dict.empty())) {
+    dict_str_ = {};
+    dict_allocation_ = {};
+    // Appropriately reject bad files with empty dictionary block.
+    // It is longstanding not to write an empty dictionary block:
+    // https://github.com/facebook/rocksdb/blame/10.2.fb/table/block_based/block_based_table_builder.cc#L1841
+    decompressor_ = std::make_unique<FailureDecompressor>(
+        Status::Corruption("Decompression dictionary is empty"));
+  } else {
+    Status s = from_decompressor.MaybeCloneForDict(dict, &decompressor_);
+    if (decompressor_ == nullptr) {
+      dict_str_ = {};
+      dict_allocation_ = {};
+      assert(!s.ok());
+      decompressor_ = std::make_unique<FailureDecompressor>(std::move(s));
+    } else {
+      assert(s.ok());
+      assert(decompressor_->GetSerializedDict() == dict);
+    }
+  }
+
+  memory_usage_ = sizeof(struct DecompressorDict);
+  memory_usage_ += dict_str_.size();
+  if (dict_allocation_) {
+    auto allocator = dict_allocation_.get_deleter().allocator;
+    if (allocator) {
+      memory_usage_ +=
+          allocator->UsableSize(dict_allocation_.get(), GetRawDict().size());
+    } else {
+      memory_usage_ += GetRawDict().size();
+    }
+  }
+  memory_usage_ += decompressor_->ApproximateOwnedMemoryUsage();
+}
+
+// ZSTD dictionary training implementations
+std::string ZSTD_TrainDictionary(const std::string& samples,
+                                 const std::vector<size_t>& sample_lens,
+                                 size_t max_dict_bytes) {
+#ifdef ZSTD
+  assert(samples.empty() == sample_lens.empty());
+  if (samples.empty()) {
+    return "";
+  }
+  std::string dict_data(max_dict_bytes, '\0');
+  size_t dict_len = ZDICT_trainFromBuffer(
+      &dict_data[0], max_dict_bytes, &samples[0], &sample_lens[0],
+      static_cast<unsigned>(sample_lens.size()));
+  if (ZDICT_isError(dict_len)) {
+    return "";
+  }
+  assert(dict_len <= max_dict_bytes);
+  dict_data.resize(dict_len);
+  return dict_data;
+#else
+  assert(false);
+  (void)samples;
+  (void)sample_lens;
+  (void)max_dict_bytes;
+  return "";
+#endif  // ZSTD
+}
+
+std::string ZSTD_TrainDictionary(const std::string& samples,
+                                 size_t sample_len_shift,
+                                 size_t max_dict_bytes) {
+#ifdef ZSTD
+  // skips potential partial sample at the end of "samples"
+  size_t num_samples = samples.size() >> sample_len_shift;
+  std::vector<size_t> sample_lens(num_samples, size_t(1) << sample_len_shift);
+  return ZSTD_TrainDictionary(samples, sample_lens, max_dict_bytes);
+#else
+  assert(false);
+  (void)samples;
+  (void)sample_len_shift;
+  (void)max_dict_bytes;
+  return "";
+#endif  // ZSTD
+}
+
+std::string ZSTD_FinalizeDictionary(const std::string& samples,
+                                    const std::vector<size_t>& sample_lens,
+                                    size_t max_dict_bytes, int level) {
+#ifdef ROCKSDB_ZDICT_FINALIZE
+  assert(samples.empty() == sample_lens.empty());
+  if (samples.empty()) {
+    return "";
+  }
+  if (level == CompressionOptions::kDefaultCompressionLevel) {
+    // NB: ZSTD_CLEVEL_DEFAULT is historically == 3
+    level = ZSTD_CLEVEL_DEFAULT;
+  }
+  std::string dict_data(max_dict_bytes, '\0');
+  size_t dict_len = ZDICT_finalizeDictionary(
+      dict_data.data(), max_dict_bytes, samples.data(),
+      std::min(static_cast<size_t>(samples.size()), max_dict_bytes),
+      samples.data(), sample_lens.data(),
+      static_cast<unsigned>(sample_lens.size()),
+      {level, 0 /* notificationLevel */, 0 /* dictID */});
+  if (ZDICT_isError(dict_len)) {
+    return "";
+  } else {
+    assert(dict_len <= max_dict_bytes);
+    dict_data.resize(dict_len);
+    return dict_data;
+  }
+#else
+  assert(false);
+  (void)samples;
+  (void)sample_lens;
+  (void)max_dict_bytes;
+  (void)level;
+  return "";
+#endif  // ROCKSDB_ZDICT_FINALIZE
+}
+
 // ***********************************************************************
 // BEGIN built-in implementation of customization interface
 // ***********************************************************************
diff --git a/util/compression.h b/util/compression.h
index 6d6613a618d7..beb07c8de694 100644
--- a/util/compression.h
+++ b/util/compression.h
@@ -10,49 +10,14 @@
 #pragma once
 
 #include <algorithm>
-#include <limits>
-
-#ifdef ROCKSDB_MALLOC_USABLE_SIZE
-#ifdef OS_FREEBSD
-#include <malloc_np.h>
-#else  // OS_FREEBSD
-#include <malloc.h>
-#endif  // OS_FREEBSD
-#endif  // ROCKSDB_MALLOC_USABLE_SIZE
-#include <string>
 
 #include "memory/memory_allocator_impl.h"
-#include "port/likely.h"
 #include "rocksdb/advanced_compression.h"
 #include "rocksdb/options.h"
 #include "table/block_based/block_type.h"
-#include "test_util/sync_point.h"
 #include "util/aligned_buffer.h"
-#include "util/cast_util.h"
 #include "util/coding.h"
 #include "util/compression_context_cache.h"
-#include "util/string_util.h"
-
-#ifdef SNAPPY
-#include <snappy-sinksource.h>
-#include <snappy.h>
-#endif
-
-#ifdef ZLIB
-#include <zlib.h>
-#endif
-
-#ifdef BZIP2
-#include <bzlib.h>
-#endif
-
-#if defined(LZ4)
-#include <lz4.h>
-#include <lz4hc.h>
-#if LZ4_VERSION_NUMBER < 10700  // < r129
-#error "LZ4 support requires version >= 1.7.0 (lz4-devel)"
-#endif
-#endif
 
 #ifdef ZSTD
 #include <zstd.h>
@@ -244,41 +209,7 @@ struct DecompressorDict {
   size_t ApproximateMemoryUsage() const { return memory_usage_; }
 
  private:
-  void Populate(Decompressor& from_decompressor, Slice dict) {
-    if (UNLIKELY(dict.empty())) {
-      dict_str_ = {};
-      dict_allocation_ = {};
-      // Appropriately reject bad files with empty dictionary block.
-      // It is longstanding not to write an empty dictionary block:
-      // https://github.com/facebook/rocksdb/blame/10.2.fb/table/block_based/block_based_table_builder.cc#L1841
-      decompressor_ = std::make_unique<FailureDecompressor>(
-          Status::Corruption("Decompression dictionary is empty"));
-    } else {
-      Status s = from_decompressor.MaybeCloneForDict(dict, &decompressor_);
-      if (decompressor_ == nullptr) {
-        dict_str_ = {};
-        dict_allocation_ = {};
-        assert(!s.ok());
-        decompressor_ = std::make_unique<FailureDecompressor>(std::move(s));
-      } else {
-        assert(s.ok());
-        assert(decompressor_->GetSerializedDict() == dict);
-      }
-    }
-
-    memory_usage_ = sizeof(struct DecompressorDict);
-    memory_usage_ += dict_str_.size();
-    if (dict_allocation_) {
-      auto allocator = dict_allocation_.get_deleter().allocator;
-      if (allocator) {
-        memory_usage_ +=
-            allocator->UsableSize(dict_allocation_.get(), GetRawDict().size());
-      } else {
-        memory_usage_ += GetRawDict().size();
-      }
-    }
-    memory_usage_ += decompressor_->ApproximateOwnedMemoryUsage();
-  }
+  void Populate(Decompressor& from_decompressor, Slice dict);
 };
 
 // Holds dictionary and related data, like ZSTD's digested compression
@@ -358,140 +289,6 @@ struct CompressionDict {
   CompressionDict& operator=(const CompressionDict&) = delete;
 };
 
-// Holds dictionary and related data, like ZSTD's digested uncompression
-// dictionary.
-struct UncompressionDict {
-  // Block containing the data for the compression dictionary in case the
-  // constructor that takes a string parameter is used.
-  std::string dict_;
-
-  // Block containing the data for the compression dictionary in case the
-  // constructor that takes a Slice parameter is used and the passed in
-  // CacheAllocationPtr is not nullptr.
-  CacheAllocationPtr allocation_;
-
-  // Slice pointing to the compression dictionary data. Can point to
-  // dict_, allocation_, or some other memory location, depending on how
-  // the object was constructed.
-  Slice slice_;
-
-#ifdef ROCKSDB_ZSTD_DDICT
-  // Processed version of the contents of slice_ for ZSTD compression.
-  ZSTD_DDict* zstd_ddict_ = nullptr;
-#endif  // ROCKSDB_ZSTD_DDICT
-
-  UncompressionDict(std::string&& dict, bool using_zstd)
-      : dict_(std::move(dict)), slice_(dict_) {
-#ifdef ROCKSDB_ZSTD_DDICT
-    if (!slice_.empty() && using_zstd) {
-      zstd_ddict_ = ZSTD_createDDict_byReference(slice_.data(), slice_.size());
-      assert(zstd_ddict_ != nullptr);
-    }
-#else
-    (void)using_zstd;
-#endif  // ROCKSDB_ZSTD_DDICT
-  }
-
-  UncompressionDict(Slice slice, CacheAllocationPtr&& allocation,
-                    bool using_zstd)
-      : allocation_(std::move(allocation)), slice_(std::move(slice)) {
-#ifdef ROCKSDB_ZSTD_DDICT
-    if (!slice_.empty() && using_zstd) {
-      zstd_ddict_ = ZSTD_createDDict_byReference(slice_.data(), slice_.size());
-      assert(zstd_ddict_ != nullptr);
-    }
-#else
-    (void)using_zstd;
-#endif  // ROCKSDB_ZSTD_DDICT
-  }
-
-  UncompressionDict(UncompressionDict&& rhs)
-      : dict_(std::move(rhs.dict_)),
-        allocation_(std::move(rhs.allocation_)),
-        slice_(std::move(rhs.slice_))
-#ifdef ROCKSDB_ZSTD_DDICT
-        ,
-        zstd_ddict_(rhs.zstd_ddict_)
-#endif
-  {
-#ifdef ROCKSDB_ZSTD_DDICT
-    rhs.zstd_ddict_ = nullptr;
-#endif
-  }
-
-  ~UncompressionDict() {
-#ifdef ROCKSDB_ZSTD_DDICT
-    size_t res = 0;
-    if (zstd_ddict_ != nullptr) {
-      res = ZSTD_freeDDict(zstd_ddict_);
-    }
-    assert(res == 0);  // Last I checked they can't fail
-    (void)res;         // prevent unused var warning
-#endif                 // ROCKSDB_ZSTD_DDICT
-  }
-
-  UncompressionDict& operator=(UncompressionDict&& rhs) {
-    if (this == &rhs) {
-      return *this;
-    }
-
-    dict_ = std::move(rhs.dict_);
-    allocation_ = std::move(rhs.allocation_);
-    slice_ = std::move(rhs.slice_);
-
-#ifdef ROCKSDB_ZSTD_DDICT
-    zstd_ddict_ = rhs.zstd_ddict_;
-    rhs.zstd_ddict_ = nullptr;
-#endif
-
-    return *this;
-  }
-
-  // The object is self-contained if the string constructor is used, or the
-  // Slice constructor is invoked with a non-null allocation. Otherwise, it
-  // is the caller's responsibility to ensure that the underlying storage
-  // outlives this object.
-  bool own_bytes() const { return !dict_.empty() || allocation_; }
-
-  const Slice& GetRawDict() const { return slice_; }
-
-  // For TypedCacheInterface
-  const Slice& ContentSlice() const { return slice_; }
-  static constexpr CacheEntryRole kCacheEntryRole = CacheEntryRole::kOtherBlock;
-  static constexpr BlockType kBlockType = BlockType::kCompressionDictionary;
-
-#ifdef ROCKSDB_ZSTD_DDICT
-  const ZSTD_DDict* GetDigestedZstdDDict() const { return zstd_ddict_; }
-#endif  // ROCKSDB_ZSTD_DDICT
-
-  static const UncompressionDict& GetEmptyDict() {
-    static UncompressionDict empty_dict{};
-    return empty_dict;
-  }
-
-  size_t ApproximateMemoryUsage() const {
-    size_t usage = sizeof(struct UncompressionDict);
-    usage += dict_.size();
-    if (allocation_) {
-      auto allocator = allocation_.get_deleter().allocator;
-      if (allocator) {
-        usage += allocator->UsableSize(allocation_.get(), slice_.size());
-      } else {
-        usage += slice_.size();
-      }
-    }
-#ifdef ROCKSDB_ZSTD_DDICT
-    usage += ZSTD_sizeof_DDict(zstd_ddict_);
-#endif  // ROCKSDB_ZSTD_DDICT
-    return usage;
-  }
-
-  UncompressionDict() = default;
-  // Disable copy
-  UncompressionDict(const CompressionDict&) = delete;
-  UncompressionDict& operator=(const CompressionDict&) = delete;
-};
-
 class CompressionContext : public Compressor::WorkingArea {
  private:
 #ifdef ZSTD
@@ -563,25 +360,6 @@ class CompressionContext : public Compressor::WorkingArea {
   CompressionContext& operator=(const CompressionContext&) = delete;
 };
 
-// TODO: rename
-class CompressionInfo {
-  const CompressionOptions& opts_;
-  const CompressionContext& context_;
-  const CompressionDict& dict_;
-  const CompressionType type_;
-
- public:
-  CompressionInfo(const CompressionOptions& _opts,
-                  const CompressionContext& _context,
-                  const CompressionDict& _dict, CompressionType _type)
-      : opts_(_opts), context_(_context), dict_(_dict), type_(_type) {}
-
-  const CompressionOptions& options() const { return opts_; }
-  const CompressionContext& context() const { return context_; }
-  const CompressionDict& dict() const { return dict_; }
-  CompressionType type() const { return type_; }
-};
-
 // This is like a working area, reusable for different dicts, etc.
 // TODO: refactor / consolidate
 class UncompressionContext : public Decompressor::WorkingArea {
@@ -611,21 +389,6 @@ class UncompressionContext : public Decompressor::WorkingArea {
   }
 };
 
-class UncompressionInfo {
-  const UncompressionContext& context_;
-  const UncompressionDict& dict_;
-  const CompressionType type_;
-
- public:
-  UncompressionInfo(const UncompressionContext& _context,
-                    const UncompressionDict& _dict, CompressionType _type)
-      : context_(_context), dict_(_dict), type_(_type) {}
-
-  const UncompressionContext& context() const { return context_; }
-  const UncompressionDict& dict() const { return dict_; }
-  CompressionType type() const { return type_; }
-};
-
 inline bool Snappy_Supported() {
 #ifdef SNAPPY
   return true;
@@ -748,898 +511,13 @@ inline bool DictCompressionTypeSupported(CompressionType compression_type) {
 }
 
 // WART: does not match OptionsHelper::compression_type_string_map
-inline std::string CompressionTypeToString(CompressionType compression_type) {
-  switch (compression_type) {
-    case kNoCompression:
-      return "NoCompression";
-    case kSnappyCompression:
-      return "Snappy";
-    case kZlibCompression:
-      return "Zlib";
-    case kBZip2Compression:
-      return "BZip2";
-    case kLZ4Compression:
-      return "LZ4";
-    case kLZ4HCCompression:
-      return "LZ4HC";
-    case kXpressCompression:
-      return "Xpress";
-    case kZSTD:
-      return "ZSTD";
-    case kDisableCompressionOption:
-      return "DisableOption";
-    default: {
-      bool is_custom = compression_type >= kFirstCustomCompression &&
-                       compression_type <= kLastCustomCompression;
-      unsigned char c = lossless_cast<unsigned char>(compression_type);
-      return (is_custom ? "Custom" : "Reserved") +
-             ToBaseCharsString<16>(2, c, /*uppercase=*/true);
-    }
-  }
-}
+std::string CompressionTypeToString(CompressionType compression_type);
 
 // WART: does not match OptionsHelper::compression_type_string_map
-inline CompressionType CompressionTypeFromString(
-    std::string compression_type_str) {
-  if (!compression_type_str.empty()) {
-    switch (compression_type_str[0]) {
-      case 'N':
-        if (compression_type_str == "NoCompression") {
-          return kNoCompression;
-        }
-        break;
-      case 'S':
-        if (compression_type_str == "Snappy") {
-          return kSnappyCompression;
-        }
-        break;
-      case 'Z':
-        if (compression_type_str == "ZSTD") {
-          return kZSTD;
-        }
-        if (compression_type_str == "Zlib") {
-          return kZlibCompression;
-        }
-        break;
-      case 'B':
-        if (compression_type_str == "BZip2") {
-          return kBZip2Compression;
-        }
-        break;
-      case 'L':
-        if (compression_type_str == "LZ4") {
-          return kLZ4Compression;
-        }
-        if (compression_type_str == "LZ4HC") {
-          return kLZ4HCCompression;
-        }
-        break;
-      case 'X':
-        if (compression_type_str == "Xpress") {
-          return kXpressCompression;
-        }
-        break;
-      default:;
-    }
-  }
-  // unrecognized
-  return kDisableCompressionOption;
-}
-
-inline std::string CompressionOptionsToString(
-    const CompressionOptions& compression_options) {
-  std::string result;
-  result.reserve(512);
-  result.append("window_bits=")
-      .append(std::to_string(compression_options.window_bits))
-      .append("; ");
-  result.append("level=")
-      .append(std::to_string(compression_options.level))
-      .append("; ");
-  result.append("strategy=")
-      .append(std::to_string(compression_options.strategy))
-      .append("; ");
-  result.append("max_dict_bytes=")
-      .append(std::to_string(compression_options.max_dict_bytes))
-      .append("; ");
-  result.append("zstd_max_train_bytes=")
-      .append(std::to_string(compression_options.zstd_max_train_bytes))
-      .append("; ");
-  // NOTE: parallel_threads is skipped because it doesn't really affect the file
-  // contents written, arguably doesn't belong in CompressionOptions
-  result.append("enabled=")
-      .append(std::to_string(compression_options.enabled))
-      .append("; ");
-  result.append("max_dict_buffer_bytes=")
-      .append(std::to_string(compression_options.max_dict_buffer_bytes))
-      .append("; ");
-  result.append("use_zstd_dict_trainer=")
-      .append(std::to_string(compression_options.use_zstd_dict_trainer))
-      .append("; ");
-  result.append("max_compressed_bytes_per_kb=")
-      .append(std::to_string(compression_options.max_compressed_bytes_per_kb))
-      .append("; ");
-  result.append("checksum=")
-      .append(std::to_string(compression_options.checksum))
-      .append("; ");
-  return result;
-}
-
-// compress_format_version can have two values:
-// 1 -- decompressed sizes for BZip2 and Zlib are not included in the compressed
-// block. Also, decompressed sizes for LZ4 are encoded in platform-dependent
-// way.
-// 2 -- Zlib, BZip2 and LZ4 encode decompressed size as Varint32 just before the
-// start of compressed block. Snappy and XPRESS instead extract the decompressed
-// size from the compressed block itself, same as version 1.
-
-inline bool Snappy_Compress(const CompressionInfo& /*info*/, const char* input,
-                            size_t length, ::std::string* output) {
-#ifdef SNAPPY
-  output->resize(snappy::MaxCompressedLength(length));
-  size_t outlen;
-  snappy::RawCompress(input, length, &(*output)[0], &outlen);
-  output->resize(outlen);
-  return true;
-#else
-  (void)input;
-  (void)length;
-  (void)output;
-  return false;
-#endif
-}
-
-inline CacheAllocationPtr Snappy_Uncompress(
-    const char* input, size_t length, size_t* uncompressed_size,
-    MemoryAllocator* allocator = nullptr) {
-#ifdef SNAPPY
-  size_t uncompressed_length = 0;
-  if (!snappy::GetUncompressedLength(input, length, &uncompressed_length)) {
-    return nullptr;
-  }
-
-  CacheAllocationPtr output = AllocateBlock(uncompressed_length, allocator);
-
-  if (!snappy::RawUncompress(input, length, output.get())) {
-    return nullptr;
-  }
-
-  *uncompressed_size = uncompressed_length;
-
-  return output;
-#else
-  (void)input;
-  (void)length;
-  (void)uncompressed_size;
-  (void)allocator;
-  return nullptr;
-#endif
-}
-
-namespace compression {
-// returns size
-inline size_t PutDecompressedSizeInfo(std::string* output, uint32_t length) {
-  PutVarint32(output, length);
-  return output->size();
-}
-
-inline bool GetDecompressedSizeInfo(const char** input_data,
-                                    size_t* input_length,
-                                    uint32_t* output_len) {
-  auto new_input_data =
-      GetVarint32Ptr(*input_data, *input_data + *input_length, output_len);
-  if (new_input_data == nullptr) {
-    return false;
-  }
-  *input_length -= (new_input_data - *input_data);
-  *input_data = new_input_data;
-  return true;
-}
-}  // namespace compression
-
-// compress_format_version == 1 -- decompressed size is not included in the
-// block header
-// compress_format_version == 2 -- decompressed size is included in the block
-// header in varint32 format
-// @param compression_dict Data for presetting the compression library's
-//    dictionary.
-inline bool Zlib_Compress(const CompressionInfo& info,
-                          uint32_t compress_format_version, const char* input,
-                          size_t length, ::std::string* output) {
-#ifdef ZLIB
-  if (length > std::numeric_limits<uint32_t>::max()) {
-    // Can't compress more than 4GB
-    return false;
-  }
-
-  size_t output_header_len = 0;
-  if (compress_format_version == 2) {
-    output_header_len = compression::PutDecompressedSizeInfo(
-        output, static_cast<uint32_t>(length));
-  }
-
-  // The memLevel parameter specifies how much memory should be allocated for
-  // the internal compression state.
-  // memLevel=1 uses minimum memory but is slow and reduces compression ratio.
-  // memLevel=9 uses maximum memory for optimal speed.
-  // The default value is 8. See zconf.h for more details.
-  static const int memLevel = 8;
-  int level;
-  if (info.options().level == CompressionOptions::kDefaultCompressionLevel) {
-    level = Z_DEFAULT_COMPRESSION;
-  } else {
-    level = info.options().level;
-  }
-  z_stream _stream;
-  memset(&_stream, 0, sizeof(z_stream));
-  int st = deflateInit2(&_stream, level, Z_DEFLATED, info.options().window_bits,
-                        memLevel, info.options().strategy);
-  if (st != Z_OK) {
-    return false;
-  }
-
-  Slice compression_dict = info.dict().GetRawDict();
-  if (compression_dict.size()) {
-    // Initialize the compression library's dictionary
-    st = deflateSetDictionary(
-        &_stream, reinterpret_cast<const Bytef*>(compression_dict.data()),
-        static_cast<unsigned int>(compression_dict.size()));
-    if (st != Z_OK) {
-      deflateEnd(&_stream);
-      return false;
-    }
-  }
-
-  // Get an upper bound on the compressed size.
-  size_t upper_bound =
-      deflateBound(&_stream, static_cast<unsigned long>(length));
-  output->resize(output_header_len + upper_bound);
-
-  // Compress the input, and put compressed data in output.
-  _stream.next_in = (Bytef*)input;
-  _stream.avail_in = static_cast<unsigned int>(length);
-
-  // Initialize the output size.
-  _stream.avail_out = static_cast<unsigned int>(upper_bound);
-  _stream.next_out = reinterpret_cast<Bytef*>(&(*output)[output_header_len]);
-
-  bool compressed = false;
-  st = deflate(&_stream, Z_FINISH);
-  if (st == Z_STREAM_END) {
-    compressed = true;
-    output->resize(output->size() - _stream.avail_out);
-  }
-  // The only return value we really care about is Z_STREAM_END.
-  // Z_OK means insufficient output space. This means the compression is
-  // bigger than decompressed size. Just fail the compression in that case.
-
-  deflateEnd(&_stream);
-  return compressed;
-#else
-  (void)info;
-  (void)compress_format_version;
-  (void)input;
-  (void)length;
-  (void)output;
-  return false;
-#endif
-}
-
-// compress_format_version == 1 -- decompressed size is not included in the
-// block header
-// compress_format_version == 2 -- decompressed size is included in the block
-// header in varint32 format
-// @param compression_dict Data for presetting the compression library's
-//    dictionary.
-inline CacheAllocationPtr Zlib_Uncompress(
-    const UncompressionInfo& info, const char* input_data, size_t input_length,
-    size_t* uncompressed_size, uint32_t compress_format_version,
-    MemoryAllocator* allocator = nullptr, int windowBits = -14) {
-#ifdef ZLIB
-  uint32_t output_len = 0;
-  if (compress_format_version == 2) {
-    if (!compression::GetDecompressedSizeInfo(&input_data, &input_length,
-                                              &output_len)) {
-      return nullptr;
-    }
-  } else {
-    // Assume the decompressed data size will 5x of compressed size, but round
-    // to the page size
-    size_t proposed_output_len = ((input_length * 5) & (~(4096 - 1))) + 4096;
-    output_len = static_cast<uint32_t>(
-        std::min(proposed_output_len,
-                 static_cast<size_t>(std::numeric_limits<uint32_t>::max())));
-  }
-
-  z_stream _stream;
-  memset(&_stream, 0, sizeof(z_stream));
-
-  // For raw inflate, the windowBits should be -8..-15.
-  // If windowBits is bigger than zero, it will use either zlib
-  // header or gzip header. Adding 32 to it will do automatic detection.
-  int st =
-      inflateInit2(&_stream, windowBits > 0 ? windowBits + 32 : windowBits);
-  if (st != Z_OK) {
-    return nullptr;
-  }
+CompressionType CompressionTypeFromString(std::string compression_type_str);
 
-  const Slice& compression_dict = info.dict().GetRawDict();
-  if (compression_dict.size()) {
-    // Initialize the compression library's dictionary
-    st = inflateSetDictionary(
-        &_stream, reinterpret_cast<const Bytef*>(compression_dict.data()),
-        static_cast<unsigned int>(compression_dict.size()));
-    if (st != Z_OK) {
-      return nullptr;
-    }
-  }
-
-  _stream.next_in = (Bytef*)input_data;
-  _stream.avail_in = static_cast<unsigned int>(input_length);
-
-  auto output = AllocateBlock(output_len, allocator);
-
-  _stream.next_out = (Bytef*)output.get();
-  _stream.avail_out = static_cast<unsigned int>(output_len);
-
-  bool done = false;
-  while (!done) {
-    st = inflate(&_stream, Z_SYNC_FLUSH);
-    switch (st) {
-      case Z_STREAM_END:
-        done = true;
-        break;
-      case Z_OK: {
-        // No output space. Increase the output space by 20%.
-        // We should never run out of output space if
-        // compress_format_version == 2
-        assert(compress_format_version != 2);
-        size_t old_sz = output_len;
-        uint32_t output_len_delta = output_len / 5;
-        output_len += output_len_delta < 10 ? 10 : output_len_delta;
-        auto tmp = AllocateBlock(output_len, allocator);
-        memcpy(tmp.get(), output.get(), old_sz);
-        output = std::move(tmp);
-
-        // Set more output.
-        _stream.next_out = (Bytef*)(output.get() + old_sz);
-        _stream.avail_out = static_cast<unsigned int>(output_len - old_sz);
-        break;
-      }
-      case Z_BUF_ERROR:
-      default:
-        inflateEnd(&_stream);
-        return nullptr;
-    }
-  }
-
-  // If we encoded decompressed block size, we should have no bytes left
-  assert(compress_format_version != 2 || _stream.avail_out == 0);
-  assert(output_len >= _stream.avail_out);
-  *uncompressed_size = output_len - _stream.avail_out;
-  inflateEnd(&_stream);
-  return output;
-#else
-  (void)info;
-  (void)input_data;
-  (void)input_length;
-  (void)uncompressed_size;
-  (void)compress_format_version;
-  (void)allocator;
-  (void)windowBits;
-  return nullptr;
-#endif
-}
-
-// compress_format_version == 1 -- decompressed size is not included in the
-// block header
-// compress_format_version == 2 -- decompressed size is included in the block
-// header in varint32 format
-inline bool BZip2_Compress(const CompressionInfo& /*info*/,
-                           uint32_t compress_format_version, const char* input,
-                           size_t length, ::std::string* output) {
-#ifdef BZIP2
-  if (length > std::numeric_limits<uint32_t>::max()) {
-    // Can't compress more than 4GB
-    return false;
-  }
-  size_t output_header_len = 0;
-  if (compress_format_version == 2) {
-    output_header_len = compression::PutDecompressedSizeInfo(
-        output, static_cast<uint32_t>(length));
-  }
-  // Resize output to be the plain data length.
-  // This may not be big enough if the compression actually expands data.
-  output->resize(output_header_len + length);
-
-  bz_stream _stream;
-  memset(&_stream, 0, sizeof(bz_stream));
-
-  // Block size 1 is 100K.
-  // 0 is for silent.
-  // 30 is the default workFactor
-  int st = BZ2_bzCompressInit(&_stream, 1, 0, 30);
-  if (st != BZ_OK) {
-    return false;
-  }
-
-  // Compress the input, and put compressed data in output.
-  _stream.next_in = (char*)input;
-  _stream.avail_in = static_cast<unsigned int>(length);
-
-  // Initialize the output size.
-  _stream.avail_out = static_cast<unsigned int>(length);
-  _stream.next_out = output->data() + output_header_len;
-
-  bool compressed = false;
-  st = BZ2_bzCompress(&_stream, BZ_FINISH);
-  if (st == BZ_STREAM_END) {
-    compressed = true;
-    output->resize(output->size() - _stream.avail_out);
-  }
-  // The only return value we really care about is BZ_STREAM_END.
-  // BZ_FINISH_OK means insufficient output space. This means the compression
-  // is bigger than decompressed size. Just fail the compression in that case.
-
-  BZ2_bzCompressEnd(&_stream);
-  return compressed;
-#else
-  (void)compress_format_version;
-  (void)input;
-  (void)length;
-  (void)output;
-  return false;
-#endif
-}
-
-// compress_format_version == 1 -- decompressed size is not included in the
-// block header
-// compress_format_version == 2 -- decompressed size is included in the block
-// header in varint32 format
-inline CacheAllocationPtr BZip2_Uncompress(
-    const char* input_data, size_t input_length, size_t* uncompressed_size,
-    uint32_t compress_format_version, MemoryAllocator* allocator = nullptr) {
-#ifdef BZIP2
-  uint32_t output_len = 0;
-  if (compress_format_version == 2) {
-    if (!compression::GetDecompressedSizeInfo(&input_data, &input_length,
-                                              &output_len)) {
-      return nullptr;
-    }
-  } else {
-    // Assume the decompressed data size will 5x of compressed size, but round
-    // to the next page size
-    size_t proposed_output_len = ((input_length * 5) & (~(4096 - 1))) + 4096;
-    output_len = static_cast<uint32_t>(
-        std::min(proposed_output_len,
-                 static_cast<size_t>(std::numeric_limits<uint32_t>::max())));
-  }
-
-  bz_stream _stream;
-  memset(&_stream, 0, sizeof(bz_stream));
-
-  int st = BZ2_bzDecompressInit(&_stream, 0, 0);
-  if (st != BZ_OK) {
-    return nullptr;
-  }
-
-  _stream.next_in = (char*)input_data;
-  _stream.avail_in = static_cast<unsigned int>(input_length);
-
-  auto output = AllocateBlock(output_len, allocator);
-
-  _stream.next_out = (char*)output.get();
-  _stream.avail_out = static_cast<unsigned int>(output_len);
-
-  bool done = false;
-  while (!done) {
-    st = BZ2_bzDecompress(&_stream);
-    switch (st) {
-      case BZ_STREAM_END:
-        done = true;
-        break;
-      case BZ_OK: {
-        // No output space. Increase the output space by 20%.
-        // We should never run out of output space if
-        // compress_format_version == 2
-        assert(compress_format_version != 2);
-        uint32_t old_sz = output_len;
-        output_len = output_len * 1.2;
-        auto tmp = AllocateBlock(output_len, allocator);
-        memcpy(tmp.get(), output.get(), old_sz);
-        output = std::move(tmp);
-
-        // Set more output.
-        _stream.next_out = (char*)(output.get() + old_sz);
-        _stream.avail_out = static_cast<unsigned int>(output_len - old_sz);
-        break;
-      }
-      default:
-        BZ2_bzDecompressEnd(&_stream);
-        return nullptr;
-    }
-  }
-
-  // If we encoded decompressed block size, we should have no bytes left
-  assert(compress_format_version != 2 || _stream.avail_out == 0);
-  assert(output_len >= _stream.avail_out);
-  *uncompressed_size = output_len - _stream.avail_out;
-  BZ2_bzDecompressEnd(&_stream);
-  return output;
-#else
-  (void)input_data;
-  (void)input_length;
-  (void)uncompressed_size;
-  (void)compress_format_version;
-  (void)allocator;
-  return nullptr;
-#endif
-}
-
-// compress_format_version == 1 -- decompressed size is included in the
-// block header using memcpy, which makes database non-portable)
-// compress_format_version == 2 -- decompressed size is included in the block
-// header in varint32 format
-// @param compression_dict Data for presetting the compression library's
-//    dictionary.
-inline bool LZ4_Compress(const CompressionInfo& info,
-                         uint32_t compress_format_version, const char* input,
-                         size_t length, ::std::string* output) {
-#ifdef LZ4
-  if (length > std::numeric_limits<uint32_t>::max()) {
-    // Can't compress more than 4GB
-    return false;
-  }
-
-  size_t output_header_len = 0;
-  if (compress_format_version == 2) {
-    // new encoding, using varint32 to store size information
-    output_header_len = compression::PutDecompressedSizeInfo(
-        output, static_cast<uint32_t>(length));
-  } else {
-    // legacy encoding, which is not really portable (depends on big/little
-    // endianness)
-    output_header_len = 8;
-    output->resize(output_header_len);
-    char* p = const_cast<char*>(output->c_str());
-    memcpy(p, &length, sizeof(length));
-  }
-  int compress_bound = LZ4_compressBound(static_cast<int>(length));
-  output->resize(static_cast<size_t>(output_header_len + compress_bound));
-
-  int outlen;
-#if LZ4_VERSION_NUMBER >= 10400  // r124+
-  LZ4_stream_t* stream = LZ4_createStream();
-  Slice compression_dict = info.dict().GetRawDict();
-  if (compression_dict.size()) {
-    LZ4_loadDict(stream, compression_dict.data(),
-                 static_cast<int>(compression_dict.size()));
-  }
-#if LZ4_VERSION_NUMBER >= 10700  // r129+
-  int acceleration;
-  if (info.options().level < 0) {
-    acceleration = -info.options().level;
-  } else {
-    acceleration = 1;
-  }
-  outlen = LZ4_compress_fast_continue(
-      stream, input, &(*output)[output_header_len], static_cast<int>(length),
-      compress_bound, acceleration);
-#else  // up to r128
-  outlen = LZ4_compress_limitedOutput_continue(
-      stream, input, &(*output)[output_header_len], static_cast<int>(length),
-      compress_bound);
-#endif
-  LZ4_freeStream(stream);
-#else   // up to r123
-  outlen = LZ4_compress_limitedOutput(input, &(*output)[output_header_len],
-                                      static_cast<int>(length), compress_bound);
-#endif  // LZ4_VERSION_NUMBER >= 10400
-
-  if (outlen == 0) {
-    return false;
-  }
-  output->resize(static_cast<size_t>(output_header_len + outlen));
-  return true;
-#else  // LZ4
-  (void)info;
-  (void)compress_format_version;
-  (void)input;
-  (void)length;
-  (void)output;
-  return false;
-#endif
-}
-
-// compress_format_version == 1 -- decompressed size is included in the
-// block header using memcpy, which makes database non-portable)
-// compress_format_version == 2 -- decompressed size is included in the block
-// header in varint32 format
-// @param compression_dict Data for presetting the compression library's
-//    dictionary.
-inline CacheAllocationPtr LZ4_Uncompress(const UncompressionInfo& info,
-                                         const char* input_data,
-                                         size_t input_length,
-                                         size_t* uncompressed_size,
-                                         uint32_t compress_format_version,
-                                         MemoryAllocator* allocator = nullptr) {
-#ifdef LZ4
-  uint32_t output_len = 0;
-  if (compress_format_version == 2) {
-    // new encoding, using varint32 to store size information
-    if (!compression::GetDecompressedSizeInfo(&input_data, &input_length,
-                                              &output_len)) {
-      return nullptr;
-    }
-  } else {
-    // legacy encoding, which is not really portable (depends on big/little
-    // endianness)
-    if (input_length < 8) {
-      return nullptr;
-    }
-    if (port::kLittleEndian) {
-      memcpy(&output_len, input_data, sizeof(output_len));
-    } else {
-      memcpy(&output_len, input_data + 4, sizeof(output_len));
-    }
-    input_length -= 8;
-    input_data += 8;
-  }
-
-  auto output = AllocateBlock(output_len, allocator);
-
-  int decompress_bytes = 0;
-
-#if LZ4_VERSION_NUMBER >= 10400  // r124+
-  LZ4_streamDecode_t* stream = LZ4_createStreamDecode();
-  const Slice& compression_dict = info.dict().GetRawDict();
-  if (compression_dict.size()) {
-    LZ4_setStreamDecode(stream, compression_dict.data(),
-                        static_cast<int>(compression_dict.size()));
-  }
-  decompress_bytes = LZ4_decompress_safe_continue(
-      stream, input_data, output.get(), static_cast<int>(input_length),
-      static_cast<int>(output_len));
-  LZ4_freeStreamDecode(stream);
-#else   // up to r123
-  decompress_bytes = LZ4_decompress_safe(input_data, output.get(),
-                                         static_cast<int>(input_length),
-                                         static_cast<int>(output_len));
-#endif  // LZ4_VERSION_NUMBER >= 10400
-
-  if (decompress_bytes < 0) {
-    return nullptr;
-  }
-  assert(decompress_bytes == static_cast<int>(output_len));
-  *uncompressed_size = decompress_bytes;
-  return output;
-#else  // LZ4
-  (void)info;
-  (void)input_data;
-  (void)input_length;
-  (void)uncompressed_size;
-  (void)compress_format_version;
-  (void)allocator;
-  return nullptr;
-#endif
-}
-
-// compress_format_version == 1 -- decompressed size is included in the
-// block header using memcpy, which makes database non-portable)
-// compress_format_version == 2 -- decompressed size is included in the block
-// header in varint32 format
-// @param compression_dict Data for presetting the compression library's
-//    dictionary.
-inline bool LZ4HC_Compress(const CompressionInfo& info,
-                           uint32_t compress_format_version, const char* input,
-                           size_t length, ::std::string* output) {
-#ifdef LZ4
-  if (length > std::numeric_limits<uint32_t>::max()) {
-    // Can't compress more than 4GB
-    return false;
-  }
-
-  size_t output_header_len = 0;
-  if (compress_format_version == 2) {
-    // new encoding, using varint32 to store size information
-    output_header_len = compression::PutDecompressedSizeInfo(
-        output, static_cast<uint32_t>(length));
-  } else {
-    // legacy encoding, which is not really portable (depends on big/little
-    // endianness)
-    output_header_len = 8;
-    output->resize(output_header_len);
-    char* p = const_cast<char*>(output->c_str());
-    memcpy(p, &length, sizeof(length));
-  }
-  int compress_bound = LZ4_compressBound(static_cast<int>(length));
-  output->resize(static_cast<size_t>(output_header_len + compress_bound));
-
-  int outlen;
-  int level;
-  if (info.options().level == CompressionOptions::kDefaultCompressionLevel) {
-    level = 0;  // lz4hc.h says any value < 1 will be sanitized to default
-  } else {
-    level = info.options().level;
-  }
-#if LZ4_VERSION_NUMBER >= 10400  // r124+
-  LZ4_streamHC_t* stream = LZ4_createStreamHC();
-  LZ4_resetStreamHC(stream, level);
-  Slice compression_dict = info.dict().GetRawDict();
-  const char* compression_dict_data =
-      compression_dict.size() > 0 ? compression_dict.data() : nullptr;
-  size_t compression_dict_size = compression_dict.size();
-  if (compression_dict_data != nullptr) {
-    LZ4_loadDictHC(stream, compression_dict_data,
-                   static_cast<int>(compression_dict_size));
-  }
-
-#if LZ4_VERSION_NUMBER >= 10700  // r129+
-  outlen =
-      LZ4_compress_HC_continue(stream, input, &(*output)[output_header_len],
-                               static_cast<int>(length), compress_bound);
-#else   // r124-r128
-  outlen = LZ4_compressHC_limitedOutput_continue(
-      stream, input, &(*output)[output_header_len], static_cast<int>(length),
-      compress_bound);
-#endif  // LZ4_VERSION_NUMBER >= 10700
-  LZ4_freeStreamHC(stream);
-
-#elif LZ4_VERSION_MAJOR  // r113-r123
-  outlen = LZ4_compressHC2_limitedOutput(input, &(*output)[output_header_len],
-                                         static_cast<int>(length),
-                                         compress_bound, level);
-#else                    // up to r112
-  outlen =
-      LZ4_compressHC_limitedOutput(input, &(*output)[output_header_len],
-                                   static_cast<int>(length), compress_bound);
-#endif                   // LZ4_VERSION_NUMBER >= 10400
-
-  if (outlen == 0) {
-    return false;
-  }
-  output->resize(static_cast<size_t>(output_header_len + outlen));
-  return true;
-#else  // LZ4
-  (void)info;
-  (void)compress_format_version;
-  (void)input;
-  (void)length;
-  (void)output;
-  return false;
-#endif
-}
-
-#ifdef XPRESS
-inline bool XPRESS_Compress(const char* input, size_t length,
-                            std::string* output) {
-  return port::xpress::Compress(input, length, output);
-}
-#else
-inline bool XPRESS_Compress(const char* /*input*/, size_t /*length*/,
-                            std::string* /*output*/) {
-  return false;
-}
-#endif
-
-#ifdef XPRESS
-inline char* XPRESS_Uncompress(const char* input_data, size_t input_length,
-                               size_t* uncompressed_size) {
-  return port::xpress::Decompress(input_data, input_length, uncompressed_size);
-}
-#else
-inline char* XPRESS_Uncompress(const char* /*input_data*/,
-                               size_t /*input_length*/,
-                               size_t* /*uncompressed_size*/) {
-  return nullptr;
-}
-#endif
-
-inline bool ZSTD_Compress(const CompressionInfo& info, const char* input,
-                          size_t length, ::std::string* output) {
-#ifdef ZSTD
-  if (length > std::numeric_limits<uint32_t>::max()) {
-    // Can't compress more than 4GB
-    return false;
-  }
-
-  size_t output_header_len = compression::PutDecompressedSizeInfo(
-      output, static_cast<uint32_t>(length));
-
-  size_t compressBound = ZSTD_compressBound(length);
-  // TODO: use resize_and_overwrite with c++23
-  output->resize(static_cast<size_t>(output_header_len + compressBound));
-  size_t outlen = 0;
-  ZSTD_CCtx* context = info.context().ZSTDPreallocCtx();
-  assert(context != nullptr);
-  if (info.dict().GetDigestedZstdCDict() != nullptr) {
-    ZSTD_CCtx_refCDict(context, info.dict().GetDigestedZstdCDict());
-  } else {
-    ZSTD_CCtx_loadDictionary(context, info.dict().GetRawDict().data(),
-                             info.dict().GetRawDict().size());
-  }
-
-  // Compression level is set in `contex` during CreateNativeContext()
-  outlen = ZSTD_compress2(context, &(*output)[output_header_len], compressBound,
-                          input, length);
-  if (outlen == 0) {
-    return false;
-  }
-  output->resize(output_header_len + outlen);
-  return true;
-#else  // ZSTD
-  (void)info;
-  (void)input;
-  (void)length;
-  (void)output;
-  return false;
-#endif
-}
-
-// @param compression_dict Data for presetting the compression library's
-//    dictionary.
-// @param error_message If not null, will be set if decompression fails.
-//
-// Returns nullptr if decompression fails.
-inline CacheAllocationPtr ZSTD_Uncompress(
-    const UncompressionInfo& info, const char* input_data, size_t input_length,
-    size_t* uncompressed_size, MemoryAllocator* allocator = nullptr,
-    const char** error_message = nullptr) {
-#ifdef ZSTD
-  static const char* const kErrorDecodeOutputSize =
-      "Cannot decode output size.";
-  static const char* const kErrorOutputLenMismatch =
-      "Decompressed size does not match header.";
-  uint32_t output_len = 0;
-  if (!compression::GetDecompressedSizeInfo(&input_data, &input_length,
-                                            &output_len)) {
-    if (error_message) {
-      *error_message = kErrorDecodeOutputSize;
-    }
-    return nullptr;
-  }
-
-  CacheAllocationPtr output = AllocateBlock(output_len, allocator);
-  size_t actual_output_length = 0;
-  ZSTD_DCtx* context = info.context().GetZSTDContext();
-  assert(context != nullptr);
-#ifdef ROCKSDB_ZSTD_DDICT
-  if (info.dict().GetDigestedZstdDDict() != nullptr) {
-    actual_output_length = ZSTD_decompress_usingDDict(
-        context, output.get(), output_len, input_data, input_length,
-        info.dict().GetDigestedZstdDDict());
-  } else {
-#endif  // ROCKSDB_ZSTD_DDICT
-    actual_output_length = ZSTD_decompress_usingDict(
-        context, output.get(), output_len, input_data, input_length,
-        info.dict().GetRawDict().data(), info.dict().GetRawDict().size());
-#ifdef ROCKSDB_ZSTD_DDICT
-  }
-#endif  // ROCKSDB_ZSTD_DDICT
-  if (ZSTD_isError(actual_output_length)) {
-    if (error_message) {
-      *error_message = ZSTD_getErrorName(actual_output_length);
-    }
-    return nullptr;
-  } else if (actual_output_length != output_len) {
-    if (error_message) {
-      *error_message = kErrorOutputLenMismatch;
-    }
-    return nullptr;
-  }
-
-  *uncompressed_size = actual_output_length;
-  return output;
-#else  // ZSTD
-  (void)info;
-  (void)input_data;
-  (void)input_length;
-  (void)uncompressed_size;
-  (void)allocator;
-  (void)error_message;
-  return nullptr;
-#endif
-}
+std::string CompressionOptionsToString(
+    const CompressionOptions& compression_options);
 
 inline bool ZSTD_TrainDictionarySupported() {
 #ifdef ZSTD
@@ -1652,50 +530,6 @@ inline bool ZSTD_TrainDictionarySupported() {
 #endif
 }
 
-inline std::string ZSTD_TrainDictionary(const std::string& samples,
-                                        const std::vector<size_t>& sample_lens,
-                                        size_t max_dict_bytes) {
-#ifdef ZSTD
-  assert(samples.empty() == sample_lens.empty());
-  if (samples.empty()) {
-    return "";
-  }
-  std::string dict_data(max_dict_bytes, '\0');
-  size_t dict_len = ZDICT_trainFromBuffer(
-      &dict_data[0], max_dict_bytes, &samples[0], &sample_lens[0],
-      static_cast<unsigned>(sample_lens.size()));
-  if (ZDICT_isError(dict_len)) {
-    return "";
-  }
-  assert(dict_len <= max_dict_bytes);
-  dict_data.resize(dict_len);
-  return dict_data;
-#else
-  assert(false);
-  (void)samples;
-  (void)sample_lens;
-  (void)max_dict_bytes;
-  return "";
-#endif  // ZSTD
-}
-
-inline std::string ZSTD_TrainDictionary(const std::string& samples,
-                                        size_t sample_len_shift,
-                                        size_t max_dict_bytes) {
-#ifdef ZSTD
-  // skips potential partial sample at the end of "samples"
-  size_t num_samples = samples.size() >> sample_len_shift;
-  std::vector<size_t> sample_lens(num_samples, size_t(1) << sample_len_shift);
-  return ZSTD_TrainDictionary(samples, sample_lens, max_dict_bytes);
-#else
-  assert(false);
-  (void)samples;
-  (void)sample_len_shift;
-  (void)max_dict_bytes;
-  return "";
-#endif  // ZSTD
-}
-
 inline bool ZSTD_FinalizeDictionarySupported() {
 #ifdef ROCKSDB_ZDICT_FINALIZE
   return true;
@@ -1704,120 +538,6 @@ inline bool ZSTD_FinalizeDictionarySupported() {
 #endif
 }
 
-inline std::string ZSTD_FinalizeDictionary(
-    const std::string& samples, const std::vector<size_t>& sample_lens,
-    size_t max_dict_bytes, int level) {
-#ifdef ROCKSDB_ZDICT_FINALIZE
-  assert(samples.empty() == sample_lens.empty());
-  if (samples.empty()) {
-    return "";
-  }
-  if (level == CompressionOptions::kDefaultCompressionLevel) {
-    // NB: ZSTD_CLEVEL_DEFAULT is historically == 3
-    level = ZSTD_CLEVEL_DEFAULT;
-  }
-  std::string dict_data(max_dict_bytes, '\0');
-  size_t dict_len = ZDICT_finalizeDictionary(
-      dict_data.data(), max_dict_bytes, samples.data(),
-      std::min(static_cast<size_t>(samples.size()), max_dict_bytes),
-      samples.data(), sample_lens.data(),
-      static_cast<unsigned>(sample_lens.size()),
-      {level, 0 /* notificationLevel */, 0 /* dictID */});
-  if (ZDICT_isError(dict_len)) {
-    return "";
-  } else {
-    assert(dict_len <= max_dict_bytes);
-    dict_data.resize(dict_len);
-    return dict_data;
-  }
-#else
-  assert(false);
-  (void)samples;
-  (void)sample_lens;
-  (void)max_dict_bytes;
-  (void)level;
-  return "";
-#endif  // ROCKSDB_ZDICT_FINALIZE
-}
-
-inline bool OLD_CompressData(const Slice& raw,
-                             const CompressionInfo& compression_info,
-                             uint32_t compress_format_version,
-                             std::string* compressed_output) {
-  bool ret = false;
-
-  // Will return compressed block contents if (1) the compression method is
-  // supported in this platform and (2) the compression rate is "good enough".
-  switch (compression_info.type()) {
-    case kSnappyCompression:
-      ret = Snappy_Compress(compression_info, raw.data(), raw.size(),
-                            compressed_output);
-      break;
-    case kZlibCompression:
-      ret = Zlib_Compress(compression_info, compress_format_version, raw.data(),
-                          raw.size(), compressed_output);
-      break;
-    case kBZip2Compression:
-      ret = BZip2_Compress(compression_info, compress_format_version,
-                           raw.data(), raw.size(), compressed_output);
-      break;
-    case kLZ4Compression:
-      ret = LZ4_Compress(compression_info, compress_format_version, raw.data(),
-                         raw.size(), compressed_output);
-      break;
-    case kLZ4HCCompression:
-      ret = LZ4HC_Compress(compression_info, compress_format_version,
-                           raw.data(), raw.size(), compressed_output);
-      break;
-    case kXpressCompression:
-      ret = XPRESS_Compress(raw.data(), raw.size(), compressed_output);
-      break;
-    case kZSTD:
-      ret = ZSTD_Compress(compression_info, raw.data(), raw.size(),
-                          compressed_output);
-      break;
-    default:
-      // Do not recognize this compression type
-      break;
-  }
-
-  TEST_SYNC_POINT_CALLBACK("CompressData:TamperWithReturnValue",
-                           static_cast<void*>(&ret));
-
-  return ret;
-}
-
-inline CacheAllocationPtr OLD_UncompressData(
-    const UncompressionInfo& uncompression_info, const char* data, size_t n,
-    size_t* uncompressed_size, uint32_t compress_format_version,
-    MemoryAllocator* allocator = nullptr,
-    const char** error_message = nullptr) {
-  switch (uncompression_info.type()) {
-    case kSnappyCompression:
-      return Snappy_Uncompress(data, n, uncompressed_size, allocator);
-    case kZlibCompression:
-      return Zlib_Uncompress(uncompression_info, data, n, uncompressed_size,
-                             compress_format_version, allocator);
-    case kBZip2Compression:
-      return BZip2_Uncompress(data, n, uncompressed_size,
-                              compress_format_version, allocator);
-    case kLZ4Compression:
-    case kLZ4HCCompression:
-      return LZ4_Uncompress(uncompression_info, data, n, uncompressed_size,
-                            compress_format_version, allocator);
-    case kXpressCompression:
-      // XPRESS allocates memory internally, thus no support for custom
-      // allocator.
-      return CacheAllocationPtr(XPRESS_Uncompress(data, n, uncompressed_size));
-    case kZSTD:
-      // TODO(cbi): error message handling for other compression algorithms.
-      return ZSTD_Uncompress(uncompression_info, data, n, uncompressed_size,
-                             allocator, error_message);
-    default:
-      return CacheAllocationPtr();
-  }
-}
-
 // The new compression APIs intentionally make it difficult to generate
 // compressed data larger than the original. (It is better to store the
 // uncompressed version in that case.) For legacy cases that must store

From 871f79d6ef2ab88badfea53bba15778954bcb4e2 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Fri, 13 Feb 2026 11:56:22 -0800
Subject: [PATCH 474/500] Reformat source files (#14331)

Summary:
probably something changed, maybe https://github.com/facebook/rocksdb/issues/14311

Full command:
```
git ls-files '*.cc' '*.h' | grep -v '^third-party/' | grep -v 'range_tree' | xargs clang-format -i
```

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14331

Test Plan: CI

Reviewed By: mszeszko-meta

Differential Revision: D93246992

Pulled By: pdillinger

fbshipit-source-id: 6bc5b97978fef8aee52823dadb6daa4bea57343d
---
 cache/cache_entry_stats.h                     |  18 +-
 cache/cache_key.cc                            |   6 +-
 cache/cache_key.h                             |  10 +-
 cache/cache_reservation_manager.h             |  32 +--
 db/db_with_timestamp_basic_test.cc            |   8 +-
 db/experimental.cc                            |   3 +-
 env/fs_posix.cc                               |   4 +-
 env/io_posix.cc                               |   2 +-
 include/rocksdb/c.h                           |   2 +-
 include/rocksdb/functor_wrapper.h             |   2 +-
 include/rocksdb/unique_id.h                   |  10 +-
 java/rocksjni/config_options.cc               |  34 ++--
 java/rocksjni/env_options.cc                  |  82 ++++----
 java/rocksjni/import_column_family_options.cc |  20 +-
 java/rocksjni/kv_helper.h                     |   6 +-
 java/rocksjni/memory_util.cc                  |  14 +-
 java/rocksjni/rocksjni.cc                     |  15 +-
 java/rocksjni/sst_file_readerjni.cc           |  39 ++--
 java/rocksjni/sst_file_writerjni.cc           | 116 ++++++-----
 java/rocksjni/table.cc                        |  16 +-
 .../table_properties_collector_factory.cc     |   8 +-
 java/rocksjni/testable_event_listener.cc      |   8 +-
 microbench/ribbon_bench.cc                    |  10 +-
 monitoring/perf_context.cc                    |   4 +-
 options/options_settable_test.cc              |   2 +-
 port/jemalloc_helper.h                        |  32 ++-
 table/block_based/block_test.cc               | 186 +++++++++---------
 table/block_based/mock_block_based_table.h    |   2 +-
 table/cleanable_test.cc                       |   4 +-
 table/unique_id.cc                            |  30 +--
 table/unique_id_impl.h                        |  16 +-
 tools/dump/rocksdb_undump.cc                  |   2 +-
 util/bloom_impl.h                             |  32 +--
 util/coding.h                                 |   6 +-
 util/crc32c_arm64.cc                          |  17 +-
 util/crc32c_arm64.h                           |   2 +-
 util/crc32c_ppc.h                             |   2 +-
 util/dynamic_bloom_test.cc                    |   6 +-
 util/filter_bench.cc                          |  36 ++--
 util/gflags_compat.h                          |   6 +-
 util/hash_test.cc                             |  14 +-
 util/io_dispatcher_test.cc                    |   1 -
 util/mutexlock.h                              |  38 ++--
 util/ribbon_alg.h                             |  50 ++---
 utilities/cassandra/test_utils.cc             |   2 +-
 utilities/cassandra/test_utils.h              |   2 +-
 utilities/object_registry.cc                  |  80 ++++----
 .../write_batch_with_index_internal.h         |   6 +-
 48 files changed, 518 insertions(+), 525 deletions(-)

diff --git a/cache/cache_entry_stats.h b/cache/cache_entry_stats.h
index 9968995da95a..f8c5e422e896 100644
--- a/cache/cache_entry_stats.h
+++ b/cache/cache_entry_stats.h
@@ -101,23 +101,23 @@ class CacheEntryStatsCollector {
   }
 
   // Gets saved stats, regardless of age
-  void GetStats(Stats *stats) {
+  void GetStats(Stats* stats) {
     std::lock_guard<std::mutex> lock(saved_mutex_);
     *stats = saved_stats_;
   }
 
-  Cache *GetCache() const { return cache_; }
+  Cache* GetCache() const { return cache_; }
 
   // Gets or creates a shared instance of CacheEntryStatsCollector in the
   // cache itself, and saves into `ptr`. This shared_ptr will hold the
   // entry in cache until all refs are destroyed.
-  static Status GetShared(Cache *raw_cache, SystemClock *clock,
-                          std::shared_ptr<CacheEntryStatsCollector> *ptr) {
+  static Status GetShared(Cache* raw_cache, SystemClock* clock,
+                          std::shared_ptr<CacheEntryStatsCollector>* ptr) {
     assert(raw_cache);
     BasicTypedCacheInterface<CacheEntryStatsCollector, CacheEntryRole::kMisc>
         cache{raw_cache};
 
-    const Slice &cache_key = GetCacheKey();
+    const Slice& cache_key = GetCacheKey();
     auto h = cache.Lookup(cache_key);
     if (h == nullptr) {
       // Not yet in cache, but Cache doesn't provide a built-in way to
@@ -152,7 +152,7 @@ class CacheEntryStatsCollector {
   }
 
  private:
-  explicit CacheEntryStatsCollector(Cache *cache, SystemClock *clock)
+  explicit CacheEntryStatsCollector(Cache* cache, SystemClock* clock)
       : saved_stats_(),
         working_stats_(),
         last_start_time_micros_(0),
@@ -160,7 +160,7 @@ class CacheEntryStatsCollector {
         cache_(cache),
         clock_(clock) {}
 
-  static const Slice &GetCacheKey() {
+  static const Slice& GetCacheKey() {
     // For each template instantiation
     static CacheKey ckey = CacheKey::CreateUniqueForProcessLifetime();
     static Slice ckey_slice = ckey.AsSlice();
@@ -175,8 +175,8 @@ class CacheEntryStatsCollector {
   uint64_t last_start_time_micros_;
   uint64_t last_end_time_micros_;
 
-  Cache *const cache_;
-  SystemClock *const clock_;
+  Cache* const cache_;
+  SystemClock* const clock_;
 };
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/cache/cache_key.cc b/cache/cache_key.cc
index addff61d17b0..a5553c0d257c 100644
--- a/cache/cache_key.cc
+++ b/cache/cache_key.cc
@@ -24,7 +24,7 @@ namespace ROCKSDB_NAMESPACE {
 //              0 |      >= 1<<63 | CreateUniqueForProcessLifetime
 //            > 0 |           any | OffsetableCacheKey.WithOffset
 
-CacheKey CacheKey::CreateUniqueForCacheLifetime(Cache *cache) {
+CacheKey CacheKey::CreateUniqueForCacheLifetime(Cache* cache) {
   // +1 so that we can reserve all zeros for "unset" cache key
   uint64_t id = cache->NewId() + 1;
   // Ensure we don't collide with CreateUniqueForProcessLifetime
@@ -297,8 +297,8 @@ CacheKey CacheKey::CreateUniqueForProcessLifetime() {
 //
 // TODO: Nevertheless / regardless, an efficient way to detect (and thus
 // quantify) block cache corruptions, including collisions, should be added.
-OffsetableCacheKey::OffsetableCacheKey(const std::string &db_id,
-                                       const std::string &db_session_id,
+OffsetableCacheKey::OffsetableCacheKey(const std::string& db_id,
+                                       const std::string& db_session_id,
                                        uint64_t file_number) {
   UniqueId64x2 internal_id;
   Status s = GetSstInternalUniqueId(db_id, db_session_id, file_number,
diff --git a/cache/cache_key.h b/cache/cache_key.h
index 0b93c6bd9472..4cf5d2e7d34b 100644
--- a/cache/cache_key.h
+++ b/cache/cache_key.h
@@ -44,13 +44,13 @@ class CacheKey {
   inline Slice AsSlice() const {
     static_assert(sizeof(*this) == 16, "Standardized on 16-byte cache key");
     assert(!IsEmpty());
-    return Slice(reinterpret_cast<const char *>(this), sizeof(*this));
+    return Slice(reinterpret_cast<const char*>(this), sizeof(*this));
   }
 
   // Create a CacheKey that is unique among others associated with this Cache
   // instance. Depends on Cache::NewId. This is useful for block cache
   // "reservations".
-  static CacheKey CreateUniqueForCacheLifetime(Cache *cache);
+  static CacheKey CreateUniqueForCacheLifetime(Cache* cache);
 
   // Create a CacheKey that is unique among others for the lifetime of this
   // process. This is useful for saving in a static data member so that
@@ -87,7 +87,7 @@ class OffsetableCacheKey : private CacheKey {
 
   // Constructs an OffsetableCacheKey with the given information about a file.
   // This constructor never generates an "empty" base key.
-  OffsetableCacheKey(const std::string &db_id, const std::string &db_session_id,
+  OffsetableCacheKey(const std::string& db_id, const std::string& db_session_id,
                      uint64_t file_number);
 
   // Creates an OffsetableCacheKey from an SST unique ID, so that cache keys
@@ -134,9 +134,9 @@ class OffsetableCacheKey : private CacheKey {
     static_assert(sizeof(file_num_etc64_) == kCommonPrefixSize,
                   "8 byte common prefix expected");
     assert(!IsEmpty());
-    assert(&this->file_num_etc64_ == static_cast<const void *>(this));
+    assert(&this->file_num_etc64_ == static_cast<const void*>(this));
 
-    return Slice(reinterpret_cast<const char *>(this), kCommonPrefixSize);
+    return Slice(reinterpret_cast<const char*>(this), kCommonPrefixSize);
   }
 };
 
diff --git a/cache/cache_reservation_manager.h b/cache/cache_reservation_manager.h
index a7b06dea2073..deff5be8a285 100644
--- a/cache/cache_reservation_manager.h
+++ b/cache/cache_reservation_manager.h
@@ -44,8 +44,8 @@ class CacheReservationManager {
                                         bool increase) = 0;
   virtual Status MakeCacheReservation(
       std::size_t incremental_memory_used,
-      std::unique_ptr<CacheReservationManager::CacheReservationHandle>
-          *handle) = 0;
+      std::unique_ptr<CacheReservationManager::CacheReservationHandle>*
+          handle) = 0;
   virtual std::size_t GetTotalReservedCacheSize() = 0;
   virtual std::size_t GetTotalMemoryUsed() = 0;
 };
@@ -90,11 +90,11 @@ class CacheReservationManagerImpl
                                        bool delayed_decrease = false);
 
   // no copy constructor, copy assignment, move constructor, move assignment
-  CacheReservationManagerImpl(const CacheReservationManagerImpl &) = delete;
-  CacheReservationManagerImpl &operator=(const CacheReservationManagerImpl &) =
+  CacheReservationManagerImpl(const CacheReservationManagerImpl&) = delete;
+  CacheReservationManagerImpl& operator=(const CacheReservationManagerImpl&) =
       delete;
-  CacheReservationManagerImpl(CacheReservationManagerImpl &&) = delete;
-  CacheReservationManagerImpl &operator=(CacheReservationManagerImpl &&) =
+  CacheReservationManagerImpl(CacheReservationManagerImpl&&) = delete;
+  CacheReservationManagerImpl& operator=(CacheReservationManagerImpl&&) =
       delete;
 
   ~CacheReservationManagerImpl() override;
@@ -178,7 +178,7 @@ class CacheReservationManagerImpl
   // REQUIRES: handle != nullptr
   Status MakeCacheReservation(
       std::size_t incremental_memory_used,
-      std::unique_ptr<CacheReservationManager::CacheReservationHandle> *handle)
+      std::unique_ptr<CacheReservationManager::CacheReservationHandle>* handle)
       override;
 
   // Return the size of the cache (which is a multiple of kSizeDummyEntry)
@@ -200,7 +200,7 @@ class CacheReservationManagerImpl
   // For testing only - it is to help ensure the CacheItemHelperForRole<R>
   // accessed from CacheReservationManagerImpl and the one accessed from the
   // test are from the same translation units
-  static const Cache::CacheItemHelper *TEST_GetCacheItemHelperForRole();
+  static const Cache::CacheItemHelper* TEST_GetCacheItemHelperForRole();
 
  private:
   static constexpr std::size_t kSizeDummyEntry = 256 * 1024;
@@ -216,7 +216,7 @@ class CacheReservationManagerImpl
   bool delayed_decrease_;
   std::atomic<std::size_t> cache_allocated_size_;
   std::size_t memory_used_;
-  std::vector<Cache::Handle *> dummy_handles_;
+  std::vector<Cache::Handle*> dummy_handles_;
   CacheKey cache_key_;
 };
 
@@ -251,14 +251,14 @@ class ConcurrentCacheReservationManager
       std::shared_ptr<CacheReservationManager> cache_res_mgr) {
     cache_res_mgr_ = std::move(cache_res_mgr);
   }
-  ConcurrentCacheReservationManager(const ConcurrentCacheReservationManager &) =
+  ConcurrentCacheReservationManager(const ConcurrentCacheReservationManager&) =
       delete;
-  ConcurrentCacheReservationManager &operator=(
-      const ConcurrentCacheReservationManager &) = delete;
-  ConcurrentCacheReservationManager(ConcurrentCacheReservationManager &&) =
+  ConcurrentCacheReservationManager& operator=(
+      const ConcurrentCacheReservationManager&) = delete;
+  ConcurrentCacheReservationManager(ConcurrentCacheReservationManager&&) =
       delete;
-  ConcurrentCacheReservationManager &operator=(
-      ConcurrentCacheReservationManager &&) = delete;
+  ConcurrentCacheReservationManager& operator=(
+      ConcurrentCacheReservationManager&&) = delete;
 
   ~ConcurrentCacheReservationManager() override {}
 
@@ -286,7 +286,7 @@ class ConcurrentCacheReservationManager
 
   inline Status MakeCacheReservation(
       std::size_t incremental_memory_used,
-      std::unique_ptr<CacheReservationManager::CacheReservationHandle> *handle)
+      std::unique_ptr<CacheReservationManager::CacheReservationHandle>* handle)
       override {
     std::unique_ptr<CacheReservationManager::CacheReservationHandle>
         wrapped_handle;
diff --git a/db/db_with_timestamp_basic_test.cc b/db/db_with_timestamp_basic_test.cc
index 17390681e0b4..983080eae78f 100644
--- a/db/db_with_timestamp_basic_test.cc
+++ b/db/db_with_timestamp_basic_test.cc
@@ -1427,8 +1427,12 @@ TEST_F(DBBasicTestWithTimestamp, ReseekToNextUserKey) {
   {
     std::string ts_str = Timestamp(static_cast<uint64_t>(kNumKeys + 1), 0);
     WriteBatch batch(0, 0, 0, kTimestampSize);
-    { ASSERT_OK(batch.Put("a", "new_value")); }
-    { ASSERT_OK(batch.Put("b", "new_value")); }
+    {
+      ASSERT_OK(batch.Put("a", "new_value"));
+    }
+    {
+      ASSERT_OK(batch.Put("b", "new_value"));
+    }
     s = batch.UpdateTimestamps(
         ts_str, [kTimestampSize](uint32_t) { return kTimestampSize; });
     ASSERT_OK(s);
diff --git a/db/experimental.cc b/db/experimental.cc
index 6350dede4ac3..b6efc1a47534 100644
--- a/db/experimental.cc
+++ b/db/experimental.cc
@@ -1188,7 +1188,8 @@ class SstQueryFilterConfigsManagerImpl : public SstQueryFilterConfigsManager {
             break;
           default:
             // TODO? Report problem
-            {}
+            {
+            }
             // Unknown filter type
         }
         if (!may_match) {
diff --git a/env/fs_posix.cc b/env/fs_posix.cc
index 7080eef1a09d..4601242f6c8a 100644
--- a/env/fs_posix.cc
+++ b/env/fs_posix.cc
@@ -674,7 +674,7 @@ class PosixFileSystem : public FileSystem {
 
   IOStatus GetFileSize(const std::string& fname, const IOOptions& /*opts*/,
                        uint64_t* size, IODebugContext* /*dbg*/) override {
-    struct stat sbuf {};
+    struct stat sbuf{};
     if (stat(fname.c_str(), &sbuf) != 0) {
       *size = 0;
       return IOError("while stat a file for size", fname, errno);
@@ -981,7 +981,7 @@ class PosixFileSystem : public FileSystem {
   // file size. However this API only works on opened file.
   IOStatus GetFileSizeOnOpenedFile(const int fd, const std::string& name,
                                    uint64_t* size) {
-    struct stat sb {};
+    struct stat sb{};
     *size = 0;
     // Get file information using fstat
     if (fstat(fd, &sb) == -1) {
diff --git a/env/io_posix.cc b/env/io_posix.cc
index 489e5b3a9e50..80cb1e05aeae 100644
--- a/env/io_posix.cc
+++ b/env/io_posix.cc
@@ -610,7 +610,7 @@ PosixRandomAccessFile::PosixRandomAccessFile(
 PosixRandomAccessFile::~PosixRandomAccessFile() { close(fd_); }
 
 IOStatus PosixRandomAccessFile::GetFileSize(uint64_t* result) {
-  struct stat sbuf {};
+  struct stat sbuf{};
   if (fstat(fd_, &sbuf) != 0) {
     *result = 0;
     return IOError("While fstat with fd " + std::to_string(fd_), filename_,
diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h
index 5c88c8fe3015..55e03ea96937 100644
--- a/include/rocksdb/c.h
+++ b/include/rocksdb/c.h
@@ -2728,7 +2728,7 @@ rocksdb_slicetransform_create(
     unsigned char (*in_range)(void*, const char* key, size_t length),
     const char* (*name)(void*));
 extern ROCKSDB_LIBRARY_API rocksdb_slicetransform_t*
-    rocksdb_slicetransform_create_fixed_prefix(size_t);
+rocksdb_slicetransform_create_fixed_prefix(size_t);
 extern ROCKSDB_LIBRARY_API rocksdb_slicetransform_t*
 rocksdb_slicetransform_create_noop(void);
 extern ROCKSDB_LIBRARY_API void rocksdb_slicetransform_destroy(
diff --git a/include/rocksdb/functor_wrapper.h b/include/rocksdb/functor_wrapper.h
index 17b021bf73b5..50007b85d77a 100644
--- a/include/rocksdb/functor_wrapper.h
+++ b/include/rocksdb/functor_wrapper.h
@@ -44,7 +44,7 @@ void call(Function f, Tuple t) {
 template <typename... Args>
 class FunctorWrapper {
  public:
-  explicit FunctorWrapper(std::function<void(Args...)> functor, Args &&...args)
+  explicit FunctorWrapper(std::function<void(Args...)> functor, Args&&... args)
       : functor_(std::move(functor)), args_(std::forward<Args>(args)...) {}
 
   void invoke() { detail::call(functor_, args_); }
diff --git a/include/rocksdb/unique_id.h b/include/rocksdb/unique_id.h
index eb0c778266cb..3c0c0eb5b1bf 100644
--- a/include/rocksdb/unique_id.h
+++ b/include/rocksdb/unique_id.h
@@ -33,8 +33,8 @@ namespace ROCKSDB_NAMESPACE {
 // And assuming one generates many SST files in the lifetime of each process,
 // the probability of ID collisions is much "better than random"; see
 // https://github.com/pdillinger/unique_id
-Status GetUniqueIdFromTableProperties(const TableProperties &props,
-                                      std::string *out_id);
+Status GetUniqueIdFromTableProperties(const TableProperties& props,
+                                      std::string* out_id);
 
 // Computes a 192-bit (24 binary char) stable, universally unique ID
 // with an extra 64 bits of uniqueness compared to the standard ID. It is only
@@ -44,12 +44,12 @@ Status GetUniqueIdFromTableProperties(const TableProperties &props,
 // example above would expect a global file ID collision every 4 days with
 // 128-bit IDs (using some worst-case assumptions about process lifetime).
 // It's 10^17 years with 192-bit IDs.
-Status GetExtendedUniqueIdFromTableProperties(const TableProperties &props,
-                                              std::string *out_id);
+Status GetExtendedUniqueIdFromTableProperties(const TableProperties& props,
+                                              std::string* out_id);
 
 // Converts a binary string (unique id) to hexadecimal, with each 64 bits
 // separated by '-', e.g. 6474DF650323BDF0-B48E64F3039308CA-17284B32E7F7444B
 // Also works on unique id prefix.
-std::string UniqueIdToHumanString(const std::string &id);
+std::string UniqueIdToHumanString(const std::string& id);
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/java/rocksjni/config_options.cc b/java/rocksjni/config_options.cc
index 1532dd9e80ad..2f243f978423 100644
--- a/java/rocksjni/config_options.cc
+++ b/java/rocksjni/config_options.cc
@@ -19,9 +19,9 @@
  * Method:    disposeInternal
  * Signature: (J)V
  */
-void Java_org_rocksdb_ConfigOptions_disposeInternalJni(JNIEnv *, jclass,
+void Java_org_rocksdb_ConfigOptions_disposeInternalJni(JNIEnv*, jclass,
                                                        jlong jhandle) {
-  auto *co = reinterpret_cast<ROCKSDB_NAMESPACE::ConfigOptions *>(jhandle);
+  auto* co = reinterpret_cast<ROCKSDB_NAMESPACE::ConfigOptions*>(jhandle);
   assert(co != nullptr);
   delete co;
 }
@@ -31,8 +31,8 @@ void Java_org_rocksdb_ConfigOptions_disposeInternalJni(JNIEnv *, jclass,
  * Method:    newConfigOptions
  * Signature: ()J
  */
-jlong Java_org_rocksdb_ConfigOptions_newConfigOptions(JNIEnv *, jclass) {
-  auto *cfg_opt = new ROCKSDB_NAMESPACE::ConfigOptions();
+jlong Java_org_rocksdb_ConfigOptions_newConfigOptions(JNIEnv*, jclass) {
+  auto* cfg_opt = new ROCKSDB_NAMESPACE::ConfigOptions();
   return GET_CPLUSPLUS_POINTER(cfg_opt);
 }
 
@@ -41,11 +41,11 @@ jlong Java_org_rocksdb_ConfigOptions_newConfigOptions(JNIEnv *, jclass) {
  * Method:    setEnv
  * Signature: (JJ;)V
  */
-void Java_org_rocksdb_ConfigOptions_setEnv(JNIEnv *, jclass, jlong handle,
+void Java_org_rocksdb_ConfigOptions_setEnv(JNIEnv*, jclass, jlong handle,
                                            jlong rocksdb_env_handle) {
-  auto *cfg_opt = reinterpret_cast<ROCKSDB_NAMESPACE::ConfigOptions *>(handle);
-  auto *rocksdb_env =
-      reinterpret_cast<ROCKSDB_NAMESPACE::Env *>(rocksdb_env_handle);
+  auto* cfg_opt = reinterpret_cast<ROCKSDB_NAMESPACE::ConfigOptions*>(handle);
+  auto* rocksdb_env =
+      reinterpret_cast<ROCKSDB_NAMESPACE::Env*>(rocksdb_env_handle);
   cfg_opt->env = rocksdb_env;
 }
 
@@ -54,10 +54,10 @@ void Java_org_rocksdb_ConfigOptions_setEnv(JNIEnv *, jclass, jlong handle,
  * Method:    setDelimiter
  * Signature: (JLjava/lang/String;)V
  */
-void Java_org_rocksdb_ConfigOptions_setDelimiter(JNIEnv *env, jclass,
+void Java_org_rocksdb_ConfigOptions_setDelimiter(JNIEnv* env, jclass,
                                                  jlong handle, jstring s) {
-  auto *cfg_opt = reinterpret_cast<ROCKSDB_NAMESPACE::ConfigOptions *>(handle);
-  const char *delim = env->GetStringUTFChars(s, nullptr);
+  auto* cfg_opt = reinterpret_cast<ROCKSDB_NAMESPACE::ConfigOptions*>(handle);
+  const char* delim = env->GetStringUTFChars(s, nullptr);
   if (delim == nullptr) {
     // exception thrown: OutOfMemoryError
     return;
@@ -71,10 +71,10 @@ void Java_org_rocksdb_ConfigOptions_setDelimiter(JNIEnv *env, jclass,
  * Method:    setIgnoreUnknownOptions
  * Signature: (JZ)V
  */
-void Java_org_rocksdb_ConfigOptions_setIgnoreUnknownOptions(JNIEnv *, jclass,
+void Java_org_rocksdb_ConfigOptions_setIgnoreUnknownOptions(JNIEnv*, jclass,
                                                             jlong handle,
                                                             jboolean b) {
-  auto *cfg_opt = reinterpret_cast<ROCKSDB_NAMESPACE::ConfigOptions *>(handle);
+  auto* cfg_opt = reinterpret_cast<ROCKSDB_NAMESPACE::ConfigOptions*>(handle);
   cfg_opt->ignore_unknown_options = static_cast<bool>(b);
 }
 
@@ -83,10 +83,10 @@ void Java_org_rocksdb_ConfigOptions_setIgnoreUnknownOptions(JNIEnv *, jclass,
  * Method:    setInputStringsEscaped
  * Signature: (JZ)V
  */
-void Java_org_rocksdb_ConfigOptions_setInputStringsEscaped(JNIEnv *, jclass,
+void Java_org_rocksdb_ConfigOptions_setInputStringsEscaped(JNIEnv*, jclass,
                                                            jlong handle,
                                                            jboolean b) {
-  auto *cfg_opt = reinterpret_cast<ROCKSDB_NAMESPACE::ConfigOptions *>(handle);
+  auto* cfg_opt = reinterpret_cast<ROCKSDB_NAMESPACE::ConfigOptions*>(handle);
   cfg_opt->input_strings_escaped = static_cast<bool>(b);
 }
 
@@ -95,9 +95,9 @@ void Java_org_rocksdb_ConfigOptions_setInputStringsEscaped(JNIEnv *, jclass,
  * Method:    setSanityLevel
  * Signature: (JI)V
  */
-void Java_org_rocksdb_ConfigOptions_setSanityLevel(JNIEnv *, jclass,
+void Java_org_rocksdb_ConfigOptions_setSanityLevel(JNIEnv*, jclass,
                                                    jlong handle, jbyte level) {
-  auto *cfg_opt = reinterpret_cast<ROCKSDB_NAMESPACE::ConfigOptions *>(handle);
+  auto* cfg_opt = reinterpret_cast<ROCKSDB_NAMESPACE::ConfigOptions*>(handle);
   cfg_opt->sanity_level =
       ROCKSDB_NAMESPACE::SanityLevelJni::toCppSanityLevel(level);
 }
diff --git a/java/rocksjni/env_options.cc b/java/rocksjni/env_options.cc
index c3a9ae825da1..3f2577193e65 100644
--- a/java/rocksjni/env_options.cc
+++ b/java/rocksjni/env_options.cc
@@ -13,28 +13,28 @@
 #include "rocksdb/env.h"
 #include "rocksjni/cplusplus_to_java_convert.h"
 
-#define ENV_OPTIONS_SET_BOOL(_jhandle, _opt)                          \
-  reinterpret_cast<ROCKSDB_NAMESPACE::EnvOptions *>(_jhandle)->_opt = \
+#define ENV_OPTIONS_SET_BOOL(_jhandle, _opt)                         \
+  reinterpret_cast<ROCKSDB_NAMESPACE::EnvOptions*>(_jhandle)->_opt = \
       static_cast<bool>(_opt)
 
-#define ENV_OPTIONS_SET_SIZE_T(_jhandle, _opt)                        \
-  reinterpret_cast<ROCKSDB_NAMESPACE::EnvOptions *>(_jhandle)->_opt = \
+#define ENV_OPTIONS_SET_SIZE_T(_jhandle, _opt)                       \
+  reinterpret_cast<ROCKSDB_NAMESPACE::EnvOptions*>(_jhandle)->_opt = \
       static_cast<size_t>(_opt)
 
-#define ENV_OPTIONS_SET_UINT64_T(_jhandle, _opt)                      \
-  reinterpret_cast<ROCKSDB_NAMESPACE::EnvOptions *>(_jhandle)->_opt = \
+#define ENV_OPTIONS_SET_UINT64_T(_jhandle, _opt)                     \
+  reinterpret_cast<ROCKSDB_NAMESPACE::EnvOptions*>(_jhandle)->_opt = \
       static_cast<uint64_t>(_opt)
 
 #define ENV_OPTIONS_GET(_jhandle, _opt) \
-  reinterpret_cast<ROCKSDB_NAMESPACE::EnvOptions *>(_jhandle)->_opt
+  reinterpret_cast<ROCKSDB_NAMESPACE::EnvOptions*>(_jhandle)->_opt
 
 /*
  * Class:     org_rocksdb_EnvOptions
  * Method:    newEnvOptions
  * Signature: ()J
  */
-jlong Java_org_rocksdb_EnvOptions_newEnvOptions__(JNIEnv *, jclass) {
-  auto *env_opt = new ROCKSDB_NAMESPACE::EnvOptions();
+jlong Java_org_rocksdb_EnvOptions_newEnvOptions__(JNIEnv*, jclass) {
+  auto* env_opt = new ROCKSDB_NAMESPACE::EnvOptions();
   return GET_CPLUSPLUS_POINTER(env_opt);
 }
 
@@ -43,11 +43,11 @@ jlong Java_org_rocksdb_EnvOptions_newEnvOptions__(JNIEnv *, jclass) {
  * Method:    newEnvOptions
  * Signature: (J)J
  */
-jlong Java_org_rocksdb_EnvOptions_newEnvOptions__J(JNIEnv *, jclass,
+jlong Java_org_rocksdb_EnvOptions_newEnvOptions__J(JNIEnv*, jclass,
                                                    jlong jdboptions_handle) {
-  auto *db_options =
-      reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions *>(jdboptions_handle);
-  auto *env_opt = new ROCKSDB_NAMESPACE::EnvOptions(*db_options);
+  auto* db_options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jdboptions_handle);
+  auto* env_opt = new ROCKSDB_NAMESPACE::EnvOptions(*db_options);
   return GET_CPLUSPLUS_POINTER(env_opt);
 }
 
@@ -56,9 +56,9 @@ jlong Java_org_rocksdb_EnvOptions_newEnvOptions__J(JNIEnv *, jclass,
  * Method:    disposeInternal
  * Signature: (J)V
  */
-void Java_org_rocksdb_EnvOptions_disposeInternalJni(JNIEnv *, jclass,
+void Java_org_rocksdb_EnvOptions_disposeInternalJni(JNIEnv*, jclass,
                                                     jlong jhandle) {
-  auto *eo = reinterpret_cast<ROCKSDB_NAMESPACE::EnvOptions *>(jhandle);
+  auto* eo = reinterpret_cast<ROCKSDB_NAMESPACE::EnvOptions*>(jhandle);
   assert(eo != nullptr);
   delete eo;
 }
@@ -68,8 +68,7 @@ void Java_org_rocksdb_EnvOptions_disposeInternalJni(JNIEnv *, jclass,
  * Method:    setUseMmapReads
  * Signature: (JZ)V
  */
-void Java_org_rocksdb_EnvOptions_setUseMmapReads(JNIEnv *, jclass,
-                                                 jlong jhandle,
+void Java_org_rocksdb_EnvOptions_setUseMmapReads(JNIEnv*, jclass, jlong jhandle,
                                                  jboolean use_mmap_reads) {
   ENV_OPTIONS_SET_BOOL(jhandle, use_mmap_reads);
 }
@@ -79,7 +78,7 @@ void Java_org_rocksdb_EnvOptions_setUseMmapReads(JNIEnv *, jclass,
  * Method:    useMmapReads
  * Signature: (J)Z
  */
-jboolean Java_org_rocksdb_EnvOptions_useMmapReads(JNIEnv *, jclass,
+jboolean Java_org_rocksdb_EnvOptions_useMmapReads(JNIEnv*, jclass,
                                                   jlong jhandle) {
   return ENV_OPTIONS_GET(jhandle, use_mmap_reads);
 }
@@ -89,7 +88,7 @@ jboolean Java_org_rocksdb_EnvOptions_useMmapReads(JNIEnv *, jclass,
  * Method:    setUseMmapWrites
  * Signature: (JZ)V
  */
-void Java_org_rocksdb_EnvOptions_setUseMmapWrites(JNIEnv *, jclass,
+void Java_org_rocksdb_EnvOptions_setUseMmapWrites(JNIEnv*, jclass,
                                                   jlong jhandle,
                                                   jboolean use_mmap_writes) {
   ENV_OPTIONS_SET_BOOL(jhandle, use_mmap_writes);
@@ -100,7 +99,7 @@ void Java_org_rocksdb_EnvOptions_setUseMmapWrites(JNIEnv *, jclass,
  * Method:    useMmapWrites
  * Signature: (J)Z
  */
-jboolean Java_org_rocksdb_EnvOptions_useMmapWrites(JNIEnv *, jclass,
+jboolean Java_org_rocksdb_EnvOptions_useMmapWrites(JNIEnv*, jclass,
                                                    jlong jhandle) {
   return ENV_OPTIONS_GET(jhandle, use_mmap_writes);
 }
@@ -110,7 +109,7 @@ jboolean Java_org_rocksdb_EnvOptions_useMmapWrites(JNIEnv *, jclass,
  * Method:    setUseDirectReads
  * Signature: (JZ)V
  */
-void Java_org_rocksdb_EnvOptions_setUseDirectReads(JNIEnv *, jclass,
+void Java_org_rocksdb_EnvOptions_setUseDirectReads(JNIEnv*, jclass,
                                                    jlong jhandle,
                                                    jboolean use_direct_reads) {
   ENV_OPTIONS_SET_BOOL(jhandle, use_direct_reads);
@@ -121,7 +120,7 @@ void Java_org_rocksdb_EnvOptions_setUseDirectReads(JNIEnv *, jclass,
  * Method:    useDirectReads
  * Signature: (J)Z
  */
-jboolean Java_org_rocksdb_EnvOptions_useDirectReads(JNIEnv *, jclass,
+jboolean Java_org_rocksdb_EnvOptions_useDirectReads(JNIEnv*, jclass,
                                                     jlong jhandle) {
   return ENV_OPTIONS_GET(jhandle, use_direct_reads);
 }
@@ -132,7 +131,7 @@ jboolean Java_org_rocksdb_EnvOptions_useDirectReads(JNIEnv *, jclass,
  * Signature: (JZ)V
  */
 void Java_org_rocksdb_EnvOptions_setUseDirectWrites(
-    JNIEnv *, jclass, jlong jhandle, jboolean use_direct_writes) {
+    JNIEnv*, jclass, jlong jhandle, jboolean use_direct_writes) {
   ENV_OPTIONS_SET_BOOL(jhandle, use_direct_writes);
 }
 
@@ -141,7 +140,7 @@ void Java_org_rocksdb_EnvOptions_setUseDirectWrites(
  * Method:    useDirectWrites
  * Signature: (J)Z
  */
-jboolean Java_org_rocksdb_EnvOptions_useDirectWrites(JNIEnv *, jclass,
+jboolean Java_org_rocksdb_EnvOptions_useDirectWrites(JNIEnv*, jclass,
                                                      jlong jhandle) {
   return ENV_OPTIONS_GET(jhandle, use_direct_writes);
 }
@@ -151,7 +150,7 @@ jboolean Java_org_rocksdb_EnvOptions_useDirectWrites(JNIEnv *, jclass,
  * Method:    setAllowFallocate
  * Signature: (JZ)V
  */
-void Java_org_rocksdb_EnvOptions_setAllowFallocate(JNIEnv *, jclass,
+void Java_org_rocksdb_EnvOptions_setAllowFallocate(JNIEnv*, jclass,
                                                    jlong jhandle,
                                                    jboolean allow_fallocate) {
   ENV_OPTIONS_SET_BOOL(jhandle, allow_fallocate);
@@ -162,7 +161,7 @@ void Java_org_rocksdb_EnvOptions_setAllowFallocate(JNIEnv *, jclass,
  * Method:    allowFallocate
  * Signature: (J)Z
  */
-jboolean Java_org_rocksdb_EnvOptions_allowFallocate(JNIEnv *, jclass,
+jboolean Java_org_rocksdb_EnvOptions_allowFallocate(JNIEnv*, jclass,
                                                     jlong jhandle) {
   return ENV_OPTIONS_GET(jhandle, allow_fallocate);
 }
@@ -172,8 +171,7 @@ jboolean Java_org_rocksdb_EnvOptions_allowFallocate(JNIEnv *, jclass,
  * Method:    setSetFdCloexec
  * Signature: (JZ)V
  */
-void Java_org_rocksdb_EnvOptions_setSetFdCloexec(JNIEnv *, jclass,
-                                                 jlong jhandle,
+void Java_org_rocksdb_EnvOptions_setSetFdCloexec(JNIEnv*, jclass, jlong jhandle,
                                                  jboolean set_fd_cloexec) {
   ENV_OPTIONS_SET_BOOL(jhandle, set_fd_cloexec);
 }
@@ -183,7 +181,7 @@ void Java_org_rocksdb_EnvOptions_setSetFdCloexec(JNIEnv *, jclass,
  * Method:    setFdCloexec
  * Signature: (J)Z
  */
-jboolean Java_org_rocksdb_EnvOptions_setFdCloexec(JNIEnv *, jclass,
+jboolean Java_org_rocksdb_EnvOptions_setFdCloexec(JNIEnv*, jclass,
                                                   jlong jhandle) {
   return ENV_OPTIONS_GET(jhandle, set_fd_cloexec);
 }
@@ -193,8 +191,7 @@ jboolean Java_org_rocksdb_EnvOptions_setFdCloexec(JNIEnv *, jclass,
  * Method:    setBytesPerSync
  * Signature: (JJ)V
  */
-void Java_org_rocksdb_EnvOptions_setBytesPerSync(JNIEnv *, jclass,
-                                                 jlong jhandle,
+void Java_org_rocksdb_EnvOptions_setBytesPerSync(JNIEnv*, jclass, jlong jhandle,
                                                  jlong bytes_per_sync) {
   ENV_OPTIONS_SET_UINT64_T(jhandle, bytes_per_sync);
 }
@@ -204,8 +201,7 @@ void Java_org_rocksdb_EnvOptions_setBytesPerSync(JNIEnv *, jclass,
  * Method:    bytesPerSync
  * Signature: (J)J
  */
-jlong Java_org_rocksdb_EnvOptions_bytesPerSync(JNIEnv *, jclass,
-                                               jlong jhandle) {
+jlong Java_org_rocksdb_EnvOptions_bytesPerSync(JNIEnv*, jclass, jlong jhandle) {
   return ENV_OPTIONS_GET(jhandle, bytes_per_sync);
 }
 
@@ -215,7 +211,7 @@ jlong Java_org_rocksdb_EnvOptions_bytesPerSync(JNIEnv *, jclass,
  * Signature: (JZ)V
  */
 void Java_org_rocksdb_EnvOptions_setFallocateWithKeepSize(
-    JNIEnv *, jclass, jlong jhandle, jboolean fallocate_with_keep_size) {
+    JNIEnv*, jclass, jlong jhandle, jboolean fallocate_with_keep_size) {
   ENV_OPTIONS_SET_BOOL(jhandle, fallocate_with_keep_size);
 }
 
@@ -224,7 +220,7 @@ void Java_org_rocksdb_EnvOptions_setFallocateWithKeepSize(
  * Method:    fallocateWithKeepSize
  * Signature: (J)Z
  */
-jboolean Java_org_rocksdb_EnvOptions_fallocateWithKeepSize(JNIEnv *, jclass,
+jboolean Java_org_rocksdb_EnvOptions_fallocateWithKeepSize(JNIEnv*, jclass,
                                                            jlong jhandle) {
   return ENV_OPTIONS_GET(jhandle, fallocate_with_keep_size);
 }
@@ -235,7 +231,7 @@ jboolean Java_org_rocksdb_EnvOptions_fallocateWithKeepSize(JNIEnv *, jclass,
  * Signature: (JJ)V
  */
 void Java_org_rocksdb_EnvOptions_setCompactionReadaheadSize(
-    JNIEnv *, jclass, jlong jhandle, jlong compaction_readahead_size) {
+    JNIEnv*, jclass, jlong jhandle, jlong compaction_readahead_size) {
   ENV_OPTIONS_SET_SIZE_T(jhandle, compaction_readahead_size);
 }
 
@@ -244,7 +240,7 @@ void Java_org_rocksdb_EnvOptions_setCompactionReadaheadSize(
  * Method:    compactionReadaheadSize
  * Signature: (J)J
  */
-jlong Java_org_rocksdb_EnvOptions_compactionReadaheadSize(JNIEnv *, jclass,
+jlong Java_org_rocksdb_EnvOptions_compactionReadaheadSize(JNIEnv*, jclass,
                                                           jlong jhandle) {
   return ENV_OPTIONS_GET(jhandle, compaction_readahead_size);
 }
@@ -255,7 +251,7 @@ jlong Java_org_rocksdb_EnvOptions_compactionReadaheadSize(JNIEnv *, jclass,
  * Signature: (JJ)V
  */
 void Java_org_rocksdb_EnvOptions_setWritableFileMaxBufferSize(
-    JNIEnv *, jclass, jlong jhandle, jlong writable_file_max_buffer_size) {
+    JNIEnv*, jclass, jlong jhandle, jlong writable_file_max_buffer_size) {
   ENV_OPTIONS_SET_SIZE_T(jhandle, writable_file_max_buffer_size);
 }
 
@@ -264,7 +260,7 @@ void Java_org_rocksdb_EnvOptions_setWritableFileMaxBufferSize(
  * Method:    writableFileMaxBufferSize
  * Signature: (J)J
  */
-jlong Java_org_rocksdb_EnvOptions_writableFileMaxBufferSize(JNIEnv *, jclass,
+jlong Java_org_rocksdb_EnvOptions_writableFileMaxBufferSize(JNIEnv*, jclass,
                                                             jlong jhandle) {
   return ENV_OPTIONS_GET(jhandle, writable_file_max_buffer_size);
 }
@@ -274,11 +270,11 @@ jlong Java_org_rocksdb_EnvOptions_writableFileMaxBufferSize(JNIEnv *, jclass,
  * Method:    setRateLimiter
  * Signature: (JJ)V
  */
-void Java_org_rocksdb_EnvOptions_setRateLimiter(JNIEnv *, jclass, jlong jhandle,
+void Java_org_rocksdb_EnvOptions_setRateLimiter(JNIEnv*, jclass, jlong jhandle,
                                                 jlong rl_handle) {
-  auto *sptr_rate_limiter =
-      reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::RateLimiter> *>(
+  auto* sptr_rate_limiter =
+      reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::RateLimiter>*>(
           rl_handle);
-  auto *env_opt = reinterpret_cast<ROCKSDB_NAMESPACE::EnvOptions *>(jhandle);
+  auto* env_opt = reinterpret_cast<ROCKSDB_NAMESPACE::EnvOptions*>(jhandle);
   env_opt->rate_limiter = sptr_rate_limiter->get();
 }
diff --git a/java/rocksjni/import_column_family_options.cc b/java/rocksjni/import_column_family_options.cc
index 1a9bded516b1..cd7bdfe007fa 100644
--- a/java/rocksjni/import_column_family_options.cc
+++ b/java/rocksjni/import_column_family_options.cc
@@ -16,8 +16,8 @@
  * Signature: ()J
  */
 jlong Java_org_rocksdb_ImportColumnFamilyOptions_newImportColumnFamilyOptions(
-    JNIEnv *, jclass) {
-  ROCKSDB_NAMESPACE::ImportColumnFamilyOptions *opts =
+    JNIEnv*, jclass) {
+  ROCKSDB_NAMESPACE::ImportColumnFamilyOptions* opts =
       new ROCKSDB_NAMESPACE::ImportColumnFamilyOptions();
   return GET_CPLUSPLUS_POINTER(opts);
 }
@@ -28,9 +28,9 @@ jlong Java_org_rocksdb_ImportColumnFamilyOptions_newImportColumnFamilyOptions(
  * Signature: (JZ)V
  */
 void Java_org_rocksdb_ImportColumnFamilyOptions_setMoveFiles(
-    JNIEnv *, jobject, jlong jhandle, jboolean jmove_files) {
-  auto *options =
-      reinterpret_cast<ROCKSDB_NAMESPACE::ImportColumnFamilyOptions *>(jhandle);
+    JNIEnv*, jobject, jlong jhandle, jboolean jmove_files) {
+  auto* options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ImportColumnFamilyOptions*>(jhandle);
   options->move_files = static_cast<bool>(jmove_files);
 }
 
@@ -39,10 +39,10 @@ void Java_org_rocksdb_ImportColumnFamilyOptions_setMoveFiles(
  * Method:    moveFiles
  * Signature: (J)Z
  */
-jboolean Java_org_rocksdb_ImportColumnFamilyOptions_moveFiles(JNIEnv *, jobject,
+jboolean Java_org_rocksdb_ImportColumnFamilyOptions_moveFiles(JNIEnv*, jobject,
                                                               jlong jhandle) {
-  auto *options =
-      reinterpret_cast<ROCKSDB_NAMESPACE::ImportColumnFamilyOptions *>(jhandle);
+  auto* options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ImportColumnFamilyOptions*>(jhandle);
   return static_cast<jboolean>(options->move_files);
 }
 
@@ -51,9 +51,9 @@ jboolean Java_org_rocksdb_ImportColumnFamilyOptions_moveFiles(JNIEnv *, jobject,
  * Method:    disposeInternal
  * Signature: (J)V
  */
-void Java_org_rocksdb_ImportColumnFamilyOptions_disposeInternal(JNIEnv *,
+void Java_org_rocksdb_ImportColumnFamilyOptions_disposeInternal(JNIEnv*,
                                                                 jobject,
                                                                 jlong jhandle) {
-  delete reinterpret_cast<ROCKSDB_NAMESPACE::ImportColumnFamilyOptions *>(
+  delete reinterpret_cast<ROCKSDB_NAMESPACE::ImportColumnFamilyOptions*>(
       jhandle);
 }
\ No newline at end of file
diff --git a/java/rocksjni/kv_helper.h b/java/rocksjni/kv_helper.h
index 5f0a8ffc57eb..75f254b173cd 100644
--- a/java/rocksjni/kv_helper.h
+++ b/java/rocksjni/kv_helper.h
@@ -81,7 +81,7 @@ class KVException : public std::exception {
     }
   }
 
-  KVException(jint code) : kCode_(code){};
+  KVException(jint code) : kCode_(code) {};
 
   virtual const char* what() const noexcept {
     return "Exception raised by JNI. There may be a Java exception in the "
@@ -176,13 +176,13 @@ class JByteArrayPinnableSlice {
       : env_(env),
         jbuffer_(jbuffer),
         jbuffer_off_(jbuffer_off),
-        jbuffer_len_(jbuffer_len){};
+        jbuffer_len_(jbuffer_len) {};
 
   /**
    * @brief Construct an empty new JByteArrayPinnableSlice object
    *
    */
-  JByteArrayPinnableSlice(JNIEnv* env) : env_(env){};
+  JByteArrayPinnableSlice(JNIEnv* env) : env_(env) {};
 
   PinnableSlice& pinnable_slice() { return pinnable_slice_; }
 
diff --git a/java/rocksjni/memory_util.cc b/java/rocksjni/memory_util.cc
index c87c4f403bbb..d60a89296481 100644
--- a/java/rocksjni/memory_util.cc
+++ b/java/rocksjni/memory_util.cc
@@ -21,9 +21,9 @@
  * Signature: ([J[J)Ljava/util/Map;
  */
 jobject Java_org_rocksdb_MemoryUtil_getApproximateMemoryUsageByType(
-    JNIEnv *env, jclass, jlongArray jdb_handles, jlongArray jcache_handles) {
+    JNIEnv* env, jclass, jlongArray jdb_handles, jlongArray jcache_handles) {
   jboolean has_exception = JNI_FALSE;
-  std::vector<ROCKSDB_NAMESPACE::DB *> dbs =
+  std::vector<ROCKSDB_NAMESPACE::DB*> dbs =
       ROCKSDB_NAMESPACE::JniUtil::fromJPointers<ROCKSDB_NAMESPACE::DB>(
           env, jdb_handles, &has_exception);
   if (has_exception == JNI_TRUE) {
@@ -31,18 +31,18 @@ jobject Java_org_rocksdb_MemoryUtil_getApproximateMemoryUsageByType(
     return nullptr;
   }
 
-  std::unordered_set<const ROCKSDB_NAMESPACE::Cache *> cache_set;
+  std::unordered_set<const ROCKSDB_NAMESPACE::Cache*> cache_set;
   jsize cache_handle_count = env->GetArrayLength(jcache_handles);
   if (cache_handle_count > 0) {
-    jlong *ptr_jcache_handles =
+    jlong* ptr_jcache_handles =
         env->GetLongArrayElements(jcache_handles, nullptr);
     if (ptr_jcache_handles == nullptr) {
       // exception thrown: OutOfMemoryError
       return nullptr;
     }
     for (jsize i = 0; i < cache_handle_count; i++) {
-      auto *cache_ptr =
-          reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::Cache> *>(
+      auto* cache_ptr =
+          reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::Cache>*>(
               ptr_jcache_handles[i]);
       cache_set.insert(cache_ptr->get());
     }
@@ -68,7 +68,7 @@ jobject Java_org_rocksdb_MemoryUtil_getApproximateMemoryUsageByType(
       jobject>
       fn_map_kv = [env](
                       const std::pair<ROCKSDB_NAMESPACE::MemoryUtil::UsageType,
-                                      uint64_t> &pair) {
+                                      uint64_t>& pair) {
         // Construct key
         const jobject jusage_type = ROCKSDB_NAMESPACE::ByteJni::valueOf(
             env, ROCKSDB_NAMESPACE::MemoryUsageTypeJni::toJavaMemoryUsageType(
diff --git a/java/rocksjni/rocksjni.cc b/java/rocksjni/rocksjni.cc
index af47c0e95644..45e8f507d9d2 100644
--- a/java/rocksjni/rocksjni.cc
+++ b/java/rocksjni/rocksjni.cc
@@ -67,11 +67,11 @@ jlong rocksdb_open_helper(JNIEnv* env, jlong jopt_handle, jstring jdb_path,
 jlong Java_org_rocksdb_RocksDB_open__JLjava_lang_String_2(JNIEnv* env, jclass,
                                                           jlong jopt_handle,
                                                           jstring jdb_path) {
-  return rocksdb_open_helper(env, jopt_handle, jdb_path,
-                             (ROCKSDB_NAMESPACE::Status(*)(
-                                 const ROCKSDB_NAMESPACE::Options&,
-                                 const std::string&, ROCKSDB_NAMESPACE::DB**)) &
-                                 ROCKSDB_NAMESPACE::DB::Open);
+  return rocksdb_open_helper(
+      env, jopt_handle, jdb_path,
+      (ROCKSDB_NAMESPACE::Status (*)(
+          const ROCKSDB_NAMESPACE::Options&, const std::string&,
+          ROCKSDB_NAMESPACE::DB**))&ROCKSDB_NAMESPACE::DB::Open);
 }
 
 /*
@@ -213,12 +213,11 @@ jlongArray Java_org_rocksdb_RocksDB_open__JLjava_lang_String_2_3_3B_3J(
     jobjectArray jcolumn_names, jlongArray jcolumn_options) {
   return rocksdb_open_helper(
       env, jopt_handle, jdb_path, jcolumn_names, jcolumn_options,
-      (ROCKSDB_NAMESPACE::Status(*)(
+      (ROCKSDB_NAMESPACE::Status (*)(
           const ROCKSDB_NAMESPACE::DBOptions&, const std::string&,
           const std::vector<ROCKSDB_NAMESPACE::ColumnFamilyDescriptor>&,
           std::vector<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>*,
-          ROCKSDB_NAMESPACE::DB**)) &
-          ROCKSDB_NAMESPACE::DB::Open);
+          ROCKSDB_NAMESPACE::DB**))&ROCKSDB_NAMESPACE::DB::Open);
 }
 
 /*
diff --git a/java/rocksjni/sst_file_readerjni.cc b/java/rocksjni/sst_file_readerjni.cc
index 4af472ecfb1c..c0370b1d64d8 100644
--- a/java/rocksjni/sst_file_readerjni.cc
+++ b/java/rocksjni/sst_file_readerjni.cc
@@ -24,12 +24,11 @@
  * Method:    newSstFileReader
  * Signature: (J)J
  */
-jlong Java_org_rocksdb_SstFileReader_newSstFileReader(JNIEnv * /*env*/,
+jlong Java_org_rocksdb_SstFileReader_newSstFileReader(JNIEnv* /*env*/,
                                                       jclass /*jcls*/,
                                                       jlong joptions) {
-  auto *options =
-      reinterpret_cast<const ROCKSDB_NAMESPACE::Options *>(joptions);
-  ROCKSDB_NAMESPACE::SstFileReader *sst_file_reader =
+  auto* options = reinterpret_cast<const ROCKSDB_NAMESPACE::Options*>(joptions);
+  ROCKSDB_NAMESPACE::SstFileReader* sst_file_reader =
       new ROCKSDB_NAMESPACE::SstFileReader(*options);
   return GET_CPLUSPLUS_POINTER(sst_file_reader);
 }
@@ -39,15 +38,15 @@ jlong Java_org_rocksdb_SstFileReader_newSstFileReader(JNIEnv * /*env*/,
  * Method:    open
  * Signature: (JLjava/lang/String;)V
  */
-void Java_org_rocksdb_SstFileReader_open(JNIEnv *env, jclass /*jcls*/,
+void Java_org_rocksdb_SstFileReader_open(JNIEnv* env, jclass /*jcls*/,
                                          jlong jhandle, jstring jfile_path) {
-  const char *file_path = env->GetStringUTFChars(jfile_path, nullptr);
+  const char* file_path = env->GetStringUTFChars(jfile_path, nullptr);
   if (file_path == nullptr) {
     // exception thrown: OutOfMemoryError
     return;
   }
   ROCKSDB_NAMESPACE::Status s =
-      reinterpret_cast<ROCKSDB_NAMESPACE::SstFileReader *>(jhandle)->Open(
+      reinterpret_cast<ROCKSDB_NAMESPACE::SstFileReader*>(jhandle)->Open(
           file_path);
   env->ReleaseStringUTFChars(jfile_path, file_path);
 
@@ -61,13 +60,13 @@ void Java_org_rocksdb_SstFileReader_open(JNIEnv *env, jclass /*jcls*/,
  * Method:    newIterator
  * Signature: (JJ)J
  */
-jlong Java_org_rocksdb_SstFileReader_newIterator(JNIEnv * /*env*/,
+jlong Java_org_rocksdb_SstFileReader_newIterator(JNIEnv* /*env*/,
                                                  jclass /*jcls*/, jlong jhandle,
                                                  jlong jread_options_handle) {
-  auto *sst_file_reader =
-      reinterpret_cast<ROCKSDB_NAMESPACE::SstFileReader *>(jhandle);
-  auto *read_options =
-      reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions *>(jread_options_handle);
+  auto* sst_file_reader =
+      reinterpret_cast<ROCKSDB_NAMESPACE::SstFileReader*>(jhandle);
+  auto* read_options =
+      reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jread_options_handle);
   return GET_CPLUSPLUS_POINTER(sst_file_reader->NewIterator(*read_options));
 }
 
@@ -76,10 +75,10 @@ jlong Java_org_rocksdb_SstFileReader_newIterator(JNIEnv * /*env*/,
  * Method:    disposeInternal
  * Signature: (J)V
  */
-void Java_org_rocksdb_SstFileReader_disposeInternalJni(JNIEnv * /*env*/,
+void Java_org_rocksdb_SstFileReader_disposeInternalJni(JNIEnv* /*env*/,
                                                        jclass /*jcls*/,
                                                        jlong jhandle) {
-  delete reinterpret_cast<ROCKSDB_NAMESPACE::SstFileReader *>(jhandle);
+  delete reinterpret_cast<ROCKSDB_NAMESPACE::SstFileReader*>(jhandle);
 }
 
 /*
@@ -87,10 +86,10 @@ void Java_org_rocksdb_SstFileReader_disposeInternalJni(JNIEnv * /*env*/,
  * Method:    verifyChecksum
  * Signature: (J)V
  */
-void Java_org_rocksdb_SstFileReader_verifyChecksum(JNIEnv *env, jclass /*jcls*/,
+void Java_org_rocksdb_SstFileReader_verifyChecksum(JNIEnv* env, jclass /*jcls*/,
                                                    jlong jhandle) {
-  auto *sst_file_reader =
-      reinterpret_cast<ROCKSDB_NAMESPACE::SstFileReader *>(jhandle);
+  auto* sst_file_reader =
+      reinterpret_cast<ROCKSDB_NAMESPACE::SstFileReader*>(jhandle);
   auto s = sst_file_reader->VerifyChecksum();
   if (!s.ok()) {
     ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
@@ -102,11 +101,11 @@ void Java_org_rocksdb_SstFileReader_verifyChecksum(JNIEnv *env, jclass /*jcls*/,
  * Method:    getTableProperties
  * Signature: (J)J
  */
-jobject Java_org_rocksdb_SstFileReader_getTableProperties(JNIEnv *env,
+jobject Java_org_rocksdb_SstFileReader_getTableProperties(JNIEnv* env,
                                                           jclass /*jcls*/,
                                                           jlong jhandle) {
-  auto *sst_file_reader =
-      reinterpret_cast<ROCKSDB_NAMESPACE::SstFileReader *>(jhandle);
+  auto* sst_file_reader =
+      reinterpret_cast<ROCKSDB_NAMESPACE::SstFileReader*>(jhandle);
   std::shared_ptr<const ROCKSDB_NAMESPACE::TableProperties> tp =
       sst_file_reader->GetTableProperties();
   jobject jtable_properties =
diff --git a/java/rocksjni/sst_file_writerjni.cc b/java/rocksjni/sst_file_writerjni.cc
index 481adbc85640..fbe888ab01b3 100644
--- a/java/rocksjni/sst_file_writerjni.cc
+++ b/java/rocksjni/sst_file_writerjni.cc
@@ -25,27 +25,26 @@
  * Signature: (JJJB)J
  */
 jlong Java_org_rocksdb_SstFileWriter_newSstFileWriter__JJJB(
-    JNIEnv * /*env*/, jclass /*jcls*/, jlong jenvoptions, jlong joptions,
+    JNIEnv* /*env*/, jclass /*jcls*/, jlong jenvoptions, jlong joptions,
     jlong jcomparator_handle, jbyte jcomparator_type) {
-  ROCKSDB_NAMESPACE::Comparator *comparator = nullptr;
+  ROCKSDB_NAMESPACE::Comparator* comparator = nullptr;
   switch (jcomparator_type) {
     // JAVA_COMPARATOR
     case 0x0:
-      comparator = reinterpret_cast<ROCKSDB_NAMESPACE::ComparatorJniCallback *>(
+      comparator = reinterpret_cast<ROCKSDB_NAMESPACE::ComparatorJniCallback*>(
           jcomparator_handle);
       break;
 
     // JAVA_NATIVE_COMPARATOR_WRAPPER
     case 0x1:
       comparator =
-          reinterpret_cast<ROCKSDB_NAMESPACE::Comparator *>(jcomparator_handle);
+          reinterpret_cast<ROCKSDB_NAMESPACE::Comparator*>(jcomparator_handle);
       break;
   }
-  auto *env_options =
-      reinterpret_cast<const ROCKSDB_NAMESPACE::EnvOptions *>(jenvoptions);
-  auto *options =
-      reinterpret_cast<const ROCKSDB_NAMESPACE::Options *>(joptions);
-  ROCKSDB_NAMESPACE::SstFileWriter *sst_file_writer =
+  auto* env_options =
+      reinterpret_cast<const ROCKSDB_NAMESPACE::EnvOptions*>(jenvoptions);
+  auto* options = reinterpret_cast<const ROCKSDB_NAMESPACE::Options*>(joptions);
+  ROCKSDB_NAMESPACE::SstFileWriter* sst_file_writer =
       new ROCKSDB_NAMESPACE::SstFileWriter(*env_options, *options, comparator);
   return GET_CPLUSPLUS_POINTER(sst_file_writer);
 }
@@ -55,15 +54,14 @@ jlong Java_org_rocksdb_SstFileWriter_newSstFileWriter__JJJB(
  * Method:    newSstFileWriter
  * Signature: (JJ)J
  */
-jlong Java_org_rocksdb_SstFileWriter_newSstFileWriter__JJ(JNIEnv * /*env*/,
+jlong Java_org_rocksdb_SstFileWriter_newSstFileWriter__JJ(JNIEnv* /*env*/,
                                                           jclass /*jcls*/,
                                                           jlong jenvoptions,
                                                           jlong joptions) {
-  auto *env_options =
-      reinterpret_cast<const ROCKSDB_NAMESPACE::EnvOptions *>(jenvoptions);
-  auto *options =
-      reinterpret_cast<const ROCKSDB_NAMESPACE::Options *>(joptions);
-  ROCKSDB_NAMESPACE::SstFileWriter *sst_file_writer =
+  auto* env_options =
+      reinterpret_cast<const ROCKSDB_NAMESPACE::EnvOptions*>(jenvoptions);
+  auto* options = reinterpret_cast<const ROCKSDB_NAMESPACE::Options*>(joptions);
+  ROCKSDB_NAMESPACE::SstFileWriter* sst_file_writer =
       new ROCKSDB_NAMESPACE::SstFileWriter(*env_options, *options);
   return GET_CPLUSPLUS_POINTER(sst_file_writer);
 }
@@ -73,15 +71,15 @@ jlong Java_org_rocksdb_SstFileWriter_newSstFileWriter__JJ(JNIEnv * /*env*/,
  * Method:    open
  * Signature: (JLjava/lang/String;)V
  */
-void Java_org_rocksdb_SstFileWriter_open(JNIEnv *env, jclass /*jcls*/,
+void Java_org_rocksdb_SstFileWriter_open(JNIEnv* env, jclass /*jcls*/,
                                          jlong jhandle, jstring jfile_path) {
-  const char *file_path = env->GetStringUTFChars(jfile_path, nullptr);
+  const char* file_path = env->GetStringUTFChars(jfile_path, nullptr);
   if (file_path == nullptr) {
     // exception thrown: OutOfMemoryError
     return;
   }
   ROCKSDB_NAMESPACE::Status s =
-      reinterpret_cast<ROCKSDB_NAMESPACE::SstFileWriter *>(jhandle)->Open(
+      reinterpret_cast<ROCKSDB_NAMESPACE::SstFileWriter*>(jhandle)->Open(
           file_path);
   env->ReleaseStringUTFChars(jfile_path, file_path);
 
@@ -95,14 +93,14 @@ void Java_org_rocksdb_SstFileWriter_open(JNIEnv *env, jclass /*jcls*/,
  * Method:    put
  * Signature: (JJJ)V
  */
-void Java_org_rocksdb_SstFileWriter_put__JJJ(JNIEnv *env, jclass /*jcls*/,
+void Java_org_rocksdb_SstFileWriter_put__JJJ(JNIEnv* env, jclass /*jcls*/,
                                              jlong jhandle, jlong jkey_handle,
                                              jlong jvalue_handle) {
-  auto *key_slice = reinterpret_cast<ROCKSDB_NAMESPACE::Slice *>(jkey_handle);
-  auto *value_slice =
-      reinterpret_cast<ROCKSDB_NAMESPACE::Slice *>(jvalue_handle);
+  auto* key_slice = reinterpret_cast<ROCKSDB_NAMESPACE::Slice*>(jkey_handle);
+  auto* value_slice =
+      reinterpret_cast<ROCKSDB_NAMESPACE::Slice*>(jvalue_handle);
   ROCKSDB_NAMESPACE::Status s =
-      reinterpret_cast<ROCKSDB_NAMESPACE::SstFileWriter *>(jhandle)->Put(
+      reinterpret_cast<ROCKSDB_NAMESPACE::SstFileWriter*>(jhandle)->Put(
           *key_slice, *value_slice);
   if (!s.ok()) {
     ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
@@ -114,28 +112,28 @@ void Java_org_rocksdb_SstFileWriter_put__JJJ(JNIEnv *env, jclass /*jcls*/,
  * Method:    put
  * Signature: (JJJ)V
  */
-void Java_org_rocksdb_SstFileWriter_put__J_3B_3B(JNIEnv *env, jclass /*jcls*/,
+void Java_org_rocksdb_SstFileWriter_put__J_3B_3B(JNIEnv* env, jclass /*jcls*/,
                                                  jlong jhandle, jbyteArray jkey,
                                                  jbyteArray jval) {
-  jbyte *key = env->GetByteArrayElements(jkey, nullptr);
+  jbyte* key = env->GetByteArrayElements(jkey, nullptr);
   if (key == nullptr) {
     // exception thrown: OutOfMemoryError
     return;
   }
-  ROCKSDB_NAMESPACE::Slice key_slice(reinterpret_cast<char *>(key),
+  ROCKSDB_NAMESPACE::Slice key_slice(reinterpret_cast<char*>(key),
                                      env->GetArrayLength(jkey));
 
-  jbyte *value = env->GetByteArrayElements(jval, nullptr);
+  jbyte* value = env->GetByteArrayElements(jval, nullptr);
   if (value == nullptr) {
     // exception thrown: OutOfMemoryError
     env->ReleaseByteArrayElements(jkey, key, JNI_ABORT);
     return;
   }
-  ROCKSDB_NAMESPACE::Slice value_slice(reinterpret_cast<char *>(value),
+  ROCKSDB_NAMESPACE::Slice value_slice(reinterpret_cast<char*>(value),
                                        env->GetArrayLength(jval));
 
   ROCKSDB_NAMESPACE::Status s =
-      reinterpret_cast<ROCKSDB_NAMESPACE::SstFileWriter *>(jhandle)->Put(
+      reinterpret_cast<ROCKSDB_NAMESPACE::SstFileWriter*>(jhandle)->Put(
           key_slice, value_slice);
 
   env->ReleaseByteArrayElements(jkey, key, JNI_ABORT);
@@ -151,15 +149,15 @@ void Java_org_rocksdb_SstFileWriter_put__J_3B_3B(JNIEnv *env, jclass /*jcls*/,
  * Method:    putDirect
  * Signature: (JLjava/nio/ByteBuffer;IILjava/nio/ByteBuffer;II)V
  */
-void Java_org_rocksdb_SstFileWriter_putDirect(JNIEnv *env, jclass /*jcls*/,
+void Java_org_rocksdb_SstFileWriter_putDirect(JNIEnv* env, jclass /*jcls*/,
                                               jlong jdb_handle, jobject jkey,
                                               jint jkey_off, jint jkey_len,
                                               jobject jval, jint jval_off,
                                               jint jval_len) {
-  auto *writer =
-      reinterpret_cast<ROCKSDB_NAMESPACE::SstFileWriter *>(jdb_handle);
-  auto put = [&env, &writer](ROCKSDB_NAMESPACE::Slice &key,
-                             ROCKSDB_NAMESPACE::Slice &value) {
+  auto* writer =
+      reinterpret_cast<ROCKSDB_NAMESPACE::SstFileWriter*>(jdb_handle);
+  auto put = [&env, &writer](ROCKSDB_NAMESPACE::Slice& key,
+                             ROCKSDB_NAMESPACE::Slice& value) {
     ROCKSDB_NAMESPACE::Status s = writer->Put(key, value);
     if (s.ok()) {
       return;
@@ -175,10 +173,10 @@ void Java_org_rocksdb_SstFileWriter_putDirect(JNIEnv *env, jclass /*jcls*/,
  * Method:    fileSize
  * Signature: (J)J
  */
-jlong Java_org_rocksdb_SstFileWriter_fileSize(JNIEnv * /*env*/, jclass /*jcls*/,
+jlong Java_org_rocksdb_SstFileWriter_fileSize(JNIEnv* /*env*/, jclass /*jcls*/,
                                               jlong jdb_handle) {
-  auto *writer =
-      reinterpret_cast<ROCKSDB_NAMESPACE::SstFileWriter *>(jdb_handle);
+  auto* writer =
+      reinterpret_cast<ROCKSDB_NAMESPACE::SstFileWriter*>(jdb_handle);
   return static_cast<jlong>(writer->FileSize());
 }
 
@@ -187,14 +185,14 @@ jlong Java_org_rocksdb_SstFileWriter_fileSize(JNIEnv * /*env*/, jclass /*jcls*/,
  * Method:    merge
  * Signature: (JJJ)V
  */
-void Java_org_rocksdb_SstFileWriter_merge__JJJ(JNIEnv *env, jclass /*jcls*/,
+void Java_org_rocksdb_SstFileWriter_merge__JJJ(JNIEnv* env, jclass /*jcls*/,
                                                jlong jhandle, jlong jkey_handle,
                                                jlong jvalue_handle) {
-  auto *key_slice = reinterpret_cast<ROCKSDB_NAMESPACE::Slice *>(jkey_handle);
-  auto *value_slice =
-      reinterpret_cast<ROCKSDB_NAMESPACE::Slice *>(jvalue_handle);
+  auto* key_slice = reinterpret_cast<ROCKSDB_NAMESPACE::Slice*>(jkey_handle);
+  auto* value_slice =
+      reinterpret_cast<ROCKSDB_NAMESPACE::Slice*>(jvalue_handle);
   ROCKSDB_NAMESPACE::Status s =
-      reinterpret_cast<ROCKSDB_NAMESPACE::SstFileWriter *>(jhandle)->Merge(
+      reinterpret_cast<ROCKSDB_NAMESPACE::SstFileWriter*>(jhandle)->Merge(
           *key_slice, *value_slice);
   if (!s.ok()) {
     ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
@@ -206,29 +204,29 @@ void Java_org_rocksdb_SstFileWriter_merge__JJJ(JNIEnv *env, jclass /*jcls*/,
  * Method:    merge
  * Signature: (J[B[B)V
  */
-void Java_org_rocksdb_SstFileWriter_merge__J_3B_3B(JNIEnv *env, jclass /*jcls*/,
+void Java_org_rocksdb_SstFileWriter_merge__J_3B_3B(JNIEnv* env, jclass /*jcls*/,
                                                    jlong jhandle,
                                                    jbyteArray jkey,
                                                    jbyteArray jval) {
-  jbyte *key = env->GetByteArrayElements(jkey, nullptr);
+  jbyte* key = env->GetByteArrayElements(jkey, nullptr);
   if (key == nullptr) {
     // exception thrown: OutOfMemoryError
     return;
   }
-  ROCKSDB_NAMESPACE::Slice key_slice(reinterpret_cast<char *>(key),
+  ROCKSDB_NAMESPACE::Slice key_slice(reinterpret_cast<char*>(key),
                                      env->GetArrayLength(jkey));
 
-  jbyte *value = env->GetByteArrayElements(jval, nullptr);
+  jbyte* value = env->GetByteArrayElements(jval, nullptr);
   if (value == nullptr) {
     // exception thrown: OutOfMemoryError
     env->ReleaseByteArrayElements(jkey, key, JNI_ABORT);
     return;
   }
-  ROCKSDB_NAMESPACE::Slice value_slice(reinterpret_cast<char *>(value),
+  ROCKSDB_NAMESPACE::Slice value_slice(reinterpret_cast<char*>(value),
                                        env->GetArrayLength(jval));
 
   ROCKSDB_NAMESPACE::Status s =
-      reinterpret_cast<ROCKSDB_NAMESPACE::SstFileWriter *>(jhandle)->Merge(
+      reinterpret_cast<ROCKSDB_NAMESPACE::SstFileWriter*>(jhandle)->Merge(
           key_slice, value_slice);
 
   env->ReleaseByteArrayElements(jkey, key, JNI_ABORT);
@@ -244,19 +242,19 @@ void Java_org_rocksdb_SstFileWriter_merge__J_3B_3B(JNIEnv *env, jclass /*jcls*/,
  * Method:    delete
  * Signature: (JJJ)V
  */
-void Java_org_rocksdb_SstFileWriter_delete__J_3B(JNIEnv *env, jclass /*jcls*/,
+void Java_org_rocksdb_SstFileWriter_delete__J_3B(JNIEnv* env, jclass /*jcls*/,
                                                  jlong jhandle,
                                                  jbyteArray jkey) {
-  jbyte *key = env->GetByteArrayElements(jkey, nullptr);
+  jbyte* key = env->GetByteArrayElements(jkey, nullptr);
   if (key == nullptr) {
     // exception thrown: OutOfMemoryError
     return;
   }
-  ROCKSDB_NAMESPACE::Slice key_slice(reinterpret_cast<char *>(key),
+  ROCKSDB_NAMESPACE::Slice key_slice(reinterpret_cast<char*>(key),
                                      env->GetArrayLength(jkey));
 
   ROCKSDB_NAMESPACE::Status s =
-      reinterpret_cast<ROCKSDB_NAMESPACE::SstFileWriter *>(jhandle)->Delete(
+      reinterpret_cast<ROCKSDB_NAMESPACE::SstFileWriter*>(jhandle)->Delete(
           key_slice);
 
   env->ReleaseByteArrayElements(jkey, key, JNI_ABORT);
@@ -271,12 +269,12 @@ void Java_org_rocksdb_SstFileWriter_delete__J_3B(JNIEnv *env, jclass /*jcls*/,
  * Method:    delete
  * Signature: (JJJ)V
  */
-void Java_org_rocksdb_SstFileWriter_delete__JJ(JNIEnv *env, jclass /*jcls*/,
+void Java_org_rocksdb_SstFileWriter_delete__JJ(JNIEnv* env, jclass /*jcls*/,
                                                jlong jhandle,
                                                jlong jkey_handle) {
-  auto *key_slice = reinterpret_cast<ROCKSDB_NAMESPACE::Slice *>(jkey_handle);
+  auto* key_slice = reinterpret_cast<ROCKSDB_NAMESPACE::Slice*>(jkey_handle);
   ROCKSDB_NAMESPACE::Status s =
-      reinterpret_cast<ROCKSDB_NAMESPACE::SstFileWriter *>(jhandle)->Delete(
+      reinterpret_cast<ROCKSDB_NAMESPACE::SstFileWriter*>(jhandle)->Delete(
           *key_slice);
   if (!s.ok()) {
     ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
@@ -288,10 +286,10 @@ void Java_org_rocksdb_SstFileWriter_delete__JJ(JNIEnv *env, jclass /*jcls*/,
  * Method:    finish
  * Signature: (J)V
  */
-void Java_org_rocksdb_SstFileWriter_finish(JNIEnv *env, jclass /*jcls*/,
+void Java_org_rocksdb_SstFileWriter_finish(JNIEnv* env, jclass /*jcls*/,
                                            jlong jhandle) {
   ROCKSDB_NAMESPACE::Status s =
-      reinterpret_cast<ROCKSDB_NAMESPACE::SstFileWriter *>(jhandle)->Finish();
+      reinterpret_cast<ROCKSDB_NAMESPACE::SstFileWriter*>(jhandle)->Finish();
   if (!s.ok()) {
     ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
   }
@@ -302,8 +300,8 @@ void Java_org_rocksdb_SstFileWriter_finish(JNIEnv *env, jclass /*jcls*/,
  * Method:    disposeInternal
  * Signature: (J)V
  */
-void Java_org_rocksdb_SstFileWriter_disposeInternalJni(JNIEnv * /*env*/,
+void Java_org_rocksdb_SstFileWriter_disposeInternalJni(JNIEnv* /*env*/,
                                                        jclass /*jobj*/,
                                                        jlong jhandle) {
-  delete reinterpret_cast<ROCKSDB_NAMESPACE::SstFileWriter *>(jhandle);
+  delete reinterpret_cast<ROCKSDB_NAMESPACE::SstFileWriter*>(jhandle);
 }
diff --git a/java/rocksjni/table.cc b/java/rocksjni/table.cc
index 63eb3feca324..10747212fd1f 100644
--- a/java/rocksjni/table.cc
+++ b/java/rocksjni/table.cc
@@ -23,7 +23,7 @@
  * Signature: (IIDIIBZZ)J
  */
 jlong Java_org_rocksdb_PlainTableConfig_newTableFactoryHandle(
-    JNIEnv * /*env*/, jclass /*jcls*/, jint jkey_size, jint jbloom_bits_per_key,
+    JNIEnv* /*env*/, jclass /*jcls*/, jint jkey_size, jint jbloom_bits_per_key,
     jdouble jhash_table_ratio, jint jindex_sparseness, jint jhuge_page_tlb_size,
     jbyte jencoding_type, jboolean jfull_scan_mode,
     jboolean jstore_index_in_file) {
@@ -48,7 +48,7 @@ jlong Java_org_rocksdb_PlainTableConfig_newTableFactoryHandle(
  * Signature: (ZZZZBBDBZJJJIIIJZZZJZZIIZZJJBJI)J
  */
 jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle(
-    JNIEnv *, jclass, jboolean jcache_index_and_filter_blocks,
+    JNIEnv*, jclass, jboolean jcache_index_and_filter_blocks,
     jboolean jcache_index_and_filter_blocks_with_high_priority,
     jboolean jpin_l0_filter_and_index_blocks_in_cache,
     jboolean jpin_top_level_index_and_filter, jbyte jindex_type_value,
@@ -89,8 +89,8 @@ jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle(
     options.block_cache = nullptr;
   } else {
     if (jblock_cache_handle > 0) {
-      std::shared_ptr<ROCKSDB_NAMESPACE::Cache> *pCache =
-          reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::Cache> *>(
+      std::shared_ptr<ROCKSDB_NAMESPACE::Cache>* pCache =
+          reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::Cache>*>(
               jblock_cache_handle);
       options.block_cache = *pCache;
     } else if (jblock_cache_size >= 0) {
@@ -108,8 +108,8 @@ jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle(
     }
   }
   if (jpersistent_cache_handle > 0) {
-    std::shared_ptr<ROCKSDB_NAMESPACE::PersistentCache> *pCache =
-        reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::PersistentCache> *>(
+    std::shared_ptr<ROCKSDB_NAMESPACE::PersistentCache>* pCache =
+        reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::PersistentCache>*>(
             jpersistent_cache_handle);
     options.persistent_cache = *pCache;
   }
@@ -124,8 +124,8 @@ jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle(
       static_cast<bool>(joptimize_filters_for_memory);
   options.use_delta_encoding = static_cast<bool>(juse_delta_encoding);
   if (jfilter_policy_handle > 0) {
-    std::shared_ptr<ROCKSDB_NAMESPACE::FilterPolicy> *pFilterPolicy =
-        reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::FilterPolicy> *>(
+    std::shared_ptr<ROCKSDB_NAMESPACE::FilterPolicy>* pFilterPolicy =
+        reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::FilterPolicy>*>(
             jfilter_policy_handle);
     options.filter_policy = *pFilterPolicy;
   }
diff --git a/java/rocksjni/table_properties_collector_factory.cc b/java/rocksjni/table_properties_collector_factory.cc
index 60e1df6e8b13..365a50d7eb5a 100644
--- a/java/rocksjni/table_properties_collector_factory.cc
+++ b/java/rocksjni/table_properties_collector_factory.cc
@@ -17,9 +17,9 @@
  * Signature: (JJD)J
  */
 jlong Java_org_rocksdb_TablePropertiesCollectorFactory_newCompactOnDeletionCollectorFactory(
-    JNIEnv *, jclass, jlong sliding_window_size, jlong deletion_trigger,
+    JNIEnv*, jclass, jlong sliding_window_size, jlong deletion_trigger,
     jdouble deletion_ratio) {
-  auto *wrapper = new TablePropertiesCollectorFactoriesJniWrapper();
+  auto* wrapper = new TablePropertiesCollectorFactoriesJniWrapper();
   wrapper->table_properties_collector_factories =
       ROCKSDB_NAMESPACE::NewCompactOnDeletionCollectorFactory(
           sliding_window_size, deletion_trigger, deletion_ratio);
@@ -32,8 +32,8 @@ jlong Java_org_rocksdb_TablePropertiesCollectorFactory_newCompactOnDeletionColle
  * Signature: (J)J
  */
 void Java_org_rocksdb_TablePropertiesCollectorFactory_deleteCompactOnDeletionCollectorFactory(
-    JNIEnv *, jclass, jlong jhandle) {
+    JNIEnv*, jclass, jlong jhandle) {
   auto instance =
-      reinterpret_cast<TablePropertiesCollectorFactoriesJniWrapper *>(jhandle);
+      reinterpret_cast<TablePropertiesCollectorFactoriesJniWrapper*>(jhandle);
   delete instance;
 }
diff --git a/java/rocksjni/testable_event_listener.cc b/java/rocksjni/testable_event_listener.cc
index 483ade160561..febf8cbd1bb7 100644
--- a/java/rocksjni/testable_event_listener.cc
+++ b/java/rocksjni/testable_event_listener.cc
@@ -78,9 +78,9 @@ static TableProperties newTablePropertiesForTest() {
  * Signature: (J)V
  */
 void Java_org_rocksdb_test_TestableEventListener_invokeAllCallbacks(
-    JNIEnv *, jclass, jlong jhandle) {
-  const auto &el =
-      *reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::EventListener> *>(
+    JNIEnv*, jclass, jlong jhandle) {
+  const auto& el =
+      *reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::EventListener>*>(
           jhandle);
 
   TableProperties table_properties = newTablePropertiesForTest();
@@ -127,7 +127,7 @@ void Java_org_rocksdb_test_TestableEventListener_invokeAllCallbacks(
   compaction_job_info.output_file_infos = {};
   compaction_job_info.table_properties = {
       {"tableProperties", std::shared_ptr<TableProperties>(
-                              &table_properties, [](TableProperties *) {})}};
+                              &table_properties, [](TableProperties*) {})}};
   compaction_job_info.compaction_reason = CompactionReason::kFlush;
   compaction_job_info.compression = CompressionType::kSnappyCompression;
 
diff --git a/microbench/ribbon_bench.cc b/microbench/ribbon_bench.cc
index d0fb2ec9ab2e..58cd710a4c70 100644
--- a/microbench/ribbon_bench.cc
+++ b/microbench/ribbon_bench.cc
@@ -32,7 +32,7 @@ struct KeyMaker {
     // To get range [avg_size - 2, avg_size + 2]
     // use range [smallest_size, smallest_size + 4]
     len += FastRange32((val_num >> 5) * 1234567891, 5);
-    char *data = buf_.get() + start;
+    char* data = buf_.get() + start;
     // Populate key data such that all data makes it into a key of at
     // least 8 bytes. We also don't want all the within-filter key
     // variance confined to a contiguous 32 bits, because then a 32 bit
@@ -51,7 +51,7 @@ struct KeyMaker {
 // 1. filter config bits_per_key
 // 2. average data key length
 // 3. data entry number
-static void CustomArguments(benchmark::internal::Benchmark *b) {
+static void CustomArguments(benchmark::internal::Benchmark* b) {
   const auto kImplCount =
       static_cast<int>(BloomLikeFilterPolicy::GetAllFixedImpls().size());
   for (int filter_impl = 0; filter_impl < kImplCount; ++filter_impl) {
@@ -66,7 +66,7 @@ static void CustomArguments(benchmark::internal::Benchmark *b) {
   b->ArgNames({"filter_impl", "bits_per_key", "key_len_avg", "entry_num"});
 }
 
-static void FilterBuild(benchmark::State &state) {
+static void FilterBuild(benchmark::State& state) {
   // setup data
   auto filter = BloomLikeFilterPolicy::Create(
       BloomLikeFilterPolicy::GetAllFixedImpls().at(state.range(0)),
@@ -89,7 +89,7 @@ static void FilterBuild(benchmark::State &state) {
 }
 BENCHMARK(FilterBuild)->Apply(CustomArguments);
 
-static void FilterQueryPositive(benchmark::State &state) {
+static void FilterQueryPositive(benchmark::State& state) {
   // setup data
   auto filter = BloomLikeFilterPolicy::Create(
       BloomLikeFilterPolicy::GetAllFixedImpls().at(state.range(0)),
@@ -117,7 +117,7 @@ static void FilterQueryPositive(benchmark::State &state) {
 }
 BENCHMARK(FilterQueryPositive)->Apply(CustomArguments);
 
-static void FilterQueryNegative(benchmark::State &state) {
+static void FilterQueryNegative(benchmark::State& state) {
   // setup data
   auto filter = BloomLikeFilterPolicy::Create(
       BloomLikeFilterPolicy::GetAllFixedImpls().at(state.range(0)),
diff --git a/monitoring/perf_context.cc b/monitoring/perf_context.cc
index a38f6ec01805..59f5f19f66df 100644
--- a/monitoring/perf_context.cc
+++ b/monitoring/perf_context.cc
@@ -259,10 +259,10 @@ void PerfContext::Reset() {
 #endif
 }
 
-void PerfContextByLevel::Reset(){
+void PerfContextByLevel::Reset() {
 #ifndef NPERF_CONTEXT
 #define EMIT_FIELDS(x) x = 0;
-    DEF_PERF_CONTEXT_LEVEL_METRICS(EMIT_FIELDS)
+  DEF_PERF_CONTEXT_LEVEL_METRICS(EMIT_FIELDS)
 #undef EMIT_FIELDS
 #endif
 }
diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc
index c752b2401718..135a4461847c 100644
--- a/options/options_settable_test.cc
+++ b/options/options_settable_test.cc
@@ -508,7 +508,7 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) {
   // ColumnFamilyOptions.
   const OffsetGap kColumnFamilyOptionsExcluded = {
       {offsetof(struct ColumnFamilyOptions, inplace_callback),
-       sizeof(UpdateStatus(*)(char*, uint32_t*, Slice, std::string*))},
+       sizeof(UpdateStatus (*)(char*, uint32_t*, Slice, std::string*))},
       {offsetof(struct ColumnFamilyOptions,
                 memtable_insert_with_hint_prefix_extractor),
        sizeof(std::shared_ptr<const SliceTransform>)},
diff --git a/port/jemalloc_helper.h b/port/jemalloc_helper.h
index d89d0b8c38f2..1fca386c01c3 100644
--- a/port/jemalloc_helper.h
+++ b/port/jemalloc_helper.h
@@ -59,33 +59,31 @@ static inline bool HasJemalloc() { return true; }
 
 // Declare non-standard jemalloc APIs as weak symbols. We can null-check these
 // symbols to detect whether jemalloc is linked with the binary.
-extern "C" JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_NOTHROW *
+extern "C" JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_NOTHROW*
 mallocx(size_t, int) JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1)
     __attribute__((__weak__));
-extern "C" JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_NOTHROW *
-rallocx(void *, size_t, int) JEMALLOC_ALLOC_SIZE(2) __attribute__((__weak__));
-extern "C" size_t JEMALLOC_NOTHROW xallocx(void *, size_t, size_t, int)
+extern "C" JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_NOTHROW*
+rallocx(void*, size_t, int) JEMALLOC_ALLOC_SIZE(2) __attribute__((__weak__));
+extern "C" size_t JEMALLOC_NOTHROW xallocx(void*, size_t, size_t, int)
     __attribute__((__weak__));
-extern "C" size_t JEMALLOC_NOTHROW sallocx(const void *, int)
-    JEMALLOC_ATTR(pure) __attribute__((__weak__));
-extern "C" void JEMALLOC_NOTHROW dallocx(void *, int) __attribute__((__weak__));
-extern "C" void JEMALLOC_NOTHROW sdallocx(void *, size_t, int)
+extern "C" size_t JEMALLOC_NOTHROW sallocx(const void*, int) JEMALLOC_ATTR(pure)
+    __attribute__((__weak__));
+extern "C" void JEMALLOC_NOTHROW dallocx(void*, int) __attribute__((__weak__));
+extern "C" void JEMALLOC_NOTHROW sdallocx(void*, size_t, int)
     __attribute__((__weak__));
 extern "C" size_t JEMALLOC_NOTHROW nallocx(size_t, int) JEMALLOC_ATTR(pure)
     __attribute__((__weak__));
-extern "C" int JEMALLOC_NOTHROW mallctl(const char *, void *, size_t *, void *,
+extern "C" int JEMALLOC_NOTHROW mallctl(const char*, void*, size_t*, void*,
                                         size_t) __attribute__((__weak__));
-extern "C" int JEMALLOC_NOTHROW mallctlnametomib(const char *, size_t *,
-                                                 size_t *)
-    __attribute__((__weak__));
-extern "C" int JEMALLOC_NOTHROW mallctlbymib(const size_t *, size_t, void *,
-                                             size_t *, void *, size_t)
+extern "C" int JEMALLOC_NOTHROW mallctlnametomib(const char*, size_t*, size_t*)
     __attribute__((__weak__));
-extern "C" void JEMALLOC_NOTHROW
-malloc_stats_print(void (*)(void *, const char *), void *, const char *)
+extern "C" int JEMALLOC_NOTHROW mallctlbymib(const size_t*, size_t, void*,
+                                             size_t*, void*, size_t)
     __attribute__((__weak__));
+extern "C" void JEMALLOC_NOTHROW malloc_stats_print(
+    void (*)(void*, const char*), void*, const char*) __attribute__((__weak__));
 extern "C" size_t JEMALLOC_NOTHROW
-malloc_usable_size(JEMALLOC_USABLE_SIZE_CONST void *) JEMALLOC_CXX_THROW
+malloc_usable_size(JEMALLOC_USABLE_SIZE_CONST void*) JEMALLOC_CXX_THROW
     __attribute__((__weak__));
 
 // Check if Jemalloc is linked with the binary. Note the main program might be
diff --git a/table/block_based/block_test.cc b/table/block_based/block_test.cc
index 27a5bb7eb066..e93ac5979a4f 100644
--- a/table/block_based/block_test.cc
+++ b/table/block_based/block_test.cc
@@ -33,10 +33,10 @@
 namespace ROCKSDB_NAMESPACE {
 
 std::string GenerateInternalKey(int primary_key, int secondary_key,
-                                int padding_size, Random *rnd,
+                                int padding_size, Random* rnd,
                                 size_t ts_sz = 0) {
   char buf[50];
-  char *p = &buf[0];
+  char* p = &buf[0];
   snprintf(buf, sizeof(buf), "%6d%4d", primary_key, secondary_key);
   std::string k(p);
   if (padding_size) {
@@ -55,8 +55,8 @@ std::string GenerateInternalKey(int primary_key, int secondary_key,
 // Generate random key value pairs.
 // The generated key will be sorted. You can tune the parameters to generated
 // different kinds of test key/value pairs for different scenario.
-void GenerateRandomKVs(std::vector<std::string> *keys,
-                       std::vector<std::string> *values, const int from,
+void GenerateRandomKVs(std::vector<std::string>* keys,
+                       std::vector<std::string>* values, const int from,
                        const int len, const int step = 1,
                        const int padding_size = 0,
                        const int keys_share_prefix = 1, size_t ts_sz = 0) {
@@ -133,7 +133,7 @@ TEST_P(BlockTest, SimpleTest) {
 
   // read contents of block sequentially
   int count = 0;
-  InternalIterator *iter = reader.NewDataIterator(
+  InternalIterator* iter = reader.NewDataIterator(
       options.comparator, kDisableGlobalSequenceNumber, nullptr /* iter */,
       nullptr /* stats */, false /* block_contents_pinned */,
       shouldPersistUDT());
@@ -169,9 +169,9 @@ TEST_P(BlockTest, SimpleTest) {
 
 // return the block contents
 BlockContents GetBlockContents(
-    std::unique_ptr<BlockBuilder> *builder,
-    const std::vector<std::string> &keys,
-    const std::vector<std::string> &values, bool key_use_delta_encoding,
+    std::unique_ptr<BlockBuilder>* builder,
+    const std::vector<std::string>& keys,
+    const std::vector<std::string>& values, bool key_use_delta_encoding,
     size_t ts_sz, bool should_persist_udt, const int /*prefix_group_size*/ = 1,
     BlockBasedTableOptions::DataBlockIndexType dblock_index_type =
         BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinarySearch) {
@@ -194,8 +194,8 @@ BlockContents GetBlockContents(
 }
 
 void CheckBlockContents(BlockContents contents, const int max_key,
-                        const std::vector<std::string> &keys,
-                        const std::vector<std::string> &values,
+                        const std::vector<std::string>& keys,
+                        const std::vector<std::string>& values,
                         bool is_udt_enabled, bool should_persist_udt) {
   const size_t prefix_size = 6;
   // create block reader
@@ -356,8 +356,8 @@ class BlockReadAmpBitmapSlowAndAccurate {
 TEST_F(BlockTest, BlockReadAmpBitmap) {
   uint32_t pin_offset = 0;
   SyncPoint::GetInstance()->SetCallBack(
-      "BlockReadAmpBitmap:rnd", [&pin_offset](void *arg) {
-        pin_offset = *(static_cast<uint32_t *>(arg));
+      "BlockReadAmpBitmap:rnd", [&pin_offset](void* arg) {
+        pin_offset = *(static_cast<uint32_t*>(arg));
       });
   SyncPoint::GetInstance()->EnableProcessing();
   std::vector<size_t> block_sizes = {
@@ -414,7 +414,7 @@ TEST_F(BlockTest, BlockReadAmpBitmap) {
 
     for (size_t i = 0; i < random_entries.size(); i++) {
       read_amp_slow_and_accurate.ResetCheckSequence();
-      auto &current_entry = random_entries[rnd.Next() % random_entries.size()];
+      auto& current_entry = random_entries[rnd.Next() % random_entries.size()];
 
       read_amp_bitmap.Mark(static_cast<uint32_t>(current_entry.first),
                            static_cast<uint32_t>(current_entry.second));
@@ -465,7 +465,7 @@ TEST_F(BlockTest, BlockWithReadAmpBitmap) {
 
     // read contents of block sequentially
     size_t read_bytes = 0;
-    DataBlockIter *iter = reader.NewDataIterator(
+    DataBlockIter* iter = reader.NewDataIterator(
         options.comparator, kDisableGlobalSequenceNumber, nullptr, stats.get());
     for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
       iter->value();
@@ -496,7 +496,7 @@ TEST_F(BlockTest, BlockWithReadAmpBitmap) {
     Block reader(std::move(contents), kBytesPerBit, stats.get());
 
     size_t read_bytes = 0;
-    DataBlockIter *iter = reader.NewDataIterator(
+    DataBlockIter* iter = reader.NewDataIterator(
         options.comparator, kDisableGlobalSequenceNumber, nullptr, stats.get());
     for (int i = 0; i < num_records; i++) {
       Slice k(keys[i]);
@@ -530,7 +530,7 @@ TEST_F(BlockTest, BlockWithReadAmpBitmap) {
     Block reader(std::move(contents), kBytesPerBit, stats.get());
 
     size_t read_bytes = 0;
-    DataBlockIter *iter = reader.NewDataIterator(
+    DataBlockIter* iter = reader.NewDataIterator(
         options.comparator, kDisableGlobalSequenceNumber, nullptr, stats.get());
     std::unordered_set<int> read_keys;
     for (int i = 0; i < num_records; i++) {
@@ -595,9 +595,9 @@ class IndexBlockTest
 };
 
 // Similar to GenerateRandomKVs but for index block contents.
-void GenerateRandomIndexEntries(std::vector<std::string> *separators,
-                                std::vector<BlockHandle> *block_handles,
-                                std::vector<std::string> *first_keys,
+void GenerateRandomIndexEntries(std::vector<std::string>* separators,
+                                std::vector<BlockHandle>* block_handles,
+                                std::vector<std::string>* first_keys,
                                 const int len, size_t ts_sz = 0,
                                 bool zero_seqno = false) {
   Random rnd(42);
@@ -689,10 +689,10 @@ TEST_P(IndexBlockTest, IndexValueEncodingTest) {
   Block reader(std::move(contents));
 
   const bool kTotalOrderSeek = true;
-  IndexBlockIter *kNullIter = nullptr;
-  Statistics *kNullStats = nullptr;
+  IndexBlockIter* kNullIter = nullptr;
+  Statistics* kNullStats = nullptr;
   // read contents of block sequentially
-  InternalIteratorBase<IndexValue> *iter = reader.NewIndexIterator(
+  InternalIteratorBase<IndexValue>* iter = reader.NewIndexIterator(
       options.comparator, kDisableGlobalSequenceNumber, kNullIter, kNullStats,
       kTotalOrderSeek, includeFirstKey(), keyIncludesSeq(),
       !useValueDeltaEncoding(), false /* block_contents_pinned */,
@@ -764,8 +764,8 @@ class BlockPerKVChecksumTest : public DBTestBase {
       : DBTestBase("block_per_kv_checksum", /*env_do_fsync=*/false) {}
 
   template <typename TBlockIter>
-  void TestIterateForward(std::unique_ptr<TBlockIter> &biter,
-                          size_t &verification_count) {
+  void TestIterateForward(std::unique_ptr<TBlockIter>& biter,
+                          size_t& verification_count) {
     while (biter->Valid()) {
       verification_count = 0;
       biter->Next();
@@ -776,8 +776,8 @@ class BlockPerKVChecksumTest : public DBTestBase {
   }
 
   template <typename TBlockIter>
-  void TestIterateBackward(std::unique_ptr<TBlockIter> &biter,
-                           size_t &verification_count) {
+  void TestIterateBackward(std::unique_ptr<TBlockIter>& biter,
+                           size_t& verification_count) {
     while (biter->Valid()) {
       verification_count = 0;
       biter->Prev();
@@ -788,8 +788,8 @@ class BlockPerKVChecksumTest : public DBTestBase {
   }
 
   template <typename TBlockIter>
-  void TestSeekToFirst(std::unique_ptr<TBlockIter> &biter,
-                       size_t &verification_count) {
+  void TestSeekToFirst(std::unique_ptr<TBlockIter>& biter,
+                       size_t& verification_count) {
     verification_count = 0;
     biter->SeekToFirst();
     ASSERT_GE(verification_count, 1);
@@ -797,8 +797,8 @@ class BlockPerKVChecksumTest : public DBTestBase {
   }
 
   template <typename TBlockIter>
-  void TestSeekToLast(std::unique_ptr<TBlockIter> &biter,
-                      size_t &verification_count) {
+  void TestSeekToLast(std::unique_ptr<TBlockIter>& biter,
+                      size_t& verification_count) {
     verification_count = 0;
     biter->SeekToLast();
     ASSERT_GE(verification_count, 1);
@@ -806,8 +806,8 @@ class BlockPerKVChecksumTest : public DBTestBase {
   }
 
   template <typename TBlockIter>
-  void TestSeekForPrev(std::unique_ptr<TBlockIter> &biter,
-                       size_t &verification_count, std::string k) {
+  void TestSeekForPrev(std::unique_ptr<TBlockIter>& biter,
+                       size_t& verification_count, std::string k) {
     verification_count = 0;
     biter->SeekForPrev(k);
     ASSERT_GE(verification_count, 1);
@@ -815,7 +815,7 @@ class BlockPerKVChecksumTest : public DBTestBase {
   }
 
   template <typename TBlockIter>
-  void TestSeek(std::unique_ptr<TBlockIter> &biter, size_t &verification_count,
+  void TestSeek(std::unique_ptr<TBlockIter>& biter, size_t& verification_count,
                 std::string k) {
     verification_count = 0;
     biter->Seek(k);
@@ -823,8 +823,8 @@ class BlockPerKVChecksumTest : public DBTestBase {
     TestIterateForward(biter, verification_count);
   }
 
-  bool VerifyChecksum(uint32_t checksum_len, const char *checksum_ptr,
-                      const Slice &key, const Slice &val) {
+  bool VerifyChecksum(uint32_t checksum_len, const char* checksum_ptr,
+                      const Slice& key, const Slice& val) {
     if (!checksum_len) {
       return checksum_ptr == nullptr;
     }
@@ -834,11 +834,11 @@ class BlockPerKVChecksumTest : public DBTestBase {
 };
 
 namespace {
-const BlockBasedTableOptions *kTableOptions() {
+const BlockBasedTableOptions* kTableOptions() {
   static BlockBasedTableOptions opts{};
   return &opts;
 }
-Decompressor *kDecompressor() {
+Decompressor* kDecompressor() {
   static auto mgr = GetBuiltinV2CompressionManager();
   static auto decomp = mgr->GetDecompressor();
   return decomp.get();
@@ -1056,7 +1056,7 @@ class DataBlockKVChecksumTest
   bool GetUseDeltaEncoding() const { return std::get<3>(GetParam()); }
 
   std::unique_ptr<Block_kData> GenerateDataBlock(
-      std::vector<std::string> &keys, std::vector<std::string> &values,
+      std::vector<std::string>& keys, std::vector<std::string>& values,
       int num_record) {
     BlockCreateContext create_context{
         kTableOptions(), nullptr /* statistics */, nullptr /* ioptions */,
@@ -1089,9 +1089,9 @@ INSTANTIATE_TEST_CASE_P(
         ::testing::Values(0, 1, 2, 4, 8) /* protection_bytes_per_key */,
         ::testing::Values(1, 2, 3, 8, 16) /* restart_interval */,
         ::testing::Values(false, true)) /* delta_encoding */,
-    [](const testing::TestParamInfo<std::tuple<
-           BlockBasedTableOptions::DataBlockIndexType, uint8_t, uint32_t, bool>>
-           &args) {
+    [](const testing::TestParamInfo<
+        std::tuple<BlockBasedTableOptions::DataBlockIndexType, uint8_t,
+                   uint32_t, bool>>& args) {
       std::ostringstream oss;
       oss << GetDataBlockIndexTypeStr(std::get<0>(args.param))
           << "ProtectionPerKey" << std::to_string(std::get<1>(args.param))
@@ -1114,7 +1114,7 @@ TEST_P(DataBlockKVChecksumTest, ChecksumConstructionAndVerification) {
     std::unique_ptr<Block_kData> data_block =
         GenerateDataBlock(keys, values, kNumRecords);
 
-    const char *checksum_ptr = data_block->TEST_GetKVChecksum();
+    const char* checksum_ptr = data_block->TEST_GetKVChecksum();
     // Check checksum of correct length is generated
     for (int i = 0; i < kNumRecords; i++) {
       ASSERT_TRUE(VerifyChecksum(protection_bytes_per_key,
@@ -1132,8 +1132,8 @@ TEST_P(DataBlockKVChecksumTest, ChecksumConstructionAndVerification) {
     // that case (see Block::VerifyChecksum()).
     SyncPoint::GetInstance()->SetCallBack(
         "Block::VerifyChecksum::checksum_len",
-        [&verification_count, protection_bytes_per_key](void *checksum_len) {
-          ASSERT_EQ((*static_cast<uint8_t *>(checksum_len)),
+        [&verification_count, protection_bytes_per_key](void* checksum_len) {
+          ASSERT_EQ((*static_cast<uint8_t*>(checksum_len)),
                     protection_bytes_per_key);
           ++verification_count;
         });
@@ -1177,9 +1177,9 @@ class IndexBlockKVChecksumTest
   bool IncludeFirstKey() const { return std::get<4>(GetParam()); }
 
   std::unique_ptr<Block_kIndex> GenerateIndexBlock(
-      std::vector<std::string> &separators,
-      std::vector<BlockHandle> &block_handles,
-      std::vector<std::string> &first_keys, int num_record) {
+      std::vector<std::string>& separators,
+      std::vector<BlockHandle>& block_handles,
+      std::vector<std::string>& first_keys, int num_record) {
     Options options = Options();
     uint8_t protection_bytes_per_key = GetChecksumLen();
     BlockCreateContext create_context{
@@ -1235,7 +1235,7 @@ INSTANTIATE_TEST_CASE_P(
         ::testing::Values(true, false), ::testing::Values(true, false)),
     [](const testing::TestParamInfo<
         std::tuple<BlockBasedTableOptions::DataBlockIndexType, uint8_t,
-                   uint32_t, bool, bool>> &args) {
+                   uint32_t, bool, bool>>& args) {
       std::ostringstream oss;
       oss << GetDataBlockIndexTypeStr(std::get<0>(args.param)) << "ProtBytes"
           << std::to_string(std::get<1>(args.param)) << "RestartInterval"
@@ -1264,8 +1264,8 @@ TEST_P(IndexBlockKVChecksumTest, ChecksumConstructionAndVerification) {
       SyncPoint::GetInstance()->DisableProcessing();
       std::unique_ptr<Block_kIndex> index_block = GenerateIndexBlock(
           separators, block_handles, first_keys, kNumRecords);
-      IndexBlockIter *kNullIter = nullptr;
-      Statistics *kNullStats = nullptr;
+      IndexBlockIter* kNullIter = nullptr;
+      Statistics* kNullStats = nullptr;
       // read contents of block sequentially
       std::unique_ptr<IndexBlockIter> biter{index_block->NewIndexIterator(
           options.comparator, seqno, kNullIter, kNullStats,
@@ -1276,7 +1276,7 @@ TEST_P(IndexBlockKVChecksumTest, ChecksumConstructionAndVerification) {
           true /* user_defined_timestamps_persisted */,
           nullptr /* prefix_index */)};
       biter->SeekToFirst();
-      const char *checksum_ptr = index_block->TEST_GetKVChecksum();
+      const char* checksum_ptr = index_block->TEST_GetKVChecksum();
       // Check checksum of correct length is generated
       for (int i = 0; i < kNumRecords; i++) {
         // Obtaining the actual content written as value to index block is not
@@ -1296,8 +1296,8 @@ TEST_P(IndexBlockKVChecksumTest, ChecksumConstructionAndVerification) {
       // assert checking on checksum_len here.
       SyncPoint::GetInstance()->SetCallBack(
           "Block::VerifyChecksum::checksum_len",
-          [&verification_count, protection_bytes_per_key](void *checksum_len) {
-            ASSERT_EQ((*static_cast<uint8_t *>(checksum_len)),
+          [&verification_count, protection_bytes_per_key](void* checksum_len) {
+            ASSERT_EQ((*static_cast<uint8_t*>(checksum_len)),
                       protection_bytes_per_key);
             ++verification_count;
           });
@@ -1320,7 +1320,7 @@ class MetaIndexBlockKVChecksumTest
   uint32_t GetRestartInterval() const { return 1; }
 
   std::unique_ptr<Block_kMetaIndex> GenerateMetaIndexBlock(
-      std::vector<std::string> &keys, std::vector<std::string> &values,
+      std::vector<std::string>& keys, std::vector<std::string>& values,
       int num_record) {
     Options options = Options();
     uint8_t protection_bytes_per_key = GetChecksumLen();
@@ -1346,7 +1346,7 @@ class MetaIndexBlockKVChecksumTest
 
 INSTANTIATE_TEST_CASE_P(P, MetaIndexBlockKVChecksumTest,
                         ::testing::Values(0, 1, 2, 4, 8),
-                        [](const testing::TestParamInfo<uint8_t> &args) {
+                        [](const testing::TestParamInfo<uint8_t>& args) {
                           std::ostringstream oss;
                           oss << "ProtBytes" << std::to_string(args.param);
                           return oss.str();
@@ -1368,7 +1368,7 @@ TEST_P(MetaIndexBlockKVChecksumTest, ChecksumConstructionAndVerification) {
     SyncPoint::GetInstance()->DisableProcessing();
     std::unique_ptr<Block_kMetaIndex> meta_block =
         GenerateMetaIndexBlock(keys, values, kNumRecords);
-    const char *checksum_ptr = meta_block->TEST_GetKVChecksum();
+    const char* checksum_ptr = meta_block->TEST_GetKVChecksum();
     // Check checksum of correct length is generated
     for (int i = 0; i < kNumRecords; i++) {
       ASSERT_TRUE(VerifyChecksum(protection_bytes_per_key,
@@ -1383,8 +1383,8 @@ TEST_P(MetaIndexBlockKVChecksumTest, ChecksumConstructionAndVerification) {
     // checking on checksum_len here.
     SyncPoint::GetInstance()->SetCallBack(
         "Block::VerifyChecksum::checksum_len",
-        [&verification_count, protection_bytes_per_key](void *checksum_len) {
-          ASSERT_EQ((*static_cast<uint8_t *>(checksum_len)),
+        [&verification_count, protection_bytes_per_key](void* checksum_len) {
+          ASSERT_EQ((*static_cast<uint8_t*>(checksum_len)),
                     protection_bytes_per_key);
           ++verification_count;
         });
@@ -1404,7 +1404,7 @@ class DataBlockKVChecksumCorruptionTest : public DataBlockKVChecksumTest {
   DataBlockKVChecksumCorruptionTest() = default;
 
   std::unique_ptr<DataBlockIter> GenerateDataBlockIter(
-      std::vector<std::string> &keys, std::vector<std::string> &values,
+      std::vector<std::string>& keys, std::vector<std::string>& values,
       int num_record) {
     // During Block construction, we may create block iter to initialize per kv
     // checksum. Disable syncpoint that may be created for block iter methods.
@@ -1430,15 +1430,15 @@ TEST_P(DataBlockKVChecksumCorruptionTest, CorruptEntry) {
     GenerateRandomKVs(&keys, &values, 0, kNumRecords + 1, 1 /* step */,
                       24 /* padding_size */);
     SyncPoint::GetInstance()->SetCallBack(
-        "BlockIter::UpdateKey::value", [](void *arg) {
-          char *value = static_cast<char *>(arg);
+        "BlockIter::UpdateKey::value", [](void* arg) {
+          char* value = static_cast<char*>(arg);
           // values generated by GenerateRandomKVs are of length 100
           ++value[10];
         });
 
     // Purely for reducing the number of lines of code.
     typedef std::unique_ptr<DataBlockIter> IterPtr;
-    typedef void(IterAPI)(IterPtr & iter, std::string &);
+    typedef void(IterAPI)(IterPtr & iter, std::string&);
 
     std::string seek_key = keys[kNumRecords / 2];
     auto test_seek = [&](IterAPI iter_api) {
@@ -1449,14 +1449,14 @@ TEST_P(DataBlockKVChecksumCorruptionTest, CorruptEntry) {
       ASSERT_TRUE(biter->status().IsCorruption());
     };
 
-    test_seek([](IterPtr &iter, std::string &) { iter->SeekToFirst(); });
-    test_seek([](IterPtr &iter, std::string &) { iter->SeekToLast(); });
-    test_seek([](IterPtr &iter, std::string &k) { iter->Seek(k); });
-    test_seek([](IterPtr &iter, std::string &k) { iter->SeekForPrev(k); });
-    test_seek([](IterPtr &iter, std::string &k) { iter->SeekForGet(k); });
+    test_seek([](IterPtr& iter, std::string&) { iter->SeekToFirst(); });
+    test_seek([](IterPtr& iter, std::string&) { iter->SeekToLast(); });
+    test_seek([](IterPtr& iter, std::string& k) { iter->Seek(k); });
+    test_seek([](IterPtr& iter, std::string& k) { iter->SeekForPrev(k); });
+    test_seek([](IterPtr& iter, std::string& k) { iter->SeekForGet(k); });
 
     typedef void (DataBlockIter::*IterStepAPI)();
-    auto test_step = [&](IterStepAPI iter_api, std::string &k) {
+    auto test_step = [&](IterStepAPI iter_api, std::string& k) {
       IterPtr biter = GenerateDataBlockIter(keys, values, kNumRecords);
       SyncPoint::GetInstance()->DisableProcessing();
       biter->Seek(k);
@@ -1485,9 +1485,9 @@ INSTANTIATE_TEST_CASE_P(
         ::testing::Values(4, 8) /* block_protection_bytes_per_key */,
         ::testing::Values(1, 3, 8, 16) /* restart_interval */,
         ::testing::Values(false, true)),
-    [](const testing::TestParamInfo<std::tuple<
-           BlockBasedTableOptions::DataBlockIndexType, uint8_t, uint32_t, bool>>
-           &args) {
+    [](const testing::TestParamInfo<
+        std::tuple<BlockBasedTableOptions::DataBlockIndexType, uint8_t,
+                   uint32_t, bool>>& args) {
       std::ostringstream oss;
       oss << GetDataBlockIndexTypeStr(std::get<0>(args.param)) << "ProtBytes"
           << std::to_string(std::get<1>(args.param)) << "RestartInterval"
@@ -1501,9 +1501,9 @@ class IndexBlockKVChecksumCorruptionTest : public IndexBlockKVChecksumTest {
   IndexBlockKVChecksumCorruptionTest() = default;
 
   std::unique_ptr<IndexBlockIter> GenerateIndexBlockIter(
-      std::vector<std::string> &separators,
-      std::vector<BlockHandle> &block_handles,
-      std::vector<std::string> &first_keys, int num_record,
+      std::vector<std::string>& separators,
+      std::vector<BlockHandle>& block_handles,
+      std::vector<std::string>& first_keys, int num_record,
       SequenceNumber seqno) {
     SyncPoint::GetInstance()->DisableProcessing();
     block_ =
@@ -1536,7 +1536,7 @@ INSTANTIATE_TEST_CASE_P(
         ::testing::Values(true, false), ::testing::Values(true, false)),
     [](const testing::TestParamInfo<
         std::tuple<BlockBasedTableOptions::DataBlockIndexType, uint8_t,
-                   uint32_t, bool, bool>> &args) {
+                   uint32_t, bool, bool>>& args) {
       std::ostringstream oss;
       oss << GetDataBlockIndexTypeStr(std::get<0>(args.param)) << "ProtBytes"
           << std::to_string(std::get<1>(args.param)) << "RestartInterval"
@@ -1561,15 +1561,15 @@ TEST_P(IndexBlockKVChecksumCorruptionTest, CorruptEntry) {
                                  kNumRecords, 0 /* ts_sz */,
                                  seqno != kDisableGlobalSequenceNumber);
       SyncPoint::GetInstance()->SetCallBack(
-          "BlockIter::UpdateKey::value", [](void *arg) {
-            char *value = static_cast<char *>(arg);
+          "BlockIter::UpdateKey::value", [](void* arg) {
+            char* value = static_cast<char*>(arg);
             // value can be delta-encoded with different lengths, so we corrupt
             // first bytes here to be safe
             ++value[0];
           });
 
       typedef std::unique_ptr<IndexBlockIter> IterPtr;
-      typedef void(IterAPI)(IterPtr & iter, std::string &);
+      typedef void(IterAPI)(IterPtr & iter, std::string&);
       std::string seek_key = first_keys[kNumRecords / 2];
       auto test_seek = [&](IterAPI iter_api) {
         std::unique_ptr<IndexBlockIter> biter = GenerateIndexBlockIter(
@@ -1579,12 +1579,12 @@ TEST_P(IndexBlockKVChecksumCorruptionTest, CorruptEntry) {
         ASSERT_FALSE(biter->Valid());
         ASSERT_TRUE(biter->status().IsCorruption());
       };
-      test_seek([](IterPtr &iter, std::string &) { iter->SeekToFirst(); });
-      test_seek([](IterPtr &iter, std::string &) { iter->SeekToLast(); });
-      test_seek([](IterPtr &iter, std::string &k) { iter->Seek(k); });
+      test_seek([](IterPtr& iter, std::string&) { iter->SeekToFirst(); });
+      test_seek([](IterPtr& iter, std::string&) { iter->SeekToLast(); });
+      test_seek([](IterPtr& iter, std::string& k) { iter->Seek(k); });
 
       typedef void (IndexBlockIter::*IterStepAPI)();
-      auto test_step = [&](IterStepAPI iter_api, std::string &k) {
+      auto test_step = [&](IterStepAPI iter_api, std::string& k) {
         std::unique_ptr<IndexBlockIter> biter = GenerateIndexBlockIter(
             separators, block_handles, first_keys, kNumRecords, seqno);
         SyncPoint::GetInstance()->DisableProcessing();
@@ -1610,7 +1610,7 @@ class MetaIndexBlockKVChecksumCorruptionTest
   MetaIndexBlockKVChecksumCorruptionTest() = default;
 
   std::unique_ptr<MetaBlockIter> GenerateMetaIndexBlockIter(
-      std::vector<std::string> &keys, std::vector<std::string> &values,
+      std::vector<std::string>& keys, std::vector<std::string>& values,
       int num_record) {
     SyncPoint::GetInstance()->DisableProcessing();
     block_ = GenerateMetaIndexBlock(keys, values, num_record);
@@ -1627,7 +1627,7 @@ class MetaIndexBlockKVChecksumCorruptionTest
 INSTANTIATE_TEST_CASE_P(
     P, MetaIndexBlockKVChecksumCorruptionTest,
     ::testing::Values(4, 8) /* block_protection_bytes_per_key */,
-    [](const testing::TestParamInfo<uint8_t> &args) {
+    [](const testing::TestParamInfo<uint8_t>& args) {
       std::ostringstream oss;
       oss << "ProtBytes" << std::to_string(args.param);
       return oss.str();
@@ -1644,14 +1644,14 @@ TEST_P(MetaIndexBlockKVChecksumCorruptionTest, CorruptEntry) {
     GenerateRandomKVs(&keys, &values, 0, kNumRecords + 1, 1 /* step */,
                       24 /* padding_size */);
     SyncPoint::GetInstance()->SetCallBack(
-        "BlockIter::UpdateKey::value", [](void *arg) {
-          char *value = static_cast<char *>(arg);
+        "BlockIter::UpdateKey::value", [](void* arg) {
+          char* value = static_cast<char*>(arg);
           // values generated by GenerateRandomKVs are of length 100
           ++value[10];
         });
 
     typedef std::unique_ptr<MetaBlockIter> IterPtr;
-    typedef void(IterAPI)(IterPtr & iter, std::string &);
+    typedef void(IterAPI)(IterPtr & iter, std::string&);
     typedef void (MetaBlockIter::*IterStepAPI)();
     std::string seek_key = keys[kNumRecords / 2];
     auto test_seek = [&](IterAPI iter_api) {
@@ -1662,12 +1662,12 @@ TEST_P(MetaIndexBlockKVChecksumCorruptionTest, CorruptEntry) {
       ASSERT_TRUE(biter->status().IsCorruption());
     };
 
-    test_seek([](IterPtr &iter, std::string &) { iter->SeekToFirst(); });
-    test_seek([](IterPtr &iter, std::string &) { iter->SeekToLast(); });
-    test_seek([](IterPtr &iter, std::string &k) { iter->Seek(k); });
-    test_seek([](IterPtr &iter, std::string &k) { iter->SeekForPrev(k); });
+    test_seek([](IterPtr& iter, std::string&) { iter->SeekToFirst(); });
+    test_seek([](IterPtr& iter, std::string&) { iter->SeekToLast(); });
+    test_seek([](IterPtr& iter, std::string& k) { iter->Seek(k); });
+    test_seek([](IterPtr& iter, std::string& k) { iter->SeekForPrev(k); });
 
-    auto test_step = [&](IterStepAPI iter_api, const std::string &k) {
+    auto test_step = [&](IterStepAPI iter_api, const std::string& k) {
       IterPtr biter = GenerateMetaIndexBlockIter(keys, values, kNumRecords);
       SyncPoint::GetInstance()->DisableProcessing();
       biter->Seek(k);
@@ -1687,7 +1687,7 @@ TEST_P(MetaIndexBlockKVChecksumCorruptionTest, CorruptEntry) {
 }
 }  // namespace ROCKSDB_NAMESPACE
 
-int main(int argc, char **argv) {
+int main(int argc, char** argv) {
   ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
diff --git a/table/block_based/mock_block_based_table.h b/table/block_based/mock_block_based_table.h
index 13f3dfaee14b..481589076f4a 100644
--- a/table/block_based/mock_block_based_table.h
+++ b/table/block_based/mock_block_based_table.h
@@ -32,7 +32,7 @@ class MockBlockBasedTableTester {
 
   explicit MockBlockBasedTableTester(const FilterPolicy* filter_policy)
       : MockBlockBasedTableTester(
-            std::shared_ptr<const FilterPolicy>(filter_policy)){};
+            std::shared_ptr<const FilterPolicy>(filter_policy)) {};
 
   explicit MockBlockBasedTableTester(
       std::shared_ptr<const FilterPolicy> filter_policy)
diff --git a/table/cleanable_test.cc b/table/cleanable_test.cc
index b58eb7dc61e2..c53571bf0077 100644
--- a/table/cleanable_test.cc
+++ b/table/cleanable_test.cc
@@ -31,7 +31,9 @@ void Multiplier(void* arg1, void* arg2) {
 TEST_F(CleanableTest, Register) {
   int n2 = 2, n3 = 3;
   int res = 1;
-  { Cleanable c1; }
+  {
+    Cleanable c1;
+  }
   // ~Cleanable
   ASSERT_EQ(1, res);
 
diff --git a/table/unique_id.cc b/table/unique_id.cc
index 758ad574e948..6da691082770 100644
--- a/table/unique_id.cc
+++ b/table/unique_id.cc
@@ -14,7 +14,7 @@ namespace ROCKSDB_NAMESPACE {
 
 std::string EncodeSessionId(uint64_t upper, uint64_t lower) {
   std::string db_session_id(20U, '\0');
-  char *buf = db_session_id.data();
+  char* buf = db_session_id.data();
   // Preserving `lower` is slightly tricky. 36^12 is slightly more than
   // 62 bits, so we use 12 chars plus the bottom two bits of one more.
   // (A tiny fraction of 20 digit strings go unused.)
@@ -26,8 +26,8 @@ std::string EncodeSessionId(uint64_t upper, uint64_t lower) {
   return db_session_id;
 }
 
-Status DecodeSessionId(const std::string &db_session_id, uint64_t *upper,
-                       uint64_t *lower) {
+Status DecodeSessionId(const std::string& db_session_id, uint64_t* upper,
+                       uint64_t* lower) {
   const size_t len = db_session_id.size();
   if (len == 0) {
     return Status::NotSupported("Missing db_session_id");
@@ -41,7 +41,7 @@ Status DecodeSessionId(const std::string &db_session_id, uint64_t *upper,
     return Status::NotSupported("Too long db_session_id");
   }
   uint64_t a = 0, b = 0;
-  const char *buf = &db_session_id.front();
+  const char* buf = &db_session_id.front();
   bool success = ParseBaseChars<36>(&buf, len - 12U, &a);
   if (!success) {
     return Status::NotSupported("Bad digit in db_session_id");
@@ -56,8 +56,8 @@ Status DecodeSessionId(const std::string &db_session_id, uint64_t *upper,
   return Status::OK();
 }
 
-Status GetSstInternalUniqueId(const std::string &db_id,
-                              const std::string &db_session_id,
+Status GetSstInternalUniqueId(const std::string& db_id,
+                              const std::string& db_session_id,
                               uint64_t file_number, UniqueIdPtr out,
                               bool force) {
   if (!force) {
@@ -160,11 +160,11 @@ std::string EncodeUniqueIdBytes(UniqueIdPtr in) {
   return ret;
 }
 
-Status DecodeUniqueIdBytes(const std::string &unique_id, UniqueIdPtr out) {
+Status DecodeUniqueIdBytes(const std::string& unique_id, UniqueIdPtr out) {
   if (unique_id.size() != (out.extended ? 24 : 16)) {
     return Status::NotSupported("Not a valid unique_id");
   }
-  const char *buf = &unique_id.front();
+  const char* buf = &unique_id.front();
   out.ptr[0] = DecodeFixed64(&buf[0]);
   out.ptr[1] = DecodeFixed64(&buf[8]);
   if (out.extended) {
@@ -174,8 +174,8 @@ Status DecodeUniqueIdBytes(const std::string &unique_id, UniqueIdPtr out) {
 }
 
 template <typename ID>
-Status GetUniqueIdFromTablePropertiesHelper(const TableProperties &props,
-                                            std::string *out_id) {
+Status GetUniqueIdFromTablePropertiesHelper(const TableProperties& props,
+                                            std::string* out_id) {
   ID tmp{};
   Status s = GetSstInternalUniqueId(props.db_id, props.db_session_id,
                                     props.orig_file_number, &tmp);
@@ -188,17 +188,17 @@ Status GetUniqueIdFromTablePropertiesHelper(const TableProperties &props,
   return s;
 }
 
-Status GetExtendedUniqueIdFromTableProperties(const TableProperties &props,
-                                              std::string *out_id) {
+Status GetExtendedUniqueIdFromTableProperties(const TableProperties& props,
+                                              std::string* out_id) {
   return GetUniqueIdFromTablePropertiesHelper<UniqueId64x3>(props, out_id);
 }
 
-Status GetUniqueIdFromTableProperties(const TableProperties &props,
-                                      std::string *out_id) {
+Status GetUniqueIdFromTableProperties(const TableProperties& props,
+                                      std::string* out_id) {
   return GetUniqueIdFromTablePropertiesHelper<UniqueId64x2>(props, out_id);
 }
 
-std::string UniqueIdToHumanString(const std::string &id) {
+std::string UniqueIdToHumanString(const std::string& id) {
   std::string hex = Slice(id).ToString(/*hex*/ true);
   std::string result;
   result.reserve(hex.size() + hex.size() / 16);
diff --git a/table/unique_id_impl.h b/table/unique_id_impl.h
index 6e3dc62c794d..47d10c9712be 100644
--- a/table/unique_id_impl.h
+++ b/table/unique_id_impl.h
@@ -26,14 +26,14 @@ constexpr UniqueId64x3 kNullUniqueId64x3 = {};
 
 // Dynamic pointer wrapper for one of the two above
 struct UniqueIdPtr {
-  uint64_t *ptr = nullptr;
+  uint64_t* ptr = nullptr;
   bool extended = false;
 
-  /*implicit*/ UniqueIdPtr(UniqueId64x2 *id) {
+  /*implicit*/ UniqueIdPtr(UniqueId64x2* id) {
     ptr = (*id).data();
     extended = false;
   }
-  /*implicit*/ UniqueIdPtr(UniqueId64x3 *id) {
+  /*implicit*/ UniqueIdPtr(UniqueId64x3* id) {
     ptr = (*id).data();
     extended = true;
   }
@@ -45,8 +45,8 @@ struct UniqueIdPtr {
 // unique id, so can be manipulated in more ways but very carefully.
 // These must be long term stable to ensure GetUniqueIdFromTableProperties
 // is long term stable.
-Status GetSstInternalUniqueId(const std::string &db_id,
-                              const std::string &db_session_id,
+Status GetSstInternalUniqueId(const std::string& db_id,
+                              const std::string& db_session_id,
                               uint64_t file_number, UniqueIdPtr out,
                               bool force = false);
 
@@ -66,7 +66,7 @@ void ExternalUniqueIdToInternal(UniqueIdPtr in_out);
 std::string EncodeUniqueIdBytes(UniqueIdPtr in);
 
 // Reverse of EncodeUniqueIdBytes.
-Status DecodeUniqueIdBytes(const std::string &unique_id, UniqueIdPtr out);
+Status DecodeUniqueIdBytes(const std::string& unique_id, UniqueIdPtr out);
 
 // For presenting internal IDs for debugging purposes. Visually distinct from
 // UniqueIdToHumanString for external IDs.
@@ -87,7 +87,7 @@ std::string EncodeSessionId(uint64_t upper, uint64_t lower);
 // Reverse of EncodeSessionId. Returns NotSupported on error rather than
 // Corruption because non-standard session IDs should be allowed with degraded
 // functionality.
-Status DecodeSessionId(const std::string &db_session_id, uint64_t *upper,
-                       uint64_t *lower);
+Status DecodeSessionId(const std::string& db_session_id, uint64_t* upper,
+                       uint64_t* lower);
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/tools/dump/rocksdb_undump.cc b/tools/dump/rocksdb_undump.cc
index e437b3fe8a43..9b922a8233dd 100644
--- a/tools/dump/rocksdb_undump.cc
+++ b/tools/dump/rocksdb_undump.cc
@@ -25,7 +25,7 @@ DEFINE_bool(compact, false, "Compact the db after loading the dumped file");
 DEFINE_string(db_options, "",
               "Options string used to open the database that will be loaded");
 
-int main(int argc, char **argv) {
+int main(int argc, char** argv) {
   GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true);
 
   if (FLAGS_db_path == "" || FLAGS_dump_location == "") {
diff --git a/util/bloom_impl.h b/util/bloom_impl.h
index c9bbb125b8f1..3b2f1792934b 100644
--- a/util/bloom_impl.h
+++ b/util/bloom_impl.h
@@ -198,13 +198,13 @@ class FastLocalBloomImpl {
   }
 
   static inline void AddHash(uint32_t h1, uint32_t h2, uint32_t len_bytes,
-                             int num_probes, char *data) {
+                             int num_probes, char* data) {
     uint32_t bytes_to_cache_line = FastRange32(h1, len_bytes >> 6) << 6;
     AddHashPrepared(h2, num_probes, data + bytes_to_cache_line);
   }
 
   static inline void AddHashPrepared(uint32_t h2, int num_probes,
-                                     char *data_at_cache_line) {
+                                     char* data_at_cache_line) {
     uint32_t h = h2;
     for (int i = 0; i < num_probes; ++i, h *= uint32_t{0x9e3779b9}) {
       // 9-bit address within 512 bit cache line
@@ -214,8 +214,8 @@ class FastLocalBloomImpl {
   }
 
   static inline void PrepareHash(uint32_t h1, uint32_t len_bytes,
-                                 const char *data,
-                                 uint32_t /*out*/ *byte_offset) {
+                                 const char* data,
+                                 uint32_t /*out*/* byte_offset) {
     uint32_t bytes_to_cache_line = FastRange32(h1, len_bytes >> 6) << 6;
     PREFETCH(data + bytes_to_cache_line, 0 /* rw */, 1 /* locality */);
     PREFETCH(data + bytes_to_cache_line + 63, 0 /* rw */, 1 /* locality */);
@@ -223,13 +223,13 @@ class FastLocalBloomImpl {
   }
 
   static inline bool HashMayMatch(uint32_t h1, uint32_t h2, uint32_t len_bytes,
-                                  int num_probes, const char *data) {
+                                  int num_probes, const char* data) {
     uint32_t bytes_to_cache_line = FastRange32(h1, len_bytes >> 6) << 6;
     return HashMayMatchPrepared(h2, num_probes, data + bytes_to_cache_line);
   }
 
   static inline bool HashMayMatchPrepared(uint32_t h2, int num_probes,
-                                          const char *data_at_cache_line) {
+                                          const char* data_at_cache_line) {
     uint32_t h = h2;
 #ifdef __AVX2__
     int rem_probes = num_probes;
@@ -277,8 +277,8 @@ class FastLocalBloomImpl {
       //                            /*bytes / i32*/ 4);
       // END Option 1
       // Potentially unaligned as we're not *always* cache-aligned -> loadu
-      const __m256i *mm_data =
-          reinterpret_cast<const __m256i *>(data_at_cache_line);
+      const __m256i* mm_data =
+          reinterpret_cast<const __m256i*>(data_at_cache_line);
       __m256i lower = _mm256_loadu_si256(mm_data);
       __m256i upper = _mm256_loadu_si256(mm_data + 1);
       // Option 2: AVX512VL permute hack
@@ -362,7 +362,7 @@ class LegacyNoLocalityBloomImpl {
   }
 
   static inline void AddHash(uint32_t h, uint32_t total_bits, int num_probes,
-                             char *data) {
+                             char* data) {
     const uint32_t delta = (h >> 17) | (h << 15);  // Rotate right 17 bits
     for (int i = 0; i < num_probes; i++) {
       const uint32_t bitpos = h % total_bits;
@@ -372,7 +372,7 @@ class LegacyNoLocalityBloomImpl {
   }
 
   static inline bool HashMayMatch(uint32_t h, uint32_t total_bits,
-                                  int num_probes, const char *data) {
+                                  int num_probes, const char* data) {
     const uint32_t delta = (h >> 17) | (h << 15);  // Rotate right 17 bits
     for (int i = 0; i < num_probes; i++) {
       const uint32_t bitpos = h % total_bits;
@@ -430,10 +430,10 @@ class LegacyLocalityBloomImpl {
   }
 
   static inline void AddHash(uint32_t h, uint32_t num_lines, int num_probes,
-                             char *data, int log2_cache_line_bytes) {
+                             char* data, int log2_cache_line_bytes) {
     const int log2_cache_line_bits = log2_cache_line_bytes + 3;
 
-    char *data_at_offset =
+    char* data_at_offset =
         data + (GetLine(h, num_lines) << log2_cache_line_bytes);
     const uint32_t delta = (h >> 17) | (h << 15);
     for (int i = 0; i < num_probes; ++i) {
@@ -448,8 +448,8 @@ class LegacyLocalityBloomImpl {
   }
 
   static inline void PrepareHashMayMatch(uint32_t h, uint32_t num_lines,
-                                         const char *data,
-                                         uint32_t /*out*/ *byte_offset,
+                                         const char* data,
+                                         uint32_t /*out*/* byte_offset,
                                          int log2_cache_line_bytes) {
     uint32_t b = GetLine(h, num_lines) << log2_cache_line_bytes;
     PREFETCH(data + b, 0 /* rw */, 1 /* locality */);
@@ -459,14 +459,14 @@ class LegacyLocalityBloomImpl {
   }
 
   static inline bool HashMayMatch(uint32_t h, uint32_t num_lines,
-                                  int num_probes, const char *data,
+                                  int num_probes, const char* data,
                                   int log2_cache_line_bytes) {
     uint32_t b = GetLine(h, num_lines) << log2_cache_line_bytes;
     return HashMayMatchPrepared(h, num_probes, data + b, log2_cache_line_bytes);
   }
 
   static inline bool HashMayMatchPrepared(uint32_t h, int num_probes,
-                                          const char *data_at_offset,
+                                          const char* data_at_offset,
                                           int log2_cache_line_bytes) {
     const int log2_cache_line_bits = log2_cache_line_bytes + 3;
 
diff --git a/util/coding.h b/util/coding.h
index 8648d9a13ba2..2d7522478461 100644
--- a/util/coding.h
+++ b/util/coding.h
@@ -355,8 +355,7 @@ __attribute__((__no_sanitize__("alignment")))
 __attribute__((__no_sanitize_undefined__))
 #endif
 #endif
-inline void
-PutUnaligned(T* memory, const T& value) {
+inline void PutUnaligned(T* memory, const T& value) {
 #if defined(PLATFORM_UNALIGNED_ACCESS_NOT_ALLOWED)
   char* nonAlignedMemory = reinterpret_cast<char*>(memory);
   memcpy(nonAlignedMemory, reinterpret_cast<const char*>(&value), sizeof(T));
@@ -373,8 +372,7 @@ __attribute__((__no_sanitize__("alignment")))
 __attribute__((__no_sanitize_undefined__))
 #endif
 #endif
-inline void
-GetUnaligned(const T* memory, T* value) {
+inline void GetUnaligned(const T* memory, T* value) {
 #if defined(PLATFORM_UNALIGNED_ACCESS_NOT_ALLOWED)
   char* nonAlignedMemory = reinterpret_cast<char*>(value);
   memcpy(nonAlignedMemory, reinterpret_cast<const char*>(memory), sizeof(T));
diff --git a/util/crc32c_arm64.cc b/util/crc32c_arm64.cc
index 4bccb75bc792..47c22c030fc5 100644
--- a/util/crc32c_arm64.cc
+++ b/util/crc32c_arm64.cc
@@ -113,10 +113,9 @@ __attribute__((__no_sanitize__("alignment")))
 __attribute__((__no_sanitize_undefined__))
 #endif
 #endif
-uint32_t
-crc32c_arm64(uint32_t crc, unsigned char const *data, size_t len) {
-  const uint8_t *buf8;
-  const uint64_t *buf64 = (uint64_t *)data;
+uint32_t crc32c_arm64(uint32_t crc, unsigned char const* data, size_t len) {
+  const uint8_t* buf8;
+  const uint64_t* buf64 = (uint64_t*)data;
   int length = (int)len;
   crc ^= 0xffffffff;
 
@@ -148,7 +147,7 @@ crc32c_arm64(uint32_t crc, unsigned char const *data, size_t len) {
       uint32_t k0 = 0xe417f38a, k1 = 0x8f158014;
 
       /* Prefetch data for following block to avoid cache miss */
-      PREF1KL1((uint8_t *)buf64, 1024);
+      PREF1KL1((uint8_t*)buf64, 1024);
 
       /* First 8 byte for better pipelining */
       crc0 = crc32c_u64(crc, *buf64++);
@@ -184,22 +183,22 @@ crc32c_arm64(uint32_t crc, unsigned char const *data, size_t len) {
 #endif
   }  // if Pmull runtime check here
 
-  buf8 = (const uint8_t *)buf64;
+  buf8 = (const uint8_t*)buf64;
   while (length >= 8) {
-    crc = crc32c_u64(crc, *(const uint64_t *)buf8);
+    crc = crc32c_u64(crc, *(const uint64_t*)buf8);
     buf8 += 8;
     length -= 8;
   }
 
   /* The following is more efficient than the straight loop */
   if (length >= 4) {
-    crc = crc32c_u32(crc, *(const uint32_t *)buf8);
+    crc = crc32c_u32(crc, *(const uint32_t*)buf8);
     buf8 += 4;
     length -= 4;
   }
 
   if (length >= 2) {
-    crc = crc32c_u16(crc, *(const uint16_t *)buf8);
+    crc = crc32c_u16(crc, *(const uint16_t*)buf8);
     buf8 += 2;
     length -= 2;
   }
diff --git a/util/crc32c_arm64.h b/util/crc32c_arm64.h
index 5df3fa8d9deb..d2cfab3c7507 100644
--- a/util/crc32c_arm64.h
+++ b/util/crc32c_arm64.h
@@ -36,7 +36,7 @@
   PREF4X64L1(buffer, (PREF_OFFSET), 8) \
   PREF4X64L1(buffer, (PREF_OFFSET), 12)
 
-uint32_t crc32c_arm64(uint32_t crc, unsigned char const *data, size_t len);
+uint32_t crc32c_arm64(uint32_t crc, unsigned char const* data, size_t len);
 uint32_t crc32c_runtime_check(void);
 bool crc32c_pmull_runtime_check(void);
 
diff --git a/util/crc32c_ppc.h b/util/crc32c_ppc.h
index 365ba2c427a1..a3cfc63705f1 100644
--- a/util/crc32c_ppc.h
+++ b/util/crc32c_ppc.h
@@ -14,7 +14,7 @@
 extern "C" {
 #endif
 
-uint32_t crc32c_ppc(uint32_t crc, unsigned char const *buffer, size_t len);
+uint32_t crc32c_ppc(uint32_t crc, unsigned char const* buffer, size_t len);
 
 #ifdef __cplusplus
 }
diff --git a/util/dynamic_bloom_test.cc b/util/dynamic_bloom_test.cc
index 949ab8f76bb1..6b35214a9eca 100644
--- a/util/dynamic_bloom_test.cc
+++ b/util/dynamic_bloom_test.cc
@@ -43,13 +43,13 @@ struct KeyMaker {
   // Sequential, within a hash function block
   inline Slice Seq(uint64_t i) {
     a = i;
-    return Slice(reinterpret_cast<char *>(&a), sizeof(a));
+    return Slice(reinterpret_cast<char*>(&a), sizeof(a));
   }
   // Not quite sequential, varies across hash function blocks
   inline Slice Nonseq(uint64_t i) {
     a = i;
     b = i * 123;
-    return Slice(reinterpret_cast<char *>(this), sizeof(*this));
+    return Slice(reinterpret_cast<char*>(this), sizeof(*this));
   }
   inline Slice Key(uint64_t i, bool nonseq) {
     return nonseq ? Nonseq(i) : Seq(i);
@@ -315,7 +315,7 @@ TEST_F(DynamicBloomTest, concurrent_with_perf) {
 
 }  // namespace ROCKSDB_NAMESPACE
 
-int main(int argc, char **argv) {
+int main(int argc, char** argv) {
   ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
   ::testing::InitGoogleTest(&argc, argv);
   ParseCommandLineFlags(&argc, &argv, true);
diff --git a/util/filter_bench.cc b/util/filter_bench.cc
index 0afe8c2fd6bf..7938d20953a5 100644
--- a/util/filter_bench.cc
+++ b/util/filter_bench.cc
@@ -126,7 +126,7 @@ DEFINE_bool(legend, false,
 
 DEFINE_uint32(runs, 1, "Number of times to rebuild and run benchmark tests");
 
-void _always_assert_fail(int line, const char *file, const char *expr) {
+void _always_assert_fail(int line, const char* file, const char* expr) {
   fprintf(stderr, "%s: %d: Assertion %s failed\n", file, line, expr);
   abort();
 }
@@ -195,7 +195,7 @@ struct KeyMaker {
       len += FastRange32(
           (val_num >> FLAGS_vary_key_size_log2_interval) * 1234567891, 5);
     }
-    char *data = buf_.get() + start;
+    char* data = buf_.get() + start;
     // Populate key data such that all data makes it into a key of at
     // least 8 bytes. We also don't want all the within-filter key
     // variance confined to a contiguous 32 bits, because then a 32 bit
@@ -220,7 +220,7 @@ void PrintWarnings() {
 #endif
 }
 
-void PrintError(const char *error) { fprintf(stderr, "ERROR: %s\n", error); }
+void PrintError(const char* error) { fprintf(stderr, "ERROR: %s\n", error); }
 
 struct FilterInfo {
   uint32_t filter_id_ = 0;
@@ -258,7 +258,7 @@ static const std::vector<TestMode> bestCaseTestModes = {
     kSingleFilter,
 };
 
-const char *TestModeToString(TestMode tm) {
+const char* TestModeToString(TestMode tm) {
   switch (tm) {
     case kSingleFilter:
       return "Single filter";
@@ -278,7 +278,7 @@ const char *TestModeToString(TestMode tm) {
 
 // Do just enough to keep some data dependence for the
 // compiler / CPU
-static uint32_t DryRunNoHash(Slice &s) {
+static uint32_t DryRunNoHash(Slice& s) {
   uint32_t sz = static_cast<uint32_t>(s.size());
   if (sz >= 4) {
     return sz + s.data()[3];
@@ -287,16 +287,16 @@ static uint32_t DryRunNoHash(Slice &s) {
   }
 }
 
-static uint32_t DryRunHash32(Slice &s) {
+static uint32_t DryRunHash32(Slice& s) {
   // Same perf characteristics as GetSliceHash()
   return BloomHash(s);
 }
 
-static uint32_t DryRunHash64(Slice &s) {
+static uint32_t DryRunHash64(Slice& s) {
   return Lower32of64(GetSliceHash64(s));
 }
 
-const std::shared_ptr<const FilterPolicy> &GetPolicy() {
+const std::shared_ptr<const FilterPolicy>& GetPolicy() {
   static std::shared_ptr<const FilterPolicy> policy;
   if (!policy) {
     policy = BloomLikeFilterPolicy::Create(
@@ -378,7 +378,7 @@ void FilterBench::Go() {
                                     FLAGS_average_keys_per_filter);
   const uint32_t variance_offset = variance_range / 2;
 
-  const std::vector<TestMode> &testModes = FLAGS_best_case ? bestCaseTestModes
+  const std::vector<TestMode>& testModes = FLAGS_best_case ? bestCaseTestModes
                                            : FLAGS_quick   ? quickTestModes
                                                            : allTestModes;
 
@@ -425,7 +425,7 @@ void FilterBench::Go() {
       keys_to_add = static_cast<uint32_t>(max_total_keys - total_keys_added);
     }
     infos_.emplace_back();
-    FilterInfo &info = infos_.back();
+    FilterInfo& info = infos_.back();
     info.filter_id_ = filter_id;
     info.keys_added_ = keys_to_add;
     if (FLAGS_use_plain_table_bloom) {
@@ -475,7 +475,7 @@ void FilterBench::Go() {
     total_size += info.filter_.size();
 #ifdef ROCKSDB_MALLOC_USABLE_SIZE
     total_memory_used +=
-        malloc_usable_size(const_cast<char *>(info.filter_.data()));
+        malloc_usable_size(const_cast<char*>(info.filter_.data()));
 #endif  // ROCKSDB_MALLOC_USABLE_SIZE
     total_keys_added += keys_to_add;
   }
@@ -513,7 +513,7 @@ void FilterBench::Go() {
         static_cast<uint32_t>(m_queries_ * 1000000 / infos_.size());
     uint64_t fps = 0;
     for (uint32_t i = 0; i < infos_.size(); ++i) {
-      FilterInfo &info = infos_[i];
+      FilterInfo& info = infos_[i];
       for (uint32_t j = 0; j < info.keys_added_; ++j) {
         if (FLAGS_use_plain_table_bloom) {
           uint32_t hash = GetSliceHash(kms_[0].Get(info.filter_id_, j));
@@ -593,7 +593,7 @@ void FilterBench::Go() {
 
 double FilterBench::RandomQueryTest(uint32_t inside_threshold, bool dry_run,
                                     TestMode mode) {
-  for (auto &info : infos_) {
+  for (auto& info : infos_) {
     info.outside_queries_ = 0;
     info.false_positives_ = 0;
   }
@@ -644,14 +644,14 @@ double FilterBench::RandomQueryTest(uint32_t inside_threshold, bool dry_run,
   }
   uint32_t batch_size = 1;
   std::unique_ptr<Slice[]> batch_slices;
-  std::unique_ptr<Slice *[]> batch_slice_ptrs;
+  std::unique_ptr<Slice*[]> batch_slice_ptrs;
   std::unique_ptr<bool[]> batch_results;
   if (mode == kBatchPrepared || mode == kBatchUnprepared) {
     batch_size = static_cast<uint32_t>(kms_.size());
   }
 
   batch_slices.reset(new Slice[batch_size]);
-  batch_slice_ptrs.reset(new Slice *[batch_size]);
+  batch_slice_ptrs.reset(new Slice*[batch_size]);
   batch_results.reset(new bool[batch_size]);
   for (uint32_t i = 0; i < batch_size; ++i) {
     batch_results[i] = false;
@@ -672,7 +672,7 @@ double FilterBench::RandomQueryTest(uint32_t inside_threshold, bool dry_run,
       filter_index = num_primary_filters +
                      random_.Uniformish(num_infos - num_primary_filters);
     }
-    FilterInfo &info = infos_[filter_index];
+    FilterInfo& info = infos_[filter_index];
     for (uint32_t i = 0; i < batch_size; ++i) {
       if (inside_this_time) {
         batch_slices[i] =
@@ -767,7 +767,7 @@ double FilterBench::RandomQueryTest(uint32_t inside_threshold, bool dry_run,
     uint64_t fp = 0;
     double worst_fp_rate = 0.0;
     double best_fp_rate = 1.0;
-    for (auto &info : infos_) {
+    for (auto& info : infos_) {
       q += info.outside_queries_;
       fp += info.false_positives_;
       if (info.outside_queries_ > 0) {
@@ -789,7 +789,7 @@ double FilterBench::RandomQueryTest(uint32_t inside_threshold, bool dry_run,
   return ns;
 }
 
-int main(int argc, char **argv) {
+int main(int argc, char** argv) {
   ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
   SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) +
                   " [-quick] [OTHER OPTIONS]...");
diff --git a/util/gflags_compat.h b/util/gflags_compat.h
index 8f4a30b0d661..a38273fd7492 100644
--- a/util/gflags_compat.h
+++ b/util/gflags_compat.h
@@ -22,8 +22,8 @@
   namespace gflags_compat {           \
   DEFINE_int32(name, val, txt);       \
   }                                   \
-  uint32_t &FLAGS_##name =            \
-      *reinterpret_cast<uint32_t *>(&gflags_compat::FLAGS_##name);
+  uint32_t& FLAGS_##name =            \
+      *reinterpret_cast<uint32_t*>(&gflags_compat::FLAGS_##name);
 
-#define DECLARE_uint32(name) extern uint32_t &FLAGS_##name;
+#define DECLARE_uint32(name) extern uint32_t& FLAGS_##name;
 #endif  // !DEFINE_uint32
diff --git a/util/hash_test.cc b/util/hash_test.cc
index dffdae4ce598..2b3f5a4ae856 100644
--- a/util/hash_test.cc
+++ b/util/hash_test.cc
@@ -233,8 +233,8 @@ TEST(HashTest, Hash64SmallValueSchema) {
             uint64_t{10551812464348219044u});
 }
 
-std::string Hash64TestDescriptor(const char *repeat, size_t limit) {
-  const char *mod61_encode =
+std::string Hash64TestDescriptor(const char* repeat, size_t limit) {
+  const char* mod61_encode =
       "abcdefghijklmnopqrstuvwxyz123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
 
   std::string input;
@@ -388,8 +388,8 @@ TEST(HashTest, Hash128Trivial) {
   }
 }
 
-std::string Hash128TestDescriptor(const char *repeat, size_t limit) {
-  const char *mod61_encode =
+std::string Hash128TestDescriptor(const char* repeat, size_t limit) {
+  const char* mod61_encode =
       "abcdefghijklmnopqrstuvwxyz123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
 
   std::string input;
@@ -850,7 +850,7 @@ TEST(MathTest, Math128) {
 }
 
 TEST(MathTest, Coding128) {
-  const char *in = "_1234567890123456";
+  const char* in = "_1234567890123456";
   // Note: in + 1 is likely unaligned
   Unsigned128 decoded = DecodeFixed128(in + 1);
   EXPECT_EQ(Lower64of128(decoded), 0x3837363534333231U);
@@ -863,7 +863,7 @@ TEST(MathTest, Coding128) {
 }
 
 TEST(MathTest, CodingGeneric) {
-  const char *in = "_1234567890123456";
+  const char* in = "_1234567890123456";
   // Decode
   // Note: in + 1 is likely unaligned
   Unsigned128 decoded128 = DecodeFixedGeneric<Unsigned128>(in + 1);
@@ -899,7 +899,7 @@ TEST(MathTest, CodingGeneric) {
   EXPECT_EQ(std::string("_12"), std::string(out));
 }
 
-int main(int argc, char **argv) {
+int main(int argc, char** argv) {
   fprintf(stderr, "NPHash64 id: %x\n",
           static_cast<int>(ROCKSDB_NAMESPACE::GetSliceNPHash64("RocksDB")));
   ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
diff --git a/util/io_dispatcher_test.cc b/util/io_dispatcher_test.cc
index d5ea665e5ce4..89624ac5bcd4 100644
--- a/util/io_dispatcher_test.cc
+++ b/util/io_dispatcher_test.cc
@@ -719,7 +719,6 @@ TEST_F(IODispatcherTest, ReadSetDestroysUnpinsBlocks) {
       << " final=" << final_pinned_usage;
 }
 
-
 // Test that verifies the coalescing logic: adjacent blocks within the
 // coalesce threshold should be combined into a single read request.
 TEST_F(IODispatcherTest, VerifyCoalescing) {
diff --git a/util/mutexlock.h b/util/mutexlock.h
index aecd4f21cb4f..b142bde320f2 100644
--- a/util/mutexlock.h
+++ b/util/mutexlock.h
@@ -34,15 +34,15 @@ namespace ROCKSDB_NAMESPACE {
 
 class MutexLock {
  public:
-  explicit MutexLock(port::Mutex *mu) : mu_(mu) { this->mu_->Lock(); }
+  explicit MutexLock(port::Mutex* mu) : mu_(mu) { this->mu_->Lock(); }
   // No copying allowed
-  MutexLock(const MutexLock &) = delete;
-  void operator=(const MutexLock &) = delete;
+  MutexLock(const MutexLock&) = delete;
+  void operator=(const MutexLock&) = delete;
 
   ~MutexLock() { this->mu_->Unlock(); }
 
  private:
-  port::Mutex *const mu_;
+  port::Mutex* const mu_;
 };
 
 //
@@ -52,15 +52,15 @@ class MutexLock {
 //
 class ReadLock {
  public:
-  explicit ReadLock(port::RWMutex *mu) : mu_(mu) { this->mu_->ReadLock(); }
+  explicit ReadLock(port::RWMutex* mu) : mu_(mu) { this->mu_->ReadLock(); }
   // No copying allowed
-  ReadLock(const ReadLock &) = delete;
-  void operator=(const ReadLock &) = delete;
+  ReadLock(const ReadLock&) = delete;
+  void operator=(const ReadLock&) = delete;
 
   ~ReadLock() { this->mu_->ReadUnlock(); }
 
  private:
-  port::RWMutex *const mu_;
+  port::RWMutex* const mu_;
 };
 
 //
@@ -68,15 +68,15 @@ class ReadLock {
 //
 class ReadUnlock {
  public:
-  explicit ReadUnlock(port::RWMutex *mu) : mu_(mu) { mu->AssertHeld(); }
+  explicit ReadUnlock(port::RWMutex* mu) : mu_(mu) { mu->AssertHeld(); }
   // No copying allowed
-  ReadUnlock(const ReadUnlock &) = delete;
-  ReadUnlock &operator=(const ReadUnlock &) = delete;
+  ReadUnlock(const ReadUnlock&) = delete;
+  ReadUnlock& operator=(const ReadUnlock&) = delete;
 
   ~ReadUnlock() { mu_->ReadUnlock(); }
 
  private:
-  port::RWMutex *const mu_;
+  port::RWMutex* const mu_;
 };
 
 //
@@ -86,15 +86,15 @@ class ReadUnlock {
 //
 class WriteLock {
  public:
-  explicit WriteLock(port::RWMutex *mu) : mu_(mu) { this->mu_->WriteLock(); }
+  explicit WriteLock(port::RWMutex* mu) : mu_(mu) { this->mu_->WriteLock(); }
   // No copying allowed
-  WriteLock(const WriteLock &) = delete;
-  void operator=(const WriteLock &) = delete;
+  WriteLock(const WriteLock&) = delete;
+  void operator=(const WriteLock&) = delete;
 
   ~WriteLock() { this->mu_->WriteUnlock(); }
 
  private:
-  port::RWMutex *const mu_;
+  port::RWMutex* const mu_;
 };
 
 //
@@ -145,12 +145,12 @@ struct ALIGN_AS(CACHE_LINE_SIZE) CacheAlignedWrapper {
 template <class T>
 struct Unwrap {
   using type = T;
-  static type &Go(T &t) { return t; }
+  static type& Go(T& t) { return t; }
 };
 template <class T>
 struct Unwrap<CacheAlignedWrapper<T>> {
   using type = T;
-  static type &Go(CacheAlignedWrapper<T> &t) { return t.obj_; }
+  static type& Go(CacheAlignedWrapper<T>& t) { return t.obj_; }
 };
 
 //
@@ -169,7 +169,7 @@ class Striped {
       : stripe_count_(stripe_count), data_(new T[stripe_count]) {}
 
   using Unwrapped = typename Unwrap<T>::type;
-  Unwrapped &Get(const Key &key, uint64_t seed = 0) {
+  Unwrapped& Get(const Key& key, uint64_t seed = 0) {
     size_t index = FastRangeGeneric(hash_(key, seed), stripe_count_);
     return Unwrap<T>::Go(data_[index]);
   }
diff --git a/util/ribbon_alg.h b/util/ribbon_alg.h
index f9afefc2377b..52016e266c9d 100644
--- a/util/ribbon_alg.h
+++ b/util/ribbon_alg.h
@@ -545,10 +545,10 @@ namespace ribbon {
 // solution satisfying all the cr@start -> rr entries added.
 template <bool kFirstCoeffAlwaysOne, typename BandingStorage,
           typename BacktrackStorage>
-bool BandingAdd(BandingStorage *bs, typename BandingStorage::Index start,
+bool BandingAdd(BandingStorage* bs, typename BandingStorage::Index start,
                 typename BandingStorage::ResultRow rr,
-                typename BandingStorage::CoeffRow cr, BacktrackStorage *bts,
-                typename BandingStorage::Index *backtrack_pos) {
+                typename BandingStorage::CoeffRow cr, BacktrackStorage* bts,
+                typename BandingStorage::Index* backtrack_pos) {
   using CoeffRow = typename BandingStorage::CoeffRow;
   using ResultRow = typename BandingStorage::ResultRow;
   using Index = typename BandingStorage::Index;
@@ -608,8 +608,8 @@ bool BandingAdd(BandingStorage *bs, typename BandingStorage::Index start,
 //
 template <typename BandingStorage, typename BacktrackStorage,
           typename BandingHasher, typename InputIterator>
-bool BandingAddRange(BandingStorage *bs, BacktrackStorage *bts,
-                     const BandingHasher &bh, InputIterator begin,
+bool BandingAddRange(BandingStorage* bs, BacktrackStorage* bts,
+                     const BandingHasher& bh, InputIterator begin,
                      InputIterator end) {
   using CoeffRow = typename BandingStorage::CoeffRow;
   using Index = typename BandingStorage::Index;
@@ -703,7 +703,7 @@ bool BandingAddRange(BandingStorage *bs, BacktrackStorage *bts,
 //
 template <typename BandingStorage, typename BandingHasher,
           typename InputIterator>
-bool BandingAddRange(BandingStorage *bs, const BandingHasher &bh,
+bool BandingAddRange(BandingStorage* bs, const BandingHasher& bh,
                      InputIterator begin, InputIterator end) {
   using Index = typename BandingStorage::Index;
   struct NoopBacktrackStorage {
@@ -754,7 +754,7 @@ bool BandingAddRange(BandingStorage *bs, const BandingHasher &bh,
 // Back-substitution for generating a solution from BandingStorage to
 // SimpleSolutionStorage.
 template <typename SimpleSolutionStorage, typename BandingStorage>
-void SimpleBackSubst(SimpleSolutionStorage *sss, const BandingStorage &bs) {
+void SimpleBackSubst(SimpleSolutionStorage* sss, const BandingStorage& bs) {
   using CoeffRow = typename BandingStorage::CoeffRow;
   using Index = typename BandingStorage::Index;
   using ResultRow = typename BandingStorage::ResultRow;
@@ -815,7 +815,7 @@ template <typename SimpleSolutionStorage>
 typename SimpleSolutionStorage::ResultRow SimpleQueryHelper(
     typename SimpleSolutionStorage::Index start_slot,
     typename SimpleSolutionStorage::CoeffRow cr,
-    const SimpleSolutionStorage &sss) {
+    const SimpleSolutionStorage& sss) {
   using CoeffRow = typename SimpleSolutionStorage::CoeffRow;
   using ResultRow = typename SimpleSolutionStorage::ResultRow;
 
@@ -833,8 +833,8 @@ typename SimpleSolutionStorage::ResultRow SimpleQueryHelper(
 // General PHSF query a key from SimpleSolutionStorage.
 template <typename SimpleSolutionStorage, typename PhsfQueryHasher>
 typename SimpleSolutionStorage::ResultRow SimplePhsfQuery(
-    const typename PhsfQueryHasher::Key &key, const PhsfQueryHasher &hasher,
-    const SimpleSolutionStorage &sss) {
+    const typename PhsfQueryHasher::Key& key, const PhsfQueryHasher& hasher,
+    const SimpleSolutionStorage& sss) {
   const typename PhsfQueryHasher::Hash hash = hasher.GetHash(key);
 
   static_assert(sizeof(typename SimpleSolutionStorage::Index) ==
@@ -850,9 +850,9 @@ typename SimpleSolutionStorage::ResultRow SimplePhsfQuery(
 
 // Filter query a key from SimpleSolutionStorage.
 template <typename SimpleSolutionStorage, typename FilterQueryHasher>
-bool SimpleFilterQuery(const typename FilterQueryHasher::Key &key,
-                       const FilterQueryHasher &hasher,
-                       const SimpleSolutionStorage &sss) {
+bool SimpleFilterQuery(const typename FilterQueryHasher::Key& key,
+                       const FilterQueryHasher& hasher,
+                       const SimpleSolutionStorage& sss) {
   const typename FilterQueryHasher::Hash hash = hasher.GetHash(key);
   const typename SimpleSolutionStorage::ResultRow expected =
       hasher.GetResultRowFromHash(hash);
@@ -968,9 +968,9 @@ bool SimpleFilterQuery(const typename FilterQueryHasher::Key &key,
 
 // A helper for InterleavedBackSubst.
 template <typename BandingStorage>
-inline void BackSubstBlock(typename BandingStorage::CoeffRow *state,
+inline void BackSubstBlock(typename BandingStorage::CoeffRow* state,
                            typename BandingStorage::Index num_columns,
-                           const BandingStorage &bs,
+                           const BandingStorage& bs,
                            typename BandingStorage::Index start_slot) {
   using CoeffRow = typename BandingStorage::CoeffRow;
   using Index = typename BandingStorage::Index;
@@ -1004,8 +1004,8 @@ inline void BackSubstBlock(typename BandingStorage::CoeffRow *state,
 // Back-substitution for generating a solution from BandingStorage to
 // InterleavedSolutionStorage.
 template <typename InterleavedSolutionStorage, typename BandingStorage>
-void InterleavedBackSubst(InterleavedSolutionStorage *iss,
-                          const BandingStorage &bs) {
+void InterleavedBackSubst(InterleavedSolutionStorage* iss,
+                          const BandingStorage& bs) {
   using CoeffRow = typename BandingStorage::CoeffRow;
   using Index = typename BandingStorage::Index;
 
@@ -1084,12 +1084,12 @@ void InterleavedBackSubst(InterleavedSolutionStorage *iss,
 // Prefetch memory for a key in InterleavedSolutionStorage.
 template <typename InterleavedSolutionStorage, typename PhsfQueryHasher>
 inline void InterleavedPrepareQuery(
-    const typename PhsfQueryHasher::Key &key, const PhsfQueryHasher &hasher,
-    const InterleavedSolutionStorage &iss,
-    typename PhsfQueryHasher::Hash *saved_hash,
-    typename InterleavedSolutionStorage::Index *saved_segment_num,
-    typename InterleavedSolutionStorage::Index *saved_num_columns,
-    typename InterleavedSolutionStorage::Index *saved_start_bit) {
+    const typename PhsfQueryHasher::Key& key, const PhsfQueryHasher& hasher,
+    const InterleavedSolutionStorage& iss,
+    typename PhsfQueryHasher::Hash* saved_hash,
+    typename InterleavedSolutionStorage::Index* saved_segment_num,
+    typename InterleavedSolutionStorage::Index* saved_num_columns,
+    typename InterleavedSolutionStorage::Index* saved_start_bit) {
   using Hash = typename PhsfQueryHasher::Hash;
   using CoeffRow = typename InterleavedSolutionStorage::CoeffRow;
   using Index = typename InterleavedSolutionStorage::Index;
@@ -1131,7 +1131,7 @@ inline typename InterleavedSolutionStorage::ResultRow InterleavedPhsfQuery(
     typename InterleavedSolutionStorage::Index segment_num,
     typename InterleavedSolutionStorage::Index num_columns,
     typename InterleavedSolutionStorage::Index start_bit,
-    const PhsfQueryHasher &hasher, const InterleavedSolutionStorage &iss) {
+    const PhsfQueryHasher& hasher, const InterleavedSolutionStorage& iss) {
   using CoeffRow = typename InterleavedSolutionStorage::CoeffRow;
   using Index = typename InterleavedSolutionStorage::Index;
   using ResultRow = typename InterleavedSolutionStorage::ResultRow;
@@ -1170,7 +1170,7 @@ inline bool InterleavedFilterQuery(
     typename InterleavedSolutionStorage::Index segment_num,
     typename InterleavedSolutionStorage::Index num_columns,
     typename InterleavedSolutionStorage::Index start_bit,
-    const FilterQueryHasher &hasher, const InterleavedSolutionStorage &iss) {
+    const FilterQueryHasher& hasher, const InterleavedSolutionStorage& iss) {
   using CoeffRow = typename InterleavedSolutionStorage::CoeffRow;
   using Index = typename InterleavedSolutionStorage::Index;
   using ResultRow = typename InterleavedSolutionStorage::ResultRow;
diff --git a/utilities/cassandra/test_utils.cc b/utilities/cassandra/test_utils.cc
index 3615813500a8..a596ea98869b 100644
--- a/utilities/cassandra/test_utils.cc
+++ b/utilities/cassandra/test_utils.cc
@@ -51,7 +51,7 @@ RowValue CreateRowTombstone(int64_t timestamp) {
 }
 
 void VerifyRowValueColumns(
-    const std::vector<std::shared_ptr<ColumnBase>> &columns,
+    const std::vector<std::shared_ptr<ColumnBase>>& columns,
     std::size_t index_of_vector, int8_t expected_mask, int8_t expected_index,
     int64_t expected_timestamp) {
   EXPECT_EQ(expected_timestamp, columns[index_of_vector]->Timestamp());
diff --git a/utilities/cassandra/test_utils.h b/utilities/cassandra/test_utils.h
index be23f707606f..1fd789c27e8f 100644
--- a/utilities/cassandra/test_utils.h
+++ b/utilities/cassandra/test_utils.h
@@ -32,7 +32,7 @@ RowValue CreateTestRowValue(
 RowValue CreateRowTombstone(int64_t timestamp);
 
 void VerifyRowValueColumns(
-    const std::vector<std::shared_ptr<ColumnBase>> &columns,
+    const std::vector<std::shared_ptr<ColumnBase>>& columns,
     std::size_t index_of_vector, int8_t expected_mask, int8_t expected_index,
     int64_t expected_timestamp);
 
diff --git a/utilities/object_registry.cc b/utilities/object_registry.cc
index 105d52bf5af3..2b9e4d85aa29 100644
--- a/utilities/object_registry.cc
+++ b/utilities/object_registry.cc
@@ -15,7 +15,7 @@
 
 namespace ROCKSDB_NAMESPACE {
 namespace {
-bool MatchesInteger(const std::string &target, size_t start, size_t pos) {
+bool MatchesInteger(const std::string& target, size_t start, size_t pos) {
   // If it is numeric, everything up to the match must be a number
   int digits = 0;
   if (target[start] == '-') {
@@ -31,7 +31,7 @@ bool MatchesInteger(const std::string &target, size_t start, size_t pos) {
   return (digits > 0);
 }
 
-bool MatchesDecimal(const std::string &target, size_t start, size_t pos) {
+bool MatchesDecimal(const std::string& target, size_t start, size_t pos) {
   int digits = 0;
   if (target[start] == '-') {
     start++;  // Allow negative numbers
@@ -54,8 +54,8 @@ bool MatchesDecimal(const std::string &target, size_t start, size_t pos) {
 }  // namespace
 
 size_t ObjectLibrary::PatternEntry::MatchSeparatorAt(
-    size_t start, Quantifier mode, const std::string &target, size_t tlen,
-    const std::string &separator) const {
+    size_t start, Quantifier mode, const std::string& target, size_t tlen,
+    const std::string& separator) const {
   size_t slen = separator.size();
   // See if there is enough space.  If so, find the separator
   if (tlen < start + slen) {
@@ -87,9 +87,9 @@ size_t ObjectLibrary::PatternEntry::MatchSeparatorAt(
   }
 }
 
-bool ObjectLibrary::PatternEntry::MatchesTarget(const std::string &name,
+bool ObjectLibrary::PatternEntry::MatchesTarget(const std::string& name,
                                                 size_t nlen,
-                                                const std::string &target,
+                                                const std::string& target,
                                                 size_t tlen) const {
   if (separators_.empty()) {
     assert(optional_);  // If there are no separators, it must be only a name
@@ -109,7 +109,7 @@ bool ObjectLibrary::PatternEntry::MatchesTarget(const std::string &name,
     size_t start = nlen;
     auto mode = kMatchExact;
     for (size_t idx = 0; idx < separators_.size(); ++idx) {
-      const auto &separator = separators_[idx];
+      const auto& separator = separators_[idx];
       start = MatchSeparatorAt(start, mode, target, tlen, separator.first);
       if (start == std::string::npos) {
         return false;
@@ -132,12 +132,12 @@ bool ObjectLibrary::PatternEntry::MatchesTarget(const std::string &name,
   return true;
 }
 
-bool ObjectLibrary::PatternEntry::Matches(const std::string &target) const {
+bool ObjectLibrary::PatternEntry::Matches(const std::string& target) const {
   auto tlen = target.size();
   if (MatchesTarget(name_, nlength_, target, tlen)) {
     return true;
   } else if (!names_.empty()) {
-    for (const auto &alt : names_) {
+    for (const auto& alt : names_) {
       if (MatchesTarget(alt, alt.size(), target, tlen)) {
         return true;
       }
@@ -146,17 +146,17 @@ bool ObjectLibrary::PatternEntry::Matches(const std::string &target) const {
   return false;
 }
 
-size_t ObjectLibrary::GetFactoryCount(size_t *types) const {
+size_t ObjectLibrary::GetFactoryCount(size_t* types) const {
   std::unique_lock<std::mutex> lock(mu_);
   *types = factories_.size();
   size_t factories = 0;
-  for (const auto &e : factories_) {
+  for (const auto& e : factories_) {
     factories += e.second.size();
   }
   return factories;
 }
 
-size_t ObjectLibrary::GetFactoryCount(const std::string &type) const {
+size_t ObjectLibrary::GetFactoryCount(const std::string& type) const {
   std::unique_lock<std::mutex> lock(mu_);
   auto iter = factories_.find(type);
   if (iter != factories_.end()) {
@@ -166,36 +166,36 @@ size_t ObjectLibrary::GetFactoryCount(const std::string &type) const {
   }
 }
 
-void ObjectLibrary::GetFactoryNames(const std::string &type,
-                                    std::vector<std::string> *names) const {
+void ObjectLibrary::GetFactoryNames(const std::string& type,
+                                    std::vector<std::string>* names) const {
   assert(names);
   std::unique_lock<std::mutex> lock(mu_);
   auto iter = factories_.find(type);
   if (iter != factories_.end()) {
-    for (const auto &f : iter->second) {
+    for (const auto& f : iter->second) {
       names->push_back(f->Name());
     }
   }
 }
 
 void ObjectLibrary::GetFactoryTypes(
-    std::unordered_set<std::string> *types) const {
+    std::unordered_set<std::string>* types) const {
   assert(types);
   std::unique_lock<std::mutex> lock(mu_);
-  for (const auto &iter : factories_) {
+  for (const auto& iter : factories_) {
     types->insert(iter.first);
   }
 }
 
-void ObjectLibrary::Dump(Logger *logger) const {
+void ObjectLibrary::Dump(Logger* logger) const {
   std::unique_lock<std::mutex> lock(mu_);
   if (logger != nullptr && !factories_.empty()) {
     ROCKS_LOG_HEADER(logger, "    Registered Library: %s\n", id_.c_str());
-    for (const auto &iter : factories_) {
+    for (const auto& iter : factories_) {
       ROCKS_LOG_HEADER(logger, "    Registered factories for type[%s] ",
                        iter.first.c_str());
       bool printed_one = false;
-      for (const auto &e : iter.second) {
+      for (const auto& e : iter.second) {
         ROCKS_LOG_HEADER(logger, "%c %s", (printed_one) ? ',' : ':', e->Name());
         printed_one = true;
       }
@@ -205,7 +205,7 @@ void ObjectLibrary::Dump(Logger *logger) const {
 
 // Returns the Default singleton instance of the ObjectLibrary
 // This instance will contain most of the "standard" registered objects
-std::shared_ptr<ObjectLibrary> &ObjectLibrary::Default() {
+std::shared_ptr<ObjectLibrary>& ObjectLibrary::Default() {
   // Use avoid destruction here so the default ObjectLibrary will not be
   // statically destroyed and long-lived.
   STATIC_AVOID_DESTRUCTION(std::shared_ptr<ObjectLibrary>, instance)
@@ -213,9 +213,9 @@ std::shared_ptr<ObjectLibrary> &ObjectLibrary::Default() {
   return instance;
 }
 
-ObjectRegistry::ObjectRegistry(const std::shared_ptr<ObjectLibrary> &library) {
+ObjectRegistry::ObjectRegistry(const std::shared_ptr<ObjectLibrary>& library) {
   libraries_.push_back(library);
-  for (const auto &b : builtins_) {
+  for (const auto& b : builtins_) {
     RegisterPlugin(b.first, b.second);
   }
 }
@@ -233,13 +233,13 @@ std::shared_ptr<ObjectRegistry> ObjectRegistry::NewInstance() {
 }
 
 std::shared_ptr<ObjectRegistry> ObjectRegistry::NewInstance(
-    const std::shared_ptr<ObjectRegistry> &parent) {
+    const std::shared_ptr<ObjectRegistry>& parent) {
   return std::make_shared<ObjectRegistry>(parent);
 }
 
 Status ObjectRegistry::SetManagedObject(
-    const std::string &type, const std::string &id,
-    const std::shared_ptr<Customizable> &object) {
+    const std::string& type, const std::string& id,
+    const std::shared_ptr<Customizable>& object) {
   std::string object_key = ToManagedObjectKey(type, id);
   std::shared_ptr<Customizable> curr;
   if (parent_ != nullptr) {
@@ -267,7 +267,7 @@ Status ObjectRegistry::SetManagedObject(
 }
 
 std::shared_ptr<Customizable> ObjectRegistry::GetManagedObject(
-    const std::string &type, const std::string &id) const {
+    const std::string& type, const std::string& id) const {
   {
     std::unique_lock<std::mutex> lock(objects_mutex_);
     auto iter = managed_objects_.find(ToManagedObjectKey(type, id));
@@ -283,8 +283,8 @@ std::shared_ptr<Customizable> ObjectRegistry::GetManagedObject(
 }
 
 Status ObjectRegistry::ListManagedObjects(
-    const std::string &type, const std::string &name,
-    std::vector<std::shared_ptr<Customizable>> *results) const {
+    const std::string& type, const std::string& name,
+    std::vector<std::shared_ptr<Customizable>>* results) const {
   {
     std::string key = ToManagedObjectKey(type, name);
     std::unique_lock<std::mutex> lock(objects_mutex_);
@@ -309,50 +309,50 @@ Status ObjectRegistry::ListManagedObjects(
 // Returns the number of registered types for this registry.
 // If specified (not-null), types is updated to include the names of the
 // registered types.
-size_t ObjectRegistry::GetFactoryCount(const std::string &type) const {
+size_t ObjectRegistry::GetFactoryCount(const std::string& type) const {
   size_t count = 0;
   if (parent_ != nullptr) {
     count = parent_->GetFactoryCount(type);
   }
   std::unique_lock<std::mutex> lock(library_mutex_);
-  for (const auto &library : libraries_) {
+  for (const auto& library : libraries_) {
     count += library->GetFactoryCount(type);
   }
   return count;
 }
 
-void ObjectRegistry::GetFactoryNames(const std::string &type,
-                                     std::vector<std::string> *names) const {
+void ObjectRegistry::GetFactoryNames(const std::string& type,
+                                     std::vector<std::string>* names) const {
   assert(names);
   names->clear();
   if (parent_ != nullptr) {
     parent_->GetFactoryNames(type, names);
   }
   std::unique_lock<std::mutex> lock(library_mutex_);
-  for (const auto &library : libraries_) {
+  for (const auto& library : libraries_) {
     library->GetFactoryNames(type, names);
   }
 }
 
 void ObjectRegistry::GetFactoryTypes(
-    std::unordered_set<std::string> *types) const {
+    std::unordered_set<std::string>* types) const {
   assert(types);
   if (parent_ != nullptr) {
     parent_->GetFactoryTypes(types);
   }
   std::unique_lock<std::mutex> lock(library_mutex_);
-  for (const auto &library : libraries_) {
+  for (const auto& library : libraries_) {
     library->GetFactoryTypes(types);
   }
 }
 
-void ObjectRegistry::Dump(Logger *logger) const {
+void ObjectRegistry::Dump(Logger* logger) const {
   if (logger != nullptr) {
     std::unique_lock<std::mutex> lock(library_mutex_);
     if (!plugins_.empty()) {
       ROCKS_LOG_HEADER(logger, "    Registered Plugins:");
       bool printed_one = false;
-      for (const auto &plugin : plugins_) {
+      for (const auto& plugin : plugins_) {
         ROCKS_LOG_HEADER(logger, "%s%s", (printed_one) ? ", " : " ",
                          plugin.c_str());
         printed_one = true;
@@ -368,8 +368,8 @@ void ObjectRegistry::Dump(Logger *logger) const {
   }
 }
 
-int ObjectRegistry::RegisterPlugin(const std::string &name,
-                                   const RegistrarFunc &func) {
+int ObjectRegistry::RegisterPlugin(const std::string& name,
+                                   const RegistrarFunc& func) {
   if (!name.empty() && func != nullptr) {
     plugins_.push_back(name);
     return AddLibrary(name)->Register(func, name);
diff --git a/utilities/write_batch_with_index/write_batch_with_index_internal.h b/utilities/write_batch_with_index/write_batch_with_index_internal.h
index 6871a922ae5f..293d5289cb35 100644
--- a/utilities/write_batch_with_index/write_batch_with_index_internal.h
+++ b/utilities/write_batch_with_index/write_batch_with_index_internal.h
@@ -165,9 +165,9 @@ struct WriteBatchIndexEntry {
   uint32_t column_family;  // column family of the entry.
   // The following three fields are only maintained when the WBWI is created
   // with overwrite_key = true.
-  uint32_t update_count;   // The number of updates (1-based) for this key up to
-                           // this entry.
-  bool has_single_del;     // whether single del was issued for this key
+  uint32_t update_count;  // The number of updates (1-based) for this key up to
+                          // this entry.
+  bool has_single_del;    // whether single del was issued for this key
   bool has_overwritten_single_del;  // whether a single del for this key was
                                     // overwritten by another key
   // The following two fields are used when search_key is null.

From 9f4751867658ae850254ec9d844b2e4325c8ae14 Mon Sep 17 00:00:00 2001
From: Josh Kang <jkangs@meta.com>
Date: Fri, 13 Feb 2026 17:15:10 -0800
Subject: [PATCH 475/500] Add interpolation search as an alternative to binary
 search (#14247)

Summary:
Interpolation search is an alternative algorithm to binary search, which performs better on uniformly distributed keys. Instead of binary search always computing the mid point of the left and right boundaries, interpolation search "interpolates" the mid point based on the distance to the target. Fortunately, we can re-use existing block format to support interpolation search.

For a given block, we compute the shared_prefix length of the first and last key. Interpolation search is usually done with numerical target values, so for a variable binary length key, we calculate the "value" as the first 8 non-shared bytes. This also means interpolation search would only really be effective for bytewise comparator (guarded via options validations).

#### Fallback to binary search
- if the the val(left_key) == val(right_key) then we fallback to classic binary search (to avoid divide by 0)
- interpolation search is significantly more computationally expensive than binary search, so when the search distance is small, we also fallback to binary search.
- if interpolation search does not make significant progress (i.e. reduces search space by more than half each iteration), we can assume data is non-uniform and fallback.

Interpolation search also performs best when there is minimal shortening, especially shortening of the last block, as it can heavily skew the distribution of the actual keys.

Note that each search algorithm is guaranteed to make progress because at each iteration the search space is guaranteed to be reduce by at least 1.

For now this change only applies to index block seeks, as data block seeks and other blocks do not have as many entries and would not require significant number of search rounds, but it could be easily extended to include that support.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14247

Test Plan:
Updated unit tests and crash test with new search option

### Benchmark
The default benchmark sets up keys in generally uniform distribution, so it was a good way to test performance improvements.

Setup: `./db_bench -benchmarks=fillseq,compact -index_shortening_mode=1`

#### Before this change
```
./db_bench -use_existing_db=true -benchmarks=readrandom -seed=1

readrandom   :       2.899 micros/op 344973 ops/sec 2.899 seconds 1000000 operations;   38.2 MB/s (1000000 of 1000000 found)
```

#### After this change

Notice how key comparison counts are the same between the two.
```
./db_bench -use_existing_db=true -benchmarks=readrandom -seed=1 -index_search_type=binary_search

readrandom   :       2.881 micros/op 347128 ops/sec 2.881 seconds 1000000 operations;   38.4 MB/s (1000000 of 1000000 found)
```

```
./db_bench -use_existing_db=true -benchmarks=readrandom -seed=1 -index_search_type=interpolation_search

readrandom   :       2.609 micros/op 383209 ops/sec 2.610 seconds 1000000 operations;   42.4 MB/s (1000000 of 1000000 found)
```

With a non-uniform distribution, `i.e. index_shortening_mode=2`

```
./db_bench -use_existing_db=true -benchmarks=readrandom -seed=1 -index_search_type=binary_search

readrandom   :       2.958 micros/op 338075 ops/sec 2.958 seconds 1000000 operations;   37.4 MB/s (1000000 of 1000000 found)
```

```
./db_bench -use_existing_db=true -benchmarks=readrandom -seed=1 -index_search_type=interpolation_search

readrandom   :       5.502 micros/op 181750 ops/sec 5.502 seconds 1000000 operations;   20.1 MB/s (1000000 of 1000000 found)
```

Reviewed By: pdillinger

Differential Revision: D91063163

Pulled By: joshkang97

fbshipit-source-id: 151d6aa76f8713740b714de6e406aff40d28ccbc
---
 db/c.cc                                       |   6 +
 db_stress_tool/db_stress_common.h             |   1 +
 db_stress_tool/db_stress_gflags.cc            |   6 +
 db_stress_tool/db_stress_test_base.cc         |   3 +
 include/rocksdb/c.h                           |   7 +
 include/rocksdb/table.h                       |  15 +
 java/CMakeLists.txt                           |   1 +
 java/rocksjni/portal.h                        |  42 +-
 java/rocksjni/table.cc                        |   8 +-
 .../org/rocksdb/BlockBasedTableConfig.java    |  30 +-
 .../java/org/rocksdb/IndexSearchType.java     |  34 ++
 .../rocksdb/BlockBasedTableConfigTest.java    |  12 +
 options/options_settable_test.cc              |   1 +
 .../block_based/binary_search_index_reader.cc |   3 +-
 table/block_based/block.cc                    | 363 ++++++++++++++++--
 table/block_based/block.h                     |  69 +++-
 .../block_based/block_based_table_factory.cc  |  19 +
 table/block_based/block_based_table_reader.cc |   3 +-
 table/block_based/block_test.cc               |  93 +++--
 table/table_test.cc                           |  11 +-
 tools/db_bench_tool.cc                        |  15 +
 tools/db_crashtest.py                         |   1 +
 .../new_features/interpolation_search         |   1 +
 23 files changed, 651 insertions(+), 93 deletions(-)
 create mode 100644 java/src/main/java/org/rocksdb/IndexSearchType.java
 create mode 100644 unreleased_history/new_features/interpolation_search

diff --git a/db/c.cc b/db/c.cc
index 9a9c0c0f9aa4..2ae26e75b7b3 100644
--- a/db/c.cc
+++ b/db/c.cc
@@ -3757,6 +3757,12 @@ void rocksdb_block_based_options_set_data_block_index_type(
       static_cast<BlockBasedTableOptions::DataBlockIndexType>(v);
 }
 
+void rocksdb_block_based_options_set_index_block_search_type(
+    rocksdb_block_based_table_options_t* options, int v) {
+  options->rep.index_block_search_type =
+      static_cast<BlockBasedTableOptions::BlockSearchType>(v);
+}
+
 void rocksdb_block_based_options_set_data_block_hash_ratio(
     rocksdb_block_based_table_options_t* options, double v) {
   options->rep.data_block_hash_table_util_ratio = v;
diff --git a/db_stress_tool/db_stress_common.h b/db_stress_tool/db_stress_common.h
index 5ec396235283..666751d95a55 100644
--- a/db_stress_tool/db_stress_common.h
+++ b/db_stress_tool/db_stress_common.h
@@ -175,6 +175,7 @@ DECLARE_uint32(sqfc_version);
 DECLARE_bool(use_sqfc_for_range_queries);
 DECLARE_int32(index_type);
 DECLARE_int32(data_block_index_type);
+DECLARE_int32(index_block_search_type);
 DECLARE_string(db);
 DECLARE_string(secondaries_base);
 DECLARE_bool(test_secondary);
diff --git a/db_stress_tool/db_stress_gflags.cc b/db_stress_tool/db_stress_gflags.cc
index eb2e7a7ca9a1..27918aa0230f 100644
--- a/db_stress_tool/db_stress_gflags.cc
+++ b/db_stress_tool/db_stress_gflags.cc
@@ -596,6 +596,12 @@ DEFINE_int32(
         ROCKSDB_NAMESPACE::BlockBasedTableOptions().data_block_index_type),
     "Index type for data blocks (see `enum DataBlockIndexType` in table.h)");
 
+DEFINE_int32(index_block_search_type,
+             static_cast<int32_t>(ROCKSDB_NAMESPACE::BlockBasedTableOptions()
+                                      .index_block_search_type),
+             "Search algorithm for index blocks (see `enum BlockSearchType` in "
+             "table.h)");
+
 DEFINE_string(db, "", "Use the db with the following name.");
 
 DEFINE_string(secondaries_base, "",
diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc
index 8f3737975501..2883d990be3c 100644
--- a/db_stress_tool/db_stress_test_base.cc
+++ b/db_stress_tool/db_stress_test_base.cc
@@ -4325,6 +4325,9 @@ void InitializeOptionsFromFlags(
   block_based_options.data_block_index_type =
       static_cast<BlockBasedTableOptions::DataBlockIndexType>(
           FLAGS_data_block_index_type);
+  block_based_options.index_block_search_type =
+      static_cast<BlockBasedTableOptions::BlockSearchType>(
+          FLAGS_index_block_search_type);
   block_based_options.prepopulate_block_cache =
       static_cast<BlockBasedTableOptions::PrepopulateBlockCache>(
           FLAGS_prepopulate_block_cache);
diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h
index 55e03ea96937..d12b68d9dbea 100644
--- a/include/rocksdb/c.h
+++ b/include/rocksdb/c.h
@@ -1209,6 +1209,13 @@ enum {
 extern ROCKSDB_LIBRARY_API void
 rocksdb_block_based_options_set_data_block_index_type(
     rocksdb_block_based_table_options_t*, int);  // uses one of the above enums
+enum {
+  rocksdb_block_based_table_index_block_search_type_binary = 0,
+  rocksdb_block_based_table_index_block_search_type_interpolation = 1,
+};
+extern ROCKSDB_LIBRARY_API void
+rocksdb_block_based_options_set_index_block_search_type(
+    rocksdb_block_based_table_options_t*, int);  // uses one of the above enums
 extern ROCKSDB_LIBRARY_API void
 rocksdb_block_based_options_set_data_block_hash_ratio(
     rocksdb_block_based_table_options_t* options, double v);
diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h
index 9be34a0284e3..3485c41f8079 100644
--- a/include/rocksdb/table.h
+++ b/include/rocksdb/table.h
@@ -263,6 +263,21 @@ struct BlockBasedTableOptions {
 
   IndexType index_type = kBinarySearch;
 
+  // The search algorithm used when seeking to entries in the index block.
+  enum BlockSearchType : char {
+    // Standard binary search
+    kBinary = 0x00,
+    // Interpolation search, which may be better suited for uniformly
+    // distributed keys. This will only be applicable if the comparator is the
+    // byte-wise comparator. Avoid using
+    // IndexShorteningMode::kShortenSeparatorsAndSuccessor as shortening the
+    // succesor can skew the end key and make interpolation search significantly
+    // less performant.
+    kInterpolation = 0x01,
+  };
+
+  BlockSearchType index_block_search_type = kBinary;
+
   // The index type that will be used for the data block.
   enum DataBlockIndexType : char {
     kDataBlockBinarySearch = 0,   // traditional block type
diff --git a/java/CMakeLists.txt b/java/CMakeLists.txt
index ffc374102699..5dd7be6cd1e4 100644
--- a/java/CMakeLists.txt
+++ b/java/CMakeLists.txt
@@ -182,6 +182,7 @@ set(JAVA_MAIN_CLASSES
   src/main/java/org/rocksdb/HyperClockCache.java
   src/main/java/org/rocksdb/ImportColumnFamilyOptions.java
   src/main/java/org/rocksdb/IndexShorteningMode.java
+  src/main/java/org/rocksdb/IndexSearchType.java
   src/main/java/org/rocksdb/IndexType.java
   src/main/java/org/rocksdb/InfoLogLevel.java
   src/main/java/org/rocksdb/IngestExternalFileOptions.java
diff --git a/java/rocksjni/portal.h b/java/rocksjni/portal.h
index c0adc5eb9f49..9600a736573a 100644
--- a/java/rocksjni/portal.h
+++ b/java/rocksjni/portal.h
@@ -7016,6 +7016,44 @@ class DataBlockIndexTypeJni {
   }
 };
 
+// The portal class for org.rocksdb.IndexSearchType
+class IndexSearchTypeJni {
+ public:
+  // Returns the equivalent org.rocksdb.IndexSearchType for the provided
+  // C++ ROCKSDB_NAMESPACE::BlockSearchType enum
+  static jbyte toJavaIndexSearchType(
+      const ROCKSDB_NAMESPACE::BlockBasedTableOptions::BlockSearchType&
+          index_block_search_type) {
+    switch (index_block_search_type) {
+      case ROCKSDB_NAMESPACE::BlockBasedTableOptions::BlockSearchType::kBinary:
+        return 0x0;
+      case ROCKSDB_NAMESPACE::BlockBasedTableOptions::BlockSearchType::
+          kInterpolation:
+        return 0x1;
+      default:
+        return 0x7F;  // undefined
+    }
+  }
+
+  // Returns the equivalent C++ ROCKSDB_NAMESPACE::BlockSearchType enum for
+  // the provided Java org.rocksdb.IndexSearchType
+  static ROCKSDB_NAMESPACE::BlockBasedTableOptions::BlockSearchType
+  toCppIndexSearchType(jbyte jindex_search_type) {
+    switch (jindex_search_type) {
+      case 0x0:
+        return ROCKSDB_NAMESPACE::BlockBasedTableOptions::BlockSearchType::
+            kBinary;
+      case 0x1:
+        return ROCKSDB_NAMESPACE::BlockBasedTableOptions::BlockSearchType::
+            kInterpolation;
+      default:
+        // undefined/default
+        return ROCKSDB_NAMESPACE::BlockBasedTableOptions::BlockSearchType::
+            kBinary;
+    }
+  }
+};
+
 // The portal class for org.rocksdb.ChecksumType
 class ChecksumTypeJni {
  public:
@@ -9200,7 +9238,7 @@ class BlockBasedTableOptionsJni
     }
 
     jmethodID method_id_init =
-        env->GetMethodID(jclazz, "<init>", "(ZZZZBBDBZJIIIJZZZZZIIZZJJBBJD)V");
+        env->GetMethodID(jclazz, "<init>", "(ZZZZBBDBZJIIIJZZZZZIIZZJJBBBJD)V");
     if (method_id_init == nullptr) {
       // exception thrown: NoSuchMethodException or OutOfMemoryError
       return nullptr;
@@ -9250,6 +9288,8 @@ class BlockBasedTableOptionsJni
             table_factory_options->super_block_alignment_space_overhead_ratio),
         IndexShorteningModeJni::toJavaIndexShorteningMode(
             table_factory_options->index_shortening),
+        IndexSearchTypeJni::toJavaIndexSearchType(
+            table_factory_options->index_block_search_type),
         FilterPolicyJni::toJavaIndexType(filter_policy_type),
         filter_policy_handle, filter_policy_config_value);
     if (env->ExceptionCheck()) {
diff --git a/java/rocksjni/table.cc b/java/rocksjni/table.cc
index 10747212fd1f..064a5b1a7fac 100644
--- a/java/rocksjni/table.cc
+++ b/java/rocksjni/table.cc
@@ -45,7 +45,7 @@ jlong Java_org_rocksdb_PlainTableConfig_newTableFactoryHandle(
 /*
  * Class:     org_rocksdb_BlockBasedTableConfig
  * Method:    newTableFactoryHandle
- * Signature: (ZZZZBBDBZJJJIIIJZZZJZZIIZZJJBJI)J
+ * Signature: (ZZZZBBDBZJJJIIIJZZZJZZIIZZJJBBJI)J
  */
 jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle(
     JNIEnv*, jclass, jboolean jcache_index_and_filter_blocks,
@@ -65,7 +65,8 @@ jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle(
     jboolean jenable_index_compression, jboolean jblock_align,
     jlong jsuper_block_alignment_size,
     jlong jsuper_block_alignment_space_overhead_ratio, jbyte jindex_shortening,
-    jlong jblock_cache_size, jint jblock_cache_num_shard_bits) {
+    jbyte jindex_search_type, jlong jblock_cache_size,
+    jint jblock_cache_num_shard_bits) {
   ROCKSDB_NAMESPACE::BlockBasedTableOptions options;
   options.cache_index_and_filter_blocks =
       static_cast<bool>(jcache_index_and_filter_blocks);
@@ -144,6 +145,9 @@ jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle(
   options.index_shortening =
       ROCKSDB_NAMESPACE::IndexShorteningModeJni::toCppIndexShorteningMode(
           jindex_shortening);
+  options.index_block_search_type =
+      ROCKSDB_NAMESPACE::IndexSearchTypeJni::toCppIndexSearchType(
+          jindex_search_type);
 
   return GET_CPLUSPLUS_POINTER(
       ROCKSDB_NAMESPACE::NewBlockBasedTableFactory(options));
diff --git a/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java b/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java
index df21d774484d..555f54f3b748 100644
--- a/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java
+++ b/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java
@@ -43,6 +43,7 @@ public BlockBasedTableConfig() {
     superBlockAlignmentSize = 0;
     superBlockAlignmentSpaceOverheadRatio = 128;
     indexShortening = IndexShorteningMode.kShortenSeparators;
+    indexSearchType = IndexSearchType.kBinary;
 
     // NOTE: ONLY used if blockCache == null
     blockCacheSize = 8 * 1024 * 1024;
@@ -64,8 +65,8 @@ private BlockBasedTableConfig(final boolean cacheIndexAndFilterBlocks,
       final boolean verifyCompression, final int readAmpBytesPerBit, final int formatVersion,
       final boolean enableIndexCompression, final boolean blockAlign,
       final long superBlockAlignmentSize, final long superBlockAlignmentSpaceOverheadRatio,
-      final byte indexShortening, final byte filterPolicyType, final long filterPolicyHandle,
-      final double filterPolicyConfigValue) {
+      final byte indexShortening, final byte indexSearchType, final byte filterPolicyType,
+      final long filterPolicyHandle, final double filterPolicyConfigValue) {
     this.cacheIndexAndFilterBlocks = cacheIndexAndFilterBlocks;
     this.cacheIndexAndFilterBlocksWithHighPriority = cacheIndexAndFilterBlocksWithHighPriority;
     this.pinL0FilterAndIndexBlocksInCache = pinL0FilterAndIndexBlocksInCache;
@@ -92,6 +93,7 @@ private BlockBasedTableConfig(final boolean cacheIndexAndFilterBlocks,
     this.superBlockAlignmentSize = superBlockAlignmentSize;
     this.superBlockAlignmentSpaceOverheadRatio = superBlockAlignmentSpaceOverheadRatio;
     this.indexShortening = IndexShorteningMode.values()[indexShortening];
+    this.indexSearchType = IndexSearchType.values()[indexSearchType];
     try (Filter filterPolicy = FilterPolicyType.values()[filterPolicyType].createFilter(
              filterPolicyHandle, filterPolicyConfigValue)) {
       if (filterPolicy != null) {
@@ -871,6 +873,26 @@ public BlockBasedTableConfig setIndexShortening(final IndexShorteningMode indexS
     return this;
   }
 
+  /**
+   * Get the index search type.
+   *
+   * @return the currently set index search type
+   */
+  public IndexSearchType indexSearchType() {
+    return indexSearchType;
+  }
+
+  /**
+   * Sets the index search type to used with this table.
+   *
+   * @param indexSearchType {@link org.rocksdb.IndexSearchType} value
+   * @return the reference to the current option.
+   */
+  public BlockBasedTableConfig setIndexSearchType(final IndexSearchType indexSearchType) {
+    this.indexSearchType = indexSearchType;
+    return this;
+  }
+
   /**
    * Get the size of the cache in bytes that will be used by RocksDB.
    *
@@ -996,7 +1018,7 @@ public BlockBasedTableConfig setHashIndexAllowCollision(
         useDeltaEncoding, filterPolicyHandle, wholeKeyFiltering, verifyCompression,
         readAmpBytesPerBit, formatVersion, enableIndexCompression, blockAlign,
         superBlockAlignmentSize, superBlockAlignmentSpaceOverheadRatio, indexShortening.getValue(),
-        blockCacheSize, blockCacheNumShardBits);
+        indexSearchType.getValue(), blockCacheSize, blockCacheNumShardBits);
   }
 
   private static native long newTableFactoryHandle(final boolean cacheIndexAndFilterBlocks,
@@ -1013,6 +1035,7 @@ private static native long newTableFactoryHandle(final boolean cacheIndexAndFilt
       final int readAmpBytesPerBit, final int formatVersion, final boolean enableIndexCompression,
       final boolean blockAlign, final long superBlockAlignmentSize,
       final long superBlockAlignmentSpaceOverheadRatio, final byte indexShortening,
+      final byte indexSearchType,
 
       @Deprecated final long blockCacheSize, @Deprecated final int blockCacheNumShardBits);
 
@@ -1046,6 +1069,7 @@ private static native long newTableFactoryHandle(final boolean cacheIndexAndFilt
   private long superBlockAlignmentSize;
   private long superBlockAlignmentSpaceOverheadRatio;
   private IndexShorteningMode indexShortening;
+  private IndexSearchType indexSearchType;
 
   // NOTE: ONLY used if blockCache == null
   @Deprecated private long blockCacheSize;
diff --git a/java/src/main/java/org/rocksdb/IndexSearchType.java b/java/src/main/java/org/rocksdb/IndexSearchType.java
new file mode 100644
index 000000000000..55ec0eef3820
--- /dev/null
+++ b/java/src/main/java/org/rocksdb/IndexSearchType.java
@@ -0,0 +1,34 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * BlockSearchType used in conjunction with BlockBasedTable.
+ */
+public enum IndexSearchType {
+  /**
+   * Standard binary search
+   */
+  kBinary((byte) 0x0),
+
+  /**
+   * Interpolation search, which may be better suited for uniformly
+   * distributed keys. Only applicable if the comparator is the
+   * byte-wise comparator.
+   */
+  kInterpolation((byte) 0x1);
+
+  private final byte value;
+
+  IndexSearchType(final byte value) {
+    this.value = value;
+  }
+
+  byte getValue() {
+    return value;
+  }
+}
diff --git a/java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java b/java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java
index be2a7b46ec87..ef904ffe1b54 100644
--- a/java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java
+++ b/java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java
@@ -109,11 +109,13 @@ public void jniPortal() throws Exception {
     tableConfig.setIndexType(IndexType.kBinarySearch);
     tableConfig.setDataBlockIndexType(DataBlockIndexType.kDataBlockBinarySearch);
     tableConfig.setChecksumType(ChecksumType.kNoChecksum);
+    tableConfig.setIndexSearchType(IndexSearchType.kBinary);
     try (final Options options = new Options().setTableFormatConfig(tableConfig)) {
       final String opts = getOptionAsString(options);
       assertThat(opts).contains("index_type=kBinarySearch");
       assertThat(opts).contains("data_block_index_type=kDataBlockBinarySearch");
       assertThat(opts).contains("checksum=kNoChecksum");
+      assertThat(opts).contains("index_block_search_type=kBinary");
     }
 
     tableConfig.setIndexType(IndexType.kHashSearch);
@@ -399,6 +401,16 @@ public void indexShortening() {
         .isEqualTo(IndexShorteningMode.kShortenSeparatorsAndSuccessor);
   }
 
+  @Test
+  public void indexSearchType() {
+    final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig();
+    assertThat(IndexSearchType.values().length).isEqualTo(2);
+    blockBasedTableConfig.setIndexSearchType(IndexSearchType.kInterpolation);
+    assertThat(blockBasedTableConfig.indexSearchType()).isEqualTo(IndexSearchType.kInterpolation);
+    blockBasedTableConfig.setIndexSearchType(IndexSearchType.kBinary);
+    assertThat(blockBasedTableConfig.indexSearchType()).isEqualTo(IndexSearchType.kBinary);
+  }
+
   @Deprecated
   @Test
   public void hashIndexAllowCollision() {
diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc
index 135a4461847c..4bafb6fcca9a 100644
--- a/options/options_settable_test.cc
+++ b/options/options_settable_test.cc
@@ -182,6 +182,7 @@ TEST_F(OptionsSettableTest, BlockBasedTableOptionsAllFieldsSettable) {
       "pin_l0_filter_and_index_blocks_in_cache=1;"
       "pin_top_level_index_and_filter=1;"
       "index_type=kHashSearch;"
+      "index_block_search_type=kBinary;"
       "data_block_index_type=kDataBlockBinaryAndHash;"
       "index_shortening=kNoShortening;"
       "data_block_hash_table_util_ratio=0.75;"
diff --git a/table/block_based/binary_search_index_reader.cc b/table/block_based/binary_search_index_reader.cc
index abe09d86fb3a..940bb261db23 100644
--- a/table/block_based/binary_search_index_reader.cc
+++ b/table/block_based/binary_search_index_reader.cc
@@ -63,7 +63,8 @@ InternalIteratorBase<IndexValue>* BinarySearchIndexReader::NewIterator(
       internal_comparator()->user_comparator(),
       rep->get_global_seqno(BlockType::kIndex), iter, kNullStats, true,
       index_has_first_key(), index_key_includes_seq(), index_value_is_full(),
-      false /* block_contents_pinned */, user_defined_timestamps_persisted());
+      false /* block_contents_pinned */, user_defined_timestamps_persisted(),
+      nullptr /* prefix_index */, rep->table_options.index_block_search_type);
 
   assert(it != nullptr);
   index_block.TransferTo(it);
diff --git a/table/block_based/block.cc b/table/block_based/block.cc
index 7b531f959879..dd57b3df4cf4 100644
--- a/table/block_based/block.cc
+++ b/table/block_based/block.cc
@@ -152,6 +152,25 @@ struct DecodeEntryV4 {
   }
 };
 
+// Read first 8 bytes (starting at offset) as big-endian uint64_t, padding
+// with zeros on the right if the key is shorter. This preserves
+// lexicographic ordering. Non-user keys will also have end internal bytes
+// stripped and not counted for in the value.
+static uint64_t ReadBe64FromKey(Slice s, bool is_user_key, size_t offset) {
+  if (!is_user_key) {
+    assert(s.size() >= kNumInternalBytes);
+    s = Slice(s.data(), s.size() - kNumInternalBytes);
+  }
+  uint64_t val = 0;
+  offset = std::min(offset, s.size());
+  size_t len = std::min(s.size() - offset, size_t{8});
+  for (size_t i = 0; i < len; i++) {
+    val = (val << 8) | static_cast<uint8_t>(s.data()[offset + i]);
+  }
+  val <<= (8 - len) * 8;  // Pad zeros on the right
+  return val;
+}
+
 void DataBlockIter::NextImpl() {
 #ifndef NDEBUG
   if (TEST_Corrupt_Callback("DataBlockIter::NextImpl")) {
@@ -307,7 +326,8 @@ void DataBlockIter::SeekImpl(const Slice& target) {
   }
   uint32_t index = 0;
   bool skip_linear_scan = false;
-  bool ok = BinarySeek<DecodeKey>(seek_key, &index, &skip_linear_scan);
+  bool ok = BinarySeekRestartPointIndex<DecodeKey>(seek_key, &index,
+                                                   &skip_linear_scan);
 
   if (!ok) {
     return;
@@ -323,7 +343,8 @@ void MetaBlockIter::SeekImpl(const Slice& target) {
   }
   uint32_t index = 0;
   bool skip_linear_scan = false;
-  bool ok = BinarySeek<DecodeKey>(seek_key, &index, &skip_linear_scan);
+  bool ok = BinarySeekRestartPointIndex<DecodeKey>(seek_key, &index,
+                                                   &skip_linear_scan);
 
   if (!ok) {
     return;
@@ -494,10 +515,14 @@ void IndexBlockIter::SeekImpl(const Slice& target) {
     // restart interval must be one when hash search is enabled so the binary
     // search simply lands at the right place.
     skip_linear_scan = true;
-  } else if (value_delta_encoded_) {
-    ok = BinarySeek<DecodeKeyV4>(seek_key, &index, &skip_linear_scan);
   } else {
-    ok = BinarySeek<DecodeKey>(seek_key, &index, &skip_linear_scan);
+    if (value_delta_encoded_) {
+      ok = FindRestartPointForSeek<DecodeKeyV4>(seek_key, &index,
+                                                &skip_linear_scan);
+    } else {
+      ok = FindRestartPointForSeek<DecodeKey>(seek_key, &index,
+                                              &skip_linear_scan);
+    }
   }
 
   if (!ok) {
@@ -506,6 +531,18 @@ void IndexBlockIter::SeekImpl(const Slice& target) {
   FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan);
 }
 
+template <typename DecodeKeyFunc>
+bool IndexBlockIter::FindRestartPointForSeek(const Slice& seek_key,
+                                             uint32_t* index,
+                                             bool* skip_linear_scan) {
+  if (index_search_type_ == BlockBasedTableOptions::kBinary) {
+    return BinarySeekRestartPointIndex<DecodeKeyFunc>(seek_key, index,
+                                                      skip_linear_scan);
+  }
+  return InterpolationSeekRestartPointIndex<DecodeKeyFunc>(seek_key, index,
+                                                           skip_linear_scan);
+}
+
 void DataBlockIter::SeekForPrevImpl(const Slice& target) {
   PERF_TIMER_GUARD(block_seek_nanos);
   Slice seek_key = target;
@@ -514,7 +551,8 @@ void DataBlockIter::SeekForPrevImpl(const Slice& target) {
   }
   uint32_t index = 0;
   bool skip_linear_scan = false;
-  bool ok = BinarySeek<DecodeKey>(seek_key, &index, &skip_linear_scan);
+  bool ok = BinarySeekRestartPointIndex<DecodeKey>(seek_key, &index,
+                                                   &skip_linear_scan);
 
   if (!ok) {
     return;
@@ -540,7 +578,8 @@ void MetaBlockIter::SeekForPrevImpl(const Slice& target) {
   }
   uint32_t index = 0;
   bool skip_linear_scan = false;
-  bool ok = BinarySeek<DecodeKey>(seek_key, &index, &skip_linear_scan);
+  bool ok = BinarySeekRestartPointIndex<DecodeKey>(seek_key, &index,
+                                                   &skip_linear_scan);
 
   if (!ok) {
     return;
@@ -816,9 +855,27 @@ void BlockIter<TValue>::FindKeyAfterBinarySeek(const Slice& target,
   }
 }
 
-// Binary searches in restart array to find the starting restart point for the
-// linear scan, and stores it in `*index`. Assumes restart array does not
-// contain duplicate keys. It is guaranteed that the restart key at `*index + 1`
+// Get the key slice at a given restart point index.
+template <class TValue>
+template <typename DecodeKeyFunc>
+bool BlockIter<TValue>::GetRestartKey(uint32_t index, Slice* key) {
+  uint32_t region_offset = GetRestartPoint(index);
+  uint32_t shared, non_shared;
+  const char* key_ptr = DecodeKeyFunc()(
+      data_ + region_offset, data_ + restarts_, &shared, &non_shared);
+  if (key_ptr == nullptr || (shared != 0)) {
+    CorruptionError();
+    return false;
+  }
+  *key = Slice(key_ptr, non_shared);
+  return true;
+}
+
+// Searches in restart array using binary search to find the starting restart
+// point for the linear scan, and stores it in `*index`. Assumes restart array
+// does not contain duplicate keys.
+//
+// It is guaranteed that the restart key at `*index + 1`
 // is strictly greater than `target` or does not exist (this can be used to
 // elide a comparison when linear scan reaches all the way to the next restart
 // key). Furthermore, `*skip_linear_scan` is set to indicate whether the
@@ -826,15 +883,15 @@ void BlockIter<TValue>::FindKeyAfterBinarySeek(const Slice& target,
 // compared again later.
 template <class TValue>
 template <typename DecodeKeyFunc>
-bool BlockIter<TValue>::BinarySeek(const Slice& target, uint32_t* index,
-                                   bool* skip_linear_scan) {
+bool BlockIter<TValue>::BinarySeekRestartPointIndex(const Slice& target,
+                                                    uint32_t* index,
+                                                    bool* skip_linear_scan) {
   if (restarts_ == 0) {
     // SST files dedicated to range tombstones are written with index blocks
     // that have no keys while also having `num_restarts_ == 1`. This would
-    // cause a problem for `BinarySeek()` as it'd try to access the first key
-    // which does not exist. We identify such blocks by the offset at which
-    // their restarts are stored, and return false to prevent any attempted
-    // key accesses.
+    // cause a problem as we'd try to access the first key which does not exist.
+    // We identify such blocks by the offset at which their restarts are stored,
+    // and return false to prevent any attempted key accesses.
     return false;
   }
 
@@ -842,23 +899,25 @@ bool BlockIter<TValue>::BinarySeek(const Slice& target, uint32_t* index,
   // Loop invariants:
   // - Restart key at index `left` is less than or equal to the target key. The
   //   sentinel index `-1` is considered to have a key that is less than all
-  //   keys.
+  //   keys. Doing this allows us to avoid a bounds check on left.
   // - Any restart keys after index `right` are strictly greater than the target
   //   key.
-  int64_t left = -1, right = num_restarts_ - 1;
+  int64_t left = -1;
+  int64_t right = num_restarts_ - 1;
+
   while (left != right) {
     // The `mid` is computed by rounding up so it lands in (`left`, `right`].
     int64_t mid = left + (right - left + 1) / 2;
-    uint32_t region_offset = GetRestartPoint(static_cast<uint32_t>(mid));
-    uint32_t shared, non_shared;
-    const char* key_ptr = DecodeKeyFunc()(
-        data_ + region_offset, data_ + restarts_, &shared, &non_shared);
-    if (key_ptr == nullptr || (shared != 0)) {
-      CorruptionError();
+
+    assert(left < mid && mid <= right);
+
+    Slice mid_key;
+    if (!GetRestartKey<DecodeKeyFunc>(static_cast<uint32_t>(mid), &mid_key)) {
       return false;
     }
-    Slice mid_key(key_ptr, non_shared);
+
     UpdateRawKeyAndMaybePadMinTimestamp(mid_key);
+
     int cmp = CompareCurrentKey(target);
     if (cmp < 0) {
       // Key at "mid" is smaller than "target". Therefore all
@@ -885,22 +944,240 @@ bool BlockIter<TValue>::BinarySeek(const Slice& target, uint32_t* index,
   return true;
 }
 
+// Similar effects to BinarySeekRestartPointIndex, except it uses a different
+// algorithm to search for the restart point index (i.e. interpolation search).
+// Interpolation search is typically more efficient for uniformly distributed
+// datasets.
+//
+// Typically, interpolation search requires an integer "value". But because we
+// are searching through variable length binary slices, we must estimate an
+// integer value for each key. Currently, the value is set to be the first 8
+// bytes (read big-endian) that do not share a prefix with the start and end
+// key. As a side effect, this can really only be used with the
+// BytewiseComparator().
+template <class TValue>
+template <typename DecodeKeyFunc>
+bool BlockIter<TValue>::InterpolationSeekRestartPointIndex(
+    const Slice& target, uint32_t* index, bool* skip_linear_scan) {
+  static constexpr int64_t kGuardLen = 8;
+  static constexpr uint64_t kMaxPoorSearches = 8;
+
+  if (restarts_ == 0) {
+    return false;
+  }
+
+  *skip_linear_scan = false;
+  assert(icmp_.user_comparator() == BytewiseComparator());
+
+  int64_t left = -1;
+  int64_t right = num_restarts_ - 1;
+  int64_t shared_prefix_len = -1;
+
+  Slice left_key;
+  Slice right_key;
+  bool seek_failed = false;
+
+#ifndef NDEBUG
+  // used to validate invariants
+  bool first_iter = true;
+#endif
+
+  // A poor search is when less than half the search space is reduced, because
+  // binary search would do better. When there are kMaxPoorSearches in a row,
+  // then fallback to binary search. This helps bound worse cast performance.
+  uint64_t continuous_poor_searches = 0;
+
+  // Loop invariants while not first iteration AND seek has not failed:
+  // - arr[usable_left] = left_key, arr[right] = right_key
+  // - left < mid <= right, and arr[left] < target < arr[right + 1]
+  //
+  // The first iteration is used as an early optimization to determine initial
+  // bounds, and whether target is within those bounds
+  while (left != right) {
+    int64_t mid = 0;
+
+    // If either search window is small or we've bad numerous bad guesses, then
+    // fallback to binary search
+    seek_failed = (right - left <= kGuardLen) ||
+                  continuous_poor_searches >= kMaxPoorSearches;
+
+    if (!seek_failed) {
+      // Interpolation seek reads left and right boundaries anyways, so we can
+      // set left = 0. The invariant that left <= target is still held because
+      // we early exit if left > target for the first iteration.
+      const auto usable_left = std::max<int64_t>(left, 0);
+
+      // First iteration: decode both boundary keys and compute shared prefix.
+      if (shared_prefix_len < 0) {
+        assert(first_iter);
+        if (!GetRestartKey<DecodeKeyFunc>(static_cast<uint32_t>(usable_left),
+                                          &left_key)) {
+          return false;
+        }
+
+        if (!GetRestartKey<DecodeKeyFunc>(static_cast<uint32_t>(right),
+                                          &right_key)) {
+          return false;
+        }
+
+        // Compute the shared prefix length between smallest index key and
+        // largest index key this can be used to "normalize" the values
+        // calculated during interpolation search.
+        shared_prefix_len =
+            static_cast<int64_t>(left_key.difference_offset(right_key));
+      }
+      assert(shared_prefix_len >= 0);
+
+      size_t spl = static_cast<size_t>(shared_prefix_len);
+      assert(spl <= left_key.size() && spl <= right_key.size());
+      uint64_t left_val = ReadBe64FromKey(left_key, raw_key_.IsUserKey(), spl);
+      uint64_t right_val =
+          ReadBe64FromKey(right_key, raw_key_.IsUserKey(), spl);
+      uint64_t target_val = ReadBe64FromKey(target, raw_key_.IsUserKey(), spl);
+
+      if (left_val > right_val) {
+        CorruptionError("left key is greater than right key");
+        return false;
+      }
+
+      bool lte_left = false;
+      bool gt_right = false;
+
+      if (target_val < left_val) {
+        assert(first_iter);
+        assert(CompareKey(left_key, target) > 0);
+        lte_left = true;
+      } else if (target_val == left_val) {
+        // target_val == left_val doesn't imply target == left_key
+        // because ReadBe64FromKey only reads 8 bytes and skips sequence
+        // numbers. We need to check actual key order.
+        if (CompareKey(left_key, target) >= 0) {
+          assert(first_iter);
+          lte_left = true;
+        }
+      }
+
+      if (!lte_left && !seek_failed) {
+        if (target_val > right_val) {
+          // note that we only ever guarantee arr[target] < arr[right + 1], so
+          // it is possible to end up here even on non-first iteration
+          assert(CompareKey(right_key, target) < 0);
+          gt_right = true;
+        } else if (right_val == left_val) {
+          // cannot divide by 0
+          seek_failed = true;
+        }
+      }
+
+      // early exit if key is not within bounds
+      if (lte_left) {
+        assert(!seek_failed);
+        UpdateRawKeyAndMaybePadMinTimestamp(left_key);
+        assert(CompareCurrentKey(target) >= 0);
+        *skip_linear_scan = true;
+        *index = static_cast<uint32_t>(usable_left);
+        return true;
+      }
+      if (gt_right) {
+        assert(!seek_failed);
+        UpdateRawKeyAndMaybePadMinTimestamp(right_key);
+        assert(CompareCurrentKey(target) < 0);
+        *index = static_cast<uint32_t>(right);
+        return true;
+      }
+
+      if (!seek_failed) {
+#ifdef HAVE_UINT128_EXTENSION
+        __uint128_t range = right - usable_left;
+        __uint128_t target_delta = target_val - left_val;
+        uint64_t range_delta = right_val - left_val;
+        int64_t offset =
+            static_cast<int64_t>(range * target_delta / range_delta);
+#else
+        double ratio = static_cast<double>(target_val - left_val) /
+                       static_cast<double>(right_val - left_val);
+        assert(0 <= ratio && ratio <= 1);
+        int64_t range = right - usable_left;
+        int64_t offset = static_cast<int64_t>(range * ratio);
+#endif
+        left = usable_left;  // can reduce search space by 1
+        mid = usable_left + offset;
+        assert(mid <= right);
+        if (mid == usable_left) {
+          // this is to guarantee progress and avoid infinite loop
+          ++mid;
+        }
+      }
+    }
+
+    if (seek_failed) {
+      // Fallback to binary seek
+      mid = left + (right - left + 1) / 2;
+    }
+
+    assert(left < mid && mid <= right);
+
+    Slice mid_key;
+    if (!GetRestartKey<DecodeKeyFunc>(static_cast<uint32_t>(mid), &mid_key)) {
+      return false;
+    }
+
+    UpdateRawKeyAndMaybePadMinTimestamp(mid_key);
+
+    int cmp = CompareCurrentKey(target);
+
+    int64_t previous_search_space = right - left;
+    if (cmp < 0) {
+      left = mid;
+      left_key = mid_key;
+    } else if (cmp > 0) {
+      right = mid - 1;
+      if (!seek_failed && left != right) {
+        if (!GetRestartKey<DecodeKeyFunc>(static_cast<uint32_t>(right),
+                                          &right_key)) {
+          return false;
+        }
+      }
+    } else {
+      *skip_linear_scan = true;
+      left = right = mid;
+    }
+
+    // If seach space is not reduced by at least half, good chance this data is
+    // not uniform.
+    int64_t new_search_space = right - left;
+    if (new_search_space > previous_search_space / 2) {
+      ++continuous_poor_searches;
+    } else {
+      continuous_poor_searches = 0;
+    }
+
+#ifndef NDEBUG
+    first_iter = false;
+#endif
+  }
+
+  if (left == -1) {
+    // All keys in the block were strictly greater than `target`. So the very
+    // first key in the block is the final seek result.
+    *skip_linear_scan = true;
+    *index = 0;
+  } else {
+    *index = static_cast<uint32_t>(left);
+  }
+  return true;
+}
+
 // Compare target key and the block key of the block of `block_index`.
 // Return -1 if error.
 int IndexBlockIter::CompareBlockKey(uint32_t block_index, const Slice& target) {
-  uint32_t region_offset = GetRestartPoint(block_index);
-  uint32_t shared, non_shared;
-  const char* key_ptr =
-      value_delta_encoded_
-          ? DecodeKeyV4()(data_ + region_offset, data_ + restarts_, &shared,
-                          &non_shared)
-          : DecodeKey()(data_ + region_offset, data_ + restarts_, &shared,
-                        &non_shared);
-  if (key_ptr == nullptr || (shared != 0)) {
-    CorruptionError();
+  Slice block_key;
+  bool ok = value_delta_encoded_
+                ? GetRestartKey<DecodeKeyV4>(block_index, &block_key)
+                : GetRestartKey<DecodeKey>(block_index, &block_key);
+  if (!ok) {
     return 1;  // Return target is smaller
   }
-  Slice block_key(key_ptr, non_shared);
   UpdateRawKeyAndMaybePadMinTimestamp(block_key);
   return CompareCurrentKey(target);
 }
@@ -1293,7 +1570,8 @@ IndexBlockIter* Block::NewIndexIterator(
     IndexBlockIter* iter, Statistics* /*stats*/, bool total_order_seek,
     bool have_first_key, bool key_includes_seq, bool value_is_full,
     bool block_contents_pinned, bool user_defined_timestamps_persisted,
-    BlockPrefixIndex* prefix_index) {
+    BlockPrefixIndex* prefix_index,
+    BlockBasedTableOptions::BlockSearchType index_block_search_type) {
   IndexBlockIter* ret_iter;
   if (iter != nullptr) {
     ret_iter = iter;
@@ -1311,11 +1589,12 @@ IndexBlockIter* Block::NewIndexIterator(
   } else {
     BlockPrefixIndex* prefix_index_ptr =
         total_order_seek ? nullptr : prefix_index;
-    ret_iter->Initialize(
-        raw_ucmp, data(), restart_offset_, num_restarts_, global_seqno,
-        prefix_index_ptr, have_first_key, key_includes_seq, value_is_full,
-        block_contents_pinned, user_defined_timestamps_persisted,
-        protection_bytes_per_key_, kv_checksum_, block_restart_interval_);
+    ret_iter->Initialize(raw_ucmp, data(), restart_offset_, num_restarts_,
+                         global_seqno, prefix_index_ptr, have_first_key,
+                         key_includes_seq, value_is_full, block_contents_pinned,
+                         user_defined_timestamps_persisted,
+                         protection_bytes_per_key_, kv_checksum_,
+                         block_restart_interval_, index_block_search_type);
   }
 
   return ret_iter;
diff --git a/table/block_based/block.h b/table/block_based/block.h
index afd0d302ce76..071dc4a5da49 100644
--- a/table/block_based/block.h
+++ b/table/block_based/block.h
@@ -233,13 +233,19 @@ class Block {
   // It is determined by IndexType property of the table.
   // `user_defined_timestamps_persisted` controls whether a min timestamp is
   // padded while key is being parsed from the block.
+  // `index_block_search_type` controls which search algorithm to use when
+  // reading the index block. kBinary uses binary search, while
+  // kInterpolation uses interpolation search which can be faster
+  // for uniformly distributed keys.
   IndexBlockIter* NewIndexIterator(
       const Comparator* raw_ucmp, SequenceNumber global_seqno,
       IndexBlockIter* iter, Statistics* stats, bool total_order_seek,
       bool have_first_key, bool key_includes_seq, bool value_is_full,
       bool block_contents_pinned = false,
       bool user_defined_timestamps_persisted = true,
-      BlockPrefixIndex* prefix_index = nullptr);
+      BlockPrefixIndex* prefix_index = nullptr,
+      BlockBasedTableOptions::BlockSearchType index_block_search_type =
+          BlockBasedTableOptions::kBinary);
 
   // Report an approximation of how much memory has been used.
   size_t ApproximateMemoryUsage() const;
@@ -616,19 +622,27 @@ class BlockIter : public InternalIteratorBase<TValue> {
     }
   }
 
-  // Returns the result of `Comparator::Compare()`, where the appropriate
-  // comparator is used for the block contents, the LHS argument is the current
-  // key with global seqno applied, and the RHS argument is `other`.
-  int CompareCurrentKey(const Slice& other) {
+  // Compares two keys using the appropriate comparator for the block contents.
+  // Uses user comparator when the block stores user keys, otherwise uses the
+  // internal key comparator. When global_seqno is not disabled, applies it to
+  // the LHS key for comparison.
+  int CompareKey(const Slice& a, const Slice& b) {
     assert(icmp_.user_comparator() != nullptr);
     if (raw_key_.IsUserKey()) {
       assert(global_seqno_ == kDisableGlobalSequenceNumber);
-      return icmp_.user_comparator()->Compare(raw_key_.GetUserKey(), other);
+      return icmp_.user_comparator()->Compare(a, b);
     } else if (global_seqno_ == kDisableGlobalSequenceNumber) {
-      return icmp_.Compare(raw_key_.GetInternalKey(), other);
+      return icmp_.Compare(a, b);
     }
-    return icmp_.Compare(raw_key_.GetInternalKey(), global_seqno_, other,
-                         kDisableGlobalSequenceNumber);
+    return icmp_.Compare(a, global_seqno_, b, kDisableGlobalSequenceNumber);
+  }
+
+  // Compares the current key (with global seqno applied) against `other`.
+  int CompareCurrentKey(const Slice& other) {
+    if (raw_key_.IsUserKey()) {
+      return CompareKey(raw_key_.GetUserKey(), other);
+    }
+    return CompareKey(raw_key_.GetInternalKey(), other);
   }
 
  private:
@@ -663,8 +677,16 @@ class BlockIter : public InternalIteratorBase<TValue> {
 
  protected:
   template <typename DecodeKeyFunc>
-  inline bool BinarySeek(const Slice& target, uint32_t* index,
-                         bool* is_index_key_result);
+  inline bool GetRestartKey(uint32_t index, Slice* key);
+
+  template <typename DecodeKeyFunc>
+  inline bool BinarySeekRestartPointIndex(const Slice& target, uint32_t* index,
+                                          bool* is_index_key_result);
+
+  template <typename DecodeKeyFunc>
+  inline bool InterpolationSeekRestartPointIndex(const Slice& target,
+                                                 uint32_t* index,
+                                                 bool* is_index_key_result);
 
   // Find the first key in restart interval `index` that is >= `target`.
   // If there is no such key, iterator is positioned at the first key in
@@ -828,14 +850,14 @@ class IndexBlockIter final : public BlockIter<IndexValue> {
   // format.
   // value_is_full, default true, means that no delta encoding is
   // applied to values.
-  void Initialize(const Comparator* raw_ucmp, const char* data,
-                  uint32_t restarts, uint32_t num_restarts,
-                  SequenceNumber global_seqno, BlockPrefixIndex* prefix_index,
-                  bool have_first_key, bool key_includes_seq,
-                  bool value_is_full, bool block_contents_pinned,
-                  bool user_defined_timestamps_persisted,
-                  uint8_t protection_bytes_per_key, const char* kv_checksum,
-                  uint32_t block_restart_interval) {
+  void Initialize(
+      const Comparator* raw_ucmp, const char* data, uint32_t restarts,
+      uint32_t num_restarts, SequenceNumber global_seqno,
+      BlockPrefixIndex* prefix_index, bool have_first_key,
+      bool key_includes_seq, bool value_is_full, bool block_contents_pinned,
+      bool user_defined_timestamps_persisted, uint8_t protection_bytes_per_key,
+      const char* kv_checksum, uint32_t block_restart_interval,
+      BlockBasedTableOptions::BlockSearchType index_block_search_type) {
     InitializeBase(raw_ucmp, data, restarts, num_restarts,
                    kDisableGlobalSequenceNumber, block_contents_pinned,
                    user_defined_timestamps_persisted, protection_bytes_per_key,
@@ -844,6 +866,7 @@ class IndexBlockIter final : public BlockIter<IndexValue> {
     prefix_index_ = prefix_index;
     value_delta_encoded_ = !value_is_full;
     have_first_key_ = have_first_key;
+    index_search_type_ = index_block_search_type;
     if (have_first_key_ && global_seqno != kDisableGlobalSequenceNumber) {
       global_seqno_state_.reset(new GlobalSeqnoState(global_seqno));
     } else {
@@ -938,6 +961,10 @@ class IndexBlockIter final : public BlockIter<IndexValue> {
   // `pad_min_timestamp_` is true.
   std::string first_internal_key_with_ts_;
 
+  // The search algorithm to use when reading the index block.
+  BlockBasedTableOptions::BlockSearchType index_search_type_ =
+      BlockBasedTableOptions::kBinary;
+
   // Set *prefix_may_exist to false if no key possibly share the same prefix
   // as `target`. If not set, the result position should be the same as total
   // order Seek.
@@ -950,6 +977,10 @@ class IndexBlockIter final : public BlockIter<IndexValue> {
                             bool* prefix_may_exist);
   inline int CompareBlockKey(uint32_t block_index, const Slice& target);
 
+  template <typename DecodeKeyFunc>
+  bool FindRestartPointForSeek(const Slice& seek_key, uint32_t* index,
+                               bool* skip_linear_scan);
+
   inline bool ParseNextIndexKey();
 
   // When value_delta_encoded_ is enabled it decodes the value which is assumed
diff --git a/table/block_based/block_based_table_factory.cc b/table/block_based/block_based_table_factory.cc
index 3101a82cd50b..f90e95f36a06 100644
--- a/table/block_based/block_based_table_factory.cc
+++ b/table/block_based/block_based_table_factory.cc
@@ -20,6 +20,7 @@
 #include "options/options_helper.h"
 #include "port/port.h"
 #include "rocksdb/cache.h"
+#include "rocksdb/comparator.h"
 #include "rocksdb/convenience.h"
 #include "rocksdb/filter_policy.h"
 #include "rocksdb/flush_block_policy.h"
@@ -184,6 +185,12 @@ static std::unordered_map<std::string, BlockBasedTableOptions::IndexType>
         {"kBinarySearchWithFirstKey",
          BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey}};
 
+static std::unordered_map<std::string, BlockBasedTableOptions::BlockSearchType>
+    block_base_table_index_search_type_string_map = {
+        {"kBinary", BlockBasedTableOptions::BlockSearchType::kBinary},
+        {"kInterpolation",
+         BlockBasedTableOptions::BlockSearchType::kInterpolation}};
+
 static std::unordered_map<std::string,
                           BlockBasedTableOptions::DataBlockIndexType>
     block_base_table_data_block_index_type_string_map = {
@@ -261,6 +268,10 @@ static struct BlockBasedTableTypeInfo {
         {"index_type", OptionTypeInfo::Enum<BlockBasedTableOptions::IndexType>(
                            offsetof(struct BlockBasedTableOptions, index_type),
                            &block_base_table_index_type_string_map)},
+        {"index_block_search_type",
+         OptionTypeInfo::Enum<BlockBasedTableOptions::BlockSearchType>(
+             offsetof(struct BlockBasedTableOptions, index_block_search_type),
+             &block_base_table_index_search_type_string_map)},
         {"hash_index_allow_collision",
          {0, OptionType::kBoolean, OptionVerificationType::kDeprecated}},
         {"data_block_index_type",
@@ -616,6 +627,14 @@ Status BlockBasedTableFactory::ValidateOptions(
         "Hash index is specified for block-based "
         "table, but prefix_extractor is not given");
   }
+  if (table_options_.index_block_search_type ==
+      BlockBasedTableOptions::kInterpolation) {
+    // Interpolation search requires BytewiseComparator
+    if (cf_opts.comparator != BytewiseComparator()) {
+      return Status::InvalidArgument(
+          "Interpolation search requires BytewiseComparator");
+    }
+  }
   if (table_options_.cache_index_and_filter_blocks &&
       table_options_.no_block_cache) {
     return Status::InvalidArgument(
diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc
index b7e660f29ab0..d4e26e9b52f4 100644
--- a/table/block_based/block_based_table_reader.cc
+++ b/table/block_based/block_based_table_reader.cc
@@ -1698,7 +1698,8 @@ IndexBlockIter* BlockBasedTable::InitBlockIterator<IndexBlockIter>(
       rep->get_global_seqno(block_type), input_iter, rep->ioptions.stats,
       /* total_order_seek */ true, rep->index_has_first_key,
       rep->index_key_includes_seq, rep->index_value_is_full,
-      block_contents_pinned, rep->user_defined_timestamps_persisted);
+      block_contents_pinned, rep->user_defined_timestamps_persisted,
+      nullptr /* prefix_index */, rep->table_options.index_block_search_type);
 }
 
 // Right now only called for Data blocks.
diff --git a/table/block_based/block_test.cc b/table/block_based/block_test.cc
index e93ac5979a4f..a083d003ac4d 100644
--- a/table/block_based/block_test.cc
+++ b/table/block_based/block_test.cc
@@ -576,10 +576,14 @@ TEST_F(BlockTest, ReadAmpBitmapPow2) {
   ASSERT_EQ(BlockReadAmpBitmap(100, 35, stats.get()).GetBytesPerBit(), 32u);
 }
 
+enum class KeyDistribution { kUniform, kNonUniform };
+
 class IndexBlockTest
     : public testing::Test,
       public testing::WithParamInterface<
-          std::tuple<bool, bool, bool, test::UserDefinedTimestampTestMode>> {
+          std::tuple<bool, bool, bool, test::UserDefinedTimestampTestMode,
+                     BlockBasedTableOptions::BlockSearchType, int, int, int,
+                     int, KeyDistribution>> {
  public:
   IndexBlockTest() = default;
 
@@ -592,25 +596,52 @@ class IndexBlockTest
   bool shouldPersistUDT() const {
     return test::ShouldPersistUDT(std::get<3>(GetParam()));
   }
+  BlockBasedTableOptions::BlockSearchType indexSearchType() const {
+    return isUDTEnabled() ? BlockBasedTableOptions::kBinary
+                          : std::get<4>(GetParam());
+  }
+  int numRecords() const {
+    return std::min(1 << keyLength(), std::get<5>(GetParam()));
+  }
+  int indexBlockRestartInterval() const { return std::get<6>(GetParam()); }
+  int keyLength() const { return std::get<7>(GetParam()); }
+  int prefixLength() const { return std::get<8>(GetParam()); }
+  KeyDistribution keyDistribution() const { return std::get<9>(GetParam()); }
 };
 
-// Similar to GenerateRandomKVs but for index block contents.
-void GenerateRandomIndexEntries(std::vector<std::string>* separators,
-                                std::vector<BlockHandle>* block_handles,
-                                std::vector<std::string>* first_keys,
-                                const int len, size_t ts_sz = 0,
-                                bool zero_seqno = false) {
+// Similar to GenerateRandomKVs but for index block contents. Keys always
+// contain a 0-sequence number, callers may extract the user key if needed.
+void GenerateRandomIndexEntries(
+    std::vector<std::string>* separators,
+    std::vector<BlockHandle>* block_handles,
+    std::vector<std::string>* first_keys, const int len, size_t ts_sz = 0,
+    int key_length = 12, int prefix_length = 0,
+    KeyDistribution distribution = KeyDistribution::kUniform) {
   Random rnd(42);
+  std::string prefix(prefix_length, 'x');
 
   // For each of `len` blocks, we need to generate a first and last key.
-  // Let's generate n*2 random keys, sort them, group into consecutive pairs.
+  // Generate n*2 random keys, sort them, group into consecutive pairs.
   std::set<std::string> keys;
+
+  // Two clusters with shared prefixes of effective_key_length - 2. This
+  // stresses interpolation search's uniform distribution assumption.
+  int cluster_prefix_len = std::max(0, key_length - 5);
+  std::string cluster1_prefix = prefix + rnd.RandomString(cluster_prefix_len);
+  std::string cluster2_prefix = prefix + rnd.RandomString(cluster_prefix_len);
+
   while ((int)keys.size() < len * 2) {
-    // Keys need to be at least 8 bytes long to look like internal keys.
-    std::string new_key = test::RandomKey(&rnd, 12);
-    if (zero_seqno) {
-      AppendInternalKeyFooter(&new_key, 0 /* seqno */, kTypeValue);
+    std::string new_key;
+    if (distribution == KeyDistribution::kNonUniform) {
+      int remaining = key_length - cluster_prefix_len;
+      const std::string& cp =
+          (keys.size() % 2 == 0) ? cluster1_prefix : cluster2_prefix;
+      new_key = cp + rnd.RandomString(std::max(1, remaining));
+    } else {
+      new_key = prefix + test::RandomKey(&rnd, key_length);
     }
+
+    AppendInternalKeyFooter(&new_key, 0 /* seqno */, kTypeValue);
     if (ts_sz > 0) {
       std::string key;
       PadInternalKeyWithMinTimestamp(&key, new_key, ts_sz);
@@ -643,15 +674,17 @@ TEST_P(IndexBlockTest, IndexValueEncodingTest) {
   std::vector<BlockHandle> block_handles;
   std::vector<std::string> first_keys;
   const bool kUseDeltaEncoding = true;
-  BlockBuilder builder(16, kUseDeltaEncoding, useValueDeltaEncoding(),
+  BlockBuilder builder(indexBlockRestartInterval(), kUseDeltaEncoding,
+                       useValueDeltaEncoding(),
                        BlockBasedTableOptions::kDataBlockBinarySearch,
                        0.75 /* data_block_hash_table_util_ratio */, ts_sz,
                        shouldPersistUDT(), !keyIncludesSeq());
 
-  int num_records = 100;
+  int num_records = numRecords();
 
   GenerateRandomIndexEntries(&separators, &block_handles, &first_keys,
-                             num_records, ts_sz, false /* zero_seqno */);
+                             num_records, ts_sz, keyLength(), prefixLength(),
+                             keyDistribution());
   BlockHandle last_encoded_handle;
   for (int i = 0; i < num_records; i++) {
     std::string first_key_to_persist_buf;
@@ -696,7 +729,7 @@ TEST_P(IndexBlockTest, IndexValueEncodingTest) {
       options.comparator, kDisableGlobalSequenceNumber, kNullIter, kNullStats,
       kTotalOrderSeek, includeFirstKey(), keyIncludesSeq(),
       !useValueDeltaEncoding(), false /* block_contents_pinned */,
-      shouldPersistUDT());
+      shouldPersistUDT(), nullptr /* prefix_index */, indexSearchType());
   iter->SeekToFirst();
   for (int index = 0; index < num_records; ++index) {
     ASSERT_TRUE(iter->Valid());
@@ -724,7 +757,7 @@ TEST_P(IndexBlockTest, IndexValueEncodingTest) {
       options.comparator, kDisableGlobalSequenceNumber, kNullIter, kNullStats,
       kTotalOrderSeek, includeFirstKey(), keyIncludesSeq(),
       !useValueDeltaEncoding(), false /* block_contents_pinned */,
-      shouldPersistUDT());
+      shouldPersistUDT(), nullptr /* prefix_index */, indexSearchType());
   for (int i = 0; i < num_records * 2; i++) {
     // find a random key in the lookaside array
     int index = rnd.Uniform(num_records);
@@ -753,10 +786,26 @@ TEST_P(IndexBlockTest, IndexValueEncodingTest) {
 // Param 1: use value delta encoding
 // Param 2: include first key
 // Param 3: user-defined timestamp test mode
+// Param 4: index search type (binary search or interpolation search)
+// Param 5: number of records
+// Param 6: index block restart interval
+// Param 7: key length
+// Param 8: prefix length
+// Param 9: key distribution (uniform or non-uniform)
 INSTANTIATE_TEST_CASE_P(
     P, IndexBlockTest,
-    ::testing::Combine(::testing::Bool(), ::testing::Bool(), ::testing::Bool(),
-                       ::testing::ValuesIn(test::GetUDTTestModes())));
+    ::testing::Combine(
+        ::testing::Bool(), ::testing::Bool(), ::testing::Bool(),
+        ::testing::ValuesIn(test::GetUDTTestModes()),
+        ::testing::Values(
+            BlockBasedTableOptions::BlockSearchType::kBinary,
+            BlockBasedTableOptions::BlockSearchType::kInterpolation),
+        ::testing::Values(1, 100),    // num_records
+        ::testing::Values(1, 16),     // index_block_restart_interval
+        ::testing::Values(1, 8, 12),  // key_length
+        ::testing::Values(0, 50),     // prefix_length
+        ::testing::Values(KeyDistribution::kUniform,
+                          KeyDistribution::kNonUniform)));
 
 class BlockPerKVChecksumTest : public DBTestBase {
  public:
@@ -1259,8 +1308,7 @@ TEST_P(IndexBlockKVChecksumTest, ChecksumConstructionAndVerification) {
       std::vector<BlockHandle> block_handles;
       std::vector<std::string> first_keys;
       GenerateRandomIndexEntries(&separators, &block_handles, &first_keys,
-                                 kNumRecords, 0 /* ts_sz */,
-                                 seqno != kDisableGlobalSequenceNumber);
+                                 kNumRecords, 0 /* ts_sz */);
       SyncPoint::GetInstance()->DisableProcessing();
       std::unique_ptr<Block_kIndex> index_block = GenerateIndexBlock(
           separators, block_handles, first_keys, kNumRecords);
@@ -1558,8 +1606,7 @@ TEST_P(IndexBlockKVChecksumCorruptionTest, CorruptEntry) {
       std::vector<BlockHandle> block_handles;
       std::vector<std::string> first_keys;
       GenerateRandomIndexEntries(&separators, &block_handles, &first_keys,
-                                 kNumRecords, 0 /* ts_sz */,
-                                 seqno != kDisableGlobalSequenceNumber);
+                                 kNumRecords, 0 /* ts_sz */);
       SyncPoint::GetInstance()->SetCallBack(
           "BlockIter::UpdateKey::value", [](void* arg) {
             char* value = static_cast<char*>(arg);
diff --git a/table/table_test.cc b/table/table_test.cc
index 768ed9a0f8fd..68f677fe0f01 100644
--- a/table/table_test.cc
+++ b/table/table_test.cc
@@ -2651,9 +2651,18 @@ void TableTest::IndexTest(BlockBasedTableOptions table_options) {
   c.ResetTableReader();
 }
 
-TEST_P(BlockBasedTableTest, BinaryIndexTest) {
+TEST_P(BlockBasedTableTest, BinaryIndexTestBinarySearch) {
   BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
   table_options.index_type = BlockBasedTableOptions::kBinarySearch;
+  table_options.index_block_search_type = BlockBasedTableOptions::kBinary;
+  IndexTest(table_options);
+}
+
+TEST_P(BlockBasedTableTest, BinaryIndexTestInterpolationSearch) {
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+  table_options.index_type = BlockBasedTableOptions::kBinarySearch;
+  table_options.index_block_search_type =
+      BlockBasedTableOptions::kInterpolation;
   IndexTest(table_options);
 }
 
diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc
index 5eedc676f39a..a7d850b825ba 100644
--- a/tools/db_bench_tool.cc
+++ b/tools/db_bench_tool.cc
@@ -1760,6 +1760,9 @@ DEFINE_double(cuckoo_hash_ratio, 0.9, "Hash ratio for Cuckoo SST table.");
 DEFINE_bool(use_hash_search, false,
             "if use kHashSearch instead of kBinarySearch. "
             "This is valid if only we use BlockTable");
+DEFINE_string(index_block_search_type, "binary_search",
+              "Search algorithm for reading index blocks: binary_search or "
+              "interpolation_search.");
 DEFINE_string(merge_operator, "",
               "The merge operator to use with the database."
               "If a new merge operator is specified, be sure to use fresh"
@@ -4518,6 +4521,18 @@ class Benchmark {
       } else {
         block_based_options.index_type = BlockBasedTableOptions::kBinarySearch;
       }
+
+      if (FLAGS_index_block_search_type == "binary_search") {
+        block_based_options.index_block_search_type =
+            BlockBasedTableOptions::kBinary;
+      } else if (FLAGS_index_block_search_type == "interpolation_search") {
+        block_based_options.index_block_search_type =
+            BlockBasedTableOptions::kInterpolation;
+      } else {
+        fprintf(stderr, "Unknown index_block_search_type: %s\n",
+                FLAGS_index_block_search_type.c_str());
+        db_bench_exit(1);
+      }
       block_based_options.decouple_partitioned_filters =
           FLAGS_decouple_partitioned_filters;
       if (FLAGS_partition_index_and_filters || FLAGS_partition_index) {
diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index 970ba3939032..b5c422329602 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -174,6 +174,7 @@ def apply_random_seed_per_iteration():
     "get_current_wal_file_one_in": 0,
     # Temporarily disable hash index
     "index_type": lambda: random.choice([0, 0, 0, 2, 2, 3]),
+    "index_block_search_type": lambda: random.choice([0, 1]),
     "ingest_external_file_one_in": lambda: random.choice([1000, 1000000]),
     "test_ingest_standalone_range_deletion_one_in": lambda: random.choice([0, 5, 10]),
     "iterpercent": 10,
diff --git a/unreleased_history/new_features/interpolation_search b/unreleased_history/new_features/interpolation_search
new file mode 100644
index 000000000000..ded1f773a07c
--- /dev/null
+++ b/unreleased_history/new_features/interpolation_search
@@ -0,0 +1 @@
+Include interpolation search as an alternative to binary search, which typically performs better when keys are uniformly distributed. This is exposed as a new table option `index_block_search_type`. The default is `binary_search`.

From b040ab83e1c936b8b7a49a48eeb8fb9421d1b4b4 Mon Sep 17 00:00:00 2001
From: Xingbo Wang <xbw@meta.com>
Date: Sun, 15 Feb 2026 10:04:58 -0800
Subject: [PATCH 476/500] Add a new picking algorithm in fifo compaction
 (#14326)

Summary:
Add a new kv ratio based compaction picking algorithm in fifo compaction

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14326

Test Plan: Unit test

Reviewed By: pdillinger

Differential Revision: D93257941

Pulled By: xingbowang

fbshipit-source-id: fd2d0e1356c7b54682a1197475a1bd26cb45c9d4
---
 db/c.cc                                       |   21 +
 db/c_test.c                                   |    8 +
 db/column_family.cc                           |   36 +-
 db/compaction/compaction_picker.cc            |   11 +-
 db/compaction/compaction_picker.h             |    9 +-
 db/compaction/compaction_picker_fifo.cc       |  437 +++++--
 db/compaction/compaction_picker_fifo.h        |   22 +
 db/compaction/compaction_picker_level.cc      |    8 +-
 db/compaction/compaction_picker_test.cc       | 1060 +++++++++++++++++
 db/db_compaction_test.cc                      |   15 +-
 db/version_set.cc                             |   19 +-
 db_stress_tool/db_stress_common.h             |    2 +
 db_stress_tool/db_stress_gflags.cc            |   11 +
 db_stress_tool/db_stress_test_base.cc         |    6 +
 include/rocksdb/advanced_options.h            |   53 +
 include/rocksdb/c.h                           |   13 +
 java/rocksjni/compaction_options_fifo.cc      |   48 +
 .../org/rocksdb/CompactionOptionsFIFO.java    |   50 +
 options/cf_options.cc                         |   12 +
 options/options_settable_test.cc              |    3 +-
 tools/db_bench_tool.cc                        |   12 +
 tools/db_crashtest.py                         |   16 +
 .../new_features/fifo_kv_ratio_compaction.md  |    1 +
 wiki/fifo_compaction.md                       |  672 +++++++++++
 24 files changed, 2442 insertions(+), 103 deletions(-)
 create mode 100644 unreleased_history/new_features/fifo_kv_ratio_compaction.md
 create mode 100644 wiki/fifo_compaction.md

diff --git a/db/c.cc b/db/c.cc
index 2ae26e75b7b3..ad029be3c425 100644
--- a/db/c.cc
+++ b/db/c.cc
@@ -7047,6 +7047,27 @@ uint64_t rocksdb_fifo_compaction_options_get_max_table_files_size(
   return fifo_opts->rep.max_table_files_size;
 }
 
+void rocksdb_fifo_compaction_options_set_max_data_files_size(
+    rocksdb_fifo_compaction_options_t* fifo_opts, uint64_t size) {
+  fifo_opts->rep.max_data_files_size = size;
+}
+
+uint64_t rocksdb_fifo_compaction_options_get_max_data_files_size(
+    rocksdb_fifo_compaction_options_t* fifo_opts) {
+  return fifo_opts->rep.max_data_files_size;
+}
+
+void rocksdb_fifo_compaction_options_set_use_kv_ratio_compaction(
+    rocksdb_fifo_compaction_options_t* fifo_opts,
+    unsigned char use_kv_ratio_compaction) {
+  fifo_opts->rep.use_kv_ratio_compaction = use_kv_ratio_compaction;
+}
+
+unsigned char rocksdb_fifo_compaction_options_get_use_kv_ratio_compaction(
+    rocksdb_fifo_compaction_options_t* fifo_opts) {
+  return fifo_opts->rep.use_kv_ratio_compaction;
+}
+
 void rocksdb_fifo_compaction_options_destroy(
     rocksdb_fifo_compaction_options_t* fifo_opts) {
   delete fifo_opts;
diff --git a/db/c_test.c b/db/c_test.c
index 6811fe4ae8cb..8c57d0fcf6ec 100644
--- a/db/c_test.c
+++ b/db/c_test.c
@@ -3596,6 +3596,14 @@ int main(int argc, char** argv) {
         100000 ==
         rocksdb_fifo_compaction_options_get_max_table_files_size(fco));
 
+    rocksdb_fifo_compaction_options_set_max_data_files_size(fco, 200000);
+    CheckCondition(
+        200000 == rocksdb_fifo_compaction_options_get_max_data_files_size(fco));
+
+    rocksdb_fifo_compaction_options_set_use_kv_ratio_compaction(fco, 1);
+    CheckCondition(
+        1 == rocksdb_fifo_compaction_options_get_use_kv_ratio_compaction(fco));
+
     rocksdb_fifo_compaction_options_destroy(fco);
   }
 
diff --git a/db/column_family.cc b/db/column_family.cc
index 3a34bae1f653..bbf9f8210b31 100644
--- a/db/column_family.cc
+++ b/db/column_family.cc
@@ -401,7 +401,13 @@ ColumnFamilyOptions SanitizeCfOptions(const ImmutableDBOptions& db_options,
   }
 
   if (result.max_compaction_bytes == 0) {
-    result.max_compaction_bytes = result.target_file_size_base * 25;
+    // For FIFO with use_kv_ratio_compaction, leave max_compaction_bytes as 0
+    // to signal "auto-calculate target from capacity and SST/blob ratio."
+    // When explicitly set by the user, it overrides the auto-calculated target.
+    if (result.compaction_style != kCompactionStyleFIFO ||
+        !result.compaction_options_fifo.use_kv_ratio_compaction) {
+      result.max_compaction_bytes = result.target_file_size_base * 25;
+    }
   }
 
   bool is_block_based_table = (result.table_factory->IsInstanceOf(
@@ -1563,6 +1569,34 @@ Status ColumnFamilyData::ValidateOptions(
         "FIFO compaction only supported with max_open_files = -1.");
   }
 
+  if (cf_options.compaction_options_fifo.use_kv_ratio_compaction) {
+    if (cf_options.compaction_style != kCompactionStyleFIFO) {
+      return Status::InvalidArgument(
+          "use_kv_ratio_compaction is only supported with FIFO compaction "
+          "style.");
+    }
+    if (!cf_options.compaction_options_fifo.allow_compaction) {
+      return Status::InvalidArgument(
+          "use_kv_ratio_compaction requires allow_compaction = true. "
+          "allow_compaction enables intra-L0 compaction, and "
+          "use_kv_ratio_compaction selects the picking strategy.");
+    }
+    if (cf_options.compaction_options_fifo.max_data_files_size == 0) {
+      return Status::InvalidArgument(
+          "use_kv_ratio_compaction requires max_data_files_size > 0 to "
+          "compute the target compacted file size from data capacity.");
+    }
+  }
+
+  if (cf_options.compaction_options_fifo.max_data_files_size > 0 &&
+      cf_options.compaction_options_fifo.max_data_files_size <
+          cf_options.compaction_options_fifo.max_table_files_size) {
+    return Status::InvalidArgument(
+        "max_data_files_size (total data = SST + blob) must be >= "
+        "max_table_files_size (SST only) when non-zero, since total data "
+        "always includes SST data.");
+  }
+
   std::vector<uint32_t> supported{0, 1, 2, 4, 8};
   if (std::find(supported.begin(), supported.end(),
                 cf_options.memtable_protection_bytes_per_key) ==
diff --git a/db/compaction/compaction_picker.cc b/db/compaction/compaction_picker.cc
index 230cdd643967..5e3ff66cf8b3 100644
--- a/db/compaction/compaction_picker.cc
+++ b/db/compaction/compaction_picker.cc
@@ -27,12 +27,11 @@
 
 namespace ROCKSDB_NAMESPACE {
 
-bool FindIntraL0Compaction(const std::vector<FileMetaData*>& level_files,
-                           size_t min_files_to_compact,
-                           uint64_t max_compact_bytes_per_del_file,
-                           uint64_t max_compaction_bytes,
-                           CompactionInputFiles* comp_inputs) {
-  TEST_SYNC_POINT("FindIntraL0Compaction");
+bool PickCostBasedIntraL0Compaction(
+    const std::vector<FileMetaData*>& level_files, size_t min_files_to_compact,
+    uint64_t max_compact_bytes_per_del_file, uint64_t max_compaction_bytes,
+    CompactionInputFiles* comp_inputs) {
+  TEST_SYNC_POINT("PickCostBasedIntraL0Compaction");
 
   size_t start = 0;
 
diff --git a/db/compaction/compaction_picker.h b/db/compaction/compaction_picker.h
index 89d5c1841265..bb9b22456e50 100644
--- a/db/compaction/compaction_picker.h
+++ b/db/compaction/compaction_picker.h
@@ -328,11 +328,10 @@ class NullCompactionPicker : public CompactionPicker {
 //                                        files. Cannot be nullptr.
 //
 // @return                                true iff compaction was found.
-bool FindIntraL0Compaction(const std::vector<FileMetaData*>& level_files,
-                           size_t min_files_to_compact,
-                           uint64_t max_compact_bytes_per_del_file,
-                           uint64_t max_compaction_bytes,
-                           CompactionInputFiles* comp_inputs);
+bool PickCostBasedIntraL0Compaction(
+    const std::vector<FileMetaData*>& level_files, size_t min_files_to_compact,
+    uint64_t max_compact_bytes_per_del_file, uint64_t max_compaction_bytes,
+    CompactionInputFiles* comp_inputs);
 
 CompressionType GetCompressionType(const VersionStorageInfo* vstorage,
                                    const MutableCFOptions& mutable_cf_options,
diff --git a/db/compaction/compaction_picker_fifo.cc b/db/compaction/compaction_picker_fifo.cc
index a1c4df368d1d..e13c333856d2 100644
--- a/db/compaction/compaction_picker_fifo.cc
+++ b/db/compaction/compaction_picker_fifo.cc
@@ -9,6 +9,7 @@
 
 #include "db/compaction/compaction_picker_fifo.h"
 
+#include <algorithm>
 #include <cinttypes>
 #include <string>
 #include <vector>
@@ -31,6 +32,29 @@ uint64_t GetTotalFilesSize(const std::vector<FileMetaData*>& files) {
   }
   return total_size;
 }
+
+// Compute effective data size and capacity limit for FIFO compaction.
+// When max_data_files_size > 0 (blob-aware mode), the effective size includes
+// both SST and blob file sizes, and the limit is max_data_files_size.
+// Otherwise, only SST sizes are used with max_table_files_size as the limit.
+void GetEffectiveSizeAndLimit(const CompactionOptionsFIFO& fifo_opts,
+                              uint64_t total_sst_size, uint64_t total_blob_size,
+                              uint64_t* effective_size,
+                              uint64_t* effective_max) {
+  *effective_size = total_sst_size;
+  *effective_max = fifo_opts.max_table_files_size;
+  if (fifo_opts.max_data_files_size > 0) {
+    *effective_size += total_blob_size;
+    *effective_max = fifo_opts.max_data_files_size;
+  }
+}
+
+// Return the effective capacity limit for FIFO compaction.
+// Convenience wrapper when only the limit is needed (e.g., PickTTLCompaction).
+uint64_t GetEffectiveMax(const CompactionOptionsFIFO& fifo_opts) {
+  return fifo_opts.max_data_files_size > 0 ? fifo_opts.max_data_files_size
+                                           : fifo_opts.max_table_files_size;
+}
 }  // anonymous namespace
 
 bool FIFOCompactionPicker::NeedsCompaction(
@@ -98,10 +122,43 @@ Compaction* FIFOCompactionPicker::PickTTLCompaction(
   // Return a nullptr and proceed to size-based FIFO compaction if:
   // 1. there are no files older than ttl OR
   // 2. there are a few files older than ttl, but deleting them will not bring
-  //    the total size to be less than max_table_files_size threshold.
-  if (inputs[0].files.empty() ||
-      total_size >
-          mutable_cf_options.compaction_options_fifo.max_table_files_size) {
+  //    the total size to be less than the size threshold.
+  uint64_t effective_max =
+      GetEffectiveMax(mutable_cf_options.compaction_options_fifo);
+  // Estimate the effective remaining data after dropping TTL-expired SSTs.
+  // Each dropped SST also frees a proportional share of blob data.
+  //
+  // In multi-level FIFO (migration), we must use total SST across ALL levels
+  // as the reference, because total_blob covers all levels. Using only L0
+  // SST would inflate the blob estimate.
+  uint64_t effective_remaining = total_size;
+  if (mutable_cf_options.compaction_options_fifo.max_data_files_size > 0) {
+    uint64_t total_blob = vstorage->GetBlobStats().total_file_size;
+    // Compute total SST across all levels so the reference scope matches
+    // total_blob's scope (all levels).
+    uint64_t total_sst_all_levels = GetTotalFilesSize(level_files);
+    for (int level = 1; level < vstorage->num_levels(); ++level) {
+      total_sst_all_levels += GetTotalFilesSize(vstorage->LevelFiles(level));
+    }
+    // remaining_sst_all = total_sst_all - dropped_l0_sst
+    // total_size is the remaining L0 SST after removing expired files;
+    // original L0 SST minus remaining L0 SST = dropped.
+    uint64_t original_l0_sst = GetTotalFilesSize(level_files);
+    uint64_t dropped_sst = original_l0_sst - total_size;
+    uint64_t remaining_sst_all = total_sst_all_levels - dropped_sst;
+    // Proportional blob estimate: each SST byte "owns" a proportional
+    // share of blob bytes. Both reference sizes must come from the same
+    // scope (all levels) to avoid inflated estimates.
+    if (total_sst_all_levels > 0 && total_blob > 0) {
+      effective_remaining =
+          remaining_sst_all +
+          static_cast<uint64_t>(static_cast<double>(remaining_sst_all) /
+                                total_sst_all_levels * total_blob);
+    } else {
+      effective_remaining = remaining_sst_all;
+    }
+  }
+  if (inputs[0].files.empty() || effective_remaining > effective_max) {
     return nullptr;
   }
 
@@ -151,7 +208,9 @@ Compaction* FIFOCompactionPicker::PickSizeCompaction(
     const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
     const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
     LogBuffer* log_buffer) {
-  // compute the total size and identify the last non-empty level
+  const auto& fifo_opts = mutable_cf_options.compaction_options_fifo;
+
+  // compute the total SST size and identify the last non-empty level
   int last_level = 0;
   uint64_t total_size = 0;
   for (int level = 0; level < vstorage->num_levels(); ++level) {
@@ -164,52 +223,13 @@ Compaction* FIFOCompactionPicker::PickSizeCompaction(
   const std::vector<FileMetaData*>& last_level_files =
       vstorage->LevelFiles(last_level);
 
-  if (last_level == 0 &&
-      total_size <=
-          mutable_cf_options.compaction_options_fifo.max_table_files_size) {
-    // total size not exceeded, try to find intra level 0 compaction if enabled
-    const std::vector<FileMetaData*>& level0_files = vstorage->LevelFiles(0);
-    if (mutable_cf_options.compaction_options_fifo.allow_compaction &&
-        level0_files.size() > 0) {
-      CompactionInputFiles comp_inputs;
-      // try to prevent same files from being compacted multiple times, which
-      // could produce large files that may never TTL-expire. Achieve this by
-      // disallowing compactions with files larger than memtable (inflate its
-      // size by 10% to account for uncompressed L0 files that may have size
-      // slightly greater than memtable size limit).
-      size_t max_compact_bytes_per_del_file =
-          static_cast<size_t>(MultiplyCheckOverflow(
-              static_cast<uint64_t>(mutable_cf_options.write_buffer_size),
-              1.1));
-      if (FindIntraL0Compaction(
-              level0_files,
-              mutable_cf_options
-                  .level0_file_num_compaction_trigger /* min_files_to_compact */
-              ,
-              max_compact_bytes_per_del_file,
-              mutable_cf_options.max_compaction_bytes, &comp_inputs)) {
-        Compaction* c = new Compaction(
-            vstorage, ioptions_, mutable_cf_options, mutable_db_options,
-            {comp_inputs}, 0, 16 * 1024 * 1024 /* output file size limit */,
-            0 /* max compaction bytes, not applicable */,
-            0 /* output path ID */, mutable_cf_options.compression,
-            mutable_cf_options.compression_opts, Temperature::kUnknown,
-            0 /* max_subcompactions */, {},
-            /* earliest_snapshot */ std::nullopt,
-            /* snapshot_checker */ nullptr,
-            CompactionReason::kFIFOReduceNumFiles,
-            /* trim_ts */ "", vstorage->CompactionScore(0),
-            /* l0_files_might_overlap */ true);
-        return c;
-      }
-    }
+  // Compute effective size and limit for comparison.
+  uint64_t effective_size, effective_max;
+  GetEffectiveSizeAndLimit(fifo_opts, total_size,
+                           vstorage->GetBlobStats().total_file_size,
+                           &effective_size, &effective_max);
 
-    ROCKS_LOG_BUFFER(
-        log_buffer,
-        "[%s] FIFO compaction: nothing to do. Total size %" PRIu64
-        ", max size %" PRIu64 "\n",
-        cf_name.c_str(), total_size,
-        mutable_cf_options.compaction_options_fifo.max_table_files_size);
+  if (last_level == 0 && effective_size <= effective_max) {
     return nullptr;
   }
 
@@ -227,11 +247,29 @@ Compaction* FIFOCompactionPicker::PickSizeCompaction(
   inputs[0].level = last_level;
 
   if (last_level == 0) {
+    // When using blob-aware sizing, use proportional estimation (same
+    // principle as EstimateTotalDataForSST): each SST "owns"
+    // effective_size / num_files of total data. This is an approximation
+    // — individual SSTs may reference different amounts of blob data,
+    // but uniform distribution is a reasonable estimate for FIFO dropping.
+    uint64_t remaining_size = effective_size;
+    const uint64_t num_files = last_level_files.size();
+    // Proportional estimate of data per file (SST + blob).
+    // Use max(1) to prevent stalling when effective_size < num_files.
+    const uint64_t data_per_file =
+        (fifo_opts.max_data_files_size > 0 && num_files > 0)
+            ? std::max(effective_size / num_files, uint64_t{1})
+            : 0;
+
     // In L0, right-most files are the oldest files.
     for (auto ritr = last_level_files.rbegin(); ritr != last_level_files.rend();
          ++ritr) {
       auto f = *ritr;
-      total_size -= f->fd.file_size;
+      if (fifo_opts.max_data_files_size > 0) {
+        remaining_size -= std::min(remaining_size, data_per_file);
+      } else {
+        remaining_size -= std::min(remaining_size, f->fd.file_size);
+      }
       inputs[0].files.push_back(f);
       char tmp_fsize[16];
       AppendHumanBytes(f->fd.GetFileSize(), tmp_fsize, sizeof(tmp_fsize));
@@ -239,13 +277,11 @@ Compaction* FIFOCompactionPicker::PickSizeCompaction(
                        "[%s] FIFO compaction: picking file %" PRIu64
                        " with size %s for deletion",
                        cf_name.c_str(), f->fd.GetNumber(), tmp_fsize);
-      if (total_size <=
-          mutable_cf_options.compaction_options_fifo.max_table_files_size) {
+      if (remaining_size <= effective_max) {
         break;
       }
     }
-  } else if (total_size >
-             mutable_cf_options.compaction_options_fifo.max_table_files_size) {
+  } else if (effective_size > effective_max) {
     // If the last level is non-L0, we actually don't know which file is
     // logically the oldest since the file creation time only represents
     // when this file was compacted to this level, which is independent
@@ -255,34 +291,36 @@ Compaction* FIFOCompactionPicker::PickSizeCompaction(
     // file with the smallest key will be deleted first.  This design decision
     // better serves a major type of FIFO use cases where smaller keys are
     // associated with older data.
+    const uint64_t num_files = last_level_files.size();
+    // Proportional estimate of data per file (SST + blob), same as L0 path.
+    const uint64_t data_per_file =
+        (fifo_opts.max_data_files_size > 0 && num_files > 0)
+            ? std::max(effective_size / num_files, uint64_t{1})
+            : 0;
     for (const auto& f : last_level_files) {
       if (f->being_compacted) {
         continue;
       }
-      total_size -= f->fd.file_size;
+      if (fifo_opts.max_data_files_size > 0) {
+        effective_size -= std::min(effective_size, data_per_file);
+      } else {
+        effective_size -= std::min(effective_size, f->fd.file_size);
+      }
       inputs[0].files.push_back(f);
       char tmp_fsize[16];
       AppendHumanBytes(f->fd.GetFileSize(), tmp_fsize, sizeof(tmp_fsize));
-      ROCKS_LOG_BUFFER(
-          log_buffer,
-          "[%s] FIFO compaction: picking file %" PRIu64
-          " with size %s for deletion under total size %" PRIu64
-          " vs max table files size %" PRIu64,
-          cf_name.c_str(), f->fd.GetNumber(), tmp_fsize, total_size,
-          mutable_cf_options.compaction_options_fifo.max_table_files_size);
-
-      if (total_size <=
-          mutable_cf_options.compaction_options_fifo.max_table_files_size) {
+      ROCKS_LOG_BUFFER(log_buffer,
+                       "[%s] FIFO compaction: picking file %" PRIu64
+                       " with size %s for deletion under total size %" PRIu64
+                       " vs max size %" PRIu64,
+                       cf_name.c_str(), f->fd.GetNumber(), tmp_fsize,
+                       effective_size, effective_max);
+
+      if (effective_size <= effective_max) {
         break;
       }
     }
   } else {
-    ROCKS_LOG_BUFFER(
-        log_buffer,
-        "[%s] FIFO compaction: nothing to do. Total size %" PRIu64
-        ", max size %" PRIu64 "\n",
-        cf_name.c_str(), total_size,
-        mutable_cf_options.compaction_options_fifo.max_table_files_size);
     return nullptr;
   }
 
@@ -419,6 +457,249 @@ Compaction* FIFOCompactionPicker::PickTemperatureChangeCompaction(
   return c;
 }
 
+Compaction* FIFOCompactionPicker::PickIntraL0Compaction(
+    const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+    const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+    LogBuffer* log_buffer) {
+  const auto& fifo_opts = mutable_cf_options.compaction_options_fifo;
+
+  if (!fifo_opts.allow_compaction) {
+    return nullptr;
+  }
+
+  const std::vector<FileMetaData*>& level0_files = vstorage->LevelFiles(0);
+  if (level0_files.empty()) {
+    return nullptr;
+  }
+
+  if (fifo_opts.use_kv_ratio_compaction) {
+    return PickRatioBasedIntraL0Compaction(
+        cf_name, mutable_cf_options, mutable_db_options, vstorage, log_buffer);
+  }
+
+  // Old intra-L0 path: merge small files using PickCostBasedIntraL0Compaction.
+  // Minimum files to compact follows level0_file_num_compaction_trigger.
+  // Try to prevent same files from being compacted multiple times, which
+  // could produce large files that may never TTL-expire. Achieve this by
+  // disallowing compactions with files larger than memtable (inflate its
+  // size by 10% to account for uncompressed L0 files that may have size
+  // slightly greater than memtable size limit).
+
+  CompactionInputFiles comp_inputs;
+  size_t max_compact_bytes_per_del_file =
+      static_cast<size_t>(MultiplyCheckOverflow(
+          static_cast<uint64_t>(mutable_cf_options.write_buffer_size), 1.1));
+  if (PickCostBasedIntraL0Compaction(
+          level0_files,
+          mutable_cf_options
+              .level0_file_num_compaction_trigger /* min_files_to_compact */,
+          max_compact_bytes_per_del_file,
+          mutable_cf_options.max_compaction_bytes, &comp_inputs)) {
+    Compaction* c = new Compaction(
+        vstorage, ioptions_, mutable_cf_options, mutable_db_options,
+        {comp_inputs}, 0, 16 * 1024 * 1024 /* output file size limit */,
+        0 /* max compaction bytes, not applicable */, 0 /* output path ID */,
+        mutable_cf_options.compression, mutable_cf_options.compression_opts,
+        Temperature::kUnknown, 0 /* max_subcompactions */, {},
+        /* earliest_snapshot */ std::nullopt,
+        /* snapshot_checker */ nullptr, CompactionReason::kFIFOReduceNumFiles,
+        /* trim_ts */ "", vstorage->CompactionScore(0),
+        /* l0_files_might_overlap */ true);
+    return c;
+  }
+
+  return nullptr;
+}
+
+Compaction* FIFOCompactionPicker::PickRatioBasedIntraL0Compaction(
+    const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+    const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+    LogBuffer* log_buffer) {
+  const auto& fifo_opts = mutable_cf_options.compaction_options_fifo;
+  assert(fifo_opts.use_kv_ratio_compaction);
+  assert(fifo_opts.max_data_files_size > 0);
+
+  // During migration from level/universal compaction to FIFO, non-L0 levels
+  // may still contain files. The ratio-based algorithm only operates on L0,
+  // so skip it until PickSizeCompaction has drained all non-L0 levels.
+  // Once levels collapse to L0-only, this algorithm will kick in.
+  for (int level = 1; level < vstorage->num_levels(); ++level) {
+    if (!vstorage->LevelFiles(level).empty()) {
+      ROCKS_LOG_BUFFER(log_buffer,
+                       "[%s] FIFO kv-ratio compaction: skipping — non-L0 "
+                       "level %d still has %" ROCKSDB_PRIszt
+                       " files (migration in progress)",
+                       cf_name.c_str(), level,
+                       vstorage->LevelFiles(level).size());
+      return nullptr;
+    }
+  }
+
+  if (!level0_compactions_in_progress_.empty()) {
+    return nullptr;
+  }
+
+  const std::vector<FileMetaData*>& level0_files = vstorage->LevelFiles(0);
+  if (mutable_cf_options.level0_file_num_compaction_trigger <= 1) {
+    // trigger <= 0 is invalid; trigger == 1 means compact after every flush,
+    // which doesn't make sense for tiered merging (the tier boundary loop
+    // divides by trigger, so trigger == 1 would cause an infinite loop).
+    return nullptr;
+  }
+  const size_t trigger = static_cast<size_t>(
+      mutable_cf_options.level0_file_num_compaction_trigger);
+  if (level0_files.size() < trigger) {
+    return nullptr;
+  }
+
+  // Determine the target compacted file size.
+  //
+  // When max_compaction_bytes > 0 (explicitly set by user), use it directly
+  // as the target. This allows users to override the auto-calculated value.
+  //
+  // When max_compaction_bytes == 0 (default), auto-calculate from the data
+  // capacity and observed SST/blob ratio:
+  //   target = max_data_files_size * sst_ratio / trigger
+  //
+  // This is recomputed on every PickCompaction call. The computation is
+  // trivial (sum file sizes + arithmetic) and PickCompaction is only called
+  // once per flush or compaction completion, so no caching is needed.
+  uint64_t target = 0;
+  if (mutable_cf_options.max_compaction_bytes > 0) {
+    // User explicitly set max_compaction_bytes — use it as target
+    target = mutable_cf_options.max_compaction_bytes;
+  } else {
+    // Auto-calculate from capacity and observed SST/blob ratio
+    uint64_t total_sst = GetTotalFilesSize(level0_files);
+    uint64_t total_blob = vstorage->GetBlobStats().total_file_size;
+    uint64_t total_data = total_sst + total_blob;
+
+    if (total_data == 0 || total_sst == 0) {
+      return nullptr;
+    }
+
+    // Compute sst_ratio (inverse of EstimateTotalDataForSST's proportion):
+    // when no blob files exist, sst_ratio is 1.0 and the target becomes
+    // max_data_files_size / trigger, which is large. The algorithm will
+    // naturally not find small enough files to compact.
+    double sst_ratio =
+        (total_blob > 0) ? static_cast<double>(total_sst) / total_data : 1.0;
+
+    uint64_t total_sst_at_cap =
+        static_cast<uint64_t>(fifo_opts.max_data_files_size * sst_ratio);
+    target = total_sst_at_cap / trigger;
+
+    ROCKS_LOG_BUFFER(log_buffer,
+                     "[%s] FIFO ratio-based compaction: sst_ratio=%.4f, "
+                     "target_file_size=%" PRIu64,
+                     cf_name.c_str(), sst_ratio, target);
+  }
+  if (target == 0) {
+    return nullptr;
+  }
+
+  // Tiered size-based file selection.
+  //
+  // Tier boundaries form a geometric sequence descending from target:
+  //   ..., target/trigger^2, target/trigger, target
+  // For each boundary (smallest first), find contiguous L0 files with
+  // size < boundary. If their accumulated bytes >= boundary, merge them.
+  // The output (~boundary bytes) advances to the next tier. Files that
+  // reach target are "graduated" and never compacted again.
+  //
+  // Trade-off: write amplification vs L0 file count.
+  //
+  // Write amp: O(log(target/flush) / log(trigger)) per byte, instead of
+  //   O(target / (trigger * flush)) from flat merging. Each byte is
+  //   rewritten once per tier crossing.
+  //
+  // L0 file count: trigger + k * (trigger - 1) at steady state, where
+  //   k = ceil(log(target/flush) / log(trigger)). This is higher than
+  //   the original trigger target because intermediate tier files
+  //   accumulate while waiting for the next tier merge. The trade-off
+  //   is explicit: more L0 files in exchange for logarithmic (instead
+  //   of linear) write amplification.
+
+  // Build tier boundaries from smallest to largest.
+  // Stop at 10KB minimum — SST files of most workloads are larger than
+  // this, so lower boundaries would only waste CPU scanning L0 files.
+  // Files smaller than the lowest boundary simply merge at that boundary.
+  static constexpr uint64_t kMinTierBoundary = 10 * 1024;  // 10KB
+  std::vector<uint64_t> boundaries;
+  for (uint64_t b = target; b >= kMinTierBoundary; b /= trigger) {
+    boundaries.push_back(b);
+  }
+  if (boundaries.empty()) {
+    // target itself is below kMinTierBoundary — use target as the
+    // sole boundary so we can still compact at the target size.
+    boundaries.push_back(target);
+  }
+  std::reverse(boundaries.begin(), boundaries.end());
+
+  // For each tier boundary (smallest first), scan L0 for mergeable batches.
+  // L0 files are stored newest-first; oldest is at the end.
+  for (const uint64_t boundary : boundaries) {
+    for (size_t scan = level0_files.size(); scan > 0;) {
+      // Skip files >= boundary (they belong to higher tiers) or in-progress
+      if (level0_files[scan - 1]->fd.file_size >= boundary ||
+          level0_files[scan - 1]->being_compacted) {
+        --scan;
+        continue;
+      }
+
+      // Found a file < boundary — collect contiguous batch
+      std::vector<FileMetaData*> batch;
+      uint64_t accumulated = 0;
+      size_t pos = scan;
+      while (pos > 0 && level0_files[pos - 1]->fd.file_size < boundary &&
+             !level0_files[pos - 1]->being_compacted) {
+        // Don't let output exceed 2x boundary (prevent tier-skipping)
+        if (accumulated >= boundary &&
+            accumulated + level0_files[pos - 1]->fd.file_size > boundary * 2) {
+          break;
+        }
+        batch.push_back(level0_files[pos - 1]);
+        accumulated += level0_files[pos - 1]->fd.file_size;
+        --pos;
+      }
+
+      // Viable: >= 2 files and accumulated >= boundary
+      if (batch.size() >= 2 && accumulated >= boundary) {
+        CompactionInputFiles comp_inputs;
+        comp_inputs.level = 0;
+        comp_inputs.files = std::move(batch);
+
+        ROCKS_LOG_BUFFER(
+            log_buffer,
+            "[%s] FIFO kv-ratio compaction: picking %" ROCKSDB_PRIszt
+            " files (%" PRIu64 " bytes) at tier boundary %" PRIu64
+            " for intra-L0 compaction, target=%" PRIu64,
+            cf_name.c_str(), comp_inputs.files.size(), accumulated, boundary,
+            target);
+
+        Compaction* c = new Compaction(
+            vstorage, ioptions_, mutable_cf_options, mutable_db_options,
+            {comp_inputs}, 0, boundary /* output file size limit */,
+            0 /* max compaction bytes, not applicable */,
+            0 /* output path ID */, mutable_cf_options.compression,
+            mutable_cf_options.compression_opts, Temperature::kUnknown,
+            0 /* max_subcompactions */, {},
+            /* earliest_snapshot */ std::nullopt,
+            /* snapshot_checker */ nullptr,
+            CompactionReason::kFIFOReduceNumFiles,
+            /* trim_ts */ "", vstorage->CompactionScore(0),
+            /* l0_files_might_overlap */ true);
+        return c;
+      }
+
+      // This batch wasn't enough — advance past it
+      scan = pos;
+    }
+  }
+
+  return nullptr;
+}
+
 // The full_history_ts_low parameter is used to control bottommost file marking
 // for compaction when user-defined timestamps (UDT) are enabled.
 
@@ -441,10 +722,22 @@ Compaction* FIFOCompactionPicker::PickCompaction(
     c = PickSizeCompaction(cf_name, mutable_cf_options, mutable_db_options,
                            vstorage, log_buffer);
   }
+  // Intra-L0 compaction merges small files to reduce file count.
+  // It runs after size-based dropping: if PickSizeCompaction dropped files,
+  // it returned non-null and we skip this. Otherwise, we try to reduce
+  // L0 file count by merging small files together.
+  if (c == nullptr) {
+    c = PickIntraL0Compaction(cf_name, mutable_cf_options, mutable_db_options,
+                              vstorage, log_buffer);
+  }
   if (c == nullptr) {
     c = PickTemperatureChangeCompaction(
         cf_name, mutable_cf_options, mutable_db_options, vstorage, log_buffer);
   }
+  if (c == nullptr) {
+    ROCKS_LOG_BUFFER(log_buffer, "[%s] FIFO compaction: no compaction picked",
+                     cf_name.c_str());
+  }
   RegisterCompaction(c);
   return c;
 }
diff --git a/db/compaction/compaction_picker_fifo.h b/db/compaction/compaction_picker_fifo.h
index 2ddbd54b28ee..2c1cd21321b9 100644
--- a/db/compaction/compaction_picker_fifo.h
+++ b/db/compaction/compaction_picker_fifo.h
@@ -55,6 +55,28 @@ class FIFOCompactionPicker : public CompactionPicker {
                                  VersionStorageInfo* version,
                                  LogBuffer* log_buffer);
 
+  // Intra-L0 compaction: merges small L0 files to reduce file count.
+  // Dispatches between two strategies based on configuration:
+  //   - use_kv_ratio_compaction = true: PickRatioBasedIntraL0Compaction
+  //   (BlobDB-optimized)
+  //   - use_kv_ratio_compaction = false: PickCostBasedIntraL0Compaction
+  //   (original)
+  // Only active when allow_compaction = true.
+  Compaction* PickIntraL0Compaction(const std::string& cf_name,
+                                    const MutableCFOptions& mutable_cf_options,
+                                    const MutableDBOptions& mutable_db_options,
+                                    VersionStorageInfo* vstorage,
+                                    LogBuffer* log_buffer);
+
+  // Capacity-derived intra-L0 compaction for BlobDB workloads.
+  // Uses the observed SST/blob ratio to compute a target file size,
+  // producing uniform files for predictable FIFO trimming.
+  // Called from PickIntraL0Compaction when use_kv_ratio_compaction = true.
+  Compaction* PickRatioBasedIntraL0Compaction(
+      const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+      const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+      LogBuffer* log_buffer);
+
   // Will pick one file to compact at a time, starting from the oldest file.
   Compaction* PickTemperatureChangeCompaction(
       const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
diff --git a/db/compaction/compaction_picker_level.cc b/db/compaction/compaction_picker_level.cc
index 090ad0bbfa7d..ade42ce5e3e8 100644
--- a/db/compaction/compaction_picker_level.cc
+++ b/db/compaction/compaction_picker_level.cc
@@ -914,10 +914,10 @@ bool LevelCompactionBuilder::PickIntraL0Compaction() {
     // resort to L0->L0 compaction yet.
     return false;
   }
-  return FindIntraL0Compaction(level_files, kMinFilesForIntraL0Compaction,
-                               std::numeric_limits<uint64_t>::max(),
-                               mutable_cf_options_.max_compaction_bytes,
-                               &start_level_inputs_);
+  return PickCostBasedIntraL0Compaction(
+      level_files, kMinFilesForIntraL0Compaction,
+      std::numeric_limits<uint64_t>::max(),
+      mutable_cf_options_.max_compaction_bytes, &start_level_inputs_);
 }
 
 bool LevelCompactionBuilder::PickSizeBasedIntraL0Compaction() {
diff --git a/db/compaction/compaction_picker_test.cc b/db/compaction/compaction_picker_test.cc
index 5260ac5abbfc..4dfa327ae162 100644
--- a/db/compaction/compaction_picker_test.cc
+++ b/db/compaction/compaction_picker_test.cc
@@ -7,6 +7,8 @@
 #include <string>
 #include <utility>
 
+#include "db/blob/blob_file_meta.h"
+#include "db/column_family.h"
 #include "db/compaction/compaction.h"
 #include "db/compaction/compaction_picker_fifo.h"
 #include "db/compaction/compaction_picker_level.h"
@@ -17,6 +19,7 @@
 #include "table/unique_id_impl.h"
 #include "test_util/testharness.h"
 #include "test_util/testutil.h"
+#include "util/random.h"
 #include "util/string_util.h"
 
 namespace ROCKSDB_NAMESPACE {
@@ -220,6 +223,46 @@ class CompactionPickerTestBase : public testing::Test {
     vstorage_->SetFinalized();
   }
 
+  void AddBlobFile(uint64_t blob_file_number, uint64_t total_blob_bytes,
+                   BlobFileMetaData::LinkedSsts linked_ssts = {}) {
+    auto shared_meta = SharedBlobFileMetaData::Create(
+        blob_file_number, /*total_blob_count=*/1, total_blob_bytes,
+        /*checksum_method=*/"", /*checksum_value=*/"");
+    auto meta =
+        BlobFileMetaData::Create(std::move(shared_meta), std::move(linked_ssts),
+                                 /*garbage_blob_count=*/0,
+                                 /*garbage_blob_bytes=*/0);
+    vstorage_->AddBlobFile(std::move(meta));
+  }
+
+  // Helper to set up FIFO ratio-based compaction options and version storage.
+  // Call before Add()/AddBlobFile(), then create FIFOCompactionPicker after.
+  void SetupFIFORatioBased(uint64_t max_table_files_size,
+                           uint64_t max_data_files_size, int trigger,
+                           bool allow_compaction = true,
+                           bool use_kv_ratio = true, int num_levels = 1) {
+    ioptions_.compaction_style = kCompactionStyleFIFO;
+    NewVersionStorage(num_levels, kCompactionStyleFIFO);
+    mutable_cf_options_.compaction_options_fifo.max_table_files_size =
+        max_table_files_size;
+    mutable_cf_options_.compaction_options_fifo.max_data_files_size =
+        max_data_files_size;
+    mutable_cf_options_.compaction_options_fifo.allow_compaction =
+        allow_compaction;
+    mutable_cf_options_.compaction_options_fifo.use_kv_ratio_compaction =
+        use_kv_ratio;
+    mutable_cf_options_.level0_file_num_compaction_trigger = trigger;
+  }
+
+  // Helper to finalize version storage and pick a FIFO compaction.
+  std::unique_ptr<Compaction> PickFIFOCompaction(FIFOCompactionPicker& picker) {
+    UpdateVersionStorageInfo();
+    return std::unique_ptr<Compaction>(picker.PickCompaction(
+        cf_name_, mutable_cf_options_, mutable_db_options_,
+        /*existing_snapshots=*/{}, /*snapshot_checker=*/nullptr,
+        vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
+  }
+
  private:
   Options CreateOptions(const Comparator* ucmp) const {
     Options opts;
@@ -4961,6 +5004,1023 @@ TEST_F(CompactionPickerU64TsTest, UniversalPickCompactionWithFullHistoryTsLow) {
   ASSERT_EQ(2U, compaction->num_input_files(0));
 }
 
+// ============================================================================
+// FIFO Ratio-Based Compaction Picker Unit Tests
+// Tests the actual FIFOCompactionPicker with use_kv_ratio_compaction option
+// (PickRatioBasedIntraL0Compaction path).
+// ============================================================================
+
+TEST_F(CompactionPickerTest, FIFORatioBasedCompactionFileCountThreshold) {
+  // Test three file count scenarios relative to trigger (= 4):
+  //   - fewer than trigger: no compaction
+  //   - exactly trigger: compaction fires
+  //   - more than trigger: compaction fires, picks >= 2 files
+
+  // Sub-test 1: fewer than trigger (3 files < trigger 4) -> no compaction
+  {
+    SetupFIFORatioBased(10 * 1024 * 1024, 1ULL * 1024 * 1024 * 1024, 4);
+    FIFOCompactionPicker picker(ioptions_, &icmp_);
+    Add(0, 1U, "100", "200", 64 * 1024);
+    Add(0, 2U, "200", "300", 64 * 1024);
+    Add(0, 3U, "300", "400", 64 * 1024);
+    AddBlobFile(100, 64ULL * 1024 * 1024);
+    AddBlobFile(101, 64ULL * 1024 * 1024);
+    AddBlobFile(102, 64ULL * 1024 * 1024);
+
+    auto compaction = PickFIFOCompaction(picker);
+    ASSERT_EQ(nullptr, compaction.get())
+        << "Should not compact when file count < trigger";
+  }
+
+  // Sub-test 2: exactly trigger (4 files = trigger 4) -> compaction fires
+  {
+    SetupFIFORatioBased(10 * 1024 * 1024, 1ULL * 1024 * 1024 * 1024, 4);
+    FIFOCompactionPicker picker(ioptions_, &icmp_);
+    Add(0, 1U, "100", "200", 64 * 1024);
+    Add(0, 2U, "200", "300", 32 * 1024);
+    Add(0, 3U, "300", "400", 48 * 1024);
+    Add(0, 4U, "400", "500", 96 * 1024);
+    // sst_ratio ~ 240KB/256MB ~ 0.001, target ~ 250KB
+    AddBlobFile(100, 64ULL * 1024 * 1024);
+    AddBlobFile(101, 64ULL * 1024 * 1024);
+    AddBlobFile(102, 64ULL * 1024 * 1024);
+    AddBlobFile(103, 64ULL * 1024 * 1024);
+
+    auto compaction = PickFIFOCompaction(picker);
+    ASSERT_NE(nullptr, compaction.get())
+        << "Should compact when file count == trigger";
+    ASSERT_EQ(CompactionReason::kFIFOReduceNumFiles,
+              compaction->compaction_reason());
+    ASSERT_EQ(0, compaction->output_level());
+  }
+
+  // Sub-test 3: more than trigger (8 files > trigger 4) -> compaction fires
+  {
+    SetupFIFORatioBased(100 * 1024 * 1024, 500ULL * 1024 * 1024, 4);
+    FIFOCompactionPicker picker(ioptions_, &icmp_);
+    Add(0, 1U, "100", "199", 64 * 1024);
+    Add(0, 2U, "200", "299", 32 * 1024);
+    Add(0, 3U, "300", "399", 48 * 1024);
+    Add(0, 4U, "400", "499", 96 * 1024);
+    Add(0, 5U, "500", "599", 64 * 1024);
+    Add(0, 6U, "600", "699", 48 * 1024);
+    Add(0, 7U, "700", "799", 64 * 1024);
+    Add(0, 8U, "800", "899", 64 * 1024);
+    for (uint64_t i = 0; i < 8; i++) {
+      AddBlobFile(100 + i, 50ULL * 1024 * 1024);
+    }
+
+    auto compaction = PickFIFOCompaction(picker);
+    ASSERT_NE(nullptr, compaction.get())
+        << "Should compact when file count > trigger";
+    ASSERT_EQ(CompactionReason::kFIFOReduceNumFiles,
+              compaction->compaction_reason());
+    ASSERT_GE(compaction->num_input_files(0), 2);
+  }
+}
+
+TEST_F(CompactionPickerTest, FIFORatioBasedCompactionNoBlobsFallback) {
+  // When total_blob == 0, sst_ratio = 1.0 and target becomes huge
+  // (max_data_files_size / trigger). With the tiered algorithm, the tier
+  // boundaries descend from target, and the lowest boundary where files
+  // can accumulate will be found. The algorithm should still work
+  // correctly (not crash) and produce a compaction at a low tier boundary.
+  SetupFIFORatioBased(10 * 1024 * 1024, 10ULL * 1024 * 1024 * 1024, 4);
+  FIFOCompactionPicker picker(ioptions_, &icmp_);
+
+  // Small SST files, no blob files
+  Add(0, 1U, "100", "200", 64 * 1024);
+  Add(0, 2U, "200", "300", 64 * 1024);
+  Add(0, 3U, "300", "400", 64 * 1024);
+  Add(0, 4U, "400", "500", 64 * 1024);
+
+  // No blob files added -- total_blob == 0
+
+  // With sst_ratio=1.0 and 10GB cap, target = 10GB/4 = 2.5GB.
+  // Tiered boundaries descend: 2.5GB, 625MB, ..., ~152KB, ~38KB, ...
+  // At boundary ~152KB, 4 files of 64KB accumulate to 256KB >= 152KB.
+  // The tiered algorithm finds a viable batch and compacts.
+  auto compaction = PickFIFOCompaction(picker);
+  ASSERT_NE(nullptr, compaction.get());
+  ASSERT_EQ(CompactionReason::kFIFOReduceNumFiles,
+            compaction->compaction_reason());
+  ASSERT_GE(compaction->num_input_files(0), 2);
+}
+
+TEST_F(CompactionPickerTest, FIFORatioBasedCompactionNoRecompaction) {
+  // When all files are at or above the target size (graduated),
+  // no re-compaction should happen. Files >= target are skipped at every
+  // tier boundary.
+  SetupFIFORatioBased(100 * 1024 * 1024, 500ULL * 1024 * 1024, 4);
+  FIFOCompactionPicker picker(ioptions_, &icmp_);
+  // Use max_compaction_bytes to set an explicit target of 256KB.
+  // Make all files >= 256KB so they are "graduated" (at or above target).
+  mutable_cf_options_.max_compaction_bytes = 256 * 1024;
+
+  // All files at 300KB, which is >= target (256KB) -> graduated
+  Add(0, 1U, "100", "199", 300 * 1024);
+  Add(0, 2U, "200", "299", 300 * 1024);
+  Add(0, 3U, "300", "399", 300 * 1024);
+  Add(0, 4U, "400", "499", 300 * 1024);
+
+  // All files are at/above target -> graduated -> no compaction.
+  auto compaction = PickFIFOCompaction(picker);
+  ASSERT_EQ(nullptr, compaction.get());
+}
+
+TEST_F(CompactionPickerTest,
+       FIFORatioBasedCompactionWithExplicitMaxCompactionBytes) {
+  // When max_compaction_bytes > 0, it overrides the auto-calculated target.
+  SetupFIFORatioBased(100 * 1024 * 1024, 10ULL * 1024 * 1024 * 1024, 4);
+  FIFOCompactionPicker picker(ioptions_, &icmp_);
+  // Explicitly set target to 256KB
+  mutable_cf_options_.max_compaction_bytes = 256 * 1024;
+
+  // 6 small SST files
+  Add(0, 1U, "100", "199", 64 * 1024);
+  Add(0, 2U, "200", "299", 64 * 1024);
+  Add(0, 3U, "300", "399", 64 * 1024);
+  Add(0, 4U, "400", "499", 64 * 1024);
+  Add(0, 5U, "500", "599", 64 * 1024);
+  Add(0, 6U, "600", "699", 64 * 1024);
+
+  // No blob files needed when max_compaction_bytes is explicitly set
+
+  // target = max_compaction_bytes = 256KB.
+  // Tier boundaries descend from 256KB: [25KB, 256KB] (trigger=4, floor=10KB).
+  // At boundary 25KB: each 64KB file >= 25KB -> skipped.
+  // At boundary 256KB: all 64KB files < 256KB -> accumulated until >= 256KB.
+  auto compaction = PickFIFOCompaction(picker);
+  ASSERT_NE(nullptr, compaction.get());
+  ASSERT_EQ(CompactionReason::kFIFOReduceNumFiles,
+            compaction->compaction_reason());
+  ASSERT_EQ(0, compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, FIFORatioBasedCompactionFallbackToOldPath) {
+  // When use_kv_ratio_compaction is false, PickIntraL0Compaction should
+  // fall through to the old PickCostBasedIntraL0Compaction path.
+
+  // Sub-test 1: allow_compaction = false -> no intra-L0 at all
+  {
+    SetupFIFORatioBased(10 * 1024 * 1024, 0, 4,
+                        /*allow_compaction=*/false, /*use_kv_ratio=*/false);
+    FIFOCompactionPicker picker(ioptions_, &icmp_);
+    Add(0, 1U, "100", "200", 64 * 1024);
+    Add(0, 2U, "200", "300", 64 * 1024);
+    Add(0, 3U, "300", "400", 64 * 1024);
+    Add(0, 4U, "400", "500", 64 * 1024);
+
+    // Total size (256KB) < max_table_files_size (10MB), so no deletion.
+    // allow_compaction=false, so no intra-L0 either.
+    auto compaction = PickFIFOCompaction(picker);
+    ASSERT_EQ(nullptr, compaction.get());
+  }
+
+  // Sub-test 2: allow_compaction = true, use_kv_ratio = false
+  // -> falls through to old PickCostBasedIntraL0Compaction path
+  {
+    SetupFIFORatioBased(10 * 1024 * 1024, 0, 4,
+                        /*allow_compaction=*/true, /*use_kv_ratio=*/false);
+    // The old path uses max_compaction_bytes to cap total input size.
+    // In production this is sanitized to target_file_size_base * 25,
+    // but tests bypass sanitization, so set it explicitly.
+    mutable_cf_options_.max_compaction_bytes = 64 * 1024 * 1024;  // 64MB
+    FIFOCompactionPicker picker(ioptions_, &icmp_);
+    Add(0, 1U, "100", "200", 64 * 1024);
+    Add(0, 2U, "200", "300", 64 * 1024);
+    Add(0, 3U, "300", "400", 64 * 1024);
+    Add(0, 4U, "400", "500", 64 * 1024);
+
+    // Total size (256KB) < max_table_files_size (10MB), so no deletion.
+    // allow_compaction=true and use_kv_ratio=false -> old path.
+    // 4 files >= trigger(4), per_del = 256KB/3 ~ 85KB < 1.1*WBS -> passes.
+    auto compaction = PickFIFOCompaction(picker);
+    ASSERT_NE(nullptr, compaction.get())
+        << "Old path should compact when allow_compaction=true";
+    ASSERT_EQ(CompactionReason::kFIFOReduceNumFiles,
+              compaction->compaction_reason());
+  }
+}
+
+// ============================================================================
+// FIFO Option Validation Tests
+// Tests that ColumnFamilyData::ValidateOptions rejects invalid configurations
+// for use_kv_ratio_compaction.
+// ============================================================================
+
+TEST_F(CompactionPickerTest, FIFOOptionValidation) {
+  auto validate = [](std::function<void(ColumnFamilyOptions&)> configure) {
+    ColumnFamilyOptions cf_opts;
+    cf_opts.compaction_style = kCompactionStyleFIFO;
+    cf_opts.compaction_options_fifo.allow_compaction = true;
+    cf_opts.compaction_options_fifo.use_kv_ratio_compaction = true;
+    cf_opts.compaction_options_fifo.max_data_files_size =
+        1ULL * 1024 * 1024 * 1024;
+    cf_opts.num_levels = 1;
+    configure(cf_opts);
+    return ColumnFamilyData::ValidateOptions(DBOptions(), cf_opts);
+  };
+
+  // use_kv_ratio_compaction requires FIFO compaction style
+  ASSERT_TRUE(validate([](auto& o) {
+                o.compaction_style = kCompactionStyleLevel;
+              }).IsInvalidArgument());
+
+  // use_kv_ratio_compaction requires allow_compaction
+  ASSERT_TRUE(validate([](auto& o) {
+                o.compaction_options_fifo.allow_compaction = false;
+              }).IsInvalidArgument());
+
+  // use_kv_ratio_compaction requires max_data_files_size > 0
+  ASSERT_TRUE(validate([](auto& o) {
+                o.compaction_options_fifo.max_data_files_size = 0;
+              }).IsInvalidArgument());
+
+  // Accepts multi-level (for migration from level/universal to FIFO)
+  ASSERT_OK(validate([](auto& o) { o.num_levels = 4; }));
+
+  // Accepts valid single-level config
+  ASSERT_OK(validate([](auto& /*o*/) {}));
+
+  // max_data_files_size < max_table_files_size is invalid when non-zero
+  ASSERT_TRUE(validate([](auto& o) {
+                o.compaction_options_fifo.use_kv_ratio_compaction = false;
+                o.compaction_options_fifo.max_data_files_size = 0;
+                o.compaction_options_fifo.max_table_files_size =
+                    1ULL * 1024 * 1024 * 1024;
+                o.compaction_options_fifo.max_data_files_size =
+                    500ULL * 1024 * 1024;
+              }).IsInvalidArgument());
+
+  // max_data_files_size == max_table_files_size is valid
+  ASSERT_OK(validate([](auto& o) {
+    o.compaction_options_fifo.use_kv_ratio_compaction = false;
+    o.compaction_options_fifo.max_data_files_size = 0;
+    o.compaction_options_fifo.max_table_files_size = 1ULL * 1024 * 1024 * 1024;
+    o.compaction_options_fifo.max_data_files_size = 1ULL * 1024 * 1024 * 1024;
+  }));
+}
+
+// ============================================================================
+// FIFO Ratio-Based Compaction: Multi-Level Migration Graceful Skip
+// Tests that PickRatioBasedIntraL0Compaction gracefully skips when non-L0
+// levels still contain files (e.g., during migration from level/universal
+// to FIFO), and resumes once all data has been drained to L0.
+// ============================================================================
+
+TEST_F(CompactionPickerTest, FIFORatioBasedMultiLevelMigration) {
+  // Sub-case 1: During migration (non-L0 levels have files).
+  // Ratio-based intra-L0 compaction should be skipped.
+  {
+    SetupFIFORatioBased(/*max_table_files_size=*/100 * 1024 * 1024,
+                        /*max_data_files_size=*/1ULL * 1024 * 1024 * 1024,
+                        /*trigger=*/4,
+                        /*allow_compaction=*/true,
+                        /*use_kv_ratio=*/true,
+                        /*num_levels=*/4);
+    FIFOCompactionPicker picker(ioptions_, &icmp_);
+
+    Add(0, 1U, "100", "200", 64 * 1024);
+    Add(0, 2U, "200", "300", 64 * 1024);
+    Add(0, 3U, "300", "400", 64 * 1024);
+    Add(0, 4U, "400", "500", 64 * 1024);
+    Add(0, 5U, "500", "600", 64 * 1024);
+    Add(2, 10U, "100", "600", 50 * 1024 * 1024);
+    AddBlobFile(100, 64ULL * 1024 * 1024);
+    AddBlobFile(101, 64ULL * 1024 * 1024);
+
+    auto compaction = PickFIFOCompaction(picker);
+    if (compaction != nullptr) {
+      if (compaction->compaction_reason() ==
+          CompactionReason::kFIFOReduceNumFiles) {
+        // Cost-based path is fine; verify it's not ratio-based.
+        ASSERT_EQ(16 * 1024 * 1024, compaction->max_output_file_size());
+      }
+    }
+  }
+
+  // Sub-case 2: After migration (only L0 has files).
+  // Ratio-based compaction should resume normally.
+  {
+    SetupFIFORatioBased(/*max_table_files_size=*/100 * 1024 * 1024,
+                        /*max_data_files_size=*/1ULL * 1024 * 1024 * 1024,
+                        /*trigger=*/4,
+                        /*allow_compaction=*/true,
+                        /*use_kv_ratio=*/true,
+                        /*num_levels=*/4);
+    FIFOCompactionPicker picker(ioptions_, &icmp_);
+
+    Add(0, 1U, "100", "200", 64 * 1024);
+    Add(0, 2U, "200", "300", 32 * 1024);
+    Add(0, 3U, "300", "400", 48 * 1024);
+    Add(0, 4U, "400", "500", 96 * 1024);
+    AddBlobFile(100, 64ULL * 1024 * 1024);
+    AddBlobFile(101, 64ULL * 1024 * 1024);
+    AddBlobFile(102, 64ULL * 1024 * 1024);
+    AddBlobFile(103, 64ULL * 1024 * 1024);
+
+    auto compaction = PickFIFOCompaction(picker);
+    ASSERT_NE(nullptr, compaction.get())
+        << "Should compact when non-L0 levels are empty (migration complete)";
+    ASSERT_EQ(CompactionReason::kFIFOReduceNumFiles,
+              compaction->compaction_reason());
+    ASSERT_EQ(0, compaction->output_level());
+  }
+}
+
+// ============================================================================
+// FIFO TTL Compaction with Blob-Aware Estimation Tests
+// Tests that PickTTLCompaction correctly estimates remaining data (SST + blob)
+// in both single-level and multi-level FIFO configurations.
+// ============================================================================
+
+TEST_F(CompactionPickerTest, FIFOTTLBlobEstimationSingleLevel) {
+  // Single-level FIFO with TTL and max_data_files_size.
+  // After dropping expired L0 SSTs, the blob estimate should be proportional
+  // to the remaining SST fraction.
+  //
+  // Common setup: L0 = 4 files x 50KB = 200KB, files 3,4 expired.
+  // Remaining SST after drop = 100KB = 50%.
+
+  auto run = [&](uint64_t blob_total, uint64_t limit, bool expect_ttl_fires) {
+    ioptions_.compaction_style = kCompactionStyleFIFO;
+    NewVersionStorage(1, kCompactionStyleFIFO);
+    mutable_cf_options_.compaction_options_fifo.max_table_files_size = limit;
+    mutable_cf_options_.compaction_options_fifo.max_data_files_size = limit;
+    mutable_cf_options_.compaction_options_fifo.allow_compaction = true;
+    mutable_cf_options_.ttl = 3600;
+    FIFOCompactionPicker picker(ioptions_, &icmp_);
+
+    uint64_t recent_time = static_cast<uint64_t>(time(nullptr));
+    Add(0, 1U, "100", "200", 50 * 1024, 0, 100, 100, 0, false,
+        Temperature::kUnknown, kUnknownOldestAncesterTime, recent_time);
+    Add(0, 2U, "200", "300", 50 * 1024, 0, 100, 100, 0, false,
+        Temperature::kUnknown, kUnknownOldestAncesterTime, recent_time);
+    Add(0, 3U, "300", "400", 50 * 1024, 0, 100, 100, 0, false,
+        Temperature::kUnknown, kUnknownOldestAncesterTime, 1);
+    Add(0, 4U, "400", "500", 50 * 1024, 0, 100, 100, 0, false,
+        Temperature::kUnknown, kUnknownOldestAncesterTime, 1);
+    if (blob_total > 0) {
+      AddBlobFile(100, blob_total / 2);
+      AddBlobFile(101, blob_total / 2);
+    }
+
+    auto compaction = PickFIFOCompaction(picker);
+    if (expect_ttl_fires) {
+      ASSERT_NE(nullptr, compaction.get())
+          << "TTL compaction should fire when remaining data < limit";
+      ASSERT_EQ(CompactionReason::kFIFOTtl, compaction->compaction_reason());
+      ASSERT_EQ(2U, compaction->num_input_files(0));
+    } else {
+      if (compaction != nullptr) {
+        ASSERT_NE(CompactionReason::kFIFOTtl, compaction->compaction_reason())
+            << "TTL should not fire when remaining data still exceeds limit";
+      }
+    }
+  };
+
+  // Sub-case 1: Under limit after drop.
+  //   blob=400KB, limit=500KB.
+  //   effective = 100KB + (100KB/200KB)*400KB = 300KB < 500KB -> fires.
+  run(400 * 1024, 500 * 1024, /*expect_ttl_fires=*/true);
+
+  // Sub-case 2: Over limit after drop.
+  //   blob=4MB, limit=100KB.
+  //   effective = 100KB + (100KB/200KB)*4MB ~ 2MB >> 100KB -> does NOT fire.
+  run(4ULL * 1024 * 1024, 100 * 1024, /*expect_ttl_fires=*/false);
+
+  // Sub-case 3: No blob files. Falls back to SST-only estimation.
+  //   blob=0, limit=150KB. remaining SST = 100KB < 150KB -> fires.
+  run(0, 150 * 1024, /*expect_ttl_fires=*/true);
+}
+
+TEST_F(CompactionPickerTest, FIFOTTLBlobEstimationMultiLevel) {
+  // Multi-level FIFO (migration) with TTL and max_data_files_size.
+  // This is the ritical bug fix scenario:
+  //   - L0 has some SSTs, L2 has legacy SSTs from migration
+  //   - Blob files cover ALL levels
+  //   - The estimation must use total SST across ALL levels (not just L0)
+  //     to avoid inflating the blob proportion.
+  //
+  // Setup:
+  //   L0: 4 files x 50KB = 200KB SST (files 3,4 expired)
+  //   L2: 1 file x 200KB SST (legacy migration data)
+  //   Total SST = 400KB
+  //   Blob: 800KB total
+  //   max_data_files_size = 1000KB
+  //   Remaining SST after TTL drop = 400KB - 100KB = 300KB
+  //
+  //   CORRECT (fixed): effective = 300KB + (300KB/400KB)*800KB = 300+600 =
+  //   900KB < 1000KB -> fires BUG (old):        effective = 100KB +
+  //   (100KB/200KB)*800KB = 100+400 = 500KB < 1000KB -> fires
+  //                     (coincidentally fires too, but with wrong estimate)
+  //
+  // To distinguish correct vs buggy behavior, use a limit that triggers the
+  // difference: set max_data_files_size = 850KB.
+  //   CORRECT: effective = 300KB + (300KB/400KB)*800KB = 900KB > 850KB -> does
+  //   NOT fire BUG:     effective = 100KB + (100KB/200KB)*800KB = 500KB < 850KB
+  //   -> fires (wrong!)
+  ioptions_.compaction_style = kCompactionStyleFIFO;
+  NewVersionStorage(4, kCompactionStyleFIFO);
+  mutable_cf_options_.compaction_options_fifo.max_table_files_size =
+      850 * 1024;  // match max_data_files_size
+  mutable_cf_options_.compaction_options_fifo.max_data_files_size = 850 * 1024;
+  mutable_cf_options_.compaction_options_fifo.allow_compaction = true;
+  mutable_cf_options_.ttl = 3600;
+  FIFOCompactionPicker picker(ioptions_, &icmp_);
+
+  uint64_t recent_time = static_cast<uint64_t>(time(nullptr));
+  // L0 files: 2 recent, 2 expired
+  Add(0, 1U, "100", "200", 50 * 1024, 0, 100, 100, 0, false,
+      Temperature::kUnknown, kUnknownOldestAncesterTime, recent_time);
+  Add(0, 2U, "200", "300", 50 * 1024, 0, 100, 100, 0, false,
+      Temperature::kUnknown, kUnknownOldestAncesterTime, recent_time);
+  Add(0, 3U, "300", "400", 50 * 1024, 0, 100, 100, 0, false,
+      Temperature::kUnknown, kUnknownOldestAncesterTime, 1);
+  Add(0, 4U, "400", "500", 50 * 1024, 0, 100, 100, 0, false,
+      Temperature::kUnknown, kUnknownOldestAncesterTime, 1);
+  // L2 legacy migration file
+  Add(2, 10U, "100", "600", 200 * 1024);
+  // Blob files (associated with ALL levels)
+  AddBlobFile(100, 400 * 1024);
+  AddBlobFile(101, 400 * 1024);
+
+  auto compaction = PickFIFOCompaction(picker);
+  // With correct all-levels estimation:
+  //   remaining_sst_all = 400KB - 100KB(dropped) = 300KB
+  //   effective = 300KB + (300KB/400KB)*800KB = 900KB > 850KB
+  //   -> TTL should NOT fire (falls through to size-based)
+  if (compaction != nullptr) {
+    ASSERT_NE(CompactionReason::kFIFOTtl, compaction->compaction_reason())
+        << "Multi-level FIFO: TTL should not fire when correct all-levels "
+           "blob estimation shows data still exceeds limit";
+  }
+}
+
+TEST_F(CompactionPickerTest, FIFOBlobAwareSizeDropping) {
+  // PickSizeCompaction with max_data_files_size should account for blob data.
+  //
+  // Sub-case 1: Single-level. SST = 200KB, blob = 500MB, limit = 200MB.
+  //   effective_size ~ 500MB >> 200MB -> drops from L0.
+  {
+    SetupFIFORatioBased(/*max_table=*/200ULL * 1024 * 1024,
+                        /*max_data=*/200ULL * 1024 * 1024,
+                        /*trigger=*/4,
+                        /*allow_compaction=*/true,
+                        /*use_kv_ratio=*/false);
+    FIFOCompactionPicker picker(ioptions_, &icmp_);
+
+    Add(0, 1U, "100", "199", 40 * 1024);
+    Add(0, 2U, "200", "299", 40 * 1024);
+    Add(0, 3U, "300", "399", 40 * 1024);
+    Add(0, 4U, "400", "499", 40 * 1024);
+    Add(0, 5U, "500", "599", 40 * 1024);
+    AddBlobFile(100, 100ULL * 1024 * 1024);
+    AddBlobFile(101, 100ULL * 1024 * 1024);
+    AddBlobFile(102, 100ULL * 1024 * 1024);
+    AddBlobFile(103, 100ULL * 1024 * 1024);
+    AddBlobFile(104, 100ULL * 1024 * 1024);
+
+    auto compaction = PickFIFOCompaction(picker);
+    ASSERT_NE(nullptr, compaction.get());
+    ASSERT_EQ(CompactionReason::kFIFOMaxSize, compaction->compaction_reason());
+    ASSERT_GE(compaction->num_input_files(0), 1);
+  }
+
+  // Sub-case 2: Multi-level (migration). L0=100KB, L2=150KB, blob=500KB.
+  //   effective_size = 250KB + 500KB = 750KB > 400KB -> drops from L2.
+  {
+    ioptions_.compaction_style = kCompactionStyleFIFO;
+    NewVersionStorage(4, kCompactionStyleFIFO);
+    mutable_cf_options_.compaction_options_fifo.max_table_files_size =
+        400 * 1024;
+    mutable_cf_options_.compaction_options_fifo.max_data_files_size =
+        400 * 1024;
+    mutable_cf_options_.compaction_options_fifo.allow_compaction = true;
+    mutable_cf_options_.ttl = 0;
+    FIFOCompactionPicker picker(ioptions_, &icmp_);
+
+    Add(0, 1U, "100", "200", 50 * 1024);
+    Add(0, 2U, "200", "300", 50 * 1024);
+    Add(2, 10U, "100", "300", 50 * 1024);
+    Add(2, 11U, "300", "500", 50 * 1024);
+    Add(2, 12U, "500", "700", 50 * 1024);
+    AddBlobFile(100, 250 * 1024);
+    AddBlobFile(101, 250 * 1024);
+
+    auto compaction = PickFIFOCompaction(picker);
+    ASSERT_NE(nullptr, compaction.get());
+    ASSERT_EQ(CompactionReason::kFIFOMaxSize, compaction->compaction_reason());
+    ASSERT_EQ(2, compaction->start_level());
+    ASSERT_GE(compaction->num_input_files(0), 1U);
+  }
+
+  // Sub-case 3: Under limit. SST = 256KB, blob = 200MB, limit = 1GB.
+  //   effective_size ~ 200MB < 1GB -> no dropping.
+  {
+    SetupFIFORatioBased(/*max_table=*/1ULL * 1024 * 1024 * 1024,
+                        /*max_data=*/1ULL * 1024 * 1024 * 1024,
+                        /*trigger=*/4,
+                        /*allow_compaction=*/true,
+                        /*use_kv_ratio=*/true);
+    FIFOCompactionPicker picker(ioptions_, &icmp_);
+
+    Add(0, 1U, "100", "199", 64 * 1024);
+    Add(0, 2U, "200", "299", 64 * 1024);
+    Add(0, 3U, "300", "399", 64 * 1024);
+    Add(0, 4U, "400", "499", 64 * 1024);
+    AddBlobFile(100, 50ULL * 1024 * 1024);
+    AddBlobFile(101, 50ULL * 1024 * 1024);
+    AddBlobFile(102, 50ULL * 1024 * 1024);
+    AddBlobFile(103, 50ULL * 1024 * 1024);
+
+    auto compaction = PickFIFOCompaction(picker);
+    if (compaction) {
+      ASSERT_NE(CompactionReason::kFIFOMaxSize,
+                compaction->compaction_reason());
+    }
+  }
+}
+
+// ============================================================================
+// FIFO Blob-Aware Score Computation Test
+// Tests that ComputeCompactionScore includes blob sizes when
+// max_data_files_size > 0.
+// ============================================================================
+
+TEST_F(CompactionPickerTest, FIFOBlobAwareScoreComputation) {
+  // Sub-case 1: With max_data_files_size, score includes blob sizes.
+  //   SST = 100KB, blob = 500MB, max_data = 200MB -> score ~ 2.5
+  {
+    ioptions_.compaction_style = kCompactionStyleFIFO;
+    NewVersionStorage(1, kCompactionStyleFIFO);
+    mutable_cf_options_.compaction_options_fifo.max_table_files_size =
+        200ULL * 1024 * 1024;
+    mutable_cf_options_.compaction_options_fifo.max_data_files_size =
+        200ULL * 1024 * 1024;
+    mutable_cf_options_.compaction_options_fifo.allow_compaction = false;
+    mutable_cf_options_.level0_file_num_compaction_trigger = 4;
+
+    Add(0, 1U, "100", "199", 25 * 1024);
+    Add(0, 2U, "200", "299", 25 * 1024);
+    Add(0, 3U, "300", "399", 25 * 1024);
+    Add(0, 4U, "400", "499", 25 * 1024);
+    AddBlobFile(100, 500ULL * 1024 * 1024);
+    UpdateVersionStorageInfo();
+
+    double score = vstorage_->CompactionScore(0);
+    ASSERT_GT(score, 2.0) << "Score should reflect 500MB/200MB ~ 2.5";
+  }
+
+  // Sub-case 2: Without max_data_files_size, score ignores blobs.
+  //   SST = 400KB < 1MB, blob = 500MB ignored -> score ~ 0.4
+  {
+    ioptions_.compaction_style = kCompactionStyleFIFO;
+    NewVersionStorage(1, kCompactionStyleFIFO);
+    mutable_cf_options_.compaction_options_fifo.max_table_files_size =
+        1ULL * 1024 * 1024;
+    mutable_cf_options_.compaction_options_fifo.max_data_files_size = 0;
+    mutable_cf_options_.compaction_options_fifo.allow_compaction = false;
+    mutable_cf_options_.level0_file_num_compaction_trigger = 4;
+
+    Add(0, 1U, "100", "199", 100 * 1024);
+    Add(0, 2U, "200", "299", 100 * 1024);
+    Add(0, 3U, "300", "399", 100 * 1024);
+    Add(0, 4U, "400", "499", 100 * 1024);
+    AddBlobFile(100, 500ULL * 1024 * 1024);
+    UpdateVersionStorageInfo();
+
+    double score = vstorage_->CompactionScore(0);
+    ASSERT_LT(score, 1.0)
+        << "Score should be < 1 when only SST sizes are counted";
+  }
+}
+
+// ============================================================================
+// FIFO + BlobDB Intra-L0 Compaction Picking Tests
+//
+// These tests validate the tiered intra-L0 compaction picking algorithm
+// over multiple flush/compaction cycles. Each round:
+//   1. Add a flush file to the L0 file list
+//   2. Rebuild VersionStorageInfo and call FIFOCompactionPicker::PickCompaction
+//   3. If compaction is picked, update the file list accordingly
+//   4. Repeat
+//
+// The compaction PICKING uses the real FIFOCompactionPicker -- this ensures
+// the tests always match the production picking logic. The rest of the
+// system (compaction execution, file metadata updates, FIFO dropping) is
+// handled by test helpers, since wiring up the full compaction execution
+// pipeline (CompactionJob, VersionEdit, etc.) would add significant
+// complexity without testing the picking logic more thoroughly.
+//
+// ============================================================================
+
+class FIFORatioBasedCompactionPickingTest : public CompactionPickerTest {
+ protected:
+  struct L0File {
+    uint64_t size;       // SST file size in bytes
+    uint64_t blob_size;  // Associated blob data size
+    uint64_t age;        // Creation order (lower = older)
+    bool is_compacted;   // Created by compaction (vs flush)
+  };
+
+  // Pick compaction using FIFOCompactionPicker.
+  //
+  // Rebuilds VersionStorageInfo from the files vector and calls
+  // PickCompaction on the given picker. Maps the returned
+  // Compaction's input files back to vector indices.
+  //
+  // Returns the picked indices, or empty if no compaction.
+  // Also returns the compaction reason via out-parameter.
+  std::vector<size_t> PickCompactionFromFiles(
+      FIFOCompactionPicker& picker, const std::vector<L0File>& files,
+      uint64_t max_table_files_size, uint64_t max_data_files_size, int trigger,
+      CompactionReason* out_reason = nullptr) {
+    // Rebuild VersionStorageInfo from the current file list
+    NewVersionStorage(1, kCompactionStyleFIFO);
+    mutable_cf_options_.compaction_options_fifo.max_table_files_size =
+        max_table_files_size;
+    mutable_cf_options_.compaction_options_fifo.max_data_files_size =
+        max_data_files_size;
+    mutable_cf_options_.compaction_options_fifo.allow_compaction = true;
+    mutable_cf_options_.compaction_options_fifo.use_kv_ratio_compaction = true;
+    mutable_cf_options_.level0_file_num_compaction_trigger = trigger;
+
+    // Add files: newest first. Use descending file numbers so L0 sort
+    // (newest-first by epoch/seqno/file_number) matches our order.
+    uint32_t base_fn = static_cast<uint32_t>(files.size());
+    for (size_t i = 0; i < files.size(); i++) {
+      uint32_t fn = base_fn - static_cast<uint32_t>(i);
+      std::string smallest = "k" + std::to_string(10000 + fn * 10);
+      std::string largest = "k" + std::to_string(10000 + fn * 10 + 9);
+      Add(0, fn, smallest.c_str(), largest.c_str(), files[i].size);
+    }
+
+    // Add one blob file with the total blob size
+    uint64_t total_blob = 0;
+    for (const auto& f : files) {
+      total_blob += f.blob_size;
+    }
+    if (total_blob > 0) {
+      AddBlobFile(9999, total_blob);
+    }
+
+    UpdateVersionStorageInfo();
+
+    std::unique_ptr<Compaction> compaction(picker.PickCompaction(
+        cf_name_, mutable_cf_options_, mutable_db_options_,
+        /*existing_snapshots=*/{}, /*snapshot_checker=*/nullptr,
+        vstorage_.get(), &log_buffer_, /*full_history_ts_low=*/""));
+
+    if (!compaction) return {};
+
+    if (out_reason) {
+      *out_reason = compaction->compaction_reason();
+    }
+
+    // For size-based dropping (kFIFOMaxSize / kFIFOTtl), map input files
+    // back to sim indices, same as for intra-L0.
+    std::vector<size_t> result;
+    for (size_t j = 0; j < compaction->num_input_files(0); j++) {
+      uint32_t fn =
+          static_cast<uint32_t>(compaction->input(0, j)->fd.GetNumber());
+      size_t idx = base_fn - fn;
+      result.push_back(idx);
+    }
+
+    // Unregister so the picker allows the next compaction
+    picker.UnregisterCompaction(compaction.get());
+
+    return result;
+  }
+
+  // Execute one compaction: merge input files into 1 output
+  void ExecuteCompaction(std::vector<L0File>& files,
+                         const std::vector<size_t>& input_indices,
+                         uint64_t& global_age) {
+    uint64_t output_size = 0;
+    uint64_t output_blob = 0;
+    for (size_t idx : input_indices) {
+      output_size += files[idx].size;
+      output_blob += files[idx].blob_size;
+    }
+
+    size_t oldest_input_pos = 0;
+    for (size_t idx : input_indices) {
+      oldest_input_pos = std::max(oldest_input_pos, idx);
+    }
+
+    std::vector<size_t> sorted_indices = input_indices;
+    std::sort(sorted_indices.rbegin(), sorted_indices.rend());
+    for (size_t idx : sorted_indices) {
+      files.erase(files.begin() + idx);
+    }
+
+    size_t insert_pos = oldest_input_pos;
+    for (size_t idx : sorted_indices) {
+      if (idx < oldest_input_pos) insert_pos--;
+    }
+    insert_pos = std::min(insert_pos, files.size());
+    files.insert(files.begin() + insert_pos,
+                 {output_size, output_blob, global_age++, true});
+  }
+
+  // Compute statistics about compacted file sizes
+  struct FileStats {
+    uint64_t count;
+    uint64_t min_size;
+    uint64_t max_size;
+    double mean_size;
+    double cv;
+  };
+
+  FileStats ComputeStats(const std::vector<L0File>& files,
+                         bool compacted_only) {
+    std::vector<uint64_t> sizes;
+    for (const auto& f : files) {
+      if (!compacted_only || f.is_compacted) {
+        sizes.push_back(f.size);
+      }
+    }
+    if (sizes.empty()) return {0, 0, 0, 0.0, 0.0};
+
+    uint64_t sum = 0;
+    uint64_t min_s = UINT64_MAX, max_s = 0;
+    for (uint64_t s : sizes) {
+      sum += s;
+      min_s = std::min(min_s, s);
+      max_s = std::max(max_s, s);
+    }
+    double mean = static_cast<double>(sum) / sizes.size();
+
+    double variance = 0;
+    for (uint64_t s : sizes) {
+      double diff = static_cast<double>(s) - mean;
+      variance += diff * diff;
+    }
+    variance /= sizes.size();
+    double stddev = std::sqrt(variance);
+    double cv = mean > 0 ? stddev / mean : 0;
+
+    return {sizes.size(), min_s, max_s, mean, cv};
+  }
+
+  // Track write amplification
+  struct WriteAmpTracker {
+    uint64_t bytes_flushed = 0;
+    uint64_t bytes_compacted = 0;
+
+    double sst_write_amp() const {
+      return bytes_flushed > 0
+                 ? static_cast<double>(bytes_flushed + bytes_compacted) /
+                       bytes_flushed
+                 : 1.0;
+    }
+  };
+
+  struct TestState {
+    std::vector<L0File> files;
+    uint64_t global_age = 0;
+    WriteAmpTracker wa;
+    int compaction_count = 0;
+    uint64_t max_file_count_seen = 0;
+  };
+
+  using FlushGenerator =
+      std::function<std::pair<uint64_t, uint64_t>(int round)>;
+
+  // Core test loop: flush -> pick -> execute -> repeat.
+  void RunFlushAndCompact(TestState& s, int num_rounds, int trigger,
+                          uint64_t max_data_files_size,
+                          const FlushGenerator& gen) {
+    ioptions_.compaction_style = kCompactionStyleFIFO;
+    FIFOCompactionPicker picker(ioptions_, &icmp_);
+
+    // Use max_data_files_size for both limits. When max_data_files_size > 0,
+    // it takes precedence and max_table_files_size is ignored, but keeping
+    // them consistent avoids contradictory configurations.
+    const uint64_t max_table_files_size = max_data_files_size;
+
+    for (int round = 0; round < num_rounds; round++) {
+      auto [sst_size, blob_size] = gen(round);
+      s.files.insert(s.files.begin(),
+                     {sst_size, blob_size, s.global_age++, false});
+      s.wa.bytes_flushed += sst_size;
+
+      // Pick compaction. Handle both dropping and intra-L0 results.
+      CompactionReason reason;
+      auto inputs =
+          PickCompactionFromFiles(picker, s.files, max_table_files_size,
+                                  max_data_files_size, trigger, &reason);
+      if (!inputs.empty()) {
+        if (reason == CompactionReason::kFIFOMaxSize ||
+            reason == CompactionReason::kFIFOTtl) {
+          // Size/TTL dropping: remove the picked files
+          std::vector<size_t> sorted = inputs;
+          std::sort(sorted.rbegin(), sorted.rend());
+          for (size_t idx : sorted) {
+            s.files.erase(s.files.begin() + idx);
+          }
+        } else {
+          // Intra-L0 compaction: merge picked files
+          uint64_t compaction_input = 0;
+          for (size_t idx : inputs) {
+            compaction_input += s.files[idx].size;
+          }
+          s.wa.bytes_compacted += compaction_input;
+          ExecuteCompaction(s.files, inputs, s.global_age);
+          s.compaction_count++;
+        }
+      }
+      s.max_file_count_seen = std::max(s.max_file_count_seen,
+                                       static_cast<uint64_t>(s.files.size()));
+    }
+  }
+
+  // Assertion helpers
+  void AssertFileCountBounded(const std::vector<L0File>& files,
+                              uint64_t max_count, uint64_t multiplier = 3) {
+    ASSERT_LE(files.size(), max_count * multiplier)
+        << "File count " << files.size() << " exceeds "
+        << max_count * multiplier;
+  }
+
+  void AssertCompactedUniform(const std::vector<L0File>& files, double max_cv) {
+    auto stats = ComputeStats(files, true);
+    if (stats.count >= 2) {
+      ASSERT_LE(stats.cv, max_cv)
+          << "Compacted CV=" << stats.cv << " exceeds " << max_cv
+          << " (min=" << stats.min_size << " max=" << stats.max_size
+          << " mean=" << stats.mean_size << " count=" << stats.count << ")";
+    }
+  }
+
+  void AssertLowWriteAmp(const WriteAmpTracker& wa, double max_wa = 3.0) {
+    ASSERT_LE(wa.sst_write_amp(), max_wa)
+        << "Write amp=" << wa.sst_write_amp() << " exceeds " << max_wa;
+  }
+
+  void AssertStandardGoals(const TestState& s, uint64_t max_count,
+                           double max_cv = 0.30, double max_wa = 3.0,
+                           uint64_t file_mult = 3) {
+    AssertFileCountBounded(s.files, max_count, file_mult);
+    AssertCompactedUniform(s.files, max_cv);
+    AssertLowWriteAmp(s.wa, max_wa);
+  }
+
+  // Verify that graduated files (>= target) are never picked for compaction.
+  void AssertGraduatedNotPicked(const std::vector<L0File>& files, int trigger,
+                                uint64_t max_data_files_size) {
+    ioptions_.compaction_style = kCompactionStyleFIFO;
+    FIFOCompactionPicker picker(ioptions_, &icmp_);
+    const uint64_t max_table_files_size = max_data_files_size;
+
+    CompactionReason reason;
+    auto inputs =
+        PickCompactionFromFiles(picker, files, max_table_files_size,
+                                max_data_files_size, trigger, &reason);
+    if (!inputs.empty() && reason == CompactionReason::kFIFOReduceNumFiles) {
+      // Compute target from the picker's perspective: we need to estimate
+      // it the same way the picker does.
+      uint64_t total_sst = 0, total_blob = 0;
+      for (const auto& f : files) {
+        total_sst += f.size;
+        total_blob += f.blob_size;
+      }
+      double sst_ratio = total_blob > 0 ? static_cast<double>(total_sst) /
+                                              (total_sst + total_blob)
+                                        : 1.0;
+      uint64_t target =
+          static_cast<uint64_t>(max_data_files_size * sst_ratio) / trigger;
+
+      for (size_t idx : inputs) {
+        ASSERT_LT(files[idx].size, target)
+            << "Should not re-compact graduated file at index " << idx
+            << " size=" << files[idx].size << " target=" << target;
+      }
+    }
+  }
+};
+
+// Variable flush + FIFO dropping -- the full scenario.
+// Variable SST sizes (32-128KB), variable blob sizes (32-96MB), with
+// FIFO size-based dropping active. This covers constant flush, variable
+// flush, and FIFO dropping behaviors in a single test.
+TEST_F(FIFORatioBasedCompactionPickingTest, VariableFlushWithFIFODropping) {
+  const uint64_t kCap = 500ULL * 1024 * 1024;
+  Random rng(42);
+  TestState s;
+  RunFlushAndCompact(s, 200, /*trigger=*/10, kCap, [&](int) {
+    return std::make_pair((32 + rng.Next() % 97) * 1024ULL,
+                          (32 + rng.Next() % 65) * 1024ULL * 1024);
+  });
+  AssertStandardGoals(s, 10, /*max_cv=*/0.40);
+}
+
+// Verify graduated files are never re-compacted.
+// With the tiered algorithm, intermediate compacted files CAN be merged
+// at higher tier boundaries (that's the whole point of tiering). But files
+// that have reached the target size ("graduated") should never be picked.
+TEST_F(FIFORatioBasedCompactionPickingTest, NoCascadingReCompaction) {
+  const uint64_t kCap = 10ULL * 1024 * 1024 * 1024;
+  TestState s;
+  RunFlushAndCompact(s, 200, /*trigger=*/10, kCap, [](int) {
+    return std::make_pair(64ULL * 1024, 64ULL * 1024 * 1024);
+  });
+
+  AssertGraduatedNotPicked(s.files, 10, kCap);
+  // Write amp should be bounded (k=2 tiers for this config, so wa <= 3+margin)
+  AssertLowWriteAmp(s.wa, 4.0);
+}
+
+// Early memtable flush -- very small flushes
+TEST_F(FIFORatioBasedCompactionPickingTest, EarlyMemtableFlush) {
+  const uint64_t kCap = 1ULL * 1024 * 1024 * 1024;
+  Random rng(123);
+  TestState s;
+  RunFlushAndCompact(s, 100, /*trigger=*/10, kCap, [&](int) {
+    uint64_t sst = (rng.Next() % 5 == 0) ? (64 + rng.Next() % 65) * 1024ULL
+                                         : (8 + rng.Next() % 25) * 1024ULL;
+    return std::make_pair(sst, 32ULL * 1024 * 1024);
+  });
+
+  AssertStandardGoals(s, 10, /*max_cv=*/0.50, /*max_wa=*/4.0,
+                      /*file_mult=*/5);
+}
+
+// Blob compression variation -- data per flush varies, shifting
+// the SST/blob ratio. The target is recomputed on every PickCompaction call
+// (no caching), so the picker naturally adapts to ratio changes.
+TEST_F(FIFORatioBasedCompactionPickingTest, BlobCompressionVariation) {
+  const uint64_t kCap = 300ULL * 1024 * 1024;
+  Random rng(456);
+  TestState s;
+  RunFlushAndCompact(s, 150, /*trigger=*/10, kCap, [&](int) {
+    return std::make_pair(64ULL * 1024,
+                          (20 + rng.Next() % 61) * 1024ULL * 1024);
+  });
+  AssertCompactedUniform(s.files, 0.30);
+}
+
+// Large target/flush ratio -- verify logarithmic write amp with tiering
+TEST_F(FIFORatioBasedCompactionPickingTest, TieredLargeRatio) {
+  // target/flush ~ 1000x with trigger=10 -> k=3 tiers, write amp ~ 4.
+  // Without tiering (flat merge), write amp would be ~57x.
+  const uint64_t kCap = 10ULL * 1024 * 1024 * 1024;  // 10GB
+  TestState s;
+  // SST = 1KB, blob = 1MB. sst_ratio ~ 0.001.
+  // target = 10GB * 0.001 / 10 = 1MB. ratio = 1MB/1KB = 1024.
+  // k = ceil(log_10(1024)) = 4. Tier boundaries: ~10KB, ~100KB, 1MB.
+  // (10KB floor means lowest boundary is 10KB, not 1KB)
+  RunFlushAndCompact(s, 500, /*trigger=*/10, kCap, [](int) {
+    return std::make_pair(1ULL * 1024, 1ULL * 1024 * 1024);
+  });
+
+  // Write amp should be logarithmic: k+1 = 4 (with 10KB floor, 3 tiers).
+  // Allow some margin for ramp-up and boundary effects.
+  AssertLowWriteAmp(s.wa, 6.0);
+
+  // File count should be bounded: trigger * (k+1) ~ 10 * 4 = 40
+  AssertFileCountBounded(s.files, 10, /*multiplier=*/6);
+}
+
+// Tiered progression -- verify intermediate tiers form and merge up
+TEST_F(FIFORatioBasedCompactionPickingTest, TieredProgression) {
+  // SST = 10KB, blob = 1MB, cap = 100MB, trigger=4.
+  // sst_ratio ~ 10KB/1010KB ~ 0.0099.
+  // target = 100MB * 0.0099 / 4 ~ 248KB. ratio ~ 25.
+  // k = ceil(log_4(25)) = ceil(2.32) = 3. Boundaries: ~16KB, ~62KB, ~248KB.
+  const uint64_t kCap = 100ULL * 1024 * 1024;
+  TestState s;
+  RunFlushAndCompact(s, 200, /*trigger=*/4, kCap, [](int) {
+    return std::make_pair(10ULL * 1024, 1ULL * 1024 * 1024);
+  });
+
+  // Should have compacted files at multiple tier sizes
+  auto stats = ComputeStats(s.files, true);
+  ASSERT_GE(stats.count, 1u) << "Should have at least one compacted file";
+
+  // Write amp should be bounded: k+1 = 4, plus margin
+  AssertLowWriteAmp(s.wa, 5.0);
+}
+
+// Graduated files should never be re-compacted
+TEST_F(FIFORatioBasedCompactionPickingTest, GraduatedFilesNotRecompacted) {
+  // Build a state with graduated files (>= target), then verify they are
+  // never selected for compaction.
+  const uint64_t kCap = 500ULL * 1024 * 1024;  // 500MB
+  TestState s;
+  // SST = 64KB, blob = 50MB. sst_ratio ~ 0.00125.
+  // target = 500MB * 0.00125 / 4 ~ 156KB.
+  // k = ceil(log_4(156/64)) = ceil(log_4(2.44)) = 1.
+  RunFlushAndCompact(s, 60, /*trigger=*/4, kCap, [](int) {
+    return std::make_pair(64ULL * 1024, 50ULL * 1024 * 1024);
+  });
+
+  AssertGraduatedNotPicked(s.files, 4, kCap);
+}
+
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/db/db_compaction_test.cc b/db/db_compaction_test.cc
index 749acda11c8b..58887c2777f7 100644
--- a/db/db_compaction_test.cc
+++ b/db/db_compaction_test.cc
@@ -7840,7 +7840,7 @@ class DBCompactionTestL0FilesMisorderCorruption : public DBCompactionTest {
       options_.level0_file_num_compaction_trigger = 3;
 
       CompactionOptionsFIFO fifo_options;
-      if (compaction_path_to_test == "FindIntraL0Compaction" ||
+      if (compaction_path_to_test == "PickCostBasedIntraL0Compaction" ||
           compaction_path_to_test == "CompactRange") {
         fifo_options.allow_compaction = true;
       } else if (compaction_path_to_test == "CompactFile") {
@@ -7940,7 +7940,7 @@ class DBCompactionTestL0FilesMisorderCorruption : public DBCompactionTest {
 
   void SetupSyncPoints(const std::string& compaction_path_to_test) {
     compaction_path_sync_point_called_.store(false);
-    if (compaction_path_to_test == "FindIntraL0Compaction" &&
+    if (compaction_path_to_test == "PickCostBasedIntraL0Compaction" &&
         options_.compaction_style == CompactionStyle::kCompactionStyleLevel) {
       ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
           "PostPickFileToCompact", [&](void* arg) {
@@ -7950,7 +7950,7 @@ class DBCompactionTestL0FilesMisorderCorruption : public DBCompactionTest {
             *picked_file_to_compact = false;
           });
       ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
-          "FindIntraL0Compaction", [&](void* /*arg*/) {
+          "PickCostBasedIntraL0Compaction", [&](void* /*arg*/) {
             compaction_path_sync_point_called_.store(true);
           });
 
@@ -7986,12 +7986,12 @@ class DBCompactionTestL0FilesMisorderCorruption : public DBCompactionTest {
           "PickDeleteTriggeredCompactionReturnNonnullptr", [&](void* /*arg*/) {
             compaction_path_sync_point_called_.store(true);
           });
-    } else if ((compaction_path_to_test == "FindIntraL0Compaction" ||
+    } else if ((compaction_path_to_test == "PickCostBasedIntraL0Compaction" ||
                 compaction_path_to_test == "CompactRange") &&
                options_.compaction_style ==
                    CompactionStyle::kCompactionStyleFIFO) {
       ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
-          "FindIntraL0Compaction", [&](void* /*arg*/) {
+          "PickCostBasedIntraL0Compaction", [&](void* /*arg*/) {
             compaction_path_sync_point_called_.store(true);
           });
     }
@@ -8151,7 +8151,7 @@ TEST_F(DBCompactionTestL0FilesMisorderCorruption,
     IngestOneKeyValue(dbfull(), Key(i), "new", options_);
   }
 
-  SetupSyncPoints("FindIntraL0Compaction");
+  SetupSyncPoints("PickCostBasedIntraL0Compaction");
   ResumeCompactionThread();
 
   ASSERT_OK(dbfull()->TEST_WaitForCompact());
@@ -8284,7 +8284,8 @@ TEST_F(DBCompactionTestL0FilesMisorderCorruption,
 
 TEST_F(DBCompactionTestL0FilesMisorderCorruption,
        FlushAfterIntraL0FIFOCompactionWithIngestedFile) {
-  for (const std::string compaction_path_to_test : {"FindIntraL0Compaction"}) {
+  for (const std::string compaction_path_to_test :
+       {"PickCostBasedIntraL0Compaction"}) {
     SetupOptions(CompactionStyle::kCompactionStyleFIFO,
                  compaction_path_to_test);
     DestroyAndReopen(options_);
diff --git a/db/version_set.cc b/db/version_set.cc
index f3f1ee727cf8..bd36cc577475 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -3792,15 +3792,20 @@ void VersionStorageInfo::ComputeCompactionScore(
       }
 
       if (compaction_style_ == kCompactionStyleFIFO) {
-        auto max_table_files_size =
-            mutable_cf_options.compaction_options_fifo.max_table_files_size;
-        if (max_table_files_size == 0) {
+        const auto& fifo_opts = mutable_cf_options.compaction_options_fifo;
+        uint64_t effective_size = total_size;
+        uint64_t effective_max = fifo_opts.max_table_files_size;
+        if (fifo_opts.max_data_files_size > 0) {
+          // Blob-aware: include blob file sizes in the total
+          effective_size += GetBlobStats().total_file_size;
+          effective_max = fifo_opts.max_data_files_size;
+        }
+        if (effective_max == 0) {
           // avoid divide 0
-          max_table_files_size = 1;
+          effective_max = 1;
         }
-        score = static_cast<double>(total_size) / max_table_files_size;
-        if (score < 1 &&
-            mutable_cf_options.compaction_options_fifo.allow_compaction) {
+        score = static_cast<double>(effective_size) / effective_max;
+        if (score < 1 && fifo_opts.allow_compaction) {
           score = std::max(
               static_cast<double>(num_sorted_runs) /
                   mutable_cf_options.level0_file_num_compaction_trigger,
diff --git a/db_stress_tool/db_stress_common.h b/db_stress_tool/db_stress_common.h
index 666751d95a55..fff3720f150d 100644
--- a/db_stress_tool/db_stress_common.h
+++ b/db_stress_tool/db_stress_common.h
@@ -161,6 +161,8 @@ DECLARE_uint64(periodic_compaction_seconds);
 DECLARE_string(daily_offpeak_time_utc);
 DECLARE_uint64(compaction_ttl);
 DECLARE_bool(fifo_allow_compaction);
+DECLARE_uint64(fifo_compaction_max_data_files_size_mb);
+DECLARE_bool(fifo_compaction_use_kv_ratio_compaction);
 DECLARE_bool(allow_concurrent_memtable_write);
 DECLARE_double(experimental_mempurge_threshold);
 DECLARE_bool(enable_write_thread_adaptive_yield);
diff --git a/db_stress_tool/db_stress_gflags.cc b/db_stress_tool/db_stress_gflags.cc
index 27918aa0230f..19b4c602e7c3 100644
--- a/db_stress_tool/db_stress_gflags.cc
+++ b/db_stress_tool/db_stress_gflags.cc
@@ -415,6 +415,17 @@ DEFINE_bool(fifo_allow_compaction, false,
             "If true, set `Options::compaction_options_fifo.allow_compaction = "
             "true`. It only take effect when FIFO compaction is used.");
 
+DEFINE_uint64(fifo_compaction_max_data_files_size_mb, 0,
+              "If non-zero, set "
+              "`Options::compaction_options_fifo.max_data_files_size` to this "
+              "value (in MB). Only takes effect with FIFO compaction.");
+
+DEFINE_bool(fifo_compaction_use_kv_ratio_compaction, false,
+            "If true, set "
+            "`Options::compaction_options_fifo.use_kv_ratio_compaction = "
+            "true`. Requires fifo_allow_compaction and "
+            "fifo_compaction_max_data_files_size_mb > 0.");
+
 DEFINE_bool(allow_concurrent_memtable_write, false,
             "Allow multi-writers to update mem tables in parallel.");
 
diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc
index 2883d990be3c..7428b2eff03a 100644
--- a/db_stress_tool/db_stress_test_base.cc
+++ b/db_stress_tool/db_stress_test_base.cc
@@ -4378,6 +4378,12 @@ void InitializeOptionsFromFlags(
       ROCKSDB_NAMESPACE::CompactionStyle::kCompactionStyleFIFO) {
     options.compaction_options_fifo.allow_compaction =
         FLAGS_fifo_allow_compaction;
+    if (FLAGS_fifo_compaction_max_data_files_size_mb > 0) {
+      options.compaction_options_fifo.max_data_files_size =
+          FLAGS_fifo_compaction_max_data_files_size_mb * 1024 * 1024;
+    }
+    options.compaction_options_fifo.use_kv_ratio_compaction =
+        FLAGS_fifo_compaction_use_kv_ratio_compaction;
   }
   options.compaction_pri =
       static_cast<ROCKSDB_NAMESPACE::CompactionPri>(FLAGS_compaction_pri);
diff --git a/include/rocksdb/advanced_options.h b/include/rocksdb/advanced_options.h
index ffa5d5a2659c..898d07a6021d 100644
--- a/include/rocksdb/advanced_options.h
+++ b/include/rocksdb/advanced_options.h
@@ -128,6 +128,50 @@ struct CompactionOptionsFIFO {
   // not be used. The minmum buffer size must be at least 4KiB
   uint64_t trivial_copy_buffer_size = 4096;
 
+  // When non-zero, FIFO compaction uses the combined size of SST files and
+  // blob files for size-based trimming decisions. When the total data size
+  // (SST + blob) exceeds this limit, the oldest SST files are dropped along
+  // with their associated blob files.
+  //
+  // When non-zero, this takes precedence over max_table_files_size for all
+  // FIFO compaction decisions: size-based dropping, TTL threshold checks,
+  // and compaction score computation. max_table_files_size is ignored.
+  //
+  // When zero (default), FIFO compaction uses max_table_files_size which
+  // only considers SST file sizes, maintaining backward compatibility.
+  //
+  // This option is primarily intended for use with integrated BlobDB where
+  // blob files can represent a significant portion of the total data.
+  //
+  // Dynamically changeable through SetOptions() API.
+  // Default: 0 (use max_table_files_size behavior)
+  uint64_t max_data_files_size = 0;
+
+  // When true, enables a capacity-derived intra-L0 compaction strategy
+  // optimized for BlobDB workloads where SST files are much smaller than
+  // write_buffer_size. Uses the observed key/value size ratio (SST vs blob
+  // file sizes) to compute a target compacted file size, producing uniform
+  // files for predictable FIFO trimming.
+  //
+  // Uses level0_file_num_compaction_trigger as the target max L0 file count.
+  //
+  // When max_compaction_bytes is 0, the target is auto-calculated from the
+  // data capacity and observed SST/blob ratio. When max_compaction_bytes is
+  // explicitly set to a non-zero value, it overrides the auto-calculated
+  // target.
+  //
+  // Requires:
+  //   - allow_compaction = true (master switch for intra-L0 compaction)
+  //   - max_data_files_size > 0 (needed to compute the target file size)
+  // Setting this to true without these will fail option validation.
+  //
+  // When false, the old intra-L0 strategy is used if allow_compaction is
+  // true (PickCostBasedIntraL0Compaction with 1.1 * write_buffer_size guard).
+  //
+  // Dynamically changeable through SetOptions() API.
+  // Default: false
+  bool use_kv_ratio_compaction = false;
+
   CompactionOptionsFIFO() : max_table_files_size(1 * 1024 * 1024 * 1024) {}
   CompactionOptionsFIFO(uint64_t _max_table_files_size, bool _allow_compaction)
       : max_table_files_size(_max_table_files_size),
@@ -643,6 +687,15 @@ struct AdvancedColumnFamilyOptions {
   //
   // Default: target_file_size_base * 25
   //
+  // For FIFO compaction with use_kv_ratio_compaction=true:
+  // When set to 0 (and compaction_style is FIFO), the value is NOT sanitized
+  // to the default. Instead, the target compacted file size is automatically
+  // calculated from the data capacity (max_data_files_size) and observed
+  // SST/blob ratio. When explicitly set to a non-zero value, it overrides
+  // the auto-calculated target and is used directly as the max compaction
+  // input size. Note: for FIFO, this controls the output file size target,
+  // not a general compaction byte limit as in level/universal compaction.
+  //
   // Dynamically changeable through SetOptions() API
   uint64_t max_compaction_bytes = 0;
 
diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h
index d12b68d9dbea..ab8efecabb78 100644
--- a/include/rocksdb/c.h
+++ b/include/rocksdb/c.h
@@ -2804,6 +2804,19 @@ rocksdb_fifo_compaction_options_set_max_table_files_size(
 extern ROCKSDB_LIBRARY_API uint64_t
 rocksdb_fifo_compaction_options_get_max_table_files_size(
     rocksdb_fifo_compaction_options_t* fifo_opts);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_fifo_compaction_options_set_max_data_files_size(
+    rocksdb_fifo_compaction_options_t* fifo_opts, uint64_t size);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_fifo_compaction_options_get_max_data_files_size(
+    rocksdb_fifo_compaction_options_t* fifo_opts);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_fifo_compaction_options_set_use_kv_ratio_compaction(
+    rocksdb_fifo_compaction_options_t* fifo_opts,
+    unsigned char use_kv_ratio_compaction);
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_fifo_compaction_options_get_use_kv_ratio_compaction(
+    rocksdb_fifo_compaction_options_t* fifo_opts);
 extern ROCKSDB_LIBRARY_API void rocksdb_fifo_compaction_options_destroy(
     rocksdb_fifo_compaction_options_t* fifo_opts);
 
diff --git a/java/rocksjni/compaction_options_fifo.cc b/java/rocksjni/compaction_options_fifo.cc
index 535562fb47f7..f23eee6c3d2a 100644
--- a/java/rocksjni/compaction_options_fifo.cc
+++ b/java/rocksjni/compaction_options_fifo.cc
@@ -71,6 +71,54 @@ jboolean Java_org_rocksdb_CompactionOptionsFIFO_allowCompaction(JNIEnv*, jclass,
   return static_cast<jboolean>(opt->allow_compaction);
 }
 
+/*
+ * Class:     org_rocksdb_CompactionOptionsFIFO
+ * Method:    setMaxDataFilesSize
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_CompactionOptionsFIFO_setMaxDataFilesSize(
+    JNIEnv*, jclass, jlong jhandle, jlong jmax_data_files_size) {
+  auto* opt =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionOptionsFIFO*>(jhandle);
+  opt->max_data_files_size = static_cast<uint64_t>(jmax_data_files_size);
+}
+
+/*
+ * Class:     org_rocksdb_CompactionOptionsFIFO
+ * Method:    maxDataFilesSize
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_CompactionOptionsFIFO_maxDataFilesSize(JNIEnv*, jclass,
+                                                              jlong jhandle) {
+  auto* opt =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionOptionsFIFO*>(jhandle);
+  return static_cast<jlong>(opt->max_data_files_size);
+}
+
+/*
+ * Class:     org_rocksdb_CompactionOptionsFIFO
+ * Method:    setUseKvRatioCompaction
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_CompactionOptionsFIFO_setUseKvRatioCompaction(
+    JNIEnv*, jclass, jlong jhandle, jboolean use_kv_ratio_compaction) {
+  auto* opt =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionOptionsFIFO*>(jhandle);
+  opt->use_kv_ratio_compaction = static_cast<bool>(use_kv_ratio_compaction);
+}
+
+/*
+ * Class:     org_rocksdb_CompactionOptionsFIFO
+ * Method:    useKvRatioCompaction
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_CompactionOptionsFIFO_useKvRatioCompaction(
+    JNIEnv*, jclass, jlong jhandle) {
+  auto* opt =
+      reinterpret_cast<ROCKSDB_NAMESPACE::CompactionOptionsFIFO*>(jhandle);
+  return static_cast<jboolean>(opt->use_kv_ratio_compaction);
+}
+
 /*
  * Class:     org_rocksdb_CompactionOptionsFIFO
  * Method:    disposeInternal
diff --git a/java/src/main/java/org/rocksdb/CompactionOptionsFIFO.java b/java/src/main/java/org/rocksdb/CompactionOptionsFIFO.java
index 24ebe0da2ff1..3d94e7eb0215 100644
--- a/java/src/main/java/org/rocksdb/CompactionOptionsFIFO.java
+++ b/java/src/main/java/org/rocksdb/CompactionOptionsFIFO.java
@@ -75,6 +75,51 @@ public boolean allowCompaction() {
     return allowCompaction(nativeHandle_);
   }
 
+  /**
+   * Combined SST + blob file size limit for FIFO compaction trimming.
+   * When non-zero, FIFO uses total_sst + total_blob for size-based dropping.
+   * When zero (default), uses max_table_files_size (SST-only).
+   *
+   * @param maxDataFilesSize the combined size limit in bytes
+   *
+   * @return the reference to the current options.
+   */
+  public CompactionOptionsFIFO setMaxDataFilesSize(final long maxDataFilesSize) {
+    setMaxDataFilesSize(nativeHandle_, maxDataFilesSize);
+    return this;
+  }
+
+  /**
+   * Get the combined SST + blob file size limit.
+   *
+   * @return max data files size in bytes, 0 means disabled
+   */
+  public long maxDataFilesSize() {
+    return maxDataFilesSize(nativeHandle_);
+  }
+
+  /**
+   * Enable capacity-derived intra-L0 compaction using the observed key/value
+   * size ratio. Requires maxDataFilesSize &gt; 0.
+   *
+   * @param useKvRatioCompaction true to enable
+   *
+   * @return the reference to the current options.
+   */
+  public CompactionOptionsFIFO setUseKvRatioCompaction(final boolean useKvRatioCompaction) {
+    setUseKvRatioCompaction(nativeHandle_, useKvRatioCompaction);
+    return this;
+  }
+
+  /**
+   * Check if capacity-derived intra-L0 compaction is enabled.
+   *
+   * @return true if enabled
+   */
+  public boolean useKvRatioCompaction() {
+    return useKvRatioCompaction(nativeHandle_);
+  }
+
   private static native long newCompactionOptionsFIFO();
   @Override
   protected final void disposeInternal(final long handle) {
@@ -86,4 +131,9 @@ protected final void disposeInternal(final long handle) {
   private static native long maxTableFilesSize(final long handle);
   private static native void setAllowCompaction(final long handle, final boolean allowCompaction);
   private static native boolean allowCompaction(final long handle);
+  private static native void setMaxDataFilesSize(final long handle, final long maxDataFilesSize);
+  private static native long maxDataFilesSize(final long handle);
+  private static native void setUseKvRatioCompaction(
+      final long handle, final boolean useKvRatioCompaction);
+  private static native boolean useKvRatioCompaction(final long handle);
 }
diff --git a/options/cf_options.cc b/options/cf_options.cc
index 98c010406b43..2ba56e0f36d8 100644
--- a/options/cf_options.cc
+++ b/options/cf_options.cc
@@ -311,6 +311,14 @@ static std::unordered_map<std::string, OptionTypeInfo>
         {"trivial_copy_buffer_size",
          {offsetof(struct CompactionOptionsFIFO, trivial_copy_buffer_size),
           OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"max_data_files_size",
+         {offsetof(struct CompactionOptionsFIFO, max_data_files_size),
+          OptionType::kUInt64T, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
+        {"use_kv_ratio_compaction",
+         {offsetof(struct CompactionOptionsFIFO, use_kv_ratio_compaction),
+          OptionType::kBoolean, OptionVerificationType::kNormal,
           OptionTypeFlags::kMutable}}};
 
 static std::unordered_map<std::string, OptionTypeInfo>
@@ -1260,6 +1268,10 @@ void MutableCFOptions::Dump(Logger* log) const {
                  compaction_options_fifo.max_table_files_size);
   ROCKS_LOG_INFO(log, "compaction_options_fifo.allow_compaction : %d",
                  compaction_options_fifo.allow_compaction);
+  ROCKS_LOG_INFO(log, "compaction_options_fifo.max_data_files_size : %" PRIu64,
+                 compaction_options_fifo.max_data_files_size);
+  ROCKS_LOG_INFO(log, "compaction_options_fifo.use_kv_ratio_compaction : %d",
+                 compaction_options_fifo.use_kv_ratio_compaction);
 
   // Blob file related options
   ROCKS_LOG_INFO(log, "                        enable_blob_files: %s",
diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc
index 4bafb6fcca9a..bbc4db46a68a 100644
--- a/options/options_settable_test.cc
+++ b/options/options_settable_test.cc
@@ -678,7 +678,8 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) {
       "preserve_internal_time_seconds=86400;"
       "compaction_options_fifo={max_table_files_size=3;allow_"
       "compaction=true;age_for_warm=0;file_temperature_age_thresholds={{"
-      "temperature=kCold;age=12345}};};"
+      "temperature=kCold;age=12345}};max_data_files_size=1073741824;"
+      "use_kv_ratio_compaction=false;};"
       "blob_cache=1M;"
       "memtable_protection_bytes_per_key=2;"
       "persist_user_defined_timestamps=true;"
diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc
index a7d850b825ba..eb96ef83eff6 100644
--- a/tools/db_bench_tool.cc
+++ b/tools/db_bench_tool.cc
@@ -1034,6 +1034,14 @@ DEFINE_uint64(fifo_compaction_ttl, 0, "TTL for the SST Files in seconds.");
 
 DEFINE_uint64(fifo_age_for_warm, 0, "age_for_warm for FIFO compaction.");
 
+DEFINE_uint64(fifo_compaction_max_data_files_size_mb, 0,
+              "Combined SST + blob file size limit for FIFO compaction "
+              "trimming. 0 means use max_table_files_size (SST-only).");
+
+DEFINE_bool(fifo_compaction_use_kv_ratio_compaction, false,
+            "Enable capacity-derived intra-L0 compaction for FIFO with "
+            "BlobDB. Requires fifo_compaction_max_data_files_size_mb > 0.");
+
 // Stacked BlobDB Options
 DEFINE_bool(use_blob_db, false, "[Stacked BlobDB] Open a BlobDB instance.");
 
@@ -4425,6 +4433,10 @@ class Benchmark {
         FLAGS_fifo_compaction_max_table_files_size_mb * 1024 * 1024,
         FLAGS_fifo_compaction_allow_compaction);
     options.compaction_options_fifo.age_for_warm = FLAGS_fifo_age_for_warm;
+    options.compaction_options_fifo.max_data_files_size =
+        FLAGS_fifo_compaction_max_data_files_size_mb * 1024 * 1024;
+    options.compaction_options_fifo.use_kv_ratio_compaction =
+        FLAGS_fifo_compaction_use_kv_ratio_compaction;
     options.prefix_extractor = prefix_extractor_;
     if (FLAGS_use_uint64_comparator) {
       options.comparator = test::Uint64Comparator();
diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index b5c422329602..55cc42d0046c 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -265,6 +265,10 @@ def apply_random_seed_per_iteration():
     "stats_dump_period_sec": lambda: random.choice([0, 10, 600]),
     "compaction_ttl": lambda: random.choice([0, 0, 1, 2, 10, 100, 1000]),
     "fifo_allow_compaction": lambda: random.randint(0, 1),
+    "fifo_compaction_max_data_files_size_mb": lambda: random.choice(
+        [0, 100, 500]
+    ),
+    "fifo_compaction_use_kv_ratio_compaction": lambda: random.randint(0, 1),
     # Test small max_manifest_file_size in a smaller chance, as most of the
     # time we wnat manifest history to be preserved to help debug
     "max_manifest_file_size": lambda: random.choice(
@@ -969,9 +973,21 @@ def finalize_and_sanitize(src_params):
         # Disable irrelevant tiering options
         dest_params["preclude_last_level_data_seconds"] = 0
         dest_params["last_level_temperature"] = "kUnknown"
+        # use_kv_ratio_compaction requires allow_compaction and
+        # max_data_files_size > 0
+        if dest_params.get("fifo_compaction_use_kv_ratio_compaction", 0) == 1:
+            if (
+                dest_params.get("fifo_allow_compaction", 0) != 1
+                or dest_params.get("fifo_compaction_max_data_files_size_mb", 0)
+                == 0
+            ):
+                dest_params["fifo_compaction_use_kv_ratio_compaction"] = 0
     else:
         # Disable irrelevant tiering options
         dest_params["file_temperature_age_thresholds"] = ""
+        # Disable FIFO-specific options for non-FIFO compaction styles
+        dest_params["fifo_compaction_max_data_files_size_mb"] = 0
+        dest_params["fifo_compaction_use_kv_ratio_compaction"] = 0
     if dest_params["partition_filters"] == 1:
         if dest_params["index_type"] != 2:
             dest_params["partition_filters"] = 0
diff --git a/unreleased_history/new_features/fifo_kv_ratio_compaction.md b/unreleased_history/new_features/fifo_kv_ratio_compaction.md
new file mode 100644
index 000000000000..3cf01dda0c78
--- /dev/null
+++ b/unreleased_history/new_features/fifo_kv_ratio_compaction.md
@@ -0,0 +1 @@
+Added `CompactionOptionsFIFO::max_data_files_size` to support FIFO compaction trimming based on combined SST and blob file sizes. Added `CompactionOptionsFIFO::use_kv_ratio_compaction` to enable a capacity-derived intra-L0 compaction strategy optimized for BlobDB workloads, producing uniform-sized compacted files for predictable FIFO trimming.
diff --git a/wiki/fifo_compaction.md b/wiki/fifo_compaction.md
new file mode 100644
index 000000000000..52c25fb920a5
--- /dev/null
+++ b/wiki/fifo_compaction.md
@@ -0,0 +1,672 @@
+# FIFO Compaction Strategy
+
+This document describes the FIFO compaction style in RocksDB, covering the
+file dropping strategies and both the old and new intra-L0 compaction
+picking strategies.
+
+## Overview
+
+FIFO compaction is designed for time-series and log-like workloads where data
+has a natural expiration. All data lives at L0. When total data exceeds a
+configured size limit, the oldest SST files are dropped — no merge, no rewrite,
+just deletion. This gives near-zero write amplification for the compaction layer.
+
+```
+L0 (all data lives here):
+  Newest                                                           Oldest
+    |                                                                |
+    v                                                                v
+  [SST_N] [SST_N-1] ... [SST_3] [SST_2] [SST_1]
+    ^                                       ^
+    |                                       |
+  new flushes added here          oldest files dropped here
+                                  (when over size limit)
+```
+
+Without intra-L0 compaction, every memtable flush creates a new small SST file.
+Over time, the number of L0 files grows, increasing read amplification (each
+point lookup must check every L0 file). Intra-L0 compaction addresses this by
+merging small files into fewer larger files.
+
+## Compaction Picking Priority Chain
+
+When compaction is triggered (score >= 1.0), the picker tries these strategies
+in order, returning the first non-null result:
+
+```
+PickCompaction():
+    |
+    |-- 1. PickTTLCompaction()               [File Dropping]
+    |       Drop files older than TTL.
+    |
+    |-- 2. PickSizeCompaction()              [File Dropping]
+    |       Drop oldest files when over size limit.
+    |
+    |-- 3. PickIntraL0Compaction()           [Intra-L0]
+    |       Dispatcher: merges small L0 files to reduce file count.
+    |       Requires allow_compaction=true. Dispatches to:
+    |         - PickRatioBasedIntraL0Compaction (use_kv_ratio_compaction=true)
+    |         - PickCostBasedIntraL0Compaction  (use_kv_ratio_compaction=false)
+    |
+    |-- 4. PickTemperatureChangeCompaction() [Temperature Migration]
+            Rewrite one file to change its temperature tier.
+            Lowest priority — runs only if nothing else needs to be done.
+```
+
+Steps 1 and 2 are **file dropping** — they delete old files to enforce size
+or TTL limits. Step 3 is **intra-L0 compaction** — it merges small files into
+fewer larger ones. `PickIntraL0Compaction` is the dispatcher that selects
+between the two strategies based on `use_kv_ratio_compaction`.
+
+Step 4 is **temperature migration** — it rewrites a single file to change its
+storage temperature (e.g., moving cold data to cheaper storage). It picks one
+file at a time, checking if the file's age exceeds a configured threshold but
+its current temperature doesn't match the target. It runs last because it's
+the lowest priority: disk space management (dropping) and read amplification
+(intra-L0) are more important than storage tiering. Since FIFO only allows
+one compaction at a time, running temperature change last ensures it never
+blocks more critical operations.
+
+Note: Intra-L0 compaction runs after size-based dropping. If `PickSizeCompaction`
+dropped files (returned non-null), `PickIntraL0Compaction` is skipped. This
+means intra-L0 only runs when the DB is under the size limit or when
+size-based compaction is already in progress.
+
+## Score Computation
+
+The compaction score determines when compaction should be triggered. For FIFO:
+
+```
+score = effective_total_size / effective_max_size
+```
+
+Where:
+- `effective_total_size` = total SST size (or SST + blob when
+  `max_data_files_size > 0`)
+- `effective_max_size` = `max_table_files_size` (or `max_data_files_size`
+  when set)
+
+Additional score contributions:
+- When `allow_compaction` is true (enables intra-L0 compaction):
+  `score = max(score, num_sorted_runs / level0_file_num_compaction_trigger)`
+- When `ttl > 0`: score is boosted by expired file count
+- When temperature thresholds are set: score is boosted if files need
+  temperature change
+
+---
+
+# Part 1: File Dropping Strategies
+
+These strategies delete old files to enforce data size or TTL limits.
+No data is rewritten — files are simply removed.
+
+## TTL-Based Dropping (`PickTTLCompaction`)
+
+**When**: `ttl > 0`
+
+Drops L0 files whose data is older than the TTL threshold. Iterates from
+oldest to newest, checking `newest_key_time` or `creation_time` from table
+properties against `current_time - ttl`.
+
+```
+Before TTL compaction (ttl = 3600s, files older than 1 hour):
+
+  L0: [F6:10m] [F5:20m] [F4:40m] [F3:50m] [F2:70m] [F1:80m]
+                                              ^^^^     ^^^^
+                                            older than TTL --> DROP
+
+After:
+  L0: [F6:10m] [F5:20m] [F4:40m] [F3:50m]
+```
+
+Returns `nullptr` if deleting expired files would still leave the total size
+above the size limit — in that case, size-based dropping handles it instead.
+
+**Config**: `MutableCFOptions::ttl` (in seconds)
+
+## Size-Based Dropping (`PickSizeCompaction`)
+
+**When**: Total size exceeds the configured limit.
+
+### SST-Only Mode (default)
+
+Compares sum of SST file sizes against `max_table_files_size`:
+
+```
+Before (total 1.2GB > max_table_files_size 1GB):
+
+  L0: [F8:200MB] [F7:200MB] [F6:200MB] [F5:200MB] [F4:200MB] [F3:200MB]
+                                                       total = 1.2GB
+
+  Drop oldest files until under limit:
+  Drop F3 (200MB) --> remaining = 1.0GB <= 1GB limit --> STOP
+
+After:
+  L0: [F8:200MB] [F7:200MB] [F6:200MB] [F5:200MB] [F4:200MB]
+                                                       total = 1.0GB
+```
+
+### Blob-Aware Mode (`max_data_files_size > 0`)
+
+When BlobDB is enabled, SST files are small (keys + blob references) and blob
+files hold the actual values. The total disk usage is dominated by blob files,
+so `max_table_files_size` (SST-only) cannot control total disk usage.
+
+`max_data_files_size` accounts for both SST and blob files:
+
+```
+effective_size = total_sst + total_blob
+
+Example: total_sst = 10MB, total_blob = 9.99GB
+  max_table_files_size = 1GB  --> sees 10MB, no dropping (WRONG!)
+  max_data_files_size = 10GB  --> sees 10GB, drops when exceeded (CORRECT)
+```
+
+When dropping files, proportional estimation is used to account for blob
+data freed per SST file:
+
+```
+data_per_file = effective_size / num_files
+```
+
+Blob files are automatically cleaned up when their linked SSTs are deleted
+(via `BlobFileMetaData::GetLinkedSsts()` reference counting).
+
+**Config**:
+- `CompactionOptionsFIFO::max_table_files_size` (default: 1GB)
+- `CompactionOptionsFIFO::max_data_files_size` (default: 0, disabled)
+
+## Temperature Migration (`PickTemperatureChangeCompaction`)
+
+**When**: `file_temperature_age_thresholds` is non-empty
+
+This is NOT file dropping — it **rewrites** a single SST file to assign it a
+new storage temperature (e.g., kWarm, kCold). This allows tiered storage
+systems to move aging data to cheaper/slower media. The file content is
+unchanged; only the temperature metadata is updated.
+
+Picks one file at a time, scanning from oldest to newest. For each file,
+checks if its age exceeds a configured threshold AND its current temperature
+doesn't match the target. Only one file is migrated per compaction to minimize
+impact on other operations. Only works with single-level FIFO
+(`num_levels == 1`).
+
+This runs as the **lowest priority** in the picking chain (step 4) because
+storage tiering is less urgent than disk space management (dropping) or read
+amplification (intra-L0 compaction). Since FIFO allows only one compaction at
+a time, this ensures temperature migration never blocks critical operations.
+
+```
+Config: file_temperature_age_thresholds = [{kWarm, 3600}, {kCold, 86400}]
+
+  [F6:5m,kUnk] [F5:30m,kUnk] [F4:2h,kUnk] [F3:5h,kUnk] [F2:2d,kUnk]
+                                                            ^^^^^^^^
+                                                            age > 86400s
+                                                            --> compact to kCold
+
+After:
+  [F6:5m,kUnk] [F5:30m,kUnk] [F4:2h,kUnk] [F3:5h,kUnk] [F2:2d,kCold]
+```
+
+**Config**: `CompactionOptionsFIFO::file_temperature_age_thresholds`
+
+---
+
+# Part 2: Intra-L0 Compaction
+
+Intra-L0 compaction merges multiple small L0 files into fewer larger files
+to reduce file count and read amplification. Unlike file dropping, this
+rewrites data — but only SST data (blob files are never rewritten).
+
+`allow_compaction = true` is the **master switch** for intra-L0 compaction.
+When enabled, `use_kv_ratio_compaction` selects which picking strategy to use:
+
+```
+  allow_compaction = true          (master switch for intra-L0)
+          |
+          +-- use_kv_ratio_compaction = false   (default)
+          |     Old Strategy: PickCostBasedIntraL0Compaction
+          |     Guard: 1.1 * write_buffer_size
+          |     Works when SST ~= write_buffer_size (non-BlobDB)
+          |
+          +-- use_kv_ratio_compaction = true
+                New Strategy: PickRatioBasedIntraL0Compaction
+                Guard: capacity-derived target from SST/blob ratio
+                Works when SST << write_buffer_size (BlobDB)
+                Requires: max_data_files_size > 0
+```
+
+## Old Strategy: `PickCostBasedIntraL0Compaction`
+
+**When**: `allow_compaction = true` AND `use_kv_ratio_compaction = false`.
+Called from `PickIntraL0Compaction` (which only runs when `PickSizeCompaction`
+returned nullptr, meaning the DB is under the size limit).
+
+This is the original intra-L0 compaction, implemented in
+`PickCostBasedIntraL0Compaction()`. It uses a greedy algorithm to pick files,
+with a `write_buffer_size`-based guard to prevent re-compacting large files.
+
+### Algorithm
+
+```
+1. Start from the newest L0 file (index 0)
+2. Greedily add older files while compact_bytes_per_del_file decreases
+3. Stop when:
+   - A file is being_compacted
+   - compact_bytes_per_del_file starts increasing (diminishing returns)
+   - Total exceeds max_compaction_bytes
+4. Check: enough files (>= trigger) AND per_del < 1.1 * write_buffer_size
+5. Output: always a single file
+```
+
+### Understanding `compact_bytes_per_del_file`
+
+`compact_bytes_per_del_file` measures the **cost per file eliminated**. When
+we compact N files into 1 output, we eliminate (N-1) files but must read and
+rewrite all N files' data. The metric is:
+
+```
+compact_bytes_per_del_file = total_input_bytes / (num_files - 1)
+```
+
+The algorithm greedily adds files as long as this ratio keeps **decreasing**
+(meaning each additional file is "cheap" to include). When adding a file
+causes the ratio to **increase**, we stop — it signals diminishing returns.
+
+```
+Example: scanning files from newest (left) to oldest (right)
+
+  Files:    [F5:32KB] [F4:64KB] [F3:48KB] [F2:96KB] [F1:128KB]
+
+  Step 1: Start with F5 (32KB). compact_bytes = 32KB.
+  Step 2: Add F4.  compact_bytes = 96KB.  per_del = 96/1 = 96KB.
+  Step 3: Add F3.  compact_bytes = 144KB. per_del = 144/2 = 72KB. (72 < 96, improving)
+  Step 4: Add F2.  compact_bytes = 240KB. per_del = 240/3 = 80KB. (80 > 72, WORSE!)
+          --> STOP. Adding F2 makes the ratio increase.
+
+  Result: pick [F5, F4, F3] (3 files), per_del = 72KB.
+```
+
+The ratio increases when a file is significantly larger than the average of
+files already selected. This naturally prevents including already-compacted
+files (which are larger than flush files) — IF the size gap is significant.
+
+### Example (uniform flush files)
+
+```
+Before (4 flush files of 64KB each, trigger=4):
+
+  L0: [F4:64KB] [F3:64KB] [F2:64KB] [F1:64KB]
+       newest                          oldest
+
+  PickCostBasedIntraL0Compaction:
+    Add F4: compact_bytes = 64KB
+    Add F3: compact_bytes = 128KB, per_del = 128/1 = 128KB
+    Add F2: compact_bytes = 192KB, per_del = 192/2 = 96KB  (96 < 128, better)
+    Add F1: compact_bytes = 256KB, per_del = 256/3 = 85KB  (85 < 96, better)
+    No more files. Check: 4 >= trigger(4) and 85KB < 70MB. OK.
+
+After:
+  L0: [C1:256KB]    (single compacted output)
+```
+
+### Example (flush + compacted, ratio detects size gap)
+
+```
+  L0: [F8:64KB] [F7:64KB] [F6:64KB] [F5:64KB] [C1:256KB]
+       newest                                     oldest (compacted)
+
+  PickCostBasedIntraL0Compaction:
+    Add F8: compact_bytes = 64KB
+    Add F7: compact_bytes = 128KB, per_del = 128/1 = 128KB
+    Add F6: compact_bytes = 192KB, per_del = 192/2 = 96KB  (improving)
+    Add F5: compact_bytes = 256KB, per_del = 256/3 = 85KB  (improving)
+    Add C1: compact_bytes = 512KB, per_del = 512/4 = 128KB (128 > 85, WORSE!)
+    --> STOP before C1.
+
+  Result: pick [F8, F7, F6, F5] — compacted file C1 is excluded.
+  This works because C1 (256KB) is 4x larger than flush files (64KB).
+```
+
+### Anti-Re-Compaction Guard
+
+The guard `compact_bytes_per_del_file < 1.1 * write_buffer_size` prevents
+picking files that are already near memtable size. The idea: compacted files
+should be ~write_buffer_size, so they'd push `per_del` above the guard.
+
+```
+Guard works when SST ~= write_buffer_size:
+
+  Files: [64MB, 64MB, 64MB, 64MB]   (SST ~= WBS = 64MB)
+  per_del = 256MB/3 = 85MB > 70MB   --> guard rejects --> no re-compaction
+```
+
+### Known Limitation with BlobDB
+
+With BlobDB, SST files are ~1000x smaller than `write_buffer_size`. The guard
+threshold (e.g., 70MB) is never reached by any L0 file. ALL files pass the
+guard, including previously compacted files:
+
+```
+Guard FAILS when SST << write_buffer_size (BlobDB):
+
+  write_buffer_size = 64MB, SST files ~64KB (1000x smaller)
+  Guard threshold: 1.1 * 64MB = 70.4MB
+
+  10 compacted files of 256KB each:
+    per_del = 2560KB/9 = 284KB << 70.4MB --> guard passes!
+    ALL 10 files re-compacted into 1 file of 2.56MB
+
+  Result: cascading re-compaction creates "monster files"
+
+  Round 1: [64KB, 64KB, 64KB, 64KB] --> compact --> [256KB]
+  Round 2: [64KB, 64KB, 64KB, 256KB] --> compact ALL --> [448KB]
+  Round 3: [64KB, 64KB, 64KB, 448KB] --> compact ALL --> [640KB]
+  ... files grow unboundedly
+```
+
+Use the KV-ratio strategy instead for BlobDB workloads.
+
+### Config
+
+- `CompactionOptionsFIFO::allow_compaction` (default: false)
+- Anti-re-compaction guard: `1.1 * write_buffer_size`
+- Min files: `level0_file_num_compaction_trigger`
+
+## New Strategy: `use_kv_ratio_compaction` (`PickRatioBasedIntraL0Compaction`)
+
+**When**: `allow_compaction = true` AND `use_kv_ratio_compaction = true`
+AND `max_data_files_size > 0`
+
+This strategy replaces the `write_buffer_size`-based guard with a
+**capacity-derived target** and uses **tiered size-based merging** to achieve
+logarithmic write amplification. It observes the actual SST/blob size ratio,
+computes a target graduated file size, and merges files incrementally through
+size tiers rather than directly to target.
+
+### Why a New Strategy?
+
+```
+Without BlobDB:  SST ~= write_buffer_size     --> old guard works
+With BlobDB:     SST ~= write_buffer_size/1000 --> old guard is useless
+```
+
+The new strategy derives the target from the **data capacity** and
+**observed key/value ratio**, not from `write_buffer_size`.
+
+### Algorithm
+
+**Step 1: Target Computation**
+
+The target graduated file size can be determined in two ways:
+
+```
+If max_compaction_bytes > 0 (explicitly set by user):
+  target = max_compaction_bytes      // user override
+
+If max_compaction_bytes == 0 (default, auto-calculate):
+  sst_ratio = total_l0_sst / (total_l0_sst + total_blob)
+  total_sst_at_cap = max_data_files_size * sst_ratio
+  target = total_sst_at_cap / level0_file_num_compaction_trigger
+```
+
+```
+Example (auto-calculated):
+  max_data_files_size = 10GB, sst_ratio = 0.001 (64KB SST / 64MB total)
+  total_sst_at_cap = 10GB * 0.001 = 10MB
+  trigger = 10
+  target = 10MB / 10 = 1MB
+```
+
+The `sst_ratio` is **recomputed on every `PickCompaction` call**. The
+computation is trivial (sum file sizes + arithmetic) and `PickCompaction`
+is only called once per flush or compaction completion, so no caching is
+needed. This also means the ratio naturally adapts when `SetOptions()`
+changes configuration.
+
+**Step 2: Tier Boundaries**
+
+Tier boundaries form a geometric sequence descending from the target,
+using `trigger` as the growth factor:
+
+```
+..., target/trigger^2, target/trigger, target
+```
+
+Example with target=1MB, trigger=10:
+  boundaries = [10KB, 100KB, 1MB]
+
+Boundaries below 10KB are not generated (SST files of most workloads
+are larger than this). If target itself is below 10KB, it is used as
+the sole boundary.
+
+Files >= target are "graduated" and never compacted again. They sit
+in L0 until FIFO drops them.
+
+**Step 3: Tiered File Selection**
+
+For each tier boundary (smallest first), scan L0 from oldest to newest:
+
+```
+For each boundary B (from smallest to largest):
+  1. Skip files >= B (they belong to higher tiers) and being_compacted files
+  2. Collect contiguous files < B
+  3. Stop when accumulated >= B (cap at 2*B to prevent tier-skipping)
+  4. If >= 2 files and accumulated >= B: merge them
+  5. Output (~B bytes) lands at the next tier
+```
+
+Processing boundaries smallest-first ensures bottom-up build: flush outputs
+are merged first, and higher-tier merges happen naturally as lower-tier
+outputs accumulate.
+
+```
+Example (target=1MB, trigger=10, flush~10KB):
+
+  Tier boundaries: [10KB, 100KB, 1MB]
+
+  L0: [1MB_grad] [1MB_grad] [100KB] [100KB] [10KB] [10KB] [F] [F] [F] [F]
+
+  Scan at boundary=10KB:
+    F,F,F,F (all < 10KB) --> accumulated >= 10KB? If yes, merge → ~10KB output
+
+  Scan at boundary=100KB:
+    10KB,10KB,... (all < 100KB) --> accumulated >= 100KB? merge → ~100KB
+
+  Scan at boundary=1MB:
+    100KB,100KB,... (all < 1MB) --> accumulated >= 1MB? merge → graduated!
+```
+
+### Trade-Off: Write Amp vs L0 File Count
+
+The tiered approach trades higher L0 file count for logarithmic write amp:
+
+```
+Write amp per byte:
+  k + 1 = ceil(log(target/flush) / log(trigger)) + 1
+  Each byte is rewritten once per tier crossing.
+
+L0 file count at steady state:
+  trigger + k * (trigger - 1)
+  More than the original trigger target, but bounded logarithmically.
+
+Example (target=1MB, flush=1KB, trigger=10):
+  k = 3 tiers, write amp = 4, file count ≈ 37
+  vs flat merging: write amp ≈ 57
+```
+
+### Anti-Re-Compaction Guard
+
+The guard is implicit in the tier boundaries:
+
+```
+Graduated files (>= target) are skipped at EVERY tier boundary.
+  1MB >= 1MB   --> skipped at 1MB boundary
+  1MB >= 100KB --> skipped at 100KB boundary
+  1MB >= 10KB  --> skipped at 10KB boundary
+
+Intermediate tier files are only merged at HIGHER tier boundaries.
+  A 100KB file (output of tier-0 merge) is:
+    >= 100KB --> skipped at 100KB boundary (won't be re-merged at same tier)
+    < 1MB    --> eligible at 1MB boundary (merges into graduated file)
+```
+
+Compare with the old strategy's guard:
+
+```
+Old: guard = 1.1 * write_buffer_size (breaks when SST << WBS)
+New: graduated files >= target always excluded; intermediate files
+     progress through tiers without cascading re-compaction
+```
+
+### Steady State
+
+```
+Steady state L0 (target=64MB, trigger=4, flush~1MB):
+
+  [64MB_grad, 64MB_grad, 64MB_grad, 64MB_grad,
+   16MB, 16MB, 16MB,
+   4MB, 4MB,
+   1MB, 1MB, 1MB]
+
+  - 4 graduated files at target size (frozen until FIFO drops them)
+  - Intermediate files at tier sizes (accumulating for next merge)
+  - Flush outputs (accumulating for first tier merge)
+
+When FIFO drops the oldest graduated file, it removes exactly
+1/trigger of the total SST data (predictable).
+```
+
+### Write Amplification
+
+```
+With BlobDB (SST ~1KB, blob ~1MB per flush, target=1MB, trigger=10):
+  - k = 3 tiers (1KB → 10KB → 100KB → 1MB)
+  - SST write amp: k+1 = 4x (flush + 3 tier crossings)
+  - Blob write amp: ~1x (never rewritten)
+  - Total write amp: ~1 + 1KB*4/(1KB+1MB) ≈ 1.004x
+
+Without BlobDB (SST ~64MB per flush):
+  - target = large, ratio = 1, k = 1 typically
+  - SST write amp: ~2x
+```
+
+### File Uniformity
+
+At steady state, all graduated files are close to the target size.
+Output is in [boundary, 2*boundary) at each tier. Variable flush sizes
+are handled naturally — the size-based merge rule produces consistent
+output regardless of individual file sizes.
+
+### Config
+
+- `CompactionOptionsFIFO::allow_compaction` (required: true)
+- `CompactionOptionsFIFO::use_kv_ratio_compaction` (default: false)
+- `CompactionOptionsFIFO::max_data_files_size` (required, > 0)
+- `level0_file_num_compaction_trigger` (target max L0 file count)
+- `max_compaction_bytes` (default: 0 = auto-calculate target from capacity;
+  when > 0, overrides auto-calculated target with this value)
+
+## Choosing Between Old and New Intra-L0 Strategies
+
+Both strategies require `allow_compaction = true`. The choice of strategy
+depends on whether BlobDB is used:
+
+```
+Decision tree:
+
+  Want intra-L0 compaction?
+    |
+    +-- NO:  allow_compaction = false (default)
+    |        No file merging, only dropping.
+    |
+    +-- YES: allow_compaction = true
+             |
+             +-- Using BlobDB (SST << write_buffer_size)?
+             |     |
+             |     +-- YES: use_kv_ratio_compaction = true
+             |     |        (also requires max_data_files_size > 0)
+             |     |
+             |     +-- NO:  use_kv_ratio_compaction = false (default)
+             |              Old strategy works fine.
+```
+
+| Criteria | Old (default) | New (`use_kv_ratio_compaction`) |
+|----------|------------------------|-------------------------------|
+| Guard mechanism | `1.1 * write_buffer_size` | capacity-derived target |
+| Works with BlobDB? | No (guard broken) | Yes (designed for it) |
+| File uniformity | Poor with BlobDB | Good (+/-25%) |
+| Re-compaction risk | High with BlobDB | None (tiered boundaries prevent it) |
+| Write amp (BlobDB) | Unpredictable | Logarithmic: (k+1)x SST, ~1x total |
+| Requires | `allow_compaction=true` | `allow_compaction=true` + `use_kv_ratio_compaction=true` + `max_data_files_size>0` |
+
+---
+
+# Configuration Examples
+
+## Basic FIFO (no intra-L0 compaction)
+
+```cpp
+options.compaction_style = kCompactionStyleFIFO;
+options.compaction_options_fifo.max_table_files_size = 1ULL * 1024 * 1024 * 1024;  // 1GB
+```
+
+## FIFO with old intra-L0 (non-BlobDB)
+
+```cpp
+options.compaction_style = kCompactionStyleFIFO;
+options.compaction_options_fifo.max_table_files_size = 1ULL * 1024 * 1024 * 1024;
+options.compaction_options_fifo.allow_compaction = true;
+options.level0_file_num_compaction_trigger = 4;
+```
+
+## FIFO with BlobDB and KV-ratio compaction
+
+```cpp
+options.compaction_style = kCompactionStyleFIFO;
+options.compaction_options_fifo.max_data_files_size = 10ULL * 1024 * 1024 * 1024;  // 10GB
+options.compaction_options_fifo.allow_compaction = true;   // master switch
+options.compaction_options_fifo.use_kv_ratio_compaction = true;  // select new strategy
+options.level0_file_num_compaction_trigger = 10;
+options.enable_blob_files = true;
+options.min_blob_size = 1024;
+```
+
+## FIFO with TTL + BlobDB
+
+```cpp
+options.compaction_style = kCompactionStyleFIFO;
+options.compaction_options_fifo.max_data_files_size = 10ULL * 1024 * 1024 * 1024;
+options.compaction_options_fifo.allow_compaction = true;
+options.compaction_options_fifo.use_kv_ratio_compaction = true;
+options.level0_file_num_compaction_trigger = 10;
+options.ttl = 86400;  // 24 hours
+options.enable_blob_files = true;
+options.min_blob_size = 1024;
+```
+
+---
+
+# Configuration Reference
+
+## CompactionOptionsFIFO
+
+| Option | Type | Default | Description |
+|--------|------|---------|-------------|
+| `max_table_files_size` | uint64_t | 1GB | SST-only size limit for FIFO dropping |
+| `max_data_files_size` | uint64_t | 0 | Combined SST+blob size limit (0=disabled) |
+| `allow_compaction` | bool | false | Master switch for intra-L0 compaction (required for both old and new strategies) |
+| `use_kv_ratio_compaction` | bool | false | Select capacity-derived intra-L0 strategy (requires allow_compaction=true AND max_data_files_size>0) |
+| `age_for_warm` | uint64_t | 0 | DEPRECATED |
+| `file_temperature_age_thresholds` | vector | empty | Age-based temperature migration |
+| `allow_trivial_copy_when_change_temperature` | bool | false | Allow trivial copy for temp change |
+| `trivial_copy_buffer_size` | uint64_t | 4096 | Buffer size for trivial copy |
+
+## Related CF Options
+
+| Option | Relevance to FIFO |
+|--------|-------------------|
+| `level0_file_num_compaction_trigger` | Target max L0 file count for KV-ratio; min files for old intra-L0 |
+| `ttl` | TTL-based file expiration (seconds) |
+| `write_buffer_size` | Guard threshold for old-style intra-L0 (1.1x) |
+| `max_compaction_bytes` | For KV-ratio: 0 = auto-calculate target from capacity; > 0 = use as target directly. For old intra-L0: cap on total input size. Default sanitized to target_file_size_base * 25 (except when use_kv_ratio_compaction=true, where 0 is preserved) |

From 88ff4f6b1256767b2b5675cf71f0cd6c1249b556 Mon Sep 17 00:00:00 2001
From: Maciej Szeszko <mszeszko@meta.com>
Date: Tue, 17 Feb 2026 11:13:17 -0800
Subject: [PATCH 477/500] Disable Interpolation Search in DB Stress (#14339)

Summary:
Temporarily disable interpolation search.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14339

Reviewed By: pdillinger

Differential Revision: D93497658

Pulled By: mszeszko-meta

fbshipit-source-id: 6ac826dd3fc354e18af0d928f87ed71e2cef3f14
---
 tools/db_crashtest.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index 55cc42d0046c..c00254958b48 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -174,7 +174,8 @@ def apply_random_seed_per_iteration():
     "get_current_wal_file_one_in": 0,
     # Temporarily disable hash index
     "index_type": lambda: random.choice([0, 0, 0, 2, 2, 3]),
-    "index_block_search_type": lambda: random.choice([0, 1]),
+    # Temporarily disable interpolation search (allow for binary search '0' only)
+    "index_block_search_type": 0,
     "ingest_external_file_one_in": lambda: random.choice([1000, 1000000]),
     "test_ingest_standalone_range_deletion_one_in": lambda: random.choice([0, 5, 10]),
     "iterpercent": 10,

From 821bd37d09061458a1957c75355c1bebe2a4d8de Mon Sep 17 00:00:00 2001
From: Maciej Szeszko <mszeszko@meta.com>
Date: Tue, 17 Feb 2026 11:49:20 -0800
Subject: [PATCH 478/500] Cap max table files size in db stress FIFO
 compactions (#14341)

Summary:
FIFO crash tests fail on DB open when `fifo_compaction_max_data_files_size_mb` is randomly set to 100 or 500 MB, because `max_table_files_size` defaults to 1GB and the validation requires `max_data_files_size` >= `max_table_files_size` when non-zero. Cap `max_table_files_size` to `max_data_files_size` in db_stress when the latter is set. `max_table_files_size` is ignored at runtime when `max_data_files_size` is non-zero, so this only satisfies the validation constraint.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14341

Reviewed By: pdillinger

Differential Revision: D93503113

Pulled By: mszeszko-meta

fbshipit-source-id: 5c3e7c9b568661244c71c548cb0fe5e55472c0ca
---
 db_stress_tool/db_stress_test_base.cc | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc
index 7428b2eff03a..902a6c8ff546 100644
--- a/db_stress_tool/db_stress_test_base.cc
+++ b/db_stress_tool/db_stress_test_base.cc
@@ -4381,6 +4381,11 @@ void InitializeOptionsFromFlags(
     if (FLAGS_fifo_compaction_max_data_files_size_mb > 0) {
       options.compaction_options_fifo.max_data_files_size =
           FLAGS_fifo_compaction_max_data_files_size_mb * 1024 * 1024;
+      // max_table_files_size is ignored when max_data_files_size is non-zero,
+      // but validation requires max_data_files_size >= max_table_files_size.
+      options.compaction_options_fifo.max_table_files_size =
+          std::min(options.compaction_options_fifo.max_table_files_size,
+                   options.compaction_options_fifo.max_data_files_size);
     }
     options.compaction_options_fifo.use_kv_ratio_compaction =
         FLAGS_fifo_compaction_use_kv_ratio_compaction;

From ebd1000008acfb5a12c199ee8bfba9a41214aed5 Mon Sep 17 00:00:00 2001
From: Maciej Szeszko <mszeszko@meta.com>
Date: Tue, 17 Feb 2026 12:28:31 -0800
Subject: [PATCH 479/500] Fix UB in ReadBe64FromKey shift by 64 (#14340)

Summary:
Pull Request resolved: https://github.com/facebook/rocksdb/pull/14340

`ReadBe64FromKey` pads its result with `val <<= (8 - len) * 8` to right-align partial reads. When the seek target's user key is shorter than shared_prefix_len, len is 0 and this becomes a shift by 64, which is undefined behavior for uint64_t. On x86 this happens to produce 0 (the correct result), but UBSan rightfully flags it. Guard the shift with `len > 0 && len < 8`.

Reviewed By: joshkang97

Differential Revision: D93435715

fbshipit-source-id: bab128e9a65ea18d401670268cbac77d45e11340
---
 table/block_based/block.cc | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/table/block_based/block.cc b/table/block_based/block.cc
index dd57b3df4cf4..8d28e9ae3f60 100644
--- a/table/block_based/block.cc
+++ b/table/block_based/block.cc
@@ -167,7 +167,9 @@ static uint64_t ReadBe64FromKey(Slice s, bool is_user_key, size_t offset) {
   for (size_t i = 0; i < len; i++) {
     val = (val << 8) | static_cast<uint8_t>(s.data()[offset + i]);
   }
-  val <<= (8 - len) * 8;  // Pad zeros on the right
+  if (len > 0 && len < 8) {
+    val <<= (8 - len) * 8;  // Pad zeros on the right
+  }
   return val;
 }
 

From 5f692d747c1c816e3f726d2ef6663214620eaad5 Mon Sep 17 00:00:00 2001
From: Maciej Szeszko <mszeszko@meta.com>
Date: Tue, 17 Feb 2026 12:51:36 -0800
Subject: [PATCH 480/500] Fall back to sync read when async IO is unavailable
 (#14337)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Summary:
Pull Request resolved: https://github.com/facebook/rocksdb/pull/14337

## Context:

D91624185 changed `FilePrefetchBuffer::PollIfNeeded` from `void` to returning `Status`, correctly propagating `Poll` errors instead of silently swallowing them. A side effect is that when io_uring fails to initialize at runtime (e.g., sandcastle seccomp restrictions), `ReadAsync` returns `NotSupported` which now propagates through `PrefetchRemBuffers` and `HandleOverlappingAsyncData`, causing `PrefetchInternal` to return early before executing the synchronous `Read` that the caller actually depends on. This leaves iterators invalid with no recovery path — both in `db_stress` crash tests and in production. The filesystem advertises async IO support (`CheckFSFeatureSupport` passes), so the failure only surfaces at runtime when io_uring initialization fails. The prior behavior silently degraded to sync reads because `PollIfNeeded` swallowed the error.

## Changes

Add a sync fallback in `FilePrefetchBuffer::ReadAsync` — the single chokepoint for all async reads. When `reader->ReadAsync()` returns `NotSupported`, fall back to `reader->Read()` synchronously, populate the buffer inline, and return OK. Since `async_read_in_progress_` stays false, `PollIfNeeded` becomes a no-op (nothing to poll, data is already there). All callers — `PrefetchRemBuffers`, `HandleOverlappingAsyncData`, `PrefetchAsync` — work transparently without any per-site changes.

Reviewed By: archang19

Differential Revision: D93432284

fbshipit-source-id: daef185fc3535e347d182e75dd443ae921eeb495
---
 file/file_prefetch_buffer.cc      | 12 +++++
 file/prefetch_test.cc             | 73 ++++++++++++++++++++++++++++---
 file/random_access_file_reader.cc |  5 +++
 3 files changed, 84 insertions(+), 6 deletions(-)

diff --git a/file/file_prefetch_buffer.cc b/file/file_prefetch_buffer.cc
index e8aa7d10c512..ab78fccf72b4 100644
--- a/file/file_prefetch_buffer.cc
+++ b/file/file_prefetch_buffer.cc
@@ -160,6 +160,18 @@ Status FilePrefetchBuffer::ReadAsync(BufferInfo* buf, const IOOptions& opts,
       RecordTick(stats_, PREFETCH_BYTES, read_len);
     }
     buf->async_read_in_progress_ = true;
+  } else if (s.IsNotSupported()) {
+    // Async IO is not available (e.g., io_uring failed to initialize).
+    // Fall back to synchronous read so the buffer is populated inline
+    // and callers proceed transparently.
+    s = reader->Read(opts, start_offset, read_len, &result,
+                     buf->buffer_.BufferStart(), /*aligned_buf=*/nullptr);
+    if (s.ok()) {
+      buf->buffer_.Size(buf->CurrentSize() + result.size());
+      if (usage_ == FilePrefetchBufferUsage::kUserScanPrefetch) {
+        RecordTick(stats_, PREFETCH_BYTES, read_len);
+      }
+    }
   }
   return s;
 }
diff --git a/file/prefetch_test.cc b/file/prefetch_test.cc
index 472fdc13bd1f..57559b5e8466 100644
--- a/file/prefetch_test.cc
+++ b/file/prefetch_test.cc
@@ -3582,14 +3582,25 @@ TEST_F(FilePrefetchBufferTest, PollErrorPropagation) {
   // Start an async prefetch to set up async_read_in_progress_ state
   Status s = fpb.PrefetchAsync(IOOptions(), r.get(), 0, 4096, &result);
 
-  // Skip test on platforms that don't support async IO
+  // Skip test on platforms that don't support async IO.
   if (s.IsNotSupported()) {
     ROCKSDB_GTEST_SKIP("Async IO not supported on this platform");
     return;
   }
   ASSERT_TRUE(s.IsTryAgain());
-  std::cout << "PollErrorPropagation: Async IO supported, proceeding with test"
-            << std::endl;
+
+  // With the ReadAsync sync fallback, PrefetchAsync returns TryAgain even when
+  // async IO is unavailable (data is read synchronously, but data_found was
+  // false at entry). Detect by checking async_read_in_progress_ on the buffer.
+  {
+    std::vector<std::tuple<uint64_t, size_t, bool>> buf_info(1);
+    fpb.TEST_GetBufferOffsetandSize(buf_info);
+    bool async_read_in_progress = std::get<2>(buf_info[0]);
+    if (!async_read_in_progress) {
+      ROCKSDB_GTEST_SKIP("Async IO not available (sync fallback used)");
+      return;
+    }
+  }
 
   // Set up SyncPoint to inject Poll error
   SyncPoint::GetInstance()->SetCallBack(
@@ -3619,9 +3630,40 @@ TEST_F(FilePrefetchBufferTest, PollErrorPropagation) {
       << "Expected error message to contain 'Injected Poll error', got: "
       << read_status.ToString();
 
-  std::cout << "PollErrorPropagation: Poll error correctly propagated - "
-            << "found=" << found << ", status=" << read_status.ToString()
-            << std::endl;
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(FilePrefetchBufferTest, ReadAsyncSyncFallbackOnNotSupported) {
+  std::string fname = "read-async-sync-fallback";
+  Random rand(0);
+  std::string content = rand.RandomString(32768);
+  Write(fname, content);
+
+  FileOptions opts;
+  std::unique_ptr<RandomAccessFileReader> r;
+  Read(fname, opts, &r);
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "RandomAccessFileReader::ReadAsync:InjectStatus", [](void* arg) {
+        *static_cast<IOStatus*>(arg) = IOStatus::NotSupported();
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  ReadaheadParams readahead_params;
+  readahead_params.initial_readahead_size = 16384;
+  readahead_params.max_readahead_size = 16384;
+  readahead_params.num_buffers = 2;
+
+  FilePrefetchBuffer fpb(readahead_params, /*enable=*/true,
+                         /*track_min_offset=*/false, fs());
+
+  Slice result;
+  Status s;
+  ASSERT_TRUE(fpb.TryReadFromCache(IOOptions(), r.get(), 0, 4096, &result, &s));
+  ASSERT_OK(s);
+  ASSERT_EQ(result.size(), 4096);
+  ASSERT_EQ(memcmp(result.data(), content.data(), 4096), 0);
 
   SyncPoint::GetInstance()->DisableProcessing();
   SyncPoint::GetInstance()->ClearAllCallBacks();
@@ -3792,6 +3834,9 @@ TEST_P(FSBufferPrefetchTest, FSBufferPrefetchStatsInternals) {
       fpb.TryReadFromCache(IOOptions(), r.get(), 0 /* offset */, 4096 /* n */,
                            &result, &s, for_compaction);
   // Platforms that don't have IO uring may not support async IO.
+  // With the ReadAsync sync fallback, s will be OK even when async IO is
+  // unavailable — detect by checking if the second buffer has an async read
+  // in progress.
   if (use_async_prefetch && s.IsNotSupported()) {
     return;
   }
@@ -3805,6 +3850,14 @@ TEST_P(FSBufferPrefetchTest, FSBufferPrefetchStatsInternals) {
   fpb.TEST_GetOverlapBufferOffsetandSize(overlap_buffer_info);
   fpb.TEST_GetBufferOffsetandSize(buffer_info);
   if (use_async_prefetch) {
+    bool async_read_in_progress = std::get<2>(buffer_info[1]);
+    if (!async_read_in_progress) {
+      // Async IO was requested but not available (e.g., no io_uring).
+      // ReadAsync fell back to sync read. Skip async-specific assertions.
+      SyncPoint::GetInstance()->DisableProcessing();
+      SyncPoint::GetInstance()->ClearAllCallBacks();
+      return;
+    }
     // Cut the readahead of 8192 in half.
     // Overlap buffer is not used
     ASSERT_EQ(overlap_buffer_info.first, 0);
@@ -3997,6 +4050,14 @@ TEST_P(FSBufferPrefetchTest, FSBufferPrefetchUnalignedReads) {
   fpb.TEST_GetOverlapBufferOffsetandSize(overlap_buffer_info);
   fpb.TEST_GetBufferOffsetandSize(buffer_info);
   if (use_async_prefetch) {
+    bool async_read_in_progress = std::get<2>(buffer_info[1]);
+    if (!async_read_in_progress) {
+      // Async IO was requested but not available (e.g., no io_uring).
+      // ReadAsync fell back to sync read. Skip async-specific assertions.
+      SyncPoint::GetInstance()->DisableProcessing();
+      SyncPoint::GetInstance()->ClearAllCallBacks();
+      return;
+    }
     // Overlap buffer is not used
     ASSERT_EQ(overlap_buffer_info.first, 0);
     ASSERT_EQ(overlap_buffer_info.second, 0);
diff --git a/file/random_access_file_reader.cc b/file/random_access_file_reader.cc
index ba376249d9da..ae070ef34626 100644
--- a/file/random_access_file_reader.cc
+++ b/file/random_access_file_reader.cc
@@ -528,6 +528,11 @@ IOStatus RandomAccessFileReader::ReadAsync(
     void** io_handle, IOHandleDeleter* del_fn, AlignedBuf* aligned_buf,
     IODebugContext* dbg) {
   IOStatus s;
+  TEST_SYNC_POINT_CALLBACK("RandomAccessFileReader::ReadAsync:InjectStatus",
+                           &s);
+  if (!s.ok()) {
+    return s;
+  }
   // Create a callback and populate info.
   auto read_async_callback =
       std::bind(&RandomAccessFileReader::ReadAsyncCallback, this,

From 09bda51c5009e371873af655bb231779e71ab306 Mon Sep 17 00:00:00 2001
From: Andrew Chang <andrewrchang@meta.com>
Date: Tue, 17 Feb 2026 13:05:44 -0800
Subject: [PATCH 481/500] Propagate file_checksum through FileOptions on
 NewRandomAccessFile (#14321)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Summary:
Pull Request resolved: https://github.com/facebook/rocksdb/pull/14321

Add file_checksum and file_checksum_func_name fields to FileOptions so that downstream FileSystem implementations can access per-file checksum metadata when SST files are opened. The fields are populated from FileMetaData at all call sites where SST files are opened via NewRandomAccessFile: TableCache::GetTableReader, Version::GetTableProperties, and CompactionJob::ReadTablePropertiesDirectly. Also fixes the fallback path in TableCache::GetTableReader to use the local fopts (with temperature and checksum) instead of the original file_options.

Added a kNoFileChecksumFuncName which is distinct from  kUnknownFileChecksumFuncName:

 - kUnknownFileChecksumFuncName ("Unknown"): We have FileMetaData for this file, and the metadata says no checksum was computed (no factory was configured when the file was written). This is a property of the file itself.
- kNoFileChecksumFuncName ("Unavailable"): We don't even have FileMetaData — we're opening this file in a context where there's no checksum metadata to propagate at all (e.g., SstFileDumper, SstFileReader, checksum generation). It's a property of the call site, not the file.

So the assertion file_checksum.empty() is correct for both, but for different reasons — one says "the file has no checksum," the other says "we have no idea about this file's checksum."

Reviewed By: pdillinger

Differential Revision: D92728944

fbshipit-source-id: 8fd34ea22ca87090b26d0a55c921f354f97f1ffc
---
 db/compaction/compaction_job.cc        |  5 +-
 db/convenience.cc                      |  8 +--
 db/convenience_impl.h                  |  3 +-
 db/db_basic_test.cc                    | 88 ++++++++++++++++++++++++++
 db/db_impl/db_impl.cc                  | 10 ++-
 db/external_sst_file_ingestion_job.cc  | 18 +++++-
 db/import_column_family_job.cc         |  6 +-
 db/table_cache.cc                      |  5 +-
 db/version_set.cc                      |  6 +-
 db_stress_tool/db_stress_env_wrapper.h | 32 ++++++++++
 db_stress_tool/db_stress_tool.cc       |  6 +-
 file/file_util.cc                      | 10 ++-
 file/file_util.h                       |  3 +-
 include/rocksdb/file_checksum.h        |  5 ++
 include/rocksdb/file_system.h          | 17 ++++-
 table/sst_file_dumper.cc               |  3 +
 table/sst_file_reader.cc               |  2 +
 17 files changed, 205 insertions(+), 22 deletions(-)

diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc
index a8baddee5dda..8092a26069be 100644
--- a/db/compaction/compaction_job.cc
+++ b/db/compaction/compaction_job.cc
@@ -2807,7 +2807,10 @@ Status CompactionJob::ReadTablePropertiesDirectly(
     std::shared_ptr<const TableProperties>* tp) {
   std::unique_ptr<FSRandomAccessFile> file;
   std::string file_name = GetTableFileName(file_meta->fd.GetNumber());
-  Status s = ioptions.fs->NewRandomAccessFile(file_name, file_options_, &file,
+  FileOptions fopts = file_options_;
+  fopts.file_checksum = file_meta->file_checksum;
+  fopts.file_checksum_func_name = file_meta->file_checksum_func_name;
+  Status s = ioptions.fs->NewRandomAccessFile(file_name, fopts, &file,
                                               nullptr /* dbg */);
   if (!s.ok()) {
     return s;
diff --git a/db/convenience.cc b/db/convenience.cc
index e8c1fcd01e00..5560cffe5fda 100644
--- a/db/convenience.cc
+++ b/db/convenience.cc
@@ -65,7 +65,7 @@ Status VerifySstFileChecksum(const Options& options,
 }
 
 Status VerifySstFileChecksumInternal(const Options& options,
-                                     const EnvOptions& env_options,
+                                     const FileOptions& file_options,
                                      const ReadOptions& read_options,
                                      const std::string& file_path,
                                      const SequenceNumber& largest_seqno) {
@@ -74,8 +74,8 @@ Status VerifySstFileChecksumInternal(const Options& options,
   InternalKeyComparator internal_comparator(options.comparator);
   ImmutableOptions ioptions(options);
 
-  Status s = ioptions.fs->NewRandomAccessFile(
-      file_path, FileOptions(env_options), &file, nullptr);
+  Status s =
+      ioptions.fs->NewRandomAccessFile(file_path, file_options, &file, nullptr);
   if (s.ok()) {
     s = ioptions.fs->GetFileSize(file_path, IOOptions(), &file_size, nullptr);
   } else {
@@ -94,7 +94,7 @@ Status VerifySstFileChecksumInternal(const Options& options,
   const bool kImmortal = true;
   auto reader_options = TableReaderOptions(
       ioptions, options.prefix_extractor, options.compression_manager.get(),
-      env_options, internal_comparator, options.block_protection_bytes_per_key,
+      file_options, internal_comparator, options.block_protection_bytes_per_key,
       false /* skip_filters */, !kImmortal, false /* force_direct_prefetch */,
       -1 /* level */);
   reader_options.largest_seqno = largest_seqno;
diff --git a/db/convenience_impl.h b/db/convenience_impl.h
index 32f4476bde99..5e8d6d49667c 100644
--- a/db/convenience_impl.h
+++ b/db/convenience_impl.h
@@ -5,10 +5,11 @@
 
 #pragma once
 #include "rocksdb/db.h"
+#include "rocksdb/file_system.h"
 
 namespace ROCKSDB_NAMESPACE {
 Status VerifySstFileChecksumInternal(const Options& options,
-                                     const EnvOptions& env_options,
+                                     const FileOptions& file_options,
                                      const ReadOptions& read_options,
                                      const std::string& file_path,
                                      const SequenceNumber& largest_seqno = 0);
diff --git a/db/db_basic_test.cc b/db/db_basic_test.cc
index c33f08628d10..0ff1295e9120 100644
--- a/db/db_basic_test.cc
+++ b/db/db_basic_test.cc
@@ -5474,6 +5474,94 @@ INSTANTIATE_TEST_CASE_P(DBBasicTestDeadline, DBBasicTestDeadline,
                         ::testing::Values(std::make_tuple(true, false),
                                           std::make_tuple(false, true),
                                           std::make_tuple(true, true)));
+
+// FileSystemWrapper that captures FileOptions passed to NewRandomAccessFile
+// for .sst files, so we can verify file_checksum fields are populated.
+class ChecksumCapturingFS : public FileSystemWrapper {
+ public:
+  explicit ChecksumCapturingFS(const std::shared_ptr<FileSystem>& base)
+      : FileSystemWrapper(base) {}
+
+  static const char* kClassName() { return "ChecksumCapturingFS"; }
+  const char* Name() const override { return kClassName(); }
+
+  IOStatus NewRandomAccessFile(const std::string& fname,
+                               const FileOptions& opts,
+                               std::unique_ptr<FSRandomAccessFile>* result,
+                               IODebugContext* dbg) override {
+    if (fname.find(".sst") != std::string::npos) {
+      std::lock_guard<std::mutex> lock(mu_);
+      captured_file_checksum_ = opts.file_checksum;
+      captured_file_checksum_func_name_ = opts.file_checksum_func_name;
+      capture_count_++;
+    }
+    return target()->NewRandomAccessFile(fname, opts, result, dbg);
+  }
+
+  std::string GetCapturedFileChecksum() {
+    std::lock_guard<std::mutex> lock(mu_);
+    return captured_file_checksum_;
+  }
+
+  std::string GetCapturedFileChecksumFuncName() {
+    std::lock_guard<std::mutex> lock(mu_);
+    return captured_file_checksum_func_name_;
+  }
+
+  int GetCaptureCount() {
+    std::lock_guard<std::mutex> lock(mu_);
+    return capture_count_;
+  }
+
+  void Reset() {
+    std::lock_guard<std::mutex> lock(mu_);
+    captured_file_checksum_.clear();
+    captured_file_checksum_func_name_.clear();
+    capture_count_ = 0;
+  }
+
+ private:
+  std::mutex mu_;
+  std::string captured_file_checksum_;
+  std::string captured_file_checksum_func_name_;
+  int capture_count_ = 0;
+};
+
+TEST_F(DBBasicTest, FileChecksumInFileOptions) {
+  // Verify that file_checksum and file_checksum_func_name from FileMetaData
+  // are propagated through FileOptions when opening SST files.
+  auto capturing_fs =
+      std::make_shared<ChecksumCapturingFS>(env_->GetFileSystem());
+  std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, capturing_fs));
+
+  Options options = GetDefaultOptions();
+  options.create_if_missing = true;
+  options.env = env.get();
+  options.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory();
+  DestroyAndReopen(options);
+
+  // Write data and flush to create an SST with a file checksum.
+  ASSERT_OK(Put("key1", "value1"));
+  ASSERT_OK(Flush());
+
+  // Reset captures, then reopen to trigger TableCache SST open.
+  capturing_fs->Reset();
+  Reopen(options);
+
+  // Read to trigger SST open through TableCache::GetTableReader.
+  ASSERT_EQ("value1", Get("key1"));
+
+  // Verify that checksum fields were populated.
+  ASSERT_GT(capturing_fs->GetCaptureCount(), 0);
+  ASSERT_FALSE(capturing_fs->GetCapturedFileChecksum().empty());
+  ASSERT_NE(capturing_fs->GetCapturedFileChecksumFuncName(),
+            capturing_fs->GetCapturedFileChecksum());
+  ASSERT_EQ(capturing_fs->GetCapturedFileChecksumFuncName(),
+            "FileChecksumCrc32c");
+
+  Close();
+}
+
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc
index 93cdbf5f36ad..b9120bc0486d 100644
--- a/db/db_impl/db_impl.cc
+++ b/db/db_impl/db_impl.cc
@@ -6471,8 +6471,11 @@ Status DBImpl::VerifyChecksumInternal(const ReadOptions& read_options,
                                      fmeta->file_checksum_func_name, fname,
                                      read_options);
         } else {
+          FileOptions fopts = file_options_;
+          fopts.file_checksum = fmeta->file_checksum;
+          fopts.file_checksum_func_name = fmeta->file_checksum_func_name;
           s = ROCKSDB_NAMESPACE::VerifySstFileChecksumInternal(
-              opts, file_options_, read_options, fname, fd.largest_seqno);
+              opts, fopts, read_options, fname, fd.largest_seqno);
         }
         RecordTick(stats_, VERIFY_CHECKSUM_READ_BYTES,
                    IOSTATS(bytes_read) - prev_bytes_read);
@@ -6540,12 +6543,15 @@ Status DBImpl::VerifyFullFileChecksum(const std::string& file_checksum_expected,
   }
   std::string file_checksum;
   std::string func_name;
+  FileOptions fopts;
+  fopts.file_checksum = file_checksum_expected;
+  fopts.file_checksum_func_name = func_name_expected;
   s = ROCKSDB_NAMESPACE::GenerateOneFileChecksum(
       fs_.get(), fname, immutable_db_options_.file_checksum_gen_factory.get(),
       func_name_expected, &file_checksum, &func_name,
       read_options.readahead_size, immutable_db_options_.allow_mmap_reads,
       io_tracer_, immutable_db_options_.rate_limiter.get(), read_options,
-      immutable_db_options_.stats, immutable_db_options_.clock);
+      immutable_db_options_.stats, immutable_db_options_.clock, fopts);
   if (s.ok()) {
     assert(func_name_expected == func_name);
     if (file_checksum != file_checksum_expected) {
diff --git a/db/external_sst_file_ingestion_job.cc b/db/external_sst_file_ingestion_job.cc
index 2b92bd2f09bd..7a379b9df790 100644
--- a/db/external_sst_file_ingestion_job.cc
+++ b/db/external_sst_file_ingestion_job.cc
@@ -300,6 +300,18 @@ Status ExternalSstFileIngestionJob::Prepare(
         // ingestion.
         // TODO: plumb Env::IOActivity
         ReadOptions ro;
+        // Pass user-provided checksums through FileOptions when available.
+        // The caller may not have provided checksums at all (empty vectors),
+        // so we guard with a bounds check.
+        FileOptions fopts;
+        if (i < files_checksums.size()) {
+          fopts.file_checksum = files_checksums[i];
+        }
+        if (i < files_checksum_func_names.size()) {
+          fopts.file_checksum_func_name = files_checksum_func_names[i];
+        } else {
+          fopts.file_checksum_func_name = kNoFileChecksumFuncName;
+        }
         IOStatus io_s = GenerateOneFileChecksum(
             fs_.get(), files_to_ingest_[i].internal_file_path,
             db_options_.file_checksum_gen_factory.get(),
@@ -308,7 +320,7 @@ Status ExternalSstFileIngestionJob::Prepare(
             ingestion_options_.verify_checksums_readahead_size,
             db_options_.allow_mmap_reads, io_tracer_,
             db_options_.rate_limiter.get(), ro, db_options_.stats,
-            db_options_.clock);
+            db_options_.clock, fopts);
         if (!io_s.ok()) {
           status = io_s;
           ROCKS_LOG_WARN(db_options_.info_log,
@@ -1482,13 +1494,15 @@ IOStatus ExternalSstFileIngestionJob::GenerateChecksumForIngestedFile(
   // TODO: rate limit file reads for checksum calculation during file ingestion.
   // TODO: plumb Env::IOActivity
   ReadOptions ro;
+  FileOptions gen_fopts;
+  gen_fopts.file_checksum_func_name = kNoFileChecksumFuncName;
   IOStatus io_s = GenerateOneFileChecksum(
       fs_.get(), file_to_ingest->internal_file_path,
       db_options_.file_checksum_gen_factory.get(), requested_checksum_func_name,
       &file_checksum, &file_checksum_func_name,
       ingestion_options_.verify_checksums_readahead_size,
       db_options_.allow_mmap_reads, io_tracer_, db_options_.rate_limiter.get(),
-      ro, db_options_.stats, db_options_.clock);
+      ro, db_options_.stats, db_options_.clock, gen_fopts);
   if (!io_s.ok()) {
     ROCKS_LOG_WARN(
         db_options_.info_log, "Failed to generate checksum for %s: %s",
diff --git a/db/import_column_family_job.cc b/db/import_column_family_job.cc
index 770dc5b69025..3033f1cf41e2 100644
--- a/db/import_column_family_job.cc
+++ b/db/import_column_family_job.cc
@@ -310,8 +310,10 @@ Status ImportColumnFamilyJob::GetIngestedFileInfo(
   std::unique_ptr<FSRandomAccessFile> sst_file;
   std::unique_ptr<RandomAccessFileReader> sst_file_reader;
 
-  status =
-      fs_->NewRandomAccessFile(external_file, env_options_, &sst_file, nullptr);
+  FileOptions fo{env_options_};
+  fo.file_checksum = file_meta.file_checksum;
+  fo.file_checksum_func_name = file_meta.file_checksum_func_name;
+  status = fs_->NewRandomAccessFile(external_file, fo, &sst_file, nullptr);
   if (!status.ok()) {
     return status;
   }
diff --git a/db/table_cache.cc b/db/table_cache.cc
index feb66f2eff4f..0e4e9f2e5155 100644
--- a/db/table_cache.cc
+++ b/db/table_cache.cc
@@ -98,6 +98,8 @@ Status TableCache::GetTableReader(
   std::unique_ptr<FSRandomAccessFile> file;
   FileOptions fopts = file_options;
   fopts.temperature = file_temperature;
+  fopts.file_checksum = file_meta.file_checksum;
+  fopts.file_checksum_func_name = file_meta.file_checksum_func_name;
   Status s = PrepareIOFromReadOptions(ro, ioptions_.clock, fopts.io_options);
   TEST_SYNC_POINT_CALLBACK("TableCache::GetTableReader:BeforeOpenFile",
                            const_cast<Status*>(&s));
@@ -113,8 +115,7 @@ Status TableCache::GetTableReader(
     Status temp_s =
         PrepareIOFromReadOptions(ro, ioptions_.clock, fopts.io_options);
     if (temp_s.ok()) {
-      temp_s = ioptions_.fs->NewRandomAccessFile(fname, file_options, &file,
-                                                 nullptr);
+      temp_s = ioptions_.fs->NewRandomAccessFile(fname, fopts, &file, nullptr);
     }
     if (temp_s.ok()) {
       RecordTick(ioptions_.stats, NO_FILE_OPENS);
diff --git a/db/version_set.cc b/db/version_set.cc
index bd36cc577475..6c9cbc82a17c 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -1820,8 +1820,10 @@ Status Version::GetTableProperties(const ReadOptions& read_options,
     file_name = TableFileName(ioptions.cf_paths, file_meta->fd.GetNumber(),
                               file_meta->fd.GetPathId());
   }
-  s = ioptions.fs->NewRandomAccessFile(file_name, file_options_, &file,
-                                       nullptr);
+  FileOptions fopts = file_options_;
+  fopts.file_checksum = file_meta->file_checksum;
+  fopts.file_checksum_func_name = file_meta->file_checksum_func_name;
+  s = ioptions.fs->NewRandomAccessFile(file_name, fopts, &file, nullptr);
   if (!s.ok()) {
     return s;
   }
diff --git a/db_stress_tool/db_stress_env_wrapper.h b/db_stress_tool/db_stress_env_wrapper.h
index 5ea9e8b6ef1c..4186bc41f653 100644
--- a/db_stress_tool/db_stress_env_wrapper.h
+++ b/db_stress_tool/db_stress_env_wrapper.h
@@ -9,8 +9,11 @@
 
 #ifdef GFLAGS
 #pragma once
+
 #include "db_stress_tool/db_stress_common.h"
+#include "file/filename.h"
 #include "monitoring/thread_status_util.h"
+#include "rocksdb/file_checksum.h"
 
 namespace ROCKSDB_NAMESPACE {
 namespace {
@@ -173,6 +176,35 @@ class DbStressFSWrapper : public FileSystemWrapper {
                                const FileOptions& file_opts,
                                std::unique_ptr<FSRandomAccessFile>* r,
                                IODebugContext* dbg) override {
+    // verify that file checksums are propagated through FileOptions
+    // for SST file opens.
+
+    std::string basename = f.substr(f.rfind('/') + 1);
+    uint64_t file_number;
+    FileType file_type;
+    if (ParseFileName(basename, &file_number, &file_type) &&
+        file_type == kTableFile) {
+      // file_checksum_func_name must always be populated to be sure each call
+      // site within RocksDB is intentional about populating the fields with the
+      // best available information:
+      //  - kNoFileChecksumFuncName: no checksum context available
+      //    (e.g., SstFileDumper, SstFileReader, checksum generation),
+      //    always paired with empty checksum
+      //  - kUnknownFileChecksumFuncName: file created without a
+      //    checksum factory (from MANIFEST), always paired with
+      //    empty checksum
+      //  - a real name (e.g., "FileChecksumCrc32c"): checksum exists
+      assert(!file_opts.file_checksum_func_name.empty());
+      if (file_opts.file_checksum_func_name == kUnknownFileChecksumFuncName ||
+          file_opts.file_checksum_func_name == kNoFileChecksumFuncName) {
+        // No checksum available — checksum value must be empty
+        assert(file_opts.file_checksum.empty());
+      } else {
+        // A real checksum function — checksum value must be present
+        assert(!file_opts.file_checksum.empty());
+      }
+    }
+
     std::unique_ptr<FSRandomAccessFile> file;
     IOStatus s = target()->NewRandomAccessFile(f, file_opts, &file, dbg);
     if (s.ok()) {
diff --git a/db_stress_tool/db_stress_tool.cc b/db_stress_tool/db_stress_tool.cc
index a32dcf557f52..15b52b827b14 100644
--- a/db_stress_tool/db_stress_tool.cc
+++ b/db_stress_tool/db_stress_tool.cc
@@ -94,8 +94,10 @@ int db_stress_tool(int argc, char** argv) {
     raw_env = fault_env_guard.get();
   }
 
-  env_wrapper_guard = std::make_shared<CompositeEnvWrapper>(
-      raw_env, std::make_shared<DbStressFSWrapper>(raw_env->GetFileSystem()));
+  auto db_stress_fs =
+      std::make_shared<DbStressFSWrapper>(raw_env->GetFileSystem());
+  env_wrapper_guard =
+      std::make_shared<CompositeEnvWrapper>(raw_env, db_stress_fs);
   db_stress_env = env_wrapper_guard.get();
 
   // Handle --destroy_db_and_exit early, before other option validation
diff --git a/file/file_util.cc b/file/file_util.cc
index 6e06ea0d95e3..c44d799b8ce4 100644
--- a/file/file_util.cc
+++ b/file/file_util.cc
@@ -178,7 +178,8 @@ IOStatus GenerateOneFileChecksum(
     std::string* file_checksum_func_name,
     size_t verify_checksums_readahead_size, bool /*allow_mmap_reads*/,
     std::shared_ptr<IOTracer>& io_tracer, RateLimiter* rate_limiter,
-    const ReadOptions& read_options, Statistics* stats, SystemClock* clock) {
+    const ReadOptions& read_options, Statistics* stats, SystemClock* clock,
+    const FileOptions& file_options) {
   if (checksum_factory == nullptr) {
     return IOStatus::InvalidArgument("Checksum factory is invalid");
   }
@@ -218,7 +219,12 @@ IOStatus GenerateOneFileChecksum(
   std::unique_ptr<RandomAccessFileReader> reader;
   {
     std::unique_ptr<FSRandomAccessFile> r_file;
-    io_s = fs->NewRandomAccessFile(file_path, FileOptions(), &r_file, nullptr);
+    FileOptions fopts = file_options;
+    if (fopts.file_checksum.empty()) {
+      // No expected checksum is known — this is a from-scratch computation.
+      fopts.file_checksum_func_name = kNoFileChecksumFuncName;
+    }
+    io_s = fs->NewRandomAccessFile(file_path, fopts, &r_file, nullptr);
     if (!io_s.ok()) {
       return io_s;
     }
diff --git a/file/file_util.h b/file/file_util.h
index d19a4de6cda0..f460a30caa9b 100644
--- a/file/file_util.h
+++ b/file/file_util.h
@@ -83,7 +83,8 @@ IOStatus GenerateOneFileChecksum(
     std::string* file_checksum_func_name,
     size_t verify_checksums_readahead_size, bool allow_mmap_reads,
     std::shared_ptr<IOTracer>& io_tracer, RateLimiter* rate_limiter,
-    const ReadOptions& read_options, Statistics* stats, SystemClock* clock);
+    const ReadOptions& read_options, Statistics* stats, SystemClock* clock,
+    const FileOptions& file_options);
 
 inline IOStatus PrepareIOFromReadOptions(const ReadOptions& ro,
                                          SystemClock* clock, IOOptions& opts,
diff --git a/include/rocksdb/file_checksum.h b/include/rocksdb/file_checksum.h
index bbb148c67d28..70de891f2c05 100644
--- a/include/rocksdb/file_checksum.h
+++ b/include/rocksdb/file_checksum.h
@@ -22,7 +22,12 @@ namespace ROCKSDB_NAMESPACE {
 // The unknown file checksum.
 constexpr char kUnknownFileChecksum[] = "";
 // The unknown sst file checksum function name.
+// Indicates that the file metadata says that no checksum factory was configured
+// when the file was written.
 constexpr char kUnknownFileChecksumFuncName[] = "Unknown";
+// Used when opening a file and there is no file checksum metadata to propagate
+// at all.
+constexpr char kNoFileChecksumFuncName[] = "Unavailable";
 // The standard DB file checksum function name.
 // This is the name of the checksum function returned by
 // GetFileChecksumGenCrc32cFactory();
diff --git a/include/rocksdb/file_system.h b/include/rocksdb/file_system.h
index 16f807e4f299..ea9d52bf6b30 100644
--- a/include/rocksdb/file_system.h
+++ b/include/rocksdb/file_system.h
@@ -201,6 +201,19 @@ struct FileOptions : EnvOptions {
   // FSWritableFile object creation.
   Env::WriteLifeTimeHint write_hint = Env::WLTH_NOT_SET;
 
+  // File checksum of the file being opened. Empty string if no checksum is
+  // available.
+  std::string file_checksum;
+
+  // Name of the checksum function used to compute file_checksum. Set to
+  // kUnknownFileChecksumFuncName when file was created without a checksum
+  // factory. Set to kNoFileChecksumFuncName when no checksum metadata is
+  // available.
+  // Production FileSystems will accept empty values for both
+  // file_checksum and file_checksum_func_name, but internally within RocksDB
+  // that is forbidden for checking/auditing purposes.
+  std::string file_checksum_func_name;
+
   FileOptions() : EnvOptions(), handoff_checksum_type(ChecksumType::kCRC32c) {}
 
   FileOptions(const DBOptions& opts)
@@ -216,7 +229,9 @@ struct FileOptions : EnvOptions {
         io_options(opts.io_options),
         temperature(opts.temperature),
         handoff_checksum_type(opts.handoff_checksum_type),
-        write_hint(opts.write_hint) {}
+        write_hint(opts.write_hint),
+        file_checksum(opts.file_checksum),
+        file_checksum_func_name(opts.file_checksum_func_name) {}
 
   FileOptions& operator=(const FileOptions&) = default;
 };
diff --git a/table/sst_file_dumper.cc b/table/sst_file_dumper.cc
index 6ccfe636e688..3b185380b571 100644
--- a/table/sst_file_dumper.cc
+++ b/table/sst_file_dumper.cc
@@ -23,6 +23,7 @@
 #include "port/port.h"
 #include "rocksdb/db.h"
 #include "rocksdb/env.h"
+#include "rocksdb/file_checksum.h"
 #include "rocksdb/iterator.h"
 #include "rocksdb/slice_transform.h"
 #include "rocksdb/status.h"
@@ -85,6 +86,7 @@ Status SstFileDumper::GetTableReader(const std::string& file_path) {
   uint64_t file_size = 0;
   FileOptions fopts = soptions_;
   fopts.temperature = file_temp_;
+  fopts.file_checksum_func_name = kNoFileChecksumFuncName;
   Status s = fs->NewRandomAccessFile(file_path, fopts, &file, nullptr);
   if (s.ok()) {
     // check empty file
@@ -129,6 +131,7 @@ Status SstFileDumper::GetTableReader(const std::string& file_path) {
       if (magic_number == kCuckooTableMagicNumber) {
         fopts = soptions_;
         fopts.temperature = file_temp_;
+        fopts.file_checksum_func_name = kNoFileChecksumFuncName;
       }
 
       fs->NewRandomAccessFile(file_path, fopts, &file, nullptr);
diff --git a/table/sst_file_reader.cc b/table/sst_file_reader.cc
index 11013712e281..e63e67c92e1a 100644
--- a/table/sst_file_reader.cc
+++ b/table/sst_file_reader.cc
@@ -11,6 +11,7 @@
 #include "file/random_access_file_reader.h"
 #include "options/cf_options.h"
 #include "rocksdb/env.h"
+#include "rocksdb/file_checksum.h"
 #include "rocksdb/file_system.h"
 #include "table/get_context.h"
 #include "table/table_builder.h"
@@ -51,6 +52,7 @@ Status SstFileReader::Open(const std::string& file_path) {
   std::unique_ptr<FSRandomAccessFile> file;
   std::unique_ptr<RandomAccessFileReader> file_reader;
   FileOptions fopts(r->soptions);
+  fopts.file_checksum_func_name = kNoFileChecksumFuncName;
   const auto& fs = r->options.env->GetFileSystem();
 
   s = fs->GetFileSize(file_path, fopts.io_options, &file_size, nullptr);

From 49695ef868cc110d6350f3c52b873a89e280bc97 Mon Sep 17 00:00:00 2001
From: Hui Xiao <huixiao@fb.com>
Date: Tue, 17 Feb 2026 14:12:08 -0800
Subject: [PATCH 482/500] Add debug assertion for clean cut invariant in
 expanding compaction inputs (#14333)

Summary:
**Context/Summary:**

Stress test recently encountered a one-off failure where input file selection for trivial move did not select all the files it should and left behind one adjacent file to the input file. This violated the clean cut invariant enforced through `ExpandInputsToCleanCut()` and caused `Get()` to return stale data.

While I had no luck reproducing it nor in code inspection to find the root cause, this debug assertion should help in two ways: 1. Fail fast if the invariant is violated, showing us the file boundary in memory 2. If the assertion doesn't trigger yet the same failure occurs, it points to metadata corruption bypassing this check and ExpandInputsToCleanCut() enforcement

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14333

Test Plan:
- Existing unit tests
- Manually trace through and run a stress test command that frequently exercises this check for 10 minutes
```
./db_stress --level0_file_num_compaction_trigger=2 --acquire_snapshot_one_in=10000 --adaptive_readahead=1 --allow_concurrent_memtable_write=0 --allow_data_in_errors=True --allow_setting_blob_options_dynamically=1 --async_io=1 --auto_readahead_size=0 --avoid_flush_during_recovery=0 --avoid_unnecessary_blocking_io=1 --backup_max_size=104857600 --backup_one_in=100000 --batch_protection_bytes_per_key=8 --blob_cache_size=1048576 --blob_compaction_readahead_size=4194304 --blob_compression_type=lz4 --blob_file_size=1048576 --blob_file_starting_level=1 --blob_garbage_collection_age_cutoff=1.0 --blob_garbage_collection_force_threshold=0.75 --block_protection_bytes_per_key=2 --block_size=16384 --bloom_before_level=0 --bloom_bits=16 --bottommost_compression_type=zstd --bottommost_file_compaction_delay=0 --bytes_per_sync=0 --cache_index_and_filter_blocks=0 --cache_size=8388608 --cache_type=auto_hyper_clock_cache --charge_compression_dictionary_building_buffer=1 --charge_file_metadata=1 --charge_filter_construction=0 --charge_table_reader=0 --checkpoint_one_in=1000000 --checksum_type=kXXH3 --clear_column_family_one_in=0 --column_families=1 --compact_files_one_in=1000000 --compact_range_one_in=1000000 --compaction_pri=3 --compaction_readahead_size=0 --compaction_ttl=2 --compression_checksum=1 --compression_max_dict_buffer_bytes=0 --compression_max_dict_bytes=0 --compression_parallel_threads=1 --compression_type=xpress --compression_use_zstd_dict_trainer=1 --compression_zstd_max_train_bytes=0 --continuous_verification_interval=0 --data_block_index_type=0 --db=/dev/shm/rocksdb_test/rocksdb_crashtest_whitebox --db_write_buffer_size=0 --delpercent=4 --delrangepercent=1 --destroy_db_initially=0 --detect_filter_construct_corruption=1 --disable_wal=0 --enable_blob_files=0 --enable_blob_garbage_collection=0 --enable_compaction_filter=0 --enable_pipelined_write=1 --enable_thread_tracking=1 --expected_values_dir=/dev/shm/rocksdb_test/rocksdb_crashtest_expected --fail_if_options_file_error=0 --fifo_allow_compaction=0 --file_checksum_impl=big --flush_one_in=1000000 --format_version=3 --get_current_wal_file_one_in=0 --get_live_files_one_in=1000000 --get_property_one_in=1000000 --get_sorted_wal_files_one_in=0 --index_block_restart_interval=2 --index_type=3 --ingest_external_file_one_in=0 --initial_auto_readahead_size=524288 --iterpercent=10 --key_len_percent_dist=1,30,69 --level_compaction_dynamic_level_bytes=1 --lock_wal_one_in=1000000 --long_running_snapshots=1 --manual_wal_flush_one_in=1000 --mark_for_compaction_one_file_in=0 --max_auto_readahead_size=0 --max_background_compactions=1 --max_bytes_for_level_base=1000 --max_key=25000000 --max_key_len=3 --max_manifest_file_size=16384 --max_write_batch_group_size_bytes=64 --max_write_buffer_number=3 --max_write_buffer_size_to_maintain=1000 --memtable_max_range_deletions=100 --memtable_prefix_bloom_size_ratio=0.01 --memtable_protection_bytes_per_key=4 --memtable_whole_key_filtering=0 --memtablerep=skip_list --min_blob_size=0 --min_write_buffer_number_to_merge=1 --mmap_read=1 --mock_direct_io=False --nooverwritepercent=1 --num_file_reads_for_auto_readahead=2 --open_files=-1 --open_metadata_write_fault_one_in=0 --open_read_fault_one_in=0 --open_write_fault_one_in=0 --ops_per_thread=100000000 --optimize_filters_for_memory=1 --paranoid_file_checks=0 --partition_filters=0 --partition_pinning=1 --pause_background_one_in=1000000 --periodic_compaction_seconds=1 --prefix_size=8 --prefixpercent=5 --prepopulate_blob_cache=0 --prepopulate_block_cache=0 --preserve_internal_time_seconds=0 --progress_reports=0 --read_fault_one_in=32 --readahead_size=0 --readpercent=45 --recycle_log_file_num=0 --reopen=0 --secondary_cache_fault_one_in=32 --secondary_cache_uri=compressed_secondary_cache://capacity=8388608 --set_options_one_in=0 --snapshot_hold_ops=100000 --sst_file_manager_bytes_per_sec=0 --sst_file_manager_bytes_per_truncate=0 --stats_dump_period_sec=10 --subcompactions=1 --sync=0 --sync_fault_injection=0 --target_file_size_base=1000 --target_file_size_multiplier=1 --test_batches_snapshots=0 --top_level_index_pinning=0 --unpartitioned_pinning=3 --use_blob_cache=0 --use_direct_io_for_flush_and_compaction=0 --use_direct_reads=0 --use_full_merge_v1=0 --use_get_entity=0 --use_merge=0 --use_multi_get_entity=0 --use_multiget=0 --use_put_entity_one_in=5 --use_shared_block_and_blob_cache=0 --user_timestamp_size=0 --value_size_mult=32 --verification_only=0 --verify_checksum=1 --verify_checksum_one_in=1000000 --verify_db_one_in=100000 --verify_file_checksums_one_in=1000000 --verify_iterator_with_expected_state_one_in=5 --verify_sst_unique_id_in_manifest=1 --wal_bytes_per_sync=524288 --wal_compression=none --write_buffer_size=1000 --write_dbid_to_manifest=1 --write_fault_one_in=0 --writepercent=35
```

Reviewed By: mszeszko-meta

Differential Revision: D93300664

Pulled By: hx235

fbshipit-source-id: b56f01c08a7348ba383110dd8f89b5b1b7961c55
---
 db/compaction/compaction_picker.cc | 61 ++++++++++++++++++++++++++++++
 1 file changed, 61 insertions(+)

diff --git a/db/compaction/compaction_picker.cc b/db/compaction/compaction_picker.cc
index 5e3ff66cf8b3..14c25677c0b9 100644
--- a/db/compaction/compaction_picker.cc
+++ b/db/compaction/compaction_picker.cc
@@ -27,6 +27,63 @@
 
 namespace ROCKSDB_NAMESPACE {
 
+#ifndef NDEBUG
+static void AssertCleanCut(const InternalKeyComparator* icmp,
+                           VersionStorageInfo* vstorage,
+                           CompactionInputFiles* inputs, int level,
+                           Logger* logger) {
+  const std::vector<FileMetaData*>& level_files = vstorage->LevelFiles(level);
+  if (inputs->files.empty() || level_files.empty()) {
+    return;
+  }
+
+  const Comparator* ucmp = icmp->user_comparator();
+
+  // Find first and last input file indices in level
+  int first_input_idx = -1;
+  int last_input_idx = -1;
+  for (size_t i = 0; i < level_files.size(); i++) {
+    if (level_files[i] == inputs->files.front()) {
+      first_input_idx = static_cast<int>(i);
+    }
+    if (level_files[i] == inputs->files.back()) {
+      last_input_idx = static_cast<int>(i);
+    }
+  }
+
+  // Check file before first input
+  if (first_input_idx > 0) {
+    const FileMetaData* prev_file = level_files[first_input_idx - 1];
+    const FileMetaData* first_file = inputs->files.front();
+    int cmp = sstableKeyCompare(ucmp, prev_file->largest, first_file->smallest);
+    if (cmp == 0) {
+      ROCKS_LOG_ERROR(logger,
+                      "Clean cut violated: L%d unselected file %" PRIu64
+                      " adjacent to first selected file %" PRIu64,
+                      level, prev_file->fd.GetNumber(),
+                      first_file->fd.GetNumber());
+      assert(false);
+    }
+  }
+
+  // Check file after last input
+  if (last_input_idx >= 0 &&
+      static_cast<size_t>(last_input_idx) < level_files.size() - 1) {
+    const FileMetaData* last_file = inputs->files.back();
+    const FileMetaData* next_file = level_files[last_input_idx + 1];
+    int cmp = sstableKeyCompare(ucmp, last_file->largest, next_file->smallest);
+    if (cmp == 0) {
+      ROCKS_LOG_ERROR(logger,
+                      "Clean cut violated: L%d unselected file %" PRIu64
+                      " adjacent to last selected file %" PRIu64,
+                      level, next_file->fd.GetNumber(),
+                      last_file->fd.GetNumber());
+      assert(false);
+    }
+  }
+}
+#endif  // NDEBUG
+
 bool PickCostBasedIntraL0Compaction(
     const std::vector<FileMetaData*>& level_files, size_t min_files_to_compact,
     uint64_t max_compact_bytes_per_del_file, uint64_t max_compaction_bytes,
@@ -249,6 +306,10 @@ bool CompactionPicker::ExpandInputsToCleanCut(const std::string& /*cf_name*/,
   // inputs. thus, inputs should be non-empty here
   assert(!inputs->empty());
 
+#ifndef NDEBUG
+  AssertCleanCut(icmp_, vstorage, inputs, level, ioptions_.logger);
+#endif  // NDEBUG
+
   // If, after the expansion, there are files that are already under
   // compaction, then we must drop/cancel this compaction.
   if (AreFilesInCompaction(inputs->files)) {

From f065e1c95de98ed9d0e7c4f88a14d6f4c37c0cdc Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Tue, 17 Feb 2026 14:30:03 -0800
Subject: [PATCH 483/500] Fix up authors.yml for blog entries (#14342)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Summary:
Fixes blog author display issues on rocksdb.org/blog by:

* Adding missing authors to authors.yml: pdillinger, alanpaxton, akankshamahajan15, anand1976, poojam23
* Standardizing on GitHub usernames: renamed sdong → siying
* Fixing typo in 2016-02-25-rocksdb-ama.markdown: yhchiang → yhciang
* A short note in CLAUDE.md

Authors were not showing on the blog because they were referenced in post frontmatter but not defined in the _data/authors.yml lookup file.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14342

Test Plan: push & see ;)

Reviewed By: mszeszko-meta

Differential Revision: D93523972

Pulled By: pdillinger

fbshipit-source-id: 757c33e80f3c1d99ff4134a37321f40634d6e294
---
 CLAUDE.md                                     |  3 +++
 docs/_data/authors.yml                        | 20 ++++++++++++++++++-
 docs/_posts/2014-05-14-lock.markdown          |  2 +-
 ...6-23-plaintable-a-new-file-format.markdown |  2 +-
 ...2015-02-27-write-batch-with-index.markdown |  2 +-
 docs/_posts/2015-07-23-dynamic-level.markdown |  2 +-
 ...alysis-file-read-latency-by-level.markdown |  2 +-
 .../_posts/2016-01-29-compaction_pri.markdown |  2 +-
 .../2016-02-24-rocksdb-4-2-release.markdown   |  2 +-
 docs/_posts/2016-02-25-rocksdb-ama.markdown   |  2 +-
 ...2016-04-26-rocksdb-4-5-1-released.markdown |  2 +-
 ...016-09-28-rocksdb-4-11-2-released.markdown |  2 +-
 ...2017-03-02-rocksdb-5-2-1-released.markdown |  2 +-
 ...2021-04-12-universal-improvements.markdown |  2 +-
 .../2021-05-26-online-validation.markdown     |  2 +-
 ...10-08-parallel-compression-revamp.markdown |  2 +-
 16 files changed, 36 insertions(+), 15 deletions(-)

diff --git a/CLAUDE.md b/CLAUDE.md
index 6cbdb32e1cbc..acf14592e99d 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -229,6 +229,9 @@ The following patterns emerged as frequent sources of review feedback:
 ### Adding release note
 * Release note should be kept short at high level for external user consumption.
 
+### Blog posts (docs/_posts)
+* Blog post authors must be defined in `docs/_data/authors.yml` to be displayed
+
 ### Final verification of the change
 * Execute make clean to clean all of the changes.
 * Execute make check to build all of the changes and execute all of the tests.
diff --git a/docs/_data/authors.yml b/docs/_data/authors.yml
index 256f4c07ff65..0bc79ad80de6 100644
--- a/docs/_data/authors.yml
+++ b/docs/_data/authors.yml
@@ -1,3 +1,5 @@
+# Note: standardize on github user names here. fbid is optional and was used
+# to use author's profile picture from Facebook
 icanadi:
   full_name: Igor Canadi
   fbid: 706165749
@@ -26,7 +28,7 @@ lgalanis:
   full_name: Leonidas Galanis
   fbid: 8649950
 
-sdong:
+siying:
   full_name: Siying Dong
   fbid: 9805119
 
@@ -83,3 +85,19 @@ zjay:
 hx235:
   full_name: Hui Xiao
   fbid: 100037058588280
+
+pdillinger:
+  full_name: Peter Dillinger
+  fbid: 513108
+
+alanpaxton:
+  full_name: Alan Paxton
+
+akankshamahajan15:
+  full_name: Akanksha Mahajan
+
+anand1976:
+  full_name: Anand Ananthabhotla
+
+poojam23:
+  full_name: Pooja Malik
diff --git a/docs/_posts/2014-05-14-lock.markdown b/docs/_posts/2014-05-14-lock.markdown
index 12009cc88c11..66bf05dc4736 100644
--- a/docs/_posts/2014-05-14-lock.markdown
+++ b/docs/_posts/2014-05-14-lock.markdown
@@ -1,7 +1,7 @@
 ---
 title: Reducing Lock Contention in RocksDB
 layout: post
-author: sdong
+author: siying
 category: blog
 redirect_from:
   - /blog/521/lock/
diff --git a/docs/_posts/2014-06-23-plaintable-a-new-file-format.markdown b/docs/_posts/2014-06-23-plaintable-a-new-file-format.markdown
index 6a641f23353c..ed03b0273233 100644
--- a/docs/_posts/2014-06-23-plaintable-a-new-file-format.markdown
+++ b/docs/_posts/2014-06-23-plaintable-a-new-file-format.markdown
@@ -1,7 +1,7 @@
 ---
 title: PlainTable — A New File Format
 layout: post
-author: sdong
+author: siying
 category: blog
 redirect_from:
   - /blog/599/plaintable-a-new-file-format/
diff --git a/docs/_posts/2015-02-27-write-batch-with-index.markdown b/docs/_posts/2015-02-27-write-batch-with-index.markdown
index 7f9f77653655..770ee0581651 100644
--- a/docs/_posts/2015-02-27-write-batch-with-index.markdown
+++ b/docs/_posts/2015-02-27-write-batch-with-index.markdown
@@ -1,7 +1,7 @@
 ---
 title: 'WriteBatchWithIndex: Utility for Implementing Read-Your-Own-Writes'
 layout: post
-author: sdong
+author: siying
 category: blog
 redirect_from:
   - /blog/1901/write-batch-with-index/
diff --git a/docs/_posts/2015-07-23-dynamic-level.markdown b/docs/_posts/2015-07-23-dynamic-level.markdown
index 0ff3a0542f82..1bc41b2fb3a4 100644
--- a/docs/_posts/2015-07-23-dynamic-level.markdown
+++ b/docs/_posts/2015-07-23-dynamic-level.markdown
@@ -1,7 +1,7 @@
 ---
 title: Dynamic Level Size for Level-Based Compaction
 layout: post
-author: sdong
+author: siying
 category: blog
 redirect_from:
   - /blog/2207/dynamic-level/
diff --git a/docs/_posts/2015-11-16-analysis-file-read-latency-by-level.markdown b/docs/_posts/2015-11-16-analysis-file-read-latency-by-level.markdown
index b21b04fe3869..7e5eb03582d6 100644
--- a/docs/_posts/2015-11-16-analysis-file-read-latency-by-level.markdown
+++ b/docs/_posts/2015-11-16-analysis-file-read-latency-by-level.markdown
@@ -1,7 +1,7 @@
 ---
 title: Analysis File Read Latency by Level
 layout: post
-author: sdong
+author: siying
 category: blog
 redirect_from:
   - /blog/2537/analysis-file-read-latency-by-level/
diff --git a/docs/_posts/2016-01-29-compaction_pri.markdown b/docs/_posts/2016-01-29-compaction_pri.markdown
index ba9ee627c91d..955e0849c95f 100644
--- a/docs/_posts/2016-01-29-compaction_pri.markdown
+++ b/docs/_posts/2016-01-29-compaction_pri.markdown
@@ -1,7 +1,7 @@
 ---
 title: Option of Compaction Priority
 layout: post
-author: sdong
+author: siying
 category: blog
 redirect_from:
   - /blog/2921/compaction_pri/
diff --git a/docs/_posts/2016-02-24-rocksdb-4-2-release.markdown b/docs/_posts/2016-02-24-rocksdb-4-2-release.markdown
index 409015cc8c8c..927121bac173 100644
--- a/docs/_posts/2016-02-24-rocksdb-4-2-release.markdown
+++ b/docs/_posts/2016-02-24-rocksdb-4-2-release.markdown
@@ -1,7 +1,7 @@
 ---
 title: RocksDB 4.2 Release!
 layout: post
-author: sdong
+author: siying
 category: blog
 redirect_from:
   - /blog/3017/rocksdb-4-2-release/
diff --git a/docs/_posts/2016-02-25-rocksdb-ama.markdown b/docs/_posts/2016-02-25-rocksdb-ama.markdown
index 2ba04f39a18e..31792552fc29 100644
--- a/docs/_posts/2016-02-25-rocksdb-ama.markdown
+++ b/docs/_posts/2016-02-25-rocksdb-ama.markdown
@@ -1,7 +1,7 @@
 ---
 title: RocksDB AMA
 layout: post
-author: yhchiang
+author: yhciang
 category: blog
 redirect_from:
   - /blog/3065/rocksdb-ama/
diff --git a/docs/_posts/2016-04-26-rocksdb-4-5-1-released.markdown b/docs/_posts/2016-04-26-rocksdb-4-5-1-released.markdown
index 247768d307b4..b29a9bd3649f 100644
--- a/docs/_posts/2016-04-26-rocksdb-4-5-1-released.markdown
+++ b/docs/_posts/2016-04-26-rocksdb-4-5-1-released.markdown
@@ -1,7 +1,7 @@
 ---
 title: RocksDB 4.5.1 Released!
 layout: post
-author: sdong
+author: siying
 category: blog
 redirect_from:
   - /blog/3179/rocksdb-4-5-1-released/
diff --git a/docs/_posts/2016-09-28-rocksdb-4-11-2-released.markdown b/docs/_posts/2016-09-28-rocksdb-4-11-2-released.markdown
index 87c20eb47d43..11760cc82560 100644
--- a/docs/_posts/2016-09-28-rocksdb-4-11-2-released.markdown
+++ b/docs/_posts/2016-09-28-rocksdb-4-11-2-released.markdown
@@ -1,7 +1,7 @@
 ---
 title: RocksDB 4.11.2 Released!
 layout: post
-author: sdong
+author: siying
 category: blog
 ---
 We abandoned release candidates 4.10.x and directly go to 4.11.2 from 4.9, to make sure the latest release is stable. In 4.11.2, we fixed several data corruption related bugs introduced in 4.9.0.
diff --git a/docs/_posts/2017-03-02-rocksdb-5-2-1-released.markdown b/docs/_posts/2017-03-02-rocksdb-5-2-1-released.markdown
index c6ce27d64db4..87fe0c050e0b 100644
--- a/docs/_posts/2017-03-02-rocksdb-5-2-1-released.markdown
+++ b/docs/_posts/2017-03-02-rocksdb-5-2-1-released.markdown
@@ -1,7 +1,7 @@
 ---
 title: RocksDB 5.2.1 Released!
 layout: post
-author: sdong
+author: siying
 category: blog
 ---
 
diff --git a/docs/_posts/2021-04-12-universal-improvements.markdown b/docs/_posts/2021-04-12-universal-improvements.markdown
index fa4e9d463b23..f6bf64b2da8e 100644
--- a/docs/_posts/2021-04-12-universal-improvements.markdown
+++ b/docs/_posts/2021-04-12-universal-improvements.markdown
@@ -1,7 +1,7 @@
 ---
 title: (Call For Contribution) Make Universal Compaction More Incremental
 layout: post
-author: sdong
+author: siying
 category: blog
 ---
 
diff --git a/docs/_posts/2021-05-26-online-validation.markdown b/docs/_posts/2021-05-26-online-validation.markdown
index 33e9dfc151ac..9314630b0705 100644
--- a/docs/_posts/2021-05-26-online-validation.markdown
+++ b/docs/_posts/2021-05-26-online-validation.markdown
@@ -1,7 +1,7 @@
 ---
 title: Online Validation
 layout: post
-author: sdong
+author: siying
 category: blog
 ---
 To prevent or mitigate data corrution in RocksDB when some software or hardware issues happens, we keep adding online consistency checks and improving existing ones.
diff --git a/docs/_posts/2025-10-08-parallel-compression-revamp.markdown b/docs/_posts/2025-10-08-parallel-compression-revamp.markdown
index 435c409415f6..42386e5c941a 100644
--- a/docs/_posts/2025-10-08-parallel-compression-revamp.markdown
+++ b/docs/_posts/2025-10-08-parallel-compression-revamp.markdown
@@ -1,7 +1,7 @@
 ---
 title: "Parallel Compression Revamp: Dramatically Reduced CPU Overhead"
 layout: post
-author: peterd
+author: pdillinger
 category: blog
 ---
 

From 653fd9c65bc68550c1ffbaa4dee1b99469b055b2 Mon Sep 17 00:00:00 2001
From: anand76 <anand1976@users.noreply.github.com>
Date: Tue, 17 Feb 2026 14:52:00 -0800
Subject: [PATCH 484/500] Bug fix for bg error recovery in TransactionDB
 (#14313)

Summary:
This PR fixes a bug in the interaction between WritePrepared/WriteUnprepared TransactionDB (with two_write_queues=true) and background error recovery. This bug caused crash tests to fail with a "sequence number going backwards" error during DB open.

Root Cause
------------
When two_write_queues=true, sequence numbers are allocated via FetchAddLastAllocatedSequence() before a write completes, but are only published via SetLastSequence() after the write succeeds. If a background error occurs (e.g., a MANIFEST write failure during flush), the error recovery path in DBImpl::ResumeImpl creates new memtables and WAL files. The new WAL's starting sequence number is based on LastSequence() (the published value), which can be lower than already-allocated sequence numbers that were written to the old WAL. On subsequent recovery, RocksDB detects that sequence numbers in the new WAL are lower than those in the old WAL and reports a "sequence number going backwards" corruption error, causing the DB to fail to open.

Fix
 ---
The fix adds a call to a new VersionSet::SyncLastSequenceWithAllocated() method at the beginning of DBImpl::ResumeImpl, before any new memtables or WALs are created. This method advances last_sequence_ to match last_allocated_sequence_ if the latter is higher, ensuring the new WAL starts with a sequence number that is at least as high as any previously allocated one.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14313

Test Plan:
---------
Add new unit tests in write_prepared_transaction_test_seqno

Reviewed By: pdillinger

Differential Revision: D92746944

Pulled By: anand1976

fbshipit-source-id: 34385fc13fd74435dd1c3283637eb118f45d887e
---
 BUCK                                          |   6 +
 CMakeLists.txt                                |   1 +
 Makefile                                      |   3 +
 db/db_impl/db_impl.cc                         |  16 +
 db/version_set.h                              |  26 ++
 src.mk                                        |   1 +
 .../txn_two_write_queues_seqno_recovery.md    |   1 +
 .../write_prepared_transaction_test_seqno.cc  | 425 ++++++++++++++++++
 utilities/transactions/write_prepared_txn.h   |  91 ++++
 9 files changed, 570 insertions(+)
 create mode 100644 unreleased_history/bug_fixes/txn_two_write_queues_seqno_recovery.md
 create mode 100644 utilities/transactions/write_prepared_transaction_test_seqno.cc

diff --git a/BUCK b/BUCK
index 7037c44e778f..c05b7bb33d3a 100644
--- a/BUCK
+++ b/BUCK
@@ -5731,6 +5731,12 @@ cpp_unittest_wrapper(name="write_prepared_transaction_test",
             extra_compiler_flags=[])
 
 
+cpp_unittest_wrapper(name="write_prepared_transaction_test_seqno",
+            srcs=["utilities/transactions/write_prepared_transaction_test_seqno.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
 cpp_unittest_wrapper(name="write_unprepared_transaction_test",
             srcs=["utilities/transactions/write_unprepared_transaction_test.cc"],
             deps=[":rocksdb_test_lib"],
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e9134aa01889..f0e79d9306e1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1548,6 +1548,7 @@ if(WITH_TESTS)
         utilities/transactions/lock/point/point_lock_manager_stress_test.cc
         utilities/transactions/write_committed_transaction_ts_test.cc
         utilities/transactions/write_prepared_transaction_test.cc
+        utilities/transactions/write_prepared_transaction_test_seqno.cc
         utilities/transactions/write_unprepared_transaction_test.cc
         utilities/transactions/lock/range/range_locking_test.cc
         utilities/transactions/timestamped_snapshot_test.cc
diff --git a/Makefile b/Makefile
index 7c35b80d95f6..3e05fc174443 100644
--- a/Makefile
+++ b/Makefile
@@ -1829,6 +1829,9 @@ write_committed_transaction_ts_test: $(OBJ_DIR)/utilities/transactions/write_com
 write_prepared_transaction_test: $(OBJ_DIR)/utilities/transactions/write_prepared_transaction_test.o $(TEST_LIBRARY) $(LIBRARY)
 	$(AM_LINK)
 
+write_prepared_transaction_test_seqno: $(OBJ_DIR)/utilities/transactions/write_prepared_transaction_test_seqno.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
 write_unprepared_transaction_test: $(OBJ_DIR)/utilities/transactions/write_unprepared_transaction_test.o $(TEST_LIBRARY) $(LIBRARY)
 	$(AM_LINK)
 
diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc
index b9120bc0486d..34a3d2d9ea41 100644
--- a/db/db_impl/db_impl.cc
+++ b/db/db_impl/db_impl.cc
@@ -320,6 +320,22 @@ Status DBImpl::ResumeImpl(DBRecoverContext context) {
 
   WaitForBackgroundWork();
 
+  TEST_SYNC_POINT("DBImpl::ResumeImpl:Start");
+
+  // With two_write_queues=true, sequence numbers are allocated via
+  // FetchAddLastAllocatedSequence() before writes complete, but only
+  // published via SetLastSequence() after success. If we're recovering from
+  // an error, there may be allocated-but-not-published sequence numbers.
+  // We must sync last_sequence_ with last_allocated_sequence_ before creating
+  // any new memtables/WALs, otherwise the new WAL could start with a sequence
+  // number lower than what was already written, causing "sequence number
+  // going backwards" corruption on subsequent recovery.
+  if (immutable_db_options_.two_write_queues) {
+    versions_->SyncLastSequenceWithAllocated();
+  }
+
+  TEST_SYNC_POINT("DBImpl::ResumeImpl:AfterSyncSeq");
+
   Status s;
   if (shutdown_initiated_) {
     // Returning shutdown status to SFM during auto recovery will cause it
diff --git a/db/version_set.h b/db/version_set.h
index 800e55259872..47a677cf59e6 100644
--- a/db/version_set.h
+++ b/db/version_set.h
@@ -1434,6 +1434,29 @@ class VersionSet {
     return last_allocated_sequence_.fetch_add(s, std::memory_order_seq_cst);
   }
 
+  // Sync last_sequence_ with last_allocated_sequence_. This should be called
+  // during error recovery to ensure that any sequence numbers that were
+  // allocated (written to WAL) but not yet published are accounted for when
+  // creating new memtables/WALs. This prevents the "sequence number going
+  // backwards" corruption on subsequent recovery.
+  //
+  // This is necessary because with two_write_queues=true, writes allocate
+  // sequence numbers via FetchAddLastAllocatedSequence() before the write
+  // is complete, but only publish via SetLastSequence() after success.
+  // If an error occurs and recovery creates new memtables, SwitchMemtable
+  // uses LastSequence() which may be lower than already-allocated sequences.
+  //
+  // REQUIRED: DB mutex is held and no concurrent writers are active (i.e.,
+  // after WaitForBackgroundWork() in ResumeImpl).
+  void SyncLastSequenceWithAllocated() {
+    uint64_t alloc_seq =
+        last_allocated_sequence_.load(std::memory_order_seq_cst);
+    uint64_t last_seq = last_sequence_.load(std::memory_order_acquire);
+    if (alloc_seq > last_seq) {
+      last_sequence_.store(alloc_seq, std::memory_order_release);
+    }
+  }
+
   // Mark the specified file number as used.
   // REQUIRED: this is only called during single-threaded recovery or repair.
   void MarkFileNumberUsed(uint64_t number);
@@ -1715,6 +1738,9 @@ class VersionSet {
   // The last sequence number of data committed to the descriptor (manifest
   // file).
   SequenceNumber descriptor_last_sequence_ = 0;
+  // See write_prepared_txn.h for a more detailed description of how Write
+  // Prepared transactions work, with concrete examples.
+  //
   // The last seq that is already allocated. It is applicable only when we have
   // two write queues. In that case seq might or might not have appreated in
   // memtable but it is expected to appear in the WAL.
diff --git a/src.mk b/src.mk
index 0bae5ee333fd..a77efc8f6123 100644
--- a/src.mk
+++ b/src.mk
@@ -661,6 +661,7 @@ TEST_MAIN_SOURCES =                                                     \
   utilities/transactions/lock/point/point_lock_manager_test.cc          \
   utilities/transactions/lock/point/point_lock_manager_stress_test.cc   \
   utilities/transactions/write_prepared_transaction_test.cc             \
+  utilities/transactions/write_prepared_transaction_test_seqno.cc       \
   utilities/transactions/write_unprepared_transaction_test.cc           \
   utilities/transactions/write_committed_transaction_ts_test.cc         \
   utilities/transactions/timestamped_snapshot_test.cc                   \
diff --git a/unreleased_history/bug_fixes/txn_two_write_queues_seqno_recovery.md b/unreleased_history/bug_fixes/txn_two_write_queues_seqno_recovery.md
new file mode 100644
index 000000000000..95413bf15fd0
--- /dev/null
+++ b/unreleased_history/bug_fixes/txn_two_write_queues_seqno_recovery.md
@@ -0,0 +1 @@
+Fix a bug where WritePrepared/WriteUnprepared TransactionDB with two_write_queues=true could experience "sequence number going backwards" corruption during recovery from a background error, due to allocated-but-not-published sequence numbers not being synced before creating new WAL files.
diff --git a/utilities/transactions/write_prepared_transaction_test_seqno.cc b/utilities/transactions/write_prepared_transaction_test_seqno.cc
new file mode 100644
index 000000000000..0148ab9cc32c
--- /dev/null
+++ b/utilities/transactions/write_prepared_transaction_test_seqno.cc
@@ -0,0 +1,425 @@
+//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+// Test to verify that sequence numbers remain consistent during error recovery
+// with WritePrepared TransactionDB and two_write_queues=true.
+//
+// The fix: SyncLastSequenceWithAllocated() is called during ResumeImpl to
+// ensure that allocated-but-not-published sequence numbers are accounted for
+// before creating new memtables/WALs, preventing "sequence number going
+// backwards" corruption on subsequent recovery.
+
+#include <atomic>
+#include <memory>
+#include <string>
+
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "db/version_set.h"
+#include "port/stack_trace.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "utilities/fault_injection_fs.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class WritePreparedTransactionSeqnoTest : public ::testing::Test {
+ public:
+  WritePreparedTransactionSeqnoTest()
+      : db_(nullptr),
+        special_env_(Env::Default()),
+        fault_fs_(new FaultInjectionTestFS(FileSystem::Default())),
+        env_(new CompositeEnvWrapper(&special_env_, fault_fs_)) {
+    options_.create_if_missing = true;
+    options_.max_write_buffer_number = 2;
+    options_.write_buffer_size = 4 * 1024;
+    options_.level0_file_num_compaction_trigger = 2;
+    options_.env = env_.get();
+    // Use two_write_queues which is typical for WritePrepared
+    options_.two_write_queues = true;
+    // Enable auto recovery from retryable errors
+    options_.max_bgerror_resume_count = 2;
+    options_.bgerror_resume_retry_interval = 100000;  // 100ms
+
+    dbname_ = test::PerThreadDBPath("write_prepared_seqno_test");
+    EXPECT_OK(DestroyDB(dbname_, options_));
+
+    txn_db_options_.transaction_lock_timeout = 0;
+    txn_db_options_.default_lock_timeout = 0;
+    txn_db_options_.write_policy = TxnDBWritePolicy::WRITE_PREPARED;
+  }
+
+  ~WritePreparedTransactionSeqnoTest() {
+    SyncPoint::GetInstance()->DisableProcessing();
+    SyncPoint::GetInstance()->ClearAllCallBacks();
+    if (db_) {
+      for (auto h : handles_) {
+        if (h) {
+          EXPECT_OK(db_->DestroyColumnFamilyHandle(h));
+        }
+      }
+      handles_.clear();
+      delete db_;
+      db_ = nullptr;
+    }
+  }
+
+  Status Open() {
+    return TransactionDB::Open(options_, txn_db_options_, dbname_, &db_);
+  }
+
+  void Close() {
+    for (auto h : handles_) {
+      if (h) {
+        EXPECT_OK(db_->DestroyColumnFamilyHandle(h));
+      }
+    }
+    handles_.clear();
+    delete db_;
+    db_ = nullptr;
+  }
+
+  DBImpl* dbimpl() { return static_cast_with_check<DBImpl>(db_->GetRootDB()); }
+
+ protected:
+  TransactionDB* db_;
+  SpecialEnv special_env_;
+  std::shared_ptr<FaultInjectionTestFS> fault_fs_;
+  std::unique_ptr<Env> env_;
+  std::string dbname_;
+  Options options_;
+  TransactionDBOptions txn_db_options_;
+  std::vector<ColumnFamilyHandle*> handles_;
+};
+
+// Regression test: verify that after error recovery with two_write_queues,
+// the DB can be closed and reopened without sequence number corruption.
+TEST_F(WritePreparedTransactionSeqnoTest,
+       SeqnoGoesBackwardsDuringErrorRecovery) {
+  ASSERT_OK(Open());
+
+  // Write some initial data and flush to establish baseline
+  WriteOptions write_opts;
+  TransactionOptions txn_opts;
+  for (int i = 0; i < 10; i++) {
+    Transaction* txn = db_->BeginTransaction(write_opts, txn_opts);
+    ASSERT_NE(txn, nullptr);
+    ASSERT_OK(txn->SetName("txn" + std::to_string(i)));
+    ASSERT_OK(txn->Put("key" + std::to_string(i), "value" + std::to_string(i)));
+    ASSERT_OK(txn->Prepare());
+    ASSERT_OK(txn->Commit());
+    delete txn;
+  }
+  ASSERT_OK(db_->Flush(FlushOptions()));
+
+  // Write more data - these will allocate sequence numbers
+  for (int i = 10; i < 20; i++) {
+    Transaction* txn = db_->BeginTransaction(write_opts, txn_opts);
+    ASSERT_NE(txn, nullptr);
+    ASSERT_OK(txn->SetName("txn" + std::to_string(i)));
+    ASSERT_OK(txn->Put("key" + std::to_string(i), "value" + std::to_string(i)));
+    ASSERT_OK(txn->Prepare());
+    ASSERT_OK(txn->Commit());
+    delete txn;
+  }
+
+  // Set up sync point dependency chain for deterministic recovery
+  // synchronization, following the pattern from
+  // ManifestWriteRetryableErrorAutoRecover in error_handler_fs_test.cc.
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"RecoverFromRetryableBGIOError:BeforeStart",
+        "SeqnoGoesBackwardsDuringErrorRecovery:0"},
+       {"SeqnoGoesBackwardsDuringErrorRecovery:1",
+        "RecoverFromRetryableBGIOError:BeforeWait1"},
+       {"RecoverFromRetryableBGIOError:RecoverSuccess",
+        "SeqnoGoesBackwardsDuringErrorRecovery:2"}});
+
+  // Inject a retryable MANIFEST write error on the next flush
+  IOStatus error_to_inject = IOStatus::IOError("Injected MANIFEST error");
+  error_to_inject.SetRetryable(true);
+  SyncPoint::GetInstance()->SetCallBack(
+      "VersionSet::LogAndApply:WriteManifest",
+      [&](void*) { fault_fs_->SetFilesystemActive(false, error_to_inject); });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  // Trigger a flush that will fail due to MANIFEST write error
+  Status s = db_->Flush(FlushOptions());
+  ASSERT_NOK(s);
+
+  // Wait for recovery to start, then re-enable filesystem and let it proceed
+  TEST_SYNC_POINT("SeqnoGoesBackwardsDuringErrorRecovery:0");
+  fault_fs_->SetFilesystemActive(true);
+  SyncPoint::GetInstance()->ClearCallBack(
+      "VersionSet::LogAndApply:WriteManifest");
+  TEST_SYNC_POINT("SeqnoGoesBackwardsDuringErrorRecovery:1");
+
+  // Wait for recovery to complete
+  TEST_SYNC_POINT("SeqnoGoesBackwardsDuringErrorRecovery:2");
+  SyncPoint::GetInstance()->DisableProcessing();
+
+  // Write some more data after recovery
+  for (int i = 20; i < 30; i++) {
+    Transaction* txn = db_->BeginTransaction(write_opts, txn_opts);
+    ASSERT_NE(txn, nullptr);
+    ASSERT_OK(txn->SetName("txn_after_" + std::to_string(i)));
+    ASSERT_OK(txn->Put("key" + std::to_string(i), "value" + std::to_string(i)));
+    ASSERT_OK(txn->Prepare());
+    ASSERT_OK(txn->Commit());
+    delete txn;
+  }
+
+  // Close and reopen - this would fail with "sequence number going backwards"
+  // before the fix.
+  Close();
+
+  Status reopen_s = Open();
+  ASSERT_OK(reopen_s);
+
+  // Verify data integrity
+  ReadOptions read_opts;
+  for (int i = 0; i < 20; i++) {
+    std::string value;
+    ASSERT_OK(db_->Get(read_opts, "key" + std::to_string(i), &value));
+    ASSERT_EQ(value, "value" + std::to_string(i));
+  }
+
+  Close();
+}
+
+// Test that verifies the sequence number discrepancy is resolved by checking
+// that LastSequence >= LastAllocatedSequence after recovery completes.
+TEST_F(WritePreparedTransactionSeqnoTest, SeqnoDiscrepancyDuringErrorRecovery) {
+  ASSERT_OK(Open());
+
+  WriteOptions write_opts;
+  TransactionOptions txn_opts;
+
+  // Write initial data and flush
+  for (int i = 0; i < 5; i++) {
+    Transaction* txn = db_->BeginTransaction(write_opts, txn_opts);
+    ASSERT_NE(txn, nullptr);
+    ASSERT_OK(txn->SetName("init_txn" + std::to_string(i)));
+    ASSERT_OK(txn->Put("key" + std::to_string(i), "value" + std::to_string(i)));
+    ASSERT_OK(txn->Prepare());
+    ASSERT_OK(txn->Commit());
+    delete txn;
+  }
+  ASSERT_OK(db_->Flush(FlushOptions()));
+
+  // Write more transactions with two_write_queues to potentially create a gap
+  // between allocated and published sequence numbers. These must be written
+  // before installing the error injection callback, since the small write
+  // buffer (4KB) could trigger an automatic flush during these writes.
+  for (int i = 5; i < 10; i++) {
+    Transaction* txn = db_->BeginTransaction(write_opts, txn_opts);
+    ASSERT_NE(txn, nullptr);
+    ASSERT_OK(txn->SetName("txn" + std::to_string(i)));
+    ASSERT_OK(txn->Put("key" + std::to_string(i), "value" + std::to_string(i)));
+    ASSERT_OK(txn->Prepare());
+    ASSERT_OK(txn->Commit());
+    delete txn;
+  }
+
+  // Track sequence numbers at key points
+  std::atomic<uint64_t> last_seq_after_recovery{0};
+  std::atomic<uint64_t> last_allocated_seq_after_recovery{0};
+  std::atomic<bool> captured_seqs_after{false};
+
+  IOStatus error_to_inject = IOStatus::IOError("Injected error");
+  error_to_inject.SetRetryable(true);
+
+  // Set up sync point dependency chain for deterministic synchronization
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"RecoverFromRetryableBGIOError:BeforeStart",
+        "SeqnoDiscrepancyDuringErrorRecovery:0"},
+       {"SeqnoDiscrepancyDuringErrorRecovery:1",
+        "RecoverFromRetryableBGIOError:BeforeWait1"},
+       {"RecoverFromRetryableBGIOError:RecoverSuccess",
+        "SeqnoDiscrepancyDuringErrorRecovery:2"}});
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "VersionSet::LogAndApply:WriteManifest",
+      [&](void*) { fault_fs_->SetFilesystemActive(false, error_to_inject); });
+
+  // Capture sequence numbers after recovery completes to verify the fix
+  SyncPoint::GetInstance()->SetCallBack(
+      "RecoverFromRetryableBGIOError:RecoverSuccess", [&](void*) {
+        DBImpl* db_impl = dbimpl();
+        if (db_impl) {
+          VersionSet* vs = db_impl->GetVersionSet();
+          if (vs) {
+            last_seq_after_recovery.store(vs->LastSequence());
+            last_allocated_seq_after_recovery.store(
+                vs->LastAllocatedSequence());
+            captured_seqs_after.store(true);
+          }
+        }
+      });
+
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  // Trigger a flush that will fail
+  Status flush_s = db_->Flush(FlushOptions());
+  ASSERT_NOK(flush_s);
+
+  // Wait for recovery to start, re-enable filesystem, let it proceed
+  TEST_SYNC_POINT("SeqnoDiscrepancyDuringErrorRecovery:0");
+  fault_fs_->SetFilesystemActive(true);
+  SyncPoint::GetInstance()->ClearCallBack(
+      "VersionSet::LogAndApply:WriteManifest");
+  TEST_SYNC_POINT("SeqnoDiscrepancyDuringErrorRecovery:1");
+
+  // Wait for recovery to complete
+  TEST_SYNC_POINT("SeqnoDiscrepancyDuringErrorRecovery:2");
+  SyncPoint::GetInstance()->DisableProcessing();
+
+  // Verify that sequences were captured and are in sync after recovery
+  ASSERT_TRUE(captured_seqs_after.load());
+  ASSERT_GE(last_seq_after_recovery.load(),
+            last_allocated_seq_after_recovery.load())
+      << "LastSequence should be >= LastAllocatedSequence after recovery";
+
+  // Close and reopen should succeed without corruption
+  Close();
+  ASSERT_OK(Open());
+
+  // Verify data integrity
+  ReadOptions read_opts;
+  for (int i = 0; i < 10; i++) {
+    std::string value;
+    ASSERT_OK(db_->Get(read_opts, "key" + std::to_string(i), &value));
+    ASSERT_EQ(value, "value" + std::to_string(i));
+  }
+
+  Close();
+}
+
+// Test that verifies SyncLastSequenceWithAllocated is called during ResumeImpl
+// by checking sequence numbers before and after the sync point.
+TEST_F(WritePreparedTransactionSeqnoTest, ConcurrentWritesDuringErrorRecovery) {
+  ASSERT_OK(Open());
+
+  WriteOptions write_opts;
+  TransactionOptions txn_opts;
+
+  // Write initial data and flush
+  for (int i = 0; i < 5; i++) {
+    Transaction* txn = db_->BeginTransaction(write_opts, txn_opts);
+    ASSERT_NE(txn, nullptr);
+    ASSERT_OK(txn->SetName("init_txn" + std::to_string(i)));
+    ASSERT_OK(txn->Put("key" + std::to_string(i), "value" + std::to_string(i)));
+    ASSERT_OK(txn->Prepare());
+    ASSERT_OK(txn->Commit());
+    delete txn;
+  }
+  ASSERT_OK(db_->Flush(FlushOptions()));
+
+  // Write more transactions. These must be written before installing the error
+  // injection callback, since the small write buffer (4KB) could trigger an
+  // automatic flush during these writes.
+  for (int i = 5; i < 10; i++) {
+    Transaction* txn = db_->BeginTransaction(write_opts, txn_opts);
+    ASSERT_NE(txn, nullptr);
+    ASSERT_OK(txn->SetName("txn" + std::to_string(i)));
+    ASSERT_OK(txn->Put("key" + std::to_string(i), "value" + std::to_string(i)));
+    ASSERT_OK(txn->Prepare());
+    ASSERT_OK(txn->Commit());
+    delete txn;
+  }
+
+  // Track sequence numbers at key points during recovery
+  std::atomic<uint64_t> seq_before_resume{0};
+  std::atomic<uint64_t> alloc_seq_before_resume{0};
+  std::atomic<uint64_t> seq_after_resume{0};
+  std::atomic<uint64_t> alloc_seq_after_resume{0};
+
+  IOStatus error_to_inject = IOStatus::IOError("Injected error");
+  error_to_inject.SetRetryable(true);
+
+  // Set up sync point dependency chain for deterministic synchronization
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"RecoverFromRetryableBGIOError:BeforeStart",
+        "ConcurrentWritesDuringErrorRecovery:0"},
+       {"ConcurrentWritesDuringErrorRecovery:1",
+        "RecoverFromRetryableBGIOError:BeforeWait1"},
+       {"RecoverFromRetryableBGIOError:RecoverSuccess",
+        "ConcurrentWritesDuringErrorRecovery:2"}});
+
+  SyncPoint::GetInstance()->SetCallBack(
+      "VersionSet::LogAndApply:WriteManifest",
+      [&](void*) { fault_fs_->SetFilesystemActive(false, error_to_inject); });
+
+  // Capture sequences right before ResumeImpl runs the sync
+  SyncPoint::GetInstance()->SetCallBack("DBImpl::ResumeImpl:Start", [&](void*) {
+    DBImpl* db_impl = dbimpl();
+    if (db_impl) {
+      VersionSet* vs = db_impl->GetVersionSet();
+      if (vs) {
+        seq_before_resume.store(vs->LastSequence());
+        alloc_seq_before_resume.store(vs->LastAllocatedSequence());
+      }
+    }
+  });
+
+  // Capture sequences right after ResumeImpl syncs them
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::ResumeImpl:AfterSyncSeq", [&](void*) {
+        DBImpl* db_impl = dbimpl();
+        if (db_impl) {
+          VersionSet* vs = db_impl->GetVersionSet();
+          if (vs) {
+            seq_after_resume.store(vs->LastSequence());
+            alloc_seq_after_resume.store(vs->LastAllocatedSequence());
+          }
+        }
+      });
+
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  // Trigger a flush that will fail
+  Status flush_s = db_->Flush(FlushOptions());
+  ASSERT_NOK(flush_s);
+
+  // Wait for recovery to start, re-enable filesystem, let it proceed
+  TEST_SYNC_POINT("ConcurrentWritesDuringErrorRecovery:0");
+  fault_fs_->SetFilesystemActive(true);
+  SyncPoint::GetInstance()->ClearCallBack(
+      "VersionSet::LogAndApply:WriteManifest");
+  TEST_SYNC_POINT("ConcurrentWritesDuringErrorRecovery:1");
+
+  // Wait for recovery to complete
+  TEST_SYNC_POINT("ConcurrentWritesDuringErrorRecovery:2");
+  SyncPoint::GetInstance()->DisableProcessing();
+
+  // Verify that the AfterSyncSeq callback fired and sequences are in sync
+  ASSERT_GT(seq_after_resume.load(), 0u)
+      << "DBImpl::ResumeImpl:AfterSyncSeq callback should have fired";
+  ASSERT_EQ(seq_after_resume.load(), alloc_seq_after_resume.load())
+      << "Fix should have synced sequences";
+
+  // Close and reopen
+  Close();
+  ASSERT_OK(Open());
+
+  // Verify data integrity
+  ReadOptions read_opts;
+  for (int i = 0; i < 10; i++) {
+    std::string value;
+    ASSERT_OK(db_->Get(read_opts, "key" + std::to_string(i), &value));
+    ASSERT_EQ(value, "value" + std::to_string(i));
+  }
+
+  Close();
+}
+
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/utilities/transactions/write_prepared_txn.h b/utilities/transactions/write_prepared_txn.h
index aca6a19ea08d..3237b8011328 100644
--- a/utilities/transactions/write_prepared_txn.h
+++ b/utilities/transactions/write_prepared_txn.h
@@ -36,6 +36,97 @@ class WritePreparedTxnDB;
 // committed data from uncommitted data. Uncommitted data could be after the
 // Prepare phase in 2PC (WritePreparedTxn) or before that
 // (WriteUnpreparedTxnImpl).
+//
+// == Concrete example: WritePrepared 2PC transaction ==
+//
+// User code:
+//
+//   Transaction* txn = db->BeginTransaction(write_opts, txn_opts);
+//   txn->SetName("txn1");
+//   txn->Put("key1", "value1");   // buffered in WriteBatch, nothing written
+//   yet txn->Prepare();               // Phase 1 txn->Commit(); // Phase 2
+//
+// -- Phase 1: Prepare (PrepareInternal) --
+//
+// The Prepare call (write_prepared_txn.cc PrepareInternal) calls:
+//
+//   db_impl_->WriteImpl(write_options, GetWriteBatch(),
+//                       ..., !DISABLE_MEMTABLE, ...);
+//
+// !DISABLE_MEMTABLE is false — memtable is enabled. This is the defining
+// characteristic of "WritePrepared": the actual data (Put("key1", "value1"))
+// is written to the memtable at Prepare time.
+//
+// Because disable_memtable == false, the routing check at
+// db_impl_write.cc:502 is not taken. The write goes through the main write
+// queue (write_thread_), which handles both WAL and memtable:
+//
+//   Destination | What gets written                          | Sequence
+//   ------------|--------------------------------------------|-----------
+//   WAL         | Put(key1, value1) + EndPrepare(txn1)       | prepare_seq
+//   Memtable    | Put(key1, value1)                          | prepare_seq
+//
+// The data is now durable (WAL) and in the memtable, but not yet visible
+// to readers. Readers use GetLastPublishedSequence() which consults a
+// commit map — since prepare_seq is in the PreparedHeap but not yet in the
+// CommitCache, readers know this data is uncommitted and skip it.
+//
+// -- Phase 2: Commit (CommitInternal) --
+//
+// The Commit call (write_prepared_txn.cc CommitInternal) calls:
+//
+//   db_impl_->WriteImpl(write_options_, working_batch,
+//                       ..., disable_memtable, ...);
+//
+// In the typical case (do_one_write == true, i.e., the commit-time batch
+// is empty or has no data), disable_memtable is true. Now the routing
+// check at db_impl_write.cc:502 is taken:
+//
+//   if (two_write_queues_ && disable_memtable) {
+//       return WriteImplWALOnly(&nonmem_write_thread_, ...);
+//   }
+//
+// The commit goes through the second write queue (nonmem_write_thread_),
+// WAL only:
+//
+//   Destination | What gets written   | Sequence
+//   ------------|---------------------|-----------
+//   WAL         | Commit(txn1) marker | commit_seq
+//   Memtable    | Nothing             | —
+//
+// The PreReleaseCallback (WritePreparedCommitEntryPreReleaseCallback)
+// updates the CommitCache to record that prepare_seq was committed at
+// commit_seq. After this, readers consulting the commit map will see that
+// the data at prepare_seq is committed and therefore visible.
+//
+// -- Why two queues help --
+//
+// The Commit phase doesn't touch the memtable — it only writes a small
+// marker to WAL and updates an in-memory commit map. By routing this
+// through a separate queue, Commit writes don't have to wait behind other
+// transactions' Prepare writes (which do the expensive memtable insertion
+// on the main queue). This is the optimization mentioned in the options
+// comment about MySQL 2PC where commits are serial.
+//
+// -- Sequence number flow --
+//
+//                            last_sequence_ | last_allocated_seq |
+//                            last_published_seq
+//                            ---------------|--------------------|-------------------
+//   Before Prepare:                  9      |         9          |        9
+//
+//   Prepare (main queue):
+//     FetchAdd alloc seq             9      |        10          |        9
+//     Write WAL + memtable
+//     SetLastSequence               10      |        10          |        9
+//     (published_seq not advanced yet — data is uncommitted)
+//
+//   Commit (2nd queue):
+//     FetchAdd alloc seq            10      |        11          |        9
+//     Write WAL only
+//     Update CommitCache
+//     SetLastPublishedSeq           10      |        11          |       11
+//
 class WritePreparedTxn : public PessimisticTransaction {
  public:
   WritePreparedTxn(WritePreparedTxnDB* db, const WriteOptions& write_options,

From d693d5ae26bddeb67e099d6ec653fae5f75e67bb Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Tue, 17 Feb 2026 15:30:00 -0800
Subject: [PATCH 485/500] Blog post on CPU bug (#14078)

Summary:
see draft post

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14078

Test Plan: markdown preview (simple post)

Reviewed By: hx235

Differential Revision: D85475518

Pulled By: pdillinger

fbshipit-source-id: d7f7b0d68de3880fcffbdbbef27cd2c14fe51f96
---
 docs/_posts/2026-02-17-cpu-bug.markdown | 46 +++++++++++++++++++++++++
 1 file changed, 46 insertions(+)
 create mode 100644 docs/_posts/2026-02-17-cpu-bug.markdown

diff --git a/docs/_posts/2026-02-17-cpu-bug.markdown b/docs/_posts/2026-02-17-cpu-bug.markdown
new file mode 100644
index 000000000000..7147ca74dc6b
--- /dev/null
+++ b/docs/_posts/2026-02-17-cpu-bug.markdown
@@ -0,0 +1,46 @@
+---
+title: "RocksDB development finds a CPU bug"
+layout: post
+author: pdillinger
+category: blog
+---
+
+This is the story of how a RocksDB unit test I added four years ago, a mini-stress test you might call it, revealed [a novel hardware bug in a newer CPU](https://www.amd.com/en/resources/product-security/bulletin/amd-sb-7055.html). It was scary enough to be assigned a "high severity" CVE.
+
+## Background: Unique Identifiers
+About four years ago, we [added unique identifiers to SST files](https://github.com/facebook/rocksdb/pull/9126) to give them stable identifiers across different filesystems for caching purposes. Part of the motivation here was to eliminate our dependence on the uniqueness and non-recycling of unique identifiers on files provided by the OS filesystem. (Some filesystems were only [guaranteeing uniqueness among existing files, not among all files even in recent history](https://github.com/facebook/rocksdb/issues/7405#issuecomment-694595587).) I would call this dependency problem the *great tension* between reusing existing solutions and code self-reliance. You don't want to duplicate others' work but you also don't want to be subject to their bugs or changing / misaligned requirements. Striking this balance can be tricky, but in this case it was clear to us that we didn't want to rely on all the possible filesystems providing quality unique identifiers.
+
+If you're comfortable with large random numbers (e.g. 128 bits), you probably agree that persisting random identifiers (or [quasi-random](https://github.com/pdillinger/unique_id/blob/main/README.md), which [I helped formalize in a paper](https://dl.acm.org/doi/10.1145/3584372.3588674), [also on arXiv](https://arxiv.org/abs/2304.07109)) with each file would be safer and more predictable than relying so crucially on a minor feature of OS filesystems.
+
+## High Quality Randomness
+However, that assumes we have access to *high quality* random numbers (at least a good one or two to start from - see the paper). Because RocksDB intends to be cross-platform, we want to minimize platform-specific dependencies and prefer cross-platform dependencies. But that could easily land us back where we didn't want to be: susceptible to a bug or hiccup in one implementation of what we needed.
+
+Fortunately, the nature of random entropy allows *combining* sources so that your result is as good as your *best* input source, so even if one is bad, you only have a problem if they're all bad. And we had the advantages that (a) we only needed uniqueness, not security, which reduced the need for extra scrutiny and allowed us to use the quasi-random approach, and (b) the quasi-random approach minimized the amount of entropy needed, so the performance cost of acquiring each unit of entropy was almost inconsequential. Therefore, I combined these sources of entropy:
+
+* C++11's [std::random_device](https://en.cppreference.com/w/cpp/numeric/random/random_device.html) which is supposed to provide high quality but is allowed not to.
+* A hash of various environment parameters including hostname, process id, thread id, and various macro and micro time readings.
+* Platform-specific UUID generator (Linux and Windows only)
+
+## Trust But Verify
+To verify the quality of each of these sources on an ongoing basis, [I added unit tests](https://github.com/facebook/rocksdb/pull/8708) that used many threads to create thousands of unique identifiers based on one of the above sources at a time and verified their uniqueness. For a high quality source, the probability of any duplicate 128-bit IDs among thousands is negligible, even if running these tests continuously for decades.
+
+## That's Weird
+That was pretty much the story until some months ago the test based on `std::random_device` failed, once. It was quite suspicious because the number of unique IDs was not just one short of expectation, it was dozens or hundreds short. However, even that could be explained by a random CPU hiccup or bit flip in which we generated fewer IDs to begin with. (You might have noticed an increasing amount of RocksDB development effort and portion of CPU time going into checks that are logically redundant but exist to detect CPU miscalculations before the corruption propagates too far.)
+
+But then it failed again about a month later. No failures for four years, then two failures in two months. This smelled really bad. Digging into the details I noticed a crucial correlation: both of the failed test jobs had run on the same type of hardware, though in completely different data centers.
+
+From there I did the natural thing for an engineer: scale it up to try to reproduce the failure. And that was remarkably easy. By increasing the number of threads in the job to around the number of cores it would fail quickly and consistently on all systems using the same type of newer CPU, and pass on everything else. I tested some variants of this to establish some more details, including
+
+* `std::random_device` using "rdrand" and "/dev/urandom" sources were not affected, and
+* libc++ (from clang) was not affected, only libstdc++ (from GCC)
+
+## Root Cause Analysis
+From there Meta colleagues investigated the low-level details. They found the problem to be that the RDSEED instruction on this type of processor would return 0 and "success" much more often than would randomly be expected, but only on some cores and only under "complex micro-architectural conditions reproducible under memory-load," as a colleague describes it. A mitigating Linux kernel patch was developed to signal that RDSEED was unavailable on these processors, with the intention of rolling it out internally at Meta to avoid problems until a fix came from the OEM. [AMD quickly acknowledged the issue and announced planned mitigation](https://www.amd.com/en/resources/product-security/bulletin/amd-sb-7055.html), including a CPU microcode update.
+
+## With Apologies
+Although I worked to keep the information confidential until the OEM publicly acknowledged the issue, the uncoordinated disclosure via the Linux mailing list was due to zealous remediation efforts that crossed multiple infrastructure teams at Meta. We regret the mistake and are working to improve controls on the processes that failed to coordinate with the OEM first.
+
+## Key Takeaways
+* Test what you depend on.
+* Have redundancies and/or sanity checks for what you depend on.
+* Even CPUs can have bugs, usually flaky individual units but occasionally a bug affecting all units.

From 641f4703ac95ad1d9cb30e97027698be9e4333e3 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Tue, 17 Feb 2026 17:28:54 -0800
Subject: [PATCH 486/500] Refactor data block footer to reserve metadata bits
 for future features (#14332)

Summary:
I'm implementing this intending it to be used for https://github.com/facebook/rocksdb/issues/14287

Refactor the data block footer encoding/decoding to use a struct-based Encode/Decode API (DataBlockFooter), reserving the top 4 bits of the footer for metadata:
- Bit 31: Hash index present (kDataBlockBinaryAndHash) - existing use
- Bits 28-30: Reserved for future features

Comments have some detail for why it is safe to assume no practical existing SST files would use these newly reserved bits. And for forward compatibility, existing versions detect (non-zero) use of these new bits as impossibly large num_restarts and report "bad block contents". Not perfect, but not bad.

Key changes:
- Replace PackIndexTypeAndNumRestarts/UnPackIndexTypeAndNumRestarts with DataBlockFooter::EncodeTo/DecodeFrom methods
- DecodeFrom returns a detailed error when reserved bits are set, enabling graceful failure on newer format versions
- Reduce kMaxNumRestarts from 2^31-1 to 2^28-1 (268M), which is adequate for the maximum possible restarts in a 4GiB block
- Add GetCorruptionStatus() to Block for detailed error messages (Note that we are sensitive to the size of Block objects, so have to avoid adding unnecessary new members.)
- Remove obsolete kMaxBlockSizeSupportedByHashIndex size checks

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14332

Test Plan:
- Existing unit tests and format compatibility test
- Add test for reserved bit detection (ReservedBitInDataBlockFooter)

Reviewed By: joshkang97

Differential Revision: D93293152

Pulled By: pdillinger

fbshipit-source-id: b65a83e96bb09a98fb9b8b2dd9f754653ca7ed4d
---
 port/lang.h                            |   4 +
 table/block_based/block.cc             | 117 ++++++++++++-------------
 table/block_based/block.h              |  11 ++-
 table/block_based/block_builder.cc     |  13 ++-
 table/block_based/data_block_footer.cc |  68 +++++++-------
 table/block_based/data_block_footer.h  |  60 +++++++++++--
 table/table_test.cc                    |  38 ++++++++
 7 files changed, 200 insertions(+), 111 deletions(-)

diff --git a/port/lang.h b/port/lang.h
index ab79f9d22a75..f0418cedaeda 100644
--- a/port/lang.h
+++ b/port/lang.h
@@ -69,6 +69,10 @@ constexpr bool kMustFreeHeapAllocations = false;
 #define TSAN_SUPPRESSION
 #endif  // TSAN_SUPPRESSION
 
+// Fail in debug build with a useful message, for automatically grouping
+// related failures
+#define DEBUG_FAIL(msg) assert(false && msg)
+
 // Compile-time CPU feature testing compatibility
 //
 // A way to be extra sure these defines have been included.
diff --git a/table/block_based/block.cc b/table/block_based/block.cc
index 8d28e9ae3f60..d387019bb711 100644
--- a/table/block_based/block.cc
+++ b/table/block_based/block.cc
@@ -1294,39 +1294,12 @@ bool IndexBlockIter::PrefixSeek(const Slice& target, uint32_t* index,
   }
 }
 
-uint32_t Block::NumRestarts() const {
-  assert(size() >= 2 * sizeof(uint32_t));
-  uint32_t block_footer = DecodeFixed32(data() + size() - sizeof(uint32_t));
-  uint32_t num_restarts = block_footer;
-  if (size() > kMaxBlockSizeSupportedByHashIndex) {
-    // In BlockBuilder, we have ensured a block with HashIndex is less than
-    // kMaxBlockSizeSupportedByHashIndex (64KiB).
-    //
-    // Therefore, if we encounter a block with a size > 64KiB, the block
-    // cannot have HashIndex. So the footer will directly interpreted as
-    // num_restarts.
-    //
-    // Such check is for backward compatibility. We can ensure legacy block
-    // with a vary large num_restarts i.e. >= 0x80000000 can be interpreted
-    // correctly as no HashIndex even if the MSB of num_restarts is set.
-    return num_restarts;
-  }
-  BlockBasedTableOptions::DataBlockIndexType index_type;
-  UnPackIndexTypeAndNumRestarts(block_footer, &index_type, &num_restarts);
-  return num_restarts;
-}
-
 BlockBasedTableOptions::DataBlockIndexType Block::IndexType() const {
-  assert(size() >= 2 * sizeof(uint32_t));
-  if (size() > kMaxBlockSizeSupportedByHashIndex) {
-    // The check is for the same reason as that in NumRestarts()
-    return BlockBasedTableOptions::kDataBlockBinarySearch;
-  }
-  uint32_t block_footer = DecodeFixed32(data() + size() - sizeof(uint32_t));
-  uint32_t num_restarts = block_footer;
-  BlockBasedTableOptions::DataBlockIndexType index_type;
-  UnPackIndexTypeAndNumRestarts(block_footer, &index_type, &num_restarts);
-  return index_type;
+  assert(size() >= DataBlockFooter::kMinEncodedLength);
+  Slice input(data(), size());
+  DataBlockFooter footer;
+  footer.DecodeFrom(&input).PermitUncheckedError();
+  return footer.index_type;
 }
 
 Block::~Block() {
@@ -1336,51 +1309,71 @@ Block::~Block() {
   delete[] kv_checksum_;
 }
 
+Status Block::GetCorruptionStatus() const {
+  // Re-process the footer to get a detailed error status.
+  // This should only be called when size() == 0 (error marker).
+  assert(size() == 0);
+  // When size() == 0 and restart_offset_ != 0, restart_offset_ stores the
+  // original data size for re-decoding the footer to get detailed error.
+  if (restart_offset_ == 0) {
+    return Status::Corruption("bad block contents");
+  }
+  Slice input(contents_.data.data(), restart_offset_);
+  DataBlockFooter footer;
+  Status s = footer.DecodeFrom(&input);
+  if (!s.ok()) {
+    return s;  // Return the detailed error from DecodeFrom
+  }
+  // Footer decoded OK, so error was in later processing (shouldn't happen)
+  DEBUG_FAIL("ok status on presumed bad block contents");
+  return Status::Corruption("presumed bad block contents");
+}
+
 Block::Block(BlockContents&& contents, size_t read_amp_bytes_per_bit,
              Statistics* statistics)
     : contents_(std::move(contents)), restart_offset_(0), num_restarts_(0) {
   TEST_SYNC_POINT("Block::Block:0");
   auto& size = contents_.data.size_;
-  if (size < sizeof(uint32_t)) {
+  // `contents` is assumed to be uncompressed in the proper format
+  Slice input(contents_.data.data(), size);
+  DataBlockFooter footer;
+  Status s = footer.DecodeFrom(&input);
+  if (!s.ok()) {
+    // Save original size for GetCorruptionStatus() to re-decode footer
+    restart_offset_ = static_cast<uint32_t>(size);
     size = 0;  // Error marker
   } else {
-    // Should only decode restart points for uncompressed blocks
-    num_restarts_ = NumRestarts();
-    switch (IndexType()) {
+    // After DecodeFrom, input has the footer removed. Each case below
+    // may strip additional suffix (e.g., hash index) so that input ends
+    // with just the restart array.
+    num_restarts_ = footer.num_restarts;
+    switch (footer.index_type) {
       case BlockBasedTableOptions::kDataBlockBinarySearch:
-        restart_offset_ = static_cast<uint32_t>(size) -
-                          (1 + num_restarts_) * sizeof(uint32_t);
-        if (restart_offset_ > size - sizeof(uint32_t)) {
-          // The size is too small for NumRestarts() and therefore
-          // restart_offset_ wrapped around.
-          size = 0;
-        }
         break;
       case BlockBasedTableOptions::kDataBlockBinaryAndHash:
-        if (size < sizeof(uint32_t) /* block footer */ +
-                       sizeof(uint16_t) /* NUM_BUCK */) {
+        if (input.size() < sizeof(uint16_t) /* NUM_BUCK */) {
           size = 0;
           break;
         }
-
         uint16_t map_offset;
-        data_block_hash_index_.Initialize(
-            contents_.data.data(),
-            /* chop off NUM_RESTARTS */
-            static_cast<uint16_t>(size - sizeof(uint32_t)), &map_offset);
-
-        restart_offset_ = map_offset - num_restarts_ * sizeof(uint32_t);
-
-        if (restart_offset_ > map_offset) {
-          // map_offset is too small for NumRestarts() and
-          // therefore restart_offset_ wrapped around.
-          size = 0;
-          break;
-        }
+        data_block_hash_index_.Initialize(contents_.data.data(),
+                                          static_cast<uint16_t>(input.size()),
+                                          &map_offset);
+        // Strip the hash index, leaving just data + restarts
+        input.remove_suffix(input.size() - map_offset);
         break;
       default:
         size = 0;  // Error marker
     }
+    // After the switch, input should end with restarts[num_restarts_]
+    if (size != 0) {
+      if (input.size() < num_restarts_ * sizeof(uint32_t)) {
+        size = 0;  // Block too small for the declared number of restarts
+      } else {
+        restart_offset_ = static_cast<uint32_t>(input.size()) -
+                          num_restarts_ * sizeof(uint32_t);
+      }
+    }
   }
   if (read_amp_bytes_per_bit != 0 && statistics && size != 0) {
     read_amp_bitmap_.reset(new BlockReadAmpBitmap(
@@ -1517,7 +1510,7 @@ void Block::InitializeMetaIndexBlockProtectionInfo(
 MetaBlockIter* Block::NewMetaIterator(bool block_contents_pinned) {
   MetaBlockIter* iter = new MetaBlockIter();
   if (size() < 2 * sizeof(uint32_t)) {
-    iter->Invalidate(Status::Corruption("bad block contents"));
+    iter->Invalidate(GetCorruptionStatus());
     return iter;
   } else if (num_restarts_ == 0) {
     // Empty block.
@@ -1542,7 +1535,7 @@ DataBlockIter* Block::NewDataIterator(const Comparator* raw_ucmp,
     ret_iter = new DataBlockIter;
   }
   if (size() < 2 * sizeof(uint32_t)) {
-    ret_iter->Invalidate(Status::Corruption("bad block contents"));
+    ret_iter->Invalidate(GetCorruptionStatus());
     return ret_iter;
   }
   if (num_restarts_ == 0) {
@@ -1581,7 +1574,7 @@ IndexBlockIter* Block::NewIndexIterator(
     ret_iter = new IndexBlockIter;
   }
   if (size() < 2 * sizeof(uint32_t)) {
-    ret_iter->Invalidate(Status::Corruption("bad block contents"));
+    ret_iter->Invalidate(GetCorruptionStatus());
     return ret_iter;
   }
   if (num_restarts_ == 0) {
diff --git a/table/block_based/block.h b/table/block_based/block.h
index 071dc4a5da49..afe059cdd5eb 100644
--- a/table/block_based/block.h
+++ b/table/block_based/block.h
@@ -167,7 +167,7 @@ class Block {
   const char* data() const { return contents_.data.data(); }
   // The additional memory space taken by the block data.
   size_t usable_size() const { return contents_.usable_size(); }
-  uint32_t NumRestarts() const;
+  uint32_t NumRestarts() const { return num_restarts_; }
   bool own_bytes() const { return contents_.own_bytes(); }
 
   BlockBasedTableOptions::DataBlockIndexType IndexType() const;
@@ -282,8 +282,15 @@ class Block {
   const char* TEST_GetKVChecksum() const { return kv_checksum_; }
 
  private:
+  // Returns a detailed error status by re-processing the footer.
+  // Should only be called when size() == 0 (error marker).
+  Status GetCorruptionStatus() const;
+
   BlockContents contents_;
-  uint32_t restart_offset_;  // Offset in data_ of restart array
+  // Normal state: offset in data_ of restart array.
+  // Error state (size()==0): original data size if footer decode failed,
+  //   otherwise 0. Used by GetCorruptionStatus() to re-decode footer.
+  uint32_t restart_offset_;
   uint32_t num_restarts_;
   std::unique_ptr<BlockReadAmpBitmap> read_amp_bitmap_;
   char* kv_checksum_{nullptr};
diff --git a/table/block_based/block_builder.cc b/table/block_based/block_builder.cc
index 4c2cb40094d7..541ff6ea23da 100644
--- a/table/block_based/block_builder.cc
+++ b/table/block_based/block_builder.cc
@@ -133,19 +133,16 @@ Slice BlockBuilder::Finish() {
     PutFixed32(&buffer_, restarts_[i]);
   }
 
-  uint32_t num_restarts = static_cast<uint32_t>(restarts_.size());
-  BlockBasedTableOptions::DataBlockIndexType index_type =
-      BlockBasedTableOptions::kDataBlockBinarySearch;
+  DataBlockFooter footer;
+  footer.num_restarts = static_cast<uint32_t>(restarts_.size());
+  footer.index_type = BlockBasedTableOptions::kDataBlockBinarySearch;
   if (data_block_hash_index_builder_.Valid() &&
       CurrentSizeEstimate() <= kMaxBlockSizeSupportedByHashIndex) {
     data_block_hash_index_builder_.Finish(buffer_);
-    index_type = BlockBasedTableOptions::kDataBlockBinaryAndHash;
+    footer.index_type = BlockBasedTableOptions::kDataBlockBinaryAndHash;
   }
 
-  // footer is a packed format of data_block_index_type and num_restarts
-  uint32_t block_footer = PackIndexTypeAndNumRestarts(index_type, num_restarts);
-
-  PutFixed32(&buffer_, block_footer);
+  footer.EncodeTo(&buffer_);
   finished_ = true;
   return Slice(buffer_);
 }
diff --git a/table/block_based/data_block_footer.cc b/table/block_based/data_block_footer.cc
index 5d5d8ed55e4e..24a31c0d52b5 100644
--- a/table/block_based/data_block_footer.cc
+++ b/table/block_based/data_block_footer.cc
@@ -9,51 +9,55 @@
 
 #include "table/block_based/data_block_footer.h"
 
-#include "rocksdb/table.h"
+#include "util/coding.h"
 
 namespace ROCKSDB_NAMESPACE {
 
-const int kDataBlockIndexTypeBitShift = 31;
+// Hash index bit (bit 31)
+constexpr uint32_t kHashIndexBit = 1u << 31;
 
-// 0x7FFFFFFF
-const uint32_t kMaxNumRestarts = (1u << kDataBlockIndexTypeBitShift) - 1u;
+void DataBlockFooter::EncodeTo(std::string* dst) const {
+  assert(num_restarts <= kMaxNumRestarts);
 
-// 0x7FFFFFFF
-const uint32_t kNumRestartsMask = (1u << kDataBlockIndexTypeBitShift) - 1u;
-
-uint32_t PackIndexTypeAndNumRestarts(
-    BlockBasedTableOptions::DataBlockIndexType index_type,
-    uint32_t num_restarts) {
-  if (num_restarts > kMaxNumRestarts) {
-    assert(0);  // mute travis "unused" warning
-  }
-
-  uint32_t block_footer = num_restarts;
+  uint32_t packed = num_restarts;
   if (index_type == BlockBasedTableOptions::kDataBlockBinaryAndHash) {
-    block_footer |= 1u << kDataBlockIndexTypeBitShift;
-  } else if (index_type != BlockBasedTableOptions::kDataBlockBinarySearch) {
-    assert(0);
+    packed |= kHashIndexBit;
+  } else {
+    assert(index_type == BlockBasedTableOptions::kDataBlockBinarySearch);
   }
 
-  return block_footer;
+  PutFixed32(dst, packed);
 }
 
-void UnPackIndexTypeAndNumRestarts(
-    uint32_t block_footer,
-    BlockBasedTableOptions::DataBlockIndexType* index_type,
-    uint32_t* num_restarts) {
-  if (index_type) {
-    if (block_footer & 1u << kDataBlockIndexTypeBitShift) {
-      *index_type = BlockBasedTableOptions::kDataBlockBinaryAndHash;
-    } else {
-      *index_type = BlockBasedTableOptions::kDataBlockBinarySearch;
-    }
+Status DataBlockFooter::DecodeFrom(Slice* input) {
+  if (input->size() < kMinEncodedLength) {
+    return Status::Corruption("Block too small for footer");
   }
 
-  if (num_restarts) {
-    *num_restarts = block_footer & kNumRestartsMask;
-    assert(*num_restarts <= kMaxNumRestarts);
+  // Decode from the end of the input
+  const char* footer_ptr = input->data() + input->size() - kMinEncodedLength;
+  uint32_t packed = DecodeFixed32(footer_ptr);
+
+  if (packed & kHashIndexBit) {
+    index_type = BlockBasedTableOptions::kDataBlockBinaryAndHash;
+    packed &= ~kHashIndexBit;
+  } else {
+    index_type = BlockBasedTableOptions::kDataBlockBinarySearch;
   }
+
+  // Check for reserved/unrecognized feature bits (anything beyond
+  // kMaxNumRestarts)
+  if (packed > kMaxNumRestarts) {
+    return Status::Corruption(
+        "Unrecognized feature in block footer (reserved bits set)");
+  }
+
+  num_restarts = packed;
+
+  // Remove the footer from the input slice
+  input->remove_suffix(kMinEncodedLength);
+
+  return Status::OK();
 }
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/table/block_based/data_block_footer.h b/table/block_based/data_block_footer.h
index c1cfd473099a..74301d0e0a1a 100644
--- a/table/block_based/data_block_footer.h
+++ b/table/block_based/data_block_footer.h
@@ -9,17 +9,63 @@
 
 #pragma once
 
+#include <cstdint>
+#include <string>
+
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
 #include "rocksdb/table.h"
 
 namespace ROCKSDB_NAMESPACE {
 
-uint32_t PackIndexTypeAndNumRestarts(
-    BlockBasedTableOptions::DataBlockIndexType index_type,
-    uint32_t num_restarts);
+// DataBlockFooter represents the footer of a data block, containing metadata
+// about the block's structure and features.
+//
+// Current encoding (may expand in future format versions):
+// - A single uint32_t where:
+//   - The low 28 bits store the number of restart points (num_restarts)
+//   - The high 4 bits are reserved for metadata/features:
+//     - Bit 31: Hash index present (kDataBlockBinaryAndHash)
+//     - Bits 28-30: Reserved for future features
+//
+// When any unrecognized reserved bit is set, DecodeFrom() returns an error,
+// allowing older versions to fail gracefully on newer formats.
+//
+// The encoding size is not fixed - future format versions may expand it.
+// Use kMaxEncodedLength for buffer sizing.
+struct DataBlockFooter {
+  // Maximum number of restarts that can be stored (2^28 - 1 = 268,435,455).
+  // This reserves the top 4 bits for metadata (bit 31 for hash index, bits
+  // 28-30 for future features). For historical compatibility purposes, the
+  // limit is adequate because a 4GiB block (maximum due to 32-bit block size)
+  // with restart_interval=1 and minimum entries (12 bytes: 3 varint bytes +
+  // 9-byte internal key + empty value) plus 4-byte restart offsets = 16 bytes
+  // per restart, fits at most (2^32 - 4) / 16 ≈ 268 million restarts.
+  static constexpr uint32_t kMaxNumRestarts = (1u << 28) - 1;
+
+  // Maximum encoded length of a DataBlockFooter (for buffer sizing)
+  // Currently 4 bytes, but may grow in future format versions.
+  static constexpr uint32_t kMaxEncodedLength = sizeof(uint32_t);
+
+  // Minimum encoded length (for current format version)
+  static constexpr uint32_t kMinEncodedLength = sizeof(uint32_t);
+
+  BlockBasedTableOptions::DataBlockIndexType index_type =
+      BlockBasedTableOptions::kDataBlockBinarySearch;
+  uint32_t num_restarts = 0;
+
+  DataBlockFooter() = default;
+  DataBlockFooter(BlockBasedTableOptions::DataBlockIndexType _index_type,
+                  uint32_t _num_restarts)
+      : index_type(_index_type), num_restarts(_num_restarts) {}
+
+  // Appends the encoded footer to dst.
+  void EncodeTo(std::string* dst) const;
 
-void UnPackIndexTypeAndNumRestarts(
-    uint32_t block_footer,
-    BlockBasedTableOptions::DataBlockIndexType* index_type,
-    uint32_t* num_restarts);
+  // Decodes a footer from the end of input (consumes bytes from the end).
+  // Returns an error if reserved/unrecognized feature bits are set.
+  // On success, advances input to exclude the consumed footer bytes.
+  Status DecodeFrom(Slice* input);
+};
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/table/table_test.cc b/table/table_test.cc
index 68f677fe0f01..43d8ccac43a5 100644
--- a/table/table_test.cc
+++ b/table/table_test.cc
@@ -2280,6 +2280,44 @@ TEST_P(BlockBasedTableTest, BadChecksumType) {
             "Corruption: Corrupt or unsupported checksum type: 123 in test");
 }
 
+TEST_P(BlockBasedTableTest, ReservedBitInDataBlockFooter) {
+  // Test that reserved metadata bits in data block footer are detected.
+  // We construct a block directly rather than going through the full table
+  // iterator path to avoid issues with iterator error handling.
+
+  // Build a simple data block
+  BlockBuilder builder(16 /* restart_interval */);
+  InternalKey key("abc", 1, kTypeValue);
+  builder.Add(key.Encode(), "test_value");
+  Slice block_contents = builder.Finish();
+  std::string block_data = block_contents.ToString();
+
+  // The footer is the last 4 bytes - corrupt it by setting reserved bit 28
+  ASSERT_GE(block_data.size(), sizeof(uint32_t));
+  size_t footer_offset = block_data.size() - sizeof(uint32_t);
+  uint32_t footer = DecodeFixed32(block_data.data() + footer_offset);
+  footer |= (1u << 28);  // Set lowest reserved bit
+  EncodeFixed32(&block_data[footer_offset], footer);
+
+  // Try to construct a Block from the corrupted data
+  BlockContents contents(std::move(block_data));
+  Block block(std::move(contents), 0 /* read_amp_bytes_per_bit */);
+
+  // Block should have size() == 0 indicating error
+  ASSERT_EQ(block.size(), 0u);
+
+  // Try to get an iterator - it should be invalid with corruption status
+  DataBlockIter iter;
+  block.NewDataIterator(BytewiseComparator(), kMaxSequenceNumber, &iter,
+                        /*stats=*/nullptr, /*block_contents_pinned=*/false);
+  ASSERT_FALSE(iter.Valid());
+  ASSERT_EQ(iter.status().code(), Status::kCorruption)
+      << iter.status().ToString();
+  ASSERT_NE(iter.status().ToString().find("reserved bits set"),
+            std::string::npos)
+      << iter.status().ToString();
+}
+
 class BuiltinChecksumTest : public testing::Test,
                             public testing::WithParamInterface<ChecksumType> {};
 

From 1f9d8ee302d3ca912fddcb4f334fcfee80f9d51f Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Tue, 17 Feb 2026 23:31:30 -0800
Subject: [PATCH 487/500] Remove obsolete code for block-based format_version <
 2 (after #14315) (#14327)

Summary:
After PR https://github.com/facebook/rocksdb/issues/14315 dropped support for block-based table format_version < 2, several code paths became obsolete. This change removes them.

Investigation findings:

1. Table properties are now a hard requirement for block-based SST files:
   - format_version >= 2 guarantees a properties block exists
   - Removed defensive conditionals like `if (rep_->table_properties)`
   - Missing properties block now returns Status::Corruption instead of just logging an error. This is important because some properties affect the semantic interpretation of the file.

2. Index type property (kIndexType) is now required:
   - kIndexType was introduced in Feb 2014 (commit 74939a9e1), ~11 months BEFORE format_version was introduced in Jan 2015
   - BlockBasedTablePropertiesCollector::Finish() has always written kIndexType unconditionally for all block-based tables
   - Therefore all format_version >= 2 files have this property
   - Now returns Status::Corruption if missing instead of silently defaulting to kBinarySearch

3. Removed SetOldTableOptions() from sst_file_dumper:
   - This fallback handled files without a properties block
   - Dead code since format_version >= 2 guarantees properties exist

4. Removed kPropertiesBlockOldName ("rocksdb.stats") fallback:
   - The properties block was renamed from "rocksdb.stats" to "rocksdb.properties" in RocksDB 2.7 (April 2014)
   - format_version 2 was introduced in RocksDB 3.10 (Oct 2015)
   - All table formats (block-based, plain, cuckoo) were created after the rename, so they all use "rocksdb.properties"
   - The backward compatibility fallback in FindOptionalMetaBlock() was dead code for all supported table formats

5. Removed obsolete assertion about format_version 0 checksum in BlockBasedTableBuilder::WriteFooter()

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14327

Test Plan: some tests updated for updated requirements. Mostly, CI including format compatible test

Reviewed By: mszeszko-meta

Differential Revision: D93124820

Pulled By: pdillinger

fbshipit-source-id: eb12cbdca0e69f34a08051d5160c282384128a4a
---
 db/db_table_properties_test.cc                |  21 +--
 .../block_based/block_based_table_builder.cc  |   3 -
 table/block_based/block_based_table_reader.cc | 165 ++++++++----------
 table/meta_blocks.cc                          |  10 --
 table/meta_blocks.h                           |   1 -
 table/sst_file_dumper.cc                      |  22 +--
 table/sst_file_dumper.h                       |   1 -
 7 files changed, 78 insertions(+), 145 deletions(-)

diff --git a/db/db_table_properties_test.cc b/db/db_table_properties_test.cc
index 095f090fd773..8e84f26541bc 100644
--- a/db/db_table_properties_test.cc
+++ b/db/db_table_properties_test.cc
@@ -69,14 +69,6 @@ TEST_F(DBTablePropertiesTest, GetPropertiesOfAllTablesTest) {
 
   // Create 4 tables
   for (int table = 0; table < 4; ++table) {
-    // Use old meta name for table properties for one file
-    if (table == 3) {
-      SyncPoint::GetInstance()->SetCallBack(
-          "BlockBasedTableBuilder::WritePropertiesBlock:Meta", [&](void* meta) {
-            *static_cast<const std::string**>(meta) = &kPropertiesBlockOldName;
-          });
-      SyncPoint::GetInstance()->EnableProcessing();
-    }
     // Build file
     for (int i = 0; i < 10 + table; ++i) {
       ASSERT_OK(
@@ -84,7 +76,6 @@ TEST_F(DBTablePropertiesTest, GetPropertiesOfAllTablesTest) {
     }
     ASSERT_OK(db_->Flush(FlushOptions()));
   }
-  SyncPoint::GetInstance()->DisableProcessing();
   std::string original_session_id;
   ASSERT_OK(db_->GetDbSessionId(original_session_id));
 
@@ -169,10 +160,7 @@ TEST_F(DBTablePropertiesTest, GetPropertiesOfAllTablesTest) {
   SyncPoint::GetInstance()->DisableProcessing();
 }
 
-TEST_F(DBTablePropertiesTest, InvalidIgnored) {
-  // RocksDB versions 2.5 - 2.7 generate some properties that Block considers
-  // invalid in some way. This approximates that.
-
+TEST_F(DBTablePropertiesTest, InvalidReportedAsCorruption) {
   // Inject properties block data that Block considers invalid
   SyncPoint::GetInstance()->SetCallBack(
       "BlockBasedTableBuilder::WritePropertiesBlock:BlockData",
@@ -189,13 +177,10 @@ TEST_F(DBTablePropertiesTest, InvalidIgnored) {
   for (int i = 0; i < 10; ++i) {
     ASSERT_OK(db_->Put(WriteOptions(), std::to_string(i), "val"));
   }
-  ASSERT_OK(db_->Flush(FlushOptions()));
+  // Corrupted properties block should be detected and reported as corruption
+  ASSERT_TRUE(db_->Flush(FlushOptions()).IsCorruption());
 
   SyncPoint::GetInstance()->DisableProcessing();
-
-  // Not crashing is good enough
-  TablePropertiesCollection props;
-  ASSERT_OK(db_->GetPropertiesOfAllTables(&props));
 }
 
 TEST_F(DBTablePropertiesTest, CreateOnDeletionCollectorFactory) {
diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc
index 46197d35dc24..ed288952213d 100644
--- a/table/block_based/block_based_table_builder.cc
+++ b/table/block_based/block_based_table_builder.cc
@@ -2559,9 +2559,6 @@ void BlockBasedTableBuilder::WriteFooter(BlockHandle& metaindex_block_handle,
                                          BlockHandle& index_block_handle) {
   assert(LIKELY(ok()));
   Rep* r = rep_.get();
-  // this is guaranteed by BlockBasedTableBuilder's constructor
-  assert(r->table_options.checksum == kCRC32c ||
-         r->table_options.format_version != 0);
   FooterBuilder footer;
   Status s = footer.Build(kBlockBasedTableMagicNumber,
                           r->table_options.format_version, r->get_offset(),
diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc
index d4e26e9b52f4..1de0096f4a72 100644
--- a/table/block_based/block_based_table_reader.cc
+++ b/table/block_based/block_based_table_reader.cc
@@ -927,20 +927,18 @@ Status BlockBasedTable::Open(
     rep->table_prefix_extractor = prefix_extractor;
   } else {
     // Current prefix_extractor doesn't match table
-    if (rep->table_properties) {
-      //**TODO: If/When the DBOptions has a registry in it, the ConfigOptions
-      // will need to use it
-      ConfigOptions config_options;
-      Status st = SliceTransform::CreateFromString(
-          config_options, rep->table_properties->prefix_extractor_name,
-          &(rep->table_prefix_extractor));
-      if (!st.ok()) {
-        //**TODO: Should this be error be returned or swallowed?
-        ROCKS_LOG_ERROR(rep->ioptions.logger,
-                        "Failed to create prefix extractor[%s]: %s",
-                        rep->table_properties->prefix_extractor_name.c_str(),
-                        st.ToString().c_str());
-      }
+    //**TODO: If/When the DBOptions has a registry in it, the ConfigOptions
+    // will need to use it
+    ConfigOptions config_options;
+    Status st = SliceTransform::CreateFromString(
+        config_options, rep->table_properties->prefix_extractor_name,
+        &(rep->table_prefix_extractor));
+    if (!st.ok()) {
+      //**TODO: Should this be error be returned or swallowed?
+      ROCKS_LOG_ERROR(rep->ioptions.logger,
+                      "Failed to create prefix extractor[%s]: %s",
+                      rep->table_properties->prefix_extractor_name.c_str(),
+                      st.ToString().c_str());
     }
   }
 
@@ -1086,86 +1084,72 @@ Status BlockBasedTable::ReadPropertiesBlock(
   BlockHandle handle;
   s = FindOptionalMetaBlock(meta_iter, kPropertiesBlockName, &handle);
 
+  if (!s.ok()) {
+    return s;
+  } else if (handle.IsNull()) {
+    return Status::Corruption("Cannot find Properties block from file.");
+  }
+
+  s = meta_iter->status();
+  std::unique_ptr<TableProperties> table_properties;
+  if (s.ok()) {
+    s = ReadTablePropertiesHelper(
+        ro, handle, rep_->file.get(), prefetch_buffer, rep_->footer,
+        rep_->ioptions, &table_properties, nullptr /* memory_allocator */);
+  }
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  assert(table_properties != nullptr);
+  rep_->table_properties = std::move(table_properties);
+
+  s = rep_->seqno_to_time_mapping.DecodeFrom(
+      rep_->table_properties->seqno_to_time_mapping);
   if (!s.ok()) {
     ROCKS_LOG_WARN(rep_->ioptions.logger,
-                   "Error when seeking to properties block from file: %s",
+                   "Problem reading or processing seqno-to-time mapping: %s",
                    s.ToString().c_str());
-  } else if (!handle.IsNull()) {
-    s = meta_iter->status();
-    std::unique_ptr<TableProperties> table_properties;
-    if (s.ok()) {
-      s = ReadTablePropertiesHelper(
-          ro, handle, rep_->file.get(), prefetch_buffer, rep_->footer,
-          rep_->ioptions, &table_properties, nullptr /* memory_allocator */);
-    }
-    IGNORE_STATUS_IF_ERROR(s);
+  }
 
-    if (!s.ok()) {
-      ROCKS_LOG_WARN(rep_->ioptions.logger,
-                     "Encountered error while reading data from properties "
-                     "block %s",
-                     s.ToString().c_str());
-    } else {
-      assert(table_properties != nullptr);
-      rep_->table_properties = std::move(table_properties);
+  // Read the table properties
+  rep_->whole_key_filtering &= IsFeatureSupported(
+      *(rep_->table_properties),
+      BlockBasedTablePropertyNames::kWholeKeyFiltering, rep_->ioptions.logger);
+  rep_->prefix_filtering &= IsFeatureSupported(
+      *(rep_->table_properties), BlockBasedTablePropertyNames::kPrefixFiltering,
+      rep_->ioptions.logger);
 
-      if (s.ok()) {
-        s = rep_->seqno_to_time_mapping.DecodeFrom(
-            rep_->table_properties->seqno_to_time_mapping);
-      }
-      if (!s.ok()) {
-        ROCKS_LOG_WARN(
-            rep_->ioptions.logger,
-            "Problem reading or processing seqno-to-time mapping: %s",
-            s.ToString().c_str());
-      }
-    }
-  } else {
-    ROCKS_LOG_ERROR(rep_->ioptions.logger,
-                    "Cannot find Properties block from file.");
+  rep_->index_key_includes_seq =
+      rep_->table_properties->index_key_is_user_key == 0;
+  rep_->index_value_is_full =
+      rep_->table_properties->index_value_is_delta_encoded == 0;
+
+  // Read index_type from properties (required for format_version >= 2)
+  auto& props = rep_->table_properties->user_collected_properties;
+  auto index_type_pos = props.find(BlockBasedTablePropertyNames::kIndexType);
+  if (index_type_pos == props.end()) {
+    return Status::Corruption("Missing index type property");
+  }
+  rep_->index_type = static_cast<BlockBasedTableOptions::IndexType>(
+      DecodeFixed32(index_type_pos->second.c_str()));
+  auto min_ts_pos = props.find("rocksdb.timestamp_min");
+  if (min_ts_pos != props.end()) {
+    rep_->min_timestamp = Slice(min_ts_pos->second);
+  }
+  auto max_ts_pos = props.find("rocksdb.timestamp_max");
+  if (max_ts_pos != props.end()) {
+    rep_->max_timestamp = Slice(max_ts_pos->second);
   }
 
-  // Read the table properties, if provided.
-  if (rep_->table_properties) {
-    rep_->whole_key_filtering &=
-        IsFeatureSupported(*(rep_->table_properties),
-                           BlockBasedTablePropertyNames::kWholeKeyFiltering,
-                           rep_->ioptions.logger);
-    rep_->prefix_filtering &= IsFeatureSupported(
-        *(rep_->table_properties),
-        BlockBasedTablePropertyNames::kPrefixFiltering, rep_->ioptions.logger);
-
-    rep_->index_key_includes_seq =
-        rep_->table_properties->index_key_is_user_key == 0;
-    rep_->index_value_is_full =
-        rep_->table_properties->index_value_is_delta_encoded == 0;
-
-    // Update index_type with the true type.
-    // If table properties don't contain index type, we assume that the table
-    // is in very old format and has kBinarySearch index type.
-    auto& props = rep_->table_properties->user_collected_properties;
-    auto index_type_pos = props.find(BlockBasedTablePropertyNames::kIndexType);
-    if (index_type_pos != props.end()) {
-      rep_->index_type = static_cast<BlockBasedTableOptions::IndexType>(
-          DecodeFixed32(index_type_pos->second.c_str()));
-    }
-    auto min_ts_pos = props.find("rocksdb.timestamp_min");
-    if (min_ts_pos != props.end()) {
-      rep_->min_timestamp = Slice(min_ts_pos->second);
-    }
-    auto max_ts_pos = props.find("rocksdb.timestamp_max");
-    if (max_ts_pos != props.end()) {
-      rep_->max_timestamp = Slice(max_ts_pos->second);
-    }
-
-    rep_->index_has_first_key =
-        rep_->index_type == BlockBasedTableOptions::kBinarySearchWithFirstKey;
-
-    s = GetGlobalSequenceNumber(*(rep_->table_properties), largest_seqno,
-                                &(rep_->global_seqno));
-    if (!s.ok()) {
-      ROCKS_LOG_ERROR(rep_->ioptions.logger, "%s", s.ToString().c_str());
-    }
+  rep_->index_has_first_key =
+      rep_->index_type == BlockBasedTableOptions::kBinarySearchWithFirstKey;
+
+  s = GetGlobalSequenceNumber(*(rep_->table_properties), largest_seqno,
+                              &(rep_->global_seqno));
+  if (!s.ok()) {
+    ROCKS_LOG_ERROR(rep_->ioptions.logger, "%s", s.ToString().c_str());
   }
   return s;
 }
@@ -3053,12 +3037,7 @@ uint64_t BlockBasedTable::ApproximateDataOffsetOf(
 }
 
 uint64_t BlockBasedTable::GetApproximateDataSize() {
-  // Should be in table properties unless super old version
-  if (rep_->table_properties) {
-    return rep_->table_properties->data_size;
-  }
-  // Fall back to rough estimate from footer
-  return rep_->footer.metaindex_handle().offset();
+  return rep_->table_properties->data_size;
 }
 
 uint64_t BlockBasedTable::ApproximateOffsetOf(const ReadOptions& read_options,
diff --git a/table/meta_blocks.cc b/table/meta_blocks.cc
index d8be37e58b39..72ee79266af6 100644
--- a/table/meta_blocks.cc
+++ b/table/meta_blocks.cc
@@ -29,8 +29,6 @@ namespace ROCKSDB_NAMESPACE {
 const std::string kPropertiesBlockName = "rocksdb.properties";
 // NB: only used with format_version >= 6
 const std::string kIndexBlockName = "rocksdb.index";
-// Old property block name for backward compatibility
-const std::string kPropertiesBlockOldName = "rocksdb.stats";
 const std::string kCompressionDictBlockName = "rocksdb.compression_dict";
 const std::string kRangeDelBlockName = "rocksdb.range_del";
 
@@ -545,14 +543,6 @@ Status FindOptionalMetaBlock(InternalIterator* meta_index_iter,
     if (meta_index_iter->Valid() && meta_index_iter->key() == meta_block_name) {
       Slice v = meta_index_iter->value();
       return block_handle->DecodeFrom(&v);
-    } else if (meta_block_name == kPropertiesBlockName) {
-      // Have to try old name for compatibility
-      meta_index_iter->Seek(kPropertiesBlockOldName);
-      if (meta_index_iter->status().ok() && meta_index_iter->Valid() &&
-          meta_index_iter->key() == kPropertiesBlockOldName) {
-        Slice v = meta_index_iter->value();
-        return block_handle->DecodeFrom(&v);
-      }
     }
   }
   // else
diff --git a/table/meta_blocks.h b/table/meta_blocks.h
index bc7ad18734f0..0012e9c305fc 100644
--- a/table/meta_blocks.h
+++ b/table/meta_blocks.h
@@ -34,7 +34,6 @@ struct TableProperties;
 // Meta block names for metaindex
 extern const std::string kPropertiesBlockName;
 extern const std::string kIndexBlockName;
-extern const std::string kPropertiesBlockOldName;
 extern const std::string kCompressionDictBlockName;
 extern const std::string kRangeDelBlockName;
 
diff --git a/table/sst_file_dumper.cc b/table/sst_file_dumper.cc
index 3b185380b571..a4b235546559 100644
--- a/table/sst_file_dumper.cc
+++ b/table/sst_file_dumper.cc
@@ -138,12 +138,11 @@ Status SstFileDumper::GetTableReader(const std::string& file_path) {
       file_.reset(new RandomAccessFileReader(std::move(file), file_path));
     }
 
-    // For old sst format, ReadTableProperties might fail but file can be read
-    if (ReadTableProperties(magic_number, file_.get(), file_size,
+    s = ReadTableProperties(magic_number, file_.get(), file_size,
                             (magic_number == kBlockBasedTableMagicNumber)
                                 ? &prefetch_buffer
-                                : nullptr)
-            .ok()) {
+                                : nullptr);
+    if (s.ok()) {
       s = SetTableOptionsByMagicNumber(magic_number);
       if (s.ok()) {
         if (table_properties_ && !table_properties_->comparator_name.empty()) {
@@ -158,8 +157,6 @@ Status SstFileDumper::GetTableReader(const std::string& file_path) {
           }
         }
       }
-    } else {
-      s = SetOldTableOptions();
     }
     options_.comparator = internal_comparator_.user_comparator();
 
@@ -526,19 +523,6 @@ Status SstFileDumper::SetTableOptionsByMagicNumber(
   return Status::OK();
 }
 
-Status SstFileDumper::SetOldTableOptions() {
-  assert(table_properties_ == nullptr);
-  if (!options_.table_factory->IsInstanceOf(
-          TableFactory::kBlockBasedTableName())) {
-    options_.table_factory = std::make_shared<BlockBasedTableFactory>();
-  }
-  if (!silent_) {
-    fprintf(stdout, "Sst file format: block-based(old version)\n");
-  }
-
-  return Status::OK();
-}
-
 Status SstFileDumper::ReadSequential(bool print_kv, uint64_t read_num_limit,
                                      bool has_from, const std::string& from_key,
                                      bool has_to, const std::string& to_key,
diff --git a/table/sst_file_dumper.h b/table/sst_file_dumper.h
index 23a878ba07f5..b7d9e4003b83 100644
--- a/table/sst_file_dumper.h
+++ b/table/sst_file_dumper.h
@@ -65,7 +65,6 @@ class SstFileDumper {
                                       std::chrono::microseconds* read_time);
 
   Status SetTableOptionsByMagicNumber(uint64_t table_magic_number);
-  Status SetOldTableOptions();
 
   // Helper function to call the factory with settings specific to the
   // factory implementation

From d3817f058db613868d4879f6d3fd6ae8f4134030 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Tue, 17 Feb 2026 23:33:39 -0800
Subject: [PATCH 488/500] Remove deprecated DB::Open raw pointer variants (and
 more) (#14335)

Summary:
and remove deprecated DB::MaxMemCompactionLevel(). In the process of pushing through a relatively clean refactoring of uses of the old functions, some other minor public APIs are also migrated from raw DB pointers to unique_ptr.

Claude did pretty much all the work, but requiring dozens of prompts to actually push through relatively clean phase out of raw DB pointers from what needed to be touched, and leaving that code in better shape. (Hundreds of `DB*` still remain all over the place even outside C and Java bindings.)

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14335

Test Plan: existing tests; no functional changes intended

Reviewed By: xingbowang, mszeszko-meta

Differential Revision: D93523820

Pulled By: pdillinger

fbshipit-source-id: e4ca22ad81cd2cfe91122d7507d7ca34fe03d043
---
 cache/lru_cache_test.cc                       |  22 +--
 db/c.cc                                       |  58 +++---
 db/column_family_test.cc                      |  15 +-
 db/compact_files_test.cc                      |  78 ++++----
 db/compaction/compaction_job_stats_test.cc    |  14 +-
 db/compaction/compaction_service_test.cc      |   5 +-
 db/compaction/tiered_compaction_test.cc       |   8 +-
 db/comparator_db_test.cc                      |  14 +-
 db/corruption_test.cc                         | 148 ++++++--------
 db/cuckoo_table_db_test.cc                    |  22 +--
 db/db_basic_test.cc                           |  73 ++++---
 db/db_block_cache_test.cc                     |   4 +-
 db/db_bloom_filter_test.cc                    |   2 +-
 db/db_compaction_test.cc                      |  28 +--
 db/db_flush_test.cc                           |  11 +-
 db/db_impl/db_impl.h                          |   2 -
 db/db_impl/db_impl_compaction_flush.cc        |   4 -
 db/db_impl/db_impl_open.cc                    |   8 +-
 db/db_impl/db_impl_secondary.cc               |   9 +-
 db/db_iterator_test.cc                        |   2 +-
 db/db_logical_block_size_cache_test.cc        |  45 +++--
 db/db_merge_operator_test.cc                  |   8 +-
 db/db_properties_test.cc                      |   6 +-
 db/db_range_del_test.cc                       |   2 +-
 db/db_readonly_with_timestamp_test.cc         |   8 +-
 db/db_secondary_test.cc                       |  29 ++-
 db/db_statistics_test.cc                      |   5 +-
 db/db_table_properties_test.cc                |   6 +-
 db/db_test.cc                                 | 143 ++++++++------
 db/db_test2.cc                                | 160 ++++++++--------
 db/db_test_util.cc                            |  21 +-
 db/db_test_util.h                             |   4 +-
 db/db_wal_test.cc                             |  62 +++---
 db/db_with_timestamp_basic_test.cc            |  10 +-
 db/db_write_buffer_manager_test.cc            |  40 ++--
 db/error_handler_fs_test.cc                   |  26 ++-
 db/external_sst_file_basic_test.cc            |   2 +-
 db/external_sst_file_test.cc                  |  23 +--
 db/fault_injection_test.cc                    |  10 +-
 db/forward_iterator_bench.cc                  |  11 +-
 db/import_column_family_test.cc               |  34 ++--
 db/listener_test.cc                           |  33 ++--
 db/manual_compaction_test.cc                  |  12 +-
 db/memtable_list_test.cc                      |   7 +-
 db/merge_test.cc                              |  72 +++----
 db/obsolete_files_test.cc                     |   3 +-
 db/options_file_test.cc                       |   8 +-
 db/perf_context_test.cc                       |  20 +-
 db/periodic_task_scheduler_test.cc            |  12 +-
 db/plain_table_db_test.cc                     |  27 +--
 db/prefix_test.cc                             |  12 +-
 db/seqno_time_test.cc                         |  10 +-
 db/wide/db_wide_basic_test.cc                 |  10 +-
 db/write_callback_test.cc                     |   6 +-
 db_stress_tool/cf_consistency_stress.cc       |   2 +-
 db_stress_tool/db_stress_test_base.cc         |  43 +++--
 db_stress_tool/db_stress_test_base.h          |   3 +-
 db_stress_tool/no_batched_ops_stress.cc       |   5 +-
 env/env_test.cc                               |   4 +-
 examples/column_families_example.cc           |   7 +-
 examples/compact_files_example.cc             |  11 +-
 examples/compaction_filter_example.cc         |   5 +-
 examples/multi_processes_example.cc           |  21 +-
 examples/options_file_example.cc              |   7 +-
 examples/rocksdb_backup_restore_example.cc    |  11 +-
 examples/simple_example.cc                    |   5 +-
 fuzz/db_fuzzer.cc                             |  12 +-
 fuzz/db_map_fuzzer.cc                         |   7 +-
 include/rocksdb/db.h                          | 101 +---------
 include/rocksdb/tool_hooks.h                  |  26 ++-
 include/rocksdb/utilities/db_ttl.h            |   2 +-
 include/rocksdb/utilities/ldb_cmd.h           |   3 +-
 include/rocksdb/utilities/memory_util.h       |   6 +-
 .../utilities/optimistic_transaction_db.h     |   3 +-
 include/rocksdb/utilities/stackable_db.h      |   5 -
 java/rocksjni/rocksjni.cc                     |  64 +++----
 logging/auto_roll_logger_test.cc              |   3 +-
 memory/memory_allocator_test.cc               |   4 +-
 microbench/db_basic_bench.cc                  |  16 +-
 table/sst_file_reader_test.cc                 |   4 +-
 table/table_reader_bench.cc                   |   5 +-
 table/table_test.cc                           |  36 ++--
 tools/db_bench_tool.cc                        |  58 ++++--
 tools/db_repl_stress.cc                       |   4 +-
 tools/db_sanity_test.cc                       |   6 +-
 tools/dump/db_dump_tool.cc                    |  12 +-
 tools/io_tracer_parser_test.cc                |   5 +-
 tools/ldb_cmd.cc                              |  27 ++-
 tools/ldb_cmd_test.cc                         |  88 ++++-----
 tools/reduce_levels_test.cc                   |  11 +-
 tools/tool_hooks.cc                           |  11 +-
 tools/trace_analyzer_test.cc                  |   8 +-
 tools/write_stress.cc                         |   4 +-
 .../remove_raw_ptr_db_open.md                 |   2 +
 util/slice_transform_test.cc                  |  12 +-
 utilities/backup/backup_engine_test.cc        | 139 +++++++-------
 utilities/blob_db/blob_db_impl.cc             |  12 +-
 utilities/blob_db/blob_db_test.cc             |  10 +-
 .../cassandra/cassandra_functional_test.cc    |  28 +--
 utilities/checkpoint/checkpoint_test.cc       | 107 +++++------
 utilities/debug.cc                            |   3 +-
 utilities/memory/memory_test.cc               |  30 ++-
 utilities/memory/memory_util.cc               |  19 +-
 .../string_append/stringappend_test.cc        |  50 ++---
 .../option_change_migration.cc                |   7 +-
 utilities/options/options_util_test.cc        |  34 ++--
 .../optimistic_transaction_db_impl.cc         |   4 +-
 .../optimistic_transaction_db_impl.h          |  17 +-
 utilities/transactions/transaction_test.cc    |   6 +-
 .../write_prepared_transaction_test.cc        |   3 +-
 utilities/ttl/db_ttl_impl.cc                  |   7 +-
 utilities/ttl/db_ttl_impl.h                   |   2 +-
 utilities/ttl/ttl_test.cc                     |   4 +-
 .../write_batch_with_index_test.cc            | 181 +++++++++---------
 114 files changed, 1299 insertions(+), 1447 deletions(-)
 create mode 100644 unreleased_history/public_api_changes/remove_raw_ptr_db_open.md

diff --git a/cache/lru_cache_test.cc b/cache/lru_cache_test.cc
index efdef44bac0b..c9b4393dd274 100644
--- a/cache/lru_cache_test.cc
+++ b/cache/lru_cache_test.cc
@@ -2179,7 +2179,7 @@ TEST_P(DBSecondaryCacheTest, LRUCacheDumpLoadBasic) {
                             &cache_dumper);
   ASSERT_OK(s);
   std::vector<DB*> db_list;
-  db_list.push_back(db_);
+  db_list.push_back(db_.get());
   s = cache_dumper->SetDumpFilter(db_list);
   ASSERT_OK(s);
   s = cache_dumper->DumpCacheEntriesToWriter();
@@ -2263,11 +2263,11 @@ TEST_P(DBSecondaryCacheTest, LRUCacheDumpLoadWithFilter) {
   options.env = fault_env_.get();
   std::string dbname1 = test::PerThreadDBPath("db_1");
   ASSERT_OK(DestroyDB(dbname1, options));
-  DB* db1 = nullptr;
+  std::unique_ptr<DB> db1;
   ASSERT_OK(DB::Open(options, dbname1, &db1));
   std::string dbname2 = test::PerThreadDBPath("db_2");
   ASSERT_OK(DestroyDB(dbname2, options));
-  DB* db2 = nullptr;
+  std::unique_ptr<DB> db2;
   ASSERT_OK(DB::Open(options, dbname2, &db2));
   fault_fs_->SetFailGetUniqueId(true);
 
@@ -2335,7 +2335,7 @@ TEST_P(DBSecondaryCacheTest, LRUCacheDumpLoadWithFilter) {
                             &cache_dumper);
   ASSERT_OK(s);
   std::vector<DB*> db_list;
-  db_list.push_back(db1);
+  db_list.push_back(db1.get());
   s = cache_dumper->SetDumpFilter(db_list);
   ASSERT_OK(s);
   s = cache_dumper->DumpCacheEntriesToWriter();
@@ -2377,7 +2377,7 @@ TEST_P(DBSecondaryCacheTest, LRUCacheDumpLoadWithFilter) {
   ASSERT_OK(s);
 
   ASSERT_OK(db1->Close());
-  delete db1;
+  db1.reset();
   ASSERT_OK(DB::Open(options, dbname1, &db1));
 
   // After load, we do the Get again. To validate the cache, we do not allow any
@@ -2406,8 +2406,8 @@ TEST_P(DBSecondaryCacheTest, LRUCacheDumpLoadWithFilter) {
   ASSERT_EQ(256, static_cast<int>(block_lookup));
   fault_fs_->SetFailGetUniqueId(false);
   fault_fs_->SetFilesystemActive(true);
-  delete db1;
-  delete db2;
+  db1.reset();
+  db2.reset();
   ASSERT_OK(DestroyDB(dbname1, options));
   ASSERT_OK(DestroyDB(dbname2, options));
 }
@@ -2619,11 +2619,11 @@ TEST_P(DBSecondaryCacheTest, TestSecondaryCacheOptionTwoDB) {
   options.paranoid_file_checks = true;
   std::string dbname1 = test::PerThreadDBPath("db_t_1");
   ASSERT_OK(DestroyDB(dbname1, options));
-  DB* db1 = nullptr;
+  std::unique_ptr<DB> db1;
   ASSERT_OK(DB::Open(options, dbname1, &db1));
   std::string dbname2 = test::PerThreadDBPath("db_t_2");
   ASSERT_OK(DestroyDB(dbname2, options));
-  DB* db2 = nullptr;
+  std::unique_ptr<DB> db2;
   Options options2 = options;
   options2.lowest_used_cache_tier = CacheTier::kVolatileTier;
   ASSERT_OK(DB::Open(options2, dbname2, &db2));
@@ -2700,8 +2700,8 @@ TEST_P(DBSecondaryCacheTest, TestSecondaryCacheOptionTwoDB) {
 
   fault_fs_->SetFailGetUniqueId(false);
   fault_fs_->SetFilesystemActive(true);
-  delete db1;
-  delete db2;
+  db1.reset();
+  db2.reset();
   ASSERT_OK(DestroyDB(dbname1, options));
   ASSERT_OK(DestroyDB(dbname2, options));
 }
diff --git a/db/c.cc b/db/c.cc
index ad029be3c425..947b538f1ea8 100644
--- a/db/c.cc
+++ b/db/c.cc
@@ -11,6 +11,7 @@
 
 #include <cstdlib>
 #include <map>
+#include <memory>
 #include <unordered_set>
 #include <vector>
 
@@ -1222,12 +1223,12 @@ char* rocksdb_open_and_compact_with_options(
 
 rocksdb_t* rocksdb_open(const rocksdb_options_t* options, const char* name,
                         char** errptr) {
-  DB* db;
-  if (SaveError(errptr, DB::Open(options->rep, std::string(name), &db))) {
+  std::unique_ptr<DB> dbptr;
+  if (SaveError(errptr, DB::Open(options->rep, std::string(name), &dbptr))) {
     return nullptr;
   }
   rocksdb_t* result = new rocksdb_t;
-  result->rep = db;
+  result->rep = dbptr.release();
   return result;
 }
 
@@ -1247,13 +1248,14 @@ rocksdb_t* rocksdb_open_for_read_only(const rocksdb_options_t* options,
                                       const char* name,
                                       unsigned char error_if_wal_file_exists,
                                       char** errptr) {
-  DB* db;
-  if (SaveError(errptr, DB::OpenForReadOnly(options->rep, std::string(name),
-                                            &db, error_if_wal_file_exists))) {
+  std::unique_ptr<DB> dbptr;
+  if (SaveError(errptr,
+                DB::OpenForReadOnly(options->rep, std::string(name), &dbptr,
+                                    error_if_wal_file_exists))) {
     return nullptr;
   }
   rocksdb_t* result = new rocksdb_t;
-  result->rep = db;
+  result->rep = dbptr.release();
   return result;
 }
 
@@ -1261,14 +1263,14 @@ rocksdb_t* rocksdb_open_as_secondary(const rocksdb_options_t* options,
                                      const char* name,
                                      const char* secondary_path,
                                      char** errptr) {
-  DB* db;
+  std::unique_ptr<DB> dbptr;
   if (SaveError(errptr,
                 DB::OpenAsSecondary(options->rep, std::string(name),
-                                    std::string(secondary_path), &db))) {
+                                    std::string(secondary_path), &dbptr))) {
     return nullptr;
   }
   rocksdb_t* result = new rocksdb_t;
-  result->rep = db;
+  result->rep = dbptr.release();
   return result;
 }
 
@@ -1582,11 +1584,11 @@ rocksdb_t* rocksdb_open_and_trim_history(
 
   std::string trim_ts_(trim_ts, trim_tslen);
 
-  DB* db;
+  std::unique_ptr<DB> dbptr;
   std::vector<ColumnFamilyHandle*> handles;
   if (SaveError(errptr, DB::OpenAndTrimHistory(
                             DBOptions(db_options->rep), std::string(name),
-                            column_families, &handles, &db, trim_ts_))) {
+                            column_families, &handles, &dbptr, trim_ts_))) {
     return nullptr;
   }
 
@@ -1598,7 +1600,7 @@ rocksdb_t* rocksdb_open_and_trim_history(
     column_family_handles[i] = c_handle;
   }
   rocksdb_t* result = new rocksdb_t;
-  result->rep = db;
+  result->rep = dbptr.release();
   return result;
 }
 
@@ -1614,10 +1616,10 @@ rocksdb_t* rocksdb_open_column_families(
         ColumnFamilyOptions(column_family_options[i]->rep));
   }
 
-  DB* db;
+  std::unique_ptr<DB> dbptr;
   std::vector<ColumnFamilyHandle*> handles;
   if (SaveError(errptr, DB::Open(DBOptions(db_options->rep), std::string(name),
-                                 column_families, &handles, &db))) {
+                                 column_families, &handles, &dbptr))) {
     return nullptr;
   }
 
@@ -1629,7 +1631,7 @@ rocksdb_t* rocksdb_open_column_families(
     column_family_handles[i] = c_handle;
   }
   rocksdb_t* result = new rocksdb_t;
-  result->rep = db;
+  result->rep = dbptr.release();
   return result;
 }
 
@@ -1682,12 +1684,12 @@ rocksdb_t* rocksdb_open_for_read_only_column_families(
         ColumnFamilyOptions(column_family_options[i]->rep));
   }
 
-  DB* db;
+  std::unique_ptr<DB> dbptr;
   std::vector<ColumnFamilyHandle*> handles;
-  if (SaveError(errptr,
-                DB::OpenForReadOnly(DBOptions(db_options->rep),
-                                    std::string(name), column_families,
-                                    &handles, &db, error_if_wal_file_exists))) {
+  if (SaveError(errptr, DB::OpenForReadOnly(DBOptions(db_options->rep),
+                                            std::string(name), column_families,
+                                            &handles, &dbptr,
+                                            error_if_wal_file_exists))) {
     return nullptr;
   }
 
@@ -1699,7 +1701,7 @@ rocksdb_t* rocksdb_open_for_read_only_column_families(
     column_family_handles[i] = c_handle;
   }
   rocksdb_t* result = new rocksdb_t;
-  result->rep = db;
+  result->rep = dbptr.release();
   return result;
 }
 
@@ -1715,12 +1717,12 @@ rocksdb_t* rocksdb_open_as_secondary_column_families(
         std::string(column_family_names[i]),
         ColumnFamilyOptions(column_family_options[i]->rep));
   }
-  DB* db;
+  std::unique_ptr<DB> dbptr;
   std::vector<ColumnFamilyHandle*> handles;
-  if (SaveError(errptr, DB::OpenAsSecondary(DBOptions(db_options->rep),
-                                            std::string(name),
-                                            std::string(secondary_path),
-                                            column_families, &handles, &db))) {
+  if (SaveError(errptr, DB::OpenAsSecondary(
+                            DBOptions(db_options->rep), std::string(name),
+                            std::string(secondary_path), column_families,
+                            &handles, &dbptr))) {
     return nullptr;
   }
   for (size_t i = 0; i != handles.size(); ++i) {
@@ -1731,7 +1733,7 @@ rocksdb_t* rocksdb_open_as_secondary_column_families(
     column_family_handles[i] = c_handle;
   }
   rocksdb_t* result = new rocksdb_t;
-  result->rep = db;
+  result->rep = dbptr.release();
   return result;
 }
 
diff --git a/db/column_family_test.cc b/db/column_family_test.cc
index 8579ab525076..7cb505179c38 100644
--- a/db/column_family_test.cc
+++ b/db/column_family_test.cc
@@ -118,8 +118,7 @@ class ColumnFamilyTestBase : public testing::Test {
 
     for (int i = 0; i < n; i++) {
       if (flush_every != 0 && i != 0 && i % flush_every == 0) {
-        DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
-        dbi->TEST_FlushMemTable();
+        dbfull()->TEST_FlushMemTable();
       }
 
       int keyi = base + i;
@@ -177,8 +176,7 @@ class ColumnFamilyTestBase : public testing::Test {
     }
     handles_.clear();
     names_.clear();
-    delete db_;
-    db_ = nullptr;
+    db_.reset();
   }
 
   Status TryOpen(std::vector<std::string> cf,
@@ -218,7 +216,7 @@ class ColumnFamilyTestBase : public testing::Test {
 
   void Open() { Open({"default"}); }
 
-  DBImpl* dbfull() { return static_cast_with_check<DBImpl>(db_); }
+  DBImpl* dbfull() { return static_cast_with_check<DBImpl>(db_.get()); }
 
   int GetProperty(int cf, std::string property) {
     std::string value;
@@ -500,7 +498,7 @@ class ColumnFamilyTestBase : public testing::Test {
   ColumnFamilyOptions column_family_options_;
   DBOptions db_options_;
   std::string dbname_;
-  DB* db_ = nullptr;
+  std::unique_ptr<DB> db_;
   EnvCounter* env_;
   std::shared_ptr<Env> env_guard_;
   Random rnd_;
@@ -3542,11 +3540,10 @@ TEST_P(ColumnFamilyTest, MultipleCFPathsTest) {
 
   // Re-open and verify the keys.
   Reopen({ColumnFamilyOptions(), cf_opt1, cf_opt2});
-  DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
   for (int cf = 1; cf != 3; ++cf) {
     ReadOptions read_options;
     read_options.readahead_size = 0;
-    auto it = dbi->NewIterator(read_options, handles_[cf]);
+    auto it = db_->NewIterator(read_options, handles_[cf]);
     for (it->SeekToFirst(); it->Valid(); it->Next()) {
       ASSERT_OK(it->status());
       Slice key(it->key());
@@ -3886,7 +3883,7 @@ TEST_F(ManualFlushSkipRetainUDTTest, FlushRemovesStaleEntries) {
       static_cast_with_check<ColumnFamilyHandleImpl>(cfh)->cfd();
   for (int version = 0; version < 100; version++) {
     if (version == 50) {
-      ASSERT_OK(static_cast_with_check<DBImpl>(db_)->TEST_SwitchMemtable(cfd));
+      ASSERT_OK(dbfull()->TEST_SwitchMemtable(cfd));
     }
     ASSERT_OK(
         Put(0, "foo", EncodeAsUint64(version), "v" + std::to_string(version)));
diff --git a/db/compact_files_test.cc b/db/compact_files_test.cc
index 730921f9680b..62669bc1bdb2 100644
--- a/db/compact_files_test.cc
+++ b/db/compact_files_test.cc
@@ -75,10 +75,9 @@ TEST_F(CompactFilesTest, L0ConflictsFiles) {
   options.level0_file_num_compaction_trigger = kLevel0Trigger;
   options.compression = kNoCompression;
 
-  DB* db = nullptr;
+  std::unique_ptr<DB> db;
   ASSERT_OK(DestroyDB(db_name_, options));
-  Status s = DB::Open(options, db_name_, &db);
-  assert(s.ok());
+  ASSERT_OK(DB::Open(options, db_name_, &db));
   assert(db);
 
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
@@ -114,7 +113,6 @@ TEST_F(CompactFilesTest, L0ConflictsFiles) {
     }
   }
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
-  delete db;
 }
 
 TEST_F(CompactFilesTest, MultipleLevel) {
@@ -128,11 +126,11 @@ TEST_F(CompactFilesTest, MultipleLevel) {
   FlushedFileCollector* collector = new FlushedFileCollector();
   options.listeners.emplace_back(collector);
 
-  DB* db = nullptr;
+  std::unique_ptr<DB> db;
   ASSERT_OK(DestroyDB(db_name_, options));
   Status s = DB::Open(options, db_name_, &db);
   ASSERT_OK(s);
-  ASSERT_NE(db, nullptr);
+  ASSERT_NE(db.get(), nullptr);
 
   // create couple files in L0, L3, L4 and L5
   for (int i = 5; i > 2; --i) {
@@ -141,7 +139,8 @@ TEST_F(CompactFilesTest, MultipleLevel) {
     ASSERT_OK(db->Flush(FlushOptions()));
     // Ensure background work is fully finished including listener callbacks
     // before accessing listener state.
-    ASSERT_OK(static_cast_with_check<DBImpl>(db)->TEST_WaitForBackgroundWork());
+    ASSERT_OK(
+        static_cast_with_check<DBImpl>(db.get())->TEST_WaitForBackgroundWork());
     auto l0_files = collector->GetFlushedFiles();
     ASSERT_OK(db->CompactFiles(CompactionOptions(), l0_files, i));
 
@@ -191,8 +190,6 @@ TEST_F(CompactFilesTest, MultipleLevel) {
   ASSERT_OK(db->CompactFiles(CompactionOptions(), files, 5));
   SyncPoint::GetInstance()->DisableProcessing();
   thread.join();
-
-  delete db;
 }
 
 TEST_F(CompactFilesTest, ObsoleteFiles) {
@@ -212,11 +209,11 @@ TEST_F(CompactFilesTest, ObsoleteFiles) {
   FlushedFileCollector* collector = new FlushedFileCollector();
   options.listeners.emplace_back(collector);
 
-  DB* db = nullptr;
+  std::unique_ptr<DB> db;
   ASSERT_OK(DestroyDB(db_name_, options));
   Status s = DB::Open(options, db_name_, &db);
   ASSERT_OK(s);
-  ASSERT_NE(db, nullptr);
+  ASSERT_NE(db.get(), nullptr);
 
   // create couple files
   for (int i = 1000; i < 2000; ++i) {
@@ -226,13 +223,12 @@ TEST_F(CompactFilesTest, ObsoleteFiles) {
 
   auto l0_files = collector->GetFlushedFiles();
   ASSERT_OK(db->CompactFiles(CompactionOptions(), l0_files, 1));
-  ASSERT_OK(static_cast_with_check<DBImpl>(db)->TEST_WaitForCompact());
+  ASSERT_OK(static_cast_with_check<DBImpl>(db.get())->TEST_WaitForCompact());
 
   // verify all compaction input files are deleted
   for (const auto& fname : l0_files) {
     ASSERT_EQ(Status::NotFound(), env_->FileExists(fname));
   }
-  delete db;
 }
 
 TEST_F(CompactFilesTest, NotCutOutputOnLevel0) {
@@ -251,10 +247,9 @@ TEST_F(CompactFilesTest, NotCutOutputOnLevel0) {
   FlushedFileCollector* collector = new FlushedFileCollector();
   options.listeners.emplace_back(collector);
 
-  DB* db = nullptr;
+  std::unique_ptr<DB> db;
   ASSERT_OK(DestroyDB(db_name_, options));
-  Status s = DB::Open(options, db_name_, &db);
-  assert(s.ok());
+  ASSERT_OK(DB::Open(options, db_name_, &db));
   assert(db);
 
   // create couple files
@@ -262,19 +257,20 @@ TEST_F(CompactFilesTest, NotCutOutputOnLevel0) {
     ASSERT_OK(db->Put(WriteOptions(), std::to_string(i),
                       std::string(1000, 'a' + (i % 26))));
   }
-  ASSERT_OK(static_cast_with_check<DBImpl>(db)->TEST_WaitForFlushMemTable());
+  ASSERT_OK(
+      static_cast_with_check<DBImpl>(db.get())->TEST_WaitForFlushMemTable());
   auto l0_files_1 = collector->GetFlushedFiles();
   collector->ClearFlushedFiles();
   for (int i = 0; i < 500; ++i) {
     ASSERT_OK(db->Put(WriteOptions(), std::to_string(i),
                       std::string(1000, 'a' + (i % 26))));
   }
-  ASSERT_OK(static_cast_with_check<DBImpl>(db)->TEST_WaitForFlushMemTable());
+  ASSERT_OK(
+      static_cast_with_check<DBImpl>(db.get())->TEST_WaitForFlushMemTable());
   auto l0_files_2 = collector->GetFlushedFiles();
   ASSERT_OK(db->CompactFiles(CompactionOptions(), l0_files_1, 0));
   ASSERT_OK(db->CompactFiles(CompactionOptions(), l0_files_2, 0));
   // no assertion failure
-  delete db;
 }
 
 TEST_F(CompactFilesTest, CapturingPendingFiles) {
@@ -289,7 +285,7 @@ TEST_F(CompactFilesTest, CapturingPendingFiles) {
   FlushedFileCollector* collector = new FlushedFileCollector();
   options.listeners.emplace_back(collector);
 
-  DB* db = nullptr;
+  std::unique_ptr<DB> db;
   ASSERT_OK(DestroyDB(db_name_, options));
   Status s = DB::Open(options, db_name_, &db);
   ASSERT_OK(s);
@@ -303,7 +299,8 @@ TEST_F(CompactFilesTest, CapturingPendingFiles) {
 
   // Ensure background work is fully finished including listener callbacks
   // before accessing listener state.
-  ASSERT_OK(static_cast_with_check<DBImpl>(db)->TEST_WaitForBackgroundWork());
+  ASSERT_OK(
+      static_cast_with_check<DBImpl>(db.get())->TEST_WaitForBackgroundWork());
   auto l0_files = collector->GetFlushedFiles();
   EXPECT_EQ(5, l0_files.size());
 
@@ -327,13 +324,12 @@ TEST_F(CompactFilesTest, CapturingPendingFiles) {
 
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
 
-  delete db;
+  db.reset();
 
   // Make sure we can reopen the DB.
   s = DB::Open(options, db_name_, &db);
   ASSERT_OK(s);
   assert(db);
-  delete db;
 }
 
 TEST_F(CompactFilesTest, CompactionFilterWithGetSv) {
@@ -365,12 +361,12 @@ TEST_F(CompactFilesTest, CompactionFilterWithGetSv) {
   options.create_if_missing = true;
   options.compaction_filter = cf.get();
 
-  DB* db = nullptr;
+  std::unique_ptr<DB> db;
   ASSERT_OK(DestroyDB(db_name_, options));
   Status s = DB::Open(options, db_name_, &db);
   ASSERT_OK(s);
 
-  cf->SetDB(db);
+  cf->SetDB(db.get());
 
   // Write one L0 file
   ASSERT_OK(db->Put(WriteOptions(), "K1", "V1"));
@@ -384,8 +380,6 @@ TEST_F(CompactFilesTest, CompactionFilterWithGetSv) {
     ASSERT_OK(
         db->CompactFiles(ROCKSDB_NAMESPACE::CompactionOptions(), {fname}, 0));
   }
-
-  delete db;
 }
 
 TEST_F(CompactFilesTest, SentinelCompressionType) {
@@ -413,7 +407,7 @@ TEST_F(CompactFilesTest, SentinelCompressionType) {
     options.create_if_missing = true;
     FlushedFileCollector* collector = new FlushedFileCollector();
     options.listeners.emplace_back(collector);
-    DB* db = nullptr;
+    std::unique_ptr<DB> db;
     ASSERT_OK(DB::Open(options, db_name_, &db));
 
     ASSERT_OK(db->Put(WriteOptions(), "key", "val"));
@@ -421,7 +415,8 @@ TEST_F(CompactFilesTest, SentinelCompressionType) {
 
     // Ensure background work is fully finished including listener callbacks
     // before accessing listener state.
-    ASSERT_OK(static_cast_with_check<DBImpl>(db)->TEST_WaitForBackgroundWork());
+    ASSERT_OK(
+        static_cast_with_check<DBImpl>(db.get())->TEST_WaitForBackgroundWork());
     auto l0_files = collector->GetFlushedFiles();
     ASSERT_EQ(1, l0_files.size());
 
@@ -437,7 +432,6 @@ TEST_F(CompactFilesTest, SentinelCompressionType) {
       // compression_name property
       ASSERT_EQ("BuiltinV2;02;", name_and_table_props.second->compression_name);
     }
-    delete db;
   }
 }
 
@@ -462,11 +456,7 @@ TEST_F(CompactFilesTest, CompressionWithBlockAlign) {
   }
 
   std::unique_ptr<DB> db;
-  {
-    DB* _db = nullptr;
-    ASSERT_OK(DB::Open(options, db_name_, &_db));
-    db.reset(_db);
-  }
+  ASSERT_OK(DB::Open(options, db_name_, &db));
 
   ASSERT_OK(db->Put(WriteOptions(), "key", "val"));
   ASSERT_OK(db->Flush(FlushOptions()));
@@ -505,7 +495,7 @@ TEST_F(CompactFilesTest, GetCompactionJobInfo) {
   FlushedFileCollector* collector = new FlushedFileCollector();
   options.listeners.emplace_back(collector);
 
-  DB* db = nullptr;
+  std::unique_ptr<DB> db;
   ASSERT_OK(DestroyDB(db_name_, options));
   Status s = DB::Open(options, db_name_, &db);
   ASSERT_OK(s);
@@ -516,7 +506,8 @@ TEST_F(CompactFilesTest, GetCompactionJobInfo) {
     ASSERT_OK(db->Put(WriteOptions(), std::to_string(i),
                       std::string(1000, 'a' + (i % 26))));
   }
-  ASSERT_OK(static_cast_with_check<DBImpl>(db)->TEST_WaitForFlushMemTable());
+  ASSERT_OK(
+      static_cast_with_check<DBImpl>(db.get())->TEST_WaitForFlushMemTable());
   auto l0_files_1 = collector->GetFlushedFiles();
   CompactionOptions co;
   co.compression = CompressionType::kLZ4Compression;
@@ -532,7 +523,6 @@ TEST_F(CompactFilesTest, GetCompactionJobInfo) {
   ASSERT_EQ(compaction_job_info.output_level, 0);
   ASSERT_OK(compaction_job_info.status);
   // no assertion failure
-  delete db;
 }
 
 // Helper function to generate zero-padded keys
@@ -548,11 +538,11 @@ TEST_F(CompactFilesTest, TrivialMoveNonOverlappingFiles) {
   options.compression = kNoCompression;
   options.level_compaction_dynamic_level_bytes = false;
 
-  DB* db = nullptr;
+  std::unique_ptr<DB> db;
   ASSERT_OK(DestroyDB(db_name_, options));
   Status s = DB::Open(options, db_name_, &db);
   ASSERT_OK(s);
-  ASSERT_NE(db, nullptr);
+  ASSERT_NE(db.get(), nullptr);
 
   // Create 3 non-overlapping files in L0
   // File 1: keys [a00-a99]
@@ -665,8 +655,6 @@ TEST_F(CompactFilesTest, TrivialMoveNonOverlappingFiles) {
     ASSERT_OK(db->Get(ReadOptions(), key, &value));
     ASSERT_EQ(value, "value_" + key);
   }
-
-  delete db;
 }
 
 TEST_F(CompactFilesTest, TrivialMoveBlockedByOverlap) {
@@ -677,11 +665,11 @@ TEST_F(CompactFilesTest, TrivialMoveBlockedByOverlap) {
   options.level_compaction_dynamic_level_bytes = false;
   options.num_levels = 7;
 
-  DB* db = nullptr;
+  std::unique_ptr<DB> db;
   ASSERT_OK(DestroyDB(db_name_, options));
   Status s = DB::Open(options, db_name_, &db);
   ASSERT_OK(s);
-  ASSERT_NE(db, nullptr);
+  ASSERT_NE(db.get(), nullptr);
 
   // Create a file in L6 with keys [m00-m99] (wide range)
   for (int i = 0; i < 100; i++) {
@@ -757,8 +745,6 @@ TEST_F(CompactFilesTest, TrivialMoveBlockedByOverlap) {
     ASSERT_OK(db->Get(ReadOptions(), key, &value));
     ASSERT_EQ(value, "updated_value_" + key);
   }
-
-  delete db;
 }
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/db/compaction/compaction_job_stats_test.cc b/db/compaction/compaction_job_stats_test.cc
index c4a05c951dfc..6a91271520d0 100644
--- a/db/compaction/compaction_job_stats_test.cc
+++ b/db/compaction/compaction_job_stats_test.cc
@@ -82,7 +82,7 @@ class CompactionJobStatsTest : public testing::Test,
   std::string dbname_;
   std::string alternative_wal_dir_;
   Env* env_;
-  DB* db_;
+  std::unique_ptr<DB> db_;
   std::vector<ColumnFamilyHandle*> handles_;
   uint32_t max_subcompactions_;
 
@@ -123,7 +123,7 @@ class CompactionJobStatsTest : public testing::Test,
   static void SetUpTestCase() {}
   static void TearDownTestCase() {}
 
-  DBImpl* dbfull() { return static_cast_with_check<DBImpl>(db_); }
+  DBImpl* dbfull() { return static_cast_with_check<DBImpl>(db_.get()); }
 
   void CreateColumnFamilies(const std::vector<std::string>& cfs,
                             const Options& options) {
@@ -162,7 +162,8 @@ class CompactionJobStatsTest : public testing::Test,
       column_families.emplace_back(cfs[i], options[i]);
     }
     DBOptions db_opts = DBOptions(options[0]);
-    return DB::Open(db_opts, dbname_, column_families, &handles_, &db_);
+    auto s = DB::Open(db_opts, dbname_, column_families, &handles_, &db_);
+    return s;
   }
 
   Status TryReopenWithColumnFamilies(const std::vector<std::string>& cfs,
@@ -179,8 +180,7 @@ class CompactionJobStatsTest : public testing::Test,
       delete h;
     }
     handles_.clear();
-    delete db_;
-    db_ = nullptr;
+    db_.reset();
   }
 
   void DestroyAndReopen(const Options& options) {
@@ -743,7 +743,7 @@ TEST_P(CompactionJobStatsTest, CompactionJobStatsTest) {
     }
 
     ASSERT_OK(Flush(1));
-    ASSERT_OK(static_cast_with_check<DBImpl>(db_)->TEST_WaitForCompact());
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
 
     stats_checker->set_verify_next_comp_io_stats(true);
     std::atomic<bool> first_prepare_write(true);
@@ -944,7 +944,7 @@ TEST_P(CompactionJobStatsTest, UniversalCompactionTest) {
        start_key += key_base) {
     MakeTableWithKeyValues(&rnd, start_key, start_key + key_base - 1, kKeySize,
                            kValueSize, key_interval, compression_ratio, 1);
-    ASSERT_OK(static_cast_with_check<DBImpl>(db_)->TEST_WaitForCompact());
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
   }
   ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 0U);
 }
diff --git a/db/compaction/compaction_service_test.cc b/db/compaction/compaction_service_test.cc
index 421663d8a0ae..f76a25092974 100644
--- a/db/compaction/compaction_service_test.cc
+++ b/db/compaction/compaction_service_test.cc
@@ -1380,8 +1380,9 @@ TEST_F(CompactionServiceTest, CancelCompactionOnPrimarySide) {
 
   // Primary DB calls CancelAllBackgroundWork() while the compaction is running
   SyncPoint::GetInstance()->SetCallBack(
-      "CompactionJob::Run():Inprogress",
-      [&](void* /*arg*/) { CancelAllBackgroundWork(db_, false /*wait*/); });
+      "CompactionJob::Run():Inprogress", [&](void* /*arg*/) {
+        CancelAllBackgroundWork(db_.get(), false /*wait*/);
+      });
 
   SyncPoint::GetInstance()->EnableProcessing();
 
diff --git a/db/compaction/tiered_compaction_test.cc b/db/compaction/tiered_compaction_test.cc
index f8e9da373394..7bd840e486d4 100644
--- a/db/compaction/tiered_compaction_test.cc
+++ b/db/compaction/tiered_compaction_test.cc
@@ -1717,8 +1717,8 @@ TEST_P(PrecludeLastLevelTest, MigrationFromPreserveTimePartial) {
   ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
 
   std::vector<KeyVersion> key_versions;
-  ASSERT_OK(GetAllKeyVersions(db_, {}, {}, std::numeric_limits<size_t>::max(),
-                              &key_versions));
+  ASSERT_OK(GetAllKeyVersions(
+      db_.get(), {}, {}, std::numeric_limits<size_t>::max(), &key_versions));
 
   // make sure there're more than 300 keys and first 100 keys are having seqno
   // zeroed out, the last 100 key seqno not zeroed out
@@ -2319,8 +2319,8 @@ TEST_P(PrecludeLastLevelTest, LastLevelOnlyCompactionPartial) {
   ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
 
   std::vector<KeyVersion> key_versions;
-  ASSERT_OK(GetAllKeyVersions(db_, {}, {}, std::numeric_limits<size_t>::max(),
-                              &key_versions));
+  ASSERT_OK(GetAllKeyVersions(
+      db_.get(), {}, {}, std::numeric_limits<size_t>::max(), &key_versions));
 
   // make sure there're more than 300 keys and first 100 keys are having seqno
   // zeroed out, the last 100 key seqno not zeroed out
diff --git a/db/comparator_db_test.cc b/db/comparator_db_test.cc
index af7355c5144d..fdd042fcd717 100644
--- a/db/comparator_db_test.cc
+++ b/db/comparator_db_test.cc
@@ -258,12 +258,12 @@ class ComparatorDBTest
  private:
   std::string dbname_;
   Env* env_;
-  DB* db_;
+  std::unique_ptr<DB> db_;
   Options last_options_;
   std::unique_ptr<const Comparator> comparator_guard;
 
  public:
-  ComparatorDBTest() : env_(Env::Default()), db_(nullptr) {
+  ComparatorDBTest() : env_(Env::Default()) {
     kTestComparator = BytewiseComparator();
     dbname_ = test::PerThreadDBPath("comparator_db_test");
     BlockBasedTableOptions toptions;
@@ -274,12 +274,12 @@ class ComparatorDBTest
   }
 
   ~ComparatorDBTest() override {
-    delete db_;
+    db_.reset();
     EXPECT_OK(DestroyDB(dbname_, last_options_));
     kTestComparator = BytewiseComparator();
   }
 
-  DB* GetDB() { return db_; }
+  DB* GetDB() { return db_.get(); }
 
   void SetOwnedComparator(const Comparator* cmp, bool owner = true) {
     if (owner) {
@@ -301,14 +301,12 @@ class ComparatorDBTest
   }
 
   void Destroy() {
-    delete db_;
-    db_ = nullptr;
+    db_.reset();
     ASSERT_OK(DestroyDB(dbname_, last_options_));
   }
 
   Status TryReopen() {
-    delete db_;
-    db_ = nullptr;
+    db_.reset();
     last_options_.create_if_missing = true;
 
     return DB::Open(last_options_, dbname_, &db_);
diff --git a/db/corruption_test.cc b/db/corruption_test.cc
index 9a7b789b2d25..448d2c9d94c0 100644
--- a/db/corruption_test.cc
+++ b/db/corruption_test.cc
@@ -73,7 +73,7 @@ class CorruptionTest : public testing::Test {
   std::string dbname_;
   std::shared_ptr<Cache> tiny_cache_;
   Options options_;
-  DB* db_;
+  std::unique_ptr<DB> db_;
 
   CorruptionTest() {
     // If LRU cache shard bit is smaller than 2 (or -1 which will automatically
@@ -105,8 +105,7 @@ class CorruptionTest : public testing::Test {
     SyncPoint::GetInstance()->DisableProcessing();
     SyncPoint::GetInstance()->LoadDependency({});
     SyncPoint::GetInstance()->ClearAllCallBacks();
-    delete db_;
-    db_ = nullptr;
+    db_.reset();
     if (getenv("KEEP_DB")) {
       fprintf(stdout, "db is still at %s\n", dbname_.c_str());
     } else {
@@ -116,14 +115,12 @@ class CorruptionTest : public testing::Test {
     }
   }
 
-  void CloseDb() {
-    delete db_;
-    db_ = nullptr;
-  }
+  void CloseDb() { db_.reset(); }
+
+  DBImpl* dbfull() { return static_cast_with_check<DBImpl>(db_.get()); }
 
   Status TryReopen(Options* options = nullptr) {
-    delete db_;
-    db_ = nullptr;
+    db_.reset();
     Options opt = (options ? *options : options_);
     if (opt.env == Options().env) {
       // If env is not overridden, replace it with ErrorEnv.
@@ -141,8 +138,7 @@ class CorruptionTest : public testing::Test {
   void Reopen(Options* options = nullptr) { ASSERT_OK(TryReopen(options)); }
 
   void RepairDB() {
-    delete db_;
-    db_ = nullptr;
+    db_.reset();
     ASSERT_OK(::ROCKSDB_NAMESPACE::RepairDB(dbname_, options_));
   }
 
@@ -151,8 +147,7 @@ class CorruptionTest : public testing::Test {
     WriteBatch batch;
     for (int i = 0; i < n; i++) {
       if (flush_every != 0 && i != 0 && i % flush_every == 0) {
-        DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
-        ASSERT_OK(dbi->TEST_FlushMemTable());
+        ASSERT_OK(dbfull()->TEST_FlushMemTable());
       }
       // if ((i % 100) == 0) fprintf(stderr, "@ %d of %d\n", i, n);
       Slice key = Key(i + start, &key_space);
@@ -436,14 +431,14 @@ TEST_F(CorruptionTest, NewFileErrorDuringWrite) {
 
 TEST_F(CorruptionTest, TableFile) {
   Build(100);
-  DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+  DBImpl* dbi = dbfull();
   ASSERT_OK(dbi->TEST_FlushMemTable());
   ASSERT_OK(dbi->TEST_CompactRange(0, nullptr, nullptr));
   ASSERT_OK(dbi->TEST_CompactRange(1, nullptr, nullptr));
 
   Corrupt(kTableFile, 100, 1);
   Check(99, 99);
-  ASSERT_NOK(dbi->VerifyChecksum());
+  ASSERT_NOK(db_->VerifyChecksum());
 }
 
 TEST_F(CorruptionTest, VerifyChecksumReadahead) {
@@ -460,14 +455,14 @@ TEST_F(CorruptionTest, VerifyChecksumReadahead) {
   Reopen(&options);
 
   Build(10000);
-  DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+  DBImpl* dbi = dbfull();
   ASSERT_OK(dbi->TEST_FlushMemTable());
   ASSERT_OK(dbi->TEST_CompactRange(0, nullptr, nullptr));
   ASSERT_OK(dbi->TEST_CompactRange(1, nullptr, nullptr));
 
   senv.count_random_reads_ = true;
   senv.random_read_counter_.Reset();
-  ASSERT_OK(dbi->VerifyChecksum());
+  ASSERT_OK(db_->VerifyChecksum());
 
   // Make sure the counter is enabled.
   ASSERT_GT(senv.random_read_counter_.Read(), 0);
@@ -480,7 +475,7 @@ TEST_F(CorruptionTest, VerifyChecksumReadahead) {
   senv.random_read_bytes_counter_ = 0;
   ReadOptions ro;
   ro.readahead_size = size_t{32 * 1024};
-  ASSERT_OK(dbi->VerifyChecksum(ro));
+  ASSERT_OK(db_->VerifyChecksum(ro));
   // The SST file is about 10MB. We set readahead size to 32KB.
   // Give 0 to 20 reads for metadata blocks, and allow real read
   // to range from 24KB to 48KB. The lower bound would be:
@@ -494,8 +489,7 @@ TEST_F(CorruptionTest, VerifyChecksumReadahead) {
   // disabled).
   options.allow_mmap_reads = true;
   Reopen(&options);
-  dbi = static_cast<DBImpl*>(db_);
-  ASSERT_OK(dbi->VerifyChecksum(ro));
+  ASSERT_OK(db_->VerifyChecksum(ro));
 
   CloseDb();
 }
@@ -508,18 +502,16 @@ TEST_F(CorruptionTest, TableFileIndexData) {
   Reopen(&options);
   // build 2 tables, flush at 5000
   Build(10000, 5000);
-  DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
-  ASSERT_OK(dbi->TEST_FlushMemTable());
+  ASSERT_OK(dbfull()->TEST_FlushMemTable());
 
   // corrupt an index block of an entire file
   Corrupt(kTableFile, -2000, 500);
   options.paranoid_checks = false;
   Reopen(&options);
-  dbi = static_cast_with_check<DBImpl>(db_);
   // one full file may be readable, since only one was corrupted
   // the other file should be fully non-readable, since index was corrupted
   Check(0, 5000, ReadOptions(true, true));
-  ASSERT_NOK(dbi->VerifyChecksum());
+  ASSERT_NOK(db_->VerifyChecksum());
 
   // In paranoid mode, the db cannot be opened due to the corrupted file.
   ASSERT_TRUE(TryReopen().IsCorruption());
@@ -527,8 +519,7 @@ TEST_F(CorruptionTest, TableFileIndexData) {
 
 TEST_F(CorruptionTest, TableFileFooterMagic) {
   Build(100);
-  DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
-  ASSERT_OK(dbi->TEST_FlushMemTable());
+  ASSERT_OK(dbfull()->TEST_FlushMemTable());
   Check(100, 100);
   // Corrupt the whole footer
   Corrupt(kTableFile, -100, 100);
@@ -543,8 +534,7 @@ TEST_F(CorruptionTest, TableFileFooterMagic) {
 
 TEST_F(CorruptionTest, TableFileFooterNotMagic) {
   Build(100);
-  DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
-  ASSERT_OK(dbi->TEST_FlushMemTable());
+  ASSERT_OK(dbfull()->TEST_FlushMemTable());
   Check(100, 100);
   // Corrupt footer except magic number
   Corrupt(kTableFile, -100, 92);
@@ -579,8 +569,7 @@ TEST_F(CorruptionTest, DBOpenWithWrongFileSize) {
     for (auto* cfh : cfhs) {
       delete cfh;
     }
-    DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
-    ASSERT_OK(dbi->TEST_FlushMemTable());
+    ASSERT_OK(dbfull()->TEST_FlushMemTable());
 
     // ********************************************
     // Corrupt the file by making the file bigger
@@ -601,7 +590,8 @@ TEST_F(CorruptionTest, DBOpenWithWrongFileSize) {
   // true
   options_.paranoid_checks = true;
   std::vector<ColumnFamilyHandle*> cfhs;
-  auto s = DB::Open(options_, dbname_, cf_descs, &cfhs, &db_);
+  Status s;
+  s = DB::Open(options_, dbname_, cf_descs, &cfhs, &db_);
   ASSERT_TRUE(s.IsCorruption());
   ASSERT_TRUE(s.ToString().find("file size mismatch") != std::string::npos);
 
@@ -626,8 +616,7 @@ TEST_F(CorruptionTest, DBOpenWithWrongFileSize) {
 
 TEST_F(CorruptionTest, TableFileWrongSize) {
   Build(100);
-  DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
-  ASSERT_OK(dbi->TEST_FlushMemTable());
+  ASSERT_OK(dbfull()->TEST_FlushMemTable());
   Check(100, 100);
 
   // ********************************************
@@ -710,12 +699,11 @@ TEST_F(CorruptionTest, SequenceNumberRecovery) {
 
 TEST_F(CorruptionTest, CorruptedDescriptor) {
   ASSERT_OK(db_->Put(WriteOptions(), "foo", "hello"));
-  DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
-  ASSERT_OK(dbi->TEST_FlushMemTable());
+  ASSERT_OK(dbfull()->TEST_FlushMemTable());
   CompactRangeOptions cro;
   cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
   ASSERT_OK(
-      dbi->CompactRange(cro, dbi->DefaultColumnFamily(), nullptr, nullptr));
+      db_->CompactRange(cro, db_->DefaultColumnFamily(), nullptr, nullptr));
 
   Corrupt(kDescriptorFile, 0, 1000);
   Status s = TryReopen();
@@ -734,7 +722,7 @@ TEST_F(CorruptionTest, CompactionInputError) {
   options.env = env_.get();
   Reopen(&options);
   Build(10);
-  DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+  DBImpl* dbi = dbfull();
   ASSERT_OK(dbi->TEST_FlushMemTable());
   ASSERT_OK(dbi->TEST_CompactRange(0, nullptr, nullptr));
   ASSERT_OK(dbi->TEST_CompactRange(1, nullptr, nullptr));
@@ -742,12 +730,12 @@ TEST_F(CorruptionTest, CompactionInputError) {
 
   Corrupt(kTableFile, 100, 1);
   Check(9, 9);
-  ASSERT_NOK(dbi->VerifyChecksum());
+  ASSERT_NOK(db_->VerifyChecksum());
 
   // Force compactions by writing lots of values
   Build(10000);
   Check(10000, 10000);
-  ASSERT_NOK(dbi->VerifyChecksum());
+  ASSERT_NOK(db_->VerifyChecksum());
 }
 
 TEST_F(CorruptionTest, CompactionInputErrorParanoid) {
@@ -758,14 +746,14 @@ TEST_F(CorruptionTest, CompactionInputErrorParanoid) {
   options.write_buffer_size = 131072;
   options.max_write_buffer_number = 2;
   Reopen(&options);
-  DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+  DBImpl* dbi = dbfull();
 
   // Fill levels >= 1
-  for (int level = 1; level < dbi->NumberLevels(); level++) {
-    ASSERT_OK(dbi->Put(WriteOptions(), "", "begin"));
-    ASSERT_OK(dbi->Put(WriteOptions(), "~", "end"));
+  for (int level = 1; level < db_->NumberLevels(); level++) {
+    ASSERT_OK(db_->Put(WriteOptions(), "", "begin"));
+    ASSERT_OK(db_->Put(WriteOptions(), "~", "end"));
     ASSERT_OK(dbi->TEST_FlushMemTable());
-    for (int comp_level = 0; comp_level < dbi->NumberLevels() - level;
+    for (int comp_level = 0; comp_level < db_->NumberLevels() - level;
          ++comp_level) {
       ASSERT_OK(dbi->TEST_CompactRange(comp_level, nullptr, nullptr));
     }
@@ -773,7 +761,7 @@ TEST_F(CorruptionTest, CompactionInputErrorParanoid) {
 
   Reopen(&options);
 
-  dbi = static_cast_with_check<DBImpl>(db_);
+  dbi = dbfull();
   Build(10);
   ASSERT_OK(dbi->TEST_FlushMemTable());
   ASSERT_OK(dbi->TEST_WaitForCompact());
@@ -781,7 +769,7 @@ TEST_F(CorruptionTest, CompactionInputErrorParanoid) {
 
   CorruptTableFileAtLevel(0, 100, 1);
   Check(9, 9);
-  ASSERT_NOK(dbi->VerifyChecksum());
+  ASSERT_NOK(db_->VerifyChecksum());
 
   // Write must eventually fail because of corrupted table
   Status s;
@@ -800,17 +788,16 @@ TEST_F(CorruptionTest, CompactionInputErrorParanoid) {
 
 TEST_F(CorruptionTest, UnrelatedKeys) {
   Build(10);
-  DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
-  ASSERT_OK(dbi->TEST_FlushMemTable());
+  ASSERT_OK(dbfull()->TEST_FlushMemTable());
   Corrupt(kTableFile, 100, 1);
-  ASSERT_NOK(dbi->VerifyChecksum());
+  ASSERT_NOK(db_->VerifyChecksum());
 
   std::string tmp1, tmp2;
   ASSERT_OK(db_->Put(WriteOptions(), Key(1000, &tmp1), Value(1000, &tmp2)));
   std::string v;
   ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v));
   ASSERT_EQ(Value(1000, &tmp2).ToString(), v);
-  ASSERT_OK(dbi->TEST_FlushMemTable());
+  ASSERT_OK(dbfull()->TEST_FlushMemTable());
   ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v));
   ASSERT_EQ(Value(1000, &tmp2).ToString(), v);
 }
@@ -857,14 +844,12 @@ TEST_F(CorruptionTest, FileSystemStateCorrupted) {
     Reopen(&options);
     Build(10);
     ASSERT_OK(db_->Flush(FlushOptions()));
-    DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
     std::vector<LiveFileMetaData> metadata;
-    dbi->GetLiveFilesMetaData(&metadata);
+    db_->GetLiveFilesMetaData(&metadata);
     ASSERT_GT(metadata.size(), 0);
     std::string filename = dbname_ + metadata[0].name;
 
-    delete db_;
-    db_ = nullptr;
+    db_.reset();
 
     if (iter == 0) {  // corrupt file size
       std::unique_ptr<WritableFile> file;
@@ -896,8 +881,7 @@ TEST_F(CorruptionTest, ParanoidFileChecksOnFlush) {
   options.create_if_missing = true;
   Status s;
   for (const auto& mode : corruption_modes) {
-    delete db_;
-    db_ = nullptr;
+    db_.reset();
     s = DestroyDB(dbname_, options);
     ASSERT_OK(s);
     std::shared_ptr<mock::MockTableFactory> mock =
@@ -924,8 +908,7 @@ TEST_F(CorruptionTest, ParanoidFileChecksOnCompact) {
   options.create_if_missing = true;
   Status s;
   for (const auto& mode : corruption_modes) {
-    delete db_;
-    db_ = nullptr;
+    db_.reset();
     s = DestroyDB(dbname_, options);
     ASSERT_OK(s);
     std::shared_ptr<mock::MockTableFactory> mock =
@@ -934,12 +917,11 @@ TEST_F(CorruptionTest, ParanoidFileChecksOnCompact) {
     ASSERT_OK(DB::Open(options, dbname_, &db_));
     assert(db_ != nullptr);  // suppress false clang-analyze report
     Build(100, 2);
-    DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
-    ASSERT_OK(dbi->TEST_FlushMemTable());
+    ASSERT_OK(dbfull()->TEST_FlushMemTable());
     mock->SetCorruptionMode(mode);
     CompactRangeOptions cro;
     cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
-    s = dbi->CompactRange(cro, dbi->DefaultColumnFamily(), nullptr, nullptr);
+    s = db_->CompactRange(cro, db_->DefaultColumnFamily(), nullptr, nullptr);
     if (mode == mock::MockTableFactory::kCorruptNone) {
       ASSERT_OK(s);
     } else {
@@ -955,8 +937,7 @@ TEST_F(CorruptionTest, ParanoidFileChecksWithDeleteRangeFirst) {
   options.paranoid_file_checks = true;
   options.create_if_missing = true;
   for (bool do_flush : {true, false}) {
-    delete db_;
-    db_ = nullptr;
+    db_.reset();
     ASSERT_OK(DestroyDB(dbname_, options));
     ASSERT_OK(DB::Open(options, dbname_, &db_));
     std::string start, end;
@@ -973,12 +954,11 @@ TEST_F(CorruptionTest, ParanoidFileChecksWithDeleteRangeFirst) {
     if (do_flush) {
       ASSERT_OK(db_->Flush(FlushOptions()));
     } else {
-      DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
-      ASSERT_OK(dbi->TEST_FlushMemTable());
+      ASSERT_OK(dbfull()->TEST_FlushMemTable());
       CompactRangeOptions cro;
       cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
       ASSERT_OK(
-          dbi->CompactRange(cro, dbi->DefaultColumnFamily(), nullptr, nullptr));
+          db_->CompactRange(cro, db_->DefaultColumnFamily(), nullptr, nullptr));
     }
     db_->ReleaseSnapshot(snap);
   }
@@ -991,8 +971,7 @@ TEST_F(CorruptionTest, ParanoidFileChecksWithDeleteRange) {
   options.paranoid_file_checks = true;
   options.create_if_missing = true;
   for (bool do_flush : {true, false}) {
-    delete db_;
-    db_ = nullptr;
+    db_.reset();
     ASSERT_OK(DestroyDB(dbname_, options));
     ASSERT_OK(DB::Open(options, dbname_, &db_));
     assert(db_ != nullptr);  // suppress false clang-analyze report
@@ -1012,12 +991,11 @@ TEST_F(CorruptionTest, ParanoidFileChecksWithDeleteRange) {
     if (do_flush) {
       ASSERT_OK(db_->Flush(FlushOptions()));
     } else {
-      DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
-      ASSERT_OK(dbi->TEST_FlushMemTable());
+      ASSERT_OK(dbfull()->TEST_FlushMemTable());
       CompactRangeOptions cro;
       cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
       ASSERT_OK(
-          dbi->CompactRange(cro, dbi->DefaultColumnFamily(), nullptr, nullptr));
+          db_->CompactRange(cro, db_->DefaultColumnFamily(), nullptr, nullptr));
     }
     db_->ReleaseSnapshot(snap);
   }
@@ -1030,8 +1008,7 @@ TEST_F(CorruptionTest, ParanoidFileChecksWithDeleteRangeLast) {
   options.paranoid_file_checks = true;
   options.create_if_missing = true;
   for (bool do_flush : {true, false}) {
-    delete db_;
-    db_ = nullptr;
+    db_.reset();
     ASSERT_OK(DestroyDB(dbname_, options));
     ASSERT_OK(DB::Open(options, dbname_, &db_));
     assert(db_ != nullptr);  // suppress false clang-analyze report
@@ -1048,12 +1025,11 @@ TEST_F(CorruptionTest, ParanoidFileChecksWithDeleteRangeLast) {
     if (do_flush) {
       ASSERT_OK(db_->Flush(FlushOptions()));
     } else {
-      DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
-      ASSERT_OK(dbi->TEST_FlushMemTable());
+      ASSERT_OK(dbfull()->TEST_FlushMemTable());
       CompactRangeOptions cro;
       cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
       ASSERT_OK(
-          dbi->CompactRange(cro, dbi->DefaultColumnFamily(), nullptr, nullptr));
+          db_->CompactRange(cro, db_->DefaultColumnFamily(), nullptr, nullptr));
     }
     db_->ReleaseSnapshot(snap);
   }
@@ -1066,8 +1042,7 @@ TEST_F(CorruptionTest, LogCorruptionErrorsInCompactionIterator) {
   options.create_if_missing = true;
   options.allow_data_in_errors = true;
   auto mode = mock::MockTableFactory::kCorruptKey;
-  delete db_;
-  db_ = nullptr;
+  db_.reset();
   ASSERT_OK(DestroyDB(dbname_, options));
 
   std::shared_ptr<mock::MockTableFactory> mock =
@@ -1079,12 +1054,11 @@ TEST_F(CorruptionTest, LogCorruptionErrorsInCompactionIterator) {
   assert(db_ != nullptr);  // suppress false clang-analyze report
   Build(100, 2);
 
-  DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
-  ASSERT_OK(dbi->TEST_FlushMemTable());
+  ASSERT_OK(dbfull()->TEST_FlushMemTable());
   CompactRangeOptions cro;
   cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
   Status s =
-      dbi->CompactRange(cro, dbi->DefaultColumnFamily(), nullptr, nullptr);
+      db_->CompactRange(cro, db_->DefaultColumnFamily(), nullptr, nullptr);
   ASSERT_NOK(s);
   ASSERT_TRUE(s.IsCorruption());
 }
@@ -1095,8 +1069,7 @@ TEST_F(CorruptionTest, CompactionKeyOrderCheck) {
   options.env = env_.get();
   options.paranoid_file_checks = false;
   options.create_if_missing = true;
-  delete db_;
-  db_ = nullptr;
+  db_.reset();
   ASSERT_OK(DestroyDB(dbname_, options));
   std::shared_ptr<mock::MockTableFactory> mock =
       std::make_shared<mock::MockTableFactory>();
@@ -1105,14 +1078,13 @@ TEST_F(CorruptionTest, CompactionKeyOrderCheck) {
   assert(db_ != nullptr);  // suppress false clang-analyze report
   mock->SetCorruptionMode(mock::MockTableFactory::kCorruptReorderKey);
   Build(100, 2);
-  DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
-  ASSERT_OK(dbi->TEST_FlushMemTable());
+  ASSERT_OK(dbfull()->TEST_FlushMemTable());
 
   mock->SetCorruptionMode(mock::MockTableFactory::kCorruptNone);
   CompactRangeOptions cro;
   cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
   ASSERT_NOK(
-      dbi->CompactRange(cro, dbi->DefaultColumnFamily(), nullptr, nullptr));
+      db_->CompactRange(cro, db_->DefaultColumnFamily(), nullptr, nullptr));
 }
 
 TEST_F(CorruptionTest, FlushKeyOrderCheck) {
@@ -1139,7 +1111,7 @@ TEST_F(CorruptionTest, FlushKeyOrderCheck) {
         }
       });
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
-  Status s = static_cast_with_check<DBImpl>(db_)->TEST_FlushMemTable();
+  Status s = dbfull()->TEST_FlushMemTable();
   ASSERT_NOK(s);
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
@@ -1263,7 +1235,7 @@ TEST_P(CrashDuringRecoveryWithCorruptionTest, CrashDuringRecovery) {
   // while other don't.
   {
     ASSERT_OK(DB::Open(options, dbname_, cf_descs, &handles, &db_));
-    auto* dbimpl = static_cast_with_check<DBImpl>(db_);
+    auto* dbimpl = dbfull();
     assert(dbimpl);
 
     // Write one key to test_cf.
diff --git a/db/cuckoo_table_db_test.cc b/db/cuckoo_table_db_test.cc
index 78ae86683318..1ece0e3630ab 100644
--- a/db/cuckoo_table_db_test.cc
+++ b/db/cuckoo_table_db_test.cc
@@ -21,18 +21,18 @@ class CuckooTableDBTest : public testing::Test {
  private:
   std::string dbname_;
   Env* env_;
-  DB* db_;
+  std::unique_ptr<DB> db_;
 
  public:
   CuckooTableDBTest() : env_(Env::Default()) {
     dbname_ = test::PerThreadDBPath("cuckoo_table_db_test");
     EXPECT_OK(DestroyDB(dbname_, Options()));
-    db_ = nullptr;
+    db_.reset();
     Reopen();
   }
 
   ~CuckooTableDBTest() override {
-    delete db_;
+    db_.reset();
     EXPECT_OK(DestroyDB(dbname_, Options()));
   }
 
@@ -47,12 +47,11 @@ class CuckooTableDBTest : public testing::Test {
     return options;
   }
 
-  DBImpl* dbfull() { return static_cast_with_check<DBImpl>(db_); }
+  DBImpl* dbfull() { return static_cast_with_check<DBImpl>(db_.get()); }
 
   // The following util methods are copied from plain_table_db_test.
   void Reopen(Options* options = nullptr) {
-    delete db_;
-    db_ = nullptr;
+    db_.reset();
     Options opts;
     if (options != nullptr) {
       opts = *options;
@@ -66,8 +65,7 @@ class CuckooTableDBTest : public testing::Test {
   void DestroyAndReopen(Options* options) {
     assert(options);
     ASSERT_OK(db_->Close());
-    delete db_;
-    db_ = nullptr;
+    db_.reset();
     ASSERT_OK(DestroyDB(dbname_, *options));
     Reopen(options);
   }
@@ -130,7 +128,7 @@ TEST_F(CuckooTableDBTest, Flush) {
   ASSERT_OK(dbfull()->TEST_FlushMemTable());
 
   TablePropertiesCollection ptc;
-  ASSERT_OK(static_cast<DB*>(dbfull())->GetPropertiesOfAllTables(&ptc));
+  ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&ptc));
   VerifySstUniqueIds(ptc);
   ASSERT_EQ(1U, ptc.size());
   ASSERT_EQ(3U, ptc.begin()->second->num_entries);
@@ -147,7 +145,7 @@ TEST_F(CuckooTableDBTest, Flush) {
   ASSERT_OK(Put("key6", "v6"));
   ASSERT_OK(dbfull()->TEST_FlushMemTable());
 
-  ASSERT_OK(static_cast<DB*>(dbfull())->GetPropertiesOfAllTables(&ptc));
+  ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&ptc));
   VerifySstUniqueIds(ptc);
   ASSERT_EQ(2U, ptc.size());
   auto row = ptc.begin();
@@ -165,7 +163,7 @@ TEST_F(CuckooTableDBTest, Flush) {
   ASSERT_OK(Delete("key5"));
   ASSERT_OK(Delete("key4"));
   ASSERT_OK(dbfull()->TEST_FlushMemTable());
-  ASSERT_OK(static_cast<DB*>(dbfull())->GetPropertiesOfAllTables(&ptc));
+  ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&ptc));
   VerifySstUniqueIds(ptc);
   ASSERT_EQ(3U, ptc.size());
   row = ptc.begin();
@@ -190,7 +188,7 @@ TEST_F(CuckooTableDBTest, FlushWithDuplicateKeys) {
   ASSERT_OK(dbfull()->TEST_FlushMemTable());
 
   TablePropertiesCollection ptc;
-  ASSERT_OK(static_cast<DB*>(dbfull())->GetPropertiesOfAllTables(&ptc));
+  ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&ptc));
   VerifySstUniqueIds(ptc);
   ASSERT_EQ(1U, ptc.size());
   ASSERT_EQ(2U, ptc.begin()->second->num_entries);
diff --git a/db/db_basic_test.cc b/db/db_basic_test.cc
index 0ff1295e9120..71bf37f197fe 100644
--- a/db/db_basic_test.cc
+++ b/db/db_basic_test.cc
@@ -91,17 +91,15 @@ class DBBasicTest : public DBTestBase {
 TEST_F(DBBasicTest, OpenWhenOpen) {
   Options options = CurrentOptions();
   options.env = env_;
-  DB* db2 = nullptr;
+  std::unique_ptr<DB> db2;
   Status s = DB::Open(options, dbname_, &db2);
-  ASSERT_NOK(s) << [db2]() {
-    delete db2;
+  ASSERT_NOK(s) << [&db2]() {
+    db2.reset();
     return "db2 open: ok";
   }();
   ASSERT_EQ(Status::Code::kIOError, s.code());
   ASSERT_EQ(Status::SubCode::kNone, s.subcode());
   ASSERT_TRUE(strstr(s.getState(), "lock ") != nullptr);
-
-  delete db2;
 }
 
 TEST_F(DBBasicTest, EnableDirectIOWithZeroBuf) {
@@ -586,14 +584,14 @@ TEST_F(DBBasicTest, GetSnapshot) {
 
 TEST_F(DBBasicTest, CheckLock) {
   do {
-    DB* localdb = nullptr;
+    std::unique_ptr<DB> localdb;
     Options options = CurrentOptions();
     ASSERT_OK(TryReopen(options));
 
     // second open should fail
     Status s = DB::Open(options, dbname_, &localdb);
-    ASSERT_NOK(s) << [localdb]() {
-      delete localdb;
+    ASSERT_NOK(s) << [&localdb]() {
+      localdb.reset();
       return "localdb open: ok";
     }();
 #ifdef OS_LINUX
@@ -862,7 +860,7 @@ TEST_F(DBBasicTest, Snapshot) {
     ASSERT_OK(Put(1, "foo", "1v3"));
 
     {
-      ManagedSnapshot s3(db_);
+      ManagedSnapshot s3(db_.get());
       ASSERT_EQ(3U, GetNumSnapshots());
       ASSERT_EQ(time_snap1, GetTimeOldestSnapshots());
       ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber());
@@ -985,7 +983,7 @@ TEST_F(DBBasicTest, DBOpen_Options) {
   Destroy(options);
 
   // Does not exist, and create_if_missing == false: error
-  DB* db = nullptr;
+  std::unique_ptr<DB> db;
   options.create_if_missing = false;
   Status s = DB::Open(options, dbname_, &db);
   ASSERT_TRUE(strstr(s.ToString().c_str(), "does not exist") != nullptr);
@@ -997,8 +995,7 @@ TEST_F(DBBasicTest, DBOpen_Options) {
   ASSERT_OK(s);
   ASSERT_TRUE(db != nullptr);
 
-  delete db;
-  db = nullptr;
+  db.reset();
 
   // Does exist, and error_if_exists == true: error
   options.create_if_missing = false;
@@ -1014,8 +1011,7 @@ TEST_F(DBBasicTest, DBOpen_Options) {
   ASSERT_OK(s);
   ASSERT_TRUE(db != nullptr);
 
-  delete db;
-  db = nullptr;
+  db.reset();
 }
 
 TEST_F(DBBasicTest, CompactOnFlush) {
@@ -1320,7 +1316,7 @@ TEST_F(DBBasicTest, DBClose) {
   std::string dbname = test::PerThreadDBPath("db_close_test");
   ASSERT_OK(DestroyDB(dbname, options));
 
-  DB* db = nullptr;
+  std::unique_ptr<DB> db;
   TestEnv* env = new TestEnv(env_);
   std::unique_ptr<TestEnv> local_env_guard(env);
   options.create_if_missing = true;
@@ -1333,14 +1329,14 @@ TEST_F(DBBasicTest, DBClose) {
   ASSERT_EQ(env->GetCloseCount(), 1);
   ASSERT_EQ(s, Status::IOError());
 
-  delete db;
+  db.reset();
   ASSERT_EQ(env->GetCloseCount(), 1);
 
   // Do not call DB::Close() and ensure our logger Close() still gets called
   s = DB::Open(options, dbname, &db);
   ASSERT_OK(s);
   ASSERT_TRUE(db != nullptr);
-  delete db;
+  db.reset();
   ASSERT_EQ(env->GetCloseCount(), 2);
 
   // close by WaitForCompact() with close_db option
@@ -1355,7 +1351,7 @@ TEST_F(DBBasicTest, DBClose) {
   // see TestLogger::CloseHelper()
   ASSERT_EQ(s, Status::IOError());
 
-  delete db;
+  db.reset();
   ASSERT_EQ(env->GetCloseCount(), 3);
 
   // Provide our own logger and ensure DB::Close() does not close it
@@ -1366,7 +1362,7 @@ TEST_F(DBBasicTest, DBClose) {
 
   s = db->Close();
   ASSERT_EQ(s, Status::OK());
-  delete db;
+  db.reset();
   ASSERT_EQ(env->GetCloseCount(), 3);
   options.info_log.reset();
   ASSERT_EQ(env->GetCloseCount(), 4);
@@ -1384,7 +1380,7 @@ TEST_F(DBBasicTest, DBCloseAllDirectoryFDs) {
 
   ASSERT_OK(DestroyDB(dbname, options));
 
-  DB* db = nullptr;
+  std::unique_ptr<DB> db;
   std::unique_ptr<Env> env = NewCompositeEnv(
       std::make_shared<CountedFileSystem>(FileSystem::Default()));
   options.create_if_missing = true;
@@ -1402,7 +1398,7 @@ TEST_F(DBBasicTest, DBCloseAllDirectoryFDs) {
   ASSERT_EQ(counted_fs->counters()->dir_opens,
             counted_fs->counters()->dir_closes);
   ASSERT_OK(s);
-  delete db;
+  db.reset();
 }
 
 TEST_F(DBBasicTest, DBCloseFlushError) {
@@ -1464,7 +1460,7 @@ TEST_P(DBMultiGetTestWithParam, MultiGetMultiCF) {
   }
 
   int get_sv_count = 0;
-  ROCKSDB_NAMESPACE::DBImpl* db = static_cast_with_check<DBImpl>(db_);
+  ROCKSDB_NAMESPACE::DBImpl* db = dbfull();
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
       "DBImpl::MultiCFSnapshot::AfterRefSV", [&](void* /*arg*/) {
         if (++get_sv_count == 2) {
@@ -1536,10 +1532,9 @@ TEST_P(DBMultiGetTestWithParam, MultiGetMultiCF) {
   ASSERT_EQ(values[2], std::get<2>(cf_kv_vec[1]) + "_2");
 
   for (int cf = 0; cf < 8; ++cf) {
-    auto* cfd =
-        static_cast_with_check<ColumnFamilyHandleImpl>(
-            static_cast_with_check<DBImpl>(db_)->GetColumnFamilyHandle(cf))
-            ->cfd();
+    auto* cfd = static_cast_with_check<ColumnFamilyHandleImpl>(
+                    dbfull()->GetColumnFamilyHandle(cf))
+                    ->cfd();
     ASSERT_NE(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVInUse);
     ASSERT_NE(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVObsolete);
   }
@@ -1625,10 +1620,9 @@ TEST_P(DBMultiGetTestWithParam, MultiGetMultiCFMutex) {
               "cf" + std::to_string(j) + "_val" + std::to_string(retries));
   }
   for (int i = 0; i < 8; ++i) {
-    auto* cfd =
-        static_cast_with_check<ColumnFamilyHandleImpl>(
-            static_cast_with_check<DBImpl>(db_)->GetColumnFamilyHandle(i))
-            ->cfd();
+    auto* cfd = static_cast_with_check<ColumnFamilyHandleImpl>(
+                    dbfull()->GetColumnFamilyHandle(i))
+                    ->cfd();
     ASSERT_NE(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVInUse);
   }
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
@@ -1652,7 +1646,7 @@ TEST_P(DBMultiGetTestWithParam, MultiGetMultiCFSnapshot) {
   }
 
   int get_sv_count = 0;
-  ROCKSDB_NAMESPACE::DBImpl* db = static_cast_with_check<DBImpl>(db_);
+  ROCKSDB_NAMESPACE::DBImpl* db = dbfull();
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
       "DBImpl::MultiCFSnapshot::AfterRefSV", [&](void* /*arg*/) {
         if (++get_sv_count == 2) {
@@ -1693,10 +1687,9 @@ TEST_P(DBMultiGetTestWithParam, MultiGetMultiCFSnapshot) {
     ASSERT_EQ(values[j], "cf" + std::to_string(j) + "_val");
   }
   for (int i = 0; i < 8; ++i) {
-    auto* cfd =
-        static_cast_with_check<ColumnFamilyHandleImpl>(
-            static_cast_with_check<DBImpl>(db_)->GetColumnFamilyHandle(i))
-            ->cfd();
+    auto* cfd = static_cast_with_check<ColumnFamilyHandleImpl>(
+                    dbfull()->GetColumnFamilyHandle(i))
+                    ->cfd();
     ASSERT_NE(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVInUse);
   }
 }
@@ -3301,8 +3294,8 @@ TEST_F(DBBasicTest, GetAllKeyVersions) {
     ASSERT_OK(Delete(std::to_string(i)));
   }
   std::vector<KeyVersion> key_versions;
-  ASSERT_OK(GetAllKeyVersions(db_, {}, {}, std::numeric_limits<size_t>::max(),
-                              &key_versions));
+  ASSERT_OK(GetAllKeyVersions(
+      db_.get(), {}, {}, std::numeric_limits<size_t>::max(), &key_versions));
   ASSERT_EQ(kNumInserts + kNumDeletes + kNumUpdates, key_versions.size());
   for (size_t i = 0; i < kNumInserts + kNumDeletes + kNumUpdates; i++) {
     if (i % 3 == 0) {
@@ -3311,7 +3304,7 @@ TEST_F(DBBasicTest, GetAllKeyVersions) {
       ASSERT_EQ(key_versions[i].GetTypeName(), "TypeValue");
     }
   }
-  ASSERT_OK(GetAllKeyVersions(db_, handles_[0], {}, {},
+  ASSERT_OK(GetAllKeyVersions(db_.get(), handles_[0], {}, {},
                               std::numeric_limits<size_t>::max(),
                               &key_versions));
   ASSERT_EQ(kNumInserts + kNumDeletes + kNumUpdates, key_versions.size());
@@ -3326,14 +3319,14 @@ TEST_F(DBBasicTest, GetAllKeyVersions) {
   for (size_t i = 0; i + 1 != kNumDeletes; ++i) {
     ASSERT_OK(Delete(1, std::to_string(i)));
   }
-  ASSERT_OK(GetAllKeyVersions(db_, handles_[1], {}, {},
+  ASSERT_OK(GetAllKeyVersions(db_.get(), handles_[1], {}, {},
                               std::numeric_limits<size_t>::max(),
                               &key_versions));
   ASSERT_EQ(kNumInserts + kNumDeletes + kNumUpdates - 3, key_versions.size());
 
   // Change from historical behavior: empty key is now interpreted literally as
   // a legal key (rather than as a "not present" key)
-  ASSERT_OK(GetAllKeyVersions(db_, handles_[1], Slice(), Slice(),
+  ASSERT_OK(GetAllKeyVersions(db_.get(), handles_[1], Slice(), Slice(),
                               std::numeric_limits<size_t>::max(),
                               &key_versions));
   ASSERT_EQ(key_versions.size(), 0);
diff --git a/db/db_block_cache_test.cc b/db/db_block_cache_test.cc
index d712e7253fae..1433bd6014e6 100644
--- a/db/db_block_cache_test.cc
+++ b/db/db_block_cache_test.cc
@@ -1661,7 +1661,7 @@ TEST_P(DBBlockCacheKeyTest, StableCacheKeys) {
   std::string export_files_dir = dbname_ + "/exported";
   ExportImportFilesMetaData* metadata_ptr_ = nullptr;
   Checkpoint* checkpoint;
-  ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+  ASSERT_OK(Checkpoint::Create(db_.get(), &checkpoint));
   ASSERT_OK(checkpoint->ExportColumnFamily(handles_[1], export_files_dir,
                                            &metadata_ptr_));
   ASSERT_NE(metadata_ptr_, nullptr);
@@ -1698,7 +1698,7 @@ TEST_P(DBBlockCacheKeyTest, StableCacheKeys) {
   // StableCacheKeyTestFS, Checkpoint will resort to full copy not hard link.
   // (Checkpoint  not available in LITE mode to test this.)
   auto db_copy_name = dbname_ + "-copy";
-  ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+  ASSERT_OK(Checkpoint::Create(db_.get(), &checkpoint));
   ASSERT_OK(checkpoint->CreateCheckpoint(db_copy_name));
   delete checkpoint;
 
diff --git a/db/db_bloom_filter_test.cc b/db/db_bloom_filter_test.cc
index 51b259aeb8d9..fd42a06866f5 100644
--- a/db/db_bloom_filter_test.cc
+++ b/db/db_bloom_filter_test.cc
@@ -4145,7 +4145,7 @@ TEST_F(DBBloomFilterTest, SstQueryFilter) {
 
   using Keys = std::vector<std::string>;
   auto RangeQuery =
-      [factory, db = db_](
+      [factory, db = db_.get()](
           std::string lb, std::string ub,
           std::shared_ptr<SstQueryFilterConfigsManager::Factory> alt_factory =
               nullptr) {
diff --git a/db/db_compaction_test.cc b/db/db_compaction_test.cc
index 58887c2777f7..a97d3461501a 100644
--- a/db/db_compaction_test.cc
+++ b/db/db_compaction_test.cc
@@ -2208,7 +2208,8 @@ TEST_P(DBDeleteFileRangeTest, DeleteFileRange) {
   std::string end_string = Key(2000);
   Slice begin(begin_string);
   Slice end(end_string);
-  ASSERT_OK(DeleteFilesInRange(db_, db_->DefaultColumnFamily(), &begin, &end));
+  ASSERT_OK(
+      DeleteFilesInRange(db_.get(), db_->DefaultColumnFamily(), &begin, &end));
 
   int32_t deleted_count = 0;
   for (int32_t i = 0; i < 4300; i++) {
@@ -2229,8 +2230,8 @@ TEST_P(DBDeleteFileRangeTest, DeleteFileRange) {
   Slice begin1(begin_string);
   Slice end1(end_string);
   // Try deleting files in range which contain no keys
-  ASSERT_OK(
-      DeleteFilesInRange(db_, db_->DefaultColumnFamily(), &begin1, &end1));
+  ASSERT_OK(DeleteFilesInRange(db_.get(), db_->DefaultColumnFamily(), &begin1,
+                               &end1));
 
   // Push data from level 0 to level 1 to force all data to be deleted
   // Note that we don't delete level 0 files
@@ -2239,8 +2240,8 @@ TEST_P(DBDeleteFileRangeTest, DeleteFileRange) {
   ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
   ASSERT_OK(dbfull()->TEST_WaitForCompact());
 
-  ASSERT_OK(
-      DeleteFilesInRange(db_, db_->DefaultColumnFamily(), nullptr, nullptr));
+  ASSERT_OK(DeleteFilesInRange(db_.get(), db_->DefaultColumnFamily(), nullptr,
+                               nullptr));
 
   int32_t deleted_count2 = 0;
   for (int32_t i = 0; i < 4300; i++) {
@@ -2308,7 +2309,7 @@ TEST_P(DBDeleteFileRangeTest, DeleteFilesInRanges) {
     ranges.emplace_back(begin_str1, end_str1);
     ranges.emplace_back(begin_str2, end_str2);
     ranges.emplace_back(begin_str3, end_str3);
-    ASSERT_OK(DeleteFilesInRanges(db_, db_->DefaultColumnFamily(),
+    ASSERT_OK(DeleteFilesInRanges(db_.get(), db_->DefaultColumnFamily(),
                                   ranges.data(), ranges.size()));
     ASSERT_EQ("0,3,7", FilesPerLevel(0));
 
@@ -2335,7 +2336,7 @@ TEST_P(DBDeleteFileRangeTest, DeleteFilesInRanges) {
     ranges.emplace_back(&begin1, &end1);
     ranges.emplace_back(&begin2, &end2);
     ranges.emplace_back(&begin3, &end3);
-    ASSERT_OK(DeleteFilesInRanges(db_, db_->DefaultColumnFamily(),
+    ASSERT_OK(DeleteFilesInRanges(db_.get(), db_->DefaultColumnFamily(),
                                   ranges.data(), ranges.size(), false));
     ASSERT_EQ("0,1,4", FilesPerLevel(0));
 
@@ -2356,7 +2357,8 @@ TEST_P(DBDeleteFileRangeTest, DeleteFilesInRanges) {
   // Delete all files.
   {
     RangeOpt range;
-    ASSERT_OK(DeleteFilesInRanges(db_, db_->DefaultColumnFamily(), &range, 1));
+    ASSERT_OK(
+        DeleteFilesInRanges(db_.get(), db_->DefaultColumnFamily(), &range, 1));
     ASSERT_EQ("", FilesPerLevel(0));
 
     for (auto i = 0; i < 1000; i++) {
@@ -2418,7 +2420,8 @@ TEST_P(DBDeleteFileRangeTest, DeleteFileRangeFileEndpointsOverlapBug) {
   // "1 -> vals[0]" to reappear.
   std::string begin_str = Key(0), end_str = Key(1);
   Slice begin = begin_str, end = end_str;
-  ASSERT_OK(DeleteFilesInRange(db_, db_->DefaultColumnFamily(), &begin, &end));
+  ASSERT_OK(
+      DeleteFilesInRange(db_.get(), db_->DefaultColumnFamily(), &begin, &end));
   ASSERT_EQ(vals[1], GetValue(Key(1)));
 
   db_->ReleaseSnapshot(snapshot);
@@ -3657,7 +3660,7 @@ TEST_F(DBCompactionTest, SuggestCompactRangeNoTwoLevel0Compactions) {
 
   GenerateNewRandomFile(&rnd, /* nowait */ true);
   ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
-  ASSERT_OK(experimental::SuggestCompactRange(db_, nullptr, nullptr));
+  ASSERT_OK(experimental::SuggestCompactRange(db_.get(), nullptr, nullptr));
   for (int num = 0; num < options.level0_file_num_compaction_trigger + 1;
        num++) {
     GenerateNewRandomFile(&rnd, /* nowait */ true);
@@ -4533,7 +4536,8 @@ TEST_F(DBCompactionTest, DeleteFilesInRangeConflictWithCompaction) {
   std::string end_string = Key(kMaxKey + 1);
   Slice begin(begin_string);
   Slice end(end_string);
-  ASSERT_OK(DeleteFilesInRange(db_, db_->DefaultColumnFamily(), &begin, &end));
+  ASSERT_OK(
+      DeleteFilesInRange(db_.get(), db_->DefaultColumnFamily(), &begin, &end));
   SyncPoint::GetInstance()->DisableProcessing();
 }
 
@@ -8063,7 +8067,7 @@ TEST_F(DBCompactionTest, CompactFilesSupportKeyPlacementRangeConflict) {
   ASSERT_OK(Flush());
   ASSERT_OK(Put("k4", "v"));
   ASSERT_OK(Flush());
-  ASSERT_OK(experimental::PromoteL0(db_, db_->DefaultColumnFamily(), 1));
+  ASSERT_OK(experimental::PromoteL0(db_.get(), db_->DefaultColumnFamily(), 1));
   ASSERT_EQ("0,2,1", FilesPerLevel());
 
   ASSERT_OK(Put("k2", "v"));
diff --git a/db/db_flush_test.cc b/db/db_flush_test.cc
index 61f9b5757acc..e1000c576fd2 100644
--- a/db/db_flush_test.cc
+++ b/db/db_flush_test.cc
@@ -709,7 +709,7 @@ class TestFlushListener : public EventListener {
     // that assumption does not hold (see the test case MultiDBMultiListeners
     // below).
     ASSERT_TRUE(test_);
-    if (db == test_->db_) {
+    if (db == test_->db_.get()) {
       std::vector<std::vector<FileMetaData>> files_by_level;
       test_->dbfull()->TEST_GetFilesMetaData(db->DefaultColumnFamily(),
                                              &files_by_level);
@@ -2533,7 +2533,7 @@ TEST_F(DBFlushTest, TombstoneVisibleInSnapshot) {
 
   ASSERT_OK(db_->Put(WriteOptions(), "foo", "value0"));
 
-  ManagedSnapshot snapshot_guard(db_);
+  ManagedSnapshot snapshot_guard(db_.get());
 
   ColumnFamilyHandle* default_cf = db_->DefaultColumnFamily();
   ASSERT_OK(db_->Flush(FlushOptions(), default_cf));
@@ -2574,7 +2574,7 @@ TEST_P(DBAtomicFlushTest, ManualFlushUnder2PC) {
   txn_db_opts.write_policy = TxnDBWritePolicy::WRITE_COMMITTED;
   ASSERT_OK(TransactionDB::Open(options, txn_db_opts, dbname_, &txn_db));
   ASSERT_NE(txn_db, nullptr);
-  db_ = txn_db;
+  db_.reset(txn_db);
 
   // Create two more columns other than default CF.
   std::vector<std::string> cfs = {"puppy", "kitty"};
@@ -2638,9 +2638,8 @@ TEST_P(DBAtomicFlushTest, ManualFlushUnder2PC) {
   // it means atomic flush didn't write the min_log_number_to_keep to MANIFEST.
   cfs.push_back(kDefaultColumnFamilyName);
   ASSERT_OK(TryReopenWithColumnFamilies(cfs, options));
-  DBImpl* db_impl = static_cast<DBImpl*>(db_);
-  ASSERT_TRUE(db_impl->allow_2pc());
-  ASSERT_NE(db_impl->MinLogNumberToKeep(), 0);
+  ASSERT_TRUE(dbfull()->allow_2pc());
+  ASSERT_NE(dbfull()->MinLogNumberToKeep(), 0);
 }
 
 TEST_P(DBAtomicFlushTest, ManualAtomicFlush) {
diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h
index a3f25dd7788f..c3c432bec8d8 100644
--- a/db/db_impl/db_impl.h
+++ b/db/db_impl/db_impl.h
@@ -469,8 +469,6 @@ class DBImpl : public DB {
 
   using DB::NumberLevels;
   int NumberLevels(ColumnFamilyHandle* column_family) override;
-  using DB::MaxMemCompactionLevel;
-  int MaxMemCompactionLevel(ColumnFamilyHandle* column_family) override;
   using DB::Level0StopWriteTrigger;
   int Level0StopWriteTrigger(ColumnFamilyHandle* column_family) override;
   const std::string& GetName() const override;
diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc
index 9cf25f639da0..ab136b57b505 100644
--- a/db/db_impl/db_impl_compaction_flush.cc
+++ b/db/db_impl/db_impl_compaction_flush.cc
@@ -2025,10 +2025,6 @@ int DBImpl::NumberLevels(ColumnFamilyHandle* column_family) {
   return cfh->cfd()->NumberLevels();
 }
 
-int DBImpl::MaxMemCompactionLevel(ColumnFamilyHandle* /*column_family*/) {
-  return 0;
-}
-
 int DBImpl::Level0StopWriteTrigger(ColumnFamilyHandle* column_family) {
   auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
   InstrumentedMutexLock l(&mutex_);
diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc
index fb6ad5094e7a..7b2e949789fc 100644
--- a/db/db_impl/db_impl_open.cc
+++ b/db/db_impl/db_impl_open.cc
@@ -2264,7 +2264,7 @@ Status DB::OpenAndTrimHistory(
     return s;
   }
 
-  DB* db = nullptr;
+  std::unique_ptr<DB> db;
   s = DB::Open(db_options, dbname, column_families, handles, &db);
   if (!s.ok()) {
     return s;
@@ -2273,7 +2273,7 @@ Status DB::OpenAndTrimHistory(
   CompactRangeOptions options;
   options.bottommost_level_compaction =
       BottommostLevelCompaction::kForceOptimized;
-  auto db_impl = static_cast_with_check<DBImpl>(db);
+  auto db_impl = static_cast_with_check<DBImpl>(db.get());
   for (auto handle : *handles) {
     assert(handle != nullptr);
     auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(handle);
@@ -2295,14 +2295,14 @@ Status DB::OpenAndTrimHistory(
       assert(temp_s.ok());
     }
     handles->clear();
-    delete db;
+    db.reset();
   };
   if (!s.ok()) {
     clean_op();
     return s;
   }
 
-  dbptr->reset(db);
+  *dbptr = std::move(db);
   return s;
 }
 
diff --git a/db/db_impl/db_impl_secondary.cc b/db/db_impl/db_impl_secondary.cc
index b60c615f880c..f2cd4c865d1e 100644
--- a/db/db_impl/db_impl_secondary.cc
+++ b/db/db_impl/db_impl_secondary.cc
@@ -1546,7 +1546,7 @@ Status DB::OpenAndCompact(
   }
 
   // 5. Open db As Secondary
-  DB* db;
+  std::unique_ptr<DB> db;
   std::vector<ColumnFamilyHandle*> handles;
   s = DB::OpenAsSecondary(db_options, name, output_directory, column_families,
                           &handles, &db);
@@ -1556,7 +1556,7 @@ Status DB::OpenAndCompact(
   assert(db);
 
   TEST_SYNC_POINT_CALLBACK(
-      "DBImplSecondary::OpenAndCompact::AfterOpenAsSecondary:0", db);
+      "DBImplSecondary::OpenAndCompact::AfterOpenAsSecondary:0", db.get());
 
   // 6. Find the handle of the Column Family that this will compact
   ColumnFamilyHandle* cfh = nullptr;
@@ -1571,7 +1571,8 @@ Status DB::OpenAndCompact(
   // 7. Run the compaction without installation.
   // Output will be stored in the directory specified by output_directory
   CompactionServiceResult compaction_result;
-  DBImplSecondary* db_secondary = static_cast_with_check<DBImplSecondary>(db);
+  DBImplSecondary* db_secondary =
+      static_cast_with_check<DBImplSecondary>(db.get());
   s = db_secondary->CompactWithoutInstallation(options, cfh, compaction_input,
                                                &compaction_result);
 
@@ -1582,7 +1583,7 @@ Status DB::OpenAndCompact(
   for (auto& handle : handles) {
     delete handle;
   }
-  delete db;
+  db.reset();
   if (s.ok()) {
     return serialization_status;
   } else {
diff --git a/db/db_iterator_test.cc b/db/db_iterator_test.cc
index 645f0e7266c7..9862b6b8a632 100644
--- a/db/db_iterator_test.cc
+++ b/db/db_iterator_test.cc
@@ -2574,7 +2574,7 @@ TEST_P(DBIteratorTest, AutoRefreshIterator) {
         ReadOptions read_options;
         std::unique_ptr<ManagedSnapshot> snapshot = nullptr;
         if (explicit_snapshot) {
-          snapshot = std::make_unique<ManagedSnapshot>(db_);
+          snapshot = std::make_unique<ManagedSnapshot>(db_.get());
         }
         read_options.snapshot =
             explicit_snapshot ? snapshot->snapshot() : nullptr;
diff --git a/db/db_logical_block_size_cache_test.cc b/db/db_logical_block_size_cache_test.cc
index ff56d56e370d..a2de4e33e417 100644
--- a/db/db_logical_block_size_cache_test.cc
+++ b/db/db_logical_block_size_cache_test.cc
@@ -67,7 +67,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, OpenClose) {
   options.db_paths = {{data_path_0_, 2048}, {data_path_1_, 2048}};
 
   for (int i = 0; i < 2; i++) {
-    DB* db;
+    std::unique_ptr<DB> db;
     if (!i) {
       printf("Open\n");
       ASSERT_OK(DB::Open(options, dbname_, &db));
@@ -82,7 +82,6 @@ TEST_F(DBLogicalBlockSizeCacheTest, OpenClose) {
     ASSERT_EQ(1, cache_->GetRefCount(data_path_1_));
     ASSERT_OK(db->Close());
     ASSERT_EQ(0, cache_->Size());
-    delete db;
   }
   ASSERT_OK(DestroyDB(dbname_, options, {}));
 }
@@ -95,7 +94,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, OpenDelete) {
   options.env = env_.get();
 
   for (int i = 0; i < 2; i++) {
-    DB* db;
+    std::unique_ptr<DB> db;
     if (!i) {
       printf("Open\n");
       ASSERT_OK(DB::Open(options, dbname_, &db));
@@ -106,7 +105,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, OpenDelete) {
     ASSERT_EQ(1, cache_->Size());
     ASSERT_TRUE(cache_->Contains(dbname_));
     ASSERT_EQ(1, cache_->GetRefCount(dbname_));
-    delete db;
+    db.reset();
     ASSERT_EQ(0, cache_->Size());
   }
   ASSERT_OK(DestroyDB(dbname_, options, {}));
@@ -122,7 +121,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, CreateColumnFamily) {
   ColumnFamilyOptions cf_options;
   cf_options.cf_paths = {{cf_path_0_, 1024}, {cf_path_1_, 2048}};
 
-  DB* db;
+  std::unique_ptr<DB> db;
   ASSERT_OK(DB::Open(options, dbname_, &db));
   ASSERT_EQ(1, cache_->Size());
   ASSERT_TRUE(cache_->Contains(dbname_));
@@ -153,7 +152,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, CreateColumnFamily) {
   ASSERT_TRUE(cache_->Contains(dbname_));
   ASSERT_EQ(1, cache_->GetRefCount(dbname_));
 
-  delete db;
+  db.reset();
   ASSERT_EQ(0, cache_->Size());
   ASSERT_OK(DestroyDB(dbname_, options, {{"cf", cf_options}}));
 }
@@ -173,7 +172,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, CreateColumnFamilies) {
   ColumnFamilyOptions cf_options;
   cf_options.cf_paths = {{cf_path_0_, 1024}};
 
-  DB* db;
+  std::unique_ptr<DB> db;
   ASSERT_OK(DB::Open(options, dbname_, &db));
   ASSERT_EQ(1, cache_->Size());
   ASSERT_TRUE(cache_->Contains(dbname_));
@@ -211,7 +210,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, CreateColumnFamilies) {
   ASSERT_OK(db->DestroyColumnFamilyHandle(cfs[1]));
   ASSERT_TRUE(cache_->Contains(dbname_));
   ASSERT_EQ(1, cache_->GetRefCount(dbname_));
-  delete db;
+  db.reset();
 
   // Now cf_path_0_ in cache_ has been properly decreased and cf_path_0_'s entry
   // is dropped from cache
@@ -233,15 +232,15 @@ TEST_F(DBLogicalBlockSizeCacheTest, OpenWithColumnFamilies) {
   cf_options.cf_paths = {{cf_path_0_, 1024}};
 
   for (int i = 0; i < 2; i++) {
-    DB* db;
+    std::unique_ptr<DB> db;
+    ASSERT_OK(DB::Open(options, dbname_, &db));
     ColumnFamilyHandle* cf1 = nullptr;
     ColumnFamilyHandle* cf2 = nullptr;
-    ASSERT_OK(DB::Open(options, dbname_, &db));
     ASSERT_OK(db->CreateColumnFamily(cf_options, "cf1", &cf1));
     ASSERT_OK(db->CreateColumnFamily(cf_options, "cf2", &cf2));
     ASSERT_OK(db->DestroyColumnFamilyHandle(cf1));
     ASSERT_OK(db->DestroyColumnFamilyHandle(cf2));
-    delete db;
+    db.reset();
     ASSERT_EQ(0, cache_->Size());
 
     std::vector<ColumnFamilyHandle*> cfs;
@@ -298,7 +297,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, OpenWithColumnFamilies) {
     ASSERT_TRUE(cache_->Contains(dbname_));
     ASSERT_EQ(1, cache_->GetRefCount(dbname_));
 
-    delete db;
+    db.reset();
     ASSERT_EQ(0, cache_->Size());
   }
   ASSERT_OK(
@@ -315,7 +314,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, DestroyColumnFamilyHandle) {
   ColumnFamilyOptions cf_options;
   cf_options.cf_paths = {{cf_path_0_, 1024}};
 
-  DB* db;
+  std::unique_ptr<DB> db;
   ASSERT_OK(DB::Open(options, dbname_, &db));
   ASSERT_EQ(1, cache_->Size());
   ASSERT_TRUE(cache_->Contains(dbname_));
@@ -336,7 +335,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, DestroyColumnFamilyHandle) {
   ASSERT_TRUE(cache_->Contains(cf_path_0_));
   ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_));
 
-  delete db;
+  db.reset();
   ASSERT_EQ(0, cache_->Size());
 
   // Open with column families.
@@ -369,7 +368,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, DestroyColumnFamilyHandle) {
     ASSERT_TRUE(cache_->Contains(cf_path_0_));
     ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_));
 
-    delete db;
+    db.reset();
     ASSERT_EQ(0, cache_->Size());
   }
   ASSERT_OK(DestroyDB(dbname_, options, {{"cf", cf_options}}));
@@ -384,7 +383,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, MultiDBWithDifferentPaths) {
 
   ASSERT_OK(env_->CreateDirIfMissing(dbname_));
 
-  DB* db0;
+  std::unique_ptr<DB> db0;
   ASSERT_OK(DB::Open(options, data_path_0_, &db0));
   ASSERT_EQ(1, cache_->Size());
   ASSERT_TRUE(cache_->Contains(data_path_0_));
@@ -399,7 +398,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, MultiDBWithDifferentPaths) {
   ASSERT_TRUE(cache_->Contains(cf_path_0_));
   ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_));
 
-  DB* db1;
+  std::unique_ptr<DB> db1;
   ASSERT_OK(DB::Open(options, data_path_1_, &db1));
   ASSERT_EQ(3, cache_->Size());
   ASSERT_TRUE(cache_->Contains(data_path_0_));
@@ -424,7 +423,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, MultiDBWithDifferentPaths) {
   ASSERT_EQ(1, cache_->GetRefCount(cf_path_1_));
 
   ASSERT_OK(db0->DestroyColumnFamilyHandle(cf0));
-  delete db0;
+  db0.reset();
   ASSERT_EQ(2, cache_->Size());
   ASSERT_TRUE(cache_->Contains(data_path_1_));
   ASSERT_EQ(1, cache_->GetRefCount(data_path_1_));
@@ -433,7 +432,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, MultiDBWithDifferentPaths) {
   ASSERT_OK(DestroyDB(data_path_0_, options, {{"cf", cf_options0}}));
 
   ASSERT_OK(db1->DestroyColumnFamilyHandle(cf1));
-  delete db1;
+  db1.reset();
   ASSERT_EQ(0, cache_->Size());
   ASSERT_OK(DestroyDB(data_path_1_, options, {{"cf", cf_options1}}));
 }
@@ -450,7 +449,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, MultiDBWithSamePaths) {
 
   ASSERT_OK(env_->CreateDirIfMissing(dbname_));
 
-  DB* db0;
+  std::unique_ptr<DB> db0;
   ASSERT_OK(DB::Open(options, dbname_ + "/db0", &db0));
   ASSERT_EQ(1, cache_->Size());
   ASSERT_TRUE(cache_->Contains(data_path_0_));
@@ -464,7 +463,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, MultiDBWithSamePaths) {
   ASSERT_TRUE(cache_->Contains(cf_path_0_));
   ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_));
 
-  DB* db1;
+  std::unique_ptr<DB> db1;
   ASSERT_OK(DB::Open(options, dbname_ + "/db1", &db1));
   ASSERT_EQ(2, cache_->Size());
   ASSERT_TRUE(cache_->Contains(data_path_0_));
@@ -481,7 +480,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, MultiDBWithSamePaths) {
   ASSERT_EQ(2, cache_->GetRefCount(cf_path_0_));
 
   ASSERT_OK(db0->DestroyColumnFamilyHandle(cf0));
-  delete db0;
+  db0.reset();
   ASSERT_EQ(2, cache_->Size());
   ASSERT_TRUE(cache_->Contains(data_path_0_));
   ASSERT_EQ(1, cache_->GetRefCount(data_path_0_));
@@ -490,7 +489,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, MultiDBWithSamePaths) {
   ASSERT_OK(DestroyDB(dbname_ + "/db0", options, {{"cf", cf_options}}));
 
   ASSERT_OK(db1->DestroyColumnFamilyHandle(cf1));
-  delete db1;
+  db1.reset();
   ASSERT_EQ(0, cache_->Size());
   ASSERT_OK(DestroyDB(dbname_ + "/db1", options, {{"cf", cf_options}}));
 }
diff --git a/db/db_merge_operator_test.cc b/db/db_merge_operator_test.cc
index 0f6d05d0527c..143203fd7b7e 100644
--- a/db/db_merge_operator_test.cc
+++ b/db/db_merge_operator_test.cc
@@ -386,7 +386,7 @@ TEST_F(DBMergeOperatorTest, MergeOperandThresholdExceeded) {
   snapshots.reserve(3);
 
   for (size_t i = 0; i < keys.size(); ++i) {
-    snapshots.emplace_back(db_);
+    snapshots.emplace_back(db_.get());
 
     const std::string suffix = std::to_string(i + 1);
 
@@ -985,7 +985,7 @@ TEST_F(DBMergeOperatorTest, MaxSuccessiveMergesBaseValues) {
     // max_successive_merges.
     constexpr size_t max_key_versions = 8;
     std::vector<KeyVersion> key_versions;
-    ASSERT_OK(GetAllKeyVersions(db_, db_->DefaultColumnFamily(), key, key,
+    ASSERT_OK(GetAllKeyVersions(db_.get(), db_->DefaultColumnFamily(), key, key,
                                 max_key_versions, &key_versions));
     ASSERT_EQ(key_versions.size(), 2);
     ASSERT_EQ(key_versions[0].type, kTypeValue);
@@ -1009,7 +1009,7 @@ TEST_F(DBMergeOperatorTest, MaxSuccessiveMergesBaseValues) {
     // max_successive_merges.
     constexpr size_t max_key_versions = 8;
     std::vector<KeyVersion> key_versions;
-    ASSERT_OK(GetAllKeyVersions(db_, db_->DefaultColumnFamily(), key, key,
+    ASSERT_OK(GetAllKeyVersions(db_.get(), db_->DefaultColumnFamily(), key, key,
                                 max_key_versions, &key_versions));
     ASSERT_EQ(key_versions.size(), 3);
     ASSERT_EQ(key_versions[0].type, kTypeValue);
@@ -1038,7 +1038,7 @@ TEST_F(DBMergeOperatorTest, MaxSuccessiveMergesBaseValues) {
     // max_successive_merges.
     constexpr size_t max_key_versions = 8;
     std::vector<KeyVersion> key_versions;
-    ASSERT_OK(GetAllKeyVersions(db_, db_->DefaultColumnFamily(), key, key,
+    ASSERT_OK(GetAllKeyVersions(db_.get(), db_->DefaultColumnFamily(), key, key,
                                 max_key_versions, &key_versions));
     ASSERT_EQ(key_versions.size(), 3);
     ASSERT_EQ(key_versions[0].type, kTypeWideColumnEntity);
diff --git a/db/db_properties_test.cc b/db/db_properties_test.cc
index 160a5fcd774e..523abeb1cbd6 100644
--- a/db/db_properties_test.cc
+++ b/db/db_properties_test.cc
@@ -384,7 +384,7 @@ TEST_F(DBPropertiesTest, AggregatedTableProperties) {
 
     // Hold open a snapshot to prevent range tombstones from being compacted
     // away.
-    ManagedSnapshot snapshot(db_);
+    ManagedSnapshot snapshot(db_.get());
 
     Random rnd(5632);
     for (int table = 1; table <= kTableCount; ++table) {
@@ -582,7 +582,7 @@ TEST_F(DBPropertiesTest, AggregatedTablePropertiesAtLevel) {
   DestroyAndReopen(options);
 
   // Hold open a snapshot to prevent range tombstones from being compacted away.
-  ManagedSnapshot snapshot(db_);
+  ManagedSnapshot snapshot(db_.get());
 
   std::string level_tp_strings[kMaxLevel];
   std::string tp_string;
@@ -1864,7 +1864,7 @@ TEST_F(DBPropertiesTest, MinObsoleteSstNumberToKeep) {
   options.listeners.push_back(listener);
   options.level0_file_num_compaction_trigger = kNumL0Files;
   DestroyAndReopen(options);
-  listener->SetDB(db_);
+  listener->SetDB(db_.get());
 
   for (int i = 0; i < kNumL0Files; ++i) {
     // Make sure they overlap in keyspace to prevent trivial move
diff --git a/db/db_range_del_test.cc b/db/db_range_del_test.cc
index e22cd5845b09..f0996ce34c94 100644
--- a/db/db_range_del_test.cc
+++ b/db/db_range_del_test.cc
@@ -2047,7 +2047,7 @@ TEST_F(DBRangeDelTest, IteratorReseek) {
   // Immutable memtable
   ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(1),
                              Key(2)));
-  ASSERT_OK(static_cast_with_check<DBImpl>(db_)->TEST_SwitchMemtable());
+  ASSERT_OK(dbfull()->TEST_SwitchMemtable());
   std::string value;
   ASSERT_TRUE(dbfull()->GetProperty(db_->DefaultColumnFamily(),
                                     "rocksdb.num-immutable-mem-table", &value));
diff --git a/db/db_readonly_with_timestamp_test.cc b/db/db_readonly_with_timestamp_test.cc
index 7a37bfec81c5..6fbc43bb2664 100644
--- a/db/db_readonly_with_timestamp_test.cc
+++ b/db/db_readonly_with_timestamp_test.cc
@@ -237,7 +237,7 @@ TEST_F(DBReadOnlyTestWithTimestamp, IteratorAndGet) {
          it->Next(), ++count, ++key) {
       CheckIterUserEntry(it.get(), Key1(key), kTypeValue,
                          "value" + std::to_string(i), write_timestamps[i]);
-      get_value_and_check(db_, read_opts, it->key(), it->value(),
+      get_value_and_check(db_.get(), read_opts, it->key(), it->value(),
                           write_timestamps[i]);
     }
     ASSERT_OK(it->status());
@@ -250,7 +250,7 @@ TEST_F(DBReadOnlyTestWithTimestamp, IteratorAndGet) {
          it->Prev(), ++count, --key) {
       CheckIterUserEntry(it.get(), Key1(key), kTypeValue,
                          "value" + std::to_string(i), write_timestamps[i]);
-      get_value_and_check(db_, read_opts, it->key(), it->value(),
+      get_value_and_check(db_.get(), read_opts, it->key(), it->value(),
                           write_timestamps[i]);
     }
     ASSERT_OK(it->status());
@@ -272,7 +272,7 @@ TEST_F(DBReadOnlyTestWithTimestamp, IteratorAndGet) {
            it->Valid(); it->Next(), ++key, ++count) {
         CheckIterUserEntry(it.get(), Key1(key), kTypeValue,
                            "value" + std::to_string(i), write_timestamps[i]);
-        get_value_and_check(db_, read_opts, it->key(), it->value(),
+        get_value_and_check(db_.get(), read_opts, it->key(), it->value(),
                             write_timestamps[i]);
       }
       ASSERT_OK(it->status());
@@ -282,7 +282,7 @@ TEST_F(DBReadOnlyTestWithTimestamp, IteratorAndGet) {
            it->Valid(); it->Prev(), --key, ++count) {
         CheckIterUserEntry(it.get(), Key1(key - 1), kTypeValue,
                            "value" + std::to_string(i), write_timestamps[i]);
-        get_value_and_check(db_, read_opts, it->key(), it->value(),
+        get_value_and_check(db_.get(), read_opts, it->key(), it->value(),
                             write_timestamps[i]);
       }
       ASSERT_OK(it->status());
diff --git a/db/db_secondary_test.cc b/db/db_secondary_test.cc
index e34a95d55417..a5da2afacc44 100644
--- a/db/db_secondary_test.cc
+++ b/db/db_secondary_test.cc
@@ -56,12 +56,11 @@ class DBSecondaryTestBase : public DBBasicTestWithTimestampBase {
       ASSERT_OK(db_secondary_->DestroyColumnFamilyHandle(h));
     }
     handles_secondary_.clear();
-    delete db_secondary_;
-    db_secondary_ = nullptr;
+    db_secondary_.reset();
   }
 
   DBImplSecondary* db_secondary_full() {
-    return static_cast<DBImplSecondary*>(db_secondary_);
+    return static_cast<DBImplSecondary*>(db_secondary_.get());
   }
 
   void CheckFileTypeCounts(const std::string& dir, int expected_log,
@@ -69,7 +68,7 @@ class DBSecondaryTestBase : public DBBasicTestWithTimestampBase {
 
   std::string secondary_path_;
   std::vector<ColumnFamilyHandle*> handles_secondary_;
-  DB* db_secondary_;
+  std::unique_ptr<DB> db_secondary_;
 };
 
 void DBSecondaryTestBase::OpenSecondary(const Options& options) {
@@ -152,8 +151,8 @@ TEST_F(DBSecondaryTest, NonExistingDb) {
   options.env = env_;
   options.max_open_files = -1;
   const std::string dbname = "/doesnt/exist";
-  Status s =
-      DB::OpenAsSecondary(options, dbname, secondary_path_, &db_secondary_);
+  std::unique_ptr<DB> dbptr;
+  Status s = DB::OpenAsSecondary(options, dbname, secondary_path_, &dbptr);
   ASSERT_TRUE(s.IsIOError());
 }
 
@@ -182,7 +181,7 @@ TEST_F(DBSecondaryTest, ReopenAsSecondary) {
 
   ReadOptions ropts;
   ropts.verify_checksums = true;
-  auto db1 = static_cast<DBImplSecondary*>(db_);
+  auto db1 = static_cast<DBImplSecondary*>(db_.get());
   ASSERT_NE(nullptr, db1);
   Iterator* iter = db1->NewIterator(ropts);
   ASSERT_NE(nullptr, iter);
@@ -834,7 +833,7 @@ TEST_F(DBSecondaryTest, OpenWithSubsetOfColumnFamilies) {
   options1.max_open_files = -1;
   OpenSecondary(options1);
   ASSERT_EQ(0, handles_secondary_.size());
-  ASSERT_NE(nullptr, db_secondary_);
+  ASSERT_NE(nullptr, db_secondary_.get());
 
   ASSERT_OK(Put(0 /*cf*/, "foo", "foo_value"));
   ASSERT_OK(Put(1 /*cf*/, "foo", "foo_value"));
@@ -1152,7 +1151,7 @@ TEST_F(DBSecondaryTest, DISABLED_SwitchWAL) {
   for (int k = 0; k != 16; ++k) {
     ASSERT_OK(Put("key" + std::to_string(k), "value" + std::to_string(k)));
     ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
-    verify_db(dbfull(), db_secondary_);
+    verify_db(dbfull(), db_secondary_.get());
   }
 }
 
@@ -1221,7 +1220,7 @@ TEST_F(DBSecondaryTest, DISABLED_SwitchWALMultiColumnFamilies) {
     TEST_SYNC_POINT(
         "DBSecondaryTest::SwitchWALMultipleColumnFamilies:BeforeCatchUp");
     ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
-    verify_db(dbfull(), handles_, db_secondary_, handles_secondary_);
+    verify_db(dbfull(), handles_, db_secondary_.get(), handles_secondary_);
     SyncPoint::GetInstance()->ClearTrace();
   }
 }
@@ -1357,7 +1356,7 @@ TEST_F(DBSecondaryTest, OpenWithTransactionDB) {
   TransactionDBOptions txn_db_opts;
   ASSERT_OK(TransactionDB::Open(options, txn_db_opts, dbname_, &txn_db));
   ASSERT_NE(txn_db, nullptr);
-  db_ = txn_db;
+  db_.reset(txn_db);
 
   std::vector<std::string> cfs = {"new_CF"};
   CreateColumnFamilies(cfs, options);
@@ -1561,7 +1560,7 @@ TEST_F(DBSecondaryTestWithTimestamp, IteratorAndGet) {
          it->Next(), ++count, ++key) {
       CheckIterUserEntry(it.get(), Key1(key), kTypeValue,
                          "value" + std::to_string(i), write_timestamps[i]);
-      get_value_and_check(db_, read_opts, it->key(), it->value(),
+      get_value_and_check(db_.get(), read_opts, it->key(), it->value(),
                           write_timestamps[i]);
     }
     ASSERT_OK(it->status());
@@ -1574,7 +1573,7 @@ TEST_F(DBSecondaryTestWithTimestamp, IteratorAndGet) {
          it->Prev(), ++count, --key) {
       CheckIterUserEntry(it.get(), Key1(key), kTypeValue,
                          "value" + std::to_string(i), write_timestamps[i]);
-      get_value_and_check(db_, read_opts, it->key(), it->value(),
+      get_value_and_check(db_.get(), read_opts, it->key(), it->value(),
                           write_timestamps[i]);
     }
     ASSERT_OK(it->status());
@@ -1596,7 +1595,7 @@ TEST_F(DBSecondaryTestWithTimestamp, IteratorAndGet) {
            it->Valid(); it->Next(), ++key, ++count) {
         CheckIterUserEntry(it.get(), Key1(key), kTypeValue,
                            "value" + std::to_string(i), write_timestamps[i]);
-        get_value_and_check(db_, read_opts, it->key(), it->value(),
+        get_value_and_check(db_.get(), read_opts, it->key(), it->value(),
                             write_timestamps[i]);
       }
       ASSERT_OK(it->status());
@@ -1606,7 +1605,7 @@ TEST_F(DBSecondaryTestWithTimestamp, IteratorAndGet) {
            it->Valid(); it->Prev(), --key, ++count) {
         CheckIterUserEntry(it.get(), Key1(key - 1), kTypeValue,
                            "value" + std::to_string(i), write_timestamps[i]);
-        get_value_and_check(db_, read_opts, it->key(), it->value(),
+        get_value_and_check(db_.get(), read_opts, it->key(), it->value(),
                             write_timestamps[i]);
       }
       ASSERT_OK(it->status());
diff --git a/db/db_statistics_test.cc b/db/db_statistics_test.cc
index 4fe3032e901c..91f9df57e92b 100644
--- a/db/db_statistics_test.cc
+++ b/db/db_statistics_test.cc
@@ -321,7 +321,7 @@ TEST_F(DBStatisticsTest, BytesWrittenStats) {
     options.enable_pipelined_write = enable_pipelined_write;
     ASSERT_OK(TransactionDB::Open(options, txn_db_opts, dbname_, &txn_db));
     ASSERT_NE(txn_db, nullptr);
-    db_ = txn_db->GetBaseDB();
+    db_.reset(txn_db);
 
     WriteOptions wopts;
     TransactionOptions txn_opts;
@@ -351,8 +351,7 @@ TEST_F(DBStatisticsTest, BytesWrittenStats) {
                   WriteBatchInternal::kHeader);
 
     // Cleanup
-    db_ = nullptr;
-    delete txn_db;
+    db_.reset();
   }
 }
 
diff --git a/db/db_table_properties_test.cc b/db/db_table_properties_test.cc
index 8e84f26541bc..0f9e1327825c 100644
--- a/db/db_table_properties_test.cc
+++ b/db/db_table_properties_test.cc
@@ -90,7 +90,7 @@ TEST_F(DBTablePropertiesTest, GetPropertiesOfAllTablesTest) {
   // Clear out auto-opened files
   dbfull()->TEST_table_cache()->EraseUnRefEntries();
   ASSERT_EQ(dbfull()->TEST_table_cache()->GetUsage(), 0U);
-  VerifyTableProperties(db_, 10 + 11 + 12 + 13);
+  VerifyTableProperties(db_.get(), 10 + 11 + 12 + 13);
 
   // 2. Put two tables to table cache and
   Reopen(options);
@@ -103,7 +103,7 @@ TEST_F(DBTablePropertiesTest, GetPropertiesOfAllTablesTest) {
     Get(std::to_string(i * 100 + 0));
   }
 
-  VerifyTableProperties(db_, 10 + 11 + 12 + 13);
+  VerifyTableProperties(db_.get(), 10 + 11 + 12 + 13);
 
   // 3. Put all tables to table cache
   Reopen(options);
@@ -111,7 +111,7 @@ TEST_F(DBTablePropertiesTest, GetPropertiesOfAllTablesTest) {
   for (int i = 0; i < 4; ++i) {
     Get(std::to_string(i * 100 + 0));
   }
-  VerifyTableProperties(db_, 10 + 11 + 12 + 13);
+  VerifyTableProperties(db_.get(), 10 + 11 + 12 + 13);
 
   // 4. Try to read CORRUPT properties (a) directly from file, and (b)
   // through reader on Get
diff --git a/db/db_test.cc b/db/db_test.cc
index 928808cffee1..9c0dc9fe326b 100644
--- a/db/db_test.cc
+++ b/db/db_test.cc
@@ -104,7 +104,7 @@ TEST_F(DBTest, MockEnvTest) {
   Options options;
   options.create_if_missing = true;
   options.env = env.get();
-  DB* db;
+  std::unique_ptr<DB> db;
 
   const Slice keys[] = {Slice("aaa"), Slice("bbb"), Slice("ccc")};
   const Slice vals[] = {Slice("foo"), Slice("bar"), Slice("baz")};
@@ -132,7 +132,7 @@ TEST_F(DBTest, MockEnvTest) {
   ASSERT_OK(iterator->status());
   delete iterator;
 
-  DBImpl* dbi = static_cast_with_check<DBImpl>(db);
+  DBImpl* dbi = static_cast_with_check<DBImpl>(db.get());
   ASSERT_OK(dbi->TEST_FlushMemTable());
 
   for (size_t i = 0; i < 3; ++i) {
@@ -141,7 +141,7 @@ TEST_F(DBTest, MockEnvTest) {
     ASSERT_TRUE(res == vals[i]);
   }
 
-  delete db;
+  db.reset();
 }
 
 TEST_F(DBTest, RequestIdPlumbingTest) {
@@ -264,7 +264,7 @@ TEST_F(DBTest, MemEnvTest) {
   Options options;
   options.create_if_missing = true;
   options.env = env.get();
-  DB* db;
+  std::unique_ptr<DB> db;
 
   const Slice keys[] = {Slice("aaa"), Slice("bbb"), Slice("ccc")};
   const Slice vals[] = {Slice("foo"), Slice("bar"), Slice("baz")};
@@ -292,7 +292,7 @@ TEST_F(DBTest, MemEnvTest) {
   ASSERT_OK(iterator->status());
   delete iterator;
 
-  DBImpl* dbi = static_cast_with_check<DBImpl>(db);
+  DBImpl* dbi = static_cast_with_check<DBImpl>(db.get());
   ASSERT_OK(dbi->TEST_FlushMemTable());
 
   for (size_t i = 0; i < 3; ++i) {
@@ -301,7 +301,7 @@ TEST_F(DBTest, MemEnvTest) {
     ASSERT_TRUE(res == vals[i]);
   }
 
-  delete db;
+  db.reset();
 
   options.create_if_missing = false;
   ASSERT_OK(DB::Open(options, "/dir/db", &db));
@@ -310,7 +310,7 @@ TEST_F(DBTest, MemEnvTest) {
     ASSERT_OK(db->Get(ReadOptions(), keys[i], &res));
     ASSERT_TRUE(res == vals[i]);
   }
-  delete db;
+  db.reset();
 }
 
 TEST_F(DBTest, WriteEmptyBatch) {
@@ -1078,7 +1078,9 @@ TEST_F(DBTest, WrongLevel0Config) {
   options.level0_stop_writes_trigger = 1;
   options.level0_slowdown_writes_trigger = 2;
   options.level0_file_num_compaction_trigger = 3;
-  ASSERT_OK(DB::Open(options, dbname_, &db_));
+  {
+    ASSERT_OK(DB::Open(options, dbname_, &db_));
+  }
 }
 
 TEST_F(DBTest, GetOrderedByLevels) {
@@ -1207,8 +1209,10 @@ TEST_F(DBTest, FlushSchedule) {
     t.join();
   }
 
-  auto default_tables = GetNumberOfSstFilesForColumnFamily(db_, "default");
-  auto pikachu_tables = GetNumberOfSstFilesForColumnFamily(db_, "pikachu");
+  auto default_tables =
+      GetNumberOfSstFilesForColumnFamily(db_.get(), "default");
+  auto pikachu_tables =
+      GetNumberOfSstFilesForColumnFamily(db_.get(), "pikachu");
   ASSERT_LE(default_tables, static_cast<uint64_t>(10));
   ASSERT_GT(default_tables, static_cast<uint64_t>(0));
   ASSERT_LE(pikachu_tables, static_cast<uint64_t>(10));
@@ -2368,7 +2372,7 @@ TEST_F(DBTest, Snapshot) {
     ASSERT_OK(Put(1, "foo", "1v3"));
 
     {
-      ManagedSnapshot s3(db_);
+      ManagedSnapshot s3(db_.get());
       ASSERT_EQ(3U, GetNumSnapshots());
       ASSERT_EQ(time_snap1, GetTimeOldestSnapshots());
       ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber());
@@ -2725,37 +2729,43 @@ TEST_F(DBTest, DBOpen_Options) {
   ASSERT_OK(DestroyDB(dbname, options));
 
   // Does not exist, and create_if_missing == false: error
-  DB* db = nullptr;
+  std::unique_ptr<DB> db;
   options.create_if_missing = false;
-  Status s = DB::Open(options, dbname, &db);
-  ASSERT_TRUE(strstr(s.ToString().c_str(), "does not exist") != nullptr);
+  {
+    Status s = DB::Open(options, dbname, &db);
+    ASSERT_TRUE(strstr(s.ToString().c_str(), "does not exist") != nullptr);
+  }
   ASSERT_TRUE(db == nullptr);
 
   // Does not exist, and create_if_missing == true: OK
   options.create_if_missing = true;
-  s = DB::Open(options, dbname, &db);
-  ASSERT_OK(s);
+  {
+    Status s = DB::Open(options, dbname, &db);
+    ASSERT_OK(s);
+  }
   ASSERT_TRUE(db != nullptr);
 
-  delete db;
-  db = nullptr;
+  db.reset();
 
   // Does exist, and error_if_exists == true: error
   options.create_if_missing = false;
   options.error_if_exists = true;
-  s = DB::Open(options, dbname, &db);
-  ASSERT_TRUE(strstr(s.ToString().c_str(), "exists") != nullptr);
+  {
+    Status s = DB::Open(options, dbname, &db);
+    ASSERT_TRUE(strstr(s.ToString().c_str(), "exists") != nullptr);
+  }
   ASSERT_TRUE(db == nullptr);
 
   // Does exist, and error_if_exists == false: OK
   options.create_if_missing = true;
   options.error_if_exists = false;
-  s = DB::Open(options, dbname, &db);
-  ASSERT_OK(s);
+  {
+    Status s = DB::Open(options, dbname, &db);
+    ASSERT_OK(s);
+  }
   ASSERT_TRUE(db != nullptr);
 
-  delete db;
-  db = nullptr;
+  db.reset();
 }
 
 TEST_F(DBTest, DBOpen_Change_NumLevels) {
@@ -2793,25 +2803,36 @@ TEST_F(DBTest, DestroyDBMetaDatabase) {
   ASSERT_OK(DestroyDB(dbname, options));
 
   // Setup databases
-  DB* db = nullptr;
-  ASSERT_OK(DB::Open(options, dbname, &db));
-  delete db;
-  db = nullptr;
-  ASSERT_OK(DB::Open(options, metadbname, &db));
-  delete db;
-  db = nullptr;
-  ASSERT_OK(DB::Open(options, metametadbname, &db));
-  delete db;
-  db = nullptr;
+  {
+    std::unique_ptr<DB> db;
+    ASSERT_OK(DB::Open(options, dbname, &db));
+  }
+  {
+    std::unique_ptr<DB> db;
+    ASSERT_OK(DB::Open(options, metadbname, &db));
+  }
+  {
+    std::unique_ptr<DB> db;
+    ASSERT_OK(DB::Open(options, metametadbname, &db));
+  }
 
   // Delete databases
   ASSERT_OK(DestroyDB(dbname, options));
 
   // Check if deletion worked.
   options.create_if_missing = false;
-  ASSERT_TRUE(!(DB::Open(options, dbname, &db)).ok());
-  ASSERT_TRUE(!(DB::Open(options, metadbname, &db)).ok());
-  ASSERT_TRUE(!(DB::Open(options, metametadbname, &db)).ok());
+  {
+    std::unique_ptr<DB> dbptr;
+    ASSERT_TRUE(!(DB::Open(options, dbname, &dbptr)).ok());
+  }
+  {
+    std::unique_ptr<DB> dbptr;
+    ASSERT_TRUE(!(DB::Open(options, metadbname, &dbptr)).ok());
+  }
+  {
+    std::unique_ptr<DB> dbptr;
+    ASSERT_TRUE(!(DB::Open(options, metametadbname, &dbptr)).ok());
+  }
 }
 
 TEST_F(DBTest, SnapshotFiles) {
@@ -2890,13 +2911,11 @@ TEST_F(DBTest, SnapshotFiles) {
     column_families.emplace_back("default", ColumnFamilyOptions());
     column_families.emplace_back("pikachu", ColumnFamilyOptions());
     std::vector<ColumnFamilyHandle*> cf_handles;
-    DB* snapdb;
+    std::unique_ptr<DB> snapdb;
     DBOptions opts;
     opts.env = env_;
     opts.create_if_missing = false;
-    Status stat =
-        DB::Open(opts, snapdir, column_families, &cf_handles, &snapdb);
-    ASSERT_OK(stat);
+    ASSERT_OK(DB::Open(opts, snapdir, column_families, &cf_handles, &snapdb));
 
     ReadOptions roptions;
     std::string val;
@@ -2907,7 +2926,7 @@ TEST_F(DBTest, SnapshotFiles) {
     for (auto cfh : cf_handles) {
       delete cfh;
     }
-    delete snapdb;
+    snapdb.reset();
 
     // look at the new live files after we added an 'extra' key
     // and after we took the first snapshot.
@@ -3109,7 +3128,7 @@ struct MTThread {
 static void MTThreadBody(void* arg) {
   MTThread* t = static_cast<MTThread*>(arg);
   int id = t->id;
-  DB* db = t->state->test->db_;
+  DB* db = t->state->test->db_.get();
   int counter = 0;
   std::shared_ptr<SystemClock> clock = SystemClock::Default();
   auto end_micros = clock->NowMicros() + kTestSeconds * 1000000U;
@@ -3324,7 +3343,7 @@ TEST_F(DBTest, GroupCommitTest) {
     GCThread thread[kGCNumThreads];
     for (int id = 0; id < kGCNumThreads; id++) {
       thread[id].id = id;
-      thread[id].db = db_;
+      thread[id].db = db_.get();
       thread[id].done = false;
       env_->StartThread(GCThreadBody, &thread[id]);
     }
@@ -3996,8 +4015,10 @@ TEST_P(DBTestRandomized, Randomized) {
       // than return a key that is close to it.
       if (option_config_ != kBlockBasedTableWithWholeKeyHashIndex &&
           option_config_ != kBlockBasedTableWithPrefixHashIndex) {
-        ASSERT_TRUE(CompareIterators(step, &model, db_, nullptr, nullptr));
-        ASSERT_TRUE(CompareIterators(step, &model, db_, model_snap, db_snap));
+        ASSERT_TRUE(
+            CompareIterators(step, &model, db_.get(), nullptr, nullptr));
+        ASSERT_TRUE(
+            CompareIterators(step, &model, db_.get(), model_snap, db_snap));
       }
 
       // Save a snapshot from each DB this time that we'll use next
@@ -4011,7 +4032,7 @@ TEST_P(DBTestRandomized, Randomized) {
       }
 
       Reopen(options);
-      ASSERT_TRUE(CompareIterators(step, &model, db_, nullptr, nullptr));
+      ASSERT_TRUE(CompareIterators(step, &model, db_.get(), nullptr, nullptr));
 
       model_snap = model.GetSnapshot();
       db_snap = db_->GetSnapshot();
@@ -5437,7 +5458,7 @@ TEST_P(DBTestWithParam, PreShutdownManualCompaction) {
     // Compact all
     MakeTables(1, "a", "z", 1);
     ASSERT_EQ("1,0,2", FilesPerLevel(1));
-    CancelAllBackgroundWork(db_);
+    CancelAllBackgroundWork(db_.get());
     ASSERT_TRUE(
         db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr)
             .IsShutdownInProgress());
@@ -5457,7 +5478,7 @@ TEST_F(DBTest, PreShutdownFlush) {
   Options options = CurrentOptions();
   CreateAndReopenWithCF({"pikachu"}, options);
   ASSERT_OK(Put(1, "key", "value"));
-  CancelAllBackgroundWork(db_);
+  CancelAllBackgroundWork(db_.get());
   Status s =
       db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr);
   ASSERT_TRUE(s.IsShutdownInProgress());
@@ -5538,7 +5559,7 @@ TEST_P(DBTestWithParam, PreShutdownMultipleCompaction) {
 
   TEST_SYNC_POINT("DBTest::PreShutdownMultipleCompaction:Preshutdown");
   ASSERT_GE(operation_count[ThreadStatus::OP_COMPACTION], 1);
-  CancelAllBackgroundWork(db_);
+  CancelAllBackgroundWork(db_.get());
   TEST_SYNC_POINT("DBTest::PreShutdownMultipleCompaction:VerifyPreshutdown");
   ASSERT_OK(dbfull()->TEST_WaitForBackgroundWork());
   // Record the number of compactions at a time.
@@ -5624,7 +5645,7 @@ TEST_P(DBTestWithParam, PreShutdownCompactionMiddle) {
   }
 
   ASSERT_GE(operation_count[ThreadStatus::OP_COMPACTION], 1);
-  CancelAllBackgroundWork(db_);
+  CancelAllBackgroundWork(db_.get());
   TEST_SYNC_POINT("DBTest::PreShutdownCompactionMiddle:Preshutdown");
   TEST_SYNC_POINT("DBTest::PreShutdownCompactionMiddle:VerifyPreshutdown");
   ASSERT_OK(dbfull()->TEST_WaitForBackgroundWork());
@@ -5645,7 +5666,7 @@ TEST_F(DBTest, FlushOnDestroy) {
   WriteOptions wo;
   wo.disableWAL = true;
   ASSERT_OK(Put("foo", "v1", wo));
-  CancelAllBackgroundWork(db_);
+  CancelAllBackgroundWork(db_.get());
 }
 
 TEST_F(DBTest, DynamicCompactionOptions) {
@@ -6513,7 +6534,7 @@ TEST_F(DBTest, SuggestCompactRangeTest) {
 
   // compact it three times
   for (int i = 0; i < 3; ++i) {
-    ASSERT_OK(experimental::SuggestCompactRange(db_, nullptr, nullptr));
+    ASSERT_OK(experimental::SuggestCompactRange(db_.get(), nullptr, nullptr));
     ASSERT_OK(dbfull()->TEST_WaitForCompact());
   }
 
@@ -6526,7 +6547,7 @@ TEST_F(DBTest, SuggestCompactRangeTest) {
 
   // nonoverlapping with the file on level 0
   Slice start("a"), end("b");
-  ASSERT_OK(experimental::SuggestCompactRange(db_, &start, &end));
+  ASSERT_OK(experimental::SuggestCompactRange(db_.get(), &start, &end));
   ASSERT_OK(dbfull()->TEST_WaitForCompact());
 
   // should not compact the level 0 file
@@ -6534,7 +6555,7 @@ TEST_F(DBTest, SuggestCompactRangeTest) {
 
   start = Slice("j");
   end = Slice("m");
-  ASSERT_OK(experimental::SuggestCompactRange(db_, &start, &end));
+  ASSERT_OK(experimental::SuggestCompactRange(db_.get(), &start, &end));
   ASSERT_OK(dbfull()->TEST_WaitForCompact());
   // SuggestCompactRange() is not going to be reported as manual compaction
   ASSERT_TRUE(!CompactionFilterFactoryGetContext::IsManual(
@@ -6585,7 +6606,7 @@ TEST_F(DBTest, SuggestCompactRangeUniversal) {
 
   // nonoverlapping with the file on level 0
   Slice start("a"), end("b");
-  ASSERT_OK(experimental::SuggestCompactRange(db_, &start, &end));
+  ASSERT_OK(experimental::SuggestCompactRange(db_.get(), &start, &end));
   ASSERT_OK(dbfull()->TEST_WaitForCompact());
 
   // should not compact the level 0 file
@@ -6593,7 +6614,7 @@ TEST_F(DBTest, SuggestCompactRangeUniversal) {
 
   start = Slice("j");
   end = Slice("m");
-  ASSERT_OK(experimental::SuggestCompactRange(db_, &start, &end));
+  ASSERT_OK(experimental::SuggestCompactRange(db_.get(), &start, &end));
   ASSERT_OK(dbfull()->TEST_WaitForCompact());
 
   // now it should compact the level 0 file to the last level
@@ -6630,7 +6651,7 @@ TEST_F(DBTest, PromoteL0) {
   ASSERT_EQ(NumTableFilesAtLevel(1, 0), 0);  // No files in L1
 
   // Promote L0 level to L2.
-  ASSERT_OK(experimental::PromoteL0(db_, db_->DefaultColumnFamily(), 2));
+  ASSERT_OK(experimental::PromoteL0(db_.get(), db_->DefaultColumnFamily(), 2));
   // We expect that all the files were trivially moved from L0 to L2
   ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0);
   ASSERT_EQ(NumTableFilesAtLevel(2, 0), level0_files);
@@ -6655,7 +6676,7 @@ TEST_F(DBTest, PromoteL0Failure) {
 
   Status status;
   // Fails because L0 has overlapping files.
-  status = experimental::PromoteL0(db_, db_->DefaultColumnFamily());
+  status = experimental::PromoteL0(db_.get(), db_->DefaultColumnFamily());
   ASSERT_TRUE(status.IsInvalidArgument());
 
   ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
@@ -6665,7 +6686,7 @@ TEST_F(DBTest, PromoteL0Failure) {
   ASSERT_OK(Put(Key(5), ""));
   ASSERT_OK(Flush());
   // Fails because L1 is non-empty.
-  status = experimental::PromoteL0(db_, db_->DefaultColumnFamily());
+  status = experimental::PromoteL0(db_.get(), db_->DefaultColumnFamily());
   ASSERT_TRUE(status.IsInvalidArgument());
 }
 
@@ -7736,7 +7757,7 @@ TEST_F(DBTest, ShuttingDownNotBlockStalledWrites) {
   });
 
   TEST_SYNC_POINT("DBTest::ShuttingDownNotBlockStalledWrites");
-  CancelAllBackgroundWork(db_, true);
+  CancelAllBackgroundWork(db_.get(), true);
 
   thd.join();
 }
diff --git a/db/db_test2.cc b/db/db_test2.cc
index 67230d846a29..6129e2d923b8 100644
--- a/db/db_test2.cc
+++ b/db/db_test2.cc
@@ -39,7 +39,7 @@ class DBTest2 : public DBTestBase {
 };
 
 TEST_F(DBTest2, OpenForReadOnly) {
-  DB* db_ptr = nullptr;
+  std::unique_ptr<DB> db_ptr;
   std::string dbname = test::PerThreadDBPath("db_readonly");
   Options options = CurrentOptions();
   options.create_if_missing = true;
@@ -63,7 +63,7 @@ TEST_F(DBTest2, OpenForReadOnly) {
 }
 
 TEST_F(DBTest2, OpenForReadOnlyWithColumnFamilies) {
-  DB* db_ptr = nullptr;
+  std::unique_ptr<DB> db_ptr;
   std::string dbname = test::PerThreadDBPath("db_readonly");
   Options options = CurrentOptions();
   options.create_if_missing = true;
@@ -349,9 +349,9 @@ TEST_P(DBTestSharedWriteBufferAcrossCFs, SharedWriteBufferAcrossCFs) {
   ASSERT_OK(Put(3, Key(1), DummyString(1), wo));
   ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
   ASSERT_OK(Flush(0));
-  ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+  ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "default"),
             static_cast<uint64_t>(1));
-  ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+  ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "nikitich"),
             static_cast<uint64_t>(1));
 
   flush_listener->expected_flush_reason = FlushReason::kWriteBufferManager;
@@ -371,13 +371,13 @@ TEST_P(DBTestSharedWriteBufferAcrossCFs, SharedWriteBufferAcrossCFs) {
   // No flush should trigger
   wait_flush();
   {
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "default"),
               static_cast<uint64_t>(1));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "pikachu"),
               static_cast<uint64_t>(0));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "dobrynia"),
               static_cast<uint64_t>(0));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "nikitich"),
               static_cast<uint64_t>(1));
   }
 
@@ -387,13 +387,13 @@ TEST_P(DBTestSharedWriteBufferAcrossCFs, SharedWriteBufferAcrossCFs) {
   ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
   wait_flush();
   {
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "default"),
               static_cast<uint64_t>(1));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "pikachu"),
               static_cast<uint64_t>(0));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "dobrynia"),
               static_cast<uint64_t>(0));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "nikitich"),
               static_cast<uint64_t>(2));
   }
 
@@ -405,13 +405,13 @@ TEST_P(DBTestSharedWriteBufferAcrossCFs, SharedWriteBufferAcrossCFs) {
   ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
   wait_flush();
   {
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "default"),
               static_cast<uint64_t>(1));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "pikachu"),
               static_cast<uint64_t>(0));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "dobrynia"),
               static_cast<uint64_t>(0));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "nikitich"),
               static_cast<uint64_t>(2));
   }
 
@@ -428,13 +428,13 @@ TEST_P(DBTestSharedWriteBufferAcrossCFs, SharedWriteBufferAcrossCFs) {
   ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
   wait_flush();
   {
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "default"),
               static_cast<uint64_t>(2));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "pikachu"),
               static_cast<uint64_t>(0));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "dobrynia"),
               static_cast<uint64_t>(0));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "nikitich"),
               static_cast<uint64_t>(2));
   }
 
@@ -450,13 +450,13 @@ TEST_P(DBTestSharedWriteBufferAcrossCFs, SharedWriteBufferAcrossCFs) {
   wait_flush();
 
   {
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "default"),
               static_cast<uint64_t>(2));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "pikachu"),
               static_cast<uint64_t>(0));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "dobrynia"),
               static_cast<uint64_t>(1));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "nikitich"),
               static_cast<uint64_t>(2));
   }
   if (cost_cache_) {
@@ -506,7 +506,7 @@ TEST_F(DBTest2, SharedWriteBufferLimitAcrossDB) {
   CreateAndReopenWithCF({"cf1", "cf2"}, options);
 
   ASSERT_OK(DestroyDB(dbname2, options));
-  DB* db2 = nullptr;
+  std::unique_ptr<DB> db2;
   ASSERT_OK(DB::Open(options, dbname2, &db2));
 
   WriteOptions wo;
@@ -516,12 +516,12 @@ TEST_F(DBTest2, SharedWriteBufferLimitAcrossDB) {
     ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[0]));
     ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
     ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[2]));
-    ASSERT_OK(static_cast<DBImpl*>(db2)->TEST_WaitForFlushMemTable());
+    ASSERT_OK(static_cast<DBImpl*>(db2.get())->TEST_WaitForFlushMemTable());
     // Ensure background work is fully finished including listener callbacks
     // before accessing listener state.
     ASSERT_OK(dbfull()->TEST_WaitForBackgroundWork());
-    ASSERT_OK(
-        static_cast_with_check<DBImpl>(db2)->TEST_WaitForBackgroundWork());
+    ASSERT_OK(static_cast_with_check<DBImpl>(db2.get())
+                  ->TEST_WaitForBackgroundWork());
   };
 
   // Trigger a flush on cf2
@@ -537,13 +537,13 @@ TEST_F(DBTest2, SharedWriteBufferLimitAcrossDB) {
 
   ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
   wait_flush();
-  ASSERT_OK(static_cast<DBImpl*>(db2)->TEST_WaitForFlushMemTable());
+  ASSERT_OK(static_cast<DBImpl*>(db2.get())->TEST_WaitForFlushMemTable());
   {
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default") +
-                  GetNumberOfSstFilesForColumnFamily(db_, "cf1") +
-                  GetNumberOfSstFilesForColumnFamily(db_, "cf2"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "default") +
+                  GetNumberOfSstFilesForColumnFamily(db_.get(), "cf1") +
+                  GetNumberOfSstFilesForColumnFamily(db_.get(), "cf2"),
               static_cast<uint64_t>(1));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db2, "default"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db2.get(), "default"),
               static_cast<uint64_t>(0));
   }
 
@@ -553,13 +553,13 @@ TEST_F(DBTest2, SharedWriteBufferLimitAcrossDB) {
   ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
   wait_flush();
   {
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "default"),
               static_cast<uint64_t>(1));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf1"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "cf1"),
               static_cast<uint64_t>(0));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf2"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "cf2"),
               static_cast<uint64_t>(1));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db2, "default"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db2.get(), "default"),
               static_cast<uint64_t>(0));
   }
 
@@ -568,19 +568,19 @@ TEST_F(DBTest2, SharedWriteBufferLimitAcrossDB) {
   wait_flush();
   ASSERT_OK(db2->Put(wo, Key(1), DummyString(1)));
   wait_flush();
-  ASSERT_OK(static_cast<DBImpl*>(db2)->TEST_WaitForFlushMemTable());
+  ASSERT_OK(static_cast<DBImpl*>(db2.get())->TEST_WaitForFlushMemTable());
   {
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "default"),
               static_cast<uint64_t>(1));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf1"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "cf1"),
               static_cast<uint64_t>(0));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf2"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "cf2"),
               static_cast<uint64_t>(1));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db2, "default"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db2.get(), "default"),
               static_cast<uint64_t>(1));
   }
 
-  delete db2;
+  db2.reset();
   ASSERT_OK(DestroyDB(dbname2, options));
 
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
@@ -785,7 +785,7 @@ TEST_F(DBTest2, WalFilterTest) {
     while (true) {
       // Ensure that expected keys exists
       // and not expected keys don't exist after recovery
-      ValidateKeyExistence(db_, keys_must_exist, keys_must_not_exist);
+      ValidateKeyExistence(db_.get(), keys_must_exist, keys_must_not_exist);
 
       if (checked_after_reopen) {
         break;
@@ -922,7 +922,7 @@ TEST_F(DBTest2, WalFilterTestWithChangeBatch) {
   while (true) {
     // Ensure that expected keys exists
     // and not expected keys don't exist after recovery
-    ValidateKeyExistence(db_, keys_must_exist, keys_must_not_exist);
+    ValidateKeyExistence(db_.get(), keys_must_exist, keys_must_not_exist);
 
     if (checked_after_reopen) {
       break;
@@ -1004,7 +1004,7 @@ TEST_F(DBTest2, WalFilterTestWithChangeBatchExtraKeys) {
     }
   }
 
-  ValidateKeyExistence(db_, keys_must_exist, keys_must_not_exist);
+  ValidateKeyExistence(db_.get(), keys_must_exist, keys_must_not_exist);
 }
 
 TEST_F(DBTest2, WalFilterTestWithColumnFamilies) {
@@ -1292,7 +1292,7 @@ TEST_F(DBTest2, DuplicateSnapshot) {
   Options options;
   options = CurrentOptions(options);
   std::vector<const Snapshot*> snapshots;
-  DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+  DBImpl* dbi = dbfull();
   SequenceNumber oldest_ww_snap, first_ww_snap;
 
   ASSERT_OK(Put("k", "v"));  // inc seq
@@ -3694,16 +3694,16 @@ TEST_F(DBTest2, TraceAndReplay) {
 
   // Using a different name than db2, to pacify infer's use-after-lifetime
   // warnings (http://fbinfer.com).
-  DB* db2_init = nullptr;
+  std::unique_ptr<DB> db2_init;
   options.create_if_missing = true;
   ASSERT_OK(DB::Open(options, dbname2, &db2_init));
   ColumnFamilyHandle* cf;
   ASSERT_OK(
       db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf));
   delete cf;
-  delete db2_init;
+  db2_init.reset();
 
-  DB* db2 = nullptr;
+  std::unique_ptr<DB> db2;
   std::vector<ColumnFamilyDescriptor> column_families;
   ColumnFamilyOptions cf_options;
   cf_options.merge_operator = MergeOperators::CreatePutOperator();
@@ -3790,7 +3790,7 @@ TEST_F(DBTest2, TraceAndReplay) {
   for (auto handle : handles) {
     delete handle;
   }
-  delete db2;
+  db2.reset();
   ASSERT_OK(DestroyDB(dbname2, options));
 }
 
@@ -3885,16 +3885,16 @@ TEST_F(DBTest2, TraceAndManualReplay) {
 
   // Using a different name than db2, to pacify infer's use-after-lifetime
   // warnings (http://fbinfer.com).
-  DB* db2_init = nullptr;
+  std::unique_ptr<DB> db2_init;
   options.create_if_missing = true;
   ASSERT_OK(DB::Open(options, dbname2, &db2_init));
   ColumnFamilyHandle* cf;
   ASSERT_OK(
       db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf));
   delete cf;
-  delete db2_init;
+  db2_init.reset();
 
-  DB* db2 = nullptr;
+  std::unique_ptr<DB> db2;
   std::vector<ColumnFamilyDescriptor> column_families;
   ColumnFamilyOptions cf_options;
   cf_options.merge_operator = MergeOperators::CreatePutOperator();
@@ -4130,7 +4130,7 @@ TEST_F(DBTest2, TraceAndManualReplay) {
   for (auto handle : handles) {
     delete handle;
   }
-  delete db2;
+  db2.reset();
   ASSERT_OK(DestroyDB(dbname2, options));
 }
 
@@ -4161,16 +4161,16 @@ TEST_F(DBTest2, TraceWithLimit) {
 
   // Using a different name than db2, to pacify infer's use-after-lifetime
   // warnings (http://fbinfer.com).
-  DB* db2_init = nullptr;
+  std::unique_ptr<DB> db2_init;
   options.create_if_missing = true;
   ASSERT_OK(DB::Open(options, dbname2, &db2_init));
   ColumnFamilyHandle* cf;
   ASSERT_OK(
       db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf));
   delete cf;
-  delete db2_init;
+  db2_init.reset();
 
-  DB* db2 = nullptr;
+  std::unique_ptr<DB> db2;
   std::vector<ColumnFamilyDescriptor> column_families;
   ColumnFamilyOptions cf_options;
   cf_options.merge_operator = MergeOperators::CreatePutOperator();
@@ -4203,7 +4203,7 @@ TEST_F(DBTest2, TraceWithLimit) {
   for (auto handle : handles) {
     delete handle;
   }
-  delete db2;
+  db2.reset();
   ASSERT_OK(DestroyDB(dbname2, options));
 }
 
@@ -4235,16 +4235,16 @@ TEST_F(DBTest2, TraceWithSampling) {
 
   // Using a different name than db2, to pacify infer's use-after-lifetime
   // warnings (http://fbinfer.com).
-  DB* db2_init = nullptr;
+  std::unique_ptr<DB> db2_init;
   options.create_if_missing = true;
   ASSERT_OK(DB::Open(options, dbname2, &db2_init));
   ColumnFamilyHandle* cf;
   ASSERT_OK(
       db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf));
   delete cf;
-  delete db2_init;
+  db2_init.reset();
 
-  DB* db2 = nullptr;
+  std::unique_ptr<DB> db2;
   std::vector<ColumnFamilyDescriptor> column_families;
   ColumnFamilyOptions cf_options;
   column_families.emplace_back("default", cf_options);
@@ -4279,7 +4279,7 @@ TEST_F(DBTest2, TraceWithSampling) {
   for (auto handle : handles) {
     delete handle;
   }
-  delete db2;
+  db2.reset();
   ASSERT_OK(DestroyDB(dbname2, options));
 }
 
@@ -4339,16 +4339,16 @@ TEST_F(DBTest2, TraceWithFilter) {
 
   // Using a different name than db2, to pacify infer's use-after-lifetime
   // warnings (http://fbinfer.com).
-  DB* db2_init = nullptr;
+  std::unique_ptr<DB> db2_init;
   options.create_if_missing = true;
   ASSERT_OK(DB::Open(options, dbname2, &db2_init));
   ColumnFamilyHandle* cf;
   ASSERT_OK(
       db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf));
   delete cf;
-  delete db2_init;
+  db2_init.reset();
 
-  DB* db2 = nullptr;
+  std::unique_ptr<DB> db2;
   std::vector<ColumnFamilyDescriptor> column_families;
   ColumnFamilyOptions cf_options;
   cf_options.merge_operator = MergeOperators::CreatePutOperator();
@@ -4384,28 +4384,28 @@ TEST_F(DBTest2, TraceWithFilter) {
   for (auto handle : handles) {
     delete handle;
   }
-  delete db2;
+  db2.reset();
   ASSERT_OK(DestroyDB(dbname2, options));
 
   // Set up a new db.
   std::string dbname3 = test::PerThreadDBPath(env_, "db_not_trace_read");
   ASSERT_OK(DestroyDB(dbname3, options));
 
-  DB* db3_init = nullptr;
+  std::unique_ptr<DB> db3_init;
   options.create_if_missing = true;
   ColumnFamilyHandle* cf3;
   ASSERT_OK(DB::Open(options, dbname3, &db3_init));
   ASSERT_OK(
       db3_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf3));
   delete cf3;
-  delete db3_init;
+  db3_init.reset();
 
   column_families.clear();
   column_families.emplace_back("default", cf_options);
   column_families.emplace_back("pikachu", ColumnFamilyOptions());
   handles.clear();
 
-  DB* db3 = nullptr;
+  std::unique_ptr<DB> db3;
   ASSERT_OK(DB::Open(db_opts, dbname3, column_families, &handles, &db3));
 
   env_->SleepForMicroseconds(100);
@@ -4435,7 +4435,7 @@ TEST_F(DBTest2, TraceWithFilter) {
   for (auto handle : handles) {
     delete handle;
   }
-  delete db3;
+  db3.reset();
   ASSERT_OK(DestroyDB(dbname3, options));
 
   std::unique_ptr<TraceReader> trace_reader3;
@@ -4626,7 +4626,7 @@ TEST_F(DBTest2, TestGetColumnFamilyHandleUnlocked) {
   CreateColumnFamilies({"test1", "test2"}, Options());
   ASSERT_EQ(handles_.size(), 2);
 
-  DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+  DBImpl* dbi = dbfull();
   port::Thread user_thread1([&]() {
     auto cfh = dbi->GetColumnFamilyHandleUnlocked(handles_[0]->GetID());
     ASSERT_EQ(cfh->GetID(), handles_[0]->GetID());
@@ -4830,7 +4830,7 @@ TEST_F(DBTest2, MultiDBParallelOpenTest) {
 
   // Verify empty DBs can be created in parallel
   std::vector<std::thread> open_threads;
-  std::vector<DB*> dbs{static_cast<unsigned int>(kNumDbs), nullptr};
+  std::vector<std::unique_ptr<DB>> dbs(kNumDbs);
   options.create_if_missing = true;
   for (int i = 0; i < kNumDbs; ++i) {
     open_threads.emplace_back(
@@ -4845,7 +4845,7 @@ TEST_F(DBTest2, MultiDBParallelOpenTest) {
   for (int i = 0; i < kNumDbs; ++i) {
     open_threads[i].join();
     ASSERT_OK(dbs[i]->Put(WriteOptions(), "xi", "gua"));
-    delete dbs[i];
+    dbs[i].reset();
   }
 
   // Verify non-empty DBs can be recovered in parallel
@@ -4861,7 +4861,7 @@ TEST_F(DBTest2, MultiDBParallelOpenTest) {
   // Wait and cleanup
   for (int i = 0; i < kNumDbs; ++i) {
     open_threads[i].join();
-    delete dbs[i];
+    dbs[i].reset();
     ASSERT_OK(DestroyDB(dbnames[i], options));
   }
 }
@@ -4922,8 +4922,7 @@ TEST_F(DBTest2, CloseWithUnreleasedSnapshot) {
   ASSERT_NOK(db_->Close());
   db_->ReleaseSnapshot(ss);
   ASSERT_OK(db_->Close());
-  delete db_;
-  db_ = nullptr;
+  db_.reset();
 }
 
 TEST_F(DBTest2, PrefixBloomReseek) {
@@ -6774,7 +6773,7 @@ TEST_F(DBTest2, CheckpointFileTemperature) {
 
   test_fs->PopRequestedSstFileTemperatures();
   Checkpoint* checkpoint;
-  ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+  ASSERT_OK(Checkpoint::Create(db_.get(), &checkpoint));
   ASSERT_OK(
       checkpoint->CreateCheckpoint(dbname_ + kFilePathSeparator + "tempcp"));
 
@@ -7536,7 +7535,7 @@ TEST_F(DBTest2, GetFileChecksumsFromCurrentManifest_CRC32) {
   opts.level0_file_num_compaction_trigger = 10;
 
   // Bootstrap the test database.
-  DB* db = nullptr;
+  std::unique_ptr<DB> db;
   std::string dbname = test::PerThreadDBPath("file_chksum");
   ASSERT_OK(DB::Open(opts, dbname, &db));
 
@@ -7570,8 +7569,7 @@ TEST_F(DBTest2, GetFileChecksumsFromCurrentManifest_CRC32) {
   db->GetLiveFilesMetaData(&live_files);
 
   ASSERT_OK(db->Close());
-  delete db;
-  db = nullptr;
+  db.reset();
 
   // Process current MANIFEST file and build internal file checksum mappings.
   std::unique_ptr<FileChecksumList> checksum_list(NewFileChecksumList());
diff --git a/db/db_test_util.cc b/db/db_test_util.cc
index 1d2c0f268dbd..d62807d265c4 100644
--- a/db/db_test_util.cc
+++ b/db/db_test_util.cc
@@ -97,7 +97,7 @@ DBTestBase::DBTestBase(const std::string path, bool env_do_fsync)
   EXPECT_OK(DestroyDB(dbname_, delete_options));
   // Destroy it for not alternative WAL dir is used.
   EXPECT_OK(DestroyDB(dbname_, options));
-  db_ = nullptr;
+  db_.reset();
   Reopen(options);
   Random::GetTLSInstance()->Reset(0xdeadbeef);
 }
@@ -664,7 +664,8 @@ Status DBTestBase::TryReopenWithColumnFamilies(
   DBOptions db_opts = DBOptions(options[0]);
   last_options_ = options[0];
   MaybeInstallTimeElapseOnlySleep(db_opts);
-  return DB::Open(db_opts, dbname_, column_families, &handles_, &db_);
+  Status s = DB::Open(db_opts, dbname_, column_families, &handles_, &db_);
+  return s;
 }
 
 Status DBTestBase::TryReopenWithColumnFamilies(
@@ -683,8 +684,7 @@ void DBTestBase::Close() {
     EXPECT_OK(db_->DestroyColumnFamilyHandle(h));
   }
   handles_.clear();
-  delete db_;
-  db_ = nullptr;
+  db_.reset();
 }
 
 void DBTestBase::DestroyAndReopen(const Options& options) {
@@ -709,7 +709,8 @@ void DBTestBase::Destroy(const Options& options, bool delete_cf_paths) {
 Status DBTestBase::ReadOnlyReopen(const Options& options) {
   Close();
   MaybeInstallTimeElapseOnlySleep(options);
-  return DB::OpenForReadOnly(options, dbname_, &db_);
+  Status s = DB::OpenForReadOnly(options, dbname_, &db_);
+  return s;
 }
 
 Status DBTestBase::EnforcedReadOnlyReopen(const Options& options) {
@@ -720,7 +721,8 @@ Status DBTestBase::EnforcedReadOnlyReopen(const Options& options) {
       std::make_shared<ReadOnlyFileSystem>(env_->GetFileSystem());
   env_read_only_ = std::make_shared<CompositeEnvWrapper>(env_, fs_read_only);
   options_copy.env = env_read_only_.get();
-  return DB::OpenForReadOnly(options_copy, dbname_, &db_);
+  Status s = DB::OpenForReadOnly(options_copy, dbname_, &db_);
+  return s;
 }
 
 Status DBTestBase::TryReopen(const Options& options) {
@@ -735,7 +737,8 @@ Status DBTestBase::TryReopen(const Options& options) {
   // clears the block cache.
   last_options_ = options;
   MaybeInstallTimeElapseOnlySleep(options);
-  return DB::Open(options, dbname_, &db_);
+  Status s = DB::Open(options, dbname_, &db_);
+  return s;
 }
 
 bool DBTestBase::IsDirectIOSupported() {
@@ -1162,7 +1165,7 @@ int DBTestBase::NumTableFilesAtLevel(int level, int cf) {
 int DBTestBase::NumTableFilesAtLevel(int level, ColumnFamilyHandle* cfh,
                                      DB* db) {
   if (!db) {
-    db = db_;
+    db = db_.get();
   }
   std::string property;
   EXPECT_TRUE(db->GetProperty(
@@ -1208,7 +1211,7 @@ std::string DBTestBase::FilesPerLevel(int cf) {
 
 std::string DBTestBase::FilesPerLevel(ColumnFamilyHandle* cfh, DB* db) {
   if (!db) {
-    db = db_;
+    db = db_.get();
   }
   int num_levels = db->NumberLevels(cfh);
   std::string result;
diff --git a/db/db_test_util.h b/db/db_test_util.h
index ad25a85b0336..44768f1d1c33 100644
--- a/db/db_test_util.h
+++ b/db/db_test_util.h
@@ -1072,7 +1072,7 @@ class DBTestBase : public testing::Test {
   SpecialEnv* env_;
   std::shared_ptr<Env> env_read_only_;
   std::shared_ptr<Env> env_guard_;
-  DB* db_;
+  std::unique_ptr<DB> db_;
   std::vector<ColumnFamilyHandle*> handles_;
 
   int option_config_;
@@ -1157,7 +1157,7 @@ class DBTestBase : public testing::Test {
                      const anon::OptionsOverride& options_override =
                          anon::OptionsOverride()) const;
 
-  DBImpl* dbfull() { return static_cast_with_check<DBImpl>(db_); }
+  DBImpl* dbfull() { return static_cast_with_check<DBImpl>(db_.get()); }
 
   void CreateColumnFamilies(const std::vector<std::string>& cfs,
                             const Options& options);
diff --git a/db/db_wal_test.cc b/db/db_wal_test.cc
index 641daeb0bfd6..1e9270db0dee 100644
--- a/db/db_wal_test.cc
+++ b/db/db_wal_test.cc
@@ -395,13 +395,13 @@ TEST_P(DBWALTestWithTimestamp, RecoverAndNoFlush) {
     read_opts.timestamp = &ts_slice;
     ASSERT_OK(CreateAndReopenWithTs({"pikachu"}, ts_options, persist_udt,
                                     avoid_flush_during_recovery));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), 0U);
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "pikachu"), 0U);
     ASSERT_OK(Put(1, "foo", ts1, "v1"));
     ASSERT_OK(Put(1, "baz", ts1, "v5"));
 
     ASSERT_OK(ReopenColumnFamiliesWithTs({"pikachu"}, ts_options, persist_udt,
                                          avoid_flush_during_recovery));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), 0U);
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "pikachu"), 0U);
     // Do a timestamped read with ts1 after second reopen.
     CheckGet(read_opts, 1, "foo", "v1", ts1);
     CheckGet(read_opts, 1, "baz", "v5", ts1);
@@ -415,7 +415,7 @@ TEST_P(DBWALTestWithTimestamp, RecoverAndNoFlush) {
 
     ASSERT_OK(ReopenColumnFamiliesWithTs({"pikachu"}, ts_options, persist_udt,
                                          avoid_flush_during_recovery));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), 0U);
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "pikachu"), 0U);
     std::string ts3;
     PutFixed64(&ts3, 3);
     ASSERT_OK(Put(1, "foo", ts3, "v4"));
@@ -466,14 +466,14 @@ TEST_P(DBWALTestWithTimestamp, RecoverAndFlush) {
 
   ASSERT_OK(CreateAndReopenWithTs({"pikachu"}, ts_options, persist_udt));
   // No flush, no sst files, because of no data.
-  ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), 0U);
+  ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "pikachu"), 0U);
   ASSERT_OK(Put(1, largest_ukey_without_ts, write_ts, "v1"));
   ASSERT_OK(Put(1, smallest_ukey_without_ts, write_ts, "v5"));
 
   ASSERT_OK(ReopenColumnFamiliesWithTs({"pikachu"}, ts_options, persist_udt));
   // Memtable recovered from WAL flushed because `avoid_flush_during_recovery`
   // defaults to false, created one L0 file.
-  ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), 1U);
+  ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "pikachu"), 1U);
 
   std::vector<std::vector<FileMetaData>> level_to_files;
   dbfull()->TEST_GetFilesMetaData(handles_[1], &level_to_files);
@@ -1347,7 +1347,7 @@ TEST_F(DBWALTest, RecoverCheckFileAmountWithSmallWriteBuffer) {
     auto tables = ListTableFiles(env_, dbname_);
     ASSERT_EQ(tables.size(), static_cast<size_t>(1));
     // Make sure 'dobrynia' was flushed: check sst files amount
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "dobrynia"),
               static_cast<uint64_t>(1));
   }
   // New WAL file
@@ -1363,16 +1363,16 @@ TEST_F(DBWALTest, RecoverCheckFileAmountWithSmallWriteBuffer) {
                            options);
   {
     // No inserts => default is empty
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "default"),
               static_cast<uint64_t>(0));
     // First 4 keys goes to separate SSTs + 1 more SST for 2 smaller keys
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "pikachu"),
               static_cast<uint64_t>(5));
     // 1 SST for big key + 1 SST for small one
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "dobrynia"),
               static_cast<uint64_t>(2));
     // 1 SST for all keys
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "nikitich"),
               static_cast<uint64_t>(1));
   }
 }
@@ -1401,7 +1401,7 @@ TEST_F(DBWALTest, RecoverCheckFileAmount) {
   {
     auto tables = ListTableFiles(env_, dbname_);
     ASSERT_EQ(tables.size(), static_cast<size_t>(1));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "nikitich"),
               static_cast<uint64_t>(1));
   }
   // Memtable for 'nikitich' has flushed, new WAL file has opened
@@ -1425,7 +1425,7 @@ TEST_F(DBWALTest, RecoverCheckFileAmount) {
   {
     auto tables = ListTableFiles(env_, dbname_);
     ASSERT_EQ(tables.size(), static_cast<size_t>(2));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "nikitich"),
               static_cast<uint64_t>(2));
   }
 
@@ -1437,13 +1437,13 @@ TEST_F(DBWALTest, RecoverCheckFileAmount) {
     // first, second and third WALs  went to the same SST.
     // So, there is 6 SSTs: three  for 'nikitich', one for 'default', one for
     // 'dobrynia', one for 'pikachu'
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "default"),
               static_cast<uint64_t>(1));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "nikitich"),
               static_cast<uint64_t>(3));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "dobrynia"),
               static_cast<uint64_t>(1));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_.get(), "pikachu"),
               static_cast<uint64_t>(1));
   }
 }
@@ -1521,9 +1521,9 @@ TEST_F(DBWALTest, DISABLED_RecycleMultipleWalsCrash) {
   // from an old incarnation of the WAL on recovery
   ASSERT_OK(db_->PauseBackgroundWork());
   ASSERT_OK(Put("ignore1", Random::GetTLSInstance()->RandomString(500)));
-  ASSERT_OK(static_cast_with_check<DBImpl>(db_)->TEST_SwitchMemtable());
+  ASSERT_OK(dbfull()->TEST_SwitchMemtable());
   ASSERT_OK(Put("ignore2", Random::GetTLSInstance()->RandomString(500)));
-  ASSERT_OK(static_cast_with_check<DBImpl>(db_)->TEST_SwitchMemtable());
+  ASSERT_OK(dbfull()->TEST_SwitchMemtable());
   ASSERT_OK(db_->ContinueBackgroundWork());
   ASSERT_OK(Flush());
   ASSERT_OK(Put("ignore3", Random::GetTLSInstance()->RandomString(500)));
@@ -1545,13 +1545,13 @@ TEST_F(DBWALTest, DISABLED_RecycleMultipleWalsCrash) {
   // gap in sequence numbers to interfere with recovery
   ASSERT_OK(db_->PauseBackgroundWork());
   ASSERT_OK(Put("key1", "val1"));
-  ASSERT_OK(static_cast_with_check<DBImpl>(db_)->TEST_SwitchMemtable());
+  ASSERT_OK(dbfull()->TEST_SwitchMemtable());
   ASSERT_OK(Put("key2", "val2"));
-  ASSERT_OK(static_cast_with_check<DBImpl>(db_)->TEST_SwitchMemtable());
+  ASSERT_OK(dbfull()->TEST_SwitchMemtable());
   // Need a gap in sequence numbers, so e.g. ingest external file
   // with an open snapshot
   {
-    ManagedSnapshot snapshot(db_);
+    ManagedSnapshot snapshot(db_.get());
     ASSERT_OK(
         db_->IngestExternalFile({external_file1}, IngestExternalFileOptions()));
   }
@@ -1560,7 +1560,7 @@ TEST_F(DBWALTest, DISABLED_RecycleMultipleWalsCrash) {
   // Need an SST file that is logically after that WAL, so that dropping WAL
   // data is not a valid point in time.
   {
-    ManagedSnapshot snapshot(db_);
+    ManagedSnapshot snapshot(db_.get());
     ASSERT_OK(
         db_->IngestExternalFile({external_file2}, IngestExternalFileOptions()));
   }
@@ -1655,10 +1655,10 @@ TEST_F(DBWALTest, SyncWalPartialFailure) {
   // with a single thread, to exercise as much logic as we reasonably can.
   ASSERT_OK(db_->PauseBackgroundWork());
   ASSERT_OK(Put("key1", "val1"));
-  ASSERT_OK(static_cast_with_check<DBImpl>(db_)->TEST_SwitchMemtable());
+  ASSERT_OK(dbfull()->TEST_SwitchMemtable());
   ASSERT_OK(db_->SyncWAL());
   ASSERT_OK(Put("key2", "val2"));
-  ASSERT_OK(static_cast_with_check<DBImpl>(db_)->TEST_SwitchMemtable());
+  ASSERT_OK(dbfull()->TEST_SwitchMemtable());
   ASSERT_OK(Put("key3", "val3"));
 
   // Allow 1 of the WALs to sync, but another won't
@@ -1879,9 +1879,11 @@ TEST_F(DBWALTest, TrackAndVerifyWALsRecycleWAL) {
   // Drop `Put("key1", "old_value")` in the first WAL
   ASSERT_OK(test::TruncateFile(options.env, log_name, 0 /* new_length */));
 
-  Status s = DB::Open(options, dbname_, &db_);
+  {
+    Status s = DB::Open(options, dbname_, &db_);
 
-  ASSERT_OK(s);
+    ASSERT_OK(s);
+  }
 
   ASSERT_EQ("wal_to_recycle", Get("key_ignore2"));
   ASSERT_EQ("NOT_FOUND", Get("key1"));
@@ -1979,7 +1981,10 @@ TEST_P(DBWALTrackAndVerifyWALsWithParamsTest, Basic) {
       ASSERT_OK(options.env->DeleteFile(second_log_name));
     }
 
-    Status s = DB::Open(options, dbname_, &db_);
+    Status s;
+    {
+      s = DB::Open(options, dbname_, &db_);
+    }
 
     if (i == 0) {
       ASSERT_OK(s);
@@ -2266,11 +2271,10 @@ TEST_F(DBWALTest, RaceInstallFlushResultsWithWalObsoletion) {
   SyncPoint::GetInstance()->DisableProcessing();
   SyncPoint::GetInstance()->ClearAllCallBacks();
 
-  DB* db1 = nullptr;
+  std::unique_ptr<DB> db1;
   Status s = DB::OpenForReadOnly(options, dbname_, &db1);
   ASSERT_OK(s);
   assert(db1);
-  delete db1;
 }
 
 TEST_F(DBWALTest, FixSyncWalOnObseletedWalWithNewManifestCausingMissingWAL) {
diff --git a/db/db_with_timestamp_basic_test.cc b/db/db_with_timestamp_basic_test.cc
index 983080eae78f..d4728e9811af 100644
--- a/db/db_with_timestamp_basic_test.cc
+++ b/db/db_with_timestamp_basic_test.cc
@@ -662,7 +662,7 @@ TEST_F(DBBasicTestWithTimestamp, TrimHistoryTest) {
   ASSERT_OK(db_->Put(WriteOptions(), "k1", Timestamp(4, 0), "v2"));
   ASSERT_OK(db_->Delete(WriteOptions(), "k1", Timestamp(5, 0)));
   ASSERT_OK(db_->Put(WriteOptions(), "k1", Timestamp(6, 0), "v3"));
-  check_value_by_ts(db_, "k1", Timestamp(7, 0), Status::OK(), "v3",
+  check_value_by_ts(db_.get(), "k1", Timestamp(7, 0), Status::OK(), "v3",
                     Timestamp(6, 0));
   ASSERT_OK(Flush());
   Close();
@@ -675,27 +675,27 @@ TEST_F(DBBasicTestWithTimestamp, TrimHistoryTest) {
   // Trim data whose version > Timestamp(5, 0), read(k1, ts(7)) <- NOT_FOUND.
   ASSERT_OK(DB::OpenAndTrimHistory(db_options, dbname_, column_families,
                                    &handles_, &db_, Timestamp(5, 0)));
-  check_value_by_ts(db_, "k1", Timestamp(7, 0), Status::NotFound(), "",
+  check_value_by_ts(db_.get(), "k1", Timestamp(7, 0), Status::NotFound(), "",
                     Timestamp(5, 0));
   Close();
 
   // Trim data whose timestamp > Timestamp(4, 0), read(k1, ts(7)) <- v2
   ASSERT_OK(DB::OpenAndTrimHistory(db_options, dbname_, column_families,
                                    &handles_, &db_, Timestamp(4, 0)));
-  check_value_by_ts(db_, "k1", Timestamp(7, 0), Status::OK(), "v2",
+  check_value_by_ts(db_.get(), "k1", Timestamp(7, 0), Status::OK(), "v2",
                     Timestamp(4, 0));
   Close();
 
   Reopen(options);
   ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "k1",
                              "k3", Timestamp(7, 0)));
-  check_value_by_ts(db_, "k1", Timestamp(8, 0), Status::NotFound(), "",
+  check_value_by_ts(db_.get(), "k1", Timestamp(8, 0), Status::NotFound(), "",
                     Timestamp(7, 0));
   Close();
   // Trim data whose timestamp > Timestamp(6, 0), read(k1, ts(8)) <- v2
   ASSERT_OK(DB::OpenAndTrimHistory(db_options, dbname_, column_families,
                                    &handles_, &db_, Timestamp(6, 0)));
-  check_value_by_ts(db_, "k1", Timestamp(8, 0), Status::OK(), "v2",
+  check_value_by_ts(db_.get(), "k1", Timestamp(8, 0), Status::OK(), "v2",
                     Timestamp(4, 0));
   Close();
 }
diff --git a/db/db_write_buffer_manager_test.cc b/db/db_write_buffer_manager_test.cc
index db4bf2b8a289..2eff1d397f7e 100644
--- a/db/db_write_buffer_manager_test.cc
+++ b/db/db_write_buffer_manager_test.cc
@@ -183,11 +183,11 @@ TEST_P(DBWriteBufferManagerTest, SharedWriteBufferAcrossCFs2) {
 // is waiting to be finished but DBs tries to write meanwhile.
 TEST_P(DBWriteBufferManagerTest, SharedWriteBufferLimitAcrossDB) {
   std::vector<std::string> dbnames;
-  std::vector<DB*> dbs;
+  std::vector<std::unique_ptr<DB>> dbs;
   int num_dbs = 3;
 
   for (int i = 0; i < num_dbs; i++) {
-    dbs.push_back(nullptr);
+    dbs.emplace_back();
     dbnames.push_back(
         test::PerThreadDBPath("db_shared_wb_db" + std::to_string(i)));
   }
@@ -266,7 +266,7 @@ TEST_P(DBWriteBufferManagerTest, SharedWriteBufferLimitAcrossDB) {
   //  Last writer will write and when its blocked it will signal Flush to
   //  continue to clear the stall.
 
-  threads.emplace_back(write_db, db_);
+  threads.emplace_back(write_db, db_.get());
   // Wait untill first DB is blocked and then create the multiple writers for
   // different DBs which will be blocked from getting added to the queue because
   // stall is in effect.
@@ -277,7 +277,7 @@ TEST_P(DBWriteBufferManagerTest, SharedWriteBufferLimitAcrossDB) {
     }
   }
   for (int i = 0; i < num_dbs; i++) {
-    threads.emplace_back(write_db, dbs[i]);
+    threads.emplace_back(write_db, dbs[i].get());
   }
   for (auto& t : threads) {
     t.join();
@@ -289,7 +289,7 @@ TEST_P(DBWriteBufferManagerTest, SharedWriteBufferLimitAcrossDB) {
   for (int i = 0; i < num_dbs; i++) {
     ASSERT_OK(dbs[i]->Close());
     ASSERT_OK(DestroyDB(dbnames[i], options));
-    delete dbs[i];
+    dbs[i].reset();
   }
 
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
@@ -300,11 +300,11 @@ TEST_P(DBWriteBufferManagerTest, SharedWriteBufferLimitAcrossDB) {
 // blocked when stall by WriteBufferManager is in effect.
 TEST_P(DBWriteBufferManagerTest, SharedWriteBufferLimitAcrossDB1) {
   std::vector<std::string> dbnames;
-  std::vector<DB*> dbs;
+  std::vector<std::unique_ptr<DB>> dbs;
   int num_dbs = 3;
 
   for (int i = 0; i < num_dbs; i++) {
-    dbs.push_back(nullptr);
+    dbs.emplace_back();
     dbnames.push_back(
         test::PerThreadDBPath("db_shared_wb_db" + std::to_string(i)));
   }
@@ -407,7 +407,7 @@ TEST_P(DBWriteBufferManagerTest, SharedWriteBufferLimitAcrossDB1) {
   //  |
   //  Last writer thread will write and when its blocked it will signal Flush to
   //  continue to clear the stall.
-  threads.emplace_back(write_db, db_);
+  threads.emplace_back(write_db, db_.get());
   // Wait untill first thread is blocked and then create the multiple writer
   // threads.
   {
@@ -421,7 +421,7 @@ TEST_P(DBWriteBufferManagerTest, SharedWriteBufferLimitAcrossDB1) {
     // Write to multiple columns of db_.
     writer_threads.emplace_back(write_cf, i % 3);
     // Write to different dbs.
-    threads.emplace_back(write_db, dbs[i]);
+    threads.emplace_back(write_db, dbs[i].get());
   }
   for (auto& t : threads) {
     t.join();
@@ -441,7 +441,7 @@ TEST_P(DBWriteBufferManagerTest, SharedWriteBufferLimitAcrossDB1) {
   for (int i = 0; i < num_dbs; i++) {
     ASSERT_OK(dbs[i]->Close());
     ASSERT_OK(DestroyDB(dbnames[i], options));
-    delete dbs[i];
+    dbs[i].reset();
   }
 
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
@@ -604,11 +604,11 @@ TEST_P(DBWriteBufferManagerTest, MixedSlowDownOptionsSingleDB) {
 // dbs by passing different values to WriteOption.no_slown_down.
 TEST_P(DBWriteBufferManagerTest, MixedSlowDownOptionsMultipleDB) {
   std::vector<std::string> dbnames;
-  std::vector<DB*> dbs;
+  std::vector<std::unique_ptr<DB>> dbs;
   int num_dbs = 4;
 
   for (int i = 0; i < num_dbs; i++) {
-    dbs.push_back(nullptr);
+    dbs.emplace_back();
     dbnames.push_back(
         test::PerThreadDBPath("db_shared_wb_db" + std::to_string(i)));
   }
@@ -732,7 +732,7 @@ TEST_P(DBWriteBufferManagerTest, MixedSlowDownOptionsMultipleDB) {
   //  |
   //  Last writer thread will write and when its blocked/return it will signal
   //  Flush to continue to clear the stall.
-  threads.emplace_back(write_slow_down, db_);
+  threads.emplace_back(write_slow_down, db_.get());
   // Wait untill first thread writing to DB is blocked and then
   // create the multiple writers.
   {
@@ -744,11 +744,11 @@ TEST_P(DBWriteBufferManagerTest, MixedSlowDownOptionsMultipleDB) {
 
   for (int i = 0; i < num_dbs; i += 2) {
     // Write to multiple columns of db_.
-    writer_threads.emplace_back(write_slow_down, db_);
-    writer_threads.emplace_back(write_no_slow_down, db_);
+    writer_threads.emplace_back(write_slow_down, db_.get());
+    writer_threads.emplace_back(write_no_slow_down, db_.get());
     // Write to different DBs.
-    threads.emplace_back(write_slow_down, dbs[i]);
-    threads.emplace_back(write_no_slow_down, dbs[i + 1]);
+    threads.emplace_back(write_slow_down, dbs[i].get());
+    threads.emplace_back(write_no_slow_down, dbs[i + 1].get());
   }
 
   for (auto& t : threads) {
@@ -773,7 +773,7 @@ TEST_P(DBWriteBufferManagerTest, MixedSlowDownOptionsMultipleDB) {
   for (int i = 0; i < num_dbs; i++) {
     ASSERT_OK(dbs[i]->Close());
     ASSERT_OK(DestroyDB(dbnames[i], options));
-    delete dbs[i];
+    dbs[i].reset();
   }
 
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
@@ -809,7 +809,7 @@ TEST_P(DBWriteBufferManagerTest, StopSwitchingMemTablesOnceFlushing) {
 
   Reopen(options);
   std::string dbname = test::PerThreadDBPath("db_shared_wbm_db");
-  DB* shared_wbm_db = nullptr;
+  std::unique_ptr<DB> shared_wbm_db;
 
   ASSERT_OK(DestroyDB(dbname, options));
   ASSERT_OK(DB::Open(options, dbname, &shared_wbm_db));
@@ -842,7 +842,7 @@ TEST_P(DBWriteBufferManagerTest, StopSwitchingMemTablesOnceFlushing) {
   sleeping_task_high.WaitUntilDone();
   ASSERT_OK(shared_wbm_db->Close());
   ASSERT_OK(DestroyDB(dbname, options));
-  delete shared_wbm_db;
+  shared_wbm_db.reset();
 }
 
 TEST_F(DBWriteBufferManagerTest, RuntimeChangeableAllowStall) {
diff --git a/db/error_handler_fs_test.cc b/db/error_handler_fs_test.cc
index 57c3c0dcdd88..26263011ffde 100644
--- a/db/error_handler_fs_test.cc
+++ b/db/error_handler_fs_test.cc
@@ -1550,7 +1550,7 @@ TEST_F(DBErrorHandlingFSTest, MultiDBCompactionError) {
   std::vector<FaultInjectionTestFS*> fault_fs;
   std::vector<Options> options;
   std::vector<std::shared_ptr<ErrorHandlerFSListener>> listener;
-  std::vector<DB*> db;
+  std::vector<std::unique_ptr<DB>> db;
   std::shared_ptr<SstFileManager> sfm(NewSstFileManager(def_env));
   int kNumDbInstances = 3;
   Random rnd(301);
@@ -1567,7 +1567,6 @@ TEST_F(DBErrorHandlingFSTest, MultiDBCompactionError) {
     options[i].writable_file_max_buffer_size = 32768;
     options[i].listeners.emplace_back(listener[i]);
     options[i].sst_file_manager = sfm;
-    DB* dbptr;
     char buf[16];
 
     listener[i]->EnableAutoRecovery();
@@ -1576,8 +1575,8 @@ TEST_F(DBErrorHandlingFSTest, MultiDBCompactionError) {
                                          IOStatus::NoSpace("Out of space"));
     snprintf(buf, sizeof(buf), "_%d", i);
     ASSERT_OK(DestroyDB(dbname_ + std::string(buf), options[i]));
-    ASSERT_OK(DB::Open(options[i], dbname_ + std::string(buf), &dbptr));
-    db.emplace_back(dbptr);
+    ASSERT_OK(
+        DB::Open(options[i], dbname_ + std::string(buf), &db.emplace_back()));
   }
 
   for (auto i = 0; i < kNumDbInstances; ++i) {
@@ -1609,7 +1608,7 @@ TEST_F(DBErrorHandlingFSTest, MultiDBCompactionError) {
   }
 
   for (auto i = 0; i < kNumDbInstances; ++i) {
-    Status s = static_cast<DBImpl*>(db[i])->TEST_WaitForCompact();
+    Status s = static_cast<DBImpl*>(db[i].get())->TEST_WaitForCompact();
     ASSERT_EQ(s.severity(), Status::Severity::kSoftError);
     fault_fs[i]->SetFilesystemActive(true);
   }
@@ -1618,7 +1617,7 @@ TEST_F(DBErrorHandlingFSTest, MultiDBCompactionError) {
   for (auto i = 0; i < kNumDbInstances; ++i) {
     std::string prop;
     ASSERT_EQ(listener[i]->WaitForRecovery(5000000), true);
-    ASSERT_OK(static_cast<DBImpl*>(db[i])->TEST_WaitForCompact());
+    ASSERT_OK(static_cast<DBImpl*>(db[i].get())->TEST_WaitForCompact());
     EXPECT_TRUE(db[i]->GetProperty(
         "rocksdb.num-files-at-level" + std::to_string(0), &prop));
     EXPECT_EQ(atoi(prop.c_str()), 0);
@@ -1634,7 +1633,7 @@ TEST_F(DBErrorHandlingFSTest, MultiDBCompactionError) {
   for (auto i = 0; i < kNumDbInstances; ++i) {
     char buf[16];
     snprintf(buf, sizeof(buf), "_%d", i);
-    delete db[i];
+    db[i].reset();
     fault_fs[i]->SetFilesystemActive(true);
     if (getenv("KEEP_DB")) {
       printf("DB is still at %s%s\n", dbname_.c_str(), buf);
@@ -1657,7 +1656,7 @@ TEST_F(DBErrorHandlingFSTest, MultiDBVariousErrors) {
   std::vector<FaultInjectionTestFS*> fault_fs;
   std::vector<Options> options;
   std::vector<std::shared_ptr<ErrorHandlerFSListener>> listener;
-  std::vector<DB*> db;
+  std::vector<std::unique_ptr<DB>> db;
   std::shared_ptr<SstFileManager> sfm(NewSstFileManager(def_env));
   int kNumDbInstances = 3;
   Random rnd(301);
@@ -1674,7 +1673,6 @@ TEST_F(DBErrorHandlingFSTest, MultiDBVariousErrors) {
     options[i].writable_file_max_buffer_size = 32768;
     options[i].listeners.emplace_back(listener[i]);
     options[i].sst_file_manager = sfm;
-    DB* dbptr;
     char buf[16];
 
     listener[i]->EnableAutoRecovery();
@@ -1695,8 +1693,8 @@ TEST_F(DBErrorHandlingFSTest, MultiDBVariousErrors) {
     }
     snprintf(buf, sizeof(buf), "_%d", i);
     ASSERT_OK(DestroyDB(dbname_ + std::string(buf), options[i]));
-    ASSERT_OK(DB::Open(options[i], dbname_ + std::string(buf), &dbptr));
-    db.emplace_back(dbptr);
+    ASSERT_OK(
+        DB::Open(options[i], dbname_ + std::string(buf), &db.emplace_back()));
   }
 
   for (auto i = 0; i < kNumDbInstances; ++i) {
@@ -1732,7 +1730,7 @@ TEST_F(DBErrorHandlingFSTest, MultiDBVariousErrors) {
   }
 
   for (auto i = 0; i < kNumDbInstances; ++i) {
-    Status s = static_cast<DBImpl*>(db[i])->TEST_WaitForCompact();
+    Status s = static_cast<DBImpl*>(db[i].get())->TEST_WaitForCompact();
     switch (i) {
       case 0:
         ASSERT_EQ(s.severity(), Status::Severity::kSoftError);
@@ -1754,7 +1752,7 @@ TEST_F(DBErrorHandlingFSTest, MultiDBVariousErrors) {
       ASSERT_EQ(listener[i]->WaitForRecovery(5000000), true);
     }
     if (i == 1) {
-      ASSERT_OK(static_cast<DBImpl*>(db[i])->TEST_WaitForCompact());
+      ASSERT_OK(static_cast<DBImpl*>(db[i].get())->TEST_WaitForCompact());
     }
     EXPECT_TRUE(db[i]->GetProperty(
         "rocksdb.num-files-at-level" + std::to_string(0), &prop));
@@ -1772,7 +1770,7 @@ TEST_F(DBErrorHandlingFSTest, MultiDBVariousErrors) {
     char buf[16];
     snprintf(buf, sizeof(buf), "_%d", i);
     fault_fs[i]->SetFilesystemActive(true);
-    delete db[i];
+    db[i].reset();
     if (getenv("KEEP_DB")) {
       printf("DB is still at %s%s\n", dbname_.c_str(), buf);
     } else {
diff --git a/db/external_sst_file_basic_test.cc b/db/external_sst_file_basic_test.cc
index c93806aaea09..326b3d567a09 100644
--- a/db/external_sst_file_basic_test.cc
+++ b/db/external_sst_file_basic_test.cc
@@ -2816,7 +2816,7 @@ TEST_F(ExternalSSTFileBasicTest, FailIfNotBottommostLevelAndDisallowMemtable) {
 
     {
       const Snapshot* snapshot = db_->GetSnapshot();
-      ManagedSnapshot snapshot_guard(db_, snapshot);
+      ManagedSnapshot snapshot_guard(db_.get(), snapshot);
       IngestExternalFileOptions ifo;
       ifo.fail_if_not_bottommost_level = true;
       ifo.snapshot_consistency = true;
diff --git a/db/external_sst_file_test.cc b/db/external_sst_file_test.cc
index ff0d15faa73a..e30405d50978 100644
--- a/db/external_sst_file_test.cc
+++ b/db/external_sst_file_test.cc
@@ -80,8 +80,7 @@ class ExternSSTFileLinkFailFallbackTest
   }
 
   void TearDown() override {
-    delete db_;
-    db_ = nullptr;
+    db_.reset();
     ASSERT_OK(DestroyDB(dbname_, options_));
   }
 
@@ -4040,7 +4039,7 @@ TEST_P(IngestDBGeneratedFileTest2, NotOverlapWithDB) {
       std::string db2_path = test::PerThreadDBPath("DB2");
       Options db2_options;
       db2_options.create_if_missing = true;
-      DB* db2 = nullptr;
+      std::unique_ptr<DB> db2;
       ASSERT_OK(DB::Open(db2_options, db2_path, &db2));
       // Write some base data.
       expected_value.emplace_back(rnd.RandomString(100));
@@ -4069,10 +4068,10 @@ TEST_P(IngestDBGeneratedFileTest2, NotOverlapWithDB) {
       ASSERT_OK(db_->DropColumnFamily(temp_cfh));
       ASSERT_OK(db_->DestroyColumnFamilyHandle(temp_cfh));
       ASSERT_OK(db2->Close());
-      delete db2;
+      db2.reset();
       ASSERT_OK(DB::Open(db2_options, db2_path, &db2));
       ASSERT_OK(db2->Close());
-      delete db2;
+      db2.reset();
       ASSERT_OK(DestroyDB(db2_path, db2_options));
     } else {
       ASSERT_OK(db_->DropColumnFamily(temp_cfh));
@@ -4135,6 +4134,7 @@ TEST_P(IngestDBGeneratedFileTest2, NonZeroSeqno) {
     // Create temp CF/DB
     Options temp_cf_opts;
     ColumnFamilyHandle* temp_cfh = nullptr;
+    std::unique_ptr<DB> temp_db_holder;
     DB* from_db = nullptr;
     std::string temp_db_name;
     // Using a separate DB also validates that latest sequence number
@@ -4155,10 +4155,11 @@ TEST_P(IngestDBGeneratedFileTest2, NonZeroSeqno) {
     if (use_temp_db) {
       temp_cf_opts.create_if_missing = true;
       temp_db_name = dbname_ + "/temp_db_" + std::to_string(rnd->Next());
-      ASSERT_OK(DB::Open(temp_cf_opts, temp_db_name, &from_db));
+      ASSERT_OK(DB::Open(temp_cf_opts, temp_db_name, &temp_db_holder));
+      from_db = temp_db_holder.get();
       temp_cfh = from_db->DefaultColumnFamily();
     } else {
-      from_db = db_;
+      from_db = db_.get();
       ASSERT_OK(
           from_db->CreateColumnFamily(temp_cf_opts, "temp_cf", &temp_cfh));
     }
@@ -4293,7 +4294,7 @@ TEST_P(IngestDBGeneratedFileTest2, NonZeroSeqno) {
     ASSERT_OK(db_->WaitForCompact({}));
     if (use_temp_db) {
       ASSERT_OK(from_db->Close());
-      delete from_db;
+      temp_db_holder.reset();
       ASSERT_OK(DestroyDB(temp_db_name, temp_cf_opts));
     } else {
       ASSERT_OK(db_->DropColumnFamily(temp_cfh));
@@ -4381,7 +4382,7 @@ TEST_P(IngestDBGeneratedFileTest2, ZeroAndNonZeroSeqno) {
 
     std::string temp_db_name =
         dbname_ + "/temp_db_" + std::to_string(rnd->Next());
-    DB* temp_db = nullptr;
+    std::unique_ptr<DB> temp_db;
     ASSERT_OK(DB::Open(temp_db_opts, temp_db_name, &temp_db));
 
     const Snapshot* snapshot = db_->GetSnapshot();
@@ -4444,7 +4445,7 @@ TEST_P(IngestDBGeneratedFileTest2, ZeroAndNonZeroSeqno) {
     cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
     ASSERT_OK(temp_db->CompactRange(cro, nullptr, nullptr));
     SCOPED_TRACE("Temp DB LSM: " +
-                 FilesPerLevel(temp_db->DefaultColumnFamily(), temp_db));
+                 FilesPerLevel(temp_db->DefaultColumnFamily(), temp_db.get()));
 
     // Base data from snapshot
     std::vector<std::string> sst_file_paths_zero_seqno;
@@ -4539,7 +4540,7 @@ TEST_P(IngestDBGeneratedFileTest2, ZeroAndNonZeroSeqno) {
     ASSERT_OK(db_->DestroyColumnFamilyHandle(live_write_cfh));
 
     ASSERT_OK(temp_db->Close());
-    delete temp_db;
+    temp_db.reset();
     ASSERT_OK(DestroyDB(temp_db_name, temp_db_opts));
   } while (ChangeOptions(kSkipPlainTable | kSkipFIFOCompaction));
 }
diff --git a/db/fault_injection_test.cc b/db/fault_injection_test.cc
index 3152c7635bea..9e7ec6ddd2ed 100644
--- a/db/fault_injection_test.cc
+++ b/db/fault_injection_test.cc
@@ -76,7 +76,7 @@ class FaultInjectionTest
   std::string dbname_;
   std::shared_ptr<Cache> tiny_cache_;
   Options options_;
-  DB* db_;
+  std::unique_ptr<DB> db_;
 
   FaultInjectionTest()
       : option_config_(std::get<1>(GetParam())),
@@ -260,10 +260,7 @@ class FaultInjectionTest
     return Slice(*storage);
   }
 
-  void CloseDB() {
-    delete db_;
-    db_ = nullptr;
-  }
+  void CloseDB() { db_.reset(); }
 
   Status OpenDB() {
     CloseDB();
@@ -348,7 +345,8 @@ class FaultInjectionTest
   }
 
   void WaitCompactionFinish() {
-    ASSERT_OK(static_cast<DBImpl*>(db_->GetRootDB())->TEST_WaitForCompact());
+    ASSERT_OK(static_cast_with_check<DBImpl>(db_->GetRootDB())
+                  ->TEST_WaitForCompact());
     ASSERT_OK(db_->Put(WriteOptions(), "", ""));
   }
 
diff --git a/db/forward_iterator_bench.cc b/db/forward_iterator_bench.cc
index b57b119e484a..ecab01168474 100644
--- a/db/forward_iterator_bench.cc
+++ b/db/forward_iterator_bench.cc
@@ -344,19 +344,18 @@ int main(int argc, char** argv) {
 
   status = ROCKSDB_NAMESPACE::DestroyDB(path, options);
   assert(status.ok());
-  ROCKSDB_NAMESPACE::DB* db_raw;
-  status = ROCKSDB_NAMESPACE::DB::Open(options, path, &db_raw);
+  std::unique_ptr<ROCKSDB_NAMESPACE::DB> db;
+  status = ROCKSDB_NAMESPACE::DB::Open(options, path, &db);
   assert(status.ok());
-  std::unique_ptr<ROCKSDB_NAMESPACE::DB> db(db_raw);
 
   std::vector<ShardState> shard_states(FLAGS_shards + 1);
   std::deque<Reader> readers;
   while (static_cast<int>(readers.size()) < FLAGS_readers) {
-    readers.emplace_back(&shard_states, db_raw);
+    readers.emplace_back(&shard_states, db.get());
   }
   std::deque<Writer> writers;
   while (static_cast<int>(writers.size()) < FLAGS_writers) {
-    writers.emplace_back(&shard_states, db_raw);
+    writers.emplace_back(&shard_states, db.get());
   }
 
   // Each shard gets a random reader and random writer assigned to it
@@ -367,7 +366,7 @@ int main(int argc, char** argv) {
     shard_states[i].writer = &writers[writer_dist(rng)];
   }
 
-  StatsThread stats_thread(db_raw);
+  StatsThread stats_thread(db.get());
   for (Writer& w : writers) {
     w.start();
   }
diff --git a/db/import_column_family_test.cc b/db/import_column_family_test.cc
index 5a0139017754..0a6f9d6a3905 100644
--- a/db/import_column_family_test.cc
+++ b/db/import_column_family_test.cc
@@ -371,7 +371,7 @@ TEST_F(ImportColumnFamilyTest, ImportExportedSSTFromAnotherCF) {
   ASSERT_OK(Flush(1));
 
   Checkpoint* checkpoint;
-  ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+  ASSERT_OK(Checkpoint::Create(db_.get(), &checkpoint));
   ASSERT_OK(checkpoint->ExportColumnFamily(handles_[1], export_files_dir_,
                                            &metadata_ptr_));
   ASSERT_NE(metadata_ptr_, nullptr);
@@ -481,14 +481,14 @@ TEST_F(ImportColumnFamilyTest, ImportExportedSSTFromAnotherDB) {
   ASSERT_OK(Flush(1));
 
   Checkpoint* checkpoint;
-  ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+  ASSERT_OK(Checkpoint::Create(db_.get(), &checkpoint));
   ASSERT_OK(checkpoint->ExportColumnFamily(handles_[1], export_files_dir_,
                                            &metadata_ptr_));
   ASSERT_NE(metadata_ptr_, nullptr);
   delete checkpoint;
 
   // Create a new db and import the files.
-  DB* db_copy;
+  std::unique_ptr<DB> db_copy;
   ASSERT_OK(DestroyDir(env_, dbname_ + "/db_copy"));
   ASSERT_OK(DB::Open(options, dbname_ + "/db_copy", &db_copy));
   ColumnFamilyHandle* cfh = nullptr;
@@ -504,7 +504,7 @@ TEST_F(ImportColumnFamilyTest, ImportExportedSSTFromAnotherDB) {
   }
   ASSERT_OK(db_copy->DropColumnFamily(cfh));
   ASSERT_OK(db_copy->DestroyColumnFamilyHandle(cfh));
-  delete db_copy;
+  db_copy.reset();
   ASSERT_OK(DestroyDir(env_, dbname_ + "/db_copy"));
 }
 
@@ -529,7 +529,7 @@ TEST_F(ImportColumnFamilyTest,
   ASSERT_OK(db_->DeleteRange(WriteOptions(), handles_[1], Key(0), Key(2)));
 
   Checkpoint* checkpoint;
-  ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+  ASSERT_OK(Checkpoint::Create(db_.get(), &checkpoint));
   ASSERT_OK(checkpoint->ExportColumnFamily(handles_[1], export_files_dir_,
                                            &metadata_ptr_));
   ASSERT_NE(metadata_ptr_, nullptr);
@@ -605,14 +605,14 @@ TEST_F(ImportColumnFamilyTest, LevelFilesOverlappingAtEndpoints) {
   ASSERT_GT(NumTableFilesAtLevel(1, 1), 1);
 
   Checkpoint* checkpoint;
-  ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+  ASSERT_OK(Checkpoint::Create(db_.get(), &checkpoint));
   ASSERT_OK(checkpoint->ExportColumnFamily(handles_[1], export_files_dir_,
                                            &metadata_ptr_));
   ASSERT_NE(metadata_ptr_, nullptr);
   delete checkpoint;
 
   // Create a new db and import the files.
-  DB* db_copy;
+  std::unique_ptr<DB> db_copy;
   ASSERT_OK(DestroyDir(env_, dbname_ + "/db_copy"));
   ASSERT_OK(DB::Open(options, dbname_ + "/db_copy", &db_copy));
   ColumnFamilyHandle* cfh = nullptr;
@@ -627,7 +627,7 @@ TEST_F(ImportColumnFamilyTest, LevelFilesOverlappingAtEndpoints) {
   }
   ASSERT_OK(db_copy->DropColumnFamily(cfh));
   ASSERT_OK(db_copy->DestroyColumnFamilyHandle(cfh));
-  delete db_copy;
+  db_copy.reset();
   ASSERT_OK(DestroyDir(env_, dbname_ + "/db_copy"));
   for (const Snapshot* snapshot : snapshots) {
     db_->ReleaseSnapshot(snapshot);
@@ -771,12 +771,12 @@ TEST_F(ImportColumnFamilyTest, ImportMultiColumnFamilyTest) {
 
   Checkpoint* checkpoint1;
   Checkpoint* checkpoint2;
-  ASSERT_OK(Checkpoint::Create(db_, &checkpoint1));
+  ASSERT_OK(Checkpoint::Create(db_.get(), &checkpoint1));
   ASSERT_OK(checkpoint1->ExportColumnFamily(handles_[1], export_files_dir_,
                                             &metadata_ptr_));
 
   // Create a new db and import the files.
-  DB* db_copy;
+  std::unique_ptr<DB> db_copy;
   ASSERT_OK(DestroyDir(env_, dbname_ + "/db_copy"));
   ASSERT_OK(DB::Open(options, dbname_ + "/db_copy", &db_copy));
   ColumnFamilyHandle* copy_cfh = nullptr;
@@ -796,7 +796,7 @@ TEST_F(ImportColumnFamilyTest, ImportMultiColumnFamilyTest) {
   ASSERT_OK(db_copy->Flush(FlushOptions()));
 
   // Flush again to create another L0 file. It should have higher sequencer.
-  ASSERT_OK(Checkpoint::Create(db_copy, &checkpoint2));
+  ASSERT_OK(Checkpoint::Create(db_copy.get(), &checkpoint2));
   ASSERT_OK(checkpoint2->ExportColumnFamily(copy_cfh, export_files_dir2_,
                                             &metadata_ptr2_));
 
@@ -826,7 +826,7 @@ TEST_F(ImportColumnFamilyTest, ImportMultiColumnFamilyTest) {
 
   ASSERT_OK(db_copy->DropColumnFamily(copy_cfh));
   ASSERT_OK(db_copy->DestroyColumnFamilyHandle(copy_cfh));
-  delete db_copy;
+  db_copy.reset();
   ASSERT_OK(DestroyDir(env_, dbname_ + "/db_copy"));
 }
 
@@ -840,12 +840,12 @@ TEST_F(ImportColumnFamilyTest, ImportMultiColumnFamilyWithOverlap) {
 
   Checkpoint* checkpoint1;
   Checkpoint* checkpoint2;
-  ASSERT_OK(Checkpoint::Create(db_, &checkpoint1));
+  ASSERT_OK(Checkpoint::Create(db_.get(), &checkpoint1));
   ASSERT_OK(checkpoint1->ExportColumnFamily(handles_[1], export_files_dir_,
                                             &metadata_ptr_));
 
   // Create a new db and import the files.
-  DB* db_copy;
+  std::unique_ptr<DB> db_copy;
   ASSERT_OK(DestroyDir(env_, dbname_ + "/db_copy"));
   ASSERT_OK(DB::Open(options, dbname_ + "/db_copy", &db_copy));
   ColumnFamilyHandle* copy_cfh = nullptr;
@@ -857,7 +857,7 @@ TEST_F(ImportColumnFamilyTest, ImportMultiColumnFamilyWithOverlap) {
   ASSERT_OK(db_copy->Flush(FlushOptions()));
 
   // Flush again to create another L0 file. It should have higher sequencer.
-  ASSERT_OK(Checkpoint::Create(db_copy, &checkpoint2));
+  ASSERT_OK(Checkpoint::Create(db_copy.get(), &checkpoint2));
   ASSERT_OK(checkpoint2->ExportColumnFamily(copy_cfh, export_files_dir2_,
                                             &metadata_ptr2_));
 
@@ -877,7 +877,7 @@ TEST_F(ImportColumnFamilyTest, ImportMultiColumnFamilyWithOverlap) {
 
   ASSERT_OK(db_copy->DropColumnFamily(copy_cfh));
   ASSERT_OK(db_copy->DestroyColumnFamilyHandle(copy_cfh));
-  delete db_copy;
+  db_copy.reset();
   ASSERT_OK(DestroyDir(env_, dbname_ + "/db_copy"));
 }
 
@@ -1017,7 +1017,7 @@ TEST_F(ImportColumnFamilyTest, AssignEpochNumberToMultipleCF) {
   // corruption where two L0 files can have the same epoch number but
   // with overlapping key range.
   Checkpoint* checkpoint1;
-  ASSERT_OK(Checkpoint::Create(db_, &checkpoint1));
+  ASSERT_OK(Checkpoint::Create(db_.get(), &checkpoint1));
   ASSERT_OK(checkpoint1->ExportColumnFamily(handles_[1], export_files_dir_,
                                             &metadata_ptr_));
   ASSERT_OK(checkpoint1->ExportColumnFamily(handles_[2], export_files_dir2_,
diff --git a/db/listener_test.cc b/db/listener_test.cc
index f587717c6f26..10ca451fb546 100644
--- a/db/listener_test.cc
+++ b/db/listener_test.cc
@@ -105,7 +105,7 @@ class TestCompactionListener : public EventListener {
     ASSERT_EQ(ci.output_files.size(), ci.output_file_infos.size());
 
     ASSERT_TRUE(test_);
-    ASSERT_EQ(test_->db_, db);
+    ASSERT_EQ(test_->db_.get(), db);
 
     std::vector<std::vector<FileMetaData>> files_by_level;
     test_->dbfull()->TEST_GetFilesMetaData(test_->handles_[ci.cf_id],
@@ -197,7 +197,7 @@ TEST_F(EventListenerTest, OnSingleDBCompactionTest) {
 
   ASSERT_EQ(listener->compacted_dbs_.size(), cf_names.size());
   for (size_t i = 0; i < cf_names.size(); ++i) {
-    ASSERT_EQ(listener->compacted_dbs_[i], db_);
+    ASSERT_EQ(listener->compacted_dbs_[i], db_.get());
   }
 }
 
@@ -268,7 +268,7 @@ class TestFlushListener : public EventListener {
     // that assumption does not hold (see the test case MultiDBMultiListeners
     // below).
     ASSERT_TRUE(test_);
-    if (db == test_->db_) {
+    if (db == test_->db_.get()) {
       std::vector<std::vector<FileMetaData>> files_by_level;
       ASSERT_LT(info.cf_id, test_->handles_.size());
       ASSERT_GE(info.cf_id, 0u);
@@ -343,7 +343,7 @@ TEST_F(EventListenerTest, OnSingleDBFlushTest) {
 
   // make sure callback functions are called in the right order
   for (size_t i = 0; i < cf_names.size(); ++i) {
-    ASSERT_EQ(listener->flushed_dbs_[i], db_);
+    ASSERT_EQ(listener->flushed_dbs_[i], db_.get());
     ASSERT_EQ(listener->flushed_column_family_names_[i], cf_names[i]);
   }
 }
@@ -387,7 +387,7 @@ TEST_F(EventListenerTest, MultiCF) {
       // make sure callback functions are called in the right order
       if (i == 7) {
         for (size_t j = 0; j < cf_names.size(); j++) {
-          ASSERT_EQ(listener->flushed_dbs_[j], db_);
+          ASSERT_EQ(listener->flushed_dbs_[j], db_.get());
           ASSERT_EQ(listener->flushed_column_family_names_[j], cf_names[j]);
         }
       }
@@ -422,22 +422,21 @@ TEST_F(EventListenerTest, MultiDBMultiListeners) {
   DBOptions db_opts(options);
   ColumnFamilyOptions cf_opts(options);
 
-  std::vector<DB*> dbs;
+  std::vector<std::unique_ptr<DB>> dbs;
   std::vector<std::vector<ColumnFamilyHandle*>> vec_handles;
 
   for (int d = 0; d < kNumDBs; ++d) {
     ASSERT_OK(DestroyDB(dbname_ + std::to_string(d), options));
-    DB* db;
+    ASSERT_OK(
+        DB::Open(options, dbname_ + std::to_string(d), &dbs.emplace_back()));
     std::vector<ColumnFamilyHandle*> handles;
-    ASSERT_OK(DB::Open(options, dbname_ + std::to_string(d), &db));
     for (size_t c = 0; c < cf_names.size(); ++c) {
       ColumnFamilyHandle* handle;
-      ASSERT_OK(db->CreateColumnFamily(cf_opts, cf_names[c], &handle));
+      ASSERT_OK(dbs.back()->CreateColumnFamily(cf_opts, cf_names[c], &handle));
       handles.push_back(handle);
     }
 
     vec_handles.push_back(std::move(handles));
-    dbs.push_back(db);
   }
 
   for (int d = 0; d < kNumDBs; ++d) {
@@ -450,23 +449,23 @@ TEST_F(EventListenerTest, MultiDBMultiListeners) {
   for (size_t c = 0; c < cf_names.size(); ++c) {
     for (int d = 0; d < kNumDBs; ++d) {
       ASSERT_OK(dbs[d]->Flush(FlushOptions(), vec_handles[d][c]));
-      ASSERT_OK(
-          static_cast_with_check<DBImpl>(dbs[d])->TEST_WaitForFlushMemTable());
+      ASSERT_OK(static_cast_with_check<DBImpl>(dbs[d].get())
+                    ->TEST_WaitForFlushMemTable());
     }
   }
 
   for (int d = 0; d < kNumDBs; ++d) {
     // Ensure background work is fully finished including listener callbacks
     // before accessing listener state.
-    ASSERT_OK(
-        static_cast_with_check<DBImpl>(dbs[d])->TEST_WaitForBackgroundWork());
+    ASSERT_OK(static_cast_with_check<DBImpl>(dbs[d].get())
+                  ->TEST_WaitForBackgroundWork());
   }
 
   for (auto* listener : listeners) {
     int pos = 0;
     for (size_t c = 0; c < cf_names.size(); ++c) {
       for (int d = 0; d < kNumDBs; ++d) {
-        ASSERT_EQ(listener->flushed_dbs_[pos], dbs[d]);
+        ASSERT_EQ(listener->flushed_dbs_[pos], dbs[d].get());
         ASSERT_EQ(listener->flushed_column_family_names_[pos], cf_names[c]);
         pos++;
       }
@@ -481,8 +480,8 @@ TEST_F(EventListenerTest, MultiDBMultiListeners) {
   }
   vec_handles.clear();
 
-  for (auto db : dbs) {
-    delete db;
+  for (auto& db : dbs) {
+    db.reset();
   }
 }
 
diff --git a/db/manual_compaction_test.cc b/db/manual_compaction_test.cc
index e84031065426..d740c7d2d630 100644
--- a/db/manual_compaction_test.cc
+++ b/db/manual_compaction_test.cc
@@ -98,7 +98,7 @@ class LogCompactionFilter : public CompactionFilter {
 
 TEST_F(ManualCompactionTest, CompactTouchesAllKeys) {
   for (int iter = 0; iter < 2; ++iter) {
-    DB* db;
+    std::unique_ptr<DB> db;
     Options options;
     if (iter == 0) {  // level compaction
       options.num_levels = 3;
@@ -128,7 +128,7 @@ TEST_F(ManualCompactionTest, CompactTouchesAllKeys) {
     delete itr;
 
     delete options.compaction_filter;
-    delete db;
+    db.reset();
     ASSERT_OK(DestroyDB(dbname_, options));
   }
 }
@@ -137,7 +137,7 @@ TEST_F(ManualCompactionTest, Test) {
   // Open database.  Disable compression since it affects the creation
   // of layers and the code below is trying to test against a very
   // specific scenario.
-  DB* db;
+  std::unique_ptr<DB> db;
   Options db_options;
   db_options.write_buffer_size = 1024;
   db_options.create_if_missing = true;
@@ -185,12 +185,12 @@ TEST_F(ManualCompactionTest, Test) {
   ASSERT_EQ(kNumKeys, num_keys) << "Bad number of keys";
 
   // close database
-  delete db;
+  db.reset();
   ASSERT_OK(DestroyDB(dbname_, Options()));
 }
 
 TEST_F(ManualCompactionTest, SkipLevel) {
-  DB* db;
+  std::unique_ptr<DB> db;
   Options options;
   options.level_compaction_dynamic_level_bytes = false;
   options.num_levels = 3;
@@ -298,7 +298,7 @@ TEST_F(ManualCompactionTest, SkipLevel) {
   }
 
   delete filter;
-  delete db;
+  db.reset();
   ASSERT_OK(DestroyDB(dbname_, options));
 }
 
diff --git a/db/memtable_list_test.cc b/db/memtable_list_test.cc
index 7065b125babe..c5589b2643a0 100644
--- a/db/memtable_list_test.cc
+++ b/db/memtable_list_test.cc
@@ -33,12 +33,12 @@ std::string ValueWithWriteTime(std::string value, uint64_t write_time) {
 class MemTableListTest : public testing::Test {
  public:
   std::string dbname;
-  DB* db;
+  std::unique_ptr<DB> db;
   Options options;
   std::vector<ColumnFamilyHandle*> handles;
   std::atomic<uint64_t> file_number;
 
-  MemTableListTest() : db(nullptr), file_number(1) {
+  MemTableListTest() : file_number(1) {
     dbname = test::PerThreadDBPath("memtable_list_test");
     options.create_if_missing = true;
     EXPECT_OK(DestroyDB(dbname, options));
@@ -88,8 +88,7 @@ class MemTableListTest : public testing::Test {
         }
       }
       handles.clear();
-      delete db;
-      db = nullptr;
+      db.reset();
       EXPECT_OK(DestroyDB(dbname, options, cf_descs));
     }
   }
diff --git a/db/merge_test.cc b/db/merge_test.cc
index 0592856b7353..5f3546d6ce93 100644
--- a/db/merge_test.cc
+++ b/db/merge_test.cc
@@ -19,6 +19,7 @@
 #include "rocksdb/utilities/db_ttl.h"
 #include "rocksdb/wide_columns.h"
 #include "test_util/testharness.h"
+#include "util/cast_util.h"
 #include "util/coding.h"
 #include "utilities/merge_operators.h"
 
@@ -96,9 +97,9 @@ class EnvMergeTest : public EnvWrapper {
 uint64_t EnvMergeTest::now_nanos_count_{0};
 std::unique_ptr<EnvMergeTest> EnvMergeTest::singleton_;
 
-std::shared_ptr<DB> OpenDb(const std::string& dbname, const bool ttl = false,
+std::unique_ptr<DB> OpenDb(const std::string& dbname, const bool ttl = false,
                            const size_t max_successive_merges = 0) {
-  DB* db;
+  std::unique_ptr<DB> db;
   Options options;
   options.create_if_missing = true;
   options.merge_operator = std::make_shared<CountMergeOperator>();
@@ -109,7 +110,7 @@ std::shared_ptr<DB> OpenDb(const std::string& dbname, const bool ttl = false,
   if (ttl) {
     DBWithTTL* db_with_ttl;
     s = DBWithTTL::Open(options, dbname, &db_with_ttl);
-    db = db_with_ttl;
+    db.reset(db_with_ttl);
   } else {
     s = DB::Open(options, dbname, &db);
   }
@@ -118,7 +119,7 @@ std::shared_ptr<DB> OpenDb(const std::string& dbname, const bool ttl = false,
   // Allowed to call NowNanos during DB creation (in GenerateRawUniqueId() for
   // session ID)
   EnvMergeTest::now_nanos_count_ = 0;
-  return std::shared_ptr<DB>(db);
+  return db;
 }
 
 // Imagine we are maintaining a set of uint64 counters.
@@ -128,7 +129,7 @@ std::shared_ptr<DB> OpenDb(const std::string& dbname, const bool ttl = false,
 // This is a quick implementation without a Merge operation.
 class Counters {
  protected:
-  std::shared_ptr<DB> db_;
+  UnownedPtr<DB> db_;
 
   WriteOptions put_option_;
   ReadOptions get_option_;
@@ -137,7 +138,7 @@ class Counters {
   uint64_t default_;
 
  public:
-  explicit Counters(std::shared_ptr<DB> db, uint64_t defaultCount = 0)
+  explicit Counters(UnownedPtr<DB> db, uint64_t defaultCount = 0)
       : db_(db),
         put_option_(),
         get_option_(),
@@ -242,7 +243,7 @@ class MergeBasedCounters : public Counters {
   WriteOptions merge_option_;  // for merge
 
  public:
-  explicit MergeBasedCounters(std::shared_ptr<DB> db, uint64_t defaultCount = 0)
+  explicit MergeBasedCounters(UnownedPtr<DB> db, uint64_t defaultCount = 0)
       : Counters(db, defaultCount), merge_option_() {}
 
   // mapped to a rocksdb Merge operation
@@ -261,7 +262,7 @@ class MergeBasedCounters : public Counters {
   }
 };
 
-void dumpDb(DB* db) {
+void dumpDb(const std::unique_ptr<DB>& db) {
   auto it = std::unique_ptr<Iterator>(db->NewIterator(ReadOptions()));
   for (it->SeekToFirst(); it->Valid(); it->Next()) {
     // uint64_t value = DecodeFixed64(it->value().data());
@@ -270,7 +271,8 @@ void dumpDb(DB* db) {
   assert(it->status().ok());  // Check for any errors found during the scan
 }
 
-void testCounters(Counters& counters, DB* db, bool test_compaction) {
+void testCounters(Counters& counters, const std::unique_ptr<DB>& db,
+                  bool test_compaction) {
   FlushOptions o;
   o.wait = true;
 
@@ -320,7 +322,8 @@ void testCounters(Counters& counters, DB* db, bool test_compaction) {
   }
 }
 
-void testCountersWithFlushAndCompaction(Counters& counters, DB* db) {
+void testCountersWithFlushAndCompaction(Counters& counters,
+                                        const std::unique_ptr<DB>& db) {
   ASSERT_OK(db->Put({}, "1", "1"));
   ASSERT_OK(db->Flush(FlushOptions()));
 
@@ -388,12 +391,12 @@ void testCountersWithFlushAndCompaction(Counters& counters, DB* db) {
   SyncPoint::GetInstance()->EnableProcessing();
 
   port::Thread set_options_thread([&]() {
-    ASSERT_OK(static_cast<DBImpl*>(db)->SetOptions(
+    ASSERT_OK(static_cast_with_check<DBImpl>(db.get())->SetOptions(
         {{"disable_auto_compactions", "false"}}));
   });
   TEST_SYNC_POINT("testCountersWithCompactionAndFlush:BeforeCompact");
   port::Thread compact_thread([&]() {
-    ASSERT_OK(static_cast<DBImpl*>(db)->CompactRange(
+    ASSERT_OK(static_cast_with_check<DBImpl>(db.get())->CompactRange(
         CompactRangeOptions(), db->DefaultColumnFamily(), nullptr, nullptr));
   });
 
@@ -440,8 +443,8 @@ void testSuccessiveMerge(Counters& counters, size_t max_num_merges,
   }
 }
 
-void testPartialMerge(Counters* counters, DB* db, size_t max_merge,
-                      size_t min_merge, size_t count) {
+void testPartialMerge(Counters* counters, const std::unique_ptr<DB>& db,
+                      size_t max_merge, size_t min_merge, size_t count) {
   FlushOptions o;
   o.wait = true;
 
@@ -481,8 +484,8 @@ void testPartialMerge(Counters* counters, DB* db, size_t max_merge,
   ASSERT_EQ(EnvMergeTest::now_nanos_count_, 0U);
 }
 
-void testSingleBatchSuccessiveMerge(DB* db, size_t max_num_merges,
-                                    size_t num_merges) {
+void testSingleBatchSuccessiveMerge(const std::unique_ptr<DB>& db,
+                                    size_t max_num_merges, size_t num_merges) {
   ASSERT_GT(num_merges, max_num_merges);
 
   Slice key("BatchSuccessiveMerge");
@@ -520,13 +523,13 @@ void runTest(const std::string& dbname, const bool use_ttl = false) {
     auto db = OpenDb(dbname, use_ttl);
 
     {
-      Counters counters(db, 0);
-      testCounters(counters, db.get(), true);
+      Counters counters(db.get(), 0);
+      testCounters(counters, db, true);
     }
 
     {
-      MergeBasedCounters counters(db, 0);
-      testCounters(counters, db.get(), use_compression);
+      MergeBasedCounters counters(db.get(), 0);
+      testCounters(counters, db, use_compression);
     }
   }
 
@@ -535,10 +538,10 @@ void runTest(const std::string& dbname, const bool use_ttl = false) {
   {
     size_t max_merge = 5;
     auto db = OpenDb(dbname, use_ttl, max_merge);
-    MergeBasedCounters counters(db, 0);
-    testCounters(counters, db.get(), use_compression);
+    MergeBasedCounters counters(db.get(), 0);
+    testCounters(counters, db, use_compression);
     testSuccessiveMerge(counters, max_merge, max_merge * 2);
-    testSingleBatchSuccessiveMerge(db.get(), 5, 7);
+    testSingleBatchSuccessiveMerge(db, 5, 7);
     ASSERT_OK(db->Close());
     ASSERT_OK(DestroyDB(dbname, Options()));
   }
@@ -549,16 +552,15 @@ void runTest(const std::string& dbname, const bool use_ttl = false) {
     uint32_t min_merge = 2;
     for (uint32_t count = min_merge - 1; count <= min_merge + 1; count++) {
       auto db = OpenDb(dbname, use_ttl, max_merge);
-      MergeBasedCounters counters(db, 0);
-      testPartialMerge(&counters, db.get(), max_merge, min_merge, count);
+      MergeBasedCounters counters(db.get(), 0);
+      testPartialMerge(&counters, db, max_merge, min_merge, count);
       ASSERT_OK(db->Close());
       ASSERT_OK(DestroyDB(dbname, Options()));
     }
     {
       auto db = OpenDb(dbname, use_ttl, max_merge);
-      MergeBasedCounters counters(db, 0);
-      testPartialMerge(&counters, db.get(), max_merge, min_merge,
-                       min_merge * 10);
+      MergeBasedCounters counters(db.get(), 0);
+      testPartialMerge(&counters, db, max_merge, min_merge, min_merge * 10);
       ASSERT_OK(db->Close());
       ASSERT_OK(DestroyDB(dbname, Options()));
     }
@@ -567,18 +569,18 @@ void runTest(const std::string& dbname, const bool use_ttl = false) {
   {
     {
       auto db = OpenDb(dbname);
-      MergeBasedCounters counters(db, 0);
+      MergeBasedCounters counters(db.get(), 0);
       counters.add("test-key", 1);
       counters.add("test-key", 1);
       counters.add("test-key", 1);
       ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr));
     }
 
-    DB* reopen_db;
+    std::unique_ptr<DB> reopen_db;
     ASSERT_OK(DB::Open(Options(), dbname, &reopen_db));
     std::string value;
     ASSERT_NOK(reopen_db->Get(ReadOptions(), "test-key", &value));
-    delete reopen_db;
+    reopen_db.reset();
     ASSERT_OK(DestroyDB(dbname, Options()));
   }
 
@@ -587,13 +589,13 @@ void runTest(const std::string& dbname, const bool use_ttl = false) {
     std::cout << "Test merge-operator not set after reopen (recovery case)\n";
     {
       auto db = OpenDb(dbname);
-      MergeBasedCounters counters(db, 0);
+      MergeBasedCounters counters(db.get(), 0);
       counters.add("test-key", 1);
       counters.add("test-key", 1);
       counters.add("test-key", 1);
     }
 
-    DB* reopen_db;
+    std::unique_ptr<DB> reopen_db;
     ASSERT_TRUE(DB::Open(Options(), dbname, &reopen_db).IsInvalidArgument());
   }
   */
@@ -614,8 +616,8 @@ TEST_F(MergeTest, MergeWithCompactionAndFlush) {
   {
     auto db = OpenDb(dbname);
     {
-      MergeBasedCounters counters(db, 0);
-      testCountersWithFlushAndCompaction(counters, db.get());
+      MergeBasedCounters counters(db.get(), 0);
+      testCountersWithFlushAndCompaction(counters, db);
     }
   }
   ASSERT_OK(DestroyDB(dbname, Options()));
diff --git a/db/obsolete_files_test.cc b/db/obsolete_files_test.cc
index 818bcc4b5901..7709a80fcc59 100644
--- a/db/obsolete_files_test.cc
+++ b/db/obsolete_files_test.cc
@@ -309,7 +309,8 @@ TEST_F(ObsoleteFilesTest, GetSortedWalFilesHangsAfterNoopPurge) {
 
   // Grab an iterator and flush to switch the super version. That way, when the
   // iterator is destroyed, it will go through the purge path.
-  DB* db = db_;  // Only using `db` makes it clear we only use DB-level APIs.
+  DB* db =
+      db_.get();  // Only using `db` makes it clear we only use DB-level APIs.
   ASSERT_OK(db->Put(WriteOptions(), "key", "value"));
   std::unique_ptr<Iterator> iter(db->NewIterator(ReadOptions()));
   ASSERT_OK(db->Flush(FlushOptions()));
diff --git a/db/options_file_test.cc b/db/options_file_test.cc
index 7e48f0cf38c1..f420d0dff4df 100644
--- a/db/options_file_test.cc
+++ b/db/options_file_test.cc
@@ -66,16 +66,16 @@ TEST_F(OptionsFileTest, NumberOfOptionsFiles) {
   opt.create_if_missing = true;
   ASSERT_OK(DestroyDB(dbname_, opt));
   std::unordered_set<std::string> filename_history;
-  DB* db;
+  std::unique_ptr<DB> db;
   for (int i = 0; i < kReopenCount; ++i) {
     ASSERT_OK(DB::Open(opt, dbname_, &db));
     int num_options_files = 0;
-    UpdateOptionsFiles(db, &filename_history, &num_options_files);
+    UpdateOptionsFiles(db.get(), &filename_history, &num_options_files);
     ASSERT_GT(num_options_files, 0);
     ASSERT_LE(num_options_files, 2);
     // Make sure we always keep the latest option files.
-    VerifyOptionsFileName(db, filename_history);
-    delete db;
+    VerifyOptionsFileName(db.get(), filename_history);
+    db.reset();
   }
 }
 
diff --git a/db/perf_context_test.cc b/db/perf_context_test.cc
index c439c1ffedf7..fbd38c6c26ee 100644
--- a/db/perf_context_test.cc
+++ b/db/perf_context_test.cc
@@ -38,8 +38,8 @@ const std::string kDbName =
 
 namespace ROCKSDB_NAMESPACE {
 
-std::shared_ptr<DB> OpenDb(bool read_only = false) {
-  DB* db;
+std::unique_ptr<DB> OpenDb(bool read_only = false) {
+  std::unique_ptr<DB> db;
   Options options;
   options.create_if_missing = true;
   options.max_open_files = -1;
@@ -61,7 +61,7 @@ std::shared_ptr<DB> OpenDb(bool read_only = false) {
     s = DB::OpenForReadOnly(options, kDbName, &db);
   }
   EXPECT_OK(s);
-  return std::shared_ptr<DB>(db);
+  return db;
 }
 
 class PerfContextTest : public testing::Test {};
@@ -659,12 +659,11 @@ TEST_F(PerfContextTest, ToString) {
 
 TEST_F(PerfContextTest, MergeOperatorTime) {
   ASSERT_OK(DestroyDB(kDbName, Options()));
-  DB* db;
+  std::unique_ptr<DB> db;
   Options options;
   options.create_if_missing = true;
   options.merge_operator = MergeOperators::CreateStringAppendOperator();
-  Status s = DB::Open(options, kDbName, &db);
-  EXPECT_OK(s);
+  EXPECT_OK(DB::Open(options, kDbName, &db));
 
   std::string val;
   ASSERT_OK(db->Merge(WriteOptions(), "k1", "val1"));
@@ -704,7 +703,7 @@ TEST_F(PerfContextTest, MergeOperatorTime) {
 #endif
   EXPECT_GT(get_perf_context()->merge_operator_time_nanos, 0);
 
-  delete db;
+  db.reset();
 }
 
 TEST_F(PerfContextTest, CopyAndMove) {
@@ -972,13 +971,12 @@ TEST_F(PerfContextTest, CPUTimer) {
 TEST_F(PerfContextTest, MergeOperandCount) {
   ASSERT_OK(DestroyDB(kDbName, Options()));
 
-  DB* db = nullptr;
   Options options;
   options.create_if_missing = true;
   options.merge_operator = MergeOperators::CreateStringAppendOperator();
 
+  std::unique_ptr<DB> db;
   ASSERT_OK(DB::Open(options, kDbName, &db));
-  std::unique_ptr<DB> db_guard(db);
 
   constexpr size_t num_keys = 3;
   const std::string key_prefix("key");
@@ -1007,7 +1005,7 @@ TEST_F(PerfContextTest, MergeOperandCount) {
     for (size_t j = 0; j <= i; ++j) {
       // Take a snapshot before each Merge so they are preserved and not
       // collapsed during flush.
-      snapshots.emplace_back(db);
+      snapshots.emplace_back(db.get());
 
       ASSERT_OK(db->Merge(WriteOptions(), keys[i], value + std::to_string(j)));
     }
@@ -1124,7 +1122,7 @@ TEST_F(PerfContextTest, MergeOperandCount) {
 TEST_F(PerfContextTest, WriteMemtableTimePerfLevel) {
   // Write and check time
   ASSERT_OK(DestroyDB(kDbName, Options()));
-  std::shared_ptr<DB> db = OpenDb();
+  auto db = OpenDb();
 
   SetPerfLevel(PerfLevel::kEnableWait);
   PerfContext* perf_ctx = get_perf_context();
diff --git a/db/periodic_task_scheduler_test.cc b/db/periodic_task_scheduler_test.cc
index e0904abe3dd0..5575333b095a 100644
--- a/db/periodic_task_scheduler_test.cc
+++ b/db/periodic_task_scheduler_test.cc
@@ -157,13 +157,13 @@ TEST_F(PeriodicTaskSchedulerTest, MultiInstances) {
                                         [&](void*) { pst_st_counter++; });
   SyncPoint::GetInstance()->EnableProcessing();
 
-  auto dbs = std::vector<DB*>(kInstanceNum);
+  auto dbs = std::vector<std::unique_ptr<DB>>(kInstanceNum);
   for (int i = 0; i < kInstanceNum; i++) {
     ASSERT_OK(
         DB::Open(options, test::PerThreadDBPath(std::to_string(i)), &(dbs[i])));
   }
 
-  auto dbi = static_cast_with_check<DBImpl>(dbs[kInstanceNum - 1]);
+  auto dbi = static_cast_with_check<DBImpl>(dbs[kInstanceNum - 1].get());
 
   const PeriodicTaskScheduler& scheduler = dbi->TEST_GetPeriodicTaskScheduler();
   // kRecordSeqnoTime is not registered since the feature is not enabled
@@ -190,7 +190,7 @@ TEST_F(PeriodicTaskSchedulerTest, MultiInstances) {
 
   int half = kInstanceNum / 2;
   for (int i = 0; i < half; i++) {
-    delete dbs[i];
+    dbs[i].reset();
   }
 
   expected_run += (kInstanceNum - half) * 2;
@@ -204,7 +204,7 @@ TEST_F(PeriodicTaskSchedulerTest, MultiInstances) {
 
   for (int i = half; i < kInstanceNum; i++) {
     ASSERT_OK(dbs[i]->Close());
-    delete dbs[i];
+    dbs[i].reset();
   }
 }
 
@@ -229,11 +229,11 @@ TEST_F(PeriodicTaskSchedulerTest, MultiEnv) {
   options1.env = mock_env2.get();
 
   std::string dbname = test::PerThreadDBPath("multi_env_test");
-  DB* db;
+  std::unique_ptr<DB> db;
   ASSERT_OK(DB::Open(options2, dbname, &db));
 
   ASSERT_OK(db->Close());
-  delete db;
+  db.reset();
   Close();
 }
 
diff --git a/db/plain_table_db_test.cc b/db/plain_table_db_test.cc
index 4645ae31b7c6..6e2909ca5159 100644
--- a/db/plain_table_db_test.cc
+++ b/db/plain_table_db_test.cc
@@ -98,7 +98,7 @@ class PlainTableDBTest : public testing::Test,
  private:
   std::string dbname_;
   Env* env_;
-  DB* db_;
+  std::unique_ptr<DB> db_;
 
   bool mmap_mode_;
   Options last_options_;
@@ -107,7 +107,7 @@ class PlainTableDBTest : public testing::Test,
   PlainTableDBTest() : env_(Env::Default()) {}
 
   ~PlainTableDBTest() override {
-    delete db_;
+    db_.reset();
     EXPECT_OK(DestroyDB(dbname_, Options()));
   }
 
@@ -115,7 +115,7 @@ class PlainTableDBTest : public testing::Test,
     mmap_mode_ = GetParam();
     dbname_ = test::PerThreadDBPath("plain_table_db_test");
     EXPECT_OK(DestroyDB(dbname_, Options()));
-    db_ = nullptr;
+    db_.reset();
     Reopen();
   }
 
@@ -144,14 +144,11 @@ class PlainTableDBTest : public testing::Test,
     return options;
   }
 
-  DBImpl* dbfull() { return static_cast_with_check<DBImpl>(db_); }
+  DBImpl* dbfull() { return static_cast_with_check<DBImpl>(db_.get()); }
 
   void Reopen(Options* options = nullptr) { ASSERT_OK(TryReopen(options)); }
 
-  void Close() {
-    delete db_;
-    db_ = nullptr;
-  }
+  void Close() { db_.reset(); }
 
   bool mmap_mode() const { return mmap_mode_; }
 
@@ -162,24 +159,21 @@ class PlainTableDBTest : public testing::Test,
   }
 
   void Destroy(Options* options) {
-    delete db_;
-    db_ = nullptr;
+    db_.reset();
     ASSERT_OK(DestroyDB(dbname_, *options));
   }
 
-  Status PureReopen(Options* options, DB** db) {
+  Status PureReopen(Options* options, std::unique_ptr<DB>* db) {
     return DB::Open(*options, dbname_, db);
   }
 
   Status ReopenForReadOnly(Options* options) {
-    delete db_;
-    db_ = nullptr;
+    db_.reset();
     return DB::OpenForReadOnly(*options, dbname_, &db_);
   }
 
   Status TryReopen(Options* options = nullptr) {
-    delete db_;
-    db_ = nullptr;
+    db_.reset();
     Options opts;
     if (options != nullptr) {
       opts = *options;
@@ -495,8 +489,7 @@ TEST_P(PlainTableDBTest, Flush) {
             ASSERT_GT(int_num, 0U);
 
             TablePropertiesCollection ptc;
-            ASSERT_OK(
-                static_cast<DB*>(dbfull())->GetPropertiesOfAllTables(&ptc));
+            ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&ptc));
             ASSERT_EQ(1U, ptc.size());
             auto row = ptc.begin();
             auto tp = row->second;
diff --git a/db/prefix_test.cc b/db/prefix_test.cc
index 9b1d4ed79e6a..35f005138662 100644
--- a/db/prefix_test.cc
+++ b/db/prefix_test.cc
@@ -229,8 +229,8 @@ class SamePrefixTransform : public SliceTransform {
 
 class PrefixTest : public testing::Test {
  public:
-  std::shared_ptr<DB> OpenDb() {
-    DB* db;
+  std::unique_ptr<DB> OpenDb() {
+    std::unique_ptr<DB> db;
 
     options.create_if_missing = true;
     options.write_buffer_size = FLAGS_write_buffer_size;
@@ -251,7 +251,7 @@ class PrefixTest : public testing::Test {
 
     Status s = DB::Open(options, kDbName, &db);
     EXPECT_OK(s);
-    return std::shared_ptr<DB>(db);
+    return db;
   }
 
   void FirstOption() { option_config_ = kBegin; }
@@ -304,7 +304,7 @@ class PrefixTest : public testing::Test {
 };
 
 TEST(SamePrefixTest, InDomainTest) {
-  DB* db;
+  std::unique_ptr<DB> db;
   Options options;
   options.create_if_missing = true;
   options.prefix_extractor.reset(new SamePrefixTransform("HHKB"));
@@ -331,7 +331,7 @@ TEST(SamePrefixTest, InDomainTest) {
     ASSERT_EQ(db_iter->value(), "idk");
 
     delete db_iter;
-    delete db;
+    db.reset();
     ASSERT_OK(DestroyDB(kDbName, Options()));
   }
 
@@ -348,7 +348,7 @@ TEST(SamePrefixTest, InDomainTest) {
     ASSERT_TRUE(db_iter->Valid());
     ASSERT_OK(db_iter->status());
     delete db_iter;
-    delete db;
+    db.reset();
     ASSERT_OK(DestroyDB(kDbName, Options()));
   }
 }
diff --git a/db/seqno_time_test.cc b/db/seqno_time_test.cc
index 271a53fa9ae0..e474c583d892 100644
--- a/db/seqno_time_test.cc
+++ b/db/seqno_time_test.cc
@@ -661,14 +661,14 @@ TEST_P(SeqnoTimeTablePropTest, MultiInstancesBasic) {
   options.stats_dump_period_sec = 0;
   options.stats_persist_period_sec = 0;
 
-  auto dbs = std::vector<DB*>(kInstanceNum);
+  auto dbs = std::vector<std::unique_ptr<DB>>(kInstanceNum);
   for (int i = 0; i < kInstanceNum; i++) {
     ASSERT_OK(
         DB::Open(options, test::PerThreadDBPath(std::to_string(i)), &(dbs[i])));
   }
 
   // Make sure the second instance has the worker enabled
-  auto dbi = static_cast_with_check<DBImpl>(dbs[1]);
+  auto dbi = static_cast_with_check<DBImpl>(dbs[1].get());
   WriteOptions wo;
   for (int i = 0; i < 200; i++) {
     ASSERT_OK(dbi->Put(wo, Key(i), "value"));
@@ -680,7 +680,7 @@ TEST_P(SeqnoTimeTablePropTest, MultiInstancesBasic) {
 
   for (int i = 0; i < kInstanceNum; i++) {
     ASSERT_OK(dbs[i]->Close());
-    delete dbs[i];
+    dbs[i].reset();
   }
 }
 
@@ -792,8 +792,8 @@ TEST_P(SeqnoTimeTablePropTest, SeqnoToTimeMappingUniversal) {
   }
   ASSERT_GT(num_seqno_zeroing, 0);
   std::vector<KeyVersion> key_versions;
-  ASSERT_OK(GetAllKeyVersions(db_, {}, {}, std::numeric_limits<size_t>::max(),
-                              &key_versions));
+  ASSERT_OK(GetAllKeyVersions(
+      db_.get(), {}, {}, std::numeric_limits<size_t>::max(), &key_versions));
   // make sure there're more than 300 keys and first 100 keys are having seqno
   // zeroed out, the last 100 key seqno not zeroed out
   ASSERT_GT(key_versions.size(), 300);
diff --git a/db/wide/db_wide_basic_test.cc b/db/wide/db_wide_basic_test.cc
index 886f71d7452f..5c46c3c6443f 100644
--- a/db/wide/db_wide_basic_test.cc
+++ b/db/wide/db_wide_basic_test.cc
@@ -714,7 +714,7 @@ TEST_F(DBWideBasicTest, MergePlainKeyValue) {
     // snapshot in between to make sure they do not get reconciled during the
     // subsequent flush)
     write_base();
-    ManagedSnapshot snapshot(db_);
+    ManagedSnapshot snapshot(db_.get());
     write_merge();
     verify();
 
@@ -958,7 +958,7 @@ TEST_F(DBWideBasicTest, MergeEntity) {
     // between to make sure they do not get reconciled during the subsequent
     // flush)
     write_base();
-    ManagedSnapshot snapshot(db_);
+    ManagedSnapshot snapshot(db_.get());
     write_merge();
     verify_basic();
     verify_merge_ops_pre_compaction();
@@ -1033,7 +1033,7 @@ class DBWideMergeV3Test : public DBWideBasicTest {
                              third_key,
                              third_columns));  // wide-column base value
 
-    snapshots_.emplace_back(db_);
+    snapshots_.emplace_back(db_.get());
 
     // First round of merge operands
     ASSERT_OK(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), first_key,
@@ -1043,7 +1043,7 @@ class DBWideMergeV3Test : public DBWideBasicTest {
     ASSERT_OK(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), third_key,
                          third_merge_op1));
 
-    snapshots_.emplace_back(db_);
+    snapshots_.emplace_back(db_.get());
 
     // Second round of merge operands
     ASSERT_OK(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), first_key,
@@ -1053,7 +1053,7 @@ class DBWideMergeV3Test : public DBWideBasicTest {
     ASSERT_OK(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), third_key,
                          third_merge_op2));
 
-    snapshots_.emplace_back(db_);
+    snapshots_.emplace_back(db_.get());
   }
 
   void VerifyKeyValues(const WideColumns& first_expected,
diff --git a/db/write_callback_test.cc b/db/write_callback_test.cc
index 53094eca4b9b..4fd1d8bcdc65 100644
--- a/db/write_callback_test.cc
+++ b/db/write_callback_test.cc
@@ -419,7 +419,7 @@ TEST_F(WriteCallbackTest, WriteCallBackTest) {
   WriteOptions write_options;
   ReadOptions read_options;
   string value;
-  DB* db;
+  std::unique_ptr<DB> db;
   DBImpl* db_impl;
 
   ASSERT_OK(DestroyDB(dbname, options));
@@ -428,7 +428,7 @@ TEST_F(WriteCallbackTest, WriteCallBackTest) {
   Status s = DB::Open(options, dbname, &db);
   ASSERT_OK(s);
 
-  db_impl = dynamic_cast<DBImpl*>(db);
+  db_impl = dynamic_cast<DBImpl*>(db.get());
   ASSERT_TRUE(db_impl);
 
   WriteBatch wb;
@@ -481,7 +481,7 @@ TEST_F(WriteCallbackTest, WriteCallBackTest) {
   ASSERT_TRUE(user_write_cb.write_enqueued_.load());
   ASSERT_TRUE(user_write_cb.wal_write_done_.load());
 
-  delete db;
+  db.reset();
   ASSERT_OK(DestroyDB(dbname, options));
 }
 
diff --git a/db_stress_tool/cf_consistency_stress.cc b/db_stress_tool/cf_consistency_stress.cc
index 1df4fc7cb7fc..d18c47281a69 100644
--- a/db_stress_tool/cf_consistency_stress.cc
+++ b/db_stress_tool/cf_consistency_stress.cc
@@ -1047,7 +1047,7 @@ class CfConsistencyStressTest : public StressTest {
     assert(thread);
     Status status;
 
-    DB* db_ptr = secondary_db_ ? secondary_db_ : db_;
+    DB* db_ptr = secondary_db_ ? secondary_db_.get() : db_;
     const auto& cfhs = secondary_db_ ? secondary_cfhs_ : column_families_;
 
     // Take a snapshot to preserve the state of primary db.
diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc
index 902a6c8ff546..a57199e2d226 100644
--- a/db_stress_tool/db_stress_test_base.cc
+++ b/db_stress_tool/db_stress_test_base.cc
@@ -74,7 +74,6 @@ StressTest::StressTest()
       new_column_family_name_(1),
       num_times_reopened_(0),
       db_preload_finished_(false),
-      secondary_db_(nullptr),
       is_db_stopped_(false) {
   if (FLAGS_destroy_db_initially) {
     const Status s = DbStressDestroyDb(FLAGS_db);
@@ -98,11 +97,10 @@ void StressTest::CleanUp() {
   if (db_) {
     db_->Close();
   }
-  delete db_;
+  db_owner_.reset();
   db_ = nullptr;
 
-  delete secondary_db_;
-  secondary_db_ = nullptr;
+  secondary_db_.reset();
 }
 
 void StressTest::CleanUpColumnFamilies() {
@@ -753,12 +751,11 @@ void StressTest::PreloadDbAndReopenAsReadOnly(int64_t number_of_keys,
   }
   if (s.ok()) {
     CleanUpColumnFamilies();
-    delete db_;
+    db_owner_.reset();
     db_ = nullptr;
     txn_db_ = nullptr;
     optimistic_txn_db_ = nullptr;
-    delete secondary_db_;
-    secondary_db_ = nullptr;
+    secondary_db_.reset();
 
     db_preload_finished_.store(true);
     auto now = clock_->NowMicros();
@@ -2506,7 +2503,7 @@ Status StressTest::TestBackupRestore(
       from = "BackupEngine::PurgeOldBackups";
     }
   }
-  DB* restored_db = nullptr;
+  std::unique_ptr<DB> restored_db;
   std::vector<ColumnFamilyHandle*> restored_cf_handles;
 
   // Not yet implemented: opening restored BlobDB or TransactionDB
@@ -2594,8 +2591,7 @@ Status StressTest::TestBackupRestore(
     for (auto* cf_handle : restored_cf_handles) {
       restored_db->DestroyColumnFamilyHandle(cf_handle);
     }
-    delete restored_db;
-    restored_db = nullptr;
+    restored_db.reset();
   }
   if (s.ok() && inplace_not_restore) {
     // Purge late if inplace open read-only
@@ -2830,7 +2826,7 @@ Status StressTest::TestCheckpoint(ThreadState* thread,
   delete checkpoint;
   checkpoint = nullptr;
   std::vector<ColumnFamilyHandle*> cf_handles;
-  DB* checkpoint_db = nullptr;
+  std::unique_ptr<DB> checkpoint_db;
   if (s.ok()) {
     Options options(options_);
     options.best_efforts_recovery = false;
@@ -2894,8 +2890,7 @@ Status StressTest::TestCheckpoint(ThreadState* thread,
       delete cfh;
     }
     cf_handles.clear();
-    delete checkpoint_db;
-    checkpoint_db = nullptr;
+    checkpoint_db.reset();
   }
 
   //  Temporarily disable error injection for clean-up
@@ -3871,15 +3866,20 @@ void StressTest::Open(SharedState* shared, bool reopen) {
                                     cf_descriptors, &column_families_,
                                     &blob_db);
           if (s.ok()) {
+            db_owner_.reset(blob_db);
             db_ = blob_db;
           }
         } else {
           if (db_preload_finished_.load() && FLAGS_read_only) {
             s = DB::OpenForReadOnly(DBOptions(options_), FLAGS_db,
-                                    cf_descriptors, &column_families_, &db_);
+                                    cf_descriptors, &column_families_,
+                                    &db_owner_);
           } else {
             s = DB::Open(DBOptions(options_), FLAGS_db, cf_descriptors,
-                         &column_families_, &db_);
+                         &column_families_, &db_owner_);
+          }
+          if (s.ok()) {
+            db_ = db_owner_.get();
           }
         }
 
@@ -3895,10 +3895,9 @@ void StressTest::Open(SharedState* shared, bool reopen) {
             s = db_->GetRootDB()->WaitForCompact(WaitForCompactOptions());
             if (!s.ok()) {
               CleanUpColumnFamilies();
-              delete db_;
+              db_owner_.reset();
               db_ = nullptr;
-              delete secondary_db_;
-              secondary_db_ = nullptr;
+              secondary_db_.reset();
             }
           }
           if (!s.ok()) {
@@ -3955,6 +3954,7 @@ void StressTest::Open(SharedState* shared, bool reopen) {
         }
         assert(s.ok());
         {
+          db_owner_.reset(optimistic_txn_db_);
           db_ = optimistic_txn_db_;
           db_aptr_.store(optimistic_txn_db_, std::memory_order_release);
         }
@@ -3990,6 +3990,7 @@ void StressTest::Open(SharedState* shared, bool reopen) {
 
         // Do not swap the order of the following.
         {
+          db_owner_.reset(txn_db_);
           db_ = txn_db_;
           db_aptr_.store(txn_db_, std::memory_order_release);
         }
@@ -4028,6 +4029,7 @@ void StressTest::Open(SharedState* shared, bool reopen) {
   } else {
     DBWithTTL* db_with_ttl;
     s = DBWithTTL::Open(options_, FLAGS_db, &db_with_ttl, FLAGS_ttl);
+    db_owner_.reset(db_with_ttl);
     db_ = db_with_ttl;
   }
 
@@ -4107,12 +4109,11 @@ void StressTest::Reopen(ThreadState* thread) {
   }
   assert((txn_db_ == nullptr && optimistic_txn_db_ == nullptr) ||
          (db_ == txn_db_ || db_ == optimistic_txn_db_));
-  delete db_;
+  db_owner_.reset();
   db_ = nullptr;
   txn_db_ = nullptr;
   optimistic_txn_db_ = nullptr;
-  delete secondary_db_;
-  secondary_db_ = nullptr;
+  secondary_db_.reset();
 
   num_times_reopened_++;
   auto now = clock_->NowMicros();
diff --git a/db_stress_tool/db_stress_test_base.h b/db_stress_tool/db_stress_test_base.h
index 3e8bc2af0def..d97aadf9e60e 100644
--- a/db_stress_tool/db_stress_test_base.h
+++ b/db_stress_tool/db_stress_test_base.h
@@ -404,6 +404,7 @@ class StressTest {
   std::shared_ptr<Cache> cache_;
   std::shared_ptr<Cache> compressed_cache_;
   std::shared_ptr<const FilterPolicy> filter_policy_;
+  std::unique_ptr<DB> db_owner_;
   DB* db_;
   TransactionDB* txn_db_;
   OptimisticTransactionDB* optimistic_txn_db_;
@@ -422,7 +423,7 @@ class StressTest {
   std::atomic<bool> db_preload_finished_;
   std::shared_ptr<SstQueryFilterConfigsManager::Factory> sqfc_factory_;
 
-  DB* secondary_db_;
+  std::unique_ptr<DB> secondary_db_;
   std::vector<ColumnFamilyHandle*> secondary_cfhs_;
   bool is_db_stopped_;
 };
diff --git a/db_stress_tool/no_batched_ops_stress.cc b/db_stress_tool/no_batched_ops_stress.cc
index becda50ec3e8..c9d3250a119a 100644
--- a/db_stress_tool/no_batched_ops_stress.cc
+++ b/db_stress_tool/no_batched_ops_stress.cc
@@ -234,8 +234,9 @@ class NonBatchedOpsStressTest : public StressTest {
 
           Status s = secondary_db_->TryCatchUpWithPrimary();
 #ifndef NDEBUG
-          uint64_t manifest_num = static_cast_with_check<DBImpl>(secondary_db_)
-                                      ->TEST_Current_Manifest_FileNo();
+          uint64_t manifest_num =
+              static_cast_with_check<DBImpl>(secondary_db_.get())
+                  ->TEST_Current_Manifest_FileNo();
 #else
           uint64_t manifest_num = 0;
 #endif
diff --git a/env/env_test.cc b/env/env_test.cc
index 68efa41c2c0b..4c0939ecffa4 100644
--- a/env/env_test.cc
+++ b/env/env_test.cc
@@ -2508,7 +2508,7 @@ TEST_P(EnvFSTestWithParam, OptionsTest) {
     }
   }
   for (int i = 0; i < 2; ++i) {
-    DB* db;
+    std::unique_ptr<DB> db;
     Status s = DB::Open(opts, dbname, &db);
     ASSERT_OK(s);
 
@@ -2526,7 +2526,7 @@ TEST_P(EnvFSTestWithParam, OptionsTest) {
     ASSERT_EQ("b", val);
 
     ASSERT_OK(db->Close());
-    delete db;
+    db.reset();
     ASSERT_OK(DestroyDB(dbname, opts));
 
     dbname = dbname2_;
diff --git a/examples/column_families_example.cc b/examples/column_families_example.cc
index 3828d3fb3f73..f8ce4b8c7013 100644
--- a/examples/column_families_example.cc
+++ b/examples/column_families_example.cc
@@ -3,6 +3,7 @@
 //  COPYING file in the root directory) and Apache 2.0 License
 //  (found in the LICENSE.Apache file in the root directory).
 #include <cstdio>
+#include <memory>
 #include <string>
 #include <vector>
 
@@ -32,7 +33,7 @@ int main() {
   // open DB
   Options options;
   options.create_if_missing = true;
-  DB* db;
+  std::unique_ptr<DB> db;
   Status s = DB::Open(options, kDBPath, &db);
   assert(s.ok());
 
@@ -44,7 +45,7 @@ int main() {
   // close DB
   s = db->DestroyColumnFamilyHandle(cf);
   assert(s.ok());
-  delete db;
+  db.reset();
 
   // open DB with two column families
   std::vector<ColumnFamilyDescriptor> column_families;
@@ -82,7 +83,7 @@ int main() {
     s = db->DestroyColumnFamilyHandle(handle);
     assert(s.ok());
   }
-  delete db;
+  db.reset();
 
   return 0;
 }
diff --git a/examples/compact_files_example.cc b/examples/compact_files_example.cc
index 52b054002d76..cc9e04e4506b 100644
--- a/examples/compact_files_example.cc
+++ b/examples/compact_files_example.cc
@@ -6,6 +6,7 @@
 // An example code demonstrating how to use CompactFiles, EventListener,
 // and GetColumnFamilyMetaData APIs to implement custom compaction algorithm.
 
+#include <memory>
 #include <mutex>
 #include <string>
 
@@ -151,10 +152,12 @@ int main() {
   options.IncreaseParallelism(5);
   options.listeners.emplace_back(new FullCompactor(options));
 
-  DB* db = nullptr;
+  std::unique_ptr<DB> db;
   ROCKSDB_NAMESPACE::DestroyDB(kDBPath, options);
-  Status s = DB::Open(options, kDBPath, &db);
-  assert(s.ok());
+  {
+    Status s = DB::Open(options, kDBPath, &db);
+    assert(s.ok());
+  }
   assert(db);
 
   // if background compaction is not working, write will stall
@@ -172,7 +175,7 @@ int main() {
   }
 
   // close the db.
-  delete db;
+  db.reset();
 
   return 0;
 }
diff --git a/examples/compaction_filter_example.cc b/examples/compaction_filter_example.cc
index 03a1952600d7..9c17a229940b 100644
--- a/examples/compaction_filter_example.cc
+++ b/examples/compaction_filter_example.cc
@@ -63,7 +63,7 @@ std::string kRemoveDirCommand = "rm -rf ";
 #endif
 
 int main() {
-  ROCKSDB_NAMESPACE::DB* raw_db;
+  std::unique_ptr<ROCKSDB_NAMESPACE::DB> db;
   ROCKSDB_NAMESPACE::Status status;
 
   MyFilter filter;
@@ -77,9 +77,8 @@ int main() {
   options.create_if_missing = true;
   options.merge_operator.reset(new MyMerge);
   options.compaction_filter = &filter;
-  status = ROCKSDB_NAMESPACE::DB::Open(options, kDBPath, &raw_db);
+  status = ROCKSDB_NAMESPACE::DB::Open(options, kDBPath, &db);
   assert(status.ok());
-  std::unique_ptr<ROCKSDB_NAMESPACE::DB> db(raw_db);
 
   ROCKSDB_NAMESPACE::WriteOptions wopts;
   db->Merge(wopts, "0", "bad");  // This is filtered out
diff --git a/examples/multi_processes_example.cc b/examples/multi_processes_example.cc
index b9a6cbe207d1..20a3af3637b4 100644
--- a/examples/multi_processes_example.cc
+++ b/examples/multi_processes_example.cc
@@ -19,6 +19,7 @@
 #include <cstdio>
 #include <cstdlib>
 #include <ctime>
+#include <memory>
 #include <string>
 #include <thread>
 #include <vector>
@@ -147,7 +148,7 @@ void CreateDB() {
     assert(false);
   }
   options.create_if_missing = true;
-  DB* db = nullptr;
+  std::unique_ptr<DB> db;
   s = DB::Open(options, kDBPath, &db);
   if (!s.ok()) {
     fprintf(stderr, "[process %ld] Failed to open DB: %s\n", my_pid,
@@ -173,7 +174,7 @@ void CreateDB() {
     delete h;
   }
   handles.clear();
-  delete db;
+  db.reset();
 }
 
 void RunPrimary() {
@@ -181,7 +182,7 @@ void RunPrimary() {
   fprintf(stdout, "[process %ld] Primary instance starts\n", my_pid);
   CreateDB();
   std::srand(time(nullptr));
-  DB* db = nullptr;
+  std::unique_ptr<DB> db;
   Options options;
   options.create_if_missing = false;
   std::vector<ColumnFamilyDescriptor> column_families;
@@ -227,8 +228,7 @@ void RunPrimary() {
         delete h;
       }
       handles.clear();
-      delete db;
-      db = nullptr;
+      db.reset();
     }
   }
   if (nullptr != db) {
@@ -236,8 +236,7 @@ void RunPrimary() {
       delete h;
     }
     handles.clear();
-    delete db;
-    db = nullptr;
+    db.reset();
   }
   fprintf(stdout, "[process %ld] Finished adding keys\n", my_pid);
 }
@@ -262,7 +261,7 @@ void RunSecondary() {
       exit(0);
     }
   }
-  DB* db = nullptr;
+  std::unique_ptr<DB> db;
   Options options;
   options.create_if_missing = false;
   options.max_open_files = -1;
@@ -344,7 +343,7 @@ void RunSecondary() {
     column_families.push_back(ColumnFamilyDescriptor(cf_name, options));
   }
   std::vector<ColumnFamilyHandle*> handles;
-  DB* verification_db = nullptr;
+  std::unique_ptr<DB> verification_db;
   s = DB::OpenForReadOnly(options, kDBPath, column_families, &handles,
                           &verification_db);
   assert(s.ok());
@@ -369,8 +368,8 @@ void RunSecondary() {
   }
   delete iter;
   delete iter1;
-  delete db;
-  delete verification_db;
+  db.reset();
+  verification_db.reset();
 }
 
 int main(int argc, char** argv) {
diff --git a/examples/options_file_example.cc b/examples/options_file_example.cc
index 00632f391ae9..09be3185ca88 100644
--- a/examples/options_file_example.cc
+++ b/examples/options_file_example.cc
@@ -7,6 +7,7 @@
 // rocksdb/utilities/options_util.h to open a rocksdb database without
 // remembering all the rocksdb options.
 #include <cstdio>
+#include <memory>
 #include <string>
 #include <vector>
 
@@ -74,7 +75,7 @@ int main() {
   cf_descs[1].options.table_factory.reset(NewBlockBasedTableFactory(bbt_opts));
 
   // destroy and open DB
-  DB* db;
+  std::unique_ptr<DB> db;
   Status s = ROCKSDB_NAMESPACE::DestroyDB(kDBPath,
                                           Options(db_opt, cf_descs[0].options));
   assert(s.ok());
@@ -88,7 +89,7 @@ int main() {
 
   // close DB
   delete cf;
-  delete db;
+  db.reset();
 
   // In the following code, we will reopen the rocksdb instance using
   // the options file stored in the db directory.
@@ -128,5 +129,5 @@ int main() {
   for (auto* handle : handles) {
     delete handle;
   }
-  delete db;
+  db.reset();
 }
diff --git a/examples/rocksdb_backup_restore_example.cc b/examples/rocksdb_backup_restore_example.cc
index c833ed1c2a8f..e5ad703eed8d 100644
--- a/examples/rocksdb_backup_restore_example.cc
+++ b/examples/rocksdb_backup_restore_example.cc
@@ -4,6 +4,7 @@
 //  (found in the LICENSE.Apache file in the root directory).
 
 #include <cstdio>
+#include <memory>
 #include <string>
 #include <vector>
 
@@ -29,7 +30,7 @@ std::string kDBPath = "/tmp/rocksdb_example";
 #endif
 
 int main() {
-  DB* db;
+  std::unique_ptr<DB> db;
   Options options;
   // Optimize RocksDB. This is the easiest way to get RocksDB to perform well
   options.IncreaseParallelism();
@@ -52,7 +53,7 @@ int main() {
                          &backup_engine);
   assert(s.ok());
 
-  backup_engine->CreateNewBackup(db);
+  backup_engine->CreateNewBackup(db.get());
   assert(s.ok());
 
   std::vector<BackupInfo> backup_info;
@@ -65,9 +66,7 @@ int main() {
   db->Put(WriteOptions(), "key2", "value2");
   assert(s.ok());
 
-  db->Close();
-  delete db;
-  db = nullptr;
+  db.reset();
 
   // restore db to backup 1
   BackupEngineReadOnly* backup_engine_ro;
@@ -93,7 +92,7 @@ int main() {
 
   delete backup_engine;
   delete backup_engine_ro;
-  delete db;
+  db.reset();
 
   return 0;
 }
diff --git a/examples/simple_example.cc b/examples/simple_example.cc
index 2d49c4d14da2..85a87da77cea 100644
--- a/examples/simple_example.cc
+++ b/examples/simple_example.cc
@@ -4,6 +4,7 @@
 //  (found in the LICENSE.Apache file in the root directory).
 
 #include <cstdio>
+#include <memory>
 #include <string>
 
 #include "rocksdb/db.h"
@@ -25,7 +26,7 @@ std::string kDBPath = "/tmp/rocksdb_simple_example";
 #endif
 
 int main() {
-  DB* db;
+  std::unique_ptr<DB> db;
   Options options;
   // Optimize RocksDB. This is the easiest way to get RocksDB to perform well
   options.IncreaseParallelism();
@@ -87,7 +88,7 @@ int main() {
   pinnable_val.Reset();
   // The Slice pointed by pinnable_val is not valid after this point
 
-  delete db;
+  db.reset();
 
   return 0;
 }
diff --git a/fuzz/db_fuzzer.cc b/fuzz/db_fuzzer.cc
index e6d5bb63c06f..7b10b35ce101 100644
--- a/fuzz/db_fuzzer.cc
+++ b/fuzz/db_fuzzer.cc
@@ -31,11 +31,11 @@ constexpr char db_path[] = "/tmp/testdb";
 // enum. The goal is to capture sanitizer bugs, so the code should be
 // compiled with a given sanitizer (ASan, UBSan, MSan).
 extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
-  ROCKSDB_NAMESPACE::DB* db;
+  std::unique_ptr<ROCKSDB_NAMESPACE::DB> db;
   ROCKSDB_NAMESPACE::Options options;
+  ROCKSDB_NAMESPACE::Status status;
   options.create_if_missing = true;
-  ROCKSDB_NAMESPACE::Status status =
-      ROCKSDB_NAMESPACE::DB::Open(options, db_path, &db);
+  status = ROCKSDB_NAMESPACE::DB::Open(options, db_path, &db);
   if (!status.ok()) {
     return 0;
   }
@@ -88,7 +88,7 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
       }
       case kOpenClose: {
         db->Close();
-        delete db;
+        db.reset();
         status = ROCKSDB_NAMESPACE::DB::Open(options, db_path, &db);
         if (!status.ok()) {
           ROCKSDB_NAMESPACE::DestroyDB(db_path, options);
@@ -104,7 +104,7 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
                                    "new_cf", &cf);
         s = db->DestroyColumnFamilyHandle(cf);
         db->Close();
-        delete db;
+        db.reset();
 
         // open DB with two column families
         std::vector<ROCKSDB_NAMESPACE::ColumnFamilyDescriptor> column_families;
@@ -166,7 +166,7 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
 
   // Cleanup DB
   db->Close();
-  delete db;
+  db.reset();
   ROCKSDB_NAMESPACE::DestroyDB(db_path, options);
   return 0;
 }
diff --git a/fuzz/db_map_fuzzer.cc b/fuzz/db_map_fuzzer.cc
index ed9df8f8432d..8c55ac4e9e7a 100644
--- a/fuzz/db_map_fuzzer.cc
+++ b/fuzz/db_map_fuzzer.cc
@@ -50,7 +50,7 @@ DEFINE_PROTO_FUZZER(DBOperations& input) {
   }
 
   std::map<std::string, std::string> kv;
-  ROCKSDB_NAMESPACE::DB* db = nullptr;
+  std::unique_ptr<ROCKSDB_NAMESPACE::DB> db;
   ROCKSDB_NAMESPACE::Options options;
   options.create_if_missing = true;
   CHECK_OK(ROCKSDB_NAMESPACE::DB::Open(options, kDbPath, &db));
@@ -86,8 +86,7 @@ DEFINE_PROTO_FUZZER(DBOperations& input) {
     }
   }
   CHECK_OK(db->Close());
-  delete db;
-  db = nullptr;
+  db.reset();
 
   CHECK_OK(ROCKSDB_NAMESPACE::DB::Open(options, kDbPath, &db));
   auto kv_it = kv.begin();
@@ -102,6 +101,6 @@ DEFINE_PROTO_FUZZER(DBOperations& input) {
   delete it;
 
   CHECK_OK(db->Close());
-  delete db;
+  db.reset();
   CHECK_OK(ROCKSDB_NAMESPACE::DestroyDB(kDbPath, options));
 }
diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h
index 8b4be252cfd9..d31660de4ae4 100644
--- a/include/rocksdb/db.h
+++ b/include/rocksdb/db.h
@@ -131,24 +131,13 @@ using TablePropertiesCollection =
 class DB {
  public:
   // Open the database with the specified "name" for reads and writes.
-  // Stores a pointer to a heap-allocated database in *dbptr and returns
-  // OK on success.
-  // Stores nullptr in *dbptr and returns a non-OK status on error, including
+  // On success, stores the database in *dbptr and returns OK.
+  // On error, resets *dbptr and returns a non-OK status, including
   // if the DB is already open (read-write) by another DB object. (This
   // guarantee depends on options.env->LockFile(), which might not provide
   // this guarantee in a custom Env implementation.)
-  //
-  // Caller must delete *dbptr when it is no longer needed.
   static Status Open(const Options& options, const std::string& name,
                      std::unique_ptr<DB>* dbptr);
-  // DEPRECATED: raw pointer variant
-  static Status Open(const Options& options, const std::string& name,
-                     DB** dbptr) {
-    std::unique_ptr<DB> smart_ptr;
-    Status s = Open(options, name, &smart_ptr);
-    *dbptr = smart_ptr.release();
-    return s;
-  }
 
   // Open DB with column families.
   // db_options specify database specific options
@@ -162,21 +151,12 @@ class DB {
   // If everything is OK, handles will on return be the same size
   // as column_families --- handles[i] will be a handle that you
   // will use to operate on column family column_family[i].
-  // Before delete DB, you have to close All column families by calling
+  // Before destroying the DB, you have to close all column families by calling
   // DestroyColumnFamilyHandle() with all the handles.
   static Status Open(const DBOptions& db_options, const std::string& name,
                      const std::vector<ColumnFamilyDescriptor>& column_families,
                      std::vector<ColumnFamilyHandle*>* handles,
                      std::unique_ptr<DB>* dbptr);
-  // DEPRECATED: raw pointer variant
-  static Status Open(const DBOptions& db_options, const std::string& name,
-                     const std::vector<ColumnFamilyDescriptor>& column_families,
-                     std::vector<ColumnFamilyHandle*>* handles, DB** dbptr) {
-    std::unique_ptr<DB> smart_ptr;
-    Status s = Open(db_options, name, column_families, handles, &smart_ptr);
-    *dbptr = smart_ptr.release();
-    return s;
-  }
 
   // OpenForReadOnly() creates a Read-only instance that supports reads alone.
   //
@@ -195,16 +175,6 @@ class DB {
   static Status OpenForReadOnly(const Options& options, const std::string& name,
                                 std::unique_ptr<DB>* dbptr,
                                 bool error_if_wal_file_exists = false);
-  // DEPRECATED: raw pointer variant
-  static Status OpenForReadOnly(const Options& options, const std::string& name,
-                                DB** dbptr,
-                                bool error_if_wal_file_exists = false) {
-    std::unique_ptr<DB> smart_ptr;
-    Status s =
-        OpenForReadOnly(options, name, &smart_ptr, error_if_wal_file_exists);
-    *dbptr = smart_ptr.release();
-    return s;
-  }
 
   // Open the database for read only with column families.
   //
@@ -218,18 +188,6 @@ class DB {
       const std::vector<ColumnFamilyDescriptor>& column_families,
       std::vector<ColumnFamilyHandle*>* handles, std::unique_ptr<DB>* dbptr,
       bool error_if_wal_file_exists = false);
-  // DEPRECATED: raw pointer variant
-  static Status OpenForReadOnly(
-      const DBOptions& db_options, const std::string& name,
-      const std::vector<ColumnFamilyDescriptor>& column_families,
-      std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
-      bool error_if_wal_file_exists = false) {
-    std::unique_ptr<DB> smart_ptr;
-    Status s = OpenForReadOnly(db_options, name, column_families, handles,
-                               &smart_ptr, error_if_wal_file_exists);
-    *dbptr = smart_ptr.release();
-    return s;
-  }
 
   // OpenAsSecondary() creates a secondary instance that supports read-only
   // operations and supports dynamic catch up with the primary (through a
@@ -251,8 +209,6 @@ class DB {
   // The secondary_path argument points to a directory where the secondary
   // instance stores its info log.
   // The dbptr is an out-arg corresponding to the opened secondary instance.
-  // The pointer points to a heap-allocated database, and the caller should
-  // delete it after use.
   //
   // Return OK on success, non-OK on failures.
   //
@@ -265,14 +221,6 @@ class DB {
   static Status OpenAsSecondary(const Options& options, const std::string& name,
                                 const std::string& secondary_path,
                                 std::unique_ptr<DB>* dbptr);
-  // DEPRECATED: raw pointer variant
-  static Status OpenAsSecondary(const Options& options, const std::string& name,
-                                const std::string& secondary_path, DB** dbptr) {
-    std::unique_ptr<DB> smart_ptr;
-    Status s = OpenAsSecondary(options, name, secondary_path, &smart_ptr);
-    *dbptr = smart_ptr.release();
-    return s;
-  }
 
   // Open DB as secondary instance with specified column families
   //
@@ -301,9 +249,8 @@ class DB {
   // The handles is an out-arg corresponding to the opened database column
   // family handles.
   // The dbptr is an out-arg corresponding to the opened secondary instance.
-  // The pointer points to a heap-allocated database, and the caller should
-  // delete it after use. Before deleting the dbptr, the user should also
-  // delete the pointers stored in handles vector.
+  // Before destroying the DB, the user should call
+  // DestroyColumnFamilyHandle() on all the handles.
   //
   // Return OK on success, non-OK on failures.
   static Status OpenAsSecondary(
@@ -311,18 +258,6 @@ class DB {
       const std::string& secondary_path,
       const std::vector<ColumnFamilyDescriptor>& column_families,
       std::vector<ColumnFamilyHandle*>* handles, std::unique_ptr<DB>* dbptr);
-  // DEPRECATED: raw pointer variant
-  static Status OpenAsSecondary(
-      const DBOptions& db_options, const std::string& name,
-      const std::string& secondary_path,
-      const std::vector<ColumnFamilyDescriptor>& column_families,
-      std::vector<ColumnFamilyHandle*>* handles, DB** dbptr) {
-    std::unique_ptr<DB> smart_ptr;
-    Status s = OpenAsSecondary(db_options, name, secondary_path,
-                               column_families, handles, &smart_ptr);
-    *dbptr = smart_ptr.release();
-    return s;
-  }
 
   // EXPERIMENTAL
 
@@ -389,18 +324,6 @@ class DB {
       const std::vector<ColumnFamilyDescriptor>& column_families,
       std::vector<ColumnFamilyHandle*>* handles, std::unique_ptr<DB>* dbptr,
       std::string trim_ts);
-  // DEPRECATED: raw pointer variant
-  static Status OpenAndTrimHistory(
-      const DBOptions& db_options, const std::string& dbname,
-      const std::vector<ColumnFamilyDescriptor>& column_families,
-      std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
-      std::string trim_ts) {
-    std::unique_ptr<DB> smart_ptr;
-    Status s = OpenAndTrimHistory(db_options, dbname, column_families, handles,
-                                  &smart_ptr, trim_ts);
-    *dbptr = smart_ptr.release();
-    return s;
-  }
 
   // Manually, synchronously attempt to resume DB writes after a write failure
   // to the underlying filesystem. See
@@ -1063,7 +986,7 @@ class DB {
   // call one of the Seek methods on the iterator before using it).
   //
   // Caller should delete the iterator when it is no longer needed.
-  // The returned iterator should be deleted before this db is deleted.
+  // The returned iterator should be deleted before this db is destroyed.
   virtual Iterator* NewIterator(const ReadOptions& options,
                                 ColumnFamilyHandle* column_family) = 0;
   virtual Iterator* NewIterator(const ReadOptions& options) {
@@ -1071,7 +994,7 @@ class DB {
   }
   // Returns iterators from a consistent database state across multiple
   // column families. Iterators are heap allocated and need to be deleted
-  // before the db is deleted
+  // before the db is destroyed
   virtual Status NewIterators(
       const ReadOptions& options,
       const std::vector<ColumnFamilyHandle*>& column_families,
@@ -1791,16 +1714,6 @@ class DB {
   virtual int NumberLevels(ColumnFamilyHandle* column_family) = 0;
   virtual int NumberLevels() { return NumberLevels(DefaultColumnFamily()); }
 
-  // DEPRECATED:
-  // Maximum level to which a new compacted memtable is pushed if it
-  // does not create overlap.
-  virtual int MaxMemCompactionLevel(ColumnFamilyHandle* /*column_family*/) {
-    return 0;
-  }
-  virtual int MaxMemCompactionLevel() {
-    return MaxMemCompactionLevel(DefaultColumnFamily());
-  }
-
   // Number of files in level-0 that would stop writes.
   virtual int Level0StopWriteTrigger(ColumnFamilyHandle* column_family) = 0;
   virtual int Level0StopWriteTrigger() {
diff --git a/include/rocksdb/tool_hooks.h b/include/rocksdb/tool_hooks.h
index 507c32d5e457..a92abde67356 100644
--- a/include/rocksdb/tool_hooks.h
+++ b/include/rocksdb/tool_hooks.h
@@ -30,18 +30,21 @@ class ToolHooks {
   ToolHooks() = default;
   virtual ~ToolHooks() = default;
   virtual Status Open(const Options& db_options, const std::string& name,
-                      DB** dbptr) = 0;
+                      std::unique_ptr<DB>* dbptr) = 0;
   virtual Status Open(
       const DBOptions& db_options, const std::string& name,
       const std::vector<ColumnFamilyDescriptor>& column_families,
-      std::vector<ColumnFamilyHandle*>* handles, DB** dbptr) = 0;
+      std::vector<ColumnFamilyHandle*>* handles,
+      std::unique_ptr<DB>* dbptr) = 0;
   virtual Status OpenForReadOnly(const Options& options,
-                                 const std::string& name, DB** dbptr,
+                                 const std::string& name,
+                                 std::unique_ptr<DB>* dbptr,
                                  bool error_if_wal_file_exists) = 0;
   virtual Status OpenForReadOnly(
       const Options& options, const std::string& name,
       const std::vector<ColumnFamilyDescriptor>& column_families,
-      std::vector<ColumnFamilyHandle*>* handles, DB** dbptr) = 0;
+      std::vector<ColumnFamilyHandle*>* handles,
+      std::unique_ptr<DB>* dbptr) = 0;
   virtual Status OpenTransactionDB(const Options& db_options,
                                    const TransactionDBOptions& txn_db_options,
                                    const std::string& dbname,
@@ -62,7 +65,7 @@ class ToolHooks {
   virtual Status OpenAsSecondary(const Options& options,
                                  const std::string& name,
                                  const std::string& secondary_path,
-                                 DB** dbptr) = 0;
+                                 std::unique_ptr<DB>* dbptr) = 0;
   virtual Status OpenAsFollower(const Options& options, const std::string& name,
                                 const std::string& leader_path,
                                 std::unique_ptr<DB>* dbptr) = 0;
@@ -77,18 +80,21 @@ class DefaultHooks : public ToolHooks {
   DefaultHooks() = default;
   ~DefaultHooks() override = default;
   virtual Status Open(const Options& db_options, const std::string& name,
-                      DB** dbptr) override;
+                      std::unique_ptr<DB>* dbptr) override;
   virtual Status Open(
       const DBOptions& db_options, const std::string& name,
       const std::vector<ColumnFamilyDescriptor>& column_families,
-      std::vector<ColumnFamilyHandle*>* handles, DB** dbptr) override;
+      std::vector<ColumnFamilyHandle*>* handles,
+      std::unique_ptr<DB>* dbptr) override;
   virtual Status OpenForReadOnly(const Options& options,
-                                 const std::string& name, DB** dbptr,
+                                 const std::string& name,
+                                 std::unique_ptr<DB>* dbptr,
                                  bool error_if_wal_file_exists) override;
   virtual Status OpenForReadOnly(
       const Options& options, const std::string& name,
       const std::vector<ColumnFamilyDescriptor>& column_families,
-      std::vector<ColumnFamilyHandle*>* handles, DB** dbptr) override;
+      std::vector<ColumnFamilyHandle*>* handles,
+      std::unique_ptr<DB>* dbptr) override;
   virtual Status OpenTransactionDB(const Options& db_options,
                                    const TransactionDBOptions& txn_db_options,
                                    const std::string& dbname,
@@ -110,7 +116,7 @@ class DefaultHooks : public ToolHooks {
   virtual Status OpenAsSecondary(const Options& options,
                                  const std::string& name,
                                  const std::string& secondary_path,
-                                 DB** dbptr) override;
+                                 std::unique_ptr<DB>* dbptr) override;
   virtual Status OpenAsFollower(const Options& options, const std::string& name,
                                 const std::string& leader_path,
                                 std::unique_ptr<DB>* dbptr) override;
diff --git a/include/rocksdb/utilities/db_ttl.h b/include/rocksdb/utilities/db_ttl.h
index 02313277cd8a..bccce8ddb14f 100644
--- a/include/rocksdb/utilities/db_ttl.h
+++ b/include/rocksdb/utilities/db_ttl.h
@@ -66,7 +66,7 @@ class DBWithTTL : public StackableDB {
   virtual Status GetTtl(ColumnFamilyHandle* h, int32_t* ttl) = 0;
 
  protected:
-  explicit DBWithTTL(DB* db) : StackableDB(db) {}
+  explicit DBWithTTL(std::unique_ptr<DB>&& db) : StackableDB(std::move(db)) {}
 };
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/include/rocksdb/utilities/ldb_cmd.h b/include/rocksdb/utilities/ldb_cmd.h
index f5b1387d7042..aacf9d3e9338 100644
--- a/include/rocksdb/utilities/ldb_cmd.h
+++ b/include/rocksdb/utilities/ldb_cmd.h
@@ -11,6 +11,7 @@
 #include <algorithm>
 #include <functional>
 #include <map>
+#include <memory>
 #include <sstream>
 #include <string>
 #include <vector>
@@ -165,7 +166,7 @@ class LDBCommand {
   std::string secondary_path_;
   std::string leader_path_;
   std::string column_family_name_;
-  DB* db_;
+  std::unique_ptr<DB> db_;
   DBWithTTL* db_ttl_;
   TransactionDB* db_txn_;
   std::map<std::string, ColumnFamilyHandle*> cf_handles_;
diff --git a/include/rocksdb/utilities/memory_util.h b/include/rocksdb/utilities/memory_util.h
index acebc8b4a655..40d9f5646c46 100644
--- a/include/rocksdb/utilities/memory_util.h
+++ b/include/rocksdb/utilities/memory_util.h
@@ -6,6 +6,7 @@
 #pragma once
 
 #include <map>
+#include <memory>
 #include <string>
 #include <unordered_set>
 #include <vector>
@@ -39,8 +40,11 @@ class MemoryUtil {
   // only report the usage of the input "cache_set" without
   // including those Cache usage inside the input list "dbs"
   // of DBs.
+  //
+  // Supports vectors of DB* or unique_ptr<DB>.
+  template <typename DBPtr>
   static Status GetApproximateMemoryUsageByType(
-      const std::vector<DB*>& dbs,
+      const std::vector<DBPtr>& dbs,
       const std::unordered_set<const Cache*> cache_set,
       std::map<MemoryUtil::UsageType, uint64_t>* usage_by_type);
 };
diff --git a/include/rocksdb/utilities/optimistic_transaction_db.h b/include/rocksdb/utilities/optimistic_transaction_db.h
index 875a132e408f..eb9f973a82b1 100644
--- a/include/rocksdb/utilities/optimistic_transaction_db.h
+++ b/include/rocksdb/utilities/optimistic_transaction_db.h
@@ -123,7 +123,8 @@ class OptimisticTransactionDB : public StackableDB {
 
  protected:
   // To Create an OptimisticTransactionDB, call Open()
-  explicit OptimisticTransactionDB(DB* db) : StackableDB(db) {}
+  explicit OptimisticTransactionDB(std::unique_ptr<DB>&& db)
+      : StackableDB(std::move(db)) {}
 };
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/include/rocksdb/utilities/stackable_db.h b/include/rocksdb/utilities/stackable_db.h
index f48acb2433db..de43ba386282 100644
--- a/include/rocksdb/utilities/stackable_db.h
+++ b/include/rocksdb/utilities/stackable_db.h
@@ -388,11 +388,6 @@ class StackableDB : public DB {
     return db_->NumberLevels(column_family);
   }
 
-  using DB::MaxMemCompactionLevel;
-  int MaxMemCompactionLevel(ColumnFamilyHandle* column_family) override {
-    return db_->MaxMemCompactionLevel(column_family);
-  }
-
   using DB::Level0StopWriteTrigger;
   int Level0StopWriteTrigger(ColumnFamilyHandle* column_family) override {
     return db_->Level0StopWriteTrigger(column_family);
diff --git a/java/rocksjni/rocksjni.cc b/java/rocksjni/rocksjni.cc
index 45e8f507d9d2..4a33d4e2f5e4 100644
--- a/java/rocksjni/rocksjni.cc
+++ b/java/rocksjni/rocksjni.cc
@@ -34,11 +34,12 @@
 #undef min
 #endif
 
-jlong rocksdb_open_helper(JNIEnv* env, jlong jopt_handle, jstring jdb_path,
-                          std::function<ROCKSDB_NAMESPACE::Status(
-                              const ROCKSDB_NAMESPACE::Options&,
-                              const std::string&, ROCKSDB_NAMESPACE::DB**)>
-                              open_fn) {
+jlong rocksdb_open_helper(
+    JNIEnv* env, jlong jopt_handle, jstring jdb_path,
+    std::function<ROCKSDB_NAMESPACE::Status(
+        const ROCKSDB_NAMESPACE::Options&, const std::string&,
+        std::unique_ptr<ROCKSDB_NAMESPACE::DB>*)>
+        open_fn) {
   const char* db_path = env->GetStringUTFChars(jdb_path, nullptr);
   if (db_path == nullptr) {
     // exception thrown: OutOfMemoryError
@@ -46,13 +47,13 @@ jlong rocksdb_open_helper(JNIEnv* env, jlong jopt_handle, jstring jdb_path,
   }
 
   auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jopt_handle);
-  ROCKSDB_NAMESPACE::DB* db = nullptr;
+  std::unique_ptr<ROCKSDB_NAMESPACE::DB> db;
   ROCKSDB_NAMESPACE::Status s = open_fn(*opt, db_path, &db);
 
   env->ReleaseStringUTFChars(jdb_path, db_path);
 
   if (s.ok()) {
-    return GET_CPLUSPLUS_POINTER(db);
+    return GET_CPLUSPLUS_POINTER(db.release());
   } else {
     ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s);
     return 0;
@@ -69,9 +70,10 @@ jlong Java_org_rocksdb_RocksDB_open__JLjava_lang_String_2(JNIEnv* env, jclass,
                                                           jstring jdb_path) {
   return rocksdb_open_helper(
       env, jopt_handle, jdb_path,
-      (ROCKSDB_NAMESPACE::Status (*)(
-          const ROCKSDB_NAMESPACE::Options&, const std::string&,
-          ROCKSDB_NAMESPACE::DB**))&ROCKSDB_NAMESPACE::DB::Open);
+      [](const ROCKSDB_NAMESPACE::Options& options, const std::string& db_path,
+         std::unique_ptr<ROCKSDB_NAMESPACE::DB>* db) {
+        return ROCKSDB_NAMESPACE::DB::Open(options, db_path, db);
+      });
 }
 
 /*
@@ -87,7 +89,7 @@ jlong Java_org_rocksdb_RocksDB_openROnly__JLjava_lang_String_2Z(
       env, jopt_handle, jdb_path,
       [error_if_wal_file_exists](const ROCKSDB_NAMESPACE::Options& options,
                                  const std::string& db_path,
-                                 ROCKSDB_NAMESPACE::DB** db) {
+                                 std::unique_ptr<ROCKSDB_NAMESPACE::DB>* db) {
         return ROCKSDB_NAMESPACE::DB::OpenForReadOnly(options, db_path, db,
                                                       error_if_wal_file_exists);
       });
@@ -100,7 +102,7 @@ jlongArray rocksdb_open_helper(
         const ROCKSDB_NAMESPACE::DBOptions&, const std::string&,
         const std::vector<ROCKSDB_NAMESPACE::ColumnFamilyDescriptor>&,
         std::vector<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>*,
-        ROCKSDB_NAMESPACE::DB**)>
+        std::unique_ptr<ROCKSDB_NAMESPACE::DB>*)>
         open_fn) {
   const char* db_path = env->GetStringUTFChars(jdb_path, nullptr);
   if (db_path == nullptr) {
@@ -141,7 +143,7 @@ jlongArray rocksdb_open_helper(
 
   auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jopt_handle);
   std::vector<ROCKSDB_NAMESPACE::ColumnFamilyHandle*> cf_handles;
-  ROCKSDB_NAMESPACE::DB* db = nullptr;
+  std::unique_ptr<ROCKSDB_NAMESPACE::DB> db;
   ROCKSDB_NAMESPACE::Status s =
       open_fn(*opt, db_path, column_families, &cf_handles, &db);
 
@@ -157,7 +159,7 @@ jlongArray rocksdb_open_helper(
   const jsize resultsLen = 1 + len_cols;  // db handle + column family handles
   std::unique_ptr<jlong[]> results =
       std::unique_ptr<jlong[]>(new jlong[resultsLen]);
-  results[0] = GET_CPLUSPLUS_POINTER(db);
+  results[0] = GET_CPLUSPLUS_POINTER(db.release());
   for (int i = 1; i <= len_cols; i++) {
     results[i] = GET_CPLUSPLUS_POINTER(cf_handles[i - 1]);
   }
@@ -196,7 +198,7 @@ jlongArray Java_org_rocksdb_RocksDB_openROnly__JLjava_lang_String_2_3_3B_3JZ(
           const std::vector<ROCKSDB_NAMESPACE::ColumnFamilyDescriptor>&
               column_families,
           std::vector<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>* handles,
-          ROCKSDB_NAMESPACE::DB** db) {
+          std::unique_ptr<ROCKSDB_NAMESPACE::DB>* db) {
         return ROCKSDB_NAMESPACE::DB::OpenForReadOnly(
             options, db_path, column_families, handles, db,
             error_if_wal_file_exists);
@@ -213,11 +215,15 @@ jlongArray Java_org_rocksdb_RocksDB_open__JLjava_lang_String_2_3_3B_3J(
     jobjectArray jcolumn_names, jlongArray jcolumn_options) {
   return rocksdb_open_helper(
       env, jopt_handle, jdb_path, jcolumn_names, jcolumn_options,
-      (ROCKSDB_NAMESPACE::Status (*)(
-          const ROCKSDB_NAMESPACE::DBOptions&, const std::string&,
-          const std::vector<ROCKSDB_NAMESPACE::ColumnFamilyDescriptor>&,
-          std::vector<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>*,
-          ROCKSDB_NAMESPACE::DB**))&ROCKSDB_NAMESPACE::DB::Open);
+      [](const ROCKSDB_NAMESPACE::DBOptions& options,
+         const std::string& db_path,
+         const std::vector<ROCKSDB_NAMESPACE::ColumnFamilyDescriptor>&
+             column_families,
+         std::vector<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>* handles,
+         std::unique_ptr<ROCKSDB_NAMESPACE::DB>* db) {
+        return ROCKSDB_NAMESPACE::DB::Open(options, db_path, column_families,
+                                           handles, db);
+      });
 }
 
 /*
@@ -239,7 +245,7 @@ jlong Java_org_rocksdb_RocksDB_openAsSecondary__JLjava_lang_String_2Ljava_lang_S
       env, jopt_handle, jdb_path,
       [secondary_db_path](const ROCKSDB_NAMESPACE::Options& options,
                           const std::string& db_path,
-                          ROCKSDB_NAMESPACE::DB** db) {
+                          std::unique_ptr<ROCKSDB_NAMESPACE::DB>* db) {
         return ROCKSDB_NAMESPACE::DB::OpenAsSecondary(options, db_path,
                                                       secondary_db_path, db);
       });
@@ -275,7 +281,7 @@ Java_org_rocksdb_RocksDB_openAsSecondary__JLjava_lang_String_2Ljava_lang_String_
           const std::vector<ROCKSDB_NAMESPACE::ColumnFamilyDescriptor>&
               column_families,
           std::vector<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>* handles,
-          ROCKSDB_NAMESPACE::DB** db) {
+          std::unique_ptr<ROCKSDB_NAMESPACE::DB>* db) {
         return ROCKSDB_NAMESPACE::DB::OpenAsSecondary(
             options, db_path, secondary_db_path, column_families, handles, db);
       });
@@ -3044,17 +3050,9 @@ jint Java_org_rocksdb_RocksDB_numberLevels(JNIEnv*, jclass, jlong jdb_handle,
  * Signature: (JJ)I
  */
 jint Java_org_rocksdb_RocksDB_maxMemCompactionLevel(JNIEnv*, jclass,
-                                                    jlong jdb_handle,
-                                                    jlong jcf_handle) {
-  auto* db = reinterpret_cast<ROCKSDB_NAMESPACE::DB*>(jdb_handle);
-  ROCKSDB_NAMESPACE::ColumnFamilyHandle* cf_handle;
-  if (jcf_handle == 0) {
-    cf_handle = db->DefaultColumnFamily();
-  } else {
-    cf_handle =
-        reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyHandle*>(jcf_handle);
-  }
-  return static_cast<jint>(db->MaxMemCompactionLevel(cf_handle));
+                                                    jlong /*jdb_handle*/,
+                                                    jlong /*jcf_handle*/) {
+  return 0;
 }
 
 /*
diff --git a/logging/auto_roll_logger_test.cc b/logging/auto_roll_logger_test.cc
index 2e2747729eb1..30ade0f38919 100644
--- a/logging/auto_roll_logger_test.cc
+++ b/logging/auto_roll_logger_test.cc
@@ -647,7 +647,7 @@ TEST_F(AutoRollLoggerTest, LogHeaderTest) {
 }
 
 TEST_F(AutoRollLoggerTest, LogFileExistence) {
-  ROCKSDB_NAMESPACE::DB* db;
+  std::unique_ptr<ROCKSDB_NAMESPACE::DB> db;
   ROCKSDB_NAMESPACE::Options options;
 #ifdef OS_WIN
   // Replace all slashes in the path so windows CompSpec does not
@@ -664,7 +664,6 @@ TEST_F(AutoRollLoggerTest, LogFileExistence) {
   options.create_if_missing = true;
   ASSERT_OK(ROCKSDB_NAMESPACE::DB::Open(options, kTestDir, &db));
   ASSERT_OK(default_env->FileExists(kLogFile));
-  delete db;
 }
 
 TEST_F(AutoRollLoggerTest, FileCreateFailure) {
diff --git a/memory/memory_allocator_test.cc b/memory/memory_allocator_test.cc
index 2ae38ec11b57..669548970ad2 100644
--- a/memory/memory_allocator_test.cc
+++ b/memory/memory_allocator_test.cc
@@ -83,7 +83,7 @@ TEST_P(MemoryAllocatorTest, DatabaseBlockCache) {
   auto cache = NewLRUCache(1024 * 1024, 6, false, 0.0, allocator_);
   table_options.block_cache = cache;
   options.table_factory.reset(NewBlockBasedTableFactory(table_options));
-  DB* db = nullptr;
+  std::unique_ptr<DB> db;
   Status s = DB::Open(options, dbname, &db);
   ASSERT_OK(s);
   ASSERT_NE(db, nullptr);
@@ -115,7 +115,7 @@ TEST_P(MemoryAllocatorTest, DatabaseBlockCache) {
   // Close database
   s = db->Close();
   ASSERT_OK(s);
-  delete db;
+  db.reset();
   ASSERT_OK(DestroyDB(dbname, options));
 }
 
diff --git a/microbench/db_basic_bench.cc b/microbench/db_basic_bench.cc
index 2eca31f10843..dd4bbb0d68f7 100644
--- a/microbench/db_basic_bench.cc
+++ b/microbench/db_basic_bench.cc
@@ -138,13 +138,11 @@ static void SetupDB(benchmark::State& state, Options& options,
       db_path + kFilePathSeparator + test_name + std::to_string(getpid());
   DestroyDB(db_name, options);
 
-  DB* db_ptr = nullptr;
-  s = DB::Open(options, db_name, &db_ptr);
+  s = DB::Open(options, db_name, db);
   if (!s.ok()) {
     state.SkipWithError(s.ToString().c_str());
     return;
   }
-  db->reset(db_ptr);
 }
 
 static void TeardownDB(benchmark::State& state, const std::unique_ptr<DB>& db,
@@ -181,12 +179,10 @@ static void DBOpen(benchmark::State& state) {
 
   for (auto _ : state) {
     {
-      DB* db_ptr = nullptr;
-      Status s = DB::Open(options, db_name, &db_ptr);
+      Status s = DB::Open(options, db_name, &db);
       if (!s.ok()) {
         state.SkipWithError(s.ToString().c_str());
       }
-      db.reset(db_ptr);
     }
     state.PauseTiming();
     auto wo = WriteOptions();
@@ -231,12 +227,10 @@ static void DBClose(benchmark::State& state) {
   for (auto _ : state) {
     state.PauseTiming();
     {
-      DB* db_ptr = nullptr;
-      Status s = DB::Open(options, db_name, &db_ptr);
+      Status s = DB::Open(options, db_name, &db);
       if (!s.ok()) {
         state.SkipWithError(s.ToString().c_str());
       }
-      db.reset(db_ptr);
     }
     auto wo = WriteOptions();
     Status s;
@@ -727,13 +721,11 @@ static void SimpleGetWithPerfContext(benchmark::State& state) {
     DestroyDB(db_name, options);
 
     {
-      DB* db_ptr = nullptr;
-      s = DB::Open(options, db_name, &db_ptr);
+      s = DB::Open(options, db_name, &db);
       if (!s.ok()) {
         state.SkipWithError(s.ToString().c_str());
         return;
       }
-      db.reset(db_ptr);
     }
     // load db
     auto wo = WriteOptions();
diff --git a/table/sst_file_reader_test.cc b/table/sst_file_reader_test.cc
index 2d169d6f3bee..439ac66b1963 100644
--- a/table/sst_file_reader_test.cc
+++ b/table/sst_file_reader_test.cc
@@ -164,7 +164,7 @@ TEST_F(SstFileReaderTest, ReadFileWithGlobalSeqno) {
   Options options;
   options.create_if_missing = true;
   std::string db_name = test::PerThreadDBPath("test_db");
-  DB* db;
+  std::unique_ptr<DB> db;
   ASSERT_OK(DB::Open(options, db_name, &db));
   // Bump sequence number.
   ASSERT_OK(db->Put(WriteOptions(), keys[0], "foo"));
@@ -186,7 +186,7 @@ TEST_F(SstFileReaderTest, ReadFileWithGlobalSeqno) {
     }
   }
   ASSERT_FALSE(ingested_file.empty());
-  delete db;
+  db.reset();
 
   // Verify the file can be open and read by SstFileReader.
   CheckFile(db_name + ingested_file, keys, true /* check_global_seqno */);
diff --git a/table/table_reader_bench.cc b/table/table_reader_bench.cc
index 8a164488c8c0..ce2e81ddecef 100644
--- a/table/table_reader_bench.cc
+++ b/table/table_reader_bench.cc
@@ -84,7 +84,7 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options,
   Env* env = Env::Default();
   auto* clock = env->GetSystemClock().get();
   TableBuilder* tb = nullptr;
-  DB* db = nullptr;
+  std::unique_ptr<DB> db;
   Status s;
   const ImmutableOptions ioptions(opts);
   const ColumnFamilyOptions cfo(opts);
@@ -257,8 +257,7 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options,
   if (!through_db) {
     env->DeleteFile(file_name);
   } else {
-    delete db;
-    db = nullptr;
+    db.reset();
     DestroyDB(dbname, opts);
   }
 }
diff --git a/table/table_test.cc b/table/table_test.cc
index 43d8ccac43a5..a4f06e4eacc8 100644
--- a/table/table_test.cc
+++ b/table/table_test.cc
@@ -583,18 +583,16 @@ class DBConstructor : public Constructor {
  public:
   explicit DBConstructor(const Comparator* cmp)
       : Constructor(cmp), comparator_(cmp) {
-    db_ = nullptr;
     NewDB();
   }
-  ~DBConstructor() override { delete db_; }
+  ~DBConstructor() override {}
   Status FinishImpl(const Options& /*options*/,
                     const ImmutableOptions& /*ioptions*/,
                     const MutableCFOptions& /*moptions*/,
                     const BlockBasedTableOptions& /*table_options*/,
                     const InternalKeyComparator& /*internal_comparator*/,
                     const stl_wrappers::KVMap& kv_map) override {
-    delete db_;
-    db_ = nullptr;
+    db_.reset();
     NewDB();
     for (const auto& kv : kv_map) {
       WriteBatch batch;
@@ -609,7 +607,7 @@ class DBConstructor : public Constructor {
     return new InternalIteratorFromIterator(db_->NewIterator(ReadOptions()));
   }
 
-  DB* db() const override { return db_; }
+  DB* db() const override { return db_.get(); }
 
  private:
   void NewDB() {
@@ -628,7 +626,7 @@ class DBConstructor : public Constructor {
   }
 
   const Comparator* comparator_;
-  DB* db_;
+  std::unique_ptr<DB> db_;
 };
 
 enum TestType {
@@ -5368,7 +5366,7 @@ TEST_F(PrefixTest, PrefixAndWholeKeyTest) {
   const std::string kDBPath = test::PerThreadDBPath("table_prefix_test");
   options.table_factory.reset(NewBlockBasedTableFactory(bbto));
   ASSERT_OK(DestroyDB(kDBPath, options));
-  ROCKSDB_NAMESPACE::DB* db;
+  std::unique_ptr<ROCKSDB_NAMESPACE::DB> db;
   ASSERT_OK(ROCKSDB_NAMESPACE::DB::Open(options, kDBPath, &db));
 
   // Create a bunch of keys with 10 filters.
@@ -5382,7 +5380,7 @@ TEST_F(PrefixTest, PrefixAndWholeKeyTest) {
 
   // Trigger compaction.
   ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr));
-  delete db;
+  db.reset();
   // In the second round, turn whole_key_filtering off and expect
   // rocksdb still works.
 }
@@ -5688,7 +5686,7 @@ TEST_P(BlockBasedTableTest, FixBlockAlignMismatchedFileChecksums) {
   const std::string kDBPath =
       test::PerThreadDBPath("block_align_padded_bytes_verify_file_checksums");
   ASSERT_OK(DestroyDB(kDBPath, options));
-  DB* db;
+  std::unique_ptr<DB> db;
   ASSERT_OK(DB::Open(options, kDBPath, &db));
   ASSERT_OK(db->Put(WriteOptions(), "k1", "v1"));
   ASSERT_OK(db->Flush(FlushOptions()));
@@ -5696,7 +5694,7 @@ TEST_P(BlockBasedTableTest, FixBlockAlignMismatchedFileChecksums) {
   // aligning blocks are used to generate the checksum to compare against the
   // one not generated by padded bytes
   ASSERT_OK(db->VerifyFileChecksums(ReadOptions()));
-  delete db;
+  db.reset();
 }
 
 class NoBufferAlignmenttWritableFile : public FSWritableFileOwnerWrapper {
@@ -5751,7 +5749,7 @@ TEST_P(BlockBasedTableTest,
   const std::string kDBPath = test::PerThreadDBPath(
       "block_align_flush_during_flush_verify_file_checksums");
   ASSERT_OK(DestroyDB(kDBPath, options));
-  DB* db;
+  std::unique_ptr<DB> db;
   ASSERT_OK(DB::Open(options, kDBPath, &db));
 
   ASSERT_OK(db->Put(WriteOptions(), "k1", "k2"));
@@ -5760,7 +5758,7 @@ TEST_P(BlockBasedTableTest,
   // Before the fix, VerifyFileChecksums() will fail as incorrect padded bytes
   // were used to generate checksum upon file creation
   ASSERT_OK(db->VerifyFileChecksums(ReadOptions()));
-  delete db;
+  db.reset();
 }
 
 TEST_P(BlockBasedTableTest, PropertiesBlockRestartPointTest) {
@@ -6093,27 +6091,25 @@ TEST_P(BlockBasedTableTest, BadOptions) {
   options.table_factory.reset(NewBlockBasedTableFactory(bbto));
   ASSERT_OK(DestroyDB(kDBPath, options));
 
-  std::unique_ptr<DB> db;
   {
-    ROCKSDB_NAMESPACE::DB* _db;
-    ASSERT_NOK(ROCKSDB_NAMESPACE::DB::Open(options, kDBPath, &_db));
+    std::unique_ptr<ROCKSDB_NAMESPACE::DB> db;
+    ASSERT_NOK(ROCKSDB_NAMESPACE::DB::Open(options, kDBPath, &db));
 
     bbto.block_size = 4096;
     options.compression = kSnappyCompression;
     options.table_factory.reset(NewBlockBasedTableFactory(bbto));
-    ASSERT_NOK(ROCKSDB_NAMESPACE::DB::Open(options, kDBPath, &_db));
+    ASSERT_NOK(ROCKSDB_NAMESPACE::DB::Open(options, kDBPath, &db));
 
     options.compression = kNoCompression;
     options.bottommost_compression = kSnappyCompression;
-    ASSERT_NOK(ROCKSDB_NAMESPACE::DB::Open(options, kDBPath, &_db));
+    ASSERT_NOK(ROCKSDB_NAMESPACE::DB::Open(options, kDBPath, &db));
 
     options.bottommost_compression = kNoCompression;
     options.compression_per_level.emplace_back(kSnappyCompression);
-    ASSERT_NOK(ROCKSDB_NAMESPACE::DB::Open(options, kDBPath, &_db));
+    ASSERT_NOK(ROCKSDB_NAMESPACE::DB::Open(options, kDBPath, &db));
 
     options.compression_per_level.clear();
-    ASSERT_OK(ROCKSDB_NAMESPACE::DB::Open(options, kDBPath, &_db));
-    db.reset(_db);
+    ASSERT_OK(ROCKSDB_NAMESPACE::DB::Open(options, kDBPath, &db));
   }
 }
 
diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc
index eb96ef83eff6..5098953cd993 100644
--- a/tools/db_bench_tool.cc
+++ b/tools/db_bench_tool.cc
@@ -2058,6 +2058,7 @@ static void AppendWithSpace(std::string* str, Slice msg) {
 
 struct DBWithColumnFamilies {
   std::vector<ColumnFamilyHandle*> cfh;
+  std::unique_ptr<DB> db_owner;
   DB* db;
   OptimisticTransactionDB* opt_txn_db;
   std::atomic<size_t> num_created;  // Need to be updated after all the
@@ -2087,13 +2088,9 @@ struct DBWithColumnFamilies {
     std::for_each(cfh.begin(), cfh.end(),
                   [](ColumnFamilyHandle* cfhi) { delete cfhi; });
     cfh.clear();
-    if (opt_txn_db) {
-      delete opt_txn_db;
-      opt_txn_db = nullptr;
-    } else {
-      delete db;
-      db = nullptr;
-    }
+    db_owner.reset();
+    db = nullptr;
+    opt_txn_db = nullptr;
   }
 
   ColumnFamilyHandle* GetCfh(int64_t rand_num) {
@@ -3412,8 +3409,8 @@ class Benchmark {
 
   void DeleteDBs() {
     db_.DeleteDBs();
-    for (const DBWithColumnFamilies& dbwcf : multi_dbs_) {
-      delete dbwcf.db;
+    for (auto& dbwcf : multi_dbs_) {
+      dbwcf.DeleteDBs();
     }
   }
 
@@ -3518,11 +3515,13 @@ class Benchmark {
 
   void VerifyDBFromDB(std::string& truth_db_name) {
     DBWithColumnFamilies truth_db;
-    auto s = DB::OpenForReadOnly(open_options_, truth_db_name, &truth_db.db);
+    auto s =
+        DB::OpenForReadOnly(open_options_, truth_db_name, &truth_db.db_owner);
     if (!s.ok()) {
       fprintf(stderr, "open error: %s\n", s.ToString().c_str());
       db_bench_exit(1);
     }
+    truth_db.db = truth_db.db_owner.get();
     ReadOptions ro;
     ro.total_order_seek = true;
     std::unique_ptr<Iterator> truth_iter(truth_db.db->NewIterator(ro));
@@ -3903,7 +3902,7 @@ class Benchmark {
           }
           Options options = open_options_;
           for (size_t i = 0; i < multi_dbs_.size(); i++) {
-            delete multi_dbs_[i].db;
+            multi_dbs_[i].DeleteDBs();
             if (!open_options_.wal_dir.empty()) {
               options.wal_dir = GetPathForMultiple(open_options_.wal_dir, i);
             }
@@ -5136,11 +5135,15 @@ class Benchmark {
       }
       if (FLAGS_readonly) {
         s = hooks.OpenForReadOnly(options, db_name, column_families, &db->cfh,
-                                  &db->db);
+                                  &db->db_owner);
+        if (s.ok()) {
+          db->db = db->db_owner.get();
+        }
       } else if (FLAGS_optimistic_transaction_db) {
         s = hooks.OpenOptimisticTransactionDB(options, db_name, column_families,
                                               &db->cfh, &db->opt_txn_db);
         if (s.ok()) {
+          db->db_owner.reset(db->opt_txn_db);
           db->db = db->opt_txn_db->GetBaseDB();
         }
       } else if (FLAGS_transaction_db) {
@@ -5154,20 +5157,29 @@ class Benchmark {
         s = hooks.OpenTransactionDB(options, txn_db_options, db_name,
                                     column_families, &db->cfh, &ptr);
         if (s.ok()) {
+          db->db_owner.reset(ptr);
           db->db = ptr;
         }
       } else {
-        s = hooks.Open(options, db_name, column_families, &db->cfh, &db->db);
+        s = hooks.Open(options, db_name, column_families, &db->cfh,
+                       &db->db_owner);
+        if (s.ok()) {
+          db->db = db->db_owner.get();
+        }
       }
       db->cfh.resize(FLAGS_num_column_families);
       db->num_created = num_hot;
       db->num_hot = num_hot;
       db->cfh_idx_to_prob = std::move(cfh_idx_to_prob);
     } else if (FLAGS_readonly) {
-      s = hooks.OpenForReadOnly(options, db_name, &db->db, false);
+      s = hooks.OpenForReadOnly(options, db_name, &db->db_owner, false);
+      if (s.ok()) {
+        db->db = db->db_owner.get();
+      }
     } else if (FLAGS_optimistic_transaction_db) {
       s = hooks.OpenOptimisticTransactionDB(options, db_name, &db->opt_txn_db);
       if (s.ok()) {
+        db->db_owner.reset(db->opt_txn_db);
         db->db = db->opt_txn_db->GetBaseDB();
       }
     } else if (FLAGS_transaction_db) {
@@ -5183,6 +5195,7 @@ class Benchmark {
         s = hooks.OpenTransactionDB(options, txn_db_options, db_name, &ptr);
       }
       if (s.ok()) {
+        db->db_owner.reset(ptr);
         db->db = ptr;
       }
     } else if (FLAGS_use_blob_db) {
@@ -5195,6 +5208,7 @@ class Benchmark {
       blob_db::BlobDB* ptr = nullptr;
       s = hooks.Open(options, blob_db_options, db_name, &ptr);
       if (s.ok()) {
+        db->db_owner.reset(ptr);
         db->db = ptr;
       }
     } else if (FLAGS_use_secondary_db) {
@@ -5205,7 +5219,10 @@ class Benchmark {
         FLAGS_secondary_path = default_secondary_path;
       }
       s = hooks.OpenAsSecondary(options, db_name, FLAGS_secondary_path,
-                                &db->db);
+                                &db->db_owner);
+      if (s.ok()) {
+        db->db = db->db_owner.get();
+      }
       if (s.ok() && FLAGS_secondary_update_interval > 0) {
         secondary_update_thread_.reset(new port::Thread(
             [this](int interval, DBWithColumnFamilies* _db) {
@@ -5225,13 +5242,16 @@ class Benchmark {
             FLAGS_secondary_update_interval, db));
       }
     } else if (FLAGS_open_as_follower) {
-      std::unique_ptr<DB> dbptr;
-      s = hooks.OpenAsFollower(options, db_name, FLAGS_leader_path, &dbptr);
+      s = hooks.OpenAsFollower(options, db_name, FLAGS_leader_path,
+                               &db->db_owner);
       if (s.ok()) {
-        db->db = dbptr.release();
+        db->db = db->db_owner.get();
       }
     } else {
-      s = hooks.Open(options, db_name, &db->db);
+      s = hooks.Open(options, db_name, &db->db_owner);
+      if (s.ok()) {
+        db->db = db->db_owner.get();
+      }
     }
     if (FLAGS_report_open_timing) {
       std::cout << "OpenDb:     "
diff --git a/tools/db_repl_stress.cc b/tools/db_repl_stress.cc
index 78cccaa038d4..8db16a1d76ec 100644
--- a/tools/db_repl_stress.cc
+++ b/tools/db_repl_stress.cc
@@ -83,7 +83,7 @@ int main(int argc, const char** argv) {
   options.create_if_missing = true;
   options.WAL_ttl_seconds = FLAGS_wal_ttl_seconds;
   options.WAL_size_limit_MB = FLAGS_wal_size_limit_MB;
-  DB* db;
+  std::unique_ptr<DB> db;
   DestroyDB(default_db_path, options);
 
   Status s = DB::Open(options, default_db_path, &db);
@@ -94,7 +94,7 @@ int main(int argc, const char** argv) {
   }
 
   DataPumpThread dataPump;
-  dataPump.db = db;
+  dataPump.db = db.get();
   env->StartThread(DataPumpThreadBody, &dataPump);
 
   std::unique_ptr<TransactionLogIterator> iter;
diff --git a/tools/db_sanity_test.cc b/tools/db_sanity_test.cc
index dd4fd59bc4ce..76a93e5bbfdd 100644
--- a/tools/db_sanity_test.cc
+++ b/tools/db_sanity_test.cc
@@ -41,9 +41,8 @@ class SanityTest {
     if (!s.ok()) {
       return s;
     }
-    DB* db = nullptr;
+    std::unique_ptr<DB> db;
     s = DB::Open(options, dbname, &db);
-    std::unique_ptr<DB> db_guard(db);
     if (!s.ok()) {
       return s;
     }
@@ -58,10 +57,9 @@ class SanityTest {
     return db->Flush(FlushOptions());
   }
   Status Verify() {
-    DB* db = nullptr;
+    std::unique_ptr<DB> db;
     std::string dbname = path_ + Name();
     Status s = DB::Open(GetOptions(), dbname, &db);
-    std::unique_ptr<DB> db_guard(db);
     if (!s.ok()) {
       return s;
     }
diff --git a/tools/dump/db_dump_tool.cc b/tools/dump/db_dump_tool.cc
index d03230308f31..520c276915db 100644
--- a/tools/dump/db_dump_tool.cc
+++ b/tools/dump/db_dump_tool.cc
@@ -16,7 +16,6 @@ namespace ROCKSDB_NAMESPACE {
 
 bool DbDumpTool::Run(const DumpOptions& dump_options,
                      ROCKSDB_NAMESPACE::Options options) {
-  ROCKSDB_NAMESPACE::DB* dbptr;
   ROCKSDB_NAMESPACE::Status status;
   std::unique_ptr<ROCKSDB_NAMESPACE::WritableFile> dumpfile;
   char hostname[1024];
@@ -31,16 +30,15 @@ bool DbDumpTool::Run(const DumpOptions& dump_options,
 
   // Open the database
   options.create_if_missing = false;
+  std::unique_ptr<ROCKSDB_NAMESPACE::DB> db;
   status = ROCKSDB_NAMESPACE::DB::OpenForReadOnly(options, dump_options.db_path,
-                                                  &dbptr);
+                                                  &db);
   if (!status.ok()) {
     std::cerr << "Unable to open database '" << dump_options.db_path
               << "' for reading: " << status.ToString() << std::endl;
     return false;
   }
 
-  const std::unique_ptr<ROCKSDB_NAMESPACE::DB> db(dbptr);
-
   status = env->NewWritableFile(dump_options.dump_location, &dumpfile,
                                 ROCKSDB_NAMESPACE::EnvOptions());
   if (!status.ok()) {
@@ -131,7 +129,6 @@ bool DbDumpTool::Run(const DumpOptions& dump_options,
 
 bool DbUndumpTool::Run(const UndumpOptions& undump_options,
                        ROCKSDB_NAMESPACE::Options options) {
-  ROCKSDB_NAMESPACE::DB* dbptr;
   ROCKSDB_NAMESPACE::Status status;
   ROCKSDB_NAMESPACE::Env* env;
   std::unique_ptr<ROCKSDB_NAMESPACE::SequentialFile> dumpfile;
@@ -180,15 +177,14 @@ bool DbUndumpTool::Run(const UndumpOptions& undump_options,
   }
 
   options.create_if_missing = true;
-  status = ROCKSDB_NAMESPACE::DB::Open(options, undump_options.db_path, &dbptr);
+  std::unique_ptr<ROCKSDB_NAMESPACE::DB> db;
+  status = ROCKSDB_NAMESPACE::DB::Open(options, undump_options.db_path, &db);
   if (!status.ok()) {
     std::cerr << "Unable to open database '" << undump_options.db_path
               << "' for writing: " << status.ToString() << std::endl;
     return false;
   }
 
-  const std::unique_ptr<ROCKSDB_NAMESPACE::DB> db(dbptr);
-
   uint32_t last_keysize = 64;
   size_t last_valsize = 1 << 20;
   std::unique_ptr<char[]> keyscratch(new char[last_keysize]);
diff --git a/tools/io_tracer_parser_test.cc b/tools/io_tracer_parser_test.cc
index 8e1fb72df394..8f7cb3a5d0cb 100644
--- a/tools/io_tracer_parser_test.cc
+++ b/tools/io_tracer_parser_test.cc
@@ -50,8 +50,7 @@ class IOTracerParserTest : public testing::Test {
     if (db_ != nullptr) {
       Options options;
       options.env = env_;
-      delete db_;
-      db_ = nullptr;
+      db_.reset();
       EXPECT_OK(DestroyDB(dbname_, options));
     }
     EXPECT_OK(env_->DeleteDir(test_path_));
@@ -97,7 +96,7 @@ class IOTracerParserTest : public testing::Test {
     ASSERT_EQ(0, ROCKSDB_NAMESPACE::io_tracer_parser(argc, argv));
   }
 
-  DB* db_;
+  std::unique_ptr<DB> db_;
   Env* env_;
   EnvOptions env_options_;
   std::string trace_file_path_;
diff --git a/tools/ldb_cmd.cc b/tools/ldb_cmd.cc
index 8d39987af1e3..eb3fb66d36bc 100644
--- a/tools/ldb_cmd.cc
+++ b/tools/ldb_cmd.cc
@@ -595,7 +595,7 @@ void LDBCommand::OpenDB() {
       st = TransactionDB::Open(options_, txn_db_options, db_path_,
                                column_families_, &handles_opened, &db_txn_);
     }
-    db_ = db_txn_;
+    db_.reset(db_txn_);
   } else if (is_db_ttl_) {
     // ldb doesn't yet support TTL DB with multiple column families
     if (!column_family_name_.empty() || !column_families_.empty()) {
@@ -611,7 +611,7 @@ void LDBCommand::OpenDB() {
     } else {
       st = DBWithTTL::Open(options_, db_path_, &db_ttl_);
     }
-    db_ = db_ttl_;
+    db_.reset(db_ttl_);
   } else {
     if (!secondary_path_.empty() && !leader_path_.empty()) {
       exec_state_ = LDBCommandExecuteResult::Failed(
@@ -631,9 +631,7 @@ void LDBCommand::OpenDB() {
         } else if (!secondary_path_.empty()) {
           st = DB::OpenAsSecondary(options_, db_path_, secondary_path_, &db_);
         } else {
-          std::unique_ptr<DB> dbptr;
-          st = DB::OpenAsFollower(options_, db_path_, leader_path_, &dbptr);
-          db_ = dbptr.release();
+          st = DB::OpenAsFollower(options_, db_path_, leader_path_, &db_);
         }
       } else {
         if (secondary_path_.empty() && leader_path_.empty()) {
@@ -643,10 +641,8 @@ void LDBCommand::OpenDB() {
           st = DB::OpenAsSecondary(options_, db_path_, secondary_path_,
                                    column_families_, &handles_opened, &db_);
         } else {
-          std::unique_ptr<DB> dbptr;
           st = DB::OpenAsFollower(options_, db_path_, leader_path_,
-                                  column_families_, &handles_opened, &dbptr);
-          db_ = dbptr.release();
+                                  column_families_, &handles_opened, &db_);
         }
       }
     }
@@ -691,8 +687,9 @@ void LDBCommand::CloseDB() {
     }
     Status s = db_->Close();
     s.PermitUncheckedError();
-    delete db_;
-    db_ = nullptr;
+    db_.reset();
+    db_ttl_ = nullptr;
+    db_txn_ = nullptr;
   }
 }
 
@@ -2227,9 +2224,9 @@ void InternalDumpCommand::DoCommand() {
 
   // Cast as DBImpl to get internal iterator
   std::vector<KeyVersion> key_versions;
-  Status st =
-      GetAllKeyVersions(db_, GetCfHandle(), has_from_ ? from_ : OptSlice{},
-                        has_to_ ? to_ : OptSlice{}, max_keys_, &key_versions);
+  Status st = GetAllKeyVersions(
+      db_.get(), GetCfHandle(), has_from_ ? from_ : OptSlice{},
+      has_to_ ? to_ : OptSlice{}, max_keys_, &key_versions);
   if (!st.ok()) {
     exec_state_ = LDBCommandExecuteResult::Failed(st.ToString());
     return;
@@ -4501,7 +4498,7 @@ void CheckPointCommand::DoCommand() {
     return;
   }
   Checkpoint* checkpoint;
-  Status status = Checkpoint::Create(db_, &checkpoint);
+  Status status = Checkpoint::Create(db_.get(), &checkpoint);
   status = checkpoint->CreateCheckpoint(checkpoint_dir_);
   if (status.ok()) {
     fprintf(stdout, "OK\n");
@@ -4656,7 +4653,7 @@ void BackupCommand::DoCommand() {
     exec_state_ = LDBCommandExecuteResult::Failed(status.ToString());
     return;
   }
-  status = backup_engine->CreateNewBackup(db_);
+  status = backup_engine->CreateNewBackup(db_.get());
   if (status.ok()) {
     fprintf(stdout, "create new backup OK\n");
   } else {
diff --git a/tools/ldb_cmd_test.cc b/tools/ldb_cmd_test.cc
index 6943780f74cc..09be65a6914c 100644
--- a/tools/ldb_cmd_test.cc
+++ b/tools/ldb_cmd_test.cc
@@ -7,6 +7,7 @@
 
 #include <cinttypes>
 #include <iomanip>
+#include <memory>
 
 #include "db/db_test_util.h"
 #include "db/version_edit.h"
@@ -99,7 +100,7 @@ TEST_F(LdbCmdTest, MemEnv) {
   opts.env = env.get();
   opts.create_if_missing = true;
 
-  DB* db = nullptr;
+  std::unique_ptr<DB> db;
   std::string dbname = test::PerThreadDBPath(env.get(), "ldb_cmd_test");
   ASSERT_OK(DB::Open(opts, dbname, &db));
 
@@ -113,7 +114,7 @@ TEST_F(LdbCmdTest, MemEnv) {
   fopts.wait = true;
   ASSERT_OK(db->Flush(fopts));
 
-  delete db;
+  db.reset();
 
   char arg1[] = "./ldb";
   char arg2[1024];
@@ -285,7 +286,7 @@ TEST_F(LdbCmdTest, DumpFileChecksumNoChecksum) {
   opts.env = env.get();
   opts.create_if_missing = true;
 
-  DB* db = nullptr;
+  std::unique_ptr<DB> db;
   std::string dbname = test::PerThreadDBPath(env.get(), "ldb_cmd_test");
   ASSERT_OK(DB::Open(opts, dbname, &db));
 
@@ -322,7 +323,7 @@ TEST_F(LdbCmdTest, DumpFileChecksumNoChecksum) {
   }
   ASSERT_OK(db->Flush(fopts));
   ASSERT_OK(db->Close());
-  delete db;
+  db.reset();
 
   char arg1[] = "./ldb";
   char arg2[1024];
@@ -337,7 +338,7 @@ TEST_F(LdbCmdTest, DumpFileChecksumNoChecksum) {
   ASSERT_OK(DB::Open(opts, dbname, &db));
 
   // Verify each sst file checksum value and checksum name
-  FileChecksumTestHelper fct_helper(opts, db, dbname);
+  FileChecksumTestHelper fct_helper(opts, db.get(), dbname);
   ASSERT_OK(fct_helper.VerifyEachFileChecksum());
 
   // Manually trigger compaction
@@ -350,11 +351,11 @@ TEST_F(LdbCmdTest, DumpFileChecksumNoChecksum) {
   CompactRangeOptions options;
   ASSERT_OK(db->CompactRange(options, &begin, &end));
   // Verify each sst file checksum after compaction
-  FileChecksumTestHelper fct_helper_ac(opts, db, dbname);
+  FileChecksumTestHelper fct_helper_ac(opts, db.get(), dbname);
   ASSERT_OK(fct_helper_ac.VerifyEachFileChecksum());
 
   ASSERT_OK(db->Close());
-  delete db;
+  db.reset();
 
   ASSERT_EQ(0,
             LDBCommandRunner::RunCommand(4, argv, opts, LDBOptions(), nullptr));
@@ -364,7 +365,7 @@ TEST_F(LdbCmdTest, DumpFileChecksumNoChecksum) {
   // Verify the checksum information in memory is the same as that in Manifest;
   std::vector<LiveFileMetaData> live_files;
   db->GetLiveFilesMetaData(&live_files);
-  delete db;
+  db.reset();
   ASSERT_OK(fct_helper_ac.VerifyChecksumInManifest(live_files));
 }
 
@@ -376,7 +377,7 @@ TEST_F(LdbCmdTest, BlobDBDumpFileChecksumNoChecksum) {
   opts.create_if_missing = true;
   opts.enable_blob_files = true;
 
-  DB* db = nullptr;
+  std::unique_ptr<DB> db;
   std::string dbname = test::PerThreadDBPath(env.get(), "ldb_cmd_test");
   ASSERT_OK(DB::Open(opts, dbname, &db));
 
@@ -413,7 +414,7 @@ TEST_F(LdbCmdTest, BlobDBDumpFileChecksumNoChecksum) {
   }
   ASSERT_OK(db->Flush(fopts));
   ASSERT_OK(db->Close());
-  delete db;
+  db.reset();
 
   char arg1[] = "./ldb";
   std::string arg2_str = "--db=" + dbname;
@@ -427,7 +428,7 @@ TEST_F(LdbCmdTest, BlobDBDumpFileChecksumNoChecksum) {
   ASSERT_OK(DB::Open(opts, dbname, &db));
 
   // Verify each sst and blob file checksum value and checksum name
-  FileChecksumTestHelper fct_helper(opts, db, dbname);
+  FileChecksumTestHelper fct_helper(opts, db.get(), dbname);
   ASSERT_OK(fct_helper.VerifyEachFileChecksum());
 
   // Manually trigger compaction
@@ -443,11 +444,11 @@ TEST_F(LdbCmdTest, BlobDBDumpFileChecksumNoChecksum) {
   CompactRangeOptions options;
   ASSERT_OK(db->CompactRange(options, &begin, &end));
   // Verify each sst file checksum after compaction
-  FileChecksumTestHelper fct_helper_ac(opts, db, dbname);
+  FileChecksumTestHelper fct_helper_ac(opts, db.get(), dbname);
   ASSERT_OK(fct_helper_ac.VerifyEachFileChecksum());
 
   ASSERT_OK(db->Close());
-  delete db;
+  db.reset();
 
   ASSERT_EQ(0,
             LDBCommandRunner::RunCommand(4, argv, opts, LDBOptions(), nullptr));
@@ -461,7 +462,7 @@ TEST_F(LdbCmdTest, DumpFileChecksumCRC32) {
   opts.create_if_missing = true;
   opts.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory();
 
-  DB* db = nullptr;
+  std::unique_ptr<DB> db;
   std::string dbname = test::PerThreadDBPath(env.get(), "ldb_cmd_test");
   ASSERT_OK(DB::Open(opts, dbname, &db));
 
@@ -498,7 +499,7 @@ TEST_F(LdbCmdTest, DumpFileChecksumCRC32) {
   }
   ASSERT_OK(db->Flush(fopts));
   ASSERT_OK(db->Close());
-  delete db;
+  db.reset();
 
   char arg1[] = "./ldb";
   char arg2[1024];
@@ -513,7 +514,7 @@ TEST_F(LdbCmdTest, DumpFileChecksumCRC32) {
   ASSERT_OK(DB::Open(opts, dbname, &db));
 
   // Verify each sst file checksum value and checksum name
-  FileChecksumTestHelper fct_helper(opts, db, dbname);
+  FileChecksumTestHelper fct_helper(opts, db.get(), dbname);
   ASSERT_OK(fct_helper.VerifyEachFileChecksum());
 
   // Manually trigger compaction
@@ -526,11 +527,11 @@ TEST_F(LdbCmdTest, DumpFileChecksumCRC32) {
   CompactRangeOptions options;
   ASSERT_OK(db->CompactRange(options, &begin, &end));
   // Verify each sst file checksum after compaction
-  FileChecksumTestHelper fct_helper_ac(opts, db, dbname);
+  FileChecksumTestHelper fct_helper_ac(opts, db.get(), dbname);
   ASSERT_OK(fct_helper_ac.VerifyEachFileChecksum());
 
   ASSERT_OK(db->Close());
-  delete db;
+  db.reset();
 
   ASSERT_EQ(0,
             LDBCommandRunner::RunCommand(4, argv, opts, LDBOptions(), nullptr));
@@ -543,7 +544,7 @@ TEST_F(LdbCmdTest, DumpFileChecksumCRC32) {
   ASSERT_OK(fct_helper_ac.VerifyChecksumInManifest(live_files));
 
   ASSERT_OK(db->Close());
-  delete db;
+  db.reset();
 }
 
 TEST_F(LdbCmdTest, BlobDBDumpFileChecksumCRC32) {
@@ -555,7 +556,7 @@ TEST_F(LdbCmdTest, BlobDBDumpFileChecksumCRC32) {
   opts.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory();
   opts.enable_blob_files = true;
 
-  DB* db = nullptr;
+  std::unique_ptr<DB> db;
   std::string dbname = test::PerThreadDBPath(env.get(), "ldb_cmd_test");
   ASSERT_OK(DB::Open(opts, dbname, &db));
 
@@ -592,7 +593,7 @@ TEST_F(LdbCmdTest, BlobDBDumpFileChecksumCRC32) {
   }
   ASSERT_OK(db->Flush(fopts));
   ASSERT_OK(db->Close());
-  delete db;
+  db.reset();
 
   char arg1[] = "./ldb";
   std::string arg2_str = "--db=" + dbname;
@@ -606,7 +607,7 @@ TEST_F(LdbCmdTest, BlobDBDumpFileChecksumCRC32) {
   ASSERT_OK(DB::Open(opts, dbname, &db));
 
   // Verify each sst and blob file checksum value and checksum name
-  FileChecksumTestHelper fct_helper(opts, db, dbname);
+  FileChecksumTestHelper fct_helper(opts, db.get(), dbname);
   ASSERT_OK(fct_helper.VerifyEachFileChecksum());
 
   // Manually trigger compaction
@@ -622,11 +623,11 @@ TEST_F(LdbCmdTest, BlobDBDumpFileChecksumCRC32) {
   CompactRangeOptions options;
   ASSERT_OK(db->CompactRange(options, &begin, &end));
   // Verify each sst file checksum after compaction
-  FileChecksumTestHelper fct_helper_ac(opts, db, dbname);
+  FileChecksumTestHelper fct_helper_ac(opts, db.get(), dbname);
   ASSERT_OK(fct_helper_ac.VerifyEachFileChecksum());
 
   ASSERT_OK(db->Close());
-  delete db;
+  db.reset();
 
   ASSERT_EQ(0,
             LDBCommandRunner::RunCommand(4, argv, opts, LDBOptions(), nullptr));
@@ -678,7 +679,7 @@ TEST_F(LdbCmdTest, ListFileTombstone) {
   opts.env = env.get();
   opts.create_if_missing = true;
 
-  DB* db = nullptr;
+  std::unique_ptr<DB> db;
   std::string dbname = test::PerThreadDBPath(env.get(), "ldb_cmd_test");
   ASSERT_OK(DB::Open(opts, dbname, &db));
 
@@ -694,7 +695,7 @@ TEST_F(LdbCmdTest, ListFileTombstone) {
   ASSERT_OK(db->DeleteRange(wopts, db->DefaultColumnFamily(), "bar", "foo2"));
   ASSERT_OK(db->Flush(fopts));
 
-  delete db;
+  db.reset();
 
   {
     char arg1[] = "./ldb";
@@ -771,7 +772,7 @@ TEST_F(LdbCmdTest, DisableConsistencyChecks) {
   std::string dbname = test::PerThreadDBPath(env.get(), "ldb_cmd_test");
 
   {
-    DB* db = nullptr;
+    std::unique_ptr<DB> db;
     ASSERT_OK(DB::Open(opts, dbname, &db));
 
     WriteOptions wopts;
@@ -785,8 +786,6 @@ TEST_F(LdbCmdTest, DisableConsistencyChecks) {
     ASSERT_OK(db->Put(wopts, "foo2", "3"));
     ASSERT_OK(db->Put(wopts, "bar2", "4"));
     ASSERT_OK(db->Flush(fopts));
-
-    delete db;
   }
 
   {
@@ -890,7 +889,7 @@ TEST_F(LdbCmdTest, LoadCFOptionsAndOverride) {
   opts.env = env.get();
   opts.create_if_missing = true;
 
-  DB* db = nullptr;
+  std::unique_ptr<DB> db;
   std::string dbname = test::PerThreadDBPath(env.get(), "ldb_cmd_test");
   ASSERT_OK(DestroyDB(dbname, opts));
   ASSERT_OK(DB::Open(opts, dbname, &db));
@@ -901,7 +900,7 @@ TEST_F(LdbCmdTest, LoadCFOptionsAndOverride) {
   ASSERT_OK(db->CreateColumnFamily(cf_opts, "cf1", &cf_handle));
 
   delete cf_handle;
-  delete db;
+  db.reset();
 
   char arg1[] = "./ldb";
   char arg2[1024];
@@ -933,7 +932,7 @@ TEST_F(LdbCmdTest, UnsafeRemoveSstFile) {
   opts.level0_file_num_compaction_trigger = 10;
   opts.create_if_missing = true;
 
-  DB* db = nullptr;
+  std::unique_ptr<DB> db;
   std::string dbname = test::PerThreadDBPath(Env::Default(), "ldb_cmd_test");
   ASSERT_OK(DestroyDB(dbname, opts));
   ASSERT_OK(DB::Open(opts, dbname, &db));
@@ -957,8 +956,7 @@ TEST_F(LdbCmdTest, UnsafeRemoveSstFile) {
   uint64_t to_remove = numbers[1];
 
   // Close for unsafe_remove_sst_file
-  delete db;
-  db = nullptr;
+  db.reset();
 
   char arg1[] = "./ldb";
   char arg2[1024];
@@ -1007,8 +1005,7 @@ TEST_F(LdbCmdTest, UnsafeRemoveSstFile) {
 
   // Close for unsafe_remove_sst_file
   delete cf_handle;
-  delete db;
-  db = nullptr;
+  db.reset();
 
   snprintf(arg4, sizeof(arg4), "%" PRIu64, to_remove);
   ASSERT_EQ(0,
@@ -1049,8 +1046,7 @@ TEST_F(LdbCmdTest, UnsafeRemoveSstFile) {
   for (auto& h : handles) {
     delete h;
   }
-  delete db;
-  db = nullptr;
+  db.reset();
 
   snprintf(arg4, sizeof(arg4), "%" PRIu64, to_remove);
   ASSERT_EQ(0,
@@ -1066,7 +1062,7 @@ TEST_F(LdbCmdTest, UnsafeRemoveSstFile) {
   for (auto& h : handles) {
     delete h;
   }
-  delete db;
+  db.reset();
 }
 
 TEST_F(LdbCmdTest, FileTemperatureUpdateManifest) {
@@ -1078,7 +1074,7 @@ TEST_F(LdbCmdTest, FileTemperatureUpdateManifest) {
   opts.create_if_missing = true;
   opts.env = env.get();
 
-  DB* db = nullptr;
+  std::unique_ptr<DB> db;
   std::string dbname = test::PerThreadDBPath(env.get(), "ldb_cmd_test");
   ASSERT_OK(DestroyDB(dbname, opts));
   ASSERT_OK(DB::Open(opts, dbname, &db));
@@ -1102,8 +1098,7 @@ TEST_F(LdbCmdTest, FileTemperatureUpdateManifest) {
   }
 
   // Close & reopen
-  delete db;
-  db = nullptr;
+  db.reset();
   test_fs->PopRequestedSstFileTemperatures();
   ASSERT_OK(DB::Open(opts, dbname, &db));
 
@@ -1122,8 +1117,7 @@ TEST_F(LdbCmdTest, FileTemperatureUpdateManifest) {
   }
 
   // Close for update_manifest
-  delete db;
-  db = nullptr;
+  db.reset();
 
   char arg1[] = "./ldb";
   char arg2[1024];
@@ -1151,7 +1145,7 @@ TEST_F(LdbCmdTest, FileTemperatureUpdateManifest) {
   for (auto& r : requests) {
     ASSERT_EQ(r.second, number_to_temp[r.first]);
   }
-  delete db;
+  db.reset();
 }
 
 TEST_F(LdbCmdTest, RenameDbAndLoadOptions) {
@@ -1231,7 +1225,7 @@ TEST_F(LdbCmdTest, CustomComparator) {
   opts.comparator = &my_comparator;
 
   std::string dbname = test::PerThreadDBPath(env, "ldb_cmd_test");
-  DB* db = nullptr;
+  std::unique_ptr<DB> db;
 
   std::vector<ColumnFamilyDescriptor> cfds = {
       {kDefaultColumnFamilyName, opts}, {"cf1", opts}, {"cf2", opts}};
@@ -1243,7 +1237,7 @@ TEST_F(LdbCmdTest, CustomComparator) {
   for (auto& h : handles) {
     ASSERT_OK(db->DestroyColumnFamilyHandle(h));
   }
-  delete db;
+  db.reset();
 
   char arg1[] = "./ldb";
   std::string arg2 = "--db=" + dbname;
diff --git a/tools/reduce_levels_test.cc b/tools/reduce_levels_test.cc
index 061270d6d32a..2ab32a8c97b1 100644
--- a/tools/reduce_levels_test.cc
+++ b/tools/reduce_levels_test.cc
@@ -21,7 +21,7 @@ class ReduceLevelTest : public testing::Test {
   ReduceLevelTest() {
     dbname_ = test::PerThreadDBPath("db_reduce_levels_test");
     EXPECT_OK(DestroyDB(dbname_, Options()));
-    db_ = nullptr;
+    db_.reset();
   }
 
   Status OpenDB(bool create_if_missing, int levels);
@@ -46,12 +46,12 @@ class ReduceLevelTest : public testing::Test {
     if (db_ == nullptr) {
       return Status::InvalidArgument("DB not opened.");
     }
-    DBImpl* db_impl = static_cast_with_check<DBImpl>(db_);
+    DBImpl* db_impl = static_cast_with_check<DBImpl>(db_.get());
     return db_impl->TEST_FlushMemTable();
   }
 
   void MoveL0FileToLevel(int level) {
-    DBImpl* db_impl = static_cast_with_check<DBImpl>(db_);
+    DBImpl* db_impl = static_cast_with_check<DBImpl>(db_.get());
     for (int i = 0; i < level; ++i) {
       ASSERT_OK(db_impl->TEST_CompactRange(i, nullptr, nullptr));
     }
@@ -59,8 +59,7 @@ class ReduceLevelTest : public testing::Test {
 
   void CloseDB() {
     if (db_ != nullptr) {
-      delete db_;
-      db_ = nullptr;
+      db_.reset();
     }
   }
 
@@ -75,7 +74,7 @@ class ReduceLevelTest : public testing::Test {
 
  private:
   std::string dbname_;
-  DB* db_;
+  std::unique_ptr<DB> db_;
 };
 
 Status ReduceLevelTest::OpenDB(bool create_if_missing, int num_levels) {
diff --git a/tools/tool_hooks.cc b/tools/tool_hooks.cc
index bdccd4d49157..32fac03e61fb 100644
--- a/tools/tool_hooks.cc
+++ b/tools/tool_hooks.cc
@@ -16,19 +16,20 @@
 namespace ROCKSDB_NAMESPACE {
 
 Status DefaultHooks::Open(const Options& db_options, const std::string& name,
-                          DB** dbptr) {
+                          std::unique_ptr<DB>* dbptr) {
   return DB::Open(db_options, name, dbptr);
 };
 
 Status DefaultHooks::Open(
     const DBOptions& db_options, const std::string& name,
     const std::vector<ColumnFamilyDescriptor>& column_families,
-    std::vector<ColumnFamilyHandle*>* handles, DB** dbptr) {
+    std::vector<ColumnFamilyHandle*>* handles, std::unique_ptr<DB>* dbptr) {
   return DB::Open(db_options, name, column_families, handles, dbptr);
 };
 
 Status DefaultHooks::OpenForReadOnly(const Options& options,
-                                     const std::string& name, DB** dbptr,
+                                     const std::string& name,
+                                     std::unique_ptr<DB>* dbptr,
                                      bool error_if_wal_file_exists = false) {
   return DB::OpenForReadOnly(options, name, dbptr, error_if_wal_file_exists);
 };
@@ -36,7 +37,7 @@ Status DefaultHooks::OpenForReadOnly(const Options& options,
 Status DefaultHooks::OpenForReadOnly(
     const Options& options, const std::string& name,
     const std::vector<ColumnFamilyDescriptor>& column_families,
-    std::vector<ColumnFamilyHandle*>* handles, DB** dbptr) {
+    std::vector<ColumnFamilyHandle*>* handles, std::unique_ptr<DB>* dbptr) {
   return DB::OpenForReadOnly(options, name, column_families, handles, dbptr);
 };
 Status DefaultHooks::OpenTransactionDB(
@@ -72,7 +73,7 @@ Status DefaultHooks::OpenOptimisticTransactionDB(
 Status DefaultHooks::OpenAsSecondary(const Options& options,
                                      const std::string& name,
                                      const std::string& secondary_path,
-                                     DB** dbptr) {
+                                     std::unique_ptr<DB>* dbptr) {
   return DB::OpenAsSecondary(options, name, secondary_path, dbptr);
 }
 Status DefaultHooks::OpenAsFollower(const Options& options,
diff --git a/tools/trace_analyzer_test.cc b/tools/trace_analyzer_test.cc
index 1d5c870540ad..1d1ee61e6670 100644
--- a/tools/trace_analyzer_test.cc
+++ b/tools/trace_analyzer_test.cc
@@ -69,7 +69,7 @@ class TraceAnalyzerTest : public testing::Test {
     ro.iterate_lower_bound = &lower_bound;
     WriteOptions wo;
     TraceOptions trace_opt;
-    DB* db_ = nullptr;
+    std::unique_ptr<DB> db_;
     std::string value;
     std::unique_ptr<TraceWriter> trace_writer;
     Iterator* single_iter = nullptr;
@@ -125,7 +125,7 @@ class TraceAnalyzerTest : public testing::Test {
     ASSERT_OK(env_->NewWritableFile(whole_path, &whole_f, env_options_));
     std::string whole_str = "0x61\n0x62\n0x63\n0x64\n0x65\n0x66\n";
     ASSERT_OK(whole_f->Append(whole_str));
-    delete db_;
+    db_.reset();
     ASSERT_OK(DestroyDB(dbname_, options));
   }
 
@@ -786,7 +786,7 @@ TEST_F(TraceAnalyzerTest, Iterator) {
 }
 
 TEST_F(TraceAnalyzerTest, ExistsPreviousTraceWriteError) {
-  DB* db_ = nullptr;
+  std::unique_ptr<DB> db_;
   Options options;
   options.create_if_missing = true;
 
@@ -823,7 +823,7 @@ TEST_F(TraceAnalyzerTest, ExistsPreviousTraceWriteError) {
   ASSERT_TRUE(s.ToString().find("Tracing has seen error") != std::string::npos);
   ASSERT_TRUE(s.ToString().find("Injected") != std::string::npos);
 
-  delete db_;
+  db_.reset();
   ASSERT_OK(DestroyDB(dbname_, options));
 }
 
diff --git a/tools/write_stress.cc b/tools/write_stress.cc
index 5cfec3e8e5bd..30fc0467d52d 100644
--- a/tools/write_stress.cc
+++ b/tools/write_stress.cc
@@ -145,13 +145,11 @@ class WriteStress {
     }
 
     // open DB
-    DB* db;
-    Status s = DB::Open(options, FLAGS_db, &db);
+    Status s = DB::Open(options, FLAGS_db, &db_);
     if (!s.ok()) {
       fprintf(stderr, "Can't open database: %s\n", s.ToString().c_str());
       std::abort();
     }
-    db_.reset(db);
   }
 
   void WriteThread() {
diff --git a/unreleased_history/public_api_changes/remove_raw_ptr_db_open.md b/unreleased_history/public_api_changes/remove_raw_ptr_db_open.md
new file mode 100644
index 000000000000..03c442d2ac53
--- /dev/null
+++ b/unreleased_history/public_api_changes/remove_raw_ptr_db_open.md
@@ -0,0 +1,2 @@
+* Remove deprecated raw `DB*` variants of `DB::Open` and related functions. Some other minor public APIs were updated as a result
+* Remove deprecated `DB::MaxMemCompactionLevel()`
diff --git a/util/slice_transform_test.cc b/util/slice_transform_test.cc
index 18b0ea51f327..9761699f41af 100644
--- a/util/slice_transform_test.cc
+++ b/util/slice_transform_test.cc
@@ -49,7 +49,7 @@ class SliceTransformDBTest : public testing::Test {
  private:
   std::string dbname_;
   Env* env_;
-  DB* db_;
+  std::unique_ptr<DB> db_;
 
  public:
   SliceTransformDBTest() : env_(Env::Default()), db_(nullptr) {
@@ -58,11 +58,11 @@ class SliceTransformDBTest : public testing::Test {
   }
 
   ~SliceTransformDBTest() override {
-    delete db_;
+    db_.reset();
     EXPECT_OK(DestroyDB(dbname_, last_options_));
   }
 
-  DB* db() { return db_; }
+  DB* db() { return db_.get(); }
 
   // Return the current option configuration.
   Options* GetOptions() { return &last_options_; }
@@ -74,14 +74,12 @@ class SliceTransformDBTest : public testing::Test {
   }
 
   void Destroy() {
-    delete db_;
-    db_ = nullptr;
+    db_.reset();
     ASSERT_OK(DestroyDB(dbname_, last_options_));
   }
 
   Status TryReopen() {
-    delete db_;
-    db_ = nullptr;
+    db_.reset();
     last_options_.create_if_missing = true;
 
     return DB::Open(last_options_, dbname_, &db_);
diff --git a/utilities/backup/backup_engine_test.cc b/utilities/backup/backup_engine_test.cc
index d31f34ef887a..ff5c7378e7e3 100644
--- a/utilities/backup/backup_engine_test.cc
+++ b/utilities/backup/backup_engine_test.cc
@@ -758,8 +758,8 @@ class BackupEngineTest : public testing::Test {
     ASSERT_OK(CreateLoggerFromOptions(dbname_, logger_options, &logger_));
   }
 
-  DB* OpenDB() {
-    DB* db;
+  std::unique_ptr<DB> OpenDB() {
+    std::unique_ptr<DB> db;
     EXPECT_OK(DB::Open(options_, dbname_, &db));
     return db;
   }
@@ -770,13 +770,11 @@ class BackupEngineTest : public testing::Test {
 
     // Open DB
     test_db_fs_->SetLimitWrittenFiles(1000000);
-    DB* db;
     if (read_only) {
-      ASSERT_OK(DB::OpenForReadOnly(options_, dbname_, &db));
+      ASSERT_OK(DB::OpenForReadOnly(options_, dbname_, &db_));
     } else {
-      ASSERT_OK(DB::Open(options_, dbname_, &db));
+      ASSERT_OK(DB::Open(options_, dbname_, &db_));
     }
-    db_.reset(db);
   }
 
   void InitializeDBAndBackupEngine(bool dummy = false) {
@@ -784,14 +782,12 @@ class BackupEngineTest : public testing::Test {
     test_db_fs_->SetLimitWrittenFiles(1000000);
     test_db_fs_->SetDummySequentialFile(dummy);
 
-    DB* db;
     if (dummy) {
       dummy_db_ = new DummyDB(options_, dbname_);
-      db = dummy_db_;
+      db_.reset(dummy_db_);
     } else {
-      ASSERT_OK(DB::Open(options_, dbname_, &db));
+      ASSERT_OK(DB::Open(options_, dbname_, &db_));
     }
-    db_.reset(db);
   }
 
   virtual void OpenDBAndBackupEngine(
@@ -914,13 +910,13 @@ class BackupEngineTest : public testing::Test {
       ASSERT_OK(backup_engine_->RestoreDBFromLatestBackup(dbname_, dbname_,
                                                           restore_options));
     }
-    DB* db = OpenDB();
+    auto db = OpenDB();
     // Check DB contents
-    AssertExists(db, start_exist, end_exist);
+    AssertExists(db.get(), start_exist, end_exist);
     if (end != 0) {
-      AssertEmpty(db, end_exist, end);
+      AssertEmpty(db.get(), end_exist, end);
     }
-    delete db;
+    db.reset();
     if (opened_backup_engine) {
       CloseBackupEngine();
     }
@@ -1063,6 +1059,7 @@ class BackupEngineTest : public testing::Test {
   // all the dbs!
   DummyDB* dummy_db_;  // owned as db_ when present
   std::unique_ptr<DB> db_;
+  DBImpl* dbfull() { return static_cast_with_check<DBImpl>(db_.get()); }
   std::unique_ptr<BackupEngine> backup_engine_;
 
   // options
@@ -1203,7 +1200,7 @@ TEST_F(BackupEngineTest, IncrementalRestore) {
     // Since we started with a blank db, restore copied all the files.
     test_db_fs_->AssertWrittenFiles(all_files);
 
-    db_.reset(OpenDB());
+    db_ = OpenDB();
 
     // Check DB contents.
     AssertExists(db_.get(), 0, keys_iteration * 2);
@@ -1255,7 +1252,7 @@ TEST_F(BackupEngineTest, IncrementalRestore) {
     test_db_fs_->AssertWrittenFiles(should_have_written);
 
     // Check DB contents.
-    db_.reset(OpenDB());
+    db_ = OpenDB();
     AssertExists(db_.get(), 0, keys_iteration * 2);
 
     db_.reset();  // Close DB.
@@ -1307,7 +1304,7 @@ TEST_F(BackupEngineTest, IncrementalRestore) {
     // 'Hole' has been patched, 'in-policy' db files were retained.
     test_db_fs_->AssertWrittenFiles(should_have_written);
 
-    db_.reset(OpenDB());
+    db_ = OpenDB();
     Status s = db_->VerifyChecksum();
 
     // Check DB contents.
@@ -1424,9 +1421,9 @@ TEST_P(BackupEngineTestWithParam, OfflineIntegrationTest) {
       DestroyDBWithoutCheck(dbname_, options_);
 
       // ---- make sure it's empty ----
-      DB* db = OpenDB();
-      AssertEmpty(db, 0, fill_up_to);
-      delete db;
+      auto db = OpenDB();
+      AssertEmpty(db.get(), 0, fill_up_to);
+      db.reset();
 
       // ---- restore the DB ----
       OpenBackupEngine();
@@ -1478,9 +1475,9 @@ TEST_P(BackupEngineTestWithParam, OnlineIntegrationTest) {
   DestroyDBWithoutCheck(dbname_, options_);
 
   // ---- make sure it's empty ----
-  DB* db = OpenDB();
-  AssertEmpty(db, 0, max_key);
-  delete db;
+  auto db = OpenDB();
+  AssertEmpty(db.get(), 0, max_key);
+  db.reset();
 
   // ---- restore every backup and verify all the data is there ----
   OpenBackupEngine();
@@ -2091,10 +2088,9 @@ TEST_F(BackupEngineTest, FlushCompactDuringBackupCheckpoint) {
           "BackupEngineTest::FlushCompactDuringBackupCheckpoint:Before");
       FillDB(db_.get(), keys_iteration, 2 * keys_iteration);
       ASSERT_OK(db_->Flush(FlushOptions()));
-      DBImpl* dbi = static_cast<DBImpl*>(db_.get());
-      ASSERT_OK(dbi->TEST_WaitForFlushMemTable());
+      ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
       ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
-      ASSERT_OK(dbi->TEST_WaitForCompact());
+      ASSERT_OK(dbfull()->TEST_WaitForCompact());
       TEST_SYNC_POINT(
           "BackupEngineTest::FlushCompactDuringBackupCheckpoint:After");
     }};
@@ -2141,7 +2137,7 @@ TEST_F(BackupEngineTest, BackupOptions) {
     // Must reset() before reset(OpenDB()) again.
     // Calling OpenDB() while *db_ is existing will cause LOCK issue
     db_.reset();
-    db_.reset(OpenDB());
+    db_ = OpenDB();
     ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true));
     ASSERT_OK(ROCKSDB_NAMESPACE::GetLatestOptionsFileName(db_->GetName(),
                                                           options_.env, &name));
@@ -2169,13 +2165,12 @@ TEST_F(BackupEngineTest, SetOptionsBackupRaceCondition) {
   ROCKSDB_NAMESPACE::port::Thread setoptions_thread{[this]() {
     TEST_SYNC_POINT(
         "BackupEngineTest::SetOptionsBackupRaceCondition:BeforeSetOptions");
-    DBImpl* dbi = static_cast<DBImpl*>(db_.get());
     // Change arbitrary option to trigger OPTIONS file deletion
-    ASSERT_OK(dbi->SetOptions(dbi->DefaultColumnFamily(),
+    ASSERT_OK(db_->SetOptions(db_->DefaultColumnFamily(),
                               {{"paranoid_file_checks", "false"}}));
-    ASSERT_OK(dbi->SetOptions(dbi->DefaultColumnFamily(),
+    ASSERT_OK(db_->SetOptions(db_->DefaultColumnFamily(),
                               {{"paranoid_file_checks", "true"}}));
-    ASSERT_OK(dbi->SetOptions(dbi->DefaultColumnFamily(),
+    ASSERT_OK(db_->SetOptions(db_->DefaultColumnFamily(),
                               {{"paranoid_file_checks", "false"}}));
     TEST_SYNC_POINT(
         "BackupEngineTest::SetOptionsBackupRaceCondition:AfterSetOptions");
@@ -2433,14 +2428,13 @@ TEST_F(BackupEngineTest, TableFileCorruptionBeforeIncremental) {
         engine_options_->share_files_with_checksum_naming = option;
       }
       OpenDBAndBackupEngine(true, false, share);
-      DBImpl* dbi = static_cast<DBImpl*>(db_.get());
       // A small SST file
-      ASSERT_OK(dbi->Put(WriteOptions(), "x", "y"));
-      ASSERT_OK(dbi->Flush(FlushOptions()));
+      ASSERT_OK(db_->Put(WriteOptions(), "x", "y"));
+      ASSERT_OK(db_->Flush(FlushOptions()));
       // And a bigger one
-      ASSERT_OK(dbi->Put(WriteOptions(), "y", Random(42).RandomString(500)));
-      ASSERT_OK(dbi->Flush(FlushOptions()));
-      ASSERT_OK(dbi->TEST_WaitForFlushMemTable());
+      ASSERT_OK(db_->Put(WriteOptions(), "y", Random(42).RandomString(500)));
+      ASSERT_OK(db_->Flush(FlushOptions()));
+      ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
       CloseAndReopenDB(/*read_only*/ true);
 
       std::vector<FileAttributes> table_files;
@@ -2485,9 +2479,8 @@ TEST_F(BackupEngineTest, TableFileCorruptionBeforeIncremental) {
       db_.reset();
       ASSERT_OK(backup_engine_->RestoreDBFromBackup(2, dbname_, dbname_));
       {
-        DB* db = OpenDB();
+        auto db = OpenDB();
         s = db->VerifyChecksum();
-        delete db;
       }
       if (option != kLegacyCrc32cAndFileSize && !corrupt_before_first_backup) {
         // Second backup is OK because it used (uncorrupt) file from first
@@ -2529,11 +2522,10 @@ TEST_F(BackupEngineTest, TableFileCorruptionBeforeIncremental) {
 
 TEST_F(BackupEngineTest, PropertiesBlockCorruptionIncremental) {
   OpenDBAndBackupEngine(true, false, kShareWithChecksum);
-  DBImpl* dbi = static_cast<DBImpl*>(db_.get());
   // A small SST file
-  ASSERT_OK(dbi->Put(WriteOptions(), "x", "y"));
-  ASSERT_OK(dbi->Flush(FlushOptions()));
-  ASSERT_OK(dbi->TEST_WaitForFlushMemTable());
+  ASSERT_OK(db_->Put(WriteOptions(), "x", "y"));
+  ASSERT_OK(db_->Flush(FlushOptions()));
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
 
   ASSERT_OK(backup_engine_->CreateNewBackup(db_.get()));
 
@@ -3350,9 +3342,9 @@ TEST_F(BackupEngineTest, ReadOnlyBackupEngine) {
   std::vector<std::string> should_have_written;
   test_backup_fs_->AssertWrittenFiles(should_have_written);
 
-  DB* db = OpenDB();
-  AssertExists(db, 0, 200);
-  delete db;
+  auto db = OpenDB();
+  AssertExists(db.get(), 0, 200);
+  db.reset();
 }
 
 TEST_F(BackupEngineTest, OpenBackupAsReadOnlyDB) {
@@ -3385,7 +3377,7 @@ TEST_F(BackupEngineTest, OpenBackupAsReadOnlyDB) {
   // Caution: DBOptions only holds a raw pointer to Env, so something else
   // must keep it alive.
   // Case 1: Keeping BackupEngine open suffices to keep Env alive
-  DB* db = nullptr;
+  std::unique_ptr<DB> db;
   Options opts = options_;
   // Ensure some key defaults are set
   opts.wal_dir = "";
@@ -3397,11 +3389,10 @@ TEST_F(BackupEngineTest, OpenBackupAsReadOnlyDB) {
   backup_info = BackupInfo();
   ASSERT_OK(DB::OpenForReadOnly(opts, name, &db));
 
-  AssertExists(db, 0, 100);
-  AssertEmpty(db, 100, 200);
+  AssertExists(db.get(), 0, 100);
+  AssertEmpty(db.get(), 100, 200);
 
-  delete db;
-  db = nullptr;
+  db.reset();
 
   // Case 2: Keeping BackupInfo alive rather than BackupEngine also suffices
   ASSERT_OK(backup_engine_->GetBackupInfo(/*id*/ 2U, &backup_info,
@@ -3413,12 +3404,14 @@ TEST_F(BackupEngineTest, OpenBackupAsReadOnlyDB) {
   // Note: keeping backup_info alive
   ASSERT_OK(DB::OpenForReadOnly(opts, name, &db));
 
-  AssertExists(db, 0, 200);
-  delete db;
-  db = nullptr;
+  AssertExists(db.get(), 0, 200);
+  db.reset();
 
   // Now try opening read-write and make sure it fails, for safety.
-  ASSERT_TRUE(DB::Open(opts, name, &db).IsIOError());
+  {
+    std::unique_ptr<DB> dbptr;
+    ASSERT_TRUE(DB::Open(opts, name, &dbptr).IsIOError());
+  }
 }
 
 TEST_F(BackupEngineTest, ProgressCallbackDuringBackup) {
@@ -3565,16 +3558,15 @@ TEST_F(BackupEngineTest, ChangeManifestDuringBackupCreation) {
   // The last manifest roll would've already been cleaned up by the full scan
   // that happens when CreateNewBackup invokes EnableFileDeletions. We need to
   // trigger another roll to verify non-full scan purges stale manifests.
-  DBImpl* db_impl = static_cast_with_check<DBImpl>(db_.get());
   std::string prev_manifest_path =
-      DescriptorFileName(dbname_, db_impl->TEST_Current_Manifest_FileNo());
+      DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo());
   FillDB(db_.get(), 0, 100, kAutoFlushOnly);
   ASSERT_OK(db_chroot_env_->FileExists(prev_manifest_path));
   ASSERT_OK(db_->Flush(FlushOptions()));
   // Even though manual flush completed above, the background thread may not
   // have finished its cleanup work. `TEST_WaitForBackgroundWork()` will wait
   // until all the background thread's work has completed, including cleanup.
-  ASSERT_OK(db_impl->TEST_WaitForBackgroundWork());
+  ASSERT_OK(dbfull()->TEST_WaitForBackgroundWork());
   ASSERT_TRUE(db_chroot_env_->FileExists(prev_manifest_path).IsNotFound());
 
   CloseDBAndBackupEngine();
@@ -3940,7 +3932,7 @@ TEST_F(BackupEngineTest, Concurrency) {
       // by doing it async and ensuring we either get OK or InvalidArgument
       restore_verify_threads[i] =
           std::thread([this, &db_opts, restore_db_dir, to_restore] {
-            DB* restored;
+            std::unique_ptr<DB> restored;
             Status s;
             for (;;) {
               s = DB::Open(db_opts, restore_db_dir, &restored);
@@ -3956,10 +3948,9 @@ TEST_F(BackupEngineTest, Concurrency) {
               }
             }
             int factor = std::min(static_cast<int>(to_restore), max_factor);
-            AssertExists(restored, 0, factor * keys_iteration);
-            AssertEmpty(restored, factor * keys_iteration,
+            AssertExists(restored.get(), 0, factor * keys_iteration);
+            AssertEmpty(restored.get(), factor * keys_iteration,
                         (factor + 1) * keys_iteration);
-            delete restored;
           });
 
       // (Ok now) Restore one of the backups, or "latest"
@@ -4418,14 +4409,13 @@ TEST_F(BackupEngineTest, FileTemperatures) {
                         kShareWithChecksum);
 
   // generate a bottommost file (combined from 2) and a non-bottommost file
-  DBImpl* dbi = static_cast_with_check<DBImpl>(db_.get());
   ASSERT_OK(db_->Put(WriteOptions(), "a", "val"));
   ASSERT_OK(db_->Put(WriteOptions(), "c", "val"));
   ASSERT_OK(db_->Flush(FlushOptions()));
   ASSERT_OK(db_->Put(WriteOptions(), "b", "val"));
   ASSERT_OK(db_->Put(WriteOptions(), "d", "val"));
   ASSERT_OK(db_->Flush(FlushOptions()));
-  ASSERT_OK(dbi->TEST_WaitForCompact());
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
   ASSERT_OK(db_->Put(WriteOptions(), "e", "val"));
   ASSERT_OK(db_->Flush(FlushOptions()));
 
@@ -4583,7 +4573,7 @@ TEST_F(BackupEngineTest, ExcludeFiles) {
 
     // Ensure each backup is same set of files
     db_.reset();
-    DB* db = nullptr;
+    std::unique_ptr<DB> db;
     ASSERT_OK(DB::OpenForReadOnly(options_, dbname_, &db));
 
     // A callback that throws should cleanly fail the backup creation.
@@ -4593,12 +4583,12 @@ TEST_F(BackupEngineTest, ExcludeFiles) {
                                     MaybeExcludeBackupFile* /*files_end*/) {
       throw 42;
     };
-    ASSERT_TRUE(backup_engine_->CreateNewBackup(cbo, db).IsAborted());
+    ASSERT_TRUE(backup_engine_->CreateNewBackup(cbo, db.get()).IsAborted());
     cbo.exclude_files_callback = [](MaybeExcludeBackupFile* /*files_begin*/,
                                     MaybeExcludeBackupFile* /*files_end*/) {
       throw std::out_of_range("blah");
     };
-    ASSERT_TRUE(backup_engine_->CreateNewBackup(cbo, db).IsAborted());
+    ASSERT_TRUE(backup_engine_->CreateNewBackup(cbo, db.get()).IsAborted());
 
     // Include files only in given bucket, based on modulus and remainder
     constexpr int modulus = 4;
@@ -4619,22 +4609,21 @@ TEST_F(BackupEngineTest, ExcludeFiles) {
     BackupID first_id{};
     BackupID last_alt_id{};
     remainder = 0;
-    ASSERT_OK(backup_engine_->CreateNewBackup(cbo, db, &first_id));
+    ASSERT_OK(backup_engine_->CreateNewBackup(cbo, db.get(), &first_id));
     AssertBackupInfoConsistency(/*allow excluded*/ true);
     remainder = 1;
-    ASSERT_OK(alt_backup_engine->CreateNewBackup(cbo, db));
+    ASSERT_OK(alt_backup_engine->CreateNewBackup(cbo, db.get()));
     AssertBackupInfoConsistency(/*allow excluded*/ true);
     remainder = 2;
-    ASSERT_OK(backup_engine_->CreateNewBackup(cbo, db));
+    ASSERT_OK(backup_engine_->CreateNewBackup(cbo, db.get()));
     AssertBackupInfoConsistency(/*allow excluded*/ true);
     remainder = 3;
-    ASSERT_OK(alt_backup_engine->CreateNewBackup(cbo, db, &last_alt_id));
+    ASSERT_OK(alt_backup_engine->CreateNewBackup(cbo, db.get(), &last_alt_id));
     AssertBackupInfoConsistency(/*allow excluded*/ true);
 
     // Close DB
     ASSERT_OK(db->Close());
-    delete db;
-    db = nullptr;
+    db.reset();
 
     auto backup_engine = backup_engine_.get();
     for (auto be_pair : {std::make_pair(backup_engine, alt_backup_engine),
@@ -4652,8 +4641,8 @@ TEST_F(BackupEngineTest, ExcludeFiles) {
 
       // Check DB contents
       db = OpenDB();
-      AssertExists(db, 0, keys_iteration);
-      delete db;
+      AssertExists(db.get(), 0, keys_iteration);
+      db.reset();
     }
 
     // Should still work after close and re-open
diff --git a/utilities/blob_db/blob_db_impl.cc b/utilities/blob_db/blob_db_impl.cc
index 22216fc82aae..677fa60dfb95 100644
--- a/utilities/blob_db/blob_db_impl.cc
+++ b/utilities/blob_db/blob_db_impl.cc
@@ -113,9 +113,8 @@ Status BlobDBImpl::CloseImpl() {
   // Close base DB before BlobDBImpl destructs to stop event listener and
   // compaction filter call.
   Status s = db_->Close();
-  // delete db_ anyway even if close failed.
-  delete db_;
-  // Reset pointers to avoid StackableDB delete the pointer again.
+  // Reset ownership to free the underlying DB.
+  shared_db_ptr_.reset();
   db_ = nullptr;
   db_impl_ = nullptr;
   if (!s.ok()) {
@@ -202,7 +201,12 @@ Status BlobDBImpl::Open(std::vector<ColumnFamilyHandle*>* handles) {
 
   // Open base db.
   ColumnFamilyDescriptor cf_descriptor(kDefaultColumnFamilyName, cf_options_);
-  s = DB::Open(db_options_, dbname_, {cf_descriptor}, handles, &db_);
+  std::unique_ptr<DB> db;
+  s = DB::Open(db_options_, dbname_, {cf_descriptor}, handles, &db);
+  if (s.ok()) {
+    shared_db_ptr_ = std::move(db);
+    db_ = shared_db_ptr_.get();
+  }
   if (!s.ok()) {
     return s;
   }
diff --git a/utilities/blob_db/blob_db_test.cc b/utilities/blob_db/blob_db_test.cc
index 7d225047eff0..5d3674f09634 100644
--- a/utilities/blob_db/blob_db_test.cc
+++ b/utilities/blob_db/blob_db_test.cc
@@ -829,16 +829,15 @@ TEST_F(BlobDBTest, MigrateFromPlainRocksDB) {
   // Write to plain rocksdb.
   Options options;
   options.create_if_missing = true;
-  DB* db = nullptr;
+  std::unique_ptr<DB> db;
   ASSERT_OK(DB::Open(options, dbname_, &db));
   for (size_t i = 0; i < kNumIteration; i++) {
     auto key_index = rnd.Next() % kNumKey;
     std::string key = "key" + std::to_string(key_index);
-    PutRandom(db, key, &rnd, &data);
+    PutRandom(db.get(), key, &rnd, &data);
   }
-  VerifyDB(db, data);
-  delete db;
-  db = nullptr;
+  VerifyDB(db.get(), data);
+  db.reset();
 
   // Open as blob db. Verify it can read existing data.
   Open();
@@ -868,7 +867,6 @@ TEST_F(BlobDBTest, MigrateFromPlainRocksDB) {
       ASSERT_EQ(data[key], value);
     }
   }
-  delete db;
 }
 
 // Test to verify that a NoSpace IOError Status is returned on reaching
diff --git a/utilities/cassandra/cassandra_functional_test.cc b/utilities/cassandra/cassandra_functional_test.cc
index 28fba3acb88a..f9d9aa4afbf8 100644
--- a/utilities/cassandra/cassandra_functional_test.cc
+++ b/utilities/cassandra/cassandra_functional_test.cc
@@ -25,7 +25,7 @@ const std::string kDbName = test::PerThreadDBPath("cassandra_functional_test");
 
 class CassandraStore {
  public:
-  explicit CassandraStore(std::shared_ptr<DB> db)
+  explicit CassandraStore(UnownedPtr<DB> db)
       : db_(db), write_option_(), get_option_() {
     assert(db);
   }
@@ -87,7 +87,7 @@ class CassandraStore {
   }
 
  private:
-  std::shared_ptr<DB> db_;
+  UnownedPtr<DB> db_;
   WriteOptions write_option_;
   ReadOptions get_option_;
 
@@ -122,8 +122,7 @@ class CassandraFunctionalTest : public testing::Test {
         DestroyDB(kDbName, Options()));  // Start each test with a fresh DB
   }
 
-  std::shared_ptr<DB> OpenDb() {
-    DB* db;
+  std::unique_ptr<DB> OpenDb() {
     Options options;
     options.create_if_missing = true;
     options.merge_operator.reset(
@@ -131,8 +130,9 @@ class CassandraFunctionalTest : public testing::Test {
     auto* cf_factory = new TestCompactionFilterFactory(
         purge_ttl_on_expiration_, gc_grace_period_in_seconds_);
     options.compaction_filter_factory.reset(cf_factory);
+    std::unique_ptr<DB> db;
     EXPECT_OK(DB::Open(options, kDbName, &db));
-    return std::shared_ptr<DB>(db);
+    return db;
   }
 
   bool purge_ttl_on_expiration_ = false;
@@ -142,7 +142,8 @@ class CassandraFunctionalTest : public testing::Test {
 // THE TEST CASES BEGIN HERE
 
 TEST_F(CassandraFunctionalTest, SimpleMergeTest) {
-  CassandraStore store(OpenDb());
+  auto db = OpenDb();
+  CassandraStore store(db.get());
   int64_t now = time(nullptr);
 
   store.Append(
@@ -190,7 +191,8 @@ constexpr int64_t kTestTimeoutSecs = 600;
 
 TEST_F(CassandraFunctionalTest,
        CompactionShouldConvertExpiredColumnsToTombstone) {
-  CassandraStore store(OpenDb());
+  auto db = OpenDb();
+  CassandraStore store(db.get());
   int64_t now = time(nullptr);
 
   store.Append(
@@ -232,7 +234,8 @@ TEST_F(CassandraFunctionalTest,
 TEST_F(CassandraFunctionalTest,
        CompactionShouldPurgeExpiredColumnsIfPurgeTtlIsOn) {
   purge_ttl_on_expiration_ = true;
-  CassandraStore store(OpenDb());
+  auto db = OpenDb();
+  CassandraStore store(db.get());
   int64_t now = time(nullptr);
 
   store.Append(
@@ -271,7 +274,8 @@ TEST_F(CassandraFunctionalTest,
 TEST_F(CassandraFunctionalTest,
        CompactionShouldRemoveRowWhenAllColumnsExpiredIfPurgeTtlIsOn) {
   purge_ttl_on_expiration_ = true;
-  CassandraStore store(OpenDb());
+  auto db = OpenDb();
+  CassandraStore store(db.get());
   int64_t now = time(nullptr);
 
   store.Append("k1", CreateTestRowValue({
@@ -296,7 +300,8 @@ TEST_F(CassandraFunctionalTest,
 TEST_F(CassandraFunctionalTest,
        CompactionShouldRemoveTombstoneExceedingGCGracePeriod) {
   purge_ttl_on_expiration_ = true;
-  CassandraStore store(OpenDb());
+  auto db = OpenDb();
+  CassandraStore store(db.get());
   int64_t now = time(nullptr);
 
   store.Append("k1",
@@ -327,7 +332,8 @@ TEST_F(CassandraFunctionalTest,
 
 TEST_F(CassandraFunctionalTest, CompactionShouldRemoveTombstoneFromPut) {
   purge_ttl_on_expiration_ = true;
-  CassandraStore store(OpenDb());
+  auto db = OpenDb();
+  CassandraStore store(db.get());
   int64_t now = time(nullptr);
 
   store.Put("k1",
diff --git a/utilities/checkpoint/checkpoint_test.cc b/utilities/checkpoint/checkpoint_test.cc
index f7ca4136e7d9..9e0729b64cd0 100644
--- a/utilities/checkpoint/checkpoint_test.cc
+++ b/utilities/checkpoint/checkpoint_test.cc
@@ -46,7 +46,7 @@ class CheckpointTest : public testing::Test {
   std::string dbname_;
   std::string alternative_wal_dir_;
   Env* env_;
-  DB* db_;
+  std::unique_ptr<DB> db_;
   Options last_options_;
   std::vector<ColumnFamilyHandle*> handles_;
   std::string snapshot_name_;
@@ -65,7 +65,7 @@ class CheckpointTest : public testing::Test {
     EXPECT_OK(DestroyDB(dbname_, delete_options));
     // Destroy it for not alternative WAL dir is used.
     EXPECT_OK(DestroyDB(dbname_, options));
-    db_ = nullptr;
+    db_.reset();
     snapshot_name_ = test::PerThreadDBPath(env_, "snapshot");
     std::string snapshot_tmp_name = snapshot_name_ + ".tmp";
     EXPECT_OK(DestroyDB(snapshot_name_, options));
@@ -102,6 +102,8 @@ class CheckpointTest : public testing::Test {
     DestroyDir(env_, export_path_).PermitUncheckedError();
   }
 
+  DBImpl* dbfull() { return static_cast_with_check<DBImpl>(db_.get()); }
+
   // Return the current option configuration.
   Options CurrentOptions() {
     Options options;
@@ -170,8 +172,7 @@ class CheckpointTest : public testing::Test {
       delete h;
     }
     handles_.clear();
-    delete db_;
-    db_ = nullptr;
+    db_.reset();
   }
 
   void DestroyAndReopen(const Options& options) {
@@ -268,14 +269,12 @@ class CheckpointTest : public testing::Test {
 TEST_F(CheckpointTest, GetSnapshotLink) {
   for (uint64_t log_size_for_flush : {0, 1000000}) {
     Options options;
-    DB* snapshotDB;
     ReadOptions roptions;
     std::string result;
     Checkpoint* checkpoint;
 
     options = CurrentOptions();
-    delete db_;
-    db_ = nullptr;
+    db_.reset();
     ASSERT_OK(DestroyDB(dbname_, options));
 
     // Create a database
@@ -284,7 +283,7 @@ TEST_F(CheckpointTest, GetSnapshotLink) {
     std::string key = std::string("foo");
     ASSERT_OK(Put(key, "v1"));
     // Take a snapshot
-    ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+    ASSERT_OK(Checkpoint::Create(db_.get(), &checkpoint));
     ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_, log_size_for_flush));
     ASSERT_OK(Put(key, "v2"));
     ASSERT_EQ("v2", Get(key));
@@ -292,13 +291,12 @@ TEST_F(CheckpointTest, GetSnapshotLink) {
     ASSERT_EQ("v2", Get(key));
     // Open snapshot and verify contents while DB is running
     options.create_if_missing = false;
+    std::unique_ptr<DB> snapshotDB;
     ASSERT_OK(DB::Open(options, snapshot_name_, &snapshotDB));
     ASSERT_OK(snapshotDB->Get(roptions, key, &result));
     ASSERT_EQ("v1", result);
-    delete snapshotDB;
-    snapshotDB = nullptr;
-    delete db_;
-    db_ = nullptr;
+    snapshotDB.reset();
+    db_.reset();
 
     // Destroy original DB
     ASSERT_OK(DestroyDB(dbname_, options));
@@ -308,8 +306,7 @@ TEST_F(CheckpointTest, GetSnapshotLink) {
     dbname_ = snapshot_name_;
     ASSERT_OK(DB::Open(options, dbname_, &db_));
     ASSERT_EQ("v1", Get(key));
-    delete db_;
-    db_ = nullptr;
+    db_.reset();
     ASSERT_OK(DestroyDB(dbname_, options));
     delete checkpoint;
 
@@ -335,7 +332,7 @@ TEST_F(CheckpointTest, CheckpointWithBlob) {
 
   // Create a checkpoint
   Checkpoint* checkpoint = nullptr;
-  ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+  ASSERT_OK(Checkpoint::Create(db_.get(), &checkpoint));
 
   std::unique_ptr<Checkpoint> checkpoint_guard(checkpoint);
 
@@ -360,11 +357,9 @@ TEST_F(CheckpointTest, CheckpointWithBlob) {
 
   // Make sure the checkpoint can be opened and the blob value read
   options.create_if_missing = false;
-  DB* checkpoint_db = nullptr;
+  std::unique_ptr<DB> checkpoint_db;
   ASSERT_OK(DB::Open(options, snapshot_name_, &checkpoint_db));
 
-  std::unique_ptr<DB> checkpoint_db_guard(checkpoint_db);
-
   PinnableSlice value;
   ASSERT_OK(checkpoint_db->Get(
       ReadOptions(), checkpoint_db->DefaultColumnFamily(), key, &value));
@@ -393,7 +388,7 @@ TEST_F(CheckpointTest, ExportColumnFamilyWithLinks) {
     ASSERT_OK(Put(key, "v1"));
 
     Checkpoint* checkpoint;
-    ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+    ASSERT_OK(Checkpoint::Create(db_.get(), &checkpoint));
 
     // Export the Tables and verify
     ASSERT_OK(checkpoint->ExportColumnFamily(db_->DefaultColumnFamily(),
@@ -427,7 +422,7 @@ TEST_F(CheckpointTest, ExportColumnFamilyWithLinks) {
     ASSERT_OK(db_->Put(WriteOptions(), cfh_reverse_comp_, key, "v1"));
 
     Checkpoint* checkpoint;
-    ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+    ASSERT_OK(Checkpoint::Create(db_.get(), &checkpoint));
 
     // Export the Tables and verify
     ASSERT_OK(checkpoint->ExportColumnFamily(cfh_reverse_comp_, export_path_,
@@ -449,7 +444,7 @@ TEST_F(CheckpointTest, ExportColumnFamilyNegativeTest) {
   ASSERT_OK(Put(key, "v1"));
 
   Checkpoint* checkpoint;
-  ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+  ASSERT_OK(Checkpoint::Create(db_.get(), &checkpoint));
 
   // Export onto existing directory
   ASSERT_OK(env_->CreateDirIfMissing(export_path_));
@@ -482,7 +477,6 @@ TEST_F(CheckpointTest, CheckpointCF) {
   ASSERT_OK(Put(4, "four", "four"));
   ASSERT_OK(Put(5, "five", "five"));
 
-  DB* snapshotDB;
   ReadOptions roptions;
   std::string result;
   std::vector<ColumnFamilyHandle*> cphandles;
@@ -490,7 +484,7 @@ TEST_F(CheckpointTest, CheckpointCF) {
   // Take a snapshot
   ROCKSDB_NAMESPACE::port::Thread t([&]() {
     Checkpoint* checkpoint;
-    ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+    ASSERT_OK(Checkpoint::Create(db_.get(), &checkpoint));
     ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_));
     delete checkpoint;
   });
@@ -519,6 +513,7 @@ TEST_F(CheckpointTest, CheckpointCF) {
   for (size_t i = 0; i < cfs.size(); ++i) {
     column_families.emplace_back(cfs[i], options);
   }
+  std::unique_ptr<DB> snapshotDB;
   ASSERT_OK(DB::Open(options, snapshot_name_, column_families, &cphandles,
                      &snapshotDB));
   ASSERT_OK(snapshotDB->Get(roptions, cphandles[0], "Default", &result));
@@ -530,8 +525,7 @@ TEST_F(CheckpointTest, CheckpointCF) {
     delete h;
   }
   cphandles.clear();
-  delete snapshotDB;
-  snapshotDB = nullptr;
+  snapshotDB.reset();
 }
 
 TEST_F(CheckpointTest, CheckpointCFNoFlush) {
@@ -545,7 +539,6 @@ TEST_F(CheckpointTest, CheckpointCFNoFlush) {
   ASSERT_OK(Flush());
   ASSERT_OK(Put(2, "two", "two"));
 
-  DB* snapshotDB;
   ReadOptions roptions;
   std::string result;
   std::vector<ColumnFamilyHandle*> cphandles;
@@ -558,7 +551,7 @@ TEST_F(CheckpointTest, CheckpointCFNoFlush) {
       });
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
   Checkpoint* checkpoint;
-  ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+  ASSERT_OK(Checkpoint::Create(db_.get(), &checkpoint));
   ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_, 1000000));
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
 
@@ -577,6 +570,7 @@ TEST_F(CheckpointTest, CheckpointCFNoFlush) {
   for (size_t i = 0; i < cfs.size(); ++i) {
     column_families.emplace_back(cfs[i], options);
   }
+  std::unique_ptr<DB> snapshotDB;
   ASSERT_OK(DB::Open(options, snapshot_name_, column_families, &cphandles,
                      &snapshotDB));
   ASSERT_OK(snapshotDB->Get(roptions, cphandles[0], "Default", &result));
@@ -589,8 +583,7 @@ TEST_F(CheckpointTest, CheckpointCFNoFlush) {
     delete h;
   }
   cphandles.clear();
-  delete snapshotDB;
-  snapshotDB = nullptr;
+  snapshotDB.reset();
 }
 
 TEST_F(CheckpointTest, CurrentFileModifiedWhileCheckpointing) {
@@ -615,7 +608,7 @@ TEST_F(CheckpointTest, CurrentFileModifiedWhileCheckpointing) {
 
   ROCKSDB_NAMESPACE::port::Thread t([&]() {
     Checkpoint* checkpoint;
-    ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+    ASSERT_OK(Checkpoint::Create(db_.get(), &checkpoint));
     ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_));
     delete checkpoint;
   });
@@ -627,12 +620,10 @@ TEST_F(CheckpointTest, CurrentFileModifiedWhileCheckpointing) {
 
   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
 
-  DB* snapshotDB;
   // Successful Open() implies that CURRENT pointed to the manifest in the
   // checkpoint.
+  std::unique_ptr<DB> snapshotDB;
   ASSERT_OK(DB::Open(options, snapshot_name_, &snapshotDB));
-  delete snapshotDB;
-  snapshotDB = nullptr;
 }
 
 TEST_F(CheckpointTest, CurrentFileModifiedWhileCheckpointing2PC) {
@@ -752,7 +743,7 @@ TEST_F(CheckpointTest, CurrentFileModifiedWhileCheckpointing2PC) {
 TEST_F(CheckpointTest, CheckpointInvalidDirectoryName) {
   for (std::string checkpoint_dir : {"", "/", "////"}) {
     Checkpoint* checkpoint;
-    ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+    ASSERT_OK(Checkpoint::Create(db_.get(), &checkpoint));
     ASSERT_TRUE(
         checkpoint->CreateCheckpoint(checkpoint_dir).IsInvalidArgument());
     delete checkpoint;
@@ -765,7 +756,7 @@ TEST_F(CheckpointTest, CheckpointWithParallelWrites) {
   ASSERT_OK(Put("key1", "val1"));
   port::Thread thread([this]() { ASSERT_OK(Put("key2", "val2")); });
   Checkpoint* checkpoint;
-  ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+  ASSERT_OK(Checkpoint::Create(db_.get(), &checkpoint));
   ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_));
   delete checkpoint;
   thread.join();
@@ -816,24 +807,24 @@ TEST_P(CheckpointTestWithWalParams, CheckpointWithUnsyncedDataDropped) {
     // * one active WAL, not synced
     // with a single thread, so that we have at least one that can be hard
     // linked, etc.
-    ASSERT_OK(static_cast_with_check<DBImpl>(db_)->PauseBackgroundWork());
-    ASSERT_OK(static_cast_with_check<DBImpl>(db_)->TEST_SwitchMemtable());
+    ASSERT_OK(dbfull()->PauseBackgroundWork());
+    ASSERT_OK(dbfull()->TEST_SwitchMemtable());
     ASSERT_OK(db_->SyncWAL());
   }
   ASSERT_OK(Put("key2", "val2"));
   if (GetLogSizeForFlush() > 0) {
-    ASSERT_OK(static_cast_with_check<DBImpl>(db_)->TEST_SwitchMemtable());
+    ASSERT_OK(dbfull()->TEST_SwitchMemtable());
   }
   ASSERT_OK(Put("key3", "val3"));
   Checkpoint* checkpoint;
-  ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+  ASSERT_OK(Checkpoint::Create(db_.get(), &checkpoint));
   ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_, GetLogSizeForFlush()));
   delete checkpoint;
   ASSERT_OK(fault_fs->DropUnsyncedFileData());
   // make sure it's openable even though whatever data that wasn't synced got
   // dropped.
   options.env = env_;
-  DB* snapshot_db;
+  std::unique_ptr<DB> snapshot_db;
   ASSERT_OK(DB::Open(options, snapshot_name_, &snapshot_db));
   ReadOptions read_opts;
   std::string get_result;
@@ -843,9 +834,8 @@ TEST_P(CheckpointTestWithWalParams, CheckpointWithUnsyncedDataDropped) {
   ASSERT_EQ("val2", get_result);
   ASSERT_OK(snapshot_db->Get(read_opts, "key3", &get_result));
   ASSERT_EQ("val3", get_result);
-  delete snapshot_db;
-  delete db_;
-  db_ = nullptr;
+  snapshot_db.reset();
+  db_.reset();
 }
 
 TEST_F(CheckpointTest, CheckpointReadOnlyDB) {
@@ -855,18 +845,17 @@ TEST_F(CheckpointTest, CheckpointReadOnlyDB) {
   Options options = CurrentOptions();
   ASSERT_OK(ReadOnlyReopen(options));
   Checkpoint* checkpoint = nullptr;
-  ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+  ASSERT_OK(Checkpoint::Create(db_.get(), &checkpoint));
   ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_));
   delete checkpoint;
   checkpoint = nullptr;
   Close();
-  DB* snapshot_db = nullptr;
+  std::unique_ptr<DB> snapshot_db;
   ASSERT_OK(DB::Open(options, snapshot_name_, &snapshot_db));
   ReadOptions read_opts;
   std::string get_result;
   ASSERT_OK(snapshot_db->Get(read_opts, "foo", &get_result));
   ASSERT_EQ("foo_value", get_result);
-  delete snapshot_db;
 }
 
 TEST_F(CheckpointTest, CheckpointWithLockWAL) {
@@ -876,7 +865,7 @@ TEST_F(CheckpointTest, CheckpointWithLockWAL) {
   ASSERT_OK(db_->LockWAL());
 
   Checkpoint* checkpoint = nullptr;
-  ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+  ASSERT_OK(Checkpoint::Create(db_.get(), &checkpoint));
   ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_));
   delete checkpoint;
   checkpoint = nullptr;
@@ -884,13 +873,12 @@ TEST_F(CheckpointTest, CheckpointWithLockWAL) {
   ASSERT_OK(db_->UnlockWAL());
   Close();
 
-  DB* snapshot_db = nullptr;
+  std::unique_ptr<DB> snapshot_db;
   ASSERT_OK(DB::Open(options, snapshot_name_, &snapshot_db));
   ReadOptions read_opts;
   std::string get_result;
   ASSERT_OK(snapshot_db->Get(read_opts, "foo", &get_result));
   ASSERT_EQ("foo_value", get_result);
-  delete snapshot_db;
 }
 
 TEST_F(CheckpointTest, CheckpointReadOnlyDBWithMultipleColumnFamilies) {
@@ -905,7 +893,7 @@ TEST_F(CheckpointTest, CheckpointReadOnlyDBWithMultipleColumnFamilies) {
       {kDefaultColumnFamilyName, "pikachu", "eevee"}, options);
   ASSERT_OK(s);
   Checkpoint* checkpoint = nullptr;
-  ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+  ASSERT_OK(Checkpoint::Create(db_.get(), &checkpoint));
   ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_));
   delete checkpoint;
   checkpoint = nullptr;
@@ -915,7 +903,7 @@ TEST_F(CheckpointTest, CheckpointReadOnlyDBWithMultipleColumnFamilies) {
       {kDefaultColumnFamilyName, options},
       {"pikachu", options},
       {"eevee", options}};
-  DB* snapshot_db = nullptr;
+  std::unique_ptr<DB> snapshot_db;
   std::vector<ColumnFamilyHandle*> snapshot_handles;
   s = DB::Open(options, snapshot_name_, column_families, &snapshot_handles,
                &snapshot_db);
@@ -932,7 +920,6 @@ TEST_F(CheckpointTest, CheckpointReadOnlyDBWithMultipleColumnFamilies) {
     delete snapshot_h;
   }
   snapshot_handles.clear();
-  delete snapshot_db;
 }
 
 TEST_F(CheckpointTest, CheckpointWithDbPath) {
@@ -942,7 +929,7 @@ TEST_F(CheckpointTest, CheckpointWithDbPath) {
   ASSERT_OK(Put("key1", "val1"));
   ASSERT_OK(Flush());
   Checkpoint* checkpoint;
-  ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+  ASSERT_OK(Checkpoint::Create(db_.get(), &checkpoint));
   // Currently not supported
   ASSERT_TRUE(checkpoint->CreateCheckpoint(snapshot_name_).IsNotSupported());
   delete checkpoint;
@@ -964,7 +951,7 @@ TEST_F(CheckpointTest, CheckpointWithArchievedLog) {
   ASSERT_OK(Put("key2", std::string(1024, 'a')));
 
   Checkpoint* checkpoint;
-  ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+  ASSERT_OK(Checkpoint::Create(db_.get(), &checkpoint));
   TEST_SYNC_POINT("CheckpointTest:CheckpointWithArchievedLog");
   ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_, 1024 * 1024));
   // unflushed log size < 1024 * 1024 < total file size including archived log,
@@ -973,7 +960,7 @@ TEST_F(CheckpointTest, CheckpointWithArchievedLog) {
   delete checkpoint;
   checkpoint = nullptr;
 
-  DB* snapshot_db;
+  std::unique_ptr<DB> snapshot_db;
   ASSERT_OK(DB::Open(options, snapshot_name_, &snapshot_db));
   ReadOptions read_opts;
   std::string get_result;
@@ -982,7 +969,6 @@ TEST_F(CheckpointTest, CheckpointWithArchievedLog) {
   get_result.clear();
   ASSERT_OK(snapshot_db->Get(read_opts, "key2", &get_result));
   ASSERT_EQ(std::string(1024, 'a'), get_result);
-  delete snapshot_db;
 }
 
 class CheckpointDestroyTest : public CheckpointTest,
@@ -1013,7 +999,7 @@ TEST_P(CheckpointDestroyTest, DisableEnableSlowDeletion) {
   ASSERT_EQ(NumTableFilesAtLevel(1), 2);
 
   Checkpoint* checkpoint;
-  ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+  ASSERT_OK(Checkpoint::Create(db_.get(), &checkpoint));
   ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_));
 
   delete checkpoint;
@@ -1023,7 +1009,7 @@ TEST_P(CheckpointDestroyTest, DisableEnableSlowDeletion) {
   ASSERT_EQ(NumTableFilesAtLevel(0), 0);
   ASSERT_EQ(NumTableFilesAtLevel(1), 2);
 
-  DB* snapshot_db;
+  std::unique_ptr<DB> snapshot_db;
   ASSERT_OK(DB::Open(options, snapshot_name_, &snapshot_db));
   ReadOptions read_opts;
   std::string get_result;
@@ -1031,11 +1017,10 @@ TEST_P(CheckpointDestroyTest, DisableEnableSlowDeletion) {
   ASSERT_EQ("a", get_result);
   ASSERT_OK(snapshot_db->Get(read_opts, "bar", &get_result));
   ASSERT_EQ("val9", get_result);
-  delete snapshot_db;
+  snapshot_db.reset();
 
   // Make sure original obsolete files for hard linked files are all deleted.
-  DBImpl* db_impl = static_cast_with_check<DBImpl>(db_);
-  db_impl->TEST_DeleteObsoleteFiles();
+  dbfull()->TEST_DeleteObsoleteFiles();
   auto sfm = static_cast_with_check<SstFileManagerImpl>(
       options.sst_file_manager.get());
   ASSERT_NE(nullptr, sfm);
diff --git a/utilities/debug.cc b/utilities/debug.cc
index 59e6d46880f5..4c35c0ed52c3 100644
--- a/utilities/debug.cc
+++ b/utilities/debug.cc
@@ -7,6 +7,7 @@
 
 #include "db/db_impl/db_impl.h"
 #include "rocksdb/utilities/options_type.h"
+#include "util/cast_util.h"
 
 namespace ROCKSDB_NAMESPACE {
 
@@ -77,7 +78,7 @@ Status GetAllKeyVersions(DB* db, ColumnFamilyHandle* cfh, OptSlice begin_key,
   }
   key_versions->clear();
 
-  DBImpl* idb = static_cast<DBImpl*>(db->GetRootDB());
+  DBImpl* idb = static_cast_with_check<DBImpl>(db->GetRootDB());
   auto icmp = InternalKeyComparator(idb->GetOptions(cfh).comparator);
   ReadOptions read_options;
   Arena arena;
diff --git a/utilities/memory/memory_test.cc b/utilities/memory/memory_test.cc
index c9e782212984..a177e40c360a 100644
--- a/utilities/memory/memory_test.cc
+++ b/utilities/memory/memory_test.cc
@@ -24,7 +24,9 @@ class MemoryTest : public testing::Test {
 
   std::string GetDBName(int id) { return kDbDir + "db_" + std::to_string(id); }
 
-  void UpdateUsagesHistory(const std::vector<DB*>& dbs) {
+  using DBVec = std::vector<std::unique_ptr<DB>>;
+
+  void UpdateUsagesHistory(const DBVec& dbs) {
     std::map<MemoryUtil::UsageType, uint64_t> usage_by_type;
     ASSERT_OK(GetApproximateMemoryUsageByType(dbs, &usage_by_type));
     for (int i = 0; i < MemoryUtil::kNumUsageTypes; ++i) {
@@ -33,16 +35,17 @@ class MemoryTest : public testing::Test {
     }
   }
 
-  void GetCachePointers(const std::vector<DB*>& dbs,
+  void GetCachePointers(const DBVec& dbs,
                         std::unordered_set<const Cache*>* cache_set) {
     cache_set->clear();
 
-    for (auto* db : dbs) {
+    for (auto& db : dbs) {
       assert(db);
 
       // Cache from DBImpl
-      StackableDB* sdb = dynamic_cast<StackableDB*>(db);
-      DBImpl* db_impl = dynamic_cast<DBImpl*>(sdb ? sdb->GetBaseDB() : db);
+      StackableDB* sdb = dynamic_cast<StackableDB*>(db.get());
+      DBImpl* db_impl =
+          dynamic_cast<DBImpl*>(sdb ? sdb->GetBaseDB() : db.get());
       if (db_impl != nullptr) {
         cache_set->insert(db_impl->TEST_table_cache());
       }
@@ -58,7 +61,7 @@ class MemoryTest : public testing::Test {
   }
 
   Status GetApproximateMemoryUsageByType(
-      const std::vector<DB*>& dbs,
+      const DBVec& dbs,
       std::map<MemoryUtil::UsageType, uint64_t>* usage_by_type) {
     std::unordered_set<const Cache*> cache_set;
     GetCachePointers(dbs, &cache_set);
@@ -73,7 +76,7 @@ class MemoryTest : public testing::Test {
 };
 
 TEST_F(MemoryTest, SharedBlockCacheTotal) {
-  std::vector<DB*> dbs;
+  std::vector<std::unique_ptr<DB>> dbs;
   std::vector<uint64_t> usage_by_type;
   const int kNumDBs = 10;
   const int kKeySize = 100;
@@ -88,9 +91,7 @@ TEST_F(MemoryTest, SharedBlockCacheTotal) {
   bbt_opts.block_cache = NewLRUCache(4096 * 1000 * 10);
   for (int i = 0; i < kNumDBs; ++i) {
     ASSERT_OK(DestroyDB(GetDBName(i), opt));
-    DB* db = nullptr;
-    ASSERT_OK(DB::Open(opt, GetDBName(i), &db));
-    dbs.push_back(db);
+    ASSERT_OK(DB::Open(opt, GetDBName(i), &dbs.emplace_back()));
   }
 
   std::vector<std::string> keys_by_db[kNumDBs];
@@ -119,13 +120,10 @@ TEST_F(MemoryTest, SharedBlockCacheTotal) {
     ASSERT_EQ(usage_history_[MemoryUtil::kTableReadersTotal][i],
               usage_history_[MemoryUtil::kTableReadersTotal][i - 1]);
   }
-  for (int i = 0; i < kNumDBs; ++i) {
-    delete dbs[i];
-  }
 }
 
 TEST_F(MemoryTest, MemTableAndTableReadersTotal) {
-  std::vector<DB*> dbs;
+  std::vector<std::unique_ptr<DB>> dbs;
   std::vector<uint64_t> usage_by_type;
   std::vector<std::vector<ColumnFamilyHandle*>> vec_handles;
   const int kNumDBs = 10;
@@ -150,10 +148,9 @@ TEST_F(MemoryTest, MemTableAndTableReadersTotal) {
   for (int i = 0; i < kNumDBs; ++i) {
     ASSERT_OK(DestroyDB(GetDBName(i), opt));
     std::vector<ColumnFamilyHandle*> handles;
-    dbs.emplace_back();
     vec_handles.emplace_back();
     ASSERT_OK(DB::Open(DBOptions(opt), GetDBName(i), cf_descs,
-                       &vec_handles.back(), &dbs.back()));
+                       &vec_handles.back(), &dbs.emplace_back()));
   }
 
   // Fill one memtable per Put to make memtable use more memory.
@@ -237,7 +234,6 @@ TEST_F(MemoryTest, MemTableAndTableReadersTotal) {
     for (auto* handle : vec_handles[i]) {
       delete handle;
     }
-    delete dbs[i];
   }
 }
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/utilities/memory/memory_util.cc b/utilities/memory/memory_util.cc
index c7bf30bfb716..c252f46c4eb7 100644
--- a/utilities/memory/memory_util.cc
+++ b/utilities/memory/memory_util.cc
@@ -9,14 +9,15 @@
 
 namespace ROCKSDB_NAMESPACE {
 
+template <typename DBPtr>
 Status MemoryUtil::GetApproximateMemoryUsageByType(
-    const std::vector<DB*>& dbs,
+    const std::vector<DBPtr>& dbs,
     const std::unordered_set<const Cache*> cache_set,
     std::map<MemoryUtil::UsageType, uint64_t>* usage_by_type) {
   usage_by_type->clear();
 
   // MemTable
-  for (auto* db : dbs) {
+  for (auto& db : dbs) {
     uint64_t usage = 0;
     if (db->GetAggregatedIntProperty(DB::Properties::kSizeAllMemTables,
                                      &usage)) {
@@ -29,7 +30,7 @@ Status MemoryUtil::GetApproximateMemoryUsageByType(
   }
 
   // Table Readers
-  for (auto* db : dbs) {
+  for (auto& db : dbs) {
     uint64_t usage = 0;
     if (db->GetAggregatedIntProperty(DB::Properties::kEstimateTableReadersMem,
                                      &usage)) {
@@ -46,4 +47,16 @@ Status MemoryUtil::GetApproximateMemoryUsageByType(
 
   return Status::OK();
 }
+
+template Status MemoryUtil::GetApproximateMemoryUsageByType<DB*>(
+    const std::vector<DB*>& dbs,
+    const std::unordered_set<const Cache*> cache_set,
+    std::map<MemoryUtil::UsageType, uint64_t>* usage_by_type);
+
+template Status
+MemoryUtil::GetApproximateMemoryUsageByType<std::unique_ptr<DB>>(
+    const std::vector<std::unique_ptr<DB>>& dbs,
+    const std::unordered_set<const Cache*> cache_set,
+    std::map<MemoryUtil::UsageType, uint64_t>* usage_by_type);
+
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/utilities/merge_operators/string_append/stringappend_test.cc b/utilities/merge_operators/string_append/stringappend_test.cc
index acc71c8e49c1..2b52b8d901f1 100644
--- a/utilities/merge_operators/string_append/stringappend_test.cc
+++ b/utilities/merge_operators/string_append/stringappend_test.cc
@@ -23,6 +23,7 @@
 #include "rocksdb/merge_operator.h"
 #include "rocksdb/utilities/db_ttl.h"
 #include "test_util/testharness.h"
+#include "util/cast_util.h"
 #include "util/random.h"
 #include "utilities/merge_operators.h"
 #include "utilities/merge_operators/string_append/stringappend2.h"
@@ -34,8 +35,7 @@ const std::string kDbName = test::PerThreadDBPath("stringappend_test");
 
 namespace {
 // OpenDb opens a (possibly new) rocksdb database with a StringAppendOperator
-std::shared_ptr<DB> OpenNormalDb(const std::string& delim) {
-  DB* db;
+std::unique_ptr<DB> OpenNormalDb(const std::string& delim) {
   Options options;
   options.create_if_missing = true;
   MergeOperator* mergeOperator;
@@ -45,12 +45,13 @@ std::shared_ptr<DB> OpenNormalDb(const std::string& delim) {
     mergeOperator = new StringAppendOperator(delim);
   }
   options.merge_operator.reset(mergeOperator);
+  std::unique_ptr<DB> db;
   EXPECT_OK(DB::Open(options, kDbName, &db));
-  return std::shared_ptr<DB>(db);
+  return db;
 }
 
 // Open a TtlDB with a non-associative StringAppendTESTOperator
-std::shared_ptr<DB> OpenTtlDb(const std::string& delim) {
+std::unique_ptr<DB> OpenTtlDb(const std::string& delim) {
   DBWithTTL* db;
   Options options;
   options.create_if_missing = true;
@@ -62,7 +63,7 @@ std::shared_ptr<DB> OpenTtlDb(const std::string& delim) {
   }
   options.merge_operator.reset(mergeOperator);
   EXPECT_OK(DBWithTTL::Open(options, kDbName, &db, 123456));
-  return std::shared_ptr<DB>(db);
+  return std::unique_ptr<DB>(db);
 }
 }  // namespace
 
@@ -72,8 +73,7 @@ class StringLists {
  public:
   // Constructor: specifies the rocksdb db
   /* implicit */
-  StringLists(std::shared_ptr<DB> db)
-      : db_(db), merge_option_(), get_option_() {
+  StringLists(UnownedPtr<DB> db) : db_(db), merge_option_(), get_option_() {
     assert(db);
   }
 
@@ -113,7 +113,7 @@ class StringLists {
   }
 
  private:
-  std::shared_ptr<DB> db_;
+  UnownedPtr<DB> db_;
   WriteOptions merge_option_;
   ReadOptions get_option_;
 };
@@ -138,7 +138,7 @@ class StringAppendOperatorTest : public testing::Test,
     StringAppendOperatorTest::SetOpenDbFunction(&OpenNormalDb);
   }
 
-  using OpenFuncPtr = std::shared_ptr<DB> (*)(const std::string&);
+  using OpenFuncPtr = std::unique_ptr<DB> (*)(const std::string&);
 
   // Allows user to open databases with different configurations.
   // e.g.: Can open a DB or a TtlDB, etc.
@@ -154,7 +154,7 @@ StringAppendOperatorTest::OpenFuncPtr StringAppendOperatorTest::OpenDb =
 
 TEST_P(StringAppendOperatorTest, IteratorTest) {
   auto db_ = OpenDb(",");
-  StringLists slists(db_);
+  StringLists slists(db_.get());
 
   slists.Append("k1", "v1");
   slists.Append("k1", "v2");
@@ -249,7 +249,7 @@ TEST_P(StringAppendOperatorTest, IteratorTest) {
 
 TEST_P(StringAppendOperatorTest, SimpleTest) {
   auto db = OpenDb(",");
-  StringLists slists(db);
+  StringLists slists(db.get());
 
   slists.Append("k1", "v1");
   slists.Append("k1", "v2");
@@ -262,7 +262,7 @@ TEST_P(StringAppendOperatorTest, SimpleTest) {
 
 TEST_P(StringAppendOperatorTest, SimpleDelimiterTest) {
   auto db = OpenDb("|");
-  StringLists slists(db);
+  StringLists slists(db.get());
 
   slists.Append("k1", "v1");
   slists.Append("k1", "v2");
@@ -275,7 +275,7 @@ TEST_P(StringAppendOperatorTest, SimpleDelimiterTest) {
 
 TEST_P(StringAppendOperatorTest, EmptyDelimiterTest) {
   auto db = OpenDb("");
-  StringLists slists(db);
+  StringLists slists(db.get());
 
   slists.Append("k1", "v1");
   slists.Append("k1", "v2");
@@ -288,7 +288,7 @@ TEST_P(StringAppendOperatorTest, EmptyDelimiterTest) {
 
 TEST_P(StringAppendOperatorTest, MultiCharDelimiterTest) {
   auto db = OpenDb("<>");
-  StringLists slists(db);
+  StringLists slists(db.get());
 
   slists.Append("k1", "v1");
   slists.Append("k1", "v2");
@@ -302,7 +302,7 @@ TEST_P(StringAppendOperatorTest, MultiCharDelimiterTest) {
 TEST_P(StringAppendOperatorTest, DelimiterIsDefensivelyCopiedTest) {
   std::string delimiter = "<>";
   auto db = OpenDb(delimiter);
-  StringLists slists(db);
+  StringLists slists(db.get());
 
   slists.Append("k1", "v1");
   slists.Append("k1", "v2");
@@ -316,7 +316,7 @@ TEST_P(StringAppendOperatorTest, DelimiterIsDefensivelyCopiedTest) {
 
 TEST_P(StringAppendOperatorTest, OneValueNoDelimiterTest) {
   auto db = OpenDb("!");
-  StringLists slists(db);
+  StringLists slists(db.get());
 
   slists.Append("random_key", "single_val");
 
@@ -327,7 +327,7 @@ TEST_P(StringAppendOperatorTest, OneValueNoDelimiterTest) {
 
 TEST_P(StringAppendOperatorTest, VariousKeys) {
   auto db = OpenDb("\n");
-  StringLists slists(db);
+  StringLists slists(db.get());
 
   slists.Append("c", "asdasd");
   slists.Append("a", "x");
@@ -353,7 +353,7 @@ TEST_P(StringAppendOperatorTest, VariousKeys) {
 // Generate semi random keys/words from a small distribution.
 TEST_P(StringAppendOperatorTest, RandomMixGetAppend) {
   auto db = OpenDb(" ");
-  StringLists slists(db);
+  StringLists slists(db.get());
 
   // Generate a list of random keys and values
   const int kWordCount = 15;
@@ -402,7 +402,7 @@ TEST_P(StringAppendOperatorTest, RandomMixGetAppend) {
 
 TEST_P(StringAppendOperatorTest, BIGRandomMixGetAppend) {
   auto db = OpenDb(" ");
-  StringLists slists(db);
+  StringLists slists(db.get());
 
   // Generate a list of random keys and values
   const int kWordCount = 15;
@@ -453,7 +453,7 @@ TEST_P(StringAppendOperatorTest, PersistentVariousKeys) {
   // Perform the following operations in limited scope
   {
     auto db = OpenDb("\n");
-    StringLists slists(db);
+    StringLists slists(db.get());
 
     slists.Append("c", "asdasd");
     slists.Append("a", "x");
@@ -476,7 +476,7 @@ TEST_P(StringAppendOperatorTest, PersistentVariousKeys) {
   // Reopen the database (the previous changes should persist / be remembered)
   {
     auto db = OpenDb("\n");
-    StringLists slists(db);
+    StringLists slists(db.get());
 
     slists.Append("c", "bbnagnagsx");
     slists.Append("a", "sa");
@@ -502,7 +502,7 @@ TEST_P(StringAppendOperatorTest, PersistentVariousKeys) {
   // Reopen the database (the previous changes should persist / be remembered)
   {
     auto db = OpenDb("\n");
-    StringLists slists(db);
+    StringLists slists(db.get());
 
     // All changes should be on disk. This will test VersionSet Get()
     std::string a, b, c;
@@ -520,7 +520,7 @@ TEST_P(StringAppendOperatorTest, PersistentFlushAndCompaction) {
   // Perform the following operations in limited scope
   {
     auto db = OpenDb("\n");
-    StringLists slists(db);
+    StringLists slists(db.get());
     std::string a, b, c;
 
     // Append, Flush, Get
@@ -559,7 +559,7 @@ TEST_P(StringAppendOperatorTest, PersistentFlushAndCompaction) {
   // Reopen the database (the previous changes should persist / be remembered)
   {
     auto db = OpenDb("\n");
-    StringLists slists(db);
+    StringLists slists(db.get());
     std::string a, b, c;
 
     // Get (Quick check for persistence of previous database)
@@ -607,7 +607,7 @@ TEST_P(StringAppendOperatorTest, PersistentFlushAndCompaction) {
 
 TEST_P(StringAppendOperatorTest, SimpleTestNullDelimiter) {
   auto db = OpenDb(std::string(1, '\0'));
-  StringLists slists(db);
+  StringLists slists(db.get());
 
   slists.Append("k1", "v1");
   slists.Append("k1", "v2");
diff --git a/utilities/option_change_migration/option_change_migration.cc b/utilities/option_change_migration/option_change_migration.cc
index 9a17daca5ced..4de1ff85107e 100644
--- a/utilities/option_change_migration/option_change_migration.cc
+++ b/utilities/option_change_migration/option_change_migration.cc
@@ -221,12 +221,9 @@ Status OpenDBWithCFs(const DBOptions& db_opts, const std::string& dbname,
                      std::unique_ptr<DB>* db,
                      std::vector<ColumnFamilyHandle*>* handles) {
   handles->clear();
-  DB* tmpdb;
-  Status s = DB::Open(db_opts, dbname, cf_descs, handles, &tmpdb);
+  Status s = DB::Open(db_opts, dbname, cf_descs, handles, db);
 
-  if (s.ok()) {
-    db->reset(tmpdb);
-  } else {
+  if (!s.ok()) {
     for (auto* handle : *handles) {
       delete handle;
     }
diff --git a/utilities/options/options_util_test.cc b/utilities/options/options_util_test.cc
index 193142d67c20..59397613b0dc 100644
--- a/utilities/options/options_util_test.cc
+++ b/utilities/options/options_util_test.cc
@@ -243,7 +243,7 @@ TEST_F(OptionsUtilTest, SanityCheck) {
   db_opt.create_if_missing = true;
 
   ASSERT_OK(DestroyDB(dbname_, Options(db_opt, cf_descs[0].options)));
-  DB* db;
+  std::unique_ptr<DB> db;
   std::vector<ColumnFamilyHandle*> handles;
   // open and persist the options
   ASSERT_OK(DB::Open(db_opt, dbname_, cf_descs, &handles, &db));
@@ -252,7 +252,7 @@ TEST_F(OptionsUtilTest, SanityCheck) {
   for (auto* handle : handles) {
     delete handle;
   }
-  delete db;
+  db.reset();
 
   ConfigOptions config_options;
   config_options.ignore_unknown_options = false;
@@ -435,7 +435,7 @@ TEST_F(OptionsUtilTest, LoadLatestOptions) {
   DBOptions db_opts;
   std::vector<ColumnFamilyDescriptor> cf_descs;
   std::vector<ColumnFamilyHandle*> handles;
-  DB* db;
+  std::unique_ptr<DB> db;
   options.create_if_missing = true;
 
   ASSERT_OK(DestroyDB(dbname_, options));
@@ -495,7 +495,7 @@ TEST_F(OptionsUtilTest, LoadLatestOptions) {
   for (auto* handle : handles) {
     delete handle;
   }
-  delete db;
+  db.reset();
   ASSERT_OK(DestroyDB(dbname_, options, cf_descs));
 }
 
@@ -639,7 +639,7 @@ TEST_F(OptionsUtilTest, BadLatestOptions) {
 }
 
 TEST_F(OptionsUtilTest, RenameDatabaseDirectory) {
-  DB* db;
+  std::unique_ptr<DB> db;
   Options options;
   DBOptions db_opts;
   std::vector<ColumnFamilyDescriptor> cf_descs;
@@ -652,7 +652,7 @@ TEST_F(OptionsUtilTest, RenameDatabaseDirectory) {
 
   ASSERT_OK(DB::Open(options, dbname_, &db));
   ASSERT_OK(db->Put(WriteOptions(), "foo", "value0"));
-  delete db;
+  db.reset();
 
   auto new_dbname = dbname_ + "_2";
 
@@ -669,14 +669,14 @@ TEST_F(OptionsUtilTest, RenameDatabaseDirectory) {
   for (auto* handle : handles) {
     delete handle;
   }
-  delete db;
+  db.reset();
   Options new_options(db_opts, cf_descs[0].options);
   ASSERT_OK(DestroyDB(new_dbname, new_options, cf_descs));
   ASSERT_OK(DestroyDB(dbname_, options));
 }
 
 TEST_F(OptionsUtilTest, WalDirSettings) {
-  DB* db;
+  std::unique_ptr<DB> db;
   Options options;
   DBOptions db_opts;
   std::vector<ColumnFamilyDescriptor> cf_descs;
@@ -689,14 +689,14 @@ TEST_F(OptionsUtilTest, WalDirSettings) {
 
   // Open a DB with no wal dir set.  The wal_dir should stay empty
   ASSERT_OK(DB::Open(options, dbname_, &db));
-  delete db;
+  db.reset();
   ASSERT_OK(LoadLatestOptions(ignore_opts, dbname_, &db_opts, &cf_descs));
   ASSERT_EQ(db_opts.wal_dir, "");
 
   // Open a DB with wal_dir == dbname.  The wal_dir should be set to empty
   options.wal_dir = dbname_;
   ASSERT_OK(DB::Open(options, dbname_, &db));
-  delete db;
+  db.reset();
   ASSERT_OK(LoadLatestOptions(ignore_opts, dbname_, &db_opts, &cf_descs));
   ASSERT_EQ(db_opts.wal_dir, "");
 
@@ -705,7 +705,7 @@ TEST_F(OptionsUtilTest, WalDirSettings) {
   options.wal_dir = "";
   options.db_paths.emplace_back(dbname_, std::numeric_limits<uint64_t>::max());
   ASSERT_OK(DB::Open(options, dbname_, &db));
-  delete db;
+  db.reset();
   ASSERT_OK(LoadLatestOptions(ignore_opts, dbname_, &db_opts, &cf_descs));
   ASSERT_EQ(db_opts.wal_dir, "");
 
@@ -714,7 +714,7 @@ TEST_F(OptionsUtilTest, WalDirSettings) {
   options.wal_dir = dbname_ + "/";
   options.db_paths.emplace_back(dbname_, std::numeric_limits<uint64_t>::max());
   ASSERT_OK(DB::Open(options, dbname_, &db));
-  delete db;
+  db.reset();
   ASSERT_OK(LoadLatestOptions(ignore_opts, dbname_, &db_opts, &cf_descs));
   ASSERT_EQ(db_opts.wal_dir, "");
   ASSERT_OK(DestroyDB(dbname_, options));
@@ -725,7 +725,7 @@ TEST_F(OptionsUtilTest, WalDirSettings) {
   options.db_paths.emplace_back(dbname_ + "_0",
                                 std::numeric_limits<uint64_t>::max());
   ASSERT_OK(DB::Open(options, dbname_, &db));
-  delete db;
+  db.reset();
   ASSERT_OK(LoadLatestOptions(ignore_opts, dbname_, &db_opts, &cf_descs));
   ASSERT_EQ(db_opts.wal_dir, dbname_);
   ASSERT_OK(DestroyDB(dbname_, options));
@@ -734,14 +734,14 @@ TEST_F(OptionsUtilTest, WalDirSettings) {
   options.wal_dir = dbname_ + "/wal";
   options.db_paths.clear();
   ASSERT_OK(DB::Open(options, dbname_, &db));
-  delete db;
+  db.reset();
   ASSERT_OK(LoadLatestOptions(ignore_opts, dbname_, &db_opts, &cf_descs));
   ASSERT_EQ(db_opts.wal_dir, dbname_ + "/wal");
   ASSERT_OK(DestroyDB(dbname_, options));
 }
 
 TEST_F(OptionsUtilTest, WalDirInOptins) {
-  DB* db;
+  std::unique_ptr<DB> db;
   Options options;
   DBOptions db_opts;
   std::vector<ColumnFamilyDescriptor> cf_descs;
@@ -755,7 +755,7 @@ TEST_F(OptionsUtilTest, WalDirInOptins) {
   options.create_if_missing = true;
   options.wal_dir = "";
   ASSERT_OK(DB::Open(options, dbname_, &db));
-  delete db;
+  db.reset();
   options.wal_dir = dbname_;
   std::string options_file;
   ASSERT_OK(GetLatestOptionsFileName(dbname_, options.env, &options_file));
@@ -766,7 +766,7 @@ TEST_F(OptionsUtilTest, WalDirInOptins) {
   ASSERT_EQ(db_opts.wal_dir, dbname_);
   options.wal_dir = "";
   ASSERT_OK(DB::Open(options, dbname_, &db));
-  delete db;
+  db.reset();
   ASSERT_OK(LoadLatestOptions(ignore_opts, dbname_, &db_opts, &cf_descs));
   ASSERT_EQ(db_opts.wal_dir, "");
 }
diff --git a/utilities/transactions/optimistic_transaction_db_impl.cc b/utilities/transactions/optimistic_transaction_db_impl.cc
index dc854342bc57..42ddddc82774 100644
--- a/utilities/transactions/optimistic_transaction_db_impl.cc
+++ b/utilities/transactions/optimistic_transaction_db_impl.cc
@@ -73,7 +73,7 @@ Status OptimisticTransactionDB::Open(
     std::vector<ColumnFamilyHandle*>* handles,
     OptimisticTransactionDB** dbptr) {
   Status s;
-  DB* db;
+  std::unique_ptr<DB> db;
 
   std::vector<ColumnFamilyDescriptor> column_families_copy = column_families;
 
@@ -91,7 +91,7 @@ Status OptimisticTransactionDB::Open(
   s = DB::Open(db_options, dbname, column_families_copy, handles, &db);
 
   if (s.ok()) {
-    *dbptr = new OptimisticTransactionDBImpl(db, occ_options);
+    *dbptr = new OptimisticTransactionDBImpl(std::move(db), occ_options);
   }
 
   return s;
diff --git a/utilities/transactions/optimistic_transaction_db_impl.h b/utilities/transactions/optimistic_transaction_db_impl.h
index 86213832dde1..be6e7b0b6941 100644
--- a/utilities/transactions/optimistic_transaction_db_impl.h
+++ b/utilities/transactions/optimistic_transaction_db_impl.h
@@ -44,10 +44,9 @@ class OccLockBucketsImpl : public OccLockBucketsImplBase {
 class OptimisticTransactionDBImpl : public OptimisticTransactionDB {
  public:
   explicit OptimisticTransactionDBImpl(
-      DB* db, const OptimisticTransactionDBOptions& occ_options,
-      bool take_ownership = true)
-      : OptimisticTransactionDB(db),
-        db_owner_(take_ownership),
+      std::unique_ptr<DB>&& db,
+      const OptimisticTransactionDBOptions& occ_options)
+      : OptimisticTransactionDB(std::move(db)),
         validate_policy_(occ_options.validate_policy) {
     if (validate_policy_ == OccValidationPolicy::kValidateParallel) {
       auto bucketed_locks = occ_options.shared_lock_buckets;
@@ -60,13 +59,7 @@ class OptimisticTransactionDBImpl : public OptimisticTransactionDB {
     }
   }
 
-  ~OptimisticTransactionDBImpl() {
-    // Prevent this stackable from destroying
-    // base db
-    if (!db_owner_) {
-      db_ = nullptr;
-    }
-  }
+  ~OptimisticTransactionDBImpl() override = default;
 
   Transaction* BeginTransaction(const WriteOptions& write_options,
                                 const OptimisticTransactionOptions& txn_options,
@@ -97,8 +90,6 @@ class OptimisticTransactionDBImpl : public OptimisticTransactionDB {
  private:
   std::shared_ptr<OccLockBucketsImplBase> bucketed_locks_;
 
-  bool db_owner_;
-
   const OccValidationPolicy validate_policy_;
 
   void ReinitializeTransaction(Transaction* txn,
diff --git a/utilities/transactions/transaction_test.cc b/utilities/transactions/transaction_test.cc
index e3b22804c2a7..3c7a7747af32 100644
--- a/utilities/transactions/transaction_test.cc
+++ b/utilities/transactions/transaction_test.cc
@@ -9005,7 +9005,7 @@ class CommitBypassMemtableTest
     txn_db_opts.use_per_key_point_lock_mgr = std::get<1>(GetParam());
     ASSERT_OK(TransactionDB::Open(options, txn_db_opts, dbname_, &txn_db));
     ASSERT_NE(txn_db, nullptr);
-    db_ = txn_db;
+    db_.reset(txn_db);
   }
 };
 
@@ -9453,9 +9453,9 @@ TEST_P(CommitBypassMemtableTest, Recovery) {
   VerifyDBFromMap(expected);
 
   ASSERT_OK(txn_db->Close());
-  delete txn_db;
+  db_.reset();  // destroys txn_db (owned by db_)
   ASSERT_OK(TransactionDB::Open(options, txn_db_opts, dbname_, &txn_db));
-  db_ = txn_db;
+  db_.reset(txn_db);
 
   VerifyDBFromMap(expected);
 }
diff --git a/utilities/transactions/write_prepared_transaction_test.cc b/utilities/transactions/write_prepared_transaction_test.cc
index 2b0056adc4d9..956c8e66a685 100644
--- a/utilities/transactions/write_prepared_transaction_test.cc
+++ b/utilities/transactions/write_prepared_transaction_test.cc
@@ -196,7 +196,7 @@ TEST(PreparedHeap, Concurrent) {
 TEST(WriteBatchWithIndex, SubBatchCnt) {
   ColumnFamilyOptions cf_options;
   std::string cf_name = "two";
-  DB* db;
+  std::unique_ptr<DB> db;
   Options options;
   options.create_if_missing = true;
   const std::string dbname = test::PerThreadDBPath("transaction_testdb");
@@ -285,7 +285,6 @@ TEST(WriteBatchWithIndex, SubBatchCnt) {
   }
 
   delete cf_handle;
-  delete db;
 }
 
 TEST(CommitEntry64b, BasicTest) {
diff --git a/utilities/ttl/db_ttl_impl.cc b/utilities/ttl/db_ttl_impl.cc
index 08a2515197f0..84a2dbce9bc6 100644
--- a/utilities/ttl/db_ttl_impl.cc
+++ b/utilities/ttl/db_ttl_impl.cc
@@ -305,7 +305,8 @@ int RegisterTtlObjects(ObjectLibrary& library, const std::string& /*arg*/) {
   return static_cast<int>(library.GetFactoryCount(&num_types));
 }
 // Open the db inside DBWithTTLImpl because options needs pointer to its ttl
-DBWithTTLImpl::DBWithTTLImpl(DB* db) : DBWithTTL(db), closed_(false) {}
+DBWithTTLImpl::DBWithTTLImpl(std::unique_ptr<DB>&& db)
+    : DBWithTTL(std::move(db)), closed_(false) {}
 
 DBWithTTLImpl::~DBWithTTLImpl() {
   if (!closed_) {
@@ -372,7 +373,7 @@ Status DBWithTTL::Open(
     DBWithTTLImpl::SanitizeOptions(
         ttls[i], &column_families_sanitized[i].options, clock);
   }
-  DB* db;
+  std::unique_ptr<DB> db;
 
   Status st;
   if (read_only) {
@@ -382,7 +383,7 @@ Status DBWithTTL::Open(
     st = DB::Open(db_options, dbname, column_families_sanitized, handles, &db);
   }
   if (st.ok()) {
-    *dbptr = new DBWithTTLImpl(db);
+    *dbptr = new DBWithTTLImpl(std::move(db));
   } else {
     *dbptr = nullptr;
   }
diff --git a/utilities/ttl/db_ttl_impl.h b/utilities/ttl/db_ttl_impl.h
index 9b7710739aa5..b8b3866233f0 100644
--- a/utilities/ttl/db_ttl_impl.h
+++ b/utilities/ttl/db_ttl_impl.h
@@ -32,7 +32,7 @@ class DBWithTTLImpl : public DBWithTTL {
                               SystemClock* clock);
 
   static void RegisterTtlClasses();
-  explicit DBWithTTLImpl(DB* db);
+  explicit DBWithTTLImpl(std::unique_ptr<DB>&& db);
 
   virtual ~DBWithTTLImpl();
 
diff --git a/utilities/ttl/ttl_test.cc b/utilities/ttl/ttl_test.cc
index 798d1d4425e2..26454c6ee08c 100644
--- a/utilities/ttl/ttl_test.cc
+++ b/utilities/ttl/ttl_test.cc
@@ -658,7 +658,7 @@ TEST_F(TtlTest, TtlFiftenYears) {
 }
 
 TEST_F(TtlTest, ColumnFamiliesTest) {
-  DB* db;
+  std::unique_ptr<DB> db;
   Options options;
   options.create_if_missing = true;
   options.env = env_.get();
@@ -669,7 +669,7 @@ TEST_F(TtlTest, ColumnFamiliesTest) {
                                    "ttl_column_family", &handle));
 
   delete handle;
-  delete db;
+  db.reset();
 
   std::vector<ColumnFamilyDescriptor> column_families;
   column_families.emplace_back(kDefaultColumnFamilyName,
diff --git a/utilities/write_batch_with_index/write_batch_with_index_test.cc b/utilities/write_batch_with_index/write_batch_with_index_test.cc
index a61de9129f23..caa1881e89b2 100644
--- a/utilities/write_batch_with_index/write_batch_with_index_test.cc
+++ b/utilities/write_batch_with_index/write_batch_with_index_test.cc
@@ -364,7 +364,7 @@ class WBWIBaseTest : public testing::Test {
 
     if (db_ != nullptr) {
       ReleaseSnapshot();
-      delete db_;
+      db_.reset();
       EXPECT_OK(DestroyDB(dbname_, options_));
     }
   }
@@ -435,7 +435,7 @@ class WBWIBaseTest : public testing::Test {
   }
 
  public:
-  DB* db_;
+  std::unique_ptr<DB> db_;
   std::string dbname_;
   Options options_;
   WriteOptions write_opts_;
@@ -1594,21 +1594,21 @@ TEST_P(WriteBatchWithIndexTest, TestGetFromBatchAndDB) {
   ASSERT_OK(batch_->Put("a", "batch_->a"));
   ASSERT_OK(batch_->Delete("b"));
 
-  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "a", &value));
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_.get(), read_opts_, "a", &value));
   ASSERT_EQ("batch_->a", value);
 
-  Status s = batch_->GetFromBatchAndDB(db_, read_opts_, "b", &value);
+  Status s = batch_->GetFromBatchAndDB(db_.get(), read_opts_, "b", &value);
   ASSERT_TRUE(s.IsNotFound());
 
-  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "c", &value));
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_.get(), read_opts_, "c", &value));
   ASSERT_EQ("c", value);
 
-  s = batch_->GetFromBatchAndDB(db_, read_opts_, "x", &value);
+  s = batch_->GetFromBatchAndDB(db_.get(), read_opts_, "x", &value);
   ASSERT_TRUE(s.IsNotFound());
 
   ASSERT_OK(db_->Delete(write_opts_, "x"));
 
-  s = batch_->GetFromBatchAndDB(db_, read_opts_, "x", &value);
+  s = batch_->GetFromBatchAndDB(db_.get(), read_opts_, "x", &value);
   ASSERT_TRUE(s.IsNotFound());
 }
 
@@ -1630,24 +1630,24 @@ TEST_P(WriteBatchWithIndexTest, TestGetFromBatchAndDBMerge) {
   ASSERT_OK(batch_->Merge("d", "d1"));
   ASSERT_OK(batch_->Merge("e", "e0"));
 
-  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "a", &value));
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_.get(), read_opts_, "a", &value));
   ASSERT_EQ("a0,a1,a2", value);
 
-  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "b", &value));
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_.get(), read_opts_, "b", &value));
   ASSERT_EQ("b0,b1,b2", value);
 
-  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "c", &value));
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_.get(), read_opts_, "c", &value));
   ASSERT_EQ("c0", value);
 
-  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "d", &value));
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_.get(), read_opts_, "d", &value));
   ASSERT_EQ("d0,d1", value);
 
-  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "e", &value));
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_.get(), read_opts_, "e", &value));
   ASSERT_EQ("e0", value);
 
   ASSERT_OK(db_->Delete(write_opts_, "x"));
 
-  s = batch_->GetFromBatchAndDB(db_, read_opts_, "x", &value);
+  s = batch_->GetFromBatchAndDB(db_.get(), read_opts_, "x", &value);
   ASSERT_TRUE(s.IsNotFound());
 
   const Snapshot* snapshot = db_->GetSnapshot();
@@ -1656,42 +1656,44 @@ TEST_P(WriteBatchWithIndexTest, TestGetFromBatchAndDBMerge) {
 
   ASSERT_OK(db_->Delete(write_opts_, "a"));
 
-  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "a", &value));
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_.get(), read_opts_, "a", &value));
   ASSERT_EQ("a1,a2", value);
 
-  ASSERT_OK(
-      s = batch_->GetFromBatchAndDB(db_, snapshot_read_options, "a", &value));
+  ASSERT_OK(s = batch_->GetFromBatchAndDB(db_.get(), snapshot_read_options, "a",
+                                          &value));
   ASSERT_EQ("a0,a1,a2", value);
 
   ASSERT_OK(batch_->Delete("a"));
 
-  s = batch_->GetFromBatchAndDB(db_, read_opts_, "a", &value);
+  s = batch_->GetFromBatchAndDB(db_.get(), read_opts_, "a", &value);
   ASSERT_TRUE(s.IsNotFound());
 
-  s = batch_->GetFromBatchAndDB(db_, snapshot_read_options, "a", &value);
+  s = batch_->GetFromBatchAndDB(db_.get(), snapshot_read_options, "a", &value);
   ASSERT_TRUE(s.IsNotFound());
 
   ASSERT_OK(s = db_->Merge(write_opts_, "c", "c1"));
 
-  ASSERT_OK(s = batch_->GetFromBatchAndDB(db_, read_opts_, "c", &value));
+  ASSERT_OK(s = batch_->GetFromBatchAndDB(db_.get(), read_opts_, "c", &value));
   ASSERT_EQ("c0,c1", value);
 
-  ASSERT_OK(
-      s = batch_->GetFromBatchAndDB(db_, snapshot_read_options, "c", &value));
+  ASSERT_OK(s = batch_->GetFromBatchAndDB(db_.get(), snapshot_read_options, "c",
+                                          &value));
   ASSERT_EQ("c0", value);
 
   ASSERT_OK(db_->Put(write_opts_, "e", "e1"));
-  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "e", &value));
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_.get(), read_opts_, "e", &value));
   ASSERT_EQ("e1,e0", value);
 
-  ASSERT_OK(batch_->GetFromBatchAndDB(db_, snapshot_read_options, "e", &value));
+  ASSERT_OK(
+      batch_->GetFromBatchAndDB(db_.get(), snapshot_read_options, "e", &value));
   ASSERT_EQ("e0", value);
 
   ASSERT_OK(s = db_->Delete(write_opts_, "e"));
-  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "e", &value));
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_.get(), read_opts_, "e", &value));
   ASSERT_EQ("e0", value);
 
-  ASSERT_OK(batch_->GetFromBatchAndDB(db_, snapshot_read_options, "e", &value));
+  ASSERT_OK(
+      batch_->GetFromBatchAndDB(db_.get(), snapshot_read_options, "e", &value));
   ASSERT_EQ("e0", value);
 
   db_->ReleaseSnapshot(snapshot);
@@ -1703,24 +1705,24 @@ TEST_F(WBWIOverwriteTest, TestGetFromBatchAndDBMerge2) {
 
   std::string value;
 
-  s = batch_->GetFromBatchAndDB(db_, read_opts_, "A", &value);
+  s = batch_->GetFromBatchAndDB(db_.get(), read_opts_, "A", &value);
   ASSERT_TRUE(s.IsNotFound());
 
   ASSERT_OK(batch_->Merge("A", "xxx"));
-  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "A", &value));
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_.get(), read_opts_, "A", &value));
   ASSERT_EQ(value, "xxx");
 
   ASSERT_OK(batch_->Merge("A", "yyy"));
-  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "A", &value));
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_.get(), read_opts_, "A", &value));
   ASSERT_EQ(value, "xxx,yyy");
 
   ASSERT_OK(db_->Put(write_opts_, "A", "a0"));
-  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "A", &value));
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_.get(), read_opts_, "A", &value));
   ASSERT_EQ(value, "a0,xxx,yyy");
 
   ASSERT_OK(batch_->Delete("A"));
 
-  s = batch_->GetFromBatchAndDB(db_, read_opts_, "A", &value);
+  s = batch_->GetFromBatchAndDB(db_.get(), read_opts_, "A", &value);
   ASSERT_TRUE(s.IsNotFound());
 }
 
@@ -1735,7 +1737,7 @@ TEST_P(WriteBatchWithIndexTest, TestGetFromBatchAndDBMerge3) {
   ASSERT_OK(db_->Flush(flush_options, db_->DefaultColumnFamily()));
   ASSERT_OK(batch_->Merge("A", "2"));
 
-  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "A", &value));
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_.get(), read_opts_, "A", &value));
   ASSERT_EQ(value, "1,2");
 }
 
@@ -1761,23 +1763,23 @@ TEST_P(WriteBatchWithIndexTest, TestPinnedGetFromBatchAndDB) {
       // Do it again with a flushed DB...
       ASSERT_OK(db_->Flush(FlushOptions(), db_->DefaultColumnFamily()));
     }
-    ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "a", &value));
+    ASSERT_OK(batch_->GetFromBatchAndDB(db_.get(), read_opts_, "a", &value));
     ASSERT_EQ("a0,a1,a2", value.ToString());
 
-    ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "b", &value));
+    ASSERT_OK(batch_->GetFromBatchAndDB(db_.get(), read_opts_, "b", &value));
     ASSERT_EQ("b0,b1,b2", value.ToString());
 
-    ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "c", &value));
+    ASSERT_OK(batch_->GetFromBatchAndDB(db_.get(), read_opts_, "c", &value));
     ASSERT_EQ("c0", value.ToString());
 
-    ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "d", &value));
+    ASSERT_OK(batch_->GetFromBatchAndDB(db_.get(), read_opts_, "d", &value));
     ASSERT_EQ("d0,d1", value.ToString());
 
-    ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "e", &value));
+    ASSERT_OK(batch_->GetFromBatchAndDB(db_.get(), read_opts_, "e", &value));
     ASSERT_EQ("e0", value.ToString());
     ASSERT_OK(db_->Delete(write_opts_, "x"));
 
-    s = batch_->GetFromBatchAndDB(db_, read_opts_, "x", &value);
+    s = batch_->GetFromBatchAndDB(db_.get(), read_opts_, "x", &value);
     ASSERT_TRUE(s.IsNotFound());
   }
 }
@@ -2595,7 +2597,7 @@ TEST_P(WriteBatchWithIndexTest, MultiGetTest) {
   std::vector<PinnableSlice> values(keys.size());
   std::vector<Status> statuses(keys.size());
 
-  batch_->MultiGetFromBatchAndDB(db_, read_opts_, cf0, key_slices.size(),
+  batch_->MultiGetFromBatchAndDB(db_.get(), read_opts_, cf0, key_slices.size(),
                                  key_slices.data(), values.data(),
                                  statuses.data(), false);
   for (size_t i = 0; i < keys.size(); ++i) {
@@ -2674,7 +2676,7 @@ TEST_P(WriteBatchWithIndexTest, MultiGetTest2) {
       int random = rnd.Uniform(num_keys);
       key_slices.emplace_back(keys[random]);
     }
-    batch_->MultiGetFromBatchAndDB(db_, read_opts_, cf0, keys_per_pass,
+    batch_->MultiGetFromBatchAndDB(db_.get(), read_opts_, cf0, keys_per_pass,
                                    key_slices.data(), values.data(),
                                    statuses.data(), false);
     for (size_t i = 0; i < keys_per_pass; i++) {
@@ -2827,9 +2829,9 @@ TEST_P(WriteBatchWithIndexTest, GetFromBatchAndDBAfterMerge) {
   ASSERT_OK(db_->Put(write_opts_, "o", "aa"));
   ASSERT_OK(batch_->Merge("o", "bb"));  // Merging bb under key "o"
   ASSERT_OK(batch_->Merge("m", "cc"));  // Merging bc under key "m"
-  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "o", &value));
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_.get(), read_opts_, "o", &value));
   ASSERT_EQ(value, "aa,bb");
-  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "m", &value));
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_.get(), read_opts_, "m", &value));
   ASSERT_EQ(value, "cc");
 }
 
@@ -2843,19 +2845,19 @@ TEST_P(WriteBatchWithIndexTest, GetAfterPut) {
   ASSERT_OK(batch_->Put("key", "aa"));  // Writing aa under key
   ASSERT_OK(batch_->GetFromBatch(cf0, options_, "key", &value));
   ASSERT_EQ(value, "aa");
-  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "key", &value));
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_.get(), read_opts_, "key", &value));
   ASSERT_EQ(value, "aa");
 
   ASSERT_OK(batch_->Merge("key", "bb"));  // Merging bb under key
   ASSERT_OK(batch_->GetFromBatch(cf0, options_, "key", &value));
   ASSERT_EQ(value, "aa,bb");
-  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "key", &value));
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_.get(), read_opts_, "key", &value));
   ASSERT_EQ(value, "aa,bb");
 
   ASSERT_OK(batch_->Merge("key", "cc"));  // Merging cc under key
   ASSERT_OK(batch_->GetFromBatch(cf0, options_, "key", &value));
   ASSERT_EQ(value, "aa,bb,cc");
-  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "key", &value));
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_.get(), read_opts_, "key", &value));
   ASSERT_EQ(value, "aa,bb,cc");
 }
 
@@ -2868,25 +2870,25 @@ TEST_P(WriteBatchWithIndexTest, GetAfterMergePut) {
   ASSERT_OK(batch_->Merge("key", "aa"));  // Merging aa under key
   Status s = batch_->GetFromBatch(cf0, options_, "key", &value);
   ASSERT_EQ(s.code(), Status::Code::kMergeInProgress);
-  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "key", &value));
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_.get(), read_opts_, "key", &value));
   ASSERT_EQ(value, "orig,aa");
 
   ASSERT_OK(batch_->Merge("key", "bb"));  // Merging bb under key
   s = batch_->GetFromBatch(cf0, options_, "key", &value);
   ASSERT_EQ(s.code(), Status::Code::kMergeInProgress);
-  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "key", &value));
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_.get(), read_opts_, "key", &value));
   ASSERT_EQ(value, "orig,aa,bb");
 
   ASSERT_OK(batch_->Put("key", "cc"));  // Writing cc under key
   ASSERT_OK(batch_->GetFromBatch(cf0, options_, "key", &value));
   ASSERT_EQ(value, "cc");
-  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "key", &value));
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_.get(), read_opts_, "key", &value));
   ASSERT_EQ(value, "cc");
 
   ASSERT_OK(batch_->Merge("key", "dd"));  // Merging dd under key
   ASSERT_OK(batch_->GetFromBatch(cf0, options_, "key", &value));
   ASSERT_EQ(value, "cc,dd");
-  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "key", &value));
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_.get(), read_opts_, "key", &value));
   ASSERT_EQ(value, "cc,dd");
 }
 
@@ -2898,30 +2900,30 @@ TEST_P(WriteBatchWithIndexTest, GetAfterMergeDelete) {
   ASSERT_OK(batch_->Merge("key", "aa"));  // Merging aa under key
   Status s = batch_->GetFromBatch(cf0, options_, "key", &value);
   ASSERT_EQ(s.code(), Status::Code::kMergeInProgress);
-  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "key", &value));
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_.get(), read_opts_, "key", &value));
   ASSERT_EQ(value, "aa");
 
   ASSERT_OK(batch_->Merge("key", "bb"));  // Merging bb under key
   s = batch_->GetFromBatch(cf0, options_, "key", &value);
   ASSERT_EQ(s.code(), Status::Code::kMergeInProgress);
-  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "key", &value));
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_.get(), read_opts_, "key", &value));
   ASSERT_EQ(value, "aa,bb");
 
   ASSERT_OK(batch_->Delete("key"));  // Delete key from batch
   s = batch_->GetFromBatch(cf0, options_, "key", &value);
   ASSERT_TRUE(s.IsNotFound());
-  s = batch_->GetFromBatchAndDB(db_, read_opts_, "key", &value);
+  s = batch_->GetFromBatchAndDB(db_.get(), read_opts_, "key", &value);
   ASSERT_TRUE(s.IsNotFound());
 
   ASSERT_OK(batch_->Merge("key", "cc"));  // Merging cc under key
   ASSERT_OK(batch_->GetFromBatch(cf0, options_, "key", &value));
   ASSERT_EQ(value, "cc");
-  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "key", &value));
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_.get(), read_opts_, "key", &value));
   ASSERT_EQ(value, "cc");
   ASSERT_OK(batch_->Merge("key", "dd"));  // Merging dd under key
   ASSERT_OK(batch_->GetFromBatch(cf0, options_, "key", &value));
   ASSERT_EQ(value, "cc,dd");
-  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "key", &value));
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_.get(), read_opts_, "key", &value));
   ASSERT_EQ(value, "cc,dd");
 }
 
@@ -2947,9 +2949,9 @@ TEST_P(WriteBatchWithIndexTest, TestBadMergeOperator) {
   ASSERT_OK(batch_->Put("b", "b0"));
 
   ASSERT_OK(batch_->Merge("a", "a1"));
-  ASSERT_NOK(batch_->GetFromBatchAndDB(db_, read_opts_, "a", &value));
+  ASSERT_NOK(batch_->GetFromBatchAndDB(db_.get(), read_opts_, "a", &value));
   ASSERT_NOK(batch_->GetFromBatch(column_family, options_, "a", &value));
-  ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, "b", &value));
+  ASSERT_OK(batch_->GetFromBatchAndDB(db_.get(), read_opts_, "b", &value));
   ASSERT_OK(batch_->GetFromBatch(column_family, options_, "b", &value));
 }
 
@@ -2972,7 +2974,7 @@ TEST_P(WriteBatchWithIndexTest, ColumnFamilyWithTimestamp) {
   {
     std::string value;
     ASSERT_TRUE(
-        batch_->GetFromBatchAndDB(db_, ReadOptions(), &cf2, "key", &value)
+        batch_->GetFromBatchAndDB(db_.get(), ReadOptions(), &cf2, "key", &value)
             .IsInvalidArgument());
   }
   {
@@ -2982,7 +2984,7 @@ TEST_P(WriteBatchWithIndexTest, ColumnFamilyWithTimestamp) {
         {PinnableSlice(), PinnableSlice()}};
     std::array<Status, num_keys> statuses{{Status(), Status()}};
     constexpr bool sorted_input = false;
-    batch_->MultiGetFromBatchAndDB(db_, ReadOptions(), &cf2, num_keys,
+    batch_->MultiGetFromBatchAndDB(db_.get(), ReadOptions(), &cf2, num_keys,
                                    keys.data(), pinnable_vals.data(),
                                    statuses.data(), sorted_input);
     for (const auto& s : statuses) {
@@ -3143,13 +3145,15 @@ TEST_P(WriteBatchWithIndexTest, WideColumnsBatchOnly) {
   // GetFromBatchAndDB
   {
     PinnableSlice value;
-    ASSERT_TRUE(batch_->GetFromBatchAndDB(db_, read_opts_, delete_key, &value)
-                    .IsNotFound());
+    ASSERT_TRUE(
+        batch_->GetFromBatchAndDB(db_.get(), read_opts_, delete_key, &value)
+            .IsNotFound());
   }
 
   for (size_t i = 1; i < num_keys; ++i) {
     PinnableSlice value;
-    ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, keys[i], &value));
+    ASSERT_OK(
+        batch_->GetFromBatchAndDB(db_.get(), read_opts_, keys[i], &value));
     ASSERT_EQ(value, expected[i].front().value());
   }
 
@@ -3159,9 +3163,9 @@ TEST_P(WriteBatchWithIndexTest, WideColumnsBatchOnly) {
     std::array<Status, num_keys> statuses;
     constexpr bool sorted_input = false;
 
-    batch_->MultiGetFromBatchAndDB(db_, read_opts_, db_->DefaultColumnFamily(),
-                                   num_keys, keys.data(), values.data(),
-                                   statuses.data(), sorted_input);
+    batch_->MultiGetFromBatchAndDB(
+        db_.get(), read_opts_, db_->DefaultColumnFamily(), num_keys,
+        keys.data(), values.data(), statuses.data(), sorted_input);
 
     ASSERT_TRUE(statuses[0].IsNotFound());
 
@@ -3175,7 +3179,7 @@ TEST_P(WriteBatchWithIndexTest, WideColumnsBatchOnly) {
   {
     PinnableWideColumns columns;
     ASSERT_TRUE(batch_
-                    ->GetEntityFromBatchAndDB(db_, read_opts_,
+                    ->GetEntityFromBatchAndDB(db_.get(), read_opts_,
                                               db_->DefaultColumnFamily(),
                                               delete_key, &columns)
                     .IsNotFound());
@@ -3184,7 +3188,7 @@ TEST_P(WriteBatchWithIndexTest, WideColumnsBatchOnly) {
   for (size_t i = 1; i < num_keys; ++i) {
     PinnableWideColumns columns;
     ASSERT_OK(batch_->GetEntityFromBatchAndDB(
-        db_, read_opts_, db_->DefaultColumnFamily(), keys[i], &columns));
+        db_.get(), read_opts_, db_->DefaultColumnFamily(), keys[i], &columns));
     ASSERT_EQ(columns.columns(), expected[i]);
   }
 
@@ -3195,8 +3199,8 @@ TEST_P(WriteBatchWithIndexTest, WideColumnsBatchOnly) {
     constexpr bool sorted_input = false;
 
     batch_->MultiGetEntityFromBatchAndDB(
-        db_, read_opts_, db_->DefaultColumnFamily(), num_keys, keys.data(),
-        results.data(), statuses.data(), sorted_input);
+        db_.get(), read_opts_, db_->DefaultColumnFamily(), num_keys,
+        keys.data(), results.data(), statuses.data(), sorted_input);
 
     ASSERT_TRUE(statuses[0].IsNotFound());
 
@@ -3293,14 +3297,15 @@ TEST_P(WriteBatchWithIndexTest, WideColumnsBatchAndDB) {
   // GetFromBatchAndDB
   for (size_t i = 0; i < num_keys - 1; ++i) {
     PinnableSlice value;
-    ASSERT_OK(batch_->GetFromBatchAndDB(db_, read_opts_, keys[i], &value));
+    ASSERT_OK(
+        batch_->GetFromBatchAndDB(db_.get(), read_opts_, keys[i], &value));
     ASSERT_EQ(value, expected[i].front().value());
   }
 
   {
     PinnableSlice value;
     ASSERT_TRUE(
-        batch_->GetFromBatchAndDB(db_, read_opts_, no_merge_c_key, &value)
+        batch_->GetFromBatchAndDB(db_.get(), read_opts_, no_merge_c_key, &value)
             .IsNotFound());
   }
 
@@ -3310,9 +3315,9 @@ TEST_P(WriteBatchWithIndexTest, WideColumnsBatchAndDB) {
     std::array<Status, num_keys> statuses;
     constexpr bool sorted_input = false;
 
-    batch_->MultiGetFromBatchAndDB(db_, read_opts_, db_->DefaultColumnFamily(),
-                                   num_keys, keys.data(), values.data(),
-                                   statuses.data(), sorted_input);
+    batch_->MultiGetFromBatchAndDB(
+        db_.get(), read_opts_, db_->DefaultColumnFamily(), num_keys,
+        keys.data(), values.data(), statuses.data(), sorted_input);
 
     for (size_t i = 0; i < num_keys - 1; ++i) {
       ASSERT_OK(statuses[i]);
@@ -3326,14 +3331,14 @@ TEST_P(WriteBatchWithIndexTest, WideColumnsBatchAndDB) {
   for (size_t i = 0; i < num_keys - 1; ++i) {
     PinnableWideColumns columns;
     ASSERT_OK(batch_->GetEntityFromBatchAndDB(
-        db_, read_opts_, db_->DefaultColumnFamily(), keys[i], &columns));
+        db_.get(), read_opts_, db_->DefaultColumnFamily(), keys[i], &columns));
     ASSERT_EQ(columns.columns(), expected[i]);
   }
 
   {
     PinnableWideColumns columns;
     ASSERT_TRUE(batch_
-                    ->GetEntityFromBatchAndDB(db_, read_opts_,
+                    ->GetEntityFromBatchAndDB(db_.get(), read_opts_,
                                               db_->DefaultColumnFamily(),
                                               no_merge_c_key, &columns)
                     .IsNotFound());
@@ -3346,8 +3351,8 @@ TEST_P(WriteBatchWithIndexTest, WideColumnsBatchAndDB) {
     constexpr bool sorted_input = false;
 
     batch_->MultiGetEntityFromBatchAndDB(
-        db_, read_opts_, db_->DefaultColumnFamily(), num_keys, keys.data(),
-        results.data(), statuses.data(), sorted_input);
+        db_.get(), read_opts_, db_->DefaultColumnFamily(), num_keys,
+        keys.data(), results.data(), statuses.data(), sorted_input);
 
     for (size_t i = 0; i < num_keys - 1; ++i) {
       ASSERT_OK(statuses[i]);
@@ -3551,15 +3556,15 @@ TEST_P(WriteBatchWithIndexTest, EntityReadSanityChecks) {
     constexpr ColumnFamilyHandle* column_family = nullptr;
     PinnableWideColumns columns;
     ASSERT_TRUE(batch_
-                    ->GetEntityFromBatchAndDB(db_, ReadOptions(), column_family,
-                                              foo, &columns)
+                    ->GetEntityFromBatchAndDB(db_.get(), ReadOptions(),
+                                              column_family, foo, &columns)
                     .IsInvalidArgument());
   }
 
   {
     constexpr PinnableWideColumns* columns = nullptr;
     ASSERT_TRUE(batch_
-                    ->GetEntityFromBatchAndDB(db_, ReadOptions(),
+                    ->GetEntityFromBatchAndDB(db_.get(), ReadOptions(),
                                               db_->DefaultColumnFamily(), foo,
                                               columns)
                     .IsInvalidArgument());
@@ -3571,7 +3576,7 @@ TEST_P(WriteBatchWithIndexTest, EntityReadSanityChecks) {
 
     PinnableWideColumns columns;
     ASSERT_TRUE(batch_
-                    ->GetEntityFromBatchAndDB(db_, read_options,
+                    ->GetEntityFromBatchAndDB(db_.get(), read_options,
                                               db_->DefaultColumnFamily(), foo,
                                               &columns)
                     .IsInvalidArgument());
@@ -3599,9 +3604,9 @@ TEST_P(WriteBatchWithIndexTest, EntityReadSanityChecks) {
     std::array<Status, num_keys> statuses;
     constexpr bool sorted_input = false;
 
-    batch_->MultiGetEntityFromBatchAndDB(db_, ReadOptions(), column_family,
-                                         num_keys, keys.data(), results.data(),
-                                         statuses.data(), sorted_input);
+    batch_->MultiGetEntityFromBatchAndDB(
+        db_.get(), ReadOptions(), column_family, num_keys, keys.data(),
+        results.data(), statuses.data(), sorted_input);
 
     ASSERT_TRUE(statuses[0].IsInvalidArgument());
     ASSERT_TRUE(statuses[1].IsInvalidArgument());
@@ -3614,7 +3619,7 @@ TEST_P(WriteBatchWithIndexTest, EntityReadSanityChecks) {
     constexpr bool sorted_input = false;
 
     batch_->MultiGetEntityFromBatchAndDB(
-        db_, ReadOptions(), db_->DefaultColumnFamily(), num_keys, keys,
+        db_.get(), ReadOptions(), db_->DefaultColumnFamily(), num_keys, keys,
         results.data(), statuses.data(), sorted_input);
 
     ASSERT_TRUE(statuses[0].IsInvalidArgument());
@@ -3628,8 +3633,8 @@ TEST_P(WriteBatchWithIndexTest, EntityReadSanityChecks) {
     constexpr bool sorted_input = false;
 
     batch_->MultiGetEntityFromBatchAndDB(
-        db_, ReadOptions(), db_->DefaultColumnFamily(), num_keys, keys.data(),
-        results, statuses.data(), sorted_input);
+        db_.get(), ReadOptions(), db_->DefaultColumnFamily(), num_keys,
+        keys.data(), results, statuses.data(), sorted_input);
 
     ASSERT_TRUE(statuses[0].IsInvalidArgument());
     ASSERT_TRUE(statuses[1].IsInvalidArgument());
@@ -3645,8 +3650,8 @@ TEST_P(WriteBatchWithIndexTest, EntityReadSanityChecks) {
     constexpr bool sorted_input = false;
 
     batch_->MultiGetEntityFromBatchAndDB(
-        db_, read_options, db_->DefaultColumnFamily(), num_keys, keys.data(),
-        results.data(), statuses.data(), sorted_input);
+        db_.get(), read_options, db_->DefaultColumnFamily(), num_keys,
+        keys.data(), results.data(), statuses.data(), sorted_input);
     ASSERT_TRUE(statuses[0].IsInvalidArgument());
     ASSERT_TRUE(statuses[1].IsInvalidArgument());
   }

From 61b4edd15eff4513f177244e55cf3c63645cc0af Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Wed, 18 Feb 2026 10:52:04 -0800
Subject: [PATCH 489/500] Test compaction forward compatibility (#14344)

Summary:
Extending https://github.com/facebook/rocksdb/issues/14323 by testing scenarios for compaction after downgrade. Detail: we shouldn't need to test loading options with compaction, as options file inclusion is mostly a sanity check for "can you open the DB with options file?"

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14344

Test Plan: manual run of SHORT_TEST=1 J=140 tools/check

Reviewed By: xingbowang

Differential Revision: D93553897

Pulled By: pdillinger

fbshipit-source-id: ec08ae2a3d49971e24a215e38df9506fe1133096
---
 tools/check_format_compatible.sh | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/tools/check_format_compatible.sh b/tools/check_format_compatible.sh
index 8b4c5ccdd7c1..44c513caf2f5 100755
--- a/tools/check_format_compatible.sh
+++ b/tools/check_format_compatible.sh
@@ -367,6 +367,13 @@ do
   then
     echo "== Use $checkout_ref to open DB generated using $current_checkout_name..."
     compare_db $db_test_dir/$checkout_ref $current_db_test_dir forward_${checkout_ref}_dump.txt 0
+
+    echo "== Use $checkout_ref to compact a copy of DB generated using $current_checkout_name..."
+    [ "$SANITY_CHECK" ] || cp -a $current_db_test_dir ${current_db_test_dir}_copy_for_${checkout_ref}
+    compact_db ${current_db_test_dir}_copy_for_${checkout_ref} 0
+
+    echo "== After compaction, re-verify DB copy originally from $current_checkout_name..."
+    compare_db ${current_db_test_dir}_copy_for_${checkout_ref} $current_db_test_dir forward_${checkout_ref}_dump_after_compact.txt 0
   fi
 
   if member_of_array "$checkout_ref" "${db_forward_with_options_refs[@]}"

From 3556c2205962bbf66707e290fe23249bf346f43e Mon Sep 17 00:00:00 2001
From: xingbowang <shawn.xingbo.wang@gmail.com>
Date: Thu, 19 Feb 2026 14:12:38 -0800
Subject: [PATCH 490/500] Remove deprecated option
 skip_checking_sst_file_sizes_on_db_open (#14346)

Summary:
Remove deprecated option skip_checking_sst_file_sizes_on_db_open

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14346

Test Plan: Unit test

Reviewed By: hx235

Differential Revision: D93602683

Pulled By: xingbowang

fbshipit-source-id: f576825cb107bb0aeb14f4ff29fef0df269b8728
---
 db/c.cc                                       | 10 ----
 include/rocksdb/c.h                           |  7 ---
 include/rocksdb/options.h                     | 11 -----
 java/rocksjni/options.cc                      | 48 -------------------
 java/src/main/java/org/rocksdb/DBOptions.java | 16 -------
 .../java/org/rocksdb/DBOptionsInterface.java  | 30 ------------
 java/src/main/java/org/rocksdb/Options.java   | 16 -------
 .../test/java/org/rocksdb/DBOptionsTest.java  |  9 ----
 .../test/java/org/rocksdb/OptionsTest.java    |  9 ----
 options/db_options.cc                         |  6 +--
 options/db_options.h                          |  1 -
 options/options_helper.cc                     |  2 -
 options/options_settable_test.cc              |  1 -
 ...skip_checking_sst_file_sizes_on_db_open.md |  1 +
 14 files changed, 2 insertions(+), 165 deletions(-)
 create mode 100644 unreleased_history/public_api_changes/remove_skip_checking_sst_file_sizes_on_db_open.md

diff --git a/db/c.cc b/db/c.cc
index 947b538f1ea8..6664d8d0a06a 100644
--- a/db/c.cc
+++ b/db/c.cc
@@ -4672,16 +4672,6 @@ unsigned char rocksdb_options_get_skip_stats_update_on_db_open(
   return opt->rep.skip_stats_update_on_db_open;
 }
 
-void rocksdb_options_set_skip_checking_sst_file_sizes_on_db_open(
-    rocksdb_options_t* opt, unsigned char val) {
-  opt->rep.skip_checking_sst_file_sizes_on_db_open = val;
-}
-
-unsigned char rocksdb_options_get_skip_checking_sst_file_sizes_on_db_open(
-    rocksdb_options_t* opt) {
-  return opt->rep.skip_checking_sst_file_sizes_on_db_open;
-}
-
 /* Blob Options Settings */
 void rocksdb_options_set_enable_blob_files(rocksdb_options_t* opt,
                                            unsigned char val) {
diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h
index ab8efecabb78..ffb0f793f8e6 100644
--- a/include/rocksdb/c.h
+++ b/include/rocksdb/c.h
@@ -1668,13 +1668,6 @@ rocksdb_options_set_skip_stats_update_on_db_open(rocksdb_options_t* opt,
                                                  unsigned char val);
 extern ROCKSDB_LIBRARY_API unsigned char
 rocksdb_options_get_skip_stats_update_on_db_open(rocksdb_options_t* opt);
-extern ROCKSDB_LIBRARY_API void
-rocksdb_options_set_skip_checking_sst_file_sizes_on_db_open(
-    rocksdb_options_t* opt, unsigned char val);
-extern ROCKSDB_LIBRARY_API unsigned char
-rocksdb_options_get_skip_checking_sst_file_sizes_on_db_open(
-    rocksdb_options_t* opt);
-
 /* Blob Options Settings */
 extern ROCKSDB_LIBRARY_API void rocksdb_options_set_enable_blob_files(
     rocksdb_options_t* opt, unsigned char val);
diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index fdbc5f530424..96c33656953d 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -1363,17 +1363,6 @@ struct DBOptions {
   // Default: false
   bool skip_stats_update_on_db_open = false;
 
-  // This option is deprecated and marked as no-op. Kept for backward
-  // compatibility until usage is fully removed.
-  // File size check will be performed through a thread
-  // pool during DB Open, when max_open_files is set to -1.
-  // Therefore, the concern of DB Open slowness is eliminated.
-  // Note that when max_open_files is not set to -1, only a subset of files will
-  // be opened and checked during DB Open.
-  //
-  // Default: false
-  bool skip_checking_sst_file_sizes_on_db_open = false;
-
   // Recovery mode to control the consistency while replaying WAL
   // Default: kPointInTimeRecovery
   WALRecoveryMode wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
diff --git a/java/rocksjni/options.cc b/java/rocksjni/options.cc
index 2bb07cf45828..1e921461b8ce 100644
--- a/java/rocksjni/options.cc
+++ b/java/rocksjni/options.cc
@@ -1959,30 +1959,6 @@ jboolean Java_org_rocksdb_Options_skipStatsUpdateOnDbOpen(JNIEnv*, jclass,
   return static_cast<jboolean>(opt->skip_stats_update_on_db_open);
 }
 
-/*
- * Class:     org_rocksdb_Options
- * Method:    setSkipCheckingSstFileSizesOnDbOpen
- * Signature: (JZ)V
- */
-void Java_org_rocksdb_Options_setSkipCheckingSstFileSizesOnDbOpen(
-    JNIEnv*, jclass, jlong jhandle,
-    jboolean jskip_checking_sst_file_sizes_on_db_open) {
-  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
-  opt->skip_checking_sst_file_sizes_on_db_open =
-      static_cast<bool>(jskip_checking_sst_file_sizes_on_db_open);
-}
-
-/*
- * Class:     org_rocksdb_Options
- * Method:    skipCheckingSstFileSizesOnDbOpen
- * Signature: (J)Z
- */
-jboolean Java_org_rocksdb_Options_skipCheckingSstFileSizesOnDbOpen(
-    JNIEnv*, jclass, jlong jhandle) {
-  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
-  return static_cast<jboolean>(opt->skip_checking_sst_file_sizes_on_db_open);
-}
-
 /*
  * Class:     org_rocksdb_Options
  * Method:    setWalRecoveryMode
@@ -7359,30 +7335,6 @@ jboolean Java_org_rocksdb_DBOptions_skipStatsUpdateOnDbOpen(JNIEnv*, jclass,
   return static_cast<jboolean>(opt->skip_stats_update_on_db_open);
 }
 
-/*
- * Class:     org_rocksdb_DBOptions
- * Method:    setSkipCheckingSstFileSizesOnDbOpen
- * Signature: (JZ)V
- */
-void Java_org_rocksdb_DBOptions_setSkipCheckingSstFileSizesOnDbOpen(
-    JNIEnv*, jclass, jlong jhandle,
-    jboolean jskip_checking_sst_file_sizes_on_db_open) {
-  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
-  opt->skip_checking_sst_file_sizes_on_db_open =
-      static_cast<bool>(jskip_checking_sst_file_sizes_on_db_open);
-}
-
-/*
- * Class:     org_rocksdb_DBOptions
- * Method:    skipCheckingSstFileSizesOnDbOpen
- * Signature: (J)Z
- */
-jboolean Java_org_rocksdb_DBOptions_skipCheckingSstFileSizesOnDbOpen(
-    JNIEnv*, jclass, jlong jhandle) {
-  auto* opt = reinterpret_cast<ROCKSDB_NAMESPACE::DBOptions*>(jhandle);
-  return static_cast<jboolean>(opt->skip_checking_sst_file_sizes_on_db_open);
-}
-
 /*
  * Class:     org_rocksdb_DBOptions
  * Method:    setWalRecoveryMode
diff --git a/java/src/main/java/org/rocksdb/DBOptions.java b/java/src/main/java/org/rocksdb/DBOptions.java
index 0221a63fba07..12f5d4913c2f 100644
--- a/java/src/main/java/org/rocksdb/DBOptions.java
+++ b/java/src/main/java/org/rocksdb/DBOptions.java
@@ -962,19 +962,6 @@ public boolean skipStatsUpdateOnDbOpen() {
     return skipStatsUpdateOnDbOpen(nativeHandle_);
   }
 
-  @Override
-  public DBOptions setSkipCheckingSstFileSizesOnDbOpen(
-      final boolean skipCheckingSstFileSizesOnDbOpen) {
-    setSkipCheckingSstFileSizesOnDbOpen(nativeHandle_, skipCheckingSstFileSizesOnDbOpen);
-    return this;
-  }
-
-  @Override
-  public boolean skipCheckingSstFileSizesOnDbOpen() {
-    assert (isOwningHandle());
-    return skipCheckingSstFileSizesOnDbOpen(nativeHandle_);
-  }
-
   @Override
   public DBOptions setWalRecoveryMode(final WALRecoveryMode walRecoveryMode) {
     assert(isOwningHandle());
@@ -1389,9 +1376,6 @@ private static native void setWriteThreadSlowYieldUsec(
   private static native void setSkipStatsUpdateOnDbOpen(
       final long handle, final boolean skipStatsUpdateOnDbOpen);
   private static native boolean skipStatsUpdateOnDbOpen(final long handle);
-  private static native void setSkipCheckingSstFileSizesOnDbOpen(
-      final long handle, final boolean skipChecking);
-  private static native boolean skipCheckingSstFileSizesOnDbOpen(final long handle);
   private static native void setWalRecoveryMode(final long handle, final byte walRecoveryMode);
   private static native byte walRecoveryMode(final long handle);
   private static native void setAllow2pc(final long handle, final boolean allow2pc);
diff --git a/java/src/main/java/org/rocksdb/DBOptionsInterface.java b/java/src/main/java/org/rocksdb/DBOptionsInterface.java
index bc9d9acbd65e..f40fc1a25cfe 100644
--- a/java/src/main/java/org/rocksdb/DBOptionsInterface.java
+++ b/java/src/main/java/org/rocksdb/DBOptionsInterface.java
@@ -1214,36 +1214,6 @@ T setEnableWriteThreadAdaptiveYield(
    */
   boolean skipStatsUpdateOnDbOpen();
 
-  /**
-   * If true, then {@link RocksDB#open(String)} will not fetch and check sizes of all sst files.
-   * This may significantly speed up startup if there are many sst files,
-   * especially when using non-default Env with expensive GetFileSize().
-   * We'll still check that all required sst files exist.
-   * If {@code paranoid_checks} is false, this option is ignored, and sst files are
-   * not checked at all.
-   *
-   * Default: false
-   *
-   * @param skipCheckingSstFileSizesOnDbOpen if true, then SST file sizes will not be checked
-   *                                         when calling {@link RocksDB#open(String)}.
-   * @return the reference to the current options.
-   */
-  T setSkipCheckingSstFileSizesOnDbOpen(final boolean skipCheckingSstFileSizesOnDbOpen);
-
-  /**
-   * If true, then {@link RocksDB#open(String)} will not fetch and check sizes of all sst files.
-   * This may significantly speed up startup if there are many sst files,
-   * especially when using non-default Env with expensive GetFileSize().
-   * We'll still check that all required sst files exist.
-   * If {@code paranoid_checks} is false, this option is ignored, and sst files are
-   * not checked at all.
-   *
-   * Default: false
-   *
-   * @return true, if file sizes will not be checked when calling {@link RocksDB#open(String)}.
-   */
-  boolean skipCheckingSstFileSizesOnDbOpen();
-
   /**
    * Recovery mode to control the consistency while replaying WAL
    *
diff --git a/java/src/main/java/org/rocksdb/Options.java b/java/src/main/java/org/rocksdb/Options.java
index 675837df7a09..3e7bf28405e8 100644
--- a/java/src/main/java/org/rocksdb/Options.java
+++ b/java/src/main/java/org/rocksdb/Options.java
@@ -1045,19 +1045,6 @@ public boolean skipStatsUpdateOnDbOpen() {
     return skipStatsUpdateOnDbOpen(nativeHandle_);
   }
 
-  @Override
-  public Options setSkipCheckingSstFileSizesOnDbOpen(
-      final boolean skipCheckingSstFileSizesOnDbOpen) {
-    setSkipCheckingSstFileSizesOnDbOpen(nativeHandle_, skipCheckingSstFileSizesOnDbOpen);
-    return this;
-  }
-
-  @Override
-  public boolean skipCheckingSstFileSizesOnDbOpen() {
-    assert (isOwningHandle());
-    return skipCheckingSstFileSizesOnDbOpen(nativeHandle_);
-  }
-
   @Override
   public Options setWalRecoveryMode(final WALRecoveryMode walRecoveryMode) {
     assert(isOwningHandle());
@@ -2283,9 +2270,6 @@ private static native void setWriteThreadSlowYieldUsec(
   private static native void setSkipStatsUpdateOnDbOpen(
       final long handle, final boolean skipStatsUpdateOnDbOpen);
   private static native boolean skipStatsUpdateOnDbOpen(final long handle);
-  private static native void setSkipCheckingSstFileSizesOnDbOpen(
-      final long handle, final boolean skipChecking);
-  private static native boolean skipCheckingSstFileSizesOnDbOpen(final long handle);
   private static native void setWalRecoveryMode(final long handle, final byte walRecoveryMode);
   private static native byte walRecoveryMode(final long handle);
   private static native void setAllow2pc(final long handle, final boolean allow2pc);
diff --git a/java/src/test/java/org/rocksdb/DBOptionsTest.java b/java/src/test/java/org/rocksdb/DBOptionsTest.java
index cf3ef22ddeb4..0dc1d0cb0a8c 100644
--- a/java/src/test/java/org/rocksdb/DBOptionsTest.java
+++ b/java/src/test/java/org/rocksdb/DBOptionsTest.java
@@ -830,15 +830,6 @@ public void maxWriteBatchGroupSizeBytes() {
     }
   }
 
-  @Test
-  public void skipCheckingSstFileSizesOnDbOpen() {
-    try (final DBOptions options = new DBOptions()) {
-      assertThat(options.skipCheckingSstFileSizesOnDbOpen()).isEqualTo(false);
-      assertThat(options.setSkipCheckingSstFileSizesOnDbOpen(true)).isEqualTo(options);
-      assertThat(options.skipCheckingSstFileSizesOnDbOpen()).isEqualTo(true);
-    }
-  }
-
   @Test
   public void eventListeners() {
     final AtomicBoolean wasCalled1 = new AtomicBoolean();
diff --git a/java/src/test/java/org/rocksdb/OptionsTest.java b/java/src/test/java/org/rocksdb/OptionsTest.java
index c78d0f76b3a4..f720ed44e220 100644
--- a/java/src/test/java/org/rocksdb/OptionsTest.java
+++ b/java/src/test/java/org/rocksdb/OptionsTest.java
@@ -1403,15 +1403,6 @@ public void maxWriteBatchGroupSizeBytes() {
     }
   }
 
-  @Test
-  public void skipCheckingSstFileSizesOnDbOpen() {
-    try (final Options options = new Options()) {
-      assertThat(options.skipCheckingSstFileSizesOnDbOpen()).isEqualTo(false);
-      assertThat(options.setSkipCheckingSstFileSizesOnDbOpen(true)).isEqualTo(options);
-      assertThat(options.skipCheckingSstFileSizesOnDbOpen()).isEqualTo(true);
-    }
-  }
-
   @Test
   public void memtableMaxRangeDeletions() {
     try (final Options options = new Options()) {
diff --git a/options/db_options.cc b/options/db_options.cc
index dfacea8e5b22..2384355264c2 100644
--- a/options/db_options.cc
+++ b/options/db_options.cc
@@ -259,9 +259,7 @@ static std::unordered_map<std::string, OptionTypeInfo>
           OptionType::kBoolean, OptionVerificationType::kNormal,
           OptionTypeFlags::kNone}},
         {"skip_checking_sst_file_sizes_on_db_open",
-         {offsetof(struct ImmutableDBOptions,
-                   skip_checking_sst_file_sizes_on_db_open),
-          OptionType::kBoolean, OptionVerificationType::kNormal,
+         {0, OptionType::kBoolean, OptionVerificationType::kDeprecated,
           OptionTypeFlags::kNone}},
         {"new_table_reader_for_compaction_inputs",
          {0, OptionType::kBoolean, OptionVerificationType::kDeprecated,
@@ -767,8 +765,6 @@ ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options)
       write_thread_max_yield_usec(options.write_thread_max_yield_usec),
       write_thread_slow_yield_usec(options.write_thread_slow_yield_usec),
       skip_stats_update_on_db_open(options.skip_stats_update_on_db_open),
-      skip_checking_sst_file_sizes_on_db_open(
-          options.skip_checking_sst_file_sizes_on_db_open),
       wal_recovery_mode(options.wal_recovery_mode),
       allow_2pc(options.allow_2pc),
       row_cache(options.row_cache),
diff --git a/options/db_options.h b/options/db_options.h
index ef8607d8bba1..cc978d907dbb 100644
--- a/options/db_options.h
+++ b/options/db_options.h
@@ -70,7 +70,6 @@ struct ImmutableDBOptions {
   uint64_t write_thread_max_yield_usec;
   uint64_t write_thread_slow_yield_usec;
   bool skip_stats_update_on_db_open;
-  bool skip_checking_sst_file_sizes_on_db_open;
   WALRecoveryMode wal_recovery_mode;
   bool allow_2pc;
   std::shared_ptr<Cache> row_cache;
diff --git a/options/options_helper.cc b/options/options_helper.cc
index e5622d0a3238..addada94f927 100644
--- a/options/options_helper.cc
+++ b/options/options_helper.cc
@@ -149,8 +149,6 @@ void BuildDBOptions(const ImmutableDBOptions& immutable_db_options,
       immutable_db_options.write_thread_slow_yield_usec;
   options.skip_stats_update_on_db_open =
       immutable_db_options.skip_stats_update_on_db_open;
-  options.skip_checking_sst_file_sizes_on_db_open =
-      immutable_db_options.skip_checking_sst_file_sizes_on_db_open;
   options.wal_recovery_mode = immutable_db_options.wal_recovery_mode;
   options.allow_2pc = immutable_db_options.allow_2pc;
   options.row_cache = immutable_db_options.row_cache;
diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc
index bbc4db46a68a..a4555f6b9ec1 100644
--- a/options/options_settable_test.cc
+++ b/options/options_settable_test.cc
@@ -407,7 +407,6 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) {
                              "compaction_readahead_size=0;"
                              "keep_log_file_num=4890;"
                              "skip_stats_update_on_db_open=false;"
-                             "skip_checking_sst_file_sizes_on_db_open=false;"
                              "max_manifest_file_size=4295009941;"
                              "max_manifest_space_amp_pct=321;"
                              "db_log_dir=path/to/db_log_dir;"
diff --git a/unreleased_history/public_api_changes/remove_skip_checking_sst_file_sizes_on_db_open.md b/unreleased_history/public_api_changes/remove_skip_checking_sst_file_sizes_on_db_open.md
new file mode 100644
index 000000000000..385dda3e9167
--- /dev/null
+++ b/unreleased_history/public_api_changes/remove_skip_checking_sst_file_sizes_on_db_open.md
@@ -0,0 +1 @@
+Remove deprecated DB option `skip_checking_sst_file_sizes_on_db_open`. The option was deprecated in 10.5.0 and has been a no-op since then. File size validation is now always performed in parallel during DB open.

From cfc2a523a36cf7408a1d0677f4c203cecffbde0e Mon Sep 17 00:00:00 2001
From: xingbowang <shawn.xingbo.wang@gmail.com>
Date: Thu, 19 Feb 2026 14:59:37 -0800
Subject: [PATCH 491/500] Add clang-tidy-comment workflow (#14348)

Summary:
Add clang-tidy-comment workflow. This workflow allows pr clang tidy pr job to post the clang-tidy finding directly on the PR page.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14348

Test Plan: Will be tested with next clang-tidy PR

Reviewed By: joshkang97

Differential Revision: D93670150

Pulled By: xingbowang

fbshipit-source-id: 8245f9d5bde8cf800d88034c4339de9f387c5692
---
 .github/workflows/clang-tidy-comment.yml | 58 ++++++++++++++++++++++++
 1 file changed, 58 insertions(+)
 create mode 100644 .github/workflows/clang-tidy-comment.yml

diff --git a/.github/workflows/clang-tidy-comment.yml b/.github/workflows/clang-tidy-comment.yml
new file mode 100644
index 000000000000..d0953797d683
--- /dev/null
+++ b/.github/workflows/clang-tidy-comment.yml
@@ -0,0 +1,58 @@
+name: Post clang-tidy PR comment
+on:
+  workflow_run:
+    workflows: ["facebook/rocksdb/pr-jobs"]
+    types: [completed]
+
+permissions:
+  pull-requests: write
+
+jobs:
+  comment:
+    if: github.event.workflow_run.event == 'pull_request'
+    runs-on: ubuntu-latest
+    steps:
+    - name: Download clang-tidy results
+      id: download
+      uses: actions/download-artifact@v4.0.0
+      with:
+        name: clang-tidy-result
+        run-id: ${{ github.event.workflow_run.id }}
+        github-token: ${{ secrets.GITHUB_TOKEN }}
+      continue-on-error: true
+    - name: Post or update PR comment
+      if: steps.download.outcome == 'success'
+      uses: actions/github-script@v7
+      with:
+        script: |
+          const fs = require('fs');
+          if (!fs.existsSync('clang-tidy-comment.md') || !fs.existsSync('pr_number.txt')) {
+            core.info('No clang-tidy results found; skipping.');
+            return;
+          }
+          const body = fs.readFileSync('clang-tidy-comment.md', 'utf8');
+          const prNumber = parseInt(fs.readFileSync('pr_number.txt', 'utf8').trim());
+          const marker = '<!-- clang-tidy-bot -->';
+          const { data: comments } = await github.rest.issues.listComments({
+            owner: context.repo.owner,
+            repo: context.repo.repo,
+            issue_number: prNumber,
+          });
+          const existing = comments.find(c => c.body.includes(marker));
+          if (existing) {
+            await github.rest.issues.updateComment({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              comment_id: existing.id,
+              body,
+            });
+            core.info(`Updated existing comment ${existing.id}`);
+          } else {
+            await github.rest.issues.createComment({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              issue_number: prNumber,
+              body,
+            });
+            core.info('Created new PR comment');
+          }

From 98002215d04764217396971e7ecf7013c5702c3b Mon Sep 17 00:00:00 2001
From: Josh Kang <jkangs@meta.com>
Date: Thu, 19 Feb 2026 15:22:58 -0800
Subject: [PATCH 492/500] Fix interpolation search target key less than shared
 prefix length. (#14343)

Summary:
There was an edge case missed in the implementation of interpolation search for target keys that had a length smaller than the shared prefix.

E.g. first_key = "aaaaaa", last_key = "aaaaaz", target_key = "aaz". In the existing setup, we will seek to position 0, but in reality is should be seeked to the end.

#### The fix
The solution here was to also do a bounds check on the first search iteration. We utilize memcmp on the target key with the shared_prefix to determine if the target key is outside the bounds. An edge case here is if the target key itself a prefix of the shared prefix (e.g. target = "aaaa"), in this case memcmp return return 0, but the target key is actually smaller.

### Minor optimizations
- cache left,right values so we don't need to re-compute it when left/right boundaries don't change
- In ReadBe64FromKey, utilize memcpy + swap for fast path
- since we have already computed a shared_prefix, every other comparison only needs to compare the non-shared suffixes

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14343

Test Plan:
Added new unit tests to test this case

### Benchmarks

No significant regressions due to additional memcmp.

#### Configuration
- **CPU:** 192 * AMD EPYC-Genoa Processor
- **RocksDB Version:** 10.12.0
- **Compression:** Snappy
- **Entries:** 1,000,000
- **Value Size:** 100 bytes
- **Index Search Type:** interpolation_search
- **Index Shortening Mode:** 1

#### Results

| Benchmark | Params | ops/s (main) | ops/s (feature) | % change |
|-----------|--------|-------------|-----------------|----------|
| readrandom | 16B keys, no prefix | 367,264 | 369,163 | +0.52% |
| readrandom | 100B keys, prefix_size=50 | 376,066 | 371,193 | -1.29% |

Reviewed By: pdillinger

Differential Revision: D93535267

Pulled By: joshkang97

fbshipit-source-id: beda182efce1e914ff587e697b927347cfa42656
---
 table/block_based/block.cc      | 174 +++++++++++++++++++------
 table/block_based/block.h       |  26 ++--
 table/block_based/block_test.cc | 222 +++++++++++++++++++++++++++++---
 tools/db_crashtest.py           |  10 +-
 4 files changed, 355 insertions(+), 77 deletions(-)

diff --git a/table/block_based/block.cc b/table/block_based/block.cc
index d387019bb711..fe316a37be72 100644
--- a/table/block_based/block.cc
+++ b/table/block_based/block.cc
@@ -24,6 +24,7 @@
 #include "table/block_based/data_block_footer.h"
 #include "table/format.h"
 #include "util/coding.h"
+#include "util/math.h"
 
 namespace ROCKSDB_NAMESPACE {
 
@@ -154,21 +155,33 @@ struct DecodeEntryV4 {
 
 // Read first 8 bytes (starting at offset) as big-endian uint64_t, padding
 // with zeros on the right if the key is shorter. This preserves
-// lexicographic ordering. Non-user keys will also have end internal bytes
-// stripped and not counted for in the value.
+// lexicographic ordering.
+//
+// If s.size() >= offset, then returns 0.
 static uint64_t ReadBe64FromKey(Slice s, bool is_user_key, size_t offset) {
   if (!is_user_key) {
     assert(s.size() >= kNumInternalBytes);
     s = Slice(s.data(), s.size() - kNumInternalBytes);
   }
-  uint64_t val = 0;
   offset = std::min(offset, s.size());
-  size_t len = std::min(s.size() - offset, size_t{8});
-  for (size_t i = 0; i < len; i++) {
+  size_t remaining = s.size() - offset;
+
+  // fast path
+  if (remaining >= 8) {
+    uint64_t val;
+    memcpy(&val, s.data() + offset, sizeof(val));
+    if (port::kLittleEndian) {
+      return EndianSwapValue(val);
+    }
+    return val;
+  }
+
+  uint64_t val = 0;
+  for (size_t i = 0; i < remaining; i++) {
     val = (val << 8) | static_cast<uint8_t>(s.data()[offset + i]);
   }
-  if (len > 0 && len < 8) {
-    val <<= (8 - len) * 8;  // Pad zeros on the right
+  if (remaining > 0) {
+    val <<= (8 - remaining) * 8;  // Pad zeros on the right
   }
   return val;
 }
@@ -969,20 +982,24 @@ bool BlockIter<TValue>::InterpolationSeekRestartPointIndex(
   }
 
   *skip_linear_scan = false;
+  // Currently it is assumed that comparator is always bytewise comparator, but
+  // it may also be useful to to generalize to reverse bytewise in the future.
   assert(icmp_.user_comparator() == BytewiseComparator());
 
   int64_t left = -1;
   int64_t right = num_restarts_ - 1;
-  int64_t shared_prefix_len = -1;
+  size_t shared_user_prefix_len = 0;
 
   Slice left_key;
   Slice right_key;
+  Slice left_key_suffix;
+  Slice right_key_suffix;
+  Slice target_suffix = target;
   bool seek_failed = false;
-
-#ifndef NDEBUG
-  // used to validate invariants
   bool first_iter = true;
-#endif
+  uint64_t left_val = 0;
+  uint64_t right_val = 0;
+  uint64_t target_val = 0;
 
   // A poor search is when less than half the search space is reduced, because
   // binary search would do better. When there are kMaxPoorSearches in a row,
@@ -994,7 +1011,9 @@ bool BlockIter<TValue>::InterpolationSeekRestartPointIndex(
   // - left < mid <= right, and arr[left] < target < arr[right + 1]
   //
   // The first iteration is used as an early optimization to determine initial
-  // bounds, and whether target is within those bounds
+  // bounds, and whether target is within those bounds.
+  const bool is_user_key = raw_key_.IsUserKey();
+  const Slice target_user_key = is_user_key ? target : ExtractUserKey(target);
   while (left != right) {
     int64_t mid = 0;
 
@@ -1007,13 +1026,12 @@ bool BlockIter<TValue>::InterpolationSeekRestartPointIndex(
       // Interpolation seek reads left and right boundaries anyways, so we can
       // set left = 0. The invariant that left <= target is still held because
       // we early exit if left > target for the first iteration.
-      const auto usable_left = std::max<int64_t>(left, 0);
+      const uint32_t usable_left =
+          static_cast<uint32_t>(std::max<int64_t>(left, 0));
 
       // First iteration: decode both boundary keys and compute shared prefix.
-      if (shared_prefix_len < 0) {
-        assert(first_iter);
-        if (!GetRestartKey<DecodeKeyFunc>(static_cast<uint32_t>(usable_left),
-                                          &left_key)) {
+      if (first_iter) {
+        if (!GetRestartKey<DecodeKeyFunc>(usable_left, &left_key)) {
           return false;
         }
 
@@ -1022,20 +1040,78 @@ bool BlockIter<TValue>::InterpolationSeekRestartPointIndex(
           return false;
         }
 
-        // Compute the shared prefix length between smallest index key and
-        // largest index key this can be used to "normalize" the values
-        // calculated during interpolation search.
-        shared_prefix_len =
-            static_cast<int64_t>(left_key.difference_offset(right_key));
+        // Compute the shared prefix length between the user key portions of
+        // the boundary keys. This is used to "normalize" the values calculated
+        // during interpolation search.
+        shared_user_prefix_len = left_key.difference_offset(right_key);
+        if (!is_user_key) {
+          // Ensure shared_user_prefix_len is only limited to user key. Suppose
+          // that the shared prefix of both keys are extended into the internal
+          // footer. If they are not the same user keys, then it is guaranteed
+          // left is the shorter one due to bytewise comparator. For reverse
+          // bytewise, this would be flipped.
+          shared_user_prefix_len = std::min<size_t>(
+              shared_user_prefix_len, left_key.size() - kNumInternalBytes);
+          assert(shared_user_prefix_len <=
+                 right_key.size() - kNumInternalBytes);
+        }
+
+        left_val =
+            ReadBe64FromKey(left_key, is_user_key, shared_user_prefix_len);
+        right_val =
+            ReadBe64FromKey(right_key, is_user_key, shared_user_prefix_len);
+        target_val =
+            ReadBe64FromKey(target, is_user_key, shared_user_prefix_len);
+      }
+
+      assert(shared_user_prefix_len <= left_key.size() &&
+             shared_user_prefix_len <= right_key.size());
+
+      if (first_iter && shared_user_prefix_len > 0) {
+        // It is not guaranteed that the shared_prefix of the left and right
+        // boundaries is a valid prefix of the target. If it is not, then we can
+        // early exit.
+        size_t cmp_len =
+            std::min(target_user_key.size(), shared_user_prefix_len);
+        int cmp = memcmp(target_user_key.data(), left_key.data(), cmp_len);
+        if (cmp < 0 || (cmp == 0 && cmp_len < shared_user_prefix_len)) {
+#ifndef NDEBUG
+          IterKey tmp_key;
+          tmp_key.SetIsUserKey(is_user_key);
+          UpdateRawKeyAndMaybePadMinTimestamp(tmp_key, left_key);
+          assert(CompareKey(tmp_key, target) >= 0);
+#endif
+          // if target size is less than shared_prefix length, and cmp == 0,
+          // then it is guaranteed <= left
+          *skip_linear_scan = true;
+          *index = usable_left;
+          return true;
+        } else if (cmp > 0) {
+#ifndef NDEBUG
+          IterKey tmp_key;
+          tmp_key.SetIsUserKey(is_user_key);
+          UpdateRawKeyAndMaybePadMinTimestamp(tmp_key, right_key);
+          assert(CompareKey(tmp_key, target) < 0);
+#endif
+          *index = static_cast<uint32_t>(right);
+          return true;
+        }
       }
-      assert(shared_prefix_len >= 0);
 
-      size_t spl = static_cast<size_t>(shared_prefix_len);
-      assert(spl <= left_key.size() && spl <= right_key.size());
-      uint64_t left_val = ReadBe64FromKey(left_key, raw_key_.IsUserKey(), spl);
-      uint64_t right_val =
-          ReadBe64FromKey(right_key, raw_key_.IsUserKey(), spl);
-      uint64_t target_val = ReadBe64FromKey(target, raw_key_.IsUserKey(), spl);
+      assert(shared_user_prefix_len <= target_user_key.size());
+      assert(memcmp(left_key.data(), target_user_key.data(),
+                    shared_user_prefix_len) == 0);
+      assert(memcmp(right_key.data(), target_user_key.data(),
+                    shared_user_prefix_len) == 0);
+
+      if (first_iter) {
+        left_key_suffix = Slice(left_key.data() + shared_user_prefix_len,
+                                left_key.size() - shared_user_prefix_len);
+        right_key_suffix = Slice(right_key.data() + shared_user_prefix_len,
+                                 right_key.size() - shared_user_prefix_len);
+        target_suffix = Slice(target.data() + shared_user_prefix_len,
+                              target.size() - shared_user_prefix_len);
+      }
 
       if (left_val > right_val) {
         CorruptionError("left key is greater than right key");
@@ -1047,13 +1123,13 @@ bool BlockIter<TValue>::InterpolationSeekRestartPointIndex(
 
       if (target_val < left_val) {
         assert(first_iter);
-        assert(CompareKey(left_key, target) > 0);
+        assert(CompareKey(left_key_suffix, target_suffix) > 0);
         lte_left = true;
       } else if (target_val == left_val) {
         // target_val == left_val doesn't imply target == left_key
         // because ReadBe64FromKey only reads 8 bytes and skips sequence
         // numbers. We need to check actual key order.
-        if (CompareKey(left_key, target) >= 0) {
+        if (CompareKey(left_key_suffix, target_suffix) >= 0) {
           assert(first_iter);
           lte_left = true;
         }
@@ -1063,7 +1139,7 @@ bool BlockIter<TValue>::InterpolationSeekRestartPointIndex(
         if (target_val > right_val) {
           // note that we only ever guarantee arr[target] < arr[right + 1], so
           // it is possible to end up here even on non-first iteration
-          assert(CompareKey(right_key, target) < 0);
+          assert(CompareKey(right_key_suffix, target_suffix) < 0);
           gt_right = true;
         } else if (right_val == left_val) {
           // cannot divide by 0
@@ -1073,17 +1149,25 @@ bool BlockIter<TValue>::InterpolationSeekRestartPointIndex(
 
       // early exit if key is not within bounds
       if (lte_left) {
+#ifndef NDEBUG
         assert(!seek_failed);
-        UpdateRawKeyAndMaybePadMinTimestamp(left_key);
-        assert(CompareCurrentKey(target) >= 0);
+        IterKey tmp_key;
+        tmp_key.SetIsUserKey(is_user_key);
+        UpdateRawKeyAndMaybePadMinTimestamp(tmp_key, left_key);
+        assert(CompareKey(tmp_key, target) >= 0);
+#endif
         *skip_linear_scan = true;
-        *index = static_cast<uint32_t>(usable_left);
+        *index = usable_left;
         return true;
       }
       if (gt_right) {
+#ifndef NDEBUG
         assert(!seek_failed);
-        UpdateRawKeyAndMaybePadMinTimestamp(right_key);
-        assert(CompareCurrentKey(target) < 0);
+        IterKey tmp_key;
+        tmp_key.SetIsUserKey(is_user_key);
+        UpdateRawKeyAndMaybePadMinTimestamp(tmp_key, right_key);
+        assert(CompareKey(tmp_key, target) < 0);
+#endif
         *index = static_cast<uint32_t>(right);
         return true;
       }
@@ -1124,14 +1208,18 @@ bool BlockIter<TValue>::InterpolationSeekRestartPointIndex(
       return false;
     }
 
-    UpdateRawKeyAndMaybePadMinTimestamp(mid_key);
+    Slice mid_key_suffix(mid_key.data() + shared_user_prefix_len,
+                         mid_key.size() - shared_user_prefix_len);
 
-    int cmp = CompareCurrentKey(target);
+    UpdateRawKeyAndMaybePadMinTimestamp(mid_key_suffix);
+    int cmp = CompareCurrentKey(target_suffix);
 
     int64_t previous_search_space = right - left;
     if (cmp < 0) {
       left = mid;
       left_key = mid_key;
+      left_key_suffix = mid_key_suffix;
+      left_val = ReadBe64FromKey(left_key, is_user_key, shared_user_prefix_len);
     } else if (cmp > 0) {
       right = mid - 1;
       if (!seek_failed && left != right) {
@@ -1139,6 +1227,10 @@ bool BlockIter<TValue>::InterpolationSeekRestartPointIndex(
                                           &right_key)) {
           return false;
         }
+        right_key_suffix = Slice(right_key.data() + shared_user_prefix_len,
+                                 right_key.size() - shared_user_prefix_len);
+        right_val =
+            ReadBe64FromKey(right_key, is_user_key, shared_user_prefix_len);
       }
     } else {
       *skip_linear_scan = true;
@@ -1154,9 +1246,7 @@ bool BlockIter<TValue>::InterpolationSeekRestartPointIndex(
       continuous_poor_searches = 0;
     }
 
-#ifndef NDEBUG
     first_iter = false;
-#endif
   }
 
   if (left == -1) {
diff --git a/table/block_based/block.h b/table/block_based/block.h
index afe059cdd5eb..2187ff8c1e3b 100644
--- a/table/block_based/block.h
+++ b/table/block_based/block.h
@@ -582,14 +582,18 @@ class BlockIter : public InternalIteratorBase<TValue> {
     CorruptionError(error_msg);
   }
 
-  void UpdateRawKeyAndMaybePadMinTimestamp(const Slice& key) {
+  void UpdateRawKeyAndMaybePadMinTimestamp(IterKey& raw_key, const Slice& key) {
     if (pad_min_timestamp_) {
-      raw_key_.SetKeyWithPaddedMinTimestamp(key, ts_sz_);
+      raw_key.SetKeyWithPaddedMinTimestamp(key, ts_sz_);
     } else {
-      raw_key_.SetKey(key, false /* copy */);
+      raw_key.SetKey(key, false /* copy */);
     }
   }
 
+  void UpdateRawKeyAndMaybePadMinTimestamp(const Slice& key) {
+    UpdateRawKeyAndMaybePadMinTimestamp(raw_key_, key);
+  }
+
   // Must be called every time a key is found that needs to be returned to user,
   // and may be called when no key is found (as a no-op). Updates `key_`,
   // `key_buf_`, and `key_pinned_` with info about the found key.
@@ -633,7 +637,7 @@ class BlockIter : public InternalIteratorBase<TValue> {
   // Uses user comparator when the block stores user keys, otherwise uses the
   // internal key comparator. When global_seqno is not disabled, applies it to
   // the LHS key for comparison.
-  int CompareKey(const Slice& a, const Slice& b) {
+  int CompareKey(const Slice& a, const Slice& b) const {
     assert(icmp_.user_comparator() != nullptr);
     if (raw_key_.IsUserKey()) {
       assert(global_seqno_ == kDisableGlobalSequenceNumber);
@@ -644,12 +648,16 @@ class BlockIter : public InternalIteratorBase<TValue> {
     return icmp_.Compare(a, global_seqno_, b, kDisableGlobalSequenceNumber);
   }
 
-  // Compares the current key (with global seqno applied) against `other`.
-  int CompareCurrentKey(const Slice& other) {
-    if (raw_key_.IsUserKey()) {
-      return CompareKey(raw_key_.GetUserKey(), other);
+  int CompareKey(const IterKey& a, const Slice& b) const {
+    if (a.IsUserKey()) {
+      return CompareKey(a.GetUserKey(), b);
     }
-    return CompareKey(raw_key_.GetInternalKey(), other);
+    return CompareKey(a.GetInternalKey(), b);
+  }
+
+  // Compares the current key (with global seqno applied) against `other`.
+  int CompareCurrentKey(const Slice& other) const {
+    return CompareKey(raw_key_, other);
   }
 
  private:
diff --git a/table/block_based/block_test.cc b/table/block_based/block_test.cc
index a083d003ac4d..49bec09084f6 100644
--- a/table/block_based/block_test.cc
+++ b/table/block_based/block_test.cc
@@ -576,6 +576,21 @@ TEST_F(BlockTest, ReadAmpBitmapPow2) {
   ASSERT_EQ(BlockReadAmpBitmap(100, 35, stats.get()).GetBytesPerBit(), 32u);
 }
 
+void AddIndexBlockEntry(BlockBuilder& builder, const Slice& key,
+                        const BlockHandle& bh, const BlockHandle* prev,
+                        bool include_first_key,
+                        const Slice& first_internal_key = Slice()) {
+  IndexValue entry(bh, first_internal_key);
+  std::string encoded_entry;
+  entry.EncodeTo(&encoded_entry, include_first_key, nullptr);
+  std::string delta_encoded_entry;
+  if (prev) {
+    entry.EncodeTo(&delta_encoded_entry, include_first_key, prev);
+  }
+  const Slice delta_slice(delta_encoded_entry);
+  builder.Add(key, encoded_entry, &delta_slice);
+}
+
 enum class KeyDistribution { kUniform, kNonUniform };
 
 class IndexBlockTest
@@ -694,23 +709,13 @@ TEST_P(IndexBlockTest, IndexValueEncodingTest) {
                                     ts_sz);
       first_internal_key = first_key_to_persist_buf;
     }
-    IndexValue entry(block_handles[i], first_internal_key);
-    std::string encoded_entry;
-    std::string delta_encoded_entry;
-    entry.EncodeTo(&encoded_entry, includeFirstKey(), nullptr);
-    if (useValueDeltaEncoding() && i > 0) {
-      entry.EncodeTo(&delta_encoded_entry, includeFirstKey(),
-                     &last_encoded_handle);
-    }
-    last_encoded_handle = entry.handle;
-    const Slice delta_encoded_entry_slice(delta_encoded_entry);
-
-    if (keyIncludesSeq()) {
-      builder.Add(separators[i], encoded_entry, &delta_encoded_entry_slice);
-    } else {
-      const Slice user_key = ExtractUserKey(separators[i]);
-      builder.Add(user_key, encoded_entry, &delta_encoded_entry_slice);
-    }
+    const BlockHandle* prev =
+        (useValueDeltaEncoding() && i > 0) ? &last_encoded_handle : nullptr;
+    Slice add_key =
+        keyIncludesSeq() ? Slice(separators[i]) : ExtractUserKey(separators[i]);
+    AddIndexBlockEntry(builder, add_key, block_handles[i], prev,
+                       includeFirstKey(), first_internal_key);
+    last_encoded_handle = block_handles[i];
   }
 
   // read serialized contents of the block
@@ -807,6 +812,185 @@ INSTANTIATE_TEST_CASE_P(
         ::testing::Values(KeyDistribution::kUniform,
                           KeyDistribution::kNonUniform)));
 
+TEST(IndexBlockTest, InterpolationSearchPrefixBoundary) {
+  const bool kIncludeFirstKey = false;
+  const bool kUseValueDeltaEncoding = true;
+  const uint64_t kBlockSize = 50;
+
+  // 20 user keys sharing prefix "ABCDEFGHIJ" with evenly spaced suffixes.
+  const std::string kPrefix = "ABCDEFGHIJ";
+  const int kNumKeys = 20;
+  std::vector<std::string> keys;
+  keys.reserve(kNumKeys);
+  for (int i = 0; i < kNumKeys; i++) {
+    std::string suffix = std::to_string(i);
+    char formatted_suffix[4];
+    snprintf(formatted_suffix, sizeof(formatted_suffix), "%03d", i);
+    keys.push_back(kPrefix + formatted_suffix);
+  }
+
+  std::vector<BlockHandle> handles;
+  handles.reserve(kNumKeys);
+  for (int i = 0; i < kNumKeys; i++) {
+    handles.emplace_back(i * (kBlockSize + BlockBasedTable::kBlockTrailerSize),
+                         kBlockSize);
+  }
+
+  BlockBuilder builder(
+      1 /* restart_interval */, true /* use_delta_encoding */,
+      kUseValueDeltaEncoding, BlockBasedTableOptions::kDataBlockBinarySearch,
+      0.75 /* data_block_hash_table_util_ratio */, 0 /* ts_sz */,
+      false /* persist_udt */, true /* is_user_key */);
+
+  for (int i = 0; i < kNumKeys; i++) {
+    BlockHandle* prev = i > 0 ? &handles[i - 1] : nullptr;
+    AddIndexBlockEntry(builder, keys[i], handles[i], prev, kIncludeFirstKey);
+  }
+
+  Slice rawblock = builder.Finish();
+  BlockContents contents;
+  contents.data = rawblock;
+  Block reader(std::move(contents));
+
+  // Seek targets must be internal keys since SeekImpl calls ExtractUserKey().
+  auto make_target = [](const std::string& user_key) {
+    std::string target = user_key;
+    AppendInternalKeyFooter(&target, kMaxSequenceNumber, kValueTypeForSeek);
+    return target;
+  };
+
+  std::unique_ptr<InternalIteratorBase<IndexValue>> iter(
+      reader.NewIndexIterator(
+          BytewiseComparator(), kDisableGlobalSequenceNumber,
+          nullptr /* iter */, nullptr /* stats */, true /* total_order_seek */,
+          kIncludeFirstKey, false /* key_includes_seq */,
+          !kUseValueDeltaEncoding /* value_is_full */,
+          false /* block_contents_pinned */,
+          true /* user_defined_timestamps_persisted */,
+          nullptr /* prefix_index */,
+          BlockBasedTableOptions::BlockSearchType::kInterpolation));
+
+  // Case 1: target prefix < shared prefix
+  iter->Seek(make_target("AAAAAA"));
+  ASSERT_TRUE(iter->Valid());
+  EXPECT_EQ(iter->key(), keys[0]);
+
+  iter->Seek(make_target(""));
+  ASSERT_TRUE(iter->Valid());
+  EXPECT_EQ(iter->key(), keys[0]);
+
+  // Case 2: target prefix > shared prefix
+  iter->Seek(make_target("ABCDEFGHZZ"));
+  ASSERT_FALSE(iter->Valid());
+
+  // Case 3: target is the prefix
+  iter->Seek(make_target("ABCDEFGHIJ"));
+  ASSERT_TRUE(iter->Valid());
+  EXPECT_EQ(iter->key(), keys[0]);
+
+  // Case 4: target a subset of the prefix
+  iter->Seek(make_target("ABCDEFG"));
+  ASSERT_TRUE(iter->Valid());
+  EXPECT_EQ(iter->key(), keys[0]);
+}
+
+// Like the above test, but extend the shared prefix into internal bytes
+TEST(IndexBlockTest, InterpolationSearchPrefixBoundary2) {
+  const bool kIncludeFirstKey = false;
+  const bool kUseValueDeltaEncoding = true;
+  const uint64_t kBlockSize = 50;
+
+  // 20 internal keys with the same user key but decreasing sequence numbers
+  // (which is ascending InternalKeyComparator order).
+  const std::string kUserKey = "ABCDEFGHIJ";
+  const int kNumKeys = 20;
+  std::vector<std::string> keys;
+  keys.reserve(kNumKeys);
+  for (int i = 0; i < kNumKeys; i++) {
+    std::string ikey = kUserKey;
+    SequenceNumber seq = static_cast<SequenceNumber>(kNumKeys - i);
+    AppendInternalKeyFooter(&ikey, seq, kTypeValue);
+    keys.push_back(ikey);
+  }
+
+  std::vector<BlockHandle> handles;
+  handles.reserve(kNumKeys);
+  for (int i = 0; i < kNumKeys; i++) {
+    handles.emplace_back(i * (kBlockSize + BlockBasedTable::kBlockTrailerSize),
+                         kBlockSize);
+  }
+
+  BlockBuilder builder(
+      1 /* restart_interval */, true /* use_delta_encoding */,
+      kUseValueDeltaEncoding, BlockBasedTableOptions::kDataBlockBinarySearch,
+      0.75 /* data_block_hash_table_util_ratio */, 0 /* ts_sz */,
+      false /* persist_udt */, false /* is_user_key */);
+
+  for (int i = 0; i < kNumKeys; i++) {
+    BlockHandle* prev = i > 0 ? &handles[i - 1] : nullptr;
+    AddIndexBlockEntry(builder, keys[i], handles[i], prev, kIncludeFirstKey);
+  }
+
+  Slice rawblock = builder.Finish();
+  BlockContents contents;
+  contents.data = rawblock;
+  Block reader(std::move(contents));
+
+  auto make_target = [&](const std::string& user_key,
+                         SequenceNumber seq = kMaxSequenceNumber) {
+    std::string target = user_key;
+    AppendInternalKeyFooter(&target, seq, kTypeValue);
+    return target;
+  };
+
+  std::unique_ptr<InternalIteratorBase<IndexValue>> iter(
+      reader.NewIndexIterator(
+          BytewiseComparator(), kDisableGlobalSequenceNumber,
+          nullptr /* iter */, nullptr /* stats */, true /* total_order_seek */,
+          kIncludeFirstKey, true /* key_includes_seq */,
+          !kUseValueDeltaEncoding /* value_is_full */,
+          false /* block_contents_pinned */,
+          true /* user_defined_timestamps_persisted */,
+          nullptr /* prefix_index */,
+          BlockBasedTableOptions::BlockSearchType::kInterpolation));
+
+  // Seek to each existing sequence number
+  for (int i = 0; i < kNumKeys; i++) {
+    SequenceNumber seq = static_cast<SequenceNumber>(kNumKeys - i);
+    iter->Seek(make_target(kUserKey, seq));
+    ASSERT_TRUE(iter->Valid());
+    EXPECT_EQ(iter->key(), keys[i]);
+  }
+
+  // Case 1: target prefix < shared prefix
+  iter->Seek(make_target("AAAAAA"));
+  ASSERT_TRUE(iter->Valid());
+  EXPECT_EQ(iter->key(), keys[0]);
+
+  iter->Seek(make_target(""));
+  ASSERT_TRUE(iter->Valid());
+  EXPECT_EQ(iter->key(), keys[0]);
+
+  // Case 2: target prefix > shared prefix
+  iter->Seek(make_target("ABCDEFGHZZ"));
+  ASSERT_FALSE(iter->Valid());
+
+  // Case 3: target has the same user key with kMaxSequenceNumber
+  iter->Seek(make_target("ABCDEFGHIJ"));
+  ASSERT_TRUE(iter->Valid());
+  EXPECT_EQ(iter->key(), keys[0]);
+
+  // Case 4: target a subset of the prefix
+  iter->Seek(make_target("ABCDEFG"));
+  ASSERT_TRUE(iter->Valid());
+  EXPECT_EQ(iter->key(), keys[0]);
+
+  // Case 5: target key is a prefix that also extends into the internal bytes
+  // footer
+  iter->Seek(make_target("ABCDEFGHIJ" + std::string(1, kTypeValue)));
+  ASSERT_FALSE(iter->Valid());
+}
+
 class BlockPerKVChecksumTest : public DBTestBase {
  public:
   BlockPerKVChecksumTest()
@@ -856,7 +1040,7 @@ class BlockPerKVChecksumTest : public DBTestBase {
 
   template <typename TBlockIter>
   void TestSeekForPrev(std::unique_ptr<TBlockIter>& biter,
-                       size_t& verification_count, std::string k) {
+                       size_t& verification_count, const std::string& k) {
     verification_count = 0;
     biter->SeekForPrev(k);
     ASSERT_GE(verification_count, 1);
@@ -865,7 +1049,7 @@ class BlockPerKVChecksumTest : public DBTestBase {
 
   template <typename TBlockIter>
   void TestSeek(std::unique_ptr<TBlockIter>& biter, size_t& verification_count,
-                std::string k) {
+                const std::string& k) {
     verification_count = 0;
     biter->Seek(k);
     ASSERT_GE(verification_count, 1);
diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py
index c00254958b48..ff0b1c998404 100644
--- a/tools/db_crashtest.py
+++ b/tools/db_crashtest.py
@@ -174,8 +174,7 @@ def apply_random_seed_per_iteration():
     "get_current_wal_file_one_in": 0,
     # Temporarily disable hash index
     "index_type": lambda: random.choice([0, 0, 0, 2, 2, 3]),
-    # Temporarily disable interpolation search (allow for binary search '0' only)
-    "index_block_search_type": 0,
+    "index_block_search_type": lambda: random.choice([0, 1]),
     "ingest_external_file_one_in": lambda: random.choice([1000, 1000000]),
     "test_ingest_standalone_range_deletion_one_in": lambda: random.choice([0, 5, 10]),
     "iterpercent": 10,
@@ -266,9 +265,7 @@ def apply_random_seed_per_iteration():
     "stats_dump_period_sec": lambda: random.choice([0, 10, 600]),
     "compaction_ttl": lambda: random.choice([0, 0, 1, 2, 10, 100, 1000]),
     "fifo_allow_compaction": lambda: random.randint(0, 1),
-    "fifo_compaction_max_data_files_size_mb": lambda: random.choice(
-        [0, 100, 500]
-    ),
+    "fifo_compaction_max_data_files_size_mb": lambda: random.choice([0, 100, 500]),
     "fifo_compaction_use_kv_ratio_compaction": lambda: random.randint(0, 1),
     # Test small max_manifest_file_size in a smaller chance, as most of the
     # time we wnat manifest history to be preserved to help debug
@@ -979,8 +976,7 @@ def finalize_and_sanitize(src_params):
         if dest_params.get("fifo_compaction_use_kv_ratio_compaction", 0) == 1:
             if (
                 dest_params.get("fifo_allow_compaction", 0) != 1
-                or dest_params.get("fifo_compaction_max_data_files_size_mb", 0)
-                == 0
+                or dest_params.get("fifo_compaction_max_data_files_size_mb", 0) == 0
             ):
                 dest_params["fifo_compaction_use_kv_ratio_compaction"] = 0
     else:

From 407f02da19b70a955a6886c11aaef412ceb43c80 Mon Sep 17 00:00:00 2001
From: Hui Xiao <huixiao@fb.com>
Date: Thu, 19 Feb 2026 16:45:51 -0800
Subject: [PATCH 493/500] Remove deprecated SliceTransform::InRange() virtual
 method (#14353)

Summary:
**Summary/Context:**

Remove the `InRange()` virtual method from `SliceTransform` and all its overrides. This method was marked DEPRECATED, never called by RocksDB, and existed only for backward compatibility.

Also removes the `in_range` callback parameter from `rocksdb_slicetransform_create()` in the C API, which is a breaking change appropriate for a major version release.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14353

Test Plan: Make check

Reviewed By: xingbowang

Differential Revision: D93795070

Pulled By: hx235

fbshipit-source-id: 5eba23f1d038b19c494997a55e5d8ca379fbedcb
---
 db/c.cc                                                |  8 --------
 db/db_bloom_filter_test.cc                             | 10 ----------
 db/db_iterator_test.cc                                 |  5 -----
 db/db_memtable_test.cc                                 |  2 --
 db/dbformat.h                                          |  5 -----
 db/prefix_test.cc                                      |  2 --
 include/rocksdb/c.h                                    |  1 -
 include/rocksdb/slice_transform.h                      |  9 ++-------
 options/customizable_test.cc                           |  2 --
 table/table_test.cc                                    |  7 -------
 .../remove_slice_transform_inrange.md                  |  1 +
 util/slice.cc                                          | 10 ----------
 utilities/options/options_util_test.cc                 |  2 --
 13 files changed, 3 insertions(+), 61 deletions(-)
 create mode 100644 unreleased_history/public_api_changes/remove_slice_transform_inrange.md

diff --git a/db/c.cc b/db/c.cc
index 6664d8d0a06a..9282c55580ba 100644
--- a/db/c.cc
+++ b/db/c.cc
@@ -596,7 +596,6 @@ struct rocksdb_slicetransform_t : public SliceTransform {
   char* (*transform_)(void*, const char* key, size_t length,
                       size_t* dst_length);
   unsigned char (*in_domain_)(void*, const char* key, size_t length);
-  unsigned char (*in_range_)(void*, const char* key, size_t length);
 
   ~rocksdb_slicetransform_t() override { (*destructor_)(state_); }
 
@@ -611,10 +610,6 @@ struct rocksdb_slicetransform_t : public SliceTransform {
   bool InDomain(const Slice& src) const override {
     return (*in_domain_)(state_, src.data(), src.size());
   }
-
-  bool InRange(const Slice& src) const override {
-    return (*in_range_)(state_, src.data(), src.size());
-  }
 };
 
 struct rocksdb_universal_compaction_options_t {
@@ -6892,14 +6887,12 @@ rocksdb_slicetransform_t* rocksdb_slicetransform_create(
     char* (*transform)(void*, const char* key, size_t length,
                        size_t* dst_length),
     unsigned char (*in_domain)(void*, const char* key, size_t length),
-    unsigned char (*in_range)(void*, const char* key, size_t length),
     const char* (*name)(void*)) {
   rocksdb_slicetransform_t* result = new rocksdb_slicetransform_t;
   result->state_ = state;
   result->destructor_ = destructor;
   result->transform_ = transform;
   result->in_domain_ = in_domain;
-  result->in_range_ = in_range;
   result->name_ = name;
   return result;
 }
@@ -6915,7 +6908,6 @@ struct SliceTransformWrapper : public rocksdb_slicetransform_t {
     return rep_->Transform(src);
   }
   bool InDomain(const Slice& src) const override { return rep_->InDomain(src); }
-  bool InRange(const Slice& src) const override { return rep_->InRange(src); }
   static void DoNothing(void*) {}
 };
 
diff --git a/db/db_bloom_filter_test.cc b/db/db_bloom_filter_test.cc
index fd42a06866f5..eb6e51a95ec6 100644
--- a/db/db_bloom_filter_test.cc
+++ b/db/db_bloom_filter_test.cc
@@ -137,11 +137,6 @@ class SliceTransformLimitedDomainGeneric : public SliceTransform {
     // prefix will be x????
     return src.size() >= 5;
   }
-
-  bool InRange(const Slice& dst) const override {
-    // prefix will be x????
-    return dst.size() == 5;
-  }
 };
 
 // KeyMayExist can lead to a few false positives, but not false negatives.
@@ -2077,11 +2072,6 @@ class SliceTransformLimitedDomain : public SliceTransform {
     // prefix will be x????
     return src.size() >= 5 && src[0] == 'x';
   }
-
-  bool InRange(const Slice& dst) const override {
-    // prefix will be x????
-    return dst.size() == 5 && dst[0] == 'x';
-  }
 };
 
 TEST_F(DBBloomFilterTest, PrefixExtractorWithFilter1) {
diff --git a/db/db_iterator_test.cc b/db/db_iterator_test.cc
index 9862b6b8a632..d2371abfa890 100644
--- a/db/db_iterator_test.cc
+++ b/db/db_iterator_test.cc
@@ -1842,11 +1842,6 @@ class SliceTransformLimitedDomainGeneric : public SliceTransform {
     // prefix will be x????
     return src.size() >= 1;
   }
-
-  bool InRange(const Slice& dst) const override {
-    // prefix will be x????
-    return dst.size() == 1;
-  }
 };
 
 TEST_P(DBIteratorTest, IterSeekForPrevCrossingFiles) {
diff --git a/db/db_memtable_test.cc b/db/db_memtable_test.cc
index 0e3beb5edfbf..1086401dd3f9 100644
--- a/db/db_memtable_test.cc
+++ b/db/db_memtable_test.cc
@@ -117,8 +117,6 @@ class TestPrefixExtractor : public SliceTransform {
     return separator(key) != nullptr;
   }
 
-  bool InRange(const Slice& /*key*/) const override { return false; }
-
  private:
   const char* separator(const Slice& key) const {
     return static_cast<const char*>(memchr(key.data(), '_', key.size()));
diff --git a/db/dbformat.h b/db/dbformat.h
index 0ee6e9272b5f..e1b9342ff430 100644
--- a/db/dbformat.h
+++ b/db/dbformat.h
@@ -980,11 +980,6 @@ class InternalKeySliceTransform : public SliceTransform {
     return transform_->InDomain(user_key);
   }
 
-  bool InRange(const Slice& dst) const override {
-    auto user_key = ExtractUserKey(dst);
-    return transform_->InRange(user_key);
-  }
-
   const SliceTransform* user_prefix_extractor() const { return transform_; }
 
  private:
diff --git a/db/prefix_test.cc b/db/prefix_test.cc
index 35f005138662..d1559b50721b 100644
--- a/db/prefix_test.cc
+++ b/db/prefix_test.cc
@@ -220,8 +220,6 @@ class SamePrefixTransform : public SliceTransform {
     return false;
   }
 
-  bool InRange(const Slice& dst) const override { return dst == prefix_; }
-
   bool FullLengthEnabled(size_t* /*len*/) const override { return false; }
 };
 
diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h
index ffb0f793f8e6..531ca1f9413c 100644
--- a/include/rocksdb/c.h
+++ b/include/rocksdb/c.h
@@ -2725,7 +2725,6 @@ rocksdb_slicetransform_create(
     char* (*transform)(void*, const char* key, size_t length,
                        size_t* dst_length),
     unsigned char (*in_domain)(void*, const char* key, size_t length),
-    unsigned char (*in_range)(void*, const char* key, size_t length),
     const char* (*name)(void*));
 extern ROCKSDB_LIBRARY_API rocksdb_slicetransform_t*
 rocksdb_slicetransform_create_fixed_prefix(size_t);
diff --git a/include/rocksdb/slice_transform.h b/include/rocksdb/slice_transform.h
index f2515d03ffa2..f1ed46a62c50 100644
--- a/include/rocksdb/slice_transform.h
+++ b/include/rocksdb/slice_transform.h
@@ -8,9 +8,8 @@
 //
 // Class for specifying user-defined functions which perform a
 // transformation on a slice.  It is not required that every slice
-// belong to the domain and/or range of a function.  Subclasses should
-// define InDomain and InRange to determine which slices are in either
-// of these sets respectively.
+// belong to the domain of a function.  Subclasses should
+// define InDomain to determine which slices are in this set.
 
 #pragma once
 
@@ -70,10 +69,6 @@ class SliceTransform : public Customizable {
   //
   virtual bool InDomain(const Slice& key) const = 0;
 
-  // DEPRECATED: This is currently not used and remains here for backward
-  // compatibility.
-  virtual bool InRange(const Slice& /*dst*/) const { return false; }
-
   // Returns information on maximum prefix length, if there is one.
   // If Transform(x).size() == n for some keys and otherwise < n,
   // should return true and set *len = n. Returning false is safe but
diff --git a/options/customizable_test.cc b/options/customizable_test.cc
index 8549e7947fa8..53eac3cec182 100644
--- a/options/customizable_test.cc
+++ b/options/customizable_test.cc
@@ -1281,8 +1281,6 @@ class MockSliceTransform : public SliceTransform {
   Slice Transform(const Slice& /*key*/) const override { return Slice(); }
 
   bool InDomain(const Slice& /*key*/) const override { return false; }
-
-  bool InRange(const Slice& /*key*/) const override { return false; }
 };
 
 class MockMemoryAllocator : public BaseMemoryAllocator {
diff --git a/table/table_test.cc b/table/table_test.cc
index a4f06e4eacc8..e49b3ecf5b35 100644
--- a/table/table_test.cc
+++ b/table/table_test.cc
@@ -740,9 +740,6 @@ class FixedOrLessPrefixTransform : public SliceTransform {
 
   bool InDomain(const Slice& /*src*/) const override { return true; }
 
-  bool InRange(const Slice& dst) const override {
-    return (dst.size() <= prefix_len_);
-  }
   bool FullLengthEnabled(size_t* /*len*/) const override { return false; }
 };
 
@@ -5325,10 +5322,6 @@ class TestPrefixExtractor : public ROCKSDB_NAMESPACE::SliceTransform {
     return IsValid(src);
   }
 
-  bool InRange(const ROCKSDB_NAMESPACE::Slice& /*dst*/) const override {
-    return true;
-  }
-
   bool IsValid(const ROCKSDB_NAMESPACE::Slice& src) const {
     if (src.size() != 4) {
       return false;
diff --git a/unreleased_history/public_api_changes/remove_slice_transform_inrange.md b/unreleased_history/public_api_changes/remove_slice_transform_inrange.md
new file mode 100644
index 000000000000..bc007588b9f7
--- /dev/null
+++ b/unreleased_history/public_api_changes/remove_slice_transform_inrange.md
@@ -0,0 +1 @@
+Remove deprecated `SliceTransform::InRange()` virtual method and the `in_range` callback parameter from `rocksdb_slicetransform_create()` in the C API. `InRange()` was never called by RocksDB and existed only for backward compatibility.
diff --git a/util/slice.cc b/util/slice.cc
index 9ec0af132c27..cd3be5d33761 100644
--- a/util/slice.cc
+++ b/util/slice.cc
@@ -61,10 +61,6 @@ class FixedPrefixTransform : public SliceTransform {
     return (src.size() >= prefix_len_);
   }
 
-  bool InRange(const Slice& dst) const override {
-    return (dst.size() == prefix_len_);
-  }
-
   bool FullLengthEnabled(size_t* len) const override {
     *len = prefix_len_;
     return true;
@@ -111,10 +107,6 @@ class CappedPrefixTransform : public SliceTransform {
 
   bool InDomain(const Slice& /*src*/) const override { return true; }
 
-  bool InRange(const Slice& dst) const override {
-    return (dst.size() <= cap_len_);
-  }
-
   bool FullLengthEnabled(size_t* len) const override {
     *len = cap_len_;
     return true;
@@ -136,8 +128,6 @@ class NoopTransform : public SliceTransform {
 
   bool InDomain(const Slice& /*src*/) const override { return true; }
 
-  bool InRange(const Slice& /*dst*/) const override { return true; }
-
   bool SameResultWhenAppended(const Slice& /*prefix*/) const override {
     return false;
   }
diff --git a/utilities/options/options_util_test.cc b/utilities/options/options_util_test.cc
index 59397613b0dc..0bfcf704f5f5 100644
--- a/utilities/options/options_util_test.cc
+++ b/utilities/options/options_util_test.cc
@@ -216,8 +216,6 @@ class DummySliceTransform : public SliceTransform {
   // determine whether this is a valid src upon the function applies
   bool InDomain(const Slice& /*src*/) const override { return false; }
 
-  // determine whether dst=Transform(src) for some src
-  bool InRange(const Slice& /*dst*/) const override { return false; }
 };
 
 }  // namespace

From 520c3ecbf1a487166e50768c15e8c58cee842394 Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Thu, 19 Feb 2026 18:51:34 -0800
Subject: [PATCH 494/500] Prepare for 11.0 major release (#14357)

Summary:
In my last version bump, I forgot that the next release would be a major release. We can fix that now ahead of release cut.

I'm also updating folly now because I have experience resolving folly issues. Folly commit e04860553 changed libevent to build as static-only so required a change in our build.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14357

Test Plan: CI

Reviewed By: xingbowang

Differential Revision: D93797666

Pulled By: pdillinger

fbshipit-source-id: 22179da900f9dc6c5544163071079a4701c7c663
---
 folly.mk                  | 6 +++---
 include/rocksdb/version.h | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/folly.mk b/folly.mk
index 7709485f4a6c..69f99b91a9aa 100644
--- a/folly.mk
+++ b/folly.mk
@@ -47,13 +47,13 @@ ifneq ($(strip $(FOLLY_PATH)),)
 	# Add -ldl at the end as gcc resolves a symbol in a library by searching only in libraries specified later
 	# in the command line
 
-	PLATFORM_LDFLAGS += $(FOLLY_PATH)/lib/libfolly.a $(BOOST_PATH)/lib/libboost_context.a $(BOOST_PATH)/lib/libboost_filesystem.a $(BOOST_PATH)/lib/libboost_atomic.a $(BOOST_PATH)/lib/libboost_program_options.a $(BOOST_PATH)/lib/libboost_regex.a $(BOOST_PATH)/lib/libboost_system.a $(BOOST_PATH)/lib/libboost_thread.a $(DBL_CONV_PATH)/lib/libdouble-conversion.a $(LIBEVENT_PATH)/lib/libevent-2.1.so $(LIBSODIUM_PATH)/lib/libsodium.a -ldl
+	PLATFORM_LDFLAGS += $(FOLLY_PATH)/lib/libfolly.a $(BOOST_PATH)/lib/libboost_context.a $(BOOST_PATH)/lib/libboost_filesystem.a $(BOOST_PATH)/lib/libboost_atomic.a $(BOOST_PATH)/lib/libboost_program_options.a $(BOOST_PATH)/lib/libboost_regex.a $(BOOST_PATH)/lib/libboost_system.a $(BOOST_PATH)/lib/libboost_thread.a $(DBL_CONV_PATH)/lib/libdouble-conversion.a $(LIBEVENT_PATH)/lib/libevent.a $(LIBSODIUM_PATH)/lib/libsodium.a -ldl
 ifneq ($(DEBUG_LEVEL),0)
 	PLATFORM_LDFLAGS += $(FMT_LIB_PATH)/libfmtd.a $(GLOG_LIB_PATH)/libglogd.so $(GFLAGS_PATH)/lib/libgflags_debug.so.2.2
 else
 	PLATFORM_LDFLAGS += $(FMT_LIB_PATH)/libfmt.a $(GLOG_LIB_PATH)/libglog.so $(GFLAGS_PATH)/lib/libgflags.so.2.2
 endif
-	PLATFORM_LDFLAGS += -Wl,-rpath=$(LIBEVENT_PATH)/lib -Wl,-rpath=$(GLOG_LIB_PATH) -Wl,-rpath=$(GFLAGS_PATH)/lib
+	PLATFORM_LDFLAGS += -Wl,-rpath=$(GLOG_LIB_PATH) -Wl,-rpath=$(GFLAGS_PATH)/lib
 endif
 	PLATFORM_CCFLAGS += -DUSE_FOLLY -DFOLLY_NO_CONFIG
 	PLATFORM_CXXFLAGS += -DUSE_FOLLY -DFOLLY_NO_CONFIG
@@ -98,7 +98,7 @@ endif  # FMT_SOURCE_PATH
 	PLATFORM_LDFLAGS += -lglog
 endif
 
-FOLLY_COMMIT_HASH = d2d1e6f746faa9ae7a973381dbd017634d04a040
+FOLLY_COMMIT_HASH = 1e8ce1e5d35acff7b78fedbca3e7311b39f43529
 
 # For public CI runs, checkout folly in a way that can build with RocksDB.
 # This is mostly intended as a test-only simulation of Meta-internal folly
diff --git a/include/rocksdb/version.h b/include/rocksdb/version.h
index 0de620474ee1..5fe307d19af8 100644
--- a/include/rocksdb/version.h
+++ b/include/rocksdb/version.h
@@ -11,8 +11,8 @@
 
 // NOTE: in 'main' development branch, this should be the *next*
 // minor or major version number planned for release.
-#define ROCKSDB_MAJOR 10
-#define ROCKSDB_MINOR 12
+#define ROCKSDB_MAJOR 11
+#define ROCKSDB_MINOR 0
 #define ROCKSDB_PATCH 0
 
 // Make it easy to do conditional compilation based on version checks, i.e.

From f1a6759b1f190ef2bb2c6152976761411777b2ca Mon Sep 17 00:00:00 2001
From: Peter Dillinger <peterd@meta.com>
Date: Thu, 19 Feb 2026 20:26:38 -0800
Subject: [PATCH 495/500] Fix flaky
 DBTestXactLogIterator.TransactionLogIteratorCheckWhenArchive (#14349)

Summary:
a couple recent failures in this test. Waiting for purge and disabling sync points before Close should resolve the issues.

Also fixing EventListenerTest.BlobDBOnFlushCompleted because it showed up as flaky in CI for this PR

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14349

Test Plan: watch CI

Reviewed By: mszeszko-meta

Differential Revision: D93619322

Pulled By: pdillinger

fbshipit-source-id: bb9fc7d3c0ecaaeaffe4305e1ad403cbcd597484
---
 db/db_log_iter_test.cc | 12 +++++++-----
 db/listener_test.cc    | 25 +++++++++++++++----------
 2 files changed, 22 insertions(+), 15 deletions(-)

diff --git a/db/db_log_iter_test.cc b/db/db_log_iter_test.cc
index 17163210e82f..62b1f893d5c2 100644
--- a/db/db_log_iter_test.cc
+++ b/db/db_log_iter_test.cc
@@ -180,13 +180,15 @@ TEST_F(DBTestXactLogIterator, TransactionLogIteratorCheckWhenArchive) {
 
     ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
     ASSERT_OK(dbfull()->Flush(FlushOptions(), cf));
+    // Try lots of things to ensure callback is triggered
+    ASSERT_OK(dbfull()->TEST_SwitchWAL());
+    ASSERT_OK(dbfull()->TEST_WaitForBackgroundWork());
+    ASSERT_OK(dbfull()->TEST_WaitForPurge());
     delete cf;
-    // Normally hit several times; WART: perhaps more in parallel after flush
-    // FIXME: this test is flaky
-    // ASSERT_TRUE(callback_hit.LoadRelaxed());
+    ASSERT_TRUE(callback_hit.LoadRelaxed());
+    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+    Close();
   } while (ChangeCompactOptions());
-  Close();
-  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
 }
 #endif
 
diff --git a/db/listener_test.cc b/db/listener_test.cc
index 10ca451fb546..989de3583c7b 100644
--- a/db/listener_test.cc
+++ b/db/listener_test.cc
@@ -1308,16 +1308,21 @@ class BlobDBJobLevelEventListenerTest : public EventListener {
   explicit BlobDBJobLevelEventListenerTest(EventListenerTest* test)
       : test_(test), call_count_(0) {}
 
-  const VersionStorageInfo* GetVersionStorageInfo() const {
-    VersionSet* const versions = test_->dbfull()->GetVersionSet();
+  // NOTE: it's not safe to rely on test_->db_ for these functions because
+  // the DB may be in the process of closing when these are called, and the
+  // unique_ptr is set to nullptr before invoking ~DB()
+
+  const VersionStorageInfo* GetVersionStorageInfo(DB* db) const {
+    DBImpl* db_impl = static_cast_with_check<DBImpl>(db);
+    VersionSet* const versions = db_impl->GetVersionSet();
     assert(versions);
 
     ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
     EXPECT_NE(cfd, nullptr);
 
-    test_->dbfull()->TEST_LockMutex();
+    db_impl->TEST_LockMutex();
     Version* const current = cfd->current();
-    test_->dbfull()->TEST_UnlockMutex();
+    db_impl->TEST_UnlockMutex();
     EXPECT_NE(current, nullptr);
 
     const VersionStorageInfo* const storage_info = current->storage_info();
@@ -1327,8 +1332,9 @@ class BlobDBJobLevelEventListenerTest : public EventListener {
   }
 
   void CheckBlobFileAdditions(
+      DB* db,
       const std::vector<BlobFileAdditionInfo>& blob_file_addition_infos) const {
-    const auto* vstorage = GetVersionStorageInfo();
+    const auto* vstorage = GetVersionStorageInfo(db);
 
     EXPECT_FALSE(blob_file_addition_infos.empty());
 
@@ -1356,7 +1362,7 @@ class BlobDBJobLevelEventListenerTest : public EventListener {
     return result;
   }
 
-  void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override {
+  void OnFlushCompleted(DB* db, const FlushJobInfo& info) override {
     {
       std::lock_guard<std::mutex> lock(mutex_);
       IncreaseCallCount(/*mutex_locked*/ true);
@@ -1365,16 +1371,15 @@ class BlobDBJobLevelEventListenerTest : public EventListener {
 
     EXPECT_EQ(info.blob_compression_type, kNoCompression);
 
-    CheckBlobFileAdditions(info.blob_file_addition_infos);
+    CheckBlobFileAdditions(db, info.blob_file_addition_infos);
   }
 
-  void OnCompactionCompleted(DB* /*db*/,
-                             const CompactionJobInfo& info) override {
+  void OnCompactionCompleted(DB* db, const CompactionJobInfo& info) override {
     IncreaseCallCount(/*mutex_locked*/ false);
 
     EXPECT_EQ(info.blob_compression_type, kNoCompression);
 
-    CheckBlobFileAdditions(info.blob_file_addition_infos);
+    CheckBlobFileAdditions(db, info.blob_file_addition_infos);
 
     EXPECT_FALSE(info.blob_file_garbage_infos.empty());
 

From 4c89ff1102363ca1377510f0f7a7d305a60bead3 Mon Sep 17 00:00:00 2001
From: Hui Xiao <huixiao@fb.com>
Date: Thu, 19 Feb 2026 22:10:47 -0800
Subject: [PATCH 496/500] Remove deprecated SstFileWriter::Add() and
 skip_filters parameter (#14352)

Summary:
**Context/Summary:**
Remove `SstFileWriter::Add()` (deprecated in favor of `Put()`) and the `skip_filters` parameter from `SstFileWriter` constructors (deprecated in favor of setting `BlockBasedTableOptions::filter_policy` to `nullptr`).

Both APIs have zero active callers. The `skip_filters` field is also removed from `TableBuilderOptions` (write-side only; the read-side `TableReaderOptions::skip_filters` is unchanged).

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14352

Test Plan: make check

Reviewed By: xingbowang

Differential Revision: D93812389

Pulled By: hx235

fbshipit-source-id: 236b36a6e664758ab5ad90e606bc195d0a6de70f
---
 db/external_sst_file_test.cc                    | 13 +++++++++----
 include/rocksdb/sst_file_writer.h               | 17 +++--------------
 table/block_based/block_based_table_builder.cc  |  3 ---
 table/sst_file_writer.cc                        | 15 +++------------
 table/table_builder.h                           |  4 ----
 .../remove_sst_file_writer_deprecated.md        |  1 +
 6 files changed, 16 insertions(+), 37 deletions(-)
 create mode 100644 unreleased_history/public_api_changes/remove_sst_file_writer_deprecated.md

diff --git a/db/external_sst_file_test.cc b/db/external_sst_file_test.cc
index e30405d50978..c4cc09797af2 100644
--- a/db/external_sst_file_test.cc
+++ b/db/external_sst_file_test.cc
@@ -2569,14 +2569,19 @@ TEST_F(ExternalSSTFileTest, SkipBloomFilter) {
         options.statistics->getTickerCount(Tickers::BLOCK_CACHE_FILTER_ADD), 1);
   }
 
-  // Create external SST file but skip bloom filters
+  // Create external SST file but skip bloom filters by using options
+  // with no filter policy
   options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
   DestroyAndReopen(options);
   {
     std::string file_path = sst_files_dir_ + "sst_with_no_bloom.sst";
-    SstFileWriter sst_file_writer(EnvOptions(), options, nullptr, true,
-                                  Env::IOPriority::IO_TOTAL,
-                                  true /* skip_filters */);
+    // Use options with no filter policy to skip bloom filters
+    Options no_filter_options = options;
+    BlockBasedTableOptions no_filter_table_options = table_options;
+    no_filter_table_options.filter_policy.reset();
+    no_filter_options.table_factory.reset(
+        NewBlockBasedTableFactory(no_filter_table_options));
+    SstFileWriter sst_file_writer(EnvOptions(), no_filter_options);
     ASSERT_OK(sst_file_writer.Open(file_path));
     ASSERT_OK(sst_file_writer.Put("Key1", "Value1"));
     ASSERT_OK(sst_file_writer.Finish());
diff --git a/include/rocksdb/sst_file_writer.h b/include/rocksdb/sst_file_writer.h
index 6da739cf38b2..607782715a21 100644
--- a/include/rocksdb/sst_file_writer.h
+++ b/include/rocksdb/sst_file_writer.h
@@ -82,24 +82,19 @@ class SstFileWriter {
   // hint that this file pages is not needed every time we write 1MB to the
   // file. To use the rate limiter an io_priority smaller than IO_TOTAL can be
   // passed.
-  // The `skip_filters` option is DEPRECATED and could be removed in the
-  // future. Use `BlockBasedTableOptions::filter_policy` to control filter
-  // generation.
   SstFileWriter(const EnvOptions& env_options, const Options& options,
                 ColumnFamilyHandle* column_family = nullptr,
                 bool invalidate_page_cache = true,
-                Env::IOPriority io_priority = Env::IOPriority::IO_TOTAL,
-                bool skip_filters = false)
+                Env::IOPriority io_priority = Env::IOPriority::IO_TOTAL)
       : SstFileWriter(env_options, options, options.comparator, column_family,
-                      invalidate_page_cache, io_priority, skip_filters) {}
+                      invalidate_page_cache, io_priority) {}
 
   // Deprecated API
   SstFileWriter(const EnvOptions& env_options, const Options& options,
                 const Comparator* user_comparator,
                 ColumnFamilyHandle* column_family = nullptr,
                 bool invalidate_page_cache = true,
-                Env::IOPriority io_priority = Env::IOPriority::IO_TOTAL,
-                bool skip_filters = false);
+                Env::IOPriority io_priority = Env::IOPriority::IO_TOTAL);
 
   ~SstFileWriter();
 
@@ -107,12 +102,6 @@ class SstFileWriter {
   Status Open(const std::string& file_path,
               Temperature temp = Temperature::kUnknown);
 
-  // Add a Put key with value to currently opened file (deprecated)
-  // REQUIRES: user_key is after any previously added point (Put/Merge/Delete)
-  //           key according to the comparator.
-  // REQUIRES: comparator is *not* timestamp-aware.
-  [[deprecated]] Status Add(const Slice& user_key, const Slice& value);
-
   // Add a Put key with value to currently opened file
   // REQUIRES: user_key is after any previously added point (Put/Merge/Delete)
   //           key according to the comparator.
diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc
index ed288952213d..c080dcb5cca1 100644
--- a/table/block_based/block_based_table_builder.cc
+++ b/table/block_based/block_based_table_builder.cc
@@ -1269,9 +1269,6 @@ struct BlockBasedTableBuilder::Rep {
       // Apply optimize_filters_for_hits setting here when applicable by
       // skipping filter generation
       filter_builder.reset();
-    } else if (tbo.skip_filters) {
-      // For SstFileWriter skip_filters
-      filter_builder.reset();
     } else if (!table_options.filter_policy) {
       // Null filter_policy -> no filter
       filter_builder.reset();
diff --git a/table/sst_file_writer.cc b/table/sst_file_writer.cc
index fae60d82dd4d..cf6c32cdf7da 100644
--- a/table/sst_file_writer.cc
+++ b/table/sst_file_writer.cc
@@ -30,7 +30,7 @@ const size_t kFadviseTrigger = 1024 * 1024;  // 1MB
 struct SstFileWriter::Rep {
   Rep(const EnvOptions& _env_options, const Options& options,
       Env::IOPriority _io_priority, const Comparator* _user_comparator,
-      ColumnFamilyHandle* _cfh, bool _invalidate_page_cache, bool _skip_filters,
+      ColumnFamilyHandle* _cfh, bool _invalidate_page_cache,
       std::string _db_session_id)
       : env_options(_env_options),
         ioptions(options),
@@ -39,7 +39,6 @@ struct SstFileWriter::Rep {
         internal_comparator(_user_comparator),
         cfh(_cfh),
         invalidate_page_cache(_invalidate_page_cache),
-        skip_filters(_skip_filters),
         db_session_id(_db_session_id),
         ts_sz(_user_comparator->timestamp_size()),
         strip_timestamp(ts_sz > 0 &&
@@ -67,7 +66,6 @@ struct SstFileWriter::Rep {
   // The size of the file during the last time we called Fadvise to remove
   // cached pages from page cache.
   uint64_t last_fadvise_size = 0;
-  bool skip_filters;
   std::string db_session_id;
   uint64_t next_file_number = 1;
   size_t ts_sz;
@@ -305,9 +303,9 @@ SstFileWriter::SstFileWriter(const EnvOptions& env_options,
                              const Comparator* user_comparator,
                              ColumnFamilyHandle* column_family,
                              bool invalidate_page_cache,
-                             Env::IOPriority io_priority, bool skip_filters)
+                             Env::IOPriority io_priority)
     : rep_(new Rep(env_options, options, io_priority, user_comparator,
-                   column_family, invalidate_page_cache, skip_filters,
+                   column_family, invalidate_page_cache,
                    DBImpl::GenerateDbSessionId(options.env))) {
   // SstFileWriter is used to create sst files that can be added to database
   // later. Therefore, no real db_id and db_session_id are associated with it.
@@ -403,9 +401,6 @@ Status SstFileWriter::Open(const std::string& file_path, Temperature temp) {
   // assign fake file numbers to each file (into table properties) and keep
   // the same session id for the life of the SstFileWriter.
   r->next_file_number++;
-  // XXX: when we can remove skip_filters from the SstFileWriter public API
-  // we can remove it from TableBuilderOptions.
-  table_builder_options.skip_filters = r->skip_filters;
   FileTypeSet tmp_set = r->ioptions.checksum_handoff_file_types;
   r->file_writer.reset(new WritableFileWriter(
       std::move(sst_file), file_path, r->env_options, r->ioptions.clock,
@@ -424,10 +419,6 @@ Status SstFileWriter::Open(const std::string& file_path, Temperature temp) {
   return s;
 }
 
-Status SstFileWriter::Add(const Slice& user_key, const Slice& value) {
-  return rep_->Add(user_key, value, ValueType::kTypeValue);
-}
-
 Status SstFileWriter::Put(const Slice& user_key, const Slice& value) {
   return rep_->Add(user_key, value, ValueType::kTypeValue);
 }
diff --git a/table/table_builder.h b/table/table_builder.h
index 63ab175b5f60..ec9f61bbf98b 100644
--- a/table/table_builder.h
+++ b/table/table_builder.h
@@ -164,10 +164,6 @@ struct TableBuilderOptions : public TablePropertiesCollectorFactory::Context {
   const TableFileCreationReason reason;
   // END for FilterBuildingContext
 
-  // XXX: only used by BlockBasedTableBuilder for SstFileWriter. If you
-  // want to skip filters, that should be (for example) null filter_policy
-  // in the table options of the ioptions.table_factory
-  bool skip_filters = false;
   const uint64_t cur_file_num;
 };
 
diff --git a/unreleased_history/public_api_changes/remove_sst_file_writer_deprecated.md b/unreleased_history/public_api_changes/remove_sst_file_writer_deprecated.md
new file mode 100644
index 000000000000..d4096f82b359
--- /dev/null
+++ b/unreleased_history/public_api_changes/remove_sst_file_writer_deprecated.md
@@ -0,0 +1 @@
+Remove deprecated `SstFileWriter::Add()` method (use `Put()` instead) and the deprecated `skip_filters` parameter from `SstFileWriter` constructors (use `BlockBasedTableOptions::filter_policy` set to `nullptr` to skip filter generation instead).

From 6d6f7d825b5cf7534e8120834aea6443bae3fbd6 Mon Sep 17 00:00:00 2001
From: Andrew Chang <andrewrchang@meta.com>
Date: Fri, 20 Feb 2026 10:43:30 -0800
Subject: [PATCH 497/500] Check io_uring probe result in SupportedOps (#14355)

Summary:
Pull Request resolved: https://github.com/facebook/rocksdb/pull/14355

SupportedOps advertised kAsyncIO based only on the IsIOUringEnabled() weak symbol check, without verifying that the constructor's io_uring probe actually succeeded. Add a thread_local_async_read_io_urings_ null check so kAsyncIO is only reported when the probe passed. Also update the constructor to probe with the same IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN flags that ReadAsync and MultiRead use at runtime.

Reviewed By: anand1976

Differential Revision: D93780065

fbshipit-source-id: 6f51f544b267cb39d09b49949a9485f55eeae12e
---
 env/fs_posix.cc |  7 +++----
 env/io_posix.cc | 10 ++--------
 env/io_posix.h  |  5 ++++-
 3 files changed, 9 insertions(+), 13 deletions(-)

diff --git a/env/fs_posix.cc b/env/fs_posix.cc
index 4601242f6c8a..14b34ca6920d 100644
--- a/env/fs_posix.cc
+++ b/env/fs_posix.cc
@@ -1274,7 +1274,7 @@ class PosixFileSystem : public FileSystem {
   void SupportedOps(int64_t& supported_ops) override {
     supported_ops = 0;
 #if defined(ROCKSDB_IOURING_PRESENT)
-    if (IsIOUringEnabled()) {
+    if (IsIOUringEnabled() && thread_local_async_read_io_urings_) {
       // Underlying FS supports async_io
       supported_ops |= (1 << FSSupportedOps::kAsyncIO);
     }
@@ -1340,9 +1340,8 @@ PosixFileSystem::PosixFileSystem()
       page_size_(getpagesize()),
       allow_non_owner_access_(true) {
 #if defined(ROCKSDB_IOURING_PRESENT)
-  // Test whether IOUring is supported, and if it does, create a managing
-  // object for thread local point so that in the future thread-local
-  // io_uring can be created.
+  // Test whether IOUring is supported with the same flags that ReadAsync and
+  // MultiRead will use at runtime.
   struct io_uring* new_io_uring = CreateIOUring();
   if (new_io_uring != nullptr) {
     thread_local_async_read_io_urings_.reset(new ThreadLocalPtr(DeleteIOUring));
diff --git a/env/io_posix.cc b/env/io_posix.cc
index 80cb1e05aeae..a04e469cb91e 100644
--- a/env/io_posix.cc
+++ b/env/io_posix.cc
@@ -755,10 +755,7 @@ IOStatus PosixRandomAccessFile::MultiRead(FSReadRequest* reqs, size_t num_reqs,
     iu = static_cast<struct io_uring*>(
         thread_local_multi_read_io_urings_->Get());
     if (iu == nullptr) {
-      unsigned int flags = 0;
-      flags |= IORING_SETUP_SINGLE_ISSUER;
-      flags |= IORING_SETUP_DEFER_TASKRUN;
-      iu = CreateIOUring(flags);
+      iu = CreateIOUring();
       if (iu != nullptr) {
         thread_local_multi_read_io_urings_->Reset(iu);
       }
@@ -1090,10 +1087,7 @@ IOStatus PosixRandomAccessFile::ReadAsync(
     iu = static_cast<struct io_uring*>(
         thread_local_async_read_io_urings_->Get());
     if (iu == nullptr) {
-      unsigned int flags = 0;
-      flags |= IORING_SETUP_SINGLE_ISSUER;
-      flags |= IORING_SETUP_DEFER_TASKRUN;
-      iu = CreateIOUring(flags);
+      iu = CreateIOUring();
       if (iu != nullptr) {
         thread_local_async_read_io_urings_->Reset(iu);
       }
diff --git a/env/io_posix.h b/env/io_posix.h
index f8acffd60892..bca0c5836a63 100644
--- a/env/io_posix.h
+++ b/env/io_posix.h
@@ -333,8 +333,11 @@ inline void DeleteIOUring(void* p) {
   delete iu;
 }
 
-inline struct io_uring* CreateIOUring(unsigned int flags = 0) {
+inline struct io_uring* CreateIOUring() {
   struct io_uring* new_io_uring = new struct io_uring;
+  unsigned int flags = 0;
+  flags |= IORING_SETUP_SINGLE_ISSUER;
+  flags |= IORING_SETUP_DEFER_TASKRUN;
   int ret = io_uring_queue_init(kIoUringDepth, new_io_uring, flags);
   if (ret) {
     delete new_io_uring;

From 29819f37e1db478818500274f5498a668dcb1259 Mon Sep 17 00:00:00 2001
From: Hui Xiao <huixiao@fb.com>
Date: Fri, 20 Feb 2026 14:00:41 -0800
Subject: [PATCH 498/500] Remove deprecated `ReadOptions::managed`,
 `ColumnFamilyOptions::snap_refresh_nanos (#14350)

Summary:
**Context/Summary:**
Remove deprecated, unused APIs and options:
- ReadOptions::managed: This option was not used anymore. The functionality it controlled has been removed long ago.
- ColumnFamilyOptions::snap_refresh_nanos: Deprecated and unused option.

Corresponding C API (rocksdb_readoptions_set_managed) and Java API (ReadOptions.managed/setManaged) are also removed. All related checks an db_impl and db_impl_secondary iterators are cleaned up.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14350

Test Plan: make check

Reviewed By: pdillinger

Differential Revision: D93812438

Pulled By: hx235

fbshipit-source-id: e4a9d21c65f83294b6d0878286ba14024f049bac
---
 db/c.cc                                       |  5 ---
 db/db_impl/db_impl.cc                         |  7 ----
 db/db_impl/db_impl_secondary.cc               |  7 ----
 include/rocksdb/c.h                           |  3 --
 include/rocksdb/options.h                     |  7 ----
 java/rocksjni/options.cc                      | 20 -----------
 .../main/java/org/rocksdb/ReadOptions.java    | 33 -------------------
 .../java/org/rocksdb/ReadOptionsTest.java     |  9 -----
 options/options_settable_test.cc              |  2 --
 options/options_test.cc                       |  1 +
 .../remove_deprecated_apis_batch1.md          |  1 +
 11 files changed, 2 insertions(+), 93 deletions(-)
 create mode 100644 unreleased_history/public_api_changes/remove_deprecated_apis_batch1.md

diff --git a/db/c.cc b/db/c.cc
index 9282c55580ba..6e00a0761cf6 100644
--- a/db/c.cc
+++ b/db/c.cc
@@ -6082,11 +6082,6 @@ unsigned char rocksdb_readoptions_get_tailing(rocksdb_readoptions_t* opt) {
   return opt->rep.tailing;
 }
 
-void rocksdb_readoptions_set_managed(rocksdb_readoptions_t* opt,
-                                     unsigned char v) {
-  opt->rep.managed = v;
-}
-
 void rocksdb_readoptions_set_readahead_size(rocksdb_readoptions_t* opt,
                                             size_t v) {
   opt->rep.readahead_size = v;
diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc
index 34a3d2d9ea41..fea401477cc5 100644
--- a/db/db_impl/db_impl.cc
+++ b/db/db_impl/db_impl.cc
@@ -3925,10 +3925,6 @@ Iterator* DBImpl::NewIterator(const ReadOptions& _read_options,
     read_options.io_activity = Env::IOActivity::kDBIterator;
   }
 
-  if (read_options.managed) {
-    return NewErrorIterator(
-        Status::NotSupported("Managed iterator is not supported anymore."));
-  }
   Iterator* result = nullptr;
   if (read_options.read_tier == kPersistedTier) {
     return NewErrorIterator(Status::NotSupported(
@@ -4128,9 +4124,6 @@ Status DBImpl::NewIterators(
   if (read_options.io_activity == Env::IOActivity::kUnknown) {
     read_options.io_activity = Env::IOActivity::kDBIterator;
   }
-  if (read_options.managed) {
-    return Status::NotSupported("Managed iterator is not supported anymore.");
-  }
   if (read_options.read_tier == kPersistedTier) {
     return Status::NotSupported(
         "ReadTier::kPersistedData is not yet supported in iterators.");
diff --git a/db/db_impl/db_impl_secondary.cc b/db/db_impl/db_impl_secondary.cc
index f2cd4c865d1e..0db4820c3925 100644
--- a/db/db_impl/db_impl_secondary.cc
+++ b/db/db_impl/db_impl_secondary.cc
@@ -509,10 +509,6 @@ Iterator* DBImplSecondary::NewIterator(const ReadOptions& _read_options,
   if (read_options.io_activity == Env::IOActivity::kUnknown) {
     read_options.io_activity = Env::IOActivity::kDBIterator;
   }
-  if (read_options.managed) {
-    return NewErrorIterator(
-        Status::NotSupported("Managed iterator is not supported anymore."));
-  }
   if (read_options.read_tier == kPersistedTier) {
     return NewErrorIterator(Status::NotSupported(
         "ReadTier::kPersistedData is not yet supported in iterators."));
@@ -588,9 +584,6 @@ Status DBImplSecondary::NewIterators(
   if (read_options.io_activity == Env::IOActivity::kUnknown) {
     read_options.io_activity = Env::IOActivity::kDBIterator;
   }
-  if (read_options.managed) {
-    return Status::NotSupported("Managed iterator is not supported anymore.");
-  }
   if (read_options.read_tier == kPersistedTier) {
     return Status::NotSupported(
         "ReadTier::kPersistedData is not yet supported in iterators.");
diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h
index 531ca1f9413c..3ab0c8551d34 100644
--- a/include/rocksdb/c.h
+++ b/include/rocksdb/c.h
@@ -2311,9 +2311,6 @@ extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_tailing(
     rocksdb_readoptions_t*, unsigned char);
 extern ROCKSDB_LIBRARY_API unsigned char rocksdb_readoptions_get_tailing(
     rocksdb_readoptions_t*);
-// The functionality that this option controlled has been removed.
-extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_managed(
-    rocksdb_readoptions_t*, unsigned char);
 extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_readahead_size(
     rocksdb_readoptions_t*, size_t);
 extern ROCKSDB_LIBRARY_API size_t
diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index 96c33656953d..3c0898fdc82b 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -305,9 +305,6 @@ struct ColumnFamilyOptions : public AdvancedColumnFamilyOptions {
   // Dynamically changeable through SetOptions() API
   uint64_t max_bytes_for_level_base = 256 * 1048576;
 
-  // Deprecated.
-  uint64_t snap_refresh_nanos = 0;
-
   // Disable automatic compactions. Manual compactions can still
   // be issued on this column family
   //
@@ -2109,10 +2106,6 @@ struct ReadOptions {
   // that were inserted into the database after the creation of the iterator.
   bool tailing = false;
 
-  // This options is not used anymore. It was to turn on a functionality that
-  // has been removed. DEPRECATED
-  bool managed = false;
-
   // Enable a total order seek regardless of index format (e.g. hash index)
   // used in the table. Some table format (e.g. plain table) may not support
   // this option.
diff --git a/java/rocksjni/options.cc b/java/rocksjni/options.cc
index 1e921461b8ce..3166e6625090 100644
--- a/java/rocksjni/options.cc
+++ b/java/rocksjni/options.cc
@@ -8031,26 +8031,6 @@ jboolean Java_org_rocksdb_ReadOptions_tailing(JNIEnv*, jclass, jlong jhandle) {
   return reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jhandle)->tailing;
 }
 
-/*
- * Class:     org_rocksdb_ReadOptions
- * Method:    managed
- * Signature: (J)Z
- */
-jboolean Java_org_rocksdb_ReadOptions_managed(JNIEnv*, jclass, jlong jhandle) {
-  return reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jhandle)->managed;
-}
-
-/*
- * Class:     org_rocksdb_ReadOptions
- * Method:    setManaged
- * Signature: (JZ)V
- */
-void Java_org_rocksdb_ReadOptions_setManaged(JNIEnv*, jclass, jlong jhandle,
-                                             jboolean jmanaged) {
-  reinterpret_cast<ROCKSDB_NAMESPACE::ReadOptions*>(jhandle)->managed =
-      static_cast<bool>(jmanaged);
-}
-
 /*
  * Class:     org_rocksdb_ReadOptions
  * Method:    totalOrderSeek
diff --git a/java/src/main/java/org/rocksdb/ReadOptions.java b/java/src/main/java/org/rocksdb/ReadOptions.java
index 8cc9883d23cd..4be053376c61 100644
--- a/java/src/main/java/org/rocksdb/ReadOptions.java
+++ b/java/src/main/java/org/rocksdb/ReadOptions.java
@@ -186,37 +186,6 @@ public ReadOptions setTailing(final boolean tailing) {
     return this;
   }
 
-  /**
-   * Returns whether managed iterators will be used.
-   *
-   * @return the setting of whether managed iterators will be used,
-   *     by default false
-   *
-   * @deprecated This options is not used anymore.
-   */
-  @Deprecated
-  public boolean managed() {
-    assert(isOwningHandle());
-    return managed(nativeHandle_);
-  }
-
-  /**
-   * Specify to create a managed iterator -- a special iterator that
-   * uses less resources by having the ability to free its underlying
-   * resources on request.
-   *
-   * @param managed if true, then managed iterators will be enabled.
-   * @return the reference to the current ReadOptions.
-   *
-   * @deprecated This options is not used anymore.
-   */
-  @Deprecated
-  public ReadOptions setManaged(final boolean managed) {
-    assert(isOwningHandle());
-    setManaged(nativeHandle_, managed);
-    return this;
-  }
-
   /**
    * Returns whether a total seek order will be used
    *
@@ -819,8 +788,6 @@ protected final void disposeInternal(final long handle) {
   private static native void setReadTier(long handle, byte readTierValue);
   private static native boolean tailing(long handle);
   private static native void setTailing(long handle, boolean tailing);
-  private static native boolean managed(long handle);
-  private static native void setManaged(long handle, boolean managed);
   private static native boolean totalOrderSeek(long handle);
   private static native void setTotalOrderSeek(long handle, boolean totalOrderSeek);
   private static native boolean prefixSameAsStart(long handle);
diff --git a/java/src/test/java/org/rocksdb/ReadOptionsTest.java b/java/src/test/java/org/rocksdb/ReadOptionsTest.java
index baf51bf9b4b5..3ff4e6bba6d9 100644
--- a/java/src/test/java/org/rocksdb/ReadOptionsTest.java
+++ b/java/src/test/java/org/rocksdb/ReadOptionsTest.java
@@ -98,15 +98,6 @@ public void readTier() {
     }
   }
 
-  @SuppressWarnings("deprecated")
-  @Test
-  public void managed() {
-    try (final ReadOptions opt = new ReadOptions()) {
-      opt.setManaged(true);
-      assertThat(opt.managed()).isTrue();
-    }
-  }
-
   @Test
   public void totalOrderSeek() {
     try (final ReadOptions opt = new ReadOptions()) {
diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc
index a4555f6b9ec1..3c12a9e859a9 100644
--- a/options/options_settable_test.cc
+++ b/options/options_settable_test.cc
@@ -541,8 +541,6 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) {
        sizeof(std::shared_ptr<CompressionManager>)},
       {offsetof(struct ColumnFamilyOptions, prefix_extractor),
        sizeof(std::shared_ptr<const SliceTransform>)},
-      {offsetof(struct ColumnFamilyOptions, snap_refresh_nanos),
-       sizeof(uint64_t)},
       {offsetof(struct ColumnFamilyOptions, table_factory),
        sizeof(std::shared_ptr<TableFactory>)},
       {offsetof(struct ColumnFamilyOptions, cf_paths),
diff --git a/options/options_test.cc b/options/options_test.cc
index 7111872f541b..1828dc9d86a3 100644
--- a/options/options_test.cc
+++ b/options/options_test.cc
@@ -2417,6 +2417,7 @@ TEST_F(OptionsOldApiTest, GetOptionsFromMapTest) {
       {"max_compaction_bytes", "21"},
       {"soft_rate_limit", "1.1"},
       {"hard_rate_limit", "2.1"},
+      {"snap_refresh_nanos", "1000000"},
       {"rate_limit_delay_max_milliseconds", "100"},
       {"hard_pending_compaction_bytes_limit", "211"},
       {"arena_block_size", "22"},
diff --git a/unreleased_history/public_api_changes/remove_deprecated_apis_batch1.md b/unreleased_history/public_api_changes/remove_deprecated_apis_batch1.md
new file mode 100644
index 000000000000..3897c4918d87
--- /dev/null
+++ b/unreleased_history/public_api_changes/remove_deprecated_apis_batch1.md
@@ -0,0 +1 @@
+Remove deprecated, unused APIs and options: `ReadOptions::managed` and `ColumnFamilyOptions::snap_refresh_nanos`. Corresponding C and Java APIs are also removed.

From aa7571c3ad048d1a4435bb4df4bfd8a793ec492b Mon Sep 17 00:00:00 2001
From: xingbowang <shawn.xingbo.wang@gmail.com>
Date: Fri, 20 Feb 2026 14:58:21 -0800
Subject: [PATCH 499/500] Run clang-tidy in github CI (#14347)

Summary:
RocksDB has been using clang-tidy for a long time inside Meta. However, it is not efficient for external contributor, as the result from clang-tidy has to be ferried back through internal contributor. This PR added support to run clang-tidy on external github CI. It added .clang-tidy file based on internal version. It run clang-tidy in a separate pr job and a workflow step would post the pr job result to the PR itself. See example below.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14347

Test Plan: Github CI

Reviewed By: archang19

Differential Revision: D93862467

Pulled By: xingbowang

fbshipit-source-id: bb4330241036894deb619470efd73a7041a8b62f
---
 .clang-tidy                              |  88 +++
 .github/workflows/clang-tidy-comment.yml | 102 ++--
 .github/workflows/pr-jobs.yml            | 100 +++-
 Makefile                                 |  13 +-
 tools/run_clang_tidy.py                  | 683 +++++++++++++++++++++++
 5 files changed, 919 insertions(+), 67 deletions(-)
 create mode 100644 .clang-tidy
 create mode 100755 tools/run_clang_tidy.py

diff --git a/.clang-tidy b/.clang-tidy
new file mode 100644
index 000000000000..033b1cbfe576
--- /dev/null
+++ b/.clang-tidy
@@ -0,0 +1,88 @@
+# When making changes, verify the output of:
+#   clang-tidy -list-checks
+---
+Checks: "-*,\
+  bugprone-argument-comment,\
+  bugprone-dangling-handle,\
+  bugprone-fold-init-type,\
+  bugprone-forward-declaration-namespace,\
+  bugprone-forwarding-reference-overload,\
+  bugprone-shadow,\
+  bugprone-sizeof-*,\
+  bugprone-string-constructor,\
+  bugprone-undefined-memory-manipulation,\
+  bugprone-unused-return-value,\
+  bugprone-use-after-move,\
+  cert-env33-c,\
+  cert-err58-cpp,\
+  cert-msc30-c,\
+  cert-msc50-cpp,\
+  clang-analyzer-core.NullDereference,\
+  clang-analyzer-core.StackAddressEscape,\
+  clang-analyzer-deadcode.DeadStores,\
+  clang-diagnostic-*,\
+  -clang-diagnostic-missing-designated-field-initializers,\
+  concurrency-mt-unsafe,\
+  cppcoreguidelines-avoid-non-const-global-variables,\
+  cppcoreguidelines-missing-std-forward,\
+  cppcoreguidelines-pro-type-member-init,\
+  cppcoreguidelines-special-member-functions,\
+  cppcoreguidelines-virtual-class-destructor,\
+  google-build-using-namespace,\
+  google-explicit-constructor,\
+  google-readability-avoid-underscore-in-googletest-name,\
+  misc-definitions-in-headers,\
+  misc-redundant-expression,\
+  modernize-make-shared,\
+  modernize-use-emplace,\
+  modernize-use-noexcept,\
+  modernize-use-override,\
+  modernize-use-using,\
+  performance-faster-string-find,\
+  performance-for-range-copy,\
+  performance-implicit-conversion-in-loop,\
+  performance-inefficient-algorithm,\
+  performance-inefficient-string-concatenation,\
+  performance-inefficient-vector-operation,\
+  performance-move-const-arg,\
+  performance-move-constructor-init,\
+  performance-no-automatic-move,\
+  performance-no-int-to-ptr,\
+  performance-noexcept-move-constructor,\
+  performance-noexcept-swap,\
+  performance-trivially-destructible,\
+  performance-type-promotion-in-math-fn,\
+  performance-unnecessary-copy-initialization,\
+  performance-unnecessary-value-param,\
+  readability-braces-around-statements,\
+  readability-duplicate-include,\
+  readability-isolate-declaration,\
+  readability-operators-representation,\
+  readability-redundant-string-init"
+
+WarningsAsErrors: "bugprone-use-after-move"
+
+CheckOptions:
+- key: bugprone-easily-swappable-parameters.MinimumLength
+  value: 4
+- key: cppcoreguidelines-avoid-non-const-global-variables.AllowThreadLocal
+  value: true
+- key: cppcoreguidelines-special-member-functions.AllowSoleDefaultDtor
+  value: true
+- key: cppcoreguidelines-special-member-functions.AllowImplicitlyDeletedCopyOrMove
+  value: true
+- key: modernize-use-using.IgnoreExternC
+  value: true
+- key: performance-move-const-arg.CheckTriviallyCopyableMove
+  value: false
+- key: performance-unnecessary-value-param.AllowedTypes
+  value: '[Pp]ointer$;[Pp]tr$;[Rr]ef(erence)?$'
+- key: performance-unnecessary-copy-initialization.AllowedTypes
+  value: '[Pp]ointer$;[Pp]tr$;[Rr]ef(erence)?$'
+- key: readability-operators-representation.BinaryOperators
+  value: '&&;&=;&;|;~;!;!=;||;|=;^;^='
+- key: readability-redundant-string-init.StringNames
+  value: '::std::basic_string'
+- key: readability-named-parameter.InsertPlainNamesInForwardDecls
+  value: true
+...
diff --git a/.github/workflows/clang-tidy-comment.yml b/.github/workflows/clang-tidy-comment.yml
index d0953797d683..1a07a7ce70fd 100644
--- a/.github/workflows/clang-tidy-comment.yml
+++ b/.github/workflows/clang-tidy-comment.yml
@@ -1,58 +1,86 @@
-name: Post clang-tidy PR comment
+name: clang-tidy
 on:
-  workflow_run:
-    workflows: ["facebook/rocksdb/pr-jobs"]
-    types: [completed]
+  pull_request_target:
+    types: [opened, synchronize, reopened]
 
 permissions:
   pull-requests: write
 
 jobs:
-  comment:
-    if: github.event.workflow_run.event == 'pull_request'
-    runs-on: ubuntu-latest
+  clang-tidy:
+    if: github.repository_owner == 'facebook'
+    runs-on:
+      labels: 4-core-ubuntu
+    container:
+      image: ghcr.io/facebook/rocksdb_ubuntu:24.0
     steps:
-    - name: Download clang-tidy results
-      id: download
-      uses: actions/download-artifact@v4.0.0
+    - uses: actions/checkout@v4.1.0
       with:
-        name: clang-tidy-result
-        run-id: ${{ github.event.workflow_run.id }}
-        github-token: ${{ secrets.GITHUB_TOKEN }}
+        ref: ${{ github.event.pull_request.head.sha }}
+        fetch-depth: 2
+    - name: Mark workspace as safe for git
+      run: git config --global --add safe.directory $GITHUB_WORKSPACE
+    - name: Install clang-tidy
+      run: apt-get update && apt-get install -y clang-tidy
+    - name: Generate compile_commands.json
+      run: |
+        mkdir build && cd build
+        cmake -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
+              -DCMAKE_C_COMPILER=clang-18 \
+              -DCMAKE_CXX_COMPILER=clang++-18 ..
+        cd ..
+        ln -sf build/compile_commands.json compile_commands.json
+    - name: Run clang-tidy on changed files
+      id: clang-tidy
+      run: |
+        python3 tools/run_clang_tidy.py \
+          -j 4 \
+          --diff-base HEAD~1 \
+          --github-annotations \
+          --github-step-summary \
+          --comment-output clang-tidy-comment.md
       continue-on-error: true
-    - name: Post or update PR comment
-      if: steps.download.outcome == 'success'
+    - name: Post clang-tidy results to PR
+      if: always()
       uses: actions/github-script@v7
       with:
         script: |
           const fs = require('fs');
-          if (!fs.existsSync('clang-tidy-comment.md') || !fs.existsSync('pr_number.txt')) {
-            core.info('No clang-tidy results found; skipping.');
+          const commentPath = 'clang-tidy-comment.md';
+          if (!fs.existsSync(commentPath)) {
+            core.info('No comment file generated; skipping PR comment.');
             return;
           }
-          const body = fs.readFileSync('clang-tidy-comment.md', 'utf8');
-          const prNumber = parseInt(fs.readFileSync('pr_number.txt', 'utf8').trim());
+          const body = fs.readFileSync(commentPath, 'utf8');
           const marker = '<!-- clang-tidy-bot -->';
-          const { data: comments } = await github.rest.issues.listComments({
-            owner: context.repo.owner,
-            repo: context.repo.repo,
-            issue_number: prNumber,
-          });
-          const existing = comments.find(c => c.body.includes(marker));
-          if (existing) {
-            await github.rest.issues.updateComment({
-              owner: context.repo.owner,
-              repo: context.repo.repo,
-              comment_id: existing.id,
-              body,
-            });
-            core.info(`Updated existing comment ${existing.id}`);
-          } else {
-            await github.rest.issues.createComment({
+          const prNumber = context.payload.pull_request.number;
+          try {
+            const { data: comments } = await github.rest.issues.listComments({
               owner: context.repo.owner,
               repo: context.repo.repo,
               issue_number: prNumber,
-              body,
             });
-            core.info('Created new PR comment');
+            const existing = comments.find(c => c.body.includes(marker));
+            if (existing) {
+              await github.rest.issues.updateComment({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                comment_id: existing.id,
+                body,
+              });
+              core.info(`Updated existing comment ${existing.id}`);
+            } else {
+              await github.rest.issues.createComment({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                issue_number: prNumber,
+                body,
+              });
+              core.info('Created new PR comment');
+            }
+          } catch (err) {
+            core.warning(`Could not post PR comment: ${err.message}`);
           }
+    - name: Fail if clang-tidy found issues
+      if: steps.clang-tidy.outcome == 'failure'
+      run: exit 1
diff --git a/.github/workflows/pr-jobs.yml b/.github/workflows/pr-jobs.yml
index 6e8080078095..0b5ea4b81d23 100644
--- a/.github/workflows/pr-jobs.yml
+++ b/.github/workflows/pr-jobs.yml
@@ -1,7 +1,18 @@
 name: facebook/rocksdb/pr-jobs
 on: [push, pull_request]
 permissions: {}
+env:
+  # Set to a job name to run only that job (on any repo), or leave empty for
+  # normal behavior (all jobs on facebook repo only).
+  ONLY_JOB: ''
 jobs:
+  config:
+    runs-on: ubuntu-latest
+    outputs:
+      only_job: ${{ steps.set.outputs.only_job }}
+    steps:
+    - id: set
+      run: echo "only_job=$ONLY_JOB" >> "$GITHUB_OUTPUT"
   # NOTE: multiple workflows would be recommended, but the current GHA UI in
   # PRs doesn't make it clear when there's an overall error with a workflow,
   # making it easy to overlook something broken. Grouping everything into one
@@ -19,6 +30,10 @@ jobs:
   # increasing the risk of misconfiguration, especially on forks that might
   # want to run with this GHA setup.
   #
+  # SELECTIVE JOB EXECUTION: Set the ONLY_JOB env var at the top of this file
+  # to a job name (e.g. "build-linux-clang-tidy") to run only that job,
+  # bypassing the repository owner check. Leave it empty for normal behavior.
+  #
   # DEBUGGING WITH SSH: Temporarily add this as a job step, either before the
   # step of interest without the "if:" line or after the failing step with the
   # "if:" line. Then use ssh command printed in CI output.
@@ -30,7 +45,8 @@ jobs:
 
   # ======================== Fast Initial Checks ====================== #
   check-format-and-targets:
-    if: ${{ github.repository_owner == 'facebook' }}
+    if: needs.config.outputs.only_job == 'check-format-and-targets' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook')
+    needs: config
     runs-on: ubuntu-24.04
     steps:
     - uses: actions/checkout@v4.1.0
@@ -66,7 +82,8 @@ jobs:
         SANITY_CHECK=1 LONG_TEST=1 tools/check_format_compatible.sh
   # ========================= Linux With Tests ======================== #
   build-linux:
-    if: ${{ github.repository_owner == 'facebook' }}
+    if: needs.config.outputs.only_job == 'build-linux' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook')
+    needs: config
     runs-on:
       labels: 16-core-ubuntu
     container:
@@ -78,7 +95,8 @@ jobs:
     - run: make V=1 J=32 -j32 check
     - uses: "./.github/actions/post-steps"
   build-linux-cmake-mingw:
-    if: ${{ github.repository_owner == 'facebook' }}
+    if: needs.config.outputs.only_job == 'build-linux-cmake-mingw' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook')
+    needs: config
     runs-on:
       labels: 4-core-ubuntu
     container:
@@ -97,7 +115,8 @@ jobs:
         mkdir build && cd build && cmake -DJNI=1 -DWITH_GFLAGS=OFF .. -DCMAKE_C_COMPILER=x86_64-w64-mingw32-gcc -DCMAKE_CXX_COMPILER=x86_64-w64-mingw32-g++ -DCMAKE_SYSTEM_NAME=Windows && make -j4 rocksdb rocksdbjni
     - uses: "./.github/actions/post-steps"
   build-linux-make-with-folly:
-    if: ${{ github.repository_owner == 'facebook' }}
+    if: needs.config.outputs.only_job == 'build-linux-make-with-folly' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook')
+    needs: config
     runs-on:
       labels: 16-core-ubuntu
     container:
@@ -116,7 +135,8 @@ jobs:
     - run: USE_FOLLY=1 LIB_MODE=static V=1 make -j32 check
     - uses: "./.github/actions/post-steps"
   build-linux-make-with-folly-lite-no-test:
-    if: ${{ github.repository_owner == 'facebook' }}
+    if: needs.config.outputs.only_job == 'build-linux-make-with-folly-lite-no-test' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook')
+    needs: config
     runs-on:
       labels: 16-core-ubuntu
     container:
@@ -130,7 +150,8 @@ jobs:
     - run: USE_FOLLY_LITE=1 EXTRA_CXXFLAGS=-DGLOG_USE_GLOG_EXPORT V=1 make -j32 all
     - uses: "./.github/actions/post-steps"
   build-linux-cmake-with-folly-coroutines:
-    if: ${{ github.repository_owner == 'facebook' }}
+    if: needs.config.outputs.only_job == 'build-linux-cmake-with-folly-coroutines' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook')
+    needs: config
     runs-on:
       labels: 16-core-ubuntu
     container:
@@ -149,7 +170,8 @@ jobs:
     - run: "(mkdir build && cd build && cmake -DUSE_COROUTINES=1 -DWITH_GFLAGS=1 -DROCKSDB_BUILD_SHARED=0 .. && make VERBOSE=1 -j20 && ctest -j20)"
     - uses: "./.github/actions/post-steps"
   build-linux-cmake-with-benchmark-no-thread-status:
-    if: ${{ github.repository_owner == 'facebook' }}
+    if: needs.config.outputs.only_job == 'build-linux-cmake-with-benchmark-no-thread-status' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook')
+    needs: config
     runs-on:
       labels: 16-core-ubuntu
     container:
@@ -161,7 +183,8 @@ jobs:
     - run: mkdir build && cd build && cmake -DWITH_GFLAGS=1 -DWITH_BENCHMARK=1 -DCMAKE_CXX_FLAGS=-DNROCKSDB_THREAD_STATUS .. && make VERBOSE=1 -j20 && ctest -j20
     - uses: "./.github/actions/post-steps"
   build-linux-encrypted_env-no_compression:
-    if: ${{ github.repository_owner == 'facebook' }}
+    if: needs.config.outputs.only_job == 'build-linux-encrypted_env-no_compression' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook')
+    needs: config
     runs-on:
       labels: 16-core-ubuntu
     container:
@@ -175,7 +198,8 @@ jobs:
     - uses: "./.github/actions/post-steps"
   # ======================== Linux No Test Runs ======================= #
   build-linux-release:
-    if: ${{ github.repository_owner == 'facebook' }}
+    if: needs.config.outputs.only_job == 'build-linux-release' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook')
+    needs: config
     runs-on:
       labels: 16-core-ubuntu
     container:
@@ -201,7 +225,8 @@ jobs:
     - run: if ./trace_analyzer --version; then false; else true; fi
     - uses: "./.github/actions/post-steps"
   build-linux-clang-13-no_test_run:
-    if: ${{ github.repository_owner == 'facebook' }}
+    if: needs.config.outputs.only_job == 'build-linux-clang-13-no_test_run' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook')
+    needs: config
     runs-on:
       labels: 8-core-ubuntu
     container:
@@ -217,7 +242,8 @@ jobs:
     - run: CC=clang-13 CXX=clang++-13 USE_CLANG=1 EXTRA_CXXFLAGS=-stdlib=libc++ EXTRA_LDFLAGS=-stdlib=libc++ DEBUG_LEVEL=0 make -j32 shared_lib
     - uses: "./.github/actions/post-steps"
   build-linux-clang-18-no_test_run:
-    if: ${{ github.repository_owner == 'facebook' }}
+    if: needs.config.outputs.only_job == 'build-linux-clang-18-no_test_run' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook')
+    needs: config
     runs-on:
       labels: 16-core-ubuntu
     container:
@@ -231,7 +257,8 @@ jobs:
     - run: CC=clang-18 CXX=clang++-18 USE_CLANG=1 DEBUG_LEVEL=0 make -j32 release
     - uses: "./.github/actions/post-steps"
   build-linux-gcc-14-no_test_run:
-    if: ${{ github.repository_owner == 'facebook' }}
+    if: needs.config.outputs.only_job == 'build-linux-gcc-14-no_test_run' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook')
+    needs: config
     runs-on:
       labels: 16-core-ubuntu
     container:
@@ -245,7 +272,8 @@ jobs:
 
   # ======================== Linux Other Checks ======================= #
   build-linux-clang18-clang-analyze:
-    if: ${{ github.repository_owner == 'facebook' }}
+    if: needs.config.outputs.only_job == 'build-linux-clang18-clang-analyze' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook')
+    needs: config
     runs-on:
       labels: 16-core-ubuntu
     container:
@@ -263,8 +291,10 @@ jobs:
       with:
         name: scan-build-report
         path: scan_build_report.tar.gz
+
   build-linux-unity-and-headers:
-    if: ${{ github.repository_owner == 'facebook' }}
+    if: needs.config.outputs.only_job == 'build-linux-unity-and-headers' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook')
+    needs: config
     runs-on:
       labels: 4-core-ubuntu
     container:
@@ -278,7 +308,8 @@ jobs:
     - run: make V=1 -j8 -k check-headers
     - uses: "./.github/actions/post-steps"
   build-linux-mini-crashtest:
-    if: ${{ github.repository_owner == 'facebook' }}
+    if: needs.config.outputs.only_job == 'build-linux-mini-crashtest' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook')
+    needs: config
     runs-on:
       labels: 4-core-ubuntu
     container:
@@ -291,7 +322,8 @@ jobs:
     - uses: "./.github/actions/post-steps"
   # ======================= Linux with Sanitizers ===================== #
   build-linux-clang18-asan-ubsan:
-    if: ${{ github.repository_owner == 'facebook' }}
+    if: needs.config.outputs.only_job == 'build-linux-clang18-asan-ubsan' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook')
+    needs: config
     runs-on:
       labels: 32-core-ubuntu
     container:
@@ -303,7 +335,8 @@ jobs:
     - run: COMPILE_WITH_ASAN=1 COMPILE_WITH_UBSAN=1 CC=clang-18 CXX=clang++-18 ROCKSDB_DISABLE_ALIGNED_NEW=1 USE_CLANG=1 make V=1 -j40 check
     - uses: "./.github/actions/post-steps"
   build-linux-clang18-mini-tsan:
-    if: ${{ github.repository_owner == 'facebook' }}
+    if: needs.config.outputs.only_job == 'build-linux-clang18-mini-tsan' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook')
+    needs: config
     runs-on:
       labels: 32-core-ubuntu
     container:
@@ -315,7 +348,8 @@ jobs:
     - run: COMPILE_WITH_TSAN=1 CC=clang-18 CXX=clang++-18 ROCKSDB_DISABLE_ALIGNED_NEW=1 USE_CLANG=1 make V=1 -j32 check
     - uses: "./.github/actions/post-steps"
   build-linux-static_lib-alt_namespace-status_checked:
-    if: ${{ github.repository_owner == 'facebook' }}
+    if: needs.config.outputs.only_job == 'build-linux-static_lib-alt_namespace-status_checked' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook')
+    needs: config
     runs-on:
       labels: 16-core-ubuntu
     container:
@@ -328,7 +362,8 @@ jobs:
     - uses: "./.github/actions/post-steps"
   # ========================= MacOS build only ======================== #
   build-macos:
-    if: ${{ github.repository_owner == 'facebook' }}
+    if: needs.config.outputs.only_job == 'build-macos' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook')
+    needs: config
     runs-on: macos-15-xlarge
     env:
       ROCKSDB_DISABLE_JEMALLOC: 1
@@ -345,7 +380,8 @@ jobs:
     - uses: "./.github/actions/post-steps"
   # ========================= MacOS with Tests ======================== #
   build-macos-cmake:
-    if: ${{ github.repository_owner == 'facebook' }}
+    if: needs.config.outputs.only_job == 'build-macos-cmake' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook')
+    needs: config
     runs-on: macos-15-xlarge
     strategy:
       matrix:
@@ -378,7 +414,8 @@ jobs:
   # ======================== Windows with Tests ======================= #
   # NOTE: some windows jobs are in "nightly" to save resources
   build-windows-vs2022:
-    if: ${{ github.repository_owner == 'facebook' }}
+    if: needs.config.outputs.only_job == 'build-windows-vs2022' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook')
+    needs: config
     runs-on: windows-8-core
     env:
       CMAKE_GENERATOR: Visual Studio 17 2022
@@ -388,7 +425,8 @@ jobs:
     - uses: "./.github/actions/windows-build-steps"
   # ============================ Java Jobs ============================ #
   build-linux-java:
-    if: ${{ github.repository_owner == 'facebook' }}
+    if: needs.config.outputs.only_job == 'build-linux-java' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook')
+    needs: config
     runs-on:
       labels: 4-core-ubuntu
     container:
@@ -419,7 +457,8 @@ jobs:
       run: make V=1 J=8 -j8 jtest
     # post-steps skipped because of compatibility issues with docker image
   build-linux-java-static:
-    if: ${{ github.repository_owner == 'facebook' }}
+    if: needs.config.outputs.only_job == 'build-linux-java-static' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook')
+    needs: config
     runs-on:
       labels: 4-core-ubuntu
     container:
@@ -450,7 +489,8 @@ jobs:
       run: make V=1 J=8 -j8 rocksdbjavastatic
     # post-steps skipped because of compatibility issues with docker image
   build-macos-java:
-    if: ${{ github.repository_owner == 'facebook' }}
+    if: needs.config.outputs.only_job == 'build-macos-java' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook')
+    needs: config
     runs-on: macos-15-xlarge
     env:
       JAVA_HOME: "/Library/Java/JavaVirtualMachines/liberica-jdk-8.jdk/Contents/Home"
@@ -473,7 +513,8 @@ jobs:
       run: make V=1 J=16 -j16 jtest
     - uses: "./.github/actions/post-steps"
   build-macos-java-static:
-    if: ${{ github.repository_owner == 'facebook' }}
+    if: needs.config.outputs.only_job == 'build-macos-java-static' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook')
+    needs: config
     runs-on: macos-15-xlarge
     env:
       JAVA_HOME: "/Library/Java/JavaVirtualMachines/liberica-jdk-8.jdk/Contents/Home"
@@ -495,7 +536,8 @@ jobs:
       run: make V=1 J=16 -j16 rocksdbjavastaticosx
     - uses: "./.github/actions/post-steps"
   build-macos-java-static-universal:
-    if: ${{ github.repository_owner == 'facebook' }}
+    if: needs.config.outputs.only_job == 'build-macos-java-static-universal' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook')
+    needs: config
     runs-on: macos-15-xlarge
     env:
       JAVA_HOME: "/Library/Java/JavaVirtualMachines/liberica-jdk-8.jdk/Contents/Home"
@@ -517,7 +559,8 @@ jobs:
       run: make V=1 J=16 -j16 rocksdbjavastaticosx_ub
     - uses: "./.github/actions/post-steps"
   build-linux-java-pmd:
-    if: ${{ github.repository_owner == 'facebook' }}
+    if: needs.config.outputs.only_job == 'build-linux-java-pmd' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook')
+    needs: config
     runs-on:
       labels: 4-core-ubuntu
     container:
@@ -543,7 +586,8 @@ jobs:
         name: maven-site
         path: "${{ github.workspace }}/java/target/site"
   build-linux-arm:
-    if: ${{ github.repository_owner == 'facebook' }}
+    if: needs.config.outputs.only_job == 'build-linux-arm' || (needs.config.outputs.only_job == '' && github.repository_owner == 'facebook')
+    needs: config
     runs-on:
       labels: 4-core-ubuntu-arm
     steps:
diff --git a/Makefile b/Makefile
index 3e05fc174443..40d7437c2f6e 100644
--- a/Makefile
+++ b/Makefile
@@ -805,7 +805,7 @@ endif  # PLATFORM_SHARED_EXT
 .PHONY: check clean coverage ldb_tests package dbg gen-pc build_size \
 	release tags tags0 valgrind_check format static_lib shared_lib all \
 	rocksdbjavastatic rocksdbjava install install-static install-shared \
-	uninstall analyze tools tools_lib check-headers checkout_folly
+	uninstall analyze tools tools_lib check-headers checkout_folly clang-tidy
 
 all: $(LIBRARY) $(BENCHMARKS) tools tools_lib test_libs $(TESTS)
 
@@ -1222,6 +1222,15 @@ check-buck-targets:
 check-sources:
 	build_tools/check-sources.sh
 
+# Run clang-tidy on locally changed files, filtered to changed lines only.
+# Requires compile_commands.json (generate with cmake -DCMAKE_EXPORT_COMPILE_COMMANDS=ON).
+# Override CLANG_TIDY_BINARY and CLANG_TIDY_JOBS as needed:
+#   make clang-tidy CLANG_TIDY_BINARY=/usr/bin/clang-tidy CLANG_TIDY_JOBS=8
+CLANG_TIDY_BINARY ?= /opt/homebrew/opt/llvm/bin/clang-tidy
+CLANG_TIDY_JOBS ?= $(shell nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)
+clang-tidy:
+	python3 tools/run_clang_tidy.py --clang-tidy-binary $(CLANG_TIDY_BINARY) -j $(CLANG_TIDY_JOBS)
+
 package:
 	bash build_tools/make_package.sh $(SHARED_MAJOR).$(SHARED_MINOR)
 
@@ -2552,7 +2561,7 @@ list_all_tests:
 
 # Remove the rules for which dependencies should not be generated and see if any are left.
 #If so, include the dependencies; if not, do not include the dependency files
-ROCKS_DEP_RULES=$(filter-out clean format check-format check-buck-targets check-headers check-sources jclean jtest package analyze tags rocksdbjavastatic% unity.% unity_test checkout_folly, $(MAKECMDGOALS))
+ROCKS_DEP_RULES=$(filter-out clean format check-format check-buck-targets check-headers check-sources clang-tidy jclean jtest package analyze tags rocksdbjavastatic% unity.% unity_test checkout_folly, $(MAKECMDGOALS))
 ifneq ("$(ROCKS_DEP_RULES)", "")
 -include $(DEPFILES)
 endif
diff --git a/tools/run_clang_tidy.py b/tools/run_clang_tidy.py
new file mode 100755
index 000000000000..0ae70bbd5829
--- /dev/null
+++ b/tools/run_clang_tidy.py
@@ -0,0 +1,683 @@
+#!/usr/bin/env python3
+"""
+Run clang-tidy on locally changed code and filter results to changed lines.
+
+This script detects local changes by combining:
+  1. Uncommitted changes (staged + unstaged + untracked files)
+  2. Committed-but-not-pushed changes (local commits not in the remote)
+
+It then runs clang-tidy only on the changed .cc/.cpp files (in parallel) and
+filters the output to show only warnings on lines that were actually modified.
+
+Usage:
+  python3 tools/run_clang_tidy.py [options]
+
+Examples:
+  # Basic usage (auto-detects base from remote tracking branch):
+  python3 tools/run_clang_tidy.py
+
+  # Specify clang-tidy binary and parallelism:
+  python3 tools/run_clang_tidy.py --clang-tidy-binary clang-tidy-18 -j 14
+
+  # Explicit diff base (useful in CI where the checkout is a merge commit):
+  python3 tools/run_clang_tidy.py --diff-base HEAD~1
+
+  # Save full (unfiltered) output to a file:
+  python3 tools/run_clang_tidy.py -o full_output.txt
+
+  # Show all warnings, not just on changed lines:
+  python3 tools/run_clang_tidy.py --verbose
+
+  # CI mode with GitHub annotations and step summary:
+  python3 tools/run_clang_tidy.py --diff-base HEAD~1 --github-annotations --github-step-summary
+"""
+
+import argparse
+import json
+import os
+import re
+import subprocess
+import sys
+import time
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+
+def log(msg=""):
+    """Print and flush immediately so output is visible in real time."""
+    print(msg, flush=True)
+
+
+def run_cmd(cmd, cwd=None):
+    """Run a command and return (stdout, return_code)."""
+    result = subprocess.run(cmd, capture_output=True, text=True, cwd=cwd)
+    return result.stdout.strip(), result.returncode
+
+
+def get_repo_root():
+    """Get the git repository root directory."""
+    out, rc = run_cmd(["git", "rev-parse", "--show-toplevel"])
+    if rc != 0:
+        log("Error: not inside a git repository.")
+        sys.exit(1)
+    return out
+
+
+def find_remote_base(repo_root):
+    """
+    Auto-detect the base commit to diff against.
+
+    Strategy:
+      1. Use the upstream tracking branch of the current branch if available.
+      2. Fall back to origin/main, origin/master, upstream/main, upstream/master.
+      3. Return the merge-base of HEAD and that ref.
+    """
+    out, rc = run_cmd(
+        ["git", "rev-parse", "--abbrev-ref", "--symbolic-full-name", "@{upstream}"],
+        cwd=repo_root,
+    )
+    if rc == 0 and out:
+        base_ref = out
+    else:
+        base_ref = None
+        for candidate in [
+            "origin/main", "origin/master",
+            "upstream/main", "upstream/master",
+        ]:
+            _, rc = run_cmd(["git", "rev-parse", "--verify", candidate], cwd=repo_root)
+            if rc == 0:
+                base_ref = candidate
+                break
+        if base_ref is None:
+            log(
+                "Error: cannot determine remote base branch.\n"
+                "Set an upstream: git branch --set-upstream-to=<remote>/<branch>\n"
+                "Or use --diff-base <ref> to specify the base explicitly."
+            )
+            sys.exit(1)
+
+    merge_base, rc = run_cmd(["git", "merge-base", "HEAD", base_ref], cwd=repo_root)
+    if rc != 0:
+        log(f"Error: cannot compute merge-base with {base_ref}.")
+        sys.exit(1)
+
+    return merge_base, base_ref
+
+
+def resolve_diff_base(diff_base_arg, repo_root):
+    """
+    Resolve --diff-base to a concrete commit SHA.
+
+    When --diff-base is given, resolve the ref and return (sha, display_name).
+    Otherwise, fall back to auto-detection via find_remote_base().
+    """
+    if diff_base_arg:
+        sha, rc = run_cmd(
+            ["git", "rev-parse", "--verify", diff_base_arg], cwd=repo_root
+        )
+        if rc != 0:
+            log(f"Error: --diff-base '{diff_base_arg}' is not a valid git ref.")
+            sys.exit(1)
+        return sha, diff_base_arg
+
+    return find_remote_base(repo_root)
+
+
+def parse_diff_for_changed_lines(diff_text):
+    """
+    Parse a unified diff and return {relative_path: set_of_new_line_numbers}.
+
+    Only tracks added/modified lines (the '+' side of the diff).
+    """
+    changed = {}
+    current_file = None
+
+    for line in diff_text.split("\n"):
+        m = re.match(r"^\+\+\+ b/(.*)", line)
+        if m:
+            current_file = m.group(1)
+            changed.setdefault(current_file, set())
+            continue
+
+        m = re.match(r"^@@ -\d+(?:,\d+)? \+(\d+)(?:,(\d+))? @@", line)
+        if m and current_file is not None:
+            start = int(m.group(1))
+            count = int(m.group(2)) if m.group(2) else 1
+            if count == 0:
+                continue
+            for i in range(start, start + count):
+                changed[current_file].add(i)
+
+    return changed
+
+
+def collect_changed_lines(repo_root, diff_base_arg=None):
+    """
+    Collect every locally-changed file and its changed line numbers.
+
+    When diff_base_arg is provided, diffs HEAD against that ref directly.
+    Otherwise, auto-detects the remote base and also picks up uncommitted
+    and untracked changes.
+    """
+    base_sha, base_label = resolve_diff_base(diff_base_arg, repo_root)
+    head_short, _ = run_cmd(["git", "rev-parse", "--short", "HEAD"], cwd=repo_root)
+
+    log_out, _ = run_cmd(
+        ["git", "log", "--oneline", f"{base_sha}..HEAD"], cwd=repo_root
+    )
+    local_commits = [l for l in log_out.split("\n") if l.strip()] if log_out else []
+
+    log(f"  Diff base   : {base_label}  ({base_sha[:10]})")
+    log(f"  HEAD        : {head_short}")
+    log(f"  Commits in range: {len(local_commits)}")
+    for c in local_commits[:20]:
+        log(f"    {c}")
+    if len(local_commits) > 20:
+        log(f"    ... and {len(local_commits) - 20} more")
+
+    all_changed = {}
+    src_pattern = r"\.(cc|cpp|h)$"
+
+    def merge_into(target, source):
+        for f, lines in source.items():
+            target.setdefault(f, set()).update(lines)
+
+    # Committed changes: base..HEAD
+    diff_committed, _ = run_cmd(
+        ["git", "diff", "--unified=0", f"{base_sha}..HEAD",
+         "--", "*.cc", "*.cpp", "*.h"],
+        cwd=repo_root,
+    )
+    merge_into(all_changed, parse_diff_for_changed_lines(diff_committed))
+
+    # When using explicit --diff-base (e.g. CI), skip working-tree checks
+    if diff_base_arg is None:
+        # Unstaged changes
+        diff_unstaged, _ = run_cmd(
+            ["git", "diff", "--unified=0", "--", "*.cc", "*.cpp", "*.h"],
+            cwd=repo_root,
+        )
+        merge_into(all_changed, parse_diff_for_changed_lines(diff_unstaged))
+
+        # Staged changes
+        diff_staged, _ = run_cmd(
+            ["git", "diff", "--unified=0", "--cached", "--", "*.cc", "*.cpp", "*.h"],
+            cwd=repo_root,
+        )
+        merge_into(all_changed, parse_diff_for_changed_lines(diff_staged))
+
+        # Untracked files — treat every line as changed
+        untracked_out, _ = run_cmd(
+            ["git", "ls-files", "--others", "--exclude-standard"], cwd=repo_root
+        )
+        for f in untracked_out.split("\n"):
+            f = f.strip()
+            if not f or not re.search(src_pattern, f):
+                continue
+            filepath = os.path.join(repo_root, f)
+            if os.path.isfile(filepath):
+                with open(filepath) as fh:
+                    line_count = sum(1 for _ in fh)
+                all_changed.setdefault(f, set()).update(range(1, line_count + 1))
+
+    return all_changed
+
+
+def load_compile_db(compile_db_path, repo_root):
+    """Load compile_commands.json and return a set of known file paths (both abs and rel)."""
+    if not os.path.exists(compile_db_path):
+        log(
+            f"Error: {compile_db_path} not found.\n"
+            "Generate it with:\n"
+            "  mkdir build && cd build && cmake -DCMAKE_EXPORT_COMPILE_COMMANDS=ON ..\n"
+            "  ln -sf build/compile_commands.json compile_commands.json"
+        )
+        sys.exit(1)
+
+    with open(compile_db_path) as f:
+        db = json.load(f)
+
+    files = set()
+    prefix = repo_root.rstrip("/") + "/"
+    for entry in db:
+        abs_path = entry["file"]
+        files.add(abs_path)
+        if abs_path.startswith(prefix):
+            files.add(abs_path[len(prefix):])
+    return files
+
+
+def invoke_clang_tidy(clang_tidy_bin, compile_db_dir, filepath, repo_root):
+    """Run clang-tidy on a single file. Returns (filepath, combined_output, return_code)."""
+    abs_path = os.path.join(repo_root, filepath)
+    cmd = [clang_tidy_bin, "-p", compile_db_dir, abs_path]
+    try:
+        result = subprocess.run(cmd, capture_output=True, text=True, timeout=600)
+        return filepath, result.stdout + result.stderr, result.returncode
+    except subprocess.TimeoutExpired:
+        return filepath, f"TIMEOUT after 600s: {abs_path}\n", -1
+
+
+def emit_github_annotations(filtered_lines, repo_root):
+    """
+    Emit GitHub Actions workflow commands for each warning/error so they
+    appear as inline annotations on the PR diff.
+
+    Format: ::warning file={path},line={line}::{message}
+
+    Errors are emitted first so they occupy annotation slots before warnings,
+    since GitHub Actions caps display at 10 warnings and 10 errors per step.
+    Use --github-step-summary for the complete report.
+    """
+    prefix = repo_root.rstrip("/") + "/"
+
+    annotations = []
+    for line in filtered_lines:
+        m = re.match(r"^(.*?):(\d+):(\d+): (warning|error): (.+)", line)
+        if not m:
+            continue
+        filepath = m.group(1)
+        lineno = m.group(2)
+        col = m.group(3)
+        severity = m.group(4)
+        message = m.group(5)
+
+        rel_path = filepath
+        if filepath.startswith(prefix):
+            rel_path = filepath[len(prefix):]
+
+        gh_level = "error" if severity == "error" else "warning"
+        annotations.append((gh_level, rel_path, lineno, col, message))
+
+    annotations.sort(key=lambda a: (0 if a[0] == "error" else 1, a[1], int(a[2])))
+
+    for gh_level, rel_path, lineno, col, message in annotations:
+        log(f"::{gh_level} file={rel_path},line={lineno},col={col}::{message}")
+
+
+COMMENT_MARKER = "<!-- clang-tidy-bot -->"
+
+
+def _format_diagnostic_counts(diagnostic_lines):
+    """Return a human-readable string like '3 error(s) and 5 warning(s)'."""
+    n_errors = sum(1 for l in diagnostic_lines if re.search(r": error:", l))
+    n_warnings = sum(1 for l in diagnostic_lines if re.search(r": warning:", l))
+    parts = []
+    if n_errors:
+        parts.append(f"{n_errors} error(s)")
+    if n_warnings:
+        parts.append(f"{n_warnings} warning(s)")
+    return " and ".join(parts) if parts else "0 findings"
+
+
+def build_markdown_summary(diagnostic_lines, by_check, wall_time, repo_root):
+    """Build a Markdown summary string from clang-tidy results."""
+    prefix = repo_root.rstrip("/") + "/"
+    lines = []
+
+    if not diagnostic_lines:
+        lines.append("## :white_check_mark: clang-tidy: No findings on changed lines")
+        lines.append(f"\nCompleted in {wall_time:.1f}s.")
+    else:
+        counts = _format_diagnostic_counts(diagnostic_lines)
+        has_errors = any(re.search(r": error:", l) for l in diagnostic_lines)
+        icon = ":x:" if has_errors else ":warning:"
+        lines.append(f"## {icon} clang-tidy: {counts} on changed lines")
+        lines.append(f"\nCompleted in {wall_time:.1f}s.\n")
+
+        lines.append("### Summary by check\n")
+        lines.append("| Check | Count |")
+        lines.append("|-------|------:|")
+        for check in sorted(by_check):
+            lines.append(f"| `{check}` | {len(by_check[check])} |")
+        lines.append(f"| **Total** | **{len(diagnostic_lines)}** |")
+
+        lines.append("\n### Details\n")
+        by_file = {}
+        for line in diagnostic_lines:
+            m = re.match(r"^(.*?):(\d+):(\d+): (warning|error): (.+)", line)
+            if m:
+                filepath = m.group(1)
+                if filepath.startswith(prefix):
+                    filepath = filepath[len(prefix):]
+                by_file.setdefault(filepath, []).append(line)
+
+        for filepath in sorted(by_file):
+            n_e = sum(1 for l in by_file[filepath] if ": error:" in l)
+            n_w = sum(1 for l in by_file[filepath] if ": warning:" in l)
+            file_parts = []
+            if n_e:
+                file_parts.append(f"{n_e} error(s)")
+            if n_w:
+                file_parts.append(f"{n_w} warning(s)")
+            file_summary = ", ".join(file_parts)
+            lines.append(f"<details><summary><code>{filepath}</code> ({file_summary})</summary>\n")
+            lines.append("```")
+            for w in by_file[filepath]:
+                clean = w
+                if clean.startswith(prefix):
+                    clean = clean[len(prefix):]
+                lines.append(clean)
+            lines.append("```\n")
+            lines.append("</details>\n")
+
+    return "\n".join(lines)
+
+
+def write_github_step_summary(warning_lines, by_check, wall_time, repo_root):
+    """
+    Write a Markdown summary to $GITHUB_STEP_SUMMARY.
+
+    This appears on the job's summary page in GitHub Actions and has no
+    practical size limit, unlike annotations (capped at 10+10 per step).
+    """
+    summary_path = os.environ.get("GITHUB_STEP_SUMMARY")
+    if not summary_path:
+        log("  $GITHUB_STEP_SUMMARY not set; skipping step summary.")
+        return
+
+    md = build_markdown_summary(warning_lines, by_check, wall_time, repo_root)
+    with open(summary_path, "a") as f:
+        f.write(md + "\n")
+    log(f"  Step summary written to $GITHUB_STEP_SUMMARY")
+
+
+def write_comment_file(path, warning_lines, by_check, wall_time, repo_root):
+    """
+    Write the Markdown summary to a file for posting as a PR comment.
+
+    Includes a hidden HTML marker so the workflow can find and update an
+    existing comment instead of creating duplicates on re-runs.
+    """
+    md = build_markdown_summary(warning_lines, by_check, wall_time, repo_root)
+    with open(path, "w") as f:
+        f.write(COMMENT_MARKER + "\n" + md + "\n")
+    log(f"  Comment body written to {path}")
+
+
+def filter_to_changed_lines(raw_output, changed_lines, repo_root):
+    """
+    Parse clang-tidy output and keep only diagnostics whose location falls on
+    a changed line.  Also keeps note/context lines that follow a kept warning.
+    """
+    prefix = repo_root.rstrip("/") + "/"
+    results = []
+    keep_current = False
+
+    for line in raw_output.split("\n"):
+        m = re.match(r"^(.*?):(\d+):\d+: (warning|error): (.+)", line)
+        if m:
+            filepath_abs = m.group(1)
+            lineno = int(m.group(2))
+
+            rel_path = filepath_abs
+            if filepath_abs.startswith(prefix):
+                rel_path = filepath_abs[len(prefix):]
+
+            if rel_path in changed_lines and lineno in changed_lines[rel_path]:
+                keep_current = True
+                results.append(line)
+            else:
+                keep_current = False
+            continue
+
+        if keep_current:
+            if line.strip():
+                results.append(line)
+            else:
+                keep_current = False
+
+    return results
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Run clang-tidy on locally changed code, filtered to changed lines.",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog=__doc__,
+    )
+    parser.add_argument(
+        "--clang-tidy-binary",
+        default="clang-tidy",
+        help="Path to clang-tidy binary (default: %(default)s)",
+    )
+    parser.add_argument(
+        "-p", "--compile-commands-dir",
+        default=None,
+        help="Directory containing compile_commands.json (default: repo root)",
+    )
+    parser.add_argument(
+        "-j", "--jobs",
+        type=int, default=None,
+        help="Number of parallel clang-tidy jobs (default: CPU count)",
+    )
+    parser.add_argument(
+        "--diff-base",
+        default=None,
+        metavar="REF",
+        help=(
+            "Explicit git ref to diff against (e.g. HEAD~1, a commit SHA, or a "
+            "branch name). When set, only the committed diff from REF to HEAD is "
+            "analyzed (working-tree changes are ignored). This is useful in CI "
+            "where the checkout is a merge commit: --diff-base HEAD~1 gives "
+            "exactly the PR's changes. When omitted, the base is auto-detected "
+            "from the remote tracking branch."
+        ),
+    )
+    parser.add_argument(
+        "-o", "--output",
+        default=None,
+        help="Write full (unfiltered) clang-tidy output to this file",
+    )
+    parser.add_argument(
+        "-v", "--verbose",
+        action="store_true",
+        help="Print all clang-tidy output, not just warnings on changed lines",
+    )
+    parser.add_argument(
+        "--github-annotations",
+        action="store_true",
+        help=(
+            "Emit GitHub Actions workflow commands (::warning) so that "
+            "clang-tidy findings appear as inline annotations on the PR's "
+            "\"Files changed\" tab.  Note: GitHub caps this at 10 warnings "
+            "and 10 errors per step."
+        ),
+    )
+    parser.add_argument(
+        "--github-step-summary",
+        action="store_true",
+        help=(
+            "Write a Markdown summary of all findings to $GITHUB_STEP_SUMMARY. "
+            "This appears on the job's summary page with no size limit, "
+            "complementing the capped inline annotations."
+        ),
+    )
+    parser.add_argument(
+        "--comment-output",
+        default=None,
+        metavar="FILE",
+        help=(
+            "Write a Markdown summary to FILE for posting as a PR comment. "
+            "Includes a hidden marker so the CI workflow can find and update "
+            "an existing comment instead of creating duplicates on re-runs."
+        ),
+    )
+    args = parser.parse_args()
+
+    repo_root = get_repo_root()
+    compile_db_dir = args.compile_commands_dir or repo_root
+    compile_db_path = os.path.join(compile_db_dir, "compile_commands.json")
+    jobs = args.jobs or os.cpu_count() or 4
+
+    # ------------------------------------------------------------------
+    # Step 1 — detect changes
+    # ------------------------------------------------------------------
+    log("=" * 70)
+    log("Step 1: Detecting changes")
+    log("=" * 70)
+    changed_lines = collect_changed_lines(repo_root, args.diff_base)
+
+    if not changed_lines:
+        log("\nNo changes detected. Nothing to check.")
+        if args.comment_output:
+            write_comment_file(args.comment_output, [], {}, 0, repo_root)
+        return 0
+
+    total_lines = sum(len(v) for v in changed_lines.values())
+    log(f"\n  {len(changed_lines)} file(s) changed, {total_lines} line(s) total:")
+    for f in sorted(changed_lines):
+        log(f"    {f}  ({len(changed_lines[f])} lines)")
+
+    # ------------------------------------------------------------------
+    # Step 2 — select compilable files present in compile_commands.json
+    # ------------------------------------------------------------------
+    db_files = load_compile_db(compile_db_path, repo_root)
+    cc_changed = sorted(
+        f for f in changed_lines
+        if re.search(r"\.(cc|cpp)$", f)
+        and (f in db_files or os.path.join(repo_root, f) in db_files)
+    )
+
+    if not cc_changed:
+        log("\nNo compilable changed files found in compile_commands.json.")
+        if args.comment_output:
+            write_comment_file(args.comment_output, [], {}, 0, repo_root)
+        return 0
+
+    log(f"\n{'=' * 70}")
+    log(f"Step 2: Running clang-tidy on {len(cc_changed)} file(s)  [jobs={jobs}]")
+    log("=" * 70)
+
+    # ------------------------------------------------------------------
+    # Step 3 — run clang-tidy in parallel via ThreadPoolExecutor
+    # ------------------------------------------------------------------
+    all_raw_output = []
+    all_filtered = []
+    t0 = time.time()
+
+    with ThreadPoolExecutor(max_workers=jobs) as pool:
+        futures = {
+            pool.submit(
+                invoke_clang_tidy,
+                args.clang_tidy_binary,
+                compile_db_dir,
+                f,
+                repo_root,
+            ): f
+            for f in cc_changed
+        }
+
+        done = 0
+        for future in as_completed(futures):
+            done += 1
+            fpath = futures[future]
+            fpath, output, rc = future.result()
+            all_raw_output.append(output)
+
+            filtered = filter_to_changed_lines(output, changed_lines, repo_root)
+            all_filtered.extend(filtered)
+
+            n_diags = sum(
+                1 for l in filtered if re.search(r": (warning|error):", l)
+            )
+            elapsed = time.time() - t0
+            if rc == 0:
+                status = "clean"
+            elif rc == -1:
+                status = "TIMEOUT"
+            else:
+                status = f"{n_diags} on changed lines"
+            log(
+                f"  [{done:>{len(str(len(cc_changed)))}}/{len(cc_changed)}]"
+                f" {elapsed:6.1f}s  {fpath}  ({status})"
+            )
+
+    wall_time = time.time() - t0
+
+    # ------------------------------------------------------------------
+    # Optional: save full output
+    # ------------------------------------------------------------------
+    if args.output:
+        with open(args.output, "w") as f:
+            f.write("\n".join(all_raw_output))
+        log(f"\nFull clang-tidy output saved to {args.output}")
+
+    # ------------------------------------------------------------------
+    # Step 4 — report filtered results
+    # ------------------------------------------------------------------
+    log(f"\n{'=' * 70}")
+    log(f"Step 3: Results  (wall time {wall_time:.1f}s)")
+    log("=" * 70)
+
+    if args.verbose:
+        log("\n--- Full output ---")
+        for chunk in all_raw_output:
+            log(chunk)
+        log("--- End full output ---\n")
+
+    diagnostic_lines = [
+        l for l in all_filtered if re.search(r": (warning|error):", l)
+    ]
+    if not diagnostic_lines:
+        log("\nNo findings on changed lines. Clean!")
+        if args.github_step_summary:
+            write_github_step_summary([], {}, wall_time, repo_root)
+        if args.comment_output:
+            write_comment_file(args.comment_output, [], {}, wall_time, repo_root)
+        return 0
+
+    error_lines = [l for l in diagnostic_lines if re.search(r": error:", l)]
+    warning_lines = [l for l in diagnostic_lines if re.search(r": warning:", l)]
+
+    by_check = {}
+    for line in diagnostic_lines:
+        m = re.search(r"\[([\w.-]+)\]\s*$", line)
+        check = m.group(1) if m else "unknown"
+        by_check.setdefault(check, []).append(line)
+
+    parts = []
+    if error_lines:
+        parts.append(f"{len(error_lines)} error(s)")
+    if warning_lines:
+        parts.append(f"{len(warning_lines)} warning(s)")
+    log(f"\n{' and '.join(parts)} on changed lines:\n")
+    for line in all_filtered:
+        log(line)
+
+    if args.github_annotations:
+        log(f"\n{'=' * 70}")
+        log("Emitting GitHub Actions annotations")
+        log("=" * 70)
+        emit_github_annotations(all_filtered, repo_root)
+
+    if args.github_step_summary:
+        log(f"\n{'=' * 70}")
+        log("Writing GitHub step summary")
+        log("=" * 70)
+        write_github_step_summary(diagnostic_lines, by_check, wall_time, repo_root)
+
+    if args.comment_output:
+        log(f"\n{'=' * 70}")
+        log("Writing PR comment body")
+        log("=" * 70)
+        write_comment_file(
+            args.comment_output, diagnostic_lines, by_check, wall_time, repo_root
+        )
+
+    log(f"\n{'=' * 70}")
+    log("Summary by check:")
+    log("=" * 70)
+    for check in sorted(by_check):
+        log(f"  [{check}]  x{len(by_check[check])}")
+    summary_parts = []
+    if error_lines:
+        summary_parts.append(f"{len(error_lines)} error(s)")
+    if warning_lines:
+        summary_parts.append(f"{len(warning_lines)} warning(s)")
+    log(f"\n  Total: {' and '.join(summary_parts)}")
+
+    return 1 if error_lines else 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())

From 4e11dd79e10b9d7586d8fd5ad280d83369c980a4 Mon Sep 17 00:00:00 2001
From: Xingbo Wang <xbw@meta.com>
Date: Sat, 21 Feb 2026 06:19:45 -0800
Subject: [PATCH 500/500] V2 serialization format for wide columns with blob
 references (#14314)

Summary:
Introduce a new V2 serialization format for wide column entities that supports storing individual column values in blob files. The V2 format adds a column type section that marks each column as either inline or blob-index, enabling per-column blob storage for large values.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14314

Reviewed By: pdillinger

Differential Revision: D92832066

Pulled By: xingbowang

fbshipit-source-id: 13c24347e1f481a059d67eef987d2d2b184b4a51
---
 .github/workflows/clang-tidy-comment.yml      |  27 +-
 CLAUDE.md                                     |  37 +-
 db/blob/blob_index.h                          |  12 +
 db/wide/wide_column_serialization.cc          | 678 +++++++++++++++++-
 db/wide/wide_column_serialization.h           | 220 +++++-
 db/wide/wide_column_serialization_test.cc     | 561 ++++++++++++++-
 .../new_features/wide_column_blob_support.md  |   1 +
 7 files changed, 1487 insertions(+), 49 deletions(-)
 create mode 100644 unreleased_history/new_features/wide_column_blob_support.md

diff --git a/.github/workflows/clang-tidy-comment.yml b/.github/workflows/clang-tidy-comment.yml
index 1a07a7ce70fd..9615c890f85f 100644
--- a/.github/workflows/clang-tidy-comment.yml
+++ b/.github/workflows/clang-tidy-comment.yml
@@ -1,5 +1,6 @@
 name: clang-tidy
 on:
+  push:
   pull_request_target:
     types: [opened, synchronize, reopened]
 
@@ -16,13 +17,30 @@ jobs:
     steps:
     - uses: actions/checkout@v4.1.0
       with:
-        ref: ${{ github.event.pull_request.head.sha }}
-        fetch-depth: 2
+        ref: ${{ github.event_name == 'pull_request_target' && github.event.pull_request.head.sha || github.sha }}
     - name: Mark workspace as safe for git
       run: git config --global --add safe.directory $GITHUB_WORKSPACE
+    - name: Determine diff base
+      id: diff-base
+      run: |
+        if [ "${{ github.event_name }}" = "pull_request_target" ]; then
+          BASE="${{ github.event.pull_request.base.sha }}"
+        else
+          BASE="${{ github.event.before }}"
+        fi
+        if [ -z "$BASE" ] || echo "$BASE" | grep -q '^0\{40\}$'; then
+          echo "skip=true" >> "$GITHUB_OUTPUT"
+          echo "No valid diff base; skipping clang-tidy."
+        else
+          git fetch --depth=1 origin "$BASE"
+          echo "ref=$BASE" >> "$GITHUB_OUTPUT"
+          echo "skip=false" >> "$GITHUB_OUTPUT"
+        fi
     - name: Install clang-tidy
+      if: steps.diff-base.outputs.skip != 'true'
       run: apt-get update && apt-get install -y clang-tidy
     - name: Generate compile_commands.json
+      if: steps.diff-base.outputs.skip != 'true'
       run: |
         mkdir build && cd build
         cmake -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
@@ -32,16 +50,17 @@ jobs:
         ln -sf build/compile_commands.json compile_commands.json
     - name: Run clang-tidy on changed files
       id: clang-tidy
+      if: steps.diff-base.outputs.skip != 'true'
       run: |
         python3 tools/run_clang_tidy.py \
           -j 4 \
-          --diff-base HEAD~1 \
+          --diff-base ${{ steps.diff-base.outputs.ref }} \
           --github-annotations \
           --github-step-summary \
           --comment-output clang-tidy-comment.md
       continue-on-error: true
     - name: Post clang-tidy results to PR
-      if: always()
+      if: github.event_name == 'pull_request_target' && always() && steps.diff-base.outputs.skip != 'true'
       uses: actions/github-script@v7
       with:
         script: |
diff --git a/CLAUDE.md b/CLAUDE.md
index acf14592e99d..39ef7dbc380d 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -24,12 +24,31 @@ This document provides guidance for generating and reviewing code in the RocksDB
 
 ### Performance Considerations
 
+**⚠️ PERFORMANCE IS CRITICAL:** RocksDB is a high-performance storage engine where every CPU cycle and memory access matters. When writing code, always evaluate from a performance perspective. This is not optional—performance-aware coding is a fundamental requirement for all contributions.
+
 **Benchmarking and Profiling:** Performance claims should be backed by empirical evidence. Use RocksDB's benchmarking tools (e.g., `db_bench`) to validate improvements. Reviewers will request benchmark results for changes that could impact performance.
 
-**Avoid Premature Optimization:** Focus on correctness first, then optimize based on profiling data. Reviewers are skeptical of optimizations that add complexity without measurable benefit.
+**Memory Allocation:** Minimize dynamic memory allocations, especially in hot paths. Prefer stack allocation over heap allocation. Reuse buffers when possible. Consider using arena allocators or memory pools for frequent small allocations. Every `new`, `malloc`, or container resize has a cost.
+
+**Memory Copy:** Avoid unnecessary memory copies. Use move semantics, `std::string_view`, `Slice`, and pass-by-reference where appropriate. Be aware of implicit copies in STL containers and function returns. Prefer in-place operations over copy-and-modify patterns.
+
+**CPU Cache Efficiency:** Design data structures and access patterns to be cache-friendly. Keep frequently accessed data together (data locality). Prefer sequential memory access over random access. Be mindful of cache line sizes (typically 64 bytes) and avoid false sharing in concurrent code. Consider struct packing and field ordering to improve cache utilization.
+
+**Loop Optimization:** Look for opportunities to collapse nested loops, reduce loop overhead, and minimize branch mispredictions. Hoist invariant computations out of loops. Consider loop unrolling for tight inner loops. Batch operations when possible to amortize per-operation overhead.
+
+**SIMD and Vectorization:** Leverage SIMD instructions (SSE, AVX) for data-parallel operations when appropriate. Structure data to enable auto-vectorization by the compiler. Consider explicit SIMD intrinsics for critical hot paths like checksum computation, encoding/decoding, and bulk data processing.
+
+**Branch Prediction:** Minimize unpredictable branches in hot paths. Use `LIKELY`/`UNLIKELY` macros to hint branch prediction. Consider branchless alternatives for simple conditionals. Order switch cases and if-else chains by frequency.
 
 **Memory and Resource Management:** Be mindful of memory allocations, especially in hot paths. Use RAII patterns, smart pointers, and RocksDB's memory management utilities appropriately.
 
+**Hot Path Analysis:** When deciding how aggressively to optimize code, consider whether it's on a hot path:
+- **Hot path** (executed thousands+ times, e.g., data access, iteration, compaction loops): Performance is paramount. Apply all optimization techniques—loop collapsing, SIMD, cache optimization, pre-allocation, etc. The cost of each operation is multiplied by execution frequency.
+- **Cold path** (executed rarely, e.g., DB open, configuration parsing, error handling): Maintainability and clarity are more important. Prefer readable code over micro-optimizations. Complex optimizations here add maintenance burden with negligible performance benefit.
+- **Warm path** (moderate frequency): Balance both concerns. Use profiling data to guide optimization decisions.
+
+**Avoid Premature Optimization:** While performance is critical, focus on correctness first, then optimize based on profiling data. However, be performance-aware from the start—choosing the right algorithm and data structure upfront is not premature optimization. Use the hot path analysis above to decide how much optimization effort is warranted.
+
 ### API Design and Compatibility
 
 **Backwards Compatibility:** RocksDB maintains strong backwards compatibility guarantees. Breaking changes are rare and require extensive justification. When deprecating features, follow the project's deprecation policy (typically spanning multiple releases).
@@ -209,6 +228,22 @@ The following patterns emerged as frequent sources of review feedback:
     gtest_parallel.py if available. E.g.
     python3 ${GTEST_PARALLEL}/gtest_parallel.py ./table_test
 
+### Unit test dedup guidelines
+* Extract helper functions for repeated patterns such as object
+    construction, round-trip (encode → decode → verify), and common
+    assertion sequences.
+* Use table-driven tests (struct array + loop) when multiple test cases
+    share the same logic but differ only in input/expected data.
+* Prefer randomized tests over exhaustive parameter permutations. Use
+    `Random` from `util/random.h` (not `std::mt19937`). Use a time-based
+    seed with `SCOPED_TRACE("seed=" + std::to_string(seed))` so failures
+    are reproducible.
+* Keep deterministic edge-case tests separate from randomized tests
+    (error paths, boundary conditions, format verification).
+* Methods only used in tests should be private with `friend class` +
+    `TEST_F` fixture wrappers. In wrappers, always fully qualify the
+    target method to avoid infinite recursion.
+
 ### Adding new public API
     Refer to claude_md/add_public_api.md
 
diff --git a/db/blob/blob_index.h b/db/blob/blob_index.h
index e9944d78448b..fda6f946a672 100644
--- a/db/blob/blob_index.h
+++ b/db/blob/blob_index.h
@@ -137,6 +137,18 @@ class BlobIndex {
     return oss.str();
   }
 
+  // Encode this blob index into dst based on its type.
+  void EncodeTo(std::string* dst) const {
+    if (IsInlined()) {
+      EncodeInlinedTTL(dst, expiration_, value_);
+    } else if (HasTTL()) {
+      EncodeBlobTTL(dst, expiration_, file_number_, offset_, size_,
+                    compression_);
+    } else {
+      EncodeBlob(dst, file_number_, offset_, size_, compression_);
+    }
+  }
+
   static void EncodeInlinedTTL(std::string* dst, uint64_t expiration,
                                const Slice& value) {
     assert(dst != nullptr);
diff --git a/db/wide/wide_column_serialization.cc b/db/wide/wide_column_serialization.cc
index 0366a5db977d..8371b7cbbd30 100644
--- a/db/wide/wide_column_serialization.cc
+++ b/db/wide/wide_column_serialization.cc
@@ -5,10 +5,12 @@
 
 #include "db/wide/wide_column_serialization.h"
 
-#include <algorithm>
 #include <cassert>
-#include <limits>
+#include <cstring>
 
+#include "db/blob/blob_fetcher.h"
+#include "db/blob/blob_index.h"
+#include "db/blob/prefetch_buffer_collection.h"
 #include "db/wide/wide_columns_helper.h"
 #include "rocksdb/slice.h"
 #include "util/autovector.h"
@@ -16,15 +18,46 @@
 
 namespace ROCKSDB_NAMESPACE {
 
+Status WideColumnSerialization::BuildBlobIndexMap(
+    size_t num_columns,
+    const std::vector<std::pair<size_t, BlobIndex>>& blob_columns,
+    std::vector<const BlobIndex*>& blob_index_map) {
+  if (Status s = ValidateWideColumnLimit(num_columns, "Too many wide columns");
+      !s.ok()) {
+    return s;
+  }
+
+  blob_index_map.assign(num_columns, nullptr);
+  for (const auto& blob_col : blob_columns) {
+    if (blob_col.first >= blob_index_map.size()) {
+      return Status::InvalidArgument("Blob column index out of range");
+    }
+    blob_index_map[blob_col.first] = &blob_col.second;
+  }
+
+  return Status::OK();
+}
+
+bool WideColumnSerialization::ContainsBlobType(const char* type_bytes,
+                                               uint32_t num_columns) {
+  for (uint32_t i = 0; i < num_columns; ++i) {
+    if (static_cast<uint8_t>(type_bytes[i]) == kTypeBlobIndex) {
+      return true;
+    }
+  }
+  return false;
+}
+
 Status WideColumnSerialization::Serialize(const WideColumns& columns,
                                           std::string& output) {
   const size_t num_columns = columns.size();
 
-  if (num_columns > static_cast<size_t>(std::numeric_limits<uint32_t>::max())) {
-    return Status::InvalidArgument("Too many wide columns");
+  if (Status sv = ValidateWideColumnLimit(num_columns, "Too many wide columns");
+      !sv.ok()) {
+    return sv;
   }
 
-  PutVarint32(&output, kCurrentVersion);
+  PutVarint32(&output, kVersion1);
 
   PutVarint32(&output, static_cast<uint32_t>(num_columns));
 
@@ -34,19 +67,23 @@ Status WideColumnSerialization::Serialize(const WideColumns& columns,
     const WideColumn& column = columns[i];
 
     const Slice& name = column.name();
-    if (name.size() >
-        static_cast<size_t>(std::numeric_limits<uint32_t>::max())) {
-      return Status::InvalidArgument("Wide column name too long");
+    if (Status s_name =
+            ValidateWideColumnLimit(name.size(), "Wide column name too long");
+        !s_name.ok()) {
+      return s_name;
     }
 
-    if (prev_name && prev_name->compare(name) >= 0) {
-      return Status::Corruption("Wide columns out of order");
+    if (prev_name) {
+      if (Status so = ValidateColumnOrder(*prev_name, name); !so.ok()) {
+        return so;
+      }
     }
 
     const Slice& value = column.value();
-    if (value.size() >
-        static_cast<size_t>(std::numeric_limits<uint32_t>::max())) {
-      return Status::InvalidArgument("Wide column value too long");
+    if (Status s_val =
+            ValidateWideColumnLimit(value.size(), "Wide column value too long");
+        !s_val.ok()) {
+      return s_val;
     }
 
     PutLengthPrefixedSlice(&output, name);
@@ -64,28 +101,151 @@ Status WideColumnSerialization::Serialize(const WideColumns& columns,
   return Status::OK();
 }
 
-Status WideColumnSerialization::Deserialize(Slice& input,
-                                            WideColumns& columns) {
-  assert(columns.empty());
-
-  uint32_t version = 0;
-  if (!GetVarint32(&input, &version)) {
-    return Status::Corruption("Error decoding wide column version");
+template <typename GetName, typename GetValue>
+Status WideColumnSerialization::SerializeV2Impl(
+    size_t num_columns,
+    const std::vector<std::pair<size_t, BlobIndex>>& blob_columns,
+    std::string& output, GetName get_name, GetValue get_value) {
+  std::vector<const BlobIndex*> blob_index_map;
+  if (Status s = BuildBlobIndexMap(num_columns, blob_columns, blob_index_map);
+      !s.ok()) {
+    return s;
   }
+  assert(blob_index_map.size() == num_columns);
+
+  // First pass: validate column ordering, compute sizes, serialize blob
+  // indices, and build column types.
+  std::vector<std::string> serialized_blob_indices(num_columns);
+  std::vector<uint32_t> name_sizes(num_columns);
+  std::vector<uint32_t> value_sizes(num_columns);
+  std::string column_types;
+  column_types.reserve(num_columns);
+
+  Slice prev_name_storage;
+  bool has_prev = false;
+  uint32_t name_sizes_bytes = 0;
+  uint32_t names_bytes = 0;
+  uint32_t total_value_sizes_bytes = 0;
+  uint32_t total_values_bytes = 0;
+
+  for (size_t i = 0; i < num_columns; ++i) {
+    const Slice name = get_name(i);
+    const Slice value = get_value(i);
+
+    if (Status sn =
+            ValidateWideColumnLimit(name.size(), "Wide column name too long");
+        !sn.ok()) {
+      return sn;
+    }
 
-  if (version > kCurrentVersion) {
-    return Status::NotSupported("Unsupported wide column version");
-  }
+    if (has_prev) {
+      if (Status so = ValidateColumnOrder(prev_name_storage, name); !so.ok()) {
+        return so;
+      }
+    }
 
-  uint32_t num_columns = 0;
-  if (!GetVarint32(&input, &num_columns)) {
-    return Status::Corruption("Error decoding number of wide columns");
+    name_sizes[i] = static_cast<uint32_t>(name.size());
+    name_sizes_bytes += VarintLength(name_sizes[i]);
+    names_bytes += name_sizes[i];
+
+    if (blob_index_map[i] != nullptr) {
+      const BlobIndex* blob_idx = blob_index_map[i];
+      blob_idx->EncodeTo(&serialized_blob_indices[i]);
+      value_sizes[i] = static_cast<uint32_t>(serialized_blob_indices[i].size());
+      column_types.push_back(static_cast<char>(kTypeBlobIndex));
+    } else {
+      if (Status svl = ValidateWideColumnLimit(value.size(),
+                                               "Wide column value too long");
+          !svl.ok()) {
+        return svl;
+      }
+      value_sizes[i] = static_cast<uint32_t>(value.size());
+      column_types.push_back(static_cast<char>(kTypeValue));
+    }
+
+    total_value_sizes_bytes += VarintLength(value_sizes[i]);
+    total_values_bytes += value_sizes[i];
+
+    prev_name_storage = name;
+    has_prev = true;
   }
 
-  if (!num_columns) {
+  // Second pass: write all V2 sections to output.
+  // Pre-allocate output string.
+  const size_t total_size =
+      VarintLength(kVersion2) +
+      VarintLength(static_cast<uint32_t>(num_columns)) +
+      num_columns +  // column types
+      VarintLength(name_sizes_bytes) + VarintLength(total_value_sizes_bytes) +
+      VarintLength(names_bytes) + name_sizes_bytes + total_value_sizes_bytes +
+      names_bytes + total_values_bytes;
+
+  const size_t base_offset = output.size();
+  output.reserve(base_offset + total_size);
+
+  // Sections 1-3: header, skip info, column types
+  PutVarint32(&output, kVersion2);
+  PutVarint32(&output, static_cast<uint32_t>(num_columns));
+  PutVarint32(&output, name_sizes_bytes);
+  PutVarint32(&output, total_value_sizes_bytes);
+  PutVarint32(&output, names_bytes);
+  output.append(column_types);
+
+  // Sections 4-7: resize to final size, then write all 4 sections in a
+  // single loop using independent pointers. Each section's start offset is
+  // known from the sizes computed in the first pass.
+  if (num_columns == 0) {
     return Status::OK();
   }
 
+  const size_t sec4_offset = output.size();
+  output.resize(base_offset + total_size);
+
+  char* s4 = &output[sec4_offset];          // section 4: name sizes
+  char* s5 = s4 + name_sizes_bytes;         // section 5: value sizes
+  char* s6 = s5 + total_value_sizes_bytes;  // section 6: names
+  char* s7 = s6 + names_bytes;              // section 7: values
+
+  for (size_t i = 0; i < num_columns; ++i) {
+    s4 = EncodeVarint32(s4, name_sizes[i]);
+    s5 = EncodeVarint32(s5, value_sizes[i]);
+
+    memcpy(s6, get_name(i).data(), name_sizes[i]);
+    s6 += name_sizes[i];
+
+    if (blob_index_map[i] != nullptr) {
+      memcpy(s7, serialized_blob_indices[i].data(), value_sizes[i]);
+    } else {
+      memcpy(s7, get_value(i).data(), value_sizes[i]);
+    }
+    s7 += value_sizes[i];
+  }
+
+  return Status::OK();
+}
+
+Status WideColumnSerialization::SerializeV2(
+    const std::vector<std::pair<std::string, std::string>>& columns,
+    const std::vector<std::pair<size_t, BlobIndex>>& blob_columns,
+    std::string& output) {
+  return SerializeV2Impl(
+      columns.size(), blob_columns, output,
+      [&](size_t i) { return Slice(columns[i].first); },
+      [&](size_t i) { return Slice(columns[i].second); });
+}
+
+Status WideColumnSerialization::SerializeV2(
+    const WideColumns& columns,
+    const std::vector<std::pair<size_t, BlobIndex>>& blob_columns,
+    std::string& output) {
+  return SerializeV2Impl(
+      columns.size(), blob_columns, output,
+      [&](size_t i) { return columns[i].name(); },
+      [&](size_t i) { return columns[i].value(); });
+}
+
+Status WideColumnSerialization::DeserializeV1(
+    Slice& input, uint32_t num_columns, std::vector<WideColumn>& columns) {
   columns.reserve(num_columns);
 
   autovector<uint32_t, 16> column_value_sizes;
@@ -97,8 +257,11 @@ Status WideColumnSerialization::Deserialize(Slice& input,
       return Status::Corruption("Error decoding wide column name");
     }
 
-    if (!columns.empty() && columns.back().name().compare(name) >= 0) {
-      return Status::Corruption("Wide columns out of order");
+    if (!columns.empty()) {
+      if (Status so = ValidateColumnOrder(columns.back().name(), name);
+          !so.ok()) {
+        return so;
+      }
     }
 
     columns.emplace_back(name, Slice());
@@ -129,12 +292,324 @@ Status WideColumnSerialization::Deserialize(Slice& input,
   return Status::OK();
 }
 
+Status WideColumnSerialization::DeserializeV2Impl(
+    Slice& input, uint32_t num_columns, std::vector<WideColumn>& columns,
+    std::vector<ValueType>& column_types) {
+  // Section 2: SKIP INFO (3 varints)
+  uint32_t name_sizes_bytes = 0;
+  uint32_t value_sizes_bytes = 0;
+  uint32_t names_bytes = 0;
+  if (!GetVarint32(&input, &name_sizes_bytes)) {
+    return Status::Corruption("Error decoding wide column name sizes bytes");
+  }
+  if (!GetVarint32(&input, &value_sizes_bytes)) {
+    return Status::Corruption("Error decoding wide column value sizes bytes");
+  }
+  if (!GetVarint32(&input, &names_bytes)) {
+    return Status::Corruption("Error decoding wide column names bytes");
+  }
+
+  // Section 3: COLUMN TYPES (N bytes, each is a ValueType)
+  if (input.size() < num_columns) {
+    return Status::Corruption("Error decoding wide column types");
+  }
+  column_types.resize(num_columns);
+  for (uint32_t i = 0; i < num_columns; ++i) {
+    column_types[i] = static_cast<ValueType>(input[i]);
+    if (!IsValidColumnValueType(column_types[i])) {
+      return Status::Corruption("Unsupported wide column ValueType");
+    }
+  }
+  input.remove_prefix(num_columns);
+
+  // Validate that sections 4-6 fit in the remaining input
+  const size_t metadata_size =
+      name_sizes_bytes + value_sizes_bytes + names_bytes;
+  if (input.size() < metadata_size) {
+    return Status::Corruption("Error decoding wide column sections");
+  }
+
+  // Set up 4 pointers into sections 4-7 for single-loop parsing.
+  // Skip info gives us exact boundaries for each section.
+  const char* s4 = input.data();  // section 4: name sizes
+  const char* s4_limit = s4 + name_sizes_bytes;
+  const char* s5 = s4_limit;  // section 5: value sizes
+  const char* s5_limit = s5 + value_sizes_bytes;
+  const char* s6 = s5_limit;          // section 6: names
+  const char* s7 = s6 + names_bytes;  // section 7: values
+  const char* input_end = input.data() + input.size();
+
+  columns.reserve(num_columns);
+  size_t name_pos = 0;
+  size_t value_pos = 0;
+
+  for (uint32_t i = 0; i < num_columns; ++i) {
+    // Decode name size from section 4
+    uint32_t ns = 0;
+    const char* s4_next = GetVarint32Ptr(s4, s4_limit, &ns);
+    if (s4_next == nullptr) {
+      return Status::Corruption("Error decoding wide column name size");
+    }
+    s4 = s4_next;
+
+    // Decode value size from section 5
+    uint32_t vs = 0;
+    const char* s5_next = GetVarint32Ptr(s5, s5_limit, &vs);
+    if (s5_next == nullptr) {
+      return Status::Corruption("Error decoding wide column value size");
+    }
+    s5 = s5_next;
+
+    // Read name from section 6
+    if (name_pos + ns > names_bytes) {
+      return Status::Corruption("Error decoding wide column name");
+    }
+    Slice name(s6 + name_pos, ns);
+
+    if (!columns.empty()) {
+      if (Status so = ValidateColumnOrder(columns.back().name(), name);
+          !so.ok()) {
+        return so;
+      }
+    }
+
+    // Read value from section 7
+    if (s7 + value_pos + vs > input_end) {
+      return Status::Corruption("Error decoding wide column value payload");
+    }
+
+    columns.emplace_back(name, Slice(s7 + value_pos, vs));
+    name_pos += ns;
+    value_pos += vs;
+  }
+
+  return Status::OK();
+}
+
+Status WideColumnSerialization::Deserialize(Slice& input,
+                                            WideColumns& columns) {
+  assert(columns.empty());
+
+  // Reuse DeserializeV2, then reject any blob references.
+  std::vector<std::pair<size_t, BlobIndex>> blob_columns;
+  if (Status s = DeserializeV2(input, columns, blob_columns); !s.ok()) {
+    return s;
+  }
+
+  if (!blob_columns.empty()) {
+    return Status::NotSupported(
+        "Wide column contains blob references. Use DeserializeV2.");
+  }
+
+  return Status::OK();
+}
+
+Status WideColumnSerialization::DeserializeV2(
+    Slice& input, std::vector<WideColumn>& columns,
+    std::vector<std::pair<size_t, BlobIndex>>& blob_columns) {
+  assert(columns.empty());
+  assert(blob_columns.empty());
+
+  uint32_t version = 0;
+  if (!GetVarint32(&input, &version)) {
+    return Status::Corruption("Error decoding wide column version");
+  }
+
+  if (version > kVersion2) {
+    return Status::NotSupported("Unsupported wide column version");
+  }
+
+  uint32_t num_columns = 0;
+  if (!GetVarint32(&input, &num_columns)) {
+    return Status::Corruption("Error decoding number of wide columns");
+  }
+
+  if (!num_columns) {
+    return Status::OK();
+  }
+
+  if (version >= kVersion2) {
+    // V2 layout: parse columns and extract blob column info
+    std::vector<ValueType> column_types;
+
+    if (Status s = DeserializeV2Impl(input, num_columns, columns, column_types);
+        !s.ok()) {
+      return s;
+    }
+    assert(column_types.size() == num_columns);
+    assert(columns.size() == num_columns);
+
+    // Decode blob indices from value data
+    for (uint32_t i = 0; i < num_columns; ++i) {
+      if (column_types[i] == kTypeBlobIndex) {
+        BlobIndex blob_idx;
+        Slice blob_slice = columns[i].value();
+        if (Status bs = blob_idx.DecodeFrom(blob_slice); !bs.ok()) {
+          return Status::Corruption("Error decoding blob index in wide column");
+        }
+        blob_columns.emplace_back(i, blob_idx);
+      }
+    }
+  } else {
+    return DeserializeV1(input, num_columns, columns);
+  }
+
+  return Status::OK();
+}
+
+Status WideColumnSerialization::HasBlobColumns(const Slice& input,
+                                               bool& has_blob_columns) {
+  has_blob_columns = false;
+
+  Slice input_ref = input;
+
+  uint32_t version = 0;
+  if (!GetVarint32(&input_ref, &version)) {
+    return Status::Corruption("Error decoding wide column version");
+  }
+
+  // Version 1 never has blob columns
+  if (version < kVersion2) {
+    return Status::OK();
+  }
+
+  uint32_t num_columns = 0;
+  if (!GetVarint32(&input_ref, &num_columns)) {
+    return Status::Corruption("Error decoding number of wide columns");
+  }
+
+  if (!num_columns) {
+    return Status::OK();
+  }
+
+  // V2: Skip over SKIP INFO (3 varints) to reach COLUMN TYPES section.
+  uint32_t unused_name_sizes_bytes = 0;
+  uint32_t unused_value_sizes_bytes = 0;
+  uint32_t unused_names_bytes = 0;
+  if (!GetVarint32(&input_ref, &unused_name_sizes_bytes) ||
+      !GetVarint32(&input_ref, &unused_value_sizes_bytes) ||
+      !GetVarint32(&input_ref, &unused_names_bytes)) {
+    return Status::Corruption("Error decoding wide column skip info");
+  }
+  if (input_ref.size() < num_columns) {
+    return Status::Corruption("Error decoding wide column types");
+  }
+  has_blob_columns = ContainsBlobType(input_ref.data(), num_columns);
+
+  return Status::OK();
+}
+
+Status WideColumnSerialization::GetVersion(const Slice& input,
+                                           uint32_t& version) {
+  Slice input_ref = input;
+
+  version = 0;
+  if (!GetVarint32(&input_ref, &version)) {
+    return Status::Corruption("Error decoding wide column version");
+  }
+
+  return Status::OK();
+}
+
 Status WideColumnSerialization::GetValueOfDefaultColumn(Slice& input,
                                                         Slice& value) {
+  Slice input_ref = input;
+
+  uint32_t version = 0;
+  if (!GetVarint32(&input_ref, &version)) {
+    return Status::Corruption("Error decoding wide column version");
+  }
+
+  if (version > kVersion2) {
+    return Status::NotSupported("Unsupported wide column version");
+  }
+
+  uint32_t num_columns = 0;
+  if (!GetVarint32(&input_ref, &num_columns)) {
+    return Status::Corruption("Error decoding number of wide columns");
+  }
+
+  if (!num_columns) {
+    value.clear();
+    return Status::OK();
+  }
+
+  if (version >= kVersion2) {
+    // V2 fast path: use skip info to jump directly to values without
+    // scanning through variable-length sections.
+
+    // Read SKIP INFO (3 varints, immediately after header)
+    uint32_t name_sizes_bytes = 0;
+    uint32_t value_sizes_bytes = 0;
+    uint32_t names_bytes = 0;
+    if (!GetVarint32(&input_ref, &name_sizes_bytes)) {
+      return Status::Corruption("Error decoding wide column name sizes bytes");
+    }
+    if (!GetVarint32(&input_ref, &value_sizes_bytes)) {
+      return Status::Corruption("Error decoding wide column value sizes bytes");
+    }
+    if (!GetVarint32(&input_ref, &names_bytes)) {
+      return Status::Corruption("Error decoding wide column names bytes");
+    }
+
+    // Read COLUMN TYPES (N bytes)
+    if (input_ref.size() < num_columns) {
+      return Status::Corruption("Error decoding wide column types");
+    }
+    // Check if default column (index 0) is a blob reference
+    if (static_cast<uint8_t>(input_ref[0]) == kTypeBlobIndex) {
+      return Status::NotSupported(
+          "Wide column contains blob references. Use DeserializeV2.");
+    }
+    input_ref.remove_prefix(num_columns);
+
+    // Peek first name size from NAME SIZES section
+    if (input_ref.size() < name_sizes_bytes) {
+      return Status::Corruption("Error decoding wide column name sizes");
+    }
+    Slice name_sizes_section(input_ref.data(), name_sizes_bytes);
+    uint32_t first_name_size = 0;
+    if (!GetVarint32(&name_sizes_section, &first_name_size)) {
+      return Status::Corruption("Error decoding wide column name size");
+    }
+    input_ref.remove_prefix(name_sizes_bytes);
+
+    // Peek first value size from VALUE SIZES section
+    if (input_ref.size() < value_sizes_bytes) {
+      return Status::Corruption("Error decoding wide column value sizes");
+    }
+    Slice value_sizes_section(input_ref.data(), value_sizes_bytes);
+    uint32_t first_value_size = 0;
+    if (!GetVarint32(&value_sizes_section, &first_value_size)) {
+      return Status::Corruption("Error decoding wide column value size");
+    }
+    // Skip entire VALUE SIZES section using value_sizes_bytes
+    input_ref.remove_prefix(value_sizes_bytes);
+
+    // Check if the first column is the default column (empty name)
+    if (first_name_size != 0) {
+      value.clear();
+      return Status::OK();
+    }
+
+    // Skip NAMES section
+    if (input_ref.size() < names_bytes) {
+      return Status::Corruption("Error decoding wide column names");
+    }
+    input_ref.remove_prefix(names_bytes);
+
+    // Read the first value from VALUES section
+    if (input_ref.size() < first_value_size) {
+      return Status::Corruption("Error decoding wide column value payload");
+    }
+    value = Slice(input_ref.data(), first_value_size);
+    return Status::OK();
+  }
+
+  // V1 fallback: full deserialization
   WideColumns columns;
 
-  const Status s = Deserialize(input, columns);
-  if (!s.ok()) {
+  if (Status s = Deserialize(input, columns); !s.ok()) {
     return s;
   }
 
@@ -148,4 +623,145 @@ Status WideColumnSerialization::GetValueOfDefaultColumn(Slice& input,
   return Status::OK();
 }
 
+Status WideColumnSerialization::ResolveEntityBlobColumns(
+    const Slice& entity_value, const Slice& user_key,
+    const BlobFetcher* blob_fetcher, PrefetchBufferCollection* prefetch_buffers,
+    std::string& resolved_entity, bool& resolved, uint64_t* total_bytes_read,
+    uint64_t* num_blobs_resolved) {
+  assert(blob_fetcher);
+
+  resolved = false;
+
+  std::vector<WideColumn> columns;
+  std::vector<std::pair<size_t, BlobIndex>> blob_columns;
+
+  Slice input_copy = entity_value;
+  if (Status s = DeserializeV2(input_copy, columns, blob_columns); !s.ok()) {
+    return s;
+  }
+
+  if (blob_columns.empty()) {
+    return Status::OK();
+  }
+
+  resolved = true;
+
+  // Fetch each blob value
+  std::vector<std::string> resolved_blob_values;
+  resolved_blob_values.reserve(blob_columns.size());
+
+  for (const auto& blob_col : blob_columns) {
+    const BlobIndex& blob_idx = blob_col.second;
+
+    if (blob_idx.IsInlined()) {
+      resolved_blob_values.emplace_back(blob_idx.value().data(),
+                                        blob_idx.value().size());
+      continue;
+    }
+
+    FilePrefetchBuffer* prefetch_buffer =
+        prefetch_buffers ? prefetch_buffers->GetOrCreatePrefetchBuffer(
+                               blob_idx.file_number())
+                         : nullptr;
+
+    uint64_t bytes_read = 0;
+
+    PinnableSlice blob_value;
+    const Status fetch_s = blob_fetcher->FetchBlob(
+        user_key, blob_idx, prefetch_buffer, &blob_value, &bytes_read);
+    if (!fetch_s.ok()) {
+      return fetch_s;
+    }
+
+    resolved_blob_values.emplace_back(blob_value.data(), blob_value.size());
+
+    if (total_bytes_read) {
+      *total_bytes_read += bytes_read;
+    }
+  }
+
+  if (num_blobs_resolved) {
+    *num_blobs_resolved += blob_columns.size();
+  }
+
+  return SerializeResolvedEntity(columns, blob_columns, resolved_blob_values,
+                                 resolved_entity);
+}
+
+Status WideColumnSerialization::GetValueOfDefaultColumnResolvingBlobs(
+    const Slice& entity_value, const Slice& user_key,
+    const BlobFetcher* blob_fetcher, PinnableSlice& result, bool& resolved) {
+  assert(blob_fetcher);
+
+  resolved = false;
+
+  std::vector<WideColumn> columns;
+  std::vector<std::pair<size_t, BlobIndex>> blob_columns;
+
+  Slice input_copy = entity_value;
+  if (Status s = DeserializeV2(input_copy, columns, blob_columns); !s.ok()) {
+    return s;
+  }
+
+  // The default column (empty name) is always at index 0 when present
+  // (columns are sorted by name).
+  if (columns.empty() || columns[0].name() != kDefaultWideColumnName) {
+    result.PinSelf(Slice());
+    return Status::OK();
+  }
+
+  // Check if the default column (index 0) is a blob reference
+  for (const auto& blob_col : blob_columns) {
+    if (blob_col.first == 0) {
+      const BlobIndex& blob_idx = blob_col.second;
+
+      resolved = true;
+
+      if (blob_idx.IsInlined()) {
+        result.PinSelf(blob_idx.value());
+        return Status::OK();
+      }
+
+      return blob_fetcher->FetchBlob(user_key, blob_idx,
+                                     nullptr /* prefetch_buffer */, &result,
+                                     nullptr /* bytes_read */);
+    }
+  }
+
+  // Default column is inline
+  result.PinSelf(columns[0].value());
+  return Status::OK();
+}
+
+Status WideColumnSerialization::SerializeResolvedEntity(
+    const std::vector<WideColumn>& columns,
+    const std::vector<std::pair<size_t, BlobIndex>>& blob_columns,
+    const std::vector<std::string>& resolved_blob_values, std::string& output) {
+  assert(blob_columns.size() == resolved_blob_values.size());
+
+  // blob_columns is sorted by column index and typically small, so use a
+  // linear scan with a cursor instead of an unordered_map.
+  size_t blob_cursor = 0;
+
+  // Build result columns with resolved blob values
+  WideColumns result_columns;
+  result_columns.reserve(columns.size());
+
+  for (size_t i = 0; i < columns.size(); ++i) {
+    if (blob_cursor < blob_columns.size() &&
+        blob_columns[blob_cursor].first == i) {
+      // This is a blob column - use the resolved value
+      result_columns.emplace_back(columns[i].name(),
+                                  Slice(resolved_blob_values[blob_cursor]));
+      ++blob_cursor;
+    } else {
+      // This is an inline column - use the original value
+      result_columns.emplace_back(columns[i].name(), columns[i].value());
+    }
+  }
+
+  // Serialize using V1 format (all values inline)
+  return Serialize(result_columns, output);
+}
+
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/db/wide/wide_column_serialization.h b/db/wide/wide_column_serialization.h
index 4a97f6a78690..0a819907ae7e 100644
--- a/db/wide/wide_column_serialization.h
+++ b/db/wide/wide_column_serialization.h
@@ -6,18 +6,28 @@
 #pragma once
 
 #include <cstdint>
+#include <limits>
 #include <string>
+#include <utility>
+#include <vector>
 
+#include "db/dbformat.h"
 #include "rocksdb/rocksdb_namespace.h"
 #include "rocksdb/status.h"
 #include "rocksdb/wide_columns.h"
 
 namespace ROCKSDB_NAMESPACE {
 
+class BlobFetcher;
+class BlobIndex;
+class FilePrefetchBuffer;
+class PinnableSlice;
+class PrefetchBufferCollection;
 class Slice;
 
 // Wide-column serialization/deserialization primitives.
 //
+// Version 1 Layout:
 // The two main parts of the layout are 1) a sorted index containing the column
 // names and column value sizes and 2) the column values themselves. Keeping the
 // index and the values separate will enable selectively reading column values
@@ -40,16 +50,224 @@ class Slice;
 //          ...---+----------+-------+----------+-------+---...---+-------+
 //                | varint32 | bytes | varint32 | bytes |         | bytes |
 //          ...---+----------+-------+----------+-------+---...---+-------+
+//
+// Version 2 Layout (with blob index support):
+// Groups all metadata upfront before variable-length data. This enables
+// efficient access patterns: index-based value access skips name data
+// entirely, default column access is O(1), and type checks are O(1).
+//
+// Legend: cn = column name, cv = column value, cns = column name size,
+//         cvs = column value size, ct = column type.
+//
+// Section 1: HEADER (2 varints)
+//   +----------+--------------+
+//   | version  | # of columns |
+//   | varint32 |   varint32   |
+//   +----------+--------------+
+//
+// Section 2: SKIP INFO (3 varints)
+//   +-------------------+---------------------+------------------+
+//   | name_sizes_bytes  | value_sizes_bytes   | names_bytes      |
+//   | varint32          | varint32            | varint32         |
+//   +-------------------+---------------------+------------------+
+//   name_sizes_bytes  = byte size of NAME SIZES section (section 4)
+//   value_sizes_bytes = byte size of VALUE SIZES section (section 5)
+//   names_bytes       = byte size of NAMES section (section 6)
+//
+//   Placed immediately after the header so that header + skip info form
+//   a contiguous varint sequence (5 varints), enabling future SIMD-based
+//   varint decoding.
+//
+// Section 3: COLUMN TYPES (N bytes, fixed-size)
+//   +------+------+---...---+--------+
+//   | ct_0 | ct_1 |         | ct_N-1 |
+//   | byte | byte |         |  byte  |
+//   +------+------+---...---+--------+
+//   ct values are ValueType entries from db/dbformat.h, e.g.:
+//     kTypeValue (0x01) = inline value
+//     kTypeBlobIndex (0x11) = blob index reference
+//   Future per-column types (kTypeMerge, kTypeDeletion, etc.) can be
+//   added without format changes.
+//
+// Section 4: NAME SIZES (N varints)
+//   +----------+----------+---...---+------------+
+//   | cns_0    | cns_1    |         | cns_{N-1}  |
+//   | varint32 | varint32 |         | varint32   |
+//   +----------+----------+---...---+------------+
+//
+// Section 5: VALUE SIZES (N varints)
+//   +----------+----------+---...---+------------+
+//   | cvs_0    | cvs_1    |         | cvs_{N-1}  |
+//   | varint32 | varint32 |         | varint32   |
+//   +----------+----------+---...---+------------+
+//
+// Section 6: COLUMN NAMES (concatenated, sorted)
+//   +------+------+---...---+--------+
+//   | cn_0 | cn_1 |         | cn_N-1 |
+//   | bytes| bytes|         | bytes  |
+//   +------+------+---...---+--------+
+//
+// Section 7: COLUMN VALUES (concatenated)
+//   +------+------+---...---+--------+
+//   | cv_0 | cv_1 |         | cv_N-1 |
+//   | bytes| bytes|         | bytes  |
+//   +------+------+---...---+--------+
+//
+// When ct = kTypeBlobIndex, the cv contains a serialized BlobIndex.
 
 class WideColumnSerialization {
  public:
+  // Version constants for wide column serialization format.
+  // - kVersion1: Original format with inline column values only.
+  // - kVersion2: Extended format that supports blob index references in
+  //              columns. Used when large column values are stored in blob
+  //              files.
+  static constexpr uint32_t kVersion1 = 1;
+  static constexpr uint32_t kVersion2 = 2;
+
+  // Serialize columns using version 1 format (no blob support)
   static Status Serialize(const WideColumns& columns, std::string& output);
 
+  // Serialize columns with some columns replaced by blob indices (version 2)
+  // columns: vector of (column_name, column_value) pairs
+  // blob_columns: vector of (column_index, blob_index) pairs indicating which
+  //               columns should be stored as blob references
+  static Status SerializeV2(
+      const std::vector<std::pair<std::string, std::string>>& columns,
+      const std::vector<std::pair<size_t, BlobIndex>>& blob_columns,
+      std::string& output);
+
+  // Overload that takes Slice-based WideColumns directly, avoiding the
+  // need to copy column names and values into string pairs.
+  static Status SerializeV2(
+      const WideColumns& columns,
+      const std::vector<std::pair<size_t, BlobIndex>>& blob_columns,
+      std::string& output);
+
+  // Deserialize columns (version 1 format only)
   static Status Deserialize(Slice& input, WideColumns& columns);
 
+  // Deserialize columns and separate inline columns from blob columns
+  // columns: receives inline column values
+  // blob_columns: receives (column_index, blob_index) pairs for blob references
+  static Status DeserializeV2(
+      Slice& input, std::vector<WideColumn>& columns,
+      std::vector<std::pair<size_t, BlobIndex>>& blob_columns);
+
+  // Check if the serialized entity has any blob column references.
+  // Sets *has_blob_columns to true if version >= 2 and at least one column
+  // has blob type; false otherwise.
+  // Returns Status::Corruption on decode errors.
+  static Status HasBlobColumns(const Slice& input, bool& has_blob_columns);
+
   static Status GetValueOfDefaultColumn(Slice& input, Slice& value);
 
-  static constexpr uint32_t kCurrentVersion = 1;
+  // Resolves all blob column references in a V2 wide-column entity,
+  // fetches the blob values, and re-serializes as a V1 entity (all inline).
+  // Handles inlined blobs (IsInlined) defensively.
+  //
+  // Used by the read path (GetContext, DBIter) when a V2 entity with blob
+  // column references needs to be converted to V1 format for consumption by
+  // APIs that only support V1 (e.g., TimedFullMerge,
+  // PinnableWideColumns::SetWideColumnValue).
+  //
+  // Sets *resolved to false and leaves resolved_entity unchanged when
+  // no blob columns are present.
+  //
+  // Optional parameters:
+  //   prefetch_buffers - for prefetch optimization (nullptr = no prefetch)
+  //   total_bytes_read - accumulates bytes read from blob files (nullptr =
+  //   skip) num_blobs_resolved - count of blob columns resolved (nullptr =
+  //   skip)
+  static Status ResolveEntityBlobColumns(
+      const Slice& entity_value, const Slice& user_key,
+      const BlobFetcher* blob_fetcher,
+      PrefetchBufferCollection* prefetch_buffers, std::string& resolved_entity,
+      bool& resolved, uint64_t* total_bytes_read, uint64_t* num_blobs_resolved);
+
+  // Extracts the default column value from a V2 entity, resolving its
+  // blob reference if needed. The default column (empty name) is always
+  // at index 0 when present (columns are sorted).
+  //
+  // Sets result to the resolved default column value (fetching from blob
+  // file if it's a blob reference). If there is no default column, result
+  // is set to empty. Sets *resolved to true if a blob was found for the
+  // default column, false otherwise.
+  static Status GetValueOfDefaultColumnResolvingBlobs(
+      const Slice& entity_value, const Slice& user_key,
+      const BlobFetcher* blob_fetcher, PinnableSlice& result, bool& resolved);
+
+ private:
+  friend class WideColumnSerializationTest;
+  // Get the serialization version from the input.
+  // Sets *version to the version number.
+  // Returns Status::Corruption on decode errors.
+  static Status GetVersion(const Slice& input, uint32_t& version);
+
+  // Merges deserialized columns with resolved blob values and serializes
+  // the result using version 1 format (all values inline).
+  static Status SerializeResolvedEntity(
+      const std::vector<WideColumn>& columns,
+      const std::vector<std::pair<size_t, BlobIndex>>& blob_columns,
+      const std::vector<std::string>& resolved_blob_values,
+      std::string& output);
+
+  // Returns InvalidArgument with the given message if size exceeds uint32_t.
+  static Status ValidateWideColumnLimit(size_t size, const char* msg) {
+    if (size > static_cast<size_t>(std::numeric_limits<uint32_t>::max())) {
+      return Status::InvalidArgument(msg);
+    }
+    return Status::OK();
+  }
+
+  // Returns Corruption if prev_name >= name (columns must be strictly ordered).
+  static Status ValidateColumnOrder(const Slice& prev_name, const Slice& name) {
+    if (prev_name.compare(name) >= 0) {
+      return Status::Corruption("Wide columns out of order");
+    }
+    return Status::OK();
+  }
+
+  // Shared implementation for both SerializeV2 overloads.
+  // get_name(i): returns Slice for column i's name
+  // get_value(i): returns Slice for column i's inline value
+  template <typename GetName, typename GetValue>
+  static Status SerializeV2Impl(
+      size_t num_columns,
+      const std::vector<std::pair<size_t, BlobIndex>>& blob_columns,
+      std::string& output, GetName get_name, GetValue get_value);
+
+  // Validates num_columns limit and builds a per-column lookup map from
+  // blob_columns. Returns InvalidArgument on validation failure.
+  static Status BuildBlobIndexMap(
+      size_t num_columns,
+      const std::vector<std::pair<size_t, BlobIndex>>& blob_columns,
+      std::vector<const BlobIndex*>& blob_index_map);
+
+  // Parses V1 layout (interleaved name/value_size pairs followed by values)
+  // into columns. Used by both Deserialize and DeserializeV2 to avoid
+  // code duplication.
+  static Status DeserializeV1(Slice& input, uint32_t num_columns,
+                              std::vector<WideColumn>& columns);
+
+  // Parses V2 layout sections 2-7 (skip info through values) into columns and
+  // column types. Used by both Deserialize and DeserializeV2 to avoid
+  // code duplication.
+  static Status DeserializeV2Impl(Slice& input, uint32_t num_columns,
+                                  std::vector<WideColumn>& columns,
+                                  std::vector<ValueType>& column_types);
+
+  // Returns true if t is a supported per-column ValueType. Currently only
+  // kTypeValue (inline) and kTypeBlobIndex are supported. Notably,
+  // kTypeWideColumnEntity is rejected to prevent recursive nesting.
+  static bool IsValidColumnValueType(ValueType t) {
+    return t == kTypeValue || t == kTypeBlobIndex;
+  }
+
+  // Returns true if any of the first num_columns type bytes equals
+  // kTypeBlobIndex. Typical entities have <10 columns, so a linear
+  // scan is sufficient; SIMD could be considered if column counts grow.
+  static bool ContainsBlobType(const char* type_bytes, uint32_t num_columns);
 };
 
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/db/wide/wide_column_serialization_test.cc b/db/wide/wide_column_serialization_test.cc
index 83a849da9eb3..018324d855e8 100644
--- a/db/wide/wide_column_serialization_test.cc
+++ b/db/wide/wide_column_serialization_test.cc
@@ -5,13 +5,35 @@
 
 #include "db/wide/wide_column_serialization.h"
 
+#include <chrono>
+#include <limits>
+
+#include "db/blob/blob_index.h"
 #include "db/wide/wide_columns_helper.h"
 #include "test_util/testharness.h"
 #include "util/coding.h"
+#include "util/random.h"
 
 namespace ROCKSDB_NAMESPACE {
 
-TEST(WideColumnSerializationTest, Construct) {
+class WideColumnSerializationTest : public testing::Test {
+ protected:
+  // Wrappers for private methods accessible via friend declaration.
+  static Status GetVersion(const Slice& input, uint32_t& version) {
+    return WideColumnSerialization::GetVersion(input, version);
+  }
+
+  static Status SerializeResolvedEntity(
+      const std::vector<WideColumn>& columns,
+      const std::vector<std::pair<size_t, BlobIndex>>& blob_columns,
+      const std::vector<std::string>& resolved_blob_values,
+      std::string& output) {
+    return WideColumnSerialization::SerializeResolvedEntity(
+        columns, blob_columns, resolved_blob_values, output);
+  }
+};
+
+TEST_F(WideColumnSerializationTest, Construct) {
   constexpr char foo[] = "foo";
   constexpr char bar[] = "bar";
 
@@ -87,7 +109,7 @@ TEST(WideColumnSerializationTest, Construct) {
   }
 }
 
-TEST(WideColumnSerializationTest, SerializeDeserialize) {
+TEST_F(WideColumnSerializationTest, SerializeDeserialize) {
   WideColumns columns{{"foo", "bar"}, {"hello", "world"}};
   std::string output;
 
@@ -126,7 +148,7 @@ TEST(WideColumnSerializationTest, SerializeDeserialize) {
   }
 }
 
-TEST(WideColumnSerializationTest, SerializeDuplicateError) {
+TEST_F(WideColumnSerializationTest, SerializeDuplicateError) {
   WideColumns columns{{"foo", "bar"}, {"foo", "baz"}};
   std::string output;
 
@@ -134,7 +156,7 @@ TEST(WideColumnSerializationTest, SerializeDuplicateError) {
       WideColumnSerialization::Serialize(columns, output).IsCorruption());
 }
 
-TEST(WideColumnSerializationTest, SerializeOutOfOrderError) {
+TEST_F(WideColumnSerializationTest, SerializeOutOfOrderError) {
   WideColumns columns{{"hello", "world"}, {"foo", "bar"}};
   std::string output;
 
@@ -142,7 +164,7 @@ TEST(WideColumnSerializationTest, SerializeOutOfOrderError) {
       WideColumnSerialization::Serialize(columns, output).IsCorruption());
 }
 
-TEST(WideColumnSerializationTest, DeserializeVersionError) {
+TEST_F(WideColumnSerializationTest, DeserializeVersionError) {
   // Can't decode version
 
   std::string buf;
@@ -155,7 +177,7 @@ TEST(WideColumnSerializationTest, DeserializeVersionError) {
   ASSERT_TRUE(std::strstr(s.getState(), "version"));
 }
 
-TEST(WideColumnSerializationTest, DeserializeUnsupportedVersion) {
+TEST_F(WideColumnSerializationTest, DeserializeUnsupportedVersion) {
   // Unsupported version
   constexpr uint32_t future_version = 1000;
 
@@ -170,11 +192,11 @@ TEST(WideColumnSerializationTest, DeserializeUnsupportedVersion) {
   ASSERT_TRUE(std::strstr(s.getState(), "version"));
 }
 
-TEST(WideColumnSerializationTest, DeserializeNumberOfColumnsError) {
+TEST_F(WideColumnSerializationTest, DeserializeNumberOfColumnsError) {
   // Can't decode number of columns
 
   std::string buf;
-  PutVarint32(&buf, WideColumnSerialization::kCurrentVersion);
+  PutVarint32(&buf, WideColumnSerialization::kVersion1);
 
   Slice input(buf);
   WideColumns columns;
@@ -184,10 +206,10 @@ TEST(WideColumnSerializationTest, DeserializeNumberOfColumnsError) {
   ASSERT_TRUE(std::strstr(s.getState(), "number"));
 }
 
-TEST(WideColumnSerializationTest, DeserializeColumnsError) {
+TEST_F(WideColumnSerializationTest, DeserializeV2Error) {
   std::string buf;
 
-  PutVarint32(&buf, WideColumnSerialization::kCurrentVersion);
+  PutVarint32(&buf, WideColumnSerialization::kVersion1);
 
   constexpr uint32_t num_columns = 2;
   PutVarint32(&buf, num_columns);
@@ -277,10 +299,10 @@ TEST(WideColumnSerializationTest, DeserializeColumnsError) {
   }
 }
 
-TEST(WideColumnSerializationTest, DeserializeColumnsOutOfOrder) {
+TEST_F(WideColumnSerializationTest, DeserializeV2OutOfOrder) {
   std::string buf;
 
-  PutVarint32(&buf, WideColumnSerialization::kCurrentVersion);
+  PutVarint32(&buf, WideColumnSerialization::kVersion1);
 
   constexpr uint32_t num_columns = 2;
   PutVarint32(&buf, num_columns);
@@ -302,6 +324,521 @@ TEST(WideColumnSerializationTest, DeserializeColumnsOutOfOrder) {
   ASSERT_TRUE(std::strstr(s.getState(), "order"));
 }
 
+TEST_F(WideColumnSerializationTest, DeserializeV2RejectsRecursiveType) {
+  // Manually construct a V2 entity where one column has type
+  // kTypeWideColumnEntity, which would create recursive nesting.
+  // Deserialization must reject this.
+  std::string buf;
+
+  PutVarint32(&buf, WideColumnSerialization::kVersion2);
+
+  constexpr uint32_t num_columns = 2;
+  PutVarint32(&buf, num_columns);
+
+  // Section 2: COLUMN TYPES -- first column inline, second recursive
+  buf.push_back(static_cast<char>(kTypeValue));
+  buf.push_back(static_cast<char>(kTypeWideColumnEntity));
+
+  // Section 3: SKIP INFO
+  PutVarint32(&buf, 2);  // name_sizes_bytes (varint(1) + varint(1))
+  PutVarint32(&buf, 2);  // value_sizes_bytes (varint(3) + varint(5))
+  PutVarint32(&buf, 2);  // names_bytes ("a" + "b")
+
+  // Section 4: NAME SIZES
+  PutVarint32(&buf, 1);  // "a"
+  PutVarint32(&buf, 1);  // "b"
+
+  // Section 5: VALUE SIZES
+  PutVarint32(&buf, 3);
+  PutVarint32(&buf, 5);
+
+  // Section 6: NAMES
+  buf.append("ab");
+
+  // Section 7: VALUES (8 bytes of placeholder data)
+  buf.append(8, 'x');
+
+  // DeserializeV2 should reject with Corruption
+  {
+    Slice input(buf);
+    std::vector<WideColumn> columns;
+    std::vector<std::pair<size_t, BlobIndex>> blob_columns;
+    const Status s =
+        WideColumnSerialization::DeserializeV2(input, columns, blob_columns);
+    ASSERT_TRUE(s.IsCorruption());
+    ASSERT_TRUE(std::strstr(s.getState(), "Unsupported wide column ValueType"));
+  }
+
+  // Deserialize (V1-only API) should also reject
+  {
+    Slice input(buf);
+    WideColumns columns;
+    const Status s = WideColumnSerialization::Deserialize(input, columns);
+    ASSERT_TRUE(s.IsCorruption());
+  }
+}
+
+// Helper: create a BlobIndex from EncodeBlob parameters.
+static BlobIndex MakeBlobIndex(uint64_t file_number, uint64_t offset,
+                               uint64_t size,
+                               CompressionType compression = kNoCompression) {
+  std::string encoded;
+  BlobIndex::EncodeBlob(&encoded, file_number, offset, size, compression);
+  BlobIndex bi;
+  Slice s(encoded);
+  assert(bi.DecodeFrom(s).ok());
+  return bi;
+}
+
+// Helper: V2 serialize → DeserializeV2 round-trip, returning
+// deserialized columns and blob column info.
+static void V2SerializeAndDeserialize(
+    const std::vector<std::pair<std::string, std::string>>& columns,
+    const std::vector<std::pair<size_t, BlobIndex>>& blob_columns_in,
+    std::vector<WideColumn>* deserialized,
+    std::vector<std::pair<size_t, BlobIndex>>* blob_columns_out,
+    std::string* serialized_out) {
+  ASSERT_OK(WideColumnSerialization::SerializeV2(columns, blob_columns_in,
+                                                 *serialized_out));
+
+  Slice input(*serialized_out);
+  ASSERT_OK(WideColumnSerialization::DeserializeV2(input, *deserialized,
+                                                   *blob_columns_out));
+  ASSERT_EQ(deserialized->size(), columns.size());
+  for (size_t i = 0; i < columns.size(); ++i) {
+    ASSERT_EQ((*deserialized)[i].name(), columns[i].first);
+  }
+}
+
+// Helper: build WideColumns from string pairs.
+static WideColumns ToWideColumns(
+    const std::vector<std::pair<std::string, std::string>>& columns) {
+  WideColumns wc;
+  wc.reserve(columns.size());
+  for (const auto& col : columns) {
+    wc.emplace_back(Slice(col.first), Slice(col.second));
+  }
+  return wc;
+}
+
+// Helper: deserialize and verify column names match expected.first
+// and column values match expected_values[i].
+static void VerifyDeserialize(
+    const std::string& serialized,
+    const std::vector<std::pair<std::string, std::string>>& expected,
+    const std::vector<std::string>& expected_values) {
+  Slice input(serialized);
+  WideColumns deserialized;
+  ASSERT_OK(WideColumnSerialization::Deserialize(input, deserialized));
+  ASSERT_EQ(deserialized.size(), expected.size());
+  for (size_t i = 0; i < expected.size(); ++i) {
+    ASSERT_EQ(deserialized[i].name(), expected[i].first);
+    ASSERT_EQ(deserialized[i].value(), expected_values[i]);
+  }
+}
+
+// Convenience overload: values come from expected[i].second.
+static void VerifyDeserialize(
+    const std::string& serialized,
+    const std::vector<std::pair<std::string, std::string>>& expected) {
+  std::vector<std::string> values;
+  values.reserve(expected.size());
+  for (const auto& col : expected) {
+    values.push_back(col.second);
+  }
+  VerifyDeserialize(serialized, expected, values);
+}
+
+// Helper: create a random non-inlined BlobIndex using the given RNG.
+// Only creates Blob or BlobTTL types (not InlinedTTL), because InlinedTTL
+// stores a Slice pointing into the encoded string, which would become a
+// dangling reference after this function returns.
+static BlobIndex MakeRandomBlobIndex(Random& rng) {
+  std::string bi_str;
+  if (rng.Uniform(2) == 0) {
+    BlobIndex::EncodeBlob(&bi_str, rng.Uniform(1000), rng.Uniform(10000),
+                          rng.Uniform(5000), kNoCompression);
+  } else {
+    BlobIndex::EncodeBlobTTL(&bi_str, rng.Uniform(1000000), rng.Uniform(1000),
+                             rng.Uniform(10000), rng.Uniform(5000),
+                             kSnappyCompression);
+  }
+  BlobIndex bi;
+  Slice s(bi_str);
+  assert(bi.DecodeFrom(s).ok());
+  return bi;
+}
+
+// Helper: V2 serialize with no blobs then GetValueOfDefaultColumn.
+static void VerifyGetDefaultColumn(
+    const std::vector<std::pair<std::string, std::string>>& columns,
+    const Slice& expected_value) {
+  std::vector<std::pair<size_t, BlobIndex>> no_blobs;
+  std::string serialized;
+  ASSERT_OK(
+      WideColumnSerialization::SerializeV2(columns, no_blobs, serialized));
+
+  Slice input(serialized);
+  Slice value;
+  ASSERT_OK(WideColumnSerialization::GetValueOfDefaultColumn(input, value));
+  ASSERT_EQ(value, expected_value);
+}
+
+TEST_F(WideColumnSerializationTest, SerializeResolvedEntity) {
+  // Test resolve with mixed, all-blob, and no-blob configurations
+  struct TestCase {
+    std::vector<std::pair<std::string, std::string>> columns;
+    std::vector<std::pair<size_t, BlobIndex>> blob_cols;
+    std::vector<std::string> resolved_values;
+    std::vector<std::string> expected_values;
+  };
+
+  std::vector<TestCase> cases = {
+      // Mixed inline and blob
+      {.columns = {{"a", "inline_a"}, {"b", "ph"}, {"c", "inline_c"}},
+       .blob_cols = {{1, MakeBlobIndex(50, 500, 100)}},
+       .resolved_values = {"resolved_b"},
+       .expected_values = {"inline_a", "resolved_b", "inline_c"}},
+      // All blob columns
+      {.columns = {{"x", "ph1"}, {"y", "ph2"}, {"z", "ph3"}},
+       .blob_cols = {{0, MakeBlobIndex(10, 100, 50)},
+                     {1, MakeBlobIndex(20, 200, 60)},
+                     {2, MakeBlobIndex(30, 300, 70)}},
+       .resolved_values = {"val_x", "val_y", "val_z"},
+       .expected_values = {"val_x", "val_y", "val_z"}},
+      // No blob columns
+      {.columns = {{"alpha", "val_alpha"}, {"beta", "val_beta"}},
+       .blob_cols = {},
+       .resolved_values = {},
+       .expected_values = {"val_alpha", "val_beta"}},
+  };
+
+  for (const auto& tc : cases) {
+    std::string serialized;
+    std::vector<WideColumn> deserialized;
+    std::vector<std::pair<size_t, BlobIndex>> blob_out;
+    V2SerializeAndDeserialize(tc.columns, tc.blob_cols, &deserialized,
+                              &blob_out, &serialized);
+
+    std::string resolved_output;
+    ASSERT_OK(WideColumnSerializationTest::SerializeResolvedEntity(
+        deserialized, blob_out, tc.resolved_values, resolved_output));
+
+    uint32_t v = 0;
+    ASSERT_OK(GetVersion(Slice(resolved_output), v));
+    ASSERT_EQ(v, WideColumnSerialization::kVersion1);
+
+    VerifyDeserialize(resolved_output, tc.columns, tc.expected_values);
+  }
+}
+
+TEST_F(WideColumnSerializationTest, V2GetValueOfDefaultColumn) {
+  // V2 with default column present
+  VerifyGetDefaultColumn({{"", "default_value"}, {"col1", "value1"}},
+                         "default_value");
+  // V2 without default column
+  VerifyGetDefaultColumn({{"col1", "value1"}, {"col2", "value2"}}, Slice());
+  // V2 with zero columns
+  VerifyGetDefaultColumn({}, Slice());
+
+  // V1 fallback
+  {
+    WideColumns columns{{"", "v1_default"}, {"col1", "v1"}};
+    std::string serialized;
+    ASSERT_OK(WideColumnSerialization::Serialize(columns, serialized));
+
+    Slice input(serialized);
+    Slice value;
+    ASSERT_OK(WideColumnSerialization::GetValueOfDefaultColumn(input, value));
+    ASSERT_EQ(value, "v1_default");
+  }
+}
+
+TEST_F(WideColumnSerializationTest, V2BlobColumnRejectsDeserialize) {
+  std::vector<std::pair<std::string, std::string>> columns = {
+      {"a", "inline"}, {"b", "placeholder"}};
+  std::vector<std::pair<size_t, BlobIndex>> blob_columns = {
+      {1, MakeBlobIndex(1, 2, 3)}};
+
+  std::string serialized;
+  ASSERT_OK(
+      WideColumnSerialization::SerializeV2(columns, blob_columns, serialized));
+
+  Slice input(serialized);
+  WideColumns deserialized;
+  ASSERT_TRUE(WideColumnSerialization::Deserialize(input, deserialized)
+                  .IsNotSupported());
+}
+
+TEST_F(WideColumnSerializationTest, V2GetValueOfDefaultColumnBlobRef) {
+  // When default column (index 0) is a blob reference,
+  // GetValueOfDefaultColumn should return NotSupported.
+  std::vector<std::pair<std::string, std::string>> columns = {
+      {"", "placeholder"}, {"col1", "value1"}};
+  std::vector<std::pair<size_t, BlobIndex>> blob_columns = {
+      {0, MakeBlobIndex(10, 100, 500)}};
+
+  std::string serialized;
+  ASSERT_OK(
+      WideColumnSerialization::SerializeV2(columns, blob_columns, serialized));
+
+  Slice input(serialized);
+  Slice value;
+  ASSERT_TRUE(WideColumnSerialization::GetValueOfDefaultColumn(input, value)
+                  .IsNotSupported());
+}
+
+TEST_F(WideColumnSerializationTest, SerializeV2Errors) {
+  // Blob column index out of range
+  {
+    std::vector<std::pair<std::string, std::string>> columns = {{"a", "val"}};
+    std::vector<std::pair<size_t, BlobIndex>> blob_columns = {
+        {5, MakeBlobIndex(1, 2, 3)}};  // index 5 but only 1 column
+
+    std::string output;
+    ASSERT_TRUE(
+        WideColumnSerialization::SerializeV2(columns, blob_columns, output)
+            .IsInvalidArgument());
+  }
+
+  // Columns out of order (V2)
+  {
+    std::vector<std::pair<std::string, std::string>> columns = {{"b", "val_b"},
+                                                                {"a", "val_a"}};
+    std::vector<std::pair<size_t, BlobIndex>> no_blobs;
+
+    std::string output;
+    ASSERT_TRUE(WideColumnSerialization::SerializeV2(columns, no_blobs, output)
+                    .IsCorruption());
+  }
+
+  // Duplicate column names (V2)
+  {
+    std::vector<std::pair<std::string, std::string>> columns = {{"a", "val1"},
+                                                                {"a", "val2"}};
+    std::vector<std::pair<size_t, BlobIndex>> no_blobs;
+
+    std::string output;
+    ASSERT_TRUE(WideColumnSerialization::SerializeV2(columns, no_blobs, output)
+                    .IsCorruption());
+  }
+}
+
+TEST_F(WideColumnSerializationTest, BlobIndexEncodeToRoundTrip) {
+  // Test EncodeTo produces identical output to static Encode methods
+  // for all three blob index types.
+  auto verify_encode_to = [](const std::string& encoded_static) {
+    BlobIndex bi;
+    Slice s(encoded_static);
+    ASSERT_OK(bi.DecodeFrom(s));
+    std::string encoded_instance;
+    bi.EncodeTo(&encoded_instance);
+    ASSERT_EQ(encoded_static, encoded_instance);
+  };
+
+  std::string blob_str;
+  std::string blob_ttl_str;
+  std::string inlined_str;
+  BlobIndex::EncodeBlob(&blob_str, 42, 1024, 2048, kSnappyCompression);
+  BlobIndex::EncodeBlobTTL(&blob_ttl_str, 9999, 10, 200, 3000,
+                           kZlibCompression);
+  BlobIndex::EncodeInlinedTTL(&inlined_str, 12345, "inline_data");
+
+  verify_encode_to(blob_str);
+  verify_encode_to(blob_ttl_str);
+  verify_encode_to(inlined_str);
+}
+
+TEST_F(WideColumnSerializationTest, V2LayoutStructureVerification) {
+  // Verify the V2 binary layout structure by manually parsing sections
+  std::vector<std::pair<std::string, std::string>> columns = {
+      {"aa", "val_aa"}, {"bbb", "val_bbb"}};
+  std::vector<std::pair<size_t, BlobIndex>> empty_blob_columns;
+
+  std::string serialized;
+  ASSERT_OK(WideColumnSerialization::SerializeV2(columns, empty_blob_columns,
+                                                 serialized));
+
+  Slice data(serialized);
+
+  // Section 1: HEADER
+  uint32_t version = 0;
+  ASSERT_TRUE(GetVarint32(&data, &version));
+  ASSERT_EQ(version, WideColumnSerialization::kVersion2);
+
+  uint32_t num_columns = 0;
+  ASSERT_TRUE(GetVarint32(&data, &num_columns));
+  ASSERT_EQ(num_columns, 2u);
+
+  // Section 2: SKIP INFO (3 varints)
+  uint32_t name_sizes_bytes = 0;
+  uint32_t value_sizes_bytes = 0;
+  uint32_t names_bytes = 0;
+  ASSERT_TRUE(GetVarint32(&data, &name_sizes_bytes));
+  ASSERT_TRUE(GetVarint32(&data, &value_sizes_bytes));
+  ASSERT_TRUE(GetVarint32(&data, &names_bytes));
+  // name sizes: varint(2) + varint(3) = 1 + 1 = 2 bytes
+  ASSERT_EQ(name_sizes_bytes, 2u);
+  // value sizes: varint(6) + varint(7) = 1 + 1 = 2 bytes
+  ASSERT_EQ(value_sizes_bytes, 2u);
+  // names: "aa" + "bbb" = 2 + 3 = 5 bytes
+  ASSERT_EQ(names_bytes, 5u);
+
+  // Section 3: COLUMN TYPES (2 bytes, both inline)
+  ASSERT_GE(data.size(), 2u);
+  ASSERT_EQ(static_cast<uint8_t>(data[0]), static_cast<uint8_t>(kTypeValue));
+  ASSERT_EQ(static_cast<uint8_t>(data[1]), static_cast<uint8_t>(kTypeValue));
+  data.remove_prefix(2);
+
+  // Section 4: NAME SIZES
+  uint32_t ns0 = 0;
+  uint32_t ns1 = 0;
+  ASSERT_TRUE(GetVarint32(&data, &ns0));
+  ASSERT_TRUE(GetVarint32(&data, &ns1));
+  ASSERT_EQ(ns0, 2u);
+  ASSERT_EQ(ns1, 3u);
+
+  // Section 5: VALUE SIZES
+  uint32_t vs0 = 0;
+  uint32_t vs1 = 0;
+  ASSERT_TRUE(GetVarint32(&data, &vs0));
+  ASSERT_TRUE(GetVarint32(&data, &vs1));
+  ASSERT_EQ(vs0, 6u);  // "val_aa" = 6
+  ASSERT_EQ(vs1, 7u);  // "val_bbb" = 7
+
+  // Section 6: COLUMN NAMES
+  ASSERT_GE(data.size(), 5u);
+  ASSERT_EQ(Slice(data.data(), 2), "aa");
+  ASSERT_EQ(Slice(data.data() + 2, 3), "bbb");
+  data.remove_prefix(5);
+
+  // Section 7: COLUMN VALUES
+  ASSERT_GE(data.size(), 13u);
+  ASSERT_EQ(Slice(data.data(), 6), "val_aa");
+  ASSERT_EQ(Slice(data.data() + 6, 7), "val_bbb");
+}
+
+// Randomized correctness test: serialize and deserialize with random column
+// counts, name sizes, value sizes, and randomly chosen blob columns.
+// Validates the full round-trip for both V1 (Serialize) and V2
+// (SerializeV2) formats.
+TEST_F(WideColumnSerializationTest, RandomizedSerializeDeserializeRoundTrip) {
+  uint32_t seed = static_cast<uint32_t>(
+      std::chrono::system_clock::now().time_since_epoch().count());
+  Random rng(seed);
+  SCOPED_TRACE("seed=" + std::to_string(seed));
+
+  constexpr int kNumIterations = 100;
+
+  for (int iter = 0; iter < kNumIterations; ++iter) {
+    int num_cols = rng.Uniform(17);     // 0..16
+    int name_sz = 1 + rng.Uniform(64);  // 1..64
+    int val_sz = rng.Uniform(1025);     // 0..1024
+
+    // Generate sorted column names and random values
+    std::vector<std::pair<std::string, std::string>> columns;
+    columns.reserve(num_cols);
+    for (int c = 0; c < num_cols; ++c) {
+      // Build a sorted, unique name of exactly name_sz bytes.
+      // Use hex-encoded index as prefix to guarantee sort order,
+      // then pad with random characters.
+      char idx_str[16];
+      snprintf(idx_str, sizeof(idx_str), "%04x", c);
+      std::string name(idx_str);
+      if (static_cast<int>(name.size()) < name_sz) {
+        name.append(name_sz - name.size(),
+                    static_cast<char>('a' + rng.Uniform(26)));
+      }
+      // Ensure exactly name_sz bytes. For name_sz < 4, use just the
+      // low-order hex digits to maintain sort order.
+      if (static_cast<int>(name.size()) > name_sz) {
+        name = name.substr(name.size() - name_sz);
+      }
+
+      // Random value content
+      std::string value(val_sz, '\0');
+      for (int j = 0; j < val_sz; ++j) {
+        value[j] = static_cast<char>(rng.Uniform(256));
+      }
+      columns.emplace_back(std::move(name), std::move(value));
+    }
+
+    // Randomly select some columns as blob columns
+    std::vector<std::pair<size_t, BlobIndex>> blob_columns;
+    for (int c = 0; c < num_cols; ++c) {
+      if (rng.Uniform(3) == 0) {  // ~33% chance of being a blob column
+        blob_columns.emplace_back(c, MakeRandomBlobIndex(rng));
+      }
+    }
+
+    // V2 serialize → DeserializeV2 round-trip
+    std::string serialized;
+    std::vector<WideColumn> deserialized;
+    std::vector<std::pair<size_t, BlobIndex>> blob_out;
+    V2SerializeAndDeserialize(columns, blob_columns, &deserialized, &blob_out,
+                              &serialized);
+
+    // Verify version and HasBlobColumns
+    uint32_t v = 0;
+    ASSERT_OK(GetVersion(Slice(serialized), v));
+    ASSERT_EQ(v, WideColumnSerialization::kVersion2);
+
+    bool hb = false;
+    ASSERT_OK(WideColumnSerialization::HasBlobColumns(Slice(serialized), hb));
+    ASSERT_EQ(hb, !blob_columns.empty());
+
+    // Verify blob column round-trip
+    ASSERT_EQ(blob_out.size(), blob_columns.size());
+    for (size_t b = 0; b < blob_columns.size(); ++b) {
+      ASSERT_EQ(blob_out[b].first, blob_columns[b].first);
+      const BlobIndex& orig = blob_columns[b].second;
+      const BlobIndex& decoded = blob_out[b].second;
+      ASSERT_EQ(decoded.IsInlined(), orig.IsInlined());
+      ASSERT_EQ(decoded.HasTTL(), orig.HasTTL());
+      if (!decoded.IsInlined()) {
+        ASSERT_EQ(decoded.file_number(), orig.file_number());
+        ASSERT_EQ(decoded.offset(), orig.offset());
+        ASSERT_EQ(decoded.size(), orig.size());
+      }
+    }
+
+    // Verify inline column values
+    size_t blob_idx = 0;
+    for (int c = 0; c < num_cols; ++c) {
+      if (blob_idx < blob_columns.size() &&
+          blob_columns[blob_idx].first == static_cast<size_t>(c)) {
+        ++blob_idx;
+      } else {
+        ASSERT_EQ(deserialized[c].value(), columns[c].second);
+      }
+    }
+
+    // If no blob columns, also verify Deserialize() and both overloads
+    if (blob_columns.empty()) {
+      VerifyDeserialize(serialized, columns);
+
+      // WideColumns overload should produce identical output
+      std::string serialized2;
+      WideColumns wc = ToWideColumns(columns);
+      ASSERT_OK(
+          WideColumnSerialization::SerializeV2(wc, blob_columns, serialized2));
+      ASSERT_EQ(serialized, serialized2);
+    }
+
+    // V1 Serialize round-trip
+    {
+      WideColumns wc = ToWideColumns(columns);
+      std::string serialized_v1;
+      ASSERT_OK(WideColumnSerialization::Serialize(wc, serialized_v1));
+
+      ASSERT_OK(GetVersion(Slice(serialized_v1), v));
+      ASSERT_EQ(v, WideColumnSerialization::kVersion1);
+
+      VerifyDeserialize(serialized_v1, columns);
+    }
+  }
+}
+
 }  // namespace ROCKSDB_NAMESPACE
 
 int main(int argc, char** argv) {
diff --git a/unreleased_history/new_features/wide_column_blob_support.md b/unreleased_history/new_features/wide_column_blob_support.md
new file mode 100644
index 000000000000..abc8c237162b
--- /dev/null
+++ b/unreleased_history/new_features/wide_column_blob_support.md
@@ -0,0 +1 @@
+Added support for storing wide-column entity column values in blob files. When `min_blob_size` is configured, large column values in wide-column entities will be stored in blob files, reducing SST file size and improving read performance.